From bac76b97bcb7f5723e2a79f84ba1a9e24841869f Mon Sep 17 00:00:00 2001
From: caoqianming
Date: Thu, 24 Aug 2023 17:12:15 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E5=A2=9E=E5=8A=A0make=5Fsimple=5Fcsv?=
 =?UTF-8?q?=5Ffrom=5Fdb?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 main.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/main.py b/main.py
index 1ce8fa4..6b1da81 100644
--- a/main.py
+++ b/main.py
@@ -2,9 +2,10 @@ import pandas as pd
 import os
 import html2text
 import sys
+import sqlite3
 
 current_dir = os.getcwd()
-wechat_dir = os.path.join(current_dir, 'wechat_dir')
+wechat_dir = os.path.join(current_dir, 'article')
 web_dir = os.path.join(current_dir, 'web_dir')
 
 df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
@@ -13,23 +14,30 @@ def trans_to_json():
     with open('biao.json', 'w', encoding='utf-8') as f:
         f.write(json_str)
 
+def make_simple_csv_from_db():
+    conn = sqlite3.connect('db_folder/test.db')
+    query = "select id, g.nickname, a.title, a.content_url, datetime(a.p_date, 'unixepoch', 'localtime') as pub_date from articles a LEFT JOIN gzhs g on g.biz = a.biz"
+    df = pd.read_sql_query(query, conn)
+    # 关闭数据库连接
+    conn.close()
+    # 将数据写入CSV文件
+    df.to_csv(os.path.join(wechat_dir, 'articles.csv'), index=False)
+
+
 def make_wechat_articles_full():
-    df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'), encoding='gb18030')
+    df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
     df['content'] = ''
     ind = 0
     for ind, row in df.iterrows():
-        full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.html')
+        full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.md')
         try:
             with open(full_path, encoding='utf-8') as f:
-                h = html2text.HTML2Text()
-                h.ignore_links = True
-                df.at[ind, 'content'] = h.handle(f.read())
-                print(f'{ind}--{row["nickname"]}--{row["title"]}')
-        except:
+                df.at[ind, 'content'] = f.read()
+        except FileNotFoundError:
             print(full_path + '---不存在')
         ind +=1
     output_path = os.path.join(wechat_dir, 'articles_full.csv')
-    df.to_csv(output_path, encoding='utf-8_sig')
+    df.to_csv(output_path)
 
 def ana_wechat():
     articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
@@ -48,7 +56,6 @@ def ana_wechat():
             ind2 = 0
             for ind2, row2 in result.iterrows():
                 alist = [row2['nickname'], row2['title'], row2['content_url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
-                print(alist)
                 df_a.loc[len(df_a.index)] = alist
                 if need_save is False:
                     need_save = True