diff --git a/main.py b/main.py index 1ce8fa4..6b1da81 100644 --- a/main.py +++ b/main.py @@ -2,9 +2,10 @@ import pandas as pd import os import html2text import sys +import sqlite3 current_dir = os.getcwd() -wechat_dir = os.path.join(current_dir, 'wechat_dir') +wechat_dir = os.path.join(current_dir, 'article') web_dir = os.path.join(current_dir, 'web_dir') df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容') @@ -13,23 +14,30 @@ def trans_to_json(): with open('biao.json', 'w', encoding='utf-8') as f: f.write(json_str) +def make_simple_csv_from_db(): + conn = sqlite3.connect('db_folder/test.db') + query = "select id, g.nickname, a.title, a.content_url, datetime(a.p_date, 'unixepoch', 'localtime') as pub_date from articles a LEFT JOIN gzhs g on g.biz = a.biz" + df = pd.read_sql_query(query, conn) + # 关闭数据库连接 + conn.close() + # 将数据写入CSV文件 + df.to_csv(os.path.join(wechat_dir, 'articles.csv'), index=False) + + def make_wechat_articles_full(): - df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'), encoding='gb18030') + df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv')) df['content'] = '' ind = 0 for ind, row in df.iterrows(): - full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.html') + full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.md') try: with open(full_path, encoding='utf-8') as f: - h = html2text.HTML2Text() - h.ignore_links = True - df.at[ind, 'content'] = h.handle(f.read()) - print(f'{ind}--{row["nickname"]}--{row["title"]}') - except: + df.at[ind, 'content'] = f.read() + except FileNotFoundError: print(full_path + '---不存在') ind +=1 output_path = os.path.join(wechat_dir, 'articles_full.csv') - df.to_csv(output_path, encoding='utf-8_sig') + df.to_csv(output_path) def ana_wechat(): articles_full_path = os.path.join(wechat_dir, 'articles_full.csv') @@ -48,7 +56,6 @@ def ana_wechat(): ind2 = 0 for ind2, row2 in result.iterrows(): alist = [row2['nickname'], row2['title'], row2['content_url'], row['错误表述'], row['建议修改词语'], row['错误分类']] - print(alist) df_a.loc[len(df_a.index)] = alist if need_save is False: need_save = True