feat: 增加make_simple_csv_from_db

2023-08-24 17:12:15 +08:00 · 2023-08-24 17:12:15 +08:00 · bac76b97bc
parent 5a2129859f
commit bac76b97bc
1 changed files with 17 additions and 10 deletions
--- a/main.py
+++ b/main.py
@@ -2,9 +2,10 @@ import pandas as pd
 import os
 import html2text
 import sys
 import sqlite3
 current_dir = os.getcwd()
-wechat_dir = os.path.join(current_dir, 'wechat_dir')
+wechat_dir = os.path.join(current_dir, 'article')
 web_dir = os.path.join(current_dir, 'web_dir')
 df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
@@ -13,23 +14,30 @@ def trans_to_json():
    with open('biao.json', 'w', encoding='utf-8') as f:
        f.write(json_str)
 def make_simple_csv_from_db():
    conn = sqlite3.connect('db_folder/test.db')
    query = "select id, g.nickname, a.title, a.content_url, datetime(a.p_date, 'unixepoch', 'localtime') as pub_date from articles a LEFT JOIN gzhs g on g.biz = a.biz"
    df = pd.read_sql_query(query, conn)
    # 关闭数据库连接
    conn.close()
    # 将数据写入CSV文件
    df.to_csv(os.path.join(wechat_dir, 'articles.csv'), index=False)
 def make_wechat_articles_full():
-    df =  pd.read_csv(os.path.join(wechat_dir, 'articles.csv'), encoding='gb18030')
+    df =  pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
    df['content'] = ''
    ind = 0
    for ind, row in df.iterrows():
-        full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.html')
+        full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.md')
        try:
            with open(full_path, encoding='utf-8') as f:
-                h = html2text.HTML2Text()
+                df.at[ind, 'content'] = f.read()
-                h.ignore_links = True
+        except FileNotFoundError:
                df.at[ind, 'content'] = h.handle(f.read())
            print(f'{ind}--{row["nickname"]}--{row["title"]}')
        except:
            print(full_path + '---不存在')
        ind +=1
    output_path = os.path.join(wechat_dir, 'articles_full.csv')
-    df.to_csv(output_path, encoding='utf-8_sig')
+    df.to_csv(output_path)
 def ana_wechat():
    articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
@@ -48,7 +56,6 @@ def ana_wechat():
        ind2 = 0
        for ind2, row2 in result.iterrows():
            alist = [row2['nickname'], row2['title'], row2['content_url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
            print(alist)
            df_a.loc[len(df_a.index)] = alist
            if need_save is False:
                need_save = True