feat: 增加make_simple_csv_from_db
This commit is contained in:
parent
5a2129859f
commit
bac76b97bc
27
main.py
27
main.py
|
@ -2,9 +2,10 @@ import pandas as pd
|
||||||
import os
|
import os
|
||||||
import html2text
|
import html2text
|
||||||
import sys
|
import sys
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
current_dir = os.getcwd()
|
current_dir = os.getcwd()
|
||||||
wechat_dir = os.path.join(current_dir, 'wechat_dir')
|
wechat_dir = os.path.join(current_dir, 'article')
|
||||||
web_dir = os.path.join(current_dir, 'web_dir')
|
web_dir = os.path.join(current_dir, 'web_dir')
|
||||||
df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
|
df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
|
||||||
|
|
||||||
|
@ -13,23 +14,30 @@ def trans_to_json():
|
||||||
with open('biao.json', 'w', encoding='utf-8') as f:
|
with open('biao.json', 'w', encoding='utf-8') as f:
|
||||||
f.write(json_str)
|
f.write(json_str)
|
||||||
|
|
||||||
|
def make_simple_csv_from_db():
|
||||||
|
conn = sqlite3.connect('db_folder/test.db')
|
||||||
|
query = "select id, g.nickname, a.title, a.content_url, datetime(a.p_date, 'unixepoch', 'localtime') as pub_date from articles a LEFT JOIN gzhs g on g.biz = a.biz"
|
||||||
|
df = pd.read_sql_query(query, conn)
|
||||||
|
# 关闭数据库连接
|
||||||
|
conn.close()
|
||||||
|
# 将数据写入CSV文件
|
||||||
|
df.to_csv(os.path.join(wechat_dir, 'articles.csv'), index=False)
|
||||||
|
|
||||||
|
|
||||||
def make_wechat_articles_full():
|
def make_wechat_articles_full():
|
||||||
df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'), encoding='gb18030')
|
df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
|
||||||
df['content'] = ''
|
df['content'] = ''
|
||||||
ind = 0
|
ind = 0
|
||||||
for ind, row in df.iterrows():
|
for ind, row in df.iterrows():
|
||||||
full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.html')
|
full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.md')
|
||||||
try:
|
try:
|
||||||
with open(full_path, encoding='utf-8') as f:
|
with open(full_path, encoding='utf-8') as f:
|
||||||
h = html2text.HTML2Text()
|
df.at[ind, 'content'] = f.read()
|
||||||
h.ignore_links = True
|
except FileNotFoundError:
|
||||||
df.at[ind, 'content'] = h.handle(f.read())
|
|
||||||
print(f'{ind}--{row["nickname"]}--{row["title"]}')
|
|
||||||
except:
|
|
||||||
print(full_path + '---不存在')
|
print(full_path + '---不存在')
|
||||||
ind +=1
|
ind +=1
|
||||||
output_path = os.path.join(wechat_dir, 'articles_full.csv')
|
output_path = os.path.join(wechat_dir, 'articles_full.csv')
|
||||||
df.to_csv(output_path, encoding='utf-8_sig')
|
df.to_csv(output_path)
|
||||||
|
|
||||||
def ana_wechat():
|
def ana_wechat():
|
||||||
articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
|
articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
|
||||||
|
@ -48,7 +56,6 @@ def ana_wechat():
|
||||||
ind2 = 0
|
ind2 = 0
|
||||||
for ind2, row2 in result.iterrows():
|
for ind2, row2 in result.iterrows():
|
||||||
alist = [row2['nickname'], row2['title'], row2['content_url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
|
alist = [row2['nickname'], row2['title'], row2['content_url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
|
||||||
print(alist)
|
|
||||||
df_a.loc[len(df_a.index)] = alist
|
df_a.loc[len(df_a.index)] = alist
|
||||||
if need_save is False:
|
if need_save is False:
|
||||||
need_save = True
|
need_save = True
|
||||||
|
|
Loading…
Reference in New Issue