# Screening utilities: export WeChat/web article data (SQLite + Excel sources)
# and scan the collected text against the wording-screening table in biao.xlsx.
import os
import sqlite3

import pandas as pd

from .base import BASE_DIR

# Working directories, all rooted at the project BASE_DIR.
wechat_dir = os.path.join(BASE_DIR, 'article')
web_dir = os.path.join(BASE_DIR, 'web_dir')
output_dir = os.path.join(BASE_DIR, 'summary')

# Screening table, loaded once at import time and shared by the functions
# below.  Columns referenced elsewhere: '错误表述', '建议修改词语', '错误分类'.
df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
def trans_to_json():
    """Serialize the screening table (df_s) to 'biao.json' as a JSON array.

    force_ascii=False keeps the Chinese text readable instead of escaping
    it to \\uXXXX sequences.
    """
    # NOTE(review): the file is written to the process CWD, not BASE_DIR —
    # confirm that is intended.
    serialized = df_s.to_json(orient='records', force_ascii=False)
    with open('biao.json', mode='w', encoding='utf-8') as out:
        out.write(serialized)
def make_simple_csv_from_db():
    """Export articles joined with their gzh nicknames to wechat_dir/articles.csv.

    Reads BASE_DIR/db_folder/test.db and writes the columns:
    id, nickname, title, content_url, pub_date (localtime string derived
    from the unix-epoch p_date).
    """
    query = (
        "select id, g.nickname, a.title, a.content_url, "
        "datetime(a.p_date, 'unixepoch', 'localtime') as pub_date "
        "from articles a LEFT JOIN gzhs g on g.biz = a.biz"
    )
    conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder', 'test.db'))
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Close the connection even if the query fails (original leaked it
        # on error).
        conn.close()
    df.to_csv(os.path.join(wechat_dir, 'articles.csv'), index=False)
def make_wechat_articles_full():
    """Attach each article's full markdown text to articles.csv rows.

    For every row, reads wechat_dir/<nickname>/<id>.md into a new
    'content' column; missing files are reported and the content stays ''.
    Writes the result to wechat_dir/articles_full.csv.
    """
    df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
    df['content'] = ''
    for ind, row in df.iterrows():
        # str() is required: read_csv may infer the id column as an integer,
        # and int + '.md' raises TypeError.
        full_path = os.path.join(wechat_dir, row['nickname'], str(row['id']) + '.md')
        try:
            with open(full_path, encoding='utf-8') as f:
                df.at[ind, 'content'] = f.read()
        except FileNotFoundError:
            print(full_path + '---不存在')
    output_path = os.path.join(wechat_dir, 'articles_full.csv')
    # index=False for consistency with make_simple_csv_from_db's output.
    df.to_csv(output_path, index=False)
def ana_wechat():
    """Scan WeChat article contents for flagged wordings from df_s.

    Builds articles_full.csv on demand, then for every screening rule finds
    the articles whose content contains the literal phrase in '错误表述'.

    Returns:
        list[list]: rows of [序号, 信源名称, 文章标题, 错误表述,
        建议修改词语, 错误分类, 原文链接]; no header row is included.
    """
    articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
    if not os.path.exists(articles_full_path):
        make_wechat_articles_full()

    df = pd.read_csv(articles_full_path)
    # Articles whose .md file was missing come back as NaN.
    df['content'] = df['content'].fillna('')

    output_data = []
    index = 1
    for _, rule in df_s.iterrows():
        # regex=False: the phrase is a literal string, not a pattern —
        # the default regex=True would misinterpret special characters.
        # na=False matches ana_web's guard against missing cells.
        mask = df['content'].str.contains(rule['错误表述'], regex=False, na=False)
        for _, hit in df[mask].iterrows():
            output_data.append([
                index,
                hit['nickname'],
                hit['title'],
                rule['错误表述'],
                rule['建议修改词语'],
                rule['错误分类'],
                hit['content_url'],
            ])
            index += 1

    return output_data
def ana_web():
    """Scan scraped web-page texts (Excel files in web_dir) for flagged wordings.

    Each non-empty file in web_dir is read as an Excel sheet expected to
    carry 'name', 'text' and 'url' columns.  Returns rows shaped like
    ana_wechat()'s output; the article-title column is '/' because web
    pages have no title here.
    """
    output_data = []
    index = 1
    for file in os.listdir(web_dir):
        full_path = os.path.join(web_dir, file)
        # Skip zero-byte files (e.g. left behind by failed scrapes).
        if os.path.getsize(full_path) <= 0:
            continue
        df = pd.read_excel(full_path)
        for _, rule in df_s.iterrows():
            # regex=False: literal substring match, consistent with
            # ana_wechat; default regex=True would misread special chars.
            mask = df['text'].str.contains(rule['错误表述'], regex=False, na=False)
            for _, hit in df[mask].iterrows():
                output_data.append([
                    index,
                    hit['name'],
                    "/",
                    rule['错误表述'],
                    rule['建议修改词语'],
                    rule['错误分类'],
                    hit['url'],
                ])
                index += 1

    return output_data