90 lines
3.3 KiB
Python
90 lines
3.3 KiB
Python
import pandas as pd
|
|
import os
|
|
import html2text
|
|
import sys
|
|
|
|
# Input folders are resolved against the directory the script is started from.
current_dir = os.getcwd()
wechat_dir, web_dir = (
    os.path.join(current_dir, folder) for folder in ('wechat_dir', 'web_dir')
)

# Screening table: sheet '筛查内容' of biao.xlsx, loaded once at import time.
# The functions in this file read its columns
# '错误表述', '建议修改词语' and '错误分类'.
df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
|
|
|
|
def trans_to_json():
    """Export the screening table ``df_s`` to ``biao.json``.

    The table is serialized as a JSON array with one object per row
    (``orient='records'``); non-ASCII characters are written verbatim
    and the file is UTF-8 encoded.
    """
    # Serialize before opening the file so a failed conversion does not
    # leave a truncated biao.json behind.
    records_json = df_s.to_json(force_ascii=False, orient='records')
    with open('biao.json', mode='w', encoding='utf-8') as out_file:
        out_file.write(records_json)
|
|
|
|
def make_wechat_articles_full():
    """Build ``articles_full.csv``: the article index plus each article's text.

    Reads ``articles.csv`` (GB18030) from ``wechat_dir``. For every row the
    saved page ``<wechat_dir>/<nickname>/<id>.html`` is converted to text with
    html2text (links ignored) and stored in a new ``content`` column. Rows
    whose page cannot be read or converted keep an empty ``content`` and the
    problem is printed; processing continues with the next row.

    The result is written to ``<wechat_dir>/articles_full.csv`` (UTF-8 with
    BOM), which is the path ``ana_wechat`` checks for and reads.
    """
    df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'), encoding='gb18030')
    df['content'] = ''
    for ind, row in df.iterrows():
        # f-string instead of ``+`` so a numeric ``id`` column does not raise
        # TypeError; for string ids the resulting name is identical.
        full_path = os.path.join(wechat_dir, row['nickname'], f"{row['id']}.html")
        try:
            with open(full_path, encoding='utf-8') as f:
                h = html2text.HTML2Text()
                h.ignore_links = True
                df.at[ind, 'content'] = h.handle(f.read())
                print(f'{ind}--{row["nickname"]}--{row["title"]}')
        except FileNotFoundError:
            print(full_path + '---不存在')
        except Exception as exc:
            # Best effort: one unreadable or malformed page must not abort
            # the whole run, but report the real cause instead of claiming
            # the file does not exist.
            print(f'{full_path}---{exc!r}')
    # Write next to articles.csv; the file was previously written to the
    # working directory, where ana_wechat() never looks for it.
    df.to_csv(os.path.join(wechat_dir, 'articles_full.csv'), encoding='utf-8_sig')
|
|
|
|
def ana_wechat():
    """Scan WeChat article texts for every phrase of the screening table.

    Loads ``<wechat_dir>/articles_full.csv``, first calling
    ``make_wechat_articles_full()`` when that file does not exist. For each
    row of ``df_s`` the articles whose ``content`` matches the row's
    ``错误表述`` value are collected. Every hit is printed and recorded as one
    row with the columns 公众号, 标题, 地址, 错误表述, 建议修改词语, 错误分类.

    When at least one hit was found the rows are saved to ``ana_wechat.csv``
    (UTF-8 with BOM) in the current working directory; otherwise no file is
    written.
    """
    articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
    if not os.path.exists(articles_full_path):
        make_wechat_articles_full()

    df_a = pd.DataFrame(columns=['公众号', '标题', '地址', '错误表述', '建议修改词语', '错误分类'])
    df = pd.read_csv(articles_full_path)
    # Missing text becomes '' so the match below yields a pure boolean mask.
    content = df['content'].fillna('')

    for _, rule in df_s.iterrows():
        phrase = rule['错误表述']
        if pd.isna(phrase):
            # A blank cell in the screening sheet is not a usable pattern;
            # str.contains() would raise on it.
            continue
        # NOTE(review): str.contains() interprets the phrase as a regular
        # expression by default. If the sheet holds literal phrases only,
        # pass regex=False so characters such as '(' or '?' match verbatim.
        hits = df[content.str.contains(phrase)]
        for _, article in hits.iterrows():
            alist = [
                article['nickname'],
                article['title'],
                article['content_url'],
                phrase,
                rule['建议修改词语'],
                rule['错误分类'],
            ]
            print(alist)
            df_a.loc[len(df_a.index)] = alist

    if not df_a.empty:
        df_a.to_csv('ana_wechat.csv', encoding='utf-8_sig')
|
|
|
|
def ana_web():
    """Scan crawled web-page texts for every phrase of the screening table.

    Every non-empty file in ``web_dir`` is read as a GB18030 CSV. For each
    row of ``df_s`` the pages whose ``text`` matches the row's ``错误表述``
    value are collected. Every hit is printed and recorded as one row with
    the columns 单位, 主办, 地址, 错误表述, 建议修改词语, 错误分类 (taken from the
    page's ``group``, ``name`` and ``url`` plus the screening row).

    When at least one hit was found the rows are saved to ``ana_web.csv``
    (UTF-8 with BOM) in the current working directory; otherwise no file is
    written.
    """
    df_a = pd.DataFrame(columns=['单位', '主办', '地址', '错误表述', '建议修改词语', '错误分类'])

    for file in os.listdir(web_dir):
        full_path = os.path.join(web_dir, file)
        if os.path.getsize(full_path) == 0:
            # Zero-byte files are skipped; presumably because read_csv
            # cannot parse a file without a header line.
            continue
        df = pd.read_csv(full_path, encoding='gb18030')
        # Pages without text would otherwise put NaN into the mask and make
        # the boolean indexing below raise; ana_wechat() guards the same way.
        text = df['text'].fillna('')

        for _, rule in df_s.iterrows():
            phrase = rule['错误表述']
            if pd.isna(phrase):
                # A blank cell in the screening sheet is not a usable
                # pattern; str.contains() would raise on it.
                continue
            # NOTE(review): str.contains() interprets the phrase as a regular
            # expression by default. If the sheet holds literal phrases only,
            # pass regex=False so characters such as '(' or '?' match verbatim.
            hits = df[text.str.contains(phrase)]
            for _, page in hits.iterrows():
                alist = [
                    page['group'],
                    page['name'],
                    page['url'],
                    phrase,
                    rule['建议修改词语'],
                    rule['错误分类'],
                ]
                print(alist)
                df_a.loc[len(df_a.index)] = alist

    if not df_a.empty:
        df_a.to_csv('ana_web.csv', encoding='utf-8_sig')
|
|
|
|
if __name__ == "__main__":
    # Command line: pass ``wechat`` as the first argument to scan WeChat
    # articles; any other invocation scans the crawled web pages.
    wants_wechat = len(sys.argv) > 1 and sys.argv[1] == 'wechat'
    analyse = ana_wechat if wants_wechat else ana_web
    analyse()
|
|
|