"""Screen WeChat articles and web-page CSV dumps for flagged expressions.

The expressions to look for are read from the '筛查内容' sheet of
``biao.xlsx``.  Each row of that sheet supplies the columns '错误表述' (the
expression searched for), '建议修改词语' and '错误分类', which are copied
into the report next to every text that contains the expression.

Run with the argument ``wechat`` to scan the articles under ``wechat_dir``;
any other invocation scans the CSV files under ``web_dir``.
"""
import os
import sys

import html2text
import pandas as pd

current_dir = os.getcwd()
wechat_dir = os.path.join(current_dir, 'wechat_dir')
web_dir = os.path.join(current_dir, 'web_dir')

# Screening rules, loaded once at import time and shared by every function.
df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')


def trans_to_json() -> None:
    """Dump the screening rules to ``biao.json`` as a list of records."""
    json_str = df_s.to_json(orient='records', force_ascii=False)
    with open('biao.json', 'w', encoding='utf-8') as f:
        f.write(json_str)


def make_wechat_articles_full() -> None:
    """Build ``wechat_dir/articles_full.csv`` from the saved article pages.

    Reads ``wechat_dir/articles.csv`` (gb18030), converts the HTML file
    ``wechat_dir/<nickname>/<id>.html`` of every row to plain text with
    html2text (links ignored) and stores it in a new ``content`` column.
    Rows whose HTML file cannot be read or converted keep an empty
    ``content`` and are reported on stdout; they do not abort the run.
    """
    df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'), encoding='gb18030')
    df['content'] = ''
    for ind, row in df.iterrows():
        # str()/f-string so that numeric ids or missing nicknames produce a
        # (possibly non-existent) path instead of a TypeError.
        full_path = os.path.join(wechat_dir, str(row['nickname']), f"{row['id']}.html")
        try:
            with open(full_path, encoding='utf-8') as f:
                raw_html = f.read()
        except (OSError, UnicodeDecodeError):
            print(full_path + '---不存在')
            continue
        try:
            # A fresh converter per document, as in the original code.
            converter = html2text.HTML2Text()
            converter.ignore_links = True
            df.at[ind, 'content'] = converter.handle(raw_html)
        except Exception as err:
            # Best effort: one malformed page must not lose the whole batch,
            # but say what actually went wrong.
            print(f'{full_path}---{err!r}')
            continue
        print(f'{ind}--{row["nickname"]}--{row["title"]}')
    # Written next to articles.csv, which is where ana_wechat() looks for it.
    df.to_csv(os.path.join(wechat_dir, 'articles_full.csv'), encoding='utf-8_sig')


def _collect_matches(df, text_col, source_cols):
    """Return one report row per (screening rule, matching text row) pair.

    ``df[text_col]`` is searched for every '错误表述' value of ``df_s``.
    Each returned row is the values of ``source_cols`` taken from the
    matching row of ``df`` followed by the rule's '错误表述',
    '建议修改词语' and '错误分类'.  Every row is also printed as it is found.

    NOTE: the expression is handed to ``Series.str.contains`` with its
    default settings, i.e. it is interpreted as a regular expression.
    """
    # Normalise once: missing cells become '' so the boolean mask never
    # contains NaN, and non-string cells are searchable as text.
    texts = df[text_col].fillna('').astype(str)
    rows = []
    for _, rule in df_s.iterrows():
        pattern = rule['错误表述']
        if pd.isna(pattern):
            # An empty cell in the rule sheet has nothing to search for.
            continue
        hits = df[texts.str.contains(str(pattern))]
        for _, hit in hits.iterrows():
            alist = [hit[col] for col in source_cols]
            alist += [rule['错误表述'], rule['建议修改词语'], rule['错误分类']]
            print(alist)
            rows.append(alist)
    return rows


def ana_wechat() -> None:
    """Screen the WeChat article texts and write ``ana_wechat.csv``.

    ``wechat_dir/articles_full.csv`` is generated first when it does not
    exist yet.  The report is only written when at least one match is found.
    """
    articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
    if not os.path.exists(articles_full_path):
        make_wechat_articles_full()
    # utf-8-sig reads the file correctly with or without the BOM that
    # make_wechat_articles_full() writes.
    df = pd.read_csv(articles_full_path, encoding='utf-8-sig')
    rows = _collect_matches(df, 'content', ['nickname', 'title', 'content_url'])
    if rows:
        columns = ['公众号', '标题', '地址', '错误表述', '建议修改词语', '错误分类']
        pd.DataFrame(rows, columns=columns).to_csv('ana_wechat.csv', encoding='utf-8_sig')


def ana_web() -> None:
    """Screen every non-empty CSV file in ``web_dir`` and write ``ana_web.csv``.

    Each file is read as gb18030 and its ``text`` column is searched; the
    ``group``, ``name`` and ``url`` columns identify the source in the
    report.  The report is only written when at least one match is found.
    """
    rows = []
    for file in os.listdir(web_dir):
        full_path = os.path.join(web_dir, file)
        # Skip sub-directories and zero-byte files, which cannot be parsed.
        if not os.path.isfile(full_path) or os.path.getsize(full_path) == 0:
            continue
        df = pd.read_csv(full_path, encoding='gb18030')
        rows.extend(_collect_matches(df, 'text', ['group', 'name', 'url']))
    if rows:
        columns = ['单位', '主办', '地址', '错误表述', '建议修改词语', '错误分类']
        pd.DataFrame(rows, columns=columns).to_csv('ana_web.csv', encoding='utf-8_sig')


if __name__ == "__main__":
    if len(sys.argv) > 1 and sys.argv[1] == 'wechat':
        ana_wechat()
    else:
        ana_web()