91 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			91 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			Python
		
	
	
	
| import pandas as pd
 | |
| import os
 | |
| import html2text
 | |
| import sys
 | |
| 
 | |
# Working directory of the running process; the input folders are resolved against it.
current_dir = os.getcwd()
# Input folders: the WeChat article dump and the website spreadsheets.
wechat_dir, web_dir = (
    os.path.join(current_dir, folder) for folder in ('wechat_dir', 'web_dir')
)
# Screening table, loaded once at import time from the working directory.
df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
 | |
| 
 | |
def trans_to_json():
    """Dump the screening table ``df_s`` to ``biao.json`` as a list of records.

    Non-ASCII characters are written unescaped and the file is UTF-8 encoded.
    """
    # Serialize before opening the file so a failure leaves any old file intact.
    records_json = df_s.to_json(force_ascii=False, orient='records')
    with open('biao.json', mode='w', encoding='utf-8') as out_file:
        out_file.write(records_json)
 | |
| 
 | |
def make_wechat_articles_full():
    """Build ``articles_full.csv`` by attaching article text to ``articles.csv``.

    Reads ``articles.csv`` (GB18030-encoded) from ``wechat_dir``. For every row
    the file ``<wechat_dir>/<nickname>/<id>.html`` is converted to text with
    html2text (links ignored) and stored in a new ``content`` column. Rows
    whose HTML file cannot be opened or decoded keep an empty ``content`` and
    are reported on stdout. The result is written to
    ``<wechat_dir>/articles_full.csv`` using the ``utf-8_sig`` codec.
    """
    df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'), encoding='gb18030')
    df['content'] = ''
    for ind, row in df.iterrows():
        # Convert to str so a numeric id or a missing nickname produces a path
        # that is reported as unreadable instead of raising TypeError here.
        full_path = os.path.join(wechat_dir, str(row['nickname']), f"{row['id']}.html")
        try:
            with open(full_path, encoding='utf-8') as f:
                html = f.read()
        except (OSError, UnicodeError):
            # Only file-level failures are skipped; Ctrl-C and programming
            # errors propagate instead of being reported as a missing file.
            print(full_path + '---不存在')
            continue
        # One converter instance per article.
        h = html2text.HTML2Text()
        h.ignore_links = True
        df.at[ind, 'content'] = h.handle(html)
        print(f'{ind}--{row["nickname"]}--{row["title"]}')
    output_path = os.path.join(wechat_dir, 'articles_full.csv')
    df.to_csv(output_path, encoding='utf-8_sig')
 | |
| 
 | |
def ana_wechat():
    """Scan WeChat article texts for the phrases listed in ``df_s``.

    Ensures ``<wechat_dir>/articles_full.csv`` exists (building it with
    ``make_wechat_articles_full`` if not), then, for every screening row,
    finds the articles whose ``content`` matches the row's ``错误表述``.
    Each hit is printed and, if there is at least one hit, all hits are
    written to ``ana_wechat.csv`` in the working directory.
    """
    articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
    if not os.path.exists(articles_full_path):
        make_wechat_articles_full()
    df = pd.read_csv(articles_full_path)
    df['content'] = df['content'].fillna('')

    hits = []
    for _, rule in df_s.iterrows():
        phrase = rule['错误表述']
        if not isinstance(phrase, str):
            # Blank spreadsheet cells are read as NaN, which str.contains
            # cannot use as a pattern; skip such rows instead of crashing.
            continue
        # NOTE(review): the phrase is interpreted as a regular expression
        # (pandas default). If the sheet holds literal wording only, pass
        # regex=False so characters like "." or "(" are matched verbatim.
        mask = df['content'].str.contains(phrase, na=False)
        for _, article in df[mask].iterrows():
            alist = [article['nickname'], article['title'], article['content_url'],
                     phrase, rule['建议修改词语'], rule['错误分类']]
            print(alist)
            hits.append(alist)

    # Build the report once from the collected rows; no file is written
    # when nothing matched.
    if hits:
        columns = ['公众号', '标题', '地址', '错误表述', '建议修改词语', '错误分类']
        df_a = pd.DataFrame(hits, columns=columns)
        df_a.to_csv('ana_wechat.csv', encoding='utf-8_sig')
 | |
| 
 | |
def ana_web():
    """Scan website spreadsheets in ``web_dir`` for the phrases in ``df_s``.

    Every non-empty entry of ``web_dir`` is loaded with ``pd.read_excel``.
    For each screening row, the rows whose ``text`` matches the row's
    ``错误表述`` are collected. Each hit is printed and, if there is at least
    one hit, all hits are written to ``ana_web.csv`` in the working directory.
    """
    hits = []
    for file in os.listdir(web_dir):
        full_path = os.path.join(web_dir, file)
        if os.path.getsize(full_path) <= 0:
            # Zero-byte files cannot be parsed as workbooks.
            continue
        df = pd.read_excel(full_path)
        for _, rule in df_s.iterrows():
            phrase = rule['错误表述']
            if not isinstance(phrase, str):
                # Blank spreadsheet cells are read as NaN, which str.contains
                # cannot use as a pattern; skip such rows instead of crashing.
                continue
            # NOTE(review): the phrase is interpreted as a regular expression
            # (pandas default). If the sheet holds literal wording only, pass
            # regex=False so characters like "." or "(" are matched verbatim.
            mask = df['text'].str.contains(phrase, na=False)
            for _, page in df[mask].iterrows():
                alist = [page['group'], page['name'], page['url'],
                         phrase, rule['建议修改词语'], rule['错误分类']]
                print(alist)
                hits.append(alist)

    # Build the report once from the collected rows; no file is written
    # when nothing matched.
    if hits:
        columns = ['单位', '主办', '地址', '错误表述', '建议修改词语', '错误分类']
        df_a = pd.DataFrame(hits, columns=columns)
        df_a.to_csv('ana_web.csv', encoding='utf-8_sig')
 | |
| 
 | |
if __name__ == "__main__":
    # A first command-line argument of "wechat" selects the WeChat scan;
    # anything else, including no argument, runs the website scan.
    if sys.argv[1:2] == ['wechat']:
        ana_wechat()
    else:
        ana_web()
 | |
| 
 |