import pandas as pd
import os
import html2text

# Working directories: WeChat article dumps, crawled web pages, and the
# summary output folder.
current_dir = os.getcwd()
wechat_dir = os.path.join(current_dir, 'wechat_dir')
web_dir = os.path.join(current_dir, 'web_dir')
output_dir = os.path.join(current_dir, 'summary')

# Screening table: one row per flagged phrase, with the suggested
# replacement and an error category.
df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')

# Shared header used by both result sheets (序号/信源名称/文章标题/...).
RESULT_COLUMNS = ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接']


def trans_to_json():
    """Dump the screening table (df_s) to a JSON file.

    NOTE(review): the diff this file was recovered from omits part of this
    function's body (the lines that build ``json_str`` and open ``f``); the
    body below is a minimal reconstruction of the visible
    ``f.write(json_str)`` — confirm against the original main.py.
    """
    json_str = df_s.to_json(orient='records', force_ascii=False)
    with open('biao.json', 'w', encoding='utf-8') as f:
        f.write(json_str)


def make_wechat_articles_full():
    """Attach each article's HTML body (converted to plain text) to the
    article index and cache the result as ``articles_full.csv``.

    Reads ``wechat_dir/articles.xlsx`` (one row per article with at least
    ``nickname``, ``id`` and ``title`` columns) and looks up the saved HTML
    at ``wechat_dir/<nickname>/<id>.html``.
    """
    df = pd.read_excel(os.path.join(wechat_dir, 'articles.xlsx'))
    df['content'] = ''
    for ind, row in df.iterrows():
        full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.html')
        # NOTE(review): the original try-body is hidden between diff hunks;
        # reconstructed as "read HTML, convert with html2text, store in the
        # content column" from the surrounding code — confirm against the
        # original main.py.
        try:
            with open(full_path, encoding='utf-8') as fh:
                df.at[ind, 'content'] = html2text.html2text(fh.read())
            print(f'{ind}--{row["nickname"]}--{row["title"]}')
        except OSError:
            # Was a bare `except:`; narrowed to file-system errors so real
            # bugs (KeyError, etc.) are no longer silently swallowed.
            print(full_path + '---不存在')
    output_path = os.path.join(wechat_dir, 'articles_full.csv')
    df.to_csv(output_path, encoding='utf-8_sig')


def ana_wechat():
    """Scan cached WeChat article texts for every flagged phrase.

    Returns a list of rows; the first row is the column header
    (RESULT_COLUMNS) and each following row describes one hit:
    running index, account name, article title, flagged phrase, suggested
    replacement, error category, and the article URL.
    """
    articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
    if not os.path.exists(articles_full_path):
        make_wechat_articles_full()

    df = pd.read_csv(articles_full_path)
    df['content'] = df['content'].fillna('')

    output_data = [list(RESULT_COLUMNS)]
    index = 1
    for _, phrase_row in df_s.iterrows():
        # regex=False: the flagged phrase is a literal string — with the
        # default regex=True, metacharacters in the phrase would be
        # misinterpreted as a pattern.
        mask = df['content'].str.contains(phrase_row['错误表述'], regex=False)
        for _, hit in df[mask].iterrows():
            output_data.append([
                index,
                hit['nickname'],
                hit['title'],
                phrase_row['错误表述'],
                phrase_row['建议修改词语'],
                phrase_row['错误分类'],
                hit['content_url'],
            ])
            index += 1
    return output_data


def ana_web():
    """Scan every non-empty crawled-page spreadsheet in ``web_dir`` for the
    flagged phrases.

    Same return shape as :func:`ana_wechat`: header row first, then one row
    per hit.
    """
    output_data = [list(RESULT_COLUMNS)]
    index = 1
    for file in os.listdir(web_dir):
        full_path = os.path.join(web_dir, file)
        if os.path.getsize(full_path) > 0:
            df = pd.read_excel(full_path)
            for _, phrase_row in df_s.iterrows():
                # regex=False for literal matching; na=False because page
                # text cells may be missing.
                mask = df['text'].str.contains(phrase_row['错误表述'], na=False, regex=False)
                for _, hit in df[mask].iterrows():
                    output_data.append([
                        index,
                        hit['name'],
                        # Placeholder: the web spreadsheets carry no title
                        # column (only group/name/url/text) — TODO confirm.
                        "文章标题",
                        phrase_row['错误表述'],
                        phrase_row['建议修改词语'],
                        phrase_row['错误分类'],
                        # BUG FIX: was hit['content_url'] — the web frames
                        # have a 'url' column (see the pre-patch code, which
                        # read row2['url']); 'content_url' only exists in the
                        # WeChat data and would raise KeyError here.
                        hit['url'],
                    ])
                    index += 1
    return output_data


def _to_frame(rows):
    """Convert a header-first list of rows into a DataFrame."""
    header, *data = rows
    return pd.DataFrame(data, columns=header)


if __name__ == "__main__":
    # Guard restored: the patch removed it, making every import of this
    # module run the full analysis as a side effect.
    wechat_results = ana_wechat()
    web_results = ana_web()

    # BUG FIX: the 'summary' directory was never created, so ExcelWriter
    # failed with FileNotFoundError on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    output_excel_path = os.path.join(output_dir, '总院及下属公司官方公众号巡查结果汇总表.xlsx')

    # BUG FIX: ana_wechat()/ana_web() return plain lists of lists, which
    # have no .to_excel(); build DataFrames first. Using the embedded
    # header row as real column names also stops it from being written as
    # a data row under to_excel(index=False).
    with pd.ExcelWriter(output_excel_path) as writer:
        _to_frame(wechat_results).to_excel(writer, sheet_name='公众号', index=False)
        _to_frame(web_results).to_excel(writer, sheet_name='网站', index=False)

    print("Analysis completed and results saved to Excel.")