"""Scan WeChat-article and web-page text for flagged wordings listed in
``biao.xlsx`` and summarise every hit into one two-sheet Excel report.

Pipeline:
    1. ``make_simple_csv_from_db``   - dump article metadata from SQLite to CSV.
    2. ``make_wechat_articles_full`` - join each article's markdown body onto
       that CSV (built lazily by ``ana_wechat`` when missing).
    3. ``ana_wechat`` / ``ana_web``  - search the texts for each flagged
       phrase from the screening sheet.
    4. Module level                  - write both result tables into one
       Excel workbook under ``summary/``.
"""

import pandas as pd
import os
import sqlite3

# Working directories, all relative to the current working directory.
current_dir = os.getcwd()
wechat_dir = os.path.join(current_dir, 'article')
web_dir = os.path.join(current_dir, 'web_dir')
output_dir = os.path.join(current_dir, 'summary')

# Screening sheet: one row per flagged phrase, with columns
# 错误表述 / 建议修改词语 / 错误分类 (flagged phrase, suggested fix, category).
df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')

# Header row shared by both analysis result tables.
_HEADER = ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接']


def trans_to_json():
    """Export the screening sheet to ``biao.json`` as UTF-8 JSON records."""
    json_str = df_s.to_json(orient='records', force_ascii=False)
    with open('biao.json', 'w', encoding='utf-8') as f:
        f.write(json_str)


def make_simple_csv_from_db():
    """Dump article metadata (joined with account nicknames) to articles.csv.

    Reads ``db_folder/test.db`` and writes ``<wechat_dir>/articles.csv``.
    """
    conn = sqlite3.connect('db_folder/test.db')
    try:
        query = (
            "select id, g.nickname, a.title, a.content_url, "
            "datetime(a.p_date, 'unixepoch', 'localtime') as pub_date "
            "from articles a LEFT JOIN gzhs g on g.biz = a.biz"
        )
        df = pd.read_sql_query(query, conn)
    finally:
        # Close the database connection even if the query fails.
        conn.close()
    # Write the metadata to a CSV file for the later enrichment step.
    df.to_csv(os.path.join(wechat_dir, 'articles.csv'), index=False)


def make_wechat_articles_full():
    """Attach each article's markdown body to articles.csv -> articles_full.csv.

    The body of article ``id`` is expected at
    ``<wechat_dir>/<nickname>/<id>.md``; a missing file is reported on stdout
    and that article's content cell is left empty.
    """
    df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
    df['content'] = ''
    for ind, row in df.iterrows():
        # str() guards against pandas having inferred a numeric id column,
        # which would make `row['id'] + '.md'` a TypeError.
        full_path = os.path.join(wechat_dir, row['nickname'], str(row['id']) + '.md')
        try:
            with open(full_path, encoding='utf-8') as f:
                df.at[ind, 'content'] = f.read()
        except FileNotFoundError:
            print(full_path + '---不存在')
    output_path = os.path.join(wechat_dir, 'articles_full.csv')
    df.to_csv(output_path)


def ana_wechat():
    """Search every WeChat article body for the flagged phrases.

    Returns a list of rows: ``_HEADER`` followed by one row per
    (flagged phrase, matching article) pair, numbered from 1.
    """
    articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
    if not os.path.exists(articles_full_path):
        # Build the enriched CSV on first use.
        make_wechat_articles_full()
    df = pd.read_csv(articles_full_path)
    df['content'] = df['content'].fillna('')
    output_data = []
    index = 1
    for ind, row in df_s.iterrows():
        # regex=False: the flagged phrases are literal text; regex
        # metacharacters inside them must not be interpreted.
        mask = df['content'].str.contains(row['错误表述'], regex=False)
        result = df[mask]
        if not result.empty:
            for ind2, row2 in result.iterrows():
                output_data.append([
                    index,
                    row2['nickname'],
                    row2['title'],
                    row['错误表述'],
                    row['建议修改词语'],
                    row['错误分类'],
                    row2['content_url'],
                ])
                index += 1
    output_data.insert(0, _HEADER)
    return output_data


def ana_web():
    """Search every crawled web-page spreadsheet in ``web_dir`` for the
    flagged phrases; same row format as :func:`ana_wechat`."""
    output_data = []
    index = 1
    for file in os.listdir(web_dir):
        full_path = os.path.join(web_dir, file)
        # Skip empty placeholder files.
        if os.path.getsize(full_path) > 0:
            df = pd.read_excel(full_path)
            for ind, row in df_s.iterrows():
                mask = df['text'].str.contains(row['错误表述'], na=False, regex=False)
                result = df[mask]
                if not result.empty:
                    for ind2, row2 in result.iterrows():
                        output_data.append([
                            index,
                            row2['name'],
                            "文章标题",  # page title is not crawled; fixed placeholder
                            row['错误表述'],
                            row['建议修改词语'],
                            row['错误分类'],
                            row2['content_url'],
                        ])
                        index += 1
    output_data.insert(0, _HEADER)
    return output_data


# Run WeChat analysis.
wechat_results = ana_wechat()

# Run web-content analysis.
web_results = ana_web()

# Save results in an Excel file with two sheets.
# BUG FIX: ana_wechat()/ana_web() return plain row lists, which have no
# .to_excel(); convert them to DataFrames (first row is the header) first.
os.makedirs(output_dir, exist_ok=True)  # 'summary' may not exist yet
output_excel_path = os.path.join(output_dir, '总院及下属公司官方公众号巡查结果汇总表.xlsx')
with pd.ExcelWriter(output_excel_path) as writer:
    pd.DataFrame(wechat_results[1:], columns=wechat_results[0]).to_excel(
        writer, sheet_name='公众号', index=False)
    pd.DataFrame(web_results[1:], columns=web_results[0]).to_excel(
        writer, sheet_name='网站', index=False)
print("Analysis completed and results saved to Excel.")