caoqianming 2023-08-24 17:16:51 +08:00
commit d2a1dd165d
2 changed files with 79 additions and 57 deletions

main.py

@@ -1,12 +1,11 @@
 import pandas as pd
 import os
-import html2text
-import sys
 import sqlite3
 
 current_dir = os.getcwd()
 wechat_dir = os.path.join(current_dir, 'article')
 web_dir = os.path.join(current_dir, 'web_dir')
+output_dir = os.path.join(current_dir, 'summary')
 df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
 
 def trans_to_json():
@@ -27,7 +26,6 @@ def make_simple_csv_from_db():
 def make_wechat_articles_full():
     df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
     df['content'] = ''
-    ind = 0
     for ind, row in df.iterrows():
         full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.md')
         try:
@@ -35,7 +33,6 @@ def make_wechat_articles_full():
             df.at[ind, 'content'] = f.read()
         except FileNotFoundError:
             print(full_path + '---不存在')
-        ind +=1
     output_path = os.path.join(wechat_dir, 'articles_full.csv')
     df.to_csv(output_path)
 
@@ -43,55 +40,75 @@ def ana_wechat():
     articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
     if not os.path.exists(articles_full_path):
         make_wechat_articles_full()
-    df_a = pd.DataFrame(columns = ['公众号', '标题', '地址', '错误表述', '建议修改词语', '错误分类'])
     df = pd.read_csv(articles_full_path)
     df['content'] = df['content'].fillna('')
-    ind = 0
-    need_save = False
+    output_data = []
+    index = 1
     for ind, row in df_s.iterrows():
         mask = df['content'].str.contains(row['错误表述'])
         result = df[mask]
-        if result.empty:
-            continue
-        ind2 = 0
-        for ind2, row2 in result.iterrows():
-            alist = [row2['nickname'], row2['title'], row2['content_url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
-            df_a.loc[len(df_a.index)] = alist
-            if need_save is False:
-                need_save = True
-            ind2 +=1
-        ind +=1
-    if need_save:
-        df_a.to_csv('ana_wechat.csv', encoding='utf-8_sig')
+        if not result.empty:
+            for ind2, row2 in result.iterrows():
+                output_row = [
+                    index,
+                    row2['nickname'],
+                    row2['title'],
+                    row['错误表述'],
+                    row['建议修改词语'],
+                    row['错误分类'],
+                    row2['content_url']
+                ]
+                output_data.append(output_row)
+                index += 1
+    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    return output_data
 
 def ana_web():
-    df_a = pd.DataFrame(columns = ['单位', '主办', '地址', '错误表述', '建议修改词语', '错误分类'])
-    need_save = False
+    output_data = []
+    index = 1
     for file in os.listdir(web_dir):
         full_path = os.path.join(web_dir, file)
         if os.path.getsize(full_path) > 0:
             df = pd.read_excel(os.path.join(web_dir, file))
-            ind = 0
             for ind, row in df_s.iterrows():
                 mask = df['text'].str.contains(row['错误表述'], na=False)
                 result = df[mask]
-                if result.empty:
-                    continue
-                ind2 = 0
-                for ind2, row2 in result.iterrows():
-                    alist = [row2['group'], row2['name'], row2['url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
-                    print(alist)
-                    df_a.loc[len(df_a.index)] = alist
-                    if need_save is False:
-                        need_save = True
-                    ind2 +=1
-                ind +=1
-    if need_save:
-        df_a.to_csv('ana_web.csv', encoding='utf-8_sig')
+                if not result.empty:
+                    for ind2, row2 in result.iterrows():
+                        output_row = [
+                            index,
+                            row2['name'],
+                            "文章标题",
+                            row['错误表述'],
+                            row['建议修改词语'],
+                            row['错误分类'],
+                            row2['content_url']
+                        ]
+                        output_data.append(output_row)
+                        index += 1
+    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    return output_data
 
-if __name__ == "__main__":
-    if len(sys.argv) > 1 and sys.argv[1] == 'wechat':
-        ana_wechat()
-    else:
-        ana_web()
+# Run WeChat Analysis
+wechat_results = ana_wechat()
+
+# Run Web Content Analysis
+web_results = ana_web()
+
+# Save results in an Excel file with two sheets
+output_excel_path = os.path.join(output_dir, '总院及下属公司官方公众号巡查结果汇总表.xlsx')
+with pd.ExcelWriter(output_excel_path) as writer:
+    wechat_results.to_excel(writer, sheet_name='公众号', index=False)
+    web_results.to_excel(writer, sheet_name='网站', index=False)
+
+print("Analysis completed and results saved to Excel.")

second changed file (name not shown in this view)

@@ -138,24 +138,29 @@ def main():
         name = row['主办']
         url = row['地址']
         domain = urlparse(url).netloc
-        # Open the website
-        driver = open_website(url)
-        # Retrieve cookies from previous session
-        cookies = get_cookies_from_previous_session(driver)
-        # Add cookies to the WebDriver
-        add_cookies(driver, cookies)
-        # Initialize the set to store visited pages
-        visited_pages = set()
-        # Initialize the data list
-        data = []
-        # Process the starting page and follow hyperlinks recursively
-        process_page(driver, url, visited_pages, domain, data)
-        # Export data to a separate Excel file for each URL
-        output_filename = f'web_dir/{name}_{domain}.xlsx'
-        export_to_excel(data, output_filename)
-        # Close the WebDriver
-        driver.quit()
+        # Open the website
+        driver = open_website(url)
+
+        # Retrieve cookies from previous session
+        cookies = get_cookies_from_previous_session(driver)
+
+        # Add cookies to the WebDriver
+        add_cookies(driver, cookies)
+
+        # Initialize the set to store visited pages
+        visited_pages = set()
+
+        # Initialize the data list
+        data = []
+
+        # Process the starting page and follow hyperlinks recursively
+        process_page(driver, url, visited_pages, domain, data)
+
+        # Export data to a separate Excel file in the web_dir directory
+        output_filename = f'web_dir/{name}_{domain}.xlsx'
+        export_to_excel(data, output_filename)
+
+        # Close the WebDriver
+        driver.quit()
 
 if __name__ == "__main__":
     main()
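
The main() loop above depends on five helpers whose bodies are outside this hunk: open_website, get_cookies_from_previous_session, add_cookies, process_page, and export_to_excel. Below is a minimal sketch of what such helpers could look like with Selenium and pandas, written only from the call sites; every body here is an assumption, not the project's actual implementation:

    from urllib.parse import urlparse

    import pandas as pd
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    def open_website(url):
        # Assumption: a fresh Chrome session pointed at the start page.
        driver = webdriver.Chrome()
        driver.get(url)
        return driver

    def get_cookies_from_previous_session(driver):
        # Assumption: reuse the cookies the current session already holds;
        # a real implementation would likely load them from disk.
        return driver.get_cookies()

    def add_cookies(driver, cookies):
        for cookie in cookies:
            driver.add_cookie(cookie)

    def process_page(driver, url, visited_pages, domain, data):
        # Visit each page once, record its visible text (ana_web() filters
        # on a 'text' column), then follow same-domain links recursively.
        if url in visited_pages:
            return
        visited_pages.add(url)
        driver.get(url)
        data.append({'url': url, 'text': driver.find_element(By.TAG_NAME, 'body').text})
        # Collect hrefs before navigating away, to avoid stale element references.
        hrefs = [a.get_attribute('href') for a in driver.find_elements(By.TAG_NAME, 'a')]
        for href in hrefs:
            if href and urlparse(href).netloc == domain:
                process_page(driver, href, visited_pages, domain, data)

    def export_to_excel(data, output_filename):
        # A list of dicts converts directly to a DataFrame; each dict key
        # becomes a column, matching the read_excel(...)['text'] lookup.
        pd.DataFrame(data).to_excel(output_filename, index=False)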