From 32e093479205ac0d71d95dde7dceb68c92831a88 Mon Sep 17 00:00:00 2001
From: xiaobulu27
Date: Thu, 24 Aug 2023 17:06:02 +0800
Subject: [PATCH 1/2] =?UTF-8?q?selenium=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scrape.py | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/scrape.py b/scrape.py
index 574263f..a593747 100644
--- a/scrape.py
+++ b/scrape.py
@@ -138,24 +138,29 @@ def main():
         name = row['主办']
         url = row['地址']
         domain = urlparse(url).netloc
-        # Open the website
-        driver = open_website(url)
-        # Retrieve cookies from previous session
-        cookies = get_cookies_from_previous_session(driver)
-        # Add cookies to the WebDriver
-        add_cookies(driver, cookies)
-        # Initialize the set to store visited pages
-        visited_pages = set()
-        # Initialize the data list
-        data = []
-        # Process the starting page and follow hyperlinks recursively
-        process_page(driver, url, visited_pages, domain, data)
-        # Export data to a separate Excel file for each URL
-        output_filename = f'web_dir/{name}_{domain}.xlsx'
-        export_to_excel(data, output_filename)
-        # Close the WebDriver
-        driver.quit()
+        # Open the website
+        driver = open_website(url)
+
+        # Retrieve cookies from previous session
+        cookies = get_cookies_from_previous_session(driver)
+        # Add cookies to the WebDriver
+        add_cookies(driver, cookies)
+
+        # Initialize the set to store visited pages
+        visited_pages = set()
+        # Initialize the data list
+        data = []
+
+        # Process the starting page and follow hyperlinks recursively
+        process_page(driver, url, visited_pages, domain, data)
+
+        # Export data to a separate Excel file in the web_dir directory
+        output_filename = f'web_dir/{name}_{domain}.xlsx'
+        export_to_excel(data, output_filename)
+
+        # Close the WebDriver
+        driver.quit()
 
 
 if __name__ == "__main__":
     main()

From 9a8afacbfb4f33bf01714d451aa876078341f08d Mon Sep 17 00:00:00 2001
From: xiaobulu27
Date: Thu, 24 Aug 2023 17:07:18 +0800
Subject: [PATCH 2/2] =?UTF-8?q?=E5=AF=B9=E6=AF=94=E8=BE=93=E5=87=BA?=
 =?UTF-8?q?=E5=88=B0summary?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 main.py | 99 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 58 insertions(+), 41 deletions(-)

diff --git a/main.py b/main.py
index 1ce8fa4..bff089e 100644
--- a/main.py
+++ b/main.py
@@ -1,11 +1,11 @@
 import pandas as pd
 import os
 import html2text
-import sys
 
 current_dir = os.getcwd()
 wechat_dir = os.path.join(current_dir, 'wechat_dir')
 web_dir = os.path.join(current_dir, 'web_dir')
+output_dir = os.path.join(current_dir, 'summary')
 df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
 
 def trans_to_json():
@@ -14,9 +14,8 @@ def trans_to_json():
     f.write(json_str)
 
 def make_wechat_articles_full():
-    df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'), encoding='gb18030')
+    df = pd.read_excel(os.path.join(wechat_dir, 'articles.xlsx'))
     df['content'] = ''
-    ind = 0
     for ind, row in df.iterrows():
         full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.html')
         try:
@@ -27,7 +26,6 @@ def make_wechat_articles_full():
             print(f'{ind}--{row["nickname"]}--{row["title"]}')
         except:
             print(full_path + '---不存在')
-        ind +=1
     output_path = os.path.join(wechat_dir, 'articles_full.csv')
     df.to_csv(output_path, encoding='utf-8_sig')
 
@@ -35,56 +33,75 @@ def ana_wechat():
     articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
     if not os.path.exists(articles_full_path):
         make_wechat_articles_full()
-    df_a = pd.DataFrame(columns = ['公众号', '标题', '地址', '错误表述', '建议修改词语', '错误分类'])
+
     df = pd.read_csv(articles_full_path)
     df['content'] = df['content'].fillna('')
-    ind = 0
-    need_save = False
+
+    output_data = []
+    index = 1
+
     for ind, row in df_s.iterrows():
         mask = df['content'].str.contains(row['错误表述'])
         result = df[mask]
-        if result.empty:
-            continue
-        ind2 = 0
-        for ind2, row2 in result.iterrows():
-            alist = [row2['nickname'], row2['title'], row2['content_url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
-            print(alist)
-            df_a.loc[len(df_a.index)] = alist
-            if need_save is False:
-                need_save = True
-            ind2 +=1
-        ind +=1
-    if need_save:
-        df_a.to_csv('ana_wechat.csv', encoding='utf-8_sig')
+
+        if not result.empty:
+            for ind2, row2 in result.iterrows():
+                output_row = [
+                    index,
+                    row2['nickname'],
+                    row2['title'],
+                    row['错误表述'],
+                    row['建议修改词语'],
+                    row['错误分类'],
+                    row2['content_url']
+                ]
+                output_data.append(output_row)
+                index += 1
+
+    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+
+    return output_data
+
 def ana_web():
-    df_a = pd.DataFrame(columns = ['单位', '主办', '地址', '错误表述', '建议修改词语', '错误分类'])
-    need_save = False
+    output_data = []
+    index = 1
+
     for file in os.listdir(web_dir):
         full_path = os.path.join(web_dir, file)
         if os.path.getsize(full_path) > 0:
             df = pd.read_excel(os.path.join(web_dir, file))
-            ind = 0
             for ind, row in df_s.iterrows():
                 mask = df['text'].str.contains(row['错误表述'], na=False)
                 result = df[mask]
-                if result.empty:
-                    continue
-                ind2 = 0
-                for ind2, row2 in result.iterrows():
-                    alist = [row2['group'], row2['name'], row2['url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
-                    print(alist)
-                    df_a.loc[len(df_a.index)] = alist
-                    if need_save is False:
-                        need_save = True
-                    ind2 +=1
-                ind +=1
-            if need_save:
-                df_a.to_csv('ana_web.csv', encoding='utf-8_sig')
+                if not result.empty:
+                    for ind2, row2 in result.iterrows():
+                        output_row = [
+                            index,
+                            row2['name'],
+                            "文章标题",
+                            row['错误表述'],
+                            row['建议修改词语'],
+                            row['错误分类'],
+                            row2['content_url']
+                        ]
+                        output_data.append(output_row)
+                        index += 1
 
-if __name__ == "__main__":
-    if len(sys.argv) > 1 and sys.argv[1] == 'wechat':
-        ana_wechat()
-    else:
-        ana_web()
+    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    return output_data
+
+# Run WeChat Analysis
+wechat_results = ana_wechat()
+
+# Run Web Content Analysis
+web_results = ana_web()
+
+# Save results in an Excel file with two sheets
+output_excel_path = os.path.join(output_dir, '总院及下属公司官方公众号巡查结果汇总表.xlsx')
+with pd.ExcelWriter(output_excel_path) as writer:
+    wechat_results.to_excel(writer, sheet_name='公众号', index=False)
+    web_results.to_excel(writer, sheet_name='网站', index=False)

+print("Analysis completed and results saved to Excel.")
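
Review note on PATCH 2/2 (the encoded subject decodes to "对比输出到summary", i.e. "write the comparison output to summary"): ana_wechat() and ana_web() now return plain Python lists whose first element is the header row prepended via output_data.insert(0, ...), but the new module-level code calls .to_excel() on those lists, which is a pandas DataFrame method, and nothing in these two patches creates the summary directory that output_dir points at. The sketch below is one possible shape for the export step given those observations; it is only a sketch, not part of the patch, and the helper name save_summary plus the choice to turn the prepended header row into DataFrame column names are illustrative assumptions.

    import os
    import pandas as pd

    def save_summary(wechat_results, web_results, output_dir):
        """Write the row lists returned by ana_wechat()/ana_web() into one Excel file.

        Assumes each list starts with the header row that the patch prepends via
        output_data.insert(0, [...]); that first row is split off and used as the
        column names, and the remaining rows become the sheet body.
        """
        # Create the summary directory if it does not exist yet.
        os.makedirs(output_dir, exist_ok=True)

        # Convert each list of rows (header first) into a DataFrame so .to_excel() is available.
        wechat_df = pd.DataFrame(wechat_results[1:], columns=wechat_results[0])
        web_df = pd.DataFrame(web_results[1:], columns=web_results[0])

        # Same target path and sheet names as the patch's module-level code.
        output_excel_path = os.path.join(output_dir, '总院及下属公司官方公众号巡查结果汇总表.xlsx')
        with pd.ExcelWriter(output_excel_path) as writer:
            wechat_df.to_excel(writer, sheet_name='公众号', index=False)
            web_df.to_excel(writer, sheet_name='网站', index=False)

In main.py this would replace the final ExcelWriter block, e.g. save_summary(ana_wechat(), ana_web(), output_dir). Separately, the rewritten ana_web() loop reads row2['content_url'] and hard-codes the string "文章标题" as the title, while the pre-patch code read row2['name'] and row2['url'] from the workbooks scrape.py writes into web_dir, so those two column references are worth double-checking against the actual web_dir sheets.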