From 32e093479205ac0d71d95dde7dceb68c92831a88 Mon Sep 17 00:00:00 2001
From: xiaobulu27
Date: Thu, 24 Aug 2023 17:06:02 +0800
Subject: [PATCH 1/2] =?UTF-8?q?selenium=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scrape.py | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/scrape.py b/scrape.py
index 574263f..a593747 100644
--- a/scrape.py
+++ b/scrape.py
@@ -138,24 +138,29 @@ def main():
         name = row['主办']
         url = row['地址']
         domain = urlparse(url).netloc
-        # Open the website
-        driver = open_website(url)
-        # Retrieve cookies from previous session
-        cookies = get_cookies_from_previous_session(driver)
-        # Add cookies to the WebDriver
-        add_cookies(driver, cookies)
-        # Initialize the set to store visited pages
-        visited_pages = set()
-        # Initialize the data list
-        data = []
-        # Process the starting page and follow hyperlinks recursively
-        process_page(driver, url, visited_pages, domain, data)
-        # Export data to a separate Excel file for each URL
-        output_filename = f'web_dir/{name}_{domain}.xlsx'
-        export_to_excel(data, output_filename)
-        # Close the WebDriver
-        driver.quit()
+        # Open the website
+        driver = open_website(url)
+
+        # Retrieve cookies from previous session
+        cookies = get_cookies_from_previous_session(driver)
+        # Add cookies to the WebDriver
+        add_cookies(driver, cookies)
+
+        # Initialize the set to store visited pages
+        visited_pages = set()
+        # Initialize the data list
+        data = []
+
+        # Process the starting page and follow hyperlinks recursively
+        process_page(driver, url, visited_pages, domain, data)
+
+        # Export data to a separate Excel file in the web_dir directory
+        output_filename = f'web_dir/{name}_{domain}.xlsx'
+        export_to_excel(data, output_filename)
+
+        # Close the WebDriver
+        driver.quit()
 
 
 if __name__ == "__main__":
     main()

From 9a8afacbfb4f33bf01714d451aa876078341f08d Mon Sep 17 00:00:00 2001
From: xiaobulu27
Date: Thu, 24 Aug 2023 17:07:18 +0800
Subject: [PATCH 2/2] =?UTF-8?q?=E5=AF=B9=E6=AF=94=E8=BE=93=E5=87=BA?=
 =?UTF-8?q?=E5=88=B0summary?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 main.py | 99 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 58 insertions(+), 41 deletions(-)

diff --git a/main.py b/main.py
index 1ce8fa4..bff089e 100644
--- a/main.py
+++ b/main.py
@@ -1,11 +1,11 @@
 import pandas as pd
 import os
 import html2text
-import sys
 
 current_dir = os.getcwd()
 wechat_dir = os.path.join(current_dir, 'wechat_dir')
 web_dir = os.path.join(current_dir, 'web_dir')
+output_dir = os.path.join(current_dir, 'summary')
 df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
 
 def trans_to_json():
@@ -14,9 +14,8 @@ def trans_to_json():
     f.write(json_str)
 
 def make_wechat_articles_full():
-    df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'), encoding='gb18030')
+    df = pd.read_excel(os.path.join(wechat_dir, 'articles.xlsx'))
     df['content'] = ''
-    ind = 0
     for ind, row in df.iterrows():
         full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.html')
         try:
@@ -27,7 +26,6 @@ def make_wechat_articles_full():
             print(f'{ind}--{row["nickname"]}--{row["title"]}')
         except:
             print(full_path + '---不存在')
-        ind +=1
     output_path = os.path.join(wechat_dir, 'articles_full.csv')
     df.to_csv(output_path, encoding='utf-8_sig')
 
@@ -35,56 +33,75 @@ def ana_wechat():
     articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
     if not os.path.exists(articles_full_path):
         make_wechat_articles_full()
-    df_a = pd.DataFrame(columns = ['公众号', '标题', '地址', '错误表述', '建议修改词语', '错误分类'])
+
     df = pd.read_csv(articles_full_path)
     df['content'] = df['content'].fillna('')
-    ind = 0
-    need_save = False
+
+    output_data = []
+    index = 1
+
     for ind, row in df_s.iterrows():
         mask = df['content'].str.contains(row['错误表述'])
         result = df[mask]
-        if result.empty:
-            continue
-        ind2 = 0
-        for ind2, row2 in result.iterrows():
-            alist = [row2['nickname'], row2['title'], row2['content_url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
-            print(alist)
-            df_a.loc[len(df_a.index)] = alist
-            if need_save is False:
-                need_save = True
-            ind2 +=1
-        ind +=1
-    if need_save:
-        df_a.to_csv('ana_wechat.csv', encoding='utf-8_sig')
+
+        if not result.empty:
+            for ind2, row2 in result.iterrows():
+                output_row = [
+                    index,
+                    row2['nickname'],
+                    row2['title'],
+                    row['错误表述'],
+                    row['建议修改词语'],
+                    row['错误分类'],
+                    row2['content_url']
+                ]
+                output_data.append(output_row)
+                index += 1
+
+    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+
+    return output_data
+
 def ana_web():
-    df_a = pd.DataFrame(columns = ['单位', '主办', '地址', '错误表述', '建议修改词语', '错误分类'])
-    need_save = False
+    output_data = []
+    index = 1
+
     for file in os.listdir(web_dir):
         full_path = os.path.join(web_dir, file)
         if os.path.getsize(full_path) > 0:
             df = pd.read_excel(os.path.join(web_dir, file))
-            ind = 0
             for ind, row in df_s.iterrows():
                 mask = df['text'].str.contains(row['错误表述'], na=False)
                 result = df[mask]
-                if result.empty:
-                    continue
-                ind2 = 0
-                for ind2, row2 in result.iterrows():
-                    alist = [row2['group'], row2['name'], row2['url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
-                    print(alist)
-                    df_a.loc[len(df_a.index)] = alist
-                    if need_save is False:
-                        need_save = True
-                    ind2 +=1
-                ind +=1
-            if need_save:
-                df_a.to_csv('ana_web.csv', encoding='utf-8_sig')
+                if not result.empty:
+                    for ind2, row2 in result.iterrows():
+                        output_row = [
+                            index,
+                            row2['name'],
+                            "文章标题",
+                            row['错误表述'],
+                            row['建议修改词语'],
+                            row['错误分类'],
+                            row2['content_url']
+                        ]
+                        output_data.append(output_row)
+                        index += 1
 
-if __name__ == "__main__":
-    if len(sys.argv) > 1 and sys.argv[1] == 'wechat':
-        ana_wechat()
-    else:
-        ana_web()
+    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    return output_data
+
+# Run WeChat Analysis
+wechat_results = ana_wechat()
+
+# Run Web Content Analysis
+web_results = ana_web()
+
+# Save results in an Excel file with two sheets
+output_excel_path = os.path.join(output_dir, '总院及下属公司官方公众号巡查结果汇总表.xlsx')
+with pd.ExcelWriter(output_excel_path) as writer:
+    wechat_results.to_excel(writer, sheet_name='公众号', index=False)
+    web_results.to_excel(writer, sheet_name='网站', index=False)

+print("Analysis completed and results saved to Excel.")
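
Review note on PATCH 2/2 (the encoded subject decodes to "对比输出到summary", i.e. "write the comparison output to summary"): ana_wechat() and ana_web() now return plain Python lists whose first element is the header row prepended via output_data.insert(0, ...), but the new module-level code calls .to_excel() on those lists, which is a pandas DataFrame method, and nothing in these two patches creates the summary directory that output_dir points at. The sketch below is one possible shape for the export step given those observations; it is only a sketch, not part of the patch, and the helper name save_summary plus the choice to turn the prepended header row into DataFrame column names are illustrative assumptions.

    import os
    import pandas as pd

    def save_summary(wechat_results, web_results, output_dir):
        """Write the row lists returned by ana_wechat()/ana_web() into one Excel file.

        Assumes each list starts with the header row that the patch prepends via
        output_data.insert(0, [...]); that first row is split off and used as the
        column names, and the remaining rows become the sheet body.
        """
        # Create the summary directory if it does not exist yet.
        os.makedirs(output_dir, exist_ok=True)

        # Convert each list of rows (header first) into a DataFrame so .to_excel() is available.
        wechat_df = pd.DataFrame(wechat_results[1:], columns=wechat_results[0])
        web_df = pd.DataFrame(web_results[1:], columns=web_results[0])

        # Same target path and sheet names as the patch's module-level code.
        output_excel_path = os.path.join(output_dir, '总院及下属公司官方公众号巡查结果汇总表.xlsx')
        with pd.ExcelWriter(output_excel_path) as writer:
            wechat_df.to_excel(writer, sheet_name='公众号', index=False)
            web_df.to_excel(writer, sheet_name='网站', index=False)

In main.py this would replace the final ExcelWriter block, e.g. save_summary(ana_wechat(), ana_web(), output_dir). Separately, the rewritten ana_web() loop reads row2['content_url'] and hard-codes the string "文章标题" as the title, while the pre-patch code read row2['name'] and row2['url'] from the workbooks scrape.py writes into web_dir, so those two column references are worth double-checking against the actual web_dir sheets.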