caoqianming 2023-08-24 17:16:51 +08:00
commit d2a1dd165d
2 changed files with 79 additions and 57 deletions

main.py

@@ -1,12 +1,11 @@
 import pandas as pd
 import os
-import html2text
-import sys
 import sqlite3
 
 current_dir = os.getcwd()
 wechat_dir = os.path.join(current_dir, 'article')
 web_dir = os.path.join(current_dir, 'web_dir')
+output_dir = os.path.join(current_dir, 'summary')
 df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
 
 def trans_to_json():
@@ -27,7 +26,6 @@ def make_simple_csv_from_db():
 def make_wechat_articles_full():
     df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
     df['content'] = ''
-    ind = 0
     for ind, row in df.iterrows():
         full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.md')
         try:
@@ -35,7 +33,6 @@ def make_wechat_articles_full():
             df.at[ind, 'content'] = f.read()
         except FileNotFoundError:
             print(full_path + '---不存在')
-        ind +=1
     output_path = os.path.join(wechat_dir, 'articles_full.csv')
     df.to_csv(output_path)
 
@@ -43,55 +40,75 @@ def ana_wechat():
     articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
     if not os.path.exists(articles_full_path):
         make_wechat_articles_full()
-    df_a = pd.DataFrame(columns = ['公众号', '标题', '地址', '错误表述', '建议修改词语', '错误分类'])
     df = pd.read_csv(articles_full_path)
     df['content'] = df['content'].fillna('')
-    ind = 0
-    need_save = False
+    output_data = []
+    index = 1
     for ind, row in df_s.iterrows():
         mask = df['content'].str.contains(row['错误表述'])
         result = df[mask]
-        if result.empty:
-            continue
-        ind2 = 0
-        for ind2, row2 in result.iterrows():
-            alist = [row2['nickname'], row2['title'], row2['content_url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
-            df_a.loc[len(df_a.index)] = alist
-            if need_save is False:
-                need_save = True
-            ind2 +=1
-        ind +=1
-    if need_save:
-        df_a.to_csv('ana_wechat.csv', encoding='utf-8_sig')
+        if not result.empty:
+            for ind2, row2 in result.iterrows():
+                output_row = [
+                    index,
+                    row2['nickname'],
+                    row2['title'],
+                    row['错误表述'],
+                    row['建议修改词语'],
+                    row['错误分类'],
+                    row2['content_url']
+                ]
+                output_data.append(output_row)
+                index += 1
+    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    return output_data
 
 def ana_web():
-    df_a = pd.DataFrame(columns = ['单位', '主办', '地址', '错误表述', '建议修改词语', '错误分类'])
-    need_save = False
+    output_data = []
+    index = 1
     for file in os.listdir(web_dir):
         full_path = os.path.join(web_dir, file)
         if os.path.getsize(full_path) > 0:
             df = pd.read_excel(os.path.join(web_dir, file))
-            ind = 0
             for ind, row in df_s.iterrows():
                 mask = df['text'].str.contains(row['错误表述'], na=False)
                 result = df[mask]
-                if result.empty:
-                    continue
-                ind2 = 0
-                for ind2, row2 in result.iterrows():
-                    alist = [row2['group'], row2['name'], row2['url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
-                    print(alist)
-                    df_a.loc[len(df_a.index)] = alist
-                    if need_save is False:
-                        need_save = True
-                    ind2 +=1
-                ind +=1
-    if need_save:
-        df_a.to_csv('ana_web.csv', encoding='utf-8_sig')
+                if not result.empty:
+                    for ind2, row2 in result.iterrows():
+                        output_row = [
+                            index,
+                            row2['name'],
+                            "文章标题",
+                            row['错误表述'],
+                            row['建议修改词语'],
+                            row['错误分类'],
+                            row2['content_url']
+                        ]
+                        output_data.append(output_row)
+                        index += 1
+    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    return output_data
 
-if __name__ == "__main__":
-    if len(sys.argv) > 1 and sys.argv[1] == 'wechat':
-        ana_wechat()
-    else:
-        ana_web()
+# Run WeChat Analysis
+wechat_results = ana_wechat()
+
+# Run Web Content Analysis
+web_results = ana_web()
+
+# Save results in an Excel file with two sheets
+output_excel_path = os.path.join(output_dir, '总院及下属公司官方公众号巡查结果汇总表.xlsx')
+with pd.ExcelWriter(output_excel_path) as writer:
+    wechat_results.to_excel(writer, sheet_name='公众号', index=False)
+    web_results.to_excel(writer, sheet_name='网站', index=False)
+
+print("Analysis completed and results saved to Excel.")

second changed file (name not shown in this view)

@@ -138,24 +138,29 @@ def main():
         name = row['主办']
         url = row['地址']
         domain = urlparse(url).netloc
-        # Open the website
-        driver = open_website(url)
-        # Retrieve cookies from previous session
-        cookies = get_cookies_from_previous_session(driver)
-        # Add cookies to the WebDriver
-        add_cookies(driver, cookies)
-        # Initialize the set to store visited pages
-        visited_pages = set()
-        # Initialize the data list
-        data = []
-        # Process the starting page and follow hyperlinks recursively
-        process_page(driver, url, visited_pages, domain, data)
-        # Export data to a separate Excel file for each URL
-        output_filename = f'web_dir/{name}_{domain}.xlsx'
-        export_to_excel(data, output_filename)
-        # Close the WebDriver
-        driver.quit()
+        # Open the website
+        driver = open_website(url)
+
+        # Retrieve cookies from previous session
+        cookies = get_cookies_from_previous_session(driver)
+
+        # Add cookies to the WebDriver
+        add_cookies(driver, cookies)
+
+        # Initialize the set to store visited pages
+        visited_pages = set()
+
+        # Initialize the data list
+        data = []
+
+        # Process the starting page and follow hyperlinks recursively
+        process_page(driver, url, visited_pages, domain, data)
+
+        # Export data to a separate Excel file in the web_dir directory
+        output_filename = f'web_dir/{name}_{domain}.xlsx'
+        export_to_excel(data, output_filename)
+
+        # Close the WebDriver
+        driver.quit()
 
 if __name__ == "__main__":
     main()
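
The main() loop above depends on five helpers whose bodies are outside this hunk: open_website, get_cookies_from_previous_session, add_cookies, process_page, and export_to_excel. Below is a minimal sketch of what such helpers could look like with Selenium and pandas, written only from the call sites; every body here is an assumption, not the project's actual implementation:

    from urllib.parse import urlparse

    import pandas as pd
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    def open_website(url):
        # Assumption: a fresh Chrome session pointed at the start page.
        driver = webdriver.Chrome()
        driver.get(url)
        return driver

    def get_cookies_from_previous_session(driver):
        # Assumption: reuse the cookies the current session already holds;
        # a real implementation would likely load them from disk.
        return driver.get_cookies()

    def add_cookies(driver, cookies):
        for cookie in cookies:
            driver.add_cookie(cookie)

    def process_page(driver, url, visited_pages, domain, data):
        # Visit each page once, record its visible text (ana_web() filters
        # on a 'text' column), then follow same-domain links recursively.
        if url in visited_pages:
            return
        visited_pages.add(url)
        driver.get(url)
        data.append({'url': url, 'text': driver.find_element(By.TAG_NAME, 'body').text})
        # Collect hrefs before navigating away, to avoid stale element references.
        hrefs = [a.get_attribute('href') for a in driver.find_elements(By.TAG_NAME, 'a')]
        for href in hrefs:
            if href and urlparse(href).netloc == domain:
                process_page(driver, href, visited_pages, domain, data)

    def export_to_excel(data, output_filename):
        # A list of dicts converts directly to a DataFrame; each dict key
        # becomes a column, matching the read_excel(...)['text'] lookup.
        pd.DataFrame(data).to_excel(output_filename, index=False)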