feat: improve functionality

caoqianming 2023-11-11 10:07:24 +08:00
parent 1d1d5325cc
commit abf9267d08
5 changed files with 32 additions and 20 deletions

View File

@@ -4,12 +4,20 @@ import sqlite3
from mycode.base import BASE_DIR
import re
from openpyxl import load_workbook
from urllib.parse import urlparse
wechat_dir = os.path.join(BASE_DIR, 'article')
web_dir = os.path.join(BASE_DIR, 'web_dir')
output_dir = os.path.join(BASE_DIR, 'summary')
df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
def fix_url_scheme(url, default_scheme='http'):
# Check whether the URL already includes a scheme
if not url.startswith('http://') and not url.startswith('https://'):
# If not, prepend the default scheme
url = f'{default_scheme}://{url}'
return url
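# A minimal usage sketch of the new helper; the sample addresses below are assumptions, not source data:
assert fix_url_scheme('example.com') == 'http://example.com'
assert fix_url_scheme('https://example.com') == 'https://example.com'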
def trans_to_json():
json_str = df_s.to_json(orient='records', force_ascii=False)
with open('biao.json', 'w', encoding='utf-8') as f:
@@ -162,7 +170,7 @@ def ana_wechat():
]
output_data.append(output_row)
index += 1
print(f'找到公众号问题{index}---{row2["nickname"]}')
# output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
return output_data
@@ -171,12 +179,20 @@ def ana_wechat():
def ana_web():
output_data = []
index = 1
for file in os.listdir(web_dir):
full_path = os.path.join(web_dir, file)
if '$' in full_path:
continue
print(full_path)
if os.path.getsize(full_path) > 0:
# for file in os.listdir(web_dir):
# full_path = os.path.join(web_dir, file)
# if '$' in full_path:
# continue
# print(full_path)
# Only analyze the sites listed in web_sites.xlsx
df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = fix_url_scheme(row['地址'].strip())
domain = urlparse(url).netloc.replace('www.', '')
full_path = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
if os.path.exists(full_path) and os.path.getsize(full_path) > 0:
df = pd.read_excel(full_path, engine='openpyxl')
for ind, row in df_s.iterrows():
mask = df['text'].str.contains(row['错误表述'], na=False)
@@ -198,7 +214,7 @@ def ana_web():
]
output_data.append(output_row)
index += 1
print(f'找到官网问题{index}---{row2["name"]}')
# output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
return output_data
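# A rough sketch of the mask-based screening above, using made-up rows; the real frames
# come from the crawled xlsx files and biao.xlsx, and the column names here are assumptions:
sample = pd.DataFrame({'text': ['clean paragraph', 'paragraph with a flagged phrase'],
                       'link': ['http://a.example', 'http://b.example']})
sample_mask = sample['text'].str.contains('flagged phrase', na=False)  # na=False: empty cells never match
for _, row2 in sample[sample_mask].iterrows():
    print(row2['link'])  # only the matching row is reported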

View File

@@ -223,7 +223,7 @@ class MainWindow(QMainWindow):
# os.startfile(path)
# except Exception as e:
# print("无法打开文件:", str(e))
if type == 'docs':
if type == 'docx':
app = win32.Dispatch("Word.Application")
app.Visible = True
app.Documents.Open(path)
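# A hedged sketch combining the new 'docx' branch with the commented-out os.startfile fallback
# above; assumes `win32` is `win32com.client` and `path` points to an existing .docx file:
try:
    app = win32.Dispatch("Word.Application")
    app.Visible = True
    app.Documents.Open(path)
except Exception as e:
    print("Failed to open the file via Word:", str(e))
    os.startfile(path)  # Windows-only fallback to the system default handler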

View File

@@ -4,7 +4,8 @@
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
from scrapy.http import HtmlResponse
from scrapy.http import TextResponse
from scrapy.exceptions import IgnoreRequest
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
@@ -104,11 +105,10 @@ class ZcspiderDownloaderMiddleware:
spider.logger.info("Spider opened: %s" % spider.name)
class FilterHTMLMiddleware:
class FilterTextMiddleware:
def process_response(self, request, response, spider):
if isinstance(response, HtmlResponse):
if isinstance(response, TextResponse):
# Only accept HTML responses
return response
else:
# Ignore other types of resource files
return request
# Ignore other types of resource files
raise IgnoreRequest()
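# A quick sanity-check sketch of the new middleware behaviour, using synthetic responses
# (the URLs and bodies below are illustrative assumptions):
from scrapy.http import Response
mw = FilterTextMiddleware()
text_resp = TextResponse(url='http://example.com/page', body=b'<html>ok</html>', encoding='utf-8')
binary_resp = Response(url='http://example.com/file.pdf', body=b'%PDF-1.4')
assert mw.process_response(None, text_resp, None) is text_resp  # text responses pass through
try:
    mw.process_response(None, binary_resp, None)  # non-text responses are dropped
except IgnoreRequest:
    pass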

View File

@@ -108,7 +108,7 @@ FEED_EXPORTERS = {
}
DOWNLOADER_MIDDLEWARES = {
'zcspider.middlewares.FilterHTMLMiddleware': 200,
'zcspider.middlewares.FilterTextMiddleware': 200,
# Other downloader middlewares...
}
EXTENSIONS = {

View File

@@ -59,8 +59,6 @@ class BaseSpider(scrapy.Spider):
rurl = fail.request.url
self.logger.info(f'{rurl} 使用requests继续请求')
r = requests.get(url=fail.request.url, headers=self.headers, timeout=20)
if self.is_file_res(r):
return
if r.status_code < 400:
rtext = r.text
h = html2text.HTML2Text()
@@ -86,8 +84,6 @@
def parse(self, response):
if response.status >= 500:
return
if self.is_file_res(response):
return
h = html2text.HTML2Text()
h.ignore_links = True # Ignore all links
# Extract the plain-text content
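# For reference, a minimal sketch of the html2text extraction used here;
# the sample markup below is an assumption:
import html2text
ht = html2text.HTML2Text()
ht.ignore_links = True  # drop hyperlink targets, keep the anchor text
print(ht.handle('<p>Notice: see the <a href="/x">policy page</a></p>'))
# roughly: "Notice: see the policy page"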