From abf9267d086f70319a1c40e5e64164fadca60d74 Mon Sep 17 00:00:00 2001
From: caoqianming
Date: Sat, 11 Nov 2023 10:07:24 +0800
Subject: [PATCH] feat: improve functionality
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mycode/main.py           | 32 ++++++++++++++++++++++++--------
 start.py                 |  2 +-
 zcspider/middlewares.py  | 12 ++++++------
 zcspider/settings.py     |  2 +-
 zcspider/spiders/base.py |  4 ----
 5 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/mycode/main.py b/mycode/main.py
index 4345c97..924442d 100644
--- a/mycode/main.py
+++ b/mycode/main.py
@@ -4,12 +4,20 @@ import sqlite3
 from mycode.base import BASE_DIR
 import re
 from openpyxl import load_workbook
+from urllib.parse import urlparse
 
 wechat_dir = os.path.join(BASE_DIR, 'article')
 web_dir = os.path.join(BASE_DIR, 'web_dir')
 output_dir = os.path.join(BASE_DIR, 'summary')
 df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
 
+def fix_url_scheme(url, default_scheme='http'):
+    # Check whether the URL already carries a scheme
+    if not url.startswith('http://') and not url.startswith('https://'):
+        # No scheme present, so prepend the default one
+        url = f'{default_scheme}://{url}'
+    return url
+
 def trans_to_json():
     json_str = df_s.to_json(orient='records', force_ascii=False)
     with open('biao.json', 'w', encoding='utf-8') as f:
@@ -162,7 +170,7 @@ def ana_wechat():
             ]
             output_data.append(output_row)
             index += 1
-
+            print(f'Found official-account issue {index}---{row2["nickname"]}')
     # output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
     return output_data
 
@@ -171,12 +179,20 @@ def ana_web():
 
     output_data = []
    index = 1
-    for file in os.listdir(web_dir):
-        full_path = os.path.join(web_dir, file)
-        if '$' in full_path:
-            continue
-        print(full_path)
-        if os.path.getsize(full_path) > 0:
+    # for file in os.listdir(web_dir):
+    #     full_path = os.path.join(web_dir, file)
+    #     if '$' in full_path:
+    #         continue
+    #     print(full_path)
+    # Only analyse the sites listed in web_sites.xlsx
+    df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
+    for ind, row in df.iterrows():
+        group = row['单位']
+        name = row['主办']
+        url = fix_url_scheme(row['地址'].strip())
+        domain = urlparse(url).netloc.replace('www.', '')
+        full_path = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
+        if os.path.exists(full_path) and os.path.getsize(full_path) > 0:
             df = pd.read_excel(os.path.join(full_path), engine='openpyxl')
             for ind, row in df_s.iterrows():
                 mask = df['text'].str.contains(row['错误表述'], na=False)
@@ -198,7 +214,7 @@ def ana_web():
             ]
             output_data.append(output_row)
             index += 1
-
+            print(f'Found website issue {index}---{row2["name"]}')
     # output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
     return output_data
 
diff --git a/start.py b/start.py
index f089e71..f575fdd 100644
--- a/start.py
+++ b/start.py
@@ -223,7 +223,7 @@ class MainWindow(QMainWindow):
         #     os.startfile(path)
         # except Exception as e:
         #     print("Could not open the file:", str(e))
-        if type == 'docs':
+        if type == 'docx':
             app = win32.Dispatch("Word.Application")
             app.Visible = True
             app.Documents.Open(path)
diff --git a/zcspider/middlewares.py b/zcspider/middlewares.py
index 4551b5a..c26445b 100644
--- a/zcspider/middlewares.py
+++ b/zcspider/middlewares.py
@@ -4,7 +4,8 @@
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 
 from scrapy import signals
-from scrapy.http import HtmlResponse
+from scrapy.http import TextResponse
+from scrapy.exceptions import IgnoreRequest
 
 # useful for handling different item types with a single interface
 from itemadapter import is_item, ItemAdapter
@@ -104,11 +105,10 @@ class ZcspiderDownloaderMiddleware:
 
         spider.logger.info("Spider opened: %s" % spider.name)
 
 
-class FilterHTMLMiddleware:
+class FilterTextMiddleware:
     def process_response(self, request, response, spider):
-        if isinstance(response, HtmlResponse):
+        if isinstance(response, TextResponse):  # only let text responses through
             return response
-        else:
-            # ignore other kinds of resource files
-            return request
\ No newline at end of file
+        # ignore other kinds of resource files
+        raise IgnoreRequest()
\ No newline at end of file
diff --git a/zcspider/settings.py b/zcspider/settings.py
index 9c604b8..b70d94e 100644
--- a/zcspider/settings.py
+++ b/zcspider/settings.py
@@ -108,7 +108,7 @@ FEED_EXPORTERS = {
 }
 DOWNLOADER_MIDDLEWARES = {
-    'zcspider.middlewares.FilterHTMLMiddleware': 200,
+    'zcspider.middlewares.FilterTextMiddleware': 200,
     # other downloader middlewares...
 }
 
 EXTENSIONS = {
diff --git a/zcspider/spiders/base.py b/zcspider/spiders/base.py
index 060c995..1597853 100644
--- a/zcspider/spiders/base.py
+++ b/zcspider/spiders/base.py
@@ -59,8 +59,6 @@ class BaseSpider(scrapy.Spider):
         rurl = fail.request.url
         self.logger.info(f'{rurl} continuing the request with requests')
         r = requests.get(url=fail.request.url, headers=self.headers, timeout=20)
-        if self.is_file_res(r):
-            return
         if r.status_code < 400:
             rtext = r.text
             h = html2text.HTML2Text()
@@ -86,8 +84,6 @@ class BaseSpider(scrapy.Spider):
     def parse(self, response):
         if response.status >= 500:
             return
-        if self.is_file_res(response):
-            return
         h = html2text.HTML2Text()
         h.ignore_links = True  # ignore all links
         # extract the plain-text content
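
A quick illustration of the URL handling this patch introduces in mycode/main.py:
fix_url_scheme() prepends a default scheme to bare hosts, and ana_web() then
derives each site's workbook name from the parsed netloc. This is a standalone
sketch; the sample URLs and the 'SomeSite' name are illustrative placeholders,
not values from web_sites.xlsx.

    from urllib.parse import urlparse

    def fix_url_scheme(url, default_scheme='http'):
        # Spreadsheet cells may hold bare hosts such as 'example.gov.cn';
        # prepend a scheme so urlparse() can recover the netloc.
        if not url.startswith('http://') and not url.startswith('https://'):
            url = f'{default_scheme}://{url}'
        return url

    name = 'SomeSite'  # placeholder for the 主办 column
    for raw in ('example.gov.cn/index.html', 'https://www.example.gov.cn'):
        url = fix_url_scheme(raw.strip())
        domain = urlparse(url).netloc.replace('www.', '')
        print(f'web_dir/{name}_{domain}.xlsx')  # web_dir/SomeSite_example.gov.cn.xlsx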
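The middleware rewrite swaps HtmlResponse for TextResponse and replaces
'return request' with 'raise IgnoreRequest()'. A sketch of the resulting
behaviour, assuming Scrapy's standard downloader-middleware contract: returning
a Request from process_response re-schedules the same URL (which can loop
indefinitely on binary files), while raising IgnoreRequest drops it cleanly.
The error message string here is illustrative.

    from scrapy.exceptions import IgnoreRequest
    from scrapy.http import TextResponse

    class FilterTextMiddleware:
        def process_response(self, request, response, spider):
            # TextResponse covers HTML, XML and plain text, so pages still
            # pass even when the server mislabels their Content-Type.
            if isinstance(response, TextResponse):
                return response
            # Drop binary responses (PDFs, images, archives) instead of
            # re-queueing them, which is what returning the request did.
            raise IgnoreRequest(f'non-text response: {request.url}')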