From abf9267d086f70319a1c40e5e64164fadca60d74 Mon Sep 17 00:00:00 2001
From: caoqianming
Date: Sat, 11 Nov 2023 10:07:24 +0800
Subject: [PATCH] feat: improve functionality
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mycode/main.py           | 32 ++++++++++++++++++++++++--------
 start.py                 |  2 +-
 zcspider/middlewares.py  | 12 ++++++------
 zcspider/settings.py     |  2 +-
 zcspider/spiders/base.py |  4 ----
 5 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/mycode/main.py b/mycode/main.py
index 4345c97..924442d 100644
--- a/mycode/main.py
+++ b/mycode/main.py
@@ -4,12 +4,20 @@ import sqlite3
 from mycode.base import BASE_DIR
 import re
 from openpyxl import load_workbook
+from urllib.parse import urlparse
 
 wechat_dir = os.path.join(BASE_DIR, 'article')
 web_dir = os.path.join(BASE_DIR, 'web_dir')
 output_dir = os.path.join(BASE_DIR, 'summary')
 df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
 
+def fix_url_scheme(url, default_scheme='http'):
+    # Check whether the URL already carries a scheme
+    if not url.startswith('http://') and not url.startswith('https://'):
+        # No scheme present, so prepend the default one
+        url = f'{default_scheme}://{url}'
+    return url
+
 def trans_to_json():
     json_str = df_s.to_json(orient='records', force_ascii=False)
     with open('biao.json', 'w', encoding='utf-8') as f:
@@ -162,7 +170,7 @@ def ana_wechat():
             ]
             output_data.append(output_row)
             index += 1
-
+            print(f'Found official-account issue {index}---{row2["nickname"]}')
     # output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
     return output_data
 
@@ -171,12 +179,20 @@ def ana_web():
 
     output_data = []
    index = 1
-    for file in os.listdir(web_dir):
-        full_path = os.path.join(web_dir, file)
-        if '$' in full_path:
-            continue
-        print(full_path)
-        if os.path.getsize(full_path) > 0:
+    # for file in os.listdir(web_dir):
+    #     full_path = os.path.join(web_dir, file)
+    #     if '$' in full_path:
+    #         continue
+    #     print(full_path)
+    # Only analyse the sites listed in web_sites.xlsx
+    df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
+    for ind, row in df.iterrows():
+        group = row['单位']
+        name = row['主办']
+        url = fix_url_scheme(row['地址'].strip())
+        domain = urlparse(url).netloc.replace('www.', '')
+        full_path = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
+        if os.path.exists(full_path) and os.path.getsize(full_path) > 0:
             df = pd.read_excel(os.path.join(full_path), engine='openpyxl')
             for ind, row in df_s.iterrows():
                 mask = df['text'].str.contains(row['错误表述'], na=False)
@@ -198,7 +214,7 @@ def ana_web():
             ]
             output_data.append(output_row)
             index += 1
-
+            print(f'Found website issue {index}---{row2["name"]}')
     # output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
     return output_data
 
diff --git a/start.py b/start.py
index f089e71..f575fdd 100644
--- a/start.py
+++ b/start.py
@@ -223,7 +223,7 @@ class MainWindow(QMainWindow):
         #     os.startfile(path)
         # except Exception as e:
         #     print("Could not open the file:", str(e))
-        if type == 'docs':
+        if type == 'docx':
             app = win32.Dispatch("Word.Application")
             app.Visible = True
             app.Documents.Open(path)
diff --git a/zcspider/middlewares.py b/zcspider/middlewares.py
index 4551b5a..c26445b 100644
--- a/zcspider/middlewares.py
+++ b/zcspider/middlewares.py
@@ -4,7 +4,8 @@
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 
 from scrapy import signals
-from scrapy.http import HtmlResponse
+from scrapy.http import TextResponse
+from scrapy.exceptions import IgnoreRequest
 
 # useful for handling different item types with a single interface
 from itemadapter import is_item, ItemAdapter
@@ -104,11 +105,10 @@ class ZcspiderDownloaderMiddleware:
 
         spider.logger.info("Spider opened: %s" % spider.name)
 
 
-class FilterHTMLMiddleware:
+class FilterTextMiddleware:
     def process_response(self, request, response, spider):
-        if isinstance(response, HtmlResponse):
+        if isinstance(response, TextResponse):  # only let text responses through
             return response
-        else:
-            # ignore other kinds of resource files
-            return request
\ No newline at end of file
+        # ignore other kinds of resource files
+        raise IgnoreRequest()
\ No newline at end of file
diff --git a/zcspider/settings.py b/zcspider/settings.py
index 9c604b8..b70d94e 100644
--- a/zcspider/settings.py
+++ b/zcspider/settings.py
@@ -108,7 +108,7 @@ FEED_EXPORTERS = {
 }
 DOWNLOADER_MIDDLEWARES = {
-    'zcspider.middlewares.FilterHTMLMiddleware': 200,
+    'zcspider.middlewares.FilterTextMiddleware': 200,
     # other downloader middlewares...
 }
 
 EXTENSIONS = {
diff --git a/zcspider/spiders/base.py b/zcspider/spiders/base.py
index 060c995..1597853 100644
--- a/zcspider/spiders/base.py
+++ b/zcspider/spiders/base.py
@@ -59,8 +59,6 @@ class BaseSpider(scrapy.Spider):
         rurl = fail.request.url
         self.logger.info(f'{rurl} continuing the request with requests')
         r = requests.get(url=fail.request.url, headers=self.headers, timeout=20)
-        if self.is_file_res(r):
-            return
         if r.status_code < 400:
             rtext = r.text
             h = html2text.HTML2Text()
@@ -86,8 +84,6 @@ class BaseSpider(scrapy.Spider):
     def parse(self, response):
         if response.status >= 500:
             return
-        if self.is_file_res(response):
-            return
         h = html2text.HTML2Text()
         h.ignore_links = True  # ignore all links
         # extract the plain-text content
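
A quick illustration of the URL handling this patch introduces in mycode/main.py:
fix_url_scheme() prepends a default scheme to bare hosts, and ana_web() then
derives each site's workbook name from the parsed netloc. This is a standalone
sketch; the sample URLs and the 'SomeSite' name are illustrative placeholders,
not values from web_sites.xlsx.

    from urllib.parse import urlparse

    def fix_url_scheme(url, default_scheme='http'):
        # Spreadsheet cells may hold bare hosts such as 'example.gov.cn';
        # prepend a scheme so urlparse() can recover the netloc.
        if not url.startswith('http://') and not url.startswith('https://'):
            url = f'{default_scheme}://{url}'
        return url

    name = 'SomeSite'  # placeholder for the 主办 column
    for raw in ('example.gov.cn/index.html', 'https://www.example.gov.cn'):
        url = fix_url_scheme(raw.strip())
        domain = urlparse(url).netloc.replace('www.', '')
        print(f'web_dir/{name}_{domain}.xlsx')  # web_dir/SomeSite_example.gov.cn.xlsx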
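The middleware rewrite swaps HtmlResponse for TextResponse and replaces
'return request' with 'raise IgnoreRequest()'. A sketch of the resulting
behaviour, assuming Scrapy's standard downloader-middleware contract: returning
a Request from process_response re-schedules the same URL (which can loop
indefinitely on binary files), while raising IgnoreRequest drops it cleanly.
The error message string here is illustrative.

    from scrapy.exceptions import IgnoreRequest
    from scrapy.http import TextResponse

    class FilterTextMiddleware:
        def process_response(self, request, response, spider):
            # TextResponse covers HTML, XML and plain text, so pages still
            # pass even when the server mislabels their Content-Type.
            if isinstance(response, TextResponse):
                return response
            # Drop binary responses (PDFs, images, archives) instead of
            # re-queueing them, which is what returning the request did.
            raise IgnoreRequest(f'non-text response: {request.url}')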