feat: improve functionality

caoqianming 2023-11-11 10:07:24 +08:00
parent 1d1d5325cc
commit abf9267d08
5 changed files with 32 additions and 20 deletions

View File

@@ -4,12 +4,20 @@ import sqlite3
from mycode.base import BASE_DIR
import re
from openpyxl import load_workbook
from urllib.parse import urlparse
wechat_dir = os.path.join(BASE_DIR, 'article')
web_dir = os.path.join(BASE_DIR, 'web_dir')
output_dir = os.path.join(BASE_DIR, 'summary')
df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
def fix_url_scheme(url, default_scheme='http'):
# Check whether the URL already includes a scheme
if not url.startswith('http://') and not url.startswith('https://'):
# If not, prepend the default scheme
url = f'{default_scheme}://{url}'
return url
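# A minimal usage sketch of the new helper; the sample addresses below are assumptions, not source data:
assert fix_url_scheme('example.com') == 'http://example.com'
assert fix_url_scheme('https://example.com') == 'https://example.com'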
def trans_to_json():
json_str = df_s.to_json(orient='records', force_ascii=False)
with open('biao.json', 'w', encoding='utf-8') as f:
@@ -162,7 +170,7 @@ def ana_wechat():
]
output_data.append(output_row)
index += 1
print(f'找到公众号问题{index}---{row2["nickname"]}')
# output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
return output_data
@@ -171,12 +179,20 @@ def ana_wechat():
def ana_web():
output_data = []
index = 1
for file in os.listdir(web_dir):
full_path = os.path.join(web_dir, file)
if '$' in full_path:
continue
print(full_path)
if os.path.getsize(full_path) > 0:
# for file in os.listdir(web_dir):
# full_path = os.path.join(web_dir, file)
# if '$' in full_path:
# continue
# print(full_path)
# Only analyze the sites listed in web_sites.xlsx
df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = fix_url_scheme(row['地址'].strip())
domain = urlparse(url).netloc.replace('www.', '')
full_path = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
if os.path.exists(full_path) and os.path.getsize(full_path) > 0:
df = pd.read_excel(full_path, engine='openpyxl')
for ind, row in df_s.iterrows():
mask = df['text'].str.contains(row['错误表述'], na=False)
@@ -198,7 +214,7 @@ def ana_web():
]
output_data.append(output_row)
index += 1
print(f'找到官网问题{index}---{row2["name"]}')
# output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
return output_data
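# A rough sketch of the mask-based screening above, using made-up rows; the real frames
# come from the crawled xlsx files and biao.xlsx, and the column names here are assumptions:
sample = pd.DataFrame({'text': ['clean paragraph', 'paragraph with a flagged phrase'],
                       'link': ['http://a.example', 'http://b.example']})
sample_mask = sample['text'].str.contains('flagged phrase', na=False)  # na=False: empty cells never match
for _, row2 in sample[sample_mask].iterrows():
    print(row2['link'])  # only the matching row is reported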

View File

@@ -223,7 +223,7 @@ class MainWindow(QMainWindow):
# os.startfile(path)
# except Exception as e:
# print("无法打开文件:", str(e))
if type == 'docs':
if type == 'docx':
app = win32.Dispatch("Word.Application")
app.Visible = True
app.Documents.Open(path)
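# A hedged sketch combining the new 'docx' branch with the commented-out os.startfile fallback
# above; assumes `win32` is `win32com.client` and `path` points to an existing .docx file:
try:
    app = win32.Dispatch("Word.Application")
    app.Visible = True
    app.Documents.Open(path)
except Exception as e:
    print("Failed to open the file via Word:", str(e))
    os.startfile(path)  # Windows-only fallback to the system default handler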

View File

@@ -4,7 +4,8 @@
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
from scrapy.http import HtmlResponse
from scrapy.http import TextResponse
from scrapy.exceptions import IgnoreRequest
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
@@ -104,11 +105,10 @@ class ZcspiderDownloaderMiddleware:
spider.logger.info("Spider opened: %s" % spider.name)
class FilterHTMLMiddleware:
class FilterTextMiddleware:
def process_response(self, request, response, spider):
if isinstance(response, HtmlResponse):
if isinstance(response, TextResponse):
# Only accept HTML responses
return response
else:
# Ignore other types of resource files
return request
# Ignore other types of resource files
raise IgnoreRequest()
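# A quick sanity-check sketch of the new middleware behaviour, using synthetic responses
# (the URLs and bodies below are illustrative assumptions):
from scrapy.http import Response
mw = FilterTextMiddleware()
text_resp = TextResponse(url='http://example.com/page', body=b'<html>ok</html>', encoding='utf-8')
binary_resp = Response(url='http://example.com/file.pdf', body=b'%PDF-1.4')
assert mw.process_response(None, text_resp, None) is text_resp  # text responses pass through
try:
    mw.process_response(None, binary_resp, None)  # non-text responses are dropped
except IgnoreRequest:
    pass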

View File

@@ -108,7 +108,7 @@ FEED_EXPORTERS = {
}
DOWNLOADER_MIDDLEWARES = {
'zcspider.middlewares.FilterHTMLMiddleware': 200,
'zcspider.middlewares.FilterTextMiddleware': 200,
# Other downloader middlewares...
}
EXTENSIONS = {

View File

@@ -59,8 +59,6 @@ class BaseSpider(scrapy.Spider):
rurl = fail.request.url
self.logger.info(f'{rurl} 使用requests继续请求')
r = requests.get(url=fail.request.url, headers=self.headers, timeout=20)
if self.is_file_res(r):
return
if r.status_code < 400:
rtext = r.text
h = html2text.HTML2Text()
@@ -86,8 +84,6 @@
def parse(self, response):
if response.status >= 500:
return
if self.is_file_res(response):
return
h = html2text.HTML2Text()
h.ignore_links = True # Ignore all links
# Extract the plain-text content
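# For reference, a minimal sketch of the html2text extraction used here;
# the sample markup below is an assumption:
import html2text
ht = html2text.HTML2Text()
ht.ignore_links = True  # drop hyperlink targets, keep the anchor text
print(ht.handle('<p>Notice: see the <a href="/x">policy page</a></p>'))
# roughly: "Notice: see the policy page"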