feat: improve functionality
parent 1d1d5325cc
commit abf9267d08
@@ -4,12 +4,20 @@ import sqlite3
 from mycode.base import BASE_DIR
 import re
 from openpyxl import load_workbook
+from urllib.parse import urlparse

 wechat_dir = os.path.join(BASE_DIR, 'article')
 web_dir = os.path.join(BASE_DIR, 'web_dir')
 output_dir = os.path.join(BASE_DIR, 'summary')
 df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')

+def fix_url_scheme(url, default_scheme='http'):
+    # Check whether the URL already carries a scheme
+    if not url.startswith('http://') and not url.startswith('https://'):
+        # No scheme present, prepend the default one
+        url = f'{default_scheme}://{url}'
+    return url
+
 def trans_to_json():
     json_str = df_s.to_json(orient='records', force_ascii=False)
     with open('biao.json', 'w', encoding='utf-8') as f:
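The added fix_url_scheme helper only prefixes a scheme; the rest of the URL is left untouched. A minimal sketch of how it feeds urlparse further down (sample values are invented, not taken from web_sites.xlsx):

    from urllib.parse import urlparse

    def fix_url_scheme(url, default_scheme='http'):
        # same logic as the helper added above
        if not url.startswith('http://') and not url.startswith('https://'):
            url = f'{default_scheme}://{url}'
        return url

    url = fix_url_scheme('www.example.com')             # -> 'http://www.example.com'
    print(urlparse(url).netloc.replace('www.', ''))     # -> 'example.com'
    print(fix_url_scheme('https://example.com'))        # already has a scheme, returned unchanged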
@@ -162,7 +170,7 @@ def ana_wechat():
                 ]
                 output_data.append(output_row)
                 index += 1
+                print(f'找到公众号问题{index}---{row2["nickname"]}')
     # output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])

     return output_data
@@ -171,12 +179,20 @@ def ana_web():
 def ana_web():
     output_data = []
     index = 1
-    for file in os.listdir(web_dir):
-        full_path = os.path.join(web_dir, file)
-        if '$' in full_path:
-            continue
-        print(full_path)
-        if os.path.getsize(full_path) > 0:
+    # for file in os.listdir(web_dir):
+    #     full_path = os.path.join(web_dir, file)
+    #     if '$' in full_path:
+    #         continue
+    #     print(full_path)
+    # Only analyse the sites listed in web_sites
+    df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
+    for ind, row in df.iterrows():
+        group = row['单位']
+        name = row['主办']
+        url = fix_url_scheme(row['地址'].strip())
+        domain = urlparse(url).netloc.replace('www.', '')
+        full_path = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
+        if os.path.exists(full_path) and os.path.getsize(full_path) > 0:
             df = pd.read_excel(os.path.join(full_path), engine='openpyxl')
             for ind, row in df_s.iterrows():
                 mask = df['text'].str.contains(row['错误表述'], na=False)
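The screening step itself is unchanged: each phrase from df_s is matched against the crawled text column. A toy illustration of what that mask does, with an invented frame standing in for one crawled-site workbook:

    import pandas as pd

    # Invented stand-in for a crawled-site workbook
    df = pd.DataFrame({'text': ['……包含错误表述A的段落……', None, '正常内容']})

    # na=False keeps rows whose text is NaN out of the match instead of propagating NaN into the mask
    mask = df['text'].str.contains('错误表述A', na=False)
    print(df[mask])   # only the first row survives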
@@ -198,7 +214,7 @@ def ana_web():
                     ]
                     output_data.append(output_row)
                     index += 1
+                    print(f'找到官网问题{index}---{row2["name"]}')
     # output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])

     return output_data
start.py
@@ -223,7 +223,7 @@ class MainWindow(QMainWindow):
         # os.startfile(path)
         # except Exception as e:
         # print("无法打开文件:", str(e))
-        if type == 'docs':
+        if type == 'docx':
             app = win32.Dispatch("Word.Application")
             app.Visible = True
             app.Documents.Open(path)
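The docx branch drives Word over COM. A standalone sketch of that call sequence under the usual assumptions (pywin32 installed, a desktop Word available, and an invented absolute path, since Documents.Open expects absolute paths):

    import win32com.client as win32   # pywin32; the `win32` alias in the diff presumably comes from an import like this

    path = r'C:\data\summary\report.docx'     # hypothetical path
    app = win32.Dispatch("Word.Application")  # attach to (or start) Word via COM
    app.Visible = True                        # show the window instead of running headless
    app.Documents.Open(path)                  # open the generated document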
@@ -4,7 +4,8 @@
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html

 from scrapy import signals
-from scrapy.http import HtmlResponse
+from scrapy.http import TextResponse
+from scrapy.exceptions import IgnoreRequest

 # useful for handling different item types with a single interface
 from itemadapter import is_item, ItemAdapter
@@ -104,11 +105,10 @@ class ZcspiderDownloaderMiddleware:
         spider.logger.info("Spider opened: %s" % spider.name)


-class FilterHTMLMiddleware:
+class FilterTextMiddleware:
     def process_response(self, request, response, spider):
-        if isinstance(response, HtmlResponse):
+        if isinstance(response, TextResponse):
             # Only accept HTML responses
             return response
-        else:
-            # Ignore other types of resource files
-            return request
+        # Ignore other types of resource files
+        raise IgnoreRequest()
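Context for the rename: HtmlResponse is a subclass of TextResponse, so the new check still passes HTML through but also admits other text bodies (XML, JSON), while binary responses stay plain Response objects and are now dropped with IgnoreRequest instead of being handed back as a request. A quick check of that class relationship (URLs and bodies invented):

    from scrapy.http import Response, TextResponse, HtmlResponse

    html = HtmlResponse(url='http://example.com/page.html', body=b'<html></html>', encoding='utf-8')
    pdf = Response(url='http://example.com/file.pdf', body=b'%PDF-1.4')

    print(isinstance(html, TextResponse))  # True  -> returned unchanged by FilterTextMiddleware
    print(isinstance(pdf, TextResponse))   # False -> the middleware raises IgnoreRequest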
@@ -108,7 +108,7 @@ FEED_EXPORTERS = {
 }

 DOWNLOADER_MIDDLEWARES = {
-    'zcspider.middlewares.FilterHTMLMiddleware': 200,
+    'zcspider.middlewares.FilterTextMiddleware': 200,
     # other downloader middlewares...
 }
 EXTENSIONS = {
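Settings-side counterpart of the class rename: the dotted path has to track the new class name, or Scrapy fails to load the middleware at startup. Worth noting that when a downloader middleware's process_response raises IgnoreRequest, Scrapy calls the request's errback, which is the fallback path touched in the spider hunks below. A minimal errback sketch (spider name and logging are illustrative, not from this repo):

    import scrapy
    from scrapy.exceptions import IgnoreRequest

    class ExampleSpider(scrapy.Spider):
        name = 'example'   # hypothetical spider

        def errback(self, failure):
            # Responses dropped by FilterTextMiddleware arrive here as IgnoreRequest failures
            if failure.check(IgnoreRequest):
                self.logger.debug('ignored non-text response: %s', failure.request.url)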
@@ -59,8 +59,6 @@ class BaseSpider(scrapy.Spider):
         rurl = fail.request.url
         self.logger.info(f'{rurl} 使用requests继续请求')
         r = requests.get(url=fail.request.url, headers=self.headers, timeout=20)
-        if self.is_file_res(r):
-            return
         if r.status_code < 400:
             rtext = r.text
             h = html2text.HTML2Text()
@@ -86,8 +84,6 @@ class BaseSpider(scrapy.Spider):
     def parse(self, response):
         if response.status >= 500:
             return
-        if self.is_file_res(response):
-            return
         h = html2text.HTML2Text()
         h.ignore_links = True  # ignore all links
         # extract plain-text content
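Both the errback fallback and parse keep the html2text conversion after the is_file_res guards were removed. A standalone sketch of that extraction step (the HTML snippet is invented):

    import html2text

    h = html2text.HTML2Text()
    h.ignore_links = True                 # drop hyperlink targets from the output
    text = h.handle('<p>公告内容,详见<a href="/notice">附件</a></p>')
    print(text.strip())                   # markdown-style plain text with the link markup removed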