diff --git a/.gitignore b/.gitignore index f7e7641..af168f9 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,9 @@ __pycache__/ twistd.pid ~$* *.xlsx -!template.xlsx +*.docx +!template*.xlsx +!template*.docx wechat_dir/* *.csv .idea/* diff --git a/main.ui b/main.ui index 42213ed..00dc8f7 100644 --- a/main.ui +++ b/main.ui @@ -7,7 +7,7 @@ 0 0 600 - 763 + 830 @@ -19,13 +19,13 @@ 600 - 763 + 830 600 - 763 + 830 @@ -123,70 +123,13 @@ - - - - 10 - 380 - 191 - 91 - - - - - 11 - - - - 2.确认分析对比库 - - - - - 20 - 30 - 151 - 24 - - - - background-color:#409EFF; color: white; border-radius: 2px - - - 打开分析标准Excel - - - - - - 20 - 60 - 151 - 16 - - - - - 楷体 - 10 - false - - - - color: red; - - - 请在修改后保存并关闭 - - - 10 - 650 + 540 581 - 71 + 121 @@ -195,13 +138,13 @@ - 最终结果 + 汇总分析 10 - 20 + 70 91 16 @@ -219,7 +162,7 @@ 10 - 40 + 90 91 16 @@ -237,7 +180,7 @@ 10 - 30 + 80 561 16 @@ -250,7 +193,7 @@ 110 - 15 + 65 381 21 @@ -268,7 +211,7 @@ 110 - 40 + 90 381 16 @@ -286,7 +229,7 @@ 520 - 10 + 60 51 24 @@ -302,7 +245,7 @@ 520 - 40 + 90 51 24 @@ -311,6 +254,71 @@ 打开 + + + + 20 + 30 + 151 + 24 + + + + + 11 + + + + background-color:#409EFF; color: white; border-radius: 2px + + + 打开分析标准Excel + + + + + + 180 + 30 + 151 + 16 + + + + + 楷体 + 11 + false + + + + color: red; + + + 请在修改后保存并关闭 + + + + + + 420 + 30 + 151 + 24 + + + + + 12 + + + + background-color:#409EFF; color: white; border-radius: 2px + + + 开始分析 + + @@ -345,10 +353,10 @@ - 210 + 220 280 371 - 361 + 251 @@ -357,7 +365,7 @@ - 日志显示 + 操作日志显示 @@ -365,7 +373,7 @@ 10 20 351 - 321 + 221 @@ -382,7 +390,7 @@ 10 - 490 + 380 191 151 @@ -393,7 +401,7 @@ - 2.确认需要抓取的网站 + 2.确认需要爬取的官网 @@ -491,7 +499,160 @@ background-color:#409EFF; color: white; border-radius: 2px - 开始巡查 + 开始爬取 + + + + + + + 10 + 670 + 581 + 111 + + + + + 11 + + + + 总院官微 + + + + + 10 + 60 + 91 + 16 + + + + + 10 + + + + 汇总结果Excel: + + + + + + 10 + 80 + 91 + 16 + + + + + 10 + + + + 汇总打分Excel: + + + + + + 10 + 70 + 561 + 16 + + + + Qt::Horizontal + + + + + + 110 + 55 + 381 + 21 + + + + + 9 + + + + + + + + + + 110 + 80 + 381 + 16 + + + + + 9 + + + + + + + + + + 520 + 50 + 51 + 24 + + + + background-color:#409EFF; color: white; border-radius: 2px + + + 打开 + + + + + + 520 + 80 + 51 + 24 + + + + 打开 + + + + + + 20 + 30 + 151 + 24 + + + + + 11 + + + + background-color:#409EFF; color: white; border-radius: 2px + + + 汇总打分 diff --git a/mycode/main.py b/mycode/main.py index 7f83388..b3ecc6c 100644 --- a/mycode/main.py +++ b/mycode/main.py @@ -1,7 +1,7 @@ import pandas as pd import os import sqlite3 -from .base import BASE_DIR +from mycode.base import BASE_DIR wechat_dir = os.path.join(BASE_DIR, 'article') web_dir = os.path.join(BASE_DIR, 'web_dir') @@ -53,6 +53,10 @@ def ana_wechat(): if not result.empty: for ind2, row2 in result.iterrows(): + if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['content']: + continue + if row['错误表述'] == '20大': + continue output_row = [ index, row2['nickname'], @@ -85,6 +89,10 @@ def ana_web(): result = df[mask] if not result.empty: for ind2, row2 in result.iterrows(): + if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['text']: + continue + if row['错误表述'] == '20大': + continue output_row = [ index, row2['name'], @@ -101,4 +109,6 @@ def ana_web(): return output_data +if __name__ == "__main__": + ana_web() diff --git a/requirements.txt b/requirements.txt index afd9d5c..04bbe22 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ openpyxl==3.1.2 scrapy-xlsx==0.1.1 selenium==4.9.1 pyside6==6.5.2 -pywin32==306 \ No newline at end of file +pywin32==306 +docxtpl==0.16.7 \ No newline at end of file diff --git a/start.py b/start.py index 8c5864e..847f0aa 100644 --- a/start.py +++ b/start.py @@ -13,6 +13,9 @@ import pandas as pd from urllib.parse import urlparse from openpyxl import load_workbook import threading +import traceback +from docxtpl import DocxTemplate +import json # from queue import Queue BASE_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -20,6 +23,8 @@ WEB_SITES_PATH = os.path.join(BASE_DIR, 'web_sites.xlsx') BIAO_PATH = os.path.join(BASE_DIR, 'biao.xlsx') PYTHON_PATH = os.path.join(BASE_DIR, 'runtime/python.exe') TEMPLATE_PATH = os.path.join(BASE_DIR, 'summary/template.xlsx') +TEMPLATE_REPORT_PATH = os.path.join(BASE_DIR, 'summary/template_report.docx') + def fix_url_scheme(url, default_scheme='http'): # 检查URL是否包含方案 @@ -37,26 +42,49 @@ class MyApplication(QApplication): self.main_window = MainWindow() return self.main_window -class MyThread(QThread): - update_signal = Signal(dict) +def gen_doc(w1, w2): + now = datetime.datetime.now() + now_3 = now - datetime.timedelta(days=3) + # with open('w2.json', 'r', encoding='utf-8') as f: + # w2 = json.loads(f.read()) + # with open('w1.json', 'r', encoding='utf-8') as f: + # w1 = json.loads(f.read()) + gdbs = 0 + yzbs = 0 + ybwz = 0 + zzcc = 0 + context = {'y': now.year, 'm': now.month, 'd': now.day, 'mo': now_3.month, 'do': now_3.day, 'su': 'xx', 'w1': w1, 'w2': w2} + output_report_path = os.path.join(BASE_DIR, f'summary/{now.year}年{now.month}月-分析结果简报.docx') + doc = DocxTemplate(TEMPLATE_REPORT_PATH) + for i in w1: + if i[5] == '固定表述错误': + gdbs =gdbs + 1 + elif i[5] == '严重表述错误': + yzbs = yzbs +1 + elif i[5] == '一般文字差错': + ybwz = ybwz +1 + elif i[5] == '政治差错': + zzcc = zzcc +1 + for i in w2: + if i[5] == '固定表述错误': + gdbs =gdbs + 1 + elif i[5] == '严重表述错误': + yzbs = yzbs +1 + elif i[5] == '一般文字差错': + ybwz = ybwz +1 + elif i[5] == '政治差错': + zzcc = zzcc +1 - def __init__(self, lsize) -> None: - super().__init__() - self.lsize = lsize - self.processes = [] - self.running = False + context['su'] = f'固定表述错误{gdbs}项, 严重表述错误{yzbs}项, 一般文字差错{ybwz}项, 政治差错{zzcc}项' + + doc.render(context) + doc.save(output_report_path) + return output_report_path +class AnaThread(QThread): + update_signal = Signal(object) - def capture_output(self, p): - while self.running and p.poll() is None: - output = p.stdout.readline() - err = p.stderr.readline() - if err: - self.update_signal.emit({'msg': err.strip()}) - if output: - self.update_signal.emit({'msg': output.strip()}) - def ana(self): - month = datetime.datetime.now().month + now = datetime.datetime.now() self.update_signal.emit({'msg': '对比开始...'}) self.update_signal.emit({'msg': '正在组合微信公众号爬取内容...'}) make_simple_csv_from_db() @@ -65,9 +93,10 @@ class MyThread(QThread): self.update_signal.emit({'msg': '开始对比分析所有内容...'}) wechat_results = ana_wechat() web_results = ana_web() - output_excel_path = os.path.join(BASE_DIR, f'summary/{month}月-总院宣传阵地巡查结果汇总表.xlsx') + # 生成汇总表 + self.update_signal.emit({'msg': '开始生成汇总表...'}) + output_excel_path = os.path.join(BASE_DIR, f'summary/{now.year}年{now.month}月-分析结果汇总表.xlsx') workbook = load_workbook(TEMPLATE_PATH) - # 选择要操作的工作表 wechat_sheet = workbook['公众号'] web_sheet = workbook['网站'] for row in wechat_results: @@ -76,7 +105,46 @@ class MyThread(QThread): web_sheet.append(row) workbook.save(output_excel_path) workbook.close() - self.update_signal.emit({'msg': '分析完毕, 请查看结果栏, 可手动校对', 'output_excel_path': output_excel_path}) + # with open('w1.json', 'w', encoding='utf-8') as f: + # f.write(json.dumps(wechat_results, ensure_ascii=False)) + + # with open('w2.json', 'w', encoding='utf-8') as f: + # f.write(json.dumps(web_results, ensure_ascii=False)) + # 生成简报 + self.update_signal.emit({'msg': '开始生成汇总简报...'}) + output_report_path = gen_doc(wechat_results, web_results) + self.update_signal.emit({'msg': '分析完毕, 请查看结果栏, 可手动校对', 'output_excel_path': output_excel_path, 'output_report_path': output_report_path}) + + def run(self) -> None: + try: + self.ana() + except Exception as e: + self.update_signal.emit({'msg': traceback.format_exc()}) + + +class MyThread(QThread): + update_signal = Signal(object) + + def __init__(self, lsize) -> None: + """ + lsize: 多少kb需要调取Chrome + """ + super().__init__() + self.lsize = lsize + self.processes = [] + self.running = False + + def capture_output(self, p): + while self.running and p.poll() is None: + output = p.stdout.readline() + if output: + self.update_signal.emit({'msg': output.strip()}) + + def capture_err(self, p): + while self.running and p.poll() is None: + err = p.stderr.readline() + if err: + self.update_signal.emit({'msg': err.strip()}) def run(self) -> None: self.update_signal.emit({'msg': '开始进行网站爬取...'}) @@ -95,6 +163,8 @@ class MyThread(QThread): self.running = True getlog_thread = threading.Thread(target=self.capture_output, args=(process,), daemon=True) getlog_thread.start() + getlog_thread_err = threading.Thread(target=self.capture_err, args=(process,), daemon=True) + getlog_thread_err.start() for process in self.processes: process.wait() @@ -114,8 +184,6 @@ class MyThread(QThread): self.update_signal.emit({'msg': '存在未爬取站点,正在调用Chrome继续爬取...'}) chrom_main_from_list(info_to_save) self.update_signal.emit({'msg': '网站爬取完毕!'}) - self.ana() - self.exec() def close(self): self.running = False @@ -129,7 +197,8 @@ class MainWindow(QMainWindow): def __init__(self): super(MainWindow, self).__init__() - self.worker_thread = None + self.web_thread = None + self.ana_thread = None self.wcplus = False self.logModel= QStringListModel([]) self.ui = Ui_MainWindow() @@ -139,6 +208,7 @@ class MainWindow(QMainWindow): self.ui.bWebSite.clicked.connect(self.open_websites_xlsx) self.ui.bBiao.clicked.connect(self.open_biao_xlsx) self.ui.bStart.clicked.connect(self.start) + self.ui.bAna.clicked.connect(self.start_ana) self.ui.bRes1.clicked.connect(self.open_res1) self.ui.bRes2.clicked.connect(self.open_res2) self.ui.vLog.setModel(self.logModel) @@ -170,9 +240,9 @@ class MainWindow(QMainWindow): def open_res2(self): if self.ui.lRes2.text(): - app = win32.Dispatch("Excel.Application") + app = win32.Dispatch("Word.Application") app.Visible = True - app.Workbooks.Open(self.ui.lRes2.text()) + app.Documents.Open(self.ui.lRes2.text()) app.WindowState = 3 def get_time(self): @@ -180,34 +250,45 @@ class MainWindow(QMainWindow): return now.strftime('%H:%M:%S') def start(self): - if self.ui.bStart.text() == '开始巡查' or self.ui.bStart.text() == '重新开始': + if self.ui.bStart.text() == '开始爬取' or self.ui.bStart.text() == '重新开始': self.log('', True) if self.res1Workbook: self.res1Workbook.Close() self.ui.lSize.setEnabled(False) - self.ui.bStart.setText('停止巡查') + self.ui.bStart.setText('停止爬取') self.start_web(int(self.ui.lSize.text())) - elif self.ui.bStart.text() == '停止巡查': + elif self.ui.bStart.text() == '停止爬取': self.update_log({'msg': '正在停止...'}) - if self.worker_thread: - self.worker_thread.close() + if self.web_thread: + self.web_thread.close() self.log('', True) self.ui.lSize.setEnabled(True) - self.ui.bStart.setText('开始巡查') + self.ui.bStart.setText('开始爬取') def start_web(self, lsize): - self.worker_thread = MyThread(lsize) - self.worker_thread.update_signal.connect(self.update_log) - self.worker_thread.start() + self.web_thread = MyThread(lsize) + self.web_thread.update_signal.connect(self.update_log) + self.web_thread.start() + def start_ana(self): + self.ana_thread = AnaThread() + self.ana_thread.update_signal.connect(self.update_log) + self.ana_thread.start() def update_log(self, rdict): - self.log(f'{self.get_time()}-{rdict["msg"]}', False) - if 'output_excel_path' in rdict: - self.ui.lRes1.setText(rdict['output_excel_path']) - self.ui.bStart.setText('重新开始') - self.ui.lSize.setEnabled(True) + if isinstance(rdict, str): + self.log(f'{self.get_time()}-{rdict}', False) + elif isinstance(rdict, dict): + self.log(f'{self.get_time()}-{rdict["msg"]}', False) + if 'output_report_path' in rdict: + self.ui.lRes2.setText(rdict['output_report_path']) + # self.ui.bStart.setText('重新开始') + # self.ui.lSize.setEnabled(True) + if 'output_excel_path' in rdict: + self.ui.lRes1.setText(rdict['output_excel_path']) + # self.ui.bStart.setText('重新开始') + # self.ui.lSize.setEnabled(True) def log(self, logLine: str, clear=False): log_list = self.logModel.stringList() @@ -228,12 +309,13 @@ class MainWindow(QMainWindow): except Exception as e: print(f"Error while terminating wcplus.exe: {str(e)}") self.wcplus = False - if self.worker_thread: - self.worker_thread.close() + if self.web_thread: + self.web_thread.close() event.accept() if __name__ == "__main__": + # gen_doc() app = MyApplication(sys.argv) main_window = app.createMainWindow() main_window.show() diff --git a/summary/template_report.docx b/summary/template_report.docx new file mode 100644 index 0000000..13dc734 Binary files /dev/null and b/summary/template_report.docx differ diff --git a/ui_mainwindow.py b/ui_mainwindow.py index f849192..38d6bb9 100644 --- a/ui_mainwindow.py +++ b/ui_mainwindow.py @@ -23,14 +23,14 @@ class Ui_MainWindow(object): def setupUi(self, MainWindow): if not MainWindow.objectName(): MainWindow.setObjectName(u"MainWindow") - MainWindow.resize(600, 763) + MainWindow.resize(600, 830) sizePolicy = QSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) sizePolicy.setHorizontalStretch(0) sizePolicy.setVerticalStretch(0) sizePolicy.setHeightForWidth(MainWindow.sizePolicy().hasHeightForWidth()) MainWindow.setSizePolicy(sizePolicy) - MainWindow.setMinimumSize(QSize(600, 763)) - MainWindow.setMaximumSize(QSize(600, 763)) + MainWindow.setMinimumSize(QSize(600, 830)) + MainWindow.setMaximumSize(QSize(600, 830)) icon = QIcon() icon.addFile(u"start.ico", QSize(), QIcon.Normal, QIcon.Off) MainWindow.setWindowIcon(icon) @@ -63,79 +63,87 @@ class Ui_MainWindow(object): font1.setBold(False) self.label_5.setFont(font1) self.label_5.setStyleSheet(u"color: red;") - self.groupBox_3 = QGroupBox(self.centralwidget) - self.groupBox_3.setObjectName(u"groupBox_3") - self.groupBox_3.setGeometry(QRect(10, 380, 191, 91)) - self.groupBox_3.setFont(font) - self.bBiao = QPushButton(self.groupBox_3) - self.bBiao.setObjectName(u"bBiao") - self.bBiao.setGeometry(QRect(20, 30, 151, 24)) - self.bBiao.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px") - self.label_4 = QLabel(self.groupBox_3) - self.label_4.setObjectName(u"label_4") - self.label_4.setGeometry(QRect(20, 60, 151, 16)) - self.label_4.setFont(font1) - self.label_4.setStyleSheet(u"color: red;") self.groupBox_5 = QGroupBox(self.centralwidget) self.groupBox_5.setObjectName(u"groupBox_5") - self.groupBox_5.setGeometry(QRect(10, 650, 581, 71)) + self.groupBox_5.setGeometry(QRect(10, 540, 581, 121)) self.groupBox_5.setFont(font) self.label_7 = QLabel(self.groupBox_5) self.label_7.setObjectName(u"label_7") - self.label_7.setGeometry(QRect(10, 20, 91, 16)) + self.label_7.setGeometry(QRect(10, 70, 91, 16)) font2 = QFont() font2.setPointSize(10) self.label_7.setFont(font2) self.label_8 = QLabel(self.groupBox_5) self.label_8.setObjectName(u"label_8") - self.label_8.setGeometry(QRect(10, 40, 91, 16)) + self.label_8.setGeometry(QRect(10, 90, 91, 16)) self.label_8.setFont(font2) self.line = QFrame(self.groupBox_5) self.line.setObjectName(u"line") - self.line.setGeometry(QRect(10, 30, 561, 16)) + self.line.setGeometry(QRect(10, 80, 561, 16)) self.line.setFrameShape(QFrame.HLine) self.line.setFrameShadow(QFrame.Sunken) self.lRes1 = QLabel(self.groupBox_5) self.lRes1.setObjectName(u"lRes1") - self.lRes1.setGeometry(QRect(110, 15, 381, 21)) + self.lRes1.setGeometry(QRect(110, 65, 381, 21)) font3 = QFont() font3.setPointSize(9) self.lRes1.setFont(font3) self.lRes2 = QLabel(self.groupBox_5) self.lRes2.setObjectName(u"lRes2") - self.lRes2.setGeometry(QRect(110, 40, 381, 16)) + self.lRes2.setGeometry(QRect(110, 90, 381, 16)) self.lRes2.setFont(font3) self.bRes1 = QPushButton(self.groupBox_5) self.bRes1.setObjectName(u"bRes1") - self.bRes1.setGeometry(QRect(520, 10, 51, 24)) + self.bRes1.setGeometry(QRect(520, 60, 51, 24)) self.bRes1.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px") self.bRes2 = QPushButton(self.groupBox_5) self.bRes2.setObjectName(u"bRes2") - self.bRes2.setGeometry(QRect(520, 40, 51, 24)) + self.bRes2.setGeometry(QRect(520, 90, 51, 24)) + self.bBiao = QPushButton(self.groupBox_5) + self.bBiao.setObjectName(u"bBiao") + self.bBiao.setGeometry(QRect(20, 30, 151, 24)) + self.bBiao.setFont(font) + self.bBiao.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px") + self.label_4 = QLabel(self.groupBox_5) + self.label_4.setObjectName(u"label_4") + self.label_4.setGeometry(QRect(180, 30, 151, 16)) + font4 = QFont() + font4.setFamilies([u"\u6977\u4f53"]) + font4.setPointSize(11) + font4.setBold(False) + self.label_4.setFont(font4) + self.label_4.setStyleSheet(u"color: red;") + self.bAna = QPushButton(self.groupBox_5) + self.bAna.setObjectName(u"bAna") + self.bAna.setGeometry(QRect(420, 30, 151, 24)) + font5 = QFont() + font5.setPointSize(12) + self.bAna.setFont(font5) + self.bAna.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px") self.label_9 = QLabel(self.centralwidget) self.label_9.setObjectName(u"label_9") self.label_9.setGeometry(QRect(150, 0, 291, 31)) - font4 = QFont() - font4.setFamilies([u"\u6977\u4f53"]) - font4.setPointSize(12) - font4.setBold(False) - font4.setItalic(False) - self.label_9.setFont(font4) + font6 = QFont() + font6.setFamilies([u"\u6977\u4f53"]) + font6.setPointSize(12) + font6.setBold(False) + font6.setItalic(False) + self.label_9.setFont(font6) self.label_9.setStyleSheet(u"color:white;") self.label_9.setAlignment(Qt.AlignRight|Qt.AlignTrailing|Qt.AlignVCenter) self.label_9.setMargin(6) self.groupBox_6 = QGroupBox(self.centralwidget) self.groupBox_6.setObjectName(u"groupBox_6") - self.groupBox_6.setGeometry(QRect(210, 280, 371, 361)) + self.groupBox_6.setGeometry(QRect(220, 280, 371, 251)) self.groupBox_6.setFont(font) self.vLog = QListView(self.groupBox_6) self.vLog.setObjectName(u"vLog") - self.vLog.setGeometry(QRect(10, 20, 351, 321)) + self.vLog.setGeometry(QRect(10, 20, 351, 221)) self.vLog.setFont(font3) self.vLog.setStyleSheet(u"") self.groupBox_2 = QGroupBox(self.centralwidget) self.groupBox_2.setObjectName(u"groupBox_2") - self.groupBox_2.setGeometry(QRect(10, 490, 191, 151)) + self.groupBox_2.setGeometry(QRect(10, 380, 191, 151)) self.groupBox_2.setFont(font) self.bWebSite = QPushButton(self.groupBox_2) self.bWebSite.setObjectName(u"bWebSite") @@ -158,10 +166,45 @@ class Ui_MainWindow(object): self.bStart = QPushButton(self.groupBox_2) self.bStart.setObjectName(u"bStart") self.bStart.setGeometry(QRect(20, 110, 151, 24)) - font5 = QFont() - font5.setPointSize(12) self.bStart.setFont(font5) self.bStart.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px") + self.groupBox_7 = QGroupBox(self.centralwidget) + self.groupBox_7.setObjectName(u"groupBox_7") + self.groupBox_7.setGeometry(QRect(10, 670, 581, 111)) + self.groupBox_7.setFont(font) + self.label_10 = QLabel(self.groupBox_7) + self.label_10.setObjectName(u"label_10") + self.label_10.setGeometry(QRect(10, 60, 91, 16)) + self.label_10.setFont(font2) + self.label_11 = QLabel(self.groupBox_7) + self.label_11.setObjectName(u"label_11") + self.label_11.setGeometry(QRect(10, 80, 91, 16)) + self.label_11.setFont(font2) + self.line_2 = QFrame(self.groupBox_7) + self.line_2.setObjectName(u"line_2") + self.line_2.setGeometry(QRect(10, 70, 561, 16)) + self.line_2.setFrameShape(QFrame.HLine) + self.line_2.setFrameShadow(QFrame.Sunken) + self.lCalRes1 = QLabel(self.groupBox_7) + self.lCalRes1.setObjectName(u"lCalRes1") + self.lCalRes1.setGeometry(QRect(110, 55, 381, 21)) + self.lCalRes1.setFont(font3) + self.lCalRes2 = QLabel(self.groupBox_7) + self.lCalRes2.setObjectName(u"lCalRes2") + self.lCalRes2.setGeometry(QRect(110, 80, 381, 16)) + self.lCalRes2.setFont(font3) + self.bOpenCalRes1 = QPushButton(self.groupBox_7) + self.bOpenCalRes1.setObjectName(u"bOpenCalRes1") + self.bOpenCalRes1.setGeometry(QRect(520, 50, 51, 24)) + self.bOpenCalRes1.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px") + self.bOpenCalRes2 = QPushButton(self.groupBox_7) + self.bOpenCalRes2.setObjectName(u"bOpenCalRes2") + self.bOpenCalRes2.setGeometry(QRect(520, 80, 51, 24)) + self.bCal = QPushButton(self.groupBox_7) + self.bCal.setObjectName(u"bCal") + self.bCal.setGeometry(QRect(20, 30, 151, 24)) + self.bCal.setFont(font) + self.bCal.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px") MainWindow.setCentralWidget(self.centralwidget) self.menubar = QMenuBar(MainWindow) self.menubar.setObjectName(u"menubar") @@ -182,24 +225,32 @@ class Ui_MainWindow(object): self.groupBox.setTitle(QCoreApplication.translate("MainWindow", u"1.\u5fae\u4fe1\u516c\u4f17\u53f7\u4fe1\u606f\u6293\u53d6", None)) self.bWechat.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5de5\u5177", None)) self.label_5.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u786e\u4fdd\u6240\u6709\u516c\u4f17\u53f7\u6293\u53d6\u5b8c\u6bd5", None)) - self.groupBox_3.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u5206\u6790\u5bf9\u6bd4\u5e93", None)) - self.bBiao.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5206\u6790\u6807\u51c6Excel", None)) - self.label_4.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None)) - self.groupBox_5.setTitle(QCoreApplication.translate("MainWindow", u"\u6700\u7ec8\u7ed3\u679c", None)) + self.groupBox_5.setTitle(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u5206\u6790", None)) self.label_7.setText(QCoreApplication.translate("MainWindow", u"\u5206\u6790\u7ed3\u679cExcel:", None)) self.label_8.setText(QCoreApplication.translate("MainWindow", u"\u5206\u6790\u62a5\u544aWord:", None)) self.lRes1.setText("") self.lRes2.setText("") self.bRes1.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None)) self.bRes2.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None)) + self.bBiao.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5206\u6790\u6807\u51c6Excel", None)) + self.label_4.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None)) + self.bAna.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u5206\u6790", None)) self.label_9.setText(QCoreApplication.translate("MainWindow", u"\u4e2d\u56fd\u5efa\u6750\u603b\u9662\u5ba3\u4f20\u5de5\u4f5c\u4fe1\u606f\u5316\u7ba1\u7406\u5e73\u53f0", None)) - self.groupBox_6.setTitle(QCoreApplication.translate("MainWindow", u"\u65e5\u5fd7\u663e\u793a", None)) - self.groupBox_2.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u9700\u8981\u6293\u53d6\u7684\u7f51\u7ad9", None)) + self.groupBox_6.setTitle(QCoreApplication.translate("MainWindow", u"\u64cd\u4f5c\u65e5\u5fd7\u663e\u793a", None)) + self.groupBox_2.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u9700\u8981\u722c\u53d6\u7684\u5b98\u7f51", None)) self.bWebSite.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u7f51\u7ad9\u5217\u8868Excel", None)) self.label_2.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None)) self.label_6.setText(QCoreApplication.translate("MainWindow", u"\u5c0f\u4e8e", None)) self.label_3.setText(QCoreApplication.translate("MainWindow", u"KB-Chrome", None)) self.lSize.setText(QCoreApplication.translate("MainWindow", u"20", None)) - self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u5de1\u67e5", None)) + self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u722c\u53d6", None)) + self.groupBox_7.setTitle(QCoreApplication.translate("MainWindow", u"\u603b\u9662\u5b98\u5fae", None)) + self.label_10.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u7ed3\u679cExcel:", None)) + self.label_11.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u6253\u5206Excel:", None)) + self.lCalRes1.setText("") + self.lCalRes2.setText("") + self.bOpenCalRes1.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None)) + self.bOpenCalRes2.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None)) + self.bCal.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u6253\u5206", None)) # retranslateUi diff --git a/zcspider/middlewares.py b/zcspider/middlewares.py index 139f8ab..4551b5a 100644 --- a/zcspider/middlewares.py +++ b/zcspider/middlewares.py @@ -4,6 +4,7 @@ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals +from scrapy.http import HtmlResponse # useful for handling different item types with a single interface from itemadapter import is_item, ItemAdapter @@ -101,3 +102,13 @@ class ZcspiderDownloaderMiddleware: def spider_opened(self, spider): spider.logger.info("Spider opened: %s" % spider.name) + + +class FilterHTMLMiddleware: + def process_response(self, request, response, spider): + if isinstance(response, HtmlResponse): + # 只接收HTML响应 + return response + else: + # 忽略其他类型的资源文件 + return request \ No newline at end of file diff --git a/zcspider/settings.py b/zcspider/settings.py index 6e4f619..1736130 100644 --- a/zcspider/settings.py +++ b/zcspider/settings.py @@ -105,4 +105,9 @@ ITEM_PIPELINES = { FEED_EXPORTERS = { # 'xlsx': 'scrapy_xlsx.XlsxItemExporter', +} + +DOWNLOADER_MIDDLEWARES = { + 'zcspider.middlewares.FilterHTMLMiddleware': 200, + # 其他下载中间件... } \ No newline at end of file diff --git a/zcspider/spiders/base.py b/zcspider/spiders/base.py index 18bd884..060c995 100644 --- a/zcspider/spiders/base.py +++ b/zcspider/spiders/base.py @@ -39,12 +39,14 @@ class BaseSpider(scrapy.Spider): def start_requests(self): for url in self.start_urls: url = self.fix_url_scheme(url) - r = scrapy.Request(url, dont_filter=True, headers=self.headers, callback=self.parse, errback=self.request2, meta={'download_timeout': 30}) + self.visited_urls.add(url) + r = scrapy.Request(url, dont_filter=True, headers=self.headers, callback=self.parse, meta={'download_timeout': 30}) yield r def is_file_url(self, url): - if f'.{url.split(".")[-1].lower()}' in self.ext: - return True + for item in self.ext: + if url.lower().endswith(item): + return True return False def is_file_res(self, res): @@ -82,42 +84,39 @@ class BaseSpider(scrapy.Spider): yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30}) def parse(self, response): - try: - if response.status >= 500: - return - self.visited_urls.add(response.url) - if self.is_file_res(response): - return - h = html2text.HTML2Text() - h.ignore_links = True # 忽略所有链接 - # 提取纯文本内容 - # try: - text = h.handle(response.text) - # except: - # text = h.handle(response.body.decode(encoding='gb18030')) - if response.status < 400: - yield { - 'group': self.group, - 'name': self.name, - 'domain': self.domain, - 'url': response.url, - 'text': text, - } - links = re.findall(r'href=["\']?([^"\'>]+)', response.text) - for link in links: - full_link = response.urljoin(link) - if not full_link.startswith('http'): - continue - if full_link not in self.visited_urls and (self.is_file_url(full_link) is False): - if urlparse(full_link).netloc.replace('www.', '') == self.domain: - # try: - yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30}) - # except ValueError: - # import traceback - # print(traceback.format_exc()) - # print(full_link) - except scrapy.exceptions.TimeoutError: - print(f'{response.url}-请求超时取消') + if response.status >= 500: + return + if self.is_file_res(response): + return + h = html2text.HTML2Text() + h.ignore_links = True # 忽略所有链接 + # 提取纯文本内容 + # try: + text = h.handle(response.text) + # except: + # text = h.handle(response.body.decode(encoding='gb18030')) + if response.status < 400: + yield { + 'group': self.group, + 'name': self.name, + 'domain': self.domain, + 'url': response.url, + 'text': text, + } + links = re.findall(r'href=["\']?([^"\'>]+)', response.text) + for link in links: + full_link = response.urljoin(link) + if not full_link.startswith('http'): + continue + if full_link not in self.visited_urls and (self.is_file_url(full_link) is False): + if urlparse(full_link).netloc.replace('www.', '') == self.domain: + self.visited_urls.add(response.url) + # try: + yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, meta={'download_timeout': 30}) + # except ValueError: + # import traceback + # print(traceback.format_exc()) + # print(full_link) def closed(self, reason): # This method will be called when the Spider is about to close