feat: 输出简报

This commit is contained in:
caoqianming 2023-11-08 16:09:27 +08:00
parent 70040f1c0a
commit e5ff671c97
10 changed files with 524 additions and 202 deletions

4
.gitignore vendored
View File

@ -5,7 +5,9 @@ __pycache__/
twistd.pid
~$*
*.xlsx
!template.xlsx
*.docx
!template*.xlsx
!template*.docx
wechat_dir/*
*.csv
.idea/*

315
main.ui
View File

@ -7,7 +7,7 @@
<x>0</x>
<y>0</y>
<width>600</width>
<height>763</height>
<height>830</height>
</rect>
</property>
<property name="sizePolicy">
@ -19,13 +19,13 @@
<property name="minimumSize">
<size>
<width>600</width>
<height>763</height>
<height>830</height>
</size>
</property>
<property name="maximumSize">
<size>
<width>600</width>
<height>763</height>
<height>830</height>
</size>
</property>
<property name="windowTitle">
@ -123,70 +123,13 @@
</property>
</widget>
</widget>
<widget class="QGroupBox" name="groupBox_3">
<property name="geometry">
<rect>
<x>10</x>
<y>380</y>
<width>191</width>
<height>91</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>11</pointsize>
</font>
</property>
<property name="title">
<string>2.确认分析对比库</string>
</property>
<widget class="QPushButton" name="bBiao">
<property name="geometry">
<rect>
<x>20</x>
<y>30</y>
<width>151</width>
<height>24</height>
</rect>
</property>
<property name="styleSheet">
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
</property>
<property name="text">
<string>打开分析标准Excel</string>
</property>
</widget>
<widget class="QLabel" name="label_4">
<property name="geometry">
<rect>
<x>20</x>
<y>60</y>
<width>151</width>
<height>16</height>
</rect>
</property>
<property name="font">
<font>
<family>楷体</family>
<pointsize>10</pointsize>
<bold>false</bold>
</font>
</property>
<property name="styleSheet">
<string notr="true">color: red;</string>
</property>
<property name="text">
<string>请在修改后保存并关闭</string>
</property>
</widget>
</widget>
<widget class="QGroupBox" name="groupBox_5">
<property name="geometry">
<rect>
<x>10</x>
<y>650</y>
<y>540</y>
<width>581</width>
<height>71</height>
<height>121</height>
</rect>
</property>
<property name="font">
@ -195,13 +138,13 @@
</font>
</property>
<property name="title">
<string>最终结果</string>
<string>汇总分析</string>
</property>
<widget class="QLabel" name="label_7">
<property name="geometry">
<rect>
<x>10</x>
<y>20</y>
<y>70</y>
<width>91</width>
<height>16</height>
</rect>
@ -219,7 +162,7 @@
<property name="geometry">
<rect>
<x>10</x>
<y>40</y>
<y>90</y>
<width>91</width>
<height>16</height>
</rect>
@ -237,7 +180,7 @@
<property name="geometry">
<rect>
<x>10</x>
<y>30</y>
<y>80</y>
<width>561</width>
<height>16</height>
</rect>
@ -250,7 +193,7 @@
<property name="geometry">
<rect>
<x>110</x>
<y>15</y>
<y>65</y>
<width>381</width>
<height>21</height>
</rect>
@ -268,7 +211,7 @@
<property name="geometry">
<rect>
<x>110</x>
<y>40</y>
<y>90</y>
<width>381</width>
<height>16</height>
</rect>
@ -286,7 +229,7 @@
<property name="geometry">
<rect>
<x>520</x>
<y>10</y>
<y>60</y>
<width>51</width>
<height>24</height>
</rect>
@ -302,7 +245,7 @@
<property name="geometry">
<rect>
<x>520</x>
<y>40</y>
<y>90</y>
<width>51</width>
<height>24</height>
</rect>
@ -311,6 +254,71 @@
<string>打开</string>
</property>
</widget>
<widget class="QPushButton" name="bBiao">
<property name="geometry">
<rect>
<x>20</x>
<y>30</y>
<width>151</width>
<height>24</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>11</pointsize>
</font>
</property>
<property name="styleSheet">
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
</property>
<property name="text">
<string>打开分析标准Excel</string>
</property>
</widget>
<widget class="QLabel" name="label_4">
<property name="geometry">
<rect>
<x>180</x>
<y>30</y>
<width>151</width>
<height>16</height>
</rect>
</property>
<property name="font">
<font>
<family>楷体</family>
<pointsize>11</pointsize>
<bold>false</bold>
</font>
</property>
<property name="styleSheet">
<string notr="true">color: red;</string>
</property>
<property name="text">
<string>请在修改后保存并关闭</string>
</property>
</widget>
<widget class="QPushButton" name="bAna">
<property name="geometry">
<rect>
<x>420</x>
<y>30</y>
<width>151</width>
<height>24</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>12</pointsize>
</font>
</property>
<property name="styleSheet">
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
</property>
<property name="text">
<string>开始分析</string>
</property>
</widget>
</widget>
<widget class="QLabel" name="label_9">
<property name="geometry">
@ -345,10 +353,10 @@
<widget class="QGroupBox" name="groupBox_6">
<property name="geometry">
<rect>
<x>210</x>
<x>220</x>
<y>280</y>
<width>371</width>
<height>361</height>
<height>251</height>
</rect>
</property>
<property name="font">
@ -357,7 +365,7 @@
</font>
</property>
<property name="title">
<string>日志显示</string>
<string>操作日志显示</string>
</property>
<widget class="QListView" name="vLog">
<property name="geometry">
@ -365,7 +373,7 @@
<x>10</x>
<y>20</y>
<width>351</width>
<height>321</height>
<height>221</height>
</rect>
</property>
<property name="font">
@ -382,7 +390,7 @@
<property name="geometry">
<rect>
<x>10</x>
<y>490</y>
<y>380</y>
<width>191</width>
<height>151</height>
</rect>
@ -393,7 +401,7 @@
</font>
</property>
<property name="title">
<string>2.确认需要抓取的网站</string>
<string>2.确认需要爬取的官网</string>
</property>
<widget class="QPushButton" name="bWebSite">
<property name="geometry">
@ -491,7 +499,160 @@
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
</property>
<property name="text">
<string>开始巡查</string>
<string>开始爬取</string>
</property>
</widget>
</widget>
<widget class="QGroupBox" name="groupBox_7">
<property name="geometry">
<rect>
<x>10</x>
<y>670</y>
<width>581</width>
<height>111</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>11</pointsize>
</font>
</property>
<property name="title">
<string>总院官微</string>
</property>
<widget class="QLabel" name="label_10">
<property name="geometry">
<rect>
<x>10</x>
<y>60</y>
<width>91</width>
<height>16</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>汇总结果Excel:</string>
</property>
</widget>
<widget class="QLabel" name="label_11">
<property name="geometry">
<rect>
<x>10</x>
<y>80</y>
<width>91</width>
<height>16</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>汇总打分Excel:</string>
</property>
</widget>
<widget class="Line" name="line_2">
<property name="geometry">
<rect>
<x>10</x>
<y>70</y>
<width>561</width>
<height>16</height>
</rect>
</property>
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
</widget>
<widget class="QLabel" name="lCalRes1">
<property name="geometry">
<rect>
<x>110</x>
<y>55</y>
<width>381</width>
<height>21</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>9</pointsize>
</font>
</property>
<property name="text">
<string/>
</property>
</widget>
<widget class="QLabel" name="lCalRes2">
<property name="geometry">
<rect>
<x>110</x>
<y>80</y>
<width>381</width>
<height>16</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>9</pointsize>
</font>
</property>
<property name="text">
<string/>
</property>
</widget>
<widget class="QPushButton" name="bOpenCalRes1">
<property name="geometry">
<rect>
<x>520</x>
<y>50</y>
<width>51</width>
<height>24</height>
</rect>
</property>
<property name="styleSheet">
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
</property>
<property name="text">
<string>打开</string>
</property>
</widget>
<widget class="QPushButton" name="bOpenCalRes2">
<property name="geometry">
<rect>
<x>520</x>
<y>80</y>
<width>51</width>
<height>24</height>
</rect>
</property>
<property name="text">
<string>打开</string>
</property>
</widget>
<widget class="QPushButton" name="bCal">
<property name="geometry">
<rect>
<x>20</x>
<y>30</y>
<width>151</width>
<height>24</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>11</pointsize>
</font>
</property>
<property name="styleSheet">
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
</property>
<property name="text">
<string>汇总打分</string>
</property>
</widget>
</widget>

View File

@ -1,7 +1,7 @@
import pandas as pd
import os
import sqlite3
from .base import BASE_DIR
from mycode.base import BASE_DIR
wechat_dir = os.path.join(BASE_DIR, 'article')
web_dir = os.path.join(BASE_DIR, 'web_dir')
@ -53,6 +53,10 @@ def ana_wechat():
if not result.empty:
for ind2, row2 in result.iterrows():
if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['content']:
continue
if row['错误表述'] == '20大':
continue
output_row = [
index,
row2['nickname'],
@ -85,6 +89,10 @@ def ana_web():
result = df[mask]
if not result.empty:
for ind2, row2 in result.iterrows():
if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['text']:
continue
if row['错误表述'] == '20大':
continue
output_row = [
index,
row2['name'],
@ -101,4 +109,6 @@ def ana_web():
return output_data
if __name__ == "__main__":
ana_web()

View File

@ -6,3 +6,4 @@ scrapy-xlsx==0.1.1
selenium==4.9.1
pyside6==6.5.2
pywin32==306
docxtpl==0.16.7

156
start.py
View File

@ -13,6 +13,9 @@ import pandas as pd
from urllib.parse import urlparse
from openpyxl import load_workbook
import threading
import traceback
from docxtpl import DocxTemplate
import json
# from queue import Queue
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
@ -20,6 +23,8 @@ WEB_SITES_PATH = os.path.join(BASE_DIR, 'web_sites.xlsx')
BIAO_PATH = os.path.join(BASE_DIR, 'biao.xlsx')
PYTHON_PATH = os.path.join(BASE_DIR, 'runtime/python.exe')
TEMPLATE_PATH = os.path.join(BASE_DIR, 'summary/template.xlsx')
TEMPLATE_REPORT_PATH = os.path.join(BASE_DIR, 'summary/template_report.docx')
def fix_url_scheme(url, default_scheme='http'):
# 检查URL是否包含方案
@ -37,26 +42,49 @@ class MyApplication(QApplication):
self.main_window = MainWindow()
return self.main_window
class MyThread(QThread):
update_signal = Signal(dict)
def gen_doc(w1, w2):
    """Render the Word briefing from the analysis result rows.

    Every row of ``w1`` and ``w2`` is classified by the value stored at
    index 5, the four known error categories are counted across both
    lists, and the template at ``TEMPLATE_REPORT_PATH`` is rendered with
    the date fields, the summary sentence and the raw rows.

    Args:
        w1: First list of result rows (the caller passes the WeChat
            results here).
        w2: Second list of result rows (the caller passes the website
            results here).

    Returns:
        Path of the ``.docx`` file that was written under ``summary/``.
    """
    now = datetime.datetime.now()
    # NOTE(review): presumably the start of the reporting window shown in
    # the template (three days before today) - confirm against the template.
    now_3 = now - datetime.timedelta(days=3)

    # Count both result lists in one pass instead of two copies of the same
    # if/elif chain; rows with any other category are ignored, as before.
    counts = dict.fromkeys(
        ('固定表述错误', '严重表述错误', '一般文字差错', '政治差错'), 0)
    for rows in (w1, w2):
        for row in rows:
            level = row[5]
            if level in counts:
                counts[level] += 1
    gdbs = counts['固定表述错误']
    yzbs = counts['严重表述错误']
    ybwz = counts['一般文字差错']
    zzcc = counts['政治差错']

    context = {
        'y': now.year,
        'm': now.month,
        'd': now.day,
        'mo': now_3.month,
        'do': now_3.day,
        # The last count was missing its unit suffix in the original text.
        'su': f'固定表述错误{gdbs}项, 严重表述错误{yzbs}项, 一般文字差错{ybwz}项, 政治差错{zzcc}项',
        'w1': w1,
        'w2': w2,
    }

    output_report_path = os.path.join(BASE_DIR, f'summary/{now.year}{now.month}月-分析结果简报.docx')
    doc = DocxTemplate(TEMPLATE_REPORT_PATH)
    doc.render(context)
    doc.save(output_report_path)
    return output_report_path
class AnaThread(QThread):
update_signal = Signal(object)
def ana(self):
month = datetime.datetime.now().month
now = datetime.datetime.now()
self.update_signal.emit({'msg': '对比开始...'})
self.update_signal.emit({'msg': '正在组合微信公众号爬取内容...'})
make_simple_csv_from_db()
@ -65,9 +93,10 @@ class MyThread(QThread):
self.update_signal.emit({'msg': '开始对比分析所有内容...'})
wechat_results = ana_wechat()
web_results = ana_web()
output_excel_path = os.path.join(BASE_DIR, f'summary/{month}月-总院宣传阵地巡查结果汇总表.xlsx')
# 生成汇总表
self.update_signal.emit({'msg': '开始生成汇总表...'})
output_excel_path = os.path.join(BASE_DIR, f'summary/{now.year}{now.month}月-分析结果汇总表.xlsx')
workbook = load_workbook(TEMPLATE_PATH)
# 选择要操作的工作表
wechat_sheet = workbook['公众号']
web_sheet = workbook['网站']
for row in wechat_results:
@ -76,7 +105,46 @@ class MyThread(QThread):
web_sheet.append(row)
workbook.save(output_excel_path)
workbook.close()
self.update_signal.emit({'msg': '分析完毕, 请查看结果栏, 可手动校对', 'output_excel_path': output_excel_path})
# with open('w1.json', 'w', encoding='utf-8') as f:
# f.write(json.dumps(wechat_results, ensure_ascii=False))
# with open('w2.json', 'w', encoding='utf-8') as f:
# f.write(json.dumps(web_results, ensure_ascii=False))
# 生成简报
self.update_signal.emit({'msg': '开始生成汇总简报...'})
output_report_path = gen_doc(wechat_results, web_results)
self.update_signal.emit({'msg': '分析完毕, 请查看结果栏, 可手动校对', 'output_excel_path': output_excel_path, 'output_report_path': output_report_path})
def run(self) -> None:
    """Thread entry point: run the analysis and report any failure to the log."""
    try:
        self.ana()
    except Exception:
        # Send the full traceback through the log signal so the failure is
        # visible in the UI instead of the exception escaping the thread.
        failure = {'msg': traceback.format_exc()}
        self.update_signal.emit(failure)
class MyThread(QThread):
update_signal = Signal(object)
def __init__(self, lsize) -> None:
    """Create the crawl worker thread.

    Args:
        lsize: Size in KB that decides when Chrome has to be called in.
    """
    super().__init__()
    # Nothing has been launched yet: not running, no child processes.
    self.running = False
    self.processes = []
    self.lsize = lsize
def capture_output(self, p):
    """Forward lines read from the process's stdout to the log signal.

    Keeps reading while ``self.running`` is set and ``p.poll()`` reports
    that the process has not exited.

    Args:
        p: Process object exposing ``poll()`` and a ``stdout`` stream.
    """
    while True:
        # Same short-circuit order as a combined condition: the process is
        # only polled while the worker is still flagged as running.
        if not self.running or p.poll() is not None:
            break
        line = p.stdout.readline()
        if not line:
            continue
        self.update_signal.emit({'msg': line.strip()})
def capture_err(self, p):
    """Forward lines read from the process's stderr to the log signal.

    Keeps reading while ``self.running`` is set and ``p.poll()`` reports
    that the process has not exited.

    Args:
        p: Process object exposing ``poll()`` and a ``stderr`` stream.
    """
    while True:
        # The process is only polled while the worker is flagged as running.
        if not self.running or p.poll() is not None:
            break
        line = p.stderr.readline()
        if not line:
            continue
        self.update_signal.emit({'msg': line.strip()})
def run(self) -> None:
self.update_signal.emit({'msg': '开始进行网站爬取...'})
@ -95,6 +163,8 @@ class MyThread(QThread):
self.running = True
getlog_thread = threading.Thread(target=self.capture_output, args=(process,), daemon=True)
getlog_thread.start()
getlog_thread_err = threading.Thread(target=self.capture_err, args=(process,), daemon=True)
getlog_thread_err.start()
for process in self.processes:
process.wait()
@ -114,8 +184,6 @@ class MyThread(QThread):
self.update_signal.emit({'msg': '存在未爬取站点,正在调用Chrome继续爬取...'})
chrom_main_from_list(info_to_save)
self.update_signal.emit({'msg': '网站爬取完毕!'})
self.ana()
self.exec()
def close(self):
self.running = False
@ -129,7 +197,8 @@ class MainWindow(QMainWindow):
def __init__(self):
super(MainWindow, self).__init__()
self.worker_thread = None
self.web_thread = None
self.ana_thread = None
self.wcplus = False
self.logModel= QStringListModel([])
self.ui = Ui_MainWindow()
@ -139,6 +208,7 @@ class MainWindow(QMainWindow):
self.ui.bWebSite.clicked.connect(self.open_websites_xlsx)
self.ui.bBiao.clicked.connect(self.open_biao_xlsx)
self.ui.bStart.clicked.connect(self.start)
self.ui.bAna.clicked.connect(self.start_ana)
self.ui.bRes1.clicked.connect(self.open_res1)
self.ui.bRes2.clicked.connect(self.open_res2)
self.ui.vLog.setModel(self.logModel)
@ -170,9 +240,9 @@ class MainWindow(QMainWindow):
def open_res2(self):
if self.ui.lRes2.text():
app = win32.Dispatch("Excel.Application")
app = win32.Dispatch("Word.Application")
app.Visible = True
app.Workbooks.Open(self.ui.lRes2.text())
app.Documents.Open(self.ui.lRes2.text())
app.WindowState = 3
def get_time(self):
@ -180,34 +250,45 @@ class MainWindow(QMainWindow):
return now.strftime('%H:%M:%S')
def start(self):
if self.ui.bStart.text() == '开始巡查' or self.ui.bStart.text() == '重新开始':
if self.ui.bStart.text() == '开始爬取' or self.ui.bStart.text() == '重新开始':
self.log('', True)
if self.res1Workbook:
self.res1Workbook.Close()
self.ui.lSize.setEnabled(False)
self.ui.bStart.setText('停止巡查')
self.ui.bStart.setText('停止爬取')
self.start_web(int(self.ui.lSize.text()))
elif self.ui.bStart.text() == '停止巡查':
elif self.ui.bStart.text() == '停止爬取':
self.update_log({'msg': '正在停止...'})
if self.worker_thread:
self.worker_thread.close()
if self.web_thread:
self.web_thread.close()
self.log('', True)
self.ui.lSize.setEnabled(True)
self.ui.bStart.setText('开始巡查')
self.ui.bStart.setText('开始爬取')
def start_web(self, lsize):
self.worker_thread = MyThread(lsize)
self.worker_thread.update_signal.connect(self.update_log)
self.worker_thread.start()
self.web_thread = MyThread(lsize)
self.web_thread.update_signal.connect(self.update_log)
self.web_thread.start()
def start_ana(self):
    """Start the comparison analysis in a background ``AnaThread``.

    If a previous analysis thread is still running, only a notice is
    logged and no new thread is created.
    """
    previous = self.ana_thread
    if previous is not None and previous.isRunning():
        # Rebinding self.ana_thread here would release this reference to a
        # QThread that is still running and launch a duplicate analysis
        # writing the same output files.
        self.update_log({'msg': '分析正在进行中, 请稍候...'})
        return
    self.ana_thread = AnaThread()
    self.ana_thread.update_signal.connect(self.update_log)
    self.ana_thread.start()
def update_log(self, rdict):
if isinstance(rdict, str):
self.log(f'{self.get_time()}-{rdict}', False)
elif isinstance(rdict, dict):
self.log(f'{self.get_time()}-{rdict["msg"]}', False)
if 'output_report_path' in rdict:
self.ui.lRes2.setText(rdict['output_report_path'])
# self.ui.bStart.setText('重新开始')
# self.ui.lSize.setEnabled(True)
if 'output_excel_path' in rdict:
self.ui.lRes1.setText(rdict['output_excel_path'])
self.ui.bStart.setText('重新开始')
self.ui.lSize.setEnabled(True)
# self.ui.bStart.setText('重新开始')
# self.ui.lSize.setEnabled(True)
def log(self, logLine: str, clear=False):
log_list = self.logModel.stringList()
@ -228,12 +309,13 @@ class MainWindow(QMainWindow):
except Exception as e:
print(f"Error while terminating wcplus.exe: {str(e)}")
self.wcplus = False
if self.worker_thread:
self.worker_thread.close()
if self.web_thread:
self.web_thread.close()
event.accept()
if __name__ == "__main__":
# gen_doc()
app = MyApplication(sys.argv)
main_window = app.createMainWindow()
main_window.show()

Binary file not shown.

View File

@ -23,14 +23,14 @@ class Ui_MainWindow(object):
def setupUi(self, MainWindow):
if not MainWindow.objectName():
MainWindow.setObjectName(u"MainWindow")
MainWindow.resize(600, 763)
MainWindow.resize(600, 830)
sizePolicy = QSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed)
sizePolicy.setHorizontalStretch(0)
sizePolicy.setVerticalStretch(0)
sizePolicy.setHeightForWidth(MainWindow.sizePolicy().hasHeightForWidth())
MainWindow.setSizePolicy(sizePolicy)
MainWindow.setMinimumSize(QSize(600, 763))
MainWindow.setMaximumSize(QSize(600, 763))
MainWindow.setMinimumSize(QSize(600, 830))
MainWindow.setMaximumSize(QSize(600, 830))
icon = QIcon()
icon.addFile(u"start.ico", QSize(), QIcon.Normal, QIcon.Off)
MainWindow.setWindowIcon(icon)
@ -63,79 +63,87 @@ class Ui_MainWindow(object):
font1.setBold(False)
self.label_5.setFont(font1)
self.label_5.setStyleSheet(u"color: red;")
self.groupBox_3 = QGroupBox(self.centralwidget)
self.groupBox_3.setObjectName(u"groupBox_3")
self.groupBox_3.setGeometry(QRect(10, 380, 191, 91))
self.groupBox_3.setFont(font)
self.bBiao = QPushButton(self.groupBox_3)
self.bBiao.setObjectName(u"bBiao")
self.bBiao.setGeometry(QRect(20, 30, 151, 24))
self.bBiao.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
self.label_4 = QLabel(self.groupBox_3)
self.label_4.setObjectName(u"label_4")
self.label_4.setGeometry(QRect(20, 60, 151, 16))
self.label_4.setFont(font1)
self.label_4.setStyleSheet(u"color: red;")
self.groupBox_5 = QGroupBox(self.centralwidget)
self.groupBox_5.setObjectName(u"groupBox_5")
self.groupBox_5.setGeometry(QRect(10, 650, 581, 71))
self.groupBox_5.setGeometry(QRect(10, 540, 581, 121))
self.groupBox_5.setFont(font)
self.label_7 = QLabel(self.groupBox_5)
self.label_7.setObjectName(u"label_7")
self.label_7.setGeometry(QRect(10, 20, 91, 16))
self.label_7.setGeometry(QRect(10, 70, 91, 16))
font2 = QFont()
font2.setPointSize(10)
self.label_7.setFont(font2)
self.label_8 = QLabel(self.groupBox_5)
self.label_8.setObjectName(u"label_8")
self.label_8.setGeometry(QRect(10, 40, 91, 16))
self.label_8.setGeometry(QRect(10, 90, 91, 16))
self.label_8.setFont(font2)
self.line = QFrame(self.groupBox_5)
self.line.setObjectName(u"line")
self.line.setGeometry(QRect(10, 30, 561, 16))
self.line.setGeometry(QRect(10, 80, 561, 16))
self.line.setFrameShape(QFrame.HLine)
self.line.setFrameShadow(QFrame.Sunken)
self.lRes1 = QLabel(self.groupBox_5)
self.lRes1.setObjectName(u"lRes1")
self.lRes1.setGeometry(QRect(110, 15, 381, 21))
self.lRes1.setGeometry(QRect(110, 65, 381, 21))
font3 = QFont()
font3.setPointSize(9)
self.lRes1.setFont(font3)
self.lRes2 = QLabel(self.groupBox_5)
self.lRes2.setObjectName(u"lRes2")
self.lRes2.setGeometry(QRect(110, 40, 381, 16))
self.lRes2.setGeometry(QRect(110, 90, 381, 16))
self.lRes2.setFont(font3)
self.bRes1 = QPushButton(self.groupBox_5)
self.bRes1.setObjectName(u"bRes1")
self.bRes1.setGeometry(QRect(520, 10, 51, 24))
self.bRes1.setGeometry(QRect(520, 60, 51, 24))
self.bRes1.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
self.bRes2 = QPushButton(self.groupBox_5)
self.bRes2.setObjectName(u"bRes2")
self.bRes2.setGeometry(QRect(520, 40, 51, 24))
self.bRes2.setGeometry(QRect(520, 90, 51, 24))
self.bBiao = QPushButton(self.groupBox_5)
self.bBiao.setObjectName(u"bBiao")
self.bBiao.setGeometry(QRect(20, 30, 151, 24))
self.bBiao.setFont(font)
self.bBiao.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
self.label_4 = QLabel(self.groupBox_5)
self.label_4.setObjectName(u"label_4")
self.label_4.setGeometry(QRect(180, 30, 151, 16))
font4 = QFont()
font4.setFamilies([u"\u6977\u4f53"])
font4.setPointSize(11)
font4.setBold(False)
self.label_4.setFont(font4)
self.label_4.setStyleSheet(u"color: red;")
self.bAna = QPushButton(self.groupBox_5)
self.bAna.setObjectName(u"bAna")
self.bAna.setGeometry(QRect(420, 30, 151, 24))
font5 = QFont()
font5.setPointSize(12)
self.bAna.setFont(font5)
self.bAna.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
self.label_9 = QLabel(self.centralwidget)
self.label_9.setObjectName(u"label_9")
self.label_9.setGeometry(QRect(150, 0, 291, 31))
font4 = QFont()
font4.setFamilies([u"\u6977\u4f53"])
font4.setPointSize(12)
font4.setBold(False)
font4.setItalic(False)
self.label_9.setFont(font4)
font6 = QFont()
font6.setFamilies([u"\u6977\u4f53"])
font6.setPointSize(12)
font6.setBold(False)
font6.setItalic(False)
self.label_9.setFont(font6)
self.label_9.setStyleSheet(u"color:white;")
self.label_9.setAlignment(Qt.AlignRight|Qt.AlignTrailing|Qt.AlignVCenter)
self.label_9.setMargin(6)
self.groupBox_6 = QGroupBox(self.centralwidget)
self.groupBox_6.setObjectName(u"groupBox_6")
self.groupBox_6.setGeometry(QRect(210, 280, 371, 361))
self.groupBox_6.setGeometry(QRect(220, 280, 371, 251))
self.groupBox_6.setFont(font)
self.vLog = QListView(self.groupBox_6)
self.vLog.setObjectName(u"vLog")
self.vLog.setGeometry(QRect(10, 20, 351, 321))
self.vLog.setGeometry(QRect(10, 20, 351, 221))
self.vLog.setFont(font3)
self.vLog.setStyleSheet(u"")
self.groupBox_2 = QGroupBox(self.centralwidget)
self.groupBox_2.setObjectName(u"groupBox_2")
self.groupBox_2.setGeometry(QRect(10, 490, 191, 151))
self.groupBox_2.setGeometry(QRect(10, 380, 191, 151))
self.groupBox_2.setFont(font)
self.bWebSite = QPushButton(self.groupBox_2)
self.bWebSite.setObjectName(u"bWebSite")
@ -158,10 +166,45 @@ class Ui_MainWindow(object):
self.bStart = QPushButton(self.groupBox_2)
self.bStart.setObjectName(u"bStart")
self.bStart.setGeometry(QRect(20, 110, 151, 24))
font5 = QFont()
font5.setPointSize(12)
self.bStart.setFont(font5)
self.bStart.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
self.groupBox_7 = QGroupBox(self.centralwidget)
self.groupBox_7.setObjectName(u"groupBox_7")
self.groupBox_7.setGeometry(QRect(10, 670, 581, 111))
self.groupBox_7.setFont(font)
self.label_10 = QLabel(self.groupBox_7)
self.label_10.setObjectName(u"label_10")
self.label_10.setGeometry(QRect(10, 60, 91, 16))
self.label_10.setFont(font2)
self.label_11 = QLabel(self.groupBox_7)
self.label_11.setObjectName(u"label_11")
self.label_11.setGeometry(QRect(10, 80, 91, 16))
self.label_11.setFont(font2)
self.line_2 = QFrame(self.groupBox_7)
self.line_2.setObjectName(u"line_2")
self.line_2.setGeometry(QRect(10, 70, 561, 16))
self.line_2.setFrameShape(QFrame.HLine)
self.line_2.setFrameShadow(QFrame.Sunken)
self.lCalRes1 = QLabel(self.groupBox_7)
self.lCalRes1.setObjectName(u"lCalRes1")
self.lCalRes1.setGeometry(QRect(110, 55, 381, 21))
self.lCalRes1.setFont(font3)
self.lCalRes2 = QLabel(self.groupBox_7)
self.lCalRes2.setObjectName(u"lCalRes2")
self.lCalRes2.setGeometry(QRect(110, 80, 381, 16))
self.lCalRes2.setFont(font3)
self.bOpenCalRes1 = QPushButton(self.groupBox_7)
self.bOpenCalRes1.setObjectName(u"bOpenCalRes1")
self.bOpenCalRes1.setGeometry(QRect(520, 50, 51, 24))
self.bOpenCalRes1.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
self.bOpenCalRes2 = QPushButton(self.groupBox_7)
self.bOpenCalRes2.setObjectName(u"bOpenCalRes2")
self.bOpenCalRes2.setGeometry(QRect(520, 80, 51, 24))
self.bCal = QPushButton(self.groupBox_7)
self.bCal.setObjectName(u"bCal")
self.bCal.setGeometry(QRect(20, 30, 151, 24))
self.bCal.setFont(font)
self.bCal.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
MainWindow.setCentralWidget(self.centralwidget)
self.menubar = QMenuBar(MainWindow)
self.menubar.setObjectName(u"menubar")
@ -182,24 +225,32 @@ class Ui_MainWindow(object):
self.groupBox.setTitle(QCoreApplication.translate("MainWindow", u"1.\u5fae\u4fe1\u516c\u4f17\u53f7\u4fe1\u606f\u6293\u53d6", None))
self.bWechat.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5de5\u5177", None))
self.label_5.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u786e\u4fdd\u6240\u6709\u516c\u4f17\u53f7\u6293\u53d6\u5b8c\u6bd5", None))
self.groupBox_3.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u5206\u6790\u5bf9\u6bd4\u5e93", None))
self.bBiao.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5206\u6790\u6807\u51c6Excel", None))
self.label_4.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
self.groupBox_5.setTitle(QCoreApplication.translate("MainWindow", u"\u6700\u7ec8\u7ed3\u679c", None))
self.groupBox_5.setTitle(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u5206\u6790", None))
self.label_7.setText(QCoreApplication.translate("MainWindow", u"\u5206\u6790\u7ed3\u679cExcel:", None))
self.label_8.setText(QCoreApplication.translate("MainWindow", u"\u5206\u6790\u62a5\u544aWord:", None))
self.lRes1.setText("")
self.lRes2.setText("")
self.bRes1.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
self.bRes2.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
self.bBiao.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5206\u6790\u6807\u51c6Excel", None))
self.label_4.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
self.bAna.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u5206\u6790", None))
self.label_9.setText(QCoreApplication.translate("MainWindow", u"\u4e2d\u56fd\u5efa\u6750\u603b\u9662\u5ba3\u4f20\u5de5\u4f5c\u4fe1\u606f\u5316\u7ba1\u7406\u5e73\u53f0", None))
self.groupBox_6.setTitle(QCoreApplication.translate("MainWindow", u"\u65e5\u5fd7\u663e\u793a", None))
self.groupBox_2.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u9700\u8981\u6293\u53d6\u7684\u7f51\u7ad9", None))
self.groupBox_6.setTitle(QCoreApplication.translate("MainWindow", u"\u64cd\u4f5c\u65e5\u5fd7\u663e\u793a", None))
self.groupBox_2.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u9700\u8981\u722c\u53d6\u7684\u5b98\u7f51", None))
self.bWebSite.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u7f51\u7ad9\u5217\u8868Excel", None))
self.label_2.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
self.label_6.setText(QCoreApplication.translate("MainWindow", u"\u5c0f\u4e8e", None))
self.label_3.setText(QCoreApplication.translate("MainWindow", u"KB-Chrome", None))
self.lSize.setText(QCoreApplication.translate("MainWindow", u"20", None))
self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u5de1\u67e5", None))
self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u722c\u53d6", None))
self.groupBox_7.setTitle(QCoreApplication.translate("MainWindow", u"\u603b\u9662\u5b98\u5fae", None))
self.label_10.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u7ed3\u679cExcel:", None))
self.label_11.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u6253\u5206Excel:", None))
self.lCalRes1.setText("")
self.lCalRes2.setText("")
self.bOpenCalRes1.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
self.bOpenCalRes2.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
self.bCal.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u6253\u5206", None))
# retranslateUi

View File

@ -4,6 +4,7 @@
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
from scrapy.http import HtmlResponse
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
@ -101,3 +102,13 @@ class ZcspiderDownloaderMiddleware:
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class FilterHTMLMiddleware:
    """Downloader middleware that lets only HTML responses through."""

    def process_response(self, request, response, spider):
        """Pass HTML responses on and drop every other response type.

        Args:
            request: The request that produced ``response``.
            response: The downloaded response.
            spider: The spider the request belongs to.

        Returns:
            ``response`` unchanged when it is an ``HtmlResponse``.

        Raises:
            IgnoreRequest: For any non-HTML response. Returning ``request``
                instead would hand the same request back for scheduling,
                so the resource would be downloaded again rather than
                skipped.
        """
        # Function-scope import so the block does not depend on a new
        # file-level import.
        from scrapy.exceptions import IgnoreRequest

        if isinstance(response, HtmlResponse):
            # Only HTML responses are accepted.
            return response
        # Ignore every other kind of resource file.
        raise IgnoreRequest(f'non-HTML response ignored: {response.url}')

View File

@ -106,3 +106,8 @@ ITEM_PIPELINES = {
FEED_EXPORTERS = {
# 'xlsx': 'scrapy_xlsx.XlsxItemExporter',
}
# Downloader middlewares enabled for the project; FilterHTMLMiddleware is
# registered with order value 200.
DOWNLOADER_MIDDLEWARES = {
'zcspider.middlewares.FilterHTMLMiddleware': 200,
# other downloader middlewares...
}

View File

@ -39,11 +39,13 @@ class BaseSpider(scrapy.Spider):
def start_requests(self):
for url in self.start_urls:
url = self.fix_url_scheme(url)
r = scrapy.Request(url, dont_filter=True, headers=self.headers, callback=self.parse, errback=self.request2, meta={'download_timeout': 30})
self.visited_urls.add(url)
r = scrapy.Request(url, dont_filter=True, headers=self.headers, callback=self.parse, meta={'download_timeout': 30})
yield r
def is_file_url(self, url):
    """Return True when ``url`` ends with one of the suffixes in ``self.ext``.

    The URL is lower-cased before the comparison; the entries of
    ``self.ext`` are used as they are.

    Args:
        url: The URL string to check.

    Returns:
        ``True`` if the lower-cased URL ends with any entry of
        ``self.ext``, otherwise ``False``.
    """
    # Lower-case once and let str.endswith test all suffixes in one call
    # instead of re-lowering the URL for every extension.
    return url.lower().endswith(tuple(self.ext))
@ -82,10 +84,8 @@ class BaseSpider(scrapy.Spider):
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
def parse(self, response):
try:
if response.status >= 500:
return
self.visited_urls.add(response.url)
if self.is_file_res(response):
return
h = html2text.HTML2Text()
@ -110,14 +110,13 @@ class BaseSpider(scrapy.Spider):
continue
if full_link not in self.visited_urls and (self.is_file_url(full_link) is False):
if urlparse(full_link).netloc.replace('www.', '') == self.domain:
self.visited_urls.add(response.url)
# try:
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, meta={'download_timeout': 30})
# except ValueError:
# import traceback
# print(traceback.format_exc())
# print(full_link)
except scrapy.exceptions.TimeoutError:
print(f'{response.url}-请求超时取消')
def closed(self, reason):
# This method will be called when the Spider is about to close