diff --git a/.gitignore b/.gitignore
index f7e7641..af168f9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,7 +5,9 @@ __pycache__/
twistd.pid
~$*
*.xlsx
-!template.xlsx
+*.docx
+!template*.xlsx
+!template*.docx
wechat_dir/*
*.csv
.idea/*
diff --git a/main.ui b/main.ui
index 42213ed..00dc8f7 100644
--- a/main.ui
+++ b/main.ui
@@ -7,7 +7,7 @@
0
0
600
- 763
+ 830
@@ -19,13 +19,13 @@
600
- 763
+ 830
600
- 763
+ 830
@@ -123,70 +123,13 @@
-
-
-
- 10
- 380
- 191
- 91
-
-
-
-
- 11
-
-
-
- 2.确认分析对比库
-
-
-
-
- 20
- 30
- 151
- 24
-
-
-
- background-color:#409EFF; color: white; border-radius: 2px
-
-
- 打开分析标准Excel
-
-
-
-
-
- 20
- 60
- 151
- 16
-
-
-
-
- 楷体
- 10
- false
-
-
-
- color: red;
-
-
- 请在修改后保存并关闭
-
-
-
10
- 650
+ 540
581
- 71
+ 121
@@ -195,13 +138,13 @@
- 最终结果
+ 汇总分析
10
- 20
+ 70
91
16
@@ -219,7 +162,7 @@
10
- 40
+ 90
91
16
@@ -237,7 +180,7 @@
10
- 30
+ 80
561
16
@@ -250,7 +193,7 @@
110
- 15
+ 65
381
21
@@ -268,7 +211,7 @@
110
- 40
+ 90
381
16
@@ -286,7 +229,7 @@
520
- 10
+ 60
51
24
@@ -302,7 +245,7 @@
520
- 40
+ 90
51
24
@@ -311,6 +254,71 @@
打开
+
+
+
+ 20
+ 30
+ 151
+ 24
+
+
+
+
+ 11
+
+
+
+ background-color:#409EFF; color: white; border-radius: 2px
+
+
+ 打开分析标准Excel
+
+
+
+
+
+ 180
+ 30
+ 151
+ 16
+
+
+
+
+ 楷体
+ 11
+ false
+
+
+
+ color: red;
+
+
+ 请在修改后保存并关闭
+
+
+
+
+
+ 420
+ 30
+ 151
+ 24
+
+
+
+
+ 12
+
+
+
+ background-color:#409EFF; color: white; border-radius: 2px
+
+
+ 开始分析
+
+
@@ -345,10 +353,10 @@
- 210
+ 220
280
371
- 361
+ 251
@@ -357,7 +365,7 @@
- 日志显示
+ 操作日志显示
@@ -365,7 +373,7 @@
10
20
351
- 321
+ 221
@@ -382,7 +390,7 @@
10
- 490
+ 380
191
151
@@ -393,7 +401,7 @@
- 2.确认需要抓取的网站
+ 2.确认需要爬取的官网
@@ -491,7 +499,160 @@
background-color:#409EFF; color: white; border-radius: 2px
- 开始巡查
+ 开始爬取
+
+
+
+
+
+
+ 10
+ 670
+ 581
+ 111
+
+
+
+
+ 11
+
+
+
+ 总院官微
+
+
+
+
+ 10
+ 60
+ 91
+ 16
+
+
+
+
+ 10
+
+
+
+ 汇总结果Excel:
+
+
+
+
+
+ 10
+ 80
+ 91
+ 16
+
+
+
+
+ 10
+
+
+
+ 汇总打分Excel:
+
+
+
+
+
+ 10
+ 70
+ 561
+ 16
+
+
+
+ Qt::Horizontal
+
+
+
+
+
+ 110
+ 55
+ 381
+ 21
+
+
+
+
+ 9
+
+
+
+
+
+
+
+
+
+ 110
+ 80
+ 381
+ 16
+
+
+
+
+ 9
+
+
+
+
+
+
+
+
+
+ 520
+ 50
+ 51
+ 24
+
+
+
+ background-color:#409EFF; color: white; border-radius: 2px
+
+
+ 打开
+
+
+
+
+
+ 520
+ 80
+ 51
+ 24
+
+
+
+ 打开
+
+
+
+
+
+ 20
+ 30
+ 151
+ 24
+
+
+
+
+ 11
+
+
+
+ background-color:#409EFF; color: white; border-radius: 2px
+
+
+ 汇总打分
diff --git a/mycode/main.py b/mycode/main.py
index 7f83388..b3ecc6c 100644
--- a/mycode/main.py
+++ b/mycode/main.py
@@ -1,7 +1,7 @@
import pandas as pd
import os
import sqlite3
-from .base import BASE_DIR
+from mycode.base import BASE_DIR
wechat_dir = os.path.join(BASE_DIR, 'article')
web_dir = os.path.join(BASE_DIR, 'web_dir')
@@ -53,6 +53,10 @@ def ana_wechat():
if not result.empty:
for ind2, row2 in result.iterrows():
+ if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['content']:
+ continue
+ if row['错误表述'] == '20大':
+ continue
output_row = [
index,
row2['nickname'],
@@ -85,6 +89,10 @@ def ana_web():
result = df[mask]
if not result.empty:
for ind2, row2 in result.iterrows():
+ if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['text']:
+ continue
+ if row['错误表述'] == '20大':
+ continue
output_row = [
index,
row2['name'],
@@ -101,4 +109,6 @@ def ana_web():
return output_data
+if __name__ == "__main__":
+ ana_web()
diff --git a/requirements.txt b/requirements.txt
index afd9d5c..04bbe22 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,5 @@ openpyxl==3.1.2
scrapy-xlsx==0.1.1
selenium==4.9.1
pyside6==6.5.2
-pywin32==306
\ No newline at end of file
+pywin32==306
+docxtpl==0.16.7
\ No newline at end of file
diff --git a/start.py b/start.py
index 8c5864e..847f0aa 100644
--- a/start.py
+++ b/start.py
@@ -13,6 +13,9 @@ import pandas as pd
from urllib.parse import urlparse
from openpyxl import load_workbook
import threading
+import traceback
+from docxtpl import DocxTemplate
+import json
# from queue import Queue
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -20,6 +23,8 @@ WEB_SITES_PATH = os.path.join(BASE_DIR, 'web_sites.xlsx')
BIAO_PATH = os.path.join(BASE_DIR, 'biao.xlsx')
PYTHON_PATH = os.path.join(BASE_DIR, 'runtime/python.exe')
TEMPLATE_PATH = os.path.join(BASE_DIR, 'summary/template.xlsx')
+TEMPLATE_REPORT_PATH = os.path.join(BASE_DIR, 'summary/template_report.docx')
+
def fix_url_scheme(url, default_scheme='http'):
# 检查URL是否包含方案
@@ -37,26 +42,49 @@ class MyApplication(QApplication):
self.main_window = MainWindow()
return self.main_window
-class MyThread(QThread):
- update_signal = Signal(dict)
+def gen_doc(w1, w2):
+    """Render the Word briefing for one analysis run and return its path.
+
+    Args:
+        w1: Result rows for the WeChat accounts.
+        w2: Result rows for the websites.
+            Both lists are handed to the template unchanged; element 5 of
+            every row is read as the error category.
+
+    Returns:
+        Path of the saved ``.docx`` report under ``summary/``.
+    """
+    from collections import Counter
+
+    now = datetime.datetime.now()
+    now_3 = now - datetime.timedelta(days=3)
+
+    # One pass over both result sets replaces two copies of the same
+    # if/elif ladder; categories that never occur simply count as 0.
+    tally = Counter(row[5] for rows in (w1, w2) for row in rows)
+    categories = ('固定表述错误', '严重表述错误', '一般文字差错', '政治差错')
+    summary = ', '.join(f'{name}{tally[name]}项' for name in categories)
+
+    context = {
+        'y': now.year, 'm': now.month, 'd': now.day,
+        'mo': now_3.month, 'do': now_3.day,
+        'su': summary, 'w1': w1, 'w2': w2,
+    }
+    output_report_path = os.path.join(BASE_DIR, f'summary/{now.year}年{now.month}月-分析结果简报.docx')
+    doc = DocxTemplate(TEMPLATE_REPORT_PATH)
+    doc.render(context)
+    doc.save(output_report_path)
+    return output_report_path
- def __init__(self, lsize) -> None:
- super().__init__()
- self.lsize = lsize
- self.processes = []
- self.running = False
+class AnaThread(QThread):
+ update_signal = Signal(object)
- def capture_output(self, p):
- while self.running and p.poll() is None:
- output = p.stdout.readline()
- err = p.stderr.readline()
- if err:
- self.update_signal.emit({'msg': err.strip()})
- if output:
- self.update_signal.emit({'msg': output.strip()})
-
def ana(self):
- month = datetime.datetime.now().month
+ now = datetime.datetime.now()
self.update_signal.emit({'msg': '对比开始...'})
self.update_signal.emit({'msg': '正在组合微信公众号爬取内容...'})
make_simple_csv_from_db()
@@ -65,9 +93,10 @@ class MyThread(QThread):
self.update_signal.emit({'msg': '开始对比分析所有内容...'})
wechat_results = ana_wechat()
web_results = ana_web()
- output_excel_path = os.path.join(BASE_DIR, f'summary/{month}月-总院宣传阵地巡查结果汇总表.xlsx')
+ # 生成汇总表
+ self.update_signal.emit({'msg': '开始生成汇总表...'})
+ output_excel_path = os.path.join(BASE_DIR, f'summary/{now.year}年{now.month}月-分析结果汇总表.xlsx')
workbook = load_workbook(TEMPLATE_PATH)
- # 选择要操作的工作表
wechat_sheet = workbook['公众号']
web_sheet = workbook['网站']
for row in wechat_results:
@@ -76,7 +105,46 @@ class MyThread(QThread):
web_sheet.append(row)
workbook.save(output_excel_path)
workbook.close()
- self.update_signal.emit({'msg': '分析完毕, 请查看结果栏, 可手动校对', 'output_excel_path': output_excel_path})
+ # with open('w1.json', 'w', encoding='utf-8') as f:
+ # f.write(json.dumps(wechat_results, ensure_ascii=False))
+
+ # with open('w2.json', 'w', encoding='utf-8') as f:
+ # f.write(json.dumps(web_results, ensure_ascii=False))
+ # 生成简报
+ self.update_signal.emit({'msg': '开始生成汇总简报...'})
+ output_report_path = gen_doc(wechat_results, web_results)
+ self.update_signal.emit({'msg': '分析完毕, 请查看结果栏, 可手动校对', 'output_excel_path': output_excel_path, 'output_report_path': output_report_path})
+
+    def run(self) -> None:
+        """Thread entry point: run the analysis and report any failure to the log.
+
+        Every exception is caught on purpose: the full traceback is emitted
+        through ``update_signal`` so it is shown in the log view.
+        """
+        try:
+            self.ana()
+        except Exception:
+            self.update_signal.emit({'msg': traceback.format_exc()})
+
+
+class MyThread(QThread):
+ update_signal = Signal(object)
+
+ def __init__(self, lsize) -> None:
+ """
+ lsize: 多少kb需要调取Chrome
+ """
+ super().__init__()
+ self.lsize = lsize
+ self.processes = []
+ self.running = False
+
+    def capture_output(self, p):
+        """Relay each stdout line of process ``p`` to ``update_signal``.
+
+        Reads until EOF rather than polling ``p.poll()``: with a poll-based
+        loop, lines still buffered in the pipe when the process exits are
+        never read, so the tail of the output is lost.
+        """
+        while self.running:
+            output = p.stdout.readline()
+            if not output:  # empty read means EOF: pipe closed and fully drained
+                break
+            self.update_signal.emit({'msg': output.strip()})
+
+    def capture_err(self, p):
+        """Relay each stderr line of process ``p`` to ``update_signal``.
+
+        Same read-until-EOF loop as ``capture_output``, on its own stream so
+        that neither pipe can block the other.
+        """
+        while self.running:
+            err = p.stderr.readline()
+            if not err:  # empty read means EOF: pipe closed and fully drained
+                break
+            self.update_signal.emit({'msg': err.strip()})
def run(self) -> None:
self.update_signal.emit({'msg': '开始进行网站爬取...'})
@@ -95,6 +163,8 @@ class MyThread(QThread):
self.running = True
getlog_thread = threading.Thread(target=self.capture_output, args=(process,), daemon=True)
getlog_thread.start()
+ getlog_thread_err = threading.Thread(target=self.capture_err, args=(process,), daemon=True)
+ getlog_thread_err.start()
for process in self.processes:
process.wait()
@@ -114,8 +184,6 @@ class MyThread(QThread):
self.update_signal.emit({'msg': '存在未爬取站点,正在调用Chrome继续爬取...'})
chrom_main_from_list(info_to_save)
self.update_signal.emit({'msg': '网站爬取完毕!'})
- self.ana()
- self.exec()
def close(self):
self.running = False
@@ -129,7 +197,8 @@ class MainWindow(QMainWindow):
def __init__(self):
super(MainWindow, self).__init__()
- self.worker_thread = None
+ self.web_thread = None
+ self.ana_thread = None
self.wcplus = False
self.logModel= QStringListModel([])
self.ui = Ui_MainWindow()
@@ -139,6 +208,7 @@ class MainWindow(QMainWindow):
self.ui.bWebSite.clicked.connect(self.open_websites_xlsx)
self.ui.bBiao.clicked.connect(self.open_biao_xlsx)
self.ui.bStart.clicked.connect(self.start)
+ self.ui.bAna.clicked.connect(self.start_ana)
self.ui.bRes1.clicked.connect(self.open_res1)
self.ui.bRes2.clicked.connect(self.open_res2)
self.ui.vLog.setModel(self.logModel)
@@ -170,9 +240,9 @@ class MainWindow(QMainWindow):
def open_res2(self):
if self.ui.lRes2.text():
- app = win32.Dispatch("Excel.Application")
+ app = win32.Dispatch("Word.Application")
app.Visible = True
- app.Workbooks.Open(self.ui.lRes2.text())
+ app.Documents.Open(self.ui.lRes2.text())
app.WindowState = 3
def get_time(self):
@@ -180,34 +250,45 @@ class MainWindow(QMainWindow):
return now.strftime('%H:%M:%S')
def start(self):
- if self.ui.bStart.text() == '开始巡查' or self.ui.bStart.text() == '重新开始':
+ if self.ui.bStart.text() == '开始爬取' or self.ui.bStart.text() == '重新开始':
self.log('', True)
if self.res1Workbook:
self.res1Workbook.Close()
self.ui.lSize.setEnabled(False)
- self.ui.bStart.setText('停止巡查')
+ self.ui.bStart.setText('停止爬取')
self.start_web(int(self.ui.lSize.text()))
- elif self.ui.bStart.text() == '停止巡查':
+ elif self.ui.bStart.text() == '停止爬取':
self.update_log({'msg': '正在停止...'})
- if self.worker_thread:
- self.worker_thread.close()
+ if self.web_thread:
+ self.web_thread.close()
self.log('', True)
self.ui.lSize.setEnabled(True)
- self.ui.bStart.setText('开始巡查')
+ self.ui.bStart.setText('开始爬取')
def start_web(self, lsize):
- self.worker_thread = MyThread(lsize)
- self.worker_thread.update_signal.connect(self.update_log)
- self.worker_thread.start()
+ self.web_thread = MyThread(lsize)
+ self.web_thread.update_signal.connect(self.update_log)
+ self.web_thread.start()
+    def start_ana(self):
+        """Start the comparison/report step in a background ``AnaThread``.
+
+        A click while an analysis is still running is ignored. Rebinding
+        ``self.ana_thread`` at that point would drop the only reference to a
+        running QThread and start a second run writing the same output files.
+        """
+        if self.ana_thread is not None and self.ana_thread.isRunning():
+            self.update_log({'msg': '分析正在进行中, 请稍候...'})
+            return
+        self.ana_thread = AnaThread()
+        self.ana_thread.update_signal.connect(self.update_log)
+        self.ana_thread.start()
def update_log(self, rdict):
- self.log(f'{self.get_time()}-{rdict["msg"]}', False)
- if 'output_excel_path' in rdict:
- self.ui.lRes1.setText(rdict['output_excel_path'])
- self.ui.bStart.setText('重新开始')
- self.ui.lSize.setEnabled(True)
+ if isinstance(rdict, str):
+ self.log(f'{self.get_time()}-{rdict}', False)
+ elif isinstance(rdict, dict):
+ self.log(f'{self.get_time()}-{rdict["msg"]}', False)
+ if 'output_report_path' in rdict:
+ self.ui.lRes2.setText(rdict['output_report_path'])
+ # self.ui.bStart.setText('重新开始')
+ # self.ui.lSize.setEnabled(True)
+ if 'output_excel_path' in rdict:
+ self.ui.lRes1.setText(rdict['output_excel_path'])
+ # self.ui.bStart.setText('重新开始')
+ # self.ui.lSize.setEnabled(True)
def log(self, logLine: str, clear=False):
log_list = self.logModel.stringList()
@@ -228,12 +309,13 @@ class MainWindow(QMainWindow):
except Exception as e:
print(f"Error while terminating wcplus.exe: {str(e)}")
self.wcplus = False
- if self.worker_thread:
- self.worker_thread.close()
+ if self.web_thread:
+ self.web_thread.close()
event.accept()
if __name__ == "__main__":
+ # gen_doc()
app = MyApplication(sys.argv)
main_window = app.createMainWindow()
main_window.show()
diff --git a/summary/template_report.docx b/summary/template_report.docx
new file mode 100644
index 0000000..13dc734
Binary files /dev/null and b/summary/template_report.docx differ
diff --git a/ui_mainwindow.py b/ui_mainwindow.py
index f849192..38d6bb9 100644
--- a/ui_mainwindow.py
+++ b/ui_mainwindow.py
@@ -23,14 +23,14 @@ class Ui_MainWindow(object):
def setupUi(self, MainWindow):
if not MainWindow.objectName():
MainWindow.setObjectName(u"MainWindow")
- MainWindow.resize(600, 763)
+ MainWindow.resize(600, 830)
sizePolicy = QSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed)
sizePolicy.setHorizontalStretch(0)
sizePolicy.setVerticalStretch(0)
sizePolicy.setHeightForWidth(MainWindow.sizePolicy().hasHeightForWidth())
MainWindow.setSizePolicy(sizePolicy)
- MainWindow.setMinimumSize(QSize(600, 763))
- MainWindow.setMaximumSize(QSize(600, 763))
+ MainWindow.setMinimumSize(QSize(600, 830))
+ MainWindow.setMaximumSize(QSize(600, 830))
icon = QIcon()
icon.addFile(u"start.ico", QSize(), QIcon.Normal, QIcon.Off)
MainWindow.setWindowIcon(icon)
@@ -63,79 +63,87 @@ class Ui_MainWindow(object):
font1.setBold(False)
self.label_5.setFont(font1)
self.label_5.setStyleSheet(u"color: red;")
- self.groupBox_3 = QGroupBox(self.centralwidget)
- self.groupBox_3.setObjectName(u"groupBox_3")
- self.groupBox_3.setGeometry(QRect(10, 380, 191, 91))
- self.groupBox_3.setFont(font)
- self.bBiao = QPushButton(self.groupBox_3)
- self.bBiao.setObjectName(u"bBiao")
- self.bBiao.setGeometry(QRect(20, 30, 151, 24))
- self.bBiao.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
- self.label_4 = QLabel(self.groupBox_3)
- self.label_4.setObjectName(u"label_4")
- self.label_4.setGeometry(QRect(20, 60, 151, 16))
- self.label_4.setFont(font1)
- self.label_4.setStyleSheet(u"color: red;")
self.groupBox_5 = QGroupBox(self.centralwidget)
self.groupBox_5.setObjectName(u"groupBox_5")
- self.groupBox_5.setGeometry(QRect(10, 650, 581, 71))
+ self.groupBox_5.setGeometry(QRect(10, 540, 581, 121))
self.groupBox_5.setFont(font)
self.label_7 = QLabel(self.groupBox_5)
self.label_7.setObjectName(u"label_7")
- self.label_7.setGeometry(QRect(10, 20, 91, 16))
+ self.label_7.setGeometry(QRect(10, 70, 91, 16))
font2 = QFont()
font2.setPointSize(10)
self.label_7.setFont(font2)
self.label_8 = QLabel(self.groupBox_5)
self.label_8.setObjectName(u"label_8")
- self.label_8.setGeometry(QRect(10, 40, 91, 16))
+ self.label_8.setGeometry(QRect(10, 90, 91, 16))
self.label_8.setFont(font2)
self.line = QFrame(self.groupBox_5)
self.line.setObjectName(u"line")
- self.line.setGeometry(QRect(10, 30, 561, 16))
+ self.line.setGeometry(QRect(10, 80, 561, 16))
self.line.setFrameShape(QFrame.HLine)
self.line.setFrameShadow(QFrame.Sunken)
self.lRes1 = QLabel(self.groupBox_5)
self.lRes1.setObjectName(u"lRes1")
- self.lRes1.setGeometry(QRect(110, 15, 381, 21))
+ self.lRes1.setGeometry(QRect(110, 65, 381, 21))
font3 = QFont()
font3.setPointSize(9)
self.lRes1.setFont(font3)
self.lRes2 = QLabel(self.groupBox_5)
self.lRes2.setObjectName(u"lRes2")
- self.lRes2.setGeometry(QRect(110, 40, 381, 16))
+ self.lRes2.setGeometry(QRect(110, 90, 381, 16))
self.lRes2.setFont(font3)
self.bRes1 = QPushButton(self.groupBox_5)
self.bRes1.setObjectName(u"bRes1")
- self.bRes1.setGeometry(QRect(520, 10, 51, 24))
+ self.bRes1.setGeometry(QRect(520, 60, 51, 24))
self.bRes1.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
self.bRes2 = QPushButton(self.groupBox_5)
self.bRes2.setObjectName(u"bRes2")
- self.bRes2.setGeometry(QRect(520, 40, 51, 24))
+ self.bRes2.setGeometry(QRect(520, 90, 51, 24))
+ self.bBiao = QPushButton(self.groupBox_5)
+ self.bBiao.setObjectName(u"bBiao")
+ self.bBiao.setGeometry(QRect(20, 30, 151, 24))
+ self.bBiao.setFont(font)
+ self.bBiao.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
+ self.label_4 = QLabel(self.groupBox_5)
+ self.label_4.setObjectName(u"label_4")
+ self.label_4.setGeometry(QRect(180, 30, 151, 16))
+ font4 = QFont()
+ font4.setFamilies([u"\u6977\u4f53"])
+ font4.setPointSize(11)
+ font4.setBold(False)
+ self.label_4.setFont(font4)
+ self.label_4.setStyleSheet(u"color: red;")
+ self.bAna = QPushButton(self.groupBox_5)
+ self.bAna.setObjectName(u"bAna")
+ self.bAna.setGeometry(QRect(420, 30, 151, 24))
+ font5 = QFont()
+ font5.setPointSize(12)
+ self.bAna.setFont(font5)
+ self.bAna.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
self.label_9 = QLabel(self.centralwidget)
self.label_9.setObjectName(u"label_9")
self.label_9.setGeometry(QRect(150, 0, 291, 31))
- font4 = QFont()
- font4.setFamilies([u"\u6977\u4f53"])
- font4.setPointSize(12)
- font4.setBold(False)
- font4.setItalic(False)
- self.label_9.setFont(font4)
+ font6 = QFont()
+ font6.setFamilies([u"\u6977\u4f53"])
+ font6.setPointSize(12)
+ font6.setBold(False)
+ font6.setItalic(False)
+ self.label_9.setFont(font6)
self.label_9.setStyleSheet(u"color:white;")
self.label_9.setAlignment(Qt.AlignRight|Qt.AlignTrailing|Qt.AlignVCenter)
self.label_9.setMargin(6)
self.groupBox_6 = QGroupBox(self.centralwidget)
self.groupBox_6.setObjectName(u"groupBox_6")
- self.groupBox_6.setGeometry(QRect(210, 280, 371, 361))
+ self.groupBox_6.setGeometry(QRect(220, 280, 371, 251))
self.groupBox_6.setFont(font)
self.vLog = QListView(self.groupBox_6)
self.vLog.setObjectName(u"vLog")
- self.vLog.setGeometry(QRect(10, 20, 351, 321))
+ self.vLog.setGeometry(QRect(10, 20, 351, 221))
self.vLog.setFont(font3)
self.vLog.setStyleSheet(u"")
self.groupBox_2 = QGroupBox(self.centralwidget)
self.groupBox_2.setObjectName(u"groupBox_2")
- self.groupBox_2.setGeometry(QRect(10, 490, 191, 151))
+ self.groupBox_2.setGeometry(QRect(10, 380, 191, 151))
self.groupBox_2.setFont(font)
self.bWebSite = QPushButton(self.groupBox_2)
self.bWebSite.setObjectName(u"bWebSite")
@@ -158,10 +166,45 @@ class Ui_MainWindow(object):
self.bStart = QPushButton(self.groupBox_2)
self.bStart.setObjectName(u"bStart")
self.bStart.setGeometry(QRect(20, 110, 151, 24))
- font5 = QFont()
- font5.setPointSize(12)
self.bStart.setFont(font5)
self.bStart.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
+ self.groupBox_7 = QGroupBox(self.centralwidget)
+ self.groupBox_7.setObjectName(u"groupBox_7")
+ self.groupBox_7.setGeometry(QRect(10, 670, 581, 111))
+ self.groupBox_7.setFont(font)
+ self.label_10 = QLabel(self.groupBox_7)
+ self.label_10.setObjectName(u"label_10")
+ self.label_10.setGeometry(QRect(10, 60, 91, 16))
+ self.label_10.setFont(font2)
+ self.label_11 = QLabel(self.groupBox_7)
+ self.label_11.setObjectName(u"label_11")
+ self.label_11.setGeometry(QRect(10, 80, 91, 16))
+ self.label_11.setFont(font2)
+ self.line_2 = QFrame(self.groupBox_7)
+ self.line_2.setObjectName(u"line_2")
+ self.line_2.setGeometry(QRect(10, 70, 561, 16))
+ self.line_2.setFrameShape(QFrame.HLine)
+ self.line_2.setFrameShadow(QFrame.Sunken)
+ self.lCalRes1 = QLabel(self.groupBox_7)
+ self.lCalRes1.setObjectName(u"lCalRes1")
+ self.lCalRes1.setGeometry(QRect(110, 55, 381, 21))
+ self.lCalRes1.setFont(font3)
+ self.lCalRes2 = QLabel(self.groupBox_7)
+ self.lCalRes2.setObjectName(u"lCalRes2")
+ self.lCalRes2.setGeometry(QRect(110, 80, 381, 16))
+ self.lCalRes2.setFont(font3)
+ self.bOpenCalRes1 = QPushButton(self.groupBox_7)
+ self.bOpenCalRes1.setObjectName(u"bOpenCalRes1")
+ self.bOpenCalRes1.setGeometry(QRect(520, 50, 51, 24))
+ self.bOpenCalRes1.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
+ self.bOpenCalRes2 = QPushButton(self.groupBox_7)
+ self.bOpenCalRes2.setObjectName(u"bOpenCalRes2")
+ self.bOpenCalRes2.setGeometry(QRect(520, 80, 51, 24))
+ self.bCal = QPushButton(self.groupBox_7)
+ self.bCal.setObjectName(u"bCal")
+ self.bCal.setGeometry(QRect(20, 30, 151, 24))
+ self.bCal.setFont(font)
+ self.bCal.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
MainWindow.setCentralWidget(self.centralwidget)
self.menubar = QMenuBar(MainWindow)
self.menubar.setObjectName(u"menubar")
@@ -182,24 +225,32 @@ class Ui_MainWindow(object):
self.groupBox.setTitle(QCoreApplication.translate("MainWindow", u"1.\u5fae\u4fe1\u516c\u4f17\u53f7\u4fe1\u606f\u6293\u53d6", None))
self.bWechat.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5de5\u5177", None))
self.label_5.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u786e\u4fdd\u6240\u6709\u516c\u4f17\u53f7\u6293\u53d6\u5b8c\u6bd5", None))
- self.groupBox_3.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u5206\u6790\u5bf9\u6bd4\u5e93", None))
- self.bBiao.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5206\u6790\u6807\u51c6Excel", None))
- self.label_4.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
- self.groupBox_5.setTitle(QCoreApplication.translate("MainWindow", u"\u6700\u7ec8\u7ed3\u679c", None))
+ self.groupBox_5.setTitle(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u5206\u6790", None))
self.label_7.setText(QCoreApplication.translate("MainWindow", u"\u5206\u6790\u7ed3\u679cExcel:", None))
self.label_8.setText(QCoreApplication.translate("MainWindow", u"\u5206\u6790\u62a5\u544aWord:", None))
self.lRes1.setText("")
self.lRes2.setText("")
self.bRes1.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
self.bRes2.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
+ self.bBiao.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5206\u6790\u6807\u51c6Excel", None))
+ self.label_4.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
+ self.bAna.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u5206\u6790", None))
self.label_9.setText(QCoreApplication.translate("MainWindow", u"\u4e2d\u56fd\u5efa\u6750\u603b\u9662\u5ba3\u4f20\u5de5\u4f5c\u4fe1\u606f\u5316\u7ba1\u7406\u5e73\u53f0", None))
- self.groupBox_6.setTitle(QCoreApplication.translate("MainWindow", u"\u65e5\u5fd7\u663e\u793a", None))
- self.groupBox_2.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u9700\u8981\u6293\u53d6\u7684\u7f51\u7ad9", None))
+ self.groupBox_6.setTitle(QCoreApplication.translate("MainWindow", u"\u64cd\u4f5c\u65e5\u5fd7\u663e\u793a", None))
+ self.groupBox_2.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u9700\u8981\u722c\u53d6\u7684\u5b98\u7f51", None))
self.bWebSite.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u7f51\u7ad9\u5217\u8868Excel", None))
self.label_2.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
self.label_6.setText(QCoreApplication.translate("MainWindow", u"\u5c0f\u4e8e", None))
self.label_3.setText(QCoreApplication.translate("MainWindow", u"KB-Chrome", None))
self.lSize.setText(QCoreApplication.translate("MainWindow", u"20", None))
- self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u5de1\u67e5", None))
+ self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u722c\u53d6", None))
+ self.groupBox_7.setTitle(QCoreApplication.translate("MainWindow", u"\u603b\u9662\u5b98\u5fae", None))
+ self.label_10.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u7ed3\u679cExcel:", None))
+ self.label_11.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u6253\u5206Excel:", None))
+ self.lCalRes1.setText("")
+ self.lCalRes2.setText("")
+ self.bOpenCalRes1.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
+ self.bOpenCalRes2.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
+ self.bCal.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u6253\u5206", None))
# retranslateUi
diff --git a/zcspider/middlewares.py b/zcspider/middlewares.py
index 139f8ab..4551b5a 100644
--- a/zcspider/middlewares.py
+++ b/zcspider/middlewares.py
@@ -4,6 +4,7 @@
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
+from scrapy.http import HtmlResponse
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
@@ -101,3 +102,13 @@ class ZcspiderDownloaderMiddleware:
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class FilterHTMLMiddleware:
+    """Downloader middleware that lets only HTML responses reach the spider."""
+
+    def process_response(self, request, response, spider):
+        """Pass HTML responses through and drop everything else.
+
+        Returning ``request`` from ``process_response`` tells Scrapy to
+        schedule that download again, so a non-HTML URL would be re-queued
+        instead of ignored (and re-downloaded repeatedly when the request
+        was created with ``dont_filter=True``).
+
+        Raises:
+            IgnoreRequest: for any response that is not an ``HtmlResponse``.
+        """
+        # Imported here so the fix stays within this added block.
+        from scrapy.exceptions import IgnoreRequest
+
+        if isinstance(response, HtmlResponse):
+            return response
+        raise IgnoreRequest(f'non-HTML response ignored: {response.url}')
\ No newline at end of file
diff --git a/zcspider/settings.py b/zcspider/settings.py
index 6e4f619..1736130 100644
--- a/zcspider/settings.py
+++ b/zcspider/settings.py
@@ -105,4 +105,9 @@ ITEM_PIPELINES = {
FEED_EXPORTERS = {
# 'xlsx': 'scrapy_xlsx.XlsxItemExporter',
+}
+
+DOWNLOADER_MIDDLEWARES = {
+ 'zcspider.middlewares.FilterHTMLMiddleware': 200,
+ # 其他下载中间件...
}
\ No newline at end of file
diff --git a/zcspider/spiders/base.py b/zcspider/spiders/base.py
index 18bd884..060c995 100644
--- a/zcspider/spiders/base.py
+++ b/zcspider/spiders/base.py
@@ -39,12 +39,14 @@ class BaseSpider(scrapy.Spider):
def start_requests(self):
for url in self.start_urls:
url = self.fix_url_scheme(url)
- r = scrapy.Request(url, dont_filter=True, headers=self.headers, callback=self.parse, errback=self.request2, meta={'download_timeout': 30})
+ self.visited_urls.add(url)
+ r = scrapy.Request(url, dont_filter=True, headers=self.headers, callback=self.parse, meta={'download_timeout': 30})
yield r
def is_file_url(self, url):
- if f'.{url.split(".")[-1].lower()}' in self.ext:
- return True
+ for item in self.ext:
+ if url.lower().endswith(item):
+ return True
return False
def is_file_res(self, res):
@@ -82,42 +84,39 @@ class BaseSpider(scrapy.Spider):
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
def parse(self, response):
- try:
- if response.status >= 500:
- return
- self.visited_urls.add(response.url)
- if self.is_file_res(response):
- return
- h = html2text.HTML2Text()
- h.ignore_links = True # 忽略所有链接
- # 提取纯文本内容
- # try:
- text = h.handle(response.text)
- # except:
- # text = h.handle(response.body.decode(encoding='gb18030'))
- if response.status < 400:
- yield {
- 'group': self.group,
- 'name': self.name,
- 'domain': self.domain,
- 'url': response.url,
- 'text': text,
- }
- links = re.findall(r'href=["\']?([^"\'>]+)', response.text)
- for link in links:
- full_link = response.urljoin(link)
- if not full_link.startswith('http'):
- continue
- if full_link not in self.visited_urls and (self.is_file_url(full_link) is False):
- if urlparse(full_link).netloc.replace('www.', '') == self.domain:
- # try:
- yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
- # except ValueError:
- # import traceback
- # print(traceback.format_exc())
- # print(full_link)
- except scrapy.exceptions.TimeoutError:
- print(f'{response.url}-请求超时取消')
+        """Yield the page text as an item, then queue same-domain links.
+
+        Responses with status >= 500, and responses that ``is_file_res``
+        reports as files, are skipped entirely. An item is produced only
+        for status < 400.
+        """
+        if response.status >= 500:
+            return
+        if self.is_file_res(response):
+            return
+        h = html2text.HTML2Text()
+        h.ignore_links = True  # drop link targets from the extracted text
+        text = h.handle(response.text)
+        if response.status < 400:
+            yield {
+                'group': self.group,
+                'name': self.name,
+                'domain': self.domain,
+                'url': response.url,
+                'text': text,
+            }
+        # NOTE(review): the diff lost its indentation; link extraction is
+        # assumed to run for every non-5xx response, not only < 400 — confirm.
+        links = re.findall(r'href=["\']?([^"\'>]+)', response.text)
+        for link in links:
+            full_link = response.urljoin(link)
+            if not full_link.startswith('http'):
+                continue
+            if full_link in self.visited_urls or self.is_file_url(full_link):
+                continue
+            if urlparse(full_link).netloc.replace('www.', '') != self.domain:
+                continue
+            # Record the link being queued, not the page being parsed:
+            # adding ``response.url`` here left ``full_link`` unmarked, so the
+            # ``visited_urls`` check above could not stop it being queued again.
+            self.visited_urls.add(full_link)
+            yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, meta={'download_timeout': 30})
def closed(self, reason):
# This method will be called when the Spider is about to close