feat: 输出简报
This commit is contained in:
parent
70040f1c0a
commit
e5ff671c97
|
@ -5,7 +5,9 @@ __pycache__/
|
|||
twistd.pid
|
||||
~$*
|
||||
*.xlsx
|
||||
!template.xlsx
|
||||
*.docx
|
||||
!template*.xlsx
|
||||
!template*.docx
|
||||
wechat_dir/*
|
||||
*.csv
|
||||
.idea/*
|
||||
|
|
315
main.ui
315
main.ui
|
@ -7,7 +7,7 @@
|
|||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>600</width>
|
||||
<height>763</height>
|
||||
<height>830</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="sizePolicy">
|
||||
|
@ -19,13 +19,13 @@
|
|||
<property name="minimumSize">
|
||||
<size>
|
||||
<width>600</width>
|
||||
<height>763</height>
|
||||
<height>830</height>
|
||||
</size>
|
||||
</property>
|
||||
<property name="maximumSize">
|
||||
<size>
|
||||
<width>600</width>
|
||||
<height>763</height>
|
||||
<height>830</height>
|
||||
</size>
|
||||
</property>
|
||||
<property name="windowTitle">
|
||||
|
@ -123,70 +123,13 @@
|
|||
</property>
|
||||
</widget>
|
||||
</widget>
|
||||
<widget class="QGroupBox" name="groupBox_3">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>380</y>
|
||||
<width>191</width>
|
||||
<height>91</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>11</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="title">
|
||||
<string>2.确认分析对比库</string>
|
||||
</property>
|
||||
<widget class="QPushButton" name="bBiao">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>20</x>
|
||||
<y>30</y>
|
||||
<width>151</width>
|
||||
<height>24</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>打开分析标准Excel</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLabel" name="label_4">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>20</x>
|
||||
<y>60</y>
|
||||
<width>151</width>
|
||||
<height>16</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<family>楷体</family>
|
||||
<pointsize>10</pointsize>
|
||||
<bold>false</bold>
|
||||
</font>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true">color: red;</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>请在修改后保存并关闭</string>
|
||||
</property>
|
||||
</widget>
|
||||
</widget>
|
||||
<widget class="QGroupBox" name="groupBox_5">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>650</y>
|
||||
<y>540</y>
|
||||
<width>581</width>
|
||||
<height>71</height>
|
||||
<height>121</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
|
@ -195,13 +138,13 @@
|
|||
</font>
|
||||
</property>
|
||||
<property name="title">
|
||||
<string>最终结果</string>
|
||||
<string>汇总分析</string>
|
||||
</property>
|
||||
<widget class="QLabel" name="label_7">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>20</y>
|
||||
<y>70</y>
|
||||
<width>91</width>
|
||||
<height>16</height>
|
||||
</rect>
|
||||
|
@ -219,7 +162,7 @@
|
|||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>40</y>
|
||||
<y>90</y>
|
||||
<width>91</width>
|
||||
<height>16</height>
|
||||
</rect>
|
||||
|
@ -237,7 +180,7 @@
|
|||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>30</y>
|
||||
<y>80</y>
|
||||
<width>561</width>
|
||||
<height>16</height>
|
||||
</rect>
|
||||
|
@ -250,7 +193,7 @@
|
|||
<property name="geometry">
|
||||
<rect>
|
||||
<x>110</x>
|
||||
<y>15</y>
|
||||
<y>65</y>
|
||||
<width>381</width>
|
||||
<height>21</height>
|
||||
</rect>
|
||||
|
@ -268,7 +211,7 @@
|
|||
<property name="geometry">
|
||||
<rect>
|
||||
<x>110</x>
|
||||
<y>40</y>
|
||||
<y>90</y>
|
||||
<width>381</width>
|
||||
<height>16</height>
|
||||
</rect>
|
||||
|
@ -286,7 +229,7 @@
|
|||
<property name="geometry">
|
||||
<rect>
|
||||
<x>520</x>
|
||||
<y>10</y>
|
||||
<y>60</y>
|
||||
<width>51</width>
|
||||
<height>24</height>
|
||||
</rect>
|
||||
|
@ -302,7 +245,7 @@
|
|||
<property name="geometry">
|
||||
<rect>
|
||||
<x>520</x>
|
||||
<y>40</y>
|
||||
<y>90</y>
|
||||
<width>51</width>
|
||||
<height>24</height>
|
||||
</rect>
|
||||
|
@ -311,6 +254,71 @@
|
|||
<string>打开</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QPushButton" name="bBiao">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>20</x>
|
||||
<y>30</y>
|
||||
<width>151</width>
|
||||
<height>24</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>11</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>打开分析标准Excel</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLabel" name="label_4">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>180</x>
|
||||
<y>30</y>
|
||||
<width>151</width>
|
||||
<height>16</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<family>楷体</family>
|
||||
<pointsize>11</pointsize>
|
||||
<bold>false</bold>
|
||||
</font>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true">color: red;</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>请在修改后保存并关闭</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QPushButton" name="bAna">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>420</x>
|
||||
<y>30</y>
|
||||
<width>151</width>
|
||||
<height>24</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>12</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>开始分析</string>
|
||||
</property>
|
||||
</widget>
|
||||
</widget>
|
||||
<widget class="QLabel" name="label_9">
|
||||
<property name="geometry">
|
||||
|
@ -345,10 +353,10 @@
|
|||
<widget class="QGroupBox" name="groupBox_6">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>210</x>
|
||||
<x>220</x>
|
||||
<y>280</y>
|
||||
<width>371</width>
|
||||
<height>361</height>
|
||||
<height>251</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
|
@ -357,7 +365,7 @@
|
|||
</font>
|
||||
</property>
|
||||
<property name="title">
|
||||
<string>日志显示</string>
|
||||
<string>操作日志显示</string>
|
||||
</property>
|
||||
<widget class="QListView" name="vLog">
|
||||
<property name="geometry">
|
||||
|
@ -365,7 +373,7 @@
|
|||
<x>10</x>
|
||||
<y>20</y>
|
||||
<width>351</width>
|
||||
<height>321</height>
|
||||
<height>221</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
|
@ -382,7 +390,7 @@
|
|||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>490</y>
|
||||
<y>380</y>
|
||||
<width>191</width>
|
||||
<height>151</height>
|
||||
</rect>
|
||||
|
@ -393,7 +401,7 @@
|
|||
</font>
|
||||
</property>
|
||||
<property name="title">
|
||||
<string>2.确认需要抓取的网站</string>
|
||||
<string>2.确认需要爬取的官网</string>
|
||||
</property>
|
||||
<widget class="QPushButton" name="bWebSite">
|
||||
<property name="geometry">
|
||||
|
@ -491,7 +499,160 @@
|
|||
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>开始巡查</string>
|
||||
<string>开始爬取</string>
|
||||
</property>
|
||||
</widget>
|
||||
</widget>
|
||||
<widget class="QGroupBox" name="groupBox_7">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>670</y>
|
||||
<width>581</width>
|
||||
<height>111</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>11</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="title">
|
||||
<string>总院官微</string>
|
||||
</property>
|
||||
<widget class="QLabel" name="label_10">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>60</y>
|
||||
<width>91</width>
|
||||
<height>16</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>10</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>汇总结果Excel:</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLabel" name="label_11">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>80</y>
|
||||
<width>91</width>
|
||||
<height>16</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>10</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>汇总打分Excel:</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="Line" name="line_2">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>70</y>
|
||||
<width>561</width>
|
||||
<height>16</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="orientation">
|
||||
<enum>Qt::Horizontal</enum>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLabel" name="lCalRes1">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>110</x>
|
||||
<y>55</y>
|
||||
<width>381</width>
|
||||
<height>21</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>9</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string/>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLabel" name="lCalRes2">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>110</x>
|
||||
<y>80</y>
|
||||
<width>381</width>
|
||||
<height>16</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>9</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string/>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QPushButton" name="bOpenCalRes1">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>520</x>
|
||||
<y>50</y>
|
||||
<width>51</width>
|
||||
<height>24</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>打开</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QPushButton" name="bOpenCalRes2">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>520</x>
|
||||
<y>80</y>
|
||||
<width>51</width>
|
||||
<height>24</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>打开</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QPushButton" name="bCal">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>20</x>
|
||||
<y>30</y>
|
||||
<width>151</width>
|
||||
<height>24</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>11</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>汇总打分</string>
|
||||
</property>
|
||||
</widget>
|
||||
</widget>
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pandas as pd
|
||||
import os
|
||||
import sqlite3
|
||||
from .base import BASE_DIR
|
||||
from mycode.base import BASE_DIR
|
||||
|
||||
wechat_dir = os.path.join(BASE_DIR, 'article')
|
||||
web_dir = os.path.join(BASE_DIR, 'web_dir')
|
||||
|
@ -53,6 +53,10 @@ def ana_wechat():
|
|||
|
||||
if not result.empty:
|
||||
for ind2, row2 in result.iterrows():
|
||||
if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['content']:
|
||||
continue
|
||||
if row['错误表述'] == '20大':
|
||||
continue
|
||||
output_row = [
|
||||
index,
|
||||
row2['nickname'],
|
||||
|
@ -85,6 +89,10 @@ def ana_web():
|
|||
result = df[mask]
|
||||
if not result.empty:
|
||||
for ind2, row2 in result.iterrows():
|
||||
if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['text']:
|
||||
continue
|
||||
if row['错误表述'] == '20大':
|
||||
continue
|
||||
output_row = [
|
||||
index,
|
||||
row2['name'],
|
||||
|
@ -101,4 +109,6 @@ def ana_web():
|
|||
|
||||
return output_data
|
||||
|
||||
if __name__ == "__main__":
|
||||
ana_web()
|
||||
|
||||
|
|
|
@ -6,3 +6,4 @@ scrapy-xlsx==0.1.1
|
|||
selenium==4.9.1
|
||||
pyside6==6.5.2
|
||||
pywin32==306
|
||||
docxtpl==0.16.7
|
162
start.py
162
start.py
|
@ -13,6 +13,9 @@ import pandas as pd
|
|||
from urllib.parse import urlparse
|
||||
from openpyxl import load_workbook
|
||||
import threading
|
||||
import traceback
|
||||
from docxtpl import DocxTemplate
|
||||
import json
|
||||
# from queue import Queue
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
@ -20,6 +23,8 @@ WEB_SITES_PATH = os.path.join(BASE_DIR, 'web_sites.xlsx')
|
|||
BIAO_PATH = os.path.join(BASE_DIR, 'biao.xlsx')
|
||||
PYTHON_PATH = os.path.join(BASE_DIR, 'runtime/python.exe')
|
||||
TEMPLATE_PATH = os.path.join(BASE_DIR, 'summary/template.xlsx')
|
||||
TEMPLATE_REPORT_PATH = os.path.join(BASE_DIR, 'summary/template_report.docx')
|
||||
|
||||
|
||||
def fix_url_scheme(url, default_scheme='http'):
|
||||
# 检查URL是否包含方案
|
||||
|
@ -37,26 +42,49 @@ class MyApplication(QApplication):
|
|||
self.main_window = MainWindow()
|
||||
return self.main_window
|
||||
|
||||
class MyThread(QThread):
|
||||
update_signal = Signal(dict)
|
||||
def gen_doc(w1, w2):
|
||||
now = datetime.datetime.now()
|
||||
now_3 = now - datetime.timedelta(days=3)
|
||||
# with open('w2.json', 'r', encoding='utf-8') as f:
|
||||
# w2 = json.loads(f.read())
|
||||
# with open('w1.json', 'r', encoding='utf-8') as f:
|
||||
# w1 = json.loads(f.read())
|
||||
gdbs = 0
|
||||
yzbs = 0
|
||||
ybwz = 0
|
||||
zzcc = 0
|
||||
context = {'y': now.year, 'm': now.month, 'd': now.day, 'mo': now_3.month, 'do': now_3.day, 'su': 'xx', 'w1': w1, 'w2': w2}
|
||||
output_report_path = os.path.join(BASE_DIR, f'summary/{now.year}年{now.month}月-分析结果简报.docx')
|
||||
doc = DocxTemplate(TEMPLATE_REPORT_PATH)
|
||||
for i in w1:
|
||||
if i[5] == '固定表述错误':
|
||||
gdbs =gdbs + 1
|
||||
elif i[5] == '严重表述错误':
|
||||
yzbs = yzbs +1
|
||||
elif i[5] == '一般文字差错':
|
||||
ybwz = ybwz +1
|
||||
elif i[5] == '政治差错':
|
||||
zzcc = zzcc +1
|
||||
for i in w2:
|
||||
if i[5] == '固定表述错误':
|
||||
gdbs =gdbs + 1
|
||||
elif i[5] == '严重表述错误':
|
||||
yzbs = yzbs +1
|
||||
elif i[5] == '一般文字差错':
|
||||
ybwz = ybwz +1
|
||||
elif i[5] == '政治差错':
|
||||
zzcc = zzcc +1
|
||||
|
||||
def __init__(self, lsize) -> None:
|
||||
super().__init__()
|
||||
self.lsize = lsize
|
||||
self.processes = []
|
||||
self.running = False
|
||||
context['su'] = f'固定表述错误{gdbs}项, 严重表述错误{yzbs}项, 一般文字差错{ybwz}项, 政治差错{zzcc}项'
|
||||
|
||||
def capture_output(self, p):
|
||||
while self.running and p.poll() is None:
|
||||
output = p.stdout.readline()
|
||||
err = p.stderr.readline()
|
||||
if err:
|
||||
self.update_signal.emit({'msg': err.strip()})
|
||||
if output:
|
||||
self.update_signal.emit({'msg': output.strip()})
|
||||
doc.render(context)
|
||||
doc.save(output_report_path)
|
||||
return output_report_path
|
||||
class AnaThread(QThread):
|
||||
update_signal = Signal(object)
|
||||
|
||||
def ana(self):
|
||||
month = datetime.datetime.now().month
|
||||
now = datetime.datetime.now()
|
||||
self.update_signal.emit({'msg': '对比开始...'})
|
||||
self.update_signal.emit({'msg': '正在组合微信公众号爬取内容...'})
|
||||
make_simple_csv_from_db()
|
||||
|
@ -65,9 +93,10 @@ class MyThread(QThread):
|
|||
self.update_signal.emit({'msg': '开始对比分析所有内容...'})
|
||||
wechat_results = ana_wechat()
|
||||
web_results = ana_web()
|
||||
output_excel_path = os.path.join(BASE_DIR, f'summary/{month}月-总院宣传阵地巡查结果汇总表.xlsx')
|
||||
# 生成汇总表
|
||||
self.update_signal.emit({'msg': '开始生成汇总表...'})
|
||||
output_excel_path = os.path.join(BASE_DIR, f'summary/{now.year}年{now.month}月-分析结果汇总表.xlsx')
|
||||
workbook = load_workbook(TEMPLATE_PATH)
|
||||
# 选择要操作的工作表
|
||||
wechat_sheet = workbook['公众号']
|
||||
web_sheet = workbook['网站']
|
||||
for row in wechat_results:
|
||||
|
@ -76,7 +105,46 @@ class MyThread(QThread):
|
|||
web_sheet.append(row)
|
||||
workbook.save(output_excel_path)
|
||||
workbook.close()
|
||||
self.update_signal.emit({'msg': '分析完毕, 请查看结果栏, 可手动校对', 'output_excel_path': output_excel_path})
|
||||
# with open('w1.json', 'w', encoding='utf-8') as f:
|
||||
# f.write(json.dumps(wechat_results, ensure_ascii=False))
|
||||
|
||||
# with open('w2.json', 'w', encoding='utf-8') as f:
|
||||
# f.write(json.dumps(web_results, ensure_ascii=False))
|
||||
# 生成简报
|
||||
self.update_signal.emit({'msg': '开始生成汇总简报...'})
|
||||
output_report_path = gen_doc(wechat_results, web_results)
|
||||
self.update_signal.emit({'msg': '分析完毕, 请查看结果栏, 可手动校对', 'output_excel_path': output_excel_path, 'output_report_path': output_report_path})
|
||||
|
||||
def run(self) -> None:
|
||||
try:
|
||||
self.ana()
|
||||
except Exception as e:
|
||||
self.update_signal.emit({'msg': traceback.format_exc()})
|
||||
|
||||
|
||||
class MyThread(QThread):
|
||||
update_signal = Signal(object)
|
||||
|
||||
def __init__(self, lsize) -> None:
|
||||
"""
|
||||
lsize: 多少kb需要调取Chrome
|
||||
"""
|
||||
super().__init__()
|
||||
self.lsize = lsize
|
||||
self.processes = []
|
||||
self.running = False
|
||||
|
||||
def capture_output(self, p):
|
||||
while self.running and p.poll() is None:
|
||||
output = p.stdout.readline()
|
||||
if output:
|
||||
self.update_signal.emit({'msg': output.strip()})
|
||||
|
||||
def capture_err(self, p):
|
||||
while self.running and p.poll() is None:
|
||||
err = p.stderr.readline()
|
||||
if err:
|
||||
self.update_signal.emit({'msg': err.strip()})
|
||||
|
||||
def run(self) -> None:
|
||||
self.update_signal.emit({'msg': '开始进行网站爬取...'})
|
||||
|
@ -95,6 +163,8 @@ class MyThread(QThread):
|
|||
self.running = True
|
||||
getlog_thread = threading.Thread(target=self.capture_output, args=(process,), daemon=True)
|
||||
getlog_thread.start()
|
||||
getlog_thread_err = threading.Thread(target=self.capture_err, args=(process,), daemon=True)
|
||||
getlog_thread_err.start()
|
||||
|
||||
for process in self.processes:
|
||||
process.wait()
|
||||
|
@ -114,8 +184,6 @@ class MyThread(QThread):
|
|||
self.update_signal.emit({'msg': '存在未爬取站点,正在调用Chrome继续爬取...'})
|
||||
chrom_main_from_list(info_to_save)
|
||||
self.update_signal.emit({'msg': '网站爬取完毕!'})
|
||||
self.ana()
|
||||
self.exec()
|
||||
|
||||
def close(self):
|
||||
self.running = False
|
||||
|
@ -129,7 +197,8 @@ class MainWindow(QMainWindow):
|
|||
|
||||
def __init__(self):
|
||||
super(MainWindow, self).__init__()
|
||||
self.worker_thread = None
|
||||
self.web_thread = None
|
||||
self.ana_thread = None
|
||||
self.wcplus = False
|
||||
self.logModel= QStringListModel([])
|
||||
self.ui = Ui_MainWindow()
|
||||
|
@ -139,6 +208,7 @@ class MainWindow(QMainWindow):
|
|||
self.ui.bWebSite.clicked.connect(self.open_websites_xlsx)
|
||||
self.ui.bBiao.clicked.connect(self.open_biao_xlsx)
|
||||
self.ui.bStart.clicked.connect(self.start)
|
||||
self.ui.bAna.clicked.connect(self.start_ana)
|
||||
self.ui.bRes1.clicked.connect(self.open_res1)
|
||||
self.ui.bRes2.clicked.connect(self.open_res2)
|
||||
self.ui.vLog.setModel(self.logModel)
|
||||
|
@ -170,9 +240,9 @@ class MainWindow(QMainWindow):
|
|||
|
||||
def open_res2(self):
|
||||
if self.ui.lRes2.text():
|
||||
app = win32.Dispatch("Excel.Application")
|
||||
app = win32.Dispatch("Word.Application")
|
||||
app.Visible = True
|
||||
app.Workbooks.Open(self.ui.lRes2.text())
|
||||
app.Documents.Open(self.ui.lRes2.text())
|
||||
app.WindowState = 3
|
||||
|
||||
def get_time(self):
|
||||
|
@ -180,34 +250,45 @@ class MainWindow(QMainWindow):
|
|||
return now.strftime('%H:%M:%S')
|
||||
|
||||
def start(self):
|
||||
if self.ui.bStart.text() == '开始巡查' or self.ui.bStart.text() == '重新开始':
|
||||
if self.ui.bStart.text() == '开始爬取' or self.ui.bStart.text() == '重新开始':
|
||||
self.log('', True)
|
||||
if self.res1Workbook:
|
||||
self.res1Workbook.Close()
|
||||
self.ui.lSize.setEnabled(False)
|
||||
self.ui.bStart.setText('停止巡查')
|
||||
self.ui.bStart.setText('停止爬取')
|
||||
self.start_web(int(self.ui.lSize.text()))
|
||||
elif self.ui.bStart.text() == '停止巡查':
|
||||
elif self.ui.bStart.text() == '停止爬取':
|
||||
self.update_log({'msg': '正在停止...'})
|
||||
if self.worker_thread:
|
||||
self.worker_thread.close()
|
||||
if self.web_thread:
|
||||
self.web_thread.close()
|
||||
self.log('', True)
|
||||
self.ui.lSize.setEnabled(True)
|
||||
self.ui.bStart.setText('开始巡查')
|
||||
self.ui.bStart.setText('开始爬取')
|
||||
|
||||
|
||||
def start_web(self, lsize):
|
||||
self.worker_thread = MyThread(lsize)
|
||||
self.worker_thread.update_signal.connect(self.update_log)
|
||||
self.worker_thread.start()
|
||||
self.web_thread = MyThread(lsize)
|
||||
self.web_thread.update_signal.connect(self.update_log)
|
||||
self.web_thread.start()
|
||||
|
||||
def start_ana(self):
|
||||
self.ana_thread = AnaThread()
|
||||
self.ana_thread.update_signal.connect(self.update_log)
|
||||
self.ana_thread.start()
|
||||
|
||||
def update_log(self, rdict):
|
||||
self.log(f'{self.get_time()}-{rdict["msg"]}', False)
|
||||
if 'output_excel_path' in rdict:
|
||||
self.ui.lRes1.setText(rdict['output_excel_path'])
|
||||
self.ui.bStart.setText('重新开始')
|
||||
self.ui.lSize.setEnabled(True)
|
||||
if isinstance(rdict, str):
|
||||
self.log(f'{self.get_time()}-{rdict}', False)
|
||||
elif isinstance(rdict, dict):
|
||||
self.log(f'{self.get_time()}-{rdict["msg"]}', False)
|
||||
if 'output_report_path' in rdict:
|
||||
self.ui.lRes2.setText(rdict['output_report_path'])
|
||||
# self.ui.bStart.setText('重新开始')
|
||||
# self.ui.lSize.setEnabled(True)
|
||||
if 'output_excel_path' in rdict:
|
||||
self.ui.lRes1.setText(rdict['output_excel_path'])
|
||||
# self.ui.bStart.setText('重新开始')
|
||||
# self.ui.lSize.setEnabled(True)
|
||||
|
||||
def log(self, logLine: str, clear=False):
|
||||
log_list = self.logModel.stringList()
|
||||
|
@ -228,12 +309,13 @@ class MainWindow(QMainWindow):
|
|||
except Exception as e:
|
||||
print(f"Error while terminating wcplus.exe: {str(e)}")
|
||||
self.wcplus = False
|
||||
if self.worker_thread:
|
||||
self.worker_thread.close()
|
||||
if self.web_thread:
|
||||
self.web_thread.close()
|
||||
event.accept()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# gen_doc()
|
||||
app = MyApplication(sys.argv)
|
||||
main_window = app.createMainWindow()
|
||||
main_window.show()
|
||||
|
|
Binary file not shown.
135
ui_mainwindow.py
135
ui_mainwindow.py
|
@ -23,14 +23,14 @@ class Ui_MainWindow(object):
|
|||
def setupUi(self, MainWindow):
|
||||
if not MainWindow.objectName():
|
||||
MainWindow.setObjectName(u"MainWindow")
|
||||
MainWindow.resize(600, 763)
|
||||
MainWindow.resize(600, 830)
|
||||
sizePolicy = QSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed)
|
||||
sizePolicy.setHorizontalStretch(0)
|
||||
sizePolicy.setVerticalStretch(0)
|
||||
sizePolicy.setHeightForWidth(MainWindow.sizePolicy().hasHeightForWidth())
|
||||
MainWindow.setSizePolicy(sizePolicy)
|
||||
MainWindow.setMinimumSize(QSize(600, 763))
|
||||
MainWindow.setMaximumSize(QSize(600, 763))
|
||||
MainWindow.setMinimumSize(QSize(600, 830))
|
||||
MainWindow.setMaximumSize(QSize(600, 830))
|
||||
icon = QIcon()
|
||||
icon.addFile(u"start.ico", QSize(), QIcon.Normal, QIcon.Off)
|
||||
MainWindow.setWindowIcon(icon)
|
||||
|
@ -63,79 +63,87 @@ class Ui_MainWindow(object):
|
|||
font1.setBold(False)
|
||||
self.label_5.setFont(font1)
|
||||
self.label_5.setStyleSheet(u"color: red;")
|
||||
self.groupBox_3 = QGroupBox(self.centralwidget)
|
||||
self.groupBox_3.setObjectName(u"groupBox_3")
|
||||
self.groupBox_3.setGeometry(QRect(10, 380, 191, 91))
|
||||
self.groupBox_3.setFont(font)
|
||||
self.bBiao = QPushButton(self.groupBox_3)
|
||||
self.bBiao.setObjectName(u"bBiao")
|
||||
self.bBiao.setGeometry(QRect(20, 30, 151, 24))
|
||||
self.bBiao.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
|
||||
self.label_4 = QLabel(self.groupBox_3)
|
||||
self.label_4.setObjectName(u"label_4")
|
||||
self.label_4.setGeometry(QRect(20, 60, 151, 16))
|
||||
self.label_4.setFont(font1)
|
||||
self.label_4.setStyleSheet(u"color: red;")
|
||||
self.groupBox_5 = QGroupBox(self.centralwidget)
|
||||
self.groupBox_5.setObjectName(u"groupBox_5")
|
||||
self.groupBox_5.setGeometry(QRect(10, 650, 581, 71))
|
||||
self.groupBox_5.setGeometry(QRect(10, 540, 581, 121))
|
||||
self.groupBox_5.setFont(font)
|
||||
self.label_7 = QLabel(self.groupBox_5)
|
||||
self.label_7.setObjectName(u"label_7")
|
||||
self.label_7.setGeometry(QRect(10, 20, 91, 16))
|
||||
self.label_7.setGeometry(QRect(10, 70, 91, 16))
|
||||
font2 = QFont()
|
||||
font2.setPointSize(10)
|
||||
self.label_7.setFont(font2)
|
||||
self.label_8 = QLabel(self.groupBox_5)
|
||||
self.label_8.setObjectName(u"label_8")
|
||||
self.label_8.setGeometry(QRect(10, 40, 91, 16))
|
||||
self.label_8.setGeometry(QRect(10, 90, 91, 16))
|
||||
self.label_8.setFont(font2)
|
||||
self.line = QFrame(self.groupBox_5)
|
||||
self.line.setObjectName(u"line")
|
||||
self.line.setGeometry(QRect(10, 30, 561, 16))
|
||||
self.line.setGeometry(QRect(10, 80, 561, 16))
|
||||
self.line.setFrameShape(QFrame.HLine)
|
||||
self.line.setFrameShadow(QFrame.Sunken)
|
||||
self.lRes1 = QLabel(self.groupBox_5)
|
||||
self.lRes1.setObjectName(u"lRes1")
|
||||
self.lRes1.setGeometry(QRect(110, 15, 381, 21))
|
||||
self.lRes1.setGeometry(QRect(110, 65, 381, 21))
|
||||
font3 = QFont()
|
||||
font3.setPointSize(9)
|
||||
self.lRes1.setFont(font3)
|
||||
self.lRes2 = QLabel(self.groupBox_5)
|
||||
self.lRes2.setObjectName(u"lRes2")
|
||||
self.lRes2.setGeometry(QRect(110, 40, 381, 16))
|
||||
self.lRes2.setGeometry(QRect(110, 90, 381, 16))
|
||||
self.lRes2.setFont(font3)
|
||||
self.bRes1 = QPushButton(self.groupBox_5)
|
||||
self.bRes1.setObjectName(u"bRes1")
|
||||
self.bRes1.setGeometry(QRect(520, 10, 51, 24))
|
||||
self.bRes1.setGeometry(QRect(520, 60, 51, 24))
|
||||
self.bRes1.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
|
||||
self.bRes2 = QPushButton(self.groupBox_5)
|
||||
self.bRes2.setObjectName(u"bRes2")
|
||||
self.bRes2.setGeometry(QRect(520, 40, 51, 24))
|
||||
self.bRes2.setGeometry(QRect(520, 90, 51, 24))
|
||||
self.bBiao = QPushButton(self.groupBox_5)
|
||||
self.bBiao.setObjectName(u"bBiao")
|
||||
self.bBiao.setGeometry(QRect(20, 30, 151, 24))
|
||||
self.bBiao.setFont(font)
|
||||
self.bBiao.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
|
||||
self.label_4 = QLabel(self.groupBox_5)
|
||||
self.label_4.setObjectName(u"label_4")
|
||||
self.label_4.setGeometry(QRect(180, 30, 151, 16))
|
||||
font4 = QFont()
|
||||
font4.setFamilies([u"\u6977\u4f53"])
|
||||
font4.setPointSize(11)
|
||||
font4.setBold(False)
|
||||
self.label_4.setFont(font4)
|
||||
self.label_4.setStyleSheet(u"color: red;")
|
||||
self.bAna = QPushButton(self.groupBox_5)
|
||||
self.bAna.setObjectName(u"bAna")
|
||||
self.bAna.setGeometry(QRect(420, 30, 151, 24))
|
||||
font5 = QFont()
|
||||
font5.setPointSize(12)
|
||||
self.bAna.setFont(font5)
|
||||
self.bAna.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
|
||||
self.label_9 = QLabel(self.centralwidget)
|
||||
self.label_9.setObjectName(u"label_9")
|
||||
self.label_9.setGeometry(QRect(150, 0, 291, 31))
|
||||
font4 = QFont()
|
||||
font4.setFamilies([u"\u6977\u4f53"])
|
||||
font4.setPointSize(12)
|
||||
font4.setBold(False)
|
||||
font4.setItalic(False)
|
||||
self.label_9.setFont(font4)
|
||||
font6 = QFont()
|
||||
font6.setFamilies([u"\u6977\u4f53"])
|
||||
font6.setPointSize(12)
|
||||
font6.setBold(False)
|
||||
font6.setItalic(False)
|
||||
self.label_9.setFont(font6)
|
||||
self.label_9.setStyleSheet(u"color:white;")
|
||||
self.label_9.setAlignment(Qt.AlignRight|Qt.AlignTrailing|Qt.AlignVCenter)
|
||||
self.label_9.setMargin(6)
|
||||
self.groupBox_6 = QGroupBox(self.centralwidget)
|
||||
self.groupBox_6.setObjectName(u"groupBox_6")
|
||||
self.groupBox_6.setGeometry(QRect(210, 280, 371, 361))
|
||||
self.groupBox_6.setGeometry(QRect(220, 280, 371, 251))
|
||||
self.groupBox_6.setFont(font)
|
||||
self.vLog = QListView(self.groupBox_6)
|
||||
self.vLog.setObjectName(u"vLog")
|
||||
self.vLog.setGeometry(QRect(10, 20, 351, 321))
|
||||
self.vLog.setGeometry(QRect(10, 20, 351, 221))
|
||||
self.vLog.setFont(font3)
|
||||
self.vLog.setStyleSheet(u"")
|
||||
self.groupBox_2 = QGroupBox(self.centralwidget)
|
||||
self.groupBox_2.setObjectName(u"groupBox_2")
|
||||
self.groupBox_2.setGeometry(QRect(10, 490, 191, 151))
|
||||
self.groupBox_2.setGeometry(QRect(10, 380, 191, 151))
|
||||
self.groupBox_2.setFont(font)
|
||||
self.bWebSite = QPushButton(self.groupBox_2)
|
||||
self.bWebSite.setObjectName(u"bWebSite")
|
||||
|
@ -158,10 +166,45 @@ class Ui_MainWindow(object):
|
|||
self.bStart = QPushButton(self.groupBox_2)
|
||||
self.bStart.setObjectName(u"bStart")
|
||||
self.bStart.setGeometry(QRect(20, 110, 151, 24))
|
||||
font5 = QFont()
|
||||
font5.setPointSize(12)
|
||||
self.bStart.setFont(font5)
|
||||
self.bStart.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
|
||||
self.groupBox_7 = QGroupBox(self.centralwidget)
|
||||
self.groupBox_7.setObjectName(u"groupBox_7")
|
||||
self.groupBox_7.setGeometry(QRect(10, 670, 581, 111))
|
||||
self.groupBox_7.setFont(font)
|
||||
self.label_10 = QLabel(self.groupBox_7)
|
||||
self.label_10.setObjectName(u"label_10")
|
||||
self.label_10.setGeometry(QRect(10, 60, 91, 16))
|
||||
self.label_10.setFont(font2)
|
||||
self.label_11 = QLabel(self.groupBox_7)
|
||||
self.label_11.setObjectName(u"label_11")
|
||||
self.label_11.setGeometry(QRect(10, 80, 91, 16))
|
||||
self.label_11.setFont(font2)
|
||||
self.line_2 = QFrame(self.groupBox_7)
|
||||
self.line_2.setObjectName(u"line_2")
|
||||
self.line_2.setGeometry(QRect(10, 70, 561, 16))
|
||||
self.line_2.setFrameShape(QFrame.HLine)
|
||||
self.line_2.setFrameShadow(QFrame.Sunken)
|
||||
self.lCalRes1 = QLabel(self.groupBox_7)
|
||||
self.lCalRes1.setObjectName(u"lCalRes1")
|
||||
self.lCalRes1.setGeometry(QRect(110, 55, 381, 21))
|
||||
self.lCalRes1.setFont(font3)
|
||||
self.lCalRes2 = QLabel(self.groupBox_7)
|
||||
self.lCalRes2.setObjectName(u"lCalRes2")
|
||||
self.lCalRes2.setGeometry(QRect(110, 80, 381, 16))
|
||||
self.lCalRes2.setFont(font3)
|
||||
self.bOpenCalRes1 = QPushButton(self.groupBox_7)
|
||||
self.bOpenCalRes1.setObjectName(u"bOpenCalRes1")
|
||||
self.bOpenCalRes1.setGeometry(QRect(520, 50, 51, 24))
|
||||
self.bOpenCalRes1.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
|
||||
self.bOpenCalRes2 = QPushButton(self.groupBox_7)
|
||||
self.bOpenCalRes2.setObjectName(u"bOpenCalRes2")
|
||||
self.bOpenCalRes2.setGeometry(QRect(520, 80, 51, 24))
|
||||
self.bCal = QPushButton(self.groupBox_7)
|
||||
self.bCal.setObjectName(u"bCal")
|
||||
self.bCal.setGeometry(QRect(20, 30, 151, 24))
|
||||
self.bCal.setFont(font)
|
||||
self.bCal.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
|
||||
MainWindow.setCentralWidget(self.centralwidget)
|
||||
self.menubar = QMenuBar(MainWindow)
|
||||
self.menubar.setObjectName(u"menubar")
|
||||
|
@ -182,24 +225,32 @@ class Ui_MainWindow(object):
|
|||
self.groupBox.setTitle(QCoreApplication.translate("MainWindow", u"1.\u5fae\u4fe1\u516c\u4f17\u53f7\u4fe1\u606f\u6293\u53d6", None))
|
||||
self.bWechat.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5de5\u5177", None))
|
||||
self.label_5.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u786e\u4fdd\u6240\u6709\u516c\u4f17\u53f7\u6293\u53d6\u5b8c\u6bd5", None))
|
||||
self.groupBox_3.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u5206\u6790\u5bf9\u6bd4\u5e93", None))
|
||||
self.bBiao.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5206\u6790\u6807\u51c6Excel", None))
|
||||
self.label_4.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
|
||||
self.groupBox_5.setTitle(QCoreApplication.translate("MainWindow", u"\u6700\u7ec8\u7ed3\u679c", None))
|
||||
self.groupBox_5.setTitle(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u5206\u6790", None))
|
||||
self.label_7.setText(QCoreApplication.translate("MainWindow", u"\u5206\u6790\u7ed3\u679cExcel:", None))
|
||||
self.label_8.setText(QCoreApplication.translate("MainWindow", u"\u5206\u6790\u62a5\u544aWord:", None))
|
||||
self.lRes1.setText("")
|
||||
self.lRes2.setText("")
|
||||
self.bRes1.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
|
||||
self.bRes2.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
|
||||
self.bBiao.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5206\u6790\u6807\u51c6Excel", None))
|
||||
self.label_4.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
|
||||
self.bAna.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u5206\u6790", None))
|
||||
self.label_9.setText(QCoreApplication.translate("MainWindow", u"\u4e2d\u56fd\u5efa\u6750\u603b\u9662\u5ba3\u4f20\u5de5\u4f5c\u4fe1\u606f\u5316\u7ba1\u7406\u5e73\u53f0", None))
|
||||
self.groupBox_6.setTitle(QCoreApplication.translate("MainWindow", u"\u65e5\u5fd7\u663e\u793a", None))
|
||||
self.groupBox_2.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u9700\u8981\u6293\u53d6\u7684\u7f51\u7ad9", None))
|
||||
self.groupBox_6.setTitle(QCoreApplication.translate("MainWindow", u"\u64cd\u4f5c\u65e5\u5fd7\u663e\u793a", None))
|
||||
self.groupBox_2.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u9700\u8981\u722c\u53d6\u7684\u5b98\u7f51", None))
|
||||
self.bWebSite.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u7f51\u7ad9\u5217\u8868Excel", None))
|
||||
self.label_2.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
|
||||
self.label_6.setText(QCoreApplication.translate("MainWindow", u"\u5c0f\u4e8e", None))
|
||||
self.label_3.setText(QCoreApplication.translate("MainWindow", u"KB-Chrome", None))
|
||||
self.lSize.setText(QCoreApplication.translate("MainWindow", u"20", None))
|
||||
self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u5de1\u67e5", None))
|
||||
self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u722c\u53d6", None))
|
||||
self.groupBox_7.setTitle(QCoreApplication.translate("MainWindow", u"\u603b\u9662\u5b98\u5fae", None))
|
||||
self.label_10.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u7ed3\u679cExcel:", None))
|
||||
self.label_11.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u6253\u5206Excel:", None))
|
||||
self.lCalRes1.setText("")
|
||||
self.lCalRes2.setText("")
|
||||
self.bOpenCalRes1.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
|
||||
self.bOpenCalRes2.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
|
||||
self.bCal.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u6253\u5206", None))
|
||||
# retranslateUi
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.http import HtmlResponse
|
||||
|
||||
# useful for handling different item types with a single interface
|
||||
from itemadapter import is_item, ItemAdapter
|
||||
|
@ -101,3 +102,13 @@ class ZcspiderDownloaderMiddleware:
|
|||
|
||||
def spider_opened(self, spider):
    """Log the name of the spider that has just been opened."""
    # The name is handed to the logger as an argument so formatting
    # happens only if the INFO record is actually emitted.
    spider.logger.info("Spider opened: %s", spider.name)
|
||||
|
||||
|
||||
class FilterHTMLMiddleware:
    """Downloader middleware that lets only HTML responses reach the spider."""

    def process_response(self, request, response, spider):
        """Pass HTML responses through and drop every other resource type.

        Scrapy interprets a ``Request`` returned from ``process_response``
        as "download this again", so returning ``request`` for a non-HTML
        resource re-queued it instead of discarding it. Raising
        ``IgnoreRequest`` is the documented way to drop a response.

        Args:
            request: The request that produced ``response``.
            response: The downloaded response.
            spider: The spider the request belongs to (unused).

        Returns:
            ``response`` unchanged when it is an ``HtmlResponse``.

        Raises:
            IgnoreRequest: When the response is not HTML.
        """
        # Local import keeps this class independent of the module's import block.
        from scrapy.exceptions import IgnoreRequest

        # Only HTML responses are accepted.
        if isinstance(response, HtmlResponse):
            return response

        # Any other resource (images, archives, documents, ...) is discarded.
        raise IgnoreRequest(f"non-HTML response dropped: {response.url}")
|
|
@ -106,3 +106,8 @@ ITEM_PIPELINES = {
|
|||
FEED_EXPORTERS = {
|
||||
# 'xlsx': 'scrapy_xlsx.XlsxItemExporter',
|
||||
}
|
||||
|
||||
DOWNLOADER_MIDDLEWARES = {
|
||||
'zcspider.middlewares.FilterHTMLMiddleware': 200,
|
||||
# other downloader middlewares...
|
||||
}
|
|
@ -39,12 +39,14 @@ class BaseSpider(scrapy.Spider):
|
|||
def start_requests(self):
    """Yield the initial request for every URL in ``self.start_urls``.

    Each URL is first passed through ``self.fix_url_scheme`` and recorded
    in ``self.visited_urls`` before its request is issued, so links that
    point back to a start page are not scheduled again by ``parse``.

    Yields:
        scrapy.Request: One request per start URL, handled by ``parse``,
        with a 30 second download timeout and duplicate filtering disabled.
    """
    for url in self.start_urls:
        url = self.fix_url_scheme(url)
        self.visited_urls.add(url)
        # A single Request is built per URL; the earlier variant that was
        # constructed and immediately overwritten has been removed.
        yield scrapy.Request(
            url,
            dont_filter=True,
            headers=self.headers,
            callback=self.parse,
            meta={'download_timeout': 30},
        )
|
||||
|
||||
def is_file_url(self, url):
    """Return True when ``url`` ends with one of the extensions in ``self.ext``.

    The comparison is done on the lower-cased URL, so ``.PDF`` matches an
    ``.pdf`` entry. An empty ``self.ext`` never matches.

    Args:
        url: The URL string to test.

    Returns:
        bool: True if the URL ends with any entry of ``self.ext``.
    """
    # str.endswith accepts a tuple of suffixes, replacing both the
    # last-dot suffix lookup and the explicit loop with one call.
    return url.lower().endswith(tuple(self.ext))
|
||||
|
||||
def is_file_res(self, res):
|
||||
|
@ -82,42 +84,39 @@ class BaseSpider(scrapy.Spider):
|
|||
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
|
||||
|
||||
def parse(self, response):
    """Extract the page text and schedule same-domain links found on it.

    Responses with status >= 500, and responses that ``self.is_file_res``
    reports as files, are skipped entirely. For every other response the
    HTML is converted to plain text with html2text (links ignored); an
    item is emitted only when the status is below 400. Links are followed
    regardless of status as long as they are http(s), not yet visited,
    not file URLs and on ``self.domain``.

    Args:
        response: The downloaded response for a page.

    Yields:
        dict: ``group``, ``name``, ``domain``, ``url`` and ``text`` for the
        page (status < 400 only).
        scrapy.Request: One request per newly discovered same-domain link.
    """
    if response.status >= 500:
        return
    if self.is_file_res(response):
        return

    # Record the URL that was actually served so links pointing back to
    # this page are not scheduled again.
    self.visited_urls.add(response.url)

    converter = html2text.HTML2Text()
    converter.ignore_links = True  # drop all hyperlinks from the text output
    # Plain-text rendering of the page.
    text = converter.handle(response.text)

    if response.status < 400:
        yield {
            'group': self.group,
            'name': self.name,
            'domain': self.domain,
            'url': response.url,
            'text': text,
        }

    for link in re.findall(r'href=["\']?([^"\'>]+)', response.text):
        try:
            full_link = response.urljoin(link)
            netloc = urlparse(full_link).netloc
        except ValueError:
            # A malformed href must not abort parsing of the whole page.
            continue
        if not full_link.startswith('http'):
            continue
        if full_link in self.visited_urls or self.is_file_url(full_link):
            continue
        if netloc.replace('www.', '') != self.domain:
            continue
        # Mark the link itself (not the current page) as visited, so the
        # same link found on other pages is not yielded again.
        self.visited_urls.add(full_link)
        yield scrapy.Request(
            full_link,
            callback=self.parse,
            headers=self.headers,
            meta={'download_timeout': 30},
        )
|
||||
|
||||
def closed(self, reason):
|
||||
# This method will be called when the Spider is about to close
|
||||
|
|
Loading…
Reference in New Issue