feat: 功能优化

This commit is contained in:
caoqianming 2023-10-30 13:39:54 +08:00
parent 7480590bd3
commit 70040f1c0a
13 changed files with 404 additions and 386 deletions

6
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,6 @@
{
"[python]": {
"editor.defaultFormatter": "ms-python.autopep8"
},
"python.formatting.provider": "none"
}

330
main.ui
View File

@ -7,7 +7,7 @@
<x>0</x>
<y>0</y>
<width>600</width>
<height>725</height>
<height>763</height>
</rect>
</property>
<property name="sizePolicy">
@ -19,13 +19,13 @@
<property name="minimumSize">
<size>
<width>600</width>
<height>725</height>
<height>763</height>
</size>
</property>
<property name="maximumSize">
<size>
<width>600</width>
<height>725</height>
<height>763</height>
</size>
</property>
<property name="windowTitle">
@ -80,7 +80,7 @@
<rect>
<x>20</x>
<y>30</y>
<width>75</width>
<width>151</width>
<height>24</height>
</rect>
</property>
@ -123,70 +123,13 @@
</property>
</widget>
</widget>
<widget class="QGroupBox" name="groupBox_2">
<widget class="QGroupBox" name="groupBox_3">
<property name="geometry">
<rect>
<x>10</x>
<y>380</y>
<width>191</width>
<height>111</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>11</pointsize>
</font>
</property>
<property name="title">
<string>2.确认需要抓取的网站</string>
</property>
<widget class="QPushButton" name="bWebSite">
<property name="geometry">
<rect>
<x>20</x>
<y>30</y>
<width>151</width>
<height>24</height>
</rect>
</property>
<property name="styleSheet">
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
</property>
<property name="text">
<string>打开网站列表Excel</string>
</property>
</widget>
<widget class="QLabel" name="label_2">
<property name="geometry">
<rect>
<x>20</x>
<y>70</y>
<width>151</width>
<height>16</height>
</rect>
</property>
<property name="font">
<font>
<family>楷体</family>
<pointsize>10</pointsize>
<bold>false</bold>
</font>
</property>
<property name="styleSheet">
<string notr="true">color: red;</string>
</property>
<property name="text">
<string>请在修改后保存并关闭</string>
</property>
</widget>
</widget>
<widget class="QGroupBox" name="groupBox_3">
<property name="geometry">
<rect>
<x>10</x>
<y>500</y>
<width>191</width>
<height>101</height>
<height>91</height>
</rect>
</property>
<property name="font">
@ -217,7 +160,7 @@
<property name="geometry">
<rect>
<x>20</x>
<y>70</y>
<y>60</y>
<width>151</width>
<height>16</height>
</rect>
@ -237,115 +180,11 @@
</property>
</widget>
</widget>
<widget class="QGroupBox" name="groupBox_4">
<property name="geometry">
<rect>
<x>210</x>
<y>280</y>
<width>381</width>
<height>321</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>11</pointsize>
</font>
</property>
<property name="title">
<string>4.开始执行巡查</string>
</property>
<widget class="QListView" name="vLog">
<property name="geometry">
<rect>
<x>10</x>
<y>80</y>
<width>361</width>
<height>231</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>9</pointsize>
</font>
</property>
<property name="styleSheet">
<string notr="true"/>
</property>
</widget>
<widget class="QPushButton" name="bStart">
<property name="geometry">
<rect>
<x>300</x>
<y>50</y>
<width>75</width>
<height>24</height>
</rect>
</property>
<property name="styleSheet">
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
</property>
<property name="text">
<string>开始巡查</string>
</property>
</widget>
<widget class="QLabel" name="label_6">
<property name="geometry">
<rect>
<x>10</x>
<y>30</y>
<width>251</width>
<height>41</height>
</rect>
</property>
<property name="font">
<font>
<family>楷体</family>
<pointsize>12</pointsize>
<bold>false</bold>
</font>
</property>
<property name="styleSheet">
<string notr="true">color: red;</string>
</property>
<property name="text">
<string>务必确认前3步已经完成!</string>
</property>
<property name="wordWrap">
<bool>true</bool>
</property>
</widget>
<widget class="QLineEdit" name="lSize">
<property name="geometry">
<rect>
<x>300</x>
<y>20</y>
<width>41</width>
<height>20</height>
</rect>
</property>
<property name="text">
<string>30</string>
</property>
</widget>
<widget class="QLabel" name="label_3">
<property name="geometry">
<rect>
<x>350</x>
<y>20</y>
<width>41</width>
<height>21</height>
</rect>
</property>
<property name="text">
<string>KB</string>
</property>
</widget>
</widget>
<widget class="QGroupBox" name="groupBox_5">
<property name="geometry">
<rect>
<x>10</x>
<y>610</y>
<y>650</y>
<width>581</width>
<height>71</height>
</rect>
@ -503,6 +342,159 @@
<number>6</number>
</property>
</widget>
<widget class="QGroupBox" name="groupBox_6">
<property name="geometry">
<rect>
<x>210</x>
<y>280</y>
<width>371</width>
<height>361</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>11</pointsize>
</font>
</property>
<property name="title">
<string>日志显示</string>
</property>
<widget class="QListView" name="vLog">
<property name="geometry">
<rect>
<x>10</x>
<y>20</y>
<width>351</width>
<height>321</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>9</pointsize>
</font>
</property>
<property name="styleSheet">
<string notr="true"/>
</property>
</widget>
</widget>
<widget class="QGroupBox" name="groupBox_2">
<property name="geometry">
<rect>
<x>10</x>
<y>490</y>
<width>191</width>
<height>151</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>11</pointsize>
</font>
</property>
<property name="title">
<string>2.确认需要抓取的网站</string>
</property>
<widget class="QPushButton" name="bWebSite">
<property name="geometry">
<rect>
<x>20</x>
<y>30</y>
<width>151</width>
<height>24</height>
</rect>
</property>
<property name="styleSheet">
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
</property>
<property name="text">
<string>打开网站列表Excel</string>
</property>
</widget>
<widget class="QLabel" name="label_2">
<property name="geometry">
<rect>
<x>20</x>
<y>60</y>
<width>151</width>
<height>16</height>
</rect>
</property>
<property name="font">
<font>
<family>楷体</family>
<pointsize>10</pointsize>
<bold>false</bold>
</font>
</property>
<property name="styleSheet">
<string notr="true">color: red;</string>
</property>
<property name="text">
<string>请在修改后保存并关闭</string>
</property>
</widget>
<widget class="QLabel" name="label_6">
<property name="geometry">
<rect>
<x>20</x>
<y>80</y>
<width>41</width>
<height>21</height>
</rect>
</property>
<property name="text">
<string>小于</string>
</property>
</widget>
<widget class="QLabel" name="label_3">
<property name="geometry">
<rect>
<x>100</x>
<y>80</y>
<width>121</width>
<height>21</height>
</rect>
</property>
<property name="text">
<string>KB-Chrome</string>
</property>
</widget>
<widget class="QLineEdit" name="lSize">
<property name="geometry">
<rect>
<x>50</x>
<y>80</y>
<width>41</width>
<height>20</height>
</rect>
</property>
<property name="text">
<string>20</string>
</property>
</widget>
<widget class="QPushButton" name="bStart">
<property name="geometry">
<rect>
<x>20</x>
<y>110</y>
<width>151</width>
<height>24</height>
</rect>
</property>
<property name="font">
<font>
<pointsize>12</pointsize>
</font>
</property>
<property name="styleSheet">
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
</property>
<property name="text">
<string>开始巡查</string>
</property>
</widget>
</widget>
</widget>
<widget class="QMenuBar" name="menubar">
<property name="geometry">

View File

@ -10,6 +10,30 @@ import os
chrome_driver_file = os.path.join(BASE_DIR, 'mycode', 'chromedriver.exe')
failed_sites_file = os.path.join(BASE_DIR, 'mycode/failed_sites.xlsx')
def fix_url_scheme(url, default_scheme='http'):
    """Return *url* with an explicit scheme, adding *default_scheme* if absent.

    Args:
        url: Address string, e.g. ``"example.com"`` or ``"https://example.com"``.
        default_scheme: Scheme to prepend when *url* carries no http/https
            scheme.

    Returns:
        The URL with surrounding whitespace removed. A URL that already starts
        with ``http://`` or ``https://`` (in any letter case) is returned
        otherwise unchanged; a protocol-relative URL (``//host/path``) gets
        ``default_scheme:`` prepended; an empty string is returned as-is.
    """
    url = url.strip()
    if not url:
        # Nothing to fix; do not fabricate a bare "http://".
        return url
    # URL schemes are case-insensitive, so compare in lower case.
    if url.lower().startswith(('http://', 'https://')):
        return url
    if url.startswith('//'):
        # Protocol-relative URL: only the scheme itself is missing.
        return f'{default_scheme}:{url}'
    return f'{default_scheme}://{url}'
def init_driver():
    """Create a Chrome WebDriver with a custom User-Agent and restricted content settings.

    Returns:
        The ``webdriver.Chrome`` instance started with ``chrome_driver_file``
        and the options assembled below.
    """
    chrome_options = Options()
    # Command-line switches, passed in exactly this order.
    # NOTE(review): the --disable-features / --disable-protocol-handler pair
    # appears twice; it is kept as-is here so the argument list is unchanged.
    switches = (
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36",
        # Disable default apps
        "--disable-default-apps",
        # Disable "tel" protocol handling
        "--disable-features=WebAppInstallForceList,ProtocolHandler,ProtocolHandlerMixedSchemes",
        "--disable-protocol-handler",
        # Disable "mailto" protocol handling
        "--disable-features=WebAppInstallForceList,ProtocolHandler,ProtocolHandlerMixedSchemes,PreloadMediaEngagementData",
        "--disable-protocol-handler",
    )
    for switch in switches:
        chrome_options.add_argument(switch)
    # Managed default content settings for images and notifications
    # (presumably 2 means "block" - confirm against Chrome's preference docs).
    content_prefs = {
        "profile.managed_default_content_settings.images": 2,
        'profile.managed_default_content_settings.notifications': 2,
    }
    chrome_options.add_experimental_option("prefs", content_prefs)
    return webdriver.Chrome(chrome_driver_file, options=chrome_options)
def open_website(url):
# Set up Chrome WebDriver with custom User-Agent
options = Options()
@ -34,7 +58,9 @@ def ignore_image_and_document_hrefs(href):
# Check if the href has a domain suffix of image or document file extensions
return file_extension.lower() in ('.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js')
def process_page(driver, url, visited_pages, start_domain, data):
def process_page(driver, url, visited_pages, start_domain, data, group, name):
if not url.startswith('http'):
return
# Add the URL to visited pages
visited_pages.add(url)
# Navigate to the URL
@ -46,8 +72,8 @@ def process_page(driver, url, visited_pages, start_domain, data):
content_text = content_element.text
# print(content_text)
# Add URL, Domain, and Content to the data list
data.append([start_domain, url, content_text])
data.append([group, name, start_domain, url, content_text])
# Find and process hyperlinks
hrefs = extract_hyperlinks(driver)
for href in hrefs:
@ -65,26 +91,26 @@ def process_page(driver, url, visited_pages, start_domain, data):
parsed_href = urlparse(href)
if parsed_href.netloc.replace("www.", "") != start_domain:
continue
# Open the href in the same tab and retrieve data
driver.get(href)
# print(href)
# Wait for the page to load
time.sleep(2)
# Extract the content from the hyperlink page
hyperlink_content_element = driver.find_element(By.XPATH, '//body')
hyperlink_content_text = hyperlink_content_element.text
# print(hyperlink_content_text)
# Add URL, Domain, and Content of the hyperlink to the data list
data.append([start_domain, href, hyperlink_content_text])
# # Open the href in the same tab and retrieve data
# driver.get(href)
# # print(href)
# # Wait for the page to load
# time.sleep(2)
# # Extract the content from the hyperlink page
# hyperlink_content_element = driver.find_element(By.XPATH, '//body')
# hyperlink_content_text = hyperlink_content_element.text
# # print(hyperlink_content_text)
# # Add URL, Domain, and Content of the hyperlink to the data list
# data.append([start_domain, href, hyperlink_content_text])
# Recursively process the page and follow hyperlinks
process_page(driver, href, visited_pages, start_domain, data)
process_page(driver, href, visited_pages, start_domain, data, group, name)
except Exception as e:
print(f"Error processing hyperlink: {href}")
print(f"Error message: {str(e)}")
continue
# Return to the original page
driver.get(url)
# driver.get(url)
def check_href(href, original_url, visited_pages):
parsed_href = urlparse(href)
@ -134,14 +160,18 @@ def add_cookies(driver, cookies):
driver.add_cookie({'name': name, 'value': value})
def chrom_main_from_list(sites):
driver = init_driver()
for ind, item in enumerate(sites):
group = item[0] # Replace with the actual column name for group
name = item[1]
url = item[2]
domain = urlparse(url).netloc.replace("www.", "")
if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:
continue
url = fix_url_scheme(url)
print(url)
# Open the website
driver = open_website(url)
driver.get(url)
# Retrieve cookies from previous session
cookies = get_cookies_from_previous_session(driver)
@ -154,14 +184,14 @@ def chrom_main_from_list(sites):
data = []
# Process the starting page and follow hyperlinks recursively
process_page(driver, url, visited_pages, domain, data)
process_page(driver, url, visited_pages, domain, data, group, name)
# Export data to a separate Excel file in the web_dir directory
output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
export_to_excel(data, output_filename)
# Close the WebDriver
driver.quit()
# Close the WebDriver
driver.quit()
def chrome_main():
# Read failed URLs from the list
@ -172,7 +202,7 @@ def chrome_main():
name = row['主办']
url = row['地址']
domain = urlparse(url).netloc.replace("www.", "")
# Open the website
driver = open_website(url)

View File

@ -73,11 +73,13 @@ def ana_wechat():
def ana_web():
output_data = []
index = 1
for file in os.listdir(web_dir):
full_path = os.path.join(web_dir, file)
if '$' in full_path:
continue
print(full_path)
if os.path.getsize(full_path) > 0:
df = pd.read_excel(os.path.join(web_dir, file))
df = pd.read_excel(os.path.join(full_path), engine='openpyxl')
for ind, row in df_s.iterrows():
mask = df['text'].str.contains(row['错误表述'], na=False)
result = df[mask]

View File

@ -21,6 +21,12 @@ BIAO_PATH = os.path.join(BASE_DIR, 'biao.xlsx')
PYTHON_PATH = os.path.join(BASE_DIR, 'runtime/python.exe')
TEMPLATE_PATH = os.path.join(BASE_DIR, 'summary/template.xlsx')
def fix_url_scheme(url, default_scheme='http'):
    """Return *url* with an explicit scheme, adding *default_scheme* if absent.

    Args:
        url: Address string, e.g. ``"example.com"`` or ``"https://example.com"``.
        default_scheme: Scheme to prepend when *url* carries no http/https
            scheme.

    Returns:
        The URL with surrounding whitespace removed. A URL that already starts
        with ``http://`` or ``https://`` (in any letter case) is returned
        otherwise unchanged; a protocol-relative URL (``//host/path``) gets
        ``default_scheme:`` prepended; an empty string is returned as-is.
    """
    url = url.strip()
    if not url:
        # Nothing to fix; do not fabricate a bare "http://".
        return url
    # URL schemes are case-insensitive, so compare in lower case.
    if url.lower().startswith(('http://', 'https://')):
        return url
    if url.startswith('//'):
        # Protocol-relative URL: only the scheme itself is missing.
        return f'{default_scheme}:{url}'
    return f'{default_scheme}://{url}'
class MyApplication(QApplication):
def __init__(self, argv):
super(MyApplication, self).__init__(argv)
@ -49,52 +55,17 @@ class MyThread(QThread):
if output:
self.update_signal.emit({'msg': output.strip()})
def run(self) -> None:
def ana(self):
month = datetime.datetime.now().month
self.update_signal.emit({'msg': '巡查任务开始...'})
self.update_signal.emit({'msg': '对比开始...'})
self.update_signal.emit({'msg': '正在组合微信公众号爬取内容...'})
make_simple_csv_from_db()
make_wechat_articles_full()
self.update_signal.emit({'msg': "公众号爬取内容组装完毕!"})
self.update_signal.emit({'msg': '开始进行网站爬取...'})
df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
ind = 0
for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = row['地址']
domain = urlparse(url).netloc.replace('www.', '')
# output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
# -u 代表不缓冲,直接输出
cmd = [PYTHON_PATH, '-u', '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=False)
self.processes.append(process)
self.running = True
getlog_thread = threading.Thread(target=self.capture_output, args=(process,), daemon=True)
getlog_thread.start()
for process in self.processes:
process.wait()
self.update_signal.emit({'msg': '网站爬取结束,校验中...'})
info_to_save = []
for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = row['地址']
domain = urlparse(url).netloc.replace("www.", "")
output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
if os.path.exists(output_filename):
file_size = os.path.getsize(output_filename)
if file_size < self.lsize * 1024: # Convert KB to bytes
info_to_save.append([group, name, url])
if info_to_save:
self.update_signal.emit({'msg': '存在未爬取站点,正在调用Chrome继续爬取...'})
chrom_main_from_list(info_to_save)
self.update_signal.emit({'msg': '网站爬取完毕!'})
self.update_signal.emit({'msg': '开始对比分析所有内容...'})
wechat_results = ana_wechat()
web_results = ana_web()
output_excel_path = os.path.join(BASE_DIR, f'summary/{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
output_excel_path = os.path.join(BASE_DIR, f'summary/{month}月-总院宣传阵地巡查结果汇总表.xlsx')
workbook = load_workbook(TEMPLATE_PATH)
# 选择要操作的工作表
wechat_sheet = workbook['公众号']
@ -105,7 +76,45 @@ class MyThread(QThread):
web_sheet.append(row)
workbook.save(output_excel_path)
workbook.close()
self.update_signal.emit({'msg': '巡查任务执行完毕, 请查看结果栏, 可手动校对', 'output_excel_path': output_excel_path})
self.update_signal.emit({'msg': '分析完毕, 请查看结果栏, 可手动校对', 'output_excel_path': output_excel_path})
def run(self) -> None:
# Thread entry point: crawl the sites listed in web_sites.xlsx with scrapy,
# re-crawl undersized results through Chrome, then run the comparison step.
# NOTE(review): indentation is not visible in this view of the method, so the
# exact loop/branch nesting must be confirmed against the real file.
self.update_signal.emit({'msg': '开始进行网站爬取...'})
# Site list: Sheet1 of web_sites.xlsx; the columns read below are 单位, 主办 and 地址.
df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
ind = 0
for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
# Strip the address cell and make sure it carries a scheme before parsing it.
url = fix_url_scheme(row['地址'].strip())
domain = urlparse(url).netloc.replace('www.', '')
# Output path handed to the spider through the "output" spider argument.
output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
# -u means unbuffered, so output is emitted immediately
cmd = [PYTHON_PATH, '-u', '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=False)
self.processes.append(process)
self.running = True
# A daemon thread runs self.capture_output on the process so its output reaches the log.
getlog_thread = threading.Thread(target=self.capture_output, args=(process,), daemon=True)
getlog_thread.start()
# Block until every launched crawler process has exited.
for process in self.processes:
process.wait()
self.update_signal.emit({'msg': '网站爬取结束,校验中...'})
# Collect [group, name, url] for sites whose output file exists but is below the size threshold.
info_to_save = []
for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = fix_url_scheme(row['地址'].strip())
domain = urlparse(url).netloc.replace("www.", "")
output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
if os.path.exists(output_filename):
file_size = os.path.getsize(output_filename)
if file_size < self.lsize * 1024: # Convert KB to bytes
info_to_save.append([group, name, url])
# Hand the undersized sites to the Chrome-based crawler.
if info_to_save:
self.update_signal.emit({'msg': '存在未爬取站点,正在调用Chrome继续爬取...'})
chrom_main_from_list(info_to_save)
self.update_signal.emit({'msg': '网站爬取完毕!'})
# Run the comparison/analysis step once crawling is done.
self.ana()
# NOTE(review): presumably starts the thread's event loop (class is declared as a QThread) - confirm.
self.exec()
def close(self):
@ -192,6 +201,7 @@ class MainWindow(QMainWindow):
self.worker_thread.update_signal.connect(self.update_log)
self.worker_thread.start()
def update_log(self, rdict):
self.log(f'{self.get_time()}-{rdict["msg"]}', False)
if 'output_excel_path' in rdict:

Binary file not shown.

View File

@ -23,14 +23,14 @@ class Ui_MainWindow(object):
def setupUi(self, MainWindow):
if not MainWindow.objectName():
MainWindow.setObjectName(u"MainWindow")
MainWindow.resize(600, 725)
MainWindow.resize(600, 763)
sizePolicy = QSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed)
sizePolicy.setHorizontalStretch(0)
sizePolicy.setVerticalStretch(0)
sizePolicy.setHeightForWidth(MainWindow.sizePolicy().hasHeightForWidth())
MainWindow.setSizePolicy(sizePolicy)
MainWindow.setMinimumSize(QSize(600, 725))
MainWindow.setMaximumSize(QSize(600, 725))
MainWindow.setMinimumSize(QSize(600, 763))
MainWindow.setMaximumSize(QSize(600, 763))
icon = QIcon()
icon.addFile(u"start.ico", QSize(), QIcon.Normal, QIcon.Off)
MainWindow.setWindowIcon(icon)
@ -50,7 +50,7 @@ class Ui_MainWindow(object):
self.groupBox.setFont(font)
self.bWechat = QPushButton(self.groupBox)
self.bWechat.setObjectName(u"bWechat")
self.bWechat.setGeometry(QRect(20, 30, 75, 24))
self.bWechat.setGeometry(QRect(20, 30, 151, 24))
self.bWechat.setFont(font)
self.bWechat.setAutoFillBackground(False)
self.bWechat.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
@ -63,22 +63,9 @@ class Ui_MainWindow(object):
font1.setBold(False)
self.label_5.setFont(font1)
self.label_5.setStyleSheet(u"color: red;")
self.groupBox_2 = QGroupBox(self.centralwidget)
self.groupBox_2.setObjectName(u"groupBox_2")
self.groupBox_2.setGeometry(QRect(10, 380, 191, 111))
self.groupBox_2.setFont(font)
self.bWebSite = QPushButton(self.groupBox_2)
self.bWebSite.setObjectName(u"bWebSite")
self.bWebSite.setGeometry(QRect(20, 30, 151, 24))
self.bWebSite.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
self.label_2 = QLabel(self.groupBox_2)
self.label_2.setObjectName(u"label_2")
self.label_2.setGeometry(QRect(20, 70, 151, 16))
self.label_2.setFont(font1)
self.label_2.setStyleSheet(u"color: red;")
self.groupBox_3 = QGroupBox(self.centralwidget)
self.groupBox_3.setObjectName(u"groupBox_3")
self.groupBox_3.setGeometry(QRect(10, 500, 191, 101))
self.groupBox_3.setGeometry(QRect(10, 380, 191, 91))
self.groupBox_3.setFont(font)
self.bBiao = QPushButton(self.groupBox_3)
self.bBiao.setObjectName(u"bBiao")
@ -86,54 +73,23 @@ class Ui_MainWindow(object):
self.bBiao.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
self.label_4 = QLabel(self.groupBox_3)
self.label_4.setObjectName(u"label_4")
self.label_4.setGeometry(QRect(20, 70, 151, 16))
self.label_4.setGeometry(QRect(20, 60, 151, 16))
self.label_4.setFont(font1)
self.label_4.setStyleSheet(u"color: red;")
self.groupBox_4 = QGroupBox(self.centralwidget)
self.groupBox_4.setObjectName(u"groupBox_4")
self.groupBox_4.setGeometry(QRect(210, 280, 381, 321))
self.groupBox_4.setFont(font)
self.vLog = QListView(self.groupBox_4)
self.vLog.setObjectName(u"vLog")
self.vLog.setGeometry(QRect(10, 80, 361, 231))
font2 = QFont()
font2.setPointSize(9)
self.vLog.setFont(font2)
self.vLog.setStyleSheet(u"")
self.bStart = QPushButton(self.groupBox_4)
self.bStart.setObjectName(u"bStart")
self.bStart.setGeometry(QRect(300, 50, 75, 24))
self.bStart.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
self.label_6 = QLabel(self.groupBox_4)
self.label_6.setObjectName(u"label_6")
self.label_6.setGeometry(QRect(10, 30, 251, 41))
font3 = QFont()
font3.setFamilies([u"\u6977\u4f53"])
font3.setPointSize(12)
font3.setBold(False)
self.label_6.setFont(font3)
self.label_6.setStyleSheet(u"color: red;")
self.label_6.setWordWrap(True)
self.lSize = QLineEdit(self.groupBox_4)
self.lSize.setObjectName(u"lSize")
self.lSize.setGeometry(QRect(300, 20, 41, 20))
self.label_3 = QLabel(self.groupBox_4)
self.label_3.setObjectName(u"label_3")
self.label_3.setGeometry(QRect(350, 20, 41, 21))
self.groupBox_5 = QGroupBox(self.centralwidget)
self.groupBox_5.setObjectName(u"groupBox_5")
self.groupBox_5.setGeometry(QRect(10, 610, 581, 71))
self.groupBox_5.setGeometry(QRect(10, 650, 581, 71))
self.groupBox_5.setFont(font)
self.label_7 = QLabel(self.groupBox_5)
self.label_7.setObjectName(u"label_7")
self.label_7.setGeometry(QRect(10, 20, 91, 16))
font4 = QFont()
font4.setPointSize(10)
self.label_7.setFont(font4)
font2 = QFont()
font2.setPointSize(10)
self.label_7.setFont(font2)
self.label_8 = QLabel(self.groupBox_5)
self.label_8.setObjectName(u"label_8")
self.label_8.setGeometry(QRect(10, 40, 91, 16))
self.label_8.setFont(font4)
self.label_8.setFont(font2)
self.line = QFrame(self.groupBox_5)
self.line.setObjectName(u"line")
self.line.setGeometry(QRect(10, 30, 561, 16))
@ -142,11 +98,13 @@ class Ui_MainWindow(object):
self.lRes1 = QLabel(self.groupBox_5)
self.lRes1.setObjectName(u"lRes1")
self.lRes1.setGeometry(QRect(110, 15, 381, 21))
self.lRes1.setFont(font2)
font3 = QFont()
font3.setPointSize(9)
self.lRes1.setFont(font3)
self.lRes2 = QLabel(self.groupBox_5)
self.lRes2.setObjectName(u"lRes2")
self.lRes2.setGeometry(QRect(110, 40, 381, 16))
self.lRes2.setFont(font2)
self.lRes2.setFont(font3)
self.bRes1 = QPushButton(self.groupBox_5)
self.bRes1.setObjectName(u"bRes1")
self.bRes1.setGeometry(QRect(520, 10, 51, 24))
@ -157,15 +115,53 @@ class Ui_MainWindow(object):
self.label_9 = QLabel(self.centralwidget)
self.label_9.setObjectName(u"label_9")
self.label_9.setGeometry(QRect(150, 0, 291, 31))
font5 = QFont()
font5.setFamilies([u"\u6977\u4f53"])
font5.setPointSize(12)
font5.setBold(False)
font5.setItalic(False)
self.label_9.setFont(font5)
font4 = QFont()
font4.setFamilies([u"\u6977\u4f53"])
font4.setPointSize(12)
font4.setBold(False)
font4.setItalic(False)
self.label_9.setFont(font4)
self.label_9.setStyleSheet(u"color:white;")
self.label_9.setAlignment(Qt.AlignRight|Qt.AlignTrailing|Qt.AlignVCenter)
self.label_9.setMargin(6)
self.groupBox_6 = QGroupBox(self.centralwidget)
self.groupBox_6.setObjectName(u"groupBox_6")
self.groupBox_6.setGeometry(QRect(210, 280, 371, 361))
self.groupBox_6.setFont(font)
self.vLog = QListView(self.groupBox_6)
self.vLog.setObjectName(u"vLog")
self.vLog.setGeometry(QRect(10, 20, 351, 321))
self.vLog.setFont(font3)
self.vLog.setStyleSheet(u"")
self.groupBox_2 = QGroupBox(self.centralwidget)
self.groupBox_2.setObjectName(u"groupBox_2")
self.groupBox_2.setGeometry(QRect(10, 490, 191, 151))
self.groupBox_2.setFont(font)
self.bWebSite = QPushButton(self.groupBox_2)
self.bWebSite.setObjectName(u"bWebSite")
self.bWebSite.setGeometry(QRect(20, 30, 151, 24))
self.bWebSite.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
self.label_2 = QLabel(self.groupBox_2)
self.label_2.setObjectName(u"label_2")
self.label_2.setGeometry(QRect(20, 60, 151, 16))
self.label_2.setFont(font1)
self.label_2.setStyleSheet(u"color: red;")
self.label_6 = QLabel(self.groupBox_2)
self.label_6.setObjectName(u"label_6")
self.label_6.setGeometry(QRect(20, 80, 41, 21))
self.label_3 = QLabel(self.groupBox_2)
self.label_3.setObjectName(u"label_3")
self.label_3.setGeometry(QRect(100, 80, 121, 21))
self.lSize = QLineEdit(self.groupBox_2)
self.lSize.setObjectName(u"lSize")
self.lSize.setGeometry(QRect(50, 80, 41, 20))
self.bStart = QPushButton(self.groupBox_2)
self.bStart.setObjectName(u"bStart")
self.bStart.setGeometry(QRect(20, 110, 151, 24))
font5 = QFont()
font5.setPointSize(12)
self.bStart.setFont(font5)
self.bStart.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
MainWindow.setCentralWidget(self.centralwidget)
self.menubar = QMenuBar(MainWindow)
self.menubar.setObjectName(u"menubar")
@ -186,17 +182,9 @@ class Ui_MainWindow(object):
self.groupBox.setTitle(QCoreApplication.translate("MainWindow", u"1.\u5fae\u4fe1\u516c\u4f17\u53f7\u4fe1\u606f\u6293\u53d6", None))
self.bWechat.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5de5\u5177", None))
self.label_5.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u786e\u4fdd\u6240\u6709\u516c\u4f17\u53f7\u6293\u53d6\u5b8c\u6bd5", None))
self.groupBox_2.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u9700\u8981\u6293\u53d6\u7684\u7f51\u7ad9", None))
self.bWebSite.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u7f51\u7ad9\u5217\u8868Excel", None))
self.label_2.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
self.groupBox_3.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u5206\u6790\u5bf9\u6bd4\u5e93", None))
self.bBiao.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5206\u6790\u6807\u51c6Excel", None))
self.label_4.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
self.groupBox_4.setTitle(QCoreApplication.translate("MainWindow", u"4.\u5f00\u59cb\u6267\u884c\u5de1\u67e5", None))
self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u5de1\u67e5", None))
self.label_6.setText(QCoreApplication.translate("MainWindow", u"\u52a1\u5fc5\u786e\u8ba4\u524d3\u6b65\u5df2\u7ecf\u5b8c\u6210!", None))
self.lSize.setText(QCoreApplication.translate("MainWindow", u"30", None))
self.label_3.setText(QCoreApplication.translate("MainWindow", u"KB", None))
self.groupBox_5.setTitle(QCoreApplication.translate("MainWindow", u"\u6700\u7ec8\u7ed3\u679c", None))
self.label_7.setText(QCoreApplication.translate("MainWindow", u"\u5206\u6790\u7ed3\u679cExcel:", None))
self.label_8.setText(QCoreApplication.translate("MainWindow", u"\u5206\u6790\u62a5\u544aWord:", None))
@ -205,5 +193,13 @@ class Ui_MainWindow(object):
self.bRes1.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
self.bRes2.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
self.label_9.setText(QCoreApplication.translate("MainWindow", u"\u4e2d\u56fd\u5efa\u6750\u603b\u9662\u5ba3\u4f20\u5de5\u4f5c\u4fe1\u606f\u5316\u7ba1\u7406\u5e73\u53f0", None))
self.groupBox_6.setTitle(QCoreApplication.translate("MainWindow", u"\u65e5\u5fd7\u663e\u793a", None))
self.groupBox_2.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u9700\u8981\u6293\u53d6\u7684\u7f51\u7ad9", None))
self.bWebSite.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u7f51\u7ad9\u5217\u8868Excel", None))
self.label_2.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
self.label_6.setText(QCoreApplication.translate("MainWindow", u"\u5c0f\u4e8e", None))
self.label_3.setText(QCoreApplication.translate("MainWindow", u"KB-Chrome", None))
self.lSize.setText(QCoreApplication.translate("MainWindow", u"20", None))
self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u5de1\u67e5", None))
# retranslateUi

29
web3.py
View File

@ -26,6 +26,12 @@ def sigint_handler(signal, frame):
print('子进程已关闭,程序退出。')
sys.exit(0)
def fix_url_scheme(url, default_scheme='http'):
    """Return *url* with an explicit scheme, adding *default_scheme* if absent.

    Args:
        url: Address string, e.g. ``"example.com"`` or ``"https://example.com"``.
        default_scheme: Scheme to prepend when *url* carries no http/https
            scheme.

    Returns:
        The URL with surrounding whitespace removed. A URL that already starts
        with ``http://`` or ``https://`` (in any letter case) is returned
        otherwise unchanged; a protocol-relative URL (``//host/path``) gets
        ``default_scheme:`` prepended; an empty string is returned as-is.
    """
    url = url.strip()
    if not url:
        # Nothing to fix; do not fabricate a bare "http://".
        return url
    # URL schemes are case-insensitive, so compare in lower case.
    if url.lower().startswith(('http://', 'https://')):
        return url
    if url.startswith('//'):
        # Protocol-relative URL: only the scheme itself is missing.
        return f'{default_scheme}:{url}'
    return f'{default_scheme}://{url}'
if __name__ == '__main__':
print('巡查任务开始。。。')
now = datetime.datetime.now()
@ -46,15 +52,17 @@ if __name__ == '__main__':
ind = 0
for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = row['地址']
domain = urlparse(url).netloc.replace('www.', '')
# output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
# cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
process = subprocess.Popen(cmd)
processes.append(process)
group = row['单位']
name = row['主办']
url = fix_url_scheme(row['地址'].strip())
domain = urlparse(url).netloc.replace('www.', '')
if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']: # 这几个网站直接跳过
continue
output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
# cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
process = subprocess.Popen(cmd)
processes.append(process)
# Wait for all processes to finish
for process in processes:
@ -71,7 +79,8 @@ if __name__ == '__main__':
output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
if os.path.exists(output_filename):
file_size = os.path.getsize(output_filename)
if file_size < 30 * 1024: # Convert KB to bytes
print(file_size/1024)
if file_size < 20 * 1024: # Convert KB to bytes
info_to_save.append([group, name, url])
if info_to_save:

View File

@ -60,7 +60,6 @@ class ZcspiderPipeline:
# raise
line = [item['group'], item['name'], item['domain'], item['url'], item['text']]
self.ws.append(line)
self.wb.save(self.file_name)
return item
# 结束,关闭连接
@ -69,4 +68,5 @@ class ZcspiderPipeline:
# self.cur.close()
# # 关闭连接
# self.conn.close()
self.wb.save(self.file_name)
self.wb.close()

View File

@ -96,13 +96,13 @@ DEFAULT_REQUEST_HEADERS = {
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = 'gb18030'
LOG_LEVEL = 'ERROR'
LOG_LEVEL = 'WARNING'
DOWNLOAD_TIMEOUT = 10
ITEM_PIPELINES = {
# 'zcspider.pipelines.ZcspiderPipeline': 300,
'zcspider.pipelines.ZcspiderPipeline': 300,
}
FEED_EXPORTERS = {
'xlsx': 'scrapy_xlsx.XlsxItemExporter',
# 'xlsx': 'scrapy_xlsx.XlsxItemExporter',
}

View File

@ -82,40 +82,43 @@ class BaseSpider(scrapy.Spider):
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
def parse(self, response):
self.visited_urls.add(response.url)
if self.is_file_res(response):
return
h = html2text.HTML2Text()
h.ignore_links = True # 忽略所有链接
# 提取纯文本内容
# try:
text = h.handle(response.text)
# except:
# text = h.handle(response.body.decode(encoding='gb18030'))
if response.status < 400:
yield {
'group': self.group,
'name': self.name,
'domain': self.domain,
'url': response.url,
'text': text,
}
try:
if response.status >= 500:
return
self.visited_urls.add(response.url)
if self.is_file_res(response):
return
h = html2text.HTML2Text()
h.ignore_links = True # 忽略所有链接
# 提取纯文本内容
# try:
text = h.handle(response.text)
# except:
# text = h.handle(response.body.decode(encoding='gb18030'))
if response.status < 400:
yield {
'group': self.group,
'name': self.name,
'domain': self.domain,
'url': response.url,
'text': text,
}
links = re.findall(r'href=["\']?([^"\'>]+)', response.text)
for link in links:
full_link = response.urljoin(link)
if not full_link.startswith('http'):
continue
if full_link not in self.visited_urls and (self.is_file_url(full_link) is False):
if urlparse(full_link).netloc.replace('www.', '') == self.domain:
# try:
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
# except ValueError:
# import traceback
# print(traceback.format_exc())
# print(full_link)
except scrapy.exceptions.TimeoutError:
print(f'{response.url}-请求超时取消')
links = re.findall(r'href=["\']?([^"\'>]+)', text)
for link in links:
full_link = response.urljoin(link)
if not full_link.startswith('http'):
continue
if full_link not in self.visited_urls and (self.is_file_url(full_link) is False):
if urlparse(full_link).netloc.replace('www.', '') == self.domain:
# try:
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
# except ValueError:
# import traceback
# print(traceback.format_exc())
# print(full_link)
def closed(self, reason):
# This method will be called when the Spider is about to close
print(f'爬取完成: {self.name}_{self.domain}')

View File

@ -1,44 +1,14 @@
::[Bat To Exe Converter]
::
::YAwzoRdxOk+EWAjk
::fBw5plQjdCuDJE6L5kkgJBpXSTiYP3iqD7EZ+tTo++uVtgMUV+1f
::YAwzuBVtJxjWCl3EqQJgSA==
::ZR4luwNxJguZRRnk
::Yhs/ulQjdF+5
::cxAkpRVqdFKZSzk=
::cBs/ulQjdF+5
::ZR41oxFsdFKZSDk=
::eBoioBt6dFKZSDk=
::cRo6pxp7LAbNWATEpCI=
::egkzugNsPRvcWATEpCI=
::dAsiuh18IRvcCxnZtBJQ
::cRYluBh/LU+EWAnk
::YxY4rhs+aU+JeA==
::cxY6rQJ7JhzQF1fEqQJQ
::ZQ05rAF9IBncCkqN+0xwdVs0
::ZQ05rAF9IAHYFVzEqQJQ
::eg0/rx1wNQPfEVWB+kM9LVsJDGQ=
::fBEirQZwNQPfEVWB+kM9LVsJDGQ=
::cRolqwZ3JBvQF1fEqQJQ
::dhA7uBVwLU+EWDk=
::YQ03rBFzNR3SWATElA==
::dhAmsQZ3MwfNWATElA==
::ZQ0/vhVqMQ3MEVWAtB9wSA==
::Zg8zqx1/OA3MEVWAtB9wSA==
::dhA7pRFwIByZRRnk
::Zh4grVQjdCuDJE6L5kkgJBpXSTiYP3iqD7EZ+tTo++uVtgMYTOdf
::YB416Ek+ZG8=
::
::
::978f952a14a936cc963da21a135fa983
@echo off
if "%1" == "h" goto begin
mshta vbscript:createobject("wscript.shell").run("""%~nx0"" h",0)(window.close)&&exit
:begin
setlocal
REM 设置 Python 可执行文件路径
set PYTHON_EXECUTABLE=.\runtime\python.exe
REM 设置要运行的 Python 脚本
set PYTHON_SCRIPT=start.py
REM 运行 Python 脚本
%PYTHON_EXECUTABLE% %PYTHON_SCRIPT%

Binary file not shown.