diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..5c80254 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "[python]": { + "editor.defaultFormatter": "ms-python.autopep8" + }, + "python.formatting.provider": "none" +} diff --git a/main.ui b/main.ui index 48e4fa2..42213ed 100644 --- a/main.ui +++ b/main.ui @@ -7,7 +7,7 @@ 0 0 600 - 725 + 763 @@ -19,13 +19,13 @@ 600 - 725 + 763 600 - 725 + 763 @@ -80,7 +80,7 @@ 20 30 - 75 + 151 24 @@ -123,70 +123,13 @@ - + 10 380 191 - 111 - - - - - 11 - - - - 2.确认需要抓取的网站 - - - - - 20 - 30 - 151 - 24 - - - - background-color:#409EFF; color: white; border-radius: 2px - - - 打开网站列表Excel - - - - - - 20 - 70 - 151 - 16 - - - - - 楷体 - 10 - false - - - - color: red; - - - 请在修改后保存并关闭 - - - - - - - 10 - 500 - 191 - 101 + 91 @@ -217,7 +160,7 @@ 20 - 70 + 60 151 16 @@ -237,115 +180,11 @@ - - - - 210 - 280 - 381 - 321 - - - - - 11 - - - - 4.开始执行巡查 - - - - - 10 - 80 - 361 - 231 - - - - - 9 - - - - - - - - - - 300 - 50 - 75 - 24 - - - - background-color:#409EFF; color: white; border-radius: 2px - - - 开始巡查 - - - - - - 10 - 30 - 251 - 41 - - - - - 楷体 - 12 - false - - - - color: red; - - - 务必确认前3步已经完成! - - - true - - - - - - 300 - 20 - 41 - 20 - - - - 30 - - - - - - 350 - 20 - 41 - 21 - - - - KB - - - 10 - 610 + 650 581 71 @@ -503,6 +342,159 @@ 6 + + + + 210 + 280 + 371 + 361 + + + + + 11 + + + + 日志显示 + + + + + 10 + 20 + 351 + 321 + + + + + 9 + + + + + + + + + + + 10 + 490 + 191 + 151 + + + + + 11 + + + + 2.确认需要抓取的网站 + + + + + 20 + 30 + 151 + 24 + + + + background-color:#409EFF; color: white; border-radius: 2px + + + 打开网站列表Excel + + + + + + 20 + 60 + 151 + 16 + + + + + 楷体 + 10 + false + + + + color: red; + + + 请在修改后保存并关闭 + + + + + + 20 + 80 + 41 + 21 + + + + 小于 + + + + + + 100 + 80 + 121 + 21 + + + + KB-Chrome + + + + + + 50 + 80 + 41 + 20 + + + + 20 + + + + + + 20 + 110 + 151 + 24 + + + + + 12 + + + + background-color:#409EFF; color: white; border-radius: 2px + + + 开始巡查 + + + diff --git a/mycode/crawl_chrome.py b/mycode/crawl_chrome.py index 7d9c7b0..d5a6603 100644 --- a/mycode/crawl_chrome.py +++ b/mycode/crawl_chrome.py @@ -10,6 +10,30 @@ import os chrome_driver_file = os.path.join(BASE_DIR, 'mycode', 'chromedriver.exe') failed_sites_file = os.path.join(BASE_DIR, 'mycode/failed_sites.xlsx') +def fix_url_scheme(url, default_scheme='http'): + # 检查URL是否包含方案 + if not url.startswith('http://') and not url.startswith('https://'): + # 如果没有方案,添加默认方案 + url = f'{default_scheme}://{url}' + return url + +def init_driver(): + # Set up Chrome WebDriver with custom User-Agent + options = Options() + options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36") + prefs = {"profile.managed_default_content_settings.images": 2, 'profile.managed_default_content_settings.notifications':2} + options.add_argument("--disable-default-apps") # 禁用默认应用程序 + # 禁用 "tel" 协议处理 + options.add_argument("--disable-features=WebAppInstallForceList,ProtocolHandler,ProtocolHandlerMixedSchemes") + options.add_argument("--disable-protocol-handler") + + # 禁用 "mailto" 协议处理 + options.add_argument("--disable-features=WebAppInstallForceList,ProtocolHandler,ProtocolHandlerMixedSchemes,PreloadMediaEngagementData") + options.add_argument("--disable-protocol-handler") + options.add_experimental_option("prefs", prefs) + driver = webdriver.Chrome(chrome_driver_file, options=options) + return driver + def open_website(url): # Set up Chrome WebDriver with custom User-Agent options = Options() @@ -34,7 +58,9 @@ def ignore_image_and_document_hrefs(href): # Check if the href has a domain suffix of image or document file extensions return file_extension.lower() in ('.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js') -def process_page(driver, url, visited_pages, start_domain, data): +def process_page(driver, url, visited_pages, start_domain, data, group, name): + if not url.startswith('http'): + return # Add the URL to visited pages visited_pages.add(url) # Navigate to the URL @@ -46,8 +72,8 @@ def process_page(driver, url, visited_pages, start_domain, data): content_text = content_element.text # print(content_text) # Add URL, Domain, and Content to the data list - data.append([start_domain, url, content_text]) - + data.append([group, name, start_domain, url, content_text]) + # Find and process hyperlinks hrefs = extract_hyperlinks(driver) for href in hrefs: @@ -65,26 +91,26 @@ def process_page(driver, url, visited_pages, start_domain, data): parsed_href = urlparse(href) if parsed_href.netloc.replace("www.", "") != start_domain: continue - # Open the href in the same tab and retrieve data - driver.get(href) - # print(href) - # Wait for the page to load - time.sleep(2) - # Extract the content from the hyperlink page - hyperlink_content_element = driver.find_element(By.XPATH, '//body') - hyperlink_content_text = hyperlink_content_element.text - # print(hyperlink_content_text) - # Add URL, Domain, and Content of the hyperlink to the data list - data.append([start_domain, href, hyperlink_content_text]) + # # Open the href in the same tab and retrieve data + # driver.get(href) + # # print(href) + # # Wait for the page to load + # time.sleep(2) + # # Extract the content from the hyperlink page + # hyperlink_content_element = driver.find_element(By.XPATH, '//body') + # hyperlink_content_text = hyperlink_content_element.text + # # print(hyperlink_content_text) + # # Add URL, Domain, and Content of the hyperlink to the data list + # data.append([start_domain, href, hyperlink_content_text]) # Recursively process the page and follow hyperlinks - process_page(driver, href, visited_pages, start_domain, data) + process_page(driver, href, visited_pages, start_domain, data, group, name) except Exception as e: print(f"Error processing hyperlink: {href}") print(f"Error message: {str(e)}") continue # Return to the original page - driver.get(url) + # driver.get(url) def check_href(href, original_url, visited_pages): parsed_href = urlparse(href) @@ -134,14 +160,18 @@ def add_cookies(driver, cookies): driver.add_cookie({'name': name, 'value': value}) def chrom_main_from_list(sites): + driver = init_driver() for ind, item in enumerate(sites): group = item[0] # Replace with the actual column name for group name = item[1] url = item[2] domain = urlparse(url).netloc.replace("www.", "") - + if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']: + continue + url = fix_url_scheme(url) + print(url) # Open the website - driver = open_website(url) + driver.get(url) # Retrieve cookies from previous session cookies = get_cookies_from_previous_session(driver) @@ -154,14 +184,14 @@ def chrom_main_from_list(sites): data = [] # Process the starting page and follow hyperlinks recursively - process_page(driver, url, visited_pages, domain, data) + process_page(driver, url, visited_pages, domain, data, group, name) # Export data to a separate Excel file in the web_dir directory output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx') export_to_excel(data, output_filename) - # Close the WebDriver - driver.quit() + # Close the WebDriver + driver.quit() def chrome_main(): # Read failed URLs from the list @@ -172,7 +202,7 @@ def chrome_main(): name = row['主办'] url = row['地址'] domain = urlparse(url).netloc.replace("www.", "") - + # Open the website driver = open_website(url) diff --git a/mycode/main.py b/mycode/main.py index da6b344..7f83388 100644 --- a/mycode/main.py +++ b/mycode/main.py @@ -73,11 +73,13 @@ def ana_wechat(): def ana_web(): output_data = [] index = 1 - for file in os.listdir(web_dir): full_path = os.path.join(web_dir, file) + if '$' in full_path: + continue + print(full_path) if os.path.getsize(full_path) > 0: - df = pd.read_excel(os.path.join(web_dir, file)) + df = pd.read_excel(os.path.join(full_path), engine='openpyxl') for ind, row in df_s.iterrows(): mask = df['text'].str.contains(row['错误表述'], na=False) result = df[mask] diff --git a/start.py b/start.py index 3d754b2..8c5864e 100644 --- a/start.py +++ b/start.py @@ -21,6 +21,12 @@ BIAO_PATH = os.path.join(BASE_DIR, 'biao.xlsx') PYTHON_PATH = os.path.join(BASE_DIR, 'runtime/python.exe') TEMPLATE_PATH = os.path.join(BASE_DIR, 'summary/template.xlsx') +def fix_url_scheme(url, default_scheme='http'): + # 检查URL是否包含方案 + if not url.startswith('http://') and not url.startswith('https://'): + # 如果没有方案,添加默认方案 + url = f'{default_scheme}://{url}' + return url class MyApplication(QApplication): def __init__(self, argv): super(MyApplication, self).__init__(argv) @@ -49,52 +55,17 @@ class MyThread(QThread): if output: self.update_signal.emit({'msg': output.strip()}) - def run(self) -> None: + def ana(self): month = datetime.datetime.now().month - self.update_signal.emit({'msg': '巡查任务开始...'}) + self.update_signal.emit({'msg': '对比开始...'}) self.update_signal.emit({'msg': '正在组合微信公众号爬取内容...'}) make_simple_csv_from_db() make_wechat_articles_full() self.update_signal.emit({'msg': "公众号爬取内容组装完毕!"}) - self.update_signal.emit({'msg': '开始进行网站爬取...'}) - df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1') - ind = 0 - for ind, row in df.iterrows(): - group = row['单位'] - name = row['主办'] - url = row['地址'] - domain = urlparse(url).netloc.replace('www.', '') - # output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx') - # -u 代表不缓冲,直接输出 - cmd = [PYTHON_PATH, '-u', '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx'] - process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=False) - self.processes.append(process) - self.running = True - getlog_thread = threading.Thread(target=self.capture_output, args=(process,), daemon=True) - getlog_thread.start() - - for process in self.processes: - process.wait() - self.update_signal.emit({'msg': '网站爬取结束,校验中...'}) - info_to_save = [] - for ind, row in df.iterrows(): - group = row['单位'] - name = row['主办'] - url = row['地址'] - domain = urlparse(url).netloc.replace("www.", "") - output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx') - if os.path.exists(output_filename): - file_size = os.path.getsize(output_filename) - if file_size < self.lsize * 1024: # Convert KB to bytes - info_to_save.append([group, name, url]) - if info_to_save: - self.update_signal.emit({'msg': '存在未爬取站点,正在调用Chrome继续爬取...'}) - chrom_main_from_list(info_to_save) - self.update_signal.emit({'msg': '网站爬取完毕!'}) self.update_signal.emit({'msg': '开始对比分析所有内容...'}) wechat_results = ana_wechat() web_results = ana_web() - output_excel_path = os.path.join(BASE_DIR, f'summary/{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx') + output_excel_path = os.path.join(BASE_DIR, f'summary/{month}月-总院宣传阵地巡查结果汇总表.xlsx') workbook = load_workbook(TEMPLATE_PATH) # 选择要操作的工作表 wechat_sheet = workbook['公众号'] @@ -105,7 +76,45 @@ class MyThread(QThread): web_sheet.append(row) workbook.save(output_excel_path) workbook.close() - self.update_signal.emit({'msg': '巡查任务执行完毕, 请查看结果栏, 可手动校对', 'output_excel_path': output_excel_path}) + self.update_signal.emit({'msg': '分析完毕, 请查看结果栏, 可手动校对', 'output_excel_path': output_excel_path}) + + def run(self) -> None: + self.update_signal.emit({'msg': '开始进行网站爬取...'}) + df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1') + ind = 0 + for ind, row in df.iterrows(): + group = row['单位'] + name = row['主办'] + url = fix_url_scheme(row['地址'].strip()) + domain = urlparse(url).netloc.replace('www.', '') + output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx') + # -u 代表不缓冲,直接输出 + cmd = [PYTHON_PATH, '-u', '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}'] + process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=False) + self.processes.append(process) + self.running = True + getlog_thread = threading.Thread(target=self.capture_output, args=(process,), daemon=True) + getlog_thread.start() + + for process in self.processes: + process.wait() + self.update_signal.emit({'msg': '网站爬取结束,校验中...'}) + info_to_save = [] + for ind, row in df.iterrows(): + group = row['单位'] + name = row['主办'] + url = fix_url_scheme(row['地址'].strip()) + domain = urlparse(url).netloc.replace("www.", "") + output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx') + if os.path.exists(output_filename): + file_size = os.path.getsize(output_filename) + if file_size < self.lsize * 1024: # Convert KB to bytes + info_to_save.append([group, name, url]) + if info_to_save: + self.update_signal.emit({'msg': '存在未爬取站点,正在调用Chrome继续爬取...'}) + chrom_main_from_list(info_to_save) + self.update_signal.emit({'msg': '网站爬取完毕!'}) + self.ana() self.exec() def close(self): @@ -192,6 +201,7 @@ class MainWindow(QMainWindow): self.worker_thread.update_signal.connect(self.update_log) self.worker_thread.start() + def update_log(self, rdict): self.log(f'{self.get_time()}-{rdict["msg"]}', False) if 'output_excel_path' in rdict: diff --git a/summary/10月-总院宣传阵地巡查简报.docx b/summary/10月-总院宣传阵地巡查简报.docx new file mode 100644 index 0000000..167b6f0 Binary files /dev/null and b/summary/10月-总院宣传阵地巡查简报.docx differ diff --git a/ui_mainwindow.py b/ui_mainwindow.py index e52b58d..f849192 100644 --- a/ui_mainwindow.py +++ b/ui_mainwindow.py @@ -23,14 +23,14 @@ class Ui_MainWindow(object): def setupUi(self, MainWindow): if not MainWindow.objectName(): MainWindow.setObjectName(u"MainWindow") - MainWindow.resize(600, 725) + MainWindow.resize(600, 763) sizePolicy = QSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) sizePolicy.setHorizontalStretch(0) sizePolicy.setVerticalStretch(0) sizePolicy.setHeightForWidth(MainWindow.sizePolicy().hasHeightForWidth()) MainWindow.setSizePolicy(sizePolicy) - MainWindow.setMinimumSize(QSize(600, 725)) - MainWindow.setMaximumSize(QSize(600, 725)) + MainWindow.setMinimumSize(QSize(600, 763)) + MainWindow.setMaximumSize(QSize(600, 763)) icon = QIcon() icon.addFile(u"start.ico", QSize(), QIcon.Normal, QIcon.Off) MainWindow.setWindowIcon(icon) @@ -50,7 +50,7 @@ class Ui_MainWindow(object): self.groupBox.setFont(font) self.bWechat = QPushButton(self.groupBox) self.bWechat.setObjectName(u"bWechat") - self.bWechat.setGeometry(QRect(20, 30, 75, 24)) + self.bWechat.setGeometry(QRect(20, 30, 151, 24)) self.bWechat.setFont(font) self.bWechat.setAutoFillBackground(False) self.bWechat.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px") @@ -63,22 +63,9 @@ class Ui_MainWindow(object): font1.setBold(False) self.label_5.setFont(font1) self.label_5.setStyleSheet(u"color: red;") - self.groupBox_2 = QGroupBox(self.centralwidget) - self.groupBox_2.setObjectName(u"groupBox_2") - self.groupBox_2.setGeometry(QRect(10, 380, 191, 111)) - self.groupBox_2.setFont(font) - self.bWebSite = QPushButton(self.groupBox_2) - self.bWebSite.setObjectName(u"bWebSite") - self.bWebSite.setGeometry(QRect(20, 30, 151, 24)) - self.bWebSite.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px") - self.label_2 = QLabel(self.groupBox_2) - self.label_2.setObjectName(u"label_2") - self.label_2.setGeometry(QRect(20, 70, 151, 16)) - self.label_2.setFont(font1) - self.label_2.setStyleSheet(u"color: red;") self.groupBox_3 = QGroupBox(self.centralwidget) self.groupBox_3.setObjectName(u"groupBox_3") - self.groupBox_3.setGeometry(QRect(10, 500, 191, 101)) + self.groupBox_3.setGeometry(QRect(10, 380, 191, 91)) self.groupBox_3.setFont(font) self.bBiao = QPushButton(self.groupBox_3) self.bBiao.setObjectName(u"bBiao") @@ -86,54 +73,23 @@ class Ui_MainWindow(object): self.bBiao.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px") self.label_4 = QLabel(self.groupBox_3) self.label_4.setObjectName(u"label_4") - self.label_4.setGeometry(QRect(20, 70, 151, 16)) + self.label_4.setGeometry(QRect(20, 60, 151, 16)) self.label_4.setFont(font1) self.label_4.setStyleSheet(u"color: red;") - self.groupBox_4 = QGroupBox(self.centralwidget) - self.groupBox_4.setObjectName(u"groupBox_4") - self.groupBox_4.setGeometry(QRect(210, 280, 381, 321)) - self.groupBox_4.setFont(font) - self.vLog = QListView(self.groupBox_4) - self.vLog.setObjectName(u"vLog") - self.vLog.setGeometry(QRect(10, 80, 361, 231)) - font2 = QFont() - font2.setPointSize(9) - self.vLog.setFont(font2) - self.vLog.setStyleSheet(u"") - self.bStart = QPushButton(self.groupBox_4) - self.bStart.setObjectName(u"bStart") - self.bStart.setGeometry(QRect(300, 50, 75, 24)) - self.bStart.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px") - self.label_6 = QLabel(self.groupBox_4) - self.label_6.setObjectName(u"label_6") - self.label_6.setGeometry(QRect(10, 30, 251, 41)) - font3 = QFont() - font3.setFamilies([u"\u6977\u4f53"]) - font3.setPointSize(12) - font3.setBold(False) - self.label_6.setFont(font3) - self.label_6.setStyleSheet(u"color: red;") - self.label_6.setWordWrap(True) - self.lSize = QLineEdit(self.groupBox_4) - self.lSize.setObjectName(u"lSize") - self.lSize.setGeometry(QRect(300, 20, 41, 20)) - self.label_3 = QLabel(self.groupBox_4) - self.label_3.setObjectName(u"label_3") - self.label_3.setGeometry(QRect(350, 20, 41, 21)) self.groupBox_5 = QGroupBox(self.centralwidget) self.groupBox_5.setObjectName(u"groupBox_5") - self.groupBox_5.setGeometry(QRect(10, 610, 581, 71)) + self.groupBox_5.setGeometry(QRect(10, 650, 581, 71)) self.groupBox_5.setFont(font) self.label_7 = QLabel(self.groupBox_5) self.label_7.setObjectName(u"label_7") self.label_7.setGeometry(QRect(10, 20, 91, 16)) - font4 = QFont() - font4.setPointSize(10) - self.label_7.setFont(font4) + font2 = QFont() + font2.setPointSize(10) + self.label_7.setFont(font2) self.label_8 = QLabel(self.groupBox_5) self.label_8.setObjectName(u"label_8") self.label_8.setGeometry(QRect(10, 40, 91, 16)) - self.label_8.setFont(font4) + self.label_8.setFont(font2) self.line = QFrame(self.groupBox_5) self.line.setObjectName(u"line") self.line.setGeometry(QRect(10, 30, 561, 16)) @@ -142,11 +98,13 @@ class Ui_MainWindow(object): self.lRes1 = QLabel(self.groupBox_5) self.lRes1.setObjectName(u"lRes1") self.lRes1.setGeometry(QRect(110, 15, 381, 21)) - self.lRes1.setFont(font2) + font3 = QFont() + font3.setPointSize(9) + self.lRes1.setFont(font3) self.lRes2 = QLabel(self.groupBox_5) self.lRes2.setObjectName(u"lRes2") self.lRes2.setGeometry(QRect(110, 40, 381, 16)) - self.lRes2.setFont(font2) + self.lRes2.setFont(font3) self.bRes1 = QPushButton(self.groupBox_5) self.bRes1.setObjectName(u"bRes1") self.bRes1.setGeometry(QRect(520, 10, 51, 24)) @@ -157,15 +115,53 @@ class Ui_MainWindow(object): self.label_9 = QLabel(self.centralwidget) self.label_9.setObjectName(u"label_9") self.label_9.setGeometry(QRect(150, 0, 291, 31)) - font5 = QFont() - font5.setFamilies([u"\u6977\u4f53"]) - font5.setPointSize(12) - font5.setBold(False) - font5.setItalic(False) - self.label_9.setFont(font5) + font4 = QFont() + font4.setFamilies([u"\u6977\u4f53"]) + font4.setPointSize(12) + font4.setBold(False) + font4.setItalic(False) + self.label_9.setFont(font4) self.label_9.setStyleSheet(u"color:white;") self.label_9.setAlignment(Qt.AlignRight|Qt.AlignTrailing|Qt.AlignVCenter) self.label_9.setMargin(6) + self.groupBox_6 = QGroupBox(self.centralwidget) + self.groupBox_6.setObjectName(u"groupBox_6") + self.groupBox_6.setGeometry(QRect(210, 280, 371, 361)) + self.groupBox_6.setFont(font) + self.vLog = QListView(self.groupBox_6) + self.vLog.setObjectName(u"vLog") + self.vLog.setGeometry(QRect(10, 20, 351, 321)) + self.vLog.setFont(font3) + self.vLog.setStyleSheet(u"") + self.groupBox_2 = QGroupBox(self.centralwidget) + self.groupBox_2.setObjectName(u"groupBox_2") + self.groupBox_2.setGeometry(QRect(10, 490, 191, 151)) + self.groupBox_2.setFont(font) + self.bWebSite = QPushButton(self.groupBox_2) + self.bWebSite.setObjectName(u"bWebSite") + self.bWebSite.setGeometry(QRect(20, 30, 151, 24)) + self.bWebSite.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px") + self.label_2 = QLabel(self.groupBox_2) + self.label_2.setObjectName(u"label_2") + self.label_2.setGeometry(QRect(20, 60, 151, 16)) + self.label_2.setFont(font1) + self.label_2.setStyleSheet(u"color: red;") + self.label_6 = QLabel(self.groupBox_2) + self.label_6.setObjectName(u"label_6") + self.label_6.setGeometry(QRect(20, 80, 41, 21)) + self.label_3 = QLabel(self.groupBox_2) + self.label_3.setObjectName(u"label_3") + self.label_3.setGeometry(QRect(100, 80, 121, 21)) + self.lSize = QLineEdit(self.groupBox_2) + self.lSize.setObjectName(u"lSize") + self.lSize.setGeometry(QRect(50, 80, 41, 20)) + self.bStart = QPushButton(self.groupBox_2) + self.bStart.setObjectName(u"bStart") + self.bStart.setGeometry(QRect(20, 110, 151, 24)) + font5 = QFont() + font5.setPointSize(12) + self.bStart.setFont(font5) + self.bStart.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px") MainWindow.setCentralWidget(self.centralwidget) self.menubar = QMenuBar(MainWindow) self.menubar.setObjectName(u"menubar") @@ -186,17 +182,9 @@ class Ui_MainWindow(object): self.groupBox.setTitle(QCoreApplication.translate("MainWindow", u"1.\u5fae\u4fe1\u516c\u4f17\u53f7\u4fe1\u606f\u6293\u53d6", None)) self.bWechat.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5de5\u5177", None)) self.label_5.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u786e\u4fdd\u6240\u6709\u516c\u4f17\u53f7\u6293\u53d6\u5b8c\u6bd5", None)) - self.groupBox_2.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u9700\u8981\u6293\u53d6\u7684\u7f51\u7ad9", None)) - self.bWebSite.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u7f51\u7ad9\u5217\u8868Excel", None)) - self.label_2.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None)) self.groupBox_3.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u5206\u6790\u5bf9\u6bd4\u5e93", None)) self.bBiao.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5206\u6790\u6807\u51c6Excel", None)) self.label_4.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None)) - self.groupBox_4.setTitle(QCoreApplication.translate("MainWindow", u"4.\u5f00\u59cb\u6267\u884c\u5de1\u67e5", None)) - self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u5de1\u67e5", None)) - self.label_6.setText(QCoreApplication.translate("MainWindow", u"\u52a1\u5fc5\u786e\u8ba4\u524d3\u6b65\u5df2\u7ecf\u5b8c\u6210!", None)) - self.lSize.setText(QCoreApplication.translate("MainWindow", u"30", None)) - self.label_3.setText(QCoreApplication.translate("MainWindow", u"KB", None)) self.groupBox_5.setTitle(QCoreApplication.translate("MainWindow", u"\u6700\u7ec8\u7ed3\u679c", None)) self.label_7.setText(QCoreApplication.translate("MainWindow", u"\u5206\u6790\u7ed3\u679cExcel:", None)) self.label_8.setText(QCoreApplication.translate("MainWindow", u"\u5206\u6790\u62a5\u544aWord:", None)) @@ -205,5 +193,13 @@ class Ui_MainWindow(object): self.bRes1.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None)) self.bRes2.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None)) self.label_9.setText(QCoreApplication.translate("MainWindow", u"\u4e2d\u56fd\u5efa\u6750\u603b\u9662\u5ba3\u4f20\u5de5\u4f5c\u4fe1\u606f\u5316\u7ba1\u7406\u5e73\u53f0", None)) + self.groupBox_6.setTitle(QCoreApplication.translate("MainWindow", u"\u65e5\u5fd7\u663e\u793a", None)) + self.groupBox_2.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u9700\u8981\u6293\u53d6\u7684\u7f51\u7ad9", None)) + self.bWebSite.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u7f51\u7ad9\u5217\u8868Excel", None)) + self.label_2.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None)) + self.label_6.setText(QCoreApplication.translate("MainWindow", u"\u5c0f\u4e8e", None)) + self.label_3.setText(QCoreApplication.translate("MainWindow", u"KB-Chrome", None)) + self.lSize.setText(QCoreApplication.translate("MainWindow", u"20", None)) + self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u5de1\u67e5", None)) # retranslateUi diff --git a/web3.py b/web3.py index d36289b..beeebf8 100644 --- a/web3.py +++ b/web3.py @@ -26,6 +26,12 @@ def sigint_handler(signal, frame): print('子进程已关闭,程序退出。') sys.exit(0) +def fix_url_scheme(url, default_scheme='http'): + # 检查URL是否包含方案 + if not url.startswith('http://') and not url.startswith('https://'): + # 如果没有方案,添加默认方案 + url = f'{default_scheme}://{url}' + return url if __name__ == '__main__': print('巡查任务开始。。。') now = datetime.datetime.now() @@ -46,15 +52,17 @@ if __name__ == '__main__': ind = 0 for ind, row in df.iterrows(): - group = row['单位'] - name = row['主办'] - url = row['地址'] - domain = urlparse(url).netloc.replace('www.', '') - # output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx') - cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx'] - # cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}'] - process = subprocess.Popen(cmd) - processes.append(process) + group = row['单位'] + name = row['主办'] + url = fix_url_scheme(row['地址'].strip()) + domain = urlparse(url).netloc.replace('www.', '') + if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']: # 这几个网站直接跳过 + continue + output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx') + # cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx'] + cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}'] + process = subprocess.Popen(cmd) + processes.append(process) # Wait for all processes to finish for process in processes: @@ -71,7 +79,8 @@ if __name__ == '__main__': output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx') if os.path.exists(output_filename): file_size = os.path.getsize(output_filename) - if file_size < 30 * 1024: # Convert KB to bytes + print(file_size/1024) + if file_size < 20 * 1024: # Convert KB to bytes info_to_save.append([group, name, url]) if info_to_save: diff --git a/zcspider/pipelines.py b/zcspider/pipelines.py index fad39b9..31db3fa 100644 --- a/zcspider/pipelines.py +++ b/zcspider/pipelines.py @@ -60,7 +60,6 @@ class ZcspiderPipeline: # raise line = [item['group'], item['name'], item['domain'], item['url'], item['text']] self.ws.append(line) - self.wb.save(self.file_name) return item # 结束,关闭连接 @@ -69,4 +68,5 @@ class ZcspiderPipeline: # self.cur.close() # # 关闭连接 # self.conn.close() + self.wb.save(self.file_name) self.wb.close() \ No newline at end of file diff --git a/zcspider/settings.py b/zcspider/settings.py index c8ce2cf..6e4f619 100644 --- a/zcspider/settings.py +++ b/zcspider/settings.py @@ -96,13 +96,13 @@ DEFAULT_REQUEST_HEADERS = { REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" FEED_EXPORT_ENCODING = 'gb18030' -LOG_LEVEL = 'ERROR' +LOG_LEVEL = 'WARNING' DOWNLOAD_TIMEOUT = 10 ITEM_PIPELINES = { - # 'zcspider.pipelines.ZcspiderPipeline': 300, + 'zcspider.pipelines.ZcspiderPipeline': 300, } FEED_EXPORTERS = { - 'xlsx': 'scrapy_xlsx.XlsxItemExporter', + # 'xlsx': 'scrapy_xlsx.XlsxItemExporter', } \ No newline at end of file diff --git a/zcspider/spiders/base.py b/zcspider/spiders/base.py index c19e9bd..18bd884 100644 --- a/zcspider/spiders/base.py +++ b/zcspider/spiders/base.py @@ -82,40 +82,43 @@ class BaseSpider(scrapy.Spider): yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30}) def parse(self, response): - self.visited_urls.add(response.url) - if self.is_file_res(response): - return - h = html2text.HTML2Text() - h.ignore_links = True # 忽略所有链接 - # 提取纯文本内容 - # try: - text = h.handle(response.text) - # except: - # text = h.handle(response.body.decode(encoding='gb18030')) - if response.status < 400: - yield { - 'group': self.group, - 'name': self.name, - 'domain': self.domain, - 'url': response.url, - 'text': text, - } + try: + if response.status >= 500: + return + self.visited_urls.add(response.url) + if self.is_file_res(response): + return + h = html2text.HTML2Text() + h.ignore_links = True # 忽略所有链接 + # 提取纯文本内容 + # try: + text = h.handle(response.text) + # except: + # text = h.handle(response.body.decode(encoding='gb18030')) + if response.status < 400: + yield { + 'group': self.group, + 'name': self.name, + 'domain': self.domain, + 'url': response.url, + 'text': text, + } + links = re.findall(r'href=["\']?([^"\'>]+)', response.text) + for link in links: + full_link = response.urljoin(link) + if not full_link.startswith('http'): + continue + if full_link not in self.visited_urls and (self.is_file_url(full_link) is False): + if urlparse(full_link).netloc.replace('www.', '') == self.domain: + # try: + yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30}) + # except ValueError: + # import traceback + # print(traceback.format_exc()) + # print(full_link) + except scrapy.exceptions.TimeoutError: + print(f'{response.url}-请求超时取消') - links = re.findall(r'href=["\']?([^"\'>]+)', text) - - for link in links: - full_link = response.urljoin(link) - if not full_link.startswith('http'): - continue - if full_link not in self.visited_urls and (self.is_file_url(full_link) is False): - if urlparse(full_link).netloc.replace('www.', '') == self.domain: - # try: - yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30}) - # except ValueError: - # import traceback - # print(traceback.format_exc()) - # print(full_link) - def closed(self, reason): # This method will be called when the Spider is about to close print(f'爬取完成: {self.name}_{self.domain}') \ No newline at end of file diff --git a/网络巡查.bat b/网络巡查.bat index 4222027..9dad4d0 100644 --- a/网络巡查.bat +++ b/网络巡查.bat @@ -1,44 +1,14 @@ -::[Bat To Exe Converter] -:: -::YAwzoRdxOk+EWAjk -::fBw5plQjdCuDJE6L5kkgJBpXSTiYP3iqD7EZ+tTo++uVtgMUV+1f -::YAwzuBVtJxjWCl3EqQJgSA== -::ZR4luwNxJguZRRnk -::Yhs/ulQjdF+5 -::cxAkpRVqdFKZSzk= -::cBs/ulQjdF+5 -::ZR41oxFsdFKZSDk= -::eBoioBt6dFKZSDk= -::cRo6pxp7LAbNWATEpCI= -::egkzugNsPRvcWATEpCI= -::dAsiuh18IRvcCxnZtBJQ -::cRYluBh/LU+EWAnk -::YxY4rhs+aU+JeA== -::cxY6rQJ7JhzQF1fEqQJQ -::ZQ05rAF9IBncCkqN+0xwdVs0 -::ZQ05rAF9IAHYFVzEqQJQ -::eg0/rx1wNQPfEVWB+kM9LVsJDGQ= -::fBEirQZwNQPfEVWB+kM9LVsJDGQ= -::cRolqwZ3JBvQF1fEqQJQ -::dhA7uBVwLU+EWDk= -::YQ03rBFzNR3SWATElA== -::dhAmsQZ3MwfNWATElA== -::ZQ0/vhVqMQ3MEVWAtB9wSA== -::Zg8zqx1/OA3MEVWAtB9wSA== -::dhA7pRFwIByZRRnk -::Zh4grVQjdCuDJE6L5kkgJBpXSTiYP3iqD7EZ+tTo++uVtgMYTOdf -::YB416Ek+ZG8= -:: -:: -::978f952a14a936cc963da21a135fa983 @echo off +if "%1" == "h" goto begin +mshta vbscript:createobject("wscript.shell").run("""%~nx0"" h",0)(window.close)&&exit +:begin setlocal -REM 设置 Python 可执行文件路? + set PYTHON_EXECUTABLE=.\runtime\python.exe -REM 设置要运行的 Python 脚本 + set PYTHON_SCRIPT=start.py -REM 运行 Python 脚本 + %PYTHON_EXECUTABLE% %PYTHON_SCRIPT% diff --git a/网络巡查.exe b/网络巡查.exe deleted file mode 100644 index f8f4e94..0000000 Binary files a/网络巡查.exe and /dev/null differ