feat: 功能优化
This commit is contained in:
parent
7480590bd3
commit
70040f1c0a
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"[python]": {
|
||||
"editor.defaultFormatter": "ms-python.autopep8"
|
||||
},
|
||||
"python.formatting.provider": "none"
|
||||
}
|
330
main.ui
330
main.ui
|
@ -7,7 +7,7 @@
|
|||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>600</width>
|
||||
<height>725</height>
|
||||
<height>763</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="sizePolicy">
|
||||
|
@ -19,13 +19,13 @@
|
|||
<property name="minimumSize">
|
||||
<size>
|
||||
<width>600</width>
|
||||
<height>725</height>
|
||||
<height>763</height>
|
||||
</size>
|
||||
</property>
|
||||
<property name="maximumSize">
|
||||
<size>
|
||||
<width>600</width>
|
||||
<height>725</height>
|
||||
<height>763</height>
|
||||
</size>
|
||||
</property>
|
||||
<property name="windowTitle">
|
||||
|
@ -80,7 +80,7 @@
|
|||
<rect>
|
||||
<x>20</x>
|
||||
<y>30</y>
|
||||
<width>75</width>
|
||||
<width>151</width>
|
||||
<height>24</height>
|
||||
</rect>
|
||||
</property>
|
||||
|
@ -123,70 +123,13 @@
|
|||
</property>
|
||||
</widget>
|
||||
</widget>
|
||||
<widget class="QGroupBox" name="groupBox_2">
|
||||
<widget class="QGroupBox" name="groupBox_3">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>380</y>
|
||||
<width>191</width>
|
||||
<height>111</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>11</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="title">
|
||||
<string>2.确认需要抓取的网站</string>
|
||||
</property>
|
||||
<widget class="QPushButton" name="bWebSite">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>20</x>
|
||||
<y>30</y>
|
||||
<width>151</width>
|
||||
<height>24</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>打开网站列表Excel</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLabel" name="label_2">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>20</x>
|
||||
<y>70</y>
|
||||
<width>151</width>
|
||||
<height>16</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<family>楷体</family>
|
||||
<pointsize>10</pointsize>
|
||||
<bold>false</bold>
|
||||
</font>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true">color: red;</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>请在修改后保存并关闭</string>
|
||||
</property>
|
||||
</widget>
|
||||
</widget>
|
||||
<widget class="QGroupBox" name="groupBox_3">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>500</y>
|
||||
<width>191</width>
|
||||
<height>101</height>
|
||||
<height>91</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
|
@ -217,7 +160,7 @@
|
|||
<property name="geometry">
|
||||
<rect>
|
||||
<x>20</x>
|
||||
<y>70</y>
|
||||
<y>60</y>
|
||||
<width>151</width>
|
||||
<height>16</height>
|
||||
</rect>
|
||||
|
@ -237,115 +180,11 @@
|
|||
</property>
|
||||
</widget>
|
||||
</widget>
|
||||
<widget class="QGroupBox" name="groupBox_4">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>210</x>
|
||||
<y>280</y>
|
||||
<width>381</width>
|
||||
<height>321</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>11</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="title">
|
||||
<string>4.开始执行巡查</string>
|
||||
</property>
|
||||
<widget class="QListView" name="vLog">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>80</y>
|
||||
<width>361</width>
|
||||
<height>231</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>9</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true"/>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QPushButton" name="bStart">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>300</x>
|
||||
<y>50</y>
|
||||
<width>75</width>
|
||||
<height>24</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>开始巡查</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLabel" name="label_6">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>30</y>
|
||||
<width>251</width>
|
||||
<height>41</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<family>楷体</family>
|
||||
<pointsize>12</pointsize>
|
||||
<bold>false</bold>
|
||||
</font>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true">color: red;</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>务必确认前3步已经完成!</string>
|
||||
</property>
|
||||
<property name="wordWrap">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLineEdit" name="lSize">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>300</x>
|
||||
<y>20</y>
|
||||
<width>41</width>
|
||||
<height>20</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>30</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLabel" name="label_3">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>350</x>
|
||||
<y>20</y>
|
||||
<width>41</width>
|
||||
<height>21</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>KB</string>
|
||||
</property>
|
||||
</widget>
|
||||
</widget>
|
||||
<widget class="QGroupBox" name="groupBox_5">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>610</y>
|
||||
<y>650</y>
|
||||
<width>581</width>
|
||||
<height>71</height>
|
||||
</rect>
|
||||
|
@ -503,6 +342,159 @@
|
|||
<number>6</number>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QGroupBox" name="groupBox_6">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>210</x>
|
||||
<y>280</y>
|
||||
<width>371</width>
|
||||
<height>361</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>11</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="title">
|
||||
<string>日志显示</string>
|
||||
</property>
|
||||
<widget class="QListView" name="vLog">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>20</y>
|
||||
<width>351</width>
|
||||
<height>321</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>9</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true"/>
|
||||
</property>
|
||||
</widget>
|
||||
</widget>
|
||||
<widget class="QGroupBox" name="groupBox_2">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>10</x>
|
||||
<y>490</y>
|
||||
<width>191</width>
|
||||
<height>151</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>11</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="title">
|
||||
<string>2.确认需要抓取的网站</string>
|
||||
</property>
|
||||
<widget class="QPushButton" name="bWebSite">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>20</x>
|
||||
<y>30</y>
|
||||
<width>151</width>
|
||||
<height>24</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>打开网站列表Excel</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLabel" name="label_2">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>20</x>
|
||||
<y>60</y>
|
||||
<width>151</width>
|
||||
<height>16</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<family>楷体</family>
|
||||
<pointsize>10</pointsize>
|
||||
<bold>false</bold>
|
||||
</font>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true">color: red;</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>请在修改后保存并关闭</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLabel" name="label_6">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>20</x>
|
||||
<y>80</y>
|
||||
<width>41</width>
|
||||
<height>21</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>小于</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLabel" name="label_3">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>100</x>
|
||||
<y>80</y>
|
||||
<width>121</width>
|
||||
<height>21</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>KB-Chrome</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLineEdit" name="lSize">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>50</x>
|
||||
<y>80</y>
|
||||
<width>41</width>
|
||||
<height>20</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>20</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QPushButton" name="bStart">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>20</x>
|
||||
<y>110</y>
|
||||
<width>151</width>
|
||||
<height>24</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="font">
|
||||
<font>
|
||||
<pointsize>12</pointsize>
|
||||
</font>
|
||||
</property>
|
||||
<property name="styleSheet">
|
||||
<string notr="true">background-color:#409EFF; color: white; border-radius: 2px</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>开始巡查</string>
|
||||
</property>
|
||||
</widget>
|
||||
</widget>
|
||||
</widget>
|
||||
<widget class="QMenuBar" name="menubar">
|
||||
<property name="geometry">
|
||||
|
|
|
@ -10,6 +10,30 @@ import os
|
|||
chrome_driver_file = os.path.join(BASE_DIR, 'mycode', 'chromedriver.exe')
|
||||
failed_sites_file = os.path.join(BASE_DIR, 'mycode/failed_sites.xlsx')
|
||||
|
||||
def fix_url_scheme(url, default_scheme='http'):
    """Return *url* with an explicit scheme, adding *default_scheme* if missing.

    Args:
        url: Address as read from the site list. Leading and trailing
            whitespace is ignored.
        default_scheme: Scheme to prepend when *url* has neither ``http://``
            nor ``https://``.

    Returns:
        The whitespace-stripped URL, prefixed with ``default_scheme`` when it
        did not already start with an HTTP(S) scheme. An empty or
        whitespace-only input is returned as an empty string so callers can
        detect a missing address instead of receiving a bare ``http://``.
    """
    url = url.strip()
    if not url:
        return url

    # Schemes are case-insensitive, so 'HTTP://host' must not get a second one.
    if url.lower().startswith(('http://', 'https://')):
        return url

    # Protocol-relative address ('//host/path'): only the scheme name is missing.
    if url.startswith('//'):
        return f'{default_scheme}:{url}'

    return f'{default_scheme}://{url}'
|
||||
|
||||
def init_driver():
    """Create the Chrome WebDriver shared by the crawl loop.

    The browser is started with a fixed desktop User-Agent, with the image and
    notification content settings set to 2, and with the default-apps and
    protocol-handler related features switched off (per the original comments,
    to keep ``tel:`` and ``mailto:`` links from triggering external handlers).

    Returns:
        The ``webdriver.Chrome`` instance built from ``chrome_driver_file``.
    """
    options = Options()
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
    options.add_argument("--disable-default-apps")  # disable the default apps

    # Each switch is passed exactly once. The original passed
    # --disable-features twice (the second list being a superset of the first)
    # and --disable-protocol-handler twice; a repeated switch does not add to
    # the earlier one, so only the full feature list is kept here.
    # NOTE(review): assumes Chrome honours the last occurrence of a repeated
    # switch -- confirm against the Chrome version in use.
    options.add_argument("--disable-features=WebAppInstallForceList,ProtocolHandler,ProtocolHandlerMixedSchemes,PreloadMediaEngagementData")
    options.add_argument("--disable-protocol-handler")

    # NOTE(review): the value 2 presumably means "block" for these content
    # settings -- confirm against the Chrome preference documentation.
    prefs = {"profile.managed_default_content_settings.images": 2, 'profile.managed_default_content_settings.notifications':2}
    options.add_experimental_option("prefs", prefs)

    # NOTE(review): passing the driver path positionally is the Selenium 3
    # calling convention -- confirm the pinned Selenium version accepts it.
    driver = webdriver.Chrome(chrome_driver_file, options=options)
    return driver
|
||||
|
||||
def open_website(url):
|
||||
# Set up Chrome WebDriver with custom User-Agent
|
||||
options = Options()
|
||||
|
@ -34,7 +58,9 @@ def ignore_image_and_document_hrefs(href):
|
|||
# Check if the href has a domain suffix of image or document file extensions
|
||||
return file_extension.lower() in ('.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js')
|
||||
|
||||
def process_page(driver, url, visited_pages, start_domain, data):
|
||||
def process_page(driver, url, visited_pages, start_domain, data, group, name):
|
||||
if not url.startswith('http'):
|
||||
return
|
||||
# Add the URL to visited pages
|
||||
visited_pages.add(url)
|
||||
# Navigate to the URL
|
||||
|
@ -46,8 +72,8 @@ def process_page(driver, url, visited_pages, start_domain, data):
|
|||
content_text = content_element.text
|
||||
# print(content_text)
|
||||
# Add URL, Domain, and Content to the data list
|
||||
data.append([start_domain, url, content_text])
|
||||
|
||||
data.append([group, name, start_domain, url, content_text])
|
||||
|
||||
# Find and process hyperlinks
|
||||
hrefs = extract_hyperlinks(driver)
|
||||
for href in hrefs:
|
||||
|
@ -65,26 +91,26 @@ def process_page(driver, url, visited_pages, start_domain, data):
|
|||
parsed_href = urlparse(href)
|
||||
if parsed_href.netloc.replace("www.", "") != start_domain:
|
||||
continue
|
||||
# Open the href in the same tab and retrieve data
|
||||
driver.get(href)
|
||||
# print(href)
|
||||
# Wait for the page to load
|
||||
time.sleep(2)
|
||||
# Extract the content from the hyperlink page
|
||||
hyperlink_content_element = driver.find_element(By.XPATH, '//body')
|
||||
hyperlink_content_text = hyperlink_content_element.text
|
||||
# print(hyperlink_content_text)
|
||||
# Add URL, Domain, and Content of the hyperlink to the data list
|
||||
data.append([start_domain, href, hyperlink_content_text])
|
||||
# # Open the href in the same tab and retrieve data
|
||||
# driver.get(href)
|
||||
# # print(href)
|
||||
# # Wait for the page to load
|
||||
# time.sleep(2)
|
||||
# # Extract the content from the hyperlink page
|
||||
# hyperlink_content_element = driver.find_element(By.XPATH, '//body')
|
||||
# hyperlink_content_text = hyperlink_content_element.text
|
||||
# # print(hyperlink_content_text)
|
||||
# # Add URL, Domain, and Content of the hyperlink to the data list
|
||||
# data.append([start_domain, href, hyperlink_content_text])
|
||||
# Recursively process the page and follow hyperlinks
|
||||
process_page(driver, href, visited_pages, start_domain, data)
|
||||
process_page(driver, href, visited_pages, start_domain, data, group, name)
|
||||
except Exception as e:
|
||||
print(f"Error processing hyperlink: {href}")
|
||||
print(f"Error message: {str(e)}")
|
||||
continue
|
||||
|
||||
# Return to the original page
|
||||
driver.get(url)
|
||||
# driver.get(url)
|
||||
|
||||
def check_href(href, original_url, visited_pages):
|
||||
parsed_href = urlparse(href)
|
||||
|
@ -134,14 +160,18 @@ def add_cookies(driver, cookies):
|
|||
driver.add_cookie({'name': name, 'value': value})
|
||||
|
||||
def chrom_main_from_list(sites):
|
||||
driver = init_driver()
|
||||
for ind, item in enumerate(sites):
|
||||
group = item[0] # Replace with the actual column name for group
|
||||
name = item[1]
|
||||
url = item[2]
|
||||
domain = urlparse(url).netloc.replace("www.", "")
|
||||
|
||||
if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:
|
||||
continue
|
||||
url = fix_url_scheme(url)
|
||||
print(url)
|
||||
# Open the website
|
||||
driver = open_website(url)
|
||||
driver.get(url)
|
||||
|
||||
# Retrieve cookies from previous session
|
||||
cookies = get_cookies_from_previous_session(driver)
|
||||
|
@ -154,14 +184,14 @@ def chrom_main_from_list(sites):
|
|||
data = []
|
||||
|
||||
# Process the starting page and follow hyperlinks recursively
|
||||
process_page(driver, url, visited_pages, domain, data)
|
||||
process_page(driver, url, visited_pages, domain, data, group, name)
|
||||
|
||||
# Export data to a separate Excel file in the web_dir directory
|
||||
output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
|
||||
export_to_excel(data, output_filename)
|
||||
|
||||
# Close the WebDriver
|
||||
driver.quit()
|
||||
# Close the WebDriver
|
||||
driver.quit()
|
||||
|
||||
def chrome_main():
|
||||
# Read failed URLs from the list
|
||||
|
@ -172,7 +202,7 @@ def chrome_main():
|
|||
name = row['主办']
|
||||
url = row['地址']
|
||||
domain = urlparse(url).netloc.replace("www.", "")
|
||||
|
||||
|
||||
# Open the website
|
||||
driver = open_website(url)
|
||||
|
||||
|
|
|
@ -73,11 +73,13 @@ def ana_wechat():
|
|||
def ana_web():
|
||||
output_data = []
|
||||
index = 1
|
||||
|
||||
for file in os.listdir(web_dir):
|
||||
full_path = os.path.join(web_dir, file)
|
||||
if '$' in full_path:
|
||||
continue
|
||||
print(full_path)
|
||||
if os.path.getsize(full_path) > 0:
|
||||
df = pd.read_excel(os.path.join(web_dir, file))
|
||||
df = pd.read_excel(os.path.join(full_path), engine='openpyxl')
|
||||
for ind, row in df_s.iterrows():
|
||||
mask = df['text'].str.contains(row['错误表述'], na=False)
|
||||
result = df[mask]
|
||||
|
|
88
start.py
88
start.py
|
@ -21,6 +21,12 @@ BIAO_PATH = os.path.join(BASE_DIR, 'biao.xlsx')
|
|||
PYTHON_PATH = os.path.join(BASE_DIR, 'runtime/python.exe')
|
||||
TEMPLATE_PATH = os.path.join(BASE_DIR, 'summary/template.xlsx')
|
||||
|
||||
def fix_url_scheme(url, default_scheme='http'):
    """Return *url* with an explicit scheme, adding *default_scheme* if missing.

    Args:
        url: Address as read from the site list. Leading and trailing
            whitespace is ignored.
        default_scheme: Scheme to prepend when *url* has neither ``http://``
            nor ``https://``.

    Returns:
        The whitespace-stripped URL, prefixed with ``default_scheme`` when it
        did not already start with an HTTP(S) scheme. An empty or
        whitespace-only input is returned as an empty string so callers can
        detect a missing address instead of receiving a bare ``http://``.
    """
    url = url.strip()
    if not url:
        return url

    # Schemes are case-insensitive, so 'HTTP://host' must not get a second one.
    if url.lower().startswith(('http://', 'https://')):
        return url

    # Protocol-relative address ('//host/path'): only the scheme name is missing.
    if url.startswith('//'):
        return f'{default_scheme}:{url}'

    return f'{default_scheme}://{url}'
|
||||
class MyApplication(QApplication):
|
||||
def __init__(self, argv):
|
||||
super(MyApplication, self).__init__(argv)
|
||||
|
@ -49,52 +55,17 @@ class MyThread(QThread):
|
|||
if output:
|
||||
self.update_signal.emit({'msg': output.strip()})
|
||||
|
||||
def run(self) -> None:
|
||||
def ana(self):
|
||||
month = datetime.datetime.now().month
|
||||
self.update_signal.emit({'msg': '巡查任务开始...'})
|
||||
self.update_signal.emit({'msg': '对比开始...'})
|
||||
self.update_signal.emit({'msg': '正在组合微信公众号爬取内容...'})
|
||||
make_simple_csv_from_db()
|
||||
make_wechat_articles_full()
|
||||
self.update_signal.emit({'msg': "公众号爬取内容组装完毕!"})
|
||||
self.update_signal.emit({'msg': '开始进行网站爬取...'})
|
||||
df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
|
||||
ind = 0
|
||||
for ind, row in df.iterrows():
|
||||
group = row['单位']
|
||||
name = row['主办']
|
||||
url = row['地址']
|
||||
domain = urlparse(url).netloc.replace('www.', '')
|
||||
# output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
|
||||
# -u 代表不缓冲,直接输出
|
||||
cmd = [PYTHON_PATH, '-u', '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
|
||||
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=False)
|
||||
self.processes.append(process)
|
||||
self.running = True
|
||||
getlog_thread = threading.Thread(target=self.capture_output, args=(process,), daemon=True)
|
||||
getlog_thread.start()
|
||||
|
||||
for process in self.processes:
|
||||
process.wait()
|
||||
self.update_signal.emit({'msg': '网站爬取结束,校验中...'})
|
||||
info_to_save = []
|
||||
for ind, row in df.iterrows():
|
||||
group = row['单位']
|
||||
name = row['主办']
|
||||
url = row['地址']
|
||||
domain = urlparse(url).netloc.replace("www.", "")
|
||||
output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
|
||||
if os.path.exists(output_filename):
|
||||
file_size = os.path.getsize(output_filename)
|
||||
if file_size < self.lsize * 1024: # Convert KB to bytes
|
||||
info_to_save.append([group, name, url])
|
||||
if info_to_save:
|
||||
self.update_signal.emit({'msg': '存在未爬取站点,正在调用Chrome继续爬取...'})
|
||||
chrom_main_from_list(info_to_save)
|
||||
self.update_signal.emit({'msg': '网站爬取完毕!'})
|
||||
self.update_signal.emit({'msg': '开始对比分析所有内容...'})
|
||||
wechat_results = ana_wechat()
|
||||
web_results = ana_web()
|
||||
output_excel_path = os.path.join(BASE_DIR, f'summary/{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
|
||||
output_excel_path = os.path.join(BASE_DIR, f'summary/{month}月-总院宣传阵地巡查结果汇总表.xlsx')
|
||||
workbook = load_workbook(TEMPLATE_PATH)
|
||||
# 选择要操作的工作表
|
||||
wechat_sheet = workbook['公众号']
|
||||
|
@ -105,7 +76,45 @@ class MyThread(QThread):
|
|||
web_sheet.append(row)
|
||||
workbook.save(output_excel_path)
|
||||
workbook.close()
|
||||
self.update_signal.emit({'msg': '巡查任务执行完毕, 请查看结果栏, 可手动校对', 'output_excel_path': output_excel_path})
|
||||
self.update_signal.emit({'msg': '分析完毕, 请查看结果栏, 可手动校对', 'output_excel_path': output_excel_path})
|
||||
|
||||
def run(self) -> None:
    """Crawl every listed site, retry undersized results with Chrome, then analyse.

    Steps, in order:
      1. Read the site list from ``web_sites.xlsx`` (sheet ``Sheet1``).
      2. Start one ``scrapy crawl basespider`` subprocess per row and a daemon
         thread running ``self.capture_output`` for each of them.
      3. Wait for every subprocess recorded in ``self.processes``.
      4. Collect the sites whose output workbook exists but is smaller than
         ``self.lsize`` KB and hand them to ``chrom_main_from_list``.
      5. Run ``self.ana()`` and finally enter ``self.exec()``.

    Progress is reported through ``self.update_signal``.
    """
    self.update_signal.emit({'msg': '开始进行网站爬取...'})
    df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')

    # Each row is normalised exactly once; the size check below reuses the
    # same url/output values instead of deriving them a second time, so the
    # launch pass and the validation pass cannot disagree about the path.
    sites = []
    for _, row in df.iterrows():
        group = row['单位']
        name = row['主办']
        # NOTE(review): an empty address cell would presumably arrive as a
        # non-string value and fail on .strip() -- confirm the sheet has no blanks.
        url = fix_url_scheme(row['地址'].strip())
        domain = urlparse(url).netloc.replace('www.', '')
        output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
        sites.append((group, name, url, output))

        # -u means unbuffered, so the child's output is available immediately
        cmd = [PYTHON_PATH, '-u', '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
        # NOTE(review): stderr is piped as well; verify capture_output drains
        # it, otherwise a chatty child could block on a full pipe.
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=False)
        self.processes.append(process)
        self.running = True
        getlog_thread = threading.Thread(target=self.capture_output, args=(process,), daemon=True)
        getlog_thread.start()

    # All crawlers were started above; block here until each one exits.
    for process in self.processes:
        process.wait()
    self.update_signal.emit({'msg': '网站爬取结束,校验中...'})

    # A workbook below the size threshold is treated as a failed crawl.
    # NOTE(review): sites whose workbook is missing altogether are not
    # retried -- confirm that this is intended.
    info_to_save = []
    for group, name, url, output_filename in sites:
        if os.path.exists(output_filename):
            file_size = os.path.getsize(output_filename)
            if file_size < self.lsize * 1024:  # Convert KB to bytes
                info_to_save.append([group, name, url])

    if info_to_save:
        self.update_signal.emit({'msg': '存在未爬取站点,正在调用Chrome继续爬取...'})
        chrom_main_from_list(info_to_save)
    self.update_signal.emit({'msg': '网站爬取完毕!'})
    self.ana()
    self.exec()
|
||||
|
||||
def close(self):
|
||||
|
@ -192,6 +201,7 @@ class MainWindow(QMainWindow):
|
|||
self.worker_thread.update_signal.connect(self.update_log)
|
||||
self.worker_thread.start()
|
||||
|
||||
|
||||
def update_log(self, rdict):
|
||||
self.log(f'{self.get_time()}-{rdict["msg"]}', False)
|
||||
if 'output_excel_path' in rdict:
|
||||
|
|
Binary file not shown.
138
ui_mainwindow.py
138
ui_mainwindow.py
|
@ -23,14 +23,14 @@ class Ui_MainWindow(object):
|
|||
def setupUi(self, MainWindow):
|
||||
if not MainWindow.objectName():
|
||||
MainWindow.setObjectName(u"MainWindow")
|
||||
MainWindow.resize(600, 725)
|
||||
MainWindow.resize(600, 763)
|
||||
sizePolicy = QSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed)
|
||||
sizePolicy.setHorizontalStretch(0)
|
||||
sizePolicy.setVerticalStretch(0)
|
||||
sizePolicy.setHeightForWidth(MainWindow.sizePolicy().hasHeightForWidth())
|
||||
MainWindow.setSizePolicy(sizePolicy)
|
||||
MainWindow.setMinimumSize(QSize(600, 725))
|
||||
MainWindow.setMaximumSize(QSize(600, 725))
|
||||
MainWindow.setMinimumSize(QSize(600, 763))
|
||||
MainWindow.setMaximumSize(QSize(600, 763))
|
||||
icon = QIcon()
|
||||
icon.addFile(u"start.ico", QSize(), QIcon.Normal, QIcon.Off)
|
||||
MainWindow.setWindowIcon(icon)
|
||||
|
@ -50,7 +50,7 @@ class Ui_MainWindow(object):
|
|||
self.groupBox.setFont(font)
|
||||
self.bWechat = QPushButton(self.groupBox)
|
||||
self.bWechat.setObjectName(u"bWechat")
|
||||
self.bWechat.setGeometry(QRect(20, 30, 75, 24))
|
||||
self.bWechat.setGeometry(QRect(20, 30, 151, 24))
|
||||
self.bWechat.setFont(font)
|
||||
self.bWechat.setAutoFillBackground(False)
|
||||
self.bWechat.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
|
||||
|
@ -63,22 +63,9 @@ class Ui_MainWindow(object):
|
|||
font1.setBold(False)
|
||||
self.label_5.setFont(font1)
|
||||
self.label_5.setStyleSheet(u"color: red;")
|
||||
self.groupBox_2 = QGroupBox(self.centralwidget)
|
||||
self.groupBox_2.setObjectName(u"groupBox_2")
|
||||
self.groupBox_2.setGeometry(QRect(10, 380, 191, 111))
|
||||
self.groupBox_2.setFont(font)
|
||||
self.bWebSite = QPushButton(self.groupBox_2)
|
||||
self.bWebSite.setObjectName(u"bWebSite")
|
||||
self.bWebSite.setGeometry(QRect(20, 30, 151, 24))
|
||||
self.bWebSite.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
|
||||
self.label_2 = QLabel(self.groupBox_2)
|
||||
self.label_2.setObjectName(u"label_2")
|
||||
self.label_2.setGeometry(QRect(20, 70, 151, 16))
|
||||
self.label_2.setFont(font1)
|
||||
self.label_2.setStyleSheet(u"color: red;")
|
||||
self.groupBox_3 = QGroupBox(self.centralwidget)
|
||||
self.groupBox_3.setObjectName(u"groupBox_3")
|
||||
self.groupBox_3.setGeometry(QRect(10, 500, 191, 101))
|
||||
self.groupBox_3.setGeometry(QRect(10, 380, 191, 91))
|
||||
self.groupBox_3.setFont(font)
|
||||
self.bBiao = QPushButton(self.groupBox_3)
|
||||
self.bBiao.setObjectName(u"bBiao")
|
||||
|
@ -86,54 +73,23 @@ class Ui_MainWindow(object):
|
|||
self.bBiao.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
|
||||
self.label_4 = QLabel(self.groupBox_3)
|
||||
self.label_4.setObjectName(u"label_4")
|
||||
self.label_4.setGeometry(QRect(20, 70, 151, 16))
|
||||
self.label_4.setGeometry(QRect(20, 60, 151, 16))
|
||||
self.label_4.setFont(font1)
|
||||
self.label_4.setStyleSheet(u"color: red;")
|
||||
self.groupBox_4 = QGroupBox(self.centralwidget)
|
||||
self.groupBox_4.setObjectName(u"groupBox_4")
|
||||
self.groupBox_4.setGeometry(QRect(210, 280, 381, 321))
|
||||
self.groupBox_4.setFont(font)
|
||||
self.vLog = QListView(self.groupBox_4)
|
||||
self.vLog.setObjectName(u"vLog")
|
||||
self.vLog.setGeometry(QRect(10, 80, 361, 231))
|
||||
font2 = QFont()
|
||||
font2.setPointSize(9)
|
||||
self.vLog.setFont(font2)
|
||||
self.vLog.setStyleSheet(u"")
|
||||
self.bStart = QPushButton(self.groupBox_4)
|
||||
self.bStart.setObjectName(u"bStart")
|
||||
self.bStart.setGeometry(QRect(300, 50, 75, 24))
|
||||
self.bStart.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
|
||||
self.label_6 = QLabel(self.groupBox_4)
|
||||
self.label_6.setObjectName(u"label_6")
|
||||
self.label_6.setGeometry(QRect(10, 30, 251, 41))
|
||||
font3 = QFont()
|
||||
font3.setFamilies([u"\u6977\u4f53"])
|
||||
font3.setPointSize(12)
|
||||
font3.setBold(False)
|
||||
self.label_6.setFont(font3)
|
||||
self.label_6.setStyleSheet(u"color: red;")
|
||||
self.label_6.setWordWrap(True)
|
||||
self.lSize = QLineEdit(self.groupBox_4)
|
||||
self.lSize.setObjectName(u"lSize")
|
||||
self.lSize.setGeometry(QRect(300, 20, 41, 20))
|
||||
self.label_3 = QLabel(self.groupBox_4)
|
||||
self.label_3.setObjectName(u"label_3")
|
||||
self.label_3.setGeometry(QRect(350, 20, 41, 21))
|
||||
self.groupBox_5 = QGroupBox(self.centralwidget)
|
||||
self.groupBox_5.setObjectName(u"groupBox_5")
|
||||
self.groupBox_5.setGeometry(QRect(10, 610, 581, 71))
|
||||
self.groupBox_5.setGeometry(QRect(10, 650, 581, 71))
|
||||
self.groupBox_5.setFont(font)
|
||||
self.label_7 = QLabel(self.groupBox_5)
|
||||
self.label_7.setObjectName(u"label_7")
|
||||
self.label_7.setGeometry(QRect(10, 20, 91, 16))
|
||||
font4 = QFont()
|
||||
font4.setPointSize(10)
|
||||
self.label_7.setFont(font4)
|
||||
font2 = QFont()
|
||||
font2.setPointSize(10)
|
||||
self.label_7.setFont(font2)
|
||||
self.label_8 = QLabel(self.groupBox_5)
|
||||
self.label_8.setObjectName(u"label_8")
|
||||
self.label_8.setGeometry(QRect(10, 40, 91, 16))
|
||||
self.label_8.setFont(font4)
|
||||
self.label_8.setFont(font2)
|
||||
self.line = QFrame(self.groupBox_5)
|
||||
self.line.setObjectName(u"line")
|
||||
self.line.setGeometry(QRect(10, 30, 561, 16))
|
||||
|
@ -142,11 +98,13 @@ class Ui_MainWindow(object):
|
|||
self.lRes1 = QLabel(self.groupBox_5)
|
||||
self.lRes1.setObjectName(u"lRes1")
|
||||
self.lRes1.setGeometry(QRect(110, 15, 381, 21))
|
||||
self.lRes1.setFont(font2)
|
||||
font3 = QFont()
|
||||
font3.setPointSize(9)
|
||||
self.lRes1.setFont(font3)
|
||||
self.lRes2 = QLabel(self.groupBox_5)
|
||||
self.lRes2.setObjectName(u"lRes2")
|
||||
self.lRes2.setGeometry(QRect(110, 40, 381, 16))
|
||||
self.lRes2.setFont(font2)
|
||||
self.lRes2.setFont(font3)
|
||||
self.bRes1 = QPushButton(self.groupBox_5)
|
||||
self.bRes1.setObjectName(u"bRes1")
|
||||
self.bRes1.setGeometry(QRect(520, 10, 51, 24))
|
||||
|
@ -157,15 +115,53 @@ class Ui_MainWindow(object):
|
|||
self.label_9 = QLabel(self.centralwidget)
|
||||
self.label_9.setObjectName(u"label_9")
|
||||
self.label_9.setGeometry(QRect(150, 0, 291, 31))
|
||||
font5 = QFont()
|
||||
font5.setFamilies([u"\u6977\u4f53"])
|
||||
font5.setPointSize(12)
|
||||
font5.setBold(False)
|
||||
font5.setItalic(False)
|
||||
self.label_9.setFont(font5)
|
||||
font4 = QFont()
|
||||
font4.setFamilies([u"\u6977\u4f53"])
|
||||
font4.setPointSize(12)
|
||||
font4.setBold(False)
|
||||
font4.setItalic(False)
|
||||
self.label_9.setFont(font4)
|
||||
self.label_9.setStyleSheet(u"color:white;")
|
||||
self.label_9.setAlignment(Qt.AlignRight|Qt.AlignTrailing|Qt.AlignVCenter)
|
||||
self.label_9.setMargin(6)
|
||||
self.groupBox_6 = QGroupBox(self.centralwidget)
|
||||
self.groupBox_6.setObjectName(u"groupBox_6")
|
||||
self.groupBox_6.setGeometry(QRect(210, 280, 371, 361))
|
||||
self.groupBox_6.setFont(font)
|
||||
self.vLog = QListView(self.groupBox_6)
|
||||
self.vLog.setObjectName(u"vLog")
|
||||
self.vLog.setGeometry(QRect(10, 20, 351, 321))
|
||||
self.vLog.setFont(font3)
|
||||
self.vLog.setStyleSheet(u"")
|
||||
self.groupBox_2 = QGroupBox(self.centralwidget)
|
||||
self.groupBox_2.setObjectName(u"groupBox_2")
|
||||
self.groupBox_2.setGeometry(QRect(10, 490, 191, 151))
|
||||
self.groupBox_2.setFont(font)
|
||||
self.bWebSite = QPushButton(self.groupBox_2)
|
||||
self.bWebSite.setObjectName(u"bWebSite")
|
||||
self.bWebSite.setGeometry(QRect(20, 30, 151, 24))
|
||||
self.bWebSite.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
|
||||
self.label_2 = QLabel(self.groupBox_2)
|
||||
self.label_2.setObjectName(u"label_2")
|
||||
self.label_2.setGeometry(QRect(20, 60, 151, 16))
|
||||
self.label_2.setFont(font1)
|
||||
self.label_2.setStyleSheet(u"color: red;")
|
||||
self.label_6 = QLabel(self.groupBox_2)
|
||||
self.label_6.setObjectName(u"label_6")
|
||||
self.label_6.setGeometry(QRect(20, 80, 41, 21))
|
||||
self.label_3 = QLabel(self.groupBox_2)
|
||||
self.label_3.setObjectName(u"label_3")
|
||||
self.label_3.setGeometry(QRect(100, 80, 121, 21))
|
||||
self.lSize = QLineEdit(self.groupBox_2)
|
||||
self.lSize.setObjectName(u"lSize")
|
||||
self.lSize.setGeometry(QRect(50, 80, 41, 20))
|
||||
self.bStart = QPushButton(self.groupBox_2)
|
||||
self.bStart.setObjectName(u"bStart")
|
||||
self.bStart.setGeometry(QRect(20, 110, 151, 24))
|
||||
font5 = QFont()
|
||||
font5.setPointSize(12)
|
||||
self.bStart.setFont(font5)
|
||||
self.bStart.setStyleSheet(u"background-color:#409EFF; color: white; border-radius: 2px")
|
||||
MainWindow.setCentralWidget(self.centralwidget)
|
||||
self.menubar = QMenuBar(MainWindow)
|
||||
self.menubar.setObjectName(u"menubar")
|
||||
|
@ -186,17 +182,9 @@ class Ui_MainWindow(object):
|
|||
self.groupBox.setTitle(QCoreApplication.translate("MainWindow", u"1.\u5fae\u4fe1\u516c\u4f17\u53f7\u4fe1\u606f\u6293\u53d6", None))
|
||||
self.bWechat.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5de5\u5177", None))
|
||||
self.label_5.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u786e\u4fdd\u6240\u6709\u516c\u4f17\u53f7\u6293\u53d6\u5b8c\u6bd5", None))
|
||||
self.groupBox_2.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u9700\u8981\u6293\u53d6\u7684\u7f51\u7ad9", None))
|
||||
self.bWebSite.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u7f51\u7ad9\u5217\u8868Excel", None))
|
||||
self.label_2.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
|
||||
self.groupBox_3.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u5206\u6790\u5bf9\u6bd4\u5e93", None))
|
||||
self.bBiao.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u5206\u6790\u6807\u51c6Excel", None))
|
||||
self.label_4.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
|
||||
self.groupBox_4.setTitle(QCoreApplication.translate("MainWindow", u"4.\u5f00\u59cb\u6267\u884c\u5de1\u67e5", None))
|
||||
self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u5de1\u67e5", None))
|
||||
self.label_6.setText(QCoreApplication.translate("MainWindow", u"\u52a1\u5fc5\u786e\u8ba4\u524d3\u6b65\u5df2\u7ecf\u5b8c\u6210!", None))
|
||||
self.lSize.setText(QCoreApplication.translate("MainWindow", u"30", None))
|
||||
self.label_3.setText(QCoreApplication.translate("MainWindow", u"KB", None))
|
||||
self.groupBox_5.setTitle(QCoreApplication.translate("MainWindow", u"\u6700\u7ec8\u7ed3\u679c", None))
|
||||
self.label_7.setText(QCoreApplication.translate("MainWindow", u"\u5206\u6790\u7ed3\u679cExcel:", None))
|
||||
self.label_8.setText(QCoreApplication.translate("MainWindow", u"\u5206\u6790\u62a5\u544aWord:", None))
|
||||
|
@ -205,5 +193,13 @@ class Ui_MainWindow(object):
|
|||
self.bRes1.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
|
||||
self.bRes2.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00", None))
|
||||
self.label_9.setText(QCoreApplication.translate("MainWindow", u"\u4e2d\u56fd\u5efa\u6750\u603b\u9662\u5ba3\u4f20\u5de5\u4f5c\u4fe1\u606f\u5316\u7ba1\u7406\u5e73\u53f0", None))
|
||||
self.groupBox_6.setTitle(QCoreApplication.translate("MainWindow", u"\u65e5\u5fd7\u663e\u793a", None))
|
||||
self.groupBox_2.setTitle(QCoreApplication.translate("MainWindow", u"2.\u786e\u8ba4\u9700\u8981\u6293\u53d6\u7684\u7f51\u7ad9", None))
|
||||
self.bWebSite.setText(QCoreApplication.translate("MainWindow", u"\u6253\u5f00\u7f51\u7ad9\u5217\u8868Excel", None))
|
||||
self.label_2.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
|
||||
self.label_6.setText(QCoreApplication.translate("MainWindow", u"\u5c0f\u4e8e", None))
|
||||
self.label_3.setText(QCoreApplication.translate("MainWindow", u"KB-Chrome", None))
|
||||
self.lSize.setText(QCoreApplication.translate("MainWindow", u"20", None))
|
||||
self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u5de1\u67e5", None))
|
||||
# retranslateUi
|
||||
|
||||
|
|
29
web3.py
29
web3.py
|
@ -26,6 +26,12 @@ def sigint_handler(signal, frame):
|
|||
print('子进程已关闭,程序退出。')
|
||||
sys.exit(0)
|
||||
|
||||
def fix_url_scheme(url, default_scheme='http'):
    """Return *url* with an explicit ``http``/``https`` scheme.

    ``urlparse`` only fills in ``netloc`` when the URL carries a scheme (or a
    leading ``//``), so bare hosts such as ``example.com`` must be prefixed
    before the domain can be extracted.

    Args:
        url: Address as entered by the user; may lack a scheme, may be
            protocol-relative (``//host/path``) and may carry surrounding
            whitespace.
        default_scheme: Scheme to prepend when none is present.

    Returns:
        The whitespace-stripped URL, unchanged if it already starts with
        ``http://`` or ``https://`` (compared case-insensitively), otherwise
        prefixed with ``default_scheme://``.
    """
    url = url.strip()
    # Schemes are case-insensitive, so "HTTP://host" must not be prefixed again.
    if url.lower().startswith(('http://', 'https://')):
        return url
    # Drop leading slashes so protocol-relative input ("//host") does not
    # become "http:////host", which urlparse reads as an empty netloc.
    remainder = url.lstrip('/')
    return f'{default_scheme}://{remainder}'
|
||||
if __name__ == '__main__':
|
||||
print('巡查任务开始。。。')
|
||||
now = datetime.datetime.now()
|
||||
|
@ -46,15 +52,17 @@ if __name__ == '__main__':
|
|||
|
||||
ind = 0
|
||||
for ind, row in df.iterrows():
|
||||
group = row['单位']
|
||||
name = row['主办']
|
||||
url = row['地址']
|
||||
domain = urlparse(url).netloc.replace('www.', '')
|
||||
# output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
|
||||
cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
|
||||
# cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
|
||||
process = subprocess.Popen(cmd)
|
||||
processes.append(process)
|
||||
group = row['单位']
|
||||
name = row['主办']
|
||||
url = fix_url_scheme(row['地址'].strip())
|
||||
domain = urlparse(url).netloc.replace('www.', '')
|
||||
if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']: # 这几个网站直接跳过
|
||||
continue
|
||||
output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
|
||||
# cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
|
||||
cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
|
||||
process = subprocess.Popen(cmd)
|
||||
processes.append(process)
|
||||
|
||||
# Wait for all processes to finish
|
||||
for process in processes:
|
||||
|
@ -71,7 +79,8 @@ if __name__ == '__main__':
|
|||
output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
|
||||
if os.path.exists(output_filename):
|
||||
file_size = os.path.getsize(output_filename)
|
||||
if file_size < 30 * 1024: # Convert KB to bytes
|
||||
print(file_size/1024)
|
||||
if file_size < 20 * 1024: # Convert KB to bytes
|
||||
info_to_save.append([group, name, url])
|
||||
|
||||
if info_to_save:
|
||||
|
|
|
@ -60,7 +60,6 @@ class ZcspiderPipeline:
|
|||
# raise
|
||||
line = [item['group'], item['name'], item['domain'], item['url'], item['text']]
|
||||
self.ws.append(line)
|
||||
self.wb.save(self.file_name)
|
||||
return item
|
||||
|
||||
# 结束,关闭连接
|
||||
|
@ -69,4 +68,5 @@ class ZcspiderPipeline:
|
|||
# self.cur.close()
|
||||
# # 关闭连接
|
||||
# self.conn.close()
|
||||
self.wb.save(self.file_name)
|
||||
self.wb.close()
|
|
@ -96,13 +96,13 @@ DEFAULT_REQUEST_HEADERS = {
|
|||
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
|
||||
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
|
||||
FEED_EXPORT_ENCODING = 'gb18030'
|
||||
LOG_LEVEL = 'ERROR'
|
||||
LOG_LEVEL = 'WARNING'
|
||||
DOWNLOAD_TIMEOUT = 10
|
||||
|
||||
ITEM_PIPELINES = {
|
||||
# 'zcspider.pipelines.ZcspiderPipeline': 300,
|
||||
'zcspider.pipelines.ZcspiderPipeline': 300,
|
||||
}
|
||||
|
||||
FEED_EXPORTERS = {
|
||||
'xlsx': 'scrapy_xlsx.XlsxItemExporter',
|
||||
# 'xlsx': 'scrapy_xlsx.XlsxItemExporter',
|
||||
}
|
|
@ -82,40 +82,43 @@ class BaseSpider(scrapy.Spider):
|
|||
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
|
||||
|
||||
def parse(self, response):
|
||||
self.visited_urls.add(response.url)
|
||||
if self.is_file_res(response):
|
||||
return
|
||||
h = html2text.HTML2Text()
|
||||
h.ignore_links = True # 忽略所有链接
|
||||
# 提取纯文本内容
|
||||
# try:
|
||||
text = h.handle(response.text)
|
||||
# except:
|
||||
# text = h.handle(response.body.decode(encoding='gb18030'))
|
||||
if response.status < 400:
|
||||
yield {
|
||||
'group': self.group,
|
||||
'name': self.name,
|
||||
'domain': self.domain,
|
||||
'url': response.url,
|
||||
'text': text,
|
||||
}
|
||||
try:
|
||||
if response.status >= 500:
|
||||
return
|
||||
self.visited_urls.add(response.url)
|
||||
if self.is_file_res(response):
|
||||
return
|
||||
h = html2text.HTML2Text()
|
||||
h.ignore_links = True # 忽略所有链接
|
||||
# 提取纯文本内容
|
||||
# try:
|
||||
text = h.handle(response.text)
|
||||
# except:
|
||||
# text = h.handle(response.body.decode(encoding='gb18030'))
|
||||
if response.status < 400:
|
||||
yield {
|
||||
'group': self.group,
|
||||
'name': self.name,
|
||||
'domain': self.domain,
|
||||
'url': response.url,
|
||||
'text': text,
|
||||
}
|
||||
links = re.findall(r'href=["\']?([^"\'>]+)', response.text)
|
||||
for link in links:
|
||||
full_link = response.urljoin(link)
|
||||
if not full_link.startswith('http'):
|
||||
continue
|
||||
if full_link not in self.visited_urls and (self.is_file_url(full_link) is False):
|
||||
if urlparse(full_link).netloc.replace('www.', '') == self.domain:
|
||||
# try:
|
||||
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
|
||||
# except ValueError:
|
||||
# import traceback
|
||||
# print(traceback.format_exc())
|
||||
# print(full_link)
|
||||
except scrapy.exceptions.TimeoutError:
|
||||
print(f'{response.url}-请求超时取消')
|
||||
|
||||
links = re.findall(r'href=["\']?([^"\'>]+)', text)
|
||||
|
||||
for link in links:
|
||||
full_link = response.urljoin(link)
|
||||
if not full_link.startswith('http'):
|
||||
continue
|
||||
if full_link not in self.visited_urls and (self.is_file_url(full_link) is False):
|
||||
if urlparse(full_link).netloc.replace('www.', '') == self.domain:
|
||||
# try:
|
||||
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
|
||||
# except ValueError:
|
||||
# import traceback
|
||||
# print(traceback.format_exc())
|
||||
# print(full_link)
|
||||
|
||||
def closed(self, reason):
|
||||
# This method will be called when the Spider is about to close
|
||||
print(f'爬取完成: {self.name}_{self.domain}')
|
42
网络巡查.bat
42
网络巡查.bat
|
@ -1,44 +1,14 @@
|
|||
::[Bat To Exe Converter]
|
||||
::
|
||||
::YAwzoRdxOk+EWAjk
|
||||
::fBw5plQjdCuDJE6L5kkgJBpXSTiYP3iqD7EZ+tTo++uVtgMUV+1f
|
||||
::YAwzuBVtJxjWCl3EqQJgSA==
|
||||
::ZR4luwNxJguZRRnk
|
||||
::Yhs/ulQjdF+5
|
||||
::cxAkpRVqdFKZSzk=
|
||||
::cBs/ulQjdF+5
|
||||
::ZR41oxFsdFKZSDk=
|
||||
::eBoioBt6dFKZSDk=
|
||||
::cRo6pxp7LAbNWATEpCI=
|
||||
::egkzugNsPRvcWATEpCI=
|
||||
::dAsiuh18IRvcCxnZtBJQ
|
||||
::cRYluBh/LU+EWAnk
|
||||
::YxY4rhs+aU+JeA==
|
||||
::cxY6rQJ7JhzQF1fEqQJQ
|
||||
::ZQ05rAF9IBncCkqN+0xwdVs0
|
||||
::ZQ05rAF9IAHYFVzEqQJQ
|
||||
::eg0/rx1wNQPfEVWB+kM9LVsJDGQ=
|
||||
::fBEirQZwNQPfEVWB+kM9LVsJDGQ=
|
||||
::cRolqwZ3JBvQF1fEqQJQ
|
||||
::dhA7uBVwLU+EWDk=
|
||||
::YQ03rBFzNR3SWATElA==
|
||||
::dhAmsQZ3MwfNWATElA==
|
||||
::ZQ0/vhVqMQ3MEVWAtB9wSA==
|
||||
::Zg8zqx1/OA3MEVWAtB9wSA==
|
||||
::dhA7pRFwIByZRRnk
|
||||
::Zh4grVQjdCuDJE6L5kkgJBpXSTiYP3iqD7EZ+tTo++uVtgMYTOdf
|
||||
::YB416Ek+ZG8=
|
||||
::
|
||||
::
|
||||
::978f952a14a936cc963da21a135fa983
|
||||
@echo off
|
||||
if "%1" == "h" goto begin
|
||||
mshta vbscript:createobject("wscript.shell").run("""%~nx0"" h",0)(window.close)&&exit
|
||||
:begin
|
||||
setlocal
|
||||
|
||||
REM 设置 Python 可执行文件路径
|
||||
|
||||
set PYTHON_EXECUTABLE=.\runtime\python.exe
|
||||
|
||||
REM 设置要运行的 Python 脚本
|
||||
|
||||
set PYTHON_SCRIPT=start.py
|
||||
|
||||
REM 运行 Python 脚本
|
||||
|
||||
%PYTHON_EXECUTABLE% %PYTHON_SCRIPT%
|
||||
|
|
Loading…
Reference in New Issue