feat: 整合并打包runtime (integrate and package the runtime)
commit 18580ecd28
parent 722f9b2542
@@ -6,8 +6,14 @@ twistd.pid
 ~$*
 *.xlsx
 !biao.xlsx
+!template.xlsx
 wechat_dir/*
-ana_web.csv
-ana_wechat.csv
+*.csv
 .idea/*
 *.pdf
+article/*
+db_folder/*
+runtime/*
+chrome117.exe
+html/*
+excel/*
@@ -0,0 +1,2 @@
+import os
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
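Note: the doubled os.path.dirname resolves BASE_DIR to the project root (one level above the mycode package), so the other modules can build absolute paths that keep working when the project is launched from the packaged runtime rather than from the source checkout's working directory. A minimal sketch of the resolution, assuming base.py sits at <project>/mycode/base.py (the path below is a stand-in for __file__):

    import os

    module_file = os.path.abspath('/project/mycode/base.py')  # stand-in for __file__
    package_dir = os.path.dirname(module_file)                # -> /project/mycode
    base_dir = os.path.dirname(package_dir)                   # -> /project (the value BASE_DIR gets)
    print(base_dir)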
Binary file not shown.
@@ -5,6 +5,10 @@ import time
 from urllib.parse import urlparse
 from pathlib import Path
 import pandas as pd
+from .base import BASE_DIR
+import os
+chrome_driver_file = os.path.join(BASE_DIR, 'mycode', 'chromedriver.exe')
+failed_sites_file = os.path.join(BASE_DIR, 'mycode/failed_sites.xlsx')
 
 def open_website(url):
     # Set up Chrome WebDriver with custom User-Agent
@@ -12,7 +16,7 @@ def open_website(url):
     options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
     prefs = {"profile.managed_default_content_settings.images": 2, 'notifications':2}
     options.add_experimental_option("prefs", prefs)
-    driver = webdriver.Chrome("./chromedriver.exe", options=options)
+    driver = webdriver.Chrome(chrome_driver_file, options=options)
     driver.get(url)
     return driver
 
@@ -40,7 +44,7 @@ def process_page(driver, url, visited_pages, start_domain, data):
     # Extract the content from the page
     content_element = driver.find_element(By.XPATH, '//body')
     content_text = content_element.text
-    print(content_text)
+    # print(content_text)
     # Add URL, Domain, and Content to the data list
     data.append([start_domain, url, content_text])
 
@@ -59,17 +63,17 @@ def process_page(driver, url, visited_pages, start_domain, data):
         try:
             # Check if the new href belongs to the same domain as the original URL
             parsed_href = urlparse(href)
-            if parsed_href.netloc != start_domain:
+            if parsed_href.netloc.replace("www.", "") != start_domain:
                 continue
             # Open the href in the same tab and retrieve data
            driver.get(href)
-            print(href)
+            # print(href)
             # Wait for the page to load
             time.sleep(2)
             # Extract the content from the hyperlink page
             hyperlink_content_element = driver.find_element(By.XPATH, '//body')
             hyperlink_content_text = hyperlink_content_element.text
-            print(hyperlink_content_text)
+            # print(hyperlink_content_text)
             # Add URL, Domain, and Content of the hyperlink to the data list
             data.append([start_domain, href, hyperlink_content_text])
             # Recursively process the page and follow hyperlinks
@@ -86,7 +90,7 @@ def check_href(href, original_url, visited_pages):
     parsed_href = urlparse(href)
     parsed_original_url = urlparse(original_url)
     # Check if the href leads back to the original page
-    if parsed_href.netloc == parsed_original_url.netloc and parsed_href.path == parsed_original_url.path:
+    if parsed_href.netloc.replace("www.", "") == parsed_original_url.netloc.replace("www.", "") and parsed_href.path == parsed_original_url.path:
         return True
     # Check if the href has already been visited
     if href in visited_pages:
@@ -129,15 +133,15 @@ def add_cookies(driver, cookies):
     for name, value in cookies.items():
         driver.add_cookie({'name': name, 'value': value})
 
-def main():
+def chrome_main():
     # Read failed URLs from the list
-    df = pd.read_excel('failed_files.xlsx')
+    df = pd.read_excel(failed_sites_file)
 
     for ind, row in df.iterrows():
         group = row['单位']  # Replace with the actual column name for group
         name = row['主办']
         url = row['地址']
-        domain = urlparse(url).netloc
+        domain = urlparse(url).netloc.replace("www.", "")
 
         # Open the website
         driver = open_website(url)
@@ -156,11 +160,11 @@ def main():
         process_page(driver, url, visited_pages, domain, data)
 
         # Export data to a separate Excel file in the web_dir directory
-        output_filename = f'web_dir/{name}_{domain}.xlsx'
+        output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
         export_to_excel(data, output_filename)
 
         # Close the WebDriver
         driver.quit()
 
 if __name__ == "__main__":
-    main()
+    chrome_main()
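Note: the repeated .replace("www.", "") normalizes hostnames before comparison, so www.example.com and example.com count as the same site (chrome_main already stores the domain without the www. prefix). A small illustration of the check; same_site is a hypothetical helper, not part of the commit, and note that str.replace also strips "www." anywhere in the host, which mirrors the behaviour in the diff:

    from urllib.parse import urlparse

    def same_site(href, start_domain):
        # start_domain is assumed to be stored without a leading "www."
        return urlparse(href).netloc.replace("www.", "") == start_domain

    print(same_site("https://www.example.com/news/1.html", "example.com"))  # True
    print(same_site("https://other.example.org/x", "example.com"))          # False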
@@ -1,12 +1,12 @@
 import pandas as pd
 import os
 import sqlite3
+from .base import BASE_DIR
 
-current_dir = os.getcwd()
-wechat_dir = os.path.join(current_dir, 'article')
-web_dir = os.path.join(current_dir, 'web_dir')
-output_dir = os.path.join(current_dir, 'summary')
-df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
+wechat_dir = os.path.join(BASE_DIR, 'article')
+web_dir = os.path.join(BASE_DIR, 'web_dir')
+output_dir = os.path.join(BASE_DIR, 'summary')
+df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
 
 def trans_to_json():
     json_str = df_s.to_json(orient='records', force_ascii=False)
@@ -14,7 +14,7 @@ def trans_to_json():
         f.write(json_str)
 
 def make_simple_csv_from_db():
-    conn = sqlite3.connect('db_folder/test.db')
+    conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder/test.db'))
     query = "select id, g.nickname, a.title, a.content_url, datetime(a.p_date, 'unixepoch', 'localtime') as pub_date from articles a LEFT JOIN gzhs g on g.biz = a.biz"
     df = pd.read_sql_query(query, conn)
     # 关闭数据库连接
@@ -65,7 +65,7 @@ def ana_wechat():
             output_data.append(output_row)
             index += 1
 
-    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    # output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
 
     return output_data
 
@@ -86,29 +86,17 @@ def ana_web():
            output_row = [
                index,
                row2['name'],
-               "文章标题",
+               "/",
                row['错误表述'],
                row['建议修改词语'],
                row['错误分类'],
-               row2['content_url']
+               row2['url']
            ]
            output_data.append(output_row)
            index += 1
 
-    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    # output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
 
     return output_data
 
-# Run WeChat Analysis
-wechat_results = ana_wechat()
-
-# Run Web Content Analysis
-web_results = ana_web()
-
-# Save results in an Excel file with two sheets
-output_excel_path = os.path.join(output_dir, '总院及下属公司官方公众号巡查结果汇总表.xlsx')
-with pd.ExcelWriter(output_excel_path) as writer:
-    wechat_results.to_excel(writer, sheet_name='公众号', index=False)
-    web_results.to_excel(writer, sheet_name='网站', index=False)
-
-print("Analysis completed and results saved to Excel.")
@@ -28,7 +28,7 @@ for ind, row in df.iterrows():
     group = row['单位']
     name = row['主办']
     url = row['地址']
-    domain = urlparse(url).netloc
+    domain = urlparse(url).netloc.replace('www.', '')
     cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
     process = subprocess.Popen(cmd)
     processes.append(process)
@@ -52,3 +52,4 @@ for ind, row in df.iterrows():
 
 if info_to_save:
     save_info_to_excel(info_to_save, 'failed_files.xlsx')
+
@@ -0,0 +1,60 @@
+import requests
+import sqlite3
+import pandas as pd
+import html2text
+import re
+from urllib.parse import urlparse, urljoin
+import concurrent.futures
+
+class WebSpider:
+    def __init__(self) -> None:
+        self.headers = {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+            'Cache-Control': 'max-age=0',
+            'Proxy-Connection': 'keep-alive',
+            'Referer': 'https://www.baidu.com/',
+            'Upgrade-Insecure-Requests': '1',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
+        }
+        self.visited_urls = set()
+        self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js'])
+        self.futures = []
+
+    def get_one_page(self, group, name, domain, url):
+        if url in self.visited_urls:
+            return
+        self.visited_urls.add(url)
+        r = requests.get(url=url, headers=self.headers, timeout=10)
+        rtext = r.text
+        if rtext:
+            h = html2text.HTML2Text()
+            h.ignore_links = True  # 忽略所有链接
+            text = h.handle(rtext)
+            print(group, name, domain, url)
+            links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
+            for link in links:
+                full_link = urljoin(r.url, link)
+                if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
+                    if urlparse(full_link).netloc.replace('www.', "") == domain:
+                        self.get_one_page(group, name, domain, full_link)
+
+    def start(self):
+        df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
+        # for ind, row in df.iterrows():
+        #     group = row['单位']
+        #     name = row['主办']
+        #     url = row['地址']
+        #     domain = urlparse(url).netloc.replace('www.', '')
+        #     self.get_one_page(group, name, domain, url)
+        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
+            futures = []
+            for ind, row in df.iterrows():
+                group = row['单位']
+                name = row['主办']
+                url = row['地址']
+                domain = urlparse(url).netloc.replace('www.', '')
+                futures.append(executor.submit(self.get_one_page, group, name, domain, url))
+            concurrent.futures.wait(futures)
+
+WebSpider().start()
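Note: the new requests-based spider flattens every fetched page to plain text with html2text before any keyword analysis, the same conversion the Scrapy spider uses. A standalone example of that step (the HTML snippet is made up):

    import html2text

    h = html2text.HTML2Text()
    h.ignore_links = True  # drop link markup, keep only the visible text
    sample = "<html><body><h1>公告</h1><p>正文 <a href='/x'>详情</a></p></body></html>"
    print(h.handle(sample))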
@@ -1,6 +1,4 @@
 scrapy==2.8.0
-scrapyd==1.4.1
-scrapyd-client==1.2.3
 html2text==2020.1.16
 pandas==2.0.0
 openpyxl==3.1.2
Binary file not shown.
@@ -0,0 +1,110 @@
+import os
+import subprocess
+import pandas as pd
+from urllib.parse import urlparse
+import signal
+import sys
+import datetime
+from openpyxl import load_workbook
+
+from mycode.base import BASE_DIR
+from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
+from mycode.crawl_chrome import chrome_main, failed_sites_file
+
+
+
+def save_info_to_excel(info_list, output_filename):
+    df = pd.DataFrame(info_list, columns=['单位', '主办' , '地址'])
+    df.to_excel(output_filename, index=False)
+
+# 定义 SIGINT 信号处理函数
+def sigint_handler(signal, frame):
+    print('收到 Ctrl-C 信号,正在关闭子进程...')
+    for process in processes:
+        process.terminate()
+    print('子进程已关闭,程序退出。')
+    sys.exit(0)
+
+if __name__ == '__main__':
+    print('巡查任务开始。。。')
+    now = datetime.datetime.now()
+    month = now.month
+
+    print('正在组合微信公众号爬取内容。。。')
+    make_simple_csv_from_db()
+    make_wechat_articles_full()
+    print('公众号爬取内容组装完毕!')
+
+    print('开始进行网站爬取。。。')
+
+    df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
+    processes = []
+
+    # 注册 SIGINT 信号处理函数
+    signal.signal(signal.SIGINT, sigint_handler)
+
+    ind = 0
+    for ind, row in df.iterrows():
+        group = row['单位']
+        name = row['主办']
+        url = row['地址']
+        domain = urlparse(url).netloc.replace('www.', '')
+        # output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
+        cmd = ['./runtime/Scripts/scrapy.exe', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
+        # cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
+        process = subprocess.Popen(cmd)
+        processes.append(process)
+
+    # Wait for all processes to finish
+    for process in processes:
+        process.wait()
+
+    print('网站爬取结束,校验中。。。')
+    # Check output file sizes and save information if size is less than 20KB
+    info_to_save = []
+    for ind, row in df.iterrows():
+        group = row['单位']
+        name = row['主办']
+        url = row['地址']
+        domain = urlparse(url).netloc.replace("www.", "")
+        output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
+        if os.path.exists(output_filename):
+            file_size = os.path.getsize(output_filename)
+            if file_size < 20 * 1024:  # Convert KB to bytes
+                info_to_save.append([group, name, url])
+
+    if info_to_save:
+        print('存在未爬取站点,正在调用Chrome继续爬取。。。')
+        save_info_to_excel(info_to_save, failed_sites_file)
+        chrome_main()
+        os.remove(failed_sites_file)
+
+    print('网站爬取完毕!')
+
+    print('开始对比分析所有内容。。。')
+    # Run WeChat Analysis
+    wechat_results = ana_wechat()
+    # Run Web Content Analysis
+    web_results = ana_web()
+
+    # Save results in an Excel file with two sheets
+    output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
+    # with pd.ExcelWriter(output_excel_path) as writer:
+    #     df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    #     df.to_excel(writer, sheet_name='公众号', index=False)
+    #     df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    #     df2.to_excel(writer, sheet_name='网站', index=False)
+    template_path = os.path.join(output_dir, 'template.xlsx')
+    workbook = load_workbook(template_path)
+
+    # 选择要操作的工作表
+    wechat_sheet = workbook['公众号']
+    web_sheet = workbook['网站']
+    for row in wechat_results:
+        wechat_sheet.append(row)
+    for row in web_results:
+        web_sheet.append(row)
+    workbook.save(output_excel_path)
+    workbook.close()
+    print('巡查任务执行完毕, 请查看summary文件夹, 可手动校对')
+    os.system("pause")
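Note: the runner now launches the scrapy.exe bundled under runtime/Scripts instead of whatever Scrapy happens to be on PATH, one subprocess per site, and then waits for all of them. A reduced sketch of that launch loop (the sample site row is made up; the real rows come from web_sites.xlsx, and the path assumes the Windows runtime bundle):

    import subprocess

    scrapy_exe = './runtime/Scripts/scrapy.exe'  # path used by the commit
    sites = [('集团', '示例单位', 'example.com', 'https://example.com')]  # (group, name, domain, url)

    processes = []
    for group, name, domain, url in sites:
        cmd = [scrapy_exe, 'crawl', 'basespider',
               '-a', f'domain={domain}', '-a', f'start_url={url}',
               '-a', f'name={name}', '-a', f'group={group}',
               '-o', f'web_dir/{name}_{domain}.xlsx']
        processes.append(subprocess.Popen(cmd))

    for p in processes:
        p.wait()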
Binary file not shown.
517223  wechat_dir/articles_full.csv
File diff suppressed because one or more lines are too long
@@ -7,7 +7,6 @@ from openpyxl import Workbook, load_workbook
 
 # useful for handling different item types with a single interface
 from scrapy.exceptions import IgnoreRequest
-import psycopg2
 
 # class ZcspiderPipeline2:
 #     """
@@ -28,10 +27,17 @@ import psycopg2
 class ZcspiderPipeline:
 
     def open_spider(self, spider):
-        print('初始化数据库连接')
-        self.conn = psycopg2.connect(host="49.232.14.174",database="zcspider", user="postgres", password="zcDsj2021")
-        self.cur = self.conn.cursor()
-        self.cur.execute("delete from content where domain = %s", (spider.domain, ))
+        self.file_name = spider.output
+        if os.path.exists(self.file_name):
+            os.remove(self.file_name)
+        self.wb = Workbook()
+        self.ws = self.wb.active
+        self.ws.append(['group', 'name', 'domain', 'url', 'text'])
+
+        # print('初始化数据库连接')
+        # self.conn = psycopg2.connect(host="49.232.14.174",database="zcspider", user="postgres", password="zcDsj2021")
+        # self.cur = self.conn.cursor()
+        # self.cur.execute("delete from content where domain = %s", (spider.domain, ))
         # rows = self.cur.fetchall()
         # spider.visited_urls_last = [i[0] for i in rows] if len(rows)>1 else []
 
@@ -45,18 +51,22 @@ class ZcspiderPipeline:
    # return request
 
     def process_item(self, item, spider):
-        try:
-            self.cur.execute("INSERT INTO content (domain, url, text) VALUES(%s, %s, %s)",
-                (item['domain'], item['url'], item['text']))
-            self.conn.commit()
-        except:
-            self.conn.rollback()
-            raise
+        # try:
+        #     self.cur.execute("INSERT INTO content (domain, url, text) VALUES(%s, %s, %s)",
+        #         (item['domain'], item['url'], item['text']))
+        #     self.conn.commit()
+        # except:
+        #     self.conn.rollback()
+        #     raise
+        line = [item['group'], item['name'], item['domain'], item['url'], item['text']]
+        self.ws.append(line)
+        self.wb.save(self.file_name)
         return item
 
     # 结束,关闭连接
     def close_spider(self, spider):
-        # 关闭游标
-        self.cur.close()
-        # 关闭连接
-        self.conn.close()
+        # # 关闭游标
+        # self.cur.close()
+        # # 关闭连接
+        # self.conn.close()
+        self.wb.close()
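Note: the pipeline replaces the PostgreSQL writes with an in-memory openpyxl Workbook that is re-saved after every item, so partial results stay on disk if a crawl dies mid-run, at the cost of rewriting the file each time. A compact standalone sketch of that write path (the file name and sample row are illustrative):

    from openpyxl import Workbook

    wb = Workbook()
    ws = wb.active
    ws.append(['group', 'name', 'domain', 'url', 'text'])  # header row, as in open_spider
    ws.append(['集团', '示例单位', 'example.com', 'https://example.com/', '正文…'])  # one scraped item
    wb.save('demo_output.xlsx')  # process_item saves after each appended row
    wb.close()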
@@ -37,10 +37,15 @@ ROBOTSTXT_OBEY = False
 #TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
-#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-#    "Accept-Language": "en",
-#}
+DEFAULT_REQUEST_HEADERS = {
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+    'Cache-Control': 'max-age=0',
+    'Proxy-Connection': 'keep-alive',
+    'Referer': 'https://www.baidu.com/',
+    'Upgrade-Insecure-Requests': '1',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
+}
 
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
@@ -91,11 +96,11 @@ ROBOTSTXT_OBEY = False
 REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
 TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 FEED_EXPORT_ENCODING = 'gb18030'
-LOG_LEVEL = 'INFO'
-DOWNLOAD_TIMEOUT = 30
+LOG_LEVEL = 'ERROR'
+DOWNLOAD_TIMEOUT = 10
 
 ITEM_PIPELINES = {
-    # 'zcspider.pipelines.ZcspiderPipeline2': 300,
+    # 'zcspider.pipelines.ZcspiderPipeline': 300,
 }
 
 FEED_EXPORTERS = {
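Note: DEFAULT_REQUEST_HEADERS is a stock Scrapy setting; each header in the dict is applied to outgoing requests that do not set that header themselves, so the whole crawl presents the same browser-like profile the requests fallback already uses. A minimal way to observe the effect (hypothetical spider, not part of the commit):

    import scrapy

    class HeaderEchoSpider(scrapy.Spider):
        name = 'header_echo'  # hypothetical
        start_urls = ['https://httpbin.org/headers']

        def parse(self, response):
            # the User-Agent / Accept-Language configured in settings.py show up in the echoed headers
            self.logger.info(response.text)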
@@ -25,7 +25,8 @@ class BaseSpider(scrapy.Spider):
         self.start_urls = [start_url]
         self.name = name
         self.group = group
-        self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico'])
+        self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.dat', '.css', '.js'])
+        print(f"爬取开始: {name}-{domain}")
 
     def start_requests(self):
         for url in self.start_urls:
@@ -33,27 +34,28 @@ class BaseSpider(scrapy.Spider):
             yield r
 
     def request2(self, fail):
-        rurl = fail.request.url,
+        rurl = fail.request.url
         self.logger.info(f'{rurl} 使用requests继续请求')
         r = requests.get(url=fail.request.url, headers=self.headers, timeout=20)
-        rtext = r.text
-        h = html2text.HTML2Text()
-        h.ignore_links = True # 忽略所有链接
-        text = h.handle(rtext)
-        yield {
-            'group': self.group,
-            'name': self.name,
-            'domain': self.domain,
-            'url': rurl,
-            'text': text,
-        }
-        self.visited_urls.add(rurl)
-        links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
-        for link in links:
-            full_link = urljoin(r.url, link)
-            if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
-                if urlparse(full_link).netloc == self.domain:
-                    yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
+        if r.status_code < 400:
+            rtext = r.text
+            h = html2text.HTML2Text()
+            h.ignore_links = True # 忽略所有链接
+            text = h.handle(rtext)
+            yield {
+                'group': self.group,
+                'name': self.name,
+                'domain': self.domain,
+                'url': rurl,
+                'text': text,
+            }
+            self.visited_urls.add(rurl)
+            links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
+            for link in links:
+                full_link = urljoin(r.url, link)
+                if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
+                    if urlparse(full_link).netloc.replace('www.', '') == self.domain:
+                        yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
 
     def parse(self, response):
         self.visited_urls.add(response.url)
@@ -64,17 +66,21 @@ class BaseSpider(scrapy.Spider):
         text = h.handle(response.text)
         # except:
         #     text = h.handle(response.body.decode(encoding='gb18030'))
-        yield {
-            'group': self.group,
-            'name': self.name,
-            'domain': self.domain,
-            'url': response.url,
-            'text': text,
-        }
+        if response.status < 400:
+            yield {
+                'group': self.group,
+                'name': self.name,
+                'domain': self.domain,
+                'url': response.url,
+                'text': text,
+            }
 
         for link in response.css("a::attr('href')").getall():
             full_link = response.urljoin(link)
             if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
-                if urlparse(full_link).netloc == self.domain:
+                if urlparse(full_link).netloc.replace('www.', '') == self.domain:
                     yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
+
+    def closed(self, reason):
+        # This method will be called when the Spider is about to close
+        print(f'爬取完成: {self.name}-{self.domain}')
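Note: dropping the trailing comma in request2 fixes a real bug: `rurl = fail.request.url,` bound rurl to a one-element tuple, so the yielded 'url' field and the visited_urls entries held tuples instead of strings. Quick demonstration:

    url = 'https://example.com/page',   # trailing comma -> 1-tuple
    print(type(url), url)               # <class 'tuple'> ('https://example.com/page',)

    url = 'https://example.com/page'    # without the comma
    print(type(url), url)               # <class 'str'> https://example.com/page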