feat: integrate and package the runtime

caoqianming 2023-08-25 15:01:45 +08:00
parent 722f9b2542
commit 18580ecd28
18 changed files with 285 additions and 517318 deletions

.gitignore

@@ -6,8 +6,14 @@ twistd.pid
~$*
*.xlsx
!biao.xlsx
!template.xlsx
wechat_dir/*
ana_web.csv
ana_wechat.csv
*.csv
.idea/*
*.pdf
article/*
db_folder/*
runtime/*
chrome117.exe
html/*
excel/*

mycode/__init__.py (new file, empty)

mycode/base.py (new file)

@@ -0,0 +1,2 @@
import os
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
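
A quick sketch of how this resolves at run time, assuming base.py sits at <project>/mycode/base.py: two dirname calls walk up from the file to the project root, so other modules can build absolute paths instead of relying on the current working directory.

import os

# assuming this file is <project>/mycode/base.py:
#   os.path.abspath(__file__)   -> <project>/mycode/base.py
#   dirname(...)                -> <project>/mycode
#   dirname(dirname(...))       -> <project>
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# downstream modules then join against BASE_DIR, e.g. in crawl_chrome.py:
chromedriver_path = os.path.join(BASE_DIR, 'mycode', 'chromedriver.exe')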

mycode/crawl_chrome.py

@@ -5,6 +5,10 @@ import time
from urllib.parse import urlparse
from pathlib import Path
import pandas as pd
from .base import BASE_DIR
import os
chrome_driver_file = os.path.join(BASE_DIR, 'mycode', 'chromedriver.exe')
failed_sites_file = os.path.join(BASE_DIR, 'mycode/failed_sites.xlsx')
def open_website(url):
# Set up Chrome WebDriver with custom User-Agent
@@ -12,7 +16,7 @@ def open_website(url):
options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
prefs = {"profile.managed_default_content_settings.images": 2, 'notifications':2}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome("./chromedriver.exe", options=options)
driver = webdriver.Chrome(chrome_driver_file, options=options)
driver.get(url)
return driver
@@ -40,7 +44,7 @@ def process_page(driver, url, visited_pages, start_domain, data):
# Extract the content from the page
content_element = driver.find_element(By.XPATH, '//body')
content_text = content_element.text
print(content_text)
# print(content_text)
# Add URL, Domain, and Content to the data list
data.append([start_domain, url, content_text])
@@ -59,17 +63,17 @@ def process_page(driver, url, visited_pages, start_domain, data):
try:
# Check if the new href belongs to the same domain as the original URL
parsed_href = urlparse(href)
if parsed_href.netloc != start_domain:
if parsed_href.netloc.replace("www.", "") != start_domain:
continue
# Open the href in the same tab and retrieve data
driver.get(href)
print(href)
# print(href)
# Wait for the page to load
time.sleep(2)
# Extract the content from the hyperlink page
hyperlink_content_element = driver.find_element(By.XPATH, '//body')
hyperlink_content_text = hyperlink_content_element.text
print(hyperlink_content_text)
# print(hyperlink_content_text)
# Add URL, Domain, and Content of the hyperlink to the data list
data.append([start_domain, href, hyperlink_content_text])
# Recursively process the page and follow hyperlinks
@@ -86,7 +90,7 @@ def check_href(href, original_url, visited_pages):
parsed_href = urlparse(href)
parsed_original_url = urlparse(original_url)
# Check if the href leads back to the original page
if parsed_href.netloc == parsed_original_url.netloc and parsed_href.path == parsed_original_url.path:
if parsed_href.netloc.replace("www.", "") == parsed_original_url.netloc.replace("www.", "") and parsed_href.path == parsed_original_url.path:
return True
# Check if the href has already been visited
if href in visited_pages:
@@ -129,15 +133,15 @@ def add_cookies(driver, cookies):
for name, value in cookies.items():
driver.add_cookie({'name': name, 'value': value})
def main():
def chrome_main():
# Read failed URLs from the list
df = pd.read_excel('failed_files.xlsx')
df = pd.read_excel(failed_sites_file)
for ind, row in df.iterrows():
group = row['单位'] # Replace with the actual column name for group
name = row['主办']
url = row['地址']
domain = urlparse(url).netloc
domain = urlparse(url).netloc.replace("www.", "")
# Open the website
driver = open_website(url)
@@ -156,11 +160,11 @@ def main():
process_page(driver, url, visited_pages, domain, data)
# Export data to a separate Excel file in the web_dir directory
output_filename = f'web_dir/{name}_{domain}.xlsx'
output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
export_to_excel(data, output_filename)
# Close the WebDriver
driver.quit()
if __name__ == "__main__":
main()
chrome_main()
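
The recurring change in this file is the domain comparison: netloc is normalized by stripping "www." before matching, so links on www.example.com stay in scope for example.com. A minimal sketch of that check in isolation (the same_domain helper is illustrative only, not part of this commit):

from urllib.parse import urlparse

def same_domain(href, start_domain):
    # start_domain is stored without a leading "www.", as in chrome_main()
    # note: str.replace removes every "www." occurrence, mirroring the diff
    return urlparse(href).netloc.replace("www.", "") == start_domain

# e.g. same_domain("https://www.example.com/news/1.html", "example.com") -> True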

mycode/main.py

@@ -1,12 +1,12 @@
import pandas as pd
import os
import sqlite3
from .base import BASE_DIR
current_dir = os.getcwd()
wechat_dir = os.path.join(current_dir, 'article')
web_dir = os.path.join(current_dir, 'web_dir')
output_dir = os.path.join(current_dir, 'summary')
df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
wechat_dir = os.path.join(BASE_DIR, 'article')
web_dir = os.path.join(BASE_DIR, 'web_dir')
output_dir = os.path.join(BASE_DIR, 'summary')
df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
def trans_to_json():
json_str = df_s.to_json(orient='records', force_ascii=False)
@@ -14,7 +14,7 @@ def trans_to_json():
f.write(json_str)
def make_simple_csv_from_db():
conn = sqlite3.connect('db_folder/test.db')
conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder/test.db'))
query = "select id, g.nickname, a.title, a.content_url, datetime(a.p_date, 'unixepoch', 'localtime') as pub_date from articles a LEFT JOIN gzhs g on g.biz = a.biz"
df = pd.read_sql_query(query, conn)
# close the database connection
@@ -65,7 +65,7 @@ def ana_wechat():
output_data.append(output_row)
index += 1
output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
# output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
return output_data
@@ -86,29 +86,17 @@ def ana_web():
output_row = [
index,
row2['name'],
"文章标题",
"/",
row['错误表述'],
row['建议修改词语'],
row['错误分类'],
row2['content_url']
row2['url']
]
output_data.append(output_row)
index += 1
output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
# output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
return output_data
# Run WeChat Analysis
wechat_results = ana_wechat()
# Run Web Content Analysis
web_results = ana_web()
# Save results in an Excel file with two sheets
output_excel_path = os.path.join(output_dir, '总院及下属公司官方公众号巡查结果汇总表.xlsx')
with pd.ExcelWriter(output_excel_path) as writer:
wechat_results.to_excel(writer, sheet_name='公众号', index=False)
web_results.to_excel(writer, sheet_name='网站', index=False)
print("Analysis completed and results saved to Excel.")
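
For context, a minimal sketch of what the make_simple_csv_from_db hunk above amounts to, assuming the function simply dumps the joined query to a CSV; only the connection and query lines are visible in the diff, and the output filename here is an assumption.

import os
import sqlite3
import pandas as pd
from .base import BASE_DIR

def make_simple_csv_from_db_sketch():
    conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder/test.db'))
    query = ("select id, g.nickname, a.title, a.content_url, "
             "datetime(a.p_date, 'unixepoch', 'localtime') as pub_date "
             "from articles a LEFT JOIN gzhs g on g.biz = a.biz")
    df = pd.read_sql_query(query, conn)
    conn.close()  # close the database connection
    # hypothetical output path; the real function may post-process further
    df.to_csv(os.path.join(BASE_DIR, 'articles_simple.csv'), index=False)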


@@ -28,7 +28,7 @@ for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = row['地址']
domain = urlparse(url).netloc
domain = urlparse(url).netloc.replace('www.', '')
cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
process = subprocess.Popen(cmd)
processes.append(process)
@@ -52,3 +52,4 @@ for ind, row in df.iterrows():
if info_to_save:
save_info_to_excel(info_to_save, 'failed_files.xlsx')
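
The failed-site list written here feeds the Chrome fallback; the criterion (visible in web3.py below) is a simple file-size heuristic. A sketch of that check on its own, assuming the same 20 KB threshold:

import os

def needs_chrome_retry(output_filename, min_bytes=20 * 1024):
    # mirrors the launcher: a workbook that exists but is under ~20 KB
    # counts as a failed crawl and is handed to the Chrome-based fallback
    return os.path.exists(output_filename) and os.path.getsize(output_filename) < min_bytes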

mycode/web2.py (new file)

@@ -0,0 +1,60 @@
import requests
import sqlite3
import pandas as pd
import html2text
import re
from urllib.parse import urlparse, urljoin
import concurrent.futures
class WebSpider:
def __init__(self) -> None:
self.headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Proxy-Connection': 'keep-alive',
'Referer': 'https://www.baidu.com/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
}
self.visited_urls = set()
self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js'])
self.futures = []
def get_one_page(self, group, name, domain, url):
if url in self.visited_urls:
return
self.visited_urls.add(url)
r = requests.get(url=url, headers=self.headers, timeout=10)
rtext = r.text
if rtext:
h = html2text.HTML2Text()
h.ignore_links = True  # ignore all links
text = h.handle(rtext)
print(group, name, domain, url)
links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
for link in links:
full_link = urljoin(r.url, link)
if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
if urlparse(full_link).netloc.replace('www.', "") == domain:
self.get_one_page(group, name, domain, full_link)
def start(self):
df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
# for ind, row in df.iterrows():
# group = row['单位']
# name = row['主办']
# url = row['地址']
# domain = urlparse(url).netloc.replace('www.', '')
# self.get_one_page(group, name, domain, url)
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
futures = []
for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = row['地址']
domain = urlparse(url).netloc.replace('www.', '')
futures.append(executor.submit(self.get_one_page, group, name, domain, url))
concurrent.futures.wait(futures)
WebSpider().start()
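
web2.py, the Scrapy spider and the requests fallback all run pages through html2text the same way; a minimal, self-contained sketch of that conversion step:

import html2text

def html_to_text(html):
    h = html2text.HTML2Text()
    h.ignore_links = True  # drop hyperlinks, keep the readable text
    return h.handle(html)

# roughly: html_to_text('<p>Hello <a href="/x">world</a></p>') -> 'Hello world\n\n'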


@@ -1,6 +1,4 @@
scrapy==2.8.0
scrapyd==1.4.1
scrapyd-client==1.2.3
html2text==2020.1.16
pandas==2.0.0
openpyxl==3.1.2

summary/template.xlsx (new binary file, not shown)

web3.py (new file)

@@ -0,0 +1,110 @@
import os
import subprocess
import pandas as pd
from urllib.parse import urlparse
import signal
import sys
import datetime
from openpyxl import load_workbook
from mycode.base import BASE_DIR
from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
from mycode.crawl_chrome import chrome_main, failed_sites_file
def save_info_to_excel(info_list, output_filename):
df = pd.DataFrame(info_list, columns=['单位', '主办' , '地址'])
df.to_excel(output_filename, index=False)
# define the SIGINT signal handler
def sigint_handler(signal, frame):
print('收到 Ctrl-C 信号,正在关闭子进程...')
for process in processes:
process.terminate()
print('子进程已关闭,程序退出。')
sys.exit(0)
if __name__ == '__main__':
print('巡查任务开始。。。')
now = datetime.datetime.now()
month = now.month
print('正在组合微信公众号爬取内容。。。')
make_simple_csv_from_db()
make_wechat_articles_full()
print('公众号爬取内容组装完毕!')
print('开始进行网站爬取。。。')
df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
processes = []
# register the SIGINT signal handler
signal.signal(signal.SIGINT, sigint_handler)
ind = 0
for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = row['地址']
domain = urlparse(url).netloc.replace('www.', '')
# output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
cmd = ['./runtime/Scripts/scrapy.exe', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
# cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
process = subprocess.Popen(cmd)
processes.append(process)
# Wait for all processes to finish
for process in processes:
process.wait()
print('网站爬取结束,校验中。。。')
# Check output file sizes and save information if size is less than 20KB
info_to_save = []
for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = row['地址']
domain = urlparse(url).netloc.replace("www.", "")
output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
if os.path.exists(output_filename):
file_size = os.path.getsize(output_filename)
if file_size < 20 * 1024: # Convert KB to bytes
info_to_save.append([group, name, url])
if info_to_save:
print('存在未爬取站点,正在调用Chrome继续爬取。。。')
save_info_to_excel(info_to_save, failed_sites_file)
chrome_main()
os.remove(failed_sites_file)
print('网站爬取完毕!')
print('开始对比分析所有内容。。。')
# Run WeChat Analysis
wechat_results = ana_wechat()
# Run Web Content Analysis
web_results = ana_web()
# Save results in an Excel file with two sheets
output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
# with pd.ExcelWriter(output_excel_path) as writer:
# df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
# df.to_excel(writer, sheet_name='公众号', index=False)
# df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
# df2.to_excel(writer, sheet_name='网站', index=False)
template_path = os.path.join(output_dir, 'template.xlsx')
workbook = load_workbook(template_path)
# select the worksheets to write to
wechat_sheet = workbook['公众号']
web_sheet = workbook['网站']
for row in wechat_results:
wechat_sheet.append(row)
for row in web_results:
web_sheet.append(row)
workbook.save(output_excel_path)
workbook.close()
print('巡查任务执行完毕, 请查看summary文件夹, 可手动校对')
os.system("pause")
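
The launcher now calls the bundled ./runtime/Scripts/scrapy.exe directly, which is the point of packaging the runtime. A hedged sketch of how that command line is assembled, with a fallback to a PATH-installed scrapy that is not in this commit and is shown only for illustration (the example.com values are placeholders):

import os
import shutil

bundled_scrapy = os.path.join('runtime', 'Scripts', 'scrapy.exe')
scrapy_cmd = bundled_scrapy if os.path.exists(bundled_scrapy) else (shutil.which('scrapy') or 'scrapy')
cmd = [scrapy_cmd, 'crawl', 'basespider',
       '-a', 'domain=example.com',
       '-a', 'start_url=https://example.com',
       '-a', 'name=demo', '-a', 'group=demo',
       '-o', 'web_dir/demo_example.com.xlsx']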

wechat.exe (new binary file, not shown)

File diff suppressed because one or more lines are too long

zcspider/pipelines.py

@@ -7,7 +7,6 @@ from openpyxl import Workbook, load_workbook
# useful for handling different item types with a single interface
from scrapy.exceptions import IgnoreRequest
import psycopg2
# class ZcspiderPipeline2:
# """
@@ -28,10 +27,17 @@ import psycopg2
class ZcspiderPipeline:
def open_spider(self, spider):
print('初始化数据库连接')
self.conn = psycopg2.connect(host="49.232.14.174",database="zcspider", user="postgres", password="zcDsj2021")
self.cur = self.conn.cursor()
self.cur.execute("delete from content where domain = %s", (spider.domain, ))
self.file_name = spider.output
if os.path.exists(self.file_name):
os.remove(self.file_name)
self.wb = Workbook()
self.ws = self.wb.active
self.ws.append(['group', 'name', 'domain', 'url', 'text'])
# print('初始化数据库连接')
# self.conn = psycopg2.connect(host="49.232.14.174",database="zcspider", user="postgres", password="zcDsj2021")
# self.cur = self.conn.cursor()
# self.cur.execute("delete from content where domain = %s", (spider.domain, ))
# rows = self.cur.fetchall()
# spider.visited_urls_last = [i[0] for i in rows] if len(rows)>1 else []
@@ -45,18 +51,22 @@ class ZcspiderPipeline:
# return request
def process_item(self, item, spider):
try:
self.cur.execute("INSERT INTO content (domain, url, text) VALUES(%s, %s, %s)",
(item['domain'], item['url'], item['text']))
self.conn.commit()
except:
self.conn.rollback()
raise
# try:
# self.cur.execute("INSERT INTO content (domain, url, text) VALUES(%s, %s, %s)",
# (item['domain'], item['url'], item['text']))
# self.conn.commit()
# except:
# self.conn.rollback()
# raise
line = [item['group'], item['name'], item['domain'], item['url'], item['text']]
self.ws.append(line)
self.wb.save(self.file_name)
return item
# done; close the connection
def close_spider(self, spider):
# close the cursor
self.cur.close()
# close the connection
self.conn.close()
# # close the cursor
# self.cur.close()
# # close the connection
# self.conn.close()
self.wb.close()

zcspider/settings.py

@@ -37,10 +37,15 @@ ROBOTSTXT_OBEY = False
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Proxy-Connection': 'keep-alive',
'Referer': 'https://www.baidu.com/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
@@ -91,11 +96,11 @@ ROBOTSTXT_OBEY = False
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = 'gb18030'
LOG_LEVEL = 'INFO'
DOWNLOAD_TIMEOUT = 30
LOG_LEVEL = 'ERROR'
DOWNLOAD_TIMEOUT = 10
ITEM_PIPELINES = {
# 'zcspider.pipelines.ZcspiderPipeline2': 300,
# 'zcspider.pipelines.ZcspiderPipeline': 300,
}
FEED_EXPORTERS = {


@@ -25,7 +25,8 @@ class BaseSpider(scrapy.Spider):
self.start_urls = [start_url]
self.name = name
self.group = group
self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico'])
self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.dat', '.css', '.js'])
print(f"爬取开始: {name}-{domain}")
def start_requests(self):
for url in self.start_urls:
@@ -33,27 +34,28 @@
yield r
def request2(self, fail):
rurl = fail.request.url,
rurl = fail.request.url
self.logger.info(f'{rurl} 使用requests继续请求')
r = requests.get(url=fail.request.url, headers=self.headers, timeout=20)
rtext = r.text
h = html2text.HTML2Text()
h.ignore_links = True  # ignore all links
text = h.handle(rtext)
yield {
'group': self.group,
'name': self.name,
'domain': self.domain,
'url': rurl,
'text': text,
}
self.visited_urls.add(rurl)
links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
for link in links:
full_link = urljoin(r.url, link)
if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
if urlparse(full_link).netloc == self.domain:
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
if r.status_code < 400:
rtext = r.text
h = html2text.HTML2Text()
h.ignore_links = True  # ignore all links
text = h.handle(rtext)
yield {
'group': self.group,
'name': self.name,
'domain': self.domain,
'url': rurl,
'text': text,
}
self.visited_urls.add(rurl)
links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
for link in links:
full_link = urljoin(r.url, link)
if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
if urlparse(full_link).netloc.replace('www.', '') == self.domain:
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
def parse(self, response):
self.visited_urls.add(response.url)
@@ -64,17 +66,21 @@
text = h.handle(response.text)
# except:
# text = h.handle(response.body.decode(encoding='gb18030'))
if response.status < 400:
yield {
'group': self.group,
'name': self.name,
'domain': self.domain,
'url': response.url,
'text': text,
}
yield {
'group': self.group,
'name': self.name,
'domain': self.domain,
'url': response.url,
'text': text,
}
for link in response.css("a::attr('href')").getall():
full_link = response.urljoin(link)
if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
if urlparse(full_link).netloc.replace('www.', '') == self.domain:
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
for link in response.css("a::attr('href')").getall():
full_link = response.urljoin(link)
if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
if urlparse(full_link).netloc == self.domain:
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
def closed(self, reason):
# This method will be called when the Spider is about to close
print(f'爬取完成: {self.name}-{self.domain}')