feat: optimize crawler / output analysis report

This commit is contained in:
caoqianming 2023-11-09 17:51:54 +08:00
parent e5ff671c97
commit 7bf87671b9
12 changed files with 222 additions and 105 deletions

View File

@@ -478,7 +478,7 @@
</rect>
</property>
<property name="text">
<string>20</string>
<string>5</string>
</property>
</widget>
<widget class="QPushButton" name="bStart">

View File

@@ -7,6 +7,7 @@ from pathlib import Path
import pandas as pd
from .base import BASE_DIR
import os
from selenium.common.exceptions import TimeoutException
chrome_driver_file = os.path.join(BASE_DIR, 'mycode', 'chromedriver.exe')
failed_sites_file = os.path.join(BASE_DIR, 'mycode/failed_sites.xlsx')
@@ -160,7 +161,6 @@ def add_cookies(driver, cookies):
driver.add_cookie({'name': name, 'value': value})
def chrom_main_from_list(sites):
driver = init_driver()
for ind, item in enumerate(sites):
group = item[0] # Replace with the actual column name for group
name = item[1]
@@ -169,29 +169,40 @@ def chrom_main_from_list(sites):
if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:
continue
url = fix_url_scheme(url)
print(url)
driver = init_driver()
# Open the website
driver.get(url)
# driver.get(url)
# Retrieve cookies from previous session
cookies = get_cookies_from_previous_session(driver)
# Add cookies to the WebDriver
add_cookies(driver, cookies)
# # Retrieve cookies from previous session
# cookies = get_cookies_from_previous_session(driver)
# # Add cookies to the WebDriver
# add_cookies(driver, cookies)
# Initialize the set to store visited pages
visited_pages = set()
# Initialize the data list
data = []
try:
# Set the page load timeout to 10 seconds
driver.set_page_load_timeout(10)
# Set the script execution timeout to 10 seconds
driver.set_script_timeout(10)
# Process the starting page and follow hyperlinks recursively
process_page(driver, url, visited_pages, domain, data, group, name)
# Put the per-site crawling logic here, e.g. opening pages, clicking buttons, etc.
# ...
process_page(driver, url, visited_pages, domain, data, group, name)
except TimeoutException:
# When a timeout occurs, handle it here, e.g. skip the site or report the error
print("超时异常")
driver.quit()
# Export data to a separate Excel file in the web_dir directory
output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
export_to_excel(data, output_filename)
# Close the WebDriver
driver.quit()
# driver.quit()
def chrome_main():
# Read failed URLs from the list
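The change above wraps each site's crawl in Selenium's page-load and script timeouts, so an unresponsive site raises TimeoutException instead of stalling the whole batch. A minimal sketch of the same pattern, assuming selenium is installed and a chromedriver is available on PATH; crawl_one is a hypothetical helper, not part of this codebase:

from selenium import webdriver
from selenium.common.exceptions import TimeoutException

def crawl_one(url):
    # A fresh driver per site, so a failed site does not poison later ones
    driver = webdriver.Chrome()
    try:
        # Abort page loads and injected scripts that take longer than 10 seconds
        driver.set_page_load_timeout(10)
        driver.set_script_timeout(10)
        driver.get(url)
        return driver.title
    except TimeoutException:
        # Treat a slow site as failed and move on instead of blocking the loop
        return None
    finally:
        driver.quit()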

View File

@@ -2,6 +2,8 @@ import pandas as pd
import os
import sqlite3
from mycode.base import BASE_DIR
import re
from openpyxl import load_workbook
wechat_dir = os.path.join(BASE_DIR, 'article')
web_dir = os.path.join(BASE_DIR, 'web_dir')
@@ -23,6 +25,98 @@ def make_simple_csv_from_db():
df.to_csv(os.path.join(wechat_dir, 'articles.csv'), index=False)
def get_cbma_info_from_db_and_ana(year: str = '2023'):
conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder/test.db'))
query = f'''
SELECT
id,
strftime('%Y年%m月%d', datetime(a.p_date, 'unixepoch', 'localtime')) as pub_date,
g.nickname,
a.title,
a.content_url,
a.read_num
FROM
articles a
LEFT JOIN
gzhs g ON g.biz = a.biz
WHERE
pub_date > '{year}'
AND
g.biz = 'MzIzMDU4Njg3MA=='
ORDER BY
pub_date
'''
df = pd.read_sql_query(query, conn)
# Close the database connection
conn.close()
for ind, row in df.iterrows():
id = row['id']
full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.md')
try:
with open(full_path, encoding='utf-8') as f:
content = f.read()
# Extract the source from the article content
a_match = re.findall('来源丨(.*?)\n', content)
a_list = []
if a_match:
# a = a_match[0].replace('\xa0', '、').replace(' ', '、')
# a = re.sub(r'、+', '、', a)
a = re.sub(r'[\xa0\s]+', '', a_match[0])
df.at[ind, 'source'] = a
except FileNotFoundError:
print(full_path + '---不存在')
# Fill the results into the Excel template
template_path = os.path.join(BASE_DIR, 'summary/template_cbma.xlsx')
workbook = load_workbook(template_path)
sheet = workbook['公众号更新数']
sheet.cell(row=1, column=1, value=f'关于{year}年度中国建材总院新媒体更新情况明细表\n(官微)')
for ind, row in df.iterrows():
sheet.cell(row=ind+3, column=1, value=str(ind+1))
sheet.cell(row=ind+3, column=2, value=row['pub_date'])
sheet.cell(row=ind+3, column=3, value=row['title'])
sheet.cell(row=ind+3, column=4, value=row['source'])
sheet.cell(row=ind+3, column=6, value=row['read_num'])
sheet.cell(row=ind+3, column=7, value=row['content_url'])
output_path = os.path.join(BASE_DIR, f'summary/{year}年_cbma.xlsx')
workbook.save(output_path)
# Start tallying the scores
t_1 = (df['source'].str.contains('瑞泰科技')).sum()
t_2 = (df['source'].str.contains('国检集团')).sum()
t_3 = (df['source'].str.contains('中材高新')).sum()
t_4 = (df['source'].str.contains('哈玻院')).sum()
t_5 = (df['source'].str.contains('中国新材院')).sum()
t_6 = (df['source'].str.contains('秦皇岛院')).sum()
t_7 = (df['source'].str.contains('西安墙材院')).sum()
t_8 = (df['source'].str.contains('咸阳陶瓷院')).sum()
t_9 = (df['source'].str.contains('钟表所')).sum()
t_10 = (df['source'].str.contains('总院北分')).sum()
t_11 = (df['source'].str.contains('中岩科技')).sum()
t_12 = (df['source'].str.contains('水泥新材院')).sum()
t_13 = (df['source'].str.contains('中建材科创院')).sum()
t_14 = (df['source'].str.contains('科建苑')).sum()
template_cal_path = os.path.join(BASE_DIR, 'summary/tempalte_cbma_cal.xlsx')
workbook2 = load_workbook(template_cal_path)
sheet2 = workbook2['打分表']
sheet2.cell(row=1, column=1, value=f'中国建材总院宣传工作计分表({year}年度)')
sheet2.cell(row=6, column=5, value=t_1)
sheet2.cell(row=6, column=7, value=t_2)
sheet2.cell(row=6, column=9, value=t_3)
sheet2.cell(row=6, column=11, value=t_4)
sheet2.cell(row=6, column=13, value=t_5)
sheet2.cell(row=6, column=15, value=t_6)
sheet2.cell(row=6, column=17, value=t_7)
sheet2.cell(row=6, column=19, value=t_8)
sheet2.cell(row=6, column=21, value=t_9)
sheet2.cell(row=6, column=23, value=t_10)
sheet2.cell(row=6, column=25, value=t_11)
sheet2.cell(row=6, column=27, value=t_12)
sheet2.cell(row=6, column=29, value=t_13)
sheet2.cell(row=6, column=31, value=t_14)
output_path2 = os.path.join(BASE_DIR, f'summary/{year}年_cbma_cal.xlsx')
workbook2.save(output_path2)
return output_path, output_path2
def make_wechat_articles_full():
df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'))
df['content'] = ''
@@ -110,5 +204,5 @@ def ana_web():
return output_data
if __name__ == "__main__":
ana_web()
get_cbma_info_from_db_and_ana()
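get_cbma_info_from_db_and_ana tallies how many articles credit each subsidiary in the extracted source field and writes the counts into row 6 of the scoring template, every other column starting at column 5. The fourteen t_N variables could be collapsed into a loop; a hedged sketch, assuming the same DataFrame and template layout (fill_scores is a hypothetical name, and na=False guards rows where no source could be extracted):

import pandas as pd

def fill_scores(df: pd.DataFrame, sheet) -> None:
    # Hypothetical condensed form of the t_1 .. t_14 tally above; the unit list and
    # the target cells (row 6, every other column from 5) must match the template
    units = ['瑞泰科技', '国检集团', '中材高新', '哈玻院', '中国新材院', '秦皇岛院',
             '西安墙材院', '咸阳陶瓷院', '钟表所', '总院北分', '中岩科技',
             '水泥新材院', '中建材科创院', '科建苑']
    for i, unit in enumerate(units):
        # na=False treats articles without an extracted source as non-matching instead of NaN
        count = df['source'].str.contains(unit, na=False).sum()
        sheet.cell(row=6, column=5 + 2 * i, value=int(count))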

View File

@@ -7,7 +7,7 @@ import win32com.client as win32
import subprocess
import os
import datetime
from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir, get_cbma_info_from_db_and_ana
from mycode.crawl_chrome import chrom_main_from_list
import pandas as pd
from urllib.parse import urlparse
@@ -139,12 +139,6 @@ class MyThread(QThread):
output = p.stdout.readline()
if output:
self.update_signal.emit({'msg': output.strip()})
def capture_err(self, p):
while self.running and p.poll() is None:
err = p.stderr.readline()
if err:
self.update_signal.emit({'msg': err.strip()})
def run(self) -> None:
self.update_signal.emit({'msg': '开始进行网站爬取...'})
@@ -158,13 +152,14 @@ class MyThread(QThread):
output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
# -u means unbuffered, so output is emitted immediately
cmd = [PYTHON_PATH, '-u', '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=False)
# cmd = [PYTHON_PATH, '-u', '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True, shell=False)
self.processes.append(process)
self.running = True
getlog_thread = threading.Thread(target=self.capture_output, args=(process,), daemon=True)
getlog_thread.start()
getlog_thread_err = threading.Thread(target=self.capture_err, args=(process,), daemon=True)
getlog_thread_err.start()
# getlog_thread_err = threading.Thread(target=self.capture_err, args=(process,), daemon=True)
# getlog_thread_err.start()
for process in self.processes:
process.wait()
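Each site is crawled by spawning scrapy in a child process with -u for unbuffered output, and a daemon thread relays its stdout to the UI log. With the stderr pipe and the capture_err thread removed, scrapy's own logging (which goes to stderr) stays on the console. A standalone sketch of that launch pattern, assuming the same basespider arguments; run_spider is a hypothetical name:

import subprocess
import threading

def run_spider(python_path, domain, url, name, group, output):
    # Hypothetical standalone version of the launch performed in MyThread.run above
    cmd = [python_path, '-u', '-m', 'scrapy', 'crawl', 'basespider',
           '-a', f'domain={domain}', '-a', f'start_url={url}',
           '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
    # Only stdout is piped; leaving stderr alone also avoids the risk of an unread
    # stderr pipe filling up and blocking the child process
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True, shell=False)

    def pump():
        # Relay the child's stdout line by line, as capture_output does for the UI log
        for line in process.stdout:
            print(line.rstrip())

    threading.Thread(target=pump, daemon=True).start()
    return process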
@@ -205,12 +200,15 @@ class MainWindow(QMainWindow):
self.ui.setupUi(self)
self.ui.lSize.setValidator(QIntValidator())
self.ui.bWechat.clicked.connect(self.open_wcplus)
self.ui.bWebSite.clicked.connect(self.open_websites_xlsx)
self.ui.bBiao.clicked.connect(self.open_biao_xlsx)
self.ui.bWebSite.clicked.connect(lambda: self.open_file(WEB_SITES_PATH))
self.ui.bBiao.clicked.connect(lambda: self.open_file(BIAO_PATH))
self.ui.bStart.clicked.connect(self.start)
self.ui.bAna.clicked.connect(self.start_ana)
self.ui.bRes1.clicked.connect(self.open_res1)
self.ui.bRes2.clicked.connect(self.open_res2)
self.ui.bRes1.clicked.connect(lambda: self.open_file(self.ui.lRes1.text()))
self.ui.bRes2.clicked.connect(lambda: self.open_file(self.ui.lRes2.text(), 'docx'))
self.ui.bCal.clicked.connect(self.cbma_cal)
self.ui.bOpenCalRes1.clicked.connect(lambda: self.open_file(self.ui.lCalRes1.text()))
self.ui.bOpenCalRes2.clicked.connect(lambda: self.open_file(self.ui.lCalRes2.text()))
self.ui.vLog.setModel(self.logModel)
self.res1Workbook = None
@@ -219,31 +217,22 @@ class MainWindow(QMainWindow):
subprocess.Popen('.\wcplus.exe')
self.wcplus = True
def open_websites_xlsx(self):
app = win32.Dispatch("Excel.Application")
app.Visible = True
app.Workbooks.Open(WEB_SITES_PATH)
app.WindowState = 3
def open_biao_xlsx(self):
app = win32.Dispatch("Excel.Application")
app.Visible = True
app.Workbooks.Open(BIAO_PATH)
app.WindowState = 3
def open_res1(self):
if self.ui.lRes1.text():
app = win32.Dispatch("Excel.Application")
app.Visible = True
self.res1Workbook = app.Workbooks.Open(self.ui.lRes1.text())
app.WindowState = 3
def open_res2(self):
if self.ui.lRes2.text():
app = win32.Dispatch("Word.Application")
app.Visible = True
app.Documents.Open(self.ui.lRes2.text())
app.WindowState = 3
def open_file(self, path, type='xlsx'):
if path:
# try:
# os.startfile(path)
# except Exception as e:
# print("无法打开文件:", str(e))
if type == 'docx':
app = win32.Dispatch("Word.Application")
app.Visible = True
app.Documents.Open(path)
app.WindowState = 3
elif type == 'xlsx':
app = win32.Dispatch("Excel.Application")
app.Visible = True
app.Workbooks.Open(path)
app.WindowState = 3
def get_time(self):
now = datetime.datetime.now()
@@ -276,6 +265,14 @@ class MainWindow(QMainWindow):
self.ana_thread.update_signal.connect(self.update_log)
self.ana_thread.start()
def cbma_cal(self):
now_year = datetime.datetime.now().year
self.update_log({'msg': '正在分析本年总院官微数据...'})
output_path, output_path2 = get_cbma_info_from_db_and_ana(now_year)
self.ui.lCalRes1.setText(output_path)
self.ui.lCalRes2.setText(output_path2)
self.update_log({'msg': '分析完毕!'})
def update_log(self, rdict):
if isinstance(rdict, str):
self.log(f'{self.get_time()}-{rdict}', False)
@@ -316,7 +313,9 @@ class MainWindow(QMainWindow):
if __name__ == "__main__":
# gen_doc()
print('正在启动程序...')
app = MyApplication(sys.argv)
main_window = app.createMainWindow()
main_window.show()
print('启动成功')
sys.exit(app.exec())
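The four separate open_* helpers were folded into a single open_file(path, type) that dispatches to Excel or Word over COM and maximizes the window (WindowState = 3). The commented-out block hints at a simpler variant that hands the path to the Windows file association instead; a minimal sketch, assuming a Windows host (this open_file is a hypothetical alternative, not the method used above):

import os

def open_file(path: str) -> None:
    # Delegate to the Windows file association (Excel for .xlsx, Word for .docx),
    # giving up only the ability to force the window state
    if not path:
        return
    try:
        os.startfile(path)  # Windows-only
    except OSError as e:
        print('Failed to open file:', e)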

start.vbs Normal file
View File

@@ -0,0 +1,8 @@
Set objShell = CreateObject("WScript.Shell")
strFolder = objShell.CurrentDirectory
strPythonPath = strFolder & "\runtime\python.exe"
strScriptPath = strFolder & "\start.py"
objShell.Run """" & strPythonPath & """ """ & strScriptPath & """"
' Set objExec = objShell.Exec("""" & strPythonPath & """ """ & strScriptPath & """")

summary/template_cbma.xlsx Normal file

Binary file not shown.

View File

@@ -242,7 +242,7 @@ class Ui_MainWindow(object):
self.label_2.setText(QCoreApplication.translate("MainWindow", u"\u8bf7\u5728\u4fee\u6539\u540e\u4fdd\u5b58\u5e76\u5173\u95ed", None))
self.label_6.setText(QCoreApplication.translate("MainWindow", u"\u5c0f\u4e8e", None))
self.label_3.setText(QCoreApplication.translate("MainWindow", u"KB-Chrome", None))
self.lSize.setText(QCoreApplication.translate("MainWindow", u"20", None))
self.lSize.setText(QCoreApplication.translate("MainWindow", u"5", None))
self.bStart.setText(QCoreApplication.translate("MainWindow", u"\u5f00\u59cb\u722c\u53d6", None))
self.groupBox_7.setTitle(QCoreApplication.translate("MainWindow", u"\u603b\u9662\u5b98\u5fae", None))
self.label_10.setText(QCoreApplication.translate("MainWindow", u"\u6c47\u603b\u7ed3\u679cExcel:", None))

web3.py
View File

@@ -33,40 +33,40 @@ def fix_url_scheme(url, default_scheme='http'):
url = f'{default_scheme}://{url}'
return url
if __name__ == '__main__':
print('巡查任务开始。。。')
now = datetime.datetime.now()
month = now.month
# print('巡查任务开始。。。')
# now = datetime.datetime.now()
# month = now.month
print('正在组合微信公众号爬取内容。。。')
make_simple_csv_from_db()
make_wechat_articles_full()
print('公众号爬取内容组装完毕!')
# print('正在组合微信公众号爬取内容。。。')
# make_simple_csv_from_db()
# make_wechat_articles_full()
# print('公众号爬取内容组装完毕!')
print('开始进行网站爬取。。。')
# print('开始进行网站爬取。。。')
df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
processes = []
# processes = []
# Register the SIGINT signal handler
signal.signal(signal.SIGINT, sigint_handler)
# # Register the SIGINT signal handler
# signal.signal(signal.SIGINT, sigint_handler)
ind = 0
for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = fix_url_scheme(row['地址'].strip())
domain = urlparse(url).netloc.replace('www.', '')
if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']: # skip these sites outright
continue
output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
# cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
process = subprocess.Popen(cmd)
processes.append(process)
# ind = 0
# for ind, row in df.iterrows():
# group = row['单位']
# name = row['主办']
# url = fix_url_scheme(row['地址'].strip())
# domain = urlparse(url).netloc.replace('www.', '')
# if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']: # skip these sites outright
# continue
# output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
# # cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
# cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
# process = subprocess.Popen(cmd)
# processes.append(process)
# Wait for all processes to finish
for process in processes:
process.wait()
# # Wait for all processes to finish
# for process in processes:
# process.wait()
print('网站爬取结束,校验中。。。')
# Check output file sizes and save information if size is less than 20KB
@@ -90,30 +90,30 @@ if __name__ == '__main__':
print('网站爬取完毕!')
print('开始对比分析所有内容。。。')
# Run WeChat Analysis
wechat_results = ana_wechat()
# Run Web Content Analysis
web_results = ana_web()
# print('开始对比分析所有内容。。。')
# # Run WeChat Analysis
# wechat_results = ana_wechat()
# # Run Web Content Analysis
# web_results = ana_web()
# Save results in an Excel file with two sheets
output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
# with pd.ExcelWriter(output_excel_path) as writer:
# df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
# df.to_excel(writer, sheet_name='公众号', index=False)
# df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
# df2.to_excel(writer, sheet_name='网站', index=False)
template_path = os.path.join(output_dir, 'template.xlsx')
workbook = load_workbook(template_path)
# # Save results in an Excel file with two sheets
# output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
# # with pd.ExcelWriter(output_excel_path) as writer:
# # df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
# # df.to_excel(writer, sheet_name='公众号', index=False)
# # df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
# # df2.to_excel(writer, sheet_name='网站', index=False)
# template_path = os.path.join(output_dir, 'template.xlsx')
# workbook = load_workbook(template_path)
# Select the worksheets to operate on
wechat_sheet = workbook['公众号']
web_sheet = workbook['网站']
for row in wechat_results:
wechat_sheet.append(row)
for row in web_results:
web_sheet.append(row)
workbook.save(output_excel_path)
workbook.close()
print('巡查任务执行完毕, 请查看summary文件夹, 可手动校对')
os.system("pause")
# # Select the worksheets to operate on
# wechat_sheet = workbook['公众号']
# web_sheet = workbook['网站']
# for row in wechat_results:
# wechat_sheet.append(row)
# for row in web_results:
# web_sheet.append(row)
# workbook.save(output_excel_path)
# workbook.close()
# print('巡查任务执行完毕, 请查看summary文件夹, 可手动校对')
# os.system("pause")

View File

@@ -4,6 +4,7 @@
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os.path
from openpyxl import Workbook, load_workbook
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
# useful for handling different item types with a single interface
from scrapy.exceptions import IgnoreRequest
@@ -58,6 +59,7 @@ class ZcspiderPipeline:
# except:
# self.conn.rollback()
# raise
item['text'] = ILLEGAL_CHARACTERS_RE.sub(r'', item['text'])
line = [item['group'], item['name'], item['domain'], item['url'], item['text']]
self.ws.append(line)
return item
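The new ILLEGAL_CHARACTERS_RE.sub call scrubs control characters from the scraped text before it is appended to the worksheet, because openpyxl refuses to write such characters (it raises IllegalCharacterError). A minimal illustration, assuming only openpyxl; the text and file name are arbitrary:

from openpyxl import Workbook
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

# Control characters such as \x0b in scraped page text would make openpyxl raise
# IllegalCharacterError on append, so they are stripped before writing the row
wb = Workbook()
ws = wb.active
raw_text = 'scraped text with a control character \x0b inside'
ws.append(['group', 'name', 'domain', 'url', ILLEGAL_CHARACTERS_RE.sub('', raw_text)])
wb.save('demo_output.xlsx')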

View File

@@ -96,7 +96,7 @@ DEFAULT_REQUEST_HEADERS = {
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = 'gb18030'
LOG_LEVEL = 'WARNING'
LOG_LEVEL = 'DEBUG'
DOWNLOAD_TIMEOUT = 10
ITEM_PIPELINES = {
@@ -110,4 +110,7 @@ FEED_EXPORTERS = {
DOWNLOADER_MIDDLEWARES = {
'zcspider.middlewares.FilterHTMLMiddleware': 200,
# other downloader middlewares...
}
EXTENSIONS = {
'scrapy.extensions.telnet.TelnetConsole': None
}

宣传巡查.exe Normal file

Binary file not shown.