import datetime
import os
import signal
import subprocess
import sys
from urllib.parse import urlparse

import pandas as pd
from openpyxl import load_workbook

from mycode.base import BASE_DIR
from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
from mycode.crawl_chrome import chrom_main_from_list, failed_sites_file

# Bundled Python runtime used to launch each Scrapy crawl as a child process
python_exe = os.path.join(BASE_DIR, 'runtime/python.exe')
# scrapy_exe = os.path.join(BASE_DIR, 'runtime/Scripts/scrapy.exe')


def save_info_to_excel(info_list, output_filename):
    """Write a list of [单位, 主办, 地址] rows to an Excel file."""
    df = pd.DataFrame(info_list, columns=['单位', '主办', '地址'])
    df.to_excel(output_filename, index=False)


# SIGINT handler: terminate every child crawler process, then exit.
# Note: 'processes' is the module-level list populated in the main block below.
def sigint_handler(signum, frame):
    print('收到 Ctrl-C 信号,正在关闭子进程...')
    for process in processes:
        process.terminate()
    print('子进程已关闭,程序退出。')
    sys.exit(0)


if __name__ == '__main__':
    print('巡查任务开始。。。')
    now = datetime.datetime.now()
    month = now.month
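    # month is used below to name the monthly summary workbook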

    print('正在组合微信公众号爬取内容。。。')
    make_simple_csv_from_db()
    make_wechat_articles_full()
    print('公众号爬取内容组装完毕!')

    print('开始进行网站爬取。。。')

    df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
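    # web_sites.xlsx is expected to provide the 单位 / 主办 / 地址 columns read below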
    processes = []

    # Register the SIGINT handler so Ctrl-C terminates the child crawler processes
    signal.signal(signal.SIGINT, sigint_handler)
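
    # Launch one Scrapy crawl per site as a separate child process; the crawls run in
    # parallel, and each exports its items to web_dir/<name>_<domain>.xlsx via scrapy's -o option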
    for ind, row in df.iterrows():
        group = row['单位']
        name = row['主办']
        url = row['地址']
        domain = urlparse(url).netloc.replace('www.', '')
        # output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
        cmd = [
            python_exe, '-m', 'scrapy', 'crawl', 'basespider',
            '-a', f'domain={domain}',
            '-a', f'start_url={url}',
            '-a', f'name={name}',
            '-a', f'group={group}',
            '-o', f'web_dir/{name}_{domain}.xlsx',
        ]
        # cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
        process = subprocess.Popen(cmd)
        processes.append(process)

    # Wait for all crawler processes to finish
    for process in processes:
        process.wait()

    print('网站爬取结束,校验中。。。')
    # Check each crawl's output file; sites whose output is smaller than 30 KB are
    # treated as not fully crawled and are retried with Chrome below
    info_to_save = []
    for ind, row in df.iterrows():
        group = row['单位']
        name = row['主办']
        url = row['地址']
        domain = urlparse(url).netloc.replace('www.', '')
        output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
        if os.path.exists(output_filename):
            file_size = os.path.getsize(output_filename)
            if file_size < 30 * 1024:  # 30 KB in bytes
                info_to_save.append([group, name, url])

    if info_to_save:
        print('存在未爬取站点,正在调用Chrome继续爬取。。。')
        chrom_main_from_list(info_to_save)
        # os.remove(failed_sites_file)

    print('网站爬取完毕!')
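
    # Analysis phase: check the collected WeChat and web content, then fill the
    # findings into the template workbook as the monthly summary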
    print('开始对比分析所有内容。。。')
    # Run WeChat analysis
    wechat_results = ana_wechat()
    # Run web content analysis
    web_results = ana_web()

    # Save results in an Excel file with two sheets
    output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
    # with pd.ExcelWriter(output_excel_path) as writer:
    #     df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
    #     df.to_excel(writer, sheet_name='公众号', index=False)
    #     df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
    #     df2.to_excel(writer, sheet_name='网站', index=False)
    template_path = os.path.join(output_dir, 'template.xlsx')
    workbook = load_workbook(template_path)

    # Select the worksheets to append the results to
    wechat_sheet = workbook['公众号']
    web_sheet = workbook['网站']
    for row in wechat_results:
        wechat_sheet.append(row)
    for row in web_results:
        web_sheet.append(row)
    workbook.save(output_excel_path)
    workbook.close()

    print('巡查任务执行完毕, 请查看summary文件夹, 可手动校对')
    os.system("pause")