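"""Patrol-task entry point.

Reads the site list from web_sites.xlsx, checks each site's crawl output
under web_dir/, re-crawls undersized results with Chrome, and (in the
currently commented-out sections) spawns per-site Scrapy subprocesses and
assembles a summary workbook from template.xlsx.
"""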
import os
import subprocess
import sys
import signal
import datetime
from urllib.parse import urlparse

import pandas as pd
from openpyxl import load_workbook

from mycode.base import BASE_DIR
from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
from mycode.crawl_chrome import chrom_main_from_list, failed_sites_file

python_exe = os.path.join(BASE_DIR, 'runtime/python.exe')
# scrapy_exe = os.path.join(BASE_DIR, 'runtime/Scripts/scrapy.exe')

def save_info_to_excel(info_list, output_filename):
    """Write [group, organizer, url] rows to an Excel file (Chinese headers match the source sheets)."""
    df = pd.DataFrame(info_list, columns=['单位', '主办', '地址'])
    df.to_excel(output_filename, index=False)
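
# Illustrative usage (hypothetical values):
#   save_info_to_excel([['GroupA', 'Organizer A', 'http://a.example.com']], 'failed_sites.xlsx')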


# SIGINT handler: terminate all crawler subprocesses on Ctrl-C.
# `processes` is the module-level list the main block fills when the
# (currently commented-out) subprocess section is enabled.
def sigint_handler(signum, frame):
    print('Received Ctrl-C, terminating child processes...')
    for process in processes:
        process.terminate()
    print('Child processes terminated, exiting.')
    sys.exit(0)
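
# When the subprocess section is enabled, the handler is registered in the
# main block via: signal.signal(signal.SIGINT, sigint_handler)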


def fix_url_scheme(url, default_scheme='http'):
    # If the URL carries no scheme, prepend the default one.
    if not url.startswith('http://') and not url.startswith('https://'):
        url = f'{default_scheme}://{url}'
    return url
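
# Illustrative behaviour:
#   fix_url_scheme('example.com')          -> 'http://example.com'
#   fix_url_scheme('https://example.com')  -> 'https://example.com'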


if __name__ == '__main__':
    # print('Patrol task starting...')
    # now = datetime.datetime.now()
    # month = now.month

    # print('Assembling WeChat official-account crawl content...')
    # make_simple_csv_from_db()
    # make_wechat_articles_full()
    # print('WeChat content assembled!')

    # print('Starting web site crawl...')

    df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
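    # web_sites.xlsx is expected to provide the three columns used below:
    # '单位' (group/unit), '主办' (organizer), '地址' (site URL).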

    # processes = []

    # # Register the SIGINT handler so Ctrl-C tears down the children.
    # signal.signal(signal.SIGINT, sigint_handler)

    # ind = 0
    # for ind, row in df.iterrows():
    #     group = row['单位']
    #     name = row['主办']
    #     url = fix_url_scheme(row['地址'].strip())
    #     domain = urlparse(url).netloc.replace('www.', '')
    #     if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:  # skip these sites outright
    #         continue
    #     output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
    #     # cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
    #     cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
    #     process = subprocess.Popen(cmd)
    #     processes.append(process)

    # # Wait for all crawler subprocesses to finish.
    # for process in processes:
    #     process.wait()
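
    # Design note: each site runs in its own 'python -m scrapy crawl' subprocess,
    # so sites are crawled in parallel and one failing spider cannot abort the rest.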
    print('Web crawl finished, validating results...')
    # Collect sites whose output workbook exists but is smaller than 20 KiB;
    # such files are treated as failed crawls and retried below.
    info_to_save = []
    for ind, row in df.iterrows():
        group = row['单位']
        name = row['主办']
        url = fix_url_scheme(row['地址'].strip())
        domain = urlparse(url).netloc.replace('www.', '')
        output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
        if os.path.exists(output_filename):
            file_size = os.path.getsize(output_filename)
            print(f'{output_filename}: {file_size / 1024:.1f} KiB')
            if file_size < 20 * 1024:  # 20 KiB threshold, in bytes
                info_to_save.append([group, name, url])

    if info_to_save:
        print('Some sites were not crawled successfully; retrying with Chrome...')
        chrom_main_from_list(info_to_save)
        # os.remove(failed_sites_file)

    print('Web crawl complete!')

    # print('Starting comparative analysis of all content...')
    # # Run WeChat analysis
    # wechat_results = ana_wechat()
    # # Run web-content analysis
    # web_results = ana_web()

    # # Save the results in one Excel file with two sheets
    # output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
    # # with pd.ExcelWriter(output_excel_path) as writer:
    # #     df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
    # #     df.to_excel(writer, sheet_name='公众号', index=False)
    # #     df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
    # #     df2.to_excel(writer, sheet_name='网站', index=False)
    # template_path = os.path.join(output_dir, 'template.xlsx')
    # workbook = load_workbook(template_path)

    # # Select the worksheets to append to (names must match template.xlsx)
    # wechat_sheet = workbook['公众号']
    # web_sheet = workbook['网站']
    # for row in wechat_results:
    #     wechat_sheet.append(row)
    # for row in web_results:
    #     web_sheet.append(row)
    # workbook.save(output_excel_path)
    # workbook.close()
    # print('Patrol task finished; see the summary folder for manual proofreading.')
    # os.system("pause")