"""Run the monthly inspection pipeline: assemble the WeChat official-account
article data, crawl the configured websites with Scrapy subprocesses (falling
back to Chrome for sites that produced little or no output), then analyse all
content and write the summary workbook."""

import datetime
import os
import signal
import subprocess
import sys
from urllib.parse import urlparse

import pandas as pd
from openpyxl import load_workbook

from mycode.base import BASE_DIR
from mycode.main import (
    make_simple_csv_from_db,
    make_wechat_articles_full,
    ana_web,
    ana_wechat,
    output_dir,
)
from mycode.crawl_chrome import chrome_main, failed_sites_file

python_exe = os.path.join(BASE_DIR, 'runtime/python.exe')
# scrapy_exe = os.path.join(BASE_DIR, 'runtime/Scripts/scrapy.exe')


def save_info_to_excel(info_list, output_filename):
    """Write the failed-site rows (unit, organiser, URL) to an Excel file."""
    df = pd.DataFrame(info_list, columns=['单位', '主办', '地址'])
    df.to_excel(output_filename, index=False)


# SIGINT handler: terminate all crawler subprocesses before exiting.
def sigint_handler(signum, frame):
    print('收到 Ctrl-C 信号,正在关闭子进程...')
    for process in processes:
        process.terminate()
    print('子进程已关闭,程序退出。')
    sys.exit(0)


if __name__ == '__main__':
    print('巡查任务开始。。。')
    now = datetime.datetime.now()
    month = now.month

    # Assemble the WeChat official-account article data.
    print('正在组合微信公众号爬取内容。。。')
    make_simple_csv_from_db()
    make_wechat_articles_full()
    print('公众号爬取内容组装完毕!')

    # Crawl every website listed in web_sites.xlsx, one Scrapy subprocess per site.
    print('开始进行网站爬取。。。')
    df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
    processes = []

    # Register the SIGINT handler so Ctrl-C cleans up the subprocesses.
    signal.signal(signal.SIGINT, sigint_handler)

    for ind, row in df.iterrows():
        group = row['单位']
        name = row['主办']
        url = row['地址']
        domain = urlparse(url).netloc.replace('www.', '')
        # output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
        cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider',
               '-a', f'domain={domain}', '-a', f'start_url={url}',
               '-a', f'name={name}', '-a', f'group={group}',
               '-o', f'web_dir/{name}_{domain}.xlsx']
        # cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}',
        #        '-a', f'start_url={url}', '-a', f'name={name}',
        #        '-a', f'group={group}', '-a', f'output={output}']
        process = subprocess.Popen(cmd)
        processes.append(process)

    # Wait for all crawler subprocesses to finish.
    for process in processes:
        process.wait()

    print('网站爬取结束,校验中。。。')
    # Collect sites whose output file is smaller than 20 KB; they are treated
    # as failed crawls and retried with Chrome below.
    info_to_save = []
    for ind, row in df.iterrows():
        group = row['单位']
        name = row['主办']
        url = row['地址']
        domain = urlparse(url).netloc.replace('www.', '')
        output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
        if os.path.exists(output_filename):
            file_size = os.path.getsize(output_filename)
            if file_size < 20 * 1024:  # 20 KB in bytes
                info_to_save.append([group, name, url])

    if info_to_save:
        print('存在未爬取站点,正在调用Chrome继续爬取。。。')
        save_info_to_excel(info_to_save, failed_sites_file)
        chrome_main()
        os.remove(failed_sites_file)
    print('网站爬取完毕!')

    print('开始对比分析所有内容。。。')
    # Run the WeChat analysis.
    wechat_results = ana_wechat()
    # Run the web-content analysis.
    web_results = ana_web()

    # Save the results into an Excel file with two sheets, based on the template.
    output_excel_path = os.path.join(
        output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
    # with pd.ExcelWriter(output_excel_path) as writer:
    #     df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
    #     df.to_excel(writer, sheet_name='公众号', index=False)
    #     df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
    #     df2.to_excel(writer, sheet_name='网站', index=False)
    template_path = os.path.join(output_dir, 'template.xlsx')
    workbook = load_workbook(template_path)
    # Select the worksheets to write into.
    wechat_sheet = workbook['公众号']
    web_sheet = workbook['网站']
    for row in wechat_results:
        wechat_sheet.append(row)
    for row in web_results:
        web_sheet.append(row)
    workbook.save(output_excel_path)
    workbook.close()

    print('巡查任务执行完毕, 请查看summary文件夹, 可手动校对')
    os.system('pause')