import os
import subprocess
import pandas as pd
from urllib.parse import urlparse
import signal
import sys
import datetime
from openpyxl import load_workbook
from mycode.base import BASE_DIR
from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
from mycode.crawl_chrome import chrom_main_from_list, failed_sites_file

python_exe = os.path.join(BASE_DIR, 'runtime/python.exe')
# scrapy_exe = os.path.join(BASE_DIR, 'runtime/Scripts/scrapy.exe')


def save_info_to_excel(info_list, output_filename):
    # Columns: 单位 (unit), 主办 (sponsor), 地址 (URL)
    df = pd.DataFrame(info_list, columns=['单位', '主办', '地址'])
    df.to_excel(output_filename, index=False)


# SIGINT handler: terminate all crawler subprocesses on Ctrl-C.
# Relies on the module-level `processes` list populated in the main block.
def sigint_handler(signum, frame):
    print('Received Ctrl-C, shutting down child processes...')
    for process in processes:
        process.terminate()
    print('Child processes terminated; exiting.')
    sys.exit(0)


def fix_url_scheme(url, default_scheme='http'):
    # If the URL carries no scheme, prepend the default one
    if not url.startswith('http://') and not url.startswith('https://'):
        url = f'{default_scheme}://{url}'
    return url
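# Illustrative behavior of fix_url_scheme (comments only, not executed;
# the example hosts are made up):
#   fix_url_scheme('example.com')           -> 'http://example.com'
#   fix_url_scheme('https://example.com')   -> 'https://example.com'
#   fix_url_scheme('example.com', 'https')  -> 'https://example.com'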
if __name__ == '__main__':
    # print('Inspection task starting...')
    # now = datetime.datetime.now()
    # month = now.month

    # print('Assembling WeChat official-account content for crawling...')
    # make_simple_csv_from_db()
    # make_wechat_articles_full()
    # print('Official-account content assembled!')

    # print('Starting website crawl...')
    df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
    # processes = []
    # # Register the SIGINT handler
    # signal.signal(signal.SIGINT, sigint_handler)
    # for ind, row in df.iterrows():
    #     group = row['单位']
    #     name = row['主办']
    #     url = fix_url_scheme(row['地址'].strip())
    #     domain = urlparse(url).netloc.replace('www.', '')
    #     if domain in ['xdjstc.com', 'epcyiqizu.com', 'cbra.ctc.ac.cn']:  # skip these sites outright
    #         continue
    #     output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
    #     # cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
    #     cmd = [python_exe, '-m', 'scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
    #     process = subprocess.Popen(cmd)
    #     processes.append(process)
    # # Wait for all crawler processes to finish
    # for process in processes:
    #     process.wait()

    print('Website crawl finished, validating...')
    # Collect sites whose output file exists but is smaller than 20 KB
    info_to_save = []
    for ind, row in df.iterrows():
        group = row['单位']
        name = row['主办']
        url = row['地址']
        # Normalize the URL the same way the crawl loop does, so the
        # derived filename matches the crawler's output
        domain = urlparse(fix_url_scheme(url.strip())).netloc.replace('www.', '')
        output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
        if os.path.exists(output_filename):
            file_size = os.path.getsize(output_filename)
            print(file_size / 1024)  # size in KB, for the log
            if file_size < 20 * 1024:  # 20 KB threshold, in bytes
                info_to_save.append([group, name, url])

    if info_to_save:
        print('Some sites were not crawled; retrying them with Chrome...')
        chrom_main_from_list(info_to_save)
        # os.remove(failed_sites_file)
    print('Website crawl complete!')

    # print('Starting comparative analysis of all content...')
    # # Run WeChat analysis
    # wechat_results = ana_wechat()
    # # Run web content analysis
    # web_results = ana_web()
    # # Save the results in an Excel file with two sheets
    # output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
    # # with pd.ExcelWriter(output_excel_path) as writer:
    # #     df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
    # #     df.to_excel(writer, sheet_name='公众号', index=False)
    # #     df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
    # #     df2.to_excel(writer, sheet_name='网站', index=False)
    # template_path = os.path.join(output_dir, 'template.xlsx')
    # workbook = load_workbook(template_path)
    # # Select the sheets to populate
    # wechat_sheet = workbook['公众号']
    # web_sheet = workbook['网站']
    # for row in wechat_results:
    #     wechat_sheet.append(row)
    # for row in web_results:
    #     web_sheet.append(row)
    # workbook.save(output_excel_path)
    # workbook.close()
    # print('Inspection task finished; see the summary folder and proofread manually')
    # os.system("pause")
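    # Note: save_info_to_excel above is currently never called. As a sketch, the
    # fallback list built during validation could be persisted for manual review
    # before it is handed to Chrome (the file name below is illustrative, not
    # one the script defines):
    # save_info_to_excel(info_to_save, os.path.join(BASE_DIR, 'web_dir/failed_sites.xlsx'))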