"""Launch a scrapy crawl per row of web_sites.xlsx, then flag tiny outputs.

For each site the script spawns ``scrapy crawl basespider`` writing to
``web_dir/{name}_{domain}.xlsx``; after all crawlers finish, any output file
smaller than 20 KB is treated as a failed crawl and its site info is written
to ``failed_files.xlsx``.
"""

import os
import signal
import subprocess
import sys
from urllib.parse import urlparse

import pandas as pd

# Child crawler processes; module-level so the SIGINT handler can reach them.
processes = []


def extract_domain(url):
    """Return the host part of *url* with 'www.' stripped.

    Used both when building the scrapy command and when reconstructing the
    output filename afterwards, so the two always agree.  (Previously the
    post-run check kept the 'www.' prefix, so failed crawls of www-prefixed
    sites were never detected.)
    """
    return urlparse(url).netloc.replace('www.', '')


def save_info_to_excel(info_list, output_filename):
    """Write rows of [Group, Name, URL] to *output_filename* as an Excel file."""
    frame = pd.DataFrame(info_list, columns=['Group', 'Name', 'URL'])
    frame.to_excel(output_filename, index=False)


def sigint_handler(signum, frame):
    """On Ctrl-C, terminate every spawned crawler process, then exit."""
    # NOTE: parameter renamed from `signal` to avoid shadowing the module.
    print('收到 Ctrl-C 信号,正在关闭子进程...')
    for process in processes:
        process.terminate()
    print('子进程已关闭,程序退出。')
    sys.exit(0)


def main():
    """Spawn one crawler per spreadsheet row, wait, then report tiny outputs."""
    signal.signal(signal.SIGINT, sigint_handler)

    # Column headers are Chinese: 单位 = group/organisation, 主办 = name/host,
    # 地址 = URL.
    df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')

    for _, row in df.iterrows():
        group = row['单位']
        name = row['主办']
        url = row['地址']
        domain = extract_domain(url)
        cmd = [
            'scrapy', 'crawl', 'basespider',
            '-a', f'domain={domain}',
            '-a', f'start_url={url}',
            '-a', f'name={name}',
            '-a', f'group={group}',
            '-o', f'web_dir/{name}_{domain}.xlsx',
        ]
        # All crawlers run concurrently; list call avoids shell injection.
        processes.append(subprocess.Popen(cmd))

    for process in processes:
        process.wait()

    # An output file under 20 KB is considered a failed crawl.
    info_to_save = []
    for _, row in df.iterrows():
        name = row['主办']
        # Must use the same domain normalisation as the spawn loop above,
        # otherwise the filename never matches for www-prefixed URLs.
        domain = extract_domain(row['地址'])
        output_filename = f'web_dir/{name}_{domain}.xlsx'
        if os.path.exists(output_filename):
            if os.path.getsize(output_filename) < 20 * 1024:  # 20 KB in bytes
                info_to_save.append([row['单位'], name, row['地址']])

    if info_to_save:
        save_info_to_excel(info_to_save, 'failed_files.xlsx')


if __name__ == '__main__':
    main()