zcspider/mycode/web.py

import os
import subprocess
import pandas as pd
from urllib.parse import urlparse
import signal
import sys
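
# Launch one Scrapy crawl per site listed in 'web_sites.xlsx', wait for them
# all to finish, then record any site whose output file is under 20 KB as a
# likely failed crawl. Expected columns: '单位' (organization), '主办'
# (site sponsor / name), '地址' (URL).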
df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')

def save_info_to_excel(info_list, output_filename):
    # Use a local name so we don't shadow the global df
    out_df = pd.DataFrame(info_list, columns=['Group', 'Name', 'URL'])
    out_df.to_excel(output_filename, index=False)
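
# Handles of all spawned crawler subprocesses; the SIGINT handler below uses
# this list to shut them down cleanly.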
processes = []

# SIGINT handler: shut down all child crawler processes on Ctrl-C
def sigint_handler(signum, frame):
    print('Received Ctrl-C, terminating child processes...')
    for process in processes:
        process.terminate()
    print('Child processes terminated; exiting.')
    sys.exit(0)

# Register the SIGINT handler
signal.signal(signal.SIGINT, sigint_handler)

for _, row in df.iterrows():
    group = row['单位']  # organization
    name = row['主办']   # site sponsor / name
    url = row['地址']    # site URL
    domain = urlparse(url).netloc.replace('www.', '')
    cmd = [
        'scrapy', 'crawl', 'basespider',
        '-a', f'domain={domain}',
        '-a', f'start_url={url}',
        '-a', f'name={name}',
        '-a', f'group={group}',
        '-o', f'web_dir/{name}_{domain}.xlsx',
    ]
    process = subprocess.Popen(cmd)
    processes.append(process)
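
# NB: the '-o ...xlsx' feed output above presumably relies on an XLSX feed
# exporter configured in this project (e.g. the scrapy-xlsx plugin); stock
# Scrapy only ships JSON/JSON-lines/CSV/XML/pickle/marshal exporters.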

# Wait for all processes to finish
for process in processes:
    process.wait()

# Check output file sizes; record a site as failed if its file is under 20 KB
info_to_save = []
for _, row in df.iterrows():
    name = row['主办']
    url = row['地址']
    # Strip 'www.' so the filename matches the one used when launching the crawl
    domain = urlparse(url).netloc.replace('www.', '')
    output_filename = f'web_dir/{name}_{domain}.xlsx'
    if os.path.exists(output_filename):
        file_size = os.path.getsize(output_filename)
        if file_size < 20 * 1024:  # 20 KB in bytes
            info_to_save.append([row['单位'], name, url])

if info_to_save:
    save_info_to_excel(info_to_save, 'failed_files.xlsx')
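
# Note: 'scrapy crawl' only works from inside a Scrapy project (a directory
# containing scrapy.cfg), so this script is presumably run from the zcspider
# project root rather than from mycode/.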