44 lines
1.3 KiB
Python
44 lines
1.3 KiB
Python
import subprocess
|
|
import pandas as pd
|
|
from urllib.parse import urlparse
|
|
import signal
|
|
import sys
|
|
|
|
# Popen handles of every crawler this script starts; sigint_handler walks
# this list to terminate them on Ctrl-C.
processes = []

# Site list; the launch loop reads the '单位', '主办' and '地址' columns of
# every row.
df = pd.read_excel(
    'web_sites_full.xlsx',
    sheet_name='Sheet1',
)
|
|
# SIGINT (Ctrl-C) handler
def sigint_handler(signal, frame):
    """Stop every spawned child process, then exit with status 0.

    Each process in the module-level ``processes`` list is asked to
    terminate.  The handler then waits for the children to exit, so that
    they are reaped and the final message is accurate.  Children still
    running after a shared 10 second grace period are killed.

    Args:
        signal: Signal number supplied by the interpreter (unused).  Note
            that this name shadows the ``signal`` module inside the
            function body.
        frame: Stack frame supplied by the interpreter (unused).

    Raises:
        SystemExit: Always, through ``sys.exit(0)``.
    """
    import time

    print('收到 Ctrl-C 信号,正在关闭子进程...')

    # Signal all children first so that they shut down concurrently.
    for process in processes:
        process.terminate()

    # terminate() only sends the request; wait for the children to exit.
    # One deadline is shared by all children so the total shutdown time
    # stays bounded no matter how many were started.
    deadline = time.monotonic() + 10
    for process in processes:
        remaining = max(0.0, deadline - time.monotonic())
        try:
            process.wait(timeout=remaining)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()

    print('子进程已关闭,程序退出。')
    sys.exit(0)
|
|
# Register the SIGINT handler before any crawler is started, so that Ctrl-C
# terminates whatever has been spawned so far.
signal.signal(signal.SIGINT, sigint_handler)


def _split_name_and_url(address, default_name):
    """Split the text of an address cell into a ``(name, url)`` pair.

    A cell may hold only a URL, or free text followed by a URL (for example
    ``'Some site http://example.com'``).  Non-blank text in front of the
    URL replaces ``default_name``.

    Args:
        address: Raw text of the address cell.
        default_name: Name returned when no text precedes the URL.

    Returns:
        Tuple ``(name, url)``.
    """
    # partition() cuts at the first marker only, so a URL that contains
    # 'http' or 'www' a second time (e.g. in a query string) stays intact.
    for marker, prefix in (('http', 'http'), ('www', 'http://www')):
        label, found, rest = address.partition(marker)
        if found:
            return label.strip() or default_name, prefix + rest.strip()

    url = address.strip()
    if '://' not in url:
        # Without a leading '//' urlparse() treats the whole text as a path
        # and leaves netloc empty, so give bare hosts a default scheme.
        url = 'http://' + url
    return default_name, url


# Start one scrapy crawl per row of the site list.
for ind, row in df.iterrows():
    group = row['单位']
    address = row['地址']

    # Blank cells are read by pandas as NaN (a float); the substring checks
    # in the helper would raise TypeError on them and abort the whole run.
    if not isinstance(address, str) or not address.strip():
        print(f'第 {ind} 行的地址为空或不是文本,已跳过。')
        continue

    name, url = _split_name_and_url(address, row['主办'])
    domain = urlparse(url).netloc
    cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
    processes.append(subprocess.Popen(cmd))

# Stay alive until every crawl has finished.  Otherwise the script would
# exit right after spawning and the SIGINT handler could never stop the
# children.
for process in processes:
    process.wait()