web.py修改
This commit is contained in:
parent
e3a77c94ec
commit
a395ab6867
40
web.py
40
web.py
|
@@ -1,10 +1,15 @@
|
||||||
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
import signal
|
import signal
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
df = pd.read_excel('web_sites_full.xlsx', sheet_name='Sheet1')
|
df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
|
||||||
|
|
||||||
|
def save_info_to_excel(info_list, output_filename):
    """Write a list of [group, name, url] rows to an Excel workbook.

    Parameters
    ----------
    info_list : list
        Rows of ``[group, name, url]`` (one entry per under-sized crawl
        result) to be reported.
    output_filename : str
        Path of the ``.xlsx`` file to create; overwritten if it exists.
    """
    # Use a distinct local name: the original `df` here shadowed the
    # module-level DataFrame loaded from the input workbook.
    report_df = pd.DataFrame(info_list, columns=['Group', 'Name', 'URL'])
    # index=False — the synthetic row index carries no information.
    report_df.to_excel(output_filename, index=False)
|
||||||
|
|
||||||
# Child scrapy crawler processes spawned below; tracked so the script can
# wait on them (and so a SIGINT handler can terminate them).
processes = []
# Define the SIGINT signal handler (handler body is outside this view)
|
||||||
|
@@ -23,22 +28,27 @@ for ind, row in df.iterrows():
|
||||||
group = row['单位']
|
group = row['单位']
|
||||||
name = row['主办']
|
name = row['主办']
|
||||||
url = row['地址']
|
url = row['地址']
|
||||||
if 'http' in url:
|
|
||||||
sx = row['地址'].split('http')
|
|
||||||
ename = sx[0].strip()
|
|
||||||
if ename:
|
|
||||||
name = ename
|
|
||||||
url = 'http' + sx[1]
|
|
||||||
elif 'www' in url:
|
|
||||||
sx = row['地址'].split('www')
|
|
||||||
ename = sx[0].strip()
|
|
||||||
if ename:
|
|
||||||
name = ename
|
|
||||||
url = 'http://www' + sx[1]
|
|
||||||
domain = urlparse(url).netloc
|
domain = urlparse(url).netloc
|
||||||
cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
|
cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
|
||||||
process = subprocess.Popen(cmd)
|
process = subprocess.Popen(cmd)
|
||||||
processes.append(process)
|
processes.append(process)
|
||||||
ind +=1
|
ind +=1
|
||||||
# if ind > 0:
|
|
||||||
# break
|
# Block until every spawned crawler subprocess has exited.
for crawler_proc in processes:
    crawler_proc.wait()
|
||||||
|
|
||||||
|
# Post-crawl audit: inspect each crawler's output workbook and collect the
# sites whose output is suspiciously small (< 20 KB) so they can be
# re-examined or re-crawled.
info_to_save = []

# `ind` was unused in the original loop — use `_` to make that explicit.
for _, row in df.iterrows():
    name = row['主办']  # organizer / site name column
    # Derive the output path the crawl loop used for this site.
    # (The original also assigned `url = row['地址']` here but never read it.)
    domain = urlparse(row['地址']).netloc
    output_filename = f'web_dir/{name}_{domain}.xlsx'

    # NOTE(review): sites whose output file is missing entirely are
    # currently skipped, not reported — confirm that is intended.
    if os.path.exists(output_filename):
        file_size = os.path.getsize(output_filename)
        if file_size < 20 * 1024:  # 20 KB threshold, expressed in bytes
            # [group, name, url] — column keys: 单位=group, 地址=URL.
            info_to_save.append([row['单位'], name, row['地址']])

if info_to_save:
    save_info_to_excel(info_to_save, 'failed_files.xlsx')
|
||||||
|
|
Loading…
Reference in New Issue