Modify web.py

xiaobulu27 2023-08-24 10:44:48 +08:00
parent e3a77c94ec
commit a395ab6867
1 changed file with 25 additions and 15 deletions

web.py

@@ -1,10 +1,15 @@
+import os
 import subprocess
 import pandas as pd
 from urllib.parse import urlparse
 import signal
 import sys
-df = pd.read_excel('web_sites_full.xlsx', sheet_name='Sheet1')
+df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
+
+def save_info_to_excel(info_list, output_filename):
+    df = pd.DataFrame(info_list, columns=['Group', 'Name', 'URL'])
+    df.to_excel(output_filename, index=False)
 
 processes = []
 
 # Define the SIGINT signal handler
@@ -23,22 +28,27 @@ for ind, row in df.iterrows():
     group = row['单位']
     name = row['主办']
     url = row['地址']
-    if 'http' in url:
-        sx = row['地址'].split('http')
-        ename = sx[0].strip()
-        if ename:
-            name = ename
-        url = 'http' + sx[1]
-    elif 'www' in url:
-        sx = row['地址'].split('www')
-        ename = sx[0].strip()
-        if ename:
-            name = ename
-        url = 'http://www' + sx[1]
     domain = urlparse(url).netloc
     cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
     process = subprocess.Popen(cmd)
     processes.append(process)
     ind +=1
-    # if ind > 0:
-    #     break
+
+# Wait for all processes to finish
+for process in processes:
+    process.wait()
+
+# Check output file sizes and save information if size is less than 20KB
+info_to_save = []
+for ind, row in df.iterrows():
+    name = row['主办']
+    url = row['地址']
+    domain = urlparse(row['地址']).netloc
+    output_filename = f'web_dir/{name}_{domain}.xlsx'
+    if os.path.exists(output_filename):
+        file_size = os.path.getsize(output_filename)
+        if file_size < 20 * 1024:  # Convert KB to bytes
+            info_to_save.append([row['单位'], name, row['地址']])
+
+if info_to_save:
+    save_info_to_excel(info_to_save, 'failed_files.xlsx')
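
For reference, the failure-check step added by this commit can be read as a small standalone routine. The sketch below is not the committed code verbatim: it assumes the same spreadsheet columns ('单位', '主办', '地址'), the web_dir/ output layout, and the 20 KB threshold shown in the diff, and the helper name collect_small_outputs is introduced here only for illustration.

import os
import pandas as pd
from urllib.parse import urlparse

def collect_small_outputs(input_xlsx='web_sites.xlsx', out_dir='web_dir', threshold=20 * 1024):
    """Return [group, name, url] rows whose crawl output exists but is smaller than threshold bytes."""
    df = pd.read_excel(input_xlsx, sheet_name='Sheet1')
    failed = []
    for _, row in df.iterrows():
        name = row['主办']
        domain = urlparse(row['地址']).netloc
        output_filename = os.path.join(out_dir, f'{name}_{domain}.xlsx')
        # As in the diff, only files that were produced but are undersized are recorded.
        if os.path.exists(output_filename) and os.path.getsize(output_filename) < threshold:
            failed.append([row['单位'], name, row['地址']])
    return failed

if __name__ == '__main__':
    rows = collect_small_outputs()
    if rows:
        pd.DataFrame(rows, columns=['Group', 'Name', 'URL']).to_excel('failed_files.xlsx', index=False)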