From a395ab6867156f9e3d77581659fb12c51fe338fa Mon Sep 17 00:00:00 2001 From: xiaobulu27 Date: Thu, 24 Aug 2023 10:44:48 +0800 Subject: [PATCH] =?UTF-8?q?web.py=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- web.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/web.py b/web.py index b81b7e9..dc8b8fe 100644 --- a/web.py +++ b/web.py @@ -1,10 +1,15 @@ +import os import subprocess import pandas as pd from urllib.parse import urlparse import signal import sys -df = pd.read_excel('web_sites_full.xlsx', sheet_name='Sheet1') +df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1') + +def save_info_to_excel(info_list, output_filename): + df = pd.DataFrame(info_list, columns=['Group', 'Name' , 'URL']) + df.to_excel(output_filename, index=False) processes = [] # 定义 SIGINT 信号处理函数 @@ -23,22 +28,27 @@ for ind, row in df.iterrows(): group = row['单位'] name = row['主办'] url = row['地址'] - if 'http' in url: - sx = row['地址'].split('http') - ename = sx[0].strip() - if ename: - name = ename - url = 'http' + sx[1] - elif 'www' in url: - sx = row['地址'].split('www') - ename = sx[0].strip() - if ename: - name = ename - url = 'http://www' + sx[1] domain = urlparse(url).netloc cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx'] process = subprocess.Popen(cmd) processes.append(process) ind +=1 - # if ind > 0: - # break \ No newline at end of file + +# Wait for all processes to finish +for process in processes: + process.wait() + +# Check output file sizes and save information if size is less than 20KB +info_to_save = [] +for ind, row in df.iterrows(): + name = row['主办'] + url = row['地址'] + domain = urlparse(row['地址']).netloc + output_filename = f'web_dir/{name}_{domain}.xlsx' + if os.path.exists(output_filename): + file_size = os.path.getsize(output_filename) + if file_size < 20 * 1024: # Convert KB to bytes + info_to_save.append([row['单位'], name, row['地址']]) + +if info_to_save: + save_info_to_excel(info_to_save, 'failed_files.xlsx')