web.py修改
This commit is contained in:
parent
e3a77c94ec
commit
a395ab6867
40
web.py
40
web.py
|
@@ -1,10 +1,15 @@
|
||||||
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
import signal
|
import signal
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
df = pd.read_excel('web_sites_full.xlsx', sheet_name='Sheet1')
|
df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
|
||||||
|
|
||||||
|
def save_info_to_excel(info_list, output_filename):
    """Write a list of [group, name, url] rows to an Excel workbook.

    Parameters
    ----------
    info_list : list
        Rows of ``[group, name, url]`` (one entry per under-sized crawl
        result) to be reported.
    output_filename : str
        Path of the ``.xlsx`` file to create; overwritten if it exists.
    """
    # Use a distinct local name: the original `df` here shadowed the
    # module-level DataFrame loaded from the input workbook.
    report_df = pd.DataFrame(info_list, columns=['Group', 'Name', 'URL'])
    # index=False — the synthetic row index carries no information.
    report_df.to_excel(output_filename, index=False)
|
||||||
|
|
||||||
# Child scrapy crawler processes spawned below; tracked so the script can
# wait on them (and so a SIGINT handler can terminate them).
processes = []
# Define the SIGINT signal handler (handler body is outside this view)
|
||||||
|
@@ -23,22 +28,27 @@ for ind, row in df.iterrows():
|
||||||
group = row['单位']
|
group = row['单位']
|
||||||
name = row['主办']
|
name = row['主办']
|
||||||
url = row['地址']
|
url = row['地址']
|
||||||
if 'http' in url:
|
|
||||||
sx = row['地址'].split('http')
|
|
||||||
ename = sx[0].strip()
|
|
||||||
if ename:
|
|
||||||
name = ename
|
|
||||||
url = 'http' + sx[1]
|
|
||||||
elif 'www' in url:
|
|
||||||
sx = row['地址'].split('www')
|
|
||||||
ename = sx[0].strip()
|
|
||||||
if ename:
|
|
||||||
name = ename
|
|
||||||
url = 'http://www' + sx[1]
|
|
||||||
domain = urlparse(url).netloc
|
domain = urlparse(url).netloc
|
||||||
cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
|
cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
|
||||||
process = subprocess.Popen(cmd)
|
process = subprocess.Popen(cmd)
|
||||||
processes.append(process)
|
processes.append(process)
|
||||||
ind +=1
|
ind +=1
|
||||||
# if ind > 0:
|
|
||||||
# break
|
# Block until every spawned crawler subprocess has exited.
for crawler_proc in processes:
    crawler_proc.wait()
|
||||||
|
|
||||||
|
# Post-crawl audit: inspect each crawler's output workbook and collect the
# sites whose output is suspiciously small (< 20 KB) so they can be
# re-examined or re-crawled.
info_to_save = []

# `ind` was unused in the original loop — use `_` to make that explicit.
for _, row in df.iterrows():
    name = row['主办']  # organizer / site name column
    # Derive the output path the crawl loop used for this site.
    # (The original also assigned `url = row['地址']` here but never read it.)
    domain = urlparse(row['地址']).netloc
    output_filename = f'web_dir/{name}_{domain}.xlsx'

    # NOTE(review): sites whose output file is missing entirely are
    # currently skipped, not reported — confirm that is intended.
    if os.path.exists(output_filename):
        file_size = os.path.getsize(output_filename)
        if file_size < 20 * 1024:  # 20 KB threshold, expressed in bytes
            # [group, name, url] — column keys: 单位=group, 地址=URL.
            info_to_save.append([row['单位'], name, row['地址']])

if info_to_save:
    save_info_to_excel(info_to_save, 'failed_files.xlsx')
|
||||||
|
|
Loading…
Reference in New Issue