From a395ab6867156f9e3d77581659fb12c51fe338fa Mon Sep 17 00:00:00 2001
From: xiaobulu27 <xiaobulu27@outlook.com>
Date: Thu, 24 Aug 2023 10:44:48 +0800
Subject: [PATCH] =?UTF-8?q?web.py=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 web.py | 40 +++++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/web.py b/web.py
index b81b7e9..dc8b8fe 100644
--- a/web.py
+++ b/web.py
@@ -1,10 +1,15 @@
+import os
 import subprocess
 import pandas as pd
 from urllib.parse import urlparse
 import signal
 import sys
 
-df = pd.read_excel('web_sites_full.xlsx', sheet_name='Sheet1')
+df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
+
+def save_info_to_excel(info_list, output_filename):
+    """Write the rows in info_list to output_filename as an Excel sheet with columns Group, Name, URL (no index)."""
+    pd.DataFrame(info_list, columns=['Group', 'Name', 'URL']).to_excel(output_filename, index=False)
 
 processes = []
 # 定义 SIGINT 信号处理函数
@@ -23,22 +28,27 @@ for ind, row in df.iterrows():
         group = row['单位']
         name = row['主办']
         url = row['地址']
-        if 'http' in url:
-            sx = row['地址'].split('http')
-            ename = sx[0].strip()
-            if ename:
-                name = ename
-            url = 'http' + sx[1]
-        elif 'www' in url:
-            sx = row['地址'].split('www')
-            ename = sx[0].strip()
-            if ename:
-                name = ename
-            url = 'http://www' + sx[1]
         domain = urlparse(url).netloc
         cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
         process = subprocess.Popen(cmd)
         processes.append(process)
         ind +=1
-        # if ind > 0:
-        #     break
\ No newline at end of file
+
+# Block until every spawned crawler subprocess has exited.
+for process in processes:
+    process.wait()
+
+# Collect sites whose existing output file is under 20 KB for failed_files.xlsx.
+# NOTE(review): a site with no output file at all is skipped here - confirm intended.
+SIZE_LIMIT = 20 * 1024  # 20 KB expressed in bytes
+info_to_save = []
+for _, site in df.iterrows():
+    name, url = site['主办'], site['地址']
+    output_filename = f'web_dir/{name}_{urlparse(url).netloc}.xlsx'
+    if not os.path.exists(output_filename):
+        continue
+    if os.path.getsize(output_filename) < SIZE_LIMIT:
+        info_to_save.append([site['单位'], name, url])
+
+if info_to_save:
+    save_info_to_excel(info_to_save, 'failed_files.xlsx')