From a877b198f6c4b3df008749ad513084f4a5d75ada Mon Sep 17 00:00:00 2001
From: xiaobulu27
Date: Thu, 24 Aug 2023 10:55:58 +0800
Subject: [PATCH] web.py changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scrape_nj.py => scrape.py | 45 ++++++++++++++++++++++-----------------
 summary/summary juin.md   | 23 --------------------
 2 files changed, 26 insertions(+), 42 deletions(-)
 rename scrape_nj.py => scrape.py (81%)
 delete mode 100644 summary/summary juin.md

diff --git a/scrape_nj.py b/scrape.py
similarity index 81%
rename from scrape_nj.py
rename to scrape.py
index ed7628c..574263f 100644
--- a/scrape_nj.py
+++ b/scrape.py
@@ -10,6 +10,8 @@ def open_website(url):
     # Set up Chrome WebDriver with custom User-Agent
     options = Options()
     options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
+    prefs = {"profile.managed_default_content_settings.images": 2, 'notifications':2}
+    options.add_experimental_option("prefs", prefs)
     driver = webdriver.Chrome("./chromedriver.exe", options=options)
     driver.get(url)
     return driver
@@ -84,26 +86,26 @@ def check_href(href, original_url, visited_pages):
     parsed_href = urlparse(href)
     parsed_original_url = urlparse(original_url)
     # Check if the href leads back to the original page
-    if parsed_href.netloc == parsed_original_url.netloc and parsed_href.path == parsed_original_url.path and parsed_href.fragment == parsed_original_url.fragment:
+    if parsed_href.netloc == parsed_original_url.netloc and parsed_href.path == parsed_original_url.path:
         return True
     # Check if the href has already been visited
     if href in visited_pages:
         return True
     return False
 
 
-def export_to_excel(data):
-    # Create separate lists for URL, Domain, and Content
-    domains = [item[0] for item in data]
-    urls = [item[1] for item in data]
-    texts = [item[2] for item in data]
+def export_to_excel(data, output_filename):
+    # Create separate lists for each column
+    groups = [item[0] for item in data]
+    names = [item[1] for item in data]
+    domains = [item[2] for item in data]
+    urls = [item[3] for item in data]
+    texts = [item[4] for item in data]
     # Create a DataFrame from the data lists
-    df = pd.DataFrame({'domain': domains, 'url': urls, 'text': texts})
+    df = pd.DataFrame({'group': groups, 'name': names, 'domain': domains, 'url': urls, 'text': texts})
     # Export the DataFrame to an Excel file
-    df.to_excel('output.xlsx', index=False)
-
-
+    df.to_excel(output_filename, index=False)
 
 
 def get_cookies_from_previous_session(driver):
     cookies = {}
@@ -128,13 +130,16 @@ def add_cookies(driver, cookies):
         driver.add_cookie({'name': name, 'value': value})
 
 def main():
-    # Starting URL
-    start_url = 'https://www.cbma.com.cn/'
-    # Parse the domain from the starting URL
-    parsed_start_url = urlparse(start_url)
-    start_domain = parsed_start_url.netloc
+    # Read failed URLs from the list
+    df = pd.read_excel('failed_files.xlsx')
+
+    for ind, row in df.iterrows():
+        group = row['单位']  # Replace with the actual column name for group
+        name = row['主办']
+        url = row['地址']
+        domain = urlparse(url).netloc
     # Open the website
-    driver = open_website(start_url)
+        driver = open_website(url)
     # Retrieve cookies from previous session
     cookies = get_cookies_from_previous_session(driver)
     # Add cookies to the WebDriver
@@ -144,9 +149,11 @@ def main():
     # Initialize the data list
     data = []
     # Process the starting page and follow hyperlinks recursively
-    process_page(driver, start_url, visited_pages, start_domain, data)
-    # Export the data to an Excel file
-    export_to_excel(data)
+        process_page(driver, url, visited_pages, domain, data)
+        # Export data to a separate Excel file for each URL
+        output_filename = f'web_dir/{name}_{domain}.xlsx'
+        export_to_excel(data, output_filename)
+
     # Close the WebDriver
     driver.quit()
 
diff --git a/summary/summary juin.md b/summary/summary juin.md
deleted file mode 100644
index dd6b02b..0000000
--- a/summary/summary juin.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# Project summary 2023.6
-
-Crawling ran June 5-9; analysis ran June 12-13.
-
-## Organizations covered
-
-Data was crawled and analyzed for two outlet types, official websites and WeChat official accounts: 69 official websites (63 crawled successfully) and 102 WeChat official accounts.
-For official websites, every link under the same domain was crawled; for WeChat official accounts, all historical articles were crawled.
-
-## Analysis results
-
-Analysis was carried out per the analysis requirements: 187 errors were found on the official websites and 39 on the WeChat official accounts. See the results table for details.
-
-## Outstanding issues
-
-Some websites could not be crawled for various reasons, listed below.
-| Organization | Possible cause |
-| ---- | ---- |
-| 中国建筑材料科学研究总院有限公司_http://www.cbma.com | Not accessible |
-| 中国建材检验认证集团江苏有限公司_http://www.ctcjs.com | Not accessible |
-| 乌鲁木齐京诚检测技术有限公司_http://www.wlmqjc.cn/ | Website domain expired |
-| 中材江西电瓷电气有限公司_http://www.sinoma-insulator.com | Not accessible |
-| 中国新型建材设计研究院有限公司_http://www.cnhdi.com/ | Not accessible |
\ No newline at end of file
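
Note on the new prefs block in open_website(): setting profile.managed_default_content_settings.images to 2 tells Chrome to block image downloads, which speeds up a text-only crawl. Below is a minimal standalone sketch of the same option wiring, assuming the Selenium 3-style constructor and the ./chromedriver.exe path the script already uses; make_driver is a hypothetical helper, not part of the patch.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def make_driver(chromedriver_path='./chromedriver.exe'):
    options = Options()
    # Spoof a desktop Chrome User-Agent, mirroring the script.
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
    # 2 = block; image downloads are skipped while HTML and text still load.
    options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    # Selenium 3-style positional executable path, as in the patched script.
    return webdriver.Chrome(chromedriver_path, options=options)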
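
Note on the reworked main() and export_to_excel(): each row of failed_files.xlsx (columns 单位, 主办, 地址) is crawled separately and its records are written to web_dir/{name}_{domain}.xlsx with columns group, name, domain, url, text. Below is a rough sketch of that per-row read-and-export flow under the same assumptions; the crawl itself is stubbed out since process_page() is unchanged, and the os.makedirs guard is an extra step the patch does not add (it assumes web_dir/ already exists).

import os
from urllib.parse import urlparse

import pandas as pd

def export_failed_list(input_xlsx='failed_files.xlsx', out_dir='web_dir'):
    os.makedirs(out_dir, exist_ok=True)  # not in the patch; avoids failing on a missing folder
    df = pd.read_excel(input_xlsx)
    for _, row in df.iterrows():
        group, name, url = row['单位'], row['主办'], row['地址']
        domain = urlparse(url).netloc
        # Stand-in for the (group, name, domain, url, text) tuples that
        # process_page() would append while crawling.
        data = [(group, name, domain, url, '')]
        out = pd.DataFrame(data, columns=['group', 'name', 'domain', 'url', 'text'])
        out.to_excel(os.path.join(out_dir, f'{name}_{domain}.xlsx'), index=False)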