From 54019e9e5d1a0d6baa01c5e281073694693ce7cd Mon Sep 17 00:00:00 2001
From: caoqianming
Date: Tue, 29 Aug 2023 12:29:57 +0800
Subject: [PATCH] feat: add chrom_main_from_list; stop saving failed_sites
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mycode/crawl_chrome.py   | 30 ++++++++++++++++++++++++++++++
 readme.md                |  9 ++++++++-
 web3.py                  |  7 +++----
 zcspider/spiders/base.py |  4 ++--
 4 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/mycode/crawl_chrome.py b/mycode/crawl_chrome.py
index 0c7d1ad..7d9c7b0 100644
--- a/mycode/crawl_chrome.py
+++ b/mycode/crawl_chrome.py
@@ -133,6 +133,36 @@ def add_cookies(driver, cookies):
     for name, value in cookies.items():
         driver.add_cookie({'name': name, 'value': value})
 
+def chrom_main_from_list(sites):
+    for ind, item in enumerate(sites):
+        group = item[0]  # each row is (group, name, url)
+        name = item[1]
+        url = item[2]
+        domain = urlparse(url).netloc.replace("www.", "")
+
+        # Open the website
+        driver = open_website(url)
+
+        # Retrieve cookies from previous session
+        cookies = get_cookies_from_previous_session(driver)
+        # Add cookies to the WebDriver
+        add_cookies(driver, cookies)
+
+        # Initialize the set to store visited pages
+        visited_pages = set()
+        # Initialize the data list
+        data = []
+
+        # Process the starting page and follow hyperlinks recursively
+        process_page(driver, url, visited_pages, domain, data)
+
+        # Export data to a separate Excel file in the web_dir directory
+        output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
+        export_to_excel(data, output_filename)
+
+        # Close the WebDriver
+        driver.quit()
+
 def chrome_main():
     # Read failed URLs from the list
     df = pd.read_excel(failed_sites_file)
diff --git a/readme.md b/readme.md
index eb7cb65..e2d1fb4 100644
--- a/readme.md
+++ b/readme.md
@@ -1,12 +1,19 @@
 ## Installation notes
 Be sure to uninstall the locally installed Chrome browser first, then install chrome117.exe from this folder
+
+Make sure Chrome cannot update itself automatically. How to do this:
+修改 "C:\Windows\System32\drivers\etc\host 文件,在最后增加 +127.0.0.1 update.googleapis.com
+After saving, press Win+R, type cmd to open a command prompt, and run "ipconfig /flushdns" to refresh the DNS cache
+
 biao.xlsx is the reference comparison file and must be kept up to date
+
 ## Usage
 Used for content inspection; follow the steps below in order
 
 ### 1. Crawl WeChat official account content
-Run wechat.exe, then follow the steps after the page opens; see the video tutorial at [https://www.bilibili.com/video/BV1kh411c7GX/]
+Run wechat.exe, then follow the steps after the page opens; see the video tutorial at https://www.bilibili.com/video/BV1kh411c7GX/
 ### 2. Prepare the list of official websites to crawl
 Open web_sites.xlsx, edit it, then save and close
diff --git a/web3.py b/web3.py
index 94d0006..3b9a5e0 100644
--- a/web3.py
+++ b/web3.py
@@ -9,7 +9,7 @@ from openpyxl import load_workbook
 
 from mycode.base import BASE_DIR
 from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
-from mycode.crawl_chrome import chrome_main, failed_sites_file
+from mycode.crawl_chrome import chrom_main_from_list, failed_sites_file
 
 python_exe = os.path.join(BASE_DIR, 'runtime/python.exe')
 # scrapy_exe = os.path.join(BASE_DIR, 'runtime/Scripts/scrapy.exe')
@@ -76,9 +76,8 @@ if __name__ == '__main__':
 
     if info_to_save:
         print('存在未爬取站点,正在调用Chrome继续爬取。。。')
-        save_info_to_excel(info_to_save, failed_sites_file)
-        chrome_main()
-        os.remove(failed_sites_file)
+        chrom_main_from_list(info_to_save)
+        # os.remove(failed_sites_file)
 
     print('网站爬取完毕!')
diff --git a/zcspider/spiders/base.py b/zcspider/spiders/base.py
index 4b966ec..ac11414 100644
--- a/zcspider/spiders/base.py
+++ b/zcspider/spiders/base.py
@@ -26,7 +26,7 @@ class BaseSpider(scrapy.Spider):
         self.name = name
         self.group = group
         self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.dat', '.css', '.js'])
-        print(f"爬取开始: {name}-{domain}")
+        print(f"爬取开始: {name}_{domain}")
 
     def start_requests(self):
         for url in self.start_urls:
@@ -83,4 +83,4 @@ class BaseSpider(scrapy.Spider):
 
     def closed(self, reason):
         # This method will be called when the Spider is about to close
-        print(f'爬取完成: {self.name}-{self.domain}')
\ No newline at end of file
+        print(f'爬取完成: {self.name}_{self.domain}')
\ No newline at end of file
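
Note on the main change: chrom_main_from_list consumes the same in-memory rows that web3.py previously round-tripped through failed_sites.xlsx; each item is an indexable sequence of (group, name, url). A minimal calling sketch, assuming web_sites.xlsx has columns literally named group, name and url (an assumption for illustration; the patch does not specify the headers):

import pandas as pd
from mycode.crawl_chrome import chrom_main_from_list

# Assumed column names; the real sheet may use different headers.
df = pd.read_excel("web_sites.xlsx")
sites = df[["group", "name", "url"]].values.tolist()  # -> [[group, name, url], ...]
chrom_main_from_list(sites)  # writes one web_dir/{name}_{domain}.xlsx per site

Because the list is handed over directly, the save_info_to_excel / os.remove round trip in web3.py is no longer needed, which is why the os.remove call is now commented out rather than executed.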
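
The Chrome auto-update lockdown added to readme.md can also be scripted instead of edited by hand. A rough sketch, assuming it is run from an elevated (administrator) Windows prompt; the helper name block_chrome_update is hypothetical and not part of this repository:

import subprocess

HOSTS_PATH = r"C:\Windows\System32\drivers\etc\hosts"
ENTRY = "127.0.0.1 update.googleapis.com"

def block_chrome_update():
    # Append the blocking entry only if it is not already present.
    with open(HOSTS_PATH, "r+", encoding="utf-8") as f:
        content = f.read()
        if ENTRY not in content:
            f.write(("" if content.endswith("\n") else "\n") + ENTRY + "\n")
    # Flush the Windows DNS cache so the new entry takes effect immediately.
    subprocess.run(["ipconfig", "/flushdns"], check=True)

if __name__ == "__main__":
    block_chrome_update()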