From 54019e9e5d1a0d6baa01c5e281073694693ce7cd Mon Sep 17 00:00:00 2001
From: caoqianming
Date: Tue, 29 Aug 2023 12:29:57 +0800
Subject: [PATCH] feat: add chrom_main_from_list; stop saving failed_sites
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mycode/crawl_chrome.py   | 30 ++++++++++++++++++++++++++++++
 readme.md                |  9 ++++++++-
 web3.py                  |  7 +++----
 zcspider/spiders/base.py |  4 ++--
 4 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/mycode/crawl_chrome.py b/mycode/crawl_chrome.py
index 0c7d1ad..7d9c7b0 100644
--- a/mycode/crawl_chrome.py
+++ b/mycode/crawl_chrome.py
@@ -133,6 +133,36 @@ def add_cookies(driver, cookies):
     for name, value in cookies.items():
         driver.add_cookie({'name': name, 'value': value})
 
+def chrom_main_from_list(sites):
+    for ind, item in enumerate(sites):
+        group = item[0]  # each row is (group, name, url)
+        name = item[1]
+        url = item[2]
+        domain = urlparse(url).netloc.replace("www.", "")
+
+        # Open the website
+        driver = open_website(url)
+
+        # Retrieve cookies from previous session
+        cookies = get_cookies_from_previous_session(driver)
+        # Add cookies to the WebDriver
+        add_cookies(driver, cookies)
+
+        # Initialize the set to store visited pages
+        visited_pages = set()
+        # Initialize the data list
+        data = []
+
+        # Process the starting page and follow hyperlinks recursively
+        process_page(driver, url, visited_pages, domain, data)
+
+        # Export data to a separate Excel file in the web_dir directory
+        output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
+        export_to_excel(data, output_filename)
+
+        # Close the WebDriver
+        driver.quit()
+
 def chrome_main():
     # Read failed URLs from the list
     df = pd.read_excel(failed_sites_file)
diff --git a/readme.md b/readme.md
index eb7cb65..e2d1fb4 100644
--- a/readme.md
+++ b/readme.md
@@ -1,12 +1,19 @@
 ## Installation notes
 Be sure to uninstall the locally installed Chrome browser first, then install chrome117.exe from this folder
+
+Make sure Chrome cannot update itself automatically. How to do this:
+修改 "C:\Windows\System32\drivers\etc\host 文件,在最后增加 +127.0.0.1 update.googleapis.com
+After saving, press Win+R, type cmd to open a command prompt, and run "ipconfig /flushdns" to refresh the DNS cache
+
 biao.xlsx is the reference comparison file and must be kept up to date
+
 ## Usage
 Used for content inspection; follow the steps below in order
 
 ### 1. Crawl WeChat official account content
-Run wechat.exe, then follow the steps after the page opens; see the video tutorial at [https://www.bilibili.com/video/BV1kh411c7GX/]
+Run wechat.exe, then follow the steps after the page opens; see the video tutorial at https://www.bilibili.com/video/BV1kh411c7GX/
 ### 2. Prepare the list of official websites to crawl
 Open web_sites.xlsx, edit it, then save and close
diff --git a/web3.py b/web3.py
index 94d0006..3b9a5e0 100644
--- a/web3.py
+++ b/web3.py
@@ -9,7 +9,7 @@ from openpyxl import load_workbook
 
 from mycode.base import BASE_DIR
 from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
-from mycode.crawl_chrome import chrome_main, failed_sites_file
+from mycode.crawl_chrome import chrom_main_from_list, failed_sites_file
 
 python_exe = os.path.join(BASE_DIR, 'runtime/python.exe')
 # scrapy_exe = os.path.join(BASE_DIR, 'runtime/Scripts/scrapy.exe')
@@ -76,9 +76,8 @@ if __name__ == '__main__':
 
     if info_to_save:
         print('存在未爬取站点,正在调用Chrome继续爬取。。。')
-        save_info_to_excel(info_to_save, failed_sites_file)
-        chrome_main()
-        os.remove(failed_sites_file)
+        chrom_main_from_list(info_to_save)
+        # os.remove(failed_sites_file)
 
     print('网站爬取完毕!')
diff --git a/zcspider/spiders/base.py b/zcspider/spiders/base.py
index 4b966ec..ac11414 100644
--- a/zcspider/spiders/base.py
+++ b/zcspider/spiders/base.py
@@ -26,7 +26,7 @@ class BaseSpider(scrapy.Spider):
         self.name = name
         self.group = group
         self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.dat', '.css', '.js'])
-        print(f"爬取开始: {name}-{domain}")
+        print(f"爬取开始: {name}_{domain}")
 
     def start_requests(self):
         for url in self.start_urls:
@@ -83,4 +83,4 @@ class BaseSpider(scrapy.Spider):
 
     def closed(self, reason):
         # This method will be called when the Spider is about to close
-        print(f'爬取完成: {self.name}-{self.domain}')
\ No newline at end of file
+        print(f'爬取完成: {self.name}_{self.domain}')
\ No newline at end of file
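
Note on the main change: chrom_main_from_list consumes the same in-memory rows that web3.py previously round-tripped through failed_sites.xlsx; each item is an indexable sequence of (group, name, url). A minimal calling sketch, assuming web_sites.xlsx has columns literally named group, name and url (an assumption for illustration; the patch does not specify the headers):

import pandas as pd
from mycode.crawl_chrome import chrom_main_from_list

# Assumed column names; the real sheet may use different headers.
df = pd.read_excel("web_sites.xlsx")
sites = df[["group", "name", "url"]].values.tolist()  # -> [[group, name, url], ...]
chrom_main_from_list(sites)  # writes one web_dir/{name}_{domain}.xlsx per site

Because the list is handed over directly, the save_info_to_excel / os.remove round trip in web3.py is no longer needed, which is why the os.remove call is now commented out rather than executed.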
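
The Chrome auto-update lockdown added to readme.md can also be scripted instead of edited by hand. A rough sketch, assuming it is run from an elevated (administrator) Windows prompt; the helper name block_chrome_update is hypothetical and not part of this repository:

import subprocess

HOSTS_PATH = r"C:\Windows\System32\drivers\etc\hosts"
ENTRY = "127.0.0.1 update.googleapis.com"

def block_chrome_update():
    # Append the blocking entry only if it is not already present.
    with open(HOSTS_PATH, "r+", encoding="utf-8") as f:
        content = f.read()
        if ENTRY not in content:
            f.write(("" if content.endswith("\n") else "\n") + ENTRY + "\n")
    # Flush the Windows DNS cache so the new entry takes effect immediately.
    subprocess.run(["ipconfig", "/flushdns"], check=True)

if __name__ == "__main__":
    block_chrome_update()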