From 32e093479205ac0d71d95dde7dceb68c92831a88 Mon Sep 17 00:00:00 2001 From: xiaobulu27 Date: Thu, 24 Aug 2023 17:06:02 +0800 Subject: [PATCH] =?UTF-8?q?selenium=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scrape.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/scrape.py b/scrape.py index 574263f..a593747 100644 --- a/scrape.py +++ b/scrape.py @@ -138,24 +138,29 @@ def main(): name = row['主办'] url = row['地址'] domain = urlparse(url).netloc - # Open the website - driver = open_website(url) - # Retrieve cookies from previous session - cookies = get_cookies_from_previous_session(driver) - # Add cookies to the WebDriver - add_cookies(driver, cookies) - # Initialize the set to store visited pages - visited_pages = set() - # Initialize the data list - data = [] - # Process the starting page and follow hyperlinks recursively - process_page(driver, url, visited_pages, domain, data) - # Export data to a separate Excel file for each URL - output_filename = f'web_dir/{name}_{domain}.xlsx' - export_to_excel(data, output_filename) - # Close the WebDriver - driver.quit() + # Open the website + driver = open_website(url) + + # Retrieve cookies from previous session + cookies = get_cookies_from_previous_session(driver) + # Add cookies to the WebDriver + add_cookies(driver, cookies) + + # Initialize the set to store visited pages + visited_pages = set() + # Initialize the data list + data = [] + + # Process the starting page and follow hyperlinks recursively + process_page(driver, url, visited_pages, domain, data) + + # Export data to a separate Excel file in the web_dir directory + output_filename = f'web_dir/{name}_{domain}.xlsx' + export_to_excel(data, output_filename) + + # Close the WebDriver + driver.quit() if __name__ == "__main__": main()