selenium修改
This commit is contained in:
parent
5a2129859f
commit
32e0934792
39
scrape.py
39
scrape.py
|
@ -138,24 +138,29 @@ def main():
|
||||||
name = row['主办']
|
name = row['主办']
|
||||||
url = row['地址']
|
url = row['地址']
|
||||||
domain = urlparse(url).netloc
|
domain = urlparse(url).netloc
|
||||||
# Open the website
|
|
||||||
driver = open_website(url)
|
|
||||||
# Retrieve cookies from previous session
|
|
||||||
cookies = get_cookies_from_previous_session(driver)
|
|
||||||
# Add cookies to the WebDriver
|
|
||||||
add_cookies(driver, cookies)
|
|
||||||
# Initialize the set to store visited pages
|
|
||||||
visited_pages = set()
|
|
||||||
# Initialize the data list
|
|
||||||
data = []
|
|
||||||
# Process the starting page and follow hyperlinks recursively
|
|
||||||
process_page(driver, url, visited_pages, domain, data)
|
|
||||||
# Export data to a separate Excel file for each URL
|
|
||||||
output_filename = f'web_dir/{name}_{domain}.xlsx'
|
|
||||||
export_to_excel(data, output_filename)
|
|
||||||
|
|
||||||
# Close the WebDriver
|
# Open the website
|
||||||
driver.quit()
|
driver = open_website(url)
|
||||||
|
|
||||||
|
# Retrieve cookies from previous session
|
||||||
|
cookies = get_cookies_from_previous_session(driver)
|
||||||
|
# Add cookies to the WebDriver
|
||||||
|
add_cookies(driver, cookies)
|
||||||
|
|
||||||
|
# Initialize the set to store visited pages
|
||||||
|
visited_pages = set()
|
||||||
|
# Initialize the data list
|
||||||
|
data = []
|
||||||
|
|
||||||
|
# Process the starting page and follow hyperlinks recursively
|
||||||
|
process_page(driver, url, visited_pages, domain, data)
|
||||||
|
|
||||||
|
# Export data to a separate Excel file in the web_dir directory
|
||||||
|
output_filename = f'web_dir/{name}_{domain}.xlsx'
|
||||||
|
export_to_excel(data, output_filename)
|
||||||
|
|
||||||
|
# Close the WebDriver
|
||||||
|
driver.quit()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
Loading…
Reference in New Issue