from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import time
from urllib.parse import urlparse
from pathlib import Path
import pandas as pd


def open_website(url):
    # Set up Chrome WebDriver with a custom User-Agent
    options = Options()
    options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
    )
    # Selenium 4 expects the driver path to be wrapped in a Service object
    driver = webdriver.Chrome(service=Service("./chromedriver.exe"), options=options)
    driver.get(url)
    return driver


def extract_hyperlinks(driver):
    # Find all elements with an href attribute
    elements = driver.find_elements(By.XPATH, '//*[@href]')
    # Extract the href values from the elements
    hrefs = [element.get_attribute('href') for element in elements]
    return hrefs


def ignore_image_and_document_hrefs(href):
    parsed_url = urlparse(href)
    path = parsed_url.path
    file_extension = Path(path).suffix
    # Check if the href points to an image, document, archive, or static asset
    return file_extension.lower() in (
        '.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx',
        '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js'
    )


def process_page(driver, url, visited_pages, start_domain, data):
    # Add the URL to the set of visited pages
    visited_pages.add(url)

    # Navigate to the URL
    driver.get(url)

    # Wait for the page to load
    time.sleep(2)

    # Extract the text content of the page body
    content_element = driver.find_element(By.XPATH, '//body')
    content_text = content_element.text
    print(content_text)

    # Add domain, URL, and content to the data list
    data.append([start_domain, url, content_text])

    # Find and process hyperlinks
    hrefs = extract_hyperlinks(driver)
    for href in hrefs:
        # Skip hrefs pointing to images, documents, or other static assets
        if ignore_image_and_document_hrefs(href):
            continue

        # Skip hrefs of the form "javascript:void(0)"
        if href.startswith("javascript:void(0)"):
            continue

        # Skip hrefs that lead back to the current page or have already been visited
        if check_href(href, driver.current_url, visited_pages):
            continue

        try:
            # Skip hrefs that do not belong to the same domain as the starting URL
            parsed_href = urlparse(href)
            if parsed_href.netloc != start_domain:
                continue

            # Open the href in the same tab and retrieve its content
            driver.get(href)
            print(href)

            # Wait for the page to load
            time.sleep(2)

            # Extract the text content of the linked page
            hyperlink_content_element = driver.find_element(By.XPATH, '//body')
            hyperlink_content_text = hyperlink_content_element.text
            print(hyperlink_content_text)

            # Add domain, URL, and content of the linked page to the data list
            data.append([start_domain, href, hyperlink_content_text])

            # Recursively process the linked page and follow its hyperlinks
            process_page(driver, href, visited_pages, start_domain, data)
        except Exception as e:
            print(f"Error processing hyperlink: {href}")
            print(f"Error message: {str(e)}")
            continue

    # Return to the original page
    driver.get(url)


def check_href(href, original_url, visited_pages):
    parsed_href = urlparse(href)
    parsed_original_url = urlparse(original_url)

    # Check if the href leads back to the original page
    if (parsed_href.netloc == parsed_original_url.netloc
            and parsed_href.path == parsed_original_url.path
            and parsed_href.fragment == parsed_original_url.fragment):
        return True

    # Check if the href has already been visited
    if href in visited_pages:
        return True

    return False


def export_to_excel(data):
    # Create separate lists for domain, URL, and content
    domains = [item[0] for item in data]
    urls = [item[1] for item in data]
    texts = [item[2] for item in data]

    # Create a DataFrame from the data lists
    df = pd.DataFrame({'domain': domains, 'url': urls, 'text': texts})

    # Export the DataFrame to an Excel file
    df.to_excel('output.xlsx', index=False)


def get_cookies_from_previous_session(driver):
    cookies = {}
    try:
        # Execute JavaScript to read the cookies via the jQuery Cookie plugin;
        # this only works if the page itself loads jQuery and the plugin
        cookie_script = """
            var cookies = $.cookie();
            return Object.entries(cookies).map(([name, value]) => `${name}=${value}`);
        """
        cookie_values = driver.execute_script(cookie_script)

        # Parse the "name=value" strings into a dictionary; split on the first
        # '=' only, since cookie values may themselves contain '='
        for cookie_value in cookie_values:
            cookie_name, cookie_value = cookie_value.split('=', 1)
            cookies[cookie_name] = cookie_value
    except Exception as e:
        print('Error getting cookies:', e)
    return cookies


def add_cookies(driver, cookies):
    for name, value in cookies.items():
        driver.add_cookie({'name': name, 'value': value})


def main():
    # Starting URL
    start_url = 'https://www.cbma.com.cn/'

    # Parse the domain from the starting URL
    parsed_start_url = urlparse(start_url)
    start_domain = parsed_start_url.netloc

    # Open the website
    driver = open_website(start_url)

    # Retrieve cookies from the previous session
    cookies = get_cookies_from_previous_session(driver)

    # Add the cookies to the WebDriver
    add_cookies(driver, cookies)

    # Initialize the set of visited pages
    visited_pages = set()

    # Initialize the data list
    data = []

    # Process the starting page and follow hyperlinks recursively
    process_page(driver, start_url, visited_pages, start_domain, data)

    # Export the data to an Excel file
    export_to_excel(data)

    # Close the WebDriver
    driver.quit()


if __name__ == "__main__":
    main()
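

# --- Optional: plugin-free cookie retrieval (a minimal sketch, not part of the
# original flow; the helper name get_cookies_native is illustrative only).
# get_cookies_from_previous_session() above assumes the page loads jQuery and
# the jQuery Cookie plugin; on sites without it, the script falls back to an
# empty cookie dict. Selenium's built-in cookie API avoids that assumption and
# yields the same {name: value} shape expected by add_cookies().
def get_cookies_native(driver):
    # driver.get_cookies() returns a list of dicts, each with 'name' and 'value' keys
    return {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}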