# zcspider/scrape.py

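"""Crawl each site listed in failed_files.xlsx with Selenium: visit every
same-domain page reachable from the start URL, collect its visible text, and
export the results to one Excel file per site under web_dir/."""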

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
from urllib.parse import urlparse
from pathlib import Path
import pandas as pd


def open_website(url):
    # Set up Chrome WebDriver with a custom User-Agent, images disabled, and notifications blocked
    options = Options()
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
    prefs = {
        # 2 = block: skip image downloads and notification prompts
        "profile.managed_default_content_settings.images": 2,
        "profile.default_content_setting_values.notifications": 2,
    }
    options.add_experimental_option("prefs", prefs)
    # Selenium 3-style call with a local chromedriver binary;
    # Selenium 4.10+ expects a Service object instead of a path
    driver = webdriver.Chrome("./chromedriver.exe", options=options)
    driver.get(url)
    return driver


def extract_hyperlinks(driver):
    # Find all elements that carry an href attribute
    elements = driver.find_elements(By.XPATH, '//*[@href]')
    # Extract the href values from the elements
    hrefs = [element.get_attribute('href') for element in elements]
    return hrefs


def ignore_image_and_document_hrefs(href):
    parsed_url = urlparse(href)
    path = parsed_url.path
    file_extension = Path(path).suffix
    # Check whether the href points to an image, document, archive, or other non-HTML resource
    return file_extension.lower() in ('.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js')


def process_page(driver, group, name, url, visited_pages, start_domain, data):
    # Mark the URL as visited before navigating
    visited_pages.add(url)
    # Navigate to the URL
    driver.get(url)
    # Wait for the page to load
    time.sleep(2)
    # Extract the visible text from the page body
    content_element = driver.find_element(By.XPATH, '//body')
    content_text = content_element.text
    print(content_text)
    # Record group, name, domain, URL, and page text for export
    data.append([group, name, start_domain, url, content_text])
    # Find and process hyperlinks
    hrefs = extract_hyperlinks(driver)
    for href in hrefs:
        # Skip links to images, documents, and other non-HTML resources
        if ignore_image_and_document_hrefs(href):
            continue
        # Skip "javascript:void(0)" pseudo-links
        if href.startswith("javascript:void(0)"):
            continue
        # Skip links that lead back to the current page or have already been visited
        if check_href(href, driver.current_url, visited_pages):
            continue
        try:
            # Only follow links that belong to the same domain as the starting URL
            parsed_href = urlparse(href)
            if parsed_href.netloc != start_domain:
                continue
            print(href)
            # Recursively process the linked page; the recursive call navigates
            # to it, extracts its text, and records it in data
            process_page(driver, group, name, href, visited_pages, start_domain, data)
        except Exception as e:
            print(f"Error processing hyperlink: {href}")
            print(f"Error message: {str(e)}")
            continue
    # Return to the original page
    driver.get(url)


def check_href(href, original_url, visited_pages):
    parsed_href = urlparse(href)
    parsed_original_url = urlparse(original_url)
    # Check if the href leads back to the original page
    if parsed_href.netloc == parsed_original_url.netloc and parsed_href.path == parsed_original_url.path:
        return True
    # Check if the href has already been visited
    if href in visited_pages:
        return True
    return False


def export_to_excel(data, output_filename):
    # Each row collected by process_page is [group, name, domain, url, text]
    df = pd.DataFrame(data, columns=['group', 'name', 'domain', 'url', 'text'])
    # Export the DataFrame to an Excel file
    df.to_excel(output_filename, index=False)


def get_cookies_from_previous_session(driver):
    cookies = {}
    try:
        # Read the cookies visible to JavaScript on the current page
        # (document.cookie returns "name=value; name=value; ...")
        cookie_string = driver.execute_script("return document.cookie;")
        for pair in cookie_string.split('; '):
            if not pair:
                continue
            # Split only on the first '=' so values containing '=' stay intact
            cookie_name, cookie_value = pair.split('=', 1)
            cookies[cookie_name] = cookie_value
    except Exception as e:
        print('Error getting cookies:', e)
    return cookies


def add_cookies(driver, cookies):
    for name, value in cookies.items():
        driver.add_cookie({'name': name, 'value': value})


def main():
    # Read the list of failed URLs from the Excel file
    df = pd.read_excel('failed_files.xlsx')
    for ind, row in df.iterrows():
        group = row['单位']  # organization ("单位") column
        name = row['主办']   # organizer / site name ("主办") column
        url = row['地址']    # URL ("地址") column
        domain = urlparse(url).netloc
        # Open the website
        driver = open_website(url)
        # Retrieve cookies from the previous session
        cookies = get_cookies_from_previous_session(driver)
        # Add the cookies to the WebDriver
        add_cookies(driver, cookies)
        # Initialize the set of visited pages
        visited_pages = set()
        # Initialize the data list
        data = []
        # Process the starting page and follow hyperlinks recursively
        process_page(driver, group, name, url, visited_pages, domain, data)
        # Export the data to a separate Excel file in the web_dir directory
        Path('web_dir').mkdir(exist_ok=True)
        output_filename = f'web_dir/{name}_{domain}.xlsx'
        export_to_excel(data, output_filename)
        # Close the WebDriver
        driver.quit()


if __name__ == "__main__":
    main()