# zcspider/scrape.py

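"""Crawl each site listed in failed_files.xlsx with Selenium: visit every
same-domain page reachable from the start URL, collect its visible text, and
export the results to one Excel file per site under web_dir/."""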

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
from urllib.parse import urlparse
from pathlib import Path
import pandas as pd


def open_website(url):
    # Set up Chrome WebDriver with a custom User-Agent, images disabled, and notifications blocked
    options = Options()
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
    prefs = {
        # 2 = block: skip image downloads and notification prompts
        "profile.managed_default_content_settings.images": 2,
        "profile.default_content_setting_values.notifications": 2,
    }
    options.add_experimental_option("prefs", prefs)
    # Selenium 3-style call with a local chromedriver binary;
    # Selenium 4.10+ expects a Service object instead of a path
    driver = webdriver.Chrome("./chromedriver.exe", options=options)
    driver.get(url)
    return driver


def extract_hyperlinks(driver):
    # Find all elements that carry an href attribute
    elements = driver.find_elements(By.XPATH, '//*[@href]')
    # Extract the href values from the elements
    hrefs = [element.get_attribute('href') for element in elements]
    return hrefs


def ignore_image_and_document_hrefs(href):
    parsed_url = urlparse(href)
    path = parsed_url.path
    file_extension = Path(path).suffix
    # Check whether the href points to an image, document, archive, or other non-HTML resource
    return file_extension.lower() in ('.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js')


def process_page(driver, group, name, url, visited_pages, start_domain, data):
    # Mark the URL as visited before navigating
    visited_pages.add(url)
    # Navigate to the URL
    driver.get(url)
    # Wait for the page to load
    time.sleep(2)
    # Extract the visible text from the page body
    content_element = driver.find_element(By.XPATH, '//body')
    content_text = content_element.text
    print(content_text)
    # Record group, name, domain, URL, and page text for export
    data.append([group, name, start_domain, url, content_text])
    # Find and process hyperlinks
    hrefs = extract_hyperlinks(driver)
    for href in hrefs:
        # Skip links to images, documents, and other non-HTML resources
        if ignore_image_and_document_hrefs(href):
            continue
        # Skip "javascript:void(0)" pseudo-links
        if href.startswith("javascript:void(0)"):
            continue
        # Skip links that lead back to the current page or have already been visited
        if check_href(href, driver.current_url, visited_pages):
            continue
        try:
            # Only follow links that belong to the same domain as the starting URL
            parsed_href = urlparse(href)
            if parsed_href.netloc != start_domain:
                continue
            print(href)
            # Recursively process the linked page; the recursive call navigates
            # to it, extracts its text, and records it in data
            process_page(driver, group, name, href, visited_pages, start_domain, data)
        except Exception as e:
            print(f"Error processing hyperlink: {href}")
            print(f"Error message: {str(e)}")
            continue
    # Return to the original page
    driver.get(url)


def check_href(href, original_url, visited_pages):
    parsed_href = urlparse(href)
    parsed_original_url = urlparse(original_url)
    # Check if the href leads back to the original page
    if parsed_href.netloc == parsed_original_url.netloc and parsed_href.path == parsed_original_url.path:
        return True
    # Check if the href has already been visited
    if href in visited_pages:
        return True
    return False


def export_to_excel(data, output_filename):
    # Each row collected by process_page is [group, name, domain, url, text]
    df = pd.DataFrame(data, columns=['group', 'name', 'domain', 'url', 'text'])
    # Export the DataFrame to an Excel file
    df.to_excel(output_filename, index=False)


def get_cookies_from_previous_session(driver):
    cookies = {}
    try:
        # Read the cookies visible to JavaScript on the current page
        # (document.cookie returns "name=value; name=value; ...")
        cookie_string = driver.execute_script("return document.cookie;")
        for pair in cookie_string.split('; '):
            if not pair:
                continue
            # Split only on the first '=' so values containing '=' stay intact
            cookie_name, cookie_value = pair.split('=', 1)
            cookies[cookie_name] = cookie_value
    except Exception as e:
        print('Error getting cookies:', e)
    return cookies


def add_cookies(driver, cookies):
    for name, value in cookies.items():
        driver.add_cookie({'name': name, 'value': value})


def main():
    # Read the list of failed URLs from the Excel file
    df = pd.read_excel('failed_files.xlsx')
    for ind, row in df.iterrows():
        group = row['单位']  # organization ("单位") column
        name = row['主办']   # organizer / site name ("主办") column
        url = row['地址']    # URL ("地址") column
        domain = urlparse(url).netloc
        # Open the website
        driver = open_website(url)
        # Retrieve cookies from the previous session
        cookies = get_cookies_from_previous_session(driver)
        # Add the cookies to the WebDriver
        add_cookies(driver, cookies)
        # Initialize the set of visited pages
        visited_pages = set()
        # Initialize the data list
        data = []
        # Process the starting page and follow hyperlinks recursively
        process_page(driver, group, name, url, visited_pages, domain, data)
        # Export the data to a separate Excel file in the web_dir directory
        Path('web_dir').mkdir(exist_ok=True)
        output_filename = f'web_dir/{name}_{domain}.xlsx'
        export_to_excel(data, output_filename)
        # Close the WebDriver
        driver.quit()


if __name__ == "__main__":
    main()