"""Recursive Selenium crawler.

Reads start URLs from failed_files.xlsx, follows same-domain hyperlinks
recursively, and exports each site's page text to its own Excel workbook
under web_dir/.
"""
import time
from pathlib import Path
from urllib.parse import urlparse

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
def open_website(url):
    """Launch a Chrome WebDriver with a desktop User-Agent and open *url*.

    Image loading is disabled via Chrome prefs to speed up page loads.
    Returns the live WebDriver instance; the caller is responsible for
    quitting it.
    """
    options = Options()
    # Spoof a common desktop browser UA so sites serve the normal page.
    options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
    )
    # 2 == "block" for the images content setting.
    # NOTE(review): the bare 'notifications' key does not look like a valid
    # Chrome pref name (expected profile.default_content_setting_values.
    # notifications) — confirm whether notification blocking ever worked.
    prefs = {"profile.managed_default_content_settings.images": 2, 'notifications': 2}
    options.add_experimental_option("prefs", prefs)
    # Selenium 4 removed the positional executable_path argument; the
    # driver binary path must be wrapped in a Service object.
    driver = webdriver.Chrome(service=Service("./chromedriver.exe"), options=options)
    driver.get(url)
    return driver
def extract_hyperlinks(driver):
    """Return the href value of every element on the current page that
    carries an href attribute (anchors, link tags, etc.)."""
    linked_nodes = driver.find_elements(By.XPATH, '//*[@href]')
    return [node.get_attribute('href') for node in linked_nodes]
def ignore_image_and_document_hrefs(href):
    """Return True when *href* points at a static asset (image, document,
    archive, stylesheet or script) rather than a crawlable HTML page.

    The decision is based solely on the URL path's file extension,
    compared case-insensitively.
    """
    skip_extensions = {
        '.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls',
        '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js',
    }
    suffix = Path(urlparse(href).path).suffix
    return suffix.lower() in skip_extensions
def process_page(driver, url, visited_pages, start_domain, data):
    """Scrape *url*, record its body text, then recursively follow
    same-domain hyperlinks.

    Mutates *visited_pages* (set of crawled URLs) and *data* (list of
    [domain, url, page_text] rows) in place. Recursion depth is bounded
    only by the site's link graph, so very deep sites could hit Python's
    recursion limit — TODO confirm this is acceptable for the target sites.
    """
    # Mark this URL visited before navigating so recursive calls skip it.
    visited_pages.add(url)
    # Navigate to the URL
    driver.get(url)
    # Fixed wait for the page to render (no explicit-wait conditions used).
    time.sleep(2)
    # Extract the full visible text of the page body.
    content_element = driver.find_element(By.XPATH, '//body')
    content_text = content_element.text
    print(content_text)
    # Record a [domain, url, text] row for this page.
    data.append([start_domain, url, content_text])

    # Find and process hyperlinks on the page just loaded.
    hrefs = extract_hyperlinks(driver)
    for href in hrefs:
        # Skip static assets (images, documents, scripts, archives).
        if ignore_image_and_document_hrefs(href):
            continue
        # Skip no-op "javascript:void(0)" links.
        if href.startswith("javascript:void(0)"):
            continue
        # Skip links back to the current page or already-visited URLs.
        if check_href(href, driver.current_url, visited_pages):
            continue
        try:
            # Only follow links that stay on the starting domain.
            parsed_href = urlparse(href)
            if parsed_href.netloc != start_domain:
                continue
            # Open the href in the same tab and retrieve its content.
            driver.get(href)
            print(href)
            # Fixed wait for the linked page to render.
            time.sleep(2)
            # Extract the body text of the linked page.
            hyperlink_content_element = driver.find_element(By.XPATH, '//body')
            hyperlink_content_text = hyperlink_content_element.text
            print(hyperlink_content_text)
            # Record the linked page's row.
            # NOTE(review): process_page(href, ...) below records href
            # again via its own data.append — likely duplicate rows;
            # confirm whether this append is intentional.
            data.append([start_domain, href, hyperlink_content_text])
            # Recurse into the linked page to follow its hyperlinks.
            process_page(driver, href, visited_pages, start_domain, data)
        except Exception as e:
            # Best-effort crawl: log the failure and move to the next link.
            print(f"Error processing hyperlink: {href}")
            print(f"Error message: {str(e)}")
            continue

    # Return to the page this call started on so the caller's loop
    # (one level up) reads the expected driver.current_url.
    driver.get(url)
def check_href(href, original_url, visited_pages):
    """Return True when *href* should be skipped.

    A link is skipped when it has already been visited, or when it points
    at the same page as *original_url* (same netloc and path; query string
    and fragment are ignored).
    """
    # Already-crawled URLs are skipped outright.
    if href in visited_pages:
        return True
    # Compare structural parts of the two URLs.
    candidate = urlparse(href)
    current = urlparse(original_url)
    same_page = (candidate.netloc == current.netloc
                 and candidate.path == current.path)
    return same_page
def export_to_excel(data, output_filename):
    """Write crawled rows to an Excel workbook at *output_filename*.

    *data* is a list of [domain, url, text] rows as produced by
    process_page. The previous implementation unpacked five columns
    (item[0]..item[4]) and raised IndexError on the three-element rows
    this script actually builds; the columns now match the data shape.
    """
    # Build the frame straight from the row lists — no per-column copies.
    df = pd.DataFrame(data, columns=['domain', 'url', 'text'])

    # Export the DataFrame to an Excel file (no index column).
    df.to_excel(output_filename, index=False)
def get_cookies_from_previous_session(driver):
    """Collect the current page's cookies into a {name: value} dict.

    Reads cookies by executing JavaScript that uses the jQuery Cookie
    plugin ($.cookie), so the page must have jQuery + the plugin loaded.
    On any failure the error is printed and an empty dict is returned.
    """
    cookies = {}
    try:
        # Executed in the page context; yields ["name=value", ...] strings.
        cookie_script = """
        var cookies = $.cookie();
        return Object.entries(cookies).map(([name, value]) => `${name}=${value}`);
        """
        cookie_values = driver.execute_script(cookie_script)

        # Parse each "name=value" string into the dict.
        for pair in cookie_values:
            # Split on the FIRST '=' only: cookie values frequently contain
            # '=' themselves (base64 padding, key=value payloads), which
            # made the previous unbounded split raise ValueError.
            cookie_name, cookie_value = pair.split('=', 1)
            cookies[cookie_name] = cookie_value
    except Exception as e:
        # Best-effort: a site without jQuery just yields no cookies.
        print('Error getting cookies:', e)
    return cookies
def add_cookies(driver, cookies):
    """Register every (name, value) pair from the *cookies* mapping with
    the WebDriver's current session."""
    for cookie_name, cookie_value in cookies.items():
        driver.add_cookie({'name': cookie_name, 'value': cookie_value})
def main():
    """Crawl every URL listed in failed_files.xlsx and export each site's
    page text to its own Excel workbook under web_dir/.

    One Chrome instance is opened per URL and is always released via
    try/finally — previously a crash mid-crawl (or every loop iteration
    except the last) leaked a Chrome process.
    """
    # Input columns use Chinese headers: 单位=group, 主办=name, 地址=URL.
    df = pd.read_excel('failed_files.xlsx')

    # Make sure the output directory exists before any export.
    Path('web_dir').mkdir(exist_ok=True)

    for ind, row in df.iterrows():
        group = row['单位']  # organisation/group (currently not exported)
        name = row['主办']   # site sponsor; used in the output filename
        url = row['地址']    # starting URL for the crawl
        domain = urlparse(url).netloc

        # Open the website (fresh driver per site).
        driver = open_website(url)
        try:
            # Retrieve cookies from the loaded page and re-apply them.
            cookies = get_cookies_from_previous_session(driver)
            add_cookies(driver, cookies)

            # Fresh crawl state per site.
            visited_pages = set()
            data = []

            # Process the starting page and follow hyperlinks recursively.
            process_page(driver, url, visited_pages, domain, data)

            # Export data to a separate Excel file for each URL.
            output_filename = f'web_dir/{name}_{domain}.xlsx'
            export_to_excel(data, output_filename)
        finally:
            # Always release the browser, even when the crawl fails.
            driver.quit()
# Entry-point guard: run the crawler only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()