From 98e42e50af701683baa7a97115f48d5d09e3285d Mon Sep 17 00:00:00 2001 From: xiaobulu27 Date: Fri, 7 Jul 2023 08:46:20 +0800 Subject: [PATCH] =?UTF-8?q?=E5=8F=98=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scrape_nj.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrape_nj.py b/scrape_nj.py index e288c96..ed7628c 100644 --- a/scrape_nj.py +++ b/scrape_nj.py @@ -40,7 +40,7 @@ def process_page(driver, url, visited_pages, start_domain, data): content_text = content_element.text print(content_text) # Add URL, Domain, and Content to the data list - data.append([url, start_domain, content_text]) + data.append([start_domain, url, content_text]) # Find and process hyperlinks hrefs = extract_hyperlinks(driver) @@ -129,7 +129,7 @@ def add_cookies(driver, cookies): def main(): # Starting URL - start_url = 'http://www.ctchn.ac.cn/' + start_url = 'https://www.cbma.com.cn/' # Parse the domain from the starting URL parsed_start_url = urlparse(start_url) start_domain = parsed_start_url.netloc