diff --git a/scrape_nj.py b/scrape_nj.py index e288c96..ed7628c 100644 --- a/scrape_nj.py +++ b/scrape_nj.py @@ -40,7 +40,7 @@ def process_page(driver, url, visited_pages, start_domain, data): content_text = content_element.text print(content_text) # Add URL, Domain, and Content to the data list - data.append([url, start_domain, content_text]) + data.append([start_domain, url, content_text]) # Find and process hyperlinks hrefs = extract_hyperlinks(driver) @@ -129,7 +129,7 @@ def add_cookies(driver, cookies): def main(): # Starting URL - start_url = 'http://www.ctchn.ac.cn/' + start_url = 'https://www.cbma.com.cn/' # Parse the domain from the starting URL parsed_start_url = urlparse(start_url) start_domain = parsed_start_url.netloc