This commit is contained in:
parent
637bbd3e17
commit
98e42e50af
|
@ -40,7 +40,7 @@ def process_page(driver, url, visited_pages, start_domain, data):
|
||||||
content_text = content_element.text
|
content_text = content_element.text
|
||||||
print(content_text)
|
print(content_text)
|
||||||
# Add URL, Domain, and Content to the data list
|
# Add URL, Domain, and Content to the data list
|
||||||
data.append([url, start_domain, content_text])
|
data.append([start_domain, url, content_text])
|
||||||
|
|
||||||
# Find and process hyperlinks
|
# Find and process hyperlinks
|
||||||
hrefs = extract_hyperlinks(driver)
|
hrefs = extract_hyperlinks(driver)
|
||||||
|
@ -129,7 +129,7 @@ def add_cookies(driver, cookies):
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Starting URL
|
# Starting URL
|
||||||
start_url = 'http://www.ctchn.ac.cn/'
|
start_url = 'https://www.cbma.com.cn/'
|
||||||
# Parse the domain from the starting URL
|
# Parse the domain from the starting URL
|
||||||
parsed_start_url = urlparse(start_url)
|
parsed_start_url = urlparse(start_url)
|
||||||
start_domain = parsed_start_url.netloc
|
start_domain = parsed_start_url.netloc
|
||||||
|
|
Loading…
Reference in New Issue