import concurrent.futures
import re
import sqlite3
import threading
from urllib.parse import urljoin, urlparse

import html2text
import pandas as pd
import requests


class WebSpider:
    """Breadth-crawl a list of web sites (from web_sites.xlsx), staying on
    each site's own domain, and extract the plain text of every page.

    One thread-pool task is submitted per site; each task iteratively walks
    every same-domain link it discovers, skipping binary/static assets.
    """

    def __init__(self) -> None:
        # Browser-like headers so plain requests are less likely to be blocked.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Cache-Control': 'max-age=0',
            'Proxy-Connection': 'keep-alive',
            'Referer': 'https://www.baidu.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
        }
        # URLs already fetched (or queued for fetching), shared by all workers.
        self.visited_urls = set()
        # visited_urls is read+mutated from up to 20 threads; the bare
        # check-then-add in the original was a race. Guard it with a lock.
        self._visited_lock = threading.Lock()
        # File extensions to skip: static assets and downloadable documents.
        self.ext = ('.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx',
                    '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip',
                    '.ico', '.css', '.js')
        self.futures = []

    def _claim(self, url):
        """Atomically mark *url* as visited. Return True if it was new."""
        with self._visited_lock:
            if url in self.visited_urls:
                return False
            self.visited_urls.add(url)
            return True

    def get_one_page(self, group, name, domain, url):
        """Crawl *url* and every same-domain page reachable from it.

        Iterative worklist instead of the original per-link recursion, which
        hit Python's recursion limit on any site with deep link chains.
        Fetch errors (timeouts, DNS failures, ...) skip just that page
        instead of aborting the whole site's crawl.

        :param group: organisational group label (printed for progress)
        :param name: site name (printed for progress)
        :param domain: bare domain ('www.' stripped) used to stay on-site
        :param url: starting URL
        """
        pending = [url]
        while pending:
            current = pending.pop()
            if not self._claim(current):
                continue
            try:
                r = requests.get(url=current, headers=self.headers, timeout=10)
            except requests.RequestException:
                # Best-effort crawler: an unreachable page is skipped, not fatal.
                continue
            rtext = r.text
            if not rtext:
                continue
            h = html2text.HTML2Text()
            h.ignore_links = True  # ignore all links in the extracted text
            text = h.handle(rtext)
            print(group, name, domain, current)
            # Crude href extraction; resolve relative links against the
            # final (post-redirect) URL r.url.
            for link in re.findall(r'href=["\']?([^"\'>]+)', rtext):
                full_link = urljoin(r.url, link)
                # Case-insensitive asset check ('.PDF' etc. also excluded).
                if full_link.lower().endswith(self.ext):
                    continue
                # Only follow links that stay on the same domain.
                if urlparse(full_link).netloc.replace('www.', '') != domain:
                    continue
                with self._visited_lock:
                    seen = full_link in self.visited_urls
                if not seen:
                    pending.append(full_link)

    def start(self):
        """Read the site list from web_sites.xlsx and crawl each site.

        Expected columns: 单位 (group), 主办 (name), 地址 (URL).
        One thread-pool task per site; the pool's context manager waits
        for all of them, with an explicit wait() for clarity.
        """
        df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            futures = []
            for ind, row in df.iterrows():
                group = row['单位']
                name = row['主办']
                url = row['地址']
                domain = urlparse(url).netloc.replace('www.', '')
                futures.append(
                    executor.submit(self.get_one_page, group, name, domain, url))
            concurrent.futures.wait(futures)


if __name__ == "__main__":
    # Guarded so importing this module no longer triggers a full crawl.
    WebSpider().start()