zcspider/mycode/web2.py

import requests
import sqlite3
import pandas as pd
import html2text
import re
from urllib.parse import urlparse, urljoin
import concurrent.futures
import threading

class WebSpider:
    def __init__(self) -> None:
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Cache-Control': 'max-age=0',
            'Proxy-Connection': 'keep-alive',
            'Referer': 'https://www.baidu.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
        }
        self.visited_urls = set()
        # visited_urls is shared by all worker threads, so the check-then-add
        # in get_one_page is guarded by this lock to avoid duplicate crawls.
        self.visited_lock = threading.Lock()
        # Extensions of static assets and downloads that should not be followed.
        self.ext = ('.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls',
                    '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js')

    def get_one_page(self, group, name, domain, url):
        # Check and mark in one critical section so two threads cannot
        # pick up the same URL.
        with self.visited_lock:
            if url in self.visited_urls:
                return
            self.visited_urls.add(url)
        try:
            r = requests.get(url=url, headers=self.headers, timeout=10)
        except requests.RequestException:
            return  # Skip URLs that time out or refuse the connection.
        # requests falls back to ISO-8859-1 when the server omits a charset,
        # which garbles Chinese pages; trust the detected encoding instead.
        if r.encoding == 'ISO-8859-1':
            r.encoding = r.apparent_encoding
        rtext = r.text
        if rtext:
            h = html2text.HTML2Text()
            h.ignore_links = True  # Ignore all links in the extracted text.
            text = h.handle(rtext)
            # `text` is currently unused; persisting it is sketched in
            # save_page below (an addition, not part of the original).
            # self.save_page(group, name, url, text)
            print(group, name, domain, url)
            # Follow every same-domain link that is not a static asset.
            links = re.findall(r'href=["\']?([^"\'>]+)', rtext)
            for link in links:
                full_link = urljoin(r.url, link)
                if full_link not in self.visited_urls and not full_link.endswith(self.ext):
                    if urlparse(full_link).netloc.replace('www.', '') == domain:
                        # Deep sites may hit Python's recursion limit here;
                        # an explicit work queue would avoid that.
                        self.get_one_page(group, name, domain, full_link)
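
    # sqlite3 is imported above but never used, and `text` in get_one_page is
    # discarded, so a persistence step was presumably intended. The method
    # below is a minimal sketch of that step, not original code: the file name
    # 'pages.db', the table 'pages', and its columns are all assumptions.
    def save_page(self, group, name, url, text):
        # A fresh connection per call keeps this usable from worker threads,
        # since sqlite3 connections must not be shared across threads by default.
        conn = sqlite3.connect('pages.db')
        try:
            # 'grp' rather than 'group', because GROUP is an SQL keyword.
            conn.execute('CREATE TABLE IF NOT EXISTS pages '
                         '(grp TEXT, name TEXT, url TEXT PRIMARY KEY, text TEXT)')
            conn.execute('INSERT OR REPLACE INTO pages VALUES (?, ?, ?, ?)',
                         (group, name, url, text))
            conn.commit()
        finally:
            conn.close()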

    def start(self):
        # Expects columns 单位 (organisation), 主办 (sponsor), 地址 (start URL).
        df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
        # Sequential version, kept for reference:
        # for ind, row in df.iterrows():
        #     group = row['单位']
        #     name = row['主办']
        #     url = row['地址']
        #     domain = urlparse(url).netloc.replace('www.', '')
        #     self.get_one_page(group, name, domain, url)
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            futures = []
            for ind, row in df.iterrows():
                group = row['单位']
                name = row['主办']
                url = row['地址']
                domain = urlparse(url).netloc.replace('www.', '')
                futures.append(executor.submit(self.get_one_page, group, name, domain, url))
            # Block until every site has been crawled. Exceptions raised in
            # workers are swallowed here; check each future's .exception()
            # to surface them.
            concurrent.futures.wait(futures)

if __name__ == '__main__':
    WebSpider().start()
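
# If the hypothetical save_page above is wired into get_one_page, the crawl
# results can be inspected afterwards with, for example:
#   conn = sqlite3.connect('pages.db')
#   for grp, name, url in conn.execute('SELECT grp, name, url FROM pages'):
#       print(grp, name, url)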