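"""Recursive same-domain crawler: reads a site list from web_sites.xlsx,
fetches each page, extracts its text with html2text, and follows every
in-domain link, fanning the sites out across a thread pool."""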
import concurrent.futures
import re
import sqlite3
import threading
from urllib.parse import urljoin, urlparse

import html2text
import pandas as pd
import requests

class WebSpider:

    def __init__(self) -> None:
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Cache-Control': 'max-age=0',
            'Proxy-Connection': 'keep-alive',
            'Referer': 'https://www.baidu.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
        }
        # URLs already crawled; shared by all worker threads, so the
        # check-then-add in get_one_page is guarded by a lock.
        self.visited_urls = set()
        self.visited_lock = threading.Lock()
        # Link suffixes to skip (binary and static assets).
        self.ext = ('.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx',
                    '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js')

    def get_one_page(self, group, name, domain, url):
        # Atomically test-and-record the URL so two threads never crawl it twice.
        with self.visited_lock:
            if url in self.visited_urls:
                return
            self.visited_urls.add(url)
        try:
            r = requests.get(url=url, headers=self.headers, timeout=10)
        except requests.RequestException:
            return  # skip unreachable pages instead of crashing the worker
        rtext = r.text
        if rtext:
            h = html2text.HTML2Text()
            h.ignore_links = True  # ignore all links in the extracted text
            text = h.handle(rtext)  # plain-text rendering of the page
            print(group, name, domain, url)
            # Recurse into every same-domain link that is not a static asset.
            # Note: very deep sites can exceed Python's recursion limit.
            for link in re.findall(r'href=["\']?([^"\'>]+)', rtext):
                full_link = urljoin(r.url, link)
                if full_link not in self.visited_urls and not full_link.lower().endswith(self.ext):
                    if urlparse(full_link).netloc.replace('www.', '') == domain:
                        self.get_one_page(group, name, domain, full_link)
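
    # The unused `import sqlite3` in the original suggests the extracted text
    # was meant to be persisted. A minimal sketch of how that could look; the
    # database filename, table name, and schema are assumptions, not from the
    # original. It could be called right after `text = h.handle(rtext)` above.
    def save_text(self, group, name, url, text):
        # A short-lived connection per call keeps this safe across threads.
        conn = sqlite3.connect('pages.db')
        try:
            conn.execute('CREATE TABLE IF NOT EXISTS pages '
                         '(group_name TEXT, site_name TEXT, url TEXT, content TEXT)')
            conn.execute('INSERT INTO pages VALUES (?, ?, ?, ?)',
                         (group, name, url, text))
            conn.commit()
        finally:
            conn.close()
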
    def start(self):
        # One row per site: 单位 (organization), 主办 (sponsor), 地址 (URL).
        df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
        # Sequential variant, kept for reference:
        # for ind, row in df.iterrows():
        #     group = row['单位']
        #     name = row['主办']
        #     url = row['地址']
        #     domain = urlparse(url).netloc.replace('www.', '')
        #     self.get_one_page(group, name, domain, url)
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            futures = []
            for ind, row in df.iterrows():
                group = row['单位']
                name = row['主办']
                url = row['地址']
                domain = urlparse(url).netloc.replace('www.', '')
                # One crawl task per site; each stays inside its own domain.
                futures.append(executor.submit(self.get_one_page, group, name, domain, url))
            concurrent.futures.wait(futures)


if __name__ == '__main__':
    WebSpider().start()
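
# Dependency note (a sketch; the original pins no versions): this script needs
# requests, pandas, html2text, and an Excel reader for pandas (openpyxl handles
# .xlsx). web_sites.xlsx is expected to contain a 'Sheet1' whose columns are
# 单位, 主办, and 地址.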