feat: file-extension filtering / download timeout
parent bc3f5b6ca6
commit 905b813b39
@@ -25,14 +25,19 @@ class BaseSpider(scrapy.Spider):
         self.start_urls = [start_url]
         self.name = name
         self.group = group
-        self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.dat', '.css', '.js'])
+        self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.dat', '.css', '.js', '.mp4', '.m3u', '.flv'])
         print(f"Crawl started: {name}_{domain}")

     def start_requests(self):
         for url in self.start_urls:
-            r = scrapy.Request(url, dont_filter=True, headers=self.headers, callback=self.parse, errback=self.request2)
+            r = scrapy.Request(url, dont_filter=True, headers=self.headers, callback=self.parse, errback=self.request2, meta={'download_timeout': 30})
             yield r

+    def is_file_url(self, url):
+        if f'.{url.split(".")[-1].lower()}' in self.ext:
+            return True
+        return False
+
     def request2(self, fail):
         rurl = fail.request.url
         self.logger.info(f'{rurl} continuing with requests')
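For reference, a minimal standalone sketch of the new suffix check (the extension tuple is copied from the diff; the name FILE_EXTS and the example URLs are illustrative only). Unlike the old `endswith(self.ext)` test it replaces below, the helper lowercases the suffix, so uppercase extensions like `.PDF` are now filtered as well:

    # Standalone sketch of is_file_url; not part of the spider itself.
    FILE_EXTS = ('.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx',
                 '.ppt', '.pptx', '.rar', '.zip', '.ico', '.dat', '.css', '.js',
                 '.mp4', '.m3u', '.flv')

    def is_file_url(url: str) -> bool:
        # Take the text after the last dot, lowercase it, check membership.
        return f'.{url.split(".")[-1].lower()}' in FILE_EXTS

    print(is_file_url('https://example.com/report.PDF'))      # True
    print(is_file_url('https://example.com/news/article'))    # False: last dot-segment is 'com/news/article'
    print(is_file_url('https://example.com/video.mp4?t=30'))  # False: a query string defeats the split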
@@ -53,9 +58,9 @@ class BaseSpider(scrapy.Spider):
         links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
         for link in links:
             full_link = urljoin(r.url, link)
-            if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
+            if full_link not in self.visited_urls and (self.is_file_url(full_link) is False):
                 if urlparse(full_link).netloc.replace('www.', '') == self.domain:
-                    yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
+                    yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})

     def parse(self, response):
         self.visited_urls.add(response.url)
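The new `meta={'download_timeout': 30}` key is a documented Scrapy `Request.meta` key, honored per request by the download machinery (it overrides the project-wide DOWNLOAD_TIMEOUT setting for that request); when the 30 seconds expire, the download fails and the errback (`self.request2`) fires. A minimal self-contained sketch, with a hypothetical spider name and URL:

    import scrapy

    class TimeoutDemoSpider(scrapy.Spider):
        name = 'timeout_demo'  # illustrative spider, not part of the commit

        def start_requests(self):
            # Per-request cap, same pattern as the diff above.
            yield scrapy.Request(
                'https://example.com/',
                callback=self.parse,
                errback=self.on_error,
                meta={'download_timeout': 30},
            )

        def parse(self, response):
            self.logger.info('fetched %s', response.url)

        def on_error(self, failure):
            # Fires on timeout and other download errors, like request2 above.
            self.logger.info('failed: %s', failure.request.url)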
@@ -77,9 +82,9 @@ class BaseSpider(scrapy.Spider):

         for link in response.css("a::attr('href')").getall():
             full_link = response.urljoin(link)
-            if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
+            if full_link not in self.visited_urls and (self.is_file_url(full_link) is False):
                 if urlparse(full_link).netloc.replace('www.', '') == self.domain:
-                    yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
+                    yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})

     def closed(self, reason):
         # This method will be called when the Spider is about to close
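For context on the second hunk: it lives inside the requests-based fallback path that request2 hands failed URLs to, where links are pulled from raw HTML with a regex rather than Scrapy selectors. The diff truncates the code between the logger call and the `re.findall` line, so the sketch below is an assumed reconstruction of that pattern, not the spider's actual request2 body:

    # Hypothetical illustration of the fallback: fetch with requests, then
    # extract links with the same regex the diff uses. Function name and
    # timeout default are assumptions.
    import re
    from urllib.parse import urljoin

    import requests

    def extract_links(url, timeout=30):
        r = requests.get(url, timeout=timeout)  # plain HTTP fallback
        links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
        # Resolve relative hrefs against the final response URL, as the diff does.
        return [urljoin(r.url, link) for link in links]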