diff --git a/zcspider/spiders/base.py b/zcspider/spiders/base.py
index ac11414..186b95d 100644
--- a/zcspider/spiders/base.py
+++ b/zcspider/spiders/base.py
@@ -25,14 +25,19 @@ class BaseSpider(scrapy.Spider):
         self.start_urls = [start_url]
         self.name = name
         self.group = group
-        self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.dat', '.css', '.js'])
+        self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.dat', '.css', '.js', '.mp4', '.m3u', '.flv'])
         print(f"爬取开始: {name}_{domain}")
 
     def start_requests(self):
         for url in self.start_urls:
-            r = scrapy.Request(url, dont_filter=True, headers=self.headers, callback=self.parse, errback=self.request2)
+            r = scrapy.Request(url, dont_filter=True, headers=self.headers, callback=self.parse, errback=self.request2, meta={'download_timeout': 30})
             yield r
 
+    def is_file_url(self, url):
+        # Lowercase the last dot-separated segment so uppercase extensions are filtered too.
+        suffix = f'.{url.split(".")[-1].lower()}'
+        return suffix in self.ext
+
     def request2(self, fail):
         rurl = fail.request.url
         self.logger.info(f'{rurl} 使用requests继续请求')
@@ -53,9 +58,9 @@ class BaseSpider(scrapy.Spider):
         links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
         for link in links:
             full_link = urljoin(r.url, link)
-            if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
+            if full_link not in self.visited_urls and not self.is_file_url(full_link):
                 if urlparse(full_link).netloc.replace('www.', '') == self.domain:
-                    yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
+                    yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
 
     def parse(self, response):
         self.visited_urls.add(response.url)
@@ -77,9 +82,9 @@ class BaseSpider(scrapy.Spider):
 
         for link in response.css("a::attr('href')").getall():
             full_link = response.urljoin(link)
-            if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
+            if full_link not in self.visited_urls and not self.is_file_url(full_link):
                 if urlparse(full_link).netloc.replace('www.', '') == self.domain:
-                    yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
+                    yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
 
     def closed(self, reason):
         # This method will be called when the Spider is about to close
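A quick standalone check of the behavior change in the extension filter: the old `full_link.endswith(self.ext)` test was case-sensitive, so uppercase extensions like `.PDF` slipped through, while the new `is_file_url` helper lowercases the last dot-separated segment before matching. In this sketch, `FILE_EXTS` is an abridged stand-in for the spider's `self.ext` tuple and the URLs are placeholders:

```python
# Standalone sketch of the patched extension filter; FILE_EXTS is an abridged
# stand-in for the spider's self.ext tuple.
FILE_EXTS = ('.png', '.jpg', '.pdf', '.zip', '.mp4', '.m3u', '.flv')

def is_file_url(url: str) -> bool:
    # Lowercase the last dot-separated segment, mirroring the helper in the diff.
    suffix = f'.{url.split(".")[-1].lower()}'
    return suffix in FILE_EXTS

assert is_file_url('https://example.com/report.PDF')        # endswith() missed the uppercase extension
assert not is_file_url('https://example.com/news/page')     # no extension: keep crawling
assert not is_file_url('https://example.com/a.pdf?id=1')    # a query string defeats both old and new checks
```

Note the last case: a trailing query string still defeats both the old and the new check. If that matters, matching against `urllib.parse.urlparse(url).path` instead of the raw URL would close the gap.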
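The `meta={'download_timeout': 30}` added to every request uses Scrapy's standard per-request timeout key: it overrides the global `DOWNLOAD_TIMEOUT` setting (180 s by default) for that single request, and a timed-out download is routed to the `errback` (here `request2`). A minimal sketch of the pattern, with a hypothetical spider name and placeholder URL:

```python
import scrapy


class TimeoutDemoSpider(scrapy.Spider):
    # Hypothetical spider, only to illustrate the per-request timeout pattern.
    name = 'timeout_demo'

    def start_requests(self):
        # download_timeout (seconds) overrides the global DOWNLOAD_TIMEOUT
        # setting for this one request; on timeout the errback fires.
        yield scrapy.Request(
            'https://example.com/',
            meta={'download_timeout': 30},
            callback=self.parse,
            errback=self.on_error,
        )

    def parse(self, response):
        self.logger.info('fetched %s within the timeout', response.url)

    def on_error(self, failure):
        # failure is a twisted Failure; the original request is attached to it.
        self.logger.info('request failed (possibly timed out): %s', failure.request.url)
```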
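For context, the `errback` these requests point at (`request2`, whose body sits mostly outside the hunks) re-fetches failed URLs with the `requests` library and runs the regex-based link extraction visible in the second hunk. A hedged, self-contained sketch of that fallback; the function name, error handling, and return shape are assumptions, while the regex, `urljoin` usage, and 30 s timeout come from the diff:

```python
import re
from urllib.parse import urljoin

import requests


def fetch_links_with_requests(url, headers=None, timeout=30):
    # Fallback fetch for URLs Scrapy's downloader failed on (e.g. after the
    # 30 s download_timeout); the error handling here is an assumption.
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        return []
    # Same href-extraction regex the spider applies to the requests response.
    links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
    return [urljoin(r.url, link) for link in links]
```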