diff --git a/start.py b/start.py
index 07511a5..3d754b2 100644
--- a/start.py
+++ b/start.py
@@ -43,6 +43,9 @@ class MyThread(QThread):
     def capture_output(self, p):
         while self.running and p.poll() is None:
             output = p.stdout.readline()
+            err = p.stderr.readline()
+            if err:
+                self.update_signal.emit({'msg': err.strip()})
             if output:
                 self.update_signal.emit({'msg': output.strip()})
 
diff --git a/zcspider/spiders/base.py b/zcspider/spiders/base.py
index 186b95d..c19e9bd 100644
--- a/zcspider/spiders/base.py
+++ b/zcspider/spiders/base.py
@@ -26,10 +26,19 @@ class BaseSpider(scrapy.Spider):
         self.name = name
         self.group = group
         self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.dat', '.css', '.js', '.mp4', '.m3u', '.flv'])
+        self.resource_content_types = ['image/', 'text/css', 'application/javascript', 'application/octet-stream']
         print(f"Crawl started: {name}_{domain}")
 
+    def fix_url_scheme(self, url, default_scheme='http'):
+        # Check whether the URL already has a scheme
+        if not url.startswith('http://') and not url.startswith('https://'):
+            # If not, prepend the default scheme
+            url = f'{default_scheme}://{url}'
+        return url
+
     def start_requests(self):
         for url in self.start_urls:
+            url = self.fix_url_scheme(url)
             r = scrapy.Request(url, dont_filter=True, headers=self.headers, callback=self.parse, errback=self.request2, meta={'download_timeout': 30})
             yield r
 
@@ -38,10 +47,18 @@ class BaseSpider(scrapy.Spider):
             return True
         return False
 
+    def is_file_res(self, res):
+        content_type = res.headers.get('content-type', b'')
+        if isinstance(content_type, bytes):
+            content_type = content_type.decode('utf-8')
+        return any(content_type.startswith(prefix) for prefix in self.resource_content_types)
+
     def request2(self, fail):
         rurl = fail.request.url
         self.logger.info(f'{rurl} retrying with requests')
         r = requests.get(url=fail.request.url, headers=self.headers, timeout=20)
+        if self.is_file_res(r):
+            return
         if r.status_code < 400:
             rtext = r.text
             h = html2text.HTML2Text()
@@ -58,12 +75,16 @@ class BaseSpider(scrapy.Spider):
             links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
             for link in links:
                 full_link = urljoin(r.url, link)
+                if not full_link.startswith('http'):
+                    continue
                 if full_link not in self.visited_urls and (self.is_file_url(full_link) is False):
                     if urlparse(full_link).netloc.replace('www.', '') == self.domain:
                         yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
 
     def parse(self, response):
         self.visited_urls.add(response.url)
+        if self.is_file_res(response):
+            return
         h = html2text.HTML2Text()
         h.ignore_links = True  # ignore all links
         # Extract the plain-text content
@@ -80,11 +101,15 @@ class BaseSpider(scrapy.Spider):
             'text': text,
         }
 
-        for link in response.css("a::attr('href')").getall():
+        links = re.findall(r'href=["\']?([^"\'>]+)', response.text)
+
+        for link in links:
             full_link = response.urljoin(link)
+            if not full_link.startswith('http'):
+                continue
             if full_link not in self.visited_urls and (self.is_file_url(full_link) is False):
                 if urlparse(full_link).netloc.replace('www.', '') == self.domain:
                     yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
 
     def closed(self, reason):
         # This method will be called when the Spider is about to close
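
Note: the URL-scheme and content-type helpers added above can be exercised on their own. Below is a minimal standalone sketch, assuming plain-dict headers and made-up sample URLs (neither appears in the patch itself):

    # Standalone sketch of the helpers introduced in zcspider/spiders/base.py.
    # The plain-dict headers and sample values are illustrative assumptions only.
    RESOURCE_CONTENT_TYPES = ['image/', 'text/css', 'application/javascript', 'application/octet-stream']

    def fix_url_scheme(url, default_scheme='http'):
        # Prepend a default scheme when the URL has none.
        if not url.startswith(('http://', 'https://')):
            url = f'{default_scheme}://{url}'
        return url

    def is_file_res(headers):
        # Treat the response as a static resource when its Content-Type
        # starts with one of the known resource prefixes.
        content_type = headers.get('content-type', b'')
        if isinstance(content_type, bytes):
            content_type = content_type.decode('utf-8')
        return any(content_type.startswith(p) for p in RESOURCE_CONTENT_TYPES)

    if __name__ == '__main__':
        assert fix_url_scheme('example.com/page') == 'http://example.com/page'
        assert fix_url_scheme('https://example.com/') == 'https://example.com/'
        assert is_file_res({'content-type': b'image/png'})
        assert not is_file_res({'content-type': 'text/html; charset=utf-8'})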