feat: fix bug
parent 905b813b39
commit 7480590bd3

start.py | 3 +++

@@ -43,6 +43,9 @@ class MyThread(QThread):
     def capture_output(self, p):
         while self.running and p.poll() is None:
             output = p.stdout.readline()
+            err = p.stderr.readline()
+            if err:
+                self.update_signal.emit({'msg': err.strip()})
             if output:
                 self.update_signal.emit({'msg': output.strip()})
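
Note (illustrative, not part of the commit): calling readline() on p.stdout and then on p.stderr in the same loop can block when one of the two pipes has no data. A minimal sketch of one common alternative, merging stderr into stdout when the process is launched so a single readline() loop sees both streams; the command and the print() stand-in are assumptions:

import subprocess

# Merge stderr into the stdout pipe so one blocking readline() covers both streams.
p = subprocess.Popen(
    ['python', '-u', 'worker.py'],      # hypothetical command
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
)
while p.poll() is None:
    line = p.stdout.readline()
    if line:
        print(line.strip())             # stand-in for self.update_signal.emit({'msg': ...})
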
@@ -26,10 +26,19 @@ class BaseSpider(scrapy.Spider):
         self.name = name
         self.group = group
         self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.dat', '.css', '.js', '.mp4', '.m3u', '.flv'])
+        self.resource_content_types = ['image/', 'text/css', 'application/javascript', 'application/octet-stream']
         print(f"爬取开始: {name}_{domain}")

+    def fix_url_scheme(self, url, default_scheme='http'):
+        # check whether the URL already includes a scheme
+        if not url.startswith('http://') and not url.startswith('https://'):
+            # no scheme present, prepend the default one
+            url = f'{default_scheme}://{url}'
+        return url
+
     def start_requests(self):
         for url in self.start_urls:
+            url = self.fix_url_scheme(url)
             r = scrapy.Request(url, dont_filter=True, headers=self.headers, callback=self.parse, errback=self.request2, meta={'download_timeout': 30})
             yield r
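
Note (illustrative, not part of the commit): a quick look at what the added fix_url_scheme does for a few inputs, with its logic copied standalone; the example URLs are made up:

def fix_url_scheme(url, default_scheme='http'):
    # same check as the added method: prepend a scheme only when neither HTTP scheme is present
    if not url.startswith('http://') and not url.startswith('https://'):
        url = f'{default_scheme}://{url}'
    return url

print(fix_url_scheme('example.com'))           # http://example.com
print(fix_url_scheme('https://example.com'))   # https://example.com (unchanged)
print(fix_url_scheme('ftp://example.com'))     # http://ftp://example.com (caveat: non-HTTP schemes are not recognized)
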
@@ -38,10 +47,18 @@ class BaseSpider(scrapy.Spider):
             return True
         return False

+    def is_file_res(self, res):
+        content_type = res.headers['content-type']
+        if isinstance(content_type, bytes):
+            content_type = str(content_type, encoding="utf-8")
+        return any(content_type.startswith(prefix) for prefix in self.resource_content_types)
+
     def request2(self, fail):
         rurl = fail.request.url
         self.logger.info(f'{rurl} 使用requests继续请求')
         r = requests.get(url=fail.request.url, headers=self.headers, timeout=20)
+        if self.is_file_res(r):
+            return
         if r.status_code < 400:
             rtext = r.text
             h = html2text.HTML2Text()
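
Note (illustrative, not part of the commit): is_file_res is called with both scrapy responses, whose header values are bytes, and requests responses, whose header values are str, which is what the isinstance check handles. A standalone sketch of the same normalization, using hypothetical header values:

resource_content_types = ['image/', 'text/css', 'application/javascript', 'application/octet-stream']

def is_file_content_type(content_type):
    # scrapy header values arrive as bytes, requests values as str; normalize before comparing
    if isinstance(content_type, bytes):
        content_type = content_type.decode('utf-8')
    return any(content_type.startswith(prefix) for prefix in resource_content_types)

print(is_file_content_type(b'image/png'))                # True  (treated as a file response, crawling stops)
print(is_file_content_type('text/html; charset=utf-8'))  # False (regular page, crawling continues)
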
@@ -58,12 +75,16 @@ class BaseSpider(scrapy.Spider):
             links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
             for link in links:
                 full_link = urljoin(r.url, link)
+                if not full_link.startswith('http'):
+                    continue
                 if full_link not in self.visited_urls and (self.is_file_url(full_link) is False):
                     if urlparse(full_link).netloc.replace('www.', '') == self.domain:
                         yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})

     def parse(self, response):
         self.visited_urls.add(response.url)
+        if self.is_file_res(response):
+            return
         h = html2text.HTML2Text()
         h.ignore_links = True  # ignore all links
         # extract plain text content
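
Note (illustrative, not part of the commit): the new startswith('http') guard drops hrefs that urljoin cannot turn into crawlable HTTP(S) URLs, such as mailto:, javascript: or tel: links. A small sketch of the values the guard keeps and skips, using a hypothetical page URL:

from urllib.parse import urljoin

base = 'http://example.com/page'          # hypothetical page URL
for href in ['/about', 'mailto:admin@example.com', 'javascript:void(0)', 'tel:12345']:
    full_link = urljoin(base, href)
    keep = full_link.startswith('http')   # same guard as in the diff
    print(full_link, '->', 'crawl' if keep else 'skip')
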
@@ -80,11 +101,20 @@ class BaseSpider(scrapy.Spider):
             'text': text,
         }

-        for link in response.css("a::attr('href')").getall():
+        links = re.findall(r'href=["\']?([^"\'>]+)', text)
+
+        for link in links:
             full_link = response.urljoin(link)
+            if not full_link.startswith('http'):
+                continue
             if full_link not in self.visited_urls and (self.is_file_url(full_link) is False):
                 if urlparse(full_link).netloc.replace('www.', '') == self.domain:
+                    # try:
                     yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2, meta={'download_timeout': 30})
+                    # except ValueError:
+                    #     import traceback
+                    #     print(traceback.format_exc())
+                    #     print(full_link)

     def closed(self, reason):
         # This method will be called when the Spider is about to close
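
Note (illustrative, not part of the commit): the last hunk swaps the response.css("a::attr('href')") selector for the same href regex already used in request2. A standalone sketch of what that regex matches on a made-up HTML snippet; unlike the old selector, it also picks up href attributes on non-anchor tags such as <link>:

import re

html = '<a href="/news/1.html">news</a> <link href="style.css" rel="stylesheet">'
links = re.findall(r'href=["\']?([^"\'>]+)', html)
print(links)   # ['/news/1.html', 'style.css']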