feat: initial feature commit

parent 9d4356c5f5
commit 78054d37d3
@@ -2,4 +2,9 @@ dbs/*
 venv/*
 __pycache__/
 *.pyc
+summary*.md
+output/*
 twistd.pid
+~$*
+*.xlsx
+!biao.xlsx
Binary file not shown.
@@ -0,0 +1,21 @@
import pandas as pd
import os
current_dir = os.getcwd()

def count_web():
    total = 0
    web_dir = os.path.join(current_dir, 'web_dir')
    for file in os.listdir(web_dir):
        try:
            df = pd.read_csv(os.path.join(web_dir, file), encoding='gb18030')
        except pd.errors.EmptyDataError:
            continue  # skip empty CSVs instead of re-counting the previous DataFrame
        total = total + len(df)
        print(file, total)
    return total

def count_wechat():
    articles_full_path = os.path.join(current_dir, 'wechat_dir/articles_full.csv')
    df = pd.read_csv(articles_full_path)
    return len(df)
print(count_web(), count_wechat())
@@ -0,0 +1,89 @@
import pandas as pd
import os
import html2text
import sys

current_dir = os.getcwd()
wechat_dir = os.path.join(current_dir, 'wechat_dir')
web_dir = os.path.join(current_dir, 'web_dir')
df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')

def trans_to_json():
    json_str = df_s.to_json(orient='records', force_ascii=False)
    with open('biao.json', 'w', encoding='utf-8') as f:
        f.write(json_str)

def make_wechat_articles_full():
    df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'), encoding='gb18030')
    df['content'] = ''
    ind = 0
    for ind, row in df.iterrows():
        full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.html')
        try:
            with open(full_path, encoding='utf-8') as f:
                h = html2text.HTML2Text()
                h.ignore_links = True
                df.at[ind, 'content'] = h.handle(f.read())
                print(f'{ind}--{row["nickname"]}--{row["title"]}')
        except Exception:
            print(full_path + '---不存在')
        ind += 1
    df.to_csv('articles_full.csv', encoding='utf-8-sig')

def ana_wechat():
    articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
    if not os.path.exists(articles_full_path):
        make_wechat_articles_full()
    df_a = pd.DataFrame(columns=['公众号', '标题', '地址', '错误表述', '建议修改词语', '错误分类'])
    df = pd.read_csv(articles_full_path)
    df['content'] = df['content'].fillna('')
    ind = 0
    need_save = False
    for ind, row in df_s.iterrows():
        mask = df['content'].str.contains(row['错误表述'])
        result = df[mask]
        if result.empty:
            continue
        ind2 = 0
        for ind2, row2 in result.iterrows():
            alist = [row2['nickname'], row2['title'], row2['content_url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
            print(alist)
            df_a.loc[len(df_a.index)] = alist
            if need_save is False:
                need_save = True
            ind2 += 1
        ind += 1
    if need_save:
        df_a.to_csv('ana_wechat.csv', encoding='utf-8-sig')

def ana_web():
    df_a = pd.DataFrame(columns=['单位', '主办', '地址', '错误表述', '建议修改词语', '错误分类'])
    need_save = False
    for file in os.listdir(web_dir):
        full_path = os.path.join(web_dir, file)
        if os.path.getsize(full_path) > 0:
            df = pd.read_csv(os.path.join(web_dir, file), encoding='gb18030')
            ind = 0
            for ind, row in df_s.iterrows():
                mask = df['text'].str.contains(row['错误表述'])
                result = df[mask]
                if result.empty:
                    continue
                ind2 = 0
                for ind2, row2 in result.iterrows():
                    alist = [row2['group'], row2['name'], row2['url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
                    print(alist)
                    df_a.loc[len(df_a.index)] = alist
                    if need_save is False:
                        need_save = True
                    ind2 += 1
                ind += 1
    if need_save:
        df_a.to_csv('ana_web.csv', encoding='utf-8-sig')

if __name__ == "__main__":
    if len(sys.argv) > 1 and sys.argv[1] == 'wechat':
        ana_wechat()
    else:
        ana_web()
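The diff does not show this script's file name; assuming it were saved as, say, ana.py (a hypothetical name), the __main__ block above selects the analysis from the command line:

# hypothetical file name -- the real one is not visible in this diff
python ana.py wechat    # match the biao.xlsx terms against wechat_dir/articles_full.csv
python ana.py           # default: match them against every CSV in web_dir/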
note.txt
@@ -1 +1,6 @@
-scrapy crawl basespider -a start_url=http://ctc.ac.cn/
+scrapy crawl basespider -a start_url=http://ctc.ac.cn/ -a name=中国国检测试控股集团股份有限公司 -o output/out.csv
+
+scrapy crawl basespider -a start_url=http://www.ctc-hn.com -a name=test -a group=test -a domain=www.ctc-hn.com
+
+
+scrapy crawl basespider -a start_url=http://www.zbyjs.cn -a name=西安轻工业钟表研究所有限公司 -a group=西安轻工业钟表研究所有限公司 -a domain=zbyjs.cn -o output/56_西安轻工业钟表研究所有限公司_www.zbyjs.cn.csv
@@ -1,4 +1,6 @@
 scrapy==2.8.0
 scrapyd==1.4.1
 scrapyd-client==1.2.3
 html2text==2020.1.16
+pandas==2.0.0
+openpyxl==3.1.2
@@ -0,0 +1,147 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
from urllib.parse import urlparse
from pathlib import Path
import pandas as pd


def open_website(url):
    # Set up Chrome WebDriver with custom User-Agent
    options = Options()
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
    driver = webdriver.Chrome("./chromedriver.exe", options=options)
    driver.get(url)
    return driver


def extract_hyperlinks(driver):
    # Find all elements with href attribute
    elements = driver.find_elements(By.XPATH, '//*[@href]')
    # Extract the href values from the elements
    hrefs = [element.get_attribute('href') for element in elements]
    return hrefs


def ignore_image_and_document_hrefs(href):
    parsed_url = urlparse(href)
    path = parsed_url.path
    file_extension = Path(path).suffix
    # Check if the href has a domain suffix of image or document file extensions
    return file_extension.lower() in ('.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js')


def process_page(driver, url, visited_pages, start_domain, data):
    # Add the URL to visited pages
    visited_pages.add(url)
    # Navigate to the URL
    driver.get(url)
    # Wait for the page to load
    time.sleep(2)
    # Extract the content from the page
    content_element = driver.find_element(By.XPATH, '//body')
    content_text = content_element.text
    print(content_text)
    # Add URL, Domain, and Content to the data list
    data.append([url, start_domain, content_text])

    # Find and process hyperlinks
    hrefs = extract_hyperlinks(driver)
    for href in hrefs:
        # Check if the href should be ignored
        if ignore_image_and_document_hrefs(href):
            continue
        # Check if the href is of type "javascript:void(0)"
        if href.startswith("javascript:void(0)"):
            continue
        # Check if the href leads back to the original page or has already been visited
        if check_href(href, driver.current_url, visited_pages):
            continue
        try:
            # Open the href in the same tab and retrieve data
            driver.get(href)
            # Check if the new href belongs to the same domain as the original URL
            parsed_href = urlparse(href)
            if parsed_href.netloc != start_domain:
                continue
            print(href)
            # Wait for the page to load
            time.sleep(2)
            # Extract the content from the hyperlink page
            hyperlink_content_element = driver.find_element(By.XPATH, '//body')
            hyperlink_content_text = hyperlink_content_element.text
            print(hyperlink_content_text)
            # Add URL, Domain, and Content of the hyperlink to the data list
            data.append([href, start_domain, hyperlink_content_text])
            # Recursively process the page and follow hyperlinks
            process_page(driver, href, visited_pages, start_domain, data)
        except Exception as e:
            print(f"Error processing hyperlink: {href}")
            print(f"Error message: {str(e)}")
            continue

    # Return to the original page
    driver.get(url)


def check_href(href, original_url, visited_pages):
    parsed_href = urlparse(href)
    parsed_original_url = urlparse(original_url)
    # Check if the href leads back to the original page
    if parsed_href.netloc == parsed_original_url.netloc and parsed_href.path == parsed_original_url.path and parsed_href.fragment == parsed_original_url.fragment:
        return True
    # Check if the href has already been visited
    if href in visited_pages:
        return True
    return False


def export_to_excel(data):
    # Create a DataFrame from the data list
    df = pd.DataFrame(data, columns=['URL', 'Domain', 'Content'])
    # Export the DataFrame to an Excel file
    df.to_excel('output.xlsx', index=False)


def get_cookies_from_previous_session(driver):
    cookies = {}
    try:
        # Execute JavaScript to get the cookies using jQuery Cookie Plugin
        cookie_script = """
            var cookies = $.cookie();
            return Object.entries(cookies).map(([name, value]) => `${name}=${value}`);
        """
        cookie_values = driver.execute_script(cookie_script)

        # Parse the cookie values and store them in a dictionary
        for cookie_value in cookie_values:
            cookie_name, cookie_value = cookie_value.split('=')
            cookies[cookie_name] = cookie_value
    except Exception as e:
        print('Error getting cookies:', e)
    return cookies


def add_cookies(driver, cookies):
    for name, value in cookies.items():
        driver.add_cookie({'name': name, 'value': value})


def main():
    # Starting URL
    start_url = 'http://www.ctc-yz.com/'
    # Parse the domain from the starting URL
    parsed_start_url = urlparse(start_url)
    start_domain = parsed_start_url.netloc
    # Open the website
    driver = open_website(start_url)
    # Retrieve cookies from previous session
    cookies = get_cookies_from_previous_session(driver)
    # Add cookies to the WebDriver
    add_cookies(driver, cookies)
    # Initialize the set to store visited pages
    visited_pages = set()
    # Initialize the data list
    data = []
    # Process the starting page and follow hyperlinks recursively
    process_page(driver, start_url, visited_pages, start_domain, data)
    # Export the data to an Excel file
    export_to_excel(data)
    # Close the WebDriver
    driver.quit()


if __name__ == "__main__":
    main()
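get_cookies_from_previous_session depends on the target page exposing the jQuery Cookie plugin ($.cookie()); when the plugin is absent the injected script fails and an empty dict comes back. A minimal alternative sketch (not part of this commit) that reads cookies through Selenium's own API instead:

# sketch only: read cookies via Selenium rather than injected jQuery
def get_cookies_via_selenium(driver):
    # driver.get_cookies() returns a list of dicts with 'name' and 'value' keys
    return {c['name']: c['value'] for c in driver.get_cookies()}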
scrapy.cfg
@@ -1,14 +0,0 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = zcspider.settings

[scrapyd]
bind_address = 0.0.0.0

[deploy zc1]
# url = http://localhost:6800/
project = zcspider
@@ -0,0 +1,44 @@
import subprocess
import pandas as pd
from urllib.parse import urlparse
import signal
import sys

df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')

processes = []

# Define the SIGINT handler: terminate every child crawl on Ctrl-C
def sigint_handler(signal, frame):
    print('收到 Ctrl-C 信号,正在关闭子进程...')
    for process in processes:
        process.terminate()
    print('子进程已关闭,程序退出。')
    sys.exit(0)

# Register the SIGINT handler
signal.signal(signal.SIGINT, sigint_handler)


ind = 0
for ind, row in df.iterrows():
    group = row['单位']
    name = row['主办']
    url = row['地址']
    if 'http' in url:
        sx = row['地址'].split('http')
        ename = sx[0].strip()
        if ename:
            name = ename
        url = 'http' + sx[1]
    elif 'www' in url:
        sx = row['地址'].split('www')
        ename = sx[0].strip()
        if ename:
            name = ename
        url = 'http://www' + sx[1]
    domain = urlparse(url).netloc
    cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.csv']
    process = subprocess.Popen(cmd)
    processes.append(process)
    ind += 1
    # if ind > 0:
    #     break
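As written, the launcher exits as soon as every crawl has been started, so the Ctrl-C handler only matters while the loop itself is still running and the scrapy children are left to finish on their own. If the parent should stay alive until they complete, a minimal optional sketch (not part of this commit) is to wait on the processes after the loop:

# optional: block until every spawned scrapy process has exited
for process in processes:
    process.wait()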
File diff suppressed because one or more lines are too long
@@ -17,7 +17,7 @@ NEWSPIDER_MODULE = "zcspider.spiders"
 #USER_AGENT = "zcspider (+http://www.yourdomain.com)"

 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -90,9 +90,10 @@ ROBOTSTXT_OBEY = True
 # Set settings whose default value is deprecated to a future-proof value
 REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
 TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
-FEED_EXPORT_ENCODING = "utf-8"
+FEED_EXPORT_ENCODING = 'gb18030'
 LOG_LEVEL = 'INFO'
+DOWNLOAD_TIMEOUT = 30

 ITEM_PIPELINES = {
-    'zcspider.pipelines.ZcspiderPipeline': 300,
+    # 'zcspider.pipelines.ZcspiderPipeline': 300,
 }
@@ -1,33 +1,80 @@
 import scrapy
 from urllib.parse import urlparse
 import html2text
+import requests
+import re
+from urllib.parse import urljoin


 class BaseSpider(scrapy.Spider):
     name = "basespider"
     start_urls = ["http://ctc.ac.cn/"]
     visited_urls = set()

-    def __init__(self, start_url: str, name=None, **kwargs):
+    def __init__(self, domain: str, start_url: str, name='', group='', **kwargs):
         super().__init__(name, **kwargs)
-        self.domain = urlparse(start_url).netloc
+        self.headers = {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+            'Cache-Control': 'max-age=0',
+            'Proxy-Connection': 'keep-alive',
+            'Referer': 'https://www.baidu.com/',
+            'Upgrade-Insecure-Requests': '1',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
+        }
+        self.domain = domain
         self.start_urls = [start_url]
-        self.ext = tuple(['.png', '.jpg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'])
+        self.name = name
+        self.group = group
+        self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico'])
+
+    def start_requests(self):
+        for url in self.start_urls:
+            r = scrapy.Request(url, dont_filter=True, headers=self.headers, callback=self.parse, errback=self.request2)
+            yield r
+
+    def request2(self, fail):
+        rurl = fail.request.url
+        self.logger.info(f'{rurl} 使用requests继续请求')
+        r = requests.get(url=fail.request.url, headers=self.headers, timeout=20)
+        rtext = r.text
+        h = html2text.HTML2Text()
+        h.ignore_links = True  # ignore all links
+        text = h.handle(rtext)
+        yield {
+            'group': self.group,
+            'name': self.name,
+            'domain': self.domain,
+            'url': rurl,
+            'text': text,
+        }
+        self.visited_urls.add(rurl)
+        links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
+        for link in links:
+            full_link = urljoin(r.url, link)
+            if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
+                if urlparse(full_link).netloc == self.domain:
+                    yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)

     def parse(self, response):
         self.visited_urls.add(response.url)
         h = html2text.HTML2Text()
         h.ignore_links = True  # ignore all links
         # extract the plain-text content
-        text = h.handle(response.body.decode())
+        # try:
+        text = h.handle(response.text)
+        # except:
+        #     text = h.handle(response.body.decode(encoding='gb18030'))
+
         yield {
+            'group': self.group,
+            'name': self.name,
             'domain': self.domain,
             'url': response.url,
             'text': text,
         }

         for link in response.css("a::attr('href')").getall():
-            if link not in self.visited_urls:
-                if link.startswith("/") or urlparse(link).netloc == self.domain:
-                    if not link.endswith(self.ext):
-                        yield scrapy.Request(response.urljoin(link), callback=self.parse)
+            full_link = response.urljoin(link)
+            if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
+                if urlparse(full_link).netloc == self.domain:
+                    yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
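With the new constructor signature, basespider takes domain and start_url as required arguments plus optional name and group, all passed with -a on the command line, as in the commands recorded in note.txt:

scrapy crawl basespider -a start_url=http://www.ctc-hn.com -a name=test -a group=test -a domain=www.ctc-hn.com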