feat: integrate and package the runtime

caoqianming 2023-08-25 15:01:45 +08:00
parent 722f9b2542
commit 18580ecd28
18 changed files with 285 additions and 517318 deletions

.gitignore

@@ -6,8 +6,14 @@ twistd.pid
~$*
*.xlsx
!biao.xlsx
!template.xlsx
wechat_dir/*
ana_web.csv
ana_wechat.csv
*.csv
.idea/*
*.pdf
article/*
db_folder/*
runtime/*
chrome117.exe
html/*
excel/*

mycode/__init__.py (new file, empty)

mycode/base.py (new file)

@@ -0,0 +1,2 @@
import os
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
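
A quick sketch of how this resolves at run time, assuming base.py sits at <project>/mycode/base.py: two dirname calls walk up from the file to the project root, so other modules can build absolute paths instead of relying on the current working directory.

import os

# assuming this file is <project>/mycode/base.py:
#   os.path.abspath(__file__)   -> <project>/mycode/base.py
#   dirname(...)                -> <project>/mycode
#   dirname(dirname(...))       -> <project>
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# downstream modules then join against BASE_DIR, e.g. in crawl_chrome.py:
chromedriver_path = os.path.join(BASE_DIR, 'mycode', 'chromedriver.exe')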

mycode/crawl_chrome.py

@@ -5,6 +5,10 @@ import time
from urllib.parse import urlparse
from pathlib import Path
import pandas as pd
from .base import BASE_DIR
import os
chrome_driver_file = os.path.join(BASE_DIR, 'mycode', 'chromedriver.exe')
failed_sites_file = os.path.join(BASE_DIR, 'mycode/failed_sites.xlsx')
def open_website(url):
# Set up Chrome WebDriver with custom User-Agent
@@ -12,7 +16,7 @@ def open_website(url):
options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
prefs = {"profile.managed_default_content_settings.images": 2, 'notifications':2}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome("./chromedriver.exe", options=options)
driver = webdriver.Chrome(chrome_driver_file, options=options)
driver.get(url)
return driver
@@ -40,7 +44,7 @@ def process_page(driver, url, visited_pages, start_domain, data):
# Extract the content from the page
content_element = driver.find_element(By.XPATH, '//body')
content_text = content_element.text
print(content_text)
# print(content_text)
# Add URL, Domain, and Content to the data list
data.append([start_domain, url, content_text])
@@ -59,17 +63,17 @@ def process_page(driver, url, visited_pages, start_domain, data):
try:
# Check if the new href belongs to the same domain as the original URL
parsed_href = urlparse(href)
if parsed_href.netloc != start_domain:
if parsed_href.netloc.replace("www.", "") != start_domain:
continue
# Open the href in the same tab and retrieve data
driver.get(href)
print(href)
# print(href)
# Wait for the page to load
time.sleep(2)
# Extract the content from the hyperlink page
hyperlink_content_element = driver.find_element(By.XPATH, '//body')
hyperlink_content_text = hyperlink_content_element.text
print(hyperlink_content_text)
# print(hyperlink_content_text)
# Add URL, Domain, and Content of the hyperlink to the data list
data.append([start_domain, href, hyperlink_content_text])
# Recursively process the page and follow hyperlinks
@@ -86,7 +90,7 @@ def check_href(href, original_url, visited_pages):
parsed_href = urlparse(href)
parsed_original_url = urlparse(original_url)
# Check if the href leads back to the original page
if parsed_href.netloc == parsed_original_url.netloc and parsed_href.path == parsed_original_url.path:
if parsed_href.netloc.replace("www.", "") == parsed_original_url.netloc.replace("www.", "") and parsed_href.path == parsed_original_url.path:
return True
# Check if the href has already been visited
if href in visited_pages:
@@ -129,15 +133,15 @@ def add_cookies(driver, cookies):
for name, value in cookies.items():
driver.add_cookie({'name': name, 'value': value})
def main():
def chrome_main():
# Read failed URLs from the list
df = pd.read_excel('failed_files.xlsx')
df = pd.read_excel(failed_sites_file)
for ind, row in df.iterrows():
group = row['单位'] # Replace with the actual column name for group
name = row['主办']
url = row['地址']
domain = urlparse(url).netloc
domain = urlparse(url).netloc.replace("www.", "")
# Open the website
driver = open_website(url)
@@ -156,11 +160,11 @@ def main():
process_page(driver, url, visited_pages, domain, data)
# Export data to a separate Excel file in the web_dir directory
output_filename = f'web_dir/{name}_{domain}.xlsx'
output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
export_to_excel(data, output_filename)
# Close the WebDriver
driver.quit()
if __name__ == "__main__":
main()
chrome_main()
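
The recurring change in this file is the domain comparison: netloc is normalized by stripping "www." before matching, so links on www.example.com stay in scope for example.com. A minimal sketch of that check in isolation (the same_domain helper is illustrative only, not part of this commit):

from urllib.parse import urlparse

def same_domain(href, start_domain):
    # start_domain is stored without a leading "www.", as in chrome_main()
    # note: str.replace removes every "www." occurrence, mirroring the diff
    return urlparse(href).netloc.replace("www.", "") == start_domain

# e.g. same_domain("https://www.example.com/news/1.html", "example.com") -> True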

mycode/main.py

@@ -1,12 +1,12 @@
import pandas as pd
import os
import sqlite3
from .base import BASE_DIR
current_dir = os.getcwd()
wechat_dir = os.path.join(current_dir, 'article')
web_dir = os.path.join(current_dir, 'web_dir')
output_dir = os.path.join(current_dir, 'summary')
df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
wechat_dir = os.path.join(BASE_DIR, 'article')
web_dir = os.path.join(BASE_DIR, 'web_dir')
output_dir = os.path.join(BASE_DIR, 'summary')
df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
def trans_to_json():
json_str = df_s.to_json(orient='records', force_ascii=False)
@@ -14,7 +14,7 @@ def trans_to_json():
f.write(json_str)
def make_simple_csv_from_db():
conn = sqlite3.connect('db_folder/test.db')
conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder/test.db'))
query = "select id, g.nickname, a.title, a.content_url, datetime(a.p_date, 'unixepoch', 'localtime') as pub_date from articles a LEFT JOIN gzhs g on g.biz = a.biz"
df = pd.read_sql_query(query, conn)
# close the database connection
@@ -65,7 +65,7 @@ def ana_wechat():
output_data.append(output_row)
index += 1
output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
# output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
return output_data
@@ -86,29 +86,17 @@ def ana_web():
output_row = [
index,
row2['name'],
"文章标题",
"/",
row['错误表述'],
row['建议修改词语'],
row['错误分类'],
row2['content_url']
row2['url']
]
output_data.append(output_row)
index += 1
output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
# output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
return output_data
# Run WeChat Analysis
wechat_results = ana_wechat()
# Run Web Content Analysis
web_results = ana_web()
# Save results in an Excel file with two sheets
output_excel_path = os.path.join(output_dir, '总院及下属公司官方公众号巡查结果汇总表.xlsx')
with pd.ExcelWriter(output_excel_path) as writer:
wechat_results.to_excel(writer, sheet_name='公众号', index=False)
web_results.to_excel(writer, sheet_name='网站', index=False)
print("Analysis completed and results saved to Excel.")
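
For context, a minimal sketch of what the make_simple_csv_from_db hunk above amounts to, assuming the function simply dumps the joined query to a CSV; only the connection and query lines are visible in the diff, and the output filename here is an assumption.

import os
import sqlite3
import pandas as pd
from .base import BASE_DIR

def make_simple_csv_from_db_sketch():
    conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder/test.db'))
    query = ("select id, g.nickname, a.title, a.content_url, "
             "datetime(a.p_date, 'unixepoch', 'localtime') as pub_date "
             "from articles a LEFT JOIN gzhs g on g.biz = a.biz")
    df = pd.read_sql_query(query, conn)
    conn.close()  # close the database connection
    # hypothetical output path; the real function may post-process further
    df.to_csv(os.path.join(BASE_DIR, 'articles_simple.csv'), index=False)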


@@ -28,7 +28,7 @@ for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = row['地址']
domain = urlparse(url).netloc
domain = urlparse(url).netloc.replace('www.', '')
cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
process = subprocess.Popen(cmd)
processes.append(process)
@@ -52,3 +52,4 @@ for ind, row in df.iterrows():
if info_to_save:
save_info_to_excel(info_to_save, 'failed_files.xlsx')
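
The failed-site list written here feeds the Chrome fallback; the criterion (visible in web3.py below) is a simple file-size heuristic. A sketch of that check on its own, assuming the same 20 KB threshold:

import os

def needs_chrome_retry(output_filename, min_bytes=20 * 1024):
    # mirrors the launcher: a workbook that exists but is under ~20 KB
    # counts as a failed crawl and is handed to the Chrome-based fallback
    return os.path.exists(output_filename) and os.path.getsize(output_filename) < min_bytes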

mycode/web2.py (new file)

@@ -0,0 +1,60 @@
import requests
import sqlite3
import pandas as pd
import html2text
import re
from urllib.parse import urlparse, urljoin
import concurrent.futures
class WebSpider:
def __init__(self) -> None:
self.headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Proxy-Connection': 'keep-alive',
'Referer': 'https://www.baidu.com/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
}
self.visited_urls = set()
self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js'])
self.futures = []
def get_one_page(self, group, name, domain, url):
if url in self.visited_urls:
return
self.visited_urls.add(url)
r = requests.get(url=url, headers=self.headers, timeout=10)
rtext = r.text
if rtext:
h = html2text.HTML2Text()
h.ignore_links = True  # ignore all links
text = h.handle(rtext)
print(group, name, domain, url)
links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
for link in links:
full_link = urljoin(r.url, link)
if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
if urlparse(full_link).netloc.replace('www.', "") == domain:
self.get_one_page(group, name, domain, full_link)
def start(self):
df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
# for ind, row in df.iterrows():
# group = row['单位']
# name = row['主办']
# url = row['地址']
# domain = urlparse(url).netloc.replace('www.', '')
# self.get_one_page(group, name, domain, url)
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
futures = []
for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = row['地址']
domain = urlparse(url).netloc.replace('www.', '')
futures.append(executor.submit(self.get_one_page, group, name, domain, url))
concurrent.futures.wait(futures)
WebSpider().start()
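
web2.py, the Scrapy spider and the requests fallback all run pages through html2text the same way; a minimal, self-contained sketch of that conversion step:

import html2text

def html_to_text(html):
    h = html2text.HTML2Text()
    h.ignore_links = True  # drop hyperlinks, keep the readable text
    return h.handle(html)

# roughly: html_to_text('<p>Hello <a href="/x">world</a></p>') -> 'Hello world\n\n'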


@@ -1,6 +1,4 @@
scrapy==2.8.0
scrapyd==1.4.1
scrapyd-client==1.2.3
html2text==2020.1.16
pandas==2.0.0
openpyxl==3.1.2

summary/template.xlsx (new binary file, not shown)

web3.py (new file)

@@ -0,0 +1,110 @@
import os
import subprocess
import pandas as pd
from urllib.parse import urlparse
import signal
import sys
import datetime
from openpyxl import load_workbook
from mycode.base import BASE_DIR
from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
from mycode.crawl_chrome import chrome_main, failed_sites_file
def save_info_to_excel(info_list, output_filename):
df = pd.DataFrame(info_list, columns=['单位', '主办' , '地址'])
df.to_excel(output_filename, index=False)
# define the SIGINT signal handler
def sigint_handler(signal, frame):
print('收到 Ctrl-C 信号,正在关闭子进程...')
for process in processes:
process.terminate()
print('子进程已关闭,程序退出。')
sys.exit(0)
if __name__ == '__main__':
print('巡查任务开始。。。')
now = datetime.datetime.now()
month = now.month
print('正在组合微信公众号爬取内容。。。')
make_simple_csv_from_db()
make_wechat_articles_full()
print('公众号爬取内容组装完毕!')
print('开始进行网站爬取。。。')
df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
processes = []
# register the SIGINT signal handler
signal.signal(signal.SIGINT, sigint_handler)
ind = 0
for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = row['地址']
domain = urlparse(url).netloc.replace('www.', '')
# output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
cmd = ['./runtime/Scripts/scrapy.exe', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
# cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
process = subprocess.Popen(cmd)
processes.append(process)
# Wait for all processes to finish
for process in processes:
process.wait()
print('网站爬取结束,校验中。。。')
# Check output file sizes and save information if size is less than 20KB
info_to_save = []
for ind, row in df.iterrows():
group = row['单位']
name = row['主办']
url = row['地址']
domain = urlparse(url).netloc.replace("www.", "")
output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
if os.path.exists(output_filename):
file_size = os.path.getsize(output_filename)
if file_size < 20 * 1024: # Convert KB to bytes
info_to_save.append([group, name, url])
if info_to_save:
print('存在未爬取站点,正在调用Chrome继续爬取。。。')
save_info_to_excel(info_to_save, failed_sites_file)
chrome_main()
os.remove(failed_sites_file)
print('网站爬取完毕!')
print('开始对比分析所有内容。。。')
# Run WeChat Analysis
wechat_results = ana_wechat()
# Run Web Content Analysis
web_results = ana_web()
# Save results in an Excel file with two sheets
output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
# with pd.ExcelWriter(output_excel_path) as writer:
# df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
# df.to_excel(writer, sheet_name='公众号', index=False)
# df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
# df2.to_excel(writer, sheet_name='网站', index=False)
template_path = os.path.join(output_dir, 'template.xlsx')
workbook = load_workbook(template_path)
# select the worksheets to write to
wechat_sheet = workbook['公众号']
web_sheet = workbook['网站']
for row in wechat_results:
wechat_sheet.append(row)
for row in web_results:
web_sheet.append(row)
workbook.save(output_excel_path)
workbook.close()
print('巡查任务执行完毕, 请查看summary文件夹, 可手动校对')
os.system("pause")
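
The launcher now calls the bundled ./runtime/Scripts/scrapy.exe directly, which is the point of packaging the runtime. A hedged sketch of how that command line is assembled, with a fallback to a PATH-installed scrapy that is not in this commit and is shown only for illustration (the example.com values are placeholders):

import os
import shutil

bundled_scrapy = os.path.join('runtime', 'Scripts', 'scrapy.exe')
scrapy_cmd = bundled_scrapy if os.path.exists(bundled_scrapy) else (shutil.which('scrapy') or 'scrapy')
cmd = [scrapy_cmd, 'crawl', 'basespider',
       '-a', 'domain=example.com',
       '-a', 'start_url=https://example.com',
       '-a', 'name=demo', '-a', 'group=demo',
       '-o', 'web_dir/demo_example.com.xlsx']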

wechat.exe (new binary file, not shown)

File diff suppressed because one or more lines are too long

zcspider/pipelines.py

@@ -7,7 +7,6 @@ from openpyxl import Workbook, load_workbook
# useful for handling different item types with a single interface
from scrapy.exceptions import IgnoreRequest
import psycopg2
# class ZcspiderPipeline2:
# """
@@ -28,10 +27,17 @@ import psycopg2
class ZcspiderPipeline:
def open_spider(self, spider):
print('初始化数据库连接')
self.conn = psycopg2.connect(host="49.232.14.174",database="zcspider", user="postgres", password="zcDsj2021")
self.cur = self.conn.cursor()
self.cur.execute("delete from content where domain = %s", (spider.domain, ))
self.file_name = spider.output
if os.path.exists(self.file_name):
os.remove(self.file_name)
self.wb = Workbook()
self.ws = self.wb.active
self.ws.append(['group', 'name', 'domain', 'url', 'text'])
# print('初始化数据库连接')
# self.conn = psycopg2.connect(host="49.232.14.174",database="zcspider", user="postgres", password="zcDsj2021")
# self.cur = self.conn.cursor()
# self.cur.execute("delete from content where domain = %s", (spider.domain, ))
# rows = self.cur.fetchall()
# spider.visited_urls_last = [i[0] for i in rows] if len(rows)>1 else []
@@ -45,18 +51,22 @@ class ZcspiderPipeline:
# return request
def process_item(self, item, spider):
try:
self.cur.execute("INSERT INTO content (domain, url, text) VALUES(%s, %s, %s)",
(item['domain'], item['url'], item['text']))
self.conn.commit()
except:
self.conn.rollback()
raise
# try:
# self.cur.execute("INSERT INTO content (domain, url, text) VALUES(%s, %s, %s)",
# (item['domain'], item['url'], item['text']))
# self.conn.commit()
# except:
# self.conn.rollback()
# raise
line = [item['group'], item['name'], item['domain'], item['url'], item['text']]
self.ws.append(line)
self.wb.save(self.file_name)
return item
# done; close the connection
def close_spider(self, spider):
# close the cursor
self.cur.close()
# close the connection
self.conn.close()
# # close the cursor
# self.cur.close()
# # close the connection
# self.conn.close()
self.wb.close()

zcspider/settings.py

@@ -37,10 +37,15 @@ ROBOTSTXT_OBEY = False
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Proxy-Connection': 'keep-alive',
'Referer': 'https://www.baidu.com/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
@@ -91,11 +96,11 @@ ROBOTSTXT_OBEY = False
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = 'gb18030'
LOG_LEVEL = 'INFO'
DOWNLOAD_TIMEOUT = 30
LOG_LEVEL = 'ERROR'
DOWNLOAD_TIMEOUT = 10
ITEM_PIPELINES = {
# 'zcspider.pipelines.ZcspiderPipeline2': 300,
# 'zcspider.pipelines.ZcspiderPipeline': 300,
}
FEED_EXPORTERS = {


@@ -25,7 +25,8 @@ class BaseSpider(scrapy.Spider):
self.start_urls = [start_url]
self.name = name
self.group = group
self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico'])
self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.dat', '.css', '.js'])
print(f"爬取开始: {name}-{domain}")
def start_requests(self):
for url in self.start_urls:
@@ -33,27 +34,28 @@
yield r
def request2(self, fail):
rurl = fail.request.url,
rurl = fail.request.url
self.logger.info(f'{rurl} 使用requests继续请求')
r = requests.get(url=fail.request.url, headers=self.headers, timeout=20)
rtext = r.text
h = html2text.HTML2Text()
h.ignore_links = True  # ignore all links
text = h.handle(rtext)
yield {
'group': self.group,
'name': self.name,
'domain': self.domain,
'url': rurl,
'text': text,
}
self.visited_urls.add(rurl)
links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
for link in links:
full_link = urljoin(r.url, link)
if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
if urlparse(full_link).netloc == self.domain:
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
if r.status_code < 400:
rtext = r.text
h = html2text.HTML2Text()
h.ignore_links = True  # ignore all links
text = h.handle(rtext)
yield {
'group': self.group,
'name': self.name,
'domain': self.domain,
'url': rurl,
'text': text,
}
self.visited_urls.add(rurl)
links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
for link in links:
full_link = urljoin(r.url, link)
if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
if urlparse(full_link).netloc.replace('www.', '') == self.domain:
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
def parse(self, response):
self.visited_urls.add(response.url)
@@ -64,17 +66,21 @@
text = h.handle(response.text)
# except:
# text = h.handle(response.body.decode(encoding='gb18030'))
if response.status < 400:
yield {
'group': self.group,
'name': self.name,
'domain': self.domain,
'url': response.url,
'text': text,
}
yield {
'group': self.group,
'name': self.name,
'domain': self.domain,
'url': response.url,
'text': text,
}
for link in response.css("a::attr('href')").getall():
full_link = response.urljoin(link)
if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
if urlparse(full_link).netloc.replace('www.', '') == self.domain:
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
for link in response.css("a::attr('href')").getall():
full_link = response.urljoin(link)
if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
if urlparse(full_link).netloc == self.domain:
yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
def closed(self, reason):
# This method will be called when the Spider is about to close
print(f'爬取完成: {self.name}-{self.domain}')