feat: 整合并打包runtime (integrate and package the runtime)
commit 18580ecd28
parent 722f9b2542
@@ -6,8 +6,14 @@ twistd.pid
 ~$*
 *.xlsx
 !biao.xlsx
+!template.xlsx
 wechat_dir/*
-ana_web.csv
-ana_wechat.csv
+*.csv
 .idea/*
 *.pdf
+article/*
+db_folder/*
+runtime/*
+chrome117.exe
+html/*
+excel/*
@@ -0,0 +1,2 @@
+import os
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
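Note: the doubled os.path.dirname resolves BASE_DIR to the project root (one level above the mycode package), so the other modules can build absolute paths that keep working when the project is launched from the packaged runtime rather than from the source checkout's working directory. A minimal sketch of the resolution, assuming base.py sits at <project>/mycode/base.py (the path below is a stand-in for __file__):

    import os

    module_file = os.path.abspath('/project/mycode/base.py')  # stand-in for __file__
    package_dir = os.path.dirname(module_file)                # -> /project/mycode
    base_dir = os.path.dirname(package_dir)                   # -> /project (the value BASE_DIR gets)
    print(base_dir)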
Binary file not shown.
@@ -5,6 +5,10 @@ import time
 from urllib.parse import urlparse
 from pathlib import Path
 import pandas as pd
+from .base import BASE_DIR
+import os
+chrome_driver_file = os.path.join(BASE_DIR, 'mycode', 'chromedriver.exe')
+failed_sites_file = os.path.join(BASE_DIR, 'mycode/failed_sites.xlsx')
 
 def open_website(url):
     # Set up Chrome WebDriver with custom User-Agent
@@ -12,7 +16,7 @@ def open_website(url):
     options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
     prefs = {"profile.managed_default_content_settings.images": 2, 'notifications':2}
     options.add_experimental_option("prefs", prefs)
-    driver = webdriver.Chrome("./chromedriver.exe", options=options)
+    driver = webdriver.Chrome(chrome_driver_file, options=options)
     driver.get(url)
     return driver
 
@@ -40,7 +44,7 @@ def process_page(driver, url, visited_pages, start_domain, data):
     # Extract the content from the page
     content_element = driver.find_element(By.XPATH, '//body')
     content_text = content_element.text
-    print(content_text)
+    # print(content_text)
     # Add URL, Domain, and Content to the data list
     data.append([start_domain, url, content_text])
 
@@ -59,17 +63,17 @@ def process_page(driver, url, visited_pages, start_domain, data):
         try:
             # Check if the new href belongs to the same domain as the original URL
             parsed_href = urlparse(href)
-            if parsed_href.netloc != start_domain:
+            if parsed_href.netloc.replace("www.", "") != start_domain:
                 continue
             # Open the href in the same tab and retrieve data
            driver.get(href)
-            print(href)
+            # print(href)
             # Wait for the page to load
             time.sleep(2)
             # Extract the content from the hyperlink page
             hyperlink_content_element = driver.find_element(By.XPATH, '//body')
             hyperlink_content_text = hyperlink_content_element.text
-            print(hyperlink_content_text)
+            # print(hyperlink_content_text)
             # Add URL, Domain, and Content of the hyperlink to the data list
             data.append([start_domain, href, hyperlink_content_text])
             # Recursively process the page and follow hyperlinks
@@ -86,7 +90,7 @@ def check_href(href, original_url, visited_pages):
     parsed_href = urlparse(href)
     parsed_original_url = urlparse(original_url)
     # Check if the href leads back to the original page
-    if parsed_href.netloc == parsed_original_url.netloc and parsed_href.path == parsed_original_url.path:
+    if parsed_href.netloc.replace("www.", "") == parsed_original_url.netloc.replace("www.", "") and parsed_href.path == parsed_original_url.path:
         return True
     # Check if the href has already been visited
     if href in visited_pages:
@@ -129,15 +133,15 @@ def add_cookies(driver, cookies):
     for name, value in cookies.items():
         driver.add_cookie({'name': name, 'value': value})
 
-def main():
+def chrome_main():
     # Read failed URLs from the list
-    df = pd.read_excel('failed_files.xlsx')
+    df = pd.read_excel(failed_sites_file)
 
     for ind, row in df.iterrows():
         group = row['单位']  # Replace with the actual column name for group
         name = row['主办']
         url = row['地址']
-        domain = urlparse(url).netloc
+        domain = urlparse(url).netloc.replace("www.", "")
 
         # Open the website
         driver = open_website(url)
@@ -156,11 +160,11 @@ def main():
         process_page(driver, url, visited_pages, domain, data)
 
         # Export data to a separate Excel file in the web_dir directory
-        output_filename = f'web_dir/{name}_{domain}.xlsx'
+        output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
         export_to_excel(data, output_filename)
 
         # Close the WebDriver
         driver.quit()
 
 if __name__ == "__main__":
-    main()
+    chrome_main()
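Note: the repeated .replace("www.", "") normalizes hostnames before comparison, so www.example.com and example.com count as the same site (chrome_main already stores the domain without the www. prefix). A small illustration of the check; same_site is a hypothetical helper, not part of the commit, and note that str.replace also strips "www." anywhere in the host, which mirrors the behaviour in the diff:

    from urllib.parse import urlparse

    def same_site(href, start_domain):
        # start_domain is assumed to be stored without a leading "www."
        return urlparse(href).netloc.replace("www.", "") == start_domain

    print(same_site("https://www.example.com/news/1.html", "example.com"))  # True
    print(same_site("https://other.example.org/x", "example.com"))          # False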
@@ -1,12 +1,12 @@
 import pandas as pd
 import os
 import sqlite3
+from .base import BASE_DIR
 
-current_dir = os.getcwd()
-wechat_dir = os.path.join(current_dir, 'article')
-web_dir = os.path.join(current_dir, 'web_dir')
-output_dir = os.path.join(current_dir, 'summary')
-df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')
+wechat_dir = os.path.join(BASE_DIR, 'article')
+web_dir = os.path.join(BASE_DIR, 'web_dir')
+output_dir = os.path.join(BASE_DIR, 'summary')
+df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
 
 def trans_to_json():
     json_str = df_s.to_json(orient='records', force_ascii=False)
@@ -14,7 +14,7 @@ def trans_to_json():
         f.write(json_str)
 
 def make_simple_csv_from_db():
-    conn = sqlite3.connect('db_folder/test.db')
+    conn = sqlite3.connect(os.path.join(BASE_DIR, 'db_folder/test.db'))
     query = "select id, g.nickname, a.title, a.content_url, datetime(a.p_date, 'unixepoch', 'localtime') as pub_date from articles a LEFT JOIN gzhs g on g.biz = a.biz"
     df = pd.read_sql_query(query, conn)
     # 关闭数据库连接
@@ -65,7 +65,7 @@ def ana_wechat():
             output_data.append(output_row)
             index += 1
 
-    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    # output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
 
     return output_data
 
@@ -86,29 +86,17 @@ def ana_web():
            output_row = [
                index,
                row2['name'],
-               "文章标题",
+               "/",
                row['错误表述'],
                row['建议修改词语'],
                row['错误分类'],
-               row2['content_url']
+               row2['url']
            ]
            output_data.append(output_row)
            index += 1
 
-    output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    # output_data.insert(0, ['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
 
     return output_data
 
-# Run WeChat Analysis
-wechat_results = ana_wechat()
-
-# Run Web Content Analysis
-web_results = ana_web()
-
-# Save results in an Excel file with two sheets
-output_excel_path = os.path.join(output_dir, '总院及下属公司官方公众号巡查结果汇总表.xlsx')
-with pd.ExcelWriter(output_excel_path) as writer:
-    wechat_results.to_excel(writer, sheet_name='公众号', index=False)
-    web_results.to_excel(writer, sheet_name='网站', index=False)
-
-print("Analysis completed and results saved to Excel.")
@@ -28,7 +28,7 @@ for ind, row in df.iterrows():
     group = row['单位']
     name = row['主办']
     url = row['地址']
-    domain = urlparse(url).netloc
+    domain = urlparse(url).netloc.replace('www.', '')
     cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
     process = subprocess.Popen(cmd)
     processes.append(process)
@@ -52,3 +52,4 @@ for ind, row in df.iterrows():
 
 if info_to_save:
     save_info_to_excel(info_to_save, 'failed_files.xlsx')
+
@@ -0,0 +1,60 @@
+import requests
+import sqlite3
+import pandas as pd
+import html2text
+import re
+from urllib.parse import urlparse, urljoin
+import concurrent.futures
+
+class WebSpider:
+    def __init__(self) -> None:
+        self.headers = {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+            'Cache-Control': 'max-age=0',
+            'Proxy-Connection': 'keep-alive',
+            'Referer': 'https://www.baidu.com/',
+            'Upgrade-Insecure-Requests': '1',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
+        }
+        self.visited_urls = set()
+        self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js'])
+        self.futures = []
+
+    def get_one_page(self, group, name, domain, url):
+        if url in self.visited_urls:
+            return
+        self.visited_urls.add(url)
+        r = requests.get(url=url, headers=self.headers, timeout=10)
+        rtext = r.text
+        if rtext:
+            h = html2text.HTML2Text()
+            h.ignore_links = True  # 忽略所有链接
+            text = h.handle(rtext)
+            print(group, name, domain, url)
+            links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
+            for link in links:
+                full_link = urljoin(r.url, link)
+                if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
+                    if urlparse(full_link).netloc.replace('www.', "") == domain:
+                        self.get_one_page(group, name, domain, full_link)
+
+    def start(self):
+        df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
+        # for ind, row in df.iterrows():
+        #     group = row['单位']
+        #     name = row['主办']
+        #     url = row['地址']
+        #     domain = urlparse(url).netloc.replace('www.', '')
+        #     self.get_one_page(group, name, domain, url)
+        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
+            futures = []
+            for ind, row in df.iterrows():
+                group = row['单位']
+                name = row['主办']
+                url = row['地址']
+                domain = urlparse(url).netloc.replace('www.', '')
+                futures.append(executor.submit(self.get_one_page, group, name, domain, url))
+            concurrent.futures.wait(futures)
+
+WebSpider().start()
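Note: the new requests-based spider flattens every fetched page to plain text with html2text before any keyword analysis, the same conversion the Scrapy spider uses. A standalone example of that step (the HTML snippet is made up):

    import html2text

    h = html2text.HTML2Text()
    h.ignore_links = True  # drop link markup, keep only the visible text
    sample = "<html><body><h1>公告</h1><p>正文 <a href='/x'>详情</a></p></body></html>"
    print(h.handle(sample))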
@@ -1,6 +1,4 @@
 scrapy==2.8.0
-scrapyd==1.4.1
-scrapyd-client==1.2.3
 html2text==2020.1.16
 pandas==2.0.0
 openpyxl==3.1.2
Binary file not shown.
@@ -0,0 +1,110 @@
+import os
+import subprocess
+import pandas as pd
+from urllib.parse import urlparse
+import signal
+import sys
+import datetime
+from openpyxl import load_workbook
+
+from mycode.base import BASE_DIR
+from mycode.main import make_simple_csv_from_db, make_wechat_articles_full, ana_web, ana_wechat, output_dir
+from mycode.crawl_chrome import chrome_main, failed_sites_file
+
+
+
+def save_info_to_excel(info_list, output_filename):
+    df = pd.DataFrame(info_list, columns=['单位', '主办' , '地址'])
+    df.to_excel(output_filename, index=False)
+
+# 定义 SIGINT 信号处理函数
+def sigint_handler(signal, frame):
+    print('收到 Ctrl-C 信号,正在关闭子进程...')
+    for process in processes:
+        process.terminate()
+    print('子进程已关闭,程序退出。')
+    sys.exit(0)
+
+if __name__ == '__main__':
+    print('巡查任务开始。。。')
+    now = datetime.datetime.now()
+    month = now.month
+
+    print('正在组合微信公众号爬取内容。。。')
+    make_simple_csv_from_db()
+    make_wechat_articles_full()
+    print('公众号爬取内容组装完毕!')
+
+    print('开始进行网站爬取。。。')
+
+    df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')
+    processes = []
+
+    # 注册 SIGINT 信号处理函数
+    signal.signal(signal.SIGINT, sigint_handler)
+
+    ind = 0
+    for ind, row in df.iterrows():
+        group = row['单位']
+        name = row['主办']
+        url = row['地址']
+        domain = urlparse(url).netloc.replace('www.', '')
+        # output = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
+        cmd = ['./runtime/Scripts/scrapy.exe', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.xlsx']
+        # cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-a', f'output={output}']
+        process = subprocess.Popen(cmd)
+        processes.append(process)
+
+    # Wait for all processes to finish
+    for process in processes:
+        process.wait()
+
+    print('网站爬取结束,校验中。。。')
+    # Check output file sizes and save information if size is less than 20KB
+    info_to_save = []
+    for ind, row in df.iterrows():
+        group = row['单位']
+        name = row['主办']
+        url = row['地址']
+        domain = urlparse(url).netloc.replace("www.", "")
+        output_filename = os.path.join(BASE_DIR, f'web_dir/{name}_{domain}.xlsx')
+        if os.path.exists(output_filename):
+            file_size = os.path.getsize(output_filename)
+            if file_size < 20 * 1024:  # Convert KB to bytes
+                info_to_save.append([group, name, url])
+
+    if info_to_save:
+        print('存在未爬取站点,正在调用Chrome继续爬取。。。')
+        save_info_to_excel(info_to_save, failed_sites_file)
+        chrome_main()
+        os.remove(failed_sites_file)
+
+    print('网站爬取完毕!')
+
+    print('开始对比分析所有内容。。。')
+    # Run WeChat Analysis
+    wechat_results = ana_wechat()
+    # Run Web Content Analysis
+    web_results = ana_web()
+
+    # Save results in an Excel file with two sheets
+    output_excel_path = os.path.join(output_dir, f'{month}月-总院及下属公司官方公众号巡查结果汇总表.xlsx')
+    # with pd.ExcelWriter(output_excel_path) as writer:
+    #     df = pd.DataFrame(wechat_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    #     df.to_excel(writer, sheet_name='公众号', index=False)
+    #     df2 = pd.DataFrame(web_results, columns=['序号', '信源名称', '文章标题', '错误表述', '建议修改词语', '错误分类', '原文链接'])
+    #     df2.to_excel(writer, sheet_name='网站', index=False)
+    template_path = os.path.join(output_dir, 'template.xlsx')
+    workbook = load_workbook(template_path)
+
+    # 选择要操作的工作表
+    wechat_sheet = workbook['公众号']
+    web_sheet = workbook['网站']
+    for row in wechat_results:
+        wechat_sheet.append(row)
+    for row in web_results:
+        web_sheet.append(row)
+    workbook.save(output_excel_path)
+    workbook.close()
+    print('巡查任务执行完毕, 请查看summary文件夹, 可手动校对')
+    os.system("pause")
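Note: the runner now launches the scrapy.exe bundled under runtime/Scripts instead of whatever Scrapy happens to be on PATH, one subprocess per site, and then waits for all of them. A reduced sketch of that launch loop (the sample site row is made up; the real rows come from web_sites.xlsx, and the path assumes the Windows runtime bundle):

    import subprocess

    scrapy_exe = './runtime/Scripts/scrapy.exe'  # path used by the commit
    sites = [('集团', '示例单位', 'example.com', 'https://example.com')]  # (group, name, domain, url)

    processes = []
    for group, name, domain, url in sites:
        cmd = [scrapy_exe, 'crawl', 'basespider',
               '-a', f'domain={domain}', '-a', f'start_url={url}',
               '-a', f'name={name}', '-a', f'group={group}',
               '-o', f'web_dir/{name}_{domain}.xlsx']
        processes.append(subprocess.Popen(cmd))

    for p in processes:
        p.wait()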
Binary file not shown.
517223  wechat_dir/articles_full.csv
File diff suppressed because one or more lines are too long
@@ -7,7 +7,6 @@ from openpyxl import Workbook, load_workbook
 
 # useful for handling different item types with a single interface
 from scrapy.exceptions import IgnoreRequest
-import psycopg2
 
 # class ZcspiderPipeline2:
 #     """
@@ -28,10 +27,17 @@ import psycopg2
 class ZcspiderPipeline:
 
     def open_spider(self, spider):
-        print('初始化数据库连接')
-        self.conn = psycopg2.connect(host="49.232.14.174",database="zcspider", user="postgres", password="zcDsj2021")
-        self.cur = self.conn.cursor()
-        self.cur.execute("delete from content where domain = %s", (spider.domain, ))
+        self.file_name = spider.output
+        if os.path.exists(self.file_name):
+            os.remove(self.file_name)
+        self.wb = Workbook()
+        self.ws = self.wb.active
+        self.ws.append(['group', 'name', 'domain', 'url', 'text'])
+
+        # print('初始化数据库连接')
+        # self.conn = psycopg2.connect(host="49.232.14.174",database="zcspider", user="postgres", password="zcDsj2021")
+        # self.cur = self.conn.cursor()
+        # self.cur.execute("delete from content where domain = %s", (spider.domain, ))
         # rows = self.cur.fetchall()
         # spider.visited_urls_last = [i[0] for i in rows] if len(rows)>1 else []
 
@@ -45,18 +51,22 @@ class ZcspiderPipeline:
    # return request
 
     def process_item(self, item, spider):
-        try:
-            self.cur.execute("INSERT INTO content (domain, url, text) VALUES(%s, %s, %s)",
-                (item['domain'], item['url'], item['text']))
-            self.conn.commit()
-        except:
-            self.conn.rollback()
-            raise
+        # try:
+        #     self.cur.execute("INSERT INTO content (domain, url, text) VALUES(%s, %s, %s)",
+        #         (item['domain'], item['url'], item['text']))
+        #     self.conn.commit()
+        # except:
+        #     self.conn.rollback()
+        #     raise
+        line = [item['group'], item['name'], item['domain'], item['url'], item['text']]
+        self.ws.append(line)
+        self.wb.save(self.file_name)
         return item
 
     # 结束,关闭连接
     def close_spider(self, spider):
-        # 关闭游标
-        self.cur.close()
-        # 关闭连接
-        self.conn.close()
+        # # 关闭游标
+        # self.cur.close()
+        # # 关闭连接
+        # self.conn.close()
+        self.wb.close()
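Note: the pipeline replaces the PostgreSQL writes with an in-memory openpyxl Workbook that is re-saved after every item, so partial results stay on disk if a crawl dies mid-run, at the cost of rewriting the file each time. A compact standalone sketch of that write path (the file name and sample row are illustrative):

    from openpyxl import Workbook

    wb = Workbook()
    ws = wb.active
    ws.append(['group', 'name', 'domain', 'url', 'text'])  # header row, as in open_spider
    ws.append(['集团', '示例单位', 'example.com', 'https://example.com/', '正文…'])  # one scraped item
    wb.save('demo_output.xlsx')  # process_item saves after each appended row
    wb.close()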
@@ -37,10 +37,15 @@ ROBOTSTXT_OBEY = False
 #TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
-#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-#    "Accept-Language": "en",
-#}
+DEFAULT_REQUEST_HEADERS = {
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+    'Cache-Control': 'max-age=0',
+    'Proxy-Connection': 'keep-alive',
+    'Referer': 'https://www.baidu.com/',
+    'Upgrade-Insecure-Requests': '1',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
+}
 
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
@@ -91,11 +96,11 @@ ROBOTSTXT_OBEY = False
 REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
 TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 FEED_EXPORT_ENCODING = 'gb18030'
-LOG_LEVEL = 'INFO'
-DOWNLOAD_TIMEOUT = 30
+LOG_LEVEL = 'ERROR'
+DOWNLOAD_TIMEOUT = 10
 
 ITEM_PIPELINES = {
-    # 'zcspider.pipelines.ZcspiderPipeline2': 300,
+    # 'zcspider.pipelines.ZcspiderPipeline': 300,
 }
 
 FEED_EXPORTERS = {
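Note: DEFAULT_REQUEST_HEADERS is a stock Scrapy setting; each header in the dict is applied to outgoing requests that do not set that header themselves, so the whole crawl presents the same browser-like profile the requests fallback already uses. A minimal way to observe the effect (hypothetical spider, not part of the commit):

    import scrapy

    class HeaderEchoSpider(scrapy.Spider):
        name = 'header_echo'  # hypothetical
        start_urls = ['https://httpbin.org/headers']

        def parse(self, response):
            # the User-Agent / Accept-Language configured in settings.py show up in the echoed headers
            self.logger.info(response.text)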
@@ -25,7 +25,8 @@ class BaseSpider(scrapy.Spider):
         self.start_urls = [start_url]
         self.name = name
         self.group = group
-        self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico'])
+        self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.dat', '.css', '.js'])
+        print(f"爬取开始: {name}-{domain}")
 
     def start_requests(self):
         for url in self.start_urls:
@@ -33,27 +34,28 @@ class BaseSpider(scrapy.Spider):
             yield r
 
     def request2(self, fail):
-        rurl = fail.request.url,
+        rurl = fail.request.url
         self.logger.info(f'{rurl} 使用requests继续请求')
         r = requests.get(url=fail.request.url, headers=self.headers, timeout=20)
-        rtext = r.text
-        h = html2text.HTML2Text()
-        h.ignore_links = True # 忽略所有链接
-        text = h.handle(rtext)
-        yield {
-            'group': self.group,
-            'name': self.name,
-            'domain': self.domain,
-            'url': rurl,
-            'text': text,
-        }
-        self.visited_urls.add(rurl)
-        links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
-        for link in links:
-            full_link = urljoin(r.url, link)
-            if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
-                if urlparse(full_link).netloc == self.domain:
-                    yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
+        if r.status_code < 400:
+            rtext = r.text
+            h = html2text.HTML2Text()
+            h.ignore_links = True # 忽略所有链接
+            text = h.handle(rtext)
+            yield {
+                'group': self.group,
+                'name': self.name,
+                'domain': self.domain,
+                'url': rurl,
+                'text': text,
+            }
+            self.visited_urls.add(rurl)
+            links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
+            for link in links:
+                full_link = urljoin(r.url, link)
+                if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
+                    if urlparse(full_link).netloc.replace('www.', '') == self.domain:
+                        yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
 
     def parse(self, response):
         self.visited_urls.add(response.url)
@@ -64,17 +66,21 @@ class BaseSpider(scrapy.Spider):
         text = h.handle(response.text)
         # except:
         #     text = h.handle(response.body.decode(encoding='gb18030'))
-        yield {
-            'group': self.group,
-            'name': self.name,
-            'domain': self.domain,
-            'url': response.url,
-            'text': text,
-        }
+        if response.status < 400:
+            yield {
+                'group': self.group,
+                'name': self.name,
+                'domain': self.domain,
+                'url': response.url,
+                'text': text,
+            }
 
         for link in response.css("a::attr('href')").getall():
             full_link = response.urljoin(link)
             if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
-                if urlparse(full_link).netloc == self.domain:
+                if urlparse(full_link).netloc.replace('www.', '') == self.domain:
                     yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
+
+    def closed(self, reason):
+        # This method will be called when the Spider is about to close
+        print(f'爬取完成: {self.name}-{self.domain}')
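Note: dropping the trailing comma in request2 fixes a real bug: `rurl = fail.request.url,` bound rurl to a one-element tuple, so the yielded 'url' field and the visited_urls entries held tuples instead of strings. Quick demonstration:

    url = 'https://example.com/page',   # trailing comma -> 1-tuple
    print(type(url), url)               # <class 'tuple'> ('https://example.com/page',)

    url = 'https://example.com/page'    # without the comma
    print(type(url), url)               # <class 'str'> https://example.com/page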