feat: preliminary feature commit
parent 9d4356c5f5
commit 78054d37d3
@@ -2,4 +2,9 @@ dbs/*
venv/*
__pycache__/
*.pyc
twistd.pid
summary*.md
output/*
twistd.pid
~$*
*.xlsx
!biao.xlsx
Binary file not shown.
@@ -0,0 +1,21 @@
import pandas as pd
import os
current_dir = os.getcwd()

def count_web():
    total = 0
    web_dir = os.path.join(current_dir, 'web_dir')
    for file in os.listdir(web_dir):
        try:
            df = pd.read_csv(os.path.join(web_dir, file), encoding='gb18030')
        except pd.errors.EmptyDataError:
            continue
        total = total + len(df)
        print(file, total)
    return total

def count_wechat():
    articles_full_path = os.path.join(current_dir, 'wechat_dir/articles_full.csv')
    df = pd.read_csv(articles_full_path)
    return len(df)
print(count_web(), count_wechat())
@@ -0,0 +1,89 @@
import pandas as pd
import os
import html2text
import sys

current_dir = os.getcwd()
wechat_dir = os.path.join(current_dir, 'wechat_dir')
web_dir = os.path.join(current_dir, 'web_dir')
df_s = pd.read_excel('biao.xlsx', sheet_name='筛查内容')

def trans_to_json():
    json_str = df_s.to_json(orient='records', force_ascii=False)
    with open('biao.json', 'w', encoding='utf-8') as f:
        f.write(json_str)

def make_wechat_articles_full():
    df = pd.read_csv(os.path.join(wechat_dir, 'articles.csv'), encoding='gb18030')
    df['content'] = ''
    ind = 0
    for ind, row in df.iterrows():
        full_path = os.path.join(wechat_dir, row['nickname'], row['id'] + '.html')
        try:
            with open(full_path, encoding='utf-8') as f:
                h = html2text.HTML2Text()
                h.ignore_links = True
                df.at[ind, 'content'] = h.handle(f.read())
            print(f'{ind}--{row["nickname"]}--{row["title"]}')
        except:
            print(full_path + '---不存在')
        ind += 1
    df.to_csv('articles_full.csv', encoding='utf-8_sig')

def ana_wechat():
    articles_full_path = os.path.join(wechat_dir, 'articles_full.csv')
    if not os.path.exists(articles_full_path):
        make_wechat_articles_full()
    df_a = pd.DataFrame(columns=['公众号', '标题', '地址', '错误表述', '建议修改词语', '错误分类'])
    df = pd.read_csv(articles_full_path)
    df['content'] = df['content'].fillna('')
    ind = 0
    need_save = False
    for ind, row in df_s.iterrows():
        mask = df['content'].str.contains(row['错误表述'])
        result = df[mask]
        if result.empty:
            continue
        ind2 = 0
        for ind2, row2 in result.iterrows():
            alist = [row2['nickname'], row2['title'], row2['content_url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
            print(alist)
            df_a.loc[len(df_a.index)] = alist
            if need_save is False:
                need_save = True
            ind2 += 1
        ind += 1
    if need_save:
        df_a.to_csv('ana_wechat.csv', encoding='utf-8_sig')

def ana_web():
    df_a = pd.DataFrame(columns=['单位', '主办', '地址', '错误表述', '建议修改词语', '错误分类'])
    need_save = False
    for file in os.listdir(web_dir):
        full_path = os.path.join(web_dir, file)
        if os.path.getsize(full_path) > 0:
            df = pd.read_csv(os.path.join(web_dir, file), encoding='gb18030')
            ind = 0
            for ind, row in df_s.iterrows():
                mask = df['text'].str.contains(row['错误表述'])
                result = df[mask]
                if result.empty:
                    continue
                ind2 = 0
                for ind2, row2 in result.iterrows():
                    alist = [row2['group'], row2['name'], row2['url'], row['错误表述'], row['建议修改词语'], row['错误分类']]
                    print(alist)
                    df_a.loc[len(df_a.index)] = alist
                    if need_save is False:
                        need_save = True
                    ind2 += 1
                ind += 1
    if need_save:
        df_a.to_csv('ana_web.csv', encoding='utf-8_sig')

if __name__ == "__main__":
    if len(sys.argv) > 1 and sys.argv[1] == 'wechat':
        ana_wechat()
    else:
        ana_web()
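Note on the matching above: pandas' Series.str.contains interprets its pattern as a regular expression by default, so a 错误表述 entry containing regex metacharacters (e.g. '(' or '+') could raise an error or match more than intended. A minimal, hypothetical adjustment (not part of this commit) would be to match literally and map missing cells to False:

# Hypothetical variant of the mask line used in ana_wechat()/ana_web():
# regex=False treats the expression as a literal string; na=False maps NaN cells to "no match".
mask = df['content'].str.contains(row['错误表述'], regex=False, na=False)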
note.txt (7 changed lines)
@@ -1 +1,6 @@
scrapy crawl basespider -a start_url=http://ctc.ac.cn/
scrapy crawl basespider -a start_url=http://ctc.ac.cn/ -a name=中国国检测试控股集团股份有限公司 -o output/out.csv

scrapy crawl basespider -a start_url=http://www.ctc-hn.com -a name=test -a group=test -a domain=www.ctc-hn.com


scrapy crawl basespider -a start_url=http://www.zbyjs.cn -a name=西安轻工业钟表研究所有限公司 -a group=西安轻工业钟表研究所有限公司 -a domain=zbyjs.cn -o output/56_西安轻工业钟表研究所有限公司_www.zbyjs.cn.csv
@@ -1,4 +1,6 @@
scrapy==2.8.0
scrapyd==1.4.1
scrapyd-client==1.2.3
html2text==2020.1.16
html2text==2020.1.16
pandas==2.0.0
openpyxl==3.1.2
@@ -0,0 +1,147 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
from urllib.parse import urlparse
from pathlib import Path
import pandas as pd

def open_website(url):
    # Set up Chrome WebDriver with custom User-Agent
    options = Options()
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
    driver = webdriver.Chrome("./chromedriver.exe", options=options)
    driver.get(url)
    return driver

def extract_hyperlinks(driver):
    # Find all elements with href attribute
    elements = driver.find_elements(By.XPATH, '//*[@href]')
    # Extract the href values from the elements
    hrefs = [element.get_attribute('href') for element in elements]
    return hrefs

def ignore_image_and_document_hrefs(href):
    parsed_url = urlparse(href)
    path = parsed_url.path
    file_extension = Path(path).suffix
    # Check if the href ends with an image or document file extension
    return file_extension.lower() in ('.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico', '.css', '.js')

def process_page(driver, url, visited_pages, start_domain, data):
    # Add the URL to visited pages
    visited_pages.add(url)
    # Navigate to the URL
    driver.get(url)
    # Wait for the page to load
    time.sleep(2)
    # Extract the content from the page
    content_element = driver.find_element(By.XPATH, '//body')
    content_text = content_element.text
    print(content_text)
    # Add URL, Domain, and Content to the data list
    data.append([url, start_domain, content_text])

    # Find and process hyperlinks
    hrefs = extract_hyperlinks(driver)
    for href in hrefs:
        # Check if the href should be ignored
        if ignore_image_and_document_hrefs(href):
            continue
        # Check if the href is of type "javascript:void(0)"
        if href.startswith("javascript:void(0)"):
            continue
        # Check if the href leads back to the original page or has already been visited
        if check_href(href, driver.current_url, visited_pages):
            continue
        try:
            # Open the href in the same tab and retrieve data
            driver.get(href)
            # Check if the new href belongs to the same domain as the original URL
            parsed_href = urlparse(href)
            if parsed_href.netloc != start_domain:
                continue
            print(href)
            # Wait for the page to load
            time.sleep(2)
            # Extract the content from the hyperlink page
            hyperlink_content_element = driver.find_element(By.XPATH, '//body')
            hyperlink_content_text = hyperlink_content_element.text
            print(hyperlink_content_text)
            # Add URL, Domain, and Content of the hyperlink to the data list
            data.append([href, start_domain, hyperlink_content_text])
            # Recursively process the page and follow hyperlinks
            process_page(driver, href, visited_pages, start_domain, data)
        except Exception as e:
            print(f"Error processing hyperlink: {href}")
            print(f"Error message: {str(e)}")
            continue

    # Return to the original page
    driver.get(url)

def check_href(href, original_url, visited_pages):
    parsed_href = urlparse(href)
    parsed_original_url = urlparse(original_url)
    # Check if the href leads back to the original page
    if parsed_href.netloc == parsed_original_url.netloc and parsed_href.path == parsed_original_url.path and parsed_href.fragment == parsed_original_url.fragment:
        return True
    # Check if the href has already been visited
    if href in visited_pages:
        return True
    return False

def export_to_excel(data):
    # Create a DataFrame from the data list
    df = pd.DataFrame(data, columns=['URL', 'Domain', 'Content'])
    # Export the DataFrame to an Excel file
    df.to_excel('output.xlsx', index=False)


def get_cookies_from_previous_session(driver):
    cookies = {}
    try:
        # Execute JavaScript to get the cookies using jQuery Cookie Plugin
        cookie_script = """
        var cookies = $.cookie();
        return Object.entries(cookies).map(([name, value]) => `${name}=${value}`);
        """
        cookie_values = driver.execute_script(cookie_script)

        # Parse the cookie values and store them in a dictionary
        for cookie_value in cookie_values:
            cookie_name, cookie_value = cookie_value.split('=', 1)
            cookies[cookie_name] = cookie_value
    except Exception as e:
        print('Error getting cookies:', e)
    return cookies

def add_cookies(driver, cookies):
    for name, value in cookies.items():
        driver.add_cookie({'name': name, 'value': value})

def main():
    # Starting URL
    start_url = 'http://www.ctc-yz.com/'
    # Parse the domain from the starting URL
    parsed_start_url = urlparse(start_url)
    start_domain = parsed_start_url.netloc
    # Open the website
    driver = open_website(start_url)
    # Retrieve cookies from previous session
    cookies = get_cookies_from_previous_session(driver)
    # Add cookies to the WebDriver
    add_cookies(driver, cookies)
    # Initialize the set to store visited pages
    visited_pages = set()
    # Initialize the data list
    data = []
    # Process the starting page and follow hyperlinks recursively
    process_page(driver, start_url, visited_pages, start_domain, data)
    # Export the data to an Excel file
    export_to_excel(data)
    # Close the WebDriver
    driver.quit()

if __name__ == "__main__":
    main()
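Note: webdriver.Chrome("./chromedriver.exe", options=options) uses the Selenium 3 style of passing the chromedriver path positionally; Selenium 4 deprecated and later removed that signature in favor of a Service object. A sketch of the equivalent setup under Selenium 4+ (an assumption about the installed version, not part of this commit):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

options = Options()
# Same custom User-Agent as in open_website() above.
options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36")
# Selenium 4 style: the chromedriver path is wrapped in a Service object.
driver = webdriver.Chrome(service=Service("./chromedriver.exe"), options=options)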
scrapy.cfg (14 changed lines)
@@ -1,14 +0,0 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = zcspider.settings

[scrapyd]
bind_address = 0.0.0.0

[deploy zc1]
# url = http://localhost:6800/
project = zcspider
@@ -0,0 +1,44 @@
import subprocess
import pandas as pd
from urllib.parse import urlparse
import signal
import sys

df = pd.read_excel('web_sites.xlsx', sheet_name='Sheet1')

processes = []
# Define the SIGINT (Ctrl-C) signal handler
def sigint_handler(signal, frame):
    print('收到 Ctrl-C 信号,正在关闭子进程...')
    for process in processes:
        process.terminate()
    print('子进程已关闭,程序退出。')
    sys.exit(0)
# Register the SIGINT signal handler
signal.signal(signal.SIGINT, sigint_handler)


ind = 0
for ind, row in df.iterrows():
    group = row['单位']
    name = row['主办']
    url = row['地址']
    if 'http' in url:
        sx = row['地址'].split('http')
        ename = sx[0].strip()
        if ename:
            name = ename
        url = 'http' + sx[1]
    elif 'www' in url:
        sx = row['地址'].split('www')
        ename = sx[0].strip()
        if ename:
            name = ename
        url = 'http://www' + sx[1]
    domain = urlparse(url).netloc
    cmd = ['scrapy', 'crawl', 'basespider', '-a', f'domain={domain}', '-a', f'start_url={url}', '-a', f'name={name}', '-a', f'group={group}', '-o', f'web_dir/{name}_{domain}.csv']
    process = subprocess.Popen(cmd)
    processes.append(process)
    ind += 1
    # if ind > 0:
    #     break
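For reference, each cmd list built above corresponds to a shell invocation of the same form already recorded in note.txt (angle-bracket values are placeholders filled from web_sites.xlsx):

scrapy crawl basespider -a domain=<domain> -a start_url=<url> -a name=<name> -a group=<group> -o web_dir/<name>_<domain>.csv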
File diff suppressed because one or more lines are too long
@@ -17,7 +17,7 @@ NEWSPIDER_MODULE = "zcspider.spiders"
#USER_AGENT = "zcspider (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -90,9 +90,10 @@ ROBOTSTXT_OBEY = True
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
FEED_EXPORT_ENCODING = 'gb18030'
LOG_LEVEL = 'INFO'
DOWNLOAD_TIMEOUT = 30

ITEM_PIPELINES = {
    'zcspider.pipelines.ZcspiderPipeline': 300,
    # 'zcspider.pipelines.ZcspiderPipeline': 300,
}
@@ -1,33 +1,80 @@
import scrapy
from urllib.parse import urlparse
import html2text
import requests
import re
from urllib.parse import urljoin

class BaseSpider(scrapy.Spider):
    name = "basespider"
    start_urls = ["http://ctc.ac.cn/"]
    visited_urls = set()

    def __init__(self, start_url: str, name=None, **kwargs):
    def __init__(self, domain: str, start_url: str, name='', group='', **kwargs):
        super().__init__(name, **kwargs)
        self.domain = urlparse(start_url).netloc
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Cache-Control': 'max-age=0',
            'Proxy-Connection': 'keep-alive',
            'Referer': 'https://www.baidu.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
        }
        self.domain = domain
        self.start_urls = [start_url]
        self.ext = tuple(['.png', '.jpg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'])
        self.name = name
        self.group = group
        self.ext = tuple(['.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.rar', '.zip', '.ico'])

    def start_requests(self):
        for url in self.start_urls:
            r = scrapy.Request(url, dont_filter=True, headers=self.headers, callback=self.parse, errback=self.request2)
            yield r

    def request2(self, fail):
        rurl = fail.request.url
        self.logger.info(f'{rurl} 使用requests继续请求')
        r = requests.get(url=fail.request.url, headers=self.headers, timeout=20)
        rtext = r.text
        h = html2text.HTML2Text()
        h.ignore_links = True  # Ignore all links
        text = h.handle(rtext)
        yield {
            'group': self.group,
            'name': self.name,
            'domain': self.domain,
            'url': rurl,
            'text': text,
        }
        self.visited_urls.add(rurl)
        links = re.findall(r'href=["\']?([^"\'>]+)', r.text)
        for link in links:
            full_link = urljoin(r.url, link)
            if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
                if urlparse(full_link).netloc == self.domain:
                    yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)

    def parse(self, response):
        self.visited_urls.add(response.url)
        h = html2text.HTML2Text()
        h.ignore_links = True  # Ignore all links
        # Extract plain-text content
        text = h.handle(response.body.decode())
        # try:
        text = h.handle(response.text)
        # except:
        #     text = h.handle(response.body.decode(encoding='gb18030'))

        yield {
            'group': self.group,
            'name': self.name,
            'domain': self.domain,
            'url': response.url,
            'text': text,
        }

        for link in response.css("a::attr('href')").getall():
            if link not in self.visited_urls:
                if link.startswith("/") or urlparse(link).netloc == self.domain:
                    if not link.endswith(self.ext):
                        yield scrapy.Request(response.urljoin(link), callback=self.parse)
            full_link = response.urljoin(link)
            if full_link not in self.visited_urls and (not full_link.endswith(self.ext)):
                if urlparse(full_link).netloc == self.domain:
                    yield scrapy.Request(full_link, callback=self.parse, headers=self.headers, errback=self.request2)
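Note: full_link.endswith(self.ext) compares case-sensitively, so a link ending in '.PDF' or '.JPG' would not be filtered out by the extension tuple. A small, self-contained illustration (not part of this commit):

# str.endswith is case-sensitive; lower-casing the link first catches upper-case extensions.
ext = ('.png', '.jpg', '.pdf')
print('report.PDF'.endswith(ext))          # False
print('report.PDF'.lower().endswith(ext))  # True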