72 lines
2.5 KiB
Python
72 lines
2.5 KiB
Python
# Define your item pipelines here
|
|
#
|
|
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
|
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
|
import os.path
|
|
from openpyxl import Workbook, load_workbook
|
|
|
|
# useful for handling different item types with a single interface
|
|
from scrapy.exceptions import IgnoreRequest
|
|
|
|
# class ZcspiderPipeline2:
|
|
# """
|
|
# 保存为xlsx
|
|
# """
|
|
# sheet = None
|
|
# def open_spider(self, spider):
|
|
# web_dir = 'd:/xx/web_dir'
|
|
# file_path = os.path.join(web_dir, f'{item["name"]}_{item["domain"]}.xlsx')
|
|
# wb = load_workbook(file_path)
|
|
# self.sheet = wb.get_sheet_names('Sheet1')
|
|
#
|
|
# def process_item(self, item, spider):
|
|
# self.sheet
|
|
#
|
|
# return item
|
|
|
|
class ZcspiderPipeline:
    """Scrapy item pipeline that collects crawled items into an .xlsx workbook.

    Each item becomes one worksheet row with the columns
    ``group, name, domain, url, text``.  The output file path is read from
    ``spider.output`` — the spider must define that attribute.
    """

    def open_spider(self, spider):
        """Prepare a fresh workbook and write the header row.

        Any previous output file is removed so a re-run starts clean
        (otherwise a crawl that yields no items would leave stale data
        on disk).
        """
        self.file_name = spider.output
        if os.path.exists(self.file_name):
            os.remove(self.file_name)
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['group', 'name', 'domain', 'url', 'text'])

    def process_item(self, item, spider):
        """Append one crawled item as a worksheet row and pass it on.

        The workbook is saved once, in :meth:`close_spider`.  The previous
        implementation re-saved the whole file on every item, which is
        O(n^2) total work in the number of items.  Trade-off: a mid-run
        crash now loses the partial output — acceptable because Scrapy
        always calls ``close_spider`` on normal shutdown.
        """
        self.ws.append([item['group'], item['name'], item['domain'],
                        item['url'], item['text']])
        return item

    # Crawl finished: persist and release the workbook.
    def close_spider(self, spider):
        """Save the accumulated rows to ``self.file_name`` and close the workbook."""
        self.wb.save(self.file_name)
        self.wb.close()