zcspider/zcspider/pipelines.py

72 lines
2.5 KiB
Python

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os.path
from openpyxl import Workbook, load_workbook
# useful for handling different item types with a single interface
from scrapy.exceptions import IgnoreRequest
# class ZcspiderPipeline2:
# """
# Save as xlsx
# """
# sheet = None
# def open_spider(self, spider):
# web_dir = 'd:/xx/web_dir'
# file_path = os.path.join(web_dir, f'{item["name"]}_{item["domain"]}.xlsx')
# wb = load_workbook(file_path)
# self.sheet = wb.get_sheet_names('Sheet1')
#
# def process_item(self, item, spider):
# self.sheet
#
# return item
class ZcspiderPipeline:
    """Item pipeline that stores every scraped item as one row of an xlsx file.

    The output path is read from ``spider.output``. When the spider opens,
    any file already at that path is deleted and a new in-memory workbook is
    started with a header row. Rows are appended as items arrive; the
    workbook is written to disk every ``save_interval`` rows and once more
    when the spider closes, so an output file exists even for a crawl that
    produced no items.
    """

    # Header row of the sheet; also the item keys read, in order, per row.
    _COLUMNS = ('group', 'name', 'domain', 'url', 'text')

    # Rows appended between two on-disk checkpoints. Saving rewrites the
    # whole workbook, so saving after every single item makes the total I/O
    # grow quadratically with the number of items. Set to 1 to save per item.
    save_interval = 100

    # Rows appended since the last save. Class-level default so the counter
    # is defined on every instance from the start.
    _unsaved_rows = 0

    def open_spider(self, spider):
        """Start a fresh workbook for this crawl.

        Reads the destination path from ``spider.output``, removes a stale
        file at that path if there is one, and writes the header row into
        the active sheet of a new workbook.
        """
        self.file_name = spider.output
        # EAFP: remove the previous output without a separate existence
        # check, which would race with anything else touching the file.
        try:
            os.remove(self.file_name)
        except FileNotFoundError:
            pass
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(list(self._COLUMNS))
        self._unsaved_rows = 0

    def process_item(self, item, spider):
        """Append ``item`` to the sheet and return it unchanged.

        The item must provide every key in ``_COLUMNS``; a missing key
        raises ``KeyError`` before anything is appended. The workbook is
        checkpointed to disk once ``save_interval`` rows have accumulated.
        """
        row = [item[key] for key in self._COLUMNS]
        self.ws.append(row)
        self._unsaved_rows += 1
        if self._unsaved_rows >= self.save_interval:
            self._save()
        return item

    def close_spider(self, spider):
        """Write the final state of the workbook to disk and close it."""
        try:
            # Always save here: this flushes rows appended since the last
            # checkpoint and guarantees a header-only file for empty crawls.
            self._save()
        finally:
            self.wb.close()

    def _save(self):
        """Write the workbook to ``self.file_name`` and reset the row counter."""
        self.wb.save(self.file_name)
        self._unsaved_rows = 0