# zcspider/zcspider/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from scrapy.exceptions import IgnoreRequest
import psycopg2
class ZcspiderPipeline:
    """Persist scraped items into the PostgreSQL ``content`` table.

    Lifecycle: one database connection per spider run. On open, all rows
    previously stored for ``spider.domain`` are purged; each scraped item is
    then inserted as one row; on close, the cursor and connection are released.
    """

    def open_spider(self, spider):
        """Connect to the database and purge old rows for ``spider.domain``."""
        print('初始化数据库连接')
        # SECURITY: credentials are hard-coded in source. Move host/user/password
        # into Scrapy settings or environment variables before wider deployment.
        self.conn = psycopg2.connect(host="49.232.14.174", database="zcspider",
                                     user="postgres", password="zcDsj2021")
        self.cur = self.conn.cursor()
        self.cur.execute("delete from content where domain = %s", (spider.domain, ))
        # Commit the purge immediately. Previously it rode on the first
        # process_item commit, so a crawl that yielded no items (or whose
        # first insert rolled back) silently lost the DELETE.
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert one scraped item; roll back and re-raise on any failure.

        Uses a parameterized query, so item text is safe from SQL injection.
        Returns the item unchanged so later pipelines can process it.
        """
        try:
            self.cur.execute(
                "INSERT INTO content (domain, url, text) VALUES(%s, %s, %s)",
                (item['domain'], item['url'], item['text']))
            self.conn.commit()
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed; the DB error still propagates after rollback.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cur.close()
        self.conn.close()