# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from scrapy.exceptions import IgnoreRequest

import psycopg2


class ZcspiderPipeline:
    """Scrapy item pipeline that persists scraped pages to PostgreSQL.

    On spider open it connects to the database and clears previously stored
    rows for the spider's domain; each scraped item is then inserted into the
    ``content`` table (one commit per item), and the cursor/connection are
    released when the spider finishes.
    """

    def open_spider(self, spider):
        """Open the DB connection and wipe old rows for ``spider.domain``.

        Called once by Scrapy when the spider starts.
        """
        print('初始化数据库连接')
        # SECURITY(review): credentials are hard-coded in source; move them
        # to Scrapy settings or environment variables.
        self.conn = psycopg2.connect(
            host="49.232.14.174",
            database="zcspider",
            user="postgres",
            password="zcDsj2021",
        )
        self.cur = self.conn.cursor()
        self.cur.execute("delete from content where domain = %s", (spider.domain, ))
        # Commit immediately: psycopg2 runs inside an implicit transaction, so
        # without this the DELETE would be silently rolled back if the spider
        # yields no items (close_spider closes without committing).
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert one scraped item into the ``content`` table.

        Expects ``item`` to carry ``domain``, ``url`` and ``text`` keys.
        Commits on success; on any failure rolls the transaction back so the
        connection stays usable, then re-raises so Scrapy records the error.
        """
        try:
            self.cur.execute(
                "INSERT INTO content (domain, url, text) VALUES(%s, %s, %s)",
                (item['domain'], item['url'], item['text']))
            self.conn.commit()
        except Exception:
            # Rollback keeps the connection out of the "aborted transaction"
            # state; re-raise instead of swallowing the error.
            self.conn.rollback()
            raise
        return item

    # End of crawl: release database resources.
    def close_spider(self, spider):
        """Close the cursor and the connection when the spider ends."""
        # Close the cursor first, then the connection.
        self.cur.close()
        self.conn.close()