# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from datetime import datetime

import pymongo
import redis
import scrapy
from scrapy.exceptions import DropItem
from scrapy.exporters import CsvItemExporter
from scrapy.pipelines.images import ImagesPipeline


class ToutiaoPipeline(object):
    """Stamp every item with the spider that produced it and the time it was seen."""

    def process_item(self, item, spider):
        """Add ``source`` and ``utc_time`` to *item* and hand it on.

        ``source`` is the name of the spider; ``utc_time`` is the current
        UTC time rendered with ``str()``. The same item object is returned.
        """
        item['source'] = spider.name
        stamp = datetime.utcnow()
        item['utc_time'] = str(stamp)
        return item
class ToutiaoJsonPipeline(object):
    """Write every scraped item to ``data.json`` as a single JSON array.

    The array is opened in ``open_spider``, one element is appended per
    item, and the array is closed in ``close_spider``, so the finished file
    can be loaded with ``json.load``.
    """

    def open_spider(self, spider):
        """Create (or truncate) ``data.json`` and start the JSON array."""
        # The attribute holds the open file object; the name is kept as-is
        # for compatibility with any code that already reads it.
        self.filename = open("data.json", "w")
        # Tracks whether a separator is needed before the next element.
        self._first_item = True
        self.filename.write("[\n")

    def process_item(self, item, spider):
        """Append *item* to the array and return it unchanged."""
        if self._first_item:
            self._first_item = False
        else:
            # Separators go *before* every element but the first, so the
            # array never ends with a trailing comma.
            self.filename.write(",\n")
        self.filename.write(json.dumps(dict(item)))
        return item

    def close_spider(self, spider):
        """Terminate the JSON array and close the file."""
        try:
            self.filename.write("\n]\n")
        finally:
            # Release the file handle even if the final write fails.
            self.filename.close()
class ToutiaoCsvPipeline(object):
    """Export every scraped item to ``data.csv`` through ``CsvItemExporter``."""

    def open_spider(self, spider):
        """Open ``data.csv`` and start the export."""
        # The file is opened in binary mode and handed to the exporter,
        # which writes the CSV data into it.
        self.filename = output = open("data.csv", "wb")
        self.csv_exporter = exporter = CsvItemExporter(output)
        # Signal the beginning of the export.
        exporter.start_exporting()

    def process_item(self, item, spider):
        """Export *item* and return it unchanged."""
        exporter = self.csv_exporter
        exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export, then close the output file."""
        exporter, output = self.csv_exporter, self.filename
        # Signal the end of the export before the file goes away.
        exporter.finish_exporting()
        output.close()
class ToutiaoMongoPipeline(object):
    """Store every scraped item in the ``toutiao.content_data`` MongoDB collection."""

    def open_spider(self, spider):
        """Connect to MongoDB and select the target database and collection."""
        # NOTE(review): the host string looks like a masked placeholder —
        # confirm the real address before deploying.
        self.client = pymongo.MongoClient(host="192.168.xx.xx", port=27017)
        self.db = self.client['toutiao']
        self.collection = self.db['content_data']

    def process_item(self, item, spider):
        """Insert a plain-dict copy of *item* and return the item unchanged."""
        # ``Collection.insert`` is deprecated in PyMongo 3 and removed in
        # PyMongo 4; ``insert_one`` is the replacement for a single document.
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client opened in ``open_spider``."""
        self.client.close()
class ToutiaoRedisPipeline(object):
    """Push every scraped item, serialized as JSON, onto a Redis list."""

    def open_spider(self, spider):
        """Create the Redis client for 127.0.0.1:6379."""
        connection = redis.Redis(host="127.0.0.1", port=6379)
        self.client = connection

    def process_item(self, item, spider):
        """LPUSH the JSON form of *item* onto ``TOUTIAO_ITEM`` and return the item."""
        record = dict(item)
        self.client.lpush("TOUTIAO_ITEM", json.dumps(record))
        return item
# Download images
class MyImagePipelines(ImagesPipeline):
    """Image pipeline that stores each item's images under ``full/<mote_id>/``."""

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_urls']``.

        The item is attached to each request's ``meta`` so that
        ``file_path`` can read it back when building the storage path.
        """
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url, meta={'item': item})

    def item_completed(self, results, item, info):
        """Return *item* if at least one image was downloaded.

        *results* is iterated as ``(ok, value)`` pairs; the ``'path'`` of
        every successful pair is collected.

        Raises:
            DropItem: when no download succeeded.
        """
        image_paths = [result['path'] for ok, result in results if ok]
        if not image_paths:
            # The original raised an undefined, unrelated name here;
            # DropItem is the exception Scrapy provides for discarding items.
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None):
        """Build the relative storage path for the image behind *request*.

        The path is ``full/<item['mote_id']>/<last segment of the URL>``,
        using the item carried in ``request.meta['item']``.
        """
        item = request.meta['item']
        image_guid = request.url.split('/')[-1]
        filename = u'full/{0[mote_id]}/{1}'.format(item, image_guid)
        return filename
# Directory where downloaded images are stored.
# NOTE(review): these look like Scrapy settings; confirm whether they are
# actually picked up from this module or belong in the project's settings file.
IMAGES_STORE = 'D:\\doubanimgs'
# Expiry in days: images fetched within the last 90 days are not re-downloaded.
IMAGES_EXPIRES = 90