本篇博客主要讲解 Scrapy 框架中各个 Item Pipeline 处理 item 对象的先后顺序问题。以下为 pipelines.py 的代码部分,具体解释见下图。
import os
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from xiaohua import settings
class DBPipeline(object):
    """Demo pipeline that traces the item it receives and overwrites ``item['images']``."""

    def process_item(self, item, spider):
        """Print the incoming item, replace its ``images`` field and pass it on.

        Args:
            item: Mapping-like item; its ``'images'`` key is overwritten.
            spider: Spider that produced the item (unused here).

        Returns:
            The same ``item`` object, with ``item['images']`` set to a
            marker string.
        """
        # Trace output: shows what the item looks like when this pipeline runs.
        print('hhhhhhhhhhhh', item)
        # Marker value; any pipeline that runs later sees this instead of
        # the value that was there before.
        item['images'] = 'qqqqqqqqqqqqqqqqq'
        return item
class ImagePipeline(ImagesPipeline):
    """Image-download pipeline that files each image under a folder named after the item's title."""

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['images']``.

        The item's title is attached as ``request.meta['name']`` so that
        ``file_path`` can use it as the target directory name.

        Args:
            item: Item providing ``'images'`` (iterable of URLs) and ``'title'``.
            info: Media pipeline bookkeeping object (unused here).
        """
        # Trace output: shows what the item looks like when this pipeline runs.
        print('uuuuuuuuuuuuuuuuuuuuuu', item)
        for image_url in item['images']:
            yield scrapy.Request(url=image_url, meta={'name': item['title']})

    def item_completed(self, results, item, info):
        """Overwrite ``item['images']`` with a marker string and return the item.

        ``results`` and ``info`` are not inspected.
        """
        # Marker value; any pipeline that runs later sees this instead of
        # the original list of URLs.
        item['images'] = 'llllllllllllllllllll'
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path ``<title>/<last URL segment>`` for an image.

        Also makes sure the ``<IMAGES_STORE>/<title>`` directory exists.

        Args:
            request: Download request; ``request.meta['name']`` holds the title
                set in ``get_media_requests``.
            response: Unused.
            info: Unused.
            item: Unused. Accepted so the override stays compatible with
                Scrapy releases that pass the source item as a keyword.

        Returns:
            Path of the image relative to ``IMAGES_STORE``.
        """
        folder = request.meta['name']
        dirpath = os.path.join(settings.IMAGES_STORE, folder)
        # makedirs(exist_ok=True) also creates missing parent directories and
        # does not fail when the directory is already there, unlike the
        # exists()-then-mkdir() sequence.
        os.makedirs(dirpath, exist_ok=True)
        # Path relative to IMAGES_STORE.
        return folder + "/" + request.url.split('/')[-1]
class moviesPipeline(object):
    """Demo pipeline that traces the item it receives and overwrites ``item['images']``.

    The lower-case class name is kept because the project settings refer to
    this class by its dotted path.
    """

    def process_item(self, item, spider):
        """Print the incoming item, replace its ``images`` field and pass it on.

        Args:
            item: Mapping-like item; its ``'images'`` key is overwritten.
            spider: Spider that produced the item (unused here).

        Returns:
            The same ``item`` object, with ``item['images']`` set to a
            marker string.
        """
        # Trace output: shows what the item looks like when this pipeline runs.
        print('papapapapapapapapapaapapapapapapapapap', item)
        # Marker value identifying this pipeline as the last writer.
        item['images'] = 'dddddddddddddddd'
        return item
settings.py 模块中的相关设置:
# Enabled item pipelines and their order values. Scrapy passes each item
# through them from the lowest value to the highest, so the entries are
# listed here in the order they run.
ITEM_PIPELINES = {
    # Image download pipeline.
    'xiaohua.pipelines.ImagePipeline': 300,
    # Redis storage pipeline.
    'scrapy_redis.pipelines.RedisPipeline': 400,
    'xiaohua.pipelines.DBPipeline': 450,
    'xiaohua.pipelines.moviesPipeline': 500,
}
下面详细说明各管道因在 settings 文件中配置的权重不同而处理 item 对象的先后顺序(权重数值越小的管道越先被调用)。