from urllib.parse import quote

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class MyImagesPipeline(ImagesPipeline):
    """Custom image downloader that stores each image under a path
    derived from its URL (instead of the default MD5-hashed name).
    """

    def get_media_requests(self, item, info):
        """Yield one download Request per image URL.

        item['front_image_url'] is a project-defined item field holding
        a list of image URLs; each one is turned into a Request.
        """
        for image_url in item['front_image_url']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Record where the downloaded images were stored.

        `results` is a list of (ok, result_dict) tuples; for successful
        downloads result_dict['path'] is the storage path. The paths are
        saved into the project-defined field item['front_image_path'].

        Raises:
            DropItem: if no image was downloaded successfully.
        """
        front_image_path = [x['path'] for ok, x in results if ok]
        if not front_image_path:
            raise DropItem("Item contains no images")
        item['front_image_path'] = front_image_path
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        """Build the storage path from the image URL.

        The raw URL contains characters that are illegal or ambiguous in
        file names (':', '/', '?', ...), so it is percent-encoded into a
        single flat, reversible file name under 'full/'.

        `item` is accepted (and ignored) for compatibility with newer
        Scrapy versions, which pass it as a keyword argument.
        """
        # safe='' escapes every reserved character, so the whole URL
        # collapses into exactly one path component.
        image_guid = quote(request.url, safe='')
        return 'full/%s' % image_guid
以上代码放到scrapy项目中的pipelines文件中
重写ImagesPipeline后还要在setting中进行配置
import os

ITEM_PIPELINES = {
    # Register the custom pipeline defined in pipelines.py.
    'articleSpider.pipelines.MyImagesPipeline': 300,
}

# Item field that holds the list of image URLs: item["front_image_url"].
# NOTE: the original wrote IMAGES_URL_FILED (typo); the setting Scrapy
# actually reads is IMAGES_URLS_FIELD, so the misspelled name was inert.
IMAGES_URLS_FIELD = "front_image_url"

# Project root directory (directory containing this settings file).
project_dir = os.path.dirname(__file__)

# Directory where downloaded images are stored.
IMAGES_STORE = os.path.join(project_dir, 'images')