用 scrapy ImagesPipeline 下截图片 (保持图片原名或重命名)

要使用 scrapy自带的ImagesPipeline进行图片下载,需要在setting.py文件里开启通道

第一步

#setting.py
ITEM_PIPELINES = {
   #scrapy自带的imagespipeline:
   'scrapy.pipelines.images.ImagesPipeline':300,
   
   #自定义的图片下载pipeline
   #'syw.pipelines.MyImagePipeline': 300,
}

#设置图片存放路径
IMAGES_STORE ='./images'

第二步

设置图片下载item.py

#item.py
#项目item
class MyItem(scrapy.Item):
    # define the fields for your item here like:
    #image_urls,和images是scrapy默认值,不能更改,如果要更改,则需要重写get_media_requests

	image_urls =scrapy.Field()
    images =scrapy.Field() #images 是scrapy默认的参数,用来储存图片信息的
    #如
    ''''images':[ {'checksum': '501800c5e5bf8e544c5e7ec4fb8c31f4',  #md5sum 校验码 
        'path': 'full/7b3a79b62adbd3bcd023fb1790460243a4ae23f3.jpg',#保存的图片路径和名字
        'url': 'http://xxx/20200721942627185_220.jpg'}]'''#原图片url

第三步
你编写的爬虫文件spider

from ..items import MyItem
class MySpider(scrapy.Spider):
    name = 'spidername'
    start_urls = ['http://xxxx']
	
    def parse(self, response):
    #编写xpath提取image_urls
    item= MyItem()
    #先建一个列表,储存提取的的url
    image_urls = []
    提取的url添加到image_urls列表中
	image_urls.append(response.xpath(XXX)#将所有提取到的图片url赋值给item
	item['image_urls']=image_urls
	yield item

完成以上三步即可以实现图片下载了。

下面的方法,可以实现图片重命名,主要是能过yield scrapy.Request(image_url,meta={‘image_name’:image_name}) 给request,提供参数,file_path获取参数,参数可以是item

重写一个pipeline并且继承ImagesPipeline,如果不重写图片名字将是希哈值

#pipeline.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class MyImagesPipeline(ImagesPipeline):
	#生成图片url requests
    def get_media_requests(self, item, info):
    
    	#image_urls_dit是dic={image_url:image_name}
    	
        for image_url,image_name in item['image_urls_dit'].items():
        	# image_name':name 将图片名字传给 Request,这样就可以在 file_path中获取图片名字
            yield scrapy.Request(image_url,meta={'image_name':image_name})

    def file_path(self, request, response=None, info=None):

    image_name = request.meta['image_name']

    return 'full/%s.jpg' % (image_name)

附:scrapy.pipelines.images 源代码如下:

"""
Images Pipeline

See documentation in topics/media-pipeline.rst
"""
import functools
import hashlib
from contextlib import suppress
from io import BytesIO

from itemadapter import ItemAdapter
from PIL import Image

from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.pipelines.files import FileException, FilesPipeline
# TODO: from scrapy.pipelines.media import MediaPipeline
from scrapy.settings import Settings
from scrapy.utils.misc import md5sum
from scrapy.utils.python import to_bytes


class NoimagesDrop(DropItem):
    """Product with no images exception"""


class ImageException(FileException):
    """General image error exception"""


class ImagesPipeline(FilesPipeline):
    """Abstract pipeline that implement the image thumbnail generation logic

    """

    MEDIA_NAME = 'image'

    # Uppercase attributes kept for backward compatibility with code that subclasses
    # ImagesPipeline. They may be overridden by settings.
    MIN_WIDTH = 0
    MIN_HEIGHT = 0
    EXPIRES = 90
    THUMBS = {}
    DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
    DEFAULT_IMAGES_RESULT_FIELD = 'images'

    def __init__(self, store_uri, download_func=None, settings=None):
        super(ImagesPipeline, self).__init__(store_uri, settings=settings,
                                             download_func=download_func)

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name="ImagesPipeline",
                                    settings=settings)
        self.expires = settings.getint(
            resolve("IMAGES_EXPIRES"), self.EXPIRES
        )

        if not hasattr(self, "IMAGES_RESULT_FIELD"):
            self.IMAGES_RESULT_FIELD = self.DEFAULT_IMAGES_RESULT_FIELD
        if not hasattr(self, "IMAGES_URLS_FIELD"):
            self.IMAGES_URLS_FIELD = self.DEFAULT_IMAGES_URLS_FIELD

        self.images_urls_field = settings.get(
            resolve('IMAGES_URLS_FIELD'),
            self.IMAGES_URLS_FIELD
        )
        self.images_result_field = settings.get(
            resolve('IMAGES_RESULT_FIELD'),
            self.IMAGES_RESULT_FIELD
        )
        self.min_width = settings.getint(
            resolve('IMAGES_MIN_WIDTH'), self.MIN_WIDTH
        )
        self.min_height = settings.getint(
            resolve('IMAGES_MIN_HEIGHT'), self.MIN_HEIGHT
        )
        self.thumbs = settings.get(
            resolve('IMAGES_THUMBS'), self.THUMBS
        )

    @classmethod
    def from_settings(cls, settings):
        s3store = cls.STORE_SCHEMES['s3']
        s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
        s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
        s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
        s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
        s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
        s3store.AWS_VERIFY = settings['AWS_VERIFY']
        s3store.POLICY = settings['IMAGES_STORE_S3_ACL']

        gcs_store = cls.STORE_SCHEMES['gs']
        gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
        gcs_store.POLICY = settings['IMAGES_STORE_GCS_ACL'] or None

        ftp_store = cls.STORE_SCHEMES['ftp']
        ftp_store.FTP_USERNAME = settings['FTP_USER']
        ftp_store.FTP_PASSWORD = settings['FTP_PASSWORD']
        ftp_store.USE_ACTIVE_MODE = settings.getbool('FEED_STORAGE_FTP_ACTIVE')

        store_uri = settings['IMAGES_STORE']
        return cls(store_uri, settings=settings)

    def file_downloaded(self, response, request, info):
        return self.image_downloaded(response, request, info)

    def image_downloaded(self, response, request, info):
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
        return checksum
    def get_images(self, response, request, info):
        path = self.file_path(request, response=response, info=info)
        orig_image = Image.open(BytesIO(response.body))

        width, height = orig_image.size
        if width < self.min_width or height < self.min_height:
            raise ImageException("Image too small (%dx%d < %dx%d)" %
                                 (width, height, self.min_width, self.min_height))
        image, buf = self.convert_image(orig_image)
        yield path, image, buf

        for thumb_id, size in self.thumbs.items():
            thumb_path = self.thumb_path(request, thumb_id, response=response, info=info)
            thumb_image, thumb_buf = self.convert_image(image, size)
            yield thumb_path, thumb_image, thumb_buf
    def convert_image(self, image, size=None):
        if image.format == 'PNG' and image.mode == 'RGBA':
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode == 'P':
            image = image.convert("RGBA")
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode != 'RGB':
            image = image.convert('RGB')
        if size:
            image = image.copy()
            image.thumbnail(size, Image.ANTIALIAS)
        buf = BytesIO()
        image.save(buf, 'JPEG')
        return image, buf
    def get_media_requests(self, item, info):
        urls = ItemAdapter(item).get(self.images_urls_field, [])
        return [Request(u) for u in urls]

    def item_completed(self, results, item, info):
        with suppress(KeyError):
            ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok]
        return item

    def file_path(self, request, response=None, info=None):
        image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return 'full/%s.jpg' % (image_guid)

    def thumb_path(self, request, thumb_id, response=None, info=None):
        thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)
  • 2
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值