要使用 scrapy 自带的 ImagesPipeline 进行图片下载,需要在 settings.py 文件里启用该管道。
第一步
# settings.py
ITEM_PIPELINES = {
    # scrapy's built-in ImagesPipeline:
    'scrapy.pipelines.images.ImagesPipeline': 300,
    # custom image-download pipeline (enable this instead of the line above):
    #'syw.pipelines.MyImagePipeline': 300,
}
# Directory where downloaded images are stored
IMAGES_STORE = './images'
第二步
在 items.py 中定义图片下载所需的 item
# items.py
# 项目 item 定义
class MyItem(scrapy.Item):
    """Item carrying the two fields Scrapy's ImagesPipeline reads/writes.

    'image_urls' and 'images' are the pipeline's *default* field names; to
    use different names you must override get_media_requests (or the
    IMAGES_URLS_FIELD / IMAGES_RESULT_FIELD settings).
    """
    # List of image URLs to download (input to the pipeline).
    image_urls = scrapy.Field()
    # Filled in by the pipeline with one dict per downloaded image, e.g.:
    #   'images': [{'checksum': '501800c5e5bf8e544c5e7ec4fb8c31f4',  # md5sum of the file
    #               'path': 'full/7b3a79b62adbd3bcd023fb1790460243a4ae23f3.jpg',  # stored path/name
    #               'url': 'http://xxx/20200721942627185_220.jpg'}]  # original image URL
    # (The original snippet wrote this example as a bare triple-quoted string
    # statement, which is evaluated at class-definition time; comments are safer.)
    images = scrapy.Field()
第三步
你编写的爬虫文件spider
from ..items import MyItem
class MySpider(scrapy.Spider):
    """Example spider: collects image URLs from a page and hands them to
    ImagesPipeline via the item's 'image_urls' field."""
    name = 'spidername'
    start_urls = ['http://xxxx']

    def parse(self, response):
        """Extract image URLs and yield one item carrying them.

        :param response: the downloaded page to extract image URLs from.
        """
        item = MyItem()
        # ImagesPipeline expects 'image_urls' to be a *list of URL strings*.
        # Use .getall() to extract the matched text; appending the raw
        # SelectorList (as the original did) would hand selector objects,
        # not strings, to the pipeline.  XXX is a placeholder for your
        # actual XPath expression, e.g. '//img/@src'.
        image_urls = response.xpath(XXX).getall()
        # Hand all extracted image URLs to the pipeline via the item.
        item['image_urls'] = image_urls
        yield item
完成以上三步即可以实现图片下载了。
下面的方法可以实现图片重命名,主要是通过 yield scrapy.Request(image_url, meta={'image_name': image_name}) 给 request 提供参数,再在 file_path 中获取该参数;参数也可以是 item。
重写一个 pipeline 并继承 ImagesPipeline;如果不重写,图片名字将是哈希值(URL 的 SHA1)。
#pipeline.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline
class MyImagesPipeline(ImagesPipeline):
    """Custom image pipeline that stores each image under a caller-chosen
    name instead of the default SHA1-of-URL file name.

    Expects the item to carry an 'image_urls_dit' field shaped as
    {image_url: image_name}.  (Field name kept as-is — 'dit' typo and
    all — for backward compatibility with existing items/spiders.)
    """

    def get_media_requests(self, item, info):
        """Yield one download Request per image URL, passing the desired
        file name through request.meta so file_path() can read it back."""
        for image_url, image_name in item['image_urls_dit'].items():
            # meta={'image_name': ...} carries the name to file_path().
            yield scrapy.Request(image_url, meta={'image_name': image_name})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path 'full/<image_name>.jpg' for this request.

        ``item`` is accepted (keyword-only, default None) for compatibility
        with Scrapy >= 2.4, which passes the item to file_path(); older
        callers that omit it are unaffected.
        """
        image_name = request.meta['image_name']
        return 'full/%s.jpg' % image_name
附:scrapy.pipelines.images 源代码如下:
"""
Images Pipeline
See documentation in topics/media-pipeline.rst
"""
import functools
import hashlib
from contextlib import suppress
from io import BytesIO
from itemadapter import ItemAdapter
from PIL import Image
from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.pipelines.files import FileException, FilesPipeline
# TODO: from scrapy.pipelines.media import MediaPipeline
from scrapy.settings import Settings
from scrapy.utils.misc import md5sum
from scrapy.utils.python import to_bytes
class NoimagesDrop(DropItem):
    """DropItem subclass raised for a product/item that has no images."""
class ImageException(FileException):
    """General image error exception (e.g. image below the minimum size)."""
class ImagesPipeline(FilesPipeline):
    """Abstract pipeline that implement the image thumbnail generation logic
    """

    MEDIA_NAME = 'image'
    # Uppercase attributes kept for backward compatibility with code that subclasses
    # ImagesPipeline. They may be overridden by settings.
    MIN_WIDTH = 0          # minimum accepted image width (0 = no minimum)
    MIN_HEIGHT = 0         # minimum accepted image height (0 = no minimum)
    EXPIRES = 90           # default expiry (days) fallback for IMAGES_EXPIRES
    THUMBS = {}            # default thumbnail spec: {thumb_id: (width, height)}
    DEFAULT_IMAGES_URLS_FIELD = 'image_urls'    # item field read for URLs
    DEFAULT_IMAGES_RESULT_FIELD = 'images'      # item field written with results

    def __init__(self, store_uri, download_func=None, settings=None):
        # Store/download plumbing is inherited from FilesPipeline.
        super(ImagesPipeline, self).__init__(store_uri, settings=settings,
                                             download_func=download_func)

        # Accept a plain dict (or None) and promote it to a Settings object.
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        # resolve() maps a generic setting key to the per-subclass variant
        # via _key_for_pipe, falling back to the generic key.
        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name="ImagesPipeline",
                                    settings=settings)
        self.expires = settings.getint(
            resolve("IMAGES_EXPIRES"), self.EXPIRES
        )

        # Subclasses may have set IMAGES_RESULT_FIELD / IMAGES_URLS_FIELD as
        # class attributes; otherwise fall back to the defaults above.
        if not hasattr(self, "IMAGES_RESULT_FIELD"):
            self.IMAGES_RESULT_FIELD = self.DEFAULT_IMAGES_RESULT_FIELD
        if not hasattr(self, "IMAGES_URLS_FIELD"):
            self.IMAGES_URLS_FIELD = self.DEFAULT_IMAGES_URLS_FIELD

        # Settings override class attributes for field names and size limits.
        self.images_urls_field = settings.get(
            resolve('IMAGES_URLS_FIELD'),
            self.IMAGES_URLS_FIELD
        )
        self.images_result_field = settings.get(
            resolve('IMAGES_RESULT_FIELD'),
            self.IMAGES_RESULT_FIELD
        )
        self.min_width = settings.getint(
            resolve('IMAGES_MIN_WIDTH'), self.MIN_WIDTH
        )
        self.min_height = settings.getint(
            resolve('IMAGES_MIN_HEIGHT'), self.MIN_HEIGHT
        )
        self.thumbs = settings.get(
            resolve('IMAGES_THUMBS'), self.THUMBS
        )

    @classmethod
    def from_settings(cls, settings):
        # Push credentials/options from settings onto each storage backend
        # class (S3, GCS, FTP) before instantiating the pipeline itself.
        s3store = cls.STORE_SCHEMES['s3']
        s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
        s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
        s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
        s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
        s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
        s3store.AWS_VERIFY = settings['AWS_VERIFY']
        s3store.POLICY = settings['IMAGES_STORE_S3_ACL']

        gcs_store = cls.STORE_SCHEMES['gs']
        gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
        gcs_store.POLICY = settings['IMAGES_STORE_GCS_ACL'] or None

        ftp_store = cls.STORE_SCHEMES['ftp']
        ftp_store.FTP_USERNAME = settings['FTP_USER']
        ftp_store.FTP_PASSWORD = settings['FTP_PASSWORD']
        ftp_store.USE_ACTIVE_MODE = settings.getbool('FEED_STORAGE_FTP_ACTIVE')

        store_uri = settings['IMAGES_STORE']
        return cls(store_uri, settings=settings)

    def file_downloaded(self, response, request, info):
        # FilesPipeline hook: delegate to the image-specific handler.
        return self.image_downloaded(response, request, info)

    def image_downloaded(self, response, request, info):
        # Persist the full-size image and every thumbnail; return the md5
        # checksum of the *first* (full-size) buffer only.
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                # md5sum reads the buffer, so rewind it first.
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
        return checksum

    def get_images(self, response, request, info):
        # Yield (path, PIL image, JPEG buffer) for the converted full-size
        # image, then for each configured thumbnail.
        path = self.file_path(request, response=response, info=info)
        orig_image = Image.open(BytesIO(response.body))

        width, height = orig_image.size
        # Enforce the configured minimum dimensions before converting.
        if width < self.min_width or height < self.min_height:
            raise ImageException("Image too small (%dx%d < %dx%d)" %
                                 (width, height, self.min_width, self.min_height))

        image, buf = self.convert_image(orig_image)
        yield path, image, buf

        for thumb_id, size in self.thumbs.items():
            thumb_path = self.thumb_path(request, thumb_id, response=response, info=info)
            # Thumbnails are derived from the already-converted RGB image.
            thumb_image, thumb_buf = self.convert_image(image, size)
            yield thumb_path, thumb_image, thumb_buf

    def convert_image(self, image, size=None):
        # Normalize any input mode to RGB (flattening transparency onto a
        # white background), optionally downscale, and re-encode as JPEG.
        if image.format == 'PNG' and image.mode == 'RGBA':
            background = Image.new('RGBA', image.size, (255, 255, 255))
            # NOTE(review): second argument to paste() appears intended as
            # the alpha mask — confirm against PIL's Image.paste signature.
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode == 'P':
            # Palette images may carry transparency: go through RGBA first.
            image = image.convert("RGBA")
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode != 'RGB':
            image = image.convert('RGB')

        if size:
            # thumbnail() mutates in place, so copy to keep the caller's image.
            # NOTE(review): Image.ANTIALIAS was removed in Pillow 10
            # (use Image.LANCZOS there).
            image = image.copy()
            image.thumbnail(size, Image.ANTIALIAS)

        buf = BytesIO()
        image.save(buf, 'JPEG')
        return image, buf

    def get_media_requests(self, item, info):
        # One download Request per URL found in the configured item field.
        urls = ItemAdapter(item).get(self.images_urls_field, [])
        return [Request(u) for u in urls]

    def item_completed(self, results, item, info):
        # Write the successful download results back onto the item; a
        # KeyError (field not declared on the item) is deliberately ignored.
        with suppress(KeyError):
            ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok]
        return item

    def file_path(self, request, response=None, info=None):
        # Default file name: SHA1 of the request URL, stored under full/.
        image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return 'full/%s.jpg' % (image_guid)

    def thumb_path(self, request, thumb_id, response=None, info=None):
        # Thumbnails share the SHA1 name, grouped per thumb_id directory.
        thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)