喜欢用scrapy来爬美女图片的朋友肯定有这样的一个困扰,爬下来的图片文件名都是各种乱码 *****.jpg,看起来不舒服,而且不利于套图的归类。我就是在爬完了几十万的图片后,实在是无法忍受了,故想办法解决这个问题。
首先定位到 Scrapy 源码中的 pipelines/images.py,其中默认的 ImagesPipeline 实现如下:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
import scrapy
class MyImagesPipeline(ImagesPipeline):
    """Custom image downloader; this is the stock Scrapy behaviour, which
    saves every image under a SHA1 hash of its URL (hence the garbled names)."""

    def get_media_requests(self, item, info):
        """Issue one download request per image URL.

        ``item['front_image_url']`` is our project-defined field; it holds a
        list of image URLs, so we iterate the list and yield a Request for
        each one.
        """
        for image_url in item['front_image_url']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Collect the storage paths of successfully downloaded images.

        ``results`` is a list of ``(ok, info_dict)`` tuples: ``ok`` is the
        download status, and ``info_dict['path']`` is where the file was
        stored.  The paths are saved back into our custom field
        ``item['front_image_path']``; items with no successful download
        are dropped.
        """
        front_image_path = [x['path'] for ok, x in results if ok]
        if not front_image_path:
            raise DropItem("Item contains no images")
        item['front_image_path'] = front_image_path
        return item

    def file_path(self, request, response=None, info=None):
        """Return the relative storage path for a downloaded image.

        The default scheme is ``full/<sha1-of-url>.jpg`` — this is the hash
        file name the blog post sets out to replace.
        """
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        # Hash of the URL becomes the file name — change to request.url
        # after the deprecation block above is removed.
        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
        return 'full/%s.jpg' % (image_guid)
file_path 方法就是负责生成文件名的,只需要把其中基于 URL 哈希的命名改成直接取 request.url 里的文件名就可以了。
下面码出我的Pipelines.py
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.http import request
from scrapy.exceptions import DropItem
import scrapy
from scrapy.utils.project import get_project_settings #导入setting.py
class MyImagesPipeline(ImagesPipeline):
    """Image pipeline that keeps each image's original file name (the last
    segment of its URL) instead of Scrapy's default SHA1-hash name."""

    # Read the storage root from settings.py so the path lives in one place.
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def file_path(self, request, response=None, info=None):
        """Return the relative storage path for a downloaded image.

        Instead of the default hash of the URL, take the part of the URL
        after the last '/', i.e. the original "**.jpg" file name.
        """
        image_guid = request.url.split('/')[-1]
        return 'full/%s' % (image_guid)

    def get_media_requests(self, item, spider):
        """Re-issue every collected image URL as a download request."""
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Keep the storage paths of successful downloads on the item.

        ``results`` is a list of ``(ok, info_dict)`` tuples; items with no
        successful download are dropped.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Item contains no image')
        # Fix: the original computed image_paths and then discarded them;
        # the standard ImagesPipeline override (per the Scrapy docs) stores
        # them on the item so later pipeline stages can use them.
        item['image_paths'] = image_paths
        return item
接着运行scrapy
scrapy crawl javbusSpider
上图就是你们想要的效果了吧