Downloading images with ImagesPipeline
1. Create the project and write the spider
# -*- coding: utf-8 -*-
import scrapy


class ZolSpider(scrapy.Spider):
    name = 'zol'
    allowed_domains = ['zol.com.cn']
    start_urls = ['http://desk.zol.com.cn/bizhi/9220_112846_2.html']

    def parse(self, response):
        image_url = response.xpath('//div[@id="mouscroll"]/img/@src').extract()
        # string(.) returns the concatenated text of the node and all of its descendants
        image_name = response.xpath('string(//h3)').extract_first()
        # image_name is yielded together with image_urls; the pipeline will later
        # forward it via request.meta, since no Item class defines these fields
        yield {
            'image_urls': image_url,
            'image_name': image_name
        }
        next_url = response.xpath('//a[@id="pageNext"]/@href').extract_first()
        # guard against a missing "next" link on the last page
        if next_url and next_url.find('.html') != -1:
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
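The XPath selectors above can be sanity-checked in the Scrapy shell before running the full spider (assuming the page is still reachable); the expressions are exactly the ones used in parse:

scrapy shell http://desk.zol.com.cn/bizhi/9220_112846_2.html
>>> response.xpath('//div[@id="mouscroll"]/img/@src').extract()
>>> response.xpath('string(//h3)').extract_first()
>>> response.xpath('//a[@id="pageNext"]/@href').extract_first()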
2. Settings file (settings.py)
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    # Option 1: use the custom pipeline. Note that the generated TupianPipeline
    # must be replaced with ImagePipeline (a subclass of ImagesPipeline),
    # otherwise no images will be saved.
    'tupian.pipelines.ImagePipeline': 300,
    # Option 2: use the built-in images pipeline directly
    # 'scrapy.pipelines.images.ImagesPipeline': 5,
}
# Directory where downloaded images are stored
IMAGES_STORE = 'e:/img'
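The images pipeline needs Pillow installed to work. Besides IMAGES_STORE it also understands a few optional settings; a minimal sketch with illustrative values (none of them are required by this project):

# Optional image-pipeline settings (values here are only examples)
IMAGES_EXPIRES = 90        # skip re-downloading images fetched within the last 90 days
IMAGES_MIN_HEIGHT = 110    # drop images smaller than this
IMAGES_MIN_WIDTH = 110
IMAGES_THUMBS = {          # also generate thumbnails under IMAGES_STORE/thumbs/<name>/
    'small': (50, 50),
    'big': (270, 270),
}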
3. Override the pipeline methods (pipelines.py)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class TupianPipeline:
    def process_item(self, item, spider):
        return item


# Subclass ImagesPipeline and override its methods
class ImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            # forward image_name (set in the zol spider) to file_path via meta
            yield scrapy.Request(image_url, meta={'image_name': item['image_name']})

    # rename the downloaded image files
    def file_path(self, request, response=None, info=None):
        # set a breakpoint here to inspect what request.meta contains
        filename = request.meta['image_name'].strip().replace('\r\n\t\t', '') + '.jpg'
        # the page title contains something like "1/10"; '/' would create a
        # subdirectory, so replace it before using the string as a file name
        filename = filename.replace('/', '_')
        return filename
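In more recent Scrapy releases, file_path also receives an item argument, so adding that parameter (or *args/**kwargs) keeps the override forward-compatible. If you also want to react to failed downloads, ImagesPipeline exposes item_completed; a minimal sketch that drops items with no saved images (the image_paths field is an assumption added here, not part of the original spider):

from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class ImagePipeline(ImagesPipeline):
    # ... get_media_requests / file_path as above ...

    def item_completed(self, results, item, info):
        # results is a list of (success, info_or_failure) tuples, one per requested image
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('no images were downloaded for %s' % item.get('image_name'))
        item['image_paths'] = image_paths
        return item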
4. Run script
from scrapy.cmdline import execute
execute('scrapy crawl zol'.split())
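Running this file from the project root (the directory containing scrapy.cfg) is equivalent to typing scrapy crawl zol on the command line; it simply makes the spider easy to launch and debug from an IDE.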