1.解析网页并分析出图片地址
import scrapy
from ..items import ImgsproItem
class CImgSpider(scrapy.Spider):
    """Spider that scrapes image URLs from the 4K animal section of pic.netbian.com.

    Yields one ImgsproItem per image, carrying the absolute URL in 'img_src'.
    """

    name = "c_img"
    # allowed_domains = ["www.xxx.com"]
    start_urls = ["https://pic.netbian.com/4kdongwu/"]

    def parse(self, response):
        """Parse the listing page and yield an item for each thumbnail found."""
        li_list = response.xpath('//*[@id="main"]/div[3]/ul/li')
        for li in li_list:
            src = li.xpath('./a/img/@src').extract_first()
            # Guard against <li> entries without an <a>/<img> (extract_first()
            # returns None there, which would crash the string concatenation).
            if not src:
                continue
            # src is site-relative (e.g. "/uploads/...jpg"); prepend scheme+host.
            item = ImgsproItem()
            item['img_src'] = 'https://pic.netbian.com' + src
            yield item
2.配置item文件
class ImgsproItem(scrapy.Item):
    """Item holding a single scraped image URL."""

    # Absolute URL of the image to be downloaded by the images pipeline.
    img_src = scrapy.Field()
3.重写图片的管道类
from scrapy.pipelines.images import ImagesPipeline
import scrapy
# Replaces the default generated CrawImgPipeline: this pipeline actually
# downloads each image referenced by item['img_src'].
class ImgsPipeline(ImagesPipeline):
    """Image pipeline that fetches every URL stored in item['img_src']."""

    def get_media_requests(self, item, info):
        """Issue one download request per scraped image URL."""
        yield scrapy.Request(url=item['img_src'])

    # Newer Scrapy versions also pass the item as a keyword argument;
    # accepting it keeps the override forward-compatible.
    def file_path(self, request, response=None, info=None, *, item=None):
        """Store each file under IMAGES_STORE, named by the URL's last segment."""
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        """Pass the item on to the next pipeline in ITEM_PIPELINES."""
        return item
4.配置settings文件,开启管道
# Register the item pipelines; the number is the run order (lower runs first).
ITEM_PIPELINES = {
    # "craw_img.pipelines.CrawImgPipeline": 300,
    "craw_img.pipelines.ImgsPipeline": 300,
}
# Root directory where the images pipeline stores downloaded files.
IMAGES_STORE = './imgs'
注意:需提前安装好pillow库