该分类下总共14页,每页45个内容。如何提取下一页链接、每个小分类的链接、小分类的标题以及图片详情链接?这里使用了CrawlSpider的链接筛选规则(rules)。
得到图片链接后,如何按照小分类创建文件夹并存储图片?
这里我们实现了自定义文件名和文件夹,但是文件夹名称和小分类标题无关
这里借助ImagesPipeline中的def get_media_requests(self, item, info)实现在请求图片链接下载前,将title信息塞入meta中
在file_path中提取meta中title信息,就实现了按照小分类标题,自定义文件夹,提取url中信息作为文件名
代码如下
items
import scrapy
class YouguoItem(scrapy.Item):
    """Item carrying one gallery page's title and its image URLs."""

    # Gallery (sub-category) title; the image pipeline uses it as the
    # folder name for the downloaded files.
    title = scrapy.Field()

    # List of image URLs found on the page; consumed by the image pipeline.
    image_urls = scrapy.Field()
YouGuozz
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from urllib import request
from YouGuo.items import YouguoItem
class YouguozzSpider(CrawlSpider):
    """Crawl the category-56 listing and emit one item per gallery page.

    The rules walk the paginated listing and hand every gallery detail page
    to :meth:`parse_item`, which yields a ``YouguoItem`` (title + image URLs)
    and then follows the gallery's own "next page" link.
    """

    name = 'YouGuozz'
    allowed_domains = ['meinvtu123.net']
    start_urls = ['https://www.meinvtu123.net/a/56/list_56_1.html']

    rules = (
        # Listing pagination (list_56_<n>.html). No callback is given, so
        # these pages are only used to discover further links.
        Rule(LinkExtractor(allow=r'.+a/56/list_56_.+\.html')),
        # Gallery detail pages (<digits>.html) are parsed by parse_item.
        Rule(LinkExtractor(allow=r'.+a/56/\d+\.html'), follow=True, callback='parse_item'),
    )

    def parse_item(self, response):
        """Yield the gallery item for ``response`` and a request for its next page.

        Args:
            response: The downloaded gallery detail page.

        Yields:
            A ``YouguoItem`` with ``title`` and ``image_urls``, followed by a
            ``scrapy.Request`` for the next page of the same gallery when the
            pagination block contains a link.
        """
        # The last text node is used so that empty <h3> elements are skipped.
        titles = response.xpath('//div[@class="Title111"][2]/h3/text()').extract()
        # Always hand the pipeline a string: an empty list here would later
        # break the folder-name construction.
        title = titles[-1] if titles else 'untitled'
        self.logger.debug('title: %s', title)

        # Resolve against the page URL so relative or protocol-relative
        # ``src`` values become downloadable absolute URLs.
        pic_urls = [
            response.urljoin(src)
            for src in response.xpath('//div[@class="contenta"]/img/@src').extract()
        ]
        self.logger.debug('image urls: %s', pic_urls)

        yield YouguoItem(title=title, image_urls=pic_urls)

        # Follow the gallery's own pagination. extract_first() returns None
        # instead of raising IndexError when the page has no pagination links.
        next_href = response.xpath('//div[@class="page"]/ul/a[last()]/@href').extract_first()
        if next_href:
            next_url = response.urljoin(next_href)
            self.logger.debug('next page: %s', next_url)
            yield scrapy.Request(next_url, callback=self.parse_item)
重点在pipelines
from scrapy.pipelines.images import ImagesPipeline
from YouGuo.settings import IMAGES_STORE
from scrapy.http import Request
import os
class ImagesnamePipeline(ImagesPipeline):
    """Image pipeline that stores each image in a folder named after its title.

    ``get_media_requests`` attaches the item's title to every image request
    (``meta['mid_item']``) so that ``file_path`` can build the storage path
    ``<title>/<file name>`` relative to the configured image store.
    """

    # Characters that are not allowed in file or folder names on Windows.
    _UNSAFE_CHARS = '\\/:*?"<>|'
    # Folder used when an item has no usable title.
    _DEFAULT_FOLDER = 'untitled'

    def get_media_requests(self, item, info):
        """Yield one download request per image URL, tagged with the item title.

        Args:
            item: The scraped item; ``title`` and ``image_urls`` are read.
            info: Pipeline bookkeeping object supplied by Scrapy (unused).

        Yields:
            ``Request`` objects whose ``meta['mid_item']`` holds the title.
        """
        title = item.get('title')
        for image_url in item.get('image_urls') or []:
            # The title travels with the request so file_path can read it.
            yield Request(image_url, meta={'mid_item': title})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for ``request`` relative to the image store.

        The path is ``<sanitised title>/<file name>``. The store itself
        prefixes its base directory and creates missing folders, so no
        directory is created here and no absolute path is returned.

        Args:
            request: The image request; ``meta['mid_item']`` and ``url`` are read.
            response: Unused; accepted for interface compatibility.
            info: Unused; accepted for interface compatibility.
            item: Unused; accepted because newer Scrapy versions pass it.

        Returns:
            A ``/``-separated relative path string.
        """
        folder = self._folder_name(request.meta.get('mid_item'))
        name = self._image_name(request.url)
        return '/'.join((folder, name))

    @classmethod
    def _folder_name(cls, title):
        """Turn a title into a safe single path component."""
        text = title if isinstance(title, str) else ''
        cleaned = ''.join(
            '_' if (ch in cls._UNSAFE_CHARS or ord(ch) < 32) else ch
            for ch in text
        )
        # Leading/trailing dots and spaces are dropped: they are invalid at
        # the end of Windows names and '..' would escape the store directory.
        cleaned = cleaned.strip().strip('. ')
        return cleaned or cls._DEFAULT_FOLDER

    @staticmethod
    def _image_name(url):
        """Derive the file name from an image URL.

        For ``.../190321/21133024-1-3B4.jpg`` this is ``3B4.jpg``: the part of
        the last path segment after the final ``-``.
        """
        # Drop query string and fragment, then keep only the last segment so
        # a URL without '-' can never yield a name containing '/' or ':'.
        path = url.split('?', 1)[0].split('#', 1)[0]
        basename = path.rstrip('/').rsplit('/', 1)[-1]
        return basename.split('-')[-1] or basename
settings
机器人协议、等待时间、headers就不多说了
# Root directory under which downloaded images are stored.
IMAGES_STORE = r'E:\YouGuo'

# Enable the custom image pipeline; the number is its order value.
ITEM_PIPELINES = {'YouGuo.pipelines.ImagesnamePipeline': 1}