# items.py
import scrapy


class JiandanItem(scrapy.Item):
    # ImagesPipeline reads download URLs from image_urls and, by default,
    # writes the download results to images
    image_urls = scrapy.Field()
    images = scrapy.Field()
# spider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request

from ..items import JiandanItem


class JiandanSpider(scrapy.Spider):
    name = 'jiandan'
    allowed_domains = []
    start_urls = ['http://jandan.net/ooxx']

    def parse(self, response):
        # Collect every image src on the current page into one item
        item = JiandanItem()
        item['image_urls'] = response.css('img::attr(src)').extract()
        yield item
        # Follow the "previous page" link to keep paginating
        new_url = response.css('a.previous-comment-page::attr(href)').extract_first()
        if new_url:
            yield Request(response.urljoin(new_url), callback=self.parse)
#pipelines.py
from scrapy.pipelines.images import ImagesPipeline
import scrapy
class JiandanPipeline(ImagesPipeline):
#发起request,下载图片
def get_media_requests(self, item, info):
for image_url in item['image_urls']:
yield scrapy.Request('http:'+image_url)
#全部下载完成后调用的方法
def item_completed(self, results, item, info):
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
return item
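For reference, the results argument that item_completed receives is a list of (success, info) tuples, which is exactly what the list comprehension above unpacks. A minimal sketch of that shape (the URL, path, and checksum values are made up for illustration):

from twisted.python.failure import Failure

# Illustrative shape of the results passed to item_completed
results = [
    (True, {'url': 'http://wx1.sinaimg.cn/mw600/example.jpg',   # hypothetical URL
            'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',
            'checksum': '2b00042f7481c7b056c4b410d28f33cf'}),
    (False, Failure(Exception('download error'))),  # failed downloads arrive as (False, Failure)
]
image_paths = [x['path'] for ok, x in results if ok]  # -> ['full/0a79...jpg']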
# settings.py
# Enable the images pipeline
ITEM_PIPELINES = {
    'jiandan.pipelines.JiandanPipeline': 1,
}
IMAGES_STORE = 'f:/jiandan'  # where downloaded images are stored
# Skip re-downloading images fetched within the last 30 days
IMAGES_EXPIRES = 30
# Thumbnail sizes to generate alongside the full-size images
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (270, 270),
}
# Filter out images smaller than this minimum height and width
IMAGES_MIN_HEIGHT = 110
IMAGES_MIN_WIDTH = 110
# Delay between requests, in seconds
DOWNLOAD_DELAY = 0.25
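With these settings, ImagesPipeline saves full-size images under IMAGES_STORE/full/ and each thumbnail set under IMAGES_STORE/thumbs/<name>/, naming files by the SHA1 hash of the image URL. A small sketch of the resulting paths (the image URL here is hypothetical):

import hashlib

url = 'http://wx1.sinaimg.cn/mw600/example.jpg'  # hypothetical image URL
sha1 = hashlib.sha1(url.encode('utf-8')).hexdigest()
print('f:/jiandan/full/%s.jpg' % sha1)          # full-size image
print('f:/jiandan/thumbs/small/%s.jpg' % sha1)  # 50x50 thumbnail
print('f:/jiandan/thumbs/big/%s.jpg' % sha1)    # 270x270 thumbnail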
The item defines an image_urls field to hold the scraped image links; the pipeline requests each of those URLs to download the images, and any item whose downloads all fail is dropped.
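To try the whole thing, assuming the project package is named jiandan as in the settings above, the spider can be started with scrapy crawl jiandan, or from a small script:

# run.py - start the spider with the project settings
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('jiandan')
process.start()  # blocks until the crawl finishes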