斗图啦网址
https://www.doutula.com/photo/list/
(1)分析网站 得到图片的地址
(2)进入得到的网址分析
(3)编写项目代码
(4)使用 Scrapy 爬取,源码如下:
items.py
import scrapy
class DoutubaItem(scrapy.Item):
    """Item carrying one scraped image entry.

    Fields:
        name: title text for the image.
        link: URL of the image file.
    """

    # Title text of the image.
    name = scrapy.Field()
    # URL the image is downloaded from.
    link = scrapy.Field()
settings.py
# Do not honour the target site's robots.txt.
ROBOTSTXT_OBEY = False

# Only the image pipeline is enabled; the template-generated
# 'doutuba.pipelines.DoutubaPipeline' entry is intentionally left out.
ITEM_PIPELINES = {
    "doutuba.pipelines.DoutuImgPipeline": 300,
}

# Directory that receives the downloaded images (a relative path).
IMAGES_STORE = "images"
pipelines.py
import hashlib
import re

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class DoutuImgPipeline(ImagesPipeline):
    """Download each item's image and store it under the item's name.

    The image URL is read from ``item["link"]`` and the stored file is
    named ``<item["name"]>.jpg`` inside the configured image store.
    """

    # Characters that are path separators, reserved on common filesystems,
    # or control whitespace; they must not end up in a file name.
    _UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]+')

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image.

        The item travels along in ``request.meta`` so that ``file_path``
        can still reach it when no ``item`` argument is supplied.
        """
        yield scrapy.Request(url=item["link"], meta={'item': item})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the store-relative path for the downloaded image.

        Args:
            request: the media request created by ``get_media_requests``.
            response: unused; accepted for interface compatibility.
            info: unused; accepted for interface compatibility.
            item: the item being processed, when the caller provides it;
                otherwise the item is taken from ``request.meta``.

        Returns:
            A file name of the form ``<sanitised name>.jpg``.
        """
        if item is None:
            item = request.meta['item']
        # The name comes from scraped page text, so strip anything that
        # could create sub-directories or an invalid file name.
        safe_name = self._UNSAFE_CHARS.sub('_', item['name']).strip(' ._')
        if not safe_name:
            # Nothing usable is left of the title: fall back to a stable
            # name derived from the image URL.
            safe_name = hashlib.sha1(request.url.encode('utf-8')).hexdigest()
        return safe_name + '.jpg'
spider.py
import scrapy
from doutuba.items import DoutubaItem
class DoutuSpider(scrapy.Spider):
    """Crawl the doutula.com photo listing and emit one item per image.

    ``parse`` walks the paginated listing and follows every detail link;
    ``getNew`` extracts the title and image URL from a detail page.
    """

    name = 'doutu'
    allowed_domains = ['doutula.com']
    # Number of the listing page most recently requested.
    offset = 1
    start_urls = ['https://www.doutula.com/photo/list/']

    def parse(self, response):
        """Follow each detail link on a listing page, then the next page.

        Yields:
            A request per detail page (handled by ``getNew``), followed by
            a request for the next listing page.
        """
        img_srcs = response.xpath('//*[@id="pic-detail"]/div/div[2]/div[2]/ul/li/div/div/a/@href').extract()
        if not img_srcs:
            # No entries on this page: stop here instead of requesting
            # ever-higher page numbers forever.
            return
        for link in img_srcs:
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            yield scrapy.Request(response.urljoin(link), callback=self.getNew)
        self.offset += 1
        url = "https://www.doutula.com/photo/list/?page=" + str(self.offset)
        yield scrapy.Request(url, callback=self.parse)

    def getNew(self, response):
        """Build a ``DoutubaItem`` from an image detail page.

        Yields:
            One item with ``name`` and ``link`` set, or nothing when
            either value cannot be found on the page.
        """
        name = response.xpath('//*[@id="detail"]/div/div[2]/li/div[1]/h1/a/text()').extract_first()
        # NOTE(review): this path contains a `tbody` step; browsers often
        # insert that element when it is absent from the raw HTML - confirm
        # the expression matches what the server actually returns.
        img_src = response.xpath('//*[@id="detail"]/div/div[2]/li/div[2]/div/div/div/div/table/tbody/tr[1]/td/img/@src').extract_first()
        if not name or not img_src:
            # Skip pages with an unexpected layout rather than raising
            # IndexError inside the callback.
            self.logger.warning("Skipping %s: title or image URL not found", response.url)
            return
        item = DoutubaItem()
        item["name"] = name
        # The downloader needs an absolute URL with a scheme.
        item["link"] = response.urljoin(img_src)
        yield item
(5)效果图
如有错误,多多指教