class ImgSpider(scrapy.Spider):
    """Spider that collects image URLs from the sc.chinaz.com picture listing."""

    name = 'img'
    start_urls = ['https://sc.chinaz.com/tupian/']

    def parse(self, response):
        """Yield one ``ImgproItem`` per image found on the listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            ImgproItem: Item whose ``src`` field holds the image URL.
        """
        for div in response.xpath('//*[@id="container"]/div'):
            # The page lazy-loads its images, so the address is read from the
            # ``src2`` attribute rather than ``src``.
            src = div.xpath('./div/a/img/@src2').extract_first()
            if not src:
                # Some container children carry no image; concatenating
                # ``None`` would raise TypeError, so skip them.
                continue

            item = ImgproItem()
            # urljoin resolves scheme-relative values ("//host/path") against
            # the page URL and leaves already-absolute URLs untouched.
            item['src'] = response.urljoin(src)
            yield item
# Item object describing one image
class ImgproItem(scrapy.Item):
    """Scraped record for a single image."""

    # Address of the image.
    src = scrapy.Field()
# Image pipeline with overridden hooks
class imgsPipeline(ImagesPipeline):
    """Image pipeline that downloads ``item['src']`` and names files after the URL."""

    def get_media_requests(self, item, info):
        """Issue the download request for the item's image URL.

        Args:
            item: Item carrying the image URL in its ``src`` field.
            info: Pipeline media info (unused here).

        Yields:
            scrapy.Request: Request for the image.
        """
        yield scrapy.Request(item['src'])

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the file name under which the downloaded image is stored.

        The name is the last segment of the URL *path*, so a query string or
        fragment never leaks into the file name. If the path has no final
        segment (e.g. the URL ends in ``/``), a SHA-1 hex digest of the URL is
        used instead so the name is never empty.

        Args:
            request: The image request; only ``request.url`` is read.
            response: Unused.
            info: Unused.
            item: Unused.

        Returns:
            str: File name for the image.
        """
        from hashlib import sha1
        from posixpath import basename
        from urllib.parse import urlsplit

        img_name = basename(urlsplit(request.url).path)
        if not img_name:
            img_name = sha1(request.url.encode('utf-8')).hexdigest()
        return img_name

    def item_completed(self, results, item, info):
        """Return the item unchanged once its media requests have finished.

        Args:
            results: Download results (ignored).
            item: The item being processed.
            info: Pipeline media info (unused here).

        Returns:
            The same ``item`` that was passed in.
        """
        return item
# --- Project settings ---

# Do not obey robots.txt.
ROBOTSTXT_OBEY = False

# Log level.
LOG_LEVEL = 'ERROR'

# User-Agent request header.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/91.0.4472.124 Safari/537.36'
)

# Image storage location.
IMAGES_STORE = './imgs'

# Use the project's own overridden pipeline class.
ITEM_PIPELINES = {'imgPro.pipelines.imgsPipeline': 300}
# python爬虫入门学习10-scrapy-站长之家图片爬取
# 最新推荐文章于 2022-07-19 12:00:00 发布