Saving images scraped by a Scrapy spider into different folders
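The spider below crawls the gallery list pages on mmjpg.com, follows each gallery's detail pages, and collects the per-page image URLs into one item. A custom ImagesPipeline then overrides file_path so that every image is written to a sub-folder named after its gallery title. With the settings shown further down, the download directory ends up looking roughly like this (folder and file names depend on what is scraped):

C:\study\meizitu\meizitu\
    imge\
        <gallery title 1>\
            <image 1>.jpg
            <image 2>.jpg
        <gallery title 2>\
            ...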

mzt.py

# -*- coding: utf-8 -*-
import urllib.parse

import scrapy


class MztSpider(scrapy.Spider):
    name = 'mzt'
    allowed_domains = ['www.mmjpg.com']
    start_urls = ['http://www.mmjpg.com/']

    # Parse the list page: each link on it is one gallery (detail URL + title)
    def parse(self, response):
        url_list = response.xpath('/html/body/div[2]/div[1]/ul//li/span[1]/a')
        for url in url_list:
            # print(url)
            item = {}
            item['detail_url'] = url.xpath('./@href').extract_first()
            item['title'] = url.xpath('./text()').extract_first()
            print(item['detail_url'], item['title'])
            yield scrapy.Request(
                item['detail_url'],
                callback=self.parse_detail,
                meta={"item": item}
            )

        # Pagination: follow the "next page" link
        next_page = response.xpath('//a[contains(text(),"下一页")]/@href').extract_first()
        print(next_page)
        if next_page is not None:
            next_page = urllib.parse.urljoin(response.url, next_page)
            yield scrapy.Request(
                next_page,
                callback=self.parse
            )

    # Parse a gallery's detail page
    def parse_detail(self, response):
        item = response.meta['item']
        item['img_urls'] = []
        # total page count of the gallery: the text of the second-to-last pagination link
        item['total'] = response.xpath('//*[@id="page"]/a[last()-1]/text()').extract_first()
        # every page of the gallery lives at <detail_url>/<page number>
        for i in range(1, int(item['total']) + 1):
            detail_img = response.url + "/" + str(i)
            yield scrapy.Request(
                detail_img,
                callback=self.parse_img,
                meta={"item": item}
            )

    # Collect the image URL from each page into the shared item's img_urls list
    def parse_img(self, response):
        item = response.meta['item']
        item['img_url'] = response.xpath('//*[@id="content"]/a/img/@src').extract_first()
        item['img_urls'].append(item['img_url'])

        return item
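
Each call to parse_img returns the shared item, whose img_urls list grows as the page requests complete; a returned item looks roughly like this (the URLs are only illustrative):

    {
        'title': 'some gallery title',
        'detail_url': 'http://www.mmjpg.com/mm/1234',
        'total': '40',
        'img_urls': ['http://img.mmjpg.com/.../1.jpg', '...'],
        'img_url': 'http://img.mmjpg.com/.../1.jpg',   # URL found on the most recently parsed page
    }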

 

pipelines.py

from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


# Override file_path: it only returns a path string (relative to IMAGES_STORE);
# the rest of the processing still happens in the other methods of ImagesPipeline.
class MyImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):

        for image_url in item['img_urls']:
            referer = image_url  # used by the downloader middleware to defeat hotlink protection
            yield Request(image_url,
                          meta={'item': item, 'referer': referer})  # pass item via meta, otherwise file_path cannot read it and nothing is downloaded

    def file_path(self, request, response=None, info=None):
        item = request.meta['item']
        folder = item['title']
        folder_strip = folder.strip()
        image_guid = request.url.split('/')[-1]
        # the returned path is relative to IMAGES_STORE: one sub-folder per gallery title
        filename = u'imge/{0}/{1}'.format(folder_strip, image_guid)
        return filename

    def item_completed(self, results, item, info):
        image_path = [x['path'] for ok, x in results if ok]
        if not image_path:
            raise DropItem('Item contains no images')
        # item['image_paths'] = image_path
        return item
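
If a gallery title happens to contain characters that Windows does not allow in folder names, the write will fail; a minimal sanitizing variant of file_path, assuming the same item layout as above, could look like this:

import re  # at module level

    def file_path(self, request, response=None, info=None):
        item = request.meta['item']
        # drop characters that Windows does not allow in folder names
        folder = re.sub(r'[\\/:*?"<>|]', '', item['title']).strip()
        image_guid = request.url.split('/')[-1]
        return u'imge/{0}/{1}'.format(folder, image_guid)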

settings.py

DOWNLOADER_MIDDLEWARES = {
   'meizitu.middlewares.MeizituDownloaderMiddleware': 543,
}


IMAGES_STORE = r'C:\study\meizitu\meizitu'     # directory where downloaded images are saved
# The images pipeline skips images that were downloaded recently. The expiration
# period can be adjusted with the FILES_EXPIRES (or IMAGES_EXPIRES) setting, in days.
IMAGES_EXPIRES = 30

ITEM_PIPELINES = {
   'meizitu.pipelines.MyImagesPipeline': 2,
}


middlewares.py

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        # Handle hotlink protection for image downloads: forward the referer
        # passed via request.meta as the Referer header.
        referer = request.meta.get('referer', None)
        if referer:
            request.headers['referer'] = referer
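
Equivalently, the Referer header can be set directly on each image request in get_media_requests, which makes the middleware step unnecessary for this particular purpose; a minimal sketch:

    def get_media_requests(self, item, info):
        for image_url in item['img_urls']:
            # set the Referer header on the request itself instead of via the middleware
            yield Request(image_url,
                          headers={'Referer': image_url},
                          meta={'item': item})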

 

Reposted from: https://www.cnblogs.com/felix8200/p/9068702.html
