Scrapy Learning Path 2 (Image Download and Getting the Download Path)

Downloading images and getting the local paths of the downloaded files.

1

items.py

import scrapy

class InfoItem(scrapy.Item):
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    small_image = scrapy.Field()
    small_image_path = scrapy.Field()
    big_image = scrapy.Field()
    big_image_path = scrapy.Field()
    code = scrapy.Field()
    date = scrapy.Field()
    lengths = scrapy.Field()
    author = scrapy.Field()
    cate = scrapy.Field()
    av_artor = scrapy.Field()
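
Note that the image fields come in pairs: small_image and big_image hold the image URLs scraped by the spider, while small_image_path and big_image_path stay empty until the images pipeline fills them in with the locations of the downloaded files (steps 2 and 3 below).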

spider/jxxx.py

# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
from scrapy.http import Request
from JaSpider.items import InfoItem
from JaSpider.utils.common import get_md5


class JxxxSpider(scrapy.Spider):
    name = 'jxxx'
    allowed_domains = ['www.jxxx.com']
    start_urls = ['http://www.jxxx.com/cn/vl_update.php']

    def parse(self, response):
        for i in response.css('.video'):
            small_image = i.css('img::attr(src)').extract_first()  # scrape the small cover image; passed to parse_info via meta
            link = i.css('a::attr(href)').extract_first()  # scrape the detail-page URL
            real_url = parse.urljoin(response.url, link)  # build the full detail-page URL
            yield Request(url=real_url, meta={'small_image': small_image}, callback=self.parse_info)
        # scrape and request the next page
        next_url = response.css('.page_selector .page.next::attr(href)').extract_first()
        if next_url:
            perfect_next_url = parse.urljoin(response.url, next_url)
            yield Request(url=perfect_next_url, callback=self.parse)

    def parse_info(self, response):
        small_image = "http:"+response.meta['small_image']
        big_image = "http:"+response.xpath('//div[@id="video_jacket"]/img/@src').extract_first()
        code = response.css('#video_id .text::text').extract_first()
        date = response.css('#video_date .text::text').extract_first()
        lengths = response.css('#video_length .text::text').extract_first()
        author = response.css('#video_director .director a::text').extract_first() or "不明"  # "不明" means "unknown"
        cate = ','.join([i.css('a::text').extract_first() for i in response.css('#video_genres .genre') if i.css('a::text').extract_first()])
        av_artor = ','.join([i.css('a::text').extract_first() for i in response.css('.star') if i.css('a::text').extract_first()])
        info_item = InfoItem()
        info_item['url'] = response.url
        info_item['url_object_id'] = get_md5(response.url)
        info_item['small_image'] = small_image
        info_item['big_image'] = [big_image]  # the images pipeline expects a list of URLs (see the note in step 2)
        info_item['code'] = code
        info_item['date'] = date
        info_item['lengths'] = lengths
        info_item['author'] = author
        info_item['cate'] = cate
        info_item['av_artor'] = av_artor
        yield info_item
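
With the item and spider in place, a quick sanity check is to run scrapy crawl jxxx from the project root and confirm that populated items appear in the log.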

2

Enable the pipeline in settings.py

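The original screenshot is lost. As a minimal sketch of what it showed, enabling Scrapy's built-in ImagesPipeline in settings.py looks like the following (ITEM_PIPELINES, IMAGES_URLS_FIELD, and IMAGES_STORE are standard Scrapy settings; the priority value 1 and the images folder name are assumptions). Note that ImagesPipeline also requires the Pillow library (pip install pillow).

import os

# settings.py -- a minimal sketch, assuming the built-in ImagesPipeline
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_URLS_FIELD = "big_image"  # item field that holds the list of image URLs
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')  # where image files are saved
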
Note: in spider/jxxx.py, the field handed to the images pipeline must be a list of URLs, not a bare string; that is why parse_info assigns info_item['big_image'] = [big_image]. A minimal illustration follows.
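
# Wrong: a bare string -- the pipeline iterates it character by character,
# and each character fails with "ValueError: Missing scheme in request url"
info_item['big_image'] = big_image

# Right: wrap the single URL in a list
info_item['big_image'] = [big_image]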

3

To customize the behavior further:
settings.py
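This screenshot is lost; a plausible reconstruction is to point ITEM_PIPELINES at a project-level subclass instead of the built-in pipeline (the class name JaImagePipeline is a hypothetical stand-in for the author's original):

# settings.py -- sketch; JaImagePipeline is a hypothetical class name
ITEM_PIPELINES = {
    'JaSpider.pipelines.JaImagePipeline': 1,
}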

pipelines.py

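This screenshot is lost as well; the standard way to capture the stored file path is to subclass ImagesPipeline and override item_completed (again, JaImagePipeline is a hypothetical name, matching the settings sketch above):

from scrapy.pipelines.images import ImagesPipeline


class JaImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, info_dict) tuples, one per image URL;
        # info_dict["path"] is the saved file's path relative to IMAGES_STORE
        if "big_image" in item:
            for ok, value in results:
                if ok:
                    item["big_image_path"] = value["path"]
        return item  # must return the item so downstream pipelines receive it

After a successful crawl, big_image_path holds a value such as full/<sha1 of the URL>.jpg, relative to IMAGES_STORE.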

4

Supplement

Create a new utils/common.py:

import hashlib


def get_md5(url):
    # hashlib works on bytes, so encode str input as UTF-8 first
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()


if __name__ == "__main__":
    a = get_md5('http://www.haddu.com')
    print(a)
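
Because an MD5 digest is always a fixed-length 32-character hex string, get_md5 turns variable-length URLs into compact values that are convenient as database keys; this is what url_object_id stores in the item.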