Scrapy: build a rule-based CrawlSpider that scrapes book data from dushu.com, saves it to a database and a local spreadsheet, and downloads cover images with ImagesPipeline

First, generate the spider from Scrapy's crawl template:

scrapy genspider -t crawl dubook dushu.com

The finished spider, spiders/dubook.py:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class DubookSpider(CrawlSpider):
    name = 'dubook'
    allowed_domains = ['dushu.com']
    start_urls = ['https://www.dushu.com/book/']

    rules = (
        # Top-level category links, matched with a CSS selector
        Rule(LinkExtractor(restrict_css='.sub-catalog'), follow=True),
        # The same category links, matched with a regex instead
        # Rule(LinkExtractor(allow=r'/book/100\d+?\.html'), follow=True),
        # For testing, restrict to categories 1001 and 1002
        # Rule(LinkExtractor(allow=r'/book/100[1-2]\.html'), follow=True),

        # Pagination links within a category
        Rule(LinkExtractor(allow=r'/book/100\d+?_\d+?\.html'), callback='parse_book', follow=True),
        # For testing, restrict to pages 1001_1 1001_2 1002_1 1002_2
        # Rule(LinkExtractor(allow=r'/book/100[1-2]_[1-2]\.html'), callback='parse_book', follow=True),

        # Book detail-page links
        Rule(LinkExtractor(allow=r'/book/\d{5,}/'), callback='parse_item', follow=False),
    )

    def parse_book(self, response):
        # A list URL looks like /book/1001_2.html; pull out the page number
        page = response.url.split('_')[-1].split('.')[0]
        title = response.xpath('//div/div[@class="row"]/div/div/dl[@class="active"]/dt/text()').get()
        print("Processing page %s of category [%s]" % (page, title))

    def parse_item(self, response):
        item = {}
        book_name = response.xpath('//div/div/div[@class="book-title"]/h1/text()').get() or "N/A"
        # Breadcrumb trail, skipping the first two entries (home and the books root)
        navbar = response.xpath('//div[@class="crumbs"]/a[position()>2]/text()').extract()
        # The last breadcrumb entry is the book itself, so navbar holds between
        # 1 and 4 category levels plus the title; pad the missing levels with "-"
        level_keys = ['firstTitle', 'secondTitle', 'threeTitle', 'fourTitle']
        if 2 <= len(navbar) <= 5:
            print("Book has %d category level(s): %s" % (len(navbar) - 1, book_name))
            for i, key in enumerate(level_keys):
                item[key] = navbar[i] if i < len(navbar) - 1 else "-"
        else:
            print("Unexpected breadcrumb structure for: %s" % book_name)
            for key in level_keys:
                item[key] = "-"

        # Author
        book_author = response.xpath(
            '//div/div/div[@class="book-details"]/div/table//tr[1]/td[2]/text()').get() or "N/A"
        # Tags
        book_tag = response.xpath(
            '//div/div/div[@class="book-details"]/div/table//tr[4]/td[2]/text()').get() or "N/A"
        # ISBN
        book_isbn = response.xpath(
            '//div/div/div[@class="book-details"]/table//tr[1]/td[2]/text()').get() or "N/A"
        # Price (kept raw here; the currency symbol is stripped below)
        book_price = response.xpath('//div/div/div[@class="book-details"]/div/p/span/text()').get()
        # Summary
        book_info = response.xpath('//div/div/div[@class="book-summary"][1]/div/div/text()').get() or "N/A"
        # Cover image URL
        cover_img_url = response.xpath('//div/div/div[@class="book-pic"]/div/img/@src').get() or ""
        # Detail-page URL; the book id is its last path segment, e.g. /book/12345/ -> 12345
        book_url = response.url
        book_id = book_url.split("/")[-2]

        item['book_id'] = book_id
        item['book_name'] = book_name
        item['book_author'] = book_author
        item['book_tag'] = book_tag
        item['book_isbn'] = book_isbn
        # Drop the leading currency symbol from prices like "¥56.00";
        # guard against a missing price instead of slicing the fallback string
        item['book_price'] = book_price[1:] if book_price else "N/A"
        item['book_info'] = book_info.strip()
        # dushu.com serves n200.png as a placeholder when there is no real cover
        item['cover_img_url'] = cover_img_url if cover_img_url and "n200.png" not in cover_img_url else "no cover"
        item['book_url'] = book_url
        yield item
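
The cover download promised in the title is handled by subclassing Scrapy's ImagesPipeline (which requires Pillow). A minimal sketch: the class name DushuImagesPipeline, the covers/ naming scheme, and the cover_img_path field are illustrative assumptions, not from the original project.

# pipelines.py — cover-image download sketch
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class DushuImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Only request real cover URLs; books flagged "no cover" are skipped
        url = item.get('cover_img_url', '')
        if url.startswith('http'):
            yield scrapy.Request(url, meta={'book_id': item['book_id']})

    def file_path(self, request, response=None, info=None, *, item=None):
        # Name each file after the book id, e.g. covers/12345.jpg
        return 'covers/%s.jpg' % request.meta['book_id']

    def item_completed(self, results, item, info):
        # Record the local path on the item if the download succeeded
        paths = [res['path'] for ok, res in results if ok]
        item['cover_img_path'] = paths[0] if paths else ''
        return item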

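Saving to the database is an ordinary item pipeline. A sketch using pymysql; the connection parameters and the books table schema are assumptions to adapt to your own setup.

# pipelines.py — MySQL storage sketch (pip install pymysql)
import pymysql


class MysqlPipeline:
    def open_spider(self, spider):
        # Placeholder credentials; change to match your environment
        self.conn = pymysql.connect(host='localhost', user='root', password='root',
                                    database='dushu', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # REPLACE INTO keeps re-crawled books from piling up duplicate rows
        sql = ('REPLACE INTO books (book_id, book_name, book_author, book_tag, '
               'book_isbn, book_price, book_info, cover_img_url, book_url, '
               'firstTitle, secondTitle, threeTitle, fourTitle) '
               'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
        self.cursor.execute(sql, (
            item['book_id'], item['book_name'], item['book_author'],
            item['book_tag'], item['book_isbn'], item['book_price'],
            item['book_info'], item['cover_img_url'], item['book_url'],
            item['firstTitle'], item['secondTitle'], item['threeTitle'],
            item['fourTitle']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
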
The full write-up of the remaining project configuration is at: https://blog.csdn.net/z564359805/article/details/109488215
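
In outline, the pieces above are wired together in settings.py like this (the dubook package path assumes the project shares the spider's name):

# settings.py — the relevant additions
ROBOTSTXT_OBEY = False     # typical for this kind of tutorial crawl
DOWNLOAD_DELAY = 1         # be polite; the full catalog is large

ITEM_PIPELINES = {
    'dubook.pipelines.DushuImagesPipeline': 300,
    'dubook.pipelines.MysqlPipeline': 400,
}
IMAGES_STORE = './images'  # required by ImagesPipeline; covers land in ./images/covers/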

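For the local spreadsheet, no extra code is needed: Scrapy's built-in feed export writes the yielded items straight to CSV (the filename is arbitrary), and the resulting file opens directly in Excel or WPS:

scrapy crawl dubook -o books.csv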