# Spider skeleton generated with: scrapy genspider -t crawl dubook dushu.com
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class DubookSpider(CrawlSpider):
    """Crawl the dushu.com book catalogue and yield one dict per book page.

    Starting from ``https://www.dushu.com/book/`` the spider follows the
    links found inside ``.sub-catalog`` elements, then the paginated listing
    URLs (``/book/100<n>_<page>.html``), and finally hands every book detail
    URL (``/book/<5+ digits>/``) to :meth:`parse_item`.
    """

    name = 'dubook'
    allowed_domains = ['dushu.com']
    start_urls = ['https://www.dushu.com/book/']

    # Placeholder stored when a field's XPath matches nothing.
    MISSING_FIELD = "该项为空"
    # Placeholder stored for category levels a book does not have.
    MISSING_CATEGORY = "-"
    # Item keys for the category levels, outermost level first.
    CATEGORY_KEYS = ('firstTitle', 'secondTitle', 'threeTitle', 'fourTitle')
    # Level names used in the progress message; index = category depth - 1.
    CATEGORY_LEVEL_NAMES = ('一级', '二级', '三级', '四级')

    rules = (
        # Top-level category links, selected by CSS class.
        Rule(LinkExtractor(restrict_css='.sub-catalog'), follow=True),
        # Alternative: top-level category links selected by regex.
        # Rule(LinkExtractor(allow=r'/book/100\d+?\.html'), follow=True),
        # For testing: only the first two categories.
        # Rule(LinkExtractor(allow=r'/book/100[1-2]\.html'), follow=True),
        # Paginated listing ("next page") links.
        Rule(LinkExtractor(allow=r'/book/100\d+?_\d+?\.html'), callback='parse_book', follow=True),
        # For testing: 1001_1 1001_2 1002_1 1002_2
        # Rule(LinkExtractor(allow=r'/book/100[1-2]_[1-2]\.html'), callback='parse_book', follow=True),
        # Book detail page links; these are parsed but not followed further.
        Rule(LinkExtractor(allow=r'/book/\d{5,}/'), callback='parse_item', follow=False),
    )

    def parse_book(self, response):
        """Print a progress message for a paginated category listing page.

        The page number is taken from the URL: the text between the last
        ``_`` and the following ``.``. Nothing is yielded from this callback.

        :param response: the listing page response.
        """
        page_no = response.url.split('_')[-1].split('.')[0]
        title = response.xpath(
            '//div/div[@class="row"]/div/div/dl[@class="active"]/dt/text()').get()
        print("准备处理[%s]第%s页" % (title, page_no))

    def parse_item(self, response):
        """Extract one book's details from its detail page.

        :param response: the book detail page response.
        :return: yields a single dict with the four category titles followed
            by ``book_id``, ``book_name``, ``book_author``, ``book_tag``,
            ``book_isbn``, ``book_price``, ``book_info``, ``cover_img_url``
            and ``book_url``. Fields whose XPath matches nothing hold
            ``MISSING_FIELD``.
        """
        missing = self.MISSING_FIELD
        details = '//div/div/div[@class="book-details"]'

        book_name = response.xpath(
            '//div/div/div[@class="book-title"]/h1/text()').get() or missing

        # Breadcrumb links after the first two anchors.  Every entry except
        # the last is stored as a category level; the last one is not stored
        # (presumably it is not a category link -- verify against the page).
        navbar = response.xpath(
            '//div[@class="crumbs"]/a[position()>2]/text()').extract()
        depth = len(navbar) - 1
        if 1 <= depth <= len(self.CATEGORY_KEYS):
            print("该书籍%s分类:%s" % (self.CATEGORY_LEVEL_NAMES[depth - 1], book_name))
            categories = navbar[:depth]
        else:
            # Unexpected breadcrumb length: leave every level as the placeholder.
            print("该书籍分类异常:%s" % book_name)
            categories = []
        padding = [self.MISSING_CATEGORY] * (len(self.CATEGORY_KEYS) - len(categories))
        item = dict(zip(self.CATEGORY_KEYS, categories + padding))

        # Author
        book_author = response.xpath(
            details + '/div/table//tr[1]/td[2]/text()').get() or missing
        # Tags
        book_tag = response.xpath(
            details + '/div/table//tr[4]/td[2]/text()').get() or missing
        # ISBN
        book_isbn = response.xpath(
            details + '/table//tr[1]/td[2]/text()').get() or missing
        # Price: drop the leading character (presumably a currency symbol --
        # confirm against the page).  Only slice when a price was actually
        # found, so the placeholder is stored intact when it is missing.
        raw_price = response.xpath(details + '/div/p/span/text()').get()
        book_price = raw_price[1:] if raw_price else missing
        # Summary
        book_info = response.xpath(
            '//div/div/div[@class="book-summary"][1]/div/div/text()').get() or missing
        # Cover image; "n200.png" is treated as the site's no-cover image.
        cover_img_url = response.xpath(
            '//div/div/div[@class="book-pic"]/div/img/@src').get() or missing

        book_url = response.url
        item.update({
            # The id is the second-to-last "/"-separated piece of the URL,
            # which relies on the detail URL ending with a trailing slash.
            'book_id': book_url.split("/")[-2],
            'book_name': book_name,
            'book_author': book_author,
            'book_tag': book_tag,
            'book_isbn': book_isbn,
            'book_price': book_price,
            'book_info': book_info.strip(),
            'cover_img_url': "暂无封面图" if "n200.png" in cover_img_url else cover_img_url,
            'book_url': book_url,
        })
        yield item
# For the remaining project settings, see: https://blog.csdn.net/z564359805/article/details/109488215