难点:同时实现外层(书籍列表)和内层(章节列表)两级翻页
重点:XPath 解析,以及请求之间的参数传递(meta)
👇👇👇👇废话不多说,直接上代码👇👇👇👇
所有解释均写在代码注释里
import scrapy
from ..items import *
class MyxmlySpider(scrapy.Spider):
    """Crawl Ximalaya's literature audiobook listing and print chapter titles.

    Pagination is followed at two levels: the book listing pages (outer) and
    the chapter pages inside every book (inner). The book title, author and
    book id are handed from callback to callback through ``Request.meta``.
    """

    name = 'myxmly'
    allowed_domains = ['ximalaya.com']
    start_urls = ['https://www.ximalaya.com/youshengshu/wenxue/p1/']
    # Number of the listing page most recently scheduled; page 1 is covered
    # by start_urls.
    book_page = 1
    # Last listing page to crawl (inclusive).
    max_book_page = 5

    def parse(self, response, **kwargs):
        """Parse one listing page.

        Yields one request per book (handled by ``book_zj_page``) and, while
        ``book_page`` is below ``max_book_page``, one request for the next
        listing page (handled by ``parse`` again).
        """
        for li in response.xpath('//div[@class="content"]/ul/li'):
            href = li.xpath('./div/a/@href').get()
            if not href:
                # Skip entries without a link instead of failing on None.
                continue

            book_title = li.xpath('./div/a/span[@class="v-m kF_"]/text()').get()
            author = li.xpath('./div/a[@class="album-author kF_"]/@title').get()
            # Absolute URL of the book page, where the chapter list lives.
            book_url = response.urljoin(href)
            # Last path segment, whether or not the href ends with a slash.
            book_id = book_url.rstrip('/').split('/')[-1]

            yield scrapy.Request(
                url=book_url,
                callback=self.book_zj_page,
                # Values forwarded to book_zj_page.
                meta={'book_title': book_title, 'author': author, 'book_id': book_id},
            )

        # Outer pagination. Advance the counter *before* building the URL so
        # that page 1 (already fetched through start_urls) is not requested
        # a second time.
        if self.book_page < self.max_book_page:
            self.book_page += 1
            print('获取到的书籍页码********', self.book_page)  # debug output
            url = f'https://www.ximalaya.com/youshengshu/wenxue/p{self.book_page}/'
            yield scrapy.Request(url, callback=self.parse)

    def book_zj_page(self, response):
        """Work out how many chapter pages a book has and request each one.

        Reads ``book_title``, ``author`` and ``book_id`` from
        ``response.meta`` and forwards them to ``book_zj``.
        """
        chapter_meta = {
            'book_title': response.meta['book_title'],
            'author': response.meta['author'],
            'book_id': response.meta['book_id'],
        }
        book_id = chapter_meta['book_id']

        # Text of the second-to-last pagination entry, used as the number of
        # the last chapter page.
        last_page_text = response.xpath(
            '//ul[@class="pagination-page WJ_"]/li[last()-1]/a/span/text()'
        ).get()
        try:
            last_page = int(last_page_text)
        except (TypeError, ValueError):
            # No pagination widget, or its text is not a number: treat the
            # book as having a single chapter page.
            last_page = None

        if last_page is None:
            chapter_urls = [f'https://www.ximalaya.com/youshengshu/{book_id}/p1']
        else:
            chapter_urls = [
                f'https://www.ximalaya.com/youshengshu/{book_id}/p{page}/'
                for page in range(1, last_page + 1)
            ]

        for chapter_url in chapter_urls:
            # Request shallow-copies ``meta``, so sharing one dict is safe.
            yield scrapy.Request(
                url=chapter_url,
                callback=self.book_zj,
                meta=chapter_meta,
            )

    def book_zj(self, response):
        """Print the title of every chapter listed on one chapter page."""
        book_title = response.meta['book_title']
        for zj in response.xpath('//div[@class="sound-list _is"]/ul/li'):
            zj_name = zj.xpath('./div[@class="text lF_"]/a/@title').get()
            print(f'书籍名称----({book_title})---------章节名称--------{zj_name}')