import scrapy
from urllib import request
class bookSpider(scrapy.Spider):
    """Crawl a Qidian novel chapter by chapter, appending each chapter's
    text to a .txt file named after the chapter title.

    Starting from ``start_urls``, each response is parsed for the chapter
    body and the "next chapter" link; the spider follows that link until
    the last chapter, where the link is absent and the crawl ends.
    """
    name = "bookSpider"
    start_urls = ['https://read.qidian.com/chapter/sMwmRYRKF1KLTMDvzUJZaQ2/eGngSvaVqnlOBDFlr9quQA2']

    def parse(self, response):
        """Save the chapter text of *response* and follow the next-chapter link.

        Uses ``extract_first()`` instead of ``extract()[0]`` so that a
        missing node (e.g. no next link on the final chapter) yields
        ``None`` instead of raising IndexError.
        """
        divs = response.xpath('//*[@class="read-content j_readContent"]')
        # Concatenate the stripped text of every <p> in the chapter body.
        # join() is linear; repeated `+=` in a loop would be quadratic.
        zhangjie = "".join(p.extract().strip() for p in divs.xpath('.//p/text()'))
        # Chapter title becomes the output file name. extract_first()
        # returns None when the node is missing, so guard before using it.
        chaptername = response.xpath('//*[@class="j_chapterName"]/text()').extract_first()
        if chaptername:
            fileName = chaptername + ".txt"
            # Explicit encoding so Chinese text is written correctly on any
            # platform; `with` closes the file — no manual close() needed.
            with open(fileName, "a", encoding="utf-8") as f:
                f.write(zhangjie)
                f.write('\n')
        # Follow the next chapter automatically. On the last chapter the
        # xpath matches nothing, extract_first() returns None, and the
        # crawl terminates cleanly instead of raising IndexError.
        next_url = response.xpath('//*[@id="j_chapterNext"]/@href').extract_first()
        if next_url is not None:
            # response.urljoin resolves relative links against response.url,
            # replacing the non-public urllib.request.urljoin re-export.
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
补充:由于在最后一章会找不到next_url的xpath,所以在最后一章爬下来之后,会出现
next_url = response.xpath('//*[@id="j_chapterNext"]/@href').extract()[0]
IndexError: list index out of range
为了避免 xpath 结果为空时的 IndexError,将上方的代码改为
chaptername = response.xpath('//*[@class="j_chapterName"]/text()').extract_first()
next_url = response.xpath('//*[@id="j_chapterNext"]/@href').extract_first()
extract_first() 在结果为空时返回 None 而不是抛出异常,因此后面的 if next_url is not None 判断能让爬虫在最后一章正常终止。注意 chaptername 为 None 时也要先判空再拼接 ".txt",否则会报 TypeError。