# -*- coding: utf-8 -*-
import os

import scrapy

from qidian.items import QidianItem, Zhangjie, BookItem
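
# The three item classes imported above live in qidian/items.py, which is not
# shown here. A minimal sketch of what that file would need to declare, based
# solely on the fields this spider assigns (an assumption, not the project's
# actual code), might look like:
#
#     import scrapy
#
#     class QidianItem(scrapy.Item):        # one listing entry per book
#         book_name = scrapy.Field()
#         book_author = scrapy.Field()
#         book_type = scrapy.Field()
#         book_status = scrapy.Field()
#         book_img = scrapy.Field()
#         book_url = scrapy.Field()
#
#     class Zhangjie(scrapy.Item):          # one table-of-contents entry
#         book_name = scrapy.Field()
#         book_author = scrapy.Field()
#         zhangjie_id = scrapy.Field()
#         book_zhangjie = scrapy.Field()
#         book_zhangjie_url = scrapy.Field()
#
#     class BookItem(scrapy.Item):          # one chapter's text
#         book_name = scrapy.Field()
#         book_author = scrapy.Field()
#         book_mulu = scrapy.Field()
#         zhangjie_id = scrapy.Field()
#         book_text = scrapy.Field()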


class XiaoshuoSpider(scrapy.Spider):
    name = 'xiaoshuo'
    allowed_domains = ["qidian.com"]
    # page the crawl starts from
    start_urls = ['https://www.qidian.com/free/all?orderId=&page=1&vip=hidden&style=1&pageSize=50&siteid=1&pubflag=0&hiddenField=1']

    def parse(self, response):
        # locate the book entries in the HTML with XPath
        li_list = response.xpath("//div[@class='book-img-text']//ul/li")
        for li in li_list:  # iterate over every book on the current page
            item = QidianItem()  # one item per book
            # book title
item["book_name"] = li.xpath("./div[@class='book-mid-info']/h4/a/text()").extract_first()#作者
item["book_author"] = li.xpath("./div[@class='book-mid-info']/p[@class='author']/a[@class='name']/text()").extract_first()#书的类型,玄幻,修真,都市之类的
item["book_type"] = li.xpath("./div[@class='book-mid-info']/p[@class='author']/a[@data-eid='qd_B60']/text()").extract_first()#书的状态,,连载还是完结
item["book_status"] = li.xpath("./div[@class='book-mid-info']/p[@class='author']/span/text()").extract_first()#书的封面图片
item["book_img"] = "http:" + li.xpath("./div[@class='book-img-box']/a/img/@src").extract_first()#书的url
item["book_url"] ="http:" + li.xpath("./div[@class='book-img-box']/a/@href").extract_first()yieldscrapy.Request(
item["book_url"],
                callback=self.parseBookUrl,  # parseBookUrl scrapes each book's table of contents from its URL
                # pass the data parseBookUrl needs via meta, as a dict; only the
                # book title and author are forwarded here
                meta={"book_name": item["book_name"].strip(), "book_author": item["book_author"].strip()}
            )
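
        # Note: this spider hands values between callbacks through request.meta.
        # Recent Scrapy versions (>= 1.7) also offer cb_kwargs, which delivers
        # the values as plain keyword arguments to the callback. A sketch, not
        # part of the original code:
        #
        #     yield scrapy.Request(item["book_url"], callback=self.parseBookUrl,
        #                          cb_kwargs={"book_name": item["book_name"].strip(),
        #                                     "book_author": item["book_author"].strip()})
        #
        # with the callback declared as:
        #
        #     def parseBookUrl(self, response, book_name, book_author): ...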
        # follow the remaining pages
        page_num = len(response.xpath("//div[@class='pagination fr']//ul/li"))
        for i in range(page_num - 2):  # there are only 5 pages, so this rough count is good enough
            next_url = 'http://www.qidian.com/free/all?orderId=&vip=hidden&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=1&page={}'.format(i + 1)
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )
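
        # An alternative (a sketch, not the original approach) is to follow the
        # pagination's own "next" link instead of rebuilding the URL by hand,
        # which would also keep pageSize consistent with start_urls. The
        # selector below is a guess and would need checking against the real
        # markup:
        #
        #     next_href = response.xpath("//div[@class='pagination fr']//a[contains(@class, 'next')]/@href").extract_first()
        #     if next_href:
        #         yield response.follow(next_href, callback=self.parse)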

    def parseBookUrl(self, response):
        # unpack the data forwarded via meta
        bookname = response.meta["book_name"]
        bookauthor = response.meta["book_author"]
        # a single Zhangjie item carries the fields; its current values are
        # copied into the meta dict at each yield below, so reusing the one
        # instance across iterations is safe here
        zhangjie = Zhangjie()
        zhangjie["book_name"] = bookname
        zhangjie["book_author"] = bookauthor
        book_div = response.xpath("//div[@class='volume-wrap']//div[@class='volume']")
        i = 0  # running chapter counter across all volumes
        for div in book_div:  # iterate over the novel's volumes
            li_list = div.xpath("./ul/li")
            for li in li_list:  # iterate over the volume's chapters
                i = i + 1
                zhangjie["zhangjie_id"] = i
                zhangjie["book_zhangjie"] = li.xpath("./a/text()").extract_first()
                # chapter URL; the chapter text is scraped from it below
                zhangjie["book_zhangjie_url"] = "http:" + li.xpath("./a/@href").extract_first()
                yield scrapy.Request(
                    zhangjie["book_zhangjie_url"],
                    callback=self.parseZhangjieUrl,
                    # pass the data parseZhangjieUrl needs via meta
                    meta={"book_name": zhangjie["book_name"].strip(), "zhangjie": zhangjie["book_zhangjie"].strip(), "book_author": bookauthor, "zhangjie_id": zhangjie["zhangjie_id"]}
                )

    def parseZhangjieUrl(self, response):
        # unpack the data forwarded via meta
        bookname = response.meta["book_name"]
        zhangjie = response.meta["zhangjie"]
        book_author = response.meta["book_author"]
        zhangjie_id = response.meta["zhangjie_id"]
        bookitem = BookItem()
        bookitem["book_name"] = bookname
        bookitem["book_mulu"] = zhangjie
        bookitem["book_author"] = book_author
        bookitem["zhangjie_id"] = zhangjie_id
        div_list = response.xpath("//div[@class='main-text-wrap']")
        p_list = div_list.xpath("./div[@class='read-content j_readContent']/p")
        file_path = "./books/" + bookname + "/" + bookitem["book_mulu"] + ".txt"
        content = ""
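        # Caveat: the chapter title goes straight into the file name, so a
        # title containing a path separator would break the open() below. A
        # small sanitizer (hypothetical helper, not in the original) could
        # guard against that:
        #
        #     import re
        #     safe_mulu = re.sub(r'[\\/:*?"<>|]', '_', bookitem["book_mulu"])
        #     file_path = "./books/" + bookname + "/" + safe_mulu + ".txt"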
        # walk every <p> tag, collect its text, and stitch the chapter together
        for p in p_list:
            # extract_first(default="") avoids a TypeError on empty paragraphs
            content += p.xpath("./text()").extract_first(default="") + "\n\n"
        bookitem["book_text"] = content
        # create the book's directory if it does not exist yet
        if not os.path.exists("./books/" + bookname):
            os.makedirs("./books/" + bookname)
        # append the chapter to its txt file
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write(" " + bookitem["book_mulu"] + "\n\n\n\n" + content)
        yield bookitem  # hand the item on to pipelines.py
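
# The final yield hands each BookItem to the project's pipelines.py, which is
# not shown here. A minimal sketch of a pipeline that would receive these items
# (an assumption about the project, not its actual code):
#
#     class QidianPipeline:
#         def process_item(self, item, spider):
#             # e.g. persist item["book_name"], item["book_mulu"], item["book_text"]
#             return item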