在上一篇的基础上,将数据存储的字典进行改进。将获取到的内容存到 item 中。
一、在 items.py 文件中的类里添加字段名
import scrapy
class QidianHotItem(scrapy.Item):
    """Container for one hot-novel record scraped from qidian.com."""
    # Declare one scrapy.Field() per attribute we want to carry through the pipeline.
    name = scrapy.Field()    # book title
    author = scrapy.Field()  # author name
    type = scrapy.Field()    # genre / category
    state = scrapy.Field()   # serialization status (e.g. ongoing / finished)
二、在自己建的 qidian_hot_spider.py 中进行如下变化
导入包:
from qidian_hot.items import QidianHotItem
将原来用字典存储的部分改成用 item 存储:
# Populate a QidianHotItem (declared in items.py) instead of a plain dict,
# then hand the structured item to the Scrapy engine via yield.
item = QidianHotItem()
item["name"] = name
item["author"] = author
item["type"] = type
item["state"] = state
yield item
整体的代码如下:
from scrapy import Request
from scrapy.spiders import Spider
from qidian_hot.items import QidianHotItem
class HotSalesSpider(Spider):
    """Spider that scrapes the qidian.com "all books" listing into QidianHotItem objects."""

    # Spider name; a project may contain several spiders, each with a unique name.
    name = "hot"
    # start_urls = ["https://www.qidian.com/all"]
    # Pretend to be a regular browser; the default Scrapy user agent may be blocked.
    qidian_headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}

    def start_requests(self):
        """Override the default request generation so the browser-like headers are sent."""
        url = "https://www.qidian.com/all"
        yield Request(url, headers=self.qidian_headers)

    def parse(self, response):
        """Parse one listing page and yield one QidianHotItem per book entry."""
        # A leading "//" makes XPath search the entire document for matching nodes.
        list_selector = response.xpath("//div[@class='book-mid-info']")
        for one in list_selector:
            # extract_first() returns None (instead of raising IndexError) when a
            # node is missing, so one malformed entry cannot abort the whole page.
            # Local names avoid shadowing the builtin `type`; item keys are unchanged.
            book_name = one.xpath("h4/a/text()").extract_first()
            book_author = one.xpath("p[@class='author']/a[1]/text()").extract_first()
            book_type = one.xpath("p[@class='author']/a[2]/text()").extract_first()
            book_state = one.xpath("p[@class='author']/span/text()").extract_first()
            # Bundle the fields into a single structured item and emit it.
            item = QidianHotItem()
            item["name"] = book_name
            item["author"] = book_author
            item["type"] = book_type
            item["state"] = book_state
            yield item
介绍爬取多页数据的方法:
一、首先定义一个 current_page
# Class-level page counter; starts at the first listing page.
current_page = 1
二、存完这页的数据后,重新发一个请求
self.current_page += 1
if self.current_page <= 25:
next_url = "https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=%d" % self.current_page
yield Request(next_url, callback=self.parse)