本文代码在系列八代码的基础上修改
1.安装fake-useragent库
这个库能随机生成UserAgent
pip install fake-useragent
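2.编写Middleware(写好后需要在settings.py的DOWNLOADER_MIDDLEWARES中启用qidian_hot.middlewares.QidianHotUserAgentMiddleware,否则不会生效)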
3.去掉原来的用户代理(User-Agent)设置
原本我是在headers里传入UserAgent
4.Settings.py文件把ROBOTSTXT_OBEY设置为True
启动测试:
Middleware函数里打印出的随机生成的UserAgent
qidian_hot_spider.py完整代码
from scrapy import Request
from scrapy.spiders import Spider
from qidian_hot.items import QidianHotItem
from scrapy.loader import ItemLoader
class HotSalesSpider(Spider):
    """Crawl the Qidian hot-sales ranking and yield one item per novel.

    Pages are fetched one after another: each parsed page schedules the
    next one until ``max_page`` has been requested.
    """

    name = "hot"
    # Page number of the most recently requested ranking page.
    current_page = 1
    # Last ranking page to request (inclusive).
    max_page = 25
    # Ranking URL template; %d is replaced by the page number.
    page_url = "https://www.qidian.com/rank/hotsales?style=1&page=%d"
    # NOTE: a fixed browser User-Agent used to be passed via ``headers=`` on
    # every Request. It is no longer set here, so the header comes from
    # whatever the project's settings / downloader middlewares provide.

    def start_requests(self):
        """Yield the request for the first ranking page."""
        yield Request(self.page_url % self.current_page,
                      callback=self.qidian_parse)

    def qidian_parse(self, response):
        """Parse one ranking page.

        Yields one loaded ``QidianHotItem`` per novel found on the page,
        followed by a request for the next page while ``current_page`` has
        not gone past ``max_page``.
        """
        # Each novel's details live in a div with class "book-mid-info".
        list_selector = response.xpath("//div[@class='book-mid-info']")
        for one_selector in list_selector:
            # The loader extracts fields relative to this novel's selector.
            novel = ItemLoader(item=QidianHotItem(), selector=one_selector)
            # Novel title.
            novel.add_xpath("name", "h4/a/text()")
            # Author: first <a> inside the first <p>.
            novel.add_xpath("author", "p[1]/a[1]/text()")
            # Category: second <a> inside the first <p>.
            novel.add_xpath("type", "p[1]/a[2]/text()")
            # Publication form (text of the span under ".author").
            novel.add_css("form", ".author span::text")
            # load_item() populates the item from the collected values.
            yield novel.load_item()

        # Pagination: advance the counter and re-enter this callback with
        # the next page until the last page has been requested.
        self.current_page += 1
        if self.current_page <= self.max_page:
            next_url = self.page_url % self.current_page
            yield Request(next_url, callback=self.qidian_parse)
middlewares.py代码
#导入UserAgentMiddleware组件模块
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
from fake_useragent import UserAgent
# Downloader middleware that assigns a random User-Agent to every request.
class QidianHotUserAgentMiddleware(UserAgentMiddleware):
    """Set a randomly generated User-Agent header on each outgoing request.

    Extends Scrapy's UserAgentMiddleware and overrides ``process_request``.
    """

    # fake_useragent generator; created on first use and then reused so the
    # UserAgent object is not rebuilt for every single request.
    _ua = None

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent header with a random value.

        Args:
            request: the outgoing request whose headers are modified in place.
            spider: the spider that issued the request (unused).

        Returns nothing (``None``).
        """
        if self._ua is None:
            self._ua = UserAgent()
        # Each access to ``.random`` produces a randomly chosen User-Agent.
        request.headers['User-Agent'] = self._ua.random
        # Print the chosen value so it can be seen while the crawl runs.
        print(request.headers['User-Agent'])