Using Scrapy with Selenium
Most websites nowadays have strict anti-scraping measures and encrypt their data with JavaScript, which is painful to reverse-engineer. Selenium gets around this nicely: we don't need to care what requests the page fires in the background, or analyze the rendering process, to get the data we want. So let's learn Scrapy+Selenium by taking a knife to an official game site!
1. Create the Project
- Create a new project named Honor:
scrapy startproject Honor
- Generate the spider with the following command:
scrapy genspider honor pvp.qq.com
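For reference, these two commands produce the standard Scrapy skeleton, with the spider file landing under spiders/:

Honor/
    scrapy.cfg
    Honor/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            honor.py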
2. Define the Item
items.py
import scrapy

# Item for the main hero information
class HonorItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # hero ID
    ename = scrapy.Field()
    # hero epithet
    title = scrapy.Field()
    # hero name
    cname = scrapy.Field()
    # hero video URL
    hvideo_url = scrapy.Field()
    # survivability
    viability = scrapy.Field()
    # attack damage
    ad = scrapy.Field()
    # skill effects
    cover_skill = scrapy.Field()
    # difficulty
    difficulity = scrapy.Field()
    # hero backstory
    hero_story = scrapy.Field()
    # inscription tips
    inscr_tips = scrapy.Field()
    # equipment/build tips
    eq_tips = scrapy.Field()
    # hero introduction video link
    honor_link = scrapy.Field()
3. The Spider
spiders/honor.py
import scrapy
from ..items import HonorItem

class HonorSpider(scrapy.Spider):
    name = 'honor'
    allowed_domains = ['pvp.qq.com']
    start_urls = ['https://pvp.qq.com/web201605/herolist.shtml']

    def parse(self, response):
        """
        Crawl the hero list page.
        :param response:
        :return:
        """
        # Collect the links to each hero's detail page
        honor_list = response.xpath("//ul[@class='herolist clearfix']/li/a/@href").extract()
        for link in honor_list:
            honor_link = "https://pvp.qq.com/web201605/" + link
            yield scrapy.Request(honor_link, callback=self.parse_detail)

    def parse_detail(self, response):
        """
        Parse the detail page after Selenium has finished loading it.
        :param response:
        :return:
        """
        # The ability bars carry their values in inline style attributes
        viability = response.xpath("/html/body/div[3]/div[1]/div/div/div[1]/ul/li[1]/span/i/@style").extract_first()
        ad = response.xpath("/html/body/div[3]/div[1]/div/div/div[1]/ul/li[2]/span/i/@style").extract_first()
        cover_skill = response.xpath("//body/div[3]/div[1]/div/div/div[1]/ul/li[3]/span/i/@style").extract_first()
        difficulity = response.xpath("//body/div[3]/div[1]/div/div/div[1]/ul/li[4]/span/i/@style").extract_first()
        item = HonorItem()
        # hero ID: the three characters just before ".shtml" in the detail URL
        item["ename"] = response.url[-9:-6]
        # hero epithet
        item["title"] = response.xpath("//body/div[3]/div[1]/div/div/div[1]/h3/text()").extract_first()
        # hero name
        item["cname"] = response.xpath("/html/body/div[3]/div[1]/div/div/div[1]/h2/text()").extract_first()
        # survivability: keep only the value after the colon in the style string
        item["viability"] = viability[viability.find(":") + 1:]
        # attack damage
        item["ad"] = ad[ad.find(":") + 1:]
        # skill effects
        item["cover_skill"] = cover_skill[cover_skill.find(":") + 1:]
        # difficulty
        item["difficulity"] = difficulity[difficulity.find(":") + 1:]
        # hero backstory
        item["hero_story"] = response.xpath("//*[@id='hero-story']/div[2]/p/text()").extract()
        # inscription tips
        item["inscr_tips"] = response.xpath(
            "/html/body/div[3]/div[2]/div/div[1]/div[3]/div[2]/p/text()").extract_first()
        # equipment/build tips
        item["eq_tips"] = response.xpath(
            "/html/body/div[3]/div[2]/div/div[2]/div[2]/div[2]/div[1]/p/text()").extract_first()
        yield item
4. Wiring in Selenium
We implement this as a Downloader Middleware and handle each request in its process_request method: the page is loaded through Selenium, and if the load fails it is retried via the retrying library. The middleware then returns a response object to the engine, which treats it as if it came from the downloader and, by default, hands it to parse_detail in the spider.
middlewares.py
import scrapy
from scrapy import signals
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from retrying import retry

# Middleware that drives Selenium
class SeleniumMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome()

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # Quit the browser when the spider closes
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.driver.quit()

    # Retry up to 40 times, waiting 1 second between attempts
    @retry(stop_max_attempt_number=40, wait_fixed=1000)
    def retry_load_page(self, request, spider):
        # If the target node is found, the page rendered successfully and we fall through
        try:
            # Use the presence of an //h3 node to decide whether the page has loaded
            self.driver.find_element(By.XPATH, "//h3")
        except NoSuchElementException:
            self.count += 1
            spider.logger.info("<{}> retry {} times".format(request.url, self.count))
            # Re-raise so that retry can catch the exception and try again
            raise Exception("<{}> page load failed.".format(request.url))

    def process_request(self, request, spider):
        self.count = 0
        self.driver.get(request.url)
        # A fixed wait would also work:
        # time.sleep(2)
        try:
            # Poll until the page data has rendered; extract as soon as it has
            self.retry_load_page(request, spider)
            # page_source is a Unicode string
            html = self.driver.page_source
            # Return a response object to the engine; it is treated as the downloader's
            # response and handed to the spider for parsing by default
            return scrapy.http.HtmlResponse(url=self.driver.current_url, body=html.encode("utf-8"),
                                            encoding="utf-8", request=request)
        except Exception as e:
            spider.logger.error(e)
            # Returning the request tells Scrapy to reschedule it
            return request
SeleniumMiddleware must be enabled in settings.py, or it will never run.
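A minimal sketch of that settings.py entry, assuming the middleware lives in Honor/middlewares.py (the priority 543 is just the conventional template value):

DOWNLOADER_MIDDLEWARES = {
    "Honor.middlewares.SeleniumMiddleware": 543,
}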
5. Storage (a local file for now)
pipelines.py
import json

# For testing, dump the data to a local JSON file
class HonorPipeline(object):
    def open_spider(self, spider):
        self.f = open("honor.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # One JSON object per line, comma-terminated;
        # ensure_ascii=False keeps the Chinese text readable
        json_str = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.f.write(json_str)
        # Return the item so any later pipelines can process it
        return item

    def close_spider(self, spider):
        self.f.close()
Finally, enable the pipeline in settings.py.
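Again a sketch, assuming the default module path (300 is the usual priority); if the site's robots.txt blocks these pages you may also need ROBOTSTXT_OBEY = False:

ITEM_PIPELINES = {
    "Honor.pipelines.HonorPipeline": 300,
}

Then run the spider from the project root:

scrapy crawl honor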