准备步骤:
1.首先在虚拟机搭建好Splash环境,参考:
https://blog.csdn.net/qq_41622603/article/details/105814689
2.安装Splash对应的Python库
pip install scrapy-splash
3.创建好项目
不会创建项目可以参考https://blog.csdn.net/qq_41622603/article/details/105129648
一,配置scrapy-splash环境
ROBOTSTXT_OBEY = False
# Address of the Splash rendering server (the VM running Splash on port 8050)
SPLASH_URL = 'http://192.168.253.132:8050'
# Dedup filter that takes Splash-specific request arguments into account
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# HTTP cache storage aware of Splash requests
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
# Enables cache_args support (optional)
SPIDER_MIDDLEWARES = {
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# Enable the two Splash downloader middlewares and move
# HttpCompressionMiddleware after them (order 810)
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
二,封装爬取的属性
在item.py中把需要获取的属性封装起来
三,导入需要用到的库和模块
from scrapy.spiders import Spider
from yihaodian.items import YihaodianItem #导入item模块
from scrapy_splash import SplashRequest #导入SplashRequest模块,代替以前用的Request模块
四,编写Lua脚本
Splash是通过Lua脚本来执行一系列的渲染操作,从而实现页面的抓取
脚本的执行逻辑:
1.先打开url
2.等待指定时间
3.运行js代码,滚动到class='mod_turn_page clearfix mt20'的标签,这个标签是一号店页面底部的翻页div
# Lua script executed by Splash's 'execute' endpoint:
# load the page, wait, scroll the pagination div into view (forces lazy
# content to render), wait again, then return the rendered HTML.
# BUGFIX: the original used Python-style '#' comments inside the Lua
# source — a Lua syntax error; Lua comments start with '--'.
lua_script="""
function main(splash, args)
splash:go(args.url) --加载的URl
splash:wait(args.wait) --等待时间
splash:runjs("document.getElementsByClassName('mod_turn_page clearfix mt20')[0].scrollIntoView(true)")
splash:wait(args.wait)
return splash:html()
end
"""
五,获取开始的Request
这里用了SplashRequest来代替Request
六,处理数据
完整代码
from scrapy.spiders import Spider
from yihaodian.items import YihaodianItem
from scrapy_splash import SplashRequest
# Lua script run by the Splash 'execute' endpoint:
# 1. load args.url; 2. wait args.wait seconds; 3. scroll the page-turn div
#    into view (triggers rendering of lazily-loaded content);
# 4. wait again; 5. return the fully rendered HTML to the spider.
lua_script="""
function main(splash, args)
splash:go(args.url)
splash:wait(args.wait)
splash:runjs("document.getElementsByClassName('mod_turn_page clearfix mt20')[0].scrollIntoView(true)")
splash:wait(args.wait)
return splash:html()
end
"""
class PhoneSpider(Spider):
    """Crawl iPhone listings from search.yhd.com via Splash-rendered pages."""

    name = 'iphone'
    url = 'https://search.yhd.com/c0-0/kiphone/'

    def _splash_request(self, url):
        """Build a SplashRequest that executes the Lua rendering script."""
        return SplashRequest(url,
                             callback=self.parse,
                             # 'execute' endpoint runs the Lua script on the Splash server
                             endpoint='execute',
                             args={
                                 'lua_source': lua_script,
                                 'images': 0,  # skip images to speed up rendering
                                 'wait': 3,    # seconds the script waits after load/scroll
                             },
                             # cache the (large) script server-side; later requests
                             # send only its hash
                             cache_args=['lua_source'])

    def start_requests(self):
        """Issue the initial Splash-rendered request."""
        yield self._splash_request(self.url)

    def parse(self, response):
        """Yield one item per product box, then follow the next-page link."""
        for selector in response.xpath("//div[@class='itemBox']"):
            # BUGFIX: create a fresh item per product — the original reused a
            # single mutated YihaodianItem instance across all yields.
            item = YihaodianItem()
            # BUGFIX: extract()[-1] raised IndexError on boxes missing a field;
            # guard with a default empty string.
            price = (selector.xpath(".//em[@class='num']/text()").extract() or [''])[-1]
            title = (selector.xpath(".//p[@class='proName clearfix']/a/text()").extract() or [''])[-1]
            positive_ratio = (selector.xpath(".//span[@class='positiveRatio']/text()").extract() or [''])[-1]
            # BUGFIX: the original stored the whole extract() list here while all
            # other fields store a single string — use extract_first() for consistency.
            store_name = selector.xpath(".//span[@class='shop_text']/text()").extract_first(default='')
            item["price"] = price.strip("\n\t")
            item["title"] = title.strip("\n\t")
            item["positiveRatio"] = positive_ratio
            item["storeName"] = store_name
            yield item
        # Pagination: the @href is relative, prepend the listing base URL.
        next_url = response.xpath("//a[@class='page_next']/@href").extract_first()
        if next_url:
            yield self._splash_request('https://search.yhd.com/c0-0/kiphone/' + next_url)