Full-Site Crawling with scrapy-splash
A complete example is available on GitHub.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest
from scrapy.linkextractors import LinkExtractor


class MySpider(CrawlSpider):
    name = 'innda'
    url = ['http://www.zjepb.gov.cn/col/col1201494/index.html']

    def start_requests(self):
        for url in self.url:
            # Splash defaults to the render.html endpoint, which returns
            # the HTML of the page after JavaScript has rendered it.
            yield SplashRequest(url, args={'wait': 10})

    rules = (
        Rule(LinkExtractor(allow=('.*',)), follow=True),
    )

    def parse(self, response):
        print(response.text)
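For this demo (and the later versions) to work at all, the project's settings.py must wire in the scrapy-splash middlewares. A minimal sketch, following the scrapy-splash README and assuming a Splash instance listening on localhost:8050:

# settings.py -- minimal scrapy-splash wiring (values from the scrapy-splash
# README; the URL assumes a local Splash instance, e.g. started with
# `docker run -p 8050:8050 scrapinghub/splash`)
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'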
After writing this simple demo, I found that the code does not automatically crawl the whole site.
The reason: look at the CrawlSpider source code and find the method responsible for following extracted links.
def _requests_to_follow(self, response):
    # The response type is checked here; only HtmlResponse is supported.
    if not isinstance(response, HtmlResponse):
        return
    seen = set()
    for n, rule in enumerate(self._rules):
        links = [lnk for lnk in rule.link_extractor.extract_links(response)
                 if lnk not in seen]
        if links and rule.process_links:
            links = rule.process_links(links)
        for link in links:
            seen.add(link)
            r = self._build_request(n, link)
            yield rule.process_request(r)
As you can see, this link-following method performs a type check and only accepts the HtmlResponse type. So what response type do we get for pages requested through scrapy-splash?
The simplest way to find out is to override the method and print the type:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest
from scrapy.linkextractors import LinkExtractor
from scrapy.http import HtmlResponse  # needed for the isinstance check below


class MySpider(CrawlSpider):
    name = 'innda'
    url = ['http://www.zjepb.gov.cn/col/col1201494/index.html']

    def start_requests(self):
        for url in self.url:
            # Splash defaults to the render.html endpoint, which returns
            # the HTML of the page after JavaScript has rendered it.
            yield SplashRequest(url, args={'wait': 10})

    rules = (
        Rule(LinkExtractor(allow=('.*',)), follow=True),
    )

    # Override the parent class method
    def _requests_to_follow(self, response):
        # Print the response type
        print(type(response))
        # The response type is checked here; only HtmlResponse is supported.
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def parse(self, response):
        print(response.text)
Running the spider prints <class 'scrapy_splash.response.SplashTextResponse'>.
The response type is SplashTextResponse, because the Splash endpoint we are using is render.html.
Now the problem is clear: the link-following method throws away the responses returned by Splash.
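To see why the check rejects it, note that in the scrapy-splash version used here SplashTextResponse derives from scrapy's TextResponse, not from HtmlResponse, so the isinstance test fails. A quick sanity check:

from scrapy.http import HtmlResponse
from scrapy_splash import SplashTextResponse

# SplashTextResponse is not a subclass of HtmlResponse, so
# CrawlSpider._requests_to_follow returns before extracting any links.
print(issubclass(SplashTextResponse, HtmlResponse))  # False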
The fix: modify the code and override the parent class methods.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest
from scrapy.linkextractors import LinkExtractor
from scrapy_splash import SplashTextResponse, SplashJsonResponse, SplashResponse
from scrapy.http import HtmlResponse

# Minimal Lua script for the Splash 'execute' endpoint: load the page, wait,
# and return the rendered HTML. (The original post references default_script
# without defining it; this is one plausible definition.)
default_script = """
function main(splash, args)
    assert(splash:go(args.url))
    assert(splash:wait(args.wait))
    return splash:html()
end
"""


class MySpider(CrawlSpider):
    name = 'innda'
    url = ['http://www.zjepb.gov.cn/col/col1201494/index.html']

    def start_requests(self):
        for url in self.url:
            # Splash defaults to the render.html endpoint, which returns
            # the HTML of the page after JavaScript has rendered it.
            yield SplashRequest(url, args={'wait': 10})

    rules = (
        Rule(LinkExtractor(allow=('.*',)), process_request='splash_request', follow=True),
    )

    # This method is used by process_request in the Rule above.
    def splash_request(self, request):
        """
        process_request is a callable, or a string (in which case a method from
        the spider object with that name will be used) which will be called with
        every request extracted by this rule, and must return a request or None
        (to filter out the request).
        :param request:
        :return: SplashRequest
        """
        # _build_request below already produces a SplashRequest, so pass it
        # through; wrap anything else in a SplashRequest, preserving the
        # callback and meta so the rule information is not lost.
        if isinstance(request, SplashRequest):
            return request
        return SplashRequest(url=request.url, callback=request.callback,
                             meta=request.meta, args={'wait': 10})

    # Override the CrawlSpider method
    def _requests_to_follow(self, response):
        """
        Splash can return SplashTextResponse, SplashJsonResponse or
        SplashResponse, in addition to scrapy's default HtmlResponse, so we
        only need to accept these types here. This effectively widens the
        type check in the original source.
        :param response:
        :return:
        """
        # print(type(response))  # <class 'scrapy_splash.response.SplashTextResponse'>
        if not isinstance(response, (SplashTextResponse, SplashJsonResponse,
                                     SplashResponse, HtmlResponse)):
            return
        print('==========================entered _requests_to_follow=========================')
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def _build_request(self, rule, link):
        # Important! When overriding this parent method, be sure to pass
        # meta={'rule': rule, 'link_text': link.text}; see the CrawlSpider
        # source for details.
        # lua_source is only honored by the 'execute' endpoint, so it is set
        # explicitly here (the default render.html endpoint ignores it).
        r = SplashRequest(url=link.url, callback=self._response_downloaded,
                          meta={'rule': rule, 'link_text': link.text},
                          endpoint='execute',
                          args={'wait': 5, 'url': link.url, 'lua_source': default_script})
        r.meta.update(rule=rule, link_text=link.text)
        return r
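With both methods overridden, the spider follows links across the whole site through Splash. It can be started with scrapy crawl innda, or from a small script; a minimal sketch, assuming it lives inside the Scrapy project so the scrapy-splash settings shown earlier are picked up:

# run.py -- a sketch; the import path for MySpider depends on your
# project layout, e.g. from <your_project>.spiders.innda import MySpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(MySpider)
process.start()  # blocks until the crawl finishes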