在 middlewares.py 文件中添加:
from selenium import webdriver
import time
from scrapy.http import HtmlResponse
class JSPageMiddleware(object):
    """Downloader middleware that renders pages with Selenium so that
    JavaScript-generated content reaches the spider.

    Requires the spider to expose a ``browser`` attribute (a Selenium
    WebDriver) created once in the spider's ``__init__``, so a single
    browser instance is reused across all requests.
    """

    def process_request(self, request, spider):
        """Fetch ``request.url`` through the spider's Selenium browser.

        Returning an ``HtmlResponse`` short-circuits Scrapy's download
        chain: the response is handed straight to the spider and the URL
        is never downloaded again by the default downloader.  Applies
        only to the 'jobbole' spider; any other spider falls through to
        the normal downloader (implicit ``return None``).
        """
        if spider.name == 'jobbole':  # only hijack this spider's requests
            # BUG FIX: the browser lives on the spider, not on the
            # middleware -- the original ``self.browser`` would raise
            # AttributeError on the first request.
            spider.browser.get(request.url)
            time.sleep(5)  # crude wait for the page's JS to finish rendering
            print('访问页面{}'.format(request.url))
            # Hand the rendered page back; the downloader is skipped.
            return HtmlResponse(url=spider.browser.current_url,
                                body=spider.browser.page_source,
                                encoding='utf-8',
                                request=request)
在 settings.py 文件中打开:
# Register the Selenium middleware with Scrapy's downloader chain.
# Priority 544 places it after most built-in downloader middlewares;
# lower numbers run closer to the engine.
DOWNLOADER_MIDDLEWARES = {
#'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
# 'scrapydownloadertest.middlewares.RandomUserAgentMiddleware': 543,
'scrapydownloadertest.middlewares.JSPageMiddleware': 544,
}
在spider中配置以下代码,为了让爬虫结束的时候,关闭Chrome:
import scrapy
from selenium import webdriver
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals # scrapy的信号
class HttpbinSpider(scrapy.Spider):
    """Spider that keeps one Chrome instance alive for its entire run.

    The browser is created once in ``__init__`` (rather than once per
    request) and is shut down through the ``spider_closed`` signal, so
    Chrome is closed cleanly when the crawl finishes.
    """
    name = 'httpbin'
    allowed_domains = ['httpbin.org']
    start_urls = ['http://httpbin.org/']

    def __init__(self):
        # Single shared browser, reused by the downloader middleware for
        # every request this spider makes.
        self.browser = webdriver.Chrome()
        super(HttpbinSpider, self).__init__()
        # Hook the shutdown handler onto Scrapy's spider_closed signal.
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        # Signal handler: quit Chrome when the spider exits.
        print('关闭selenium打开的网页')
        self.browser.quit()

    def parse(self, response):
        # Dump the (Selenium-rendered) page body.
        print(response.text)
通过在 downloader middleware 中劫持 spider 发出的 request 并交给 selenium 加载,实现 scrapy 对接 selenium。
设置selenium 不加载图片
from selenium import webdriver

# Configure Chrome to skip downloading images, which noticeably speeds
# up page loads for text-scraping jobs.  The value 2 means "block" for
# Chrome's managed content settings.
chrome_opt = webdriver.ChromeOptions()
prefs = {'profile.managed_default_content_settings.images': 2}
chrome_opt.add_experimental_option('prefs', prefs)
# FIX: the ``chrome_options`` keyword was deprecated in Selenium 3.8 and
# removed in Selenium 4; ``options`` is the supported parameter and has
# been available since 3.8, so this stays backward-compatible.
browser = webdriver.Chrome(options=chrome_opt)
browser.get('https://www.oschina.net/blog')