第一步:在spider类的__init__方法中初始化一个浏览器驱动(webdriver)
如何配置selenium以及浏览器驱动,请参见另一篇博客:《selenium3.0环境搭建》(与市场搏斗-CSDN博客)。
第二步:在spider类的close方法中关闭浏览器驱动
import os
import time
import scrapy
from selenium import webdriver
from myscrapy.network_log_option import *
class Selensprider(scrapy.Spider):
    """Spider that keeps a Selenium-driven Chrome browser for the whole crawl.

    On construction it launches Chrome with remote debugging enabled and
    attaches a ``webdriver.Chrome`` instance to it, exposed as
    ``self.chrome``. The driver is shut down in :meth:`close`.
    """

    name = 'selenspider'
    # NOTE(review): Scrapy's offsite filtering reads ``allowed_domains`` (a
    # list), so this ``allow_domains`` attribute looks like it has no effect.
    # Left unchanged because enabling it would start filtering requests for
    # hosts other than fw.zjjy.xyz (start_urls uses www.zjjy.xyz) -- confirm.
    allow_domains = 'fw.zjjy.xyz'
    start_urls = ['https://www.zjjy.xyz/web/index']

    # Chrome installation directory and the profile directory used for the
    # automated browser window.
    chrome_dir = r'C:\Users\Lenovo\AppData\Local\Google\Chrome\Application'
    chrome_user_data_dir = r'C:\selenum\AutomationProfile'
    # One port shared by the launched Chrome and the driver's debuggerAddress,
    # so the two can never drift apart.
    chrome_debug_port = 9222

    def __init__(self, *args, **kwargs):
        """Start Chrome in remote-debugging mode and attach a webdriver to it.

        Positional and keyword arguments are forwarded unchanged to
        ``scrapy.Spider.__init__``.
        """
        import subprocess

        # Launch Chrome by absolute path. A separate ``os.system('cd ...')``
        # runs in its own shell, so the directory change would not carry over
        # to the next command; Popen also returns without waiting for the
        # browser to exit.
        chrome_exe = os.path.join(self.chrome_dir, 'chrome.exe')
        self.chrome_process = subprocess.Popen([
            chrome_exe,
            '--remote-debugging-port={}'.format(self.chrome_debug_port),
            '--user-data-dir={}'.format(self.chrome_user_data_dir),
        ])
        # Give the browser a moment to open its debugging port.
        time.sleep(0.5)

        # Build the driver options and point them at the running browser.
        options = get_log_options()
        options.add_experimental_option(
            'debuggerAddress',
            'localhost:{}'.format(self.chrome_debug_port),
        )
        caps = get_caps()
        self.chrome = webdriver.Chrome(options=options, desired_capabilities=caps)
        super().__init__(*args, **kwargs)

    def parse(self, response, **kwargs):
        """Handle a downloaded response; intentionally left empty here."""
        pass

    # Shut the browser driver down once the whole crawl has finished.
    def close(self, spider):
        """Quit the Selenium driver held in ``self.chrome``."""
        self.chrome.quit()
第三步:在中间件中拦截request和response,并对浏览器进行操作
# selenium处理中间件
class SeleniumMiddleware(object):
    """Downloader middleware that reloads each URL in the spider's browser."""

    def process_response(self, request, response, spider):
        """Return an ``HtmlResponse`` built from the browser-rendered page.

        The incoming ``response`` is not used: the request URL is opened in
        ``spider.chrome`` and the resulting page source becomes the body of
        the response that is returned.
        """
        driver = spider.chrome
        target_url = request.url
        driver.get(target_url)

        # Browser interactions (full screen, scrolling to trigger lazy
        # loading, ...) belong here.
        driver.maximize_window()

        # Block for 5 seconds so that ajax requests can finish.
        time.sleep(5)

        # Network log entries for the XHR responses seen by the browser.
        # They are collected here but not used further by this method.
        xhr_logs = get_xhr_logs(driver)

        # Page markup after dynamic loading.
        rendered_body = driver.page_source.encode('utf-8')
        return scrapy.http.HtmlResponse(
            url=target_url,
            body=rendered_body,
            encoding='utf-8',
            request=request,
        )
util工具(即上文spider中导入的network_log_option模块,提供get_xhr_logs、get_log_options、get_caps):
import json
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
def get_xhr_logs(chrome):
    """Collect the ``Network.responseReceived`` events of type ``XHR``.

    Every log type listed in ``chrome.log_types`` is read with
    ``chrome.get_log``. Each row's ``'message'`` is parsed as JSON and the
    inner ``'message'`` object is kept when its ``method`` is
    ``Network.responseReceived`` and ``params.type`` is ``"XHR"``; static
    resources such as js/css are thereby dropped.

    Rows whose message is not JSON, or lacks the expected keys, are skipped.

    Args:
        chrome: object exposing ``log_types`` and ``get_log(log_type)``.

    Returns:
        list: the matching event dicts, in the order they were read.
    """
    xhr_events = []
    for log_type in chrome.log_types:
        for entry in chrome.get_log(log_type):
            raw_message = entry['message']
            try:
                event = json.loads(raw_message)['message']
                is_xhr_response = (
                    event['method'] == 'Network.responseReceived'
                    and event['params']['type'] == "XHR"
                )
            except (ValueError, KeyError, TypeError):
                # Not a JSON network event (e.g. a plain-text log line) or a
                # differently shaped one: skip it. Only these errors are
                # caught so KeyboardInterrupt/SystemExit still propagate.
                continue
            if is_xhr_response:
                xhr_events.append(event)
    return xhr_events
def get_log_options():
    """Build the ``ChromeOptions`` used for network-log capture.

    Adds a fixed set of command-line arguments and a custom user agent,
    sets the ``w3c`` experimental option to ``False`` and sets
    ``perfLoggingPrefs`` with network events on and page events off.

    Returns:
        webdriver.ChromeOptions: the configured options object.
    """
    option = webdriver.ChromeOptions()
    arguments = (
        '--no-sandbox',
        '--headless',
        "--disable-extensions",
        "--allow-running-insecure-content",
        "--ignore-certificate-errors",
        "--disable-single-click-autofill",
        # The original flag ended in "[8]", a footnote marker copied along
        # with the switch name; it is not part of the switch.
        "--disable-autofill-keyboard-accessory-view",
        "--disable-full-form-autofill-ios",
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:55.0) Gecko/20100101 Firefox/55.0',
    )
    for argument in arguments:
        option.add_argument(argument)

    option.add_experimental_option('w3c', False)
    option.add_experimental_option('perfLoggingPrefs', {
        'enableNetwork': True,
        'enablePage': False,
    })
    return option
def get_caps():
    """Return Chrome desired capabilities with logging preferences set.

    ``loggingPrefs`` requests the ``browser`` and ``performance`` logs at
    level ``ALL``; ``perfLoggingPrefs`` turns network events on and page and
    timeline events off.

    Returns:
        dict: a fresh capabilities dict on every call.
    """
    # Work on a copy: DesiredCapabilities.CHROME is a class-level dict shared
    # by every user of selenium in the process, so mutating it in place would
    # leak these settings into unrelated drivers.
    caps = DesiredCapabilities.CHROME.copy()
    caps['loggingPrefs'] = {
        'browser': 'ALL',
        'performance': 'ALL',
    }
    caps['perfLoggingPrefs'] = {
        'enableNetwork': True,
        'enablePage': False,
        'enableTimeline': False
    }
    return caps
第四步:在settings.py中启用此中间件。
# Enable the Selenium downloader middleware, registered by dotted path with
# order value 543.
# NOTE(review): the package here is ``asong`` while the spider imports its
# helpers from ``myscrapy`` -- confirm the path matches the real project.
DOWNLOADER_MIDDLEWARES = {'asong.middlewares.SeleniumMiddleware': 543}