在Vue、React横行的时代,前后端分离使用Ajax做交互的比比皆是。Ajax请求往往带有各种加密验证,解密起来费力气。最好是selenium直接拦截ajax的结果。
1.安装依赖
pip install browsermob-proxy
2.下载 BrowserMob Proxy 代理程序(解压后包含 bin/browsermob-proxy.bat,运行需要 Java 环境)
3.解压并放到项目目录中。
4.测试用例
import json
import time

from browsermobproxy import Server
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
if __name__ == '__main__':
    # Start the BrowserMob proxy server (path points at the unpacked
    # distribution's launcher script).
    BMPserver = Server(r'D:\workspace\scrapyproject\testscrapy\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat')
    BMPserver.start()
    BMPproxy = BMPserver.create_proxy()
    try:
        # Initialize the driver; webdriver-manager downloads a matching chromedriver.
        service = ChromeService(executable_path=ChromeDriverManager().install())
        # Route browser traffic through the proxy; ignore its self-signed MITM cert.
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--proxy-server={}'.format(BMPproxy.proxy))
        # Get a browser instance.
        driver = webdriver.Chrome(service=service, options=options)
        try:
            # Start recording a HAR of all traffic. captureContent stores
            # response bodies; captureBinaryContent also keeps binary payloads.
            # (The original passed 'captureContent' twice — a duplicate dict
            # key, so the second entry silently replaced the first.)
            BMPproxy.new_har("lagou", options={'captureContent': True, 'captureBinaryContent': True})
            # Visit the page.
            driver.get("https://dynamic2.scrape.center/")
            # Crude wait for the page's Ajax calls to finish; prefer
            # WebDriverWait on a concrete element in real code.
            time.sleep(10)
            # The HAR now holds every request the browser made via the proxy.
            result = BMPproxy.har
            for entry in result['log']['entries']:
                entry_url = entry['request']['url']
                # Filter on entry_url to locate the Ajax request you want.
        finally:
            # Original never quit the browser — leaked a Chrome process per run.
            driver.quit()
    finally:
        # Original never stopped the proxy — leaked the BrowserMob java process.
        BMPserver.stop()
5.做成scrapy中间件
class SeleniumMiddleware(object):
    """Downloader middleware that renders ``request.url`` in Chrome behind a
    BrowserMob proxy and returns the captured HAR (i.e. all Ajax traffic) as
    the response body, JSON-encoded."""

    def process_response(self, request, response, spider):
        """Replace the downloaded response with the HAR recorded while Chrome
        loads ``request.url``.

        NOTE(review): starting the proxy server and a fresh Chrome for every
        single response is very expensive — consider creating them once in
        ``spider_opened`` and tearing down in ``spider_closed``.
        """
        # Start the BrowserMob proxy server.
        BMPserver = Server(r'D:\workspace\scrapyproject\testscrapy\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat')
        BMPserver.start()
        BMPproxy = BMPserver.create_proxy()
        try:
            # Initialize the driver.
            service = ChromeService(executable_path=ChromeDriverManager().install())
            # Route browser traffic through the proxy; accept its MITM cert.
            options = webdriver.ChromeOptions()
            options.add_argument('--ignore-certificate-errors')
            options.add_argument('--proxy-server={}'.format(BMPproxy.proxy))
            # Headless mode:
            # options.add_argument('--headless')
            driver = webdriver.Chrome(service=service, options=options)
            try:
                # Record a HAR. captureContent stores response bodies;
                # captureBinaryContent also keeps binary payloads. (The
                # original repeated 'captureContent' — a duplicate dict key,
                # so one of the two settings was silently lost.)
                BMPproxy.new_har("lagou", options={'captureContent': True, 'captureBinaryContent': True})
                # Visit the page.
                driver.get(request.url)
                # Crude wait for Ajax to finish loading.
                time.sleep(5)
                # Collect the recorded traffic.
                result = BMPproxy.har
                for entry in result['log']['entries']:
                    entry_url = entry['request']['url']
                    # Filter on entry_url to locate the Ajax request you want.
            finally:
                # Original only quit the driver on the happy path; an exception
                # above would have leaked the Chrome process.
                driver.quit()
        finally:
            # Original never stopped the proxy server (resource leak).
            BMPserver.stop()
        # Hand the full HAR to the spider as a JSON body.
        return HtmlResponse(url=request.url, body=json.dumps(result).encode('utf-8'), encoding='utf-8', request=request)
6.开启中间件
# Enable the Selenium downloader middleware (priority 543).
DOWNLOADER_MIDDLEWARES = {
    'testscrapy.middlewares.SeleniumMiddleware': 543,
}