做爬虫的时候,有时需要的数据在动态加载的资源当中。通常的做法是分析并拼接 url,然后获取数据;但如果拼接所需的参数是加密生成的,而又无法模拟算法生成正确的参数,就会很头疼。通过访问 performance 日志,可以获得加载网站时的资源请求信息,利用这一特点即可直接获取 url 和数据。
import time
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import json
class Mychrome:
    """Chrome driver wrapper with DevTools performance logging enabled.

    Lets a scraper read the network request/response URLs recorded while a
    page loads — useful when request URLs carry parameters you cannot
    reproduce (e.g. encrypted/signed query strings).
    """

    def __init__(self):
        self.options = webdriver.ChromeOptions()
        # Sites allowed to run plugins (Flash); empty by default.
        self.flash_urls = []
        self.set_browser()

    def set_browser(self):
        """Configure Chrome options and start the driver with the
        'performance' log enabled."""
        prefs = {
            # 1 = allow images to load (2 would block them).
            "profile.managed_default_content_settings.images": 1,
        }
        # An empty list is falsy, so this covers both None and [].
        if self.flash_urls:
            prefs['profile.managed_plugins_allowed_for_urls'] = self.flash_urls
        self.options.add_experimental_option('prefs', prefs)
        # Merge the options into a capabilities dict and turn on the
        # DevTools performance log so get_log('performance') has data.
        desired_capabilities = self.options.to_capabilities()
        desired_capabilities['loggingPrefs'] = {
            "performance": "ALL"
        }
        self.driver = webdriver.Chrome(
            desired_capabilities=desired_capabilities
        )

    def gethtml(self):
        """Load a page and print the network request/response entries
        found in the performance log."""
        url = 'http://www.baidu.com'
        self.driver.get(url)
        # BUG FIX: get_log() drains the buffer — each call returns only
        # entries accumulated since the previous call.  The original code
        # called it three times, so the loop iterated an almost empty
        # list.  Fetch once and reuse.
        entries = self.driver.get_log('performance')
        print(entries)
        print('-' * 60)
        for entry in entries:
            message = json.loads(entry.get('message')).get('message')
            # BUG FIX: request/response live under the 'params' key of the
            # DevTools message, not at its top level — the original
            # message.get('request') always returned None.
            params = message.get('params', {})
            print(params.get('request'))   # request info (includes failed requests)
            print(params.get('response'))  # response info (successful requests)
if __name__ == '__main__':
    # Original code bound gethtml()'s None return to `browser` and never
    # shut the driver down, leaking the ChromeDriver/Chrome processes.
    browser = Mychrome()
    try:
        browser.gethtml()
    finally:
        browser.driver.quit()