chromedriver.exe下载地址:https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json
找到对应系统及chrome版本号的驱动下载链接,例如我用的win32系统115.0.5763.x版本的chrome
启动chrome
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# Launch a new Chrome instance with the DevTools remote-debugging port open,
# load a page, wait for an element to become visible, then shut down.
chrome_options = Options()
chrome_options.add_argument("--remote-debugging-port=9222")  # start Chrome listening on port 9222
# Reuse the local user's Chrome profile (USERPROFILE is a Windows environment variable).
user_data_dir = os.path.join(os.environ['USERPROFILE'], r'AppData/Local/Google/Chrome/User Data')
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")
# Alternative: attach to a Chrome that is already running on port 9222.
# chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")

# Alternative: do not wait for the page to finish loading.
# desired_capabilities = DesiredCapabilities.CHROME
# desired_capabilities["pageLoadStrategy"] = "none"
# driver = webdriver.Chrome(executable_path='./chromedriver.exe', desired_capabilities=desired_capabilities, options=chrome_options)

# `options=` replaces the deprecated `chrome_options=` keyword.
# NOTE(review): recent Selenium releases expect the driver path through a
# Service object instead of `executable_path` - confirm against the installed version.
driver = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)
try:
    # Injected into every new document so that navigator.webdriver reads as
    # undefined (hides the automation flag from page scripts).
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
    )
    # Raise if a page has not finished loading after 60 seconds.
    driver.set_page_load_timeout(60)
    driver.get('http://www.baidu.com')
    # Block for up to 20 s; returns as soon as the element is visible.
    WebDriverWait(driver, 20).until(
        EC.visibility_of_element_located((By.XPATH, '//div[@class="expander_content"]/ul/li[3]'))
    )
finally:
    # quit() ends the whole session (browser and driver process), and the
    # finally clause guarantees that even when the wait above times out.
    driver.quit()
获取当前页面cookies
# Flatten the browser's cookie records into a plain name -> value mapping,
# removing any double-quote characters embedded in the values.
page_cookies = driver.get_cookies()
cookies = {
    cookie["name"]: cookie['value'].replace('"', '')
    for cookie in page_cookies
}
为页面设置cookies
# Visit the site first, wipe its existing cookies, then install our own
# and reload so the page picks them up.
driver.get('https://www.amazon.com')
driver.delete_all_cookies()

# Cookies to install (name -> value).
cookies = {
    "session-id": "130-5103042-5499226",
    "session-id-time": "2082787201l",
}
cookie_payloads = [
    {'name': cookie_name, 'value': cookie_value}
    for cookie_name, cookie_value in cookies.items()
]
for payload in cookie_payloads:
    driver.add_cookie(payload)

driver.refresh()  # reload the page
获取浏览器标签页
# Working with browser tabs, which Selenium addresses through window handles.
handles = driver.window_handles # handles of all open tabs
handle = driver.current_window_handle # handle of the current tab
driver.switch_to.window(handle) # switch to the tab identified by the given handle
抓取network数据包(performance日志)
import json
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
# Start Chrome with the "performance" log enabled so that DevTools network
# events can later be read back with driver.get_log('performance').
chrome_options = Options()
# Set the logging preference on the options object instead of passing a
# separate `desired_capabilities` dict, which is a deprecated keyword.
chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
# The original notes also list the un-prefixed key in connection with the error
# "invalid argument: log type 'performance' not found".
# NOTE(review): which key a given chromedriver build expects should be confirmed.
# chrome_options.set_capability('loggingPrefs', {'performance': 'ALL'})
driver = webdriver.Chrome(options=chrome_options)  # launch the browser
driver.get('要访问的页面')  # open the target URL (placeholder string)
# MIME types of static assets (scripts, styles, images, binary blobs) that are
# never the data response being searched for. Defined once instead of being
# rebuilt on every call.
_STATIC_MIME_TYPES = (
    'application/javascript',
    'application/x-javascript',
    'text/css',
    'webp',  # kept for backward compatibility with the original list
    'image/webp',  # the actual MIME type of WebP images
    'image/png',
    'image/gif',
    'image/jpeg',
    'image/x-icon',
    'application/octet-stream',
)


def filter_type(_type: str) -> bool:
    """Tell whether a response with MIME type ``_type`` should be inspected.

    Args:
        _type: MIME type reported for a network response.

    Returns:
        ``False`` when ``_type`` is one of the known static-asset types
        (such responses are skipped), ``True`` for everything else.
    """
    return _type not in _STATIC_MIME_TYPES
import time  # local to this snippet: pause between polls of the performance log

# Poll the performance log until the response for the wanted URL shows up,
# then fetch its body through CDP and save it to data.json.
data = ''
while not data:  # stop as soon as the wanted data has been captured
    for packet in driver.get_log('performance'):
        # Each log entry wraps the DevTools event as a JSON string under 'message'.
        message = json.loads(packet.get('message')).get('message')
        if message.get('method') != 'Network.responseReceived':
            continue  # only response events are of interest
        params = message.get('params')
        response = params.get('response')
        packet_type = response.get('mimeType')  # MIME type of this response
        if not filter_type(_type=packet_type):
            continue  # static asset, skip
        url = response.get('url')
        if url != '想要获取数据的接口url':  # placeholder: URL of the wanted endpoint
            continue
        request_id = params.get('requestId')  # unique identifier of the request
        try:
            # Only the CDP call is guarded; it is the one that can raise here.
            resp = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
        except WebDriverException:
            # Best effort, as in the original: ignore this packet and keep scanning.
            continue
        print(f'type: {packet_type} url: {url}')
        data = resp
        with open('data.json', 'w') as fp:
            json.dump(data, fp)  # persist the captured response
        break
    else:
        # Nothing matched in this batch: wait briefly instead of spinning on
        # get_log in a tight loop.
        time.sleep(0.5)
判断页面是否加载完成
# Check whether the page has finished loading by comparing document.readyState
# against 'complete'; 0 is returned while the page is still loading.
# NOTE(review): the bare `return` suggests this fragment was lifted from inside
# a function; there the `return 0` line is indented under the `if`.
if driver.execute_script('return document.readyState') != 'complete': # page has not finished loading
return 0
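获取标签元素(注:下列 find_element(s)_by_* 系列方法在 Selenium 4.3 起已被移除,新版本请改用 driver.find_element(By.ID, ...) / driver.find_elements(By.XPATH, ...) 的写法)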
find_element_by_id # 返回一个元素
find_element(s)_by_class_name # 根据类名获取元素列表
find_element(s)_by_name # 根据标签的name属性值返回包含标签对象元素的列表
find_element(s)_by_xpath # 返回一个包含元素的列表
find_element(s)_by_link_text # 根据链接文本获取元素列表
find_element(s)_by_partial_link_text # 根据链接包含的文本获取元素列表
find_element(s)_by_tag_name # 根据标签名获取元素列表
find_element(s)_by_css_selector # 根据css选择器来获取元素列表
获取shadow-root里的标签
# First locate the host element, i.e. the tag that owns the #shadow-root (open).
# Uses the generic find_element(By.XPATH, ...) locator, matching the By-based
# style used with WebDriverWait elsewhere in this file.
kat_checkbox = driver.find_element(By.XPATH, '//div[@id="global-modal"]//kat-checkbox')
# Then query inside the host's shadow root from JavaScript to get the div tag.
div = driver.execute_script("return arguments[0].shadowRoot.querySelector('div')", kat_checkbox)
其他操作
current_url = driver.current_url # URL of the current page