环境搭建:
selenium -> 自动化测试模块,非必要不建议使用该模块写爬虫,因为爬取速度太慢
安装模块pip install selenium
下载浏览器对应的驱动版本:
把解压出来的浏览器驱动可执行文件,移动到python解释器的所在文件夹;或者将驱动所在路径添加进环境变量也是可以的
基本使用:
导入模块:from selenium import webdriver
创建浏览器对象:
driver = webdriver.Chrome/Edge()
打开一个网址:
driver.get('网址')
元素定位(4.0之前版本;find_element_by_* 系列方法已在 Selenium 4.3 中移除,新版本请使用下文的“最新定位方法”):
利用xpath
查找指定元素:
driver.find_element_by_xpath('')
driver.find_elements_by_xpath('')
利用id
查找:
driver.find_element_by_id('')
利用class
查找:
driver.find_element_by_class_name('')
利用name
查找:
driver.find_element_by_name('')
利用标签
查找:
driver.find_element_by_tag_name('')
利用css
查找:
driver.find_element_by_css_selector('')
最新定位方法:
需导入模块:
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By
定位单个元素时使用:find_element(By.方法, '属性')
使用.text即可获取元素内容
使用.get_attribute('src')提取标签内的src属性值
使用.send_keys("内容")向标签输入框内填写内容
定位多个元素时使用:find_elements(By.方法, '属性')
By支持的定位方式如下:
ID = "id"
XPATH = "xpath"
LINK_TEXT = "link text"
PARTIAL_LINK_TEXT = "partial link text"
NAME = "name"
TAG_NAME = "tag name"
CLASS_NAME = "class name"
CSS_SELECTOR = "css selector"
鼠标动作链:
导入模块:from selenium.webdriver import ActionChains
鼠标移动事件:
ActionChains(driver).move_to_element(元素的位置).perform()
鼠标左击事件:
ActionChains(driver).click(元素的位置).perform()
鼠标左双击事件:
ActionChains(driver).double_click(元素的位置).perform()
鼠标右击事件:
ActionChains(driver).context_click(元素的位置).perform()
鼠标左击并保持:
ActionChains(driver).click_and_hold(元素的位置).perform()
将元素拖到指定元素并松开:
ActionChains(driver).drag_and_drop(原元素的位置, 新元素的位置).perform()
将鼠标从当前位置移动指定的偏移量(配合click_and_hold可实现把元素拖到指定坐标;也可直接使用drag_and_drop_by_offset(元素的位置, x偏移, y偏移)):
ActionChains(driver).move_by_offset(x坐标, y坐标).perform()
释放鼠标点击状态:
ActionChains(driver).release(xxx).perform()
基础案例:
# Basic example: open lagou.com, search for "python", and print three text
# fields from every entry of the result list.
#
# Uses the Selenium 4 locator API (find_element / find_elements with By);
# the find_element_by_* helpers no longer exist in Selenium 4.3+.
import time

from selenium.webdriver import Edge
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

driver = Edge()
driver.get("https://www.lagou.com/")

# Click the first link inside #changeCityBox (presumably the city-selection
# pop-up shown on first visit -- confirm against the live page).
el = driver.find_element(By.XPATH, '//*[@id="changeCityBox"]/p[1]/a')
el.click()
time.sleep(1)  # fixed pause before touching the search box

# Type the keyword and press Enter in the same call to submit the search.
driver.find_element(By.XPATH, '//*[@id="search_input"]').send_keys("python", Keys.ENTER)

# One <li> per result; locators starting with "./" are relative to that <li>.
li_list = driver.find_elements(By.XPATH, '//*[@id="s_position_list"]/ul/li')
for li in li_list:
    job_firm = li.find_elements(By.XPATH, './div/div[2]/div/a')[0].text
    job_name = li.find_elements(By.TAG_NAME, 'h3')[0].text
    job_money = li.find_elements(By.XPATH, './div/div/div[2]/div/span')[0].text
    print(job_firm, job_name, job_money)
窗口切换:
创建浏览器对象:
driver = webdriver.Chrome/Edge()
切换窗口:
driver.switch_to.window(driver.window_handles[下标])
关闭子窗口:driver.close()
页面操作:
页面前进:driver.forward()
页面后退:driver.back()
处理iframe标签:
先拿到iframe -> 切换视角到iframe -> 拿iframe内的数据
xxx = driver.find_element_by_xpath('')
driver.switch_to.frame(xxx)
切回原页面:driver.switch_to.default_content()
xxx = driver.find_element_by_xpath('')
窗口切换案例:
# Window-switching example: search on zhipin.com, open the first result
# (which opens in a new window), read text from it, close it, then read two
# more fields from the original result list.
#
# Uses the Selenium 4 locator API (find_element with By); the
# find_element_by_* helpers no longer exist in Selenium 4.3+.
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

driver = Edge()
driver.get('https://www.zhipin.com/')

# Type the keyword and press Enter in the same call to submit the search.
driver.find_element(
    By.XPATH, '//*[@id="wrap"]/div[3]/div/div[1]/div[1]/form/div[2]/p/input'
).send_keys('python', Keys.ENTER)

# Clicking the first result opens a new window; the driver's focus stays on
# the old one until we switch explicitly.
driver.find_element(By.XPATH, '//*[@id="main"]/div/div[3]/ul/li[1]').click()
driver.switch_to.window(driver.window_handles[-1])  # newest window is last
cont = driver.find_element(
    By.XPATH, '//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div'
).text

# close() only closes the current (child) window; switch back afterwards,
# because the driver does not refocus on its own.
driver.close()
driver.switch_to.window(driver.window_handles[0])

xinzi = driver.find_element(
    By.XPATH, '//*[@id="main"]/div/div[3]/ul/li[1]/div/div[1]/div[1]/div/div[2]/span'
).text
gongsi = driver.find_element(
    By.XPATH, '//*[@id="main"]/div/div[3]/ul/li[1]/div/div[1]/div[2]/div/h3/a'
).text
print("薪资:{0},公司:{1}\n职业需求:{2}".format(xinzi, gongsi, cont))

# quit() ends the whole session and stops the driver process, whereas
# close() would only close the remaining window.
driver.quit()
无头浏览器:
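无头浏览器,顾名思义就是不显示浏览器窗口,让浏览器执行过程在后台自动完成
配置无头浏览器参数(需先导入:from selenium.webdriver.chrome.options import Options;Edge 对应 selenium.webdriver.edge.options):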
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
创建浏览器对象:
driver = webdriver.Chrome/Edge(options=xxx)
# Headless Edge.
#
# The command-line switches are taken from EDGE["ms:edgeOptions"]["args"] and
# passed through EdgeOptions, matching the options-based Chrome snippet,
# instead of the legacy `capabilities=` constructor keyword.
from selenium import webdriver

EDGE = {
    "browserName": "MicrosoftEdge",
    "version": "95.0.1020.40",
    "platform": "WINDOWS",
    "ms:edgeOptions": {
        'extensions': [],
        'args': [
            '--headless',
            '--disable-gpu',
        ],
    },
}

options = webdriver.EdgeOptions()
# Only the switches are forwarded. "version" / "platform" are legacy
# capability names (W3C uses browserVersion / platformName) and are kept in
# the dict for reference only.
for arg in EDGE["ms:edgeOptions"]["args"]:
    options.add_argument(arg)

driver = webdriver.Edge(options=options)
driver.get('xxx')
# Headless Chrome: run the browser without a visible window.
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
# Must be two ASCII hyphens. The original had an en dash (U+2013) as the
# second character, so Chrome never saw a valid --disable-gpu switch.
options.add_argument('--disable-gpu')

driver = webdriver.Chrome(options=options)
driver.get('xxx')
处理验证码:
注册超级鹰验证码识别平台,调用其API接口识别验证码
提取验证码图片:元素.screenshot_as_png
# Captcha example: screenshot the captcha image on the login form, send it to
# the Chaojiying recognition client, then fill in and submit the form.
#
# Uses the Selenium 4 locator API (find_element with By); the
# find_element_by_* helpers no longer exist in Selenium 4.3+.
import time

import chaojiying
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By

# All fields live under the same <form>; the locators below extend this path.
FORM_XPATH = '/html/body/div[3]/div/div[3]/div[1]/form'

driver = Edge()
driver.get('http://www.chaojiying.com/user/login/')
time.sleep(2)  # fixed pause before locating the captcha image

# Bound to its own name so the `chaojiying` module is not shadowed.
client = chaojiying.Chaojiying_Client('账号', '密码', '产品ID')

# screenshot_as_png yields the element's screenshot as PNG bytes.
img = driver.find_element(By.XPATH, FORM_XPATH + '/div/img').screenshot_as_png
# The second argument is passed through to PostPic unchanged; the recognised
# text is read from the "pic_str" key of the result.
yzm = client.PostPic(img, 1902)["pic_str"]

# send_keys returns None, so its result is not stored.
driver.find_element(By.XPATH, FORM_XPATH + '/p[1]/input').send_keys('账号')
driver.find_element(By.XPATH, FORM_XPATH + '/p[2]/input').send_keys('密码')
driver.find_element(By.XPATH, FORM_XPATH + '/p[3]/input').send_keys(yzm)
driver.find_element(By.XPATH, FORM_XPATH + '/p[4]/input').click()
其他操作:
设置ua:
opt.add_argument(f'--user-agent={user_agent}')
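设置请求头(注意:Chrome并没有--headers这个启动参数,下面第一行写法不会生效;Chromium内核的浏览器可改用其后的CDP命令):
opt.add_argument(f'--headers={headers}')
driver.execute_cdp_cmd("Network.enable", {})
driver.execute_cdp_cmd("Network.setExtraHTTPHeaders", {"headers": headers})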
设置代理:
opt.add_argument("--proxy-server=http://127.0.0.1:8080")
刷新页面:
driver.refresh()
获取cookie:
driver.get_cookies()
隐式等待:
driver.implicitly_wait(10)
跳转到页面的某个位置:
driver.execute_script("window.scrollBy(0,1500)")
拖动滚动条,实现加载页面:
# Scroll the page down in five stages (10%, 30%, 50%, 70%, 90% of the
# document height), pausing half a second before each jump.
for tenths in (1, 3, 5, 7, 9):
    time.sleep(0.5)
    fraction = tenths / 10
    scroll_js = (
        'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f'
        % fraction
    )
    driver.execute_script(scroll_js)
# Register a script via the DevTools command
# Page.addScriptToEvaluateOnNewDocument. The script redefines
# navigator.webdriver so that reading it returns false.
hide_webdriver_js = """
Object.defineProperty(navigator, 'webdriver', {
get: () => false
})
"""
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": hide_webdriver_js},
)
from selenium. webdriver import ActionChains as ac
def get_tracks(self, distance):
    """Return a list of integer x-offsets whose sum is ``round(distance)``.

    The motion is simulated in fixed 0.3 time-unit steps: constant
    acceleration of 2 until four fifths of ``distance`` has been covered,
    then constant deceleration of 3 for the remainder.

    Each entry is the difference between consecutive rounded positions, and
    the final position is clamped to the target. Rounding every step on its
    own (as before) let the rounding errors accumulate and the last step
    overshoot, so the offsets did not add up to ``distance``.

    Args:
        self: Unused; kept so existing call sites keep working.
        distance: Total distance to cover. Values <= 0 yield an empty list.

    Returns:
        list[int]: Non-negative per-step offsets.
    """
    step_time = 0.3
    velocity = 0
    travelled = 0   # exact (float) position reached so far
    emitted = 0     # integer position already handed out via `track`
    target = round(distance)
    switch_point = distance * 4 / 5
    track = []
    while travelled < distance:
        # Speed up for the first 4/5 of the way, brake afterwards.
        accel = 2 if travelled < switch_point else -3
        travelled += velocity * step_time + 0.5 * accel * (step_time ** 2)
        velocity += accel * step_time
        # Clamp so the final step lands exactly on the target.
        position = min(round(travelled), target)
        track.append(position - emitted)
        emitted = position
    return track
# Drag the located element horizontally, one small step at a time, using the
# offsets produced by get_tracks.
from selenium.webdriver.common.by import By

ele = driver.find_element(By.XPATH, '')  # locator left blank in the notes
ac(driver).click_and_hold(ele).perform()

# get_tracks is declared as (self, distance) and never reads `self`, so pass
# None for it; calling get_tracks(xxx) would raise TypeError.
tracks = get_tracks(None, xxx)
for track in tracks:
    # move_by_offset moves relative to the current mouse position;
    # ActionChains has no move_to_offset method.
    ac(driver).move_by_offset(track, 0).perform()

time.sleep(1)  # pause before releasing the mouse button
ac(driver).release(ele).perform()