安装
pip install selenium
ChromeDriver下载链接
点击跳转/ChromeDriver
selenium 文档
点击打开selenium文档
声明浏览器对象
from selenium import webdriver

# Filesystem location of the ChromeDriver binary handed to Chrome below.
driver_path = '/Users/apple/soft/chromedriver'

# One constructor per browser type. Every call rebinds `browser`, so after
# the last line the name refers to the Safari driver.
browser = webdriver.Chrome(executable_path=driver_path)
browser = webdriver.Firefox()
browser = webdriver.Edge()
browser = webdriver.PhantomJS()
browser = webdriver.Safari()
访问页面
from selenium import webdriver

# Open Chrome, load the Taobao home page, print its HTML and close the window.
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
html = browser.page_source
print(html)
browser.close()
查找元素
单个元素
from selenium import webdriver

# Locate the same element (id="q") on the Taobao home page in three
# different ways: by id, by CSS selector and by XPath.
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
# The name must match the one printed below; the original assigned
# `input_frst`, which made the print raise NameError.
input_first = browser.find_element_by_id('q')
input_second = browser.find_element_by_css_selector('#q')
input_third = browser.find_element_by_xpath('//*[@id="q"]')
print(input_first, input_second, input_third)
browser.close()
from selenium import webdriver

# Look up the element whose name attribute is "wd" on the Baidu home page
# and print the value of its "maxlength" attribute.
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
wd = browser.find_element_by_name('wd')
max_length = wd.get_attribute('maxlength')
print(max_length)
from selenium import webdriver

# Fetch an <img> element from the Baidu home page by tag name and print
# the value of its "class" attribute.
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
ele = browser.find_element_by_tag_name('img')
css_class = ele.get_attribute('class')
print(css_class)
其他查找单个元素的方法:
find_element_by_name:根据 name 属性的值来查找元素(常用于表单)
find_element_by_xpath:根据 XPath 语法获取元素(只能定位元素对象,不能直接获取文本和属性)
find_element_by_link_text:根据链接的完整文本查找元素
find_element_by_partial_link_text:根据链接的部分文本查找元素
find_element_by_tag_name:根据标签名查找元素
find_element_by_class_name:根据类名查找元素
find_element_by_css_selector:根据 CSS 选择器选择元素
from selenium import webdriver
from selenium.webdriver.common.by import By

# Generic lookup form: the locator strategy (By.ID) and the value to match
# are both handed to find_element.
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
input_first = browser.find_element(by=By.ID, value='q')
browser.close()
多个元素
from selenium import webdriver

# Collect every element matching the CSS selector and print the result.
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
selector = '.service-bd li'
lis = browser.find_elements_by_css_selector(selector)
print(lis)
browser.close()
from selenium import webdriver
from selenium.webdriver.common.by import By

# Generic form of the multi-element lookup: find_elements (plural) takes the
# locator strategy and the value to match.
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
# The original called find_element (singular), which does not match this
# section's purpose of fetching every '.service-bd li' element.
lis = browser.find_elements(By.CSS_SELECTOR, '.service-bd li')
print(lis)
browser.close()
其他查找多个元素的方法:find_elements_by_name、find_elements_by_xpath、find_elements_by_link_text、find_elements_by_partial_link_text、find_elements_by_tag_name、find_elements_by_class_name、find_elements_by_css_selector
元素交互操作
import time

from selenium import webdriver

# Type into the Taobao search box, clear it, type again and submit.
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
# Named `search_box` rather than `input` so the builtin input() is not shadowed.
search_box = browser.find_element_by_id('q')
search_box.send_keys('iPhone')
# Pause for a second before the first query is replaced.
time.sleep(1)
search_box.clear()
search_box.send_keys('iPad')
button = browser.find_element_by_class_name('btn-search')
button.click()
选择框select
from selenium import webdriver
from selenium.webdriver.support.ui import Select

# Pick an option of a drop-down on the Tieba advanced-search page.
driver = webdriver.Chrome()
driver.get('http://tieba.baidu.com/f/search/adv?red_tag=y2157203949')
sel = driver.find_element_by_name('sm')
# Wrap the located element in Select to use its option-picking helpers.
# The original bound `selectTaq` but then called `SelectTag`, a NameError;
# one consistent name is used here.
selectTag = Select(sel)
selectTag.select_by_visible_text('按时间顺序')
交互动作
from selenium import webdriver
from selenium.webdriver import ActionChains

# Drag the '#draggable' element onto '#droppable' inside the demo page's
# result iframe.
browser = webdriver.Chrome()
# The assignment and the get() call were fused on one line (a SyntaxError),
# and the query parameter was misspelled 'flename'.
url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
browser.get(url)
# Both elements are looked up after switching into the 'iframeResult' frame.
browser.switch_to.frame('iframeResult')
source = browser.find_element_by_css_selector('#draggable')
target = browser.find_element_by_css_selector('#droppable')
actions = ActionChains(browser)
actions.drag_and_drop(source, target)
# Queued actions only run when perform() is called.
actions.perform()
from selenium import webdriver
from selenium.webdriver import ActionChains

# Build one action chain: move to the input, type 'python', move to the
# submit button and click it.
driver = webdriver.Chrome()
driver.get('http://tieba.baidu.com/f/search/adv?red_tag=y2157203949')
# NOTE(review): 'kw' and 'su' look like the ids of the Baidu home-page search
# box and button; confirm that they exist on this Tieba URL.
inputTag = driver.find_element_by_id('kw')
submitTag = driver.find_element_by_id('su')
actions = ActionChains(driver)
(
    actions
    .move_to_element(inputTag)
    .send_keys_to_element(inputTag, 'python')
    .move_to_element(submitTag)
    .click(submitTag)
    .perform()
)
其他常用动作:click_and_hold(element):点击但不松开鼠标;context_click(element):右键点击;double_click(element):双击。更多方法请参考 Selenium 官方文档中的 ActionChains 部分
执行JavaScript
from selenium import webdriver

# Run two JavaScript snippets in the page: scroll to the bottom, then show
# an alert.
browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
scroll_to_bottom = 'window.scrollTo(0, document.body.scrollHeight)'
show_alert = 'alert("To Bottom")'
browser.execute_script(scroll_to_bottom)
browser.execute_script(show_alert)
获取元素信息
from selenium import webdriver
from selenium.webdriver import ActionChains

# Locate the element with id 'zh-top-link-logo', print it, then print the
# value of its "class" attribute.
browser = webdriver.Chrome()
url = 'https://www.zhihu.com/explore'
browser.get(url)
logo = browser.find_element_by_id('zh-top-link-logo')
print(logo)
logo_class = logo.get_attribute('class')
print(logo_class)
from selenium import webdriver

# Print the text of the element with class 'zh-top-add-question'.
browser = webdriver.Chrome()
url = 'https://www.zhihu.com/explore'
browser.get(url)
# Named `add_question` rather than `input` so the builtin input() is not shadowed.
add_question = browser.find_element_by_class_name('zh-top-add-question')
print(add_question.text)
from selenium import webdriver

# Print the id, location, tag_name and size properties of one element.
browser = webdriver.Chrome()
url = 'https://www.zhihu.com/explore'
browser.get(url)
# Named `add_question` rather than `input` so the builtin input() is not shadowed.
add_question = browser.find_element_by_class_name('zh-top-add-question')
print(add_question.id)
print(add_question.location)
print(add_question.tag_name)
print(add_question.size)
from selenium import webdriver

# Load the page and save a screenshot to ./test.png.
browser = webdriver.Chrome()
url = 'https://www.zhihu.com/explore'
browser.get(url)
screenshot_path = './test.png'
browser.save_screenshot(screenshot_path)
iframe
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

# Switch into the 'iframeResult' frame, look elements up there, then switch
# back to the parent frame and look up the logo again.
browser = webdriver.Chrome()
# The query string was garbled as 'flename-jqueryui...'; it is 'filename=...'.
url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
browser.get(url)
browser.switch_to.frame('iframeResult')
# Restored method name (was 'find_element_by_css selector', a SyntaxError)
# and selector (was '#draqqable').
source = browser.find_element_by_css_selector('#draggable')
print(source)
# While the frame is selected, a failed lookup of the 'logo' class is
# reported instead of aborting the script.
try:
    logo = browser.find_element_by_class_name('logo')
except NoSuchElementException:
    # The closing quote of this literal was missing in the original.
    print('NO LOGO')
# Same lookup again after switching back to the parent frame.
browser.switch_to.parent_frame()
logo = browser.find_element_by_class_name('logo')
print(logo)
print(logo.text)
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Log in through a form hosted in the 'popup_login_frame' iframe, then
# return to the top-level document and click a link there.

# Raw string so the backslash in the Windows path is never read as an
# escape sequence ('\c' is an invalid escape in a normal literal).
chrome_path = r'D:\chrom_add_in/chromedriver'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.get('http://www.4399.com/')
driver.find_element_by_id('login_tologin').click()
# The form fields below are looked up after switching into the popup frame.
driver.switch_to.frame('popup_login_frame')
# NOTE(review): the account name and password are hard-coded in the script;
# substitute a test account and avoid committing real credentials.
driver.find_element_by_id('username').send_keys('15013518752')
driver.find_element_by_id('j-password').send_keys('python123-')
driver.find_element_by_id('login_autoLogin').click()
driver.find_element_by_class_name('ptlogin_btn').click()
# Return to the top-level document before touching elements outside the frame.
driver.switch_to.default_content()
driver.find_element_by_xpath('//div[@id="func"]/a[3]').click()
页面等待
现在的网页越来越多采用了 Ajax 技术,这样程序便不能确定何时某个元素完全加载出来了。如果页面实际加载时间过长,导致某个 DOM 元素还没有出现,而代码却直接去查找并使用这个元素,就会抛出 NoSuchElementException 异常。为了解决这个问题,Selenium 提供了两种等待方式:一种是隐式等待,一种是显式等待。 隐式等待:使用隐式等待执行测试时,如果 WebDriver 没有在 DOM 中找到元素,将继续等待,超出设定时间后才抛出找不到元素的异常。换句话说,当要查找的元素没有立即出现时,隐式等待会等待一段时间再查找 DOM,默认的等待时间是 0
from selenium import webdriver

# Implicit wait: set a 10-second implicit wait before loading the page and
# looking the element up.
browser = webdriver.Chrome()
browser.implicitly_wait(10)
# Host restored to 'www.zhihu.com' (was garbled as 'www.zhihukcom').
browser.get('https://www.zhihu.com/explore')
# Method name restored (was 'find element_by_class_name', a SyntaxError);
# the variable is no longer called `input`, which shadowed the builtin.
add_question = browser.find_element_by_class_name('zu-top-add-question')
print(add_question)
显式等待:显式等待是指某个条件成立后才执行获取元素的操作。也可以在等待的时候指定一个最大的时间,如果超过这个时间条件仍未成立,就抛出一个异常。显式等待应该使用 selenium.webdriver.support.expected_conditions 中的期望条件和 selenium.webdriver.support.ui.WebDriverWait 来配合完成
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Explicit wait: wait up to 10 seconds for each condition to hold.
browser = webdriver.Chrome()
browser.get('https://www.taobao.com/')
wait = WebDriverWait(browser, 10)
# First condition: the element with id 'q' is present.
# Named `search_input` rather than `input` so the builtin is not shadowed.
search_input = wait.until(EC.presence_of_element_located((By.ID, 'q')))
# Second condition: the '.btn-search' element is clickable. The original
# locator was garbled ('By. CSS SELECTOR' and mismatched quotes).
button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
print(search_input, button)
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Wait for the movie list to be present, then save the page HTML to db1.html.
chrome_path = '/Users/apple/soft/chromedriver'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.get('https://movie.douban.com/typerank?type_name=%E5%89%A7%E6%83%85&type=11&interval_id=100:90&action=')
# Wait up to 10 seconds for an element matching the XPath to be present
# before reading page_source.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '//div[@class="movie-list-panel pictext"]/div'))
)
con = driver.page_source
# Explicit UTF-8 so writing non-ASCII page text does not depend on the
# platform's default encoding.
with open('db1.html', 'w', encoding='utf-8') as fp:
    fp.write(con)
常用的等待条件:presence_of_element_located:某个元素已经加载完毕了;presence_of_all_elements_located:网页中所有满足条件的元素都加载完毕了;element_to_be_clickable:某个元素可以点击了。更多条件请参考 Selenium 官方文档中的 expected_conditions 部分
前进后退
import time

from selenium import webdriver

# Visit three pages in a row, go back one step in the history, pause, then
# go forward again.
browser = webdriver.Chrome()
browser.get('https://www.baidu.com/')
browser.get('https://www.taobao.com/')
# Host fixed: the original had four w's ('wwww.python.org').
browser.get('https://www.python.org/')
browser.back()
time.sleep(1)
browser.forward()
browser.close()
Cookies
from selenium import webdriver

# Print the cookies, add one, print again, delete them all and print once more.
browser = webdriver.Chrome()
# Method name fixed: the original 'browser.qet' raised AttributeError.
browser.get('https://www.zhihu.com/explore')
print(browser.get_cookies())
browser.add_cookie({'name': 'name', 'domain': 'www.zhihu.com', 'value': 'germey'})
print(browser.get_cookies())
browser.delete_all_cookies()
print(browser.get_cookies())
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import Select

# Start Chrome behind a proxy, load Baidu, add the cookies from a
# "name=value; name=value" string, then open i.baidu.com.
options = webdriver.ChromeOptions()
options.add_argument("--proxy-server=http://114.237.42.74:4214")
chrome_path = '/Users/apple/soft/chromedriver'
driver = webdriver.Chrome(executable_path=chrome_path, options=options)
# The cookies are added after this page has been loaded.
driver.get('https://www.baidu.com/')
# NOTE(review): this is a captured session cookie string hard-coded in the
# script; replace it with your own and avoid committing real session cookies.
cookies = 'BIDUPSID=80C8A3EE76EF75A1206FD121B6E34903; PSTM=1576670250; BAIDUID=80C8A3EE76EF75A1D29D7E3568D557C1:FG=1; BD_UPN=123253; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDSFRCVID=JxDsJeCCxG3eKhRu05sjJ68ymH70GXVmYDdP3J; H_BDCLCKID_SF=tJkfoIDhfIvbfP0khtnDMtA85h5Ka4CXa5rMVhcy3POkeqOJ2Mt5bfPJ0l_jtRJBWJFOXR6dJKJnoMQzWfOf05tpexbH55uHJJCj_U5; H_PS_PSSID=1444_21089_30211_18560_30284_26350_30447_22157; delPer=0; BD_CK_SAM=1; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; userFrom=null; PSINO=6; COOKIE_SESSION=4050_0_7_6_11_14_0_0_7_2_3_0_8246_0_2_0_1577254032_0_1577254030%7C9%230_0_1577254030%7C1; BDUSS=RwZlNWam1GRmRWTmR6OGpDMktRaHp6WXVhLU16eGdyTG9CQUpFSmF6RWtyeXBlSVFBQUFBJCQAAAAAAAAAAAEAAABITrkJMjkwNzkzOTkyemIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACQiA14kIgNebj; BD_HOME=1; sugstore=1'
cookie_list = cookies.split(';')
for data in cookie_list:
    # Split on the FIRST '=' only: values such as 'BAIDUID=...:FG=1' contain
    # '=' themselves, and a plain split('=') would cut them short.
    name, _, value = data.partition('=')
    driver.add_cookie({'name': name.strip(), 'value': value.strip()})
driver.get('http://i.baidu.com/')
选项卡管理 / 页面切换
有时候窗口中有很多子 tab 页面,这时候就需要进行切换。selenium 提供了 switch_to.window 方法来进行切换,具体切换到哪个页面,可以从 driver.window_handles 中找到对应的窗口句柄
import time

from selenium import webdriver

# Open a second tab with JavaScript and switch between the two using the
# entries of window_handles.
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open("http://chinaz.com")')
url = browser.current_url
print(url)
print(browser.window_handles)
# switch_to.window replaces the deprecated switch_to_window helper and matches
# the method named in the text above this snippet.
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(1)
browser.switch_to.window(browser.window_handles[0])
browser.get('https://python.org')
异常处理
from selenium import webdriver

# Look up the id 'hello' on the Baidu home page with no exception handling
# around the call.
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
# Method name fixed: 'find_elementby_id' does not exist, so the original
# failed with AttributeError before any lookup took place.
browser.find_element_by_id('hello')
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Handle the two exceptions separately: a timeout while loading the page and
# a failed element lookup. The browser is closed in every case.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
except TimeoutException:
    print('Time Out')
try:
    # Method name restored (was 'find elementby_id', a SyntaxError).
    browser.find_element_by_id('hello')
except NoSuchElementException:
    print('No Element')
finally:
    browser.close()
设置代理IP
有时候频繁爬取一些网页。服务器发现你是爬虫后会封掉你的ip地址。这时候我们可以更改代理ip。
from selenium import webdriver

# Start Chrome with a --proxy-server argument and load httpbin.org/ip.

# Raw string so the backslash in the Windows path is never read as an
# escape sequence ('\c' is an invalid escape in a normal literal).
chrome_path = r'D:\chrom_add_in/chromedriver'
option = webdriver.ChromeOptions()
option.add_argument('--proxy-server=http://113.88.87.23:4251')
# `options=` matches the other proxy example in this file; `chrome_options=`
# is the deprecated spelling of the same argument.
driver = webdriver.Chrome(executable_path=chrome_path, options=option)
driver.get('http://httpbin.org/ip')