selenium模块
selenium最初是一个自动化测试工具,而爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题
selenium本质是通过驱动浏览器,完全模拟浏览器的操作,比如跳转、输入、点击、下拉等,来拿到网页渲染之后的结果,可支持多种浏览器
from selenium import webdriver
# Each constructor launches a different browser through its own driver.
# They are listed as alternatives: a real script keeps only the one it needs,
# because every line rebinds ``browser`` to a brand-new session.
browser = webdriver.Chrome()     # Google Chrome (chromedriver)
browser = webdriver.Firefox()    # Mozilla Firefox (geckodriver)
browser = webdriver.PhantomJS()  # PhantomJS, a browser without a window
browser = webdriver.Safari()     # Apple Safari
browser = webdriver.Edge()       # Microsoft Edge
1.有界面浏览器
安装:selenium+chromedriver
pip3 install selenium
下载chromedriver.exe放到Python安装路径的Scripts目录中即可,注意最新版本是2.38,并非2.9
国内镜像网站地址:http://npm.taobao.org/mirrors/chromedriver/2.38/
最新的版本去官网找:https://sites.google.com/a/chromium.org/chromedriver/downloads
验证安装
C:\Users\Administrator>python3
Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> from selenium import webdriver
>>> driver=webdriver.Chrome() #弹出浏览器
>>> driver.get('https://www.baidu.com')
>>> driver.page_source
注意:
selenium3默认支持的webdriver是Firefox,而Firefox需要安装geckodriver
下载链接:https://github.com/mozilla/geckodriver/releases
2.selenium+谷歌浏览器headless模式
from selenium import webdriver
from selenium. webdriver. chrome. options import Options
# Build the Chrome command-line configuration for a headless run.
chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')  # browser window size
chrome_options.add_argument('--disable-gpu')  # turn off GPU acceleration
chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars
chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip image loading
chrome_options.add_argument('--headless')  # run without a visible window
# Explicit path to the Chrome executable; adjust to the local installation.
chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"

# ``options=`` replaces the deprecated ``chrome_options=`` keyword.
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get('https://www.baidu.com')
    print('hao123' in driver.page_source)
finally:
    # quit() ends the whole session (all windows and the driver process);
    # close() would only close the current window.
    driver.quit()
3.基本使用
from selenium import webdriver
from selenium. webdriver import ActionChains
from selenium. webdriver. common. by import By
from selenium. webdriver. common. keys import Keys
from selenium. webdriver. support import expected_conditions as EC
from selenium. webdriver. support. wait import WebDriverWait
# Basic workflow: open a page, type a query, wait for results, read page data.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')

    # Fill the element with id "kw" and submit the query with the Enter key.
    input_tag = browser.find_element_by_id('kw')
    input_tag.send_keys('美女')
    input_tag.send_keys(Keys.ENTER)

    # Block for up to 10 seconds until #content_left is present in the DOM.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))

    print(browser.page_source)    # rendered HTML of the current page
    print(browser.current_url)    # URL of the current page
    print(browser.get_cookies())  # cookies visible to the current session
finally:
    # Always end the session, even when a lookup or the wait fails.
    browser.quit()
4.选择器用法
4.1 基本用法
from selenium import webdriver
from selenium. webdriver import ActionChains
from selenium. webdriver. common. by import By
from selenium. webdriver. common. keys import Keys
from selenium. webdriver. support import expected_conditions as EC
from selenium. webdriver. support. wait import WebDriverWait
import time
# Tour of the element-locating helpers, using the Baidu home page.
driver = webdriver.Chrome()
try:
    driver.get('https://www.baidu.com')
    wait = WebDriverWait(driver, 10)

    # Locate by id.
    print(driver.find_element_by_id('kw'))

    # Locate by partial link text; take the first match and click it.
    login = driver.find_elements_by_partial_link_text('录')[0]
    login.click()

    # Locate by tag name (find_element_* returns the first match).
    print(driver.find_element_by_tag_name('a'))

    # Locate by class name, waiting until the element is clickable.
    button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'tang-pass-footerBarULogin')))
    button.click()

    # Locate the form fields by name and the submit button by id.
    input_user = wait.until(EC.presence_of_element_located((By.NAME, 'userName')))
    input_pwd = wait.until(EC.presence_of_element_located((By.NAME, 'password')))
    commit = wait.until(EC.element_to_be_clickable((By.ID, 'TANGRAM__PSP_10__submit')))

    # NOTE(review): credentials are hard-coded in the script; replace them
    # with your own and keep real ones out of version control.
    input_user.send_keys('18611453110')
    input_pwd.send_keys('xxxxxx')
    commit.click()

    # Locate by CSS selector.
    driver.find_element_by_css_selector('#kw')

    time.sleep(5)
finally:
    # End the session even if any lookup above raised.
    driver.quit()
4.2 xpath
from selenium import webdriver
from selenium. webdriver import ActionChains
from selenium. webdriver. common. by import By
from selenium. webdriver. common. keys import Keys
from selenium. webdriver. support import expected_conditions as EC
from selenium. webdriver. support. wait import WebDriverWait
import time
# XPath lookups against the Scrapy selectors sample page.
driver = webdriver.PhantomJS()
try:
    driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')
    # Element lookups retry for up to 3 seconds before failing.
    driver.implicitly_wait(3)

    # "//" matches descendants at any depth; find_element_* returns the first hit.
    driver.find_element_by_xpath('//body//a')
    driver.find_element_by_css_selector('body a')  # CSS descendant selector

    # find_elements_* returns a list of every match.
    res1 = driver.find_elements_by_xpath('//body//a[1]')
    print(res1[0].text)

    # Three ways to select a link: by position, by exact attribute value,
    # and by attribute substring.
    res1 = driver.find_element_by_xpath('//a[5]')
    res2 = driver.find_element_by_xpath('//a[@href="image5.html"]')
    res3 = driver.find_element_by_xpath('//a[contains(@href,"image5")]')
    print('==>', res1.text)
    print('==>', res2.text)
    print('==>', res3.text)

    # Absolute path starting at the document root.
    res1 = driver.find_element_by_xpath('/html/body/div/a')
    print(res1.text)

    # Select an <a> by an attribute of its child <img>.
    res2 = driver.find_element_by_xpath('//a[img/@src="image3_thumb.jpg"]')
    print(res2.tag_name, res2.text)

    # Several predicates can be chained; all of them must hold.
    # NOTE(review): these two raise NoSuchElementException if the page has
    # no matching element -- confirm they are meant to run against this page.
    res3 = driver.find_element_by_xpath("//input[@name='continue'][@type='button']")
    res4 = driver.find_element_by_xpath("//*[@name='continue'][@type='button']")

    time.sleep(5)
finally:
    # End the session even if a lookup above raised.
    driver.quit()
4.3 获取标签属性
from selenium import webdriver
from selenium. webdriver import ActionChains
from selenium. webdriver. common. by import By
from selenium. webdriver. common. keys import Keys
from selenium. webdriver. support import expected_conditions as EC
from selenium. webdriver. support. wait import WebDriverWait
# Read the attributes and properties of a located element.
browser = webdriver.Chrome()
try:
    browser.get('https://www.amazon.cn/')

    # Wait up to 10 seconds for the container before looking inside it.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'cc-lm-tcgShowImgContainer')))
    tag = browser.find_element(By.CSS_SELECTOR, '#cc-lm-tcgShowImgContainer img')

    print(tag.get_attribute('src'))  # value of the element's "src" attribute
    print(tag.id)        # Selenium's internal id for the element
    print(tag.location)  # position of the element on the page
    print(tag.tag_name)  # tag name of the element
    print(tag.size)      # rendered size of the element
finally:
    # Release the browser even when the wait times out.
    browser.quit()
5. 隐式等待与显式等待
#1、selenium只是模拟浏览器的行为,而浏览器解析页面是需要时间的(执行css,js),一些元素可能需要过一段时间才能加载出来,为了保证能查找到元素,必须等待
#2、等待的方式分两种:
隐式等待:在browser.get('xxx')前就设置,针对所有元素有效
显式等待:在browser.get('xxx')之后设置,只针对某个元素有效
from selenium import webdriver
from selenium. webdriver import ActionChains
from selenium. webdriver. common. by import By
from selenium. webdriver. common. keys import Keys
from selenium. webdriver. support import expected_conditions as EC
from selenium. webdriver. support. wait import WebDriverWait
# Implicit wait: one session-wide setting that applies to every lookup.
browser = webdriver.Chrome()
try:
    # Each find_element* call retries for up to 10 seconds before raising.
    browser.implicitly_wait(10)

    browser.get('https://www.baidu.com')
    input_tag = browser.find_element_by_id('kw')
    input_tag.send_keys('美女')
    input_tag.send_keys(Keys.ENTER)

    # No explicit wait is needed: the implicit wait covers this lookup.
    contents = browser.find_element_by_id('content_left')
    print(contents)
finally:
    # Release the browser even when a lookup fails.
    browser.quit()
from selenium import webdriver
from selenium. webdriver import ActionChains
from selenium. webdriver. common. by import By
from selenium. webdriver. common. keys import Keys
from selenium. webdriver. support import expected_conditions as EC
from selenium. webdriver. support. wait import WebDriverWait
# Explicit wait: wait for one specific condition on one specific element.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    input_tag = browser.find_element_by_id('kw')
    input_tag.send_keys('美女')
    input_tag.send_keys(Keys.ENTER)

    # Block for up to 10 seconds until #content_left is present in the DOM;
    # until() raises TimeoutException if it never appears.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))

    contents = browser.find_element(By.CSS_SELECTOR, '#content_left')
    print(contents)
finally:
    # Release the browser even when the wait times out.
    browser.quit()
6. 元素交互操作
from selenium import webdriver
from selenium. webdriver import ActionChains
from selenium. webdriver. common. by import By
from selenium. webdriver. common. keys import Keys
from selenium. webdriver. support import expected_conditions as EC
from selenium. webdriver. support. wait import WebDriverWait
import time

# Locators shared by both searches below.
search_box_id = 'twotabsearchtextbox'
submit_selector = '#nav-search > form > div.nav-right > div > input'

browser = webdriver.Chrome()
browser.get('https://www.amazon.cn/')
wait = WebDriverWait(browser, 10)

# First search: wait for the search box, type the query, press the button.
search_box = wait.until(EC.presence_of_element_located((By.ID, search_box_id)))
search_box.send_keys('iphone 8')
browser.find_element(By.CSS_SELECTOR, submit_selector).click()

time.sleep(3)

# Second search: look the box up again, clear the old text, type a new query.
search_box = browser.find_element(By.ID, search_box_id)
search_box.clear()
search_box.send_keys('iphone7plus')
browser.find_element(By.CSS_SELECTOR, submit_selector).click()
from selenium import webdriver
from selenium. webdriver import ActionChains
from selenium. webdriver. common. by import By
from selenium. webdriver. common. keys import Keys
from selenium. webdriver. support import expected_conditions as EC
from selenium. webdriver. support. wait import WebDriverWait
import time
# Drag one element onto another in small steps using ActionChains.
driver = webdriver.Chrome()
try:
    driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # The demo elements live inside an iframe; switch into it before searching.
    driver.switch_to.frame('iframeResult')
    source = driver.find_element_by_id('draggable')
    target = driver.find_element_by_id('droppable')

    # Press and hold the mouse button on the draggable element.
    ActionChains(driver).click_and_hold(source).perform()

    # Move right 2 pixels at a time until the horizontal gap is covered.
    distance = target.location['x'] - source.location['x']
    track = 0
    while track < distance:
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
        track += 2

    # Let go of the mouse button to drop the element.
    ActionChains(driver).release().perform()

    time.sleep(10)
finally:
    # End the session even if a lookup or action raised.
    driver.quit()
在交互动作比较难实现的时候可以自己写JS(万能方法)
from selenium import webdriver
from selenium. webdriver import ActionChains
from selenium. webdriver. common. by import By
from selenium. webdriver. common. keys import Keys
from selenium. webdriver. support import expected_conditions as EC
from selenium. webdriver. support. wait import WebDriverWait
# Run arbitrary JavaScript in the page when a built-in action is not enough.
# The driver is created before ``try`` so that ``finally`` never touches an
# unbound ``browser`` if launching Chrome itself fails.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    browser.execute_script('alert("hello world")')
finally:
    # End the session even if the navigation or the script raised.
    browser.quit()
from selenium import webdriver
from selenium. webdriver import ActionChains
from selenium. webdriver. common. by import By
from selenium. webdriver. common. keys import Keys
from selenium. webdriver. support import expected_conditions as EC
from selenium. webdriver. support. wait import WebDriverWait
# Switch into an iframe to reach its elements, then back out to the parent.
# The driver is created before ``try`` so that ``finally`` never touches an
# unbound ``browser`` if launching Chrome itself fails.
browser = webdriver.Chrome()
try:
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # Lookups only search the current frame, so enter the iframe first.
    browser.switch_to.frame('iframeResult')
    tag1 = browser.find_element_by_id('droppable')
    print(tag1)

    # Return to the parent frame to look up an element that lives there.
    browser.switch_to.parent_frame()
    tag2 = browser.find_element_by_id('textareaCode')
    print(tag2)
finally:
    # End the session even if a frame switch or lookup raised.
    browser.quit()
7. 其他操作
import time
from selenium import webdriver
# Load three pages in order so the session has history to move through.
browser = webdriver.Chrome()
for url in (
    'https://www.baidu.com',
    'https://www.taobao.com',
    'http://www.sina.com.cn/',
):
    browser.get(url)

browser.back()     # go one entry back in the session history
time.sleep(10)
browser.forward()  # go one entry forward again
browser.close()
from selenium import webdriver
# Read the session cookies, add two more, then read them again.
browser = webdriver.Chrome()
try:
    browser.get('https://www.zhihu.com/explore')
    print(browser.get_cookies())

    # add_cookie() describes ONE cookie per call and requires the "name" and
    # "value" keys; a dict of arbitrary key/value pairs is rejected.
    browser.add_cookie({'name': 'k1', 'value': 'xxx'})
    browser.add_cookie({'name': 'k2', 'value': 'yyy'})

    print(browser.get_cookies())
finally:
    # End the session so the browser and driver process are not left running.
    browser.quit()
import time
from selenium import webdriver
# Open a second window/tab and switch between the two by handle.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    browser.execute_script('window.open()')
    print(browser.window_handles)

    # switch_to.window() replaces the deprecated switch_to_window().
    browser.switch_to.window(browser.window_handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(10)

    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://www.sina.com.cn')
finally:
    # quit() closes every window; close() would leave the second one open.
    browser.quit()
from selenium import webdriver
from selenium. common. exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException
# Handle Selenium exceptions explicitly and always release the browser.
# The driver is created before ``try`` so that ``finally`` never touches an
# unbound ``browser`` if launching Chrome itself fails.
browser = webdriver.Chrome()
try:
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # NOTE(review): the frame name differs from the 'iframeResult' used in the
    # other snippets -- presumably misspelled on purpose to trigger
    # NoSuchFrameException; confirm.
    browser.switch_to.frame('iframssseResult')
except TimeoutException as e:
    print(e)
except NoSuchFrameException as e:
    print(e)
finally:
    browser.quit()