目录:
介绍安装
基本使用
元素交互操作
项目练习
一、安装介绍
介绍
selenium是一个自动化测试工具,爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题;
selenium本质是通过驱动浏览器,完全模拟浏览器的操作,例如:跳转、输入、点击、下拉等,来拿网页渲染之后的结果,可支持多种浏览器;
安装
pip3 install selenium
下载chromedriver.exe放到python安装路径的Scripts目录中,版本为2.29
验证安装是否成功:
from selenium import webdriver
# Smoke test for the Chrome driver: a browser window should pop up and the
# rendered HTML of the page should be printed.
chrome = webdriver.Chrome()
chrome.get("https://www.baidu.com")
rendered_html = chrome.page_source
print(rendered_html)
selenium3默认支持的webdriver是Firefox,而Firefox需要安装geckodriver
下载链接:https://github.com/mozilla/geckodriver/releases
无界面浏览器Phantomjs
下载Phantomjs , 解压后把phantomjs.exe所在的bin目录放到环境变量
下载链接:http://phantomjs.org/download.html
验证环境
C:\Users\Administrator>phantomjs
phantomjs> console.log('egon gaga')
egon gaga
undefined
验证安装:
from selenium import webdriver
# Smoke test for the headless PhantomJS driver: print the rendered HTML of
# the page without opening a visible browser window.
headless = webdriver.PhantomJS()
headless.get("https://www.baidu.com")
rendered_html = headless.page_source
print(rendered_html)
二、基本使用
需求:百度搜索美女
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Search Baidu for a keyword, wait for the result list to appear, then dump
# the rendered page, the current URL and the session cookies.
__author__ = 'tian'
__data__ = '2018/4/16 14:45'  # NOTE(review): presumably meant __date__ - confirm

from selenium import webdriver
from selenium.webdriver.common.by import By  # locator strategy (By.ID, ...)
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # explicit waits

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    input_tag = browser.find_element_by_id('kw')
    input_tag.send_keys('美女')
    input_tag.send_keys(Keys.ENTER)  # press Enter to submit the query
    wait = WebDriverWait(browser, 10)
    # content_left is the div that holds the result list after a successful search
    wait.until(EC.presence_of_all_elements_located((By.ID, 'content_left')))
    print(browser.page_source)  # rendered page content
    print(browser.current_url)  # URL of the current page
    print(browser.get_cookies())  # cookies of the current session
finally:
    # quit() ends the whole session; close() would only close the window
    browser.quit()
百度输入美女进行搜索
获取标签属性
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID ,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
import time
# Locate an image on the Amazon home page and print its attributes.
driver = webdriver.Chrome()
try:
    driver.get('https://www.amazon.cn/')
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_all_elements_located((By.ID, 'cc-lm-tcgShowImgContainer')))
    tag = driver.find_element(By.CSS_SELECTOR, '#cc-lm-tcgShowImgContainer img')
    # HTML attribute of the element
    print(tag.get_attribute('src'))
    # WebDriver-side properties: internal element id, position, tag name, size
    print(tag.id)
    print(tag.location)
    print(tag.tag_name)
    print(tag.size)
finally:
    # Always release the browser, even when the wait above times out.
    driver.quit()
等待元素被加载
selenium只能模拟浏览器的行为,而浏览器解析网页需要时间(执行css,js),一些元素可能需要很长一段时间才能加载出来,为了保证能查找到元素,必须等待。
等待方式分两种:
隐式等待:implicitly_wait();显式等待某个元素被加载:WebDriverWait(driver,10)
元素交互操作
点击和清空
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID ,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
import time
# Type a keyword into the Amazon search box, clear it, type another keyword
# and submit the search with Enter.
driver = webdriver.Chrome()
try:
    driver.get('https://www.amazon.cn/')
    wait = WebDriverWait(driver, 10)
    # presence_of_element_located returns the element itself, so the search
    # box does not have to be looked up again for every action.
    search_box = wait.until(EC.presence_of_element_located((By.ID, 'twotabsearchtextbox')))
    search_box.send_keys('iphone8')
    time.sleep(3)  # pause so the first keyword is visible before clearing it
    search_box.clear()
    search_box.send_keys('iphone7plus')
    search_box.send_keys(Keys.ENTER)
finally:
    # Always release the browser, even when the wait above times out.
    driver.quit()
Action Chains 联动
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID ,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
import time
# Drag the 'draggable' element onto the 'droppable' element inside the
# iframeResult frame of the runoob demo page.
driver = webdriver.Chrome()
try:
    driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    driver.switch_to.frame('iframeResult')  # the demo is rendered inside this frame
    # drag_and_drop(source, target): the element being dragged comes first.
    source = driver.find_element_by_id('draggable')
    target = driver.find_element_by_id('droppable')
    # Option 1: queue the actions on one chain and perform them together
    actions = ActionChains(driver)  # action chain bound to this driver
    actions.drag_and_drop(source, target)
    actions.perform()
    # Option 2: separate chains, moving a small step each time
    # ActionChains(driver).click_and_hold(source).perform()
    # distance = target.location['x'] - source.location['x']
    # track = 0
    # while track < distance:
    #     ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
    #     track += 2
    # ActionChains(driver).release().perform()
    time.sleep(10)  # keep the window open long enough to see the result
finally:
    driver.quit()
cookies
from selenium import webdriver
# Print the cookies the browser holds after loading the page.
driver = webdriver.Chrome()
try:
    driver.get('https://www.zhihu.com/explore')
    # driver.add_cookie({})  # add a cookie
    print(driver.get_cookies())
finally:
    # Always release the browser, even when loading the page fails.
    driver.quit()
选项卡:切换选项卡
from selenium import webdriver
import time
# Open a second tab, load a page in each tab, then close the first tab.
driver = webdriver.Chrome()
try:
    driver.get('https://www.baidu.com')
    driver.execute_script('window.open()')
    print(driver.window_handles)  # handles of all open tabs
    time.sleep(10)
    driver.switch_to.window(driver.window_handles[1])
    driver.get('https://www.taobao.com')
    time.sleep(10)
    driver.switch_to.window(driver.window_handles[0])
    driver.get('https://www.sina.com.cn')
    driver.close()  # closes only the current (first) tab
    time.sleep(10)
finally:
    # quit() closes the remaining tab and ends the driver session.
    driver.quit()
例子:
from selenium import webdriver
import time
# Example: click the '#u1 >.lb' entry on the Baidu home page, follow the
# "立即注册" link, then switch back to the first window and reload the page.
home_url = 'https://www.baidu.com/'
driver = webdriver.Chrome()
driver.get(home_url)
driver.maximize_window()
driver.implicitly_wait(3)
login_entry = driver.find_element_by_css_selector('#u1 >.lb')
login_entry.click()
register_link = driver.find_element_by_link_text("立即注册")
register_link.click()
first_window = driver.window_handles[0]
driver.switch_to_window(first_window)
# sample window_handles value at this point:
# ['CDwindow-406d8bcd-34ce-4ea3-ab1a-3853a8b4fddf', 'CDwindow-37fc67c6-f3a3-4b02-b887-d0dd1973db5d']
driver.get(home_url)
time.sleep(3)
异常处理
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchFrameException
# Handle the Selenium exceptions that loading the page and switching into the
# frame may raise, and always release the browser afterwards.
# The driver is created before the try block: if creation itself fails there
# is nothing to clean up, and the finally clause must not touch an unbound name.
driver = webdriver.Chrome()
try:
    driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    driver.switch_to.frame('iframeResult')
except TimeoutException as e:
    print(e)
except NoSuchFrameException as e:
    print(e)
finally:
    driver.quit()
项目练习
1、126发送邮件
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID ,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
import time
# Log in to 126 mail, open the compose view, fill in recipient, subject and
# body, then click the first toolbar item.
driver = webdriver.Chrome()
try:
    driver.get('https://www.126.com/')
    wait = WebDriverWait(driver, 5)
    # Wait until the #x-URS-iframe frame is present, then work inside it.
    frame = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#x-URS-iframe')))
    driver.switch_to.frame(frame)
    # Wait for the form's container div to load.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-container')))
    # send_keys() returns None, so its result is not kept.
    driver.find_element_by_name('email').send_keys('test_tx')
    driver.find_element_by_name('password').send_keys('xxxxooossoo')
    # Alternative: driver.find_element_by_css_selector("#dologin").click()
    driver.find_element_by_css_selector("#dologin").send_keys(Keys.ENTER)
    wait.until(EC.presence_of_element_located((By.ID, 'dvNavTop')))
    # The second item of the top navigation list is clicked to compose a message.
    write_msg = driver.find_elements_by_css_selector("#dvNavTop li")[1]
    write_msg.click()
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'tH0')))
    driver.find_element_by_class_name('nui-editableAddr-ipt').send_keys('352932341@qq.com')
    title = driver.find_element_by_css_selector('.dG0 .nui-ipt-input')
    title.send_keys('测试啊')
    # The message body is typed inside its own editor iframe.
    frame = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'APP-editor-iframe')))
    driver.switch_to.frame(frame)
    driver.find_element(By.CSS_SELECTOR, 'body').send_keys('发送成功,加工资了')
    driver.switch_to.default_content()  # back to the top-level document
    driver.find_element_by_class_name('nui-toolbar-item').click()
    time.sleep(100)  # keep the window open to inspect the result
except Exception as e:
    print(e)
finally:
    driver.quit()
京东商城
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
def get_goods(driver):
    """Print name, link, price and comment count of every listed product.

    Walks the result pages one after another: prints every '.gl-item' entry
    of the current page, clicks the '.pn-next em' element and repeats.

    Args:
        driver: a Selenium WebDriver already showing a search result page.

    The crawl is best-effort: any exception (for example a missing element
    or a missing next-page button) ends it. The reason is reported on stderr
    instead of being dropped silently.
    """
    import sys

    template = '\n商品 :{0}\n链接 : {1}\n价钱 : {2}\n评论 : {3}\n'
    try:
        # Loop instead of recursing once per page, so a long result list
        # cannot exhaust the recursion limit.
        while True:
            for good in driver.find_elements_by_css_selector('.gl-item'):
                detail_url = good.find_element_by_tag_name('a').get_attribute('href')
                p_name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
                price = good.find_element_by_css_selector('.p-price i').text
                p_commit = good.find_element_by_css_selector('.p-commit a').text
                # Four placeholders need four arguments; the link was missing.
                msg = template.format(p_name, detail_url, price, p_commit)
                print(msg, end='\n\n')
            driver.find_element_by_css_selector(".pn-next em").click()  # next page
            time.sleep(2)  # give the next page time to render
    except Exception as exc:
        print('get_goods stopped: {!r}'.format(exc), file=sys.stderr)
def spider(url, keyword):
    """Open *url*, search for *keyword* and print the listed products.

    Args:
        url: address of the shop's home page.
        keyword: text typed into the '#key' search box.

    The browser session is always ended, including when loading the page
    or locating the search box fails.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        driver.implicitly_wait(3)
        input_tage = driver.find_element_by_css_selector("#key")
        input_tage.send_keys(keyword)
        input_tage.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        # quit() ends the whole session; close() would only close the window
        driver.quit()


if __name__ == '__main__':
    spider('https://www.jd.com/', keyword='iPhone8手机')