selenium用法

最新推荐文章于 2024-07-25 20:58:36 发布

每天早睡

最新推荐文章于 2024-07-25 20:58:36 发布

阅读量3.8k

点赞数 25

分类专栏：爬虫　文章标签：selenium、xpath、验证码

本文链接：https://blog.csdn.net/BLee_0123/article/details/131467722

版权

爬虫专栏收录该内容

6 篇文章 12 订阅

订阅专栏

一、请求库selenium

selenium是一个自动化测试工具，而爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题。操作浏览器模拟人的行为。

下载浏览器驱动：以谷歌浏览器为例，需要下载与浏览器版本号对应的谷歌浏览器驱动（chromedriver）。

安装：pip3 install selenium

基本使用（元素定位和操作）

from selenium import webdriver
import time

# Launch Chrome through the local chromedriver binary.
browser = webdriver.Chrome(executable_path='./chromedriver.exe')

# Navigate to the Baidu home page.
browser.get('http://www.baidu.com')

# Locate the search input by its id, empty it and type the query.
search_box = browser.find_element_by_id('kw')
search_box.clear()
search_box.send_keys('美女')

# Locate the search button with a CSS selector and click it.
browser.find_element_by_css_selector('#su').click()

# page_source is the page content after JavaScript has run.
print(browser.page_source)

time.sleep(3)
browser.close()  # close the current page
browser.quit()   # shut down the whole browser

无界面浏览器

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()

# Command-line switches, applied in this order.
headless_flags = (
    'window-size=1920x3000',               # browser resolution
    '--disable-gpu',                       # per the original note: recommended by Google's docs to avoid a bug
    '--hide-scrollbars',                   # hide scrollbars (helps with some special pages)
    'blink-settings=imagesEnabled=false',  # do not load images, for speed
    '--headless',                          # no visible UI; per the original note, needed on Linux without a display
)
for flag in headless_flags:
    chrome_options.add_argument(flag)

bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)

# Visit the site and dump the page content after JavaScript has run.
bro.get('https://www.baidu.com')
print(bro.page_source)

bro.close()

获取元素位置、属性、大小

from selenium import webdriver
import base64
import time

# Open the 12306 login page, then read the tag name, position, size and
# src attribute of the QR-code <img> and save the image to disk.
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    bro.get('https://kyfw.12306.cn/otn/resources/login.html')
    # The original called `driver.implicitly_wait`, but the browser object here is `bro`.
    bro.implicitly_wait(10)

    user_login = bro.find_element_by_css_selector('.login-hd-account>a')
    user_login.click()
    time.sleep(2)

    img = bro.find_element_by_id('J-qrImg')

    print(img.id)        # id assigned by selenium itself; can be ignored
    print(img.tag_name)  # tag name

    # Position and size can later be used to crop the element out of a
    # screenshot (typically a captcha to be solved and entered automatically).
    print('-----')
    print(img.location)  # position of the <img> tag
    print(img.size)      # size of the <img> tag

    # Read the src attribute. The code assumes it is a data: URI, so the
    # base64 payload is whatever follows the last comma.
    src = img.get_attribute('src')
    res = base64.b64decode(src.split(',')[-1])
    with open('扫描.png', 'wb') as f:
        f.write(res)
finally:
    # Always release the browser, even if a lookup above fails.
    bro.quit()

隐式、显式等待
显式等待：指定某个标签等待时间，写起来复杂用的少

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# `driver` is assumed to be an already-created webdriver instance.
# Wait up to 10 seconds for the element with id "content_left" to be present.
wait = WebDriverWait(driver, 10)
# until() returns the located element, so no second lookup is needed.
contents = wait.until(EC.presence_of_element_located((By.ID, 'content_left')))

隐式等待：只要代码中要去找某一个标签，如果没有加载出来，就会等待设置的固定时间（秒）

# One statement is enough: it sets the wait used for the element lookups that follow.
# `driver` is assumed to be an already-created webdriver instance.
driver.implicitly_wait(10)
# NOTE(review): placeholder call - a real CSS selector string must be passed here.
driver.find_element_by_css_selector()

示例：

from selenium import webdriver
import time

browser = webdriver.Chrome(executable_path='./chromedriver.exe')

# Implicit wait: whenever an element lookup does not find its target yet,
# selenium keeps retrying for at most 10 seconds.
browser.implicitly_wait(10)

# (Explicit waits target one specific element; they are more verbose and used less.)
browser.get('http://www.baidu.com')

# Ways to locate the login button on the page:
# login_link = browser.find_element_by_css_selector('#s-top-loginbtn')        # CSS selector
# login_link = browser.find_element_by_xpath('//*[@id="s-top-loginbtn"]')     # XPath selector

# Selenium's built-in locators:
# login_link = browser.find_element_by_id('s-top-loginbtn')                   # by id
# login_link = browser.find_element_by_partial_link_text('录')                # by partial <a> text
# login_link = browser.find_elements_by_class_name()                          # by class name
# login_link = browser.find_elements_by_name()                                # by name attribute
# login_link = browser.find_elements_by_tag_name()                            # by tag name
login_link = browser.find_element_by_link_text('登录')  # by the full text of an <a> tag

# Click the login button.
login_link.click()

# Locate the user-name and password boxes, then fill them in.
user_field = browser.find_element_by_id('TANGRAM__PSP_11__userName')
pwd_field = browser.find_element_by_id('TANGRAM__PSP_11__password')
user_field.send_keys('xxx@qq.com')
pwd_field.send_keys('xxx')

# Locate the submit button, pause, then submit the form.
submit_btn = browser.find_element_by_id('TANGRAM__PSP_11__submit')
time.sleep(3)

submit_btn.click()

time.sleep(3)

browser.close()

执行js

from selenium import webdriver
import time

browser = webdriver.Chrome(executable_path='./chromedriver.exe')
browser.implicitly_wait(10)

browser.get('http://www.baidu.com')

# execute_script runs the given JavaScript source inside the current page.
# Other examples:
# browser.execute_script('console.log(vm)')
# browser.execute_script('alert(1)')
js_source = 'alert(document.cookie)'
browser.execute_script(js_source)

time.sleep(3)
browser.close()

切换选项卡

import time
from selenium import webdriver

browser = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    browser.get('https://www.baidu.com')

    # Run JavaScript that opens a new, empty tab.
    browser.execute_script('window.open()')

    print(browser.window_handles)  # handles of all open tabs

    # Switch to the second tab and load a page there.
    # (`switch_to_window` is the old spelling; `switch_to.window` replaces it.)
    browser.switch_to.window(browser.window_handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(3)

    # Switch back to the first tab using the same API.
    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://www.sina.com.cn')

    browser.close()  # close the current tab
finally:
    browser.quit()  # shut down the whole browser, even after an error

模拟前进后退

import time
from selenium import webdriver

browser = webdriver.Chrome(executable_path='./chromedriver.exe')

# Visit three pages in sequence to build up a navigation history.
for page_url in (
    'https://www.baidu.com',
    'https://www.taobao.com',
    'http://www.sina.com.cn/',
):
    browser.get(page_url)

browser.back()      # go one step back in history
time.sleep(2)
browser.forward()   # go one step forward again
browser.close()

异常处理

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException

# Demonstrates try / except / finally around a browser action.
browser = webdriver.Chrome()
try:
    browser.get('http://www.baidu.com')
except Exception as e:
    # NOTE(review): this catches every exception and only prints it; the
    # specific selenium exceptions imported above are not used here.
    print(e)
finally:
    # Runs whether or not the page load succeeded.
    browser.close()

动作链
在这里插入图片描述

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
import time
from selenium.webdriver import ActionChains

# Drag the "draggable" element onto the "droppable" element on the demo page.
driver = webdriver.Chrome()
driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
try:
    # The demo elements are looked up inside the frame named "iframeResult".
    driver.switch_to.frame('iframeResult')
    source = driver.find_element_by_id('draggable')
    target = driver.find_element_by_id('droppable')

    actions = ActionChains(driver)          # action-chain object
    actions.drag_and_drop(source, target)   # queue the action; nothing runs yet
    actions.perform()                       # execute the queued actions in order
    time.sleep(2)
finally:
    # Release the whole browser, even if a lookup above fails.
    driver.quit()

示例：
12306的滑动

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
import time
from selenium.webdriver import ActionChains

from selenium.webdriver.chrome.options import Options

# Keep the front end from detecting that the browser is automation-controlled.
options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")
# Pass the options through `options=` (the `chrome_options=` keyword is the
# deprecated spelling), matching the headless example in this article.
driver = webdriver.Chrome(executable_path='./chromedriver.exe', options=options)

# driver = webdriver.Chrome()
driver.get('https://kyfw.12306.cn/otn/resources/login.html')

try:
    # Switch to the account/password login form.
    username_login = driver.find_element_by_link_text('账号登录')
    username_login.click()

    username = driver.find_element_by_id('J-userName')
    password = driver.find_element_by_id('J-password')
    username.send_keys('xxxxxxxxxxx')
    password.send_keys('xxx')
    time.sleep(2)
    submit = driver.find_element_by_id('J-login')
    submit.click()
    time.sleep(5)

    # The slider handle of the verification widget.
    hk = driver.find_element_by_id('nc_1_n1z')

    actions = ActionChains(driver)               # action-chain object
    actions.drag_and_drop_by_offset(hk, 300, 0)  # drag 300px right, 0px vertically
    actions.perform()
    time.sleep(50)
finally:
    driver.close()

在这里插入图片描述
注：
滑块验证，前端会判断是机器还是人工操作（window.navigator.webdriver==true），防止检测出我们是通过自动化软件控制的，需配置options.add_argument("--disable-blink-features=AutomationControlled")。

selenium登录cnblogs获取cookie
selenium速度慢，因此只用它来登录网站、拿到cookie；登录之后的请求可以改用requests携带cookie发送。
半自动登录到cnblogs，拿到cookie，保存到本地；下次，在打开页面把cookie写入浏览器就是登录状态。

import time
from selenium import webdriver

driver = webdriver.Chrome()
driver.implicitly_wait(10)

# ---------- Login step (kept commented out; it saves the cookies to a file) ----------
# try:
#     driver.get('https://www.cnblogs.com/')
#
#     # Locate the login link and click it
#     login = driver.find_element_by_css_selector('#navbar_login_status > a:nth-child(6)')
#     login.click()
#     username = driver.find_element_by_id('mat-input-0')
#     password = driver.find_element_by_id('mat-input-1')
#     username.send_keys('xxx@qq.com')
#     password.send_keys('xxx')
#
#     input('手动输入用户名密码')
#
#     summit = driver.find_element_by_css_selector(
#         'body > app-root > mat-sidenav-container > mat-sidenav-content > div > div > app-sign-in > app-content-container > div > div > div > form > div > button')
#     summit.click()
#
#     # Captcha (solved automatically or by hand)
#     input('已经破解了验证码，敲回车')
#
#     # Read the cookies
#     print(type(driver.get_cookies()))
#
#     # Save the cookies to a file
#     import json
#
#     with open('cnblogs.json', 'w', encoding='utf-8') as f:
#         f.write(json.dumps(driver.get_cookies()))
#
#     time.sleep(5)
#
# except Exception as e:
#     print(e)
#
# finally:
#     driver.close()


# ---------- Reuse step: no login, write the saved cookies into the browser ----------
import json

driver.get('https://www.cnblogs.com/')

# The JSON file is a list of cookie dicts.
with open('../cnblogs.json', 'r', encoding='utf-8') as cookie_file:
    saved_cookies = json.load(cookie_file)

# Add the cookies to the browser one by one.
for entry in saved_cookies:
    driver.add_cookie(entry)

# Reload the page so it is requested with the cookies in place.
driver.refresh()

time.sleep(10)
driver.close()

抽屉半自动点赞
selenium登录拿到cookie，再使用requests携带cookie发送请求。

from selenium import webdriver
import json
import time

#### Login step (kept commented out; it saves the cookies to chouti.json) ####
# bro=webdriver.Chrome(executable_path='chromedriver.exe')
# bro.implicitly_wait(10)
# bro.get('https://dig.chouti.com/')
# try:
#     sub_btn=bro.find_element_by_id('login_btn')
#     print(sub_btn)
#
#     # sub_btn.click()  # raises an error
#     bro.execute_script('arguments[0].click();',sub_btn)  # click the element through JavaScript instead
#
#     # username=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-body > div.form-item.login-item.clearfix.phone-item.mt24 > div.input-item.input-item-short.left.clearfix > input')
#     username=bro.find_element_by_css_selector('div.input-item>input.login-phone')
#     username.send_keys('xxxxxxxxxxx')  # placeholder: never keep real credentials in source
#     # password=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div.form-item.login-item.clearfix.mt24 > div')
#     password = bro.find_element_by_css_selector('div.input-item>input.pwd-password-input')
#     password.send_keys('xxx')  # placeholder
#
#     time.sleep(3)
#     btn=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div:nth-child(4) > button')
#
#     btn.click()
#
#     input('等')
#
#     with open('chouti.json','w') as f:
#         json.dump(bro.get_cookies(),f)
#
# finally:
#     bro.close()


#### Voting step ####
import requests

bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.implicitly_wait(10)
    bro.get('https://dig.chouti.com/')

    # Scroll to the bottom of the page so that lazily loaded content gets loaded.
    bro.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    # bro.find_elements_by_css_selector('.link-item')

    # Read the cookies saved by selenium (a list of dicts with "name"/"value" keys).
    with open('../chouti.json', 'r') as f:
        saved_cookies = json.load(f)

    # Convert them to the plain name -> value mapping that requests accepts.
    cookie = {item['name']: item['value'] for item in saved_cookies}
    print(cookie)

    container = bro.find_element_by_class_name('link-con')
    time.sleep(2)
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    }
    for link_item in container.find_elements_by_class_name('link-item'):
        article_id = link_item.get_attribute('data-id')
        print(article_id)
        # Send the vote with requests, carrying the login cookies.
        # A timeout keeps one stalled request from hanging the whole script.
        response = requests.post('https://dig.chouti.com/link/vote', data={'linkId': article_id},
                                 cookies=cookie, headers=header, timeout=10)
        print(response.text)
finally:
    # Release the browser even if a lookup or a request fails.
    bro.close()

二、打码平台使用

验证码种类：

简单的字母数字的---->验证码截图出来---->图像识别（OCR识别)---->数字字母
12306之前的，找出符合的图片
滑动验证
计算类…

验证码破解：

人工破解
自动破解
打码平台（验证码破解平台，成功率没有100%）---->云打码、超级鹰等---->给它一张图片，识别后返回结果（收费的）

超级鹰的使用：
下载源代码
输入账号密码、软件ID、验证码类型、上传图片，发送请求即可识别。

#!/usr/bin/env python
# coding:utf-8

import requests
from hashlib import md5


class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service."""

    def __init__(self, username, password, soft_id, timeout=60):
        """Store the account data sent along with every request.

        username: account name, sent as the ``user`` field.
        password: plain-text password; only its MD5 hex digest is kept and
            sent (as the ``pass2`` field).
        soft_id: software ID, sent as the ``softid`` field.
        timeout: seconds to wait for the server before requests raises;
            without it a stalled connection would block forever.
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the decoded JSON response.

        im: image bytes.
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
                          headers=self.headers, timeout=self.timeout)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised image and return the decoded JSON response.

        im_id: ID of the image whose recognition result was wrong.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params,
                          headers=self.headers, timeout=self.timeout)
        return r.json()


if __name__ == '__main__':

    # Optional: download a sample captcha image first.
    # res=requests.get('https://api.django-vue-admin.com/admin/captcha/image/1784cd5088e40cd3fb97f811ea8bd49befd646f7/')
    # with open('js.png','wb') as f:
    #     f.write(res.content)

    # Arguments: account, password, software ID (generate your own under
    # "User Center >> Software ID" and substitute it here).
    chaojiying = Chaojiying_Client('xxx', 'xxx', '903641')
    # Read the image inside a `with` block so the file handle is closed promptly.
    with open('js.png', 'rb') as image_file:
        im = image_file.read()
    print(chaojiying.PostPic(im, 6001))

注：账号需充值积分，才可以使用。

三、xpath使用

一门在html中查找数据的语言。

记住的语法：

/   选取直接子节点（写在表达式开头时表示从根节点开始）  # /html/body/div
//  选取所有后代节点（不限层级）  # //a
标签名字 # //p
.   当前路径  # ./p .//p
..  上一层  # //body//a[1]/..     
@   取属性  # //a[@href="image.html"]

示例：

# Sample HTML document: a <div id='images'> holding six <a> links, each with an <img>.
doc = '''
<html>
 <head>
  <base href='http://example.com/' />
  <title href="http://example.com/">Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html' xx='yy'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''

# 以lxml模块为例
from lxml import etree

html = etree.HTML(doc)  # parse the HTML string
# If the HTML lives in a file, parse the file instead:
# html=etree.parse('search.html',etree.HTMLParser())

# 1 All nodes
a=html.xpath('//*')  # every element in the document
a=html.xpath('/*')   # only the root element

# 2 A specific node (the result is a list)
a=html.xpath('//head')  # all <head> tags

# 3 Children and descendants
a=html.xpath('//div/a')
a=html.xpath('//body/a') # no match: the <a> tags are not direct children of <body>
a=html.xpath('//body//a')

# 4 Parent node
a=html.xpath('//body//a[@href="image1.html"]/..')
# a=html.xpath('//body//a[1]/..')  # parent of the first <a> tag
a=html.xpath('//a[1]')  # the first <a> tag
# Equivalent, using the parent axis
a=html.xpath('//body//a[1]/parent::*')

# 5 Matching on an attribute
a=html.xpath('//body//a[@href="image1.html"]')
a=html.xpath('//a[@href="image1.html"]')

a=html.xpath('//base[@href="http://example.com/"]')
a=html.xpath('//*[@href="http://example.com/"]')

# 6 Getting text
a=html.xpath('//body//a[@href="image1.html"]/text()')
a=html.xpath('//a/text()')

# 7 Getting attribute values
a=html.xpath('//body//a/@href')
a=html.xpath('//body//a[1]/@xx')
# Note: positions start at 1, not 0
a=html.xpath('//body//a[1]/@href')

# 8 Matching an attribute that holds several values
# When an <a> tag has several classes, an exact comparison no longer matches it; use contains()
a=html.xpath('//body//a[@class="li"]')

a=html.xpath('//body//a[contains(@class,"li")]')

a=html.xpath('//body//a[contains(@class,"li")]/text()')

# 9 Matching on several attributes
a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 10 Selecting by position (positions are 1-based)
a=html.xpath('//a[2]/text()')
a=html.xpath('//a[2]/@href')
# The predicate belongs on the element, not on the attribute node:
# an attribute has no attributes of its own, so `@href[...]/@href` can never match.
a=html.xpath('//a[contains(@class,"vervideo-lilink")]/@href')
# The last one
a=html.xpath('//a[last()]/@href')
# Positions lower than 3
a=html.xpath('//a[position()<3]/@href')
# The second to last: last()-1 (last()-2 would be the third to last)
a=html.xpath('//a[last()-1]/@href')

# 11 Selecting along an axis
# ancestor: ancestor nodes
# `*` selects every ancestor
a=html.xpath('//a/ancestor::*')
a=html.xpath('//a/ancestor::div')
# Only the <div> ancestors
a=html.xpath('//a/ancestor::div')
# attribute: attribute values
a=html.xpath('//a[1]/attribute::*')
# child: direct children
a=html.xpath('//a[1]/child::*')
a=html.xpath('//a[1]/child::img')
# descendant: all descendants
a=html.xpath('//a[6]/descendant::*')

# following: every node after the current one
a=html.xpath('//a[1]/following::*')
a=html.xpath('//a[1]/following::*[1]/@href')

# following-sibling: siblings after the current node
a=html.xpath('//a[1]/following-sibling::*')
a=html.xpath('//a[1]/following-sibling::a')
a=html.xpath('//a[1]/following-sibling::*[2]')
a = html.xpath('//a[1]/following-sibling::*[2]/@href')

# Only the result of the last query is printed
print(a)

总结：
selenium：

只要人能做的，都可以使用代码实现
隐式，显式等待
模拟前进后退
切换选项卡
执行js
异常处理
动作链：模拟点击，拖拽行为
自动登录12306（前端能够检测到是否使用了selenium）
获取登录的cookie
继续使用selenium：add_cookie
使用requests携带cookie
抽屉半自动点赞

xpath语法：
以lxml为例（lxml是解析器）：lxml和selenium支持xpath和css选择器；bs4支持css选择器，不支持xpath；re使用的是正则表达式。

/
//
.
..
标签名
*
[@属性名='属性值']
class有多个值时：用contains匹配，例如 //a[contains(@class,"li")]

打码平台：

花点钱破解验证码
原理：平台提供了接口，使用http传递图片给它，它给你破解，返回破解后的数据
本质用了requests模块
字母和数字：图像识别---->ocr技术
复杂的：人工

每天早睡

关注

25
点赞
踩
21

收藏

觉得还不错? 一键收藏
打赏
0
评论
selenium用法

selenium：只要人能做的，都可以使用代码实现；隐式、显式等待；模拟前进后退；切换选项卡；执行js；异常处理；动作链：模拟点击、拖拽行为；自动登录12306（前端能够检测到是否使用了selenium）；获取登录的cookie；继续使用selenium：add_cookie；使用requests携带cookie。xpath语法：以lxml为例（lxml是解析器），lxml和selenium支持xpath和css选择器，bs4支持css选择器，re使用的是正则表达式。
复制链接

扫一扫