selenium的版本及操作


webdriver与Chrome浏览器 对应版本下载(Mac)

  • 查询本机Chrome版本
    Chrome浏览器右边三个小点---帮助---关于Google Chrome
  • 下载chromedriver对应版本
    chromedriver与Chrome浏览器 对应版本下载
  • 把webdriver移动到对应目录(/usr/local/bin)
    使用command+空格键 出现聚焦搜索,输入/usr/local/bin进行ChromeDriver环境配置,将下载好的chromedriver放入
  • 打开文件
open /usr/local/bin

代理部分

  • Chrome会自动使用本机代理
  • Firefox需要进入 设置--网络设置--使用系统代理(确定)

1.查找元素

https://blog.csdn.net/qq_32897143/article/details/80383502

一般根据xpath定位到元素之后获取文本和属性
  • text
  • 获取标签文本内容
  • get_attribute('属性')
  • 获取元素属性值
# Locate a single element by its absolute XPath, then read its text through `.text`.
list_info = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div[3]/div[1]/div')
str1 = list_info.text

2.显式等待和隐式等待

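显式等待:通过WebDriverWait为某个条件设置最长等待时间,期间不断检查,条件满足(如元素出现)立即继续执行,超时仍不满足则抛出TimeoutException。
隐式等待:通过driver.implicitly_wait(10)全局设置一次,之后每次查找元素若未立即找到,会在该时间内持续重试,找到即返回,超时才报错;它并不等同于time.sleep(10)的固定等待。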

3.关闭(close和quit)

close:关闭当前页面。
quit:关闭浏览器。

4.存在内嵌frame导致无法定位元素

# Switch into the embedded iframe so that the elements inside it can be located.
# find_element(by=..., value=...) is used because the find_element_by_xpath
# shortcut is no longer available in Selenium 4.3 and later.
from selenium.webdriver.common.by import By

fr = browser.find_element(by=By.XPATH, value="//iframe[contains(@src,'https://uac.10010.com/portal/homeLoginNew')]")
browser.switch_to.frame(fr)

4.切换窗口句柄

# Switch to another window handle
logging.info('change handle---')
handle = self.driver.window_handles  # fetch the window handles; the result is a list
self.driver.switch_to.window(handle[-1])  # switch to the last handle in the list (the newest one, per the author)
time.sleep(6)  # fixed pause after switching

4.存在shadow-root(open)

# Fetch the shadow node; per the author, elements below it must afterwards be
# located with CSS selectors, XPath cannot be used.
# Use the debugger to check whether the shadow node holds the text you need.
# NOTE(review): the script returns the first element with class "igraal-shadow";
# no `.shadowRoot` access appears in this snippet - confirm that is intended.
shadow = self.driver.execute_script('return document.getElementsByClassName("igraal-shadow")[0]')
print('----')
print(shadow)
print(shadow.text)

5.Chrome浏览器被识别-基础代码部分

from selenium import webdriver


def getDriver(executable_path=r'D:\mydriver\chromedriver.exe'):
    """Start Chrome with the common automation markers switched off.

    Args:
        executable_path: Location of the chromedriver binary. The default is
            the path this snippet used to hard-code; it is written as a raw
            string so the backslashes are not read as escape sequences.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    # NOTE(review): this uses the `executable_path` keyword of webdriver.Chrome;
    # newer Selenium 4 releases expect a Service object instead - confirm
    # against the installed Selenium version.
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-gpu")
    # options.add_argument("--no-sandbox")  # linux only
    # options.add_argument('--proxy-server=127.0.0.1:8080')  # set a proxy (target url could not be opened)
    # Exclude the "enable-automation" switch and turn off the automation extension.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    driver = webdriver.Chrome(executable_path=executable_path, options=options)
    # Send a custom User-Agent header with every request.
    driver.execute_cdp_cmd("Network.enable", {})
    driver.execute_cdp_cmd("Network.setExtraHTTPHeaders", {"headers": {"User-Agent": "browserClientA"}})
    # Registered for every new document: makes navigator.webdriver read as undefined.
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
        """
    })
    return driver

# Build the driver, maximise the browser window and open the page.
driver = getDriver()
driver.maximize_window()
driver.get('https://www.baidu.com')  # replace Baidu with the target url

6.firefox设置无头和代理

# selenium version ---- 3.141.0
profile = webdriver.FirefoxOptions()
profile.add_argument('-headless')  # enable headless mode
# Configure the proxy server
profile.set_preference('network.proxy.type', 1)  # turn on manual proxy configuration
profile.set_preference('network.proxy.http', IP)  # IP: address of your proxy server as a string, e.g. '127.0.0.1'
profile.set_preference('network.proxy.http_port', PORT)  # PORT: port of the proxy server as an int, e.g. 9999
# HTTPS requests use their own pair of preferences; without them only plain
# HTTP traffic goes through the proxy.
profile.set_preference('network.proxy.ssl', IP)
profile.set_preference('network.proxy.ssl_port', PORT)
# Variant left commented out by the author, reading the proxy from a PROXY_VY dict:
# self.profile.set_preference('network.proxy.type', 1)
# self.profile.set_preference('network.proxy.http', PROXY_VY['host'])
# self.profile.set_preference('network.proxy.http_port', int(PROXY_VY['port']))
# self.profile.set_preference('network.proxy.ssl', PROXY_VY['host'])
# self.profile.set_preference('network.proxy.ssl_port', int(PROXY_VY['port']))
# self.profile.set_preference("network.proxy.username", PROXY_VY['username'])
# self.profile.set_preference("network.proxy.password", PROXY_VY['password'])
driver = webdriver.Firefox(options=profile)

7.selenium移动鼠标下拉获取数据demo

# -*- coding: utf-8 -*-

import json
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from some_def import user, pwd
from my_mysql import conn
import time


def save_data(cs1, data_set, keyword):
    """Insert the rows of ``data_set`` that are not stored yet.

    Every row is looked up by ``href`` and ``keyword``; rows already present
    in the table are dropped and the remaining ones are inserted in a single
    ``executemany`` call. ``data_set`` itself is left unmodified: removing
    items from it during the loop made the iteration skip the row after each
    removed one, so that row was never inserted.

    All values are passed as query parameters instead of being formatted into
    the SQL text, so hrefs containing quotes cannot break or alter the
    statements.

    Args:
        cs1: Database cursor used for the insert and for the final query.
        data_set: Sequence of rows whose second item (index 1) is the href;
            the rows are inserted into the ``(keyword, href)`` columns.
        keyword: Keyword the rows belong to.

    Returns:
        ``True`` when ``data_set`` is empty. Otherwise the return value of
        ``cs1.execute`` for the distinct-href query - presumably the number
        of distinct hrefs stored for ``keyword``; confirm for the database
        driver in use.
    """
    if not data_set:
        return True
    # Target table
    table_name = "ins_tag_count"
    # Keep only the rows that are not stored yet.
    data_set_new = []
    for data_one in data_set:
        href = data_one[1]
        exists = query_db(
            conn,
            f"SELECT href FROM {table_name} WHERE href=%s and keyword=%s",
            (href, keyword),
            one=True,
        )
        if not exists:
            data_set_new.append(data_one)
    if data_set_new:
        cs1.executemany(
            f"INSERT INTO {table_name} (keyword,href) VALUES (%s,%s)",
            data_set_new,
        )
    count = cs1.execute(
        f"select distinct(href) from {table_name} where keyword=%s",
        (keyword,),
    )

    return count


# Thin wrapper around running a query
def query_db(db, query, args=(), one=False):
    """Run ``query`` on a fresh cursor of ``db`` and return the fetched rows.

    Args:
        db: Connection object whose ``cursor()`` result works as a context manager.
        query: SQL text handed to ``cursor.execute``.
        args: Parameters handed to ``cursor.execute`` together with ``query``.
        one: When truthy, return ``fetchone()``; otherwise return ``fetchall()``.

    Returns:
        The result of ``fetchone()`` or ``fetchall()`` on the cursor.
    """
    fetch_method = "fetchone" if one else "fetchall"
    with db.cursor() as cursor:
        cursor.execute(query, args)
        return getattr(cursor, fetch_method)()


def get_count(driver, url, keyword):
    """Log in, open ``url`` and store the links found while scrolling the page.

    After signing in, the function loads ``url`` and then repeats one round
    until the stored total has stayed unchanged five times in a row: collect
    the links of the current page, pass them to ``save_data``, commit, and
    scroll the last link into view so that the page can load more.

    The function relies on module-level names: ``user`` and ``pwd`` for the
    login form, ``conn`` for the commit and ``cs1`` (the cursor created in the
    ``__main__`` block) for ``save_data``.

    Args:
        driver: Selenium WebDriver used for all browser interaction.
        url: Address of the page whose links are collected.
        keyword: Keyword stored together with every collected href.

    Returns:
        None. Returns early when the page source contains '页面不存在'.
    """
    # Local import: find_element(By.XPATH, ...) replaces the find_element_by_xpath
    # shortcuts, which are no longer available in Selenium 4.3 and later.
    from selenium.webdriver.common.by import By

    # Placeholder address of the page that holds the login form.
    url_1 = 'url_1'
    # Open the browser on that address
    driver.get(url_1)
    # Log in
    time.sleep(1)
    driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[1]/div/label/input').send_keys(user)
    time.sleep(2)
    driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(pwd)
    time.sleep(2)
    driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button').click()
    # Click "do not save login info"
    time.sleep(3)
    driver.find_element(By.XPATH, '//*[@id="react-root"]/section/main/div/div/div/div/button').click()
    time.sleep(3)
    # Open the page the data is collected from
    driver.get(url)
    time.sleep(8)
    if '页面不存在' in driver.page_source:
        print('页面不存在')
        return
    try:
        # Total number of posts shown on the page (informational only)
        node = driver.find_elements(By.XPATH, '//*[@id="react-root"]/section/main/header/div[2]/div/div[2]/span/span')
        if node:
            print(f'总帖数是{node[0].text}')
    except Exception:
        # The total is only printed, never used; failing to read it must not
        # stop the crawl.
        pass
    count = 0   # total reported by the latest save_data call
    count2 = 0  # total seen in the previous round
    retry = 5
    first = True
    while retry > 0:
        # Collect the hrefs
        time.sleep(6)
        if first:
            # First round: the "top posts" section
            first = False
            res = driver.find_elements(By.XPATH, '//*[@id="react-root"]/section/main/article/div/div/div/div//a')
        else:
            res = driver.find_elements(By.XPATH, '//*[@id="react-root"]/section/main/article/div[2]/div/div//a')
        data_list = [[keyword, link.get_attribute('href')] for link in res]
        print('data_list', data_list)
        # save_data returns True for an empty batch, which would be mistaken for
        # a new total; an empty round therefore keeps the previous count and is
        # treated as "no progress".
        if data_list:
            count = save_data(cs1=cs1, data_set=data_list, keyword=keyword)
        if count2 == count:
            retry -= 1
            print(f'重复了{5-retry}次,最多5次')
        else:
            count2 = count
            retry = 5
        print(f'目前存入{count}条不重复的数据')
        conn.commit()
        # Scroll the last link into view so the page loads further content;
        # skipped when nothing was found (res[-1] would raise IndexError).
        if res:
            driver.execute_script("arguments[0].scrollIntoView();", res[-1])


if __name__ == '__main__':
    key_words = ['test']

    # Only the first keyword is processed; loop over key_words to handle more.
    key_word = key_words[0]
    if '#' in key_word:
        key_word = key_word.replace('#', '')
        url = 'url'
    else:
        url = 'url'

    # get_count reads this cursor through the module-level name cs1.
    cs1 = conn.cursor()
    driver = None
    try:
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        # `options=` replaces the deprecated `chrome_options=` keyword. The
        # driver is created inside the try block so that a failed start still
        # closes the cursor and the connection below.
        driver = webdriver.Chrome(options=chrome_options)
        get_count(driver=driver, url=url, keyword=key_word)
    except Exception as e:
        print('error occur')
        print(e)
    finally:
        cs1.close()
        conn.close()
        if driver is not None:
            # quit() closes the whole browser and ends the driver session;
            # close() would only close the current window.
            driver.quit()



显式等待和隐式等待
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Explicit wait: check for up to 10 seconds until the input inside #loginForm is
# present; until() returns the located element or raises once the time is up.
result =  WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="loginForm"]/div/div[1]/div/label/input')))

selenium通过xpath获取数据
# Create a Chrome browser object
options = webdriver.ChromeOptions()
# options.add_argument('-headless')  # enable headless mode
options.add_argument("--disable-extensions")
options.add_argument("--disable-gpu")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
browser = webdriver.Chrome(options=options)
browser.get(url)
time.sleep(8)  # fixed pause, presumably to let the page finish loading
# Number of rooms of the house (find_elements returns a list of matching elements)
list_info = browser.find_elements(by=By.XPATH, value='//div[@class="idt4x4 dir dir-ltr"]/div[3]/span[3]')

selenium遇到other element obscures it
# Generic approach: click through JavaScript instead of a native click.
# find_element(by=By.ID, ...) is used because the find_element_by_id shortcut
# is no longer available in Selenium 4.3 and later.
from selenium.webdriver.common.by import By

ele = browser.find_element(by=By.ID, value="isStudentDan")
browser.execute_script('arguments[0].click()', ele)

# Hide the obstructing element - not generic, limited to pop-up ad overlays
from selenium.common.exceptions import NoSuchElementException

try:
    ads = driver.find_element(by=By.XPATH, value='//div[@id="eventSHIELD"]')
except NoSuchElementException:
    # find_element raises when the overlay is absent, so there is nothing to
    # hide. Only this case is handled; other errors are no longer swallowed.
    print('no ads')
else:
    driver.execute_script("arguments[0].style.visibility='hidden'", ads)
    time.sleep(2)

selenium在打开Firefox浏览器时加载插件

import time
from selenium import webdriver

# Default configuration
# profileDir is the path of the local Firefox profile directory

profileDir = r'C:\Users\Administrator\AppData\Roaming\Mozilla\Firefox\Profiles\aaaaa.default'
profile = webdriver.FirefoxProfile(profileDir)

# The driver path is a raw string so that the backslashes of the Windows path
# are taken literally instead of being parsed as escape sequences.
driver = webdriver.Firefox(firefox_profile=profile, executable_path=r'D:\geckodriver\geckodriver.exe')
driver.get("https://www.baidu.com/")
time.sleep(5)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值