webdriver与Chrome浏览器 对应版本下载(Mac)
- 查询本机Chrome版本
Chrome浏览器右边三个小点---帮助---关于Google Chrome
- 下载chromedriver对应版本
chromedriver与Chrome浏览器 对应版本下载 - 把webdriver移动到对应目录(/usr/local/bin)
使用command+空格键 出现聚焦搜索,输入/usr/local/bin进行ChromeDriver环境配置,将下载好的chromedriver放入
- 打开文件
open /usr/local/bin
代理部分
- Chrome会自动使用本机代理
- Firefox需要进入
设置--网络设置--使用系统代理(确定)
1.查找元素
https://blog.csdn.net/qq_32897143/article/details/80383502
一般根据xpath定位到元素之后获取文本和属性
- text
- 获取标签文本内容
- get_attribute('属性')
- 获取元素属性值
# Locate the container element by absolute XPath, then read its visible text.
list_info = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div[3]/div[1]/div')
str1 = list_info.text
2.显式等待和隐式等待
显式等待:在 一段时间内 查到元素就执行,查不到报错。
隐式等待:类似于time.sleep(10),在等待一段时间后,才开始查找元素。
3.关闭(close和quit)
close:关闭当前页面。
quit:关闭浏览器。
4.存在内嵌frame导致无法定位元素
# Switch into the embedded iframe so that elements inside it become locatable.
fr=browser.find_element_by_xpath("//iframe[contains(@src,'https://uac.10010.com/portal/homeLoginNew')]")
browser.switch_to.frame(fr)
4.切换窗口句柄
# Switch the driver to the most recently opened window.
logging.info('change handle---')
handle = self.driver.window_handles  # all window handles, returned as a list
self.driver.switch_to.window(handle[-1])  # switch to the newest handle
time.sleep(6)
4.存在shadow-root(open)
# Fetch the shadow host node via JavaScript; elements under it must then be
# located with CSS selectors — XPath does not work inside a shadow root.
# Use the browser debugger first to confirm the shadow node contains the
# text you need.
shadow = self.driver.execute_script('return document.getElementsByClassName("igraal-shadow")[0]')
print('----')
print(shadow)
print(shadow.text)
5.chrome浏览器被识别-基础代码部分
from selenium import webdriver
def getDriver(driver_path=r'D:\mydriver\chromedriver.exe'):
    """Create a Chrome WebDriver configured to evade basic automation checks.

    Args:
        driver_path: Path to the chromedriver executable. Defaults to the
            original hard-coded location; a raw string is used so backslashes
            in the Windows path are never treated as escape sequences.

    Returns:
        A configured ``selenium.webdriver.Chrome`` instance.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-gpu")
    # options.add_argument("--no-sandbox")  # linux only
    # options.add_argument('--proxy-server=127.0.0.1:8080')  # proxy; target url may fail to open
    # Drop the "controlled by automated software" infobar and the automation
    # extension — both are used by some sites to detect selenium.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    driver = webdriver.Chrome(executable_path=driver_path, options=options)
    driver.execute_cdp_cmd("Network.enable", {})
    driver.execute_cdp_cmd("Network.setExtraHTTPHeaders", {"headers": {"User-Agent": "browserClientA"}})
    # Inject before any page script runs, so fingerprinting code sees
    # navigator.webdriver as undefined instead of true.
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
    })
    return driver
# Build the stealth driver and open the target page.
driver = getDriver()
driver.maximize_window()
driver.get('https://www.baidu.com')  # replace baidu with the target url
6.firefox设置无头和代理
# selenium version ---- 3.141.0
profile=webdriver.FirefoxOptions()
profile.add_argument('-headless')  # enable headless mode
# Configure the proxy server
profile.set_preference('network.proxy.type', 1)  # 1 = manual proxy configuration
profile.set_preference('network.proxy.http',IP)  # IP: proxy host as a string, e.g. '127.0.0.1'
profile.set_preference('network.proxy.http_port', PORT)  # PORT: proxy port as an int, e.g. 9999
# self.profile.set_preference('network.proxy.type', 1)
# self.profile.set_preference('network.proxy.http', PROXY_VY['host'])
# self.profile.set_preference('network.proxy.http_port', int(PROXY_VY['port']))
# self.profile.set_preference('network.proxy.ssl', PROXY_VY['host'])
# self.profile.set_preference('network.proxy.ssl_port', int(PROXY_VY['port']))
# self.profile.set_preference("network.proxy.username", PROXY_VY['username'])
# self.profile.set_preference("network.proxy.password", PROXY_VY['password'])
driver=webdriver.Firefox(options=profile)
7.selenium移动鼠标下拉获取数据demo
# -*- coding: utf-8 -*-
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from some_def import user, pwd
from my_mysql import conn
import time
def save_data(cs1, data_set, keyword):
    """Insert (keyword, href) pairs not yet stored, then return how many
    distinct hrefs exist for *keyword*.

    Args:
        cs1: Open database cursor used for the INSERT and the count query.
        data_set: List of [keyword, href] pairs scraped from the page.
        keyword: Search keyword the hrefs belong to.

    Returns:
        True when ``data_set`` is empty (nothing to do); otherwise the
        number of distinct hrefs stored for the keyword (cursor rowcount).

    Relies on the module-level connection ``conn`` via ``query_db``.
    """
    if not data_set:
        return True
    # Target table name.
    table_name = "ins_tag_count"
    # Keep only rows whose href is not already stored for this keyword.
    # NOTE: the original removed matched rows from data_set while iterating
    # it, which skips elements; building a fresh list avoids that bug.
    data_set_new = []
    for data_one in data_set:
        href = data_one[1]
        # Parameterized query — href/keyword come from scraped pages and
        # must never be interpolated directly into SQL.
        exists = query_db(
            conn,
            f"SELECT href FROM {table_name} WHERE href=%s AND keyword=%s",
            (href, keyword),
            one=True,
        )
        if not exists:
            data_set_new.append(data_one)
    cs1.executemany(
        f"INSERT INTO {table_name} (keyword,href) VALUES (%s,%s)",
        data_set_new,
    )
    # rowcount of this SELECT = number of distinct hrefs for the keyword.
    count = cs1.execute(
        f"SELECT DISTINCT(href) FROM {table_name} WHERE keyword=%s",
        (keyword,),
    )
    return count
# Query helper wrapping cursor lifecycle.
def query_db(db, query, args=(), one=False):
    """Execute *query* with *args* on a fresh cursor and return the rows.

    Returns a single row (``fetchone``) when ``one`` is true, otherwise
    every row (``fetchall``).
    """
    with db.cursor() as cursor:
        cursor.execute(query, args)
        fetch = cursor.fetchone if one else cursor.fetchall
        return fetch()
def get_count(driver, url, keyword):
    """Log in, open the page for *keyword*, scroll to load posts, and persist
    each post href via save_data.

    Stops once the distinct-href count has failed to grow 5 consecutive
    scrolls. Relies on module-level ``cs1`` (cursor) and ``conn``
    (connection), plus ``user``/``pwd`` credentials.

    NOTE(review): the find_element_by_* API is the selenium 3 style used
    throughout this file.
    """
    # Login page URL.
    url_1 = 'url_1'
    # Open the browser at the login page.
    driver.get(url_1)
    # Log in.
    time.sleep(1)
    driver.find_element_by_xpath('//*[@id="loginForm"]/div/div[1]/div/label/input').send_keys(user)
    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(pwd)
    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="loginForm"]/div/div[3]/button').click()
    # Dismiss the "save login info" prompt.
    time.sleep(3)
    driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/div/div/div/button').click()
    time.sleep(3)
    # Visit the page whose data we want to collect.
    url_2 = url
    driver.get(url_2)
    time.sleep(8)
    # The literal is the site's Chinese "page does not exist" marker.
    if '页面不存在' in driver.page_source:
        print('页面不存在')
        return
    try:
        # Read the total post count shown in the page header.
        node = driver.find_elements_by_xpath('//*[@id="react-root"]/section/main/header/div[2]/div/div[2]/span/span')
        if node:
            count_all = node[0].text
            print(f'总帖数是{count_all}')
    except Exception as e:
        count_all = 0
    count = 0
    count2 = 0
    retry = 5
    first = True
    while retry > 0:
        # Collect the hrefs currently rendered on the page.
        time.sleep(6)
        data_list = []
        # First pass reads the "top posts" section; later passes read the feed.
        if first:
            first = False
            res = driver.find_elements_by_xpath('//*[@id="react-root"]/section/main/article/div/div/div/div//a')
        else:
            res = driver.find_elements_by_xpath('//*[@id="react-root"]/section/main/article/div[2]/div/div//a')
        for i in res:
            data_one = []
            keyword = keyword  # no-op self-assignment kept verbatim
            href = i.get_attribute('href')
            data_one.append(keyword)
            data_one.append(href)
            data_list.append(data_one)
        print('data_list', data_list)
        count = save_data(cs1=cs1, data_set=data_list, keyword=keyword)
        # If the stored distinct count did not grow, burn one retry;
        # otherwise reset the retry budget.
        if count2 == count:
            retry -= 1
            print(f'重复了{5-retry}次,最多5次')
        else:
            count2 = count
            retry = 5
            print(f'目前存入{count}条不重复的数据')
        conn.commit()
        # Scroll the last link into view to trigger lazy loading of more posts.
        target = res[-1]
        driver.execute_script("arguments[0].scrollIntoView();", target)
if __name__ == '__main__':
    key_words = ['test']
    # for i in range(len(key_words)):
    i = 0
    cs1 = conn.cursor()
    chrome_options = Options()
    # chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    key_word = key_words[i]
    # Hashtag keywords drop the '#' and use the tag-page URL form.
    if '#' in key_word:
        key_word = key_word.replace('#', '')
        url = 'url'
    else:
        url = 'url'
    try:
        get_count(driver=driver, url=url, keyword=key_word)
    except Exception as e:
        print('error occur')
        print(e)
    finally:
        cs1.close()
        conn.close()
        # quit() terminates the whole browser and the chromedriver process;
        # close() would only close the current window and leak the process.
        driver.quit()
显式等待和隐式等待
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Explicit wait: poll the DOM for up to 10 seconds until the element is
# present; raises TimeoutException if it never appears.
result = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="loginForm"]/div/div[1]/div/label/input')))
selenium通过xpath获取数据
# Create a Chrome browser instance.
options = webdriver.ChromeOptions()
# options.add_argument('-headless')  # headless mode
options.add_argument("--disable-extensions")
options.add_argument("--disable-gpu")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
browser = webdriver.Chrome(options=options)
browser.get(url)
time.sleep(8)
# Room count of each listing.
list_info = browser.find_elements(by=By.XPATH, value='//div[@class="idt4x4 dir dir-ltr"]/div[3]/span[3]')
selenium遇到other element obscures it
# General workaround: click via JavaScript so an overlay cannot block it.
ele = browser.find_element_by_id("isStudentDan")
browser.execute_script('arguments[0].click()',ele)
# Hide the blocking element — not general; only for popup-ad style overlays.
try:
    ads = driver.find_element(by=By.XPATH, value='//div[@id="eventSHIELD"]')
    if ads:
        driver.execute_script("arguments[0].style.visibility='hidden'", ads)
        time.sleep(2)
# NOTE(review): bare except swallows every error, including typos;
# narrow to selenium's NoSuchElementException.
except:
    print('no ads')
selenium在打开Firefox浏览器时加载插件
import time
from selenium import webdriver
# Default configuration.
# profileDir is the local firefox profile directory.
profileDir = r'C:\Users\Administrator\AppData\Roaming\Mozilla\Firefox\Profiles\aaaaa.default'
# NOTE(review): the geckodriver path below is a non-raw string and only works
# because '\g' is not a recognized escape; prefer a raw string.
profile = webdriver.FirefoxProfile(profileDir)
driver = webdriver.Firefox(firefox_profile=profile, executable_path='D:\geckodriver\geckodriver.exe')
driver.get("https://www.baidu.com/")
time.sleep(5)