一、为避免网站反爬虫,在每一操作步骤间加入等待时间,避免被网页识别为机器从而导致操作失败
1、强制等待:无论页面是否已经加载完成,都固定等待预设的时间;即使网络条件良好,也要等满预设时间,导致整个项目的自动化耗时被不必要地拉长。该方式仅适用于脚本调试过程。
time.sleep(timeout)
2、隐式等待实际是设置了一个最长等待时间,如果在规定时间内网页加载完成,则执行下一步,否则一直等到时间结束,然后执行下一步。隐式等待对整个driver周期都起作用,在最开始设置一次就可以了。不要当作固定等待使用,到哪都来一下隐式等待。
driver.implicitly_wait(timeout)
3、显式等待:指的是设定最长等待时间,并给定一个判断条件,每隔设定的时间间隔去检查判断条件是否成立。如果成立则往下执行,否则一直检查条件,直到达到最长等待时间(超时后抛出 TimeoutException)。该方法是最常用的。参数有:driver(浏览器驱动)、timeout(最长等待时间,单位秒)、poll_frequency(检查间隔,默认0.5秒)。
这个模块中,一共只有两种方法:until 与 until_not
WebDriverWait(driver, 10, 0.5)
二、CSDN笔记链接:CSDN笔记链接
三、option常用参数
options = Options()
#关闭'chrome正受到自动测试软件的控制'提示
options.add_experimental_option('useAutomationExtension', False)
options.add_experimental_option("excludeSwitches", ['enable-automation'])
webdriver.Chrome(options=options)
options一些常用参数统计
options.add_argument(f"--proxy-server=http://183.166.149.193:20005") # 使用代理
options.add_argument('--disable-infobars') # 禁止策略化
options.add_argument('--no-sandbox') # 解决DevToolsActivePort文件不存在的报错
options.add_argument('window-size=1920x3000') # 指定浏览器分辨率
options.add_argument('--disable-gpu') # 谷歌文档提到需要加上这个属性来规避bug
options.add_argument('--incognito') # 隐身模式(无痕模式)
options.add_argument('--disable-javascript') # 禁用javascript
options.add_argument('--start-maximized') # 最大化运行(全屏窗口),不设置,取元素会报错
options.add_argument('--hide-scrollbars') # 隐藏滚动条, 应对一些特殊页面
options.add_argument('blink-settings=imagesEnabled=false') # 不加载图片, 提升速度
options.add_argument('--headless') # 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe" # 手动指定使用的浏览器位置
options.add_argument('lang=en_US') # 设置语言
options.add_argument('User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36') # 设置请求头的User-Agent
options.add_argument('--headless') # 浏览器不提供可视化页面
prefs = {"":""}
prefs["credentials_enable_service"] = False
prefs["profile.password_manager_enabled"] = False
options.add_experimental_option("prefs", prefs) # 屏蔽'保存密码'提示框
网站爬虫python代码示例
# encoding utf-8
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep
from PIL import Image
import cv2
import random
# Page opened by main() and the sub-account login page.
url = 'https://console.cloud.tencent.com/cvm/snapshot/list/detail?rid=8&id=snap-hr0wj5bu&searchParams=cmlkPTg'
url2 = 'https://cloud.tencent.com/login/subAccount?s_url=https%3A%2F%2Fcloud.tencent.com'

# 1. Login parameters typed into the 'account', 'username' and 'password' fields.
account, username, password = 'xxxxx', 'name', 'xxxxxxxxxxxxxxxxxxxxx'

# 2. Slider captcha: pixel corrections applied to the matched x position
#    (the first is subtracted, the second is added).
distance_offset, distance_offset2 = 37, 5

# Files written by save_pic(): the slider piece and the captcha background.
img1, img2 = "./img1.png", "./img2.png"

# Module-level default; the functions in this file compute their own local value.
distance = 0

# Crop box (left, upper, right, lower) of the captcha inside a full screenshot.
rectangle = (789, 351, 1130, 548)

# Browser session shared by every function below; a custom User-Agent replaces
# the default one.
options = webdriver.ChromeOptions()
options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36')
driver = webdriver.Chrome(options=options)
def get_tracks(distance, rate=0.5, t=0.2, v=0):
    """
    Split ``distance`` into a list of small whole-pixel steps.

    The simulated motion accelerates (a = 3) until ``rate * distance`` has
    been covered and decelerates (a = -2) afterwards. Steps are derived from
    the cumulative rounded position, so rounding errors do not accumulate and
    the steps add up to ``round(distance)`` for a positive ``distance``.

    :param distance: total distance to cover
    :param rate: fraction of ``distance`` at which acceleration turns into deceleration
    :param t: duration of one simulation step
    :param v: initial velocity
    :return: list of integer step sizes (empty when ``distance`` <= 0)
    :raises ValueError: if ``t`` is not positive (the loop could never advance)
    """
    if t <= 0:
        raise ValueError("t must be positive")

    tracks = []
    # Displacement at which acceleration switches to deceleration.
    mid = rate * distance
    # Exact (fractional) displacement so far.
    s = 0
    # Whole pixels already handed out in ``tracks``.
    moved = 0
    while s < distance:
        v0 = v
        a = 3 if s < mid else -2
        if v0 + a * t <= 0:
            # Decelerating further would stall or reverse the motion and the
            # loop might never reach ``distance``; speed up for this step.
            a = 3
        # Distance covered during this time slice.
        s0 = v0 * t + 0.5 * a * t * t
        # Velocity at the end of this time slice.
        v = v0 + a * t
        # Never overshoot the requested distance.
        s = min(s + s0, distance)
        # Pixels have no fractions: emit the change of the rounded position.
        target = round(s)
        tracks.append(target - moved)
        moved = target
    print("位移:", tracks)
    return tracks
def save_pic():
    """
    Capture the two images used for gap detection.

    Writes the tightly cropped slider piece to ``img1`` and the captcha
    region with the slider elements hidden to ``img2``. Exits the program
    when the first capture contains only pure-white pixels.
    """
    # Make the background image transparent so only the slider piece shows.
    driver.execute_script('document.getElementsByClassName("tc-bg-img unselectable")[0].style.opacity = "0"')
    sleep(2)
    driver.save_screenshot("./screen1.png")
    # Restore the background image.
    driver.execute_script('document.getElementsByClassName("tc-bg-img unselectable")[0].style.opacity = "1"')

    # Cut the captcha region out of the full-page screenshot.
    Image.open("./screen1.png").crop(rectangle).save(img1)

    # Locate the slider piece: every pixel that is not pure white.
    piece = Image.open(img1)
    width, height = piece.size
    piece = piece.convert("RGB")  # guarantees 3-channel pixels
    colored = [
        (x, y)
        for x in range(width)
        for y in range(height)
        if piece.getpixel((x, y)) != (255, 255, 255)
    ]
    if not colored:
        print("Error: no color pixels found")
        exit()

    # Smallest rectangle containing all non-white pixels.
    xs = [x for x, _ in colored]
    ys = [y for _, y in colored]
    bounding_box = (min(xs), min(ys), max(xs) + 1, max(ys) + 1)
    piece.crop(bounding_box).save(img1)
    sleep(1)

    # Hide the slider elements so only the background remains visible.
    for index in (1, 2):
        driver.execute_script(f'document.getElementsByClassName("tc-fg-item")[{index}].style.opacity = "0"')
    sleep(1)
    driver.save_screenshot("./screen2.png")
    sleep(1)

    # Cut the captcha region out of the second screenshot.
    Image.open("./screen2.png").crop(rectangle).save(img2)

    # Show the slider elements again.
    for index in (1, 2):
        driver.execute_script(f'document.getElementsByClassName("tc-fg-item")[{index}].style.opacity = "1"')
def discern_distance():
    """
    Locate the slider piece inside the captcha background and return the
    horizontal drag distance.

    Reads the background from ``img2`` and the slider piece from ``img1``,
    matches their edge maps with ``cv2.matchTemplate`` and writes a debug
    image with the best match outlined to ``./out.png``.

    :return: x coordinate of the best match, corrected by ``distance_offset``
             and ``distance_offset2``
    :raises FileNotFoundError: if either image cannot be read
    """
    # Debug output path; it was previously only defined in a commented-out
    # line, so the imwrite call below raised NameError.
    out = './out.png'

    bg_img = cv2.imread(img2)  # captcha background
    tp_img = cv2.imread(img1)  # slider piece used as the template
    # cv2.imread returns None instead of raising when a file is unreadable.
    if bg_img is None or tp_img is None:
        raise FileNotFoundError(f"cannot read captcha images: {img2!r}, {img1!r}")

    # Edge detection makes the match less dependent on colour and brightness.
    bg_edge = cv2.Canny(bg_img, 100, 200)
    tp_edge = cv2.Canny(tp_img, 100, 200)
    # Convert the single-channel edge maps to 3-channel images.
    bg_pic = cv2.cvtColor(bg_edge, cv2.COLOR_GRAY2RGB)
    tp_pic = cv2.cvtColor(tp_edge, cv2.COLOR_GRAY2RGB)

    # Template matching; the location of the maximum is the best match.
    res = cv2.matchTemplate(bg_pic, tp_pic, cv2.TM_CCOEFF_NORMED)
    _, _, _, max_loc = cv2.minMaxLoc(res)
    th, tw = tp_pic.shape[:2]
    tl = max_loc  # top-left corner of the match
    print(tl)
    br = (tl[0] + tw, tl[1] + th)  # bottom-right corner of the match
    cv2.rectangle(bg_img, tl, br, (0, 0, 255), 2)
    cv2.imwrite(out, bg_img)

    # Convert the match position into the drag distance.
    distance = tl[0] - distance_offset + distance_offset2
    print("计算位移", distance)
    return distance
def shake_mouse():
    """
    Imitate the small jitter of a hand just before the mouse is released:
    move 2 px to the left, then 2 px back to the right.
    """
    for dx in (-2, 2):
        ActionChains(driver).move_by_offset(xoffset=dx, yoffset=0).perform()
def slide(max_attempts=5):
    """
    Drag the captcha slider by the distance reported by discern_distance().

    Each attempt moves roughly 80% of the distance in one go and covers the
    remainder in the small steps returned by get_tracks(). The original
    ``while True`` loop had no exit; attempts are now bounded.

    :param max_attempts: maximum number of drag attempts
    :return: True once the slider element is no longer displayed after a
             drag, False when all attempts are used up
    """
    # Local import keeps this block self-contained.
    from selenium.common.exceptions import WebDriverException

    distance = discern_distance()
    # Locate the slider handle.
    block = driver.find_element(By.CSS_SELECTOR, '#tcOperation > div.tc-fg-item.tc-slider-normal')
    # Split into whole pixels so the coarse and fine parts add up to distance.
    coarse = int(distance * 0.8)

    # NOTE(review): as in the original, the distance is not recomputed
    # between attempts; if the page swaps the captcha image after a failed
    # try, save_pic()/discern_distance() would have to run again - confirm.
    for _ in range(max_attempts):
        print("distance:", distance)
        # Press and hold the slider handle.
        ActionChains(driver).click_and_hold(block).perform()
        # Fast first part of the drag.
        ActionChains(driver).move_by_offset(coarse, 0).perform()
        # Remaining part in small steps.
        tracks = get_tracks(distance - coarse)
        for track in tracks:
            ActionChains(driver).move_by_offset(track, 0).perform()
            # Random pause of up to 1/100 s between steps to look less mechanical.
            sleep(random.random() / 100)
        shake_mouse()
        sleep(random.random())
        ActionChains(driver).release().perform()

        # Give the page a moment to react before checking the result.
        sleep(2)
        try:
            if not block.is_displayed():
                return True
        except WebDriverException:
            # Presumably the captcha frame was removed after a successful
            # verification, which invalidates the element - TODO confirm.
            return True
    return False
def rollback():
    """
    Open the rollback dialog, tick its checkbox, look up the confirm button
    and close the browser. The confirming click itself is left disabled.
    """
    sleep(3)
    # Find the element whose text is '回滚' and click it with the mouse.
    rollback_button = driver.find_element(By.XPATH, "//*[text()='回滚']")
    click_action = ActionChains(driver)
    click_action.move_to_element(rollback_button)
    click_action.click()
    click_action.perform()
    # Query the checkbox through JavaScript; the result is not used.
    driver.execute_script("return document.querySelector('input[type=checkbox]')")
    sleep(3)
    # Tick the checkbox.
    tick_box = driver.find_element(By.CSS_SELECTOR, 'input[type="checkbox"]')
    tick_box.click()
    sleep(2)
    # Look up the element whose text is '确定'; the click stays disabled:
    confirm_button = driver.find_element(By.XPATH, "//*[text()='确定']")
    # ActionChains(driver).move_to_element(confirm_button).click().perform()
    driver.quit()
def main():
    """
    Log in through the sub-account form, solve the slider captcha if the
    browser did not end up on ``url``, then run rollback().

    The browser is closed even when a step raises.
    """
    try:
        driver.maximize_window()
        driver.get(url)
        # Switch to the sub-account (account + user name + password) login form.
        driver.find_element(By.CLASS_NAME, 'J-subAccountLogin').click()
        # Fill in the credentials with short fixed pauses in between.
        for field_id, value in (('account', account), ('username', username), ('password', password)):
            driver.find_element(By.ID, field_id).clear()
            driver.find_element(By.ID, field_id).send_keys(value)
            # Clicking too quickly can hit elements that are not interactable yet.
            sleep(1)
        # Click the login button through JavaScript so an overlapping element
        # cannot block the click.
        login = driver.find_element(By.CLASS_NAME, 'J-submitLogin')
        driver.execute_script("arguments[0].click();", login)
        # driver.implicitly_wait(3)
        # Wait for the slider captcha to appear.
        sleep(5)
        if driver.current_url != url:
            print(driver.current_url, "\n", url)
            # The captcha lives inside its own iframe.
            iframe = driver.find_element(By.XPATH, '//*[@id="tcaptcha_iframe_dy"]')
            driver.switch_to.frame(iframe)
            # Wait until the captcha image is present.
            WebDriverWait(driver, 10, 0.5).until(EC.presence_of_element_located((By.ID, 'slideBg')))
            save_pic()
            # slide() calls discern_distance() itself, so the extra call that
            # used to sit here was redundant.
            slide()
            # Return to the top-level document; otherwise the lookups in
            # rollback() would run inside the captcha iframe.
            driver.switch_to.default_content()
        rollback()
    finally:
        # Always release the browser, also when a step above failed.
        driver.quit()


if __name__ == '__main__':
    main()