最近跟着书学习爬虫,发现书上关于极验验证码的内容已经过时了,按照书上的步骤操作已经得不到预期的效果,于是就自己动手写了一个滑动验证码的破解程序。
工具:Selenium、Python、MongoDB
基本思路:
1.使用selenium工具访问b站
2.自动输入用户名和密码
3.获取验证码图片
4.获取带缺口的验证码图片
5.识别缺口位置
6.算法模拟人拖动滑条
考虑到b站自动登录似乎没有什么商业价值,这边就直接上源码了
代码
虽然代码写法比较另类,但识别原理和网上大部分已经不可用的例子其实差别不大:主要是对验证码截图的位置进行了修正,另外在拖动滑条时模拟人的来回校准行为,使得自动登录重新可用;最后用 MongoDB 记录成功与失败次数。
from selenium import webdriver
import time
from io import BytesIO
from PIL import Image
from selenium.webdriver.common.action_chains import ActionChains
import pymongo
# The WebDriver is not created here; the __main__ loop starts a fresh Chrome
# instance for every login attempt.
client = pymongo.MongoClient('localhost', 27017)  # connect to the local MongoDB server
db = client['test']  # database holding the statistics
collection = db['login_bilibili']  # collection that receives one record per attempt
# A ``global`` statement at module level does not create the names; the
# counters receive their values when counts() is called.
global success, failed
def counts():
    """Reset the module-level ``success`` and ``failed`` counters to zero."""
    global success, failed
    success = failed = 0
def get_img():
    """Take a page screenshot and crop out the captcha picture.

    Reads the module-level ``browser``. The crop rectangle is derived from the
    location and size of the element with class ``gt_box``, shifted by fixed
    pixel offsets.

    Returns:
        The cropped PIL image.
    """
    # Imported locally so this function does not depend on any other edit.
    from selenium.webdriver.common.by import By

    screenshot = Image.open(BytesIO(browser.get_screenshot_as_png()))
    # find_element_by_class_name() was removed in Selenium 4.3;
    # find_element(By.CLASS_NAME, ...) works on both old and new releases.
    box = browser.find_element(By.CLASS_NAME, 'gt_box')
    origin = box.location
    size = box.size
    print('imgsize:', size)
    # Fixed corrections applied to the element's reported position so that the
    # crop covers the picture as it appears in the screenshot.
    # NOTE(review): these values look tuned for one specific window/layout.
    top = origin['y'] + 60
    bottom = origin['y'] + 80 + size['height']
    left = origin['x'] + 150
    right = origin['x'] + size['width'] + 180
    return screenshot.crop((left, top, right, bottom))
def difference(img1, img2, x, y, threshold=60):
    """Tell whether the pixel at ``(x, y)`` is similar in both images.

    The first three channels of each pixel are compared one by one. The pixels
    count as similar only when every channel differs by less than
    ``threshold``. A pixel that is not similar is taken to belong to the
    puzzle gap.

    Args:
        img1: Image object whose ``load()`` result can be indexed with ``[x, y]``.
        img2: Second image, accessed the same way.
        x: Column of the pixel.
        y: Row of the pixel.
        threshold: Exclusive upper bound for the per-channel difference.

    Returns:
        True when the pixels are similar, False otherwise. The two pixel
        values are printed when they are not similar.
    """
    pixel1 = img1.load()[x, y]
    pixel2 = img2.load()[x, y]
    if all(abs(pixel1[c] - pixel2[c]) < threshold for c in range(3)):
        return True
    print(pixel1, pixel2)
    return False
def get_crave(img1, img2, ahead=80):
    """Find the first column at which the two captcha images differ.

    Columns are scanned left to right starting at ``ahead``; within a column
    the rows are scanned top to bottom. The scan stops at the first pixel that
    ``difference`` reports as not similar.

    Args:
        img1: Reference image (its ``size`` defines the scanned area).
        img2: Image to compare against ``img1``.
        ahead: First column to inspect; columns left of it are skipped.

    Returns:
        The column index of the first differing pixel, or None when no pixel
        differs in the scanned area.
    """
    width, height = img1.size
    for column in range(ahead, width):
        for row in range(height):
            if not difference(img1, img2, column, row):
                print('第%s列' % column)
                return column
    # Nothing differed: state the "not found" result explicitly.
    return None
def get_track(s):
    """Build the list of per-step horizontal offsets for a drag of ``s`` pixels.

    The track has 36 steps, each covering a time slice of 0.05:

    * steps 0-20 accelerate with ``a1 = 8*s/5``;
    * steps 21-35 decelerate with ``a2 = -27*s/8``, starting from the speed
      ``8*s/5``.

    For positive ``s`` the last few offsets are negative, so the pointer
    overshoots and drifts back; the offsets add up to roughly ``s``
    (about ``1.006 * s``).

    Args:
        s: Total distance to cover.

    Returns:
        A list of 36 offsets, one per step.
    """
    total_steps = 36
    accel_steps = 21
    dt = 0.05  # duration of one step
    a1 = 8 * s / 5
    a2 = -27 * s / 8
    track = []
    for step in range(total_steps):
        if step < accel_steps:
            # Speed reached after step/20 time units of acceleration.
            v0 = a1 * (step / 20)
            track.append(v0 * dt + 0.5 * a1 * dt * dt)
        else:
            # Speed after (step - 20)/20 time units of braking.
            v0 = 8 * s / 5 + a2 * (-20 + step) / 20
            track.append(v0 * dt + 0.5 * a2 * dt * dt)
    return track
def _calibrate(crave):
    """Apply the range-dependent correction to the measured drag distance."""
    # The ranges are mutually exclusive so a value is corrected at most once
    # (applying them one after another turned e.g. 133 into 137 and then 132).
    if crave < 135:
        return crave + 4
    if 135 < crave < 189:
        return crave - 5
    if crave >= 190:
        return crave - 19
    # Exactly 135 and 189 fall between the ranges and stay unchanged.
    return crave


def login(user, pwd):
    """Run one login attempt with the slider captcha on the module-level ``browser``.

    Fills in the credentials, captures the captcha before and while the knob
    is held, measures the gap position and drags the knob there. The browser
    is always closed before returning or raising.

    Args:
        user: Account name typed into the ``login-username`` field.
        pwd: Password typed into the ``login-passwd`` field.

    Returns:
        The column reported by ``get_crave`` when the resulting distance is
        below 85 (the attempt is abandoned and counted in ``failed``);
        otherwise an empty string.

    Raises:
        TypeError: When ``get_crave`` returns None (no gap found).
        Any exception raised by Selenium is propagated unchanged.
    """
    global failed
    # Imported locally so this function does not depend on any other edit.
    from selenium.webdriver.common.by import By

    try:
        browser.get('https://passport.bilibili.com/login')  # open the login page
        time.sleep(1.5)
        # find_element_by_id()/find_element_by_class_name() were removed in
        # Selenium 4.3; find_element(By..., ...) works on old and new releases.
        username = browser.find_element(By.ID, 'login-username')  # user name input
        username.send_keys(user)
        passwd = browser.find_element(By.ID, 'login-passwd')  # password input
        passwd.send_keys(pwd)
        button = browser.find_element(By.CLASS_NAME, "gt_slider_knob")
        ActionChains(browser).move_to_element(button).perform()  # hover over the knob without clicking
        time.sleep(0.7)
        img1 = get_img()  # picture without the gap
        ActionChains(browser).click_and_hold(
            browser.find_element(By.CLASS_NAME, "gt_slider_knob")
        ).perform()  # press and keep holding
        img2 = get_img()  # picture with the gap
        crave = get_crave(img1, img2) - 10
        if crave < 85:
            failed += 1
            return crave + 10
        crave = _calibrate(crave)
        print('carve:', crave)
        move(crave)
        time.sleep(3)
        return ''
    finally:
        # Close the browser on every path, including exceptions, so repeated
        # attempts do not pile up browser processes.
        browser.quit()
def move(crave):
    """Drag the held slider knob by ``crave`` pixels and record the outcome.

    The pointer follows the offsets from ``get_track`` and then a short
    back-and-forth sequence of small corrections, pausing 0.5 s before each
    correction. After releasing the knob and waiting 2 s, the attempt counts
    as a success when the browser is on ``https://www.bilibili.com/`` and as
    a failure otherwise.

    Args:
        crave: Horizontal distance to drag.
    """
    global success, failed
    # Imported locally so this function does not depend on any other edit.
    from selenium.webdriver.common.by import By

    track = get_track(crave)
    main_steps = len(track)  # everything after this index is a correction step
    if crave < 100:
        track.extend([-3, 2.5, 1.5, -1.5, 1.5, -2.5])
    else:
        track.extend([-3, 4.1, -4.1, 1.5, -1.5])

    moved = 0
    for index, offset in enumerate(track):
        if index >= main_steps:
            time.sleep(0.5)  # hesitate before each correction step
        ActionChains(browser).move_by_offset(offset, 0).perform()
        time.sleep(0.05)
        moved += offset
        print('移动:', moved)

    # find_element_by_class_name() was removed in Selenium 4.3.
    knob = browser.find_element(By.CLASS_NAME, "gt_slider_knob")
    ActionChains(browser).release(knob).perform()
    time.sleep(2)
    if browser.current_url == 'https://www.bilibili.com/':
        success += 1
    else:
        # A slide that did not lead to the home page was not counted at all,
        # which inflated the recorded success rate.
        failed += 1
counts()  # create the success/failed counters and set them to zero


def _success_rate(succeeded, failures):
    """Return the share of successful attempts, or 0 when none were counted."""
    attempts = succeeded + failures
    return succeeded / attempts if attempts else 0


def _record_attempt(failed_crave):
    """Insert one statistics document built from the current counters.

    Args:
        failed_crave: Value stored under ``failed_crave`` (the value returned
            by ``login`` or a failure note).

    Returns:
        The result of ``collection.insert_one``.
    """
    total = {
        'id': 1,
        '登陆成功次数': success,
        '登陆失败次数': failed,
        '成功率': _success_rate(success, failed),
        'failed_crave': failed_crave,
    }
    return collection.insert_one(total)


if __name__ == '__main__':
    # Repeats login attempts until the process is interrupted.
    while True:
        try:
            browser = webdriver.Chrome()
            # 用户名 / 密码 are placeholders from the article and are not
            # defined in this file: replace them with real credentials.
            failed_crave = login(用户名, 密码)
            print('成功%s次,失败%s次' % (success, failed))
            _record_attempt(failed_crave)
        except Exception:
            # ``except Exception`` instead of a bare ``except`` so that
            # Ctrl-C (KeyboardInterrupt) can still stop the loop.
            failed += 1
            _record_attempt('此次是字母数字验证码')
测试
经过挂机一晚上的测试,成功率在70%以上。美中不足的是,在访问300次左右后,失败率开始上升,分析应该是访问过于频繁导致出现了特殊错误;到500次以后的数据基本上全都是失败。