滑块只能用Selenium过,js逆向那套行不通,没办法,行情就是这样~~(来自某某经典语录)
采集抖音过程中,需要携带cookie才能获取数据,cookie失效就会出现滑块验证码,如图:
如果你的不出现验证码,可以去掉cookie,再去滚动翻页;
用selenium过滑块,原理都差不多:获取背景原图和缺口图片,识别出缺口距离,再模拟拖动滑块。拖动速度不要匀速,要有加速度(先加速后减速),这样更像人为操作。按照代码操作几遍就可以,注意需要把浏览器驱动路径driver_path修改为自己的路径;
代码如下:
import cv2
import numpy as np
import time
import requests
import os
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver import ActionChains
class Douyin_Slider(object):
    """Locate the gap of a slider captcha by edge-based template matching.

    The constructor downloads the background image and the slider-piece
    image, stores them under ``<cwd>/image`` and keeps their paths in
    ``self.bg`` and ``self.gap``.  :meth:`discern` then returns the x
    coordinate at which the piece best matches the background.
    """

    def __init__(self, bg, gap):
        """Download both captcha images and remember where they were saved.

        :param bg: URL of the captcha background image.
        :param gap: URL of the slider-piece image.
        """
        # Both images are resized to fixed dimensions before matching.
        # NOTE(review): presumably these are the sizes at which the page
        # renders the images, so pixel offsets equal drag distance - confirm.
        bg_size = (340, 212)
        gap_size = (68, 68)
        self.img_dir = os.path.join(os.getcwd(), 'image')
        # cv2.imwrite does not create missing directories, so make sure the
        # output directory exists before anything is written into it.
        os.makedirs(self.img_dir, exist_ok=True)
        self.bg = self.get_img_path(bg, 'bg', bg_size)
        self.gap = self.get_img_path(gap, 'gap', gap_size)
        self.out = os.path.join(self.img_dir, 'out.jpg')

    def get_img_path(self, img_path, img_name, resize):
        """Download an image, optionally resize it, and save it as a JPEG.

        :param img_path: URL of the image to download.
        :param img_name: file name (without extension) to save under.
        :param resize: ``(width, height)`` passed to ``cv2.resize``; a falsy
            value keeps the downloaded size.
        :return: path of the saved file, or ``None`` when the server did not
            answer with HTTP 200 (a message is printed in that case).
        :raises ValueError: if the downloaded bytes cannot be decoded as an
            image.
        """
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;"
                      "q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en-GB;q=0.8,en;q=0.7,ja;q=0.6",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": urlparse(img_path).hostname,
            "Referer": "https://www.douyin.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/93.0",
        }
        # A timeout keeps a stalled connection from hanging the whole run.
        resp = requests.get(url=img_path, headers=headers, timeout=10)
        if resp.status_code != 200:
            print('下载失败,状态码为:{}'.format(resp.status_code))
            return None

        raw = np.frombuffer(resp.content, dtype=np.uint8)
        image = cv2.imdecode(raw, cv2.IMREAD_COLOR)
        if image is None:
            # imdecode signals an undecodable buffer by returning None.
            raise ValueError('图片解码失败: {}'.format(img_path))
        if resize:
            image = cv2.resize(image, dsize=resize)

        # Save next to self.out instead of a second, cwd-relative path.
        saved_path = os.path.join(self.img_dir, f'{img_name}.jpg')
        cv2.imwrite(saved_path, image)
        return saved_path

    @staticmethod
    def clear_white(img):
        """Crop an image to the bounding box of its non-grey pixels.

        A pixel counts as content when its channel values are not all equal;
        pixels whose channels are identical (white, black, any grey) are
        treated as background.

        :param img: path of an image file, or an already loaded
            ``(rows, cols, channels)`` array.
        :return: the cropped array (a view of the input); the image is
            returned unchanged when it contains no content pixel.
        :raises ValueError: if the image file cannot be read.
        """
        if not isinstance(img, np.ndarray):
            img = cv2.imread(img)
        if img is None:
            raise ValueError('clear_white: image could not be read')

        # True where at least one channel differs from the first channel.
        content = (img != img[:, :, :1]).any(axis=2)
        rows_hit = np.flatnonzero(content.any(axis=1))
        cols_hit = np.flatnonzero(content.any(axis=0))
        if rows_hit.size == 0:
            return img
        # +1 because slice ends are exclusive and the last content row and
        # column belong to the crop.
        return img[rows_hit[0]:rows_hit[-1] + 1, cols_hit[0]:cols_hit[-1] + 1]

    @staticmethod
    def image_edge_detection(img):
        """Return the Canny edge map of ``img`` (thresholds 100 and 200)."""
        edges = cv2.Canny(img, 100, 200)
        return edges

    def template_match(self, tpl, target):
        """Find ``tpl`` inside ``target`` and return the match's left x.

        The best match is outlined on ``target`` (which is modified in
        place) and the annotated image is written to ``self.out``.

        :param tpl: template image (the slider piece).
        :param target: image to search in (the background).
        :return: x coordinate of the top-left corner of the best match.
        """
        th, tw = tpl.shape[:2]
        result = cv2.matchTemplate(target, tpl, cv2.TM_CCOEFF_NORMED)
        # With TM_CCOEFF_NORMED the best match is the maximum of the result.
        min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)
        tl = max_loc
        br = (tl[0] + tw, tl[1] + th)
        # Outline the matched region: tl and br are the top-left and
        # bottom-right corners, (0, 0, 255) the colour, 2 the thickness.
        cv2.rectangle(target, tl, br, (0, 0, 255), 2)
        cv2.imwrite(self.out, target)
        return tl[0]

    def discern(self):
        """Return the x position of the slider piece within the background.

        Both images are reduced to Canny edge maps and the cropped piece is
        template-matched against the background.
        """
        piece = self.clear_white(self.gap)
        # cv2.imread yields BGR channel order, so convert from BGR.
        piece = cv2.cvtColor(piece, cv2.COLOR_BGR2GRAY)
        slide = self.image_edge_detection(piece)
        back = cv2.imread(self.bg, 0)
        back = self.image_edge_detection(back)
        # Back to three channels so the match can be outlined in colour.
        slide_pic = cv2.cvtColor(slide, cv2.COLOR_GRAY2RGB)
        back_pic = cv2.cvtColor(back, cv2.COLOR_GRAY2RGB)
        x = self.template_match(slide_pic, back_pic)
        return x
def get_track(distance):
    """
    Build a list of per-step horizontal moves covering ``distance``.

    The motion accelerates over the first four fifths of the distance and
    decelerates over the rest, so the drag does not move at constant speed.
    Each step is the difference between consecutive rounded positions, with
    the position clipped to ``distance``; the steps therefore never go
    backwards and always add up to ``round(distance)``.

    :param distance: total offset to cover, in pixels
    :return: list of integer step sizes; empty when ``distance <= 0``
    """
    track = []
    # Exact (unrounded) displacement reached so far.
    current = 0.0
    # Sum of the integer steps already emitted.
    emitted = 0
    # Position at which the motion switches from speeding up to slowing down.
    mid = distance * 4 / 5
    # Duration of one simulation step.
    t = 0.35
    # Current velocity.
    v = 0.0
    while current < distance:
        a = 5 if current < mid else -2.5
        v0 = v
        # v = v0 + a*t ; s = v0*t + a*t^2/2
        v = v0 + a * t
        current += v0 * t + 0.5 * a * t * t
        # Round the cumulative position rather than each move, and never
        # step past the target, so rounding errors cannot accumulate.
        target = round(min(current, distance))
        track.append(target - emitted)
        emitted = target
    return track
def get_cookies():
    """
    Open the search page in Firefox, solve the slider captcha for as long as
    it is shown, and return the value of the ``s_v_web_id`` cookie.

    The captcha loop ends when the captcha elements can no longer be found,
    or when any other error interrupts an attempt (the error is printed).
    The browser is always shut down before returning.

    :return: value of the ``s_v_web_id`` cookie, or ``None`` if it is absent
    """
    # Function-scope import keeps this block self-contained.
    from selenium.common.exceptions import NoSuchElementException

    url = 'https://www.douyin.com/search/python?source=switch_tab&type=user'
    # Path of the local geckodriver binary; adjust for your machine.
    driver_path = r'E:\Python36\Scripts\geckodriver'
    option = webdriver.FirefoxOptions()
    # option.add_argument('--headless')  # uncomment to hide the browser window
    # NOTE(review): the switches below look like Chromium flags; confirm that
    # Firefox actually honours them.
    option.add_argument('--disable-gpu')
    option.add_argument('--no-sandbox')
    option.add_argument("disable-blink-features")
    option.add_argument("disable-blink-features=AutomationControlled")
    # NOTE(review): executable_path and find_element_by_* are the Selenium 3
    # API - confirm the installed Selenium version before upgrading.
    driver = webdriver.Firefox(options=option, executable_path=driver_path)
    try:
        driver.get(url)
        time.sleep(2)
        while True:
            try:
                bg = driver.find_element_by_id('captcha-verify-image').get_attribute('src')
                gap = driver.find_element_by_xpath('//*[@id="captcha_container"]/div/div[2]/img[2]').get_attribute('src')
                solver = Douyin_Slider(bg=bg, gap=gap)
                distance = solver.discern()
                print('移动距离:', distance)
                handle = driver.find_element_by_xpath('//div[@id="secsdk-captcha-drag-wrapper"]/div[2]')
                ActionChains(driver).click_and_hold(handle).perform()
                _tracks = get_track(distance)
                print('开始验证....')
                # Adjust the last step so the steps add up to exactly
                # `distance`.
                _tracks[-1] -= sum(_tracks) - distance
                for step in _tracks:
                    ActionChains(driver).move_by_offset(step, 0).perform()
                ActionChains(driver).release().perform()
                time.sleep(0.5)
            except NoSuchElementException:
                # The captcha elements are gone: nothing left to solve.
                break
            except Exception as exc:
                # Any other failure still ends the loop, as before, but is
                # reported instead of being swallowed silently.
                print('滑块验证中断: {!r}'.format(exc))
                break
        cookie_dict = {c['name']: c['value'] for c in driver.get_cookies()}
        time.sleep(2)
    finally:
        # quit() closes every window and stops the driver process, even when
        # something above raised.
        driver.quit()
    return cookie_dict.get('s_v_web_id')
if __name__ == '__main__':
    # Script entry point: run the captcha flow and print the cookie value.
    print(get_cookies())
运行结果如图: