抖音后台spider

最新推荐文章于 2023-06-07 09:17:05 发布
龙卷风zc
最新推荐文章于 2023-06-07 09:17:05 发布
阅读量1.4k
点赞数
分类专栏：爬虫
本文链接：https://blog.csdn.net/longjuanfengzc/article/details/109307929
版权
爬虫专栏收录该内容
5 篇文章 0 订阅
订阅专栏
# coding=utf-8

import time
import os
from io import BytesIO
import random
import cv2
import cv2 as cv
import numpy as np
from hashlib import md5

import requests
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from PIL import Image

# Chrome capabilities, with the 'performance' log requested at level 'ALL'.
d = DesiredCapabilities.CHROME
d['loggingPrefs'] = { 'performance': 'ALL' }

# The experimental 'w3c' option is switched off
# (presumably so the legacy 'loggingPrefs' capability above is honoured -- TODO confirm).
chrome_options = Options()
chrome_options.add_experimental_option('w3c', False)
# Module-level WebDriver instance used by every function in this file.
# NOTE: it is created as soon as the module is executed/imported.
browser = webdriver.Chrome(desired_capabilities = d, options = chrome_options)

# Overall flow (step 1 was missing from the original notes; login() starts
# by opening the login page and submitting the credentials):
# 2. Click the element so that the image with the gap is shown, and download it
# 3. Compare the two images to find how many pixels the gap is offset
# 4. Drag the element

# Login page opened by login().
url = "https://e.toutiao.com/account/page/service/login"


def crop_image( image_file_name ):
	"""Cut the captcha element out of a full-page screenshot and save it.

	The element is located by the class ``geetest_slicebg geetest_absolute``
	after a 2 second wait; its on-page location and size define the crop box.

	:param image_file_name: path the cropped captcha image is written to
	:return: the cropped PIL image
	"""
	time.sleep(2)
	element = browser.find_element_by_xpath("//*[@class='geetest_slicebg geetest_absolute']")

	# Where the element sits inside the browser viewport.
	origin = element.location
	print("图片的位置", origin)
	extent = element.size

	left = origin["x"]
	top = origin["y"]
	right = left + extent["width"]
	buttom = top + extent["height"]
	print("验证码位置", left, top, right, buttom)

	# Take a screenshot of the page and keep only the captcha rectangle.
	page = Image.open(BytesIO(browser.get_screenshot_as_png()))
	box = tuple(int(edge) for edge in (left, top, right, buttom))
	captcha = page.crop(box)
	captcha.save(image_file_name)
	return captcha


def download_file( url, store_path ):
	"""Download ``url`` into the directory ``store_path``.

	The file name is the part of the URL after its last ``/``.

	:param url: address of the resource to fetch (redirects are followed)
	:param store_path: directory the file is written into
	"""
	destination = os.path.join(store_path, url.rsplit("/", 1)[-1])

	payload = requests.get(url, allow_redirects = True).content
	with open(destination, 'wb') as out:
		out.write(payload)


def download_file1( url, filename ):
	"""Download ``url`` and write the response body to ``filename``.

	Unlike download_file(), the caller chooses the complete output path.

	:param url: address of the resource to fetch (redirects are followed)
	:param filename: path of the file to (over)write
	:return: ``filename``, unchanged
	"""
	payload = requests.get(url, allow_redirects = True).content
	with open(filename, 'wb') as out:
		out.write(payload)
	return filename


def compare_pixel( image1, image2, i, j ):
	"""Tell whether a pixel of ``image1`` is close to the reference pixel of ``image2``.

	NOTE: the reference is always the pixel at (0, 0) of ``image2``, not the
	pixel at (i, j); only ``image1`` is sampled at (i, j).

	:param image1: object whose ``load()`` result is indexable by ``[x, y]``
	:param image2: object whose ``load()`` result is indexable by ``[x, y]``
	:param i: x coordinate sampled in ``image1``
	:param j: y coordinate sampled in ``image1``
	:return: True when each of the first three channels differs by less than 60
	"""
	threshold = 60

	sample = image1.load()[i, j]
	reference = image2.load()[0, 0]

	# Only the first three channels are compared.
	return all(abs(sample[channel] - reference[channel]) < threshold for channel in range(3))


def get_positon(image1, image2):
	"""Find the x position of the gap by scanning ``image1`` column by column.

	Scanning starts at x = 60 and stops at the first pixel for which
	compare_pixel() reports a mismatch. If no pixel mismatches, the start
	column (60) is used. The result is that column minus 6.

	:param image1: image that is scanned (needs ``size`` and ``load()``)
	:param image2: image passed through to compare_pixel() as the reference
	:return: x coordinate of the first mismatching column, minus 6
	"""
	start_column = 60
	mismatches = (
		column
		for column in range(start_column, image1.size[0])
		for row in range(image1.size[1])
		if not compare_pixel(image1, image2, column, row)
	)
	# Fall back to the start column when every scanned pixel matched.
	gap_column = next(mismatches, start_column)

	gap_column -= 6
	print(gap_column)
	return gap_column

def get_distinct():
	"""Template-match ./captcha1.png against ./captcha2.png and return the best x.

	captcha2.png is read in colour and converted to grayscale; captcha1.png is
	read directly as grayscale. The x coordinate of the highest
	TM_CCOEFF_NORMED score is printed and returned.

	:return: x coordinate of the maximum of the match result
	"""
	searched_gray = cv2.cvtColor(cv2.imread("./captcha2.png"), cv2.COLOR_BGR2GRAY)
	pattern_gray = cv2.imread("./captcha1.png", 0)

	scores = cv2.matchTemplate(searched_gray, pattern_gray, cv2.TM_CCOEFF_NORMED)
	# minMaxLoc returns (min value, max value, min location, max location).
	_lowest, _highest, _lowest_at, highest_at = cv2.minMaxLoc(scores)
	value = highest_at[0]

	print("需要移动的距离:{}".format(value))

	return value

def get_element_slide_distance():
	"""Estimate the slide distance by template-matching the two captcha images.

	Reads 'captcha2.png' and 'captcha1.png' from the working directory
	(the original notes call the first the slider piece and the second the
	background -- TODO confirm), writes the intermediate files 'temp.jpg'
	and 'targ.jpg', and saves the matched region of 'captcha1.png' as
	'new_image.jpg' for inspection.

	:return: column index of the best match, plus 3
	"""


	otemp = 'captcha2.png'
	oblk = 'captcha1.png'
	target = cv2.imread(otemp, 0) # read as grayscale; the result is a numpy array
	template = cv2.imread(oblk, 0)
	width, height = target.shape[::-1] # width and height of captcha2.png (taken before any rows are dropped)
	temp = 'temp.jpg' # the processed images are saved under these names
	targ = 'targ.jpg'
	cv2.imwrite(temp, template)
	cv2.imwrite(targ, target)
	target = cv2.imread(targ) # re-read the saved copy of captcha2.png
	target = cv2.cvtColor(target, cv2.COLOR_BGR2GRAY) # colour conversion back to grayscale
	# Keep only the rows that contain at least one non-zero pixel
	# (the original note says this is meant to trim the piece to its real size)
	target = target[target.any(1)]

	target = abs(255 - target) # invert the gray levels
	cv2.imwrite(targ, target) # save the processed image
	target = cv2.imread(targ) # read it back (default colour mode)
	template = cv2.imread(temp) # read the saved copy of captcha1.png
	result = cv2.matchTemplate(target, template, cv2.TM_CCOEFF_NORMED) # score every overlap position of the two images
	top, left = np.unravel_index(result.argmax(), result.shape) # (row, column) of the highest score
	# Gap position
	print((left, top, left + width, top + height)) # rectangle of the match, using the size measured above

	# Use PIL Image to produce a visual check
	image = Image.open("captcha1.png")

	rectangle = (left + 3, top + 3, left + width - 3, top + height - 3) # shrink by 3 px per side to avoid the white border (the original note says the trimming step above did not really work)
	# Crop
	imagecrop = image.crop(rectangle)
	# Save the cropped gap
	imagecrop.save("new_image.jpg")

	return left+3


# 返回两个数组：一个用于加速拖动滑块，一个用于减速拖动滑块
def generate_tracks( distance ):
	"""Drag the captcha slider past the target and then ease it back.

	The forward track accelerates (+2) until 3/5 of the way and then
	decelerates (-3), sampled in 0.2 time-unit steps. 20 px are added to
	``distance`` so the slider overshoots; ``back_tracks`` (which sums to
	-20) then walks it back. Each step is rounded to an integer before it is
	sent, so the summed forward offsets can differ slightly from the target.

	Fix over the previous version: the two 0.2 s pauses were built with
	``ActionChains(browser).pause(0.2)`` but never performed, so they had no
	effect. They are now actually executed.

	:param distance: number of pixels the slider has to travel
	:return: tuple ``(forward_tracks, back_tracks)`` of the integer offsets used
	"""
	# Overshoot by 20 px; the slider comes back over the gap while slowing down.
	distance += 20
	v = 0
	t = 0.2
	forward_tracks = []
	current = 0
	mid = distance * 3 / 5  # threshold after which the motion decelerates
	while current < distance:
		a = 2 if current < mid else -3
		s = v * t + 0.5 * a * (t ** 2)
		v = v + a * t
		current += s
		forward_tracks.append(round(s))

	back_tracks = [-3, -3, -2, -2, -2, -2, -2, -1, -1, -1, -1]

	# Press and hold the slider knob, then wait briefly before moving.
	slider = browser.find_element_by_xpath("//div[@class='secsdk-captcha-drag-icon sc-jKJlTe fsBatO']")
	ActionChains(browser).click_and_hold(slider).pause(0.2).perform()

	for x in forward_tracks:
		ActionChains(browser).move_by_offset(x, 0).perform()  # move the slider forward
		print(x)

	print('#' * 50)

	for x in back_tracks:
		ActionChains(browser).move_by_offset(x, 0).perform()  # move the slider back
		print(x)

	# Short hesitation before letting go.
	ActionChains(browser).pause(0.2).perform()
	time.sleep(0.5)
	ActionChains(browser).release(on_element = slider).perform()
	time.sleep(5)

	return forward_tracks, back_tracks

def generate_tracks1(XCoordinates):
	"""Drag the captcha slider along a randomised, shuffled track.

	The travel distance is ``XCoordinates - 60`` plus a 10 px overshoot.
	Steps are generated with a random strong acceleration until 5/8 of the
	base distance and a random weak deceleration afterwards, using a time
	step of 0.08. The rounded steps are shuffled before being replayed, each
	with a random vertical jitter of -2..2 px.

	:param XCoordinates: x coordinate of the gap; 60 is subtracted from it
	"""
	knob = browser.find_element_by_xpath("//div[@class='secsdk-captcha-drag-icon sc-jKJlTe fsBatO']")
	ActionChains(browser).click_and_hold(on_element = knob).perform()

	# Time slice per step; a smaller value yields more, finer steps.
	step_time = 0.08
	base_distance = XCoordinates - 60
	# Deceleration starts here (computed before the overshoot is added).
	brake_point = base_distance * 5 / 8
	# Slide a little too far on purpose.
	goal = base_distance + 10

	velocity = 0
	travelled = 0
	steps = []
	while travelled < goal:
		if travelled < brake_point:
			accel = random.randint(100, 200)  # speeding up
		else:
			accel = -random.randint(2, 10)  # slowing down

		# Displacement during this time slice, from the velocity at its start.
		displacement = velocity * step_time + 0.5 * accel * (step_time ** 2)
		travelled += displacement
		steps.append(round(displacement))

		# Velocity at the end of the slice becomes the next starting velocity.
		velocity = velocity + accel * step_time

	random.shuffle(steps)
	moved = 0
	for step in steps:
		print(step)
		moved += step
		ActionChains(browser).move_by_offset(xoffset = step, yoffset = random.randint(-2, 2)).perform()

	# Total horizontal offset that was actually sent, then release the mouse.
	print(moved)
	ActionChains(browser).release(on_element = knob).perform()

def crack_slider(distance):
	"""Drag the captcha slider ``distance`` pixels along an accelerate/brake track.

	The track accelerates with a random acceleration of 2..3 until 3/4 of
	the distance and then decelerates with 6..7, in time steps of 0.1. Each
	step is rounded to an integer and replayed with a random vertical jitter
	of -5..5 px.

	Fix over the previous version: the two 0.2 s pauses were built with
	``ActionChains(browser).pause(0.2)`` but never performed, so they had no
	effect. They are now actually executed.

	:param distance: number of pixels the slider has to travel
	"""
	# Offsets to replay
	track = []
	# Distance covered so far
	current = 0
	# Point after which the motion decelerates
	mid = distance * 3 / 4
	# Time step
	t = 0.1
	v = 0
	while current < distance:
		if current < mid:
			a = random.randint(2, 3)
		else:
			a = - random.randint(6, 7)
		v0 = v
		# Velocity at the end of this step
		v = v0 + a * t
		# Displacement during this step
		move = v0 * t + 1 / 2 * a * t * t
		current += move
		track.append(round(move))

	# Press and hold the slider knob, then wait briefly before moving.
	slider = browser.find_element_by_xpath("//div[@class='secsdk-captcha-drag-icon sc-jKJlTe fsBatO']")
	ActionChains(browser).click_and_hold(slider).pause(0.2).perform()

	for x in track:
		ActionChains(browser).move_by_offset(xoffset = x, yoffset = random.randint(-5, 5)).perform()

	# Short hesitation before letting go.
	ActionChains(browser).pause(0.2).perform()
	time.sleep(0.5)
	ActionChains(browser).release(on_element=slider).perform()
	time.sleep(5)


def findPic( img_bg_path, img_slider_path ):
	"""
	Find where the slider piece matches the background image.

	:param img_bg_path: local path of the slider background image
	:param img_slider_path: local path of the slider piece image
	:return: tuple (x of the worst match, x of the best match)
	"""
	# Background: read in OpenCV's default BGR mode, then reduced to grayscale
	# (less data than a colour image, so the matching is faster).
	background_gray = cv.cvtColor(cv.imread(img_bg_path), cv.COLOR_BGR2GRAY)

	# Slider piece: flag 0 reads it directly as grayscale.
	slider_gray = cv.imread(img_slider_path, 0)

	# Score every placement of the piece on the background.
	res = cv.matchTemplate(background_gray, slider_gray, cv.TM_CCOEFF_NORMED)

	separator = '#' * 50
	print(separator)
	print(type(res))  # <class 'numpy.ndarray'>
	print(res)  # a two-dimensional array of scores
	print(separator)

	# minMaxLoc gives (min value, max value, min location, max location), e.g.
	# (-0.1653602570295334, 0.6102921366691589, (144, 1), (141, 56))
	value = cv.minMaxLoc(res)

	print(value, "#" * 30)

	# Only the x coordinates of the two locations are returned (144 and 141 above).
	_min_score, _max_score, worst_at, best_at = value
	return worst_at[0], best_at[0]


def login():
	"""Log in through the slider captcha, then click through the task pages.

	Steps: open the login page, submit the credentials, download the two
	captcha images, send 'captcha1.png' to the Chaojiying service to get the
	gap position, drag the slider with generate_tracks1(), and finally walk
	through the order-detail and task pages.

	Changes over the previous version: the captcha file is read inside a
	``with`` block so the handle is closed, and dead commented-out code and
	unused locals were removed. Local alternatives to the remote solver that
	the removed comments pointed to: ``get_element_slide_distance()`` or
	``findPic('captcha1.png', 'captcha2.png')[1] + 4``.

	NOTE: on any error after the drag, the function calls itself again with
	no retry limit.

	:return: True once the post-login navigation completed (directly or in
		a retry); None if a retry returned without success
	"""
	username = "123456789@qq.com"
	password = "123456"

	browser.get(url)

	time.sleep(5)

	username_ele = browser.find_element_by_xpath("//input[@name='email']")
	password_ele = browser.find_element_by_xpath("//input[@name='password']")
	username_ele.send_keys(username)
	password_ele.send_keys(password)

	time.sleep(5)

	# Click the login button
	login_btn = browser.find_element_by_xpath("//div[@class='account-center-submit']/button")
	login_btn.click()

	time.sleep(5)

	# Download both images of the slider captcha.
	image1_src = browser.find_element_by_xpath("//div[@class='slide-container']//img[1]").get_attribute("src")
	image2_src = browser.find_element_by_xpath("//div[@class='slide-container']//img[2]").get_attribute("src")

	download_file1(image1_src, "captcha1.png")
	download_file1(image2_src, "captcha2.png")

	# Ask the Chaojiying service for the gap position ('xxx' are placeholders
	# for user name, password and software id).
	chaojiying = Chaojiying_Client('xxx', 'xxx', 'xxx')
	with open('captcha1.png', 'rb') as captcha_file:
		im = captcha_file.read()
	res = chaojiying.PostPic(im, 9101)  # 9101 is the captcha type code sent to the service
	pic_str = res['pic_str']
	print(pic_str)

	# The first comma-separated field is used as the distance.
	distance = pic_str.split(',')[0]

	# Distance correction: scale by the ratio at which the image is displayed
	# on the page (340 / 276), then add a fixed offset of 8.
	print("获得的距离:", distance)

	distance = int(distance)
	distance = int(distance * (340 / 276)) + 8
	print("需要拖动的距离:", distance)

	generate_tracks1(distance)

	time.sleep(5)

	try:
		# Raises when this text is not on the page, which triggers the retry below.
		browser.find_element_by_xpath("//div[contains(text(), '上海识装')]")

		time.sleep(10)

		browser.get("https://star.toutiao.com/ad#/video/detail?order_id=6879962917140168717")

		time.sleep(5)
		time_picker_btn = browser.find_element_by_xpath("//div[@class='time-picker-container']")
		time_picker_btn.click()

		# Pick the last available day in the date table.
		time_picker_btn = browser.find_element_by_xpath("//div[@class='el-picker-panel__content']//tr[@class='el-date-table__row']/td[@class='available'][last()]")
		time_picker_btn.click()

		browser.get("https://star.toutiao.com/ad#/user/task")

		time.sleep(5)
		tab_btn = browser.find_element_by_xpath("//div[@id='tab-4']")
		tab_btn.click()

		# Keep clicking "next" until the button cannot be found or clicked.
		while True:
			time.sleep(5)
			try:
				next_btn = browser.find_element_by_xpath("//button[@class='btn-next']")
				next_btn.click()
			except Exception:
				break

		return True
	except Exception as e:
		print("遇到错误了", e)

		# Start over from the login page.
		if login():
			return True

	time.sleep(500)


# image2 = crop_image("captcha1.png")


# cookies = browser.get_cookies()
# print(cookies)
#
# cookie_dict = { }
# for item in cookies:
# 	cookie_dict[item['name']] = item['value']
# print(cookie_dict)
#
# res = requests.get(url, cookies=cookie_dict)
# print(res.text)
#
# time.sleep(500)

# browser.get('https://star.toutiao.com/ad#/video/detail?order_id=6881462096702210055')



class Chaojiying_Client(object):
	"""Small HTTP client for the Chaojiying captcha-recognition service.

	The password is stored and sent only as its MD5 hex digest.
	"""

	def __init__( self, username, password, soft_id ):
		"""Store the credentials and prepare the parameters sent with every request.

		:param username: account name
		:param password: plain-text password; hashed with MD5 before being stored
		:param soft_id: software id of the account
		"""
		self.username = username
		self.password = md5(password.encode('utf8')).hexdigest()
		self.soft_id = soft_id
		# Form fields shared by every request.
		self.base_params = {
			'user': self.username,
			'pass2': self.password,
			'softid': self.soft_id,
		}
		self.headers = {
			'Connection': 'Keep-Alive',
			'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
		}


	def PostPic( self, im, codetype ):
		"""
		Upload an image for recognition and return the decoded JSON reply.

		im: image bytes
		codetype: captcha type, see http://www.chaojiying.com/price.html
		"""
		form = { 'codetype': codetype, **self.base_params }
		upload = { 'userfile': ('ccc.jpg', im) }
		response = requests.post(
			'http://upload.chaojiying.net/Upload/Processing.php',
			data = form,
			files = upload,
			headers = self.headers,
		)
		return response.json()


	def ReportError( self, im_id ):
		"""
		Report a wrongly recognised image and return the decoded JSON reply.

		im_id: id of the image whose result was wrong
		"""
		form = { 'id': im_id, **self.base_params }
		response = requests.post(
			'http://upload.chaojiying.net/Upload/ReportError.php',
			data = form,
			headers = self.headers,
		)
		return response.json()



if __name__ == "__main__":
	# Run the whole login / captcha / navigation flow once.
	login()

	# Sleep so the script does not exit right away
	# (presumably to keep the browser open for inspection -- TODO confirm).
	time.sleep(5000)

	# Leftover experiments, kept disabled:
	# get_distinct()

	# image1.size

	# img1 = "img1.png"
	# img1_info = Image.open("img1.png")
	#
	# img2 = "img2.png"
	# img2_info = Image.open("img2.png")
	#
	# print(img1_info.size)
	# print(img2_info.size)
龙卷风zc
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
抖音后台spider

# coding=utf-8 # coding=utf-8 import time; import os; from io import BytesIO; import random; import cv2; import cv2 as cv; import numpy as np; from hashlib import md5; import requests; from selenium import webdriver; from selenium.webdriver import ActionChains; fr…
复制链接

扫一扫