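# Approach: extract and download the captcha image links, process the images (denoise, grayscale/binarize, compute the gap's x coordinate), build the slide trajectory, simulate the drag, then verify.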
import base64
import json
from datetime import datetime
import cv2
import requests
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import re, os, sys
from PIL import Image
from time import sleep
import pymysql
import random
# Apply dirname() four times to this file's absolute path and put the result
# first on sys.path (presumably the project root -- confirm against the repo layout).
BASE_DIR = os.path.abspath(__file__)
for _level in range(4):
    BASE_DIR = os.path.dirname(BASE_DIR)
sys.path.insert(0, BASE_DIR)
def _tran_canny(image):
    """Blur *image* to suppress noise and return its Canny edge map.

    A 3x3 Gaussian blur is applied first, then Canny edge detection with
    thresholds 50 and 150.
    """
    blurred = cv2.GaussianBlur(image, (3, 3), 0)
    edges = cv2.Canny(blurred, 50, 150)
    return edges
def detect_displacement(img_slider_path, image_background_path):
    """Return the x coordinate of the best template match between two images.

    Both files are loaded as grayscale, reduced to Canny edge maps and
    compared with normalized correlation-coefficient template matching
    (cv2.TM_CCOEFF_NORMED).

    NOTE(review): despite the parameter names, the FIRST path is the image
    that gets searched and the SECOND path is the template looked for inside
    it. The caller in this file passes the background image first and the
    jigsaw piece second -- confirm before renaming anything.

    :param img_slider_path: path of the image that is searched
    :param image_background_path: path of the template image
    :return: (int) x coordinate of the top-left corner of the best match
    :raises OSError: if either image cannot be read or decoded
    """
    # Flag 0 loads the file as a single-channel grayscale image
    image = cv2.imread(img_slider_path, 0)
    template = cv2.imread(image_background_path, 0)
    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface later as an opaque OpenCV assertion error
    for path, loaded in ((img_slider_path, image), (image_background_path, template)):
        if loaded is None:
            raise OSError('cannot read image file: {}'.format(path))
    res = cv2.matchTemplate(_tran_canny(image), _tran_canny(template), method=cv2.TM_CCOEFF_NORMED)
    # For TM_CCOEFF_NORMED the best match is at the maximum of the result map
    _, _, _, max_loc = cv2.minMaxLoc(res)
    top_left = max_loc[0]  # x coordinate
    print(top_left)
    return top_left
def init():
    """Set up the module-level state used by the rest of the scraper.

    Publishes, as module globals: the site URLs, the HTTP headers for the
    hot-word request, a maximized Chrome WebDriver (``browser``), the login
    credentials (``username`` / ``password``) and three WebDriverWait helpers
    with timeouts of 60 s (``wait0``), 3 s (``wait1``) and 2 s (``wait2``).
    """
    global index_url, hot_url, detail_url, headers, browser, username, password, wait0, wait1, wait2
    index_url = 'https://www.hqew.com/'
    hot_url = 'https://product.hqew.com/home/hotsearch?callback=hotwordsCallback'
    detail_url = 'https://s.hqew.com/{}.html?from=hotsearch'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
               'host': 'product.hqew.com'}
    options = webdriver.ChromeOptions()
    # Optional flags, currently disabled:
    # options.add_argument('--headless')
    # options.add_argument('--disable-gpu')
    # options.add_argument('--no-sandbox')
    # options.add_argument('disable-infobars')
    # options.add_experimental_option('useAutomationExtension', False)
    # options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # options.add_argument("--disable-blink-features=AutomationControlled")
    # options.add_experimental_option("excludeSwitches", ["enable-logging"])
    # options.add_argument('-ignore-certificate-errors')
    # options.add_argument('-ignore -ssl-errors')
    # `options=` replaces the deprecated `chrome_options=` keyword, which
    # newer Selenium 4 releases no longer accept.
    browser = webdriver.Chrome(options=options)
    # Injected into every new document so navigator.webdriver reads as false
    browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => false
})
"""
    })
    browser.maximize_window()
    # One account is picked at random from the list of available logins
    users = [{'u': '1341671****', 'p': '******'}]
    user = random.choice(users)
    username = user.get('u')
    password = user.get('p')
    wait0 = WebDriverWait(browser, 60)
    wait1 = WebDriverWait(browser, 3)
    wait2 = WebDriverWait(browser, 2)
# Build a slide trajectory (uniform acceleration)
def get_trace(distance):
    """Build a list of per-step pixel offsets for dragging the slider.

    The slider is modelled as starting at rest and accelerating in 0.1 time
    steps until the accumulated (unrounded) travel reaches *distance*. Each
    step is rounded individually before it is stored, so the returned offsets
    need not sum exactly to *distance*.

    :param distance: (Int) distance between the slider and the gap
    :return: (List) rounded offsets, one per time step
    """
    steps = []
    # Acceleration applies over the whole distance, so the deceleration
    # value below is never selected while the loop is running.
    accel_limit = distance
    travelled = 0
    speed = 0
    dt = 0.1
    while travelled < distance:
        accel = 30 if travelled < accel_limit else -30
        # s = v0*t + a*t^2/2 for this time step
        delta = speed * dt + 0.5 * accel * dt * dt
        speed = speed + accel * dt
        travelled += delta
        steps.append(round(delta))
    return steps
def get_tracks(distance):
    """Build a randomized slide trajectory that overshoots and then backs up.

    The target is *distance* plus 10 px. Until 5/8 of that target is covered
    the slider accelerates by a random 1..3 per step; after that it
    decelerates by a random 2..4 per step. Each time step is 0.9 units long
    and its displacement is rounded before being stored. Five backward steps
    of 1 or 2 px are appended at the end to pull the slider back.

    :param distance: (Int) distance between the slider and the gap
    :return: (List) integer offsets, forward steps followed by five negative ones
    """
    dt = 0.9  # length of one time step
    target = distance + 10  # overshoot by 10 px, rolled back at the end
    switch_point = target * 5 / 8
    steps = []
    speed = 0
    position = 0  # rounded distance covered so far
    while position < target:
        if position < switch_point:
            accel = random.randint(1, 3)
        else:
            accel = -random.randint(2, 4)
        # displacement during this time step
        step = speed * dt + 0.5 * accel * (dt ** 2)
        rounded = round(step)
        steps.append(rounded)
        position += rounded
        speed = speed + accel * dt
    # back up towards the intended position
    steps.extend(-random.randint(1, 2) for _ in range(5))
    return steps
# Simulate the drag
def move_to_gap(trace):
    """Drag the captcha slider along *trace* and release it.

    :param trace: iterable of horizontal pixel offsets, applied one at a time
    :raises: whatever the wait or the browser actions raise (e.g. when the
        slider element does not appear within ``wait1``'s timeout)
    """
    # Locate the slider handle
    slider = wait1.until(EC.presence_of_element_located((By.CLASS_NAME, 'yidun_slider__icon')))
    # Press and hold the mouse button on the handle
    ActionChains(browser).click_and_hold(slider).perform()
    try:
        for x in trace:
            # Each offset is performed as its own action so the drag is stepwise
            ActionChains(browser).move_by_offset(xoffset=x, yoffset=0).perform()
        # Short pause before letting go, imitating a human lining the piece up
        sleep(0.5)
    finally:
        # Always release: a failed move must not leave the button held down
        # for the caller's next retry.
        ActionChains(browser).release().perform()
def login():
    """Fill in the login form with the module-level credentials and submit it.

    All three form elements are located (each within ``wait2``'s timeout)
    before anything is typed, so a missing element aborts the login without
    touching the page.
    """
    def _locate(element_id):
        return wait2.until(EC.presence_of_element_located((By.ID, element_id)))

    name_box = _locate('J_loginName')
    pwd_box = _locate('J_loginPsw')
    submit_btn = _locate('J_btnLogin')

    name_box.send_keys(username)
    pwd_box.send_keys(password)
    submit_btn.click()
def save_data(item):
    """Persist one scraped item; not implemented yet, so this is a no-op.

    :param item: the item to store (ignored for now)
    :return: None
    """
    return None
def parse_data():
    """Read the title and brand from the result table of the current page.

    Both values come from the second ``tr`` of the ``list-table`` table
    (presumably the first data row after a header row -- confirm against the
    page markup).

    :return: (dict) with keys ``title``, ``brand_name``, ``url``, ``sources``,
        ``creator``, ``create_time`` and ``spider_time``
    :raises: whatever ``find_element`` raises when a cell is not on the page
    """
    title_reg = '//table[@class="list-table"]/tbody/tr[2]/td[contains(@class, "td-model")]//a[1]'
    brand_reg = '//table[@class="list-table"]/tbody/tr[2]/td[contains(@class, "brand")]/div[@class="list-pro"]'
    title_element = browser.find_element(By.XPATH, title_reg)
    brand_element = browser.find_element(By.XPATH, brand_reg)
    title = title_element.text
    brand = brand_element.text
    url = browser.current_url
    print(title, brand)
    # One timestamp for both fields, so they cannot disagree across a second boundary
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item = dict()
    item['title'] = title.strip() if title else ''
    item['brand_name'] = brand.strip() if brand else ''
    item['url'] = url
    item["sources"] = 'hqew'
    item["creator"] = 'gxq'
    item["create_time"] = timestamp
    item["spider_time"] = timestamp
    return item
def save_bs_img(bs, path):
    """Decode base64 data and write the resulting bytes to *path*.

    :param bs: base64-encoded data (str or bytes)
    :param path: destination file path; an existing file is overwritten
    :return: None
    """
    # Decode first, so invalid input never creates or truncates the file
    imgdata = base64.b64decode(bs)
    # The context manager closes the handle even if the write fails
    with open(path, 'wb') as file:
        file.write(imgdata)
def save_img(url, path, timeout=10):
    """Download *url* and write the response body to *path*.

    :param url: address of the image to download
    :param path: destination file path; an existing file is overwritten
    :param timeout: seconds to wait for the server before giving up
    :return: None
    :raises requests.RequestException: on connection errors, timeouts or an
        HTTP error status
    """
    # Without a timeout requests can block forever on a stalled connection
    resp = requests.get(url, timeout=timeout)
    # Fail here rather than saving an HTTP error page as if it were an image
    resp.raise_for_status()
    with open(path, 'wb') as file:
        file.write(resp.content)
def _search_results_present(wait):
    """Return True if an element with class 'search-wrapper' appears before *wait* times out."""
    try:
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'search-wrapper')))
    except Exception:
        return False
    return True


def _solve_slider_captcha():
    """Download the captcha images, locate the gap and drag the slider to it."""
    wait2.until(EC.presence_of_element_located((By.ID, 'captcha_div')))
    img1_reg = '//*[@id="captcha_div"]//img[@class="yidun_bg-img"]'
    img2_reg = '//*[@id="captcha_div"]//img[@class="yidun_jigsaw"]'
    img1 = browser.find_element(By.XPATH, img1_reg)
    img2 = browser.find_element(By.XPATH, img2_reg)
    img1_url = img1.get_attribute("src")
    img2_url = img2.get_attribute("src")
    # Save both captcha images: background -> tt2.jpg, jigsaw piece -> tt1.jpg
    tt2_path = sys.path[0] + "/Material/special/hqew/tt2.jpg"
    tt1_path = sys.path[0] + "/Material/special/hqew/tt1.jpg"
    save_img(img1_url, tt2_path)
    save_img(img2_url, tt1_path)
    # Horizontal position of the gap inside the background image
    distance = detect_displacement(tt2_path, tt1_path)
    # NOTE(review): the +6 looks like an empirical correction -- confirm
    trace = get_tracks(distance + 6)
    move_to_gap(trace)
    # Give the page time to react to the released slider
    sleep(2)


def main():
    """Visit the detail page of every hot search word and scrape one item from each.

    For each page up to 30 attempts are made to get past the slider captcha
    and the login form; the page is then parsed regardless of the outcome,
    and parse/save errors are printed rather than raised. The browser is
    shut down when the run ends, including when it ends with an error.

    :raises ValueError: if the hot-word response contains no quoted list
    """
    init()
    try:
        resp = requests.get(hot_url, headers=headers, timeout=10)
        # The hot words are taken from the quoted part of the JSONP response
        match = re.search(r'"(.*)"', resp.text)
        if match is None:
            raise ValueError('no hot-word list found in the response from ' + hot_url)
        for word in match.group(1).split(','):
            browser.get(detail_url.format(word.strip('"')))
            for _attempt in range(30):
                if _search_results_present(wait2):
                    print('无验证和登录')
                else:
                    # `except Exception` instead of a bare except, so that
                    # Ctrl+C still stops the retry loop
                    try:
                        _solve_slider_captcha()
                    except Exception as e:
                        print('parse_img: ', e)
                    # Best effort: the login form is not always shown
                    try:
                        login()
                    except Exception:
                        pass
                if _search_results_present(wait1):
                    print('验证登录成功!')
                    break
                print('验证登录失败!')
            try:
                item = parse_data()
                save_data(item)
            except Exception as e:
                print(e)
            # Pause between pages
            sleep(5)
    finally:
        # quit() ends the whole driver session, so no chromedriver process is
        # left behind even when the run fails
        browser.quit()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()