在假期接的python爬虫设计之一是实现当当网的登陆和获取一些数据。
在初期打开官网并想登陆时,会发现有一个安全提醒标签,继续下一步后
可以发现是需要旋转的图片验证码。
开始正题,我们需要如何去解决旋转图片验证码呢,这方面的资料实在是太少了,但是大概的思路就是两个,第一个是能够知道每张图片需要点击的次数,然后通过程序去实现,第二个是想办法绕过图片验证码。
第一种方案很幸运找到了一位前辈的博客,给出了解决办法:
图片旋转验证码
在复现该程序时会发现无法成功运行,这是因为这个网站在维护时添加了一个确认界面,只有点击确认后才能进入真正的登陆界面,而且程序内少数关键元素的定位方式也发生了改变。见下图:
这两幅图是不是十分眼熟,和当当网可以说一模一样,所以我做的第一步就是复现这个流程,登陆这个漫画网站。我在前辈的代码基础上修改后的代码如下(这真的是站在巨人的肩膀上,真的是万分感谢):
#样本采集
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import random
from io import BytesIO
import time
class Crack(object):
    """Collect rotate-captcha sample tiles from the 1kkk.com login dialog.

    Opens the site in Chrome, brings up the login dialog, screenshots the
    page, crops the four captcha tiles out of the screenshot, saves them
    under ``./static/total_images/`` and then requests a new captcha group,
    repeating ``count`` times.
    """

    def __init__(self, start_number, count, headless=False):
        """Start Chrome and remember which group numbers to collect.

        :param start_number: number given to the first captcha group saved
        :param count: how many captcha groups to collect
        :param headless: when True, start Chrome with ``--headless``.
            Defaults to False, which matches the original behaviour: the
            original built an options object with the misspelled flag
            ``--healess`` and never handed it to the driver, so Chrome
            always ran with a visible window.
        """
        self.login_url = "http://www.1kkk.com/"
        self.start_number = start_number
        self.count = count
        self.chrome_options = webdriver.ChromeOptions()
        if headless:
            # NOTE(review): tile cropping depends on the window size; confirm
            # the crops are still correct before relying on headless mode.
            self.chrome_options.add_argument("--headless")
        self.browser = webdriver.Chrome(options=self.chrome_options)
        self.browser.maximize_window()
        self.wait = WebDriverWait(self.browser, 30)

    def login(self):
        """Open the login dialog and save ``count`` captcha groups.

        :return: None
        :raises selenium.common.exceptions.TimeoutException: when an expected
            element does not become clickable within the wait timeout
        """
        self.browser.get(self.login_url)
        # Click the avatar at the top right to open the login dialog.  The
        # explicit waits replace bare lookups that failed whenever the page
        # had not finished rendering.
        self.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, "header-avatar"))).click()
        # Dismiss the confirmation dialog that precedes the real login form.
        self.wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//*[@id='sy-win']/div/div[4]/a[2]"))).click()
        print("成功点击")
        for num in range(self.start_number, self.start_number + self.count):
            # Screenshot the page and save the four tiles of this group.
            self.image_png(num)
            # Ask for the next captcha group.
            self.wait.until(
                EC.element_to_be_clickable((By.CLASS_NAME, "rotate-refresh"))).click()
            time.sleep(0.5)

    def save_screen_png(self):
        """Screenshot the page once the captcha tiles are present.

        A copy is also written to ``screenshot<N>.png`` with N drawn at
        random from 1..5, so at most five such files accumulate.

        :return: the screenshot as a PIL image
        """
        self.wait.until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "rotate-background")))
        screen_image = self.browser.get_screenshot_as_png()
        screenshot = Image.open(BytesIO(screen_image))
        screenshot.save("screenshot{}.png".format(random.randint(1, 5)))
        return screenshot

    def image_png(self, num):
        """Crop and save the four captcha tiles of group ``num``.

        :param num: group number used in the saved file names
        :return: list of the four cropped tile images (the original built
            this list but forgot to return it)
        """
        screenshot = self.save_screen_png()
        return [
            self.get_image_position(screenshot, num, num_2)
            for num_2 in range(1, 5)
        ]

    def get_image_position(self, screenshot, number, number_2):
        """Crop one captcha tile out of ``screenshot`` and save it.

        :param screenshot: full-page screenshot as a PIL image
        :param number: captcha group number
        :param number_2: 1-based tile index inside the group
        :return: the cropped tile image
        """
        import os  # local import: this script's import block has no ``os``

        tile = self.wait.until(EC.presence_of_element_located(
            (By.XPATH, "//div[@class='rotate-background'][{}]".format(number_2))))
        location = tile.location
        size = tile.size
        # NOTE(review): assumes screenshot pixels map 1:1 to the element
        # coordinates reported by the driver - confirm on HiDPI displays.
        left, top = location['x'], location['y']
        right, bottom = left + size['width'], top + size['height']
        image = screenshot.crop((left, top, right, bottom))
        # Create the output directory on first use instead of failing on save.
        os.makedirs("./static/total_images/", exist_ok=True)
        image.save("./static/total_images/image{}_{}.png".format(number, number_2))
        return image

    def __del__(self):
        """Quit the browser when the collector is garbage-collected."""
        # ``browser`` is missing when __init__ failed before creating it.
        browser = getattr(self, "browser", None)
        if browser is not None:
            browser.quit()
def download(start_number, count):
    """Run one sample-collection session and always close the browser.

    :param start_number: number given to the first captcha group saved
    :param count: how many captcha groups to collect
    :return: None
    """
    c = Crack(start_number, count)
    try:
        c.login()
    finally:
        # Quit explicitly: ``del c`` only drops one reference, and while an
        # exception is propagating the traceback keeps the object alive, so
        # __del__ would not close the browser promptly.
        # NOTE(review): __del__ will call quit() a second time later; this is
        # expected to be harmless for the Chrome driver - confirm.
        c.browser.quit()
def main():
    """Collect 1000 captcha groups, numbering them from 1."""
    first_group, group_total = 1, 1000
    download(first_group, group_total)


if __name__ == '__main__':
    main()
#图片去重
from PIL import Image
import os
import gevent
from gevent import monkey
monkey.patch_all()
# Number of distinct images kept so far
gCount = 0
# Pixel-sum signatures of the images that have already been kept
rgb_dif_list = []
# Sequence number used in the file name of the most recently saved image
gNumber = 0
def sum_rgb(image):
    """Return the sum of the first three channel values over every pixel.

    :param image: object exposing ``size`` as ``(width, height)`` and
        ``load()`` returning a pixel accessor indexable by ``[x, y]``
        (for example a PIL image); every pixel needs at least three channels
    :return: integer total of channel 0 + channel 1 + channel 2 over all pixels
    """
    # Fetch the pixel accessor once; the original called load() four times
    # for every single pixel.
    pixels = image.load()
    width, height = image.size[0], image.size[1]
    total = 0
    for x in range(width):
        for y in range(height):
            pixel = pixels[x, y]
            total += pixel[0] + pixel[1] + pixel[2]
    return total
def check_have_in(num):
    """Report whether ``num`` is already recorded in ``rgb_dif_list``.

    :param num: pixel-sum value to look up
    :return: True when ``num`` is in the module-level list, otherwise False
    """
    # Read-only access to the module-level list; nothing is added here.
    return num in rgb_dif_list
def delete(image_url):
    """Announce and remove the image file at ``image_url``.

    :param image_url: filesystem path of the image to remove
    :return: None
    """
    print("删除图片:", image_url)
    os.unlink(image_url)
def start_check(start_number, count):
    """Copy the distinct tiles of a range of captcha groups to ``./static/images/``.

    A tile counts as a duplicate when its pixel sum (see ``sum_rgb``) lies
    within 3000 of the sum of a tile that was already kept.

    :param start_number: first captcha group number to examine
    :param count: number of consecutive groups to examine
    :return: None
    """
    global gCount
    global gNumber
    source_pattern = "./static/total_images/image{}_{}.png"
    target_pattern = "./static/images/images{}.png"
    # Make sure the output directory exists before the first save.
    os.makedirs("./static/images/", exist_ok=True)
    for number_1 in range(start_number, start_number + count):
        for number_2 in range(1, 5):
            image_url = source_pattern.format(number_1, number_2)
            if not os.path.isfile(image_url):
                continue
            # The context manager closes the file handle once we are done.
            with Image.open(image_url) as image:
                rgb_num = sum_rgb(image)
                print("image{}_{}.png".format(number_1, number_2), rgb_num)
                # Tiles with a very large pixel sum are skipped outright
                # (presumably blank or not-yet-loaded tiles - TODO confirm).
                if rgb_num > 4000000:
                    continue
                # Same window as the original range(rgb_num - 3000,
                # rgb_num + 3000) probe, checked in one pass over the list
                # instead of 6000 separate list scans.
                if any(rgb_num - 3000 <= seen < rgb_num + 3000
                       for seen in rgb_dif_list):
                    continue
                # Not seen before: record the signature and keep the tile.
                rgb_dif_list.append(rgb_num)
                gCount += 1
                gNumber += 1
                image.save(target_pattern.format(gNumber))
    # Only the worker whose range ends at group 500 prints the summary.
    if start_number + count == 501:
        print("剩余图片总数为", gCount)
def main():
    """Deduplicate groups 1-500 with five greenlets of 100 groups each."""
    workers = [
        gevent.spawn(start_check, first_group, 100)
        for first_group in range(1, 402, 100)
    ]
    gevent.joinall(workers)
    # For a quick single-range trial run, call start_check(1, 10) instead.


if __name__ == "__main__":
    main()
#验证码破解
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import os
from PIL import Image
from io import BytesIO
class Crack(object):
    """Log in to 1kkk.com by solving its rotate captcha from a sample library.

    Each captcha tile is cropped out of a page screenshot and compared, in
    all four 90-degree orientations, against the upright samples stored in
    ``./static/images/``; the matching orientation gives the number of
    clicks needed to turn the tile upright.
    """

    # Value image_turn_num() returns when no sample matches a tile.
    NO_MATCH = 5

    def __init__(self):
        """Start Firefox and load the site's home page."""
        self.login_url = "http://www.1kkk.com/"
        self.browser = webdriver.Firefox()
        self.browser.maximize_window()
        self.wait = WebDriverWait(self.browser, 5)
        self.browser.get(self.login_url)
        time.sleep(1)

    def login(self, username=None, password=None):
        """Log in, retrying until no captcha-error label is shown.

        The original hard-coded a real account and password in the source;
        credentials are now supplied by the caller or the environment.

        :param username: account name; defaults to the ``KKK_USERNAME``
            environment variable
        :param password: account password; defaults to the ``KKK_PASSWORD``
            environment variable
        :return: None
        :raises ValueError: when no credentials are available
        """
        if username is None:
            username = os.environ.get("KKK_USERNAME")
        if password is None:
            password = os.environ.get("KKK_PASSWORD")
        if not username or not password:
            raise ValueError(
                "login() needs credentials: pass username/password or set "
                "the KKK_USERNAME and KKK_PASSWORD environment variables")
        # A loop replaces the original self-recursion, which grew the call
        # stack by one frame per failed attempt.  Like the original, it
        # retries without an upper bound.
        while not self._attempt_login(username, password):
            self.browser.refresh()

    def _attempt_login(self, username, password):
        """Run one login attempt; return True when no error label appears."""
        try:
            # Click the avatar at the top right to open the login dialog.
            self.wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "header-avatar"))).click()
            # Click the "agree" button of the confirmation dialog.
            self.browser.find_element(
                By.XPATH, "//*[@id='sy-win']/div/div[4]/a[2]").click()
        except TimeoutException:
            return False
        self.browser.find_element(By.NAME, "txt_name").send_keys(username)
        self.browser.find_element(By.NAME, "txt_password").send_keys(password)
        self.click_image(self._solve_captcha())
        self.click_submit()
        # find_elements returns an empty list when nothing matches, which
        # replaces the original bare ``except: pass`` around find_element.
        error_labels = self.browser.find_elements(
            By.XPATH, "/html/body/section[3]/div/div/div/div/div/div[1]/label")
        return not error_labels

    def _solve_captcha(self):
        """Return the click counts for a captcha whose four tiles all match."""
        while True:
            turn_num_list = [self.image_turn_num(image) for image in self.image_png()]
            if self.NO_MATCH not in turn_num_list:
                return turn_num_list
            # At least one tile is unknown: request a new group once (the
            # original clicked refresh once per unknown tile).
            self.wait.until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'rotate-refresh'))).click()
            time.sleep(1)

    def click_image(self, turn_num_list):
        """Click each captcha tile the requested number of times.

        :param turn_num_list: click counts for the tiles, in page order
        :return: None
        """
        for index, turns in enumerate(turn_num_list, start=1):
            if turns == 0:
                continue
            tile = self.wait.until(EC.presence_of_element_located(
                (By.XPATH, "//div[@class='rotate-background'][{}]".format(index))))
            for _ in range(turns):
                tile.click()
                time.sleep(0.5)

    def save_screen_png(self):
        """Return a screenshot of the current page as a PIL image."""
        screen_image = self.browser.get_screenshot_as_png()
        return Image.open(BytesIO(screen_image))

    def image_png(self):
        """Return the four captcha tiles cropped from a fresh screenshot."""
        screenshot = self.save_screen_png()
        return [self.get_image(screenshot, num) for num in range(1, 5)]

    def get_image(self, screenshot, number):
        """Crop captcha tile ``number`` (1-based) out of ``screenshot``.

        :param screenshot: full-page screenshot as a PIL image
        :param number: 1-based tile index
        :return: the cropped tile image
        """
        tile = self.wait.until(EC.presence_of_element_located(
            (By.XPATH, "//div[@class='rotate-background'][{}]".format(number))))
        location = tile.location
        size = tile.size
        left, top = location['x'], location['y']
        right, bottom = left + size['width'], top + size['height']
        return screenshot.crop((left, top, right, bottom))

    def image_turn_num(self, image):
        """Find how many 90-degree turns make ``image`` match a sample.

        :param image: captcha tile as shown on the page
        :return: 0-3 for the first orientation that matches a sample in
            ``./static/images/``, or ``NO_MATCH`` (5) when none matches
        """
        dir_path = "./static/images/"
        sample_names = os.listdir(dir_path)  # listed once, not per rotation
        for i in range(0, 4):
            change_image = image.rotate(-90 * i)
            for or_path in sample_names:
                # Close every sample file after comparing it.
                with Image.open(os.path.join(dir_path, or_path)) as or_image:
                    if self.examine_pixel(or_image, change_image):
                        return i
        return self.NO_MATCH

    def examine_pixel(self, image1, image2):
        """Tell whether two images are equal within a per-channel tolerance.

        Every pixel inside ``image1``'s extent is compared; the images match
        when each of the first three channels differs by less than 100.

        :param image1: reference image; its size defines the compared area
        :param image2: candidate image
        :return: True when all compared pixels are within tolerance; False
            otherwise, including when ``image2`` is smaller than ``image1``
            (the original raised IndexError in that case)
        """
        threshold = 100
        width, height = image1.size[0], image1.size[1]
        if image2.size[0] < width or image2.size[1] < height:
            return False
        # Fetch the pixel accessors once instead of once per pixel.
        pixels1 = image1.load()
        pixels2 = image2.load()
        for x in range(width):
            for y in range(height):
                pixel1 = pixels1[x, y]
                pixel2 = pixels2[x, y]
                if any(abs(pixel1[c] - pixel2[c]) >= threshold for c in range(3)):
                    return False
        return True

    def click_submit(self):
        """Click the login button once it is clickable."""
        submit = self.wait.until(EC.element_to_be_clickable((By.ID, "btnLogin")))
        submit.click()

    def __del__(self):
        """Quit the browser when the solver is garbage-collected."""
        # ``browser`` is missing when __init__ failed before creating it.
        browser = getattr(self, "browser", None)
        if browser is not None:
            browser.quit()
def main():
    """Create the captcha solver and run its login flow."""
    solver = Crack()
    solver.login()


if __name__ == "__main__":
    main()
至此就实现了漫画网的登陆。
接下来我们需要更改代码,即根据当当网的网页源代码调整程序中元素的定位方式。这里实现了采集样本和样本去重,但是在破解时没有用到采集的样本,而是直接人工观察每张验证码需要旋转多少次,然后手工输入旋转次数,至于原因我在后面进行叙述。
#当当采样
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import random
from io import BytesIO
import time
#报错selenium.common.exceptions.WebDriverException: Message: 'geckodriver' executable needs to be in PATH.的处理办法如下
#参考https://blog.csdn.net/sisteryaya/article/details/75257681
class Crack(object):
    """Collect rotate-captcha sample tiles from the Dangdang login dialog.

    Opens the site in Chrome, brings up the login dialog, screenshots the
    page, crops the four captcha tiles out of the screenshot, saves them
    under ``./static/total_images/`` and then requests a new captcha group,
    repeating ``count`` times.
    """

    def __init__(self, start_number, count, headless=False):
        """Start Chrome and remember which group numbers to collect.

        Chrome is required: per the original author's note, the dismiss
        button of the "security reminder" dialog could not be clicked in
        other browsers.

        :param start_number: number given to the first captcha group saved
        :param count: how many captcha groups to collect
        :param headless: when True, start Chrome with ``--headless``.
            Defaults to False, which matches the original behaviour: the
            original built an options object with the misspelled flag
            ``--healess`` and never handed it to the driver, so Chrome
            always ran with a visible window.
        """
        self.login_url = "http://www.dangdang.com/?_utm_brand_id=10934&_ddclickunion=621-pz-%CD%B7%B2%BF_%D6%F7%B1%EA%CC%E2_%B5%B1%B5%B1%B9%D9%CD%F8%2C%D5%FD%C6%B7%B5%CD%BC%DB%2C%C6%B7%D6%CA%B1%A3%D6%A4%2C30%CD%F2%CD%BC%CA%E9%C3%BF%C2%FA100%BC%F540%A3%A1|ad_type=0|sys_id=1"
        self.start_number = start_number
        self.count = count
        self.chrome_options = webdriver.ChromeOptions()
        if headless:
            # NOTE(review): tile cropping depends on the window size; confirm
            # the crops are still correct before relying on headless mode.
            self.chrome_options.add_argument("--headless")
        self.browser = webdriver.Chrome(options=self.chrome_options)
        self.browser.maximize_window()
        self.wait = WebDriverWait(self.browser, 30)

    def login(self):
        """Open the login dialog and save ``count`` captcha groups.

        :return: None
        :raises selenium.common.exceptions.TimeoutException: when an expected
            element does not become clickable within the wait timeout
        """
        self.browser.get(self.login_url)
        # Click the "login" link to open the login dialog.  The explicit
        # waits replace bare lookups that failed whenever the page had not
        # finished rendering.
        self.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, "login_link"))).click()
        # Dismiss the "security reminder" dialog.
        self.wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="J_loginMaskClose"]'))).click()
        print("成功点击")
        time.sleep(2)
        for num in range(self.start_number, self.start_number + self.count):
            # Screenshot the page and save the four tiles of this group.
            self.image_png(num)
            # Ask for the next captcha group.
            self.wait.until(
                EC.element_to_be_clickable((By.CLASS_NAME, "Rotate-refresh"))).click()
            time.sleep(0.5)

    def save_screen_png(self):
        """Screenshot the page once the captcha tiles are present.

        A copy is also written to ``screenshot<N>.png`` with N drawn at
        random from 1..5, so at most five such files accumulate.

        :return: the screenshot as a PIL image
        """
        self.wait.until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "Rotate-background")))
        screen_image = self.browser.get_screenshot_as_png()
        screenshot = Image.open(BytesIO(screen_image))
        screenshot.save("screenshot{}.png".format(random.randint(1, 5)))
        return screenshot

    def image_png(self, num):
        """Crop and save the four captcha tiles of group ``num``.

        :param num: group number used in the saved file names
        :return: list of the four cropped tile images (the original built
            this list but forgot to return it)
        """
        screenshot = self.save_screen_png()
        return [
            self.get_image_position(screenshot, num, num_2)
            for num_2 in range(1, 5)
        ]

    def get_image_position(self, screenshot, number, number_2):
        """Crop one captcha tile out of ``screenshot`` and save it.

        :param screenshot: full-page screenshot as a PIL image
        :param number: captcha group number
        :param number_2: 1-based tile index inside the group
        :return: the cropped tile image
        """
        import os  # local import: this script's import block has no ``os``

        tile = self.wait.until(EC.presence_of_element_located(
            (By.XPATH, "//div[@class='Rotate-background'][{}]".format(number_2))))
        location = tile.location
        size = tile.size
        # NOTE(review): assumes screenshot pixels map 1:1 to the element
        # coordinates reported by the driver - confirm on HiDPI displays.
        left, top = location['x'], location['y']
        right, bottom = left + size['width'], top + size['height']
        image = screenshot.crop((left, top, right, bottom))
        # Create the output directory on first use instead of failing on save.
        os.makedirs("./static/total_images/", exist_ok=True)
        image.save("./static/total_images/image{}_{}.png".format(number, number_2))
        return image

    def __del__(self):
        """Quit the browser when the collector is garbage-collected."""
        # ``browser`` is missing when __init__ failed before creating it.
        browser = getattr(self, "browser", None)
        if browser is not None:
            browser.quit()
def download(start_number, count):
    """Run one sample-collection session and always close the browser.

    :param start_number: number given to the first captcha group saved
    :param count: how many captcha groups to collect
    :return: None
    """
    c = Crack(start_number, count)
    try:
        c.login()
    finally:
        # Quit explicitly: ``del c`` only drops one reference, and while an
        # exception is propagating the traceback keeps the object alive, so
        # __del__ would not close the browser promptly.
        # NOTE(review): __del__ will call quit() a second time later; this is
        # expected to be harmless for the Chrome driver - confirm.
        c.browser.quit()
def main():
    """Collect 10000 captcha groups, numbering them from 1."""
    first_group, group_total = 1, 10000
    download(first_group, group_total)


if __name__ == '__main__':
    main()
#当当样本去重
from PIL import Image
import os
import gevent
from gevent import monkey
monkey.patch_all()
# Number of distinct images kept so far
gCount = 0
# Pixel-sum signatures of the images that have already been kept
rgb_dif_list = []
# Sequence number used in the file name of the most recently saved image
gNumber = 0
def sum_rgb(image):
    """Return the sum of the first three channel values over every pixel.

    :param image: object exposing ``size`` as ``(width, height)`` and
        ``load()`` returning a pixel accessor indexable by ``[x, y]``
        (for example a PIL image); every pixel needs at least three channels
    :return: integer total of channel 0 + channel 1 + channel 2 over all pixels
    """
    # Fetch the pixel accessor once; the original called load() four times
    # for every single pixel.
    pixels = image.load()
    width, height = image.size[0], image.size[1]
    total = 0
    for x in range(width):
        for y in range(height):
            pixel = pixels[x, y]
            total += pixel[0] + pixel[1] + pixel[2]
    return total
def check_have_in(num):
    """Report whether ``num`` is already recorded in ``rgb_dif_list``.

    :param num: pixel-sum value to look up
    :return: True when ``num`` is in the module-level list, otherwise False
    """
    # Read-only access to the module-level list; nothing is added here.
    return num in rgb_dif_list
def delete(image_url):
    """Announce and remove the image file at ``image_url``.

    :param image_url: filesystem path of the image to remove
    :return: None
    """
    print("删除图片:", image_url)
    os.unlink(image_url)
def start_check(start_number, count):
    """Copy the distinct tiles of a range of captcha groups to ``./static/images/``.

    A tile counts as a duplicate when its pixel sum (see ``sum_rgb``) lies
    within 3000 of the sum of a tile that was already kept.

    :param start_number: first captcha group number to examine
    :param count: number of consecutive groups to examine
    :return: None
    """
    global gCount
    global gNumber
    source_pattern = "./static/total_images/image{}_{}.png"
    target_pattern = "./static/images/images{}.png"
    # Make sure the output directory exists before the first save.
    os.makedirs("./static/images/", exist_ok=True)
    for number_1 in range(start_number, start_number + count):
        for number_2 in range(1, 5):
            image_url = source_pattern.format(number_1, number_2)
            if not os.path.isfile(image_url):
                continue
            # The context manager closes the file handle once we are done.
            with Image.open(image_url) as image:
                rgb_num = sum_rgb(image)
                print("image{}_{}.png".format(number_1, number_2), rgb_num)
                # Tiles with a very large pixel sum are skipped outright
                # (presumably blank or not-yet-loaded tiles - TODO confirm).
                if rgb_num > 4000000:
                    continue
                # Same window as the original range(rgb_num - 3000,
                # rgb_num + 3000) probe, checked in one pass over the list
                # instead of 6000 separate list scans.
                if any(rgb_num - 3000 <= seen < rgb_num + 3000
                       for seen in rgb_dif_list):
                    continue
                # Not seen before: record the signature and keep the tile.
                rgb_dif_list.append(rgb_num)
                gCount += 1
                gNumber += 1
                image.save(target_pattern.format(gNumber))
    # Only the worker whose range ends at group 10000 prints the summary.
    if start_number + count == 10001:
        print("剩余图片总数为", gCount)
def main():
    """Deduplicate groups 1-10000 with one greenlet per 100 groups.

    The start numbers 1, 101, ..., 9901 are generated instead of being
    written out as one hundred separate ``gevent.spawn`` lines.
    """
    workers = [
        gevent.spawn(start_check, first_group, 100)
        for first_group in range(1, 10001, 100)
    ]
    gevent.joinall(workers)
    # For a quick single-range trial run, call start_check(1, 10) instead.


if __name__ == "__main__":
    main()
#当当验证码破解
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import os
from PIL import Image
from io import BytesIO
class Crack(object):
    """Log in to Dangdang through its rotate-captcha login form.

    By default the operator types the number of clockwise turns each of the
    four captcha tiles needs.  Matching against the sample library in
    ``./static/images/`` is available as an opt-in; in the original it
    existed only as disabled code inside a string literal.
    """

    # Value image_turn_num() returns when no sample matches a tile.
    NO_MATCH = 5

    def __init__(self):
        """Start Chrome with a 2-second explicit-wait timeout."""
        self.login_url = "http://www.dangdang.com/"
        self.browser = webdriver.Chrome()
        self.browser.maximize_window()
        self.wait = WebDriverWait(self.browser, 2)
        time.sleep(0.5)

    def login(self, username="你的手机账号", password="手机账号密码", auto_match=False):
        """Log in, retrying until the post-login marker element is found.

        After a successful login this method keeps the process (and so the
        browser) alive indefinitely, as the original did, and never returns.

        :param username: account name typed into the form; the default is
            the original placeholder text
        :param password: password typed into the form; the default is the
            original placeholder text
        :param auto_match: when True, derive the turn counts from the sample
            library instead of prompting on stdin
        :return: does not return
        """
        # A loop replaces the original self-recursion, which grew the call
        # stack by one frame per failed attempt.
        logged_in = False
        while not logged_in:
            logged_in = self._attempt_login(username, password, auto_match)
        # Sleep instead of the original ``while True: pass`` busy loop,
        # which kept one CPU core fully occupied.
        while True:
            time.sleep(1)

    def _attempt_login(self, username, password, auto_match):
        """Run one login attempt; return True when the marker element is found."""
        try:
            self.browser.get(self.login_url)
            # Explicit waits raise TimeoutException, so this handler can now
            # fire; the original lookups raised NoSuchElementException,
            # which the handler did not catch.
            self.wait.until(
                EC.element_to_be_clickable((By.CLASS_NAME, "login_link"))).click()
            # Dismiss the "security reminder" dialog.
            self.wait.until(EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="J_loginMaskClose"]'))).click()
        except TimeoutException:
            return False
        self.browser.find_element(By.NAME, 'txtUsername').send_keys(username)
        self.browser.find_element(By.NAME, 'txtPassword').send_keys(password)
        if auto_match:
            turn_num_list = self._match_turn_counts()
        else:
            # One prompt per tile, in page order.
            turn_num_list = [int(input("顺时针旋转次数")) for _ in range(4)]
        print("根据上面得到的旋转次数点击图片")
        self.click_image(turn_num_list)
        print("结果正确,点击登录按钮")
        self.click_submit()
        # Give the page time to react to the submission.
        time.sleep(3)
        # find_elements returns an empty list when nothing matches; the
        # original find_element raised instead, so its retry branch was
        # unreachable.  Per the original comment, ``logo_line`` is only
        # detectable after a successful login.
        return bool(self.browser.find_elements(By.CLASS_NAME, 'logo_line'))

    def _match_turn_counts(self):
        """Return turn counts from the sample library, refreshing until all tiles match."""
        while True:
            turn_num_list = [self.image_turn_num(image) for image in self.image_png()]
            print("turn_num_list:", turn_num_list)
            if self.NO_MATCH not in turn_num_list:
                return turn_num_list
            # At least one tile is not in the library: request a new group.
            self.wait.until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'Rotate-refresh'))).click()
            time.sleep(1)

    def click_image(self, turn_num_list):
        """Click each captcha tile the requested number of times.

        :param turn_num_list: click counts for the tiles, in page order
        :return: None
        """
        for index, turns in enumerate(turn_num_list, start=1):
            if turns == 0:
                continue
            tile = self.wait.until(EC.presence_of_element_located(
                (By.XPATH, "//div[@class='Rotate-background'][{}]".format(index))))
            for _ in range(turns):
                tile.click()
                time.sleep(0.5)

    def save_screen_png(self):
        """Return a screenshot of the current page as a PIL image."""
        screen_image = self.browser.get_screenshot_as_png()
        return Image.open(BytesIO(screen_image))

    def image_png(self):
        """Return the four captcha tiles cropped from a fresh screenshot."""
        screenshot = self.save_screen_png()
        return [self.get_image(screenshot, num) for num in range(1, 5)]

    def get_image(self, screenshot, number):
        """Crop captcha tile ``number`` (1-based) out of ``screenshot``.

        :param screenshot: full-page screenshot as a PIL image
        :param number: 1-based tile index
        :return: the cropped tile image
        """
        tile = self.wait.until(EC.presence_of_element_located(
            (By.XPATH, "//div[@class='Rotate-background'][{}]".format(number))))
        location = tile.location
        size = tile.size
        left, top = location['x'], location['y']
        right, bottom = left + size['width'], top + size['height']
        return screenshot.crop((left, top, right, bottom))

    def image_turn_num(self, image):
        """Find how many 90-degree turns make ``image`` match a sample.

        :param image: captcha tile as shown on the page
        :return: 0-3 for the first orientation that matches a sample in
            ``./static/images/``, or ``NO_MATCH`` (5) when none matches
        """
        dir_path = "./static/images/"
        sample_names = os.listdir(dir_path)  # listed once, not per rotation
        for i in range(0, 4):
            change_image = image.rotate(-90 * i)
            for or_path in sample_names:
                # Close every sample file after comparing it.
                with Image.open(os.path.join(dir_path, or_path)) as or_image:
                    result = self.examine_pixel(or_image, change_image)
                # The original passed the value as a second print argument,
                # so the "%s" placeholder was printed literally.
                print("result:%s" % result)
                print("i:%s" % i)
                if result:
                    return i
        return self.NO_MATCH

    def examine_pixel(self, image1, image2):
        """Tell whether two images are equal within a per-channel tolerance.

        Every pixel inside ``image1``'s extent is compared; the images match
        when each of the first three channels differs by less than 100.

        :param image1: reference image; its size defines the compared area
        :param image2: candidate image
        :return: True when all compared pixels are within tolerance; False
            otherwise, including when ``image2`` is smaller than ``image1``
            (the original raised IndexError in that case)
        """
        threshold = 100
        width, height = image1.size[0], image1.size[1]
        if image2.size[0] < width or image2.size[1] < height:
            return False
        # Fetch the pixel accessors once instead of once per pixel.
        pixels1 = image1.load()
        pixels2 = image2.load()
        for x in range(width):
            for y in range(height):
                pixel1 = pixels1[x, y]
                pixel2 = pixels2[x, y]
                if any(abs(pixel1[c] - pixel2[c]) >= threshold for c in range(3)):
                    return False
        return True

    def click_submit(self):
        """Click the login button once it is clickable."""
        submit = self.wait.until(EC.element_to_be_clickable((By.ID, "submitLoginBtn")))
        submit.click()

    def __del__(self):
        """Quit the browser when the solver is garbage-collected."""
        # ``browser`` is missing when __init__ failed before creating it.
        browser = getattr(self, "browser", None)
        if browser is not None:
            browser.quit()
def main():
    """Create the Dangdang login helper and run its login flow."""
    solver = Crack()
    solver.login()


if __name__ == "__main__":
    main()
至此实现了登录操作。
我在采集样本时采了10000组验证码,一组验证码切割成4张图片,总计40000张样本,去重处理后剩下400多张验证码图片,但是在登陆匹配时很少能够匹配上。原因在于验证码的链接是php网页,会随机调取图片;我共采集了40000张,去重后只有400多张,而且登陆时还一直无法匹配上,所以我推测网站大概是这样设计的:将验证码图片分装在很多个类(分组)里面,各个类之间是平行的关系,每次请求先选取一个类,然后从里面随机调取图片。这也就造成了我采了如此多的样本却一直是如此低的匹配率,当然这只是个人推测。如果要获取全部的样本也不无可能,但一是采集样本需要大量时间,二是匹配时也会浪费非常多的时间。
所以我最终采用的方案是利用第三方登陆绕过图片验证。
首先先进入登陆界面(并非真正的登陆界面,如下)
我们需要点击图中圈出来的登陆按钮,看下网页源代码(通过Ctrl+Shift+C选定元素)
1实现点击 “登陆” 按钮
self.browser.find_element_by_class_name("login_link").click() # 点击"登陆按钮",打开登陆界面
2实现点击 “知道了”按钮
实现代码:
self.browser.find_element_by_xpath('//*[@id="J_loginMaskClose"]').click() # 点击知道了按钮
3点击 “QQ登陆”按钮
实现代码如下:
self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'qq'))).click()
点击后会弹出qq登陆弹窗,这是一个独立于原来窗口的新窗口
新窗口默认登陆方式为扫码登陆,或者当前电脑已登陆qq时可点击头像登陆,但是我们要实现的是自动填写账号密码的账号密码登陆。
这里就涉及到一个新的问题,如何去定位新弹出窗口的元素,按照原来的方式一定是无法定位的。
这里通过窗口句柄来实现窗口间的切换,而且在切换时也要做iframe的切换,然后只需要通过上文所讲的方法去定位并点击“账号密码登陆”按钮即可。
4填入账号和密码并点击登陆
实现代码:
self.browser.switch_to.window(i)#切换到新打开的窗口
self.browser.switch_to.frame("ptlogin_iframe")#切换打新窗口的iframe
WebDriverWait(self.browser,20).until(EC.element_to_be_clickable((By.ID, 'switcher_plogin'))).click()
name_page = self.browser.find_element_by_name('u')
name_page.send_keys(usr)
password_page = self.browser.find_element_by_name('p')
password_page.send_keys(pas)
self.click_submit()
5切换回主窗口,然后关闭
实现代码:
time.sleep(0.5)
self.browser.switch_to.window(first_handle) # 切换到主的窗口
self.browser.switch_to.default_content()#返回默认iframe
self.browser.quit()#关闭浏览器
以下是全部代码:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import requests
import pandas
from lxml import etree
from fake_useragent import UserAgent
import re
import numpy
#登陆当当网
class Crack(object):
    """Log in to Dangdang through the QQ third-party login popup.

    Using the QQ login avoids the rotate captcha of the native login form.
    """

    def __init__(self):
        """Start Chrome maximised with a 4-second explicit-wait timeout."""
        self.login_url = "http://www.dangdang.com/"
        self.chrome_options = webdriver.ChromeOptions()
        # NOTE: the original added the string
        # "service_args=['–ignore-ssl-errors=true', '–ssl-protocol=TLSv1']"
        # as a Chrome argument.  That is not a Chrome command-line switch,
        # and the options object was never passed to the driver, so it had
        # no effect.  It is dropped rather than "fixed": disabling
        # certificate checks while typing account credentials would weaken
        # security.
        self.browser = webdriver.Chrome(options=self.chrome_options)
        self.browser.maximize_window()
        # Flag for how the login popup should be displayed; nothing in this
        # class reads it.
        self.size_flag = True
        self.wait = WebDriverWait(self.browser, 4)
        time.sleep(0.5)

    def login(self, key, usr, pas):
        """Log in with a QQ account and then close the browser.

        The whole flow runs inside this method because the browser is quit
        before it returns, including when a step fails.

        :param key: not used by this method; kept for interface compatibility
        :param usr: QQ account number
        :param pas: QQ account password
        :return: None
        """
        try:
            self.browser.get(self.login_url)
            # Remember the main window so we can switch back to it later.
            first_handle = self.browser.current_window_handle
            # Click the "login" link to open the login dialog.
            self.browser.find_element(By.CLASS_NAME, "login_link").click()
            # Dismiss the "security reminder" dialog.
            self.browser.find_element(By.XPATH, '//*[@id="J_loginMaskClose"]').click()
            # Click the "QQ login" button, which opens a separate window.
            self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'qq'))).click()
            popup_wait = WebDriverWait(self.browser, 20)
            # Wait for the popup handle to exist; the original read the
            # handle list immediately and could miss the popup entirely.
            popup_wait.until(lambda driver: len(driver.window_handles) > 1)
            popup_handle = next(
                handle for handle in self.browser.window_handles
                if handle != first_handle)
            self.browser.switch_to.window(popup_handle)
            # The login form lives inside an iframe of the popup.
            popup_wait.until(
                EC.frame_to_be_available_and_switch_to_it("ptlogin_iframe"))
            # Switch from QR-code login to account/password login.
            popup_wait.until(
                EC.element_to_be_clickable((By.ID, 'switcher_plogin'))).click()
            self.browser.find_element(By.NAME, 'u').send_keys(usr)
            self.browser.find_element(By.NAME, 'p').send_keys(pas)
            self.click_submit()
            time.sleep(0.5)
            # Return to the main window and its top-level document.
            self.browser.switch_to.window(first_handle)
            self.browser.switch_to.default_content()
        finally:
            # Quit exactly once, after the flow ends.  The original quit
            # inside its loop over window handles without breaking out, and
            # left the browser running whenever a step raised.
            self.browser.quit()

    def click_submit(self):
        """Click the QQ login button once it is clickable."""
        submit = self.wait.until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="login_button"]')))
        submit.click()
if __name__ == "__main__":
    # Replace both placeholders with your own QQ account number and password.
    usr, pas = '你的QQ号', '你的QQ号密码'
    c = Crack()
    c.login('python', usr, pas)