建立微博cookie池准备收集微博验证码建立验证码数据库

最新推荐文章于 2021-02-20 14:14:32 发布

SHAN_9W

最新推荐文章于 2021-02-20 14:14:32 发布

阅读量625

点赞数

分类专栏：爬虫文章标签： cookie 验证码微博

本文链接：https://blog.csdn.net/u014248032/article/details/83580823

版权

爬虫专栏收录该内容

7 篇文章 1 订阅

订阅专栏

登录微博weibo.cn会出现验证码验证

可以排列组合算出4*3*2*1=24种，所以可以先用selenium模拟登录截取这些验证码，方便之后获取cookie时登录验证，在验证码数据库中匹配，所以写了这么一个脚本获取验证码，其实效率不是很高，可以多进程操作，或者用selenium模拟人随便滑动验证码，这时验证码会自动更新，不用重新打开浏览器，提升效率。

验证码收集全后，手动标记下轨迹。

from io import BytesIO
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from PIL import Image
from os import listdir
import os
from selenium.webdriver.chrome.options import Options


class GetCheckPic:
    def __init__(self, username, password, browser):
        self.username = username#微博账号
        self.password = password#微博密码
        self.browser = browser#无头浏览器
        self.wait = WebDriverWait(self.browser, 20)#等待时间
        self.url = "https://passport.weibo.cn/signin/login?entry=mweibo&r=https://m.weibo.cn/"#目标url

    def open(self):#打开网页
        self.browser.get(self.url)
        username = self.wait.until(EC.presence_of_element_located((By.ID, "loginName")))#用户名标签
        password = self.wait.until(EC.presence_of_element_located((By.ID, "loginPassword")))#密码标签
        submit = self.wait.until(EC.element_to_be_clickable((By.ID, "loginAction")))#提交按钮标签
        username.send_keys(self.username)#输入账号
        password.send_keys(self.password)#输入密码
        time.sleep(1)
        submit.click()#按提交按钮
        time.sleep(2)
        
    def get_position(self):#获取验证码图片的位置
        try:
            img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "patt-shadow")))#出现验证码图片的html标签
        except TimeoutError:
            print("验证码未出现")
            self.open()#未出现验证码图片标签则重新打开网页
        time.sleep(2)
        location = img.location#获取验证码图片的位置
        size = img.size#验证码图片的大小
        top, bottom, left, right = location['y'], location['y'] + size['height'], location['x'], location['x'] + size[
            'width']#验证码图片的坐标
        return top, bottom, left, right
    
    def is_pixel_equal(self, image1, image2, x, y):#像素匹配
        threshold = 20#阈值
        pixel1 = image1.load()[x, y]#读取图片1的像素
        pixel2 = image2.load()[x, y]#读取图片2的像素 RGBA既在原有RGB的的基础上加上A（透明度）  例如[252,232,255,255]
        if abs(pixel2[0] - pixel1[0]) < threshold and abs(pixel2[1] - pixel1[1]) < threshold and\
                abs(pixel2[2] - pixel1[2]) < threshold:#如果像素小于阈值则判断这一个像素一样
            return True
        else:
            return False

    def get_screenshot(self):#获得验证码的截图
        screenshot = self.browser.get_screenshot_as_png()#截图用png截取
        screenshot = Image.open(BytesIO(screenshot))#用bytes打开图片
        return screenshot

    def get_image(self, n):#获取验证码截图
        top, bottom, left, right = self.get_position()
        print('验证码位置', top, bottom, left, right)
        screenshot = self.get_screenshot()
        captcha = screenshot.crop((left, top, right, bottom))#截取的图片
        print("获取到验证码")
        if not self.detect_image(captcha):#如果截取的验证码在验证码数据库中没有则保存验证码
            os.chdir(r"D:\untitled\cookie池\login\CheckPic")#验证码数据库
            captcha.save(str(n) + ".png")
            print("保存成功")
            os.chdir(r"D:\untitled\cookie池")

    def same_image(self, image, model):#判断两张图片是否相同
        count = 0#如果这个像素点一样 则count加1
        threshold = 0.99#图片匹配度的阈值
        for x in range(image.width):
            for y in range(image.height):
                #匹配每个像素点
                if self.is_pixel_equal(image, model, x, y):#像素一样则加一
                    count += 1
        result = float(count)/(image.height * image.width)#匹配度
        if result > threshold:#大于匹配度阈值则相同
            return True
        else:
            return False

    def detect_image(self, image):#在验证码数据库中检测
        for model in listdir(r"D:\untitled\cookie池\login\CheckPic"):#遍历验证码数据库中的每张验证码
            image_path = r"D:\untitled\cookie池\login\CheckPic\\" + model
            model_image = Image.open(image_path)
            if self.same_image(image, model_image):
                print("验证码库中有此验证码，重新获取验证码。")
                return True

if __name__ == '__main__':
    n = 0
    while True:
        n += 1
        if len(listdir(r"D:\untitled\cookie池\login\CheckPic")) == 24:
            break
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        browser = webdriver.Chrome(chrome_options=chrome_options)#无头模式
        get = GetCheckPic("账号", "密码", browser)
        get.open()
        get.get_position()
        get.get_image(n)
        browser.close()
    print("验证码已收集完全。")