爬取某信息化服务平台登录页面验证码

马龙强_

已于 2023-11-01 13:48:44 修改

阅读量176

点赞数

文章标签： python

于 2023-10-30 20:31:21 首次发布

本文链接：https://blog.csdn.net/m0_74972727/article/details/134127403

版权

网站链接：https://login.cdpf.org.cn/uams/person.html

鼠标放到验证码上，右键“检查”

提取URL

爬虫代码：

import requests
# 导入base64库，用于对数据进行Base64编码和解码
import base64
# 从io模块导入BytesIO，它是一个类似文件的对象，可以在内存中读写二进制数据
from io import BytesIO
# 从PIL（Python Imaging Library，Python图像处理库）导入Image模块，用于处理图像
from PIL import Image
# 导入os模块，用于进行操作系统级别的功能操作，如文件和目录操作
import os
# Download 50 captcha images from the login page's captcha endpoint and save
# each one as a PNG under OUTPUT_DIR.

# Starting value for the `_` query parameter; it is also used to name the
# saved files. NOTE(review): looks like a millisecond epoch timestamp used as
# a cache-buster -- confirm against the site's own requests.
BASE_STAMP = 1698646709676
OUTPUT_DIR = 'result'
# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the script indefinitely.
REQUEST_TIMEOUT = 10

# Create the output directory once up front; exist_ok avoids the separate
# (and race-prone) existence check on every iteration.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for i in range(50):
    stamp = BASE_STAMP + i
    # Each request uses a different `_` value.
    url = "https://login.cdpf.org.cn/uams/kaptcha/validateCode?userType=4&_=" + str(stamp)
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()
    # The "image" field of the JSON payload is decoded as Base64 image data.
    image_data = base64.b64decode(data["image"])
    # Wrap the raw bytes in an in-memory file so PIL can decode them.
    image = Image.open(BytesIO(image_data))
    image.save(os.path.join(OUTPUT_DIR, str(stamp) + '.png'))

爬取结果：

提取验证码数字：

相关链接：Tesseract-OCR-3.0.5 数字识别训练与合并多次训练数据_tesseract训练库合并-CSDN博客

相关代码：

import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import os

def preprocess_image(image):
    """Prepare a captcha image for OCR.

    The image is converted to grayscale, binarized with a fixed threshold,
    passed through a median filter, and scaled to half its size.

    Args:
        image: A PIL image.

    Returns:
        A new PIL image in mode '1' at half the original width and height
        (dimensions truncated to integers).
    """
    # Convert to grayscale.
    gray = image.convert('L')

    # Binarize: levels below the threshold map to 0, all others to 1.
    threshold = 128
    table = [0 if level < threshold else 1 for level in range(256)]
    binary = gray.point(table, '1')

    # Remove isolated noise pixels with a median filter (default kernel size).
    denoised = binary.filter(ImageFilter.MedianFilter())

    # Scale down to half size.
    width, height = denoised.size
    new_size = (int(width * 0.5), int(height * 0.5))
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # filter under its current name and also exists in older Pillow releases.
    return denoised.resize(new_size, Image.LANCZOS)

def demo(image_dir='result3'):
    """Run digit OCR on every image in a directory and print the results.

    Args:
        image_dir: Directory to scan for '.png', '.jpg' and '.jpeg' files.
            Defaults to 'result3'.

    Each file is opened, passed through preprocess_image, recognized with
    pytesseract, and printed as "<file name>: <recognized text>".
    """
    # Sort the listing so the output order is reproducible; os.listdir
    # returns entries in arbitrary order.
    image_files = sorted(
        f for f in os.listdir(image_dir) if f.endswith(('.png', '.jpg', '.jpeg'))
    )

    for image_file in image_files:
        # Use a context manager so the underlying file handle is released
        # once preprocessing has produced its own copy of the pixel data.
        with Image.open(os.path.join(image_dir, image_file)) as image:
            processed = preprocess_image(image)
        # lang='num' selects a traineddata file named 'num' -- presumably a
        # custom digit model that must be installed separately; confirm.
        # --psm 6 treats the image as a single uniform block of text, --oem 3
        # uses the default OCR engine mode, and the whitelist limits the
        # output to digits.
        text = pytesseract.image_to_string(processed, lang='num', config='--psm 6 --oem 3 -c tessedit_char_whitelist=0123456789')
        # Print the recognized text.
        print(f"{image_file}: {text}")

if __name__ == '__main__':
    demo(