前言:
数据及标签介绍


图片大致长上面这样,分辨率50*200,6位,每位有36种可能性。
这里共1.4w张,二八分割,2900张左右测试集,训练集1.1w张左右。
一、训练

这里使用的是精简版的ResNet50,有点ResNet18和ResNet50杂交版的感觉,图有点长,直接上代码,哈哈:
import numpy as np
import tensorflow as tf
import h5py
# The 36 candidate characters for each captcha position: 'a'-'z' then '0'-'9'.
LABELS = [chr(c) for c in range(ord('a'), ord('z') + 1)] + [str(d) for d in range(10)]
def identity_block(X, f, filters, stage, block):
    """Residual block whose shortcut and main-path outputs have the same shape.

    The input activation is saved, passed through three conv/batch-norm stages,
    then added back before the final ReLU (the "skip connection").

    :param X: input activation tensor to be skipped over
    :param f: int, kernel size of the middle convolution
    :param filters: list of three ints, filter counts of the three convolutions
    :param stage: int, used when naming the layers
    :param block: str, used when naming the layers
    :return: output activation tensor of the residual block
    """
    conv_name_base = f"res{stage}{block}_branch"
    bn_name_base = f"bn{stage}{block}_branch"
    F1, F2, F3 = filters
    init = tf.keras.initializers.GlorotUniform
    # Keep the incoming activation so it can be merged back in after the main path.
    shortcut = X
    # (filters, kernel, padding, layer-name suffix, trailing ReLU) for each main-path stage.
    main_path = [
        (F1, (1, 1), 'valid', '2a', True),
        (F2, (f, f), 'same', '2b', True),
        (F3, (1, 1), 'valid', '2c', False),  # no ReLU until after the Add
    ]
    for n_filters, kernel, pad, suffix, relu in main_path:
        X = tf.keras.layers.Conv2D(filters=n_filters,
                                   kernel_size=kernel,
                                   strides=(1, 1),
                                   padding=pad,
                                   name=f"{conv_name_base}{suffix}",
                                   kernel_initializer=init(seed=0))(X)
        X = tf.keras.layers.BatchNormalization(axis=3, name=f"{bn_name_base}{suffix}")(X)
        if relu:
            X = tf.keras.layers.Activation('relu')(X)
    # The skip connection: add the saved activation to the main-path output,
    # then apply the final ReLU before passing the result onward.
    X = tf.keras.layers.Add()([X, shortcut])
    return tf.keras.layers.Activation('relu')(X)
def convolutional_block(X, f, filters, stage, block, s=2):
    """Residual block whose shortcut path is projected by a strided 1x1 conv.

    Used when the main path changes the activation's shape: the shortcut gets
    its own conv + batch-norm so both branches match before the Add.

    :param X: input activation tensor
    :param f: int, kernel size of the middle convolution
    :param filters: list of three ints, filter counts of the main-path convs
    :param stage: int, used when naming the layers
    :param block: str, used when naming the layers
    :param s: int, stride of the first main-path conv and of the shortcut conv
    :return: output activation tensor of the residual block
    """
    conv_name_base = f'res{stage}{block}_branch'
    bn_name_base = f'bn{stage}{block}_branch'
    F1, F2, F3 = filters
    init = tf.keras.initializers.GlorotUniform
    shortcut = X
    # (filters, kernel, strides, padding, name suffix, trailing ReLU) per stage.
    main_path = [
        (F1, (1, 1), (s, s), 'valid', '2a', True),
        (F2, (f, f), (1, 1), 'same', '2b', True),
        (F3, (1, 1), (1, 1), 'valid', '2c', False),  # ReLU deferred until after the Add
    ]
    for n_filters, kernel, strides, pad, suffix, relu in main_path:
        X = tf.keras.layers.Conv2D(filters=n_filters, kernel_size=kernel, strides=strides,
                                   padding=pad, name=f"{conv_name_base}{suffix}",
                                   kernel_initializer=init(seed=0))(X)
        X = tf.keras.layers.BatchNormalization(axis=3, name=f"{bn_name_base}{suffix}")(X)
        if relu:
            X = tf.keras.layers.Activation('relu')(X)
    # Convolve the shortcut as well so its dimensions match the main path.
    shortcut = tf.keras.layers.Conv2D(filters=F3, kernel_size=(1, 1), strides=(s, s),
                                      padding='valid', name=f"{conv_name_base}1",
                                      kernel_initializer=init(seed=0))(shortcut)
    shortcut = tf.keras.layers.BatchNormalization(axis=3, name=f"{bn_name_base}1")(shortcut)
    X = tf.keras.layers.Add()([X, shortcut])
    return tf.keras.layers.Activation('relu')(X)
def ResNet(input_shape=(50, 200, 3), classes=36):
    """Build a slimmed-down ResNet for 6-character captcha recognition.

    :param input_shape: shape of the input image tensor
    :param classes: number of classes per character position
    :return: a tf.keras.Model with six softmax output heads (one per character)
    """
    layers = tf.keras.layers
    init = tf.keras.initializers.GlorotUniform
    X_input = layers.Input(input_shape)
    # Zero-pad the image borders before the first convolution.
    x = layers.ZeroPadding2D((3, 3))(X_input)
    # Stage 1: plain conv -> batch-norm -> ReLU -> max-pool.
    x = layers.Conv2D(64, (7, 7), strides=(2, 2), name='conv1',
                      kernel_initializer=init(seed=0))(x)
    x = layers.BatchNormalization(axis=3, name='bn_conv1')(x)
    x = layers.Activation('relu')(x)
    x = layers.MaxPool2D((3, 3), strides=(2, 2))(x)
    # Stage 2
    x = convolutional_block(x, f=3, filters=[64, 64, 128], stage=2, block='a', s=1)
    x = identity_block(x, 3, [64, 64, 128], stage=2, block='b')
    # Stage 3
    x = convolutional_block(x, f=3, filters=[64, 128, 256], stage=3, block='a', s=1)
    x = identity_block(x, 3, [64, 128, 256], stage=3, block='b')
    x = identity_block(x, 3, [64, 128, 256], stage=3, block='c')
    # Stage 4
    x = convolutional_block(x, f=3, filters=[32, 64, 128], stage=4, block='a', s=2)
    x = identity_block(x, 3, [32, 64, 128], stage=4, block='b')
    # Average-pool and flatten, then one dense softmax head per character position.
    x = layers.AveragePooling2D(pool_size=(2, 2), padding='same')(x)
    x = layers.Flatten()(x)
    outputs = [layers.Dense(classes, kernel_initializer=init(seed=0), activation='softmax',
                            name=f"digit_{i + 1}")(x) for i in range(6)]
    # model.summary() / tf.keras.utils.plot_model(model, "model.png") for inspection.
    return tf.keras.Model(inputs=X_input, outputs=outputs, name='ResNet')
def read_h5_data(key, path="dataset.h5"):
    """Read one dataset from an HDF5 file into a numpy array.

    :param key: name of the dataset inside the HDF5 file
    :param path: path of the HDF5 file; defaults to "dataset.h5" so existing
        callers keep their behavior, but the file is no longer hard-coded
    :return: the dataset contents as an np.ndarray
    """
    with h5py.File(path, 'r') as f:
        return np.array(f[key])
def load_data_set(sample_path, refresh=False, train_ratio=0.8):
    """Return the train/verify splits stored in the HDF5 dump.

    NOTE(review): sample_path, refresh and train_ratio are currently unused —
    the splits are always read as-is from dataset.h5; confirm before relying
    on these parameters.

    :return: (train_x, train_y, verify_x, verify_y) arrays
    """
    keys = ('train_set_x', 'train_set_y', 'verify_set_x', 'verify_set_y')
    return tuple(read_h5_data(k) for k in keys)
if __name__ == "__main__":
    # Load the pre-split train/verify arrays.
    # NOTE(review): sample_path/refresh are ignored by load_data_set — data
    # always comes from dataset.h5; confirm before changing these arguments.
    X_train_orig, Y_train_orig, X_test_orig, Y_test_orig = load_data_set(
        sample_path="/tmp/samples", refresh=False)
    # Scale pixel values into [0, 1].
    X_train = X_train_orig / 255
    X_test = X_test_orig / 255
    # Y goes from shape (samples, 6) to a list of six one-hot tensors of shape
    # (samples, 36) — one per character position, matching the model's six heads.
    Y_train = [tf.one_hot(Y_train_orig[:, i], len(LABELS)) for i in range(6)]
    Y_test = [tf.one_hot(Y_test_orig[:, i], len(LABELS)) for i in range(6)]
    model = ResNet((50, 200, 3), classes=36)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # model = tf.keras.models.load_model('captcha_model_resnet.h5')  # resume from a checkpoint
    model.fit(X_train, Y_train, epochs=20, batch_size=32)
    model.save("captcha_model_resnet.h5")
    preds = model.evaluate(X_test, Y_test)

大概在第17次epoch,达到一个比较优的状态。最终跑完20次epoch后,测试集准确率和损失如下:

可见6位都达到了99%
二、推理
这里采用6位合在一起做推理,评估一下准确率。
import os
import numpy as np
from PIL import Image
import tensorflow as tf
# The 36 candidate characters for each captcha position: 'a'-'z' then '0'-'9'.
LABELS = [chr(c) for c in range(ord('a'), ord('z') + 1)] + [str(d) for d in range(10)]
def predict(model, img_path):
    """Predict the 6-character captcha string for one image file.

    :param model: trained multi-output Keras model (one softmax head per character)
    :param img_path: path of a 50x200 captcha image
    :return: the predicted 6-character string
    """
    # Force 3 channels: PNGs are often RGBA or palette-mode, which would make
    # the reshape to (-1, 50, 200, 3) below fail.
    image = Image.open(img_path).convert('RGB')
    X = np.array(image)
    X = X / 255.  # same scaling as at training time
    # Returns one (1, 36) softmax per character position; the second argument
    # is batch_size and is harmless for a single image.
    preds = model.predict(X.reshape(-1, 50, 200, 3), 36)
    predicted_digits = [np.argmax(pred, axis=1) for pred in preds]
    # Combine the per-position predictions into the full captcha string.
    digits = []
    digits_str = ''
    for digit in predicted_digits:
        digits.append(digit[0])
        digits_str += LABELS[digit[0]]
    print(digits)
    return digits_str
if __name__ == "__main__":
    path = '/tmp/sample_verify'
    model = tf.keras.models.load_model("captcha_model_resnet.h5")
    # Collect every file under the verification directory.
    imgs = []
    for root, dirs, files in os.walk(path):
        for file in files:
            imgs.append(os.path.join(root, file))
    total_num = 0
    correct_num = 0
    error_num = 0
    for img in imgs:
        total_num += 1
        # The ground-truth label is encoded in the file name: "<label>_<hash>.<ext>".
        # os.path.basename replaces the old manual "/"-slicing so this also
        # works with Windows-style path separators.
        file_name = os.path.basename(img)
        real_value = file_name[:file_name.rindex("_")]
        predict_value = predict(model, img)
        if real_value == predict_value:
            correct_num += 1
        else:
            error_num += 1
            print(f"ERR_样本{total_num}, 真实值:{real_value}, 预测值:{predict_value}")
            print(img)
    # Guard against an empty sample directory (ZeroDivisionError otherwise).
    if total_num > 0:
        print(f"样本{total_num}, 准确数:{correct_num}, 错误数:{error_num}, 准确率: {correct_num / total_num}")

综合准确率在97.8%左右。
三、预测
import numpy as np
from PIL import Image
import tensorflow as tf
# The 36 candidate characters for each captcha position: 'a'-'z' then '0'-'9'.
LABELS = [chr(c) for c in range(ord('a'), ord('z') + 1)] + [str(d) for d in range(10)]
def predict(model, img_path):
    """Predict the 6-character captcha string for one image file.

    :param model: trained multi-output Keras model (one softmax head per character)
    :param img_path: path of a 50x200 captcha image
    :return: the predicted 6-character string
    """
    # Force 3 channels: PNGs are often RGBA or palette-mode, which would make
    # the reshape to (-1, 50, 200, 3) below fail.
    image = Image.open(img_path).convert('RGB')
    X = np.array(image)
    X = X / 255.  # same scaling as at training time
    # Returns one (1, 36) softmax per character position; the second argument
    # is batch_size and is harmless for a single image.
    preds = model.predict(X.reshape(-1, 50, 200, 3), 36)
    predicted_digits = [np.argmax(pred, axis=1) for pred in preds]
    # Combine the per-position predictions into the full captcha string.
    digits = []
    digits_str = ''
    for digit in predicted_digits:
        digits.append(digit[0])
        digits_str += LABELS[digit[0]]
    print(digits)
    return digits_str
if __name__ == "__main__":
    # Single-image smoke test: the file name carries the expected answer "3m66b2".
    sample = '/tmp/sample_verify/3m66b2_f1289f385acea49c5a41c7bab135a607.png'
    model = tf.keras.models.load_model("captcha_model_resnet.h5")
    print("result:", predict(model, sample))


没毛病
四、总结
残差网络利用激活值 a[l+2] = g( z[l+2] + a[l] ) 来跳跃传递激活值,从而避免梯度消失问题,使网络可以加深,学习到更多特征,提升准确率。

而残差块又分为输入与输出同维度的、不同维度的两种:

下面是ResNet50的结构:

x3表示3个identity_block
2408

被折叠的 条评论
为什么被折叠?



