- 前言
- 框架
- 数据生成
- 模型训练
- 结果预测
- 问题
- 参考链接和论文
前言
这篇文章主要介绍验证码识别项目的CNN实现以及自己的思考,后续再对流程和模型细节加以改进。现在验证码的种类越来越多,类型概述可以参考《字符验证码杀手--CNN》一文,介绍得比较详细,配图也较多。
本次任务主要是对4位数字和大小写字母组成的验证码图片进行识别,把4位字符识别看作一个多标签分类任务。涉及到的环境:captcha模块生成数据集,tensorflow搭建cnn模型,PIL和matplotlib用于处理图片和显示。模型训练4个小时,测试准确率达到85%。
框架
│ captcha_model.py
│ generate_captcha.py
│ predict_captcha_batch.py
│ trian_model.py
├─check_point
│
├─data
│
├─logs
│ ├─test
│ └─train
└─__pycache__
captcha_model.cpython-35.pyc
generate_captcha.cpython-35.pyc
- captcha_model.py 定义了cnn模型
- generate_captcha.py 用于生成训练数据和测试数据
- predict_captcha_batch.py 用于测试训练好的模型,显示图片和预测结果
- trian_model.py 用于训练模型
- check_point 保存模型参数
- data 保存生成的测试图片
- logs 记录了训练测试过程的损失和准确率,tensorboard可以进行可视化
数据生成
基于captcha实现图片验证码自动生成,本次实现未选择字体,生成图片大小默认宽高为160*60,包含4个字符,包含大小写字母和数字。实际处理图片验证码表现形式多样,可能需要爬取较大数量的图片训练模型。
generate_captcha.py
from captcha.image import ImageCaptcha
from PIL import Image
import numpy as np
import random
import string
class generateCaptcha():
    """Generate captcha images and their one-hot labels.

    Images are rendered with ``ImageCaptcha`` and converted to grayscale.
    A label is one one-hot row of ``classes`` entries per character position,
    flattened to a vector of length ``char_num * classes``.
    """

    def __init__(self,
                 width=160,  # captcha image width in pixels
                 height=60,  # captcha image height in pixels
                 char_num=4,  # number of characters per captcha
                 characters=string.digits + string.ascii_uppercase + string.ascii_lowercase):  # alphabet: digits + upper case + lower case
        self.width = width
        self.height = height
        self.char_num = char_num
        self.characters = characters
        self.classes = len(characters)

    def _random_text(self):
        """Return a random captcha text of ``char_num`` characters.

        Characters are drawn independently, so repeated characters are
        possible. The previous ``random.sample`` draw could never repeat a
        character and raised when ``char_num`` exceeded the alphabet size.
        """
        return ''.join(random.choice(self.characters) for _ in range(self.char_num))

    def _encode_label(self, captcha_str):
        """Return the flat one-hot label vector for ``captcha_str``.

        Raises:
            ValueError: if a character is not part of ``self.characters``.
        """
        one_hot = np.zeros([self.char_num, self.classes])
        for pos, ch in enumerate(captcha_str):
            one_hot[pos, self.characters.index(ch)] = 1
        return one_hot.reshape(self.char_num * self.classes)

    def gen_captcha(self, batch_size=50):
        """Yield an endless stream of ``(X, Y)`` batches.

        X has shape ``[batch_size, height, width, 1]`` with grayscale pixel
        values scaled to ``[0, 1]``; Y has shape
        ``[batch_size, char_num * classes]`` and holds the one-hot labels.
        """
        image = ImageCaptcha(width=self.width, height=self.height)
        while True:
            # Allocate fresh arrays for every batch. Reusing one label array
            # kept the ones of earlier batches, and reshaping it to 2-D after
            # the first batch broke the 3-D indexing on the next iteration.
            X = np.zeros([batch_size, self.height, self.width, 1])
            Y = np.zeros([batch_size, self.char_num * self.classes])
            for i in range(batch_size):
                captcha_str = self._random_text()
                img = image.generate_image(captcha_str).convert('L')
                X[i] = np.reshape(np.asarray(img), [self.height, self.width, 1]) / 255.0
                Y[i] = self._encode_label(captcha_str)
            yield X, Y

    def decode_captcha(self, y):
        """Decode the first sample of ``y`` back into its captcha text.

        ``y`` may be a batch of flat label/score vectors or a single flat
        vector of length ``char_num * classes``.
        """
        y = np.reshape(y, (-1, self.char_num, self.classes))
        return ''.join(self.characters[x] for x in np.argmax(y, axis=2)[0, :])

    def get_parameter(self):
        """Return ``(width, height, char_num, characters, classes)``."""
        return self.width, self.height, self.char_num, self.characters, self.classes

    def gen_test_captcha(self, size=10):
        """Generate ``size`` captcha images under ./data and return their texts.

        Each image is saved as ``./data/<text>.jpg``; two captchas that happen
        to share the same text share one file.
        """
        import os

        # Create the output folder up front so saving cannot fail on a missing directory.
        os.makedirs('./data', exist_ok=True)
        image = ImageCaptcha(width=self.width, height=self.height)
        label = []
        for _ in range(size):
            captcha_str = self._random_text()
            label.append(captcha_str)
            image.generate_image(captcha_str).save('./data/' + captcha_str + '.jpg')
        return label
模型训练
定义了模型结构为 3个卷积层连接2个全连接层,以64张图片作为一批训练,显卡GTX960M。损失函数采用sigmoid_cross_entropy_with_logits(注意下tensorflow的四种交叉熵函数)。想得到较高的精度,需要迭代的次数较多。迭代40000次花了4个多小时,前面一个小时,损失下降慢,准确率提不上去。
train_model.py 参考别人的代码,修改了tensorboard可视化和训练,代码还有待优化。
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 2 19:53:50 2018
@author: Alvysinger
"""
import tensorflow as tf
import numpy as np
import string
import generate_captcha
import captcha_model
import datetime
if __name__ == '__main__':
    # Training entry point: builds the graph, then trains on freshly generated
    # captcha batches until the evaluation accuracy or the step limit is reached.
    import os

    captcha = generate_captcha.generateCaptcha()
    width, height, char_num, characters, classes = captcha.get_parameter()

    # Graph inputs: grayscale images, flat one-hot labels and the dropout keep probability.
    x = tf.placeholder(tf.float32, [None, height, width, 1])
    y_ = tf.placeholder(tf.float32, [None, char_num * classes])
    keep_prob = tf.placeholder(tf.float32)

    # Loss function and optimizer
    model = captcha_model.captchaModel(width, height, char_num, classes)
    y_conv = model.create_model(x, keep_prob)
    cross_entropy = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=y_conv))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    # Evaluation metric: fraction of character positions whose argmax matches
    # the label (a per-character accuracy, not a per-captcha one).
    predict = tf.reshape(y_conv, [-1, char_num, classes])
    real = tf.reshape(y_, [-1, char_num, classes])
    correct_prediction = tf.equal(tf.argmax(predict, 2), tf.argmax(real, 2))
    correct_prediction = tf.cast(correct_prediction, tf.float32)
    accuracy = tf.reduce_mean(correct_prediction)

    saver = tf.train.Saver()
    display_step = 100
    # Make sure the checkpoint folder exists before the first save at step 0.
    os.makedirs("./check_point/", exist_ok=True)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Scalar summaries for TensorBoard
        cross_entropy_summary = tf.summary.scalar('cross_entropy', cross_entropy)
        accuracy_summary = tf.summary.scalar('accuracy', accuracy)
        merged = tf.summary.merge([cross_entropy_summary, accuracy_summary])
        test_writer = tf.summary.FileWriter("./logs/test/", sess.graph)
        train_writer = tf.summary.FileWriter("./logs/train/")

        # try/finally so both writers are closed even when training is
        # interrupted (e.g. Ctrl-C) or a step raises.
        try:
            step = 0
            while True:
                # A new generator is created for every batch and only its first batch is used.
                batch_x, batch_y = next(captcha.gen_captcha(64))
                summary_op_out, _, loss = sess.run(
                    [merged, train_step, cross_entropy],
                    feed_dict={x: batch_x, y_: batch_y, keep_prob: 0.75})
                if step % 50 == 0:
                    train_writer.add_summary(summary_op_out, step)

                if step % display_step == 0:
                    nowTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    # Evaluate on a freshly generated batch with dropout disabled.
                    batch_x_test, batch_y_test = next(captcha.gen_captcha(64))
                    summary_op_out, acc, loss = sess.run(
                        [merged, accuracy, cross_entropy],
                        feed_dict={x: batch_x_test, y_: batch_y_test, keep_prob: 1.})
                    test_writer.add_summary(summary_op_out, step)

                    # Print timestamp, step, accuracy and loss
                    print('[%s]step:%d' % (nowTime, step), end=' ')
                    print('accuracy:%f ,loss:%f' % (acc, loss))

                    if step % 5000 == 0:
                        saver.save(sess, "./check_point/" + "captcha_model.ckpt")

                    # acc is only refreshed on evaluation steps, so the stop
                    # condition is checked here, right after it is computed.
                    if acc > 0.980 or step > 50000:
                        saver.save(sess, "./check_point/" + "captcha_model.ckpt")
                        break
                step += 1
        finally:
            train_writer.close()
            test_writer.close()
    print('train finished!')
captcha_model.py
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 2 19:51:22 2018
@author: Alvysinger
"""
import tensorflow as tf
import math
class captchaModel():
    """CNN for fixed-length captcha recognition.

    The network is three conv / max-pool / dropout stages followed by two
    fully connected layers. The output holds ``char_num * classes`` logits.
    """

    def __init__(self, width=160, height=60, char_num=4, classes=62):
        self.width, self.height = width, height
        self.char_num, self.classes = char_num, classes

    def conv2d(self, x, W):
        """Convolve ``x`` with kernel ``W`` using stride 1 and SAME padding."""
        unit_strides = [1, 1, 1, 1]
        return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

    def max_pool_2x2(self, x):
        """Apply 2x2 max pooling with stride 2 and SAME padding."""
        return tf.nn.max_pool(
            x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    def weight_variable(self, shape):
        """Create a weight variable drawn from a truncated normal (stddev 0.1)."""
        return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

    def bias_variable(self, shape):
        """Create a bias variable filled with the constant 0.1."""
        return tf.Variable(tf.constant(0.1, shape=shape))

    def create_model(self, x_images, keep_prob):
        """Build the network on ``x_images`` and return the output logits.

        ``keep_prob`` is the dropout keep probability applied after every
        pooling stage and after the first fully connected layer.
        """
        feature_map = x_images
        map_width, map_height = self.width, self.height

        # Three conv stages with 5x5 kernels. Variables are created weight
        # first, then bias, stage by stage, exactly as in the unrolled form.
        for in_channels, out_channels in ((1, 32), (32, 64), (64, 64)):
            kernel = self.weight_variable([5, 5, in_channels, out_channels])
            bias = self.bias_variable([out_channels])
            activated = tf.nn.relu(
                tf.nn.bias_add(self.conv2d(feature_map, kernel), bias))
            pooled = self.max_pool_2x2(activated)
            feature_map = tf.nn.dropout(pooled, keep_prob)
            # Track the spatial size after each stride-2 SAME pooling.
            map_width = math.ceil(map_width / 2)
            map_height = math.ceil(map_height / 2)

        # First fully connected layer on the flattened feature map.
        flat_size = 64 * int(map_width) * int(map_height)
        w_dense = self.weight_variable([flat_size, 1024])
        b_dense = self.bias_variable([1024])
        flattened = tf.reshape(feature_map, [-1, flat_size])
        dense = tf.nn.relu(tf.nn.bias_add(tf.matmul(flattened, w_dense), b_dense))
        dense_dropped = tf.nn.dropout(dense, keep_prob)

        # Output layer: one logit per (character position, class) pair.
        n_outputs = self.char_num * self.classes
        w_out = self.weight_variable([1024, n_outputs])
        b_out = self.bias_variable([n_outputs])
        return tf.add(tf.matmul(dense_dropped, w_out), b_out)
结果预测
恢复模型,利用captcha生成验证码图片并保存到文件夹,再从文件夹中读取图片进行预测。少量样本预测的准确率不稳定;训练过程中测试的准确率有85%+,但该指标是按单个字符统计的,而这里按整张验证码(4个字符全部正确)统计,因此数值会更低。下面打印出部分结果,整体预测还算理想,细节部分仍需处理。容易发现的问题:字母大小写判定错误(实际填写验证码时通常不区分大小写),以及0、o和O容易混淆。
验证码预测: 1yz8 , 实际值: 1yzg , 结果:False
=============================
验证码预测: Yd6s , 实际值: Yd6S , 结果:False

=============================
验证码预测: cNFW , 实际值: CNFW , 结果:False

=============================
验证码预测: eskd , 实际值: eskd , 结果:True

=============================
验证码预测: 9SUI , 实际值: 9SUI , 结果:True

=============================
验证码预测: gTef , 实际值: gTef , 结果:True

=============================
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 2 22:26:39 2018
@author: Alvysinger
"""
from PIL import Image, ImageFilter
import tensorflow as tf
import numpy as np
import string
import sys
import generate_captcha
import captcha_model
import os
import matplotlib.pyplot as plt
# Restrict this process to the first GPU. CUDA_VISIBLE_DEVICES is the variable
# the CUDA runtime reads; the former key 'CUDA_ENABLE_DEVICES' had no effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
def del_jpg():
    """Delete the generated .jpg captcha images from ./data.

    A missing ./data directory is treated as already empty. Only regular
    files with a .jpg extension are removed; sub-directories and other
    files are left untouched.
    """
    path = './data'
    if not os.path.isdir(path):
        return
    for name in os.listdir(path):
        target = os.path.join(path, name)
        # Skip directories: os.remove would raise on them.
        if name.lower().endswith('.jpg') and os.path.isfile(target):
            os.remove(target)
def display_captcha(pre_list, label, showfigure=False, showlabel=False):
    """Report predicted captcha texts against their true labels.

    Steps:
      1. Convert every row of class indices in ``pre_list`` to a string.
      2. Optionally print each prediction next to its label.
      3. Optionally show the matching image ``./data/<label>.jpg``.
    Idea for later: render prediction and label into the image (e.g. as a
    gif) for a more intuitive view.

    Args:
        pre_list: one sequence of character-class indices per captcha.
        label: true captcha texts, in the same order as ``pre_list``.
        showfigure: when truthy, display each captcha image.
        showlabel: when truthy, print each prediction/label pair.

    Returns:
        The fraction of captchas whose whole text was predicted correctly,
        or 0.0 when ``pre_list`` is empty.
    """
    filePathName = './data/'
    characters = string.digits + string.ascii_uppercase + string.ascii_lowercase
    pre_label = [''.join(characters[j] for j in indices) for indices in pre_list]

    count = 0
    for i, predicted in enumerate(pre_label):
        matched = predicted == label[i]
        if matched:
            count += 1
        # Plain truthiness instead of "is True", so numpy booleans and other
        # truthy flags are honoured too.
        if showlabel:
            print('=============================')
            print('验证码预测: %s , 实际值: %s , 结果:%s' % (predicted, label[i], matched))
        if showfigure:
            img = Image.open(filePathName + label[i] + '.jpg')
            plt.imshow(img)
            plt.axis('off')
            plt.show()

    size = len(pre_label)
    # Guard against division by zero on an empty prediction list.
    accuracy = count / float(size) if size else 0.0
    print('准确率: %f' % accuracy)
    return accuracy
if __name__ == '__main__':
    # Prediction entry point: generate a few captcha images, restore the
    # trained model and compare its predictions with the true texts.

    # Clear whatever a previous run left in the data folder.
    del_jpg()

    # Number of captcha images to generate, plus the display switches.
    generate_size = 10
    showfigure = True
    showlabel = True

    captcha = generate_captcha.generateCaptcha()
    width, height, char_num, characters, classes = captcha.get_parameter()
    label = captcha.gen_test_captcha(size=generate_size)

    # Read the saved images back as grayscale arrays scaled to [0, 1].
    batch_size = len(label)
    test_x = np.zeros([batch_size, height, width, 1])
    for i, name in enumerate(label):
        gray_image = Image.open('./data/' + name + '.jpg').convert('L')
        pixels = np.array(gray_image.getdata())
        test_x[i] = np.reshape(pixels, [height, width, 1]) / 255.0

    # Rebuild the same graph that was used for training.
    x = tf.placeholder(tf.float32, [None, height, width, 1])
    keep_prob = tf.placeholder(tf.float32)
    model = captcha_model.captchaModel(width, height, char_num, classes)
    y_conv = model.create_model(x, keep_prob)
    per_char_logits = tf.reshape(y_conv, [-1, char_num, classes])
    predict = tf.argmax(per_char_logits, 2)

    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver()

    # GPU memory could be capped by passing a tf.ConfigProto with gpu_options
    # to tf.Session; the default session configuration is used here.
    with tf.Session() as sess:
        sess.run(init_op)
        saver.restore(sess, "./check_point/" + "captcha_model.ckpt")
        # Predict with dropout disabled.
        pre_list = sess.run(predict, feed_dict={x: test_x, keep_prob: 1})
        display_captcha(pre_list, label, showfigure=showfigure, showlabel=showlabel)
问题
- 对大小写的误识别,容易出现错误的(C,c)(S,s)(U,u)(O,0,o),验证码填写的时候本身是不需要区别字母的大小写的。在计算损失函数的时候如何把这个考虑进去。
- 传统的验证码识别是进行字符分割判断,在验证码粘连不严重的情况下,都能分割出字符进行识别。目前模型是整体的基于CNN的实现,只能是4位验证码识别,功能上比较弱。可能出现5位或者6位的验证码,如何实现变长预测,怎么样处理更加方便。后面再尝试ocr的方法来处理。
- 目前搭建CNN和全连接的模型,尝试其他网络结构(CNN+LSTM)搭建网络模型,同时保持较好的准确率。
- 模型训练的前1个小时损失下降缓慢、准确率提不上去,模型前期难以收敛,原因有待分析和解决。
- 模型大小的优化,tensorflow保存的模型文件 120M+
- 生成数据和训练模型同时进行,提高整体训练的速度。(有待学习)
- tensorboard做模型训练分析
《CAPTCHA Recognition with Active Deep Learning》介绍的是在较少训练样本下识别6位验证码的方法,准确率在80%以上,暂时不关注。
参考链接:
CAPTCHA Recognition with Active Deep Learning