图形验证码如下:
训练两轮后的准确率如下(上方打印的是识别错误的样本,即预测结果与标签不一致的图片):
config_demo.yaml
# Configuration for the captcha OCR scripts (train.py / interface_testset.py).
# Key names are looked up verbatim by the code, so their spelling
# (e.g. "EarlyStoping", "MaxTextLenth") must not be changed here alone.
System:
  # NOTE(review): not read by the scripts shown alongside this config.
  GpuMemoryFraction: 0.7
  # Directory prefixes; train.py concatenates them directly with file names,
  # so the trailing slash is required.
  TrainSetPath: 'train/'
  TestSetPath: 'test/'
  ValSetPath: 'dev/'
  LabelRegex: '([\u4E00-\u9FA5]{4,8}).jpg'
  MaxTextLenth: 8
  # Images are resized to IMG_W x IMG_H before being fed to the network.
  IMG_W: 200
  IMG_H: 100
  # The checkpoint is stored as ./tmp/train_<ModelName>.
  ModelName: 'captcha2.h5'
  Alphabet: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'

NeuralNet:
  RNNSize: 256
  Dropout: 0.25

TrainParam:
  # Passed to keras.callbacks.EarlyStopping.
  EarlyStoping:
    monitor: 'val_acc'
    patience: 10
    mode: 'auto'
    baseline: 0.02
  Epochs: 10
  BatchSize: 100
  TestBatchSize: 10
train.py
# coding=utf-8
"""Train a CNN + GRU + CTC captcha recogniser.

Three-channel images are converted to grayscale before training.
"""
import itertools
import os
import re
import random
import string
from collections import Counter
from os.path import join

import yaml
import cv2
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras.layers import Input, Dense, Activation, Dropout, BatchNormalization, Reshape, Lambda
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers.merge import add, concatenate
from keras.layers.recurrent import GRU
from keras.models import Model, load_model

# safe_load builds only plain Python objects; bare yaml.load() without a
# Loader is unsafe and is rejected by recent PyYAML releases.
with open('./config/config_demo.yaml', 'r', encoding='utf-8') as f:
    cfg = f.read()
cfg_dict = yaml.safe_load(cfg)

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
K.set_session(session)

TRAIN_SET_PTAH = cfg_dict['System']['TrainSetPath']
# NOTE(review): validation reads 'TrainSetPath', not 'ValSetPath'. The file
# names always come from train.csv, so pointing this at another directory
# would require a matching label file -- confirm the intent before changing.
VALID_SET_PATH = cfg_dict['System']['TrainSetPath']
TEST_SET_PATH = cfg_dict['System']['TestSetPath']
IMG_W = cfg_dict['System']['IMG_W']
IMG_H = cfg_dict['System']['IMG_H']
MODEL_NAME = cfg_dict['System']['ModelName']
LABEL_REGEX = cfg_dict['System']['LabelRegex']

RNN_SIZE = cfg_dict['NeuralNet']['RNNSize']
DROPOUT = cfg_dict['NeuralNet']['Dropout']

MONITOR = cfg_dict['TrainParam']['EarlyStoping']['monitor']
PATIENCE = cfg_dict['TrainParam']['EarlyStoping']['patience']
MODE = cfg_dict['TrainParam']['EarlyStoping']['mode']
BASELINE = cfg_dict['TrainParam']['EarlyStoping']['baseline']
EPOCHS = cfg_dict['TrainParam']['Epochs']
BATCH_SIZE = cfg_dict['TrainParam']['BatchSize']
TEST_BATCH_SIZE = cfg_dict['TrainParam']['TestBatchSize']

# Maps image file name -> label padded with '_' to MAX_LEN characters.
letters_dict = {}
# Length of the longest label in train.csv; set by get_maxlen().
MAX_LEN = 0


def _read_label_rows():
    """Return the (name, label) pairs stored in train.csv, skipping blank lines."""
    rows = []
    with open("train.csv", "r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            name, label = line.split(",")
            rows.append((name, label))
    return rows


def get_maxlen():
    """Return the longest label length in train.csv and store it in MAX_LEN."""
    global MAX_LEN
    maxlen = max((len(label) for _, label in _read_label_rows()), default=0)
    MAX_LEN = maxlen
    return maxlen


def get_letters():
    """Fill ``letters_dict`` from train.csv and return the alphabet string.

    An existing letters.txt takes precedence so that class indices stay
    stable between runs; otherwise the alphabet is the sorted set of
    characters seen in the labels (sorted to make the order deterministic).
    """
    global letters_dict
    rows = _read_label_rows()
    maxlen = get_maxlen()
    seen = []
    for name, label in rows:
        seen.append(label)
        # Pad short labels with '_' so every label has the same length.
        letters_dict[name] = label.ljust(maxlen, '_')
    if os.path.exists("letters.txt"):
        with open("letters.txt", "r", encoding="utf-8") as fh:
            return fh.read()
    return "".join(sorted(set("".join(seen))))


letters = get_letters()
# Persist the alphabet; closing the file guarantees it is flushed to disk.
with open("letters.txt", "w", encoding="utf-8") as f_W:
    f_W.write("".join(letters))
class_num = len(letters) + 1  # plus 1 for blank
print('Letters:', ''.join(letters))
print("letters_num:", class_num)


def labels_to_text(labels):
    """Convert class indices to text, dropping the blank index ``len(letters)``."""
    blank = len(letters)
    return ''.join(letters[int(x)] for x in labels if int(x) != blank)


def text_to_labels(text):
    """Convert text to class indices; unknown characters map to the blank index."""
    blank = len(letters)
    indices = []
    for ch in text:
        pos = letters.find(ch)
        indices.append(pos if pos > -1 else blank)
    return indices


def is_valid_str(s):
    """Return True when every character of ``s`` belongs to the alphabet."""
    return all(ch in letters for ch in s)


class TextImageGenerator:
    """Loads every labelled image into memory and serves training batches.

    Args:
        dirpath: prefix concatenated directly with each file name from
            ``letters_dict`` (so it must end with a path separator).
        tag: accepted for call compatibility; not used.
        img_w, img_h: size every image is resized to.
        batch_size: number of samples per generated batch.
        downsample_factor: width reduction of the conv stack, used to
            compute the CTC input length.
    """

    def __init__(self, dirpath, tag, img_w, img_h, batch_size, downsample_factor):
        self.img_h = img_h
        self.img_w = img_w
        self.batch_size = batch_size
        self.downsample_factor = downsample_factor
        self.letters_dict = letters_dict
        self.n = len(self.letters_dict)
        self.indexes = list(range(self.n))
        self.cur_index = 0
        self.imgs = np.zeros((self.n, self.img_h, self.img_w))
        self.texts = []
        for i, (img_name, text) in enumerate(self.letters_dict.items()):
            img_filepath = dirpath + img_name
            if i == 0:
                # NOTE(review): the first entry is hard-wired to this path,
                # presumably to work around a malformed first name in
                # train.csv (e.g. a BOM) -- confirm before removing.
                img_filepath = "train/0.jpg"
            img = cv2.imread(img_filepath)
            if img is None:
                # cv2.imread signals failure by returning None.
                raise FileNotFoundError('cannot read image: %s' % img_filepath)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # cv2 loads images as BGR
            img = cv2.resize(img, (self.img_w, self.img_h))
            img = img.astype(np.float32)
            img /= 255
            self.imgs[i, :, :] = img
            self.texts.append(text)
        print(len(self.texts), len(self.imgs), self.n)

    @staticmethod
    def get_output_size():
        """Return the number of output classes (alphabet plus CTC blank)."""
        return len(letters) + 1

    def next_sample(self):
        """Return the next (image, padded label) pair, reshuffling after each pass."""
        self.cur_index += 1
        if self.cur_index >= self.n:
            self.cur_index = 0
            random.shuffle(self.indexes)
        idx = self.indexes[self.cur_index]
        return self.imgs[idx], self.texts[idx]

    def next_batch(self):
        """Yield ``(inputs, outputs)`` batches forever.

        Keras' ``fit_generator`` expects a generator that never runs dry, so
        the loop is infinite; consumers must stop iterating on their own.
        """
        channels_first = K.image_data_format() == 'channels_first'
        while True:
            # width and height are backwards from typical Keras convention
            # because width is the time dimension when it gets fed into the RNN
            if channels_first:
                X_data = np.ones([self.batch_size, 1, self.img_w, self.img_h])
            else:
                X_data = np.ones([self.batch_size, self.img_w, self.img_h, 1])
            Y_data = np.ones([self.batch_size, MAX_LEN])
            input_length = np.ones((self.batch_size, 1)) * (self.img_w // self.downsample_factor - 2)
            label_length = np.zeros((self.batch_size, 1))
            source_str = []

            for i in range(self.batch_size):
                img, text = self.next_sample()
                img = img.T
                # add the channel axis
                img = np.expand_dims(img, 0 if channels_first else -1)
                X_data[i] = img
                Y_data[i] = text_to_labels(text)
                source_str.append(text)
                # important step: the true label length excludes the padding
                label_length[i] = len(text.replace("_", ""))

            inputs = {
                'the_input': X_data,
                'the_labels': Y_data,
                'input_length': input_length,
                'label_length': label_length,
                # 'source_str': source_str
            }
            outputs = {'ctc': np.zeros([self.batch_size])}
            yield (inputs, outputs)


# Loss and train functions, network architecture
def ctc_lambda_func(args):
    """Compute the CTC batch cost (a sequence loss) inside a Lambda layer."""
    y_pred, labels, input_length, label_length = args
    # the 2 is critical here since the first couple outputs of the RNN
    # tend to be garbage:
    y_pred = y_pred[:, 2:, :]
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)


downsample_factor = 4


def train(img_w=IMG_W, img_h=IMG_H, dropout=DROPOUT, batch_size=BATCH_SIZE, rnn_size=RNN_SIZE):
    """Build the conv + bidirectional GRU + CTC model and fit it.

    Args:
        img_w, img_h: input image size.
        dropout: dropout rate; a falsy value disables the dropout layers.
        batch_size: samples per generated batch.
        rnn_size: number of units in every GRU layer.
    """
    global downsample_factor

    # Network parameters
    conv_filters = 16
    kernel_size = (3, 3)
    pool_size = 2
    time_dense_size = 32

    if K.image_data_format() == 'channels_first':
        input_shape = (1, img_w, img_h)
    else:
        input_shape = (img_w, img_h, 1)

    downsample_factor = pool_size ** 2
    tiger_train = TextImageGenerator(TRAIN_SET_PTAH, 'train', img_w, img_h, batch_size, downsample_factor)
    tiger_val = TextImageGenerator(VALID_SET_PATH, 'val', img_w, img_h, batch_size, downsample_factor)

    act = 'relu'
    input_data = Input(name='the_input', shape=input_shape, dtype='float32')
    inner = Conv2D(conv_filters, kernel_size, padding='same',
                   activation=None, kernel_initializer='he_normal',
                   name='conv1')(input_data)
    inner = BatchNormalization()(inner)  # add BN
    inner = Activation(act)(inner)
    inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max1')(inner)

    inner = Conv2D(conv_filters, kernel_size, padding='same',
                   activation=None, kernel_initializer='he_normal',
                   name='conv2')(inner)
    inner = BatchNormalization()(inner)  # add BN
    inner = Activation(act)(inner)
    inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max2')(inner)

    conv_to_rnn_dims = (img_w // (pool_size ** 2), (img_h // (pool_size ** 2)) * conv_filters)
    inner = Reshape(target_shape=conv_to_rnn_dims, name='reshape')(inner)

    # cuts down input size going into RNN:
    inner = Dense(time_dense_size, activation=None, name='dense1')(inner)
    inner = BatchNormalization()(inner)  # add BN
    inner = Activation(act)(inner)
    if dropout:
        inner = Dropout(dropout)(inner)  # guards against overfitting

    # Two layers of bidirectional GRUs
    # GRU seems to work as well, if not better than LSTM:
    gru_1 = GRU(rnn_size, return_sequences=True,
                kernel_initializer='he_normal', name='gru1')(inner)
    gru_1b = GRU(rnn_size, return_sequences=True, go_backwards=True,
                 kernel_initializer='he_normal', name='gru1_b')(inner)
    gru1_merged = add([gru_1, gru_1b])
    gru_2 = GRU(rnn_size, return_sequences=True,
                kernel_initializer='he_normal', name='gru2')(gru1_merged)
    gru_2b = GRU(rnn_size, return_sequences=True, go_backwards=True,
                 kernel_initializer='he_normal', name='gru2_b')(gru1_merged)

    inner = concatenate([gru_2, gru_2b])
    if dropout:
        inner = Dropout(dropout)(inner)  # guards against overfitting

    # transforms RNN output to character activations:
    inner = Dense(tiger_train.get_output_size(), kernel_initializer='he_normal',
                  name='dense2')(inner)
    y_pred = Activation('softmax', name='softmax')(inner)
    base_model = Model(inputs=input_data, outputs=y_pred)
    base_model.summary()

    labels = Input(name='the_labels', shape=[MAX_LEN], dtype='float32')
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')
    # Keras doesn't currently support loss funcs with extra parameters
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(
        [y_pred, labels, input_length, label_length])

    model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
    # the loss calc occurs elsewhere, so use a dummy lambda func for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='adadelta')

    # NOTE(review): the model is compiled without metrics, so a monitor such
    # as 'val_acc' may never be reported to EarlyStopping -- confirm.
    earlystoping = EarlyStopping(monitor=MONITOR, patience=PATIENCE, verbose=1,
                                 mode=MODE, baseline=BASELINE)
    train_model_path = './tmp/train_' + MODEL_NAME
    # The checkpoint cannot be written unless its directory exists.
    os.makedirs(os.path.dirname(train_model_path), exist_ok=True)
    checkpointer = ModelCheckpoint(filepath=train_model_path, verbose=1, save_best_only=True)

    if os.path.exists(train_model_path):
        model.load_weights(train_model_path)
        print('load model weights:%s' % train_model_path)

    evaluator = Evaluate(model)
    # NOTE(review): steps_per_epoch counts batches, so using the sample count
    # makes one epoch cover n * batch_size samples; initial_epoch=1 means
    # EPOCHS - 1 epochs are run. Both are kept as in the original.
    model.fit_generator(generator=tiger_train.next_batch(),
                        steps_per_epoch=tiger_train.n,
                        epochs=EPOCHS,
                        initial_epoch=1,
                        validation_data=tiger_val.next_batch(),
                        validation_steps=tiger_val.n,
                        callbacks=[checkpointer, earlystoping, evaluator])
    print('----train end----')


# For a real OCR application, this should be beam search with a dictionary
# and language model. For this example, best path is sufficient.
def decode_batch(out):
    """Greedy (best-path) decoding of a batch of softmax outputs.

    For each sample the first two time steps are skipped, the arg-max class
    is taken per step, consecutive repeats are collapsed, and indices outside
    the alphabet (the blank) are dropped.
    """
    ret = []
    for j in range(out.shape[0]):
        out_best = list(np.argmax(out[j, 2:], 1))
        out_best = [k for k, g in itertools.groupby(out_best)]
        ret.append(''.join(letters[c] for c in out_best if c < len(letters)))
    return ret


class Evaluate(Callback):
    """Keras callback that runs ``evaluate`` at the end of every epoch."""

    def __init__(self, model):
        # Initialise the base-class state before adding our own attributes.
        super().__init__()
        self.accs = []  # case-insensitive accuracy recorded per epoch
        self.model = model

    def on_epoch_end(self, epoch, logs=None):
        acc = evaluate(self.model)
        self.accs.append(acc)


# Test on validation images
def evaluate(model):
    """Measure exact and case-insensitive accuracy of ``model``.

    A prediction model is cut out between the 'the_input' and 'softmax'
    layers, every mismatching prediction is printed, and iteration stops
    once the batch generator is exhausted or 10000 samples were checked.

    Returns:
        The case-insensitive accuracy (0.0 when no sample was evaluated).
    """
    global downsample_factor
    tiger_test = TextImageGenerator(VALID_SET_PATH, 'test', IMG_W, IMG_H, TEST_BATCH_SIZE, downsample_factor)

    net_inp = model.get_layer(name='the_input').input
    net_out = model.get_layer(name='softmax').output
    predict_model = Model(inputs=net_inp, outputs=net_out)

    equalsIgnoreCaseNum = 0.00
    equalsNum = 0.00
    totalNum = 0.00
    for inp_value, _ in tiger_test.next_batch():
        X_data = inp_value['the_input']
        batch_size = X_data.shape[0]

        net_out_value = predict_model.predict(X_data)
        pred_texts = decode_batch(net_out_value)
        texts = [labels_to_text(label) for label in inp_value['the_labels']]

        for i in range(batch_size):
            totalNum += 1
            if pred_texts[i] == texts[i]:
                equalsNum += 1
            if pred_texts[i].lower() == texts[i].lower():
                equalsIgnoreCaseNum += 1
            else:
                print('Predict: %s ---> Label: %s' % (pred_texts[i], texts[i]))
        # The generator may be endless, so cap the number of samples.
        if totalNum >= 10000:
            break

    print('---Result---')
    if not totalNum:
        # Nothing was evaluated; avoid dividing by zero.
        print('Test num: 0')
        return 0.0
    print('Test num: %d, accuracy: %.5f, ignoreCase accuracy: %.5f' % (
        totalNum, equalsNum / totalNum, equalsIgnoreCaseNum / totalNum))
    return equalsIgnoreCaseNum / totalNum


if __name__ == '__main__':
    train()
    test = True
    if test:
        model_path = './tmp/train_' + MODEL_NAME
        model = load_model(model_path, compile=False)
        evaluate(model)
    print('----End----')
interface_testset.py
"""Run the trained captcha model over a directory of numbered test images."""
import itertools
import string
import yaml
from tqdm import tqdm
import cv2
import numpy as np
import os
import tensorflow as tf
from keras import backend as K
from keras.models import Model, load_model

# safe_load builds only plain Python objects; bare yaml.load() without a
# Loader is unsafe and is rejected by recent PyYAML releases.
with open('./config/config_demo.yaml', 'r', encoding='utf-8') as f:
    cfg = f.read()
cfg_dict = yaml.safe_load(cfg)

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
K.set_session(session)

MODEL_NAME = cfg_dict['System']['ModelName']


def _load_letters(path="letters.txt"):
    """Return the alphabet written by train.py, or the ASCII default if absent.

    Decoding must use the same index -> character table as training, and
    train.py persists that table in letters.txt.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as fh:
            return fh.read()
    return string.ascii_uppercase + string.ascii_lowercase + string.digits


letters = _load_letters()


def decode_batch(out):
    """Greedy (best-path) decoding of a batch of softmax outputs.

    The first two time steps are skipped, consecutive repeats are collapsed
    and indices outside the alphabet (the blank) are dropped.
    """
    ret = []
    for j in range(out.shape[0]):
        out_best = list(np.argmax(out[j, 2:], 1))
        out_best = [k for k, g in itertools.groupby(out_best)]
        ret.append(''.join(letters[c] for c in out_best if c < len(letters)))
    return ret


def get_x_data(img_data, img_w, img_h):
    """Turn one image as returned by ``cv2.imread`` into a batch of size one.

    The image is converted to grayscale, resized to ``img_w`` x ``img_h``,
    scaled to [0, 1] and transposed so that width is the first spatial axis.
    """
    # cv2.imread yields BGR data; train.py converts with COLOR_BGR2GRAY too.
    img = cv2.cvtColor(img_data, cv2.COLOR_BGR2GRAY)
    img = cv2.resize(img, (img_w, img_h))
    img = img.astype(np.float32)
    img /= 255

    batch_size = 1
    channels_first = K.image_data_format() == 'channels_first'
    if channels_first:
        X_data = np.ones([batch_size, 1, img_w, img_h])
    else:
        X_data = np.ones([batch_size, img_w, img_h, 1])
    img = img.T
    img = np.expand_dims(img, 0 if channels_first else -1)
    X_data[0] = img
    return X_data


# Test on validation images
def interface(datapath="./testset", img_w=200, img_h=100):
    """Predict every ``<idx>.jpg`` in ``datapath`` and write answer.csv.

    Images are expected to be named 0.jpg .. (N-1).jpg, where N is the number
    of directory entries. answer.csv is rewritten from scratch on every call
    with one ``idx,prediction`` row per image.

    Raises:
        FileNotFoundError: when an expected image cannot be read.
    """
    model_path = './tmp/train_' + MODEL_NAME
    model = load_model(model_path, compile=False)

    net_inp = model.get_layer(name='the_input').input
    net_out = model.get_layer(name='softmax').output
    predict_model = Model(inputs=net_inp, outputs=net_out)

    print("开始预测,预测结果:")
    listdir = os.listdir(datapath)
    # "w" really clears earlier results (truncate() after opening in append
    # mode does not); newline="" keeps the explicit "\r\n" untouched.
    with open("answer.csv", "w", encoding="utf-8", newline="") as save_file:
        for idx in tqdm(range(len(listdir)), total=len(listdir)):
            img_path = datapath + "/" + str(idx) + ".jpg"
            img_data = cv2.imread(img_path)
            if img_data is None:
                # cv2.imread signals failure by returning None.
                raise FileNotFoundError('cannot read image: %s' % img_path)
            X_data = get_x_data(img_data, img_w, img_h)
            net_out_value = predict_model.predict(X_data)
            pred_texts = decode_batch(net_out_value)
            save_file.write(str(idx) + "," + pred_texts[0] + "\r\n")


if __name__ == '__main__':
    interface(datapath="./testset")