第1关:加载与预处理图片数据
本关任务:加载图片,预处理并以TFRecords格式存储
"""
Write text features and labels into tensorflow records
"""
import random
import json
import tensorflow as tf
import cv2
import numpy as np
import os
import sys
# Height (in pixels) every image is resized to; width scales to keep aspect ratio
_IMAGE_HEIGHT = 32
# Directory containing the raw image dataset
_IMAGE_DIR = './data/images/'
# Path to the image list file (one "image_name label_text" pair per line)
_IMAGE_LIST_FILE = './data/image_list.txt'
# Output directory for the generated TFRecord files
_DATA_DIR = './tfrecords/'
# Fraction of the dataset held out as the test split
_SPLIT_FRACTION = 0.1
# Path to the character-to-integer map (JSON)
_CHAR_MAP_FILE = './char_map/char_map.json'
# Conversion helpers: wrap Python values as tf.train.Feature
def _int64_feature(value):
    """Wrap an int (or list of ints) in a tf.train.Feature holding an Int64List."""
    values = value if isinstance(value, list) else [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
def _bytes_feature(value):
    """Wrap a bytes object (or list of them) in a tf.train.Feature holding a BytesList."""
    values = value if isinstance(value, list) else [value]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))
def _string_to_int(label):
    """Map a text label to a list of integer class ids via the char map.

    Reads _CHAR_MAP_FILE on every call. Raises KeyError if a character of
    `label` is missing from the map.
    """
    # Use a context manager so the map file handle is always closed;
    # the original json.load(open(...)) leaked the handle.
    with open(_CHAR_MAP_FILE, 'r') as f:
        char_map_dict = json.load(f)
    return [char_map_dict[c] for c in label]
# Load and preprocess images, then serialize them into one TFRecord file.
# dataset_name: dataset name ('train' / 'test'), used as the output file stem
# image_list: lines of "image_name label_text"
def _write_tfrecord(dataset_name, image_list):
    # **************** Begin **************** #
    tfrecords_path = os.path.join(_DATA_DIR, dataset_name + '.tfrecord')
    with tf.python_io.TFRecordWriter(tfrecords_path) as writer:
        for i, data in enumerate(image_list):
            # strip the trailing newline, then split once into "name label"
            parts = data.strip().split()
            name = parts[0]
            label = parts[1].lower()
            image_path = os.path.join(_IMAGE_DIR, name)
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread returns None for missing/unreadable files;
                # skip them instead of crashing on image.shape below.
                print('skip unreadable image: {:s}'.format(image_path))
                continue
            h, w, c = image.shape
            # resize to the fixed height, scaling width to keep aspect ratio
            height = _IMAGE_HEIGHT
            width = int(w * height / h)
            image = cv2.resize(image, (width, height))
            _, image_buffer = cv2.imencode('.jpg', image)
            # BytesList needs bytes, so encode the name under Python 3
            name = name if sys.version_info[0] < 3 else name.encode('utf-8')
            feature = {
                'label': _int64_feature(_string_to_int(label)),
                # tobytes() replaces the deprecated tostring() alias
                'image_raw': _bytes_feature(image_buffer.tobytes()),
                'name': _bytes_feature(name),
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
    # ***************** End ***************** #
# Split the image list into train/test sets and write both TFRecord files.
# train_list: image_list lines for the training set
# test_list: image_list lines for the test set
def convert_dataset():
    # **************** Begin **************** #
    with open(_IMAGE_LIST_FILE, 'r') as path:
        dataset = path.readlines()
    split_ind = int(len(dataset) * (1 - _SPLIT_FRACTION))
    # BUG FIX: the original 'dataset[:split_ind - 1]' silently dropped the
    # sample at index split_ind - 1 from BOTH splits; use a clean cut so
    # every line lands in exactly one split.
    train_list = dataset[:split_ind]
    test_list = dataset[split_ind:]
    # ***************** End ***************** #
    _write_tfrecord('train', train_list)
    _write_tfrecord('test', test_list)
def main(unused_argv):
    # Entry point invoked by tf.app.run: build the train/test TFRecords.
    convert_dataset()
if __name__ == '__main__':
    # tf.app.run parses command-line flags, then calls main()
    tf.app.run()
第2关:CRNN 上:CNN特征序列提取
本关任务:搭建一个深层卷积神经网络DCNN
import tensorflow as tf
from tensorflow.contrib import slim
# decay parameter passed to slim.batch_norm
_BATCH_DECAY = 0.999
class CRNNCTCNetwork(object):
    """CRNN feature-extraction stage: a VGG-style DCNN over input images."""

    def __init__(self, phase, hidden_num, layers_num, num_classes):
        """Store the network configuration.

        phase: 'train' or 'test' (case-insensitive)
        hidden_num: LSTM hidden units (consumed by later stages)
        layers_num: LSTM layer count (consumed by later stages)
        num_classes: size of the output alphabet
        """
        self.__phase = phase.lower()
        self.__hidden_num = hidden_num
        self.__layers_num = layers_num
        self.__num_classes = num_classes

    # Returns the tensor produced by the last conv layer of the CNN.
    def __feature_sequence_extraction(self, input_tensor):
        # batch_norm must know whether we are training or doing inference
        is_training = self.__phase == 'train'
        # **************Begin*************** #
        with slim.arg_scope([slim.conv2d],
                            weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
                            weights_regularizer=slim.l2_regularizer(0.0005)):
            features = slim.conv2d(input_tensor, 64, [3, 3], stride=1, padding='same', scope='conv1')
            features = slim.max_pool2d(features, [2, 2], stride=2, scope='pool1')
            features = slim.conv2d(features, 128, [3, 3], stride=1, padding='same', scope='conv2')
            features = slim.max_pool2d(features, [2, 2], stride=2, scope='pool2')
            # two stacked 256-channel conv layers (conv3/conv3_1, conv3/conv3_2)
            features = slim.repeat(features, 2, slim.conv2d, 256, [3, 3], stride=1, padding='same', scope='conv3')
            # pool only along the height axis to preserve horizontal resolution
            features = slim.max_pool2d(features, [2, 1], stride=[2, 1], scope='pool3')
            features = slim.conv2d(features, 512, [3, 3], stride=1, padding='same', scope='conv4')
            features = slim.batch_norm(features, decay=_BATCH_DECAY, is_training=is_training, scope='bn1')
            features = slim.conv2d(features, 512, [3, 3], stride=1, padding='same', scope='conv5')
            features = slim.batch_norm(features, decay=_BATCH_DECAY, is_training=is_training, scope='bn2')
            features = slim.max_pool2d(features, [2, 1], stride=[2, 1], scope='pool4')
            # final conv collapses the remaining height-2 dimension to 1
            features = slim.conv2d(features, 512, [2, 1], stride=[2, 1], padding='valid', scope='conv6')
        # ************** End *************** #
        return features

    def build_network(self, images):
        """Run the convolutional feature extractor on `images`."""
        return self.__feature_sequence_extraction(images)
第3关:CRNN 中:RNN特征序列识别
本关任务:搭建一个双向循环神经网络LSTM
import tensorflow as tf
from tensorflow.contrib import rnn
# batch_norm decay constant (not referenced in this stage's visible code)
_BATCH_DECAY = 0.999
class CRNNCTCNetwork(object):
    """CRNN recognition stage: stacked bidirectional LSTM over CNN feature sequences."""
    def __init__(self, phase, hidden_num, layers_num, num_classes):
        # 'train' or 'test' (lowercased; not used in this stage's visible code)
        self.__phase = phase.lower()
        # hidden units per LSTM cell
        self.__hidden_num = hidden_num
        # number of stacked bidirectional LSTM layers
        self.__layers_num = layers_num
        # size of the output alphabet (for the CTC projection)
        self.__num_classes = num_classes
        return
    def sequence_label(self, input_tensor, input_sequence_length):
        """Run the bidirectional LSTM stack and project to per-class logits.

        Returns (logits, stack_lstm_layer): logits has shape
        [batch, time, num_classes]; stack_lstm_layer is the raw LSTM output.
        """
        #********************************Begin********************************#
        with tf.variable_scope('LSTM_Layers'):
            # forward lstm cells, one per stacked layer
            fw_cell_list = [rnn.BasicLSTMCell(nh, forget_bias=1.0) for nh in [self.__hidden_num] * self.__layers_num]
            # Backward direction cells
            bw_cell_list = [rnn.BasicLSTMCell(nh, forget_bias=1.0) for nh in [self.__hidden_num] * self.__layers_num]
            stack_lstm_layer, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                fw_cell_list, bw_cell_list, input_tensor, sequence_length=input_sequence_length, dtype=tf.float32)
            # NOTE(review): hidden_num here is the INPUT tensor's channel depth,
            # reused as the projection's input width. The LSTM output width is
            # actually 2 * self.__hidden_num (fw+bw concat); the reshape below
            # is only valid when the CNN output depth equals that (e.g. 512
            # channels with hidden_num=256) — confirm against the CNN stage.
            batch_size, _, hidden_num = input_tensor.get_shape().as_list()
            # flatten (batch, time) so a single matmul projects every timestep
            rnn_reshaped = tf.reshape(stack_lstm_layer, [-1, hidden_num])
            # Doing the affine projection (no bias term)
            w = tf.Variable(tf.truncated_normal([hidden_num, self.__num_classes], stddev=0.01), name="w")
            logits = tf.matmul(rnn_reshaped, w)
            logits = tf.reshape(logits, [batch_size, -1, self.__num_classes])
        #********************************End********************************#
        return logits, stack_lstm_layer
第4关:CRNN 下:CTC实现端到端训练
本关任务:使用CTC loss完成CRNN模型
import os
import tensorflow as tf
import numpy as np
import sys
sys.path.append('..')
from model import model
# Silence TensorFlow C++ logging (3 = errors only)
os.environ["TF_CPP_MIN_LOG_LEVEL"]='3'
# Training TFRecord produced by the stage-1 converter
_TRAIN_FILE='./tfrecords/train.tfrecord'
# Reader threads for tf.train.batch
_NUM_THREADS = 4
# Training batch size
_BATCH_SIZE = 32
# Initial learning rate
_LEARNING_RATE = 0.1
# Learning-rate decay rate
_DECAY_RATE = 0.8
# Learning-rate decay period (in steps)
_DECAY_STEPS = 1000
# Number of stacked LSTM layers
_LSTM_HIDDEN_LAYERS = 2
# Hidden units per LSTM layer
_LSTM_HIDDEN_UNITS = 256
# Output class count (presumably 36 alphanumerics + CTC blank — confirm vs char map)
_NUM_CLASSES = 37
# tfrecord_path: _TRAIN_FILE
# tfrecord_path: path to the TFRecord file (e.g. _TRAIN_FILE)
def _read_tfrecord(tfrecord_path, num_epochs=None):
    """Build TF1 queue-based ops that yield one decoded example at a time.

    Returns (images, labels, sequence_length, names): images is a float32
    tensor of shape [32, width, 3], labels a SparseTensor of int32 char ids.

    Raises ValueError when the file does not exist.
    """
    if not os.path.exists(tfrecord_path):
        raise ValueError('cannot find tfrecord file in path:{:s}'.format(tfrecord_path))
    # filename queue feeding the record reader
    filename_queue = tf.train.string_input_producer([tfrecord_path], num_epochs=num_epochs)
    reader = tf.TFRecordReader()
    _, serialized_examples = reader.read(filename_queue)
    # NOTE(review): feature keys here are 'images'/'labels'/'names' (plural),
    # while the stage-1 writer in this repo uses 'image_raw'/'label'/'name' —
    # confirm which schema the tfrecords provided to this stage actually use.
    features = tf.parse_single_example(serialized_examples,
        features={
            'images': tf.FixedLenFeature([], tf.string),
            'labels':
                tf.VarLenFeature(tf.int64),
            'names':
                tf.FixedLenFeature([], tf.string),
        })
    images = tf.image.decode_jpeg(features['images'])
    # fixed height 32, variable width, 3 channels (matches the converter)
    images.set_shape([32, None, 3])
    images = tf.cast(images, tf.float32)
    labels = tf.cast(features['labels'], tf.int32)
    # the CNN downsamples width by 4x, so the CTC sequence length is width/4
    sequence_length = tf.cast(tf.shape(images)[-2]/4, tf.int32)
    names = features['names']
    return images, labels, sequence_length, names
# max_train_steps: maximum number of training steps
# each step trains on one batch of data
# returns: cl — the ctc loss value observed at the last step
def train_crnn_ctc(max_train_steps):
    # **************Begin*************** #
    images, labels, sequence_lengths, _ = _read_tfrecord(tfrecord_path=_TRAIN_FILE)
    # dynamic_pad pads variable-width images / labels within each batch
    batch_images, batch_labels, batch_sequence_lengths = tf.train.batch(tensors=[images, labels, sequence_lengths], batch_size=_BATCH_SIZE,dynamic_pad=True,capacity=100 + 3*_BATCH_SIZE, num_threads=_NUM_THREADS, allow_smaller_final_batch=True)
    # placeholders decouple the queue pipeline from the training graph
    input_images = tf.placeholder(tf.float32, shape=[_BATCH_SIZE, 32, None, 3], name='input_images')
    # sparse placeholder: CTC labels are variable-length int sequences
    input_labels = tf.sparse_placeholder(tf.int32, name='input_labels')
    input_sequence_lengths = tf.placeholder(tf.int32, shape=[_BATCH_SIZE], name='input_sequence_lengths')
    crnn_net = model.CRNNCTCNetwork(phase='train', hidden_num=_LSTM_HIDDEN_UNITS, layers_num=_LSTM_HIDDEN_LAYERS, num_classes=_NUM_CLASSES)
    with tf.variable_scope('CRNN_CTC', reuse=False):
        net_out = crnn_net.build_network(images=input_images, sequence_length=input_sequence_lengths)
    # ignore_longer_outputs_than_inputs avoids errors when a label sequence
    # is longer than the downsampled feature sequence
    ctc_loss = tf.reduce_mean(tf.nn.ctc_loss(labels=input_labels, inputs=net_out, sequence_length=input_sequence_lengths, ignore_longer_outputs_than_inputs=True))
    global_step = tf.train.create_global_step()
    # staircase exponential decay: lr *= _DECAY_RATE every _DECAY_STEPS steps
    learning_rate = tf.train.exponential_decay(_LEARNING_RATE, global_step, _DECAY_STEPS, _DECAY_RATE, staircase=True)
    # run batch_norm moving-average updates before each optimizer step
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate).minimize(loss=ctc_loss, global_step=global_step)
    init_op = tf.global_variables_initializer()
    sess_config = tf.ConfigProto()
    # grow GPU memory on demand instead of grabbing it all up front
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        sess.run(init_op)
        # start the input queue runner threads created by tf.train.batch
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        for step in range(max_train_steps):
            # pull one batch from the queue, then feed it through placeholders
            imgs, lbls, seq_lens = sess.run([batch_images, batch_labels, batch_sequence_lengths])
            _, cl, lr = sess.run([optimizer, ctc_loss, learning_rate], feed_dict={input_images:imgs, input_labels:lbls, input_sequence_lengths:seq_lens})
        coord.request_stop()
        coord.join(threads=threads)
    # **************End *************** #
    return cl