基于CRNN的文本识别

第1关:加载与预处理图片数据

本关任务:加载图片,预处理并以TFRecords格式存储


"""
Write text features and labels into tensorflow records
"""
import random
import json

import tensorflow as tf

import cv2
import numpy as np
import os
import sys


# 图片裁剪高度
_IMAGE_HEIGHT = 32
# 图片数据集路径
_IMAGE_DIR = './data/images/'
# image_list路径
_IMAGE_LIST_FILE = './data/image_list.txt'
# TFRecord文件目标路径
_DATA_DIR = './tfrecords/'
# 数据集分割比
_SPLIT_FRACTION = 0.1
# char_map文件路径
_CHAR_MAP_FILE = './char_map/char_map.json'

# 转换函数 
def _int64_feature(value):
    """Wrap an int (or a list of ints) into a tf.train.Feature of int64s."""
    values = value if isinstance(value, list) else [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

def _bytes_feature(value):
    """Wrap a bytes object (or a list of them) into a tf.train.Feature of bytes."""
    values = value if isinstance(value, list) else [value]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))

def _string_to_int(label):
    """Convert a string label to a list of integer class ids via the char map.

    Fixes two issues in the original: the char-map file handle was opened
    without ever being closed, and the JSON was re-parsed on every call.
    The parsed map is now memoized on the function object.

    Raises KeyError if `label` contains a character missing from the map.
    """
    char_map = getattr(_string_to_int, '_char_map', None)
    if char_map is None:
        with open(_CHAR_MAP_FILE, 'r') as f:
            char_map = json.load(f)
        _string_to_int._char_map = char_map
    return [char_map[c] for c in label]

# 加载、预处理图片,并写入TFRecord文件
# dataset_name: 数据集名称
# image_list: “图片名 文本”
def _write_tfrecord(dataset_name, image_list):
# **************** Begin **************** #
    """Resize each image to a fixed height, JPEG-encode it and write TFRecords.

    dataset_name: basename of the output file (`<name>.tfrecord` under _DATA_DIR)
    image_list:   lines of the form '<image filename> <label text>'
    """
    tfrecords_path = os.path.join(_DATA_DIR, dataset_name + '.tfrecord')
    with tf.python_io.TFRecordWriter(tfrecords_path) as writer:
        for line in image_list:
            # Split once: first token is the file name, second the text label.
            fields = line.strip().split()
            name = fields[0]
            label = fields[1].lower()
            image_path = os.path.join(_IMAGE_DIR, name)
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread returns None for missing/corrupt files; skip
                # instead of crashing on `.shape`.
                continue
            h, w, _ = image.shape
            # Force height to _IMAGE_HEIGHT while preserving aspect ratio.
            height = _IMAGE_HEIGHT
            width = int(w * height / h)
            image = cv2.resize(image, (width, height))
            _, image_buffer = cv2.imencode('.jpg', image)
            # tf.train.BytesList needs bytes, so encode the name on Python 3.
            name = name if sys.version_info[0] < 3 else name.encode('utf-8')

            feature = {
                'label': _int64_feature(_string_to_int(label)),
                # .tobytes() replaces the deprecated ndarray.tostring().
                'image_raw': _bytes_feature(image_buffer.tobytes()),
                'name': _bytes_feature(name),
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
# ***************** End ***************** #


# 取得训练集与测试集
# train_list: 训练集的image_list
# test_list: 测试集的image_list
def convert_dataset():
# **************** Begin **************** #
    """Split the image list into train/test sets and write one TFRecord each.

    The last _SPLIT_FRACTION of the lines becomes the test set; the rest
    is the training set.
    """
    with open(_IMAGE_LIST_FILE, 'r') as f:
        dataset = f.readlines()
    split_ind = int(len(dataset) * (1 - _SPLIT_FRACTION))
    # Off-by-one fix: slicing to `split_ind` keeps every sample. The original
    # `dataset[:split_ind - 1]` silently dropped the item at split_ind - 1,
    # which then appeared in neither split.
    train_list = dataset[:split_ind]
    test_list = dataset[split_ind:]
# ***************** End ***************** #
    _write_tfrecord('train', train_list)
    _write_tfrecord('test', test_list)
   

def main(unused_argv):
    """Entry point invoked by tf.app.run(); builds the TFRecord datasets."""
    convert_dataset()


if __name__ == '__main__':
    tf.app.run()



第2关:CRNN 上:CNN特征序列提取

本关任务:搭建一个深层卷积神经网络DCNN

import tensorflow as tf
from tensorflow.contrib import slim

# batch_norm的decay参数
_BATCH_DECAY = 0.999


class CRNNCTCNetwork(object):
    """CRNN front end: a VGG-style convolutional stack that maps an input
    image to a feature map whose width acts as the time axis for the RNN."""

    def __init__(self, phase, hidden_num, layers_num, num_classes):
        # phase: 'train' or 'test' (case-insensitive); drives batch norm.
        self.__phase = phase.lower()
        self.__hidden_num = hidden_num
        self.__layers_num = layers_num
        self.__num_classes = num_classes
        return

    # Returns the output tensor of the last convolutional layer.
    def __feature_sequence_extraction(self, input_tensor):
        # Batch norm updates its moving statistics only during training.
        in_train_phase = self.__phase == 'train'
        # **************Begin*************** #
        with slim.arg_scope([slim.conv2d],
                   weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
                   weights_regularizer=slim.l2_regularizer(0.0005)):
            features = slim.conv2d(input_tensor, 64, [3, 3], stride=1, padding='same', scope='conv1')
            features = slim.max_pool2d(features, [2, 2], stride=2, scope='pool1')
            features = slim.conv2d(features, 128, [3, 3], stride=1, padding='same', scope='conv2')
            features = slim.max_pool2d(features, [2, 2], stride=2, scope='pool2')
            # Two stacked 256-filter convolutions under the shared 'conv3' scope.
            features = slim.repeat(features, 2, slim.conv2d, 256, [3, 3], stride=1, padding='same', scope='conv3')
            # [2, 1] pooling halves the height but keeps the full width,
            # preserving horizontal resolution for the sequence axis.
            features = slim.max_pool2d(features, [2, 1], stride=[2, 1], scope='pool3')
            features = slim.conv2d(features, 512, [3, 3], stride=1, padding='same', scope='conv4')
            features = slim.batch_norm(features, decay=_BATCH_DECAY, is_training=in_train_phase, scope='bn1')
            features = slim.conv2d(features, 512, [3, 3], stride=1, padding='same', scope='conv5')
            features = slim.batch_norm(features, decay=_BATCH_DECAY, is_training=in_train_phase, scope='bn2')
            features = slim.max_pool2d(features, [2, 1], stride=[2, 1], scope='pool4')
            # Final [2, 1] valid convolution collapses the remaining height.
            features = slim.conv2d(features, 512, [2, 1], stride=[2, 1], padding='valid', scope='conv6')
        # ************** End *************** #
        return features

    def build_network(self, images):
        """Public entry point: run the CNN feature extractor on `images`."""
        return self.__feature_sequence_extraction(images)





第3关:CRNN 中:RNN特征序列识别

本关任务:搭建一个双向循环神经网络LSTM

import tensorflow as tf  
from tensorflow.contrib import rnn
_BATCH_DECAY = 0.999
class CRNNCTCNetwork(object):
    """CRNN middle stage: stacked bidirectional LSTMs over the CNN feature
    sequence, followed by a per-time-step linear projection to class logits."""

    def __init__(self, phase, hidden_num, layers_num, num_classes):
        # phase kept for API symmetry with the other stages.
        self.__phase = phase.lower()
        self.__hidden_num = hidden_num      # LSTM units per direction
        self.__layers_num = layers_num      # number of stacked BiLSTM layers
        self.__num_classes = num_classes    # output classes (incl. CTC blank)
        return

    def sequence_label(self, input_tensor, input_sequence_length):
        """Label the feature sequence.

        input_tensor: [batch, time, features] CNN output.
        input_sequence_length: per-sample valid sequence lengths.

        Returns:
            logits: [batch, time, num_classes]
            stack_lstm_layer: raw BiLSTM output, [batch, time, 2 * hidden_num]
        """
#********************************Begin********************************#
        with tf.variable_scope('LSTM_Layers'):
            # One cell per layer, for each direction.
            fw_cell_list = [rnn.BasicLSTMCell(nh, forget_bias=1.0) for nh in [self.__hidden_num] * self.__layers_num]
            bw_cell_list = [rnn.BasicLSTMCell(nh, forget_bias=1.0) for nh in [self.__hidden_num] * self.__layers_num]
            stack_lstm_layer, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                fw_cell_list, bw_cell_list, input_tensor, sequence_length=input_sequence_length, dtype=tf.float32)
            batch_size = input_tensor.get_shape().as_list()[0]
            # Fix: take the projection's input width from the BiLSTM output
            # itself (2 * hidden_num). The original read it off input_tensor's
            # last dim, which only works when the CNN feature depth happens to
            # equal 2 * hidden_num (512 == 2 * 256).
            output_units = stack_lstm_layer.get_shape().as_list()[-1]
            rnn_reshaped = tf.reshape(stack_lstm_layer, [-1, output_units])
            # Affine projection onto the class space (no bias, as in CRNN).
            w = tf.Variable(tf.truncated_normal([output_units, self.__num_classes], stddev=0.01), name="w")
            logits = tf.matmul(rnn_reshaped, w)
            logits = tf.reshape(logits, [batch_size, -1, self.__num_classes])
#********************************End********************************#
        return logits, stack_lstm_layer






第4关:CRNN 下:CTC实现端到端训练

本关任务:使用CTC loss完成CRNN模型

import os

import tensorflow as tf

import numpy as np

import sys
sys.path.append('..')

from model import model

os.environ["TF_CPP_MIN_LOG_LEVEL"]='3'

_TRAIN_FILE='./tfrecords/train.tfrecord'

_NUM_THREADS = 4

# 训练样本batch大小
_BATCH_SIZE = 32

# 学习率初值
_LEARNING_RATE = 0.1
# 学习率衰减速率
_DECAY_RATE = 0.8
# 学习率衰减周期
_DECAY_STEPS = 1000
# LSTM隐藏层数
_LSTM_HIDDEN_LAYERS = 2
# LSTM隐藏层神经元数
_LSTM_HIDDEN_UNITS = 256

_NUM_CLASSES = 37

# tfrecord_path: _TRAIN_FILE
def _read_tfrecord(tfrecord_path, num_epochs=None):
    if not os.path.exists(tfrecord_path):
        raise ValueError('cannot find tfrecord file in path:{:s}'.format(tfrecord_path))
    
    # 生成文件名队列
    filename_queue = tf.train.string_input_producer([tfrecord_path], num_epochs=num_epochs)
    reader = tf.TFRecordReader()
    _, serialized_examples = reader.read(filename_queue)
    features = tf.parse_single_example(serialized_examples, 
                                        features={
                                            'images': tf.FixedLenFeature([], tf.string),
                                            'labels':
                                            tf.VarLenFeature(tf.int64),
                                            'names':
                                            tf.FixedLenFeature([], tf.string),
                                        })
    images = tf.image.decode_jpeg(features['images'])
    images.set_shape([32, None, 3])
    images = tf.cast(images, tf.float32)

    labels = tf.cast(features['labels'], tf.int32)

    sequence_length = tf.cast(tf.shape(images)[-2]/4, tf.int32)

    names = features['names']

    return images, labels, sequence_length, names

# max_train_step为最大训练step
# 每个step训练一个batch的数据
# 返回值:cl 训练结束后的ctc-loss
def train_crnn_ctc(max_train_steps):
    """Train the CRNN+CTC model for `max_train_steps` batches.

    Each step feeds one batch through the feed_dict placeholders and runs
    one optimizer update. Returns `cl`, the CTC loss of the final step
    (None if max_train_steps == 0).
    """
    # **************Begin*************** #
    images, labels, sequence_lengths, _ = _read_tfrecord(tfrecord_path=_TRAIN_FILE)
    # dynamic_pad pads variable-width images / labels up to the batch maximum.
    batch_images, batch_labels, batch_sequence_lengths = tf.train.batch(
        tensors=[images, labels, sequence_lengths], batch_size=_BATCH_SIZE,
        dynamic_pad=True, capacity=100 + 3 * _BATCH_SIZE,
        num_threads=_NUM_THREADS, allow_smaller_final_batch=True)

    input_images = tf.placeholder(tf.float32, shape=[_BATCH_SIZE, 32, None, 3], name='input_images')
    input_labels = tf.sparse_placeholder(tf.int32, name='input_labels')
    input_sequence_lengths = tf.placeholder(tf.int32, shape=[_BATCH_SIZE], name='input_sequence_lengths')

    crnn_net = model.CRNNCTCNetwork(phase='train', hidden_num=_LSTM_HIDDEN_UNITS,
                                    layers_num=_LSTM_HIDDEN_LAYERS, num_classes=_NUM_CLASSES)
    with tf.variable_scope('CRNN_CTC', reuse=False):
        net_out = crnn_net.build_network(images=input_images, sequence_length=input_sequence_lengths)

    # NOTE(review): tf.nn.ctc_loss defaults to time_major=True; this assumes
    # model.build_network emits a time-major tensor -- confirm in model.py.
    ctc_loss = tf.reduce_mean(tf.nn.ctc_loss(labels=input_labels, inputs=net_out,
                                             sequence_length=input_sequence_lengths,
                                             ignore_longer_outputs_than_inputs=True))

    global_step = tf.train.create_global_step()
    learning_rate = tf.train.exponential_decay(_LEARNING_RATE, global_step,
                                               _DECAY_STEPS, _DECAY_RATE, staircase=True)

    # Batch-norm moving-average updates must run before each optimizer step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate).minimize(
            loss=ctc_loss, global_step=global_step)

    init_op = tf.global_variables_initializer()
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True

    cl = None  # loss of the most recent step; stays None if no step ran
    with tf.Session(config=sess_config) as sess:
        sess.run(init_op)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            for step in range(max_train_steps):
                imgs, lbls, seq_lens = sess.run([batch_images, batch_labels, batch_sequence_lengths])
                _, cl, lr = sess.run([optimizer, ctc_loss, learning_rate],
                                     feed_dict={input_images: imgs, input_labels: lbls,
                                                input_sequence_lengths: seq_lens})
        finally:
            # Fix: stop and join the queue-runner threads while the session is
            # still open; the original did this after the with-block had
            # already closed the session the threads were using.
            coord.request_stop()
            coord.join(threads=threads)
    # **************End  *************** #
    return cl


  • 4
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

粥粥粥少女的拧发条鸟

你的鼓励是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值