基于CRNN的文本识别

第1关:加载与预处理图片数据

本关任务:加载图片,预处理并以TFRecords格式存储


"""
Write text features and labels into tensorflow records
"""
import random
import json

import tensorflow as tf

import cv2
import numpy as np
import os
import sys


# 图片裁剪高度
_IMAGE_HEIGHT = 32
# 图片数据集路径
_IMAGE_DIR = './data/images/'
# image_list路径
_IMAGE_LIST_FILE = './data/image_list.txt'
# TFRecord文件目标路径
_DATA_DIR = './tfrecords/'
# 数据集分割比
_SPLIT_FRACTION = 0.1
# char_map文件路径
_CHAR_MAP_FILE = './char_map/char_map.json'

# 转换函数 
def _int64_feature(value):
    """Wrap an int (or a list of ints) into a tf.train.Feature of int64s."""
    values = value if isinstance(value, list) else [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

def _bytes_feature(value):
    """Wrap a bytes object (or a list of them) into a tf.train.Feature of bytes."""
    values = value if isinstance(value, list) else [value]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))

def _string_to_int(label):
    """Convert a string label to a list of integer class ids via the char map.

    Fixes two issues in the original: the char-map file handle was opened
    without ever being closed, and the JSON was re-parsed on every call.
    The parsed map is now memoized on the function object.

    Raises KeyError if `label` contains a character missing from the map.
    """
    char_map = getattr(_string_to_int, '_char_map', None)
    if char_map is None:
        with open(_CHAR_MAP_FILE, 'r') as f:
            char_map = json.load(f)
        _string_to_int._char_map = char_map
    return [char_map[c] for c in label]

# 加载、预处理图片,并写入TFRecord文件
# dataset_name: 数据集名称
# image_list: “图片名 文本”
def _write_tfrecord(dataset_name, image_list):
# **************** Begin **************** #
    """Resize each image to a fixed height, JPEG-encode it and write TFRecords.

    dataset_name: basename of the output file (`<name>.tfrecord` under _DATA_DIR)
    image_list:   lines of the form '<image filename> <label text>'
    """
    tfrecords_path = os.path.join(_DATA_DIR, dataset_name + '.tfrecord')
    with tf.python_io.TFRecordWriter(tfrecords_path) as writer:
        for line in image_list:
            # Split once: first token is the file name, second the text label.
            fields = line.strip().split()
            name = fields[0]
            label = fields[1].lower()
            image_path = os.path.join(_IMAGE_DIR, name)
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread returns None for missing/corrupt files; skip
                # instead of crashing on `.shape`.
                continue
            h, w, _ = image.shape
            # Force height to _IMAGE_HEIGHT while preserving aspect ratio.
            height = _IMAGE_HEIGHT
            width = int(w * height / h)
            image = cv2.resize(image, (width, height))
            _, image_buffer = cv2.imencode('.jpg', image)
            # tf.train.BytesList needs bytes, so encode the name on Python 3.
            name = name if sys.version_info[0] < 3 else name.encode('utf-8')

            feature = {
                'label': _int64_feature(_string_to_int(label)),
                # .tobytes() replaces the deprecated ndarray.tostring().
                'image_raw': _bytes_feature(image_buffer.tobytes()),
                'name': _bytes_feature(name),
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
# ***************** End ***************** #


# 取得训练集与测试集
# train_list: 训练集的image_list
# test_list: 测试集的image_list
def convert_dataset():
# **************** Begin **************** #
    """Split the image list into train/test sets and write one TFRecord each.

    The last _SPLIT_FRACTION of the lines becomes the test set; the rest
    is the training set.
    """
    with open(_IMAGE_LIST_FILE, 'r') as f:
        dataset = f.readlines()
    split_ind = int(len(dataset) * (1 - _SPLIT_FRACTION))
    # Off-by-one fix: slicing to `split_ind` keeps every sample. The original
    # `dataset[:split_ind - 1]` silently dropped the item at split_ind - 1,
    # which then appeared in neither split.
    train_list = dataset[:split_ind]
    test_list = dataset[split_ind:]
# ***************** End ***************** #
    _write_tfrecord('train', train_list)
    _write_tfrecord('test', test_list)
   

def main(unused_argv):
    """Entry point invoked by tf.app.run(); builds the TFRecord datasets."""
    convert_dataset()


if __name__ == '__main__':
    tf.app.run()



第2关:CRNN 上:CNN特征序列提取

本关任务:搭建一个深层卷积神经网络DCNN

import tensorflow as tf
from tensorflow.contrib import slim

# batch_norm的decay参数
_BATCH_DECAY = 0.999


class CRNNCTCNetwork(object):
    """CRNN front end: a VGG-style convolutional stack that maps an input
    image to a feature map whose width acts as the time axis for the RNN."""

    def __init__(self, phase, hidden_num, layers_num, num_classes):
        # phase: 'train' or 'test' (case-insensitive); drives batch norm.
        self.__phase = phase.lower()
        self.__hidden_num = hidden_num
        self.__layers_num = layers_num
        self.__num_classes = num_classes
        return

    # Returns the output tensor of the last convolutional layer.
    def __feature_sequence_extraction(self, input_tensor):
        # Batch norm updates its moving statistics only during training.
        in_train_phase = self.__phase == 'train'
        # **************Begin*************** #
        with slim.arg_scope([slim.conv2d],
                   weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
                   weights_regularizer=slim.l2_regularizer(0.0005)):
            features = slim.conv2d(input_tensor, 64, [3, 3], stride=1, padding='same', scope='conv1')
            features = slim.max_pool2d(features, [2, 2], stride=2, scope='pool1')
            features = slim.conv2d(features, 128, [3, 3], stride=1, padding='same', scope='conv2')
            features = slim.max_pool2d(features, [2, 2], stride=2, scope='pool2')
            # Two stacked 256-filter convolutions under the shared 'conv3' scope.
            features = slim.repeat(features, 2, slim.conv2d, 256, [3, 3], stride=1, padding='same', scope='conv3')
            # [2, 1] pooling halves the height but keeps the full width,
            # preserving horizontal resolution for the sequence axis.
            features = slim.max_pool2d(features, [2, 1], stride=[2, 1], scope='pool3')
            features = slim.conv2d(features, 512, [3, 3], stride=1, padding='same', scope='conv4')
            features = slim.batch_norm(features, decay=_BATCH_DECAY, is_training=in_train_phase, scope='bn1')
            features = slim.conv2d(features, 512, [3, 3], stride=1, padding='same', scope='conv5')
            features = slim.batch_norm(features, decay=_BATCH_DECAY, is_training=in_train_phase, scope='bn2')
            features = slim.max_pool2d(features, [2, 1], stride=[2, 1], scope='pool4')
            # Final [2, 1] valid convolution collapses the remaining height.
            features = slim.conv2d(features, 512, [2, 1], stride=[2, 1], padding='valid', scope='conv6')
        # ************** End *************** #
        return features

    def build_network(self, images):
        """Public entry point: run the CNN feature extractor on `images`."""
        return self.__feature_sequence_extraction(images)





第3关:CRNN 中:RNN特征序列识别

本关任务:搭建一个双向循环神经网络LSTM

import tensorflow as tf  
from tensorflow.contrib import rnn
_BATCH_DECAY = 0.999
class CRNNCTCNetwork(object):
    """CRNN middle stage: stacked bidirectional LSTMs over the CNN feature
    sequence, followed by a per-time-step linear projection to class logits."""

    def __init__(self, phase, hidden_num, layers_num, num_classes):
        # phase kept for API symmetry with the other stages.
        self.__phase = phase.lower()
        self.__hidden_num = hidden_num      # LSTM units per direction
        self.__layers_num = layers_num      # number of stacked BiLSTM layers
        self.__num_classes = num_classes    # output classes (incl. CTC blank)
        return

    def sequence_label(self, input_tensor, input_sequence_length):
        """Label the feature sequence.

        input_tensor: [batch, time, features] CNN output.
        input_sequence_length: per-sample valid sequence lengths.

        Returns:
            logits: [batch, time, num_classes]
            stack_lstm_layer: raw BiLSTM output, [batch, time, 2 * hidden_num]
        """
#********************************Begin********************************#
        with tf.variable_scope('LSTM_Layers'):
            # One cell per layer, for each direction.
            fw_cell_list = [rnn.BasicLSTMCell(nh, forget_bias=1.0) for nh in [self.__hidden_num] * self.__layers_num]
            bw_cell_list = [rnn.BasicLSTMCell(nh, forget_bias=1.0) for nh in [self.__hidden_num] * self.__layers_num]
            stack_lstm_layer, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                fw_cell_list, bw_cell_list, input_tensor, sequence_length=input_sequence_length, dtype=tf.float32)
            batch_size = input_tensor.get_shape().as_list()[0]
            # Fix: take the projection's input width from the BiLSTM output
            # itself (2 * hidden_num). The original read it off input_tensor's
            # last dim, which only works when the CNN feature depth happens to
            # equal 2 * hidden_num (512 == 2 * 256).
            output_units = stack_lstm_layer.get_shape().as_list()[-1]
            rnn_reshaped = tf.reshape(stack_lstm_layer, [-1, output_units])
            # Affine projection onto the class space (no bias, as in CRNN).
            w = tf.Variable(tf.truncated_normal([output_units, self.__num_classes], stddev=0.01), name="w")
            logits = tf.matmul(rnn_reshaped, w)
            logits = tf.reshape(logits, [batch_size, -1, self.__num_classes])
#********************************End********************************#
        return logits, stack_lstm_layer






第4关:CRNN 下:CTC实现端到端训练

本关任务:使用CTC loss完成CRNN模型

import os

import tensorflow as tf

import numpy as np

import sys
sys.path.append('..')

from model import model

os.environ["TF_CPP_MIN_LOG_LEVEL"]='3'

_TRAIN_FILE='./tfrecords/train.tfrecord'

_NUM_THREADS = 4

# 训练样本batch大小
_BATCH_SIZE = 32

# 学习率初值
_LEARNING_RATE = 0.1
# 学习率衰减速率
_DECAY_RATE = 0.8
# 学习率衰减周期
_DECAY_STEPS = 1000
# LSTM隐藏层数
_LSTM_HIDDEN_LAYERS = 2
# LSTM隐藏层神经元数
_LSTM_HIDDEN_UNITS = 256

_NUM_CLASSES = 37

# tfrecord_path: _TRAIN_FILE
def _read_tfrecord(tfrecord_path, num_epochs=None):
    if not os.path.exists(tfrecord_path):
        raise ValueError('cannot find tfrecord file in path:{:s}'.format(tfrecord_path))
    
    # 生成文件名队列
    filename_queue = tf.train.string_input_producer([tfrecord_path], num_epochs=num_epochs)
    reader = tf.TFRecordReader()
    _, serialized_examples = reader.read(filename_queue)
    features = tf.parse_single_example(serialized_examples, 
                                        features={
                                            'images': tf.FixedLenFeature([], tf.string),
                                            'labels':
                                            tf.VarLenFeature(tf.int64),
                                            'names':
                                            tf.FixedLenFeature([], tf.string),
                                        })
    images = tf.image.decode_jpeg(features['images'])
    images.set_shape([32, None, 3])
    images = tf.cast(images, tf.float32)

    labels = tf.cast(features['labels'], tf.int32)

    sequence_length = tf.cast(tf.shape(images)[-2]/4, tf.int32)

    names = features['names']

    return images, labels, sequence_length, names

# max_train_step为最大训练step
# 每个step训练一个batch的数据
# 返回值:cl 训练结束后的ctc-loss
def train_crnn_ctc(max_train_steps):
    """Train the CRNN+CTC model for `max_train_steps` batches.

    Each step feeds one batch through the feed_dict placeholders and runs
    one optimizer update. Returns `cl`, the CTC loss of the final step
    (None if max_train_steps == 0).
    """
    # **************Begin*************** #
    images, labels, sequence_lengths, _ = _read_tfrecord(tfrecord_path=_TRAIN_FILE)
    # dynamic_pad pads variable-width images / labels up to the batch maximum.
    batch_images, batch_labels, batch_sequence_lengths = tf.train.batch(
        tensors=[images, labels, sequence_lengths], batch_size=_BATCH_SIZE,
        dynamic_pad=True, capacity=100 + 3 * _BATCH_SIZE,
        num_threads=_NUM_THREADS, allow_smaller_final_batch=True)

    input_images = tf.placeholder(tf.float32, shape=[_BATCH_SIZE, 32, None, 3], name='input_images')
    input_labels = tf.sparse_placeholder(tf.int32, name='input_labels')
    input_sequence_lengths = tf.placeholder(tf.int32, shape=[_BATCH_SIZE], name='input_sequence_lengths')

    crnn_net = model.CRNNCTCNetwork(phase='train', hidden_num=_LSTM_HIDDEN_UNITS,
                                    layers_num=_LSTM_HIDDEN_LAYERS, num_classes=_NUM_CLASSES)
    with tf.variable_scope('CRNN_CTC', reuse=False):
        net_out = crnn_net.build_network(images=input_images, sequence_length=input_sequence_lengths)

    # NOTE(review): tf.nn.ctc_loss defaults to time_major=True; this assumes
    # model.build_network emits a time-major tensor -- confirm in model.py.
    ctc_loss = tf.reduce_mean(tf.nn.ctc_loss(labels=input_labels, inputs=net_out,
                                             sequence_length=input_sequence_lengths,
                                             ignore_longer_outputs_than_inputs=True))

    global_step = tf.train.create_global_step()
    learning_rate = tf.train.exponential_decay(_LEARNING_RATE, global_step,
                                               _DECAY_STEPS, _DECAY_RATE, staircase=True)

    # Batch-norm moving-average updates must run before each optimizer step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate).minimize(
            loss=ctc_loss, global_step=global_step)

    init_op = tf.global_variables_initializer()
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True

    cl = None  # loss of the most recent step; stays None if no step ran
    with tf.Session(config=sess_config) as sess:
        sess.run(init_op)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            for step in range(max_train_steps):
                imgs, lbls, seq_lens = sess.run([batch_images, batch_labels, batch_sequence_lengths])
                _, cl, lr = sess.run([optimizer, ctc_loss, learning_rate],
                                     feed_dict={input_images: imgs, input_labels: lbls,
                                                input_sequence_lengths: seq_lens})
        finally:
            # Fix: stop and join the queue-runner threads while the session is
            # still open; the original did this after the with-block had
            # already closed the session the threads were using.
            coord.request_stop()
            coord.join(threads=threads)
    # **************End  *************** #
    return cl


  • 4
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

粥粥粥少女的拧发条鸟

你的鼓励是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值