一、介绍
上一节我们简单介绍了CTC及数据准备过程,做好了数据准备,本节我们介绍CTC模型训练及源码解析。
CTC(Connectionist Temporal Classification),即连接时间分类。直观上理解,循环神经网络(RNN)更适合于CTC训练。关于CTC原理的介绍,大家已经写得很多了,本节我们主要从代码着手,帮助大家从零搭建CTC-ASR训练系统。既然是系统,我们就让代码的扩展性更强一些,目前支持LSTM网络结构。
二、训练源码及解析
2.1配置文件(config-lstm.yml):
param: #配置参数
num_classes: 219 #我们使用声韵母建模,音素总个数为219个
encoder_type: lstm #网络结构使用LSTM结构
input_size: 40 #输入我们使用40维的MFCC
left_context: 10 #输入左边拼帧10帧
right_context: 10 #输入右边拼帧10帧
num_units: 512 #隐层单元个数
num_layers: 4 #隐层数
lstm_impl: BasicLSTMCell #LSTM结构类型
use_peephole: True #LSTM结构是否使用PEEPHOLE
weight_init: 0.1 #初始化参数
clip_grad_norm: 5.0 #梯度裁剪的范数阈值
clip_activation: 50 #激活函数截断参数
num_proj: 256 #映射层维数
weight_decay: 0 #正则化系数
train_data_size: 3000 #训练数据量
label_type: monophone #建模单元类型
optimizer: adam #使用的优化器
learning_rate: 0.0001 #初始学习率
dropout: 0.8 #参数更新比例
bottleneck_dim: 0 #瓶颈层维数
train_data_file: ./data/th30h.tfrecords #训练数据及标签
label_file: ./data/dict.txt #音素对应的字典
beam_width: 1 #解码beam宽度
batch_size: 32 #更新一次参数batch大小
print_step: 50 #保存模型的频率,50次迭代保存一次模型
num_epoch: 6 #数据迭代轮数
2.2网络结构文件(lstm.py):
# -*- coding: utf-8 -*-
"""Unidirectional LSTM encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
#LSTM编码器,支持BasicLSTM,LSTM,BlockLSTM
#之所以叫编码器,相当于把语音特征编码到分类标签
class LSTMEncoder(object):
    """Unidirectional LSTM encoder.

    Dispatches to one of the module-level builders (``basiclstmcell``,
    ``lstmcell`` or ``lstmblockcell``) according to ``lstm_impl``.

    Args:
        num_units (int): number of units in each layer
        num_proj (int): number of units in the projection layer. Only kept
            when ``lstm_impl`` is ``'LSTMCell'``; stored as ``None`` for
            every other implementation.
        num_layers (int): number of stacked layers
        lstm_impl (string): which LSTM implementation to use
            - BasicLSTMCell: tf.contrib.rnn.BasicLSTMCell (no peephole)
            - LSTMCell: tf.contrib.rnn.LSTMCell
            - LSTMBlockCell: tf.contrib.rnn.LSTMBlockCell
        use_peephole (bool): whether to use peephole connections. Forwarded
            only to the LSTMCell and LSTMBlockCell builders.
        parameter_init (float): variables are initialised uniformly in
            ``[-parameter_init, parameter_init]``
        clip_activation (float): clipping range (> 0). Forwarded only to the
            LSTMCell builder.
        time_major (bool, optional): whether to compute in time-major order
        name (string, optional): name of the encoder
    """

    # Implementations that __call__ knows how to build.
    _SUPPORTED_IMPLS = ('BasicLSTMCell', 'LSTMCell', 'LSTMBlockCell')

    def __init__(self,
                 num_units,
                 num_proj,
                 num_layers,
                 lstm_impl,
                 use_peephole,
                 parameter_init,
                 clip_activation,
                 time_major=False,
                 name='lstm_encoder'):
        self.num_units = num_units
        # Only LSTMCell receives a projection size; drop it otherwise so the
        # attribute reflects what is actually built.
        if lstm_impl != 'LSTMCell':
            self.num_proj = None
        else:
            self.num_proj = num_proj
        self.num_layers = num_layers
        self.lstm_impl = lstm_impl
        self.use_peephole = use_peephole
        self.parameter_init = parameter_init
        self.clip_activation = clip_activation
        self.time_major = time_major
        self.name = name

    def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
        """Construct model graph.

        Args:
            inputs (placeholder): A tensor of size `[B, T, input_size]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
            keep_prob (placeholder, float): A probability to keep nodes,
                forwarded to the selected builder
            is_training (bool): accepted for interface compatibility; not
                used by this method
        Returns:
            outputs: Encoder states.
                if time_major is True, a tensor of size
                    `[T, B, num_units (num_proj)]`
                otherwise, `[B, T, num_units (num_proj)]`
            final_state: A final hidden state of the encoder
        Raises:
            IndexError: if ``lstm_impl`` is not one of the supported
                implementations.
        """
        # Reject an unknown implementation before any graph object is
        # created, and only advertise the values that are really handled.
        if self.lstm_impl not in self._SUPPORTED_IMPLS:
            raise IndexError(
                'lstm_impl is "BasicLSTMCell" or "LSTMCell" or '
                '"LSTMBlockCell" (got {!r}).'.format(self.lstm_impl))

        initializer = tf.random_uniform_initializer(
            minval=-self.parameter_init, maxval=self.parameter_init)

        if self.lstm_impl == 'BasicLSTMCell':
            outputs, final_state = basiclstmcell(
                self.num_units, self.num_layers,
                inputs, inputs_seq_len, keep_prob, initializer,
                self.time_major)
        elif self.lstm_impl == 'LSTMCell':
            outputs, final_state = lstmcell(
                self.num_units, self.num_proj, self.num_layers,
                self.use_peephole, self.clip_activation,
                inputs, inputs_seq_len, keep_prob, initializer,
                self.time_major)
        else:
            # Only 'LSTMBlockCell' can reach here after the check above.
            outputs, final_state = lstmblockcell(
                self.num_units, self.num_layers,
                self.use_peephole,
                inputs, inputs_seq_len, keep_prob, initializer,
                self.time_major)

        return outputs, final_state
#basic lstm网络结构
def basiclstmcell(num_units, num_layers, inputs, inputs_seq_len,
                  keep_prob, initializer, time_major):
    """Build a stack of ``BasicLSTMCell`` layers and unroll it over ``inputs``.

    Args:
        num_units (int): number of units in each layer
        num_layers (int): number of stacked layers
        inputs: batch-major input tensor; it is transposed with permutation
            ``[1, 0, 2]`` when ``time_major`` is true
        inputs_seq_len: per-example sequence lengths, passed to
            ``tf.nn.dynamic_rnn`` as ``sequence_length``
        keep_prob: keep probability applied to every layer's output through
            ``DropoutWrapper(output_keep_prob=...)``
        initializer: initializer installed on the ``'multi_lstm'`` variable
            scope
        time_major (bool): whether the RNN runs in time-major order
    Returns:
        The ``(outputs, final_state)`` pair produced by
        ``tf.nn.dynamic_rnn``.
    """
    # Callers supply batch-major data; swap the first two axes on request.
    rnn_inputs = tf.transpose(inputs, [1, 0, 2]) if time_major else inputs

    with tf.variable_scope('multi_lstm', initializer=initializer) as scope:
        # One dropout-wrapped cell per layer, created in layer order.
        cells = [
            tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(
                    num_units,
                    forget_bias=1.0,
                    state_is_tuple=True,
                    activation=tf.tanh),
                output_keep_prob=keep_prob)
            for _ in range(num_layers)
        ]

        multi_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)

        # Both the per-step outputs and the last state are handed back.
        encoded, last_state = tf.nn.dynamic_rnn(
            cell=multi_cell,
            inputs=rnn_inputs,
            sequence_length=inputs_seq_len,
            dtype=tf.float32,
            time_major=time_major,
            scope=scope)

    return encoded, last_state
#标准lstm网络结构
def lstmcell(num_units, num_proj, num_layers, use_peephole, clip_activation,
             inputs, inputs_seq_len, keep_prob, initializer, time_major):
    """Build a stack of ``LSTMCell`` layers and unroll it over ``inputs``.

    Args:
        num_units (int): number of units in each layer
        num_proj (int or None): projection size, passed to ``LSTMCell`` as
            ``num_proj``
        num_layers (int): number of stacked layers
        use_peephole (bool): passed to ``LSTMCell`` as ``use_peepholes``
        clip_activation (float): passed to ``LSTMCell`` as ``cell_clip``
        inputs: batch-major input tensor; it is transposed with permutation
            ``[1, 0, 2]`` when ``time_major`` is true
        inputs_seq_len: per-example sequence lengths, passed to
            ``tf.nn.dynamic_rnn`` as ``sequence_length``
        keep_prob: keep probability applied to every layer's output through
            ``DropoutWrapper(output_keep_prob=...)``
        initializer: initializer installed on the ``'multi_lstm'`` variable
            scope
        time_major (bool): whether the RNN runs in time-major order
    Returns:
        The ``(outputs, final_state)`` pair produced by
        ``tf.nn.dynamic_rnn``.
    """
    # Callers supply batch-major data; swap the first two axes on request.
    rnn_inputs = tf.transpose(inputs, [1, 0, 2]) if time_major else inputs

    with tf.variable_scope('multi_lstm', initializer=initializer) as scope:
        # One dropout-wrapped cell per layer, created in layer order.
        cells = [
            tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.LSTMCell(
                    num_units,
                    use_peepholes=use_peephole,
                    cell_clip=clip_activation,
                    num_proj=num_proj,
                    forget_bias=1.0,
                    state_is_tuple=True),
                output_keep_prob=keep_prob)
            for _ in range(num_layers)
        ]

        multi_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)

        # Both the per-step outputs and the last state are handed back.
        encoded, last_state = tf.nn.dynamic_rnn(
            cell=multi_cell,
            inputs=rnn_inputs,
            sequence_length=inputs_seq_len,
            dtype=tf.float32,
            time_major=time_major,
            scope=scope)

    return encoded, last_state
#block lstm网络结构
def lstmblockcell(num_units, num_layers, use_peephole, inputs,
inputs_seq_len, keep_prob, initializer, time_major):
if time_major:
inputs = tf.transpose(inputs, [1, 0, 2])
lstm_list = []
with tf.variable_scope('multi_lstm', initializer=initializer) as scope:
for i_layer in range(1, num_layers + 1, 1):
lstm = tf.contrib.rnn.LSTMBlockCell(
num_units, forget_bias=1.0,
use_peephole=use_peephole)
lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=ke