# 基于Kaldi的DNN-HMM语音识别系统：run.sh 包含从前期数据准备到最后解码的整个过程，该脚本是语音识别各个步骤的封装。
#!/bin/bash
# Must be bash, not dash: this script relies on 'set -o pipefail' and on
# brace expansion (e.g. data/{train,test} below), neither of which is
# guaranteed by dash.
# Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang)
# 2018 Tsinghua University (Author: Zhiyuan Tang)
# Apache 2.0.

. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
## Set the last lines of cmd.sh to queue.pl or run.pl depending on your cluster setup.
. ./path.sh

n=8 # number of parallel jobs

# Stop on any command error, on use of an unset variable, and when any
# stage of a pipeline fails.
set -euo pipefail
###### Bookmark: basic preparation ######
# corpus and trans directory
# Absolute path to the THCHS-30 corpus (audio, transcriptions, resources).
# NOTE(review): adjust this path to your own installation.
thchs=/home/yy/kaldi-trunk/egs/cslt_cases/asr_baseline/data/data_thchs30
# download the corpus (optional):
# you can obtain the database by uncommenting the following lines
# [ -d $thchs ] || mkdir -p $thchs
# echo "downloading THCHS30 at $thchs ..."
# local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 data_thchs30
# local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 resource
# local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 test-noise
# generate text, wav.scp, utt2spk, spk2utt in data/{train,test}
local/thchs-30_data_prep.sh $thchs/data_thchs30
# data preparation done
###### Bookmark: language preparation ######
# The four files Kaldi requires for each data set:
#   wav.scp - utterance ID and the path of its audio file
#   text    - utterance ID and its transcription
#   utt2spk - utterance ID and its speaker ID
#   spk2utt - speaker ID and all utterance IDs of that speaker
# prepare lexicon.txt, extra_questions.txt, nonsilence_phones.txt, optional_silence.txt, silence_phones.txt
# build a large lexicon that involves words in both the training and decoding, all in data/dict
# lexicon.txt           - pronunciation dictionary: each word and its phone sequence
# lexiconp.txt          - pronunciation dictionary with pronunciation probabilities
# silence_phones.txt    - silence-class phones: silence, noise, laughter, etc.
# optional_silence.txt  - optional silence phone, normally SIL from silence_phones.txt
# nonsilence_phones.txt - real phones of the language; phones on one line are variants
#                         of the same phone and share a decision-tree root
# extra_questions.txt   - may be empty; used together with the questions generated
#                         automatically during GMM training for decision-tree building
mkdir -p data/dict;
cp $thchs/resource/dict/{extra_questions.txt,nonsilence_phones.txt,optional_silence.txt,silence_phones.txt} data/dict && \
cat $thchs/resource/dict/lexicon.txt $thchs/data_thchs30/lm_word/lexicon.txt | \
grep -v '<s>' | grep -v '</s>' | sort -u > data/dict/lexicon.txt
# language-model processing
###### Bookmark: language processing ######
# generate language stuff used for training
# also lexicon to L_disambig.fst for graph making in local/thchs-30_decode.sh
mkdir -p data/lang;
utils/prepare_lang.sh --position_dependent_phones false data/dict "<SPOKEN_NOISE>" data/local/lang data/lang
# Outputs in data/lang:
#   phones.txt - maps every phone to an integer phone ID
#   words.txt  - maps every word to an integer word ID
#   oov.txt    - out-of-vocabulary words and their IDs
#   topo       - HMM topology per phone: number of states and transition probabilities
#   L.fst / L_disambig.fst - the lexicon as an FST (input: phones, output: words);
#                            the latter includes disambiguation symbols
# format trained or provided language model to G.fst
# prepare things for graph making in local/thchs-30_decode.sh, not necessary for training
(
mkdir -p data/graph;
gzip -c $thchs/data_thchs30/lm_word/word.3gram.lm > data/graph/word.3gram.lm.gz
utils/format_lm.sh data/lang data/graph/word.3gram.lm.gz $thchs/data_thchs30/lm_word/lexicon.txt data/graph/lang
)
# feature extraction
###### Bookmark: feature extraction ######
# Extract MFCC and Fbank features and compute per-speaker CMVN
# (cepstral mean/variance normalization) statistics for both.
# MFCC dimensions are weakly correlated, which suits GMM training;
# Fbank features work better for DNN training.
# produce MFCC and Fbank features in data/{mfcc,fbank}/{train,test}
rm -rf data/mfcc && mkdir -p data/mfcc && cp -r data/{train,test} data/mfcc
rm -rf data/fbank && mkdir -p data/fbank && cp -r data/{train,test} data/fbank
for x in train test; do
# make mfcc and fbank
steps/make_mfcc.sh --nj $n --cmd "$train_cmd" data/mfcc/$x
steps/make_fbank.sh --nj $n --cmd "$train_cmd" data/fbank/$x
# compute cmvn
steps/compute_cmvn_stats.sh data/mfcc/$x
steps/compute_cmvn_stats.sh data/fbank/$x
done
# GMM-HMM training and decoding
###### Bookmark: GMM-HMM training & decoding ######
# Standard Kaldi pipeline: each stage aligns the training data with the
# previous model, then trains a stronger model on those alignments.
# Each trained model is decoded in the background ('&') while the next
# training stage proceeds.
# monophone
steps/train_mono.sh --boost-silence 1.25 --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/mono
# test monophone model
local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/mono data/mfcc &
# monophone ali
steps/align_si.sh --boost-silence 1.25 --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/mono exp/mono_ali
# triphone (delta features)
steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 data/mfcc/train data/lang exp/mono_ali exp/tri1
# test tri1 model
local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/tri1 data/mfcc &
# triphone_ali
steps/align_si.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri1 exp/tri1_ali
# lda_mllt (LDA + MLLT feature transforms over spliced frames)
steps/train_lda_mllt.sh --cmd "$train_cmd" --splice-opts "--left-context=3 --right-context=3" 2500 15000 data/mfcc/train data/lang exp/tri1_ali exp/tri2b
# test tri2b model
local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/tri2b data/mfcc &
# lda_mllt_ali
steps/align_si.sh --nj $n --cmd "$train_cmd" --use-graphs true data/mfcc/train data/lang exp/tri2b exp/tri2b_ali
# sat (speaker-adaptive training; decoding uses fMLLR)
steps/train_sat.sh --cmd "$train_cmd" 2500 15000 data/mfcc/train data/lang exp/tri2b_ali exp/tri3b
# test tri3b model
local/thchs-30_decode.sh --nj $n "steps/decode_fmllr.sh" exp/tri3b data/mfcc &
# sat_ali
steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri3b exp/tri3b_ali
# quick (larger model trained quickly from the SAT alignments)
steps/train_quick.sh --cmd "$train_cmd" 4200 40000 data/mfcc/train data/lang exp/tri3b_ali exp/tri4b
# test tri4b model
local/thchs-30_decode.sh --nj $n "steps/decode_fmllr.sh" exp/tri4b data/mfcc &
# quick_ali - final GMM alignments, used as DNN training targets below
steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri4b exp/tri4b_ali
# DNN training and decoding
###### Bookmark: DNN training & decoding ######
# train tdnn model on Fbank features, supervised by the tri4b alignments
tdnn_dir=exp/nnet3/tdnn
local/nnet3/run_tdnn.sh data/fbank/train exp/tri4b_ali $tdnn_dir
# decoding
graph_dir=exp/tri4b/graph_word # the same as gmm
steps/nnet3/decode.sh --nj $n --cmd "$decode_cmd" $graph_dir data/fbank/test $tdnn_dir/decode_test_word
# discriminative training and decoding
###### Bookmark: discriminative training & decoding ######
# sequence-discriminative training on top of the tdnn model
criterion=mmi # mmi, mpfe or smbr
local/nnet3/run_tdnn_discriminative.sh --criterion $criterion $tdnn_dir data/fbank/train
# decoding
steps/nnet3/decode.sh --nj $n --cmd "$decode_cmd" $graph_dir data/fbank/test ${tdnn_dir}_$criterion/decode_test_word

# Several GMM decoding runs were launched earlier in the background with '&';
# wait for all of them to finish so the script does not exit while decoding
# is still in progress and their results are complete.
wait

exit 0