Analysis of the THCHS30 recipe scripts in Kaldi

**run.sh**
#!/usr/bin/env bash

. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
           ## This relates to the queue.
. ./path.sh  

H=`pwd`  #exp home; this path is referenced later
n=8      #number of parallel jobs (CPU cores)

#corpus and trans directory
#thchs=/nfs/public/materials/data/thchs30-openslr

thchs=/home/huangsk/kaldi/egs/thchs30/s5/thchs30-openslr  #path to the training data

#you can obtain the database by uncommenting the following lines
#[ -d $thchs ] || mkdir -p $thchs  || exit 1
#echo "downloading THCHS30 at $thchs ..."
#local/download_and_untar.sh $thchs  http://www.openslr.org/resources/18 data_thchs30  || exit 1
#local/download_and_untar.sh $thchs  http://www.openslr.org/resources/18 resource      || exit 1
#local/download_and_untar.sh $thchs  http://www.openslr.org/resources/18 test-noise    || exit 1

#data preparation
#generate text, wav.scp, utt2spk, spk2utt
local/thchs-30_data_prep.sh $H $thchs/data_thchs30 || exit 1;

#produce MFCC features
rm -rf data/mfcc && mkdir -p data/mfcc &&  cp -R data/{train,dev,test,test_phone} data/mfcc || exit 1; 
for x in train dev test; do	
   #make  mfcc
   steps/make_mfcc.sh --nj $n --cmd "$train_cmd" data/mfcc/$x exp/make_mfcc/$x mfcc/$x || exit 1;  #calls steps/make_mfcc.sh: $n is the number of parallel jobs, --cmd "$train_cmd" uses the train_cmd defined in cmd.sh, data/mfcc/$x is the data directory for each set, and exp/make_mfcc/$x and mfcc/$x are the log and output directories
   #compute CMVN statistics
   steps/compute_cmvn_stats.sh data/mfcc/$x exp/mfcc_cmvn/$x mfcc/$x || exit 1;
done
#copy feats and cmvn to test_phone, to avoid recomputing duplicated MFCC & CMVN
cp data/mfcc/test/feats.scp data/mfcc/test_phone && cp data/mfcc/test/cmvn.scp data/mfcc/test_phone || exit 1;
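A quick sanity check on the generated features (a minimal sketch, assuming path.sh has already put the Kaldi binaries on your PATH):

# print the first utterance's MFCC matrix in text form
copy-feats scp:data/mfcc/train/feats.scp ark,t:- | head
# print the feature dimension (typically 13 for plain MFCC with this configuration)
feat-to-dim scp:data/mfcc/train/feats.scp -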


#prepare language resources
#build a large lexicon that involves the words used in both training and decoding.
( 
  echo "make word graph ..."
   #concatenate the lexicon.txt files from the two directories into data/dict/lexicon.txt,
   #filtering out lines containing <s> or </s> and removing duplicate entries
  cd $H; mkdir -p data/{dict,lang,graph} && \
  cp $thchs/resource/dict/{extra_questions.txt,nonsilence_phones.txt,optional_silence.txt,silence_phones.txt} data/dict && \
  cat $thchs/resource/dict/lexicon.txt $thchs/data_thchs30/lm_word/lexicon.txt | \
    grep -v '<s>' | grep -v '</s>' | sort -u > data/dict/lexicon.txt || exit 1; 
   #grep -v inverts the match, i.e. excludes matching lines
  # e.g. cat test.log | grep "login" | grep -v "deviceType" finds the lines of test.log
  #that contain "login" but do not contain the field "deviceType".
  #call utils/prepare_lang.sh to build the lang directory; --position_dependent_phones false disables
  #position-dependent phones; the arguments are data/dict, the OOV symbol "<SPOKEN_NOISE>", data/local/lang and data/lang
  utils/prepare_lang.sh --position_dependent_phones false data/dict "<SPOKEN_NOISE>" data/local/lang data/lang || exit 1;
   #compress word.3gram.lm to word.3gram.lm.gz while keeping the source file (gzip -c writes to stdout)
  gzip -c $thchs/data_thchs30/lm_word/word.3gram.lm > data/graph/word.3gram.lm.gz || exit 1;
  utils/format_lm.sh data/lang data/graph/word.3gram.lm.gz $thchs/data_thchs30/lm_word/lexicon.txt data/graph/lang || exit 1; #format the language model
)
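To make the filtering concrete, here is a minimal sketch of the same grep/sort pipeline on made-up lexicon lines (real lexicon.txt entries have the form "word phone1 phone2 ..."):

printf '<s>\n</s>\nSIL sil\nSIL sil\n绿 l v4\n' | grep -v '<s>' | grep -v '</s>' | sort -u
# the <s> and </s> entries are dropped and the duplicated "SIL sil" collapses to a single line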

#make_phone_graph
(
  echo "make phone graph ..."
  cd $H; mkdir -p data/{dict_phone,graph_phone,lang_phone} && \
  cp $thchs/resource/dict/{extra_questions.txt,nonsilence_phones.txt,optional_silence.txt,silence_phones.txt} data/dict_phone  && \
  cat $thchs/data_thchs30/lm_phone/lexicon.txt | grep -v '<eps>' | sort -u > data/dict_phone/lexicon.txt  && \
  echo "<SPOKEN_NOISE> sil " >> data/dict_phone/lexicon.txt  || exit 1;
  utils/prepare_lang.sh --position_dependent_phones false data/dict_phone "<SPOKEN_NOISE>" data/local/lang_phone data/lang_phone || exit 1;
  gzip -c $thchs/data_thchs30/lm_phone/phone.3gram.lm > data/graph_phone/phone.3gram.lm.gz  || exit 1;
  utils/format_lm.sh data/lang_phone data/graph_phone/phone.3gram.lm.gz $thchs/data_thchs30/lm_phone/lexicon.txt \
    data/graph_phone/lang  || exit 1;
)

#monophone: train_mono.sh trains the monophone HMMs; it runs 40 iterations in total, realigning once every two iterations
steps/train_mono.sh --boost-silence 1.25 --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/mono || exit 1;
#test monophone model (the trailing & below runs decoding in the background, in parallel with the next training steps)
local/thchs-30_decode.sh --mono true --nj $n "steps/decode.sh" exp/mono data/mfcc &

#monophone_ali
steps/align_si.sh --boost-silence 1.25 --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/mono exp/mono_ali || exit 1;

#triphone: train context-dependent triphone models
steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 data/mfcc/train data/lang exp/mono_ali exp/tri1 || exit 1;
#test tri1 model
local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/tri1 data/mfcc &

#triphone_ali
steps/align_si.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri1 exp/tri1_ali || exit 1;

#lda_mllt: train with LDA (linear discriminant analysis) + MLLT (maximum likelihood linear transform)
steps/train_lda_mllt.sh --cmd "$train_cmd" --splice-opts "--left-context=3 --right-context=3" 2500 15000 data/mfcc/train data/lang exp/tri1_ali exp/tri2b || exit 1;
#test tri2b model
local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/tri2b data/mfcc &


#lda_mllt_ali
steps/align_si.sh  --nj $n --cmd "$train_cmd" --use-graphs true data/mfcc/train data/lang exp/tri2b exp/tri2b_ali || exit 1;

#sat: train_sat.sh does speaker adaptive training, based on fMLLR (feature-space maximum likelihood linear regression)
steps/train_sat.sh --cmd "$train_cmd" 2500 15000 data/mfcc/train data/lang exp/tri2b_ali exp/tri3b || exit 1;
#test tri3b model
local/thchs-30_decode.sh --nj $n "steps/decode_fmllr.sh" exp/tri3b data/mfcc &

#sat_ali
steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri3b exp/tri3b_ali || exit 1;

#quick: train_quick.sh trains a model on top of existing features; for each state of the current model
#(after tree building) it picks the closest state in the old model, judged by the overlap of the counts in the tree statistics.
steps/train_quick.sh --cmd "$train_cmd" 4200 40000 data/mfcc/train data/lang exp/tri3b_ali exp/tri4b || exit 1;
#test tri4b model
local/thchs-30_decode.sh --nj $n "steps/decode_fmllr.sh" exp/tri4b data/mfcc &

#quick_ali
steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri4b exp/tri4b_ali || exit 1;

#quick_ali_cv
steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/dev data/lang exp/tri4b exp/tri4b_ali_cv || exit 1;

#train dnn model: run_dnn.sh trains the DNN, including cross-entropy (xent) and MPE training
local/nnet/run_dnn.sh --stage 0 --nj $n  exp/tri4b exp/tri4b_ali exp/tri4b_ali_cv || exit 1;

#train dae model
#python2.6 or above is required for noisy data generation.
#To speed up the process, pyximport for python is recommended.
local/dae/run_dae.sh $thchs || exit 1;




**path.sh**
export KALDI_ROOT=`pwd`/../../..  #root directory of the Kaldi source tree
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh  #source the environment script if it exists
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH  #prepend this recipe's utils directory,
#the OpenFst binaries under tools/openfst/bin, and the current directory to PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1  #if tools/config/common_path.sh is missing under the Kaldi root, print an error and exit
. $KALDI_ROOT/tools/config/common_path.sh  # otherwise source it (it puts the Kaldi binary directories on PATH)
export LC_ALL=C   #drop all locale-specific settings so that sorting and string comparison behave consistently
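A quick demonstration of why this matters: Kaldi's table code expects files sorted in C-locale (byte) order, and sort behaves differently under other locales (a minimal sketch; the exact non-C ordering depends on your system's collation rules):

printf 'b\nA\na\nB\n' | sort             # locale-aware order, e.g. a A b B under en_US.UTF-8
printf 'b\nA\na\nB\n' | LC_ALL=C sort    # strict byte order: A B a b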

**cmd.sh**
#export train_cmd=queue.pl
#export decode_cmd="queue.pl --mem 4G"
#export mkgraph_cmd="queue.pl --mem 8G"
#export cuda_cmd="queue.pl --gpu 1"

export train_cmd=run.pl   #the original queue.pl is replaced with run.pl, which runs jobs on the local machine
export decode_cmd="run.pl --mem 4G"
export mkgraph_cmd="run.pl --mem 8G"
export cuda_cmd="run.pl --gpu 1"  #request one GPU per job (the resource options mainly matter for queue.pl on a grid)


**thchs-30_data_prep.sh**

#!/usr/bin/env bash
# Copyright 2016  Tsinghua University (Author: Dong Wang, Xuewei Zhang).  Apache 2.0.
#           2016  LeSpeech (Author: Xingyu Na)

#This script prepares the data directory for the thchs30 recipe.
#It reads the corpus and generates wav.scp and the transcriptions.

dir=$1
corpus_dir=$2
#these two positional parameters come from the call in run.sh: local/thchs-30_data_prep.sh $H $thchs/data_thchs30
#$1 is $H, i.e. H=`pwd` in run.sh, which is the current recipe directory

#$2 is $thchs/data_thchs30; since run.sh sets thchs=/home/huangsk/kaldi/egs/thchs30/s5/thchs30-openslr, this resolves to /home/huangsk/kaldi/egs/thchs30/s5/thchs30-openslr/data_thchs30, the corpus directory
      
cd $dir 

echo "creating data/{train,dev,test}" # 进入该目录,打印文字"creating data/{train,dev,test}"
mkdir -p data/{train,dev,test} #创建data目录,及子目录,一会儿会在这下面生成数据准备文件

#create wav.scp, utt2spk, spk2utt, text: wav.scp records the audio file path for each utterance ID, utt2spk and spk2utt record the speaker for each utterance, and text records the transcription for each utterance
<<kkk
A note on what happens here: from the audio file names and the transcriptions we create wav.scp, utt2spk, spk2utt and text, plus word.txt and phone.txt.
In wav.scp the first column is the recording ID <recording-id> and the second is the audio file path <extended-filename>.
Example: A11_000       /opt/kaldi/egs/thchs30/thchs30-openslr/data_thchs30/train/A11_0.wav

In utt2spk the first column is the utterance ID <utterance-id> and the second is the speaker ID <speaker-id>.
Example: A11_000      A11
In spk2utt the first column is the speaker <speaker-id>, followed by all of that speaker's utterances <utterance-id1> <utterance-id2> ...

This is why data/train/utt2spk is later converted into the data/train/spk2utt format.
In word.txt the first column is the utterance ID <utterance-id> and the rest is the spoken content; how these files are generated is studied below.

Example: A11_000      绿 是 阳春 烟 景 大块 文章 的 底色 四月 的 林 峦 更是 绿 得 鲜活 秀媚 诗意 盎然
In phone.txt the first column is the utterance ID <utterance-id> and the rest is the phonetic transcription of the content.
Example: A11_000      l v4 sh ix4 ii iang2 ch un1 ii ian1 j ing3 d a4 k uai4 uu un2 zh ang1 d e5 d i3 s e4 s iy4 vv ve4 d e5 l in2 l uan2 g eng4 sh ix4 l v4 d e5 x ian1 h uo2 x iu4 m ei4 sh ix1 ii i4 aa ang4 r an2
kkk
(
##loop over the three sets; these are the steps that generate each file
for x in train dev test; do
  echo "cleaning data/$x" #循环显示
  cd $dir/data/$x #进入每个目录
  rm -rf wav.scp utt2spk spk2utt word.txt phone.txt text #删除这些文件,即如果有这些文件就重新生成
  echo "preparing scps and text in data/$x"
  #updated new "for loop" figured out the compatibility issue with Mac     created by Xi Chen, in 03/06/2018
  #for nn in `find  $corpus_dir/$x/*.wav | sort -u | xargs -i basename {} .wav`; do
  for nn in `find  $corpus_dir/$x -name "*.wav" | sort -u | xargs -I {} basename {} .wav`; do 
  #search the set's directory for "*.wav" audio files, sorting the list and removing duplicate lines;
  #xargs converts piped or standard input into command-line arguments, and can also read data from a file;
  #its default command is echo, so input passed through xargs has its newlines and extra whitespace collapsed.
  #sort treats each input line as a unit and compares lines character by character from the start,
  #by ASCII value, printing them in ascending order; -u (unique) sorts and drops duplicates, -r (reverse) reverses the order.
  #basename strips the directory path and the given suffix, leaving the bare file or directory name.
      spkid=`echo $nn | awk -F"_" '{print "" $1}'`  #speaker ID. awk reads its input line by line,
      #splits each line into fields on the separator (whitespace by default), and processes the pieces;
      #it reads one '\n'-delimited record at a time, splits it on the field separator and fills the fields:
      #$0 is the whole record, $1 the first field, $n the nth field; -F sets the separator.
      spk_char=`echo $spkid | sed 's/\([A-Z]\).*/\1/'`  #the speaker's letter prefix. sed is a stream editor
      #that processes lines from one or more files; it can print the (possibly edited) stream to the screen
      #without touching the original file, or edit the file in place without printing to the screen.
      spk_num=`echo $spkid | sed 's/[A-Z]\([0-9]\)/\1/'` #the speaker's number (counting up from 0)
      spkid=$(printf '%s%.2d' "$spk_char" "$spk_num")  #recombine the letter prefix and the zero-padded number
      utt_num=`echo $nn | awk -F"_" '{print $2}'`  #the utterance number, counting up from 0
      uttid=$(printf '%s%.2d_%.3d' "$spk_char" "$spk_num" "$utt_num")   #the utterance ID: letter prefix, speaker number, utterance number
      echo $uttid $corpus_dir/$x/$nn.wav >> wav.scp #utterance ID followed by
      # the full path of the audio file, e.g.   A11_000    /opt/kaldi/egs/thchs30/thchs30-openslr/data_thchs30/train/A11_0.wav
      echo $uttid $spkid >> utt2spk #utterance ID and speaker ID, e.g. A11_000     A11
      echo $uttid `sed -n 1p $corpus_dir/data/$nn.wav.trn` >> word.txt  #utterance ID plus the first line of the matching .wav.trn transcription file (the Chinese text), e.g.    A11_000        绿 是 阳春 烟 景 大块 文章 的 底色 四月 的 林 峦 更是 绿 得 鲜活 秀媚 诗意 盎然
      echo $uttid `sed -n 3p $corpus_dir/data/$nn.wav.trn` >> phone.txt     #utterance ID plus the third line of the .wav.trn file (the phonetic transcription), e.g.    A11_000        l v4 sh ix4 ii iang2 ch un1 ii ian1 j ing3 d a4 k uai4 uu un2 zh ang1 d e5 d i3 s e4 s iy4 vv ve4 d e5 l in2 l uan2 g eng4 sh ix4 l v4 d e5 x ian1 h uo2 x iu4 m ei4 sh ix1 ii i4 aa ang4 r an2
  done 
  cp word.txt text
  #sort all of the generated files in place
  sort wav.scp -o wav.scp
  sort utt2spk -o utt2spk
  sort text -o text
  sort phone.txt -o phone.txt
done
) || exit 1
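To see the ID construction concretely, here is a minimal stand-alone sketch of the same awk/sed/printf steps, using the file name A11_0.wav from the examples above:

nn=A11_0                                    # basename with the .wav suffix stripped
echo $nn  | awk -F"_" '{print $1}'          # A11  (raw speaker ID)
echo A11  | sed 's/\([A-Z]\).*/\1/'         # A    (letter prefix)
echo A11  | sed 's/[A-Z]\([0-9]\)/\1/'      # 11   ("A1" is replaced by the captured "1"; the trailing "1" is untouched)
printf '%s%.2d_%.3d\n' A 11 0               # A11_000  (zero-padded utterance ID)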

utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt #call utils/utt2spk_to_spk2utt.pl to convert utt2spk into spk2utt; the same is done for dev and test below
 #> truncates and overwrites the target file (updating its timestamp): each run creates the file after > afresh, replacing the previous version; >> appends instead, starting on a new line
utils/utt2spk_to_spk2utt.pl data/dev/utt2spk > data/dev/spk2utt
utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt
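The conversion simply groups utterances under their speaker; a minimal sketch (utterance IDs reused from the examples above, /tmp/utt2spk is just a scratch file):

printf 'A11_000 A11\nA11_001 A11\n' > /tmp/utt2spk
utils/utt2spk_to_spk2utt.pl /tmp/utt2spk    # prints: A11 A11_000 A11_001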

echo "creating test_phone for phone decoding" #应该是创建测试集的音标
(
  rm -rf data/test_phone && cp -R data/test data/test_phone  || exit 1  #remove data/test_phone and recreate it as a copy of data/test
  cd data/test_phone && rm text &&  cp phone.txt text || exit 1  #inside it, delete the old text and use phone.txt as the new text, so the transcriptions are phone sequences
)


**steps/make_mfcc.sh**
#!/bin/bash

# Copyright 2012-2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# To be run from .. (one directory up from here)
# see ../run.sh for example

# Begin configuration section.
nj=4 #default number of parallel jobs
cmd=run.pl #default job runner
mfcc_config=conf/mfcc.conf  #MFCC configuration file; note that conf/mfcc.conf contains only these two lines:
--use-energy=false   # only non-default option.
                     #--sample-frequency=8000    i.e. the energy coefficient is not used
compress=true    #store the feature archives in compressed form
write_utt2num_frames=false  # if true writes utt2num_frames
write_utt2dur=true  # if true writes utt2dur (referenced below; declared here as in the upstream script)
# End configuration section.

echo "$0 $@"  # Print the command line for logging   这里是打印命令行到日志,$0为执行的命令 $@表示所有参数脚本的内容

if [ -f path.sh ]; then . ./path.sh; fi    #source path.sh if it exists
. parse_options.sh || exit 1;  #parse the command-line options; this resolves to utils/parse_options.sh, which is studied later on its own
if [ $# -lt 1 ] || [ $# -gt 3 ]; then
  cat >&2 <<EOF
Usage: $0 [options] <data-dir> [<log-dir> [<mfcc-dir>] ]
 e.g.: $0 data/train
Note: <log-dir> defaults to <data-dir>/log, and
      <mfcc-dir> defaults to <data-dir>/data.
Options:
  --mfcc-config <config-file>          # config passed to compute-mfcc-feats.
  --nj <nj>                            # number of parallel jobs.
  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
  --write-utt2dur <true|false>         # If true, write utt2dur file.
EOF
   exit 1;
fi

data=$1  #the first argument: <data-dir>
if [ $# -ge 2 ]; then #if the script received two or more arguments, the second one becomes logdir; otherwise logdir defaults to $data/log
  logdir=$2     #$# is the number of arguments passed to the script; -ge means greater-or-equal,
else            #e.g. [ $j -ge 2 -a $j -le $(($wide-1)) ] tests that $j >= 2 and $j <= wide-1 ($((...)) evaluates the arithmetic, -a means "and")
  logdir=$data/log
fi
if [ $# -ge 3 ]; then
  mfccdir=$3
else
  mfccdir=$data/data
fi

# make $mfccdir an absolute pathname: a perl one-liner prepends $PWD when the given path is not already absolute
mfccdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $mfccdir ${PWD}`
#@ARGV always exists whether or not the script mentions it; it is the array perl uses to receive the command-line arguments
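A quick demonstration of the one-liner (the paths are made up):

perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir;' mfcc/train /home/user
# prints /home/user/mfcc/train; a first argument that already starts with / is printed unchanged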
# use "name" as part of name of the archive.
name=`basename $data`

mkdir -p $mfccdir || exit 1;
mkdir -p $logdir || exit 1;

if [ -f $data/feats.scp ]; then   #$data is e.g. data/mfcc/train
  mkdir -p $data/.backup
  echo "$0: moving $data/feats.scp to $data/.backup"
  mv $data/feats.scp $data/.backup
fi

scp=$data/wav.scp

required="$scp $mfcc_config"  # $mfcc_config is conf/mfcc.conf

for f in $required; do #make sure the required files exist before continuing
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done

utils/validate_data_dir.sh --no-text --no-feats $data || exit 1; #call a script that validates the data directory, checking that the expected files and directories exist

if [ -f $data/spk2warp ]; then   #optionally pick up VTLN (vocal tract length normalization) warp factors; neither file exists here, so no VTLN options are used.
  echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
  vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
elif [ -f $data/utt2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
  vtln_opts="--vtln-map=ark:$data/utt2warp"
else
  vtln_opts=""
fi

for n in $(seq $nj); do
  # the next command does nothing unless $mfccdir/storage/ exists, see
  # utils/create_data_link.pl for more info (i.e. the command below is a no-op unless $mfccdir/storage/ exists)
  utils/create_data_link.pl $mfccdir/raw_mfcc_$name.$n.ark #mfccdir is e.g. /home/huangsk/kaldi/egs/thchs30/s5/mfcc/train
done


if $write_utt2num_frames; then   #declared false by default in the configuration section above
  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
else
  write_num_frames_opt=
fi

if $write_utt2dur; then
  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
else
  write_utt2dur_opt=
fi

if [ -f $data/segments ]; then
  echo "$0 [info]: segments file exists: using that."

  split_segments=
  for n in $(seq $nj); do
    split_segments="$split_segments $logdir/segments.$n"
  done

  utils/split_scp.pl $data/segments $split_segments || exit 1;
  rm $logdir/.error 2>/dev/null

  $cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \
    extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \
    compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
      --config=$mfcc_config ark:- ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
     || exit 1;

else
  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
  split_scps=
  for n in $(seq $nj); do
    split_scps="$split_scps $logdir/wav_${name}.$n.scp"  #切分为四个,每一个都添加到这个变量中
  done   #将$scp(data/mfcc/train下的wav.acp)按照$split_scps切分

  utils/split_scp.pl $scp $split_scps || exit 1;


  # add ,p to the input rspecifier so that we can just skip over
  # utterances that have bad wave data. #this feeds compute-mfcc-feats, the program that computes the MFCCs; it takes two table arguments: an rspecifier for reading the .wav data (indexed by utterance) and a wspecifier for writing the features (indexed by utterance)

  $cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \
    compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
      --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \
    copy-feats $write_num_frames_opt --compress=$compress ark:- \
      ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
      || exit 1;
fi
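The $cmd JOB=1:$nj name.JOB.log idiom is how these scripts parallelize: run.pl (or queue.pl) substitutes JOB=1..$nj into both the command and the log name and launches the copies in parallel. A minimal sketch, assuming utils/ is on PATH (the echo is just a stand-in command):

mkdir -p exp/demo
run.pl JOB=1:2 exp/demo/echo.JOB.log echo "this is job JOB"
# runs two jobs concurrently; their output lands in exp/demo/echo.1.log and exp/demo/echo.2.log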


if [ -f $logdir/.error.$name ]; then
  echo "$0: Error producing MFCC features for $name:"
  tail $logdir/make_mfcc_${name}.1.log
  exit 1;
fi

# concatenate the .scp files together.
for n in $(seq $nj); do  #merge $mfccdir/raw_mfcc_train.1.scp ... raw_mfcc_train.4.scp (assuming nj=4) into one list; the $cmd JOB=1:... command above must have run so that these files exist
  cat $mfccdir/raw_mfcc_$name.$n.scp || exit 1
done > $data/feats.scp || exit 1    #and write the result to $data/feats.scp

if $write_utt2num_frames; then
  for n in $(seq $nj); do
    cat $logdir/utt2num_frames.$n || exit 1
  done > $data/utt2num_frames || exit 1
fi

if $write_utt2dur; then
  for n in $(seq $nj); do
    cat $logdir/utt2dur.$n || exit 1
  done > $data/utt2dur || exit 1
fi

# Store frame_shift and mfcc_config along with features.
frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
                          printf "%.3f", 0.001 * $1; exit; }' $mfcc_config)
echo ${frame_shift:-'0.01'} > $data/frame_shift
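The perl one-liner extracts --frame-shift (given in milliseconds) from the config and converts it to seconds; when the option is absent it prints nothing and the ${frame_shift:-'0.01'} expansion falls back to the 10 ms default. A minimal sketch, using process substitution in place of the real config file:

perl -ne 'if (/^--frame-shift=(\d+)/) { printf "%.3f", 0.001 * $1; exit; }' <(echo --frame-shift=25)
# prints 0.025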
mkdir -p $data/conf && cp $mfcc_config $data/conf/mfcc.conf || exit 1

# remove the per-job wav_train.*.scp splits and any segments.*, utt2num_frames.* and utt2dur.* files from the log directory
rm $logdir/wav_${name}.*.scp  $logdir/segments.* \
   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null

nf=$(wc -l < $data/feats.scp)  #wc is the counting utility (characters, words, lines); wc -l counts lines
nu=$(wc -l < $data/utt2spk)
if [ $nf -ne $nu ]; then
  echo "$0: It seems not all of the feature files were successfully procesed" \
       "($nf != $nu); consider using utils/fix_data_dir.sh $data"
fi

if (( nf < nu - nu/20 )); then
  echo "$0: Less than 95% the features were successfully generated."\
       "Probably a serious error."
  exit 1
fi


echo "$0: Succeeded creating MFCC features for $name"


**compute_cmvn_stats.sh**
#!/usr/bin/env bash

# Copyright 2012-2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# To be run from .. (one directory up from here)
# see ../run.sh for example

# Compute cepstral mean and variance statistics per speaker.
# We do this in just one job; it's fast.
# This script takes no options.
#
# Note: there is no option to do CMVN per utterance.  The idea is
# that if you did it per utterance it would not make sense to do
# per-speaker fMLLR on top of that (since you'd be doing fMLLR on
# top of different offsets).  Therefore what would be the use
# of the speaker information?  In this case you should probably
# make the speaker-ids identical to the utterance-ids.  The
# speaker information does not have to correspond to actual
# speakers, it's just the level you want to adapt at.

echo "$0 $@"  # Print the command line for logging #打印命令行日志

fake=false   # If specified, can generate fake/dummy CMVN stats (that won't normalize)
fake_dims=   # as the "fake" option, but you can generate "fake" stats only for certain
             # dimensions.
two_channel=false

if [ "$1" == "--fake" ]; then
  fake=true
  shift   #shift drops positional parameters from the left; e.g. after shift 3 the old $4 becomes $1
fi
if [ "$1" == "--fake-dims" ]; then
  fake_dims=$2
  shift
  shift
fi
if [ "$1" == "--two-channel" ]; then
  two_channel=true
  shift
fi
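A quick illustration of this shift-based option parsing (a minimal sketch; set -- fakes the script's positional parameters):

set -- --fake data/train exp/make_mfcc/train mfcc   # pretend invocation
if [ "$1" == "--fake" ]; then fake=true; shift; fi  # consume the flag
echo $1                                             # now prints data/train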

if [ $# -lt 1 ] || [ $# -gt 3 ]; then  #-lt means less-than, -gt greater-than, -ne not-equal; $# is the number of arguments the script received
   echo "Usage: $0 [options] <data-dir> [<log-dir> [<cmvn-dir>] ]";
   echo "e.g.: $0 data/train exp/make_mfcc/train mfcc"
   echo "Note: <log-dir> defaults to <data-dir>/log, and <cmvn-dir> defaults to <data-dir>/data"
   echo "Options:"
   echo " --fake          gives you fake cmvn stats that do no normalization."
   echo " --two-channel   is for two-channel telephone data, there must be no segments "
   echo "                 file and reco2file_and_channel must be present.  It will take"
   echo "                 only frames that are louder than the other channel."
   echo " --fake-dims <n1:n2>  Generate stats that won't cause normalization for these"
   echo "                  dimensions (e.g. 13:14:15)"
   exit 1;
fi

if [ -f path.sh ]; then . ./path.sh; fi	

data=$1
if [ $# -ge 2 ]; then
  logdir=$2
else
  logdir=$data/log
fi
if [ $# -ge 3 ]; then
  cmvndir=$3
else
  cmvndir=$data/data
fi

# make $cmvndir an absolute pathname: the same perl one-liner as in make_mfcc.sh, prepending $PWD when the path is relative
cmvndir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $cmvndir ${PWD}`

# use "name" as part of name of the archive.
name=`basename $data`

mkdir -p $cmvndir || exit 1;
mkdir -p $logdir || exit 1;


required="$data/feats.scp $data/spk2utt"

for f in $required; do   #loop over the two required files and check that both exist
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done
 #the following checks $fake, $two_channel and "$fake_dims" in turn; if none of the three conditions holds, the else branch below computes the real per-speaker CMVN stats
if $fake; then   
  dim=`feat-to-dim scp:$data/feats.scp -`
  ! cat $data/spk2utt | awk -v dim=$dim '{print $1, "["; for (n=0; n < dim; n++) { printf("0 "); } print "1";
                                                        for (n=0; n < dim; n++) { printf("1 "); } print "0 ]";}' | \
    copy-matrix ark:- ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp && \
     echo "Error creating fake CMVN stats.  See $logdir/cmvn_$name.log." && exit 1;
elif $two_channel; then
  ! compute-cmvn-stats-two-channel $data/reco2file_and_channel scp:$data/feats.scp \
       ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp \
    2> $logdir/cmvn_$name.log && echo "Error computing CMVN stats (using two-channel method). See $logdir/cmvn_$name.log." && exit 1;
elif [ ! -z "$fake_dims" ]; then
  ! compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark:- | \
    modify-cmvn-stats "$fake_dims" ark:- ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp && \
    echo "Error computing (partially fake) CMVN stats.  See $logdir/cmvn_$name.log" && exit 1;
else
  ! compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp \
    2> $logdir/cmvn_$name.log && echo "Error computing CMVN stats. See $logdir/cmvn_$name.log" && exit 1;
fi
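For concreteness: the fake branch's awk emits, per speaker, a 2 x (dim+1) CMVN stats matrix whose first row holds the per-dimension sums (all zero) followed by a frame count of one, and whose second row holds the sums of squares (all one) followed by a zero. That encodes mean 0 and variance 1, so applying it performs no normalization. For dim=3 the text form looks like this (speaker ID reused from the earlier examples):

A11 [
0 0 0 1
1 1 1 0 ]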

cp $cmvndir/cmvn_$name.scp $data/cmvn.scp || exit 1;

nc=`cat $data/cmvn.scp | wc -l`  #number of lines in $data/cmvn.scp
nu=`cat $data/spk2utt | wc -l`  #number of lines in $data/spk2utt
if [ $nc -ne $nu ]; then      #if the line counts differ, not every speaker got CMVN stats
  echo "$0: warning: it seems not all of the speakers got cmvn stats ($nc != $nu);"
  [ $nc -eq 0 ] && exit 1;   #abort only when no speaker got stats at all
fi
#-eq   equal
#-ne   not equal
#-gt   greater than
#-lt   less than
#-ge   greater than or equal
#-le   less than or equal
echo "Succeeded creating CMVN stats for $name"


