0. References
PyTorch-Kaldi简介 - 李理的博客 (fancyerii.github.io)
1. 执行
默认安装好了所有环境,包括:
- kaldi
- pytorch
- git clone https://github.com/mravanelli/pytorch-kaldi
- pip install -r requirements.txt
- TIMIT数据集
执行recipe的步骤:
1. kaldi训练
cd $KALDI_ROOT/egs/timit/s5
./run.sh
./local/nnet/run_dnn.sh
2. 利用kaldi中训好的DNN,对所有数据(train/dev/test)进行解码对齐,对齐结果作为标签用于pytorch-kaldi端的训练
steps/nnet/align.sh --nj 4 data-fmllr-tri3/train data/lang exp/dnn4_pretrain-dbn_dnn exp/dnn4_pretrain-dbn_dnn_ali
steps/nnet/align.sh --nj 4 data-fmllr-tri3/dev data/lang exp/dnn4_pretrain-dbn_dnn exp/dnn4_pretrain-dbn_dnn_ali_dev
steps/nnet/align.sh --nj 4 data-fmllr-tri3/test data/lang exp/dnn4_pretrain-dbn_dnn exp/dnn4_pretrain-dbn_dnn_ali_test
3. pytorch训练
python run_exp.py cfg/TIMIT_baselines/TIMIT_MLP_mfcc_basic.cfg
2. 输出
- kaldi输出:特征,每一步的声学模型和对齐结果,dnn的最终对齐结果
$ pwd
/home/cxj/work/kaldi/egs/timit/s5/exp
$ tree . -L 1
.
├── dnn4_pretrain-dbn
├── dnn4_pretrain-dbn_dnn
├── dnn4_pretrain-dbn_dnn_ali
├── dnn4_pretrain-dbn_dnn_ali_dev
├── dnn4_pretrain-dbn_dnn_ali_test
├── dnn4_pretrain-dbn_dnn_denlats
├── dnn4_pretrain-dbn_dnn_smbr
├── make_mfcc
├── mono
├── mono_ali
├── tri1
├── tri1_ali
├── tri2
├── tri2_ali
├── tri3
└── tri3_ali
- pytorch-kaldi输出:cfg(数据、模型、训练、解码的配置)、pytorch声学模型的解码结果、kaldi端执行的log、指标
$ pwd
/home/cxj/work/project/pytorch-kaldi/exp/TIMIT_MLP_basic
$ tree . -L 1
.
├── conf.cfg
├── decode_TIMIT_test_out_dnn1
├── decoding_TIMIT_test_out_dnn1.conf
├── exp_files
├── final.mdl
├── generated_outputs
├── log.log
├── model.diag
└── res.res
3. 代码流程
3.1 逻辑入口
python run_exp.py cfg/TIMIT_baselines/TIMIT_MLP_mfcc_basic.cfg
3.2 run_exp.py
run_exp.py里主要逻辑是三件事: 1. 配置解析(数据、模型、训练、解码等) 2. 在pytorch端训练nn模型 3. 利用训练好的nn模型做前向传播并交给kaldi解码
3.3 数据流程
其中需要弄清楚的就是数据流向,从输入到输出,框架的做法是(K for kaldi and P for python): 1. [K]从kaldi产出的ark和scp文件,获取音频的特征、标签(kaldi在dnn阶段的对齐结果) 2. [P]数据切分成chunk,然后分块生成子配置,执行训练 3. [P]前向传播 4. [K]用kaldi产出的mdl做对齐解码
3.4 Code Snippets
run_exp.py
# --------FORWARD--------#
# Forward (decoding-preparation) phase of run_exp.py: for each forward dataset,
# process every chunk with the trained network; each finished chunk is marked
# by an .info file so already-processed chunks are skipped on re-runs.
for forward_data in forward_data_lst:
# forward_data is the dataset identifier; it is used to locate the matching
# input/output cfg, which is parsed into data/model info and handed to the nn.
...
...
for ck in range(N_ck_forward):
# output file
# Per-chunk marker file: forward_<data>_ep<epoch>_ck<chunk>.info
info_file = (
out_folder
+ "/exp_files/forward_"
+ forward_data
+ "_ep"
+ format(ep, N_ep_str_format)
+ "_ck"
+ format(ck, N_ck_str_format)
+ ".info"
)
# Per-chunk sub-config (same naming scheme, .cfg extension).
config_chunk_file = (
out_folder
+ "/exp_files/forward_"
+ forward_data
+ "_ep"
+ format(ep, N_ep_str_format)
+ "_ck"
+ format(ck, N_ck_str_format)
+ ".cfg"
)
# Do forward if the chunk was not already processed
if not (os.path.exists(info_file)):
# Doing forward
# getting the next chunk
next_config_file = cfg_file_list[op_counter]
if _run_forwarding_in_subprocesses(config):
...
...
else:
# run_nn performs the forward pass for this chunk; it also returns the
# data for the NEXT chunk, which it pre-loaded in a background thread.
[data_name, data_set, data_end_index, fea_dict, lab_dict, arch_dict] = run_nn(
data_name,
data_set,
data_end_index,
fea_dict,
lab_dict,
arch_dict,
config_chunk_file,
processed_first,
next_config_file,
)
# Only the very first chunk loads its own data synchronously; see run_nn.
processed_first = False
if not (os.path.exists(info_file)):
# NOTE(review): the "\n" escapes in this message appear garbled as "n"
# in this transcription — presumably "...exist.\nSee %s \n" upstream.
sys.stderr.write(
"ERROR: forward chunk %i of dataset %s not done! File %s does not exist.nSee %s n"
% (ck, forward_data, info_file, log_file)
)
sys.exit(0)
info_files.append(info_file)
# update the operation counter
op_counter += 1
core.py
def run_nn(
data_name, data_set, data_end_index, fea_dict, lab_dict, arch_dict, cfg_file, processed_first, next_config_file
):
"""Process the current chunk described by cfg_file (train/valid/forward,
selected by `to_do`) while pre-loading the NEXT chunk in a background
thread, and return the next chunk's data for the caller's next call.

Returns: [data_name, data_set, data_end_index, fea_dict, lab_dict,
arch_dict] — all describing the NEXT chunk (read from next_config_file).

NOTE(review): excerpt with elisions (`...`); several names used below
(is_production, output_folder, save_gpumem, use_cuda, model, config,
to_do, forward_outs, etc.) are defined in the elided parts.
"""
# This function processes the current chunk using the information in cfg_file. In parallel, the next chunk is load into the CPU memory
...
...
# ***** Reading the Data********
if processed_first:
# Reading all the features and labels for this chunk
shared_list = []
# read_lab_fea loads every data-related item described by the cfg file;
# results are passed back through shared_list (thread has no return value).
p = threading.Thread(target=read_lab_fea, args=(cfg_file, is_production, shared_list, output_folder))
p.start()
# start() immediately followed by join(): the first chunk is loaded
# synchronously — only the NEXT-chunk read below actually overlaps work.
p.join()
data_name = shared_list[0]
data_end_index = shared_list[1]
fea_dict = shared_list[2]
lab_dict = shared_list[3]
arch_dict = shared_list[4]
data_set = shared_list[5]
# converting numpy tensors into pytorch tensors and put them on GPUs if specified
if not (save_gpumem) and use_cuda:
data_set = torch.from_numpy(data_set).float().cuda()
else:
data_set = torch.from_numpy(data_set).float()
# Reading all the features and labels for the next chunk
# (no join here — this read runs in parallel with the minibatch loop
# below and is joined only at the end of this function)
shared_list = []
p = threading.Thread(target=read_lab_fea, args=(next_config_file, is_production, shared_list, output_folder))
p.start()
# Reading model and initialize networks
inp_out_dict = fea_dict
[nns, costs] = model_init(inp_out_dict, model, config, arch_dict, use_cuda, multi_gpu, to_do)
# optimizers initialization
optimizers = optimizer_init(nns, config, arch_dict)
# pre-training and multi-gpu init
for net in nns.keys():
pt_file_arch = config[arch_dict[net][0]]["arch_pretrain_file"]
if pt_file_arch != "none":
# Restore pretrained weights and optimizer state from checkpoint.
if use_cuda:
checkpoint_load = torch.load(pt_file_arch)
else:
checkpoint_load = torch.load(pt_file_arch, map_location="cpu")
nns[net].load_state_dict(checkpoint_load["model_par"])
optimizers[net].load_state_dict(checkpoint_load["optimizer_par"])
optimizers[net].param_groups[0]["lr"] = float(
config[arch_dict[net][0]]["arch_lr"]
) # loading lr of the cfg file for pt
if multi_gpu:
nns[net] = torch.nn.DataParallel(nns[net])
if to_do == "forward":
# Open one output .ark per forward output; outputs that will be decoded
# by kaldi get a "_to_decode" suffix in the file name.
post_file = {}
for out_id in range(len(forward_outs)):
if require_decodings[out_id]:
out_file = info_file.replace(".info", "_" + forward_outs[out_id] + "_to_decode.ark")
else:
out_file = info_file.replace(".info", "_" + forward_outs[out_id] + ".ark")
post_file[forward_outs[out_id]] = open_or_fd(out_file, output_folder, "wb")
# check automatically if the model is sequential
seq_model = is_sequential_dict(config, arch_dict)
# ***** Minibatch Processing loop********
...
...
for i in range(N_batches):
max_len = 0
if seq_model:
...
...
else:
# features and labels for batch i
if to_do != "forward":
inp = data_set[beg_batch:end_batch, :].contiguous()
else:
# Forward mode iterates sentence by sentence: data_end_index holds
# the cumulative end offset of each sentence in data_set.
snt_len = data_end_index[snt_index] - beg_snt
inp = data_set[beg_snt : beg_snt + snt_len, :].contiguous()
beg_snt = data_end_index[snt_index]
snt_index = snt_index + 1
...
...
if to_do == "train":
# Forward input, with autograd graph active
...
...
else:
with torch.no_grad(): # Forward input without autograd graph (save memory)
outs_dict = forward_model(
fea_dict,
lab_dict,
arch_dict,
model,
nns,
costs,
inp,
inp_out_dict,
max_len,
batch_size,
to_do,
forward_outs,
)
if to_do == "forward":
for out_id in range(len(forward_outs)):
out_save = outs_dict[forward_outs[out_id]].data.cpu().numpy()
if forward_normalize_post[out_id]:
# read the config file
# Divide posteriors by prior counts (in log domain) to get
# scaled likelihoods for kaldi decoding.
counts = load_counts(forward_count_files[out_id])
out_save = out_save - np.log(counts / np.sum(counts))
# save the output
write_mat(output_folder, post_file[forward_outs[out_id]], out_save, data_name[i])
else:
# train/valid: accumulate loss and error for reporting.
loss_sum = loss_sum + outs_dict["loss_final"].detach()
err_sum = err_sum + outs_dict["err_final"].detach()
# update it to the next batch
beg_batch = end_batch
end_batch = beg_batch + batch_size
...
...
# Getting the data for the next chunk (read in parallel)
# Wait for the background read started above, then unpack its results.
p.join()
data_name = shared_list[0]
data_end_index = shared_list[1]
fea_dict = shared_list[2]
lab_dict = shared_list[3]
arch_dict = shared_list[4]
data_set = shared_list[5]
# converting numpy tensors into pytorch tensors and put them on GPUs if specified
if not (save_gpumem) and use_cuda:
data_set = torch.from_numpy(data_set).float().cuda()
else:
data_set = torch.from_numpy(data_set).float()
return [data_name, data_set, data_end_index, fea_dict, lab_dict, arch_dict]
data_io.py
# Data flow: load all the data and labels described by the config file.
def read_lab_fea(cfg_file, fea_only, shared_list, output_folder):
...
...
[data_name_fea, data_set_fea, data_end_index_fea] = load_chunk(
# features, feature opts, alignment labels, label opts, left context window, right context window
fea_scp, fea_opts, lab_folder, lab_opts, cw_left, cw_right, max_seq_length, output_folder, fea_only)
def load_chunk(
# features, feature opts, alignment labels, label opts, left window, right window
fea_scp, fea_opts, lab_folder, lab_opts, left, right, max_sequence_length, output_folder, fea_only=False
):
def load_dataset(
# features, feature opts, alignment labels, label opts, left window, right window
fea_scp, fea_opts, lab_folder, lab_opts, left, right, max_sequence_length, output_folder, fea_only=False
):