相关代码路径:./kaldi/src/bin/compute-gop.cc
// bin/compute-gop.cc
// Copyright 2019 Junbo Zhang
// See ../../COPYING for clarification regarding multiple authors
/**
此代码用于计算发音优度(GOP)并提取音素级发音特征,用于发音错误检测任务,参考:基于深度神经网络训练的声学模型和基于迁移学习的logistic回归分类器改进的发音错误检测。
GOP被广泛用于检测发音错误。 基于DNN的GOP定义为规范音素与得分最高的音素之间的对数音素后验比率。
要计算GOP,我们需要先计算对数音素后验概率(LPP),具体定义详见上文提到的参考论文。
*/
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "hmm/hmm-utils.h"
#include "hmm/tree-accu.h"
#include "hmm/posterior.h"
namespace kaldi {
/**
 * Returns how many phone slots are needed to index every phone id found in
 * pdf2phones: one more than the largest id across all sets. The result is
 * never below 0, so an empty vector (or one holding only empty sets) gives 0.
 *
 * @param pdf2phones  one set of phone ids per entry (presumably indexed by
 *                    pdf-id, as the name suggests); empty sets are ignored.
 * @return 1 + the largest phone id seen, or 0 if there is none.
 */
int32 PhoneNum(const std::vector<std::set<int32> > &pdf2phones) {
  int32 count = 0;
  typedef std::vector<std::set<int32> >::const_iterator SetIter;
  for (SetIter it = pdf2phones.begin(); it != pdf2phones.end(); ++it) {
    if (it->empty()) continue;
    // std::set keeps its elements sorted, so the last one is the largest id.
    const int32 candidate = 1 + *it->rbegin();
    if (count < candidate) count = candidate;
  }
  return count;
}
/** ComputeLpps computes log posteriors for pure-phones by summing the posteriors
of the states belonging to those triphones whose current phone is the canonical
phone:
p(p|o_t) = \sum_{s \in p} p(s|o_t),
where s is the senone label, {s|s \in p} is the states belonging to those
triphones whose current phone is the canonical phone p.
ComputeLpps 通过累加属于某个纯音素的全部三音素状态的后验概率,来计算这个纯音素的对数后验概率。
*/
【补充说明】 下图展示了ComputeLpps这块代码的输入与输出
void ComputeLpps(const Matrix<BaseFloat> &prob,
const std::vector<std::set<int32> > &pdf2phones,
Matrix<BaseFloat> *lpps) {
int32 mono_num = PhoneNum(pdf2phones);
lpps->Resize(prob.NumRows(), mono_num, kSetZero); // lpps resize成(frame_num, mono_num),即(227, 42)
KALDI_WARN << "[test] lpps: NumRows: " << lpps->NumRows() << ", NumCols: " << lpps->NumCols();
KALDI_WARN << "[test] prob: NumRows: " << prob.NumRows() << ", NumCols: " << prob.NumCols();
【补充说明】 上面这段代码的输出结果如下:第一个音频共227帧,第二个音频共186帧。
[test] lpps: NumRows: 227, NumCols: 42
[test] prob: NumRows: 227, NumCols: 5792
[test] lpps: NumRows: 186, NumCols: 42
[test] prob: NumRows: 186, NumCols: 5792
for (int32 i = 0; i < prob.NumCols(); i++) { // prob.NumCols(): 5792,即遍历每一个
SubMatrix<float> src(prob, 0, prob.NumRows(), i, 1); // src shape: (227, 1),表示某个音素的三音素状态在每一帧(227帧)上的概率值
KALDI_WARN << "[test] src: NumRows: " << src.NumRows() << ", NumCols: " << src.NumCols();
for (int32 ph : pdf2phones.at(i)) {
KALDI_WARN << "[test] i: " << i << ", ph: " << ph;
SubMatrix<float> dst(*lpps, 0, prob.NumRows(), ph, 1); // dst shape: (227, 1),获取第ph个纯音素
KALDI_WARN << "[test] dst: NumRows: " << dst.NumRows() << ", NumCols: " << dst.NumCols();
dst.AddMat(1, src);