项目模板和描述:链接地址
本次实验所用的数据为0-9(其中0的标签为Z(Zero))和o这11个字符的英文录音,每个录音的原始录音文件和39维的MFCC特征都已经提供, 实验中,每个字符用一个GMM来建模,在测试阶段,对于某句话,对数似然最大的模型对应的字符为当前语音数据的预测的标签(target)
训练数据:330句话,每个字符30句话,11个字符
测试数据:110句话,每个字符10句话,11个字符
digit_test/digit_train里面包含了测试和训练用数据,包括:
-
wav.scp, 句子id到wav的路径的映射,所用到的数据wav文件的相对路径
-
feats.scp, 语音识别工具kaldi提取的特征文件之一,句子id到特征数据真实路径和位置的映射
-
feats.ark, 语音识别工具kaldi提取的特征文件之一,特征实际存储在ark文件中,二进制
-
text, 句子id到标签的映射,本实验中标签(语音对应的文本)只能是0-9,o这11个字符
程序:
kaldi_io.py提供了读取kaldi特征的功能
utils.py 提供了一个特征读取工具
gmm_estimator.py 核心代码,提供了GMM训练和测试的代码
gmm_estimator.py:
from utils import *
import scipy.cluster.vq as vq
#由逻辑可以看出,以一帧信号的39维MFCC特征为观测点单位而不是一个语音信号为观测点单位
#Kmeans分类个数
num_gaussian = 5
#高斯混合模型迭代次数
num_iterations = 5
#11个类别的标签
targets = ['Z', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O']
class GMM:
def __init__(self, D, K=5):
assert(D>0)
self.dim = D
self.K = K
#Kmeans Initial
#mu[5,39]每一个类别的中心坐标,即初始化的均值
#sigma[5,39,39],即初始化的协方差矩阵
#pi[5,],每一个类别的样本点个数占整体样本点个数的比例,即初始化的pi k
self.mu , self.sigma , self.pi= self.kmeans_initial()
def kmeans_initial(self):
mu = []
sigma = []
#返回所有330个语音的MFCC特征矩阵[18539,39],每个语音由于时长不一样帧数也不一样但是总和是18539
data = read_all_data('train/feats.scp')
#centroids[5,39]为K个高斯分布的39维中心坐标,labels[18593,]为每一个点属于哪一个类别的标志
#minit = "points"从数据中随机选择k个观察值(行)初始质心。iter为迭代100次,所以每一个初始化的gmm对象的此步骤得到结果不一样
(centroids, labels) = vq.kmeans2(data, self.K, minit="points", iter=100)
#创建K行数组,每一行存入属于对应类别点的观测值
clusters = [[] for i in range(self.K)]
for (l,d) in zip(labels,data):
clusters[l].append(d)
for cluster in clusters:
#axis = 0,计算cluster的每一行均值,即为初始化高斯分布的均值坐标
mu.append(np.mean(cluster, axis=0))
#计算协方差矩阵
sigma.append(np.cov(cluster, rowvar=False))
pi = np.array([len(c)*1.0 / len(data) for c in clusters])
return mu , sigma , pi
#求高斯分布概率
def gaussian(self , x , mu , sigma):
"""Calculate gaussion probability.
:param x: The observed data, dim*1.
:param mu: The mean vector of gaussian, dim*1
:param sigma: The covariance matrix, dim*dim
:return: the gaussion probability, scalor
"""
D=x.shape[0]
det_sigma = np.linalg.det(sigma)
inv_sigma = np.linalg.inv(sigma + 0.0001)
mahalanobis = np.dot(np.transpose(x-mu), inv_sigma)
mahalanobis = np.dot(mahalanobis, (x-mu))
const = 1/((2*np.pi)**(D/2))
return const * (det_sigma)**(-0.5) * np.exp(-0.5 * mahalanobis)
#计算对数似然概率
def calc_log_likelihood(self , X):
"""Calculate log likelihood of GMM
param: X: A matrix including data samples, num_samples * D
return: log likelihood of current model
"""
m = X.shape[0]
pdfs = np.zeros((m, self.K))
for k in range(self.K):
for i in range(X.shape[0]):
pdfs[i, k] = self.gaussian(X[i], self.mu[k], self.sigma[k])
return np.sum(np.log(np.sum(pdfs, axis=1)))
def em_estimator(self , X):
"""Update paramters of GMM
param: X: A matrix including data samples, num_samples * D
return: log likelihood of updated model
"""
#pdfs[n,k],为每一帧观测点数据由对应高斯方程生成的概率
pdfs = np.zeros((X.shape[0], self.K))
gamma = np.zeros((X.shape[0], self.K))
for k in range(self.K):
for i in range(X.shape[0]):
#传入第i个高斯方程的mu和sigma,以及输入观测数据X[i],返回此高斯方程生成该观测数据的概率
pdfs[i, k] = self.gaussian(X[i], self.mu[k], self.sigma[k])
#后验分布gamma
gamma = pdfs / np.sum(pdfs, axis=1).reshape(-1, 1)
#更新pi mu sigma
pi = np.sum(gamma, axis=0) / np.sum(gamma)
mu = np.zeros((self.K, self.dim))
sigma = np.zeros((self.K, self.dim, self.dim))
for k in range(self.K):
mu[k] = np.average(X, axis=0, weights=gamma[:, k])
cov = np.zeros((self.dim, self.dim))
for i in range(X.shape[0]):
tmp = (X[i] - mu[k]).reshape(-1, 1)
cov += gamma[i, k] * np.dot(tmp, tmp.T)
sigma[k, :, :] = cov / np.sum(gamma[:, k])
self.pi = pi
self.mu = mu
self.sigma = sigma
log_llh = self.calc_log_likelihood(X)
return log_llh
def train(gmms, num_iterations = num_iterations):
#dict_utt2feat{330} scp文件以空格分割,wav文件名为key
#dict_target2utt{11,30} 每一个类别对应的wav文件名,类型名为key
dict_utt2feat, dict_target2utt = read_feats_and_targets('train/feats.scp', 'train/text')
'''除了kmeans初始化时候取的默认中心之外,在这里对不同的gmm做出了区分
这里gmms[target]只训练标签为target的wav文件里的数据
'''
for target in targets:
#feats[n,39] n等于target类中的33个语音信号的所有帧数
feats = get_feats(target, dict_utt2feat, dict_target2utt)
#gmms[target]参数迭代num_iterations次
for i in range(num_iterations):
log_llh = gmms[target].em_estimator(feats)
print('GMM-Type \'' + target + '\' training succeeded!')
return gmms
def test(gmms):
correction_num = 0
error_num = 0
acc = 0.0
dict_utt2feat, dict_target2utt = read_feats_and_targets('test/feats.scp', 'test/text')
#获取到每个wav文件的标签,key为wav文件名
dict_utt2target = {}
for target in targets:
utts = dict_target2utt[target]
for utt in utts:
dict_utt2target[utt] = target
#遍历测试集
for utt in dict_utt2feat.keys():
#获取某一个wav的每一帧数据[n,39]
feats = kaldi_io.read_mat(dict_utt2feat[utt])
scores = []
for target in targets:
scores.append(gmms[target].calc_log_likelihood(feats))
#获取到概率最大的标签
predict_target = targets[scores.index(max(scores))]
#测试正确和错误的数量
if predict_target == dict_utt2target[utt]:
correction_num += 1
else:
error_num += 1
acc = correction_num * 1.0 / (correction_num + error_num)
print('测试完成!')
print('此次一共测试%d个数据' % (error_num+correction_num))
print('测试正确'+str(correction_num)+'次,测试错误'+str(error_num)+'次,正确率'+str(acc))
return acc
def main():
gmms = {}
for target in targets:
gmms[target] = GMM(39, K=num_gaussian) #Initial model
print('GMM Initialization succeeded!')
#训练的目的是将该gmm模型最大似然到最大概率生成类别target语音特征数据的gmm模型
gmms = train(gmms)
acc = test(gmms)
fid = open('acc.txt', 'w')
fid.write(str(acc))
fid.close()
if __name__ == '__main__':
main()
Terminal输出结果:
GMM Initialization succeeded!
GMM-Type 'Z' training succeeded!
GMM-Type '1' training succeeded!
GMM-Type '2' training succeeded!
GMM-Type '3' training succeeded!
GMM-Type '4' training succeeded!
GMM-Type '5' training succeeded!
GMM-Type '6' training succeeded!
GMM-Type '7' training succeeded!
GMM-Type '8' training succeeded!
GMM-Type '9' training succeeded!
GMM-Type 'O' training succeeded!
测试完成!
此次一共测试110个数据
测试正确109次,测试错误1次,正确率0.990909090909091
进程已结束,退出代码0