MFCC算法的Python实现

本文介绍了MFCC(梅尔倒谱系数)算法的基本思路,并提供了详细的Python代码实现。包括从读取波形文件到应用汉明窗、分帧、傅里叶变换,再到获取特征数据的整个过程。此外,还涵盖了预加重、DCT变换等相关步骤。
摘要由CSDN通过智能技术生成

MFCC(梅尔倒谱系数)的算法思路

读取波形文件

汉明窗

分帧

傅里叶变换

离散余弦变换(DCT)

取得特征数据

Python示例代码

import numpy, numpy.fft

def mel(f):
    """Convert a frequency in Hz to the mel scale.

    Computes ``2595 * log10(1 + f / 700)``.  ``f`` may be a scalar or a
    numpy array; the conversion is applied element-wise.
    """
    return 2595. * numpy.log10(1. + f / 700.)

def melinv(m):
    """Convert a mel-scale value back to a frequency in Hz.

    Computes ``700 * (10 ** (m / 2595) - 1)``, the inverse of the
    ``2595 * log10(1 + f / 700)`` mapping.  ``m`` may be a scalar or a
    numpy array; the conversion is applied element-wise.
    """
    return 700. * (numpy.power(10., m / 2595.) - 1.)

class MFCC(object):
    """Mel-frequency cepstral coefficient (MFCC) feature extractor.

    The constructor precomputes a Hamming window, a bank of triangular
    mel-spaced filters and two cosine-transform matrices.  Signals are then
    cut into overlapping frames, pre-emphasised, windowed, transformed with
    a real FFT, passed through the filter bank and converted to log
    energies (``sig2logspec``) or cepstra (``sig2s2mfc``).
    """

    def __init__(self, nfilt=40, ncep=13,
                 lowerf=133.3333, upperf=6855.4976, alpha=0.97,
                 samprate=16000, frate=100, wlen=0.0256,
                 nfft=512):
        """Precompute the window, the mel filter bank and the DCT matrices.

        Args:
            nfilt: number of mel filters.
            ncep: number of cepstral coefficients per frame.
            lowerf: lower edge of the filter bank, in Hz.
            upperf: upper edge of the filter bank, in Hz.
            alpha: pre-emphasis coefficient.
            samprate: sampling rate, in Hz.
            frate: frame rate, in frames per second.
            wlen: analysis window length, in seconds.
            nfft: FFT size, in points.

        Raises:
            ValueError: if ``upperf`` is above the Nyquist frequency
                (``samprate / 2``).
        """
        if upperf > samprate / 2.0:
            raise ValueError(
                "Upper frequency %f exceeds Nyquist %f" % (upperf, samprate/2))

        self.lowerf = lowerf
        self.upperf = upperf
        self.nfft = nfft
        self.ncep = ncep
        self.nfilt = nfilt
        self.frate = frate
        # Frame shift in samples (may be fractional).
        self.fshift = float(samprate) / frate

        # Build the Hamming window.
        self.wlen = int(wlen * samprate)
        self.win = numpy.hamming(self.wlen)

        # Prior sample for pre-emphasis.
        self.prior = 0
        self.alpha = alpha

        # Build the mel filter matrix: one column per filter, one row per
        # bin of the real FFT.  Floor division keeps the shape an integer.
        self.filters = numpy.zeros((nfft // 2 + 1, nfilt), 'd')
        dfreq = float(samprate) / nfft
        melmax = mel(upperf)
        melmin = mel(lowerf)
        dmelbw = (melmax - melmin) / (nfilt + 1)
        # Filter edges, in Hz (equally spaced on the mel scale).
        filt_edge = melinv(melmin + dmelbw * numpy.arange(nfilt + 2, dtype='d'))

        for whichfilt in range(nfilt):
            # Filter triangle corners, in DFT points.  int() guarantees the
            # values are usable as array indices.
            leftfr = int(round(filt_edge[whichfilt] / dfreq))
            centerfr = int(round(filt_edge[whichfilt + 1] / dfreq))
            rightfr = int(round(filt_edge[whichfilt + 2] / dfreq))
            # The width is computed in Hz.
            # NOTE: a zero-width filter (leftfr == rightfr) is not handled
            # and makes this division fail.
            fwidth = (rightfr - leftfr) * dfreq
            height = 2. / fwidth

            # Rising edge and peak.  When the centre coincides with the
            # left corner there is no rising edge and no peak bin is set.
            if centerfr > leftfr:
                leftslope = height / (centerfr - leftfr)
                for freq in range(leftfr + 1, centerfr):
                    self.filters[freq, whichfilt] = (freq - leftfr) * leftslope
                self.filters[centerfr, whichfilt] = height

            # Falling edge: bins strictly between the centre and the right
            # corner.
            if centerfr != rightfr:
                rightslope = height / (centerfr - rightfr)
                for freq in range(max(centerfr, leftfr) + 1, rightfr):
                    self.filters[freq, whichfilt] = (freq - rightfr) * rightslope

        # Build DCT matrices.
        self.s2dct = s2dctmat(nfilt, ncep, 1./nfilt)
        self.dct = dctmat(nfilt, ncep, numpy.pi/nfilt)

    def _num_frames(self, sig):
        """Return the number of frames that ``sig`` is split into."""
        return int(len(sig) / self.fshift + 1)

    def _get_frame(self, sig, fr):
        """Return frame number ``fr`` of ``sig``, zero-padded to ``wlen``."""
        start = int(round(fr * self.fshift))
        end = min(len(sig), start + self.wlen)
        frame = sig[start:end]
        if len(frame) < self.wlen:
            # numpy.resize would fill the tail with repeated samples, so pad
            # explicitly with zeros instead.
            padded = numpy.zeros(self.wlen, 'd')
            padded[:len(frame)] = frame
            frame = padded
        return frame

    def sig2s2mfc(self, sig):
        """Compute cepstra for every frame of ``sig``.

        Returns an array of shape ``(number of frames, ncep)``.
        """
        nfr = self._num_frames(sig)
        mfcc = numpy.zeros((nfr, self.ncep), 'd')
        for fr in range(nfr):
            mfcc[fr] = self.frame2s2mfc(self._get_frame(sig, fr))
        return mfcc

    def sig2logspec(self, sig):
        """Compute log mel-filter energies for every frame of ``sig``.

        Returns an array of shape ``(number of frames, nfilt)``.
        """
        nfr = self._num_frames(sig)
        logspec = numpy.zeros((nfr, self.nfilt), 'd')
        for fr in range(nfr):
            logspec[fr] = self.frame2logspec(self._get_frame(sig, fr))
        return logspec

    def pre_emphasis(self, frame):
        """Apply the pre-emphasis filter ``y[i] = x[i] - alpha * x[i-1]``.

        The sample preceding ``frame[0]`` is taken from ``self.prior``,
        which is then updated to the last sample of ``frame``.
        """
        frame = numpy.asarray(frame, 'd')
        outfr = numpy.empty(len(frame), 'd')
        outfr[0] = frame[0] - self.alpha * self.prior
        outfr[1:] = frame[1:] - self.alpha * frame[:-1]
        self.prior = frame[-1]
        return outfr

    def frame2logspec(self, frame):
        """Return the log mel-filter energies of a single frame.

        The frame is pre-emphasised, multiplied by the Hamming window and
        transformed with an ``nfft``-point real FFT.  Filter outputs are
        clipped below at 1e-5 before the logarithm is taken.
        """
        frame = self.pre_emphasis(frame) * self.win
        fft = numpy.fft.rfft(frame, self.nfft)
        # Square of absolute value.
        power = fft.real * fft.real + fft.imag * fft.imag
        return numpy.log(numpy.dot(power, self.filters).clip(1e-5, numpy.inf))

    def frame2s2mfc(self, frame):
        """Return the cepstra of a single frame using the ``s2dct`` matrix."""
        logspec = self.frame2logspec(frame)
        return numpy.dot(logspec, self.s2dct.T) / self.nfilt

def s2dctmat(nfilt, ncep, freqstep):
    """Return the 'legacy' not-quite-DCT matrix used by Sphinx.

    The result has shape ``(ncep, nfilt)``; entry ``[i, j]`` is
    ``cos(pi * i / nfilt * (j + 0.5))``, and the first column is then
    halved.  ``freqstep`` is accepted for interface compatibility but is
    not used.
    """
    # Filter positions 0.5, 1.5, ..., nfilt - 0.5.
    positions = numpy.arange(0.5, float(nfilt) + 0.5, 1.0, 'double')
    # One angular step per cepstral coefficient.
    freqs = numpy.pi * numpy.arange(ncep, dtype='double') / nfilt
    melcos = numpy.cos(numpy.outer(freqs, positions))
    melcos[:, 0] = melcos[:, 0] * 0.5
    return melcos

def logspec2s2mfc(logspec, ncep=13):
    """Convert log-power-spectrum bins to MFCC using the 'legacy'
    Sphinx transform.

    ``logspec`` holds one frame per row; the number of filters is taken
    from its last axis, so a single 1-D frame is accepted as well.  The
    result has ``ncep`` values per frame.
    """
    nfilt = numpy.shape(logspec)[-1]
    melcos = s2dctmat(nfilt, ncep, 1./nfilt)
    return numpy.dot(logspec, melcos.T) / nfilt

def dctmat(N, K, freqstep, orthogonalize=True):
    """Return the orthogonal DCT-II/DCT-III matrix of size NxK.

    For computing or inverting MFCCs, N is the number of
    log-power-spectrum bins while K is the number of cepstra.

    Entry ``[n, k]`` is ``cos(freqstep * (n + 0.5) * k)``.  When
    ``orthogonalize`` is true the ``k = 0`` column is scaled by
    ``1 / sqrt(2)``.
    """
    angles = numpy.outer(freqstep * (numpy.arange(N, dtype='double') + 0.5),
                         numpy.arange(K, dtype='double'))
    cosmat = numpy.cos(angles)
    if orthogonalize:
        cosmat[:, 0] = cosmat[:, 0] / numpy.sqrt(2)
    return cosmat

def dct(input, K=13):
    """Convert log-power-spectrum to MFCC using the orthogonal DCT-II.

    ``input`` holds one frame per row; the number of bins N is taken from
    its last axis, so a single 1-D frame is accepted as well.  ``K`` is
    the number of cepstra to produce.  The result is scaled by
    ``sqrt(2 / N)``.
    """
    N = numpy.shape(input)[-1]
    freqstep = numpy.pi / N
    cosmat = dctmat(N, K, freqstep)
    return numpy.dot(input, cosmat) * numpy.sqrt(2.0 / N)

def dct2(input, K=13):
    """Convert log-power-spectrum to MFCC using the normalized DCT-II.

    ``input`` holds one frame per row; the number of bins N is taken from
    its last axis, so a single 1-D frame is accepted as well.  ``K`` is
    the number of cepstra to produce.  The cosine matrix is built without
    the ``1 / sqrt(2)`` scaling of its first column and the result is
    scaled by ``2 / N``.
    """
    N = numpy.shape(input)[-1]
    freqstep = numpy.pi / N
    cosmat = dctmat(N, K, freqstep, False)
    return numpy.dot(input, cosmat) * (2.0 / N)

def idct(input, K=40):
    """Convert MFCC to log-power-spectrum using the orthogonal DCT-III.

    ``input`` holds one frame of cepstra per row; the number of cepstra N
    is taken from its last axis, so a single 1-D frame is accepted as
    well.  ``K`` is the number of log-power-spectrum bins to produce.  The
    result is scaled by ``sqrt(2 / K)``.
    """
    N = numpy.shape(input)[-1]
    freqstep = numpy.pi / K
    # Transpose of the K x N forward matrix maps N cepstra to K bins.
    cosmat = dctmat(K, N, freqstep).T
    return numpy.dot(input, cosmat) * numpy.sqrt(2.0 / K)

def dct3(input, K=40):
    """Convert MFCC to log-power-spectrum using the unnormalized DCT-III.

    ``input`` holds one frame of cepstra per row; the number of cepstra N
    is taken from its last axis, so a single 1-D frame is accepted as
    well.  ``K`` is the number of log-power-spectrum bins to produce.  The
    first column of the cosine matrix (the zeroth cepstrum) is weighted
    by 0.5.
    """
    N = numpy.shape(input)[-1]
    freqstep = numpy.pi / K
    cosmat = dctmat(K, N, freqstep, False)
    cosmat[:, 0] = cosmat[:, 0] * 0.5
    return numpy.dot(input, cosmat.T)

copy-feats --compress=true --write-num-frames=ark,t:exp/features/mfcc/data_mfcc_23_pitch_seg/log/utt2num_frames.1 ark:- ark,scp:/work/VPR/subtools_1229/exp/features/mfcc/data_mfcc_23_pitch_seg/raw_mfcc_pitch_seg.1.ark,/work/VPR/subtools_1229/exp/features/mfcc/data_mfcc_23_pitch_seg/raw_mfcc_pitch_seg.1.scp paste-feats --length-tolerance=2 'ark:compute-mfcc-feats --write-utt2dur=ark,t:exp/features/mfcc/data_mfcc_23_pitch_seg/log/utt2dur.1 --verbose=2 --config=subtools/conf/sre-mfcc-23.conf scp,p:exp/features/mfcc/data_mfcc_23_pitch_seg/log/wav_seg.1.scp ark:- |' 'ark,s,cs:compute-kaldi-pitch-feats --verbose=2 --config=subtools/conf/pitch.conf scp,p:exp/features/mfcc/data_mfcc_23_pitch_seg/log/wav_seg.1.scp ark:- | process-kaldi-pitch-feats ark:- ark:- |' ark:- compute-mfcc-feats --write-utt2dur=ark,t:exp/features/mfcc/data_mfcc_23_pitch_seg/log/utt2dur.1 --verbose=2 --config=subtools/conf/sre-mfcc-23.conf scp,p:exp/features/mfcc/data_mfcc_23_pitch_seg/log/wav_seg.1.scp ark:- VLOG[2] (compute-mfcc-feats[5.5]:main():compute-mfcc-feats.cc:182) Processed features for key 001_20230623160347_0319007398_mentianyu-1 compute-kaldi-pitch-feats --verbose=2 --config=subtools/conf/pitch.conf scp,p:exp/features/mfcc/data_mfcc_23_pitch_seg/log/wav_seg.1.scp ark:- ERROR (compute-kaldi-pitch-feats[5.5]:main():compute-kaldi-pitch-feats.cc:88) Sample frequency mismatch: you specified 16000 but data has 8000 (use --sample-frequency option). Utterance is 001_20230623160347_0319007398_mentianyu-1
最新发布
07-15
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值