本文主要参考以下两篇博客,并进行少量二次加工
Speech Processing for Machine Learning: Filter banks, Mel-Frequency Cepstral Coefficients (MFCCs) and What’s In-Between
CSDN
声音信号
声音信号是麦克风收集到的气压变化数据,是一种时间序列数据。
import numpy
import numpy as np
import scipy.io.wavfile
from scipy.fftpack import dct
import librosa
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
# Load the waveform (a 1-D array of air-pressure samples) and its native sampling rate.
signal, sample_rate = librosa.load('test.wav', sr=None)
# Keep only the excerpt between 0.5 s and 3.5 s.
signal = signal[int(0.5 * sample_rate):int(3.5 * sample_rate)]
# Quick visual check of the raw waveform.
plt.figure(figsize=(10.5, 3), dpi=300)
plt.plot(signal, c="blue")
plt.show()
预加重处理
# Pre-emphasis: first-order high-pass filter y[t] = x[t] - a * x[t-1],
# which boosts the high-frequency content before framing.
pre_emphasis = 0.98
emphasized_signal = numpy.concatenate(
    ([signal[0]], signal[1:] - pre_emphasis * signal[:-1]))
# Visualise the pre-emphasised waveform.
plt.figure(figsize=(10.5, 3), dpi=300)
plt.plot(emphasized_signal, c="blue")
plt.show()
分帧
frame_size = 0.025   # frame length in seconds (25 ms per frame)
frame_stride = 0.01  # hop between frame starts (10 ms, i.e. 15 ms of overlap)
# Convert the time-based sizes into sample counts.
frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate
signal_length = len(emphasized_signal)
frame_length = int(round(frame_length))
frame_step = int(round(frame_step))
# Number of frames needed to cover the whole signal.  The previous formula,
# ceil(abs(signal_length - frame_length) / frame_step), returned 0 frames when
# the signal was exactly one frame long (despite the "at least 1 frame" intent)
# and could silently drop a trailing run of samples; the formula below always
# yields >= 1 frame and never discards data.
if signal_length <= frame_length:
    num_frames = 1
else:
    num_frames = 1 + int(numpy.ceil((signal_length - frame_length) / frame_step))
# Zero-pad the tail so the last frame is complete.
pad_signal_length = (num_frames - 1) * frame_step + frame_length
z = numpy.zeros((pad_signal_length - signal_length))
pad_signal = numpy.append(emphasized_signal, z)
# Build a (num_frames, frame_length) index matrix: row i holds the sample
# indices of frame i, i.e. i * frame_step + [0, frame_length).
indices = numpy.tile(numpy.arange(0, frame_length), (num_frames, 1)) + numpy.tile(numpy.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
frames_ = pad_signal[indices.astype(numpy.int32, copy=False)]  # slice the 1-D signal into a 2-D frame matrix
使用hamming窗函数进行预处理
# Compare the two most common tapering windows before applying one.
hamming_win = np.hamming(frame_length)
hanning_win = np.hanning(frame_length)
plt.figure(dpi=300)
plt.plot(hamming_win, c="blue", label="hamming")
plt.plot(hanning_win, c="red", label="hanning")
plt.legend()
plt.show()
# Taper every frame with a Hamming window to reduce spectral leakage.
frames = frames_ * numpy.hamming(frame_length)
进行FFT,获取频域信息
NFFT = 1200  # FFT size; rfft returns NFFT//2 + 1 = 601 frequency bins
# Magnitude spectrum of every windowed frame.
mag_frames = numpy.absolute(numpy.fft.rfft(frames, NFFT))
# Periodogram estimate of the power spectrum.
pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))
# --- Plot the power spectrum in dB ---
plt.figure(dpi=300)
# Guard against log10(0) -> -inf for silent bins, using the same eps
# substitution this file applies to the filter-bank energies further down.
safe_pow = np.where(pow_frames == 0, np.finfo(float).eps, pow_frames)
# NOTE(review): 20*log10 is the amplitude-dB convention; for a power quantity
# 10*log10 is the usual one.  Kept at 20 to match the rest of this walkthrough.
plt.imshow(20 * np.log10(safe_pow), aspect="auto")
# frequency[Hz] = bin index * sample_rate / NFFT
plt.xticks([0, 100, 200, 300, 400, 500, 600],
           np.array([0, 100, 200, 300, 400, 500, 600]) * sample_rate / NFFT)
plt.xlabel("Frequence(Hz)")
plt.ylabel("Frames index")
plt.title("Power Spectrum (dB)")
注意:这里需要考虑香农采样定理,频率上限为采样频率的一半
梅尔滤波器组
梅尔滤波器组是由一组等高三角形带通滤波器组成,具体原理可参考文首列出的博客。
mel_N = 40  # number of triangular filters; raise NFFT as well if this grows
# Nyquist: the highest representable frequency is sample_rate / 2.
mel_low, mel_high = 0, (2595 * np.log10(1 + (sample_rate / 2) / 700))  # mel-scale limits
mel_freq = np.linspace(mel_low, mel_high, mel_N + 2)  # mel_N filters need mel_N + 2 edge points
hz_freq = (700 * (10**(mel_freq / 2595) - 1))  # convert the mel-spaced points back to Hz
bins = np.floor((NFFT) * hz_freq / sample_rate)  # map each edge frequency onto an FFT bin index
# Build the bank: row m holds the m-th triangular filter over the rfft bins.
fbank = np.zeros((mel_N, int(NFFT / 2 + 1)))
for m in range(1, mel_N + 1):
    left, center, right = int(bins[m - 1]), int(bins[m]), int(bins[m + 1])
    # Rising slope from the left edge up to the peak ...
    for k in range(left, center):
        fbank[m - 1, k] = (k - bins[m - 1]) / (bins[m] - bins[m - 1])
    # ... and falling slope from the peak down to the right edge.
    for k in range(center, right):
        fbank[m - 1, k] = (bins[m + 1] - k) / (bins[m + 1] - bins[m])
# Apply the filter bank to the power spectrum: (frames, bins) @ (bins, mel_N).
filter_banks = np.matmul(pow_frames, fbank.T)
# Replace exact zeros with the smallest positive float so log10 never sees 0.
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
filter_banks = 20 * np.log10(filter_banks)  # dB
#filter_banks -= np.mean(filter_banks,axis=1).reshape(-1,1)
plt.figure(dpi=300, figsize=[10, 3])
plt.imshow(filter_banks, cmap=plt.cm.jet, aspect='auto')
x_ticks = np.array([0, 8, 16, 24, 32, 39])
# Label each tick with its filter index and that filter's centre frequency.
# (The former `x_ticks_fre = hz_freq[x_ticks + 1]` line was dead code — its
# value was immediately overwritten by the list below — so it was removed.)
x_ticks_fre = ["Mel_" + str(i + 1) + " (" + str(int(hz_freq[i + 1])) + "Hz)" for i in x_ticks]
plt.xticks(x_ticks, x_ticks_fre)
plt.xlabel("Mel-Fileters (Mel-scale Frequence)")  # mel-scale frequency axis
plt.ylabel("Frames index")
plt.title("MelSpectogram (dB)")
原始声音信号到这里已经处理为梅尔语谱图,一般基于机器学习的语音算法以语谱图为输入数据。
在传统语音识别GMM、HMM等方法中,需要对数据进行进一步的处理,涉及梅尔倒谱分析等,有需要再更新。