语音数据是由录音设备采集的文件,文件后缀一般为.wav格式,在进行模型训练的时候,需要将录音数据转变成多帧的数值数据。语音数据属于信号处理的范畴,步骤确实比较繁琐,但也比较固定和单一。一般的流程如下:
1.读取音频数据
python使用soundfile读取wav文件
def read_data(self, file_path, name):
    """Read an audio file with soundfile and return its samples.

    :param file_path: directory containing the audio file
    :param name: file name of the audio file
    :return: (audio samples, sample rate) as returned by ``sf.read``
    """
    full_path = os.path.join(file_path, name)
    samples, sample_rate = sf.read(full_path)
    return samples, sample_rate
2.分帧
需要将分帧的下标记录下来
def framing(self, frame_len_s, frame_shift_s, fs, sig):
    """Split a 1-D signal into frames by computing the per-frame indices.

    The signal tail is zero-padded so the last frame is full length.

    :param frame_len_s: frame length in seconds
    :param frame_shift_s: frame shift (hop) in seconds
    :param fs: sampling rate in Hz
    :param sig: 1-D signal array
    :return: 2-D array, one row per frame
    """
    sig_n = len(sig)
    frame_len_n = int(round(fs * frame_len_s))
    frame_shift_n = int(round(fs * frame_shift_s))
    # Number of frames needed to cover the whole signal.
    num_frame = int(np.ceil(float(sig_n - frame_len_n) / frame_shift_n) + 1)
    # Zero-pad the tail so the final frame is complete.
    pad_num = frame_shift_n * (num_frame - 1) + frame_len_n - sig_n
    pad_sig = np.append(sig, np.zeros(int(pad_num)))
    # Sample indices inside a single frame: 0 .. frame_len_n-1.
    frame_inner_index = np.arange(0, frame_len_n)
    # Starting sample index of each frame.
    frame_index = np.arange(0, num_frame) * frame_shift_n
    # Broadcast to a (num_frame, frame_len_n) index matrix:
    # row i holds the absolute sample indices of frame i.
    frame_inner_index_extend = np.tile(frame_inner_index, (num_frame, 1))
    frame_index_extend = np.expand_dims(frame_index, 1)
    each_frame_index = frame_inner_index_extend + frame_index_extend
    # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int gives the same platform-default integer dtype.
    each_frame_index = each_frame_index.astype(int, copy=False)
    frame_sig = pad_sig[each_frame_index]
    return frame_sig
3.短时傅里叶变换
对分帧后的每帧数据做短时傅里叶变换
def stft(self, frame_sig, nfft=512):
    """N-point FFT of each frame (short-time Fourier transform).

    N is typically 256 or 512.

    :param frame_sig: 2-D framed signal, one frame per row
    :param nfft: FFT size
    :return: power spectrum of each frame
    """
    spectrum = np.fft.rfft(frame_sig, nfft)
    # Magnitude spectrum.
    magnitude = np.abs(spectrum)
    # Power spectrum.
    return magnitude ** 2 * 1.0 / nfft
4.mel滤波
def mel_filter(self, frame_pow, fs, n_filter, nfft):
    """Apply a triangular mel filterbank to the power spectrum.

    :param frame_pow: power spectrum, shape (num_frame, nfft/2 + 1)
    :param fs: sampling rate in Hz
    :param n_filter: number of mel filters
    :param nfft: FFT size used to produce frame_pow
    :return: log filterbank features, shape (num_frame, n_filter)
    """
    mel_min = 0
    mel_max = 2595 * np.log10(1 + fs / 2.0 / 700)
    # n_filter + 2 points evenly spaced on the mel scale between mel_min and mel_max.
    mel_points = np.linspace(mel_min, mel_max, n_filter + 2)
    # Convert the mel points back to Hz; spacing grows exponentially with frequency.
    hz_points = 700 * (10 ** (mel_points / 2595.0) - 1)
    # Map the Hz points onto FFT bin indices.
    filter_edge = np.floor(hz_points * (nfft + 1) / fs)
    # Build the triangular filter coefficients.
    fbank = np.zeros((n_filter, int(nfft / 2 + 1)))
    for m in range(1, n_filter + 1):
        left = int(filter_edge[m - 1])
        center = int(filter_edge[m])
        right = int(filter_edge[m + 1])
        # Rising edge of the m-th triangle.
        for k in range(left, center):
            fbank[m - 1, k] = (k - left) / (center - left)
        # Falling edge of the m-th triangle.
        for k in range(center, right):
            fbank[m - 1, k] = (right - k) / (right - center)
    # Mel filtering:
    # [num_frame, nfft/2+1] x [nfft/2+1, n_filter] = [num_frame, n_filter]
    filter_banks = np.dot(frame_pow, fbank.T)
    # Replace zeros with machine epsilon to keep the log finite.
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
    return 20 * np.log10(filter_banks)
5.dct变换
def _dct(self, filter_banks):
'''
fbank特征进行dct变换,将相关的滤波器组系数进行压缩,会丢失语音信号中原本一些高度非线性成分
:param filter_banks:
:return:
'''
num_ceps = 12
mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1:(num_ceps+1)]
return mfcc
通过上述步骤会得到语音的mel频谱数据,可以输入到神经网络中进行相应的任务。
通常取短时傅里叶变换的数据,即时域频谱,也可以进行后续的任务,下面是用librosa包直接做stft变换的代码
def _stft(self, wav, mode, spec_len):
    """Short-time Fourier transform of raw audio via librosa, returning
    a normalized magnitude spectrogram.

    :param wav: 1-D audio samples
    :param mode: 'train' crops/pads the time axis to spec_len;
        any other value keeps the full length
    :param spec_len: target number of time steps in 'train' mode
    :return: normalized magnitude spectrogram
    """
    # Extend the audio by appending its time-reversed copy.
    wav = np.append(wav, wav[::-1])
    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float gives the same float64 dtype.
    wav = wav.astype(float)
    linear_spect = librosa.stft(wav, n_fft=512, win_length=400, hop_length=160).T
    mag, _ = librosa.magphase(linear_spect)
    mag_T = mag.T
    freq, time = mag_T.shape
    if mode == 'train':
        if time > spec_len:
            # Random crop of spec_len frames along the time axis.
            randtime = np.random.randint(0, time - spec_len)
            spec_mag = mag_T[:, randtime:randtime + spec_len]
        else:
            # Zero-pad the time axis up to spec_len.
            spec_mag = np.pad(mag_T, ((0, 0), (0, spec_len - time)), 'constant')
    else:
        spec_mag = mag_T
    # Normalize along axis 0: per-column mean/std over the frequency bins.
    mu = np.mean(spec_mag, 0, keepdims=True)
    std = np.std(spec_mag, 0, keepdims=True)
    specs = (spec_mag - mu) / (std + 1e-5)
    return specs