语音特征提取—MFCC(实战篇)
本文主要针对特征提取中的核心代码进行说明,如果有不懂或者想进一步了解的大佬可以私聊作者。
本文为语音特征提取—MFCC的实战部分,参考哥伦比亚大学语音识别代码实现,其具体流程如下:
(1)分帧与加窗
// Splits a one-column sample vector into overlapping frames and applies a
// window function to every frame.
//
// Parameters read from params_ (default in parentheses):
//   window.sample_rate    (20000.0)  input sample rate
//   window.frames_per_sec (100.0)    output frame rate
//   window.window_size    (0.025)    window width, in seconds
//   window.hamming        (true)     Hamming window if true, else rectangular
//
// in_feats:  (sample count) x 1 matrix of input samples.
// out_feats: resized to (frame count) x (samples per window); row r holds the
//            windowed samples starting at input index r * samp_shift.
//
// Throws runtime_error if in_feats does not have exactly one column, or if
// the configuration does not yield a positive window length and frame shift.
// An input shorter than one window produces zero output frames.
void FrontEnd::do_window(const matrix<double>& in_feats,
                         matrix<double>& out_feats) const {
  const double sample_rate =
      get_float_param(params_, "window.sample_rate", 20000.0);
  const double frames_per_sec =
      get_float_param(params_, "window.frames_per_sec", 100.0);
  const double window_width =
      get_float_param(params_, "window.window_size", 0.025);
  const bool do_Hamming = get_bool_param(params_, "window.hamming", true);

  const int in_samp_cnt = in_feats.size1();
  if (in_feats.size2() != 1)
    throw runtime_error("Windowing expected vector input.");

  // Written as !(x > 0) so that NaN is rejected as well; otherwise the
  // double -> int conversions below would be undefined.
  if (!(sample_rate > 0) || !(frames_per_sec > 0) || !(window_width > 0))
    throw runtime_error(
        "Windowing requires a positive window length and frame shift.");

  // Input sampling period, in seconds (time covered by one sample).
  const double sample_period = 1.0 / sample_rate;
  // Output frame period, in seconds.
  const double frame_period = 1.0 / frames_per_sec;
  // Number of samples per window (rounded to nearest).
  const int samp_per_window =
      static_cast<int>(window_width / sample_period + 0.5);
  // Number of samples to shift between consecutive windows (rounded).
  const int samp_shift = static_cast<int>(frame_period / sample_period + 0.5);
  // A zero shift would divide by zero below; a zero-length window is useless.
  if (samp_per_window <= 0 || samp_shift <= 0)
    throw runtime_error(
        "Windowing requires a positive window length and frame shift.");

  // Number of complete frames. The explicit check matters: integer division
  // truncates toward zero, so a slightly-too-short input would otherwise
  // yield one frame and read past the end of in_feats.
  const int out_frame_cnt =
      in_samp_cnt < samp_per_window
          ? 0
          : (in_samp_cnt - samp_per_window) / samp_shift + 1;

  out_feats.resize(out_frame_cnt, samp_per_window);
  out_feats.clear();

  // Window coefficients are identical for every frame, so compute them once.
  // All ones is the rectangular window; multiplying by 1.0 leaves a sample
  // unchanged.
  std::vector<double> window(samp_per_window, 1.0);
  if (do_Hamming && samp_per_window > 1) {
    // A one-sample window keeps the coefficient 1.0; the formula would
    // evaluate 0/0 there.
    for (int c = 0; c < samp_per_window; ++c) {
      window[c] = 0.54 - 0.46 * cos(2 * M_PI * c / (samp_per_window - 1));
    }
  }

  for (int r = 0; r < out_frame_cnt; ++r) {
    const int first_sample = r * samp_shift;
    for (int c = 0; c < samp_per_window; ++c) {
      out_feats(r, c) = window[c] * in_feats(first_sample + c, 0);
    }
  }
}
(2)傅里叶变换
void FrontEnd::do_fft(const matrix<double>& in_feats,
matrix<double>& out_feats) const {
int in_frame_cnt = in_feats.size1();
int in_dim_cnt = in_feats.size2();
int out_dim_cnt = 2;
//为了可以进行傅里叶变换,需要将输出变为2的幂指数倍数;
while (out_dim_cnt < in_dim_cnt) out_dim_cnt *= 2;
out_feats.resize(in_frame_cnt, out_dim_cnt);
out_feats.clear();
vector<double> fft_buf;
for (int frm_idx = 0; frm_idx < in_frame_cnt; ++frm_idx) {
//将matrix容器转换为vector容器,按行处理数据;
copy_matrix_row_to_vector(in_feats, frm_idx, fft_buf);
fft_buf.resize(out_dim_cnt, 0.0);
//real_fft对vector容器做傅里叶变换;
real_fft(fft_buf);
copy_vector_to_matrix_row(fft_buf, out_feats, frm_idx);
}
/*cout << out_feats << endl;*/
}
(3)Mel滤波
// Applies a bank of triangular Mel-scale filters to FFT output.
//
// Parameters read from params_ (default in parentheses):
//   melbin.bins        (26)       number of Mel filters (output columns)
//   melbin.log         (true)     store log(sum) instead of sum
//   window.sample_rate (20000.0)  sample rate used to map FFT bins to Hz
//
// in_feats:  one row per frame; columns 2*i and 2*i+1 are read as the real
//            and imaginary part of FFT bin i, for i in [0, N/2), where N is
//            the column count.
// out_feats: resized to (frame count) x (melbin.bins); entry (r, m-1) is the
//            sum over bins of |X(i)| * H_m(i), optionally passed through log.
//
// NOTE(review): with melbin.log enabled, a filter whose weighted sum is 0
// yields log(0) = -inf; this matches the previous behaviour.
void FrontEnd::do_melbin(const matrix<double>& in_feats,
                         matrix<double>& out_feats) const {
  const int num_bins = get_int_param(params_, "melbin.bins", 26);
  const bool do_log = get_bool_param(params_, "melbin.log", true);
  const double sample_rate =
      get_float_param(params_, "window.sample_rate", 20000.0);
  const double sample_period = 1.0 / sample_rate;
  const int in_frame_cnt = in_feats.size1();
  const int in_dim_cnt = in_feats.size2();
  const int out_dim_cnt = num_bins;
  out_feats.resize(in_frame_cnt, out_dim_cnt);
  out_feats.clear();

  const int N = in_dim_cnt;
  const int M = out_dim_cnt;
  // T is the duration of one sample, so N * T is the duration of one frame.
  const double T = sample_period;
  const int half = N / 2;

  // Mel value of the highest frequency considered, 1 / (2 * T).
  const double Mel_f_max = 1127 * log(1 + 1 / (700 * 2 * T));

  // The Mel value of FFT bin i depends only on i, so compute it once instead
  // of once per frame and per filter.
  std::vector<double> bin_mel(half > 0 ? half : 0);
  for (int i = 0; i < half; ++i) {
    const double f = i / (N * T);
    bin_mel[i] = 1127 * log(1 + f / 700);  // Mel(f)
  }

  // Per-frame magnitude spectrum, shared by all filters of that frame.
  std::vector<double> magnitude(bin_mel.size());

  for (int r = 0; r < in_frame_cnt; ++r) {
    for (int i = 0; i < half; ++i) {
      const double real = in_feats(r, 2 * i);
      const double img = in_feats(r, 2 * i + 1);
      magnitude[i] = sqrt(real * real + img * img);
    }

    for (int m = 1; m <= M; ++m) {
      // Centre and neighbouring edges of filter m; the lowest edge is Mel 0.
      const double Mel_f_m = m * Mel_f_max / (M + 1);
      const double Mel_f_mp = (m - 1) * Mel_f_max / (M + 1);  // previous
      const double Mel_f_mn = (m + 1) * Mel_f_max / (M + 1);  // next

      double sum = 0;
      for (int i = 0; i < half; ++i) {
        const double Mel_f = bin_mel[i];
        double H = 0;
        if (Mel_f < Mel_f_mp || Mel_f > Mel_f_mn) {
          H = 0;
        }
        else if (Mel_f_mp <= Mel_f && Mel_f <= Mel_f_m) {
          // Rising edge of the triangle.
          H = (Mel_f - Mel_f_mp) / (Mel_f_m - Mel_f_mp);
        }
        else if (Mel_f_m <= Mel_f && Mel_f <= Mel_f_mn) {
          // Falling edge of the triangle.
          H = (Mel_f - Mel_f_mn) / (Mel_f_m - Mel_f_mn);
        }
        else {
          // Only reachable when a comparison involves NaN. H stays 0 so the
          // accumulation below never reads an indeterminate value.
          std::cout << "Invalid Mel(f) value!!" << std::endl;
        }
        sum += magnitude[i] * H;
      }

      if (do_log) {
        out_feats(r, m - 1) = log(sum);
      }
      else {
        out_feats(r, m - 1) = sum;
      }
    }
  }
}
(4)DCT变换
// Applies a discrete cosine transform to every row of in_feats.
//
// Reads dct.coeffs (default 12) from params_ as the number of output
// coefficients. out_feats is resized to (frame count) x (dct.coeffs) and
//   out(r, j) = sqrt(2 / N) * sum_{i=0}^{N-1} in(r, i) * cos(pi * (j+1) * (i+0.5) / N)
// where N is the number of input columns. The index starts at j + 1, so the
// order-zero term is not produced.
void FrontEnd::do_dct(const matrix<double>& in_feats,
                      matrix<double>& out_feats) const {
  const int coeff_cnt = get_int_param(params_, "dct.coeffs", 12);
  const int frame_cnt = in_feats.size1();
  const int input_len = in_feats.size2();

  out_feats.resize(frame_cnt, coeff_cnt);
  out_feats.clear();

  for (int frame = 0; frame < frame_cnt; ++frame) {
    for (int k = 0; k < coeff_cnt; ++k) {
      double acc = 0;
      for (int n = 0; n < input_len; ++n) {
        const double basis = cos(M_PI * (k + 1) * (n + 0.5) / input_len);
        acc += in_feats(frame, n) * basis;
      }
      // Normalisation factor applied to every coefficient.
      acc *= sqrt(2.0 / input_len);
      out_feats(frame, k) = acc;
    }
  }
}