语音识别特征—MFCC（实战篇）

最新推荐文章于 2024-08-24 18:14:26 发布

Xwei1226

最新推荐文章于 2024-08-24 18:14:26 发布

阅读量2k

点赞数 3

分类专栏：语音识别-深度学习　文章标签：语音识别、c++

本文链接：https://blog.csdn.net/Xwei1226/article/details/105790954

版权

语音识别-深度学习专栏收录该内容

64 篇文章 37 订阅

订阅专栏

语音特征提取—MFCC(实战篇)

本文主要针对特征提取中的核心代码进行说明，如果有不懂或者想进一步了解的大佬可以私聊作者。

本文为语音特征提取—MFCC实战部分，参考哥伦比亚大学语音识别代码进行提取，其具体流程如下：

（1）分帧与加窗

// Cuts a stream of samples into overlapping frames and optionally applies a
// Hamming window to each frame.
//
// in_feats:  single-column matrix; row i holds sample i.
// out_feats: resized to (frame count) x (samples per window). Row r holds the
//            samples starting at offset r * samp_shift, each multiplied by the
//            window weight of its position within the frame.
//
// Settings read from params_ (default in parentheses):
//   window.sample_rate    (20000.0)  samples per second
//   window.frames_per_sec (100.0)    output frames per second
//   window.window_size    (0.025)    window length in seconds
//   window.hamming        (true)     Hamming window if true, rectangular if not
//
// Throws runtime_error if the input has more than one column or if the
// settings do not yield a positive window length and frame shift.
// An input shorter than one window produces zero output frames.
void FrontEnd::do_window(const matrix<double>& in_feats,
    matrix<double>& out_feats) const {
    const double sample_rate = get_float_param(params_, "window.sample_rate", 20000.0);
    const double frames_per_sec =
        get_float_param(params_, "window.frames_per_sec", 100.0);
    const double window_width = get_float_param(params_, "window.window_size", 0.025);
    const bool do_Hamming = get_bool_param(params_, "window.hamming", true);
    const int in_samp_cnt = in_feats.size1();
    if (in_feats.size2() != 1)
        throw runtime_error("Windowing expected vector input.");
    // Reject non-positive (or NaN) settings before they turn into divisions by
    // zero or out-of-range float-to-int conversions below.
    if (!(sample_rate > 0.0) || !(frames_per_sec > 0.0) || !(window_width > 0.0))
        throw runtime_error("Windowing parameters must be positive.");

    // Input sampling period in seconds: the time covered by one sample.
    const double sample_period = 1.0 / sample_rate;
    // Output frame period in seconds.
    const double frame_period = 1.0 / frames_per_sec;
    // Number of samples in one window (rounded to nearest).
    const int samp_per_window =
        static_cast<int>(window_width / sample_period + 0.5);
    // Number of samples to advance between consecutive windows (rounded).
    const int samp_shift = static_cast<int>(frame_period / sample_period + 0.5);
    if (samp_per_window <= 0 || samp_shift <= 0)
        throw runtime_error(
            "Windowing parameters yield an empty window or zero frame shift.");

    // Number of complete windows that fit in the input. Integer division
    // truncates toward zero, so a too-short input must be handled explicitly;
    // otherwise it could yield one frame that reads past the end of in_feats.
    int out_frame_cnt = 0;
    if (in_samp_cnt >= samp_per_window)
        out_frame_cnt = (in_samp_cnt - samp_per_window) / samp_shift + 1;
    out_feats.resize(out_frame_cnt, samp_per_window);
    out_feats.clear();

    // The window weights depend only on the position inside the frame, so
    // compute them once. A rectangular window is all ones; a one-sample
    // Hamming window is also left at 1 to avoid the 0/0 in the formula.
    std::vector<double> weights(samp_per_window, 1.0);
    if (do_Hamming && samp_per_window > 1) {
        for (int c = 0; c < samp_per_window; ++c) {
            weights[c] =
                0.54 - 0.46 * cos(2 * M_PI * c / (samp_per_window - 1));
        }
    }

    for (int r = 0; r < out_frame_cnt; ++r) {
        for (int c = 0; c < samp_per_window; ++c) {
            out_feats(r, c) = weights[c] * in_feats(r * samp_shift + c, 0);
        }
    }
}

（2）傅里叶变换

void FrontEnd::do_fft(const matrix<double>& in_feats,
    matrix<double>& out_feats) const {
    int in_frame_cnt = in_feats.size1();
    int in_dim_cnt = in_feats.size2();
    int out_dim_cnt = 2;
    //为了可以进行傅里叶变换，需要将输出变为2的幂指数倍数；
    while (out_dim_cnt < in_dim_cnt) out_dim_cnt *= 2;
    out_feats.resize(in_frame_cnt, out_dim_cnt);
    out_feats.clear();
    vector<double> fft_buf;
    for (int frm_idx = 0; frm_idx < in_frame_cnt; ++frm_idx) {
        //将matrix容器转换为vector容器，按行处理数据；
        copy_matrix_row_to_vector(in_feats, frm_idx, fft_buf);
        fft_buf.resize(out_dim_cnt, 0.0);
        //real_fft对vector容器做傅里叶变换；
        real_fft(fft_buf);
        copy_vector_to_matrix_row(fft_buf, out_feats, frm_idx);
    }

    /*cout << out_feats << endl;*/
}

（3）Mel滤波

// Applies a bank of triangular mel-scale filters to the FFT output of each
// frame and optionally takes the natural log of each filter output.
//
// in_feats:  one row per frame; columns 2*i and 2*i+1 are read as the real and
//            imaginary parts of FFT bin i, for i in [0, N/2), N = column count.
// out_feats: resized to (frame count) x (number of filters). Column m-1 holds
//            the output of filter m, m = 1..M.
//
// Settings read from params_ (default in parentheses):
//   melbin.bins        (26)       number of filters M
//   melbin.log         (true)     store log(sum) instead of sum
//   window.sample_rate (20000.0)  samples per second
//
// The filter centres are equally spaced on the mel scale between 0 and
// Mel(1 / (2T)), where T is the sampling period and
// Mel(f) = 1127 * ln(1 + f / 700).
//
// NOTE: no floor is applied before the log, so a filter whose weighted sum is
// exactly zero yields log(0) = -inf.
void FrontEnd::do_melbin(const matrix<double>& in_feats,
    matrix<double>& out_feats) const {
    const int num_bins = get_int_param(params_, "melbin.bins", 26);
    const bool do_log = get_bool_param(params_, "melbin.log", true);
    const double sample_rate = get_float_param(params_, "window.sample_rate", 20000.0);
    // T is the sampling period: the time covered by one sample.
    const double T = 1.0 / sample_rate;
    const int in_frame_cnt = in_feats.size1();
    const int N = in_feats.size2();
    const int M = num_bins;

    out_feats.resize(in_frame_cnt, M);
    out_feats.clear();

    const int half = N / 2;

    // Upper edge of the filter bank on the mel scale; the lower edge is 0.
    const double Mel_f_max = 1127 * log(1 + 1 / (700 * 2 * T));

    // The mel frequency of FFT bin i is the same for every frame and every
    // filter, so compute it once. N * T is the duration of one FFT frame.
    std::vector<double> mel_of_bin(half);
    for (int i = 0; i < half; ++i) {
        const double f = i / (N * T);
        mel_of_bin[i] = 1127 * log(1 + f / 700);  // Mel(f)
    }

    // Per-frame spectral magnitudes, shared by all filters of that frame.
    std::vector<double> magnitude(half);

    for (int r = 0; r < in_frame_cnt; ++r) {
        for (int i = 0; i < half; ++i) {
            const double real = in_feats(r, 2 * i);
            const double img = in_feats(r, 2 * i + 1);
            magnitude[i] = sqrt(real * real + img * img);
        }

        for (int m = 1; m <= M; ++m) {
            const double Mel_f_m = m * Mel_f_max / (M + 1);         // centre
            const double Mel_f_mp = (m - 1) * Mel_f_max / (M + 1);  // previous
            const double Mel_f_mn = (m + 1) * Mel_f_max / (M + 1);  // next

            double sum = 0;
            for (int i = 0; i < half; ++i) {
                const double Mel_f = mel_of_bin[i];
                // Filter weight; stays 0 outside the triangle and in the
                // invalid (NaN) case so it is never read uninitialized.
                double H = 0;
                if (Mel_f < Mel_f_mp || Mel_f > Mel_f_mn) {
                    H = 0;
                }
                else if (Mel_f_mp <= Mel_f && Mel_f <= Mel_f_m) {
                    // Rising edge: 0 at the previous centre, 1 at this centre.
                    H = (Mel_f - Mel_f_mp) / (Mel_f_m - Mel_f_mp);
                }
                else if (Mel_f_m <= Mel_f && Mel_f <= Mel_f_mn) {
                    // Falling edge: 1 at this centre, 0 at the next centre.
                    H = (Mel_f - Mel_f_mn) / (Mel_f_m - Mel_f_mn);
                }
                else {
                    // Only reachable when a comparison involves NaN.
                    std::cout << "Invalid Mel(f) value!!" << std::endl;
                }
                sum += magnitude[i] * H;
            }
            out_feats(r, m - 1) = do_log ? log(sum) : sum;
        }
    }
}

（4）DCT变换

// Applies a cosine transform to each row (frame) of in_feats and keeps the
// first "dct.coeffs" (default 12) coefficients.
//
// With N the number of input columns, output column j of a row is
//   sqrt(2 / N) * sum_{i=0}^{N-1} in(i) * cos(pi * (j + 1) * (i + 0.5) / N)
// The cosine order starts at j + 1, so the order-0 term is not produced.
//
// out_feats is resized to (number of input rows) x (number of coefficients).
void FrontEnd::do_dct(const matrix<double>& in_feats,
    matrix<double>& out_feats) const {
    const int coeff_total = get_int_param(params_, "dct.coeffs", 12);
    const int frame_total = in_feats.size1();
    const int N = in_feats.size2();

    out_feats.resize(frame_total, coeff_total);
    out_feats.clear();

    // Normalisation factor shared by every coefficient.
    const double scale = sqrt(2.0 / N);

    for (int frame = 0; frame < frame_total; ++frame) {
        for (int coeff = 0; coeff < coeff_total; ++coeff) {
            double acc = 0;
            int k = 0;
            while (k < N) {
                acc += in_feats(frame, k) *
                       cos(M_PI * (coeff + 1) * (k + 0.5) / N);
                ++k;
            }
            out_feats(frame, coeff) = acc * scale;
        }
    }
}