语音识别特征—MFCC(实战篇)

                                                                        语音特征提取—MFCC(实战篇)

  本文主要对特征提取中的核心代码进行说明;如有疑问或想进一步了解,可以私信作者。

  本文为语音特征提取—MFCC实战部分,参考哥伦比亚大学语音识别代码进行提取,其具体流程如下:

(1)分帧与加窗

/**
 * Cut a single-column sample stream into overlapping frames and apply a
 * window function to each frame.
 *
 * Configuration is read from params_ (the third argument of each getter is
 * presumably the fallback value):
 *   - "window.sample_rate"    samples per second
 *   - "window.frames_per_sec" output frames per second
 *   - "window.window_size"    window length in seconds
 *   - "window.hamming"        true: Hamming window, false: rectangular
 *
 * @param in_feats   Input samples; must have exactly one column.
 * @param out_feats  Resized to (frame count x samples per window) and filled
 *                   with the windowed frames. It has zero rows when the input
 *                   is shorter than a single window.
 * @throws runtime_error if the input is not a single column, or if the
 *         configured parameters yield a non-positive window length or shift.
 */
void FrontEnd::do_window(const matrix<double>& in_feats,
    matrix<double>& out_feats) const {
    double sample_rate = get_float_param(params_, "window.sample_rate", 20000.0);
    double frames_per_sec =
        get_float_param(params_, "window.frames_per_sec", 100.0);
    double window_width = get_float_param(params_, "window.window_size", 0.025);
    bool do_Hamming = get_bool_param(params_, "window.hamming", true);
    int in_samp_cnt = in_feats.size1();
    if (in_feats.size2() != 1)
        throw runtime_error("Windowing expected vector input.");
    //  Input sampling period in seconds (time covered by one sample).
    double sample_period = 1.0 / sample_rate;
    //  Output frame period in seconds.
    double frame_period = 1.0 / frames_per_sec;
    //  Number of samples per window, rounded to the nearest integer.
    int samp_per_window = static_cast<int>(window_width / sample_period + 0.5);
    //  Number of samples to shift between consecutive windows.
    int samp_shift = static_cast<int>(frame_period / sample_period + 0.5);
    //  A zero shift would divide by zero below, and a non-positive window
    //  length cannot describe a frame, so reject both up front.
    if (samp_per_window <= 0 || samp_shift <= 0)
        throw runtime_error(
            "Windowing expected positive window size and frame shift.");
    //  Number of complete frames. Integer division truncates toward zero, so
    //  an input shorter than one window must be handled explicitly; otherwise
    //  the formula can still report one frame and read past the input.
    int out_frame_cnt = 0;
    if (in_samp_cnt >= samp_per_window)
        out_frame_cnt = (in_samp_cnt - samp_per_window) / samp_shift + 1;
    out_feats.resize(out_frame_cnt, samp_per_window);
    out_feats.clear();
    if (out_frame_cnt == 0)
        return;

    //  Window coefficients depend only on the position inside the frame, so
    //  compute them once. A rectangular window is all ones. A one-sample
    //  Hamming window also stays at 1.0, because the formula would evaluate
    //  0/0 for it.
    std::vector<double> window(samp_per_window, 1.0);
    if (do_Hamming && samp_per_window > 1) {
        for (int c = 0; c < samp_per_window; ++c) {
            window[c] = 0.54 - 0.46 * cos(2 * M_PI * c / (samp_per_window - 1));
        }
    }

    //  Frame r starts at sample r * samp_shift.
    for (int r = 0; r < out_frame_cnt; ++r) {
        for (int c = 0; c < samp_per_window; ++c) {
            out_feats(r, c) = window[c] * in_feats(r * samp_shift + c, 0);
        }
    }
}

(2)傅里叶变换

void FrontEnd::do_fft(const matrix<double>& in_feats,
    matrix<double>& out_feats) const {
    int in_frame_cnt = in_feats.size1();
    int in_dim_cnt = in_feats.size2();
    int out_dim_cnt = 2;
    //为了可以进行傅里叶变换,需要将输出变为2的幂指数倍数;
    while (out_dim_cnt < in_dim_cnt) out_dim_cnt *= 2;
    out_feats.resize(in_frame_cnt, out_dim_cnt);
    out_feats.clear();
    vector<double> fft_buf;
    for (int frm_idx = 0; frm_idx < in_frame_cnt; ++frm_idx) {
        //将matrix容器转换为vector容器,按行处理数据;
        copy_matrix_row_to_vector(in_feats, frm_idx, fft_buf);
        fft_buf.resize(out_dim_cnt, 0.0);
        //real_fft对vector容器做傅里叶变换;
        real_fft(fft_buf);
        copy_vector_to_matrix_row(fft_buf, out_feats, frm_idx);
    }

    /*cout << out_feats << endl;*/
}

(3)Mel滤波

/**
 * Apply a bank of triangular Mel-scale filters to each frame of FFT output.
 *
 * Configuration is read from params_ (the third argument of each getter is
 * presumably the fallback value):
 *   - "melbin.bins"         number of Mel filters (output columns)
 *   - "melbin.log"          true: store log of each filter sum
 *   - "window.sample_rate"  samples per second
 *
 * Each input row is read as interleaved pairs: column 2*i is treated as the
 * real part and column 2*i + 1 as the imaginary part of frequency bin i, for
 * i in [0, N/2) where N is the input column count.
 *
 * The filter weights depend only on the filter index and the frequency bin,
 * so they are computed once before the frame loop. The arithmetic and the
 * summation order match a direct per-frame evaluation.
 *
 * @param in_feats   One FFT frame per row.
 * @param out_feats  Resized to (row count x number of filters). Entry (r, m-1)
 *                   is the weighted magnitude sum of filter m, or its natural
 *                   log when "melbin.log" is true. A filter sum of zero gives
 *                   log(0).
 */
void FrontEnd::do_melbin(const matrix<double>& in_feats,
    matrix<double>& out_feats) const {
    int num_bins = get_int_param(params_, "melbin.bins", 26);
    bool do_log = get_bool_param(params_, "melbin.log", true);
    double sample_rate = get_float_param(params_, "window.sample_rate", 20000.0);
    double sample_period = 1.0 / sample_rate;
    int in_frame_cnt = in_feats.size1();
    int in_dim_cnt = in_feats.size2();
    int out_dim_cnt = num_bins;

    out_feats.resize(in_frame_cnt, out_dim_cnt);
    out_feats.clear();

    // Typical sizes mentioned by the author: M = 26, N = 512, 74 frames.
    const int N = in_dim_cnt;
    const int M = out_dim_cnt;
    // T is the duration of one sample in seconds.
    const double T = sample_period;
    // Number of (real, imaginary) pairs per frame.
    const int half = N / 2;

    // Nothing to fill; skip the filter setup as well.
    if (in_frame_cnt <= 0 || M <= 0)
        return;

    // Mel value of each bin's frequency; N * T is the duration of N samples.
    std::vector<double> mel_of_bin(half);
    for (int i = 0; i < half; ++i) {
        double f = i / (N * T);
        mel_of_bin[i] = 1127 * log(1 + f / 700);  // Mel(f)
    }

    // Mel value of half the sample rate; the filter centres are spaced evenly
    // between 0 and this value.
    const double Mel_f_max = 1127 * log(1 + 1 / (700 * 2 * T));

    // weights[(m - 1) * half + i] is the response of filter m at bin i.
    std::vector<double> weights(static_cast<size_t>(M) * half);
    for (int m = 1; m <= M; ++m) {
        double Mel_f_m = m * Mel_f_max / (M + 1);         // Mel_f_min is 0
        double Mel_f_mp = (m - 1) * Mel_f_max / (M + 1);  // p means previous
        double Mel_f_mn = (m + 1) * Mel_f_max / (M + 1);  // n means next
        for (int i = 0; i < half; ++i) {
            const double Mel_f = mel_of_bin[i];
            // Default of 0 keeps the weight defined even when no branch
            // below assigns it.
            double H = 0;
            if (Mel_f < Mel_f_mp || Mel_f > Mel_f_mn) {
                H = 0;
            }
            else if (Mel_f_mp <= Mel_f && Mel_f <= Mel_f_m) {
                // Rising edge of the triangle.
                H = (Mel_f - Mel_f_mp) / (Mel_f_m - Mel_f_mp);
            }
            else if (Mel_f_m <= Mel_f && Mel_f <= Mel_f_mn) {
                // Falling edge of the triangle.
                H = (Mel_f - Mel_f_mn) / (Mel_f_m - Mel_f_mn);
            }
            else {
                // Only reachable when a comparison involves NaN (e.g. from a
                // bad sample rate). Reported once per filter and bin.
                std::cout << "Invalid Mel(f) value!!" << std::endl;
            }
            weights[static_cast<size_t>(m - 1) * half + i] = H;
        }
    }

    // Magnitude spectrum of the current frame, shared by all filters.
    std::vector<double> magnitude(half);
    for (int r = 0; r < in_frame_cnt; ++r) {
        for (int i = 0; i < half; ++i) {
            // Read the pair [2 * i, 2 * i + 1] of the FFT output.
            double real = in_feats(r, 2 * i),
                img = in_feats(r, 2 * i + 1);
            magnitude[i] = sqrt(real * real + img * img);
        }
        for (int m = 1; m <= M; ++m) {
            const size_t row_start = static_cast<size_t>(m - 1) * half;
            double sum = 0;
            for (int i = 0; i < half; ++i) {
                sum += magnitude[i] * weights[row_start + i];
            }
            if (do_log) {
                out_feats(r, m - 1) = log(sum);
            }
            else {
                out_feats(r, m - 1) = sum;
            }
        }
    }
}

(4)DCT变换

/**
 * Project each input row onto cosine basis functions.
 *
 * The number of output columns is read from params_ under "dct.coeffs" (the
 * third getter argument, 12, is presumably the fallback). Output column j uses
 * the cosine of order j + 1, so the order-0 term is not produced. Each sum is
 * scaled by sqrt(2 / N), where N is the input column count.
 *
 * @param in_feats   One feature vector per row.
 * @param out_feats  Resized to (row count x "dct.coeffs") and filled with the
 *                   scaled cosine sums.
 */
void FrontEnd::do_dct(const matrix<double>& in_feats,
    matrix<double>& out_feats) const {
    const int coeff_total = get_int_param(params_, "dct.coeffs", 12);
    const int frame_total = in_feats.size1();
    const int band_total = in_feats.size2();

    out_feats.resize(frame_total, coeff_total);
    out_feats.clear();

    // The normalisation factor is the same for every coefficient.
    const double scale = sqrt(2.0 / band_total);

    for (int frame = 0; frame < frame_total; ++frame) {
        for (int coeff = 0; coeff < coeff_total; ++coeff) {
            // Output column `coeff` holds the cosine term of order coeff + 1.
            const int order = coeff + 1;
            double acc = 0;
            for (int band = 0; band < band_total; ++band) {
                const double basis =
                    cos(M_PI * order * (band + 0.5) / band_total);
                acc += in_feats(frame, band) * basis;
            }
            out_feats(frame, coeff) = acc * scale;
        }
    }
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值