创作来源:
在语音识别任务中,我们通常使用 Python 提取语音信号的频谱特征,但有时需要在其他平台上使用这些特征。网络上使用 C、C++ 提取 MFCC 特征的现成案例很少,因此本文基于 VS2017,用通俗、易操作的 C/C++ 代码实现 wav 音频频谱特征的提取。使用时只需修改窗口大小、采样频率等相应参数即可。
头文件1:librosa.h
#ifndef LIBROSA_H_
#define LIBROSA_H_
#include "eigen3/Eigen/Core"
#include "eigen3/unsupported/Eigen/FFT"
#include <complex>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
///
/// \brief C++ implementation of librosa
///
namespace librosa {
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif // !M_PI
typedef Eigen::Matrix<float, 1, Eigen::Dynamic, Eigen::RowMajor> Vectorf;
typedef Eigen::Matrix<std::complex<float>, 1, Eigen::Dynamic, Eigen::RowMajor> Vectorcf;
typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> Matrixf;
typedef Eigen::Matrix<std::complex<float>, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> Matrixcf;
namespace internal {
/// \brief Pads a 1-D signal on both sides.
///
/// The result has length left + x.size() + right, with x copied at offset
/// \p left. The padding samples depend on \p mode:
///   - "reflect":   mirror around the first/last sample without repeating it
///   - "symmetric": mirror around the first/last sample, repeating it
///   - "edge":      replicate the first/last sample
///   - anything else: padding stays filled with \p value
///
/// Pad widths larger than the signal are supported: the mirror pattern is
/// continued periodically instead of indexing outside of \p x. For an empty
/// \p x there is nothing to mirror or replicate, so the padding keeps
/// \p value in every mode.
///
/// \param x      input signal (not modified)
/// \param left   number of samples to prepend; expected to be >= 0
/// \param right  number of samples to append; expected to be >= 0
/// \param mode   padding mode, see above
/// \param value  fill value for constant padding
/// \return the padded row vector
static Vectorf pad(Vectorf &x, int left, int right, const std::string &mode, float value) {
  const int n = static_cast<int>(x.size());
  Vectorf x_paded = Vectorf::Constant(left + n + right, value);
  if (n == 0) {
    // No source sample exists, so every mode degenerates to the constant fill.
    return x_paded;
  }
  x_paded.segment(left, n) = x;

  const bool is_reflect = (mode == "reflect");
  const bool is_symmetric = (mode == "symmetric");
  const bool is_edge = (mode == "edge");
  if (!is_reflect && !is_symmetric && !is_edge) {
    return x_paded;
  }

  // Maps a virtual index k, measured relative to x[0] (negative on the left,
  // >= n on the right), to the index of the source sample inside x.
  auto source_index = [&](int k) -> int {
    if (is_edge) {
      return k < 0 ? 0 : n - 1;
    }
    // The mirrored signal repeats with this period.
    const int period = is_reflect ? 2 * (n - 1) : 2 * n;
    if (period == 0) {
      return 0;  // reflect on a single sample: only x[0] is available
    }
    int r = k % period;
    if (r < 0) {
      r += period;
    }
    if (r < n) {
      return r;  // ascending half of the period
    }
    // descending half of the period
    return is_reflect ? period - r : period - 1 - r;
  };

  for (int i = 0; i < left; ++i) {
    x_paded[i] = x[source_index(i - left)];
  }
  for (int j = 0; j < right; ++j) {
    x_paded[left + n + j] = x[source_index(n + j)];
  }
  return x_paded;
}
/// \brief Short-time Fourier transform of a 1-D signal.
///
/// Each frame of n_fft samples is multiplied by a Hann window
/// (0.5 * (1 - cos(2*pi*k / n_fft)), k = 0..n_fft-1) and transformed with
/// Eigen::FFT. Only the first n_fft / 2 + 1 bins of every frame are returned.
///
/// \param x       input signal
/// \param n_fft   frame / FFT length, must be > 0
/// \param n_hop   hop between consecutive frames, must be > 0
/// \param win     currently ignored: the Hann window is always used
/// \param center  if true, the signal is padded by n_fft / 2 samples on both
///                sides (using \p mode) before framing
/// \param mode    padding mode forwarded to pad()
/// \return matrix with one row per frame and n_fft / 2 + 1 columns; it has
///         zero rows when the (padded) signal is shorter than n_fft
/// \throws std::invalid_argument if n_fft or n_hop is not positive
static Matrixcf stft(Vectorf &x, int n_fft, int n_hop, const std::string &win, bool center, const std::string &mode) {
  if (n_fft <= 0 || n_hop <= 0) {
    // n_hop == 0 would divide by zero below; n_fft <= 0 gives invalid sizes.
    throw std::invalid_argument("stft: n_fft and n_hop must be positive");
  }
  (void)win;  // window selection is not implemented, see the hanning window below

  // hanning
  Vectorf window = 0.5*(1.f - (Vectorf::LinSpaced(n_fft, 0.f, static_cast<float>(n_fft - 1))*2.f*M_PI / n_fft).array().cos());

  const int pad_len = center ? n_fft / 2 : 0;
  Vectorf x_paded = pad(x, pad_len, pad_len, mode, 0.f);

  const int n_f = n_fft / 2 + 1;
  const int padded_len = static_cast<int>(x_paded.size());
  if (padded_len < n_fft) {
    // Not even one full frame fits. Without this guard the negative quotient
    // truncates toward zero and a frame would be read past the end of x_paded.
    return Matrixcf(0, n_f);
  }
  const int n_frames = 1 + (padded_len - n_fft) / n_hop;

  Matrixcf X(n_frames, n_fft);
  Eigen::FFT<float> fft;
  for (int i = 0; i < n_frames; ++i) {
    Vectorf x_frame = window.array()*x_paded.segment(i*n_hop, n_fft).array();
    X.row(i) = fft.fwd(x_frame);
  }
  // Keep bins 0 .. n_fft / 2 only.
  return X.leftCols(n_f);
}
/// \brief Element-wise magnitude of a complex matrix raised to a power.
///
/// \param X      complex input matrix (not modified)
/// \param power  exponent applied to every magnitude; defaults to 1
/// \return real matrix of the same shape holding |X(i, j)|^power
static Matrixf spectrogram(Matrixcf &X, float power = 1.f) {
  // Take the magnitudes first, then apply the exponent coefficient-wise.
  Matrixf magnitude = X.cwiseAbs();
  return magnitude.array().pow(power);
}
static Matrixf melfilter(int sr, int n_fft, int n_mels, int fmin, int fmax) {
int n_f = n_fft / 2 + 1;
Vectorf fft_freqs = (Vectorf::LinSpaced(n_f, 0.f, static_cast<float>(n_f - 1))*sr) / n_fft;
float f_min = 0.f;
float f_sp = 200.f / 3.f;
float min_log_hz = 1000.f;
float min_log_mel = (min_log_hz - f_min) / f_sp;
float logstep = logf(6.4f) / 27.f;
auto hz_to_mel = [=](int hz, bool htk = false) -> float {
if (htk) {
return 2595.0f*log10f(1.0f + hz / 700.0f);
}
float mel = (hz - f_min) / f_sp;
if (hz >= min_log_hz) {
mel = min_log_mel + logf(hz / min_log_hz) / logstep;
}
return mel;
};
auto mel_to_hz = [=](Vectorf &mels, bool htk = false) -> Vectorf {
if (htk) {
return 700.0f*(Vectorf::Constant(n_mels + 2, 10.f).array().pow(mels.array() / 2595.0f) - 1.0f);
}
return (mels.array() > min_log_mel).select(((mels.array() - min_log_mel)*logstep).exp()*min_log_hz, (mels*f_sp).array() + f_min);
};
float min_mel = hz_to_mel(fmin);
float max_mel = hz_to_mel(fmax);