import numpy as np
# np.set_printoptions(threshold=np.inf)
import pylab as pl
import matplotlib.pyplot as plt
from scipy.fftpack import dct
from scipy.io import wavfile
# download opensouce audio in
# http://www.voiptroubleshooter.com/open_speech/american.html
def mfcc(audio_file='OSR_us_000_0010_8k.wav'):
# 一、预处理(Preprocess)
# 加载模型(load audio)
sample_rate, signal = wavfile.read(audio_file)
print('sample_rate:{}, len:{}'.format(sample_rate, len(signal)))
signal = signal[: int(3.5 * sample_rate)] # read first 3.5s data for example.
# n_frames = len(signal)
# time = np.arange(0, n_frames) * (1.0 / sample_rate)
# pl.subplot(1, 1, 1)
# pl.plot(time, signal)
# pl.xlabel('time (seconds)')
# pl.ylabel('amplitude')
# pl.title('Original audio')
# pl.show()
# 预加重(Pre-Emphasis)
# y(t) = x(t) - a * x(t - 1)
pre_emphasis = 0.97 # usually 0.95 or 0.97
emphasized_signal = np.append(
signal[0],
signal[1:] - pre_emphasis * signal[: -1]
)
# n_frames = len(emphasized_signal)
# time = np.arange(0, n_frames) * (1.0 / sample_rate)
# pl.subplot(1, 1, 1)
# pl.plot(time, emphasized_signal)
#
# pl.xlabel('time (seconds)')
# pl.ylabel('amplitude')
# pl.title('Pre-Emphasis')
# pl.show()
# 分帧(Framing)
# here, params set as follows:
# frame_size = 0.025(s), it menas 8kHz signal has 0.025 * 8000 = 200 samples.
# frame_stride = 0.01(s), 0.01 * 8000 = 80 samples.
# overlap = 0.015(s), 0.015 * 8000 = 125 samples.
frame_size, frame_stride, overlap = 0.025, 0.01, 0.015 # Convert from seconds to samples
frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate
singal_length = len(emphasized_signal)
frame_length = int(round(frame_length))
frame_step = int(round(frame_step))
num_frames = int(np.ceil(
float(np.abs(singal_length - frame_length)) / frame_step
)) # make sure we have at least 1 frame.
pad_singal_length = num_frames * frame_step + frame_length
z = np.zeros((pad_singal_length - singal_length))
# pad singal to make sure that all frames have equal number of
# samples without truncating any samples from the original signal.
pad_singal = np.append(emphasized_signal, z)
indices = np.tile(
np.arange(0, frame_length), (num_frames, 1)
) + np.tile(
np.arange(0, num_frames * frame_step, frame_step),
(frame_length, 1)
).T
frames = pad_singal[indices.astype(np.int32, copy=False)]
# 加窗(Window)
# W(n, a) = (1 - a) - a * cos(2 * pi * n / (N - 1))
# 0 <= n <= N, N is Window length, set a = 0.46 here.
window = np.hamming(frame_length)
# plt.plot(window)
# plt.xlabel("Samples")
# plt.ylabel("Amplitude")
# plt.title("Hamming window")
# plt.show()
# n = np.arange(0, frame_length)
# window = 0.54 - 0.46 * np.cos(2.0 * np.pi * n) / (frame_length - 1) # Explicit Implementation
frames *= window
# n_frames = len(pad_singal)
# time = np.arange(0, n_frames) * (1.0 / sample_rate)
# pl.subplot(1, 1, 1)
# pl.plot(time, pad_singal)
#
# pl.xlabel('time (seconds)')
# pl.ylabel('amplitude')
# pl.title('Framing')
# pl.show()
# 二、傅里叶变换(FFT)和功率谱(Power Spectrum)
# do an N-pont FFT on each frame to calculate the frequency spectrum,
# also called STFT(Short-time FT), where N is typically 256 or 512,
NFFT = 512
mag_frames = np.absolute((np.fft.rfft(frames, NFFT))) # Magnitude of the FFT
# compute the power spectrum using the following eqution:
# P = |FFT(Xi)|^2 / N, Xi is ith frame of signal x.
pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum
# 三、滤波器组(Filter Banks)
# The final step to computing filter banks is applying triangular filters,
# typically 40 filters, nfilt = 40 on a Mel-scale to the power spectrum to extract frequency bands.
# convert frequency(f) and Mel(m) with equations:
# m = 2595 * log10(1 + f / 700)
# f = 700 * (10 ^ (m / 2595) - 1)
n_filters = 40
low_freq_mel = 0
high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700)) # convert Hz to Mel
mel_points = np.linspace(low_freq_mel, high_freq_mel, n_filters + 2) # need 40 filters banks, so need 42 points
hz_points = (700 * (10 ** (mel_points / 2595) - 1)) # convert Mel to Hz
# bin = sample_rate / NFFT # fequency bin equation
bins = np.floor((NFFT + 1) * hz_points / sample_rate) # hz_points / bin
fbank = np.zeros((n_filters, int(np.floor(NFFT / 2 + 1))))
for m in range(1, n_filters + 1):
f_m_minus = int(bins[m - 1])
f_m = int(bins[m])
f_m_plus = int(bins[m + 1])
for k in range(f_m_minus, f_m):
fbank[m - 1, k] = (k - bins[m - 1]) / (bins[m] - bins[m - 1])
for k in range(f_m, f_m_plus):
fbank[m - 1, k] = (bins[m + 1] - k) / (bins[m + 1] - bins[m])
# pl.plot(fbank.T)
# pl.subplot(1, 1, 1)
# pl.xlabel("Frequency")
# pl.ylabel("time()")
# pl.title("40 Filter Banks")
# pl.show()
filter_banks = np.dot(pow_frames, fbank.T)
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
filter_banks = 20 * np.log10(filter_banks) # db
# pl.plot(filter_banks)
# pl.subplot(1, 1, 1)
# pl.xlabel("Frequency")
# pl.ylabel("Time")
# pl.title("Spectrogram")
# pl.show()
# 四、梅尔频率倒谱系数(MFCCs)
# apply Discrete Cosine Transform (DCT) to decorrelate the filter bank coefficients
# and yield a compressed representation of the filter banks.
num_ceps = 12
mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1: (num_ceps + 1)] # keep 2-13
# apply sinusoidal liftering to the MFCCs to de-emphasize higher MFCCs
(n_frames, n_coeff) = mfcc.shape
n = np.arange(n_coeff)
cep_lifter = 22
lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter)
mfcc *= lift
# pl.plot(mfcc)
# pl.xlabel("Time")
# pl.ylabel("MFCC Coefficients")
# pl.title("MFCCs")
# pl.show()
mfcc -= (np.mean(mfcc, axis=0) + 1e-8)
# pl.plot(mfcc)
# pl.xlabel("Time")
# pl.ylabel("MFCC Coefficients")
# pl.title("norm-MFCCs")
# pl.show()
def bibrosa_mfcc(audio_file='OSR_us_000_0010_8k.wav'):
import librosa
signal, sample_rate = librosa.load(audio_file, sr=8000)
mfccs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40, dct_type=2, norm='ortho')
更多详细信息参见:
- https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
- https://www.cnblogs.com/LXP-Never/p/10918590.html