1.导入语音
import librosa
audio_path = './1.wav'
x , sr = librosa.load(audio_path)
print(f'x = {x}')
print(f'x.shape = {x.shape}') # (采样率*语音时间,)
print(f'sr = {sr}')# 采样率
2. 修改采样率
y , sr = librosa.load(audio_path, sr=new_sr)
3. stft 短时傅里叶变换
源码
libroas.stft(y, n_fft=2048,)
def stft(
y,
n_fft=2048,
hop_length=None,
win_length=None,
window="hann",
center=True,
dtype=None,
pad_mode="reflect",
):
"""Short-time Fourier transform (STFT).
The STFT represents a signal in the time-frequency domain by
computing discrete Fourier transforms (DFT) over short overlapping
windows.
This function returns a complex-valued matrix D such that
- ``np.abs(D[f, t])`` is the magnitude of frequency bin ``f``
at frame ``t``, and
- ``np.angle(D[f, t])`` is the phase of frequency bin ``f``
at frame ``t``.
The integers ``t`` and ``f`` can be converted to physical units by means
of the utility functions `frames_to_sample` and `fft_frequencies`.
Returns
-------
D : np.ndarray [shape=(1 + n_fft/2, n_frames), dtype=dtype]
Complex-valued matrix of short-term Fourier transform
coefficients.
短时傅立叶变换(STFT),返回一个复数矩阵使得D(f,t)
复数的实部:np.abs(D(f,t))频率的振幅
复数的虚部:np.angle(D(f,t))频率的相位
参数:
• y:音频时间序列
• n_fft:FFT窗口大小,n_fft=hop_length+overlapping
• hop_length:帧移,如果未指定,则默认win_length / 4
• win_length:每一帧音频都由window()加窗。窗长win_length,然后用零填充以匹配n_fft
默认win_length=n_fft。
• window:字符串,元组,数字,函数 shape =(n_fft, )
窗口(字符串,元组或数字)
窗函数,例如scipy.signal.hanning
长度为n_fft的向量或数组
• center:bool
如果为True,则填充信号y,以使帧 D [:, t]以y [t * hop_length]为中心
如果为False,则D [:, t]从y [t * hop_length]开始
• dtype:D的复数值类型。默认值为64-bit complex复数
• pad_mode:如果center = True,则在信号的边缘使用填充模式
默认情况下,STFT使用reflection padding
返回:
• STFT矩阵 shape=(1+n_fft / 2,t)
关于 t 大小计算
查看源码
t = [fs*times - frame_length]/hop_length
librosa.util.frame(y, frame_length=2048, hop_length=64)
def frame(x, frame_length, hop_length, axis=-1):
"""Slice a data array into (overlapping) frames.
This implementation uses low-level stride manipulation to avoid
making a copy of the data. The resulting frame representation
is a new view of the same input data.
However, if the input data is not contiguous in memory, a warning
will be issued and the output will be a full copy, rather than
a view of the input data.
For example, a one-dimensional input ``x = [0, 1, 2, 3, 4, 5, 6]``
can be framed with frame length 3 and hop length 2 in two ways.
The first (``axis=-1``), results in the array ``x_frames``::
[[0, 2, 4],
[1, 3, 5],
[2, 4, 6]]
where each column ``x_frames[:, i]`` contains a contiguous slice of
the input ``x[i * hop_length : i * hop_length + frame_length]``.
The second way (``axis=0``) results in the array ``x_frames``::
[[0, 1, 2],
[2, 3, 4],
[4, 5, 6]]
where each row ``x_frames[i]`` contains a contiguous slice of the input.
This generalizes to higher dimensional inputs, as shown in the examples below.
In general, the framing operation increments by 1 the number of dimensions,
adding a new "frame axis" either to the end of the array (``axis=-1``)
or the beginning of the array (``axis=0``).
Parameters
----------
x : np.ndarray
Array to frame
frame_length : int > 0 [scalar]
Length of the frame
hop_length : int > 0 [scalar]
Number of steps to advance between frames
axis : 0 or -1
The axis along which to frame.
If ``axis=-1`` (the default), then ``x`` is framed along its last dimension.
``x`` must be "F-contiguous" in this case.
If ``axis=0``, then ``x`` is framed along its first dimension.
``x`` must be "C-contiguous" in this case.
Returns
-------
x_frames : np.ndarray [shape=(..., frame_length, N_FRAMES) or (N_FRAMES, frame_length, ...)]
A framed view of ``x``, for example with ``axis=-1`` (framing on the last dimension)::
x_frames[..., j] == x[..., j * hop_length : j * hop_length + frame_length]
If ``axis=0`` (framing on the first dimension), then::
x_frames[j] = x[j * hop_length : j * hop_length + frame_length]
Examples
--------
Extract 2048-sample frames from monophonic signal with a hop of 64 samples per frame
>>> y, sr = librosa.load(librosa.ex('trumpet'))
>>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64)
>>> frames
array([[-1.407e-03, -2.604e-02, ..., -1.795e-05, -8.108e-06],
[-4.461e-04, -3.721e-02, ..., -1.573e-05, -1.652e-05],
...,
[ 7.960e-02, -2.335e-01, ..., -6.815e-06, 1.266e-05],
[ 9.568e-02, -1.252e-01, ..., 7.397e-06, -1.921e-05]],
dtype=float32)
>>> y.shape
(117601,)
>>> frames.shape
(2048, 1806)
Or frame along the first axis instead of the last:
>>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64, axis=0)
>>> frames.shape
(1806, 2048)
4. 梅尔谱提取
源码
librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
def melspectrogram(
y=None,
sr=22050,
S=None,
n_fft=2048,
hop_length=512,
win_length=None,
window="hann",
center=True,
pad_mode="reflect",
power=2.0,
**kwargs,
):
"""Compute a mel-scaled spectrogram.
If a spectrogram input ``S`` is provided, then it is mapped directly onto
the mel basis by ``mel_f.dot(S)``.
If a time-series input ``y, sr`` is provided, then its magnitude spectrogram
``S`` is first computed, and then mapped onto the mel scale by
``mel_f.dot(S**power)``.
By default, ``power=2`` operates on a power spectrum.
Returns
-------
S : np.ndarray [shape=(n_mels, t)]
Mel spectrogram
Examples
--------
>>> y, sr = librosa.load(librosa.ex('trumpet'))
>>> librosa.feature.melspectrogram(y=y, sr=sr)
array([[3.837e-06, 1.451e-06, ..., 8.352e-14, 1.296e-11],
[2.213e-05, 7.866e-06, ..., 8.532e-14, 1.329e-11],
...,
[1.115e-05, 5.192e-06, ..., 3.675e-08, 2.470e-08],
[6.473e-07, 4.402e-07, ..., 1.794e-08, 2.908e-08]],
dtype=float32)
Using a pre-computed power spectrogram would give the same result:
>>> D = np.abs(librosa.stft(y))**2
>>> S = librosa.feature.melspectrogram(S=D, sr=sr)
Display of mel-frequency spectrogram coefficients, with custom
arguments for mel filterbank construction (default is fmax=sr/2):
>>> # Passing through arguments to the Mel filters
>>> S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128,
... fmax=8000)
5.提取 Log-Mel Spectrogram 对数梅尔
import librosa
# Load a wav file
y, sr = librosa.load('./beat.wav', sr=None)
# extract mel spectrogram feature
melspec = librosa.feature.melspectrogram(y, sr, n_fft=1024, hop_length=512, n_mels=128)
# convert to log scale
logmelspec = librosa.power_to_db(melspec)
6.提取 MFCC
源码
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
def mfcc(
y=None, sr=22050, S=None, n_mfcc=20, dct_type=2, norm="ortho", lifter=0, **kwargs
):
"""Mel-frequency cepstral coefficients (MFCCs)
Returns
-------
M : np.ndarray [shape=(n_mfcc, t)]
MFCC sequence
6. 绘图
librosa.display.wavplot()
ibrosa.display.specshow()
import librosa
import matplotlib.pyplot as plt
import librosa.display
audio_path = ./1.wav'
x , sr = librosa.load(audio_path)
print(f'x.shape = {x.shape}')
print(f'sr = {sr}')
#波形图
librosa.display.waveplot(x)
plt.show()
#频谱特征图
stft = librosa.stft(x, n_fft=2048)
print(f'stft.shape = {stft.shape}')
librosa.display.specshow(stft, x_axis='time', y_axis='hz')
plt.show()
mfcc = librosa.feature.mfcc(x, n_fft=2048, n_mfcc = 36)
print(f'mfcc.shape = {mfcc.shape}')
librosa.display.specshow(mfcc, x_axis='time', y_axis='hz')
plt.show()