import argparse
import os, librosa,scipy,csv
import numpy as np
class audio:
def __init__(self, input_file, sr=None, frame_len=512, n_fft=None, win_step=2 / 3, window="hamming"):
    """
    Load an audio file and store the framing / STFT parameters.

    :param input_file: path of the audio file to load
    :param sr: sampling rate forwarded to ``librosa.load``; defaults to None
    :param frame_len: frame length in samples, also used as the window length;
        defaults to 512 (32 ms at 16 kHz)
    :param n_fft: FFT size; when None it is set equal to the window length
    :param win_step: hop size expressed as a fraction of the window length;
        defaults to 2/3, i.e. round(512 * 2 / 3) = 341 samples (about 21 ms at 16 kHz)
    :param window: window type passed on to the STFT; defaults to "hamming"
    """
    # Source file and frame length (in samples).
    self.input_file = input_file
    self.frame_len = frame_len

    # Decode the audio; the sampling rate actually returned by librosa is kept.
    samples, sample_rate = librosa.load(self.input_file, sr=sr)
    self.wave_data = samples
    self.sr = sample_rate

    # The analysis window spans exactly one frame.
    self.window_len = frame_len

    # FFT size falls back to the window length when not given explicitly.
    self.fft_num = self.window_len if n_fft is None else n_fft

    # Hop length in samples: with the default step of 2/3, consecutive
    # frames overlap by roughly one third of the window.
    self.win_step = win_step
    self.hop_length = round(frame_len * win_step)

    self.window = window
def energy(self):
    """
    Compute a per-frame energy value from the short-time Fourier transform.

    The squared STFT magnitude is summed over axis 0 (the frequency axis),
    leaving one value per frame. Frames whose energy is exactly zero are
    replaced by the float64 machine epsilon so that a later logarithm does
    not fail.

    :return: np.ndarray of per-frame energies, one entry per STFT frame
        (presumably 1-D for mono input -- confirm against the loaded data)
    """
    spectrum = librosa.stft(
        self.wave_data,
        n_fft=self.fft_num,
        hop_length=self.hop_length,
        win_length=self.frame_len,
        window=self.window,
    )

    # Power spectrogram laid out as [frequency, time (n_frames)].
    power = np.square(np.abs(spectrum))

    # Collapse the frequency axis: one energy value per frame.
    frame_energy = power.sum(axis=0)

    # np.finfo(np.float64).eps == 2.220446049250313e-16; substituting it for
    # exact zeros keeps a subsequent log() well defined.
    tiny = np.finfo(np.float64).eps
    return np.where(frame_energy == 0, tiny, frame_energy)
def short_time_energy(self):
"""
计算语音短时能量:每一帧中所有语音信号的平方和
:return: 语音短时能量列表(值范围0-每帧归一化后能量平方和,这里帧长512,则最大值为512),
np.ndarray[shape=(1,无加窗,帧移为0的
音频特征提取
最新推荐文章于 2024-04-28 11:17:04 发布