import argparse
import os, librosa,scipy,csv
import numpy as np
class audio:
def __init__(self, input_file, sr=None, frame_len=512, n_fft=None, win_step=2 / 3, window="hamming"):
"""
初始化
:param input_file: 输入音频文件
:param sr: 所输入音频文件的采样率,默认为None
:param frame_len: 帧长,默认512个采样点(32ms,16kHz),与窗长相同
:param n_fft: FFT窗口的长度,默认与窗长相同
:param win_step: 窗移,默认移动2/3,512*2/3=341个采样点(21ms,16kHz)
:param window: 窗类型,默认汉明窗
"""
self.input_file = input_file
self.frame_len = frame_len # 帧长,单位采样点数
self.wave_data, self.sr = librosa.load(self.input_file, sr=sr)
self.window_len = frame_len # 窗长512
if n_fft is None:
self.fft_num = self.window_len # 设置NFFT点数与窗长相等
else:
self.fft_num = n_fft
self.win_step = win_step
self.hop_length = round(self.window_len * win_step) # 重叠部分采样点数设置为窗长的1/3(1/3~1/2),即帧移(窗移)2/3
self.window = window
def energy(self):
"""
每帧内所有采样点的幅值平方和作为能量值
:return: 每帧能量值,np.ndarray[shape=(1,n_frames), dtype=float64]
"""
mag_spec = np.abs(librosa.stft(self.wave_data, n_fft=self.fft_num, hop_length=self.hop_length,
win_length=self.frame_len, window=self.window))
pow_spec = np.square(mag_spec) # [frequency, time (n_frames)]
energy = np.sum(pow_spec, axis=0) # [n_frames]
energy = np.where(energy == 0, np.finfo(np.float64).eps,
energy) # 避免能量值为0,防止后续取log出错(eps是取非负的最小值), 即np.finfo(np.float64).eps = 2.220446049250313e-16
return energy
def short_time_energy(self):
"""
计算语音短时能量:每一帧中所有语音信号的平方和
:return: 语音短时能量列表(值范围0-每帧归一化后能量平方和,这里帧长512,则最大值为512),
np.ndarray[shape=(1,无加窗,帧移为0的n_frames), dtype=float64]
"""
energy = [] # 语音短时能量列表
energy_sum_per_frame = 0 # 每一帧短时能量累加和
for i in range(len(self.wave_data)): # 遍历每一个采样点数据
energy_sum_per_frame += self.wave_data[i] ** 2 # 求语音信号能量的平方和
if (i + 1) % self.frame_len == 0: # 一帧所有采样点遍历结束
energy.append(energy_sum_per_frame) # 加入短时能量列表
energy_sum_per_frame = 0 # 清空和
elif i == len(self.wave_data) - 1: # 不满一帧,最后一个采样点
energy.append(energy_sum_per_frame) # 将最后一帧短时能量加入列表
energy = np.array(energy)
energy = np.where(energy == 0, np.finfo(np.float64).eps, energy) # 避免能量值为0,防止后续取log出错(eps是取非负的最小值)
return energy
def intensity(self):
"""
计算声音强度,用声压级表示:每帧语音在空气中的声压级Sound Pressure Level(SPL),单位dB
公式:20*lg(P/Pref),P为声压(Pa),Pref为参考压力(听力阈值压力),一般为1.0*10-6 Pa
这里P认定为声音的幅值:求得每帧所有幅值平方和均值,除以Pref平方,再取10倍lg
:return: 每帧声压级,dB,np.ndarray[shape=(1,无加窗,帧移为0的n_frames), dtype=float64]
"""
p0 = 1.0e-6 # 听觉阈限压力auditory threshold pressure: 2.0*10-5 Pa
e = self.short_time_energy()
spl = 10 * np.log10(1 / (np.power(p0, 2) * self.frame_len) * e)
return spl
def zero_crossing_rate(self):
"""
计算语音短时过零率:单位时间(每帧)穿过横轴(过零)的次数
:return: 每帧过零率次数列表,np.ndarray[shape=(1,无加窗,帧移为0的n_frames), dtype=uint32]
"""
zcr = [] # 语音短时过零率列表
counting_sum_per_frame = 0 # 每一帧过零次数累加和,即过零率
for i in range(len(self.wave_data)): # 遍历每一个采样点数据
if i % self.frame_len == 0: # 开头采样点无过零,因此每一帧的第一个采样点跳过
continue
if self.wave_data[i] * self.wave_data[i - 1] < 0: # 相邻两个采样点乘积小于0,则说明穿过横轴
counting_sum_per_frame += 1 # 过零次数加一
if (i + 1) % self.frame_len == 0: # 一帧所有采样点遍历结束
zcr.append(counting_sum_per_frame) # 加入短时过零率列表
counting_sum_per_frame = 0 # 清空和
elif i == len(self.wave_data) - 1: # 不满一帧,最后一个采样点
zcr.append(counting_sum_per_frame) # 将最后一帧短时过零率加入列表
return np.array(zcr, dtype=np.uint32)
def pitch(self, ts_mag=0.25):
"""
获取每帧音高,即基频,这里应该包括基频和各次谐波,最小的为基频(一次谐波),其他的依次为二次、三次...谐波
各次谐波等于基频的对应倍数,因此基频也等于各次谐波除以对应的次数,精确些等于所有谐波之和除以谐波次数之和
:param ts_mag: 幅值倍乘因子阈值,>0,大于np.average(np.nonzero(magnitudes)) * ts_mag则认为对应的音高有效,默认0.25
:return: 每帧基频及其对应峰的幅值(>0),
np.ndarray[shape=(1 + n_fft/2,n_frames), dtype=float32],(257,全部采样点数/(512*2/3)+1)
usage:
pitches, mags = self.pitch() # 获取每帧基频
f0_likely = [] # 可能的基频F0
for i in range(pitches.shape[1]): # 按列遍历非0最小值,作为每帧可能的F0
try:
f0_likely.append(np.min(pitches[np.nonzero(pitches[:, i]), i]))
except ValueError:
f0_likely.append(np.nan) # 当一列,即一帧全为0时,赋值最小值为nan
f0_all = np.array(f0_likely)
"""
mag_spec = np.abs(librosa.stft(self.wave_data, n_fft=self.fft_num, hop_length=self.hop_length,
win_length=self.frame_len, window=self.window))
pitches, magnitudes = librosa.piptrack(S=mag_spec, sr=self.sr, threshold=1.0, ref=np.mean,
fmin=50, fmax=500) # 人类正常说话基频最大可能范围50-500Hz
ts = np.average(magnitudes[np.nonzero(magnitudes)]) * ts_mag
pit_likely = pitches
mag_likely = magnitudes
pit_likely[magnitudes < ts] = 0
mag_likely[magnitudes < ts] = 0
return pit_likely.tolist(), mag_likely.tolist()
def preemphasis(self, coef=0.97, zi=None, return_zf=False):
"""Pre-emphasize an audio signal with a first-order auto-regressive filter:
y[n] -> y[n] - coef * y[n-1]
Parameters
----------
y : np.ndarray
Audio signal
coef : positive number
Pre-emphasis coefficient. Typical values of ``coef`` are between 0 and 1.
At the limit ``coef=0``, the signal is unchanged.
At ``coef=1``, the result is the first-order difference of the signal.
The default (0.97) matches the pre-emphasis filter used in the HTK
implementation of MFCCs [#]_.
.. [#] http://htk.eng.cam.ac.uk/
zi : number
Initial filter state. When making successive calls to non-overlapping
frames, this can be set to the ``zf`` returned from the previous call.
(See example below.)
By default ``zi`` is initialized as ``2*y[0] - y[1]``.
return_zf : boolean
If ``True``, return the final filter state.
If ``False``, only return the pre-emphasized signal.
Returns
-------
y_out : np.ndarray
pre-emphasized signal
zf : number
if ``return_zf=True``, the final filter state is also returned
"""
y=self.wave_data
b = np.asarray([1.0, -coef], dtype=y.dtype)
a = np.asarray([1.0], dtype=y.dtype)
if zi is None:
# Initialize the filter to implement linear extrapolation
zi = 2 * y[..., 0] - y[..., 1]
zi = np.atleast_1d(zi)
y_out, z_f = scipy.signal.lfilter(b, a, y, zi=np.asarray(zi, dtype=y.dtype))
if return_zf:
return y_out, z_f
return y_out
def get_max_min_avg(data):
'''
:param data:
:return: 最大值,最小值,平均值,方差,标准差,和,中值
'''
data=np.array(data)
# print(data.max(),data.min(),data.mean(),data.var(),data.std(),data.sum(),np.median(data))
return data.max(),data.min(),data.mean(),data.var(),data.std(),data.sum(),np.median(data)
def get_feature(filename):
'''
:param filename:
:return: [文件路径,能量,短时能量,声音强度,过零率, 基频, 各次协波]
'''
audio_process=audio(filename)
pit,mag=audio_process.pitch()
data = [filename,
get_max_min_avg(audio_process.energy()),
get_max_min_avg(audio_process.short_time_energy()),
get_max_min_avg(audio_process.intensity()),
get_max_min_avg(audio_process.zero_crossing_rate()),
get_max_min_avg(pit),
get_max_min_avg(mag)]
return data
def get_features_by_folder(args):
csv_result=[['文件路径','能量','短时能量', '声音强度', '过零率', '基频', '各次协波']]
for filename in os.listdir(args.audio_path):
if filename[-3:]=='wav':
features=get_feature(os.path.join(args.audio_path,filename))
csv_result.append(features)
with open(args.csv_path,'w',encoding='utf-8') as fp:
csv_writer=csv.writer(fp,delimiter='|')
csv_writer.writerows(csv_result)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--csv_path', type=str, default='/home/wangyuke_i/tmp/feature.csv')
parser.add_argument('--audio_path', type=str, default='/home/florence/data/amr_file_0502_trimed/')
args = parser.parse_args()
get_features_by_folder(args)
代码来自:https://zhuanlan.zhihu.com/p/510550742