Audio Feature Extraction

import argparse
import csv
import os

import librosa
import numpy as np
import scipy.signal


class audio:
    def __init__(self, input_file, sr=None, frame_len=512, n_fft=None, win_step=2 / 3, window="hamming"):
        """
        初始化
        :param input_file: 输入音频文件
        :param sr: 所输入音频文件的采样率,默认为None
        :param frame_len: 帧长,默认512个采样点(32ms,16kHz),与窗长相同
        :param n_fft: FFT窗口的长度,默认与窗长相同
        :param win_step: 窗移,默认移动2/3,512*2/3=341个采样点(21ms,16kHz)
        :param window: 窗类型,默认汉明窗
        """
        self.input_file = input_file
        self.frame_len = frame_len  # frame length in samples
        self.wave_data, self.sr = librosa.load(self.input_file, sr=sr)
        self.window_len = frame_len  # window length, 512 samples
        if n_fft is None:
            self.fft_num = self.window_len  # number of FFT points equals the window length
        else:
            self.fft_num = n_fft
        self.win_step = win_step
        self.hop_length = round(self.window_len * win_step)  # hop (frame shift) is 2/3 of the window, so the overlap is 1/3 of the window length (a typical choice is 1/3 to 1/2)
        self.window = window


    def energy(self):
        """
        Energy of each frame: the sum of the squared spectral magnitudes over all frequency bins of the frame.
        :return: per-frame energy, np.ndarray[shape=(n_frames,), dtype=float64]
        """
        mag_spec = np.abs(librosa.stft(self.wave_data, n_fft=self.fft_num, hop_length=self.hop_length,
                                       win_length=self.frame_len, window=self.window))
        pow_spec = np.square(mag_spec)  # [frequency, time (n_frames)]
        energy = np.sum(pow_spec, axis=0)  # [n_frames]
        energy = np.where(energy == 0, np.finfo(np.float64).eps,
                          energy)  # replace zeros so a later log cannot fail (eps is the smallest positive float64, np.finfo(np.float64).eps = 2.220446049250313e-16)
        return energy

    def short_time_energy(self):
        """
        Short-time energy of the speech signal: the sum of squared samples within each frame.
        :return: list of per-frame short-time energies (samples are normalized to [-1, 1], so with a frame length of 512 the maximum value is 512),
        np.ndarray[shape=(n_frames,), dtype=float64], frames taken without windowing and without overlap
        """
        energy = []  # list of per-frame short-time energies
        energy_sum_per_frame = 0  # running sum of squared samples within the current frame
        for i in range(len(self.wave_data)):  # iterate over every sample
            energy_sum_per_frame += self.wave_data[i] ** 2  # accumulate the squared sample values
            if (i + 1) % self.frame_len == 0:  # all samples of the current frame have been visited
                energy.append(energy_sum_per_frame)  # append to the short-time energy list
                energy_sum_per_frame = 0  # reset the running sum
            elif i == len(self.wave_data) - 1:  # last sample of an incomplete final frame
                energy.append(energy_sum_per_frame)  # append the energy of the final partial frame
        energy = np.array(energy)
        energy = np.where(energy == 0, np.finfo(np.float64).eps, energy)  # replace zeros so a later log cannot fail (eps is the smallest positive float64)
        return energy
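
    # Aside (added in this write-up, not part of the original post): a rough vectorized sketch
    # of the same per-frame sum of squares, assuming the trailing partial frame is simply dropped:
    #     n_full = len(self.wave_data) // self.frame_len * self.frame_len
    #     frames = self.wave_data[:n_full].reshape(-1, self.frame_len)
    #     ste = np.sum(np.square(frames), axis=1)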

    def intensity(self):
        """
        Sound intensity, expressed as the Sound Pressure Level (SPL) of each frame in air, in dB.
        Formula: SPL = 20*lg(P/Pref), where P is the sound pressure (Pa) and Pref is the reference
        pressure (the hearing threshold, conventionally 2.0e-5 Pa in air; this code uses 1.0e-6 Pa).
        Here P is derived from the sample amplitudes: the mean of the squared amplitudes of each frame
        is divided by Pref squared and 10*lg of the result is taken.
        :return: per-frame SPL in dB, np.ndarray[shape=(n_frames,), dtype=float64], frames taken without windowing and without overlap
        """
        p0 = 1.0e-6  # reference pressure used here; the standard auditory threshold pressure is 2.0e-5 Pa
        e = self.short_time_energy()
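        # Since e[k] is the sum of squared samples in frame k, the expression below equals
        # 10*log10(mean(x^2) / p0^2) = 20*log10(x_rms / p0), i.e. the SPL of each frame's RMS
        # amplitude relative to p0.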
        spl = 10 * np.log10(1 / (np.power(p0, 2) * self.frame_len) * e)
        return spl

    def zero_crossing_rate(self):
        """
        Short-time zero-crossing rate: the number of times the signal crosses the horizontal axis (zero) within each frame.
        :return: per-frame zero-crossing counts, np.ndarray[shape=(n_frames,), dtype=uint32], frames taken without windowing and without overlap
        """
        zcr = []  # list of per-frame zero-crossing counts
        counting_sum_per_frame = 0  # running count of zero crossings within the current frame
        for i in range(len(self.wave_data)):  # iterate over every sample
            if i % self.frame_len == 0:  # the first sample of each frame has no predecessor within the frame, so skip it
                continue
            if self.wave_data[i] * self.wave_data[i - 1] < 0:  # a negative product of adjacent samples means the signal crossed zero
                counting_sum_per_frame += 1  # increment the crossing count
            if (i + 1) % self.frame_len == 0:  # all samples of the current frame have been visited
                zcr.append(counting_sum_per_frame)  # append to the zero-crossing list
                counting_sum_per_frame = 0  # reset the running count
            elif i == len(self.wave_data) - 1:  # last sample of an incomplete final frame
                zcr.append(counting_sum_per_frame)  # append the count of the final partial frame
        return np.array(zcr, dtype=np.uint32)
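
    # Aside (added in this write-up, not part of the original post): librosa also provides
    # librosa.feature.zero_crossing_rate(y, frame_length=..., hop_length=..., center=...),
    # which returns the per-frame *fraction* of zero crossings rather than the raw counts
    # computed above, so the two are related but not numerically identical.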

    def pitch(self, ts_mag=0.25):
        """
        Per-frame pitch candidates, i.e. the fundamental frequency plus its harmonics: the smallest
        nonzero candidate in a frame is the fundamental (first harmonic), the others are the second,
        third, ... harmonics. Each harmonic is a multiple of the fundamental, so the fundamental also
        equals any harmonic divided by its order, or, more precisely, the sum of all harmonics divided
        by the sum of their orders.
        :param ts_mag: magnitude threshold factor, > 0; a pitch candidate is kept only if its magnitude
                       exceeds np.average(magnitudes[np.nonzero(magnitudes)]) * ts_mag, default 0.25
        :return: per-frame pitch candidates and the magnitudes (> 0) of the corresponding peaks,
                 np.ndarray[shape=(1 + n_fft/2, n_frames), dtype=float32], here (257, total samples / (512 * 2/3) + 1)

        usage:
        pitches, mags = self.pitch()  # per-frame pitch candidates (returned as nested lists)
        pitches = np.array(pitches)
        f0_likely = []  # likely fundamental frequency F0 of each frame
        for i in range(pitches.shape[1]):  # the smallest nonzero value in each column is the likely F0 of that frame
            try:
                f0_likely.append(np.min(pitches[np.nonzero(pitches[:, i]), i]))
            except ValueError:
                f0_likely.append(np.nan)  # if a whole column (frame) is zero, use nan as its minimum
        f0_all = np.array(f0_likely)
        """
        mag_spec = np.abs(librosa.stft(self.wave_data, n_fft=self.fft_num, hop_length=self.hop_length,
                                       win_length=self.frame_len, window=self.window))
        pitches, magnitudes = librosa.piptrack(S=mag_spec, sr=self.sr, threshold=1.0, ref=np.mean,
                                               fmin=50, fmax=500)  # 50-500 Hz roughly covers the possible F0 range of normal human speech
        ts = np.average(magnitudes[np.nonzero(magnitudes)]) * ts_mag
        pit_likely = pitches
        mag_likely = magnitudes
        pit_likely[magnitudes < ts] = 0
        mag_likely[magnitudes < ts] = 0
        return pit_likely.tolist(), mag_likely.tolist()

    def preemphasis(self, coef=0.97, zi=None, return_zf=False):
        """Pre-emphasize an audio signal with a first-order auto-regressive filter:

            y[n] -> y[n] - coef * y[n-1]


        Parameters
        ----------
        coef : positive number
            Pre-emphasis coefficient.  Typical values of ``coef`` are between 0 and 1.

            At the limit ``coef=0``, the signal is unchanged.

            At ``coef=1``, the result is the first-order difference of the signal.

            The default (0.97) matches the pre-emphasis filter used in the HTK
            implementation of MFCCs [#]_.

            .. [#] http://htk.eng.cam.ac.uk/

        zi : number
            Initial filter state.  When making successive calls to non-overlapping
            frames, this can be set to the ``zf`` returned from the previous call.
            (See example below.)

            By default ``zi`` is initialized as ``2*y[0] - y[1]``.

        return_zf : boolean
            If ``True``, return the final filter state.
            If ``False``, only return the pre-emphasized signal.

        Returns
        -------
        y_out : np.ndarray
            pre-emphasized signal

        zf : number
            if ``return_zf=True``, the final filter state is also returned
        """
        y = self.wave_data
        b = np.asarray([1.0, -coef], dtype=y.dtype)
        a = np.asarray([1.0], dtype=y.dtype)

        if zi is None:
            # Initialize the filter to implement linear extrapolation
            zi = 2 * y[..., 0] - y[..., 1]

        zi = np.atleast_1d(zi)

        y_out, z_f = scipy.signal.lfilter(b, a, y, zi=np.asarray(zi, dtype=y.dtype))

        if return_zf:
            return y_out, z_f

        return y_out
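
# Example referenced in the preemphasis docstring above: a minimal sketch (added in this
# write-up, not part of the original post) of chaining the ``zi``/``zf`` filter state across
# successive chunks of a long signal with scipy.signal.lfilter; ``chunks`` stands for a
# hypothetical list of consecutive, non-overlapping np.ndarray segments:
#     b, a = [1.0, -0.97], [1.0]
#     zf = np.atleast_1d(2 * chunks[0][0] - chunks[0][1])  # linear-extrapolation initial state
#     out = []
#     for chunk in chunks:
#         y_chunk, zf = scipy.signal.lfilter(b, a, chunk, zi=zf)
#         out.append(y_chunk)
#     y_pre = np.concatenate(out)
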
def get_max_min_avg(data):
    '''
    :param data: sequence of per-frame feature values
    :return: max, min, mean, variance, standard deviation, sum, median
    '''
    data = np.array(data)
    # print(data.max(), data.min(), data.mean(), data.var(), data.std(), data.sum(), np.median(data))
    return data.max(), data.min(), data.mean(), data.var(), data.std(), data.sum(), np.median(data)

def get_feature(filename):
    '''
    :param filename: path of the audio file
    :return: [file path, energy, short-time energy, intensity, zero-crossing rate, pitch (fundamental and harmonics), harmonic magnitudes]
    '''
    audio_process = audio(filename)
    pit, mag = audio_process.pitch()
    data = [filename,
            get_max_min_avg(audio_process.energy()),
            get_max_min_avg(audio_process.short_time_energy()),
            get_max_min_avg(audio_process.intensity()),
            get_max_min_avg(audio_process.zero_crossing_rate()),
            get_max_min_avg(pit),
            get_max_min_avg(mag)]
    return data

def get_features_by_folder(args):
    csv_result = [['file path', 'energy', 'short-time energy', 'intensity', 'zero-crossing rate', 'pitch', 'harmonic magnitudes']]
    for filename in os.listdir(args.audio_path):
        if filename.endswith('.wav'):
            features = get_feature(os.path.join(args.audio_path, filename))
            csv_result.append(features)

    with open(args.csv_path, 'w', encoding='utf-8', newline='') as fp:
        csv_writer = csv.writer(fp, delimiter='|')
        csv_writer.writerows(csv_result)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--csv_path', type=str, default='/home/wangyuke_i/tmp/feature.csv')
    parser.add_argument('--audio_path', type=str, default='/home/florence/data/amr_file_0502_trimed/')
    args = parser.parse_args()

    get_features_by_folder(args)
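
Usage: assuming the listing above is saved as a script (the filename below is only illustrative), it can be run over a folder of .wav files and writes one '|'-delimited row of statistics per file:

    python audio_features.py --audio_path /path/to/wav_folder --csv_path /path/to/feature.csv

Each feature column then holds the (max, min, mean, var, std, sum, median) tuple returned by get_max_min_avg, written by csv.writer as its string representation.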

Code from: https://zhuanlan.zhihu.com/p/510550742
