提取 MFCC、IMFCC、CQCC、FFT 等各种语音特征

代码来自:https://github.com/rosrad/asvspoof2017

def trim_silence(audio, threshold=0.1, frame_length=2048):
    """Cut leading and trailing low-energy audio based on frame-wise RMS energy.

    Args:
        audio: NumPy array of audio samples.
        threshold: A frame counts as active when its RMS energy is strictly
            greater than this value.
        frame_length: Analysis window length in samples; reduced to
            ``audio.size`` when the signal is shorter than the window.

    Returns:
        The slice of ``audio`` from the sample position of the first active
        frame up to (not including) the sample position of the last active
        frame. The slice is empty when no frame is active, and also when
        exactly one frame is active, because start and stop then coincide.
    """
    # Never analyse with a window longer than the signal itself.
    window = min(frame_length, audio.size)
    rms_energy = librosa.feature.rmse(audio, frame_length=window)
    active = np.nonzero(rms_energy > threshold)
    # Row 1 of the converted result corresponds to the second array returned
    # by np.nonzero (the second axis of the energy matrix).
    sample_positions = librosa.core.frames_to_samples(active)[1]
    if not sample_positions.size:
        return audio[0:0]
    return audio[sample_positions[0]:sample_positions[-1]]


def extract_imfcc(wav_path):
    """Compute inverted-mel cepstral coefficients plus delta and delta-delta.

    The file is loaded at 16 kHz. Its power spectrogram is projected through
    the transposed pseudo-inverse of librosa's mel filter bank, converted to
    dB, and multiplied by a DCT basis with ``n_imfcc`` rows.

    Relies on the module-level settings ``n_fft``, ``hop_length`` and
    ``n_imfcc``.

    Args:
        wav_path: Path of the audio file to read.

    Returns:
        Array built by stacking the coefficients, their first-order deltas
        and their second-order deltas along axis 0.
    """
    audio, sr = librosa.load(wav_path, sr=16000)
    power_spec = np.abs(librosa.core.stft(audio, n_fft=n_fft, hop_length=hop_length)) ** 2.0
    # The transposed pseudo-inverse of the mel filter bank is used as the
    # "inverted" filter bank.
    inverted_basis = np.linalg.pinv(librosa.filters.mel(sr, n_fft)).T
    log_spec = librosa.power_to_db(np.dot(inverted_basis, power_spec))
    imfcc = np.dot(librosa.filters.dct(n_imfcc, log_spec.shape[0]), log_spec)
    imfcc_delta = librosa.feature.delta(imfcc)
    # order=2 yields the second-order difference; previously this repeated the
    # first-order call, so the last third of the feature duplicated the deltas.
    imfcc_delta_delta = librosa.feature.delta(imfcc, order=2)
    return np.concatenate((imfcc, imfcc_delta, imfcc_delta_delta), axis=0)


def extract_mfcc(wav_path):
    """Compute MFCCs plus delta and delta-delta for a silence-trimmed file.

    The file is loaded at 16 kHz and passed through ``trim_silence``; when
    trimming leaves nothing, the untrimmed audio is used instead.

    Relies on the module-level settings ``n_mfcc``, ``n_fft`` and
    ``hop_length``.

    Args:
        wav_path: Path of the audio file to read.

    Returns:
        Array built by stacking the MFCCs, their first-order deltas and
        their second-order deltas along axis 0.
    """
    audio, sr = librosa.load(wav_path, sr=16000)
    y = trim_silence(audio)
    if y.size == 0:
        # Nothing survived trimming: fall back to the full recording.
        y = audio
    mfcc = librosa.feature.mfcc(y, sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    mfcc_delta = librosa.feature.delta(mfcc)
    # order=2 yields the second-order difference; previously this repeated the
    # first-order call, so the last third of the feature duplicated the deltas.
    mfcc_delta_delta = librosa.feature.delta(mfcc, order=2)
    return np.concatenate((mfcc, mfcc_delta, mfcc_delta_delta), axis=0)


def extract_cqt(wav_path):
    """Compute ``librosa.feature.chroma_cqt`` features for a silence-trimmed file.

    The file is loaded at 16 kHz and passed through ``trim_silence``; when
    trimming leaves nothing, the untrimmed audio is analysed instead.

    Relies on the module-level settings ``hop_length``, ``f_min`` and
    ``n_cqt`` (used as ``n_chroma``); ``n_octaves`` is fixed at 5.

    Args:
        wav_path: Path of the audio file to read.

    Returns:
        The array returned by ``librosa.feature.chroma_cqt``.
    """
    waveform, rate = librosa.load(wav_path, sr=16000)
    trimmed = trim_silence(waveform)
    # Fall back to the full recording when trimming removed everything.
    source = waveform if trimmed.size == 0 else trimmed
    return librosa.feature.chroma_cqt(
        source, rate, hop_length=hop_length, fmin=f_min, n_chroma=n_cqt, n_octaves=5
    )


def extract_spect(wav_path):
    """Compute a dB-scaled power spectrogram of an audio file.

    The file is loaded at 16 kHz and analysed with ``n_fft=1000`` and
    ``hop_length=100``. Silence trimming (``trim_silence(audio, 0.01)``) is
    currently disabled for this feature.

    Args:
        wav_path: Path of the audio file to read.

    Returns:
        The power spectrogram converted with ``librosa.power_to_db``.
    """
    waveform, _ = librosa.load(wav_path, sr=16000)
    # NOTE: _spectrogram is a private librosa helper; only its first return
    # value (the spectrogram) is used.
    power_spec, _ = librosa.core.spectrum._spectrogram(
        waveform, hop_length=100, n_fft=1000, power=2
    )
    return librosa.power_to_db(power_spec)


def extract_fft(wav_path):
    """Compute a normalised log-magnitude STFT of an audio file.

    Pipeline: load at the module-level ``sample_rate``, apply a pre-emphasis
    filter (coefficient 0.97), take the STFT (``n_fft=1800``,
    ``hop_length=150``), convert magnitudes to dB with a 1e-5 floor, subtract
    a 20 dB reference level, then map [-100 dB, 0 dB] linearly onto [0, 1]
    with clipping.

    Args:
        wav_path: Path of the audio file to read.

    Returns:
        Array of values in [0, 1] with the same shape as the STFT matrix.
    """
    preemphasis_coef = 0.97
    floor_db = -100
    reference_db = 20
    fft_size = 1800
    hop = 150
    # NOTE(review): earlier revisions also defined win_length = 1500 but never
    # passed it to librosa.stft, so librosa's default window length applies.
    # Kept that way here to leave the output unchanged.

    waveform = librosa.core.load(wav_path, sr=sample_rate)[0]
    emphasized = signal.lfilter([1, -preemphasis_coef], [1], waveform)
    spectrum = librosa.stft(y=emphasized, n_fft=fft_size, hop_length=hop)
    # Magnitudes are floored at 1e-5 before the log.
    level_db = 20 * np.log10(np.maximum(1e-5, np.abs(spectrum))) - reference_db
    return np.clip((level_db - floor_db) / -floor_db, 0, 1)


def extract_db4(wav_path):
    """Return the 'db4' wavelet approximation coefficients of a dB spectrogram.

    The file is loaded at 16 kHz, turned into a power spectrogram
    (``n_fft=1500``, ``hop_length=150``), converted to dB, and passed through
    a single-level ``pywt.dwt`` with the 'db4' wavelet.

    Args:
        wav_path: Path of the audio file to read.

    Returns:
        The approximation coefficients (first output of ``pywt.dwt``); the
        detail coefficients are discarded.
    """
    waveform, _ = librosa.load(wav_path, sr=16000)
    power_spec, _ = librosa.core.spectrum._spectrogram(
        waveform, hop_length=150, n_fft=1500, power=2
    )
    approx, _detail = pywt.dwt(librosa.power_to_db(power_spec), 'db4')
    return approx


def extract_db8(wav_path):
    """Return the 'db8' wavelet approximation coefficients of a dB spectrogram.

    The file is loaded at 16 kHz, turned into a power spectrogram
    (``n_fft=1500``, ``hop_length=150``), converted to dB, and passed through
    a single-level ``pywt.dwt`` with the 'db8' wavelet.

    Args:
        wav_path: Path of the audio file to read.

    Returns:
        The approximation coefficients (first output of ``pywt.dwt``); the
        detail coefficients are discarded.
    """
    waveform, _ = librosa.load(wav_path, sr=16000)
    power_spec, _ = librosa.core.spectrum._spectrogram(
        waveform, hop_length=150, n_fft=1500, power=2
    )
    approx, _detail = pywt.dwt(librosa.power_to_db(power_spec), 'db8')
    return approx


def extract_raw(wav_path):
    """Load an audio file at 16 kHz and return the waveform unchanged.

    Silence trimming (``trim_silence(audio, threshold=0.05)`` with a fallback
    to the full signal) is currently disabled for this feature.

    Args:
        wav_path: Path of the audio file to read.

    Returns:
        The sample array returned by ``librosa.load``.
    """
    waveform, _ = librosa.load(wav_path, sr=16000)
    return waveform
  • 3
    点赞
  • 19
    收藏
    觉得还不错? 一键收藏
  • 6
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值