Multi-band spectral subtraction (with Python source code) @Learning Speech enhancement__2

#!/usr/bin/env python
# encoding: utf-8
'''
@author: Kolor
@license:
@contact: colorsu1922@163.com
@software: pycharm
@file: mband.py
@time: 2020/12/27 14:22
@desc:
'''

import librosa
import numpy as np
from scipy.signal import lfilter
import numpy.linalg as LA
import soundfile as sf


def mband(infile, outfile, n_band=6, freq_spacing='linear'):
    """
    Implements the multi-band spectral subtraction algorithm [1].

    Usage: mband(infile, outfile, n_band, freq_spacing)

        infile       - noisy speech file in .wav format
        outfile      - enhanced output file in .wav format
        n_band       - number of frequency bands (4-8 recommended)
        freq_spacing - type of frequency spacing for the bands, choices:
                       'linear', 'log' and 'mel'

    Example call: mband('sp04_babble_sn10.wav', 'out_mband.wav', 6, 'linear')

    References:
      [1] Kamath, S. and Loizou, P. (2002). A multi-band spectral subtraction
          method for enhancing speech corrupted by colored noise. Proc. IEEE
          Int. Conf. Acoust., Speech, Signal Processing.

    Authors: Sunil Kamath and Philipos C. Loizou

    Copyright (c) 2006 by Philipos C. Loizou
    $Revision: 0.0 $  $Date: 10/09/2006 $
    -------------------------------------------------------------------------
    """

    AVERAGING = True    # smooth and weight the magnitude spectra over time
    FRAME_SIZE = 20     # frame length in ms
    OVERLAP_PERS = 50   # frame overlap in percent
    NOISE_FRAME = 6     # number of leading frames assumed to be noise-only
    FLOOR = 0.001       # spectral floor factor
    VAD = 1             # update the noise estimate during silence frames

    xin, fs = librosa.load(infile, sr=None, dtype=np.float64)



    # frame size in samples
    frm_len = int(np.floor(FRAME_SIZE * fs / 1000))
    # number of overlapping samples between adjacent frames
    ovlp_len = int(np.floor(frm_len * OVERLAP_PERS / 100))
    # number of new samples per frame (the hop size; equal to ovlp_len at 50% overlap)
    common_len = frm_len - ovlp_len

    # FFT length: next power of two >= frame length
    fft_len = 2
    while fft_len < frm_len:
        fft_len = fft_len * 2

    band_size = np.zeros((n_band,), dtype=int)
    lo_bin = np.zeros((n_band,), dtype=int)
    hi_bin = np.zeros((n_band,), dtype=int)



    if freq_spacing.lower() == 'linear':
        band_size[0] = int(np.floor((fft_len / 2 + 1) / n_band))
        # Distribute the bins as evenly as possible; the last band absorbs the
        # remainder, so it ends up with the most frequency bins.
        for i in range(0, n_band):
            lo_bin[i] = i * band_size[0] + 1
            hi_bin[i] = lo_bin[i] + band_size[0] - 1
            band_size[i] = band_size[0]
        band_size[n_band - 1] = int(fft_len / 2 + 1) % band_size[0] + band_size[0]
        hi_bin[n_band - 1] = lo_bin[n_band - 1] + band_size[n_band - 1] - 1
    elif freq_spacing.lower() == 'log':
        lof, midf, hif = log_filter(n_band, fs)
        lo_bin = np.round(lof * fft_len / fs) + 1
        hi_bin = np.round(hif * fft_len / fs) + 1
        band_size = hi_bin - lo_bin + 1
    elif freq_spacing.lower() == 'mel':
        lof, midf, hif = mel_filter(n_band, 0, fs / 2)
        lo_bin = np.round(lof * fft_len / fs) + 1
        hi_bin = np.round(hif * fft_len / fs) + 1

        lo_bin[0] = 1
        # like hi_bin(end) in MATLAB
        hi_bin[-1] = fft_len / 2 + 1
        band_size = hi_bin - lo_bin + 1
    else:
        raise ValueError("freq_spacing supports 'linear', 'log' and 'mel' only")
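    # Worked example, assuming fs = 8000 Hz and the defaults above: frm_len is
    # 160 samples, so fft_len = 256 and there are 129 half-spectrum bins; with
    # n_band = 6 and 'linear' spacing each band gets floor(129 / 6) = 21 bins
    # and the last band gets 21 + 129 % 21 = 24 bins, covering bins 106..129.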



    # square-root Hamming analysis window
    win = np.sqrt(np.hamming(frm_len))

    # Estimate the noise magnitude spectrum from the first NOISE_FRAME frames;
    # the estimate is stored in n_spect.
    noise_pow = np.zeros(fft_len)
    j = 0
    for k in range(NOISE_FRAME):
        n_fft = np.fft.fft(xin[j:j + frm_len] * win, fft_len)
        n_mag = np.abs(n_fft)
        noise_pow = noise_pow + n_mag ** 2
        j = j + frm_len

    n_spect = np.sqrt(noise_pow / NOISE_FRAME)



    # input to the noise reduction part
    x = xin

    # Frame the signal with window = 20 ms and overlap = 10 ms;
    # the output is a matrix with each column representing a frame.
    framed_x = frame(x, win, ovlp_len, 0, 0)
    [tmp, nframes] = framed_x.shape

    # start processing
    x_win = framed_x
    x_fft = np.fft.fft(x_win, n=fft_len, axis=0)
    x_mag = np.abs(x_fft)
    x_ph = np.angle(x_fft)
    x_magsm = np.zeros(x_mag.shape)



    if AVERAGING:
        # smooth the input spectrum along time with a first-order IIR filter
        filtb = [0.9, 0.1]
        x_magsm[:, 0] = lfilter(filtb, 1, x_mag[:, 0])
        for i in range(1, nframes):
            # prime the filter with one sample from the previous frame
            x_tmp1 = np.hstack((x_mag[frm_len - ovlp_len - 1, i - 1], x_mag[:, i]))
            x_tmp2 = lfilter(filtb, 1, x_tmp1)
            x_magsm[:, i] = x_tmp2[1:]

        # weighted spectral estimate over neighboring frames
        Wn2, Wn1 = 0.09, 0.25
        W0, W1, W2 = 0.32, 0.25, 0.09
        x_magsm[:, 0] = W0 * x_magsm[:, 0] + W1 * x_magsm[:, 1] + W2 * x_magsm[:, 2]
        x_magsm[:, 1] = Wn1 * x_magsm[:, 0] + W0 * x_magsm[:, 1] + W1 * x_magsm[:, 2] + W2 * x_magsm[:, 3]
        for i in range(2, nframes - 2):
            x_magsm[:, i] = Wn2 * x_magsm[:, i - 2] + Wn1 * x_magsm[:, i - 1] + \
                            W0 * x_magsm[:, i] + W1 * x_magsm[:, i + 1] + W2 * x_magsm[:, i + 2]
        x_magsm[:, nframes - 2] = Wn2 * x_magsm[:, nframes - 4] + Wn1 * x_magsm[:, nframes - 3] + \
                                  W0 * x_magsm[:, nframes - 2] + W1 * x_magsm[:, nframes - 1]
        x_magsm[:, nframes - 1] = Wn2 * x_magsm[:, nframes - 3] + Wn1 * x_magsm[:, nframes - 2] + \
                                  W0 * x_magsm[:, nframes - 1]
    else:
        x_magsm = x_mag



    # noise update during silence frames
    if VAD:
        [n_spect, state] = noise_update(x_magsm, n_spect, common_len, nframes)
    else:
        # keep the initial noise estimate fixed for all frames
        # (n_spect is still 1-D here, so tile it into one column per frame)
        n_spect = np.tile(n_spect[:, None], (1, nframes))



    # calculate the segmental SNR in each band
    SNR_x = np.zeros((n_band, nframes))
    for i in range(0, n_band - 1):
        start = int(lo_bin[i]) - 1
        stop = int(hi_bin[i])
        for j in range(0, nframes):
            SNR_x[i, j] = 10 * np.log10((LA.norm(x_magsm[start:stop, j], 2) ** 2)
                                        / (LA.norm(n_spect[start:stop, j], 2) ** 2))

    # the last band runs up to the end of the half spectrum (bin fft_len/2 + 1)
    start = int(lo_bin[n_band - 1]) - 1
    stop = int(fft_len / 2) + 1
    for j in range(0, nframes):
        SNR_x[n_band - 1, j] = 10 * np.log10(LA.norm(x_magsm[start:stop, j], 2) ** 2
                                             / LA.norm(n_spect[start:stop, j], 2) ** 2)

    # band-specific over-subtraction factors from the segmental SNR
    beta_x = berouti(SNR_x)



    # start the subtractive procedure
    sub_speech_x = np.zeros((int(fft_len / 2) + 1, nframes))
    for i in range(0, n_band):
        sub_speech = np.zeros((int(band_size[i]), nframes))
        start = int(lo_bin[i]) - 1
        stop = int(hi_bin[i])
        # extra over-subtraction weight: 1.0 for the first band, 2.5 for the rest
        delta = 1.0 if i == 0 else 2.5
        for j in range(0, nframes):
            n_spec_sq = n_spect[start:stop, j] ** 2
            sub_speech[:, j] = x_magsm[start:stop, j] ** 2 - delta * beta_x[i, j] * n_spec_sq

        # spectral floor: clamp negative values to a fraction of the noisy power
        x_tmp = x_magsm[start:stop, :]
        sub_speech[sub_speech < 0] = FLOOR * x_tmp[sub_speech < 0] ** 2
        # add back a small noisy-spectrum residual (0.01 in the last band, 0.05 otherwise)
        residual = 0.05 if i < n_band - 1 else 0.01
        sub_speech = sub_speech + residual * x_magsm[start:stop, :] ** 2
        sub_speech_x[start:stop, :] = sub_speech_x[start:stop, :] + sub_speech



    # reconstruct the whole (conjugate-symmetric) power spectrum
    out_x = np.zeros((fft_len, nframes))
    out_x[0:int(fft_len / 2) + 1, :] = sub_speech_x
    out_x[int(fft_len / 2) + 1:fft_len, :] = np.flipud(sub_speech_x[1:int(fft_len / 2)])

    # back to magnitude (out_x holds power) and re-attach the noisy phase
    y1_fft = out_x ** 0.5 * np.exp(1j * x_ph)

    # force the DC and Nyquist bins to be real to ensure a real signal
    y1_fft[0, :] = np.real(y1_fft[0, :])
    y1_fft[int(fft_len / 2), :] = np.real(y1_fft[int(fft_len / 2), :])

    # take the IFFT
    y1_ifft = np.fft.ifft(y1_fft, axis=0)
    y1_r = np.real(y1_ifft)

    # overlap and add
    y1 = np.zeros(common_len * (nframes + 1))
    y1[0:frm_len] = y1_r[0:frm_len, 0]
    start = frm_len - ovlp_len
    mid = start + ovlp_len
    stop = start + frm_len

    for i in range(1, nframes):
        y1[start:mid] = y1[start:mid] + y1_r[0:ovlp_len, i]
        y1[mid:stop] = y1_r[ovlp_len:frm_len, i]
        start = mid
        mid = start + ovlp_len
        stop = start + frm_len
    out = y1

    # write the output
    sf.write(outfile, out, fs, 'PCM_16')




def berouti(snr: np.ndarray):
    """Berouti over-subtraction factor as a function of segmental SNR (dB)."""
    [n_bands, n_frames] = np.shape(snr)
    alp = np.zeros((n_bands, n_frames))

    for i in range(0, n_bands):
        for j in range(0, n_frames):
            if -5.0 <= snr[i, j] <= 20:
                alp[i, j] = 4 - snr[i, j] * 3 / 20
            elif snr[i, j] < -5.0:
                alp[i, j] = 4.75
            else:
                alp[i, j] = 1

    return alp
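

# A vectorized equivalent of berouti(), given as a sketch (berouti_vec is not
# part of the original script): np.piecewise applies the same piecewise rule
# and returns identical values without the double loop.
def berouti_vec(snr: np.ndarray) -> np.ndarray:
    return np.piecewise(
        snr,
        [snr < -5.0, (snr >= -5.0) & (snr <= 20.0), snr > 20.0],
        [4.75, lambda s: 4 - s * 3 / 20, 1.0],
    )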




def noise_update(x_magsm: np.ndarray, n_spect: np.ndarray, common_len, nframes):
    """Update the noise spectrum estimate frame by frame with a simple VAD."""
    SPEECH = 1
    SILENCE = 0
    i = 0
    n_spect_update = np.zeros((np.size(n_spect), nframes))
    x_var = x_magsm[:, i] ** 2
    n_var = n_spect[:] ** 2
    rti = x_var / n_var - np.log10(x_var / n_var) - 1
    judge_value = np.mean(rti, axis=0)
    state = np.zeros(nframes * common_len)
    n_spect_update[:, i] = n_spect[:]
    if judge_value > 0.4:
        state[0:common_len] = SPEECH
    else:
        state[0:common_len] = SILENCE
        # recursively average the noise estimate during silence
        n_spect_update[:, i] = np.sqrt(0.9 * n_spect_update[:, i] ** 2 + (1 - 0.9) * x_magsm[:, i] ** 2)

    for i in range(1, nframes):
        x_var = x_magsm[:, i] ** 2
        n_var = n_spect_update[:, i - 1] ** 2
        rti = x_var / n_var - np.log10(x_var / n_var) - 1
        judge_value = np.mean(rti, axis=0)
        if judge_value > 0.45:
            state[i * common_len: (i + 1) * common_len] = SPEECH
            n_spect_update[:, i] = n_spect_update[:, i - 1]
        else:
            state[i * common_len: (i + 1) * common_len] = SILENCE
            n_spect_update[:, i] = np.sqrt(0.9 * n_spect_update[:, i - 1] ** 2 + (1 - 0.9) * x_magsm[:, i] ** 2)

    return n_spect_update, state
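
# Note on the decision rule above: with xi = x_var / n_var, the statistic
# rti = xi - log10(xi) - 1 resembles an Itakura-Saito-style distortion between
# the frame power and the noise estimate. Its mean stays small when the frame
# power matches the noise estimate (xi near 1) and grows when speech is
# present, so thresholding it (0.4 / 0.45) acts as a crude VAD.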




def frame(sdata: np.ndarray, window: np.ndarray, frmshift, offset=0, trunc=0) -> np.ndarray:
    """
    frame(): place vector data into a matrix of frames

    fdata = frame(sdata, window, frmshift, [offset], [trunc])

    This function places the sampled data in vector sdata into a matrix of
    frame data.

    :param sdata: sampled data; must be a vector

    :param window: a windowing vector (e.g. Hamming) applied to each frame of
        sampled data; it must be specified because it defines the length of
        each frame in samples.

    :param frmshift: the number of samples to shift between frames; set it
        equal to the window size for no overlap.

    :param offset: the offset from the first sample to be used for
        processing. If not specified, it is set to 0, which means the first
        sample of sdata is the first sample of the frame data. The value of
        offset can be negative, in which case the data is initially padded
        with zero samples.

    :param trunc: an optional flag specifying that sample data at the end
        should be truncated, so that the last frame contains only valid
        samples and no zero padding is done at the end of the sample data to
        fill a frame. This means some sample data at the end will be lost.
        The default is not to truncate, but to pad with zero samples until
        all sample data is represented in a frame.

    :return: a matrix of frames, one frame per column
    """



    if sdata.ndim == 1:
        sdata = sdata.reshape(1, -1)
        ndata = sdata.size
    else:
        raise RuntimeError('frame: sdata must be vector')

    if window.ndim == 1:
        window = window.reshape(1, -1)
        nwind = window.size
    else:
        raise RuntimeError('frame: window must be vector')

    if frmshift <= 0:
        raise RuntimeError('frame: shift must be positive')

    # resize data based on offset
    if offset > 0:
        sdata = sdata[0, offset: max(offset, ndata)]
        sdata = sdata.reshape(1, -1)
        ndata = sdata.size
    elif offset < 0:
        zz = np.zeros((1, abs(offset)))
        sdata = np.hstack((zz, sdata))
        sdata = sdata.reshape(1, -1)
        ndata = sdata.size

    if trunc > 0:
        nframes = int(np.floor((ndata - nwind) / frmshift + 1))
    else:
        nframes = int(np.ceil(ndata / frmshift))

    tdata = np.zeros((nwind, nframes))
    dowind = np.any(window != 1)
    ixstrt = 0
    fstart = np.zeros(nframes)

    for frm in range(0, nframes):
        ixend = min(ndata, ixstrt + nwind)
        ixlen = ixend - ixstrt
        tdata[0:ixlen, frm] = np.array(sdata[0, ixstrt:ixend]).transpose()
        fstart[frm] = ixstrt
        ixstrt = ixstrt + frmshift

    # dowind is a numpy bool, so test its truth value directly
    # (comparing it with None would always apply the window)
    if dowind:
        fdata = scale_mtx(tdata, window, 1)
    else:
        fdata = tdata

    return fdata
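
# Quick sanity check for frame(), under the same call pattern mband() uses
# (the numbers below are illustrative, not from the original post):
# frame(np.arange(8.0), np.ones(4), 2) returns a (4, 4) matrix whose columns
# start at samples 0, 2, 4 and 6; the last column is zero-padded.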




def scale_mtx(tdata: np.ndarray, window: np.ndarray, flag):
    """Multiply each column (frame) of tdata by the window; 'flag' is unused."""
    [dros, dcol] = np.shape(tdata)
    [wros, wcol] = np.shape(window)
    if wros > wcol:
        window = window.transpose()

    fdata = np.zeros(np.shape(tdata))
    for i in range(0, dcol):
        # flatten the (1, nwind) window so shapes match exactly
        fdata[:, i] = tdata[:, i] * window.ravel()

    return fdata




def log_filter(n_ch: int, samp_rate):
    """Return the lower, center and upper edges of n_ch log-spaced bands."""
    upper_freq = samp_rate / 2
    lower_freq = 1
    filter_range = np.log10(upper_freq / lower_freq)
    interval = filter_range / n_ch
    upper = np.zeros(n_ch)
    center = np.zeros(n_ch)
    lower = np.zeros(n_ch)
    for i in range(1, n_ch + 1):
        upper[i - 1] = lower_freq * 10 ** (interval * i)
        lower[i - 1] = lower_freq * 10 ** (interval * (i - 1))
        center[i - 1] = 0.5 * (upper[i - 1] + lower[i - 1])

    return lower, center, upper




def mel_filter(N, low, high):
    """
    This function returns the lower, center and upper frequencies
    of the filters equally spaced on the mel scale.

    Input: N - number of filters
           low - (left-edge) 3 dB frequency of the first filter
           high - (right-edge) 3 dB frequency of the last filter

    Copyright (c) 1996-97 by Philipos C. Loizou
    """
    ac = 1100
    fc = 800
    LOW = ac * np.log(1 + low / fc)
    HIGH = ac * np.log(1 + high / fc)
    N1 = N + 1
    fmel = np.zeros(N1)
    seq = np.linspace(1, N1, N1)
    fmel[0:N1] = LOW + seq * (HIGH - LOW) / N1
    cen2 = fc * (np.exp(fmel / ac) - 1)

    lower = cen2[0:N]
    upper = cen2[1:N + 1]
    center = 0.5 * (lower + upper)
    return lower, center, upper
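

# A minimal usage sketch (not part of the original script); the file names are
# placeholders taken from the example call in the mband() docstring:
if __name__ == '__main__':
    mband('sp04_babble_sn10.wav', 'out_mband.wav', n_band=6, freq_spacing='linear')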
