#!/usr/bin/env python
# encoding: utf-8
'''
@author: Kolor
@license:
@contact: colorsu1922@163.com
@software: pycharm
@file: mband.py
@time: 2020/12/27 14:22
@desc:
'''
import librosa
import numpy as np
from scipy.signal import lfilter
import numpy.linalg as LA
import soundfile as sf
def mband(infile, outfile, n_band=6, freq_spacing='linear'):
"""
Implements the multi-band spectral subtraction algorithm [1].
Usage: mband(infile, outputfile,Nband,Freq_spacing)
infile - noisy speech file in .wav format
outputFile - enhanced output file in .wav format
Nband - Number of frequency bands (recommended 4-8)
Freq_spacing - Type of frequency spacing for the bands, choices:
'linear', 'log' and 'mel'
Example call: mband('sp04_babble_sn10.wav','out_mband.wav',6,'linear');
References:
[1] Kamath, S. and Loizou, P. (2002). A multi-band spectral subtraction
method for enhancing speech corrupted by colored noise. Proc. IEEE Int.
Conf. Acoust.,Speech, Signal Processing
Authors: Sunil Kamath and Philipos C. Loizou
Copyright (c) 2006 by Philipos C. Loizou
$Revision: 0.0 $ $Date: 10/09/2006 $
-------------------------------------------------------------------------
"""
AVERAGING = True
FRAME_SIZE = 20
OVERLAP_PERS = 50
NOISE_FRAME = 6
FLOOR = 0.001
VAD = 1
xin, fs = librosa.load(infile, sr=None, dtype=np.float64)
# frame size in samples
frm_len = int(np.floor(FRAME_SIZE * fs / 1000))
# number of overlap samples
ovlp_len = int(np.floor(frm_len * OVERLAP_PERS / 100))
# number of common samples between adjacent frames
common_len = frm_len - ovlp_len
fft_len = 2
while fft_len < frm_len:
fft_len = fft_len * 2
band_size = np.zeros((n_band, ), dtype=int)
lo_bin = np.zeros((n_band, ), dtype=int)
hi_bin = np.zeros((n_band, ), dtype=int)
if (freq_spacing == 'linear') or (freq_spacing == 'LINEAR'):
band_size[0] = int(np.floor((fft_len/2 + 1) / n_band))
# 尽可能均匀分布,最后一个子带的freq_bin最大
for i in range(0, n_band):
lo_bin[i] = i * band_size[0] + 1
hi_bin[i] = lo_bin[i] + band_size[0] - 1
band_size[i] = band_size[0]
band_size[n_band-1] = (fft_len / 2 + 1) % band_size[0] + band_size[0]
hi_bin[n_band-1] = lo_bin[n_band-1] + band_size[n_band-1] - 1
elif (freq_spacing == 'log') or (freq_spacing == 'LOG'):
lof, midf, hif = log_filter(n_band, fs)
lo_bin = np.round(lof * fft_len / fs) + 1
hi_bin = np.round(hif * fft_len / fs) + 1
band_size = hi_bin - lo_bin + 1
elif (freq_spacing == 'mel') or (freq_spacing == 'MEL'):
lof, midf, hif = mel_filter(n_band, 0, fs / 2)
lo_bin = np.round(lof * fft_len / fs) + 1
hi_bin = np.round(hif * fft_len / fs) + 1
lo_bin[0] = 1
# like hi_bin(end) in matlab
hi_bin[-1] = fft_len / 2 + 1
band_size = hi_bin - lo_bin + 1
else:
print("Freq Spacing support 'linear', 'log' and 'mel' only... ")
img = 1j
# calculate hamming window
win = np.sqrt(np.hamming(frm_len))
"""
Estimate noise magnitude for first 'NOISE_FRAME' frames
Estimated noise spectrum is stored in noise_spect
"""
noise_pow = np.zeros(fft_len)
j = 0
for k in range(NOISE_FRAME):
n_fft = np.fft.fft(xin[j:j + frm_len] * win, fft_len)
n_mag = np.abs(n_fft)
n_magsq = n_mag ** 2
noise_pow = noise_pow + n_magsq
j = j + frm_len
n_spect = np.sqrt(noise_pow / NOISE_FRAME)
# input to noise reduction part
x = xin
# Framing the signal with Window = 20ms and overlap = 10ms,
# the output is a matrix with each column representing a frame
framed_x = frame(x, win, ovlp_len, 0, 0)
[tmp, nframes] = framed_x.shape
# start processing
x_win = framed_x
x_fft = np.fft.fft(x_win, n=fft_len, axis=0)
x_mag = np.abs(x_fft)
x_ph = np.angle(x_fft)
x_magsm = np.zeros(x_mag.shape)
if AVERAGING is True:
# smooth the input spectrum
filtb = [0.9, 0.1]
x_magsm[:, 0] = lfilter(filtb, 1, x_mag[:, 0])
for i in range(1, nframes):
x_tmp1 = np.hstack((x_mag[frm_len - ovlp_len - 1, i - 1], x_mag[:, i]))
x_tmp2 = lfilter(filtb, 1, x_tmp1)
x_magsm[:, i] = x_tmp2[1:]
# weighted spectrum estimate
Wn2, Wn1 = 0.09, 0.25
W0, W1, W2 = 0.32, 0.25, 0.09
x_magsm[:, 0] = W0 * x_magsm[:, 0] + W1 * x_magsm[:, 1] + W2 * x_magsm[:, 2]
x_magsm[:, 1] = Wn1 * x_magsm[:, 0] + W0 * x_magsm[:, 1] + W1 * x_magsm[:, 2] + W2 * x_magsm[:, 3]
for i in range(2, nframes - 2):
x_magsm[:, i] = Wn2 * x_magsm[:, i - 2] + Wn1 * x_magsm[:, i - 1] + \
W0 * x_magsm[:, i] + W1 * x_magsm[:, i + 1] + W2 * x_magsm[:, i + 2]
x_magsm[:, nframes - 2] = Wn2 * x_magsm[:, nframes - 2 - 2] + Wn1 * x_magsm[:, nframes - 2 - 1] + \
W0 * x_magsm[:, nframes - 2] + W1 * x_magsm[:, nframes - 1]
x_magsm[:, nframes - 1] = Wn2 * x_magsm[:, nframes - 2 - 1] + Wn1 * x_magsm[:, nframes - 2] + \
W0 * x_magsm[:, nframes - 1]
else:
x_magsm = x_mag
# noise update during silence frames
if VAD:
[n_spect, state] = noise_update(x_magsm, n_spect, common_len, nframes)
else:
for i in range(1, nframes):
n_spect[:, i] = n_spect[:, 1]
# calculate the segmental SNR in each band
k: int = 0
SNR_x = np.zeros((n_band, nframes))
for i in range(0, n_band - 1):
start = int(lo_bin[i]) - 1
stop = int(hi_bin[i])
k = k + 1
for j in range(0, nframes):
SNR_x[i, j] = 10 * np.log10((LA.norm(x_magsm[start:stop, j], 2) ** 2)
/ (LA.norm(n_spect[start:stop, j], 2) ** 2))
start = int(lo_bin[k]) - 1
stop = int(fft_len / 2) + 1
for j in range(0, nframes):
SNR_x[k, j] = 10 * np.log10(LA.norm(x_magsm[start:stop, j], 2) ** 2
/ LA.norm(n_spect[start:stop, j], 2) ** 2)
beta_x = berouti(SNR_x)
# start subtractive procedure
sub_speech_x = np.zeros((int(fft_len/2)+1, nframes))
k = 0
for i in range(0, n_band):
sub_speech = np.zeros((band_size[i].astype(int), nframes))
start = lo_bin[i].astype(int) - 1
stop = hi_bin[i].astype(int)
if i == 0:
for j in range(0, nframes):
n_spec_sq = n_spect[start:stop, j] ** 2
sub_speech[:, j] = x_magsm[start:stop, j]**2 - np.dot(beta_x[i, j], n_spec_sq)
else:
for j in range(0, nframes):
n_spec_sq = n_spect[start:stop, j] ** 2
sub_speech[:, j] = x_magsm[start:stop, j]**2 - np.dot(beta_x[i, j]*2.5, n_spec_sq)
k = k+1
z = np.argwhere(sub_speech < 0)
x_tmp = x_magsm[start:stop, :]
# floor
sub_speech[sub_speech < 0] = np.dot(FLOOR, x_tmp[sub_speech < 0]**2)
sub_speech[:, :] = np.add(sub_speech[:, :], np.dot(0.05, x_magsm[start:stop, :]**2))
sub_speech_x[start: stop, :] = sub_speech_x[start:stop, :] + sub_speech
x_tmp = x_magsm[start:stop, :]
sub_speech[sub_speech < 0] = FLOOR * x_tmp[sub_speech < 0]**2
sub_speech = sub_speech + 0.01 * x_magsm[start:stop, :]**2
sub_speech_x[start:stop, :] = sub_speech_x[start:stop, :] + sub_speech
# reconstruct whole spectrum
out_x = np.zeros((fft_len, nframes))
out_x[0:int(fft_len/2)+1, :] = sub_speech_x
out_x[int(fft_len/2)+1:fft_len, :] = np.flipud(sub_speech_x[1:int(fft_len/2)])
# multiply the whole frame fft with the phase information
y1_fft = out_x ** 0.5 * (np.cos(x_ph) + img*np.sin(x_ph))
# to ensure a real signal
y1_fft[0, :] = np.real(y1_fft[0, :])
y1_fft[int(fft_len/2), :] = np.real(y1_fft[int(fft_len/2), :])
# take the IFFT
y1_ifft = np.fft.ifft(y1_fft, axis=0)
y1_r = np.real(y1_ifft)
# overlap and add
y1 = np.zeros(common_len*(nframes+1))
y1[0:frm_len] = y1_r[0:frm_len, 0]
start = frm_len - ovlp_len
mid = start + ovlp_len
stop = start + frm_len
for i in range(1, nframes):
y1[start:mid] = y1[start:mid] + y1_r[0:ovlp_len, i]
y1[mid:stop] = y1_r[ovlp_len:frm_len, i]
start = mid
mid = start + ovlp_len
stop = start + frm_len
out = y1
# write output
sf.write(outfile, out, fs, 'PCM_16')
def berouti(snr: np.ndarray):
[n_bands, n_frames] = np.shape(snr)
alp = np.zeros((n_bands, n_frames))
for i in range(0, n_bands):
for j in range(0, n_frames):
if -5.0 <= snr[i, j] <= 20:
alp[i, j] = 4 - snr[i, j] * 3 / 20
elif snr[i, j] < -5.0:
alp[i, j] = 4.75
else:
alp[i, j] = 1
return alp
def noise_update(x_magsm: np.ndarray, n_spect: np.ndarray, common_len, nframes):
SPEECH = 1
SILENCE = 0
i = 0
n_spect_update = np.zeros((np.size(n_spect), nframes))
x_var = x_magsm[:, i] ** 2
n_var = n_spect[:] ** 2
rti = x_var / n_var - np.log10(x_var / n_var) - 1
judge_value = np.mean(rti, axis=0)
state = np.zeros(nframes * common_len)
n_spect_update[:, i] = n_spect[:]
if judge_value > 0.4:
state[0:common_len] = SPEECH
else:
state[0:common_len] = SILENCE
n_spect_update[:, i] = np.sqrt(0.9 * n_spect_update[:, i] ** 2 + (1 - 0.9) * x_magsm[:, i] ** 2)
for i in range(1, nframes):
x_var = x_magsm[:, i] ** 2
n_var = n_spect_update[:, i - 1] ** 2
rti = x_var / n_var - np.log10(x_var / n_var) - 1
judge_value = np.mean(rti, axis=0)
if judge_value > 0.45:
state[(i - 1) * common_len: i * common_len] = SPEECH
n_spect_update[:, i] = n_spect_update[:, i - 1]
else:
state[(i - 1) * common_len: i * common_len] = SILENCE
n_spect_update[:, i] = np.sqrt(0.9 * n_spect_update[:, i - 1] ** 2 + (1 - 0.9) * x_magsm[:, i] ** 2)
return n_spect_update, state
def frame(sdata: np.ndarray, window: np.ndarray, frmshift, offset=0, trunc=0) -> np.ndarray:
"""
frame(): Place vector data into a matrix of frames
fdata = frame( sdata, window, [frmshift], [offset], [trunc] )
This function places sampled data in vector sdata into a matrix of
frame data.
:param sdata: sampled data sdata must be a vector
:param window:a windowing vector (eg, hamming) applied to each frame of
sampled data and must be specified, because it defines the length
of each frame in samples.
:param frmshift:specifies the number of samples to shift
between frames, and if not specified defaults to the window
size (which implies no overlap).
:param offset: specifies the offset from the first sample to
be used for processing. If not specified, it is set to 0, which
means that the first sample of the sdata is the first sample of the
frame data. The value of offset can be negative, in which case
initial padding of 0 samples is done.
The optional argument trunc is a flag that
:param trunc: specifies that sample data at the end should be
truncated so that the last frame contains only valid data from the
samples and no zero padding is done at the end of the sample data
to fill a frame. This means some sample data at the end will be
lost. The default is not to truncate, but to pad with zero
samples until all sample data is represented in a frame at the end.
:return: a matrix of frames
"""
if sdata.ndim == 1:
sdata = sdata.reshape(1, -1)
ndata = sdata.size
else:
raise RuntimeError('frame: sdata must be vector')
if window.ndim == 1:
window = window.reshape(1, -1)
nwind = window.size
else:
raise RuntimeError('frame: window must be vector')
if frmshift <= 0:
raise RuntimeError('frame: shift must be positive')
# resize data based on offset
if offset > 0:
sdata = sdata[0, offset: max(offset, ndata)]
sdata = sdata.reshape(1, -1)
ndata = sdata.size
elif offset < 0:
zz = np.zeros((1, abs(offset)))
sdata = np.hstack((zz, sdata))
sdata = sdata.reshape(1, -1)
ndata = sdata.size
if trunc > 0:
nframes = int(np.floor((ndata - nwind) / frmshift + 1))
else:
nframes = int(np.ceil(ndata / frmshift))
tdata = np.zeros((nwind, nframes))
dowind = np.any(window != 1)
ixstrt = 0
fstart = np.zeros(nframes)
for frm in range(0, nframes):
ixend = min(ndata, ixstrt + nwind)
ixlen = ixend - ixstrt
tdata[0:ixlen, frm] = np.array(sdata[0, ixstrt:ixend]).transpose()
fstart[frm] = ixstrt
ixstrt = ixstrt + frmshift
if dowind is not None:
fdata = scale_mtx(tdata, window, 1)
return fdata
def scale_mtx(tdata: np.ndarray, window: np.ndarray, flag):
[dros, dcol] = np.shape(tdata)
[wros, wcol] = np.shape(window)
if wros > wcol:
window = window.transpose()
fdata = np.zeros(np.shape(tdata))
for i in range(0, dcol):
fdata[:, i] = tdata[:, i] * window
return fdata
def log_filter(n_ch: int, samp_rate):
FS = samp_rate / 2
upper_freq = FS
lower_freq = 1
filter_range = np.log10(upper_freq / lower_freq)
interval = filter_range / n_ch
upper = np.zeros(n_ch)
center = np.zeros(n_ch)
lower = np.zeros(n_ch)
for i in range(1, n_ch + 1):
upper[i - 1] = lower_freq * 10 ** (interval * i)
lower[i - 1] = lower_freq * 10 ** (interval * (i - 1))
center[i - 1] = 0.5 * (upper[i - 1] + lower[i - 1])
return lower, center, upper
def mel_filter(N, low, high):
"""
This function returns the lower, center and upper freqs
of the filters equally spaced in mel-scale
Input: N - number of filters
low - (left-edge) 3dB frequency of the first filter
high - (right-edge) 3dB frequency of the last filter
Copyright (c) 1996-97 by Philipos C. Loizou
"""
ac = 1100
fc = 800
LOW = ac * np.log(1 + low / fc)
HIGH = ac * np.log(1 + high / fc)
N1 = N + 1
e1 = np.exp(1)
fmel = np.zeros(N1)
seq = np.linspace(1, N1, N1)
fmel[0:N1] = LOW + seq * (HIGH - LOW) / N1
cen2 = fc * (e1 ** (fmel / ac) - 1)
lower = cen2[0:N]
upper = cen2[1: N + 1]
center = 0.5 * (lower + upper)
return lower, center, upper
多子带的谱减法(附Python 源码) @Learning Speech enhancement__2
于 2021-01-03 17:10:29 首次发布