基于语谱图引导滤波去混响
论文
Guided spectrogram filtering for speech dereverberation
实验效果
基于 Python 对论文进行了简单复现，下面贴出几张效果图。
原始音频
文件
(发现 CSDN 无法上传文件, 有需要的可以私信我要)
波形图
语谱图
算法输出音频
文件
(发现 CSDN 无法上传文件, 有需要的可以私信我要)
链接: https://pan.baidu.com/s/1hzeX_z9aD2yzTQd9W8zRjA 提取码: vzjb 复制这段内容后打开百度网盘手机App,操作更方便哦
波形图
语谱图
仿真代码
# -*- coding: utf-8 -*-
# @Time : 2020-11-03
# @Author : jie
# @Email : jieyangzhang@outlook.com
# @File : guided_spectrogram_filtering_dereverb.py
# @Software: vscode
from scipy.io import wavfile
import numpy as np
import matplotlib.pyplot as plt
import sys
# Read the input audio file (the wav path is the single CLI argument).
if len(sys.argv) == 2:
    file_name = sys.argv[1]
else:
    # The original fell through to a NameError on file_name; fail with a
    # usage message instead.
    sys.exit("usage: python guided_spectrogram_filtering_dereverb.py <input.wav>")
freq, sig = wavfile.read(file_name)
# np.float was removed in NumPy 1.20+; use the explicit float64 dtype.
sig = np.array(sig, dtype=np.float64)
# ---- STFT analysis parameters ----
# Analysis frame length in samples.
analyse_len = 512
# FFT size: larger values give finer frequency resolution at the cost of
# coarser time resolution.
NFFT = analyse_len * 2
half = int(NFFT / 2 + 1)  # number of rFFT bins
# Overlap between consecutive frames, in percent.
overlap = 50
# The analysis frame length must be even.
if analyse_len % 2:
    analyse_len += 1
# Overlapping part of a frame, and the hop (non-overlapping) part.
len1 = int(np.floor(analyse_len * overlap / 100))
len2 = analyse_len - len1
# Square-root Hann window: the square root is taken because the window is
# applied twice, once at analysis and once at synthesis.
win = np.sqrt(np.hanning(analyse_len + 1)[:-1])
# Total number of whole analysis frames that fit in the signal.
Nframes = int(np.floor(len(sig) / len2) - np.floor(analyse_len / len2))
print('frame num : ', Nframes)
# Start index of the current analysis frame.
k = 0
# Per-frame log-power spectrum and phase, filled by the STFT loop below.
xfinal = np.zeros([Nframes, half])
X_angle = np.zeros([Nframes, half])
# ---- STFT: frame, window, transform ----
for frame in range(Nframes):
    # Window the current frame.
    windowed = win * sig[k:k + analyse_len]
    # Forward FFT; keep magnitude (as log power) and phase separately so
    # the phase can be reused at synthesis time.
    spectrum = np.fft.rfft(windowed, NFFT)
    X_angle[frame, :] = np.angle(spectrum)
    xfinal[frame, :] = 20 * np.log10(np.abs(spectrum) ** 2)
    # Hop to the next frame.
    k += len2
# ---- Spectrogram normalisation (eqs. 5-7) ----
# Clamp everything more than 255 dB below the peak to a floor value (eq. 7),
# then shift and scale the result into [0, 1] (eqs. 6 and 5).
x_min = np.max(xfinal) - 255
x_hat = np.where(xfinal < x_min, x_min, xfinal)  # eq. 7
x_bar = x_hat - np.min(x_hat)                    # eq. 6
x_widetilde = x_bar / np.max(x_bar)              # eq. 5
# ---- Guided spectrogram filtering (analogous to image dehazing) ----
r1 = 4      # half window size along frequency bins
r2 = 16     # half window size along time frames
beta = 0.8  # recursive smoothing factor for the guidance image
# Regularisation epsilon for eq. 16. The original used e = 0, which divides
# by zero wherever the local variance vanishes (flat spectrogram regions,
# e.g. the clamped floor); a tiny positive value keeps the filter defined.
e = 1e-8
N = (2 * r1 + 1) * (2 * r2 + 1)  # nominal window size
# Guidance image: first-order recursive average over time (eq. 8).
# np.float was removed in NumPy 1.20+; allocate as float64 explicitly.
varphi_widetilde = np.zeros_like(x_widetilde, dtype=np.float64)
varphi_widetilde[0] = x_widetilde[0]
for i in range(1, Nframes):
    varphi_widetilde[i] = beta * varphi_widetilde[i - 1] + \
        (1 - beta) * x_widetilde[i]  # eq. 8
# ---- Local statistics for the filter coefficients A, B (eqs. 10-17) ----
# np.float was removed in NumPy 1.20+; allocate as float64 explicitly.
m_widetilde_x = np.zeros_like(x_widetilde, dtype=np.float64)
m_widetilde_x_2 = np.zeros_like(x_widetilde, dtype=np.float64)
sigma_widetilde_x_2 = np.zeros_like(x_widetilde, dtype=np.float64)
m_widetilde_x_varphi = np.zeros_like(x_widetilde, dtype=np.float64)
m_widetilde_varphi = np.zeros_like(x_widetilde, dtype=np.float64)
for i in range(Nframes):
    for j in range(half):
        # Window of up to (2*r2+1) frames x (2*r1+1) bins centred on (i, j),
        # truncated at the spectrogram borders. The original used exclusive
        # upper bounds (i + r2, j + r1), silently dropping the last
        # row/column of every window; +1 makes the window symmetric, as the
        # declared size N = (2*r1+1)*(2*r2+1) intends.
        frame_lo, frame_hi = max(0, i - r2), min(Nframes, i + r2 + 1)
        freq_lo, freq_hi = max(0, j - r1), min(half, j + r1 + 1)
        window_x = x_widetilde[frame_lo:frame_hi, freq_lo:freq_hi]
        window_varphi = varphi_widetilde[frame_lo:frame_hi, freq_lo:freq_hi]
        m_widetilde_x[i, j] = np.average(window_x)          # eq. 12
        m_widetilde_x_2[i, j] = np.average(window_x ** 2)   # eq. 11
        sigma_widetilde_x_2[i, j] = m_widetilde_x_2[i, j] - \
            m_widetilde_x[i, j] ** 2                        # eq. 10
        m_widetilde_x_varphi[i, j] = np.average(
            window_x * window_varphi)                       # eq. 14
        m_widetilde_varphi[i, j] = np.average(window_varphi)  # eq. 15
cov_widetilde_x_varphi = m_widetilde_x_varphi - \
    m_widetilde_varphi * m_widetilde_x                      # eq. 13
a_widetilde = cov_widetilde_x_varphi / (sigma_widetilde_x_2 + e)  # eq. 16
b_widetilde = m_widetilde_varphi - a_widetilde * m_widetilde_x    # eq. 17
# ---- Smooth the per-pixel filter coefficients (eqs. 18-19) ----
# np.float was removed in NumPy 1.20+; allocate as float64 explicitly.
a = np.zeros_like(a_widetilde, dtype=np.float64)
b = np.zeros_like(b_widetilde, dtype=np.float64)
for i in range(Nframes):
    for j in range(half):
        # Same symmetric, border-truncated window as in the statistics pass
        # (+1 fixes the original's exclusive-bound off-by-one).
        frame_lo, frame_hi = max(0, i - r2), min(Nframes, i + r2 + 1)
        freq_lo, freq_hi = max(0, j - r1), min(half, j + r1 + 1)
        a[i, j] = np.average(a_widetilde[frame_lo:frame_hi, freq_lo:freq_hi])  # eq. 18
        b[i, j] = np.average(b_widetilde[frame_lo:frame_hi, freq_lo:freq_hi])  # eq. 19
# ---- Apply the filter and derive the dereverberation gain ----
q_widetilde = a * x_widetilde + b  # eq. 9
alpha = 0.7  # reverberation subtraction factor
y = x_widetilde - alpha * q_widetilde  # eq. 20
# Half-wave rectify: negative residuals carry no energy.
y = np.maximum(y, 0)
y_widetilde = y / np.max(y)
# Gain mask. x_widetilde contains at least one exact zero (its minimum maps
# to 0 after normalisation), so a plain division produced inf/nan there:
# inf was clipped to g_max below, but 0/0 -> nan leaked through the
# comparison and corrupted the output. Define g = g_max at those bins.
g_min = -20  # lower clamp was disabled in the original; kept for reference
g_max = 1
with np.errstate(divide='ignore', invalid='ignore'):
    g = np.where(x_widetilde > 0, y_widetilde / x_widetilde, g_max)
g = np.minimum(g, g_max)
out = xfinal * g
# ---- Inverse STFT: overlap-add back to a time-domain signal ----
output = np.zeros([Nframes * len2])  # synthesised output signal
k = 0
overlap_buff = np.zeros([len1])  # tail of the previous frame awaiting overlap-add
for i in range(Nframes):
    # Rebuild the complex spectrum: magnitude recovered from the log power
    # spectrum (inverse of 20*log10(|X|^2)), phase from the analysis stage.
    # (The original also had a dead `buff = out[i]` assignment here.)
    buff = np.sqrt(np.power(10, out[i] / 20)) * np.exp(1j * X_angle[i])
    # Second application of the sqrt-Hann window at synthesis.
    segment = np.fft.irfft(buff, NFFT)[:analyse_len] * win
    output[k:k + len1] = overlap_buff + segment[:len1]
    output[k + len1:k + len2] = segment[len1:len2]  # empty slice at 50 % overlap
    overlap_buff = segment[len2:analyse_len]
    k += len2
# Clip before casting: astype(np.int16) alone wraps out-of-range samples.
wavfile.write("./out.wav", freq, np.clip(output, -32768, 32767).astype(np.int16))
# Optional plotting code (disabled): time/frequency-labelled spectrogram
# views of the filtered output and of the input; uncomment to inspect.
# xlocs = np.linspace(0, Nframes - 1, 5)
# frame_dur = 1 / float(freq) * len2
# plt.xticks(xlocs, ["%.02f" % l for l in (xlocs * frame_dur)])
# ylocs = np.linspace(0, half, 5)
# freq_solution = freq / 2 / half
# plt.yticks(ylocs, ["%.02f" % l for l in (ylocs * freq_solution)])
# plt.xlabel("time (s)")
# plt.title('out')
# plt.imshow(np.transpose(out), origin="lower",
# cmap="jet", aspect="auto", interpolation="none")
# plt.show()
# plt.imshow(np.transpose(xfinal), origin="lower",
# cmap="jet", aspect="auto", interpolation="none")
# plt.show()