利用 librosa.feature.inverse.mel_to_audio 函数将梅尔谱转换为语音时发现听感不佳。简单查看内部原理后发现,这是因为 mel_to_audio 内部使用 Griffin-Lim 算法,以估计的相位谱来还原语音。因此,若改用原始相位谱还原语音,就不会出现听感不佳的现象。代码流程已上传 Github。
chuqingi/Extract_Invert_Mel_Spectrogram (github.com)
直接从梅尔谱中还原语音:
import librosa
import soundfile
import numpy as np
import librosa.display
import matplotlib.pyplot as plt
if __name__ == '__main__':
    # Demo: audio -> mel power spectrogram -> audio, using librosa's built-in
    # mel_to_audio (Griffin-Lim, i.e. an *estimated* phase) for the inversion.

    # Analysis parameters. At sr=16000 a 320-sample window is 20 ms and a
    # 160-sample hop is 10 ms.
    n_fft = 320
    win_length = 320
    hop_length = 160
    n_mels = 64
    fmin = 100
    fmax = 8000

    wav_data, sr = librosa.load('./input.wav', sr=16000)

    # Convert audio to mel spectrogram.
    # The signal is passed as ``y=``: newer librosa releases (0.10+) accept
    # the audio only as a keyword argument, and the keyword form also works
    # on older releases.
    mel = librosa.feature.melspectrogram(
        y=wav_data,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
    )

    # Plot mel spectrogram (in dB relative to its maximum value).
    mel_spec_db = librosa.power_to_db(mel, ref=np.max)
    plt.figure()
    librosa.display.specshow(mel_spec_db, sr=sr)
    plt.title('Mel spectrogram')
    plt.xlabel("Time")
    plt.ylabel("Frequency")
    plt.colorbar(format='%+2.0f dB')
    plt.show()

    # Placeholder: any processing of ``mel`` goes here.

    # Convert mel spectrogram to audio. The same n_fft / hop / window /
    # fmin / fmax as in the forward transform are passed so the inverse mel
    # filter bank and the STFT framing match the analysis step.
    wav_data_2 = librosa.feature.inverse.mel_to_audio(
        mel,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        fmin=fmin,
        fmax=fmax,
    )
    soundfile.write('out.wav', wav_data_2, sr)
结果:
利用原始相位从梅尔谱中还原语音:
import librosa
import soundfile
import numpy as np
import librosa.display
import matplotlib.pyplot as plt
class MelExtractInvert:
    """Extract a mel spectrogram from audio and invert it with the original phase.

    ``extract_feature`` returns both the mel spectrogram and the phase of the
    STFT it was computed from; ``invert_feature`` reuses that phase instead of
    estimating one, then runs the inverse STFT to get a waveform back.
    """

    def __init__(self, audio: np.ndarray, sr: int, n_fft: int, win_length: int,
                 hop_length: int, n_mels: int, fmin: float, fmax: float,
                 power: float):
        """Store the signal and the analysis parameters.

        Args:
            audio: Audio samples to analyse.
            sr: Sample rate.
            n_fft: Number of FFT points.
            win_length: Frame (window) length in samples.
            hop_length: Frame shift in samples.
            n_mels: Number of mel filter banks.
            fmin: Lowest filter-bank frequency (cuts low-frequency noise).
            fmax: Highest filter-bank frequency (at most ``sr / 2``).
            power: Exponent applied to the STFT magnitude before the mel
                projection (e.g. 2 for a power spectrogram).
        """
        self.audio = audio
        self.sr = sr  # sample rate
        self.n_fft = n_fft  # fft points
        self.win_length = win_length  # frame length
        self.hop_length = hop_length  # frame shift
        self.n_mels = n_mels  # number of Mel banks
        self.fmin = fmin  # lowest frequency, remove low-frequency noise
        self.fmax = fmax  # highest frequency, sr / 2.0
        self.power = power  # exponent for the magnitude melspectrogram

    def extract_feature(self):
        """Compute the STFT phase and the mel spectrogram of ``self.audio``.

        Returns:
            A ``(phase, mel_spectrogram)`` pair: the phase component returned
            by ``librosa.magphase`` for the STFT, and the mel filter bank
            applied to ``magnitude ** power``.
        """
        # All optional arguments are passed by keyword: recent librosa
        # releases (0.10+) reject them positionally, while the keyword form
        # is accepted by older releases as well.
        stft_matrix = librosa.stft(
            self.audio,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_length,
        )
        spectrogram, phase = librosa.magphase(stft_matrix)
        mel_basis = librosa.filters.mel(
            sr=self.sr,
            n_fft=self.n_fft,
            n_mels=self.n_mels,
            fmin=self.fmin,
            fmax=self.fmax,
        )
        spectrogram_power = spectrogram ** self.power
        mel_spectrogram = np.dot(mel_basis, spectrogram_power)
        return phase, mel_spectrogram

    def invert_feature(self, phase, mel_spectrogram):
        """Reconstruct audio from a mel spectrogram and a known phase.

        Args:
            phase: Phase component as returned by ``extract_feature``. It is
                multiplied element-wise with the recovered magnitude, so it
                must come from an STFT with the same framing.
            mel_spectrogram: Mel spectrogram to invert (possibly processed).

        Returns:
            The waveform produced by the inverse STFT.
        """
        # Map the mel spectrogram back to a linear-frequency STFT magnitude
        # using the same filter-bank settings as the forward transform.
        spectrogram = librosa.feature.inverse.mel_to_stft(
            mel_spectrogram,
            sr=self.sr,
            n_fft=self.n_fft,
            power=self.power,
            fmin=self.fmin,
            fmax=self.fmax,
        )
        # Re-attach the original phase instead of estimating one.
        audio = librosa.istft(
            spectrogram * phase,
            hop_length=self.hop_length,
            win_length=self.win_length,
        )
        return audio
if __name__ == '__main__':
    # Demo: audio -> mel spectrogram -> audio, reusing the original STFT
    # phase for the inversion.
    waveform, sample_rate = librosa.load('./input.wav', sr=16000)
    extractor = MelExtractInvert(
        audio=waveform,
        sr=sample_rate,
        n_fft=320,
        win_length=320,
        hop_length=160,
        n_mels=64,
        fmin=100,
        fmax=8000,
        power=2,
    )

    # Forward transform: keep the phase so it can be reused when inverting.
    stft_phase, mel_spec = extractor.extract_feature()

    # Show the mel spectrogram in dB relative to its maximum value.
    mel_db = librosa.power_to_db(mel_spec, ref=np.max)
    plt.figure()
    librosa.display.specshow(mel_db, sr=sample_rate)
    plt.title('Mel spectrogram')
    plt.xlabel("Time")
    plt.ylabel("Frequency")
    plt.colorbar(format='%+2.0f dB')
    plt.show()

    # Placeholder left by the author for any processing of the mel spectrogram.
    '''
    processing mel_spectrogram
    '''

    # Inverse transform with the saved phase, then write the result to disk.
    restored = extractor.invert_feature(stft_phase, mel_spec)
    soundfile.write('./output.wav', restored, sample_rate)
结果: