The main purpose of VAD is to detect voice activity (specifically human speech here). In real environments the signal almost always contains noise, even while a person is speaking, so the clean data we labeled earlier needs noise mixed in to synthesize noisy audio; training on that data makes the resulting model more robust to noise. The data-synthesis code is implemented as follows:
audiolib.py
# -*- coding: utf-8 -*-
"""
@author: chkarada
"""
import os
import numpy as np
import soundfile as sf
import subprocess
import glob
import librosa
import random
import tempfile
EPS = np.finfo(float).eps
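# EPS guards divisions and logarithms against zero; the fixed seed below makes the synthesis reproducible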
np.random.seed(0)
def is_clipped(audio, clipping_threshold=0.99):
return any(abs(audio) > clipping_threshold)
def normalize(audio, target_level=-25):
'''Normalize the signal to the target level'''
rms = (audio ** 2).mean() ** 0.5
scalar = 10 ** (target_level / 20) / (rms+EPS)
audio = audio * scalar
return audio
def normalize_segmental_rms(audio, rms, target_level=-25):
'''Normalize the signal to the target level
based on segmental RMS'''
scalar = 10 ** (target_level / 20) / (rms+EPS)
audio = audio * scalar
return audio
def audioread(path, norm=False, start=0, stop=None, target_level=-25):
'''Function to read audio'''
path = os.path.abspath(path)
if not os.path.exists(path):
raise ValueError("[{}] does not exist!".format(path))
    try:
        audio, sample_rate = sf.read(path, start=start, stop=stop)
    except RuntimeError:  # fix for sph pcm-embedded shortened v2
        print('WARNING: Audio type not supported')
        raise  # re-raise: 'audio' would be undefined below otherwise
if len(audio.shape) == 1: # mono
if norm:
rms = (audio ** 2).mean() ** 0.5
scalar = 10 ** (target_level / 20) / (rms+EPS)
audio = audio * scalar
else: # multi-channel
audio = audio.T
audio = audio.sum(axis=0)/audio.shape[0]
if norm:
audio = normalize(audio, target_level)
return audio, sample_rate
def audiowrite(destpath, audio, sample_rate=16000, norm=False, target_level=-25, \
clipping_threshold=0.99, clip_test=False):
'''Function to write audio'''
if clip_test:
if is_clipped(audio, clipping_threshold=clipping_threshold):
raise ValueError("Clipping detected in audiowrite()! " + \
destpath + " file not written to disk.")
if norm:
audio = normalize(audio, target_level)
max_amp = max(abs(audio))
if max_amp >= clipping_threshold:
audio = audio/max_amp * (clipping_threshold-EPS)
destpath = os.path.abspath(destpath)
destdir = os.path.dirname(destpath)
if not os.path.exists(destdir):
os.makedirs(destdir)
sf.write(destpath, audio, sample_rate)
return
def add_reverb(sasxExe, input_wav, filter_file, output_wav):
''' Function to add reverb'''
command_sasx_apply_reverb = "{0} -r {1} \
-f {2} -o {3}".format(sasxExe, input_wav, filter_file, output_wav)
    subprocess.call(command_sasx_apply_reverb, shell=True)  # command is a single string, so run it through the shell
return output_wav
def add_clipping(audio, max_thresh_perc=0.8):
'''Function to add clipping'''
threshold = max(abs(audio))*max_thresh_perc
audioclipped = np.clip(audio, -threshold, threshold)
return audioclipped
def adsp_filter(Adspvqe, nearEndInput, nearEndOutput, farEndInput):
command_adsp_clean = "{0} --breakOnErrors 0 --sampleRate 16000 --useEchoCancellation 0 \
--operatingMode 2 --useDigitalAgcNearend 0 --useDigitalAgcFarend 0 \
--useVirtualAGC 0 --useComfortNoiseGenerator 0 --useAnalogAutomaticGainControl 0 \
--useNoiseReduction 0 --loopbackInputFile {1} --farEndInputFile {2} \
--nearEndInputFile {3} --nearEndOutputFile {4}".format(Adspvqe,
farEndInput, farEndInput, nearEndInput, nearEndOutput)
    subprocess.call(command_adsp_clean, shell=True)  # single-string command needs the shell on POSIX
def snr_mixer(params, clean, noise, snr, target_level=-25, clipping_threshold=0.99):
'''Function to mix clean speech and noise at various SNR levels'''
cfg = params['cfg']
if len(clean) > len(noise):
n_repeat = int(np.ceil(float(len(clean)) / float(len(noise))))
noise_ex = np.tile(noise,n_repeat)
noise = noise_ex[0 : len(clean)]
#noise = np.append(noise, np.zeros(len(clean)-len(noise)))
else:
noise = noise[:len(clean)]
#clean = np.append(clean, np.zeros(len(noise)-len(clean)))
# Normalizing to -25 dB FS
clean = clean/(max(abs(clean))+EPS)
clean = normalize(clean, target_level)
rmsclean = (clean**2).mean()**0.5
noise = noise/(max(abs(noise))+EPS)
noise = normalize(noise, target_level)
rmsnoise = (noise**2).mean()**0.5
# Set the noise level for a given SNR
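    # since SNR = 20*log10(rms_clean / rms_noise_scaled), this scalar makes
    # rms(noise * noisescalar) = rms_clean / 10**(snr/20)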
noisescalar = rmsclean / (10**(snr/20)) / (rmsnoise+EPS)
noisenewlevel = noise * noisescalar
# Mix noise and clean speech
noisyspeech = clean + noisenewlevel
    # Randomly pick an output RMS level between target_level_lower and target_level_upper (in dBFS) and normalize the noisy speech to it
    # Clipping can still occur with very low probability, which is not a major issue
noisy_rms_level = np.random.randint(params['target_level_lower'], params['target_level_upper'])
rmsnoisy = (noisyspeech**2).mean()**0.5
scalarnoisy = 10 ** (noisy_rms_level / 20) / (rmsnoisy+EPS)
noisyspeech = noisyspeech * scalarnoisy
clean = clean * scalarnoisy
noisenewlevel = noisenewlevel * scalarnoisy
# Final check to see if there are any amplitudes exceeding +/- 1. If so, normalize all the signals accordingly
if is_clipped(noisyspeech):
noisyspeech_maxamplevel = max(abs(noisyspeech))/(clipping_threshold-EPS)
noisyspeech = noisyspeech/noisyspeech_maxamplevel
clean = clean/noisyspeech_maxamplevel
noisenewlevel = noisenewlevel/noisyspeech_maxamplevel
noisy_rms_level = int(20*np.log10(scalarnoisy/noisyspeech_maxamplevel*(rmsnoisy+EPS)))
return clean, noisenewlevel, noisyspeech, noisy_rms_level
def segmental_snr_mixer(params, clean, noise, snr, target_level=-25, clipping_threshold=0.99):
'''Function to mix clean speech and noise at various segmental SNR levels'''
cfg = params['cfg']
if len(clean) > len(noise):
n_repeat = int(np.ceil(float(len(clean)) / float(len(noise))))
noise_ex = np.tile(noise, n_repeat)
noise = noise_ex[0: len(clean)]
# noise = np.append(noise, np.zeros(len(clean)-len(noise)))
else:
ln = len(noise) - len(clean)
        idn = random.randint(0, ln)  # any offset in [0, ln] keeps the slice in bounds; ln may be 0 for equal-length clips
noise = noise[idn:idn + len(clean)]
clean = clean/(max(abs(clean))+EPS)
noise = noise/(max(abs(noise))+EPS)
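    # unlike snr_mixer above, the RMS here is measured only over active
    # (energetic) frames, so silent stretches do not bias the target SNR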
rmsclean, rmsnoise = active_rms(clean=clean, noise=noise)
clean = normalize_segmental_rms(clean, rms=rmsclean, target_level=target_level)
noise = normalize_segmental_rms(noise, rms=rmsnoise, target_level=target_level)
# Set the noise level for a given SNR
noisescalar = rmsclean / (10**(snr/20)) / (rmsnoise+EPS)
noisenewlevel = noise * noisescalar
# Mix noise and clean speech
noisyspeech = clean + noisenewlevel
    # Randomly pick an output RMS level between target_level_lower and target_level_upper (in dBFS) and normalize the noisy speech to it
    # Clipping can still occur with very low probability, which is not a major issue
noisy_rms_level = np.random.randint(params['target_level_lower'], params['target_level_upper'])
rmsnoisy = (noisyspeech**2).mean()**0.5
scalarnoisy = 10 ** (noisy_rms_level / 20) / (rmsnoisy+EPS)
noisyspeech = noisyspeech * scalarnoisy
clean = clean * scalarnoisy
noisenewlevel = noisenewlevel * scalarnoisy
# Final check to see if there are any amplitudes exceeding +/- 1. If so, normalize all the signals accordingly
if is_clipped(noisyspeech):
noisyspeech_maxamplevel = max(abs(noisyspeech))/(clipping_threshold-EPS)
noisyspeech = noisyspeech/noisyspeech_maxamplevel
clean = clean/noisyspeech_maxamplevel
noisenewlevel = noisenewlevel/noisyspeech_maxamplevel
noisy_rms_level = int(20*np.log10(scalarnoisy/noisyspeech_maxamplevel*(rmsnoisy+EPS)))
return clean, noisenewlevel, noisyspeech, noisy_rms_level
def active_rms(clean, noise, fs=16000, energy_thresh=-50):
    '''Returns the RMS of the clean and noise signals, computed only over the active (energetic) frames of the noise'''
window_size = 100 # in ms
window_samples = int(fs*window_size/1000)
sample_start = 0
noise_active_segs = []
clean_active_segs = []
while sample_start < len(noise):
sample_end = min(sample_start + window_samples, len(noise))
noise_win = noise[sample_start:sample_end]
clean_win = clean[sample_start:sample_end]
noise_seg_rms = 20*np.log10((noise_win**2).mean()+EPS)
# Considering frames with energy
if noise_seg_rms > energy_thresh:
noise_active_segs = np.append(noise_active_segs, noise_win)
clean_active_segs = np.append(clean_active_segs, clean_win)
sample_start += window_samples
if len(noise_active_segs)!=0:
noise_rms = (noise_active_segs**2).mean()**0.5
else:
noise_rms = EPS
if len(clean_active_segs)!=0:
clean_rms = (clean_active_segs**2).mean()**0.5
else:
clean_rms = EPS
return clean_rms, noise_rms
def activitydetector(audio, fs=16000, energy_thresh=0.13, target_level=-25):
'''Return the percentage of the time the audio signal is above an energy threshold'''
audio = normalize(audio, target_level)
window_size = 50 # in ms
window_samples = int(fs*window_size/1000)
sample_start = 0
cnt = 0
prev_energy_prob = 0
active_frames = 0
a = -1
b = 0.2
alpha_rel = 0.05
alpha_att = 0.8
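    # fast-attack / slow-release smoothing: the speech probability rises
    # quickly with energy (alpha_att) but decays slowly (alpha_rel)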
while sample_start < len(audio):
sample_end = min(sample_start + window_samples, len(audio))
audio_win = audio[sample_start:sample_end]
frame_rms = 20*np.log10(sum(audio_win**2)+EPS)
frame_energy_prob = 1./(1+np.exp(-(a+b*frame_rms)))
if frame_energy_prob > prev_energy_prob:
smoothed_energy_prob = frame_energy_prob*alpha_att + prev_energy_prob*(1-alpha_att)
else:
smoothed_energy_prob = frame_energy_prob*alpha_rel + prev_energy_prob*(1-alpha_rel)
if smoothed_energy_prob > energy_thresh:
active_frames += 1
prev_energy_prob = frame_energy_prob
sample_start += window_samples
cnt += 1
perc_active = active_frames/cnt
return perc_active
def resampler(input_dir, target_sr=16000, ext='*.wav'):
'''Resamples the audio files in input_dir to target_sr'''
files = glob.glob(f"{input_dir}/"+ext)
for pathname in files:
print(pathname)
        try:
            audio, fs = audioread(pathname)
            # keyword arguments work across librosa versions (positional
            # sample rates were removed in librosa 0.10)
            audio_resampled = librosa.resample(audio, orig_sr=fs, target_sr=target_sr)
            audiowrite(pathname, audio_resampled, target_sr)
        except Exception as e:  # report failures instead of silently swallowing them
            print('Skipping {}: {}'.format(pathname, e))
            continue
def audio_segmenter(input_dir, dest_dir, segment_len=10, ext='*.wav'):
'''Segments the audio clips in dir to segment_len in secs'''
files = glob.glob(f"{input_dir}/"+ext)
for i in range(len(files)):
audio, fs = audioread(files[i])
if len(audio) > (segment_len*fs) and len(audio)%(segment_len*fs) != 0:
audio = np.append(audio, audio[0 : segment_len*fs - (len(audio)%(segment_len*fs))])
if len(audio) < (segment_len*fs):
while len(audio) < (segment_len*fs):
audio = np.append(audio, audio)
audio = audio[:segment_len*fs]
num_segments = int(len(audio)/(segment_len*fs))
audio_segments = np.split(audio, num_segments)
basefilename = os.path.basename(files[i])
basename, ext = os.path.splitext(basefilename)
for j in range(len(audio_segments)):
newname = basename+'_'+str(j)+ext
destpath = os.path.join(dest_dir,newname)
audiowrite(destpath, audio_segments[j], fs)
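Before moving on to the synthesis driver, here is a minimal usage sketch of snr_mixer (my own illustration, not part of the toolkit): it mixes one clean file with one noise file at 5 dB SNR. The paths clean.wav and noise.wav are placeholders, and the params dict carries only the keys snr_mixer actually reads.
from audiolib import audioread, audiowrite, snr_mixer

# snr_mixer reads params['cfg'] (fetched but unused), 'target_level_lower'
# and 'target_level_upper' (the dBFS range for the output level)
params = {'cfg': None, 'target_level_lower': -35, 'target_level_upper': -15}
clean, fs = audioread('clean.wav')  # placeholder: any clean speech file
noise, _ = audioread('noise.wav')   # placeholder: any noise file
# the noise is rescaled so that 20*log10(rms(clean)/rms(noise)) = 5 dB
clean_out, noise_out, noisy, level = snr_mixer(params, clean, noise, snr=5)
audiowrite('noisy_snr5.wav', noisy, fs)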
mix.py
import os
import numpy as np
import glob
import random
from random import shuffle
import librosa
from audiolib import audioread,audiowrite,snr_mixer,segmental_snr_mixer
import configparser as CP
import argparse
import utils
import scipy.io.wavfile as wav
def read_audio_file2(path,fmt):
source_files = glob.glob(os.path.join(path,fmt))
shuffle(source_files)
return source_files
def read_label_file(path):
    '''Read a space-separated label file into a list of token lists'''
    with open(path, 'r') as fp:
        info = [line.strip().split(' ') for line in fp]
    return info
def write_label_file(path, labels):
    '''Write a list of token lists back as a space-separated label file'''
    with open(path, 'w') as fp:
        for label in labels:
            fp.write(' '.join(label) + '\n')
def combine_noise(data_paths, sample_length, silence_length=0.2, fs=16000):
    '''Randomly pick noise files and concatenate them, separated by short
    silences, until the combined noise reaches sample_length samples'''
    remain_length = sample_length
    sig_combine = np.zeros((sample_length), dtype=np.int16)
    silence = np.zeros(int(fs * silence_length))
    plen = len(data_paths)
    cnt = 0
    while remain_length > 0:
        index = random.randint(0, plen - 1)
        sr, sig = wav.read(data_paths[index])  # sr: sample rate of this noise clip
        sig_len = len(sig)
        if sig_len > remain_length:
            # the clip exceeds the remaining space: take only what fits
            sig_combine[:(cnt + remain_length)] = np.concatenate([sig_combine[:cnt], sig[:remain_length]])
            cnt = cnt + remain_length
        else:
            sig_combine[:(cnt + sig_len)] = np.concatenate([sig_combine[:cnt], sig])
            cnt = cnt + sig_len
        remain_length = remain_length - sig_len
        # insert a short silence between consecutive noise clips
        silence_len = 0
        if remain_length > 0:
            silence_len = min(remain_length, len(silence))
            sig_combine[:(cnt + silence_len)] = np.concatenate([sig_combine[:cnt], silence[:silence_len]])
            cnt = cnt + silence_len
            remain_length = remain_length - silence_len
    return sig_combine
def main_gen(params, clean_dir, noise_dir1, noise_dir2, N=1, flag=0):
    '''Generate noisy speech by mixing clean speech with combined noise'''
if flag:
utils.del_folder(params['noisyspeech_dir'])
utils.del_folder(params['clean_proc_dir'])
utils.del_folder(params['noise_proc_dir'])
utils.create_folder(params['noisyspeech_dir'])
utils.create_folder(params['clean_proc_dir'])
utils.create_folder(params['noise_proc_dir'])
    file_num = 0
fmt = params['audioformat']
    speech_path = clean_dir
speech_dirs,speech_names = utils.read_audio_file1(speech_path,'.wav')
noise_dirs1,noise_names1 = utils.read_audio_file1(noise_dir1,fmt)
noise_dirs2,noise_names2 = utils.read_audio_file1(noise_dir2,fmt)
noise_dirs = noise_dirs1 + noise_dirs2
noise_names = noise_names1 + noise_names2
print('speech_len = %d,noise_len = %d\n'%(len(speech_dirs),len(noise_dirs)))
for (speech_dir,speech_name) in zip(speech_dirs,speech_names):
clean,fs = audioread(speech_dir)
print('file_num = ',file_num)
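        # generate N noisy variants of each clean file, each with its own
        # random noise combination and SNR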
for i in range(N):
noise = combine_noise(noise_dirs,len(clean))
noise_name = speech_name
            # randomly draw a target SNR within the configured range
            snr = np.random.randint(params['snr_lower'], params['snr_upper'])
            # mix noise into the clean speech at that SNR to obtain the noisy sample
clean_snr, noise_snr, noisy_snr, target_level = snr_mixer(params=params,
clean=clean,
noise=noise,
snr=snr)
            # save the noisy, clean, and noise audio files
noisyfilename = speech_name + '_noisy_filed' + '_snr' + \
str(snr) + '_tl' + str(target_level) + '_fileid_' + str(file_num) + '.wav'
cleanfilename = speech_name + '_clean_fileid_' + str(file_num) + '.wav'
noisefilename = noise_name + '_noise_fileid_' + str(file_num) + '.wav'
noisypath = os.path.join(params['noisyspeech_dir'], noisyfilename)
cleanpath = os.path.join(params['clean_proc_dir'], cleanfilename)
noisepath = os.path.join(params['noise_proc_dir'], noisefilename)
audio_signals = [noisy_snr, clean_snr, noise_snr]
file_paths = [noisypath, cleanpath, noisepath]
file_num += 1
            for signal, path in zip(audio_signals, file_paths):  # avoid reusing the loop variable i
                try:
                    audiowrite(path, signal, params['fs'])
                except Exception as e:
                    print(str(e))
def main_body():
parser = argparse.ArgumentParser()
parser.add_argument('--cfg', default = 'vad_synthesizer.cfg',help = 'read vad_synthesizer.cfg')
parser.add_argument('--cfg_str', type = str, default = 'vad_speech')
args = parser.parse_args()
params = dict()
params['args'] = args
cfgpath = os.path.join(os.path.dirname(__file__), args.cfg)
    cfg = CP.ConfigParser(interpolation=CP.ExtendedInterpolation())
cfg.read(cfgpath)
params['cfg'] = cfg._sections[args.cfg_str]
cfg = params['cfg']
clean_dir = os.path.join(os.path.dirname(__file__), 'CleanSpeech')
if cfg['speech_dir'] != 'None':
clean_dir = cfg['speech_dir']
if not os.path.exists(clean_dir):
assert False, ('Clean speech data is required')
noise_dir = os.path.join(os.path.dirname(__file__), 'Noise')
    noise_dir1 = noise_dir
    noise_dir2 = noise_dir
    if cfg['noise_dir1'] != 'None':
        noise_dir1 = cfg['noise_dir1']
    if cfg['noise_dir2'] != 'None':
        noise_dir2 = cfg['noise_dir2']
    if not (os.path.exists(noise_dir1) and os.path.exists(noise_dir2)):
        assert False, ('Noise data is required')
params['fs'] = int(cfg['sampling_rate'])
params['audioformat'] = cfg['audioformat']
params['audio_length'] = float(cfg['audio_length'])
params['silence_length'] = float(cfg['silence_length'])
params['total_hours'] = float(cfg['total_hours'])
    if cfg['fileindex_start'] != 'None' and cfg['fileindex_end'] != 'None':
params['num_files'] = int(cfg['fileindex_end']) - int(cfg['fileindex_start'])
params['fileindex_start'] = int(cfg['fileindex_start'])
params['fileindex_end'] = int(cfg['fileindex_end'])
else:
params['num_files'] = int((params['total_hours'] * 60 * 60) / params['audio_length'])
params['fileindex_start'] = 0
params['fileindex_end'] = params['num_files']
print('Number of files to be synthesized:', params['num_files'])
params['is_test_set'] = utils.str2bool(cfg['is_test_set'])
params['clean_activity_threshold'] = float(cfg['clean_activity_threshold'])
params['noise_activity_threshold'] = float(cfg['noise_activity_threshold'])
params['snr_lower'] = int(cfg['snr_lower'])
params['snr_upper'] = int(cfg['snr_upper'])
params['randomize_snr'] = utils.str2bool(cfg['randomize_snr'])
params['target_level_lower'] = int(cfg['target_level_lower'])
params['target_level_upper'] = int(cfg['target_level_upper'])
params['noisyspeech_dir'] = utils.get_dir(cfg, 'noisy_destination', 'noisy')
params['noisylabel_dir'] = utils.get_dir(cfg, 'noisy_label_destination', 'label')
params['clean_proc_dir'] = utils.get_dir(cfg, 'clean_destination', 'clean')
params['noise_proc_dir'] = utils.get_dir(cfg, 'noise_destination', 'noise')
main_gen(params,clean_dir,noise_dir1,noise_dir2,1,1)
if __name__ == '__main__':
main_body()
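main_body reads its settings from vad_synthesizer.cfg, which is not shown in this post. As a rough sketch, a section with the keys the code above accesses might look like the following; the key names come from the cfg[...] reads in the code, while every value and path is merely illustrative:
[vad_speech]
sampling_rate: 16000
audioformat: *.wav
audio_length: 10
silence_length: 0.2
total_hours: 1
fileindex_start: None
fileindex_end: None
is_test_set: False
clean_activity_threshold: 0.6
noise_activity_threshold: 0.0
snr_lower: -5
snr_upper: 20
randomize_snr: True
target_level_lower: -35
target_level_upper: -15
speech_dir: ./CleanSpeech
noise_dir1: ./Noise1
noise_dir2: ./Noise2
noisy_destination: ./noisy
noisy_label_destination: ./label
clean_destination: ./clean
noise_destination: ./noise
With such a file in place, the script would be run as python mix.py --cfg vad_synthesizer.cfg --cfg_str vad_speech.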
The program above is all that is needed to synthesize the noisy training data. My understanding is limited, so corrections and suggestions are welcome.