The main purpose of VAD is to detect voice activity (specifically human speech here). In real environments the signal almost always contains noise, even while a person is speaking, so the clean data we labeled earlier needs noise mixed in to synthesize noisy audio; training on that data makes the resulting model more robust to noise. The data-synthesis code is implemented as follows:
audiolib.py
# -*- coding: utf-8 -*-
"""
@author: chkarada
"""
import os
import numpy as np
import soundfile as sf
import subprocess
import glob
import librosa
import random
import tempfile
EPS = np.finfo(float).eps
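# EPS guards divisions and logarithms against zero; the fixed seed below makes the synthesis reproducible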
np.random.seed(0)
def is_clipped(audio, clipping_threshold=0.99):
return any(abs(audio) > clipping_threshold)
def normalize(audio, target_level=-25):
'''Normalize the signal to the target level'''
rms = (audio ** 2).mean() ** 0.5
scalar = 10 ** (target_level / 20) / (rms+EPS)
audio = audio * scalar
return audio
def normalize_segmental_rms(audio, rms, target_level=-25):
'''Normalize the signal to the target level
based on segmental RMS'''
scalar = 10 ** (target_level / 20) / (rms+EPS)
audio = audio * scalar
return audio
def audioread(path, norm=False, start=0, stop=None, target_level=-25):
'''Function to read audio'''
path = os.path.abspath(path)
if not os.path.exists(path):
raise ValueError("[{}] does not exist!".format(path))
    try:
        audio, sample_rate = sf.read(path, start=start, stop=stop)
    except RuntimeError:  # fix for sph pcm-embedded shortened v2
        print('WARNING: Audio type not supported')
        raise  # re-raise: 'audio' would be undefined below otherwise
if len(audio.shape) == 1: # mono
if norm:
rms = (audio ** 2).mean() ** 0.5
scalar = 10 ** (target_level / 20) / (rms+EPS)
audio = audio * scalar
else: # multi-channel
audio = audio.T
audio = audio.sum(axis=0)/audio.shape[0]
if norm:
audio = normalize(audio, target_level)
return audio, sample_rate
def audiowrite(destpath, audio, sample_rate=16000, norm=False, target_level=-25, \
clipping_threshold=0.99, clip_test=False):
'''Function to write audio'''
if clip_test:
if is_clipped(audio, clipping_threshold=clipping_threshold):
raise ValueError("Clipping detected in audiowrite()! " + \
destpath + " file not written to disk.")
if norm:
audio = normalize(audio, target_level)
max_amp = max(abs(audio))
if max_amp >= clipping_threshold:
audio = audio/max_amp * (clipping_threshold-EPS)
destpath = os.path.abspath(destpath)
destdir = os.path.dirname(destpath)
if not os.path.exists(destdir):
os.makedirs(destdir)
sf.write(destpath, audio, sample_rate)
return
def add_reverb(sasxExe, input_wav, filter_file, output_wav):
''' Function to add reverb'''
command_sasx_apply_reverb = "{0} -r {1} \
-f {2} -o {3}".format(sasxExe, input_wav, filter_file, output_wav)
    subprocess.call(command_sasx_apply_reverb, shell=True)  # command is a single string, so run it through the shell
return output_wav
def add_clipping(audio, max_thresh_perc=0.8):
'''Function to add clipping'''
threshold = max(abs(audio))*max_thresh_perc
audioclipped = np.clip(audio, -threshold, threshold)
return audioclipped
def adsp_filter(Adspvqe, nearEndInput, nearEndOutput, farEndInput):
command_adsp_clean = "{0} --breakOnErrors 0 --sampleRate 16000 --useEchoCancellation 0 \
--operatingMode 2 --useDigitalAgcNearend 0 --useDigitalAgcFarend 0 \
--useVirtualAGC 0 --useComfortNoiseGenerator 0 --useAnalogAutomaticGainControl 0 \
--useNoiseReduction 0 --loopbackInputFile {1} --farEndInputFile {2} \
--nearEndInputFile {3} --nearEndOutputFile {4}".format(Adspvqe,
farEndInput, farEndInput, nearEndInput, nearEndOutput)
    subprocess.call(command_adsp_clean, shell=True)  # single-string command needs the shell on POSIX
def snr_mixer(params, clean, noise, snr, target_level=-25, clipping_threshold=0.99):
'''Function to mix clean speech and noise at various SNR levels'''
cfg = params['cfg']
if len(clean) > len(noise):
n_repeat = int(np.ceil(float(len(clean)) / float(len(noise))))
noise_ex = np.tile(noise,n_repeat)
noise = noise_ex[0 : len(clean)]
#noise = np.append(noise, np.zeros(len(clean)-len(noise)))
else:
noise = noise[:len(clean)]
#clean = np.append(clean, np.zeros(len(noise)-len(clean)))
# Normalizing to -25 dB FS
clean = clean/(max(abs(clean))+EPS)
clean = normalize(clean, target_level)
rmsclean = (clean**2).mean()**0.5
noise = noise/(max(abs(noise))+EPS)
noise = normalize(noise, target_level)
rmsnoise = (noise**2).mean()**0.5
# Set the noise level for a given SNR
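    # since SNR = 20*log10(rms_clean / rms_noise_scaled), this scalar makes
    # rms(noise * noisescalar) = rms_clean / 10**(snr/20)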
noisescalar = rmsclean / (10**(snr/20)) / (rmsnoise+EPS)
noisenewlevel = noise * noisescalar
# Mix noise and clean speech
noisyspeech = clean + noisenewlevel
    # Randomly pick an output RMS level between target_level_lower and target_level_upper (in dBFS) and normalize the noisy speech to it
    # Clipping can still occur with very low probability, which is not a major issue
noisy_rms_level = np.random.randint(params['target_level_lower'], params['target_level_upper'])
rmsnoisy = (noisyspeech**2).mean()**0.5
scalarnoisy = 10 ** (noisy_rms_level / 20) / (rmsnoisy+EPS)
noisyspeech = noisyspeech * scalarnoisy
clean = clean * scalarnoisy
noisenewlevel = noisenewlevel * scalarnoisy
# Final check to see if there are any amplitudes exceeding +/- 1. If so, normalize all the signals accordingly
if is_clipped(noisyspeech):
noisyspeech_maxamplevel = max(abs(noisyspeech))/(clipping_threshold-EPS)
noisyspeech = noisyspeech/noisyspeech_maxamplevel
clean = clean/noisyspeech_maxamplevel
noisenewlevel = noisenewlevel/noisyspeech_maxamplevel
noisy_rms_level = int(20*np.log10(scalarnoisy/noisyspeech_maxamplevel*(rmsnoisy+EPS)))
return clean, noisenewlevel, noisyspeech, noisy_rms_level
def segmental_snr_mixer(params, clean, noise, snr, target_level=-25, clipping_threshold=0.99):
'''Function to mix clean speech and noise at various segmental SNR levels'''
cfg = params['cfg']
if len(clean) > len(noise):
n_repeat = int(np.ceil(float(len(clean)) / float(len(noise))))
noise_ex = np.tile(noise, n_repeat)
noise = noise_ex[0: len(clean)]
# noise = np.append(noise, np.zeros(len(clean)-len(noise)))
else:
ln = len(noise) - len(clean)
        idn = random.randint(0, ln)  # any offset in [0, ln] keeps the slice in bounds; ln may be 0 for equal-length clips
noise = noise[idn:idn + len(clean)]
clean = clean/(max(abs(clean))+EPS)
noise = noise/(max(abs(noise))+EPS)
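    # unlike snr_mixer above, the RMS here is measured only over active
    # (energetic) frames, so silent stretches do not bias the target SNR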
rmsclean, rmsnoise = active_rms(clean=clean, noise=noise)
clean = normalize_segmental_rms(clean, rms=rmsclean, target_level=target_level)
noise = normalize_segmental_rms(noise, rms=rmsnoise, target_level=target_level)
# Set the noise level for a given SNR
noisescalar = rmsclean / (10**(snr/20)) / (rmsnoise+EPS)
noisenewlevel = noise * noisescalar
# Mix noise and clean speech
noisyspeech = clean + noisenewlevel
    # Randomly pick an output RMS level between target_level_lower and target_level_upper (in dBFS) and normalize the noisy speech to it
    # Clipping can still occur with very low probability, which is not a major issue
noisy_rms_level = np.random.randint(params['target_level_lower'], params['target_level_upper'])
rmsnoisy = (noisyspeech**2).mean()**0.5
scalarnoisy = 10 ** (noisy_rms_level / 20) / (rmsnoisy+EPS)
noisyspeech = noisyspeech * scalarnoisy
clean = clean * scalarnoisy
noisenewlevel = noisenewlevel * scalarnoisy
# Final check to see if there are any amplitudes exceeding +/- 1. If so, normalize all the signals accordingly
if is_clipped(noisyspeech):
noisyspeech_maxamplevel = max(abs(noisyspeech))/(clipping_threshold-EPS)
noisyspeech = noisyspeech/noisyspeech_maxamplevel
clean = clean/noisyspeech_maxamplevel
noisenewlevel = noisenewlevel/noisyspeech_maxamplevel
noisy_rms_level = int(20*np.log10(scalarnoisy/noisyspeech_maxamplevel*(rmsnoisy+EPS)))
return clean, noisenewlevel, noisyspeech, noisy_rms_level
def active_rms(clean, noise, fs=16000, energy_thresh=-50):
    '''Returns the RMS of the clean and noise signals, computed only over the active (energetic) frames of the noise'''
window_size = 100 # in ms
window_samples = int(fs*window_size/1000)
sample_start = 0
noise_active_segs = []
clean_active_segs = []
while sample_start < len(noise):
sample_end = min(sample_start + window_samples, len(noise))
noise_win = noise[sample_start:sample_end]
clean_win = clean[sample_start:sample_end]
noise_seg_rms = 20*np.log10((noise_win**2).mean()+EPS)
# Considering frames with energy
if noise_seg_rms > energy_thresh:
noise_active_segs = np.append(noise_active_segs, noise_win)
clean_active_segs = np.append(clean_active_segs, clean_win)
sample_start += window_samples
if len(noise_active_segs)!=0:
noise_rms = (noise_active_segs**2).mean()**0.5
else:
noise_rms = EPS
if len(clean_active_segs)!=0:
clean_rms = (clean_active_segs**2).mean()**0.5
else:
clean_rms = EPS
return clean_rms, noise_rms
def activitydetector(audio, fs=16000, energy_thresh=0.13, target_level=-25):
'''Return the percentage of the time the audio signal is above an energy threshold'''
audio = normalize(audio, target_level)
window_size = 50 # in ms
window_samples = int(fs*window_size/1000)
sample_start = 0
cnt = 0
prev_energy_prob = 0
active_frames = 0
a = -1
b = 0.2
alpha_rel = 0.05
alpha_att = 0.8
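    # fast-attack / slow-release smoothing: the speech probability rises
    # quickly with energy (alpha_att) but decays slowly (alpha_rel)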
while sample_start < len(audio):
sample_end = min(sample_start + window_samples, len(audio))
audio_win = audio[sample_start:sample_end]
frame_rms = 20*np.log10(sum(audio_win**2)+EPS)
frame_energy_prob = 1./(1+np.exp(-(a+b*frame_rms)))
if frame_energy_prob > prev_energy_prob:
smoothed_energy_prob = frame_energy_prob*alpha_att + prev_energy_prob*(1-alpha_att)
else:
smoothed_energy_prob = frame_energy_prob*alpha_rel + prev_energy_prob*(1-alpha_rel)
if smoothed_energy_prob > energy_thresh:
active_frames += 1
prev_energy_prob = frame_energy_prob
sample_start += window_samples
cnt += 1
perc_active = active_frames/cnt
return perc_active
def resampler(input_dir, target_sr=16000, ext='*.wav'):
'''Resamples the audio files in input_dir to target_sr'''
files = glob.glob(f"{input_dir}/"+ext)
for pathname in files:
print(pathname)
        try:
            audio, fs = audioread(pathname)
            # keyword arguments work across librosa versions (positional
            # sample rates were removed in librosa 0.10)
            audio_resampled = librosa.resample(audio, orig_sr=fs, target_sr=target_sr)
            audiowrite(pathname, audio_resampled, target_sr)
        except Exception as e:  # report failures instead of silently swallowing them
            print('Skipping {}: {}'.format(pathname, e))
            continue
def audio_segmenter(input_dir, dest_dir, segment_len=10, ext='*.wav'):
'''Segments the audio clips in dir to segment_len in secs'''
files = glob.glob(f"{input_dir}/"+ext)
for i in range(len(files)):
audio, fs = audioread(files[i])
if len(audio) > (segment_len*fs) and len(audio)%(segment_len*fs) != 0:
audio = np.append(audio, audio[0 : segment_len*fs - (len(audio)%(segment_len*fs))])
if len(audio) < (segment_len*fs):
while len(audio) < (segment_len*fs):
audio = np.append(audio, audio)
audio = audio[:segment_len*fs]
num_segments = int(len(audio)/(segment_len*fs))
audio_segments = np.split(audio, num_segments)
basefilename = os.path.basename(files[i])
basename, ext = os.path.splitext(basefilename)
for j in range(len(audio_segments)):
newname = basename+'_'+str(j)+ext
destpath = os.path.join(dest_dir,newname)
audiowrite(destpath, audio_segments[j], fs)
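Before moving on to the synthesis driver, here is a minimal usage sketch of snr_mixer (my own illustration, not part of the toolkit): it mixes one clean file with one noise file at 5 dB SNR. The paths clean.wav and noise.wav are placeholders, and the params dict carries only the keys snr_mixer actually reads.
from audiolib import audioread, audiowrite, snr_mixer

# snr_mixer reads params['cfg'] (fetched but unused), 'target_level_lower'
# and 'target_level_upper' (the dBFS range for the output level)
params = {'cfg': None, 'target_level_lower': -35, 'target_level_upper': -15}
clean, fs = audioread('clean.wav')  # placeholder: any clean speech file
noise, _ = audioread('noise.wav')   # placeholder: any noise file
# the noise is rescaled so that 20*log10(rms(clean)/rms(noise)) = 5 dB
clean_out, noise_out, noisy, level = snr_mixer(params, clean, noise, snr=5)
audiowrite('noisy_snr5.wav', noisy, fs)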
mix.py
import os
import numpy as np
import glob
import random
from random import shuffle
import librosa
from audiolib import audioread,audiowrite,snr_mixer,segmental_snr_mixer
import configparser as CP
import argparse
import utils
import scipy.io.wavfile as wav
def read_audio_file2(path,fmt):
source_files = glob.glob(os.path.join(path,fmt))
shuffle(source_files)
return source_files
def read_label_file(path):
    '''Read a space-separated label file into a list of token lists'''
    with open(path, 'r') as fp:
        info = [line.strip().split(' ') for line in fp]
    return info
def write_label_file(path, labels):
    '''Write a list of token lists back as a space-separated label file'''
    with open(path, 'w') as fp:
        for label in labels:
            fp.write(' '.join(label) + '\n')
def combine_noise(data_paths, sample_length, silence_length=0.2, fs=16000):
    '''Randomly pick noise files and concatenate them, separated by short
    silences, until the combined noise reaches sample_length samples'''
    remain_length = sample_length
    sig_combine = np.zeros((sample_length), dtype=np.int16)
    silence = np.zeros(int(fs * silence_length))
    plen = len(data_paths)
    cnt = 0
    while remain_length > 0:
        index = random.randint(0, plen - 1)
        sr, sig = wav.read(data_paths[index])  # sr: sample rate of this noise clip
        sig_len = len(sig)
        if sig_len > remain_length:
            # the clip exceeds the remaining space: take only what fits
            sig_combine[:(cnt + remain_length)] = np.concatenate([sig_combine[:cnt], sig[:remain_length]])
            cnt = cnt + remain_length
        else:
            sig_combine[:(cnt + sig_len)] = np.concatenate([sig_combine[:cnt], sig])
            cnt = cnt + sig_len
        remain_length = remain_length - sig_len
        # insert a short silence between consecutive noise clips
        silence_len = 0
        if remain_length > 0:
            silence_len = min(remain_length, len(silence))
            sig_combine[:(cnt + silence_len)] = np.concatenate([sig_combine[:cnt], silence[:silence_len]])
            cnt = cnt + silence_len
            remain_length = remain_length - silence_len
    return sig_combine
def main_gen(params, clean_dir, noise_dir1, noise_dir2, N=1, flag=0):
    '''Generate noisy speech by mixing clean speech with combined noise'''
if flag:
utils.del_folder(params['noisyspeech_dir'])
utils.del_folder(params['clean_proc_dir'])
utils.del_folder(params['noise_proc_dir'])
utils.create_folder(params['noisyspeech_dir'])
utils.create_folder(params['clean_proc_dir'])
utils.create_folder(params['noise_proc_dir'])
    file_num = 0
fmt = params['audioformat']
    speech_path = clean_dir
speech_dirs,speech_names = utils.read_audio_file1(speech_path,'.wav')
noise_dirs1,noise_names1 = utils.read_audio_file1(noise_dir1,fmt)
noise_dirs2,noise_names2 = utils.read_audio_file1(noise_dir2,fmt)
noise_dirs = noise_dirs1 + noise_dirs2
noise_names = noise_names1 + noise_names2
print('speech_len = %d,noise_len = %d\n'%(len(speech_dirs),len(noise_dirs)))
for (speech_dir,speech_name) in zip(speech_dirs,speech_names):
clean,fs = audioread(speech_dir)
print('file_num = ',file_num)
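        # generate N noisy variants of each clean file, each with its own
        # random noise combination and SNR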
for i in range(N):
noise = combine_noise(noise_dirs,len(clean))
noise_name = speech_name
            # randomly draw a target SNR within the configured range
            snr = np.random.randint(params['snr_lower'], params['snr_upper'])
            # mix noise into the clean speech at that SNR to obtain the noisy sample
clean_snr, noise_snr, noisy_snr, target_level = snr_mixer(params=params,
clean=clean,
noise=noise,
snr=snr)
            # save the noisy, clean, and noise audio files
noisyfilename = speech_name + '_noisy_filed' + '_snr' + \
str(snr) + '_tl' + str(target_level) + '_fileid_' + str(file_num) + '.wav'
cleanfilename = speech_name + '_clean_fileid_' + str(file_num) + '.wav'
noisefilename = noise_name + '_noise_fileid_' + str(file_num) + '.wav'
noisypath = os.path.join(params['noisyspeech_dir'], noisyfilename)
cleanpath = os.path.join(params['clean_proc_dir'], cleanfilename)
noisepath = os.path.join(params['noise_proc_dir'], noisefilename)
audio_signals = [noisy_snr, clean_snr, noise_snr]
file_paths = [noisypath, cleanpath, noisepath]
file_num += 1
            for signal, path in zip(audio_signals, file_paths):  # avoid reusing the loop variable i
                try:
                    audiowrite(path, signal, params['fs'])
                except Exception as e:
                    print(str(e))
def main_body():
parser = argparse.ArgumentParser()
parser.add_argument('--cfg', default = 'vad_synthesizer.cfg',help = 'read vad_synthesizer.cfg')
parser.add_argument('--cfg_str', type = str, default = 'vad_speech')
args = parser.parse_args()
params = dict()
params['args'] = args
cfgpath = os.path.join(os.path.dirname(__file__), args.cfg)
    cfg = CP.ConfigParser(interpolation=CP.ExtendedInterpolation())
cfg.read(cfgpath)
params['cfg'] = cfg._sections[args.cfg_str]
cfg = params['cfg']
clean_dir = os.path.join(os.path.dirname(__file__), 'CleanSpeech')
if cfg['speech_dir'] != 'None':
clean_dir = cfg['speech_dir']
if not os.path.exists(clean_dir):
assert False, ('Clean speech data is required')
noise_dir = os.path.join(os.path.dirname(__file__), 'Noise')
    noise_dir1 = noise_dir
    noise_dir2 = noise_dir
    if cfg['noise_dir1'] != 'None':
        noise_dir1 = cfg['noise_dir1']
    if cfg['noise_dir2'] != 'None':
        noise_dir2 = cfg['noise_dir2']
    if not (os.path.exists(noise_dir1) and os.path.exists(noise_dir2)):
        assert False, ('Noise data is required')
params['fs'] = int(cfg['sampling_rate'])
params['audioformat'] = cfg['audioformat']
params['audio_length'] = float(cfg['audio_length'])
params['silence_length'] = float(cfg['silence_length'])
params['total_hours'] = float(cfg['total_hours'])
    if cfg['fileindex_start'] != 'None' and cfg['fileindex_end'] != 'None':
params['num_files'] = int(cfg['fileindex_end']) - int(cfg['fileindex_start'])
params['fileindex_start'] = int(cfg['fileindex_start'])
params['fileindex_end'] = int(cfg['fileindex_end'])
else:
params['num_files'] = int((params['total_hours'] * 60 * 60) / params['audio_length'])
params['fileindex_start'] = 0
params['fileindex_end'] = params['num_files']
print('Number of files to be synthesized:', params['num_files'])
params['is_test_set'] = utils.str2bool(cfg['is_test_set'])
params['clean_activity_threshold'] = float(cfg['clean_activity_threshold'])
params['noise_activity_threshold'] = float(cfg['noise_activity_threshold'])
params['snr_lower'] = int(cfg['snr_lower'])
params['snr_upper'] = int(cfg['snr_upper'])
params['randomize_snr'] = utils.str2bool(cfg['randomize_snr'])
params['target_level_lower'] = int(cfg['target_level_lower'])
params['target_level_upper'] = int(cfg['target_level_upper'])
params['noisyspeech_dir'] = utils.get_dir(cfg, 'noisy_destination', 'noisy')
params['noisylabel_dir'] = utils.get_dir(cfg, 'noisy_label_destination', 'label')
params['clean_proc_dir'] = utils.get_dir(cfg, 'clean_destination', 'clean')
params['noise_proc_dir'] = utils.get_dir(cfg, 'noise_destination', 'noise')
main_gen(params,clean_dir,noise_dir1,noise_dir2,1,1)
if __name__ == '__main__':
main_body()
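main_body reads its settings from vad_synthesizer.cfg, which is not shown in this post. As a rough sketch, a section with the keys the code above accesses might look like the following; the key names come from the cfg[...] reads in the code, while every value and path is merely illustrative:
[vad_speech]
sampling_rate: 16000
audioformat: *.wav
audio_length: 10
silence_length: 0.2
total_hours: 1
fileindex_start: None
fileindex_end: None
is_test_set: False
clean_activity_threshold: 0.6
noise_activity_threshold: 0.0
snr_lower: -5
snr_upper: 20
randomize_snr: True
target_level_lower: -35
target_level_upper: -15
speech_dir: ./CleanSpeech
noise_dir1: ./Noise1
noise_dir2: ./Noise2
noisy_destination: ./noisy
noisy_label_destination: ./label
clean_destination: ./clean
noise_destination: ./noise
With such a file in place, the script would be run as python mix.py --cfg vad_synthesizer.cfg --cfg_str vad_speech.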
The program above is all that is needed to synthesize the noisy training data. My understanding is limited, so corrections and suggestions are welcome.