深度学习和语音系列教程 2-100：语音录制6种模式和音频文件相关操作

最新推荐文章于 2024-08-23 00:06:25 发布

Mrrunsen

最新推荐文章于 2024-08-23 00:06:25 发布

阅读量234

点赞数

分类专栏：语音（深度学习) 文章标签： asynctask os csv profiling plist

本文链接：https://blog.csdn.net/Mrrunsen/article/details/118633827

版权

语音（深度学习) 专栏收录该内容

5 篇文章 11 订阅

订阅专栏

本文详细介绍了深度学习语音处理中采集语音样本时使用的6种录音模式，包括 Active-synchronous、Active-asynchronous、Passive-synchronous 等，并介绍了音频文件的清理步骤，如噪声去除、音量调整和静音消除。此外，还涵盖了说话人分离（speaker diarization）、将音频文件转换为 .FLAC 和 .OPUS 格式，以及通过 FTP 和 Google 云存储上传等存储方法。同时，文章强调了 MEMUPPS（Microphone, Environment, Mode, User Operation, Process, Publishing Medium, Storage）控制在确保高质量数据收集和分发中的重要性。

摘要由CSDN通过智能技术生成

录制模式

采集终端用户的语音样本时，主要有6种录音模式：

Mode	Description
Active-synchronous (AS) mode	同步录制音频，提示用户操作。
Active-asynchronous (AA) mode	异步录制音频，提示用户操作。
Passive-synchronous (PS) mode	后台同步录音，不提示用户操作。
Passive-asynchronous (PA) mode	在后台异步录制音频，不提示用户操作
Active-Passive synchronous (APS) mode	主动录制音频 - 提示用户操作，然后被动录制音频样本 - 全部以同步方式
Active-Passive asynchronous (APA) mode	主动录制音频 - 提示用户操作，然后被动录制音频样本 - 全部以异步方式。

Active-synchronous (AS) mode

as_record.py

import sounddevice as sd
import soundfile as sf 
from bs4 import BeautifulSoup
import speech_recognition as sr_audio
import os, pyttsx3, pygame, time

def sync_record(filename, duration, fs, channels):
    """Record audio and save it to ``filename``.

    Captures ``duration`` seconds at ``fs`` Hz with ``channels`` channels,
    waits for the capture to finish, then writes the samples to disk.
    """
    print('recording')
    frame_count = int(duration * fs)
    samples = sd.rec(frame_count, samplerate=fs, channels=channels)
    # Block until the recording started by sd.rec() has finished.
    sd.wait()
    sf.write(filename, samples, fs)
    print('done recording')
    
def sync_playback(filename):
    """Load ``filename`` into the pygame mixer and start playing it."""
    mixer = pygame.mixer
    mixer.init()
    player = mixer.music
    player.load(filename)
    player.play()

def speak_text(text):
    """Speak ``text`` with pyttsx3 and return once ``runAndWait()`` completes."""
    tts = pyttsx3.init()
    tts.say(text)
    tts.runAndWait()

def transcribe_audio_sphinx(filename):
    """Transcribe the audio file ``filename`` with the Sphinx recognizer.

    Prints the transcript and returns it as a string.
    """
    recognizer = sr_audio.Recognizer()
    with sr_audio.AudioFile(filename) as source:
        captured = recognizer.record(source)
    transcript = recognizer.recognize_sphinx(captured)
    print('transcript: ' + transcript)
    return transcript
    
def fetch_weather():
    """Open the Yahoo weather page in the user's default web browser.

    Uses the standard-library ``webbrowser`` module instead of shelling out
    to the ``open`` command, which only exists on macOS.
    """
    import webbrowser

    webbrowser.open('https://www.yahoo.com/news/weather')

# Ask the question aloud, cue the user with a beep, then record the answer.
speak_text('would you like to get the weather?')
sync_playback('beep.mp3')
time.sleep(2)  # pause after starting the beep, before recording begins
sync_record('response.wav', 2, 16000, 1)
transcript = transcribe_audio_sphinx('response.wav')

# Open the weather page only when the reply contains an affirmative word.
reply = transcript.lower()
if 'yes' in reply or 'yeah' in reply:
    fetch_weather()

Active-asynchronous (AA) mode

aa_record.py

import sounddevice as sd
import soundfile as sf 
from bs4 import BeautifulSoup
import speech_recognition as sr_audio
import os, pyttsx3, pdfkit, pygame

def sync_record(filename, duration, fs, channels):
    """Record audio to ``filename`` while fetching the weather in the meantime.

    ``fetch_weather()`` is called after the recording has been started and
    before ``sd.wait()``, so the fetch runs during the capture. The fetch is
    best effort: a failure is reported but never aborts the recording.
    """
    print('recording')
    myrecording = sd.rec(int(duration * fs), samplerate=fs, channels=channels)
    try:
        fetch_weather()
    except Exception as err:
        # Keep recording even if the weather page could not be fetched, but
        # say so instead of hiding the error. KeyboardInterrupt and
        # SystemExit are deliberately not caught.
        print('could not fetch weather: %s' % err)
    # Block until the recording started by sd.rec() has finished.
    sd.wait()
    sf.write(filename, myrecording, fs)
    print('done recording')
    
def sync_playback(filename):
    """Load ``filename`` into the pygame mixer and start playing it."""
    mixer = pygame.mixer
    mixer.init()
    player = mixer.music
    player.load(filename)
    player.play()

def speak_text(text):
    """Speak ``text`` with pyttsx3 and return once ``runAndWait()`` completes."""
    tts = pyttsx3.init()
    tts.say(text)
    tts.runAndWait()

def transcribe_audio_sphinx(filename):
    """Transcribe the audio file ``filename`` with the Sphinx recognizer.

    Prints the transcript and returns it as a string.
    """
    recognizer = sr_audio.Recognizer()
    with sr_audio.AudioFile(filename) as source:
        captured = recognizer.record(source)
    transcript = recognizer.recognize_sphinx(captured)
    print('transcript: ' + transcript)
    return transcript
    
def fetch_weather():
    """Render the Yahoo weather page to ``out.pdf`` with pdfkit."""
    pdfkit.from_url('https://www.yahoo.com/news/weather', 'out.pdf')

# This listing's import line does not include ``time``, so import it here;
# without it the sleep below raises NameError.
import time

# Ask the question aloud, cue the user with a beep, then record the answer.
# sync_record() fetches the weather PDF while the recording is running.
speak_text('would you like to get the weather?')
sync_playback('beep.mp3')
time.sleep(1.2)
sync_record('response.wav', 5, 16000, 1)
transcript = transcribe_audio_sphinx('response.wav')

# Show the already-downloaded PDF only for an affirmative reply.
reply = transcript.lower()
if 'yes' in reply or 'yeah' in reply:
    speak_text('ok, great here it is.')
    os.system('open out.pdf')

Passive-synchronous (PS) mode

ps_record.py

import sounddevice as sd
import soundfile as sf 
import time, os, shutil 


def sync_record(filename, duration, fs, channels):
    """Record audio and save it to ``filename``.

    Captures ``duration`` seconds at ``fs`` Hz with ``channels`` channels,
    waits for the capture to finish, then writes the samples to disk.
    """
    print('recording')
    frame_count = int(duration * fs)
    samples = sd.rec(frame_count, samplerate=fs, channels=channels)
    # Block until the recording started by sd.rec() has finished.
    sd.wait()
    sf.write(filename, samples, fs)
    print('done recording')


# Make a fresh 'recordings' folder and work inside it.
try:
    os.mkdir('recordings')
except FileExistsError:
    # A folder from a previous run exists: discard it and start over.
    shutil.rmtree('recordings')
    os.mkdir('recordings')
os.chdir(os.path.join(os.getcwd(), 'recordings'))

# Record ten 2-second mono samples (1.wav .. 10.wav), pausing 10 seconds
# after each one. For continuous collection use ``while True:`` instead.
for i in range(10):
    filename = str(i + 1) + '.wav'
    print('recording %s' % (filename))
    sync_record(filename, 2, 16000, 1)
    time.sleep(10)

Passive-asynchronous (PA) mode

pa_record.py

import sounddevice as sd
import soundfile as sf 
import time, os, shutil, psutil 
# helper functions: battery status and synchronous recording (the latter as in Chapter 1)
def get_battery():
    """Return the current battery charge as a string, e.g. ``'87'``.

    Returns ``'unknown'`` when psutil cannot report a battery
    (``sensors_battery()`` returned ``None``) instead of raising
    AttributeError.
    """
    battery = psutil.sensors_battery()
    if battery is None:
        return 'unknown'
    return str(battery.percent)

def sync_record(filename, duration, fs, channels):
    """Record audio to ``filename`` and report the battery level meanwhile.

    The battery message is printed after the recording has been started and
    before waiting for it to finish.
    """
    print('recording')
    frame_count = int(duration * fs)
    samples = sd.rec(frame_count, samplerate=fs, channels=channels)
    battery_level = get_battery()
    print('battery is currently at %s' % battery_level)
    # Block until the recording started by sd.rec() has finished.
    sd.wait()
    sf.write(filename, samples, fs)
    print('done recording')
# Make a fresh 'recordings' folder and work inside it.
try:
    os.mkdir('recordings')
except FileExistsError:
    # A folder from a previous run exists: discard it and start over.
    shutil.rmtree('recordings')
    os.mkdir('recordings')
os.chdir(os.path.join(os.getcwd(), 'recordings'))

# Record ten 2-second mono samples (1.wav .. 10.wav), pausing 10 seconds
# after each one. For continuous collection use ``while True:`` instead.
for i in range(10):
    filename = str(i + 1) + '.wav'
    print('recording %s' % (filename))
    sync_record(filename, 2, 16000, 1)
    time.sleep(10)

Active-passive synchronous (APS) mode

aps_record.py

import os

# Only one configuration exists: the active-synchronous (AS) script runs
# first, then the passive-synchronous (PS) script.
for command in ('python3 as_record.py', 'python3 ps_record.py'):
    os.system(command)

Active-passive-asynchronous (APA) mode

apa_record.py

import os

# APA CONFIG 1 (AA → PA): run the active-asynchronous script, then the
# passive-asynchronous script.
for command in ('python3 aa_record.py', 'python3 pa_record.py'):
    os.system(command)

# Alternative configurations; swap the commands above to use one of them:
#   APA CONFIG 2 (AS → PA): 'python3 as_record.py', 'python3 pa_record.py'
#   APA CONFIG 3 (AA → PS): 'python3 aa_record.py', 'python3 ps_record.py'

清理音频文件

Removing noise

remove_noise.py

import soundfile as sf
import os

def remove_noise(filename):
    """Denoise ``filename`` in place with SoX and return its name.

    The first tenth of a second of the recording is written out as a noise
    sample, ``sox ... noiseprof`` builds a noise profile from it, and
    ``sox ... noisered`` applies that profile to the whole file. The original
    is replaced only when SoX succeeded, so a failed SoX run no longer
    deletes the input file.
    """
    import shlex

    data, samplerate = sf.read(filename)

    # Use the first 0.1 s as the noise sample. Slicing (instead of indexing
    # sample by sample) also copes with clips shorter than 0.1 s.
    noise_frames = int(samplerate / 10)
    noisefile = 'noiseprof.wav'
    sf.write(noisefile, data[:noise_frames], samplerate)
    os.system('sox %s -n noiseprof noise.prof' % (noisefile))

    # Apply the noise profile, writing the result to a temporary file.
    denoised = 'tempfile.wav'
    command = 'sox %s %s noisered noise.prof 0.21 ' % (
        shlex.quote(filename), shlex.quote(denoised))
    status = os.system(command)
    print(command)

    # Swap in the denoised file only if SoX reported success and wrote it.
    if status == 0 and os.path.exists(denoised):
        os.replace(denoised, filename)

    # Remove the intermediate files.
    for leftover in (noisefile, 'noise.prof'):
        if os.path.exists(leftover):
            os.remove(leftover)

    return filename

# Example: denoise test.wav in place.
remove_noise(filename='test.wav')

改变音量

change_volume.py

import os

def change_volume(filename, vol):
    """Write a copy of ``filename`` with its volume scaled by ``vol`` using SoX.

    The copy is named ``<stem>_increase_<vol>.wav`` when ``vol`` is greater
    than 1 and ``<stem>_decrease_<vol>.wav`` otherwise. Returns the new file
    name.
    """
    import shlex

    # splitext handles extensions of any length, unlike slicing off 4 chars.
    stem = os.path.splitext(filename)[0]
    direction = 'increase' if vol > 1 else 'decrease'
    new_file = '%s_%s_%s.wav' % (stem, direction, vol)
    # Quote the paths so names containing spaces survive the shell.
    os.system('sox -v %s %s %s' % (vol, shlex.quote(filename), shlex.quote(new_file)))
    return new_file
# Double the volume of 5.wav, then halve it; each call writes a new file.
new_file = change_volume('5.wav', vol=2)
new_file = change_volume('5.wav', vol=0.5)

裁剪音频

trim_audio.py

import os
def trim_audio(filename, start, end):
    """Cut the span from ``start`` to ``end`` (seconds) out of ``filename``.

    Runs ``sox <in> <out> trim <start> <duration>`` and returns the name of
    the new file, ``<stem>_trimmed_<start>_<end>.wav``.
    """
    import shlex

    # SoX's trim effect takes a start position and a length, not an end.
    clip_duration = end - start
    # splitext handles extensions of any length, unlike slicing off 4 chars.
    stem = os.path.splitext(filename)[0]
    new_filename = '%s_trimmed_%s_%s.wav' % (stem, start, end)
    command = 'sox %s %s trim %s %s' % (
        shlex.quote(filename), shlex.quote(new_filename), start, clip_duration)
    os.system(command)
    return new_filename

# Keep seconds 30 through 40 of test.wav => test_trimmed_30_40.wav
trim_audio('test.wav', start=30, end=40)

组合音频文件

combine.py

import os

def combine_files(one, two):
    """Combine two audio files into one with SoX and return the new name.

    Runs ``sox <one> <two> <out>``; the output is named
    ``<stem of one>_<stem of two>.wav``.
    """
    import shlex

    # splitext handles extensions of any length, unlike slicing off 4 chars.
    first_stem = os.path.splitext(one)[0]
    second_stem = os.path.splitext(two)[0]
    three = first_stem + '_' + second_stem + '.wav'
    # Quote the paths so names containing spaces survive the shell.
    os.system('sox %s %s %s' % (shlex.quote(one), shlex.quote(two), shlex.quote(three)))
    return three

# Example: test1.wav + test2.wav => test1_test2.wav
combine_files(one='test1.wav', two='test2.wav')

转码

transcode.py

import os

def combine_files(one, two):
    """Combine two audio files into one with SoX and return the new name.

    Runs ``sox <one> <two> <out>``; the output is named
    ``<stem of one>_<stem of two>.wav``.

    NOTE(review): this listing is labelled transcode.py but its code is the
    same as combine.py — presumably the transcoding example was lost; confirm
    against the original material.
    """
    import shlex

    # splitext handles extensions of any length, unlike slicing off 4 chars.
    first_stem = os.path.splitext(one)[0]
    second_stem = os.path.splitext(two)[0]
    three = first_stem + '_' + second_stem + '.wav'
    # Quote the paths so names containing spaces survive the shell.
    os.system('sox %s %s %s' % (shlex.quote(one), shlex.quote(two), shlex.quote(three)))
    return three

# Example: test1.wav + test2.wav => test1_test2.wav
combine_files(one='test1.wav', two='test2.wav')

更改采样率

change_samplerate.py

import os

def change_samplerate(filename, samplerate):
    """Resample ``filename`` to ``samplerate`` Hz with SoX.

    Returns the name of the new file, ``<stem>_sr<samplerate>.wav``.
    """
    import shlex

    # splitext handles extensions of any length, unlike slicing off 4 chars.
    stem = os.path.splitext(filename)[0]
    new_filename = '%s_sr%s.wav' % (stem, samplerate)
    # Pass the rate in Hz. Truncating to whole kHz (e.g. "44k") would turn
    # 44100 Hz into 44000 Hz.
    os.system('sox %s -r %s %s' % (
        shlex.quote(filename), int(samplerate), shlex.quote(new_filename)))
    return new_filename

# Example: resample test.wav to 48 kHz => test_sr48000.wav
change_samplerate('test.wav', samplerate=48000)

更改声道数

change_channels.py

import os

def stereo2mono(filename):
    """Mix a stereo file down to mono with SoX (``remix 1-2``).

    Returns the name of the new file, ``<stem>_mono.wav``.
    """
    import shlex

    # splitext handles extensions of any length, unlike slicing off 4 chars.
    new_filename = os.path.splitext(filename)[0] + '_mono.wav'
    os.system('sox %s %s remix 1-2' % (shlex.quote(filename), shlex.quote(new_filename)))
    return new_filename
def separate_channels(filename):
    """Split a stereo file into two mono files, one per channel.

    Runs ``sox ... remix 1`` and ``sox ... remix 2`` and returns the pair
    ``(<stem>_1.wav, <stem>_2.wav)``.
    """
    import shlex

    # splitext handles extensions of any length, unlike slicing off 4 chars.
    stem = os.path.splitext(filename)[0]
    channel_1 = stem + '_1.wav'
    channel_2 = stem + '_2.wav'
    source = shlex.quote(filename)
    os.system('sox %s %s remix 1' % (source, shlex.quote(channel_1)))
    os.system('sox %s %s remix 2' % (source, shlex.quote(channel_2)))
    return channel_1, channel_2
def multiplex(channel_1, channel_2):
    """Merge two mono files into one file with ``sox -M`` (multiplexing).

    Returns the name of the new file,
    ``<stem of channel_1>_<stem of channel_2>.wav``.
    """
    import shlex

    # splitext handles extensions of any length, unlike slicing off 4 chars.
    output = '%s_%s.wav' % (
        os.path.splitext(channel_1)[0], os.path.splitext(channel_2)[0])
    os.system('sox -M %s %s %s' % (
        shlex.quote(channel_1), shlex.quote(channel_2), shlex.quote(output)))
    return output

# Examples: mix down, split into channels, then merge the two channels again.
stereo2mono(filename='stereo.wav')
separate_channels(filename='stereo.wav')
multiplex(channel_1='stereo_1.wav', channel_2='stereo_2.wav')

消除静音

trim_silence.py

import os

def trim_silence(filename):
    """Remove silence from ``filename`` with SoX's ``silence`` effect.

    The effect is run with the arguments ``-l 1 0.1 1% -1 2.0 1%`` (see the
    SoX manual for their exact meaning). Returns the name of the new file,
    ``<stem>_trimmed.wav``.
    """
    import shlex

    # splitext handles extensions of any length, unlike slicing off 4 chars.
    new_filename = os.path.splitext(filename)[0] + '_trimmed.wav'
    # '%%' is a literal percent sign; the thresholds are passed to SoX as "1%".
    command = 'sox %s %s silence -l 1 0.1 1%% -1 2.0 1%%' % (
        shlex.quote(filename), shlex.quote(new_filename))
    os.system(command)
    return new_filename

# Example: strip silence from test.wav => test_trimmed.wav
trim_silence(filename='test.wav')

说话人分离（Speaker diarization）

Run this in the terminal:

cd ~
cd voicebook/chapter_2_collection
python3 diarize.py

现在当前目录中有两个文件夹：diarize_incoming 和 diarize_processed。将需要进行说话人分离的文件放入 diarize_incoming 文件夹后，文件会被自动分离为 Speaker A 和 Speaker B 两部分。然后使用 Google Speech API（如果可用）或 PocketSphinx 对每个说话人的音频进行转录。

存储语音文件

converting to .FLAC format

convert_flac.py

import shutil, os, ffmpy 

def zipdir(folder, delete):
    """Zip ``folder`` into ``<folder>.zip`` and optionally delete the folder.

    Args:
        folder: directory to archive; the archive is created next to it.
        delete: when truthy, remove ``folder`` after the archive is written.

    Returns:
        The path of the created archive, as reported by
        ``shutil.make_archive``.
    """
    archive = shutil.make_archive(folder, 'zip', folder)
    if delete:
        shutil.rmtree(folder)
    return archive

def convert_flac():
    """Convert every non-FLAC file in the current directory to FLAC.

    Each file whose name does not end in ``flac`` is passed to
    ``ffmpeg -i <file> <stem>.flac``. The source file is deleted only after
    ffmpeg succeeded and the .flac file exists, so a failed conversion no
    longer destroys the recording.

    Returns:
        The list of source file names that were converted and removed.
    """
    import shlex

    removedfiles = []
    for name in os.listdir():
        if name.endswith('flac'):
            continue
        # splitext handles extensions of any length, unlike slicing off 4 chars.
        newfile = os.path.splitext(name)[0] + '.flac'
        status = os.system('ffmpeg -i %s %s' % (shlex.quote(name), shlex.quote(newfile)))
        if status == 0 and os.path.exists(newfile):
            os.remove(name)
            removedfiles.append(name)
    return removedfiles

# The 'recordings' folder should hold the 10 files from ps_record.py;
# run that script first when the folder is missing.
hostdir = os.getcwd()
existing_entries = os.listdir()
if 'recordings' not in existing_entries:
    os.system('python3 ps_record.py')

# Convert everything inside the recordings folder.
os.chdir('%s/recordings' % hostdir)
convert_flac()

# Back in the main directory: zip the folder and delete the original.
os.chdir(hostdir)
zipdir(folder='recordings', delete=True)

converting to .OPUS format

import shutil, os, ffmpy 

def zipdir(folder, delete):
    """Zip ``folder`` into ``<folder>.zip`` and optionally delete the folder.

    Args:
        folder: directory to archive; the archive is created next to it.
        delete: when truthy, remove ``folder`` after the archive is written.

    Returns:
        The path of the created archive, as reported by
        ``shutil.make_archive``.
    """
    archive = shutil.make_archive(folder, 'zip', folder)
    if delete:
        shutil.rmtree(folder)
    return archive

def convert_opus(opusdir):
    """Encode every non-Opus file in the current directory with ``opusenc``.

    Each file is copied into ``opusdir``, encoded there, and the resulting
    .opus file is copied back next to the source. The source is deleted only
    after the encode succeeded, the staged copies in ``opusdir`` are always
    cleaned up, and the working directory is always restored.

    Args:
        opusdir: folder in which ``opusenc`` is run.

    Returns:
        The list of source file names that were encoded and removed.
    """
    import shlex

    curdir = os.getcwd()
    removedfiles = []
    for name in os.listdir():
        if name.endswith('opus') or not os.path.isfile(name):
            continue
        # splitext handles extensions of any length, unlike slicing off 4 chars.
        newfile = os.path.splitext(name)[0] + '.opus'
        encoded = False

        # Stage the source in the opus folder and encode it from there.
        shutil.copy(os.path.join(curdir, name), os.path.join(opusdir, name))
        os.chdir(opusdir)
        try:
            status = os.system('opusenc %s %s' % (shlex.quote(name), shlex.quote(newfile)))
            encoded = status == 0 and os.path.exists(newfile)
            if encoded:
                shutil.copy(os.path.join(opusdir, newfile), os.path.join(curdir, newfile))
            # Clean up the staged copies whether or not the encode worked.
            if os.path.exists(newfile):
                os.remove(newfile)
            os.remove(name)
        finally:
            os.chdir(curdir)

        # Only drop the original once its .opus replacement is in place.
        if encoded:
            os.remove(name)
            removedfiles.append(name)
    return removedfiles

# The 'recordings' folder should hold the 10 files from ps_record.py;
# run that script first when the folder is missing.
hostdir = os.getcwd()
opusdir = '%s/opustools' % hostdir
existing_entries = os.listdir()
if 'recordings' not in existing_entries:
    os.system('python3 ps_record.py')

# Encode everything inside the recordings folder.
os.chdir('%s/recordings' % hostdir)
convert_opus(opusdir)

# Back in the main directory: zip the folder and delete the original.
os.chdir(hostdir)
zipdir(folder='recordings', delete=True)

解包压缩的.FLAC或.OPUS文件

unpacking_files.py

import zipfile, os, shutil

def unzip(file):
    """Extract the zip archive ``file`` from the current directory.

    The contents go into a folder named after the archive without its
    extension (``recordings.zip`` -> ``recordings/``).
    """
    filepath = os.path.join(os.getcwd(), file)
    folderpath = os.path.splitext(filepath)[0]
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(filepath) as archive:
        archive.extractall(path=folderpath)
def convert_wav(opusdir):
    """Convert every .flac and .opus file in the current directory to .wav.

    .flac files are decoded in place with ffmpeg. .opus files are copied into
    ``opusdir``, decoded there with ``opusdec``, and the result is copied
    back. A source file is deleted only after its .wav was actually produced,
    and the working directory is always restored.

    Args:
        opusdir: folder in which ``opusdec`` is run.
    """
    import shlex

    curdir = os.getcwd()
    for name in os.listdir():
        stem, ext = os.path.splitext(name)
        newfile = stem + '.wav'
        if ext == '.flac':
            status = os.system('ffmpeg -i %s %s' % (shlex.quote(name), shlex.quote(newfile)))
            if status == 0 and os.path.exists(newfile):
                os.remove(name)
        elif ext == '.opus':
            print(name)
            decoded = False
            # Stage the source in the opus folder and decode it from there.
            shutil.copy(os.path.join(curdir, name), os.path.join(opusdir, name))
            os.chdir(opusdir)
            try:
                status = os.system('opusdec %s %s' % (shlex.quote(name), shlex.quote(newfile)))
                decoded = status == 0 and os.path.exists(newfile)
                if decoded:
                    shutil.copy(os.path.join(opusdir, newfile), os.path.join(curdir, newfile))
                # Clean up the staged copies whether or not the decode worked.
                if os.path.exists(newfile):
                    os.remove(newfile)
                os.remove(name)
            finally:
                os.chdir(curdir)
            # Only drop the original once its .wav replacement is in place.
            if decoded:
                os.remove(name)

# Unpack recordings.zip into a 'recordings' folder.
unzip(file='recordings.zip')
# Resolve the opus tools folder before changing directory, then convert
# everything inside the unpacked folder to .wav.
opusdir = '%s/opustools' % os.getcwd()
os.chdir('recordings')
unpacked_entries = os.listdir()
print(unpacked_entries)
convert_wav(opusdir)

将文件上载到FTP服务器

store_ftp.py

import sounddevice as sd
import soundfile as sf 
import time, os, shutil 
from ftplib import FTP


def sync_record(filename, duration, fs, channels):
    """Record audio, save it to ``filename`` and return that name.

    Captures ``duration`` seconds at ``fs`` Hz with ``channels`` channels and
    waits for the capture to finish before writing the file.
    """
    print('recording')
    frame_count = int(duration * fs)
    samples = sd.rec(frame_count, samplerate=fs, channels=channels)
    # Block until the recording started by sd.rec() has finished.
    sd.wait()
    sf.write(filename, samples, fs)
    print('done recording')
    return filename

def upload_file(file, session):
    """Upload the local file ``file`` over the FTP ``session``.

    The file is sent with ``STOR <file>`` in 1024-byte blocks. The local file
    handle is closed even if the transfer raises.
    """
    with open(file, 'rb') as uploadfile:
        session.storbinary('STOR %s' % (file), uploadfile, 1024)

# Read the server address and credentials from the environment.
domain = os.environ['DOMAIN_NAME']
username = os.environ['DOMAIN_USER']
password = os.environ['DOMAIN_PASSWORD']

# Log in. Only ``FTP`` is imported (``from ftplib import FTP``), so it is
# called directly; ``ftplib.FTP`` would raise NameError.
session = FTP(domain, username, password)

try:
    # Record a sample (this could be a loop recording several samples).
    file = sync_record('test.wav', 10, 16000, 1)

    # Upload it to the server, then remove the local copy.
    upload_file(file, session)
    os.remove(file)
finally:
    # Log off even if recording or uploading failed.
    session.quit()

uploading files to Google cloud storage

store_gcp.py

import sounddevice as sd
import soundfile as sf 
import time, os, shutil 
from google.cloud import storage

def sync_record(filename, duration, fs, channels):
    """Record audio, save it to ``filename`` and return that name.

    Captures ``duration`` seconds at ``fs`` Hz with ``channels`` channels and
    waits for the capture to finish before writing the file.
    """
    print('recording')
    frame_count = int(duration * fs)
    samples = sd.rec(frame_count, samplerate=fs, channels=channels)
    # Block until the recording started by sd.rec() has finished.
    sd.wait()
    sf.write(filename, samples, fs)
    print('done recording')
    return filename


def upload_gcp(bucket_name, source_file_name):
    """Upload a local file to a Google Cloud Storage bucket.

    The blob is stored under the same name as the local file.

    Args:
        bucket_name: name of an existing bucket.
        source_file_name: path of the local file to upload.
    """
    destination_blob_name = source_file_name
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    print('File {} uploaded to {}.'.format(
        source_file_name,
        destination_blob_name))

# Instantiate a client.
storage_client = storage.Client()

# The name for the new bucket.
bucket_name = 'test-bucket'

# Create the new bucket.
bucket = storage_client.create_bucket(bucket_name)
print('Bucket {} created.'.format(bucket.name))

# Get a recording (this could be a loop too). The helper is named
# sync_record; the previous spelling ``syn_record`` raised NameError.
file = sync_record('test.wav', 10, 16000, 1)
# Upload the recording to the bucket.
upload_gcp(bucket_name, file)
# Delete the local file once it has been uploaded.
os.remove(file)

MEMUPPS voice controls

保持一致的麦克风类型（Microphone）、录制环境（Environment）、录制模式（Mode）、用户操作（User operation）、处理流程（Process）、发布媒介（Publishing medium）和存储方式（Storage method），对于收集和分发高质量数据非常重要。这些统称为 MEMUPPS 控制。
label_memupps.py

import os, taglib, json
import sounddevice as sd
import soundfile as sf 

def get_defaults():
    """Return the MEMUPPS labels, asking for them on first use.

    When ``label.json`` exists in the current directory its contents are
    loaded and returned. Otherwise the user is prompted for each label, the
    answers are saved to ``label.json`` and then returned.

    Returns:
        A dict with the keys ``microphone``, ``environment``, ``mode``,
        ``sample type``, ``distance``, ``processing`` and ``storage`` when
        freshly prompted; otherwise whatever ``label.json`` contains.
    """
    if os.path.isfile('label.json'):
        # json.load needs an open file object, not a file name.
        with open('label.json') as jsonfile:
            data = json.load(jsonfile)
    else:
        mic = input('what is the microphone?')
        env = input('what is the environment?')
        mode = input('what is the mode?')
        sampletype = input('sample type? (e.g. voice)')
        distance = input('what is the distance from mic?')
        process = input('do you use any processing (e.g. SoX noisefloor, .wav--> .opus --> .wav)? if so what?')
        storage = input('where are you storing files?')
        data = {
            'microphone': mic,
            'environment': env,
            'mode': mode,
            'sample type': sampletype,
            'distance': distance,
            'processing': process,
            'storage': storage,
        }

        # Save the answers so later calls do not prompt again.
        with open('label.json', 'w') as jsonfile:
            json.dump(data, jsonfile)
    return data

def label_sample(file):
    """Write the MEMUPPS labels from ``get_defaults()`` into ``file``'s tags.

    ``file`` is resolved relative to the current working directory. The
    opened taglib object is printed before the tags are set and saved.
    """
    defaults = get_defaults()
    audio = taglib.File(os.getcwd() + '/' + file)
    print(audio)
    label_keys = (
        'microphone',
        'environment',
        'mode',
        'sample type',
        'distance',
        'processing',
        'storage',
    )
    for key in label_keys:
        audio.tags[key] = defaults[key]
    audio.save()

def sync_record(filename, duration, fs, channels):
    """Record audio to ``filename`` and tag the file with the MEMUPPS labels.

    Captures ``duration`` seconds at ``fs`` Hz with ``channels`` channels,
    writes the file, then calls ``label_sample()`` on it.
    """
    print('recording')
    frame_count = int(duration * fs)
    samples = sd.rec(frame_count, samplerate=fs, channels=channels)
    # Block until the recording started by sd.rec() has finished.
    sd.wait()
    sf.write(filename, samples, fs)
    print('done recording')
    label_sample(filename)

# Record a 10-second mono sample and tag it with the MEMUPPS labels.
# NOTE(review): 18000 Hz is unusual (the other examples here use 16000) —
# confirm it is intended.
file = 'test.wav'
sync_record(file, duration=10, fs=18000, channels=1)