语音计算定义
语音计算旨在开发硬件或软件来处理语音输入。 以下是在语音中会遇到的一些常见术语:
Term | Definition |
---|---|
voice computer | 任何可以处理语音输入的计算机化系统(组装的硬件和软件). |
voice computing software | 可以读/写、记录、清理、加密/解密、回放、转码、转录、压缩、发布、特征化、建模和可视化语音文件. |
voice computing hardware | 可以包括主板、麦克风、声卡(带有 D/A 和 A/D 转换器)、中央处理器 (CPU)、图形卡、存储设备(例如硬盘)、计算机显示器、WiFi 芯片、蓝牙芯片、无线电发射器 、扬声器和电源. |
microphone | 将声音(例如空气中的压力波)转换为电信号(例如安培 - C/s)的换能器。 |
sound cards | 通过音频编解码器将音频从 PCM 数据转换为各种音频格式(例如 .WAV). |
codec | 用于对数字音频数据与数字音频编码格式进行编码和解码的软件程序。 |
audio coding format | 已由音频编解码器程序处理的数字信号的输出文件类型. |
transcoding | 将一种音频编码格式转换为另一种音频编码格式的过程. |
audio channels | 记录的音频信号的音频输入或输出的数量。 |
speaker | 扬声器的工作方式与麦克风相反,其中模拟声音是从电信号(例如电流,单位为安培)转换而来的。 |
如何读写音频文件
使用各种库:pydub、wave、librosa、scipy 和 soundfile。
from pydub import AudioSegment

# Read a WAV file into an AudioSegment.
data = AudioSegment.from_wav("test.wav")
# export() defaults to format='mp3'; pass format explicitly so the
# .wav-named output actually contains WAV data.
data.export("new_test.wav", format="wav")
import wave

# Open read-only; the context manager guarantees the file handle is
# closed even if getparams() raises (the original never closed it).
with wave.open('test.wav', mode='rb') as data:
    params = data.getparams()
# e.g. _wave_params(nchannels=1, sampwidth=2, framerate=16000, nframes=47104, comptype='NONE', compname='not compressed')
import librosa
import soundfile as sf

# NOTE: librosa.load resamples to 22050 Hz by default; pass sr=None to
# keep the file's native rate.
y, sr = librosa.load('test.wav')
# librosa.output.write_wav was removed in librosa 0.8; the recommended
# replacement is soundfile.write (soundfile is already used in this guide).
sf.write('new_test.wav', y, sr)
from scipy.io import wavfile
fs, data = wavfile.read('test.wav')
wavfile.write('new_test.wav',fs, data)
import soundfile as sf
data, fs = sf.read('test.wav')
sf.write('new_test.ogg', data, fs)
处理音频文件
以下示例假设 SoX 已安装在主机系统上。
import os

# NOTE(review): these shell strings are fixed constants, which is safe;
# for user-supplied paths prefer subprocess.run([...]) to avoid injection.
# take in one.wav and two.wav to make three.wav (concatenation)
os.system('sox one.wav two.wav three.wav')
# take first second of one.wav and output to output.wav
os.system('sox one.wav output.wav trim 0 1')
# make volume 2x in one.wav and output to volup.wav
os.system('sox -v 2.0 one.wav volup.wav')
# make volume 1/2 in one.wav and output to voldown.wav
# (fixed: '-v -0.5' would halve AND invert polarity, and the output
# filename previously said volup.wav, contradicting the comment)
os.system('sox -v 0.5 one.wav voldown.wav')
# reverse one.wav and output to reverse.wav
os.system('sox one.wav reverse.wav reverse')
# change sample rate of one.wav to 16000 Hz
os.system('sox one.wav -r 16000 sr.wav')
# change audio file to 16 bit quality
os.system('sox -b 16 one.wav 16bit.wav')
# convert mono file to stereo by cloning channels
os.system('sox one.wav -c 2 stereo.wav')
# make stereo file mono by averaging out the channels
os.system('sox stereo.wav -c 1 mono.wav')
# double speed of file
os.system('sox one.wav 2x.wav speed 2.0')
播放音频文件
同步播放
play_sync.py
'''
play_sync.py
Play back an audio file synchronously.
'''
import pygame

def sync_playback(filename):
    """Play back an audio file, blocking until playback has finished."""
    pygame.mixer.init()
    pygame.mixer.music.load(filename)
    pygame.mixer.music.play()
    # play() returns immediately, so without this loop the function is
    # actually asynchronous (and a short script could exit mid-playback).
    clock = pygame.time.Clock()
    while pygame.mixer.music.get_busy():
        clock.tick(10)

sync_playback('one.wav')
异步播放
play_async.py
import sounddevice as sd
import soundfile as sf
import time

def async_playback(filename):
    """Start non-blocking playback of an audio file; return (samples, rate)."""
    samples, rate = sf.read(filename)
    sd.play(samples, rate)  # returns immediately; audio plays in the background
    return samples, rate

# start playback (does not block)
data, fs = async_playback('play.wav')
# other statements run while the audio is still playing
print('able to execute this before finishing')
print('hi, this is cool!')
# stop playback after one second
time.sleep(1)
sd.stop()
print('stopped')
录制流媒体音频
检查麦克风/设置默认麦克风
mic_check.py
import sounddevice as sd

# Enumerate all audio devices and the current default input/output pair.
mics = sd.query_devices()
default_devices = sd.default.device
default_input = default_devices[0]
default_output = default_devices[1]
# prints all available devices
for i in range(len(mics)):
    print(mics[i])
# can set default device easily with
# (fixed: the module is imported as `sd`; the bare name `sounddevice`
# raised NameError here)
sd.default.device = 0
同步录音
sync_record.py
import sounddevice as sd
import soundfile as sf
import time

def sync_record(filename, duration, fs, channels):
    """Record `duration` seconds from the default mic and save to `filename`."""
    print('recording')
    frames = sd.rec(int(duration * fs), samplerate=fs, channels=channels)
    sd.wait()  # block until the capture is complete
    sf.write(filename, frames, fs)
    print('done recording')

# record a 10-second mono clip at 16 kHz
sync_record('sync_record.wav', 10, 16000, 1)
异步记录
async_record.py
import sounddevice as sd
import soundfile as sf
import time

def printstuff(number):
    """Print the integers 0..number-1, one per line (stand-in for real work)."""
    for value in range(number):
        print(value)

def async_record(filename, duration, fs, channels):
    """Start a non-blocking recording, do other work, then wait and save it."""
    print('recording')
    frames = sd.rec(int(duration * fs), samplerate=fs, channels=channels)
    # sd.rec() returns immediately, so other statements run while recording
    print('able to execute this before finishing')
    printstuff(30)
    # now wait until done before writing to file
    sd.wait()
    sf.write(filename, frames, fs)
    print('done recording')

# record a 10-second mono clip at 16 kHz
async_record('async_record.wav', 10, 16000, 1)
转换音频格式
using ffmpy module
convert_wav.py
import ffmpy

def convert_wav(filename):
    """Convert an .mp3/.m4a/.ogg file to .wav using ffmpeg (via ffmpy).

    Files with any other extension are silently ignored.
    """
    if filename.endswith(('.mp3', '.m4a', '.ogg')):
        wav_name = filename[:-4] + '.wav'
        converter = ffmpy.FFmpeg(
            inputs={filename: None},
            outputs={wav_name: None},
        )
        converter.run()

convert_wav('test.mp3')
转录音频
using PocketSphinx
sphinx_transcribe.py
import speech_recognition as sr_audio
import sounddevice as sd
import soundfile as sf
import os, json, datetime
def sync_record(filename, duration, fs, channels):
    """Record `duration` seconds from the default mic and write to `filename`."""
    print('recording')
    frames = sd.rec(int(duration * fs), samplerate=fs, channels=channels)
    sd.wait()  # block until the capture finishes
    sf.write(filename, frames, fs)
    print('done recording')
def transcribe_audio_sphinx(filename):
    """Transcribe an audio file offline with CMU PocketSphinx; return the text."""
    recognizer = sr_audio.Recognizer()
    with sr_audio.AudioFile(filename) as source:
        audio = recognizer.record(source)
    text = recognizer.recognize_sphinx(audio)
    print('transcript: '+text)
    return text
def store_transcript(filename, transcript):
    """Save a transcript as JSON next to the audio file.

    The JSON object contains the timestamp, the source audio filename,
    and the transcript text; it is written to <filename minus ext>.json.
    """
    jsonfilename = filename[0:-4] + '.json'
    print('saving %s to current directory'%(jsonfilename))
    data = {
        'date': str(datetime.datetime.now()),
        'filename': filename,
        'transcript': transcript,
    }
    print(data)
    # context manager guarantees the file is closed even if json.dump raises
    # (the original open/close pair leaked the handle on error)
    with open(jsonfilename, 'w') as jsonfile:
        json.dump(data, jsonfile)
# record a 10-second clip and print its PocketSphinx transcript
filename='sync_record.wav'
sync_record(filename, 10, 16000, 1)
transcript=transcribe_audio_sphinx(filename)
# now write the transcript into a .json file
# e.g. sync_record.wav transcript will be stored in sync_record.json
store_transcript(filename, transcript)
using Google Speech-to-Text API
google_transcribe.py
# assumes environment variables are set properly following the Google Speech API documentation
import speech_recognition as sr_audio
import sounddevice as sd
import soundfile as sf
import os, json, datetime
def transcribe_audio_google(filename):
    """Transcribe an audio file with the Google Cloud Speech API; return the text.

    Assumes the Google credentials environment variables are already set.
    """
    recognizer = sr_audio.Recognizer()
    with sr_audio.AudioFile(filename) as source:
        audio = recognizer.record(source)
    text = recognizer.recognize_google_cloud(audio)
    return text
def sync_record(filename, duration, fs, channels):
    """Record `duration` seconds from the default mic and write to `filename`."""
    print('recording')
    frames = sd.rec(int(duration * fs), samplerate=fs, channels=channels)
    sd.wait()  # block until the capture finishes
    sf.write(filename, frames, fs)
    print('done recording')
def store_transcript(filename, transcript):
    """Save a transcript as JSON next to the audio file.

    The JSON object contains the timestamp, the source audio filename,
    and the transcript text; it is written to <filename minus ext>.json.
    """
    jsonfilename = filename[0:-4] + '.json'
    print('saving %s to current directory'%(jsonfilename))
    data = {
        'date': str(datetime.datetime.now()),
        'filename': filename,
        'transcript': transcript,
    }
    print(data)
    # context manager guarantees the file is closed even if json.dump raises
    # (the original open/close pair leaked the handle on error)
    with open(jsonfilename, 'w') as jsonfile:
        json.dump(data, jsonfile)
# record a 10-second clip and print its Google Cloud transcript
filename='google_record.wav'
sync_record(filename, 10, 16000, 1)
transcript=transcribe_audio_google(filename)
# now write the transcript into a .json file
# e.g. google_record.wav transcript will be stored in google_record.json
store_transcript(filename, transcript)
文字转语音系统
using Pyttsx3
Abridged from speak_custom.py
import pyttsx3

def speak_text(text):
    """Speak `text` aloud through the system's default TTS voice (blocking)."""
    tts = pyttsx3.init()
    tts.say(text)
    tts.runAndWait()

speak_text('this is a test')
using Google TTS API
speak_google.py
def speak_google(text, filename, model):
    """Synthesize `text` with the Google Cloud TTS API and save MP3 audio to `filename`.

    `model` is a Google voice name such as 'en-US-Wavenet-A'.
    """
    from google.cloud import texttospeech

    client = texttospeech.TextToSpeechClient()
    synthesis_input = texttospeech.types.SynthesisInput(text=text)
    # Note: the voice can also be specified by name.
    # Names of voices can be retrieved with client.list_voices().
    voice_params = texttospeech.types.VoiceSelectionParams(
        language_code='en-US',
        ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE,
        name=model)
    config = texttospeech.types.AudioConfig(
        audio_encoding=texttospeech.enums.AudioEncoding.MP3)
    reply = client.synthesize_speech(synthesis_input, voice_params, config)
    # The response's audio_content is binary.
    with open(filename, 'wb') as out:
        out.write(reply.audio_content)
    print('Audio content written to file %s'%(filename))
# experiment with various voices
base='output'
models=['en-US-Wavenet-A','en-US-Wavenet-B','en-US-Wavenet-C','en-US-Wavenet-D',
'en-US-Wavenet-E','en-US-Wavenet-F']
text='hey I am testing out google TTS'
# loop through various voices
# now all these files will be in the current directory
for i in range(len(models)):
speak_google(text, base+'_'+models[i]+'.mp3', models[i])