1.基本思路
(1)Pyaudio库通过以下结构持续读取电脑麦克风的音频
import pyaudio
p = pyaudio.PyAudio()
stream=p.open(format=pyaudio.paInt16,channels=channels,rate=rate,input=True)
while True:
data = stream.read(framenum,exception_on_overflow=False)
(2)音频段计算分贝大小
# 转换为 16 位整数数组
audio_data = np.frombuffer(data, dtype=np.int16)
# 计算振幅绝对值
amplitude = np.abs(audio_data)
# 计算分贝值
dB = 20 * np.log10(np.max(amplitude))
分贝值正常来说不是这么算的,但实测发现这样最好用:麦克风可以自动过滤非人声,所以有说话声50-70dB,没有说话声0dB,这样就够了
(3)Gummy的单次请求
class Callback(TranslationRecognizerCallback): #核心类
#开始时会自动调用这个函数
def on_open(self) -> None:
print("TranslationRecognizerCallback open.")
#过程中会一直调用这个函数
def on_event(self,
request_id,
transcription_result: TranscriptionResult,
translation_result: TranslationResult,
usage,
) -> None:
if transcription_result is not None:
print("transcription: ", transcription_result.text)
if transcription_result.is_sentence_end is True: #一句话结束了
print(f"sentence end,TAKE TIME {usage['duration']} SECOND")#用了多少秒
#结束时会自动调用这个函数
def on_close(self) -> None:
print("TranslationRecognizerCallback close.")
while True:
#持续音频读取+分贝分析,获得dB值
if dB > 50:
callback = Callback() #一个callback只能识别一句话
translator = TranslationRecognizerChat( # 这里是请求参数
model="gummy-chat-v1",
format="pcm",
source_language="zh",
sample_rate=16000,
transcription_enabled=True,
translation_enabled=False,
translation_target_languages=["en"],
callback=callback,
max_end_silence=2000,)
translator.start()
while True:
data = stream.read(framenum,exception_on_overflow=False)
if stream:
if not translator.send_audio_frame(data):
print("这个句子识别完成了")
break
else:
break
关于Gummy调用的更多细节请见阿里云官方文档:Gummy一句话识别、翻译Python API_大模型服务平台百炼(Model Studio)-阿里云帮助中心
(4)优化dB检测吞音问题
while True:
#获取data,dB
#维护音频栈
stack.append(data)
if len(stack)*secondper >= 1: #这个数字是希望缓存的音频时间,此处为1s
del stack[0]
if dB>50:
#启动Gummy识别
for datapast in stack:
#将过去的音频有次序地输送给Gummy
translator.send_audio_frame(datapast)
while True:
#loop程序
这个地方维护音频栈可以使用多线程或者并行来进一步优化,不过我没来得及弄
2.源代码
import pyaudio
import numpy as np
from dashscope.audio.asr import *
class Callback(TranslationRecognizerCallback):
    """Receives streaming events from a Gummy recognition session.

    One callback instance serves exactly one single-sentence request:
    `on_open` fires when the session starts, `on_event` fires repeatedly
    with partial transcription results, and `on_close` fires at the end.
    """

    def on_open(self) -> None:
        # Called automatically once the recognizer session is established.
        print("TranslationRecognizerCallback open.")

    def on_event(
        self,
        request_id,
        transcription_result: TranscriptionResult,
        translation_result: TranslationResult,
        usage,
    ) -> None:
        # Called repeatedly while audio frames are being processed.
        if transcription_result is None:
            return
        print("transcription: ", transcription_result.text)
        if transcription_result.is_sentence_end is True:
            # The sentence is complete; report how many seconds it took.
            print(f"sentence end,TAKE TIME {usage['duration']} SECOND")

    def on_close(self) -> None:
        # Called automatically when the session is torn down.
        print("TranslationRecognizerCallback close.")
"""
second : 你希望每隔多少秒读取一次音频数据(检测或者上传)
channels : 单声道1,双声道2,我的电脑双声道语音识别会出问题
rate = 采样率
"""
def monitor_audio(secondper, rate, channels=1):
    """Continuously monitor the microphone and stream detected speech to Gummy.

    Reads audio in chunks of ``secondper`` seconds, computes a rough loudness
    value, and once it exceeds 50 "dB" starts a one-sentence Gummy recognition
    request. About one second of recently buffered audio is replayed into the
    recognizer first so the onset of the utterance is not clipped.

    Args:
        secondper: seconds of audio per read (detection/upload interval).
        rate: sample rate in Hz (also forwarded to the recognizer).
        channels: 1 = mono, 2 = stereo (stereo recognition is unreliable here).

    Runs until interrupted with Ctrl-C.
    """
    stack = []  # rolling ~1 s buffer of recent chunks, to avoid clipping speech onset
    framenum = int(rate / channels * secondper)
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=channels, rate=rate, input=True)
    print("开始监测音频分贝")
    try:
        i = 0
        while True:
            data = stream.read(framenum, exception_on_overflow=False)
            stack.append(data)
            if len(stack) * secondper >= 1:  # keep roughly 1 second of history
                del stack[0]
            # Interpret the raw bytes as signed 16-bit PCM samples.
            audio_data = np.frombuffer(data, dtype=np.int16)
            amplitude = np.abs(audio_data)
            # Not a physically correct dB value, but effective in practice:
            # the mic filters non-speech, so speech reads ~50-70 and silence ~0.
            # Guard against log10(0) -> -inf (plus the empty-chunk edge case),
            # which previously emitted a NumPy RuntimeWarning on silence.
            peak = int(np.max(amplitude)) if amplitude.size else 0
            dB = 20 * np.log10(peak) if peak > 0 else 0.0
            print(f'当前分贝值: {dB}')
            # Above 50 "dB" we assume somebody is speaking.
            if dB > 50:
                i += 1
                print(f"----检测到大于50分贝的人声,开始进行第{i}段人声的录制。")
                callback = Callback()  # one callback recognizes exactly one sentence
                translator = TranslationRecognizerChat(  # request parameters
                    model="gummy-chat-v1",
                    format="pcm",
                    source_language="zh",
                    sample_rate=rate,  # FIX: follow the capture rate (was hard-coded 16000)
                    transcription_enabled=True,
                    translation_enabled=False,
                    translation_target_languages=["en"],
                    callback=callback,
                    max_end_silence=2000,
                )
                translator.start()
                # Replay the buffered audio in order so the sentence start is kept.
                for datapast in stack:
                    translator.send_audio_frame(datapast)
                while True:
                    data = stream.read(framenum, exception_on_overflow=False)
                    # send_audio_frame returns falsy once the recognizer has
                    # decided the sentence is finished.
                    if not translator.send_audio_frame(data):
                        print("sentence end, stop sending")
                        break
    except KeyboardInterrupt:  # user pressed Ctrl-C
        print("监测结束")
    finally:
        stream.stop_stream()
        stream.close()
        # FIX: terminate the PyAudio instance we actually opened; the original
        # created and terminated a brand-new instance, leaking `p`.
        p.terminate()
if __name__ == "__main__":
    # 0.125 s chunks at 16 kHz mono; guard so importing this module
    # does not immediately start capturing audio.
    monitor_audio(0.125, 16000)