现在有个需求就是使用gradio的audio麦克风对接科大讯飞的流式听写api,实现的时候会遇到以下问题。
1. 科大讯飞 API 对输入音频的格式要求;2. 如何把 gradio 的 audio 麦克风数据持续不断地送入科大讯飞的流式音频输入。思路如下:
1)对采集到的 gradio audio 麦克风数据进行采样率和格式转换,转成科大讯飞可接受的音频数据块。
2)保持流式听写的 websocket 心跳:用一个队列(或数组 arr)作为 buffer 缓存采集到的音频数据块;gradio 每采集到一段音频都会主动触发一次数据传输,再把缓存的音频块按帧传给科大讯飞的流式听写 API。
那么先上代码。如果客官觉得满意,菜码农不易请点赞一下支持~~感谢,(大佬勿喷,或者多多指点改进该方法双手合十~~谢谢~~)
首先第一个实现gradio的音频采集:(主要代码用颜色标识)
from stt_comm.stt import STT # 自定义的一个类处理科大讯飞的api听写
import gradio as gr # 调用gradio
from pubsub import pub #这是一个发布订阅
# Initialise the shared STT helper used by the Gradio callback below
stt = STT()
# Passive receiver for the recognised text coming from iFlytek ------ start ------ #
def getMsg(msg=None):
    """Print a recognition result delivered through the pub/sub channel.

    Args:
        msg: The published payload; defaults to ``None`` when nothing is passed.
    """
    print(msg)
# Passive receiver for the recognised text coming from iFlytek ------ end ------ #
# Register getMsg as the listener for the "updateMsg" topic
pub.subscribe(getMsg, "updateMsg")
# stt.stop()  # close the connection manually
def changeFun(*args):
    """Print the tuple of positional arguments this function was called with."""
    print(args)
def main_note(audio):
    """Forward one microphone chunk to the STT helper and return its latest text.

    Args:
        audio: The value Gradio supplies for the audio input; it is passed to
            ``stt.sendMsg`` unchanged.

    Returns:
        Whatever ``stt.update_ws_msg()`` currently reports.
    """
    stt.sendMsg(audio)  # hand the audio data to the custom STT class
    # Polling here and the pub/sub listener are two alternative ways of
    # receiving the recognised text; either one is enough.
    return stt.update_ws_msg()
demo = gr.Interface(
    main_note,
    gr.Audio(source="microphone", type='numpy', streaming=True),  # microphone input
    gr.Textbox(),
    interpretation="default",
    live=True,  # real-time streaming voice input from the microphone
)
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=9775, share=False)
然后在 STT 类的 sendMsg 里把音频数据转换成科大讯飞可接受的音频格式,主要代码如下:
def sendMsg(self, audio_data):
    """Convert one Gradio microphone chunk and hand it to the iFlytek client.

    Args:
        audio_data: Indexable pair; element 0 is passed to ``save_audio`` as
            the original sample rate and element 1 as the samples.
    """
    # Convert the microphone audio delivered by Gradio to 16 kHz via save_audio.
    data = self.save_audio(audio_data[1], audio_data[0], 16000)
    # Pass the converted audio on to the iFlytek dictation client.
    self.stt_ws.sendMsg(data)
下面是重点——音频数据转换函数 save_audio,不理解的可以直接复制:
def save_audio(self, audio_data, original_sample_rate, target_sample_rate):
    """Resample a chunk, peak-normalise it to 16-bit samples and return WAV bytes.

    Args:
        audio_data: Sequence of samples recorded at ``original_sample_rate``.
        original_sample_rate: Sample rate of ``audio_data``.
        target_sample_rate: Sample rate of the returned audio.

    Returns:
        bytes: The mono WAV data produced by ``AudioSegment.export``.
    """
    # Number of samples the chunk has at the target rate.
    num_samples = int(len(audio_data) * target_sample_rate / original_sample_rate)
    # Resample the audio data.
    resampled_audio_data = resample(audio_data, num_samples)
    # Scale so the loudest sample hits full scale. An all-zero (silent) chunk
    # is left untouched, otherwise the division by a zero peak yields NaNs.
    peak = np.max(np.abs(resampled_audio_data))
    if peak > 0:
        resampled_audio_data = resampled_audio_data / peak * 32767
    audio = np.int16(resampled_audio_data)
    # Wrap the raw samples with pydub's AudioSegment.
    sound = AudioSegment(
        audio.tobytes(),
        frame_rate=target_sample_rate,
        sample_width=audio.dtype.itemsize,
        channels=1
    )
    # Export to an in-memory WAV buffer for iFlytek.
    buffer = BytesIO()
    sound.export(buffer, format="wav")
    return buffer.getvalue()  # the converted audio chunk
完整代码如下:
from stt_comm.sttwebclient import stt_ws
from stt_comm.ws_state import StateManager
from pydub import AudioSegment
from io import BytesIO
from scipy.signal import resample
import numpy as np
class STT:
    """Glue between Gradio microphone chunks and the iFlytek streaming client."""

    def __init__(self):
        self.is_init = None
        # Client wrapper for the iFlytek wss endpoint; the connection itself is
        # opened later through ``on_init``.
        self.stt_ws = stt_ws()

    @staticmethod
    def _to_pcm16(audio_data, original_sample_rate, target_sample_rate):
        """Resample a chunk and peak-normalise it to an int16 mono array."""
        samples = np.asarray(audio_data)
        if samples.ndim > 1:
            # assumes multi-channel input is shaped (samples, channels), as
            # Gradio documents for numpy audio -- TODO confirm for other callers
            samples = samples.mean(axis=1)
        num_samples = int(len(samples) * target_sample_rate / original_sample_rate)
        if num_samples == 0:
            return np.zeros(0, dtype=np.int16)
        resampled = resample(samples, num_samples)
        # A silent chunk has peak 0; dividing by it would fill the chunk with NaNs.
        peak = np.max(np.abs(resampled))
        if peak > 0:
            resampled = resampled / peak * 32767
        return np.int16(resampled)

    def save_audio(self, audio_data, original_sample_rate, target_sample_rate):
        """Convert a chunk to mono 16-bit WAV bytes at ``target_sample_rate``.

        Args:
            audio_data: Samples recorded at ``original_sample_rate``.
            original_sample_rate: Sample rate of ``audio_data``.
            target_sample_rate: Sample rate of the returned audio.

        Returns:
            bytes: The WAV data produced by ``AudioSegment.export``.
        """
        audio = self._to_pcm16(audio_data, original_sample_rate, target_sample_rate)
        # Wrap the raw samples with pydub's AudioSegment.
        sound = AudioSegment(
            audio.tobytes(),
            frame_rate=target_sample_rate,
            sample_width=audio.dtype.itemsize,
            channels=1
        )
        # Export to an in-memory WAV buffer for iFlytek.
        buffer = BytesIO()
        sound.export(buffer, format="wav")
        return buffer.getvalue()

    def sendMsg(self, audio_data):
        """Convert one Gradio chunk and queue it exactly once for sending.

        Args:
            audio_data: ``(sample_rate, samples)`` pair, or ``None`` when there
                is nothing to send (ignored).
        """
        if audio_data is None:
            return
        data = self.save_audio(audio_data[1], audio_data[0], 16000)
        # (Re)open the websocket first when it is not ready, so the chunk is
        # queued a single time instead of once unconditionally and once more
        # when connected.
        if not (StateManager.get_state() == 1 and StateManager.get_ws_is_init()):
            self.stt_ws.on_init()
        self.stt_ws.sendMsg(data)

    def update_ws_msg(self):
        """Return the most recent text stored on the websocket client."""
        return self.stt_ws.msg

    def stop(self):
        """Close the websocket client."""
        self.stt_ws.close()
接下来处理科大讯飞那边语音听写(STT)的音频发送以及 wss 连接等问题,主要代码如下:
# Main method that transmits the audio data to iFlytek
def sendMessage(self, stop_event):
    """Drain the chunk queue until ``stop_event`` is set.

    Meant to run in its own background thread.

    Args:
        stop_event: ``threading.Event`` that ends the loop once it is set.
    """
    while not stop_event.is_set():
        if not self.q.empty():  # q buffers the converted audio chunks
            # Stream the chunk to iFlytek frame by frame.
            self.sendAudio(stop_event, self.q.get())
        # 0.04 s pause between transmissions (the interval iFlytek asks for,
        # per the author); wait() returns early when a stop is requested.
        stop_event.wait(0.04)
重点是 sendAudio:它拿到队列里的音频数据后,用一个循环不停地分帧传给科大讯飞 wss 的音频传输接口,
每帧的大小按照科大讯飞的要求设置,具体代码如下:
def sendAudio(self, stop_event, audio_data):
    """Stream one converted audio chunk to iFlytek in fixed-size frames.

    The stream status is reset to ``STATUS_FIRST_FRAME`` on entry. Each
    iteration sends the next slice of ``audio_data`` with the frame layout
    that matches the current status, then pauses before the next frame.

    Args:
        stop_event: Event checked before every frame; when set, the
            connection is closed and the method returns.
        audio_data: Bytes of the chunk to send.
    """
    frame_size = 8000  # audio bytes per frame
    interval = 0.04  # pause between frames, in seconds
    # Wire-level status code and failure-log prefix for each stream status.
    frame_kinds = {
        STATUS_FIRST_FRAME: (0, "receive msg,but parse exception--sendMsg:"),
        STATUS_CONTINUE_FRAME: (1, "receive msg,but parse exception--sendmsg----:"),
        STATUS_LAST_FRAME: (2, "receive msg,but parse exception-----sendmsg----:"),
    }
    index = 0
    StateManager.set_ws_status(STATUS_FIRST_FRAME)
    while True:
        if stop_event.is_set():
            self.close()
            return
        buf = audio_data[index:index + frame_size]
        # Chunk exhausted and a newer chunk is queued: return so the caller can
        # start on it. With an empty queue the loop keeps sending empty frames,
        # presumably as the keep-alive heartbeat.
        if not buf and not self.q.empty():
            return
        # Read the shared status once so a concurrent update cannot change it
        # between comparisons.
        status = StateManager.get_ws_status()
        if status in frame_kinds:
            wire_status, error_prefix = frame_kinds[status]
            if status == STATUS_LAST_FRAME:
                print('最后一个')
            frame = {"data": {"status": wire_status, "format": "audio/L16;rate=16000",
                              "audio": str(base64.b64encode(buf), 'utf-8'),
                              "encoding": "raw"}}
            if status == STATUS_FIRST_FRAME:
                # Only the first frame carries the common/business parameters.
                frame = {"common": self.req_params.CommonArgs,
                         "business": self.req_params.BusinessArgs,
                         **frame}
            try:
                self.ws.send(json.dumps(frame))
            except Exception as e:
                # Mark the connection as down so it gets re-initialised.
                StateManager.set_state(0)
                StateManager.set_ws_is_init(False)
                print(error_prefix, e)
            if status == STATUS_FIRST_FRAME:
                StateManager.set_ws_status(STATUS_CONTINUE_FRAME)
            elif status == STATUS_LAST_FRAME:
                print('结束')
                return
        index += frame_size
        time.sleep(interval)
完整代码如下:
import websocket
from queue import Queue
from stt_comm.Ws_Param import ws_setup
import ssl
from stt_comm.ws_state import StateManager
import base64
import json
import time
from threading import Thread
from threading import Event
from datetime import datetime
from pubsub import pub
STATUS_FIRST_FRAME = 0 # marker for the first frame
STATUS_CONTINUE_FRAME = 1 # marker for an intermediate frame
STATUS_LAST_FRAME = 2 # marker for the last frame
class stt_ws:
    """Websocket client that streams queued audio chunks to iFlytek.

    Audio chunks are buffered in ``self.q``. A sender thread
    (``sendMessage``/``sendAudio``) pushes them out frame by frame, and a
    reader thread (``on_message``) stores the recognised text in ``self.msg``
    and publishes it on the "updateMsg" topic.
    """

    def __init__(self):
        self.ws_thread = None  # reader thread running on_message
        self.thread = None  # sender thread running sendMessage
        self.ws = None
        self.msg = ''
        self.wsUrl = None
        self.req_params = ws_setup()
        self.config = self.req_params.config
        self.event = Event()  # stop switch for the sender thread
        self.q = Queue()  # buffered audio chunks waiting to be sent
        self.audio_data = None

    def on_init(self):
        """Open the websocket connection and start the reader thread.

        The state flags are set to "connected" before the attempt and reset
        when the connection fails.
        """
        print('启动----------')
        StateManager.set_state(1)
        StateManager.set_ws_is_init(True)
        websocket.enableTrace(False)  # tracing is off; pass True to debug
        wsUrl = self.req_params.create_url()
        try:
            self.ws = websocket.create_connection(wsUrl)
            StateManager.set_state(1)
            # This thread ends on its own once the connection goes away.
            self.ws_thread = Thread(target=self.on_message)
            self.ws_thread.start()
        except Exception as e:
            StateManager.set_state(0)
            StateManager.set_ws_is_init(False)
            print(f"Error connecting to {StateManager.get_state()}: {e}")

    def sendMsg(self, audio_data):
        """Queue a chunk and make sure a sender thread is running.

        Args:
            audio_data: Converted audio bytes to send.
        """
        self.q.put(audio_data)
        state = StateManager.get_state()
        sender_running = self.thread is not None and self.thread.is_alive()
        if state == 1 and not sender_running:
            self.event.clear()  # reset the stop switch before (re)starting
            self.thread = Thread(target=self.sendMessage, args=(self.event,))
            self.thread.start()

    def on_open(self, ws):
        """Mark the connection as up."""
        StateManager.set_state(1)

    def _drop_pending_audio(self):
        """Discard every chunk still waiting in the queue."""
        from queue import Empty

        # queue.Queue has no clear(); drain it item by item instead.
        while True:
            try:
                self.q.get_nowait()
            except Empty:
                return

    def on_message(self):
        """Read results until the socket closes, publishing recognised text."""
        try:
            # Leave the loop once the socket is disconnected instead of
            # spinning on a dead connection.
            while self.ws.connected:
                result = str(self.ws.recv())
                if len(result) == 0:
                    print("receive result end-onmessage")
                    break
                result_dict = json.loads(result)
                code = result_dict["code"]
                sid = result_dict["sid"]
                if code != 0:
                    errMsg = result_dict["message"]
                    if code == 10165:
                        # Author's hint for this code: check that the first
                        # audio frame was sent with status=0. Restart the
                        # stream from a first frame.
                        StateManager.set_ws_status(STATUS_FIRST_FRAME)
                    print("sid:%s call error:%s code is:%s" % (sid, errMsg, code))
                else:
                    data = result_dict["data"]["result"]["ws"]
                    self.msg = "".join(w["w"] for i in data for w in i["cw"])
                    pub.sendMessage("updateMsg", msg=self.msg)
        except websocket.WebSocketConnectionClosedException:
            self._drop_pending_audio()
            StateManager.set_state(0)
            StateManager.set_ws_is_init(False)

    def on_error(self):
        """Mark the connection as down and stop the sender thread."""
        StateManager.set_state(0)
        self.event.set()

    def on_close(self, ws, a, b):
        """Mark the connection as down and stop the sender thread."""
        StateManager.set_state(0)
        self.event.set()

    def sendMessage(self, stop_event):
        """Drain the chunk queue until ``stop_event`` is set.

        Args:
            stop_event: ``threading.Event`` that ends the loop once it is set.
        """
        while not stop_event.is_set():
            if not self.q.empty():
                self.sendAudio(stop_event, self.q.get())
            # 0.04 s pause between chunks; returns early on a stop request.
            stop_event.wait(0.04)

    def sendAudio(self, stop_event, audio_data):
        """Stream one converted audio chunk to iFlytek in fixed-size frames.

        The stream status is reset to ``STATUS_FIRST_FRAME`` on entry. Each
        iteration sends the next slice of ``audio_data`` with the frame layout
        that matches the current status, then pauses before the next frame.

        Args:
            stop_event: Event checked before every frame; when set, the
                connection is closed and the method returns.
            audio_data: Bytes of the chunk to send.
        """
        frame_size = 8000  # audio bytes per frame
        interval = 0.04  # pause between frames, in seconds
        # Wire-level status code and failure-log prefix for each stream status.
        frame_kinds = {
            STATUS_FIRST_FRAME: (0, "receive msg,but parse exception--sendMsg:"),
            STATUS_CONTINUE_FRAME: (1, "receive msg,but parse exception--sendmsg----:"),
            STATUS_LAST_FRAME: (2, "receive msg,but parse exception-----sendmsg----:"),
        }
        index = 0
        StateManager.set_ws_status(STATUS_FIRST_FRAME)
        while True:
            if stop_event.is_set():
                self.close()
                return
            buf = audio_data[index:index + frame_size]
            # Chunk exhausted and a newer chunk is queued: return so the caller
            # can start on it. With an empty queue the loop keeps sending empty
            # frames, presumably as the keep-alive heartbeat.
            if not buf and not self.q.empty():
                return
            # Read the shared status once so a concurrent update cannot change
            # it between comparisons.
            status = StateManager.get_ws_status()
            if status in frame_kinds:
                wire_status, error_prefix = frame_kinds[status]
                if status == STATUS_LAST_FRAME:
                    print('最后一个')
                frame = {"data": {"status": wire_status, "format": "audio/L16;rate=16000",
                                  "audio": str(base64.b64encode(buf), 'utf-8'),
                                  "encoding": "raw"}}
                if status == STATUS_FIRST_FRAME:
                    # Only the first frame carries the common/business parameters.
                    frame = {"common": self.req_params.CommonArgs,
                             "business": self.req_params.BusinessArgs,
                             **frame}
                try:
                    self.ws.send(json.dumps(frame))
                except Exception as e:
                    # Mark the connection as down so it gets re-initialised.
                    StateManager.set_state(0)
                    StateManager.set_ws_is_init(False)
                    print(error_prefix, e)
                if status == STATUS_FIRST_FRAME:
                    StateManager.set_ws_status(STATUS_CONTINUE_FRAME)
                elif status == STATUS_LAST_FRAME:
                    print('结束')
                    return
            index += frame_size
            time.sleep(interval)

    def close(self):
        """Flag the stream as finished and close the websocket, if one exists."""
        if self.ws:
            StateManager.set_ws_status(STATUS_LAST_FRAME)
            StateManager.set_state(0)
            self.ws.close()
好,到这里完整的流程就写出来了,但具体还有些细节需要看官自己去实现。感谢大家的观看,有不懂和有疑问的地方可以到评论区留言,离职的我有的是时间解答;也感谢大佬们的指点和指教,我也想更进步一点,谢谢!