This post uses Python + React to integrate with Alibaba's Tongyi Tingwu (通义听悟) real-time transcription service. Let's go straight to the code.
Backend API
The backend here is implemented in Python and is only a minimal example; for other languages or full details, refer to the official documentation.
#!/usr/bin/env python
# coding=utf-8
import json
import datetime
from aliyunsdkcore.client import AcsClient
from aliyunsdkcore.request import CommonRequest
from aliyunsdkcore.auth.credentials import AccessKeyCredential
from flask import Flask, abort
from flask_cors import CORS
app = Flask(__name__)
CORS(app)
APP_KEY = 'APP_KEY'
ACCESS_ID = 'ACCESS_ID'
ACCESS_SECRET = 'ACCESS_SECRET'
def create_common_request(domain, version, protocolType, method, uri):
    curr_request = CommonRequest()
    curr_request.set_accept_format('json')
    curr_request.set_domain(domain)
    curr_request.set_version(version)
    curr_request.set_protocol_type(protocolType)
    curr_request.set_method(method)
    curr_request.set_uri_pattern(uri)
    curr_request.add_header('Content-Type', 'application/json')
    return curr_request
def init_parameters():
    body = dict()
    body['AppKey'] = APP_KEY
    # Basic request parameters
    input = dict()
    # The audio stream you send must match the format and sample rate set here
    input['Format'] = 'pcm'
    input['SampleRate'] = 16000
    input['SourceLanguage'] = 'cn'
    input['TaskKey'] = 'task' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    input['ProgressiveCallbacksEnabled'] = False
    body['Input'] = input
    # AI feature parameters; enable as needed
    parameters = dict()
    # Speech recognition controls
    transcription = dict()
    # Speaker diarization: optional
    transcription['DiarizationEnabled'] = True
    diarization = dict()
    diarization['SpeakerCount'] = 2
    transcription['Diarization'] = diarization
    parameters['Transcription'] = transcription
    # Text translation controls: optional
    parameters['TranslationEnabled'] = True
    translation = dict()
    translation['TargetLanguages'] = ['en']  # translate into English, for example
    parameters['Translation'] = translation
    # Auto chapters: optional; produces chapter titles and agenda summaries
    parameters['AutoChaptersEnabled'] = True
    # Meeting assistance: optional; includes to-dos and key information
    # (keywords, highlights, scene detection)
    parameters['MeetingAssistanceEnabled'] = True
    meetingAssistance = dict()
    meetingAssistance['Types'] = ['Actions', 'KeyInformation']
    parameters['MeetingAssistance'] = meetingAssistance
    # Summarization controls: optional; includes full-text summary,
    # per-speaker summary, and Q&A summary (Q&A review)
    parameters['SummarizationEnabled'] = True
    summarization = dict()
    summarization['Types'] = ['Paragraph', 'Conversational', 'QuestionsAnswering', 'MindMap']
    parameters['Summarization'] = summarization
    # PPT extraction and PPT summarization: optional
    parameters['PptExtractionEnabled'] = True
    # Spoken-to-written text polishing: optional
    parameters['TextPolishEnabled'] = True
    body['Parameters'] = parameters
    return body
@app.route('/createVoiceWsInfo', methods=['POST'])
def create_voice_ws_info():
    body = init_parameters()
    print(body)
    credentials = AccessKeyCredential(ACCESS_ID, ACCESS_SECRET)
    client = AcsClient(region_id='cn-beijing', credential=credentials)
    curr_request = create_common_request('tingwu.cn-beijing.aliyuncs.com', '2023-09-30', 'https', 'PUT',
                                         '/openapi/tingwu/v2/tasks')
    curr_request.add_query_param('type', 'realtime')
    curr_request.set_content(json.dumps(body).encode('utf-8'))
    response = client.do_action_with_exception(curr_request)
    res_json = json.dumps(json.loads(response), indent=4, ensure_ascii=False)
    return res_json
@app.route('/getVoiceTask/<string:task_id>', methods=['GET'])
def get_voice_task_info(task_id):
    # Query the status/result of a Tingwu task by its TaskId
    if not task_id:
        abort(400)  # return 400 if task_id is empty
    credentials = AccessKeyCredential(ACCESS_ID, ACCESS_SECRET)
    client = AcsClient(region_id='cn-beijing', credential=credentials)
    uri = '/openapi/tingwu/v2/tasks' + '/' + task_id
    request = create_common_request('tingwu.cn-beijing.aliyuncs.com', '2023-09-30', 'https', 'GET', uri)
    response = client.do_action_with_exception(request)
    res_json = json.dumps(json.loads(response), indent=4, ensure_ascii=False)
    return res_json
@app.route('/closeVoiceTask/<string:task_id>', methods=['PUT'])
def close_voice_task_info(task_id):
    # Stop a running realtime task
    if not task_id:
        abort(400)  # return 400 if task_id is empty
    credentials = AccessKeyCredential(ACCESS_ID, ACCESS_SECRET)
    client = AcsClient(region_id='cn-beijing', credential=credentials)
    request = create_common_request('tingwu.cn-beijing.aliyuncs.com', '2023-09-30', 'https', 'PUT',
                                    '/openapi/tingwu/v2/tasks')
    request.add_query_param('type', 'realtime')
    request.add_query_param('operation', 'stop')
    body = dict()
    body['AppKey'] = APP_KEY
    # Identify the realtime task to stop by its TaskId
    input = dict()
    input['TaskId'] = task_id
    body['Input'] = input
    request.set_content(json.dumps(body).encode('utf-8'))
    response = client.do_action_with_exception(request)
    res_json = json.dumps(json.loads(response), indent=4, ensure_ascii=False)
    return res_json
if __name__ == "__main__":
    app.run(debug=True, host='0.0.0.0')
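Running this file starts Flask on its default port 5000, listening on all interfaces. Note that createVoiceWsInfo simply relays Tingwu's JSON response, which on success carries Code "0" along with Data.TaskId and Data.MeetingJoinUrl; the frontend below opens its WebSocket against MeetingJoinUrl and keeps TaskId for the stop call.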
Frontend code
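The VoiceWebSocket class below imports createVoiceWsInfo and closeTask from @/app/client/voiceApi, a module the original post does not show. Here is a minimal sketch of it, assuming the Flask backend above is reachable at http://localhost:5000 (the base URL is an assumption):
voiceApi.ts
const BASE_URL = "http://localhost:5000"; // assumed address of the Flask dev server

// Create a realtime Tingwu task; VoiceWebSocket.init() reads
// Data.MeetingJoinUrl and Data.TaskId from the JSON response.
export function createVoiceWsInfo(): Promise<Response> {
  return fetch(`${BASE_URL}/createVoiceWsInfo`, { method: "POST" });
}

// Stop the realtime task via the backend's PUT /closeVoiceTask/<task_id>.
export function closeTask(taskId: string | undefined): Promise<Response> {
  return fetch(`${BASE_URL}/closeVoiceTask/${taskId}`, { method: "PUT" });
}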
VoiceWebSocket.ts
import { createVoiceWsInfo, closeTask } from "@/app/client/voiceApi";
import { v4 as uuidv4 } from "uuid";
import VoiceRecorder from "@/app/utils/VoiceRecorder";
class VoiceWebSocket {
  public wsUrl: string | undefined;
  public taskId: string | undefined;
  public socket: WebSocket | undefined;
  public socketStatus: string;
  private voice: VoiceRecorder | undefined;
  private setFlag: ((flag: boolean) => void) | undefined;
  private callBack: ((result: string) => void) | undefined;
  private currentResult: string[] = [];

  constructor() {
    this.socketStatus = "init";
  }
  async init(): Promise<void> {
    return new Promise((resolve, reject) => {
      createVoiceWsInfo()
        .then((response) => {
          if (response.ok) {
            return response.json();
          }
          throw new Error("Failed to fetch WebSocket info");
        })
        .then((data) => {
          if ("0" === data.Code) {
            const { Data = {} } = data;
            const { MeetingJoinUrl = "", TaskId = "" } = Data;
            this.wsUrl = MeetingJoinUrl;
            this.taskId = TaskId;
            this.socketStatus = "ing";
            resolve();
            return;
          }
          reject(data.Message);
        })
        .catch((err) => reject(err));
    });
  }
  async initSocket(): Promise<void> {
    return new Promise((resolve, reject) => {
      if (this.wsUrl && this.taskId) {
        this.socket = new WebSocket(this.wsUrl);
        this.socket.binaryType = "blob";
        this.socket.onopen = (event) => {
          console.log("WebSocket connection open ", event);
          const message = {
            header: {
              message_id: uuidv4(),
              name: "StartTranscription",
              namespace: "SpeechTranscriber",
              task_id: this.taskId,
            },
            payload: {},
            context: {},
          };
          // Serialize the handshake message to JSON and send it
          this.socket?.send(JSON.stringify(message));
        };
        this.socket.onmessage = (event) => {
          this.onReceiveSocketData(event);
        };
        this.socket.onclose = (event) => {
          this.voice && this.voice.stopRecording();
          if (this.socketStatus === "ing") {
            closeTask(this.taskId);
          }
          this.closeSocket(event);
          this.setFlag && this.setFlag(false);
        };
        resolve();
        return;
      }
      reject("WebSocket initialization failed: wsUrl or taskId missing");
    });
  }
  public setVoice(voice: VoiceRecorder) {
    this.voice = voice;
  }

  public setStopFlagFun(setFlag: (flag: boolean) => void) {
    this.setFlag = setFlag;
  }

  sendSocketData(data: string | ArrayBuffer | Blob | ArrayBufferView) {
    this.socket?.send(data);
  }
  onReceiveSocketData(event: MessageEvent) {
    console.log("receive socket data: ", event);
    if ("string" === typeof event.data) {
      let parsed = null;
      try {
        parsed = JSON.parse(event.data);
      } catch (err) {
        parsed = {};
      }
      this.handleMessage && this.handleMessage(parsed);
    }
  }
  handleMessage(data: { header?: any; payload?: any } = {}) {
    const { header = {}, payload = {} } = data;
    if ("SentenceEnd" === header.name) {
      // A sentence was finalized: append it to the accumulated transcript
      this.currentResult.push(payload.result);
      this.callBack && this.callBack(this.currentResult.join(""));
    } else if ("TranscriptionResultChanged" === header.name) {
      // Intermediate hypothesis for the current sentence: replace the buffer
      this.currentResult = [payload.result];
      this.callBack && this.callBack(this.currentResult.join(""));
    }
  }
  closeSocket(event: CloseEvent) {
    console.log("WebSocket connection closed: ", event);
  }

  stopSocket() {
    this.socket?.close();
  }

  setCallback(callback: ((result: string) => void) | undefined): void {
    this.callBack = callback;
  }
}
export default VoiceWebSocket;
VoiceRecorder.ts (both imports above reference @/app/utils/VoiceRecorder, so the file must carry that name; the class itself is AudioRecorder)
class AudioRecorder {
  socket: WebSocket;
  sampleRate: number;
  stream!: MediaStream;
  source: MediaStreamAudioSourceNode | null = null;
  processor: ScriptProcessorNode | null | undefined;
  isRecording: boolean = false;
  audioChunks: Blob[] = [];

  constructor(socket: WebSocket, sampleRate: number = 16000) {
    this.socket = socket;
    this.sampleRate = sampleRate; // sampling rate; must match the backend's Input.SampleRate
  }
  async startMicrophone(): Promise<void> {
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      this.stream = stream;
      const audioContext = new window.AudioContext({
        sampleRate: this.sampleRate,
      });
      // Create a source node from the microphone stream
      const source = audioContext.createMediaStreamSource(stream);
      this.source = source;
      // Create a ScriptProcessorNode to process the audio data (mono).
      // (ScriptProcessorNode is deprecated in favor of AudioWorklet but still widely supported.)
      const processor = audioContext.createScriptProcessor(4096, 1, 1);
      this.processor = processor;
      source.connect(processor);
      processor.connect(audioContext.destination); // connect to the destination (optional)
      processor.onaudioprocess = (event) => {
        // event.inputBuffer.getChannelData(0) holds the raw Float32 samples;
        // convert them to 16-bit PCM and stream them over the WebSocket
        const arrayBuffer = this.floatTo16BitPCM(
          event.inputBuffer.getChannelData(0),
        );
        this.socket.send(arrayBuffer);
      };
    } catch (err) {
      console.error("Error accessing media devices:", err);
    }
  }
  // Convert Float32 samples in [-1, 1] to 16-bit little-endian PCM
  floatTo16BitPCM = function (e: Float32Array) {
    const view = new DataView(new ArrayBuffer(2 * e.length));
    for (let n = 0; n < e.length; n++) {
      // Scale by 32768 for negative values and 32767 for positive ones,
      // then truncate to an integer
      const scale = e[n] < 0 ? 32768 : 32767;
      view.setInt16(2 * n, (e[n] * scale) | 0, true);
    }
    return view.buffer;
  };
  startRecording(): void {
    this.isRecording = true;
    this.audioChunks = []; // reset chunks for a new recording
  }

  stopRecording(): void {
    if (this.isRecording) {
      this.isRecording = false;
    }
    if (this.source) {
      this.source.disconnect();
    }
    if (this.processor) {
      this.processor.disconnect();
    }
    if (this.stream) {
      this.stream.getTracks().forEach((item) => {
        item.stop();
      });
    }
  }
}
export default AudioRecorder;
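As a quick sanity check of the PCM conversion, here is a standalone restatement of the same logic with concrete values (not part of the original code):

// Maps Float32 samples in [-1, 1] to 16-bit little-endian PCM
const f32 = new Float32Array([0, 0.5, -0.5, 1, -1]);
const view = new DataView(new ArrayBuffer(2 * f32.length));
f32.forEach((v, i) => view.setInt16(2 * i, (v * (v < 0 ? 32768 : 32767)) | 0, true));
// On a little-endian platform Int16Array reads the same byte order back:
console.log(Array.from(new Int16Array(view.buffer)));
// => [0, 16383, -16384, 32767, -32768]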
VoiceComponent.tsx (the component contains JSX, so the file needs a .tsx extension)
"use client";
import VoiceWebSocket from "../utils/VoiceWebSocket";
import AudioRecorder from "../utils/VoiceRecorder";
import React, {useState} from "react";
const VoiceComponent = () => {
const [isRecording, setIsRecording] = useState(false);
const [voiceHandle, setVoiceHandle] = useState<AudioRecorder | null>(null);
const [socket, setSocket] = useState<VoiceWebSocket | null>(null);
const [inputVal, setInputVal] = useState('')
  const startListening = async () => {
    if (!isRecording) {
      const ws = new VoiceWebSocket();
      ws.init()
        .then(() => {
          ws.initSocket()
            .then(() => {
              setSocket(ws);
              setIsRecording(true);
              if (ws.socket) {
                // Wire the recorder to the open WebSocket and start streaming
                const voice = new AudioRecorder(ws.socket);
                setVoiceHandle(voice);
                voice.startMicrophone().then(() => {
                  voice.startRecording();
                });
                ws.setVoice(voice);
                ws.setCallback(setInputVal);
                ws.setStopFlagFun(setIsRecording);
              }
            })
            .catch((err) => {
              alert(err);
            });
        })
        .catch((err) => {
          alert(err);
        });
    } else {
      voiceHandle?.stopRecording();
      setIsRecording(false);
    }
  };
  return (
    <div>
      <button onClick={startListening} title={"Listen"}>Listen</button>
      <div style={{ marginTop: 10 }}>
        <textarea cols={30} rows={8} value={inputVal} readOnly={true} />
      </div>
    </div>
  );
};
export default VoiceComponent;
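To try it out, the component only needs to be rendered on a page; for example, in a Next.js App Router project (the file path and import path here are assumptions):

// app/voice/page.tsx (hypothetical location)
import VoiceComponent from "../components/VoiceComponent";

export default function Page() {
  return <VoiceComponent />;
}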
Then, when you click the Listen button, the microphone audio is streamed to the backend in real time.
Result
Other notes:
Modern browsers only allow microphone access in a secure context (HTTPS or localhost). If the microphone cannot be opened over plain HTTP, Chrome's --unsafely-treat-insecure-origin-as-secure flag can mark your origin as secure; see the CSDN post "新版chrome 解决在http协议下无法调用摄像头和麦克风的问题(不安全)".