While using whisper with fastgpt, speech recognition frequently failed or produced wrong results. After some research I found that Alibaba's FunASR also performs well, and testing showed that FunASR's real-world accuracy is far better than whisper's.
1. Start the FunASR service and test it
Install FunASR following the steps at: FunASR/runtime/docs/SDK_advanced_guide_offline_zh.md at main · alibaba-damo-academy/FunASR · GitHub. I deployed it with Docker and entered the container via docker attach <container id>.
Once the container is running, start the funasr-wss-server service program:
cd FunASR/runtime
nohup bash run_server.sh \
--download-model-dir /workspace/models \
--vad-dir damo/speech_fsmn_vad_zh-cn-16k-common-onnx \
--model-dir damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-onnx \
--punc-dir damo/punc_ct-transformer_cn-en-common-vocab471067-large-onnx \
--lm-dir damo/speech_ngram_lm_zh-cn-ai-wesp-fst \
--itn-dir thuduj12/fst_itn_zh \
--certfile 0 \
--hotword /workspace/models/hotwords.txt > log.txt 2>&1 &
# To disable SSL, add the parameter: --certfile 0
# To deploy with the timestamp or NN hotword model, set --model-dir to the corresponding model:
# damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-onnx (timestamp)
# damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404-onnx (NN hotword)
# To load hotwords on the server side, configure them on the host in ./funasr-runtime-resources/models/hotwords.txt (mapped to /workspace/models/hotwords.txt inside Docker):
# One hotword per line, in the format "hotword weight", e.g. 阿里巴巴 20 (note: there is no hard limit on hotwords, but to balance performance and accuracy, keep each hotword under 10 characters and the total count under 1k, with weights between 1 and 100)
I chose to disable SSL, so I added --certfile 0 to the command above.
The service does not start immediately after running this command; it first downloads the models, which takes a while. You can check the progress in the log file (log.txt).
After the server has started, test it from another machine on the same LAN (I tested from my own machine). Download the funasr_samples test code from:
https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/sample/funasr_samples.tar.gz
Go into the python folder and run: python funasr_wss_client.py --host "192.168.1.39" --port 10096 --mode 2pass --audio_in "../audio/asr_example.wav".
Alternatively, use the test example in the html directory, which is more intuitive.
Note that because SSL is disabled, the WebSocket address must start with ws:// rather than wss://. If you prefer to check the server from Node.js instead, see the rough sketch below.
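The following is only a rough connectivity-check sketch, not an official client: it assumes Node.js with the ws npm package installed (npm install ws), my server address 192.168.1.39:10096, and the sample audio shipped with funasr_samples; the message fields simply mirror the browser demo code shown in the next section. For thorough testing, stick with the provided python or html clients.
// connectivity-check sketch (assumptions: "ws" npm package, server at 192.168.1.39:10096, SSL disabled)
const WebSocket = require('ws');
const fs = require('fs');

const ws = new WebSocket('ws://192.168.1.39:10096'); // ws:// because SSL is disabled

ws.on('open', () => {
  // initial handshake message; fields mirror the browser demo in the next section
  ws.send(JSON.stringify({
    chunk_size: [5, 10, 5],
    wav_name: 'node-test',
    is_speaking: true,
    chunk_interval: 10,
    mode: 'offline'
  }));
  // send the whole sample file as binary, then signal end of speech
  ws.send(fs.readFileSync('../audio/asr_example.wav'));
  ws.send(JSON.stringify({ is_speaking: false }));
});

ws.on('message', (msg) => {
  console.log('server reply:', msg.toString());
  ws.close();
});

ws.on('error', (err) => console.error('connection failed:', err));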
2. Modify the JS code and integrate it into fastgpt
Since the FunASR call needs to be embedded in fastgpt, and fastgpt's front end is written in TypeScript, I went to the html directory of the funasr_samples project, studied its JS example code, and extracted the parts I needed for use from TS.
Below is the modified JS code. It has not been carefully trimmed: it only keeps the WebSocket-sending code from the example and rewrites it as asynchronous code, so that the recognized text can be handed back to the TS caller once the ws result arrives.
var isfilemode = true; // whether we are in file mode
// connection: the socket connector object used to talk to funasr-wss-server
var wsconnecter = new WebSocketConnectMethod({ msgHandle: getJsonMessage, stateHandle: getConnState });
var sampleBuf = new Int16Array();
var totalsend = 0;
var file_data_array; // array to save file data
let retText = "";
let endFlag = false; // set once a recognition result has been received
let resolveRecText;
function startWs() {
  return new Promise((resolve, reject) => {
    console.log("isfilemode:" + isfilemode);
    wsconnecter = new WebSocketConnectMethod({
      msgHandle: getJsonMessage,
      stateHandle: getConnState
    });
    // resolve will be called inside getJsonMessage once the result arrives
    resolveRecText = resolve;
    var ret = wsconnecter.wsStart();
    if (ret != 1) {
      console.log("failed to start the WebSocket connection");
      reject("failed to start the WebSocket connection");
    }
  });
}
function stopWS() {
  // flush any remaining audio bytes that did not fill a whole chunk in start_file_send
  if (sampleBuf.length > 0) {
    wsconnecter.wsSend(sampleBuf);
  }
  var chunk_size = new Array(5, 10, 5);
  var request = {
    "chunk_size": chunk_size,
    "wav_name": "h5",
    "is_speaking": false,
    "chunk_interval": 10,
    "wav_format": "mp3",
    "mode": getAsrMode(),
  };
  console.log("request:" + JSON.stringify(request));
  sampleBuf = new Int16Array();
  wsconnecter.wsSend(JSON.stringify(request));
  console.log("all data sent; waiting for the recognition result...");
  // the connection is closed in getJsonMessage after the result arrives
}
function getHotwords() {
  let val = "阿里巴巴 20 \r\n hello world 40";
  let items = val.split(/[(\r\n)\r\n]+/); // split by \r\n
  var jsonresult = {};
  const regexNum = /^[0-9]*$/; // test number
  for (let item of items) {
    let result = item.split(" ");
    if (result.length >= 2 && regexNum.test(result[result.length - 1])) {
      var wordstr = "";
      for (var i = 0; i < result.length - 1; i++)
        wordstr = wordstr + result[i] + " ";
      jsonresult[wordstr.trim()] = parseInt(result[result.length - 1]);
    }
  }
  console.log("jsonresult=" + JSON.stringify(jsonresult));
  return JSON.stringify(jsonresult);
}
function getAsrMode() {
  var item = "offline"; // 2pass / online / offline
  console.log("getAsrMode asr mode::" + item);
  return item;
}
function getJsonMessage(jsonMsg) {
  //console.log(jsonMsg);
  const data = JSON.parse(jsonMsg.data);
  retText = data['text'];
  endFlag = true;
  console.log("received message: " + data['text']);
  // resolve the Promise returned by startWs
  if (resolveRecText) {
    resolveRecText(data['text']);
  }
  // var rectxt = "" + data['text'];
  // var asrmodel = data['mode'];
  var is_final = data['is_final'];
  // var timestamp = data['timestamp'];
  // if (asrmodel == "2pass-offline" || asrmodel == "offline") {
  //   offline_text = offline_text + handleWithTimestamp(rectxt, timestamp); //rectxt; //.replace(/ +/g,"");
  //   rec_text = offline_text;
  // } else {
  //   rec_text = rec_text + rectxt; //.replace(/ +/g,"");
  // }
  // console.log("---offline_text: " + asrmodel + "," + offline_text);
  // console.log("---rec_text: " + rec_text);
  if (isfilemode == true && is_final == false) {
    console.log("call stop ws!");
    wsconnecter.wsStop();
  }
}
// connection state callback
function getConnState(connState) {
  if (connState === 0) { // on open
    console.log("connection established");
    if (isfilemode == true) {
      console.log("please wait; large files take longer");
      start_file_send();
    }
  } else if (connState === 1) {
    //stop();
  } else if (connState === 2) {
    stopWS();
  }
}
function start_file_send() {
  sampleBuf = new Uint8Array(file_data_array);
  console.log("start_file_send, sampleBuf length: " + sampleBuf.length);
  var chunk_size = 960; // for asr chunk_size [5, 10, 5]
  while (sampleBuf.length >= chunk_size) {
    let sendBuf = sampleBuf.slice(0, chunk_size);
    totalsend = totalsend + sendBuf.length;
    sampleBuf = sampleBuf.slice(chunk_size, sampleBuf.length);
    console.log("sending ws chunk of length: " + sendBuf.length);
    wsconnecter.wsSend(sendBuf);
  }
  stopWS();
}
export async function audio2Text(audioBuffer) {
  console.log("audio2Text received buffer, byteLength: " + audioBuffer.byteLength);
  file_data_array = audioBuffer;
  // downloadArrayBufferAsMP3(file_data_array);
  // after the file data is ready, connect the ws and send it
  try {
    const recText = await startWs();
    console.log("transcription result: " + recText);
    return recText;
  } catch (error) {
    console.error("error during transcription:", error);
    throw error; // propagate the error to the caller
  }
}
// debug helper: save an ArrayBuffer locally as an MP3 file
function downloadArrayBufferAsMP3(arrayBuffer, filename = 'download.mp3') {
  // wrap the ArrayBuffer in a Blob
  const blob = new Blob([arrayBuffer], { type: 'audio/mpeg' });
  // create a temporary URL for the Blob
  const url = URL.createObjectURL(blob);
  // create an <a> element to simulate a download click
  const a = document.createElement('a');
  a.style.display = 'none';
  a.href = url;
  a.download = filename; // file name of the download
  // attach the <a> element, then click it to start the download
  document.body.appendChild(a);
  a.click();
  // clean up: remove the <a> element and release the Blob URL
  document.body.removeChild(a);
  URL.revokeObjectURL(url);
}
// assuming you already have an ArrayBuffer containing MP3 data
// const arrayBuffer = ...
function handleWithTimestamp(tmptext, tmptime) {
  console.log("-----tmptext: " + tmptext);
  console.log("-----tmptime: " + tmptime);
  if (tmptime == null || tmptime == "undefined" || tmptext.length <= 0) {
    return tmptext;
  }
  tmptext = tmptext.replace(/。|?|,|、|\?|\.|\ /g, ","); // in case there are a lot of "。"
  var words = tmptext.split(","); // split into chinese sentences or english words
  var jsontime = JSON.parse(tmptime); //JSON.parse(tmptime.replace(/\]\]\[\[/g, "],[")); // in case there are a lot of segments produced by VAD
  var char_index = 0; // index for timestamp
  var text_withtime = "";
  for (var i = 0; i < words.length; i++) {
    if (words[i] == "undefined" || words[i].length <= 0) {
      continue;
    }
    console.log("words===", words[i]);
    console.log("words: " + words[i] + ",time=" + jsontime[char_index][0] / 1000);
    if (/^[a-zA-Z]+$/.test(words[i])) { // if it is english
      text_withtime = text_withtime + jsontime[char_index][0] / 1000 + ":" + words[i] + "\n";
      char_index = char_index + 1; // for english, the timestamp unit is roughly a word
    } else { // if it is chinese
      text_withtime = text_withtime + jsontime[char_index][0] / 1000 + ":" + words[i] + "\n";
      char_index = char_index + words[i].length; // for chinese, the timestamp unit is roughly a char
    }
  }
  return text_withtime;
}
//---------------- ws connecter ----------------
function WebSocketConnectMethod(config) { // socket connection helper class
  var speechSokt;
  var connKeeperID;
  var msgHandle = config.msgHandle;
  var stateHandle = config.stateHandle;
  this.wsStart = function () {
    var Uri = "ws://192.168.1.39:10096"; //"wss://111.205.137.58:5821/wss/" // asr websocket address; use wss://X.X.X.X:port/wss/ when SSL is enabled
    console.log("Uri:" + Uri);
    if ('WebSocket' in window) {
      speechSokt = new WebSocket(Uri); // create the socket connection
      speechSokt.onopen = function (e) { onOpen(e); }; // register handlers
      speechSokt.onclose = function (e) {
        console.log("onclose ws!");
        //speechSokt.close();
        onClose(e);
      };
      speechSokt.onmessage = function (e) { onMessage(e); };
      speechSokt.onerror = function (e) { onError(e); };
      return 1;
    } else {
      alert('This browser does not support WebSocket');
      return 0;
    }
  };
  // stop and send helpers
  this.wsStop = function () {
    if (speechSokt != undefined) {
      console.log("stop ws!");
      speechSokt.close();
    }
  };
  this.wsSend = function (oneData) {
    if (speechSokt == undefined) return;
    if (speechSokt.readyState === 1) { // 0:CONNECTING, 1:OPEN, 2:CLOSING, 3:CLOSED
      speechSokt.send(oneData);
    }
  };
  // message and state callbacks for the socket connection
  function onOpen(e) {
    // send the initial json handshake
    var chunk_size = new Array(5, 10, 5);
    var request = {
      "chunk_size": chunk_size,
      "wav_name": "h5",
      "is_speaking": true,
      "chunk_interval": 10,
      "itn": false,
      "mode": getAsrMode(),
    };
    // if (isfilemode) {
    //   request.wav_format = file_ext;
    //   if (file_ext == "wav") {
    //     request.wav_format = "PCM";
    //     request.audio_fs = file_sample_rate;
    //   }
    // }
    var hotwords = getHotwords();
    if (hotwords != null) {
      request.hotwords = hotwords;
    }
    console.log(JSON.stringify(request));
    speechSokt.send(JSON.stringify(request));
    console.log("connected");
    stateHandle(0);
  }
  function onClose(e) {
    stateHandle(1);
  }
  function onMessage(e) {
    msgHandle(e);
  }
  function onError(e) {
    console.log(e);
    stateHandle(2);
  }
}
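Before wiring the module into fastgpt, it can be exercised on its own from a plain test page. The snippet below is only an illustrative sketch: it assumes the file is saved as yxq.js (as described next) and that the test page contains an <input type="file" id="audioFile"> element; neither is part of fastgpt itself.
// standalone test of audio2Text (sketch; assumes yxq.js plus an <input type="file" id="audioFile"> element)
import { audio2Text } from './yxq';

document.getElementById('audioFile').addEventListener('change', async (event) => {
  const file = event.target.files[0];
  const arrayBuffer = await file.arrayBuffer(); // same input shape that useSpeech.ts passes below
  const text = await audio2Text(arrayBuffer);
  console.log('recognized text:', text);
});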
Put the code above into a js file, copy that file into fastgpt, and import it in useSpeech.ts. I put the code in a file named yxq.js.
The specific changes are as follows:
... other import code
import { audio2Text } from '../yxq'; // this line pulls in yxq.js
...
... other code
...
    mediaRecorder.current.onstop = async () => {
      if (!cancelWhisperSignal.current) {
        const formData = new FormData();
        let options = {};
        if (MediaRecorder.isTypeSupported('audio/webm')) {
          options = { type: 'audio/webm' };
        } else if (MediaRecorder.isTypeSupported('video/mp3')) {
          options = { type: 'video/mp3' };
        } else {
          console.error('no suitable mimetype found for this device');
        }
        console.log('options:', options);
        const blob = new Blob(chunks, options);
        // let optionsNew = { type: 'application/octet-binary' };
        // const blobNew = new Blob(chunks, optionsNew);
        const duration = Math.round((Date.now() - startTimestamp.current) / 1000);
        formData.append('file', blob, 'recording.mp3');
        formData.append(
          'data',
          JSON.stringify({
            ...props,
            duration
          })
        );
        setIsTransCription(true);
        try {
          // ---- start of the modified code ----
          const arrayBuffer = await blob.arrayBuffer();
          const text = await handleAudioBuffer(arrayBuffer); // await the return value of handleAudioBuffer
          console.log('speech-to-text finished:', text);
          onFinish(text); // assumes onFinish, defined earlier, handles the transcribed text
          // ---- end of the modified code ----
        } catch (error) {
          toast({
            status: 'warning',
            title: getErrText(error, t('common.speech.error tip'))
          });
        }
      }
      // close media stream
      stream.getTracks().forEach((track) => track.stop());
      setIsTransCription(false);
      setIsSpeaking(false);
    };
    mediaRecorder.current.onerror = (e) => {
      console.log('error', e);
      setIsSpeaking(false);
    };
    mediaRecorder.current.start();
  } catch (error) {
    toast({
      status: 'warning',
      title: getErrText(error, 'Whisper error')
    });
    console.log(error);
  }
};
async function handleAudioBuffer(audioBuffer: ArrayBuffer): Promise<string> {
  try {
    const text = await audio2Text(audioBuffer);
    console.log('transcribed text: ', text);
    return text; // make sure the text is returned
  } catch (error) {
    console.error('transcription error: ', error);
    throw error; // rethrow so the caller can handle it
  }
}
...
... other code
...