起因
因为有语音识别需求,且需要低延迟,在查找了很多ASR项目后,偶然间发现sherpa-onnx比较契合我的需求,因为可以本地部署还免费,查找了不少资料后发现很少有sherpa-onnx+unity的资料,因此尝试自己研究下。
sherpa-onnx
项目基础介绍
sherpa-onnx 是一个开源项目,旨在提供离线的语音识别、文本转语音、说话人识别和语音活动检测(VAD)功能。该项目支持很多语言,其中包括C#,在原项目文件中有C#示例,因此可以移植到unity中,本文参考了sherpa-onnx的TTS实现和原项目C#示例代码。
首先该项目进行语音识别需要语音识别大模型,我这边以 sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 模型作为测试,要使用该模型请自行前往该项目的官方网址下载。
然后需要导入相关的 dll 库和 lib 库,我在原项目文件中找了很久都没找到,最后在它的示例项目中找到。
现在万事俱备,接下来可以开始了。
首先在官方的C#示例代码中有流式和非流式的示例代码,此处先以流式作为例子,至于非流式道理差不多。直接上代码:
using SherpaOnnx;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Threading;
using UnityEngine;
public class MyAsr_Test : MonoBehaviour
{
    /// <summary>Sample rate (Hz) expected by the streaming zipformer model.</summary>
    private const int SampleRate = 16000;

    // Microphone capability range reported by Unity; 0/0 means "any frequency".
    private static int minFreq, maxFreq;

    // Interval between polls of the microphone ring buffer.
    private readonly WaitForSeconds pollInterval = new WaitForSeconds(0.2f);

    /// <summary>
    /// Sample index (within the looping 1-second clip) up to which audio has
    /// already been handed to the recognizer.
    /// </summary>
    private int lastSampling;

    // Snapshot of the full looping microphone clip, refreshed on every poll.
    private readonly float[] sampleBuffer = new float[SampleRate];

    private AudioClip microphoneClip;

    // When false, captured audio is discarded instead of being decoded.
    private bool recording = true;

    /// <summary>
    /// Verifies that a microphone device exists and queries its supported
    /// frequency range.
    /// </summary>
    /// <returns>true if at least one microphone device is present.</returns>
    bool Init()
    {
        if (Microphone.devices.Length > 0)
        {
            Debug.Log($"设备名称为:{Microphone.devices[0]}");
            Microphone.GetDeviceCaps(null, out minFreq, out maxFreq);
            if (minFreq == 0 && maxFreq == 0)
            {
                // Unity reports 0/0 when the device supports any frequency.
                maxFreq = 44100;
            }
            return true;
        }
        return false;
    }

    /// <summary>
    /// Turns feeding of microphone audio into the recognizer on or off.
    /// NOTE: method name keeps the original (misspelled) form in case it is
    /// referenced elsewhere via SendMessage/UnityEvent — confirm before renaming.
    /// </summary>
    void SetRecoeding(bool recording)
    {
        this.recording = recording;
        if (recording)
            lastSampling = 0; // restart consumption from the head of the ring buffer
    }

    /// <summary>
    /// Coroutine entry point: configures the sherpa-onnx streaming recognizer
    /// (plus an optional punctuation model), starts a looping 1-second
    /// microphone recording, and continuously decodes the captured audio.
    /// </summary>
    private IEnumerator Start()
    {
        // BUG FIX: the original ignored Init()'s return value and continued
        // even when no microphone device was present.
        if (!Init())
        {
            Debug.LogWarning("未找到麦克风设备");
            yield break;
        }
        yield return pollInterval;

        var model_dir = Path.Combine(Application.streamingAssetsPath, "SoundToTextModel");
        var modelDir = Path.Combine(model_dir, "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20");

        OnlineRecognizerConfig config = new();
        config.FeatConfig.SampleRate = SampleRate;
        config.FeatConfig.FeatureDim = 80;
        config.ModelConfig.Transducer.Encoder = Path.Combine(modelDir, "encoder-epoch-99-avg-1.onnx");
        config.ModelConfig.Transducer.Decoder = Path.Combine(modelDir, "decoder-epoch-99-avg-1.onnx");
        config.ModelConfig.Transducer.Joiner = Path.Combine(modelDir, "joiner-epoch-99-avg-1.onnx");
        config.ModelConfig.Paraformer.Encoder = "";
        config.ModelConfig.Paraformer.Decoder = "";
        config.ModelConfig.Tokens = Path.Combine(modelDir, "tokens.txt");
        config.ModelConfig.Provider = "cpu";
        config.ModelConfig.NumThreads = 1;
        config.ModelConfig.Debug = 0;
        config.DecodingMethod = "greedy_search";
        config.MaxActivePaths = 4;
        config.EnableEndpoint = 1;
        // Endpoint rules: rule 1 fires after 2.4 s of trailing silence with no
        // decoded text, rule 2 after 0.8 s of silence once some text exists,
        // rule 3 caps an utterance at 20 s.
        config.Rule1MinTrailingSilence = 2.4f;
        config.Rule2MinTrailingSilence = 0.8f;
        config.Rule3MinUtteranceLength = 20;

        #region Optional punctuation model
        // BUG FIX: a missing punctuation model used to abort the WHOLE
        // recognition coroutine (yield break). Now recognition simply runs
        // without punctuation when the model file is absent.
        OfflinePunctuation offlinePunctuation = null;
        string model_path = Path.Combine(model_dir, "sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12", "model.onnx");
        if (File.Exists(model_path))
        {
            OfflinePunctuationConfig opc = new OfflinePunctuationConfig();
            OfflinePunctuationModelConfig opmc = new OfflinePunctuationModelConfig();
            opmc.CtTransformer = model_path;
            opmc.NumThreads = 2;
            opmc.Provider = "cpu";
            opmc.Debug = 1;
            opc.Model = opmc;
            offlinePunctuation = new OfflinePunctuation(opc);
        }
        else
        {
            Debug.LogWarning("未找到标点模型,识别结果将不带标点");
        }
        #endregion

        OnlineRecognizer recognizer = new(config);
        OnlineStream s = recognizer.CreateStream();
        string lastText = string.Empty;
        int segmentIndex = 0;

        // Wait for the user to grant microphone permission.
        yield return Application.RequestUserAuthorization(UserAuthorization.Microphone);
        if (!Application.HasUserAuthorization(UserAuthorization.Microphone))
            yield break;

        // Start a 1-second looping recording and wait until capture actually begins.
        do
        {
            microphoneClip = Microphone.Start(null, true, 1, SampleRate);
            yield return null;
        }
        while (!Microphone.IsRecording(null));

        // RESOURCE FIX: release the microphone AND the native sherpa-onnx
        // handles on shutdown (the original only stopped the microphone).
        Application.quitting += () =>
        {
            Microphone.End(null);
            s.Dispose();
            recognizer.Dispose();
            offlinePunctuation?.Dispose();
        };
        Debug.Log("开始录音");

        while (true)
        {
            yield return pollInterval;
            int currentPos = Microphone.GetPosition(null);
            bool isSucceed = microphoneClip.GetData(sampleBuffer, 0);
            if (!recording)
                continue;
            if (isSucceed && lastSampling != currentPos)
            {
                // Copy only the samples captured since the last poll, handling
                // wrap-around of the 1-second ring buffer.
                float[] chunk;
                if (currentPos > lastSampling)
                {
                    chunk = new float[currentPos - lastSampling];
                    Array.Copy(sampleBuffer, lastSampling, chunk, 0, chunk.Length);
                }
                else
                {
                    int tail = SampleRate - lastSampling;
                    chunk = new float[tail + currentPos];
                    Array.Copy(sampleBuffer, lastSampling, chunk, 0, tail);
                    Array.Copy(sampleBuffer, 0, chunk, tail, currentPos);
                }
                lastSampling = currentPos;
                s.AcceptWaveform(SampleRate, chunk);
            }

            // Drain all frames the recognizer is ready to decode.
            while (recognizer.IsReady(s))
            {
                recognizer.Decode(s);
            }

            var text = recognizer.GetResult(s).Text;
            bool isEndpoint = recognizer.IsEndpoint(s);
            if (!string.IsNullOrWhiteSpace(text) && lastText != text)
            {
                // Intermediate (partial) result for the current segment.
                lastText = text;
                Debug.Log($"\r{segmentIndex}: {lastText}");
            }
            if (isEndpoint)
            {
                if (!string.IsNullOrWhiteSpace(text))
                {
                    // Final result for this segment; add punctuation when available.
                    ++segmentIndex;
                    lastText = offlinePunctuation != null ? offlinePunctuation.AddPunct(text) : text;
                    Debug.Log($"\r{segmentIndex}: {lastText}");
                }
                recognizer.Reset(s);
            }
        }
    }
}
上述是一个测试脚本,引入了 sherpa-onnx 自动添加标点的模型(该模型同样请去官网自行下载)。因为仅使用 sherpa-onnx 的 ASR 语音识别模型时,识别结果不会自动添加标点符号;如果不需要自动添加标点,可以将自动添加标点的那部分代码删掉。
将该脚本随便挂在一个物体上就可以测试了,个人觉得还行(这里以音乐来测试)。