一、演示效果
二、主要代码
using Cysharp.Threading.Tasks;
using System;
using System.Collections.Generic;
using System.IO;
using System.Security.Cryptography;
using System.Text;
using System.Threading;
using UnityEngine;
using UnityWebSocket;
namespace AIChat.Chat
{
/// <summary>
/// One JSON frame from the Tencent real-time ASR WebSocket, deserialized via
/// JsonUtility (field names must match the wire format exactly).
/// </summary>
[Serializable]
public class ApiResponse
{
    public int code;           // 0 on success; non-zero is an error frame
    public string message;     // human-readable status from the server
    public string voice_id;    // echoes the voice_id sent in the signed URL
    public string message_id;
    public Result result;      // recognition payload; may be absent on error/handshake frames
    public int final;          // 1 when this is the last frame of the session
}
/// <summary>
/// Recognition payload inside an <c>ApiResponse</c> frame (JsonUtility wire
/// format — field names must not change).
/// </summary>
[Serializable]
public class Result
{
    public int slice_type;        // segment state; 2 is treated downstream as "segment finished"
    public int index;
    public int start_time;        // segment timing — presumably milliseconds, per Tencent ASR docs; TODO confirm
    public int end_time;
    public string voice_text_str; // recognized text for this segment
    public int word_size;
    public string[] word_list;
    public object emotion_type;
}
/// <summary>
/// Sentinel message sent to the ASR server to mark the end of the audio
/// stream; JsonUtility serializes it as <c>{"type":"end"}</c>.
/// </summary>
[Serializable]
public class EndText
{
    // Fixed marker value required by the protocol; initialized inline instead
    // of in a constructor.
    public string type = "end";
}
/// <summary>
/// Streams microphone audio to Tencent Cloud's real-time speech-to-text (ASR)
/// service over a WebSocket and accumulates the recognized text.
/// Call <see cref="StartMicrophone"/> to begin streaming and
/// <see cref="StopMicrophone"/> to finish and receive the transcript.
/// (The class name keeps the original "Speach" spelling so existing
/// scene/prefab references do not break.)
/// </summary>
public class TencentSpeachToText : MonoBehaviour
{
    // Initially just the ASR host; replaced in Awake() with the full signed wss:// URL.
    [SerializeField] private string _SpeechRecognizeURL = "asr.cloud.tencent.com";
    [SerializeField] private string _appID = string.Empty;
    [SerializeField] private string _secretid = string.Empty;
    [SerializeField] private string _secretkey = string.Empty;
    // Tencent engine model, e.g. "16k_zh" (16 kHz Mandarin).
    [SerializeField] private string _engine_model_type = "16k_zh";

    // FIX: shared RNG instead of a new time-seeded Random per call, which can
    // repeat nonces when invoked in quick succession.
    private static readonly System.Random _nonceRandom = new System.Random();

    private IWebSocket socket;
    private string _microphoneDevice;
    private AudioClip _origionRecording;          // looping ring-buffer recording
    private CancellationTokenSource _cancelToken;
    private float[] _sampleBuffer;                // reusable per-chunk sample buffer
    private int _lastSample = 0;                  // ring-buffer position already sent
    private string _lastResult;                   // concatenation of finished segments
    private string _recognitionText;              // finished segments + current partial
    public WebSocketState state;

    private void Awake()
    {
        _SpeechRecognizeURL = GenerateSpeechRecognizeURL();
        InitializeDevice();
        _cancelToken = new CancellationTokenSource();
    }

    /// <summary>Selects the first available microphone device.</summary>
    private void InitializeDevice()
    {
        if (Microphone.devices.Length > 0)
        {
            _microphoneDevice = Microphone.devices[0];
        }
        else
        {
            Debug.LogError("没有找到麦克风设备。");
        }
    }

    private void Update()
    {
        // Mirror the socket state into a public field (inspector-visible).
        state = socket == null ? WebSocketState.Closed : socket.ReadyState;
    }

    #region 签名生成
    /// <summary>
    /// Builds the signed wss:// URL for Tencent's real-time ASR: the
    /// HMAC-SHA1 signature is computed over "host/path?sorted-params"
    /// (without the scheme), then URL-escaped and appended.
    /// </summary>
    private string GenerateSpeechRecognizeURL()
    {
        string baseUrl = $"{_SpeechRecognizeURL}/asr/v2/{_appID}?";
        SortedDictionary<string, string> headers = SplicingHeaders();
        baseUrl = MakeSignPlainText(baseUrl, headers);
        string signature = GetSignature(_secretkey, baseUrl);
        return $"wss://{baseUrl}&signature={Uri.EscapeDataString(signature)}";
    }

    /// <summary>Base64 HMAC-SHA1 of <paramref name="secret"/> keyed with <paramref name="signKey"/>.</summary>
    private string GetSignature(string signKey, string secret)
    {
        using (HMACSHA1 mac = new HMACSHA1(Encoding.UTF8.GetBytes(signKey)))
        {
            byte[] hash = mac.ComputeHash(Encoding.UTF8.GetBytes(secret));
            return Convert.ToBase64String(hash);
        }
    }

    /// <summary>Request parameters, sorted by key as the signing scheme requires.</summary>
    private SortedDictionary<string, string> SplicingHeaders()
    {
        return new SortedDictionary<string, string>
        {
            { "engine_model_type", _engine_model_type },
            // Signature expiry: 10 days from now, in Unix seconds.
            { "expired", ((int)DateTime.UtcNow.AddDays(10).Subtract(DateTime.UnixEpoch).TotalSeconds).ToString() },
            { "filter_dirty", "1" },
            { "filter_modal", "2" },
            { "filter_punc", "1" },
            { "needvad", "1" },
            { "nonce", _nonceRandom.Next(1000000000).ToString() },
            { "secretid", _secretid },
            { "timestamp", ((int)DateTime.UtcNow.Subtract(DateTime.UnixEpoch).TotalSeconds).ToString() },
            { "voice_format", "1" }, // 1 = PCM, matching the raw Int16 frames we send
            { "voice_id", Guid.NewGuid().ToString() }
        };
    }

    /// <summary>Appends "key=value&amp;" pairs to <paramref name="url"/> and trims the trailing '&amp;'.</summary>
    private string MakeSignPlainText(string url, SortedDictionary<string, string> requestParams)
    {
        StringBuilder builder = new StringBuilder(url);
        foreach (var kvp in requestParams)
        {
            builder.AppendFormat("{0}={1}&", kvp.Key, kvp.Value);
        }
        return builder.ToString().TrimEnd('&');
    }
    #endregion

    /// <summary>Opens a fresh WebSocket connection to the signed ASR URL.</summary>
    public void ConnectToServer()
    {
        socket = new WebSocket(_SpeechRecognizeURL);
        socket.OnOpen += Socket_OnOpen;
        socket.OnMessage += Socket_OnMessage;
        socket.OnClose += Socket_OnClose;
        socket.OnError += Socket_OnError;
        Debug.Log("<color=yellow>[Tencent]</color> 正在连接...");
        socket.ConnectAsync();
    }

    /// <summary>Closes the connection if one exists.</summary>
    public void CloseClient()
    {
        Debug.Log("<color=yellow>[Tencent]</color> 正在关闭...");
        // FIX: guard against being called before ConnectToServer() — the
        // original dereferenced a possibly-null socket.
        socket?.CloseAsync();
    }

    #region 语音识别
    /// <summary>
    /// Connects if needed, starts a looping 200-second 16 kHz recording and
    /// pumps ~40 ms audio chunks to the recognizer until the mic stops.
    /// </summary>
    public async UniTaskVoid StartMicrophone()
    {
        if (state != WebSocketState.Connecting && state != WebSocketState.Open)
        {
            ConnectToServer();
        }
        if (!Microphone.IsRecording(_microphoneDevice))
        {
            _origionRecording = Microphone.Start(_microphoneDevice, true, 200, 16000);
            // FIX: reset the read cursor for the new recording; a stale value
            // from a previous session would drop or mis-slice the first chunk.
            _lastSample = 0;
        }
        while (Microphone.IsRecording(_microphoneDevice))
        {
            SendAudioDataToRecognizer();
            await UniTask.Delay(40);
        }
    }

    /// <summary>
    /// Stops recording, sends the protocol end marker, and hands the
    /// accumulated transcript to <paramref name="callback"/>.
    /// </summary>
    public async UniTaskVoid StopMicrophone(Action<string> callback)
    {
        if (Microphone.IsRecording(_microphoneDevice))
        {
            // FIX: end the device we actually record from; End(null) stops the
            // default device, which need not be _microphoneDevice.
            Microphone.End(_microphoneDevice);
        }
        await UniTask.WaitUntil(() => !Microphone.IsRecording(_microphoneDevice));
        var endText = JsonUtility.ToJson(new EndText());
        SendData(endText);
        callback?.Invoke(_recognitionText);
        _lastResult = string.Empty;
    }

    /// <summary>Reads the samples recorded since the last call and sends them as 16-bit PCM.</summary>
    private void SendAudioDataToRecognizer()
    {
        int pos = Microphone.GetPosition(_microphoneDevice);
        int diff = pos - _lastSample;
        if (diff < 0)
        {
            // FIX: the recording loops (ring buffer), so the write position can
            // wrap past the end; the unread span is tail + head. The original
            // silently dropped that audio. NOTE(review): relies on
            // AudioClip.GetData wrapping reads on looping clips — confirm
            // against the Unity docs for the project's Unity version.
            diff += _origionRecording.samples;
        }
        if (diff > 0)
        {
            if (_sampleBuffer == null || _sampleBuffer.Length != diff * _origionRecording.channels)
            {
                _sampleBuffer = new float[diff * _origionRecording.channels];
            }
            _origionRecording.GetData(_sampleBuffer, _lastSample);
            byte[] ba = ConvertAudioClipDataToInt16ByteArray(_sampleBuffer);
            if (ba.Length != 0)
            {
                SendData(ba);
            }
        }
        _lastSample = pos;
    }

    /// <summary>Converts normalized [-1, 1] floats to little-endian Int16 PCM bytes.</summary>
    private byte[] ConvertAudioClipDataToInt16ByteArray(float[] data)
    {
        byte[] byteArray = new byte[data.Length * 2];
        for (int i = 0; i < data.Length; i++)
        {
            short value = (short)(data[i] * short.MaxValue);
            byteArray[i * 2] = (byte)(value & 0x00ff);
            byteArray[i * 2 + 1] = (byte)((value & 0xff00) >> 8);
        }
        return byteArray;
    }
    #endregion

    /// <summary>Sends a text frame, skipping when the socket cannot accept data.</summary>
    private void SendData(string sendText)
    {
        if (state == WebSocketState.Connecting || state == WebSocketState.Closed || state == WebSocketState.Closing)
        {
            return;
        }
        socket?.SendAsync(sendText);
    }

    /// <summary>Sends a binary frame, skipping when the socket cannot accept data.</summary>
    public void SendData(byte[] bytes)
    {
        if (state == WebSocketState.Connecting || state == WebSocketState.Closed || state == WebSocketState.Closing)
        {
            return;
        }
        socket?.SendAsync(bytes);
    }

    private void Socket_OnOpen(object sender, OpenEventArgs e)
    {
        Debug.Log($"<color=yellow>[Tencent]</color> 已连接: {_SpeechRecognizeURL}");
    }

    /// <summary>
    /// Accumulates partial results: _recognitionText is the finished segments
    /// plus the current partial; slice_type == 2 freezes the current segment
    /// into _lastResult.
    /// </summary>
    private void Socket_OnMessage(object sender, MessageEventArgs e)
    {
        if (e.IsBinary)
        {
            Debug.Log($"<color=yellow>[Tencent]</color> 接收到字节数据 ({e.RawData.Length}): {e.Data}");
        }
        else if (e.IsText)
        {
            Debug.Log($"<color=yellow>[Tencent]</color> 接收到文本: {e.Data}");
            var response = JsonUtility.FromJson<ApiResponse>(e.Data);
            // FIX: defensive null check on result — handshake/error frames
            // (code != 0) carry no recognition payload.
            if (response != null && response.result != null && response.final != 1)
            {
                _recognitionText = _lastResult + response.result.voice_text_str;
                if (response.result.slice_type == 2)
                {
                    _lastResult = _recognitionText;
                }
            }
        }
    }

    private void Socket_OnClose(object sender, CloseEventArgs e)
    {
        Debug.Log($"<color=yellow>[Tencent]</color> 已关闭: 状态码: {e.StatusCode}, 原因: {e.Reason}");
    }

    private void Socket_OnError(object sender, UnityWebSocket.ErrorEventArgs e)
    {
        Debug.Log($"<color=yellow>[Tencent]</color> 错误: {e.Message}");
    }

    private void OnApplicationQuit()
    {
        if (socket != null && socket.ReadyState != WebSocketState.Closed)
        {
            CloseClient();
        }
        // FIX: the token source created in Awake() was never released.
        _cancelToken?.Cancel();
        _cancelToken?.Dispose();
        _cancelToken = null;
    }
}
}
三、参考内容
腾讯实时语音识别API文档
UnityWebSocket