前段时间用WebAPI的方式调用科大讯飞的语音识别功能,发现5s的音频需要接近6s的等待时间。不过通过调整每次发送数据的大小和发送间隔的等待时间,还是能够加快识别速度的(可惜是在写完那篇文章之后才发现的),详情见上篇文章http://t.csdnimg.cn/KQCFP。
所以尝试一下SDK的方式会不会快一点
参考:语音听写 Windows SDK 文档 | 讯飞开放平台文档中心 (xfyun.cn)
MSC for Windows&Linux API V1.4 | 讯飞开放平台文档中心
项目地址:GitHub - cheng8876001/SDKtest
1、注册科大讯飞账号并下载SDK文件
讯飞开放平台-以语音交互为核心的人工智能开放平台 (xfyun.cn)
进入控制台下载
这里我只选择了这一项功能,如果有需求可以选上对应的功能
2、将下载好的bin、include、libs三个文件夹复制到自己创建的项目中
根据官方文档的说法,这些下载的文件会和账号相对应起来,所以必须保证下载文件的账号和在程序中设置的APPid属于同一个账号。例如用A的账号下载了SDK,在程序中是B的APPid就无法正常使用。
3、科大讯飞提供的dll是用c/c++写的,c#无法直接调用,需要自己新建一个文件用于函数导入
3.1、将复制到项目文件夹里的msc.dll文件添加到项目中
3.2、创建一个新文件,导入各种状态与要使用的函数
这些状态的定义可以在下载的SDK文件的include/msp_errors.h和msp_types.h找到,太长了我就不贴代码了
// Native entry points imported from msc.dll (StdCall). Callers in this file cast the
// int return values to ErrorCode and compare them against ErrorCode.MSP_SUCCESS.

// Logs in to the SDK; "parameters" carries the login settings (e.g. the appid).
[DllImport("msc.dll",CallingConvention = CallingConvention.StdCall)]
public static extern int MSPLogin(string user, string password, string parameters);
// Logs out of the SDK.
[DllImport("msc.dll", CallingConvention = CallingConvention.StdCall)]
public static extern int MSPLogout();
// Starts a recognition session. The returned IntPtr is the session handle passed to the
// other QISR* functions; the outcome is written to errorCode.
[DllImport("msc.dll", CallingConvention = CallingConvention.StdCall)]
public static extern IntPtr QISRSessionBegin(string grammarList, string parameters, ref ErrorCode errorCode);
// Writes waveLen bytes of audio from waveData to the session. audioStatus marks the chunk
// as first / continue / last; epStatus and recogStatus are updated by the native call.
[DllImport("msc.dll", CallingConvention = CallingConvention.StdCall)]
public static extern int QISRAudioWrite(IntPtr sessionID, byte[] waveData, uint waveLen, AudioStatus audioStatus, ref EndPointerStatus epStatus, ref RecognitionStatus recogStatus);
// Fetches the recognition result. The returned IntPtr points at native text that the
// caller must decode itself; rsltStatus and errorCode are updated by the native call.
// NOTE(review): the encoding of the returned text presumably follows the result_encoding
// session parameter - confirm against the SDK documentation.
[DllImport("msc.dll", CallingConvention = CallingConvention.StdCall)]
public static extern IntPtr QISRGetResult(IntPtr sessionID, ref RecognitionStatus rsltStatus, int waitTime, ref ErrorCode errorCode);
// Ends the recognition session identified by sessionID.
[DllImport("msc.dll", CallingConvention = CallingConvention.StdCall)]
public static extern int QISRSessionEnd(IntPtr sessionID, string hints);
4、根据流程图敲代码
打开SDK文件夹的samples/samples.sln
里面的iat_online_sample项目就是在线语音识别,可以参考来实现功能
注意:_sessionBeginParameters中的各个键值对需要按照实际情况进行修改,尤其是result_type、result_encoding、aue这三个。涉及返回结果处理的函数也要根据自己的设置进行相应修改,主要是编码格式的问题。
处理返回结果的函数是:private string IntPtr2Str(IntPtr ptr),完整代码如下:
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows;
using System.Windows.Controls;
using System.Windows.Data;
using System.Windows.Documents;
using System.Windows.Input;
using System.Windows.Media;
using System.Windows.Media.Imaging;
using System.Windows.Navigation;
using System.Windows.Shapes;
namespace SDKtest
{
/// <summary>
/// Interaction logic for MainWindow.xaml: hosts a single button that starts one
/// speech-recognition run.
/// </summary>
public partial class MainWindow : Window
{
    // Recognition helper shared by every button click.
    private readonly Test _recognizer = new Test();

    /// <summary>
    /// Builds the window from its XAML definition.
    /// </summary>
    public MainWindow()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Click handler: runs the recognition synchronously on the calling thread.
    /// </summary>
    private void Button_Click(object sender, RoutedEventArgs e) => _recognizer.Send();
}
/// <summary>
/// Sends a recorded audio file to the iFlytek online dictation service through msc.dll
/// and shows the recognised text in a message box.
/// </summary>
public class Test : mscDLL
{
    // Bytes written per QISRAudioWrite call: 200 ms of 16 kHz / 16-bit audio
    // (one 20 ms frame is 640 bytes, 10 frames = 6400 bytes).
    private const int FrameSize = 6400;

    // Pause between consecutive SDK calls, in milliseconds.
    private const int PollDelayMs = 10;

    private string _loginParameters = "appid = ";
    private string _sessionBeginParameters = "sub = iat, domain = iat, language = zh_cn, accent = mandarin, sample_rate = 16000, result_type = json, result_encoding = utf-8, aue = raw";
    private string _filePath = "E:\\SystemData\\desktop\\recorded.wav"; // path of the recording to recognise

    /// <summary>
    /// Runs one complete recognition: login, session begin, audio upload, result
    /// retrieval, session end and logout. SDK errors are reported with a message box
    /// and abort the run.
    /// </summary>
    /// <remarks>
    /// The session is always ended and the SDK is always logged out exactly once,
    /// including when an SDK call fails or an exception (for example from reading
    /// the audio file) is thrown. The method blocks the calling thread until the
    /// recognition has finished.
    /// </remarks>
    public void Send()
    {
        try
        {
            ErrorCode errorCode = (ErrorCode)MSPLogin(null, null, _loginParameters); // log in
            if (errorCode != ErrorCode.MSP_SUCCESS)
            {
                MessageBox.Show($"MSPLogin Error!\r\n Error Code:{errorCode}");
                return;
            }

            // Read the audio before opening a session so that a missing or unreadable
            // file cannot leave a session open.
            byte[] audio = File.ReadAllBytes(_filePath);

            // Start one recognition session and hand over the session parameters.
            IntPtr sessionID = QISRSessionBegin(null, _sessionBeginParameters, ref errorCode);
            if (errorCode != ErrorCode.MSP_SUCCESS)
            {
                MessageBox.Show($"QISRSessionBegin Error!\r\n Error Code:{errorCode}");
                return;
            }

            try
            {
                RecognitionStatus recognitionStatus = RecognitionStatus.MSP_REC_STATUS_SUCCESS;
                if (!WriteAudio(sessionID, audio, ref recognitionStatus))
                {
                    return;
                }

                string text = ReadResult(sessionID, recognitionStatus);
                if (text != null)
                {
                    MessageBox.Show(text);
                }
            }
            finally
            {
                QISRSessionEnd(sessionID, null); // end this recognition session
            }
        }
        finally
        {
            MSPLogout(); // log out
        }
    }

    /// <summary>
    /// Uploads the audio in <see cref="FrameSize"/>-byte chunks and then signals the end of the audio.
    /// </summary>
    /// <param name="sessionID">Session handle returned by QISRSessionBegin.</param>
    /// <param name="audio">Complete audio data to upload.</param>
    /// <param name="recognitionStatus">Recogniser status, updated by every write.</param>
    /// <returns><c>true</c> when every write succeeded; <c>false</c> after an error was reported.</returns>
    private bool WriteAudio(IntPtr sessionID, byte[] audio, ref RecognitionStatus recognitionStatus)
    {
        EndPointerStatus endPointerStatus = EndPointerStatus.MSP_EP_LOOKING_FOR_SPEECH;
        ErrorCode errorCode;
        int offset = 0;

        while (offset < audio.Length)
        {
            int chunkSize = Math.Min(FrameSize, audio.Length - offset);

            // The very first chunk must be flagged as such; all others are continuations.
            AudioStatus audioStatus = offset == 0
                ? AudioStatus.MSP_AUDIO_SAMPLE_FIRST
                : AudioStatus.MSP_AUDIO_SAMPLE_CONTINUE;

            byte[] chunk = new byte[chunkSize];
            Array.Copy(audio, offset, chunk, 0, chunkSize);
            offset += chunkSize;

            errorCode = (ErrorCode)QISRAudioWrite(sessionID, chunk, (uint)chunkSize, audioStatus, ref endPointerStatus, ref recognitionStatus);
            if (errorCode != ErrorCode.MSP_SUCCESS)
            {
                MessageBox.Show($"QISRAudioWrite Error!Error Code:{errorCode}");
                return false;
            }

            // The end-point detector found the end of speech: the remaining audio is not needed.
            if (endPointerStatus == EndPointerStatus.MSP_EP_AFTER_SPEECH)
            {
                break;
            }

            Thread.Sleep(PollDelayMs);
        }

        // Tell the service that no more audio follows.
        errorCode = (ErrorCode)QISRAudioWrite(sessionID, null, 0, AudioStatus.MSP_AUDIO_SAMPLE_LAST, ref endPointerStatus, ref recognitionStatus);
        if (errorCode != ErrorCode.MSP_SUCCESS)
        {
            MessageBox.Show($"QISRAudioWrite Error!Error Code:{errorCode}");
            return false;
        }

        return true;
    }

    /// <summary>
    /// Polls QISRGetResult until the recogniser reports completion and joins the partial results.
    /// </summary>
    /// <param name="sessionID">Session handle returned by QISRSessionBegin.</param>
    /// <param name="recognitionStatus">Recogniser status left by the last audio write.</param>
    /// <returns>The recognised text (possibly empty), or <c>null</c> after an error was reported.</returns>
    private string ReadResult(IntPtr sessionID, RecognitionStatus recognitionStatus)
    {
        ErrorCode errorCode = ErrorCode.MSP_SUCCESS;
        StringBuilder text = new StringBuilder();

        while (recognitionStatus != RecognitionStatus.MSP_REC_STATUS_COMPLETE)
        {
            IntPtr rslt = QISRGetResult(sessionID, ref recognitionStatus, 0, ref errorCode);
            if (errorCode != ErrorCode.MSP_SUCCESS)
            {
                MessageBox.Show($"QISRGetResult Error!Error Code:{errorCode}");
                return null;
            }

            text.Append(IntPtr2Str(rslt)); // appending null is a no-op
            Thread.Sleep(PollDelayMs);
        }

        return text.ToString();
    }

    /// <summary>
    /// Decodes one native result and extracts the recognised words from its JSON payload.
    /// </summary>
    /// <param name="ptr">Pointer returned by QISRGetResult; may be <see cref="IntPtr.Zero"/>.</param>
    /// <returns>
    /// The concatenated "w" values of every "cw" entry under "ws", or <c>null</c> when the
    /// pointer is zero, the text is empty, or the payload contains no words.
    /// </returns>
    /// <remarks>
    /// The session requests result_encoding = utf-8, so the buffer is read as a
    /// NUL-terminated UTF-8 byte string. Reading it as UTF-16 would scan for a two-byte
    /// terminator and could run past the end of the native buffer.
    /// </remarks>
    private string IntPtr2Str(IntPtr ptr)
    {
        if (ptr == IntPtr.Zero)
        {
            return null;
        }

        // Find the terminating NUL byte.
        int length = 0;
        while (Marshal.ReadByte(ptr, length) != 0)
        {
            length++;
        }

        if (length == 0)
        {
            return null;
        }

        byte[] bytes = new byte[length];
        Marshal.Copy(ptr, bytes, 0, length);
        string jsonData = Encoding.UTF8.GetString(bytes);

        var jsonObject = JsonConvert.DeserializeObject<dynamic>(jsonData);
        if (jsonObject == null)
        {
            return null;
        }

        var data = jsonObject["ws"];
        if (data == null)
        {
            return null;
        }

        StringBuilder words = new StringBuilder();
        foreach (var item in data)
        {
            var candidates = item["cw"];
            if (candidates == null)
            {
                continue;
            }

            foreach (var item2 in candidates)
            {
                object word = item2["w"];
                if (word != null)
                {
                    words.Append(word.ToString());
                }
            }
        }

        return words.Length > 0 ? words.ToString() : null;
    }
}
}