微软的语音识别,在这里我们简称它为SR(Speech Recognition)。SR分为两种监听模式:第一种是听写模式,即随意输入语音,监听对象将最为接近的字、词或句反馈出来;第二种是命令与控制模式,即划定范围监听:指定一组备选项作为监听范围,用户的语音输入被反馈成最为接近的一个选项。说得通俗一些:第一种是填空题,第二种是选择题。
之前转载的一篇文章《用SAPI实现Speech Recognition(SR) - 听写模式》介绍了“听写模式”的实现,这一篇给出“命令与控制”模式的例子程序。
#include <windows.h>
#include <sapi.h>
#include <stdio.h>
#include <string.h>
#include <atlbase.h>
#include "sphelper.h"
/// Blocks until the recognition context yields an event, then hands its result back.
///
/// @param pRecoCtxt  Recognition context to pull events from and to wait on.
/// @param ppResult   Receives the result pointer carried by the retrieved event
///                   (with a reference added when it is non-null).
/// @return The HRESULT of the last GetFrom()/WaitForNotifyEvent() call made.
inline HRESULT BlockForResult(ISpRecoContext * pRecoCtxt, ISpRecoResult ** ppResult)
{
    CSpEvent recoEvent;
    HRESULT status = S_OK;

    for (;;)
    {
        // A failed wait from the previous pass ends the polling.
        if (FAILED(status))
        {
            break;
        }

        status = recoEvent.GetFrom(pRecoCtxt);

        // Only S_FALSE keeps us waiting; S_OK and every failure code both
        // leave the loop.
        if (status != S_FALSE)
        {
            break;
        }

        status = pRecoCtxt->WaitForNotifyEvent(INFINITE);
    }

    ISpRecoResult * pRecognized = recoEvent.RecoResult();
    *ppResult = pRecognized;
    if (pRecognized != NULL)
    {
        // The caller receives its own reference to the result.
        pRecognized->AddRef();
    }
    return status;
}
/// Returns the stop-word string matching the user's default UI language.
///
/// Japanese gets a dedicated string; every other language falls back to
/// L"Stop". The returned pointer refers to a string literal and must not be
/// freed or modified.
const WCHAR * StopWord()
{
    const LANGID LangId = ::SpGetUserDefaultUILanguage();
    switch (LangId)
    {
    case MAKELANGID(LANG_JAPANESE, SUBLANG_DEFAULT):
        // The literal had been mangled into "}42N86\0b7..." (each \xNNNN escape
        // decoded as a two-digit escape), which embeds a NUL and cuts the
        // string short. Restored form: U+7D42 U+4E86, a backslash, six
        // katakana, a slash, six hiragana.
        // NOTE(review): presumably SAPI's "display\reading/reading" word
        // form -- confirm against the SAPI documentation.
        return L"\x7d42\x4e86\\\x30b7\x30e5\x30fc\x30ea\x30e7\x30fc/\x3057\x3085\x3046\x308a\x3087\x3046";
    default:
        return L"Stop";
    }
}
/// Runs one command-and-control recognition session.
///
/// Creates the shared recognition context, loads the grammar from "conf.xml",
/// activates it and then loops, printing (and optionally speaking / replaying)
/// every recognized phrase until waiting for a result fails.
///
/// Must be called between CoInitialize() and CoUninitialize(): all COM smart
/// pointers live inside this function, so they are released before the caller
/// uninitializes COM, on every return path.
///
/// @param fUseTTS  When true, recognized text is spoken back through the voice.
/// @param fReplay  When true, the retained audio of the utterance is replayed.
/// @return A negative step-specific code when setup fails, otherwise the
///         HRESULT that ended the recognition loop.
static int RunCommandSession(bool fUseTTS, bool fReplay)
{
    HRESULT hr = E_FAIL;
    CComPtr<ISpRecoContext> cpRecoCtxt;
    CComPtr<ISpRecoGrammar> cpRecoGrammar;
    CComPtr<ISpVoice> cpVoice;

    if (FAILED(hr = cpRecoCtxt.CoCreateInstance(CLSID_SpSharedRecoContext)))
    {
        printf("cpRecoCtxt.CoCreateInstance() fail. hr = %x", hr);
        return -2;
    }
    if (FAILED(hr = cpRecoCtxt->GetVoice(&cpVoice)))
    {
        printf("cpRecoCtxt->GetVoice() fail. hr = %x", hr);
        return -3;
    }
    if (!cpRecoCtxt || !cpVoice)
    {
        // Nothing to work with; report the (successful) HRESULT as before.
        return static_cast<int>(hr);
    }

    if (FAILED(hr = cpRecoCtxt->SetNotifyWin32Event()))
    {
        printf("cpRecoCtxt->SetNotifyWin32Event() fail. hr = %x", hr);
        return -4;
    }
    // Only recognition events are queued and only they signal the notify event.
    if (FAILED(hr = cpRecoCtxt->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION))))
    {
        printf("cpRecoCtxt->SetInterest() fail. hr = %x", hr);
        return -5;
    }
    // Retain the audio so that SpeakAudio() can replay the utterance later.
    if (FAILED(hr = cpRecoCtxt->SetAudioOptions(SPAO_RETAIN_AUDIO, NULL, NULL)))
    {
        printf("cpRecoCtxt->SetAudioOptions() fail. hr = %x", hr);
        return -6;
    }
    if (FAILED(hr = cpRecoCtxt->CreateGrammar(7, &cpRecoGrammar)))
    {
        printf("cpRecoCtxt->CreateGrammar() fail. hr = %x", hr);
        return -7;
    }
    // Keep the grammar disabled while it is being loaded and committed.
    if (FAILED(hr = cpRecoGrammar->SetGrammarState(SPGS_DISABLED)))
    {
        printf("cpRecoGrammar->SetGrammarState() fail. hr = %x", hr);
        return -8;
    }
    if (FAILED(hr = cpRecoGrammar->LoadCmdFromFile(L"conf.xml", SPLO_DYNAMIC)))
    {
        printf("cpRecoGrammar->LoadCmdFromFile() fail. hr = %x", hr);
        return -9;
    }
    SPSTATEHANDLE hRule;
    if (FAILED(hr = cpRecoGrammar->GetRule(L"COMMAND", NULL, SPRAF_Active, FALSE, &hRule)))
    {
        printf("cpRecoGrammar->GetRule() fail. hr = %x", hr);
        // NOTE(review): shares exit code -9 with the LoadCmdFromFile failure above.
        return -9;
    }
    // A static configuration file is used for now; loading commands
    // dynamically can be explored later.
    //if (FAILED(hr = cpRecoGrammar->ClearRule(hRule)))
    //{
    //    printf("cpRecoGrammar->ClearRule() fail. hr = %x", hr);
    //    return -10;
    //}
    //if (FAILED(hr = cpRecoGrammar->AddWordTransition(hRule, NULL, L"Frank Lee", NULL, SPWT_LEXICAL, 1, NULL)))
    //{
    //    printf("cpRecoGrammar->AddWordTransition(1) fail. hr = %x", hr);
    //    return -11;
    //}
    //if (FAILED(hr = cpRecoGrammar->AddWordTransition(hRule, NULL, L"self", NULL, SPWT_LEXICAL, 1, NULL)))
    //{
    //    printf("cpRecoGrammar->AddWordTransition(2) fail. hr = %x", hr);
    //    return -12;
    //}
    //if (FAILED(hr = cpRecoGrammar->AddWordTransition(hRule, NULL, L"SAPI beta", NULL, SPWT_LEXICAL, 1, NULL)))
    //{
    //    printf("cpRecoGrammar->AddWordTransition(3) fail. hr = %x", hr);
    //    return -13;
    //}
    if (FAILED(hr = cpRecoGrammar->Commit(NULL)))
    {
        printf("cpRecoGrammar->Commit() fail. hr = %x", hr);
        return -14;
    }
    if (FAILED(hr = cpRecoGrammar->SetGrammarState(SPGS_ENABLED)))
    {
        printf("cpRecoGrammar->SetGrammarState() fail. hr = %x", hr);
        return -15;
    }
    if (FAILED(hr = cpRecoGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE)))
    {
        // NOTE(review): reported but not treated as fatal, unlike the other
        // setup steps -- confirm whether that is intended.
        printf("cpRecoGrammar->SetRuleState() fail. hr = %x", hr);
    }

    printf("Read to listen your command:\n");
    CComPtr<ISpRecoResult> cpResult;
    while (SUCCEEDED(hr = BlockForResult(cpRecoCtxt, &cpResult)))
    {
        CSpDynamicString dstrText;
        if (cpResult && SUCCEEDED(cpResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL)))
        {
            // CW2A owns its buffer and frees it at the end of this scope; the
            // W2A macro would _alloca on every pass of this unbounded loop.
            CW2A narrowText(dstrText);
            printf("I heard: %s \n", static_cast<LPSTR>(narrowText));
            if (fUseTTS)
            {
                cpVoice->Speak(L"I heard", SPF_ASYNC, NULL);
                cpVoice->Speak(dstrText, SPF_ASYNC, NULL);
            }
            if (fReplay)
            {
                if (fUseTTS)
                    cpVoice->Speak(L"when you said", SPF_ASYNC, NULL);
                else
                    printf(" when you said ");
                cpResult->SpeakAudio(NULL, 0, NULL, NULL);
            }
        }
        // Always drop the result, even when GetText() failed: otherwise the
        // next &cpResult would overwrite (and leak) a live interface pointer.
        cpResult.Release();
    }
    return static_cast<int>(hr);
}

/// Entry point: parses the optional switches, initializes COM, runs the
/// recognition session and uninitializes COM again.
///
/// Recognized switches (case-insensitive): "-noTTS" disables spoken feedback,
/// "-noReplay" disables audio replay. Any other argument prints the usage
/// text and returns -1.
///
/// @return -1 on bad arguments, the CoInitialize() HRESULT if COM could not be
///         initialized, otherwise the value returned by RunCommandSession().
int main(int argc, char* argv[])
{
    bool fUseTTS = true; // turn TTS play back on or off
    bool fReplay = true; // turn Audio replay on or off

    // Process optional arguments
    for (int i = 1; i < argc; i++)
    {
        if (_stricmp(argv[i], "-noTTS") == 0)
        {
            fUseTTS = false;
            continue;
        }
        if (_stricmp(argv[i], "-noReplay") == 0)
        {
            fReplay = false;
            continue;
        }
        printf("Usage: %s [-noTTS] [-noReplay] ", argv[0]);
        return -1;
    }

    const HRESULT hrInit = ::CoInitialize(NULL);
    if (FAILED(hrInit))
    {
        return static_cast<int>(hrInit);
    }

    // The session releases all of its COM objects before returning, so COM is
    // uninitialized on every path, including setup failures.
    const int exitCode = RunCommandSession(fUseTTS, fReplay);
    ::CoUninitialize();
    return exitCode;
}
命令与控制模式需要使用到配置文件来定义“候选命令”范围,本例中用到XML配置文件“conf.xml”如下:
<GRAMMAR LANGID="804">
<DEFINE>
<ID NAME="CMD" VAL="10"/>
</DEFINE>
<RULE NAME="COMMAND" ID="CMD" TOPLEVEL="ACTIVE">
<L>
<p>东南大学</p>
<p>滴水洞</p>
<p>运行趋势分析</p>
<p>接地监视</p>
<p>模型异动</p>
<p>中科院</p>
</L>
</RULE>
</GRAMMAR>
C&C模式的优点是识别范围小,识别准确率高,可以识别非常用字词组合。
后续如果有机会将在以下几个方面继续研究:
1. 如何实现动态修改识别范围;
2. 如何实现用候选字词组合成的基本语法,例如“毕业于”+“东南大学”;
3. 如何阻断操作系统“控制指令”对识别过程的干扰。