这个国庆节作死,答应别人在七天内做2个项目。智能客服项目是其中一个,当时是答应给隔壁兄弟团队做的。他们说实在搞不定了,节后要上线,我就因为随口说了一句:“这有什么难的”,结果祸从口出,这事情就落我头上了。
录音识别是智能客服大项目计划里面的一部分,简单说就是客服在跟用户聊天的时候,实时从声卡上抓取音频数据,然后发送给阿里云智能语音交互服务,识别成句子文本后,再发送给我们的智能客服助手服务器,生成提示展示给客服人员。听起来很简单,我当时也这么认为,但做完这个之后,我觉得以后没啥事还是少去隔壁团队那儿串门,^^。
这个小程序,步骤上分3步:录音,识别、提交。
1. 录音
本来是想着用java来写,开发能快点,而且java也有相应的AudioSystem音频处理模块,应该能轻松搞定。但是后来发现AudioSystem只能从麦克风这种音频输入设备读取数据,要想从声卡抓取音频数据只能用C++调用WASAPI来获取。网上也有不少例子,可以拿来借鉴。但这儿当时遇到的最大问题是,我抓取的音频格式是PCM FLOAT 32位,做音频处理的时候不方便,而且有些音频处理程序不支持(比如java的AudioSystem),所以我把它转换成了PCM SIGNED 16位。另外,音频数据在发送给识别服务之前要按照要求转换成单声道,采样率也要调整成16000Hz。音频处理比较头疼,开始打算用ffmpeg来处理,但是这个库太重,用起来太复杂,后来就自己上网查资料,自己写算法做的转换。
另外,网上也有种说法,就是说要在windows系统上进行录音,需要打开"立体声混音"(Stereo Mix)设备。其实是不需要的,除非你想同时进行声卡和麦克风录音。仅仅从声卡录音,通过WASAPI就够了。
2. 识别
将声卡抓取的音频数据调整成能被识别的格式之后,就可以调用阿里云提供的SDK进行发送了。因为文档比较齐全,所以这块儿还是挺顺利的。
3. 提交
因为智能服务接口是restful的,所以我这儿只用libcurl来进行HTTP处理。libcurl编译成静态库后,链接时总是报找不到函数入口。开始以为是忘记加extern "C",加了之后还是报错。最后,查了下资料,按照这个网页(https://blog.csdn.net/libaineu2004/article/details/79736921)的指导操作了下就好了。
4. 参考代码
其他模块都没什么难度,难点主要在音频处理部分,包括位深转换、采样率转换和单声道调整。这部分代码主要是基于网上的代码做了少量的修改。
Capture.h
#pragma once
#pragma comment(lib,"avrt.lib")
#include <Audioclient.h>
#include <mmdeviceapi.h>
#include <avrt.h>

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>
/**
 * RIFF container header that opens a WAVE file (12 bytes on disk).
 *
 * The size field uses a fixed-width 32-bit type so the in-memory layout equals
 * the file layout on every target; `unsigned long` is only 32 bits wide on
 * Windows and would double the field (and add padding) on LP64 platforms.
 */
struct WAVE_HEADER {
    char fccID[4];          // contains "RIFF"
    std::uint32_t dwSize;   // filled in last; Capture::wav() stores 36 + payload size here
    char fccType[4];        // contains "WAVE"
};
static_assert(sizeof(WAVE_HEADER) == 12, "WAVE_HEADER must match the 12-byte RIFF header");
/**
 * "fmt " chunk of a WAVE file: chunk id, chunk size and the 16-byte PCM format
 * description (24 bytes on disk in total).
 *
 * Fixed-width field types keep the in-memory layout identical to the file
 * layout regardless of how wide `unsigned long` is on the target platform.
 */
struct WAVE_FMT {
    char fccID[4];                    // contains "fmt "
    std::uint32_t dwSize;             // size of the format payload that follows; 16
    std::uint16_t wFormatTag;         // 1 for PCM
    std::uint16_t wChannels;          // channel count: mono = 1, stereo = 2
    std::uint32_t dwSamplesPerSec;    // sampling frequency
    std::uint32_t dwAvgBytesPerSec;   // == dwSamplesPerSec * wChannels * uiBitsPerSample / 8
    std::uint16_t wBlockAlign;        // == wChannels * uiBitsPerSample / 8
    std::uint16_t uiBitsPerSample;    // bits per sample point: 8 or 16
};
static_assert(sizeof(WAVE_FMT) == 24, "WAVE_FMT must match the 24-byte fmt chunk");
/**
 * Header of the "data" chunk that precedes the raw samples (8 bytes on disk).
 *
 * The size uses a fixed-width 32-bit type so the struct can be copied into a
 * file image byte-for-byte on any platform.
 */
struct WAVE_DATA {
    char fccID[4];          // contains "data"
    std::uint32_t dwSize;   // == NumSamples * wChannels * uiBitsPerSample / 8
};
static_assert(sizeof(WAVE_DATA) == 8, "WAVE_DATA must match the 8-byte data chunk header");
/**
 * Captures audio from the default render endpoint through a WASAPI loopback
 * stream and post-processes the captured PCM bytes (sample-rate conversion,
 * channel reduction, WAV wrapping).
 */
class Capture
{
public:
    /** Sets every pointer to null and the cached format values to "unknown". */
    Capture();

    /** Opens the loopback stream and starts it. Returns 0 on success, -1 on failure. */
    int start();

    /** Stops the stream and releases the stream objects and the format block. Returns 0. */
    int stop();

    /**
     * Reads one packet, appends it to `buffer`, then resamples the buffer to
     * `rate` and reduces it to one channel. `channels` is not used by the
     * implementation. Returns the resulting buffer size in bytes.
     */
    int cap(std::vector<BYTE>& buffer, int rate, int channels);

    /**
     * Prepends a 16-bit PCM WAV header describing `rate` / `channels` to
     * `buffer`. Returns the resulting buffer size in bytes.
     */
    int wav(std::vector<BYTE>& buffer, int rate, int channels);

private:
    /** Appends the next capture packet to `buffer`; returns the bytes read or -1. */
    int read(std::vector<BYTE>& buffer);

    /** Converts `buffer` from the capture sample rate to `rate`. */
    int resample(std::vector<BYTE>& buffer, int rate);

    /** Keeps only the first channel of the interleaved samples in `buffer`. */
    int singleChannel(std::vector<BYTE>& buffer);

    /** Rewrites a float mix format as 16-bit PCM; returns true if it changed the format. */
    bool adjustFormatTo16Bits(WAVEFORMATEX* pwfx);

    IAudioCaptureClient* m_pAudioCaptureClient;  // capture service of m_pAudioClient
    IAudioClient* m_pAudioClient;                // loopback stream on the render endpoint
    WAVEFORMATEX* m_pwfx;                        // stream format, obtained from GetMixFormat()
    IMMDevice* m_pMMDevice;                      // default render endpoint
    size_t m_FrameSize;                          // bytes per frame (all channels)
    int m_SampleRate;                            // capture sample rate, -1 until start()
    int m_Channels;                              // capture channel count, -1 until start()
};
Capture.cpp
#include "Capture.h"
// Early-exit helpers for functions that called CoInitialize(): when the check
// fails they call CoUninitialize() and make the enclosing function return -1.
// Each macro is wrapped in do { ... } while (0) so that `MACRO(x);` is exactly
// one statement (safe in an unbraced if/else), and the argument is
// parenthesized so that expressions such as `a == b` are tested as a whole.
#define RETURN_ON_ERROR(hr) do { if (FAILED(hr)) { CoUninitialize(); return -1; } } while (0)
#define RETURN_ON_NULL(p) do { if ((p) == NULL) { CoUninitialize(); return -1; } } while (0)
#define RETURN_ON_FALSE(b) do { if (!(b)) { CoUninitialize(); return -1; } } while (0)
/**
 * Rewrites an IEEE-float wave format in place so that it describes signed
 * 16-bit PCM with the same channel count and sample rate.
 *
 * Both the plain WAVE_FORMAT_IEEE_FLOAT tag and the WAVE_FORMAT_EXTENSIBLE
 * variant with a float sub-format are handled; any other format is left alone.
 *
 * @param pwfx format description to modify
 * @return true if the format was changed, false otherwise
 */
bool Capture::adjustFormatTo16Bits(WAVEFORMATEX *pwfx)
{
    // Sets the sample width to 16 bits and recomputes the fields derived from it.
    auto useSixteenBitSamples = [pwfx]() {
        pwfx->wBitsPerSample = 16;
        pwfx->nBlockAlign = pwfx->nChannels * pwfx->wBitsPerSample / 8;
        pwfx->nAvgBytesPerSec = pwfx->nBlockAlign * pwfx->nSamplesPerSec;
    };

    if (pwfx->wFormatTag == WAVE_FORMAT_IEEE_FLOAT)
    {
        pwfx->wFormatTag = WAVE_FORMAT_PCM;
        useSixteenBitSamples();
        return true;
    }

    if (pwfx->wFormatTag != WAVE_FORMAT_EXTENSIBLE)
    {
        return false;
    }

    PWAVEFORMATEXTENSIBLE pExtensible = reinterpret_cast<PWAVEFORMATEXTENSIBLE>(pwfx);
    if (!IsEqualGUID(KSDATAFORMAT_SUBTYPE_IEEE_FLOAT, pExtensible->SubFormat))
    {
        return false;
    }

    pExtensible->SubFormat = KSDATAFORMAT_SUBTYPE_PCM;
    pExtensible->Samples.wValidBitsPerSample = 16;
    useSixteenBitSamples();
    return true;
}
/**
 * Creates an idle capturer: no COM objects are held, the frame size is 0 and
 * the sample rate / channel count are -1 until start() fills them in.
 */
Capture::Capture()
    : m_pAudioCaptureClient(nullptr),
      m_pAudioClient(nullptr),
      m_pwfx(nullptr),
      m_pMMDevice(nullptr),
      m_FrameSize(0),
      m_SampleRate(-1),
      m_Channels(-1)
{
}
/**
 * Opens the default render endpoint in shared loopback mode and starts the
 * capture stream.
 *
 * The mix format reported by the device is rewritten to signed 16-bit PCM
 * (when it is a float format) before the stream is initialized, and the
 * resulting frame size, channel count and sample rate are cached in members.
 *
 * @return 0 on success, -1 as soon as any COM / WASAPI call fails. Objects
 *         obtained before the failure stay in the members.
 */
int Capture::start() {
    CoInitialize(NULL);

    IMMDeviceEnumerator *pMMDeviceEnumerator = NULL;
    HRESULT hr = CoCreateInstance(__uuidof(MMDeviceEnumerator), NULL, CLSCTX_ALL,
        __uuidof(IMMDeviceEnumerator), (void**)&pMMDeviceEnumerator);
    RETURN_ON_ERROR(hr);

    hr = pMMDeviceEnumerator->GetDefaultAudioEndpoint(eRender, eConsole, &m_pMMDevice);
    // The enumerator is only needed for this one call; release it before
    // checking hr so it is not leaked when the lookup fails.
    pMMDeviceEnumerator->Release();
    RETURN_ON_ERROR(hr);

    hr = m_pMMDevice->Activate(__uuidof(IAudioClient), CLSCTX_ALL, NULL, (void**)&m_pAudioClient);
    RETURN_ON_ERROR(hr);

    // The period value itself is not used; a failure here still aborts start().
    REFERENCE_TIME hnsDefaultDevicePeriod(0);
    hr = m_pAudioClient->GetDevicePeriod(&hnsDefaultDevicePeriod, NULL);
    RETURN_ON_ERROR(hr);

    hr = m_pAudioClient->GetMixFormat(&m_pwfx);
    RETURN_ON_ERROR(hr);

    /* Convert to signed 16-bit encoding. */
    adjustFormatTo16Bits(m_pwfx);
    m_FrameSize = (m_pwfx->wBitsPerSample / 8) * m_pwfx->nChannels;

    hr = m_pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, AUDCLNT_STREAMFLAGS_LOOPBACK, 0, 0, m_pwfx, 0);
    RETURN_ON_ERROR(hr);

    hr = m_pAudioClient->GetService(__uuidof(IAudioCaptureClient), (void**)&m_pAudioCaptureClient);
    RETURN_ON_ERROR(hr);

    hr = m_pAudioClient->Start();
    RETURN_ON_ERROR(hr);

    m_Channels = m_pwfx->nChannels;
    m_SampleRate = m_pwfx->nSamplesPerSec;

    // NOTE(review): COM is uninitialized here while the interface pointers are
    // kept for later use by read()/stop(); confirm this is safe for the
    // threading model the application runs under.
    CoUninitialize();
    return 0;
}
int Capture::stop() {
if (m_pAudioClient)
{
m_pAudioClient->Stop();
m_pAudioClient->Release();
m_pAudioClient = NULL;
}
if (m_pwfx != NULL)
{
CoTaskMemFree(m_pwfx);
m_pwfx = NULL;
}
if (m_pAudioCaptureClient != NULL)
{
m_pAudioCaptureClient->Release();
m_pAudioCaptureClient = NULL;
}
return 0;
}
int Capture::cap(std::vector<BYTE> &buffer, int rate, int channels)
{
read(buffer);
resample(buffer, rate);
singleChannel(buffer);
return buffer.size();
}
int Capture::read(std::vector<BYTE> &buffer) {
DWORD dwWaitResult;
UINT32 nNextPacketSize(0);
BYTE *pData = NULL;
UINT32 framesAvailable;
DWORD flags;
CoInitialize(NULL);
HRESULT hr = m_pAudioCaptureClient->GetBuffer(&pData, &framesAvailable, &flags, NULL, NULL);
RETURN_ON_ERROR(hr);
if (0 != framesAvailable)
{
buffer.insert(buffer.end(), pData, pData+framesAvailable * m_FrameSize);
}
m_pAudioCaptureClient->ReleaseBuffer(framesAvailable);
CoUninitialize();
return framesAvailable * m_FrameSize;
}
int Capture::resample(std::vector<BYTE>& buffer, int rate)
{
if (m_SampleRate == rate)return buffer.size();
if (m_pwfx == nullptr)return -1;
std::vector<BYTE> resultBuffer;
int bytes = m_pwfx->wBitsPerSample/8;
int sampleCount = buffer.size() / bytes;
int srcRate = m_pwfx->nSamplesPerSec;
int dstRate = rate;
int rateLen = srcRate / dstRate;
if (rateLen == 1) return buffer.size();
if (rateLen > 0) {
short tempRead = 0;
short tempSum = 0;
int flag = 0;
for (int i = 0; i < sampleCount; i++) {
memcpy(&tempRead, buffer.data()+i*bytes, bytes);
tempSum = tempSum + tempRead;
flag++;
if (flag == rateLen)
{
flag = 0;
tempSum = tempSum / rateLen;
resultBuffer.insert(resultBuffer.end(), ((BYTE*)&tempSum), ((BYTE*)&tempSum) + bytes);
tempSum = 0;
}
}
}
else {
rateLen = dstRate / srcRate;
int tempRead1;
int tempRead2;
int tempSum;
int tempAvgDiff;
int tempWrite;
int flag;
for (int i = 0; i < (sampleCount-1); i++) {
memcpy(&tempRead1, buffer.data() + i * bytes, bytes);
memcpy(&tempRead2, buffer.data() + i * bytes+ bytes, bytes);
tempSum = tempRead2 - tempRead1;
tempAvgDiff = tempSum / rateLen;
tempWrite = tempRead1;
flag = rateLen;
do
{
tempWrite += tempAvgDiff;
resultBuffer.insert(resultBuffer.end(), ((BYTE*)&tempWrite), ((BYTE*)&tempWrite) + bytes);
} while (--flag);
}
}
buffer.swap(resultBuffer);
return buffer.size();
}
int Capture::singleChannel(std::vector<BYTE>& buffer)
{
if (m_Channels == 1) return buffer.size();
size_t len = buffer.size() / 2;
int bytes = m_pwfx->wBitsPerSample / 8;
//std::vector<BYTE> singleBuffer(len);
BYTE *singleBuffer = new BYTE[len];
//singleBuffer.reserve(len);
for (int i = 0; i < len/bytes; i++) {
//singleBuffer.insert(singleBuffer.end(), buffer.data() + i*bytes * 2, buffer.data() + i*bytes * 2 + bytes);
memcpy(singleBuffer+i*bytes, buffer.data()+i*(2*bytes), bytes);
}
buffer.assign(singleBuffer, singleBuffer + len);
delete[] singleBuffer;
return buffer.size();
}
int Capture::wav(std::vector<BYTE>& buffer, int rate, int channels)
{
std::vector<BYTE> wavBuffer;
WAVE_HEADER pcmHEADER;
WAVE_FMT pcmFMT;
WAVE_DATA pcmDATA;
unsigned short m_pcmData;
int dataSize = buffer.size();
/* WAVE_HEADER */
memcpy(pcmHEADER.fccID, "RIFF", strlen("RIFF"));
memcpy(pcmHEADER.fccType, "WAVE", strlen("WAVE"));
pcmHEADER.dwSize = 36 + dataSize;
/* WAVE_FMT */
memcpy(pcmFMT.fccID, "fmt ", strlen("fmt "));
pcmFMT.dwSize = 16;
pcmFMT.wFormatTag = 1;
pcmFMT.wChannels = channels;
pcmFMT.dwSamplesPerSec = rate;
pcmFMT.uiBitsPerSample = 16;
/* ==dwSamplesPerSec*wChannels*uiBitsPerSample/8 */
pcmFMT.dwAvgBytesPerSec = pcmFMT.dwSamplesPerSec*pcmFMT.wChannels*pcmFMT.uiBitsPerSample / 8;
/* ==wChannels*uiBitsPerSample/8 */
pcmFMT.wBlockAlign = pcmFMT.wChannels*pcmFMT.uiBitsPerSample / 8;
/* WAVE_DATA */
memcpy(pcmDATA.fccID, "data", strlen("data"));
pcmDATA.dwSize = dataSize;
wavBuffer.insert(wavBuffer.end(), (BYTE*)&pcmHEADER, ((BYTE*)&pcmHEADER) + sizeof(WAVE_HEADER));
wavBuffer.insert(wavBuffer.end(), (BYTE*)&pcmFMT, ((BYTE*)&pcmFMT) + sizeof(WAVE_FMT));
wavBuffer.insert(wavBuffer.end(), (BYTE*)&pcmDATA, ((BYTE*)&pcmDATA) + sizeof(WAVE_DATA));
wavBuffer.insert(wavBuffer.end(), buffer.begin(), buffer.end());
buffer.swap(wavBuffer);
return buffer.size();
}