上一篇文章进行了简单的SDK移植应用,但是我想达成的效果同示例的功能还是有区别,所以这篇文章探讨记录一下代码的实现原理和修改方法。
语音合成代码
1.官方源码
/*
* 语音合成(Text To Speech,TTS)技术能够自动将任意文字实时转换为连续的
* 自然语音,是一种能够在任何时间、任何地点,向任何人提供语音信息服务的
* 高效便捷手段,非常符合信息时代海量数据、动态更新和个性化查询的需求。
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include "qtts.h"
#include "msp_cmn.h"
#include "msp_errors.h"
/* WAV audio header layout; it defines the format of the generated audio file and normally needs no changes. */
typedef struct _wave_pcm_hdr
{
char riff[4]; // = "RIFF"
int size_8; // = FileSize - 8
char wave[4]; // = "WAVE"
char fmt[4]; // = "fmt "
int fmt_size; // = size of the next structure : 16
short int format_tag; // = PCM : 1
short int channels; // = number of channels : 1
int samples_per_sec; // = sample rate : 8000 | 6000 | 11025 | 16000
int avg_bytes_per_sec; // = bytes per second : samples_per_sec * bits_per_sample / 8
short int block_align; // = bytes per sample point : wBitsPerSample / 8
short int bits_per_sample; // = quantization bits : 8 | 16
char data[4]; // = "data";
int data_size; // = length of the raw audio data : FileSize - 44
} wave_pcm_hdr;
/* Default WAV header values; the initializers are positional and follow the field order of wave_pcm_hdr. */
wave_pcm_hdr default_wav_hdr =
{
{ 'R', 'I', 'F', 'F' }, // riff
0, // size_8 (starts at 0)
{'W', 'A', 'V', 'E'}, // wave
{'f', 'm', 't', ' '}, // fmt
16, // fmt_size
1, // format_tag : PCM
1, // channels
16000, // samples_per_sec
32000, // avg_bytes_per_sec
2, // block_align
16, // bits_per_sample
{'d', 'a', 't', 'a'}, // data
0 // data_size (starts at 0)
};
/**
 * Synthesize text with the iFlytek TTS session API and store it as a WAV file.
 *
 * A placeholder WAV header is written first, every audio chunk returned by
 * QTTSAudioGet() is appended, and finally the two size fields of the header
 * (offsets 4 and 40) are patched in place.
 *
 * @param src_text  NUL-terminated text to synthesize; must not be NULL.
 * @param des_path  Path of the output file (created/truncated); must not be NULL.
 * @param params    Session parameter string handed to QTTSSessionBegin().
 * @return MSP_SUCCESS on success; -1 for invalid arguments, when the file
 *         cannot be opened, or when writing the file fails; otherwise the
 *         error code reported by the failing QTTS* call.
 */
int text_to_speech(const char* src_text, const char* des_path, const char* params)
{
    int ret = -1;
    int write_failed = 0;
    FILE* fp = NULL;
    const char* sessionID = NULL;
    unsigned int audio_len = 0;
    wave_pcm_hdr wav_hdr = default_wav_hdr;
    int synth_status = MSP_TTS_FLAG_STILL_HAVE_DATA;

    if (NULL == src_text || NULL == des_path)
    {
        printf("params is error!\n");
        return ret;
    }
    /* "wb": create a binary file, truncating it if it already exists; write only. */
    fp = fopen(des_path, "wb");
    if (NULL == fp)
    {
        printf("open %s error.\n", des_path);
        return ret;
    }

    /* Start the synthesis session. */
    sessionID = QTTSSessionBegin(params, &ret);
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSSessionBegin failed, error code: %d.\n", ret);
        fclose(fp);
        return ret;
    }
    ret = QTTSTextPut(sessionID, src_text, (unsigned int)strlen(src_text), NULL);
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSTextPut failed, error code: %d.\n", ret);
        QTTSSessionEnd(sessionID, "TextPutError");
        fclose(fp);
        return ret;
    }

    printf("正在合成 ...\n");
    /* Reserve room for the header; its size fields are corrected once all audio is known. */
    if (fwrite(&wav_hdr, sizeof(wav_hdr), 1, fp) != 1)
        write_failed = 1;

    while (!write_failed)
    {
        /* Fetch the next chunk of synthesized audio. */
        const void* data = QTTSAudioGet(sessionID, &audio_len, &synth_status, &ret);
        if (MSP_SUCCESS != ret)
            break;
        if (NULL != data && audio_len > 0)
        {
            /* Only count bytes that really reached the file, so the header never overstates the data. */
            if (fwrite(data, 1, audio_len, fp) != audio_len)
            {
                write_failed = 1;
                break;
            }
            wav_hdr.data_size += audio_len;
        }
        if (MSP_TTS_FLAG_DATA_END == synth_status)
            break;
        printf(">");
        usleep(150*1000); /* avoid busy-polling the engine */
    }
    printf("\n");

    if (MSP_SUCCESS != ret)
    {
        printf("QTTSAudioGet failed, error code: %d.\n", ret);
        QTTSSessionEnd(sessionID, "AudioGetError");
        fclose(fp);
        return ret;
    }

    if (!write_failed)
    {
        /* Patch the header: size_8 at byte offset 4, data_size at byte offset 40. */
        wav_hdr.size_8 += wav_hdr.data_size + (sizeof(wav_hdr) - 8);
        if (0 != fseek(fp, 4, SEEK_SET)
            || 1 != fwrite(&wav_hdr.size_8, sizeof(wav_hdr.size_8), 1, fp)
            || 0 != fseek(fp, 40, SEEK_SET)
            || 1 != fwrite(&wav_hdr.data_size, sizeof(wav_hdr.data_size), 1, fp))
        {
            write_failed = 1;
        }
    }
    /* fclose() flushes buffered data, so its result is part of the write status. */
    if (0 != fclose(fp))
        write_failed = 1;
    fp = NULL;

    if (write_failed)
    {
        printf("write %s error.\n", des_path);
        QTTSSessionEnd(sessionID, "FileWriteError");
        return -1;
    }

    /* Synthesis finished: close the session. */
    ret = QTTSSessionEnd(sessionID, "Normal");
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSSessionEnd failed, error code: %d.\n", ret);
    }
    return ret;
}
/* The definitions above normally need no changes; customise the behaviour here, where they are called. */
int main(int argc, char* argv[])
{
    /* Login parameters. The appid is bound to the MSC library; replace it with the id you applied for. */
    const char* login_params = "appid = 5fa8bc98, work_dir = .";
    /*
     * Synthesis session parameters:
     * rdn:           how digits are pronounced
     * volume:        output volume
     * pitch:         output pitch
     * speed:         speaking rate
     * voice_name:    speaker
     * sample_rate:   audio sample rate
     * text_encoding: encoding of the input text
     */
    const char* session_begin_params = "voice_name = xiaoyan, text_encoding = utf8, sample_rate = 16000, speed = 50, volume = 50, pitch = 50, rdn = 2";
    const char* filename = "test.wav"; /* name of the generated audio file */
    const char* text = "亲爱的用户,您好,这是一个语音合成示例,感谢您对科大讯飞语音技术的支持!科大讯飞是亚太地区最大的语音上市公司,股票代码:002230"; /* text to synthesize */

    /* Log in: arguments are user name, password and login parameters. */
    const int login_rc = MSPLogin(NULL, NULL, login_params);
    if (MSP_SUCCESS == login_rc)
    {
        printf("\n###########################################################################\n");
        printf("## 语音合成(Text To Speech,TTS)技术能够自动将任意文字实时转换为连续的 ##\n");
        printf("## 自然语音,是一种能够在任何时间、任何地点,向任何人提供语音信息服务的 ##\n");
        printf("## 高效便捷手段,非常符合信息时代海量数据、动态更新和个性化查询的需求。 ##\n");
        printf("###########################################################################\n\n");

        /* Run the synthesis. */
        printf("开始合成 ...\n");
        const int tts_rc = text_to_speech(text, filename, session_begin_params);
        if (MSP_SUCCESS != tts_rc)
        {
            printf("text_to_speech failed, error code: %d.\n", tts_rc);
        }
        printf("合成完毕\n");
    }
    else
    {
        /* Login failed: skip synthesis and fall through to the common shutdown. */
        printf("MSPLogin failed, error code: %d.\n", login_rc);
    }

    /* Common shutdown: wait for a key press, then log out. */
    printf("按任意键退出 ...\n");
    getchar();
    MSPLogout();
    return 0;
}
源码的大部分都不需要我们进行修改,要想简单的调用只需要修改简单的几行代码即可。
这一部分修改生成语音的参数,修改合成的文件的名称,以及所需要合成的语音文字。
const char* session_begin_params = "voice_name = xiaoyan, text_encoding = utf8, sample_rate = 16000, speed = 50, volume = 50, pitch = 50, rdn = 2";// parameters of the synthesized voice
const char* filename = "test.wav"; // name of the generated audio file
const char* text = "亲爱的用户,您好,这是一个语音合成示例,感谢您对科大讯飞语音技术的支持!科大讯飞是亚太地区最大的语音上市公司,股票代码:002230"; // text to synthesize
调用语音合成函数进行合成。
ret = text_to_speech(text, filename, session_begin_params);// call the synthesis function to generate the audio file
2.终端输入要合成的语句
前半部分(直到语音合成函数 text_to_speech 为止)与官方源码相同,不再赘述。此部分代码本质上是编写了一个订阅器:订阅 xfwords 话题,每收到一条消息,就在回调函数中把消息文本合成为语音并播放。订阅器的写法可参考教程。
/**
 * Subscriber callback: synthesize the received text and play it with mplayer.
 *
 * The text is synthesized into "tts_sample.wav"; playback is started through
 * popen() with /tmp/cmd as mplayer's slave-mode command FIFO. If synthesis
 * fails nothing is played, so a stale file from an earlier message is never
 * replayed.
 *
 * @param msg  Received message; msg->data holds the text to speak.
 */
void xfcallback(const std_msgs::String::ConstPtr& msg)
{
    const char* session_begin_params = "voice_name = xiaoyan, text_encoding = utf8, sample_rate = 16000, speed = 50, volume = 50, pitch = 50, rdn = 2";
    const char* filename = "tts_sample.wav"; // name of the generated audio file
    const char* text = msg->data.c_str();

    std::cout<<"I heard :"<<text<<std::endl;

    /* Synthesize the text. */
    printf("开始合成 ...\n");
    const int ret = text_to_speech(text, filename, session_begin_params);
    if (MSP_SUCCESS != ret)
    {
        printf("text_to_speech failed, error code: %d.\n", ret);
        return;
    }
    printf("合成完毕\n");

    /* Recreate the command FIFO that mplayer reads in slave mode. */
    unlink("/tmp/cmd");
    if (0 != mkfifo("/tmp/cmd", 0777))
    {
        printf("mkfifo /tmp/cmd failed\n");
    }

    FILE* player = popen("mplayer -quiet -slave -input file=/tmp/cmd 'tts_sample.wav'","r");
    if (NULL == player)
    {
        printf("Mplayer Run Failed\n");
        return;
    }
    sleep(30);
    /* Release the stream and reap the child; the original never closed it. */
    pclose(player);
    printf("Mplayer Run Success\n");
}
/* Placeholder with an empty body; it currently does nothing. */
void toPlay() {}
/* Prompt, wait for one character on stdin, then log out of the MSC service.
 * Note: this function returns to its caller; it does not end the process. */
void toExit()
{
    fputs("按任意键退出 ...\n", stdout);
    (void)getchar();
    MSPLogout(); // log out
}
/**
 * Entry point of the TTS subscriber node.
 *
 * Logs in to the MSC service, then subscribes to the "xfwords" topic so that
 * every received message is handled by xfcallback(). If the login fails the
 * node is not started: toExit() performs the prompt/logout and main returns a
 * non-zero status (previously execution fell through and started the node).
 */
int main(int argc, char* argv[])
{
    /* Login parameters; the appid is bound to the MSC library, do not change it arbitrarily. */
    const char* login_params = "appid = 58249817, work_dir = .";

    /* Log in: arguments are user name, password and login parameters. */
    const int ret = MSPLogin(NULL, NULL, login_params);
    if (MSP_SUCCESS != ret)
    {
        printf("MSPLogin failed, error code: %d.\n", ret);
        toExit();
        return 1;
    }

    printf("\n###########################################################################\n");
    printf("## 语音合成(Text To Speech,TTS)技术能够自动将任意文字实时转换为连续的 ##\n");
    printf("## 自然语音,是一种能够在任何时间、任何地点,向任何人提供语音信息服务的 ##\n");
    printf("## 高效便捷手段,非常符合信息时代海量数据、动态更新和个性化查询的需求。 ##\n");
    printf("###########################################################################\n\n");

    ros::init(argc,argv,"xf_tts"); // initialise the ROS node
    ros::NodeHandle n;
    /* Subscribe to "xfwords"; xfcallback() runs for each message. The second
     * argument (1000) is the subscriber queue size. */
    ros::Subscriber sub = n.subscribe("xfwords",1000,xfcallback);
    ros::spin(); // process callbacks until the node shuts down

    printf("按任意键退出 ...\n");
    getchar();
    MSPLogout(); // log out
    return 0;
}
关于ros节点函数的具体介绍可以查看参考教程。
调用命令
roscore
rosrun xfei_asr tts_subscribe_speak
rostopic pub xfwords std_msgs/String "语音合成测试"
//xfwords是发布的话题,std_msgs/String是发布类型
3.通过节点发布消息
参照发布器的示例代码稍作修改,就可以通过节点发布消息。值得注意的是,消息的发布要写在循环中,做到连续发布,否则订阅方可能接收不到消息。
- 代码
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include "ros/ros.h"
#include "std_msgs/String.h"
#include <sstream>
#include <sys/types.h>
#include <sys/stat.h>
/* Publisher node: publishes a fixed greeting on the "xfwords" topic, paced by a ros::Rate of 10. */
int main(int argc, char* argv[])
{
    ros::init(argc, argv, "tts_pub"); // initialise ROS and name the node
    ros::NodeHandle n;
    ros::Publisher xfwords_pub = n.advertise<std_msgs::String>("xfwords", 1000);
    ros::Rate loop_rate(10);

    // One iteration = build message, echo it, publish, service callbacks, sleep.
    for (; ros::ok(); loop_rate.sleep())
    {
        std_msgs::String msg;
        msg.data = "hello , my name is spotmicro ! \n ";
        fputs(msg.data.c_str(), stdout);
        xfwords_pub.publish(msg);
        ros::spinOnce();
    }
    return 0;
}
- 运行结果展示
语音识别代码
1.官方源码
移植示例:iat_online_record_samples,主要功能是从麦克风接收实时语音输入,并将其转化为文本输出。
主要有三部分代码,iat_online_record_sample,linuxrec, speech_recognizer
- iat_online_record_sample
这一部分的代码是主函数程序,定义了一些输出函数,调用了语音识别函数。
/*
* 语音听写(iFly Auto Transform)技术能够实时地将语音转换成对应的文字。
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "qisr.h"
#include "msp_cmn.h"
#include "msp_errors.h"
#include "speech_recognizer.h"
#define FRAME_LEN 640
#define BUFFER_SIZE 4096
/**
 * Upload the user word list stored in "userwords.txt" via MSPUploadData().
 *
 * The whole file is read into a NUL-terminated heap buffer, uploaded, and the
 * buffer and file handle are released on every path.
 *
 * @return MSP_SUCCESS on success; -1 when the file cannot be opened, measured
 *         or read, or when memory runs out; otherwise the error code reported
 *         by MSPUploadData().
 */
static int upload_userwords()
{
    char* userwords = NULL;
    long file_size = 0;
    size_t len = 0;
    size_t read_len = 0;
    FILE* fp = NULL;
    int ret = -1;

    fp = fopen("userwords.txt", "rb");
    if (NULL == fp)
    {
        printf("\nopen [userwords.txt] failed! \n");
        goto upload_exit;
    }

    /* Measure the file: seek to the end, read the offset, rewind.
     * ftell() returns -1 on failure, which must not be used as a size. */
    if (0 != fseek(fp, 0, SEEK_END)
        || (file_size = ftell(fp)) < 0
        || 0 != fseek(fp, 0, SEEK_SET))
    {
        printf("\nread [userwords.txt] failed!\n");
        goto upload_exit;
    }
    len = (size_t)file_size;

    userwords = (char*)malloc(len + 1);
    if (NULL == userwords)
    {
        printf("\nout of memory! \n");
        goto upload_exit;
    }

    read_len = fread((void*)userwords, 1, len, fp);
    if (read_len != len)
    {
        printf("\nread [userwords.txt] failed!\n");
        goto upload_exit;
    }
    userwords[len] = '\0';

    /* Upload the user word list. */
    MSPUploadData("userwords", userwords, len, "sub = uup, dtt = userword", &ret);
    if (MSP_SUCCESS != ret)
    {
        printf("\nMSPUploadData failed ! errorCode: %d \n", ret);
        goto upload_exit;
    }

upload_exit:
    if (NULL != fp)
    {
        fclose(fp);
        fp = NULL;
    }
    if (NULL != userwords)
    {
        free(userwords);
        userwords = NULL;
    }
    return ret;
}
/* Print the recognition text on the current console line (leading '\r');
 * a newline is emitted only when is_over is non-zero. */
static void show_result(char *string, char is_over)
{
    printf("\rResult: [ %s ]", string);
    if (!is_over)
        return;
    putchar('\n');
}
/* Accumulated recognition text and the capacity (in bytes) of its buffer. */
static char *g_result = NULL;
static unsigned int g_buffersize = BUFFER_SIZE;

/**
 * Result callback: append a result fragment to g_result and display the total.
 *
 * The buffer grows in BUFFER_SIZE steps until the fragment fits. If the buffer
 * has not been allocated yet it is created here, and if growing fails the text
 * collected so far is kept and the fragment is dropped.
 *
 * @param result   NUL-terminated fragment; NULL is ignored.
 * @param is_last  Non-zero when this is the final fragment (forwarded to show_result).
 */
void on_result(const char *result, char is_last)
{
    size_t used;
    size_t size;
    size_t needed;

    if (!result)
        return;

    if (!g_result) {
        /* No buffer yet (on_speech_begin not run, or its allocation failed). */
        g_result = (char*)malloc(BUFFER_SIZE);
        if (!g_result) {
            printf("mem alloc failed\n");
            return;
        }
        g_result[0] = '\0';
        g_buffersize = BUFFER_SIZE;
    }

    used = strlen(g_result);
    size = strlen(result);
    needed = used + size + 1; /* +1 for the terminating NUL */
    if (needed > g_buffersize) {
        size_t new_size = g_buffersize;
        char *grown;

        /* A single fragment may need more than one BUFFER_SIZE step. */
        while (new_size < needed)
            new_size += BUFFER_SIZE;
        grown = (char*)realloc(g_result, new_size);
        if (!grown) {
            /* realloc left the old block untouched; keep it. */
            printf("mem alloc failed\n");
            return;
        }
        g_result = grown;
        g_buffersize = (unsigned int)new_size;
    }
    strncat(g_result, result, size);
    show_result(g_result, is_last);
}
/**
 * Speech-begin callback: discard the previous result and start with an empty,
 * zero-filled buffer of BUFFER_SIZE bytes.
 *
 * If the allocation fails g_result is left NULL and the failure is reported;
 * the buffer is no longer cleared through a NULL pointer.
 */
void on_speech_begin()
{
    free(g_result); /* free(NULL) is a no-op */
    g_result = (char*)malloc(BUFFER_SIZE);
    g_buffersize = BUFFER_SIZE;
    if (g_result)
        memset(g_result, 0, g_buffersize);
    else
        printf("mem alloc failed\n");
    printf("Start Listening...\n");
}
/* Speech-end callback: report a normal end when reason is END_REASON_VAD_DETECT,
 * otherwise print the reason as a recognizer error code. */
void on_speech_end(int reason)
{
    if (reason != END_REASON_VAD_DETECT) {
        printf("\nRecognizer error %d\n", reason);
        return;
    }
    printf("\nSpeaking done \n");
}
/**
 * Demo: run recognition on audio read from an existing file.
 *
 * The whole file is loaded into memory and fed to the recognizer in chunks of
 * 10 * FRAME_LEN bytes (the final chunk carries whatever is left).
 *
 * The recognizer is stopped/uninitialised on exit only if sr_init() succeeded;
 * earlier failures (NULL path, open/read/alloc errors) used to reach the
 * cleanup code with an uninitialised struct speech_rec.
 *
 * @param audio_file            Path of the audio file; NULL does nothing.
 * @param session_begin_params  Session parameter string passed to sr_init().
 */
static void demo_file(const char* audio_file, const char* session_begin_params)
{
    int errcode = 0;
    int recognizer_ready = 0; /* set once sr_init() has succeeded */
    long file_len = 0;
    FILE* f_pcm = NULL;
    char* p_pcm = NULL;
    unsigned long pcm_count = 0;
    unsigned long pcm_size = 0;
    unsigned long read_size = 0;
    struct speech_rec iat;
    struct speech_rec_notifier recnotifier = {
        on_result,
        on_speech_begin,
        on_speech_end
    };

    if (NULL == audio_file)
        goto iat_exit;

    f_pcm = fopen(audio_file, "rb");
    if (NULL == f_pcm)
    {
        printf("\nopen [%s] failed! \n", audio_file);
        goto iat_exit;
    }

    fseek(f_pcm, 0, SEEK_END);
    file_len = ftell(f_pcm);
    fseek(f_pcm, 0, SEEK_SET);
    if (file_len < 0)
    {
        /* ftell() failed; -1 must not be turned into a huge unsigned size. */
        printf("\nread [%s] error!\n", audio_file);
        goto iat_exit;
    }
    pcm_size = (unsigned long)file_len;

    p_pcm = (char *)malloc(pcm_size);
    if (NULL == p_pcm)
    {
        printf("\nout of memory! \n");
        goto iat_exit;
    }

    read_size = fread((void *)p_pcm, 1, pcm_size, f_pcm);
    if (read_size != pcm_size)
    {
        printf("\nread [%s] error!\n", audio_file);
        goto iat_exit;
    }

    errcode = sr_init(&iat, session_begin_params, SR_USER, &recnotifier);
    if (errcode) {
        printf("speech recognizer init failed : %d\n", errcode);
        goto iat_exit;
    }
    recognizer_ready = 1;

    errcode = sr_start_listening(&iat);
    if (errcode) {
        printf("\nsr_start_listening failed! error code:%d\n", errcode);
        goto iat_exit;
    }

    while (1)
    {
        unsigned int len = 10 * FRAME_LEN; /* 200ms audio */
        int ret = 0;

        /* When less than two chunks remain, send the rest in one go. */
        if (pcm_size < 2 * len)
            len = pcm_size;
        if (len <= 0)
            break;

        ret = sr_write_audio_data(&iat, &p_pcm[pcm_count], len);
        if (0 != ret)
        {
            printf("\nwrite audio data failed! error code:%d\n", ret);
            goto iat_exit;
        }
        pcm_count += (long)len;
        pcm_size -= (long)len;
    }

    errcode = sr_stop_listening(&iat);
    if (errcode) {
        printf("\nsr_stop_listening failed! error code:%d \n", errcode);
        goto iat_exit;
    }

iat_exit:
    if (NULL != f_pcm)
    {
        fclose(f_pcm);
        f_pcm = NULL;
    }
    if (NULL != p_pcm)
    {
        free(p_pcm);
        p_pcm = NULL;
    }
    if (recognizer_ready)
    {
        sr_stop_listening(&iat);
        sr_uninit(&iat);
    }
}
/**
 * Demo: recognize speech captured from the microphone for 15 seconds.
 *
 * If listening cannot be started the recognizer is released and the function
 * returns at once, instead of idling for the full recording time and then
 * trying to stop a recording that never began.
 *
 * @param session_begin_params  Session parameter string passed to sr_init().
 */
static void demo_mic(const char* session_begin_params)
{
    int errcode;
    int i = 0;
    struct speech_rec iat;
    struct speech_rec_notifier recnotifier = {
        on_result,
        on_speech_begin,
        on_speech_end
    };

    errcode = sr_init(&iat, session_begin_params, SR_MIC, &recnotifier);
    if (errcode) {
        printf("speech recognizer init failed\n");
        return;
    }

    errcode = sr_start_listening(&iat);
    if (errcode) {
        printf("start listen failed %d\n", errcode);
        sr_uninit(&iat);
        return;
    }

    /* demo 15 seconds recording */
    while (i++ < 15)
        sleep(1);

    errcode = sr_stop_listening(&iat);
    if (errcode) {
        printf("stop listening failed %d\n", errcode);
    }
    sr_uninit(&iat);
}
/* main thread: start/stop record ; query the result of recgonization.
* record thread: record callback(data write)
* helper thread: ui(keystroke detection)
*/
/* Entry point of the dictation demo: log in, optionally upload the user word
 * list, then recognize from the microphone or from a recorded file. Logout is
 * performed on every path and the process always returns 0. */
int main(int argc, char* argv[])
{
    /* login params, please do keep the appid correct */
    const char* login_params = "appid = 5fa8bc98, work_dir = .";
    /*
    * See "iFlytek MSC Reference Manual"
    */
    const char* session_begin_params =
        "sub = iat, domain = iat, language = zh_cn, "
        "accent = mandarin, sample_rate = 16000, "
        "result_type = plain, result_encoding = utf8";
    int upload_on = 1; /* whether upload the user word */
    int aud_src = 0;   /* from mic or file */

    /* Login first: user name and password are NULL, the 3rd arg carries the login parameters. */
    int ret = MSPLogin(NULL, NULL, login_params);
    if (MSP_SUCCESS != ret) {
        printf("MSPLogin failed , Error code %d.\n",ret);
    } else {
        int proceed = 1; /* cleared when the word-list upload fails */

        printf("Want to upload the user words ? \n0: No.\n1: Yes\n");
        scanf("%d", &upload_on);
        if (upload_on)
        {
            printf("Uploading the user words ...\n");
            ret = upload_userwords();
            if (MSP_SUCCESS == ret)
                printf("Uploaded successfully\n");
            else
                proceed = 0;
        }

        if (proceed)
        {
            printf("Where the audio comes from?\n"
                "0: From a audio file.\n1: From microphone.\n");
            scanf("%d", &aud_src);
            if (0 == aud_src) {
                printf("Demo recgonizing the speech from a recorded audio file\n");
                demo_file("wav/iflytek02.wav", session_begin_params);
            } else {
                printf("Demo recognizing the speech from microphone\n");
                printf("Speak in 15 seconds\n");
                demo_mic(session_begin_params);
                printf("15 sec passed\n");
            }
        }
    }

    MSPLogout(); // Logout...
    return 0;
}
- speech_recognizer
这一部分定义了主要语音识别函数,包括从文件中识别和从麦克风识别。
/*
@file
@brief a simple demo to recognize speech from microphone
@author taozhang9
@date 2016/05/27
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include "speech_recognizer.h"
#include "qisr.h"
#include "msp_cmn.h"
#include "msp_errors.h"
#include "linuxrec.h"
/* Debug switch: when SR_DBGON is 1, sr_dbg expands to printf; otherwise it expands to nothing. */
#define SR_DBGON 1
#if SR_DBGON == 1
# define sr_dbg printf
#else
# define sr_dbg
#endif
/* Session parameters used by sr_init_ex() when the caller passes NULL. */
#define DEFAULT_SESSION_PARA \
"sub = iat, domain = iat, language = zh_cn, accent = mandarin, sample_rate = 16000, result_type = plain, result_encoding = utf8"
/* Positional initializer for a WAVEFORMATEX.
 * NOTE(review): the values look like PCM tag, 1 channel, 16000 Hz, 32000 bytes/s,
 * block align 2, 16 bits, structure size — confirm against the WAVEFORMATEX declaration. */
#define DEFAULT_FORMAT \
{\
WAVE_FORMAT_PCM, \
1, \
16000, \
32000, \
2, \
16, \
sizeof(WAVEFORMATEX) \
}
/* internal state */
enum {
SR_STATE_INIT,
SR_STATE_STARTED
};
/* Indirection macros for the memory routines used in this file. */
#define SR_MALLOC malloc
#define SR_MFREE free
#define SR_MEMSET memset
/* Millisecond sleep helper: converts ms to microseconds and calls usleep(). */
static void Sleep(size_t ms)
{
    const size_t usec = ms * 1000;
    usleep(usec);
}
/* Abort the current recognition after an error: stop the recorder when the
 * source is the microphone, report errcode through on_speech_end (if set),
 * end the session with reason "err", and return to the INIT state. */
static void end_sr_on_error(struct speech_rec *sr, int errcode)
{
    if (SR_MIC == sr->aud_src)
        stop_record(sr->recorder);

    if (NULL == sr->session_id) {
        /* No open session: only the state needs resetting. */
        sr->state = SR_STATE_INIT;
        return;
    }

    if (sr->notif.on_speech_end)
        sr->notif.on_speech_end(errcode);
    QISRSessionEnd(sr->session_id, "err");
    sr->session_id = NULL;
    sr->state = SR_STATE_INIT;
}
static void end_sr_on_vad(struct speech_rec *sr)
{
int errcode;
const char *rslt;
if (sr->aud_src == SR_MIC)
stop_record(sr->recorder);
while(sr->rec_stat != MSP_REC_STATUS_COMPLETE ){
rslt = QISRGetResult(sr->session_id, &sr->rec_stat, 0, &errcode);
if (rslt && sr->notif.on_result)
sr->notif.on_result(rslt, sr->rec_stat == MSP_REC_STATUS_COMPLETE ? 1 : 0);
Sleep(100); /* for cpu occupy, should sleep here */
}
if (sr->session_id) {
if (sr->notif.on_speech_end)
sr->notif.on_speech_end(END_REASON_VAD_DETECT);
QISRSessionEnd(sr->session_id, "VAD Normal");
sr->session_id = NULL;
}
sr->state = SR_STATE_INIT;
}
/* Recorder data callback: forwards a captured audio buffer to the recognizer.
 * Buffers are dropped when they are empty, when no recognizer is attached,
 * once the end of speech has been detected, or while the recognizer is not
 * in the STARTED state. A write failure aborts the recognition. */
static void iat_cb(char *data, unsigned long len, void *user_para)
{
    struct speech_rec *sr = (struct speech_rec *)user_para;
    int errcode;

    if (data == NULL || len == 0)
        return;
    if (sr == NULL)
        return;
    if (sr->ep_stat >= MSP_EP_AFTER_SPEECH)
        return;
    if (sr->state < SR_STATE_STARTED)
        return; /* ignore the data if error/vad happened */

    errcode = sr_write_audio_data(sr, data, len);
    if (errcode)
        end_sr_on_error(sr, errcode);
}
/* Return a pointer to the first character of s that is not a space or tab.
 * A NULL argument is returned unchanged. */
static char * skip_space(char *s)
{
    while (s && (*s == ' ' || *s == '\t'))
        s++;
    return s;
}

/**
 * Take the sample rate from a session parameter string and store it in wavefmt.
 *
 * Looks for "sample_rate", then the following '=', skips blanks after it and
 * parses the number. Both "sample_rate = 16000" and "sample_rate=16000" are
 * accepted; the previous code advanced to the next blank, so the form without
 * blanks parsed the wrong token and set the rate to 0.
 *
 * @param session_para  Session parameter string (e.g. "..., sample_rate = 16000, ...").
 * @param wavefmt       Format whose nSamplesPerSec / nAvgBytesPerSec are updated.
 * @return 0 when the format was updated; -1 when no positive sample rate was
 *         found (wavefmt is then left unchanged).
 */
static int update_format_from_sessionparam(const char * session_para, WAVEFORMATEX *wavefmt)
{
    char *s;
    int rate;

    if (!session_para || !wavefmt)
        return -1;

    s = (char *)strstr(session_para, "sample_rate");
    if (!s)
        return -1;
    s = strstr(s, "=");
    if (!s)
        return -1;

    s = skip_space(s + 1); /* step over '=' and any blanks after it */
    rate = atoi(s);
    if (rate <= 0)
        return -1;

    wavefmt->nSamplesPerSec = rate;
    wavefmt->nAvgBytesPerSec = wavefmt->nBlockAlign * wavefmt->nSamplesPerSec;
    return 0;
}
/**
 * Initialise a speech recognizer.
 *
 * devid will be ignored if aud_src is not SR_MIC ; use get_default_dev_id
 * to use the default input device. Currently the device list function is
 * not provided yet.
 *
 * @param sr                    Recognizer to initialise; it is zeroed first.
 * @param session_begin_params  Session parameters; NULL selects DEFAULT_SESSION_PARA.
 *                              A private copy is stored in sr.
 * @param aud_src               Audio source; for SR_MIC a recorder is created and opened.
 * @param devid                 Input device used when aud_src is SR_MIC.
 * @param notify                Callbacks to copy into sr; NULL leaves all callbacks
 *                              unset (previously a NULL pointer was dereferenced).
 * @return 0 on success, or a negative E_SR_* code: NOACTIVEDEVICE, INVAL,
 *         NOMEM or RECORDFAIL.
 */
int sr_init_ex(struct speech_rec * sr, const char * session_begin_params,
               enum sr_audsrc aud_src, record_dev_id devid,
               struct speech_rec_notifier * notify)
{
    int errcode;
    size_t param_size;
    WAVEFORMATEX wavfmt = DEFAULT_FORMAT;

    if (aud_src == SR_MIC && get_input_dev_num() == 0) {
        return -E_SR_NOACTIVEDEVICE;
    }
    if (!sr)
        return -E_SR_INVAL;
    if (session_begin_params == NULL) {
        session_begin_params = DEFAULT_SESSION_PARA;
    }

    SR_MEMSET(sr, 0, sizeof(struct speech_rec));
    sr->state = SR_STATE_INIT;
    sr->aud_src = aud_src;
    sr->ep_stat = MSP_EP_LOOKING_FOR_SPEECH;
    sr->rec_stat = MSP_REC_STATUS_SUCCESS;
    sr->audio_status = MSP_AUDIO_SAMPLE_FIRST;

    /* Keep a private copy of the parameter string (including its NUL). */
    param_size = strlen(session_begin_params) + 1;
    sr->session_begin_params = (char*)SR_MALLOC(param_size);
    if (sr->session_begin_params == NULL) {
        sr_dbg("mem alloc failed\n");
        return -E_SR_NOMEM;
    }
    strncpy(sr->session_begin_params, session_begin_params, param_size);

    /* The struct was zeroed above, so without a notifier every callback stays NULL. */
    if (notify)
        sr->notif = *notify;

    if (aud_src == SR_MIC) {
        errcode = create_recorder(&sr->recorder, iat_cb, (void*)sr);
        if (sr->recorder == NULL || errcode != 0) {
            sr_dbg("create recorder failed: %d\n", errcode);
            errcode = -E_SR_RECORDFAIL;
            goto fail;
        }
        update_format_from_sessionparam(session_begin_params, &wavfmt);
        errcode = open_recorder(sr->recorder, devid, &wavfmt);
        if (errcode != 0) {
            sr_dbg("recorder open failed: %d\n", errcode);
            errcode = -E_SR_RECORDFAIL;
            goto fail;
        }
    }
    return 0;

fail:
    if (sr->recorder) {
        destroy_recorder(sr->recorder);
        sr->recorder = NULL;
    }
    if (sr->session_begin_params) {
        SR_MFREE(sr->session_begin_params);
        sr->session_begin_params = NULL;
    }
    SR_MEMSET(&sr->notif, 0, sizeof(sr->notif));
    return errcode;
}
/* Convenience wrapper around sr_init_ex(): same arguments, with the capture
 * device taken from get_default_input_dev(). Returns whatever sr_init_ex() returns. */
int sr_init(struct speech_rec * sr, const char * session_begin_params,
            enum sr_audsrc aud_src, struct speech_rec_notifier * notify)
{
    return sr_init_ex(
        sr,
        session_begin_params,
        aud_src,
        get_default_input_dev(),
        notify);
}
int sr_start_listening(struct speech_rec *sr)
{
int ret;
const char* session_id = NULL;
int errcode = MSP_SUCCESS;
if (sr->state >= SR_STATE_STARTED) {
sr_dbg("already STARTED.\n");
return -E_SR_ALREADY;
}
session_id = QISRSessionBegin(NULL, sr->session_begin_params, &errcode); //��д����Ҫ�����һ������ΪNULL
if (MSP_SUCCESS != errcode)
{
sr_dbg("\nQISRSessionBegin failed! error code:%d\n", errcode);
return errcode;
}
sr->session_id = session_id;
sr->ep_stat = MSP_EP_LOOKING_FOR_SPEECH;
sr->rec_stat = MSP_REC_STATUS_SUCCESS;
sr->audio_status = MSP_AUDIO_SAMPLE_FIRST;
if (sr->aud_src == SR_MIC) {
ret = start_record(sr->recorder);
if (ret != 0) {
sr_dbg("start record failed: %d\n", ret);
QISRSessionEnd(session_id, "start record fail");
sr->session_id = NULL;
return -E_SR_RECORDFAIL;
}
}
sr->state = SR_STATE_STARTED;
if (sr->notif.on_speech_begin)
sr->notif.on_speech_begin();
return 0;
}
/* after stop_record, there are still some data callbacks: poll once per
 * millisecond until the recorder reports it has stopped. A timeout_ms of
 * (unsigned int)-1 waits without limit; otherwise the wait ends when the
 * countdown reaches zero. */
static void wait_for_rec_stop(struct recorder *rec, unsigned int timeout_ms)
{
    for (;;) {
        if (is_record_stopped(rec))
            return;
        Sleep(1);
        if (timeout_ms == (unsigned int)-1)
            continue; /* unlimited wait: no countdown */
        if (timeout_ms-- == 0)
            return;
    }
}
/**
 * Stop listening and collect the remaining recognition results.
 *
 * Stops the recorder (microphone source), sends the LAST audio marker, polls
 * QISRGetResult() until the status is COMPLETE (forwarding results to
 * on_result) and ends the session.
 *
 * @return 0 on success or when the recognizer was not started;
 *         -E_SR_RECORDFAIL when the recorder cannot be stopped; otherwise the
 *         error code of the failing QISR* call. When writing the LAST marker
 *         fails the session is ended and session_id is now cleared, so the
 *         ended session id is not reused later.
 */
int sr_stop_listening(struct speech_rec *sr)
{
    int ret = 0;
    const char * rslt = NULL;

    if (sr->state < SR_STATE_STARTED) {
        sr_dbg("Not started or already stopped.\n");
        return 0;
    }

    if (sr->aud_src == SR_MIC) {
        ret = stop_record(sr->recorder);
        if (ret != 0) {
            sr_dbg("Stop failed! \n");
            return -E_SR_RECORDFAIL;
        }
        wait_for_rec_stop(sr->recorder, (unsigned int)-1);
    }
    sr->state = SR_STATE_INIT;

    ret = QISRAudioWrite(sr->session_id, NULL, 0, MSP_AUDIO_SAMPLE_LAST, &sr->ep_stat, &sr->rec_stat);
    if (ret != 0) {
        sr_dbg("write LAST_SAMPLE failed: %d\n", ret);
        QISRSessionEnd(sr->session_id, "write err");
        sr->session_id = NULL;
        return ret;
    }

    while (sr->rec_stat != MSP_REC_STATUS_COMPLETE) {
        rslt = QISRGetResult(sr->session_id, &sr->rec_stat, 0, &ret);
        if (MSP_SUCCESS != ret) {
            sr_dbg("\nQISRGetResult failed! error code: %d\n", ret);
            end_sr_on_error(sr, ret);
            return ret;
        }
        if (NULL != rslt && sr->notif.on_result)
            sr->notif.on_result(rslt, sr->rec_stat == MSP_REC_STATUS_COMPLETE ? 1 : 0);
        Sleep(100);
    }

    QISRSessionEnd(sr->session_id, "normal");
    sr->session_id = NULL;
    return 0;
}
int sr_write_audio_data(struct speech_rec *sr, char *data, unsigned int len)
{
const char *rslt = NULL;
int ret = 0;
if (!sr )
return -E_SR_INVAL;
if (!data || !len)
return 0;
ret = QISRAudioWrite(sr->session_id, data, len, sr->audio_status, &sr->ep_stat, &sr->rec_stat);
if (ret) {
end_sr_on_error(sr, ret);
return ret;
}
sr->audio_status = MSP_AUDIO_SAMPLE_CONTINUE;
if (MSP_REC_STATUS_SUCCESS == sr->rec_stat) { //�Ѿ��в�����д���
rslt = QISRGetResult(sr->session_id, &sr->rec_stat, 0, &ret);
if (MSP_SUCCESS != ret) {
sr_dbg("\nQISRGetResult failed! error code: %d\n", ret);
end_sr_on_error(sr, ret);
return ret;
}
if (NULL != rslt && sr->notif.on_result)
sr->notif.on_result(rslt, sr->rec_stat == MSP_REC_STATUS_COMPLETE ? 1 : 0);
}
if (MSP_EP_AFTER_SPEECH == sr->ep_stat)
end_sr_on_vad(sr);
return 0;
}
/* Release what sr_init_ex() acquired: stop (if still running), close and
 * destroy the recorder, and free the stored session parameter string. */
void sr_uninit(struct speech_rec * sr)
{
    if (NULL != sr->recorder) {
        if (!is_record_stopped(sr->recorder)) {
            stop_record(sr->recorder);
        }
        close_recorder(sr->recorder);
        destroy_recorder(sr->recorder);
        sr->recorder = NULL;
    }

    if (NULL != sr->session_begin_params) {
        SR_MFREE(sr->session_begin_params);
        sr->session_begin_params = NULL;
    }
}
- linuxrec
代码略长,不再赘述,也没进行修改。
2.代码修改
针对上述三个文件,我们进行移植应用只需要关注第一个代码即可。
参考教程中的代码改写:相比于官方源码,删除了有关上传用户词库部分,删除了从文件中读取音频的部分,并且修改了读取时间,改为10秒钟。
主要结构还是由发布器和订阅器组成,订阅器订阅语音话题,识别到后就调用回调函数进行语音识别唤醒,值得注意的是话题的发布并没有写文本而是在终端中采用命令行输入的方式进行的。关于发布器,发布的是识别到的语音文本消息,可以供其他节点进行订阅。
/*
* 语音听写(iFly Auto Transform)技术能够实时地将语音转换成对应的文字。
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "qisr.h"
#include "msp_cmn.h"
#include "msp_errors.h"
#include "speech_recognizer.h"
#include <iconv.h>
#include "ros/ros.h"
#include "std_msgs/String.h"
#define FRAME_LEN 640
#define BUFFER_SIZE 4096
int wakeupFlag = 0 ;// wake-up flag: set to 1 by the WakeUp callback, cleared by main after a recognition run
int resultFlag = 0 ;// result flag: set to 1 by show_result, cleared by main when it publishes the result
/* Same as the official sample, with unneeded features removed.
 * Prints the recognition text on the current console line and raises
 * resultFlag; a newline is emitted only when is_over is non-zero. */
static void show_result(char *string, char is_over)
{
    resultFlag = 1;
    printf("\rResult: [ %s ]", string);
    if (!is_over)
        return;
    putchar('\n');
}
/* Accumulated recognition text and the capacity (in bytes) of its buffer. */
static char *g_result = NULL;
static unsigned int g_buffersize = BUFFER_SIZE;

/**
 * Result callback: append a result fragment to g_result and display the total.
 *
 * The buffer grows in BUFFER_SIZE steps until the fragment fits. If the buffer
 * has not been allocated yet it is created here, and if growing fails the text
 * collected so far is kept and the fragment is dropped.
 *
 * @param result   NUL-terminated fragment; NULL is ignored.
 * @param is_last  Non-zero when this is the final fragment (forwarded to show_result).
 */
void on_result(const char *result, char is_last)
{
    size_t used;
    size_t size;
    size_t needed;

    if (!result)
        return;

    if (!g_result) {
        /* No buffer yet (on_speech_begin not run, or its allocation failed). */
        g_result = (char*)malloc(BUFFER_SIZE);
        if (!g_result) {
            printf("mem alloc failed\n");
            return;
        }
        g_result[0] = '\0';
        g_buffersize = BUFFER_SIZE;
    }

    used = strlen(g_result);
    size = strlen(result);
    needed = used + size + 1; /* +1 for the terminating NUL */
    if (needed > g_buffersize) {
        size_t new_size = g_buffersize;
        char *grown;

        /* A single fragment may need more than one BUFFER_SIZE step. */
        while (new_size < needed)
            new_size += BUFFER_SIZE;
        grown = (char*)realloc(g_result, new_size);
        if (!grown) {
            /* realloc left the old block untouched; keep it. */
            printf("mem alloc failed\n");
            return;
        }
        g_result = grown;
        g_buffersize = (unsigned int)new_size;
    }
    strncat(g_result, result, size);
    show_result(g_result, is_last);
}
/**
 * Speech-begin callback: discard the previous result and start with an empty,
 * zero-filled buffer of BUFFER_SIZE bytes.
 *
 * If the allocation fails g_result is left NULL and the failure is reported;
 * the buffer is no longer cleared through a NULL pointer.
 */
void on_speech_begin()
{
    free(g_result); /* free(NULL) is a no-op */
    g_result = (char*)malloc(BUFFER_SIZE);
    g_buffersize = BUFFER_SIZE;
    if (g_result)
        memset(g_result, 0, g_buffersize);
    else
        printf("mem alloc failed\n");
    printf("Start Listening...\n");
}
/* Speech-end callback: report a normal end when reason is END_REASON_VAD_DETECT,
 * otherwise print the reason as a recognizer error code. */
void on_speech_end(int reason)
{
    if (reason != END_REASON_VAD_DETECT) {
        printf("\nRecognizer error %d\n", reason);
        return;
    }
    printf("\nSpeaking done \n");
}
/**
 * Recognize speech captured from the microphone for 10 seconds.
 * Compared with the official sample only the microphone path is kept and the
 * recording time is shortened from 15 to 10 seconds.
 *
 * If listening cannot be started the recognizer is released and the function
 * returns at once, instead of idling for the full recording time and then
 * trying to stop a recording that never began.
 *
 * @param session_begin_params  Session parameter string passed to sr_init().
 */
static void demo_mic(const char* session_begin_params)
{
    int errcode;
    int i = 0;
    struct speech_rec iat;
    struct speech_rec_notifier recnotifier = {
        on_result,
        on_speech_begin,
        on_speech_end
    };

    /* The recognition work itself is done by the speech_recognizer functions. */
    errcode = sr_init(&iat, session_begin_params, SR_MIC, &recnotifier);
    if (errcode) {
        printf("speech recognizer init failed\n");
        return;
    }

    errcode = sr_start_listening(&iat);
    if (errcode) {
        printf("start listen failed %d\n", errcode);
        sr_uninit(&iat);
        return;
    }

    /* demo 10 seconds recording */
    while (i++ < 10)
        sleep(1);

    errcode = sr_stop_listening(&iat);
    if (errcode) {
        printf("stop listening failed %d\n", errcode);
    }
    sr_uninit(&iat);
}
/* main thread: start/stop record ; query the result of recgonization.
* record thread: record callback(data write)
* helper thread: ui(keystroke detection)
*/
/* Subscriber callback, run when a message arrives on the wake-up topic:
 * prints a notice, waits 700 ms, then raises wakeupFlag. The message
 * content itself is not used. */
void WakeUp(const std_msgs::String::ConstPtr& msg)
{
    const unsigned int delay_us = 700 * 1000;

    fputs("waking up\r\n", stdout);
    usleep(delay_us);
    wakeupFlag = 1;
}
/**
 * Entry point of the voice recognition node.
 *
 * Subscribes to "voiceWakeup" and advertises "voiceWords". Each time the
 * wake-up flag is raised the node logs in to the MSC service, records and
 * recognizes speech from the microphone, and logs out again; recognized text
 * is then published on "voiceWords".
 *
 * Recognition is skipped when the login fails (it used to run anyway), and
 * the result is published only when a result buffer exists, so a std::string
 * is never built from a NULL pointer.
 */
int main(int argc, char* argv[])
{
    // Initialise ROS.
    ros::init(argc, argv, "voiceRecognition");
    ros::NodeHandle n;
    ros::Rate loop_rate(10);

    // Subscribe to the signal that wakes up speech recognition.
    ros::Subscriber wakeUpSub = n.subscribe("voiceWakeup", 1000, WakeUp);
    // Publish the recognized text.
    ros::Publisher voiceWordsPub = n.advertise<std_msgs::String>("voiceWords", 1000);

    const char* login_params = "appid = 5fa8bc98, work_dir = .";
    const char* session_begin_params =
        "sub = iat, domain = iat, language = zh_cn, "
        "accent = mandarin, sample_rate = 16000, "
        "result_type = plain, result_encoding = utf8";

    ROS_INFO("Sleeping..."); // logged to the terminal
    while (ros::ok())
    {
        // Recognition is triggered once WakeUp() has raised wakeupFlag.
        if (wakeupFlag)
        {
            ROS_INFO("Wakeup...");
            const int ret = MSPLogin(NULL, NULL, login_params);
            if (MSP_SUCCESS != ret) {
                printf("MSPLogin failed , Error code %d.\n",ret);
            } else {
                printf("Demo recognizing the speech from microphone\n");
                printf("Speak in 10 seconds\n");
                demo_mic(session_begin_params); // run the recognition
                printf("10 sec passed\n");
            }
            wakeupFlag=0;
            MSPLogout();
        }

        // A recognition result is ready: publish it.
        if (resultFlag) {
            resultFlag=0;
            if (NULL != g_result) {
                std_msgs::String msg;
                msg.data = g_result;
                voiceWordsPub.publish(msg);
            }
        }

        ros::spinOnce();
        loop_rate.sleep();
    }

    MSPLogout(); // Logout...
    return 0;
}
- 编写订阅器专门订阅语音识别到的消息
在编写此部分代码的时候遇到了一些问题,主要还是工作空间的问题:我一开始只是简单地在运行官方示例的文件夹中进行修改,但是编译过程中遇到了错误。要想编写这种能够发布话题的节点程序,需要先创建功能包,参考如下:
不过观察功能包的结构也可以自行添加编译文件,不再重新构建功能包。参考之前构建的功能包,主要包含两个编译文件。
对两个文件进行简单的修改即可。注意功能包不能重名。
移植成功:
- 编写订阅器
上述代码是直接在终端输入指令行查看的发布的消息,接下来编写脚本文件直接对消息进行订阅并且可以对其数据作进一步的处理。
根据订阅器示例改写如下,但是遇到了一个问题:节点启动失败。
- 问题描述
#include "ros/ros.h"
#include "std_msgs/String.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "qisr.h"
#include "msp_cmn.h"
#include "msp_errors.h"
#include "speech_recognizer.h"
#include <iconv.h>
int flag = 0; // raised (set to 1) by receiver() whenever a message has been handled

/* Subscriber callback: log the received text through ROS_INFO and raise flag. */
void receiver(const std_msgs::String::ConstPtr& msg)
{
    const char* heard = msg->data.c_str();
    ROS_INFO("I heard : [%s]", heard);
    flag = 1;
}
/* Subscriber node for the "voiceWords" topic.
 * NOTE: this is the version that shows the problem discussed in the article:
 * ros::spinOnce() is called a single time with no surrounding loop, so main
 * returns right after it and the node ends. The article's fix is to call
 * ros::spin() instead. */
int main(int argc, char* argv[])
{
ros::init(argc, argv, "receiver");
ros::NodeHandle n;
ros::Subscriber sub = n.subscribe("voiceWords", 1000, receiver);
ros::spinOnce(); // called once only; see the note above
return 0;
}
节点启动失败:
显示并没有任何节点订阅 voiceWords 这个话题。然后我考虑是不是因为我的发布器不是一直在发布消息(需要 wakeup 唤醒之后才会发布),导致订阅器接收不到消息,所以我修改代码使其一直处于发送消息的状态,但是还是同样的问题。原本只有在 resultFlag 为 1 时才会发送消息;修改后一直在发送消息,只是两种情况下发送的内容不同。
// Speech recognition finished (excerpt from the tail of the main loop)
std_msgs::String msg;
if(resultFlag){
resultFlag=0;
msg.data = g_result; // publish the recognized text
voiceWordsPub.publish(msg);
printf("pub\n");
}
else{
// No new result in this iteration: publish a placeholder message instead.
std::stringstream ss;
ss << "sorry, no vaild input ! \n ";
msg.data = ss.str();
voiceWordsPub.publish(msg);
}
ros::spinOnce();
loop_rate.sleep();
count++;
}
- 问题解决。
发现一个智障的问题,我订阅的时候用的函数是spinOnce(),又没有在外面添加循环,所以导致节点只运行一遍就自行结束了,所以才会发生上述的现象。只需要将函数修改为spin()即可解决此问题。
- 效果展示
此时发现还有一个问题就是对于中文的输出显示为乱码“?”,这是因为输出格式的问题,进行简单的代码修改即可。
将输出部分修改为如下所示:
/* Subscriber callback: write the received text to std::cout and raise flag. */
void receiver(const std_msgs::String::ConstPtr& msg)
{
    const char* heard = msg->data.c_str();
    std::cout << "I heard :" << heard << std::endl;
    flag = 1;
}
成功解决问题:
为了方便启动运行,考虑编写一个launch文件一起启动两个节点,也比较简单。
<launch>
<!-- Starts two nodes from the voice_synthesis package, iat_record_p and iat_record_s; both print to the screen. -->
<node name="iat_record_p" pkg="voice_synthesis" type="iat_record_p" output="screen"/>
<node name="iat_record_s" pkg="voice_synthesis" type="iat_record_s" output="screen"/>
</launch>
总结
通过语音合成和语音识别两个不同部分的代码研读以及初步修改,基本已经掌握了怎么样使用科大讯飞的语音模块,学会了编写发布器和订阅器,已经可以将识别到的语音文本成功发布并被其他节点成功订阅接收。接下来的目标是将语音识别到的消息传递给机器狗的控制程序,并初步控制机器狗的运动。