倒霉催的 Arduino 上的 SR 模型不行,又不想装 PIO,只能上 IDF,好讨厌(官方搞了个 arduino-esp32 组件,可在 IDF 中使用 Arduino 代码)
AFE-声学前端 (回声消除 噪声抑制)
WakeNet-唤醒词 检测 (WakeNet5 用于esp32 WakeNet 8,9 用于s3芯片)
MultiNet-命令词 识别(参ASR)
TTS 语音合成
示例: examples | components\esp-sr\test_apps
esp-skainet: ESP-Skainet 是乐鑫推出的智能语音助手,目前支持唤醒词识别和命令词识别。 - Gitee.com
chinese_tts 中文语音合成(TTS)
cn_speech_commands_recognition 命令词识别
deep_noise_suppression 深度噪声抑制(降噪)
direction_of_arrival 方位
en_speech_commands_recognition 英文命令识别
usb_mic_recorder usb声音记录
voice_activity_detection 语音活动检测(VAD)
voice_communication 语音通话
wake_word_detection 唤醒词检测(内有afe 和 wakenet 两种)
afe-wakenet 区别 (来源:esp-skainet\examples\wake_word_detection\readme.md)
只使用命令词唤醒 和只有单麦克风 并且少消耗CPU资源和内存 则可以使用wakenet
如果有双麦,同时要提升语音效果,使用afe接口
声学前端 (AEC回声消除 NS-噪声抑制 bss 目标声源与干扰音抑制)
MultiNet 输入为经过前端语音算法(AFE)处理过的音频(格式为 16 KHz,16 bit,单声道)。通过对音频进行识别,则可以对应到相应的汉字或单词。
// Minimal AFE (Acoustic Front-End) setup flow from the esp-sr docs:
// load models from the "model" flash partition, build an AFE config for
// the given input format string, then create the processing instance.
srmodel_list_t *models = esp_srmodel_init("model");
// "MMNR": input channel layout (M = mic channel, R = reference channel,
// N = unused) — presumably matches the board's I2S wiring; verify per docs.
afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF);
// Get the AFE interface handle (function table) for this configuration.
esp_afe_sr_iface_t *afe_handle = esp_afe_handle_from_config(afe_config);
// Create the AFE processing instance.
esp_afe_sr_data_t *afe_data = afe_handle->create_from_config(afe_config);
// The feed buffer must hold one chunk per input channel.
int feed_chunksize = afe_handle->get_feed_chunksize(afe_data);
int feed_nch = afe_handle->get_feed_channel_num(afe_data);
int16_t *feed_buff = (int16_t *) malloc(feed_chunksize * feed_nch * sizeof(int16_t));
// Push raw audio into the AFE pipeline (buffer is uninitialized here —
// this snippet only illustrates the call sequence).
afe_handle->feed(afe_data, feed_buff);
examples/wake_word_detection/wakenet/main/main.c · 乐鑫开源/esp-skainet - Gitee.com
示例:INMP441 麦克风 + 唤醒词(参考官方 NO-CODEC 示例)
PCM PDM 区别 I2S0 I2S1 区别 I2S可以PCM->PDM PDM->PCM 双向 I2S1无(参考官方文档)
#include "esp_log.h"
#include <stdio.h>
#include <freertos/FreeRTOS.h>
#include <driver/i2s_std.h>
#include <stdio.h>
#include <stdlib.h>
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "model_path.h"
#include "string.h"
#include "hiesp.h"
#include "hilexin.h"
// Log tag for this example.
static const char *TAG = "EXAMPLE-VAD";
// I2S DMA configuration: number of descriptors and frames per descriptor.
#define AUDIO_CODEC_DMA_DESC_NUM 6
#define AUDIO_CODEC_DMA_FRAME_NUM 240
#define AUDIO_CODEC_DEFAULT_MIC_GAIN 30.0
// INMP441 with L/R pin tied low transmits in the left slot.
i2s_std_slot_mask_t mic_slot_mask = I2S_STD_SLOT_LEFT;
//-------------------i2s pin mapping (alternate pins in trailing comments)
#define MIC_BCLK GPIO_NUM_5 //22
#define MIC_WS GPIO_NUM_4 // 23
#define MIC_DIN GPIO_NUM_6//26
// RX channel is used for the mic; tx_chan is declared but never opened here.
i2s_chan_handle_t rx_chan,tx_chan;
//uint32_t rbuf[ 16000*30/1000 * sizeof(short)];
//---------------------
#define CODEC_ADC_I2S_PORT 0
#define VAD_SAMPLE_RATE_HZ 16000
#define VAD_FRAME_LENGTH_MS 30
#define VAD_BUFFER_LENGTH (VAD_FRAME_LENGTH_MS * VAD_SAMPLE_RATE_HZ / 1000)
// 16-bit PCM chunk handed to WakeNet (512 samples = 32 ms @ 16 kHz).
int16_t buffer[512];
// Raw 32-bit I2S samples straight from the DMA.
int32_t b32[512];
// WakeNet interface (function table) and its model instance.
esp_wn_iface_t *wakenet;
model_iface_data_t *model_data;
/**
 * Audio capture + wake-word detection loop.
 *
 * Reads raw 32-bit I2S frames from the INMP441 mic, converts them to
 * saturated 16-bit PCM in `buffer`, and feeds each chunk to WakeNet.
 * Runs forever; `arg` is unused.
 */
static void task(void *arg)
{
    (void)arg;
    size_t bytes_read = 0;
    printf("in task");
    while (1) {
        // Read one full DMA chunk of raw 32-bit samples.
        if (i2s_channel_read(rx_chan, b32, sizeof(b32), &bytes_read, portMAX_DELAY) != ESP_OK) {
            ESP_LOGE("read", "Read Failed!");
            continue;  // nothing valid in b32 — don't feed stale data to WakeNet
        }
        // Only convert the samples the driver actually delivered.
        size_t samples = bytes_read / sizeof(int32_t);
        for (size_t i = 0; i < samples; i++) {
            // INMP441 data is left-aligned in the 32-bit slot; >>12 scales it
            // into (roughly) 16-bit range with some gain, then saturate.
            int32_t value = b32[i] >> 12;
            buffer[i] = (value > INT16_MAX) ? INT16_MAX
                      : (value < -INT16_MAX) ? -INT16_MAX
                      : (int16_t)value;
        }
        // WakeNet expects one chunk of 16 kHz mono 16-bit PCM per call.
        wakenet_state_t state = wakenet->detect(model_data, buffer);
        if (state == WAKENET_DETECTED) {
            printf("Detected\n");
        }
    }
}
/**
 * Example 1 entry point: configure I2S for an INMP441 mic, load the
 * "hilexin" WakeNet model from the "model" partition, then start the
 * detection task.
 */
void app_main(void)
{
    //-----------------I2S init (RX only)-----------
    i2s_chan_config_t rx_chan_cfg = {
        .id = I2S_NUM_0,
        .role = I2S_ROLE_MASTER,
        .dma_desc_num = AUDIO_CODEC_DMA_DESC_NUM,
        .dma_frame_num = AUDIO_CODEC_DMA_FRAME_NUM,
        .auto_clear_after_cb = true,
        .auto_clear_before_cb = false,
        .intr_priority = 0,
    };
    ESP_ERROR_CHECK(i2s_new_channel(&rx_chan_cfg, NULL, &rx_chan));
    i2s_std_config_t rx_cfg = {
        .clk_cfg = {
            .sample_rate_hz = 16000,                 // WakeNet expects 16 kHz
            .clk_src = I2S_CLK_SRC_DEFAULT,
            .mclk_multiple = I2S_MCLK_MULTIPLE_256,
        },
        .slot_cfg = {
            .data_bit_width = I2S_DATA_BIT_WIDTH_32BIT,  // INMP441: 24 valid bits in a 32-bit slot
            .slot_bit_width = I2S_SLOT_BIT_WIDTH_AUTO,
            .slot_mode = I2S_SLOT_MODE_MONO,
            .slot_mask = mic_slot_mask,              // left slot (L/R pin low)
            .ws_width = I2S_DATA_BIT_WIDTH_32BIT,
            .ws_pol = false,
            .bit_shift = true,                       // Philips timing: data lags WS by one BCLK
#ifdef I2S_HW_VERSION_2
            .left_align = true,
            .big_endian = false,
            .bit_order_lsb = false
#endif
        },
        .gpio_cfg = {
            .mclk = I2S_GPIO_UNUSED,
            .dout = I2S_GPIO_UNUSED,
            .bclk = MIC_BCLK,
            .ws = MIC_WS,
            .din = MIC_DIN,
            .invert_flags = {
                false, false, false,
            },
        },
    };
    ESP_ERROR_CHECK(i2s_channel_init_std_mode(rx_chan, &rx_cfg));
    ESP_ERROR_CHECK(i2s_channel_enable(rx_chan));
    //---------------------------------------
    vTaskDelay(500 / portTICK_PERIOD_MS);

    // Load SR models from the flash partition labelled "model".
    srmodel_list_t *models = esp_srmodel_init("model");
    if (models == NULL) {
        ESP_LOGE(TAG, "no SR models found; check the 'model' partition");
        return;
    }
    char *model_name = esp_srmodel_filter(models, ESP_WN_PREFIX, "hilexin");
    if (model_name == NULL) {  // strstr() below would be UB on NULL
        ESP_LOGE(TAG, "WakeNet model 'hilexin' not found in partition");
        return;
    }
    wakenet = (esp_wn_iface_t *)esp_wn_handle_from_name(model_name);
    model_data = wakenet->create(model_name, DET_MODE_95);
    int frequency = wakenet->get_samp_rate(model_data);
    int audio_chunksize = wakenet->get_samp_chunksize(model_data);
    ESP_LOGI(TAG, "wakenet sample rate: %d Hz, chunksize: %d samples", frequency, audio_chunksize);
    wakenet->set_det_threshold(model_data, 0.8, 1);
    // NOTE(review): reset_det_threshold() restores defaults, which appears to
    // undo the set_det_threshold() call just above — confirm intent.
    wakenet->reset_det_threshold(model_data);
    char *wake_words = esp_srmodel_get_wake_words(models, model_name);
    if (wake_words != NULL) {
        printf("wake words: %s\n", wake_words);
    }
    // Embedded reference clips; only their size is reported here.
    unsigned char *data = NULL;
    size_t data_size = 0;
    if (strstr(model_name, "hiesp") != NULL) {
        data = (unsigned char *)hiesp;
        data_size = sizeof(hiesp);
        printf("wake word: %s, size:%u\n", "hiesp", (unsigned)data_size);
    } else if (strstr(model_name, "hilexin") != NULL) {
        data = (unsigned char *)hilexin;
        data_size = sizeof(hilexin);
        printf("wake word: %s, size:%u\n", "hilexin", (unsigned)data_size);
    }
    (void)data;  // clip contents are not fed to the detector in this example

    xTaskCreate(task, "task", 4096, NULL, 5, NULL);
    // Keep app_main alive; detection runs in the spawned task.
    while (1) {
        printf(".");
        vTaskDelay(50 / portTICK_PERIOD_MS);
    }
}
模型 烧写 加载过程 在CMakeLists.txt文件中(参文档)
Espressif\frameworks\esp-adf\components\esp-sr\include\esp32s3\esp_mn_models.h
示例(skainet 下的 components\tests、examples,或是 esp-adf 下的 components\sr,或是 examples):
过程 读取 然后使用 afe或 wakenet 识别读取的内容
WakeNet 包含于AFE(前端算法中)
识别输入要求:16 kHz、单声道(mono)、16 bit 音频
命令词识别 打开电灯 关闭电灯 (参skainet
灵敏度
// #define CONFIG_SR_MN_CN_MULTINET2_SINGLE_RECOGNITION
#include "esp_log.h"
#include <stdio.h>
#include <freertos/FreeRTOS.h>
#include <driver/i2s_std.h>
#include <stdio.h>
#include <stdlib.h>
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "model_path.h"
#include "string.h"
#include "hiesp.h"
#include "hilexin.h"
//multinet
#include "esp_mn_iface.h"
#include "esp_mn_models.h"
#include "esp_mn_speech_commands.h"
#include "esp_process_sdkconfig.h"
#include "alexa.h"
#include "dl_lib_convq_queue.h"
#include "da_kai_kong_tiao.h"
#include "tell_me_a_joke.h"
// Log tag for this example.
static const char *TAG = "EXAMPLE-VAD";
// I2S DMA configuration: number of descriptors and frames per descriptor.
#define AUDIO_CODEC_DMA_DESC_NUM 6
#define AUDIO_CODEC_DMA_FRAME_NUM 240
#define AUDIO_CODEC_DEFAULT_MIC_GAIN 30.0
// INMP441 with L/R pin tied low transmits in the left slot.
i2s_std_slot_mask_t mic_slot_mask = I2S_STD_SLOT_LEFT;
//-------------------i2s pin mapping (alternate pins in trailing comments)
#define MIC_BCLK GPIO_NUM_5 //22
#define MIC_WS GPIO_NUM_4 // 23
#define MIC_DIN GPIO_NUM_6//26
// RX channel is used for the mic; tx_chan is declared but never opened here.
i2s_chan_handle_t rx_chan,tx_chan;
//uint32_t rbuf[ 16000*30/1000 * sizeof(short)];
//---------------------
#define CODEC_ADC_I2S_PORT 0
#define VAD_SAMPLE_RATE_HZ 16000
#define VAD_FRAME_LENGTH_MS 30
#define VAD_BUFFER_LENGTH (VAD_FRAME_LENGTH_MS * VAD_SAMPLE_RATE_HZ / 1000)
// 16-bit PCM chunk shared by WakeNet and MultiNet (512 samples @ 16 kHz).
int16_t buffer[512];
// Raw 32-bit I2S samples straight from the DMA.
int32_t b32[512];
// WakeNet interface and model instance (wake-word detection).
esp_wn_iface_t *wakenet;
model_iface_data_t *model_data;
// MultiNet model instance (command-word recognition).
model_iface_data_t *mn_model_data;
// Older static binding of the MultiNet interface, kept for reference;
// the interface is now resolved at runtime via esp_mn_handle_from_name().
// #define MULTINET_COEFF "COEFF_NULL"
// #define MULTINET_COEFF get_coeff_multinet2_ch
//static const esp_mn_iface_t *multinet = &MULTINET_MODEL;
// static const esp_mn_iface_t *multinet ;//= &MULTINET_MODEL;
esp_mn_iface_t *multinet;
/**
 * Capture loop: reads raw I2S audio, converts it to saturated 16-bit PCM,
 * then runs wake-word (WakeNet) and command-word (MultiNet) detection on
 * each chunk. Runs forever; `arg` is unused.
 *
 * NOTE(review): MultiNet is fed every frame, even before the wake word
 * fires; typical pipelines only run MultiNet after WAKENET_DETECTED —
 * confirm this is intentional.
 */
static void task(void *arg)
{
    (void)arg;
    size_t bytes_read = 0;
    printf("in task");
    while (1) {
        // Read one full DMA chunk of raw 32-bit samples.
        if (i2s_channel_read(rx_chan, b32, sizeof(b32), &bytes_read, portMAX_DELAY) != ESP_OK) {
            ESP_LOGE("read", "Read Failed!");
            continue;  // nothing valid in b32 — skip this iteration
        }
        // Only convert the samples the driver actually delivered.
        size_t samples = bytes_read / sizeof(int32_t);
        for (size_t i = 0; i < samples; i++) {
            // INMP441 data is left-aligned in the 32-bit slot; >>12 scales it
            // into (roughly) 16-bit range with some gain, then saturate.
            int32_t value = b32[i] >> 12;
            buffer[i] = (value > INT16_MAX) ? INT16_MAX
                      : (value < -INT16_MAX) ? -INT16_MAX
                      : (int16_t)value;
        }
        wakenet_state_t state = wakenet->detect(model_data, buffer);
        if (state == WAKENET_DETECTED) {
            printf("Detected\n");
        }
        //---------------multinet-------------------
        esp_mn_state_t mn_state = multinet->detect(mn_model_data, buffer);
        if (mn_state == ESP_MN_STATE_DETECTED) {
            esp_mn_results_t *mn_result = multinet->get_results(mn_model_data);
            if (mn_result->num > 0) {
                printf("detected: command id:%d, string:%s\n",
                       mn_result->command_id[0], mn_result->string);
            } else {
                printf("timeout\n");
            }
        }
    }
}
/**
 * Example 2 entry point: configure I2S for an INMP441 mic, load the
 * "hilexin" WakeNet model plus a Chinese MultiNet command model from the
 * "model" partition, then start the combined detection task.
 */
void app_main(void)
{
    //-----------------I2S init (RX only)-----------
    i2s_chan_config_t rx_chan_cfg = {
        .id = I2S_NUM_0,
        .role = I2S_ROLE_MASTER,
        .dma_desc_num = AUDIO_CODEC_DMA_DESC_NUM,
        .dma_frame_num = AUDIO_CODEC_DMA_FRAME_NUM,
        .auto_clear_after_cb = true,
        .auto_clear_before_cb = false,
        .intr_priority = 0,
    };
    ESP_ERROR_CHECK(i2s_new_channel(&rx_chan_cfg, NULL, &rx_chan));
    i2s_std_config_t rx_cfg = {
        .clk_cfg = {
            .sample_rate_hz = 16000,                 // WakeNet/MultiNet expect 16 kHz
            .clk_src = I2S_CLK_SRC_DEFAULT,
            .mclk_multiple = I2S_MCLK_MULTIPLE_256,
        },
        .slot_cfg = {
            .data_bit_width = I2S_DATA_BIT_WIDTH_32BIT,  // INMP441: 24 valid bits in a 32-bit slot
            .slot_bit_width = I2S_SLOT_BIT_WIDTH_AUTO,
            .slot_mode = I2S_SLOT_MODE_MONO,
            .slot_mask = mic_slot_mask,              // left slot (L/R pin low)
            .ws_width = I2S_DATA_BIT_WIDTH_32BIT,
            .ws_pol = false,
            .bit_shift = true,                       // Philips timing: data lags WS by one BCLK
#ifdef I2S_HW_VERSION_2
            .left_align = true,
            .big_endian = false,
            .bit_order_lsb = false
#endif
        },
        .gpio_cfg = {
            .mclk = I2S_GPIO_UNUSED,
            .dout = I2S_GPIO_UNUSED,
            .bclk = MIC_BCLK,
            .ws = MIC_WS,
            .din = MIC_DIN,
            .invert_flags = {
                false, false, false,
            },
        },
    };
    ESP_ERROR_CHECK(i2s_channel_init_std_mode(rx_chan, &rx_cfg));
    ESP_ERROR_CHECK(i2s_channel_enable(rx_chan));
    //---------------wake word setup------------------------
    vTaskDelay(500 / portTICK_PERIOD_MS);
    srmodel_list_t *models = esp_srmodel_init("model");
    if (models == NULL) {
        ESP_LOGE(TAG, "no SR models found; check the 'model' partition");
        return;
    }
    char *model_name = esp_srmodel_filter(models, ESP_WN_PREFIX, "hilexin");
    if (model_name == NULL) {  // strstr() below would be UB on NULL
        ESP_LOGE(TAG, "WakeNet model 'hilexin' not found in partition");
        return;
    }
    wakenet = (esp_wn_iface_t *)esp_wn_handle_from_name(model_name);
    model_data = wakenet->create(model_name, DET_MODE_95);
    int frequency = wakenet->get_samp_rate(model_data);
    int audio_chunksize = wakenet->get_samp_chunksize(model_data);
    ESP_LOGI(TAG, "wakenet sample rate: %d Hz, chunksize: %d samples", frequency, audio_chunksize);
    wakenet->set_det_threshold(model_data, 0.8, 1);
    // NOTE(review): reset_det_threshold() restores defaults, which appears to
    // undo the set_det_threshold() call just above — confirm intent.
    wakenet->reset_det_threshold(model_data);
    char *wake_words = esp_srmodel_get_wake_words(models, model_name);
    if (wake_words != NULL) {
        printf("wake words: %s\n", wake_words);
    }
    // Embedded reference clips; only their size is reported here.
    unsigned char *data = NULL;
    size_t data_size = 0;
    if (strstr(model_name, "hiesp") != NULL) {
        data = (unsigned char *)hiesp;
        data_size = sizeof(hiesp);
        printf("wake word: %s, size:%u\n", "hiesp", (unsigned)data_size);
    } else if (strstr(model_name, "hilexin") != NULL) {
        data = (unsigned char *)hilexin;
        data_size = sizeof(hilexin);
        printf("wake word: %s, size:%u\n", "hilexin", (unsigned)data_size);
    }
    //------------------------multinet setup-------------------
    char *mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_CHINESE);
    if (mn_name == NULL) {
        ESP_LOGE(TAG, "no Chinese MultiNet model found in partition");
        return;
    }
    multinet = esp_mn_handle_from_name(mn_name);
    // NOTE(review): 500000 ms (~8.3 min) command timeout; the official
    // examples typically use ~6000 ms — confirm this value is intentional.
    mn_model_data = multinet->create(mn_name, 500000);
    frequency = multinet->get_samp_rate(mn_model_data);
    audio_chunksize = multinet->get_samp_chunksize(mn_model_data);
    char *lang = multinet->get_language(mn_model_data);
    // Register command phrases defined via menuconfig (sdkconfig).
    esp_mn_commands_update_from_sdkconfig(multinet, mn_model_data);
    data = NULL;
    data_size = 0;
    if (strcmp(lang, ESP_MN_ENGLISH) == 0) {
        data = (unsigned char *)tell_me_a_joke;
        data_size = sizeof(tell_me_a_joke);
        printf("commands: tell me a joke, size:%u\n", (unsigned)data_size);
    } else if (strcmp(lang, ESP_MN_CHINESE) == 0) {
        data = (unsigned char *)da_kai_kong_tiao;
        data_size = sizeof(da_kai_kong_tiao);
        printf("commands: da kai kong tiao, size:%u\n", (unsigned)data_size);
    }
    multinet->print_active_speech_commands(mn_model_data);
    (void)data;  // clip contents are not fed to the detector in this example
    printf("chunksize = %d", audio_chunksize);
    xTaskCreate(task, "task", 4096, NULL, 5, NULL);
    // Keep app_main alive; detection runs in the spawned task.
    while (1) {
        printf(".");
        vTaskDelay(50 / portTICK_PERIOD_MS);
    }
}
加载模型的过程 flash 函数 加载
# The following lines of boilerplate have to be in your project's
# CMakeLists in this exact order for cmake to work correctly
cmake_minimum_required(VERSION 3.5)
include($ENV{ADF_PATH}/CMakeLists.txt)
include($ENV{IDF_PATH}/tools/cmake/project.cmake)
add_compile_options (-fdiagnostics-color=always)
project(example_wwe)
# This is a cmake function, which is used to flash the bin file to the specified partition
# Append an extra binary image to the flasher-args of `target_name`, so that
# `idf.py flash` (and the generated flasher_args.json) also programs `image`
# at the partition offset `offset` under the entry name `image_name`.
function(esptool_py_flash_customize_image target_name image_name offset image)
# Paths in flasher_args.json are relative to the build directory.
idf_build_get_property(build_dir BUILD_DIR)
file(RELATIVE_PATH image ${build_dir} ${image})
# FLASH_FILE / FLASH_ENTRY / IMAGES are the properties esptool_py reads when
# it serializes the flashing manifest.
set_property(TARGET ${target_name} APPEND PROPERTY FLASH_FILE
"\"${offset}\" : \"${image}\"")
set_property(TARGET ${target_name} APPEND PROPERTY FLASH_ENTRY
"\"${image_name}\" : { \"offset\" : \"${offset}\", \"file\" : \"${image}\" }")
set_property(TARGET ${target_name} APPEND PROPERTY IMAGES "${offset} ${image}")
# When flash encryption (development mode) is on, mirror the entry onto the
# `encrypted-<target>` flashing target as well.
if(CONFIG_SECURE_FLASH_ENCRYPTION_MODE_DEVELOPMENT)
set_property(TARGET encrypted-${target_name} APPEND PROPERTY FLASH_FILE
"\"${offset}\" : \"${image}\"")
set_property(TARGET encrypted-${target_name} APPEND PROPERTY FLASH_ENTRY
"\"${image_name}\" : { \"offset\" : \"${offset}\", \"file\" : \"${image}\" }")
set_property(TARGET encrypted-${target_name} APPEND PROPERTY IMAGES "${offset} ${image}")
endif()
endfunction()
# Flash the custom partition named `flash_tone`.
set(partition flash_tone)
idf_build_get_property(project_dir PROJECT_DIR)
# Pre-built audio/tone binary shipped with the project sources.
set(image_file ${project_dir}/tone/audio_tone.bin)
# Look up the partition's offset from the partition table at configure time.
partition_table_get_partition_info(offset "--partition-name ${partition}" "offset")
# Register the image with the main `flash` target (see function above).
esptool_py_flash_customize_image(flash "${partition}" "${offset}" "${image_file}")
以乐鑫语音开发框架为例,系统了解嵌入式设备的语音唤醒和语音识别-RoboticsCV