sherpa-onnx可进行语音通话、人声识别等。
首先是安装
网址:https://k2-fsa.github.io/sherpa/onnx/install/index.html
我的电脑是windows64,所以选择了如下图
按照如下图所示拉取代码并编译,这里注意!!需要vs2019及以上版本,vs2017及以下都会出错,我在这里浪费了很长时间emmm
接下来是使用
最后会在bin/release下生成很多可执行文件,每一个的功能有所不同,比如我想要选择离线的语音识别,可以选择sherpa-onnx-microphone-offline.exe(以下都以这个相关模型为例)。需要先下载预训练模型:https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html#sherpa-onnx-pre-trained-models
选择想要下载的离线模型,其中提供的都是linux的命令,可以直接复制网址到浏览器下载,然后解压,将前面那个.exe复制到解压得到的文件夹里,然后在此文件夹处打开终端,输入
sherpa-onnx-microphone-offline --tokens=./tokens.txt --paraformer=./model.int8.onnx
然后运行,发现可能有乱码,可输入命令
CHCP 65001
来解决此问题。
接着就可以根据提示来使用此语音识别模型。
与外部通信
比如我此处使用的是sherpa-onnx-microphone-offline.exe,所以在sherpa-onnx-microphone-offline.cc文件中去更改代码,使用UDP通信,此处可能会存在乱码的问题,之后到应用阶段如果遇到的话再想办法解决。
// sherpa-onnx/csrc/sherpa-onnx-microphone-offline.cc
//
// Copyright (c) 2022-2023 Xiaomi Corporation
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <atomic>
#include <cctype>  // std::tolower
#include <cstdint>
#include <iostream>
#include <mutex>  // NOLINT
#include <string>
#include <thread>  // NOLINT
#include <vector>

// Windows-only headers, used to send data over UDP (Winsock).
//#include <winsock.h>
#include <WinSock2.h>
#pragma comment(lib, "ws2_32.lib")
#include <io.h>
#include <process.h>

#include "portaudio.h"  // NOLINT
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/microphone.h"
#include "sherpa-onnx/csrc/offline-recognizer.h"
// Phases of the push-to-talk state machine driven by the Enter key.
enum class State {
  kIdle,       // waiting for the user to start a recording
  kRecording,  // microphone samples are being collected for the next decode
  kDecoding,   // a finished recording is waiting to be (or is being) decoded
};

// Current phase. Written by the key-press thread and by the decoding loop in
// main(), so it is atomic to avoid a data race between those threads.
std::atomic<State> state{State::kIdle};

// true to stop the program and exit. Set from the SIGINT handler and read by
// the key-press thread, the audio callback and main(), hence atomic.
std::atomic<bool> stop{false};

// Samples appended by the audio callback; guarded by samples_mutex.
std::vector<float> samples;
std::mutex samples_mutex;
// Reads stdin and advances the recording state machine on every Enter press.
//
//   kIdle      -> kRecording : previously captured samples are discarded
//   kRecording -> kDecoding  : the captured samples are left for the decoder
//   kDecoding                : Enter is ignored
//
// Any key other than Enter is ignored. The function returns once `stop` is
// set (checked after each key is read) or when stdin reaches end-of-file.
static void DetectKeyPress() {
  SHERPA_ONNX_LOGE("Press Enter to start");

  int32_t key = 0;
  // getchar() returns EOF, which is non-zero, once stdin is closed. Testing
  // the result only for truthiness would keep spinning forever in that case,
  // so compare against EOF explicitly.
  while (!stop && (key = getchar()) != EOF) {
    if (key != '\n') {
      continue;
    }

    switch (state) {
      case State::kIdle:
        SHERPA_ONNX_LOGE("Start recording. Press Enter to stop recording");
        state = State::kRecording;
        {
          // Drop whatever the callback captured while idle.
          std::lock_guard<std::mutex> lock(samples_mutex);
          samples.clear();
        }
        break;
      case State::kRecording:
        SHERPA_ONNX_LOGE("Stop recording. Decoding ...");
        state = State::kDecoding;
        break;
      case State::kDecoding:
        break;
    }
  }
}
// Stream callback handed to Pa_OpenStream().
//
// Treats input_buffer as frames_per_buffer float samples and appends them to
// the global `samples` vector while holding samples_mutex. Returns paComplete
// once `stop` is set, paContinue otherwise. The remaining arguments are
// unused.
static int32_t RecordCallback(const void *input_buffer,
                              void * /*output_buffer*/,
                              unsigned long frames_per_buffer,  // NOLINT
                              const PaStreamCallbackTimeInfo * /*time_info*/,
                              PaStreamCallbackFlags /*status_flags*/,
                              void *user_data) {
  std::lock_guard<std::mutex> guard(samples_mutex);

  const float *first = static_cast<const float *>(input_buffer);
  const float *last = first + frames_per_buffer;
  samples.insert(samples.end(), first, last);

  if (stop) {
    return paComplete;
  }
  return paContinue;
}
// Signal handler installed for SIGINT: raises the global `stop` flag and
// writes a hint to stderr. The signal number is not used.
static void Handler(int32_t sig) {
  stop = true;
  fputs("\nCaught Ctrl + C. Press Enter to exit\n", stderr);
}
int32_t main(int32_t argc, char *argv[]) {
signal(SIGINT, Handler);
const char *kUsageMessage = R"usage(
This program uses non-streaming models with microphone for speech recognition.
Usage:
(1) Transducer from icefall
./bin/sherpa-onnx-microphone-offline \
--tokens=/path/to/tokens.txt \
--encoder=/path/to/encoder.onnx \
--decoder=/path/to/decoder.onnx \
--joiner=/path/to/joiner.onnx \
--num-threads=2 \
--decoding-method=greedy_search
(2) Paraformer from FunASR
./bin/sherpa-onnx-microphone-offline \
--tokens=/path/to/tokens.txt \
--paraformer=/path/to/model.onnx \
--num-threads=1
(3) Whisper models
./bin/sherpa-onnx-microphone-offline \
--whisper-encoder=./sherpa-onnx-whisper-base.en/base.en-encoder.int8.onnx \
--whisper-decoder=./sherpa-onnx-whisper-base.en/base.en-decoder.int8.onnx \
--tokens=./sherpa-onnx-whisper-base.en/base.en-tokens.txt \
--num-threads=1
Please refer to
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
for a list of pre-trained models to download.
)usage";
sherpa_onnx::ParseOptions po(kUsageMessage);
sherpa_onnx::OfflineRecognizerConfig config;
config.Register(&po);
po.Read(argc, argv);
if (po.NumArgs() != 0) {
po.PrintUsage();
exit(EXIT_FAILURE);
}
fprintf(stderr, "%s\n", config.ToString().c_str());
if (!config.Validate()) {
fprintf(stderr, "Errors in config!\n");
return -1;
}
SHERPA_ONNX_LOGE("Creating recognizer ...");
sherpa_onnx::OfflineRecognizer recognizer(config);
SHERPA_ONNX_LOGE("Recognizer created!");
sherpa_onnx::Microphone mic;
PaDeviceIndex num_devices = Pa_GetDeviceCount();
fprintf(stderr, "Num devices: %d\n", num_devices);
int32_t device_index = Pa_GetDefaultInputDevice();
if (device_index == paNoDevice) {
fprintf(stderr, "No default input device found\n");
fprintf(stderr, "If you are using Linux, please switch to \n");
fprintf(stderr, " ./bin/sherpa-onnx-alsa-offline \n");
exit(EXIT_FAILURE);
}
const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE");
if (pDeviceIndex) {
fprintf(stderr, "Use specified device: %s\n", pDeviceIndex);
device_index = atoi(pDeviceIndex);
}
for (int32_t i = 0; i != num_devices; ++i) {
const PaDeviceInfo *info = Pa_GetDeviceInfo(i);
fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i,
info->name);
}
PaStreamParameters param;
param.device = device_index;
fprintf(stderr, "Use device: %d\n", param.device);
const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device);
fprintf(stderr, " Name: %s\n", info->name);
fprintf(stderr, " Max input channels: %d\n", info->maxInputChannels);
param.channelCount = 1;
param.sampleFormat = paFloat32;
param.suggestedLatency = info->defaultLowInputLatency;
param.hostApiSpecificStreamInfo = nullptr;
float mic_sample_rate = 16000;
const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE");
if (pSampleRateStr) {
fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
mic_sample_rate = atof(pSampleRateStr);
}
float sample_rate = 16000;
PaStream *stream;
PaError err =
Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */
mic_sample_rate,
0, // frames per buffer
paClipOff, // we won't output out of range samples
// so don't bother clipping them
RecordCallback, nullptr);
if (err != paNoError) {
fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
exit(EXIT_FAILURE);
}
err = Pa_StartStream(stream);
fprintf(stderr, "Started\n");
if (err != paNoError) {
fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
exit(EXIT_FAILURE);
}
WSADATA data;
int ret = WSAStartup(MAKEWORD(2, 2), &data);
// 使用socket()函数获取一个socket文件描述符
SOCKET sockfd = socket(AF_INET, SOCK_DGRAM, 0);
// 准备接收方的地址和端口,'172.20.10.6'表示目的ip地址,8080表示目的端口号
struct sockaddr_in sock_addr = {0};
sock_addr.sin_family = AF_INET; // 设置地址族为IPv4
sock_addr.sin_port = htons(8080); // 设置地址的端口号信息
sock_addr.sin_addr.S_un.S_addr = inet_addr("172.20.10.6"); // 设置IP地址
ret = connect(sockfd, (struct sockaddr *)&sock_addr, sizeof(sock_addr));
// 后续根据项目需要,将这一部分代码改成了AAA-------------------------
std::thread t(DetectKeyPress);
while (!stop) {
switch (state) {
case State::kIdle:
break;
case State::kRecording:
break;
case State::kDecoding: {
std::vector<float> buf;
{
std::lock_guard<std::mutex> lock(samples_mutex);
buf = std::move(samples);
}
auto s = recognizer.CreateStream();
s->AcceptWaveform(mic_sample_rate, buf.data(), buf.size());
recognizer.DecodeStream(s.get());
SHERPA_ONNX_LOGE("Decoding Done! Result is:");
SHERPA_ONNX_LOGE("%s", s->GetResult().text.c_str());
// 发送数据到指定的ip和端口
char sendbuf[10];
strncpy(sendbuf, s->GetResult().text.c_str(),
strlen(s->GetResult().text.c_str()));
ret = send(sockfd, sendbuf, strlen(sendbuf), 0);
state = State::kIdle;
SHERPA_ONNX_LOGE("Press Enter to start");
break;
}
}
Pa_Sleep(20); // sleep for 20ms
}
t.join();
//------------------------------------------------------------------
//AAA----------------------------------------------------------------
while(true) {// 持续输出说话内容
std::vector<float> buf;
{
std::lock_guard<std::mutex> lock(samples_mutex);
buf = std::move(samples);
}
auto s = recognizer.CreateStream();
s->AcceptWaveform(mic_sample_rate, buf.data(), buf.size());
recognizer.DecodeStream(s.get());
SHERPA_ONNX_LOGE("Result is:");
SHERPA_ONNX_LOGE("%s", s->GetResult().text.c_str());
// 发送数据到指定的ip和端口
char sendbuf[10];
sendbuf[0] = '0';
char *cText;
strncpy(cText, s->GetResult().text.c_str(),
strlen(s->GetResult().text.c_str()));
char *c1 = {"前进"};
char *c2 = {"后退"};
char *c3 = {"确认"};
char *c4 = {"取消"};
char *c5 = {"关闭窗口"};
// SHERPA_ONNX_LOGE("%s", cText);
if (strstr(cText, c1)) {
sendbuf[0] = '1';
} else if (strstr(cText, c2)) {
sendbuf[0] = '2';
} else if (strstr(cText, c3)) {
sendbuf[0] = '3';
} else if (strstr(cText, c4)) {
sendbuf[0] = '4';
} else if (strstr(cText, c5)) {
sendbuf[0] = '5';
}
ret = send(sockfd, sendbuf, strlen(sendbuf), 0);
Pa_Sleep(3000);//每三秒输出所说话
//-----------------------------------------------------
// 关闭套接字
closesocket(sockfd);
WSACleanup();
err = Pa_CloseStream(stream);
if (err != paNoError) {
fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
exit(EXIT_FAILURE);
}
return 0;
}