【已读乱回的智能桌面助手】-CSDN博客

本文链接：https://blog.csdn.net/weixin_63186694/article/details/140557649

前言

本项目中的“语音接入大模型对话”部分，是对博主 2345VOR 的【ESP32S3 接入MiniMax文本语音大模型对话&语音克隆教程】的一次复现，代码基本没有改动。废话不多说，直接上代码：

#include <Arduino.h>
#include "base64.h"
#include <WiFi.h>
#include "HTTPClient.h"
#include "cJSON.h"
#include <I2S.h>
#include <ArduinoJson.h>
#include <SoftwareSerial.h>
#include "UTF8ToGB2312.h"
// Pins handed to the software UART (myPort) that drives the TTS module.
#define MYPORT_TX 1
#define MYPORT_RX 2

EspSoftwareSerial::UART myPort;

// Pre-built TTS frame sent once from setup():
//   0xFD, 16-bit length (0x0006), command 0x01, parameter 0x01, then the ASCII text "[v1]".
// voicedata[7] may be set to 0x31 ~ 0x39 (presumably a volume level -- confirm against the module manual).
uint8_t voicedata[] = { 0xFD, 0x00, 0x06, 0x01, 0x01, 0x5B, 0x76, 0x31, 0x5D };
// #define data_len 16000
 #define key 3             // port 0
// #define ADC 2             // port 39
// #define led 15            // port 2

// HTTP client used for the speech-recognition request in loop().
HTTPClient http_client;
// 1. Replace with your network credentials
const char *ssid = "your";
const char *password = "your";
// 2. Check your audio port
// NOTE(review): buttonPin has the same value as MYPORT_TX; it is only referenced from commented-out code here.
const int buttonPin = 1;  // the number of the pushbutton pin
const int ledPin = 21;    // the number of the LED pin
hw_timer_t *timer = NULL;
// Number of 16-bit samples captured per recording.
const int adc_data_len = 8000 * 3;
// Size in bytes of the JSON request buffer; must hold the Base64 text of the samples plus the JSON wrapper.
const int data_json_len = adc_data_len * 6;
// Sample buffer and JSON buffer; both are allocated with ps_malloc() in setup().
uint16_t *adc_data;
char *data_json;
// uint16_t adc_data[data_len];    // 16000 samples at an 8K sampling rate = 2 s of recording; enlarge this array for longer recognition
// char data_json[json_len];          // holds the JSON-formatted data; keep it generous: encoding grows the data to 4/3 of its size, so size it carefully to avoid overruns
// ...together with the size of the data_json array; make both larger.

// Both flags are written by the timer ISR (onTimer) and polled from loop(), so they
// must be volatile: otherwise the compiler is allowed to cache them in a register
// and the wait loop may never observe the ISR's update.
volatile uint8_t adc_start_flag = 0;     // set to 1 to start capturing
volatile uint8_t adc_complete_flag = 0;  // set to 1 by the ISR when the buffer is full


// 3. Replace with your MiniMax API key.
// SECURITY: a real key must never be committed or published with the source; the value
// below is a placeholder, like ssid/password above. Fill it in locally before flashing.
const char *apiKey = "YOUR_MINIMAX_API_KEY";
// 4. Replace with your Baidu speech-recognition access token (placeholder, same rule as above).
String token = "YOUR_BAIDU_ACCESS_TOKEN";
// HTTP client used for the MiniMax chat-completion request.
HTTPClient http;
// Value of the Authorization header sent to MiniMax.
String token_key = String("Bearer ") + apiKey;
// Send request to MiniMax API
String inputText = "你好，minimax！";
String apiUrl = "https://api.minimax.chat/v1/text/chatcompletion_v2";
int httpResponseCode;
String response, question, answer;
// Shared document used to parse the JSON responses of both web services.
DynamicJsonDocument jsonDoc(1024);

// Write index into adc_data for the recording in progress.
uint32_t num = 0;
// Spinlock guarding the body of the timer ISR.
portMUX_TYPE timerMux = portMUX_INITIALIZER_UNLOCKED;

// Timer interrupt handler: while a recording is active, stores one I2S sample per
// invocation into adc_data; once adc_data_len samples have been stored it raises
// adc_complete_flag, clears adc_start_flag and rewinds the write index.
void IRAM_ATTR onTimer() {
  portENTER_CRITICAL_ISR(&timerMux);

  const bool recording = (adc_start_flag == 1);
  if (recording) {
    // adc_data[num] = analogRead(ADC);
    adc_data[num] = I2S.read();
    ++num;

    const bool bufferFull = !(num < adc_data_len);
    if (bufferFull) {
      adc_complete_flag = 1;
      adc_start_flag = 0;
      num = 0;
    }
  }

  portEXIT_CRITICAL_ISR(&timerMux);
}

/**
 * Sends `inputText` as the user message of a MiniMax chat-completion request and
 * returns the text of the first choice.
 *
 * The request body is serialized with ArduinoJson so that quotes, backslashes and
 * control characters inside `inputText` are escaped; concatenating the raw text
 * into a JSON literal would produce an invalid request for such input.
 *
 * @param inputText  Text of the user's question (UTF-8).
 * @return The model's reply, or the literal "<error>" when the HTTP status is not
 *         200 or the response cannot be parsed / has no reply text. In both failure
 *         cases an error prompt is also spoken through speech().
 */
String getGPTAnswer(String inputText) {
  // Capacity: fixed structure plus a private copy of the (variable-length) user text.
  DynamicJsonDocument requestDoc(1024 + inputText.length());
  requestDoc["model"] = "abab5.5s-chat";
  JsonArray messages = requestDoc.createNestedArray("messages");
  JsonObject systemMessage = messages.createNestedObject();
  systemMessage["role"] = "system";
  systemMessage["content"] = "你是升哥的智能桌面助手，要求下面的回答严格控制在32字符以内。";
  JsonObject userMessage = messages.createNestedObject();
  userMessage["role"] = "user";
  userMessage["content"] = inputText;

  String payload;
  serializeJson(requestDoc, payload);

  http.begin(apiUrl);
  http.addHeader("Content-Type", "application/json");
  http.addHeader("Authorization", token_key);
  httpResponseCode = http.POST(payload);

  if (httpResponseCode != 200) {
    http.end();
    Serial.printf("Error %i \n", httpResponseCode);
    speech("语言大模型故障，请检查api是否失效");
    return "<error>";
  }

  response = http.getString();
  http.end();
  Serial.println(response);

  // Parse the JSON response; a parse failure (e.g. document too small) or a missing
  // reply must not be turned into the string "null" and read aloud.
  DeserializationError parseError = deserializeJson(jsonDoc, response);
  const char *content = jsonDoc["choices"][0]["message"]["content"].as<const char *>();
  if (parseError || content == nullptr) {
    Serial.printf("Failed to read MiniMax reply: %s\n", parseError.c_str());
    speech("语言大模型故障，请检查api是否失效");
    return "<error>";
  }
  return String(content);
}
/**
 * Speaks `data` through the serial TTS module attached to myPort.
 *
 * The UTF-8 input is converted to GB2312 and sent in one or more frames laid out
 * like the known-good `voicedata` frame in this file:
 *
 *   0xFD | length high | length low | command 0x01 | parameter 0x01 | text bytes
 *
 * where the 16-bit length counts the command byte, the parameter byte and the text
 * (voicedata: length 0x0006 for the 4 text bytes "[v1]") and no checksum follows.
 * The previous implementation put only the text length into the header, so the
 * module would be told to expect two bytes fewer than were sent, and it appended
 * an XOR byte that the voicedata frame does not have.
 *
 * NOTE(review): the frame layout is inferred from `voicedata`; confirm it against
 * the datasheet of the TTS module actually fitted.
 *
 * @param data  Text to speak (UTF-8). An empty string sends nothing.
 */
void speech(String data) {
    // Convert the UTF-8 input to GB2312 for the TTS module.
    String gb2312_str = GB.get(data);

    // Maximum number of text bytes per frame.
    const size_t max_data_len = 4000;
    const size_t total_len = gb2312_str.length();

    size_t start = 0;
    while (start < total_len) {
        // Grow the chunk one character at a time so a double-byte character
        // (lead byte >= 0x80) is never split across two frames.
        size_t end = start;
        while (end < total_len) {
            const size_t char_len = (static_cast<uint8_t>(gb2312_str[end]) >= 0x80) ? 2 : 1;
            if ((end - start) + char_len > max_data_len) {
                break;
            }
            end += char_len;
        }
        if (end > total_len) {
            end = total_len;  // dangling lead byte at the very end of the string
        }

        const size_t text_len = end - start;
        const size_t frame_len = text_len + 2;  // command + parameter + text

        const uint8_t header[5] = {
            0xFD,                                    // start byte
            static_cast<uint8_t>(frame_len >> 8),    // length, high byte
            static_cast<uint8_t>(frame_len & 0xFF),  // length, low byte
            0x01,                                    // command byte
            0x01                                     // parameter byte
        };

        // Stream header and text straight to the software serial port; no
        // frame-sized temporary buffer is needed on the stack.
        for (size_t i = 0; i < sizeof(header); i++) {
            myPort.write(header[i]);
        }
        for (size_t i = start; i < end; i++) {
            myPort.write(static_cast<uint8_t>(gb2312_str[i]));
        }

        // Continue with the next chunk.
        start = end;
        // Give the module time to speak this chunk; the factor may need tuning.
        delay(text_len * 100);
    }
}
void setup() {

  //Serial.begin(921600);
  Serial.begin(115200);
  adc_data = (uint16_t *)ps_malloc(adc_data_len * sizeof(uint16_t));  //ps_malloc 指使用片外PSRAM内存
  if (!adc_data) {
    Serial.println("Failed to allocate memory for adc_data");
  }

  data_json = (char *)ps_malloc(data_json_len * sizeof(char));  // 根据需要调整大小
  if (!data_json) {
    Serial.println("Failed to allocate memory for data_json");
  }
  myPort.begin(115200, SWSERIAL_8N1, MYPORT_RX, MYPORT_TX, false);
  delay(1000);
  if (!myPort) {  // If the object did not initialize, then its configuration is invalid
    Serial.println("Invalid EspSoftwareSerial pin configuration, check config");
    while (1) {  // Don't continue with invalid configuration
      delay(1000);
    }
  }
  speech("系统开机");
  delay(1500);
  for (int i = 0; i < sizeof(voicedata) / sizeof(voicedata[0]); i++) {
    myPort.write(voicedata[i]);
  }
   delay(15000);
  speech("主人，小助手检查您久坐时间过长，注意休息");
  // pinMode(ADC, ANALOG);
  // pinMode(buttonPin, INPUT_PULLUP);
  pinMode(ledPin, OUTPUT);
  // start I2S at 16 kHz with 16-bits per sample
  I2S.setAllPins(-1, 42, 41, -1, -1);
  if (!I2S.begin(PDM_MONO_MODE, 16000, 16)) {
    Serial.println("Failed to initialize I2S!");
    while (1)
      ;  // do nothing
  }
  uint8_t count = 0;
  WiFi.mode(WIFI_STA);
  WiFi.begin(ssid, password);
  while (WiFi.status() != WL_CONNECTED) {
    Serial.print(".");
    count++;
    if (count >= 75) {
      Serial.printf("\r\n-- wifi connect fail! --");
      break;
    }
    vTaskDelay(200);
  }
  Serial.printf("\r\n-- wifi connect success! --\r\n");
  Serial.println(WiFi.localIP());
  http.setTimeout(4000);
  http_client.setTimeout(4000);
  // gain_token();

  timer = timerBegin(0, 80, true);    //  80M的时钟 80分频 1M
  timerAlarmWrite(timer, 125, true);  //  1M  计125个数进中断  8K
  timerAttachInterrupt(timer, &onTimer, true);
  timerAlarmEnable(timer);
  timerStop(timer);  //先暂停
}


uint32_t time1, time2;
void loop() {

  if (Serial.available() > 0)  //按键按下
  {
    if (Serial.read() == '1') {
      Serial.printf("Start recognition\r\n");
      digitalWrite(ledPin, HIGH);
      adc_start_flag = 1;
      timerStart(timer);

      // time1=micros();
      while (!adc_complete_flag)  //等待采集完成
      {
        ets_delay_us(10);
      }
      // time2=micros()-time1;

      timerStop(timer);
      adc_complete_flag = 0;  //清标志
      digitalWrite(ledPin, LOW);
      // memset(data_json, '\0', strlen(data_json));  //将数组清空
      memset(data_json, '\0', data_json_len * sizeof(char));
      strcat(data_json, "{");
      strcat(data_json, "\"format\":\"pcm\",");
      strcat(data_json, "\"rate\":16000,");
      strcat(data_json, "\"dev_pid\":1537,");
      strcat(data_json, "\"channel\":1,");
      strcat(data_json, "\"cuid\":\"666666\",");
      strcat(data_json, "\"token\":\"");
      strcat(data_json, token.c_str());
      strcat(data_json, "\",");
      sprintf(data_json + strlen(data_json), "\"len\":%d,", adc_data_len * 2);
      strcat(data_json, "\"speech\":\"");
      strcat(data_json, base64::encode((uint8_t *)adc_data, adc_data_len * sizeof(uint16_t)).c_str());
      strcat(data_json, "\"");
      strcat(data_json, "}");
      // Serial.println(data_json);
      int httpCode;
      http_client.begin("http://vop.baidu.com/server_api");  //https://vop.baidu.com/pro_api
      http_client.addHeader("Content-Type", "application/json");
      httpCode = http_client.POST(data_json);

      if (httpCode == 200) {
        if (httpCode == HTTP_CODE_OK) {
          response = http_client.getString();
          http_client.end();
          Serial.print(response);
          // Parse JSON response
          // DynamicJsonDocument jsonDoc(512);
          deserializeJson(jsonDoc, response);
          String question = jsonDoc["result"][0];
          // 访问"result"数组，并获取其第一个元
          // 输出结果
          Serial.println("Input:" + question);
          answer = getGPTAnswer(question);
          speech(answer);
          Serial.println("Answer: " + answer);
          // Serial.println("Enter a prompt:");

        } else {
          Serial.printf("[HTTP] GET... failed, error: %s\n", http_client.errorToString(httpCode).c_str());
          speech("语音识别在线故障，请检查api是否失效");
        }
      }
      // while (!digitalRead(buttonPin))
      //   ;
      Serial.println("Recognition complete\r\n");
    }
  }
  vTaskDelay(1);
}

解析

这段代码是一个基于Arduino平台的项目，它使用了多种库来实现语音识别和文本到语音(TTS)的功能。代码中包含了网络连接、HTTP请求、JSON处理、I2S音频输入、定时器中断、软件串口通信等技术。下面是对代码的详细解析：

1.
包含的库:
Arduino.h: Arduino核心库，用于基本的输入输出操作。
base64.h: 用于处理Base64编码。
WiFi.h: 用于连接到WiFi网络。
HTTPClient.h: 用于发起HTTP请求。
cJSON.h: 用于处理JSON数据。
I2S.h: 用于I2S音频接口。
ArduinoJson.h: 用于处理JSON数据。
SoftwareSerial.h: 用于软件串口通信。
UTF8ToGB2312.h: 用于将UTF-8编码转换为GB2312编码。
2.
定义和初始化:
定义了多个宏和变量，包括串口引脚、I2S引脚、WiFi凭据、API密钥、令牌等。
初始化了HTTPClient对象用于发起HTTP请求。
初始化了I2S对象用于音频数据的采集。
初始化了SoftwareSerial对象用于与TTS模块通信。
初始化了timer对象用于定时器中断。
3.
setup()函数:
初始化串口通信。
分配内存给adc_data和data_json数组。
初始化SoftwareSerial对象。
初始化I2S接口。
连接到WiFi网络。
设置定时器中断，用于控制音频数据的采集。
4.
loop()函数:
检查串口是否有数据输入，如果检测到特定字符（‘1’），则开始语音识别流程。
启动定时器中断，开始采集音频数据。
等待音频数据采集完成。
停止定时器中断。
将采集到的音频数据编码为Base64格式，并构建JSON格式的请求数据。
发送HTTP请求到百度语音识别API。
解析响应数据，提取识别结果。
使用MiniMax API获取回答。
将回答通过TTS模块转换为语音输出。
5.
getGPTAnswer()函数:
发送HTTP请求到MiniMax API，获取MiniMax大语言模型（abab5.5s-chat）的回答。
解析JSON响应并返回回答文本。
6.
speech()函数:
将输入的UTF-8字符串转换为GB2312编码。
根据TTS模块的限制，将文本分割成多个段落。
构造头部信息和校验和。
通过软件串口发送数据到TTS模块。
7.
onTimer()函数:
定时器中断服务程序，用于控制音频数据的采集。
整体来看，这段代码是一个完整的语音对话系统：它通过WiFi连接到网络，先用百度的语音识别API把录音转换为文本，再把文本发送给MiniMax大语言模型获取回答，最后由串口TTS模块把回答转换为语音播放。代码中还包含了基本的错误处理和状态提示，以帮助定位运行中的故障。