Real-Time Speech Recognition with Alibaba Tongyi Tingwu (通义听悟)

Official documentation

This article uses Python for the backend and React for the frontend to integrate with Alibaba's Tongyi Tingwu service. Straight to the code.

Backend API

The backend here is implemented in Python and is only a minimal example; for other languages or the full parameter reference, please consult the official documentation.

#!/usr/bin/env python
# coding=utf-8

import json
import datetime
from aliyunsdkcore.client import AcsClient
from aliyunsdkcore.request import CommonRequest
from aliyunsdkcore.auth.credentials import AccessKeyCredential
from flask import Flask, abort
from flask_cors import CORS

app = Flask(__name__)
CORS(app)

APP_KEY = 'APP_KEY'
ACCESS_ID = 'ACCESS_ID'
ACCESS_SECRET = 'ACCESS_SECRET'


def create_common_request(domain, version, protocolType, method, uri):
    curr_request = CommonRequest()
    curr_request.set_accept_format('json')
    curr_request.set_domain(domain)
    curr_request.set_version(version)
    curr_request.set_protocol_type(protocolType)
    curr_request.set_method(method)
    curr_request.set_uri_pattern(uri)
    curr_request.add_header('Content-Type', 'application/json')
    return curr_request


def init_parameters():
    body = dict()
    body['AppKey'] = APP_KEY

    # Basic request parameters
    input = dict()

    # The format and sample rate of the input audio stream must match the settings below
    input['Format'] = 'pcm'
    input['SampleRate'] = 16000
    input['SourceLanguage'] = 'cn'
    input['TaskKey'] = 'task' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    input['ProgressiveCallbacksEnabled'] = False
    body['Input'] = input

    # AI-related parameters; enable them as needed
    parameters = dict()

    # Speech transcription controls
    transcription = dict()
    # Speaker diarization (role separation): optional
    transcription['DiarizationEnabled'] = True
    diarization = dict()
    diarization['SpeakerCount'] = 2
    transcription['Diarization'] = diarization
    parameters['Transcription'] = transcription

    # Text translation controls: optional
    parameters['TranslationEnabled'] = True
    translation = dict()
    translation['TargetLanguages'] = ['en']  # e.g. translate into English
    parameters['Translation'] = translation

    # Chapter overview: optional; includes chapter titles and agenda summaries
    parameters['AutoChaptersEnabled'] = True

    # Meeting assistance (smart minutes): optional; includes action items and key information (keywords, highlights, scene recognition)
    parameters['MeetingAssistanceEnabled'] = True
    meetingAssistance = dict()
    meetingAssistance['Types'] = ['Actions', 'KeyInformation']
    parameters['MeetingAssistance'] = meetingAssistance

    # Summarization controls: optional; includes full-text summary, per-speaker summary, Q&A summary and mind map
    parameters['SummarizationEnabled'] = True
    summarization = dict()
    summarization['Types'] = ['Paragraph', 'Conversational', 'QuestionsAnswering', 'MindMap']
    parameters['Summarization'] = summarization

    # PPT extraction and PPT summarization: optional
    parameters['PptExtractionEnabled'] = True

    # Spoken-to-written text polishing: optional
    parameters['TextPolishEnabled'] = True

    body['Parameters'] = parameters
    return body


@app.route('/createVoiceWsInfo', methods=['POST'])
def create_voice_ws_info():
    body = init_parameters()
    print(body)

    credentials = AccessKeyCredential(ACCESS_ID, ACCESS_SECRET)
    client = AcsClient(region_id='cn-beijing', credential=credentials)

    curr_request = create_common_request('tingwu.cn-beijing.aliyuncs.com', '2023-09-30', 'https', 'PUT',
                                         '/openapi/tingwu/v2/tasks')
    curr_request.add_query_param('type', 'realtime')

    curr_request.set_content(json.dumps(body).encode('utf-8'))
    response = client.do_action_with_exception(curr_request)
    res_json = json.dumps(json.loads(response), indent=4, ensure_ascii=False)
    return res_json


@app.route('/getVoiceTask/<string:task_id>', methods=['GET'])
def get_voice_task_info(task_id):
    if not task_id:
        abort(400)  # Return 400 if task_id is empty

    credentials = AccessKeyCredential(ACCESS_ID, ACCESS_SECRET)
    client = AcsClient(region_id='cn-beijing', credential=credentials)

    uri = '/openapi/tingwu/v2/tasks' + '/' + task_id
    request = create_common_request('tingwu.cn-beijing.aliyuncs.com', '2023-09-30', 'https', 'GET', uri)

    response = client.do_action_with_exception(request)
    res_json = json.dumps(json.loads(response), indent=4, ensure_ascii=False)
    return res_json


@app.route('/closeVoiceTask/<string:task_id>', methods=['PUT'])
def close_voice_task_info(task_id):
    if not task_id:
        abort(400)  # Return 400 if task_id is empty

    credentials = AccessKeyCredential(ACCESS_ID, ACCESS_SECRET)
    client = AcsClient(region_id='cn-beijing', credential=credentials)

    request = create_common_request('tingwu.cn-beijing.aliyuncs.com', '2023-09-30', 'https', 'PUT',
                                    '/openapi/tingwu/v2/tasks')
    request.add_query_param('type', 'realtime')
    request.add_query_param('operation', 'stop')

    body = dict()
    body['AppKey'] = APP_KEY

    # Basic request parameters
    input = dict()

    # Specify the realtime task to stop
    input['TaskId'] = task_id
    body['Input'] = input

    request.set_content(json.dumps(body).encode('utf-8'))
    response = client.do_action_with_exception(request)
    res_json = json.dumps(json.loads(response), indent=4, ensure_ascii=False)
    return res_json


if __name__ == "__main__":
    app.run(debug=True, host='0.0.0.0')
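
For reference, the frontend code below only reads a few fields from the JSON returned by the create endpoint. A partial TypeScript shape for that payload, derived from the frontend code rather than from the full Tingwu response, looks roughly like this:

interface CreateTaskResponse {
  Code: string;      // "0" on success
  Message?: string;  // error message when Code is not "0"
  Data?: {
    TaskId: string;         // realtime task id, used later to stop the task
    MeetingJoinUrl: string; // WebSocket URL the browser connects to
  };
}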

Frontend code
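
The VoiceWebSocket class below imports createVoiceWsInfo and closeTask from "@/app/client/voiceApi", a module the article does not include. The following is a minimal sketch of it, assuming the Flask backend above is reachable at http://localhost:5000 (the base URL is an assumption; adjust it to your deployment).

voiceApi.ts

// voiceApi.ts - hypothetical sketch of the API wrapper used by VoiceWebSocket.ts
const BASE_URL = "http://localhost:5000"; // assumption: address of the Flask backend

// Ask the backend to create a realtime Tingwu task; callers read response.json()
export function createVoiceWsInfo(): Promise<Response> {
  return fetch(`${BASE_URL}/createVoiceWsInfo`, { method: "POST" });
}

// Ask the backend to stop the realtime Tingwu task with the given id
export function closeTask(taskId: string | undefined): Promise<Response> {
  return fetch(`${BASE_URL}/closeVoiceTask/${taskId}`, { method: "PUT" });
}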

VoiceWebSocket.ts

import { createVoiceWsInfo, closeTask } from "@/app/client/voiceApi";
import { v4 as uuidv4 } from "uuid";
import VoiceRecorder from "@/app/utils/VoiceRecorder";

class VoiceWebSocket {
  public wsUrl: string | undefined;
  public taskId: string | undefined;
  public socket: WebSocket | undefined;
  public socketStatus: string;
  private voice: VoiceRecorder | undefined;
  private setFlag: ((flag: boolean) => void) | undefined;
  private callBack: ((result: string) => void) | undefined;
  private currentResult: string[] = [];

  constructor() {
    this.socketStatus = "init";
  }

  async init(): Promise<void> {
    return new Promise((resolve, reject) => {
      createVoiceWsInfo()
        .then((response) => {
          if (response.ok) {
            return response.json();
          }
          throw new Error("Failed to fetch WebSocket info");
        })
        .then((data) => {
          if ("0" === data.Code) {
            const { Data = {} } = data;
            const { MeetingJoinUrl = "", TaskId = "" } = Data;
            this.wsUrl = MeetingJoinUrl;
            this.taskId = TaskId;
            this.socketStatus = "ing";
            resolve();
          } else {
            reject(data.Message);
          }
        })
        .catch(reject);
    });
  }

  async initSocket(): Promise<void> {
    return new Promise((resolve, reject) => {
      if (this.wsUrl && this.taskId) {
        this.socket = new WebSocket(this.wsUrl);
        this.socket.binaryType = "blob";

        this.socket.onopen = (event) => {
          console.log("WebSocket connection open ", event);
          const message = {
            header: {
              message_id: uuidv4(),
              name: "StartTranscription",
              namespace: "SpeechTranscriber",
              task_id: this.taskId,
            },
            payload: {},
            context: {},
          };
          // Serialize the start message to JSON and send it
          this.socket?.send(JSON.stringify(message));
        };

        this.socket.onmessage = (event) => {
          this.onReceiveSocketData(event);
        };

        this.socket.onclose = (event) => {
          this.voice && this.voice.stopRecording();
          if (this.socketStatus === "ing") {
            closeTask(this.taskId);
          }
          this.closeSocket(event);
          this.setFlag && this.setFlag(false);
        };

        resolve();
      } else {
        reject("WebSocket initialization failed: wsUrl or taskId is missing");
      }
    });
  }

  public setVoice(voice: VoiceRecorder) {
    this.voice = voice;
  }

  public setStopFlagFun(setFlag: (flag: boolean) => void) {
    this.setFlag = setFlag;
  }

  sendSocketData(data: string | ArrayBuffer | Blob | ArrayBufferView) {
    this.socket?.send(data);
  }

  onReceiveSocketData(event: MessageEvent) {
    console.log("receive socket data: ", event);
    if ("string" === typeof event.data) {
      let parsed: any = {};
      try {
        parsed = JSON.parse(event.data);
      } catch {
        parsed = {};
      }
      this.handleMessage(parsed);
    }
  }

  handleMessage(data: any = {}) {
    const { header = {}, payload = {} } = data;
    if ("SentenceEnd" === header.name) {
      // A sentence has been finalized: append it and report the joined text
      this.currentResult.push(payload.result);
      this.callBack && this.callBack(this.currentResult.join(""));
    } else if ("TranscriptionResultChanged" === header.name) {
      // Intermediate recognition result: replace the buffer and report it
      this.currentResult = [payload.result];
      this.callBack && this.callBack(this.currentResult.join(""));
    }
  }

  closeSocket(event: CloseEvent) {
    console.log("WebSocket connection closed: ", event);
  }

  stopSocket() {
    this.socket?.close();
  }

  setCallback(callback: ((result: string) => void) | undefined): void {
    this.callBack = callback;
  }
}

export default VoiceWebSocket;
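
For orientation, a transcription message received by onReceiveSocketData looks roughly like the following. This is an illustrative sketch only: the code above relies solely on header.name and payload.result, and the remaining fields of the real Tingwu/NLS messages are not shown here.

// Illustrative incoming message (sketch; only header.name and payload.result
// are actually consumed by handleMessage above)
const exampleMessage = {
  header: {
    namespace: "SpeechTranscriber",
    name: "TranscriptionResultChanged", // or "SentenceEnd" when a sentence is final
    task_id: "<task id returned by createVoiceWsInfo>",
  },
  payload: {
    result: "partial or finalized sentence text",
  },
};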

VoiceRecorder.ts

class AudioRecorder {
  socket: WebSocket;
  sampleRate: number;
  stream!: MediaStream;
  source: MediaStreamAudioSourceNode | null = null;
  processor: ScriptProcessorNode | null | undefined;
  isRecording: boolean = false;
  audioChunks: Blob[] = [];

  constructor(socket: WebSocket, sampleRate: number = 16000) {
    this.socket = socket;
    this.sampleRate = sampleRate; // sample rate (Hz)
  }

  async startMicrophone(): Promise<void> {
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      this.stream = stream;
      const audioContext = new window.AudioContext({
        sampleRate: this.sampleRate,
      });

      // Create a source node from the microphone stream
      const source = audioContext.createMediaStreamSource(stream);
      this.source = source;
      // Create a ScriptProcessorNode to process the raw audio data
      const processor = audioContext.createScriptProcessor(4096, 1, 1); // mono in/out
      this.processor = processor;

      source.connect(processor);
      processor.connect(audioContext.destination); // connect to the destination (optional)

      processor.onaudioprocess = (event) => {
        // Convert the Float32 samples to 16-bit PCM and stream them over the socket
        const arrayBuffer = this.floatTo16BitPCM(
          event.inputBuffer.getChannelData(0),
        );
        // Only send while the WebSocket connection is open
        if (this.socket.readyState === WebSocket.OPEN) {
          this.socket.send(arrayBuffer);
        }
      };
    } catch (err) {
      console.error("Error accessing media devices:", err);
    }
  }

  // Convert Float32 samples in [-1, 1] to 16-bit little-endian PCM
  floatTo16BitPCM = function (samples: Float32Array) {
    const view = new DataView(new ArrayBuffer(2 * samples.length));
    for (let i = 0; i < samples.length; i++) {
      // Negative values scale by 32768, positive values by 32767
      const scale = samples[i] < 0 ? 32768 : 32767;
      view.setInt16(2 * i, (samples[i] * scale) | 0, true); // little-endian
    }
    return view.buffer;
  };

  startRecording(): void {
    this.isRecording = true;
    this.audioChunks = []; // Reset chunks for new recording
  }

  stopRecording(): void {
    if (this.isRecording) {
      this.isRecording = false;
    }
    if (this.source) {
      this.source.disconnect();
    }
    if (this.processor) {
      this.processor.disconnect();
    }
    if (this.stream) {
      this.stream.getTracks().forEach((item) => {
        item.stop();
      });
    }
  }
}

export default AudioRecorder;

VoiceComponent.tsx

"use client";

import VoiceWebSocket from "../utils/VoiceWebSocket";
import AudioRecorder from "../utils/VoiceRecorder";
import React, {useState} from "react";


const VoiceComponent = () => {
    const [isRecording, setIsRecording] = useState(false);
    const [voiceHandle, setVoiceHandle] = useState<AudioRecorder | null>(null);
    const [socket, setSocket] = useState<VoiceWebSocket | null>(null);

    const [inputVal, setInputVal] = useState('')

    const startListening = async () => {
        if (!isRecording) {
            const ws = new VoiceWebSocket();
            ws.init()
                .then((res) => {
                    console.log("socket init", res);
                    ws.initSocket()
                        .then((res) => {
                            console.log("web socket init", res);
                            setSocket(ws);
                            setIsRecording(true);
                        })
                        .then(() => {
                            if (ws.socket) {
                                // Hook the recorder up to the now-open WebSocket
                                const voice = new AudioRecorder(ws.socket);
                                setVoiceHandle(voice);
                                voice.startMicrophone().then(() => {
                                    voice.startRecording();
                                });
                                ws.setVoice(voice);
                                ws.setCallback(setInputVal);
                                ws.setStopFlagFun(setIsRecording);
                            }
                        })
                        .catch((err) => {
                            alert(err);
                        });
                })
                .catch((err) => {
                    alert(err);
                });
        } else {
            voiceHandle?.stopRecording();
            // Closing the socket triggers onclose, which stops the Tingwu task
            socket?.stopSocket();
            setIsRecording(false);
        }
    };



    return (
        <div>
            <button onClick={startListening} title={'Listen'}>Listen</button>
            <div style={{marginTop: 10}}>
                <textarea cols={30} rows={8} value={inputVal} readOnly={true}/>
            </div>
        </div>
    )


}


export default VoiceComponent;

Clicking the Listen button starts streaming audio to the backend in real time; the recognized text is written into the textarea as it arrives.
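
If you are using the Next.js App Router (suggested by the "@/app/..." import paths), the component can be rendered from a page roughly like this. The file path and import path below are assumptions; adjust them to wherever VoiceComponent actually lives in your project.

// app/voice/page.tsx (hypothetical location)
import VoiceComponent from "@/app/components/VoiceComponent";

export default function VoicePage() {
  // VoiceComponent is a client component ("use client"), so the browser-only
  // WebSocket and getUserMedia APIs it uses are available when it renders.
  return <VoiceComponent />;
}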

Result

Other issues:

Modern browsers only allow microphone access in a secure context (HTTPS or localhost). If the microphone cannot be opened over plain HTTP, either serve the page over HTTPS or start Chrome with the --unsafely-treat-insecure-origin-as-secure flag for your origin. For details, see the CSDN post:

新版chrome 解决在http协议下无法调用摄像头和麦克风的问题(不安全)_--unsafely-treat-insecure-origin-as-secure-CSDN博客
