H5 Speech Recognition (Web Speech API + iFlytek)

Screenshot of the H5 speech recognition demo (image omitted):

Option 1: Web Speech API (free; implemented in Chromium-based browsers such as Edge and Chrome, not in IE. Note that Chrome delegates recognition to Google's speech servers, so it fails where those servers are unreachable, e.g. in mainland China)

<template>
  <div class="voice-container">
    <h2 style="margin-bottom: 20px">Option 1: Web Speech API</h2>
    <p>Status: {{ status }}</p>
    <button @click="toggleRecording">Start</button>
    <button class="recording" @click="toggleRecording">Stop</button>
    <p class="error" v-if="error">{{ error }}</p>
    <div class="result">
      <p>Recognition result:</p>
      <div class="output">{{ finalTranscript }}</div>
      <div class="interim">{{ interimTranscript }}</div>
    </div>
  </div>
</template>

<script setup>
import { ref, onMounted, onBeforeUnmount } from "vue";

const isSupported = ref(false);
const isRecording = ref(false);
const finalTranscript = ref("");
const interimTranscript = ref("");
const status = ref("Waiting...");
const error = ref(null);
let recognition = null;

// Initialize the speech recognizer (Chrome/Edge expose it as webkitSpeechRecognition)
const initializeRecognition = () => {
  const SpeechRecognition =
    window.SpeechRecognition || window.webkitSpeechRecognition;

  if (!SpeechRecognition) {
    isSupported.value = false;
    return;
  }

  isSupported.value = true;
  recognition = new SpeechRecognition();

  // Configure: keep listening, emit interim results, recognize Mandarin (zh-CN)
  recognition.continuous = true;
  recognition.interimResults = true;
  recognition.lang = "zh-CN";

  // Event handlers
  recognition.onstart = () => {
    isRecording.value = true;
    status.value = "Recording...";
  };

  recognition.onend = () => {
    isRecording.value = false;
    status.value = "Stopped";
  };

  recognition.onresult = (event) => {
    interimTranscript.value = "";
    // Walk only the results added since the last event: append final
    // segments to the transcript, buffer interim ones for live display
    for (let i = event.resultIndex; i < event.results.length; i++) {
      const transcript = event.results[i][0].transcript;
      if (event.results[i].isFinal) {
        finalTranscript.value += transcript + " ";
      } else {
        interimTranscript.value += transcript;
      }
    }
  };

  recognition.onerror = (event) => {
    error.value = `Recognition error: ${event.error}`;
    isRecording.value = false;
  };
};

// Start/stop recording
const toggleRecording = () => {
  if (!recognition) return; // recognition unsupported in this browser
  if (!isRecording.value) {
    finalTranscript.value = "";
    interimTranscript.value = "";
    recognition.start();
  } else {
    recognition.stop();
  }
};

// Lifecycle: set up on mount, stop recognition on teardown
onMounted(() => {
  if (typeof window !== "undefined") {
    initializeRecognition();
  }
});

onBeforeUnmount(() => {
  if (recognition) {
    recognition.stop();
  }
});
</script>

<style scoped>
.voice-container {
  max-width: 600px;
  margin: 20px auto;
  padding: 20px;
}

button {
  padding: 8px 15px;
  background: linear-gradient(135deg, #6253e1, #04befe);
  color: white;
  border: none;
  border-radius: 4px;
  cursor: pointer;
  transition: background 0.3s;
}

button:hover {
  background: #66b1ff;
}

button.recording {
  background: #f56c6c;
  margin-left: 20px;
}

.result {
  margin-top: 20px;
  padding: 15px;
  border: 1px solid #eee;
  border-radius: 4px;
}

.interim {
  color: #666;
  margin-top: 10px;
  min-height: 20px;
}

.error {
  color: #f56c6c;
  margin: 10px 0;
}
</style>
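One caveat: in Chromium, SpeechRecognition streams the microphone audio to a server-side recognizer, and microphone access (like getUserMedia) only works in a secure context, i.e. on https:// pages or on localhost. A minimal pre-flight check you could run before showing the recording UI (the helper name canUseSpeechRecognition is ours, not part of any API):

// Hypothetical helper: true when this page can attempt Web Speech recognition.
// window.isSecureContext is true on https:// pages and on localhost.
function canUseSpeechRecognition() {
  const SpeechRecognition =
    window.SpeechRecognition || window.webkitSpeechRecognition;
  return Boolean(SpeechRecognition) && window.isSecureContext;
}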

Option 2: iFlytek (paid, but each account gets 500 free trial calls per day). Besides the component below, this option needs an appId / API key / API secret from the iFlytek console, the crypto-js package for request signing, and a transcoding Web Worker (a sketch of transcode.worker.js is given after IatRecorder.js below).

<template>
  <div class="conter">
    <h2 style="margin: 40px 0 20px 0">Option 2: iFlytek</h2>
    <button @click="translationStart">Start</button>
    <button class="recording" @click="translationEnd">Stop</button>
    <div class="result">
      <p>Recognition result:</p>
      <div class="interim">{{ searchData }}</div>
    </div>
  </div>
</template>

<script setup>
import { ref } from "vue";
import IatRecorder from "../utils/lat_xunfei/IatRecorder";

const searchData = ref("");
const iatRecorder = new IatRecorder({
  language: "zh_cn",
  accent: "mandarin",
  appId: "f70c6429", // must match the account that owns the API key/secret in IatRecorder.js
  domain: "iat",
});
// Register the callback once, before recording starts, so interim results
// stream into the UI as they arrive; registering it only inside the stop
// handler would surface nothing until the final result.
iatRecorder.onTextChange = (text) => {
  // The recognizer appends trailing punctuation (usually "。") to the final
  // result; strip it before display.
  searchData.value = text.replace(/。$/, "");
};
const translationStart = () => {
  iatRecorder.start();
};
const translationEnd = () => {
  iatRecorder.stop();
};
</script>

<style scoped>
.conter {
  padding: 0 20px;
}
button {
  padding: 8px 15px;
  background: linear-gradient(135deg, #6253e1, #04befe);
  color: white;
  border: none;
  border-radius: 4px;
  cursor: pointer;
  transition: background 0.3s;
}

button.recording {
  background: #f56c6c;
  margin-left: 20px;
}
.result {
  margin-top: 20px;
  padding: 15px;
  border: 1px solid #eee;
  border-radius: 4px;
}

.interim {
  color: #666;
  margin-top: 10px;
  min-height: 20px;
}
</style>
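Unlike Option 1, nothing here stops the recorder if the user leaves the page mid-recording. A minimal cleanup sketch that could be added to the script block above (same iatRecorder instance; the import goes alongside the existing ref import):

import { onBeforeUnmount } from "vue";

// Release the microphone and let IatRecorder close its WebSocket
// when the component is destroyed.
onBeforeUnmount(() => {
  iatRecorder.stop();
});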

utils/lat_xunfei/IatRecorder.js


// Credentials from the iFlytek console (console.xfyun.cn); replace these with your own
const APPID = "f70c6429";
const API_SECRET = "YjBiOTU2YTU4YzEzZWQ3MWZlNGNkM2I3";
const API_KEY = "beba0ca58e09b5a244858b5d0dab8b8b";
import CryptoJS from "crypto-js";

// Web Worker that downsamples the raw microphone audio to 16 kHz 16-bit PCM
// (see the transcode.worker.js sketch at the end of this article)
const transWorker = new Worker(
  new URL("./transcode.worker.js", import.meta.url)
);

var startTime = "";
var endTime = "";

function getWebSocketUrl() {
  return new Promise((resolve) => {
    // The endpoint varies by language; this is the Chinese/Mandarin one
    var url = "wss://iat-api.xfyun.cn/v2/iat";
    var host = "iat-api.xfyun.cn";
    var apiKey = API_KEY;
    var apiSecret = API_SECRET;
    // RFC 1123 date, e.g. "Mon, 01 Jan 2024 00:00:00 GMT" (toGMTString is deprecated)
    var date = new Date().toUTCString();
    var algorithm = "hmac-sha256";
    var headers = "host date request-line";
    // Sign "host + date + request-line" with the API secret, then base64-encode it
    var signatureOrigin = `host: ${host}\ndate: ${date}\nGET /v2/iat HTTP/1.1`;
    var signatureSha = CryptoJS.HmacSHA256(signatureOrigin, apiSecret);
    var signature = CryptoJS.enc.Base64.stringify(signatureSha);
    var authorizationOrigin = `api_key="${apiKey}", algorithm="${algorithm}", headers="${headers}", signature="${signature}"`;
    var authorization = btoa(authorizationOrigin);
    url = `${url}?authorization=${authorization}&date=${date}&host=${host}`;
    resolve(url);
  });
}
const IatRecorder = class {
  constructor({ language, accent, appId } = {}) {
    let self = this;
    this.status = "null";
    this.language = language || "zh_cn";
    this.accent = accent || "mandarin";
    this.appId = appId || APPID;
    // Buffered audio bytes waiting to be sent to the server
    this.audioData = [];
    // Final transcription text
    this.resultText = "";
    // With wpgs (dynamic correction) enabled, interim results need a separate buffer
    this.resultTextTemp = "";
    // The worker posts back transcoded 16-bit PCM bytes; queue them for sending
    transWorker.onmessage = function (event) {
      self.audioData.push(...event.data);
    };
  }

  // Update the recorder status and notify any listener
  setStatus(status) {
    this.onWillStatusChange &&
      this.status !== status &&
      this.onWillStatusChange(this.status, status);
    this.status = status;
  }
  setResultText({ resultText, resultTextTemp } = {}) {
    this.onTextChange && this.onTextChange(resultTextTemp || resultText || "");
    resultText !== undefined && (this.resultText = resultText);
    resultTextTemp !== undefined && (this.resultTextTemp = resultTextTemp);
  }
  // Update recognition parameters
  setParams({ language, accent } = {}) {
    language && (this.language = language);
    accent && (this.accent = accent);
  }
  // Open the WebSocket connection
  connectWebSocket() {
    return getWebSocketUrl().then((url) => {
      let iatWS;
      if ("WebSocket" in window) {
        iatWS = new WebSocket(url);
      } else if ("MozWebSocket" in window) {
        iatWS = new window.MozWebSocket(url);
      } else {
        alert("This browser does not support WebSocket");
        return;
      }
      this.webSocket = iatWS;
      this.setStatus("init");
      iatWS.onopen = (e) => {
        this.setStatus("ing");
        // Wait briefly so the worker has buffered some audio before the first frame
        setTimeout(() => {
          this.webSocketSend();
        }, 500);
      };
      iatWS.onmessage = (e) => {
        this.result(e.data);
      };
      iatWS.onerror = (e) => {
        this.recorderStop();
      };
      iatWS.onclose = (e) => {
        endTime = Date.parse(new Date());
        console.log("session duration (ms):", endTime - startTime);
        this.recorderStop();
      };
    });
  }
  // Set up microphone capture in the browser
  recorderInit() {
    // Legacy getUserMedia fallbacks for older browsers
    navigator.getUserMedia =
      navigator.getUserMedia ||
      navigator.webkitGetUserMedia ||
      navigator.mozGetUserMedia ||
      navigator.msGetUserMedia;

    // Create the audio context
    try {
      this.audioContext = new (window.AudioContext ||
        window.webkitAudioContext)();
      this.audioContext.resume();
      if (!this.audioContext) {
        alert("This browser does not support the Web Audio API");
        return;
      }
    } catch (e) {
      if (!this.audioContext) {
        alert("This browser does not support the Web Audio API");
        return;
      }
    }

    // Request microphone permission
    if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
      navigator.mediaDevices
        .getUserMedia({
          audio: true,
          video: false,
        })
        .then((stream) => {
          getMediaSuccess(stream);
        })
        .catch((e) => {
          getMediaFail(e);
        });
    } else if (navigator.getUserMedia) {
      navigator.getUserMedia(
        {
          audio: true,
          video: false,
        },
        (stream) => {
          getMediaSuccess(stream);
        },
        function (e) {
          getMediaFail(e);
        }
      );
    } else {
      if (
        navigator.userAgent.toLowerCase().match(/chrome/) &&
        location.origin.indexOf("https://") < 0
      ) {
        alert(
          "For security reasons, Chrome only grants microphone access on localhost, 127.0.0.1, or https pages"
        );
      } else {
        alert("Cannot access the microphone; please upgrade your browser or use Chrome");
      }
      this.audioContext && this.audioContext.close();
      return;
    }
    // Called once microphone permission is granted
    let getMediaSuccess = (stream) => {
      // ScriptProcessorNode lets us read raw PCM from JS (deprecated in favor
      // of AudioWorklet, but still widely supported)
      this.scriptProcessor = this.audioContext.createScriptProcessor(0, 1, 1);
      this.scriptProcessor.onaudioprocess = (e) => {
        // While recording, hand each buffer to the worker for transcoding
        if (this.status === "ing") {
          transWorker.postMessage(e.inputBuffer.getChannelData(0));
        }
      };
      // Wrap the MediaStream so its audio can be routed through the audio graph
      this.mediaSource = this.audioContext.createMediaStreamSource(stream);
      // mic -> script processor -> destination
      this.mediaSource.connect(this.scriptProcessor);
      this.scriptProcessor.connect(this.audioContext.destination);
      this.connectWebSocket();
    };

    let getMediaFail = (e) => {
      this.audioContext && this.audioContext.close();
      this.audioContext = undefined;
      // Close the WebSocket if it is already open
      if (this.webSocket && this.webSocket.readyState === 1) {
        this.webSocket.close();
      }
    };
  }
  recorderStart() {
    if (!this.audioContext) {
      this.recorderInit();
    } else {
      this.audioContext.resume();
      this.connectWebSocket();
    }
  }
  // Stop recording (suspend the audio context)
  recorderStop() {
    // In Safari, resuming after a suspend yields silent audio, so skip suspend there
    if (
      !(
        /Safari/.test(navigator.userAgent) && !/Chrome/.test(navigator.userAgent)
      )
    ) {
      this.audioContext && this.audioContext.suspend();
    }
    this.setStatus("end");
  }
  // Base64-encode a chunk of transcoded audio bytes
  toBase64(buffer) {
    var binary = "";
    var bytes = new Uint8Array(buffer);
    var len = bytes.byteLength;
    for (var i = 0; i < len; i++) {
      binary += String.fromCharCode(bytes[i]);
    }
    return window.btoa(binary);
  }
  // Stream audio frames to the WebSocket (first / intermediate / last frame protocol)
  webSocketSend() {
    if (this.webSocket.readyState !== 1) {
      return;
    }
    // iFlytek expects roughly 1280 bytes of 16 kHz 16-bit PCM every 40 ms
    let audioData = this.audioData.splice(0, 1280);
    var params = {
      common: {
        app_id: this.appId,
      },
      business: {
        language: this.language, // additional languages can be enabled for trial in the iFlytek console
        domain: "iat",
        accent: this.accent, // Chinese dialects can be enabled for trial in the iFlytek console
      },
      data: {
        status: 0,
        format: "audio/L16;rate=16000",
        encoding: "raw",
        audio: this.toBase64(audioData),
      },
    };
    // First frame: status 0 carries the session parameters plus the first audio chunk
    this.webSocket.send(JSON.stringify(params));
    startTime = Date.parse(new Date());
    this.handlerInterval = setInterval(() => {
      // Stop when the WebSocket is no longer open
      if (this.webSocket.readyState !== 1) {
        console.log("WebSocket not connected");
        this.audioData = [];
        clearInterval(this.handlerInterval);
        return;
      }
      if (this.audioData.length === 0) {
        // Buffer drained: once stop has been requested, send the final frame (status 2)
        if (this.status === "end") {
          this.webSocket.send(
            JSON.stringify({
              data: {
                status: 2,
                format: "audio/L16;rate=16000",
                encoding: "raw",
                audio: "",
              },
            })
          );
          this.audioData = [];
          clearInterval(this.handlerInterval);
        }
        return false;
      }
      audioData = this.audioData.splice(0, 1280);
      // Intermediate frames: status 1 with the next chunk of audio
      this.webSocket.send(
        JSON.stringify({
          data: {
            status: 1,
            format: "audio/L16;rate=16000",
            encoding: "raw",
            audio: this.toBase64(audioData),
          },
        })
      );
    }, 40);
  }
  // Handle a recognition message from the server
  result(resultData) {
    let jsonData = JSON.parse(resultData);
    if (jsonData.data && jsonData.data.result) {
      let data = jsonData.data.result;
      let str = "";
      let ws = data.ws;
      // Concatenate the words of this result segment
      for (let i = 0; i < ws.length; i++) {
        str = str + ws[i].cw[0].w;
      }
      console.log("recognized segment:", str);
      // data.pgs is present when wpgs (dynamic correction) is enabled in the console:
      // "apd" appends this segment to the final result; "rpl" replaces the part of
      // the previous result indicated by the rg field
      if (data.pgs) {
        if (data.pgs === "apd") {
          // Promote the interim buffer to the final result
          this.setResultText({
            resultText: this.resultTextTemp,
          });
        }
        // Keep the latest segment in the interim buffer
        this.setResultText({
          resultTextTemp: this.resultText + str,
        });
      } else {
        this.setResultText({
          resultText: this.resultText + str,
        });
      }
    }
    // status 2 marks the end of this recognition session
    if (jsonData.code === 0 && jsonData.data && jsonData.data.status === 2) {
      this.webSocket.close();
    }
    if (jsonData.code !== 0) {
      this.webSocket.close();
      console.log(`recognition error ${jsonData.code}: ${jsonData.message}`);
    }
    }
  }
  start() {
    this.recorderStart();
    this.setResultText({ resultText: "", resultTextTemp: "" });
  }
  stop() {
    this.recorderStop();
  }
};

export default IatRecorder;
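IatRecorder.js depends on utils/lat_xunfei/transcode.worker.js, which the original post does not show. Its job is to turn the Float32 samples coming off the ScriptProcessorNode into 16 kHz, 16-bit little-endian PCM bytes (the format webSocketSend slices into 1280-byte frames). The sketch below is modeled on the worker shipped with iFlytek's official web demo; verify it against your own copy, and note that the hard-coded 44100 assumes the browser captures at 44.1 kHz (use audioContext.sampleRate if yours differs).

utils/lat_xunfei/transcode.worker.js (sketch)

self.onmessage = function (e) {
  transAudioData.transcode(e.data);
};

let transAudioData = {
  transcode(audioData) {
    let output = transAudioData.to16kHz(audioData); // resample to 16 kHz
    output = transAudioData.to16BitPCM(output); // quantize to 16-bit PCM
    // Post plain byte values back; IatRecorder pushes them into this.audioData
    self.postMessage(Array.from(new Uint8Array(output.buffer)));
  },
  // Linear interpolation from the capture rate (assumed 44.1 kHz) down to 16 kHz
  to16kHz(audioData) {
    var data = new Float32Array(audioData);
    var fitCount = Math.round(data.length * (16000 / 44100));
    var newData = new Float32Array(fitCount);
    var springFactor = (data.length - 1) / (fitCount - 1);
    newData[0] = data[0];
    for (let i = 1; i < fitCount - 1; i++) {
      var tmp = i * springFactor;
      var before = Math.floor(tmp);
      var after = Math.ceil(tmp);
      var atPoint = tmp - before;
      newData[i] = data[before] + (data[after] - data[before]) * atPoint;
    }
    newData[fitCount - 1] = data[data.length - 1];
    return newData;
  },
  // Clamp each float sample to [-1, 1] and store it as a little-endian Int16
  to16BitPCM(input) {
    var dataBuffer = new ArrayBuffer(input.length * 2);
    var dataView = new DataView(dataBuffer);
    for (var i = 0, offset = 0; i < input.length; i++, offset += 2) {
      var s = Math.max(-1, Math.min(1, input[i]));
      dataView.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true);
    }
    return dataView;
  },
};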
