1. Introduction
Real-time speech transcription here builds on the project jianchang512/stt: Voice Recognition to Text Tool / an offline, locally running speech-recognition-to-text service that outputs JSON, SRT subtitles with timestamps, and plain text (github.com). On top of that project, improvements were made to achieve real-time transcription.
For installation, see the project's readme.md.
2. Code
Without further ado, here is the code.
(1) Server code:
import os
import wave
from config import CONFIG
# os.environ["CUDA_VISIBLE_DEVICES"] = CONFIG["cuda_id"]
import asyncio
import websockets
from faster_whisper import WhisperModel  # faster-whisper 1.0.2
import uuid
import time
import numpy as np
import torch  # only needed if the CUDA device check below is re-enabled
import opencc  # opencc 1.1.6
import warnings
warnings.filterwarnings("ignore")

cc = opencc.OpenCC('t2s')  # Traditional-to-Simplified Chinese converter
# Model configuration
model_size = CONFIG["faster_whisper"]
out_dir = CONFIG["voice_dir"]

# Device selection (CPU here; uncomment the line below to use CUDA when available)
# device = 'cuda' if torch.cuda.is_available() else "cpu"
device = "cpu"
print(f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}\tConfig\n\t- model: {model_size}\n\t- device: {device}", flush=True)

# Create the tmp directory, or clean out leftover audio files from earlier runs
if not os.path.exists(out_dir):
    os.makedirs(out_dir, exist_ok=True)
else:
    for file in os.listdir(out_dir):
        if file.endswith("wav") or file.endswith("mp3"):
            file_path = os.path.join(out_dir, file)
            if os.path.isfile(file_path):
                os.remove(file_path)
                print(f"Deleted file: {file_path}", flush=True)
print(f"{time.strftime('%Y-%m-%d %H:%M:%S')}\tOutput directory: {out_dir}", flush=True)

model = WhisperModel(model_size, device=device, compute_type="int8")  # int8 avoids errors when CUDA is unavailable in the image
print(f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}\tModel instantiated!", flush=True)
async def run(audio_path, websocket):
    sentence = ""
    segments, info = model.transcribe(audio_path, language="zh", beam_size=5,
                                      word_timestamps=True, hallucination_silence_threshold=2,
                                      vad_filter=True, vad_parameters=dict(min_silence_duration_ms=300))
    print("[Detected language]\t'%s' with probability %f" % (info.language, info.language_probability), flush=True)
    for segment in segments:
        print("\t[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text), flush=True)
        sentence = cc.convert(segment.text)  # normalize Traditional Chinese output to Simplified
        await websocket.send(sentence)
        await asyncio.sleep(0.3)
    return sentence
async def delete_file_later(audio_path, time_=0.3):
    await asyncio.sleep(time_)  # wait time_ seconds before deleting
    if os.path.exists(audio_path):
        os.remove(audio_path)
        print(f"\t{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} Deleted recording file: {audio_path}", flush=True)
# Third revision of the handler
async def audio_handler(websocket, path):  # `path` is required by older versions of the websockets library
    client_id = str(uuid.uuid4())
    print(f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} {client_id} connected", flush=True)
    try:
        while True:
            audio_data = b''  # buffer for the received audio chunks
            start_time = time.time()
            # Keep receiving audio data until 0.3 s has elapsed (or the client closes the connection)
            while time.time() - start_time < 0.3:
                message = await websocket.recv()
                audio_data += message
            # The front end sends 32-bit float mono PCM, so convert it to
            # 16-bit integers before writing the wav file
            pcm16 = (np.frombuffer(audio_data, dtype=np.float32) * 32767).astype(np.int16)
            time_str = time.strftime("%Y_%m_%d_%H_%M_%S")
            audio_path = f"{out_dir}/audio_{time_str}.wav"
            with wave.open(audio_path, 'wb') as wf:
                wf.setnchannels(1)      # mono, matching getChannelData(0) on the front end
                wf.setsampwidth(2)      # 16-bit samples
                wf.setframerate(48000)  # assumes the browser AudioContext runs at 48 kHz
                wf.writeframes(pcm16.tobytes())
            print(f"[File written] {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} Recording saved to {audio_path}", flush=True)
            text = await run(audio_path, websocket)
            print(f"[Result] {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} Text sent: {text}", flush=True)
            # Delete the temporary file in a background task
            asyncio.create_task(delete_file_later(audio_path))
    finally:
        print(f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} Connection closed", flush=True)
start_server = websockets.serve(audio_handler, 'localhost', 8080)  # listening port
print(f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}\tServer started!\n", flush=True)
asyncio.get_event_loop().run_until_complete(start_server)
asyncio.get_event_loop().run_forever()
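The server imports CONFIG from a config.py that is not shown above. Judging from the keys the code reads, a minimal sketch could look like the following; the key names come from the server code, while the values are placeholders you should adapt:

# config.py — a minimal sketch; the keys are read by the server above,
# the values are assumptions.
CONFIG = {
    "cuda_id": "0",                # GPU id for CUDA_VISIBLE_DEVICES (only used if uncommented)
    "faster_whisper": "large-v3",  # model name or local path passed to WhisperModel
    "voice_dir": "./tmp",          # directory for the temporary wav chunks
}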
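To sanity-check the server without opening the browser page, a small WebSocket client can stream raw float32 samples the same way the front end does. This is only a sketch, assuming the server above is listening on ws://localhost:8080; it sends chunks of silence, which the VAD will usually filter out, so substitute real speech samples to see actual transcriptions:

# test_client.py — a minimal sketch for exercising the server above.
import asyncio
import numpy as np
import websockets

async def main():
    async with websockets.connect("ws://localhost:8080") as ws:
        chunk = np.zeros(4800, dtype=np.float32)  # 0.1 s of silence at 48 kHz
        for _ in range(20):                       # stream ~2 s of audio
            await ws.send(chunk.tobytes())
            await asyncio.sleep(0.1)
        try:
            # print the first transcription result, if any
            print(await asyncio.wait_for(ws.recv(), timeout=10))
        except asyncio.TimeoutError:
            print("no transcription received (pure silence is filtered out by the VAD)")

asyncio.run(main())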
(2) Front-end code:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Audio Capture Demo</title>
</head>
<body>
    <button id="startButton">Start Recording</button>
    <button id="stopButton" disabled>Stop Recording</button>
    <div id="status"></div>
    <div id="recognizedText"></div>
    <script>
        let audioContext;
        let mediaStream;
        let mediaStreamSource;
        let processor;
        let socket;

        const startButton = document.getElementById('startButton');
        const stopButton = document.getElementById('stopButton');
        const statusDiv = document.getElementById('status');
        const recognizedTextDiv = document.getElementById('recognizedText');

        startButton.addEventListener('click', startRecording);
        stopButton.addEventListener('click', stopRecording);
        function startRecording() {
            if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
                alert('This browser does not support audio capture');
                return;
            }
            startButton.disabled = true;
            stopButton.disabled = false;
            navigator.mediaDevices.getUserMedia({ audio: true })
                .then(stream => {
                    audioContext = new (window.AudioContext || window.webkitAudioContext)();
                    mediaStream = stream;
                    mediaStreamSource = audioContext.createMediaStreamSource(stream);
                    // ScriptProcessorNode is deprecated but still widely supported;
                    // it delivers 1024-sample mono buffers per callback
                    processor = audioContext.createScriptProcessor(1024, 1, 1);
                    mediaStreamSource.connect(processor);
                    processor.connect(audioContext.destination);
                    processor.onaudioprocess = function(e) {
                        const audioData = e.inputBuffer.getChannelData(0);
                        sendAudioStream(audioData);
                    };
                    socket = new WebSocket('ws://127.0.0.1:8080');
                    socket.binaryType = 'arraybuffer';
                    socket.onopen = () => {
                        console.log('WebSocket connection established');
                        statusDiv.innerHTML = 'Recording...';
                    };
                    socket.onerror = (error) => {
                        console.error('WebSocket error:', error);
                    };
                    socket.onmessage = (event) => {
                        recognizedTextDiv.innerHTML = event.data;
                    };
                    socket.onclose = () => {
                        console.log('WebSocket connection closed');
                        statusDiv.innerHTML = 'Recording stopped';
                    };
                })
                .catch(err => {
                    console.error('Failed to get audio stream:', err);
                });
        }
        function stopRecording() {
            if (mediaStream) {
                mediaStream.getTracks().forEach(track => track.stop());
            }
            if (processor) {
                processor.disconnect();
            }
            if (mediaStreamSource) {
                mediaStreamSource.disconnect();
            }
            if (audioContext) {
                audioContext.close();
            }
            if (socket) {
                socket.close();
            }
            startButton.disabled = false;
            stopButton.disabled = true;
        }

        function sendAudioStream(audioData) {
            // Only send while the socket is open; each message is raw float32 PCM
            if (socket && socket.readyState === WebSocket.OPEN) {
                const audioBuffer = new Float32Array(audioData);
                socket.send(audioBuffer.buffer);
            }
        }
    </script>
</body>
</html>
Specific improvements:
1. A simple front-end page was written to capture audio and send the audio data to the server. Because this requires stable data transfer between client and server, a long-lived WebSocket connection is used.
2. For the ASR step, the Large-V3 model is pulled from Hugging Face and invoked in the server code through faster-whisper's straightforward API. Why Large-V3 rather than Large-V2 or Large-V1? In my own testing, Large-V2 and Large-V1 hallucinated during real-time recognition and had a higher error rate than Large-V3. A minimal invocation is sketched below.
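For reference, the "simple invocation" mentioned in point 2 boils down to a few lines of faster-whisper. This standalone sketch downloads "large-v3" from Hugging Face on first use; "audio.wav" is a placeholder for your own file:

# Minimal faster-whisper usage, independent of the server above.
from faster_whisper import WhisperModel

model = WhisperModel("large-v3", device="cpu", compute_type="int8")
segments, info = model.transcribe("audio.wav", language="zh", beam_size=5)
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))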
3. Conclusion
I hope this helps!