Real-Time ASR Speech-to-Text (Python)

1. Introduction

Real-time speech-to-text here is built on the project jianchang512/stt: Voice Recognition to Text Tool, an offline, locally running speech-recognition service that outputs JSON, SRT subtitles with timestamps, or plain text (github.com). The changes described below extend it to do the conversion in real time.

For the installation steps, see the project's readme.md.

2. Code

   Without further ado, here is the code.

  (1) Server code:

import os
import wave

from config import CONFIG
#os.environ["CUDA_VISIBLE_DEVICES"] = CONFIG["cuda_id"]
import asyncio
import websockets
from faster_whisper import WhisperModel  # 1.0.2
import uuid
import time
import librosa
import numpy as np
import torch
import opencc  # 1.1.6
import warnings
warnings.filterwarnings("ignore")


cc = opencc.OpenCC('t2s')  # Traditional -> Simplified Chinese converter

# model config
model_size = CONFIG["faster_whisper"]
out_dir = CONFIG["voice_dir"]

# device selection
# device = f'cuda' if torch.cuda.is_available() else "cpu"
device = "cpu"


print(f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}\t配置\n\t- model:{model_size}\n\t- device:{device}", flush=True)

# create the temp output folder, or clean out leftover recordings
if not os.path.exists(out_dir):
    os.makedirs(out_dir, exist_ok=True)
else:
    files = os.listdir(out_dir)
    # iterate over the files and delete old recordings
    for file in files:
        if file.endswith("wav") or file.endswith("mp3"):
            file_path = os.path.join(out_dir, file)
            if os.path.isfile(file_path):
                os.remove(file_path)
                print(f"Deleted file: {file_path}", flush=True)

print(f"{time.strftime('%Y-%m-%d %H:%M:%S')}\t输出文件位置:{out_dir}", flush=True)

model = WhisperModel(model_size, device=device, compute_type="int8")   # CUDA in the image may be unavailable and raise an error, so int8 is used
print(f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}\t模型实例化完成!")


async def run(audio_path, websocket):
    sentence = ""
    segments, info = model.transcribe(audio_path, language="zh", beam_size=5,
                                      word_timestamps=True, hallucination_silence_threshold=2, 
                                      vad_filter=True, vad_parameters=dict(min_silence_duration_ms=300))
    print("[Detected language]\t'%s' with probability %f" % (info.language, info.language_probability), flush=True)
    for segment in segments:
        print("\t[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text), flush=True)
        sentence = cc.convert(segment.text)  # convert Traditional Chinese output to Simplified
        await websocket.send(sentence)

        await asyncio.sleep(0.3)
        # sentence = sentence + segment.text
    return sentence


async def delete_file_later(audio_path, time_=0.3):
    await asyncio.sleep(time_)  # wait briefly so the file is no longer in use before deleting it
    if os.path.exists(audio_path):
        os.remove(audio_path)
        print(f"\t{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} deleted recording file: {audio_path}", flush=True)

# third iteration of the handler
async def audio_handler(websocket, path):
    client_id = str(uuid.uuid4())
    print(f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} {client_id} 连接成功", flush=True)
    try:
        while True:
            audio_data = b''  # 用于存储接收到的音频数据
            start_time = time.time()  # 记录开始时间
            print(time.time() - start_time)
            # 循环接收音频数据,直到超过 1 秒或者客户端关闭连接
            while time.time() - start_time < 0.3:
                message = await websocket.recv()
                audio_data += message


            # write the received audio to a WAV file; the browser sends raw Float32
            # samples (one channel, at the AudioContext sample rate), so convert to 16-bit PCM
            time_str = time.strftime("%Y_%m_%d_%H_%M_%S")
            audio_path = f"{out_dir}/audio_{time_str}.wav"
            samples = np.frombuffer(audio_data, dtype=np.float32)
            pcm16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
            with wave.open(audio_path, 'wb') as wf:
                # set the WAV stream parameters
                wf.setnchannels(1)       # mono (the page sends a single channel)
                wf.setsampwidth(2)       # 16-bit
                wf.setframerate(48000)   # must match the browser's AudioContext sample rate
                # write the audio frames
                wf.writeframes(pcm16.tobytes())
                print(f"[file written] {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} recording saved to {audio_path}", flush=True)

            text = await run(audio_path, websocket)

            # await websocket.send(text)
            print(f"[结果] {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} 已发送文本数据:{text}", flush=True)

            # create an async task to delete the temp file later
            asyncio.create_task(delete_file_later(audio_path))
    finally:
        print(f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} 连接断开", flush=True)

start_server = websockets.serve(audio_handler, 'localhost', 8080)     # listening host and port
print(f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}\tserver started!\n", flush=True)
asyncio.get_event_loop().run_until_complete(start_server)
asyncio.get_event_loop().run_forever()

 (2) Frontend code:

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Audio Capture Demo</title>
</head>
<body>
    <button id="startButton">开始录音</button>
    <button id="stopButton" disabled>停止录音</button>
    <div id="status"></div>
    <div id="recognizedText"></div>
    <script>
        let audioContext;
        let mediaStream;
        let mediaStreamSource;
        let processor;
        let socket;

        const startButton = document.getElementById('startButton');
        const stopButton = document.getElementById('stopButton');
        const statusDiv = document.getElementById('status');
        const recognizedTextDiv = document.getElementById('recognizedText'); 
        startButton.addEventListener('click', startRecording);
        stopButton.addEventListener('click', stopRecording);

        function startRecording() {

            if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
                alert('This browser does not support audio capture');
                return;
            }

            startButton.disabled = true;
            stopButton.disabled = false;

            navigator.mediaDevices.getUserMedia({ audio: true })
                .then(stream => {
                    // NOTE: the server-side WAV writer assumes 48 kHz; audioContext.sampleRate can differ per system
                    audioContext = new (window.AudioContext || window.webkitAudioContext);
                    mediaStream = stream;
                    mediaStreamSource = audioContext.createMediaStreamSource(stream);
                    // ScriptProcessorNode is deprecated but still widely supported
                    processor = audioContext.createScriptProcessor(1024, 1, 1);

                    mediaStreamSource.connect(processor);
                    processor.connect(audioContext.destination);

                    processor.onaudioprocess = function(e) {
                        const audioData = e.inputBuffer.getChannelData(0);
                        sendAudioStream(audioData);
                    };

                    socket = new WebSocket('ws://127.0.0.1:8080');

                    socket.binaryType = 'arraybuffer';
                    socket.onopen = () => {

                        console.log('WebSocket connection established');
                        statusDiv.innerHTML = 'Recording...';
                    };

                    socket.onerror = (error) => {
                        console.error('WebSocket error:', error);
                    };

                    socket.onmessage = (event) => { 
                        recognizedTextDiv.innerHTML = event.data; 
                    };

                    socket.onclose = () => {
                        console.log('WebSocket connection closed');
                        statusDiv.innerHTML = 'Recording stopped';
                    };
                })
                .catch(err => {
                    console.error('Failed to get the audio stream:', err);
                });
        }

        function stopRecording() {
            if (mediaStream) {
                mediaStream.getTracks().forEach(track => track.stop());
            }
            if (processor) {
                processor.disconnect();
            }
            if (mediaStreamSource) {
                mediaStreamSource.disconnect();
            }
            if (audioContext) {
                audioContext.close();
            }
            if (socket) {
                socket.close();
            }

            startButton.disabled = false;
            stopButton.disabled = true;
        }

        function sendAudioStream(audioData) {
            if (socket && socket.readyState === WebSocket.OPEN) {
                const audioBuffer = new Float32Array(audioData);
                socket.send(audioBuffer.buffer);
            }
        }
    </script>
</body>
</html>

Specific improvements:

    1. A simple front-end page was written to capture microphone audio and stream it to the server. Since the audio data has to reach the server continuously and reliably, a persistent WebSocket connection is kept open between the browser and the server (a minimal stand-alone test client is sketched below).

   2. For the ASR speech-to-text step, the Large-V3 model is pulled from Hugging Face and loaded in the server code through faster-whisper's simple API to perform the transcription (a minimal loading sketch follows). Why Large-V3 rather than Large-V2 or Large-V1? In my own tests, Large-V2 and Large-V1 both produced hallucinations during real-time recognition and had a noticeably higher error rate than Large-V3.
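
For reference, here is a minimal sketch of how the Large-V3 weights can be pulled and loaded with faster-whisper; the first instantiation downloads the model from Hugging Face into the local cache. The file name sample.wav and the CPU/int8 settings are illustrative assumptions, not part of the project above.

```python
# Minimal sketch: load Large-V3 via faster-whisper and transcribe one file.
# "sample.wav" is a placeholder path; device/compute_type mirror the server code above.
from faster_whisper import WhisperModel

model = WhisperModel("large-v3", device="cpu", compute_type="int8")  # downloads from Hugging Face on first use
segments, info = model.transcribe("sample.wav", language="zh", beam_size=5, vad_filter=True)
print(f"Detected language: {info.language} ({info.language_probability:.2f})")
for seg in segments:
    print(f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}")
```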

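To exercise the WebSocket transport without opening the browser page, a small stand-alone client can stream synthetic Float32 samples to the server in the same format the page sends. This is a hypothetical helper, not part of the original project; the 48 kHz rate and 1024-sample chunk size mirror the assumptions in the code above.

```python
# test_client.py -- hypothetical helper: streams 1 s of a 440 Hz tone to the server
# as raw Float32 samples, mimicking the browser's ScriptProcessor chunks.
import asyncio
import numpy as np
import websockets

async def main():
    async with websockets.connect("ws://127.0.0.1:8080") as ws:
        t = np.arange(48000) / 48000.0                      # 1 second at 48 kHz
        samples = (0.2 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
        for i in range(0, len(samples), 1024):              # ~1024-sample chunks
            await ws.send(samples[i:i + 1024].tobytes())
            await asyncio.sleep(1024 / 48000)               # roughly real-time pacing
        print(await ws.recv())                              # first segment returned by the server

asyncio.run(main())
```
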
3. Conclusion

I hope this helps!
