vosk实时语音识别

vosk介绍以及安装,参考地址:https://blog.csdn.net/qq_35385687/article/details/119209189?spm=1001.2014.3001.5501

命令行方式直接转写

#!/usr/bin/env python3

import argparse
import os
import queue
import sounddevice as sd
import sys
import vosk

# Thread-safe queue carrying raw audio blocks from the sounddevice
# callback thread to the main recognition loop.
q = queue.Queue()


def int_or_str(text):
    """Return ``int(text)`` when it parses as an integer, else *text* itself.

    Used as an argparse ``type`` so a device may be given either as a
    numeric ID or as a name substring.
    """
    try:
        value = int(text)
    except ValueError:
        value = text
    return value


def callback(indata, frames, time, status):
    """Audio callback, invoked on sounddevice's capture thread per block.

    Any non-zero status (overflow/underflow) is reported on stderr; the
    raw sample bytes are handed to the main thread via the module queue.
    """
    if status:
        sys.stderr.write(str(status) + '\n')
    q.put(bytes(indata))


# Two-stage argument parsing: a bare pre-parser runs first so that
# --list-devices can be honoured before any other option is validated.
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument(
    '-l', '--list-devices', action='store_true',
    help='show list of audio devices and exit')
args, remaining = parser.parse_known_args()
if args.list_devices:
    print(sd.query_devices())
    parser.exit(0)
# Full parser: inherits the pre-parser's options via ``parents`` and adds
# the real ones; ``remaining`` holds everything the pre-parser skipped.
parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter,
    parents=[parser])
parser.add_argument(
    '-f', '--filename', type=str, metavar='FILENAME',
    help='audio file to store recording to')
parser.add_argument(
    '-m', '--model', type=str, metavar='MODEL_PATH',
    help='Path to the model')
parser.add_argument(
    '-d', '--device', type=int_or_str,
    help='input device (numeric ID or substring)')
parser.add_argument(
    '-r', '--samplerate', type=int, help='sampling rate')
args = parser.parse_args(remaining)

try:
    # Default to a local "model" directory when no path is given.
    if args.model is None:
        args.model = "model"
    if not os.path.exists(args.model):
        print("Please download a model for your language from https://alphacephei.com/vosk/models")
        print("and unpack as 'model' in the current folder.")
        parser.exit(0)
    if args.samplerate is None:
        # Fall back to the input device's native rate; sounddevice reports
        # it as a float but the recognizer expects an int.
        device_info = sd.query_devices(args.device, 'input')
        args.samplerate = int(device_info['default_samplerate'])

    model = vosk.Model(args.model)

    # Optional raw-PCM dump of everything that was captured.
    dump_fn = open(args.filename, "wb") if args.filename else None

    try:
        with sd.RawInputStream(samplerate=args.samplerate, blocksize=16000, device=args.device, dtype='int16',
                               channels=1, callback=callback):
            print('#' * 80)
            print('Press Ctrl+C to stop the recording')
            print('#' * 80)

            rec = vosk.KaldiRecognizer(model, args.samplerate)
            while True:
                data = q.get()
                # AcceptWaveform returns True at an utterance boundary;
                # Result() then holds the final text, otherwise only a
                # partial hypothesis is available.
                if rec.AcceptWaveform(data):
                    print(rec.Result())
                else:
                    print(rec.PartialResult())
                if dump_fn is not None:
                    dump_fn.write(data)
    finally:
        # Fix: the dump file was previously never closed (handle leak on
        # Ctrl+C or error) — close it on every exit path.
        if dump_fn is not None:
            dump_fn.close()

except KeyboardInterrupt:
    print('\nDone')
    parser.exit(0)
except Exception as e:
    parser.exit(type(e).__name__ + ': ' + str(e))

WebSocket实现实时转写

#!/usr/bin/env python3

import argparse
import os
import queue
import sounddevice as sd
import sys
import vosk

# Thread-safe queue carrying raw audio blocks from the sounddevice
# callback thread to the main recognition loop.
q = queue.Queue()


def int_or_str(text):
    """Coerce *text* to an int where possible, otherwise pass it through.

    argparse ``type`` helper: lets a device be named either by numeric ID
    or by a name substring.
    """
    result = text
    try:
        result = int(text)
    except ValueError:
        pass
    return result


def callback(indata, frames, time, status):
    """Per-block capture callback (runs on the audio thread).

    A non-empty *status* signals over/underflow and is echoed to stderr;
    the block's raw bytes go onto the shared queue for recognition.
    """
    if status:
        message = '{}\n'.format(status)
        sys.stderr.write(message)
    q.put(bytes(indata))


# Two-stage argument parsing: a bare pre-parser runs first so that
# --list-devices can be honoured before any other option is validated.
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument(
    '-l', '--list-devices', action='store_true',
    help='show list of audio devices and exit')
args, remaining = parser.parse_known_args()
if args.list_devices:
    print(sd.query_devices())
    parser.exit(0)
# Full parser: inherits the pre-parser's options via ``parents`` and adds
# the real ones; ``remaining`` holds everything the pre-parser skipped.
parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter,
    parents=[parser])
parser.add_argument(
    '-f', '--filename', type=str, metavar='FILENAME',
    help='audio file to store recording to')
parser.add_argument(
    '-m', '--model', type=str, metavar='MODEL_PATH',
    help='Path to the model')
parser.add_argument(
    '-d', '--device', type=int_or_str,
    help='input device (numeric ID or substring)')
parser.add_argument(
    '-r', '--samplerate', type=int, help='sampling rate')
args = parser.parse_args(remaining)

try:
    # Default to a local "model" directory when no path is given.
    if args.model is None:
        args.model = "model"
    if not os.path.exists(args.model):
        print("Please download a model for your language from https://alphacephei.com/vosk/models")
        print("and unpack as 'model' in the current folder.")
        parser.exit(0)
    if args.samplerate is None:
        # Fall back to the input device's native rate; sounddevice reports
        # it as a float but the recognizer expects an int.
        device_info = sd.query_devices(args.device, 'input')
        args.samplerate = int(device_info['default_samplerate'])

    model = vosk.Model(args.model)

    # Optional raw-PCM dump of everything that was captured.
    dump_fn = open(args.filename, "wb") if args.filename else None

    try:
        with sd.RawInputStream(samplerate=args.samplerate, blocksize=16000, device=args.device, dtype='int16',
                               channels=1, callback=callback):
            print('#' * 80)
            print('Press Ctrl+C to stop the recording')
            print('#' * 80)

            rec = vosk.KaldiRecognizer(model, args.samplerate)
            while True:
                data = q.get()
                # AcceptWaveform returns True at an utterance boundary;
                # Result() then holds the final text, otherwise only a
                # partial hypothesis is available.
                if rec.AcceptWaveform(data):
                    print(rec.Result())
                else:
                    print(rec.PartialResult())
                if dump_fn is not None:
                    dump_fn.write(data)
    finally:
        # Fix: the dump file was previously never closed (handle leak on
        # Ctrl+C or error) — close it on every exit path.
        if dump_fn is not None:
            dump_fn.close()

except KeyboardInterrupt:
    print('\nDone')
    parser.exit(0)
except Exception as e:
    parser.exit(type(e).__name__ + ': ' + str(e))

前端获取pcm实时传输至后台

<html>

<head>
    <meta charset="UTF-8">
    <title>Simple Recorder.js demo with record, stop and pause</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <!-- responsive width scaling -->
    <style type="text/css">
        .comments {
            width: 100%; /* fill the parent container's width */
            overflow: auto;
            word-break: break-all;
            /* IE line-break fix: without IE's proprietary word-break/word-wrap,
               an IE8 textarea at width 100% collapses multi-line content onto a
               single line when the text is double-clicked, so force breaking. */
        }
    </style>
</head>

<body>
<div id="controls">
    <button id="recordButton">Record</button>
    <button id="stopButton">Stop</button>
</div>

<textarea id="textResult" class="comments" rows="10" cols="10"></textarea>

</body>
<script type="text/javascript" src="./js/recorder3.js"></script>
<script>

    var ws = null; // WebSocket connection, created in useWebSocket()

    var interval; // timer id for the periodic PCM-send loop

    // Recorder.js instance capturing mono 16-bit PCM at 16 kHz.
    let recorder = new Recorder({
        sampleBits: 16,                 // sample width: 8 or 16, default 16
        sampleRate: 16000,              // sample rate: 11025/16000/22050/24000/44100/48000, browser-dependent default
        numChannels: 1,                 // channels: 1 or 2, default 1
        // compiling: false, (took effect in 0.x, being re-added in 1.x)  // convert while recording, default false
        compiling: true
    });

    var recordButton = document.getElementById("recordButton");
    var stopButton = document.getElementById("stopButton");
    var textResult = document.getElementById("textResult");

    recordButton.addEventListener("click", startRecording);
    stopButton.addEventListener("click", stopRecording);

    // 录音
    // Start capturing audio; open the streaming websocket once the
    // recorder is actually live.
    function startRecording() {
        console.log("recordButton clicked");
        recorder.start().then(
            function () {
                // Recording has begun — begin shipping PCM to the server.
                useWebSocket();
            },
            function (error) {
                console.log(`出错了`);
            }
        );
    }

    // 停止录音
    // Stop capture and tear down the websocket, the send timer and the
    // on-screen transcript.
    function stopRecording() {
        console.log("stopButton clicked", recorder.getPCMBlob());
        recorder.stop();
        if (ws) {
            ws.close();
        }
        clearInterval(interval);
        textResult.innerText = '';
    }

    /*
    * WebSocket
    */
    function useWebSocket() {
        // console.log(recorder.getNextData())
        ws = new WebSocket("ws://localhost:5678");

        ws.binaryType = 'arraybuffer'; //传输的是 ArrayBuffer 类型的数据
        ws.onopen = function () {
            console.log('握手成功');
            if (ws.readyState === 1) { //ws进入连接状态,则每隔500毫秒发送一包数据
                interval = setInterval(() => {
                    // recorder.getNextData();
                    // recorder.getWholeData();
                    // console.log(recorder.getNextData());
                    ws.send(recorder.getNextData());
                }, 500)

            }

        };

        ws.onmessage = function (msg) {
            var jsonStr = msg.data;
            console.info(jsonStr);
            textResult.innerText = jsonStr;
            autoTextarea(document.getElementById("textResult"));
        };

        ws.onerror = function (err) {
            console.error(err);
            textResult.innerText = '';
        };

        ws.onclose = function (msg) {
            console.info(msg);
            textResult.innerText = '';
        };

    }

    /**
     * Auto-grow a textarea's height to fit its content.
     * @param {HTMLElement} elem   the textarea element (or an array of them)
     * @param {Number} extra       gap kept between the caret and the box edge (default 0)
     * @param {Number} maxHeight   optional cap on the grown height
     */
    var autoTextarea = function (elem, extra, maxHeight) {
        // Accept either a single element or an array-like of elements.
        // NOTE(review): a bare HTMLElement has no numeric .length, so a plain
        // DOM node takes the single-element branch.
        if (elem.length > 0) {
            for (var i = 0; i < elem.length; i++) {
                e(elem[i]);
            }
        } else {
            e(elem);
        }

        function e(elem) {
            extra = extra || 0;
            // Legacy browser sniffing: Firefox/Opera already exclude padding
            // from the measured height, other engines need it subtracted.
            var isFirefox = !!document.getBoxObjectFor || 'mozInnerScreenX' in window,
                isOpera = !!window.opera && !!window.opera.toString().indexOf('Opera'),
                addEvent = function (type, callback) {
                    elem.addEventListener ?
                        elem.addEventListener(type, callback, false) :
                        elem.attachEvent('on' + type, callback);
                },
                getStyle = elem.currentStyle ? function (name) {
                    var val = elem.currentStyle[name];

                    // NOTE(review): this branch looks intended to detect a
                    // non-px height (String.search returns -1 on no match),
                    // so the test was probably meant to be === -1, not !== 1.
                    if (name === 'height' && val.search(/px/i) !== 1) {
                        var rect = elem.getBoundingClientRect();
                        return rect.bottom - rect.top -
                            parseFloat(getStyle('paddingTop')) -
                            parseFloat(getStyle('paddingBottom')) + 'px';
                    }
                    ;

                    return val;
                } : function (name) {
                    return getComputedStyle(elem, null)[name];
                },
                minHeight = parseFloat(getStyle('height'));

            elem.style.resize = 'none';

            // Recompute the height whenever the content length changes.
            var change = function () {
                var scrollTop, height,
                    padding = 0,
                    style = elem.style;

                // Skip when the text length is unchanged since the last run.
                if (elem._length === elem.value.length) return;
                elem._length = elem.value.length;

                if (!isFirefox && !isOpera) {
                    padding = parseInt(getStyle('paddingTop')) + parseInt(getStyle('paddingBottom'));
                }
                ;
                scrollTop = document.body.scrollTop || document.documentElement.scrollTop;

                // Collapse to the minimum first so scrollHeight reflects content.
                elem.style.height = minHeight + 'px';
                if (elem.scrollHeight > minHeight) {
                    if (maxHeight && elem.scrollHeight > maxHeight) {
                        height = maxHeight - padding;
                        style.overflowY = 'auto';
                    } else {
                        height = elem.scrollHeight - padding;
                        style.overflowY = 'hidden';
                    }
                    ;
                    style.height = height + extra + 'px';
                    // Keep the page scrolled by the amount the box just grew.
                    scrollTop += parseInt(style.height) - elem.currHeight;
                    document.body.scrollTop = scrollTop;
                    document.documentElement.scrollTop = scrollTop;
                    elem.currHeight = parseInt(style.height);
                }
                ;
            };

            addEvent('propertychange', change); // legacy IE change event
            addEvent('input', change);
            addEvent('focus', change);
            change();
        }
    };

</script>
</html>

完整项目地址

https://gitee.com/yzdyzdyzd/speechToText

Vosk 是一个开源的语音识别工具包,支持多种语言,包括 Java。使用 Vosk 进行语音识别,需要按照以下步骤进行操作: 1. 下载 Vosk 的 Java 绑定库,并将其导入到 Java 项目中。 2. 下载 Vosk 的语音模型,可以从官方网站下载或者使用其他适合的语音模型。 3. 编写 Java 代码,读取音频文件并调用 Vosk 库中的方法进行语音识别。 4. 对识别结果进行处理和分析。 下面是一个简单的 Java 代码示例,实现了基于 Vosk语音识别功能: ```java import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import org.vosk.Model; import org.vosk.Recognizer; import org.vosk.SpeechRecognitionResult; public class VoskExample { public static void main(String[] args) throws Exception { // 加载语音模型 Model model = new Model("model_path"); // 创建语音识别器 Recognizer recognizer = new Recognizer(model); // 读取音频文件 InputStream inputStream = new FileInputStream(new File("audio_file_path")); byte[] buffer = new byte[1024]; int bytesRead; // 开始语音识别 recognizer.recognizerStart(); while ((bytesRead = inputStream.read(buffer)) > -1) { // 接收音频数据并进行识别 recognizer.recognize(buffer, bytesRead); } // 结束语音识别 recognizer.recognizeFinalize(); // 获取识别结果 SpeechRecognitionResult result = recognizer.getResult(); System.out.println(result.getText()); // 释放资源 recognizer.close(); } } ``` 需要注意的是,Vosk语音识别功能需要一定的计算资源和时间,识别的准确率也会受到多种因素的影响,如音频质量、语音模型的质量等。因此,在实际应用中需要进行充分的测试和优化。
评论 8
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

阳宗德

您的鼓励是我进步的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值