Online Real-Time Speech Recognition Implementation [work in progress: local testing complete, only the server functionality is missing]

Basic Workflow

  • Environment setup

    • Client environment

      • Recording module

        pip install pyaudio

    • Server environment

      • flask
  • Client

    • Recording module
      • Hardware
        • Recognition quality depends heavily on the microphone
      • Buffer
    • Sending data
      • Buffer data
        • Buffer optimization
    • Receiving data
      • Recognition result
  • Server

    • Receive the buffer data
    • Call the recognition interface
      • Pass in the buffer data
      • Return the recognized text
    • Send the recognized text back to the client

Environment Preparation

Install the required packages:
pip install pygame
pip install SpeechRecognition
pip install playsound
pip install librosa

Server Side

Directory Structure

Initial directory layout

├── cache # cache (audio buffer)
│   └── temp.wav
├── client
│   ├── client.py
│   └── __init__.py
├── decoder # decoder
│   ├── create_data_list.sh
│   ├── datalist # where the data lists are generated
│   ├── recognize.py
│   └── wenet -> /home/asr/data/wenet/wenet
├── model # where the models are stored
│   ├── 20210815_unified_conformer_exp
│   │   ├── final.pt
│   │   ├── global_cmvn
│   │   ├── train.yaml
│   │   └── words.txt
└── server 
    ├── __init__.py
    └── server.py

Directory layout after a recognition test

├── cache
│   ├── temp1.wav
│   ├── temp2.wav
│   └── temp.wav
├── client
│   ├── client.py
│   └── __init__.py
├── decoder
│   ├── create_data_list.sh
│   ├── datalist
│   │   ├── temp
│   │   ├── temp1
│   │   └── temp2
│   ├── recognize.py
│   └── wenet -> /home/asr/data/wenet/wenet
├── model
│   ├── 20210618_u2pp_conformer_exp.tar.gz
│   ├── 20210815_unified_conformer_exp
│   │   ├── final.pt
│   │   ├── global_cmvn
│   │   ├── train.yaml
│   │   └── words.txt
│   └── 20210815_unified_conformer_exp.tar.gz
└── server
    ├── __init__.py
    └── server.py

Server Functional Module [to be completed]

Server environment setup
pip install flask
from flask import Flask

app = Flask(__name__)
# Load the model and pass in its parameters (skeleton, still to be completed)
@app.route("/")
def getdata():
    # Call recognition: save the received frames first
    save_wav(data, save_path)

if __name__ == "__main__":
    app.run()

Receiving data
# Save the microphone data (CHANNELS and SAMPALE_RATE are the same recording constants used in the recording module below)
def save_wav(frames, save_path):
    wf = wave.open(save_path, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(2)
    wf.setframerate(SAMPALE_RATE)
    wf.writeframes(b''.join(frames))
    wf.close()
    print('\033[93m' + "buffered to cache" + '\033[0m')
Returning the recognition result to the client

Load the model when the service starts,

and call the recognition interface whenever data arrives.

from flask import Flask
from recognize import recognize

app = Flask(__name__)
# Load the model once and pass in its parameters
model = recognize()

@app.route("/")
def run_recognize():
    # Call recognition
    result = model.get_recognize()
    return result

if __name__ == "__main__":
    app.run()
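The snippets above still leave out how the audio data actually reaches the server. Below is a minimal sketch of how the pieces could fit together once this module is finished; the /recognize route, the use of request.data as the transport for raw 16-bit mono PCM bytes, and the constants are assumptions for illustration, not the final implementation.

import wave

from flask import Flask, request
from recognize import recognize

CHANNELS = 1
SAMPALE_RATE = 16000
CACHE_WAV = "../cache/temp.wav"  # must match the path written into online_data.list

app = Flask(__name__)
# Load the model once, when the service starts
model = recognize()


def save_wav(frames, save_path):
    # Wrap the raw PCM frames in a WAV container so the decoder can read them
    wf = wave.open(save_path, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(2)
    wf.setframerate(SAMPALE_RATE)
    wf.writeframes(b''.join(frames))
    wf.close()


@app.route("/recognize", methods=["POST"])
def run_recognize():
    # request.data holds the raw audio bytes posted by the client
    save_wav([request.data], CACHE_WAV)
    return model.get_recognize()


if __name__ == "__main__":
    app.run()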

Building the Recognition Engine [completed]

Generating the data_list

WeNet needs this file for recognition; it reads the data from it internally, and the format must be exactly as follows:

{"key":"temp","wav":"/home/sunao/data/StreamAIzimu/cache/temp.wav","txt":""}

#!/usr/bin/bash
root=..
data=${root}/cache/temp.wav

echo "{\"key\":\"temp\",\"wav\":\"${data}\",\"txt\":\"\"}" > online_data.list

Modify WeNet's recognition script recognize.py so that the model is loaded only once, and drop the default way of passing arguments in through a bash script.

recognize.py

# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import argparse
import copy
import logging
import os
import sys
import time
import torch
import yaml
from torch.utils.data import DataLoader

from wenet.dataset.dataset import Dataset
from wenet.transformer.asr_model import init_asr_model
from wenet.utils.checkpoint import load_checkpoint
from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols
from wenet.utils.config import override_config



class recognize():
    def __init__(self, ):
        
        self.root_path = os.pardir
        self.batch_size = 1
        self.beam_size = 10
        self.bpe_model = None
        self.checkpoint = '../model/20210815_unified_conformer_exp/final.pt'
        self.config = '../model/20210815_unified_conformer_exp/train.yaml'
        self.ctc_weight = 0.5
        self.data_type = 'raw'
        self.decoding_chunk_size = -1
        self.dict = '../model/20210815_unified_conformer_exp/words.txt'
        self.gpu = -1
        self.mode = 'attention_rescoring'
        self.non_lang_syms = None
        self.num_decoding_left_chunks = -1
        self.override_config = []
        self.penalty = 0.0
        self.result_file = 'online_text'
        self.reverse_weight = 0.0
        self.simulate_streaming = False
        self.test_data = 'online_data.list'
        
        self.use_cuda = self.gpu >= 0 and torch.cuda.is_available()
        
        self.device = torch.device('cuda' if self.use_cuda else 'cpu')
        self.load_configs()  # load the configuration
        self.test_data_conf()  # set up the test data configuration
        self.loadmodel()  # load the model
    
    def load_configs(self):
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s')
        os.environ['CUDA_VISIBLE_DEVICES'] = str(self.gpu)
        
        if self.mode in ['ctc_prefix_beam_search', 'attention_rescoring'
                              ] and self.batch_size > 1:
            logging.fatal(
                'decoding mode {} must be running with batch_size == 1'.format(
                    self.mode))
            sys.exit(1)
        with open(self.config, 'r') as fin:
            self.configs = yaml.load(fin, Loader=yaml.FullLoader)
        if len(self.override_config) > 0:
            self.configs = override_config(self.configs, self.override_config)
        # Load the dictionary
        self.symbol_table = read_symbol_table(self.dict)
    
    def loadmodel(self):
        # Init asr model from configs
        model = init_asr_model(self.configs)
        
        # Load dict
        self.char_dict = {v: k for k, v in self.symbol_table.items()}
        self.eos = len(self.char_dict) - 1
        
        load_checkpoint(model, self.checkpoint)
        self.model = model.to(self.device)
        self.model.eval()
    
    def test_data_conf(self):
        '''
        Test data configuration
        '''
        self.test_conf = copy.deepcopy(self.configs['dataset_conf'])
        self.test_conf['filter_conf']['max_length'] = 102400
        self.test_conf['filter_conf']['min_length'] = 0
        self.test_conf['filter_conf']['token_max_length'] = 102400
        self.test_conf['filter_conf']['token_min_length'] = 0
        self.test_conf['filter_conf']['max_output_input_ratio'] = 102400
        self.test_conf['filter_conf']['min_output_input_ratio'] = 0
        self.test_conf['speed_perturb'] = False
        self.test_conf['spec_aug'] = False
        self.test_conf['shuffle'] = False
        self.test_conf['sort'] = False
        if 'fbank_conf' in self.test_conf:
            self.test_conf['fbank_conf']['dither'] = 0.0
        elif 'mfcc_conf' in self.test_conf:
            self.test_conf['mfcc_conf']['dither'] = 0.0
        self.test_conf['batch_conf']['batch_type'] = "static"
        self.test_conf['batch_conf']['batch_size'] = self.batch_size
        self.non_lang_syms = read_non_lang_symbols(self.non_lang_syms)
    
    def get_test_data_loader(self):
        test_dataset = Dataset(self.data_type,
                               self.test_data,
                               self.symbol_table,
                               self.test_conf,
                               self.bpe_model,
                               self.non_lang_syms,
                               partition=False)
        return DataLoader(test_dataset, batch_size=None, num_workers=0)
    
    def get_recognize(self):
        test_data_loader = self.get_test_data_loader()
        with torch.no_grad():
            for batch_idx, batch in enumerate(test_data_loader):
                keys, feats, target, feats_lengths, target_lengths = batch
                feats = feats.to(self.device)
                feats_lengths = feats_lengths.to(self.device)
                assert (feats.size(0) == 1)
                if self.mode == 'attention':
                    hyps, _ = self.model.recognize(
                        feats,
                        feats_lengths,
                        beam_size=self.beam_size,
                        decoding_chunk_size=self.decoding_chunk_size,
                        num_decoding_left_chunks=self.num_decoding_left_chunks,
                        simulate_streaming=self.simulate_streaming)
                    hyps = [hyp.tolist() for hyp in hyps]
                elif self.mode == 'ctc_greedy_search':
                    hyps, _ = self.model.ctc_greedy_search(
                        feats,
                        feats_lengths,
                        decoding_chunk_size=self.decoding_chunk_size,
                        num_decoding_left_chunks=self.num_decoding_left_chunks,
                        simulate_streaming=self.simulate_streaming)
                # ctc_prefix_beam_search and attention_rescoring only return one
                # result in List[int], change it to List[List[int]] for compatible
                # with other batch decoding mode
                elif self.mode == 'ctc_prefix_beam_search':
                    assert (feats.size(0) == 1)
                    hyp, _ = self.model.ctc_prefix_beam_search(
                        feats,
                        feats_lengths,
                        self.beam_size,
                        decoding_chunk_size=self.decoding_chunk_size,
                        num_decoding_left_chunks=self.num_decoding_left_chunks,
                        simulate_streaming=self.simulate_streaming)
                    hyps = [hyp]
                elif self.mode == 'attention_rescoring':
                    assert (feats.size(0) == 1)
                    hyp, _ = self.model.attention_rescoring(
                        feats,
                        feats_lengths,
                        self.beam_size,
                        decoding_chunk_size=self.decoding_chunk_size,
                        num_decoding_left_chunks=self.num_decoding_left_chunks,
                        ctc_weight=self.ctc_weight,
                        simulate_streaming=self.simulate_streaming,
                        reverse_weight=self.reverse_weight)
                    hyps = [hyp]
                
                content = ''
                for w in hyps[0]:
                    if w == self.eos:
                        break
                    content += self.char_dict[w]
                return content


if __name__ == '__main__':
    # Load the model
    recog = recognize()
    # Receive data in real time
    result = recog.get_recognize()
    print(result)
# Return the recognition result in real time
Modifying to support recognition of a specified audio file

This makes it possible to handle multiple audio streams in parallel.

# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import argparse
import copy
import logging
import os
import sys
import time
import torch
import yaml
from torch.utils.data import DataLoader

from wenet.dataset.dataset import Dataset
from wenet.transformer.asr_model import init_asr_model
from wenet.utils.checkpoint import load_checkpoint
from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols
from wenet.utils.config import override_config



class recognize():
    def __init__(self, ):
        
        self.root_path = os.pardir
        self.batch_size = 1
        self.beam_size = 10
        self.bpe_model = None
        self.checkpoint = '../model/20210815_unified_conformer_exp/final.pt'
        self.config = '../model/20210815_unified_conformer_exp/train.yaml'
        self.ctc_weight = 0.5
        self.data_type = 'raw'
        self.decoding_chunk_size = -1
        self.dict = '../model/20210815_unified_conformer_exp/words.txt'
        self.gpu = -1
        self.mode = 'attention_rescoring'
        self.non_lang_syms = None
        self.num_decoding_left_chunks = -1
        self.override_config = []
        self.penalty = 0.0
        self.result_file = 'online_text'
        self.reverse_weight = 0.0
        self.simulate_streaming = False
        self.test_data = 'online_data.list'
        
        self.use_cuda = self.gpu >= 0 and torch.cuda.is_available()
        
        self.device = torch.device('cuda' if self.use_cuda else 'cpu')
        self.load_configs()  # load the configuration
        self.test_data_conf()  # set up the test data configuration
        self.loadmodel()  # load the model
    
    def load_configs(self):
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s')
        os.environ['CUDA_VISIBLE_DEVICES'] = str(self.gpu)
        
        if self.mode in ['ctc_prefix_beam_search', 'attention_rescoring'
                              ] and self.batch_size > 1:
            logging.fatal(
                'decoding mode {} must be running with batch_size == 1'.format(
                    self.mode))
            sys.exit(1)
        with open(self.config, 'r') as fin:
            self.configs = yaml.load(fin, Loader=yaml.FullLoader)
        if len(self.override_config) > 0:
            self.configs = override_config(self.configs, self.override_config)
        # Load the dictionary
        self.symbol_table = read_symbol_table(self.dict)
    
    def loadmodel(self):
        # Init asr model from configs
        model = init_asr_model(self.configs)
        
        # Load dict
        self.char_dict = {v: k for k, v in self.symbol_table.items()}
        self.eos = len(self.char_dict) - 1
        
        load_checkpoint(model, self.checkpoint)
        self.model = model.to(self.device)
        self.model.eval()
    
    def test_data_conf(self):
        '''
        Test data configuration
        '''
        self.test_conf = copy.deepcopy(self.configs['dataset_conf'])
        self.test_conf['filter_conf']['max_length'] = 102400
        self.test_conf['filter_conf']['min_length'] = 0
        self.test_conf['filter_conf']['token_max_length'] = 102400
        self.test_conf['filter_conf']['token_min_length'] = 0
        self.test_conf['filter_conf']['max_output_input_ratio'] = 102400
        self.test_conf['filter_conf']['min_output_input_ratio'] = 0
        self.test_conf['speed_perturb'] = False
        self.test_conf['spec_aug'] = False
        self.test_conf['shuffle'] = False
        self.test_conf['sort'] = False
        if 'fbank_conf' in self.test_conf:
            self.test_conf['fbank_conf']['dither'] = 0.0
        elif 'mfcc_conf' in self.test_conf:
            self.test_conf['mfcc_conf']['dither'] = 0.0
        self.test_conf['batch_conf']['batch_type'] = "static"
        self.test_conf['batch_conf']['batch_size'] = self.batch_size
        self.non_lang_syms = read_non_lang_symbols(self.non_lang_syms)
    
    def get_test_data_loader(self,path):
        self.test_data=path
        test_dataset = Dataset(self.data_type,
                               self.test_data,
                               self.symbol_table,
                               self.test_conf,
                               self.bpe_model,
                               self.non_lang_syms,
                               partition=False)
        return DataLoader(test_dataset, batch_size=None, num_workers=0)
    
    def create_data_list(self,path):
        file_name = path.split("/")[-1].split(".")[0]
        filepath = "./datalist/"+file_name
        if not os.path.exists(filepath):
            with open(filepath,'w',encoding="utf-8") as file:
                file.write('{"key":"%s","wav":"/home/sunao/data/StreamAIzimu/cache/%s.wav","txt":""}'%(file_name,file_name))
        return filepath

    def get_recognize(self , path):
        path = self.create_data_list(path) 
        test_data_loader = self.get_test_data_loader(path)
        with torch.no_grad():
            for batch_idx, batch in enumerate(test_data_loader):
                keys, feats, target, feats_lengths, target_lengths = batch
                feats = feats.to(self.device)
                feats_lengths = feats_lengths.to(self.device)
                assert (feats.size(0) == 1)
                if self.mode == 'attention':
                    hyps, _ = self.model.recognize(
                        feats,
                        feats_lengths,
                        beam_size=self.beam_size,
                        decoding_chunk_size=self.decoding_chunk_size,
                        num_decoding_left_chunks=self.num_decoding_left_chunks,
                        simulate_streaming=self.simulate_streaming)
                    hyps = [hyp.tolist() for hyp in hyps]
                elif self.mode == 'ctc_greedy_search':
                    hyps, _ = self.model.ctc_greedy_search(
                        feats,
                        feats_lengths,
                        decoding_chunk_size=self.decoding_chunk_size,
                        num_decoding_left_chunks=self.num_decoding_left_chunks,
                        simulate_streaming=self.simulate_streaming)
                # ctc_prefix_beam_search and attention_rescoring only return one
                # result in List[int], change it to List[List[int]] for compatible
                # with other batch decoding mode
                elif self.mode == 'ctc_prefix_beam_search':
                    assert (feats.size(0) == 1)
                    hyp, _ = self.model.ctc_prefix_beam_search(
                        feats,
                        feats_lengths,
                        self.beam_size,
                        decoding_chunk_size=self.decoding_chunk_size,
                        num_decoding_left_chunks=self.num_decoding_left_chunks,
                        simulate_streaming=self.simulate_streaming)
                    hyps = [hyp]
                elif self.mode == 'attention_rescoring':
                    assert (feats.size(0) == 1)
                    hyp, _ = self.model.attention_rescoring(
                        feats,
                        feats_lengths,
                        self.beam_size,
                        decoding_chunk_size=self.decoding_chunk_size,
                        num_decoding_left_chunks=self.num_decoding_left_chunks,
                        ctc_weight=self.ctc_weight,
                        simulate_streaming=self.simulate_streaming,
                        reverse_weight=self.reverse_weight)
                    hyps = [hyp]
                
                content = ''
                for w in hyps[0]:
                    if w == self.eos:
                        break
                    content += self.char_dict[w]
                return content


if __name__ == '__main__':
    # Load the model
    recog = recognize()
    # Receive data in real time
    result1 = recog.get_recognize("../cache/temp.wav")
    result2 = recog.get_recognize("../cache/temp1.wav")
    result3 = recog.get_recognize("../cache/temp2.wav")
    print(result1)
    print(result2)
    print(result3)
# Return the recognition result in real time

Client

  • First determine whether someone is speaking, i.e. whether there is any data
    • Store it in the buffer and feed it to the recognition module
    • Recognition returns a result, which is stored as a subtitle
    • Subtitle length
      • Check whether this point is the end of a sentence
        • If so, break the sentence there (VAD)
        • If not, break once the subtitle exceeds 20 characters (see the sketch after this list)
  • Track the silence duration
    • If the silence lasts too long, flush the buffer, print the final result, and reset
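A minimal sketch of the segmentation rule described above (the helper name, the use of the VAD status == 3 convention from vad.py below as the end-of-utterance signal, and the hard 20-character cut are illustrative assumptions, not the final code):

MAX_SUBTITLE_LEN = 20  # assumed hard limit from the outline above


def split_subtitle(text, utterance_ended):
    # Return (finished_line, remaining_text).
    # utterance_ended is True when the VAD signals the end of an utterance,
    # in which case everything accumulated so far becomes one subtitle line;
    # otherwise a line is cut once it grows past MAX_SUBTITLE_LEN characters.
    if utterance_ended:
        return text, ""
    if len(text) >= MAX_SUBTITLE_LEN:
        return text[:MAX_SUBTITLE_LEN], text[MAX_SUBTITLE_LEN:]
    return "", text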

Recording Module

Capturing microphone data and saving the audio

Strictly speaking the audio does not need to be saved here; this is only to check that recording works correctly.

import pyaudio
import wave

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
SAMPALE_RATE = 44100  # default is 44100 (highest fidelity); use 16000 for recognition
RECORD_SECONDS = 4
temp_save_path = "Audio/temp.wav"
p = pyaudio.PyAudio()


# Save the microphone data
def save_wav(frames, save_path):
    wf = wave.open(save_path, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(2)
    wf.setframerate(SAMPALE_RATE)
    wf.writeframes(b''.join(frames))
    wf.close()
    print('\033[93m' + "buffered to cache" + '\033[0m')

# Capture microphone data
def recording(save_path):
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=SAMPALE_RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    print('\033[93m' + "recording" + '\033[0m')

    # Buffer size
    frames = []
    max_size = 16 * 4
    while 1:
        data = stream.read(CHUNK)
        # data = int.from_bytes(data, byteorder='big', signed=False)
        frames.append(data)
        if len(frames) == max_size:
            # Save the buffer
            save_wav(frames, save_path)
            # Clear the buffer
            frames = []
            break
    # Send to the server
    # result = requests(frames)
    # break
    # if result == "exit":
    #     break

recording(temp_save_path)

Simple VAD

Endpoint detection based on short-time energy and short-time zero-crossing-rate thresholds

vad.py

# -*- coding: utf-8 -*-
import numpy as np
import pyaudio

SUCCESS = 0
FAIL = 1

audio2 = ""
stream2 = ""

# TODO: recording should be made mutually exclusive with other features,
# i.e. recording is paused while certain other functions are active
def ZCR(curFrame):
    # Short-time zero-crossing rate
    tmp1 = curFrame[:-1]
    tmp2 = curFrame[1:]
    sings = (tmp1 * tmp2 <= 0)
    diffs = (tmp1 - tmp2) > 0.02
    zcr = np.sum(sings * diffs)
    return zcr


def STE(curFrame):
    # Short-time energy
    amp = np.sum(np.abs(curFrame))
    return amp


class Vad(object):
    def __init__(self, CHUNK=1024):
        # Initial short-time energy high threshold
        self.amp1 = 15
        # Initial short-time energy low threshold
        self.amp2 = 1
        # Initial short-time zero-crossing rate high threshold
        self.zcr1 = 2
        # Initial short-time zero-crossing rate low threshold
        self.zcr2 = 1
        # Maximum allowed silence length (longest pause for taking a breath)
        self.maxsilence = 45
        # Minimum speech length (filters out short, quiet noise)
        self.minlen = 40
        # Maximum energy value
        self.max_en = 20000
        # Initial state is silence
        self.status = 0
        self.count = 0
        self.silence = 0
        self.frame_len = CHUNK
        self.frame_inc = CHUNK / 2
        self.cur_status = 0

    def check_ontime(self, cache_frame):
        wave_data = np.frombuffer(cache_frame, dtype=np.int16)
        wave_data = wave_data * 1.0 / self.max_en  # max_en is 20000
        data = wave_data[np.arange(0, self.frame_len)]  # take the first frame_len samples
        # Zero-crossing rate of the frame
        zcr = ZCR(data)
        # Short-time energy of the frame, squared to amplify it
        amp = STE(data) ** 2
        # Return the status of the current audio frame
        status = self.speech_status(amp, zcr)
        return status

    def speech_status(self, amp, zcr):
        status = 0
        # 0 = silence, 1 = possible speech onset, 2 = inside a speech segment, 3 = speech ended
        if self.cur_status in [0, 1]:  # currently silence or possible speech
            # Definitely entering a speech segment
            if amp > self.amp1 or zcr > self.zcr1:  # above the high energy threshold
                status = 2
                self.silence = 0
                self.count += 1
            # Possibly speech: energy in the voiced range, ZCR in the unvoiced/voiced range
            elif amp > self.amp2 or zcr > self.zcr2:
                status = 2
                self.count += 1
            # Silence
            else:
                status = 0
                self.count = 0
        # 2 = speech segment
        elif self.cur_status == 2:
            # Still in the speech segment: energy in the voiced range, ZCR in the unvoiced/voiced range
            if amp > self.amp2 or zcr > self.zcr2:
                self.count += 1
                status = 2
            # Speech is about to end
            else:
                # The silence is not yet long enough, so speech has not ended
                self.silence += 1
                if self.silence < self.maxsilence:
                    self.count += 1
                    status = 2
                # The speech segment is too short and is treated as noise
                elif self.count < self.minlen:
                    status = 0
                    self.silence = 0
                    self.count = 0
                # Speech ends
                else:
                    status = 3
                    self.silence = 0
                    self.count = 0
        return status

Combining the Local Recording Module with the Test Module

For now the functionality is simplified to a turn-by-turn (question-and-answer) recognition flow. Since I have not yet found a good way to control console output, real-time subtitle display is still poor.

Buffer-based recognition always misses the tail of a speaker's utterance, so the trailing audio has to be appended in order to recognize the whole utterance.

import time

import pyaudio
import wave
from decoder.recognize import recognize
import numpy as np
from vad import Vad

class RecognizeService():
    def __init__(self):
        self.CHUNK = 1024
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.SAMPALE_RATE = 16000  # default is 44100 (highest fidelity); use 16000 for recognition
        self.temp_save_path = "../cache/temp1.wav"
        self.p = pyaudio.PyAudio()
        self.model = recognize()
        self.stream = self.p.open(format=self.FORMAT,
                                  channels=self.CHANNELS,
                                  rate=self.SAMPALE_RATE,
                                  input=True,
                                  frames_per_buffer=self.CHUNK,
                                  input_device_index=0)
        self.v = Vad(self.CHUNK)

    # Save the microphone data
    def save_wav(self, frames, save_path):
        wf = wave.open(save_path, 'wb')
        wf.setnchannels(self.CHANNELS)
        wf.setsampwidth(self.p.get_sample_size(self.FORMAT))
        wf.setframerate(self.SAMPALE_RATE)
        wf.writeframes(b''.join(frames))
        wf.close()

    # print('\033[93m' + "buffered to cache" + '\033[0m')

    # Capture microphone data
    def recording(self, save_path):

        print('\033[93m' + "recording" + '\033[0m')

        # Buffer size
        frames = []
        max_size = 5
        long_frames = []
        next = ""
        num = 0
        is_speak = False
        result = ""
        while True:
            stream_data = self.stream.read(self.CHUNK, exception_on_overflow=False)
            status = self.v.check_ontime(stream_data)
            if status == 2:
                is_speak = True
                # Convert the raw bytes into int16 samples
                wave_data = np.frombuffer(stream_data, dtype=np.int16)
                frames.append(wave_data)
                if len(frames) >= max_size:
                    long_frames.extend(frames)
                    if len(long_frames) > max_size * 10:
                        long_frames = long_frames[-max_size * 10:]
                    # Write the cache and recognize it
                    self.save_wav(long_frames, self.temp_save_path)
                    result = self.model.get_recognize(self.temp_save_path)
                    # Clear the buffer
                    frames = []
                    if next == result:
                        continue
                    next = result

            if status == 0:
                num += 1
                if num == 10:  # stop recognizing
                    if is_speak:
                        if len(frames) > 0 and len(long_frames) > 0:  # append the trailing audio, if any
                            long_frames.extend(frames)
                            self.save_wav(long_frames, self.temp_save_path)
                            result = self.model.get_recognize(self.temp_save_path)
                        if result != "":
                            print(result)
                    num = 0
                    # Silence: drop the long buffer
                    long_frames = []
                    # Clear the buffer
                    frames = []
                    is_speak = False
                    result = ""

if __name__ == '__main__':
    service = RecognizeService()
    service.recording(service.temp_save_path)

Modifying for Online Real-Time Recognition
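This part is still to be done. The idea is that, instead of calling the local model, RecognizeService would post the buffered audio to the Flask service and display whatever text comes back. Below is a minimal sketch under the assumption that the server from the earlier section runs at http://127.0.0.1:5000/recognize and accepts raw PCM bytes via POST; the URL, endpoint, and helper name are assumptions for illustration.

import requests

SERVER_URL = "http://127.0.0.1:5000/recognize"  # assumed address of the Flask service


def remote_recognize(frames):
    # frames is the same list of raw PCM chunks that save_wav() would have written;
    # the server wraps them in a WAV file, decodes them, and returns the text.
    resp = requests.post(SERVER_URL, data=b''.join(frames), timeout=5)
    resp.raise_for_status()
    return resp.text

In RecognizeService.recording, the two self.model.get_recognize(self.temp_save_path) calls would then be replaced by remote_recognize(long_frames), so that the model only has to live on the server.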
