python实时语音监听

第一种方法

创建虚拟环境

conda create -p E:\Python\envs\audiostu  python=3.8
conda activate E:\Python\envs\audiostu

安装需要的包

pip uninstall protobuf
pip uninstall google
pip install protobuf==3.20.2
pip install typeguard==2.5.0
pip install attrdict==2.0.1
pip install text2digits==0.1.0
pip install configargparse==1.5.3
pip install webrtcvad==2.0.10
pip install PySoundFile==0.9.0
pip install -r requirements.txt

修改yaml文件

accum_grad: 16
cmvn_file: conf/global_cmvn
data_conf:
  batch_conf:
    batch_size: 1
    batch_type: static
  fbank_conf:
    dither: 0.0
    frame_length: 25
    frame_shift: 10
    num_mel_bins: 80
  filter_conf:
    max_length: 40960
    min_length: 0
    token_max_length: 200
    token_min_length: 1
  resample_conf:
    resample_rate: 16000
  shuffle: False
  shuffle_conf:
    shuffle_size: 1500
  sort: False
  sort_conf:
    sort_size: 1000
  spec_aug: true
  spec_aug_conf:
    max_f: 10
    max_t: 50
    num_f_mask: 2
    num_t_mask: 2
  speed_perturb: False
decoder: transformer
decoder_conf:
  attention_heads: 8
  dropout_rate: 0.1
  linear_units: 2048
  num_blocks: 6
  positional_dropout_rate: 0.1
  self_attention_dropout_rate: 0.0
  src_attention_dropout_rate: 0.0
encoder: conformer
encoder_conf:
  activation_type: swish
  attention_dropout_rate: 0.0
  attention_heads: 8
  cnn_module_kernel: 15
  cnn_module_norm: layer_norm
  dropout_rate: 0.1
  input_layer: conv2d
  linear_units: 2048
  normalize_before: true
  num_blocks: 12
  output_size: 512
  pos_enc_layer_type: rel_pos
  positional_dropout_rate: 0.1
  selfattention_layer_type: rel_selfattn
  use_cnn_module: true
  #use_dynamic_chunk: true
  #use_dynamic_left_chunk: false
grad_clip: 5
input_dim: 80
is_json_cmvn: true
log_interval: 100
#max_epoch: 36
model_conf:
  ctc_weight: 0.3
  length_normalized_loss: false
  lsm_weight: 0.1
optim: adam
optim_conf:
  lr: 0.001
output_dim: 5537
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 5000

engine_sample_rate_hertz: 16000
engine_max_decoders: 1
engine_max_inactivity_secs: 3

model_path: conf/final.pt
dict_path: conf/words.txt

beam_size: 10
mode: ctc_greedy_search
decoding_chunk_size: 11
num_decoding_left_chunks: -1
#override_config:
#penalty: 
gpu: 1
audio_save_path: G:\Python\ASR_python_deploy-main\audio_save

修改 grpc_WenetEngine.py:将其中用 open() 打开文件的地方改为使用 utf-8 编码(即传入 encoding='utf-8')
在这里插入图片描述
运行即可

python server.py --model_config conf/decode_engine_V3.yaml --host 0.0.0.0 --port 9876 --vad_aggressiveness 3

参考
这个失败了

第二种方法:使用 pyaudio + 飞桨(PaddleSpeech)实现

安装步骤参考
安装 GPU 版本的飞桨语音(PaddleSpeech)
这个方法监听的是系统播放的声音(通过“立体声混音”内录设备)
声音录制监听

import pyaudio
import wave
from enum import Enum
import numpy as np
import time
from threading import Thread
from paddlespeech.cli.tts.infer import TTSExecutor
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.text.infer import TextExecutor


def findInternalRecordingDevice(p):
    """Return the index of the loop-back ("Stereo Mix") recording device.

    Devices are scanned in index order and the first one whose ``name``
    contains the keyword ``立体声混音`` wins. The original author notes this
    was tested on Windows and that ``hostApi == 0`` denotes an MME device;
    that extra filter is not applied here.

    Args:
        p: an object exposing ``get_device_count()`` and
            ``get_device_info_by_index(i)`` (a ``pyaudio.PyAudio`` instance
            in this script).

    Returns:
        The matching device index, or ``-1`` (after printing a notice) when
        no device name contains the keyword.
    """
    keyword = '立体声混音'
    # Lazy scan: device info is only queried until the first match is found.
    candidates = (
        index
        for index in range(p.get_device_count())
        if keyword in p.get_device_info_by_index(index)['name']
    )
    found = next(candidates, None)
    if found is None:
        print('无法找到内录设备!')
        return -1
    return found

class AudioState(Enum):
    '''Marks which phase the audio stream callback is currently in.'''
    LISTENING = 1  # listening state
    RECORDING = 2  # recording state
    PLAYING = 3  # playing state


# Shared PyAudio instance: used for device lookup, opening streams and
# querying the sample width when writing WAV files.
p = pyaudio.PyAudio()
# Index of the loop-back input device; findInternalRecordingDevice returns -1
# when no such device exists.
INTERNAL_RECORDING_DEVICE_ID = findInternalRecordingDevice(p);
# Raw audio buffers collected by the stream callback for the current recording.
frames = []
# time.time() value captured when the current recording started (None until then).
stopWatch_Record = None
# Current phase of the callback state machine; it starts out listening.
audioState = AudioState.LISTENING
# A buffer whose sample standard deviation reaches this value starts a recording.
VOICE_STD_THRESHOLD = 150
RECORD_MAX_SECONDS = 5  # record for at most 5 s
AUDIO_RATE = 16000 # sample rate used for the stream and the WAV header; 44100 is the commented-out alternative
AUDIO_CHANNEL =1 # channel count (1 = mono); 2 is the commented-out alternative
# PaddleSpeech ASR executor; called with the saved WAV file in handleVoice.
asr = ASRExecutor()

def getStdOfVoiceFrame(in_data):
    """Return the standard deviation of one buffer of audio samples.

    Args:
        in_data: bytes-like buffer that is reinterpreted as ``np.short``
            samples.

    Returns:
        The standard deviation of the samples.
    """
    samples = np.frombuffer(in_data, dtype=np.short)
    return samples.std()


def handleVoice(frames):
    """Save the recorded buffers as a WAV file and print its transcription.

    The file is written to the current working directory and named after the
    local time (``YYYYmmddHHMMSS.wav``). It is then passed to the module-level
    ``asr`` executor and the recognition result is printed.

    Args:
        frames: list of raw audio byte buffers, in recording order.
    """
    print("处理接收到的声音")
    audio_name = time.strftime("%Y%m%d%H%M%S", time.localtime()) + ".wav"
    # The context manager closes the file even if a setter or the write
    # raises, so the handle is never leaked and the header is finalised
    # before the recogniser opens the file.
    with wave.open(audio_name, 'wb') as wf:
        # WAV header parameters must match the settings the stream was opened with.
        wf.setnchannels(AUDIO_CHANNEL)
        wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
        wf.setframerate(AUDIO_RATE)
        wf.writeframes(b''.join(frames))
    # Speech-to-text on the file that was just written.
    result = asr(audio_file=audio_name, force_yes=True)
    print(result)

def callback(in_data, frame_count, time_info, status):
    """Stream callback that drives the listen -> record -> hand-off cycle.

    * LISTENING: when the buffer's sample standard deviation reaches
      ``VOICE_STD_THRESHOLD``, note the start time, keep the buffer and
      switch to RECORDING.
    * RECORDING: keep every buffer; once more than ``RECORD_MAX_SECONDS``
      have elapsed, pass the collected buffers to ``handleVoice`` on a new
      thread, start a fresh buffer list and switch to PLAYING.
    * PLAYING: switch back to LISTENING and report ``paComplete`` for this
      buffer.

    Args:
        in_data: the captured audio buffer.
        frame_count: unused.
        time_info: unused.
        status: unused.

    Returns:
        ``(data, flag)`` where ``data`` is a zero-filled buffer the same
        length as ``in_data`` and ``flag`` is ``pyaudio.paContinue`` or
        ``pyaudio.paComplete``.
    """
    global audioState, frames, stopWatch_Record, RECORD_MAX_SECONDS

    flag = pyaudio.paContinue

    if audioState == AudioState.LISTENING:
        # The loudness check only runs while listening.
        if getStdOfVoiceFrame(in_data) >= VOICE_STD_THRESHOLD:
            stopWatch_Record = time.time()
            print('录制中...')
            frames.append(in_data)
            audioState = AudioState.RECORDING
    elif audioState == AudioState.RECORDING:
        elapsed = time.time() - stopWatch_Record
        frames.append(in_data)
        if elapsed > RECORD_MAX_SECONDS:
            # Time limit exceeded: hand the recording to a worker thread.
            audioState = AudioState.PLAYING
            Thread(target=handleVoice, args=(frames,)).start()
            # Rebind (not clear) so the worker keeps its own list of buffers.
            frames = []
            print('处理完成')
    elif audioState == AudioState.PLAYING:
        print("音频播放")
        audioState = AudioState.LISTENING
        flag = pyaudio.paComplete

    # No audio is played back, so the output side is always silence.
    return (bytes(len(in_data)), flag)


def _open_listening_stream():
    """Open a callback-driven 16-bit input stream on the loop-back device."""
    return p.open(
        input_device_index=INTERNAL_RECORDING_DEVICE_ID,
        format=pyaudio.paInt16,
        channels=AUDIO_CHANNEL,  # 1 = mono, 2 = stereo
        rate=AUDIO_RATE,
        input=True,
        frames_per_buffer=1024,
        stream_callback=callback,
    )


def start():
    """Listen on the loop-back device forever, reopening the stream as needed.

    ``callback`` returns ``paComplete`` after each recording has been handed
    off; once the stream is no longer active it is closed and a new one is
    opened so listening resumes.

    The function only exits through an exception (including Ctrl+C). In that
    case any open stream is stopped and closed, the shared PyAudio instance
    is terminated, and the exception propagates to the caller.
    """
    stream = None
    try:
        while True:
            stream = _open_listening_stream()
            stream.start_stream()
            print("开始监听...")
            # Sleep between polls instead of spinning, so the wait does not
            # pin a CPU core while the audio thread does the real work.
            while stream.is_active():
                time.sleep(0.05)
            stream.stop_stream()
            stream.close()
            stream = None  # nothing left for the cleanup below to release
    finally:
        # ``finally`` (rather than ``except Exception``) also covers
        # KeyboardInterrupt, the normal way to stop this script.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()


if __name__ == '__main__':
    # Start listening only when a loop-back device index was resolved at
    # import time; a negative index means the lookup failed.
    if INTERNAL_RECORDING_DEVICE_ID >= 0:
        start()
    else:
        print("没有找到内置录音设备")
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

假装我不帅

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值