flask+python调用百度语音识别封装成一个接口

项目要求:实现一个语音输入的搜索框,前端调用接口,进行语音识别,后台负责调用百度语音识别处理语音,并返回结果。
注释:后台需要判断用户语音什么时候终止

# -*- coding: utf-8 -*-
# audioPart02.py 
import json
import flask
from flask import Flask
from flask import request
from flask import redirect
from flask import jsonify
import urllib.request
import urllib
import json
import base64
import os
import datetime
from pyaudio import PyAudio, paInt16
import numpy as np 
import pyaudio
import baiduASR
import time
import wave
import webrtcvad
import copy
from numba import jit
import contextlib
import sys
import wave
import random

SAMPLING_RATE=44100      # sampling rate of the raw audio posted by the client
TargetFrameRate=16000    # target rate after downsampling (webrtcvad/Baidu accept 8/16/32 kHz)
framesz=30               # frame length in ms; webrtcvad accepts only 10, 20 or 30 ms

vad = webrtcvad.Vad(1)   # VAD aggressiveness level 0-3; original note claims higher =
                         # more speech-sensitive — webrtcvad docs say higher is MORE
                         # aggressive at filtering non-speech; verify before relying on it

app = Flask(__name__)

# Baidu speech API credentials — fill in your own application's key/secret here.
api_key = "***********************"
api_secert = "***************************"
bdr =  baiduASR.BaiduRest("test_python", api_key, api_secert)

@jit
def Resample(input_signal,src_fs,tar_fs): 
    '''
    Resample a 1-D signal to a new sampling rate via linear interpolation.

    :param input_signal: 1-D numpy array of audio samples
    :param src_fs: sampling rate of the input signal (Hz)
    :param tar_fs: desired output sampling rate (Hz)
    :return: resampled signal, same dtype as the input
    ''' 
    dtype = input_signal.dtype 
    audio_len = len(input_signal) 
    audio_time_max = 1.0*(audio_len-1) / src_fs 
    # Time axis of the source samples.
    # NOTE(review): the linspace endpoint is audio_len rather than audio_len-1,
    # so the last source time slightly overshoots; np.interp clamps at the
    # edges, behaviour preserved as-is.
    src_time = 1.0 * np.linspace(0,audio_len,audio_len) / src_fs 
    # Bug fix: np.int was deprecated in NumPy 1.20 and removed in 1.24
    # (AttributeError on modern NumPy); the builtin int is the documented
    # replacement and is also supported inside numba-jitted code.
    tar_len = int(audio_time_max*tar_fs)
    tar_time = 1.0 * np.linspace(0,tar_len,tar_len) / tar_fs 
    output_signal = np.interp(tar_time,src_time,input_signal).astype(dtype) 
    return output_signal

class Frame(object):
    """A single chunk of PCM audio plus its position in the stream.

    Attributes:
        bytes:     raw PCM payload of this frame
        timestamp: start offset of the frame within the stream, in seconds
        duration:  length of the frame, in seconds
    """
    def __init__(self, bytes, timestamp, duration):
        # Store the payload and its timing metadata in one shot.
        self.bytes, self.timestamp, self.duration = bytes, timestamp, duration

def frame_generator(frame_duration_ms, audio, sample_rate):
    """
    Split raw 16-bit PCM data into fixed-duration Frame objects.

    frame_duration_ms -- desired frame length in milliseconds
    audio             -- PCM byte string (2 bytes per sample assumed)
    sample_rate       -- sampling rate of *audio* in Hz

    Yields Frame instances front-to-back; any trailing chunk that does not
    strictly fit before the end of the data is not emitted.
    """
    # Bytes per frame: samples-per-frame times 2 bytes per 16-bit sample.
    frame_bytes = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    # Frame length in seconds (undo the 2-bytes-per-sample factor).
    frame_seconds = (float(frame_bytes) / sample_rate) / 2.0
    total = len(audio)
    start = 0
    t = 0.0
    while start + frame_bytes < total:
        yield Frame(audio[start:start + frame_bytes], t, frame_seconds)
        t += frame_seconds
        start += frame_bytes

# Main entry point.
# Improvement over the first version: the client may POST the audio in ~200ms
# chunks; the backend concatenates them and recognises the merged buffer,
# which responds faster than waiting for the whole utterance.
mergeAudio = bytearray()   # accumulated 16 kHz PCM for the current utterance
@app.route('/audio' , methods=['GET', 'POST'])
def audio():
    """Receive one audio chunk, run VAD on it and Baidu ASR on the merged buffer.

    Returns a JSON object with:
        result -- text recognised so far (may be "")
        isStop -- 1 when the utterance is considered finished, else 0
    """
    dict2={}
    global mergeAudio
    if request.method == 'POST':
        binAudio = request.files['audioData'].read()
        # Bug fix: np.fromstring is removed in modern NumPy; frombuffer is
        # the drop-in replacement for binary input.
        waveData = np.frombuffer(binAudio, dtype=np.int16)
        outputAudio = Resample(waveData,SAMPLING_RATE,TargetFrameRate)
        # Bug fix: ndarray.tostring() is removed; tobytes() is byte-identical.
        outputAudio = outputAudio.tobytes()
        mergeAudio.extend(outputAudio)
        print('[2] the length of mergeAudio :: ',len(mergeAudio))
        # webrtcvad only accepts 8/16/32 kHz input — hence the resample above.
        frames = list(frame_generator(framesz, outputAudio, TargetFrameRate))
        num_voiced = [1 if vad.is_speech(f.bytes, TargetFrameRate) else 0 for f in frames]

        start1 = time.time()
        text = bdr.getText(bytes(mergeAudio))
        print("Baidu cost start1 :: ",time.time()-start1)
        dict2["result"] = text
        # End-of-utterance: we already have some text AND the chunk just
        # received contains no voiced frame at all (instead of checking the
        # last ~20 frames of the whole buffer, as the first version did).
        if text!='' and sum(num_voiced) == 0:
            dict2["isStop"] = 1
            # Bug fix: reset the buffer so the next utterance does not start
            # with the previous one's audio (it previously grew forever).
            mergeAudio = bytearray()
            print("First dict2::",dict2)
            return json.dumps(dict2)
        else:
            dict2["isStop"] = 0
            print("second dict2:: ",dict2)
            return json.dumps(dict2)

    else:
        return '<h1>只接受post请求!</h1>'

@app.route('/')
def Hello():
    """Health-check endpoint: confirms the server is up."""
    greeting = "Hello, World!"
    return greeting

if __name__ == "__main__":
    # Listen on every interface on port 5051. NOTE(review): debug=True enables
    # the Werkzeug reloader and interactive debugger — fine for this demo, but
    # it must not ship to production.
    app.run("0.0.0.0",port=5051,debug=True)

下面这段是调用百度语音识别接口的封装代码(即上面导入的 baiduASR 模块):

import urllib.request
import urllib
import json
import base64
import os
import subprocess
import datetime
from pyaudio import PyAudio, paInt16
import numpy as np 
import wave
import pyaudio

class BaiduRest:
    """Thin REST client for Baidu's speech services (OAuth token + ASR)."""

    def __init__(self, cu_id, api_key, api_secert):
        """Store endpoints/device id and fetch an access token immediately.

        cu_id      -- arbitrary unique device identifier required by Baidu
        api_key    -- Baidu application API key
        api_secert -- Baidu application secret key
        """
        # OAuth token endpoint (client-credentials flow).
        self.token_url = "https://openapi.baidu.com/oauth/2.0/token?grant_type=client_credentials&client_id=%s&client_secret=%s"
        # Text-to-speech REST url (unused by the Flask app but kept for callers).
        self.getvoice_url = "http://tsn.baidu.com/text2audio?tex=%s&lan=zh&cuid=%s&ctp=1&tok=%s"
        # Speech-recognition REST url.
        self.upvoice_url = 'http://vop.baidu.com/server_api'
        self.cu_id = cu_id
        self.getToken(api_key, api_secert)

    def getToken(self, api_key, api_secert):
        """Fetch and cache the OAuth access token used by later API calls."""
        token_url = self.token_url % (api_key,api_secert)
        # Robustness: bound the call so a dead network cannot hang startup.
        r_str = urllib.request.urlopen(token_url, timeout=10).read()
        token_data = json.loads(r_str.decode('utf-8'))
        self.token_str = token_data['access_token']

    def getText(self, voice_data):
        """Send raw audio bytes to Baidu ASR; return the best transcript or "".

        voice_data -- audio payload (the caller sends headerless 16 kHz PCM)
        """
        data = {}
        # Audio/request parameters expected by the vop.baidu.com endpoint.
        # NOTE(review): the Flask caller posts raw headerless PCM while
        # format is declared 'wav' — Baidu's raw-PCM code is 'pcm'; kept
        # as-is since the original reportedly worked, but worth confirming.
        data['format'] = 'wav'
        data['rate'] = 16000
        data['channel'] = 1
        data['cuid'] = self.cu_id
        data['token'] = self.token_str
        data['len'] = len(voice_data)
        data['dev_pid'] = 1536      # 1536 = Mandarin, general-purpose model
        data['speech'] = base64.b64encode(voice_data).decode('utf-8')
        post_data = json.dumps(data)
        r_data = urllib.request.urlopen(
            self.upvoice_url,
            data=bytes(post_data, encoding="utf-8"),
            timeout=10,             # robustness: don't hang the HTTP route
        ).read()
        # Best-effort parse: an error payload has no 'result' key -> "".
        # Bug fix: narrowed from BaseException, which also swallowed
        # KeyboardInterrupt/SystemExit; json decode errors are ValueError.
        try:
            result = json.loads(r_data.decode('utf-8'))['result'][0]
        except (KeyError, IndexError, ValueError):
            result = ""
        return result

总结:
这个小项目是自己闲来无事做的,只供参考。
GitHub链接:
https://github.com/hqq624308/Baidu_Api-speech_recognition-

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 5
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值