Approach
Use Python's pyaudio library to record speech and save it to a local file, then send the file to the Baidu speech recognition API, and use the recognized text to decide where the next stone should be placed.
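The last step, mapping the recognized text to a board position, depends on the game logic and is not included in the snippets below. As a rough illustration only (the spoken command format and the helper below are assumptions, not the project's actual code), the idea looks something like this:

import re

# Hypothetical helper: assume the player says something like "三行五列"
# ("row three, column five") and Baidu ASR returns that text.
CN_DIGITS = {'一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
             '六': 6, '七': 7, '八': 8, '九': 9, '十': 10}

def parse_move(text):
    # Return (row, col) as 1-based coordinates, or None if the text does not match
    m = re.search('([一二三四五六七八九十])行([一二三四五六七八九十])列', text)
    if m is None:
        return None
    return CN_DIGITS[m.group(1)], CN_DIGITS[m.group(2)]

print(parse_move('三行五列。'))  # -> (3, 5)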
Results
Login page
Game page
Steps
The complete code is too long to post in the article, so only the most important parts are shown here: the recording code and the speech recognition code. A link to the full repository is at the end.
Recording code
# Record speech and save it to a local wav file
import wave

import numpy as np
from pyaudio import PyAudio, paInt16


class GenAudio(object):
    def __init__(self):
        self.num_samples = 4000     # pyaudio buffer size (samples read per block)
        self.sampling_rate = 16000  # sampling rate
        self.level = 1500           # amplitude threshold for treating a sample as "sound"
        self.count_num = 20         # keep recording if more than count_num samples in a block exceed level
        self.save_length = 8        # minimum recording length: save_length * num_samples samples
        self.time_count = 16        # maximum number of blocks to read before giving up
        self.voice_string = []

    # Save the recorded blocks to a wav file
    def save_wav(self, filename):
        wf = wave.open(filename, 'wb')
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(self.sampling_rate)
        wf.writeframes(b''.join(self.voice_string))  # concatenate the raw int16 blocks
        wf.close()

    def read_audio(self):
        pa = PyAudio()
        stream = pa.open(format=paInt16, channels=1, rate=self.sampling_rate, input=True,
                         frames_per_buffer=self.num_samples)
        save_count = 0
        save_buffer = []
        time_count = self.time_count
        while True:
            time_count -= 1
            # Read num_samples samples
            string_audio_data = stream.read(self.num_samples)
            # Convert the raw bytes into an int16 array
            # audio_data = np.fromstring(string_audio_data, dtype=np.short)
            audio_data = np.frombuffer(string_audio_data, dtype='int16')
            # Count the samples that exceed level
            large_sample_count = np.sum(audio_data > self.level)
            print(np.max(audio_data), "large_sample_count=>", large_sample_count)
            # If more than count_num samples are loud, keep at least save_length more blocks
            if large_sample_count > self.count_num:
                save_count = self.save_length
            else:
                save_count -= 1
            if save_count < 0:
                save_count = 0
            if save_count > 0:
                save_buffer.append(string_audio_data)
            else:
                if len(save_buffer) > 0:
                    self.voice_string = save_buffer
                    save_buffer = []
                    print("Recorded a piece of voice successfully!")
                    return True
            if time_count == 0:
                if len(save_buffer) > 0:
                    self.voice_string = save_buffer
                    save_buffer = []
                    print("Recorded a piece of voice successfully!")
                    return True
                else:
                    return False


# Uncomment this block to test the recorder on its own
# if __name__ == "__main__":
#     r = GenAudio()
#     r.read_audio()
#     r.save_wav("./templates/test.wav")
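In the full project the recorder is not run as a script; the Flask app imports GenAudio and calls it. A minimal usage sketch (assuming the code above lives in a module called gen_audio, which is just an illustrative name):

from gen_audio import GenAudio

recorder = GenAudio()
if recorder.read_audio():                      # blocks until a piece of voice is captured or time runs out
    recorder.save_wav("./templates/test.wav")  # the recognition code below reads this file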
Speech recognition code
To use the Baidu speech recognition API, first register at 短语音识别标准版_短语音识别-百度AI开放平台 (baidu.com).
The exact registration steps are easy to find online. New users get tens of thousands of free calls, and even if you are not a new user the service is cheap: 100 calls cost about 0.3 RMB, which is more than enough for testing a demo.
After registering you will have an API key and a secret key; put those two values into the corresponding places in the code below.
Here is the code:
# coding=utf-8
# This file follows the official Baidu API demo
import sys
import json
import base64
import time

IS_PY3 = sys.version_info.major == 3

if IS_PY3:
    from urllib.request import urlopen
    from urllib.request import Request
    from urllib.error import URLError
    from urllib.parse import urlencode
    timer = time.perf_counter
else:
    from urllib2 import urlopen
    from urllib2 import Request
    from urllib2 import URLError
    from urllib import urlencode
    if sys.platform == "win32":
        timer = time.clock
    else:
        # On most other platforms the best timer is time.time()
        timer = time.time

# Your API_KEY
API_KEY = 'fk6lgD'
# Your SECRET_KEY
SECRET_KEY = 'GELxN59GvdVBSpaErcG'

# File to recognize
AUDIO_FILE = './templates/test.wav'  # only pcm/wav/amr are supported; the fast edition also supports m4a
# File format
FORMAT = AUDIO_FILE[-3:]  # the extension must be pcm/wav/amr; the fast edition also supports m4a

CUID = '123456PYTHON'
# Sampling rate
RATE = 16000  # fixed value

# Standard edition
DEV_PID = 1537  # 1537 = Mandarin with the input-method model; see the docs for other PIDs, languages and models
ASR_URL = 'http://vop.baidu.com/server_api'
SCOPE = 'audio_voice_assistant_get'  # this scope grants ASR access; if it is missing, enable it in the web console (very old apps may not have it)

# For the self-trained model platform, enable the settings below. Once your model is online
# you will see "获取专属模型参数 pid:8001, modelid:1234"; fill in dev_pid and lm_id accordingly
# DEV_PID = 8001
# LM_ID = 1234

# Skip the scope check (very old apps may not have the scope)
# SCOPE = False


class DemoError(Exception):
    pass


"""  TOKEN start """

TOKEN_URL = 'http://aip.baidubce.com/oauth/2.0/token'


def fetch_token():
    params = {'grant_type': 'client_credentials',
              'client_id': API_KEY,
              'client_secret': SECRET_KEY}
    post_data = urlencode(params)
    if (IS_PY3):
        post_data = post_data.encode('utf-8')
    req = Request(TOKEN_URL, post_data)
    try:
        f = urlopen(req)
        result_str = f.read()
    except URLError as err:
        result_str = err.read()
    if (IS_PY3):
        result_str = result_str.decode()

    result = json.loads(result_str)
    if ('access_token' in result.keys() and 'scope' in result.keys()):
        if SCOPE and (not SCOPE in result['scope'].split(' ')):  # SCOPE = False skips this check
            raise DemoError('scope is not correct')
        return result['access_token']
    else:
        raise DemoError('MAYBE API_KEY or SECRET_KEY not correct: access_token or scope not found in token response')

"""  TOKEN end """
# Uncomment this block to test speech-to-text on its own
# if __name__ == '__main__':
#     token = fetch_token()
#
#     speech_data = []
#     with open(AUDIO_FILE, 'rb') as speech_file:
#         speech_data = speech_file.read()
#
#     length = len(speech_data)
#     if length == 0:
#         raise DemoError('file %s length read 0 bytes' % AUDIO_FILE)
#     speech = base64.b64encode(speech_data)
#     if (IS_PY3):
#         speech = str(speech, 'utf-8')
#     params = {'dev_pid': DEV_PID,
#               # "lm_id": LM_ID,  # enable when testing the self-trained model platform
#               'format': FORMAT,
#               'rate': RATE,
#               'token': token,
#               'cuid': CUID,
#               'channel': 1,
#               'speech': speech,
#               'len': length
#               }
#     post_data = json.dumps(params, sort_keys=False)
#     # print post_data
#     req = Request(ASR_URL, post_data.encode('utf-8'))
#     req.add_header('Content-Type', 'application/json')
#     try:
#         begin = timer()
#         f = urlopen(req)
#         result_str = f.read()
#         print("Request time cost %f" % (timer() - begin))
#     except URLError as err:
#         print('asr http response http code : ' + str(err.code))
#         result_str = err.read()
#
#     if (IS_PY3):
#         result_str = str(result_str, 'utf-8')
#     print(result_str)
#     with open("result.txt", "w") as of:
#         of.write(result_str)
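The request logic above only exists inside the commented-out main block. To call it from the Flask app it helps to wrap it in a function; the sketch below is one possible wrapper (it simply repackages the demo code above and reuses the constants and fetch_token defined in this file; the function name recognize is made up for illustration):

def recognize(wav_path=AUDIO_FILE):
    # Send a local wav file to Baidu ASR and return the recognized text ('' on failure)
    token = fetch_token()
    with open(wav_path, 'rb') as speech_file:
        speech_data = speech_file.read()
    if len(speech_data) == 0:
        raise DemoError('file %s length read 0 bytes' % wav_path)
    speech = base64.b64encode(speech_data)
    if IS_PY3:
        speech = str(speech, 'utf-8')
    params = {'dev_pid': DEV_PID, 'format': FORMAT, 'rate': RATE, 'token': token,
              'cuid': CUID, 'channel': 1, 'speech': speech, 'len': len(speech_data)}
    req = Request(ASR_URL, json.dumps(params).encode('utf-8'))
    req.add_header('Content-Type', 'application/json')
    try:
        result_str = urlopen(req).read()
    except URLError as err:
        result_str = err.read()
    if IS_PY3:
        result_str = str(result_str, 'utf-8')
    result = json.loads(result_str)
    # On success Baidu returns err_no == 0 and a 'result' list containing the transcript
    return result['result'][0] if result.get('err_no') == 0 else ''

A Flask view can then call GenAudio().read_audio(), save the recording with save_wav(AUDIO_FILE), and pass the same path to recognize() to get the text that drives the next move.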
If anything is unclear, leave me a comment and I will reply when I see it.
Git repository for the full project: 基于flask的语音控制五子棋: 基于flask的语音控制五子棋,使用百度语音识别api (gitee.com)