基于讯飞开放平台实现语音合成（文字转语音）

忆盎

已于 2024-03-13 11:41:20 修改

阅读量388

点赞数 6

文章标签： python

于 2024-03-13 10:59:25 首次发布

本文链接：https://blog.csdn.net/qq_62141522/article/details/136674041

版权

本文介绍了如何在Python环境中利用讯飞开放平台的语音合成功能，包括设置参数、创建URL、处理WebSocket消息和错误，以及提供了一个完整的`text2wav`函数作为示例，展示了从文本到音频文件的合成过程。

摘要由CSDN通过智能技术生成

# 调用讯飞开放平台中的语音合成功能

第一步：进入讯飞开放平台-以语音交互为核心的人工智能开放平台 (xfyun.cn)

第二步：点击控制台

第三步：找到语音合成--》在线语音合成

第四步：点击右边的语音合成文档，找到调用示例，下载想要使用的编程语言对应的demo

tts_ws_python3_demo.py部分代码

# -*- coding:utf-8 -*-
#
#   author: iflytek
#
#  本demo测试时运行的环境为：Windows + Python3.7
#  本demo测试成功运行时所安装的第三方库及其版本如下：
#   cffi==1.12.3
#   gevent==1.4.0
#   greenlet==0.4.15
#   pycparser==2.19
#   six==1.12.0
#   websocket==0.2.1
#   websocket-client==0.56.0
#   合成小语种需要传输小语种文本、使用小语种发音人vcn、tte=unicode以及修改文本编码方式
#  错误码链接：https://www.xfyun.cn/document/error-code （code返回错误码时必看）
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
import _thread as thread
import base64
import datetime
import hashlib
import hmac
import json
import os
import ssl
import wave
from datetime import datetime
from time import mktime
from urllib.parse import urlencode
from wsgiref.handlers import format_date_time

import websocket

STATUS_FIRST_FRAME = 0  # marker for the first frame
STATUS_CONTINUE_FRAME = 1  # marker for an intermediate frame
STATUS_LAST_FRAME = 2  # marker for the last frame

# Intermediate raw-PCM file: on_open() deletes it and text2wav() converts it to WAV.
PCM_PATH = 'demo.pcm'


class Ws_Param(object):
    """Holds the credentials and request payload pieces for one TTS request.

    The attributes are filled in by ``set_params`` / ``set_tts_params`` and
    read back when the request frame and the signed websocket URL are built.
    """

    def __init__(self):
        # Open-platform credentials (appid, api_key, api_secret); empty until set.
        self.APPID = self.APIKey = self.APISecret = ""
        # Voice name and the three sections of the request frame.
        self.tts_vcn = ""
        self.tts_business_args = self.tts_common_args = self.tts_text_data = ""

    def set_tts_params(self, text, vcn):
        """Store the voice name and build the "business" and "data" sections.

        The text is UTF-8 encoded and then base64 encoded before being stored.
        """
        self.tts_vcn = vcn
        self.tts_business_args = {
            "aue": "raw",
            "auf": "audio/L16;rate=16000",
            "vcn": vcn,
            "tte": "utf8",
        }
        encoded_text = base64.b64encode(text.encode('utf-8')).decode("UTF8")
        self.tts_text_data = {"status": 2, "text": encoded_text}

    def set_params(self, appid, api_seccret, api_key):
        """Store each credential that is not the empty string.

        A non-empty ``appid`` also rebuilds the "common" request section.
        """
        if appid != "":
            self.APPID = appid
            self.tts_common_args = {"app_id": appid}
        if api_seccret != "":
            self.APISecret = api_seccret
        if api_key != "":
            self.APIKey = api_key

    def create_url(self):
        """Return the websocket URL with the signed authorization query string."""
        endpoint = 'wss://tts-api.xfyun.cn/v2/tts'
        host = "ws-api.xfyun.cn"

        # RFC 1123 formatted timestamp of the current time.
        date = format_date_time(mktime(datetime.now().timetuple()))

        # Text that gets signed: host line, date line and request line.
        signature_origin = "\n".join((
            "host: " + host,
            "date: " + date,
            "GET /v2/tts HTTP/1.1",
        ))

        # HMAC-SHA256 with the API secret, then base64.
        digest = hmac.new(
            self.APISecret.encode('utf-8'),
            signature_origin.encode('utf-8'),
            digestmod=hashlib.sha256,
        ).digest()
        signature = base64.b64encode(digest).decode(encoding='utf-8')

        authorization_origin = (
            "api_key=\"%s\", algorithm=\"%s\", headers=\"%s\", signature=\"%s\""
            % (self.APIKey, "hmac-sha256", "host date request-line", signature)
        )
        authorization = base64.b64encode(
            authorization_origin.encode('utf-8')).decode(encoding='utf-8')

        # Authentication parameters are appended as the query string.
        query = urlencode({
            "authorization": authorization,
            "date": date,
            "host": host,
        })
        return endpoint + '?' + query


def on_message(ws, message):
    """Handle one server frame: report errors or append audio to the PCM file.

    Args:
        ws: The websocket connection; only its ``close()`` method is used.
        message: JSON text of the server frame.

    The status code is checked before the payload is touched, so an error
    frame that carries no "data" section is reported with the server's own
    message instead of surfacing as a parse exception. Audio is written
    before the socket is closed on the final frame. Exceptions are caught and
    printed here rather than propagated to the caller.
    """
    try:
        message = json.loads(message)
        code = message["code"]
        sid = message.get("sid")
        print(message)

        if code != 0:
            errMsg = message.get("message")
            print("sid:%s call error:%s code is:%s" % (sid, errMsg, code))
            # No audio will follow a failed request; close so run_forever() returns.
            ws.close()
            return

        # "data" may be absent or null; treat that as a frame without audio.
        data = message.get("data") or {}
        audio = data.get("audio")
        if audio:
            # Same file as the module-level PCM_PATH ('demo.pcm').
            with open('demo.pcm', 'ab') as f:
                f.write(base64.b64decode(audio))

        if data.get("status") == 2:
            print("ws is closed")
            ws.close()

    except Exception as e:
        print("receive msg,but parse exception:", e)


# Callback invoked when the websocket reports an error.
def on_error(ws, error):
    """Print the websocket error; the connection object itself is not used."""
    print("### error: %s" % (error,))


# Callback invoked when the websocket has been closed.
def on_close(ws, close_status_code=None, close_msg=None):
    """Print a notice that the connection was closed.

    Args:
        ws: The websocket connection (unused).
        close_status_code: Optional close status code. Newer websocket-client
            releases pass it; the pinned 0.56.0 release calls ``on_close(ws)``.
        close_msg: Optional close message, passed together with the code.

    Both extra parameters default to ``None`` so the callback works with
    either calling convention.
    """
    print("### closed ###")


# Callback invoked once the websocket connection is established.
def on_open(ws):
    """Send the synthesis request built from ``wsParam`` on a worker thread.

    The previous PCM file is removed *before* the request is sent. on_message()
    appends audio to that same file as soon as replies arrive, so deleting it
    after the send could discard audio that had already been written.
    """
    def run(*args):
        d = {"common": wsParam.tts_common_args,
             "business": wsParam.tts_business_args,
             "data": wsParam.tts_text_data,
             }
        d = json.dumps(d)
        # Clear leftovers from an earlier run before any reply can arrive.
        if os.path.exists(PCM_PATH):
            os.remove(PCM_PATH)
        print("------>开始发送文本数据")
        ws.send(d)

    thread.start_new_thread(run, ())


def text2wav(appid, api_secret, api_key, text, vcn, fname):
    """Synthesize ``text`` with voice ``vcn`` and write the result to ``fname``.

    Args:
        appid: Open-platform application id.
        api_secret: Open-platform API secret.
        api_key: Open-platform API key.
        text: Text to synthesize.
        vcn: Voice name sent in the request.
        fname: Destination WAV file path.

    Raises:
        FileNotFoundError: If no PCM file exists after the websocket session
            ends (pcm2wav() opens it for reading).

    Blocks until the websocket session ends, then converts the PCM file
    written by on_message() into a WAV file.
    """
    wsParam.set_params(appid, api_secret, api_key)
    wsParam.set_tts_params(text, vcn)

    # Drop audio left over from an earlier call. Otherwise a failed connection
    # (on_open never runs) would convert the stale file as if it were new.
    if os.path.exists(PCM_PATH):
        os.remove(PCM_PATH)

    websocket.enableTrace(False)
    ws_url = wsParam.create_url()
    ws = websocket.WebSocketApp(ws_url, on_message=on_message, on_error=on_error, on_close=on_close)
    ws.on_open = on_open
    # NOTE(review): CERT_NONE disables TLS certificate verification; kept as in
    # the original demo, but consider enabling verification outside of testing.
    ws.run_forever(sslopt={'cert_reqs': ssl.CERT_NONE})

    pcm2wav(PCM_PATH, fname)


def pcm2wav(fname, dstname):
    """Wrap the raw PCM bytes of ``fname`` in a WAV container at ``dstname``.

    The WAV header is written as 1 channel, 2 bytes per sample, 16000 Hz,
    uncompressed. The number of PCM bytes read is printed.
    """
    # (nchannels, sampwidth, framerate, nframes, comptype, compname)
    wav_params = (1, 2, 16000, 0, 'NONE', 'NONE')

    with open(fname, 'rb') as source:
        raw_audio = source.read()
    print("pcmdata的值：", len(raw_audio))

    with wave.open(dstname, 'wb') as target:
        target.setparams(wav_params)
        target.writeframes(raw_audio)


# Module-level parameter holder read by on_open() and filled in by text2wav().
wsParam = Ws_Param()

if __name__ == "__main__":
    # Fill in the open-platform appid, api_secret and api_key here to run the demo.
    demo_args = dict(
        appid='',
        api_secret='',
        api_key='',
        text="这是一个语音合成案例",
        vcn='xiaoyan',
        fname='./demo.wav',
    )
    text2wav(**demo_args)

ttsplay.py部分代码

import os
import sys
import time
import tkinter as tk
import tkinter.messagebox
import ttkbootstrap as ttk
from playsound import playsound
from tts_ws_python3_demo import text2wav
class TtsPlay:
    """Small Tk window that synthesizes the typed text and plays the result.

    Constructing an instance builds the window and enters the Tk main loop,
    so ``TtsPlay()`` does not return until the window is closed.
    """

    # Combobox label -> voice name (vcn) sent to the synthesis service.
    VOICES = {
        "甜美女声-小燕": 'xiaoyan',
        "亲切男声-许久": 'aisjiuxu',
        "知性女声-小萍": 'aisxping',
        "可爱童声-许小宝": 'aisbabyxu',
        "亲切女声-小婧": 'aisjinger',
    }

    def __init__(self):
        self.vcn = 'xiaoyan'
        # Open-platform appid, api_key and api_secret; fill in before use.
        self.APP_ID = ''
        self.API_KEY = ''
        self.SECRET_KEY = ''
        # Path of the most recently synthesized WAV file ("" until one exists).
        self.fname = ""

        # Create the window.
        self.root = ttk.Window()
        self.style = ttk.Style()
        self.root.title("语音合成系统")
        self.root.geometry("750x600")
        self.root.resizable(0, 0)

        # Create the widgets.
        self.tk_lb = ttk.Label(self.root, text="请选择语音发音人")
        self.tk_text = ttk.Text(self.root, width=75, height=20)
        self.tk_cb_vcn = ttk.Combobox(self.root, width=15)
        # Populate the voice drop-down; the first entry matches the default vcn.
        self.tk_cb_vcn['values'] = tuple(self.VOICES)
        self.tk_cb_vcn.current(0)
        self.tk_cb_vcn.bind("<<ComboboxSelected>>", self.select_vcn)
        self.tk_tts_file = tk.Label(self.root, text='生成文件路径')
        self.b1 = tk.Button(self.root, text="进行语音合成", width=12, height=1, command=self.xfyun_tts)
        self.tk_play = tk.Button(self.root, text="播放", width=12, height=1, command=self.play_sound)

        # Lay out the widgets.
        self.tk_tts_file.place(x=50, y=550)
        self.b1.place(x=260, y=515)
        self.tk_play.place(x=390, y=515)
        self.tk_lb.place(x=50, y=30)
        self.tk_cb_vcn.place(x=175, y=25)
        self.tk_text.place(x=50, y=60)
        self.root.mainloop()

    @staticmethod
    def _output_path(stem):
        """Return an absolute ``<script dir>/<stem>.wav`` path.

        ``sys.argv[0]`` is made absolute first: when the script is started
        without a directory component its dirname is empty, and plain string
        concatenation would then produce a path beginning with ``//``.
        """
        script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        return os.path.join(script_dir, stem + ".wav")

    def select_vcn(self, *args):
        """Update ``self.vcn`` from the combobox; unknown labels leave it unchanged."""
        self.vcn = self.VOICES.get(self.tk_cb_vcn.get(), self.vcn)

    def xfyun_tts(self):
        """Synthesize the text in the text box into a timestamp-named WAV file.

        Shows a prompt when the box holds only whitespace. The path label and
        ``self.fname`` are updated only after synthesis succeeded, so the play
        button never points at a file that was not written.
        """
        tts_text = self.tk_text.get("1.0", "end").strip()
        if not tts_text:
            tkinter.messagebox.showinfo("提示", "请输入文本内容")
            return

        out_path = self._output_path(time.strftime("%Y%m%d%H%M%S", time.localtime()))
        try:
            text2wav(self.APP_ID, self.SECRET_KEY, self.API_KEY, tts_text, self.vcn, out_path)
        except OSError as exc:
            # e.g. no audio was received, so there is no PCM file to convert.
            tkinter.messagebox.showerror("错误", "语音合成失败：%s" % exc)
            return

        self.fname = out_path
        self.tk_tts_file["text"] = out_path

    def play_sound(self):
        """Play the last synthesized file, or prompt if there is none yet."""
        if not self.fname or not os.path.exists(self.fname):
            tkinter.messagebox.showinfo("提示", "请先进行语音合成")
            return
        playsound(self.fname)






# Script entry point: constructing TtsPlay builds the window and runs the Tk main loop.
if __name__ == '__main__':
    TtsPlay()