字节火山引擎-大模型声音复刻，流式语音合成接口

最新推荐文章于 2025-03-18 11:05:43 发布

岁月的眸

最新推荐文章于 2025-03-18 11:05:43 发布

阅读量919

点赞数 9

分类专栏：人工智能　文章标签：火山引擎

本文链接：https://blog.csdn.net/m0_46168848/article/details/145887403

版权

人工智能专栏收录该内容

6 篇文章

订阅专栏

字节火山引擎-大模型声音复刻，流式语音合成接口

参考文档：火山引擎-大模型声音复刻文档
官网给出的示例代码有 bug，下面的代码已经做了修正。

创建应用

在这里插入图片描述

声音复刻大模型页面查看应用，获取接口调用需要的参数

在这里插入图片描述

注意：调用 TTS 接口时需要以下三个参数：

APP ID => 获取到对应的应用
Access Token => 鉴权需要
声音ID => 语音合成需要

HTTP 合成音频接口示例代码（operation 为 query，一次请求返回完整音频）

package main

import (
	"bytes"
	"encoding/base64"
	"encoding/json"
	"errors"
	"fmt"
	"github.com/gin-gonic/gin"
	"github.com/google/uuid"
	"io/ioutil"
	"net/http"
	"time"
)

// Package-level settings used by the HTTP synthesis example.
//
// NOTE(review): the identifiers and the access token are hard-coded in the
// source; consider loading them from configuration before publishing.
var (
	// app_id is sent as the "appid" field of the request's "app" section.
	app_id = "8678693223"

	// clusterId is sent as the "cluster" field of the request's "app" section.
	clusterId = "volcano_icl"

	// SpeakId is sent as the "voice_type" field of the request's "audio" section.
	SpeakId = "S_v7xollyj1"

	// BearerToken is placed in the Authorization header by synthesis.
	BearerToken = "-50OZ81pPKpn8pRZEgNrxd0wCELJJMIN"

	// resource_id is not referenced by the code visible in this file.
	resource_id = "volc.tts_async.emotion"

	// durationTime holds the elapsed time of the most recent successful
	// synthesis call; synthesis overwrites it on every success.
	durationTime time.Duration
)

// TTSServResponse is the JSON body returned by the TTS HTTP endpoint, as
// decoded by synthesis.
type TTSServResponse struct {
	// ReqID is decoded from the "reqid" key; the code in this file does not read it.
	ReqID     string `json:"reqid"`
	// Code is the service status code; synthesis treats 3000 as success and
	// every other value as a failure.
	Code      int    `json:"code"`
	// Message is decoded from the "Message" key (encoding/json also accepts a
	// case-insensitive match when decoding); the code in this file does not read it.
	Message   string `json:"Message"`
	// Operation is decoded from the "operation" key; the code in this file does not read it.
	Operation string `json:"operation"`
	// Sequence is decoded from the "sequence" key; the code in this file does not read it.
	Sequence  int    `json:"sequence"`
	// Data is the audio payload; synthesis base64-decodes it to obtain the raw bytes.
	Data      string `json:"data"`
}

// httpPost sends body to url as an HTTP POST request carrying the given
// headers and returns the raw response body.
//
// timeout bounds the whole exchange through http.Client.Timeout. The HTTP
// status code is not inspected: whatever body the server sends is returned.
// An error is returned when the request cannot be built, the round trip
// fails, or the body cannot be read in full.
func httpPost(url string, headers map[string]string, body []byte, timeout time.Duration) ([]byte, error) {
	request, err := http.NewRequest(http.MethodPost, url, bytes.NewBuffer(body))
	if err != nil {
		return nil, err
	}
	for name := range headers {
		request.Header.Set(name, headers[name])
	}

	httpClient := http.Client{Timeout: timeout}
	response, err := httpClient.Do(request)
	if err != nil {
		return nil, err
	}
	defer response.Body.Close()

	payload, readErr := ioutil.ReadAll(response.Body)
	if readErr != nil {
		return nil, readErr
	}
	return payload, nil
}

// ttsCodeMessages maps service status codes other than 3000 to the error
// text reported to callers of synthesis.
var ttsCodeMessages = map[int]string{
	3001: "无效的请求，请检查参数",
	3003: "并发超限，请稍后重试",
	3005: "后端服务忙，请稍后重试",
	3006: "服务中断，请检查参数",
	3010: "文本长度超限，请检查文本长度",
	3011: "无效文本，请检查文本内容",
	3030: "处理超时，请重试或检查文本",
	3031: "处理错误，后端出现异常，请重试",
	3032: "等待获取音频超时，请重试",
	3040: "后端链路连接错误，请重试",
	3050: "音色不存在，请检查voice_type代号",
}

// synthesis converts text to speech with one HTTP request and returns the
// decoded audio bytes (the request asks for "wav" encoding).
//
// The request body carries app_id, clusterId and SpeakId; the credential is
// sent in the Authorization header as "Bearer;<BearerToken>" (semicolon, no
// space). The "token" field inside the body is the literal string
// "access_token".
//
// An error is returned when the request cannot be encoded or sent, when the
// response is not valid JSON, when the service code is not 3000 (the message
// comes from ttsCodeMessages, or a generic text for unknown codes), or when
// the audio payload is not valid base64. No audio is returned with an error.
//
// On success the elapsed time is printed and stored in durationTime.
func synthesis(text string) ([]byte, error) {
	// Start of the measured interval: request build, round trip and decode.
	startTime := time.Now()

	reqID := uuid.NewString()
	params := map[string]map[string]interface{}{
		"app": {
			"appid":   app_id,
			"token":   "access_token",
			"cluster": clusterId,
		},
		"user": {
			"uid": "uid",
		},
		"audio": {
			"voice_type":   SpeakId,
			"encoding":     "wav",
			"speed_ratio":  1.0,
			"volume_ratio": 1.0,
			"pitch_ratio":  1.0,
		},
		"request": {
			"reqid":     reqID,
			"text":      text,
			"text_type": "plain",
			"operation": "query",
		},
	}

	headers := map[string]string{
		"Content-Type":  "application/json",
		"Authorization": fmt.Sprintf("Bearer;%s", BearerToken),
	}

	body, err := json.Marshal(params)
	if err != nil {
		return nil, fmt.Errorf("marshal tts request: %w", err)
	}

	url := "https://openspeech.bytedance.com/api/v1/tts"
	timeo := 30 * time.Second
	synResp, err := httpPost(url, headers, body, timeo)
	if err != nil {
		fmt.Printf("http post fail [err:%s]\n", err.Error())
		return nil, err
	}
	fmt.Printf("resp body:%s\n", synResp)

	var respJSON TTSServResponse
	if err := json.Unmarshal(synResp, &respJSON); err != nil {
		fmt.Printf("unmarshal response fail [err:%s]\n", err.Error())
		return nil, err
	}

	// 3000 is the only code treated as success.
	if respJSON.Code != 3000 {
		fmt.Printf("code fail [code:%d]\n", respJSON.Code)
		message, known := ttsCodeMessages[respJSON.Code]
		if !known {
			message = "未知错误，请重试"
		}
		return nil, errors.New(message)
	}

	// A payload that is not valid base64 must not be passed on as audio.
	audio, err := base64.StdEncoding.DecodeString(respJSON.Data)
	if err != nil {
		return nil, fmt.Errorf("decode audio payload: %w", err)
	}

	// NOTE(review): durationTime is a package-level variable written without
	// synchronisation; concurrent HTTP handlers will race on it.
	durationTime = time.Since(startTime)
	fmt.Printf("音频合成时间: %s\n", durationTime)

	return audio, nil
}

// handleTTS is the Gin handler for text-to-speech requests.
//
// It expects a JSON object with a non-empty "text" string. It answers 400
// when the body cannot be bound or "text" is missing or empty, 500 when
// synthesis fails (the error text is included under "details"), and
// otherwise 200 with the audio bytes served as "audio/wav".
func handleTTS(c *gin.Context) {
	var payload map[string]string
	if bindErr := c.ShouldBindJSON(&payload); bindErr != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid input"})
		return
	}

	// A missing key reads as "", so one emptiness check covers both cases.
	text := payload["text"]
	if len(text) == 0 {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Text is required"})
		return
	}

	wav, synthErr := synthesis(text)
	if synthErr != nil {
		c.JSON(http.StatusInternalServerError, gin.H{
			"error":   "Synthesis failed",
			"details": synthErr.Error(),
		})
		return
	}

	c.Data(http.StatusOK, "audio/wav", wav)
}

// main starts a Gin server on :8080 that exposes POST /synthesize, handled
// by handleTTS.
func main() {
	r := gin.Default()

	// POST request to synthesize text to speech.
	r.POST("/synthesize", handleTTS)

	// Run blocks while serving. If it returns an error (for example the port
	// is already in use), stop with a non-zero exit instead of ending silently.
	if err := r.Run(":8080"); err != nil {
		panic(fmt.Errorf("tts http server on :8080: %w", err))
	}
}

WebSocket 流式合成音频接口示例代码

package main

import (
	"bytes"
	"compress/gzip"
	"encoding/binary"
	"encoding/json"
	"errors"
	"fmt"
	"github.com/gin-gonic/gin"
	"github.com/gorilla/websocket"
	"github.com/satori/go.uuid"
	"io/ioutil"
	"net/http"
	"net/url"
	"time"
)

// The tables below turn the 4-bit fields of a server frame header into
// printable labels. parseResponse uses them only for its diagnostic output.

// enumMessageType labels the message-type nibble.
var enumMessageType = map[byte]string{
	0xb: "audio-only server response",
	0xc: "frontend server response",
	0xf: "error message from server",
}

// enumMessageTypeSpecificFlags labels the message-type-specific flags nibble.
var enumMessageTypeSpecificFlags = map[byte]string{
	0x0: "no sequence number",
	0x1: "sequence number > 0",
	0x2: "last message from server (seq < 0)",
	0x3: "sequence number < 0",
}

// enumMessageSerializationMethods labels the serialization-method nibble.
var enumMessageSerializationMethods = map[byte]string{
	0x0: "no serialization",
	0x1: "JSON",
	0xf: "custom type",
}

// enumMessageCompression labels the compression nibble.
var enumMessageCompression = map[byte]string{
	0x0: "no compression",
	0x1: "gzip",
	0xf: "custom compression method",
}

// Values for the "operation" field of a synthesis request.

// optQuery is not referenced by the WebSocket code visible in this file.
const optQuery string = "query"

// optSubmit is the operation DouBaoAudioStreamSynth passes to setupInput.
const optSubmit string = "submit"

var (
	// duration holds the elapsed time of the most recent successful
	// DouBaoAudioStreamSynth call.
	duration time.Duration

	// addr is the host of the remote TTS service.
	addr = "openspeech.bytedance.com"

	// ZijieWs_Url is the WebSocket endpoint dialled by DouBaoAudioStreamSynth.
	ZijieWs_Url = url.URL{
		Scheme: "wss",
		Host:   addr,
		Path:   "/api/v1/tts/ws_binary",
	}
)



// appTokenMap maps an appid to the access token used for that application.
// DouBaoAudioStreamSynth rejects any appid that is not a key of this map.
//
// NOTE(review): tokens are hard-coded in the source; consider loading them
// from configuration before publishing.
var appTokenMap = map[string]string{
	// Hu Tao
	"8678693223": "-50OZ81pPKpn8pRZEgNrxd0wCELJJMIN1ww",

	// Guo Degang
	"8724961923": "Ut0tYdEdwVvHgSzeV9gdXEN8EGXSlqBesdd",

	// Zhongli
	"5046524282": "xRcD5NYAo0BMkMajm9JT3XAigKtRUBCOddd",
}

// synResp is what parseResponse extracts from a single server frame.
type synResp struct {
	// Audio holds the audio bytes copied out of an audio-only frame that
	// carries a sequence number; it stays nil for every other frame.
	Audio  []byte
	// IsLast is set when an audio-only frame carries a negative sequence
	// number; DouBaoAudioStreamSynth stops reading when it sees it.
	IsLast bool
}

// defaultHeader is the fixed 4-byte header that DouBaoAudioStreamSynth puts
// in front of every client request. Each of the first three bytes packs two
// 4-bit fields, high nibble first; parseResponse multiplies the header-size
// field by 4 to get a length in bytes.
var defaultHeader = []byte{
	0x1<<4 | 0x1, // protocol version 1, header size 1
	0x1<<4 | 0x0, // message type 1 (full client request), no flags
	0x1<<4 | 0x1, // serialization 1 (JSON), compression 1 (gzip)
	0x00,         // reserved
}

// setupInput builds the JSON request body for one synthesis call.
//
// text is the text to synthesize, voiceType is sent as audio.voice_type,
// appid as app.appid and opt as request.operation. A fresh UUID is generated
// for request.reqid. Audio is requested as MP3 with a sample_rate of 24000
// and ratios of 1.0. The cluster is fixed to "volcano_icl"; the appid and
// cluster name can be looked up on the platform console. The "token" field
// is the literal string "access_token".
func setupInput(text, voiceType, appid, opt string) []byte {
	// NOTE(review): err is always nil here, so Must cannot fail; this call
	// shape only compiles against a uuid release whose NewV4 returns a single
	// value. Confirm against the pinned dependency version.
	var err error
	reqID := uuid.Must(uuid.NewV4(), err).String()

	request := map[string]map[string]interface{}{
		"app": {
			"appid":   appid,
			"token":   "access_token",
			"cluster": "volcano_icl",
		},
		"user": {
			"uid": "uid",
		},
		"audio": {
			"voice_type":   voiceType,
			"encoding":     "mp3",
			"sample_rate":  24000,
			"speed_ratio":  1.0,
			"volume_ratio": 1.0,
			"pitch_ratio":  1.0,
		},
		"request": {
			"reqid":     reqID,
			"text":      text,
			"text_type": "plain",
			"operation": opt,
		},
	}

	// The marshal error is discarded: the map holds only strings and numbers.
	encoded, _ := json.Marshal(request)
	return encoded
}

// gzipCompress returns input encoded as a gzip stream using the default
// compression level.
//
// The results of Write and Close are not checked; the destination is an
// in-memory buffer.
func gzipCompress(input []byte) []byte {
	compressed := new(bytes.Buffer)
	zw := gzip.NewWriter(compressed)
	zw.Write(input)
	// Close flushes the remaining data and writes the gzip trailer.
	zw.Close()
	return compressed.Bytes()
}

// gzipDecompress decodes a gzip stream and returns the decompressed bytes.
//
// It returns nil when input does not start with a valid gzip header (this
// includes empty input). If the stream breaks off or fails its checksum
// part-way through, the bytes decoded up to that point are returned; the
// function has no error result, so callers cannot tell the two apart.
func gzipDecompress(input []byte) []byte {
	r, err := gzip.NewReader(bytes.NewBuffer(input))
	if err != nil {
		// NewReader returns a nil reader on error; reading from it or
		// closing it would panic.
		return nil
	}
	defer r.Close()

	// Best effort: keep whatever was decoded even if the stream is damaged.
	out, _ := ioutil.ReadAll(r)
	return out
}

// parseResponse decodes one binary frame received from the TTS WebSocket
// service and prints a description of it.
//
// Frame layout as read here: byte 0 holds the protocol version (high nibble)
// and the header size in 4-byte units (low nibble); byte 1 the message type
// and its flags; byte 2 the serialization method and the compression; byte 3
// is reserved. Any header bytes beyond the first four are header extensions,
// and the rest is the payload.
//
// Handling by message type:
//   - 0xb (audio only): with flags 0 the frame has no payload to extract.
//     Otherwise the payload is a big-endian int32 sequence number, a 4-byte
//     payload size, then audio bytes, which are copied into resp.Audio;
//     resp.IsLast is set when the sequence number is negative.
//   - 0xf (error): the payload is a 4-byte code, 4 further bytes that are
//     skipped, then the message (gunzipped when compression is 1). The
//     message is returned as err.
//   - 0xc (frontend): the payload is a 4-byte size followed by the message,
//     which is only printed.
//   - anything else yields a "wrong message type" error.
//
// Frames arrive from the network, so every length is checked before slicing;
// a frame that is too short for its declared layout produces an error
// instead of a panic.
func parseResponse(res []byte) (resp synResp, err error) {
	const fixedHeaderLen = 4
	if len(res) < fixedHeaderLen {
		return resp, fmt.Errorf("response too short: got %d bytes, need at least %d", len(res), fixedHeaderLen)
	}

	protoVersion := res[0] >> 4
	headSize := res[0] & 0x0f
	messageType := res[1] >> 4
	messageTypeSpecificFlags := res[1] & 0x0f
	serializationMethod := res[2] >> 4
	messageCompression := res[2] & 0x0f
	reserve := res[3]

	// A header size of 0 or one that exceeds the frame would make the slice
	// expressions below panic.
	headerLen := int(headSize) * 4
	if headerLen < fixedHeaderLen || headerLen > len(res) {
		return resp, fmt.Errorf("invalid header size: %d bytes declared, %d bytes received", headerLen, len(res))
	}
	headerExtensions := res[fixedHeaderLen:headerLen]
	payload := res[headerLen:]

	fmt.Printf("            Protocol version: %x - version %d\n",
		protoVersion, protoVersion)
	fmt.Printf("                 Header size: %x - %d bytes\n",
		headSize, headSize*4)
	fmt.Printf("                Message type: %x - %s\n", messageType,
		enumMessageType[messageType])
	fmt.Printf(" Message type specific flags: %x - %s\n", messageTypeSpecificFlags,
		enumMessageTypeSpecificFlags[messageTypeSpecificFlags])
	fmt.Printf("Message serialization method: %x - %s\n",
		serializationMethod, enumMessageSerializationMethods[serializationMethod])
	fmt.Printf("         Message compression: %x - %s\n",
		messageCompression, enumMessageCompression[messageCompression])
	fmt.Printf("                    Reserved: %d\n", reserve)
	if headSize != 1 {
		fmt.Printf("           Header extensions: %s\n",
			headerExtensions)
	}

	switch messageType {
	case 0xb: // audio-only server response
		if messageTypeSpecificFlags == 0 {
			// No sequence number: nothing to extract from this frame.
			fmt.Println("                Payload size: 0")
			return resp, nil
		}
		if len(payload) < 8 {
			return resp, fmt.Errorf("audio frame too short: %d payload bytes, need at least 8", len(payload))
		}
		sequenceNumber := int32(binary.BigEndian.Uint32(payload[0:4]))
		payloadSize := int32(binary.BigEndian.Uint32(payload[4:8]))
		resp.Audio = append(resp.Audio, payload[8:]...)
		fmt.Printf("             Sequence number: %d\n",
			sequenceNumber)
		fmt.Printf("                Payload size: %d\n", payloadSize)
		if sequenceNumber < 0 {
			resp.IsLast = true
		}
		return resp, nil

	case 0xf: // error message from server
		if len(payload) < 8 {
			return resp, fmt.Errorf("error frame too short: %d payload bytes, need at least 8", len(payload))
		}
		code := int32(binary.BigEndian.Uint32(payload[0:4]))
		errMsg := payload[8:]
		if messageCompression == 1 {
			errMsg = gzipDecompress(errMsg)
		}
		fmt.Printf("                  Error code: %d\n", code)
		fmt.Printf("                   Error msg: %s\n", string(errMsg))
		return resp, errors.New(string(errMsg))

	case 0xc: // frontend server response
		if len(payload) < 4 {
			return resp, fmt.Errorf("frontend frame too short: %d payload bytes, need at least 4", len(payload))
		}
		msgSize := int32(binary.BigEndian.Uint32(payload[0:4]))
		fmt.Println(msgSize)
		payload = payload[4:]
		if messageCompression == 1 {
			payload = gzipDecompress(payload)
		}
		fmt.Printf("            Frontend message: %s\n", string(payload))
		return resp, nil

	default:
		fmt.Printf("          wrong message type:%d\n", messageType)
		return resp, errors.New("wrong message type")
	}
}



// DouBaoAudioStreamSynth synthesizes text over the service's binary WebSocket
// interface and returns the concatenated audio together with the elapsed time.
//
// appid selects the access token from appTokenMap; an unknown appid yields
// the error "invalid appid". voiceType is forwarded as the voice to use. The
// request is the JSON built by setupInput with the "submit" operation,
// gzip-compressed and framed as defaultHeader + 4-byte big-endian payload
// length + payload. Frames are read until parseResponse reports the last one.
//
// Any failure to dial, send, read or parse a frame (including an error frame
// sent by the server) is returned as an error with nil audio and a zero
// duration; partial audio is never returned as a success.
func DouBaoAudioStreamSynth(text, voiceType, appid string) ([]byte, time.Duration, error) {
	// Start of the measured interval.
	startTime := time.Now()

	// Look up the token that belongs to this appid.
	token, exists := appTokenMap[appid]
	if !exists {
		return nil, 0, errors.New("invalid appid")
	}
	// Authentication header; the value is "Bearer;<token>" (semicolon, no space).
	header := http.Header{"Authorization": []string{fmt.Sprintf("Bearer;%s", token)}}

	input := gzipCompress(setupInput(text, voiceType, appid, optSubmit))
	payloadArr := make([]byte, 4)
	binary.BigEndian.PutUint32(payloadArr, uint32(len(input)))
	clientRequest := make([]byte, 0, len(defaultHeader)+len(payloadArr)+len(input))
	clientRequest = append(clientRequest, defaultHeader...)
	clientRequest = append(clientRequest, payloadArr...)
	clientRequest = append(clientRequest, input...)

	// Open the WebSocket connection to the remote service.
	c, _, err := websocket.DefaultDialer.Dial(ZijieWs_Url.String(), header)
	if err != nil {
		return nil, 0, err
	}
	defer c.Close()

	fmt.Println("Successfully connected to the byte server.")

	if err := c.WriteMessage(websocket.BinaryMessage, clientRequest); err != nil {
		return nil, 0, err
	}

	var audio []byte
	for {
		_, message, err := c.ReadMessage()
		if err != nil {
			// A dropped connection must not be reported as a finished synthesis.
			return nil, 0, fmt.Errorf("read tts response: %w", err)
		}
		resp, err := parseResponse(message)
		if err != nil {
			return nil, 0, fmt.Errorf("parse tts response: %w", err)
		}
		audio = append(audio, resp.Audio...)
		if resp.IsLast {
			break
		}
	}

	elapsed := time.Since(startTime)
	// NOTE(review): duration is a package-level variable written without
	// synchronisation; concurrent callers will race on it. The value is also
	// returned, so callers do not need the global.
	duration = elapsed

	fmt.Printf("音频合成时间: %s\n", elapsed)

	return audio, elapsed, nil
}

// WsStreamSynth2 is the Gin handler that upgrades the request to a WebSocket
// and serves synthesis requests on it until an error occurs.
//
// The "appid" and "voiceType" query parameters are read once per connection.
// Each client message must be a JSON object {"text": "..."}. For every
// request the handler sends a text status message, the audio as one binary
// message, the elapsed time, and a text message saying it is listening again.
// The connection is closed after the first unreadable request, failed
// synthesis, or failed write.
func WsStreamSynth2(c *gin.Context) {
	upgrader := websocket.Upgrader{
		// NOTE(review): every Origin is accepted, so any web page can open
		// this socket; restrict this before exposing the service.
		CheckOrigin: func(r *http.Request) bool {
			return true
		},
	}
	conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
	if err != nil {
		// Upgrade has already replied to the client with an HTTP error, so
		// no second response is written here.
		fmt.Printf("websocket upgrade fail [err:%s]\n", err.Error())
		return
	}
	defer conn.Close()

	appid := c.Query("appid")
	voiceType := c.Query("voiceType")

	// send writes one message and reports whether the connection is still usable.
	send := func(messageType int, data []byte) bool {
		if err := conn.WriteMessage(messageType, data); err != nil {
			fmt.Printf("websocket write fail [err:%s]\n", err.Error())
			return false
		}
		return true
	}

	if !send(websocket.TextMessage, []byte("连接已建立")) {
		return
	}

	type requestData struct {
		Text string `json:"text"`
	}

	for {
		var req requestData
		if err := conn.ReadJSON(&req); err != nil {
			// Best effort: the peer may already be gone, in which case this
			// write fails and is only logged.
			send(websocket.TextMessage, []byte("Invalid JSON format"))
			return
		}

		if !send(websocket.TextMessage, []byte("开始处理合成音频")) {
			return
		}
		audio, elapsed, err := DouBaoAudioStreamSynth(req.Text, voiceType, appid)
		if err != nil {
			fmt.Printf("synthesis fail [err:%s]\n", err.Error())
			send(websocket.TextMessage, []byte("生成音频失败"))
			return
		}

		// Send the audio as binary data, followed by the timing and a status line.
		if !send(websocket.BinaryMessage, audio) {
			return
		}
		if !send(websocket.TextMessage, []byte("本次合成时间: "+elapsed.String())) {
			return
		}
		if !send(websocket.TextMessage, []byte("继续监听...")) {
			return
		}
	}
}

// main starts a Gin server on :8081 that exposes the WebSocket endpoint
// GET /WsStreamSynth/ws, handled by WsStreamSynth2.
func main() {
	// Set up the Gin router.
	r := gin.Default()

	// WebSocket request handling.
	r.GET("/WsStreamSynth/ws", WsStreamSynth2)

	// Run blocks while serving. If it returns an error (for example the port
	// is already in use), stop with a non-zero exit instead of ending silently.
	if err := r.Run(":8081"); err != nil {
		panic(fmt.Errorf("tts websocket server on :8081: %w", err))
	}
}

Postman测试

在这里插入图片描述