字节火山引擎-大模型声音复刻,流式语音合成接口
-
参考文档:火山引擎-大模型声音复刻文档
-
官网给出的示例代码有 bug,这里已做修正。
创建应用
在声音复刻大模型页面查看应用,获取接口调用所需的参数
注意:调用 TTS 接口时需要以下三个参数:
- APP ID => 获取到对应的应用
- Access Token => 鉴权需要
- 声音ID => 语音合成需要
HTTP 流式合成音频接口示例代码
package main
import (
"bytes"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"github.com/gin-gonic/gin"
"github.com/google/uuid"
"io/ioutil"
"net/http"
"time"
)
// Credentials and voice settings used by synthesis when calling the TTS HTTP API.
// NOTE(review): these look like real credentials committed to source — consider
// loading them from configuration or the environment instead.
var (
// app_id is sent as the "appid" field of the request body.
app_id = "8678693223"
// clusterId is sent as the "cluster" field of the request body.
clusterId = "volcano_icl"
// SpeakId is sent as the "voice_type" field of the request body.
SpeakId = "S_v7xollyj1"
// BearerToken is placed in the Authorization header by synthesis.
BearerToken = "-50OZ81pPKpn8pRZEgNrxd0wCELJJMIN"
// resource_id is not referenced anywhere in the visible code.
resource_id = "volc.tts_async.emotion"
)
// durationTime holds the elapsed time of the most recent synthesis call; every
// call to synthesis overwrites it.
var durationTime time.Duration
// TTSServResponse is the JSON envelope that synthesis decodes from the TTS
// HTTP endpoint's response body.
//
// synthesis treats Code == 3000 as success and base64-decodes Data to obtain
// the audio bytes. The remaining fields are decoded but not used by the
// visible code.
type TTSServResponse struct {
ReqID string `json:"reqid"`
Code int `json:"code"`
// The tag is capitalized here; encoding/json matches object keys
// case-insensitively when decoding, so a lowercase "message" key still binds.
Message string `json:"Message"`
Operation string `json:"operation"`
Sequence int `json:"sequence"`
// Data carries the base64-encoded audio payload.
Data string `json:"data"`
}
// httpPost issues a POST request to url with the given headers and body and
// returns the raw response body.
//
// The whole exchange is bounded by timeout. The HTTP status code is not
// inspected: whatever body the server sends back is returned to the caller.
// Any failure while building the request, performing it, or reading the
// response is returned as-is with a nil body.
func httpPost(url string, headers map[string]string, body []byte, timeout time.Duration) ([]byte, error) {
	request, err := http.NewRequest(http.MethodPost, url, bytes.NewBuffer(body))
	if err != nil {
		return nil, err
	}
	for name := range headers {
		request.Header.Set(name, headers[name])
	}

	httpClient := http.Client{Timeout: timeout}
	response, err := httpClient.Do(request)
	if err != nil {
		return nil, err
	}
	defer response.Body.Close()

	payload, err := ioutil.ReadAll(response.Body)
	if err != nil {
		return nil, err
	}
	return payload, nil
}
func synthesis(text string) ([]byte, error) {
// 记录合成开始时间
startTime := time.Now()
reqID := uuid.NewString()
params := make(map[string]map[string]interface{})
params["app"] = make(map[string]interface{})
params["app"]["appid"] = app_id
params["app"]["token"] = "access_token"
params["app"]["cluster"] = clusterId
params["user"] = make(map[string]interface{})
params["user"]["uid"] = "uid"
params["audio"] = make(map[string]interface{})
params["audio"]["voice_type"] = SpeakId
params["audio"]["encoding"] = "wav"
params["audio"]["speed_ratio"] = 1.0
params["audio"]["volume_ratio"] = 1.0
params["audio"]["pitch_ratio"] = 1.0
params["request"] = make(map[string]interface{})
params["request"]["reqid"] = reqID
params["request"]["text"] = text
params["request"]["text_type"] = "plain"
params["request"]["operation"] = "query"
headers := make(map[string]string)
headers["Content-Type"] = "application/json"
headers["Authorization"] = fmt.Sprintf("Bearer;%s", BearerToken)
url := "https://openspeech.bytedance.com/api/v1/tts"
timeo := 30 * time.Second
bodyStr, _ := json.Marshal(params)
synResp, err := httpPost(url, headers, []byte(bodyStr), timeo)
if err != nil {
fmt.Printf("http post fail [err:%s]\n", err.Error())
return nil, err
}
fmt.Printf("resp body:%s\n", synResp)
var respJSON TTSServResponse
err = json.Unmarshal(synResp, &respJSON)
if err != nil {
fmt.Printf("unmarshal response fail [err:%s]\n", err.Error())
return nil, err
}
codeMessages := map[int]string{
3001: "无效的请求,请检查参数",
3003: "并发超限,请稍后重试",
3005: "后端服务忙,请稍后重试",
3006: "服务中断,请检查参数",
3010: "文本长度超限,请检查文本长度",
3011: "无效文本,请检查文本内容",
3030: "处理超时,请重试或检查文本",
3031: "处理错误,后端出现异常,请重试",
3032: "等待获取音频超时,请重试",
3040: "后端链路连接错误,请重试",
3050: "音色不存在,请检查voice_type代号",
}
code := respJSON.Code
if code != 3000 {
fmt.Printf("code fail [code:%d]\n", code)
message, exists := codeMessages[respJSON.Code]
if !exists {
message = "未知错误,请重试"
}
return nil, errors.New(message)
}
audio, _ := base64.StdEncoding.DecodeString(respJSON.Data)
// 记录合成结束时间
endTime := time.Now()
durationTime = endTime.Sub(startTime)
// 打印合成时间
fmt.Printf("音频合成时间: %s\n", durationTime)
return audio, nil
}
// handleTTS is the Gin handler for text-to-speech requests.
//
// It expects a JSON object with a non-empty "text" string, passes that text to
// synthesis, and writes the resulting audio back with content type audio/wav.
// A malformed body or missing text yields 400; a synthesis failure yields 500
// with the error text in "details".
func handleTTS(c *gin.Context) {
	var payload map[string]string
	if bindErr := c.ShouldBindJSON(&payload); bindErr != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid input"})
		return
	}

	// A missing key reads as the empty string, so one check covers both cases.
	text := payload["text"]
	if text == "" {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Text is required"})
		return
	}

	wav, synthErr := synthesis(text)
	if synthErr != nil {
		failure := gin.H{"error": "Synthesis failed", "details": synthErr.Error()}
		c.JSON(http.StatusInternalServerError, failure)
		return
	}

	c.Data(http.StatusOK, "audio/wav", wav)
}
// main starts a Gin server on :8080 that exposes POST /synthesize, which is
// served by handleTTS.
func main() {
	r := gin.Default()
	r.POST("/synthesize", handleTTS)
	// Run blocks until the server stops; report why instead of discarding it
	// (for example, when the port is already in use).
	if err := r.Run(":8080"); err != nil {
		fmt.Printf("http server stopped [err:%s]\n", err.Error())
	}
}
WebSocket 流式合成音频接口示例代码
package main
import (
"bytes"
"compress/gzip"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"github.com/gin-gonic/gin"
"github.com/gorilla/websocket"
"github.com/satori/go.uuid"
"io/ioutil"
"net/http"
"net/url"
"time"
)
// Lookup tables that parseResponse uses to print a human-readable label for
// each 4-bit field of a server message header. They are only used for
// logging; a value missing from a table prints as an empty label.
var (
// enumMessageType labels the message type (high 4 bits of header byte 1).
enumMessageType = map[byte]string{
11: "audio-only server response",
12: "frontend server response",
15: "error message from server",
}
// enumMessageTypeSpecificFlags labels the flags (low 4 bits of header byte 1).
enumMessageTypeSpecificFlags = map[byte]string{
0: "no sequence number",
1: "sequence number > 0",
2: "last message from server (seq < 0)",
3: "sequence number < 0",
}
// enumMessageSerializationMethods labels the serialization method (high 4
// bits of header byte 2).
enumMessageSerializationMethods = map[byte]string{
0: "no serialization",
1: "JSON",
15: "custom type",
}
// enumMessageCompression labels the compression method (low 4 bits of header
// byte 2); parseResponse gunzips error and frontend payloads when it is 1.
enumMessageCompression = map[byte]string{
0: "no compression",
1: "gzip",
15: "custom compression method",
}
)
// Values for the "operation" field of the request built by setupInput.
const (
// optQuery is not referenced anywhere in the visible code.
optQuery string = "query"
// optSubmit is the operation DouBaoAudioStreamSynth passes to setupInput.
optSubmit string = "submit"
)
// duration holds the elapsed time of the most recent DouBaoAudioStreamSynth
// call; every call overwrites it.
var duration time.Duration
// addr is the host name of the speech service.
var addr = "openspeech.bytedance.com"
// ZijieWs_Url is the wss:// endpoint that DouBaoAudioStreamSynth dials.
var ZijieWs_Url = url.URL{Scheme: "wss", Host: addr, Path: "/api/v1/tts/ws_binary"}
// appTokenMap maps each appid to its access token. DouBaoAudioStreamSynth
// looks the token up by appid and rejects appids that are not listed here.
// NOTE(review): these look like real tokens committed to source — consider
// loading them from configuration or the environment instead.
var appTokenMap = map[string]string{
"8678693223": "-50OZ81pPKpn8pRZEgNrxd0wCELJJMIN1ww", // Hu Tao
"8724961923": "Ut0tYdEdwVvHgSzeV9gdXEN8EGXSlqBesdd", // Guo Degang
"5046524282": "xRcD5NYAo0BMkMajm9JT3XAigKtRUBCOddd", // Zhongli
}
// synResp is the result of parsing one server message in parseResponse.
type synResp struct {
// Audio is the audio payload carried by an audio-only message; it stays
// empty for other message types.
Audio []byte
// IsLast is set when the message's sequence number is negative, which
// DouBaoAudioStreamSynth treats as the end of the stream.
IsLast bool
}
// defaultHeader is the fixed 4-byte header that DouBaoAudioStreamSynth puts in
// front of every client request. Each byte packs two 4-bit fields:
//
// version: b0001 (4 bits)
// header size: b0001 (4 bits)
// message type: b0001 (Full client request) (4bits)
// message type specific flags: b0000 (none) (4bits)
// message serialization method: b0001 (JSON) (4 bits)
// message compression: b0001 (gzip) (4bits)
// reserved data: 0x00 (1 byte)
var defaultHeader = []byte{0x11, 0x10, 0x11, 0x00}
// setupInput builds the JSON request body for one synthesis request.
//
// text is the text to synthesize, voiceType the voice identifier, appid the
// application id, and opt the value of the "operation" field. A fresh request
// id is generated for every call. The audio is requested as MP3 at 24 kHz with
// neutral speed, volume and pitch ratios.
func setupInput(text, voiceType, appid, opt string) []byte {
	requestID := uuid.Must(uuid.NewV4(), nil).String()

	payload := map[string]map[string]interface{}{
		"app": {
			// Look up the actual appid on the platform console.
			"appid": appid,
			"token": "access_token",
			// Look up the actual cluster name on the platform console.
			"cluster": "volcano_icl",
		},
		"user": {
			"uid": "uid",
		},
		"audio": {
			"voice_type":   voiceType,
			"encoding":     "mp3", // MP3 output
			"sample_rate":  24000, // 24 kHz sample rate
			"speed_ratio":  1.0,
			"volume_ratio": 1.0,
			"pitch_ratio":  1.0,
		},
		"request": {
			"reqid":     requestID,
			"text":      text,
			"text_type": "plain",
			"operation": opt,
		},
	}

	// The payload holds only strings and numbers; the marshal error is
	// discarded here exactly as the function has always done.
	encoded, _ := json.Marshal(payload)
	return encoded
}
// gzipCompress returns input compressed as a single gzip stream at the
// default compression level.
func gzipCompress(input []byte) []byte {
	compressed := new(bytes.Buffer)
	// The default level is always valid, so the constructor error is ignored.
	zw, _ := gzip.NewWriterLevel(compressed, gzip.DefaultCompression)
	// The destination is an in-memory buffer; write and close results are
	// not checked.
	zw.Write(input)
	zw.Close()
	return compressed.Bytes()
}
// gzipDecompress returns the decompressed contents of a gzip stream.
//
// If input is not a gzip stream (bad or missing header) it returns nil instead
// of panicking on a nil reader. If the stream breaks off or fails its checksum
// part-way through, whatever was decompressed up to that point is returned;
// the signature has no error result, so decoding is best effort.
func gzipDecompress(input []byte) []byte {
	r, err := gzip.NewReader(bytes.NewReader(input))
	if err != nil {
		return nil
	}
	defer r.Close()

	// Best effort: a read error still leaves the bytes decoded so far in out.
	out, _ := ioutil.ReadAll(r)
	return out
}
// parseResponse decodes one binary message received from the TTS websocket.
//
// The message starts with a header of headSize*4 bytes whose first three
// bytes pack six 4-bit fields (protocol version, header size, message type,
// type-specific flags, serialization method, compression) followed by one
// reserved byte. What follows the header depends on the message type:
//
//   - 0xb (audio-only): with flags 0 the message is an ACK without payload;
//     otherwise a 4-byte sequence number and 4-byte payload size precede the
//     audio bytes, which are returned in resp.Audio. A negative sequence
//     number marks the last message (resp.IsLast).
//   - 0xf (error): a 4-byte code and 4 more bytes precede the error text,
//     which is gunzipped when the compression field is 1 and returned as err.
//   - 0xc (frontend): a 4-byte size precedes the message, which is gunzipped
//     when the compression field is 1 and printed.
//
// Any other message type, or a message too short for its declared layout,
// yields an error instead of a panic. All header fields are printed for
// debugging.
func parseResponse(res []byte) (resp synResp, err error) {
	if len(res) < 4 {
		return resp, fmt.Errorf("response too short: %d bytes", len(res))
	}
	protoVersion := res[0] >> 4
	headSize := res[0] & 0x0f
	messageType := res[1] >> 4
	messageTypeSpecificFlags := res[1] & 0x0f
	serializationMethod := res[2] >> 4
	messageCompression := res[2] & 0x0f
	reserve := res[3]

	// The header size is counted in 4-byte words and must cover at least the
	// four fixed bytes without running past the end of the message.
	headerLen := int(headSize) * 4
	if headerLen < 4 || headerLen > len(res) {
		return resp, fmt.Errorf("invalid header size %d for %d-byte response", headerLen, len(res))
	}
	headerExtensions := res[4:headerLen]
	payload := res[headerLen:]

	fmt.Printf(" Protocol version: %x - version %d\n",
		protoVersion, protoVersion)
	fmt.Printf(" Header size: %x - %d bytes\n",
		headSize, headerLen)
	fmt.Printf(" Message type: %x - %s\n", messageType,
		enumMessageType[messageType])
	fmt.Printf(" Message type specific flags: %x - %s\n", messageTypeSpecificFlags,
		enumMessageTypeSpecificFlags[messageTypeSpecificFlags])
	fmt.Printf("Message serialization method: %x - %s\n",
		serializationMethod, enumMessageSerializationMethods[serializationMethod])
	fmt.Printf(" Message compression: %x - %s\n",
		messageCompression, enumMessageCompression[messageCompression])
	fmt.Printf(" Reserved: %d\n", reserve)
	if headSize != 1 {
		fmt.Printf(" Header extensions: %s\n",
			headerExtensions)
	}

	switch messageType {
	case 0xb: // audio-only server response
		if messageTypeSpecificFlags == 0 {
			// No sequence number: the message is an ACK.
			fmt.Println(" Payload size: 0")
			return resp, nil
		}
		if len(payload) < 8 {
			return resp, fmt.Errorf("audio message payload too short: %d bytes", len(payload))
		}
		sequenceNumber := int32(binary.BigEndian.Uint32(payload[0:4]))
		payloadSize := int32(binary.BigEndian.Uint32(payload[4:8]))
		resp.Audio = append(resp.Audio, payload[8:]...)
		fmt.Printf(" Sequence number: %d\n",
			sequenceNumber)
		fmt.Printf(" Payload size: %d\n", payloadSize)
		if sequenceNumber < 0 {
			resp.IsLast = true
		}
		return resp, nil

	case 0xf: // error message from server
		if len(payload) < 8 {
			return resp, fmt.Errorf("error message payload too short: %d bytes", len(payload))
		}
		code := int32(binary.BigEndian.Uint32(payload[0:4]))
		errMsg := payload[8:]
		if messageCompression == 1 {
			errMsg = gzipDecompress(errMsg)
		}
		fmt.Printf(" Error code: %d\n", code)
		fmt.Printf(" Error msg: %s\n", string(errMsg))
		return resp, errors.New(string(errMsg))

	case 0xc: // frontend server response
		if len(payload) < 4 {
			return resp, fmt.Errorf("frontend message payload too short: %d bytes", len(payload))
		}
		msgSize := int32(binary.BigEndian.Uint32(payload[0:4]))
		fmt.Println(msgSize)
		payload = payload[4:]
		if messageCompression == 1 {
			payload = gzipDecompress(payload)
		}
		fmt.Printf(" Frontend message: %s\n", string(payload))
		return resp, nil

	default:
		fmt.Printf(" wrong message type:%d\n", messageType)
		return resp, errors.New("wrong message type")
	}
}
// DouBaoAudioStreamSynth synthesizes text over the TTS websocket endpoint and
// returns the concatenated audio together with the elapsed time.
//
// appid selects the access token from appTokenMap; an unknown appid is
// rejected with "invalid appid". The request is the JSON from setupInput,
// gzip-compressed and framed as defaultHeader + 4-byte big-endian payload
// length + payload. Server messages are read until one is flagged as last.
//
// An error is returned when the dial or the request write fails, and also when
// reading or parsing any server message fails — in that case no partial audio
// is returned, so a failed synthesis is never reported as success.
func DouBaoAudioStreamSynth(text, voiceType, appid string) ([]byte, time.Duration, error) {
	startTime := time.Now()

	token, exists := appTokenMap[appid]
	if !exists {
		return nil, 0, errors.New("invalid appid")
	}
	// Authentication header for the websocket handshake.
	header := http.Header{"Authorization": []string{fmt.Sprintf("Bearer;%s", token)}}

	input := gzipCompress(setupInput(text, voiceType, appid, optSubmit))
	payloadArr := make([]byte, 4)
	binary.BigEndian.PutUint32(payloadArr, uint32(len(input)))

	// Build the frame in a fresh slice so defaultHeader's backing array is
	// never appended to.
	clientRequest := make([]byte, 0, len(defaultHeader)+len(payloadArr)+len(input))
	clientRequest = append(clientRequest, defaultHeader...)
	clientRequest = append(clientRequest, payloadArr...)
	clientRequest = append(clientRequest, input...)

	c, _, err := websocket.DefaultDialer.Dial(ZijieWs_Url.String(), header)
	if err != nil {
		return nil, 0, err
	}
	defer c.Close()
	fmt.Println("Successfully connected to the byte server.")

	if err = c.WriteMessage(websocket.BinaryMessage, clientRequest); err != nil {
		return nil, 0, err
	}

	var audio []byte
	for {
		_, message, err := c.ReadMessage()
		if err != nil {
			return nil, 0, fmt.Errorf("read tts response: %w", err)
		}
		resp, err := parseResponse(message)
		if err != nil {
			return nil, 0, fmt.Errorf("parse tts response: %w", err)
		}
		audio = append(audio, resp.Audio...)
		if resp.IsLast {
			break
		}
	}

	// The package-level duration is still updated for compatibility; it is
	// shared by all callers, so the returned value is the reliable one.
	elapsed := time.Since(startTime)
	duration = elapsed
	fmt.Printf("音频合成时间: %s\n", elapsed)
	return audio, elapsed, nil
}
// WsStreamSynth2 is the Gin handler that upgrades the request to a websocket
// and serves synthesis requests over it.
//
// The appid and voiceType query parameters are read once, when the connection
// is established. The client then sends JSON messages of the form
// {"text": "..."}; for each one the handler runs DouBaoAudioStreamSynth and
// replies with a binary message holding the audio, followed by text messages
// reporting the synthesis time and that the handler keeps listening. The loop
// ends, and the connection is closed, on the first read, synthesis or audio
// write failure. The token lookup for appid happens inside
// DouBaoAudioStreamSynth.
func WsStreamSynth2(c *gin.Context) {
	// NOTE(review): CheckOrigin accepts every origin, so any web page can open
	// this socket — confirm that this is intended.
	upgrader := websocket.Upgrader{
		CheckOrigin: func(r *http.Request) bool {
			return true
		},
	}
	conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
	if err != nil {
		// Upgrade has already replied to the client with an HTTP error, so no
		// second response is written here.
		return
	}
	defer conn.Close()

	appid := c.Query("appid")
	voiceType := c.Query("voiceType")

	// Status messages are best effort: if the connection is gone, the next
	// read fails and ends the loop.
	conn.WriteMessage(websocket.TextMessage, []byte("连接已建立"))

	type requestData struct {
		Text string `json:"text"`
	}
	for {
		var req requestData
		if err := conn.ReadJSON(&req); err != nil {
			conn.WriteMessage(websocket.TextMessage, []byte("Invalid JSON format"))
			break
		}

		conn.WriteMessage(websocket.TextMessage, []byte("开始处理合成音频"))
		audio, elapsed, err := DouBaoAudioStreamSynth(req.Text, voiceType, appid)
		if err != nil {
			conn.WriteMessage(websocket.TextMessage, []byte("生成音频失败"))
			break
		}

		// Send the audio back as one binary message.
		if err := conn.WriteMessage(websocket.BinaryMessage, audio); err != nil {
			break
		}
		conn.WriteMessage(websocket.TextMessage, []byte("本次合成时间: "+elapsed.String()))
		conn.WriteMessage(websocket.TextMessage, []byte("继续监听..."))
	}
}
// main starts a Gin server on :8081 that exposes the websocket synthesis
// endpoint GET /WsStreamSynth/ws, which is served by WsStreamSynth2.
func main() {
	r := gin.Default()
	r.GET("/WsStreamSynth/ws", WsStreamSynth2)
	// Run blocks until the server stops; report why instead of discarding it
	// (for example, when the port is already in use).
	if err := r.Run(":8081"); err != nil {
		fmt.Printf("http server stopped [err:%s]\n", err.Error())
	}
}