字节火山引擎-大模型声音复刻,流式语音合成接口

字节火山引擎-大模型声音复刻,流式语音合成接口


创建应用

在这里插入图片描述

声音复刻大模型页面查看应用,获取接口调用需要的参数

在这里插入图片描述

注意:调用 TTS 接口时需要以下三个参数:

  • APP ID => 获取到对应的应用
  • Access Token => 鉴权需要
  • 声音ID => 语音合成需要

HTTP 接口合成音频示例代码(非流式:operation 为 query,一次请求返回完整音频)

package main

import (
	"bytes"
	"encoding/base64"
	"encoding/json"
	"errors"
	"fmt"
	"github.com/gin-gonic/gin"
	"github.com/google/uuid"
	"io/ioutil"
	"net/http"
	"time"
)

// Request settings for the HTTP synthesis example. synthesis reads app_id,
// clusterId and SpeakId when building the request body and BearerToken when
// building the Authorization header.
//
// NOTE(review): these are hard-coded sample credentials; real deployments
// should load them from configuration or the environment instead.
var (
	// app_id is sent as app.appid in the request body.
	app_id      = "8678693223"
	// clusterId is sent as app.cluster in the request body.
	clusterId   = "volcano_icl"
	// SpeakId is sent as audio.voice_type in the request body.
	SpeakId     = "S_v7xollyj1"
	// BearerToken is placed in the Authorization request header.
	BearerToken = "-50OZ81pPKpn8pRZEgNrxd0wCELJJMIN"
	// resource_id is not referenced by the code shown in this example.
	resource_id = "volc.tts_async.emotion"
)

// durationTime holds the elapsed time of the most recent successful
// synthesis call; synthesis overwrites it on every success.
// NOTE(review): it is a plain package variable, so concurrent requests
// overwrite each other's value.
var durationTime time.Duration

// TTSServResponse is the JSON document returned by the TTS HTTP endpoint.
// synthesis decodes the response body into it, treats Code 3000 as success
// and base64-decodes Data to obtain the audio bytes.
type TTSServResponse struct {
	ReqID     string `json:"reqid"`
	// Code is the service status code; synthesis maps the non-3000 values it
	// knows about to readable messages.
	Code      int    `json:"code"`
	// NOTE(review): the tag is capitalised unlike its siblings; encoding/json
	// matches keys case-insensitively when decoding, so a lowercase "message"
	// key would still be accepted.
	Message   string `json:"Message"`
	Operation string `json:"operation"`
	Sequence  int    `json:"sequence"`
	// Data carries the audio as a base64 string.
	Data      string `json:"data"`
}

// httpPost sends body to url with an HTTP POST, applying every entry of
// headers to the request, and returns the raw response body.
//
// timeout bounds the whole exchange (it is installed as the client timeout).
// The HTTP status code is not inspected: any response whose body can be read
// is returned to the caller as-is. A nil slice is returned together with the
// error when the request cannot be built, sent, or read.
func httpPost(url string, headers map[string]string, body []byte, timeout time.Duration) ([]byte, error) {
	request, err := http.NewRequest(http.MethodPost, url, bytes.NewBuffer(body))
	if err != nil {
		return nil, err
	}
	for name := range headers {
		request.Header.Set(name, headers[name])
	}

	httpClient := http.Client{Timeout: timeout}
	response, err := httpClient.Do(request)
	if err != nil {
		return nil, err
	}
	defer response.Body.Close()

	payload, readErr := ioutil.ReadAll(response.Body)
	if readErr != nil {
		return nil, readErr
	}
	return payload, nil
}

// synthesis asks the TTS HTTP endpoint to render text as WAV audio and
// returns the decoded audio bytes.
//
// The request is a single JSON document (operation "query"); the response is
// a single JSON document whose data field carries the audio as base64. On
// success the elapsed time is printed and stored in the package-level
// durationTime.
//
// An error is returned when the request cannot be encoded or sent, when the
// response is not valid JSON, when the service reports a code other than
// 3000 (the error text is the human-readable message for that code), or when
// the audio payload is not valid base64.
func synthesis(text string) ([]byte, error) {
	// Record when synthesis started.
	startTime := time.Now()

	reqID := uuid.NewString()
	params := make(map[string]map[string]interface{})
	params["app"] = make(map[string]interface{})
	params["app"]["appid"] = app_id
	// Placeholder value; the real credential travels in the Authorization
	// header built below.
	params["app"]["token"] = "access_token"
	params["app"]["cluster"] = clusterId
	params["user"] = make(map[string]interface{})
	params["user"]["uid"] = "uid"
	params["audio"] = make(map[string]interface{})
	params["audio"]["voice_type"] = SpeakId
	params["audio"]["encoding"] = "wav"
	params["audio"]["speed_ratio"] = 1.0
	params["audio"]["volume_ratio"] = 1.0
	params["audio"]["pitch_ratio"] = 1.0
	params["request"] = make(map[string]interface{})
	params["request"]["reqid"] = reqID
	params["request"]["text"] = text
	params["request"]["text_type"] = "plain"
	params["request"]["operation"] = "query"

	headers := make(map[string]string)
	headers["Content-Type"] = "application/json"
	// The separator after "Bearer" is a semicolon, not a space.
	headers["Authorization"] = fmt.Sprintf("Bearer;%s", BearerToken)

	url := "https://openspeech.bytedance.com/api/v1/tts"
	timeo := 30 * time.Second
	body, err := json.Marshal(params)
	if err != nil {
		return nil, fmt.Errorf("marshal tts request: %w", err)
	}
	synResp, err := httpPost(url, headers, body, timeo)
	if err != nil {
		fmt.Printf("http post fail [err:%s]\n", err.Error())
		return nil, err
	}
	fmt.Printf("resp body:%s\n", synResp)
	var respJSON TTSServResponse
	err = json.Unmarshal(synResp, &respJSON)
	if err != nil {
		fmt.Printf("unmarshal response fail [err:%s]\n", err.Error())
		return nil, err
	}
	codeMessages := map[int]string{
		3001: "无效的请求,请检查参数",
		3003: "并发超限,请稍后重试",
		3005: "后端服务忙,请稍后重试",
		3006: "服务中断,请检查参数",
		3010: "文本长度超限,请检查文本长度",
		3011: "无效文本,请检查文本内容",
		3030: "处理超时,请重试或检查文本",
		3031: "处理错误,后端出现异常,请重试",
		3032: "等待获取音频超时,请重试",
		3040: "后端链路连接错误,请重试",
		3050: "音色不存在,请检查voice_type代号",
	}

	// Any code other than 3000 is a failure; translate the known ones.
	code := respJSON.Code
	if code != 3000 {
		fmt.Printf("code fail [code:%d]\n", code)
		message, exists := codeMessages[code]
		if !exists {
			message = "未知错误,请重试"
		}
		return nil, errors.New(message)
	}

	// A payload that is not valid base64 must surface as an error rather
	// than be handed to the caller as if it were audio.
	audio, err := base64.StdEncoding.DecodeString(respJSON.Data)
	if err != nil {
		return nil, fmt.Errorf("decode audio data: %w", err)
	}

	// Record how long synthesis took.
	durationTime = time.Since(startTime)

	// Print the synthesis time.
	fmt.Printf("音频合成时间: %s\n", durationTime)

	return audio, nil
}

// handleTTS is the Gin handler for text-to-speech requests.
//
// It expects a JSON object of string values with a non-empty "text" entry,
// passes that text to synthesis, and writes the resulting bytes back with
// content type audio/wav. A body that cannot be bound, or a missing/empty
// text, yields 400; a synthesis failure yields 500 with the error text in
// "details".
func handleTTS(c *gin.Context) {
	var payload map[string]string
	if bindErr := c.ShouldBindJSON(&payload); bindErr != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid input"})
		return
	}

	// A missing key reads as "", so one emptiness check covers both the
	// absent and the empty case.
	text := payload["text"]
	if text == "" {
		c.JSON(http.StatusBadRequest, gin.H{"error": "Text is required"})
		return
	}

	wav, synthErr := synthesis(text)
	if synthErr != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "Synthesis failed", "details": synthErr.Error()})
		return
	}

	// Send the synthesized audio as the response body.
	c.Data(http.StatusOK, "audio/wav", wav)
}

// main registers the POST /synthesize route and serves it on port 8080.
func main() {
	r := gin.Default()

	// POST request to synthesize text to speech.
	r.POST("/synthesize", handleTTS)

	// Run blocks until the server stops; report why it stopped (for example
	// when the port is already in use) instead of exiting silently.
	if err := r.Run(":8080"); err != nil {
		fmt.Printf("server stopped [err:%s]\n", err.Error())
	}
}

WebSocket 流式合成音频接口示例代码

package main

import (
	"bytes"
	"compress/gzip"
	"encoding/binary"
	"encoding/json"
	"errors"
	"fmt"
	"github.com/gin-gonic/gin"
	"github.com/gorilla/websocket"
	"github.com/satori/go.uuid"
	"io/ioutil"
	"net/http"
	"net/url"
	"time"
)

// Lookup tables that turn the 4-bit fields of a response header into
// human-readable descriptions. parseResponse uses them only when printing
// the decoded header; a value that is missing from a table prints as an
// empty string.
var (
	// enumMessageType describes the message type (high nibble of byte 1).
	enumMessageType = map[byte]string{
		11: "audio-only server response",
		12: "frontend server response",
		15: "error message from server",
	}
	// enumMessageTypeSpecificFlags describes the flags (low nibble of byte 1).
	enumMessageTypeSpecificFlags = map[byte]string{
		0: "no sequence number",
		1: "sequence number > 0",
		2: "last message from server (seq < 0)",
		3: "sequence number < 0",
	}
	// enumMessageSerializationMethods describes the serialization method
	// (high nibble of byte 2).
	enumMessageSerializationMethods = map[byte]string{
		0:  "no serialization",
		1:  "JSON",
		15: "custom type",
	}
	// enumMessageCompression describes the compression method (low nibble of
	// byte 2).
	enumMessageCompression = map[byte]string{
		0:  "no compression",
		1:  "gzip",
		15: "custom compression method",
	}
)

// Values for the request.operation field built by setupInput.
// DouBaoAudioStreamSynth sends optSubmit; optQuery is not used by the code
// shown in this example.
const (
	optQuery  string = "query"
	optSubmit string = "submit"
)

// duration holds the elapsed time of the most recent call to
// DouBaoAudioStreamSynth, which overwrites it on every completed call.
var duration time.Duration

// addr is the host of the speech service.
var addr = "openspeech.bytedance.com"

// ZijieWs_Url is the wss endpoint that DouBaoAudioStreamSynth dials.
var ZijieWs_Url = url.URL{Scheme: "wss", Host: addr, Path: "/api/v1/tts/ws_binary"}



// appTokenMap maps each appid to its access token. DouBaoAudioStreamSynth
// looks the token up by appid and rejects appids that are not listed.
//
// NOTE(review): these are hard-coded sample credentials; real deployments
// should load them from configuration or the environment instead.
var appTokenMap = map[string]string{
	"8678693223": "-50OZ81pPKpn8pRZEgNrxd0wCELJJMIN1ww", // Hu Tao
	"8724961923": "Ut0tYdEdwVvHgSzeV9gdXEN8EGXSlqBesdd", // Guo Degang
	"5046524282": "xRcD5NYAo0BMkMajm9JT3XAigKtRUBCOddd", // Zhongli
}

// synResp is the result of decoding one server frame with parseResponse.
type synResp struct {
	// Audio is the audio data carried by the frame; it stays empty for
	// frames that carry none.
	Audio  []byte
	// IsLast is set when the frame's sequence number is negative, which
	// DouBaoAudioStreamSynth treats as the end of the stream.
	IsLast bool
}

// defaultHeader is the fixed 4-byte header that DouBaoAudioStreamSynth puts
// in front of every request. Each of the first three bytes packs two 4-bit
// fields, high nibble first:
//
// version: b0001 (4 bits)
// header size: b0001 (4 bits) - counted in 4-byte units, i.e. 4 bytes
// message type: b0001 (Full client request) (4bits)
// message type specific flags: b0000 (none) (4bits)
// message serialization method: b0001 (JSON) (4 bits)
// message compression: b0001 (gzip) (4bits)
// reserved data: 0x00 (1 byte)
var defaultHeader = []byte{0x11, 0x10, 0x11, 0x00}

// setupInput builds the JSON request document for one synthesis call and
// returns it serialized.
//
// text is the text to speak, voiceType the voice identifier, appid the
// application id and opt the value of request.operation. Every call gets a
// fresh UUID as its request id. Audio is requested as MP3 at 24 kHz with
// neutral speed, volume and pitch.
func setupInput(text, voiceType, appid, opt string) []byte {
	var err error
	reqID := uuid.Must(uuid.NewV4(), err).String()

	payload := map[string]map[string]interface{}{
		"app": {
			// The real appid is shown on the platform console.
			"appid": appid,
			"token": "access_token",
			// The real cluster name is shown on the platform console.
			"cluster": "volcano_icl",
		},
		"user": {
			"uid": "uid",
		},
		"audio": {
			"voice_type":   voiceType,
			"encoding":     "mp3", // MP3 output
			"sample_rate":  24000, // 24 kHz sample rate
			"speed_ratio":  1.0,
			"volume_ratio": 1.0,
			"pitch_ratio":  1.0,
		},
		"request": {
			"reqid":     reqID,
			"text":      text,
			"text_type": "plain",
			"operation": opt,
		},
	}

	encoded, _ := json.Marshal(payload)
	return encoded
}

// gzipCompress returns input compressed as a complete gzip stream using the
// default compression level.
func gzipCompress(input []byte) []byte {
	packed := new(bytes.Buffer)
	zw := gzip.NewWriter(packed)
	// Errors are ignored: the destination is an in-memory buffer.
	_, _ = zw.Write(input)
	// Close flushes pending data and writes the gzip trailer.
	_ = zw.Close()
	return packed.Bytes()
}

// gzipDecompress returns the decompressed contents of the gzip stream in
// input.
//
// It returns nil when input does not start with a valid gzip header
// (including empty input) instead of panicking on the nil reader that
// gzip.NewReader hands back in that case. If the stream is damaged further
// in, whatever could be decoded before the failure is returned.
func gzipDecompress(input []byte) []byte {
	r, err := gzip.NewReader(bytes.NewBuffer(input))
	if err != nil {
		return nil
	}
	defer r.Close()

	// Best effort: the signature has no error result, so a read failure
	// yields the bytes decoded so far.
	out, _ := ioutil.ReadAll(r)
	return out
}

// parseResponse decodes one binary frame received from the TTS WebSocket
// endpoint and prints the decoded header fields to stdout.
//
// Frame layout as read here: byte 0 holds the protocol version (high nibble)
// and the header size in 4-byte units (low nibble); byte 1 holds the message
// type and its flags; byte 2 holds the serialization and compression
// methods; byte 3 is reserved. Everything after the header is the payload.
//
//   - audio frame (type 0xb): with flags 0 it is an acknowledgement without
//     data; otherwise the payload is a 4-byte big-endian sequence number, a
//     4-byte size and the audio bytes, which are returned in resp.Audio.
//     A negative sequence number sets resp.IsLast.
//   - error frame (type 0xf): a 4-byte code, 4 further bytes, then the
//     message (gzip-compressed when the compression field is 1); the message
//     is returned as err.
//   - frontend frame (type 0xc): a 4-byte size followed by a message that is
//     only printed.
//
// The frame comes from the network, so every length is checked before it is
// used; a frame that is too short for its declared layout, or that has an
// unknown type, results in an error rather than a panic.
func parseResponse(res []byte) (resp synResp, err error) {
	if len(res) < 4 {
		return resp, fmt.Errorf("response too short: %d bytes", len(res))
	}

	protoVersion := res[0] >> 4
	headSize := res[0] & 0x0f
	messageType := res[1] >> 4
	messageTypeSpecificFlags := res[1] & 0x0f
	serializationMethod := res[2] >> 4
	messageCompression := res[2] & 0x0f
	reserve := res[3]

	// The header size is counted in 4-byte units and includes the 4 fixed
	// bytes read above, so it must be at least 1 and fit inside the frame.
	headerLen := int(headSize) * 4
	if headSize < 1 || headerLen > len(res) {
		return resp, fmt.Errorf("invalid header size %d for a %d-byte response", headSize, len(res))
	}
	headerExtensions := res[4:headerLen]
	payload := res[headerLen:]

	fmt.Printf("            Protocol version: %x - version %d\n",
		protoVersion, protoVersion)
	fmt.Printf("                 Header size: %x - %d bytes\n",
		headSize, headerLen)
	fmt.Printf("                Message type: %x - %s\n", messageType,
		enumMessageType[messageType])
	fmt.Printf(" Message type specific flags: %x - %s\n", messageTypeSpecificFlags,
		enumMessageTypeSpecificFlags[messageTypeSpecificFlags])
	fmt.Printf("Message serialization method: %x - %s\n",
		serializationMethod, enumMessageSerializationMethods[serializationMethod])
	fmt.Printf("         Message compression: %x - %s\n",
		messageCompression, enumMessageCompression[messageCompression])
	fmt.Printf("                    Reserved: %d\n", reserve)
	if headSize != 1 {
		fmt.Printf("           Header extensions: %s\n",
			headerExtensions)
	}

	switch messageType {
	case 0xb: // audio-only server response
		// No sequence number: the frame is an ACK and carries no audio.
		if messageTypeSpecificFlags == 0 {
			fmt.Println("                Payload size: 0")
			return resp, nil
		}
		if len(payload) < 8 {
			return resp, fmt.Errorf("audio frame too short: %d payload bytes", len(payload))
		}
		sequenceNumber := int32(binary.BigEndian.Uint32(payload[0:4]))
		payloadSize := int32(binary.BigEndian.Uint32(payload[4:8]))
		resp.Audio = append(resp.Audio, payload[8:]...)
		fmt.Printf("             Sequence number: %d\n",
			sequenceNumber)
		fmt.Printf("                Payload size: %d\n", payloadSize)
		// A negative sequence number marks the final frame.
		if sequenceNumber < 0 {
			resp.IsLast = true
		}
		return resp, nil

	case 0xf: // error message from server
		if len(payload) < 8 {
			return resp, fmt.Errorf("error frame too short: %d payload bytes", len(payload))
		}
		code := int32(binary.BigEndian.Uint32(payload[0:4]))
		errMsg := payload[8:]
		if messageCompression == 1 {
			errMsg = gzipDecompress(errMsg)
		}
		fmt.Printf("                  Error code: %d\n", code)
		fmt.Printf("                   Error msg: %s\n", string(errMsg))
		return resp, errors.New(string(errMsg))

	case 0xc: // frontend server response
		if len(payload) < 4 {
			return resp, fmt.Errorf("frontend frame too short: %d payload bytes", len(payload))
		}
		msgSize := int32(binary.BigEndian.Uint32(payload[0:4]))
		fmt.Println(msgSize)
		payload = payload[4:]
		if messageCompression == 1 {
			payload = gzipDecompress(payload)
		}
		fmt.Printf("            Frontend message: %s\n", string(payload))
		return resp, nil

	default:
		fmt.Printf("          wrong message type:%d\n", messageType)
		return resp, errors.New("wrong message type")
	}
}



// DouBaoAudioStreamSynth synthesizes text with the given voice over the
// streaming WebSocket endpoint and returns the concatenated audio together
// with the time the whole call took.
//
// appid selects the access token from appTokenMap; an appid that is not
// listed there is rejected with "invalid appid". The request is the JSON
// document from setupInput, gzip-compressed and framed as
// defaultHeader + 4-byte big-endian payload length + payload. Response
// frames are read until one is marked as the last.
//
// Any failure while connecting, sending, reading or decoding a frame is
// returned as an error with nil audio, so callers never receive a truncated
// stream reported as success. On success the elapsed time is also stored in
// the package-level duration.
func DouBaoAudioStreamSynth(text, voiceType, appid string) ([]byte, time.Duration, error) {
	// Record when synthesis started.
	startTime := time.Now()

	// Look up the token that belongs to this appid.
	token, exists := appTokenMap[appid]
	if !exists {
		return nil, 0, errors.New("invalid appid")
	}
	// Authorization header; the separator after "Bearer" is a semicolon.
	header := http.Header{"Authorization": []string{fmt.Sprintf("Bearer;%s", token)}}

	input := gzipCompress(setupInput(text, voiceType, appid, optSubmit))
	payloadArr := make([]byte, 4)
	binary.BigEndian.PutUint32(payloadArr, uint32(len(input)))
	// Build the frame in a fresh slice so defaultHeader is never written to.
	clientRequest := make([]byte, 0, len(defaultHeader)+len(payloadArr)+len(input))
	clientRequest = append(clientRequest, defaultHeader...)
	clientRequest = append(clientRequest, payloadArr...)
	clientRequest = append(clientRequest, input...)

	// Open the WebSocket connection to the speech service.
	c, _, err := websocket.DefaultDialer.Dial(ZijieWs_Url.String(), header)
	if err != nil {
		return nil, 0, err
	}
	defer c.Close()

	// Report the successful connection.
	fmt.Println("Successfully connected to the byte server.")

	if err = c.WriteMessage(websocket.BinaryMessage, clientRequest); err != nil {
		return nil, 0, err
	}

	var audio []byte
	for {
		_, message, err := c.ReadMessage()
		if err != nil {
			return nil, 0, fmt.Errorf("read synthesis response: %w", err)
		}
		resp, err := parseResponse(message)
		if err != nil {
			return nil, 0, fmt.Errorf("parse synthesis response: %w", err)
		}
		audio = append(audio, resp.Audio...)
		if resp.IsLast {
			break
		}
	}

	// Record how long synthesis took.
	duration = time.Since(startTime)

	// Print the synthesis time.
	fmt.Printf("音频合成时间: %s\n", duration)

	return audio, duration, nil
}

// WsStreamSynth2 is the Gin handler behind the synthesis WebSocket.
//
// It upgrades the request to a WebSocket, reads the "appid" and "voiceType"
// query parameters once, and then serves requests in a loop: each incoming
// JSON message {"text": "..."} is synthesized with DouBaoAudioStreamSynth
// and answered with one binary message holding the audio, followed by text
// messages reporting the synthesis time and that the handler keeps
// listening. The appid-to-token lookup happens inside
// DouBaoAudioStreamSynth.
//
// The loop ends, and the connection is closed, when a message cannot be read
// as JSON, when synthesis fails, or when a message can no longer be written
// to the client.
func WsStreamSynth2(c *gin.Context) {
	upgrader := websocket.Upgrader{
		// NOTE(review): every Origin is accepted; restrict this before
		// exposing the endpoint to browsers.
		CheckOrigin: func(r *http.Request) bool {
			return true
		},
	}

	// Upgrade the HTTP request to a WebSocket connection.
	conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
	if err != nil {
		// Upgrade has already sent an HTTP error reply, so nothing more may
		// be written to the response here.
		return
	}
	defer conn.Close()

	appid := c.Query("appid")
	voiceType := c.Query("voiceType")

	// sendText writes one text message and reports whether it succeeded.
	sendText := func(msg string) bool {
		return conn.WriteMessage(websocket.TextMessage, []byte(msg)) == nil
	}

	if !sendText("连接已建立") {
		return
	}

	// RequestData is the JSON message a client sends per synthesis request.
	type RequestData struct {
		Text string `json:"text"`
	}

	for {
		var requestData RequestData
		if err = conn.ReadJSON(&requestData); err != nil {
			// Best effort: the peer may already be gone.
			sendText("Invalid JSON format")
			return
		}

		if !sendText("开始处理合成音频") {
			return
		}
		audio, duration, err := DouBaoAudioStreamSynth(requestData.Text, voiceType, appid)
		if err != nil {
			sendText("生成音频失败")
			return
		}

		// Send the audio back to the client as binary data.
		if err = conn.WriteMessage(websocket.BinaryMessage, audio); err != nil {
			return
		}
		if !sendText("本次合成时间: " + duration.String()) {
			return
		}
		if !sendText("继续监听...") {
			return
		}
	}
}

// main registers the WebSocket synthesis route and serves it on port 8081.
func main() {
	// Set up the Gin router.
	r := gin.Default()

	// WebSocket synthesis endpoint.
	r.GET("/WsStreamSynth/ws", WsStreamSynth2)

	// Run blocks until the server stops; report why it stopped (for example
	// when the port is already in use) instead of exiting silently.
	if err := r.Run(":8081"); err != nil {
		fmt.Printf("server stopped [err:%s]\n", err.Error())
	}
}

Postman测试

在这里插入图片描述

### 关于火山引擎双向流式API

#### 双向流式API概述

双向流式API允许客户端和服务端之间建立持久连接,在此期间双方可以发送多个消息而无需重新创建新的连接。这种方式特别适用于实时数据传输场景,如直播、在线会议等[^1]。

#### 文档获取途径

对于火山引擎提供的具体文档位置,建议访问官方开发者网站或者通过GitHub仓库中的README文件来查找最新的API接口描述以及使用指南。通常这类资源会包含详细的参数解释、请求响应格式等内容。

#### 示例代码展示

下面给出一段基于Python实现调用假设性的火山引擎双向流式的简化版伪代码:

```python
import grpc


class VolcanoEngineServiceStub(grpc.Client):
    def bidirectional_streaming_rpc(self, request_iterator):
        response_iterator = self.stub.BidirectionalStreamingCall(request_iterator)
        for response in response_iterator:
            yield response


def generate_requests():
    while True:
        message = input("Enter your message or type 'exit' to quit:")
        if message.lower() == "exit":
            break
        yield Request(message=message)


if __name__ == "__main__":
    channel = grpc.insecure_channel('localhost:50051')
    stub = VolcanoEngineServiceStub(channel)
    responses = stub.bidirectional_streaming_rpc(generate_requests())
    try:
        for resp in responses:
            print(f"Received from server {resp.message}")
    except Exception as e:
        print(e)
```

上述例子展示了如何利用gRPC框架构建一个简单的客户端程序去发起与服务器之间的双向流通信过程。

#### 使用注意事项

当实际操作时需要注意认证授权机制、错误处理逻辑等方面的设计;另外也要遵循服务提供商关于流量控制等方面的指导原则以确保应用稳定运行。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值