首先需要去火山云的控制台开通TTS和STT服务:语音技术 (volcengine.com)
火山这里都提供了免费的额度可以使用
我这里使用Java来调用API
目前我还了解到阿里的开源项目SenseVoice(STT)和CosyVoice(TTS)非常不错,可以做到说话情绪的识别,但它们都是使用Python开发的。感兴趣可以去GitHub上了解一下。
TTS(首先需要导入它给的类)
package com.erroright.backend_server_java.pojo.util;
import java.util.UUID;
/**
 * Request payload for the text-to-speech call, grouped into four sections:
 * {@link App}, {@link User}, {@link Audio} and {@link Request}.
 *
 * <p>Every section is created with default values, so {@code new TtsRequest(text)}
 * yields a complete payload and callers only override what they need.
 *
 * <p>NOTE(review): the ratio getters in {@link Audio} are camelCase
 * ({@code getSpeedRatio}) while the fields are snake_case ({@code speed_ratio}).
 * Bean-style JSON serializers usually derive keys from getter names, so confirm that
 * the emitted keys are the ones the API expects.
 */
public class TtsRequest {
    public static final String APP_ID = "控制台的APPID";
    public static final String CLUSTER = "";
    public static final String Token = "";
    // Voice used for synthesis. If synthesis fails with an error, the account most
    // likely has not been granted access to this voice.
    public static final String VoiceType = "BV001_streaming";
    // Speaking tone / emotion.
    public static final String Emotion = "angry";

    private App app = new App();
    private User user = new User();
    private Audio audio = new Audio();
    private Request request = new Request();

    /** Creates a request with all defaults and no text set. */
    public TtsRequest() {
    }

    /**
     * Creates a request with all defaults and the given text to synthesize.
     *
     * @param text the text to be spoken
     */
    public TtsRequest(String text) {
        this.request.text = text;
    }

    public App getApp() {
        return app;
    }

    public void setApp(App app) {
        this.app = app;
    }

    public User getUser() {
        return user;
    }

    public void setUser(User user) {
        this.user = user;
    }

    public Audio getAudio() {
        return audio;
    }

    public void setAudio(Audio audio) {
        this.audio = audio;
    }

    public Request getRequest() {
        return request;
    }

    public void setRequest(Request request) {
        this.request = request;
    }

    /** Application credentials section; defaults come from the class-level constants. */
    public class App {
        private String appid = APP_ID;
        // Currently not enforced by the service; fill in a default value: access_token
        private String token = Token;
        private String cluster = CLUSTER;

        public String getAppid() {
            return appid;
        }

        public void setAppid(String appid) {
            this.appid = appid;
        }

        public String getToken() {
            return token;
        }

        public void setToken(String token) {
            this.token = token;
        }

        public String getCluster() {
            return cluster;
        }

        public void setCluster(String cluster) {
            this.cluster = cluster;
        }
    }

    /** End-user section. */
    public class User {
        // Currently not enforced by the service; any default value will do.
        private String uid = "388808087185088";

        public String getUid() {
            return uid;
        }

        public void setUid(String uid) {
            this.uid = uid;
        }
    }

    /** Audio rendering options: voice, container encoding, ratios and emotion. */
    public class Audio {
        private String voice_type = VoiceType;
        private String encoding = "wav";
        private float speed_ratio = 1.0F;
        // NOTE(review): confirm that 10 is inside the range the API accepts for the
        // volume and pitch ratios.
        private float volume_ratio = 10;
        private float pitch_ratio = 10;
        private String emotion = Emotion;

        public String getVoice_type() {
            return voice_type;
        }

        public void setVoice_type(String voice_type) {
            this.voice_type = voice_type;
        }

        public String getEncoding() {
            return encoding;
        }

        public void setEncoding(String encoding) {
            this.encoding = encoding;
        }

        public float getSpeedRatio() {
            return speed_ratio;
        }

        /**
         * Sets the speed ratio.
         *
         * @param speed_ratio the new ratio; a {@code float} so that fractional values
         *                    such as 1.5 can be expressed (int arguments still compile
         *                    through widening)
         */
        public void setSpeedRatio(float speed_ratio) {
            this.speed_ratio = speed_ratio;
        }

        public float getVolumeRatio() {
            return volume_ratio;
        }

        /**
         * Sets the volume ratio.
         *
         * @param volume_ratio the new ratio; fractional values are allowed
         */
        public void setVolumeRatio(float volume_ratio) {
            this.volume_ratio = volume_ratio;
        }

        public float getPitchRatio() {
            return pitch_ratio;
        }

        /**
         * Sets the pitch ratio.
         *
         * @param pitch_ratio the new ratio; fractional values are allowed
         */
        public void setPitchRatio(float pitch_ratio) {
            this.pitch_ratio = pitch_ratio;
        }

        public String getEmotion() {
            return emotion;
        }

        /**
         * Sets the emotion by name, matching the type of the field and its default
         * ({@link TtsRequest#Emotion}).
         *
         * @param emotion the emotion name
         */
        public void setEmotion(String emotion) {
            this.emotion = emotion;
        }

        /**
         * Stores the decimal string form of {@code emotion}. Retained only so existing
         * callers keep compiling; prefer {@link #setEmotion(String)}.
         *
         * @param emotion numeric value that is converted with {@link String#valueOf(int)}
         */
        public void setEmotion(int emotion) {
            this.emotion = String.valueOf(emotion);
        }
    }

    /** Per-call section: request id, the text to speak, and how to interpret it. */
    public class Request {
        // A fresh random id is generated for every Request instance.
        private String reqid = UUID.randomUUID().toString();
        private String text;
        private String text_type = "plain";
        private String operation = "query";

        public String getReqid() {
            return reqid;
        }

        public void setReqid(String reqid) {
            this.reqid = reqid;
        }

        public String getText() {
            return text;
        }

        public void setText(String text) {
            this.text = text;
        }

        public String getText_type() {
            return text_type;
        }

        public void setText_type(String text_type) {
            this.text_type = text_type;
        }

        public String getOperation() {
            return operation;
        }

        public void setOperation(String operation) {
            this.operation = operation;
        }
    }
}
调用代码
package com.erroright.backend_server_java.util;
import com.alibaba.fastjson.JSON;
import com.erroright.backend_server_java.pojo.util.TtsRequest;
import lombok.extern.slf4j.Slf4j;
import okhttp3.*;
import org.springframework.stereotype.Component;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Base64;
/**
 * Thin HTTP client for the text-to-speech endpoint at {@link #API_URL}.
 *
 * <p>{@link #getTts(String)} posts a JSON-serialized {@code TtsRequest}, reads the
 * Base64 audio from the {@code data} field of the JSON response, and returns the
 * decoded bytes.
 */
@Component
@Slf4j
public class TtsHttpClient {
    public static final String API_URL = "https://openspeech.bytedance.com/api/v1/tts";
    public static final String ACCESS_TOKEN = "填入火山云开通项目的Token";

    // One client for all calls, so each request does not build its own connection
    // pool and dispatcher threads.
    private static final OkHttpClient HTTP_CLIENT = new OkHttpClient();
    private static final MediaType JSON_MEDIA_TYPE =
            MediaType.get("application/json; charset=utf-8");

    /**
     * Synthesizes speech for the given text.
     *
     * <p>As a side effect the decoded audio is also written to {@code output.wav} in
     * the current working directory, overwriting any previous file.
     *
     * @param content the text to synthesize
     * @return the decoded audio bytes taken from the response's {@code data} field
     * @throws IOException if the HTTP call fails, the status is not successful, the
     *                     response has no body or no {@code data} field, the payload is
     *                     not valid Base64, or the output file cannot be written.
     *                     A response body that is not valid JSON makes the JSON
     *                     parser's own unchecked exception propagate.
     */
    public static byte[] getTts(String content) throws IOException {
        log.info("TTS生成:{}", content);

        String json = JSON.toJSONString(new TtsRequest(content));
        Request request = new Request.Builder()
                .url(API_URL)
                .post(RequestBody.create(json, JSON_MEDIA_TYPE))
                // The "Bearer; " prefix, including the semicolon, is kept exactly as is.
                // NOTE(review): confirm this format against the service's auth docs.
                .header("Authorization", "Bearer; " + ACCESS_TOKEN)
                .build();

        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                throw new IOException("TTS request failed with HTTP status " + response.code());
            }
            ResponseBody responseBody = response.body();
            if (responseBody == null) {
                throw new IOException("TTS response has no body");
            }
            String ttsResponse = responseBody.string();

            // Read the "data" field with the JSON parser rather than by splitting the
            // raw text, so an error response without "data" is reported cleanly.
            String data = JSON.parseObject(ttsResponse).getString("data");
            if (data == null || data.isEmpty()) {
                throw new IOException("TTS response contains no audio data: " + ttsResponse);
            }

            // Decode once and reuse the bytes for both the file and the return value.
            byte[] audio;
            try {
                audio = Base64.getDecoder().decode(data);
            } catch (IllegalArgumentException e) {
                throw new IOException("TTS response data is not valid Base64", e);
            }

            // Keep a copy of the generated audio on disk.
            try (FileOutputStream fos = new FileOutputStream("output.wav")) {
                fos.write(audio);
            }
            return audio;
        }
    }
}
STT(首先导入类。官方文档中是三个类,为了便于在Spring Boot中封装,这里单独拆分出了一个类)
package com.erroright.backend_server_java.pojo.util;
import com.fasterxml.jackson.databind.DeserializationFeat