一、能力与场景说明
同声传译,又称同步口译或同声翻译,是一种专业的口译形式,指的是在讲话者发言时,口译员几乎同时将讲话内容翻译成目标语言。这种翻译方式通常用于国际会议、高级别政治或商业会谈、研讨会和其他需要即时多语言交流的场合。本文介绍如何用 Java 调用原生麦克风,实现边说中文边实时翻译成英文并播放译文语音(示例代码将单次录音时长限制为一分钟,可按需调整)。
二、同传主调用代码
package main.com.iflytek;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import main.com.util.VideoPlayerService;
import okhttp3.*;
import main.com.util.AuthUtils;
import main.com.util.PcmToWav;
import javax.sound.sampled.AudioInputStream;
import java.io.*;
import java.util.*;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
/**
* 1、同声传译接口,可以将音频流实时翻译为不同语种的文本,并输出对应的音频内容,广泛应用于国际论坛、智能会议、智慧教育、跨国交流等场景。
*/
public class SimultaneousTranslationMain extends WebSocketListener {
private static String requestUrl = "wss://ws-api.xf-yun.com/v1/private/simult_interpretation";
// Credentials — obtain these from the iFlytek console
private static String APPID = "";
private static String apiSecret = "";
private static String apiKey = "";
// Recognition (IST) parameters
private static final String domain = "ist_ed_open";
private static final String language = "zh_cn";
private static final String accent = "mandarin";
// Translation direction: Chinese -> English
private static final String from = "cn"; // source language
private static final String to = "en"; // target language
// TTS voice
private static final String vcn = "x2_catherine";
// Output audio encoding ("raw" = PCM)
private static final String encoding = "raw";
// Source audio file (kept for reference; microphone capture is used instead)
private static final String inputAudioPcm = "input/audio/original.pcm";
// Output audio and text files
private static final String outPutPcm = "output/audio/trans.pcm";
private static final String outPutWav = "output/audio/trans.wav";
private static final String asr_result = "output/text/asr.txt";
private static final String trans_result = "output/text/trans.txt";
// Frame status values defined by the protocol
public static final int StatusFirstFrame = 0;
public static final int StatusContinueFrame = 1;
public static final int StatusLastFrame = 2;
public static final Gson gson = new Gson();
// Base64 TTS audio chunks queued for the playback thread
private static BlockingQueue<String> queue = new LinkedBlockingQueue<>();
// volatile: written by the WebSocket callback thread, read by the playback thread
private static volatile boolean overFlag = false;
public static byte[] audioDataByteArray;
public static Long ivwStartTime;
public static Long ivwEndTime;
/**
 * Entry point: opens the microphone, streams 40 ms PCM frames to the
 * simultaneous-interpretation websocket, and stops on EOF or after one minute.
 */
public static void main(String[] args) {
// Clear previous results
clearDir();
// Start the thread that plays translated TTS audio as it arrives
Thread thread = new Thread(new videoPlayer());
thread.start();
// Build the signed websocket URL
String authUrl = AuthUtils.assembleRequestUrl(requestUrl, apiKey, apiSecret);
OkHttpClient client = new OkHttpClient.Builder().build();
Request request = new Request.Builder().url(authUrl).build();
WebSocket webSocket = client.newWebSocket(request, new SimultaneousTranslationMain());
try {
final int frameSize = 1280; // bytes per frame: 1280 bytes = 40 ms at 16 kHz 16-bit mono
final int interval = 40; // ms between frames, pacing the upload at real-time speed
int status = StatusFirstFrame; // current frame status
int count = 0; // frame sequence number
Constants.IVW_ASR_TARGET_DATA_LINE.open(Constants.IVW_ASR_AUDIO_FORMAT);
Constants.IVW_ASR_TARGET_DATA_LINE.start();
// Wrap the microphone line ONCE instead of allocating a new stream per frame
AudioInputStream micStream = new AudioInputStream(Constants.IVW_ASR_TARGET_DATA_LINE);
ivwStartTime = System.currentTimeMillis(); // capture start time
// Send audio frames
sendLoop:
while (true) {
audioDataByteArray = new byte[Constants.IVW_FRAME_SIZE];
int len = micStream.read(audioDataByteArray);
ivwEndTime = System.currentTimeMillis();
// Short read (or EOF, len == -1) or more than one minute elapsed: send the last frame
if (len < frameSize || (ivwEndTime - ivwStartTime) > 60000) {
status = StatusLastFrame;
}
switch (status) {
case StatusFirstFrame:
// The first frame must carry app_id and the full parameter set
webSocket.send(buildFirstFrame(len, count).toString());
status = StatusContinueFrame;
System.out.println("send first 请开始说出中文:");
break;
case StatusContinueFrame:
webSocket.send(buildFrame(StatusContinueFrame, len, count).toString());
break;
case StatusLastFrame:
// Marks the end of the audio stream
webSocket.send(buildFrame(StatusLastFrame, len, count).toString());
System.out.println("send last 中文讲话结束!");
break sendLoop;
}
count++;
Thread.sleep(interval); // simulate real-time audio sampling delay
}
System.out.println("all data is send 所有音频数据发送完毕!");
} catch (Exception e) {
e.printStackTrace();
}
}
/** Builds the protocol "header" object with the given frame status. */
private static JsonObject buildHeader(int status) {
JsonObject header = new JsonObject();
header.addProperty("app_id", APPID);
header.addProperty("status", status);
return header;
}
/** Builds the IST / streamtrans / tts "parameter" object, sent only on the first frame. */
private static JsonObject buildParameter() {
JsonObject ist = new JsonObject();
ist.addProperty("eos", 600000);
ist.addProperty("vto", 15000);
ist.addProperty("accent", accent);
ist.addProperty("language", language);
ist.addProperty("language_type", 1);
ist.addProperty("domain", domain);
JsonObject streamtrans = new JsonObject();
streamtrans.addProperty("from", from);
streamtrans.addProperty("to", to);
JsonObject tts_results = new JsonObject();
tts_results.addProperty("encoding", "raw");
tts_results.addProperty("sample_rate", 16000);
tts_results.addProperty("channels", 1);
tts_results.addProperty("bit_depth", 16);
JsonObject tts = new JsonObject();
tts.addProperty("vcn", vcn);
tts.add("tts_results", tts_results);
JsonObject parameter = new JsonObject();
parameter.add("ist", ist);
parameter.add("streamtrans", streamtrans);
parameter.add("tts", tts);
return parameter;
}
/** Builds the "payload" carrying one base64-encoded audio chunk. */
private static JsonObject buildPayload(int status, int len, int seq) {
JsonObject data = new JsonObject();
// len may be 0 or -1 (EOF) on the last frame; send an empty chunk then.
// The original used `len != 0`, so an EOF read of -1 crashed Arrays.copyOf.
String audio = len > 0
? Base64.getEncoder().encodeToString(Arrays.copyOf(audioDataByteArray, len))
: "";
data.addProperty("audio", audio);
data.addProperty("encoding", encoding);
data.addProperty("sample_rate", 16000);
data.addProperty("status", status);
data.addProperty("seq", seq);
JsonObject payload = new JsonObject();
payload.add("data", data);
return payload;
}
/** First frame: header + parameter + payload. */
private static JsonObject buildFirstFrame(int len, int seq) {
JsonObject frame = new JsonObject();
frame.add("header", buildHeader(StatusFirstFrame));
frame.add("parameter", buildParameter());
frame.add("payload", buildPayload(StatusFirstFrame, len, seq));
return frame;
}
/** Continue / last frame: header + payload only. */
private static JsonObject buildFrame(int status, int len, int seq) {
JsonObject frame = new JsonObject();
frame.add("header", buildHeader(status));
frame.add("payload", buildPayload(status, len, seq));
return frame;
}
@Override
public void onOpen(WebSocket webSocket, Response response) {
super.onOpen(webSocket, response);
}
/** Handles server responses: ASR text, translated text and TTS audio. */
@Override
public void onMessage(WebSocket webSocket, String text) {
super.onMessage(webSocket, text);
ResponseData resp = gson.fromJson(text, ResponseData.class);
// Fix: check header for null BEFORE dereferencing it (original read
// resp.header.code first and only null-checked header afterwards)
if (resp == null || resp.header == null) {
return;
}
if (resp.header.code != 0) {
System.out.println("error=>" + resp.header.message + " sid=" + resp.header.sid + " 错误码=" + resp.header.code);
return;
}
if (resp.payload != null) {
// Recognition result -> asr.txt
if (resp.payload.recognition_results != null) {
saveBase64Text(resp.payload.recognition_results.text, asr_result);
}
// Translation result -> trans.txt
if (resp.payload.streamtrans_results != null) {
saveBase64Text(resp.payload.streamtrans_results.text, trans_result);
}
// TTS audio -> playback queue and trans.pcm
if (resp.payload.tts_results != null) {
String s = resp.payload.tts_results.audio;
queue.add(s);
try {
writeBytesToFile(Base64.getDecoder().decode(s), outPutPcm);
} catch (IOException e) {
e.printStackTrace();
}
}
}
if (resp.header.status == 2) {
// status == 2: the server has returned everything; release resources
System.out.println("session end 同声传译返回完毕!");
System.out.println("本次请求的sid==》 " + resp.header.sid);
System.out.println("数据处理完毕,等待实时转译结束!");
overFlag = true;
try {
// Convert the accumulated PCM output to a playable WAV file
PcmToWav.convertAudioFiles(outPutPcm, outPutWav);
} catch (IOException e) {
e.printStackTrace();
}
webSocket.close(1000, "");
if (queue.size() == 0) {
System.exit(0);
}
}
}
/** Decodes a base64 UTF-8 string and appends it to the given file; errors are logged. */
private static void saveBase64Text(String base64, String path) {
try {
// Explicit charset instead of the platform default
writeStringToFile(new String(Base64.getDecoder().decode(base64), "UTF-8"), path);
} catch (IOException e) {
e.printStackTrace();
}
}
@Override
public void onFailure(WebSocket webSocket, Throwable t, Response response) {
super.onFailure(webSocket, t, response);
System.out.println(t.getMessage());
try {
System.out.println("错误信息:" + response);
if (response == null) {
return;
}
System.out.println("错误信息" + response.code());
System.out.println(response.body().string());
} catch (IOException e) {
e.printStackTrace();
}
}
/** Playback thread: drains the TTS queue and plays audio in real time. */
static class videoPlayer implements Runnable {
@Override
public void run() {
while (true) {
// Exit once the server session is over and the queue is drained
if (overFlag && queue.isEmpty()) {
break;
}
try {
// Timed poll instead of busy-spinning on queue.size()
String chunk = queue.poll(20, java.util.concurrent.TimeUnit.MILLISECONDS);
if (chunk != null) {
VideoPlayerService.videoPlay(Base64.getDecoder().decode(chunk));
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break;
}
}
System.out.println("实时转译结束!!!");
// Exit code 0 for normal completion (1000 would be truncated to 232 by the OS)
System.exit(0);
}
}
/** Deletes all files inside the sub-directories of "output". */
public static void clearDir() {
File root = new File("output");
File[] dirs = root.listFiles();
if (dirs != null) { // null when "output" is missing or not a directory
for (File dir : dirs) {
if (dir.isDirectory()) {
File[] files = dir.listFiles();
if (files == null) {
continue;
}
for (File f : files) {
f.delete();
}
}
}
}
System.out.println("结果集初始化成功------");
}
/** Appends raw bytes to the given file. */
public static void writeBytesToFile(byte[] bs, String path) throws IOException {
// try-with-resources closes the stream even when write() fails
try (OutputStream out = new FileOutputStream(path, true)) {
out.write(bs);
}
}
/** Appends UTF-8 text to the given file. */
public static void writeStringToFile(String content, String path) throws IOException {
try (OutputStream out = new FileOutputStream(path, true)) {
out.write(content.getBytes("UTF-8"));
}
}
// ---- Response DTOs mirroring the server's JSON schema (populated by Gson) ----
public static class ResponseData {
header header;
payload payload;
}
public static class payload {
streamtrans_results streamtrans_results;
recognition_results recognition_results;
tts_results tts_results;
@Override
public String toString() {
return "payload{" + "streamtrans_results=" + streamtrans_results + ", recognition_results=" + recognition_results + ", tts_results=" + tts_results + '}';
}
}
public static class header {
int code;
String message;
String sid;
int status;
}
public static class recognition_results {
String encoding;
String format;
String text; // base64-encoded recognized text
int status;
@Override
public String toString() {
return "recognition_results{" + "encoding='" + encoding + '\'' + ", format='" + format + '\'' + ", text='" + text + '\'' + ", status=" + status + '}';
}
}
public static class streamtrans_results {
String encoding;
String format;
String text; // base64-encoded translated text
int status;
@Override
public String toString() {
return "streamtrans_results{" + "encoding='" + encoding + '\'' + ", format='" + format + '\'' + ", text='" + text + '\'' + ", status=" + status + '}';
}
}
public static class tts_results {
String encoding;
String audio; // base64-encoded PCM audio
int sample_rate;
int channels;
int bit_depth;
int status;
int seq;
int frame_size;
@Override
public String toString() {
return "tts_results{" + "encoding='" + encoding + '\'' + ", audio='" + audio + '\'' + ", sample_rate=" + sample_rate + ", channels=" + channels + ", bit_depth=" + bit_depth + ", status=" + status + ", seq=" + seq + ", frame_size=" + frame_size + '}';
}
}
}
三、鉴权代码
package main.com.util;
import javax.crypto.Mac;
import javax.crypto.spec.SecretKeySpec;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.Base64;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;
/**
* @Author:sjliu7
* 鉴权使用
* @Date:2019/7/31 15:23
*/
public class AuthUtils {
    private static final String serviceId = "simult_interpretation";
    /**
     * Builds the authenticated websocket URL (HMAC-SHA256 signature over
     * host, date and request-line, per the iFlytek signing scheme).
     *
     * @param requestUrl the ws:// or wss:// endpoint to sign
     * @param apiKey     console-issued API key
     * @param apiSecret  console-issued API secret used as the HMAC key
     * @return the endpoint with authorization, host, date and serviceId query params
     * @throws RuntimeException wrapping any failure during URL assembly
     */
    public static String assembleRequestUrl(String requestUrl, String apiKey, String apiSecret) {
        // java.net.URL cannot parse ws(s) schemes, so map to http(s) first
        String httpRequestUrl = requestUrl.replace("ws://", "http://").replace("wss://", "https://");
        try {
            URL url = new URL(httpRequestUrl);
            // RFC 1123 date; GMT is required by the signature specification
            SimpleDateFormat format = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z", Locale.US);
            format.setTimeZone(TimeZone.getTimeZone("GMT"));
            String date = format.format(new Date());
            String host = url.getHost();
            // Canonical string: "host: …\ndate: …\nGET <path> HTTP/1.1"
            StringBuilder builder = new StringBuilder("host: ").append(host).append("\n")
                    .append("date: ").append(date).append("\n")
                    .append("GET ").append(url.getPath()).append(" HTTP/1.1");
            Charset charset = Charset.forName("UTF-8");
            Mac mac = Mac.getInstance("HmacSHA256");
            mac.init(new SecretKeySpec(apiSecret.getBytes(charset), "HmacSHA256"));
            byte[] digest = mac.doFinal(builder.toString().getBytes(charset));
            String sha = Base64.getEncoder().encodeToString(digest);
            String authorization = String.format(
                    "api_key=\"%s\", algorithm=\"%s\", headers=\"%s\", signature=\"%s\"",
                    apiKey, "hmac-sha256", "host date request-line", sha);
            String authBase = Base64.getEncoder().encodeToString(authorization.getBytes(charset));
            // Fix: the one-arg URLEncoder.encode is deprecated and uses the
            // platform charset; encode explicitly as UTF-8
            return String.format("%s?authorization=%s&host=%s&date=%s&serviceId=%s",
                    requestUrl,
                    URLEncoder.encode(authBase, "UTF-8"),
                    URLEncoder.encode(host, "UTF-8"),
                    URLEncoder.encode(date, "UTF-8"),
                    serviceId);
        } catch (Exception e) {
            // Preserve the original cause instead of discarding it
            throw new RuntimeException("assemble requestUrl error:" + e.getMessage(), e);
        }
    }
}
四、PCM转成WAV
package main.com.util;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
public class PcmToWav {
    /**
     * Prepends a 44-byte WAV header to a raw PCM file.
     * Defaults to 16 kHz / mono / 16-bit, matching the TTS audio this project
     * produces (tts_results: sample_rate 16000, channels 1, bit_depth 16).
     * The previous hard-coded 8000 Hz / 2-channel header made the resulting
     * WAV play at the wrong speed and pitch.
     *
     * @param src    path of the raw PCM input file
     * @param target path of the WAV output file
     * @throws IOException if either file cannot be read or written
     */
    public static void convertAudioFiles(String src, String target) throws IOException {
        convertAudioFiles(src, target, 16000, (short) 1, (short) 16);
    }
    /**
     * Same conversion with explicit audio parameters.
     *
     * @param src           path of the raw PCM input file
     * @param target        path of the WAV output file
     * @param sampleRate    samples per second of the PCM data
     * @param channels      channel count
     * @param bitsPerSample bits per sample
     * @throws IOException if either file cannot be read or written
     */
    public static void convertAudioFiles(String src, String target, int sampleRate, short channels, short bitsPerSample) throws IOException {
        byte[] buf = new byte[1024 * 4];
        // First pass: measure the PCM payload size for the header fields
        int pcmSize = 0;
        try (FileInputStream fis = new FileInputStream(src)) {
            int n;
            while ((n = fis.read(buf)) != -1) {
                pcmSize += n;
            }
        }
        WaveHeader header = new WaveHeader();
        // RIFF length = payload + header size, excluding the 8 bytes of the
        // "RIFF" tag and the length field itself
        header.fileLength = pcmSize + (44 - 8);
        header.FmtHdrLeth = 16;
        header.BitsPerSample = bitsPerSample;
        header.Channels = channels;
        header.FormatTag = 0x0001; // uncompressed PCM
        header.SamplesPerSec = sampleRate;
        header.BlockAlign = (short) (channels * bitsPerSample / 8);
        header.AvgBytesPerSec = header.BlockAlign * sampleRate;
        header.DataHdrLeth = pcmSize;
        byte[] h = header.getHeader(); // always 44 bytes per the WAV spec
        // Second pass: write the header followed by the PCM payload;
        // try-with-resources closes both streams even on failure
        try (FileInputStream fis = new FileInputStream(src);
             FileOutputStream fos = new FileOutputStream(target)) {
            fos.write(h, 0, h.length);
            int n;
            while ((n = fis.read(buf)) != -1) {
                fos.write(buf, 0, n);
            }
        }
        System.out.println("Convert OK!");
    }
}
五、音频播放
package main.com.util;
import javax.sound.sampled.*;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
public class VideoPlayerService {
public static SourceDataLine auline = null;
static {
AudioFormat audioFormat=new AudioFormat(16000F, 16, 1,true,false);
DataLine.Info info = new DataLine.Info(SourceDataLine.class, audioFormat);
try {
auline = (SourceDataLine) AudioSystem.getLine(info);
auline.open(audioFormat);
auline.start();
} catch (LineUnavailableException e) {
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
}
public static byte[] byteArray(String file) throws IOException {
BufferedInputStream in = new BufferedInputStream(new FileInputStream(file));
ByteArrayOutputStream out = new ByteArrayOutputStream(1024);
System.out.println("Available bytes:" + in.available());
byte[] temp = new byte[1024];
int size = 0;
while ((size = in.read(temp)) != -1) {
out.write(temp, 0, size);
}
in.close();
byte[] content = out.toByteArray();
return content;
}
public static void videoPlay(byte[] video){
auline.write(video,0,video.length);
}
}
六、WAV头添加
package main.com.util;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
public class WaveHeader {
    // RIFF/WAVE header fields, written in file order by getHeader()
    public final char fileID[] = {'R', 'I', 'F', 'F'};
    public int fileLength;
    public char wavTag[] = {'W', 'A', 'V', 'E'};
    public char FmtHdrID[] = {'f', 'm', 't', ' '};
    public int FmtHdrLeth;
    public short FormatTag;
    public short Channels;
    public int SamplesPerSec;
    public int AvgBytesPerSec;
    public short BlockAlign;
    public short BitsPerSample;
    public char DataHdrID[] = {'d', 'a', 't', 'a'};
    public int DataHdrLeth;
    /**
     * Serializes this header to its 44-byte on-disk representation:
     * four-character tags as ASCII, all numeric fields little-endian.
     *
     * @return the 44-byte WAV header
     * @throws IOException never in practice (in-memory stream), kept for compatibility
     */
    public byte[] getHeader() throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream(44);
        writeTag(out, fileID);
        writeLeInt(out, fileLength);
        writeTag(out, wavTag);
        writeTag(out, FmtHdrID);
        writeLeInt(out, FmtHdrLeth);
        writeLeShort(out, FormatTag);
        writeLeShort(out, Channels);
        writeLeInt(out, SamplesPerSec);
        writeLeInt(out, AvgBytesPerSec);
        writeLeShort(out, BlockAlign);
        writeLeShort(out, BitsPerSample);
        writeTag(out, DataHdrID);
        writeLeInt(out, DataHdrLeth);
        return out.toByteArray();
    }
    // Writes a 16-bit value, least-significant byte first.
    private static void writeLeShort(ByteArrayOutputStream out, int value) {
        out.write(value & 0xFF);
        out.write((value >> 8) & 0xFF);
    }
    // Writes a 32-bit value, least-significant byte first.
    private static void writeLeInt(ByteArrayOutputStream out, int value) {
        out.write(value & 0xFF);
        out.write((value >> 8) & 0xFF);
        out.write((value >> 16) & 0xFF);
        out.write((value >> 24) & 0xFF);
    }
    // Writes a four-character ASCII tag, one byte per char.
    private static void writeTag(ByteArrayOutputStream out, char[] tag) {
        for (char c : tag) {
            out.write(c);
        }
    }
}