通过阿里云平台工具实现文字转语音功能

透过窗的阳光H

已于 2022-05-07 15:14:01 修改

阅读量4.4k

点赞数

文章标签： java

于 2022-04-27 14:18:06 首次发布

本文链接：https://blog.csdn.net/weixin_47931063/article/details/124450007

版权

package com.comwinwin.project.speech;

import com.alibaba.nls.client.AccessToken;
import com.ruoyi.common.config.RuoYiConfig;
import com.ruoyi.common.core.domain.Response;
import com.ruoyi.common.utils.RandomUtil;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import org.springframework.beans.factory.annotation.Value;
import lombok.extern.log4j.Log4j2;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

import javax.annotation.Resource;
import javax.servlet.http.HttpServletRequest;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@Log4j2
@Api(value = "文字识别controller", tags = {"b文字识别接口"})
@RestController
public class SpeechSynthesizerLongTextController {

    @ApiOperation("识别文字转语音")
    @PostMapping("/api/SpeechSynthesizer/longText.do")
    public Response<String> SpeechSynthesizerLongText(HttpServletRequest request,@RequestParam(required = false) String ttsTextLong){
        try {
            if(ttsTextLong!=null&&!"".equals(ttsTextLong)){
                ttsTextLong = textUtil.getText(ttsTextLong);
            }
            //参数自己设置
            AccessToken token = new AccessToken("", "");
            token.apply();
            String accessToken = token.getToken();
            long expireTime = token.getExpireTime();

            String appKey = "";
            String url = ""; // 默认即可，默认值：wss://nls-gateway.cn-shanghai.aliyuncs.com/ws/v1

            String videoPath = "/mp3/" + new SimpleDateFormat("yyyyMMdd").format(new Date()) + "/";
            String basePath = getContextPath(request) + "/profile" + videoPath;
            String path = RuoYiConfig.getProfile() + videoPath;

            String filename = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()) + RandomUtil.getRandom(6);
            String urlPath = basePath + filename + ".mp3";
            String file = path + filename + ".mp3";

            Path pathDir = Paths.get(path);
            try {
                if (!Files.exists(pathDir))
                    Files.createDirectories(pathDir);
            } catch (IOException e) {
                e.printStackTrace();
            }
            File out = new File(file);
            FileOutputStream fout = new FileOutputStream(out);
            // 初期并不知道wav文件实际长度，假设为0，最后再校正
            int pcmSize = 0;
            WavHeader header = new WavHeader();
            // 长度字段 = 内容的大小（PCMSize) + 头部字段的大小(不包括前面4字节的标识符RIFF以及fileLength本身的4字节)
            header.fileLength = pcmSize + (44 - 8);
            header.fmtHdrLeth = 16;
            header.bitsPerSample = 16;
            header.channels = 1;
            header.formatTag = 0x0001;
            header.samplesPerSec = 16000;
            header.blockAlign = (short) (header.channels * header.bitsPerSample / 8);
            header.avgBytesPerSec = header.blockAlign * header.samplesPerSec;
            header.dataHdrLeth = pcmSize;
            byte[] h = header.getHeader();
            assert h.length == 44;
            // 先写入44字节的wav头，如果合成的不是wav，比如是pcm，则不需要此步骤
            fout.write(h);

            SpeechSynthesizerLongTextDemo demo = new SpeechSynthesizerLongTextDemo(appKey, accessToken, url);
            demo.process(ttsTextLong, fout);
            demo.shutdown();
            // 更新44字节的wav头，如果合成的不是wav，比如是pcm，则不需要此步骤
            RandomAccessFile wavFile = new RandomAccessFile(file, "rw");
            int fileLength = (int)wavFile.length();
            int dataSize = fileLength - 44;
            System.out.println("filelength = " + fileLength +", datasize = " + dataSize);
            header.fileLength = fileLength - 8;
            header.dataHdrLeth = fileLength - 44;
            wavFile.write(header.getHeader());
            wavFile.close();
            return Response.success(urlPath);
        }catch (IOException e){
            log.info("文字识别失败！ "+e);
            return Response.error("");
        }
    }

    private String getContextPath(HttpServletRequest request) {
        return request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort() + request.getContextPath();
    }

}

package com.comwinwin.project.speech;

import com.alibaba.nls.client.protocol.NlsClient;
import com.alibaba.nls.client.protocol.OutputFormatEnum;
import com.alibaba.nls.client.protocol.SampleRateEnum;
import com.alibaba.nls.client.protocol.tts.SpeechSynthesizer;
import com.alibaba.nls.client.protocol.tts.SpeechSynthesizerListener;
import com.alibaba.nls.client.protocol.tts.SpeechSynthesizerResponse;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

/**
 * 此示例: tts 支持最多300个字符,此demo展示超过300字符的调用方式
 * 说明：这个示例和长文本语音合成并不完全相同，长文本语音合成是单独的产品功能，是将一长串文本直接发送给服务端去合成；
 * 而此处演示的是将一长串文本在调用方处切割然后分段调用语音合成接口
 */
public class SpeechSynthesizerLongTextDemo {
    /** Punctuation characters at which the input text may be cut. */
    private static final String DELIMITERS = "、，。；？！,!?";

    /** Size in bytes of the WAV header written in front of the PCM data. */
    private static final int WAV_HEADER_SIZE = 44;

    private final String appKey;
    NlsClient client;

    /**
     * Creates a demo instance from an already obtained access token.
     *
     * <p>Important: an application should normally create a single {@code NlsClient} and keep
     * it for the lifetime of the application.
     *
     * @param appKey application key passed to the synthesizer
     * @param token  access token used to create the client
     * @param url    gateway URL; null or empty selects the SDK default service address
     */
    public SpeechSynthesizerLongTextDemo(String appKey, String token, String url) {
        this.appKey = appKey;
        if (url == null || url.isEmpty()) {
            client = new NlsClient(token);
        } else {
            client = new NlsClient(url, token);
        }
    }

    /**
     * Builds a listener that appends every received audio frame to {@code fout}
     * and prints progress / failure information to standard output.
     */
    private static SpeechSynthesizerListener getSynthesizerListener(final FileOutputStream fout) {
        return new SpeechSynthesizerListener() {
            int totalSize = 0;

            // Called when one synthesis task has finished.
            @Override
            public void onComplete(SpeechSynthesizerResponse response) {
                System.out.println("task_id: " + response.getTaskId() +
                        ", name: " + response.getName() + ", status: " + response.getStatus());
                System.out.println("onComplete, totalsize = " + totalSize);
            }

            // Called with a frame of synthesized binary audio data.
            @Override
            public void onMessage(ByteBuffer message) {
                try {
                    byte[] bytesArray = new byte[message.remaining()];
                    message.get(bytesArray, 0, bytesArray.length);
                    System.out.println("write arrya:" + bytesArray.length);
                    totalSize += bytesArray.length;
                    fout.write(bytesArray);
                } catch (IOException e) {
                    // The callback signature has no checked exception; report and carry on.
                    e.printStackTrace();
                }
            }

            @Override
            public void onFail(SpeechSynthesizerResponse response) {
                // The task_id is the unique id shared by caller and service; provide it
                // when asking for support on a failed request.
                System.out.println(
                    "task_id: " + response.getTaskId() +
                        // status code 20000000 means success
                        ", status: " + response.getStatus() +
                        // error message
                        ", status_text: " + response.getStatusText());
            }
        };
    }

    /**
     * Splits {@code longText} into short chunks, synthesizes them one after another and
     * appends the resulting PCM audio to {@code fout}.
     *
     * <p>Failures are reported on standard error and do not propagate (best effort).
     *
     * @param longText text to synthesize; null or empty text results in no request
     * @param fout     stream receiving the PCM data; it is not closed by this method
     */
    public void process(final String longText, final FileOutputStream fout) {
        List<String> textArr = splitLongText(longText, 100);
        if (textArr.isEmpty()) {
            return;
        }
        SpeechSynthesizer synthesizer = null;
        try {
            // Create the instance and establish the connection.
            synthesizer = new SpeechSynthesizer(client, getSynthesizerListener(fout));
            synthesizer.setAppKey(appKey);
            // PCM is required so that the results of several requests can be concatenated.
            synthesizer.setFormat(OutputFormatEnum.PCM);
            // Sample rate of the returned audio.
            synthesizer.setSampleRate(SampleRateEnum.SAMPLE_RATE_16K);

            for (String part : textArr) {
                synthesizer.setText(part);
                // Serializes the settings above to JSON, sends them and waits for the server ack.
                synthesizer.start();
                // Wait until this chunk has been synthesized completely.
                synthesizer.waitForComplete();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close the connection.
            if (null != synthesizer) {
                synthesizer.close();
            }
        }
    }

    /**
     * Splits a long text into chunks of at most {@code size} characters.
     *
     * <p>The text is first cut after each punctuation mark; neighbouring segments are then
     * merged as long as the chunk stays within {@code size}, so that chunks do not become
     * needlessly short. A single segment longer than {@code size} is cut into pieces of
     * {@code size} characters. Punctuation is kept, and no empty chunk is ever produced.
     *
     * @param text text to split; null or empty yields an empty list
     * @param size maximum chunk length in characters, must be positive
     * @return the chunks in original order; concatenating them reproduces {@code text}
     * @throws IllegalArgumentException if {@code size} is not positive
     */
    public static List<String> splitLongText(String text, int size) {
        if (size <= 0) {
            throw new IllegalArgumentException("size must be positive: " + size);
        }
        List<String> result = new ArrayList<String>();
        if (text == null || text.isEmpty()) {
            return result;
        }

        StringBuilder chunk = new StringBuilder();
        int segmentStart = 0;
        int last = text.length() - 1;
        for (int i = 0; i <= last; i++) {
            // A segment ends at a delimiter (inclusive) or at the end of the text.
            if (i == last || DELIMITERS.indexOf(text.charAt(i)) >= 0) {
                appendSegment(result, chunk, text.substring(segmentStart, i + 1), size);
                segmentStart = i + 1;
            }
        }
        if (chunk.length() > 0) {
            result.add(chunk.toString());
        }
        return result;
    }

    /** Adds one segment to the pending chunk, flushing or hard-splitting to respect {@code size}. */
    private static void appendSegment(List<String> result, StringBuilder chunk, String segment, int size) {
        // Flush only a non-empty buffer, otherwise an empty chunk would be sent to the service.
        if (chunk.length() > 0 && chunk.length() + segment.length() > size) {
            result.add(chunk.toString());
            chunk.setLength(0);
        }
        int offset = 0;
        while (segment.length() - offset > size) {
            int end = offset + size;
            // Do not cut a surrogate pair in half.
            if (size > 1 && Character.isHighSurrogate(segment.charAt(end - 1))) {
                end--;
            }
            result.add(segment.substring(offset, end));
            offset = end;
        }
        chunk.append(segment, offset, segment.length());
    }

    /** Shuts down the underlying NLS client. */
    public void shutdown() {
        client.shutdown();
    }

    /**
     * Encodes an int as 4 bytes, most significant byte first.
     *
     * @param intData value to encode
     * @return big-endian byte representation
     */
    public static byte[] int2byte(int intData) {
        byte[] byteData = new byte[4];
        byteData[0] = (byte) (0xff & (intData >> 24));
        byteData[1] = (byte) (0xff & (intData >> 16));
        byteData[2] = (byte) (0xff & (intData >> 8));
        byteData[3] = (byte) (0xff & intData);
        return byteData;
    }

    /**
     * Encodes a short as 2 bytes, most significant byte first.
     *
     * @param s value to encode
     * @return big-endian byte representation
     */
    public static byte[] short2byte(short s) {
        byte[] byteData = new byte[2];
        byteData[0] = (byte) (0xff & (s >> 8));
        byteData[1] = (byte) (0xff & s);
        return byteData;
    }

    /**
     * Command-line demo: synthesizes a sample text into {@code longText4TTS.wav}.
     */
    public static void main(String[] args) throws Exception {
        String appKey = "你的appkey";
        String token = "你的token";
        String url = ""; // Empty selects the SDK default: wss://nls-gateway.cn-shanghai.aliyuncs.com/ws/v1

        String ttsTextLong = "百草堂与三味书屋 鲁迅 \n" +
            "我家的后面有一个很大的园，相传叫作百草园。现在是早已并屋子一起卖给朱文公的子孙了，连那最末次的相见也已经隔了七八年，其中似乎确凿只有一些野草；但那时却是我的乐园。\n" +
            "不必说碧绿的菜畦，光滑的石井栏，高大的皂荚树，紫红的桑葚；也不必说鸣蝉在树叶里长吟，肥胖的黄蜂伏在菜花上，轻捷的叫天子(云雀)忽然从草间直窜向云霄里去了。\n" +
            "单是周围的短短的泥墙根一带，就有无限趣味。油蛉在这里低唱，蟋蟀们在这里弹琴。翻开断砖来，有时会遇见蜈蚣；还有斑蝥，倘若用手指按住它的脊梁，便会啪的一声，\n" +
            "从后窍喷出一阵烟雾。何首乌藤和木莲藤缠络着，木莲有莲房一般的果实，何首乌有臃肿的根。有人说，何首乌根是有像人形的，吃了便可以成仙，我于是常常拔它起来，牵连不断地拔起来，\n" +
            "也曾因此弄坏了泥墙，却从来没有见过有一块根像人样! 如果不怕刺，还可以摘到覆盆子，像小珊瑚珠攒成的小球，又酸又甜，色味都比桑葚要好得远......";

        String path = "longText4TTS.wav";

        // The real PCM length is unknown up front: start with 0 and correct it afterwards.
        int pcmSize = 0;
        WavHeader header = new WavHeader();
        // RIFF length = payload size + header size, excluding the "RIFF" tag and the length field itself.
        header.fileLength = pcmSize + (WAV_HEADER_SIZE - 8);
        header.fmtHdrLeth = 16;
        header.bitsPerSample = 16;
        header.channels = 1;
        header.formatTag = 0x0001;
        header.samplesPerSec = 16000;
        header.blockAlign = (short) (header.channels * header.bitsPerSample / 8);
        header.avgBytesPerSec = header.blockAlign * header.samplesPerSec;
        header.dataHdrLeth = pcmSize;
        byte[] h = header.getHeader();
        assert h.length == WAV_HEADER_SIZE;

        // Close the output stream before the file is reopened for header patching.
        try (FileOutputStream fout = new FileOutputStream(new File(path))) {
            // The header is only needed for WAV output; raw PCM would skip this step.
            fout.write(h);

            SpeechSynthesizerLongTextDemo demo = new SpeechSynthesizerLongTextDemo(appKey, token, url);
            try {
                demo.process(ttsTextLong, fout);
            } finally {
                demo.shutdown();
            }
        }

        // Patch the length fields of the header now that the final size is known.
        try (RandomAccessFile wavFile = new RandomAccessFile(path, "rw")) {
            int fileLength = (int) wavFile.length();
            int dataSize = fileLength - WAV_HEADER_SIZE;
            System.out.println("filelength = " + fileLength +", datasize = " + dataSize);
            header.fileLength = fileLength - 8;
            header.dataHdrLeth = dataSize;
            wavFile.write(header.getHeader());
        }
    }
}

package com.comwinwin.project.speech;

import java.io.ByteArrayOutputStream;
import java.io.IOException;

/**
 * In-memory model of a 44-byte PCM WAV (RIFF) header.
 *
 * <p>Fields are listed in the order in which {@link #getHeader()} serializes them;
 * all numeric fields are written least significant byte first.
 */
class WavHeader {
    /** 4 bytes: RIFF container tag ("RIFF"). */
    public final char fileID[] = {'R', 'I', 'F', 'F'};
    /** 4 bytes: total byte count. */
    public int fileLength;
    /** 4 bytes: WAV file tag ("WAVE"). */
    public char wavTag[] = {'W', 'A', 'V', 'E'};
    /** 4 bytes: format chunk tag ("fmt "), the last character is a space. */
    public char fmtHdrID[] = {'f', 'm', 't', ' '};
    /** 4 bytes: format chunk length (usually 0x10; 0x12 means the header carries extra information). */
    public int fmtHdrLeth;
    /** 2 bytes: format category (1 means linear PCM encoding). */
    public short formatTag;
    /** 2 bytes: channel count, 1 for mono and 2 for stereo. */
    public short channels;
    /** 4 bytes: sampling frequency. */
    public int samplesPerSec;
    /** 4 bytes: data rate (average bytes per second). */
    public int avgBytesPerSec;
    /** 2 bytes: data block length, in bytes. */
    public short blockAlign;
    /** 2 bytes: PCM bit width. */
    public short bitsPerSample;
    /** 4 bytes: data chunk tag ("data"). */
    public char dataHdrID[] = {'d', 'a', 't', 'a'};
    /** 4 bytes: total length of the data, in bytes. */
    public int dataHdrLeth;

    /**
     * Serializes the current field values into the binary header layout.
     *
     * @return the header bytes, in field declaration order
     * @throws IOException declared for callers; kept as part of the method's signature
     */
    public byte[] getHeader() throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();

        // RIFF chunk descriptor
        putTag(out, fileID);
        putInt32(out, fileLength);
        putTag(out, wavTag);

        // format chunk
        putTag(out, fmtHdrID);
        putInt32(out, fmtHdrLeth);
        putInt16(out, formatTag);
        putInt16(out, channels);
        putInt32(out, samplesPerSec);
        putInt32(out, avgBytesPerSec);
        putInt16(out, blockAlign);
        putInt16(out, bitsPerSample);

        // data chunk header
        putTag(out, dataHdrID);
        putInt32(out, dataHdrLeth);

        return out.toByteArray();
    }

    /** Appends the low 16 bits of {@code value}, low byte first. */
    private void putInt16(ByteArrayOutputStream out, int value) {
        out.write(value);
        out.write(value >>> 8);
    }

    /** Appends the 32 bits of {@code value}, low byte first. */
    private void putInt32(ByteArrayOutputStream out, int value) {
        for (int shift = 0; shift < 32; shift += 8) {
            // write(int) stores only the lowest 8 bits of its argument
            out.write(value >>> shift);
        }
    }

    /** Appends each character of {@code tag} as a single byte (its low 8 bits). */
    private void putTag(ByteArrayOutputStream out, char[] tag) {
        for (char c : tag) {
            out.write(c);
        }
    }
}

Maven 依赖引用（添加到 pom.xml）

<!--        文字转语音-->
        <dependency>
            <groupId>com.alibaba.nls</groupId>
            <artifactId>nls-sdk-tts</artifactId>
            <version>2.2.1</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba.nls</groupId>
            <artifactId>nls-sdk-common</artifactId>
            <version>2.1.6</version>
        </dependency>

透过窗的阳光H

关注

0
点赞
踩
5

收藏

觉得还不错? 一键收藏
1
评论
通过阿里云平台工具实现文字转语音功能

package com.comwinwin.project.speech;import com.alibaba.nls.client.AccessToken;import com.ruoyi.common.config.RuoYiConfig;import com.ruoyi.common.core.domain.Response;import com.ruoyi.common.utils.RandomUtil;import io.swagger.annotations.Api;import.
复制链接

扫一扫