基于TTS模型快速生成大规模语音数据教程

Pika在线

已于 2024-08-12 09:16:03 修改

阅读量641

点赞数 6

分类专栏： AI语音 文章标签： TTS 语音合成 语音识别

于 2024-08-08 17:22:17 首次发布

本文链接：https://blog.csdn.net/Ephemeroptera/article/details/141031942

版权

AI语音专栏收录该内容

19 篇文章

订阅专栏

本教程将带您一步步了解如何使用 ModelScope 提供的文本到语音（TTS）模型，通过多线程批量处理的方式，快速生成大规模语音数据。我们将详细讲解代码中的每个部分，并最终展示如何将它们组合成一个高效的语音数据生成流程。
在这里插入图片描述

在这里插入图片描述

完整代码

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from threading import Thread
from queue import Queue
import time 
import random
import string
import warnings
warnings.filterwarnings("ignore") 


def generate_random_string(n):
    """Return a random string of ``n`` ASCII letters and digits.

    Each character is drawn independently with ``random.choice`` from
    ``string.ascii_letters + string.digits``. A non-positive ``n`` yields
    an empty string.
    """
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(n):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

class Worker(Thread):
    """Thread that drains a shared task queue and synthesizes speech.

    Each task is a ``(text, index, total)`` triple. For every task the worker
    picks a random voice from ``self.speaker``, runs the TTS pipeline, and
    writes ``<outdir>/<voice>_<random>.wav`` plus a ``.txt`` file holding the
    source text.

    Args:
        wid: Identifier used only in log messages.
        model_id: Model identifier handed to ``pipeline``.
        tasks: Queue of ``(text, index, total)`` items shared by all workers.
        outdir: Directory that receives the audio and label files; it is
            not created here and must already exist.
        max_retries: Maximum synthesis attempts per task before the task is
            dropped. Values below 1 are treated as 1.
    """

    def __init__(self,
                 wid,
                 model_id,
                 tasks,
                 outdir,
                 max_retries=3):
        super().__init__()
        self.wid = wid
        # One pipeline instance per worker, created on the GPU device.
        self.sambert_hifigan_tts = pipeline(task=Tasks.text_to_speech, model=model_id, device='gpu')
        self.tasks = tasks
        self.outdir = outdir
        self.speaker = ['zhitian_emo','zhibei_emo','zhiyan_emo','zhizhe_emo']
        self.format = "wav"
        self.max_retries = max(1, max_retries)
        self.daemon = True

    def run(self):
        """Process tasks until the queue is empty, then return."""
        # Local import keeps this block self-contained; the file's header
        # only imports ``Queue`` from the ``queue`` module.
        from queue import Empty

        print(f"worker-{self.wid}: Launch")
        while True:
            # A separate empty() check followed by a blocking get() is racy:
            # another worker may take the last item in between and leave this
            # thread blocked forever. get_nowait() does both atomically.
            try:
                text, i, N = self.tasks.get_nowait()
            except Empty:
                break

            speaker = random.choice(self.speaker)
            random_code = generate_random_string(10)
            utt = f"{speaker}_{random_code}"

            audio_path = f"{self.outdir}/{utt}.{self.format}"
            label_path = f"{self.outdir}/{utt}.txt"

            # Retry transient failures, but only a bounded number of times so
            # a sentence that always fails cannot stall the worker forever.
            for _attempt in range(self.max_retries):
                try:
                    print(f"worker-{self.wid}: Process => {i}/{N} {utt} {text}")
                    output = self.sambert_hifigan_tts(input=text, voice=speaker)
                    wav = output[OutputKeys.OUTPUT_WAV]
                    with open(audio_path, 'wb') as f:
                        f.write(wav)
                    with open(label_path, 'wt', encoding='utf-8') as f:
                        print(text, file=f)
                    break
                except Exception as e:
                    print(f"worker-{self.wid}: {e}")
            else:
                # Every attempt raised: report the task and move on.
                print(f"worker-{self.wid}: Give up => {i}/{N} {utt} {text}")

            # Short pause between tasks, kept from the original loop.
            time.sleep(0.1)
        print(f"worker-{self.wid}: Done")
        


if __name__ == "__main__":
    # Script entry point: read sentences, enqueue them, and let a pool of
    # Worker threads synthesize them into ``outdir``.
    import os  # local import: only the script entry point needs it

    text_list_file = "/opt/wangwei/TTS/data2.txt"
    model_id = 'damo/speech_sambert-hifigan_tts_zh-cn_16k'
    nj = 20
    tasks = Queue()
    outdir = "/dev/dataset_tts/"

    # Create the output directory up front; otherwise every file write in
    # the workers fails.
    os.makedirs(outdir, exist_ok=True)

    # Read all sentences, dropping blank lines so no empty text is queued.
    with open(text_list_file, 'rt', encoding='utf-8') as f:
        text_list = [line.strip() for line in f]
    text_list = [text for text in text_list if text]

    # Build the task list: (sentence, index, total count).
    total = len(text_list)
    for i, text in enumerate(text_list):
        tasks.put([text, i, total])

    # Create the workers (each one loads its own pipeline).
    workers = []
    for i in range(nj):
        workers.append(Worker(wid=i,
                              model_id=model_id,
                              tasks=tasks,
                              outdir=outdir))

    # Start the workers.
    for worker in workers:
        worker.start()

    # Wait for all worker threads to finish.
    for worker in workers:
        worker.join()