本教程将带您一步步了解如何使用 ModelScope 提供的文本到语音(TTS)模型,通过多进程批量处理的方式,快速生成大规模语音数据。我们将详细讲解代码中的每个部分,并最终展示如何将它们组合成一个高效的语音数据生成流程。
完整代码
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from threading import Thread
from queue import Queue
import time
import random
import string
import warnings
warnings.filterwarnings("ignore")
def generate_random_string(n):
letters = string.ascii_letters + string.digits
random_string = ''.join(random.choice(letters) for i in range(n))
return random_string
class Worker(Thread):
def __init__(self,
wid,
model_id,
tasks,
outdir):
super().__init__()
self.wid = wid
self.sambert_hifigan_tts = pipeline(task=Tasks.text_to_speech, model=model_id, device='gpu')
self.tasks = tasks
self.outdir = outdir
self.speaker = ['zhitian_emo','zhibei_emo','zhiyan_emo','zhizhe_emo']
self.format = "wav"
self.daemon = True
def run(self):
print(f"worker-{self.wid}: Launch")
while True:
if not self.tasks.empty():
text, i, N = self.tasks.get()
speaker = random.choice(self.speaker)
random_code = generate_random_string(10)
utt = f"{speaker}_{random_code}"
audio_path = f"{self.outdir}/{utt}.{self.format}"
label_path = f"{self.outdir}/{utt}.txt"
success = False
while not success:
try:
print(f"worker-{self.wid}: Process => {i}/{N} {utt} {text}")
output = self.sambert_hifigan_tts(input=text, voice=speaker)
wav = output[OutputKeys.OUTPUT_WAV]
with open(audio_path, 'wb') as f:
f.write(wav)
with open(label_path, 'wt', encoding='utf-8') as f:
print(text, file=f)
success = True
except Exception as e:
print(f"worker-{self.wid}: {e}")
else:
break
time.sleep(0.1)
print(f"worker-{self.wid}: Done")
if __name__ == "__main__":
text_list_file = "/opt/wangwei/TTS/data2.txt"
model_id = 'damo/speech_sambert-hifigan_tts_zh-cn_16k'
nj = 20
tasks = Queue()
outdir = "/dev/dataset_tts/"
# 读取全部文本句子
with open(text_list_file,'rt',encoding='utf-8') as f:
text_list = f.readlines()
# 生成任务列表
for i, line in enumerate(text_list):
tasks.put([line.strip(), i, len(text_list)])
# 初始化多个worker
workers = []
for i in range(nj):
workers.append(Worker(wid=i,
model_id=model_id,
tasks=tasks,
outdir=outdir))
# 启动workers
for worker in workers:
worker.start()
# 等待多线程结束
for worker in workers:
worker.join()
总结
通过本文的介绍,您已经了解了如何使用 ModelScope 的 TTS 模型并行生成大规模的语音数据。该代码通过多进程并发执行,有效提高了生成效率,非常适合大批量语音数据的生成需求。您可以根据自己的需求,进一步调整代码中的参数和逻辑,以满足特定的应用场景。