背景
ChatTTS官方默认webui可供选择的功能太少,我们今天来改造一下。
在此基础上再做一些改造,诸如音色保存等功能,并对功能实现做一下记录。有错误还请在评论区勘正。
增加控制参数
增加speed、oral、laugh、break等参数
# Row of generation controls, each an integer 0-9:
# speed of speech, plus the oral / laugh / break weights that are folded
# into the refine-text prompt.
with gr.Row():
    speed_slider = gr.Slider(label="speed", minimum=0, maximum=9, step=1, value=2)
    oral_slider = gr.Slider(label="oral", minimum=0, maximum=9, step=1, value=3)
    laugh_slider = gr.Slider(label="laugh", minimum=0, maximum=9, step=1, value=3)
    break_slider = gr.Slider(label="break", minimum=0, maximum=9, step=1, value=3)
这样在启用优化文本时就会添加诸如笑声、停顿等
# Build the refine-text control prompt from the three slider values,
# e.g. "[oral_3][laugh_3][break_2]"; ChatTTS reads these tokens when it
# rewrites the input text (inserting laughs, pauses, etc.).
params_refine_text = {'prompt': f'[oral_{refine_oral}][laugh_{refine_laugh}][break_{refine_break}]'}
if refine_text_flag:
    # NOTE: "..." marks arguments elided from this snippet.
    text = chat.infer(text,
                      ...
                      params_refine_text=params_refine_text,
                      )
保存音色
绘制界面
# Row for persisting the current voice: a name textbox plus a save button.
with gr.Row():
    text_input = gr.Textbox(label="音色名称", placeholder="请输入音色名称", value="", lines=4)
    save_btn = gr.Button("保存音色")
将当前声音tensor参数存储下来
# Mutable module-level state shared by the UI callbacks; 'spk_emb' holds
# the most recently sampled speaker embedding ('' until one is drawn).
info = {}
info['spk_emb'] = ''
# Directory where named speaker embeddings are persisted as JSON files.
spk_embs_dir = Path("spk_embs")
if not spk_embs_dir.exists():
    spk_embs_dir.mkdir()
...
# Draw a fresh random speaker embedding and remember it for later saving.
rand_spk = chat.sample_random_speaker()
info['spk_emb'] = rand_spk
...
def save_voice(voice_name):
    """Persist the current speaker embedding to spk_embs/<voice_name>.json.

    Reads the embedding from the module-level ``info['spk_emb']`` and returns
    a human-readable status string for the UI; never raises.
    """
    try:
        # Guard against blank names, which would otherwise create ".json".
        voice_name = voice_name.strip()
        if not voice_name:
            return 'voice name is empty, save failed!'
        spk_emb = info['spk_emb']
        file_path = spk_embs_dir.joinpath(f'{voice_name}.json')
        if file_path.exists():
            return (f'{voice_name}.json already exists!')
        # assumes spk_emb is a tensor-like object exposing .tolist() — TODO confirm
        data_list = spk_emb.tolist()
        # Explicit encoding so the file round-trips with the utf-8 reader at startup.
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data_list, f)
        return f'{voice_name}.json saved!'
    except Exception as e:
        # Broad catch is deliberate: the UI expects a status message, not a traceback.
        return f'{voice_name}.json save failed! {e}'
读取音色
绘制界面
# Speaker selection row: a checkbox to opt into a saved custom voice plus a
# dropdown of the voices loaded from spk_embs/ at startup.
with gr.Row():
    customer_speaker = gr.Checkbox(label="是否启用自定义音色", value=False)
    # Materialize the keys: gr.Dropdown expects a list of choices, and a
    # live dict_keys view would not serialize reliably.
    speaker = gr.Dropdown(choices=list(voice_infos.keys()), label="Speaker")
程序启动时加载音色
def read_voice_list():
    """Populate the module-level voice_infos dict from spk_embs/*.json.

    Each file stem becomes the voice name; unreadable or malformed files are
    skipped with a console warning instead of aborting startup.
    """
    for file in spk_embs_dir.glob("*.json"):
        try:
            with open(file, 'r', encoding='utf-8') as json_file:
                voice_infos[file.stem] = json.load(json_file)
        # Narrow catch: the original bare `except:` hid even KeyboardInterrupt
        # and gave no hint about why a file was skipped.
        except (OSError, json.JSONDecodeError) as e:
            print(f"Failed to read {file}: {e}")


# Load saved voices once at startup so the dropdown can offer them.
read_voice_list()
生成音频时加载tensor
# When the user opted into a saved custom voice, swap in its stored embedding.
if customer_speaker_flag and speaker:
    if speaker not in voice_infos:
        # Unknown voice name: return empty audio plus an empty message.
        return [(0, None), ""]
    # Rebuild the tensor from the JSON-decoded nested list.
    speak_tensor = torch.tensor(voice_infos[speaker])
    # presumably the near-zero temperature is meant to keep the output
    # deterministic / close to the saved timbre — verify
    params_infer_code = {"spk_emb": speak_tensor, "temperature": 0.0001}
...
# NOTE: "..." marks arguments elided from this snippet.
wav = chat.infer(...
                 params_infer_code=params_infer_code
                 )
长文本生成
将长文本切分成段落,依次生成
# Split the long input on newline runs and synthesize each paragraph
# separately, collecting refined text and audio for later merging.
paragraphs = re.split(r"\n+", text)
for paragraph in paragraphs:
    # re.split leaves empty strings for leading/trailing newlines; feeding
    # an empty paragraph to chat.infer wastes a generation call.
    if not paragraph.strip():
        continue
    if refine_text_flag:
        # First pass: rewrite the text only (laughs, breaks, ...); no audio yet.
        paragraph = chat.infer(paragraph,
                               skip_refine_text=False,
                               refine_text_only=True,
                               params_refine_text=params_refine_text,
                               params_infer_code=params_infer_code
                               )
    # chat.infer may return a list of refined strings; keep the first.
    text_data = paragraph[0] if isinstance(paragraph, list) else paragraph
    text_data_all.append(text_data)
    # Second pass: synthesize audio from the (possibly refined) paragraph.
    wav = chat.infer(paragraph,
                     skip_refine_text=True,
                     params_refine_text=params_refine_text,
                     params_infer_code=params_infer_code
                     )
    audio_data = np.array(wav[0]).flatten()
    audio_data_all.append(audio_data)
# Merge all per-paragraph clips into one continuous waveform.
audio_data_all = np.concatenate(audio_data_all)
自动保存当前音频
定义一个key保存当前音频存储路径
# Path of the most recently generated wav file; None until the first
# synthesis completes.
# NOTE(review): this rebinding would discard the earlier info['spk_emb']
# entry if both snippets live in the same module — confirm.
info = {"current_voice": None}
音频生成后自动保存到output文件夹
# Determine the channel count from the merged waveform's layout.
if audio_data_all.ndim == 1:  # mono: (samples,)
    num_channels = 1
elif audio_data_all.ndim == 2:  # stereo/multichannel: (samples, channels)
    num_channels = audio_data_all.shape[1]
# Convert float samples (assumed in [-1, 1] — TODO confirm upstream range)
# to 16-bit PCM; clip first so out-of-range values saturate instead of
# wrapping around on the int16 cast.
audio_data_int16 = (np.clip(audio_data_all, -1.0, 1.0) * 32767).astype(np.int16)
# Save under output/ with a timestamped name and remember the path so other
# callbacks can pick up the latest file.
timestamp = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
file_path = str(output.joinpath(f'{timestamp}.wav').resolve())
info["current_voice"] = file_path
# BUG FIX: write the int16 conversion — the original passed the raw float
# array, leaving audio_data_int16 computed but unused.
wavfile.write(file_path, sample_rate, audio_data_int16)
总结
做了一番改造后,功能丰富了不少,日常会易用很多。但仍不完美,等待后续做进一步改造。