原创,如需转载请注明出处。
安装或下载官方demo
https://github.com/TensorSpeech/TensorFlowTTS
这里如果我们直接执行 pip install . ,它会直接给你装上tf2.3.1 gpu版本;你可以打开setup.py修改配置,tf2.3.0到2.4应该都可以运行
现在前提是你已经安装了tf2.3gpu版本,我们以tf2.3.1为例,因为我也不小心中招了,把我的2.3.0给升级了
show me the code
首先我们注释掉cleaners.py中对德文py模块的引用,因为源工程已经把它删除,不注释会报错;里面的韩文部分没有报错就不管了
全部代码:
# -*- coding: utf-8 -*-
# Copyright (c) 2017 Keith Ito
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import re
from tensorflow_tts.utils.korean import tokenize as ko_tokenize
from tensorflow_tts.utils.number_norm import normalize_numbers
from unidecode import unidecode
# from german_transliterate.core import GermanTransliterate
# Pattern for one or more consecutive whitespace characters.
_whitespace_re = re.compile(r"\s+")

# Lookup of (compiled pattern, spoken form) pairs. Each pattern matches the
# abbreviation at a word boundary, immediately followed by a literal period,
# ignoring case.
_abbreviations = [
    (re.compile(r"\b%s\." % short_form, re.IGNORECASE), spoken_form)
    for short_form, spoken_form in (
        ("mrs", "misess"),
        ("mr", "mister"),
        ("dr", "doctor"),
        ("st", "saint"),
        ("co", "company"),
        ("jr", "junior"),
        ("maj", "major"),
        ("gen", "general"),
        ("drs", "doctors"),
        ("rev", "reverend"),
        ("lt", "lieutenant"),
        ("hon", "honorable"),
        ("sgt", "sergeant"),
        ("capt", "captain"),
        ("esq", "esquire"),
        ("ltd", "limited"),
        ("col", "colonel"),
        ("ft", "fort"),
    )
]
def expand_abbreviations(text: str) -> str:
    """Replace every known abbreviation in ``text`` with its expansion.

    The (pattern, replacement) pairs in ``_abbreviations`` are applied one
    after another, in list order.

    Args:
        text: String to process.

    Returns:
        The string after all substitutions have been applied.
    """
    for regex, replacement in _abbreviations:
        # The patterns are pre-compiled, so call their own ``sub`` directly.
        text = regex.sub(replacement, text)
    return text
def expand_numbers(text: str) -> str:
    """Return ``text`` after passing it through ``normalize_numbers``.

    Args:
        text: String to process.

    Returns:
        Whatever ``normalize_numbers`` returns for ``text``.
    """
    return normalize_numbers(text)
def lowercase(text: str) -> str:
    """Return ``text`` converted to lower case.

    Args:
        text: String to process.

    Returns:
        The result of ``text.lower()``.
    """
    return text.lower()
def collapse_whitespace(text: str) -> str:
    """Replace each run of whitespace in ``text`` with a single space.

    Args:
        text: String to process.

    Returns:
        The string with every match of ``_whitespace_re`` replaced by ``" "``.
    """
    return _whitespace_re.sub(" ", text)
def convert_to_ascii(text: str) -> str:
    """Return ``text`` transliterated by ``unidecode``.

    Args:
        text: String to process.

    Returns:
        The result of ``unidecode(text)``.
    """
    return unidecode(text)
def basic_cleaners(text: str) -> str:
    """Lowercase ``text`` and collapse whitespace, without transliteration.

    Args:
        text: String to clean.

    Returns:
        The cleaned string.
    """
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text
def transliteration_cleaners(text: str) -> str:
    """Clean non-English text: transliterate to ASCII, lowercase, collapse whitespace.

    Args:
        text: String to clean.

    Returns:
        The cleaned string.
    """
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text
def english_cleaners(text: str) -> str:
    """Clean English text, including number and abbreviation expansion.

    Steps, in order: transliterate to ASCII, lowercase, expand numbers,
    expand abbreviations, collapse whitespace.

    Args:
        text: String to clean.

    Returns:
        The cleaned string.
    """
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text
def korean_cleaners(text):
    """Tokenize Korean text with ``ko_tokenize`` and return the result.

    No other cleaning step is applied here.

    Args:
        text: Korean string to process.

    Returns:
        Whatever ``ko_tokenize`` returns. Example carried over from the
        original inline comment (presumably a list of jamo; verify against
        ``tensorflow_tts.utils.korean``):
        '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ']
    """
    return ko_tokenize(text)
def german_cleaners(text: str) -> str:
    """Disabled German pipeline: always returns an empty string.

    The original implementation relied on ``GermanTransliterate`` from
    ``german_transliterate.core``; that import is commented out in this file,
    so the function is kept only so that existing references still resolve.

    Args:
        text: Ignored.

    Returns:
        The empty string.
    """
    # Original body, for reference if the dependency is restored:
    # text = GermanTransliterate(replace={';': ',', ':': ' '}, sep_abbreviation=' -- ').transliterate(text)
    # return text
    return ""
简而言之,就是把与GermanTransliterate相关的代码注释掉,原来引用它的函数直接返回空字符串
主代码:
import tensorflow as tf
import yaml
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd
from tensorflow_tts.inference import AutoConfig
from tensorflow_tts.inference import TFAutoModel
from tensorflow_tts.inference import AutoProcessor
# Root of the local TensorFlowTTS checkout. Every config and weight path below
# is built from it, so this is the only line to edit when the checkout lives
# somewhere else (a relative path works as well).
TTS_ROOT = "H:/PYTHON/SDK/TensorFlowTTS-master"

# Tacotron2 text-to-mel model.
tacotron2_config = AutoConfig.from_pretrained(
    f"{TTS_ROOT}/examples/tacotron2/conf/tacotron2.baker.v1.yaml"
)
tacotron2 = TFAutoModel.from_pretrained(
    config=tacotron2_config,
    pretrained_path=f"{TTS_ROOT}/models/model-100000.h5",
    name="tacotron2",
)

# FastSpeech2 text-to-mel model.
fastspeech2_config = AutoConfig.from_pretrained(
    f"{TTS_ROOT}/examples/fastspeech2/conf/fastspeech2.baker.v2.yaml"
)
fastspeech2 = TFAutoModel.from_pretrained(
    config=fastspeech2_config,
    pretrained_path=f"{TTS_ROOT}/models/model-200000.h5",
    name="fastspeech2",
)

# Multi-band MelGAN vocoder.
mb_melgan_config = AutoConfig.from_pretrained(
    f"{TTS_ROOT}/examples/multiband_melgan/conf/multiband_melgan.baker.v1.yaml"
)
mb_melgan = TFAutoModel.from_pretrained(
    config=mb_melgan_config,
    pretrained_path=f"{TTS_ROOT}/models/generator-920000.h5",
    name="mb_melgan",
)

# Text processor, loaded from the mapper json; do_synthesis reads this global.
processor = AutoProcessor.from_pretrained(
    pretrained_path=f"{TTS_ROOT}/models/baker_mapper.json"
)
def do_synthesis(input_text, text2mel_model, vocoder_model, text2mel_name, vocoder_name):
    """Synthesize audio for ``input_text`` with a text2mel model and a vocoder.

    Args:
        input_text: Text to synthesize; converted to ids by the module-level
            ``processor``.
        text2mel_model: Model whose ``inference`` produces the mel output.
        vocoder_model: Model whose ``inference`` turns the mel output into audio.
        text2mel_name: ``"TACOTRON"`` or ``"FASTSPEECH2"``; selects how
            ``text2mel_model.inference`` is called.
        vocoder_name: Must be ``"MB-MELGAN"``.

    Returns:
        For ``"TACOTRON"``: ``(mel_outputs, alignment_history, audio)``.
        For ``"FASTSPEECH2"``: ``(mel_outputs, audio)``.
        All elements are the ``.numpy()`` values of the model outputs.

    Raises:
        ValueError: If ``text2mel_name`` or ``vocoder_name`` is not supported.
    """
    # Validate both names up front so that a typo fails immediately instead of
    # after the text2mel inference has already run.
    if text2mel_name not in ("TACOTRON", "FASTSPEECH2"):
        raise ValueError("Only TACOTRON, FASTSPEECH2 are supported on text2mel_name")
    if vocoder_name != "MB-MELGAN":
        raise ValueError("Only MB-MELGAN is supported on vocoder_name")

    input_ids = processor.text_to_sequence(input_text, inference=True)
    # Add a leading batch dimension of size 1.
    batched_ids = tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0)
    speaker_ids = tf.convert_to_tensor([0], dtype=tf.int32)

    # text2mel part
    if text2mel_name == "TACOTRON":
        _, mel_outputs, _, alignment_history = text2mel_model.inference(
            batched_ids,
            tf.convert_to_tensor([len(input_ids)], tf.int32),
            speaker_ids,
        )
        # Tacotron-2 systematically produces noise at the end of the audio,
        # so a longer tail is trimmed from the vocoder output.
        remove_end = 1024
    else:
        _, mel_outputs, _, _, _ = text2mel_model.inference(
            batched_ids,
            speaker_ids=speaker_ids,
            speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        )
        remove_end = 1

    # vocoder part: take the first batch item and first channel, minus the tail.
    audio = vocoder_model.inference(mel_outputs)[0, :-remove_end, 0]

    if text2mel_name == "TACOTRON":
        return mel_outputs.numpy(), alignment_history.numpy(), audio.numpy()
    return mel_outputs.numpy(), audio.numpy()
def visualize_attention(alignment_history):
    """Show an attention alignment matrix as a heat map.

    Args:
        alignment_history: 2-D array-like handed to ``imshow``. The x axis is
            labelled as decoder timesteps and the y axis as encoder timesteps.
    """
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)
    ax.set_title('Alignment steps')
    im = ax.imshow(
        alignment_history,
        aspect='auto',
        origin='lower',
        interpolation='none',
    )
    fig.colorbar(im, ax=ax)
    # Label the plot axes explicitly rather than through pyplot's implicit
    # "current axes" state.
    ax.set_xlabel('Decoder timestep')
    ax.set_ylabel('Encoder timestep')
    fig.tight_layout()
    plt.show()
    plt.close(fig)
def visualize_mel_spectrogram(mels):
    """Show a predicted mel spectrogram as an image.

    Args:
        mels: Tensor-like that can be reshaped to ``[-1, 80]``.
    """
    mels = tf.reshape(mels, [-1, 80]).numpy()
    fig = plt.figure(figsize=(10, 8))
    # 311 = first cell of a 3x1 grid, so the plot uses the top third of the figure.
    ax1 = fig.add_subplot(311)
    ax1.set_title('Predicted Mel-after-Spectrogram')
    # Rotate by 90 degrees before drawing.
    im = ax1.imshow(np.rot90(mels), aspect='auto', interpolation='none')
    fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
    plt.show()
    plt.close(fig)
import soundfile as sf

input_text = "这是一个开源的端到端中文语音合成系统"

# One sampling rate for both the preview and the saved file. The original
# previewed at 24000 Hz but wrote the wav with a 22050 Hz header, which makes
# the saved file play at the wrong speed and pitch.
# NOTE(review): 24000 is taken from the preview calls; confirm against the
# `sampling_rate` entry of the Baker yaml configs loaded above.
SAMPLE_RATE = 24000

tacotron2.setup_window(win_front=5, win_back=5)

# Tacotron2 + MB-MelGAN
mels, alignment_history, audios = do_synthesis(input_text, tacotron2, mb_melgan, "TACOTRON", "MB-MELGAN")
visualize_attention(alignment_history[0])
visualize_mel_spectrogram(mels[0])
# A bare `ipd.Audio(...)` expression only renders when it is the last
# expression of a notebook cell; display() shows it wherever it appears.
ipd.display(ipd.Audio(audios, rate=SAMPLE_RATE))

# FastSpeech2 + MB-MelGAN
mels, audios = do_synthesis(input_text, fastspeech2, mb_melgan, "FASTSPEECH2", "MB-MELGAN")
visualize_mel_spectrogram(mels[0])
ipd.display(ipd.Audio(audios, rate=SAMPLE_RATE))

# Only the FastSpeech2 result is saved, since `audios` was reassigned above.
sf.write('./audio_after.wav', audios, SAMPLE_RATE, "PCM_16")
print("-----------end-------------")
这里为了方便大家我全部用绝对路径,其实也可以相对路径
最后你可以得到一个audio_after.wav的音频文件,那么你就成功了
相关模型下载:https://colab.research.google.com/drive/1YpSHRBRPBI7cnTkQn1UcVTWEQVbsUm1S?usp=sharing
最后,有动手能力的童鞋可以看官方cpp和py的代码
魔改一下text2ids.py到CPP,其实就是汉字转音素加各种修饰
还有官方的TensorflowTTSCppInference.cpp,注意这个官方工程内部的json加载魔改的json会出现问题,基本上,也没什么作用,只是需要读取一个samplerate,我们可以直接赋值,比如中文是22050
然后魔改Voice.h和Voice.cpp,基本上其他类没什么用,中文还是自己实现好,核心类就是MultiBandMelGAN和FastSpeech2,我们只需要手动实现自己的汉语音素 to IDs(std::vector<int32_t>ids)即可
对于动手能力差的可以命令行直接调用text2ids.py的返回结果,在cpp里面用正则表达式获取IDs即可,官方的tflite Demo就是这样实现的(本来我想用tflite,奈何编译通不过,于是只能用官方的C API版本的Dll,可以参考我之前的文章),当然前提是你的主环境tf版本和推理的tf版本一致!
最后是生成的wav文件有噪声:我从git上试了几个项目都有,只有一个用了阿里云迁移学习的项目没有。可以考虑后期加上降噪算法,各项目的底噪都差不多
PS:tf2.3.1 dll版本英文语音生成正常,但是中文语音噪声严重,而且默认生成的wav用普通播放器无法播放,需要用goldwave打开才能勉强播放;于是我改用最新的tf2.4.0 dll做推理,生成的wav才能正常播放,甚至这个版本播放时完全没有噪声,喜大普奔!只需要注意一下断句即可。这和其他博主得出的“高版本播放正常”的结论也是一致的,各位童鞋可以用cppflow wrap一下旧版的api推理试试
PS:Ads时间,大家可以加群一起探讨一下各种深度学习或者图形框架,集思广益,群号:558174476(游戏与人工智能生命体)