This took quite a while to get working, so I'm writing it down. Based on NVIDIA/tacotron2.
First of all, Tacotron2 is end-to-end: a text sequence goes in and speech comes out. It splits into two parts, one from text to a mel spectrogram, and one from the mel spectrogram to audio. In the NVIDIA PyTorch code the two parts are separate models and are trained one after the other. Start with the Tacotron2 front end and just follow the README steps; the main thing is to install a PyTorch build that matches your GPU. Training itself shouldn't give you much trouble, I mostly did this to see the results.
In the code the model has both forward and inference, because training and generation take different paths: forward runs with teacher forcing, while inference decodes autoregressively, so for generation you have to switch from forward to inference by hand (call model.inference instead of model.forward). Before feeding characters in, they go through an embedding, which turns the string into vectors that carry lexical meaning, and you add one extra dimension so a whole batch can be sent in at once. Feed one utterance in and four tensors come out: the first is the mel spectrogram, the second is the mel after the postnet, the third is the gate, which predicts when the audio should stop, and the fourth is the alignment (attention weights). Since the outputs are tensors, you can look at them by converting to numpy and plotting with plt. If you want to use the WaveGlow vocoder, train it the same way by following the steps; after training there is an inference.py, swap in your own mel.pt and you can hear sound.
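For reference, here is a minimal sketch of that synthesis flow, roughly following the repo's inference notebook. The checkpoint paths and the sigma value are placeholders, and I'm assuming the repo's own create_hparams and text_to_sequence helpers; treat it as a sketch rather than the exact script.

import numpy as np
import torch
import matplotlib.pyplot as plt

from hparams import create_hparams          # from NVIDIA/tacotron2
from model import Tacotron2
from text import text_to_sequence

hparams = create_hparams()

# load a trained Tacotron2 checkpoint (path is a placeholder)
model = Tacotron2(hparams).cuda().eval()
model.load_state_dict(torch.load("tacotron2.pt")["state_dict"])

# text -> id sequence, plus the extra batch dimension mentioned above
sequence = np.array(text_to_sequence("Hello world.", ["english_cleaners"]))[None, :]
sequence = torch.from_numpy(sequence).cuda().long()

# inference (not forward) returns mel, mel_postnet, gate and alignment
mel, mel_postnet, gate, alignment = model.inference(sequence)

# tensors -> numpy -> plt to eyeball the spectrogram and the alignment
plt.imshow(mel_postnet.float().data.cpu().numpy()[0], origin="lower")
plt.show()
plt.imshow(alignment.float().data.cpu().numpy()[0].T, origin="lower")
plt.show()

# WaveGlow vocoder: mel -> audio (checkpoint path again a placeholder)
waveglow = torch.load("waveglow.pt")["model"].cuda().eval()
with torch.no_grad():
    audio = waveglow.infer(mel_postnet, sigma=0.666)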
Encoder
"""Encoder module:
- Three 1-d convolution banks
- Bidirectional LSTM
"""
def forward(self, x, input_lengths):
# [1, 512, 68]
for conv in self.convolutions:
x = F.dropout(F.relu(conv(x)), 0.5, self.training)
# [1, 68, 512]
x = x.transpose(1, 2)
# pytorch tensor are not reversible, hence the conversion
# 68
input_lengths = input_lengths.cpu().numpy()
# [68, 512], [68]
x = nn.utils.rnn.pack_padded_sequence(
x, input_lengths, batch_first=True)
self.lstm.flatten_parameters()
# [68, 512], _
outputs, _ = self.lstm(x)
# [1, 68, 512], _
outputs, _ = nn.utils.rnn.pad_packed_sequence(
outputs, batch_first=True)
# [1, 68, 512]
return outputs
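The input_lengths.cpu().numpy() conversion and the pack/pad pair are only there so the bidirectional LSTM skips the padded positions. A tiny standalone check of that round trip, with made-up shapes, might look like this:

import torch
import torch.nn as nn

# two padded sequences of lengths 5 and 3, feature size 8 (made-up numbers)
x = torch.randn(2, 5, 8)
lengths = torch.tensor([5, 3])

lstm = nn.LSTM(8, 4, batch_first=True, bidirectional=True)

packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
out, _ = lstm(packed)
out, out_lengths = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

print(out.shape)        # [2, 5, 8]  (4 hidden units * 2 directions)
print(out_lengths)      # tensor([5, 3]); padded steps of the short sequence stay zero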
Decoder
def decode(self, decoder_input):
    """ Decoder step using stored states, attention and memory
    PARAMS
    ------
    decoder_input: previous mel output
    RETURNS
    -------
    mel_output:
    gate_output: gate output energies
    attention_weights:
    """
    cell_input = torch.cat((decoder_input, self.attention_context), -1)
    self.attention_hidden, self.attention_cell = self.attention_rnn(
        cell_input, (self.attention_hidden, self.attention_cell))
    self.attention_hidden = F.dropout(
        self.attention_hidden, self.p_attention_dropout, self.training)
    attention_weights_cat = torch.cat(
        (self.attention_weights.unsqueeze(1),
         self.attention_weights_cum.unsqueeze(1)), dim=1)
    self.attention_context, self.attention_weights = self.attention_layer(
        self.attention_hidden, self.memory, self.processed_memory,
        attention_weights_cat, self.mask)
    self.attention_weights_cum += self.attention_weights
    decoder_input = torch.cat(
        (self.attention_hidden, self.attention_context), -1)
    self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
        decoder_input, (self.decoder_hidden, self.decoder_cell))
    self.decoder_hidden = F.dropout(
        self.decoder_hidden, self.p_decoder_dropout, self.training)
    decoder_hidden_attention_context = torch.cat(
        (self.decoder_hidden, self.attention_context), dim=1)
    decoder_output = self.linear_projection(
        decoder_hidden_attention_context)
    gate_prediction = self.gate_layer(decoder_hidden_attention_context)
    return decoder_output, gate_prediction, self.attention_weights
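The attention_weights_cat tensor stacks the previous step's weights with their running sum, which is what makes the attention location-sensitive: the layer can see where it has already attended. A rough, simplified sketch of what such an attention layer computes (not the repo's exact class; dimensions are invented, and masking of padded positions is omitted):

import torch
import torch.nn as nn
import torch.nn.functional as F

class LocationSensitiveAttentionSketch(nn.Module):
    # simplified: the real code also masks padded positions and reuses processed_memory
    def __init__(self, query_dim=1024, memory_dim=512, attn_dim=128, n_filters=32, kernel=31):
        super().__init__()
        self.query_layer = nn.Linear(query_dim, attn_dim, bias=False)
        self.memory_layer = nn.Linear(memory_dim, attn_dim, bias=False)
        self.location_conv = nn.Conv1d(2, n_filters, kernel, padding=(kernel - 1) // 2, bias=False)
        self.location_dense = nn.Linear(n_filters, attn_dim, bias=False)
        self.v = nn.Linear(attn_dim, 1, bias=False)

    def forward(self, query, memory, attention_weights_cat):
        # query: [B, query_dim], memory: [B, T, memory_dim], attention_weights_cat: [B, 2, T]
        processed = self.query_layer(query).unsqueeze(1) + self.memory_layer(memory)
        location = self.location_dense(self.location_conv(attention_weights_cat).transpose(1, 2))
        energies = self.v(torch.tanh(processed + location)).squeeze(-1)   # [B, T]
        weights = F.softmax(energies, dim=1)
        context = torch.bmm(weights.unsqueeze(1), memory).squeeze(1)      # [B, memory_dim]
        return context, weights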
def forward(self, memory, decoder_inputs, memory_lengths):
    """ Decoder forward pass for training
    PARAMS
    ------
    memory: Encoder outputs
    decoder_inputs: Decoder inputs for teacher forcing. i.e. mel-specs
    memory_lengths: Encoder output lengths for attention masking.
    RETURNS
    -------
    mel_outputs: mel outputs from the decoder
    gate_outputs: gate outputs from the decoder
    alignments: sequence of attention weights from the decoder
    """
    # [1, 1, 80] the all-zero "go" frame used as the first decoder input
    decoder_input = self.get_go_frame(memory).unsqueeze(0)
    # [917, 1, 80]
    decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
    # [918, 1, 80]
    decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
    # [918, 1, 256]
    decoder_inputs = self.prenet(decoder_inputs)
    self.initialize_decoder_states(
        # [1, 68, 512]
        memory, mask=~get_mask_from_lengths(memory_lengths))
    mel_outputs, gate_outputs, alignments = [], [], []
    # [918, 1, 256]
    # loop until every frame has been decoded
    while len(mel_outputs) < decoder_inputs.size(0) - 1:
        # [1, 256]
        decoder_input = decoder_inputs[len(mel_outputs)]
        # [1, 80], [1, 1], [1, 68]
        mel_output, gate_output, attention_weights = self.decode(
            decoder_input)
        # collect the mel frames one at a time
        mel_outputs += [mel_output.squeeze(1)]
        # collect the gate (stop-token) predictions
        gate_outputs += [gate_output.squeeze(1)]
        # collect the attention weights to inspect the alignment afterwards
        alignments += [attention_weights]
    # [1, 80, 917], [1, 917], [1, 917, 68]
    mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
        mel_outputs, gate_outputs, alignments)
    return mel_outputs, gate_outputs, alignments
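The mask handed to initialize_decoder_states comes from get_mask_from_lengths, which turns the encoder lengths into a boolean matrix so attention over padded text positions can be zeroed out. A device-agnostic sketch of such a helper (the repo ships its own version):

import torch

def get_mask_from_lengths_sketch(lengths):
    # lengths: [B] int tensor of valid encoder steps per batch item
    max_len = int(lengths.max().item())
    ids = torch.arange(max_len, device=lengths.device)
    # True where the position is a real (non-padded) step
    return ids.unsqueeze(0) < lengths.unsqueeze(1)

lengths = torch.tensor([68, 40])
mask = get_mask_from_lengths_sketch(lengths)   # shape [2, 68]
print(mask[1, 39], mask[1, 40])                # tensor(True) tensor(False)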
Tacotron2
"""
Tacotron2 module
"""
def forward(self, inputs):
# [1, 68], [68], [1, 80, 917], 68, [917]
text_inputs, text_lengths, mels, max_len, output_lengths = inputs
# [1, 68], [68]
text_lengths, output_lengths = text_lengths.data, output_lengths.data
# [1, 512, 68]
embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
# [1, 68, 512]
encoder_outputs = self.encoder(embedded_inputs, text_lengths)
#
mel_outputs, gate_outputs, alignments = self.decoder(
# [1, 68, 512], [1, 80, 917], 68
encoder_outputs, mels, memory_lengths=text_lengths)
mel_outputs_postnet = self.postnet(mel_outputs)
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
return self.parse_output(
[mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
output_lengths)
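Those four outputs are exactly what the training loss consumes: MSE on both the decoder mel and the postnet mel against the target spectrogram, plus binary cross-entropy on the gate against the stop targets; the alignments are only returned for inspection. A hedged sketch of that loss, along the lines of the repo's loss_function.py:

import torch
import torch.nn as nn

class Tacotron2LossSketch(nn.Module):
    # sketch of the usual Tacotron2 objective; the repo's own class may differ in details
    def forward(self, model_output, targets):
        mel_target, gate_target = targets
        mel_out, mel_out_postnet, gate_out, _ = model_output  # alignments unused in the loss

        gate_target = gate_target.view(-1, 1)
        gate_out = gate_out.view(-1, 1)

        # spectrogram regression before and after the postnet
        mel_loss = nn.MSELoss()(mel_out, mel_target) + \
            nn.MSELoss()(mel_out_postnet, mel_target)
        # stop-token prediction
        gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
        return mel_loss + gate_loss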