DeepVoice3: Single-speaker text-to-speech demo
In this notebook, you can try DeepVoice3-based single-speaker text-to-speech (English) using a model trained on the LJSpeech dataset. The notebook is meant to be run on Google Colab, so you don't have to set anything up on your local machine.
Estimated time to complete: 5 minutes.
Code: https://github.com/r9y9/deepvoice3_pytorch
Audio samples: https://r9y9.github.io/deepvoice3_pytorch/
Install dependencies
import os
from os.path import exists, join, expanduser
# Clone the repository into the home directory, so that the clone
# and the chdir below agree on the location (safe to re-run)
os.chdir(expanduser("~"))
name = "deepvoice3_pytorch"
if not exists(name):
    ! git clone https://github.com/r9y9/$name

# Change working directory to the project dir
os.chdir(join(expanduser("~"), name))

# Pin to the commit this demo was tested against
!git checkout 7a10ac6763eda92595e257543494b6a95f64229b --quiet
# Install dependencies
!pip install -q -e '.[bin]'
%pylab inline
! pip install -q librosa nltk
import torch
import numpy as np
import librosa
import librosa.display
import IPython
from IPython.display import Audio
# Needed for the English text-processing frontend
import nltk
! python -m nltk.downloader cmudict
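As an optional sanity check, you can confirm that PyTorch imported correctly and whether the Colab GPU runtime is enabled (the demo should also work on CPU, just more slowly):
# Optional sanity check: PyTorch version and GPU availability
print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())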
Download a pre-trained model
preset = "20180505_deepvoice3_ljspeech.json"
checkpoint_path = "20180505_deepvoice3_checkpoint_step000640000.pth"
if not exists(preset):
    !curl -O -L "https://www.dropbox.com/s/0ck82unm0bo0rxd/20180505_deepvoice3_ljspeech.json"
if not exists(checkpoint_path):
    !curl -O -L "https://www.dropbox.com/s/5ucl9remrwy5oeg/20180505_deepvoice3_checkpoint_step000640000.pth"
Synthesis
Set up hyperparameters
import hparams
import json
# Load parameters from preset
with open(preset) as f:
    hparams.hparams.parse_json(f.read())
# Inject frontend text processor
import synthesis
import train
from deepvoice3_pytorch import frontend
synthesis._frontend = getattr(frontend, "en")
train._frontend = getattr(frontend, "en")
# Aliases for frequently used hyperparameters
fs = hparams.hparams.sample_rate
hop_length = hparams.hparams.hop_size
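To confirm the preset was applied, you can print the aliased values; for this LJSpeech preset the sample rate should match LJSpeech's 22050 Hz audio:
# Quick check that the preset was applied
print("sample_rate:", fs)
print("hop_size:", hop_length)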
Define utility functions
def tts(model, text, p=0, speaker_id=None, fast=True, figures=True):
    # Synthesize `text`, optionally plot attention/spectrogram, and play the audio inline
    from synthesis import tts as _tts
    waveform, alignment, spectrogram, mel = _tts(model, text, p, speaker_id, fast)
    if figures:
        visualize(alignment, spectrogram)
    IPython.display.display(Audio(waveform, rate=fs))
def visualize(alignment, spectrogram):
    label_fontsize = 16
    figure(figsize=(16, 16))

    # Attention alignment: decoder timesteps vs. encoder timesteps
    subplot(2, 1, 1)
    imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    xlabel("Decoder timestep", fontsize=label_fontsize)
    ylabel("Encoder timestep", fontsize=label_fontsize)
    colorbar()

    # Predicted linear spectrogram
    subplot(2, 1, 2)
    librosa.display.specshow(spectrogram.T, sr=fs,
                             hop_length=hop_length, x_axis="time", y_axis="linear")
    xlabel("Time", fontsize=label_fontsize)
    ylabel("Hz", fontsize=label_fontsize)
    tight_layout()
    colorbar()
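The helper above only plays audio inline. If you also want to keep the generated waveforms, here is a minimal sketch, assuming the waveform returned by synthesis.tts is a float numpy array at sample rate fs (the save_wav helper name is ours, not part of the repo):
# Optional: save a synthesized waveform to a WAV file (hypothetical helper)
from scipy.io import wavfile

def save_wav(waveform, path):
    # Write a 32-bit float WAV at the model's sample rate
    wavfile.write(path, fs, waveform.astype(np.float32))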
Load the model checkpoint
from train import build_model
from train import restore_parts, load_checkpoint
model = build_model()
model = load_checkpoint(checkpoint_path, model, None, True)
Generate speech
# Try your favorite sentences :)
texts = [
"Scientists at the CERN laboratory say they have discovered a new particle.",
"There's a way to measure the acute emotional intelligence that has never gone out of style.",
"President Trump met with other leaders at the Group of 20 conference.",
"The Senate's bill to repeal and replace the Affordable Care Act is now imperiled.",
"Generative adversarial network or variational auto-encoder.",
"The buses aren't the problem, they actually provide a solution.",
"peter piper picked a peck of pickled peppers how many peppers did peter piper pick.",
"Some have accepted this as a miracle without any physical explanation.",
]
for idx, text in enumerate(texts):
    print(idx, text)
    tts(model, text, figures=False)
# With attention plot
text = "Generative adversarial network or variational auto-encoder."
tts(model, text, figures=True)
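You can also experiment with the p argument of tts, which is forwarded to the text frontend; in this repository's English frontend it controls the probability of substituting words with their CMUdict pronunciations (the reason cmudict was downloaded earlier). For example:
# Force phoneme (CMUdict) input where a pronunciation is available
tts(model, "Generative adversarial network or variational auto-encoder.", p=1.0, figures=False)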