Setting Up the ChatTTS WebUI Locally

Disclaimer

This tutorial is based on the demo hosted on modelscope.cn and walks through a local setup; the environment is Windows.

Author's GitHub repository: https://github.com/2noise/ChatTTS

WebUI demo: https://modelscope.cn/studios/AI-ModelScope/ChatTTS-demo/summary

Step 1: Clone the Code

First, enter the following in a terminal to clone the ModelScope files locally:

git clone https://www.modelscope.cn/studios/AI-ModelScope/ChatTTS-demo.git


Once the clone finishes, change into the project directory.
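Git names the folder after the repository, so the command is simply:

cd ChatTTS-demo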


Inside the directory, don't install straight from requirements.txt with the default index: it's too slow. Point pip at a mirror inside China and the download finishes quickly:

pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple

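Optionally, instead of passing -i on every command, pip can be pointed at the Tsinghua mirror once and for all:

pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple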

Step 2: Install Libraries

Even after the requirements finish installing, it can't be run right away; a few more libraries need to be installed manually.


Run the following commands one by one:

pip install modelscope -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install gradio -i https://pypi.tuna.tsinghua.edu.cn/simple

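As an optional sanity check (not part of the original steps), confirm both libraries import cleanly before launching:

python -c "import modelscope, gradio; print(modelscope.__version__, gradio.__version__)"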

If all of the installs above went through without problems, start the app:

python app.py


Launch it and wait for the program to finish downloading the model. (The progress bar may look stuck: it renders a large chunk at a time. As long as your network bandwidth shows activity, it is still working. Do not kill the program!)
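If the frozen bar makes you nervous, the model can also be fetched ahead of time using the same huggingface_hub call that core.py makes below; a sketch, assuming the default Hugging Face cache location:

python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='2Noise/ChatTTS', allow_patterns=['*.pt', '*.yaml'])"

Once the files are cached, app.py will load them instead of downloading again.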


Troubleshooting

I hit only one error, complaining that something wasn't supported on Windows. I patched a few lines based on the message and got it running. My first setup was on my physical machine; once it worked I decided to write this article and redid the setup in a virtual machine, where the errors turned out to be different ones, which was maddening. So I simply copied the core.py from my physical machine over the one in the VM and it ran. Paste the code below over the cloned core.py. The key change is that the torch.compile call is only applied when the OS is not Windows (at the time, torch.compile's Inductor backend was not supported on Windows).

core.py is located in the ChatTTS directory.
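Before overwriting it, consider keeping a backup of the cloned version (a plain Windows copy command):

copy ChatTTS\core.py ChatTTS\core.py.bak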


import os
import logging
from omegaconf import OmegaConf

import platform
import torch
from vocos import Vocos
from .model.dvae import DVAE
from .model.gpt import GPT_warpper
from .utils.gpu_utils import select_device
from .utils.infer_utils import count_invalid_characters, detect_language
from .utils.io_utils import get_latest_modified_file
from .infer.api import refine_text, infer_code

from huggingface_hub import snapshot_download

logging.basicConfig(level = logging.INFO)

class Chat:
    def __init__(self):
        self.pretrain_models = {}
        self.normalizer = {}
        self.logger = logging.getLogger(__name__)

    def check_model(self, level = logging.INFO, use_decoder = False):
        not_finish = False
        check_list = ['vocos', 'gpt', 'tokenizer']

        if use_decoder:
            check_list.append('decoder')
        else:
            check_list.append('dvae')

        for module in check_list:
            if module not in self.pretrain_models:
                self.logger.log(logging.WARNING, f'{module} not initialized.')
                not_finish = True

        if not not_finish:
            self.logger.log(level, f'All initialized.')

        return not not_finish

    def load_models(self, source='huggingface', force_redownload=False, local_path='<LOCAL_PATH>', **kwargs):
        if source == 'huggingface':
            hf_home = os.getenv('HF_HOME', os.path.expanduser("~/.cache/huggingface"))
            try:
                download_path = get_latest_modified_file(os.path.join(hf_home, 'hub/models--2Noise--ChatTTS/snapshots'))
            except:
                download_path = None
            if download_path is None or force_redownload:
                self.logger.log(logging.INFO, f'Download from HF: https://huggingface.co/2Noise/ChatTTS')
                download_path = snapshot_download(repo_id="2Noise/ChatTTS", allow_patterns=["*.pt", "*.yaml"])
            else:
                self.logger.log(logging.INFO, f'Load from cache: {download_path}')
        elif source == 'local':
            self.logger.log(logging.INFO, f'Load from local: {local_path}')
            download_path = local_path

        self._load(**{k: os.path.join(download_path, v) for k, v in OmegaConf.load(os.path.join(download_path, 'config', 'path.yaml')).items()}, **kwargs)

    def _load(
        self,
        vocos_config_path: str = None,
        vocos_ckpt_path: str = None,
        dvae_config_path: str = None,
        dvae_ckpt_path: str = None,
        gpt_config_path: str = None,
        gpt_ckpt_path: str = None,
        decoder_config_path: str = None,
        decoder_ckpt_path: str = None,
        tokenizer_path: str = None,
        device: str = None,
        compile: bool = True,
    ):
        if not device:
            device = select_device(4096)
            self.logger.log(logging.INFO, f'use {device}')

        if vocos_config_path:
            vocos = Vocos.from_hparams(vocos_config_path).to(device).eval()
            assert vocos_ckpt_path, 'vocos_ckpt_path should not be None'
            vocos.load_state_dict(torch.load(vocos_ckpt_path))
            self.pretrain_models['vocos'] = vocos
            self.logger.log(logging.INFO, 'vocos loaded.')

        if dvae_config_path:
            cfg = OmegaConf.load(dvae_config_path)
            dvae = DVAE(**cfg).to(device).eval()
            assert dvae_ckpt_path, 'dvae_ckpt_path should not be None'
            dvae.load_state_dict(torch.load(dvae_ckpt_path, map_location='cpu'))
            self.pretrain_models['dvae'] = dvae
            self.logger.log(logging.INFO, 'dvae loaded.')

        if gpt_config_path:
            cfg = OmegaConf.load(gpt_config_path)
            gpt = GPT_warpper(**cfg).to(device).eval()
            assert gpt_ckpt_path, 'gpt_ckpt_path should not be None'
            gpt.load_state_dict(torch.load(gpt_ckpt_path, map_location='cpu'))
            if platform.system() != 'Windows':
                gpt.gpt.forward = torch.compile(gpt.gpt.forward, backend='inductor', dynamic=True)
            self.pretrain_models['gpt'] = gpt
            spk_stat_path = os.path.join(os.path.dirname(gpt_ckpt_path), 'spk_stat.pt')
            assert os.path.exists(spk_stat_path), f'Missing spk_stat.pt: {spk_stat_path}'
            self.pretrain_models['spk_stat'] = torch.load(spk_stat_path).to(device)
            self.logger.log(logging.INFO, 'gpt loaded.')

        if decoder_config_path:
            cfg = OmegaConf.load(decoder_config_path)
            decoder = DVAE(**cfg).to(device).eval()
            assert decoder_ckpt_path, 'decoder_ckpt_path should not be None'
            decoder.load_state_dict(torch.load(decoder_ckpt_path, map_location='cpu'))
            self.pretrain_models['decoder'] = decoder
            self.logger.log(logging.INFO, 'decoder loaded.')

        if tokenizer_path:
            tokenizer = torch.load(tokenizer_path, map_location='cpu')
            tokenizer.padding_side = 'left'
            self.pretrain_models['tokenizer'] = tokenizer
            self.logger.log(logging.INFO, 'tokenizer loaded.')

        self.check_model()

    def infer(
        self,
        text,
        skip_refine_text=False,
        refine_text_only=False,
        params_refine_text={},
        params_infer_code={'prompt':'[speed_5]'},
        use_decoder=True,
        do_text_normalization=False,
        lang=None,
    ):

        assert self.check_model(use_decoder=use_decoder)

        if not isinstance(text, list):
            text = [text]

        if do_text_normalization:
            for i, t in enumerate(text):
                _lang = detect_language(t) if lang is None else lang
                self.init_normalizer(_lang)
                text[i] = self.normalizer[_lang].normalize(t, verbose=False, punct_post_process=True)

        for i in text:
            invalid_characters = count_invalid_characters(i)
            if len(invalid_characters):
                self.logger.log(logging.WARNING, f'Invalid characters found! : {invalid_characters}')

        if not skip_refine_text:
            text_tokens = refine_text(self.pretrain_models, text, **params_refine_text)['ids']
            text_tokens = [i[i < self.pretrain_models['tokenizer'].convert_tokens_to_ids('[break_0]')] for i in text_tokens]
            text = self.pretrain_models['tokenizer'].batch_decode(text_tokens)
            if refine_text_only:
                return text

        text = [params_infer_code.get('prompt', '') + i for i in text]
        params_infer_code.pop('prompt', '')
        result = infer_code(self.pretrain_models, text, **params_infer_code, return_hidden=use_decoder)

        if use_decoder:
            mel_spec = [self.pretrain_models['decoder'](i[None].permute(0,2,1)) for i in result['hiddens']]
        else:
            mel_spec = [self.pretrain_models['dvae'](i[None].permute(0,2,1)) for i in result['ids']]

        wav = [self.pretrain_models['vocos'].decode(i).cpu().numpy() for i in mel_spec]

        return wav

    def sample_random_speaker(self):

        dim = self.pretrain_models['gpt'].gpt.layers[0].mlp.gate_proj.in_features
        std, mean = self.pretrain_models['spk_stat'].chunk(2)
        return torch.randn(dim, device=std.device) * std + mean

    def init_normalizer(self, lang):

        if lang not in self.normalizer:
            from nemo_text_processing.text_normalization.normalize import Normalizer
            self.normalizer[lang] = Normalizer(input_case='cased', lang=lang)
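For reference, here is a minimal sketch of driving the class above directly from Python, bypassing the WebUI. The method calls mirror the definitions in core.py; saving with soundfile and the 24 kHz sample rate are my assumptions, not part of the demo code:

import ChatTTS
import soundfile as sf  # assumed audio writer; any wav-writing library works

chat = ChatTTS.Chat()
chat.load_models()  # downloads from Hugging Face on first run, then loads from cache

# sample_random_speaker() draws a speaker embedding; infer() returns one
# numpy array per input text
rand_spk = chat.sample_random_speaker()
wavs = chat.infer(["Hello from a local ChatTTS setup."],
                  params_infer_code={'spk_emb': rand_spk, 'prompt': '[speed_5]'})

sf.write("output.wav", wavs[0].squeeze(), 24000)  # ChatTTS generates 24 kHz audio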

Results

After app.py starts it prints a URL (typically http://127.0.0.1:7860, Gradio's default); open it in a browser.

在这里插入图片描述

Once the page loads you can start generating speech.
