def fm_comput_loss(self, x1, mask, mu, spks):
    """Conditional flow-matching (OT-CFM) loss, Matcha-TTS style.

    Args:
        self: module providing ``sigma_min`` (float) and ``estimator`` (callable).
        x1: target mel spectrogram — assumed shape (b, c, t); TODO confirm layout.
        mask: validity mask broadcastable to ``x1`` (1 = valid frame).
        mu: encoder outputs used as conditioning; same device/dtype as ``x1``.
        spks: speaker embedding forwarded to the estimator.

    Returns:
        (loss, y): scalar flow-matching loss and the noisy interpolant ``y``.
    """
    b = x1.shape[0]
    # One random timestep per sample, shaped (b, 1, 1) to broadcast over (c, t).
    t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
    # Sample noise x_0 ~ N(0, I).
    z = torch.randn_like(x1)
    # Linear interpolation path between noise and target.
    y = (1 - (1 - self.sigma_min) * t) * z + t * x1
    # Target vector field u_t for the OT-CFM objective.
    u = x1 - (1 - self.sigma_min) * z
    pred_y = self.estimator(y, mask, mu, t.squeeze(), spks)
    # Summed MSE normalized by (number of valid frames) * (channel count).
    # NOTE(review): unlike the CosyVoice variant below, the prediction is not
    # masked before the MSE — confirm this is intentional.
    loss = F.mse_loss(pred_y, u, reduction="sum") / (torch.sum(mask) * u.shape[1])
    return loss, y
def estimator_forward(y, mu, spks):
    """Schematic of the Matcha-TTS estimator forward pass.

    Packs the noisy sample with its conditioning, runs self-attention, and
    projects with a linear layer.

    NOTE(review): ``pack``, ``slf_attn`` and ``linear`` are defined elsewhere
    in the project; this block is a sketch, not a runnable module.

    Args:
        y: noisy interpolant from the flow-matching loss.
        mu: encoder outputs (conditioning).
        spks: speaker embedding.

    Returns:
        The estimator's predicted vector field.
    """
    x = pack(y, mu)
    x = pack(x, spks)
    # Self-attention uses the packed features as query, key and value alike.
    q, k, v = x, x, x
    x = slf_attn(q, k, v)
    outputs = linear(x)
    return outputs
# ---- CosyVoice implementation ----
deffm_forward():# mu: encoder_outputs# x1: target_mel# cond: prompt_mel 随机取的部分
conds = torch.zeros(feat.shape, device=token.device)for i, j inenumerate(feat_len):if random.random()<0.5:continue
index = random.randint(0,int(0.3* j))
conds[i,:index]= feat[i,:index]
conds = conds.transpose(1,2)
b, _, t = mu.shape
# random timestep
t = torch.rand([b,1,1], device=mu.device, dtype=mu.dtype)if self.t_scheduler =='cosine':
t =1- torch.cos(t *0.5* torch.pi)# sample noise p(x_0)
z = torch.randn_like(x1)
y =(1-(1- self.sigma_min)* t)* z + t * x1
u = x1 -(1- self.sigma_min)* z
# during training, we randomly drop condition to trade off mode coverage and sample fidelity# inference 的时候实际不需要condition, 给zero就可以if self.training_cfg_rate >0:
cfg_mask = torch.rand(b, device=x1.device)> self.training_cfg_rate
mu = mu * cfg_mask.view(-1,1,1)
spks = spks * cfg_mask.view(-1,1)
cond = cond * cfg_mask.view(-1,1,1)
pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond, streaming=streaming)
loss = F.mse_loss(pred * mask, u * mask, reduction="sum")/(torch.sum(mask)* u.shape[1])return loss, y
def estimator(x, mu, spks, cond):
    """Schematic of the CosyVoice estimator forward pass.

    Packs the noisy sample with all conditioning signals (encoder outputs,
    speaker embedding, prompt-mel condition), runs self-attention, and
    projects with a linear layer.

    NOTE(review): ``pack``, ``slf_attn`` and ``linear`` are defined elsewhere
    in the project; this block is a sketch, not a runnable module.

    Args:
        x: noisy interpolant from the flow-matching loss.
        mu: encoder outputs (conditioning).
        spks: speaker embedding.
        cond: prompt-mel condition tensor.

    Returns:
        The estimator's predicted vector field.
    """
    x = pack(x, mu, spks, cond)
    x = slf_attn(x)
    outputs = linear(x)
    return outputs