【Diffusers】A Deep Dive into DDIM: API, Code, and Reverse DDIM

import torch
from diffusers import StableDiffusionPipeline, DDIMScheduler

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(device)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
The main fields in the scheduler config are:
  1. _class_name: the scheduler class name, here DDIMScheduler.
  2. _diffusers_version: the version of the Diffusers library.
  3. beta_end: the final value of beta in the diffusion process. Beta controls how much noise is added at each step; it moves from the start value to the end value over the schedule.
  4. beta_schedule: how beta changes over the timesteps, e.g. "linear" or "scaled_linear".
  5. beta_start: the initial value of beta in the diffusion process.
  6. clip_sample: whether to clip the predicted sample into a fixed range.
  7. clip_sample_range: the range used when clipping samples.
  8. dynamic_thresholding_ratio: the quantile used for dynamic thresholding.
  9. num_train_timesteps: the number of timesteps the model was trained with, i.e. the total length of the diffusion process.
  10. prediction_type: what the model predicts, e.g. "epsilon" (the noise), "sample" (the clean image) or "v_prediction".
  11. rescale_betas_zero_snr: whether to rescale the betas so the final timestep has zero signal-to-noise ratio.
  12. sample_max_value: the threshold value used when thresholding is enabled.
  13. set_alpha_to_one: whether to treat the final alpha_cumprod as 1 for the last denoising step.
  14. skip_prk_steps: whether to skip the PRK steps; this flag comes from PNDM-style configs and is not used by DDIM.
  15. steps_offset: an offset added to the inference timesteps.
  16. thresholding: whether to apply (dynamic) thresholding.
  17. timestep_spacing: how the inference timesteps are spaced, e.g. "leading", "trailing" or "linspace".
  18. trained_betas: an optional precomputed array of betas, used instead of computing them from beta_start/beta_end.
Printing pipe.scheduler shows the full config:
DDIMScheduler {
  "_class_name": "DDIMScheduler",
  "_diffusers_version": "0.27.2",
  "beta_end": 0.012,
  "beta_schedule": "scaled_linear",
  "beta_start": 0.00085,
  "clip_sample": false,
  "clip_sample_range": 1.0,
  "dynamic_thresholding_ratio": 0.995,
  "num_train_timesteps": 1000,
  "prediction_type": "epsilon",
  "rescale_betas_zero_snr": false,
  "sample_max_value": 1.0,
  "set_alpha_to_one": false,
  "skip_prk_steps": true,
  "steps_offset": 1,
  "thresholding": false,
  "timestep_spacing": "leading",
  "trained_betas": null
}
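
These values can be overridden when the scheduler is created. A minimal sketch (keyword arguments passed to from_config override the corresponding config entries; the two overrides shown are just examples, not something this post relies on):

pipe.scheduler = DDIMScheduler.from_config(
    pipe.scheduler.config,
    timestep_spacing="trailing",    # example override: trailing spacing instead of leading
    rescale_betas_zero_snr=True,    # example override: enforce zero terminal SNR
)
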
# Plot 'alpha' (alpha_bar in DDPM language, alphas_cumprod in diffusers for clarity)
import matplotlib.pyplot as plt

timesteps = pipe.scheduler.timesteps.cpu()
alphas = pipe.scheduler.alphas_cumprod[timesteps]
plt.plot(timesteps, alphas, label='alpha_t');
plt.legend();

pipe.scheduler.alphas_cumprod stores the noise level alpha (alpha_bar): the larger the value, the larger the share of the original image (i.e. the less noise).

(Figure: alphas_cumprod plotted against timestep)

During sampling we start from a high t (near num_train_timesteps = 1000) and move down toward 0, running the diffusion process in reverse.
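
For example, after set_timesteps the scheduler only visits a descending subset of the 1000 training timesteps (a sketch, assuming 30 inference steps and the config above):

pipe.scheduler.set_timesteps(30, device=device)
print(pipe.scheduler.timesteps)
# roughly tensor([958, 925, 892, ..., 67, 34, 1]) with "leading" spacing and steps_offset=1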

The DDIM sampling function

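The update implemented in the snippet below is the deterministic DDIM step (eta = 0): first estimate x0 from the current latent and the predicted noise, then re-noise that estimate to the previous timestep:

$$
\hat{x}_0 = \frac{x_t - \sqrt{1-\bar\alpha_t}\,\epsilon_\theta(x_t)}{\sqrt{\bar\alpha_t}},
\qquad
x_{t-1} = \sqrt{\bar\alpha_{t-1}}\,\hat{x}_0 + \sqrt{1-\bar\alpha_{t-1}}\,\epsilon_\theta(x_t)
$$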

prev_t = max(1, t.item() - (1000//num_inference_steps)) # t-1
alpha_t = pipe.scheduler.alphas_cumprod[t.item()]
alpha_t_prev = pipe.scheduler.alphas_cumprod[prev_t]
predicted_x0 = (latents - (1-alpha_t).sqrt()*noise_pred) / alpha_t.sqrt()
direction_pointing_to_xt = (1-alpha_t_prev).sqrt()*noise_pred
latents = alpha_t_prev.sqrt()*predicted_x0 + direction_pointing_to_xt
# Sample function (regular DDIM)
@torch.no_grad()
def sample(prompt, start_step=0, start_latents=None, # can start part-way through the process
           guidance_scale=3.5, num_inference_steps=30, # guidance strength, number of inference steps
           num_images_per_prompt=1, do_classifier_free_guidance=True, # images per prompt, CFG on/off
           negative_prompt='', device=device):
  
    # Encode prompt
    text_embeddings = pipe._encode_prompt(
            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
    )

    # Set num inference steps
    pipe.scheduler.set_timesteps(num_inference_steps, device=device)

    # Create a random starting point if we don't have one already
    if start_latents is None:
        start_latents = torch.randn(1, 4, 64, 64, device=device)
        start_latents *= pipe.scheduler.init_noise_sigma

    latents = start_latents.clone()

    for i in tqdm(range(start_step, num_inference_steps)):
    
        t = pipe.scheduler.timesteps[i]

        # expand the latents if we are doing classifier-free guidance (CFG)
        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
        latent_model_input = pipe.scheduler.scale_model_input(latent_model_input, t)

        # predict the noise residual
        noise_pred = pipe.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

        # perform guidance
        if do_classifier_free_guidance:
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) # .chunk(2) splits the batch into 2 chunks (uncond / text)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

        # Normally we'd rely on the scheduler to handle the update step:
        # latents = pipe.scheduler.step(noise_pred, t, latents).prev_sample

        # Instead, implement the DDIM update step ourselves:
        prev_t = max(1, t.item() - (1000//num_inference_steps)) # t-1
        alpha_t = pipe.scheduler.alphas_cumprod[t.item()]
        alpha_t_prev = pipe.scheduler.alphas_cumprod[prev_t]
        predicted_x0 = (latents - (1-alpha_t).sqrt()*noise_pred) / alpha_t.sqrt()
        direction_pointing_to_xt = (1-alpha_t_prev).sqrt()*noise_pred
        latents = alpha_t_prev.sqrt()*predicted_x0 + direction_pointing_to_xt

    # Post-processing
    images = pipe.decode_latents(latents)
    images = pipe.numpy_to_pil(images)
    return images
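
A minimal usage sketch (the prompt is just an example; pipe and device are assumed to be set up as above):

images = sample('Watercolor painting of a lighthouse', num_inference_steps=30)
images[0]  # a PIL image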

Encoding an image into the latent space

# encode with VAE (input_image is a PIL image; pixel values are rescaled from [0, 1] to [-1, 1])
import torchvision.transforms as tfms

with torch.no_grad():
    latent = pipe.vae.encode(tfms.functional.to_tensor(input_image).unsqueeze(0).to(device) * 2 - 1)
l = 0.18215 * latent.latent_dist.sample()  # 0.18215 is the Stable Diffusion latent scaling factor
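
As a quick sanity check (a sketch; decode_latents undoes the 0.18215 scaling internally), the latent can be decoded straight back:

with torch.no_grad():
    recon = pipe.decode_latents(l)   # numpy array of shape (1, 512, 512, 3), values in [0, 1]
pipe.numpy_to_pil(recon)[0]          # should look very close to input_image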

Inverted DDIM

## Inversion
@torch.no_grad()
def invert(start_latents, prompt, guidance_scale=3.5, num_inference_steps=80,
           num_images_per_prompt=1, do_classifier_free_guidance=True,
           negative_prompt='', device=device):
    # Encode prompt
    # torch.Size([2, 77, 768])
    text_embeddings = pipe._encode_prompt(
            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
    )
    # latents are now the specified start latents
    latents = start_latents.clone()
    # We'll keep a list of the inverted latents as the process goes on
    intermediate_latents = []
    # Set num inference steps
    pipe.scheduler.set_timesteps(num_inference_steps, device=device)
    # Reversed timesteps: for inversion we walk from low noise toward high noise
    timesteps = reversed(pipe.scheduler.timesteps)
    for i in tqdm(range(1, num_inference_steps), total=num_inference_steps-1):

        # We'll skip the final iteration
        if i >= num_inference_steps - 1: continue

        t = timesteps[i]
        # expand the latents if we are doing classifier-free guidance:
        # the batch is duplicated so the unconditional and text-conditioned branches are both run
        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
        latent_model_input = pipe.scheduler.scale_model_input(latent_model_input, t)
        # predict the noise residual
        noise_pred = pipe.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
        # perform guidance
        if do_classifier_free_guidance:
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

        current_t = max(0, t.item() - (1000//num_inference_steps))#t
        next_t = t # min(999, t.item() + (1000//num_inference_steps)) # t+1
        alpha_t = pipe.scheduler.alphas_cumprod[current_t]
        alpha_t_next = pipe.scheduler.alphas_cumprod[next_t]
        # Inverted update step: re-arrange the DDIM update to get x(t) (new latents) as a function of x(t-1) (current latents)
        latents = (latents - (1-alpha_t).sqrt()*noise_pred)*(alpha_t_next.sqrt()/alpha_t.sqrt()) + (1-alpha_t_next).sqrt()*noise_pred
        # Store
        intermediate_latents.append(latents)
            
    return torch.cat(intermediate_latents)
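
The "Inverted update step" above is just the DDIM update solved for the next, noisier latent: estimate x0 from the current latent, then re-noise it to timestep t+1 instead of t-1:

$$
x_{t+1} = \sqrt{\bar\alpha_{t+1}}\,\frac{x_t - \sqrt{1-\bar\alpha_t}\,\epsilon_\theta(x_t)}{\sqrt{\bar\alpha_t}} + \sqrt{1-\bar\alpha_{t+1}}\,\epsilon_\theta(x_t)
$$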

Decoding the final inverted latents gives the corresponding "starting point" image:

inverted_latents = invert(l, input_image_prompt, num_inference_steps=50)
with torch.no_grad():
  im = pipe.decode_latents(inverted_latents[-1].unsqueeze(0))

(Figure: the decoded inverted latents)

pipe(input_image_prompt, latents=inverted_latents[-1][None], num_inference_steps=50, guidance_scale=3.5).images[0]

Regenerating an image from this inverted starting point keeps the result essentially identical to the original.

(Figures: images regenerated from the inverted latents, which stay very close to the original input image)

Also, by starting inference from one of the intermediate entries in inverted_latents, you can swap in a different prompt while keeping the overall structure of the image, as sketched below.
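
A sketch of that idea, reusing the sample function from above (start_step, the edited prompt, and the indexing into inverted_latents are illustrative assumptions; num_inference_steps should match the inversion run):

start_step = 20                                   # how many steps of structure to keep (made-up value)
new_prompt = 'A watercolor painting of a cat'     # hypothetical replacement prompt
edited = sample(new_prompt,
                start_latents=inverted_latents[-(start_step + 1)][None],
                start_step=start_step,
                num_inference_steps=50)
edited[0]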
