Sora打卡-part2

Sora打卡-part2

基于diffusion的视频生成技术

Stable diffusion模型推理方法1:SDXL模型:

from modelscope.utils.constant import Tasks
from modelscope.pipelines import pipeline
import cv2

pipe = pipeline(task=Tasks.text_to_image_synthesis, 
                model='AI-ModelScope/stable-diffusion-xl-base-1.0',
                use_safetensors=True,
                model_revision='v1.0.0')

prompt = "Beautiful and cute girl, 16 years old, denim jacket, gradient background, soft colors, soft lighting, cinematic edge lighting, light and dark contrast, anime, art station Seraflur, blind box, super detail, 8k"
output = pipe({'text': prompt})
cv2.imwrite('SDXL.png', output['output_imgs'][0])

秒级推理方法1:SDXL-turbo模型是SDXL 1.0的蒸馏版本,SDXL-Turbo基于一种称之为对抗扩散蒸馏(ADD)的新颖的训练方法,这种方法在扩散模型采样可以减少到1到4步,而生成高质量图像。ADD的训练方式使用得分蒸馏,利用大规模扩散模型作为教师模型,并将其与对抗性损失相结合,即使在1-2步的采样步骤的低步骤状态下,使用对抗学习的方式,引入discriminator来辅助生成质量的把控,也可以确保高质量图像的保真度。

from diffusers import AutoPipelineForText2Image
import torch
from modelscope import snapshot_download

model_dir = snapshot_download("AI-ModelScope/sdxl-turbo")

pipe = AutoPipelineForText2Image.from_pretrained(model_dir, torch_dtype=torch.float16, variant="fp16")
pipe.to("cuda")

prompt = "Beautiful and cute girl, 16 years old, denim jacket, gradient background, soft colors, soft lighting, cinematic edge lighting, light and dark contrast, anime, art station Seraflur, blind box, super detail, 8k"

image = pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]
image.save("SDXLturbo.png")

秒级推理方法2:SDXL+LCM,潜在一致性模型(LCM)受一致性模型(CM)启发,在预训练的LDM上以较少的步骤进行快速推理。LCM-SD系列是在Stable Diffusion的基础上新增Consistency 约束蒸馏的结果,仅通过2-8步的推理即可实现高质量的文本到图片的生成性能。

from diffusers import UNet2DConditionModel, DiffusionPipeline, LCMScheduler
import torch
from modelscope import snapshot_download

model_dir_lcm = snapshot_download("AI-ModelScope/lcm-sdxl",revision = "master")
model_dir_sdxl = snapshot_download("AI-ModelScope/stable-diffusion-xl-base-1.0",revision = "v1.0.9")

unet = UNet2DConditionModel.from_pretrained(model_dir_lcm, torch_dtype=torch.float16, variant="fp16")
pipe = DiffusionPipeline.from_pretrained(model_dir_sdxl, unet=unet, torch_dtype=torch.float16, variant="fp16")

pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")

prompt = "Beautiful and cute girl, 16 years old, denim jacket, gradient background, soft colors, soft lighting, cinematic edge lighting, light and dark contrast, anime, art station Seraflur, blind box, super detail, 8k"
image = pipe(prompt, num_inference_steps=4, guidance_scale=8.0).images[0]
image.save("SDXLLCM.png")

秒级推理方法3:stable-cascade模型基于Würstchen架构构建,与稳定扩散等其他模型的主要区别在于它在更小的潜在空间中工作。潜在空间越小,推理速度就越快,训练成本也就越低。潜在空间有多小?稳定扩散使用压缩因子 8,从而将 1024x1024 图像编码为 128x128。Stable Cascade 的压缩系数为 42,这意味着可以将 1024x1024 图像编码为 24x24,同时保持清晰的重建。然后在高度压缩的潜在空间中训练文本条件模型。与稳定扩散 1.5 相比,该架构的先前版本实现了 16 倍的成本降低。

import torch
from modelscope import snapshot_download
from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline

device = "cuda"
num_images_per_prompt = 1

stable_cascade_prior = snapshot_download("AI-ModelScope/stable-cascade-prior")
stable_cascade = snapshot_download("AI-ModelScope/stable-cascade")

prior = StableCascadePriorPipeline.from_pretrained(stable_cascade_prior, torch_dtype=torch.bfloat16).to(device)
decoder = StableCascadeDecoderPipeline.from_pretrained(stable_cascade,  torch_dtype=torch.float16).to(device)

prompt = "Beautiful and cute girl, 16 years old, denim jacket, gradient background, soft colors, soft lighting, cinematic edge lighting, light and dark contrast, anime, art station Seraflur, blind box, super detail, 8k"
negative_prompt = ""

prior_output = prior(
    prompt=prompt,
    height=1024,
    width=1024,
    negative_prompt=negative_prompt,
    guidance_scale=4.0,
    num_images_per_prompt=num_images_per_prompt,
    num_inference_steps=20
)
decoder_output = decoder(
    image_embeddings=prior_output.image_embeddings.half(),
    prompt=prompt,
    negative_prompt=negative_prompt,
    guidance_scale=0.0,
    output_type="pil",
    num_inference_steps=10
).images

for i, img in enumerate(decoder_output):
    img.save(f"stablecascade_{i+1}.png")
#Now decoder_output is a list with your PIL images

秒级推理方法4:

import torch
from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler
from modelscope.hub.file_download import model_file_download
from modelscope import snapshot_download
from safetensors.torch import load_file

base = snapshot_download("AI-ModelScope/stable-diffusion-xl-base-1.0")
repo = "AI-ModelScope/SDXL-Lightning"
ckpt = "sdxl_lightning_4step_unet.safetensors" # Use the correct ckpt for your step setting!

# Load model.
unet = UNet2DConditionModel.from_config(base, subfolder="unet").to("cuda", torch.float16)
unet.load_state_dict(load_file(model_file_download(repo, ckpt), device="cuda"))
pipe = StableDiffusionXLPipeline.from_pretrained(base, unet=unet, torch_dtype=torch.float16, variant="fp16").to("cuda")

# Ensure sampler uses "trailing" timesteps.
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")

# Ensure using the same inference steps as the loaded model and CFG set to 0.
pipe("A girl smiling", num_inference_steps=4, guidance_scale=0).images[0].save("sdxllightning.png")

微调lora叠加推理

from diffusers import AutoPipelineForText2Image
from modelscope import snapshot_download
import torch

model_dir=snapshot_download("YorickHe/majicmixRealistic_v6")
lora_dir = snapshot_download("PaperCloud/zju19_dunhuang_style_lora")

pipeline = AutoPipelineForText2Image.from_pretrained(f"{model_dir}/v7", torch_dtype=torch.float16).to("cuda")
pipeline.load_lora_weights(lora_dir, weight_name="dunhuang.safetensors")
prompt = "1 girl, close-up, waist shot, black long hair, clean face, dunhuang, Chinese ancient style, clean skin, organza_lace, Dunhuang wind, Art deco, Necklace, jewelry, Bracelet, Earrings, dunhuang_style, see-through_dress, Expressionism, looking towards the camera, upper_body, raw photo, masterpiece, solo, medium shot, high detail face, photorealistic, best quality"
#Negative Prompt = """(nsfw:2), paintings, sketches, (worst quality:2), (low quality:2), lowers, normal quality, ((monochrome)), ((grayscale)), logo, word, character, bad hand, tattoo, (username, watermark, signature, time signature, timestamp, artist name, copyright name, copyright),low res, ((monochrome)), ((grayscale)), skin spots, acnes, skin blemishes, age spot, glans, extra fingers, fewer fingers, strange fingers, bad hand, mole, ((extra legs)), ((extra hands))"""
image = pipeline(prompt).images[0]
image.save("sdlora.png")

SD+controlnet

from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
from diffusers.utils import load_image, make_image_grid
from PIL import Image
from modelscope import snapshot_download
import cv2
import numpy as np
import torch


model_dir = snapshot_download("AI-ModelScope/stable-diffusion-xl-base-1.0")
controlnet_dir = snapshot_download("AI-ModelScope/controlnet-canny-sdxl-1.0")
VAE_dir = snapshot_download("AI-ModelScope/sdxl-vae-fp16-fix")
original_image = load_image(
    "/mnt/workspace/canny.jpg"
)

prompt = "sea turtle, hard lighting"
negative_prompt = 'low quality, bad quality, sketches'

image = load_image("/mnt/workspace/canny.jpg")

controlnet_conditioning_scale = 0.5  # recommended for good generalization

controlnet = ControlNetModel.from_pretrained(
    controlnet_dir,
    torch_dtype=torch.float16
)
vae = AutoencoderKL.from_pretrained(VAE_dir, torch_dtype=torch.float16)
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    model_dir,
    controlnet=controlnet,
    vae=vae,
    torch_dtype=torch.float16,
)
pipe.enable_model_cpu_offload()

image = np.array(image)
image = cv2.Canny(image, 100, 200)
image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
image = Image.fromarray(image)

images = pipe(
    prompt, negative_prompt=negative_prompt, image=image, controlnet_conditioning_scale=controlnet_conditioning_scale,
    ).images

images[0].save(f"controlnet.png")

LLM微调

from config import ModelArgs
from model import Llama
from tokenizer import Tokenizer
import numpy as np

args = ModelArgs(288, 6, 6, 6, 32000, None, 256)

token_model_path = "./tokenizer.model.np"
model_path = "./stories15M.model.npz"

tok = Tokenizer(token_model_path)
llama = Llama(model_path, args)
prompt = "Once upon"

ids = tok.encode(prompt)
input_ids = np.array([ids], dtype=np.int32)
token_num = input_ids.shape[1]

print(prompt, end="")
for ids in llama.generate(input_ids, args.max_seq_len, True, 1.0, 0.9, 0):
    output_ids = ids[0].tolist()
    if output_ids[-1] in [tok.eos_id, tok.bos_id]:
        break
    output_text = tok.decode(output_ids)
    print(output_text, end="")

声音生成TTS

基于PTTS-basemodel微调

from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from modelscope.utils.audio.audio_utils import TtsTrainType

pretrained_model_id = 'damo/speech_personal_sambert-hifigan_nsf_tts_zh-cn_pretrain_16k'

dataset_id = "./output_training_data/"
pretrain_work_dir = "./pretrain_work_dir/"
        
# 训练信息,用于指定需要训练哪个或哪些模型,这里展示AM和Vocoder模型皆进行训练
# 目前支持训练:TtsTrainType.TRAIN_TYPE_SAMBERT, TtsTrainType.TRAIN_TYPE_VOC
# 训练SAMBERT会以模型最新step作为基础进行finetune
train_info = {
    TtsTrainType.TRAIN_TYPE_SAMBERT: {  # 配置训练AM(sambert)模型
        'train_steps': 202,               # 训练多少个step 
        'save_interval_steps': 200,       # 每训练多少个step保存一次checkpoint
        'log_interval': 10               # 每训练多少个step打印一次训练日志
    }
}

# 配置训练参数,指定数据集,临时工作目录和train_info
kwargs = dict(
    model=pretrained_model_id,                  # 指定要finetune的模型
    model_revision = "v1.0.6",
    work_dir=pretrain_work_dir,                 # 指定临时工作目录
    train_dataset=dataset_id,                   # 指定数据集id
    train_type=train_info                       # 指定要训练类型及参数
)

trainer = build_trainer(Trainers.speech_kantts_trainer,
                        default_args=kwargs)

trainer.train()

参考链接

  • sora-tutorial/docs/chapter2/chapter2_2/attention-llm/attention/attention.ipynb at main · datawhalechina/sora-tutorial · GitHub,文中提到的Transformer项目

  • sora-tutorial/docs/chapter2/chapter2_2/attention-llm/llm/llm.ipynb at main · datawhalechina/sora-tutorial · GitHub,文中提到的LLM手搓案例

  • 【AI+X组队学习】Sora原理与技术实战:基于Transformers diffusion的 视频生成技术解析+实战介绍】 https://www.bilibili.com/video/BV1px421y7qU/?share_source=copy_web&vd_source=5c57be483caf55b4999079e963ad6eec

  • 【AI+X组队学习】Sora原理与技术实战:声音生成TTS技术解析与实战】 https://www.bilibili.com/video/BV1py421i7Ha/?share_source=copy_web&vd_source=5c57be483caf55b4999079e963ad6eec

  • 21
    点赞
  • 18
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Acfufu

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值