配置python虚拟环境
官方指明:生成 25 秒的视频需要 16G 显存;12G 显存可以生成 10 秒;8G 显存应该也能运行,但需要调小 num_frames 参数。视频时长约为 num_frames×0.1 秒;内存要求 16G 以上。
conda create -n py3.10 python==3.10
conda activate py3.10
pip install diffusers transformers accelerate torch opencv-python -f https://download.pytorch.org/whl/torch_stable.html
#加上-f xxx可以安装cuda gpu版torch,最好到网站上用浏览器下载好了对应版本的whl文件,再本地安装
运行代码
text2video.py
"""Generate a short video clip from a text prompt using the
ali-vilab/text-to-video-ms-1.7b diffusion model (diffusers pipeline)."""
import os

# These environment variables must be set BEFORE importing diffusers/transformers,
# otherwise the libraries read their defaults at import time.
os.environ['HUGGINGFACE_HUB_CACHE'] = "cache"  # local model cache directory
os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"  # HF mirror reachable from mainland China

import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.utils import export_to_video

# fp16 weights halve VRAM usage; replace the repo id with
# "./text-to-video-ms-1.7b" to load a locally downloaded copy.
pipe = DiffusionPipeline.from_pretrained(
    "ali-vilab/text-to-video-ms-1.7b",
    torch_dtype=torch.float16, variant="fp16")
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()  # offload idle submodules to CPU to fit small GPUs
pipe.enable_vae_slicing()        # decode the VAE in slices to reduce peak VRAM

generator = torch.Generator("cuda").manual_seed(1024)  # fixed seed for reproducible output
prompt = "Spiderman is surfing"  # "Spiderman is surfing. Darth Vader is also surfing and following Spiderman"

# num_frames controls clip length: roughly 0.1 s of video per frame.
video_frames = pipe(prompt, num_inference_steps=50, num_frames=100,
                    generator=generator).frames
print(video_frames.shape)  # (1, num_frames, h, w, c); default h = w = 256, c = 3

# Fix: export_to_video does not create missing directories, so make sure the
# output directory exists before writing, or the export fails on a fresh run.
os.makedirs("video", exist_ok=True)
# frames is 4-D (batch first), so pass video_frames[0] — the frame list of the
# single generated video — not the whole batch.
video_path = export_to_video(video_frames[0], output_video_path="video/output.mp4")
print(video_path)
# 使用官方提供的示例代码会报错,且需要先安装 opencv。
# video_frames 是 4 维数组:(1, num_frames, h, w, c),默认 h=w=256, c=3。
# 若直接传入整个 video_frames(不取 [0]),如下调用:
# video_path = export_to_video(video_frames, output_video_path="video/output.mp4")
# 会在 diffusers/utils/export_utils.py 第 135 行的
# `h, w, c = video_frames[0].shape` 处抛出
# ValueError: too many values to unpack (expected 3)。
# 原因是解包 h, w, c 时实际得到 4 个值(多出了批次维度),因此上面传入 video_frames[0]。
直接执行python text2video.py ,会自动下载模型,也可以手动下载,模型文件只需要下载带fp16.safetensors后缀的即可
模型目录结构
cache\models--ali-vilab--text-to-video-ms-1.7b\snapshots\8227dddca75a8561bf858d604cc5dae52b954d01
│ model_index.json
│
├─scheduler
│ scheduler_config.json
│
├─text_encoder
│ config.json
│ model.fp16.safetensors
│
├─tokenizer
│ merges.txt
│ special_tokens_map.json
│ tokenizer_config.json
│ vocab.json
│
├─unet
│ config.json
│ diffusion_pytorch_model.fp16.safetensors
│
└─vae
config.json
diffusion_pytorch_model.fp16.safetensors
参考链接
https://hf-mirror.com/docs/diffusers/main/en/using-diffusers/custom_pipeline_overview
https://hf-mirror.com/ali-vilab/text-to-video-ms-1.7b