AMD 6750gre 通过SDPA加速的效果_6750gre加速pytorch-CSDN博客

本文链接：https://blog.csdn.net/weixin_45312538/article/details/147028110

前情提示：
如果没有安装ROCM或者Anaconda的，可以看我上一篇文章。以下的操作全部在你自己的Anaconda创建的环境中进行，千万不要在系统中搭建，因为万一各个版本出现冲突，那就麻爪了。但是单独的Anaconda环境中就无所谓了，冲突了删除环境重新创建一个新的就完事了。对了，如果你是使用的AMD官网提供的Docker，那你下面安装的代码可能不能直接使用，因为我不确定Docker里面的pyTorch和ROCM的版本是否支持。
以下操作全部在Anaconda创建的环境中进行
以下操作全部在Anaconda创建的环境中进行
以下操作全部在Anaconda创建的环境中进行
测试时需要提前安装diffusers以及对应的常用依赖。
最后说一点就是，目前对于ROCM来说，SDPA是一个不错的选择，但还有一些方案我还没有测试到，例如XFormers、BetterTransformer、MIOpen和hipBLAS以及Flash Attention。这些我都会在后续的文章中依次测试，感兴趣的可以持续关注。

1、diffusers安装

# 使用 pip 安装最新版
pip install diffusers

# 安装开发版（可选）
pip install git+https://github.com/huggingface/diffusers

2、依赖库安装

# 常用依赖
pip install transformers accelerate datasets

# 图像处理库
pip install pillow scipy

# 支持 Stable Diffusion（需登录HuggingFace）
pip install invisible_watermark safetensors

SDPA加速测试的代码：

import torch
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.utils import logging
import gc

# Restrict the diffusers logger to error-level messages so the benchmark
# output printed below is not interleaved with library log lines.
logging.set_verbosity_error()

# Local directory holding the Stable Diffusion 2.1 weights; edit this path
# for your own machine.
MODEL_PATH = "/home/wxa/stable-diffusion-2-1"
# Prompt used for every timed generation.
TEST_PROMPT = "A photo of an astronaut riding a horse on Mars"
# Seed for the random generator of the timed run.
SEED = 42


def setup_pipeline(use_sdpa: bool) -> StableDiffusionPipeline:
    """Build the Stable Diffusion pipeline for one benchmark mode.

    Args:
        use_sdpa: When true, load the FP16 weights, keep the pipeline on the
            GPU and route UNet attention through PyTorch's
            ``scaled_dot_product_attention``. When false, load FP32 weights
            and run with model CPU offload plus sliced attention.

    Returns:
        The configured pipeline with a DPM-Solver multistep scheduler.
    """
    # Collect unreachable objects first; only then can empty_cache() hand the
    # blocks they were holding back to the driver.
    gc.collect()
    torch.cuda.empty_cache()

    # No `attn_implementation` keyword here: that is a transformers option,
    # a diffusers pipeline does not use it to choose the attention kernel.
    # The attention processor is selected explicitly further down.
    pipe = StableDiffusionPipeline.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.float16 if use_sdpa else torch.float32,
        variant="fp16" if use_sdpa else None,
        safety_checker=None,
        use_safetensors=True,
    )

    # from_config() keeps the shared settings of the checkpoint's scheduler
    # and ignores keys the DPM-Solver constructor does not accept, so no
    # hand-maintained list of keys to strip is needed.
    # NOTE(review): the two modes use different solver variants, so timing
    # differences are not attributable to the attention kernel alone.
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(
        pipe.scheduler.config,
        algorithm_type="sde-dpmsolver++" if use_sdpa else "dpmsolver++",
        use_karras_sigmas=bool(use_sdpa),
        lambda_min_clipped=-float("inf"),
        timestep_spacing="linspace",
        steps_offset=1,
    )

    if use_sdpa:
        # Imported locally so the file's import block stays untouched.
        from diffusers.models.attention_processor import AttnProcessor2_0

        pipe.to("cuda")
        # Select the SDPA processor explicitly; its constructor fails loudly
        # on a PyTorch build without scaled_dot_product_attention.
        pipe.unet.set_attn_processor(AttnProcessor2_0())
        # Attention slicing is deliberately NOT enabled in this mode: it
        # swaps in the sliced attention processor and would bypass SDPA.
        pipe.enable_vae_slicing()
    else:
        # Offloading manages device placement itself, so the FP32 weights are
        # not copied to the GPU in full beforehand.
        pipe.enable_model_cpu_offload()
        # Sliced attention is the non-SDPA baseline path.
        pipe.enable_attention_slicing(4)

    return pipe


def run_benchmark(pipe: StableDiffusionPipeline, steps: int, res: int) -> "dict | None":
    """Time one image generation and record its peak GPU memory.

    Args:
        pipe: Pipeline to benchmark.
        steps: Number of inference steps for the timed run.
        res: Width and height, in pixels, of the generated image.

    Returns:
        A dict with ``"time"`` (milliseconds between the two CUDA events),
        ``"mem"`` (peak allocated bytes divided by 1e9) and ``"image"`` (the
        first generated image), or ``None`` if the run raised RuntimeError.
    """
    try:
        # Warm up at the benchmark resolution. GPU libraries typically pick
        # or tune kernels per tensor shape on first use, so a tiny warm-up
        # image would leave that one-off cost inside the timed region.
        for _ in range(2):
            pipe(prompt="", num_inference_steps=1, width=res, height=res)

        # Seed after the warm-up so the timed image depends only on SEED.
        generator = torch.Generator("cuda").manual_seed(SEED)

        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        # Peak-memory statistics start from the state just before the run.
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
        start_event.record()

        image = pipe(
            prompt=TEST_PROMPT,
            num_inference_steps=steps,
            width=res,
            height=res,
            generator=generator,
        ).images[0]

        end_event.record()
        # Wait for the GPU to finish so elapsed_time() is valid.
        torch.cuda.synchronize()

        return {
            "time": start_event.elapsed_time(end_event),
            "mem": torch.cuda.max_memory_allocated() / 1e9,
            "image": image,
        }
    except RuntimeError as err:
        # Only report "out of memory" when that is what happened; any other
        # runtime error is printed with its own message instead of being
        # mislabelled.
        oom_cls = getattr(torch.cuda, "OutOfMemoryError", ())
        if isinstance(err, oom_cls) or "out of memory" in str(err).lower():
            print(
                f"\033[31m显存不足: {res}x{res}@{steps}steps | 已用显存: {torch.cuda.memory_allocated() / 1e9:.1f}GB\033[0m")
        else:
            print(f"\033[31m运行失败: {res}x{res}@{steps}steps | {err}\033[0m")

    # Reached only after a failure. The exception (and the frames it kept
    # alive) is gone here, so the memory of the failed run can be released
    # before the next case starts.
    gc.collect()
    torch.cuda.empty_cache()
    return None


def _report_cases(pipe, test_cases, tag, file_prefix, announce):
    """Benchmark every case on *pipe*, print the result and save the image."""
    for case in test_cases:
        if announce:
            print(f"\n\033[33m正在测试 {case['res']}x{case['res']}@{case['steps']}steps...\033[0m")
        result = run_benchmark(pipe, **case)
        if result is None:
            # run_benchmark already printed why this case failed.
            continue
        print(f"[{tag}] Steps: {case['steps']}, Res: {case['res']}x{case['res']}")
        print(f"Time: {result['time']:.1f}ms | Mem: {result['mem']:.2f}GB")
        result['image'].save(f"{file_prefix}_{case['steps']}s_{case['res']}px.png")


def benchmark_compare():
    """Run the same test cases in SDPA (FP16) and baseline (FP32) mode.

    For every case that completes, the elapsed time and peak memory are
    printed and the image is saved as ``<mode>_<steps>s_<res>px.png`` in the
    current working directory.
    """
    test_cases = [
        {"steps": 20, "res": 512},
        {"steps": 32, "res": 768},
        # Uncomment to try 1024x1024; the results table in the article
        # reports no figures for this case on the RX 6750 GRE.
        # {"steps": 50, "res": 1024},
    ]

    print("\n\033[1m=== SDPA (FP16) 模式 ===\033[0m")
    pipe_sdpa = setup_pipeline(use_sdpa=True)
    _report_cases(pipe_sdpa, test_cases, "SDPA", "sdpa", announce=False)

    # Release the FP16 pipeline before the baseline is built. While this
    # reference is alive its weights stay on the GPU, which would be counted
    # in the baseline's peak-memory figure and raise its risk of running out
    # of memory.
    del pipe_sdpa
    gc.collect()
    torch.cuda.empty_cache()

    print("\n\033[1m=== 原始注意力 (FP32) 模式 ===\033[0m")
    pipe_original = setup_pipeline(use_sdpa=False)
    _report_cases(pipe_original, test_cases, "Original", "original", announce=True)


if __name__ == "__main__":
    if not torch.cuda.is_available():
        raise RuntimeError("需要 NVIDIA 或 AMD 显卡支持")

    # Detect an AMD/ROCm build through torch.version.hip rather than the
    # device name: ROCm often reports names such as "AMD Radeon RX ...",
    # which a startswith("Radeon") check does not match.
    if getattr(torch.version, "hip", None):
        # NOTE(review): TF32 is an NVIDIA tensor-core mode; these switches
        # are presumably no-ops on Radeon GPUs - confirm before relying on
        # them as an optimisation.
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        # RX 7000-series cards may need HSA_OVERRIDE_GFX_VERSION=11.0.0.
        # Export it in the shell before starting Python.
        # NOTE(review): setting it from here is presumably too late, since
        # the availability check above has already touched the GPU runtime.

    benchmark_compare()

参数	SDPA (FP16)	原始注意力 (FP32)	提升幅度
512x512 / 20步	3.85s/3.04GB	5.88s/6.79GB	⬆️34.5%/⬇️55.2%
768x768 / 32步	18.76s/3.54GB	25.11s/9.13GB	⬆️25.3%/⬇️61.2%
1024x1024 / 50步	\	\	6750GRE不配测试，这个最低显存要求