实践:将cuda代码转化成hip代码,并编写为pytorch的extension
将cuda代码转化成hip代码,并编写为pytorch的extension
目的:将fused-attention这份代码,由cuda转化到hip。并且让转化后的hip代码也能编写为pytorch的extension。
难点:fused-attention 使用了wmma,如何让这种调库的代码编写为pytorch的extension。
首先将代码转到hip并能成功执行
- 下载代码到本地
git clone https://github.com/kst179/fused-attention.git
- 使用hipify工具转化代码
/opt/rocm/bin/hipconvertinplace-perl.sh [路径] # 这里路径指向fused attention本地地址
- 修改wmma相关代码
- wmma 替换成rocwmma
- #include <mma.h> 替换成 #include <rocwmma/rocwmma.hpp>
- warp_size 改成64
- 去掉所有的nvcuda
- 删掉多余的配置代码(AMD的共享内存不需要这样设置)
CHECK_CUDA_ERROR(cudaFuncSetAttribute(
attention_kernel<head_dim, chunk_size, n_warps>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem_size));
- 修改half计算相关代码
- 修改half的定义
using half_t = half;
using half2_t = half2;
    - half2不能使用.x和.y获取元素,修改成__low2half()和__high2half()(下方为修改前、仍使用.x/.y的代码)
if (threads_per_row == 1) {
vec[row] = __hmax(threadwise_val.x, threadwise_val.y);
return;
}
...
if (thread_idx < height) {
threadwise_val = *(half2_t*)&aux[thread_idx * ldm_aux];
#pragma unroll
for (uint32_t i = 1; i < threads_per_row; ++i) {
threadwise_val = __hmax2(threadwise_val, *(half2_t*)&aux[thread_idx * ldm_aux + i * elements_per_storage]);
}
vec[thread_idx] = __hmax(threadwise_val.x, threadwise_val.y);
}
}
- hip 缺少__hmax和__hmax2,自己添加到最前面
// Scalar half-precision max.
// HIP (as of this ROCm version) does not ship __hmax, so emulate it with
// the greater-than intrinsic: return whichever operand compares larger.
__device__ half __hmax(half a, half b) {
    if (__hgt(a, b)) {
        return a;
    }
    return b;
}
// Packed half2 max.
// HIP lacks __hmax2; compare the low and high lanes independently with
// __hgt and repack the two winners into a half2.
__device__ half2 __hmax2(half2 a, half2 b) {
    const half lo_a = __low2half(a);
    const half lo_b = __low2half(b);
    const half hi_a = __high2half(a);
    const half hi_b = __high2half(b);
    const half lo_max = __hgt(lo_a, lo_b) ? lo_a : lo_b;
    const half hi_max = __hgt(hi_a, hi_b) ? hi_a : hi_b;
    return __halves2half2(lo_max, hi_max);
}
测试代码是否可以运行
- 在src文件夹中编写makefile文件,内容如下:
# Default goal: build the standalone HIP test binary.
ALL: test
# Header search path for the fused-attention headers (repo's include/ dir).
include=-I../include
# NOTE(review): gfx1030 is excluded because rocwmma does not support it;
# gfx90a (MI200-class) is the only target compiled here.
test:
	hipcc test.cu ${include} --std=c++20 --no-offload-arch=gfx1030 --offload-arch=gfx90a
--no-offload-arch=gfx1030 表示不对gfx1030进行编译,rocwmma不支持gfx1030,编译gfx1030会报错。
- 执行a.out ,如果结果不是nan,说明前面的工作已完成。不然的话自查,看哪里还有问题。
将代码编写为pytorch的 extension
- 修改setup.py文件, 将除了gfx90a的架构全部设置成不编译。(这样设置的原因是pytorch编译时会将所有的架构加到最后,不设置不编译,编译到rocwmma不支持的架构就会报错)
# setup.py: build the hipified fused-attention kernel as a PyTorch C++/HIP extension.
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
from pathlib import Path
import os
# Directory containing this setup.py; used to form absolute source/include paths.
workspace_dir = Path(os.path.dirname(os.path.abspath(__file__)))
setup(
    name="fused_attn",
    ext_modules=[
        # NOTE(review): CUDAExtension is used even under ROCm — PyTorch's build
        # machinery hipifies it; confirm against the installed torch version.
        CUDAExtension(
            name="fused_attn",
            # NOTE: "extention" spelling must match the on-disk file name.
            sources=[str(workspace_dir / "src" / "fused_attn_extention.cu")],
            include_dirs=[str(workspace_dir / "include")],
            extra_compile_args=[
                "-O3",
                "-std=c++20",
                "-I/opt/rocm/include",
                "-I/opt/rocm/hip/include",
                # PyTorch appends every known GPU arch at build time; explicitly
                # disable the archs rocwmma does not support so compilation of
                # those targets does not fail.
                "--no-offload-arch=gfx1030",
                "--no-offload-arch=gfx900",
                "--no-offload-arch=gfx906",
                "--no-offload-arch=gfx908"
            ],
        )
    ],
    cmdclass={
        "build_ext": BuildExtension
    }
)
可能会遇到的错误
In file included from /data/zhaorong/code/fused-attention/src/fused_attn_extention.hip:4:
In file included from /data/zhaorong/code/fused-attention/include/fused_attn_hip.cuh:6:
In file included from /opt/rocm-5.4.0/include/rocwmma/rocwmma.hpp:31:
In file included from /opt/rocm-5.4.0/include/rocwmma/internal/io_config.hpp:29:
In file included from /opt/rocm-5.4.0/include/rocwmma/internal/broadcast.hpp:29:
In file included from /opt/rocm-5.4.0/include/rocwmma/internal/types.hpp:339:
/opt/rocm-5.4.0/include/rocwmma/internal/types_ext.hpp:328:40: error: no matching conversion for static_cast from 'const rocwmma::hfloat16_t' (aka 'const __half') to 'rocwmma::float16_t' (aka '_Float16')
return static_cast<hfloat16_t>(static_cast<float16_t>(x) * static_cast<float16_t>(y));
进入 types_ext.hpp 将报错代码修改为
// Host-side multiply for rocwmma::hfloat16_t (__half).
// Works around the rocm 5.4 types_ext.hpp static_cast build error by
// reinterpreting the underlying bits as _Float16, multiplying natively,
// and reinterpreting the product back.
__host__ inline hfloat16_t operator*(const hfloat16_t& x, const hfloat16_t& y)
{
    float16_t lhs = *(float16_t *)(void *)(&x);
    float16_t rhs = *(float16_t *)(void *)(&y);
    float16_t product = lhs * rhs;
    return *(hfloat16_t *)(void *)&product;
}
// Host-side add for rocwmma::hfloat16_t (__half).
// Same bit-reinterpretation workaround as operator*: treat both operands
// as _Float16, add, and return the sum reinterpreted as hfloat16_t.
__host__ inline hfloat16_t operator+(const hfloat16_t& x, const hfloat16_t& y)
{
    float16_t lhs = *(float16_t *)(void *)(&x);
    float16_t rhs = *(float16_t *)(void *)(&y);
    float16_t total = lhs + rhs;
    return *(hfloat16_t *)(void *)&total;
}
// Host-side compound add for rocwmma::hfloat16_t (__half).
// Computes x + y via the _Float16 bit-reinterpretation workaround, stores
// the result back into x, and returns x by reference.
__host__ inline hfloat16_t& operator+=(hfloat16_t& x, const hfloat16_t& y)
{
    float16_t accum = *(float16_t *)(void *)(&x);
    float16_t addend = *(float16_t *)(void *)(&y);
    accum = accum + addend;
    x = *(hfloat16_t *)(void *)&accum;
    return x;
}
测试能否在pytorch端运行,执行 tests/benchmark_fused_attn.py
可能会出现out of memory 的错误,不要紧张。
将head_dim 修改为 64。这是因为之前将warp_size设置成64后,一个block的总线程数会超过1024。
修改之后,重新执行结果如下:
python3 tests/benchmark_fused_attn.py
Naive 12.1705 ms
Fused 25.2257 ms
后续工作
- 探索为什么fused 比 naive更慢。
- 测量一下算子的误差。