TVM生成shader教程（1）

最新推荐文章于 2025-02-02 17:26:10 发布

头发光了你就强了

最新推荐文章于 2025-02-02 17:26:10 发布

阅读量390

点赞数

分类专栏： tvm 文章标签：深度学习

本文链接：https://blog.csdn.net/qq_33345917/article/details/109311382

版权

tvm 专栏收录该内容

1 篇文章

订阅专栏

TVM 的Tensor expression language可以生成高效shader，不必再手写shader.
看代码：

from __future__ import absolute_import, print_function

import tvm
import tvm.testing
from tvm import te
import numpy as np

# Global declarations of environment.

# Host-side compilation target; passed to tvm.build as target_host.
tgt_host = "llvm"
# Change it to respective GPU if gpu is enabled Ex: cuda, opencl, rocm
# Device target; it is passed to tvm.build and also selects the device
# context and the GPU-only branches in the rest of this script.
tgt = "cuda"
# Length of the vectors, left as a variable rather than a fixed number;
# the generated kernel receives it as the `int n` argument.
n = te.var("n")
# Two 1-D input tensors of length n.
A = te.placeholder((n,), name="A")
B = te.placeholder((n,), name="B")
# Element-wise sum: C[i] = A[i] + B[i], with the same shape as A.
C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
# Show what kind of object te.compute returned.
print(type(C))

经过te.compute函数，生成的是for循环串行计算描述，相当于下面代码：

for (int i = 0; i < n; ++i) {
  C[i] = A[i] + B[i];
}

接下来就要引入Tensor expression language的核心概念：schedule
schedule是描述计算方式的，比如串行和并行，调用te.compute函数后，生成的schedule默认是串行的
接下来我们把串行的变成并行的。
创建schedule

# Create a schedule for the operation that produces C; the split/bind
# calls later in this script are applied to this schedule object.
s = te.create_schedule(C.op)

设置grid和block，下面代码等价于block(64,1,1),grid(ceil(n/64),1,1)；当n不是64的整数倍时，最后一个block中越界的线程由生成代码里的边界判断过滤掉

# Split the single axis of C into an outer axis and an inner axis of 64.
block_axis, thread_axis = s[C].split(C.op.axis[0], factor=64)
# Only GPU-style targets get the axes bound to block/thread indices.
if tgt in ("cuda", "rocm") or tgt.startswith("opencl"):
    s[C].bind(block_axis, te.thread_axis("blockIdx.x"))
    s[C].bind(thread_axis, te.thread_axis("threadIdx.x"))

指定schedule之后，就可以编译生成TVM函数，生成的TVM函数包括host端的launch kernel函数和device端的shader

# Compile the schedule into a callable named "myadd" whose arguments are
# (A, B, C); tgt is the device target and tgt_host the host-side target.
fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd")

申请GPU资源，并向GPU拷贝数据，在GPU上进行计算

# Device context 0 of the selected target.
ctx = tvm.context(tgt, 0)

# Concrete vector length for this run (rebinds the name used for te.var).
n = 1024

# Generate the random inputs on the host first, then wrap them as device arrays.
host_a = np.random.uniform(size=n).astype(A.dtype)
host_b = np.random.uniform(size=n).astype(B.dtype)
a = tvm.nd.array(host_a, ctx)
b = tvm.nd.array(host_b, ctx)
# Output buffer, zero-initialised, with the dtype of C.
c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)

# Run the compiled function and compare against the NumPy result.
fadd(a, b, c)
tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

检查生成的shader

# GPU-style targets keep the device code in the first imported module;
# any other target prints the source of the built module itself.
is_gpu_target = tgt in ("cuda", "rocm") or tgt.startswith("opencl")
if not is_gpu_target:
    print(fadd.get_source())
else:
    dev_module = fadd.imported_modules[0]
    print("-----GPU code-----")
    print(dev_module.get_source())

生成的shader如下：

-----GPU code-----
extern "C" __global__ void myadd_kernel0(float* __restrict__ C, float* __restrict__ A, float* __restrict__ B, int n, int stride, int stride1, int stride2) {
  if (((int)blockIdx.x) < (n >> 6)) {
    C[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride2))] = (A[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride))] + B[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride1))]);
  } else {
    if (((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) < n) {
      C[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride2))] = (A[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride))] + B[((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride1))]);
    }
  }
}

保存编译模型，host代码保存为动态库，device代码按目标分别保存为ptx（cuda）、hsaco（rocm）或cl（opencl）

from tvm.contrib import cc
from tvm.contrib import util

# Work inside a temporary directory.
temp = util.tempdir()

# Host-side object file.
fadd.save(temp.relpath("myadd.o"))

# Pick the device-code file name that matches the target, if any.
if tgt == "cuda":
    device_file = "myadd.ptx"
elif tgt == "rocm":
    device_file = "myadd.hsaco"
elif tgt.startswith("opencl"):
    device_file = "myadd.cl"
else:
    device_file = None

# Device code lives in the first imported module of fadd.
if device_file is not None:
    fadd.imported_modules[0].save(temp.relpath(device_file))

# Link the object file into a shared library and list what was written.
cc.create_shared(temp.relpath("myadd.so"), [temp.relpath("myadd.o")])
print(temp.listdir())

加载编译模型

# Load the host-side shared library saved earlier.
fadd1 = tvm.runtime.load_module(temp.relpath("myadd.so"))

# Pick the saved device-code file that matches the target, if any.
if tgt == "cuda":
    dev_artifact = "myadd.ptx"
elif tgt == "rocm":
    dev_artifact = "myadd.hsaco"
elif tgt.startswith("opencl"):
    dev_artifact = "myadd.cl"
else:
    dev_artifact = None

# Load the device module and attach it to the host module.
if dev_artifact is not None:
    fadd1_dev = tvm.runtime.load_module(temp.relpath(dev_artifact))
    fadd1.import_module(fadd1_dev)

# The reloaded function must give the same result as NumPy.
fadd1(a, b, c)
tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())