TVM Scan and Recurrent Kernel


#!/usr/bin/env python
# coding: utf-8

# # Scan and Recurrent Kernel

# Recurrent computation is a typical pattern in neural networks.

# In[1]:

from __future__ import absolute_import, print_function

import tvm
import tvm.testing
from tvm import te
import numpy as np

# TVM supports a scan operator to describe symbolic loops. The scan below computes the cumsum over the columns of X.
# 
# The scan is carried over the highest dimension of the tensor. s_state is a placeholder that describes the transition state of the scan. s_init describes how to initialize the first k timesteps; here, since s_init's first dimension is 1, it describes how to initialize the state at the first timestep.
# 
# s_update describes how to update the value at timestep t. The update value can refer back to the values of the previous timestep via the state placeholder. Note that it is invalid to refer to s_state at the current or a later timestep.
# 
# The scan takes in the state placeholder, the initial value, and the update description. It is also recommended (although not required) to list the inputs of the scan cell. The result of the scan is a tensor that gives the result of s_state after the updates over the time domain.

# In[2]:


m = te.var("m")
n = te.var("n")
X = te.placeholder((m, n), name="X")
s_state = te.placeholder((m, n))
s_init = te.compute((1, n), lambda _, i: X[0, i])
s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i])
s_scan = tvm.te.scan(s_init, s_update, s_state, inputs=[X])
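
# Before scheduling anything, it can be helpful to inspect the default lowered IR.
# The following is a minimal sketch that uses only the definitions above (no extra
# assumptions): it creates a default schedule and prints the naive sequential loop.

s_naive = te.create_schedule(s_scan.op)
print(tvm.lower(s_naive, [X, s_scan], simple_mode=True))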

# # Schedule the Scan Cell
# 
# We can schedule the body of the scan by scheduling the update and init parts separately. Note that it is invalid to schedule the first iteration dimension of the update part. To split on the time iteration, users can schedule on scan_op.scan_axis instead.

# In[3]:


s = te.create_schedule(s_scan.op)
num_thread = 256
block_x = te.thread_axis("blockIdx.x")
thread_x = te.thread_axis("threadIdx.x")
xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread)
s[s_init].bind(xo, block_x)
s[s_init].bind(xi, thread_x)
xo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread)
s[s_update].bind(xo, block_x)
s[s_update].bind(xi, thread_x)
print(tvm.lower(s, [X, s_scan], simple_mode=True))

output[3]:
primfn(X_1: handle, scan_1: handle) -> ()
  attr = {"global_symbol": "main", "tir.noalias": True}
  buffers = {scan: Buffer(scan_2: Pointer(float32), float32, [m: int32, n: int32], [stride: int32, stride_1: int32], type="auto"),
             X: Buffer(X_2: Pointer(float32), float32, [m, n], [stride_2: int32, stride_3: int32], type="auto")}
  buffer_map = {X_1: X, scan_1: scan} {
  attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = floordiv((n + 255), 256);
  attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 256;
  if @tir.likely((((blockIdx.x*256) + threadIdx.x) < n), dtype=bool) {
    scan_2[(((blockIdx.x*256) + threadIdx.x)*stride_1)] = (float32*)X_2[(((blockIdx.x*256) + threadIdx.x)*stride_3)]
  }
  for (scan.idx: int32, 0, (m - 1)) {
    attr [IterVar(blockIdx.x, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = floordiv((n + 255), 256);
    attr [IterVar(threadIdx.x, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 256;
    if @tir.likely((((blockIdx.x*256) + threadIdx.x) < n), dtype=bool) {
      scan_2[(((scan.idx + 1)*stride) + (((blockIdx.x*256) + threadIdx.x)*stride_1))] = ((float32*)scan_2[((scan.idx*stride) + (((blockIdx.x*256) + threadIdx.x)*stride_1))] + (float32*)X_2[(((scan.idx + 1)*stride_2) + (((blockIdx.x*256) + threadIdx.x)*stride_3))])
    }
  }
}
# # Build and Verify
# 
# We can build the scan kernel like other TVM kernels; here we use numpy to verify the correctness of the result.

# In[4]:


fscan = tvm.build(s, [X, s_scan], "cuda", name="myscan")
ctx = tvm.gpu(0)
n = 1024
m = 10
a_np = np.random.uniform(size=(m, n)).astype(s_scan.dtype)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), ctx)
fscan(a, b)
tvm.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0))
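
# The kernel above targets CUDA. As a hedged alternative sketch (assuming only that
# an LLVM-enabled TVM build and a CPU are available; the *_cpu names are illustrative,
# not part of the original tutorial), the same scan can be built with a fresh default
# schedule, without thread bindings, and verified the same way.

s_cpu = te.create_schedule(s_scan.op)
fscan_cpu = tvm.build(s_cpu, [X, s_scan], "llvm", name="myscan_cpu")
cpu_ctx = tvm.cpu(0)
a_cpu = tvm.nd.array(a_np, cpu_ctx)
b_cpu = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), cpu_ctx)
fscan_cpu(a_cpu, b_cpu)
tvm.testing.assert_allclose(b_cpu.asnumpy(), np.cumsum(a_np, axis=0))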

# # Multi-Stage Scan Cell
# 
# In the example above we described the scan cell with a single tensor computation stage in s_update. It is possible to use multiple tensor stages inside the scan cell.
# 
# The following lines demonstrate a scan with two stage operations in the scan cell.

# In[7]:


m = te.var("m")
n = te.var("n")
X = te.placeholder((m, n), name="X")
s_state = te.placeholder((m, n))
s_init = te.compute((1, n), lambda _, i: X[0, i])
s_update_s1 = te.compute((m, n), lambda t, i: s_state[t - 1, i] * 2, name="s1")
s_update_s2 = te.compute((m, n), lambda t, i: s_update_s1[t, i] + X[t, i], name="s2")
s_scan = tvm.te.scan(s_init, s_update_s2, s_state, inputs=[X])


# These intermediate tensors can also be scheduled normally. To ensure correctness, TVM creates a group constraint that forbids the scan body from being computed outside the scan loop.

# In[8]:


s = te.create_schedule(s_scan.op)
xo, xi = s[s_update_s2].split(s_update_s2.op.axis[1], factor=32)
s[s_update_s1].compute_at(s[s_update_s2], xo)
print(tvm.lower(s, [X, s_scan], simple_mode=True))

output[8]:
primfn(X_1: handle, scan_1: handle) -> ()
  attr = {"global_symbol": "main", "tir.noalias": True}
  buffers = {scan: Buffer(scan_2: Pointer(float32), float32, [m: int32, n: int32], [stride: int32, stride_1: int32], type="auto"),
             X: Buffer(X_2: Pointer(float32), float32, [m, n], [stride_2: int32, stride_3: int32], type="auto")}
  buffer_map = {X_1: X, scan_1: scan} {
  attr [s1: Pointer(float32)] "storage_scope" = "global";
  allocate(s1, float32, [32]) {
    for (i: int32, 0, n) {
      scan_2[(i*stride_1)] = (float32*)X_2[(i*stride_3)]
    }
    for (scan.idx: int32, 0, (m - 1)) {
      for (i.outer: int32, 0, floordiv((n + 31), 32)) {
        for (i_1: int32, 0, 32) {
          if @tir.likely((((i.outer*32) + i_1) < n), dtype=bool) {
            s1[i_1] = ((float32*)scan_2[((scan.idx*stride) + (((i.outer*32) + i_1)*stride_1))]*2f32)
          }
        }
        for (i.inner: int32, 0, 32) {
          if @tir.likely((((i.outer*32) + i.inner) < n), dtype=bool) {
            scan_2[(((scan.idx + 1)*stride) + (((i.outer*32) + i.inner)*stride_1))] = ((float32*)s1[i.inner] + (float32*)X_2[(((scan.idx + 1)*stride_2) + (((i.outer*32) + i.inner)*stride_3))])
          }
        }
      }
    }
  }
}
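
# As a hedged sketch (assuming an LLVM-enabled TVM build; the numeric sizes below
# are arbitrary illustrative values), we can build the two-stage scan with the
# schedule above and check its recurrence state[t] = 2 * state[t-1] + X[t]
# against a plain numpy loop.

f2 = tvm.build(s, [X, s_scan], "llvm", name="two_stage_scan")
m_val, n_val = 10, 128
x_np = np.random.uniform(size=(m_val, n_val)).astype("float32")
ref = np.zeros_like(x_np)
ref[0] = x_np[0]
for t in range(1, m_val):
    ref[t] = ref[t - 1] * 2 + x_np[t]
cpu_ctx = tvm.cpu(0)
x_nd = tvm.nd.array(x_np, cpu_ctx)
out_nd = tvm.nd.array(np.zeros((m_val, n_val), dtype="float32"), cpu_ctx)
f2(x_nd, out_nd)
tvm.testing.assert_allclose(out_nd.asnumpy(), ref, rtol=1e-5)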
# # Multiple States
# 
# For complicated applications like RNNs, we might need more than one recurrent state. Scan supports multiple recurrent states. The following example demonstrates how to build a recurrence with two states.

# In[9]:


m = te.var("m")
n = te.var("n")
l = te.var("l")
X = te.placeholder((m, n), name="X")
s_state1 = te.placeholder((m, n))
s_state2 = te.placeholder((m, l))
s_init1 = te.compute((1, n), lambda _, i: X[0, i])
s_init2 = te.compute((1, l), lambda _, i: 0.0)
s_update1 = te.compute((m, n), lambda t, i: s_state1[t - 1, i] + X[t, i])
s_update2 = te.compute((m, l), lambda t, i: s_state2[t - 1, i] + s_state1[t - 1, 0])
s_scan1, s_scan2 = tvm.te.scan([s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2], inputs=[X])
s = te.create_schedule(s_scan1.op)
print(tvm.lower(s, [X, s_scan1, s_scan2], simple_mode=True))

output[9]:
primfn(X_1: handle, scan.v0_1: handle, scan.v1_1: handle) -> ()
  attr = {"global_symbol": "main", "tir.noalias": True}
  buffers = {scan.v1: Buffer(scan.v1_2: Pointer(float32), float32, [m: int32, l: int32], [stride: int32, stride_1: int32], type="auto"),
             X: Buffer(X_2: Pointer(float32), float32, [m, n: int32], [stride_2: int32, stride_3: int32], type="auto"),
             scan.v0: Buffer(scan.v0_2: Pointer(float32), float32, [m, n], [stride_4: int32, stride_5: int32], type="auto")}
  buffer_map = {X_1: X, scan.v0_1: scan.v0, scan.v1_1: scan.v1} {
  for (i: int32, 0, n) {
    scan.v0_2[(i*stride_5)] = (float32*)X_2[(i*stride_3)]
  }
  for (i_1: int32, 0, l) {
    scan.v1_2[(i_1*stride_1)] = 0f32
  }
  for (scan.idx: int32, 0, (m - 1)) {
    for (i_2: int32, 0, n) {
      scan.v0_2[(((scan.idx + 1)*stride_4) + (i_2*stride_5))] = ((float32*)scan.v0_2[((scan.idx*stride_4) + (i_2*stride_5))] + (float32*)X_2[(((scan.idx + 1)*stride_2) + (i_2*stride_3))])
    }
    for (i_3: int32, 0, l) {
      scan.v1_2[(((scan.idx + 1)*stride) + (i_3*stride_1))] = ((float32*)scan.v1_2[((scan.idx*stride) + (i_3*stride_1))] + (float32*)scan.v0_2[(scan.idx*stride_4)])
    }
  }
}
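
# As a final hedged sketch (assuming an LLVM-enabled TVM build and that the default
# schedule above suffices; the concrete sizes are arbitrary), the two-state scan can
# be built and both outputs checked against a numpy reference: s_scan1 is the cumsum
# of X, and s_scan2 accumulates s_scan1[t - 1, 0] starting from zero.

f3 = tvm.build(s, [X, s_scan1, s_scan2], "llvm", name="two_state_scan")
m_val, n_val, l_val = 8, 16, 4
x_np = np.random.uniform(size=(m_val, n_val)).astype("float32")
ref1 = np.cumsum(x_np, axis=0)
ref2 = np.zeros((m_val, l_val), dtype="float32")
for t in range(1, m_val):
    ref2[t] = ref2[t - 1] + ref1[t - 1, 0]
cpu_ctx = tvm.cpu(0)
x_nd = tvm.nd.array(x_np, cpu_ctx)
out1 = tvm.nd.array(np.zeros((m_val, n_val), dtype="float32"), cpu_ctx)
out2 = tvm.nd.array(np.zeros((m_val, l_val), dtype="float32"), cpu_ctx)
f3(x_nd, out1, out2)
tvm.testing.assert_allclose(out1.asnumpy(), ref1, rtol=1e-5)
tvm.testing.assert_allclose(out2.asnumpy(), ref2, rtol=1e-5)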

