在C++中部署TVM GEMM

最新推荐文章于 2024-03-17 23:15:10 发布

HPC_C

最新推荐文章于 2024-03-17 23:15:10 发布

阅读量148

点赞数 1

分类专栏：深度学习编译器文章标签： c++ 深度学习

本文链接：https://blog.csdn.net/weixin_44604698/article/details/134788220

版权

深度学习编译器专栏收录该内容

2 篇文章 0 订阅

订阅专栏

上一篇使用TVM优化GEMM-CSDN博客文章中提到如何使用TVM优化GEMM，这篇文章主要讲一下在C++中的部署，以及测速结果，主要参考的是：使用 C++ API 部署 TVM 模块 | Apache TVM 中文站。主要代码可见：https://github.com/Beichen-Wang/HPCTest/blob/master/TVM/src/deploy/Gemm.cpp

1.python导出GEMM library

1.1先生成function

def TEBlockVectoryParallelGemm(self):
        self.M = te.var("M")
        self.K = te.var("K")
        self.N = te.var("N")
        k = te.reduce_axis((0, self.K), "k")
        A = te.placeholder((self.M, self.K), name="A")
        B = te.placeholder((self.K, self.N), name="B")
        C = te.compute((self.M, self.N), lambda x, y: te.sum(A[x,k]*B[k,y], axis = k), name="C")
        s = te.create_schedule(C.op)
        xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], self.bs, self.bs)
        ko, ki = s[C].split(k, factor=16)
        s[C].reorder(xo, yo, ko, xi, ki, yi)
        # s[C].unroll(ki)
        s[C].vectorize(yi)
        s[C].parallel(xo)
        func = tvm.build(s, [A,B,C], target = self.target, name = "blockVectoryParallel")
        print(tvm.lower(s, [A,B,C], simple_mode=True))
        return func

该步骤在上一篇使用TVM优化GEMM-CSDN博客文章中详细解释过，唯一不同的地方在于：

        self.M = te.var("M")
        self.K = te.var("K")
        self.N = te.var("N")

该命令将M、N和K定义为变量，方便在C++中改变GEMM不同的输入；

1.2导出Library

    def GetLibrary(self, func):
        curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
        dylib_path = os.path.join(curr_path, "lib/GEMM.so")
        func.export_library(dylib_path)

1.3使用方式

    M = 512
    K = 5120
    N = 512
    bs = 64
    instance = GEMM(M,N,K,bs)
    funcBlockPermuteVectory = instance.TEBlockVectoryParallelGemm()
    instance.GetLibrary(funcBlockPermuteVectory)

2.C++执行GEMM

整体代码为：

#include <dlpack/dlpack.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

#include <cstdio>
#include <vector>
#include <random>
#include <cassert>

#include "Timer.hpp"

void Check(DLTensor a, DLTensor b)
{
    // 检查形状是否相同
    assert(a.ndim == b.ndim);
    for (int i = 0; i < a.ndim; ++i)
    {
        assert(a.shape[i] == b.shape[i]);
    }
    for (int i = 0; i < a.shape[0]; ++i)
    {
        for (int j = 0; j < a.shape[1]; ++j)
        {
            assert(static_cast<float *>(a.data)[i * a.shape[1] + j] == static_cast<float *>(b.data)[i * a.shape[1] + j]);
        }
    }

    // 检查数据类型是否相同
    assert(a.dtype.code == b.dtype.code);
    assert(a.dtype.bits == b.dtype.bits);
    assert(a.dtype.lanes == b.dtype.lanes);

    std::cout << "Check passed" << std::endl;
}

#define ITERTIME 10
#define ITER(func)                     \
    for (int i = 0; i < ITERTIME; i++) \
    {                                  \
        func;                          \
    }

class GEMM
{
private:
    std::shared_ptr<DLTensor> A, B, C;
    util::Timer timer;

public:
    void Init(int M, int K, int N)
    {
        // 创建输入矩阵并填充数据
        tvm::runtime::NDArray A_array = tvm::runtime::NDArray::Empty({M, K}, {kDLFloat, 32, 1}, {kDLCPU, 0});
        tvm::runtime::NDArray B_array = tvm::runtime::NDArray::Empty({K, N}, {kDLFloat, 32, 1}, {kDLCPU, 0});
        A = std::make_shared<DLTensor>(A_array.ToDLPack()->dl_tensor);
        B = std::make_shared<DLTensor>(B_array.ToDLPack()->dl_tensor);

        // 填充输入矩阵
        std::random_device rd;
        std::default_random_engine generator(rd());

        // 定义随机数分布范围
        std::uniform_int_distribution<int> distribution(1, 10);

        for (int i = 0; i < A->shape[0] * A->shape[1]; i++)
        {
            static_cast<float *>(A->data)[i] = distribution(generator);
        }
        for (int i = 0; i < B->shape[0] * B->shape[1]; i++)
        {
            static_cast<float *>(B->data)[i] = distribution(generator);
        }
    };

    DLTensor Process(std::string fname)
    {
        tvm::runtime::NDArray C_array = tvm::runtime::NDArray::Empty({A->shape[0], B->shape[1]}, {kDLFloat, 32, 1}, {kDLCPU, 0});
        tvm::runtime::Module mod = tvm::runtime::Module::LoadFromFile(fname);
        tvm::runtime::PackedFunc f = mod.GetFunction("blockVectoryParallel");
        DLTensor C = C_array.ToDLPack()->dl_tensor;
        timer.start();
        ITER(f(A.get(), B.get(), &C));
        timer.stop();
        std::cout << "TVM GEMM used : " << timer.duration()/ITERTIME << " ms" << std::endl;
        return C;
    }

    DLTensor NaiveProcess()
    {
        // 矩阵乘法
        tvm::runtime::NDArray C_array = tvm::runtime::NDArray::Empty({A->shape[0], B->shape[1]}, {kDLFloat, 32, 1}, {kDLCPU, 0});
        DLTensor C = C_array.ToDLPack()->dl_tensor;
        timer.start();
        for (int i = 0; i < A->shape[0]; ++i)
        {
            for (int j = 0; j < B->shape[1]; ++j)
            {
                float sum = 0;
                for (int k = 0; k < A->shape[1]; ++k)
                {
                    sum += static_cast<float *>(A->data)[i * A->shape[1] + k] * static_cast<float *>(B->data)[k * B->shape[1] + j];
                }
                static_cast<float *>(C.data)[i * C.shape[1] + j] = sum;
            }
        }
        timer.stop();
        std::cout << "Naive GEMM used : " << timer.duration() << " ms" << std::endl;
        // 返回结果tensor
        return C;
    }
};

int main(int chrc, char **chrv)
{
    if (chrc < 4)
    {
        std::cout << "you should use: ./gemm 512 5120 512" << std::endl;
        exit(1);
    }
    int M = atoi(chrv[1]);
    int K = atoi(chrv[2]);
    int N = atoi(chrv[3]);
    GEMM instance;
    instance.Init(M, K, N);
    DLTensor CP = instance.Process("../../lib/GEMM.so");
    DLTensor CN = instance.NaiveProcess();
    Check(CP, CN);
}

2.1 Init

TVM的C++采用DLTensor作为基本输入和输出的格式，其数据结构为：

typedef struct {
  /*!
   * \brief The data pointer points to the allocated data. This will be CUDA
   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
   * types. This pointer is always aligned to 256 bytes as in CUDA. The
   * `byte_offset` field should be used to point to the beginning of the data.
   *
   * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
   * TVM, perhaps others) do not adhere to this 256 byte aligment requirement
   * on CPU/CUDA/ROCm, and always use `byte_offset=0`.  This must be fixed
   * (after which this note will be updated); at the moment it is recommended
   * to not rely on the data pointer being correctly aligned.
   *
   * For given DLTensor, the size of memory required to store the contents of
   * data is calculated as follows:
   *
   * \code{.c}
   * static inline size_t GetDataSize(const DLTensor* t) {
   *   size_t size = 1;
   *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
   *     size *= t->shape[i];
   *   }
   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
   *   return size;
   * }
   * \endcode
   */
  void* data;
  /*! \brief The device of the tensor */
  DLDevice device;
  /*! \brief Number of dimensions */
  int32_t ndim;
  /*! \brief The data type of the pointer*/
  DLDataType dtype;
  /*! \brief The shape of the tensor */
  int64_t* shape;
  /*!
   * \brief strides of the tensor (in number of elements, not bytes)
   *  can be NULL, indicating tensor is compact and row-majored.
   */
  int64_t* strides;
  /*! \brief The offset in bytes to the beginning pointer to data */
  uint64_t byte_offset;
} DLTensor;

其支持的device为：

typedef enum {
#endif
  /*! \brief CPU device */
  kDLCPU = 1,
  /*! \brief CUDA GPU device */
  kDLCUDA = 2,
  /*!
   * \brief Pinned CUDA CPU memory by cudaMallocHost
   */
  kDLCUDAHost = 3,
  /*! \brief OpenCL devices. */
  kDLOpenCL = 4,
  /*! \brief Vulkan buffer for next generation graphics. */
  kDLVulkan = 7,
  /*! \brief Metal for Apple GPU. */
  kDLMetal = 8,
  /*! \brief Verilog simulator buffer */
  kDLVPI = 9,
  /*! \brief ROCm GPUs for AMD GPUs */
  kDLROCM = 10,
  /*!
   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
   */
  kDLROCMHost = 11,
  /*!
   * \brief Reserved extension device type,
   * used for quickly test extension device
   * The semantics can differ depending on the implementation.
   */
  kDLExtDev = 12,
  /*!
   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
   */
  kDLCUDAManaged = 13,
  /*!
   * \brief Unified shared memory allocated on a oneAPI non-partititioned
   * device. Call to oneAPI runtime is required to determine the device
   * type, the USM allocation type and the sycl context it is bound to.
   *
   */
  kDLOneAPI = 14,
  /*! \brief GPU support for next generation WebGPU standard. */
  kDLWebGPU = 15,
  /*! \brief Qualcomm Hexagon DSP */
  kDLHexagon = 16,
} DLDeviceType;

Init的完整代码为：

    void Init(int M, int K, int N)
    {
        // 创建输入矩阵并填充数据
        tvm::runtime::NDArray A_array = tvm::runtime::NDArray::Empty({M, K}, {kDLFloat, 32, 1}, {kDLCPU, 0});
        tvm::runtime::NDArray B_array = tvm::runtime::NDArray::Empty({K, N}, {kDLFloat, 32, 1}, {kDLCPU, 0});
        A = std::make_shared<DLTensor>(A_array.ToDLPack()->dl_tensor);
        B = std::make_shared<DLTensor>(B_array.ToDLPack()->dl_tensor);

        // 填充输入矩阵
        std::random_device rd;
        std::default_random_engine generator(rd());

        // 定义随机数分布范围
        std::uniform_int_distribution<int> distribution(1, 10);

        for (int i = 0; i < A->shape[0] * A->shape[1]; i++)
        {
            static_cast<float *>(A->data)[i] = distribution(generator);
        }
        for (int i = 0; i < B->shape[0] * B->shape[1]; i++)
        {
            static_cast<float *>(B->data)[i] = distribution(generator);
        }
    };

2.2Process

        tvm::runtime::Module mod = tvm::runtime::Module::LoadFromFile(fname);
        tvm::runtime::PackedFunc f = mod.GetFunction("blockVectoryParallel");

主要通过这两行代码，将function进行load，其中“blockVectoryParallel”是在python中指定的name；

func = tvm.build(s, [A,B,C], target = self.target, name = "blockVectoryParallel")

具体调用过程为：

    DLTensor Process(std::string fname)
    {
        tvm::runtime::NDArray C_array = tvm::runtime::NDArray::Empty({A->shape[0], B->shape[1]}, {kDLFloat, 32, 1}, {kDLCPU, 0});
        tvm::runtime::Module mod = tvm::runtime::Module::LoadFromFile(fname);
        tvm::runtime::PackedFunc f = mod.GetFunction("blockVectoryParallel");
        DLTensor C = C_array.ToDLPack()->dl_tensor;
        timer.start();
        ITER(f(A.get(), B.get(), &C));
        timer.stop();
        std::cout << "TVM GEMM used : " << timer.duration()/ITERTIME << " ms" << std::endl;
        return C;
    }