上一篇使用TVM优化GEMM-CSDN博客文章中提到如何使用TVM优化GEMM,这篇文章主要讲一下在C++中的部署,以及测速结果,主要参考的是:使用 C++ API 部署 TVM 模块 | Apache TVM 中文站。主要代码可见:https://github.com/Beichen-Wang/HPCTest/blob/master/TVM/src/deploy/Gemm.cpp
1.python导出GEMM library
1.1先生成function
def TEBlockVectoryParallelGemm(self):
self.M = te.var("M")
self.K = te.var("K")
self.N = te.var("N")
k = te.reduce_axis((0, self.K), "k")
A = te.placeholder((self.M, self.K), name="A")
B = te.placeholder((self.K, self.N), name="B")
C = te.compute((self.M, self.N), lambda x, y: te.sum(A[x,k]*B[k,y], axis = k), name="C")
s = te.create_schedule(C.op)
xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], self.bs, self.bs)
ko, ki = s[C].split(k, factor=16)
s[C].reorder(xo, yo, ko, xi, ki, yi)
# s[C].unroll(ki)
s[C].vectorize(yi)
s[C].parallel(xo)
func = tvm.build(s, [A,B,C], target = self.target, name = "blockVectoryParallel")
print(tvm.lower(s, [A,B,C], simple_mode=True))
return func
该步骤在上一篇使用TVM优化GEMM-CSDN博客文章中详细解释过,唯一不同的地方在于:
self.M = te.var("M")
self.K = te.var("K")
self.N = te.var("N")
该命令将M、N和K定义为变量,方便在C++中改变GEMM不同的输入;
1.2导出Library
def GetLibrary(self, func):
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
dylib_path = os.path.join(curr_path, "lib/GEMM.so")
func.export_library(dylib_path)
1.3使用方式
M = 512
K = 5120
N = 512
bs = 64
instance = GEMM(M,N,K,bs)
funcBlockPermuteVectory = instance.TEBlockVectoryParallelGemm()
instance.GetLibrary(funcBlockPermuteVectory)
2.C++执行GEMM
整体代码为:
#include <dlpack/dlpack.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>
#include <cstdio>
#include <vector>
#include <random>
#include <cassert>
#include "Timer.hpp"
void Check(DLTensor a, DLTensor b)
{
// 检查形状是否相同
assert(a.ndim == b.ndim);
for (int i = 0; i < a.ndim; ++i)
{
assert(a.shape[i] == b.shape[i]);
}
for (int i = 0; i < a.shape[0]; ++i)
{
for (int j = 0; j < a.shape[1]; ++j)
{
assert(static_cast<float *>(a.data)[i * a.shape[1] + j] == static_cast<float *>(b.data)[i * a.shape[1] + j]);
}
}
// 检查数据类型是否相同
assert(a.dtype.code == b.dtype.code);
assert(a.dtype.bits == b.dtype.bits);
assert(a.dtype.lanes == b.dtype.lanes);
std::cout << "Check passed" << std::endl;
}
#define ITERTIME 10
#define ITER(func) \
for (int i = 0; i < ITERTIME; i++) \
{ \
func; \
}
class GEMM
{
private:
std::shared_ptr<DLTensor> A, B, C;
util::Timer timer;
public:
void Init(int M, int K, int N)
{
// 创建输入矩阵并填充数据
tvm::runtime::NDArray A_array = tvm::runtime::NDArray::Empty({M, K}, {kDLFloat, 32, 1}, {kDLCPU, 0});
tvm::runtime::NDArray B_array = tvm::runtime::NDArray::Empty({K, N}, {kDLFloat, 32, 1}, {kDLCPU, 0});
A = std::make_shared<DLTensor>(A_array.ToDLPack()->dl_tensor);
B = std::make_shared<DLTensor>(B_array.ToDLPack()->dl_tensor);
// 填充输入矩阵
std::random_device rd;
std::default_random_engine generator(rd());
// 定义随机数分布范围
std::uniform_int_distribution<int> distribution(1, 10);
for (int i = 0; i < A->shape[0] * A->shape[1]; i++)
{
static_cast<float *>(A->data)[i] = distribution(generator);
}
for (int i = 0; i < B->shape[0] * B->shape[1]; i++)
{
static_cast<float *>(B->data)[i] = distribution(generator);
}
};
DLTensor Process(std::string fname)
{
tvm::runtime::NDArray C_array = tvm::runtime::NDArray::Empty({A->shape[0], B->shape[1]}, {kDLFloat, 32, 1}, {kDLCPU, 0});
tvm::runtime::Module mod = tvm::runtime::Module::LoadFromFile(fname);
tvm::runtime::PackedFunc f = mod.GetFunction("blockVectoryParallel");
DLTensor C = C_array.ToDLPack()->dl_tensor;
timer.start();
ITER(f(A.get(), B.get(), &C));
timer.stop();
std::cout << "TVM GEMM used : " << timer.duration()/ITERTIME << " ms" << std::endl;
return C;
}
DLTensor NaiveProcess()
{
// 矩阵乘法
tvm::runtime::NDArray C_array = tvm::runtime::NDArray::Empty({A->shape[0], B->shape[1]}, {kDLFloat, 32, 1}, {kDLCPU, 0});
DLTensor C = C_array.ToDLPack()->dl_tensor;
timer.start();
for (int i = 0; i < A->shape[0]; ++i)
{
for (int j = 0; j < B->shape[1]; ++j)
{
float sum = 0;
for (int k = 0; k < A->shape[1]; ++k)
{
sum += static_cast<float *>(A->data)[i * A->shape[1] + k] * static_cast<float *>(B->data)[k * B->shape[1] + j];
}
static_cast<float *>(C.data)[i * C.shape[1] + j] = sum;
}
}
timer.stop();
std::cout << "Naive GEMM used : " << timer.duration() << " ms" << std::endl;
// 返回结果tensor
return C;
}
};
int main(int chrc, char **chrv)
{
if (chrc < 4)
{
std::cout << "you should use: ./gemm 512 5120 512" << std::endl;
exit(1);
}
int M = atoi(chrv[1]);
int K = atoi(chrv[2]);
int N = atoi(chrv[3]);
GEMM instance;
instance.Init(M, K, N);
DLTensor CP = instance.Process("../../lib/GEMM.so");
DLTensor CN = instance.NaiveProcess();
Check(CP, CN);
}
2.1 Init
TVM的C++采用DLTensor作为基本输入和输出的格式,其数据结构为:
typedef struct {
/*!
* \brief The data pointer points to the allocated data. This will be CUDA
* device pointer or cl_mem handle in OpenCL. It may be opaque on some device
* types. This pointer is always aligned to 256 bytes as in CUDA. The
* `byte_offset` field should be used to point to the beginning of the data.
*
* Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
* TVM, perhaps others) do not adhere to this 256 byte aligment requirement
* on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
* (after which this note will be updated); at the moment it is recommended
* to not rely on the data pointer being correctly aligned.
*
* For given DLTensor, the size of memory required to store the contents of
* data is calculated as follows:
*
* \code{.c}
* static inline size_t GetDataSize(const DLTensor* t) {
* size_t size = 1;
* for (tvm_index_t i = 0; i < t->ndim; ++i) {
* size *= t->shape[i];
* }
* size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
* return size;
* }
* \endcode
*/
void* data;
/*! \brief The device of the tensor */
DLDevice device;
/*! \brief Number of dimensions */
int32_t ndim;
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
int64_t* shape;
/*!
* \brief strides of the tensor (in number of elements, not bytes)
* can be NULL, indicating tensor is compact and row-majored.
*/
int64_t* strides;
/*! \brief The offset in bytes to the beginning pointer to data */
uint64_t byte_offset;
} DLTensor;
其支持的device为:
typedef enum {
#endif
/*! \brief CPU device */
kDLCPU = 1,
/*! \brief CUDA GPU device */
kDLCUDA = 2,
/*!
* \brief Pinned CUDA CPU memory by cudaMallocHost
*/
kDLCUDAHost = 3,
/*! \brief OpenCL devices. */
kDLOpenCL = 4,
/*! \brief Vulkan buffer for next generation graphics. */
kDLVulkan = 7,
/*! \brief Metal for Apple GPU. */
kDLMetal = 8,
/*! \brief Verilog simulator buffer */
kDLVPI = 9,
/*! \brief ROCm GPUs for AMD GPUs */
kDLROCM = 10,
/*!
* \brief Pinned ROCm CPU memory allocated by hipMallocHost
*/
kDLROCMHost = 11,
/*!
* \brief Reserved extension device type,
* used for quickly test extension device
* The semantics can differ depending on the implementation.
*/
kDLExtDev = 12,
/*!
* \brief CUDA managed/unified memory allocated by cudaMallocManaged
*/
kDLCUDAManaged = 13,
/*!
* \brief Unified shared memory allocated on a oneAPI non-partititioned
* device. Call to oneAPI runtime is required to determine the device
* type, the USM allocation type and the sycl context it is bound to.
*
*/
kDLOneAPI = 14,
/*! \brief GPU support for next generation WebGPU standard. */
kDLWebGPU = 15,
/*! \brief Qualcomm Hexagon DSP */
kDLHexagon = 16,
} DLDeviceType;
Init的完整代码为:
void Init(int M, int K, int N)
{
// 创建输入矩阵并填充数据
tvm::runtime::NDArray A_array = tvm::runtime::NDArray::Empty({M, K}, {kDLFloat, 32, 1}, {kDLCPU, 0});
tvm::runtime::NDArray B_array = tvm::runtime::NDArray::Empty({K, N}, {kDLFloat, 32, 1}, {kDLCPU, 0});
A = std::make_shared<DLTensor>(A_array.ToDLPack()->dl_tensor);
B = std::make_shared<DLTensor>(B_array.ToDLPack()->dl_tensor);
// 填充输入矩阵
std::random_device rd;
std::default_random_engine generator(rd());
// 定义随机数分布范围
std::uniform_int_distribution<int> distribution(1, 10);
for (int i = 0; i < A->shape[0] * A->shape[1]; i++)
{
static_cast<float *>(A->data)[i] = distribution(generator);
}
for (int i = 0; i < B->shape[0] * B->shape[1]; i++)
{
static_cast<float *>(B->data)[i] = distribution(generator);
}
};
2.2Process
tvm::runtime::Module mod = tvm::runtime::Module::LoadFromFile(fname);
tvm::runtime::PackedFunc f = mod.GetFunction("blockVectoryParallel");
主要通过这两行代码,将function进行load,其中“blockVectoryParallel”是在python中指定的name;
func = tvm.build(s, [A,B,C], target = self.target, name = "blockVectoryParallel")
具体调用过程为:
DLTensor Process(std::string fname)
{
tvm::runtime::NDArray C_array = tvm::runtime::NDArray::Empty({A->shape[0], B->shape[1]}, {kDLFloat, 32, 1}, {kDLCPU, 0});
tvm::runtime::Module mod = tvm::runtime::Module::LoadFromFile(fname);
tvm::runtime::PackedFunc f = mod.GetFunction("blockVectoryParallel");
DLTensor C = C_array.ToDLPack()->dl_tensor;
timer.start();
ITER(f(A.get(), B.get(), &C));
timer.stop();
std::cout << "TVM GEMM used : " << timer.duration()/ITERTIME << " ms" << std::endl;
return C;
}
3.测试时间
语言 | 测试时间(ms) |
python | 468.118 |
C++ | 438.224 |
部署时间基本一致;