参考
环境(非conda):
- NVIDIA Driver Version : 545.23.08
- CUDA Version: 12.1
- Python Version: 3.11
- Pytorch Version: 2.1
- Cmake version : 3.18.1
- 工作目录:
workspace/test
目的
用TorchScript C++自定义运算符方法,添加两个向量相加的算子到pytorch。
一、 C++ 自定义运算符
创建workspace/test/add2.cpp
:
#include <stdio.h>
#include "add2.cuh"
#include "torch/script.h"
namespace {
using torch::Tensor;
using torch::DeviceType;
Tensor myadd_cpu(const Tensor& self_, const Tensor& other_) {
TORCH_CHECK(self_.sizes() == other_.sizes());
TORCH_INTERNAL_ASSERT(self_.device().type() == DeviceType::CPU);
TORCH_INTERNAL_ASSERT(other_.device().type() == DeviceType::CPU);
printf("cpu\n");
Tensor self = self_.contiguous();
Tensor other = other_.contiguous();
Tensor result = torch::empty(self.sizes(), self.options());
const float* self_ptr = self.data_ptr<float>();
const float* other_ptr = other.data_ptr<float>();
float* result_ptr = result.data_ptr<float>();
for (int64_t i = 0; i < result.numel(); i++) {
result_ptr[i] = self_ptr[i] + other_ptr[i];
}
return result;
}
Tensor myadd_cuda(const Tensor& self_, const Tensor& other_) {
TORCH_CHECK(self_.sizes() == other_.sizes());
TORCH_INTERNAL_ASSERT(self_.device().type() == DeviceType::CUDA);
TORCH_INTERNAL_ASSERT(other_.device().type() == DeviceType::CUDA);
printf("cuda\n");
Tensor self = self_.contiguous();
Tensor other = other_.contiguous();
Tensor result = torch::empty(self.sizes(), self.options());
const float* self_ptr = self.data_ptr<float>();
const float* other_ptr = other.data_ptr<float>();
float* result_ptr = result.data_ptr<float>();
launch_add2(result_ptr, self_ptr, other_ptr, result.numel());
return result;
}
} //namespace
TORCH_LIBRARY(myops, m) {
m.def("myadd(Tensor self, Tensor other) -> Tensor");
}
TORCH_LIBRARY_IMPL(myops, CPU, m) {
m.impl("myadd", myadd_cpu);
}
TORCH_LIBRARY_IMPL(myops, CUDA, m) {
m.impl("myadd", myadd_cuda);
}
/* 出自pytorch/torch/library.h
/// The `m` argument is bound to a torch::Library that is used to
/// register operators. There may only be one TORCH_LIBRARY()
/// for any given namespace.
#define TORCH_LIBRARY(ns, m) \
static void TORCH_LIBRARY_init_##ns(torch::Library&); \
static const torch::detail::TorchLibraryInit TORCH_LIBRARY_static_init_##ns( \
torch::Library::DEF, \
&TORCH_LIBRARY_init_##ns, \
#ns, \
c10::nullopt, \
__FILE__, \
__LINE__); \
void TORCH_LIBRARY_init_##ns(torch::Library& m)
/// If ``add_cpu_impl`` is an overloaded function, use a
/// ``static_cast`` to specify which overload you want
/// (by providing the full type).
///
// NB: if the dispatch key is not whitelisted, we simply omit the Library
// call entirely
#define TORCH_LIBRARY_IMPL(ns, k, m) _TORCH_LIBRARY_IMPL(ns, k, m, C10_UID)
/// \private
///
/// The above macro requires an extra unique identifier (uid) to prevent
/// variable name collisions. This can happen if TORCH_LIBRARY_IMPL is called
/// multiple times with the same namespace and dispatch key in the same
/// translation unit.
#define _TORCH_LIBRARY_IMPL(ns, k, m, uid) \
static void C10_CONCATENATE( \
TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid)(torch::Library&); \
static const torch::detail::TorchLibraryInit C10_CONCATENATE( \
TORCH_LIBRARY_IMPL_static_init_##ns##_##k##_, uid)( \
torch::Library::IMPL, \
(c10::impl::dispatch_key_allowlist_check(c10::DispatchKey::k) \
? &C10_CONCATENATE(TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid) \
: [](torch::Library&) -> void {}), \
#ns, \
c10::make_optional(c10::DispatchKey::k), \
__FILE__, \
__LINE__); \
void C10_CONCATENATE( \
TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid)(torch::Library & m)
*/
创建workspace/test/add2.cu
:
#include "add2.cuh"
__global__ void add2_kernel(float* c,
const float* a,
const float* b,
long n) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < n; i += gridDim.x * blockDim.x) {
c[i] = a[i] + b[i];
}
}
void launch_add2(float* c,
const float* a,
const float* b,
long n) {
dim3 grid((n + 1023) / 1024);
dim3 block(1024);
add2_kernel<<<grid, block>>>(c, a, b, n);
}
创建workspace/test/add2.cuh
:
void launch_add2(float* c, const float* a, const float* b, long n);
二、 cmake编译动态库
创建workspace/test/CMakeLists.txt
:
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
project(add2)
find_package(Torch REQUIRED)
#find_package(CUDA REQUIRED)
# Define our library target
add_library(add2 SHARED add2.cpp add2.cu)
# Enable C++17
target_compile_features(add2 PRIVATE cxx_std_17)
# Link against LibTorch
target_link_libraries(add2 "${TORCH_LIBRARIES}")
新建目录build,编译:
mkdir build
cd build
cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" ..
make
创建workspace/test/test.py
:
import time
import ctypes
import numpy as np
import torch
print(torch.__version__)
torch.ops.load_library("build/libadd2.so")
print(torch.ops.myops.myadd)
# c = a + b (shape: [n])
n = 1024 * 1024
a1 = torch.rand(n, device="cpu")
b1 = torch.rand(n, device="cpu")
a2 = torch.rand(n, device="cuda:0")
b2 = torch.rand(n, device="cuda:0")
def run_torch():
c = torch.ops.myops.myadd(a1, b1)
return c
def run_cuda():
c = torch.ops.myops.myadd(a2, b2)
return c
print("\nRunning cpu...")
print(a1)
print(b1)
start_time = time.time()
c_cpu = run_torch()
end_time = time.time()
print(c_cpu)
print((end_time-start_time)*1e6)
print("\nRunning cuda...")
print(a2)
print(b2)
start_time = time.time()
c_cuda = run_cuda()
end_time = time.time()
print(c_cuda)
print((end_time-start_time)*1e6)
结果如下
$ python3 test.py
2.1.0+cu121
myops.myadd
Running cpu...
tensor([0.5668, 0.9394, 0.5168, ..., 0.3057, 0.0873, 0.6022])
tensor([0.1668, 0.8012, 0.4616, ..., 0.7969, 0.7210, 0.8589])
cpu
tensor([0.7335, 1.7406, 0.9784, ..., 1.1026, 0.8083, 1.4611])
9006.977081298828
Running cuda...
tensor([0.3864, 0.3490, 0.5892, ..., 0.4237, 0.4182, 0.6051], device='cuda:0')
tensor([0.3069, 0.7079, 0.1878, ..., 0.7639, 0.6509, 0.5006], device='cuda:0')
cuda
tensor([0.6933, 1.0568, 0.7770, ..., 1.1876, 1.0690, 1.1058], device='cuda:0')
362.396240234375