TorchScript C++ 自定义运算符 cpu&cuda

之江小林

已于 2023-11-30 10:22:42 修改

阅读量531

点赞数 8

分类专栏： pytorch 文章标签： c++ 开发语言

于 2023-11-22 16:09:21 首次发布

本文链接：https://blog.csdn.net/weixin_43739110/article/details/134555413

版权

pytorch 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

参考

在 C++ 中注册调度运算符

使用自定义 C++ 运算符扩展 TorchScript

环境(非conda)：

NVIDIA Driver Version : 545.23.08
CUDA Version: 12.1
Python Version: 3.11
Pytorch Version: 2.1
Cmake version : 3.18.1
工作目录：workspace/test

目的

用TorchScript C++自定义运算符方法，添加两个向量相加的算子到pytorch。

一、 C++ 自定义运算符

创建workspace/test/add2.cpp:

#include <stdio.h>
#include "add2.cuh"

#include "torch/script.h"

namespace {
using torch::Tensor;
using torch::DeviceType;

// CPU implementation of myops::myadd: returns a new tensor holding the
// element-wise sum of `self_` and `other_`.
//
// Both inputs must have identical sizes and live on the CPU. Their data is
// read through float pointers after being made contiguous; the output is
// allocated with the sizes and options of `self_`.
Tensor myadd_cpu(const Tensor& self_, const Tensor& other_) {
  TORCH_CHECK(self_.sizes() == other_.sizes());
  TORCH_INTERNAL_ASSERT(self_.device().type() == DeviceType::CPU);
  TORCH_INTERNAL_ASSERT(other_.device().type() == DeviceType::CPU);

  // Trace output showing which backend handled the call.
  printf("cpu\n");

  // Contiguous views let the loop below treat the data as flat arrays.
  const Tensor lhs = self_.contiguous();
  const Tensor rhs = other_.contiguous();
  Tensor out = torch::empty(lhs.sizes(), lhs.options());

  const float* lhs_data = lhs.data_ptr<float>();
  const float* rhs_data = rhs.data_ptr<float>();
  float* out_data = out.data_ptr<float>();

  const int64_t count = out.numel();
  for (int64_t idx = 0; idx < count; ++idx) {
    out_data[idx] = lhs_data[idx] + rhs_data[idx];
  }
  return out;
}

// CUDA implementation of myops::myadd: returns a new tensor holding the
// element-wise sum of `self_` and `other_`, computed by launch_add2().
//
// Requirements (violations raise a c10::Error via TORCH_CHECK):
//   * both inputs have identical sizes;
//   * both inputs are CUDA tensors;
//   * both inputs live on the same CUDA device, because their raw device
//     pointers are handed to a single kernel launch.
// The data is read through float pointers after being made contiguous; the
// output is allocated with the sizes and options of `self_`.
Tensor myadd_cuda(const Tensor& self_, const Tensor& other_) {
  TORCH_CHECK(self_.sizes() == other_.sizes(),
              "myadd: expected tensors of the same size, got ",
              self_.sizes(), " and ", other_.sizes());
  // A mix of CPU and CUDA arguments is a caller error, not an internal
  // invariant violation, so it is reported with TORCH_CHECK.
  TORCH_CHECK(self_.device().type() == DeviceType::CUDA,
              "myadd: expected `self` to be a CUDA tensor, got ",
              self_.device());
  TORCH_CHECK(other_.device().type() == DeviceType::CUDA,
              "myadd: expected `other` to be a CUDA tensor, got ",
              other_.device());
  TORCH_CHECK(self_.device() == other_.device(),
              "myadd: expected both tensors on the same device, got ",
              self_.device(), " and ", other_.device());

  // Trace output showing which backend handled the call.
  printf("cuda\n");

  // Contiguous views let the kernel treat the data as flat arrays.
  Tensor self = self_.contiguous();
  Tensor other = other_.contiguous();
  Tensor result = torch::empty(self.sizes(), self.options());
  const float* self_ptr = self.data_ptr<float>();
  const float* other_ptr = other.data_ptr<float>();
  float* result_ptr = result.data_ptr<float>();

  // NOTE(review): launch_add2 takes no device or stream argument, so it
  // presumably runs on whatever device/stream is current at call time.
  // Consider a device guard and PyTorch's current stream - confirm.
  launch_add2(result_ptr, self_ptr, other_ptr, result.numel());
  return result;
}

} //namespace

// Declare the schema of the `myadd` operator in the `myops` namespace:
// two Tensor arguments (`self`, `other`) and a single Tensor result.
TORCH_LIBRARY(myops, m) {
  m.def("myadd(Tensor self, Tensor other) -> Tensor");
}
// Register myadd_cpu as the `myadd` implementation for the CPU dispatch key.
TORCH_LIBRARY_IMPL(myops, CPU, m) {
  m.impl("myadd", myadd_cpu);
}
// Register myadd_cuda as the `myadd` implementation for the CUDA dispatch key.
TORCH_LIBRARY_IMPL(myops, CUDA, m) {
  m.impl("myadd", myadd_cuda);
}

/* 出自pytorch/torch/library.h
/// The `m` argument is bound to a torch::Library that is used to
/// register operators.  There may only be one TORCH_LIBRARY()
/// for any given namespace.
#define TORCH_LIBRARY(ns, m)                                                   \
  static void TORCH_LIBRARY_init_##ns(torch::Library&);                        \
  static const torch::detail::TorchLibraryInit TORCH_LIBRARY_static_init_##ns( \
      torch::Library::DEF,                                                     \
      &TORCH_LIBRARY_init_##ns,                                                \
      #ns,                                                                     \
      c10::nullopt,                                                            \
      __FILE__,                                                                \
      __LINE__);                                                               \
  void TORCH_LIBRARY_init_##ns(torch::Library& m)

/// If ``add_cpu_impl`` is an overloaded function, use a
/// ``static_cast`` to specify which overload you want
/// (by providing the full type).
///
// NB: if the dispatch key is not whitelisted, we simply omit the Library
// call entirely
#define TORCH_LIBRARY_IMPL(ns, k, m) _TORCH_LIBRARY_IMPL(ns, k, m, C10_UID)

/// \private
///
/// The above macro requires an extra unique identifier (uid) to prevent
/// variable name collisions. This can happen if TORCH_LIBRARY_IMPL is called
/// multiple times with the same namespace and dispatch key in the same
/// translation unit.
#define _TORCH_LIBRARY_IMPL(ns, k, m, uid)                                \
  static void C10_CONCATENATE(                                            \
      TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid)(torch::Library&);       \
  static const torch::detail::TorchLibraryInit C10_CONCATENATE(           \
      TORCH_LIBRARY_IMPL_static_init_##ns##_##k##_, uid)(                 \
      torch::Library::IMPL,                                               \
      (c10::impl::dispatch_key_allowlist_check(c10::DispatchKey::k)       \
           ? &C10_CONCATENATE(TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid) \
           : [](torch::Library&) -> void {}),                             \
      #ns,                                                                \
      c10::make_optional(c10::DispatchKey::k),                            \
      __FILE__,                                                           \
      __LINE__);                                                          \
  void C10_CONCATENATE(                                                   \
      TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid)(torch::Library & m)

*/

创建workspace/test/add2.cu:

#include "add2.cuh"

 
// Element-wise addition kernel: c[i] = a[i] + b[i] for every i in [0, n).
//
// Each thread starts at its global thread index and advances by the total
// number of launched threads (grid-stride loop), so any grid size covers all
// n elements. The index and stride are computed in `long`, the same type as
// `n`, so the 32-bit products of the built-in dimensions cannot wrap and the
// loop counter cannot overflow when n exceeds the range of `int`.
__global__ void add2_kernel(float* c,
                            const float* a,
                            const float* b,
                            long n) {
    const long stride = static_cast<long>(gridDim.x) * blockDim.x;
    for (long i = static_cast<long>(blockIdx.x) * blockDim.x + threadIdx.x;
            i < n; i += stride) {
        c[i] = a[i] + b[i];
    }
}
 
// Launches add2_kernel to compute c[i] = a[i] + b[i] for i in [0, n).
//
// c, a, b: pointers passed straight through to the kernel.
// n:       number of elements; nothing is launched when n <= 0.
//
// The launch uses 1024 threads per block and enough blocks to give every
// element its own thread, capped at the largest x-dimension grid size
// (2^31 - 1). The kernel strides over the data, so a capped grid still
// processes all n elements.
void launch_add2(float* c,
                 const float* a,
                 const float* b,
                 long n) {
    // A zero-sized grid is an invalid launch configuration; skip the launch
    // instead of leaving a pending CUDA error for a later call to trip over.
    if (n <= 0) {
        return;
    }

    const long threads_per_block = 1024;
    const long max_blocks = 2147483647L;  // 2^31 - 1, max gridDim.x

    // Ceiling division written so that it cannot overflow for large n.
    long blocks = (n - 1) / threads_per_block + 1;
    if (blocks > max_blocks) {
        blocks = max_blocks;
    }

    dim3 grid(static_cast<unsigned int>(blocks));
    dim3 block(static_cast<unsigned int>(threads_per_block));
    add2_kernel<<<grid, block>>>(c, a, b, n);
}

创建workspace/test/add2.cuh:

// Host-side entry point implemented in add2.cu: launches the CUDA kernel that
// writes a[i] + b[i] into c[i] for the n elements of the three arrays.
void launch_add2(float* c, const float* a, const float* b, long n);

二、 cmake编译动态库

创建workspace/test/CMakeLists.txt:

# Builds the add2 shared library (add2.cpp + add2.cu) and links it against
# LibTorch so it can be loaded from Python.
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
project(add2)

# Locate the Torch CMake package (its location is supplied through
# CMAKE_PREFIX_PATH when configuring).
find_package(Torch REQUIRED)
# NOTE(review): add2.cu is built without an explicit CUDA language/package
# request here; presumably the Torch package config enables CUDA - confirm.
#find_package(CUDA REQUIRED)

# Define our library target
add_library(add2 SHARED add2.cpp add2.cu)
# Enable C++17
target_compile_features(add2 PRIVATE cxx_std_17)
# Link against LibTorch
target_link_libraries(add2 "${TORCH_LIBRARIES}")

新建目录build，编译:

# Configure and build out-of-source in ./build.
mkdir build
cd build
# Point CMake at the CMake package directory reported by the installed
# PyTorch (torch.utils.cmake_prefix_path) so find_package(Torch) succeeds.
cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" ..
make

创建workspace/test/test.py:

import time
import ctypes
import numpy as np
import torch
 
# Report the PyTorch version, load the compiled extension and show the
# registered operator.
print(torch.__version__)
torch.ops.load_library("build/libadd2.so")
print(torch.ops.myops.myadd)

# Inputs for c = a + b, each a 1-D tensor of n elements.
n = 1 << 20

# CPU operands.
a1, b1 = (torch.rand(n, device="cpu") for _ in range(2))

# CUDA operands on the first GPU.
a2, b2 = (torch.rand(n, device="cuda:0") for _ in range(2))

def run_torch():
    """Call the custom myadd operator on the CPU tensors a1 and b1."""
    return torch.ops.myops.myadd(a1, b1)

def run_cuda():
    """Call the custom myadd operator on the CUDA tensors a2 and b2."""
    return torch.ops.myops.myadd(a2, b2)

# Time one call of the operator on the CPU tensors and print the elapsed
# time in microseconds. perf_counter() is monotonic and high-resolution.
print("\nRunning cpu...")
print(a1)
print(b1)
start_time = time.perf_counter()
c_cpu = run_torch()
end_time = time.perf_counter()
print(c_cpu)
print((end_time-start_time)*1e6)

# Time one call of the operator on the CUDA tensors. CUDA work is queued
# asynchronously, so synchronize before starting the clock (to drain earlier
# work) and before stopping it (so the kernel has actually finished);
# otherwise only the launch overhead would be measured.
print("\nRunning cuda...")
print(a2)
print(b2)
torch.cuda.synchronize()
start_time = time.perf_counter()
c_cuda = run_cuda()
torch.cuda.synchronize()
end_time = time.perf_counter()
print(c_cuda)
print((end_time-start_time)*1e6)

结果如下（每次计时后打印的数字为耗时，单位为微秒；下面这份输出在计时时未做 CUDA 同步，而 CUDA 内核是异步启动的，因此 cuda 的耗时可能主要反映内核启动开销，不宜直接与 cpu 的耗时比较）

$ python3 test.py
2.1.0+cu121
myops.myadd

Running cpu...
tensor([0.5668, 0.9394, 0.5168,  ..., 0.3057, 0.0873, 0.6022])
tensor([0.1668, 0.8012, 0.4616,  ..., 0.7969, 0.7210, 0.8589])
cpu
tensor([0.7335, 1.7406, 0.9784,  ..., 1.1026, 0.8083, 1.4611])
9006.977081298828

Running cuda...
tensor([0.3864, 0.3490, 0.5892,  ..., 0.4237, 0.4182, 0.6051], device='cuda:0')
tensor([0.3069, 0.7079, 0.1878,  ..., 0.7639, 0.6509, 0.5006], device='cuda:0')
cuda
tensor([0.6933, 1.0568, 0.7770,  ..., 1.1876, 1.0690, 1.1058], device='cuda:0')
362.396240234375