背景
我们知道访问Tensor中的一个元素有两种方法:
- 基于TensorAccessor
// torch::PackedTensorAccessor64<float, 3> data
data[i][j][k] = 1.0;
- 基于数据指针data_ptr
// float* data
data[i * dim1 * dim2 + j * dim2 + k] = 1.0;
这两种方法哪个更快?
结论
先说结论
两种方式没有明显差距,基本一样。
测试代码
// test.cu
#include <algorithm>
#include <cstdint>
#include <limits>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/extension.h>
namespace {
namespace device {

// Declares `tid`, the flat global index of the calling thread, and returns
// from the enclosing kernel when `tid` is not below the element count `Q`.
// Only the .x components of blockIdx/blockDim/threadIdx are read, so the
// kernels using this macro expect a 1-D grid of 1-D blocks.
// The index is computed in 64 bits so that blockIdx.x * blockDim.x cannot
// wrap around for large launches.
#define CUDA_GET_THREAD_ID(tid, Q)                                   \
  const int64_t tid =                                                \
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;   \
  if (tid >= (Q)) return

// Writes 1.0f to every element of a 3-D float tensor through a
// PackedTensorAccessor64, one element per thread.
//
// Launch layout: 1-D grid, 1-D blocks, at least
// size(0) * size(1) * size(2) threads in total; surplus threads exit early.
__global__ void test_kernel_1(torch::PackedTensorAccessor64<float, 3> data) {
  const int64_t size1 = data.size(1);
  const int64_t size2 = data.size(2);
  const int64_t plane = size1 * size2;  // elements per index of dimension 0
  CUDA_GET_THREAD_ID(tid, data.size(0) * plane);
  // Split the flat index into (i, j, k); the last dimension varies fastest.
  const int64_t i = tid / plane;
  const int64_t j = (tid % plane) / size2;
  const int64_t k = tid % size2;
  data[i][j][k] = 1.0f;
}

// Writes 1.0f to every element of a dim1 x dim2 x dim3 float array through a
// raw pointer, one element per thread. `data` is indexed as a densely packed
// row-major array, i.e. element (i, j, k) lives at
// i * dim2 * dim3 + j * dim3 + k.
//
// Launch layout: 1-D grid, 1-D blocks, at least dim1 * dim2 * dim3 threads in
// total; surplus threads exit early.
__global__ void test_kernel_2(float* data, const int dim1, const int dim2, const int dim3) {
  // Strides are derived from the two trailing extents (dim2, dim3). Products
  // are widened to 64 bits so large tensors cannot overflow int.
  const int64_t plane = static_cast<int64_t>(dim2) * dim3;
  CUDA_GET_THREAD_ID(tid, dim1 * plane);
  // Same split/recombine as the accessor kernel, so both kernels perform
  // comparable index arithmetic.
  const int64_t i = tid / plane;
  const int64_t j = (tid % plane) / dim3;
  const int64_t k = tid % dim3;
  data[i * plane + j * dim3 + k] = 1.0f;
}

}  // namespace device
}  // namespace
// Fills a 3-D float32 CUDA tensor with ones using one of two kernels, so the
// two element-access styles can be benchmarked against each other.
//
// Parameters:
//   data      - 3-D float32 CUDA tensor, written in place. For test_mode 2 it
//               must also be contiguous, because the raw-pointer kernel
//               assumes a dense row-major layout.
//   test_mode - 1: PackedTensorAccessor64 kernel (device::test_kernel_1)
//               2: raw data_ptr kernel (device::test_kernel_2)
//
// Raises (via TORCH_CHECK) on a non-CUDA / non-float32 / non-3-D tensor, on an
// unknown test_mode, on a tensor too large for a 1-D grid, and on a failed
// kernel launch. An empty tensor is a no-op.
//
// The launch is asynchronous on the current CUDA stream; callers that time
// this function must synchronise the device themselves.
void test(torch::Tensor data, int test_mode) {
  TORCH_CHECK(data.is_cuda(), "test: data must be a CUDA tensor");
  TORCH_CHECK(data.scalar_type() == torch::kFloat32,
              "test: data must have dtype float32");
  TORCH_CHECK(data.dim() == 3,
              "test: data must be 3-dimensional, got ", data.dim(), " dimension(s)");
  TORCH_CHECK(test_mode == 1 || test_mode == 2,
              "test: test_mode must be 1 or 2, got ", test_mode);

  const int64_t Q = data.numel();
  if (Q == 0) {
    // Nothing to write; also avoids a zero-thread launch configuration.
    return;
  }

  // Make the tensor's device current so the device properties, the stream
  // and the launch all refer to the device that owns `data`.
  const c10::cuda::CUDAGuard device_guard(data.device());
  const auto* props = at::cuda::getCurrentDeviceProperties();

  // One thread per element; use the largest block the device supports,
  // or a single smaller block when the tensor has fewer elements than that.
  const int64_t threads = std::min<int64_t>(Q, props->maxThreadsPerBlock);
  const int64_t blocks = (Q + threads - 1) / threads;  // ceil(Q / threads)
  TORCH_CHECK(blocks <= props->maxGridSize[0],
              "test: tensor with ", Q, " elements needs ", blocks,
              " blocks, more than the device's 1-D grid limit");

  const dim3 grid(static_cast<unsigned int>(blocks));
  const dim3 block(static_cast<unsigned int>(threads));
  const auto stream = at::cuda::getCurrentCUDAStream();

  if (test_mode == 1) {
    device::test_kernel_1<<<grid, block, 0, stream>>>(
        data.packed_accessor64<float, 3>());
  } else {
    TORCH_CHECK(data.is_contiguous(),
                "test: test_mode 2 requires a contiguous tensor");
    // The raw-pointer kernel takes its extents as int.
    constexpr int64_t kIntMax = std::numeric_limits<int>::max();
    TORCH_CHECK(data.size(0) <= kIntMax && data.size(1) <= kIntMax &&
                    data.size(2) <= kIntMax,
                "test: every dimension must fit in a 32-bit int for test_mode 2");
    device::test_kernel_2<<<grid, block, 0, stream>>>(
        data.data_ptr<float>(),
        static_cast<int>(data.size(0)),
        static_cast<int>(data.size(1)),
        static_cast<int>(data.size(2)));
  }
  // Kernel launches do not report errors directly; surface bad launch
  // configurations here instead of at some later, unrelated CUDA call.
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}
// Python bindings for the extension module: exposes the host function
// `test` to Python under the same name.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("test", &test);
}
## test.py
import glob
import os
from subprocess import DEVNULL, call
from torch.utils.cpp_extension import _get_build_directory, load
import time
import torch
# Absolute path of the directory that contains this script; the extension
# sources are looked up relative to it.
_this_file = os.path.abspath(__file__)
PATH = os.path.dirname(_this_file)
def cuda_toolkit_available():
    """Report whether an ``nvcc`` executable can be launched on this machine.

    Returns:
        True when running ``nvcc`` does not raise ``FileNotFoundError``
        (its exit status and output are ignored), False otherwise.
    """
    try:
        # Output is discarded; only the ability to start the process matters.
        call(["nvcc"], stdout=DEVNULL, stderr=DEVNULL)
    except FileNotFoundError:
        return False
    else:
        return True
def load_extention(name: str):
    """Build and import the CUDA extension via ``torch.utils.cpp_extension.load``.

    Args:
        name: Name passed to ``load`` for the extension module.

    Returns:
        Whatever ``load`` returns for the built extension.
    """
    # Every .cu file in the csrc directory next to this script is compiled.
    cuda_sources = glob.glob(os.path.join(PATH, "csrc/*.cu"))
    host_flags = ["-O3"]
    device_flags = ["-O3"]
    extension = load(
        name=name,
        sources=cuda_sources,
        extra_cflags=host_flags,
        extra_cuda_cflags=device_flags,
        verbose=True,
    )
    return extension
_C = None
name = "test"
# A non-empty build directory is taken to mean the extension was built
# before, in which case loading is attempted without probing for nvcc.
_build_dir = _get_build_directory(name, verbose=False)
_already_built = os.listdir(_build_dir) != []
# `or` short-circuits, so nvcc is only probed for a first-time build.
if _already_built or cuda_toolkit_available():
    _C = load_extention(name)
else:
    print("No CUDA toolkit found.")
if _C is not None:
    test = _C.test
class Timer:
    """Context manager that prints the wall-clock time of its ``with`` body.

    CUDA kernel launches return before the kernel has run, so timing them
    without a device synchronisation only measures launch overhead. When
    CUDA is available the device is therefore synchronised both before the
    clock starts and before it stops, so that the printed time covers the
    GPU work queued inside the ``with`` body.

    The elapsed time is printed as ``Time: <milliseconds>ms``.
    """

    def __init__(self):
        self._time = 0

    @staticmethod
    def _sync():
        """Wait for all queued CUDA work to finish (no-op without CUDA)."""
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    def __enter__(self):
        # Drain work queued earlier so it is not charged to this measurement.
        self._sync()
        # perf_counter is monotonic and high-resolution, unlike time.time().
        self._time = time.perf_counter()
        return self

    def __exit__(self, *args):
        # Wait for the kernels launched inside the block before reading the clock.
        self._sync()
        print(f"Time: {(time.perf_counter() - self._time) * 1000:.2f}ms")
N = 512 + 256  # edge length of the cubic test tensor
K = 200  # number of kernel launches per timed run
data = torch.zeros((N, N, N), device='cuda:0')
# Warm-up launch: the first call is only there to wake CUDA up, so its
# timing has no reference value.
with Timer():
    test(data, 1)
# Timed runs, in the original order: mode 2 (raw data pointer) first,
# then mode 1 (tensor accessor).
for mode in (2, 1):
    with Timer():
        for _ in range(K):
            test(data, mode)
结果
Time: 19.14ms
Time: 1.55ms
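Time: 1.55ms
注意:以上数字是在停止计时前未调用 torch.cuda.synchronize() 的情况下测得的。由于 CUDA 核函数的启动是异步的,这些数字主要反映的是核函数的启动开销,而不是实际执行时间;要比较两种访问方式的真实速度,需要在停止计时前先同步设备。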