使用TensorAccessor和data_ptr两种方式访问数据速度比较

最新推荐文章于 2024-07-19 15:44:38 发布

wzy-666

最新推荐文章于 2024-07-19 15:44:38 发布

阅读量513

点赞数

文章标签：深度学习 python 人工智能

本文链接：https://blog.csdn.net/C_C666/article/details/127824130

版权

背景

我们知道访问Tensor中的一个元素有两种方法：

基于TensorAccessor

// torch::PackedTensorAccessor64<float, 3> data
data[i][j][k] = 1.0;

基于数据指针data_ptr

// float* data
data[i * dim1 * dim2 + j * dim2 + k] = 1.0;

这两种方法哪个更快？

结论

先说结论
两种方式没有明显差距，基本一样。

测试代码

// test.cu
#include <algorithm>
#include <cstdint>
#include <limits>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/extension.h>

namespace {
namespace device {

// Declares `tid` as the calling thread's flat global index for a 1-D launch
// and returns from the enclosing kernel when `tid` is not below `Q`, so the
// surplus threads of the last block do nothing. The index is formed in 64 bits
// so that element counts above 2^31 do not wrap.
#define CUDA_GET_THREAD_ID(tid, Q) \
    const int64_t tid = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
    if (tid >= (Q)) return

// Writes 1.0f to every element of a 3-D float tensor, addressing each element
// through the accessor's operator[] chain.
//
// Launch layout: 1-D grid of 1-D blocks with at least
// size(0) * size(1) * size(2) threads in total; one thread per element.
// No shared memory is used.
__global__ void test_kernel_1(torch::PackedTensorAccessor64<float, 3> data) {
    // Number of elements in one [j][k] slice.
    const int64_t plane = data.size(1) * data.size(2);
    CUDA_GET_THREAD_ID(tid, data.size(0) * plane);

    // Split the flat index into (i, j, k) with k varying fastest.
    const int64_t i = tid / plane;
    const int64_t j = (tid % plane) / data.size(2);
    const int64_t k = tid % data.size(2);

    data[i][j][k] = 1.0f;
}

// Writes 1.0f to every element of a dim1 x dim2 x dim3 float array, addressing
// each element through a hand-computed row-major offset from `data`.
//
// Preconditions: `data` points to dim1 * dim2 * dim3 floats laid out
// contiguously in row-major order (last index fastest).
// Launch layout: 1-D grid of 1-D blocks with at least dim1 * dim2 * dim3
// threads in total; one thread per element. No shared memory is used.
__global__ void test_kernel_2(float* data, const int dim1, const int dim2, const int dim3) {
    // Row-major stride of the first index is dim2 * dim3 (not dim1 * dim2);
    // widened to 64 bits so the products below cannot overflow int.
    const int64_t plane = static_cast<int64_t>(dim2) * dim3;
    CUDA_GET_THREAD_ID(tid, dim1 * plane);

    const int64_t i = tid / plane;
    const int64_t j = (tid % plane) / dim3;
    const int64_t k = tid % dim3;

    // The offset is deliberately rebuilt from (i, j, k) rather than reusing
    // `tid` -- presumably so this kernel performs index arithmetic comparable
    // to the accessor-based kernel it is benchmarked against.
    data[i * plane + j * dim3 + k] = 1.0f;
}

}  // namespace device
}  // namespace


// Fills every element of a 3-D float32 CUDA tensor with 1 using one of two
// element-addressing strategies, so that their speed can be compared.
//
// data       3-D float32 CUDA tensor, written in place. Must be contiguous
//            when test_mode == 2 because that path indexes the raw pointer.
// test_mode  1 -> address elements through PackedTensorAccessor64,
//            2 -> address elements through data_ptr<float>() and a computed
//                 flat offset.
//
// Raises a c10::Error (via TORCH_CHECK) on invalid arguments or when the
// launch itself fails. The kernel is enqueued on the current CUDA stream and
// this function does not wait for it; callers that time it must synchronize.
void test(torch::Tensor data, int test_mode) {
    TORCH_CHECK(data.is_cuda(), "test: data must be a CUDA tensor");
    TORCH_CHECK(data.dim() == 3, "test: data must be 3-D, got ", data.dim(), "-D");
    TORCH_CHECK(data.scalar_type() == torch::kFloat32, "test: data must be float32");
    TORCH_CHECK(test_mode == 1 || test_mode == 2,
                "test: test_mode must be 1 or 2, got ", test_mode);

    const int64_t Q = data.size(0) * data.size(1) * data.size(2);
    if (Q == 0) {
        // Nothing to write; also avoids a zero-thread launch configuration.
        return;
    }

    // Query properties of, and launch on, the device that owns `data`.
    const at::cuda::CUDAGuard device_guard(data.device());
    const auto* props = at::cuda::getCurrentDeviceProperties();

    // One thread per element: as many threads per block as the device allows
    // (fewer for tiny tensors), and enough blocks to cover Q (ceil-division).
    const int threads = static_cast<int>(
        std::min<int64_t>(Q, props->maxThreadsPerBlock));
    const int64_t blocks_needed = (Q - 1) / threads + 1;
    TORCH_CHECK(blocks_needed <= props->maxGridSize[0],
                "test: tensor has too many elements for a 1-D grid (", Q, ")");
    const int blocks = static_cast<int>(blocks_needed);

    const auto stream = at::cuda::getCurrentCUDAStream();

    if (test_mode == 1) {
        device::test_kernel_1<<<blocks, threads, 0, stream>>>(
            data.packed_accessor64<float, 3>()
        );
    } else {
        TORCH_CHECK(data.is_contiguous(),
                    "test: test_mode 2 requires a contiguous tensor");
        const int64_t int_max = std::numeric_limits<int>::max();
        TORCH_CHECK(data.size(0) <= int_max && data.size(1) <= int_max &&
                        data.size(2) <= int_max,
                    "test: test_mode 2 requires every dimension to fit in int");
        device::test_kernel_2<<<blocks, threads, 0, stream>>>(
            data.data_ptr<float>(),
            static_cast<int>(data.size(0)),
            static_cast<int>(data.size(1)),
            static_cast<int>(data.size(2))
        );
    }

    // Surface bad launch configurations immediately instead of at some later,
    // unrelated CUDA call.
    AT_CUDA_CHECK(cudaGetLastError());
}

// Python bindings: exposes the host function `test` under the same name.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("test", &test);
}

## test.py

import glob
import os
from subprocess import DEVNULL, call

from torch.utils.cpp_extension import _get_build_directory, load

import time

import torch

PATH = os.path.dirname(os.path.abspath(__file__))

def cuda_toolkit_available():
    """Return True when an ``nvcc`` executable can be launched, else False.

    The compiler is started with no arguments and its output is discarded;
    only whether the executable could be found matters, not its exit status.
    """
    probe_command = ["nvcc"]
    try:
        call(probe_command, stdout=DEVNULL, stderr=DEVNULL)
    except FileNotFoundError:
        return False
    else:
        return True


def load_extention(name: str):
    """Build and load the extension ``name`` via ``torch.utils.cpp_extension.load``.

    Every ``*.cu`` file in the ``csrc`` directory next to this script is used
    as a source, and both the host and the CUDA compiler are given ``-O3``.

    Args:
        name: Name under which the extension is built and loaded.

    Returns:
        Whatever ``load`` returns for the built extension.
    """
    source_pattern = os.path.join(PATH, "csrc/*.cu")
    cuda_sources = glob.glob(source_pattern)
    build_options = {
        "extra_cflags": ["-O3"],
        "extra_cuda_cflags": ["-O3"],
        "verbose": True,
    }
    return load(name=name, sources=cuda_sources, **build_options)


_C = None
name = "test"
# A non-empty build directory is taken to mean the extension was built before,
# so it is loaded without probing for nvcc. Otherwise the extension is only
# built when the CUDA toolkit can be found.
if os.listdir(_get_build_directory(name, verbose=False)) or cuda_toolkit_available():
    _C = load_extention(name)
else:
    print("No CUDA toolkit found.")

# Re-export the compiled entry point under a short module-level name.
if _C is not None:
    test = _C.test

class Timer:
    """Context manager that prints the elapsed time of its body in milliseconds.

    Args:
        sync: When true (the default) and CUDA has already been initialised,
            ``torch.cuda.synchronize()`` is called on entry and on exit so that
            GPU work enqueued inside the body is included in the measurement
            and earlier pending work is not.
    """

    def __init__(self, sync=True):
        self._time = 0
        self._sync = sync

    def _synchronize(self):
        # Skip when CUDA is absent or untouched: there is nothing to wait for,
        # and this avoids initialising CUDA as a side effect of timing.
        if self._sync and torch.cuda.is_available() and torch.cuda.is_initialized():
            torch.cuda.synchronize()

    def __enter__(self):
        self._synchronize()
        # perf_counter is monotonic and high-resolution, unlike time.time().
        self._time = time.perf_counter()
        return self

    def __exit__(self, *args):
        self._synchronize()
        print(f"Time: {(time.perf_counter() - self._time) * 1000:.2f}ms")


N = 512 + 256  # edge length of the cubic test tensor (N x N x N elements)

K = 200  # number of timed launches per access mode

data = torch.zeros((N, N, N), device='cuda:0')

# CUDA kernel launches return to Python before the GPU has finished, so every
# timed region ends with an explicit synchronize; otherwise the timer would
# stop while launched kernels are still running.

with Timer():
    # Warm-up: the first launches pay one-off start-up costs, so this timing
    # is not meaningful. Both modes are run once so that neither timed loop
    # below includes its own first launch.
    test(data, 1)
    test(data, 2)
    torch.cuda.synchronize()

with Timer():
    # Raw-pointer (data_ptr) addressing.
    for _ in range(K):
        test(data, 2)
    torch.cuda.synchronize()

with Timer():
    # PackedTensorAccessor addressing.
    for _ in range(K):
        test(data, 1)
    torch.cuda.synchronize()