使用TensorAccessor和data_ptr两种方式访问数据速度比较

背景

我们知道访问Tensor中的一个元素有两种方法:

  1. 基于TensorAccessor
// torch::PackedTensorAccessor64<float, 3> data
data[i][j][k] = 1.0;
  2. 基于数据指针data_ptr
// float* data
data[i * dim1 * dim2 + j * dim2 + k] = 1.0;

这两种方法哪个更快?

结论

先说结论
两种方式没有明显差距,基本一样。

测试代码

// test.cu
#include <algorithm>
#include <cstdint>
#include <limits>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAStream.h>
#include <torch/extension.h>

namespace {
namespace device {

// Declares `tid` as the calling thread's flat global index for a 1-D launch
// and returns from the kernel when `tid` is not below Q.
// The index is computed in 64 bits so that blockIdx.x * blockDim.x cannot wrap
// when the launch covers more than 2^31 - 1 elements.
#define CUDA_GET_THREAD_ID(tid, Q)                                                   \
    const int64_t tid = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
    if (tid >= (Q)) return

// Writes 1.0f to every element of a 3-D float tensor through a
// PackedTensorAccessor64.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element; the total
// thread count must be at least size(0) * size(1) * size(2). Surplus threads
// exit immediately. Uses no shared memory.
__global__ void test_kernel_1(torch::PackedTensorAccessor64<float, 3> data) {
    const int64_t size1 = data.size(1);
    const int64_t size2 = data.size(2);
    const int64_t plane = size1 * size2;  // elements per index along dim 0

    CUDA_GET_THREAD_ID(tid, data.size(0) * plane);

    // Split the flat index into (i, j, k) coordinates.
    const int64_t i = tid / plane;
    const int64_t j = (tid % plane) / size2;
    const int64_t k = tid % size2;

    data[i][j][k] = 1.0f;
}

// Writes 1.0f to every element of a 3-D float tensor through a raw pointer.
//
// `data` is indexed as a dense row-major array of shape (dim1, dim2, dim3),
// i.e. element (i, j, k) lives at i * dim2 * dim3 + j * dim3 + k.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element; the total
// thread count must be at least dim1 * dim2 * dim3. Surplus threads exit
// immediately. Uses no shared memory.
__global__ void test_kernel_2(float* data, const int dim1, const int dim2, const int dim3) {
    // Strides of a (dim1, dim2, dim3) array are (dim2 * dim3, dim3, 1).
    const int64_t plane = static_cast<int64_t>(dim2) * dim3;

    CUDA_GET_THREAD_ID(tid, static_cast<int64_t>(dim1) * plane);

    // The coordinates are deliberately decomposed and recomposed so that this
    // kernel performs the same index arithmetic as the accessor version.
    const int64_t i = tid / plane;
    const int64_t j = (tid % plane) / dim3;
    const int64_t k = tid % dim3;

    data[i * plane + j * dim3 + k] = 1.0f;
}

}  // namespace device
}  // namespace


// Fills a 3-D float32 CUDA tensor with 1.0 using one of two access methods.
//
// Parameters:
//   data      - 3-D float32 CUDA tensor, modified in place. For test_mode 2 it
//               must also be contiguous, because the raw-pointer kernel
//               indexes it as a dense row-major array.
//   test_mode - 1: launch the PackedTensorAccessor64 kernel;
//               2: launch the raw data_ptr kernel.
//
// Raises c10::Error (via TORCH_CHECK) when an argument violates the above or
// when the kernel launch fails. The launch is asynchronous: this function
// returns without waiting for the kernel to finish.
//
// NOTE(review): the block size and the stream are taken from the *current*
// CUDA device; this assumes `data` lives on that device - confirm if tensors
// on other devices must be supported.
void test(torch::Tensor data, int test_mode) {
    TORCH_CHECK(data.is_cuda(), "test: data must be a CUDA tensor");
    TORCH_CHECK(data.dim() == 3, "test: data must be 3-dimensional, got ", data.dim(), " dimension(s)");
    TORCH_CHECK(data.scalar_type() == torch::kFloat32, "test: data must be a float32 tensor");
    TORCH_CHECK(test_mode == 1 || test_mode == 2, "test: test_mode must be 1 or 2, got ", test_mode);

    const int64_t Q = data.size(0) * data.size(1) * data.size(2);
    if (Q == 0) {
        // Nothing to write; a launch with zero threads would be an invalid configuration.
        return;
    }

#define CUDA_MAX_THREADS at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock

    const int cuda_n_threads =
        static_cast<int>(std::min<int64_t>(Q, CUDA_MAX_THREADS));

#define CUDA_N_BLOCKS_NEEDED(Q, CUDA_N_THREADS) (((Q) - 1) / (CUDA_N_THREADS) + 1)

    const int64_t blocks_needed = CUDA_N_BLOCKS_NEEDED(Q, cuda_n_threads);
    // gridDim.x is limited to 2^31 - 1 blocks.
    TORCH_CHECK(blocks_needed <= std::numeric_limits<int>::max(),
                "test: tensor with ", Q, " elements needs too many blocks for a 1-D grid");
    const int blocks = static_cast<int>(blocks_needed);

    // Launch on PyTorch's current stream so the kernel is ordered with other
    // PyTorch work issued on that stream.
    const auto stream = at::cuda::getCurrentCUDAStream();

    if (test_mode == 1) {
        device::test_kernel_1<<<blocks, cuda_n_threads, 0, stream>>>(
            data.packed_accessor64<float, 3>()
        );
    } else {
        TORCH_CHECK(data.is_contiguous(), "test: data must be contiguous for test_mode 2");
        for (int d = 0; d < 3; ++d) {
            // The raw-pointer kernel takes its dimensions as int.
            TORCH_CHECK(data.size(d) <= std::numeric_limits<int>::max(),
                        "test: size of dimension ", d, " does not fit in int");
        }
        device::test_kernel_2<<<blocks, cuda_n_threads, 0, stream>>>(
            data.data_ptr<float>(),
            static_cast<int>(data.size(0)),
            static_cast<int>(data.size(1)),
            static_cast<int>(data.size(2))
        );
    }
    // Surface launch-configuration errors immediately instead of at some
    // unrelated later CUDA call.
    C10_CUDA_KERNEL_LAUNCH_CHECK();
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  // Expose the C++ entry point to Python under its own name, "test".
  m.def("test", &test);
}

## test.py

import glob
import os
from subprocess import DEVNULL, call

from torch.utils.cpp_extension import _get_build_directory, load

import time

import torch

# Absolute path of the directory that contains this script.
PATH = os.path.dirname(os.path.abspath(__file__))

def cuda_toolkit_available():
    """Report whether an ``nvcc`` executable can be launched on this machine.

    Returns:
        True if running ``nvcc`` started a process (its exit status is
        ignored), False if the executable could not be found.
    """
    try:
        # Output is discarded; only the ability to start the process matters.
        call(["nvcc"], stdout=DEVNULL, stderr=DEVNULL)
    except FileNotFoundError:
        return False
    else:
        return True


def load_extention(name: str):
    """Build (if needed) and load the extension from the ``csrc/*.cu`` sources.

    Args:
        name: Name passed to ``torch.utils.cpp_extension.load``.

    Returns:
        Whatever ``load`` returns for the compiled extension.
    """
    # Every .cu file in the csrc/ directory next to this script is compiled.
    source_pattern = os.path.join(PATH, "csrc/*.cu")
    cuda_sources = glob.glob(source_pattern)

    build_options = {
        "name": name,
        "sources": cuda_sources,
        "extra_cflags": ["-O3"],
        "extra_cuda_cflags": ["-O3"],
        "verbose": True,
    }
    return load(**build_options)


_C = None
name = "test"
# Load the extension when either
#   * the build directory already has content (an earlier build is assumed
#     to exist), or
#   * this would be the first build and nvcc can be launched.
# The nvcc probe is only evaluated when the build directory is empty.
if os.listdir(_get_build_directory(name, verbose=False)) or cuda_toolkit_available():
    _C = load_extention(name)
else:
    print("No CUDA toolkit found.")

# Re-export the extension's entry point only when loading succeeded.
if _C is not None:
    test = _C.test

class Timer:
    """Context manager that prints the elapsed time of its ``with`` block.

    CUDA kernel launches return before the kernel has run, so timing GPU work
    with a host clock alone only measures launch overhead. When
    ``sync_cuda`` is true and CUDA is available, the timer waits for all
    queued GPU work both before starting the clock and before stopping it.

    Args:
        sync_cuda: Whether to call ``torch.cuda.synchronize()`` at the start
            and end of the timed block (default True).
    """

    def __init__(self, sync_cuda=True):
        self._time = 0
        self._sync_cuda = sync_cuda

    def _wait_for_gpu(self):
        # Skipped on machines without CUDA so the timer also works for CPU code.
        if self._sync_cuda and torch.cuda.is_available():
            torch.cuda.synchronize()

    def __enter__(self):
        # Drain earlier GPU work so it is not attributed to this block.
        self._wait_for_gpu()
        # perf_counter is monotonic and high-resolution, unlike time.time().
        self._time = time.perf_counter()
        return self

    def __exit__(self, *args):
        # Wait for the kernels launched inside the block to actually finish.
        self._wait_for_gpu()
        print(f"Time: {(time.perf_counter() - self._time) * 1000:.2f}ms")


N = 512 + 256  # edge length of the cubic test tensor

K = 200  # number of launches per timed run

data = torch.zeros((N, N, N), device='cuda:0')

# Kernel launches are asynchronous: without a synchronize inside the timed
# region the clock would stop as soon as the launches are queued, long before
# the GPU has finished writing the tensor.

with Timer():
    test(data, 1)  # first call only warms up CUDA; its timing is not meaningful
    torch.cuda.synchronize()

with Timer():
    for _ in range(K):
        test(data, 2)
    torch.cuda.synchronize()

with Timer():
    for _ in range(K):
        test(data, 1)
    torch.cuda.synchronize()

结果

Time: 19.14ms
Time: 1.55ms
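Time: 1.55ms

注意:CUDA 核函数的启动是异步的。上述耗时是在计时区间结束前没有调用 torch.cuda.synchronize() 的情况下测得的,因此后两个数值主要反映的是 200 次核函数启动的开销,而不是核函数的实际执行时间;要比较真实的执行速度,需要在停止计时之前先同步 GPU。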
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值