CUDA提供的函数评估Occupancy

最新推荐文章于 2024-03-01 23:25:59 发布

黑帽子和猫

最新推荐文章于 2024-03-01 23:25:59 发布

阅读量1.4k

点赞数

分类专栏： CUDA

本文链接：https://blog.csdn.net/monroed/article/details/70185953

版权

CUDA 专栏收录该内容

11 篇文章 0 订阅

订阅专栏

#include <iostream>                                                                                                                                 
#include <cuda_runtime.h>
#include <cuda_occupancy.h>

// Device code
/*
 * Element-wise product kernel: each thread writes d[i] = a[i] * b[i] for its
 * own flat global index i (1D grid of 1D blocks).
 *
 * No bounds check is performed, so every launched thread must map to a valid
 * element of all three arrays. In this example the kernel is only passed to
 * the occupancy query and is never launched.
 */
__global__ void MyKernel(int *d, int *a, int *b)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const int lhs = a[gid];
    const int rhs = b[gid];
    d[gid] = lhs * rhs;
}
// Host code
/*
 * Reports the theoretical occupancy of MyKernel for a fixed block size.
 *
 * Steps:
 *   1. Query the current device and its properties.
 *   2. Ask the runtime how many blocks of `blockSize` threads (with no dynamic
 *      shared memory) can be resident on one multiprocessor.
 *   3. Convert that block count to warps and compare it with the maximum
 *      number of warps a multiprocessor can hold.
 *
 * Returns 0 on success, 1 if any CUDA runtime call fails or the device
 * reports an unusable warp capacity.
 */
int main()
{
    const int blockSize = 32;  // Threads per block used for the query
    int numBlocks = 0;         // Occupancy in terms of active blocks per SM
    int device = 0;
    cudaDeviceProp prop;

    // Chain the calls so that the first failure short-circuits the rest;
    // otherwise prop / numBlocks would be read while still uninitialized.
    cudaError_t err = cudaGetDevice(&device);
    if (err == cudaSuccess) {
        err = cudaGetDeviceProperties(&prop, device);
    }
    if (err == cudaSuccess) {
        err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &numBlocks,
            MyKernel,
            blockSize,
            0);
    }
    if (err != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Convert the per-SM block count into warps.
    const int activeWarps = numBlocks * blockSize / prop.warpSize;
    const int maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;
    if (maxWarps <= 0) {
        std::cerr << "Device reports no warp capacity per multiprocessor" << std::endl;
        return 1;
    }

    std::cout << "Occupancy: " << (double)activeWarps / maxWarps * 100 << "%" << std::endl;
    return 0;
}

其实occupancy可以用nvprof来获取；倒是可以从这个例子看下occupancy的计算过程。

NV提供的例子选择合适的blocks, threads

// Device code
/*
 * Squares each element of `array` in place.
 *
 * Expects a 1D grid of 1D blocks; each thread handles the element at its flat
 * global index. Threads whose index is at or beyond `arrayCount` do nothing,
 * so the grid may be larger than the array.
 */
__global__ void MyKernel(int *array, int arrayCount)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= arrayCount) {
        return;  // Tail threads past the end of the array
    }
    const int value = array[gid];
    array[gid] = value * value;
}

// Host code
/*
 * Launches MyKernel over `arrayCount` elements using a block size chosen by
 * the occupancy-based launch configurator, then waits for completion.
 *
 * Parameters:
 *   array      - pointer handed to the kernel; presumably device-accessible
 *                memory holding at least `arrayCount` ints (caller's duty).
 *   arrayCount - number of elements to process. Also passed as the block size
 *                limit so the configurator never picks a block larger than
 *                the array.
 *
 * Returns 0 (cudaSuccess) on success, otherwise the failing cudaError_t value
 * converted to int. A non-positive `arrayCount` is a no-op that returns 0.
 */
int launchMyKernel(int *array, int arrayCount)
{
    // Nothing to do; a zero-sized grid would be an invalid launch configuration.
    if (arrayCount <= 0) {
        return 0;
    }

    int blockSize = 0;    // The launch configurator returned block size
    int minGridSize = 0;  // The minimum grid size needed to achieve the
                          // maximum occupancy for a full device launch
                          // (reported by the API, not used for this launch)

    cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
        &minGridSize,
        &blockSize,
        (void*)MyKernel,
        0,
        arrayCount);
    if (err != cudaSuccess) {
        return static_cast<int>(err);
    }
    if (blockSize <= 0) {
        // Defensive: avoid dividing by zero if no usable block size was found.
        return static_cast<int>(cudaErrorInvalidConfiguration);
    }

    // The actual grid size needed, based on input size: round up without
    // forming arrayCount + blockSize - 1, which could overflow int.
    const int gridSize = arrayCount / blockSize + (arrayCount % blockSize != 0 ? 1 : 0);

    MyKernel<<<gridSize, blockSize>>>(array, arrayCount);

    // Launch-configuration errors are only visible through cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        return static_cast<int>(err);
    }

    // Errors raised while the kernel runs surface at the next synchronization.
    err = cudaDeviceSynchronize();

    // If interested, the occupancy can be calculated with
    // cudaOccupancyMaxActiveBlocksPerMultiprocessor
    return static_cast<int>(err);
}