CUDA:clock_nvrtc

最新推荐文章于 2024-04-23 18:57:21 发布

edgelee

最新推荐文章于 2024-04-23 18:57:21 发布

阅读量186

点赞数

分类专栏： CUDA 文章标签： vscode

本文链接：https://blog.csdn.net/lixiaoguang20/article/details/121009310

版权

CUDA 专栏收录该内容

17 篇文章 1 订阅

订阅专栏

Sample: clock_nvrtc
Minimum spec: SM 3.0

This example shows how to use the clock function using libNVRTC to measure the performance of block of threads of a kernel accurately.

Key concepts:
Performance Strategies
Runtime Compilation

clock_kernel.cu

/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

// This example shows how to use the clock function to measure the performance of
// block of threads of a kernel accurately.
//

// Blocks are executed in parallel and out of order. Since there's no synchronization
// mechanism between blocks, we measure the clock once for each block. The clock
// samples are written to device memory.


// This kernel computes a standard parallel reduction and evaluates the
// time it takes to do that for each block. The timing results are stored
// in device memory.

extern "C" __global__  void timedReduction(const float *input, float *output, clock_t *timer)
{
    // __shared__ float shared[2 * blockDim.x];
    extern __shared__ float shared[];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    if (tid == 0) timer[bid] = clock();

    // Copy input.
    shared[tid] = input[tid];
    shared[tid + blockDim.x] = input[tid + blockDim.x];

    // Perform reduction to find minimum.
    for (int d = blockDim.x; d > 0; d /= 2)
    {
        __syncthreads();

        if (tid < d)
        {
            float f0 = shared[tid];
            float f1 = shared[tid + d];

            if (f1 < f0)
            {
                shared[tid] = f1;
            }
        }
    }

    // Write result.
    if (tid == 0) output[bid] = shared[0];

    __syncthreads();

    if (tid == 0) timer[bid+gridDim.x] = clock();
}

clock.cpp

/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */


// This example shows how to use the clock function to measure the performance of
// block of threads of a kernel accurately.
//

// Blocks are executed in parallel and out of order. Since there's no synchronization
// mechanism between blocks, we measure the clock once for each block. The clock
// samples are written to device memory.

// System includes
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <nvrtc_helper.h>

// helper functions and utilities to work with CUDA
#include <helper_functions.h>

#define NUM_BLOCKS    64

#define NUM_THREADS   256


// It's interesting to change the number of blocks and the number of threads to
// understand how to keep the hardware busy.
//

// Here are some numbers I get on my G80:
//    blocks - clocks
//    1 - 3096
//    8 - 3232
//    16 - 3364
//    32 - 4615
//    64 - 9981

//
// With less than 16 blocks some of the multiprocessors of the device are idle. With
// more than 16 you are using all the multiprocessors, but there's only one block per
// multiprocessor and that doesn't allow you to hide the latency of the memory. With
// more than 32 the speed scales linearly.

// Start the main CUDA Sample here

int main(int argc, char **argv)
{
    printf("CUDA Clock sample\n");

    typedef long clock_t;

    clock_t timer[NUM_BLOCKS * 2];

    float input[NUM_THREADS * 2];

    for (int i = 0; i < NUM_THREADS * 2; i++)
    {
        input[i] = (float)i;
    }

    char *ptx, *kernel_file;
    size_t ptxSize;

    kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
    compileFileToPTX(kernel_file, argc, argv, &ptx, &ptxSize, 0);

    CUmodule module = loadPTX(ptx, argc, argv);
    CUfunction kernel_addr;

    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));

    dim3 cudaBlockSize(NUM_THREADS,1,1);
    dim3 cudaGridSize(NUM_BLOCKS, 1, 1);

    CUdeviceptr dinput, doutput, dtimer;
    checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
    checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
    checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
    checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));



    void *arr[] = { (void *)&dinput, (void *)&doutput, (void *)&dtimer };

    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                            cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, /* grid dim */
                                            cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
                                             sizeof(float) * 2 *NUM_THREADS,0, /* shared mem, stream */
                                            &arr[0], /* arguments */
                                            0));

    checkCudaErrors(cuCtxSynchronize());
    checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
    checkCudaErrors(cuMemFree(dinput));
    checkCudaErrors(cuMemFree(doutput));
    checkCudaErrors(cuMemFree(dtimer));

    long double avgElapsedClocks = 0;

    for (int i = 0; i < NUM_BLOCKS; i++)
    {
        avgElapsedClocks += (long double) (timer[i + NUM_BLOCKS] - timer[i]);
    }

    avgElapsedClocks = avgElapsedClocks/NUM_BLOCKS;
    printf("Average clocks/block = %Lf\n", avgElapsedClocks);

    return EXIT_SUCCESS;
}

edgelee

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
CUDA:clock_nvrtc

Sample: clock_nvrtcMinimum spec: SM 3.0This example shows how to use the clock function using libNVRTC to measure the performance of block of threads of a kernel accurately.Key concepts:Performance StrategiesRuntime Compilationclock_kernel.cu.
复制链接

扫一扫