CUDA samples系列 0.5 clock

最新推荐文章于 2024-10-11 06:00:00 发布

zcy0xy

最新推荐文章于 2024-10-11 06:00:00 发布

阅读量496

点赞数

分类专栏： Cuda Cuda自带samples例程系列文章标签： cuda gpu编程

本文链接：https://blog.csdn.net/zcy0xy/article/details/84502030

版权

Cuda 同时被 2 个专栏收录

7 篇文章 3 订阅

订阅专栏

Cuda自带samples例程系列

6 篇文章 8 订阅

订阅专栏

文章目录

这个代码主要是展示随着设定的block的增加，时间并不是越多block越快，反而超过一定的限制，会出现明显的传输拥堵，latency越来越难以隐藏。

动态共享内存

如果在写核函数时不知道共享内存的大小，可以使用动态共享内存，在调用函数时才设定其大小。方法为在核函数中定义：

extern __shared__ float shared[];

在调用核函数时：

 timedReduction<<<NUM_BLOCKS, NUM_THREADS,  共享内存大小>>>(dinput, doutput, dtimer);

核函数解析

先介绍下核函数：

// This kernel computes a standard parallel reduction and evaluates the
// time it takes to do that for each block. The timing results are stored
// in device memory.
__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
{
    extern __shared__ float shared[];
	//定义时不知道shared memory的大小，在调用时<<<>>>中第三个参数就是

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    if (tid == 0) timer[bid] = clock();//每个block中的第0号thread计时开始

    // Copy input.
	//复制到block的共享内存
	//每个block有256个thread，input里是0-255,256-511
    shared[tid] = input[tid];	//block内线程号 
    shared[tid + blockDim.x] = input[tid + blockDim.x];//block内线程号+256

    // Perform reduction to find minimum.
    for (int d = blockDim.x; d > 0; d /= 2)//d = 256,128,64,32,16,8,4,2,1,0
    {
        __syncthreads();//阻塞，等到同一个block内的thread都走到了这里，才会继续

        if (tid < d)
        {
            float f0 = shared[tid];
            float f1 = shared[tid + d];

            if (f1 < f0)
            {
                shared[tid] = f1;
            }
        }
    }

    // Write result.
	//取每个block第0个thread的结果作为这个block的最终结果
    if (tid == 0) output[bid] = shared[0];
	

    __syncthreads();//同一个block内的thread都走到了这里，才继续，好能准确计时

    if (tid == 0) timer[bid+gridDim.x] = clock();
}

比较简单，每个block内的每个thread运行固定次数的运算，理论上来说，不管block个数，每个thread的运算量都是一定的，时间也应该差不多。但是实际如何呢？

主函数解析

代码很简单，介绍下运行结果
// block:64 35237
// block:1 26824
// block:8 26823
可以看到block的增加本应该不会影响时间的，但是增加到64时，明显开始变慢了。源码的英文注释提到，在G80显卡上运行时，超过32个block，时间就与block数成线性增长了。我则没有测试更多的block数目了。

// Start the main CUDA Sample here
int main(int argc, char **argv)
{
    printf("CUDA Clock sample\n");

    // This will pick the best possible CUDA capable device
    int dev = findCudaDevice(argc, (const char **)argv);

    float *dinput = NULL;
    float *doutput = NULL;
    clock_t *dtimer = NULL;

    clock_t timer[NUM_BLOCKS * 2];
    float input[NUM_THREADS * 2];

    for (int i = 0; i < NUM_THREADS * 2; i++)
    {
        input[i] = (float)i;
    }

    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
    checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));

    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));

    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 *NUM_THREADS>>>(dinput, doutput, dtimer);

    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));

    checkCudaErrors(cudaFree(dinput));
    checkCudaErrors(cudaFree(doutput));
    checkCudaErrors(cudaFree(dtimer));

    long double avgElapsedClocks = 0;

    for (int i = 0; i < NUM_BLOCKS; i++)
    {
        avgElapsedClocks += (long double) (timer[i + NUM_BLOCKS] - timer[i]);
    }

    avgElapsedClocks = avgElapsedClocks/NUM_BLOCKS;
    printf("Average clocks/block = %Lf\n", avgElapsedClocks);

    return EXIT_SUCCESS;
}