hist_gpu_shmem_atomics.cu：经我测试，这个 GPU 版本的速度还不错。原子操作 atomicAdd 究竟是在哪里（共享内存还是全局内存）执行的？

最新推荐文章于 2022-07-09 22:41:40 发布

a8491087

最新推荐文章于 2022-07-09 22:41:40 发布

阅读量213

点赞数

本文链接：https://blog.csdn.net/a8491087/article/details/84629190

版权

本文通过 hist_gpu_shmem_atomics.cu 代码示例，探讨 CUDA 中原子加法 atomicAdd 的应用。在 CUDA 核函数中，每个线程块先用 atomicAdd 更新自己共享内存中的 256 个累加器，经 __syncthreads() 同步后，再用 atomicAdd 把各线程块的结果累加到全局内存的直方图中。文章解释了代码中的关键部分，包括共享内存的初始化、线程索引与步长的计算、原子加法以及性能计时。

摘要由CSDN通过智能技术生成

/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/

#include "../common/book.h"

#define SIZE (100*1024*1024)

// Builds a 256-bin histogram of the byte values in buffer[0 .. size-1] and
// adds it into histo.
//
// Each block first counts into its own __shared__ 256-entry array using
// atomicAdd, then merges that array into the global histo array, again
// with atomicAdd.  Counts are ADDED to whatever histo already holds, so
// the caller is expected to zero histo before the launch.
//
// Parameters:
//   buffer - device pointer to the input bytes
//   size   - number of bytes to process; a value <= 0 adds nothing
//   histo  - device pointer to 256 unsigned int counters
//
// Launch layout: 1D grid of 1D blocks.  Any blockDim.x works; the bin
// clearing and flushing loops below do not assume 256 threads per block.
// Static shared memory: 256 * sizeof(unsigned int) bytes per block.
__global__ void histo_kernel( unsigned char *buffer,
                              long size,
                              unsigned int *histo ) {
    const int kBins = 256;

    // Clear the per-block accumulation buffer.  With 256 threads this is
    // exactly one write per thread; the loop keeps it correct for other
    // block sizes as well.
    __shared__ unsigned int temp[kBins];
    for (int bin = threadIdx.x; bin < kBins; bin += blockDim.x) {
        temp[bin] = 0;
    }
    // Every bin must be zero before any thread starts counting.
    __syncthreads();

    // Starting index of this thread and the distance to its next element
    // (the total number of threads in the grid).  Both are computed in
    // 64 bits so that neither the product nor the running index can
    // overflow a 32-bit int when size is large.
    long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    const long long stride = (long long)blockDim.x * gridDim.x;
    while (i < size) {
        // buffer[i] is a byte, so it indexes one of the 256 shared bins.
        // The add is atomic because many threads of the block may hit
        // the same bin at the same time.
        atomicAdd( &temp[buffer[i]], 1u );
        i += stride;
    }

    // Wait until all threads of the block have finished counting, then
    // merge the block's shared-memory bins into the global histogram.
    // Other blocks update the same global counters, hence atomicAdd.
    __syncthreads();
    for (int bin = threadIdx.x; bin < kBins; bin += blockDim.x) {
        atomicAdd( &(histo[bin]), temp[bin] );
    }
}

// Host driver: histograms SIZE bytes on the GPU with histo_kernel, prints
// the elapsed time and the histogram sum, then verifies the GPU counts by
// subtracting a CPU count of the same bytes.
//
// Returns 0 when every bin verifies, 1 when at least one bin mismatches.
// HANDLE_ERROR and big_random_block come from ../common/book.h.
// NOTE(review): HANDLE_ERROR presumably reports the error and exits on a
// non-success status -- confirm against book.h.
int main( void ) {
    // Host input data; released with free() at the end of main.
    unsigned char *buffer =
        (unsigned char*)big_random_block( SIZE );

    // capture the start time
    // starting the timer here so that we include the cost of
    // all of the operations on the GPU (allocation and copies as well
    // as the kernel). The original author reports that timing the
    // kernel alone would drop the figure from 74 ms to 15 ms.
    cudaEvent_t     start, stop;
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );
    HANDLE_ERROR( cudaEventRecord( start, 0 ) );

    // allocate memory on the GPU for the input data and copy it over
    unsigned char *dev_buffer;
    unsigned int *dev_histo;
    HANDLE_ERROR( cudaMalloc( (void**)&dev_buffer, SIZE ) );
    HANDLE_ERROR( cudaMemcpy( dev_buffer, buffer, SIZE,
                              cudaMemcpyHostToDevice ) );

    // the kernel adds into the histogram, so it must start at zero
    const size_t histoBytes = 256 * sizeof( unsigned int );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_histo, histoBytes ) );
    HANDLE_ERROR( cudaMemset( dev_histo, 0, histoBytes ) );

    // kernel launch - 2x the number of mps gave best timing
    cudaDeviceProp prop;
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, 0 ) );
    int blocks = prop.multiProcessorCount;
    histo_kernel<<<blocks*2,256>>>( dev_buffer,
                                    SIZE, dev_histo );
    // a kernel launch returns no status itself; an invalid launch
    // configuration is only visible through cudaGetLastError()
    HANDLE_ERROR( cudaGetLastError() );

    // blocking copy: also waits for the kernel to finish
    unsigned int    histo[256];
    HANDLE_ERROR( cudaMemcpy( histo, dev_histo,
                              histoBytes,
                              cudaMemcpyDeviceToHost ) );

    // get stop time, and display the timing results
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    float   elapsedTime;
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );
    printf( "Time to generate: %3.1f ms\n", elapsedTime );

    // the bins should add up to the number of input bytes
    long histoCount = 0;
    for (int i=0; i<256; i++) {
        histoCount += histo[i];
    }
    printf( "Histogram Sum: %ld\n", histoCount );

    // verify that we have the same counts via CPU: decrement the bin of
    // every input byte; a correct histogram ends up all zero
    for (int i=0; i<SIZE; i++)
        histo[buffer[i]]--;
    int failures = 0;
    for (int i=0; i<256; i++) {
        if (histo[i] != 0) {
            printf( "Failure at %d!\n", i );
            failures++;
        }
    }

    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );
    HANDLE_ERROR( cudaFree( dev_histo ) );
    HANDLE_ERROR( cudaFree( dev_buffer ) );
    free( buffer );
    // report a verification mismatch through the exit status
    return failures == 0 ? 0 : 1;
}

a8491087

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
hist_gpu_shmem_atomics.cu这个gpu的速度我测试还可以。能理解原子操作add到底是在哪里操作的吗？

/* * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. * * NVIDIA Corporation and its licensors retain all intellectual property and * proprietary rights in and to this software and re...
复制链接

扫一扫