【参加CUDA线上训练营】原子操作

达达今天学习了嘛

已于 2023-02-15 03:45:03 修改

阅读量193

点赞数

分类专栏： CUDA学习文章标签：算法 Powered by 金山文档

于 2023-02-15 03:39:25 首次发布

本文链接：https://blog.csdn.net/weixin_43250007/article/details/129036239

版权

CUDA学习专栏收录该内容

5 篇文章 0 订阅

订阅专栏

文章介绍了CUDA中的原子操作，这是一种用于全局内存或共享内存的线程安全的读取-修改-写入操作，确保了多线程环境下的变量更新的正确性。文中列举了常见的原子操作函数，如加法、减法、赋值等，并提供了一个利用原子操作求解数组元素之和的GPU程序示例，与CPU计算结果对比验证了原子操作的正确性。

摘要由CSDN通过智能技术生成

一、原子操作意义

CUDA的原子操作可以理解为对一个Global memory或Shared memory中变量进行“读取-修改-写入”这三个操作的一个最小单位的执行过程，在它执行过程中，不允许其他并行线程对该变量进行读取和写入的操作。基于这个机制，原子操作实现了对在多个线程间共享的变量的互斥保护，确保任何一次对变量的操作的结果的正确性。

使用原子操作可以使操作彼此之间不影响，达到想要的互斥操作，如上图所示的atomicAdd()函数可以将不同block中的数值相加

二、原子操作常用函数

以下为常用的原子操作函数

//加法    value = value + num
atomicAdd(&value, num)
//减法    value = value - num
atomicSub(&value, num)
//赋值（交换）    value = num，并返回旧值
atomicExch(&value, num)
//求最大    value = max(value, num)
atomicMax(&value, num)
//求最小    value = min(value, num)
atomicMin(&value, num)
//向上计数    value = (value >= num) ? 0 : value + 1
atomicInc(&value, num)
//向下计数    value = (value == 0 || value > num) ? num : value - 1
atomicDec(&value, num)
//比较并交换    value = (value == compare) ? num : value
atomicCAS(&value, compare, num)
//与运算    value = value and num
atomicAnd(&value, num)
//或运算    value = value or num
atomicOr(&value, num)
//异或运算    value = value xor num
atomicXor(&value, num)

三、实验程序

完成下面的一个实例：

给定一个数组A，它包含1000000个int类型的元素，求它所有的元素之和：

输入：A[1000000]

输出：output（A中所有元素之和）

#include<stdio.h>
#include<stdint.h>
#include<time.h>     //for time()
#include<stdlib.h>   //for srand()/rand()
#include<sys/time.h> //for gettimeofday()/struct timeval
#include"error.cuh"

// Number of int elements in the input buffer `source`; main() also passes it
// to the kernel as the element count.
#define N 10000000
// Length of the kernel's shared-memory scratch array and the threads-per-block
// value used at launch. The kernel's halving reduction starts at BLOCK_SIZE / 2.
#define BLOCK_SIZE 256
// Number of thread blocks used at launch.
#define BLOCKS 32 


// __managed__ buffers: main() fills/reads them on the host and passes them
// to the kernel as plain pointers.
__managed__ int source[N];               //input data
__managed__ int final_result[1] = {0};   //scalar output

/*
 * _sum_gpu: adds up input[0 .. count) and atomically accumulates the total
 * into *output.
 *
 * Launch requirements:
 *   - 1-D grid, 1-D block. Any grid size works: every thread walks the input
 *     with a grid-stride loop.
 *   - blockDim.x must equal BLOCK_SIZE and BLOCK_SIZE must be a power of two;
 *     the shared array is sized by BLOCK_SIZE and the reduction halves it.
 *   - Uses BLOCK_SIZE * sizeof(unsigned int) bytes of static shared memory.
 *   - The kernel only ADDS to *output, so the caller must set it to the
 *     desired starting value (normally 0) before the launch.
 *
 * All arithmetic is carried out in unsigned int, so an overflowing sum wraps
 * modulo 2^32 instead of relying on signed-overflow behaviour. The bit
 * pattern stored in *output is the two's-complement wrapped sum.
 */
__global__ void _sum_gpu(int *input, int count, int *output)
{
    __shared__ unsigned int sum_per_block[BLOCK_SIZE];

    // 64-bit index/stride: an int index could wrap negative when count is
    // close to INT_MAX and the stride is added past it.
    const long long stride = (long long)gridDim.x * blockDim.x;

    unsigned int temp = 0;
    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < count;
         idx += stride)
    {
        temp += (unsigned int)input[idx];
    }

    // Publish this thread's partial sum to the block.
    sum_per_block[threadIdx.x] = temp;
    __syncthreads();

    //**********shared memory summation stage***********
    for (int length = BLOCK_SIZE / 2; length >= 1; length /= 2)
    {
        unsigned int pair_sum = 0;
        if (threadIdx.x < length)
        {
            pair_sum = sum_per_block[threadIdx.x] + sum_per_block[threadIdx.x + length];
        }
        // Barrier 1: every read of this round finishes before any write.
        // (Kept outside the branch so all threads of the block reach it.)
        __syncthreads();

        if (threadIdx.x < length)
        {
            sum_per_block[threadIdx.x] = pair_sum;
        }
        // Barrier 2: this round's writes are visible before the next round reads.
        __syncthreads();
    } //the per-block partial sum is sum_per_block[0]

    // Blocks whose first element index is already past the end contributed
    // nothing, so they skip the atomic.
    if ((long long)blockDim.x * blockIdx.x < count)
    {
        // One atomic per block performs the final cross-block reduction.
        // The unsigned overload keeps the wrap-around well defined.
        if (threadIdx.x == 0) atomicAdd((unsigned int *)output, sum_per_block[0]);
    }
}

/*
 * _sum_cpu: host reference implementation; returns the sum of ptr[0 .. count).
 *
 * The running total is kept in unsigned int so that an overflowing sum wraps
 * modulo 2^32 (well defined) rather than overflowing a signed int. The result
 * is converted back to int, giving the two's-complement wrapped value.
 * A count <= 0 yields 0.
 */
int _sum_cpu(int *ptr, int count)
{
    unsigned int sum = 0;
    for (int i = 0; i < count; i++)
    {
        sum += (unsigned int)ptr[i];
    }
    return (int)sum;
}

/*
 * _init: reseeds the C random generator from the current time and stores one
 * rand() value into each of ptr[0 .. count).
 */
void _init(int *ptr, int count)
{
    // A time-based seed gives different data on each run.
    srand((uint32_t)time(NULL));

    int filled = 0;
    while (filled < count)
    {
        ptr[filled] = rand();
        ++filled;
    }
}

/*
 * get_time: current gettimeofday() reading expressed in seconds as a double
 * (whole seconds plus the microsecond part).
 */
double get_time()
{
    struct timeval now;
    gettimeofday(&now, NULL);

    const double micro_part = (double)now.tv_usec * 0.000001;
    return (micro_part + now.tv_sec);
}

/*
 * main: fills `source` with random data, sums it once on the GPU with
 * _sum_gpu and once on the CPU with _sum_cpu, compares the two results and
 * prints both timings to stderr.
 *
 * Returns 0 when the sums match; calls exit(-1) when they differ.
 * CHECK comes from error.cuh (not shown here) and is applied to every CUDA
 * runtime call that returns a status.
 */
int main()
{
    //**********************************
    fprintf(stderr, "filling the buffer with %d elements...\n", N);
    _init(source, N);

    // The kernel only adds to final_result[0], so start from a known zero.
    final_result[0] = 0;

    //**********************************
    //Now we are going to kick start your kernel.
    CHECK(cudaDeviceSynchronize()); //steady! ready! go!

    fprintf(stderr, "Running on GPU...\n");

    double t0 = get_time();
    _sum_gpu<<<BLOCKS, BLOCK_SIZE>>>(source, N, final_result);
    CHECK(cudaGetLastError());      //checking for launch failures
    CHECK(cudaDeviceSynchronize()); //checking for run-time failures
    double t1 = get_time();

    // Safe to read on the host: the synchronize above has completed.
    int A = final_result[0];
    // %u expects unsigned int; cast explicitly so the format matches the argument.
    fprintf(stderr, "GPU sum: %u\n", (unsigned int)A);


    //**********************************
    //Now we are going to exercise your CPU...
    fprintf(stderr, "Running on CPU...\n");

    double t2 = get_time();
    int B = _sum_cpu(source, N);
    double t3 = get_time();
    fprintf(stderr, "CPU sum: %u\n", (unsigned int)B);

    //******The last judgement**********
    if (A == B)
    {
        fprintf(stderr, "Test Passed!\n");
    }
    else
    {
        fprintf(stderr, "Test failed!\n");
        exit(-1);
    }

    //****and some timing details*******
    fprintf(stderr, "GPU time %.3f ms\n", (t1 - t0) * 1000.0);
    fprintf(stderr, "CPU time %.3f ms\n", (t3 - t2) * 1000.0);

    return 0;
}