// This program implements a parallel sum (reduction) of an array on the GPU:
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <cuda_runtime.h>
/*
 * Sums each block's slice of d_in in place and writes one partial sum per block.
 *
 * Launch layout: 1-D grid of 1-D blocks. Block b reduces the blockDim.x
 * consecutive floats starting at d_in[b * blockDim.x] and stores the total in
 * d_out[b]. Any block size is accepted, not only powers of two.
 *
 * Preconditions: d_in must hold at least gridDim.x * blockDim.x floats and
 * d_out at least gridDim.x floats. The element count is not a parameter, so
 * the kernel cannot bounds-check.
 *
 * Side effect: d_in is used as scratch space and is overwritten.
 */
__global__ void parallel_reduce_kernel(float* d_out, float* d_in){
    const unsigned int tid = threadIdx.x;
    const unsigned int myID = tid + blockIdx.x * blockDim.x;

    // `active` is the number of leading elements of this block's slice that
    // still carry partial sums. Each pass folds the upper part onto the lower
    // part; with an odd count the middle element is simply carried over.
    for (unsigned int active = blockDim.x; active > 1; ) {
        const unsigned int upper = (active + 1) / 2;  // ceil(active / 2)
        if (tid < active / 2) {
            d_in[myID] += d_in[myID + upper];
        }
        // Barrier sits outside the branch so every thread of the block reaches
        // it; it also makes this pass's writes visible to the next pass.
        __syncthreads();
        active = upper;
    }

    // Thread 0 publishes the block total, which now sits in the slice's first slot.
    if (tid == 0) {
        d_out[blockIdx.x] = d_in[blockIdx.x * blockDim.x];
    }
}
/* Prints a diagnostic and exits when a CUDA runtime call reports an error. */
static void reduce_check(cudaError_t status, const char* what){
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Demo driver for parallel_reduce_kernel.
 *
 * Fills an array with random integers in [0, 1023] stored as floats, sums it
 * on the GPU with a single block, sums it again on the CPU, and prints the
 * CPU sum followed by the GPU sum so the two can be compared by eye.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main(){
    enum { COUNT = 1024 };
    const size_t bytes = sizeof(float) * COUNT;

    srand(time(0));
    float data[COUNT];
    for (int i = 0; i < COUNT; ++i) {
        data[i] = rand() % 1024;
    }

    float* data_gpu = NULL;
    float* out_gpu = NULL;
    reduce_check(cudaMalloc(&data_gpu, bytes), "cudaMalloc(data_gpu)");
    reduce_check(cudaMalloc(&out_gpu, sizeof(float)), "cudaMalloc(out_gpu)");
    reduce_check(cudaMemcpy(data_gpu, data, bytes, cudaMemcpyHostToDevice),
                 "cudaMemcpy(host -> device)");

    // One block of COUNT threads reduces the whole array, so the kernel
    // writes exactly one partial sum (out_gpu[0]).
    parallel_reduce_kernel<<<1, COUNT>>>(out_gpu, data_gpu);
    // A launch returns no status itself; launch errors must be queried.
    reduce_check(cudaGetLastError(), "parallel_reduce_kernel launch");

    // The copy's return status also surfaces errors raised while the kernel ran.
    float gpu_sum = 0.0f;
    reduce_check(cudaMemcpy(&gpu_sum, out_gpu, sizeof(float), cudaMemcpyDeviceToHost),
                 "cudaMemcpy(device -> host)");

    // CPU reference computed from the untouched host copy (the kernel
    // overwrites only the device buffer).
    float sum = 0;
    for (int i = 0; i < COUNT; ++i) {
        sum += data[i];
    }

    printf("%f\n", sum);
    printf("%f,", gpu_sum);

    reduce_check(cudaFree(data_gpu), "cudaFree(data_gpu)");
    reduce_check(cudaFree(out_gpu), "cudaFree(out_gpu)");
    return 0;
}
// Appendix - atomic operations: this program implements a counting (histogram) task.
/*
 * Histogram kernel: for every i in [0, n), atomically increments bin[data[i]].
 *
 * Launch layout: any 1-D grid of 1-D blocks. Threads walk the input with a
 * stride of the total thread count, so every element is counted exactly once
 * regardless of how many blocks or threads are launched.
 *
 * Parameters:
 *   data - n input values, each used directly as an index into bin
 *   bin  - counters, incremented with atomicAdd; the kernel only adds to
 *          them, so the caller chooses their starting values
 *   n    - number of elements in data
 *
 * Preconditions: every data[i] must be a valid index into bin; the bin count
 * is not a parameter, so the kernel cannot check this.
 */
__global__ void add(int* data, int* bin, int n){
    // 64-bit arithmetic keeps blockIdx.x * blockDim.x and idx + stride from
    // wrapping on very large launches.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < n;
         idx += stride) {
        // Several threads may hit the same bin at once, hence the atomic.
        atomicAdd(bin + data[idx], 1);
    }
}
/* Prints a diagnostic and exits when a CUDA runtime call reports an error. */
static void hist_check(cudaError_t status, const char* what){
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Demo driver for the add (histogram) kernel.
 *
 * Generates random samples in [0, 63], builds their histogram on the CPU and
 * prints the first 30 bins, then builds the same histogram on the GPU and
 * prints its first 30 bins so the two lines can be compared by eye.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main(){
    enum { SAMPLES = 300, BINS = 64, SHOWN = 30 };

    srand(time(0));

    // Random data in [0, 63].
    int data[SAMPLES];
    for (int i = 0; i < SAMPLES; ++i) {
        data[i] = rand() % BINS;
    }

    // CPU reference histogram; the aggregate initializer zeroes every bin.
    int hist[BINS] = {0};
    for (int i = 0; i < SAMPLES; ++i) {
        ++hist[data[i]];
    }
    for (int i = 0; i < SHOWN; ++i) {
        printf("%d,", hist[i]);
    }
    printf("\n");

    int* hist_gpu = NULL;
    int* data_gpu = NULL;
    hist_check(cudaMalloc(&hist_gpu, sizeof(int) * BINS), "cudaMalloc(hist_gpu)");
    // The kernel only increments, so the device bins must start at zero.
    hist_check(cudaMemset(hist_gpu, 0, sizeof(int) * BINS), "cudaMemset(hist_gpu)");
    hist_check(cudaMalloc(&data_gpu, sizeof(int) * SAMPLES), "cudaMalloc(data_gpu)");
    hist_check(cudaMemcpy(data_gpu, data, sizeof(int) * SAMPLES, cudaMemcpyHostToDevice),
               "cudaMemcpy(host -> device)");

    add<<<1, SAMPLES>>>(data_gpu, hist_gpu, SAMPLES);
    // A launch returns no status itself; launch errors must be queried.
    hist_check(cudaGetLastError(), "add launch");

    // Reuse the host array for the GPU result. The copy's return status also
    // surfaces errors raised while the kernel ran.
    hist_check(cudaMemcpy(hist, hist_gpu, sizeof(int) * BINS, cudaMemcpyDeviceToHost),
               "cudaMemcpy(device -> host)");
    for (int i = 0; i < SHOWN; ++i) {
        printf("%d,", hist[i]);
    }

    hist_check(cudaFree(hist_gpu), "cudaFree(hist_gpu)");
    hist_check(cudaFree(data_gpu), "cudaFree(data_gpu)");
    return 0;
}