// Single-block parallel sum reduction on the GPU, in a naive and an optimized
// variant. Written from memory, so it looks slightly different from the
// SDK sample.

#include <cuda.h>
#include <time.h>
#include <stdio.h>
// NOTE(review): pz_cpy.h is not a standard or CUDA header and nothing in this
// file visibly uses it -- confirm it is still required.
#include <pz_cpy.h>
#include <stdlib.h>
#include <math.h>
#include "cuPrintf.cu"

/*
 * Reports a failed CUDA runtime call on stderr.
 *
 * err  - status returned by the CUDA call.
 * what - short label for the call, used in the message.
 *
 * Returns true when err is cudaSuccess, false otherwise.
 */
static bool CheckCuda(cudaError_t err, const char* what)
{
    if (cudaSuccess == err)
        return true;

    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

/*
 * Enumerates the CUDA devices and prints the name of each one.
 *
 * Every device is made current in turn, so the last enumerated device is the
 * one left selected when the function returns.
 *
 * Returns false when the device count cannot be queried, when no device is
 * present, or when a device cannot be selected or queried.
 */
bool InitGPUSet()
{
    int num = 0;
    if (cudaSuccess != cudaGetDeviceCount(&num) || num <= 0)
        return false;

    for (int i = 0; i < num; ++i)
    {
        cudaDeviceProp tCard;
        if (cudaSuccess != cudaSetDevice(i) ||
            cudaSuccess != cudaGetDeviceProperties(&tCard, i))
            return false;

        printf("GPU: %s\n", tCard.name);
    }
    return true;
}

/*
 * Initializes the cuPrintf device-side print buffer.
 *
 * Returns true when cudaPrintfInit() reports cudaSuccess. The status code is
 * compared directly instead of matching the human-readable error text.
 */
bool InitCuPrint()
{
    return cudaSuccess == cudaPrintfInit();
}

/*
 * Naive in-place tree reduction (interleaved addressing).
 *
 * Launch layout: a single block with blockDim.x >= size (one thread per
 * element). No shared memory is used; the reduction happens directly in ary.
 *
 * ary  - device array of at least `size` ints. On return ary[0] holds the sum
 *        of ary[0..size-1]; some other elements are overwritten with partial
 *        sums.
 * size - number of elements to reduce.
 */
__global__ void calc_sum_naive(int* ary, int size)
{
    const int tid = threadIdx.x;

    // The loop bound depends only on `size`, so every thread of the block
    // executes the same number of rounds and reaches every barrier.
    for (int i = 1; i < size; i <<= 1)
    {
        // Partial sums written in the previous round must be visible.
        __syncthreads();

        // The bounds check keeps the read inside the array when `size` is not
        // a power of two.
        if (0 == tid % (i << 1) && tid + i < size)
            ary[tid] += ary[tid + i];
    }
}

/*
 * Optimized in-place tree reduction (sequential addressing).
 *
 * Each round folds the upper part of the active range [0, n) onto the lower
 * part and then shrinks the range to ceil(n / 2). With an odd n the middle
 * element simply stays in place for the next round, so no separate fix-up of
 * the last element is needed and no two threads ever update the same slot in
 * the same round.
 *
 * Launch layout: a single block with any blockDim.x >= 1; threads stride over
 * the range when it is wider than the block. No shared memory is used.
 *
 * ary  - device array of at least `size` ints. On return ary[0] holds the sum
 *        of ary[0..size-1]; other elements are overwritten with partial sums.
 * size - number of elements to reduce.
 */
__global__ void calc_sum_optimize(int* ary, int size)
{
    const int tid = threadIdx.x;
    const int step = blockDim.x;

    // `n` depends only on `size`, so the loop (and its barrier) is uniform
    // across the block.
    for (int n = size; n > 1; )
    {
        const int half = (n + 1) >> 1;

        // Writes land in [0, n - half), reads come from [half, n): the two
        // ranges never overlap within a round.
        for (int j = tid; j + half < n; j += step)
            ary[j] += ary[j + half];

        // Make this round's partial sums visible before the next fold.
        __syncthreads();
        n = half;
    }
}

/*
 * Sums `count` host integers on the GPU with calc_sum_optimize.
 *
 * host   - host array of `count` ints (not modified).
 * count  - number of elements; a count <= 0 yields a sum of 0.
 * grid   - number of blocks for the launch (the kernels reduce within one
 *          block, so this is expected to be 1).
 * block  - upper bound on the threads per block for the launch.
 * result - receives the sum on success.
 *
 * Returns true on success; on failure the failing CUDA call is reported on
 * stderr and *result is left untouched. The device buffer is always released.
 */
static bool SumOnDevice(const int* host, int count, int grid, int block, int* result)
{
    if (count <= 0)
    {
        *result = 0;
        return true;
    }

    const size_t bytes = (size_t)count * sizeof(int);
    int* Gary = NULL;
    if (!CheckCuda(cudaMalloc((void**)&Gary, bytes), "cudaMalloc"))
        return false;

    bool ok = CheckCuda(cudaMemcpy(Gary, host, bytes, cudaMemcpyHostToDevice),
                        "cudaMemcpy (host to device)");
    if (ok)
    {
        // Half a thread per element is enough for the first fold; never
        // launch zero threads.
        int threads = block / 2;
        if (threads < 1)
            threads = 1;

        //calc_sum_naive<<<grid, block>>>(Gary, count);
        calc_sum_optimize<<<grid, threads>>>(Gary, count);
        //cudaPrintfDisplay(stdout, false);

        // A kernel launch does not return a status; fetch it explicitly.
        ok = CheckCuda(cudaGetLastError(), "kernel launch");
    }
    if (ok)
    {
        int sum = 0;
        // Blocking copy: also waits for the kernel to finish.
        ok = CheckCuda(cudaMemcpy(&sum, &Gary[0], sizeof(int), cudaMemcpyDeviceToHost),
                       "cudaMemcpy (device to host)");
        if (ok)
            *result = sum;
    }

    cudaFree(Gary);
    return ok;
}

/*
 * Fills an array with random values in [0, 99], sums it on the GPU and on the
 * CPU, and prints both results for comparison.
 *
 * Returns 0 on success and EXIT_FAILURE when the device, cuPrintf, or any
 * CUDA call fails.
 */
int main()
{
    if (!InitGPUSet())
    {
        puts("Device is not ready");
        return EXIT_FAILURE;
    }
    if (!InitCuPrint())
    {
        puts("Don't support CuPrint");
        return EXIT_FAILURE;
    }

    const int SIZE = 512;
    const int GRID = 1;
    const int BLOCK = SIZE;

    int ary[SIZE];
    int judge = 0;  // CPU reference sum
    srand((unsigned int)time(NULL));
    for (int i = 0; i < SIZE; ++i)
    {
        ary[i] = rand() % 100;
        judge += ary[i];
    }

    int retVal = 0;
    const bool ok = SumOnDevice(ary, SIZE, GRID, BLOCK, &retVal);

    // Release the buffer allocated by cudaPrintfInit().
    cudaPrintfEnd();

    if (!ok)
        return EXIT_FAILURE;

    printf("GPU: %d\nCPU: %d\n", retVal, judge);
    return 0;
}