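// The summation code below is adapted from a blog post by another author; see reference [1] at the end of this file.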
#include <iostream>
#include <stdio.h>
//#include "kmeans.h"
using namespace std;
// Number of ints in the host test array; this many values are generated,
// copied to the device and summed.
const int count = 1000;
/*
 * Fills arr[0 .. count-1] with the sequence 1, 2, ..., count.
 *
 * arr : destination buffer; must point to at least `count` writable ints.
 */
void generate_data(int *arr)
{
    // Refer to the file-level constant as `::count`. With `using namespace std;`
    // in effect, an unqualified `count` can be ambiguous with the std::count
    // algorithm on standard libraries whose <iostream> pulls that template in.
    const int total = ::count;
    for (int idx = 0; idx < total; ++idx)
    {
        arr[idx] = idx + 1;
    }
}
/*
 * Returns the smallest power of two that is >= n.
 *
 * n <= 1               -> 1 (2^0 is the smallest power of two)
 * 2 <= n <= 2^30       -> n rounded up to a power of two
 * n >  2^30            -> 0, because 2^31 is not representable in a 32-bit int
 *
 * The bit smearing is done on an unsigned value so that no signed overflow
 * and no right shift of a negative number can occur.
 */
int nextPowerOfTwo(int n)
{
    if (n <= 1)
    {
        return 1;
    }

    // Subtract one first so that exact powers of two map to themselves.
    unsigned int v = static_cast<unsigned int>(n) - 1u;

    // Copy the highest set bit into every lower position ...
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    // (a 64-bit variant would additionally need: v |= v >> 32;)

    // ... then adding one carries into the next higher bit.
    ++v;

    // v can only exceed 2^30 when it is 2^31, which does not fit in an int.
    return (v > 0x40000000u) ? 0 : static_cast<int>(v);
}
/*
 * compute_sum: sums the first `cnt` ints of `array` with a shared-memory tree
 * reduction and stores the result in array[0].
 *
 * array : device pointer to at least `cnt` ints; array[0] is overwritten.
 * cnt   : number of valid input elements.
 * cnt2  : padded element count; must be a power of two and >= cnt.
 *
 * Launch requirements:
 *   - exactly one block (every block would otherwise race on array[0]);
 *   - any 1-D block size >= 1 (only threadIdx.x / blockDim.x are used);
 *   - dynamic shared memory of at least cnt2 * sizeof(unsigned int) bytes.
 *
 * The partial sums are kept in unsigned ints, so the total is computed modulo
 * 2^32 and then converted back to int on the final store.
 * If cnt <= 0 or cnt2 <= 0 there is nothing to sum and `array` is left untouched.
 */
__global__ static void compute_sum(int *array,int cnt , int cnt2)
{
    extern __shared__ unsigned int sharedMem[];

    // Depends only on kernel arguments, so every thread of the block takes the
    // same path and no thread is left waiting at a barrier below.
    if (cnt <= 0 || cnt2 <= 0)
    {
        return;
    }

    const unsigned int tid      = threadIdx.x;
    const unsigned int nThreads = blockDim.x;
    const unsigned int valid    = static_cast<unsigned int>(cnt);
    const unsigned int padded   = static_cast<unsigned int>(cnt2);

    // Initialise ALL cnt2 slots, not just one per thread: the reduction reads
    // indices up to cnt2 - 1, so with fewer than cnt2 threads the padding slots
    // would otherwise be read uninitialised. Slots past `cnt` are zero-filled.
    for (unsigned int i = tid; i < padded; i += nThreads)
    {
        sharedMem[i] = (i < valid) ? array[i] : 0;
    }
    __syncthreads();

    // Halve the active range each round; cnt2 must be a power of two so that
    // the range always splits evenly.
    for (unsigned int s = padded / 2; s > 0; s >>= 1)
    {
        // Block-stride loop: correct even when the block has fewer than s threads.
        for (unsigned int i = tid; i < s; i += nThreads)
        {
            sharedMem[i] += sharedMem[i + s];
        }
        // Outside the inner loop so that all threads reach the barrier.
        __syncthreads();
    }

    if (tid == 0)
    {
        array[0] = sharedMem[0];
    }
}
/*
 * Reports a failed CUDA runtime call on stderr.
 * Returns true when `status` is cudaSuccess, false otherwise.
 */
static bool checkCuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Builds the host array 1..count, sums it on the GPU with compute_sum and
 * prints the result as "sum = <value>".
 * Returns 0 on success and 1 if any CUDA call or the kernel launch failed.
 */
int main()
{
    // `::count` selects the file-level constant explicitly; a bare `count` can
    // be ambiguous with std::count because of `using namespace std;`.
    const int n = ::count;

    int *a = new int[n];
    generate_data(a);

    int *deviceArray = NULL;
    int sum = 0;

    bool ok = checkCuda(cudaMalloc(&deviceArray, n * sizeof(int)), "cudaMalloc");
    ok = ok && checkCuda(cudaMemcpy(deviceArray, a, n * sizeof(int), cudaMemcpyHostToDevice),
                         "cudaMemcpy (host to device)");

    if (ok)
    {
        int npt_count = nextPowerOfTwo(n); // next power of two of count
        size_t blockSharedDataSize = npt_count * sizeof(int);

        // One thread per padded slot (npt_count rather than count), so every
        // shared-memory element the reduction reads has a thread that fills it.
        compute_sum<<<1, npt_count, blockSharedDataSize>>>(deviceArray, n, npt_count);

        // A kernel launch returns nothing; configuration errors (for example a
        // block that is too large) only show up through cudaGetLastError().
        ok = checkCuda(cudaGetLastError(), "compute_sum launch");
    }

    // This blocking copy also waits for the kernel, so execution errors of the
    // kernel are reported here.
    ok = ok && checkCuda(cudaMemcpy(&sum, deviceArray, sizeof(int), cudaMemcpyDeviceToHost),
                         "cudaMemcpy (device to host)");

    if (ok)
    {
        cout<<"sum = "<<sum<<endl;
    }

    // Release resources on every path; cudaFree accepts a null pointer.
    cudaFree(deviceArray);
    delete[] a;

    return ok ? 0 : 1;
}
// [1] http://blog.csdn.net/lavorange/article/details/43031419