#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
using namespace std;
#define data_size 10000
#define thread_num 256
#define block_num 32
__global__ void kernel(int *d_idata, int *d_odata)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    extern __shared__ int shared[];

    // Each thread accumulates its own partial sum of squares over a
    // grid-strided range, then stores it in shared memory.
    int sum = 0;
    for (int i = bid * thread_num + tid; i < data_size; i += thread_num * block_num)
    {
        sum += d_idata[i] * d_idata[i];
    }
    shared[tid] = sum;
    __syncthreads();

    // Thread 0 of each block sums the per-thread partial results and
    // writes one partial sum per block to global memory.
    if (tid == 0)
    {
        for (int i = 1; i < thread_num; i++)
        {
            shared[0] += shared[i];
        }
        d_odata[bid] = shared[0];
    }
}
int main()
{
    // Fill the host input with small random integers.
    int h_idata[data_size];
    for (int i = 0; i < data_size; i++)
    {
        h_idata[i] = rand() % 10;
    }

    // Allocate device buffers and copy the input to the GPU.
    int *d_idata;
    int *d_odata;
    cudaMalloc(&d_idata, sizeof(int) * data_size);
    cudaMalloc(&d_odata, sizeof(int) * block_num);
    cudaMemcpy(d_idata, h_idata, sizeof(int) * data_size, cudaMemcpyHostToDevice);

    // Launch one partial reduction per block; the third launch parameter
    // reserves thread_num ints of dynamic shared memory per block.
    kernel<<<block_num, thread_num, thread_num * sizeof(int)>>>(d_idata, d_odata);

    // Copy back one partial sum per block and finish the reduction on the CPU.
    int gpu_sum[block_num];
    cudaMemcpy(gpu_sum, d_odata, sizeof(int) * block_num, cudaMemcpyDeviceToHost);
    cudaFree(d_idata);
    cudaFree(d_odata);

    int final_gpu_sum = 0;
    for (int i = 0; i < block_num; i++)
    {
        final_gpu_sum += gpu_sum[i];
    }
    printf("final_gpu_sum=%d\n", final_gpu_sum);

    // Reference result computed on the CPU for verification.
    int cpu_sum = 0;
    for (int i = 0; i < data_size; i++)
    {
        cpu_sum += h_idata[i] * h_idata[i];
    }
    printf("cpu_sum: %d\n", cpu_sum);

    cin.get();
    return 0;
}
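
A common refinement of this kernel is to replace the serial loop run by thread 0 with a tree-style reduction in shared memory, so the per-block summation also runs in parallel. The kernel below is a minimal sketch of that idea (kernel_tree is a name introduced here for illustration); it reuses the same data_size, thread_num, and block_num macros and assumes thread_num is a power of two so the halving loop covers every element.

// Sketch: same interface and launch configuration as kernel(), but the
// per-block sum is done as a parallel tree reduction in shared memory.
// Assumes thread_num is a power of two.
__global__ void kernel_tree(int *d_idata, int *d_odata)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    extern __shared__ int shared[];

    // Grid-strided per-thread partial sum of squares, as before.
    int sum = 0;
    for (int i = bid * thread_num + tid; i < data_size; i += thread_num * block_num)
    {
        sum += d_idata[i] * d_idata[i];
    }
    shared[tid] = sum;
    __syncthreads();

    // Halve the number of active threads each step: 256 -> 128 -> ... -> 1.
    for (int stride = thread_num / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride)
        {
            shared[tid] += shared[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        d_odata[bid] = shared[0];
    }
}

It is launched exactly like kernel above, i.e. kernel_tree<<<block_num, thread_num, thread_num * sizeof(int)>>>(d_idata, d_odata), and the host-side code is unchanged.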