#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include<sdkHelper.h>
void output(float *arr,int n);// print the first n element values of a vector
void VectorAdd(float *A,float *B,float *C,int n);// add two vectors element-wise on the CPU
//device code
/*
 * Element-wise vector addition kernel: C[k] = A[k] + B[k] for 0 <= k < N.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread handles at most one
 * element, chosen by its flat global index; threads whose index is >= N
 * return without touching memory, so the grid may over-cover N.
 * The grid must supply at least N threads for every element to be written.
 * Uses no shared memory.
 */
__global__ void VecAdd(float *A,float *B,float *C,int N)
{
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;

    /* Tail guard: the last block may extend past the end of the vectors. */
    if (gid >= N)
        return;

    const float lhs = A[gid];
    const float rhs = B[gid];
    C[gid] = lhs + rhs;
}
//host code
/* Abort with a diagnostic if a CUDA runtime call did not return cudaSuccess. */
static void checkCuda(cudaError_t status,const char *what)
{
    if(status!=cudaSuccess)
    {
        fprintf(stderr,"CUDA error during %s: %s\n",what,cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Demo driver: adds two vectors of 512*512 floats once on the CPU and once
 * on the GPU, prints the elapsed time of each run and the first 64 elements
 * of the inputs and results.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA runtime call fails.
 */
int main()
{
    int N=512*512,i;
    size_t size=N*sizeof(float);
    /* Timer object provided by sdkHelper.h, used for both measurements. */
    StopWatchInterface *timer=NULL;

    float *h_A=(float*)malloc(size);
    float *h_B=(float*)malloc(size);
    float *h_C=(float*)malloc(size);
    if(h_A==NULL||h_B==NULL||h_C==NULL)
    {
        fprintf(stderr,"Host allocation of %lu bytes failed\n",(unsigned long)size);
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    for(i=0;i<N;i++)
    {
        h_A[i]=(float)i;
        h_B[i]=(float)(i+i);
    }

    /* CPU reference run, timed. */
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    VectorAdd(h_A,h_B,h_C,N);
    sdkStopTimer(&timer);
    printf("\n CPU Processing time:%f(ms)\n",sdkGetTimerValue(&timer));
    output(h_A,64);
    output(h_B,64);
    output(h_C,64);

    /* Allocate the device-side vectors. */
    float *d_A=NULL;
    float *d_B=NULL;
    float *d_C=NULL;
    checkCuda(cudaMalloc((void **)&d_A,size),"cudaMalloc d_A");
    checkCuda(cudaMalloc((void **)&d_B,size),"cudaMalloc d_B");
    checkCuda(cudaMalloc((void **)&d_C,size),"cudaMalloc d_C");

    /* Copy the host input vectors to the device. */
    checkCuda(cudaMemcpy(d_A,h_A,size,cudaMemcpyHostToDevice),"copy h_A to d_A");
    checkCuda(cudaMemcpy(d_B,h_B,size,cudaMemcpyHostToDevice),"copy h_B to d_B");

    /* Launch configuration: block size is a multiple of the 32-thread warp;
       the grid size is a ceiling division so every element gets a thread. */
    int threadsPerBlock=256;
    int blocksPerGrid=(N+threadsPerBlock-1)/threadsPerBlock;

    /* Restart the timer for the GPU run. */
    sdkResetTimer(&timer);
    sdkStartTimer(&timer);
    VecAdd<<<blocksPerGrid,threadsPerBlock>>>(d_A,d_B,d_C,N);
    /* Launch-configuration errors are only reported through cudaGetLastError. */
    checkCuda(cudaGetLastError(),"VecAdd launch");
    /* Kernel launches are asynchronous: wait for completion before stopping
       the timer, otherwise only the launch overhead would be measured. */
    checkCuda(cudaDeviceSynchronize(),"VecAdd execution");
    sdkStopTimer(&timer);

    /* Copy the result from device memory back to host memory. */
    checkCuda(cudaMemcpy(h_C,d_C,size,cudaMemcpyDeviceToHost),"copy d_C to h_C");
    printf("\nGPU Processing time:%f(ms)\n",sdkGetTimerValue(&timer));
    printf("输出向量中的数据h_C:\n ");
    output(h_C,64);
    getchar();
    sdkDeleteTimer(&timer);

    /* Release device memory. */
    checkCuda(cudaFree(d_A),"cudaFree d_A");
    checkCuda(cudaFree(d_B),"cudaFree d_B");
    checkCuda(cudaFree(d_C),"cudaFree d_C");

    /* Release host memory: it came from malloc, so it must go back through
       free (using delete on it is undefined behaviour). */
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
/*
 * Print a header line followed by the first n values of arr to stdout.
 * Each value is written left-justified with a minimum field width of 5 and
 * no fractional digits; a newline is emitted after every 8th value.
 * Prints only the header when n <= 0.
 */
void output(float *arr,int n)
{
    printf("\n输出向量中的数据\n");

    int printed=0;
    while(printed<n)
    {
        printf("%-5.0f",arr[printed]);
        ++printed;
        /* Break the row once eight values have been written on it. */
        if(printed%8==0)
            printf("\n");
    }
}
/*
 * Compute the element-wise sum of two vectors on the CPU.
 *
 * A, B : input vectors, each read at indices 0 .. n-1
 * C    : output vector, C[k] = A[k] + B[k] for 0 <= k < n
 * n    : number of elements to process; nothing is written when n <= 0
 */
void VectorAdd(float *A,float *B,float *C,int n)
{
    const float *lhs=A;
    const float *rhs=B;
    float *dst=C;

    /* Walk the three arrays in lock-step, front to back. */
    for(int left=n;left>0;--left)
    {
        *dst=*lhs+*rhs;
        ++lhs;
        ++rhs;
        ++dst;
    }
}
// Reposted from: http://blog.csdn.net/wufenxia/article/details/8281891