//
// include files
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include "cutil_inline.h"
// If the compiler cannot find cutil_inline.h, add its directory to the include
// path when compiling with nvcc, e.g. by appending
// -I ~/NVIDIA_GPU_Computing_SDK/C/common/inc/ to the nvcc command line.
//
//显卡计算的部分
// Device kernel: each thread stores the square of its own global index.
//   x - output array; element [g] receives g*g as a float, where
//       g = blockIdx.x * blockDim.x + threadIdx.x.
// There is no bounds check, so x must hold at least one element per
// launched thread (1D grid of 1D blocks).
__global__ void my_first_kernel(float *x)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    // Convert the index to float first, then square it in float arithmetic.
    const float value = gid;
    x[gid] = value * value;
}
//
// 主程序
// Host driver: fills an nblocks x nthreads matrix with the squares of the
// flat element indices, once on the GPU (my_first_kernel) and once on the
// CPU, prints a sample of each result plus the wall-clock time of both runs,
// and finally prints the CPU/GPU time ratio.
// Returns 0 on success, EXIT_FAILURE if the host buffer cannot be allocated.
int main(int argc, char **argv)
{
    float *h_x, *d_x, tg, tc;
    int nblocks, nthreads, nsize, n, i, j, step;
    struct timeval start, end;

    // Set the number of blocks and the number of threads per block.
    nblocks  = 500;
    nthreads = 500;
    nsize    = nblocks * nthreads;

    // Stride used when printing sample rows; never let it drop to 0,
    // otherwise the print loops below would not terminate.
    step = nblocks / 10;
    if (step < 1)
        step = 1;

    // Allocate host and device storage for the matrix.
    h_x = (float *)malloc(nsize * sizeof(float));
    if (h_x == NULL) {
        fprintf(stderr, "Failed to allocate host memory for %d floats\n", nsize);
        return EXIT_FAILURE;
    }
    // cutilSafeCall only checks the returned error code.
    cutilSafeCall(cudaMalloc((void **)&d_x, nsize * sizeof(float)));

    // Use the GPU to initialise the matrix and square every element.
    printf("Using GPU to initializing and power a matrix\n");

    // Record the start time.
    gettimeofday(&start, NULL);

    // Execute the kernel: one thread per element, nblocks * nthreads == nsize.
    my_first_kernel<<<nblocks, nthreads>>>(d_x);

    // Check for kernel launch/execution errors.
    cutilCheckMsg("Kernel execution failed");

    // Copy the data from the GPU back to the host buffer. The blocking copy
    // also waits for the kernel, so the timed interval covers kernel + copy.
    cutilSafeCall(cudaMemcpy(h_x, d_x, nsize * sizeof(float), cudaMemcpyDeviceToHost));

    // Record the end time.
    gettimeofday(&end, NULL);

    // Display the last column of every step-th row.
    printf("\tRow\tColumnx\tValue\n");
    for (n = 0; n < nblocks; n += step)
        printf("\t%d\t%d\t%f\n", n + 1, nthreads, h_x[n * nthreads + nthreads - 1]);

    // Display how long the GPU run took (seconds, computed in double to
    // avoid integer overflow and float rounding of the microsecond count).
    tg = (float)((double)(end.tv_sec - start.tv_sec) +
                 (double)(end.tv_usec - start.tv_usec) / 1.0e6);
    printf("Using GPU take %f second\n\n", tg);

    // Do the same thing using the CPU.
    printf("Using CPU to initializing and power a same matrix...\n");

    // Record the start time.
    gettimeofday(&start, NULL);

    // Initialise and square every element (row-major, row length nthreads).
    for (i = 0; i < nblocks; i++)
        for (j = 0; j < nthreads; j++)
            h_x[i * nthreads + j] = (float)pow((double)(i * nthreads + j), 2.0);

    // Record the end time.
    gettimeofday(&end, NULL);

    // Display the same sample elements for comparison with the GPU results.
    printf("\tRow\tColumnx\tValue\n");
    for (n = 0; n < nblocks; n += step)
        printf("\t%d\t%d\t%f\n", n + 1, nthreads, h_x[n * nthreads + nthreads - 1]);

    // Display how long the CPU run took.
    tc = (float)((double)(end.tv_sec - start.tv_sec) +
                 (double)(end.tv_usec - start.tv_usec) / 1.0e6);
    printf("Using CPU take %f second\n\n", tc);

    // Display the speed-up ratio of the timed sections.
    printf("Speed UP ratio:%f\n!", tc / tg);

    // Free device and host memory.
    cutilSafeCall(cudaFree(d_x));
    free(h_x);
    return 0;
}