cuda编程实例 linux,Ubuntu下我的第一个cuda程序

最新推荐文章于 2024-07-19 09:21:17 发布

卓er

最新推荐文章于 2024-07-19 09:21:17 发布

阅读量867

点赞数

文章标签： cuda编程实例 linux

// include files

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#include "cutil_inline.h"

// If the compiler cannot find cutil_inline.h, add its directory to the nvcc command line, e.g. -I ~/NVIDIA_GPU_Computing_SDK/C/common/inc/

//显卡计算的部分

// Writes the square of each thread's flat global index into x: x[i] = i * i.
//
// Expected launch layout: 1-D grid of 1-D blocks. Every thread writes exactly
// one element, at index blockIdx.x * blockDim.x + threadIdx.x. The kernel has
// no bounds check, so the launch must not create more threads than x has
// elements.
__global__ void my_first_kernel(float *x)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Square in a register rather than storing and re-reading x[idx].
    const float value = static_cast<float>(idx);
    x[idx] = value * value;
}

// 主程序

// Returns the wall-clock time from *t0 to *t1 in seconds.
static double elapsed_seconds(const struct timeval *t0, const struct timeval *t1)
{
    return (double)(t1->tv_sec - t0->tv_sec) +
           (double)(t1->tv_usec - t0->tv_usec) / 1000000.0;
}

// Host driver: fills an nblocks x nthreads array with the squares of its flat
// indices, once with my_first_kernel on the GPU and once with a plain CPU loop.
// It prints a sample of both results, the wall-clock time of each, and the
// CPU/GPU time ratio.
//
// CUDA runtime calls are wrapped in cutilSafeCall / cutilCheckMsg from
// cutil_inline.h. Returns 0 on success, 1 if the host buffer cannot be
// allocated.
int main(int argc, char **argv)
{
    float *h_x = NULL, *d_x = NULL;
    double tg, tc;
    int nblocks, nthreads, nsize, n, i, j, step;
    size_t nbytes;
    struct timeval start, end;

    (void)argc;
    (void)argv;

    // Set the number of blocks and the number of threads per block.
    // The kernel has no bounds check, so nblocks * nthreads must equal the
    // number of elements allocated below.
    nblocks = 500;
    nthreads = 500;
    nsize = nblocks * nthreads;
    nbytes = (size_t)nsize * sizeof(float);

    // Print roughly ten sample rows; never let the stride drop to 0, which
    // would make the sampling loops spin forever when nblocks < 10.
    step = nblocks / 10;
    if (step < 1)
        step = 1;

    // Allocate memory for the matrix on the host and on the device.
    h_x = (float *)malloc(nbytes);
    if (h_x == NULL) {
        fprintf(stderr, "Failed to allocate %lu bytes of host memory\n",
                (unsigned long)nbytes);
        return 1;
    }
    cutilSafeCall(cudaMalloc((void **)&d_x, nbytes));

    // Initialize the matrix on the GPU and square every element.
    printf("Using GPU to initializing and power a matrix\n");

    // Record the start time.
    gettimeofday(&start, NULL);

    // Execute the kernel: one thread per element.
    my_first_kernel<<<nblocks, nthreads>>>(d_x);

    // Check for launch errors.
    cutilCheckMsg("Kernel execution failed");

    // Copy the data from the GPU back to the CPU. The end timestamp is taken
    // after this copy, so the measured GPU time includes the transfer.
    cutilSafeCall(cudaMemcpy(h_x, d_x, nbytes, cudaMemcpyDeviceToHost));

    // Record the end time.
    gettimeofday(&end, NULL);

    // Display the last column of a sample of rows. The row stride is nthreads,
    // matching the layout used by the CPU fill below.
    printf("\tRow\tColumnx\tValue\n");
    for (n = 0; n < nblocks; n += step)
        printf("\t%d\t%d\t%f\n", n + 1, nthreads, h_x[n * nthreads + nthreads - 1]);

    // Display how long the GPU path took.
    tg = elapsed_seconds(&start, &end);
    printf("Using GPU take %f second\n\n", tg);

    // Do the same thing using the CPU.
    printf("Using CPU to initializing and power a same matrix...\n");

    // Record the start time.
    gettimeofday(&start, NULL);

    // Initialize and square every element.
    for (i = 0; i < nblocks; i++)
        for (j = 0; j < nthreads; j++)
            h_x[i * nthreads + j] = (float)pow((double)(i * nthreads + j), 2.0);

    // Record the end time.
    gettimeofday(&end, NULL);

    // Display the same sample of elements for comparison with the GPU result.
    printf("\tRow\tColumnx\tValue\n");
    for (n = 0; n < nblocks; n += step)
        printf("\t%d\t%d\t%f\n", n + 1, nthreads, h_x[n * nthreads + nthreads - 1]);

    // Display how long the CPU path took.
    tc = elapsed_seconds(&start, &end);
    printf("Using CPU take %f second\n\n", tc);

    // Display the speed-up ratio; skip the division if the GPU interval was
    // below the timer resolution.
    if (tg > 0.0)
        printf("Speed UP ratio:%f\n!", tc / tg);
    else
        printf("Speed UP ratio: n/a (GPU time below timer resolution)\n");

    // Free memory.
    cutilSafeCall(cudaFree(d_x));
    free(h_x);

    return 0;
}

卓er

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫