CUDA第一个程序（只有一个线程）

最新推荐文章于 2023-02-14 19:31:36 发布

yanghangjun

最新推荐文章于 2023-02-14 19:31:36 发布

阅读量1.3k

点赞数

分类专栏： CUDA 文章标签： cuda properties include struct 测试

本文链接：https://blog.csdn.net/yanghangjun/article/details/5600058

版权

CUDA 专栏收录该内容

27 篇文章 2 订阅

订阅专栏

#include <stdio.h>
#include <cuda_runtime.h>
#include<stdlib.h>
#define TOTAL_NUM 50000
/*
 * Pick the first CUDA device whose properties can be queried, print a
 * summary of those properties, and make that device current.
 *
 * Returns true when a device was selected with cudaSetDevice(); returns
 * false (after printing a message) when the device count cannot be read,
 * no device is reported, no device's properties can be queried, or
 * cudaSetDevice() fails for any reason.
 */
bool CUDA_initial(void)
{
    int device_count = 0;

    /* Treat both an API failure and an empty device list as "no device". */
    if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count <= 0)
    {
        printf(" There is zero device beyond 1.0\n");
        return false;
    }
    printf("There is %d device beyond 1.0\n", device_count);

    /* Index of the first device whose properties were read, or -1. */
    int chosen = -1;
    for (int i = 0; i < device_count; i++)
    {
        struct cudaDeviceProp device_prop;
        if (cudaGetDeviceProperties(&device_prop, i) != cudaSuccess)
        {
            continue;
        }

        /*
         * The size_t members are cast to unsigned long long and printed with
         * %llu: passing a size_t where printf expects an int is undefined
         * behaviour and corrupts every argument that follows on 64-bit
         * targets.
         * NOTE(review): clockRate and deviceOverlap may be deprecated in
         * newer CUDA toolkits - confirm against the toolkit in use.
         */
        printf("device properties is :\n"
               "\t device name is %s\n"
               "\t totalGlobalMem is %llu\n"
               "\t sharedMemPerBlock is %llu\n"
               "\t regsPerBlock is %d\n"
               "\t warpSize is %d\n"
               "\t memPitch is %llu\n"
               "\t maxThreadsPerBlock is %d\n"
               "\t maxThreadsDim [3] is %d X %d X %d\n"
               "\t maxGridSize [3] is %d X %d X %d\n"
               "\t totalConstMem is %llu\n"
               "\t device version is major %d ,minor %d\n"
               "\t clockRate is %d\n"
               "\t textureAlignment is %llu\n"
               "\t deviceOverlap is %d\n"
               "\t multiProcessorCount is %d\n",
               device_prop.name,
               (unsigned long long)device_prop.totalGlobalMem,
               (unsigned long long)device_prop.sharedMemPerBlock,
               device_prop.regsPerBlock,
               device_prop.warpSize,
               (unsigned long long)device_prop.memPitch,
               device_prop.maxThreadsPerBlock,
               device_prop.maxThreadsDim[0], device_prop.maxThreadsDim[1], device_prop.maxThreadsDim[2],
               device_prop.maxGridSize[0], device_prop.maxGridSize[1], device_prop.maxGridSize[2],
               (unsigned long long)device_prop.totalConstMem,
               device_prop.major, device_prop.minor,
               device_prop.clockRate,
               (unsigned long long)device_prop.textureAlignment,
               device_prop.deviceOverlap,
               device_prop.multiProcessorCount);

        chosen = i;
        break;
    }

    if (chosen < 0)
    {
        printf("Get the propertites of device occurred error\n");
        return false;
    }

    /* Any status other than cudaSuccess means the device is not usable. */
    if (cudaSetDevice(chosen) != cudaSuccess)
    {
        printf("Set Device occurred error\n");
        return false;
    }

    return true;
}

/*
 * Fill the first data_num elements of num with values produced by
 * rand() % 10, in increasing index order. Nothing is written when
 * data_num is zero or negative.
 */
void generate_num(int *num,int data_num)
{
    int *cursor = num;
    int remaining = data_num;

    while (remaining > 0)
    {
        *cursor = rand() % 10;
        ++cursor;
        --remaining;
    }
}

/*
 * Kernel that sums the squares of num[0 .. num_of_num-1] sequentially.
 *
 * The kernel does not use threadIdx/blockIdx: every thread that executes it
 * walks the whole array and writes the same two outputs.
 *
 *   num        - input values
 *   num_of_num - number of elements to read from num
 *   result     - receives the sum of squares
 *   time       - receives the clock() difference measured around the loop
 *                and the store to *result
 */
__global__ void square_sum(int *num,int num_of_num,int * result,clock_t *time)
{
    int acc = 0;
    const clock_t tick_begin = clock();

    for (int idx = 0; idx < num_of_num; ++idx)
    {
        /* Accumulate in a local variable: adding straight into *result here
           would waste clock cycles on memory accesses. */
        const int value = num[idx];
        acc += value * value;
    }

    *result = acc;

    const clock_t tick_end = clock();
    *time = tick_end - tick_begin;
}

/* Report a failed CUDA runtime call; returns true when status is cudaSuccess. */
static bool cuda_call_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    printf("%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Generate TOTAL_NUM random digits, compute the sum of their squares on the
 * GPU with a single-thread launch of square_sum, print the GPU result and
 * the clock count it reported, then compute and print the same sum on the CPU.
 *
 * Returns 0 on success and 1 when device initialisation or any CUDA call
 * fails.
 */
int main()
{
    /* Nothing below can work without a selected device. */
    if (!CUDA_initial())
    {
        return 1;
    }
    printf("CUDA initial successed!\n");

    int num_str[TOTAL_NUM];
    generate_num(num_str, TOTAL_NUM);

    /* Device buffers start as NULL so the cleanup below is safe on any path. */
    int *gpudata = NULL;
    int *result = NULL;
    clock_t *time = NULL;
    int result_in_GPU = 0;
    clock_t time_used = 0;

    /* && short-circuits, so the first failing call stops the sequence. */
    bool ok =
        cuda_call_ok(cudaMalloc((void **)&gpudata, sizeof(int) * TOTAL_NUM), "cudaMalloc(gpudata)") &&
        cuda_call_ok(cudaMalloc((void **)&result, sizeof(int)), "cudaMalloc(result)") &&
        cuda_call_ok(cudaMalloc((void **)&time, sizeof(clock_t)), "cudaMalloc(time)") &&
        cuda_call_ok(cudaMemcpy((void *)gpudata, num_str, sizeof(int) * TOTAL_NUM, cudaMemcpyHostToDevice),
                     "cudaMemcpy(host to device)");

    if (ok)
    {
        square_sum<<<1,1>>>(gpudata, TOTAL_NUM, result, time);

        /* A kernel launch returns no status itself; query it explicitly.
           The blocking cudaMemcpy calls then wait for the kernel and report
           any error raised while it ran. */
        ok = cuda_call_ok(cudaGetLastError(), "square_sum launch") &&
             cuda_call_ok(cudaMemcpy((void *)&result_in_GPU, result, sizeof(int), cudaMemcpyDeviceToHost),
                          "cudaMemcpy(result)") &&
             cuda_call_ok(cudaMemcpy((void *)&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost),
                          "cudaMemcpy(time)");
    }

    if (ok)
    {
        printf("In GPU result is %d\n", result_in_GPU);
        /* clock_t is not int; cast to a type with a matching format specifier. */
        printf("In GPU time used is %lld\n", (long long)time_used);
    }

    /* CPU reference value for comparison with the GPU result. */
    int result_in_CPU = 0;
    for (int i = 0; i < TOTAL_NUM; i++)
    {
        result_in_CPU += num_str[i] * num_str[i];
    }
    printf("In CPU result is %d\n", result_in_CPU);

    /* cudaFree accepts a null pointer, so no per-pointer guards are needed. */
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    return ok ? 0 : 1;
}

测试结果：

There is 1 device beyond 1.0
device properties is :
         device name is GeForce 9800 GT
         totalGlobalMem is 536543232
         sharedMemPerBlock is 16384
         regsPerBlock is 8192
         warpSize is 32
         memPitch is 262144
         maxThreadsPerBlock is 512
         maxThreadsDim [3] is 512 X 512 X 64
         maxGridSize [3] is 65535 X 65535 X 1
         totalConstMem is 65536
         device version is major 1 ,minor 1
         clockRate is 1350000
         textureAlignment is 256
         deviceOverlap is 1
         multiProcessorCount is 14
CUDA initial successed!
In GPU result is 1419240
In GPU time used is 29763916
In CPU result is 1419240
请按任意键继续. . .

计算执行的时间：29763916 个时钟周期 ÷ 1.35 GHz（clockRate = 1350000 kHz）≈ 0.022 s

内存带宽：50000 × 4 字节 ÷ 1048576 ÷ 0.022 s ≈ 8.67 MB/s

yanghangjun

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
CUDA第一个程序（只有一个线程）

#include #include #include#define TOTAL_NUM 50000bool CUDA_initial(void){ int i; int device_count; if( cudaGetDeviceCount(&device_count) ) { printf(" There is zero device beyond 1.0/n"); return fal
复制链接

扫一扫

专栏目录