CUDA第一个程序优化(若干个线程)

#include <stdio.h>
#include <cuda_runtime.h>
#include<stdlib.h>
#define TOTAL_NUM 50000
#define Thread_num 500
/*
 * Pick the first CUDA device whose properties can be queried and make it
 * the current device.
 *
 * Prints the device count and the selected device's properties to stdout.
 *
 * Returns true when a device was selected. Returns false when the device
 * count cannot be obtained, when no device's properties can be read (this
 * includes the case of zero devices), or when cudaSetDevice() fails.
 */
bool CUDA_initial(void)
{
 int i;
 int device_count=0;
 if( cudaGetDeviceCount(&device_count)!=cudaSuccess )
 {
  printf(" There is zero device beyond 1.0\n");
  return false;
 }
 printf("There is %d device beyond 1.0\n",device_count);

 /* Stop at the first device that answers the property query; i keeps its index. */
 for(i=0;i<device_count;i++)
 {
  struct cudaDeviceProp device_prop;
  if(cudaGetDeviceProperties(&device_prop,i)==cudaSuccess)
  {
   /* The size_t members are printed with %zu; the remaining members are int. */
   printf("device properties is :\n"
      "\t device name is %s\n"
      "\t totalGlobalMem is %zu\n"
      "\t sharedMemPerBlock is %zu\n"
      "\t regsPerBlock is %d\n"
      "\t warpSize is %d\n"
      "\t memPitch is %zu\n"
      "\t maxThreadsPerBlock is %d\n"
      "\t maxThreadsDim [3] is %d X %d X %d\n"
      "\t maxGridSize [3] is %d X %d X %d\n"
      "\t totalConstMem is %zu\n"
      "\t device version is major %d ,minor %d\n"
      "\t clockRate is %d\n"
      "\t textureAlignment is %zu\n"
      "\t deviceOverlap is %d\n"
      "\t multiProcessorCount is %d\n",
      device_prop.name,
      device_prop.totalGlobalMem,
      device_prop.sharedMemPerBlock,
      device_prop.regsPerBlock,
      device_prop.warpSize,
      device_prop.memPitch,
      device_prop.maxThreadsPerBlock,
      device_prop.maxThreadsDim[0],device_prop.maxThreadsDim[1],device_prop.maxThreadsDim[2],
      device_prop.maxGridSize[0],device_prop.maxGridSize[1],device_prop.maxGridSize[2],
      device_prop.totalConstMem,
      device_prop.major,device_prop.minor,
      device_prop.clockRate,
      device_prop.textureAlignment,
      device_prop.deviceOverlap,
      device_prop.multiProcessorCount);
   break;
  }
 }

 /* The loop ran off the end: no device (possibly out of zero) could be queried. */
 if(i==device_count)
 {
  printf("Get the propertites of device occurred error\n");
  return false;
 }

 /* Any status other than cudaSuccess means the device was not selected. */
 if(cudaSetDevice(i)!=cudaSuccess)
 {
  printf("Set Device occurred error\n");
  return false;
 }

 return true;
}


/*
 * Store data_num pseudo-random values in the range 0..9, taken from rand(),
 * into consecutive elements starting at num. Nothing is written when
 * data_num is zero or negative.
 */
void generate_num(int *num,int data_num)
{
 int *cursor=num;
 int remaining=data_num;
 while(remaining>0)
 {
  *cursor=rand()%10;
  ++cursor;
  --remaining;
 }
}


/*********************************time test*************************************/
 class TimeCounter{
                protected :
                        clock_t startp,endp;
                public :
                        TimeCounter():startp(-1),endp(-1){}
                        void start(){//设置计时起点
                        #ifdef __CUDACC__
                                cudaThreadSynchronize();
                        #endif
                                startp=clock();
                        }
                        void stop(){//设置计时终点
                                if(-1==startp){
                                        perror("you must set start point at first");
                                }else{
                                #ifdef __CUDACC__
                                        cudaThreadSynchronize();
                                #endif
                                        endp=clock();
                                }
                        }
                        virtual long getTimeDiff()=0;//返回时间差滴答数
                        virtual void printTimeDiff()=0;//打印出时间差
        };
        class MillisecondCounter:public TimeCounter{
                public :
                    long getTimeDiff(){

                                 if(-1==endp){
                                        perror("you must set stop point before invoke this function");
                                        exit(1);
                                }else{
                                        return 1.0f*(endp-startp)/CLOCKS_PER_SEC*1000;
                                }
                        }
                        void printTimeDiff(){
                                long temp=getTimeDiff();
                                printf("use time :%ldms/n",temp);
                        }
        };
#ifdef __CUDACC__
/* TimeCounter that reports the measured interval in whole microseconds. */
class MicrosecondCounter:public TimeCounter{
                public:
                        // Elapsed microseconds between start() and stop(), truncated.
                        // Terminates the program with status 1 when no end point is recorded.
                        long getTimeDiff(){
                                if(-1==endp){
                                        printf("please set start point or end point\n");
                                        exit(1);
                                }
                                // Double arithmetic: float cannot hold tick counts above 2^24 exactly.
                                return (long)(1000000.0*(double)(endp-startp)/CLOCKS_PER_SEC);
                        }
                        // Print the elapsed microseconds on one line of stdout.
                        void printTimeDiff(){
                                long temp=getTimeDiff();
                                printf("use time:%ld us\n",temp);
                        }
        };
#endif
/****************time test end ************************/

 

/*
 * Compute per-thread partial sums of squares of num[0 .. num_of_num).
 *
 * Indexing uses threadIdx.x only, so the kernel is meant for a launch with
 * one 1-D block. Thread t adds the squares of elements t, t+blockDim.x,
 * t+2*blockDim.x, ... and stores its partial sum in result[t]; result must
 * therefore hold blockDim.x ints. The caller adds the partial sums.
 *
 * Thread 0 also writes to *time the number of clock() ticks that elapsed
 * between the start and the end of its own work.
 *
 * The stride is taken from blockDim.x rather than a compile-time constant,
 * so the result stays correct for any block size chosen at launch.
 */
__global__ void square_sum(int *num,int num_of_num,int *result,clock_t *time)
{
 const int index=threadIdx.x;
 const int stride=blockDim.x;
 clock_t start=0;
 if(index==0)
  start=clock();

 int sum=0;
 for(int i=index;i<num_of_num;i+=stride)
 {
  const int value=num[i];
  sum+=value*value;
 }
 result[index]=sum;

 if(index==0)
  *time=clock()-start;
}


/* Report a failed CUDA runtime call on stderr; returns true when status is cudaSuccess. */
static bool cuda_ok(cudaError_t status,const char *what)
{
 if(status==cudaSuccess)
  return true;
 fprintf(stderr,"%s failed: %s\n",what,cudaGetErrorString(status));
 return false;
}

/*
 * Sum the squares of TOTAL_NUM random digits on the GPU (one block of
 * Thread_num threads, partial sums added on the host) and on the CPU, and
 * print both results together with the tick count reported by the kernel.
 *
 * Returns 0 on success and 1 when device initialisation or any CUDA call fails.
 */
int main()
{
 int i;
 if(!CUDA_initial())
 {
  fprintf(stderr,"CUDA initial failed!\n");
  return 1;
 }
 printf("CUDA initial successed!\n");

 int num_str[TOTAL_NUM];
 int sum_GPU[Thread_num];
 generate_num(num_str,TOTAL_NUM);

 int *gpudata=NULL;
 int *result=NULL;
 clock_t *d_elapsed=NULL;
 clock_t time_used=0;

 /* && short-circuits, so the first failing step stops the chain. */
 bool ok=cuda_ok(cudaMalloc((void **)&gpudata,sizeof(int)*TOTAL_NUM),"cudaMalloc(gpudata)")
      && cuda_ok(cudaMalloc((void **)&result,sizeof(int)*Thread_num),"cudaMalloc(result)")
      && cuda_ok(cudaMalloc((void **)&d_elapsed,sizeof(clock_t)),"cudaMalloc(time)")
      && cuda_ok(cudaMemcpy(gpudata,num_str,sizeof(int)*TOTAL_NUM,cudaMemcpyHostToDevice),"cudaMemcpy(input)");

 if(ok)
 {
  square_sum<<<1,Thread_num>>>(gpudata,TOTAL_NUM,result,d_elapsed);
  /* A launch reports no status itself: configuration errors come from
     cudaGetLastError(), execution errors from the blocking copies below. */
  ok=cuda_ok(cudaGetLastError(),"square_sum launch")
  && cuda_ok(cudaMemcpy(&time_used,d_elapsed,sizeof(clock_t),cudaMemcpyDeviceToHost),"cudaMemcpy(time)")
  && cuda_ok(cudaMemcpy(sum_GPU,result,sizeof(int)*Thread_num,cudaMemcpyDeviceToHost),"cudaMemcpy(result)");
 }

 /* Pointers that were never allocated are still NULL, which cudaFree accepts. */
 cudaFree(gpudata);
 cudaFree(result);
 cudaFree(d_elapsed);

 if(!ok)
  return 1;

 int result_in_GPU=0;
 for(i=0;i<Thread_num;i++)
  result_in_GPU+=sum_GPU[i];
 printf("In GPU result is %d\n",result_in_GPU);
 /* clock_t is not int; print it through long. */
 printf("In GPU time used is %ld\n",(long)time_used);

 int result_in_CPU=0;
 for(i=0;i<TOTAL_NUM;i++)
 {
  result_in_CPU+=num_str[i]*num_str[i];
 }
 printf("In CPU result is %d\n",result_in_CPU);
 return 0;
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值