#include <stdio.h>
#include <cuda_runtime.h>
#include<stdlib.h>
/* Number of integers generated on the host and copied to the device in main(). */
#define TOTAL_NUM 50000
/* Number of threads in the single block launched by main(); also the length
 * of the per-thread partial-sum array that main() reads back. */
#define Thread_num 500
/*
 * Pick the first CUDA device whose properties can be queried, print those
 * properties to stdout and make that device current.
 *
 * Returns true when a device was selected, false when the device count
 * could not be obtained, no device answered the property query, or
 * cudaSetDevice() reported any error.
 */
bool CUDA_initial(void)
{
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess)
    {
        printf(" There is zero device beyond 1.0\n");
        return false;
    }
    printf("There is %d device beyond 1.0\n", device_count);

    int i;
    for (i = 0; i < device_count; i++)
    {
        struct cudaDeviceProp device_prop;
        if (cudaGetDeviceProperties(&device_prop, i) != cudaSuccess)
        {
            /* Reset the recorded error so it is not mistaken for a later failure. */
            (void)cudaGetLastError();
            continue;
        }

        /* size_t fields are widened to unsigned long long: printing them with
         * %d is undefined and truncates memory sizes above 2 GB. */
        printf("device properties is :\n"
               "\t device name is %s\n"
               "\t totalGlobalMem is %llu\n"
               "\t sharedMemPerBlock is %llu\n"
               "\t regsPerBlock is %d\n"
               "\t warpSize is %d\n"
               "\t memPitch is %llu\n"
               "\t maxThreadsPerBlock is %d\n"
               "\t maxThreadsDim [3] is %d X %d X %d\n"
               "\t maxGridSize [3] is %d X %d X %d\n"
               "\t totalConstMem is %llu\n"
               "\t device version is major %d ,minor %d\n"
               "\t clockRate is %d\n"
               "\t textureAlignment is %llu\n"
               "\t deviceOverlap is %d\n"
               "\t multiProcessorCount is %d\n",
               device_prop.name,
               (unsigned long long)device_prop.totalGlobalMem,
               (unsigned long long)device_prop.sharedMemPerBlock,
               device_prop.regsPerBlock,
               device_prop.warpSize,
               (unsigned long long)device_prop.memPitch,
               device_prop.maxThreadsPerBlock,
               device_prop.maxThreadsDim[0], device_prop.maxThreadsDim[1], device_prop.maxThreadsDim[2],
               device_prop.maxGridSize[0], device_prop.maxGridSize[1], device_prop.maxGridSize[2],
               (unsigned long long)device_prop.totalConstMem,
               device_prop.major, device_prop.minor,
               device_prop.clockRate,
               (unsigned long long)device_prop.textureAlignment,
               device_prop.deviceOverlap,
               device_prop.multiProcessorCount);
        break;
    }

    /* The loop ran to completion: either there are zero devices or none of
     * them answered the property query. */
    if (i == device_count)
    {
        printf("Get the propertites of device occurred error\n");
        return false;
    }

    /* Treat every failure as fatal, not only cudaErrorInvalidDevice. */
    if (cudaSetDevice(i) != cudaSuccess)
    {
        printf("Set Device occurred error\n");
        return false;
    }
    return true;
}
/*
 * Fill the first data_num slots of num with values drawn from rand() % 10.
 * Nothing is written when data_num is zero or negative.
 */
void generate_num(int *num,int data_num)
{
    int *cursor = num;
    int remaining = data_num;
    while (remaining > 0)
    {
        *cursor = rand() % 10;
        ++cursor;
        --remaining;
    }
}
/*********************************time test*************************************/
/*
 * Abstract host-side stopwatch built on the C library clock().
 *
 * start() and stop() record clock() values; when compiled by nvcc they first
 * wait for the device to go idle so that previously launched GPU work is
 * included in the measured interval. Subclasses decide the unit in which the
 * interval is reported.
 */
class TimeCounter{
protected :
    /* clock() samples; (clock_t)-1 marks "not recorded yet". */
    clock_t startp,endp;
public :
    TimeCounter():startp(-1),endp(-1){}
    /* Virtual so that deleting a subclass through a TimeCounter* is well defined. */
    virtual ~TimeCounter(){}

    /* Set the starting point of the measurement. */
    void start(){
#ifdef __CUDACC__
        /* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() replaces it. */
        cudaDeviceSynchronize();
#endif
        startp=clock();
    }

    /* Set the end point of the measurement; only reports an error if start() was never called. */
    void stop(){
        if(startp==(clock_t)-1){
            /* Not an errno failure, so perror() would append an unrelated system message. */
            fprintf(stderr,"you must set start point at first\n");
            return;
        }
#ifdef __CUDACC__
        cudaDeviceSynchronize();
#endif
        endp=clock();
    }

    virtual long getTimeDiff()=0;   /* Return the elapsed time in the subclass's unit. */
    virtual void printTimeDiff()=0; /* Print the elapsed time. */
};
/*
 * TimeCounter that reports the recorded interval in whole milliseconds.
 */
class MillisecondCounter:public TimeCounter{
public :
    /*
     * Return the elapsed time between start() and stop() in milliseconds,
     * truncated toward zero. Terminates the process with exit(1) when no
     * end point has been recorded.
     */
    long getTimeDiff(){
        if(endp==(clock_t)-1){
            /* Not an errno failure, so perror() would append an unrelated system message. */
            fprintf(stderr,"you must set stop point before invoke this function\n");
            exit(1);
        }
        /* Double arithmetic: a float cannot hold large tick counts exactly. */
        return (long)((double)(endp-startp)*1000.0/CLOCKS_PER_SEC);
    }

    /* Print the elapsed time in milliseconds to stdout. */
    void printTimeDiff(){
        long elapsed_ms=getTimeDiff();
        printf("use time :%ldms\n",elapsed_ms);
    }
};
#ifdef __CUDACC__
/*
 * TimeCounter that reports the recorded interval in whole microseconds.
 * Only available when the file is compiled by nvcc.
 */
class MicrosecondCounter:public TimeCounter{
public:
    /*
     * Return the elapsed time between start() and stop() in microseconds,
     * truncated toward zero. Terminates the process with exit(1) when no
     * end point has been recorded.
     */
    long getTimeDiff(){
        if(endp==(clock_t)-1){
            printf("please set start point or end point\n");
            exit(1);
        }
        /* Double arithmetic: float's 24-bit mantissa drops microseconds once
         * the tick difference exceeds 2^24. */
        return (long)((double)(endp-startp)*1000000.0/CLOCKS_PER_SEC);
    }

    /* Print the elapsed time in microseconds to stdout. */
    void printTimeDiff(){
        long elapsed_us=getTimeDiff();
        printf("use time:%ld us\n",elapsed_us);
    }
};
#endif
/****************time test end ************************/
/*
 * Compute per-thread partial sums of squares of num[0 .. num_of_num-1].
 *
 * Each thread walks the input in a grid-stride loop and stores its partial
 * sum in result[global thread index], so result must hold at least
 * gridDim.x * blockDim.x ints; the caller adds the partial sums together.
 * The stride comes from the launch configuration, so any 1-D grid/block
 * size produces correct partial sums.
 *
 * time receives the number of device clock() ticks that global thread 0
 * spent between entering the kernel and finishing its own store; it does
 * not wait for the other threads.
 */
__global__ void square_sum(int *num,int num_of_num,int *result,clock_t *time)
{
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = gridDim.x * blockDim.x;
    const bool is_timer_thread = (index == 0);

    clock_t start = 0;
    if (is_timer_thread)
        start = clock();

    int sum = 0;
    for (int i = index; i < num_of_num; i += stride)
        sum += num[i] * num[i];
    result[index] = sum;

    if (is_timer_thread)
        *time = clock() - start;
}
/* Abort with a diagnostic when a CUDA runtime call did not succeed. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Generate TOTAL_NUM random digits, compute the sum of their squares on the
 * GPU (one block of Thread_num threads, partial sums added on the host) and
 * on the CPU, and print both results plus the tick count reported by the
 * kernel.
 *
 * Returns 0 on success; exits with EXIT_FAILURE when CUDA initialisation or
 * any CUDA runtime call fails.
 */
int main()
{
    /* Running the rest without a selected device would only produce garbage. */
    if (!CUDA_initial())
    {
        fprintf(stderr, "CUDA initial failed!\n");
        return EXIT_FAILURE;
    }
    printf("CUDA initial successed!\n");

    int num_str[TOTAL_NUM];
    generate_num(num_str, TOTAL_NUM);

    int *gpudata = NULL;
    int *result = NULL;
    clock_t *d_time = NULL;
    check_cuda(cudaMalloc((void **)&gpudata, sizeof(int) * TOTAL_NUM), "cudaMalloc(gpudata)");
    check_cuda(cudaMalloc((void **)&result, sizeof(int) * Thread_num), "cudaMalloc(result)");
    check_cuda(cudaMalloc((void **)&d_time, sizeof(clock_t)), "cudaMalloc(time)");
    check_cuda(cudaMemcpy((void *)gpudata, num_str, sizeof(int) * TOTAL_NUM, cudaMemcpyHostToDevice),
               "cudaMemcpy(host to device)");

    square_sum<<<1, Thread_num>>>(gpudata, TOTAL_NUM, result, d_time);
    /* A launch returns no status: query it, then wait so that execution
     * faults are reported here rather than by a later, unrelated call. */
    check_cuda(cudaGetLastError(), "square_sum launch");
    check_cuda(cudaDeviceSynchronize(), "square_sum execution");

    clock_t time_used = 0;
    int sum_GPU[Thread_num];
    check_cuda(cudaMemcpy((void *)&time_used, d_time, sizeof(clock_t), cudaMemcpyDeviceToHost),
               "cudaMemcpy(time)");
    check_cuda(cudaMemcpy((void *)sum_GPU, result, sizeof(int) * Thread_num, cudaMemcpyDeviceToHost),
               "cudaMemcpy(result)");

    /* Add the per-thread partial sums on the host. */
    int result_in_GPU = 0;
    for (int i = 0; i < Thread_num; i++)
        result_in_GPU += sum_GPU[i];
    printf("In GPU result is %d\n", result_in_GPU);
    /* clock_t is not int; cast to long to match the %ld conversion. */
    printf("In GPU time used is %ld\n", (long)time_used);

    /* CPU reference for comparison. */
    int result_in_CPU = 0;
    for (int i = 0; i < TOTAL_NUM; i++)
        result_in_CPU += num_str[i] * num_str[i];
    printf("In CPU result is %d\n", result_in_CPU);

    check_cuda(cudaFree(gpudata), "cudaFree(gpudata)");
    check_cuda(cudaFree(result), "cudaFree(result)");
    check_cuda(cudaFree(d_time), "cudaFree(time)");
    return 0;
}