#include<cstdlib>
#include<iostream>
#include<time.h>
#include<cuda.h>
#include<cuda_runtime.h>
using namespace std;
// Number of int elements in the input arrays that main allocates and fills.
const int N=1234567;
// Width of one histogram bucket: bucket i is selected by value < sample*(i+1).
const int sample=100;
// Threads per block used for every judge<<<...>>> launch.
const int threads=256;
/*
 * Histogram kernel: one thread per input element.
 *
 * Each thread reads da[tid], determines which bucket of width `sample` the
 * value belongs to, and atomically increments that bucket's counter.
 *
 * Launch layout: 1D grid of 1D blocks with at least N threads in total
 * (main uses ceil(N / threads) blocks). No shared memory is used.
 *
 * Parameters:
 *   da   - input values; must hold at least N ints (N is the file-level
 *          element count).
 *   data - bucket counters; must hold at least `sam` ints and be zeroed by
 *          the caller before the launch.
 *   sam  - number of buckets. Values >= sample*sam are not counted.
 */
__global__ void judge(int *da,int *data,int sam)
{
    const int tid=blockIdx.x*blockDim.x+threadIdx.x;
    // The grid is rounded up to whole blocks, so the trailing threads of the
    // last block have no element to process and must not touch da.
    if(tid>=N)
        return;
    const int value=da[tid];
    // Bucket i is the first i with value < sample*(i+1). For non-negative
    // values that is value/sample; negative values land in bucket 0.
    const int bin=(value<0)?0:value/sample;
    if(bin<sam)
        atomicAdd(&data[bin],1);
}
// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// (file, line, CUDA error string) if it did not return cudaSuccess.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_=(call);                                \
        if(cuda_check_err_!=cudaSuccess) {                                 \
            std::cerr<<"CUDA error "<<__FILE__<<":"<<__LINE__<<": "        \
                     <<cudaGetErrorString(cuda_check_err_)<<std::endl;     \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while(0)
#endif

/*
 * Benchmark driver.
 *
 * Builds a histogram of the values 0..N-1 with bucket width `sample` three
 * ways: on the host (reference), on the GPU using managed (unified) memory,
 * and on the GPU using explicitly allocated device memory. Prints the
 * allocation times (clock ticks) and kernel times (milliseconds) and checks
 * both GPU histograms against the host reference.
 *
 * Returns 0 on success, non-zero if a GPU histogram differs from the host one.
 */
int main(void)
{
    int *ha,*da;// host input and managed (unified) input
    clock_t a,b,c;

    // Time the managed-memory allocation.
    a=clock();
    CUDA_CHECK(cudaMallocManaged(&da,N*sizeof(int)));
    b=clock()-a;
    cout<<"unified-"<<b<<endl;

    ha=new int[N];
    for(int i=0;i<N;i++)// initial values
    {
        ha[i]=i;
        da[i]=ha[i];
    }

    int it_sam=(N+sample-1)/sample;// number of buckets
    int *h_data,*d_data;
    h_data=new int[it_sam];
    CUDA_CHECK(cudaMallocManaged(&d_data,it_sam*sizeof(int)));
    for(int i=0;i<it_sam;i++)// zero both histograms
    {
        h_data[i]=0;
        d_data[i]=0;
    }

    // Host reference histogram. The bucket is computed directly instead of
    // scanning every bucket boundary; the result is the same as the scan.
    for(int i=0;i<N;i++)
    {
        const int bin=(ha[i]<0)?0:ha[i]/sample;
        if(bin<it_sam)
            h_data[bin]++;
    }
    cout<<"host____"<<endl;

    int blocks=(N+threads-1)/threads;

    // ---- Run 1: kernel operating on managed memory ----
    cudaEvent_t start,stop;
    float time_unified;// elapsed kernel time in ms
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start,0));
    judge<<<blocks,threads>>>(da,d_data,it_sam);
    CUDA_CHECK(cudaGetLastError());// catches launch-configuration errors
    // Record the stop event right behind the kernel so that the measurement
    // does not include host-side synchronization latency.
    CUDA_CHECK(cudaEventRecord(stop,0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time_unified,start,stop));
    // Surfaces asynchronous kernel faults and makes the managed buffers safe
    // to read from the host.
    CUDA_CHECK(cudaDeviceSynchronize());
    cout<<"unified__"<<time_unified<<endl;

    int bad_unified=0;
    for(int i=0;i<it_sam;i++)
        if(h_data[i]!=d_data[i])
            bad_unified++;

    // ---- Run 2: kernel operating on explicit device memory ----
    // The device histogram must start from zero; seeding it from d_data would
    // carry over the counts produced by run 1.
    int *data;
    CUDA_CHECK(cudaMalloc(&data,it_sam*sizeof(int)));
    CUDA_CHECK(cudaMemset(data,0,it_sam*sizeof(int)));

    int *dda;
    // Time the plain device allocation.
    a=clock();
    CUDA_CHECK(cudaMalloc(&dda,N*sizeof(int)));
    c=clock()-a;
    cout<<"cudaMemcpy-"<<c<<endl;
    CUDA_CHECK(cudaMemcpy(dda,ha,N*sizeof(int),cudaMemcpyHostToDevice));

    cudaEvent_t start1,stop1;
    float time_gpu;// elapsed kernel time in ms
    CUDA_CHECK(cudaEventCreate(&start1));
    CUDA_CHECK(cudaEventCreate(&stop1));
    CUDA_CHECK(cudaEventRecord(start1,0));
    judge<<<blocks,threads>>>(dda,data,it_sam);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop1,0));
    CUDA_CHECK(cudaEventSynchronize(stop1));
    CUDA_CHECK(cudaEventElapsedTime(&time_gpu,start1,stop1));
    CUDA_CHECK(cudaDeviceSynchronize());
    cout<<"device__"<<time_gpu<<endl;

    // Bring the device histogram back and compare it with the host reference.
    int *h_result=new int[it_sam];
    CUDA_CHECK(cudaMemcpy(h_result,data,it_sam*sizeof(int),cudaMemcpyDeviceToHost));
    int bad_device=0;
    for(int i=0;i<it_sam;i++)
        if(h_data[i]!=h_result[i])
            bad_device++;

    if(bad_unified!=0||bad_device!=0)
        cerr<<"histogram mismatch: unified="<<bad_unified
            <<" device="<<bad_device<<" buckets differ from host"<<endl;

    cout<<"end__"<<endl;

    // Release every resource acquired above.
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaEventDestroy(start1));
    CUDA_CHECK(cudaEventDestroy(stop1));
    CUDA_CHECK(cudaFree(data));
    CUDA_CHECK(cudaFree(dda));
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(da));
    delete[] h_result;
    delete[] h_data;
    delete[] ha;

    return (bad_unified==0&&bad_device==0)?0:1;
}
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;
// Number of int elements in the input arrays that main allocates and fills.
const int N=1234567;
// Width of one histogram bucket: bucket i is selected by value < sample*(i+1).
const int sample=100;
// Threads per block used for every judge<<<...>>> launch.
const int threads=256;
/*
 * Histogram kernel: one thread per input element.
 *
 * Each thread reads da[tid], determines which bucket of width `sample` the
 * value belongs to, and atomically increments that bucket's counter.
 *
 * Launch layout: 1D grid of 1D blocks with at least N threads in total
 * (main uses ceil(N / threads) blocks). No shared memory is used.
 *
 * Parameters:
 *   da   - input values; must hold at least N ints (N is the file-level
 *          element count).
 *   data - bucket counters; must hold at least `sam` ints and be zeroed by
 *          the caller before the launch.
 *   sam  - number of buckets. Values >= sample*sam are not counted.
 */
__global__ void judge(int *da,int *data,int sam)
{
    const int tid=blockIdx.x*blockDim.x+threadIdx.x;
    // The grid is rounded up to whole blocks, so the trailing threads of the
    // last block have no element to process and must not touch da.
    if(tid>=N)
        return;
    const int value=da[tid];
    // Bucket i is the first i with value < sample*(i+1). For non-negative
    // values that is value/sample; negative values land in bucket 0.
    const int bin=(value<0)?0:value/sample;
    if(bin<sam)
        atomicAdd(&data[bin],1);
}
// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// (file, line, CUDA error string) if it did not return cudaSuccess.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_=(call);                                \
        if(cuda_check_err_!=cudaSuccess) {                                 \
            std::cerr<<"CUDA error "<<__FILE__<<":"<<__LINE__<<": "        \
                     <<cudaGetErrorString(cuda_check_err_)<<std::endl;     \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while(0)
#endif

/*
 * Benchmark driver.
 *
 * Builds a histogram of the values 0..N-1 with bucket width `sample` three
 * ways: on the host (reference), on the GPU using managed (unified) memory,
 * and on the GPU using explicitly allocated device memory. Prints the
 * allocation times (clock ticks) and kernel times (milliseconds) and checks
 * both GPU histograms against the host reference.
 *
 * Returns 0 on success, non-zero if a GPU histogram differs from the host one.
 */
int main(void)
{
    int *ha,*da;// host input and managed (unified) input
    clock_t a,b,c;

    // Time the managed-memory allocation.
    a=clock();
    CUDA_CHECK(cudaMallocManaged(&da,N*sizeof(int)));
    b=clock()-a;
    cout<<"unified-"<<b<<endl;

    ha=new int[N];
    for(int i=0;i<N;i++)// initial values
    {
        ha[i]=i;
        da[i]=ha[i];
    }

    int it_sam=(N+sample-1)/sample;// number of buckets
    int *h_data,*d_data;
    h_data=new int[it_sam];
    CUDA_CHECK(cudaMallocManaged(&d_data,it_sam*sizeof(int)));
    for(int i=0;i<it_sam;i++)// zero both histograms
    {
        h_data[i]=0;
        d_data[i]=0;
    }

    // Host reference histogram. The bucket is computed directly instead of
    // scanning every bucket boundary; the result is the same as the scan.
    for(int i=0;i<N;i++)
    {
        const int bin=(ha[i]<0)?0:ha[i]/sample;
        if(bin<it_sam)
            h_data[bin]++;
    }
    cout<<"host____"<<endl;

    int blocks=(N+threads-1)/threads;

    // ---- Run 1: kernel operating on managed memory ----
    cudaEvent_t start,stop;
    float time_unified;// elapsed kernel time in ms
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start,0));
    judge<<<blocks,threads>>>(da,d_data,it_sam);
    CUDA_CHECK(cudaGetLastError());// catches launch-configuration errors
    // Record the stop event right behind the kernel so that the measurement
    // does not include host-side synchronization latency.
    CUDA_CHECK(cudaEventRecord(stop,0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time_unified,start,stop));
    // Surfaces asynchronous kernel faults and makes the managed buffers safe
    // to read from the host.
    CUDA_CHECK(cudaDeviceSynchronize());
    cout<<"unified__"<<time_unified<<endl;

    int bad_unified=0;
    for(int i=0;i<it_sam;i++)
        if(h_data[i]!=d_data[i])
            bad_unified++;

    // ---- Run 2: kernel operating on explicit device memory ----
    // The device histogram must start from zero; seeding it from d_data would
    // carry over the counts produced by run 1.
    int *data;
    CUDA_CHECK(cudaMalloc(&data,it_sam*sizeof(int)));
    CUDA_CHECK(cudaMemset(data,0,it_sam*sizeof(int)));

    int *dda;
    // Time the plain device allocation.
    a=clock();
    CUDA_CHECK(cudaMalloc(&dda,N*sizeof(int)));
    c=clock()-a;
    cout<<"cudaMemcpy-"<<c<<endl;
    CUDA_CHECK(cudaMemcpy(dda,ha,N*sizeof(int),cudaMemcpyHostToDevice));

    cudaEvent_t start1,stop1;
    float time_gpu;// elapsed kernel time in ms
    CUDA_CHECK(cudaEventCreate(&start1));
    CUDA_CHECK(cudaEventCreate(&stop1));
    CUDA_CHECK(cudaEventRecord(start1,0));
    judge<<<blocks,threads>>>(dda,data,it_sam);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop1,0));
    CUDA_CHECK(cudaEventSynchronize(stop1));
    CUDA_CHECK(cudaEventElapsedTime(&time_gpu,start1,stop1));
    CUDA_CHECK(cudaDeviceSynchronize());
    cout<<"device__"<<time_gpu<<endl;

    // Bring the device histogram back and compare it with the host reference.
    int *h_result=new int[it_sam];
    CUDA_CHECK(cudaMemcpy(h_result,data,it_sam*sizeof(int),cudaMemcpyDeviceToHost));
    int bad_device=0;
    for(int i=0;i<it_sam;i++)
        if(h_data[i]!=h_result[i])
            bad_device++;

    if(bad_unified!=0||bad_device!=0)
        cerr<<"histogram mismatch: unified="<<bad_unified
            <<" device="<<bad_device<<" buckets differ from host"<<endl;

    cout<<"end__"<<endl;

    // Release every resource acquired above.
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaEventDestroy(start1));
    CUDA_CHECK(cudaEventDestroy(stop1));
    CUDA_CHECK(cudaFree(data));
    CUDA_CHECK(cudaFree(dda));
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(da));
    delete[] h_result;
    delete[] h_data;
    delete[] ha;

    return (bad_unified==0&&bad_device==0)?0:1;
}