代码部分
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define BDIMX 16
/*
 * Fills the first `size` elements of `ip` with pseudo-random values.
 *
 * Each value is the low 8 bits of rand() divided by 10.0f, so it lies in
 * the range [0.0, 25.5]. The sequence depends on the current rand() state;
 * this function does not seed the generator.
 *
 * ip   : destination buffer with room for at least `size` floats
 * size : number of elements to write; nothing is written when size <= 0
 */
void initData(float *ip, const int size)
{
    int idx = 0;
    while (idx < size) {
        const int lowByte = rand() & 0xFF;  // keep only 0..255
        ip[idx] = (float)lowByte / 10.0f;
        ++idx;
    }
}
/*
 * Prints the first `size` elements of `ip` to stdout, one per line,
 * each prefixed with its index.
 *
 * ip   : buffer to dump
 * size : number of elements to print; nothing is printed when size <= 0
 */
void prinfData(float *ip, const int size)
{
    int pos = 0;
    while (pos < size) {
        const float value = ip[pos];
        printf("%dth element:%f \n", pos, value);
        ++pos;
    }
}
/*
 * Compares the host reference against the GPU result element by element
 * and prints whether the two arrays match.
 *
 * hostRef : reference values computed on the host
 * gpuRef  : values copied back from the device
 * size    : number of elements to compare
 * showme  : when non-zero, prints the few elements just past the middle of
 *           the array (indices size/2+1 .. size/2+4) that are reached
 *           before any mismatch
 *
 * The comparison stops at the first element whose absolute difference
 * exceeds the tolerance.
 */
void checkResult(float *hostRef, float *gpuRef, const int size, int showme)
{
    const double eps = 1E-8;
    bool match = true;
    for (int i = 0; i < size; i++)
    {
        // Take the absolute value of the difference first and only then
        // compare with the tolerance, so a GPU value that is larger than
        // the host value is reported as a mismatch too.
        const double diff = (double)hostRef[i] - (double)gpuRef[i];
        if (fabs(diff) > eps)
        {
            match = false;
            printf("hostref and gpuref is not match!");
            break;
        }
        if (showme && i > size / 2 && i < size / 2 + 5)
        {
            printf("%dth element: host %f gpu %f\n", i, hostRef[i], gpuRef[i]);
        }
    }
    if (!match)
    {
        printf("array is not match");
    }
    else
    {
        printf("array is match");
    }
}
/*
 * Host-side reference transpose.
 *
 * Reads `in` as a row-major matrix with `ny` rows and `nx` columns and
 * writes its transpose to `out`, stored row-major with `nx` rows and `ny`
 * columns: out[col * ny + row] = in[row * nx + col].
 *
 * out : destination buffer, nx * ny floats
 * in  : source buffer, nx * ny floats
 * nx  : number of columns in `in`
 * ny  : number of rows in `in`
 */
void transpostHost(float *out, float *in, const int nx, const int ny)
{
    for (int row = 0; row < ny; ++row)
    {
        const int rowBase = row * nx;  // offset of the first element in this source row
        for (int col = 0; col < nx; ++col)
        {
            const float value = in[rowBase + col];
            out[col * ny + row] = value;
        }
    }
}
/*
 * Matrix transpose kernel: one thread per element.
 *
 * Reads `in` as a row-major matrix with `ny` rows and `nx` columns and
 * writes the transposed element to `out` using the same layout as
 * transpostHost: out[ix * ny + iy] = in[iy * nx + ix].
 *
 * Launch layout: a 2D grid of 2D blocks in which the x dimension spans the
 * nx columns and the y dimension spans the ny rows. The grid may overshoot
 * the matrix; threads outside it do nothing.
 *
 * out : device buffer, nx * ny floats
 * in  : device buffer, nx * ny floats
 * nx  : number of columns in `in`
 * ny  : number of rows in `in`
 */
__global__ void warmup(float *out, float *in, const int nx, const int ny)
{
    // Global coordinate = block origin (blockDim * blockIdx) + offset of the
    // thread inside its block. Adding blockDim and blockIdx instead would
    // map many threads onto the same few elements and leave most of the
    // matrix untouched.
    const unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;

    // Guard against threads that fall past the matrix edge.
    if (ix < nx && iy < ny)
    {
        // Read along a row of `in`, write down a column of `out`.
        out[ix * ny + iy] = in[iy * nx + ix];
    }
}
/* Prints a diagnostic and terminates the program if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Driver: builds a random 2048 x 2048 matrix, transposes it on the host as
 * the reference, runs the `warmup` kernel on the device, copies the result
 * back and compares the two with checkResult.
 *
 * Returns EXIT_SUCCESS when the run completes, EXIT_FAILURE when a host
 * allocation fails. Any failing CUDA call terminates the program through
 * checkCuda.
 */
int main()
{
    int dev = 0;
    cudaDeviceProp deviceProp;
    checkCuda(cudaGetDeviceProperties(&deviceProp, dev), "cudaGetDeviceProperties");
    checkCuda(cudaSetDevice(dev), "cudaSetDevice");

    // Matrix size is 2048 x 2048.
    int nx = 1 << 11;
    int ny = 1 << 11;

    // Thread-block dimensions.
    int blockx = 16;
    int blocky = 16;

    // Total number of bytes; widen to size_t before multiplying so the
    // element count is not computed in int.
    size_t nBytes = (size_t)nx * (size_t)ny * sizeof(float);

    // Block and grid configuration (ceil-division so the grid covers the matrix).
    dim3 block(blockx, blocky);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    // Allocate host memory.
    float *h_A = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);
    if (h_A == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "host allocation of %zu bytes failed\n", nBytes);
        free(h_A);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // Initialise the input array.
    initData(h_A, nx * ny);

    // Transpose on the host to obtain the reference result.
    transpostHost(hostRef, h_A, nx, ny);

    // Allocate device memory.
    float *d_A = NULL;
    float *d_C = NULL;
    checkCuda(cudaMalloc((float **)&d_A, nBytes), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((float **)&d_C, nBytes), "cudaMalloc(d_C)");

    // Copy the input from host to device.
    checkCuda(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice), "cudaMemcpy host->device");

    // Run the kernel on the GPU.
    warmup<<<grid, block>>>(d_C, d_A, nx, ny);
    // Launch-configuration errors are reported by cudaGetLastError();
    // errors raised while the kernel runs surface at the synchronisation.
    checkCuda(cudaGetLastError(), "warmup kernel launch");
    checkCuda(cudaDeviceSynchronize(), "warmup kernel execution");

    // Copy the result from d_C back into gpuRef.
    checkCuda(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost), "cudaMemcpy device->host");

    checkResult(hostRef, gpuRef, nx * ny, 1);

    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");
    free(h_A);
    free(hostRef);
    free(gpuRef);

    // Reset the device.
    checkCuda(cudaDeviceReset(), "cudaDeviceReset");
    return EXIT_SUCCESS;
}
编译部分
nvcc transpose.cu -o trans
执行部分
./trans
运行部分
注:作者运行时结果显示不匹配(not match),原因尚未查明;代码仅供示例。