CUDA性能优化----kernel调优(nvprof工具的使用)
2017-01-13 11:41:28| 分类: HPC&CUDA优化 | 标签:hpc gpu cuda |举报 |字号 订阅
1、引言
输入数据矩阵的维度是nx=16384, ny=16384:
// Element-wise 2-D matrix addition: C = A + B, one thread per element.
// Expects a 2-D grid of 2-D blocks covering an nx-by-ny row-major matrix;
// the grid may overshoot the matrix edge, hence the bounds guard.
__global__ void sumMatrix2DKernel(float *d_MatA, float *d_MatB, float *d_MatC, int nx, int ny)
{
    // Global (x, y) coordinates of this thread within the whole grid.
    int ix = threadIdx.x + blockDim.x * blockIdx.x;
    int iy = threadIdx.y + blockDim.y * blockIdx.y;

    // Row-major linear offset; adjacent threads in x touch adjacent
    // addresses, so global loads/stores coalesce.
    int offset = nx * iy + ix;

    // Guard: threads past the matrix edge do nothing.
    if (ix < nx && iy < ny)
        d_MatC[offset] = d_MatA[offset] + d_MatB[offset];
}
// Problem size: 16384 x 16384 floats (2^14 each), ~1 GiB per matrix.
int nx = 1<<14; int ny = 1<<14;
// Block dimensions come from the command line; the grid uses ceiling
// division so the whole matrix is covered even when the block size does
// not divide nx/ny evenly.
// NOTE(review): dimx and dimy are assigned only when argc > 2; running
// without arguments leaves them uninitialized — confirm the full source
// sets defaults before this point.
if (argc > 2)
{ dimx = atoi(argv[1]); dimy = atoi(argv[2]); } dim3 block(dimx, dimy); dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
nvprof: NVIDIA (R) Cuda command line profiler Copyright (c) 2013 - 2014 NVIDIA Corporation Release version 6.0 (20)
2、Checking Active Warps with nvprof
$ ./sumMatrix2D 32 32
./sumMatrix2D Program Starting... --sumMatrix2DOnHost() elapsed 360.000000 ms.. --sumMatrix2DOnGPU<<<(512,512),(32,32)>>> elapsed 70.000000 ms..
$ ./sumMatrix2D 32 16 ./sumMatrix2D Program Starting... --sumMatrix2DOnHost() elapsed 360.000000 ms.. --sumMatrix2DOnGPU<<<(512,1024),(32,16)>>> elapsed 40.000000 ms..
$ ./sumMatrix2D 16 32 ./sumMatrix2D Program Starting... --sumMatrix2DOnHost() elapsed 360.000000 ms.. --sumMatrix2DOnGPU<<<(1024,512),(16,32)>>> elapsed 60.000000 ms..
比较这几个结果,不难发现,性能最差的是第一个(32,32),性能最好的是第二个(32,16),这里可以猜测到是:拥有更多的block数目并行性更好。这个猜测可以 使用nvprof 的achieved_occupancy这个metric参数来验证。该参数的定义公式在 CUDA性能优化----warp深度解析 有介绍,实际上就是指每个SM在每个cycle能够达到的最大active warp数目占总warp的比例。下面是使用该参数后得到的结果(注意由于输出项多,做了简化处理):$ ./sumMatrix2D 16 16 ./sumMatrix2D Program Starting... --sumMatrix2DOnHost() elapsed 360.000000 ms.. --sumMatrix2DOnGPU<<<(1024,1024),(16,16)>>> elapsed 50.000000 ms..
$ nvprof --metrics achieved_occupancy ./sumMatrix2D 32 32 ==27432== NVPROF is profiling process 27432, command: ./sumMatrix2D 32 32
--sumMatrix2DOnGPU<<<(512,512),(32,32)>>> achieved_occupancy 0.506396
$ nvprof --metrics achieved_occupancy ./sumMatrix2D 32 16
==27454== NVPROF is profiling process 27454, command: ./sumMatrix2D 32 16--sumMatrix2DOnGPU<<<(512,1024),(32,16)>>> achieved_occupancy 0.731333
$ nvprof --metrics achieved_occupancy ./sumMatrix2D 16 32
==27493== NVPROF is profiling process 27493, command: ./sumMatrix2D 16 32--sumMatrix2DOnGPU<<<(1024,512),(16,32)>>> achieved_occupancy 0.826147
$ nvprof --metrics achieved_occupancy ./sumMatrix2D 16 16
==27545== NVPROF is profiling process 27545, command: ./sumMatrix2D 16 16--sumMatrix2DOnGPU<<<(1024,1024),(16,16)>>> achieved_occupancy 0.819718
3、checking memory operations with nvprof
$ nvprof --metrics gld_throughput ./sumMatrix2D 32 32 --sumMatrix2DOnGPU<<<(512,512),(32,32)>>> elapsed 1090.000000 ms..
--Global Load Throughput:35.557GB/s
$ nvprof --metrics gld_throughput ./sumMatrix2D 32 16
--sumMatrix2DOnGPU<<<(512,1024),(32,16)>>> elapsed 1440.000000 ms..--Global Load Throughput:56.396GB/s
$ nvprof --metrics gld_throughput ./sumMatrix2D 16 32
--sumMatrix2DOnGPU<<<(1024,512),(16,32)>>> elapsed 1070.000000 ms..--Global Load Throughput:81.023GB/s
$ nvprof --metrics gld_throughput ./sumMatrix2D 16 16
--sumMatrix2DOnGPU<<<(1024,1024),(16,16)>>> elapsed 1060.000000 ms..--Global Load Throughput:93.694GB/s
$ nvprof --metrics gld_efficiency ./sumMatrix2D 32 32 --sumMatrix2DOnGPU<<<(512,512),(32,32)>>> elapsed 1610.000000 ms.. --Global Memory Load Efficiency:100.01%
$ nvprof --metrics gld_efficiency ./sumMatrix2D 32 16
--sumMatrix2DOnGPU<<<(512,1024),(32,16)>>> elapsed 1610.000000 ms..
--Global Memory Load Efficiency:99.95%$ nvprof --metrics gld_efficiency ./sumMatrix2D 16 32
--sumMatrix2DOnGPU<<<(1024,512),(16,32)>>> elapsed 1610.000000 ms..
--Global Memory Load Efficiency:49.89%$ nvprof --metrics gld_efficiency ./sumMatrix2D 16 16
--sumMatrix2DOnGPU<<<(1024,1024),(16,16)>>> elapsed 1610.000000 ms..
--Global Memory Load Efficiency:49.99%
4、Exposing More Parallelism
从上面测试数据可以得到一些认识;为了进一步暴露更多的并行性,下面尝试更多的block配置:$ ./sumMatrix2D 64 2 --sumMatrix2DOnGPU<<<(256,8192),(64,2)>>> elapsed 33.527294 ms..
$ ./sumMatrix2D 64 4 --sumMatrix2DOnGPU<<<(256,4096),(64,4)>>> elapsed 34.802238 ms..
$ ./sumMatrix2D 64 8 --sumMatrix2DOnGPU<<<(256,2048),(64,8)>>> elapsed 36.614143 ms..
$ ./sumMatrix2D 128 2 --sumMatrix2DOnGPU<<<(128,8192),(128,2)>>> elapsed 32.602848 ms..
$ ./sumMatrix2D 128 4 --sumMatrix2DOnGPU<<<(128,4096),(128,4)>>> elapsed 34.658592 ms..
$ ./sumMatrix2D 128 8 --sumMatrix2DOnGPU<<<(128,2048),(128,8)>>> elapsed 46.740578 ms..
$ ./sumMatrix2D 256 2 --sumMatrix2DOnGPU<<<(64,8192),(256,2)>>> elapsed 32.661919 ms..
$ ./sumMatrix2D 256 4 --sumMatrix2DOnGPU<<<(64,4096),(256,4)>>> elapsed 38.260609 ms..
$ ./sumMatrix2D 256 8 --sumMatrix2DOnGPU<<<(64,2048),(256,8)>>> elapsed 0.013440 ms.. Result verification failed at elemnt 0
$ nvprof --metrics achieved_occupancy ./sumMatrix2D 64 2 --sumMatrix2DOnGPU<<<(256,8192),(64,2)>>> elapsed 37.495487 ms.. --Achieved Occupancy: 0.555373
$ nvprof --metrics achieved_occupancy ./sumMatrix2D 64 4 --sumMatrix2DOnGPU<<<(256,4096),(64,4)>>> elapsed 38.886177 ms.. --Achieved Occupancy: 0.795769$ nvprof --metrics achieved_occupancy ./sumMatrix2D 64 8 --sumMatrix2DOnGPU<<<(256,2048),(64,8)>>> elapsed 40.603359 ms.. --Achieved Occupancy: 0.757109$ nvprof --metrics achieved_occupancy ./sumMatrix2D 128 2 --sumMatrix2DOnGPU<<<(128,8192),(128,2)>>> elapsed 36.666466 ms.. --Achieved Occupancy: 0.803921$ nvprof --metrics achieved_occupancy ./sumMatrix2D 128 4 --sumMatrix2DOnGPU<<<(128,4096),(128,4)>>> elapsed 38.689377 ms.. --Achieved Occupancy: 0.746745$ nvprof --metrics achieved_occupancy ./sumMatrix2D 128 8 --sumMatrix2DOnGPU<<<(128,2048),(128,8)>>> elapsed 50.706112 ms.. --Achieved Occupancy: 0.561505$ nvprof --metrics achieved_occupancy ./sumMatrix2D 256 2 --sumMatrix2DOnGPU<<<(64,8192),(256,2)>>> elapsed 36.828159 ms.. --Achieved Occupancy: 0.762112$ nvprof --metrics achieved_occupancy ./sumMatrix2D 256 4 --sumMatrix2DOnGPU<<<(64,4096),(256,4)>>> elapsed 42.040642 ms.. --Achieved Occupancy: 0.589849$ nvprof --metrics achieved_occupancy ./sumMatrix2D 256 8 --sumMatrix2DOnGPU<<<(64,2048),(256,8)>>> elapsed 0.015296 ms.. Result verification failed at elemnt 0 No events/metrics were profiled. ======== Error: Application returned non-zero code 1
这次测试有了更高的性能提升,并且(256,1)配置比(128,1)配置更好,再次查询(256,1)block配置的achieved Occupancy,load throughput和load efficiency等参数:$ ./sumMatrix2D 128 1 --sumMatrix2DOnGPU<<<(128,16384),(128,1)>>> elapsed 32.535934 ms.. $ ./sumMatrix2D 256 1 --sumMatrix2DOnGPU<<<(64,16384),(256,1)>>> elapsed 30.843328 ms..
现在可以看出,最佳的block配置既不是拥有最高achieved Occupancy也不是最高load throughput的。所以不存在唯一metric参数来优化计算性能,我们需要从众多metric中寻求一个平衡。$ nvprof --metrics achieved_occupancy ./sumMatrix2D 256 1 --sumMatrix2DOnGPU<<<(64,16384),(256,1)>>> Achieved Occupancy: 0.807456
$ nvprof --metrics gld_throughput ./sumMatrix2D 256 1 --sumMatrix2DOnGPU<<<(64,16384),(256,1)>>> Global Load Throughput: 69.512GB/s
$ nvprof --metrics gld_efficiency ./sumMatrix2D 256 1 --sumMatrix2DOnGPU<<<(64,16384),(256,1)>>> Global Memory Load Efficiency:100.21%
5、总结
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>   // atoi, malloc, free, rand, exit
#include <string.h>   // memset
#include <math.h>     // fabs
#include <time.h>     // clock, CLOCKS_PER_SEC

// Tolerance for host-vs-device float comparison (bit-exact equality is
// not expected across CPU and GPU).
#define PRECISION 1e-5
// HandleError() is defined elsewhere in the full source; it prints the
// cudaError_t message with file/line and aborts on failure.
#define HANDLE_ERROR(err) (HandleError( err, __FILE__, __LINE__ ))

// Benchmark driver: sums two 16384x16384 float matrices on host and
// device, times both, and verifies the device result element-wise.
// Usage: ./sumMatrix2D [dimx dimy]  (block dimensions; default 32x32)
int main(int argc, char **argv)
{
    // set up device
    int devID = 0;
    cudaDeviceProp deviceProp;
    HANDLE_ERROR(cudaGetDeviceProperties(&deviceProp, devID));
    HANDLE_ERROR(cudaSetDevice(devID));

    // set up data size of matrix: 16384 x 16384 (2^14 each)
    int nx = 1<<14;
    int ny = 1<<14;
    int nxy = nx*ny;
    // size_t avoids any risk of overflow in the byte count (2^28 * 4 = 1 GiB)
    size_t nBytes = (size_t)nxy * sizeof(float);

    // malloc host memory
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A     = (float *)malloc(nBytes);
    h_B     = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef  = (float *)malloc(nBytes);
    if (!h_A || !h_B || !hostRef || !gpuRef)
    {
        fprintf(stderr, "host malloc failed\n");
        exit(EXIT_FAILURE);
    }

    // initialize data at host side
    for (int i = 0; i < nxy; i++)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add matrix at host side for result checks
    float iElaps;
    clock_t iStart, iEnd;
    iStart = clock();
    sumMatrix2DOnHost(h_A, h_B, hostRef, nx, ny);
    iEnd = clock();
    // Portable tick->ms conversion; the former "/1000" silently assumed
    // CLOCKS_PER_SEC == 1000000.
    iElaps = 1000.0 * (double)(iEnd - iStart) / CLOCKS_PER_SEC;
    printf("--sumMatrix2DOnHost() elapsed %f ms..\n", iElaps);

    // malloc device global memory (checked: a failed cudaMalloc would
    // otherwise surface later as a confusing kernel/copy error)
    float *d_MatA, *d_MatB, *d_MatC;
    HANDLE_ERROR(cudaMalloc((void **)&d_MatA, nBytes));
    HANDLE_ERROR(cudaMalloc((void **)&d_MatB, nBytes));
    HANDLE_ERROR(cudaMalloc((void **)&d_MatC, nBytes));

    // transfer data from host to device
    HANDLE_ERROR(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host side; block dims default to 32x32 and may be
    // overridden on the command line. Defaults fix the previous
    // undefined behavior: dimx/dimy were read uninitialized when the
    // program ran without arguments.
    int dimx = 32;
    int dimy = 32;
    if (argc > 2) // block dimensions from the command line
    {
        dimx = atoi(argv[1]);
        dimy = atoi(argv[2]);
    }
    dim3 block(dimx, dimy);
    // ceiling division so the grid covers the whole matrix
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    // calculate run time on GPU with CUDA events
    float elapsedTime;
    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));
    sumMatrix2DKernel <<< grid, block >>>(d_MatA, d_MatB, d_MatC, nx, ny);
    // A kernel launch returns no status; an invalid configuration (e.g.
    // 256x8 = 2048 threads/block, over the 1024 limit) is only visible
    // through cudaGetLastError(). Without this check such runs silently
    // skip the kernel and "fail verification at element 0".
    HANDLE_ERROR(cudaGetLastError());
    HANDLE_ERROR(cudaDeviceSynchronize());
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("--sumMatrix2DOnGPU<<<(%d,%d),(%d,%d)>>> elapsed %f ms..\n",
           grid.x, grid.y, block.x, block.y, elapsedTime);
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    // copy kernel result back to host side
    HANDLE_ERROR(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    // check device results against host reference with a tolerance
    for (int i = 0; i < nxy; i++)
    {
        if (fabs(gpuRef[i] - hostRef[i]) > PRECISION)
        {
            fprintf(stderr, "Result verification failed at element %d\n", i);
            exit(EXIT_FAILURE);
        }
    }

    // free device global memory
    cudaFree(d_MatA);
    cudaFree(d_MatB);
    cudaFree(d_MatC);

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset device
    cudaDeviceReset();
    return 0;
}