![CUDA性能优化----线程配置 - 樂不思蜀 - 樂不思蜀](http://img2.ph.126.net/3iN5hxKhiCloH2fC5kA4dQ==/6632080415909254400.png)
- 线程和block索引
- 矩阵中元素坐标
- 线性global memory 的偏移
![CUDA性能优化----线程配置 - 樂不思蜀 - 樂不思蜀](http://img0.ph.126.net/4P3-SnIlQVCrCByZLtzTTA==/6632198063653420458.png)
编译运行:
// Threads assign test — 2D matrix addition timed on host and device.
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>   // malloc/free, rand, exit (was missing in original)
#include <string.h>   // memset (was missing in original)
#include <math.h>     // fabs
#include <time.h>     // clock

// Absolute tolerance for comparing host vs. device float results.
#define PRECISION 1e-5
// Wrap every CUDA runtime call so failures report file/line and abort.
#define HANDLE_ERROR(err) (HandleError( err, __FILE__, __LINE__ ))

static void HandleError(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

// Element-wise sum of two nx*ny matrices stored row-major in global memory.
// Expects a 2D grid of 2D blocks covering at least nx x ny threads; the
// bounds check guards the partial blocks at the grid tail.
__global__ void sumMatrix2DKernel(float *d_MatA, float *d_MatB, float *d_MatC,
                                  int nx, int ny)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;  // matrix column
    int idy = threadIdx.y + blockDim.y * blockIdx.y;  // matrix row
    int tid = nx * idy + idx;                         // linear global-memory offset
    if (idx < nx && idy < ny)
        d_MatC[tid] = d_MatA[tid] + d_MatB[tid];
}

// CPU reference implementation used to validate the GPU result.
void sumMatrix2DOnHost(float *h_A, float *h_B, float *hostRef, int nx, int ny)
{
    for (int i = 0; i < nx * ny; i++)
        hostRef[i] = h_A[i] + h_B[i];
}

int main(int argc, char **argv)
{
    printf("%s Program Starting...\n", argv[0]);

    // set up device
    int devID = 0;
    cudaDeviceProp deviceProp;
    HANDLE_ERROR(cudaGetDeviceProperties(&deviceProp, devID));
    printf("Using Device %d: %s\n", devID, deviceProp.name);
    HANDLE_ERROR(cudaSetDevice(devID));

    // set up data size of matrix: 16384 x 16384 floats (1 GiB per matrix)
    int nx = 1 << 14;
    int ny = 1 << 14;
    int nxy = nx * ny;
    // size_t (not int): 2^28 * sizeof(float) is already at the edge of int range.
    size_t nBytes = (size_t)nxy * sizeof(float);
    printf("Matrix size: nx= %d, ny= %d\n", nx, ny);

    // malloc host memory (checked: 4 GiB total can legitimately fail)
    float *h_A     = (float *)malloc(nBytes);
    float *h_B     = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef  = (float *)malloc(nBytes);
    if (!h_A || !h_B || !hostRef || !gpuRef) {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }

    // initialize data at host side
    clock_t iStart, iEnd;
    iStart = clock();
    for (int i = 0; i < nxy; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }
    iEnd = clock();
    double iElaps = (double)(iEnd - iStart) / CLOCKS_PER_SEC;
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add matrix at host side for result checks
    iStart = clock();
    sumMatrix2DOnHost(h_A, h_B, hostRef, nx, ny);
    iEnd = clock();
    iElaps = (double)(iEnd - iStart) / CLOCKS_PER_SEC;
    printf("--sumMatrix2DOnHost() elapsed %f sec..\n", iElaps);

    // malloc device global memory (error-checked — was unchecked in original)
    float *d_MatA, *d_MatB, *d_MatC;
    HANDLE_ERROR(cudaMalloc((void **)&d_MatA, nBytes));
    HANDLE_ERROR(cudaMalloc((void **)&d_MatB, nBytes));
    HANDLE_ERROR(cudaMalloc((void **)&d_MatC, nBytes));

    // transfer data from host to device
    HANDLE_ERROR(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host side; ceil-division grid covers the tail of the matrix
    int dimx = 32;
    int dimy = 32;
    //int dimy = 16;
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
    iStart = clock();
    sumMatrix2DKernel<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    HANDLE_ERROR(cudaGetLastError());       // catch launch-configuration errors
    HANDLE_ERROR(cudaDeviceSynchronize());  // block so timing covers kernel execution
    iEnd = clock();
    iElaps = (double)(iEnd - iStart) / CLOCKS_PER_SEC;
    printf("--sumMatrix2DOnGPU<<<(%d,%d),(%d,%d)>>> elapsed %f sec..\n",
           grid.x, grid.y, block.x, block.y, iElaps);

    // copy kernel result back to host side (blocking copy = implicit sync point)
    HANDLE_ERROR(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    // check device results against the CPU reference within PRECISION
    for (int i = 0; i < nxy; i++) {
        if (fabs(gpuRef[i] - hostRef[i]) > PRECISION) {
            fprintf(stderr, "Result verification failed at element %d\n", i);
            exit(EXIT_FAILURE);
        }
    }

    // free device global memory
    HANDLE_ERROR(cudaFree(d_MatA));
    HANDLE_ERROR(cudaFree(d_MatB));
    HANDLE_ERROR(cudaFree(d_MatC));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset device
    HANDLE_ERROR(cudaDeviceReset());
    printf("Test Passed..\n");
    return 0;
}
$ nvcc -arch=sm_20 sumMatrix2D.cu -o sumMatrix2D $ ./sumMatrix2D
现在我们将block的大小改成(32, 16),此时block数量为512*1024,再次编译运行,会发现:./sumMatrix2D Program Starting... Using Device 0: Tesla M2070 Matrix size: nx= 16384, ny= 16384 --sumMatrix2DOnHost() elapsed 1.410000 sec.. --sumMatrix2DOnGPU<<<(512,512),(32,32)>>> elapsed 0.070000 sec.. Test Passed..
可以看到,程序性能提升了将近1倍,直观来看是第二次线程配置比第一次配置block的数量 增加了1倍,实际上也正是由于block数量增加了的缘故。但是如果继续增加block的数量,性能反而又会下降。./sumMatrix2D Program Starting... Using Device 0: Tesla M2070 Matrix size: nx= 16384, ny= 16384 --sumMatrix2DOnHost() elapsed 1.410000 sec.. --sumMatrix2DOnGPU<<<(512,1024),(32,16)>>> elapsed 0.040000 sec.. Test Passed..
关于线程块配置的性能分析参考后续章节。./sumMatrix2D Program Starting... Using Device 0: Tesla M2070 Matrix size: nx= 16384, ny= 16384 --sumMatrix2DOnHost() elapsed 1.400000 sec.. --sumMatrix2DOnGPU<<<(1024,1024),(16,16)>>> elapsed 0.050000 sec.. Test Passed..