3.3 并行性的表现
为更好地理解线程束执行的本质,将使用不同的执行配置分析下述的sumMatrixOn-GPU2D
核函数。
使用nvprof配置指标,可以有助于理解为什么有些网格/块的维数组合比其他的组合更好。这些练习会提供网格和块的启发式算法,这是CUDA编程人员必备的技能。
#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>
/*
* This example implements matrix element-wise addition on the host and GPU.
* sumMatrixOnHost iterates over the rows and columns of each matrix, adding
* elements from A and B together and storing the results in C. The current
* offset in each matrix is stored using pointer arithmetic. sumMatrixOnGPU2D
* implements the same logic, but using CUDA threads to process each matrix.
*/
void initialData(float *ip, const int size)
{
int i;
for(i = 0; i < size; i++)
{
ip[i] = (float)( rand() & 0xFF ) / 10.0f;
}
}
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
float *ia = A;
float *ib = B;
float *ic = C;
for (int iy = 0; iy < ny; iy++)
{
for (int ix = 0; ix < nx; ix++)
{
ic[ix] = ia[ix] + ib[ix];
}
ia += nx;
ib += nx;
ic += nx;
}
return;
}
void checkResult(float *hostRef, float *gpuRef, const int N)
{
double epsilon = 1.0E-8;
for (int i = 0; i < N; i++)
{
if (abs(hostRef[i] - gpuRef[i]) > epsilon)
{
printf("host %f gpu %f ", hostRef[i], gpuRef[i]);
printf("Arrays do not match.\n\n");
break;
}
}
}
// grid 2D block 2D
__global__ void sumMatrixOnGPU2D(float *A, float *B, float *C, int NX, int NY)
{
unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int idx = iy * NX + ix;
if (ix < NX && iy < NY)
{
C[idx] = A[idx] + B[idx];
}
}
int main(int argc, char **argv)
{
// set up device
int dev = 0;
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
CHECK(cudaSetDevice(dev));
// set up data size of matrix
int nx = 1 << 13;
int ny = 1 << 13;
int nxy = nx * ny;
int nBytes = nxy * sizeof(float);
// malloc host memory
float *h_A, *h_B, *hostRef, *gpuRef;
h_A = (float *)malloc(nBytes);
h_B = (float *)malloc(nBytes);
hostRef = (float *)malloc(nBytes);
gpuRef = (float *)malloc(nBytes);
// initialize data at host side
double iStart = seconds();
initialData(h_A, nxy);
initialData(h_B, nxy);
double iElaps = seconds() - iStart;
memset(hostRef, 0, nBytes);
memset(gpuRef, 0, nBytes);
// add matrix at host side for result checks
iStart = seconds();
sumMatrixOnHost (h_A, h_B, hostRef, nx, ny);
iElaps = seconds() - iStart;
// malloc device global memory
float *d_MatA, *d_MatB, *d_MatC;
CHECK(cudaMalloc((void **)&d_MatA, nBytes));
CHECK(cudaMalloc((void **)&d_MatB, nBytes));
CHECK(cudaMalloc((void **)&d_MatC, nBytes));
// transfer data from host to device
CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));
// invoke kernel at host side
int dimx = 32;
int dimy = 32;
if(argc > 2)
{
dimx = atoi(argv[1]);
dimy = atoi(argv[2]);
}
dim3 block(dimx, dimy);
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
// execute the kernel
CHECK(cudaDeviceSynchronize());
iStart = seconds();
sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
CHECK(cudaDeviceSynchronize());
iElaps = seconds() - iStart;
printf("sumMatrixOnGPU2D <<<(%d,%d), (%d,%d)>>> elapsed %f ms\n", grid.x,
grid.y,
block.x, block.y, iElaps);
CHECK(cudaGetLastError());
// copy kernel result back to host side
CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));
// check device results
checkResult(hostRef, gpuRef, nxy);
// free device global memory
CHECK(cudaFree(d_MatA));
CHECK(cudaFree(d_MatB));
CHECK(cudaFree(d_MatC));
// free host memory
free(h_A);
free(h_B);
free(hostRef);
free(gpuRef);
// reset device
CHECK(cudaDeviceReset());
return EXIT_SUCCESS;
}
3.3.1 用nvprof检测活跃的线程束
一个内核的可实现占用率被定义为:每周期内活跃线程束的平均数量与一个SM支持的线程束最大数量的比值。
$ sudo nvprof --metrics achieved_occupancy ./main 32 32
==15352== NVPROF is profiling process 15352, command: ./main 32 32
sumMatrixOnGPU2D <<<(256,256), (32,32)>>> elapsed 0.017286 ms
==15352== Profiling application: ./main 32 32
==15352== Profiling result:
==15352== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 1050 (0)"
Kernel: sumMatrixOnGPU2D(float*, float*, float*, int, int)
1 achieved_occupancy Achieved Occupancy 0.800294 0.800294 0.800294
$ sudo nvprof --metrics achieved_occupancy ./main 32 16
==15366== NVPROF is profiling process 15366, command: ./main 32 16
sumMatrixOnGPU2D <<<(256,512), (32,16)>>> elapsed 0.016680 ms
==15366== Profiling application: ./main 32 16
==15366== Profiling result:
==15366== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 1050 (0)"
Kernel: sumMatrixOnGPU2D(float*, float*, float*, int, int)
1 achieved_occupancy Achieved Occupancy 0.823351 0.823351 0.823351
$ sudo nvprof --metrics achieved_occupancy ./main 16 32
==15380== NVPROF is profiling process 15380, command: ./main 16 32
sumMatrixOnGPU2D <<<(512,256), (16,32)>>> elapsed 0.016678 ms
==15380== Profiling application: ./main 16 32
==15380== Profiling result:
==15380== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 1050 (0)"
Kernel: sumMatrixOnGPU2D(float*, float*, float*, int, int)
1 achieved_occupancy Achieved Occupancy 0.839795 0.839795 0.839795
$ sudo nvprof --metrics achieved_occupancy ./main 16 16
==15394== NVPROF is profiling process 15394, command: ./main 16 16
sumMatrixOnGPU2D <<<(512,512), (16,16)>>> elapsed 0.017072 ms
==15394== Profiling application: ./main 16 16
==15394== Profiling result:
==15394== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 1050 (0)"
Kernel: sumMatrixOnGPU2D(float*, float*, float*, int, int)
1 achieved_occupancy Achieved Occupancy 0.862736 0.862736 0.862736
-
因为第二种情况中的块数比第一种情况的多,所以设备就可以有更多活跃的线程束。其原因可能是第二种情况与第一种情况相比有更高的可实现占用率和更好的性能。
-
第四种情况有最高的可实现占用率,但它不是最快的,因此,更高的占用率并不一定意味着有更高的性能。肯定有其他因素限制GPU的性能。
3.3.2 用nvprof检测内存操作
在sumMatrix内核(C[idx]=A[idx]+B[idx]
)中有3个内存操作:两个内存加载和一个内存存储。可以使用nvprof检测这些内存操作的效率。
内存读取效率
sudo nvprof --metrics gld_throughput ./main 32 32
首先,用gld_throughput指标检查内核的内存读取效率,从而得到每个执行配置的差异:
liwete@liwete-OMEN-by-HP-Laptop:~/CLionProjects/sumMatrix$ sudo nvprof --metrics gld_throughput ./main 32 32
==15453== NVPROF is profiling process 15453, command: ./main 32 32
sumMatrixOnGPU2D <<<(256,256), (32,32)>>> elapsed 0.154867 ms
==15453== Profiling application: ./main 32 32
==15453== Profiling result:
==15453== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 1050 (0)"
Kernel: sumMatrixOnGPU2D(float*, float*, float*, int, int)
1 gld_throughput Global Load Throughput 3.6326GB/s 3.6326GB/s 3.6326GB/s
liwete@liwete-OMEN-by-HP-Laptop:~/CLionProjects/sumMatrix$ sudo nvprof --metrics gld_throughput ./main 32 16
==15468== NVPROF is profiling process 15468, command: ./main 32 16
sumMatrixOnGPU2D <<<(256,512), (32,16)>>> elapsed 0.129354 ms
==15468== Profiling application: ./main 32 16
==15468== Profiling result:
==15468== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 1050 (0)"
Kernel: sumMatrixOnGPU2D(float*, float*, float*, int, int)
1 gld_throughput Global Load Throughput 4.4766GB/s 4.4766GB/s 4.4766GB/s
liwete@liwete-OMEN-by-HP-Laptop:~/CLionProjects/sumMatrix$ sudo nvprof --metrics gld_throughput ./main 16 32
==15482== NVPROF is profiling process 15482, command: ./main 16 32
sumMatrixOnGPU2D <<<(512,256), (16,32)>>> elapsed 0.125502 ms
==15482== Profiling application: ./main 16 32
==15482== Profiling result:
==15482== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 1050 (0)"
Kernel: sumMatrixOnGPU2D(float*, float*, float*, int, int)
1 gld_throughput Global Load Throughput 4.4943GB/s 4.4943GB/s 4.4943GB/s
liwete@liwete-OMEN-by-HP-Laptop:~/CLionProjects/sumMatrix$ sudo nvprof --metrics gld_throughput ./main 16 16
==15496== NVPROF is profiling process 15496, command: ./main 16 16
sumMatrixOnGPU2D <<<(512,512), (16,16)>>> elapsed 0.126294 ms
==15496== Profiling application: ./main 16 16
==15496== Profiling result:
==15496== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 1050 (0)"
Kernel: sumMatrixOnGPU2D(float*, float*, float*, int, int)
1 gld_throughput Global Load Throughput 4.4697GB/s 4.4697GB/s 4.4697GB/s
- 第四种情况中的加载吞吐量最高,但第四种情况却比第二种情况慢。所以,更高的加载吞吐量并不一定意味着更高的性能。第4章介绍内存事务在GPU设备上的工作原理时将会具体分析产生这种现象的原因。
全局加载效率
sudo nvprof --metrics gld_efficiency ./main 16 16
接下来,用gld_efficiency
指标检测全局加载效率,即被请求的全局加载吞吐量占所需的全局加载吞吐量的比值。它衡量了应用程序的加载操作利用设备内存带宽的程度。
liwete@liwete-OMEN-by-HP-Laptop:~/CLionProjects/sumMatrix$ sudo nvprof --metrics gld_efficiency ./main 16 16
==15700== NVPROF is profiling process 15700, command: ./main 16 16
==15700== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
Replaying kernel "sumMatrixOnGPU2D(float*, float*, float*, int, int)" (done)
sumMatrixOnGPU2D <<<(512,512), (16,16)>>> elapsed 0.436797 ms
==15700== Profiling application: ./main 16 16
==15700== Profiling result:
==15700== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 1050 (0)"
Kernel: sumMatrixOnGPU2D(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
liwete@liwete-OMEN-by-HP-Laptop:~/CLionProjects/sumMatrix$ sudo nvprof --metrics gld_efficiency ./main 16 8
==15714== NVPROF is profiling process 15714, command: ./main 16 8
==15714== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
Replaying kernel "sumMatrixOnGPU2D(float*, float*, float*, int, int)" (2 of 2)...
Replaying kernel "sumMatrixOnGPU2D(float*, float*, float*, int, int)" (done)
==15714== Profiling application: ./main 16 86,8)>>> elapsed 0.434808 ms
==15714== Profiling result:
==15714== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 1050 (0)"
Kernel: sumMatrixOnGPU2D(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
liwete@liwete-OMEN-by-HP-Laptop:~/CLionProjects/sumMatrix$ sudo nvprof --metrics gld_efficiency ./main 8 8
==15728== NVPROF is profiling process 15728, command: ./main 8 8
==15728== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
Replaying kernel "sumMatrixOnGPU2D(float*, float*, float*, int, int)" (2 of 2)...
Replaying kernel "sumMatrixOnGPU2D(float*, float*, float*, int, int)" (done)
==15728== Profiling application: ./main 8 8lapsed 0.440531 ms
==15728== Profiling result:
==15728== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 1050 (0)"
Kernel: sumMatrixOnGPU2D(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
liwete@liwete-OMEN-by-HP-Laptop:~/CLionProjects/sumMatrix$ sudo nvprof --metrics gld_efficiency ./main 4 4
==15743== NVPROF is profiling process 15743, command: ./main 4 4
==15743== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
Replaying kernel "sumMatrixOnGPU2D(float*, float*, float*, int, int)" (2 of 2)...
Replaying kernel "sumMatrixOnGPU2D(float*, float*, float*, int, int)" (done)
==15743== Profiling application: ./main 4 4lapsed 0.747474 ms
==15743== Profiling result:
==15743== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 1050 (0)"
Kernel: sumMatrixOnGPU2D(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 50.00% 50.00% 50.00%
最后一种情况下的加载效率是最前面两种情况的一半。这可以解释为什么最后一种情况下更高的加载吞吐量和可实现占用率没有产生较好的性能。
尽管在最后一种情况下正在执行的加载数量(即吞吐量)很多,但是那些加载的有效性(即效率)是较低的。
注意,最后情况的共同特征是它们在最内层维数中块的大小小于线程束。如前所述,对网格和块启发式算法来说,最内层的维数应该总是线程束大小的倍数。第4章将讨论半个线程束大小的线程块是如何影响性能的。
3.3.3 增大并行性
从前一节可以总结出,一个块的最内层维数(block.x)应该是线程束大小的倍数。 这样能极大地提高了加载效率。你可能对以下问题仍然很好奇:
- 调整block.x会进一步增加加载吞吐量吗
- 有其他方法可以增大并行性吗
现在已经建立了一个性能基准,可以通过测试sumMatrix使用更大范围的线程配置来回答这些问题:
-
线程块最内层维度的大小对性能起着的关键的作用。
-
在所有其他情况下,线程块的数量都比最好的情况少。因此,增大并行性仍然是性能优化的一个重要因素。
-
最好的执行配置既不具有最高的可实现占用率,也不具有最高的加载吞吐量。从这些实验中可以推断出,没有一个单独的指标能直接优化性能。我们需要在几个相关的指标间寻找一个恰当的平衡来达到最佳的总体性能。
指标与性能
-
在大部分情况下,一个单独的指标不能产生最佳的性能
-
与总体性能最直接相关的指标或事件取决于内核代码的本质
-
在相关的指标与事件之间寻求一个好的平衡
-
从不同角度查看内核以寻找相关指标间的平衡
-
网格/块启发式算法为性能调节提供了一个很好的起点