Example
GPU grid-stride vector addition. In a grid-stride loop, each thread starts at its global index and advances by the total number of threads in the grid (blockDim.x * gridDim.x), so a fixed-size grid can cover an array of any length N.
vector-add.cu
#include <stdio.h>
#include <stdlib.h>  // for exit()

/*
 * Refactor host functions to run as CUDA kernels.
 */
__global__
void initWith(float num, float *a, int N)
{
  /*
   * Grid-stride loop: each thread starts at its global index and
   * advances by the total number of threads in the grid, so a
   * fixed-size grid can cover an array of any length N.
   */
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for(int i = index; i < N; i += stride)
  {
    a[i] = num;
  }
}

__global__
void addArraysInto(float *result, float *a, float *b, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for(int i = index; i < N; i += stride)
  {
    result[i] = a[i] + b[i];
  }
}

void checkElementsAre(float target, float *array, int N)
{
  for(int i = 0; i < N; i++)
  {
    if(array[i] != target)
    {
      printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
      exit(1);
    }
  }
  printf("Success! All values calculated correctly.\n");
}

int main()
{
  int deviceId;
  int numberOfSMs;

  cudaGetDevice(&deviceId);
  cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
  printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs);

  const int N = 2<<24;  // 2^25 elements, ~128 MiB per array
  size_t size = N * sizeof(float);

  float *a;
  float *b;
  float *c;

  cudaMallocManaged(&a, size);
  cudaMallocManaged(&b, size);
  cudaMallocManaged(&c, size);

  size_t threadsPerBlock;
  size_t numberOfBlocks;

  threadsPerBlock = 256;
  numberOfBlocks = 32 * numberOfSMs;  // several blocks per SM to keep the GPU busy

  cudaError_t addArraysErr;
  cudaError_t asyncErr;

  /*
   * Launch kernels.
   */
  initWith<<<numberOfBlocks, threadsPerBlock>>>(3, a, N);
  initWith<<<numberOfBlocks, threadsPerBlock>>>(4, b, N);
  initWith<<<numberOfBlocks, threadsPerBlock>>>(0, c, N);

  /*
   * Now that initialization is happening on a GPU, host code
   * must be synchronized to wait for its completion.
   */
  cudaDeviceSynchronize();

  addArraysInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

  addArraysErr = cudaGetLastError();
  if(addArraysErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addArraysErr));

  asyncErr = cudaDeviceSynchronize();
  if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));

  checkElementsAre(7, c, N);

  cudaFree(a);
  cudaFree(b);
  cudaFree(c);
}
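The launch-error checks in main can also be factored into a small helper. The sketch below is a common convenience pattern, not part of the original listing; the name checkCuda is our own choice, and it assumes it is pasted into the same file, which already includes <stdio.h>.

#include <assert.h>

/*
 * Hypothetical helper: wrap a CUDA runtime call, print the error
 * string on failure, and assert so the failure cannot pass silently.
 */
inline cudaError_t checkCuda(cudaError_t result)
{
  if (result != cudaSuccess)
  {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
  return result;
}

/*
 * Usage:
 *   checkCuda( cudaMallocManaged(&a, size) );
 *   checkCuda( cudaDeviceSynchronize() );
 */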
Compile and run (the -run flag tells nvcc to execute the binary immediately after building it):
nvcc -o single-thread-vector-add vector-add.cu -run
# nvcc -arch=sm_70 -O3 -Xcompiler="-march=native -fopenmp" vector-add.cu -o baseline
1. nsys
nsys (NVIDIA Nsight Systems) is the successor to the older nvprof profiler.
nsys command-line usage:
nsys profile --stats=true ./single-thread-vector-add
# nsys profile --stats=true --force-overwrite=true -o baseline-report ./single-thread-vector-add
nsys profile generates a qdrep report file, which can be used in several ways. Here the --stats=true flag asks nsys to print summary statistics to the console. The output includes, among other things:
- profiling configuration details
- report file generation details
- CUDA API statistics
- CUDA kernel statistics
- CUDA memory operation statistics (time and size)
- operating system runtime API call statistics
We will mainly use the three CUDA-related sections above. The generated report file can also be loaded into Nsight Systems for visual analysis.
By default, nsys profile does not overwrite an existing report file; this protects against accidentally losing work while profiling. If you would rather overwrite an existing report, for example during rapid iteration, pass the -f flag to nsys profile, as in the example below.
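For example, the following run would overwrite any earlier report of the same name (using the short -f form; the long spelling --force-overwrite=true appears in the commented command above, and the report name baseline-report is just an illustration):
nsys profile --stats=true -f true -o baseline-report ./single-thread-vector-add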
nsys output:
Warning: LBR backtrace method is not supported on this platform. DWARF backtrace method will be used.
Collecting data...
Device ID: 0 Number of SMs: 80
Success! All values calculated correctly.
Processing events...
Capturing symbol files...
Saving temporary "/tmp/nsys-report-591f-683b-4f10-63c8.qdstrm" file to disk...
Creating final output files...
Processing [==============================================================100%]
Saved report file to "/tmp/nsys-report-591f-683b-4f10-63c8.qdrep"
Exporting 1815 events: [==================================================100%]
Exported successfully to
/tmp/nsys-report-591f-683b-4f10-63c8.sqlite
Generating CUDA API Statistics...
CUDA API Statistics (nanoseconds)
Time(%) Total Time Calls Average Minimum Maximum Name
------- -------------- ---------- -------------- -------------- -------------- --------------------------------------------------------------------------------
82.0 230346229 3 76782076.3 27983 230270146 cudaMallocManaged
10.7 29967501 2 14983750.5 508540 29458961 cudaDeviceSynchronize
7.2 20316135 3 6772045.0 5613359 9006679 cudaFree
0.1 143177 4 35794.3 7600 76464 cudaLaunchKernel
Generating CUDA Kernel Statistics...
CUDA Kernel Statistics (nanoseconds)
Time(%) Total Time Instances Average Minimum Maximum Name
------- -------------- ---------- -------------- -------------- -------------- --------------------------------------------------------------------------------------------------------------------
98.3 29535995 3 9845331.7 9744129 9932220 initWith(float, float*, int)
1.7 506802 1 506802.0 506802 506802 addArraysInto(float*, float*, float*, int)
Generating CUDA Memory Operation Statistics...
CUDA Memory Operation Statistics (nanoseconds)
Time(%) Total Time Operations Average Minimum Maximum Name
------- -------------- ---------- -------------- -------------- -------------- --------------------------------------------------------------------------------
100.0 11401344 768 14845.5 1919 82688 [CUDA Unified Memory memcpy DtoH]
CUDA Memory Operation Statistics (KiB)
Total Operations Average Minimum Maximum Name
------------------- -------------- ------------------- ----------------- ------------------- --------------------------------------------------------------------------------
131072.000 768 170.667 4.000 1020.000 [CUDA Unified Memory memcpy DtoH]
Generating Operating System Runtime API Statistics...
Operating System Runtime API Statistics (nanoseconds)
Time(%) Total Time Calls Average Minimum Maximum Name
------- -------------- ---------- -------------- -------------- -------------- --------------------------------------------------------------------------------
48.0 582535369 32 18204230.3 44658 100178751 poll
43.5 527276305 31 17008913.1 25004 100129329 sem_timedwait
6.5 78624695 575 136738.6 1048 16511688 ioctl
1.9 22856000 87 262712.6 1702 8953591 mmap
0.1 749890 73 10272.5 3658 23110 open64
0.0 154491 4 38622.8 38189 38997 pthread_create
0.0 128939 23 5606.0 1642 15319 fopen
0.0 112944 10 11294.4 8110 15695 write
0.0 93073 3 31024.3 23805 42251 fgets
0.0 78912 69 1143.7 1006 4990 fcntl
0.0 52553 16 3284.6 1718 5436 munmap
0.0 44010 16 2750.6 1605 4228 fclose
0.0 35526 5 7105.2 3976 10642 open
0.0 26782 12 2231.8 1268 4230 read
0.0 14721 3 4907.0 4662 5059 pipe2
0.0 10488 3 3496.0 2115 4987 fread
0.0 9801 4 2450.2 2192 2745 mprotect
0.0 9542 2 4771.0 4501 5041 socket
0.0 6078 1 6078.0 6078 6078 connect
0.0 2548 1 2548.0 2548 2548 bind
0.0 1983 1 1983.0 1983 1983 listen
Generating NVTX Push-Pop Range Statistics...
NVTX Push-Pop Range Statistics (nanoseconds)
Report file moved to "report13.qdrep"
Report file moved to "report13.sqlite"
After profiling the application, use the information shown in the profiling output to answer the following questions:
What is the name of the only CUDA kernel called in this application?
Generating CUDA Kernel Statistics...
CUDA Kernel Statistics (nanoseconds)
Time(%) Total Time Instances Average Minimum Maximum Name
------- -------------- ---------- -------------- -------------- -------------- --------------------------------------------------------------------------------------------------------------------
100.0 2184487204 1 2184487204.0 2184487204 2184487204 addVectorsInto(float*, float*, float*, int)
How many times did this kernel run?
Once.
What was this kernel's runtime? Record this number: you will continue to optimize this application and compare later runtimes against it.
2184487204 ns ≈ 2.18 s
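For context: this figure comes from profiling the unoptimized baseline, whose only kernel (addVectorsInto) runs in a single thread. The exact baseline source is not reproduced in this note, so the sketch below is an assumption about its shape.

/*
 * Assumed baseline: one block, one thread. A single thread walks the
 * whole array sequentially, which is why one instance takes ~2.18 s.
 */
__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
  for(int i = 0; i < N; i++)
  {
    result[i] = a[i] + b[i];
  }
}

/* Launched as: addVectorsInto<<<1, 1>>>(c, a, b, N); */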