一、基本函数调用
cudaMalloc
在device上分配内存
cudaError_t cudaMalloc(void** devPtr, size_t size);
cudaMemcpy
负责host和device之间的数据通信
cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)
cudaMallocManaged
cudaMalloc需要单独在host和device上进行内存分配,并且需要使用cudaMemcpy进行数据拷贝,很容易出错
在CUDA6.0后引入统一内存(unified memory)来避免这种麻烦
使用一个托管内存来共同管理host和device中的内存,并且自动在host和device中进行传输
cudaError_t cudaMallocManaged(void **devPtr, size_t size, unsigned int flags = cudaMemAttachGlobal);
cudaDeviceSynchronize
配合cudaMallocManaged使用,在执行核函数后调用该函数完成device数据同步,保证结果在host能够正常访问
二、基础代码
2.1 常规内存使用方案时的向量加法
#include<stdio.h>
#include <sys/time.h>
// Tesla V100: up to 1024 threads per block, so use the maximum here.
#define BLOCK_SIZE 1024
// Globals for wall-clock timing via gettimeofday(); startTime is set in
// main() and endTime is refreshed by report_running_time().
struct timezone Idunno;
struct timeval startTime, endTime;
// Element-wise vector addition: z[i] = x[i] + y[i] for i in [0, n).
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads, so any grid size covers all n items.
__global__
void add(float* x, float* y, float* z, int n){
    int tid  = blockIdx.x * blockDim.x + threadIdx.x;
    int step = gridDim.x * blockDim.x;
    for (int i = tid; i < n; i += step)
        z[i] = x[i] + y[i];
}
// Print and return the wall-clock time elapsed since the global startTime.
// Side effect: refreshes the global endTime via gettimeofday().
// Returns the elapsed interval in seconds as a double.
double report_running_time() {
    gettimeofday(&endTime, &Idunno);
    long sec_diff  = endTime.tv_sec  - startTime.tv_sec;
    long usec_diff = endTime.tv_usec - startTime.tv_usec;
    // Borrow one second when the microsecond part underflows.
    if (usec_diff < 0) {
        sec_diff -= 1;
        usec_diff += 1000000;
    }
    printf("Running time for GPU version: %ld.%06ld\n", sec_diff, usec_diff);
    return sec_diff + usec_diff / 1000000.0;
}
// Vector addition with explicit device memory and cudaMemcpy transfers.
// Times H2D copies + kernel + D2H copy together; the blocking D2H cudaMemcpy
// also synchronizes with the kernel, so the timing is valid.
int main(){
    // Problem size: 2^28 floats, i.e. 1 GiB per array.
    int N = 1 << 28;
    // Byte count in size_t: N * sizeof(float) is 2^30 — too close to INT_MAX
    // to keep in an int safely.
    size_t nBytes = (size_t)N * sizeof(float);
    // Allocate and initialize host buffers; check for allocation failure,
    // since 3 GiB of host memory may legitimately be unavailable.
    float *x = (float*)malloc(nBytes);
    float *y = (float*)malloc(nBytes);
    float *z = (float*)malloc(nBytes);
    if (x == NULL || y == NULL || z == NULL) {
        fprintf(stderr, "host malloc failed\n");
        return 1;
    }
    for (int i = 0; i < N; ++i) {
        x[i] = 66.66f;   // float literals: avoid double->float conversion
        y[i] = 77.77f;
    }
    // Allocate device buffers, checking every CUDA API return code.
    float *dev_x, *dev_y, *dev_z;
    cudaError_t err;
    err = cudaMalloc((void**)&dev_x, nBytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&dev_y, nBytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&dev_z, nBytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Launch configuration: enough blocks to cover N once
    // (the kernel's grid-stride loop tolerates any grid size).
    dim3 blockSize(BLOCK_SIZE);
    dim3 gridSize((N + BLOCK_SIZE - 1) / BLOCK_SIZE);
    gettimeofday(&startTime, &Idunno);
    cudaMemcpy((void*)dev_x, (void*)x, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy((void*)dev_y, (void*)y, nBytes, cudaMemcpyHostToDevice);
    // Launch the kernel; a launch returns no status, so query it explicitly.
    add<<<gridSize, blockSize>>>(dev_x, dev_y, dev_z, N);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Blocking copy of the result back to the host (also syncs the kernel).
    cudaMemcpy((void*)z, (void*)dev_z, nBytes, cudaMemcpyDeviceToHost);
    report_running_time();
    // Release device memory, then host memory.
    cudaFree(dev_x);
    cudaFree(dev_y);
    cudaFree(dev_z);
    free(x);
    free(y);
    free(z);
    return 0;
}
2.2 智能内存使用方案的向量相加
#include<stdio.h>
#include <sys/time.h>
// Tesla V100: up to 1024 threads per block, so use the maximum here.
#define BLOCK_SIZE 1024
// Globals for wall-clock timing via gettimeofday(); startTime is set in
// main() and endTime is refreshed by report_running_time().
struct timezone Idunno;
struct timeval startTime, endTime;
// Element-wise vector addition: z[i] = x[i] + y[i] for i in [0, n).
// Grid-stride pattern: a thread begins at its flat global index and keeps
// stepping by the total launched thread count until it passes n.
__global__
void add(float* x, float* y, float* z, int n){
    int total = blockDim.x * gridDim.x;
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    while (i < n) {
        z[i] = x[i] + y[i];
        i += total;
    }
}
// Compute, print, and return (in seconds) the wall-clock time elapsed since
// the global startTime. Refreshes the global endTime as a side effect.
double report_running_time() {
    gettimeofday(&endTime, &Idunno);
    long s  = endTime.tv_sec  - startTime.tv_sec;
    long us = endTime.tv_usec - startTime.tv_usec;
    // Normalize a negative microsecond remainder by borrowing a second.
    if (us < 0) {
        --s;
        us += 1000000;
    }
    printf("Running time for GPU version: %ld.%06ld\n", s, us);
    return s * 1.0 + us / 1000000.0;
}
// Vector addition using unified (managed) memory: one allocation is visible
// to both host and device, with pages migrated on demand.
int main(){
    // Problem size: 2^28 floats, i.e. 1 GiB per array.
    int N = 1 << 28;
    // Byte count in size_t: N * sizeof(float) is 2^30 — too close to INT_MAX
    // to keep in an int safely.
    size_t nBytes = (size_t)N * sizeof(float);
    // Allocate managed memory, checking every CUDA API return code.
    float *x, *y, *z;
    cudaError_t err;
    err = cudaMallocManaged((void**)&x, nBytes);
    if (err == cudaSuccess) err = cudaMallocManaged((void**)&y, nBytes);
    if (err == cudaSuccess) err = cudaMallocManaged((void**)&z, nBytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Initialize directly from the host — no explicit cudaMemcpy needed.
    for (int i = 0; i < N; ++i) {
        x[i] = 66.66f;
        y[i] = 77.77f;
    }
    // Launch configuration: enough blocks to cover N once
    // (the kernel's grid-stride loop tolerates any grid size).
    dim3 blockSize(BLOCK_SIZE);
    dim3 gridSize((N + BLOCK_SIZE - 1) / BLOCK_SIZE);
    gettimeofday(&startTime, &Idunno);
    add<<<gridSize, blockSize>>>(x, y, z, N);
    // The launch is asynchronous: synchronize BEFORE reading the clock,
    // otherwise only the launch call is timed, not the kernel itself.
    // Synchronizing is also required before the host touches z[].
    cudaDeviceSynchronize();
    report_running_time();
    // Verify the result. maxError must start at 0.0f — previously it was
    // read uninitialized on the first fmax call (undefined behavior).
    float maxError = 0.0f;
    // Compute the expected value the same way the kernel does, in float.
    const float expected = 66.66f + 77.77f;
    for (int i = 0; i < N; ++i) {
        maxError = fmaxf(maxError, fabsf(z[i] - expected));
    }
    printf("最大误差:%f\n", maxError);
    // Managed allocations must be released with cudaFree (previously leaked).
    cudaFree(x);
    cudaFree(y);
    cudaFree(z);
    return 0;
}
三、性能比较
3.1 nvprof简介
nvprof 是一个可用于Linux、Windows和OS X的命令行探查器。
使用 nvprof ./myApp 运行我的应用程序,我可以快速看到它所使用的所有内核和内存副本的摘要,摘要将对同一内核的所有调用组合在一起,显示每个内核的总时间和总应用程序时间的百分比。
除了摘要模式之外,nvprof 还支持 GPU 跟踪(GPU-trace)和 API 跟踪(API-trace)模式,它可以让您看到所有内核启动和内存拷贝的完整列表;在 API 跟踪模式下,还可以看到所有 CUDA API 调用的完整列表。
基本使用方法:
nvcc test.cu -o t
nvprof ./t
将打印可执行cuda程序的性能摘要
3.2 常规内存使用方案性能摘要
[root@node1 self_code]# nvprof ./v
==112266== NVPROF is profiling process 112266, command: ./v
Running time for GPU version: 0.827946
==112266== Profiling application: ./v
==112266== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 53.12% 439.09ms 2 219.55ms 218.82ms 220.27ms [CUDA memcpy HtoD]
46.41% 383.59ms 1 383.59ms 383.59ms 383.59ms [CUDA memcpy DtoH]
0.47% 3.9018ms 1 3.9018ms 3.9018ms 3.9018ms add(float*, float*, float*, int)
API calls: 63.07% 827.86ms 3 275.95ms 219.04ms 388.37ms cudaMemcpy
29.35% 385.22ms 3 128.41ms 1.1112ms 382.99ms cudaMalloc
7.41% 97.244ms 3 32.415ms 4.1328ms 52.933ms cudaFree
0.09% 1.1639ms 1 1.1639ms 1.1639ms 1.1639ms cuDeviceTotalMem
0.07% 962.21us 96 10.023us 370ns 419.72us cuDeviceGetAttribute
0.01% 129.22us 1 129.22us 129.22us 129.22us cuDeviceGetName
0.00% 64.706us 1 64.706us 64.706us 64.706us cudaLaunchKernel
0.00% 8.8230us 1 8.8230us 8.8230us 8.8230us cuDeviceGetPCIBusId
0.00% 6.6990us 2 3.3490us 444ns 6.2550us cuDeviceGet
0.00% 3.2760us 3 1.0920us 411ns 2.3980us cuDeviceGetCount
0.00% 732ns 1 732ns 732ns 732ns cuDeviceGetUuid
3.3 智能内存使用方案性能摘要
[root@node1 self_code]# nvprof ./V
==113269== NVPROF is profiling process 113269, command: ./V
Running time for GPU version: 0.000285
最大误差:0.000000
==113269== Profiling application: ./V
==113269== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 749.36ms 1 749.36ms 749.36ms 749.36ms add(float*, float*, float*, int)
API calls: 64.20% 749.32ms 1 749.32ms 749.32ms 749.32ms cudaDeviceSynchronize
35.59% 415.45ms 3 138.48ms 61.305us 415.27ms cudaMallocManaged
0.10% 1.1580ms 1 1.1580ms 1.1580ms 1.1580ms cuDeviceTotalMem
0.08% 962.41us 96 10.025us 372ns 419.26us cuDeviceGetAttribute
0.02% 193.73us 1 193.73us 193.73us 193.73us cudaLaunchKernel
0.01% 126.98us 1 126.98us 126.98us 126.98us cuDeviceGetName
0.00% 7.0760us 1 7.0760us 7.0760us 7.0760us cuDeviceGetPCIBusId
0.00% 6.8750us 2 3.4370us 356ns 6.5190us cuDeviceGet
0.00% 3.6320us 3 1.2100us 468ns 2.5340us cuDeviceGetCount
0.00% 732ns 1 732ns 732ns 732ns cuDeviceGetUuid
==113269== Unified Memory profiling result:
Device "Tesla V100-SXM2-16GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
64488 32.398KB 4.0000KB 0.9961MB 1.992474GB 376.3038ms Host To Device
6142 170.71KB 4.0000KB 0.9961MB 0.999939GB 91.23770ms Device To Host
2452 - - - - 745.1175ms Gpu page fault groups
Total CPU Page faults: 9216