本文通过使用malloc分配内存和cudaHostAlloc分配页锁定内存,说明使用页锁定内存可提高运算效率,并指出哪些场合适合使用页锁定内存。
malloc分配的是标准的可分页(pageable)主机内存,操作系统在对内存进行调度的时候可能会将这种内存分页或者交换到磁盘上,需要的时候再调回内存,这样就会增加运算时间。而cudaHostAlloc分配的是页锁定的(page-locked)主机内存,操作系统不会对这块内存分页和交换到磁盘上,确保该内存始终驻留在物理内存中。由于物理地址固定,GPU可以通过DMA直接访问这块内存;而对可分页内存,驱动需要先把数据复制到一块临时的页锁定缓冲区再进行传输,多了一次拷贝,因此速度更慢。
下面通过100M个int(约400MB)数据在主机和设备之间的拷贝(重复10次)说明二者的差异。贴上代码:
/********************************************************************
* PageLockedMem.cu
* Compare the performance of general mem and page locked mem.
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
/* Number of int elements in each test buffer (100 Mi elements).
 * Parenthesized so the expansion stays a single operand when it appears
 * next to operators such as '/', '%', casts or sizeof. */
#define _SIZE (100*1024*1024)
/************************************************************************/
/* Init CUDA */
/************************************************************************/
/* Presumably selects and initializes a CUDA device; main() exits without
 * running any test when this returns false.
 * NOTE(review): the body is elided ("......") in this listing and must be
 * filled in before the file will compile. */
bool InitCUDA(void)
{
......
}
/**
 * Times ten blocking cudaMemcpy transfers between a host buffer obtained
 * from malloc (ordinary pageable memory) and a device buffer obtained from
 * cudaMalloc.
 *
 * @param size number of int elements in each buffer
 * @param dir  true copies host -> device, false copies device -> host
 * @return time in milliseconds between the start and stop events, i.e. the
 *         total for all ten copies
 *
 * The process is terminated if the host allocation fails, because a NULL
 * host pointer would otherwise be handed to cudaMemcpy. CUDA API failures
 * are delegated to cutilSafeCall (defined in cutil_inline.h).
 */
float cudaMallocTest(int size, bool dir)
{
    cudaEvent_t start, stop;
    float elapsedTime;
    int *a, *dev_a;
    /* Computed once in size_t so every allocation and copy agrees. */
    const size_t bytes = (size_t)size * sizeof(int);

    a = (int*)malloc(bytes);
    if (!a)
    {
        printf("Mem error!\n");
        exit(EXIT_FAILURE);
    }
    cutilSafeCall(cudaMalloc((void**)&dev_a, bytes));

    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&stop));

    cutilSafeCall(cudaEventRecord(start, 0));
    for (int i = 0; i < 10; i++)
    {
        if (dir)
        {
            cutilSafeCall(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
        }
        else
        {
            cutilSafeCall(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost));
        }
    }
    cutilSafeCall(cudaEventRecord(stop, 0));
    /* The stop event must have completed before the elapsed time is read. */
    cutilSafeCall(cudaEventSynchronize(stop));
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime, start, stop));

    free(a);
    cutilSafeCall(cudaFree(dev_a));
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(stop));
    return elapsedTime;
}
/**
 * Times ten blocking cudaMemcpy transfers between a host buffer obtained
 * from cudaHostAlloc (page-locked memory, cudaHostAllocDefault flags) and a
 * device buffer obtained from cudaMalloc.
 *
 * @param size number of int elements in each buffer
 * @param dir  true copies host -> device, false copies device -> host
 * @return time in milliseconds between the start and stop events, i.e. the
 *         total for all ten copies
 *
 * CUDA API failures are delegated to cutilSafeCall (defined in
 * cutil_inline.h).
 */
float cudaHostAllocTest(int size, bool dir)
{
    cudaEvent_t start, stop;
    float elapsedTime;
    int *a, *dev_a;
    /* Computed once in size_t so every allocation and copy agrees. */
    const size_t bytes = (size_t)size * sizeof(int);

    cutilSafeCall(cudaHostAlloc((void**)&a, bytes, cudaHostAllocDefault));
    cutilSafeCall(cudaMalloc((void**)&dev_a, bytes));

    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&stop));

    cutilSafeCall(cudaEventRecord(start, 0));
    for (int i = 0; i < 10; i++)
    {
        if (dir)
        {
            cutilSafeCall(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
        }
        else
        {
            cutilSafeCall(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost));
        }
    }
    cutilSafeCall(cudaEventRecord(stop, 0));
    /* The stop event must have completed before the elapsed time is read. */
    cutilSafeCall(cudaEventSynchronize(stop));
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime, start, stop));

    /* Page-locked memory is released with cudaFreeHost, not free(). */
    cutilSafeCall(cudaFreeHost(a));
    cutilSafeCall(cudaFree(dev_a));
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(stop));
    return elapsedTime;
}
/**
 * Runs the pageable (malloc) and page-locked (cudaHostAlloc) copy tests in
 * both directions and prints the elapsed time and throughput of each.
 *
 * @return 0 after all four tests ran, EXIT_FAILURE if InitCUDA() fails
 */
int main(int argc, char* argv[])
{
    if (!InitCUDA()) {
        return EXIT_FAILURE;
    }

    /* Each test function copies its buffer 10 times; this factor must match
     * the loop count there or the reported bandwidth is off by that ratio. */
    const int copiesPerTest = 10;
    /* Total data moved per test, in MiB. */
    const float MB = (float)copiesPerTest
                   * (float)((size_t)(_SIZE) * sizeof(int)) / 1024 / 1024;
    float elapsedTime;

    /* Pageable host memory, host -> device. */
    elapsedTime = cudaMallocTest(_SIZE, true);
    printf("Time using cudaMalloc: %3.1f ms\n", elapsedTime);
    printf("\tMB/s during copy up: %3.1f \n", MB/(elapsedTime/1000));

    /* Pageable host memory, device -> host (dir must be false). */
    elapsedTime = cudaMallocTest(_SIZE, false);
    printf("Time using cudaMalloc: %3.1f ms\n", elapsedTime);
    printf("\tMB/s during copy down: %3.1f \n", MB/(elapsedTime/1000));

    /* Page-locked host memory, host -> device. */
    elapsedTime = cudaHostAllocTest(_SIZE, true);
    printf("Time using cudaHostAlloc: %3.1f ms\n", elapsedTime);
    printf("\tMB/s during copy up: %3.1f \n", MB/(elapsedTime/1000));

    /* Page-locked host memory, device -> host (dir must be false). */
    elapsedTime = cudaHostAllocTest(_SIZE, false);
    printf("Time using cudaHostAlloc: %3.1f ms\n", elapsedTime);
    printf("\tMB/s during copy down: %3.1f \n", MB/(elapsedTime/1000));

    return 0;
}
从运行结果可以看出,使用页锁定内存后,数据拷贝的速度提高到原来的约2倍(耗时约减少一半)。
但是并不是所有的场合都适合用页锁定内存,因为使用页锁定内存(也称固定内存,pinned memory)时,将失去虚拟内存的所有功能,即每一块页锁定内存都必须始终占用物理内存,系统将更快耗尽内存(跟使用普通内存相比)。所以要根据需要进行选择。