利用分页锁定主机存储器进行主机与设备之间的0拷贝

最新推荐文章于 2021-11-12 19:37:56 发布

yanghangjun

最新推荐文章于 2021-11-12 19:37:56 发布

阅读量1.6k

点赞数

分类专栏： CUDA 文章标签：存储 float pointers documentation c user

本文链接：https://blog.csdn.net/yanghangjun/article/details/6228287

版权

CUDA 专栏收录该内容

27 篇文章 2 订阅

订阅专栏

/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/

#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>
#include <cuda.h>

/* Add two vectors on the GPU */

__global__ void vectorAddGPU(float *a, float *b, float *c, int N)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < N)
c[idx] = a[idx] + b[idx];
}

int main(int argc, char **argv)
{
int n, nelem, idev, deviceCount;
char *device = NULL;
unsigned int flags;
size_t bytes;
float *a, *b, *c; // Pinned memory allocated on the CPU
float *d_a, *d_b, *d_c; // Device pointers for mapped memory
float errorNorm, refNorm, ref, diff;
cudaDeviceProp deviceProp;

if(cutCheckCmdLineFlag(argc, (const char **)argv, "help"))
{
    printf("Usage: simpleZeroCopy [OPTION]/n/n");
    printf("Options:/n");
    printf(" --device=[device #] Specify the device to be used/n");
    cutilExit(argc, argv);
}

/* Get the device selected by the user or default to 0, and then set it. */

if(cutGetCmdLineArgumentstr(argc, (const char**)argv, "device", &device))
{
 cudaGetDeviceCount(&deviceCount);
 idev = atoi(device);
 if(idev >= deviceCount || idev < 0)
 {
 fprintf(stderr, "Invalid device number %d, using default device 0./n",
 idev);
 idev = 0;
 }
}
else
{
 idev = 0;
}

cutilSafeCall(cudaSetDevice(idev));

/* Verify the selected device supports mapped memory and set the device
flags for mapping host memory. */

cutilSafeCall(cudaGetDeviceProperties(&deviceProp, idev));

#if CUDART_VERSION >= 2020
printf("CUDART_VERSION >= 2020/n");
if(!deviceProp.canMapHostMemory)
{
 fprintf(stderr, "Device %d cannot map host memory!/n", idev);
 printf("PASSED");
 cutilExit(argc, argv);
}
cutilSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost));
#else
fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field/n");
printf("PASSED");
cutilExit(argc, argv);
#endif

/* Allocate mapped CPU memory. */

nelem = 1048576;
bytes = nelem*sizeof(float);
#if CUDART_VERSION >= 2020
flags = cudaHostAllocMapped;
cutilSafeCall(cudaHostAlloc((void **)&a, bytes, flags));
cutilSafeCall(cudaHostAlloc((void **)&b, bytes, flags));
cutilSafeCall(cudaHostAlloc((void **)&c, bytes, flags));
#endif
/* Initialize the vectors. */

for(n = 0; n < nelem; n++)
{
a[n] = rand() / (float)RAND_MAX;
b[n] = rand() / (float)RAND_MAX;
}

/* Get the device pointers for the pinned CPU memory mapped into the GPU
memory space. */

#if CUDART_VERSION >= 2020
cutilSafeCall(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0));
cutilSafeCall(cudaHostGetDevicePointer((void **)&d_b, (void *)b, 0));
cutilSafeCall(cudaHostGetDevicePointer((void **)&d_c, (void *)c, 0));
#endif

/* Call the GPU kernel using the device pointers for the mapped memory. */

printf("Adding vectors using mapped CPU memory.../n");
dim3 block(256);
dim3 grid((unsigned int)ceil(nelem/(float)block.x));
vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem);
cutilSafeCall(cudaThreadSynchronize());
cutilCheckMsg("vectorAddGPU() execution failed");

/* Compare the results */

printf("Checking the results.../n");
errorNorm = 0.f;
refNorm = 0.f;
for(n = 0; n < nelem; n++)
{
 ref = a[n] + b[n];
 diff = c[n] - ref;
 errorNorm += diff*diff;
 refNorm += ref*ref;
}
errorNorm = (float)sqrt((double)errorNorm);
refNorm = (float)sqrt((double)refNorm);

/* Memory clean up */

printf("Releasing CPU memory.../n");
#if CUDART_VERSION >= 2020
cutilSafeCall(cudaFreeHost(a));
cutilSafeCall(cudaFreeHost(b));
cutilSafeCall(cudaFreeHost(c));
#endif

cudaThreadExit();

printf("%s/n", (errorNorm/refNorm < 1.e-6f) ? "PASSED" : "FAILED");

cutilExit(argc, argv);
}

yanghangjun

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
利用分页锁定主机存储器进行主机与设备之间的0拷贝

 /* * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * th
复制链接

扫一扫

专栏目录