统一内存(unified memory)是一种逻辑上的概念,它既不是显存,也不是主机内存。它能提高GPU编程的易用性,甚至提升性能:使用统一内存后,不需要手动在主机与设备之间传输数据,不需要对同一组数据同时定义两个指针,也不需要分别手动分配主机内存和GPU内存。此外,它也适用于多GPU编程。
动态统一内存代码示例 add.cu:
#include "error.cuh"
#include <math.h>
#include <stdio.h>
// Absolute tolerance used by check() when comparing a result with c.
const double EPSILON = 1.0e-15;
// Value the host writes into every element of x.
const double a = 1.23;
// Value the host writes into every element of y.
const double b = 2.34;
// Value check() expects in every element of z (the sum a + b).
const double c = 3.57;
// Kernel: each thread computes one element, z[n] = x[n] + y[n].
void __global__ add(const double *x, const double *y, double *z);
// Host-side verification of the first N elements of z; prints the verdict.
void check(const double *z, const int N);
/*
 * Adds two arrays of N doubles on the GPU using unified (managed) memory.
 *
 * The three arrays are allocated with cudaMallocManaged, so the same pointers
 * are written by the host, passed to the kernel, and read back by the host
 * without any explicit copy. Every CUDA runtime call is wrapped in CHECK.
 *
 * Returns 0 on success; CHECK terminates the process on any CUDA error.
 */
int main(void)
{
    const int N = 100000000;
    // Byte count is kept in size_t (the type cudaMallocManaged takes) so it
    // cannot overflow a 32-bit int if N is increased.
    const size_t M = sizeof(double) * N;
    double *x, *y, *z; // one pointer per array, shared by host and device
    CHECK(cudaMallocManaged((void **)&x, M)); // allocate unified memory
    CHECK(cudaMallocManaged((void **)&y, M));
    CHECK(cudaMallocManaged((void **)&z, M));
    for (int n = 0; n < N; ++n)
    {
        x[n] = a; // the host writes the managed arrays directly
        y[n] = b;
    }
    const int block_size = 128;
    // The add kernel has no bounds guard, so exactly N threads must be
    // launched. N is a multiple of block_size, so this division is exact.
    const int grid_size = N / block_size;
    add<<<grid_size, block_size>>>(x, y, z); // the GPU accesses the same pointers
    // A kernel launch returns no status; launch-configuration errors are
    // only reported through cudaGetLastError().
    CHECK(cudaGetLastError());
    // Wait for the kernel to finish before the host reads z.
    CHECK(cudaDeviceSynchronize());
    check(z, N);
    CHECK(cudaFree(x)); // release unified memory
    CHECK(cudaFree(y));
    CHECK(cudaFree(z));
    return 0;
}
/*
 * Element-wise addition kernel: each thread handles one element and stores
 * z[i] = x[i] + y[i], where i is the thread's global 1D index.
 *
 * The kernel needs no modification to work with unified memory.
 * There is no bounds check, so every launched thread must map to a valid
 * element of x, y and z.
 */
void __global__ add(const double *x, const double *y, double *z)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    z[idx] = x[idx] + y[idx];
}
/*
 * Verifies on the host that each of the first N elements of z equals the
 * expected constant c to within EPSILON, then prints "Has errors" or
 * "No errors" on its own line.
 *
 * z: array of results to verify (read only).
 * N: number of elements to examine; nothing is read when N <= 0.
 */
void check(const double *z, const int N)
{
    bool has_error = false;
    for (int n = 0; n < N; ++n)
    {
        // Written as a negated "<=" so that a NaN element, for which every
        // ordered comparison is false, is counted as an error instead of
        // silently passing.
        if (!(fabs(z[n] - c) <= EPSILON))
        {
            has_error = true;
            break; // the verdict is already decided; no need to scan further
        }
    }
    printf("%s\n", has_error ? "Has errors" : "No errors");
}
头文件 error.cuh 中定义了用于错误检查的宏 CHECK:
#pragma once
#include <stdio.h>
#include <stdlib.h>
/*
 * CHECK(call): evaluates `call`, an expression yielding a cudaError_t, once.
 * If the result is not cudaSuccess, prints the source file and line of the
 * CHECK, the numeric error code and its text from cudaGetErrorString, then
 * terminates the process with exit status 1.
 *
 * The body is wrapped in do { ... } while (0) so that `CHECK(x);` forms a
 * single statement and remains valid inside an unbraced if/else. The
 * argument is parenthesized so that any expression can be passed safely.
 */
#define CHECK(call)                                   \
    do                                                \
    {                                                 \
        const cudaError_t error_code = (call);        \
        if (error_code != cudaSuccess)                \
        {                                             \
            printf("CUDA Error:\n");                  \
            printf(" File: %s\n", __FILE__);          \
            printf(" Line: %d\n", __LINE__);          \
            printf(" Error code: %d\n", error_code);  \
            printf(" Error text: %s\n",               \
                   cudaGetErrorString(error_code));   \
            exit(1);                                  \
        }                                             \
    } while (0)
编译运行:
$ nvcc add.cu -o add
$ ./add