Prerequisites
Let's start with a diagram:
Host: the CPU side.
Device: the GPU side.
The device contains a large number of execution units, the Threads in the figure above. Each thread runs a kernel (a function that executes on the GPU).
On the device, threads are organized into grids (Grid); each grid contains a number of blocks (Block), and each block contains a number of threads (Thread). The following values from the previous post correspond to these three terms:
Maximum number of threads per SM: 2048
Maximum number of threads per block: 1024
Maximum size of each dimension of a block: 1024 x 1024 x 64
Maximum size of each dimension of a grid: 2147483647 x 65535 x 65535
Note that both grids and blocks can be multidimensional: a 1D grid, a 2D grid, a 3D grid, a 1D block, and so on (three dimensions at most). Cards with different compute capabilities may have different upper limits on each dimension.
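If you want to check these limits on your own card, here is a minimal sketch using the runtime API's cudaGetDeviceProperties (it assumes device 0 is the card you care about):

#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); /* properties of device 0 */

    printf("Max threads per SM:    %d\n", prop.maxThreadsPerMultiProcessor);
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
    printf("Max block dims: %d x %d x %d\n",
           prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("Max grid dims:  %d x %d x %d\n",
           prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    return 0;
}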
Because GPU threads execute in parallel, each thread must work on a different piece of data. CUDA provides several built-in variables from which each thread can compute its own ID (think of the kernel as using the current thread's ID to fetch the data it is responsible for). These built-ins are declared in device_launch_parameters.h:
uint3 __device_builtin__ __STORAGE__ threadIdx;
uint3 __device_builtin__ __STORAGE__ blockIdx;
dim3 __device_builtin__ __STORAGE__ blockDim;
dim3 __device_builtin__ __STORAGE__ gridDim;
int __device_builtin__ __STORAGE__ warpSize;
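These variables only have meaningful values inside a kernel. To get a feel for them, here is a minimal sketch (the kernel name show_ids is mine) that prints each thread's coordinates with device-side printf, which requires compute capability 2.0 or higher:

#include <stdio.h>
#include <cuda_runtime.h>

__global__ void show_ids(void)
{
    printf("block (%d,%d) thread (%d,%d)\n",
           blockIdx.x, blockIdx.y, threadIdx.x, threadIdx.y);
}

int main()
{
    show_ids<<<dim3(2, 2), dim3(2, 2)>>>(); /* a 2x2 grid of 2x2 blocks: 16 lines */
    cudaDeviceSynchronize();                /* wait for the device output to flush */
    return 0;
}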
Here are the usual formulas for computing thread IDs and block IDs under the common grid/block layouts:
//<1d grid, 1d block>
idx = blockIdx.x*blockDim.x + threadIdx.x
//<1d grid, 2d block>
idx = blockIdx.x*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x
//<1d grid, 3d block>
idx = blockIdx.x*blockDim.x*blockDim.y*blockDim.z +
threadIdx.z*blockDim.x*blockDim.y +
threadIdx.y*blockDim.x +
threadIdx.x
//<2d grid, 1d block>
blockId = blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x + threadIdx.x
//<2d grid, 2d block>
blockId = blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x
//<2d grid, 3d block>
blockId = blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x*blockDim.y*blockDim.z +
threadIdx.z*blockDim.x*blockDim.y +
threadIdx.y*blockDim.x +
threadIdx.x
//<3d grid, 1d block>
blockId = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x + threadIdx.x
//<3d grid, 2d block>
blockId = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x
//<3d grid, 3d block>
blockId = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x*blockDim.y*blockDim.z +
threadIdx.z*blockDim.x*blockDim.y +
threadIdx.y*blockDim.x +
threadIdx.x
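To see one of these formulas in context, here is a sketch of a kernel using the <2d grid, 1d block> case, which is exactly the layout the vector-add program below adopts via its get_tid() macro (copy_kernel is a made-up name):

__global__ void copy_kernel(const float* in, float* out, int N)
{
    int blockId  = blockIdx.y*gridDim.x + blockIdx.x;
    int threadId = blockId*blockDim.x + threadIdx.x;
    if (threadId < N)  /* the grid may contain more threads than elements */
        out[threadId] = in[threadId];
}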
Experiment
Now let's write a program that performs vector addition, with both a CPU and a GPU version, and time each one. (It is striking how much the runtime differs across cards of different compute capability; see the results below.)
vec_add.cu:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
typedef float FLOAT;
double get_time();
void warm_up();
void vec_add_host(FLOAT* x, FLOAT* y, FLOAT* z, int N);
__global__ void vec_add_device(FLOAT* x, FLOAT* y, FLOAT* z, int N);
// <2d grid, 1d block>
#define get_tid() ((blockIdx.y*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x)
#define get_bid() (blockIdx.y*gridDim.x + blockIdx.x)
#define WINDOWS 0
//time
#if WINDOWS
#include <windows.h>
double get_time()
{
    LARGE_INTEGER timer;
    static LARGE_INTEGER fre;
    static int init = 0;
    double t;

    if (init != 1)
    {
        QueryPerformanceFrequency(&fre);
        init = 1;
    }
    QueryPerformanceCounter(&timer);

    t = timer.QuadPart * 1. / fre.QuadPart;
    return t;
}
#else
#include <sys/time.h>
#include <time.h>
double get_time()
{
    struct timeval tv;
    double t;

    gettimeofday(&tv, (struct timezone*)0);
    t = tv.tv_sec + (double)tv.tv_usec*1e-6;
    return t;
}
#endif
// warm up gpu: a do-nothing kernel, launched a few times so that the first
// timed kernel does not pay the GPU's start-up cost
__global__ void warmup_knl(void)
{
    int i, j;
    i = 1;
    j = 1;
    i = i + j;
}

void warm_up()
{
    int i = 0;
    for (; i < 8; ++i)
    {
        warmup_knl<<<1, 256>>>();
    }
}
// host code
void vec_add_host(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
    int i;
    /* note: accumulates into z, matching what the device kernel does */
    for (i = 0; i < N; ++i) z[i] = x[i] + y[i] + z[i];
}

// device code
__global__ void vec_add_device(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
    int idx = get_tid();
    /* one thread handles one element; the guard drops the excess threads */
    if (idx < N) z[idx] = x[idx] + y[idx] + z[idx];
}
int main()
{
    int N = 20000000;
    size_t nbytes = N * sizeof(FLOAT);

    /* 1d block */
    int bs = 256;

    /* 2d grid: s x s blocks, enough so that s*s*bs >= N */
    int s = ceil(sqrt((N + bs - 1.) / bs));
    dim3 grid = dim3(s, s);

    FLOAT* dx = NULL, *hx = NULL;
    FLOAT* dy = NULL, *hy = NULL;
    FLOAT* dz = NULL, *hz = NULL;

    int iter = 30;
    int i;
    double th, td;

    warm_up();

    /* allocate gpu memory */
    cudaMalloc((void**)&dx, nbytes);
    cudaMalloc((void**)&dy, nbytes);
    cudaMalloc((void**)&dz, nbytes);

    if (dx == NULL || dy == NULL || dz == NULL)
    {
        printf("could not allocate gpu memory\n");
        return -1;
    }

    hx = (FLOAT*)malloc(nbytes);
    hy = (FLOAT*)malloc(nbytes);
    hz = (FLOAT*)malloc(nbytes);

    if (hx == NULL || hy == NULL || hz == NULL)
    {
        printf("could not allocate cpu memory\n");
        return -2;
    }

    /* init */
    for (i = 0; i < N; ++i)
    {
        hx[i] = 1;
        hy[i] = 1;
        hz[i] = 1;
    }

    /* copy data to gpu */
    cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dy, hy, nbytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dz, hz, nbytes, cudaMemcpyHostToDevice);

    warm_up();

    /* time the gpu version; kernel launches are asynchronous,
       so synchronize before reading the clock */
    cudaDeviceSynchronize();
    td = get_time();
    for (i = 0; i < iter; ++i) vec_add_device<<<grid, bs>>>(dx, dy, dz, N);
    cudaDeviceSynchronize();
    td = get_time() - td;

    /* time the cpu version */
    th = get_time();
    for (i = 0; i < iter; ++i) vec_add_host(hx, hy, hz, N);
    th = get_time() - th;

    printf("GPU time: %.4f, CPU time: %.4f, Speedup: %g\n", td, th, th / td);

    /* free host memory with free(), device memory with cudaFree() */
    free(hx);
    free(hy);
    free(hz);
    cudaFree(dx);
    cudaFree(dy);
    cudaFree(dz);

    return 0;
}
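By the way, testing the pointers for NULL after cudaMalloc, as the listing does, is not a reliable failure check; the documented way is to inspect the cudaError_t that every runtime call returns. A minimal sketch of a check macro (CUDA_CHECK is my name for it, not part of the CUDA API):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* wrap a CUDA runtime call and abort with a readable message on failure */
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err_ = (call);                                  \
        if (err_ != cudaSuccess) {                                  \
            fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);  \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)

/* usage: CUDA_CHECK(cudaMalloc((void**)&dx, nbytes)); */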
Notes
Windows and Linux measure wall-clock time differently, hence the WINDOWS macro; set it according to your platform!
On Linux, install the CUDA toolkit yourself first!!!
Here is a CMakeLists.txt for Ubuntu:
cmake_minimum_required(VERSION 2.8)
project(vectorAdd)
FIND_PACKAGE(CUDA REQUIRED)
# set source files
set(PROJECT_SRC vec_add.cu)
cuda_add_executable(vectorAdd ${PROJECT_SRC})
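As a side note, the FindCUDA module used above is deprecated in newer CMake; since CMake 3.8 you can treat CUDA as a first-class language instead. A minimal sketch:

cmake_minimum_required(VERSION 3.8)
project(vectorAdd LANGUAGES CXX CUDA)
add_executable(vectorAdd vec_add.cu)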
And the commands to build and run on Linux:
mkdir build
cd build
cmake ..
make
./vectorAdd
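If you'd rather not use CMake at all, compiling directly with nvcc works too (assuming nvcc is on your PATH):

nvcc vec_add.cu -o vectorAdd
./vectorAdd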
Results
1. A humble GeForce 920MX
GPU time: 1.3785, CPU time: 2.3346, Speedup: 1.6935
2. A beefy Tesla K80
GPU time: 0.0729, CPU time: 2.0984, Speedup: 28.8006
3. An even beefier RTX TITAN
GPU time: 0.0274, CPU time: 1.6466, Speedup: 60.0391