CUDA编程02——向量相加

预备知识

先来看一张图:

Host:指的是CPU端。

Device:指的是GPU端。

Device上有很多执行单元,也就是上图中的Thread。每个Thread可以执行一个核函数(在GPU上执行的函数)。

Device上包含若干个网格(Grid),每个网格中包含若干个块(Block),每个块内包含若干个线程(Thread)。在上一篇博客中,下面这些变量就是和这三个名词相关的:

Maximum number of threads per SM:               2048
Maximum number of threads per block:            1024
Maximum size of each dimension of a block:      1024 x 1024 x 64
Maximum size of each dimension of a grid:       2147483647 x 65535 x 65535

 需要注意的是:网格、块都可以是多维的,例如一维网格,二维网格,三维网格,一维块等等。不同的显卡的计算能力不同,这些维度上限可能也不一样(最大也就是3维),如下图:

 由于GPU线程是并行执行的,因此每个线程的执行数据都是不相同的。CUDA提供了多个内置的变量来计算各个线程的ID(可以理解为核函数通过计算当前线程ID来抓取不同的数据进行处理),这些内置变量都定义在device_launch_parameters.h文件中,如下:

uint3 __device_builtin__ __STORAGE__ threadIdx;
uint3 __device_builtin__ __STORAGE__ blockIdx;
dim3 __device_builtin__ __STORAGE__ blockDim;
dim3 __device_builtin__ __STORAGE__ gridDim;
int __device_builtin__ __STORAGE__ warpSize;

 附上常用的计算线程ID,块ID的一些代码:

//<1d grid, 1d block>
idx = blockIdx.x*blockDim.x + threadIdx.x

//<1d grid, 2d block>
idx = blockIdx.x*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x

//<1d grid, 3d block>
idx = blockIdx.x*blockDim.x*blockDim.y*blockDim.z +
      threadIdx.z*blockDim.x*blockDim.y +
	  threadIdx.y*blockDim.x +
	  threadIdx.x

//<2d grid, 1d block>
blockId = blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x + threadIdx.x

//<2d grid, 2d block>
blockId = blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x

//<2d grid, 3d block>
blockId = blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x*blockDim.y*blockDim.z +
           threadIdx.z*blockDim.x*blockDim.y +
		   threadIdx.y*blockDim.x +
		   threadIdx.x

//<3d grid, 1d block>
blockId = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x + threadIdx.x

//<3d grid, 2d block>
blockId = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x

//<3d grid, 3d block>
blockId = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x*blockDim.y*blockDim.z +
           threadIdx.z*blockDim.x*blockDim.y + 
		   threadIdx.y*blockDim.x +
		   threadIdx.x

实验部分 

下面就正式开始编写一个执行向量加法的程序。同时包含CPU和GPU的代码,计算它们的各自运行时间(感叹一下,显卡计算能力不同,运行时间差的真是大,见后面结果部分)

add_vec.cu代码

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// Element type of the vectors; change here to switch to double precision.
typedef float FLOAT;

// Forward declarations (definitions below).
double get_time();                                                   // wall-clock seconds
void warm_up();                                                      // dummy GPU launches
void vec_add_host(FLOAT* x, FLOAT* y, FLOAT* z, int N);              // CPU reference
__global__ void vec_add_device(FLOAT* x, FLOAT* y, FLOAT* z, int N); // GPU kernel

// Flat thread/block indices for a <2d grid, 1d block> launch layout:
// blocks are numbered row-major across the 2D grid, threads within a
// block are 1D.
// <2d grid, 1d block>
#define get_tid() ((blockIdx.y*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x)
#define get_bid() (blockIdx.y*gridDim.x + blockIdx.x)

// Set to 1 on Windows to use QueryPerformanceCounter for timing.
#define WINDOWS 0

//time
#if WINDOWS

#include <windows.h>
// Wall-clock time in seconds (Windows): performance-counter ticks divided
// by the counter frequency. The frequency is queried once and cached.
double get_time()
{
    static LARGE_INTEGER fre;
    static int init = 0;
    LARGE_INTEGER timer;

    if (!init)
    {
        QueryPerformanceFrequency(&fre);
        init = 1;
    }

    QueryPerformanceCounter(&timer);
    return (double)timer.QuadPart / (double)fre.QuadPart;
}

#else

#include <sys/time.h>
#include <time.h>
// Wall-clock time in seconds (POSIX), assembled from gettimeofday():
// whole seconds plus microseconds scaled to seconds.
double get_time()
{
    struct timeval tv;
    gettimeofday(&tv, (struct timezone*)0);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
}

#endif

// warm up gpu
// Trivial do-nothing kernel; launched a few times only to trigger GPU/driver
// initialization so it does not pollute the timed measurements below.
__global__ void warmup_knl(void)
{
	int a = 1;
	int b = 1;
	a = a + b;
}

// Launch a handful of dummy kernels to warm up the GPU (driver/context
// initialization, clock ramp-up) before any timing is done.
void warm_up()
{
	for (int i = 0; i < 8; ++i)
	{
		warmup_knl<<<1, 256>>>();
	}
	// Kernel launches are asynchronous: wait for the dummy launches to
	// finish so they cannot overlap with the timed region that follows.
	cudaDeviceSynchronize();
}

// host code
// CPU reference: accumulate x and y into z, i.e. z[i] += x[i] + y[i].
// z is both input and output, so repeated calls keep accumulating —
// this deliberately matches the device kernel's behavior.
void vec_add_host(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
	for (int i = 0; i < N; ++i)
	{
		z[i] += x[i] + y[i];
	}
}

// device code
// GPU kernel: z[i] = x[i] + y[i] + z[i], one element per thread.
// Expects the <2d grid, 1d block> launch layout assumed by get_tid();
// the bounds check handles the tail where the grid overshoots N.
__global__ void vec_add_device(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
	int tid = get_tid();  // flat global thread index
	if (tid < N)
	{
		z[tid] = x[tid] + y[tid] + z[tid];
	}
}

// Benchmark driver: adds two 20M-element vectors `iter` times on the GPU
// and on the CPU, and prints both wall-clock times plus the speedup.
int main()
{
	int N = 20000000;                           /* number of elements */
	size_t nbytes = (size_t)N * sizeof(FLOAT);  /* size_t: avoids int overflow for larger N */

	/* 1d block of 256 threads */
	int bs = 256;
	/* 2d grid: s x s blocks, enough to cover all N elements */
	int s = ceil(sqrt((N + bs - 1.) / bs));
	dim3 grid = dim3(s, s);

	FLOAT* dx = NULL, *hx = NULL;
	FLOAT* dy = NULL, *hy = NULL;
	FLOAT* dz = NULL, *hz = NULL;

	int iter = 30;  /* timing repetitions */
	int i;
	double th, td;

	warm_up();

	/* allocate gpu memory; check the return codes — on failure cudaMalloc
	   leaves the pointer untouched, so a NULL test alone is not reliable */
	if (cudaMalloc((void**)&dx, nbytes) != cudaSuccess ||
	    cudaMalloc((void**)&dy, nbytes) != cudaSuccess ||
	    cudaMalloc((void**)&dz, nbytes) != cudaSuccess)
	{
		printf("could not allocate gpu memory\n");
		return -1;
	}

	hx = (FLOAT*)malloc(nbytes);
	hy = (FLOAT*)malloc(nbytes);
	hz = (FLOAT*)malloc(nbytes);
	if (hx == NULL || hy == NULL || hz == NULL)
	{
		printf("could not allocate cpu memory\n");
		return -2;
	}

	/* init: all ones */
	for (i = 0; i < N; ++i)
	{
		hx[i] = 1;
		hy[i] = 1;
		hz[i] = 1;
	}

	/* copy data to gpu */
	cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice);
	cudaMemcpy(dy, hy, nbytes, cudaMemcpyHostToDevice);
	cudaMemcpy(dz, hz, nbytes, cudaMemcpyHostToDevice);

	warm_up();

	/* time the gpu version; launches are asynchronous, so synchronize
	   before reading the clock on both sides of the loop
	   (cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize) */
	cudaDeviceSynchronize();
	td = get_time();

	for (i = 0; i < iter; ++i) vec_add_device<<<grid, bs>>>(dx, dy, dz, N);

	cudaDeviceSynchronize();
	td = get_time() - td;

	/* time the cpu version */
	th = get_time();
	for (i = 0; i < iter; ++i) vec_add_host(hx, hy, hz, N);
	th = get_time() - th;

	printf("GPU time: %.4f, CPU time: %.4f, Speedup: %g\n", td, th, th / td);

	/* host memory is released with free(), device memory with cudaFree()
	   — the original passed the HOST pointers to cudaFree, which is
	   invalid and leaked all three device buffers */
	free(hx);
	free(hy);
	free(hz);
	cudaFree(dx);
	cudaFree(dy);
	cudaFree(dz);

	return 0;
}

注意 

由于Windows和Linux下的运行时间计算方法不同,所以加了一个宏,请根据自己情况修改!

Linux下请自行安装cuda toolkit !!!

附上Ubuntu下的CMakeLists.txt内容:

cmake_minimum_required(VERSION 2.8)
project(vectorAdd)
FIND_PACKAGE(CUDA REQUIRED)

# set source files
set(PROJECT_SRC vec_add.cu)
cuda_add_executable(vectorAdd ${PROJECT_SRC})

以及Linux下的执行命令:

mkdir build
cd build
cmake ..
make
./vectorAdd

结果

1. 渣渣显卡 GeForce 920MX

GPU time: 1.3785, CPU time: 2.3346, Speedup: 1.6935

2. 牛逼显卡 Tesla K80

GPU time: 0.0729, CPU time: 2.0984, Speedup: 28.8006

3. 更牛逼显卡 RTX TITAN

GPU time: 0.0274, CPU time: 1.6466, Speedup: 60.0391
  • 2
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值