Prerequisites
Let's start with a diagram:
Host: the CPU side.
Device: the GPU side.
The device contains a large number of execution units, the Threads in the figure above. Each thread runs a kernel (a function that executes on the GPU).
On the device, threads are organized into grids (Grid); each grid contains a number of blocks (Block), and each block contains a number of threads (Thread). The following values from the previous post correspond to these three terms:
Maximum number of threads per SM: 2048
Maximum number of threads per block: 1024
Maximum size of each dimension of a block: 1024 x 1024 x 64
Maximum size of each dimension of a grid: 2147483647 x 65535 x 65535
Note that both grids and blocks can be multidimensional: a 1D grid, a 2D grid, a 3D grid, a 1D block, and so on (three dimensions at most). Cards with different compute capabilities may have different upper limits on each dimension.
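If you want to check these limits on your own card, here is a minimal sketch using the runtime API's cudaGetDeviceProperties (it assumes device 0 is the card you care about):

#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); /* properties of device 0 */

    printf("Max threads per SM:    %d\n", prop.maxThreadsPerMultiProcessor);
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
    printf("Max block dims: %d x %d x %d\n",
           prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("Max grid dims:  %d x %d x %d\n",
           prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    return 0;
}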
Because GPU threads execute in parallel, each thread must work on a different piece of data. CUDA provides several built-in variables from which each thread can compute its own ID (think of the kernel as using the current thread's ID to fetch the data it is responsible for). These built-ins are declared in device_launch_parameters.h:
uint3 __device_builtin__ __STORAGE__ threadIdx;
uint3 __device_builtin__ __STORAGE__ blockIdx;
dim3 __device_builtin__ __STORAGE__ blockDim;
dim3 __device_builtin__ __STORAGE__ gridDim;
int __device_builtin__ __STORAGE__ warpSize;
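These variables only have meaningful values inside a kernel. To get a feel for them, here is a minimal sketch (the kernel name show_ids is mine) that prints each thread's coordinates with device-side printf, which requires compute capability 2.0 or higher:

#include <stdio.h>
#include <cuda_runtime.h>

__global__ void show_ids(void)
{
    printf("block (%d,%d) thread (%d,%d)\n",
           blockIdx.x, blockIdx.y, threadIdx.x, threadIdx.y);
}

int main()
{
    show_ids<<<dim3(2, 2), dim3(2, 2)>>>(); /* a 2x2 grid of 2x2 blocks: 16 lines */
    cudaDeviceSynchronize();                /* wait for the device output to flush */
    return 0;
}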
Here are the usual formulas for computing thread IDs and block IDs under the common grid/block layouts:
//<1d grid, 1d block>
idx = blockIdx.x*blockDim.x + threadIdx.x
//<1d grid, 2d block>
idx = blockIdx.x*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x
//<1d grid, 3d block>
idx = blockIdx.x*blockDim.x*blockDim.y*blockDim.z +
threadIdx.z*blockDim.x*blockDim.y +
threadIdx.y*blockDim.x +
threadIdx.x
//<2d grid, 1d block>
blockId = blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x + threadIdx.x
//<2d grid, 2d block>
blockId = blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x
//<2d grid, 3d block>
blockId = blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x*blockDim.y*blockDim.z +
threadIdx.z*blockDim.x*blockDim.y +
threadIdx.y*blockDim.x +
threadIdx.x
//<3d grid, 1d block>
blockId = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x + threadIdx.x
//<3d grid, 2d block>
blockId = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x
//<3d grid, 3d block>
blockId = blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x
threadId = blockId*blockDim.x*blockDim.y*blockDim.z +
threadIdx.z*blockDim.x*blockDim.y +
threadIdx.y*blockDim.x +
threadIdx.x
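To see one of these formulas in context, here is a sketch of a kernel using the <2d grid, 1d block> case, which is exactly the layout the vector-add program below adopts via its get_tid() macro (copy_kernel is a made-up name):

__global__ void copy_kernel(const float* in, float* out, int N)
{
    int blockId  = blockIdx.y*gridDim.x + blockIdx.x;
    int threadId = blockId*blockDim.x + threadIdx.x;
    if (threadId < N)  /* the grid may contain more threads than elements */
        out[threadId] = in[threadId];
}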
Experiment
Now let's write a program that performs vector addition, with both a CPU and a GPU version, and time each one. (It is striking how much the runtime differs across cards of different compute capability; see the results below.)
vec_add.cu:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
typedef float FLOAT;
double get_time();
void warm_up();
void vec_add_host(FLOAT* x, FLOAT* y, FLOAT* z, int N);
__global__ void vec_add_device(FLOAT* x, FLOAT* y, FLOAT* z, int N);
// <2d grid, 1d block>
#define get_tid() ((blockIdx.y*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x)
#define get_bid() (blockIdx.y*gridDim.x + blockIdx.x)
#define WINDOWS 0
//time
#if WINDOWS
#include <windows.h>
double get_time()
{
    LARGE_INTEGER timer;
    static LARGE_INTEGER fre;
    static int init = 0;
    double t;

    if (init != 1)
    {
        QueryPerformanceFrequency(&fre);
        init = 1;
    }
    QueryPerformanceCounter(&timer);

    t = timer.QuadPart * 1. / fre.QuadPart;
    return t;
}
#else
#include <sys/time.h>
#include <time.h>
double get_time()
{
    struct timeval tv;
    double t;

    gettimeofday(&tv, (struct timezone*)0);
    t = tv.tv_sec + (double)tv.tv_usec*1e-6;
    return t;
}
#endif
// warm up gpu: a do-nothing kernel, launched a few times so that the first
// timed kernel does not pay the GPU's start-up cost
__global__ void warmup_knl(void)
{
    int i, j;
    i = 1;
    j = 1;
    i = i + j;
}

void warm_up()
{
    int i = 0;
    for (; i < 8; ++i)
    {
        warmup_knl<<<1, 256>>>();
    }
}
// host code
void vec_add_host(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
    int i;
    /* note: accumulates into z, matching what the device kernel does */
    for (i = 0; i < N; ++i) z[i] = x[i] + y[i] + z[i];
}

// device code
__global__ void vec_add_device(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
    int idx = get_tid();
    /* one thread handles one element; the guard drops the excess threads */
    if (idx < N) z[idx] = x[idx] + y[idx] + z[idx];
}
int main()
{
    int N = 20000000;
    size_t nbytes = N * sizeof(FLOAT);

    /* 1d block */
    int bs = 256;

    /* 2d grid: s x s blocks, enough so that s*s*bs >= N */
    int s = ceil(sqrt((N + bs - 1.) / bs));
    dim3 grid = dim3(s, s);

    FLOAT* dx = NULL, *hx = NULL;
    FLOAT* dy = NULL, *hy = NULL;
    FLOAT* dz = NULL, *hz = NULL;

    int iter = 30;
    int i;
    double th, td;

    warm_up();

    /* allocate gpu memory */
    cudaMalloc((void**)&dx, nbytes);
    cudaMalloc((void**)&dy, nbytes);
    cudaMalloc((void**)&dz, nbytes);

    if (dx == NULL || dy == NULL || dz == NULL)
    {
        printf("could not allocate gpu memory\n");
        return -1;
    }

    hx = (FLOAT*)malloc(nbytes);
    hy = (FLOAT*)malloc(nbytes);
    hz = (FLOAT*)malloc(nbytes);

    if (hx == NULL || hy == NULL || hz == NULL)
    {
        printf("could not allocate cpu memory\n");
        return -2;
    }

    /* init */
    for (i = 0; i < N; ++i)
    {
        hx[i] = 1;
        hy[i] = 1;
        hz[i] = 1;
    }

    /* copy data to gpu */
    cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dy, hy, nbytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dz, hz, nbytes, cudaMemcpyHostToDevice);

    warm_up();

    /* time the gpu version; kernel launches are asynchronous,
       so synchronize before reading the clock */
    cudaDeviceSynchronize();
    td = get_time();
    for (i = 0; i < iter; ++i) vec_add_device<<<grid, bs>>>(dx, dy, dz, N);
    cudaDeviceSynchronize();
    td = get_time() - td;

    /* time the cpu version */
    th = get_time();
    for (i = 0; i < iter; ++i) vec_add_host(hx, hy, hz, N);
    th = get_time() - th;

    printf("GPU time: %.4f, CPU time: %.4f, Speedup: %g\n", td, th, th / td);

    /* free host memory with free(), device memory with cudaFree() */
    free(hx);
    free(hy);
    free(hz);
    cudaFree(dx);
    cudaFree(dy);
    cudaFree(dz);

    return 0;
}
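By the way, testing the pointers for NULL after cudaMalloc, as the listing does, is not a reliable failure check; the documented way is to inspect the cudaError_t that every runtime call returns. A minimal sketch of a check macro (CUDA_CHECK is my name for it, not part of the CUDA API):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* wrap a CUDA runtime call and abort with a readable message on failure */
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err_ = (call);                                  \
        if (err_ != cudaSuccess) {                                  \
            fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);  \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)

/* usage: CUDA_CHECK(cudaMalloc((void**)&dx, nbytes)); */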
Notes
Windows and Linux measure wall-clock time differently, hence the WINDOWS macro; set it according to your platform!
On Linux, install the CUDA toolkit yourself first!!!
Here is a CMakeLists.txt for Ubuntu:
cmake_minimum_required(VERSION 2.8)
project(vectorAdd)
FIND_PACKAGE(CUDA REQUIRED)
# set source files
set(PROJECT_SRC vec_add.cu)
cuda_add_executable(vectorAdd ${PROJECT_SRC})
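As a side note, the FindCUDA module used above is deprecated in newer CMake; since CMake 3.8 you can treat CUDA as a first-class language instead. A minimal sketch:

cmake_minimum_required(VERSION 3.8)
project(vectorAdd LANGUAGES CXX CUDA)
add_executable(vectorAdd vec_add.cu)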
And the commands to build and run on Linux:
mkdir build
cd build
cmake ..
make
./vectorAdd
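If you'd rather not use CMake at all, compiling directly with nvcc works too (assuming nvcc is on your PATH):

nvcc vec_add.cu -o vectorAdd
./vectorAdd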
Results
1. A humble GeForce 920MX
GPU time: 1.3785, CPU time: 2.3346, Speedup: 1.6935
2. A beefy Tesla K80
GPU time: 0.0729, CPU time: 2.0984, Speedup: 28.8006
3. An even beefier RTX TITAN
GPU time: 0.0274, CPU time: 1.6466, Speedup: 60.0391