在CUDA编程06——向量求和(并行规约,相邻配对)中介绍了最简单向量求和规约算法。
这里补充一个概念:
关于warp和half-warp
一个warp包含32个threads。warp是调度和执行的基本单位,half-warp是存储器操作的基本单位,这两个非常重要。
因此上例中可以看到,由于采用了相邻配对,大多数的累加线程都是在不同的warp内,因此会需要更多的调度开销。因此可以考虑采用交错配对的方式,让那些已经完成了求和任务的线程不再被调度。如下图所示:
核函数代码
// device code
// In-place interleaved-pair reduction: after the loop finishes, a[0]
// holds the sum of the original a[0..width-1].
// NOTE(review): assumes a single block whose thread count covers width/2,
// and that width is a power of two — confirm at the launch site
// (here: block of 1024 threads, width = 1024).
__global__ void vec_add_device(FLOAT* a, int width)
{
// Flatten the (x, y) thread coordinates into one linear id in the block.
int tid = threadIdx.y*blockDim.x + threadIdx.x;
// Each round halves the active range: thread tid folds a[tid+stride]
// into a[tid]. Active threads stay packed in the low warps (interleaved
// pairing), so finished warps are no longer scheduled.
for (int stride = width/2; stride > 0; stride /= 2)
{
if (tid < stride)
{
a[tid] += a[tid + stride];
}
// Barrier is OUTSIDE the divergent if, so every thread in the block
// reaches it; it orders this round's writes before the next round's reads.
__syncthreads();
}
}
完整代码
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <windows.h>
typedef float FLOAT;

/* Forward declarations (definitions follow below). */
double get_time();
void warm_up();
void vec_add_host(FLOAT* a, int len);
/* Fixed: the old prototype declared (FLOAT*, FLOAT*, FLOAT*, int), which
   did not match the actual definition vec_add_device(FLOAT*, int). */
__global__ void vec_add_device(FLOAT* a, int width);

// <2d grid, 1d block>
/* Fixed typos: `blockIdm.x` / `threadIdm.x` are not CUDA built-ins; the
   correct identifiers are blockIdx.x and threadIdx.x. */
#define get_tid() ((blockIdx.y*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x)
#define get_bid() (blockIdx.y*gridDim.x + blockIdx.x)
/* Wall-clock time in seconds from the Windows high-resolution counter.
   The counter frequency is queried once on first use and cached. */
double get_time()
{
    static LARGE_INTEGER freq;
    static int have_freq = 0;
    LARGE_INTEGER now;

    if (!have_freq)
    {
        QueryPerformanceFrequency(&freq);
        have_freq = 1;
    }
    QueryPerformanceCounter(&now);
    return (double)now.QuadPart / (double)freq.QuadPart;
}
// warm up gpu
// Trivial kernel: its only job is to force GPU/driver initialization
// before the timed runs. The arithmetic result is discarded.
__global__ void warmup_knl(void)
{
    int a = 1, b = 1;
    a = a + b;
}
// Launch the warm-up kernel several times so that one-time GPU startup
// cost does not pollute the measurements taken later.
void warm_up()
{
    for (int k = 0; k < 8; ++k)
    {
        warmup_knl<<<1, 256>>>();
    }
}
// host code
void vec_add_host(FLOAT* a, int len)
{
FLOAT sum = a[0];
for (int i = 1; i < len; ++i)
{
a[0] += a[i];
}
}
// device code
// In-place interleaved-pair reduction: after the loop finishes, a[0]
// holds the sum of the original a[0..width-1].
// NOTE(review): assumes a single block whose thread count covers width/2,
// and that width is a power of two — confirm at the launch site
// (main launches one block of 1024 threads with width = N = 1024).
__global__ void vec_add_device(FLOAT* a, int width)
{
// Flatten the (x, y) thread coordinates into one linear id in the block.
int tid = threadIdx.y*blockDim.x + threadIdx.x;
// Each round halves the active range: thread tid folds a[tid+stride]
// into a[tid]. Active threads stay packed in the low warps (interleaved
// pairing), so finished warps are no longer scheduled.
for (int stride = width/2; stride > 0; stride /= 2)
{
if (tid < stride)
{
a[tid] += a[tid + stride];
}
// Barrier is OUTSIDE the divergent if, so every thread in the block
// reaches it; it orders this round's writes before the next round's reads.
__syncthreads();
}
}
/*
 * Driver: sums a vector of N ones on the GPU (interleaved-pair reduction)
 * and on the CPU, then prints both timings and both results (each should
 * be N = 1024).
 *
 * Fixes vs. the original:
 *  - cudaMalloc/cudaMemcpy return values are now actually assigned to
 *    cudaStatus before being checked (previously a stale status was tested).
 *  - hm2 is declared (NULL-initialized) before any `goto Error`, so the
 *    jump no longer crosses its initialization and free(hm2) is safe.
 *  - deprecated cudaThreadSynchronize() replaced by cudaDeviceSynchronize().
 *  - "mallo" typo fixed; float init uses 1.0f; unused sb/sg removed.
 */
int main()
{
    int M = 32;
    int N = M * M;                        /* element count (1024, power of two) */
    int nbytes = N * sizeof(FLOAT);
    dim3 dimGrid(1, 1);                   /* single block... */
    dim3 dimBlock(1024, 1);               /* ...of 1024 threads covers N = 1024 */
    FLOAT* dm = NULL;
    FLOAT* hm = NULL;
    FLOAT* hm2 = NULL;                    /* declared before any goto so cleanup is valid */
    int iter = 1, i;
    double th, td;

    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed!");
        goto Error;
    }

    warm_up();

    /* allocate gpu memory */
    cudaStatus = cudaMalloc((void**)&dm, nbytes);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc dm failed!");
        goto Error;
    }
    hm = (FLOAT*)malloc(nbytes);
    if (hm == NULL)
    {
        fprintf(stderr, "malloc hm failed!");
        goto Error;
    }
    hm2 = (FLOAT*)malloc(nbytes);
    if (hm2 == NULL)
    {
        fprintf(stderr, "malloc hm2 failed!");
        goto Error;
    }

    /* init: every element is 1, so the expected sum is N */
    for (i = 0; i < N; ++i) hm[i] = 1.0f;
    for (i = 0; i < N; ++i) hm2[i] = 0.0f;

    /* copy data to gpu */
    cudaStatus = cudaMemcpy(dm, hm, nbytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    warm_up();

    /* time the GPU reduction */
    cudaDeviceSynchronize();
    td = get_time();
    for (i = 0; i < iter; ++i) vec_add_device<<<dimGrid, dimBlock>>>(dm, N);
    cudaStatus = cudaGetLastError();      /* catches bad-launch-config errors */
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "launch kernel failed!");
        goto Error;
    }
    cudaDeviceSynchronize();              /* kernel is async: sync before stopping the clock */
    td = get_time() - td;

    /* time the CPU reduction (accumulates in hm[0]) */
    th = get_time();
    for (i = 0; i < iter; ++i) vec_add_host(hm, N);
    th = get_time() - th;

    printf("GPU time: %.8f, CPU time: %.6f, Speedup: %g\n", td, th, th / td);

    /* copy the result back; slot 0 holds the GPU-side sum */
    cudaStatus = cudaMemcpy(hm2, dm, nbytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    printf("%.1f, %.1f\n", hm[0], hm2[0]);

Error:
    /* free */
    cudaFree(dm);
    free(hm);
    free(hm2);
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!");
        return -1;
    }
    return 0;
}
实验结果
GPU time: 0.00006720, CPU time: 0.000003, Speedup: 0.0401787
1024.0, 1024.0
可以看到,相比于相邻配对的方式,速度有了一定的提升。
补充一些warp的概念
1. block被划分为以32为单位的线程组,叫做warp;
2. Warp是最基本的调度单元;
3. Warp一直执行相同的指令(SIMT);
4. 每个线程可以执行自己的代码路径(即允许分支发散);
5. warp之间的调度切换没有时间代价;
6. 许多warps在一起可以隐藏访问延时。