一、在CPU上创建数组相加的算法:
#include "stdio.h"
#include<iostream>
//定义array元素的个数
#define N 10000000
//定义CPU上的函数
// Element-wise vector addition on the CPU: h_c[i] = h_a[i] + h_b[i] for all
// N elements. h_a and h_b are the input arrays; h_c receives the result.
void cpuAdd(int *h_a, int *h_b, int *h_c) {
    for (int i = 0; i < N; i++) {
        h_c[i] = h_a[i] + h_b[i];
    }
}
// Driver for the CPU vector-addition demo: fills two input vectors, adds them
// with cpuAdd, and prints every element of the result.
// Fix: the arrays were originally declared as locals, but 3 * N * sizeof(int)
// is ~120 MB at N = 10000000 -- far beyond the default stack size -- so they
// are heap-allocated here instead.
int main04(void) {
    int *h_a = new int[N];
    int *h_b = new int[N];
    int *h_c = new int[N];
    // Initialize the two input vectors.
    for (int i = 0; i < N; i++) {
        h_a[i] = 2 * i * i;  // NOTE(review): 2*i*i overflows int for large i; kept to match the original data
        h_b[i] = i;
    }
    // Run the CPU addition.
    cpuAdd(h_a, h_b, h_c);
    // Print the answer.
    printf("Vector addition on CPU\n");
    for (int i = 0; i < N; i++) {
        printf("The sum of %d element is %d + %d = %d\n", i, h_a[i], h_b[i], h_c[i]);
    }
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    return 0;
}
二、在GPU上创建数组相加的算法:
#include <device_launch_parameters.h>
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>
//定义array元素的个数
#define N 10000000
//定义向量相加的内核函数
// Kernel: element-wise vector addition d_c[tid] = d_a[tid] + d_b[tid].
// Fix: the original used blockIdx.x alone as the index, which is only correct
// when every block has exactly 1 thread. Deriving the global index from both
// block and thread coordinates makes the kernel correct for any 1-D launch
// configuration, and is identical to the old behavior under <<<N, 1>>>
// (blockDim.x == 1, threadIdx.x == 0).
// The bounds check guards the tail when the grid overshoots N.
__global__ void gpuAdd05(int *d_a, int *d_b, int *d_c) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N)
    {
        d_c[tid] = d_a[tid] + d_b[tid];
    }
}
// Host driver for the GPU vector-addition demo.
// Launch layout: N blocks of 1 thread each, so each block handles one element.
int main(void) {
    // Host arrays: heap-allocated. Fix: as locals, 3 * N * sizeof(int)
    // (~120 MB at N = 10000000) would overflow the default stack.
    int *h_a = new int[N];
    int *h_b = new int[N];
    int *h_c = new int[N];
    // Device pointers.
    int *d_a, *d_b, *d_c;
    // Allocate device buffers; check each call, since CUDA errors are silent otherwise.
    if (cudaMalloc((void**)&d_a, N * sizeof(int)) != cudaSuccess ||
        cudaMalloc((void**)&d_b, N * sizeof(int)) != cudaSuccess ||
        cudaMalloc((void**)&d_c, N * sizeof(int)) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed\n");
        return 1;
    }
    // Initialize the two input vectors.
    for (int i = 0; i < N; i++) {
        h_a[i] = 2 * i * i;  // NOTE(review): overflows int for large i; kept to match the original data
        h_b[i] = i;
    }
    // Copy the inputs from host memory to device memory.
    cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice);
    // Launch the kernel: N blocks, 1 thread per block.
    gpuAdd05 << <N, 1 >> > (d_a, d_b, d_c);
    // The launch itself returns nothing; configuration errors surface here.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Blocking copy device -> host; this also synchronizes with the kernel.
    cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost);
    printf("Vector addition on GPU \n");
    // Print the results.
    // Fix: the original format string had only three %d specifiers for four
    // arguments, so h_c[i] (the actual sum) was never printed.
    for (int i = 0; i < N; i++) {
        printf("向量加法为: [%d] %d + %d = %d\n", i, h_a[i], h_b[i], h_c[i]);
    }
    // Release device and host memory.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    return 0;
}
三、对比上述CPU与GPU代码的延迟
上述的代码中我们设置的数组的元素个数N为10000000，这里为了在控制台上完整对比CPU执行时间和GPU执行时间的显著差异，将N设置为30000。
#include <device_launch_parameters.h>
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <time.h>
#include <cuda_runtime.h>
//定义array元素的个数
#define N 30000
//定义CPU上向量相加的函数
// CPU reference implementation of vector addition over N elements:
// h_c[i] = h_a[i] + h_b[i]. Inputs h_a/h_b and output h_c must each
// hold at least N ints.
void cpuAdd06(int *h_a, int *h_b, int *h_c) {
    for (int i = 0; i < N; i++) {
        h_c[i] = h_a[i] + h_b[i];
    }
}
//定义GPU上向量相加的内核函数
// Kernel: element-wise vector addition d_c[tid] = d_a[tid] + d_b[tid].
// Fix: the original indexed by blockIdx.x only, so with the <<<N, 4>>> launch
// used below all 4 threads of a block redundantly wrote the same element.
// Deriving the global index from both block and thread coordinates makes every
// thread productive and keeps the kernel correct for any 1-D launch shape.
// The tid < N guard protects the tail when the grid overshoots N.
__global__ void gpuAdd06(int *d_a, int *d_b, int *d_c) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N)
    {
        d_c[tid] = d_a[tid] + d_b[tid];
    }
}
// Runs the CPU side of the benchmark: builds the two input vectors, adds them
// with cpuAdd06, and prints every element of the result to the console.
void run_cpu_code(void)
{
    int h_a[N], h_b[N], h_c[N];
    // Fill the two vectors to be added.
    for (int idx = 0; idx < N; idx++) {
        h_a[idx] = 2 * idx * idx;
        h_b[idx] = idx;
    }
    // Perform the addition on the host.
    cpuAdd06(h_a, h_b, h_c);
    // Print the answer.
    printf("Vector addition on CPU\n");
    for (int idx = 0; idx < N; idx++) {
        printf("The sum of %d element is %d + %d = %d\n", idx, h_a[idx], h_b[idx], h_c[idx]);
    }
}
// Runs the GPU side of the benchmark: copies the inputs to the device,
// launches gpuAdd06, copies the result back, and prints it.
void run_gpu_code(void)
{
    // Host arrays: heap-allocated to keep stack usage small
    // (3 * N * sizeof(int) = 360 KB at N = 30000 as locals).
    int *h_a = new int[N];
    int *h_b = new int[N];
    int *h_c = new int[N];
    // Device pointers.
    int *d_a, *d_b, *d_c;
    // Allocate device buffers; check the calls, since CUDA errors are silent otherwise.
    if (cudaMalloc((void**)&d_a, N * sizeof(int)) != cudaSuccess ||
        cudaMalloc((void**)&d_b, N * sizeof(int)) != cudaSuccess ||
        cudaMalloc((void**)&d_c, N * sizeof(int)) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed\n");
        return;
    }
    // Initialize the two input vectors.
    for (int i = 0; i < N; i++) {
        h_a[i] = 2 * i * i;
        h_b[i] = i;
    }
    // Copy the inputs from host memory to device memory.
    cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice);
    // Launch: N blocks, 1 thread per block.
    // Fix: the code launched <<<N, 4>>> while the comment promised 1 thread per
    // block; with the blockIdx-only kernel the extra 3 threads per block were
    // pure redundant work. <<<N, 1>>> matches the stated layout and the kernel.
    gpuAdd06 << <N, 1 >> > (d_a, d_b, d_c);
    // Launch-configuration errors only surface via cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return;
    }
    // Blocking copy device -> host; also synchronizes with the kernel.
    cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost);
    printf("Vector addition on GPU \n");
    // Print the result on the console.
    for (int i = 0; i < N; i++) {
        printf("The sum of %d element is %d + %d = %d\n", i, h_a[i], h_b[i], h_c[i]);
    }
    // Free device and host memory.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
}
// Times the CPU and GPU versions of the vector addition with clock() and
// reports both wall times.
// NOTE(review): each timed region includes 30000 printf calls inside
// run_cpu_code / run_gpu_code, so console I/O dominates both measurements --
// the reported "speedup" should not be read as pure compute time.
int main(void) {
    clock_t start_h = clock();
    printf("执行CPU向量相加加法运算:\n");
    run_cpu_code();
    clock_t end_h = clock();
    clock_t start_d = clock();
    printf("执行GPU向量相加加法运算:\n");
    run_gpu_code();
    // Fix: cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is
    // the current API. It ensures all GPU work has finished before end_d.
    cudaDeviceSynchronize();
    clock_t end_d = clock();
    double time_d = (double)(end_d - start_d) / CLOCKS_PER_SEC;
    double time_h = (double)(end_h - start_h) / CLOCKS_PER_SEC;
    // Fix: the format string ended in a literal "Secondsln" (a hand-typed "\n").
    printf("数组中元素的个数为:%d \n GPU上运行的时间为: %f seconds \n CPU上运行的时间为: %f Seconds\n", N, time_d, time_h);
    return 0;
}
这里我们对比可以发现,CPU运行耗时:4.157s、GPU运行耗时:0.984s。
我们计算一下便可发现提升的性能:
(4.157 - 0.984 )/ 4.157 x 100% = 76.33 %
性能提升76.33%