一、在CPU上创建数组相加的算法:
#include "stdio.h"
#include<iostream>
//定义array元素的个数
#define N 10000000
//定义CPU上的函数
// Element-wise vector addition on the CPU: h_c[i] = h_a[i] + h_b[i] for all
// N elements. h_a and h_b are the input arrays; h_c receives the result.
void cpuAdd(int *h_a, int *h_b, int *h_c) {
    for (int i = 0; i < N; i++) {
        h_c[i] = h_a[i] + h_b[i];
    }
}
// Driver for the CPU vector-addition demo: fills two input vectors, adds them
// with cpuAdd, and prints every element of the result.
// Fix: the arrays were originally declared as locals, but 3 * N * sizeof(int)
// is ~120 MB at N = 10000000 -- far beyond the default stack size -- so they
// are heap-allocated here instead.
int main04(void) {
    int *h_a = new int[N];
    int *h_b = new int[N];
    int *h_c = new int[N];
    // Initialize the two input vectors.
    for (int i = 0; i < N; i++) {
        h_a[i] = 2 * i * i;  // NOTE(review): 2*i*i overflows int for large i; kept to match the original data
        h_b[i] = i;
    }
    // Run the CPU addition.
    cpuAdd(h_a, h_b, h_c);
    // Print the answer.
    printf("Vector addition on CPU\n");
    for (int i = 0; i < N; i++) {
        printf("The sum of %d element is %d + %d = %d\n", i, h_a[i], h_b[i], h_c[i]);
    }
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    return 0;
}
二、在GPU上创建数组相加的算法:
#include <device_launch_parameters.h>
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>
//定义array元素的个数
#define N 10000000
//定义向量相加的内核函数
// Kernel: element-wise vector addition d_c[tid] = d_a[tid] + d_b[tid].
// Fix: the original used blockIdx.x alone as the index, which is only correct
// when every block has exactly 1 thread. Deriving the global index from both
// block and thread coordinates makes the kernel correct for any 1-D launch
// configuration, and is identical to the old behavior under <<<N, 1>>>
// (blockDim.x == 1, threadIdx.x == 0).
// The bounds check guards the tail when the grid overshoots N.
__global__ void gpuAdd05(int *d_a, int *d_b, int *d_c) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N)
    {
        d_c[tid] = d_a[tid] + d_b[tid];
    }
}
// Host driver for the GPU vector-addition demo.
// Launch layout: N blocks of 1 thread each, so each block handles one element.
int main(void) {
    // Host arrays: heap-allocated. Fix: as locals, 3 * N * sizeof(int)
    // (~120 MB at N = 10000000) would overflow the default stack.
    int *h_a = new int[N];
    int *h_b = new int[N];
    int *h_c = new int[N];
    // Device pointers.
    int *d_a, *d_b, *d_c;
    // Allocate device buffers; check each call, since CUDA errors are silent otherwise.
    if (cudaMalloc((void**)&d_a, N * sizeof(int)) != cudaSuccess ||
        cudaMalloc((void**)&d_b, N * sizeof(int)) != cudaSuccess ||
        cudaMalloc((void**)&d_c, N * sizeof(int)) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed\n");
        return 1;
    }
    // Initialize the two input vectors.
    for (int i = 0; i < N; i++) {
        h_a[i] = 2 * i * i;  // NOTE(review): overflows int for large i; kept to match the original data
        h_b[i] = i;
    }
    // Copy the inputs from host memory to device memory.
    cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice);
    // Launch the kernel: N blocks, 1 thread per block.
    gpuAdd05 << <N, 1 >> > (d_a, d_b, d_c);
    // The launch itself returns nothing; configuration errors surface here.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Blocking copy device -> host; this also synchronizes with the kernel.
    cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost);
    printf("Vector addition on GPU \n");
    // Print the results.
    // Fix: the original format string had only three %d specifiers for four
    // arguments, so h_c[i] (the actual sum) was never printed.
    for (int i = 0; i < N; i++) {
        printf("向量加法为: [%d] %d + %d = %d\n", i, h_a[i], h_b[i], h_c[i]);
    }
    // Release device and host memory.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    return 0;
}
三、对比上述CPU与GPU代码的延迟
上述的代码中我们设置的数组的元素个数N为10000000，这里为了在控制台上完整对比CPU执行时间和GPU执行时间的显著差异，将N设置为30000。
#include <device_launch_parameters.h>
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <time.h>
#include <cuda_runtime.h>
//定义array元素的个数
#define N 30000
//定义CPU上向量相加的函数
// CPU reference implementation of vector addition over N elements:
// h_c[i] = h_a[i] + h_b[i]. Inputs h_a/h_b and output h_c must each
// hold at least N ints.
void cpuAdd06(int *h_a, int *h_b, int *h_c) {
    for (int i = 0; i < N; i++) {
        h_c[i] = h_a[i] + h_b[i];
    }
}
//定义GPU上向量相加的内核函数
// Kernel: element-wise vector addition d_c[tid] = d_a[tid] + d_b[tid].
// Fix: the original indexed by blockIdx.x only, so with the <<<N, 4>>> launch
// used below all 4 threads of a block redundantly wrote the same element.
// Deriving the global index from both block and thread coordinates makes every
// thread productive and keeps the kernel correct for any 1-D launch shape.
// The tid < N guard protects the tail when the grid overshoots N.
__global__ void gpuAdd06(int *d_a, int *d_b, int *d_c) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N)
    {
        d_c[tid] = d_a[tid] + d_b[tid];
    }
}
// Runs the CPU side of the benchmark: builds the two input vectors, adds them
// with cpuAdd06, and prints every element of the result to the console.
void run_cpu_code(void)
{
    int h_a[N], h_b[N], h_c[N];
    // Fill the two vectors to be added.
    for (int idx = 0; idx < N; idx++) {
        h_a[idx] = 2 * idx * idx;
        h_b[idx] = idx;
    }
    // Perform the addition on the host.
    cpuAdd06(h_a, h_b, h_c);
    // Print the answer.
    printf("Vector addition on CPU\n");
    for (int idx = 0; idx < N; idx++) {
        printf("The sum of %d element is %d + %d = %d\n", idx, h_a[idx], h_b[idx], h_c[idx]);
    }
}
// Runs the GPU side of the benchmark: copies the inputs to the device,
// launches gpuAdd06, copies the result back, and prints it.
void run_gpu_code(void)
{
    // Host arrays: heap-allocated to keep stack usage small
    // (3 * N * sizeof(int) = 360 KB at N = 30000 as locals).
    int *h_a = new int[N];
    int *h_b = new int[N];
    int *h_c = new int[N];
    // Device pointers.
    int *d_a, *d_b, *d_c;
    // Allocate device buffers; check the calls, since CUDA errors are silent otherwise.
    if (cudaMalloc((void**)&d_a, N * sizeof(int)) != cudaSuccess ||
        cudaMalloc((void**)&d_b, N * sizeof(int)) != cudaSuccess ||
        cudaMalloc((void**)&d_c, N * sizeof(int)) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed\n");
        return;
    }
    // Initialize the two input vectors.
    for (int i = 0; i < N; i++) {
        h_a[i] = 2 * i * i;
        h_b[i] = i;
    }
    // Copy the inputs from host memory to device memory.
    cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice);
    // Launch: N blocks, 1 thread per block.
    // Fix: the code launched <<<N, 4>>> while the comment promised 1 thread per
    // block; with the blockIdx-only kernel the extra 3 threads per block were
    // pure redundant work. <<<N, 1>>> matches the stated layout and the kernel.
    gpuAdd06 << <N, 1 >> > (d_a, d_b, d_c);
    // Launch-configuration errors only surface via cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return;
    }
    // Blocking copy device -> host; also synchronizes with the kernel.
    cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost);
    printf("Vector addition on GPU \n");
    // Print the result on the console.
    for (int i = 0; i < N; i++) {
        printf("The sum of %d element is %d + %d = %d\n", i, h_a[i], h_b[i], h_c[i]);
    }
    // Free device and host memory.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
}
// Times the CPU and GPU versions of the vector addition with clock() and
// reports both wall times.
// NOTE(review): each timed region includes 30000 printf calls inside
// run_cpu_code / run_gpu_code, so console I/O dominates both measurements --
// the reported "speedup" should not be read as pure compute time.
int main(void) {
    clock_t start_h = clock();
    printf("执行CPU向量相加加法运算:\n");
    run_cpu_code();
    clock_t end_h = clock();
    clock_t start_d = clock();
    printf("执行GPU向量相加加法运算:\n");
    run_gpu_code();
    // Fix: cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is
    // the current API. It ensures all GPU work has finished before end_d.
    cudaDeviceSynchronize();
    clock_t end_d = clock();
    double time_d = (double)(end_d - start_d) / CLOCKS_PER_SEC;
    double time_h = (double)(end_h - start_h) / CLOCKS_PER_SEC;
    // Fix: the format string ended in a literal "Secondsln" (a hand-typed "\n").
    printf("数组中元素的个数为:%d \n GPU上运行的时间为: %f seconds \n CPU上运行的时间为: %f Seconds\n", N, time_d, time_h);
    return 0;
}
这里我们对比可以发现,CPU运行耗时:4.157s、GPU运行耗时:0.984s。
我们计算一下便可发现提升的性能:
(4.157 - 0.984 )/ 4.157 x 100% = 76.33 %
性能提升76.33%