CUDA编程实例1--向量加法

最新推荐文章于 2023-12-25 12:05:17 发布

几窗花鸢

最新推荐文章于 2023-12-25 12:05:17 发布

阅读量1.2k

点赞数

分类专栏： CUDA编程入门文章标签：硬件架构

本文链接：https://blog.csdn.net/weixin_44911248/article/details/127015923

版权

CUDA编程入门专栏收录该内容

5 篇文章 2 订阅

订阅专栏

1. 向量加法

通过初始化 $2^{24}$ 大小的向量A、B，将其相加，并且对比使用CPU计算以及使用GPU并行计算的时间效率。
首先，nelem= $2^{24}$ ，即向量的大小为16M
在使用gpu并行时，设置了 $2^{24}/1024=16384$ 个块，每个块的大小都为1024.
即设置 dim3 block(1024); dim3 grid(16384);
最后对比CPU和GPU的运行效率

主要执行步骤：

设置向量大小，分配host内存，并进行数据的初始化
在host上进行向量加法运算，计算运行时间
分配device内存，将数据从host拷贝到device
调用CUDA的核函数在device上完成向量加法运算，计算运算时间
然后将device的运算结果拷贝到host
对比host和device上得到的结果是否一致（代码中只打印了前10个和）
最后记得释放device和host上分配的内存空间

#pragma once
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<malloc.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<time.h>
#include<windows.h>
// Number of vector elements (2^24). The expansion is parenthesized so the
// macro behaves as a single value inside larger expressions: without the
// parentheses `N - 1` would expand to `1 << 24 - 1`, i.e. `1 << 23`.
#define N (1 << 24)



/**
 * Element-wise vector addition kernel: c[i] = a[i] + b[i] for every i in
 * [0, N), where N is the file-level element-count macro.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread starts at its flat global
 * index and then advances by the total number of launched threads, so every
 * element is still processed when the grid holds fewer than N threads, and
 * surplus threads simply do nothing. No shared memory is used.
 *
 * The index is computed in size_t so that blockIdx.x * blockDim.x cannot
 * overflow a 32-bit int on large grids.
 *
 * @param a  first input vector, read only; must hold at least N ints
 * @param b  second input vector, read only; must hold at least N ints
 * @param c  output vector; must hold at least N ints
 */
__global__ void addOnGPU(const int* a, const int* b, int* c) {
    const size_t total = (size_t)(N);
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride) {
        c[i] = a[i] + b[i];
    }
}
/**
 * Host reference implementation of the vector addition.
 *
 * Writes a[k] + b[k] into hostRef[k] for k = 0 .. n-1. Nothing is written
 * when n is zero or negative.
 *
 * @param a        first input vector
 * @param b        second input vector
 * @param hostRef  output vector receiving the element-wise sums
 * @param n        number of elements to process
 */
void addOnCpu(int* a, int* b, int* hostRef, const int n) {
    int k = 0;
    while (k < n) {
        const int sum = a[k] + b[k];
        hostRef[k] = sum;
        ++k;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Adds two N-element integer vectors on the CPU and on the GPU, prints the
 * time taken by each, prints the first sums of both results and checks that
 * the two result vectors are identical.
 *
 * Returns 0 when every step succeeded and the results match, 1 otherwise.
 * All host and device buffers are released on every path.
 */
int main() {
    const int nelem = (N);
    const size_t nBytes = (size_t)nelem * sizeof(int);

    // Host buffers: two inputs, the CPU result and the GPU result.
    int* h_A = (int*)malloc(nBytes);
    int* h_B = (int*)malloc(nBytes);
    int* hostRef = (int*)malloc(nBytes);
    int* gpuRef = (int*)malloc(nBytes);
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL) {
        fprintf(stderr, "host allocation of %llu bytes failed\n", (unsigned long long)nBytes);
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return 1;
    }

    // Initialize both inputs with their own index and time the initialization.
    clock_t iStart = clock();
    for (int i = 0; i < nelem; i++) {
        h_A[i] = i;
        h_B[i] = i;
    }
    clock_t iEnd = clock();
    double iElaps = (double)(iEnd - iStart) / CLOCKS_PER_SEC;
    printf("初始化用时：%f\n", iElaps);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // Vector addition on the CPU.
    iStart = clock();
    addOnCpu(h_A, h_B, hostRef, nelem);
    iEnd = clock();
    iElaps = (double)(iEnd - iStart) / CLOCKS_PER_SEC;
    printf("cpu向量用时：%f\n", iElaps);

    // Allocate device memory. The pointers start as NULL so that the
    // unconditional cudaFree calls at the end are harmless after a failure.
    int* d_a = NULL;
    int* d_b = NULL;
    int* d_c = NULL;
    bool ok = cudaOk(cudaMalloc((void**)&d_a, nBytes), "cudaMalloc(d_a)")
           && cudaOk(cudaMalloc((void**)&d_b, nBytes), "cudaMalloc(d_b)")
           && cudaOk(cudaMalloc((void**)&d_c, nBytes), "cudaMalloc(d_c)");

    // Copy the input data from the host to the device.
    ok = ok
      && cudaOk(cudaMemcpy(d_a, h_A, nBytes, cudaMemcpyHostToDevice), "cudaMemcpy(h_A -> d_a)")
      && cudaOk(cudaMemcpy(d_b, h_B, nBytes, cudaMemcpyHostToDevice), "cudaMemcpy(h_B -> d_b)");

    if (ok) {
        // Launch the kernel: 1024 threads per block, enough blocks to cover nelem.
        iStart = clock();
        const int iLen = 1024;
        dim3 block(iLen);
        dim3 grid((nelem + block.x - 1) / block.x);
        addOnGPU<<<grid, block>>>(d_a, d_b, d_c);
        // A launch does not return an error itself: cudaGetLastError reports a
        // bad launch configuration. Kernel launches are asynchronous, so
        // cudaDeviceSynchronize makes the host wait for the kernel to finish
        // (and reports execution errors) before the end time is taken.
        ok = cudaOk(cudaGetLastError(), "addOnGPU launch")
          && cudaOk(cudaDeviceSynchronize(), "addOnGPU execution");
        iEnd = clock();
        iElaps = (double)(iEnd - iStart) / CLOCKS_PER_SEC;
        if (ok) {
            printf("gpu向量用时：%f\n", iElaps);
        }
    }

    // Copy the GPU result back to the host.
    ok = ok && cudaOk(cudaMemcpy(gpuRef, d_c, nBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> gpuRef)");

    if (ok) {
        // Print the first sums of both results for a visual check.
        const int shown = nelem < 10 ? nelem : 10;
        printf("cpu result:");
        for (int i = 0; i < shown; i++) {
            printf("%d ", hostRef[i]);
        }
        printf("\n gpu result:");
        for (int i = 0; i < shown; i++) {
            printf("%d ", gpuRef[i]);
        }

        // Verify that the CPU and GPU results agree on every element.
        for (int i = 0; i < nelem; i++) {
            if (hostRef[i] != gpuRef[i]) {
                fprintf(stderr, "\nresult mismatch at index %d: cpu=%d gpu=%d\n", i, hostRef[i], gpuRef[i]);
                ok = false;
                break;
            }
        }
    }

    // Finally release the device and host memory.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    return ok ? 0 : 1;
}