CUDA学习笔记04:向量之和

本文介绍了如何在VS2019环境中使用CUDA进行向量加法操作,包括在GPU上实现的vecAdd核函数和在CPU上实现的vecAdd_cpu参考函数,以及如何在Linux下使用CMake配置CUDA项目。作者通过代码示例展示了从编写、编译到运行的完整过程,并将GPU结果与CPU结果逐元素对比以验证正确性。
摘要由CSDN通过智能技术生成

参考资料

CUDA编程模型系列二(向量操作)_哔哩哔哩_bilibili (非常好的学习资料!)

vs2019

随意新建一个空项目,按照之前的环境配置配好项目依赖:

CUDA学习笔记02:测试程序hello world-CSDN博客

代码结构如下:

代码片段,demo_gpu.cu:

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <math.h>

// Element-wise vector sum on the device: z[i] = x[i] + y[i] for i in [0, count).
// Each thread handles at most one element, so the launch has to provide at
// least `count` threads along x; surplus threads return without touching memory.
__global__ void vecAdd(const double* x, const double* y, double* z, int count)
{
    // Flat 1D thread id. With 3 threads per block, the (block, thread) pairs
    // t00 t01 t02 t10 t11 t12 t20 t21 t22 map to ids 0..8 in that order.
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Threads in the tail of the last block have no element to process.
    if (tid >= count)
    {
        return;
    }

    z[tid] = x[tid] + y[tid];
}

// Host reference implementation: stores x[i] + y[i] into z[i] for the first
// `count` elements, in ascending index order. Does nothing when count <= 0.
void vecAdd_cpu(const double* x, const double* y, double* z, int count)
{
    int pos = 0;
    while (pos < count)
    {
        const double sum = x[pos] + y[pos];
        z[pos] = sum;
        ++pos;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Adds two N-element vectors on the GPU, recomputes the sum on the CPU and
// prints "Result: Pass" when every element agrees within 1e-10, otherwise
// "Result: Errors". A failed host allocation or CUDA call is reported on
// stderr and also yields "Result: Errors"; all buffers are released on every path.
void test()
{
    const int N = 1000;
    const size_t M = sizeof(double) * N;  // size of each buffer in bytes

    // Host buffers: the two inputs, the GPU result and the CPU reference result.
    double* h_x = (double*)malloc(M);
    double* h_y = (double*)malloc(M);
    double* h_z = (double*)malloc(M);
    double* result_cpu = (double*)malloc(M);

    // Device pointers start out null so the cleanup at the end is valid even
    // when an allocation was never attempted or failed.
    double* d_x = nullptr;
    double* d_y = nullptr;
    double* d_z = nullptr;

    bool ok = (h_x != nullptr) && (h_y != nullptr) && (h_z != nullptr) && (result_cpu != nullptr);
    if (!ok)
    {
        fprintf(stderr, "Host memory allocation failed\n");
    }

    if (ok)
    {
        for (int i = 0; i < N; ++i)
        {
            h_x[i] = 1;
            h_y[i] = 2;
        }

        // && short-circuits, so the first failing call stops the chain.
        ok = cudaOk(cudaMalloc((void**)&d_x, M), "cudaMalloc(d_x)")
          && cudaOk(cudaMalloc((void**)&d_y, M), "cudaMalloc(d_y)")
          && cudaOk(cudaMalloc((void**)&d_z, M), "cudaMalloc(d_z)")
          && cudaOk(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice), "cudaMemcpy(h_x -> d_x)")
          && cudaOk(cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice), "cudaMemcpy(h_y -> d_y)");
    }

    if (ok)
    {
        const int block_size = 128;
        const int grid_size = (N + block_size - 1) / block_size;  // ceil(N / block_size)

        vecAdd<<<grid_size, block_size>>>(d_x, d_y, d_z, N);

        // A kernel launch returns no status, so launch errors are queried
        // explicitly; the blocking copy below then reports execution errors.
        ok = cudaOk(cudaGetLastError(), "vecAdd launch")
          && cudaOk(cudaMemcpy(h_z, d_z, M, cudaMemcpyDeviceToHost), "cudaMemcpy(d_z -> h_z)");
    }

    bool error = !ok;

    if (ok)
    {
        vecAdd_cpu(h_x, h_y, result_cpu, N);

        for (int i = 0; i < N; ++i)
        {
            // Written as !(diff <= tol) so that a NaN difference counts as a mismatch.
            if (!(fabs(result_cpu[i] - h_z[i]) <= (1.0e-10)))
            {
                error = true;
                break;
            }
        }
    }

    printf("Result: %s\n", error ? "Errors" : "Pass");

    // free() and cudaFree() both accept null pointers, so no per-pointer checks are needed.
    free(h_x);
    free(h_y);
    free(h_z);
    free(result_cpu);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
}

主函数:

#include <iostream>

// Defined in demo_gpu.cu: runs the GPU-vs-CPU vector addition check.
void test();

// Program entry point: runs the check, then prints a completion message.
int main()
{
    test();

    std::cout << "Finished! \n";
    return 0;
}

运行结果:

ok,结果通过。

Linux(CMake)

如果是使用CMake配置环境就更简单了,CMakeLists.txt这样写即可:

cmake_minimum_required(VERSION 3.10)

project(vector_add LANGUAGES CXX CUDA)

# option() takes (<variable> "<help text>" [value]). Without the help text,
# "OFF" was parsed as the description rather than as the value.
# Declared before find_package(CUDA) so this OFF setting is the one that is cached.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CUDA_STANDARD 11)

# Default to a Debug build, but respect -DCMAKE_BUILD_TYPE=... from the command line.
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Debug)
endif()

# Put the executable in <source>/build (RUNTIME_OUTPUT_DIRECTORY supersedes
# the legacy EXECUTABLE_OUTPUT_PATH variable).
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/build)

# -std=c++11 is passed here as a compiler flag; add_definitions() is meant for
# preprocessor definitions and is no longer used for it.
# -w was dropped because it suppressed the warnings that -Wall enables.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -O0 -Wfatal-errors -pthread -g")

# NOTE: the FindCUDA module is deprecated since CMake 3.10. With the CUDA
# language enabled in project(), add_executable(vector_add add.cu) is the
# modern replacement; it is kept here to preserve the existing link behaviour.
find_package(CUDA REQUIRED)

cuda_add_executable(vector_add add.cu)

代码结构:

add.cu与前面的demo_gpu.cu基本相同,区别只是把test()改成了main(),并且没有显式包含CUDA头文件(nvcc编译.cu文件时会自动包含cuda_runtime.h):

#include <stdio.h>
#include <math.h>

// Device kernel computing z[i] = x[i] + y[i] for every i in [0, count).
// One thread per element: the launch must cover at least `count` threads
// along x, and any extra threads exit immediately.
__global__ void vecAdd(const double *x, const double *y, double *z, int count)
{
    // Global 1D position of this thread. For blocks of 3 threads the
    // (block, thread) pairs t00 t01 t02 t10 t11 t12 t20 t21 t22 get 0..8.
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Skip threads that fall beyond the last element.
    if (gid >= count)
    {
        return;
    }

    z[gid] = x[gid] + y[gid];
}

// CPU reference for the kernel: fills z[0..count) with x[i] + y[i], walking
// the arrays front to back. A non-positive count leaves z untouched.
void vecAdd_cpu(const double *x, const double *y, double *z, int count)
{
    for (int k = 0; k < count; k++)
    {
        const double lhs = x[k];
        const double rhs = y[k];
        z[k] = lhs + rhs;
    }
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Adds two N-element vectors on the GPU, recomputes the sum on the CPU and
// prints "Result: Pass" when every element agrees within 1e-10, otherwise
// "Result: Errors". A failed host allocation or CUDA call is reported on
// stderr and counts as an error.
// Returns 0 on "Pass" and 1 on "Errors"; all buffers are released on every path.
int main()
{
    const int N = 1000;
    const size_t M = sizeof(double) * N;  // size of each buffer in bytes

    // Host buffers: the two inputs, the GPU result and the CPU reference result.
    double *h_x = (double*) malloc(M);
    double *h_y = (double*) malloc(M);
    double *h_z = (double*) malloc(M);
    double *result_cpu = (double*) malloc(M);

    // Device pointers start out null so the cleanup at the end is valid even
    // when an allocation was never attempted or failed.
    double *d_x = nullptr;
    double *d_y = nullptr;
    double *d_z = nullptr;

    bool ok = (h_x != nullptr) && (h_y != nullptr) && (h_z != nullptr) && (result_cpu != nullptr);
    if (!ok)
    {
        fprintf(stderr, "Host memory allocation failed\n");
    }

    if (ok)
    {
        for (int i = 0; i < N; ++i)
        {
            h_x[i] = 1;
            h_y[i] = 2;
        }

        // && short-circuits, so the first failing call stops the chain.
        ok = cudaOk(cudaMalloc((void**) &d_x, M), "cudaMalloc(d_x)")
          && cudaOk(cudaMalloc((void**) &d_y, M), "cudaMalloc(d_y)")
          && cudaOk(cudaMalloc((void**) &d_z, M), "cudaMalloc(d_z)")
          && cudaOk(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice), "cudaMemcpy(h_x -> d_x)")
          && cudaOk(cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice), "cudaMemcpy(h_y -> d_y)");
    }

    if (ok)
    {
        const int block_size = 128;
        const int grid_size  = (N + block_size - 1) / block_size;  // ceil(N / block_size)

        vecAdd<<<grid_size, block_size>>>(d_x, d_y, d_z, N);

        // A kernel launch returns no status, so launch errors are queried
        // explicitly; the blocking copy below then reports execution errors.
        ok = cudaOk(cudaGetLastError(), "vecAdd launch")
          && cudaOk(cudaMemcpy(h_z, d_z, M, cudaMemcpyDeviceToHost), "cudaMemcpy(d_z -> h_z)");
    }

    bool error = !ok;

    if (ok)
    {
        vecAdd_cpu(h_x, h_y, result_cpu, N);

        for (int i = 0; i < N; ++i)
        {
            // Written as !(diff <= tol) so that a NaN difference counts as a mismatch.
            if (!(fabs(result_cpu[i] - h_z[i]) <= (1.0e-10)))
            {
                error = true;
                break;
            }
        }
    }

    printf("Result: %s\n", error?"Errors" : "Pass");

    // free() and cudaFree() both accept null pointers, so no per-pointer checks are needed.
    free(h_x);
    free(h_y);
    free(h_z);
    free(result_cpu);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);

    // Let scripts and CI detect a failed check through the exit status.
    return error ? 1 : 0;
}

当前目录下建立build路径并且进入,用cmake构建:

cmake ..

构建好后编译(在build目录下执行make):

最后查看编译结果,并运行生成的可执行文件(./vector_add):

OK,结果通过!

  • 5
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值