本文为CUDA编程学习笔记
大家好，我是陈同学。最近在学习 CUDA 编程，用来加速视觉算法中的图像预处理部分。本文将提供一个示例：向量相加。
/**
* @file add.cpp
* @author chenshining
* @brief Vector addition example: adds two int arrays element-wise on the GPU with CUDA.
* @version 0.1
* @date 2024-01-16
*
* @copyright Copyright (c) 2023
*
*/
#include <cmath>
#include <cstdio>
#include <exception>
#include <iostream>
#include <string>
#include <vector>

#include <cuda_runtime.h>
/**
 * @brief Element-wise sum kernel: stores a[i] + b[i] into c[i].
 *
 * Each thread derives a single flat index from its x-dimension block and
 * thread coordinates and handles at most one element. Threads whose index
 * is `size` or greater return without touching memory.
 *
 * @param a    first input array (read)
 * @param b    second input array (read)
 * @param c    output array (written)
 * @param size number of elements to process
 */
__global__ void add(int* a, int* b, int* c , const int size)
{
    const int element = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall past the end of the data have nothing to do.
    if (element >= size) {
        return;
    }

    c[element] = a[element] + b[element];
}
/**
 * @brief Adds two host int arrays on the GPU and stores the sums in a host array.
 *
 * Allocates three device buffers, copies `a` and `b` to the device, launches
 * the `add` kernel with 256-thread blocks, copies the result back into `c`,
 * and frees the device buffers. The status of every CUDA call is checked: on
 * the first failure the error text is written to std::cerr and the remaining
 * transfer/launch steps are skipped, but the device buffers are always
 * released before returning.
 *
 * @param a    host array holding `size` ints (first operand)
 * @param b    host array holding `size` ints (second operand)
 * @param c    host array that receives `size` ints; written only by the final
 *             device-to-host copy
 * @param size element count; a value <= 0 makes the call a no-op
 */
void add_test(int* a, int* b, int* c , const int size)
{
    // Nothing to add. This also guarantees the grid size computed below is
    // at least 1 and that the byte count cannot come from a negative value.
    if (size <= 0) {
        return;
    }

    // size > 0 here, so converting it to the unsigned type of sizeof is safe.
    const auto bytes = sizeof(int) * size;

    // Prints a failed CUDA status and tells the caller whether to carry on.
    auto succeeded = [](cudaError_t status) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << "CUDA Error: " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    // Start from null so the cleanup below is valid even if an allocation fails.
    int *device_a = nullptr, *device_b = nullptr, *device_c = nullptr;

    // Allocate device memory, then copy the inputs from host to device.
    // The && chain stops at the first failing call.
    const bool ready =
        succeeded(cudaMalloc((void **)&device_a, bytes)) &&
        succeeded(cudaMalloc((void **)&device_b, bytes)) &&
        succeeded(cudaMalloc((void **)&device_c, bytes)) &&
        succeeded(cudaMemcpy(device_a, a, bytes, cudaMemcpyHostToDevice)) &&
        succeeded(cudaMemcpy(device_b, b, bytes, cudaMemcpyHostToDevice));

    if (ready) {
        // Grid and block layout: one thread per element, 256 threads per block.
        const int blockSize = 256;
        // Integer ceil-division, written so it cannot overflow for large sizes.
        const int gridSize = (size - 1) / blockSize + 1;

        add<<<gridSize, blockSize>>>(device_a, device_b, device_c, size);

        // A launch returns no status itself; cudaGetLastError reports a bad
        // launch. The blocking copy back to the host then waits for the
        // kernel and reports any error raised while it ran.
        if (succeeded(cudaGetLastError())) {
            succeeded(cudaMemcpy(c, device_c, bytes, cudaMemcpyDeviceToHost));
        }
    }

    // Single cleanup path for success and failure; cudaFree ignores null.
    succeeded(cudaFree(device_a));
    succeeded(cudaFree(device_b));
    succeeded(cudaFree(device_c));
}
/**
 * @brief Command-line driver for the vector-add example.
 *
 * Expects exactly one argument: the vector length. Fills h_a with i and h_b
 * with 2*i, prints both, calls add_test, and prints the result vector.
 *
 * @param argc argument count; must be 2
 * @param argv argv[1] is parsed as the vector length
 * @return 0 on success, -1 when the argument is missing, not a number,
 *         out of int range, or negative
 */
int main(int argc ,char **argv){
    if (argc != 2) {
        printf("argc:%d\n",argc);
        printf("请输入向量的维度:\n");
        return -1;
    }

    // std::stoi throws on non-numeric or out-of-range text; treat that the
    // same as a negative length instead of terminating with an exception.
    int size = -1;
    try {
        size = std::stoi(argv[1]);
    } catch (const std::exception&) {
        size = -1;
    }
    if (size < 0) {
        printf("请输入向量的维度:\n");
        return -1;
    }

    // Heap-backed, zero-initialized storage: valid for any length and gives
    // h_c defined contents even if the GPU computation fails.
    std::vector<int> h_a(size), h_b(size), h_c(size);

    // Initialize the inputs.
    for (int i = 0; i < size; ++i) {
        h_a[i] = i;
        h_b[i] = i * 2;
    }

    // Prints "<label>v0, v1, ..., " followed by a newline.
    auto printVector = [](const char* label, const std::vector<int>& values) {
        std::cout << label;
        for (const int value : values) {
            std::cout << value << ", ";
        }
        std::cout << std::endl;
    };

    printVector("h_a:", h_a);
    printVector("h_b:", h_b);

    add_test(h_a.data(), h_b.data(), h_c.data(), size);

    printVector("result:", h_c);
    return 0;
}