// CUDA example: compute the element-wise sum of two arrays on the GPU.
#include "test.cuh"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
// Print a diagnostic for a failed CUDA runtime call and pass the status
// through unchanged so the caller can still react to it.
//   error_code : status returned by a CUDA runtime API call
//   filename   : call-site source file (pass __FILE__)
//   lineNumber : call-site line (pass __LINE__)
// Returns: error_code, whether or not it indicates success.
cudaError_t ErrorCheck(cudaError_t error_code, const char* filename, int lineNumber)
{
    if (error_code != cudaSuccess)
    {
        // was "line%d" — missing '=' made the line number run into the label
        printf("CUDA error:\r\ncode=%d, name=%s, description=%s\r\nfile=%s, line=%d\r\n",
            error_code, cudaGetErrorName(error_code), cudaGetErrorString(error_code), filename, lineNumber);
    }
    return error_code;
}
// Enumerate CUDA devices and select device 0 for all subsequent CUDA work.
// Exits the process (exit(-1)) when no usable GPU is present or device 0
// cannot be selected; on success prints the device count and selection.
void setGpu()
{
    int iDeviceCount = 0;
    cudaError_t error = ErrorCheck(cudaGetDeviceCount(&iDeviceCount), __FILE__, __LINE__);
    if (error != cudaSuccess || iDeviceCount == 0)
    {
        printf("No CUDA compatible GPU found! \n");  // fixed typo: "compatable"
        exit(-1);
    }
    printf("The count of GPUs is %d. \n", iDeviceCount);

    int iDev = 0;  // always use the first device
    error = ErrorCheck(cudaSetDevice(iDev), __FILE__, __LINE__);
    if (error != cudaSuccess)
    {
        printf("fail to set GPU 0 for computing.\n");  // removed stray space before '.'
        exit(-1);
    }
    printf("set GPU 0 for computing.\n");
}
// Fill the first elemCount entries of addr with pseudo-random floats in
// [0.0, 25.5]: the low byte of rand() scaled down by 10. Caller controls
// reproducibility via srand(); elemCount <= 0 leaves the buffer untouched.
void initialData(float* addr, int elemCount)
{
    float* const end = addr + elemCount;
    for (float* p = addr; p < end; ++p)
    {
        *p = static_cast<float>(rand() & 0xFF) / 10.f;
    }
}
// Element-wise vector add: C[i] = A[i] + B[i] for i in [0, N).
// Expects a 1-D grid of 1-D blocks; any launch covering >= N threads is
// safe because out-of-range threads are masked by the bounds guard.
//   A, B : device input arrays of at least N floats
//   C    : device output array of at least N floats
//   N    : number of elements
__global__ void addFromGPU(float *A, float *B, float *C, const int N)
{
    const int bid = blockIdx.x;
    const int tid = threadIdx.x;
    const int id = bid * blockDim.x + tid;
    // Guard the grid tail: without this, a grid launched with more threads
    // than N (e.g. N not a multiple of blockDim.x) reads/writes out of bounds.
    if (id < N)
    {
        C[id] = A[id] + B[id];
    }
}
// Driver: allocate two 512-element float arrays, fill them with seeded
// random data on the host, add them on the GPU and print every element of
// the result. Returns 0 on success; exits the process on allocation failure.
int test() {
    setGpu();

    const int iElemCount = 512;                               // elements per array
    const size_t stBytesCount = iElemCount * sizeof(float);   // bytes per array

    // --- host buffers ---
    float *fpHost_A = (float*)malloc(stBytesCount);
    float *fpHost_B = (float*)malloc(stBytesCount);
    float *fpHost_C = (float*)malloc(stBytesCount);
    if (fpHost_A == NULL || fpHost_B == NULL || fpHost_C == NULL)
    {
        printf("Fail to allocate host memory! \n");
        free(fpHost_A);   // free(NULL) is a no-op, so partial failure is safe
        free(fpHost_B);
        free(fpHost_C);
        exit(-1);
    }
    memset(fpHost_A, 0, stBytesCount);
    memset(fpHost_B, 0, stBytesCount);
    memset(fpHost_C, 0, stBytesCount);

    // --- device buffers ---
    // Check the cudaMalloc return codes: testing the pointers for NULL after
    // an unchecked cudaMalloc (as before) is not a valid failure test, since
    // cudaMalloc may leave the pointer unmodified on error.
    float *fpDevice_A = NULL, *fpDevice_B = NULL, *fpDevice_C = NULL;
    cudaError_t err = ErrorCheck(cudaMalloc((void**)&fpDevice_A, stBytesCount), __FILE__, __LINE__);
    if (err == cudaSuccess) err = ErrorCheck(cudaMalloc((void**)&fpDevice_B, stBytesCount), __FILE__, __LINE__);
    if (err == cudaSuccess) err = ErrorCheck(cudaMalloc((void**)&fpDevice_C, stBytesCount), __FILE__, __LINE__);
    if (err != cudaSuccess)
    {
        printf("fail to allocate memory\n");
        free(fpHost_A);
        free(fpHost_B);
        free(fpHost_C);
        cudaFree(fpDevice_A);   // cudaFree(NULL) is a no-op
        cudaFree(fpDevice_B);
        cudaFree(fpDevice_C);
        exit(-1);
    }
    ErrorCheck(cudaMemset(fpDevice_A, 0, stBytesCount), __FILE__, __LINE__);
    ErrorCheck(cudaMemset(fpDevice_B, 0, stBytesCount), __FILE__, __LINE__);
    ErrorCheck(cudaMemset(fpDevice_C, 0, stBytesCount), __FILE__, __LINE__);

    // --- host-side input data (fixed seed for reproducible runs) ---
    srand(666);
    initialData(fpHost_A, iElemCount);
    initialData(fpHost_B, iElemCount);

    ErrorCheck(cudaMemcpy(fpDevice_A, fpHost_A, stBytesCount, cudaMemcpyHostToDevice), __FILE__, __LINE__);
    ErrorCheck(cudaMemcpy(fpDevice_B, fpHost_B, stBytesCount, cudaMemcpyHostToDevice), __FILE__, __LINE__);
    // No H2D copy of C: the kernel fully overwrites it.

    // --- launch: ceil-div grid so any iElemCount is covered ---
    dim3 block(32);
    dim3 grid((iElemCount + block.x - 1) / block.x);
    addFromGPU<<<grid, block>>>(fpDevice_A, fpDevice_B, fpDevice_C, iElemCount);
    ErrorCheck(cudaGetLastError(), __FILE__, __LINE__);   // surface launch-config errors

    // Blocking D2H copy also synchronizes with the kernel before we read C.
    ErrorCheck(cudaMemcpy(fpHost_C, fpDevice_C, stBytesCount, cudaMemcpyDeviceToHost), __FILE__, __LINE__);

    for (int i = 0; i < iElemCount; i++)   // was a hard-coded 512
    {
        printf("idx=%2d\tmatrix_A:%.2f\tmatrix_B:%.2f\tresult=%.2f\n", i + 1, fpHost_A[i], fpHost_B[i], fpHost_C[i]);
    }

    free(fpHost_A);
    free(fpHost_B);
    free(fpHost_C);
    cudaFree(fpDevice_A);
    cudaFree(fpDevice_B);
    cudaFree(fpDevice_C);
    ErrorCheck(cudaDeviceReset(), __FILE__, __LINE__);
    return 0;
}