c++并行计算(基于cuda)

本文详细介绍了如何配置CUDA环境,包括更新NVIDIA驱动、安装CUDA工具包、下载CUDNN,并通过实例展示了如何在VS2017中编写并执行CUDA程序,实现8x8矩阵相乘。CUDA驱动版本为511.79,CUDA版本为11.6,CUDNN版本为8.2.1。
摘要由CSDN通过智能技术生成

1. 检查NVIDIA显卡驱动程序,我这里装系统时就已经安装了,于是去官网下载更新了版本;
GeForce Game Ready 驱动程序 | 511.79 | Windows 10 64-bit, Windows 11 | NVIDIA下载 Chinese (Simplified) GeForce Game Ready 驱动程序 匹配 Windows 10 64-bit, Windows 11 操作系统. 发布日期 2022.2.14https://www.nvidia.cn/Download/driverResults.aspx/187102/cn/

2. 查看驱动程序对应的CUDA版本,如图所示

3.安装cuda11.6,官网下载:

CUDA Toolkit 11.6 Downloads | NVIDIA Developerhttps://developer.nvidia.com/cuda-downloads4. 测试是否安装成功;

cmd下运行 nvcc -V

下载 cuda示例代码

https://github.com/nvidia/cuda-sampleshttps://github.com/nvidia/cuda-samples5. 安装cudnn,没有找到11.6对应版本,地址:

https://developer.nvidia.com/rdp/cudnn-downloadhttps://developer.nvidia.com/rdp/cudnn-download

下载需要登录,但是注册失败,于是使用迅雷下载绕过登录,版本为8.2.1对应cuda11.X。

 将cudnn解压并复制到cuda目录下同名文件夹

 6. 打开VS2017,新建项目,选择

 7. 自动生成一个.cu程序,实现数组相加功能。

CUDA程序执行过程:

  • 分配host内存,并进行数据初始化;
  • 分配device内存,并从host将数据拷贝到device上;
  • 调用CUDA的核函数在device上完成指定的运算;
  • 将device上的运算结果拷贝到host上;
  • 释放device和host上分配的内存。

kernel核函数是在device上线程中并行执行的函数,用__global__符号声明,在调用时需要用<<<grid, block>>>来指定一个kernel函数要执行的线程数量,在CUDA中,每个线程会分配一个唯一的线程号thread ID,这个ID值可以通过核函数的内置变量threadIdx来获得。

一个kernel核函数在device上执行时实际上启动了很多线程,一个kernel所启动的所有线程称为一个线程格(grid),同一个线程格上的线程共享相同的全局内存空间;一个线程格又分为很多线程块(block),一个线程块里面包含很多线程。

8.编写矩阵乘法运算程序:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

const int width = 1 << 3;
const int height = 1 << 3;

void getCudaInformation();

// 矩阵相乘kernel函数,2-D,每个线程计算一个元素Cij
__global__ void matMulKernel(float *A, float *B, float *C) {
	float Cvalue = 0.0;
	int row = threadIdx.y + blockIdx.y * blockDim.y;
	int col = threadIdx.x + blockIdx.x * blockDim.x;
	for (int i = 0; i < width; ++i) {
		Cvalue += A[row * width + i] * B[i * width + col];
	}

	//Cvalue = A[row * width + col] + B[row * width + col];

	C[row * width + col] = Cvalue;
}

cudaError_t mulWithCuda(float *A, float *B, float *C, unsigned int width, unsigned int height);

int main()
{
	//设备测试
	getCudaInformation();

	//数组运算
	float *A, *B, *C;

	//分配内存
	//int nBytes = width * height * sizeof(float);

	A = (float *)malloc(width * height * sizeof(float));
	B = (float *)malloc(width * height * sizeof(float));
	C = (float *)malloc(width * height * sizeof(float));


	// 初始化A矩阵所有元素为1.0,B矩阵所有元素为2.0
	for (int i = 0; i < width * height; ++i) {
		A[i] = 1.0;
		B[i] = 2.0;
		C[i] = 10.0;
	}

	//计时
	double startTime1 = clock();//1计时开始
	// Add vectors in parallel.
	cudaError_t cudaStatus = mulWithCuda(A, B, C, width, height);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "addWithCuda failed!");
		return 1;
	}

	//计时
	double endTime1 = clock();//1计时结束

	for (int i = 0; i < width * height; ++i)
	{
		printf("%f  ", C[i]);
		if (((i + 1) / width != 0) && ((i + 1) % width == 0))
			printf("\n");
	}

	//运行时间
	printf("%.9f \n", (double)(endTime1 - startTime1) / CLOCKS_PER_SEC);

	// cudaDeviceReset must be called before exiting in order for profiling and
	// tracing tools such as Nsight and Visual Profiler to show complete traces.
	cudaStatus = cudaDeviceReset();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceReset failed!");
		return 1;
	}

	//释放内存
	free(A);
	free(B);
	free(C);

	return 0;
}

// 使用CUDA进行矩阵乘法
cudaError_t mulWithCuda(float *A, float *B, float *C, unsigned int width, unsigned int height)
{
	float *dev_a;
	float *dev_b;
	float *dev_c;
	cudaError_t cudaStatus;
	// Choose which GPU to run on, change this on a multi-GPU system.
	cudaStatus = cudaSetDevice(0);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
		goto Error;
	}
	// Allocate GPU buffers for three vectors (two input, one output)    .
	int nBytes = width * height * sizeof(float);
	cudaStatus = cudaMalloc((void**)&dev_c, nBytes);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		goto Error;
	}

	cudaStatus = cudaMalloc((void**)&dev_a, nBytes);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		goto Error;
	}

	cudaStatus = cudaMalloc((void**)&dev_b, nBytes);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		goto Error;
	}

	// Copy input vectors from host memory to GPU buffers.
	cudaStatus = cudaMemcpy(dev_a, A, nBytes, cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}

	cudaStatus = cudaMemcpy(dev_b, B, nBytes, cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}

	// 定义kernel的blocksize为(32, 32),那么grid大小为(32, 32)
	dim3 blockSize(32, 32);
	dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
		(height + blockSize.y - 1) / blockSize.y);

	// Launch a kernel on the GPU with one thread for each element.
	matMulKernel << <gridSize, blockSize >> > (dev_a, dev_b, dev_c);

	// 同步device数据保证结果能正确访问
	//cudaDeviceSynchronize();

	// Check for any errors launching the kernel
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
		goto Error;
	}

	// cudaDeviceSynchronize waits for the kernel to finish, and returns
	// any errors encountered during the launch.
	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
		goto Error;
	}

	// Copy output vector from GPU buffer to host memory.
	cudaStatus = cudaMemcpy(C, dev_c, nBytes, cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}


Error:
	cudaFree(dev_c);
	cudaFree(dev_a);
	cudaFree(dev_b);

	return cudaStatus;

}


//  设备测试
void getCudaInformation()
{
	int deviceCount;
	cudaGetDeviceCount(&deviceCount);
	int dev;
	for (dev = 0; dev < deviceCount; dev++) {
		int driver_version(0), runtime_version(0);
		cudaDeviceProp deviceProp;
		cudaGetDeviceProperties(&deviceProp, dev);
		if (dev == 0)
			if (deviceProp.minor = 9999 && deviceProp.major == 9999)
				printf("\n");
		printf("\nDevice%d:\"%s\"\n", dev, deviceProp.name);
		cudaDriverGetVersion(&driver_version);
		printf("CUDA驱动版本:                                   %d.%d\n", driver_version / 1000, (driver_version % 1000) / 10);
		cudaRuntimeGetVersion(&runtime_version);
		printf("CUDA运行时版本:                                 %d.%d\n", runtime_version / 1000, (runtime_version % 1000) / 10);
		printf("设备计算能力:                                   %d.%d\n", deviceProp.major, deviceProp.minor);
		printf("Total amount of Global Memory:                  %u bytes\n", deviceProp.totalGlobalMem);
		printf("Number of SMs:                                  %d\n", deviceProp.multiProcessorCount);
		printf("Total amount of Constant Memory:                %u bytes\n", deviceProp.totalConstMem);
		printf("Total amount of Shared Memory per block:        %u bytes\n", deviceProp.sharedMemPerBlock);
		printf("Total number of registers available per block:  %d\n", deviceProp.regsPerBlock);
		printf("Warp size:                                      %d\n", deviceProp.warpSize);
		printf("Maximum number of threads per SM:               %d\n", deviceProp.maxThreadsPerMultiProcessor);
		printf("Maximum number of threads per block:            %d\n", deviceProp.maxThreadsPerBlock);
		printf("Maximum size of each dimension of a block:      %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
		printf("Maximum size of each dimension of a grid:       %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
		printf("Maximum memory pitch:                           %u bytes\n", deviceProp.memPitch);
		printf("Texture alignmemt:                              %u bytes\n", deviceProp.texturePitchAlignment);
		printf("Clock rate:                                     %.2f GHz\n", deviceProp.clockRate * 1e-6f);
		printf("Memory Clock rate:                              %.0f MHz\n", deviceProp.memoryClockRate * 1e-3f);
		printf("Memory Bus Width:                               %d-bit\n", deviceProp.memoryBusWidth);
		printf("--------------------------------------------------------------\n");

	}

}

运行结果:

 基于GPU实现了8*8的矩阵相乘。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值