Cuda10.1总结3-DriveAPI

加菲猫0320

已于 2024-05-27 09:15:45 修改

阅读量353

点赞数

分类专栏： cuda10.1 文章标签： c++ 开发语言

于 2023-03-05 20:42:44 首次发布

本文链接：https://blog.csdn.net/great_sea/article/details/129350779

版权

cuda10.1 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

概述

Driver API，除了手册以外，几乎没有其他资料。具体如下。

<>。官方的cuda driver api的手册。
<<CUDA_C_Programming_Guide>>的driver api章节。介绍driver api程序实现流程。

上下文(context)

经过对cuda相关文档的研读，总结如下。

cuInit()，在见到的源码中以cuInit(0)的形式调用。如果要使用Cuda Driver API，必须先调用这个函数。
cuCtxCreate()。创建context，并且将其加入当前线程堆栈中。创建context+cuCtxPushCurrent()。
cuCtxDestroy()。context从当前线程堆栈中弹出，并且销毁之。cuCtxPopCurrent+销毁context。

在库中如何使用context。在cuda手册给出了如下方法。
在这里插入图片描述

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-1H8b0Heo-1678020073554)(Images/Conex在库调用使用方法.png)]

但是其没有给出库释放资源的时候，如何调用。具体总结如下。

首先调用cuCtxPushCurrent()。
调用释放资源的driver api。
cuCtxDestroy()。

也可以是如下使用流程。

调用cuCtxCreate():创建context将其加入到线程堆栈中。
使用driver api。
调用cuCtxDestroy():将上下文弹出堆栈并销毁其。

基于纹理的图像处理

SDK sample代码\Sample\0_Simple\simpleTextureDrv\简单实现了基于纹理的图像处理，不过实现比较繁琐，具体整理如下。使用的编译工具是Visual studio 2013,具体的编译环境设置完全等同于simpleTextureDrv。

首先是核函数(simpleTexture_kernel.cu)的实现代码。

#ifndef _SIMPLETEXTURE_KERNEL_H_
#define _SIMPLETEXTURE_KERNEL_H_
#include <cuda.h>
#include <vector_types.h>
extern "C"
__global__ void
transformKernel(uchar4 *g_odata, int width, int height, float theta,  CUtexObject tex)
{
	unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
	unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
	float u = (float)x - (float)width/2; 
	float v = (float)y - (float)height/2; 
	float tu = u*cosf(theta) - v*sinf(theta); 
	float tv = v*cosf(theta) + u*sinf(theta); 
	g_odata[y*width + x] = tex2D<uchar4>(tex, tu + width / 2.0, tv + height / 2.0);
}
#endif

然后是C++的实现代码。

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <iostream>
#include <cstring>
#include <cuda.h>
#include <builtin_types.h>
#include <helper_cuda_drvapi.h>
#include <cstdint>
#include <fstream>
using namespace std;
float angle = 0.5f;    // angle to rotate image by (in radians)
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;

/**
*@brief 获取Ptx文件的二进制格式到ptxBin
*/
bool GetPtxBin(string &modulePath, string &ptxBin)
{
	if (modulePath.rfind(".ptx") != string::npos)
	{
		FILE *fp = fopen(modulePath.c_str(), "rb");
		fseek(fp, 0, SEEK_END);
		int file_size = ftell(fp);
		char *buf = new char[file_size + 1];
		fseek(fp, 0, SEEK_SET);
		fread(buf, sizeof(char), file_size, fp);
		fclose(fp);
		buf[file_size] = '\0';
		ptxBin = buf;
		delete[] buf;
	}
	else
	{
		return false;
	}

	return true;
}

static bool CreatCuContext(CUcontext &cuContext)
{
	CUfunction cuFunction = 0;
	CUresult status;
	int major = 0, minor = 0, devID = 0;
	char deviceName[100];
	string module_path, ptx_source;
	string modulePath = "./data/simpleTexture_kernel32.ptx";

	char name[100];
	devID = gpuGetMaxGflopsDeviceIdDRV();               //核心实现。
	checkCudaErrors(cuDeviceGet(&cuDevice, devID));     //核心实现。
	cuDeviceGetName(name, 100, cuDevice);

	checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
	checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
	checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));

	status = cuCtxCreate(&cuContext, 0, cuDevice);

	if (CUDA_SUCCESS != status)
	{
		printf("cuCtxCreate(0) returned %d\n-> %s\n", status, getCudaDrvErrorString(status));
		return false;
	}

	return true;
}

static bool LoadPtxBin(string &ptxBin)
{
	// in this branch we use compilation with parameters
	const unsigned int jitNumOptions = 3;
	CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
	void **jitOptVals = new void *[jitNumOptions];
	// set up size of compilation log buffer
	jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
	int jitLogBufferSize = 1024;
	jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

	// set up pointer to the compilation log buffer
	jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
	char *jitLogBuffer = new char[jitLogBufferSize];
	jitOptVals[1] = jitLogBuffer;
	// set up pointer to set the Maximum # of registers for a particular kernel
	jitOptions[2] = CU_JIT_MAX_REGISTERS;
	int jitRegCount = 32;
	jitOptVals[2] = (void *)(size_t)jitRegCount;
	CUresult status;
	status = cuModuleLoadDataEx(&cuModule, ptxBin.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);
	if (CUDA_SUCCESS != status)
	{
		return false;
	}

	return true;
}

/**
*@brief driver api初始化操作。
*/
static bool InitCUDA(string &modulePath, CUfunction *transform)
{
	CUfunction cuFunction = 0;
	string ptxBin;
	CUresult status;

	if (!CreatCuContext(cuContext))
		return false;
	if (!GetPtxBin(modulePath, ptxBin))
		return false;
	if (!LoadPtxBin(ptxBin))
		return false;

	status = cuModuleGetFunction(&cuFunction, cuModule, "transformKernel");
	if (CUDA_SUCCESS != status)
		return false;
	*transform = cuFunction;

	return true;
}

static bool DestroyCUDA()
{
	cuModuleUnload(cuModule);
	checkCudaErrors(cuCtxDestroy(cuContext));
	return true;
}

void ImageRotate(CUarray inArray, CUdeviceptr outMem, CUfunction transform, int32_t width, int32_t height, float angle)
{
	// set texture parameters
	CUtexObject TexObject;
	CUDA_RESOURCE_DESC ResDesc;
	memset(&ResDesc, 0, sizeof(CUDA_RESOURCE_DESC));
	ResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
	ResDesc.res.array.hArray = inArray;

	CUDA_TEXTURE_DESC TexDesc;
	memset(&TexDesc, 0, sizeof(CUDA_TEXTURE_DESC));
	TexDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
	TexDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
	TexDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
	TexDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
	//TexDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;浮点坐标的设置。
	TexDesc.flags = CU_TRSF_READ_AS_INTEGER;

	TexDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
	checkCudaErrors(cuTexObjectCreate(&TexObject, &ResDesc, &TexDesc, NULL));

	int block_size = 8;
	if (1)
	{
		// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
		void *args[5] = { &outMem, &width, &height, &angle, &TexObject };

		checkCudaErrors(cuLaunchKernel(transform, (width / block_size), (height / block_size), 1,
			block_size, block_size, 1,
			0,
			NULL, args, NULL));
		checkCudaErrors(cuCtxSynchronize());
	}
	else
	{
		// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (advanced method)
		int offset = 0;
		char argBuffer[256];

		// pass in launch parameters (not actually de-referencing CUdeviceptr).  CUdeviceptr is
		// storing the value of the parameters
		*((CUdeviceptr *)&argBuffer[offset]) = outMem;
		offset += sizeof(outMem);
		*((unsigned int *)&argBuffer[offset]) = width;
		offset += sizeof(width);
		*((unsigned int *)&argBuffer[offset]) = height;
		offset += sizeof(height);
		*((float *)&argBuffer[offset]) = angle;
		offset += sizeof(angle);
		*((CUtexObject *)&argBuffer[offset]) = TexObject;
		offset += sizeof(TexObject);

		void *kernel_launch_config[5] =
		{
			CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
			CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
			CU_LAUNCH_PARAM_END
		};

		// new CUDA 4.0 Driver API Kernel launch call (warmup)
		checkCudaErrors(cuLaunchKernel(transform, (width / block_size), (height / block_size), 1,
			block_size, block_size, 1,
			0,
			NULL, NULL, (void **)&kernel_launch_config));
		checkCudaErrors(cuCtxSynchronize());
	}
	checkCudaErrors(cuCtxSynchronize());
	checkCudaErrors(cuTexObjectDestroy(TexObject));
}

void main(int argc, char **argv)
{
	string inFileName = "F:\\Middleware\\In.bgra";
	string outFileName = "F:\\Middleware\\Out.bgra";
	uint32_t width = 1920;
	uint32_t height = 1080;
	ifstream inFile;
	ofstream outFile;
	uint8_t *RGBA = new uint8_t[width * height * 4];
	uint8_t *outRGBA = new uint8_t[width * height * 4];

	inFile.open(inFileName.c_str(), ios::binary);
	if (!inFile.is_open())
		throw inFileName;
	outFile.open(outFileName.c_str(), ios::binary);
	if (!outFile.is_open())
		throw outFileName;

    bool bTestResults = true;
    CUfunction transform = NULL;
	string modulePath = "./data/simpleTexture_kernel32.ptx";

    if (!InitCUDA(modulePath, &transform))
    {
        exit(EXIT_FAILURE);
    }

	size_t       size = width * height * sizeof(float);
	// allocate device memory for result
	CUdeviceptr d_data = (CUdeviceptr)NULL;
	checkCudaErrors(cuMemAlloc(&d_data, size));

	int32_t i = 0;
	for (i = 0; i < 20; i++)
	{
		inFile.read(reinterpret_cast<char *>(RGBA), width * height * 4);
		CUarray cu_array;
		CUDA_ARRAY_DESCRIPTOR desc;
		desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
		desc.NumChannels = 4;
		desc.Width = width;
		desc.Height = height;
		checkCudaErrors(cuArrayCreate(&cu_array, &desc));

		//将图像数据从主存拷贝到显存array中。
		CUDA_MEMCPY2D copyParam;
		memset(&copyParam, 0, sizeof(copyParam));
		copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
		copyParam.dstArray = cu_array;
		copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
		copyParam.srcHost = RGBA;

		copyParam.srcPitch = width * 4;
		copyParam.WidthInBytes = copyParam.srcPitch;
		copyParam.Height = height;
		checkCudaErrors(cuMemcpy2D(&copyParam));

		ImageRotate(cu_array, d_data, transform, width, height, angle);

		checkCudaErrors(cuMemcpyDtoH(outRGBA, d_data, size));

		if (i > 3)
			outFile.write(reinterpret_cast<char *>(outRGBA), width * height * 4);
		checkCudaErrors(cuArrayDestroy(cu_array));
	}

    checkCudaErrors(cuMemFree(d_data));
	DestroyCUDA();
   
	if (RGBA != NULL) {
		delete[] RGBA;
		RGBA = NULL;
	}

	if (outRGBA != nullptr)
	{
		delete[] outRGBA;
		outRGBA = nullptr;
	}

	if (inFile.is_open()) {
		inFile.clear();
		inFile.close();
	}

	if (outFile.is_open()) {
		outFile.clear();
		outFile.close();
	}
}

在上例，如果纹理坐标使用归一化的浮点坐标，输出了错误的图像，具体原因有待于进一步思考。