概述
Driver API,除了手册以外,几乎没有其他资料。具体如下。
- <>。 官方的cuda driver api的手册。
- <<CUDA_C_Programming_Guide>>的driver api章节。介绍driver api程序实现流程。
上下文(context)
经过对cuda相关文档的研读,总结如下。
- cuInit(),在见到的源码中以cuInit(0)的形式调用。如果要使用Cuda Driver API,必须先调用这个函数。
- cuCtxCreate()。创建context,并且将其加入当前线程堆栈中。创建context+cuCtxPushCurrent()。
- cuCtxDestroy()。context从当前线程堆栈中弹出,并且销毁之。cuCtxPopCurrent+销毁context。
在库中如何使用context。在cuda手册给出了如下方法。
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-1H8b0Heo-1678020073554)(Images/Conex在库调用使用方法.png)]
但是其没有给出库释放资源的时候,如何调用。具体总结如下。
- 首先调用cuCtxPushCurrent()。
- 调用释放资源的driver api。
- cuCtxDestroy()。
也可以是如下使用流程。
- 调用cuCtxCreate():创建context将其加入到线程堆栈中。
- 使用driver api。
- 调用cuCtxDestroy():将上下文弹出堆栈并销毁其。
基于纹理的图像处理
SDK sample代码\Sample\0_Simple\simpleTextureDrv\简单实现了基于纹理的图像处理,不过实现比较繁琐,具体整理如下。使用的编译工具是Visual studio 2013,具体的编译环境设置完全等同于simpleTextureDrv。
首先是核函数(simpleTexture_kernel.cu)的实现代码。
#ifndef _SIMPLETEXTURE_KERNEL_H_
#define _SIMPLETEXTURE_KERNEL_H_
#include <cuda.h>
#include <vector_types.h>
extern "C"
__global__ void
transformKernel(uchar4 *g_odata, int width, int height, float theta, CUtexObject tex)
{
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
float u = (float)x - (float)width/2;
float v = (float)y - (float)height/2;
float tu = u*cosf(theta) - v*sinf(theta);
float tv = v*cosf(theta) + u*sinf(theta);
g_odata[y*width + x] = tex2D<uchar4>(tex, tu + width / 2.0, tv + height / 2.0);
}
#endif
然后是C++的实现代码。
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <iostream>
#include <cstring>
#include <cuda.h>
#include <builtin_types.h>
#include <helper_cuda_drvapi.h>
#include <cstdint>
#include <fstream>
using namespace std;
float angle = 0.5f; // angle to rotate image by (in radians)
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
/**
*@brief 获取Ptx文件的二进制格式到ptxBin
*/
bool GetPtxBin(string &modulePath, string &ptxBin)
{
if (modulePath.rfind(".ptx") != string::npos)
{
FILE *fp = fopen(modulePath.c_str(), "rb");
fseek(fp, 0, SEEK_END);
int file_size = ftell(fp);
char *buf = new char[file_size + 1];
fseek(fp, 0, SEEK_SET);
fread(buf, sizeof(char), file_size, fp);
fclose(fp);
buf[file_size] = '\0';
ptxBin = buf;
delete[] buf;
}
else
{
return false;
}
return true;
}
static bool CreatCuContext(CUcontext &cuContext)
{
CUfunction cuFunction = 0;
CUresult status;
int major = 0, minor = 0, devID = 0;
char deviceName[100];
string module_path, ptx_source;
string modulePath = "./data/simpleTexture_kernel32.ptx";
char name[100];
devID = gpuGetMaxGflopsDeviceIdDRV(); //核心实现。
checkCudaErrors(cuDeviceGet(&cuDevice, devID)); //核心实现。
cuDeviceGetName(name, 100, cuDevice);
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
status = cuCtxCreate(&cuContext, 0, cuDevice);
if (CUDA_SUCCESS != status)
{
printf("cuCtxCreate(0) returned %d\n-> %s\n", status, getCudaDrvErrorString(status));
return false;
}
return true;
}
static bool LoadPtxBin(string &ptxBin)
{
// in this branch we use compilation with parameters
const unsigned int jitNumOptions = 3;
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
void **jitOptVals = new void *[jitNumOptions];
// set up size of compilation log buffer
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
int jitLogBufferSize = 1024;
jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
// set up pointer to the compilation log buffer
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
char *jitLogBuffer = new char[jitLogBufferSize];
jitOptVals[1] = jitLogBuffer;
// set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[2] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 32;
jitOptVals[2] = (void *)(size_t)jitRegCount;
CUresult status;
status = cuModuleLoadDataEx(&cuModule, ptxBin.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);
if (CUDA_SUCCESS != status)
{
return false;
}
return true;
}
/**
*@brief driver api初始化操作。
*/
static bool InitCUDA(string &modulePath, CUfunction *transform)
{
CUfunction cuFunction = 0;
string ptxBin;
CUresult status;
if (!CreatCuContext(cuContext))
return false;
if (!GetPtxBin(modulePath, ptxBin))
return false;
if (!LoadPtxBin(ptxBin))
return false;
status = cuModuleGetFunction(&cuFunction, cuModule, "transformKernel");
if (CUDA_SUCCESS != status)
return false;
*transform = cuFunction;
return true;
}
static bool DestroyCUDA()
{
cuModuleUnload(cuModule);
checkCudaErrors(cuCtxDestroy(cuContext));
return true;
}
void ImageRotate(CUarray inArray, CUdeviceptr outMem, CUfunction transform, int32_t width, int32_t height, float angle)
{
// set texture parameters
CUtexObject TexObject;
CUDA_RESOURCE_DESC ResDesc;
memset(&ResDesc, 0, sizeof(CUDA_RESOURCE_DESC));
ResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
ResDesc.res.array.hArray = inArray;
CUDA_TEXTURE_DESC TexDesc;
memset(&TexDesc, 0, sizeof(CUDA_TEXTURE_DESC));
TexDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
TexDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
TexDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
TexDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
//TexDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;浮点坐标的设置。
TexDesc.flags = CU_TRSF_READ_AS_INTEGER;
TexDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
checkCudaErrors(cuTexObjectCreate(&TexObject, &ResDesc, &TexDesc, NULL));
int block_size = 8;
if (1)
{
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
void *args[5] = { &outMem, &width, &height, &angle, &TexObject };
checkCudaErrors(cuLaunchKernel(transform, (width / block_size), (height / block_size), 1,
block_size, block_size, 1,
0,
NULL, args, NULL));
checkCudaErrors(cuCtxSynchronize());
}
else
{
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (advanced method)
int offset = 0;
char argBuffer[256];
// pass in launch parameters (not actually de-referencing CUdeviceptr). CUdeviceptr is
// storing the value of the parameters
*((CUdeviceptr *)&argBuffer[offset]) = outMem;
offset += sizeof(outMem);
*((unsigned int *)&argBuffer[offset]) = width;
offset += sizeof(width);
*((unsigned int *)&argBuffer[offset]) = height;
offset += sizeof(height);
*((float *)&argBuffer[offset]) = angle;
offset += sizeof(angle);
*((CUtexObject *)&argBuffer[offset]) = TexObject;
offset += sizeof(TexObject);
void *kernel_launch_config[5] =
{
CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
CU_LAUNCH_PARAM_END
};
// new CUDA 4.0 Driver API Kernel launch call (warmup)
checkCudaErrors(cuLaunchKernel(transform, (width / block_size), (height / block_size), 1,
block_size, block_size, 1,
0,
NULL, NULL, (void **)&kernel_launch_config));
checkCudaErrors(cuCtxSynchronize());
}
checkCudaErrors(cuCtxSynchronize());
checkCudaErrors(cuTexObjectDestroy(TexObject));
}
void main(int argc, char **argv)
{
string inFileName = "F:\\Middleware\\In.bgra";
string outFileName = "F:\\Middleware\\Out.bgra";
uint32_t width = 1920;
uint32_t height = 1080;
ifstream inFile;
ofstream outFile;
uint8_t *RGBA = new uint8_t[width * height * 4];
uint8_t *outRGBA = new uint8_t[width * height * 4];
inFile.open(inFileName.c_str(), ios::binary);
if (!inFile.is_open())
throw inFileName;
outFile.open(outFileName.c_str(), ios::binary);
if (!outFile.is_open())
throw outFileName;
bool bTestResults = true;
CUfunction transform = NULL;
string modulePath = "./data/simpleTexture_kernel32.ptx";
if (!InitCUDA(modulePath, &transform))
{
exit(EXIT_FAILURE);
}
size_t size = width * height * sizeof(float);
// allocate device memory for result
CUdeviceptr d_data = (CUdeviceptr)NULL;
checkCudaErrors(cuMemAlloc(&d_data, size));
int32_t i = 0;
for (i = 0; i < 20; i++)
{
inFile.read(reinterpret_cast<char *>(RGBA), width * height * 4);
CUarray cu_array;
CUDA_ARRAY_DESCRIPTOR desc;
desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
desc.NumChannels = 4;
desc.Width = width;
desc.Height = height;
checkCudaErrors(cuArrayCreate(&cu_array, &desc));
//将图像数据从主存拷贝到显存array中。
CUDA_MEMCPY2D copyParam;
memset(©Param, 0, sizeof(copyParam));
copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
copyParam.dstArray = cu_array;
copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
copyParam.srcHost = RGBA;
copyParam.srcPitch = width * 4;
copyParam.WidthInBytes = copyParam.srcPitch;
copyParam.Height = height;
checkCudaErrors(cuMemcpy2D(©Param));
ImageRotate(cu_array, d_data, transform, width, height, angle);
checkCudaErrors(cuMemcpyDtoH(outRGBA, d_data, size));
if (i > 3)
outFile.write(reinterpret_cast<char *>(outRGBA), width * height * 4);
checkCudaErrors(cuArrayDestroy(cu_array));
}
checkCudaErrors(cuMemFree(d_data));
DestroyCUDA();
if (RGBA != NULL) {
delete[] RGBA;
RGBA = NULL;
}
if (outRGBA != nullptr)
{
delete[] outRGBA;
outRGBA = nullptr;
}
if (inFile.is_open()) {
inFile.clear();
inFile.close();
}
if (outFile.is_open()) {
outFile.clear();
outFile.close();
}
}
在上例,如果纹理坐标使用归一化的浮点坐标,输出了错误的图像,具体原因有待于进一步思考。