这篇文章的重点是啥:
关于OpenCL整体工作流程的文章有很多,比较好的有知乎华叔的文章和OpenCL 2.0 异构计算。流程他们已经讲得很详细了,我就不简单重复了,这篇博客主要是弄一个可运行代码,然后,强调一些我觉得重要的东西。
OpenCL的工作流程
创建并执行一个简单的OpenCL应用大致需要以下几步:
- 查询平台和设备信息
- 创建一个上下文
- 为每个设备创建一个命令队列
- 创建一个内存对象(数组)用于存储数据
- 拷贝输入数据到设备端
- 使用OpenCL C代码创建并编译出一个程序
- 从编译好的OpenCL程序中提取内核
- 执行内核
- 拷贝输出数据到主机端
- 释放资源
在下面的代码中,很容易找到对应的代码。
简单的可运行代码
这里复用了前面Windows下环境配置中的代码,运行时输出 `Result: 3` 即为正确运行。
#include <CL/cl.h>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>
// Side length of the square matrices; each matrix holds N * N floats.
const int N = 1024;
// Size in bytes of one N x N float matrix (used for the device buffers and transfers).
const size_t size = sizeof(float) * N * N;
/// Translates an OpenCL status code into the name of its symbolic constant.
///
/// Codes are matched by numeric value rather than by macro name, so the table
/// compiles against any version of the OpenCL headers. It covers the core
/// error codes up to OpenCL 2.2 (-72) and the GL/D3D10/ICD extension codes.
///
/// @param error  Status code returned by an OpenCL API call.
/// @return A string literal naming the code (never null);
///         "Unknown OpenCL error" for any value not in the table.
const char* getErrorString(cl_int error) {
    switch (error) {
    // run-time and JIT compiler errors
    case 0: return "CL_SUCCESS";
    case -1: return "CL_DEVICE_NOT_FOUND";
    case -2: return "CL_DEVICE_NOT_AVAILABLE";
    case -3: return "CL_COMPILER_NOT_AVAILABLE";
    case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
    case -5: return "CL_OUT_OF_RESOURCES";
    case -6: return "CL_OUT_OF_HOST_MEMORY";
    case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
    case -8: return "CL_MEM_COPY_OVERLAP";
    case -9: return "CL_IMAGE_FORMAT_MISMATCH";
    case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
    case -11: return "CL_BUILD_PROGRAM_FAILURE";
    case -12: return "CL_MAP_FAILURE";
    case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
    case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
    case -15: return "CL_COMPILE_PROGRAM_FAILURE";
    case -16: return "CL_LINKER_NOT_AVAILABLE";
    case -17: return "CL_LINK_PROGRAM_FAILURE";
    case -18: return "CL_DEVICE_PARTITION_FAILED";
    case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";

    // invalid-argument / API-usage errors
    case -30: return "CL_INVALID_VALUE";
    case -31: return "CL_INVALID_DEVICE_TYPE";
    case -32: return "CL_INVALID_PLATFORM";
    case -33: return "CL_INVALID_DEVICE";
    case -34: return "CL_INVALID_CONTEXT";
    case -35: return "CL_INVALID_QUEUE_PROPERTIES";
    case -36: return "CL_INVALID_COMMAND_QUEUE";
    case -37: return "CL_INVALID_HOST_PTR";
    case -38: return "CL_INVALID_MEM_OBJECT";
    case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
    case -40: return "CL_INVALID_IMAGE_SIZE";
    case -41: return "CL_INVALID_SAMPLER";
    case -42: return "CL_INVALID_BINARY";
    case -43: return "CL_INVALID_BUILD_OPTIONS";
    case -44: return "CL_INVALID_PROGRAM";
    case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
    case -46: return "CL_INVALID_KERNEL_NAME";
    case -47: return "CL_INVALID_KERNEL_DEFINITION";
    case -48: return "CL_INVALID_KERNEL";
    case -49: return "CL_INVALID_ARG_INDEX";
    case -50: return "CL_INVALID_ARG_VALUE";
    case -51: return "CL_INVALID_ARG_SIZE";
    case -52: return "CL_INVALID_KERNEL_ARGS";
    case -53: return "CL_INVALID_WORK_DIMENSION";
    case -54: return "CL_INVALID_WORK_GROUP_SIZE";
    case -55: return "CL_INVALID_WORK_ITEM_SIZE";
    case -56: return "CL_INVALID_GLOBAL_OFFSET";
    case -57: return "CL_INVALID_EVENT_WAIT_LIST";
    case -58: return "CL_INVALID_EVENT";
    case -59: return "CL_INVALID_OPERATION";
    case -60: return "CL_INVALID_GL_OBJECT";
    case -61: return "CL_INVALID_BUFFER_SIZE";
    case -62: return "CL_INVALID_MIP_LEVEL";
    case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
    case -64: return "CL_INVALID_PROPERTY";
    case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
    case -66: return "CL_INVALID_COMPILER_OPTIONS";
    case -67: return "CL_INVALID_LINKER_OPTIONS";
    case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
    // added in OpenCL 2.0 / 2.2
    case -69: return "CL_INVALID_PIPE_SIZE";
    case -70: return "CL_INVALID_DEVICE_QUEUE";
    case -71: return "CL_INVALID_SPEC_ID";
    case -72: return "CL_MAX_SIZE_RESTRICTION_EXCEEDED";

    // extension errors
    case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
    case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
    case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
    case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
    case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
    case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";

    default: return "Unknown OpenCL error";
    }
}
void CheckError(cl_int error) {
if (error != CL_SUCCESS) {
std::cerr << "Error occurred at file: " << __FILE__ << ", line: " << __LINE__<<" with status "<< getErrorString(error) << std::endl;
exit(1);
}
}
int main() {
// 初始化输入矩阵
float* A = new float[N * N];
float* B = new float[N * N];
for (size_t i = 0; i < N * N; i++)
{
A[i] = 1.0f;
B[i] = 2.0f;
}
cl_platform_id platform;
cl_int status;
status=clGetPlatformIDs(1, &platform, NULL);
CheckError(status);
cl_device_id device;
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
CheckError(status);
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
CheckError(status);
cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, 0, &status);
CheckError(status);
// 创建OpenCL内存缓冲区
cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status);
cl_mem bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status);
cl_mem bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, size, NULL, &status);
CheckError(status);
// 将输入数据传输到OpenCL缓冲区
status = clEnqueueWriteBuffer(queue, bufferA, CL_TRUE, 0, size, A, 0, NULL, NULL);
status = clEnqueueWriteBuffer(queue, bufferB, CL_TRUE, 0, size, B, 0, NULL, NULL);
CheckError(status);
const char* source = "__kernel void add_matrices(__global const float* A, __global const float* B, __global float* C) { int id = get_global_id(0); C[id] = A[id] + B[id]; }";
cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, &status);
CheckError(status);
status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
CheckError(status);
cl_kernel kernel = clCreateKernel(program, "add_matrices", &status);
CheckError(status);
// 设置OpenCL内核参数
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferB);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferC);
CheckError(status);
// 启动内核
size_t globalWorkSize[2] = { N, N };
status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
CheckError(status);
// 读取结果数据
status = clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, size, A, 0, NULL, NULL);
CheckError(status);
// 清理OpenCL资源
status = clReleaseMemObject(bufferA);
status = clReleaseMemObject(bufferB);
status = clReleaseMemObject(bufferC);
status = clReleaseProgram(program);
status = clReleaseKernel(kernel);
status = clReleaseCommandQueue(queue);
status = clReleaseContext(context);
CheckError(status);
// 打印结果
std::cout << "Result: " << A[1023] << std::endl;
delete[] A;
delete[] B;
return 0;
}
保留了之前逻辑的同时,此处的重要区别是引入了 `void CheckError(cl_int error)` 函数,用于校验每个环节执行结束后的状态值。
整体流程中需要注意的点
- 在本地调试阶段,建议监控每一个可能出错位置的状态值
- 状态值是OpenCL反馈错误的最主要方式,如果没有状态值,排查错误几乎是不可能进行的;状态值可以作为排查错误的重要参考
- 通常,OpenCL相关的错误都是可以稳定复现的;如果多次计算相同数据时,状态值出错的位置不一致,请优先考虑其它环节是否存在比较严重的内存泄漏
- 每个返回状态值的函数,其官方文档中都有各状态值对应原因的描述。因此,当返回的状态值有误,且无法凭借状态值本身直接得知出错原因时,可以查看该函数官方描述中关于该状态值的说明,以排查当前可能存在的错误
- 程序退出时记得释放系统资源
- OpenCL中利用Create创建的资源都需要释放,例如,此处的 `MemObject`、`Program`、`Kernel`、`CommandQueue`、`Context`。创建的资源对系统而言是比较大的负担,当你在某个程序中多次创建资源而不释放时,系统内存占用将会显著增加直至无法创建新的资源。
- OpenCL中利用Create创建的资源都需要释放,例如,此处的