简易程序介绍
今天我们就拿一个简单矩阵做例子
我们先在主机端用C语言计算一次,再在OpenCL内核(设备端)中计算一次,然后对比两者的计算结果,若计算值相同则输出结果。
CL核程序
程序很简单:每个工作项通过dot()函数计算矩阵的一行与向量的点积,合起来就是一次简单的矩阵与向量相乘运算
/*
 * mac: each work-item handles one float4 element of `matrix`.
 *
 * The work-item with global id `row` writes
 *     result[row] = dot(matrix[row], vector[0])
 * so only the first float4 of `vector` is ever read.
 */
__kernel void mac(__global float4 *matrix,
                  __global float4 *vector,
                  __global float *result)
{
    const int row = get_global_id(0);
    const float4 rhs = vector[0];

    result[row] = dot(matrix[row], rhs);
}
CL程序这里就不多加赘述
主程序分步
首先我们要对各部件进行定义
/* Platform / device handles and the counts reported by the query calls. */
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_platforms;
cl_uint ret_num_devices;

/* Context and the command queue created on it. */
cl_context context = NULL;
cl_command_queue command_queue = NULL;

/* Kernel source: file handle, text buffer and number of bytes read. */
FILE *fp;
char *kernel_str;
size_t kernel_code_size;

/* Program built from that source and the kernel taken from it. */
cl_program program = NULL;
cl_kernel kernel = NULL;

/* Device-side buffers for the matrix, the vector and the result. */
cl_mem mat_buff = NULL;
cl_mem vec_buff = NULL;
cl_mem res_buff = NULL;

/* Status of the most recent OpenCL call, and the global work size. */
cl_int ret;
size_t work_units_per_kernel;
这里对平台(及其数量)、设备(及其数量)、上下文、命令队列、缓冲区对象、程序、内核等进行了定义
接下来我们要对矩阵进行定义并计算
/*
 * Host-side reference result.
 * mat holds 16 floats read as four consecutive rows of four; vec holds four.
 * correct[row] is the dot product of row `row` with vec.
 */
float mat[16], vec[4], result[4];
float correct[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

for (int k = 0; k < 16; k++)
{
    mat[k] = k * 2.0f;
}
for (int col = 0; col < 4; col++)
{
    vec[col] = col * 3.0f;
}
/* Columns are summed in ascending order for every row. */
for (int row = 0; row < 4; row++)
{
    for (int col = 0; col < 4; col++)
    {
        correct[row] += mat[row * 4 + col] * vec[col];
    }
}
至此主程序中的矩阵计算已经结束了,接下来是对OpenCL程序的设计
首先是获取连接上下文的平台及设备,以及上下文的创建
kernel_str = (char *)malloc(MAX_SOURCE_SIZE);
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
&ret_num_devices);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
然后是创建命令队列
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
接下来是对内核文件的读取
fp = fopen("Conv2D.cl", "r");
kernel_code_size = fread(kernel_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
我这里用了很简单的一步就结束了。正常情况下我们还可以先用fseek和ftell得到文件大小,再用rewind回到文件开头,最后用fread读取内核文件,比如像这样
fp = fopen(PROGRAM_FILE, "r");
fseek(fp, 0, SEEK_END);
program_size = ftell(fp);
rewind(fp);
kernel_str = (char*)malloc(program_size + 1);
kernel_str[program_size] = '\0';
fread(kernel_str, sizeof(char), program_size,fp);
fclose(fp);
接下来是编译程序
program = clCreateProgramWithSource(context, 1, (const char **)&kernel_str,
(const size_t *)&kernel_code_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
创建内核
/* Look up the kernel function by the name it has in the program source. */
const char *kernel_name = "mac";
kernel = clCreateKernel(program, kernel_name, &ret);
创建缓冲区(clCreateBuffer)
mat_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * 16, mat, &ret);
vec_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * 4, vec, &ret);
res_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * 4, NULL, &ret);
以及设置内核参数
clSetKernelArg(kernel, 0, sizeof(cl_mem), &mat_buff);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &vec_buff);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &res_buff);
设置完一切之后呢,最重要的一步不能落下!没错,执行!
work_units_per_kernel = 4;
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &work_units_per_kernel, NULL, 0, NULL, NULL);
然后我们再用clEnqueueReadBuffer函数将缓存中的信息读出来
/* Copy the four result floats from res_buff into the host array `result`,
 * passing CL_TRUE as the blocking_read argument. */
const size_t result_bytes = sizeof(float) * 4;
clEnqueueReadBuffer(command_queue, res_buff, CL_TRUE,
                    0, result_bytes, result,
                    0, NULL, NULL);
差点忘了,我们在做验证来着,那就还有一步
if ((result[0] == correct[0]) && (result[1] == correct[1]) && (result[2] == correct[2]) && (result[3] == correct[3]))
{
printf("Correct!");
printf("\n");
for (int i = 0; i < 4; i++)
{
printf("%f\t", result[i]);
}
}
如果内核运行数据结果与主程序中计算结果相同,则输出结果如下图
整合程序
#include<stdio.h>
#include<stdlib.h>
#include<CL/cl.h>
#include<string.h>
#include <iostream>
#define MAX_SOURCE_SIZE (0x10000)
#pragma warning(disable : 4996)
int main(void)
{
cl_platform_id platform_id = NULL;
cl_uint ret_num_platforms;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem mat_buff=NULL;
cl_mem vec_buff=NULL;
cl_mem res_buff=NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
size_t kernel_code_size;
char *kernel_str;
cl_int ret;
FILE *fp;
size_t work_units_per_kernel;
float mat[16], vec[4], result[4];
float correct[4] = { 0.0f ,0.0f,0.0f,0.0f };
for (int i = 0; i < 16; i++)
{
mat[i] = i * 2.0f;
}
for (int i = 0; i < 4; i++)
{
vec[i] = i * 3.0f;
correct[0] += mat[i] * vec[i];
correct[1] += mat[i + 4] * vec[i];
correct[2] += mat[i + 8] * vec[i];
correct[3] += mat[i + 12] * vec[i];
}
kernel_str = (char *)malloc(MAX_SOURCE_SIZE);
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
&ret_num_devices);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
fp = fopen("Conv2D.cl", "r");
kernel_code_size = fread(kernel_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
program = clCreateProgramWithSource(context, 1, (const char **)&kernel_str,
(const size_t *)&kernel_code_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
kernel = clCreateKernel(program, "mac", &ret);
mat_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * 16, mat, &ret);
vec_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * 4, vec, &ret);
res_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * 4, NULL, &ret);
clSetKernelArg(kernel, 0, sizeof(cl_mem), &mat_buff);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &vec_buff);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &res_buff);
//execute kernel
work_units_per_kernel = 4;
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &work_units_per_kernel, NULL, 0, NULL, NULL);
//read from buffer
clEnqueueReadBuffer(command_queue, res_buff, CL_TRUE, 0, sizeof(float) * 4, result, 0, NULL, NULL);
//verify
if ((result[0] == correct[0]) && (result[1] == correct[1]) && (result[2] == correct[2]) && (result[3] == correct[3]))
{
printf("Correct!");
printf("\n");
for (int i = 0; i < 4; i++)
{
printf("%f\t", result[i]);
}
}
//releasememobject
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}
(注:这里加上#pragma warning(disable : 4996)是为了关闭MSVC的C4996警告:fopen等函数会被标记为不安全,而且OpenCL的版本不同,有些函数(例如clCreateCommandQueue)在较新的版本中已被标记为弃用)
今日试错心得
在编译程序(以及调用其他OpenCL函数)的时候最好都加上一个ret返回值并检查它,这样可以通过查阅错误码表来找到自己是哪里出错了
比如我今天就被(-45,即CL_INVALID_PROGRAM_EXECUTABLE:与命令队列关联的设备上没有成功构建的可执行程序)这个错误卡了好久,最后干脆重新写了一遍程序才发现问题所在。这个网址csdn上也有链接: link.
然后编写每个函数的时候,打完一个逗号都会有各个参数的提示,今天因为没有看清参数是指针类型出了好多BUG,今后也会注意的。
= =今天试了挺多程序,想熟练下OpenCL编程的框架,结果BUG太多没时间去理卷积模型。
只能把希望寄托在明天了哈哈哈哈