运用openCL实现基于FPGA的数控加工参数智能优化算法
1、引言
实验需要做一个基于边缘计算的数控加工参数智能优化系统。由于FPGA的并行能力强大,因此准备用openCL实现智能优化算法。
智能优化算法的大致思想为BP神经网络 + 遗传算法。先通过BP神经网络,对实验获得的大量数据(n 转速、f 进给率、ap 切割宽度、ae 切割深度、刀具磨损量,一共五个参数)进行训练,并将训练结果用作遗传算法适应度函数的算子,最终通过遗传算法不断地选择、交叉、变异,选出适应性最强的个体作为优化输出。
(由于机床实际加工原因,遗传算法只对n、f两个参数进行交叉、变异。)
2、遗传算法
实验获取数据并通过BP获取遗传算法适应度函数算子的部分由实验室师兄完成,作者只负责遗传算法和openCL编写智能优化算法的部分。
由于openCL对于并行计算的部分要开内核进行内核编程,遗传算法的数据初始化、选择部分在主机(main.cpp)内完成,交叉和变异部分在内核中完成。
种群储存在一个float类型的一维数组float popu[5 * NUM]中,NUM为种群规模
- 数据初始化
初始化部分先通过随机数初始化,各参数通过归一化处理变成0到1之间的浮点数。
// Initialise the population with pseudo-random values of the form
// k / 1000000, i.e. in [0, 1). The parameters n, f, ap, ae and tool wear
// are treated as already normalised.
//
// popu: caller-provided array of at least NUM * 5 floats (5 consecutive
//       floats per individual); every element is overwritten.
// Side effects: reseeds rand() from the current time and prints a banner.
void init_problem(float* popu) {
    srand((unsigned)time(NULL));
    // The population holds 5 floats per individual, so NUM * 5 entries must
    // be filled. Stopping at NUM * 4 left the last individuals uninitialised
    // even though selection and fitness evaluation read them.
    for (int i = 0; i < NUM * 5; i++) {
        popu[i] = rand() % 1000000 / 1000000.0;
    }
    printf("innt_matrix:\n");
}
- 选择
选择我用的是锦标赛选择方法,当然你也可以用其他的。new_popu数组用于中间计算,无实际含义。
// Tournament selection (tournament size 3).
//
// For each of the NUM slots of the next generation, three individuals are
// drawn at random (with replacement) and the one with the highest
// getFitness() value is copied into the slot. The result is written back
// into popu and printed with show().
//
// popu: population of NUM individuals, 5 consecutive floats each; updated
//       in place.
void select(float* popu){
    float new_popu[NUM * 5]; // scratch buffer for the next generation

    for (int i = 0; i < NUM; i++) {
        // Offsets of the first gene of three random individuals. Using
        // rand() % NUM keeps the draw valid for any population size and any
        // RAND_MAX; the former rand()%1000000/100000 only produced 0..9 and
        // degenerated to 0 on platforms where RAND_MAX < 100000.
        int a = (rand() % NUM) * 5;
        int b = (rand() % NUM) * 5;
        int c = (rand() % NUM) * 5;
        printf("\na=%d,b=%d,c=%d\n",a,b,c);

        // Winner of the tournament; ties go to the later contestant.
        int max = getFitness(a,popu) > getFitness(b,popu) ? a : b;
        max = getFitness(max,popu) > getFitness(c,popu) ? max : c;
        printf("max=%d\n",max);

        for (int j = 0; j < 5; j++) {
            new_popu[i*5+j] = popu[max+j];
        }
    }

    for (int i = 0; i < NUM * 5; i++) {
        popu[i] = new_popu[i];
    }
    printf("\nthis is select\n");
    show(popu);
}
- 交叉
两两进行交叉,一共能变两个参数,且不同参数间不能进行交叉,也就只有这一种交叉方法了
(注意:交叉用的是内核编程)
//交叉
#include "mutation.cl"
// Crossover kernel.
//
// Work-item g pairs individual 2*g with individual 2*g + 1 and swaps the
// value stored at offset 1 of each (the second parameter of the individual;
// the article's layout n, f, ap, ae, wear makes this f).
//
// popu: population buffer, 5 consecutive floats per individual.
__kernel void cross(__global float *popu)
{
    const int NUM = 10;   // population size; must match NUM on the host
    const int GENES = 5;  // floats per individual

    const int first = (int)get_global_id(0) * 2;
    const int second = first + 1;

    // Work-items without a complete pair inside the population do nothing.
    // The previous "% (NUM*5 - 1)" wrap was a no-op for valid pairs and, for
    // out-of-range work-items, produced offsets that were not aligned to an
    // individual, corrupting unrelated parameters.
    if (second >= NUM) {
        return;
    }

    const int index = first * GENES;    // first gene of individual 1
    const int index2 = second * GENES;  // first gene of individual 2

    // Swap the gene at offset 1 between the two individuals.
    const float temp = popu[index + 1];
    popu[index + 1] = popu[index2 + 1];
    popu[index2 + 1] = temp;
}
- 变异
变异用的也是内核编程,固定变异
// Mutation kernel.
//
// Work-item g overwrites one parameter of individual g with random_num[g]:
// offset 0 when g is even, offset 1 when g is odd (n or f according to the
// original comment).
//
// popu:       population buffer, 5 consecutive floats per individual.
// random_num: one replacement value per work-item.
__kernel void mutation(__global float *popu, __global float* random_num)
{
    const int gid = get_global_id(0);

    // Trace which work-item is running.
    printf("kernel index is:%d\n",gid);

    // Position of the gene to replace inside the flat population array.
    const int target = gid*5 + gid%2;
    popu[target] = random_num[gid];
}
3、openCL编写程序
openCL编程主要分为几个步骤
1、获得openCL平台、查询可用设备,创建上下文context
2、读取内核文件,创建并构建程序program
3、为每个设备创建一个命令队列(我只用一个FPGA就只创建一个就OK了)
4、创建内核(交叉、变异两个函数需要并行,因此创建两个内核)
5、创建缓冲区
6、设置内核
7、初始化种群
8、选择
9、交叉:
- 写缓冲区clEnqueueWriteBuffer()
- 加入命令队列clEnqueueNDRangeKernel()
- 等待前面事件完成,释放事件
- 读缓冲区clEnqueueReadBuffer()
10、变异:
- 写缓冲区clEnqueueWriteBuffer()
- 加入命令队列clEnqueueNDRangeKernel()
- 等待前面事件完成,释放事件
- 读缓冲区clEnqueueReadBuffer()
11、执行循环结束后选择适应度最高的个体并打印
注:步骤1~6和交叉变异里的缓冲区操作是所有openCL必须要执行的步骤
openCL程序完整代码:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cstring>
#include "CL/opencl.h"
#include "AOCLUtils/aocl_utils.h"
#define PROGRAM_FILE_1 "matrix_init"
#define PROGRAM_FILE_2 "cross"
#define PROGRAM_FILE_3 "mutation"
#define NUM_FILE 3
#define STRING_BUFFER_LEN 1024
using namespace aocl_utils;
// OpenCL runtime configuration
// NOTE(review): program_handle, program_buffer, file_name, program_log,
// program_size and log_size are not referenced by the code visible here —
// presumably left over from a clCreateProgramWithSource-based loader; confirm.
FILE *program_handle;
char *program_buffer[NUM_FILE];
char *file_name[] = {PROGRAM_FILE_1, PROGRAM_FILE_2, PROGRAM_FILE_3};
char *program_log;
size_t program_size[NUM_FILE];
size_t log_size;
cl_program program = NULL;
const char options[] = "-cl-finite-math-only -cl-no-signed-zeros";
// Kernel entry-point names; init_opencl() passes entries [2] and [1] to
// clCreateKernel.
static const char* kernel_names[3] =
{
"matrix_init",
"cross",
"mutation"
};
static cl_platform_id platform = NULL;
cl_uint num_devices = 1;
static cl_device_id device = NULL;
cl_context context = NULL;
scoped_array<cl_command_queue> queue; // num_devices elements
scoped_array<cl_kernel> kernel; // [0] = "mutation" kernel, [1] = "cross" kernel (see init_opencl)
#if USE_SVM_API == 0
scoped_array<cl_mem> input_a_buf; // num_devices elements
scoped_array<cl_mem> input_b_buf; // num_devices elements
scoped_array<cl_mem> output_buf; // num_devices elements
// Device buffer bound as argument 0 of the "cross" kernel in run().
cl_mem input_popu;
#endif /* USE_SVM_API == 0 */
// Problem parameters
unsigned N = 1024;// problem size
const int NUM = 10;// population size
const float PC = 0.8;// crossover probability
const float PM = 0.05;// mutation probability
#if USE_SVM_API == 0
scoped_array<scoped_aligned_ptr<float> > input_a, input_b; // num_devices elements
scoped_array<scoped_aligned_ptr<float> > output; // num_devices elements
#else
scoped_array<scoped_SVM_aligned_ptr<float> > input_a, input_b; // num_devices elements
scoped_array<scoped_SVM_aligned_ptr<float> > output; // num_devices elements
#endif /* USE_SVM_API == 0 */
scoped_array<scoped_array<float> > ref_output; // num_devices elements
scoped_array<unsigned> n_per_device; // num_devices elements
// Selects the fast emulator platform instead of the FPGA SDK platform
// (see init_opencl).
bool use_fast_emulator = false;
// Function declarations
float rand_float();
bool init_opencl();
void init_problem();
void run();
void cleanup();// release resources
static void display_device_info( cl_device_id device );
static void device_info_ulong( cl_device_id device, cl_device_info param, const char* name);// helper that prints a parameter returned by an OpenCL device query
// Fitness of one individual: the sum of the four consecutive values that
// start at popu[index]. The fifth value of the individual is not included.
//
// index: offset of the individual's first parameter inside popu.
// popu:  flat population array.
float getFitness(int index, float* popu){
    const float* genes = popu + index;
    return genes[0] + genes[1] + genes[2] + genes[3];
}
// Print the population: one individual (5 values) per line, followed by a
// separator line.
//
// popu: flat array of NUM * 5 floats.
void show(float* popu){
    for (int k = 0; k < NUM * 5; k++) {
        printf("%f ",popu[k]);
        // End the row after the fifth value of each individual.
        if (k % 5 == 4) {
            printf("\n");
        }
    }
    printf("============================================\n");
}
// Tournament selection (tournament size 3).
//
// For each of the NUM slots of the next generation, three individuals are
// drawn at random (with replacement) and the one with the highest
// getFitness() value is copied into the slot. The result is written back
// into popu and printed with show().
//
// popu: population of NUM individuals, 5 consecutive floats each; updated
//       in place.
void select(float* popu){
    float new_popu[NUM * 5]; // scratch buffer for the next generation

    for (int i = 0; i < NUM; i++) {
        // Offsets of the first gene of three random individuals. Using
        // rand() % NUM keeps the draw valid for any population size and any
        // RAND_MAX; the former rand()%1000000/100000 only produced 0..9 and
        // degenerated to 0 on platforms where RAND_MAX < 100000.
        int a = (rand() % NUM) * 5;
        int b = (rand() % NUM) * 5;
        int c = (rand() % NUM) * 5;
        printf("\na=%d,b=%d,c=%d\n",a,b,c);

        // Winner of the tournament; ties go to the later contestant.
        int max = getFitness(a,popu) > getFitness(b,popu) ? a : b;
        max = getFitness(max,popu) > getFitness(c,popu) ? max : c;
        printf("max=%d\n",max);

        for (int j = 0; j < 5; j++) {
            new_popu[i*5+j] = popu[max+j];
        }
    }

    for (int i = 0; i < NUM * 5; i++) {
        popu[i] = new_popu[i];
    }
    printf("\nthis is select\n");
    show(popu);
}
// Fill random_num with NUM pseudo-random values of the form k / 1000000
// and print each one as it is generated.
//
// random_num: caller-provided array of at least NUM floats; overwritten.
void random_init(float* random_num){
    float* const stop = random_num + NUM;
    for (float* slot = random_num; slot != stop; ++slot) {
        *slot = rand()%1000000/1000000.0;
        printf("random_num is:%f\n",*slot);
    }
}
int main(int argc, char **argv) {
Options options(argc, argv);
// 指定问题大小的可选参数
if(options.has("n")) {
N = options.get<unsigned>("n");
}
// 可选参数指定是否应使用快速仿真器
if(options.has("fast-emulator")) {
use_fast_emulator = options.get<bool>("fast-emulator");
}
// 初始化 OpenCL.
if(!init_opencl()) {
return -1;
}
// 运行内核
run();
// 释放分配的资源
cleanup();
return 0;
}
// Initialise the OpenCL runtime.
//
// Steps: locate the Intel FPGA platform (or the fast emulator), pick the
// first device, create the context, load and build the program from the
// "cross" AOCX binary, create the command queue, the two kernels
// (kernel[0] = "mutation", kernel[1] = "cross") and the device buffers.
//
// Returns true on success and false when the platform or a device cannot be
// found (or, in the SVM build, when the device lacks SVM support). Failing
// OpenCL calls are reported through checkError().
bool init_opencl(){
    // The host code addresses kernel[0] and kernel[1], so the kernel array
    // needs two slots regardless of the number of devices. Sizing it with
    // num_devices (1) made the write to kernel[1] run past the allocation.
    const unsigned num_kernels = 2;
    cl_int status;

    printf("Initializing OpenCL\n");

    if(!setCwdToExeDir()) {
        return false;
    }

    // Get the OpenCL platform.
    if (use_fast_emulator) {
        platform = findPlatform("Intel(R) FPGA Emulation Platform for OpenCL(TM)");
    } else {
        platform = findPlatform("Intel(R) FPGA SDK for OpenCL(TM)");
    }
    if(platform == NULL) {
        printf("ERROR: Unable to find Intel(R) FPGA OpenCL platform.\n");
        return false;
    }

    // Print basic platform information.
    {
        char char_buffer[STRING_BUFFER_LEN] = "";
        printf("Querying platform for info:\n");
        printf("==========================\n");
        clGetPlatformInfo(platform, CL_PLATFORM_NAME, STRING_BUFFER_LEN, char_buffer, NULL);
        printf("%-40s = %s\n", "CL_PLATFORM_NAME", char_buffer);
        clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, STRING_BUFFER_LEN, char_buffer, NULL);
        printf("%-40s = %s\n", "CL_PLATFORM_VENDOR ", char_buffer);
        clGetPlatformInfo(platform, CL_PLATFORM_VERSION, STRING_BUFFER_LEN, char_buffer, NULL);
        printf("%-40s = %s\n\n", "CL_PLATFORM_VERSION ", char_buffer);
    }

    // Query the available OpenCL devices.
    scoped_array<cl_device_id> devices;
    devices.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
    if(num_devices == 0) {
        printf("ERROR: No OpenCL device found.\n");
        return false;
    }

    // Only the first device is used. The context below is created for that
    // single device and &device is handed to createProgramFromBinary as a
    // device list, so num_devices must not exceed 1 or the list would be
    // read past its single element.
    device = devices[0];
    num_devices = 1;
    display_device_info(device);

    // Create the context.
    context = clCreateContext(NULL, 1, &device, &oclContextCallback, NULL, &status);
    checkError(status, "Failed to create context");

    // Create the program from the AOCX binary.
    std::string binary_file = getBoardBinaryFile("cross", device);
    printf("Using AOCX: %s\n", binary_file.c_str());
    program = createProgramFromBinary(context, binary_file.c_str(), &device, num_devices);

    // Build the program that was just created.
    status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    checkError(status, "Failed to build program");

    // Allocate the per-device and per-kernel handle arrays.
    queue.reset(num_devices);
    kernel.reset(num_kernels);
    n_per_device.reset(num_devices);
#if USE_SVM_API == 0
    input_a_buf.reset(num_devices);
    input_b_buf.reset(num_devices);
    output_buf.reset(num_devices);
#endif /* USE_SVM_API == 0 */

    // Kernels: created once, with the status of each call checked
    // individually (previously the first status was overwritten unchecked).
    kernel[0] = clCreateKernel(program, kernel_names[2], &status);
    checkError(status, "Failed to create kernel");
    kernel[1] = clCreateKernel(program, kernel_names[1], &status);
    checkError(status, "Failed to create kernel");

    for(unsigned i = 0; i < num_devices; ++i) {
        // Command queue.
        queue[i] = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
        checkError(status, "Failed to create command queue");
        printf("kernel:%d\n",i);

        // Number of elements handled by this device; the remainder is
        // spread over the first devices.
        n_per_device[i] = N / num_devices;
        if(i < (N % num_devices)) {
            n_per_device[i]++;
        }

#if USE_SVM_API == 0
        // Device buffers.
        input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
            10000 * sizeof(float), NULL, &status);
        checkError(status, "Failed to create buffer for input A");
        input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
            1000 * sizeof(float), NULL, &status);
        checkError(status, "Failed to create buffer for input B");
        input_popu = clCreateBuffer(context, CL_MEM_READ_WRITE,
            1000 * sizeof(float), NULL, &status);
        checkError(status, "Failed to create buffer for input_popu");
#else
        cl_device_svm_capabilities caps = 0;
        // `device` is a single cl_device_id, not an array.
        status = clGetDeviceInfo(
            device,
            CL_DEVICE_SVM_CAPABILITIES,
            sizeof(cl_device_svm_capabilities),
            &caps,
            0
        );
        checkError(status, "Failed to get device info");
        if (!(caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER)) {
            printf("The host was compiled with USE_SVM_API, however the device currently being targeted does not support SVM.\n");
            // Release resources.
            cleanup();
            return false;
        }
#endif /* USE_SVM_API == 0 */
    }
    return true;
}
// Initialise the population with pseudo-random values of the form
// k / 1000000, i.e. in [0, 1).
//
// popu: caller-provided array of at least NUM * 5 floats (5 consecutive
//       floats per individual); every element is overwritten.
// Side effects: reseeds rand() from the current time and prints a banner.
void init_problem(float* popu) {
    srand((unsigned)time(NULL));
    // The population holds 5 floats per individual, so NUM * 5 entries must
    // be filled. Stopping at NUM * 4 left the last individuals uninitialised
    // even though selection and fitness evaluation read them.
    for (int i = 0; i < NUM * 5; i++) {
        popu[i] = rand() % 1000000 / 1000000.0;
    }
    printf("innt_matrix:\n");
}
void run() {
cl_int status;
const double start_time = getCurrentTimestamp();
// 为每个设备启动问题.
scoped_array<cl_event> kernel_event(num_devices);
scoped_array<cl_event> finish_event(num_devices);
cl_event write_event[2];
// 设置内核参数.
unsigned argi = 0;
//设置内核0
status = clSetKernelArg(kernel[0], argi++, sizeof(cl_mem), (void *)&input_a_buf[0]);
checkError(status, "Failed to set argument %d", argi - 1);
status|= clSetKernelArg(kernel[0], argi++, sizeof(cl_mem), (void *)&input_b_buf[0]);
checkError(status, "Failed to set argument %d", argi - 1);
argi = 0;
status = clSetKernelArg(kernel[1], argi++, sizeof(cl_mem), (void *)&input_popu);
checkError(status, "Failed to set argument %d", argi - 1);
//队列内核。
//使用与要添加的元素数量相对应的全局工作大小
//对于这个设备。我们不指定本地工作大小,让运行时选择(它将选择使用一个与全局相同大小的工作组工艺尺寸)。
//事件用于确保内核直到对输入缓冲区的写操作已经完成。
float fitne_list[NUM];//每个个体的适应度值
float popu[NUM * 5];//种群
float random_num[NUM];
cl_uint work_dims = 1;//数据维度
size_t global_work_offset = 0;//工作项i/j/k的起始值
size_t global_work_size = NUM;//各维度处理工作项的个数
int i = 0;
init_problem(popu);
cl_event evt1;
cl_event evt2;
cl_event evt3;
for(i = 0; i < NUM; i++){
show(popu);
select(popu);
float random_P = rand()%1000000/1000000.0;
printf("random_p=%f\n", random_P);
if(random_P < PC){ //概率交叉
//cross(popu);
status = clEnqueueWriteBuffer(queue[0], input_popu, CL_FALSE,
0, 1000 * sizeof(float), popu, 0, NULL, &evt1);
status = clEnqueueNDRangeKernel(queue[0], kernel[1], work_dims, NULL,
&global_work_size / 2, NULL, 1, &evt1, &evt3);
clWaitForEvents(1, &evt3);
clReleaseEvent(evt1);
clReleaseEvent(evt3);
status = clEnqueueReadBuffer(queue[0], input_popu, CL_TRUE,
0, 1000 * sizeof(float), popu, 0, NULL, NULL);
}
if(random_P > PM){ //概率变异
printf("\naaaaaaaaaaaaaaaaaaaaaahhhhh!!!! I`m mutation\n");
random_init(random_num);
printf("1\n");
status = clEnqueueWriteBuffer(queue[0], input_b_buf[0], CL_FALSE,
0, 1000 * sizeof(float), random_num, 0, NULL, &evt1);
printf("2\n");
status = clEnqueueWriteBuffer(queue[0], input_a_buf[0], CL_FALSE,
0, 1000 * sizeof(float), popu, 1, &evt1, &evt2);
printf("3\n");
status = clEnqueueNDRangeKernel(queue[0], kernel[0], work_dims, NULL,
&global_work_size, NULL, 1, &evt2, &evt3);
printf("4\n");
clWaitForEvents(1, &evt3);
clReleaseEvent(evt1);
clReleaseEvent(evt2);
clReleaseEvent(evt3);
printf("5\n");
status = clEnqueueReadBuffer(queue[0], input_a_buf[0], CL_TRUE,
0, 1000 * sizeof(float), popu, 0, NULL, NULL);
printf("6\n");
}
}
//选择最优
float max = getFitness(0,popu);
int result_index = 0;
printf("the result:");
for(i = 0; i < NUM - 1; i++){
if(getFitness((i+1)*4, popu) > max){
max = getFitness((i+1)*4, popu);
result_index = i + 1;
}
}
//打印结果
printf("[");
for(i = 0; i < 4; i++){
printf("%f, ",popu[result_index*4 + i]);
}
printf("1]\n");
}
// 释放初始化期间分配的资源
void cleanup() {
for(unsigned i = 0; i < num_devices; ++i) {
if(kernel && kernel[i]) {
clReleaseKernel(kernel[i]);
}
if(queue && queue[i]) {
clReleaseCommandQueue(queue[i]);
}
#if USE_SVM_API == 0
if(input_a_buf && input_a_buf[i]) {
clReleaseMemObject(input_a_buf[i]);
}
if(input_b_buf && input_b_buf[i]) {
clReleaseMemObject(input_b_buf[i]);
}
if(output_buf && output_buf[i]) {
clReleaseMemObject(output_buf[i]);
}
#else
if(input_a[i].get())
input_a[i].reset();
if(input_b[i].get())
input_b[i].reset();
if(output[i].get())
output[i].reset();
#endif /* USE_SVM_API == 0 */
}
if(program) {
clReleaseProgram(program);
}
if(context) {
clReleaseContext(context);
}
}
// Helper that queries one device parameter as a cl_ulong and prints it as
// "<name> = <value>".
//
// device: device to query.
// param:  clGetDeviceInfo parameter id.
// name:   label printed in front of the value.
static void device_info_ulong( cl_device_id device, cl_device_info param, const char* name) {
    // Zero-initialised so that a failed query, or a query that fills fewer
    // than sizeof(cl_ulong) bytes, does not print indeterminate stack bytes.
    cl_ulong a = 0;
    clGetDeviceInfo(device, param, sizeof(cl_ulong), &a, NULL);
    // cl_ulong is a 64-bit unsigned type; print it through unsigned long long
    // so the format matches on platforms where long is 32 bits.
    printf("%-40s = %llu\n", name, (unsigned long long)a);
}
// Helper that queries one device parameter as a cl_uint and prints it as
// "<name> = <value>".
static void device_info_uint( cl_device_id device, cl_device_info param, const char* name) {
    // Zero-initialised so a failed query prints 0 rather than an
    // indeterminate value.
    cl_uint a = 0;
    clGetDeviceInfo(device, param, sizeof(cl_uint), &a, NULL);
    printf("%-40s = %u\n", name, a);
}
// Helper that queries one device parameter as a cl_bool and prints it as
// "<name> = true" or "<name> = false".
static void device_info_bool( cl_device_id device, cl_device_info param, const char* name) {
    // Zero-initialised so a failed query prints "false" rather than
    // depending on an indeterminate value.
    cl_bool a = 0;
    clGetDeviceInfo(device, param, sizeof(cl_bool), &a, NULL);
    printf("%-40s = %s\n", name, (a?"true":"false"));
}
// Helper that queries one device parameter as a string and prints it as
// "<name> = <value>".
static void device_info_string( cl_device_id device, cl_device_info param, const char* name) {
    // Starts as an empty string so a failed query prints nothing instead of
    // reading an unterminated, uninitialised buffer.
    char a[STRING_BUFFER_LEN] = "";
    clGetDeviceInfo(device, param, sizeof(a), a, NULL);
    printf("%-40s = %s\n", name, a);
}
// Query and print OpenCL information about the device and its runtime
// environment.
//
// Each property is queried with the helper that matches its OpenCL type.
// The cache-line size, clock frequency, compute-unit count and
// constant-argument count are cl_uint properties; querying them through the
// cl_ulong helper filled only part of the 8-byte destination.
static void display_device_info( cl_device_id device ) {
    printf("Querying device for info:\n");
    printf("========================\n");
    device_info_string(device, CL_DEVICE_NAME, "CL_DEVICE_NAME");
    device_info_string(device, CL_DEVICE_VENDOR, "CL_DEVICE_VENDOR");
    device_info_uint(device, CL_DEVICE_VENDOR_ID, "CL_DEVICE_VENDOR_ID");
    device_info_string(device, CL_DEVICE_VERSION, "CL_DEVICE_VERSION");
    device_info_string(device, CL_DRIVER_VERSION, "CL_DRIVER_VERSION");
    device_info_uint(device, CL_DEVICE_ADDRESS_BITS, "CL_DEVICE_ADDRESS_BITS");
    device_info_bool(device, CL_DEVICE_AVAILABLE, "CL_DEVICE_AVAILABLE");
    device_info_bool(device, CL_DEVICE_ENDIAN_LITTLE, "CL_DEVICE_ENDIAN_LITTLE");
    device_info_ulong(device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, "CL_DEVICE_GLOBAL_MEM_CACHE_SIZE");
    device_info_uint(device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, "CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE");
    device_info_ulong(device, CL_DEVICE_GLOBAL_MEM_SIZE, "CL_DEVICE_GLOBAL_MEM_SIZE");
    device_info_bool(device, CL_DEVICE_IMAGE_SUPPORT, "CL_DEVICE_IMAGE_SUPPORT");
    device_info_ulong(device, CL_DEVICE_LOCAL_MEM_SIZE, "CL_DEVICE_LOCAL_MEM_SIZE");
    device_info_uint(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, "CL_DEVICE_MAX_CLOCK_FREQUENCY");
    device_info_uint(device, CL_DEVICE_MAX_COMPUTE_UNITS, "CL_DEVICE_MAX_COMPUTE_UNITS");
    device_info_uint(device, CL_DEVICE_MAX_CONSTANT_ARGS, "CL_DEVICE_MAX_CONSTANT_ARGS");
    device_info_ulong(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, "CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE");
    device_info_uint(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS");
    device_info_uint(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, "CL_DEVICE_MEM_BASE_ADDR_ALIGN");
    device_info_uint(device, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, "CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE");
    device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR");
    device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT");
    device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT");
    device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG");
    device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT");
    device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE");

    {
        // Zero-initialised so a failed query reports both properties as false.
        cl_command_queue_properties ccp = 0;
        clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &ccp, NULL);
        printf("%-40s = %s\n", "Command queue out of order? ", ((ccp & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)?"true":"false"));
        printf("%-40s = %s\n", "Command queue profiling enabled? ", ((ccp & CL_QUEUE_PROFILING_ENABLE)?"true":"false"));
    }
}
4、在OpenCL编程中可能遇到的问题
1、创建程序时的问题。创建程序时需要读取所有的内核文件。在书上看到一种方法:先用fopen读取内核文件,再用clCreateProgramWithSource()创建程序。我照书上的方法写过,结果会越界,在linux下会报"段错误(核心已转储)"。
因此本文在创建程序的时候用的是FPGA官方示例的创建方法:
std::string binary_file = getBoardBinaryFile("GA", device);
printf("Using AOCX: %s\n", binary_file.c_str());
program = createProgramFromBinary(context, binary_file.c_str(), &device, num_devices);
但是在getBoardBinaryFile方法里只能写一个文件名,所以如果你有多个内核需要在如GA.cl文件中include进其他内核,例如:
#include "kernel1.cl"
#include "kernel2.cl"
#include "kernel3.cl"
#include "kernel4.cl"
······
__kernel void cross(__global float *popu)
{
······
}
2、向内核传入数据、读取数据的问题。如果传入的是float a、int a这样的简单数据,可以在设置内核参数时直接按值传入(参数大小应为该变量自身的大小,而不是sizeof(cl_mem)):clSetKernelArg(kernel[0], argi++, sizeof(a), &a);
但如果要传入的是数组、指针之类涉及内存操作的数据,就需要开辟缓冲区了,
例如:
input_a_buf[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
10000 * sizeof(float), NULL, &status);
status = clSetKernelArg(kernel[0], argi++, sizeof(cl_mem), (void *)&input_a_buf[0]);
checkError(status, "Failed to set argument %d", argi - 1);
然后再用
status = clEnqueueWriteBuffer(queue[0], input_popu, CL_FALSE,
0, 1000 * sizeof(float), popu, 0, NULL, &evt1);
status = clEnqueueNDRangeKernel(queue[0], kernel[1], work_dims, NULL,
&global_work_size / 2, NULL, 1, &evt1, &evt3);
就可以传入内核了
而如果要把传入内核的数据再读出来就要再接一个
//等待内核执行完毕
clWaitForEvents(1, &evt3);
clReleaseEvent(evt1);
clReleaseEvent(evt3);
//读取数据
status = clEnqueueReadBuffer(queue[0], input_popu, CL_TRUE,
0, 1000 * sizeof(float), popu, 0, NULL, NULL);
还遇到了很多问题之后再更,如果有遇上什么问题的可以在评论里留言一起讨论