运用openCL实现基于FPGA的智能优化算法

运用openCL实现基于FPGA的数控加工参数智能优化算法

1、引言

实验需要做一个基于边缘计算的数控加工参数智能优化系统。由于FPGA的并行能力强大,因此准备用openCL实现智能优化算法。

智能优化算法的大致思想为BP神经网络 + 遗传算法。通过BP,对实验获得的大量数据(n 转速、f 进给率、ap 切割宽度、ae 切割深度、刀具磨损量,一共五个参数)进行训练,并将训练结果用作遗传算法适应度函数的算子,最终通过遗传算法的不断选择、交叉、变异,选出适应性最强的个体作为优化输出。
(由于机床实际加工原因,遗传算法只对n、f两个参数进行交叉、变异)

2、遗传算法

实验获取数据并通过BP获取遗传算法适应度函数算子的部分由实验室师兄完成,作者只负责遗传算法和openCL编写智能优化算法的部分。
由于openCL对于并行计算的部分要开内核进行内核编程,遗传算法的数据初始化、选择部分在主机(main.cpp)内完成,交叉和变异部分在内核中完成。
种群储存在一个float类型的一维数组float popu[5 * NUM]中,NUM为种群规模

  • 数据初始化
    初始化部分先通过随机数初始化,各参数通过归一化处理变成0到1之间的浮点数。
// Initialise the population with random genes.
//
// popu points to NUM individuals stored back to back, 5 floats each
// (n, f, ap, ae, tool wear), i.e. NUM * 5 floats in total. Every gene is
// drawn from rand() and scaled into [0, 1), matching the normalised inputs.
// Also seeds rand() from the wall clock, so call it once before any other
// random draw.
void init_problem(float* popu) {
  srand((unsigned)time(NULL));
  // Fill all NUM * 5 slots: the previous bound of NUM * 4 left the last NUM
  // floats uninitialised even though select()/show() read 5 floats per individual.
  for (int i = 0; i < NUM * 5; i++) {
      popu[i] = rand()%1000000/1000000.0;
  }
  printf("innt_matrix:\n");
}
  • 选择
    选择我用的是锦标赛选择方法,当然你也可以用其他的。new_popu数组用于中间计算,无实际含义。
// Tournament selection (tournament size 3).
//
// For each of the NUM slots of the next generation, three individuals are
// drawn at random and the one with the highest getFitness() value is copied
// into the slot. popu (NUM * 5 floats, 5 per individual) is overwritten in
// place with the selected generation; new_popu is only scratch space.
void select(float* popu){
   float new_popu[NUM * 5];
   for (int i = 0; i < NUM; i++) {
       // Offset of the first gene of each contestant. rand() % NUM covers the
       // whole population for any NUM; the former rand()%1000000/100000 only
       // worked for NUM == 10 and always yielded 0 when RAND_MAX is 32767.
       int a = (rand() % NUM) * 5;
       int b = (rand() % NUM) * 5;
       int c = (rand() % NUM) * 5;
       printf("\na=%d,b=%d,c=%d\n",a,b,c);
       int max = getFitness(a,popu) > getFitness(b,popu) ? a : b;
       max = getFitness(max,popu) > getFitness(c,popu) ? max : c;
       printf("max=%d\n",max);
       for (int j = 0; j < 5; j++) {
           new_popu[i*5+j] = popu[max+j];
       }
   }
   // Publish the new generation.
   for (int i = 0; i < NUM * 5; i++) {
       popu[i] = new_popu[i];
   }
   printf("\nthis is select\n");
   show(popu);
}
  • 交叉
    两两进行交叉,一共能变两个参数,且不同参数间不能进行交叉,也就只有这一种交叉方法了
    (注意:交叉用的是内核编程)
//交叉
#include "mutation.cl" 
/*
 * Crossover kernel.
 *
 * Work-item g pairs individual 2g with individual 2g+1 and swaps the gene at
 * offset 1 of each (5 floats per individual). Work-items whose pair would
 * fall outside the population return without touching memory, so the kernel
 * is safe for any global size.
 */
__kernel void cross(__global float *popu)
{
        const int NUM = 10;   /* population size; the host listing also uses 10 */
        const int GENES = 5;  /* floats per individual */

        const int first = get_global_id(0) * 2;
        const int second = first + 1;

        /* The old "% (NUM*5 - 1)" wrap sent out-of-range partners back into
         * the middle of another individual; bail out instead. */
        if (second >= NUM) {
                return;
        }

        const int index = first * GENES;    /* first gene of individual 1 */
        const int index2 = second * GENES;  /* first gene of individual 2 */

        /* Swap gene 1 between the two individuals. */
        const float temp = popu[index + 1];
        popu[index + 1] = popu[index2 + 1];
        popu[index2 + 1] = temp;
}
  • 变异
    变异用的也是内核编程,固定变异
/*
 * Mutation kernel.
 *
 * Work-item g overwrites one gene of individual g (5 floats per individual)
 * with random_num[g]. The gene offset is g % 2, so even work-items replace
 * offset 0 and odd work-items replace offset 1.
 */
__kernel void mutation(__global float *popu, __global float* random_num)
{
        const int gid = get_global_id(0);
        printf("kernel index is:%d\n",gid);

        /* Slot to mutate: first gene of individual gid, plus 0 or 1. */
        const int target = gid*5 + gid%2;
        popu[target] = random_num[gid];
}

3、openCL编写程序

openCL编程主要分为几个步骤
1、获得openCL平台、查询可用设备,创建上下文context
2、读取内核文件,创建并构建程序program
3、为每个设备创建一个命令队列(我只用一个FPGA就只创建一个就OK了)
4、创建内核(交叉、变异两个函数需要并行,因此创建两个内核)
5、创建缓冲区
6、设置内核
7、初始化种群
8、选择
9、交叉:

  • 写缓冲区clEnqueueWriteBuffer()
  • 入命令队列clEnqueueNDRangeKernel()
  • 等待前面事件完成,释放事件
  • 读缓冲区clEnqueueReadBuffer()

10、变异:

  • 写缓冲区clEnqueueWriteBuffer()
  • 入命令队列clEnqueueNDRangeKernel()
  • 等待前面事件完成,释放事件
  • 读缓冲区clEnqueueReadBuffer()

11、执行循环结束后选择适应度最高的个体并打印

注:步骤1~6和交叉变异里的缓冲区操作是所有openCL必须要执行的步骤

openCL程序完整代码:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cstring>
#include "CL/opencl.h"
#include "AOCLUtils/aocl_utils.h"

// Kernel source names and sizing constants.
#define PROGRAM_FILE_1 "matrix_init"
#define PROGRAM_FILE_2 "cross"
#define PROGRAM_FILE_3 "mutation"
#define NUM_FILE 3
// Size of the scratch buffers used when querying platform/device strings.
#define STRING_BUFFER_LEN 1024
using namespace aocl_utils;

// OpenCL runtime configuration.
// NOTE(review): program_handle, program_buffer, file_name, program_log,
// program_size and log_size are not referenced anywhere else in this listing;
// they look like leftovers from a source-based program build — confirm before removing.
FILE *program_handle;
char *program_buffer[NUM_FILE];
char *file_name[] = {PROGRAM_FILE_1, PROGRAM_FILE_2, PROGRAM_FILE_3};
char *program_log;
size_t program_size[NUM_FILE];
size_t log_size;
// Program object created from the board binary in init_opencl().
cl_program program = NULL;

// NOTE(review): these build options are not passed to clBuildProgram in this
// listing (it is called with NULL options), and main() declares a local with
// the same name.
const char options[] = "-cl-finite-math-only -cl-no-signed-zeros";

// Kernel entry-point names. init_opencl() creates kernels from indices 2
// ("mutation") and 1 ("cross"); index 0 is not used in this listing.
static const char* kernel_names[3] =
{
  "matrix_init",
  "cross",
  "mutation"
};


// Platform selected by findPlatform() in init_opencl().
static cl_platform_id platform = NULL;
// Overwritten with the device count reported by getDevices() in init_opencl().
cl_uint num_devices = 1;
// The single device actually used (first entry returned by getDevices()).
static cl_device_id device = NULL;
cl_context context = NULL;
scoped_array<cl_command_queue> queue; // num_devices elements
// NOTE(review): init_opencl() stores two kernels here, at index 0 (mutation)
// and index 1 (cross), independent of num_devices.
scoped_array<cl_kernel> kernel; // num_devices elements
#if USE_SVM_API == 0
// Device buffer bound to argument 0 of the mutation kernel (the population).
scoped_array<cl_mem> input_a_buf; // num_devices elements
// Device buffer bound to argument 1 of the mutation kernel (random numbers).
scoped_array<cl_mem> input_b_buf; // num_devices elements
// Sized in init_opencl() but no buffer is created for it in this listing.
scoped_array<cl_mem> output_buf; // num_devices elements
// Device buffer bound to argument 0 of the cross kernel (the population).
cl_mem input_popu;
#endif /* USE_SVM_API == 0 */

// Problem parameters.
// N can be overridden with the "n" command-line option; in this listing it
// only feeds n_per_device.
unsigned N = 1024;// problem size
const int NUM = 10;// population size (number of individuals)
const float PC = 0.8;// crossover probability
const float PM = 0.05;// mutation probability
// NOTE(review): input_a, input_b, output and ref_output are never sized or
// filled in this listing; only cleanup() touches them (SVM build only).
#if USE_SVM_API == 0
scoped_array<scoped_aligned_ptr<float> > input_a, input_b; // num_devices elements
scoped_array<scoped_aligned_ptr<float> > output; // num_devices elements
#else
scoped_array<scoped_SVM_aligned_ptr<float> > input_a, input_b; // num_devices elements
scoped_array<scoped_SVM_aligned_ptr<float> > output; // num_devices elements
#endif /* USE_SVM_API == 0 */
scoped_array<scoped_array<float> > ref_output; // num_devices elements
scoped_array<unsigned> n_per_device; // num_devices elements

// Selects the fast emulator platform instead of the FPGA SDK platform;
// set from the "fast-emulator" command-line option.
bool use_fast_emulator = false;

// Forward declarations.
// NOTE(review): rand_float() has no definition in this listing.
float rand_float();
bool init_opencl();
// NOTE(review): the definition further down takes a float* parameter, so in
// C++ this declares a different (undefined) overload.
void init_problem();
void run();
void cleanup();// releases the OpenCL resources
static void display_device_info( cl_device_id device );
static void device_info_ulong( cl_device_id device, cl_device_info param, const char* name);// helper that prints one parameter returned by an OpenCL device query

// Fitness of one individual: the sum of the four floats starting at
// popu[index], added left to right. index is the offset of the individual's
// first gene inside the flat population array.
float getFitness(int index, float* popu){
    const float* gene = popu + index;
    float total = gene[0];
    for (int k = 1; k < 4; k++) {
        total = total + gene[k];
    }
    return total;
}

// Dump the population to stdout: one line per individual with its five
// genes in "%f " format, followed by a separator line.
void show(float* popu){
    for (int row = 0; row < NUM; row++) {
        const float* gene = popu + row*5;
        for (int col = 0; col < 5; col++) {
            printf("%f ", gene[col]);
        }
        printf("\n");
    }
    printf("============================================\n");
}
// Tournament selection (tournament size 3).
//
// For each of the NUM slots of the next generation, three individuals are
// drawn at random and the one with the highest getFitness() value is copied
// into the slot. popu (NUM * 5 floats, 5 per individual) is overwritten in
// place with the selected generation; new_popu is only scratch space.
void select(float* popu){
    float new_popu[NUM * 5];
    for (int i = 0; i < NUM; i++) {
        // Offset of the first gene of each contestant. rand() % NUM covers the
        // whole population for any NUM; the former rand()%1000000/100000 only
        // worked for NUM == 10 and always yielded 0 when RAND_MAX is 32767.
        int a = (rand() % NUM) * 5;
        int b = (rand() % NUM) * 5;
        int c = (rand() % NUM) * 5;
        printf("\na=%d,b=%d,c=%d\n",a,b,c);
        int max = getFitness(a,popu) > getFitness(b,popu) ? a : b;
        max = getFitness(max,popu) > getFitness(c,popu) ? max : c;
        printf("max=%d\n",max);
        for (int j = 0; j < 5; j++) {
            new_popu[i*5+j] = popu[max+j];
        }
    }
    // Publish the new generation.
    for (int i = 0; i < NUM * 5; i++) {
        popu[i] = new_popu[i];
    }
    printf("\nthis is select\n");
    show(popu);
}

// Fill random_num[0 .. NUM-1] with fresh rand()-derived values
// (rand() % 1000000 scaled by 1e-6) and echo each one to stdout.
// These values are what the mutation kernel writes into the population.
void random_init(float* random_num){
    for (float* slot = random_num; slot != random_num + NUM; ++slot) {
        *slot = rand()%1000000/1000000.0;
        printf("random_num is:%f\n",*slot);
    }
}


int main(int argc, char **argv) {
 Options options(argc, argv);

  // 指定问题大小的可选参数
  if(options.has("n")) {
    N = options.get<unsigned>("n");
  }
  
  // 可选参数指定是否应使用快速仿真器
  if(options.has("fast-emulator")) {
    use_fast_emulator = options.get<bool>("fast-emulator");
  }



  // 初始化 OpenCL.
  if(!init_opencl()) {
    return -1;
  }

  
  // 运行内核
  run();

  // 释放分配的资源
  cleanup();

  return 0;
}



// Bring up the OpenCL runtime for a single device.
//
// Steps: locate the Intel FPGA platform (emulator or hardware, depending on
// use_fast_emulator), pick the first device it reports, create the context,
// load and build the program from the board binary, then create the command
// queue, the two kernels and the device buffers.
//
// Returns true on success. Returns false when the working directory cannot
// be set, the platform is missing, or (SVM build) the device lacks SVM
// support. OpenCL failures are reported through checkError().
bool init_opencl(){
  cl_int status;

  printf("Initializing OpenCL\n");

  if(!setCwdToExeDir()) {
    return false;
  }

  // Locate the OpenCL platform.
  if (use_fast_emulator) {
    platform = findPlatform("Intel(R) FPGA Emulation Platform for OpenCL(TM)");
  } else {
    platform = findPlatform("Intel(R) FPGA SDK for OpenCL(TM)");
  }
  if(platform == NULL) {
    printf("ERROR: Unable to find Intel(R) FPGA OpenCL platform.\n");
    return false;
  }
  {
    char char_buffer[STRING_BUFFER_LEN];
    printf("Querying platform for info:\n");
    printf("==========================\n");
    clGetPlatformInfo(platform, CL_PLATFORM_NAME, STRING_BUFFER_LEN, char_buffer, NULL);
    printf("%-40s = %s\n", "CL_PLATFORM_NAME", char_buffer);
    clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, STRING_BUFFER_LEN, char_buffer, NULL);
    printf("%-40s = %s\n", "CL_PLATFORM_VENDOR ", char_buffer);
    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, STRING_BUFFER_LEN, char_buffer, NULL);
    printf("%-40s = %s\n\n", "CL_PLATFORM_VERSION ", char_buffer);
  }

  // Query the available devices and keep only the first one.
  scoped_array<cl_device_id> devices;
  devices.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
  device = devices[0];
  // Everything below (context, program, queue, buffers) is built for this
  // one device, and &device is passed where an array of num_devices handles
  // is expected, so the count must be 1.
  num_devices = 1;
  display_device_info(device);

  // Create the context.
  context = clCreateContext(NULL, 1, &device, &oclContextCallback, NULL, &status);
  checkError(status, "Failed to create context");

  // Create the program from the precompiled board binary.
  std::string binary_file = getBoardBinaryFile("cross", device);
  printf("Using AOCX: %s\n", binary_file.c_str());
  program = createProgramFromBinary(context, binary_file.c_str(), &device, num_devices);

  // Build the program that was just created.
  status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
  checkError(status, "Failed to build program");

  // Allocate the per-device handle arrays.
  queue.reset(num_devices);
  // Two kernels are stored (index 0: mutation, index 1: cross), so the array
  // needs two slots regardless of the device count.
  // NOTE: cleanup() iterates up to num_devices and therefore releases only
  // the first of them.
  kernel.reset(2);
  n_per_device.reset(num_devices);
#if USE_SVM_API == 0
  input_a_buf.reset(num_devices);
  input_b_buf.reset(num_devices);
  output_buf.reset(num_devices);
#endif /* USE_SVM_API == 0 */

  // Kernels: check each creation separately so a failure of the first one is
  // not masked by the second call overwriting status.
  kernel[0] = clCreateKernel(program, kernel_names[2], &status);
  checkError(status, "Failed to create kernel");
  kernel[1] = clCreateKernel(program, kernel_names[1], &status);
  checkError(status, "Failed to create kernel");

  for(unsigned i = 0; i < num_devices; ++i) {
    // Command queue.
    queue[i] = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
    checkError(status, "Failed to create command queue");

    printf("kernel:%d\n",i);

    // Number of elements handled by this device; the remainder is spread
    // over the first devices.
    n_per_device[i] = N / num_devices;
    if(i < (N % num_devices)) {
      n_per_device[i]++;
    }

#if USE_SVM_API == 0
    // Device buffers for the mutation kernel.
    input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
        10000 * sizeof(float), NULL, &status);
    checkError(status, "Failed to create buffer for input A");

    input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
        1000 * sizeof(float), NULL, &status);
    checkError(status, "Failed to create buffer for input B");

    // Device buffer for the cross kernel.
    input_popu = clCreateBuffer(context, CL_MEM_READ_WRITE,
        1000 * sizeof(float), NULL, &status);
    checkError(status, "Failed to create buffer for input_popu");
#else
    cl_device_svm_capabilities caps = 0;

    // device is a single handle, not an array, so it is not indexed here.
    status = clGetDeviceInfo(
      device,
      CL_DEVICE_SVM_CAPABILITIES,
      sizeof(cl_device_svm_capabilities),
      &caps,
      0
    );
    checkError(status, "Failed to get device info");

    if (!(caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER)) {
      printf("The host was compiled with USE_SVM_API, however the device currently being targeted does not support SVM.\n");
      // Release what has been created so far.
      cleanup();
      return false;
    }
#endif /* USE_SVM_API == 0 */
  }

  return true;
}
    
// Initialise the population with random genes.
//
// popu points to NUM individuals stored back to back, 5 floats each,
// i.e. NUM * 5 floats in total. Every gene is drawn from rand() and scaled
// into [0, 1). Also seeds rand() from the wall clock, so call it once before
// any other random draw.
void init_problem(float* popu) {
  srand((unsigned)time(NULL));
  // Fill all NUM * 5 slots: the previous bound of NUM * 4 left the last NUM
  // floats uninitialised even though select()/show() read 5 floats per individual.
  for (int i = 0; i < NUM * 5; i++) {
      popu[i] = rand()%1000000/1000000.0;
  }
  printf("innt_matrix:\n");
}

void run() {
  cl_int status;

  const double start_time = getCurrentTimestamp();

  // 为每个设备启动问题.
  scoped_array<cl_event> kernel_event(num_devices);
  scoped_array<cl_event> finish_event(num_devices);


    cl_event write_event[2];

    // 设置内核参数.
    unsigned argi = 0;
    //设置内核0
    status = clSetKernelArg(kernel[0], argi++, sizeof(cl_mem), (void *)&input_a_buf[0]);
    checkError(status, "Failed to set argument %d", argi - 1);

    status|= clSetKernelArg(kernel[0], argi++, sizeof(cl_mem), (void *)&input_b_buf[0]);
    checkError(status, "Failed to set argument %d", argi - 1);
    
    argi = 0;
    status = clSetKernelArg(kernel[1], argi++, sizeof(cl_mem), (void *)&input_popu);
    checkError(status, "Failed to set argument %d", argi - 1);
    
 
  //队列内核。
  //使用与要添加的元素数量相对应的全局工作大小
  //对于这个设备。我们不指定本地工作大小,让运行时选择(它将选择使用一个与全局相同大小的工作组工艺尺寸)。
  //事件用于确保内核直到对输入缓冲区的写操作已经完成。
    float fitne_list[NUM];//每个个体的适应度值
    float popu[NUM * 5];//种群
    float random_num[NUM];
    
    cl_uint work_dims = 1;//数据维度
    size_t global_work_offset = 0;//工作项i/j/k的起始值
    size_t global_work_size = NUM;//各维度处理工作项的个数
    int i = 0;
    
    init_problem(popu);
    cl_event evt1;
    cl_event evt2;
    cl_event evt3;
  for(i = 0; i < NUM; i++){
        show(popu);
        select(popu);
        float random_P = rand()%1000000/1000000.0;
        printf("random_p=%f\n", random_P);
        if(random_P < PC){    //概率交叉
            //cross(popu);
            status = clEnqueueWriteBuffer(queue[0], input_popu, CL_FALSE,
                        0, 1000 * sizeof(float), popu, 0, NULL, &evt1);
                        
            status = clEnqueueNDRangeKernel(queue[0], kernel[1], work_dims, NULL,
                      &global_work_size / 2, NULL, 1, &evt1, &evt3);
           
            clWaitForEvents(1, &evt3);
            clReleaseEvent(evt1); 
            clReleaseEvent(evt3);
            
            status = clEnqueueReadBuffer(queue[0], input_popu, CL_TRUE,
                     0, 1000 * sizeof(float), popu, 0, NULL, NULL);
        }
        if(random_P > PM){    //概率变异
           printf("\naaaaaaaaaaaaaaaaaaaaaahhhhh!!!! I`m mutation\n");
            random_init(random_num);
            printf("1\n");
            status = clEnqueueWriteBuffer(queue[0], input_b_buf[0], CL_FALSE,
                     0, 1000 * sizeof(float), random_num, 0, NULL, &evt1);
            printf("2\n");
            status = clEnqueueWriteBuffer(queue[0], input_a_buf[0], CL_FALSE,
                    0, 1000 * sizeof(float), popu, 1, &evt1, &evt2);
            printf("3\n");
            status = clEnqueueNDRangeKernel(queue[0], kernel[0], work_dims, NULL,
                      &global_work_size, NULL, 1, &evt2, &evt3);
            printf("4\n");          
            clWaitForEvents(1, &evt3);
            clReleaseEvent(evt1);
            clReleaseEvent(evt2);
            clReleaseEvent(evt3);
            printf("5\n");          
            status = clEnqueueReadBuffer(queue[0], input_a_buf[0], CL_TRUE,
                     0, 1000 * sizeof(float), popu, 0, NULL, NULL);
            printf("6\n");
        }
    }
    //选择最优
    float max = getFitness(0,popu);
    int result_index = 0;
    printf("the result:");
    for(i = 0; i < NUM - 1; i++){
        if(getFitness((i+1)*4, popu) > max){
            max = getFitness((i+1)*4, popu);
            result_index = i + 1; 
        }
    }
    //打印结果
    printf("[");
    for(i = 0; i < 4; i++){
        printf("%f, ",popu[result_index*4 + i]);
    }
    printf("1]\n");
  
  
  
  
  
 
}

// 释放初始化期间分配的资源
void cleanup() {
  for(unsigned i = 0; i < num_devices; ++i) {
    if(kernel && kernel[i]) {
      clReleaseKernel(kernel[i]);
    }
    if(queue && queue[i]) {
      clReleaseCommandQueue(queue[i]);
    }
#if USE_SVM_API == 0
    if(input_a_buf && input_a_buf[i]) {
      clReleaseMemObject(input_a_buf[i]);
    }
    if(input_b_buf && input_b_buf[i]) {
      clReleaseMemObject(input_b_buf[i]);
    }
    if(output_buf && output_buf[i]) {
      clReleaseMemObject(output_buf[i]);
    }
#else
    if(input_a[i].get())
      input_a[i].reset();
    if(input_b[i].get())
      input_b[i].reset();
    if(output[i].get())
      output[i].reset();
#endif /* USE_SVM_API == 0 */
  }

  if(program) {
    clReleaseProgram(program);
  }
  if(context) {
    clReleaseContext(context);
  }
}

// Helper that prints one device attribute returned by clGetDeviceInfo as an
// unsigned 64-bit number, left-aligned under the label `name`.
static void device_info_ulong( cl_device_id device, cl_device_info param, const char* name) {
   // Start from zero: if the query fails, or fills fewer than 8 bytes, the
   // remaining bytes would otherwise be indeterminate.
   cl_ulong a = 0;
   clGetDeviceInfo(device, param, sizeof(cl_ulong), &a, NULL);
   // cl_ulong is 64 bits wide; %lu is only correct where long is 64 bits.
   printf("%-40s = %llu\n", name, (unsigned long long)a);
}
// Helper that prints one cl_uint device attribute returned by
// clGetDeviceInfo, left-aligned under the label `name`.
static void device_info_uint( cl_device_id device, cl_device_info param, const char* name) {
   // Zero-initialised so a failed query prints 0 instead of an indeterminate value.
   cl_uint a = 0;
   clGetDeviceInfo(device, param, sizeof(cl_uint), &a, NULL);
   printf("%-40s = %u\n", name, a);
}
// Helper that prints one cl_bool device attribute returned by
// clGetDeviceInfo as "true" or "false", left-aligned under the label `name`.
static void device_info_bool( cl_device_id device, cl_device_info param, const char* name) {
   // Defaults to false so a failed query does not print an indeterminate value.
   cl_bool a = CL_FALSE;
   clGetDeviceInfo(device, param, sizeof(cl_bool), &a, NULL);
   printf("%-40s = %s\n", name, (a?"true":"false"));
}
// Helper that prints one string device attribute returned by
// clGetDeviceInfo, left-aligned under the label `name`.
static void device_info_string( cl_device_id device, cl_device_info param, const char* name) {
   // Start with an empty string so a failed query prints nothing rather than
   // whatever bytes happen to be on the stack.
   char a[STRING_BUFFER_LEN] = "";
   clGetDeviceInfo(device, param, STRING_BUFFER_LEN, a, NULL);
   printf("%-40s = %s\n", name, a);
}

// Query the device and print a table of its OpenCL properties to stdout.
//
// Each attribute is printed through the helper matching the type the OpenCL
// specification assigns to it: cl_uint attributes go through
// device_info_uint, cl_ulong attributes through device_info_ulong. Reading a
// 4-byte attribute into an 8-byte variable left half of the printed value
// undefined.
static void display_device_info( cl_device_id device ) {

   printf("Querying device for info:\n");
   printf("========================\n");
   device_info_string(device, CL_DEVICE_NAME, "CL_DEVICE_NAME");
   device_info_string(device, CL_DEVICE_VENDOR, "CL_DEVICE_VENDOR");
   device_info_uint(device, CL_DEVICE_VENDOR_ID, "CL_DEVICE_VENDOR_ID");
   device_info_string(device, CL_DEVICE_VERSION, "CL_DEVICE_VERSION");
   device_info_string(device, CL_DRIVER_VERSION, "CL_DRIVER_VERSION");
   device_info_uint(device, CL_DEVICE_ADDRESS_BITS, "CL_DEVICE_ADDRESS_BITS");
   device_info_bool(device, CL_DEVICE_AVAILABLE, "CL_DEVICE_AVAILABLE");
   device_info_bool(device, CL_DEVICE_ENDIAN_LITTLE, "CL_DEVICE_ENDIAN_LITTLE");
   device_info_ulong(device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, "CL_DEVICE_GLOBAL_MEM_CACHE_SIZE");
   device_info_uint(device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, "CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE");
   device_info_ulong(device, CL_DEVICE_GLOBAL_MEM_SIZE, "CL_DEVICE_GLOBAL_MEM_SIZE");
   device_info_bool(device, CL_DEVICE_IMAGE_SUPPORT, "CL_DEVICE_IMAGE_SUPPORT");
   device_info_ulong(device, CL_DEVICE_LOCAL_MEM_SIZE, "CL_DEVICE_LOCAL_MEM_SIZE");
   device_info_uint(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, "CL_DEVICE_MAX_CLOCK_FREQUENCY");
   device_info_uint(device, CL_DEVICE_MAX_COMPUTE_UNITS, "CL_DEVICE_MAX_COMPUTE_UNITS");
   device_info_uint(device, CL_DEVICE_MAX_CONSTANT_ARGS, "CL_DEVICE_MAX_CONSTANT_ARGS");
   device_info_ulong(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, "CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE");
   device_info_uint(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS");
   device_info_uint(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, "CL_DEVICE_MEM_BASE_ADDR_ALIGN");
   device_info_uint(device, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, "CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE");
   device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR");
   device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT");
   device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT");
   device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG");
   device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT");
   device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE");
   {
      // Zero-initialised so a failed query reports both flags as false.
      cl_command_queue_properties ccp = 0;
      clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &ccp, NULL);
      printf("%-40s = %s\n", "Command queue out of order? ", ((ccp & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)?"true":"false"));
      printf("%-40s = %s\n", "Command queue profiling enabled? ", ((ccp & CL_QUEUE_PROFILING_ENABLE)?"true":"false"));
   }
}

4、在openCL编程中可能遇到的问题

1、创建程序时的问题。创建程序时需要读取所有的内核文件。在书上看到有一种方法是先用fopen读取内核文件,再用clCreateProgramWithSource()创建程序。我照书上的方法敲过,会越界,在linux下会报“段错误(核心已转储)”。
因此本文在创建程序的时候用的是FPGA官方示例的创建方法:

  std::string binary_file = getBoardBinaryFile("GA", device);
  printf("Using AOCX: %s\n", binary_file.c_str());
  program = createProgramFromBinary(context, binary_file.c_str(), &device, num_devices);

但是在getBoardBinaryFile方法里只能写一个文件名,所以如果你有多个内核需要在如GA.cl文件中include进其他内核,例如:

#include "kernel1.cl"
#include "kernel2.cl"
#include "kernel3.cl"
#include "kernel4.cl"
······ 
__kernel void cross(__global float *popu)
{
     ······
}

2、向内核传入数据、读取数据的问题。如果传入的是float a、int a这样的简单数据,可以在设置内核参数时直接传入该变量的大小和地址:clSetKernelArg(kernel[0], argi++, sizeof(a), &a);
但如果要传入的是数组、指针之类涉及内存操作的数据,就需要开辟缓冲区了,
例如:

input_a_buf[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, 
        10000 * sizeof(float), NULL, &status);
status = clSetKernelArg(kernel[0], argi++, sizeof(cl_mem), (void *)&input_a_buf[0]);
    checkError(status, "Failed to set argument %d", argi - 1);  

然后再用

status = clEnqueueWriteBuffer(queue[0], input_popu, CL_FALSE,
                        0, 1000 * sizeof(float), popu, 0, NULL, &evt1);
                        
            status = clEnqueueNDRangeKernel(queue[0], kernel[1], work_dims, NULL,
                      &global_work_size / 2, NULL, 1, &evt1, &evt3);

就可以传入内核了
而如果要把传入内核的数据再读出来就要再接一个

//等待内核执行完毕
clWaitForEvents(1, &evt3);
clReleaseEvent(evt1); 
clReleaseEvent(evt3);
//读取数据            
status = clEnqueueReadBuffer(queue[0], input_popu, CL_TRUE,
               0, 1000 * sizeof(float), popu, 0, NULL, NULL);

还遇到了很多问题之后再更,如果有遇上什么问题的可以在评论里留言一起讨论

Mon 06 Mon 13 Mon 20 学习 进行中 计划一 计划二 现有任务 Adding GANTT diagram functionality to mermaid
  • 1
    点赞
  • 11
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值