OpenCL sample code


// g++  -g  -o 1 cl-sample.cpp         -lOpenCL -IMali_OpenCL_SDK_v1.1.0/include   -std=c++11

//http://gameeer.com/2015/01/09/opencl-env/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <CL/opencl.h>
#include <iostream>
// OpenCL kernel. Each work item takes care of one element of c
/* const char *kernelSource =                                          "\n" \  
    "__kernel void vecAdd(  __global float *a,           \n" \
    "                       __global float *b,                       \n" \
    "                       __global float *c,                       \n" \
    "                       const unsigned int n)                    \n" \
    "{                                                               \n" \
    "    //Get our global thread ID                                  \n" \
    "    int id = get_global_id(0);                                  \n" \
    "                                                                \n" \
    "    //Make sure we do not go out of bounds                      \n" \
    "    if (id < n)                                                 \n" \
    "        c[id] = a[id] + b[id];                                  \n" \
    "}                                                               \n" \
    "\n" ;


*/

// OpenCL C source of the "vecMul" kernel, handed to clCreateProgramWithSource.
// Every work-item multiplies one pair of elements: c[id] = a[id] * b[id].
// Work-items whose global id is not below n do nothing.
// Adjacent string literals are concatenated by the compiler, so no line
// continuations are needed.
const char *kernelSource =
    "\n"
    "__kernel void vecMul(  __global int *a,           \n"
    "                       __global int *b,                       \n"
    "                       __global int *c,                       \n"
    "                       const unsigned int n)                    \n"
    "{                                                               \n"
    "    //Get our global thread ID                                  \n"
    "    int id = get_global_id(0);                                  \n"
    "                                                                \n"
    "    //Make sure we do not go out of bounds                      \n"
    "    if (id < n)                                                 \n"
    "        c[id] = a[id] * b[id];                                  \n"
    "}                                                               \n"
    "\n";



   
// Use a static data size for simplicity   
//   
// IMAGE_X_PIXELS * IMAGE_Y_PIXELS is the number of float elements in each
// host image buffer and in each device buffer used by the "Limpid" kernel;
// the two values are also the global work size of that kernel's 2D launch.
#define IMAGE_X_PIXELS          176   
#define IMAGE_Y_PIXELS          144   
// Weights multiplied with image1 and image2 respectively when main()
// recomputes the expected output on the host. The kernel source string
// carries its own copies of IMAGE_Y_PIXELS and of both weights, so the
// values must be kept in sync by hand.
#define IMAGE_SOURCE1_LIMPID    0.5f   
#define IMAGE_SOURCE2_LIMPID    0.5f   

   

// OpenCL C source of the "Limpid" kernel, handed to clCreateProgramWithSource.
// It is launched over a 2D range; the work-item at (x, y) writes
//   output[x][y] = image1[x][y] * IMAGE_SOURCE1_LIMPID
//                + image2[x][y] * IMAGE_SOURCE2_LIMPID
// i.e. a weighted sum of the two inputs. The kernel performs no bounds
// check. The #define lines inside the string are the kernel's own copies of
// the constants; the host-side macros are not visible to the OpenCL compiler.
const char *KernelSource1 =
"\n"
"#define IMAGE_Y_PIXELS   144                                           \n"
"#define IMAGE_SOURCE1_LIMPID   0.5f                                    \n"
"#define IMAGE_SOURCE2_LIMPID   0.5f                                    \n"
"                                                                       \n"
"__kernel void Limpid(                                                  \n"
"   __global float image1[][IMAGE_Y_PIXELS],                            \n"
"   __global float image2[][IMAGE_Y_PIXELS],                            \n"
"   __global float output[][IMAGE_Y_PIXELS])                            \n"
"{                                                                      \n"
"   int x = get_global_id(0);                                           \n"
"   int y = get_global_id(1);                                           \n"
"   output[x][y] = image1[x][y] * IMAGE_SOURCE1_LIMPID + image2[x][y] * IMAGE_SOURCE2_LIMPID; \n"
"}                                                                      \n"
"\n";


  


int main( int argc, char* argv[] )  
{
    int len = 10;
    cl_mem d_a;
    cl_mem d_b;
    cl_mem d_c;
    cl_platform_id cpPlatform;        // OpenCL 平台
    cl_device_id device_id;           // device ID
    cl_context context,context1;               // context
    cl_command_queue queue,queue1;           // command queue
    cl_program program,program1;               // program
    cl_kernel kernel,kernel1;                 // kernel
    size_t bytes = len*sizeof(int);
    /*h_a = (int*)malloc(bytes);
    h_b = (int*)malloc(bytes);
    h_c = (int*)malloc(bytes);*/
    size_t globalSize, localSize;
    cl_int err;
    localSize = 2;
    globalSize = (size_t)ceil(len/(float)localSize)*localSize;
    





      
    float *image1, *image2;             // original data set given to device   
    float *results;                     // results returned from device   
    unsigned int correct;               // number of correct results returned   
  
    size_t global;                      // global domain size for our calculation   
    size_t local;                       // local domain size for our calculation   
    cl_mem input1, input2;              // device memory used for the input array   
    cl_mem output;                      // device memory used for the output array   
  
  
  
// Initialize the original data buffer and the result buffer   
    image1 = (float*)malloc(IMAGE_X_PIXELS * IMAGE_Y_PIXELS * sizeof(*image1));   
    image2 = (float*)malloc(IMAGE_X_PIXELS * IMAGE_Y_PIXELS * sizeof(*image2));   
    results = (float*)malloc(IMAGE_X_PIXELS * IMAGE_Y_PIXELS * sizeof(*results));   
    const unsigned int count = IMAGE_X_PIXELS * IMAGE_Y_PIXELS;   
  
// Automatically generate 2 images   
    for(int i = 0; i < count; i++) {   
        image1[i] = rand() / (float)RAND_MAX;   
        image2[i] = rand() / (float)RAND_MAX;   
    }   










    err = clGetPlatformIDs(1, &cpPlatform, NULL);
    err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    context1 = clCreateContext(0, 1, &device_id, NULL, NULL, &err);

    queue = clCreateCommandQueue(context, device_id, 0, &err);
queue1 = clCreateCommandQueue(context1, device_id, 0, &err);
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err);
program1 = clCreateProgramWithSource(context1, 1, (const char **) & KernelSource1, NULL, &err);
    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    clBuildProgram(program1, 0, NULL, NULL, NULL, NULL);
    kernel = clCreateKernel(program, "vecMul", &err);
    kernel1 = clCreateKernel(program1, "Limpid", &err);

    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, bytes, NULL, NULL);
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, bytes, NULL, NULL);
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, bytes, NULL, NULL);
    // (将向量信息写入设备缓冲)
    /*err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
        bytes, h_a, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
        bytes, h_b, 0, NULL, NULL);
    // (设置计算内核的参数)*/
    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
    err = clSetKernelArg(kernel, 3, sizeof(int), &len);



int *mappedBuffer_a = NULL;
int *mappedBuffer_b = NULL;
int *mappedBuffer_c = NULL;

mappedBuffer_a = (int *)clEnqueueMapBuffer(queue, d_a, CL_TRUE, CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, NULL);
mappedBuffer_b = (int *)clEnqueueMapBuffer(queue, d_b, CL_TRUE, CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, NULL);
mappedBuffer_c = (int *)clEnqueueMapBuffer(queue, d_c, CL_TRUE, CL_MAP_READ, 0, bytes, 0, NULL, NULL, NULL);
clFinish(queue);






// Create the input and output arrays in device memory for our calculation   
    //   
    input1 = clCreateBuffer(context1,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);   
    input2 = clCreateBuffer(context1,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);   
    output = clCreateBuffer(context1, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);   
    if (!input1 || !input2 || !output)   
    {   
printf("Error: Failed to allocate device memory!\n");   
        exit(1);   
    }       
  
// Write our data set into the input array in device memory    
    //   
    err = clEnqueueWriteBuffer(queue1, input1, CL_TRUE, 0, sizeof(float) * count, image1, 0, NULL, NULL);   
    err |= clEnqueueWriteBuffer(queue1, input2, CL_TRUE, 0, sizeof(float) * count, image2, 0, NULL, NULL);   
    if (err != CL_SUCCESS)   
    {   
printf("Error: Failed to write to source array!\n");   
        exit(1);   
    }   
  
// Set the arguments to our compute kernel   
    //   
    err = 0;   
    err = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &input1);   
    err |= clSetKernelArg(kernel1, 1, sizeof(cl_mem), &input2);   
    err |= clSetKernelArg(kernel1, 2, sizeof(cl_mem), &output);   
    if (err != CL_SUCCESS)   
    {   
printf("Error: Failed to set kernel arguments! %d\n", err);   
        exit(1);   
    }   
  
// Get the maximum work group size for executing the kernel on the device   
    //   
    err = clGetKernelWorkGroupInfo(kernel1, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);   
    if (err != CL_SUCCESS)   
    {   
printf("Error: Failed to retrieve kernel work group info! %d\n", err);   
        exit(1);   
    }   
    else   
printf("The number of work items in a work group is: %lu\r\n", local);   
  
// Execute the kernel over the entire range of our 1d input data set   
// using the maximum number of work group items for this device   
    //   
    global = count; 






















for (int j=1;j<10;j++) {

     for( int i = 0; i < len; i++ )
    {
        mappedBuffer_a[i] = j+1;
	mappedBuffer_b[i] = j;
    }

    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,0, NULL, NULL);
    clFinish(queue);
    /*clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
        bytes, h_c, 0, NULL, NULL );
    float sum = 0;
    for(i=0; i<n; i++)
        printf("%d =====",h_c[i]);
    */

	 std::cout<<"+++c=(a*b)++++"<<std::endl;
         for( int i = 0; i < len; i++ )
    {
        std::cout<<mappedBuffer_c[i]<<"=("<< mappedBuffer_a[i]<<"*"<<mappedBuffer_b[i]<<") |  ";
    }
         std::cout<<"\n"<<"========"<<std::endl;


}





  err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,0, NULL, NULL);
   size_t globalSize1[2] = {IMAGE_X_PIXELS, IMAGE_Y_PIXELS};
   size_t  localSize1[2]={22, 12};
   err = clEnqueueNDRangeKernel(queue1, kernel1, 2, NULL, globalSize1, localSize1,0, NULL, NULL);
    if (err)   
    {   
printf("Error: Failed to execute kernel!\n");   
return EXIT_FAILURE;   
    }   
  
// Wait for the command commands to get serviced before reading back results   
    //   
    clFinish(queue1);   
  
// Read back the results from the device to verify the output   
    //   
    err = clEnqueueReadBuffer(queue1, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );     
    if (err != CL_SUCCESS)   
    {   
printf("Error: Failed to read output array! %d\n", err);   
        exit(1);   
    }   
  
// Validate our results   
    //   
    correct = 0;   
    for(int i = 0; i < count; i++)   
    {   
        if(results[i] == image1[i] * IMAGE_SOURCE1_LIMPID  + image2[i] * IMAGE_SOURCE2_LIMPID)   
            correct++;   
    }   
  
// Print a brief summary detailing the results   
    //   
printf("Computed '%d/%d' correct values!\n", correct, count);   









clEnqueueUnmapMemObject(queue, d_c, mappedBuffer_c, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, d_a, mappedBuffer_a, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, d_b, mappedBuffer_b, 0, NULL, NULL);
clFinish(queue);
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    /*free(h_a);
    free(h_b);
    free(h_c);
    return 0;*/

clReleaseMemObject(input1);   
clReleaseMemObject(input2);   
clReleaseMemObject(output);   
clReleaseProgram(program1);   
clReleaseKernel(kernel1);   
clReleaseCommandQueue(queue1);   
clReleaseContext(context1);   
  
free(image1);   
free(image2);   
free(results);   




}




  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值