一、BasicDebug
main:
#include <stdio.h>
#include <stdlib.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>
#include <CL/cl.h>
#define SUCCESS 0
#define FAILURE 1
#define EXPECTED_FAILURE 2
#define GlobalThreadSize 256
#define GroupSize 64
using namespace std;
int main()
{
//set up OpenCL...
cl_uint platformNum;
cl_int status;
status=clGetPlatformIDs(0,NULL,&platformNum);
if(status!=CL_SUCCESS){
printf("cannot get platforms number.\n");
return -1;
}
cl_platform_id* platforms;
platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);
status=clGetPlatformIDs(platformNum,platforms,NULL);
if(status!=CL_SUCCESS){
printf("cannot get platforms addresses.\n");
return -1;
}
cl_platform_id platformInUse=platforms[0];
cl_device_id device;
status=clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_DEFAULT,1,&device,NULL);
cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,&status);
cl_command_queue_properties prop=0; //CL_QUEUE_PROFILING_ENABLE;
cl_command_queue_properties *propers;
propers=∝
cl_command_queue commandQueue=clCreateCommandQueueWithProperties(context,device,propers, &status);
std::ifstream srcFile("/home/jumper/OpenCL_projects/AMD-Sample-BasicDebug/BasicDebug_Kernel.cl");
std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));
const char * src = srcProg.c_str();
size_t srclength = srcProg.length();
cl_program program=clCreateProgramWithSource(context,1,&src,&srclength,&status);
char buildoption[128];
sprintf(buildoption,"-g -D WGSIZE=%d",GroupSize);
status=clBuildProgram(program,1,&device,buildoption,NULL,&status);
if (status != CL_SUCCESS)
{
cout<<"error:Build BasicDebug_Kernel()..."<<endl;
return(EXIT_FAILURE);
}
//set input data
cl_uint inputSizeBytes = GlobalThreadSize * sizeof(cl_uint);
cl_float *input = (cl_float *) malloc(inputSizeBytes);
for(int i=0;i< GlobalThreadSize;i++)
{
input[i] = (float)i;
}
//create input buffer
cl_mem inputBuffer = clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,sizeof(cl_uint) * GlobalThreadSize,(void *)input,&status);
if (status != CL_SUCCESS)
{
std::cout<<"Error: Creating input buffer failed!"<<std::endl;
return FAILURE;
}
//create output buffer
cl_mem outputBuffer = clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(cl_uint) * GlobalThreadSize,NULL, &status);
if (status != CL_SUCCESS)
{
std::cout<<"Error: Creating output buffer failed!"<<std::endl;
return FAILURE;
}
cl_kernel kernel1 = clCreateKernel(program, "printfKernel", &status);
if (status != CL_SUCCESS)
{
std::cout<<"Error: Creating printfKernel failed!"<<std::endl;
return FAILURE;
}
//set kernel args.
status = clSetKernelArg(kernel1, 0, sizeof(cl_mem), (void *)&inputBuffer);
//create debug kernel
cl_kernel kernel2 = clCreateKernel(program, "debugKernel2", &status);
if (status != CL_SUCCESS)
{
std::cout<<"Error: Creating debugKernel2 failed!"<<std::endl;
return FAILURE;
}
//set kernel args.
status = clSetKernelArg(kernel2, 0, sizeof(cl_mem), (void *)&inputBuffer);
status = clSetKernelArg(kernel2, 1, sizeof(cl_mem), (void *)&outputBuffer);
size_t global_threads[1];
size_t local_threads[1];
global_threads[0] = GlobalThreadSize;
local_threads[0] = GroupSize;
//execute the kernel
status = clEnqueueNDRangeKernel(commandQueue, kernel1, 1, NULL, global_threads, local_threads, 0, NULL, NULL);
if (status != CL_SUCCESS)
{
std::cout<<"Error: Enqueue kernel onto command queue failed!"<<std::endl;
return FAILURE;
}
status = clFinish(commandQueue);
status = clEnqueueNDRangeKernel(commandQueue, kernel2, 1, NULL, global_threads, local_threads, 0, NULL, NULL);
if (status != CL_SUCCESS)
{
std::cout<<"Error: Enqueue kernel onto command queue failed!"<<std::endl;
return FAILURE;
}
status = clFinish(commandQueue);
// Clean the resources.
status = clReleaseKernel(kernel1);//Release kernel.
status = clReleaseKernel(kernel2);
status = clReleaseMemObject(inputBuffer);//Release mem object.
status = clReleaseMemObject(outputBuffer);
status = clReleaseProgram(program);//Release program.
status = clReleaseCommandQueue(commandQueue);//Release command queue.
status = clReleaseContext(context);//Release context.
status = clReleaseDevice(device);
free(input);
std::cout<<"Passed!\n";
return 0;
}
cl文件:
//#define WGSIZE 64
/*
 * Demonstrates printf from inside a kernel.
 * - The work-item whose global ID is WGSIZE-1 prints the first four input
 *   floats as one float4.
 * - Every work-item stores its local ID into a __local array; after the
 *   barrier, the work-item whose local ID is WGSIZE-1 prints its group ID
 *   and its own array slot.
 * - Every work-item prints its global ID.
 * WGSIZE is expected to be defined at build time (e.g. -D WGSIZE=64).
 */
__kernel void printfKernel(__global float *inputbuffer)
{
    __local int lds[WGSIZE];

    const uint gid = get_global_id(0);
    const uint grp = get_group_id(0);
    const uint lid = get_local_id(0);
    const int lastSlot = WGSIZE - 1;

    if (lastSlot == gid)
    {
        /* Pack the first four input values into a vector and print it. */
        float4 firstFour = (float4)(inputbuffer[0], inputbuffer[1], inputbuffer[2], inputbuffer[3]);
        printf("Output vector data: f4 = %2.2v4hlf\n", firstFour);
    }

    /* Each work-item fills its own slot; the barrier precedes the read below. */
    lds[lid] = lid;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lastSlot == lid)
    {
        printf("\tThis is group %d\n", grp);
        printf("\tOutput LDS data: %d\n", lds[lastSlot]);
    }

    printf("the global ID of this thread is : %d\n", gid);
}
/*
 * Copies inputbuffer to outputbuffer element by element, one element per
 * work-item. The value passes through a uint temporary, so it is converted
 * float -> uint -> float on the way.
 */
__kernel void debugKernel2(__global float *inputbuffer,__global float *outputbuffer)
{
    const uint gid = get_global_id(0);

    /* Implicit float -> uint conversion, as in the original two-step assignment. */
    const uint asUint = inputbuffer[gid];

    outputbuffer[gid] = asUint;
}
补充:所有的clReleaseEvent(event)之前都漏了一句:clWaitForEvents(1,&event);(event为对应的事件对象)需要补上!
这个例子本身很简单,不再赘述。但有一个新知识点:当kernel里需要某个常量时,我以前的做法是:1、直接使用字面值常量(这种比较麻烦);2、在cl文件中用#define XXX XX定义(相对简单)。这个例子给出了第三种方式:3、通过clBuildProgram的编译选项(buildOption)用-D把宏定义传进kernel!
// Pass the work-group size to the kernel file as the WGSIZE macro.
char buildoption[128];
snprintf(buildoption,sizeof(buildoption),"-g -D WGSIZE=%d",GroupSize);
// No notification callback (pfn_notify) is given, so user_data is NULL too.
status=clBuildProgram(program,1,&device,buildoption,NULL,NULL);
这样将WGSIZE(GroupSize即64)传进cl文件里!