主机端代码:
//create device queue
cl_queue_properties props[]={
CL_QUEUE_PROPERTIES,
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|
CL_QUEUE_ON_DEVICE|CL_QUEUE_ON_DEVICE_DEFAULT,
0
};
cl_command_queue devicequeue = clCreateCommandQueueWithProperties(context, device, props, &err);
size_t program_length;
//create kernel
cl_kernel kernel_saxpy_dp_no_wait = clCreateKernel(program, "saxpy_dp_no_wait", &err);
opencl_check(err);
//SVM malloc buffer
float *saxpy_src_0;
float *saxpy_src_1;
float *saxpy_dst_0;
int glbSize = 8192;
float factor = 2.3f;
size_t glbSizeBytes = glbSize * sizeof(float);
saxpy_dst_0 = (float*)clSVMAlloc(context, CL_MEM_READ_WRITE, glbSizeBytes, 0);
saxpy_src_0 = (float*)clSVMAlloc(context, CL_MEM_READ_WRITE, glbSizeBytes, 0);
saxpy_src_1 = (float*)clSVMAlloc(context, CL_MEM_READ_WRITE, glbSizeBytes, 0);
float one = 1.f;
float two = 2.0f;
float three = 5.f;
//initialize the saxpy_src_0, saxpy_src_1,saxpy_dst_0
err = clEnqueueSVMMemFill(queue, saxpy_src_0, (const void*)&one, sizeof(float), glbSizeBytes, 0, NULL, NULL);
opencl_check(err);
err = clEnqueueSVMMemFill(queue, saxpy_src_1, (const void*)&two, sizeof(float), glbSizeBytes, 0, NULL, NULL);
opencl_check(err);
err = clEnqueueSVMMemFill(queue, saxpy_dst_0, (const void*)&three, sizeof(float), glbSizeBytes, 0, NULL, NULL);
opencl_check(err);
err = clFinish(queue);
opencl_check(err);
//set kernel args and enqueue kernel
err = clSetKernelArg(kernel_saxpy_dp_no_wait, 0, sizeof(int), (void*)&glbSize);
opencl_check(err);
err = clSetKernelArg(kernel_saxpy_dp_no_wait, 1, sizeof(float), (void*)&factor);
opencl_check(err);
err = clSetKernelArgSVMPointer(kernel_saxpy_dp_no_wait, 2, saxpy_src_0);
opencl_check(err);
err = clSetKernelArgSVMPointer(kernel_saxpy_dp_no_wait, 3, saxpy_src_1);
opencl_check(err);
err = clSetKernelArgSVMPointer(kernel_saxpy_dp_no_wait, 4, saxpy_dst_0);
opencl_check(err);
size_t localsize[1] = {256};
size_t globalsize[1] = {glbSize/localsize[0]};
err = clEnqueueNDRangeKernel(queue, kernel_saxpy_dp_no_wait, 1, NULL, globalsize, NULL, 0, NULL, NULL);
err = clFinish(queue);
opencl_check(err);
//printf the result of saxpy_dst_0
for(int j = 0; j < 8; j++)
{
for(int i = 0; i < 10; i++)
{
printf("i=%d. %f, (%f, %f)\n", i+256*j, saxpy_dst_0[i+256*j], saxpy_src_0[i+256*j], saxpy_src_1[i+256*j]);
}
}
device端代码:
/*
 * Child kernel: each work-item writes one element of dst_0.
 *
 * numElems  number of elements this launch may touch; work-items whose
 *           global id is not below it do nothing
 * factor    scalar multiplier
 * src_0     first input array
 * src_1     second input array
 * dst_0     output array; dst_0[i] = factor * src_0[i] * src_1[i]
 *
 * NOTE(review): despite the "saxpy" name the two inputs are multiplied,
 * not added (a*x + y) -- confirm this is intended.
 */
__kernel void saxpy_dp_child(const int numElems, const float factor,
                             __global const float *src_0,
                             __global const float *src_1,
                             __global float *dst_0)
{
    uint idx = get_global_id(0);

    /* Guard against work-items past the end of the range. */
    if (!(idx < numElems)) {
        return;
    }

    dst_0[idx] = factor * src_0[idx] * src_1[idx];
}
/*
 * Parent kernel: every work-item enqueues one saxpy_dp_child launch on the
 * default device queue, covering its own contiguous slice of the arrays.
 *
 * numElems  total number of elements in src_0 / src_1 / dst_0
 * factor    scalar forwarded to the child kernel
 * src_0     first input array
 * src_1     second input array
 * dst_0     output array written by the child kernels
 *
 * The slice length is numElems / get_global_size(0) (integer division), so
 * when numElems is not a multiple of the parent global size the trailing
 * remainder elements are not processed by any child.
 */
__kernel void saxpy_dp_no_wait(const int numElems, const float factor,
                               __global const float *src_0,
                               __global const float *src_1,
                               __global float *dst_0)
{
    uint global_id = get_global_id(0);
    uint global_sz = get_global_size(0);
    uint child_global_sz = numElems / global_sz;
    uint child_offset = global_id * child_global_sz;

    /* Sub-buffers for this work-item's child launch. */
    __global const float *src_0_child = &src_0[child_offset];
    __global const float *src_1_child = &src_1[child_offset];
    /* Not const: the child kernel writes through this pointer, and its
     * dst_0 parameter is a non-const __global float *. */
    __global float *dst_0_child = &dst_0[child_offset];

    /* Default device queue; the host must have created it beforehand. */
    queue_t defQ = get_default_queue();
    ndrange_t ndrange = ndrange_1D(child_global_sz);

    /* Block wrapping the child kernel call with its captured arguments. */
    void (^saxpy_dp_child_wrapper)(void) = ^{
        saxpy_dp_child(child_global_sz, factor, src_0_child, src_1_child, dst_0_child);
    };

    /* CLK_ENQUEUE_FLAGS_NO_WAIT: the child does not have to wait for this
     * parent kernel to finish before it may start. The returned status is
     * discarded on purpose: this kernel has no channel to report it. */
    (void)enqueue_kernel(defQ, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, saxpy_dp_child_wrapper);
}
父内核一共启动了32个全局工作项(8192/256=32);每个父工作项入队一个子内核,每个子内核包含256个全局工作项。
注意:在主机端代码中,通过 cl_command_queue devicequeue = clCreateCommandQueueWithProperties(context, device, props, &err); 创建设备队列(device queue);在设备端代码中,通过 queue_t defQ = get_default_queue(); 获取该设备队列。