OpenCL device queue(设备端队列)

主机端代码:

        //create device queue
        cl_queue_properties props[]={
                    CL_QUEUE_PROPERTIES,
                    CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|
                        CL_QUEUE_ON_DEVICE|CL_QUEUE_ON_DEVICE_DEFAULT,
                    0
                };
        cl_command_queue devicequeue = clCreateCommandQueueWithProperties(context, device, props, &err);
        size_t program_length;

        //create kernel
        cl_kernel kernel_saxpy_dp_no_wait = clCreateKernel(program, "saxpy_dp_no_wait", &err);
        opencl_check(err);

        //SVM malloc buffer
        float *saxpy_src_0;
        float *saxpy_src_1;
        float *saxpy_dst_0;
        int glbSize = 8192;
        float factor = 2.3f;
        size_t glbSizeBytes = glbSize * sizeof(float);
        saxpy_dst_0 = (float*)clSVMAlloc(context, CL_MEM_READ_WRITE, glbSizeBytes, 0);
        saxpy_src_0 = (float*)clSVMAlloc(context, CL_MEM_READ_WRITE,  glbSizeBytes, 0);
        saxpy_src_1 = (float*)clSVMAlloc(context, CL_MEM_READ_WRITE, glbSizeBytes, 0);

        float one = 1.f;
        float two = 2.0f;
        float three = 5.f;
        //initialize the saxpy_src_0, saxpy_src_1,saxpy_dst_0
        err = clEnqueueSVMMemFill(queue, saxpy_src_0, (const void*)&one, sizeof(float), glbSizeBytes, 0, NULL, NULL);
        opencl_check(err);
        err = clEnqueueSVMMemFill(queue, saxpy_src_1, (const void*)&two, sizeof(float), glbSizeBytes, 0, NULL, NULL);
        opencl_check(err);
        err = clEnqueueSVMMemFill(queue, saxpy_dst_0, (const void*)&three, sizeof(float), glbSizeBytes, 0, NULL, NULL);
        opencl_check(err);
        err = clFinish(queue);
        opencl_check(err);

        //set kernel args and enqueue kernel
        err = clSetKernelArg(kernel_saxpy_dp_no_wait, 0, sizeof(int), (void*)&glbSize);
        opencl_check(err);
        err = clSetKernelArg(kernel_saxpy_dp_no_wait, 1, sizeof(float), (void*)&factor);
        opencl_check(err);
        err = clSetKernelArgSVMPointer(kernel_saxpy_dp_no_wait, 2, saxpy_src_0);
        opencl_check(err);
        err = clSetKernelArgSVMPointer(kernel_saxpy_dp_no_wait, 3, saxpy_src_1);
        opencl_check(err);
        err = clSetKernelArgSVMPointer(kernel_saxpy_dp_no_wait, 4, saxpy_dst_0);
        opencl_check(err);
        size_t localsize[1] = {256};
        size_t globalsize[1] = {glbSize/localsize[0]};
        err = clEnqueueNDRangeKernel(queue, kernel_saxpy_dp_no_wait, 1, NULL, globalsize, NULL, 0, NULL, NULL);
        err = clFinish(queue);
        opencl_check(err);
        //printf the result of saxpy_dst_0
        for(int j = 0; j < 8; j++)
        {
            for(int i = 0; i < 10; i++)
            {
                printf("i=%d. %f, (%f, %f)\n", i+256*j, saxpy_dst_0[i+256*j], saxpy_src_0[i+256*j], saxpy_src_1[i+256*j]);
            }
        }
 

设备端代码:

/*
 * Child kernel: each work-item handles one element.
 *
 * For every global id below numElems it stores
 *     dst_0[id] = factor * src_0[id] * src_1[id]
 * Note that, despite the "saxpy" name, this is a scaled element-wise
 * product, not a*x + y.
 *
 * numElems  number of elements this launch may touch
 * factor    scalar multiplier
 * src_0     first input array
 * src_1     second input array
 * dst_0     output array
 */
__kernel void saxpy_dp_child(const int numElems, const float factor,
                        __global const float *src_0,
                        __global const float *src_1,
                        __global float *dst_0
                        )
{
    const uint idx = get_global_id(0);

    // Work-items past the end of the data have nothing to do.
    if (idx >= numElems)
        return;

    const float scaled = factor * src_0[idx];
    dst_0[idx] = scaled * src_1[idx];
}

/*
 * Parent kernel: splits the numElems elements into one contiguous slice per
 * parent work-item and enqueues saxpy_dp_child on the default device queue
 * for each slice (CLK_ENQUEUE_FLAGS_NO_WAIT).
 *
 * Slice size is numElems / global_size; the last parent work-item also takes
 * the remainder so no tail elements are skipped when numElems is not a
 * multiple of the global size.
 *
 * numElems  total number of elements in the arrays
 * factor    scalar multiplier forwarded to the child
 * src_0     first input array
 * src_1     second input array
 * dst_0     output array
 */
__kernel void saxpy_dp_no_wait(const int numElems, const float factor,
                        __global const float *src_0,
                        __global const float *src_1,
                        __global float *dst_0
                        )
{
    const uint global_id = get_global_id(0);
    const uint global_sz = get_global_size(0);

    const uint base_sz = numElems / global_sz;
    const uint child_offset = global_id * base_sz;

    // The last work-item picks up whatever the integer division left over.
    const uint child_global_sz = (global_id == global_sz - 1)
                                     ? (uint)numElems - child_offset
                                     : base_sz;

    // Nothing to launch for an empty slice (global size larger than numElems).
    if (child_global_sz == 0)
        return;

    // Per-slice views of the buffers. The destination must stay non-const:
    // the child kernel writes through it.
    __global const float *src_0_child = &src_0[child_offset];
    __global const float *src_1_child = &src_1[child_offset];
    __global float *dst_0_child = &dst_0[child_offset];

    // Default device queue; the host must have created one with
    // CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT.
    queue_t defQ = get_default_queue();
    ndrange_t ndrange = ndrange_1D(child_global_sz);

    // Block wrapping the child kernel call with this slice's arguments.
    void (^saxpy_dp_child_wrapper)(void) =
            ^{
                saxpy_dp_child(child_global_sz, factor, src_0_child, src_1_child, dst_0_child);
            };

    // Enqueue the child on the device queue via the block.
    int err_ret = enqueue_kernel(defQ, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, saxpy_dp_child_wrapper);
    // This kernel has no output channel to report a failed enqueue.
    (void)err_ret;

}

父内核一共启动了32个全局工作项(8192/256=32);每个父工作项向设备队列入队一个子内核,每个子内核包含256个全局工作项。

注意:在主机端通过 cl_command_queue devicequeue = clCreateCommandQueueWithProperties(context, device, props, &err); 创建 device queue;在设备端通过 queue_t defQ = get_default_queue(); 获取主机端创建的这个默认设备队列。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值