OpenCL做并行滤波

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u011028771/article/details/52821947

本实验用OpenCL实现一维信号的滤波。主要思路是以离散信号的序列点作为目标,一个工作项负责一个信号点的计算。这样做的好处是实现简单,并且相对于串行实现能获得相当大的性能提升;缺点是各个工作项的负载不均衡(索引为0~255的工作项累加的项数少于256)。

host.c
#include<math.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<windows.h>
#include<CL/cl.h>
#pragma warning( disable : 4996 )
/* Total element count of a 65 x 8192 data set. The expansion is parenthesized
 * so that expressions such as `x / MIXSIZE` or `x % MIXSIZE` group correctly. */
#define MIXSIZE (8192 * 65)

int main() {
    cl_int error;
    cl_platform_id platforms;
    cl_device_id devices;
    cl_context context;
    FILE *program_handle;
    size_t program_size;
    char *program_buffer;
    cl_program program;
    size_t log_size;
    char *program_log;
    char kernel_name[] = "createBuffer";
    cl_kernel kernel;
    cl_command_queue queue;
    //获取平台
    error = clGetPlatformIDs(1, &platforms, NULL);
    if (error != 0) {
        printf("Get platform failed!");
        return -1;
    }
    error = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &devices, NULL);
    if (error != 0) {
        printf("Get device failed!");
        return -1;
    }
    //创建上下文
    context = clCreateContext(NULL,1,&devices,NULL,NULL,&error);
    if (error != 0) {
        printf("Creat context failed!");
        return -1;
    }
    //创建程序
    program_handle = fopen("kernel.cl","rb");
    if (program_handle == NULL) {
        printf("The kernle can not be opened!");
        return -1;
    }
    fseek(program_handle,0,SEEK_END);
    program_size = ftell(program_handle);
    rewind(program_handle);

    program_buffer = (char *)malloc(program_size+1);
    program_buffer[program_size] = '\0';
    error=fread(program_buffer,sizeof(char),program_size,program_handle);
    if (error == 0) {
        printf("Read kernel failed!");
        return -1;
    }
    fclose(program_handle);
    program = clCreateProgramWithSource(context,1,(const char **)&program_buffer,&program_size,&error);
    if (error < 0) {
        printf("Couldn't create the program!");
        return -1;
    }
    //编译程序
    error = clBuildProgram(program,1,&devices,NULL,NULL,NULL);
    if (error < 0) {
        //确定日志文件的大小
        clGetProgramBuildInfo(program,devices,CL_PROGRAM_BUILD_LOG,0,NULL,&log_size);
        program_log = (char *)malloc(log_size+1);
        program_log[log_size] = '\0';
        //读取日志
        clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
        printf("%s\n",program_log);
        free(program_log);
        getchar();
        return -1;
    }
    //创建命令队列
    queue = clCreateCommandQueue(context, devices, CL_QUEUE_PROFILING_ENABLE, &error);
    if (error < 0) {
        printf("Coudn't create the command queue");
        return -1;
    }
    //创建内核
    kernel = clCreateKernel(program,kernel_name,&error);
    if (kernel==NULL) {
        printf("Couldn't create kernel!\n");
        return -1;
    }
    //创建缓存对象
    cl_mem memObject1 = clCreateBuffer(context,CL_MEM_READ_ONLY ,
                                                                    sizeof(float) * 256,NULL,&error);
    if (error < 0) {
        printf("Creat memObject1 failed!\n");
        return -1;
    }
    cl_mem memObject2 = clCreateBuffer(context, CL_MEM_READ_ONLY , 
                                                            sizeof(float) * MIXSIZE / 65, NULL, &error);
    if (error < 0) {
        printf("Creat memObject2 failed!\n");
        return -1;
    }

    cl_mem memObject3 = clCreateBuffer(context, CL_MEM_WRITE_ONLY , 
                                                             sizeof(float) * MIXSIZE/65, NULL, &error);
    if (error < 0) {
        printf("Creat memObject3 failed!\n");
        return -1;
    }
    //设置内核参数
    error = clSetKernelArg(kernel,0,sizeof(cl_mem),&memObject1);
    error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObject2);
    error |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObject3);
    if (error != CL_SUCCESS) {
        printf("Error setting kernel arguments!\n");
        return -1;
    }

    //初始化参数
    float* input1 = (float *)malloc(sizeof(float)* 256);
    float* input2 = (float *)malloc(sizeof(float)* MIXSIZE / 65);
    float* result = (float *)malloc(sizeof(float)* MIXSIZE / 65);
    float *check = (float *)malloc(sizeof(float) * MIXSIZE / 65);
    memset(check, 0, sizeof(float) * MIXSIZE / 65);
    memset(input1, 1, sizeof(float) * 256);
    memset(input2, 0, sizeof(float) * MIXSIZE / 65);
    memset(result, 0, sizeof(float) * MIXSIZE / 65);
    cl_event evt1;
    cl_event evt2;
    cl_event evt3;

    float* tmp1 = (float *)malloc(sizeof(float)* MIXSIZE);
    float *tmp2 = (float *)malloc(sizeof(float) * MIXSIZE / 65);
    memset(tmp2, 0, sizeof(float) * MIXSIZE / 65);
    //数据读入
    //采用随机数函数产生输入
    //input2是65*8192
        srand(1);
        for (int j = 0; j < 8192; j++) {
            input2[ j] = 20 * rand() / (double)(RAND_MAX);
        //  input2[j] = 1;
            check[j] = 0;
        }

        for (int j = 0; j < 256; j++) {
            input1[j] = 1;
        }
    //检查运算结果
        for (int j = 0; j < 8192; j++) {

            if (j > 255) {
                for (int k = 0; k < 256; k++) {
                    check[j] += input2[j - k] * input1[k];
                }
            }
            else {
                for (int k = 0; k < j+1; k++) {
                    check[j] += input2[j-k] * input1[k];
                }
            }   
        }

    //数据写入内存
    error = clEnqueueWriteBuffer(queue, memObject1, CL_FALSE, 0,
        256 * sizeof(float), input1, 0, NULL, &evt1);
    if (error != CL_SUCCESS) {
        printf("write data failed!\n");
        return -1;
    }
    error = clEnqueueWriteBuffer(queue, memObject2, CL_FALSE, 0,
        MIXSIZE * sizeof(float) / 65, input2, 1, &evt1, &evt2);
    if (error != CL_SUCCESS) {
        printf("write data failed!\n");
        return -1;
    }
    //配置工作项
    size_t maxWorkGroupSize = 0;
    clGetDeviceInfo(devices, CL_DEVICE_MAX_WORK_GROUP_SIZE,
        sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL);
    size_t globalWorkSize = 8192;
    size_t localWorkSize = maxWorkGroupSize;
    //执行内核
    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalWorkSize,
        &localWorkSize, 1, &evt2, &evt3);
    if (error != CL_SUCCESS) {
        printf("Error queuing kernel for execution!\n");
        return -1;
    }

    //读取执行结果
    error = clEnqueueReadBuffer(queue,memObject3,CL_TRUE,0,
        MIXSIZE*sizeof(float)/65,result,1,&evt3,NULL);
    if (error != CL_SUCCESS) {
        printf("Error reading result buffer!\n");
        return -1;
    }
    //显示结果
    for (int i = 0; i < MIXSIZE/65; i++) {
        if ((result[i] /check[i]<0.999) | check[i]==0) { 
            printf("failed!\n");
            printf("%f,%f,%d\n",result[i],check[i],i);
            getchar();
            return 0;
        }
    }
    printf("successed!\n");
    clReleaseEvent(evt1);
    clReleaseEvent(evt2);
    clReleaseEvent(evt3);
    clReleaseProgram(program);
    clReleaseContext(context);
    clReleaseCommandQueue(queue);
    clReleaseDevice(devices);
    clReleaseKernel(kernel);

    getchar();
    return 0;
}
kernel.cl
// 1-D convolution (FIR filter), one work-item per output sample.
// Expected launch: 8192 global work-items for 8192 data samples.
// input1: 256 filter coefficients
// input2: signal data
// result: result[gid] = sum over i of input1[i] * input2[gid - i]
//
// Work-items with gid <= 255 only have gid + 1 input samples behind them, so
// they use fewer taps. The sum is built in a private variable and stored once,
// so the kernel never reads result[] and does not depend on its prior contents.
__kernel void createBuffer(__global float *input1,
    __global float *input2,
    __global float *result) {
    int gid = get_global_id(0);
    int taps = (gid > 255) ? 256 : gid + 1;
    float acc = 0.0f;

    for (int i = 0; i < taps; i++) {
        acc += input1[i] * input2[gid - i];
    }
    result[gid] = acc;
}

这篇博客中kernel函数将工作项分为两部分:索引大于255的和索引不大于255的(即0~255),因为这两部分的运算量不同。
在上一篇关于OpenCL实现序列卷积的博客中提到了一种方式;但是没有考虑到工作项之间的同步问题;运算结果有误;
下一篇博客中会对两种方式进行比较;总结工作项之间的同步问题。以前的博客中总结过主机端命令同步问题,这次完成对工作项之间的同步;OpenCL中同步的基础知识就差不多了。

阅读更多
想对作者说点什么?

博主推荐

换一批

没有更多推荐了,返回首页