OpenCL的学习---计算直方图的理解

本文链接：https://blog.csdn.net/wd1603926823/article/details/70171567

之前的理解device上每个compute unit：

看到《OpenCL编程指南》第14章——计算直方图，觉得有点难理解，因为我对内存中这类抽象的东西反应比较慢，所以kernel函数那里看了很久。感谢北邮的大神 http://www.mrobotit.cn/~shanxinyan ，他很懂OpenCL；我们学校和中南、湖大我没听说有搞OpenCL的人，网上的资料也少，所以学习讨论比较困难。书上的代码不能在我电脑上直接运行，我修改了kernel的几个地方：mad24以及read_imagef那里。整个工程在 http://download.csdn.net/detail/wd1603926823/9813986 这里。

其中host端：

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fstream>
#include <CL/cl.h>
#include <iostream>
#include <sstream>
#include "FreeImage.h"
#include "gFreeImage.h"

const int num_pixels_per_work_item = 32;
static int num_iterations = 1000;

cl_mem LoadImage(cl_context context, char *fileName, int &width, int &height);
static int read_kernel_from_file(const char *filename, char **source, size_t *len);
static int verify_histogram_results(const char *str, unsigned int *histogram_results, unsigned int *ref_histogram_results, int num_entries);
static void * generate_reference_histogram_results_fp32(void *image_data, int w, int h);
static void * create_image_data_fp32(int w, int h);
static void * generate_reference_histogram_results_unorm8(void *image_data, int w, int h);
static void * create_image_data_unorm8(int w, int h);

int main()
{
	cl_uint platformNum;
	cl_int err;
	err=clGetPlatformIDs(0,NULL,&platformNum);
	if(err!=CL_SUCCESS){
		printf("cannot get platforms number.\n");
		return -1;
	}
	cl_platform_id* platforms;
	platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);
	err=clGetPlatformIDs(platformNum,platforms,NULL);
	if(err!=CL_SUCCESS){
		printf("cannot get platforms addresses.\n");
		return -1;
	}
	cl_platform_id platformInUse=platforms[0];
	cl_device_id device;
	clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
	cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &err);

	const char  cl_kernel_histogram_filename[] = "/home/jumper/OpenCL_projects/Book_ch14_Histogram/partial_histogram.cl";
	size_t              src_len[1];
	char                *source[1];
	err = read_kernel_from_file(cl_kernel_histogram_filename, &source[0], &src_len[0]);
	cl_program program = clCreateProgramWithSource(context, 1, (const char **)source, (size_t *)src_len, &err);
	err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
	if(err!=CL_SUCCESS){
		printf("cannot build program.\n");
		size_t log_size;
		clGetProgramBuildInfo(program,device,CL_PROGRAM_BUILD_LOG,0,NULL,&log_size);
		char *log=(char*)alloca(log_size);
		clGetProgramBuildInfo(program,device,CL_PROGRAM_BUILD_LOG,log_size,log,NULL);
		printf("%s\n",log);
		return -1;
	}
	cl_kernel histogram_rgba_unorm8 = clCreateKernel(program, "histogram_local", &err);
	if(err!=CL_SUCCESS){
		printf("cannot build kernel.\n");
		return -1;
	}

	std::ifstream srcFile2("/home/jumper/OpenCL_projects/Book_ch14_Histogram/sum_histogram.cl");
	std::string srcProg2(std::istreambuf_iterator<char>(srcFile2),(std::istreambuf_iterator<char>()));
	const char * src2 = srcProg2.c_str();
	size_t length2 = srcProg2.length();
	cl_program program2=clCreateProgramWithSource(context,1,&src2,&length2,&err);
	err=clBuildProgram(program2,1,&device,NULL,NULL,NULL);
	if(err!=CL_SUCCESS){
		printf("cannot build program2.\n");
		return -1;
	}
	cl_kernel histogram_sum_partial_results_unorm8 = clCreateKernel(program2, "histogram_global", &err);
	if(err!=CL_SUCCESS)
	{
		printf("clCreateKernel() failed creating kernel void histogram_sum_partial_results_unorm8(). (%d)\n", err);
		return EXIT_FAILURE;
	}
	//Create Input Image Object
	char file[]={"/home/jumper/OpenCL_projects/Book_ch14_Histogram/lenna.jpg"};
	int imgwidth,imgheight;
	cl_mem image=LoadImage(context,file,imgwidth,imgheight);

	cl_mem histogram_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 256*3*sizeof(unsigned int), NULL, &err);
	if (!histogram_buffer || err)
	{
		printf("clCreateBuffer() failed. (%d)\n", err);
		return EXIT_FAILURE;
	}

	cl_image_format image_format;
	image_format.image_channel_order = CL_RGBA;
	image_format.image_channel_data_type = CL_UNORM_INT8;
	void                *image_data_unorm8;
	cl_mem              input_image_unorm8;
	void                *image_data_fp32;
	cl_mem              input_image_fp32;
	image_data_unorm8 = create_image_data_unorm8(imgwidth, imgheight);
	input_image_unorm8 = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,&image_format, imgwidth, imgheight, 0, image_data_unorm8, &err);
	if (!input_image_unorm8 || err)
	{
		printf("clCreateImage2D() failed. (%d)\n", err);
		return EXIT_FAILURE;
	}
	image_format.image_channel_order = CL_RGBA;
	image_format.image_channel_data_type = CL_FLOAT;
	image_data_fp32 = create_image_data_fp32(imgwidth, imgheight);
	input_image_fp32 = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,&image_format, imgwidth, imgheight, 0, image_data_fp32, &err);
	if (!input_image_fp32 || err)
	{
		printf("clCreateImage2D() failed. (%d)\n", err);
		return EXIT_FAILURE;
	}
	/************  Testing RGBA 8-bit histogram **********/
	size_t workgroup_size;
	size_t local_work_size[2],global_work_size[2];
	size_t num_groups;
	clGetKernelWorkGroupInfo(histogram_rgba_unorm8, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
	{
		size_t  gsize[2];
		int     w;

		if (workgroup_size <= 256)
		{
			gsize[0] = 16;
			gsize[1] = workgroup_size / 16;
		}
		else if (workgroup_size <= 1024)
		{
			gsize[0] = workgroup_size / 16;
			gsize[1] = 16;
		}
		else
		{
			gsize[0] = workgroup_size / 32;
			gsize[1] = 32;
		}

		local_work_size[0] = gsize[0];
		local_work_size[1] = gsize[1];

		w = (imgwidth + num_pixels_per_work_item - 1) / num_pixels_per_work_item;
		global_work_size[0] = ((w + gsize[0] - 1) / gsize[0]);
		global_work_size[1] = ((imgheight + gsize[1] - 1) / gsize[1]);

		num_groups = global_work_size[0] * global_work_size[1];
		global_work_size[0] *= gsize[0];
		global_work_size[1] *= gsize[1];
	}

	cl_mem partial_histogram_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, num_groups*256*3*sizeof(unsigned int), NULL, &err);
	if (!partial_histogram_buffer || err)
	{
		printf("clCreateBuffer() failed. (%d)\n", err);
		return EXIT_FAILURE;
	}

	//clSetKernelArg(histogram_rgba_unorm8, 0, sizeof(cl_mem), &input_image_unorm8);
	clSetKernelArg(histogram_rgba_unorm8, 0, sizeof(cl_mem), &image);
	clSetKernelArg(histogram_rgba_unorm8, 1, sizeof(int), &num_pixels_per_work_item);
	clSetKernelArg(histogram_rgba_unorm8, 2, sizeof(cl_mem), &partial_histogram_buffer);

	clSetKernelArg(histogram_sum_partial_results_unorm8, 0, sizeof(cl_mem), &partial_histogram_buffer);
	clSetKernelArg(histogram_sum_partial_results_unorm8, 1, sizeof(int), &num_groups);
	clSetKernelArg(histogram_sum_partial_results_unorm8, 2, sizeof(cl_mem), &histogram_buffer);

	// verify that the kernel works correctly.  also acts as a warmup
	err = clEnqueueNDRangeKernel(queue, histogram_rgba_unorm8, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);

	unsigned int *partial_histogram_results = (unsigned int *)malloc(num_groups*256*3*sizeof(unsigned int));
	err = clEnqueueReadBuffer(queue, partial_histogram_buffer, CL_TRUE, 0, num_groups*256*3*sizeof(unsigned int), partial_histogram_results, 0, NULL, NULL);
	for(int j=0;j<num_groups;j++){
		int ind=j*256*3;
		for(int i=0;i<256*3;i++){
			printf("the %dth work-group: R:%d G:%d B:%d \n",j+1,partial_histogram_results[ind+i],partial_histogram_results[ind+256+i],partial_histogram_results[ind+512+i]);
		}
	}

	// verify that the kernel works correctly.  also acts as a warmup
	clGetKernelWorkGroupInfo(histogram_sum_partial_results_unorm8, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);

	size_t partial_global_work_size[2],partial_local_work_size[2];
	partial_global_work_size[0] = 256*3;
	partial_local_work_size[0] = (workgroup_size > 256) ? 256 : workgroup_size;
	err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_unorm8, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL);

	unsigned int *ref_histogram_results = (unsigned int *)generate_reference_histogram_results_unorm8(image_data_unorm8, imgwidth, imgheight);
	unsigned int *histogram_results = (unsigned int *)malloc(256*3*sizeof(unsigned int));
	err = clEnqueueReadBuffer(queue, histogram_buffer, CL_TRUE, 0, 256*3*sizeof(unsigned int), histogram_results, 0, NULL, NULL);
	if (err)
	{
		printf("clEnqueueReadBuffer() failed. (%d)\n", err);
		return EXIT_FAILURE;
	}

	verify_histogram_results("Image Histogram for image type = CL_RGBA, CL_UNORM_INT8", histogram_results, ref_histogram_results, 256*3);
//	for(int i=0;i<256*3;i++){
//		printf("R:%d G:%d B:%d \n",histogram_results[i],histogram_results[256+i],histogram_results[512+i]);
//	}
	// now measure performance
	cl_event events[2];
	err = clEnqueueMarker(queue, &events[0]);
	if (err)
	{
		printf("clEnqeueMarker() failed for histogram_rgba_unorm8 kernel. (%d)\n", err);
		return EXIT_FAILURE;
	}
	for (int i=0; i<num_iterations; i++)
	{
		err = clEnqueueNDRangeKernel(queue, histogram_rgba_unorm8, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
		if (err)
		{
			printf("clEnqueueNDRangeKernel() failed for histogram_rgba_unorm8 kernel. (%d)\n", err);
			return EXIT_FAILURE;
		}

		err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_unorm8, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL);
		if (err)
		{
			printf("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_unorm8 kernel. (%d)\n", err);
			return EXIT_FAILURE;
		}
	}
	err = clEnqueueMarker(queue, &events[1]);
	if (err)
	{
		printf("clEnqeueMarker() failed for histogram_rgba_unorm8 kernel. (%d)\n", err);
		return EXIT_FAILURE;
	}
	err = clWaitForEvents(1, &events[1]);
	if (err)
	{
		printf("clWaitForEvents() failed for histogram_rgba_unorm8 kernel. (%d)\n", err);
		return EXIT_FAILURE;
	}

	cl_ulong time_start,time_end;
	err = clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_QUEUED, sizeof(cl_long), &time_start, NULL);
	err |= clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_END, sizeof(cl_long), &time_end, NULL);
	if (err)
	{
		printf("clGetEventProfilingInfo() failed for histogram_rgba_unorm8 kernel. (%d)\n", err);
		return EXIT_FAILURE;
	}

	printf("Image dimensions: %d x %d pixels, Image type = CL_RGBA, CL_UNORM_INT8\n", imgwidth, imgheight);
	printf("Time to compute histogram = %g ms\n", (double)(time_end - time_start) * 1e-9 * 1000.0 / (double)num_iterations);

	clReleaseEvent(events[0]);
	clReleaseEvent(events[1]);

	free(ref_histogram_results);
	free(histogram_results);
	free(image_data_unorm8);
	free(image_data_fp32);

	clReleaseKernel(histogram_rgba_unorm8);
	//clReleaseKernel(histogram_rgba_fp);
	clReleaseKernel(histogram_sum_partial_results_unorm8);
	//clReleaseKernel(histogram_sum_partial_results_fp);

	clReleaseProgram(program);
	clReleaseMemObject(partial_histogram_buffer);
	clReleaseMemObject(histogram_buffer);
	clReleaseMemObject(input_image_unorm8);
	clReleaseMemObject(input_image_fp32);

	clReleaseCommandQueue(queue);
	clReleaseContext(context);

    return EXIT_SUCCESS;
}

/*
 * Allocates a w x h image with four bytes per pixel and fills every byte
 * with the low 8 bits of successive rand() values.
 *
 * Returns a malloc()ed buffer of w*h*4 bytes that the caller must free(),
 * or NULL when a dimension is not positive or the allocation fails.
 */
static void * create_image_data_unorm8(int w, int h)
{
    unsigned char   *p;
    size_t          count;
    size_t          i;

    if (w <= 0 || h <= 0)
        return NULL;

    /* size_t arithmetic: w*h*4 can overflow int for large images */
    count = (size_t)w * (size_t)h * 4;
    p = (unsigned char *)malloc(count);
    if (p == NULL)
        return NULL;

    for (i = 0; i < count; i++)
        p[i] = (unsigned char)(rand() & 0xFF);

    return (void *)p;
}

/*
 * Builds the CPU reference histogram for an image stored as four bytes per
 * pixel.  Bytes 0, 1 and 2 of each pixel are counted; byte 3 is ignored.
 *
 * The result holds 256*3 counters: entries 0..255 count byte 0, 256..511
 * count byte 1 and 512..767 count byte 2.
 *
 * Returns a zero-initialised, heap-allocated array the caller must free(),
 * or NULL if the allocation fails.  A NULL image or a non-positive dimension
 * yields an all-zero histogram.
 */
static void * generate_reference_histogram_results_unorm8(void *image_data, int w, int h)
{
    unsigned int        *ref_histogram_results;
    const unsigned char *img = (const unsigned char *)image_data;
    size_t              num_pixels;
    size_t              p;

    ref_histogram_results = (unsigned int *)calloc(256 * 3, sizeof(unsigned int));
    if (ref_histogram_results == NULL)
        return NULL;

    if (img == NULL || w <= 0 || h <= 0)
        return ref_histogram_results;

    /* size_t arithmetic: w*h*4 can overflow int for large images */
    num_pixels = (size_t)w * (size_t)h;
    for (p = 0; p < num_pixels; p++)
    {
        const unsigned char *px = img + p * 4;

        ref_histogram_results[px[0]]++;
        ref_histogram_results[256 + px[1]]++;
        ref_histogram_results[512 + px[2]]++;
    }

    return ref_histogram_results;
}

/*
 * Allocates a w x h image with four floats per pixel and fills every float
 * with rand() / RAND_MAX, i.e. a value in [0, 1].
 *
 * Returns a malloc()ed buffer of w*h*4 floats that the caller must free(),
 * or NULL when a dimension is not positive or the allocation fails.
 */
static void * create_image_data_fp32(int w, int h)
{
    float   *p;
    size_t  count;
    size_t  i;

    if (w <= 0 || h <= 0)
        return NULL;

    /* size_t arithmetic: w*h*4 can overflow int for large images */
    count = (size_t)w * (size_t)h * 4;
    p = (float *)malloc(count * sizeof(float));
    if (p == NULL)
        return NULL;

    for (i = 0; i < count; i++)
        p[i] = (float)rand() / (float)RAND_MAX;

    return (void *)p;
}

/*
 * Maps one float channel value to a histogram bin in 0..255.
 * Values <= 0 (and NaN) go to bin 0, values >= 1 go to bin 255.
 */
static unsigned int histogram_bin_fp32(float f)
{
    if (!(f > 0.0f))        /* also catches NaN */
        return 0;
    if (f >= 1.0f)          /* 1.0 * 256 would be bin 256, one past the end */
        return 255;
    return (unsigned int)(f * 256.0f);
}

/*
 * Builds the CPU reference histogram for an image stored as four floats per
 * pixel.  Floats 0, 1 and 2 of each pixel are counted; float 3 is ignored.
 *
 * The result holds 256*3 counters: entries 0..255 count channel 0, 256..511
 * count channel 1 and 512..767 count channel 2.  Each value is scaled by 256
 * and clamped into 0..255, so a value of exactly 1.0 lands in the last bin
 * instead of indexing past the end of its channel.
 *
 * Returns a zero-initialised, heap-allocated array the caller must free(),
 * or NULL if the allocation fails.  A NULL image or a non-positive dimension
 * yields an all-zero histogram.
 */
static void * generate_reference_histogram_results_fp32(void *image_data, int w, int h)
{
    unsigned int    *ref_histogram_results;
    const float     *img = (const float *)image_data;
    size_t          num_pixels;
    size_t          p;

    ref_histogram_results = (unsigned int *)calloc(256 * 3, sizeof(unsigned int));
    if (ref_histogram_results == NULL)
        return NULL;

    if (img == NULL || w <= 0 || h <= 0)
        return ref_histogram_results;

    /* size_t arithmetic: w*h*4 can overflow int for large images */
    num_pixels = (size_t)w * (size_t)h;
    for (p = 0; p < num_pixels; p++)
    {
        const float *px = img + p * 4;

        ref_histogram_results[histogram_bin_fp32(px[0])]++;
        ref_histogram_results[256 + histogram_bin_fp32(px[1])]++;
        ref_histogram_results[512 + histogram_bin_fp32(px[2])]++;
    }

    return ref_histogram_results;
}

/*
 * Compares the first num_entries counters of a computed histogram against
 * the reference histogram.
 *
 * str is a label prefixed to the printed message.
 *
 * Returns 0 and prints "<str>: VERIFIED" when all entries match; returns -1
 * after reporting the first mismatching index otherwise.
 */
static int verify_histogram_results(const char *str, unsigned int *histogram_results, unsigned int *ref_histogram_results, int num_entries)
{
    int     i;

    for (i = 0; i < num_entries; i++)
    {
        if (histogram_results[i] == ref_histogram_results[i])
            continue;

        /* counters are unsigned, so they are printed with %u */
        printf("%s: verify_histogram_results failed for indx = %d, gpu result = %u, expected result = %u\n",
               str, i, histogram_results[i], ref_histogram_results[i]);
        return -1;
    }

    printf("%s: VERIFIED\n", str);
    return 0;
}

/*
 * Reads a whole file into a freshly allocated, NUL-terminated buffer.
 *
 * filename: path of the file to read.
 * source:   receives the malloc()ed buffer; the caller must free() it.
 * len:      receives the number of bytes read, not counting the terminator.
 *
 * Returns 0 on success and -1 on any failure (NULL argument, file cannot be
 * opened or measured, out of memory, read error).  On failure *source and
 * *len are left untouched.
 */
static int read_kernel_from_file(const char *filename, char **source, size_t *len)
{
    FILE    *fh;
    long    file_len;
    size_t  nread;
    char    *buf;

    if (filename == NULL || source == NULL || len == NULL)
        return -1;

    /* binary mode so the size reported by ftell() matches what fread() returns */
    fh = fopen(filename, "rb");
    if (fh == NULL)
        return -1;

    /* measure through the open stream rather than a second lookup by path */
    if (fseek(fh, 0L, SEEK_END) != 0 ||
        (file_len = ftell(fh)) < 0 ||
        fseek(fh, 0L, SEEK_SET) != 0)
    {
        fclose(fh);
        return -1;
    }

    buf = (char *)malloc((size_t)file_len + 1);
    if (buf == NULL)
    {
        fclose(fh);
        return -1;
    }

    nread = fread(buf, 1, (size_t)file_len, fh);
    if (ferror(fh))
    {
        free(buf);
        fclose(fh);
        return -1;
    }
    fclose(fh);

    /* terminate after the bytes actually read, in case of a short read */
    buf[nread] = '\0';
    *source = buf;
    *len = nread;
    return 0;
}

// Loads an image file with FreeImage, converts it to 32 bits per pixel and
// uploads it as a 2D OpenCL image declared as CL_RGBA / CL_UNORM_INT8.
//
// context:  OpenCL context the image is created in.
// fileName: path of the image file.
// width, height: receive the image dimensions in pixels; they are only
//           written once the file has been loaded and converted.
//
// Returns the new image object (the caller releases it with
// clReleaseMemObject), or 0 if the file cannot be loaded or converted or the
// OpenCL image cannot be created.
//
// NOTE(review): the bytes are passed through in whatever channel order
// FreeImage stores 32-bit bitmaps, while the image is declared CL_RGBA.
// Confirm the order matches on the target platform before relying on which
// channel is "R" and which is "B".
cl_mem LoadImage(cl_context context, char *fileName, int &width, int &height)
{
    FREE_IMAGE_FORMAT format = FreeImage_GetFileType(fileName, 0);
    FIBITMAP* loaded = FreeImage_Load(format, fileName);
    if (loaded == NULL)
    {
        std::cerr << "Error loading image file" << std::endl;
        return 0;
    }

    // Convert to 32-bit image; the original bitmap is no longer needed.
    FIBITMAP* image = FreeImage_ConvertTo32Bits(loaded);
    FreeImage_Unload(loaded);
    if (image == NULL)
    {
        std::cerr << "Error converting image to 32 bits" << std::endl;
        return 0;
    }

    width = FreeImage_GetWidth(image);
    height = FreeImage_GetHeight(image);

    // Create OpenCL image
    cl_image_format clImageFormat;
    clImageFormat.image_channel_order = CL_RGBA;
    clImageFormat.image_channel_data_type = CL_UNORM_INT8;

    // CL_MEM_COPY_HOST_PTR copies the pixels during creation, so the bitmap
    // can be handed over directly and unloaded right afterwards; no staging
    // buffer is needed.
    cl_int errNum;
    cl_mem clImage = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                     &clImageFormat, width, height, 0,
                                     FreeImage_GetBits(image), &errNum);
    FreeImage_Unload(image);

    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error creating CL image object" << std::endl;
        return 0;
    }
    return clImage;
}

接下来是2个kernel：

// Pass 1 of the two-pass image histogram.
//
// Every work-group accumulates a partial histogram of the pixels it visits in
// local memory and then copies it to its own 256*3-entry slot of `histogram`.
// Slot layout: [0..255] first channel, [256..511] second, [512..767] third.
//
// img:                     input image, read as normalized floats.
// num_pixels_per_workitem: number of pixels of one row each work-item
//                          handles; consecutive pixels of a work-item are
//                          get_global_size(0) columns apart, so a row is
//                          covered when global_size(0) * num_pixels_per_workitem
//                          is at least the image width.
// histogram:               output, get_num_groups(0)*get_num_groups(1) slots
//                          of 256*3 uints.
__kernel void histogram_local(__read_only image2d_t img, int num_pixels_per_workitem, global uint *histogram)
{
    const int   bins_total = 256 * 3;
    int     local_size = (int)get_local_size(0) * (int)get_local_size(1);
    int     image_width = get_image_width(img);
    int     image_height = get_image_height(img);
    // linear work-group index times the slot size
    int     group_indx = ((int)get_group_id(1) * (int)get_num_groups(0) + (int)get_group_id(0)) * bins_total;
    int     x = (int)get_global_id(0);
    int     y = (int)get_global_id(1);
    int     x_stride = (int)get_global_size(0);

    local uint  tmp_histogram[256 * 3];

    // linear index of this work-item inside its work-group
    int     tid = (int)get_local_id(1) * (int)get_local_size(0) + (int)get_local_id(0);
    int     i;

    // clear the local buffer that will hold the partial histogram; the
    // work-items stride through it so any work-group size works
    for (i = tid; i < bins_total; i += local_size)
        tmp_histogram[i] = 0;

    barrier(CLK_LOCAL_MEM_FENCE);

    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    if (y < image_height)
    {
        int n;
        int px = x;

        // visit num_pixels_per_workitem pixels of row y, x_stride apart
        for (n = 0; n < num_pixels_per_workitem && px < image_width; n++, px += x_stride)
        {
            float4 clr = read_imagef(img, sampler, (int2)(px, y));

            // round to nearest: c/255 * 255 may come out just below c, and
            // the default truncating conversion would then pick bin c-1
            uchar indx_x = convert_uchar_sat_rte(clr.x * 255.0f);
            uchar indx_y = convert_uchar_sat_rte(clr.y * 255.0f);
            uchar indx_z = convert_uchar_sat_rte(clr.z * 255.0f);

            atomic_inc(&tmp_histogram[indx_x]);
            atomic_inc(&tmp_histogram[256 + (uint)indx_y]);
            atomic_inc(&tmp_histogram[512 + (uint)indx_z]);
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // copy the partial histogram to the slot of this work-group
    for (i = tid; i < bins_total; i += local_size)
        histogram[group_indx + i] = tmp_histogram[i];
}

// Pass 2 of the two-pass image histogram: sums the per-work-group partial
// histograms into the final one.
//
// partial_histogram: num_groups consecutive slots of 256*3 counters.
// num_groups:        number of slots; slot 0 is always read, as before.
// histogram:         output, 256*3 counters.
//
// Work-item `tid` owns counter `tid` of the result.  No work-item ever reads
// another work-item's value, so the sum is kept in a private variable
// instead of a __local array.
__kernel void histogram_global(global uint *partial_histogram, int num_groups, global uint *histogram)
{
    const int   bins_total = 256 * 3;
    int         tid = (int)get_global_id(0);
    int         g;
    uint        sum;

    // ignore surplus work-items if the launch was padded past 768
    if (tid >= bins_total)
        return;

    sum = partial_histogram[tid];
    for (g = 1; g < num_groups; g++)
        sum += partial_histogram[g * bins_total + tid];

    histogram[tid] = sum;
}

但出来结果不对：将rgb的值打印出来又是无意义的值！？？？

好烦躁啊！！！！！！！！！！！！！！！！

书上的案例部分代码晦涩难懂，大神给我的建议是 “kernel最快的应当直接使用buffer读取的，用image, 然后做一个空间方块型的读取，用来做直方图统计，这无意义。因为最终结果只会这些像素点本身有关，而和你是否按照特定的顺序读取无关。使用image额外增加了创建成image的数据格式，和读取时候的代价。而且使用临近的方块区域读取，因为图像本身的性质，很可能像素值接近甚至相同，这增加了在__local上进行原子+1统计时候的冲突风险。从而会降低性能。而直接简单平铺work-items, 例如按行读取，不仅具有访存上的优势（例如，减少了刚才说的转换成image的后备存储格式（你的host上将图像转换为cl_mem的那个wrapper函数）的代价），而且还可以尽量降低像素值上的相关性。避免对同一个__local的bank的访问，从而可能的提高性能。第二个kernel建立了__local上的数组，然后每个线程只使用其中的独一无二的一个元素。这毫无意义。这种线程间完全无交流的，却使用了共享的__local数组，除了用来迷惑，毫无用途。建议修正。” 也就是说，书上的这个案例不适合照着学。还是按照 https://chenxiaowei.gitbooks.io/heterogeneous-computing-with-opencl2-0/content/content/chapter4/4.2-chinese.html 上的例子来，亲测通过，虽然它计算的只是一幅灰度图的直方图。
对于这个正确书写的kernel：

#define HIST_BINS 256
__kernel void histogram(__global int *data,int  numData, __global int *histogram){

  /* One partial histogram per work-group, shared by its work-items. */
  __local int localHistorgram[HIST_BINS];

  const int lid = get_local_id(0);
  int bin;
  int pos;

  /* Step 1: zero the local histogram.  Work-item lid clears bins
   * lid, lid + local_size, lid + 2*local_size, ... */
  bin = lid;
  while (bin < HIST_BINS){
    localHistorgram[bin] = 0;
    bin += get_local_size(0);
  }

  /* Every store of step 1 must be finished before anyone starts counting. */
  barrier(CLK_LOCAL_MEM_FENCE);

  /* Step 2: count.  Work-item gid handles elements
   * gid, gid + global_size, gid + 2*global_size, ... of data. */
  pos = get_global_id(0);
  while (pos < numData){
    atomic_add(&(localHistorgram[data[pos]]), 1);
    pos += get_global_size(0);
  }

  /* Every increment of step 2 must be finished before the result is read. */
  barrier(CLK_LOCAL_MEM_FENCE);

  /* Step 3: add the local histogram of this work-group
   * into the global histogram. */
  bin = lid;
  while (bin < HIST_BINS){
    atomic_add(&(histogram[bin]), localHistorgram[bin]);
    bin += get_local_size(0);
  }
}

我之前是这样理解的：

所以我怎么也想不明白：对于globalsize=1024，在第一个for循环清零后，其实只有4个局部结果被清零；而这个程序的本意是每个工作组有一个局部直方图结果，可是现在只清零了4个，而一共有16个工作组啊？！我之前一直卡在这里。

后来通过和一个CUDA大神讨论 @UFO&ET 恍然明白，原来是下面这样：

其实就是对于第一个工作组的256个结果是由第一个工作组中的每1个人邀请自己后面对应位置的3个人去给256数组的对应位置清零的。对于第二个工作组的256个结果，是由第二个工作组中的每1个人去邀请后面对应位置的3个伙伴去给第二个工作组家的256数组对应位置清零。其实对于第二个工作组中每个人都给自己和前一个工作组一共帮了两次忙。一个64大小的workgroup做完其实结果就是清零了一个localHistogram。那么16个工作组肯定清零16个256，而不是像我上面画的“就地” In-place的关系不是在那个位置上。

上面是多年前的错误表述，不要当真，一直没去改，今天导致网友疑惑来改下，下面5点是现在的正确表述：

灰度图kernel直方图统计功能：
1，灰度图data的总元素个数是numData=rows * cols （假设numData=1024*3=3072，想象成这是排列整齐的rowsxcols摆在样品区地上的砖，每块砖有256种颜色）
2，搬砖工厂里总共1024个工人，分成了16个小组，每个小组64个人，共同完成上面的搬砖工作（将样品区的砖搬到工厂老总办公室并按相同颜色摆好）。
3，老总办公室距离样品区太远，所以每个组内有个临时的小推车（localHist，里面肯定也是256个位置用来放不同颜色的砖），每个组先将搬砖的结果放进自己的小推车，然后再去老总办公室（globalHist）摆砖；
4，第一个for循环（局部变量清零，即将组内唯一的小推车打扫干净）：
	对于第i组group_i的64个人（姓名编号lid=0~63），lid=0的人打扫小推车localHist的lid即0位置；同时lid=1的人打扫小推车localHist的lid即1位置...同时lid=63的人打扫小推车localHist的lid即63位置；
	那么第i组的人第一轮打扫小推车并没打扫干净，只打扫了0~63位置；
	
	于是第i组lid=0这个人继续打扫小推车localHist的lid+64即64位置；同时lid=1这个人继续打扫小推车localHist的lid+64即65位置...同时lid=63这个人继续打扫小推车localHist的lid+64即127位置；
	结果第i组的人发现第二轮打扫小推车也没打扫干净，目前才到127位置；
	
	于是第i组lid=0这个人继续打扫小推车localHist的lid+128即128位置；同时lid=1这个人继续打扫小推车localHist的lid+128即129位置...同时lid=63这个人继续打扫小推车localHist的lid+128即191位置；
	结果第i组的人发现第三轮打扫小推车也没打扫干净，目前才到191位置；
	
	于是第i组lid=0这个人继续打扫小推车localHist的lid+192即192位置；同时lid=1这个人继续打扫小推车localHist的lid+192即193位置...同时lid=63这个人继续打扫小推车localHist的lid+192即255位置；
	第i组的人终于清理完了自己的小推车localHist；

	所有小组的人都如第i组的工人一样清理自己组内的小推车，那么16个小组的小推车都已清理完毕（第一个for结束）
5，后面的步骤没有网友疑惑，懒得打了

其实localHistogram这个变量可能包括后面的data变量和全局结果Histogram变量都只是借助每个workitem的ID 才完成自己的计算。总算明白了，我是个对“抽象”看不见摸不着的东西反应比较慢理解不太容易的人！这个问题我真的想了挺久的。想明白以后才觉得自己竟然思考这么简单的问题思考了这么久，蠢得跟猪一样，难怪问问题会被大神们鄙视。说真的当理解时就觉得简单，不理解时真的觉得难。

谁来做这个计算，就下标是谁。localHistogram 下标用每个工作项的ID 而不是直接像我以前一样用0---255 ，因为以前是串行！
那个大神画的这幅图更清晰：

向下到第16次！

至于后两个for循环我也理解了：

另外对于事件机制同步点那里非常感谢这个大神http://blog.csdn.net/bob_dong/article/details/70172165#reply 写了这篇文章，我懂了很多。如果用clWaitForEvents，那么host端的后续程序要等clWaitForEvents规定的那些事件完成后才能往下执行，如果用clEnqueueWaitForEvents那么queue中的后续命令要等待clEnqueueWaitForEvents之前的所有命令执行完，但host端不用等待，可以与clEnqueueWaitForEvents之前的命令同时运行。

尝试着自己仿照写了一个rgb彩图的直方图计算：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <CL/cl.h>
#include "gFreeImage.h"
static const int HIST_BINS = 256;
void check(cl_int status);
char* readFile(const char *filename);
// Host program for the first RGB histogram attempt: uploads the image bytes
// (4 per pixel), runs the "histogramforRGB" kernel with 1024 work-items in
// groups of 64, reads the HIST_BINS*3 counters back and prints them.
int main()
{
	int imageCols, imageRows;
	gFreeImage img;
	int readflag=img.LoadImage("/home/jumper/OpenCL_projects/Book_ch14_Histogram/lenna.jpg");
	(void)readflag;	// NOTE(review): meaning of the return value is not visible here; the data pointer is checked instead
	unsigned char *hInputImage = img.getImageData(imageCols,imageRows);
	if (!hInputImage || imageCols <= 0 || imageRows <= 0) {
		printf("cannot load input image!\n");
		exit(-1);
	}
	const int imageElements = imageRows*imageCols;
	const size_t imageSize = imageElements*sizeof(unsigned char)*4;
   const int histogramSize = HIST_BINS*sizeof(int)*3;
   int *hOutputHistogram = (int*)malloc(histogramSize);
   if (!hOutputHistogram) { exit(-1); }

   cl_int status;
   cl_platform_id platform;
   status = clGetPlatformIDs(1, &platform, NULL);
   check(status);
   cl_device_id device;
   status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
   check(status);
   cl_context context;
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
   check(status);
   cl_command_queue cmdQueue=clCreateCommandQueue(context, device, 0, &status);
   check(status);

   cl_mem  bufInputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, imageSize, NULL,&status);
   check(status);
   // The kernel updates this buffer with atomic_add, i.e. it reads and writes
   // it, so it must not be created write-only.
   cl_mem bufOutputHistogram = clCreateBuffer(context, CL_MEM_READ_WRITE,histogramSize, NULL, &status);
   check(status);
   status = clEnqueueWriteBuffer(cmdQueue, bufInputImage, CL_TRUE, 0, imageSize,(void*)hInputImage, 0, NULL, NULL);
   check(status);
   // The kernel only adds to the output, so it has to start from zero.
   int zero = 0;
   status = clEnqueueFillBuffer(cmdQueue, bufOutputHistogram, &zero,sizeof(int), 0, histogramSize, 0, NULL, NULL);
   check(status);

   /* Create a program with source code */
   char *programSource = readFile("/home/jumper/OpenCL_projects/Book_ch14_Histogram/rgbHistogram.cl");
   if (!programSource) {
	   printf("cannot read kernel source!\n");
	   exit(-1);
   }
   size_t programSourceLen = strlen(programSource);
   cl_program program = clCreateProgramWithSource(context, 1,(const char**)&programSource, &programSourceLen, &status);
   check(status);
   status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
   if (status != CL_SUCCESS) {
	   printf("cannot build program successfully!\n");
	   size_t logSize = 0;
	   status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &logSize);
	   char *log = (char*)malloc(logSize + 1);
	   if (log) {
		   status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,logSize, log, NULL);
		   log[logSize] = '\0';
		   printf("%s\n", log);
		   free(log);
	   }
      exit(-1);
   }
   cl_kernel kernel;
   kernel = clCreateKernel(program, "histogramforRGB", &status);
   check(status);
   status  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufInputImage);
   status |= clSetKernelArg(kernel, 1, sizeof(int), &imageElements);
   status |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufOutputHistogram);
   check(status);
   size_t globalWorkSize[1];
   globalWorkSize[0] = 1024;
   size_t localWorkSize[1];
   localWorkSize[0] = 64;
   status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL,globalWorkSize, localWorkSize, 0, NULL, NULL);
   check(status);
   status = clEnqueueReadBuffer(cmdQueue, bufOutputHistogram, CL_TRUE, 0,histogramSize, hOutputHistogram, 0, NULL, NULL);
   check(status);
   // NOTE(review): this walks only the first HIST_BINS entries in steps of
   // three and labels consecutive entries R, G, B; the remaining
   // 2*HIST_BINS entries are never printed.  Kept as in the original because
   // the article goes on to discuss why this attempt gives wrong output.
   for (int i = 0; i < HIST_BINS; i+=3) {
	    printf("histogram%d R:%d G:%d B:%d \n",i,hOutputHistogram[i],hOutputHistogram[1+i],hOutputHistogram[2+i]);
     }
   clReleaseKernel(kernel);
   clReleaseProgram(program);
   clReleaseCommandQueue(cmdQueue);
   clReleaseMemObject(bufInputImage);
   clReleaseMemObject(bufOutputHistogram);
   clReleaseContext(context);
   free(hOutputHistogram);
   free(programSource);
   return 0;
}

kernel函数部分：

// First attempt at an RGB histogram kernel.
//
// data:      input bytes, indexed 0 .. numData*3-1 by this kernel.
// numData:   element count supplied by the host.
// histogram: output counters, HIST_BINS*3 ints, updated with atomic_add.
//
// NOTE(review): every input byte is counted into localHistorgram[data[i]],
// i.e. into entries 0..255 only; entries 256..767 are cleared and added to
// the output but never incremented, so the channels are not separated.
// NOTE(review): the host allocates 4 bytes per pixel, so a bound of
// numData*3 presumably leaves the last quarter of the input unread - confirm
// against the host code.
#define HIST_BINS 256
__kernel void histogramforRGB(__global unsigned char *data,int  numData, __global int *histogram){
  // one set of HIST_BINS*3 counters per work-group
  __local int localHistorgram[HIST_BINS*3];
  int lid = get_local_id(0);

  // Earlier variant of the clearing loop, kept by the author for comparison:
  //for (int i = lid; i < HIST_BINS; i += get_local_size(0)){
  //  localHistorgram[i] = 0;
  //  localHistorgram[256+i] = 0;
  //  localHistorgram[512+i] = 0;
  //}
  // The author reports that the single loop below is faster.
  // Work-item lid clears entries lid, lid + local_size, lid + 2*local_size, ...
  for (int i = lid; i < HIST_BINS*3; i += get_local_size(0)){
    localHistorgram[i] = 0;
  }
  // all clears must be finished before counting starts
  barrier(CLK_LOCAL_MEM_FENCE);

  // Work-item gid visits bytes gid, gid + global_size, gid + 2*global_size, ...
  int gid = get_global_id(0);
  for (int i = gid; i < numData*3; i += get_global_size(0)){
    atomic_add(&(localHistorgram[data[i]]), 1);
  }
  // all increments must be finished before the local counters are read
  barrier(CLK_LOCAL_MEM_FENCE);

  // add this work-group's counters into the global result
  for (int i = lid; i < HIST_BINS*3; i += get_local_size(0)){
    atomic_add(&(histogram[i]), localHistorgram[i]);
  }
}

结果：

怎么办我不知道我写的这个正确与否。

经过大神提示：http://bbs.gpuworld.cn/forum.php?mod=viewthread&tid=10651&page=1#pid19338 ，我意识到了问题所在：首先，图像数据data是按RGBARGBARGBA...这样排列的，每个像素占4个字节；但我第二个for循环只遍历了前numData*3个字节，剩下的1/4数据根本没有统计；而且我把连续的3个字节（RGB、ARG、BAR、GBA……）当成了我想要的一个像素的R、G、B，这是错的。

二、自己修改后：

kernel函数：

// Second attempt at an RGB histogram kernel: the counters are enlarged to
// HIST_BINS*4 and all numData*4 input bytes are visited.
//
// data:      input bytes, indexed 0 .. numData*4-1.
// numData:   element count supplied by the host.
// histogram: output counters, HIST_BINS*4 ints, updated with atomic_add.
//
// NOTE(review): the counter index is still data[i] alone, so every byte,
// whatever its position in the input, is counted into entries 0..255;
// entries 256..1023 are cleared and added to the output but never
// incremented.
#define HIST_BINS 256

__kernel void histogramforRGB(__global unsigned char *data,int  numData, __global int *histogram){

  // one set of HIST_BINS*4 counters per work-group
  __local int localHistorgram[HIST_BINS*4];
  int lid = get_local_id(0);

  // Work-item lid clears entries lid, lid + local_size, lid + 2*local_size, ...
  for (int i = lid; i < HIST_BINS*4; i += get_local_size(0)){
    localHistorgram[i] = 0;
  }
  // all clears must be finished before counting starts
  barrier(CLK_LOCAL_MEM_FENCE);

  // Work-item gid visits bytes gid, gid + global_size, gid + 2*global_size, ...
  int gid = get_global_id(0);
  for (int i = gid; i < numData*4; i += get_global_size(0)){
    atomic_add(&(localHistorgram[data[i]]), 1);
  }
  // all increments must be finished before the local counters are read
  barrier(CLK_LOCAL_MEM_FENCE);

  // add this work-group's counters into the global result
  for (int i = lid; i < HIST_BINS*4; i += get_local_size(0)){
    atomic_add(&(histogram[i]), localHistorgram[i]);
  }
}

host端也要修改一点：

总共两处地方，第一个是kernel的直方图结果拷贝回host端的变量大小：

// size in bytes of the host/device histogram: HIST_BINS*4 int counters
const int histogramSize = HIST_BINS*sizeof(int)*4;

还有就是打印那里：

// Prints one line per group of four consecutive counters, labelling the
// first three R, G and B; the fourth counter of each group is not printed.
// j numbers the printed lines from 0.
int j=0;
   for (int i = 0; i < HIST_BINS*4; i+=4) {
	    printf("histogram%d R:%d G:%d B:%d \n",j,hOutputHistogram[i],hOutputHistogram[1+i],hOutputHistogram[2+i]);
	    j++;
   }

但是结果看起来怪怪的！

这个是错的：因为：第二个for循环那里就开始加混乱了：

正确的是下面的：

#define HIST_BINS 256

// RGB histogram with one output array per channel.
//
// data:       input bytes, indexed 0 .. numData*4-1; the byte at index i is
//             counted as R when i%4 == 0, G when i%4 == 1, B when i%4 == 2,
//             and skipped when i%4 == 3.
// numData:    element count supplied by the host.
// histogramR/G/B: HIST_BINS counters each, updated with atomic_add.
__kernel void histogramforRGB(__global unsigned char *data,int  numData, __global int *histogramR, __global int *histogramG, __global int *histogramB){

  // per-work-group partial histograms, one per channel
  __local int localHistorgramR[HIST_BINS];
  __local int localHistorgramG[HIST_BINS];
  __local int localHistorgramB[HIST_BINS];
  const int lid = get_local_id(0);

  // Work-item lid clears bins lid, lid + local_size, ... of all three arrays.
  for (int bin = lid; bin < HIST_BINS; bin += get_local_size(0)){
    localHistorgramR[bin] = 0;
    localHistorgramG[bin] = 0;
    localHistorgramB[bin] = 0;
  }
  // all clears must be finished before counting starts
  barrier(CLK_LOCAL_MEM_FENCE);

  // Work-item gid visits bytes gid, gid + global_size, ...; the position of
  // a byte modulo 4 selects the channel it is counted in.
  const int gid = get_global_id(0);
  for (int idx = gid; idx < numData*4; idx += get_global_size(0)){
    switch (idx % 4)
    {
      case 0:
        atomic_add(&(localHistorgramR[data[idx]]), 1);
        break;
      case 1:
        atomic_add(&(localHistorgramG[data[idx]]), 1);
        break;
      case 2:
        atomic_add(&(localHistorgramB[data[idx]]), 1);
        break;
      default:
        // fourth byte of each group: not counted
        break;
    }
  }
  // all increments must be finished before the local counters are read
  barrier(CLK_LOCAL_MEM_FENCE);

  // add this work-group's counters into the three global results
  for (int bin = lid; bin < HIST_BINS; bin += get_local_size(0)){
    atomic_add(&(histogramR[bin]), localHistorgramR[bin]);
    atomic_add(&(histogramG[bin]), localHistorgramG[bin]);
    atomic_add(&(histogramB[bin]), localHistorgramB[bin]);
  }
}

host端的：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
/* OpenCL includes */
#include <CL/cl.h>
#include "gFreeImage.h"

static const int HIST_BINS = 256;

void check(cl_int status);
char* readFile(const char *filename);

int main()
{
	int imageCols, imageRows;
	gFreeImage img;
	int readflag=img.LoadImage("/home/jumper/OpenCL_projects/Book_ch14_Histogram/lenna.jpg");
	unsigned char *hInputImage = img.getImageData(imageCols,imageRows);
	const int imageElements = imageRows*imageCols;
	const size_t imageSize = imageElements*sizeof(unsigned char)*4;

   const int histogramSize = HIST_BINS*sizeof(int);
   int *hOutputHistogramR = (int*)malloc(histogramSize);
   int *hOutputHistogramG = (int*)malloc(histogramSize);
   int *hOutputHistogramB = (int*)malloc(histogramSize);

   cl_int status;
   cl_platform_id platform;
   status = clGetPlatformIDs(1, &platform, NULL);
   check(status);
   cl_device_id device;
   status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
   check(status);
   cl_context context;
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
   check(status);
   cl_command_queue cmdQueue=clCreateCommandQueue(context, device, 0, &status);
   check(status);

   cl_mem  bufInputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, imageSize, NULL,&status);
   check(status);
   cl_mem bufOutputHistogramR = clCreateBuffer(context, CL_MEM_WRITE_ONLY,histogramSize, NULL, &status);
   cl_mem bufOutputHistogramG = clCreateBuffer(context, CL_MEM_WRITE_ONLY,histogramSize, NULL, &status);
   cl_mem bufOutputHistogramB = clCreateBuffer(context, CL_MEM_WRITE_ONLY,histogramSize, NULL, &status);
   check(status);
   status = clEnqueueWriteBuffer(cmdQueue, bufInputImage, CL_TRUE, 0, imageSize,(void*)hInputImage, 0, NULL, NULL);
   check(status);
   int zero = 0;
   status = clEnqueueFillBuffer(cmdQueue, bufOutputHistogramR, &zero,sizeof(int), 0, histogramSize, 0, NULL, NULL);
   status = clEnqueueFillBuffer(cmdQueue, bufOutputHistogramG, &zero,sizeof(int), 0, histogramSize, 0, NULL, NULL);
   status = clEnqueueFillBuffer(cmdQueue, bufOutputHistogramB, &zero,sizeof(int), 0, histogramSize, 0, NULL, NULL);
   check(status);

   /* Create a program with source code */
   char *programSource = readFile("/home/jumper/OpenCL_projects/Book_ch14_Histogram/rgbHistogram.cl");
   size_t programSourceLen = strlen(programSource);
   cl_program program = clCreateProgramWithSource(context, 1,(const char**)&programSource, &programSourceLen, &status);
   check(status);
   status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
   //std::cout<<status<<std::endl;
   if (status != CL_SUCCESS) {
	   printf("cannot build program successfully!\n");
	   size_t logSize;
	  status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &logSize);
	  char *log = (char*)malloc(logSize);
	  status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,logSize, log, NULL);
	  printf("%s\n", log);
      exit(-1);
   }


   cl_kernel kernel;
   kernel = clCreateKernel(program, "histogramforRGB", &status);
   check(status);
   status  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufInputImage);
   status |= clSetKernelArg(kernel, 1, sizeof(int), &imageElements);
   status |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufOutputHistogramR);
   status |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufOutputHistogramG);
   status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &bufOutputHistogramB);
   check(status);

   size_t globalWorkSize[1];
   globalWorkSize[0] = 1024;
   size_t localWorkSize[1];
   localWorkSize[0] = 64;
   status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL,globalWorkSize, localWorkSize, 0, NULL, NULL);
   check(status);

   status = clEnqueueReadBuffer(cmdQueue, bufOutputHistogramR, CL_TRUE, 0,histogramSize, hOutputHistogramR, 0, NULL, NULL);
   status = clEnqueueReadBuffer(cmdQueue, bufOutputHistogramG, CL_TRUE, 0,histogramSize, hOutputHistogramG, 0, NULL, NULL);
   status = clEnqueueReadBuffer(cmdQueue, bufOutputHistogramB, CL_TRUE, 0,histogramSize, hOutputHistogramB, 0, NULL, NULL);
   check(status);
   for (int i = 0; i < HIST_BINS; i++) {
	    printf("histogram%d R:%d G:%d B:%d\n",i,hOutputHistogramR[i],hOutputHistogramG[i],hOutputHistogramB[i]);
     }

   clReleaseKernel(kernel);
   clReleaseProgram(program);
   clReleaseCommandQueue(cmdQueue);
   clReleaseMemObject(bufInputImage);
   clReleaseMemObject(bufOutputHistogramR);
   clReleaseMemObject(bufOutputHistogramG);
   clReleaseMemObject(bufOutputHistogramB);
   clReleaseContext(context);
   free(hOutputHistogramR);
   free(hOutputHistogramG);
   free(hOutputHistogramB);
   free(programSource);

   return 0;
}

结果：

可以告一段落了。

或者像大神所说这样写也行： http://bbs.gpuworld.cn/forum.php?mod=viewthread&tid=10651&extra=

今天改用OpenCV读图重写了一遍（OpenCV的Mat按B、G、R顺序存放，每个像素3个字节，所以kernel里改成了numData*3和i%3），还加上了与CPU结果的对比，结果正确：

#define HIST_BINS 256  
  
/*
 * Per-channel histogram over a byte buffer laid out as 3 bytes per element.
 *
 * data        input bytes; indices 0 .. numData*3-1 are scanned
 * numData     number of 3-byte elements in data
 * histogramR  output, HIST_BINS counters fed by bytes at index % 3 == 2
 * histogramG  output, HIST_BINS counters fed by bytes at index % 3 == 1
 * histogramB  output, HIST_BINS counters fed by bytes at index % 3 == 0
 *
 * Each work-group accumulates into __local counters first and then adds them
 * onto the global outputs, so the outputs must already hold the desired
 * starting values before launch.
 */
__kernel void histogramforRGB(__global unsigned char *data,int  numData, __global int *histogramR, __global int *histogramG, __global int *histogramB){

  __local int binsR[HIST_BINS];
  __local int binsG[HIST_BINS];
  __local int binsB[HIST_BINS];

  const int lid = get_local_id(0);
  const size_t groupStride = get_local_size(0);
  const size_t globalStride = get_global_size(0);
  const int byteCount = numData*3;

  /* Work-items of the group share the job of clearing the local counters. */
  for (int bin = lid; bin < HIST_BINS; bin += groupStride){
    binsR[bin] = 0;
    binsG[bin] = 0;
    binsB[bin] = 0;
  }
  barrier(CLK_LOCAL_MEM_FENCE);

  /* Exactly one case matches for any byte position, so a switch is equivalent
     to testing the three remainders one after another. */
  for (int pos = get_global_id(0); pos < byteCount; pos += globalStride){
    switch (pos % 3){
    case 0:
      atomic_add(&(binsB[data[pos]]), 1);
      break;
    case 1:
      atomic_add(&(binsG[data[pos]]), 1);
      break;
    case 2:
      atomic_add(&(binsR[data[pos]]), 1);
      break;
    default:
      break;
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);

  /* Fold this group's local counters into the global histograms. */
  for (int bin = lid; bin < HIST_BINS; bin += groupStride){
    atomic_add(&(histogramR[bin]), binsR[bin]);
    atomic_add(&(histogramG[bin]), binsG[bin]);
    atomic_add(&(histogramB[bin]), binsB[bin]);
  }
}

main：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
/* OpenCL includes */
#include <CL/cl.h>
#include <opencv2/opencv.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>

using namespace cv;
using namespace std;
  
static const int HIST_BINS = 256;  
  
/*
 * Reports a failed OpenCL call.
 *
 * Prints the numeric status code to standard output when it differs from
 * CL_SUCCESS. The function only reports: it always returns to the caller and
 * does not terminate the program.
 */
void check(cl_int status){
	if(status==CL_SUCCESS)
		return;

	cout<<"error:status is "<<status<<endl;
}
  
int main()  
{  
    Mat img=imread("/home/jumper/OpenCL_projects/Book_ch14_Histogram/lenna.jpg");
    int imageRows=img.rows;
    int imageCols=img.cols;

    //CPU result,reconfirm the GPU result...
    int rgbhistogram[256*3];
    memset(rgbhistogram,0,256*3*sizeof(int));
    for(int i=0;i<imageRows;i++)
    {
    	uchar *rowdatas=img.ptr<uchar>(i);
    	for(int j=0;j<imageCols*3;j+=3)
    	{
    		int r=rowdatas[j+2];
    		int g=rowdatas[j+1];
    		int b=rowdatas[j];
    		rgbhistogram[r*3]+=1;
    		rgbhistogram[g*3+1]+=1;
    		rgbhistogram[b*3+2]+=1;
    	}
    }


    const int imageElements = imageRows*imageCols;  
    const size_t imageSize = imageElements*sizeof(unsigned char)*3;
  
   const int histogramSize = HIST_BINS*sizeof(int);  
   int *hOutputHistogramR = (int*)malloc(histogramSize);  
   int *hOutputHistogramG = (int*)malloc(histogramSize);  
   int *hOutputHistogramB = (int*)malloc(histogramSize);  
  
   cl_int status;  
   cl_platform_id platform;  
   status = clGetPlatformIDs(1, &platform, NULL);  
   check(status);  
   cl_device_id device;  
   status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);  
   check(status);  
   cl_context context;  
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);  
   check(status);  
   cl_command_queue cmdQueue=clCreateCommandQueue(context, device, 0, &status);  
   check(status);  
  
   cl_mem  bufInputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, imageSize, NULL,&status);  
   check(status);  
   cl_mem bufOutputHistogramR = clCreateBuffer(context, CL_MEM_WRITE_ONLY,histogramSize, NULL, &status);  
   cl_mem bufOutputHistogramG = clCreateBuffer(context, CL_MEM_WRITE_ONLY,histogramSize, NULL, &status);  
   cl_mem bufOutputHistogramB = clCreateBuffer(context, CL_MEM_WRITE_ONLY,histogramSize, NULL, &status);  
   check(status);  
   status = clEnqueueWriteBuffer(cmdQueue, bufInputImage, CL_TRUE, 0, imageSize,img.data, 0, NULL, NULL);
   check(status);  
   int zero = 0;  
   status = clEnqueueFillBuffer(cmdQueue, bufOutputHistogramR, &zero,sizeof(int), 0, histogramSize, 0, NULL, NULL);  
   status = clEnqueueFillBuffer(cmdQueue, bufOutputHistogramG, &zero,sizeof(int), 0, histogramSize, 0, NULL, NULL);  
   status = clEnqueueFillBuffer(cmdQueue, bufOutputHistogramB, &zero,sizeof(int), 0, histogramSize, 0, NULL, NULL);  
   check(status);  
  
   /* Create a program with source code */  
   std::ifstream srcFile("/home/jumper/OpenCL_projects/Book_ch14_Histogram/kernel.cl");
   std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));
   const char * src = srcProg.c_str();
   size_t length = srcProg.length();
   cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);
   status=clBuildProgram(program,1,&device,NULL,NULL,&status);
   if (status != CL_SUCCESS)
	{
		return(EXIT_FAILURE);
	}

  
   cl_kernel kernel;  
   kernel = clCreateKernel(program, "histogramforRGB", &status);
   check(status);  
   status  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufInputImage);  
   status |= clSetKernelArg(kernel, 1, sizeof(int), &imageElements);  
   status |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufOutputHistogramR);  
   status |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufOutputHistogramG);  
   status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &bufOutputHistogramB);  
   check(status);  
  
   size_t globalWorkSize[1];  
   globalWorkSize[0] = 1024;  
   size_t localWorkSize[1];  
   localWorkSize[0] = 64;  
   status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL,globalWorkSize, localWorkSize, 0, NULL, NULL);  
   check(status);  
  
   status = clEnqueueReadBuffer(cmdQueue, bufOutputHistogramR, CL_TRUE, 0,histogramSize, hOutputHistogramR, 0, NULL, NULL);  
   status = clEnqueueReadBuffer(cmdQueue, bufOutputHistogramG, CL_TRUE, 0,histogramSize, hOutputHistogramG, 0, NULL, NULL);  
   status = clEnqueueReadBuffer(cmdQueue, bufOutputHistogramB, CL_TRUE, 0,histogramSize, hOutputHistogramB, 0, NULL, NULL);  
   check(status);  
   for (int i = 0; i < HIST_BINS; i++) {  
        printf("histogram %d  OpenCL-result:R:%d G:%d B:%d---CPU-result:R:%d G:%d B:%d\n",i,hOutputHistogramR[i],hOutputHistogramG[i],hOutputHistogramB[i],rgbhistogram[i*3],rgbhistogram[i*3+1],rgbhistogram[i*3+2]);
     }  
  
   clReleaseKernel(kernel);  
   clReleaseProgram(program);  
   clReleaseCommandQueue(cmdQueue);  
   clReleaseMemObject(bufInputImage);  
   clReleaseMemObject(bufOutputHistogramR);  
   clReleaseMemObject(bufOutputHistogramG);  
   clReleaseMemObject(bufOutputHistogramB);  
   clReleaseContext(context);  
   free(hOutputHistogramR);  
   free(hOutputHistogramG);  
   free(hOutputHistogramB);  
   //free(programSource);
  
   return 0;  
}

结果一致：