讨论后自己写了一个腐蚀膨胀,开操作:
一、最开始的C++版本windows
int myopen(Mat bwsrc, Mat &dstimg2, int kernelwidth = 2)
{
int comparerows = bwsrc.rows;
int comparecols = bwsrc.cols;
//do erode...
Mat dstimg(bwsrc.size(), CV_8UC1, Scalar(0));
for (int i = 0; i < comparerows-1; i ++)
{
uchar *currentRow = bwsrc.ptr<uchar>(i);
uchar *nextRow = bwsrc.ptr<uchar>(i + 1);
for (int j = 0; j < comparecols-1; j ++)
{
int up0 = currentRow[j];
int down0 = nextRow[j];
int up1 = currentRow[j + 1];
int down1 = nextRow[j + 1];
if (up0 == 255 && up1 == 255 && down0 == 255 && down1 == 255)
{
dstimg.ptr<uchar>(i)[j] = 255;
}
else
{
dstimg.ptr<uchar>(i)[j] = 0;
}
}
}
//do dilate...
//Mat dstimg2(bwsrc.size(), CV_8UC1, Scalar(0));
for (int i = 0; i < comparerows-1; i ++)
{
uchar *currentRow = dstimg.ptr<uchar>(i);
uchar *nextRow = dstimg.ptr<uchar>(i + 1);
for (int j = 0; j < comparecols-1; j ++)
{
int up0 = currentRow[j];
int down0 = nextRow[j];
int up1 = currentRow[j + 1];
int down1 = nextRow[j + 1];
if (up0 == 0 && up1 == 0 && down0 == 0 && down1 == 0)
{
dstimg2.ptr<uchar>(i)[j] = 0;
}
else
{
dstimg2.ptr<uchar>(i)[j] = 255;
}
}
}
//count non-zero points...
int non_zero_num = 0;
for (int i = 0; i < comparerows; i++)
{
uchar *currentrow = dstimg2.ptr<uchar>(i);
for (int j = 0; j < comparecols; j++)
{
int value = currentrow[j];
if (value != 0)
{
non_zero_num++;
}
}
}
return non_zero_num;
}
结果与opencv的一致。
二、OpenCL版本
1,本来我是这样规划的:
但这样写到一半时卡住了,因为始终无法解决“访存合并”的问题。。。
2,后来改了一种写法可以规避“访存合并”的问题:
a,昨晚就写好了,但运行时遇到个问题 简而言之就是:
这样写是OK的 会打印出problem?这个检测语句;但如果改成:
这样写就不行 NDRange会返回 -30!!!
详细来讲就是:这个开操作 我本来是这样写的:
__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows)
{
int mydstImg[1936*1456]={0};
int dstimg[1936*1456]={0};
int dstimg2[1936*1456]={0};
int secondTempSum[1456]={0};
for(uint i=get_global_id(1);i<myimgrows;i+=get_global_size(1))
{
for(uint j=get_global_id(0);j<myimgcols;j+=get_global_size(0))
{
int rowstart=i*myimgcols*3;
int tempb=currentImg[rowstart+j*3];
int tempg=currentImg[rowstart+j*3+1];
int tempr=currentImg[rowstart+j*3+2];
int rgbpixels=tempr+tempg*256+tempb*256*256;
uchar rgbelement=csvArray[rgbpixels];
if((int)rgbelement>0)
{
mydstImg[i*myimgcols+j]=255;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//do erode...
//__global uchar *dstimg (default is Scalar(0))
for(uint currentGroupID=get_global_id(1);currentGroupID<myimgrows-1;currentGroupID+=get_global_size(1))
{
uint rowposition=currentGroupID*myimgcols+get_global_id(0);
for(;rowposition<myimgcols-1;rowposition+=get_global_size(0))
{
__private int erodetempValues[4];
for(int t=0;t<2;t++)
{
erodetempValues[t]=mydstImg[rowposition+t*myimgcols];
}
for(int t=2;t<4;t++)
{
erodetempValues[t]=mydstImg[rowposition+(t-2)*myimgcols+1];
}
if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255)
{
dstimg[rowposition]=1;//if need the img ,it is should be changed to 255.
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
//do dilate...
for(int currentGroupID=get_group_id(1);currentGroupID<myimgrows-1;currentGroupID+=get_num_groups(1))
{
int rowposition=currentGroupID*myimgcols+get_global_id(0);
for(;rowposition<myimgcols-1;rowposition+=get_global_size(0))
{
__private int dilatetempValues[4];
for(int t=0;t<2;t++)
{
dilatetempValues[t]=dstimg[rowposition+t*myimgcols];
}
for(int t=2;t<4;t++)
{
dilatetempValues[t]=dstimg[rowposition+(t-2)*myimgcols+1];
}
if(dilatetempValues[0]==1 || dilatetempValues[1]==1 || dilatetempValues[2]==1 || dilatetempValues[3]==1)
{
dstimg2[rowposition]=1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
//if need the fluore-img ,you need return dstimg2 here.
//do count the fluore of the image after open operation...
//__global secondTempSum[myimgrows];
for(uint i=get_global_id(1);i<myimgrows;i+=get_global_size(1))
{
for(uint j=myimgcols/2;j>0;j/=2)
{
if(get_global_id(0)<j)
{
dstimg2[i*myimgcols+get_global_id(0)]+=dstimg2[i*myimgcols+get_global_id(0)+j];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(get_global_id(0)==0)
{
secondTempSum[i]=dstimg2[i*myimgcols];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
//the last Sum:secondTempSum[0]...
if(get_global_id(1)==0)
{
for(uint j=myimgrows/2;j>0;j/=2)
{
if(get_global_id(0)<j)
{
secondTempSum[get_global_id(0)]+=secondTempSum[get_global_id(0)+j];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
//if(get_global_id(0)==0)
//{
// fluorecountResult=secondTempSum[0];
//}
}
if(get_global_id(0)==0 && get_global_id(1)==0)
{
printf("fluore points of the image: %d \n",secondTempSum[0]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
但在腐蚀中这句就会报 -30的错:
if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255)
{
dstimg[rowposition]=1;
}
百思不得其解?
3,大神说我的腐蚀部分对于一个group需要的只是两行的数据而已,故可以更简单更快:
按照他的建议,我写成了:
//size_t localsize[2]={1024,1};
// size_t globalsize[2]={1024,1024};
__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows,__global int *tempSum)
{
__local int mydstImg[1936*3];
__local int erodeImg[1936*2]; //={0}; //local variable can not be inited like this....
__local int dilateImg[1936]; //={0};
//__global int tempSum[1455]; //exclusive the last row...
int currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
for(;currentGroupID<myimgrows-1;currentGroupID+=get_num_groups(0)*get_num_groups(1))
{
//step1: calculate the currentImg's result:mydstImg for every group,according to csvArray...
for(uint i=get_local_id(0);i<myimgcols*3;i+=get_local_size(0))
{
int tempb=currentImg[currentGroupID*myimgcols+i*3];
int tempg=currentImg[currentGroupID*myimgcols+i*3+1];
int tempr=currentImg[currentGroupID*myimgcols+i*3+2];
int rgbpixels=tempr+tempg*256+tempb*256*256;
uchar rgbelement=csvArray[rgbpixels];
if((int)rgbelement>0)
{
mydstImg[i]=255;
}
else
{
mydstImg[i]=0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//step2:erode...
for(uint j=get_local_id(0);j<myimgcols*2;j+=get_local_size(0))
{
if((j==myimgcols-1) || (j==myimgcols*2-1))
{
erodeImg[j]=0;
break; //??????????????????????
}
__private int erodetempValues[4]={0}; //private array can be inited like this??
for(int t=0;t<2;t++)
{
erodetempValues[t]=mydstImg[j+t*myimgcols];
}
for(int t=2;t<4;t++)
{
erodetempValues[t]=mydstImg[j+(t-2)*myimgcols+1];
}
if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255)
{
erodeImg[j]=1;
}
else
{
erodeImg[j]=0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//step3:dilate...
for(uint j=get_global_id(0);j<myimgcols;j+=get_local_size(0))
{
if(j==myimgcols-1)
{
dilateImg[j]=0;
break; //??????????????????????
}
__private int dilatetempValues[4]={0};
for(int t=0;t<2;t++)
{
dilatetempValues[t]=erodeImg[j+t*myimgcols];
}
for(int t=2;t<4;t++)
{
dilatetempValues[t]=erodeImg[j+(t-2)*myimgcols+1];
}
if(dilatetempValues[0]==1 || dilatetempValues[1]==1 || dilatetempValues[2]==1 || dilatetempValues[3]==1)
{
dilateImg[j]=1;
}
else
{
dilateImg[j]=0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//step4:count the every fluore sum of every row (in charge of every group)of the dilate image...
for(uint stride=myimgcols/2;stride>0;stride/=2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0)<stride)
{
dilateImg[get_local_id(0)]+=dilateImg[get_local_id(0)+stride];
}
}
if(get_local_id(0)==0)
{
tempSum[currentGroupID]=dilateImg[0];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE); //other groups wait for the working groups...//not CLK_LOCAL_MEM_FENCE????
//step5:count the last fluore sum...
if(get_global_id(1)==0)
{
for(uint j=(myimgrows-1)/2;j>0;j/=2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0)<j)
{
tempSum[get_local_id(0)]+=tempSum[get_local_id(0)+j];
}
}
if(get_local_id(0)==0)
{
printf("fluore points of the image: %d \n",tempSum[0]+tempSum[1454]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
但这个结果是无意义的数。。。我好像知道是哪里错了 修改后:
//size_t localsize[2]={1024,1};
// size_t globalsize[2]={1024,1024};
__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows,__global int *tempSum)
{
__local int mydstImg[1936*3];
__local int erodeImg[1936*2]; //={0}; //local variable can not be inited like this....
__local int dilateImg[1936]; //={0};
//__global int tempSum[1454]; //exclusive the last row...
int currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
for(;currentGroupID<myimgrows-3;currentGroupID+=get_num_groups(0)*get_num_groups(1))
{
//step1: calculate the currentImg's result:mydstImg for every group,according to csvArray...
for(uint i=get_local_id(0);i<myimgcols*3;i+=get_local_size(0))
{
int tempb=currentImg[currentGroupID*myimgcols+i*3];
int tempg=currentImg[currentGroupID*myimgcols+i*3+1];
int tempr=currentImg[currentGroupID*myimgcols+i*3+2];
int rgbpixels=tempr+tempg*256+tempb*256*256;
uchar rgbelement=csvArray[rgbpixels];
if((int)rgbelement>0)
{
mydstImg[i]=255;
}
else
{
mydstImg[i]=0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//step2:erode...
for(uint j=get_local_id(0);j<myimgcols*2;j+=get_local_size(0))
{
if((j==myimgcols-1) || (j==myimgcols*2-1))
{
erodeImg[j]=0;
break; //??????????????????????
}
__private int erodetempValues[4]={0}; //private array can be inited like this??
for(int t=0;t<2;t++)
{
erodetempValues[t]=mydstImg[j+t*myimgcols];
}
for(int t=2;t<4;t++)
{
erodetempValues[t]=mydstImg[j+(t-2)*myimgcols+1];
}
if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255)
{
erodeImg[j]=255;
}
else
{
erodeImg[j]=0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//step3:dilate...
for(uint j=get_local_id(0);j<myimgcols;j+=get_local_size(0))
{
if(j==myimgcols-1)
{
dilateImg[j]=0;
break; //??????????????????????
}
__private int dilatetempValues[4]={0};
for(int t=0;t<2;t++)
{
dilatetempValues[t]=erodeImg[j+t*myimgcols];
}
for(int t=2;t<4;t++)
{
dilatetempValues[t]=erodeImg[j+(t-2)*myimgcols+1];
}
if(dilatetempValues[0]==255 || dilatetempValues[1]==255 || dilatetempValues[2]==255 || dilatetempValues[3]==255)
{
dilateImg[j]=1;
}
else
{
dilateImg[j]=0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//step4:count the every fluore sum of every row (in charge of every group)of the dilate image...
for(uint stride=myimgcols/2;stride>0;stride/=2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0)<stride)
{
dilateImg[get_local_id(0)]+=dilateImg[get_local_id(0)+stride];
}
}
if(get_local_id(0)==0)
{
tempSum[currentGroupID]=dilateImg[0];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE); //other groups wait for the working groups...//not CLK_LOCAL_MEM_FENCE????
//step5:count the last fluore sum...tempSum[0]
if(get_global_id(1)==0)
{
for(uint j=1454/2;j>0;j/=2)
{
if(get_local_id(0)<j)
{
tempSum[get_local_id(0)]+=tempSum[get_local_id(0)+j];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
//if(get_local_id(0)==0)
//{
// printf("fluore-points-last: %d \n",tempSum[0]);
//}
//barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
main.cpp:
int main()
{
char filename[100];
cl_uint platformNum;
cl_int status;
status=clGetPlatformIDs(0,NULL,&platformNum);
if(status!=CL_SUCCESS){
printf("cannot get platforms number.\n");
return -1;
}
cl_platform_id* platforms;
platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);
status=clGetPlatformIDs(platformNum,platforms,NULL);
if(status!=CL_SUCCESS){
printf("cannot get platforms addresses.\n");
return -1;
}
cl_platform_id platformInUse=platforms[0];
cl_device_id device;
clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);
cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);
std::ifstream srcFile("/home/jumper/OpenCL_projects/FluoreTest4Channels/open_god.cl");
std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));
const char * src = srcProg.c_str();
size_t length = srcProg.length();
cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);
status=clBuildProgram(program,1,&device,NULL,NULL,NULL);
if (status != CL_SUCCESS)
{
shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);
oclLogBuildInfo(program, oclGetFirstDev(context));
oclLogPtx(program, oclGetFirstDev(context), "oclfluore.ptx");
return(EXIT_FAILURE);
}
//get the csv model from the disk.
const int rgbsize=256*256*256;
uchar* rgbarray = new uchar[rgbsize];
memset(rgbarray, 0, rgbsize * sizeof(uchar));
Ptr<ml::TrainData> ygdata=cv::ml::TrainData::loadFromCSV("/home/jumper/OpenCL_projects/FluoreTest4Channels/fluore_0728pm.csv",0,-2,0);
Mat csvimg2=ygdata->getSamples();
int csvrows=csvimg2.rows; //csv points
//cout<<"primer csvrows:"<<csvrows<<endl;
for (int j = 0; j < csvrows; j++)
{
float* pixeldata = csvimg2.ptr<float>(j);
float x = pixeldata[0];
float y = pixeldata[1];
float z = pixeldata[2];
int newindex = x + y * 256 + z * 256 * 256;
rgbarray[newindex] = 255;
}
TickMeter tm;
tm.start();
//get the src images from the disk.
int imgwidth,imgheight;
int ii=817;
sprintf(filename, "/home/jumper/OpenCL_projects/FluoreTest4Channels/silver-background0721/%d.bmp", ii);
Mat srcimg=imread(filename);
imgheight=srcimg.rows;
imgwidth=srcimg.cols;
int pixels=imgheight*imgwidth;
int srcdatasize=pixels*3*sizeof(uchar);
cl_mem rgbArray_buffer=clCreateBuffer(context,CL_MEM_READ_ONLY,rgbsize*sizeof(uchar),rgbarray,&status);
status = clEnqueueWriteBuffer(queue, rgbArray_buffer, CL_FALSE, 0, rgbsize * sizeof(uchar), rgbarray, 0, NULL, NULL);
cl_mem srcdata_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, srcdatasize, NULL,&status);
status = clEnqueueWriteBuffer(queue, srcdata_buffer, CL_FALSE, 0, srcdatasize, srcimg.data, 0, NULL, NULL);
size_t sumsize=1454*sizeof(int);
cl_mem sumArray_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sumsize, NULL,&status);
cl_kernel kernel_imgProc=clCreateKernel(program,"imgProcess",NULL);
status = clSetKernelArg(kernel_imgProc, 0, sizeof(cl_mem), (void*)&rgbArray_buffer);
status = clSetKernelArg(kernel_imgProc, 1, sizeof(cl_mem), (void*)&srcdata_buffer);
status = clSetKernelArg(kernel_imgProc, 2, sizeof(cl_int), &imgwidth);
status = clSetKernelArg(kernel_imgProc, 3, sizeof(cl_int), &imgheight);
status = clSetKernelArg(kernel_imgProc, 4, sizeof(cl_mem), (void*)&sumArray_buffer);
size_t localsize[2]={1024,1};
size_t globalsize[2]={1024,1024};
status =clEnqueueNDRangeKernel(queue, kernel_imgProc, 2, NULL, globalsize, localsize,0,NULL,NULL);
if (status != CL_SUCCESS)
{
cout<<"clEnqueueNDRangeKernel() failed..."<<status<<endl;
return(EXIT_FAILURE);
}
status=clFinish(queue);
if (status != CL_SUCCESS)
{
cout<<"clFinish() failed..."<<status<<endl;
return(EXIT_FAILURE);
}
int *sumMap=(int*)malloc(sumsize);
status=clEnqueueReadBuffer(queue,sumArray_buffer,CL_TRUE, 0,sumsize, sumMap, 0, NULL, NULL);
status=clFinish(queue);
if (status != CL_SUCCESS)
{
cout<<"clEnqueueReadBuffer() failed..."<<status<<endl;
return(EXIT_FAILURE);
}
cout<<"fluore result:"<<sumMap[0]<<endl;
tm.stop();
cout<<"count="<<tm.getCounter()<<" ,process time="<<tm.getTimeMilli()<<" ms."<<endl;
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseProgram(program);
clReleaseKernel(kernel_imgProc);
clReleaseMemObject(srcdata_buffer);
clReleaseMemObject(rgbArray_buffer);
delete [] rgbarray;
free(sumMap);
return 0;
}
但结果有点不稳定,出现过一次6,另外再试N次又都是稳定的1311????可能是我cl文件中疑惑的那处造成的?!先不管结果正不正确,比opencv的open()和在host端计算点数之和减速了1ms!!!
但这个不稳定就证明还有问题 我想想。。。
于是我准备将cl部分腐蚀和膨胀的图片传出来看看:
//size_t localsize[2]={1024,1};
// size_t globalsize[2]={1024,1024};
__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows,
__global int *tempSum,__global uchar *testErodeImg,__global uchar *testDilateImg)
{
__local int mydstImg[1936*3];
__local int erodeImg[1936*2]; //={0}; //local variable can not be inited like this....
__local int dilateImg[1936]; //={0};
//__global int tempSum[1454]; //exclusive the last row...
int currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
for(;currentGroupID<myimgrows-3;currentGroupID+=get_num_groups(0)*get_num_groups(1))
{
//step1: calculate the currentImg's result:mydstImg for every group,according to csvArray...
for(uint i=get_local_id(0);i<myimgcols*3;i+=get_local_size(0))
{
int tempb=currentImg[currentGroupID*myimgcols+i*3];
int tempg=currentImg[currentGroupID*myimgcols+i*3+1];
int tempr=currentImg[currentGroupID*myimgcols+i*3+2];
int rgbpixels=tempr+tempg*256+tempb*256*256;
uchar rgbelement=csvArray[rgbpixels];
if((int)rgbelement>0)
{
mydstImg[i]=255;
}
else
{
mydstImg[i]=0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//step2:erode...
for(uint j=get_local_id(0);j<myimgcols*2;j+=get_local_size(0))
{
if((j==myimgcols-1) || (j==myimgcols*2-1))
{
erodeImg[j]=0;
break; //????????
}
__private int erodetempValues[4]={0}; //private array can be inited like this??
for(int t=0;t<2;t++)
{
erodetempValues[t]=mydstImg[j+t*myimgcols];
}
for(int t=2;t<4;t++)
{
erodetempValues[t]=mydstImg[j+(t-2)*myimgcols+1];
}
if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255)
{
erodeImg[j]=255;
}
else
{
erodeImg[j]=0;
}
if(get_local_id(0)<myimgcols)
{
testErodeImg[currentGroupID*myimgcols+j]=erodeImg[j];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
//step3:dilate...
for(uint j=get_local_id(0);j<myimgcols;j+=get_local_size(0))
{
if(j==myimgcols-1)
{
dilateImg[j]=0;
break; //??????????????????????
}
__private int dilatetempValues[4]={0};
for(int t=0;t<2;t++)
{
dilatetempValues[t]=erodeImg[j+t*myimgcols];
}
for(int t=2;t<4;t++)
{
dilatetempValues[t]=erodeImg[j+(t-2)*myimgcols+1];
}
if(dilatetempValues[0]==255 || dilatetempValues[1]==255 || dilatetempValues[2]==255 || dilatetempValues[3]==255)
{
dilateImg[j]=255;
}
else
{
dilateImg[j]=0;
}
testDilateImg[currentGroupID*myimgcols+j]=dilateImg[j];
}
barrier(CLK_LOCAL_MEM_FENCE);
//step4:count the every fluore sum of every row (in charge of every group)of the dilate image...
for(uint stride=myimgcols/2;stride>0;stride/=2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0)<stride)
{
dilateImg[get_local_id(0)]+=dilateImg[get_local_id(0)+stride];
}
}
if(get_local_id(0)==0)
{
tempSum[currentGroupID]=dilateImg[0];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE); //other groups wait for the working groups...//not CLK_LOCAL_MEM_FENCE????
//step5:count the last fluore sum...tempSum[0]
if(get_global_id(1)==0)
{
for(uint j=1454/2;j>0;j/=2)
{
if(get_local_id(0)<j)
{
tempSum[get_local_id(0)]+=tempSum[get_local_id(0)+j];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
//if(get_local_id(0)==0)
//{
// printf("fluore-points-last: %d \n",tempSum[0]);
//}
//barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
main.cpp:
int main()
{
char filename[100];
cl_uint platformNum;
cl_int status;
status=clGetPlatformIDs(0,NULL,&platformNum);
if(status!=CL_SUCCESS){
printf("cannot get platforms number.\n");
return -1;
}
cl_platform_id* platforms;
platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);
status=clGetPlatformIDs(platformNum,platforms,NULL);
if(status!=CL_SUCCESS){
printf("cannot get platforms addresses.\n");
return -1;
}
cl_platform_id platformInUse=platforms[0];
cl_device_id device;
clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);
cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);
std::ifstream srcFile("/home/jumper/OpenCL_projects/FluoreTest4Channels/open_god.cl");
std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));
const char * src = srcProg.c_str();
size_t length = srcProg.length();
cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);
status=clBuildProgram(program,1,&device,NULL,NULL,NULL);
if (status != CL_SUCCESS)
{
shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);
oclLogBuildInfo(program, oclGetFirstDev(context));
oclLogPtx(program, oclGetFirstDev(context), "oclfluore.ptx");
return(EXIT_FAILURE);
}
//get the csv model from the disk.
const int rgbsize=256*256*256;
uchar* rgbarray = new uchar[rgbsize];
memset(rgbarray, 0, rgbsize * sizeof(uchar));
Ptr<ml::TrainData> ygdata=cv::ml::TrainData::loadFromCSV("/home/jumper/OpenCL_projects/FluoreTest4Channels/fluore_0728pm.csv",0,-2,0);
Mat csvimg2=ygdata->getSamples();
int csvrows=csvimg2.rows; //csv points
//cout<<"primer csvrows:"<<csvrows<<endl;
for (int j = 0; j < csvrows; j++)
{
float* pixeldata = csvimg2.ptr<float>(j);
float x = pixeldata[0];
float y = pixeldata[1];
float z = pixeldata[2];
int newindex = x + y * 256 + z * 256 * 256;
rgbarray[newindex] = 255;
}
TickMeter tm;
tm.start();
//get the src images from the disk.
int imgwidth,imgheight;
int ii=817;
sprintf(filename, "/home/jumper/OpenCL_projects/FluoreTest4Channels/silver-background0721/%d.bmp", ii);
Mat srcimg=imread(filename);
imgheight=srcimg.rows;
imgwidth=srcimg.cols;
int pixels=imgheight*imgwidth;
int srcdatasize=pixels*3*sizeof(uchar);
cl_mem rgbArray_buffer=clCreateBuffer(context,CL_MEM_READ_ONLY,rgbsize*sizeof(uchar),rgbarray,&status);
status = clEnqueueWriteBuffer(queue, rgbArray_buffer, CL_FALSE, 0, rgbsize * sizeof(uchar), rgbarray, 0, NULL, NULL);
cl_mem srcdata_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, srcdatasize, NULL,&status);
status = clEnqueueWriteBuffer(queue, srcdata_buffer, CL_FALSE, 0, srcdatasize, srcimg.data, 0, NULL, NULL);
size_t sumsize=1454*sizeof(int);
cl_mem sumArray_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sumsize, NULL,&status);
cl_mem erodeImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);
cl_mem dilateImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);
int zero = 0;
status = clEnqueueFillBuffer(queue, erodeImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);
status = clEnqueueFillBuffer(queue, dilateImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);
cl_kernel kernel_imgProc=clCreateKernel(program,"imgProcess",NULL);
status = clSetKernelArg(kernel_imgProc, 0, sizeof(cl_mem), (void*)&rgbArray_buffer);
status = clSetKernelArg(kernel_imgProc, 1, sizeof(cl_mem), (void*)&srcdata_buffer);
status = clSetKernelArg(kernel_imgProc, 2, sizeof(cl_int), &imgwidth);
status = clSetKernelArg(kernel_imgProc, 3, sizeof(cl_int), &imgheight);
status = clSetKernelArg(kernel_imgProc, 4, sizeof(cl_mem), (void*)&sumArray_buffer);
status = clSetKernelArg(kernel_imgProc, 5, sizeof(cl_mem), (void*)&erodeImg4test_buffer);
status = clSetKernelArg(kernel_imgProc, 6, sizeof(cl_mem), (void*)&dilateImg4test_buffer);
size_t localsize[2]={1024,1};
size_t globalsize[2]={1024,1024};
status =clEnqueueNDRangeKernel(queue, kernel_imgProc, 2, NULL, globalsize, localsize,0,NULL,NULL);
if (status != CL_SUCCESS)
{
cout<<"clEnqueueNDRangeKernel() failed..."<<status<<endl;
return(EXIT_FAILURE);
}
status=clFinish(queue);
if (status != CL_SUCCESS)
{
cout<<"clFinish() failed..."<<status<<endl;
return(EXIT_FAILURE);
}
int *sumMap=(int*)malloc(sumsize);
status=clEnqueueReadBuffer(queue,sumArray_buffer,CL_TRUE, 0,sumsize, sumMap, 0, NULL, NULL);
status=clFinish(queue);
if (status != CL_SUCCESS)
{
cout<<"clEnqueueReadBuffer() failed..."<<status<<endl;
return(EXIT_FAILURE);
}
cout<<"fluore result:"<<sumMap[0]<<endl;
uchar *hostErode=NULL;
hostErode=(uchar*)clEnqueueMapBuffer(queue,erodeImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);
clEnqueueUnmapMemObject(queue, erodeImg4test_buffer, (void*)hostErode, 0, NULL, NULL);
Mat dstErodeimg=Mat(imgheight,imgwidth,CV_8UC1,hostErode);
imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/myerode.jpg",dstErodeimg);
uchar *hostDilate=NULL;
hostDilate=(uchar*)clEnqueueMapBuffer(queue,dilateImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);
clEnqueueUnmapMemObject(queue, dilateImg4test_buffer, (void*)hostDilate, 0, NULL, NULL);
Mat dstDilateimg=Mat(imgheight,imgwidth,CV_8UC1,hostDilate);
imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/mydilate.jpg",dstDilateimg);
tm.stop();
cout<<"count="<<tm.getCounter()<<" ,process time="<<tm.getTimeMilli()<<" ms."<<endl;
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseProgram(program);
clReleaseKernel(kernel_imgProc);
clReleaseMemObject(srcdata_buffer);
clReleaseMemObject(rgbArray_buffer);
delete [] rgbarray;
free(sumMap);
clReleaseMemObject(erodeImg4test_buffer);
clReleaseMemObject(dilateImg4test_buffer);
return 0;
}
但结果是:
但原图和opencv开之和的图是这样子的:
我将我的中间结果腐蚀和膨胀的图片 放大看,是看到有膨胀效果的,证明没有编写错,根据腐蚀的结果图基础上进行膨胀。那么就是腐蚀那里错了?
结果一步步找,发现最开始根据那个csv文件得到每个像素点是0还是255生成原始待处理图片那里就错了:上周我是这样写的:
上周这样写没错,返回的mydstImg这个图片是正确的;但今天改成了下面这样,返回的就是错的了:
我屏蔽了后面的腐蚀膨胀和计数。但这里返回的图片是错的,与上周对比,为什么?
另外,大神跟我将了bit map/bit mask:
4,终于知道了问题所在,拿三行的那里应该*3通道的,我眼瞎:
修改后:将腐蚀和膨胀的图返回并与OpenCV的结果图对比一致,肉眼上一致 !!还没具体算点数:
__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows,
__global int *tempSum,__global uchar *testErodeImg,__global uchar *testDilateImg)
{
__local uchar mydstImg[1936*3];
__local uchar erodeImg[1936*2]; //={0}; //local variable can not be inited like this....
__local int dilateImg[1936]; //={0};
int currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
for(;currentGroupID<myimgrows-3;currentGroupID+=get_num_groups(0)*get_num_groups(1))
{
//step1: calculate the currentImg's result:mydstImg for every group,according to csvArray...
for(uint i=get_local_id(0);i<myimgcols*3;i+=get_local_size(0))
{
int tempb=currentImg[currentGroupID*myimgcols*3+i*3];
int tempg=currentImg[currentGroupID*myimgcols*3+i*3+1];
int tempr=currentImg[currentGroupID*myimgcols*3+i*3+2];
int rgbpixels=tempr+tempg*256+tempb*256*256;
uchar rgbelement=csvArray[rgbpixels];
if((int)rgbelement>0)
{
mydstImg[i]=255;
}
else
{
mydstImg[i]=0;
}
}
//step2:erode...
for(uint j=get_local_id(0);j<myimgcols*2;j+=get_local_size(0))
{
if((j==myimgcols-1) || (j==myimgcols*2-1))
{
erodeImg[j]=0;
break;
}
__private int erodetempValues[4]={0}; //private array can be inited like this??
for(int t=0;t<2;t++)
{
erodetempValues[t]=(int)mydstImg[j+t*myimgcols];
}
for(int t=2;t<4;t++)
{
erodetempValues[t]=(int)mydstImg[j+(t-2)*myimgcols+1];
}
if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255)
{
erodeImg[j]=255;
}
else
{
erodeImg[j]=0;
}
if(get_local_id(0)<myimgcols)
{
testErodeImg[currentGroupID*myimgcols+j]=mydstImg[j];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
//step3:dilate...
for(uint j=get_local_id(0);j<myimgcols;j+=get_local_size(0))
{
if(j==myimgcols-1)
{
dilateImg[j]=0;
break;
}
__private int dilatetempValues[4]={0};
for(int t=0;t<2;t++)
{
dilatetempValues[t]=(int)erodeImg[j+t*myimgcols];
}
for(int t=2;t<4;t++)
{
dilatetempValues[t]=(int)erodeImg[j+(t-2)*myimgcols+1];
}
if(dilatetempValues[0]==255 || dilatetempValues[1]==255 || dilatetempValues[2]==255 || dilatetempValues[3]==255)
{
dilateImg[j]=255;
}
else
{
dilateImg[j]=0;
}
testDilateImg[currentGroupID*myimgcols+j]=dilateImg[j];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
main.cpp:
int main()
{
char filename[100];
cl_uint platformNum;
cl_int status;
status=clGetPlatformIDs(0,NULL,&platformNum);
if(status!=CL_SUCCESS){
printf("cannot get platforms number.\n");
return -1;
}
cl_platform_id* platforms;
platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);
status=clGetPlatformIDs(platformNum,platforms,NULL);
if(status!=CL_SUCCESS){
printf("cannot get platforms addresses.\n");
return -1;
}
cl_platform_id platformInUse=platforms[0];
cl_device_id device;
clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);
cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);
std::ifstream srcFile("/home/jumper/OpenCL_projects/FluoreTest4Channels/open_test.cl");
std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));
const char * src = srcProg.c_str();
size_t length = srcProg.length();
cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);
status=clBuildProgram(program,1,&device,NULL,NULL,NULL);
if (status != CL_SUCCESS)
{
shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);
oclLogBuildInfo(program, oclGetFirstDev(context));
oclLogPtx(program, oclGetFirstDev(context), "oclfluore.ptx");
return(EXIT_FAILURE);
}
//get the csv model from the disk.
const int rgbsize=256*256*256;
uchar* rgbarray = new uchar[rgbsize];
memset(rgbarray, 0, rgbsize * sizeof(uchar));
Ptr<ml::TrainData> ygdata=cv::ml::TrainData::loadFromCSV("/home/jumper/OpenCL_projects/FluoreTest4Channels/fluore_0728pm.csv",0,-2,0);
Mat csvimg2=ygdata->getSamples();
int csvrows=csvimg2.rows; //csv points
//cout<<"primer csvrows:"<<csvrows<<endl;
for (int j = 0; j < csvrows; j++)
{
float* pixeldata = csvimg2.ptr<float>(j);
float x = pixeldata[0];
float y = pixeldata[1];
float z = pixeldata[2];
int newindex = x + y * 256 + z * 256 * 256;
rgbarray[newindex] = 255;
}
TickMeter tm;
tm.start();
//get the src images from the disk.
int imgwidth,imgheight;
int ii=817;
sprintf(filename, "/home/jumper/OpenCL_projects/FluoreTest4Channels/silver-background0721/%d.bmp", ii);
Mat srcimg=imread(filename);
imgheight=srcimg.rows;
imgwidth=srcimg.cols;
int pixels=imgheight*imgwidth;
int srcdatasize=pixels*3*sizeof(uchar);
cl_mem rgbArray_buffer=clCreateBuffer(context,CL_MEM_READ_ONLY,rgbsize*sizeof(uchar),rgbarray,&status);
status = clEnqueueWriteBuffer(queue, rgbArray_buffer, CL_FALSE, 0, rgbsize * sizeof(uchar), rgbarray, 0, NULL, NULL);
cl_mem srcdata_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, srcdatasize, NULL,&status);
status = clEnqueueWriteBuffer(queue, srcdata_buffer, CL_FALSE, 0, srcdatasize, srcimg.data, 0, NULL, NULL);
size_t sumsize=1454*sizeof(int);
cl_mem sumArray_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sumsize, NULL,&status);
cl_mem erodeImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);
cl_mem dilateImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);
int zero = 0;
status = clEnqueueFillBuffer(queue, erodeImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);
status = clEnqueueFillBuffer(queue, dilateImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);
cl_kernel kernel_imgProc=clCreateKernel(program,"imgProcess",NULL);
status = clSetKernelArg(kernel_imgProc, 0, sizeof(cl_mem), (void*)&rgbArray_buffer);
status = clSetKernelArg(kernel_imgProc, 1, sizeof(cl_mem), (void*)&srcdata_buffer);
status = clSetKernelArg(kernel_imgProc, 2, sizeof(cl_int), &imgwidth);
status = clSetKernelArg(kernel_imgProc, 3, sizeof(cl_int), &imgheight);
status = clSetKernelArg(kernel_imgProc, 4, sizeof(cl_mem), (void*)&sumArray_buffer);
status = clSetKernelArg(kernel_imgProc, 5, sizeof(cl_mem), (void*)&erodeImg4test_buffer);
status = clSetKernelArg(kernel_imgProc, 6, sizeof(cl_mem), (void*)&dilateImg4test_buffer);
size_t localsize[2]={1024,1};
size_t globalsize[2]={1024,1024};
status =clEnqueueNDRangeKernel(queue, kernel_imgProc, 2, NULL, globalsize, localsize,0,NULL,NULL);
if (status != CL_SUCCESS)
{
cout<<"clEnqueueNDRangeKernel() failed..."<<status<<endl;
return(EXIT_FAILURE);
}
status=clFinish(queue);
if (status != CL_SUCCESS)
{
cout<<"clFinish() failed..."<<status<<endl;
return(EXIT_FAILURE);
}
//int *sumMap=(int*)malloc(sumsize);
//status=clEnqueueReadBuffer(queue,sumArray_buffer,CL_TRUE, 0,sumsize, sumMap, 0, NULL, NULL);
//status=clFinish(queue);
//if (status != CL_SUCCESS)
// {
// cout<<"clEnqueueReadBuffer() failed..."<<status<<endl;
// return(EXIT_FAILURE);
// }
//cout<<"fluore result:"<<sumMap[0]<<endl;
uchar *hostErode=NULL;
hostErode=(uchar*)clEnqueueMapBuffer(queue,erodeImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);
clEnqueueUnmapMemObject(queue, erodeImg4test_buffer, (void*)hostErode, 0, NULL, NULL);
Mat dstErodeimg=Mat(imgheight,imgwidth,CV_8UC1,hostErode);
imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/myerode.jpg",dstErodeimg);
uchar *hostDilate=NULL;
hostDilate=(uchar*)clEnqueueMapBuffer(queue,dilateImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);
clEnqueueUnmapMemObject(queue, dilateImg4test_buffer, (void*)hostDilate, 0, NULL, NULL);
Mat dstDilateimg=Mat(imgheight,imgwidth,CV_8UC1,hostDilate);
imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/mydilate.jpg",dstDilateimg);
tm.stop();
cout<<"count="<<tm.getCounter()<<" ,process time="<<tm.getTimeMilli()<<" ms."<<endl;
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseProgram(program);
clReleaseKernel(kernel_imgProc);
clReleaseMemObject(srcdata_buffer);
clReleaseMemObject(rgbArray_buffer);
delete [] rgbarray;
//free(sumMap);
clReleaseMemObject(erodeImg4test_buffer);
clReleaseMemObject(dilateImg4test_buffer);
return 0;
}
这个返回的先腐蚀后膨胀后的图基本与opencv下一致,我待会儿算一下点数,就知道是否真的一致了!刚返回host端算点是正确的,但我想在求和算点在kernel上算,因为快些。但是在kernel端的结果竟然是错的:
//step4:count the every fluore sum of every row (in charge of every group)of the dilate image...
//1: correct...
int rowSum=0;
if(get_local_id(0)==0)
{
for(uint t=0;t<myimgcols;t++)
{
rowSum+=dilateImg[t];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//2: false...why?????
for(uint stride=myimgcols/2;stride>0;stride/=2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0)<stride)
{
dilateImg[get_local_id(0)]+=dilateImg[get_local_id(0)+stride];
}
}
if(get_local_id(0)==0)
{
printf("false: %d correct:%d\n",dilateImg[0],rowSum);
}
barrier(CLK_LOCAL_MEM_FENCE);
这两种写法,为什么第二种的结果和第一种会不一样呢?
位置是正确的,但数据怎么计算得不一样呢???????
终于知道原因了 ,因为myimgcols=1936,不是2的整数次幂,故规约法求和时漏掉了4个数:
把这4个加上就好了!!!
5,初步正确的OpenCL开操作版本:
__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows,
__global int *tempSum,__global uchar *testErodeImg,__global uchar *testDilateImg)
{
__local uchar mydstImg[1936*3];
__local uchar erodeImg[1936*2]; //={0}; //local variable can not be inited like this....
__local int dilateImg[1936]; //={0};
int currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
for(;currentGroupID<myimgrows-3;currentGroupID+=get_num_groups(0)*get_num_groups(1))
{
//step1: calculate the currentImg's result:mydstImg for every group,according to csvArray...
for(uint i=get_local_id(0);i<myimgcols*3;i+=get_local_size(0))
{
int tempb=currentImg[currentGroupID*myimgcols*3+i*3];
int tempg=currentImg[currentGroupID*myimgcols*3+i*3+1];
int tempr=currentImg[currentGroupID*myimgcols*3+i*3+2];
int rgbpixels=tempr+tempg*256+tempb*256*256;
uchar rgbelement=csvArray[rgbpixels];
if((int)rgbelement>0)
{
mydstImg[i]=255;
}
else
{
mydstImg[i]=0;
}
}
//step2:erode...
for(uint j=get_local_id(0);j<myimgcols*2;j+=get_local_size(0))
{
if((j==myimgcols-1) || (j==myimgcols*2-1))
{
erodeImg[j]=0;
break;
}
__private int erodetempValues[4]={0}; //private array can be inited like this??
for(int t=0;t<2;t++)
{
erodetempValues[t]=(int)mydstImg[j+t*myimgcols];
}
for(int t=2;t<4;t++)
{
erodetempValues[t]=(int)mydstImg[j+(t-2)*myimgcols+1];
}
if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255)
{
erodeImg[j]=255;
}
else
{
erodeImg[j]=0;
}
if(get_local_id(0)<myimgcols)
{
testErodeImg[currentGroupID*myimgcols+j]=mydstImg[j];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
//step3:dilate...
for(uint j=get_local_id(0);j<myimgcols;j+=get_local_size(0))
{
if(j==myimgcols-1)
{
dilateImg[j]=0;
break;
}
__private int dilatetempValues[4]={0};
for(int t=0;t<2;t++)
{
dilatetempValues[t]=(int)erodeImg[j+t*myimgcols];
}
for(int t=2;t<4;t++)
{
dilatetempValues[t]=(int)erodeImg[j+(t-2)*myimgcols+1];
}
if(dilatetempValues[0]==255 || dilatetempValues[1]==255 || dilatetempValues[2]==255 || dilatetempValues[3]==255)
{
dilateImg[j]=1;
}
else
{
dilateImg[j]=0;
}
testDilateImg[currentGroupID*myimgcols+j]=dilateImg[j];
}
barrier(CLK_LOCAL_MEM_FENCE);
//step4:count the every fluore sum of every row (in charge of every group)of the dilate image...
for(uint stride=myimgcols/2;stride>0;stride/=2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0)<stride)
{
dilateImg[get_local_id(0)]+=dilateImg[get_local_id(0)+stride];
}
}
if(get_local_id(0)==0)
{
tempSum[currentGroupID]=dilateImg[0]+dilateImg[2]+dilateImg[6]+dilateImg[14]+dilateImg[120];
//printf("false: %d correct:%d\n",tempSum[currentGroupID],rowSum);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
//barrier(CLK_LOCAL_MEM_FENCE); //CLK_LOCAL_MEM_FENCE can not help here!!!!!!result in the unstable...
//step5:count the last fluore sum...tempSum[0]
if(get_global_id(1)==0)
{
for(uint j=1454/2;j>0;j/=2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0)<j)
{
tempSum[get_local_id(0)]+=tempSum[get_local_id(0)+j];
}
}
if(get_local_id(0)==0)
{
printf("kernel-points-last: %d\n",tempSum[0]+tempSum[4]+tempSum[10]+tempSum[44]+tempSum[180]+tempSum[362]+tempSum[726]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
main.cpp:
int main()
{
char filename[100];
cl_uint platformNum;
cl_int status;
status=clGetPlatformIDs(0,NULL,&platformNum);
if(status!=CL_SUCCESS){
printf("cannot get platforms number.\n");
return -1;
}
cl_platform_id* platforms;
platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);
status=clGetPlatformIDs(platformNum,platforms,NULL);
if(status!=CL_SUCCESS){
printf("cannot get platforms addresses.\n");
return -1;
}
cl_platform_id platformInUse=platforms[0];
cl_device_id device;
clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);
cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);
std::ifstream srcFile("/home/jumper/OpenCL_projects/FluoreTest4Channels/blog.cl");
std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));
const char * src = srcProg.c_str();
size_t length = srcProg.length();
cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);
status=clBuildProgram(program,1,&device,NULL,NULL,NULL);
if (status != CL_SUCCESS)
{
shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);
oclLogBuildInfo(program, oclGetFirstDev(context));
oclLogPtx(program, oclGetFirstDev(context), "oclfluore.ptx");
return(EXIT_FAILURE);
}
//get the csv model from the disk.
const int rgbsize=256*256*256;
uchar* rgbarray = new uchar[rgbsize];
memset(rgbarray, 0, rgbsize * sizeof(uchar));
Ptr<ml::TrainData> ygdata=cv::ml::TrainData::loadFromCSV("/home/jumper/OpenCL_projects/FluoreTest4Channels/fluore_0728pm.csv",0,-2,0);
Mat csvimg2=ygdata->getSamples();
int csvrows=csvimg2.rows; //csv points
//cout<<"primer csvrows:"<<csvrows<<endl;
for (int j = 0; j < csvrows; j++)
{
float* pixeldata = csvimg2.ptr<float>(j);
float x = pixeldata[0];
float y = pixeldata[1];
float z = pixeldata[2];
int newindex = x + y * 256 + z * 256 * 256;
rgbarray[newindex] = 255;
}
TickMeter tm;
tm.start();
//get the src images from the disk.
int imgwidth,imgheight;
int ii=817;
sprintf(filename, "/home/jumper/OpenCL_projects/FluoreTest4Channels/silver-background0721/%d.bmp", ii);
Mat srcimg=imread(filename);
imgheight=srcimg.rows;
imgwidth=srcimg.cols;
int pixels=imgheight*imgwidth;
int srcdatasize=pixels*3*sizeof(uchar);
cl_mem rgbArray_buffer=clCreateBuffer(context,CL_MEM_READ_ONLY,rgbsize*sizeof(uchar),rgbarray,&status);
status = clEnqueueWriteBuffer(queue, rgbArray_buffer, CL_FALSE, 0, rgbsize * sizeof(uchar), rgbarray, 0, NULL, NULL);
cl_mem srcdata_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, srcdatasize, NULL,&status);
status = clEnqueueWriteBuffer(queue, srcdata_buffer, CL_FALSE, 0, srcdatasize, srcimg.data, 0, NULL, NULL);
size_t sumsize=1454*sizeof(int);
cl_mem sumArray_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sumsize, NULL,&status);
cl_mem erodeImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);
cl_mem dilateImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);
int zero = 0;
status = clEnqueueFillBuffer(queue, erodeImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);
status = clEnqueueFillBuffer(queue, dilateImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);
cl_kernel kernel_imgProc=clCreateKernel(program,"imgProcess",NULL);
status = clSetKernelArg(kernel_imgProc, 0, sizeof(cl_mem), (void*)&rgbArray_buffer);
status = clSetKernelArg(kernel_imgProc, 1, sizeof(cl_mem), (void*)&srcdata_buffer);
status = clSetKernelArg(kernel_imgProc, 2, sizeof(cl_int), &imgwidth);
status = clSetKernelArg(kernel_imgProc, 3, sizeof(cl_int), &imgheight);
status = clSetKernelArg(kernel_imgProc, 4, sizeof(cl_mem), (void*)&sumArray_buffer);
status = clSetKernelArg(kernel_imgProc, 5, sizeof(cl_mem), (void*)&erodeImg4test_buffer);
status = clSetKernelArg(kernel_imgProc, 6, sizeof(cl_mem), (void*)&dilateImg4test_buffer);
size_t localsize[2]={1024,1};
size_t globalsize[2]={1024,1024};
status =clEnqueueNDRangeKernel(queue, kernel_imgProc, 2, NULL, globalsize, localsize,0,NULL,NULL);
if (status != CL_SUCCESS)
{
cout<<"clEnqueueNDRangeKernel() failed..."<<status<<endl;
return(EXIT_FAILURE);
}
status=clFinish(queue);
if (status != CL_SUCCESS)
{
cout<<"clFinish() failed..."<<status<<endl;
return(EXIT_FAILURE);
}
//int *sumMap=(int*)malloc(sumsize);
//status=clEnqueueReadBuffer(queue,sumArray_buffer,CL_TRUE, 0,sumsize, sumMap, 0, NULL, NULL);
//status=clFinish(queue);
//if (status != CL_SUCCESS)
// {
// cout<<"clEnqueueReadBuffer() failed..."<<status<<endl;
// return(EXIT_FAILURE);
// }
//cout<<"fluore result:"<<sumMap[0]<<endl;
uchar *hostErode=NULL;
hostErode=(uchar*)clEnqueueMapBuffer(queue,erodeImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);
clEnqueueUnmapMemObject(queue, erodeImg4test_buffer, (void*)hostErode, 0, NULL, NULL);
Mat dstErodeimg=Mat(imgheight,imgwidth,CV_8UC1,hostErode);
imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/myerode.jpg",dstErodeimg);
uchar *hostDilate=NULL;
hostDilate=(uchar*)clEnqueueMapBuffer(queue,dilateImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);
clEnqueueUnmapMemObject(queue, dilateImg4test_buffer, (void*)hostDilate, 0, NULL, NULL);
Mat dstDilateimg=Mat(imgheight,imgwidth,CV_8UC1,hostDilate);
imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/mydilate.jpg",dstDilateimg);
tm.stop();
cout<<"count="<<tm.getCounter()<<" ,process time="<<tm.getTimeMilli()<<" ms."<<endl;
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseProgram(program);
clReleaseKernel(kernel_imgProc);
clReleaseMemObject(srcdata_buffer);
clReleaseMemObject(rgbArray_buffer);
delete [] rgbarray;
//free(sumMap);
clReleaseMemObject(erodeImg4test_buffer);
clReleaseMemObject(dilateImg4test_buffer);
return 0;
}
结果是4139个点!与opencv的开结果一致!
但有个偶然性:什么都没改 第一次和后面很多次的结果不一样。
可能是同步的问题
因为有个地方我没处理好
我想等所有groups运行完 再将这些groups的结果加起来
但这没办法等待的!!!!!!所以造成了第一次的结果是乱七八糟的数,后面再次运行都正确!
6,稳定的OpenCL版本
我将无法同步的最后的计数部分返回到host端计数,这样就相当于一种同步了,测试了每次的结果都正确。
__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows,
__global int *tempSum,__global uchar *testErodeImg,__global uchar *testDilateImg)
{
__local uchar mydstImg[1936*3];
__local uchar erodeImg[1936*2]; //={0}; //local variable can not be inited like this....
__local int dilateImg[1936]; //={0};
int currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
for(;currentGroupID<myimgrows-2;currentGroupID+=get_num_groups(0)*get_num_groups(1))
{
//step1: calculate the currentImg's result:mydstImg for every group,according to csvArray...
for(uint i=get_local_id(0);i<myimgcols*3;i+=get_local_size(0))
{
int tempb=currentImg[currentGroupID*myimgcols*3+i*3];
int tempg=currentImg[currentGroupID*myimgcols*3+i*3+1];
int tempr=currentImg[currentGroupID*myimgcols*3+i*3+2];
int rgbpixels=tempr+tempg*256+tempb*256*256;
uchar rgbelement=csvArray[rgbpixels];
if((int)rgbelement>0)
{
mydstImg[i]=255;
}
else
{
mydstImg[i]=0;
}
}
//step2:erode...
for(uint j=get_local_id(0);j<myimgcols*2;j+=get_local_size(0))
{
if((j==myimgcols-1) || (j==myimgcols*2-1))
{
erodeImg[j]=0;
break;
}
__private int erodetempValues[4]={0}; //private array can be inited like this??
for(int t=0;t<2;t++)
{
erodetempValues[t]=(int)mydstImg[j+t*myimgcols];
}
for(int t=2;t<4;t++)
{
erodetempValues[t]=(int)mydstImg[j+(t-2)*myimgcols+1];
}
if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255)
{
erodeImg[j]=255;
}
else
{
erodeImg[j]=0;
}
if(get_local_id(0)<myimgcols)
{
testErodeImg[currentGroupID*myimgcols+j]=mydstImg[j];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
//step3:dilate...
for(uint j=get_local_id(0);j<myimgcols;j+=get_local_size(0))
{
if(j==myimgcols-1)
{
dilateImg[j]=0;
break;
}
__private int dilatetempValues[4]={0};
for(int t=0;t<2;t++)
{
dilatetempValues[t]=(int)erodeImg[j+t*myimgcols];
}
for(int t=2;t<4;t++)
{
dilatetempValues[t]=(int)erodeImg[j+(t-2)*myimgcols+1];
}
if(dilatetempValues[0]==255 || dilatetempValues[1]==255 || dilatetempValues[2]==255 || dilatetempValues[3]==255)
{
dilateImg[j]=1;
}
else
{
dilateImg[j]=0;
}
testDilateImg[currentGroupID*myimgcols+j]=dilateImg[j];
}
barrier(CLK_LOCAL_MEM_FENCE);
//step4:count the every fluore sum of every row (in charge of every group)of the dilate image...
for(uint stride=myimgcols/2;stride>0;stride/=2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0)<stride)
{
dilateImg[get_local_id(0)]+=dilateImg[get_local_id(0)+stride];
}
}
if(get_local_id(0)==0)
{
tempSum[currentGroupID]=dilateImg[0]+dilateImg[2]+dilateImg[6]+dilateImg[14]+dilateImg[120];
//printf("correct: %d ID:%d\n",tempSum[currentGroupID],currentGroupID);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
//Above all is correct...
}
main.cpp:
int main()
{
char filename[100];
cl_uint platformNum;
cl_int status;
status=clGetPlatformIDs(0,NULL,&platformNum);
if(status!=CL_SUCCESS){
printf("cannot get platforms number.\n");
return -1;
}
cl_platform_id* platforms;
platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);
status=clGetPlatformIDs(platformNum,platforms,NULL);
if(status!=CL_SUCCESS){
printf("cannot get platforms addresses.\n");
return -1;
}
cl_platform_id platformInUse=platforms[0];
cl_device_id device;
clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);
cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);
std::ifstream srcFile("/home/jumper/OpenCL_projects/FluoreTest4Channels/blog.cl");
std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));
const char * src = srcProg.c_str();
size_t length = srcProg.length();
cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);
status=clBuildProgram(program,1,&device,NULL,NULL,NULL);
if (status != CL_SUCCESS)
{
shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);
oclLogBuildInfo(program, oclGetFirstDev(context));
oclLogPtx(program, oclGetFirstDev(context), "oclfluore.ptx");
return(EXIT_FAILURE);
}
//get the csv model from the disk.
const int rgbsize=256*256*256;
uchar* rgbarray = new uchar[rgbsize];
memset(rgbarray, 0, rgbsize * sizeof(uchar));
Ptr<ml::TrainData> ygdata=cv::ml::TrainData::loadFromCSV("/home/jumper/OpenCL_projects/FluoreTest4Channels/fluore_0728pm.csv",0,-2,0);
Mat csvimg2=ygdata->getSamples();
int csvrows=csvimg2.rows; //csv points
//cout<<"primer csvrows:"<<csvrows<<endl;
for (int j = 0; j < csvrows; j++)
{
float* pixeldata = csvimg2.ptr<float>(j);
float x = pixeldata[0];
float y = pixeldata[1];
float z = pixeldata[2];
int newindex = x + y * 256 + z * 256 * 256;
rgbarray[newindex] = 255;
}
TickMeter tm;
tm.start();
//get the src images from the disk.
int imgwidth,imgheight;
int ii=817;
sprintf(filename, "/home/jumper/OpenCL_projects/FluoreTest4Channels/silver-background0721/%d.bmp", ii);
Mat srcimg=imread(filename);
imgheight=srcimg.rows;
imgwidth=srcimg.cols;
int pixels=imgheight*imgwidth;
int srcdatasize=pixels*3*sizeof(uchar);
cl_mem rgbArray_buffer=clCreateBuffer(context,CL_MEM_READ_ONLY,rgbsize*sizeof(uchar),rgbarray,&status);
status = clEnqueueWriteBuffer(queue, rgbArray_buffer, CL_FALSE, 0, rgbsize * sizeof(uchar), rgbarray, 0, NULL, NULL);
cl_mem srcdata_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, srcdatasize, NULL,&status);
status = clEnqueueWriteBuffer(queue, srcdata_buffer, CL_FALSE, 0, srcdatasize, srcimg.data, 0, NULL, NULL);
size_t sumsize=1454*sizeof(int);
cl_mem sumArray_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY| CL_MEM_ALLOC_HOST_PTR, sumsize, NULL,&status);
cl_mem erodeImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);
cl_mem dilateImg4test_buffer=clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, pixels*sizeof(uchar), NULL,&status);
int zero = 0;
status = clEnqueueFillBuffer(queue, erodeImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);
status = clEnqueueFillBuffer(queue, dilateImg4test_buffer, &zero,sizeof(int), 0, pixels*sizeof(uchar), 0, NULL,NULL);
cl_kernel kernel_imgProc=clCreateKernel(program,"imgProcess",NULL);
status = clSetKernelArg(kernel_imgProc, 0, sizeof(cl_mem), (void*)&rgbArray_buffer);
status = clSetKernelArg(kernel_imgProc, 1, sizeof(cl_mem), (void*)&srcdata_buffer);
status = clSetKernelArg(kernel_imgProc, 2, sizeof(cl_int), &imgwidth);
status = clSetKernelArg(kernel_imgProc, 3, sizeof(cl_int), &imgheight);
status = clSetKernelArg(kernel_imgProc, 4, sizeof(cl_mem), (void*)&sumArray_buffer);
status = clSetKernelArg(kernel_imgProc, 5, sizeof(cl_mem), (void*)&erodeImg4test_buffer);
status = clSetKernelArg(kernel_imgProc, 6, sizeof(cl_mem), (void*)&dilateImg4test_buffer);
size_t localsize[2]={1024,1};
size_t globalsize[2]={1024,1024};
status =clEnqueueNDRangeKernel(queue, kernel_imgProc, 2, NULL, globalsize, localsize,0,NULL,NULL);
if (status != CL_SUCCESS)
{
cout<<"clEnqueueNDRangeKernel() failed..."<<status<<endl;
return(EXIT_FAILURE);
}
status=clFinish(queue);
if (status != CL_SUCCESS)
{
cout<<"clFinish() failed..."<<status<<endl;
return(EXIT_FAILURE);
}
int *sumMap=NULL;
sumMap=(int*)clEnqueueMapBuffer(queue,sumArray_buffer,CL_TRUE, CL_MAP_READ, 0, sumsize, 0, NULL, NULL, &status);
clEnqueueUnmapMemObject(queue, sumArray_buffer, (void*)sumMap, 0, NULL, NULL);
int finalSum=0;
for(int j=0;j<1454;j++)
{
finalSum+=sumMap[j];
//cout<<"ID: "<<j<<"--Value: "<<sumMap[j]<<endl;
}
cout<<"host fluore result:"<<finalSum<<endl;
uchar *hostErode=NULL;
hostErode=(uchar*)clEnqueueMapBuffer(queue,erodeImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);
clEnqueueUnmapMemObject(queue, erodeImg4test_buffer, (void*)hostErode, 0, NULL, NULL);
Mat dstErodeimg=Mat(imgheight,imgwidth,CV_8UC1,hostErode);
imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/myerode.jpg",dstErodeimg);
uchar *hostDilate=NULL;
hostDilate=(uchar*)clEnqueueMapBuffer(queue,dilateImg4test_buffer,CL_TRUE, CL_MAP_READ, 0, pixels*sizeof(uchar), 0, NULL, NULL, &status);
clEnqueueUnmapMemObject(queue, dilateImg4test_buffer, (void*)hostDilate, 0, NULL, NULL);
Mat dstDilateimg=Mat(imgheight,imgwidth,CV_8UC1,hostDilate);
imwrite("/home/jumper/OpenCL_projects/FluoreTest4Channels/mydilate.jpg",dstDilateimg);
tm.stop();
cout<<"count="<<tm.getCounter()<<" ,process time="<<tm.getTimeMilli()<<" ms."<<endl;
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseProgram(program);
clReleaseKernel(kernel_imgProc);
clReleaseMemObject(srcdata_buffer);
clReleaseMemObject(rgbArray_buffer);
delete [] rgbarray;
//free(sumMap);
clReleaseMemObject(erodeImg4test_buffer);
clReleaseMemObject(dilateImg4test_buffer);
return 0;
}
当然这里方便计数我在kernel中将膨胀后的结果255改成了1。反正不影响计数,如果要看膨胀后的图片,用255就好了。
7,最终的OpenCL-Open()版本
__kernel void imgProcess(__global uchar *csvArray,__global uchar *currentImg,int myimgcols,int myimgrows, __global int *tempSum)
{
__local uchar mydstImg[1936*3];
__local uchar erodeImg[1936*2]; //={0}; //local variable can not be inited like this....
__local int dilateImg[1936]; //={0};
int currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
for(;currentGroupID<myimgrows-2;currentGroupID+=get_num_groups(0)*get_num_groups(1))
{
//step1: calculate the currentImg's result:mydstImg for every group,according to csvArray...
for(uint i=get_local_id(0);i<myimgcols*3;i+=get_local_size(0))
{
int tempb=currentImg[currentGroupID*myimgcols*3+i*3];
int tempg=currentImg[currentGroupID*myimgcols*3+i*3+1];
int tempr=currentImg[currentGroupID*myimgcols*3+i*3+2];
int rgbpixels=tempr+tempg*256+tempb*256*256;
uchar rgbelement=csvArray[rgbpixels];
if((int)rgbelement>0)
{
mydstImg[i]=255;
}
else
{
mydstImg[i]=0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//step2:erode...
for(uint j=get_local_id(0);j<myimgcols*2;j+=get_local_size(0))
{
if((j==myimgcols-1) || (j==myimgcols*2-1))
{
erodeImg[j]=0;
break;
}
__private int erodetempValues[4]={0}; //private array can be inited like this??
for(int t=0;t<2;t++)
{
erodetempValues[t]=(int)mydstImg[j+t*myimgcols];
}
for(int t=2;t<4;t++)
{
erodetempValues[t]=(int)mydstImg[j+(t-2)*myimgcols+1];
}
if(erodetempValues[0]==255 && erodetempValues[1]==255 && erodetempValues[2]==255 && erodetempValues[3]==255)
{
erodeImg[j]=255;
}
else
{
erodeImg[j]=0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//step3:dilate...
for(uint j=get_local_id(0);j<myimgcols;j+=get_local_size(0))
{
if(j==myimgcols-1)
{
dilateImg[j]=0;
break;
}
__private int dilatetempValues[4]={0};
for(int t=0;t<2;t++)
{
dilatetempValues[t]=(int)erodeImg[j+t*myimgcols];
}
for(int t=2;t<4;t++)
{
dilatetempValues[t]=(int)erodeImg[j+(t-2)*myimgcols+1];
}
if(dilatetempValues[0]==255 || dilatetempValues[1]==255 || dilatetempValues[2]==255 || dilatetempValues[3]==255)
{
dilateImg[j]=1;
}
else
{
dilateImg[j]=0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//step4:count the every fluore sum of every row (in charge of every group)of the dilate image...
for(uint stride=myimgcols/2;stride>0;stride/=2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0)<stride)
{
dilateImg[get_local_id(0)]+=dilateImg[get_local_id(0)+stride];
}
}
if(get_local_id(0)==0)
{
tempSum[currentGroupID]=dilateImg[0]+dilateImg[2]+dilateImg[6]+dilateImg[14]+dilateImg[120];
//printf("correct: %d ID:%d\n",tempSum[currentGroupID],currentGroupID);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
//Above all is correct...
}
main.cpp部分:
int main()
{
char filename[100];
cl_uint platformNum;
cl_int status;
status=clGetPlatformIDs(0,NULL,&platformNum);
if(status!=CL_SUCCESS){
printf("cannot get platforms number.\n");
return -1;
}
cl_platform_id* platforms;
platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);
status=clGetPlatformIDs(platformNum,platforms,NULL);
if(status!=CL_SUCCESS){
printf("cannot get platforms addresses.\n");
return -1;
}
cl_platform_id platformInUse=platforms[0];
cl_device_id device;
clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);
cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);
std::ifstream srcFile("/home/jumper/OpenCL_projects/FluoreTest4Channels/blog.cl");
std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));
const char * src = srcProg.c_str();
size_t length = srcProg.length();
cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);
status=clBuildProgram(program,1,&device,NULL,NULL,NULL);
if (status != CL_SUCCESS)
{
shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);
oclLogBuildInfo(program, oclGetFirstDev(context));
oclLogPtx(program, oclGetFirstDev(context), "oclfluore.ptx");
return(EXIT_FAILURE);
}
//get the csv model from the disk.
const int rgbsize=256*256*256;
uchar* rgbarray = new uchar[rgbsize];
memset(rgbarray, 0, rgbsize * sizeof(uchar));
Ptr<ml::TrainData> ygdata=cv::ml::TrainData::loadFromCSV("/home/jumper/OpenCL_projects/FluoreTest4Channels/fluore_0728pm.csv",0,-2,0);
Mat csvimg2=ygdata->getSamples();
int csvrows=csvimg2.rows; //csv points
//cout<<"primer csvrows:"<<csvrows<<endl;
for (int j = 0; j < csvrows; j++)
{
float* pixeldata = csvimg2.ptr<float>(j);
float x = pixeldata[0];
float y = pixeldata[1];
float z = pixeldata[2];
int newindex = x + y * 256 + z * 256 * 256;
rgbarray[newindex] = 255;
}
TickMeter tm;
tm.start();
//get the src images from the disk.
int imgwidth,imgheight;
int ii=817;
sprintf(filename, "/home/jumper/OpenCL_projects/FluoreTest4Channels/silver-background0721/%d.bmp", ii);
Mat srcimg=imread(filename);
imgheight=srcimg.rows;
imgwidth=srcimg.cols;
int pixels=imgheight*imgwidth;
int srcdatasize=pixels*3*sizeof(uchar);
cl_mem rgbArray_buffer=clCreateBuffer(context,CL_MEM_READ_ONLY,rgbsize*sizeof(uchar),rgbarray,&status);
status = clEnqueueWriteBuffer(queue, rgbArray_buffer, CL_FALSE, 0, rgbsize * sizeof(uchar), rgbarray, 0, NULL, NULL);
cl_mem srcdata_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, srcdatasize, NULL,&status);
status = clEnqueueWriteBuffer(queue, srcdata_buffer, CL_FALSE, 0, srcdatasize, srcimg.data, 0, NULL, NULL);
size_t sumsize=1454*sizeof(int);
cl_mem sumArray_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY| CL_MEM_ALLOC_HOST_PTR, sumsize, NULL,&status);
cl_kernel kernel_imgProc=clCreateKernel(program,"imgProcess",NULL);
status = clSetKernelArg(kernel_imgProc, 0, sizeof(cl_mem), (void*)&rgbArray_buffer);
status = clSetKernelArg(kernel_imgProc, 1, sizeof(cl_mem), (void*)&srcdata_buffer);
status = clSetKernelArg(kernel_imgProc, 2, sizeof(cl_int), &imgwidth);
status = clSetKernelArg(kernel_imgProc, 3, sizeof(cl_int), &imgheight);
status = clSetKernelArg(kernel_imgProc, 4, sizeof(cl_mem), (void*)&sumArray_buffer);
size_t localsize[2]={1024,1};
size_t globalsize[2]={1024,1024};
status =clEnqueueNDRangeKernel(queue, kernel_imgProc, 2, NULL, globalsize, localsize,0,NULL,NULL);
if (status != CL_SUCCESS)
{
cout<<"clEnqueueNDRangeKernel() failed..."<<status<<endl;
return(EXIT_FAILURE);
}
status=clFinish(queue);
if (status != CL_SUCCESS)
{
cout<<"clFinish() failed..."<<status<<endl;
return(EXIT_FAILURE);
}
int *sumMap=NULL;
sumMap=(int*)clEnqueueMapBuffer(queue,sumArray_buffer,CL_TRUE, CL_MAP_READ, 0, sumsize, 0, NULL, NULL, &status);
clEnqueueUnmapMemObject(queue, sumArray_buffer, (void*)sumMap, 0, NULL, NULL);
int finalSum=0;
for(int j=0;j<1454;j++)
{
finalSum+=sumMap[j];
//cout<<"ID: "<<j<<"--Value: "<<sumMap[j]<<endl;
}
cout<<"host fluore result:"<<finalSum<<endl;
tm.stop();
cout<<"count="<<tm.getCounter()<<" ,process time="<<tm.getTimeMilli()<<" ms."<<endl;
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseProgram(program);
clReleaseKernel(kernel_imgProc);
clReleaseMemObject(srcdata_buffer);
clReleaseMemObject(rgbArray_buffer);
delete [] rgbarray;
//free(sumMap);
return 0;
}
测试结果对比:
8,kernel与二进制
按照 http://www.cnblogs.com/mikewolf2002/archive/2012/09/06/2674125.html 这个大神的,将kernel转成二进制.bin文件。下载http://www.cnblogs.com/mikewolf2002/archive/2012/09/06/2674125.html 他的gclFile.h和gclFile.cpp文件即可。
char **binaries = (char **)malloc( sizeof(char *) * 1 ); //只有一个设备
size_t *binarySizes = (size_t*)malloc( sizeof(size_t) * 1 );
status = clGetProgramInfo(program,CL_PROGRAM_BINARY_SIZES,sizeof(size_t) * 1,binarySizes, NULL);
binaries[0] = (char *)malloc( sizeof(char) * binarySizes[0]);
status = clGetProgramInfo(program,CL_PROGRAM_BINARIES,sizeof(char *) * 1, binaries,NULL);
kernelFile.writeBinaryToFile("vecadd.bin", binaries[0],binarySizes[0]);
然后下次将.bin加载进OpenCL工程:
gclFile kernelFile;
if(!kernelFile.readBinaryFromFile("fluore_open.bin"))
{
printf("can not load the kernel file.\n");
}
const char *binary=kernelFile.source().c_str();
size_t binarySize=kernelFile.source().size();
cl_program program=clCreateProgramWithBinary(context,1,&device,(const size_t*)&binarySize,(const unsigned char**)&binary,NULL,NULL);
其它都没什么区别。已经测试过 正确。