读取大型稀疏矩阵&KNN算法的OpenCL加速版本

最新推荐文章于 2023-03-14 15:26:10 发布

元气少女缘结神

最新推荐文章于 2023-03-14 15:26:10 发布

阅读量1.4k

点赞数 3

分类专栏： Improve Performance 文章标签： OpenCL

本文链接：https://blog.csdn.net/wd1603926823/article/details/73199137

版权

Improve Performance 专栏收录该内容

39 篇文章 9 订阅

订阅专栏

https://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/

一、读取稀疏矩阵。先以一个简单的稀疏矩阵为例，读进这个稀疏矩阵

即这个稀疏矩阵本来其实有4行12列。前面第1列0不用管，是标签，是不要读进矩阵的。

#include <stdio.h>
//#include <iostream>
#include <fstream>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>
using namespace std;
using namespace cv;
void main(){
	int width = 12, height = 4;
	float *matrix = new float[width*height];
	memset(matrix, 0,width*height * sizeof(float));
	

	ifstream infile("simple_data.txt");
	vector<string> vec_str;
	string line;
	size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos,lineLength;
	//char *end;
	int lineID=0;
	while (getline(infile, line))
	{
		if (line.size() == 0)
		{
			continue;
		}
		for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
		{
			pos_maohao = line.find_first_of(":");
			pos_kongge = line.find_first_of(" ");
			if (pos_maohao == string::npos || pos_kongge == string::npos)
			{
				continue;
			}
			string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
			string tempData = line.substr(pos_maohao+1, 8);
			line = line.substr(pos_maohao + 9);

			//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
			int ind = atoi(tempIndex.c_str());
			float data = atof(tempData.c_str());
			cout << ind << ":"<< data << " ";
			matrix[lineID*width + ind] = data;
	    }
		lineID += 1;
		cout << endl;
	}
	
	cout << endl << endl;
	cout << "the matrix is..." << endl;
	for (int i = 0; i < height;i++)
	{
		for (int j = 0; j < width;j++)
		{
			cout << matrix[i*width + j] << " ";
		}
		cout << endl;
	}
	
	infile.close();
	delete[] matrix;
}

即可。

一、写好的第一个KNN版本(先只拿前k个距离而不是前k个最小距离因为只是先测试一下是否写错)：

在算一个200(高height)X931(宽width)的矩阵matrix，对于每一行去算它与另199行的距离，(它与自己这一行的距离为0 不用算)，对于每一行可以得到一个200维的向量，取向量的前k个结果作为最终的距离矩阵instanceMatrix的结果(应该是200Xk的矩阵)，这k个结果的位置矩阵positionMatrix(附带的，其实这个不用算) localsize是[1][512] globalsize是[100][1024]即有200个工作组我是让每个工作组去算一行与另199行的距离。结果距离矩阵instanceMatrix中只有第1列是正确的另k-1列是无意义的值位置矩阵positionMatrix中k-1列无意义的值。

cl部分：

__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localSizeInstance,__local float *localInstanceNoSort)
{
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	
	for(uint anotherID=0;anotherID<height;anotherID++)
	{
		if(anotherID==currentGroupID)
		{
			localInstanceNoSort[anotherID]=0;
			continue;
		}
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
         barrier(CLK_LOCAL_MEM_FENCE);
         if (get_local_id(0) < stride) 
        	{
            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
        	}
        }
        
     	if (get_local_id(0) == 0)
       	{ 
     			localInstanceNoSort[anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	// no sort
	if(get_local_id(0)<k)
		{
			instanceMatrix[currentGroupID*k+get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			positionMatrix[currentGroupID*k+get_local_id(0)]=get_local_id(0);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
	
}

main函数部分：

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;

const int width=931;
const int height=200;
const int k=5;

void readTxt(const char *txtname,float *matrix)
{
		//int width = 931, height = 200;
		//float *matrix = new float[width*height];
		//memset(matrix, 0,width*height * sizeof(float));

		//ifstream infile("fea_sparse.txt");
		ifstream infile(txtname);
		vector<string> vec_str;
		string line;
		size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos,lineLength;
		//char *end;
		int lineID=0;
		while (getline(infile, line))
		{
			if (line.size() == 0)
			{
				continue;
			}
			for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
			{
				pos_maohao = line.find_first_of(":");
				pos_kongge = line.find_first_of(" ");
				if (pos_maohao == string::npos || pos_kongge == string::npos)
				{
					continue;
				}
				string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
				string tempData = line.substr(pos_maohao+1, 8);
				line = line.substr(pos_maohao + 9);

				//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
				int ind = atoi(tempIndex.c_str());
				float data = atof(tempData.c_str());
				//cout << ind << ":"<< data << " ";
				matrix[lineID*width + ind] = data;
		    }
			lineID += 1;
			//cout << endl;
		}

		infile.close();
		//delete[] matrix;
}

int main( int argc, const char** argv)
{
    shrQAStart(argc, (char **)argv);
    // set logfile name and start logs
    shrSetLogFileName ("oclKnn4Robin.txt");
    shrLog("%s Starting...\n\n", argv[0]);
    //Get the NVIDIA platform
    cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {
		 // write out standard error, Build Log and PTX, then return error
		 shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
		 oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
		 oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
		 return(EXIT_FAILURE);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knnForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix = new float[width*height];
	 const size_t matrixSize=width*height*sizeof(float);
	 memset(matrix, 0,matrixSize);
	 const char *txtfile="fea_sparse.txt";
	 readTxt(txtfile,matrix);
	 int *resultPosMatrix=new int[height*k];
	 size_t resultPosSize=height*k*sizeof(int);
	 memset(resultPosMatrix,0,resultPosSize);
	 float *resultInstanceMatrix=new float[height*k];
	 size_t resultInstanceSize=height*k*sizeof(float);
	 memset(resultInstanceMatrix,0,resultInstanceSize);

	 //send arguments to kernel file  | CL_MEM_COPY_HOST_PTR
	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height);
	 ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);
	 //ciErrNum |= clSetKernelArg(ckKernel, 6, k * sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, 512*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 7, height*sizeof(float), 0);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t localsize[2]={1,512};
	 size_t globalsize[2]={100,1024};
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //save as txt to check.
	 ofstream posOutfile("position.txt",ios::out);
	 ofstream instanceOutfile("instance.txt",ios::out);
	 for(size_t r=0;r<height;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 delete [] matrix;
	 delete [] resultPosMatrix;
	 delete [] resultInstanceMatrix;
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);
	 return 0;
}

因为检查了cl部分自我感觉没错，但距离矩阵中只有第1列是正确的，后面4列均是无意义的值，所以请教大神。大神说是main出错了，kernel的启动方式那里size_t localsize[2]={1,512}; 应该倒过来{512，1},当然相应的globalsize也要倒过来，不然每一维度上无法整除。这样不改动kernel结果就可以正确。

if(get_local_id(0)<k)
		{
			instanceMatrix[currentGroupID*k+get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			positionMatrix[currentGroupID*k+get_local_id(0)]=get_local_id(0);
		}

或者如果不像大神那样改动的话，就把这里的所有0换成1 换个方向即可。结果一致。

ps：原来这个localsize[2]={M,N}实际是NXM 记住了。

ps：CL_DEVICE_MAX_WORK_ITEM_SIZES：1024X1024X64是对localsize每个维度的限制，即[0]<1024，[1]<1024，[2]<64， globalsize基本上是无限的，不用担心。 我之前以为那个就是globalsize的限制。太傻。

ps：CL_DEVICE_MAX_WORK_GROUP_SIZE: 1024 是说每个CU上三个维度总共最多1024个工作组。就是说localsize[0]*[1]*[2]<=1024！！！

另外大神夸了我，这个kernel写得很清晰，有进步。好开心！！！

这个版本与knn的区别就是取前k个最小的，而我取的直接就是前k个，这个后续再改就好了。

另外大神说，我这个忘记考虑一点了不然可以节省一半的时间，除了我自己想到的“对每个group的511个线程拿当前行的地址是重复的511次还有自己组的groupID也重复去拿了还有别的行的地址也是重复拿了”这里外，还有更重要的一点，我这个版本，第1行去算它与第8行的距离，第8行还会算一次它与第1行的距离，所以重复了一倍的计算！我目前想的改进就是“让每个group代表行只访问后续的group代表行即可解决你说的那个重复一半的问题不让它向前追溯 ”当然这个是后话（等我把取前k个最小写完编译运行都正确的情况下再来对这个部分提高性能）。

另外大神还说了一个要注意事项：

即group 0读取第888行的时候, 和group 777读取888行的时候,可能前者导致的888行被载入到cache中, 到了777个group需要888行的时候,888行实际上已经不在cache中了.导致需要从显存芯片被重复读取到cache一次(尽量不是这种情况).

二、第二个KNN版本(即仍旧是200X931的稀疏矩阵，取前k个最小距离及其相应位置，先不要求这k个最小距离一定要按从小到大排列)

我本来用的是CUDA里有的thrust下的sort.h，我想用sort_by_key()这个函数，于是在main.cpp前加入了#include <thrust/sort.h> 然后在cl文件中直接用sort_by_key()，这样距离和距离对应的位置可以跟着自动排序，但结果在编译kernel时报错：ptxas application ptx input, line16; fatal error :Parsing error near '[]': syntax error

ptxas: fatal error :Ptx assembly aborted due to errors

我检查了下，并不关第16行或者附近的[]的问题，我想应该是与网上比较多的“ptx, line 150; fatal error :Parsing error near '-': syntax error”这个报错就是因为路径中文，与什么150行的- 根本没关系。我感觉我这个也与[]没关系。但具体什么原因谷歌不到？！！！！！！

这个kernel：

__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localSizeInstance,__local float *localInstanceNoSort,
								  __local float *localTempInst199,__local int *localTempPos199)	 
{
	///the first step : calculate the distances!!!!
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	
	for(uint anotherID=0;anotherID<height;anotherID++)
	{
		if(anotherID==currentGroupID)
		{
			localInstanceNoSort[anotherID]=0;
			continue;
		}
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
         barrier(CLK_LOCAL_MEM_FENCE);
         if (get_local_id(0) < stride) 
        	{
            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
        	}
        }
        
     	if (get_local_id(0) == 0)
       	{ 
     			localInstanceNoSort[anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	
	
	///erase the 0 instance (the instance from the current row to iteself).
	if(get_local_id(0)<height)
	{
		if(get_local_id(0)<currentGroupID)
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0);
		}
		else if(get_local_id(0)>currentGroupID)
		{
			localTempInst199[get_local_id(0)+1]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0)+1;
		}
		else
		{
			;
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	extract the first k datas!!!
	  //solution1:
	//sort()! but it cannot work!!!
	if(get_local_id(0)==0)
	{
		//thrust::sort(localTempInst199,localTempInst199+199);
		//thrust::sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
		sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	if(get_local_id(0)<k)
	{
		instanceMatrix[currentGroupID*k+get_local_id(0)]=localTempInst199[get_local_id(0)];
		positionMatrix[currentGroupID*k+get_local_id(0)]=localTempPos199[get_local_id(0)];
	}
	barrier(CLK_LOCAL_MEM_FENCE);
}

所以我想如果这个函数sort_by_key()不报错那么这个版本的knn是可以用的。

三、第三个KNN版本

思路(我们公司的大神提供的思路，比我自己想的另一个好很多)：在想一个从N(非常非常大)个数里拿出前k个最小值和对应位置(k<<N)。先用小一点的数实验：N=199，k=5,thread_num=10,每个线程负责20个数(thread9其实只负责19个数)。对于某个thread，比如负责的数是:2,1，7，3，9，5，10，2，7，3，6，5.9，9，3.5，7.2，8，8.8，17，1，12。这20个数对应的位置分别是8，1，3，7，6，24，48，15，0，12，41，26，59，33，14，57，89，57，31，55. 先把前k个数放进一个data数组里[2，1，7，3，9] 同时将对应位置前k个放进pos数组[8，1，3，7，6]。先算data的最大值max为9.然后这个线程将第6个数(第6个数为5)与max比较，如果小于max，则替换。(pos中对应地方也要替换)。结果data为[2，1，7，3，5],pos为[8，1，3，7，24]然后又计算这个data的最大值max=7，将第7个数(为10)与此max比较，大于max不用替换。继续将第8个数2与max=7比较，小于它，又替换掉7，此时data变成[2，1，2，3，5] pos变成[8，1，15，7，24]....此线程遍历到第20个数结束时，此线程将负责的20个数变成了只有k=5个。。。共10个线程，所以199个数变成了50个数。然后进行第2轮，需要三个线程，每个线程负责20个数。。。

按照这个思路，我开始写cl文件，写完了但结果不正确：

__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localSizeInstance,__local float *localInstanceNoSort,
								  __local float *localTempInst199,__local int *localTempPos199,
								  __local float *localTempInst50,__local int *localTempPos50,
   							  __local float *localTempInst15,__local int *localTempPos15,
   							  __local float *data,__local int *pos )	 
{
	///the first step : calculate the distances!!!!
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	
	for(uint anotherID=0;anotherID<height;anotherID++)
	{
		if(anotherID==currentGroupID)
		{
			localInstanceNoSort[anotherID]=0;
			continue;
		}
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
         barrier(CLK_LOCAL_MEM_FENCE);
         if (get_local_id(0) < stride) 
        	{
            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
        	}
        }
        
     	if (get_local_id(0) == 0)
       	{ 
     			localInstanceNoSort[anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	
	
	///erase the 0 instance (the instance from the current row to iteself).
	if(get_local_id(0)<height)
	{
		if(get_local_id(0)<currentGroupID)
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0);
		}
		else if(get_local_id(0)>currentGroupID)
		{
			localTempInst199[get_local_id(0)+1]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0)+1;
		}
		else
		{
			;
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	extract the first k datas!!!
	/*  //solution1:
	//sort()! but it cannot work!!!
	if(get_local_id(0)==0)
	{
		//thrust::sort(localTempInst199,localTempInst199+199);
		//thrust::sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
		sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	if(get_local_id(0)<k)
	{
		instanceMatrix[currentGroupID*k+get_local_id(0)]=localTempInst199[get_local_id(0)];
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	
	
	//solution2:use these instead!
	//use the 10 threads of a group ,every thread include 20 datas,k=5
	//circle0 ,handle with 199 datas!
	if(get_local_id(0)<10)
	{
		//copy the forward k data and position
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst199[k0];
			pos[ind]=localTempPos199[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==9 && the1st==199)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst199[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst199[the1st];
				pos[tempMaxPos]=localTempPos199[the1st];
			}
			the1st+=1;
		}
		
		//result of circle0
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	if(get_local_id(0)<3)
	{
		//copy the forward k data and position
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst50[k0];
			pos[ind]=localTempPos50[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==2 && the1st==49)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst50[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst50[the1st];
				pos[tempMaxPos]=localTempPos50[the1st];
			}
			
			the1st+=1;
		}
		
		//result of circle1
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	if(get_local_id(0)<1)
	{
		//copy the forward k data and position
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst15[k0];
			pos[ind]=localTempPos15[k0];
		}
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(the1st==15)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			if(localTempInst15[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst15[the1st];
				pos[tempMaxPos]=localTempPos15[the1st];
			}
			
			the1st+=1;
		}
		//result of circle2
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
			positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
		}
		
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
}

main.cpp文件：

/*
 * main.cpp
 *breaf:knn algorithm with OpenCL accelaration
 *  Created on: 2017年6月13日
 *      Author: root
 */
//#define __CL_ENABLE_EXCEPTIONS
#include <stdio.h>
#include <stdlib.h>
//#include <CL/cl.hpp>
#include <CL/cl.h>
#include <thrust/sort.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;

const int width=931;
const int height=200;
const int k=5;

void readTxt(const char *txtname,float *matrix)
{
		//int width = 931, height = 200;
		//float *matrix = new float[width*height];
		//memset(matrix, 0,width*height * sizeof(float));

		//ifstream infile("fea_sparse.txt");
		ifstream infile(txtname);
		vector<string> vec_str;
		string line;
		size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos;//lineLength;
		//char *end;
		int lineID=0;
		while (getline(infile, line))
		{
			if (line.size() == 0)
			{
				continue;
			}
			for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
			{
				pos_maohao = line.find_first_of(":");
				pos_kongge = line.find_first_of(" ");
				if (pos_maohao == string::npos || pos_kongge == string::npos)
				{
					continue;
				}
				string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
				string tempData = line.substr(pos_maohao+1, 8);
				line = line.substr(pos_maohao + 9);

				//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
				int ind = atoi(tempIndex.c_str());
				float data = atof(tempData.c_str());
				//cout << ind << ":"<< data << " ";
				matrix[lineID*width + ind] = data;
		    }
			lineID += 1;
			//cout << endl;
		}
		infile.close();
		//delete[] matrix;
}

int main( int argc, const char** argv)
{
    shrQAStart(argc, (char **)argv);
    // set logfile name and start logs
    shrSetLogFileName ("oclKnn4Robin.txt");
    shrLog("%s Starting...\n\n", argv[0]);
    //Get the NVIDIA platform
    cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {

		 // write out standard error, Build Log and PTX, then return error
		 shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
		 oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
		 oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
		 return(EXIT_FAILURE);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knnForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix = new float[width*height];
	 const size_t matrixSize=width*height*sizeof(float);
	 memset(matrix, 0,matrixSize);
	 const char *txtfile="fea_sparse.txt";
	 readTxt(txtfile,matrix);
	 int *resultPosMatrix=new int[height*k];
	 size_t resultPosSize=height*k*sizeof(int);
	 memset(resultPosMatrix,0,resultPosSize);
	 float *resultInstanceMatrix=new float[height*k];
	 size_t resultInstanceSize=height*k*sizeof(float);
	 memset(resultInstanceMatrix,0,resultInstanceSize);

	 //send arguments to kernel file  | CL_MEM_COPY_HOST_PTR
	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height);
	 ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);
	 //ciErrNum |= clSetKernelArg(ckKernel, 6, k * sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, 512*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 7, height*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 8, 199*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 9, 199*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 10, 50*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 11, 50*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 12, 15*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 13, 15*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 14, k*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 15, k*sizeof(int), 0);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t localsize[2]={512,1};
	 size_t globalsize[2]={1024,200};
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	   //save as txt to check.
	 ofstream posOutfile("position.txt",ios::out);
	 ofstream instanceOutfile("instance.txt",ios::out);
	 for(size_t r=0;r<height;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 delete [] matrix;
	 delete [] resultPosMatrix;
	 delete [] resultInstanceMatrix;
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);
	 return 0;
}

但结果运行完不是正确的。

因为总共存在5处bug：host上1处，cl上4处（大神帮我看到4处自己只检查出1处）

1：host上的bug：

size_t globalsize[2]={1024,200};

我要的是200个，在纸上规划好了结果这里心急写了200，实际是100,100*2=200。

2：cl上4处bug：

a：从200个距离中去掉与自己这一行的距离0时，我以为越过了那个空位，实际没有，大神帮我点出此处，然后自己修改正确。

b：本来10个线程处理199个数，最后一个线程是处理19个数，我就写的19而不是199！相应的后面我写的9而不是49.后来才改过来。

c：第2个for中 break的条件应该是==50就break！

d：对于每个线程data和pos我设置的是__local型，每个线程都可以操作这个，导致data race！应该设置成每个线程独享的而不是local。

修改好后全部正确并稳定：

修改后的cl文件：

__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,
								  __local float *localSizeInstance,__local float *localInstanceNoSort,
								  __local float *localTempInst199,__local int *localTempPos199,
								  __local float *localTempInst50,__local int *localTempPos50,
   							  __local float *localTempInst15,__local int *localTempPos15)	 
{
	///the first step : calculate the distances!!!!
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	
	for(uint anotherID=0;anotherID<height;anotherID++)
	{
		if(anotherID==currentGroupID)
		{
			localInstanceNoSort[anotherID]=0;
			continue;
		}
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
         barrier(CLK_LOCAL_MEM_FENCE);
         if (get_local_id(0) < stride) 
        	{
            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
        	}
        }
        
     	if (get_local_id(0) == 0)
       	{ 
     			localInstanceNoSort[anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	
	
	///erase the 0 instance (the instance from the current row to iteself).
	if(get_local_id(0)<height-1)
	{
		if(get_local_id(0)<currentGroupID)
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0);
		}
		else
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)+1];
			localTempPos199[get_local_id(0)]=get_local_id(0)+1;
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	extract the first k min datas!!!
	//use the 10 threads of a group ,every thread include 20 datas,k=5
	//circle0 ,handle with 199 datas!
	if(get_local_id(0)<10)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst199[k0];
			pos[ind]=localTempPos199[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==9 && the1st>198)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst199[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst199[the1st];
				pos[tempMaxPos]=localTempPos199[the1st];
			}
			the1st+=1;
		}
		
		//result of circle0
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(get_local_id(0)==0)
	{
		 for(int r=0;r<50;r++)
		 {
				 int instanceData=localTempPos50[r];
				 printf("%d  \n",instanceData);
		 }
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	
	
	//circle1
	if(get_local_id(0)<3)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst50[k0];
			pos[ind]=localTempPos50[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==2 && the1st==50)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst50[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst50[the1st];
				pos[tempMaxPos]=localTempPos50[the1st];
			}
			
			the1st+=1;
		}
		
		//result of circle1
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(currentGroupID<200 && get_local_id(0)==0)
	{
	  printf("group:%d data:%f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",(int)currentGroupID,localTempInst15[0],localTempInst15[1],localTempInst15[2],localTempInst15[3],localTempInst15[4],localTempInst15[5],localTempInst15[6],localTempInst15[7],localTempInst15[8],localTempInst15[9],localTempInst15[10],localTempInst15[11],localTempInst15[12],localTempInst15[13],localTempInst15[14]);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	
	
	//circle2
	if(get_local_id(0)<1)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst15[k0];
			pos[ind]=localTempPos15[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(the1st>14)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst15[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst15[the1st];
				pos[tempMaxPos]=localTempPos15[the1st];
			}
			
			the1st+=1;
		}
		
		//result of circle2
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
			positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(currentGroupID<200 && get_local_id(0)==0)
	{
	  printf("group:%d data:%f %f %f %f %f\n\n",(int)currentGroupID,instanceMatrix[currentGroupID*k+0],instanceMatrix[currentGroupID*k+1],instanceMatrix[currentGroupID*k+2],instanceMatrix[currentGroupID*k+3],instanceMatrix[currentGroupID*k+4]);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
}

main.cpp文件：

const int width=931;
const int height=200;
const int k=5;

void readTxt(const char *txtname,float *matrix)
{
		//int width = 931, height = 200;
		//float *matrix = new float[width*height];
		//memset(matrix, 0,width*height * sizeof(float));

		//ifstream infile("fea_sparse.txt");
		ifstream infile(txtname);
		vector<string> vec_str;
		string line;
		size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos;//lineLength;
		//char *end;
		int lineID=0;
		while (getline(infile, line))
		{
			if (line.size() == 0)
			{
				continue;
			}
			for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
			{
				pos_maohao = line.find_first_of(":");
				pos_kongge = line.find_first_of(" ");
				if (pos_maohao == string::npos || pos_kongge == string::npos)
				{
					continue;
				}
				string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
				string tempData = line.substr(pos_maohao+1, 8);
				line = line.substr(pos_maohao + 9);

				//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
				int ind = atoi(tempIndex.c_str());
				float data = atof(tempData.c_str());
				//cout << ind << ":"<< data << " ";
				matrix[lineID*width + ind] = data;
		    }
			lineID += 1;
			//cout << endl;
		}

		/*
		cout << endl << endl;
		cout << "the matrix is..." << endl;
		for (int i = 0; i < height;i++)
		{
			for (int j = 0; j < width;j++)
			{
				cout << matrix[i*width + j] << " ";
			}
			cout << endl;
		}
		 */
		infile.close();
		//delete[] matrix;
}

int main( int argc, const char** argv)
{
    shrQAStart(argc, (char **)argv);
    // set logfile name and start logs
    shrSetLogFileName ("oclKnn4Robin.txt");
    shrLog("%s Starting...\n\n", argv[0]);
    //Get the NVIDIA platform
    cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {

		 // write out standard error, Build Log and PTX, then return error
		 shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
		 oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
		 oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
		 return(EXIT_FAILURE);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knnForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix = new float[width*height];
	 const size_t matrixSize=width*height*sizeof(float);
	 memset(matrix, 0,matrixSize);
	 const char *txtfile="fea_sparse.txt";
	 readTxt(txtfile,matrix);
	 int *resultPosMatrix=new int[height*k];
	 size_t resultPosSize=height*k*sizeof(int);
	 memset(resultPosMatrix,0,resultPosSize);
	 float *resultInstanceMatrix=new float[height*k];
	 size_t resultInstanceSize=height*k*sizeof(float);
	 memset(resultInstanceMatrix,0,resultInstanceSize);

	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height);
	 ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, 512*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 7, height*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 8, 199*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 9, 199*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 10, 50*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 11, 50*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 12, 15*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 13, 15*sizeof(int), 0);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t localsize[2]={512,1};
	 size_t globalsize[2]={1024,100};
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);

	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 //save as txt to check.
	 ofstream posOutfile("position.txt",ios::out);
	 ofstream instanceOutfile("instance.txt",ios::out);
	 for(size_t r=0;r<height;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 delete [] matrix;
	 delete [] resultPosMatrix;
	 delete [] resultInstanceMatrix;
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);
	 return 0;
}

(截图不完全)距离结果和位置结果(即这一行与第几行的距离)：

原稀疏矩阵：

另外大神说：

四、第4个KNN版本(仍旧是200X931的小测数据，但是计算距离时采用向下追溯节约时间)

__kernel void knn200ForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__global float *localInstanceNoSort0)	 
{
	///the first step : calculate the distances!!!!
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	//__global float localInstanceNoSort0[40000];
	__local float localSizeInstance[512];
	localInstanceNoSort0[currentGroupID*height+currentGroupID]=0;
	for(uint anotherID=currentGroupID+1;anotherID<height;anotherID++)
	{
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
			 barrier(CLK_LOCAL_MEM_FENCE);
			 if (get_local_id(0) < stride) 
			 {
			    localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
			 }
       	}	 
     	if (get_local_id(0) == 0)
       	 { 
     		localInstanceNoSort0[currentGroupID*height+anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	for(uint forward0=get_local_id(0);forward0<currentGroupID;forward0++)
	{
		localInstanceNoSort0[currentGroupID*height+forward0]=localInstanceNoSort0[forward0*height+currentGroupID];
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	//erase the 0 instance (the instance from the current row to iteself).
	__local float localTempInst199[199];
	__local int localTempPos199[199];
	if(get_local_id(0)<height-1)
	{
		if(get_local_id(0)<currentGroupID)
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort0[currentGroupID*height+get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0);
		}
		else
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort0[currentGroupID*height+get_local_id(0)+1];
			localTempPos199[get_local_id(0)]=get_local_id(0)+1;
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	//extract the first k min datas!!!
	 __local float localTempInst50[50];
	 __local int localTempPos50[50];
	//use the 10 threads of a group ,every thread include 20 datas,k=5
	//circle0 ,handle with 199 datas!
	if(get_local_id(0)<10)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst199[k0];
			pos[ind]=localTempPos199[k0];
		}
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==9 && the1st>198)
			{
				break;
			}
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			if(localTempInst199[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst199[the1st];
				pos[tempMaxPos]=localTempPos199[the1st];
			}
			the1st+=1;
		}
		//result of circle0
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(get_local_id(0)==0)
	{
		 for(int r=0;r<50;r++)
		 {
				 int instanceData=localTempPos50[r];
				 printf("%d  \n",instanceData);
		 }
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	//circle1
	 __local float localTempInst15[15];
	 __local int localTempPos15[15];
	if(get_local_id(0)<3)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst50[k0];
			pos[ind]=localTempPos50[k0];
		}
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==2 && the1st==50)
			{
				break;
			}
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			if(localTempInst50[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst50[the1st];
				pos[tempMaxPos]=localTempPos50[the1st];
			}
			the1st+=1;
		}
		//result of circle1
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(currentGroupID<200 && get_local_id(0)==0)
	{
	  printf("group:%d data:%f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",(int)currentGroupID,localTempInst15[0],localTempInst15[1],localTempInst15[2],localTempInst15[3],localTempInst15[4],localTempInst15[5],localTempInst15[6],localTempInst15[7],localTempInst15[8],localTempInst15[9],localTempInst15[10],localTempInst15[11],localTempInst15[12],localTempInst15[13],localTempInst15[14]);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	//circle2
	if(get_local_id(0)<1)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst15[k0];
			pos[ind]=localTempPos15[k0];
		}
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(the1st>14)
			{
				break;
			}
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			if(localTempInst15[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst15[the1st];
				pos[tempMaxPos]=localTempPos15[the1st];
			}
			the1st+=1;
		}
		//result of circle2
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
			positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(currentGroupID<200 && get_local_id(0)==0)
	{
	  printf("group:%d data:%f %f %f %f %f\n\n",(int)currentGroupID,instanceMatrix[currentGroupID*k+0],instanceMatrix[currentGroupID*k+1],instanceMatrix[currentGroupID*k+2],instanceMatrix[currentGroupID*k+3],instanceMatrix[currentGroupID*k+4]);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
}

结果与前面一致，是正确的。时间第3个版本是:13262us，而此版本是:4960us！！！！！！！！整整一倍多的时间啊！！！！其实还可以继续优化的，不用每次比较k个中的最大值，上次没有变的最大值下次就不用比，这样会更快！

大神说：她说的这个手工缓冲我下次再去想！

在写这个时，我开始是对localInstanceNoSort0[]这个是local型，OpenCL中只有barrier(CLK_LOCAL_MEM_FENCE)让同一个group内所有线程都到达这个栅栏点，再一起往下走。但opencl没有CLK_GLOBAL_MEM_FENCE这种说法，所以我后面的group中要拿前面某个group的结果，如果前面那个group还没有算完怎么给这个group！而且即使拿到了，第2个group的结果是data[]数组里，而第199个group的结果也是data[]。只是这两个data是并行，里面的东西不一样而已。但后面第199个group的data[]的前198个数要替换成第2个group的data[]的前198个数。虽然data是global型，但第199个group的data[0]要替换成第2个group的data[0]，这无法做到啊因为分不清谁是谁的！所以我将这个变量改成了global类型，进行了一个变样，不过实质是达到了这个目的，这样就行了。也就是说几个Work Group，那么必须通过全局变量进行通信。

于是我开始是：__global float localInstanceNoSort0[40000]; 但报错说OpenCL不支持这样，然后我又在host端

ciErrNum |= clSetKernelArg(ckKernel, 6, 40000*sizeof(float), 0);

结果报错说：<kernel>:error:global variables in function scope must have static storage class 所以后来我在host改成正经的创建一个新数组，创建此数组的buffer然后写进kernel的第7个参数！这样才行了！所以我想应该是：kernel中要使用global类型的数组变量，此变量必须是从host传进来的实际变量，而不是像传local型一样给个大小赋0传进kernel就好了！

五、第5个KNN版本(2002350X931的矩阵，即假设稀疏矩阵还原后的样子)

__kernel void knn2002350ForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localInstanceNoSort)	 
{
	__local float localSizeInstance[931];
	__local float localTempInst50[1024*5];
	__local float localTempPos50[1024*5];
	__local float localTempInst15[32*5];
	__local float localTempPos15[32*5];
	__local float localTempInst5[5*5];
	__local float localTempPos5[5*5];
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
	for (size_t y = currentGroupID; y < height; y += get_local_size(0))
	{
			the first step : calculate the distances!!!!
			__global float *row=matrix+width*y;
			localInstanceNoSort[y*height+y]=0;
			for(uint i=y+1;i<height;i+=1)
			{
				float sum=0;
				const __global float *rowAnother=matrix+width*i;
			   for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
				{
					sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
				}
				barrier(CLK_LOCAL_MEM_FENCE);
				
				localSizeInstance[get_local_id(0)]=sum;
				for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
				{
		         barrier(CLK_LOCAL_MEM_FENCE);
		         if (get_local_id(0) < stride) 
		        	{
		            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
		        	}
		        }
		     	if (get_local_id(0) == 0)
		       	{ 
		     			localInstanceNoSort[y*height+i] = localSizeInstance[0];
		      	 }
		    	 barrier(CLK_LOCAL_MEM_FENCE);
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			for(uint forward0=get_local_id(0);forward0<y;forward0+=get_local_size(0))
			{
				localInstanceNoSort[y*height+forward0]=localInstanceNoSort[forward0*height+y];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			/erase the 0 instance (the instance from the current row to iteself).
			__local float localTempInst199[2002349];
			__local int localTempPos199[2002349];
			for(uint t=get_local_id(0);t<height-1;t+=get_local_size(0))
			{
				if(t<currentGroupID)
				{
					localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t];
					localTempPos199[t]=t;
				}
				else
				{
					localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t+1];
					localTempPos199[t]=t+1;
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			///extract the first k min datas!!!
			//use the 1024 threads of a group ,every thread include 1956 datas,k=5
			//circle0 ,handle with 2002349 datas!
			//copy the forward k data and position
			float data[5];
			int pos[5];
			size_t ind=0;
			for(size_t k0=get_local_id(0)*1956;k0<get_local_id(0)*1956+k;k0++,ind++)
			{
				data[ind]=localTempInst199[k0];
				pos[ind]=localTempPos199[k0];
			}
			int the1st=get_local_id(0)*1956+k;
			int thelast=get_local_id(0)*1956+1956;
			while(the1st<thelast)
			{
				if(get_local_id(0)==1023 && the1st>2002348)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst199[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst199[the1st];
					pos[tempMaxPos]=localTempPos199[the1st];
				}
				the1st+=1;
			}
			//result of circle0
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			//circle1
			if(get_local_id(0)<32)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0)*32*k;k0<get_local_id(0)*32*k+k;k0++,ind++)
				{
					data[ind]=localTempInst50[k0];
					pos[ind]=localTempPos50[k0];
				}
				
				int the1st=get_local_id(0)*32*k+k;
				int thelast=get_local_id(0)*32*k+32*k;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst50[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst50[the1st];
						pos[tempMaxPos]=localTempPos50[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle2
			if(get_local_id(0)<5)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0)*32;k0<get_local_id(0)*32*+k;k0++,ind++)
				{
					data[ind]=localTempInst50[k0];
					pos[ind]=localTempPos50[k0];
				}
				
				int the1st=get_local_id(0)*32+k;
				int thelast=get_local_id(0)*32+32;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst5[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos5[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle2
			if(get_local_id(0)<1)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0);k0<get_local_id(0)+k;k0++,ind++)
				{
					data[ind]=localTempInst5[k0];
					pos[ind]=localTempPos5[k0];
				}
				
				int the1st=k;
				int thelast=25;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle2
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
					positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
	}
}

但这个kernel不够好，改成下面这样线程利用率更高：

__kernel void knn2002350ForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localInstanceNoSort)	 
{
	__local float localSizeInstance[931];
	__local float localTempInst50[1024*5];
	__local float localTempPos50[1024*5];
	__local float localTempInst15[32*5];
	__local float localTempPos15[32*5];
	__local float localTempInst5[5*5];
	__local float localTempPos5[5*5];
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
	for (size_t y = currentGroupID; y < height; y += get_local_size(0))
	{
			the first step : calculate the distances!!!!
			__global float *row=matrix+width*y;
			localInstanceNoSort[y*height+y]=0;
			for(uint i=get_local_id(0)+y+1;i<height;i+=get_local_size(0))
			{
				float sum=0;
				const __global float *rowAnother=matrix+width*i;
			   for(uint j=0;j<width;j+=1)
				{
					sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
				}
				localInstanceNoSort[y*height+i] = sum;
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			for(uint forward0=get_local_id(0);forward0<y;forward0+=get_local_size(0))
			{
				localInstanceNoSort[y*height+forward0]=localInstanceNoSort[forward0*height+y];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			/erase the 0 instance (the instance from the current row to iteself).
			__local float localTempInst199[2002349];
			__local int localTempPos199[2002349];
			for(uint t=get_local_id(0);t<height-1;t+=get_local_size(0))
			{
				if(t<currentGroupID)
				{
					localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t];
					localTempPos199[t]=t;
				}
				else
				{
					localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t+1];
					localTempPos199[t]=t+1;
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			///extract the first k min datas!!!
			//use the 1024 threads of a group ,every thread include 1956 datas,k=5
			//circle0 ,handle with 2002349 datas!
			//copy the forward k data and position
			float data[5];
			int pos[5];
			size_t ind=0;
			for(size_t k0=get_local_id(0)*1956;k0<get_local_id(0)*1956+k;k0++,ind++)
			{
				data[ind]=localTempInst199[k0];
				pos[ind]=localTempPos199[k0];
			}
			int the1st=get_local_id(0)*1956+k;
			int thelast=get_local_id(0)*1956+1956;
			while(the1st<thelast)
			{
				if(get_local_id(0)==1023 && the1st>2002348)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst199[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst199[the1st];
					pos[tempMaxPos]=localTempPos199[the1st];
				}
				the1st+=1;
			}
			//result of circle0
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle1
			if(get_local_id(0)<32)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0)*32*k;k0<get_local_id(0)*32*k+k;k0++,ind++)
				{
					data[ind]=localTempInst50[k0];
					pos[ind]=localTempPos50[k0];
				}
				
				int the1st=get_local_id(0)*32*k+k;
				int thelast=get_local_id(0)*32*k+32*k;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst50[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst50[the1st];
						pos[tempMaxPos]=localTempPos50[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle2
			if(get_local_id(0)<5)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0)*32;k0<get_local_id(0)*32*+k;k0++,ind++)
				{
					data[ind]=localTempInst50[k0];
					pos[ind]=localTempPos50[k0];
				}
				
				int the1st=get_local_id(0)*32+k;
				int thelast=get_local_id(0)*32+32;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst5[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos5[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle2
			if(get_local_id(0)<1)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0);k0<get_local_id(0)+k;k0++,ind++)
				{
					data[ind]=localTempInst5[k0];
					pos[ind]=localTempPos5[k0];
				}
				
				int the1st=k;
				int thelast=25;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle2
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
					positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
		
	}
}

main：

#include <stdio.h>
#include <stdlib.h>
//#include <CL/cl.hpp>
#include <CL/cl.h>
#include <time.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;

const int width=931;
const int height=200;
const int height2=2002350;
const int k=5;

int64_t system_time()
{
	struct timespec t;
	clock_gettime(CLOCK_MONOTONIC,&t);
	return (int64_t)(t.tv_sec)*1e9+t.tv_nsec;
}

int main( int argc, const char** argv)
{
	shrQAStart(argc, (char **)argv);
	// set logfile name and start logs
	shrSetLogFileName ("oclKnn4Robin.txt");
	shrLog("%s Starting...\n\n", argv[0]);
	//Get the NVIDIA platform
	cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn2002350.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {
		 // write out standard error, Build Log and PTX, then return error
		 shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
		 oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
		 oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
		 return(EXIT_FAILURE);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knn2002350ForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix;
	 const int matrixSizeNoFloat=width*height2;
	 const size_t matrixSize=matrixSizeNoFloat*sizeof(float);
	 matrix=(float*)malloc(matrixSize);
	 shrFillArray(matrix,matrixSizeNoFloat);
	 size_t resultPosSize=height2*k*sizeof(int);
	 int *resultPosMatrix=(int*)malloc(resultPosSize);
	 memset(resultPosMatrix,0,resultPosSize);
	 size_t resultInstanceSize=height2*k*sizeof(float);
	 float *resultInstanceMatrix=(float*)malloc(resultInstanceSize);
	 shrFillArray(resultInstanceMatrix,height2*k);

	 size_t localsize[2]={1024,1};
	 size_t globalsize[2]={1024,1024};

	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height2);
	 ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);

	 size_t NoSortSize=height2*height2*sizeof(float);
	 float *NoSort=(float*)malloc(NoSortSize);
	 memset(NoSort,0,NoSortSize);
	 cl_mem NoSort_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,NoSortSize,NoSort,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, NoSort_buffer, CL_FALSE, 0, NoSortSize, NoSort, 0, NULL, NULL);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(cl_mem), (void*)&NoSort_buffer);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 clFinish(commandQueue);
	 int64_t time_start=system_time();
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);
	 clFinish(commandQueue);
	 int64_t time_end=system_time();
	 printf("kernel execute time: %f(us)\n",(time_end-time_start)/1e3);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 //save as txt to check.
	 ofstream posOutfile("position2002350.txt",ios::out);
	 ofstream instanceOutfile("distance2002350.txt",ios::out);
	 for(size_t r=0;r<height2;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 free(matrix);
	 free(resultPosMatrix);
	 free(resultInstanceMatrix);
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);
	 
	 return 0;
}

报错：clBuildProgram(): ptxas:error:Entry function use too much shared data!!!

大神说：

__local float localTempInst199[2002349];
			__local int localTempPos199[2002349];

用得太多了！应该将这多行分段完成！(效率不高)

于是拆分成一节一节：即将原矩阵分成4个矩阵传进来，因为大神说：比如CL_DEVICE_MAX_MEM_ALLOC_SIZE:最大不能超过500M,而原本需要600MB的缓冲区，可以拆分成两个300MB的缓冲区传进kernel。

（1）NV的卡总是一次可以分配最多1/4的显存大小，所以你如果使用2GB显存的卡，一次能分配在512MB左右。此时可以考虑换一个较大显存的卡（例如4GB显存的，可以分配1GB一次）。

（2）CUDA没有此限制，可以一次性分配将近全部的（如果你是2GB的卡，能分配0MB 到大约2GB，一次性）。可以考虑转换到CUDA.

我也不再一下算某行与另2002350的距离，而是算其与1024个行的距离，利用循环遍历完2002350!

__kernel void knn2002350ForRobin(__global float *matrix,__global float *matrix2,__global float *matrix3,__global float *matrix4,
											int width,int height,int k,__global int *positionMatrix,
											__global float *instanceMatrix,__global float *localInstanceNoSort,
											__local float *distanceK,__local float *positionK)	 
{
	__local float localTempInst199[1024];
	__local int localTempPos199[1024];
	__local float localTempInst50[51*5+4];
	__local float localTempPos50[51*5+4];
	__local float localTempInst15[65];
	__local int localTempPos15[65];
	__local float localTempInst5[20];
	__local int localTempPos5[20];
	
	__local float localTempInst430[429];
	__local int localTempPos430[429];
	__local float localTempInst110[110];
	__local int localTempPos110[110];
	__local float localTempInst30[30];
	__local int localTempPos30[30];
	__local float localTempInst10[10];
	__local int localTempPos10[10];
	__local float localTempInst489[2445];
	__local int localTempPos489[2445];
	__local float localTempInst123[123*5];
	__local int localTempPos123[123*5];
	__local float localTempInst31[31*5];
	__local int localTempPos31[31*5];
	__local float localTempInst40[40];
	__local int localTempPos40[40];

	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
	for (size_t y = currentGroupID; y < height; y += get_local_size(0))
	{
		the first step : calculate the distances!!!!
		__global float *row;
		if(currentGroupID<500000)
		{
			row=matrix+width*y;
		}
		else if(currentGroupID>=500000 && currentGroupID<1000000)
		{
			row=matrix2+width*(y-500000);
		}
		else if(currentGroupID>=1000000 && currentGroupID<1500000)
		{
			row=matrix3+width*(y-1000000);
		}
		else
		{
			row=matrix4+width*(y-1500000);
		}
		localInstanceNoSort[y*height+y]=0;
		const __global float *rowAnother;
		for(uint i=get_local_id(0)+y+1;i<height;i+=get_local_size(0))
		{
			float sum=0;
			if(i<500000)
			{
				rowAnother=matrix+width*i;
			}
			else if(i>=500000 && i<1000000)
			{
				rowAnother=matrix2+width*(i-500000);
			}
			else if(i>=1000000 && i<1500000)
			{
				rowAnother=matrix3+width*(i-1000000);
			}
			else
			{
				rowAnother=matrix4+width*(i-1500000);
			}
			
		   for(uint j=0;j<width;j+=1)
			{
				sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
			}
			localInstanceNoSort[y*height+i] = sum;
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		for(uint forward0=get_local_id(0);forward0<y;forward0+=get_local_size(0))
		{
			localInstanceNoSort[y*height+forward0]=localInstanceNoSort[forward0*height+y];
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		//2002350=1955*1024+430
		for(uint index=0;index<1955;index++)
		{
			//get 1024 datas from 2002350 every time .
			uint t=get_local_id(0);
			if(index*1024+t<currentGroupID)
			{
				localTempInst199[t]=localInstanceNoSort[currentGroupID*height+index*1024+t];
				localTempPos199[t]=index*1024+t;
			}
			else
			{
				localTempInst199[t]=localInstanceNoSort[currentGroupID*height+index*1024+t+1];
				localTempPos199[t]=index*1024+t+1;
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			/circle0 ,handle with 1024 datas!
			//use the 52 threads of a group ,every thread handles 20 datas,k=5
			//copy the forward k data and position
			float data[5];
			int pos[5];
			int ind=0;
			if(get_local_id(0)<52)
			{
				for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
				{
					if(k0<1024)
					{
						data[ind]=localTempInst199[k0];
						pos[ind]=localTempPos199[k0];
					}
					else
					{
						break;
					}
				}
			
				int the1st=get_local_id(0)*20+k;
				int thelast=get_local_id(0)*20+20;
				while(the1st<thelast)
				{
					if(get_local_id(0)==52)
					{
						break;
					}
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					if(localTempInst199[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst199[the1st];
						pos[tempMaxPos]=localTempPos199[the1st];
					}
					the1st+=1;
				}
				//result of circle0
				if(get_local_id(0)<51)
				{
					for(size_t finalk0=0;finalk0<k;finalk0++)
					{
						localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
						localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
					}
				}
				else
				{
					for(size_t finalk0=0;finalk0<k-1;finalk0++)
					{
						localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
						localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
					}
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			
			/circle1 ,handle with 514 datas!
			//use the 13 threads of a group ,every thread handles 20 datas,k=5
			//copy the forward k data and position
			//float data[5];
			//int pos[5];
			ind=0;
			if(get_local_id(0)<13)
			{
				for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
				{
					if(k0<259)
					{
						data[ind]=localTempInst50[k0];
						pos[ind]=localTempPos50[k0];
					}
					else
					{
						break;
					}
				}
			
				int the1st=get_local_id(0)*20+k;
				int thelast=get_local_id(0)*20+20;
				while(the1st<thelast)
				{
					if(get_local_id(0)==12 && the1st>18)
					{
						break;
					}
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					if(localTempPos50[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst50[the1st];
						pos[tempMaxPos]=localTempPos50[the1st];
					}
					the1st+=1;
				}
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			
			/circle2 ,handle with 65 datas!
			//use the 4 threads of a group ,every thread handles 20 datas,k=5
			//copy the forward k data and position
			//float data[5];
			//int pos[5];
			ind=0;
			if(get_local_id(0)<4)
			{
				for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
				{
					if(k0<65)
					{
						data[ind]=localTempInst15[k0];
						pos[ind]=localTempPos15[k0];
					}
					else
					{
						break;
					}
				}
			
				int the1st=get_local_id(0)*20+k;
				int thelast=get_local_id(0)*20+20;
				while(the1st<thelast)
				{
					if(get_local_id(0)==4)
					{
						break;
					}
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					if(localTempPos15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					the1st+=1;
				}
				//result of circle2
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst5[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos5[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			
			/circle3 ,handle with 20 datas!
			//use the 1 thread of a group ,every thread handles 20 datas,k=5
			//copy the forward k data and position
			//float data[5];
			//int pos[5];
			ind=0;
			if(get_local_id(0)<1)
			{
				for(size_t k0=0;k0<k;k0++,ind++)
				{
					data[ind]=localTempInst15[k0];
					pos[ind]=localTempPos15[k0];
				}
			
				int the1st=k;
				int thelast=20;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					if(localTempPos5[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst5[the1st];
						pos[tempMaxPos]=localTempPos5[the1st];
					}
					the1st+=1;
				}
				//result of circle2
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					distanceK[index*k+finalk0]=data[finalk0];
					positionK[index*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
		}
		
		get the last 430 or 429 datas from 2002350 .
		uint t=get_local_id(0);
		uint tt=1955*1024+t;
		if(t<429)
		{
			if(currentGroupID<1955*1024)
			{
				
				localTempInst430[t]=localInstanceNoSort[currentGroupID*height+tt+1];
				localTempPos430[t]=tt+1;
			}
			else
			{
				if(tt<currentGroupID)
				{
					localTempInst430[t]=localInstanceNoSort[currentGroupID*height+tt];
					localTempPos430[t]=tt;
				}
				else
				{
					localTempInst430[t]=localInstanceNoSort[currentGroupID*height+tt+1];
					localTempPos430[t]=tt+1;
				}
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/circle0 ,handle with  430 or 429 datas!
		//use the 22 threads of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		float data[5];
		int pos[5];
		int ind=0;
		if(get_local_id(0)<22)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=localTempInst430[k0];
				pos[ind]=localTempPos430[k0];
			}
		
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==21 && the1st==429)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst430[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst430[the1st];
					pos[tempMaxPos]=localTempPos430[the1st];
				}
				the1st+=1;
			}
			//result circle0
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst110[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos110[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/circle1 ,handle with  110 datas!
		//use the 5 threads of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<6)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=localTempInst110[k0];
				pos[ind]=localTempPos110[k0];
			}
		
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==5 && the1st==10)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst110[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst110[the1st];
					pos[tempMaxPos]=localTempPos110[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst30[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos30[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/circle2 ,handle with  30 datas!
		//use the 2 threads of a group ,every thread handles 15 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<2)
		{
			for(size_t k0=get_local_id(0)*15;k0<get_local_id(0)*15+k;k0++,ind++)
			{
				data[ind]=localTempInst110[k0];
				pos[ind]=localTempPos110[k0];
			}
		
			int the1st=get_local_id(0)*15+k;
			int thelast=get_local_id(0)*15+15;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst30[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst30[the1st];
					pos[tempMaxPos]=localTempPos30[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst10[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos10[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/circle3 ,handle with 20 datas!
		//use the 1 thread of a group ,every thread handles 10 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<1)
		{
			for(size_t k0=0;k0<k;k0++,ind++)
			{
				data[ind]=localTempInst10[k0];
				pos[ind]=localTempPos10[k0];
			}
		
			int the1st=k;
			int thelast=10;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempPos10[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst10[the1st];
					pos[tempMaxPos]=localTempPos10[the1st];
				}
				the1st+=1;
			}
			//result of circle2
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				distanceK[1955*k+finalk0]=data[finalk0];
				positionK[1955*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		result of a group :1956*5 data
		/circle0,handle with 1956*5 datas!
		//use the 489 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<489)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
		
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst489[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos489[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		/circle1,handle with 489*5 datas!
		//use the 123 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<123)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==122)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst123[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos123[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		/circle2,handle with 123*5 datas!
		//use the 31 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<31)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==30 && the1st>614)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst31[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos31[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/circle3,handle with 31*5 datas!
		//use the 8 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<8)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==7 && the1st>154)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst40[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos40[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		/circle4,handle with 40 datas!
		//use the 2 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<2)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst10[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos10[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/circle5,handle with 10 datas!
		//use the 1 thread of a group ,every thread handles 10 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<1)
		{
			for(size_t k0=0;k0<k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=k;
			int thelast=10;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
				positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
	}
}

#include <stdio.h>
#include <stdlib.h>
//#include <CL/cl.hpp>
#include <CL/cl.h>
#include <time.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;

const int width=931;
const int height=200;
const int height2=500000;
const int height3=2350;
//const int height2=2002350; 2002350=500000*4+2350
const int k=5;

int64_t system_time()
{
	struct timespec t;
	clock_gettime(CLOCK_MONOTONIC,&t);
	return (int64_t)(t.tv_sec)*1e9+t.tv_nsec;
}

int main( int argc, const char** argv)
{
	
	shrQAStart(argc, (char **)argv);
	// set logfile name and start logs
	shrSetLogFileName ("oclKnn4Robin.txt");
	shrLog("%s Starting...\n\n", argv[0]);
	//Get the NVIDIA platform
	cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn2002350advance.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {
		 printf("cannot build program successfully!\n");
		 size_t logSize;
		 ciErrNum = clGetProgramBuildInfo(cpProgram, device, CL_PROGRAM_BUILD_LOG,0, NULL, &logSize);
		 char *log = (char*)malloc(logSize);
		 ciErrNum = clGetProgramBuildInfo(cpProgram, device, CL_PROGRAM_BUILD_LOG,logSize, log, NULL);
		 printf("%s\n", log);
		 free(log);
		 exit(-1);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knn2002350ForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix,*matrix2,*matrix3,*matrix4;
	 const int matrixSizeNoFloat=width*height2;
	 const size_t matrixSize=matrixSizeNoFloat*sizeof(float);
	 matrix=(float*)malloc(matrixSize);
	 memset(matrix,0,matrixSize);
	 matrix2=(float*)malloc(matrixSize);
	 memset(matrix2,0,matrixSize);
	 matrix3=(float*)malloc(matrixSize);
	 memset(matrix3,0,matrixSize);
	 const size_t matrixSize4=width*height3*sizeof(float);
	 matrix4=(float*)malloc(matrixSize4);
	 memset(matrix4,0,matrixSize4);
	 //shrFillArray(matrix,matrixSizeNoFloat);
	 size_t resultPosSize=height2*k*sizeof(int);
	 int *resultPosMatrix=(int*)malloc(resultPosSize);
	 memset(resultPosMatrix,0,resultPosSize);
	 size_t resultInstanceSize=height2*k*sizeof(float);
	 float *resultInstanceMatrix=(float*)malloc(resultInstanceSize);
	 shrFillArray(resultInstanceMatrix,height2*k);

	 size_t localsize[2]={1024,1};
	 size_t globalsize[2]={1024,1024};

	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 cl_mem matrix2_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix2, &ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix2_buffer, CL_FALSE, 0, matrixSize, matrix2, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_mem matrix3_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix3, &ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix3_buffer, CL_FALSE, 0, matrixSize, matrix3, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_mem matrix4_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize4, matrix4, &ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix4_buffer, CL_FALSE, 0, matrixSize4, matrix4, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum  = clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&matrix2_buffer);
	 ciErrNum  = clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&matrix3_buffer);
	 ciErrNum  = clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&matrix4_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(int), &height2);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 8, sizeof(cl_mem), (void*)&result_ins_buffer);

	 size_t NoSortSize=height2*height2*sizeof(float);
	 float *NoSort=(float*)malloc(NoSortSize);
	 memset(NoSort,0,NoSortSize);
	 cl_mem NoSort_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,NoSortSize,NoSort,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, NoSort_buffer, CL_FALSE, 0, NoSortSize, NoSort, 0, NULL, NULL);
	 ciErrNum |= clSetKernelArg(ckKernel, 9, sizeof(cl_mem), (void*)&NoSort_buffer);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t KDistanceSize=1956*5*sizeof(float);
	 float *KDistance=(float*)malloc(KDistanceSize);
	 memset(KDistance,0,KDistanceSize);
	 cl_mem KDistance_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,KDistanceSize,KDistance,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, KDistance_buffer, CL_FALSE, 0, KDistanceSize, KDistance, 0, NULL, NULL);
	 ciErrNum |= clSetKernelArg(ckKernel, 10, sizeof(cl_mem), (void*)&KDistance_buffer);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t KPositinSize=1956*5*sizeof(float);
	 float *KPosition=(float*)malloc(KPositinSize);
	 memset(KPosition,0,KPositinSize);
	 cl_mem KPosition_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,KPositinSize,KPosition,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, KDistance_buffer, CL_FALSE, 0, KPositinSize, KPosition, 0, NULL, NULL);
	 ciErrNum |= clSetKernelArg(ckKernel, 11, sizeof(cl_mem), (void*)&KPosition_buffer);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 clFinish(commandQueue);
	 int64_t time_start=system_time();
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);
	 clFinish(commandQueue);
	 int64_t time_end=system_time();
	 printf("kernel execute time: %f(us)\n",(time_end-time_start)/1e3);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 //save as txt to check.
	 ofstream posOutfile("position2002350.txt",ios::out);
	 ofstream instanceOutfile("distance2002350.txt",ios::out);
	 for(size_t r=0;r<height2;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 free(matrix);
	 free(resultPosMatrix);
	 free(resultInstanceMatrix);
	 free(NoSort);
	 free(KDistance);
	 free(KPosition);
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);


	 return 0;
}

第一次WriteBuffer通过了，第二次WriteBuffer时返回 -4.超过限制？！

元气少女缘结神

关注

3
点赞
踩
2

收藏

觉得还不错? 一键收藏
打赏
2
评论
读取大型稀疏矩阵&KNN算法的OpenCL加速版本

一、先以一个简单的稀疏矩阵为例，读进这个稀疏矩阵即这个稀疏矩阵本来其实有4行12列。前面第1列0不用管，是不要的。#include //#include #include #include #include using namespace std;using namespace cv;void main(){ int width = 12, height = 4; f
复制链接

扫一扫