kNN(k Nearest Neighbor)是常用的群集算法(Cluster Algorithm)用于空间搜索。目前最快的kNN方法莫过于KDTree的版本,不过基本上都是CPU的比如ANN C++ Library。对于GPU来说,实现加速结构比较复杂,因为没有栈所以无法递归,而且执行资源有限,不能像CPU一样舒舒服服的顺序执行。
为了方便起见我直接用komrade粘合C++与CUDA,下面是测试程序的代码。
<!--
Code highlighting produced by Actipro CodeHighlighter (freeware)
http://www.CodeHighlighter.com/
--> #include < komrade / device_vector.h >
#include < komrade / transform.h >
#include < komrade / range.h >
#include < komrade / copy.h >
#include < komrade / fill.h >
#include < komrade / sort.h >
#include < komrade / replace.h >
#include < komrade / functional.h >
#include < iostream >
#include < iterator >
#include < math.h >
#include < memory >
#include < boost / timer.hpp >
using namespace std;
#include < ANN / ANN.h >
#pragma comment(lib,"ANN.lib")
struct dist
{
dist( float qx, float qy, float qz) : x(qx), y(qy), z(qz)
{
}
float x,y,z;
__host__ __device__ float operator ()( const float3 & p) const
{
float a = p.x - x;
float b = p.y - y;
float c = p.z - z;
return a * a + b * b + c * c;
}
};
int main( int argc, char * argv[])
{
if ( argc < 2 )
return - 1 ;
int N = 0 ;
sscanf(argv[ 1 ], " %d " , & N);
boost::timer Stopwatch;
ANNpointArray ANNPh = annAllocPts(N, 3 );
komrade::host_vector < float3 > Ph(N);
komrade::device_vector < int > Idx(N);
komrade::device_vector < float > Dist(N);
for ( int i = 0 ; i < N; ++ i)
{
float x = ( float )rand() / ( float )RAND_MAX;;
float y = ( float )rand() / ( float )RAND_MAX;;
float z = ( float )rand() / ( float )RAND_MAX;;
Ph[i].x = x;
Ph[i].y = y;
Ph[i].z = z;
ANNPh[i][ 0 ] = x;
ANNPh[i][ 1 ] = y;
ANNPh[i][ 2 ] = z;
if ( N < 32 )
printf( " <%f %f %f>\n " ,x,y,z);
Idx[i] = i;
}
cout << fixed << " Generate Data Used " << Stopwatch.elapsed() << endl;
Stopwatch.restart();
float QueryPos[ 3 ];
QueryPos[ 0 ] = ( float )rand() / ( float )RAND_MAX * 0.1f + 0.5f ;
QueryPos[ 1 ] = ( float )rand() / ( float )RAND_MAX * 0.1f + 0.5f ;
QueryPos[ 2 ] = ( float )rand() / ( float )RAND_MAX * 0.1f + 0.5f ;
if ( N < 32 )
printf( " [%f %f %f]\n " ,QueryPos[ 0 ],QueryPos[ 1 ],QueryPos[ 2 ]);
komrade::device_vector < float3 > Pd = Ph;
cout << fixed << " Copy Data Used " << Stopwatch.elapsed() << endl;
Stopwatch.restart();
komrade::transform(Pd.begin(), Pd.end(), Dist.begin(), dist(QueryPos[ 0 ], QueryPos[ 1 ], QueryPos[ 2 ]));
komrade::sort_by_key(Dist.begin(), Dist.end(), Idx.begin());
cout << fixed << " Sort Data by CUDA Used " << Stopwatch.elapsed() << endl;
Stopwatch.restart();
ANNkd_tree ANNTree(ANNPh,N, 3 );
ANNidx ANNIdx[ 1 ];
ANNdist ANNDst[ 1 ];
ANNTree.annkSearch(QueryPos, 1 ,ANNIdx,ANNDst);
cout << fixed << " Sort Data by ANN Used " << Stopwatch.elapsed() << endl;
if ( N < 32 )
{
copy(Dist.begin(), Dist.end(), ostream_iterator < float > (cout, " " ));
cout << endl;
copy(Idx.begin(), Idx.end(), ostream_iterator < int > (cout, " " ));
}
return 0 ;
}
Code highlighting produced by Actipro CodeHighlighter (freeware)
http://www.CodeHighlighter.com/
--> #include < komrade / device_vector.h >
#include < komrade / transform.h >
#include < komrade / range.h >
#include < komrade / copy.h >
#include < komrade / fill.h >
#include < komrade / sort.h >
#include < komrade / replace.h >
#include < komrade / functional.h >
#include < iostream >
#include < iterator >
#include < math.h >
#include < memory >
#include < boost / timer.hpp >
using namespace std;
#include < ANN / ANN.h >
#pragma comment(lib,"ANN.lib")
struct dist
{
dist( float qx, float qy, float qz) : x(qx), y(qy), z(qz)
{
}
float x,y,z;
__host__ __device__ float operator ()( const float3 & p) const
{
float a = p.x - x;
float b = p.y - y;
float c = p.z - z;
return a * a + b * b + c * c;
}
};
int main( int argc, char * argv[])
{
if ( argc < 2 )
return - 1 ;
int N = 0 ;
sscanf(argv[ 1 ], " %d " , & N);
boost::timer Stopwatch;
ANNpointArray ANNPh = annAllocPts(N, 3 );
komrade::host_vector < float3 > Ph(N);
komrade::device_vector < int > Idx(N);
komrade::device_vector < float > Dist(N);
for ( int i = 0 ; i < N; ++ i)
{
float x = ( float )rand() / ( float )RAND_MAX;;
float y = ( float )rand() / ( float )RAND_MAX;;
float z = ( float )rand() / ( float )RAND_MAX;;
Ph[i].x = x;
Ph[i].y = y;
Ph[i].z = z;
ANNPh[i][ 0 ] = x;
ANNPh[i][ 1 ] = y;
ANNPh[i][ 2 ] = z;
if ( N < 32 )
printf( " <%f %f %f>\n " ,x,y,z);
Idx[i] = i;
}
cout << fixed << " Generate Data Used " << Stopwatch.elapsed() << endl;
Stopwatch.restart();
float QueryPos[ 3 ];
QueryPos[ 0 ] = ( float )rand() / ( float )RAND_MAX * 0.1f + 0.5f ;
QueryPos[ 1 ] = ( float )rand() / ( float )RAND_MAX * 0.1f + 0.5f ;
QueryPos[ 2 ] = ( float )rand() / ( float )RAND_MAX * 0.1f + 0.5f ;
if ( N < 32 )
printf( " [%f %f %f]\n " ,QueryPos[ 0 ],QueryPos[ 1 ],QueryPos[ 2 ]);
komrade::device_vector < float3 > Pd = Ph;
cout << fixed << " Copy Data Used " << Stopwatch.elapsed() << endl;
Stopwatch.restart();
komrade::transform(Pd.begin(), Pd.end(), Dist.begin(), dist(QueryPos[ 0 ], QueryPos[ 1 ], QueryPos[ 2 ]));
komrade::sort_by_key(Dist.begin(), Dist.end(), Idx.begin());
cout << fixed << " Sort Data by CUDA Used " << Stopwatch.elapsed() << endl;
Stopwatch.restart();
ANNkd_tree ANNTree(ANNPh,N, 3 );
ANNidx ANNIdx[ 1 ];
ANNdist ANNDst[ 1 ];
ANNTree.annkSearch(QueryPos, 1 ,ANNIdx,ANNDst);
cout << fixed << " Sort Data by ANN Used " << Stopwatch.elapsed() << endl;
if ( N < 32 )
{
copy(Dist.begin(), Dist.end(), ostream_iterator < float > (cout, " " ));
cout << endl;
copy(Idx.begin(), Idx.end(), ostream_iterator < int > (cout, " " ));
}
return 0 ;
}
下面是性能分析对比,可以看到CUDA版本在大量点的情况下还是非常有优势的,不过在纯RenderFarm就用不上了,恐怕只能在装备有G80以上芯片的桌面环境或者工作站上执行。测试环境如下,
- Intel E5200@2.9G
- Apacer 2Gx2
- 9800GT 512M
- CUDA 2.2, komrade 0.9
| 10 | 100 | 1000 | 10000 | 100000 | 1000000 |
CUDA | 0.003 | 0.003 | 0.00300 | 0.007 | 0.012 | 0.038 |
ANN | 0.000 | 0.000 | 0.00100 | 0.008 | 0.131 | 2.490 |
from: http://www.cnblogs.com/Jedimaster/archive/2009/06/07/1497985.html