利用OpenCV sample修改的,研一的时候花过一点时间做过一个小项目
k-means有时也被称作c-means(其模糊化版本即fuzzy c-means),至于原理,网上的资料很多,随便google一下就可以,wiki上面的就不错
k-means的原理比较简单,就是不停的迭代寻找聚类最优点,实现分类。
这几天工作上想对潜在客户做个分类,重新看了下代码,好多都有点忘了,囧~~~~~~
工科生,总觉得data-driven出来的结果肯定要比领导拍拍脑袋的结果要好
写个blog,OpenCV是个好东西,要坚持使用下去,
// Input samples: sample_count rows, each a 2-channel float entry (CV_32FC2),
// i.e. one 2-D point per row. For higher-dimensional data this type/shape changes.
CvMat* points = cvCreateMat( sample_count, 1, CV_32FC2 );
// Output labels: one 32-bit integer cluster index per sample.
CvMat* clusters = cvCreateMat( sample_count, 1, CV_32SC1 );
// Cluster centers: one double-precision row per cluster; the width is the
// flattened sample dimension (cols * channel count).
CvMat* center_thy = cvCreateMat( cluster_count, points->cols * CV_MAT_CN(points->type), CV_64FC1 );
points是输入的数据,注意这个是二维情况下,如果多维向量这边就要变化
要注意access cvmat数据的方式和他们的type,这个上面很容易出错
centers保存每个cluster(label)的中心,可以由此得到各个中心的坐标
把这个函数放到opencv\samples\kmeans.c里面代替掉相应的函数调用就可以使用了
先凑合着mark下吧,留个纪念。
貌似最近国家领导人挺关注互联网企业的,额,我作为一个pre-XX的人,心里还是好酸啊,
不提了,还是多看看程序吧
void kmeans_thy(const CvMat* samples, CvTermCriteria termcrit, CvMat* centers, CvMat* labels)
{
CvMat* ret_centers = centers; // cache the outter memory, so that it can return correctly.
int numof_samples = samples->rows; // number of samples
int dimof_samples = samples->cols* CV_MAT_CN(samples->type);; // dimension of samples
int numof_clusters = centers->rows; // number of cluters
termcrit = cvCheckTermCriteria(termcrit, 1e-6, 100);
termcrit.epsilon *= termcrit.epsilon; // use square dist, so ...
std::cout << "MaxIter: " << termcrit.max_iter << ", Eps: " << termcrit.epsilon << std::endl;
CvMat* old_centers = cvCreateMat(numof_clusters, dimof_samples, CV_64FC1);//centers should be same as centers
CvMat* counters = cvCreateMat( numof_clusters,1, CV_32SC1);//counters is the box
CvMat* temp = 0;
int ids_delta = labels->step ? labels->step/(int)sizeof(int) : 1;
CvRNG rng = CvRNG(-1);
// randomly assign each sample an label, the number of label is the number of cluster
for (int i=0; i<numof_samples; ++i)
{
labels->data.i[i] = cvRandInt(&rng) % numof_clusters;
}
double max_dist = termcrit.epsilon * 2;
for (int iter=0; iter<termcrit.max_iter; ++iter)//you knows
{
std::cout << "iteration: " << iter << std::endl;
cvZero(centers);
cvZero(counters);
// accumulate the centers
for (int i=0; i<numof_samples; ++i)
{
// s: the i-th sample.
float* s = (float*)(samples->data.ptr + i * samples->step);
// k: the label of s.
int k = labels->data.i[i*ids_delta];
// c: the k-th cluster, the label of s is c.
double* c = (double*)(centers->data.ptr + k*centers->step);
for ( int j = 0; j<dimof_samples; ++j)
{
c[j] += s[j];
}
// increase the counter.
counters->data.i[k]++;
}
if (iter > 0)
{
max_dist = 0;
}
for (int k=0; k<numof_clusters; ++k)
{
double* c = (double*)(centers->data.ptr + k*centers->step);
if (counters->data.i[k] != 0)
{
//normalize
double scale = 1.0 / counters->data.i[k];
for (int j=0; j<dimof_samples; ++j)
{
c[j] *= scale;
}
}
else
{
//random assign
int i = cvRandInt(&rng) % numof_samples;
float* s = (float*)(samples->data.ptr + i*samples->step);
for (int j=0; j<dimof_samples; ++j)
{
c[j] = s[j];
}
}
// compare distance between centers and old centers.
if (iter > 0)
{
double dist = 0;
double* c_o = (double*)(old_centers->data.ptr + k*old_centers->step);
for (int j=0 ; j<dimof_samples; ++j)
{
double t = c[j] - c_o[j];
dist += t*t;
}
if (max_dist < dist)
{
max_dist = dist;
}
}
}
//assign_labels(samples, centers, labels);
if( max_dist < termcrit.epsilon )
break;
for( int i = 0; i < numof_samples; i++ )
{
float* s = (float*)(samples->data.ptr + i*samples->step);
int k_best = 0;
double min_dist = DBL_MAX;
for(int k = 0; k < numof_clusters; k++ )
{
double* c = (double*)(centers->data.ptr + k*centers->step);
double dist = 0;
for( int j = 0; j < dimof_samples; j++ )
{
double t = c[j] - s[j];
dist += t*t;
}
if( min_dist > dist )
{
min_dist = dist;
k_best = k;
}
}
labels->data.i[i*ids_delta] = k_best;
}
// check converge
std::cout << "max_dist: " << max_dist << std::endl;
if (max_dist < termcrit.epsilon)
{
break;
}
CV_SWAP(centers, old_centers, temp);
}
cvReleaseMat(&old_centers);
cvReleaseMat(&counters);
}