源码位于 opencv-2.4.9\modules\ocl\src\kmeans.cpp
主要包含两个函数:一个是中心点选取法(k-means++: The Advantages of Careful Seeding),另一个是 kmeans 聚类主算法。
generateCentersPP函数
对应于 k-means++ 的中心点初始化。
该方法引入随机化:下一个被选为中心点的样本不是固定的,而是按概率选取;该概率正比于样本到已选中心集合的“整体最小距离”(即离现有中心越远的样本,越可能被选为新中心)。
/*
k-means center initialization using the following algorithm:
Arthur & Vassilvitskii (2007) k-means++: The Advantages of Careful Seeding
*/
// Picks K initial cluster centers from _data (one sample per row, CV_32F) and
// copies them into _out_centers (K x dims). The first center is a uniformly
// random sample; each subsequent center is drawn with probability proportional
// to its squared distance from the nearest already-chosen center, repeated
// `trials` times per center, keeping the candidate that minimizes the total
// potential (sum of min squared distances).
static void generateCentersPP(const Mat& _data, Mat& _out_centers,
int K, RNG& rng, int trials)
{
int i, j, k, dims = _data.cols, N = _data.rows;
const float* data = (float*)_data.data;
// row stride in floats (Mat::step is in bytes)
size_t step = _data.step / sizeof(data[0]);
// indices (into _data's rows) of the K chosen centers
vector<int> _centers(K);
int* centers = &_centers[0];
// one allocation backing three N-sized distance buffers
vector<float> _dist(N * 3);
// dist:   current min squared distance of each sample to the chosen centers
// tdist:  best candidate's distance array for the current k
// tdist2: scratch buffer for the candidate being evaluated
float* dist = &_dist[0], * tdist = dist + N, * tdist2 = tdist + N;
double sum0 = 0;
// pick the first center uniformly at random among the N samples
centers[0] = (unsigned)rng % N;
// initialize dist[] with squared distances to that first center,
// and sum0 with the total potential
for (i = 0; i < N; i++)
{
dist[i] = normL2Sqr_(data + step * i, data + step * centers[0], dims);
sum0 += dist[i];
}
// choose the remaining K-1 centers
for (k = 1; k < K; k++)
{
double bestSum = DBL_MAX;
int bestCenter = -1;
// try `trials` random candidates; keep the one with the smallest potential
for (j = 0; j < trials; j++)
{
// (double)rng is uniform in [0,1), so p is uniform in [0, sum0):
// this samples a point with probability proportional to dist[i]
double p = (double)rng * sum0, s = 0;
// walk the prefix sums of dist[] until p is exhausted;
// the sample where we stop is the candidate center
for (i = 0; i < N - 1; i++)
if ((p -= dist[i]) <= 0)
break;
int ci = i;
// tdist2[i] = min(dist[i], squared distance of sample i to candidate ci)
parallel_for_(Range(0, N),
KMeansPPDistanceComputer(tdist2, data, dist, dims, step, step * ci));
// total potential if ci were accepted as the new center
for (i = 0; i < N; i++)
{
s += tdist2[i];
}
// keep the best candidate; the swap parks its distance array in tdist
// (a later, worse candidate only overwrites tdist2)
if (s < bestSum)
{
bestSum = s;
bestCenter = ci;
std::swap(tdist, tdist2);
}
}
// commit the best candidate for this k
centers[k] = bestCenter;
sum0 = bestSum;
std::swap(dist, tdist); // dist[] now reflects the enlarged center set
}
// copy the chosen samples' feature vectors into the output matrix
for (k = 0; k < K; k++)
{
const float* src = data + step * centers[k];
float* dst = _out_centers.ptr<float>(k);
for (j = 0; j < dims; j++)
dst[j] = src[j];
}
}
kmeans 函数(OpenCV ocl 模块的 kmeans 聚类主入口)
// OpenCL-accelerated k-means clustering (OpenCV 2.4.9 ocl module).
//  _src        - input samples, CV_32F; either N x dims single-channel rows,
//                or a single multi-channel row (one sample per element).
//  K           - number of clusters (> 0, and N >= K).
//  _bestLabels - in/out per-sample cluster indices (CV_32S, N elements);
//                read as the initial labeling when CV_KMEANS_USE_INITIAL_LABELS.
//  criteria    - termination: max iterations (clamped to [2,100]) and/or epsilon
//                on the squared center shift.
//  attempts    - number of restarts with different initializations (>= 1).
//  flags       - KMEANS_PP_CENTERS / KMEANS_RANDOM_CENTERS / KMEANS_USE_INITIAL_LABELS.
//  _centers    - output K x dims cluster centers (uploaded to the device).
// Returns the best (smallest) compactness = sum of sample-to-center distances.
double cv::ocl::kmeans(const oclMat& _src, int K, oclMat& _bestLabels,
TermCriteria criteria, int attempts, int flags, oclMat& _centers)
{
const int SPP_TRIALS = 3;
// one multi-channel row means "one sample per element"
bool isrow = _src.rows == 1 && _src.oclchannels() > 1;
int N = !isrow ? _src.rows : _src.cols;
int dims = (!isrow ? _src.cols : 1) * _src.oclchannels();
int type = _src.depth();
attempts = std::max(attempts, 1);
CV_Assert(type == CV_32F && K > 0);
CV_Assert(N >= K);
Mat _labels;
if (flags & CV_KMEANS_USE_INITIAL_LABELS)
{
// caller supplies the initial labeling: validate shape/type and pull it to host
CV_Assert((_bestLabels.cols == 1 || _bestLabels.rows == 1) &&
_bestLabels.cols * _bestLabels.rows == N &&
_bestLabels.type() == CV_32S);
_bestLabels.download(_labels);
}
else
{
// (re)allocate the label buffer if the existing one is unusable
if (!((_bestLabels.cols == 1 || _bestLabels.rows == 1) &&
_bestLabels.cols * _bestLabels.rows == N &&
_bestLabels.type() == CV_32S &&
_bestLabels.isContinuous()))
_bestLabels.create(N, 1, CV_32S);
_labels.create(_bestLabels.size(), _bestLabels.type());
}
int* labels = _labels.ptr<int>();
// work on a host copy of the samples
Mat data;
_src.download(data);
// K x dims matrices holding the current / previous centers, plus scratch
Mat centers(K, dims, type), old_centers(K, dims, type), temp(1, dims, type);
vector<int> counters(K);
// per-dimension (min, max) bounding box, used by random initialization
vector<Vec2f> _box(dims);
Vec2f* box = &_box[0];
double best_compactness = DBL_MAX, compactness = 0;
RNG& rng = theRNG();
int a, iter, i, j, k;
// normalize the termination criteria (epsilon is squared below because
// center shifts are compared as squared distances)
if (criteria.type & TermCriteria::EPS)
criteria.epsilon = std::max(criteria.epsilon, 0.);
else
criteria.epsilon = FLT_EPSILON;
criteria.epsilon *= criteria.epsilon;
if (criteria.type & TermCriteria::COUNT)
criteria.maxCount = std::min(std::max(criteria.maxCount, 2), 100);
else
criteria.maxCount = 100;
// a single cluster converges immediately; no point in multiple attempts
if (K == 1)
{
attempts = 1;
criteria.maxCount = 2;
}
const float* sample = data.ptr<float>();
// seed the bounding box with the first sample
for (j = 0; j < dims; j++)
box[j] = Vec2f(sample[j], sample[j]);
// grow the box over the rest of the data set
for (i = 1; i < N; i++)
{
sample = data.ptr<float>(i);
for (j = 0; j < dims; j++)
{
float v = sample[j];
box[j][0] = std::min(box[j][0], v); // min
box[j][1] = std::max(box[j][1], v); // max
}
}
// run `attempts` independent restarts
for (a = 0; a < attempts; a++)
{
double max_center_shift = DBL_MAX;
for (iter = 0;; )
{
// old_centers receives the previous iteration's centers; centers is
// rebuilt below (Mat swap exchanges headers, not pixel data)
swap(centers, old_centers);
// first iteration without a usable initial labeling: seed the centers
if (iter == 0 && (a > 0 || !(flags & KMEANS_USE_INITIAL_LABELS)))
{
if (flags & KMEANS_PP_CENTERS)
generateCentersPP(data, centers, K, rng, SPP_TRIALS); // k-means++ seeding
else
{
// uniform random centers inside the data bounding box
for (k = 0; k < K; k++)
generateRandomCenter(_box, centers.ptr<float>(k), rng);
}
}
else
{
// recompute centers from the current labeling
if (iter == 0 && a == 0 && (flags & KMEANS_USE_INITIAL_LABELS))
{
// the user-provided labels must all be in [0, K)
for (i = 0; i < N; i++)
CV_Assert((unsigned)labels[i] < (unsigned)K);
}
// compute centers
centers = Scalar(0);
for (k = 0; k < K; k++)
counters[k] = 0; // per-cluster sample count
// accumulate each sample into its cluster's center
for (i = 0; i < N; i++)
{
sample = data.ptr<float>(i);
k = labels[i];
float* center = centers.ptr<float>(k);
j = 0;
#if CV_ENABLE_UNROLLED
// manual 4-way unroll of the accumulation loop
for (; j <= dims - 4; j += 4)
{
float t0 = center[j] + sample[j];
float t1 = center[j + 1] + sample[j + 1];
center[j] = t0;
center[j + 1] = t1;
t0 = center[j + 2] + sample[j + 2];
t1 = center[j + 3] + sample[j + 3];
center[j + 2] = t0;
center[j + 3] = t1;
}
#endif
// remaining dimensions
for (; j < dims; j++)
center[j] += sample[j];
counters[k]++;
}
// from the second iteration on, measure how far centers move
if (iter > 0)
max_center_shift = 0;
// repair any empty cluster before normalizing the centers
for (k = 0; k < K; k++)
{
if (counters[k] != 0)
continue;
// if some cluster appeared to be empty then:
// 1. find the biggest cluster
// 2. find the farthest from the center point in the biggest cluster
// 3. exclude the farthest point from the biggest cluster and form a new 1-point cluster.
int max_k = 0;
for (int k1 = 1; k1 < K; k1++)
{
if (counters[max_k] < counters[k1])
max_k = k1;
}
double max_dist = 0;
int farthest_i = -1;
float* new_center = centers.ptr<float>(k);
float* old_center = centers.ptr<float>(max_k);
float* _old_center = temp.ptr<float>(); // normalized
float scale = 1.f / counters[max_k];
// centers still hold raw sums here; temp gets the actual mean
for (j = 0; j < dims; j++)
_old_center[j] = old_center[j] * scale;
// find the sample of cluster max_k farthest from its mean
for (i = 0; i < N; i++)
{
if (labels[i] != max_k)
continue;
sample = data.ptr<float>(i);
double dist = normL2Sqr_(sample, _old_center, dims);
if (max_dist <= dist)
{
max_dist = dist;
farthest_i = i;
}
}
// move that sample out of the biggest cluster into the empty one
counters[max_k]--;
counters[k]++;
labels[farthest_i] = k;
sample = data.ptr<float>(farthest_i);
// keep the raw center sums consistent with the new labeling
for (j = 0; j < dims; j++)
{
old_center[j] -= sample[j];
new_center[j] += sample[j];
}
}
// normalize sums into means and accumulate the max squared shift
for (k = 0; k < K; k++)
{
float* center = centers.ptr<float>(k);
CV_Assert(counters[k] != 0); // empty clusters were repaired above
float scale = 1.f / counters[k];
for (j = 0; j < dims; j++)
center[j] *= scale;
if (iter > 0)
{
// squared distance between the new and previous center
double dist = 0;
const float* old_center = old_centers.ptr<float>(k);
for (j = 0; j < dims; j++)
{
double t = center[j] - old_center[j];
dist += t * t;
}
max_center_shift = std::max(max_center_shift, dist);
}
}
}
// stop on iteration budget or convergence; note the loop breaks BEFORE
// the label-assignment step below, so `compactness` on exit is the value
// computed in the previous pass
if (++iter == MAX(criteria.maxCount, 2) || max_center_shift <= criteria.epsilon)
break;
// assign labels
// NOTE(review): dists is allocated as CV_64F but read through ptr<float>()
// below — mismatched element types; confirm what distanceToCenters writes
// (this matches the upstream 2.4.9 source, so verify before changing).
Mat dists(1, N, CV_64F);
_centers.upload(centers);
// GPU kernel: per-sample distance to the nearest center + new labels
distanceToCenters(_src, _centers, dists, _labels);
_bestLabels.upload(_labels);
// compactness = total distance of samples to their assigned centers
float* dist = dists.ptr<float>(0);
compactness = 0;
for (i = 0; i < N; i++)
compactness += (double)dist[i];
}
// NOTE(review): only the scalar best_compactness is kept; the labels and
// centers of the best attempt are NOT restored — _bestLabels/_centers end up
// holding the LAST attempt's result. Mirrors upstream; confirm if intended.
if (compactness < best_compactness)
best_compactness = compactness;
}
return best_compactness;
}