opencv 2.49 Kmeans.cpp源码分析

源码位于opencv-2.4.9\modules\ocl\src\kmeans.cpp

主要就是两个函数,一个是中心点选取法:The Advantages of Careful Seeding,另一个是kmeans算法

generateCentersPP函数

对应于k-means++的中心点初始化

引入随机化,下一个被选为中心点的样本不是固定的,而是一个概率值,这个概率值正比于“整体最小距离“。

/*
k-means center initialization using the following algorithm:
Arthur & Vassilvitskii (2007) k-means++: The Advantages of Careful Seeding
*/
static void generateCentersPP(const Mat& _data, Mat& _out_centers,
	int K, RNG& rng, int trials)
{
	int i, j, k, dims = _data.cols, N = _data.rows;
	const float* data = (float*)_data.data;
	size_t step = _data.step / sizeof(data[0]);
	//k个中心
	vector<int> _centers(K);
	int* centers = &_centers[0];
	//三个距离的内存
	vector<float> _dist(N * 3);
	float* dist = &_dist[0], * tdist = dist + N, * tdist2 = tdist + N;
	double sum0 = 0;

	//随机取N个数据中的一个作为首个数据中心
	centers[0] = (unsigned)rng % N;

	//遍历数据集
	for (i = 0; i < N; i++)
	{
		//计算数据i到该中心的距离
		dist[i] = normL2Sqr_(data + step * i, data + step * centers[0], dims);
		sum0 += dist[i];
	}

	//接下去取剩下k个中心
	for (k = 1; k < K; k++)
	{
		double bestSum = DBL_MAX;
		int bestCenter = -1;

		//随机取trials 次中心,得到最佳
		for (j = 0; j < trials; j++)
		{
			//随机整体距离
			double p = (double)rng * sum0, s = 0;
			//计算满足该距离的数据集[0,i)
			for (i = 0; i < N - 1; i++)
				if ((p -= dist[i]) <= 0)
					break;
			// 将该数据作为中心
			int ci = i;

			//取tdist2= min(dist[i], 数据到新的中心ci的距离)
			parallel_for_(Range(0, N),
				KMeansPPDistanceComputer(tdist2, data, dist, dims, step, step * ci));

			//计算整体距离
			for (i = 0; i < N; i++)
			{
				s += tdist2[i];
			}
			
			//记录距离最小值
			if (s < bestSum)
			{
				bestSum = s;
				bestCenter = ci;
				std::swap(tdist, tdist2);
			}
		}
		//记录最佳中心值
		centers[k] = bestCenter;
		sum0 = bestSum;
		std::swap(dist, tdist);//记录最佳距离
	}
	//遍历k个中心
	for (k = 0; k < K; k++)
	{
		const float* src = data + step * centers[k];
		float* dst = _out_centers.ptr<float>(k);
		//输出中心的 具体数据
		for (j = 0; j < dims; j++)
			dst[j] = src[j];
	}
}

kmeans函数


double cv::ocl::kmeans(const oclMat& _src, int K, oclMat& _bestLabels,
	TermCriteria criteria, int attempts, int flags, oclMat& _centers)
{
	const int SPP_TRIALS = 3;
	bool isrow = _src.rows == 1 && _src.oclchannels() > 1;
	int N = !isrow ? _src.rows : _src.cols;
	int dims = (!isrow ? _src.cols : 1) * _src.oclchannels();
	int type = _src.depth();

	attempts = std::max(attempts, 1);
	CV_Assert(type == CV_32F && K > 0);
	CV_Assert(N >= K);

	Mat _labels;
	if (flags & CV_KMEANS_USE_INITIAL_LABELS)
	{
		CV_Assert((_bestLabels.cols == 1 || _bestLabels.rows == 1) &&
			_bestLabels.cols * _bestLabels.rows == N &&
			_bestLabels.type() == CV_32S);
		_bestLabels.download(_labels);
	}
	else
	{
		if (!((_bestLabels.cols == 1 || _bestLabels.rows == 1) &&
			_bestLabels.cols * _bestLabels.rows == N &&
			_bestLabels.type() == CV_32S &&
			_bestLabels.isContinuous()))
			_bestLabels.create(N, 1, CV_32S);
		_labels.create(_bestLabels.size(), _bestLabels.type());
	}
	int* labels = _labels.ptr<int>();

	Mat data;
	_src.download(data);
	//K*dims维矩阵,存储k个中心
	Mat centers(K, dims, type), old_centers(K, dims, type), temp(1, dims, type);

	vector<int> counters(K);
	vector<Vec2f> _box(dims);
	Vec2f* box = &_box[0];
	double best_compactness = DBL_MAX, compactness = 0;
	RNG& rng = theRNG();
	int a, iter, i, j, k;

	if (criteria.type & TermCriteria::EPS)
		criteria.epsilon = std::max(criteria.epsilon, 0.);
	else
		criteria.epsilon = FLT_EPSILON;
	criteria.epsilon *= criteria.epsilon;

	if (criteria.type & TermCriteria::COUNT)
		criteria.maxCount = std::min(std::max(criteria.maxCount, 2), 100);
	else
		criteria.maxCount = 100;

	if (K == 1)
	{
		attempts = 1;
		criteria.maxCount = 2;
	}

	const float* sample = data.ptr<float>();
	//初始化box[dim]向量
	for (j = 0; j < dims; j++)
		box[j] = Vec2f(sample[j], sample[j]);
	//遍历数据集
	for (i = 1; i < N; i++)
	{
		sample = data.ptr<float>(i);
		//寻找各维度的最大最小值
		for (j = 0; j < dims; j++)
		{
			float v = sample[j];
			box[j][0] = std::min(box[j][0], v);//min
			box[j][1] = std::max(box[j][1], v);//max
		}
	}

	//重复attempts次
	for (a = 0; a < attempts; a++)
	{
		double max_center_shift = DBL_MAX;
		for (iter = 0;; )//首次迭代
		{
			swap(centers, old_centers);//交换指针,old_centers指向centers
			//不使用label
			if (iter == 0 && (a > 0 || !(flags & KMEANS_USE_INITIAL_LABELS)))
			{
				//初始化中心
				if (flags & KMEANS_PP_CENTERS)
					generateCentersPP(data, centers, K, rng, SPP_TRIALS);//kmeanspp法
				    
				else
				{
					for (k = 0; k < K; k++)
						generateRandomCenter(_box, centers.ptr<float>(k), rng);//随机法
					//各维度选择在max, min区间,随机选取为 rng + (2rng-1)/dims
				}
			}
			else
			{
				//使用label
				if (iter == 0 && a == 0 && (flags & KMEANS_USE_INITIAL_LABELS))
				{
					for (i = 0; i < N; i++)
						CV_Assert((unsigned)labels[i] < (unsigned)K);//label在0~k范围
				}

				// compute centers
				centers = Scalar(0);
				for (k = 0; k < K; k++)
					counters[k] = 0;//初始化每个label的数据量

				//遍历数据集
				for (i = 0; i < N; i++)
				{
					sample = data.ptr<float>(i);
					k = labels[i];//取得数据label
					float* center = centers.ptr<float>(k);//取第k个中心
					j = 0;
#if CV_ENABLE_UNROLLED
					//四字节对齐计算加速
					for (; j <= dims - 4; j += 4)
					{
						float t0 = center[j] + sample[j];
						float t1 = center[j + 1] + sample[j + 1];

						center[j] = t0;
						center[j + 1] = t1;

						t0 = center[j + 2] + sample[j + 2];
						t1 = center[j + 3] + sample[j + 3];

						center[j + 2] = t0;
						center[j + 3] = t1;
					}
#endif
					//中心加上当前样本
					for (; j < dims; j++)
						center[j] += sample[j];

					counters[k]++;//统计label的数据量
				}

				//初始化中心移动步长为0
				if (iter > 0)
					max_center_shift = 0;

				//遍历k个中心
				for (k = 0; k < K; k++)
				{
					if (counters[k] != 0)
						continue;

					// if some cluster appeared to be empty then:
					//   1. find the biggest cluster
					//   2. find the farthest from the center point in the biggest cluster
					//   3. exclude the farthest point from the biggest cluster and form a new 1-point cluster.

					//找到数据量最大的中心
					int max_k = 0;
					for (int k1 = 1; k1 < K; k1++)
					{
						if (counters[max_k] < counters[k1])
							max_k = k1;
					}

					double max_dist = 0;
					int farthest_i = -1;
					float* new_center = centers.ptr<float>(k);
					float* old_center = centers.ptr<float>(max_k);
					float* _old_center = temp.ptr<float>(); // normalized
					float scale = 1.f / counters[max_k];
					
					//计算标签数据均值,作为中心
					for (j = 0; j < dims; j++)
						_old_center[j] = old_center[j] * scale;

					//遍历数据集
					for (i = 0; i < N; i++)
					{
						//选取标签为max_k的部分
						if (labels[i] != max_k)
							continue;
						sample = data.ptr<float>(i);
						double dist = normL2Sqr_(sample, _old_center, dims);//计算L2范数

						//记录距离最远的样本i
						if (max_dist <= dist)
						{
							max_dist = dist;
							farthest_i = i;
						}
					}

					//为了保证每个中心的样本数均衡

					counters[max_k]--;//从样本最多的中心取出
					counters[k]++;//加入当前中心
					labels[farthest_i] = k;//更新label

					sample = data.ptr<float>(farthest_i);
					//更新样本累计
					for (j = 0; j < dims; j++)
					{
						old_center[j] -= sample[j];
						new_center[j] += sample[j];
					}
				}

				//遍历k个中心
				for (k = 0; k < K; k++)
				{
					//取当前中心
					float* center = centers.ptr<float>(k);
					CV_Assert(counters[k] != 0);

					//计算均值作为中心
					float scale = 1.f / counters[k];
					for (j = 0; j < dims; j++)
						center[j] *= scale;

					//非首次迭代
					if (iter > 0)
					{
						double dist = 0;
						const float* old_center = old_centers.ptr<float>(k);
						//计算和旧中心的距离,作为移动步长
						for (j = 0; j < dims; j++)
						{
							double t = center[j] - old_center[j];
							dist += t * t;
						}
						//得到各中心最大的迭代偏移
						max_center_shift = std::max(max_center_shift, dist);
					}
				}
			}
			//满足最大迭代次数  或者  迭代中心收敛
			if (++iter == MAX(criteria.maxCount, 2) || max_center_shift <= criteria.epsilon)
				break;

			// assign labels
			Mat dists(1, N, CV_64F);
			_centers.upload(centers);
			distanceToCenters(_src, _centers, dists, _labels);//重新计算数据到各自中心的距离,更新label
			_bestLabels.upload(_labels);

			//计算整体距离
			float* dist = dists.ptr<float>(0);
			compactness = 0;
			for (i = 0; i < N; i++)
				compactness += (double)dist[i];
		}

		//记录最小值
		if (compactness < best_compactness)
			best_compactness = compactness;
	}

	return best_compactness;
}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值