提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
文章目录
Kmeans聚类定义
基于距离的聚类算法,采用距离作为相似性的评价指标,即认为两个对象越近,其相似度越大。
一、伪代码
一、步骤
1.选取中心点 initizlize_centers
方法1 随机选取中心点
void Kmeans::initialize_centers() {
std::set<int> random_idx =
get_random_index(samples_.size() - 1, centers_.size());
int i_center = 0;
for (auto index : random_idx) {
centers_[i_center].feature_ = samples_[index].feature_;
i_center++;
}
}
std::set<int> get_random_index(int max_idx, int n) {
std::uniform_int_distribution<int> dist(1, max_idx + 1); //随机生成一个整数
std::set<int> random_idx;
while (random_idx.size() < n) {
random_idx.insert(dist(rng) - 1);
}
return random_idx;
}
方法2 kmeans++
kmeans++只是在选择聚类中心上对kmeans进行了改进,聚类过程任然采用kmeans的方法。因此,这里对于kmeans++对于聚类中心的选取进行解释。聚类中心之间的距离应当尽可能的大。
void Kmeans::initialize_centers_plus()
{
//将所有数据看成一个聚类,计算聚类中心
std::array<float, 3>sums = { 0.f,0.f,0.f };
std::array<float, 3>center;
for (int i_s = 0; i_s < samples_.size(); ++i_s)
{
for (int i_c = 0; i_c < samples_[0].feature_.size(); ++i_c)
{
sums[i_c] += samples_[i_s].feature_[i_c];
}
}
for (int i_c = 0; i_c < samples_[0].feature_.size(); ++i_c)
{
center[i_c] = sums[i_c] / samples_.size();
}
//寻找与该中心最远的点,划入一部分数据到最远点所在的区域
float largest_dist = -1;
Center first_center;
for (auto sample : samples_)
{
float temp_dist = calc_square_distance(sample.feature_, center);
if (temp_dist > largest_dist)
{
largest_dist = temp_dist;
first_center.feature_ = sample.feature_;
}
}
centers_[0] = first_center;
//对剩下的数据,以此类推
std::vector<float>distances;
for (int i_k = 1; i_k < centers_.size(); ++i_k)
{
for (Sample& sample : samples_)
{
float smallestDist2Center = FLT_MAX;
for (int i_cent = 0; i_cent < i_k; ++i_cent)
{
float currDist = calc_square_distance(sample.feature_, centers_[i_cent].feature_);
if (currDist < smallestDist2Center)
{
sample.label_ = i_cent;
smallestDist2Center = currDist;
}
}
distances.push_back(smallestDist2Center);
}
auto max_it = std::max_element(distances.begin(), distances.end());//查询最大值所在的位置
centers_[i_k].feature_ = samples_[max_it - distances.begin()].feature_;
}
}
2.更新标签 updatae_labels()
void Kmeans::update_labels() {
for (Sample& sample : samples_) {
// TODO update labels of each feature
float smallestDist2Center = FLT_MAX;
for (int i_cent = 0; i_cent < centers_.size(); ++i_cent)
{
float currDist = calc_square_distance(sample.feature_, centers_[i_cent].feature_);
if(currDist<smallestDist2Center)
{
sample.label_ = i_cent;
smallestDist2Center = currDist;
}
}
}
}
3.更新中心点 update_centers()
void Kmeans::update_centers() {
last_centers_ = centers_;
for (int i_cent = 0; i_cent < centers_.size(); i_cent++)
{
std::array<float, 3>sums = { 0,0,0 };
int n = 0;
for (auto sample : samples_)
{
if (sample.label_ == i_cent)
{
for (int channel = 0; channel < 3; channel++)
{
sums[channel] += sample.feature_[channel];
}
++n;
}
}
for (int channel = 0; channel < 3; channel++)
{
centers_[i_cent].feature_[channel] = sums[channel] / n;
}
}
}
4.设置终止条件 is_terminate
bool Kmeans::is_terminate(int current_iter, int max_iteration,
float smallest_convergence_radius) const {
if (current_iter >= max_iteration || check_convergence(centers_, last_centers_)<smallest_convergence_radius)
{
float sum_squared = 0.f;
for (auto& sample : samples_)
{
sum_squared += calc_square_distance(sample.feature_, centers_[sample.label_].feature_);
}
std::cout << "iteration " << current_iter << "nums" << std::endl;
std::cout << "k=" << centers_.size() << "sum suared: " << sum_squared << std::endl;
return true;
}
return false;
}
总结
k_means.h
#include <array>
#include <opencv2/core.hpp>
#include <random>
#include <set>
/**
* @brief: Each sample is the data that to be clustered.
*
*/
struct Sample {
Sample(const std::array<float, 3>& feature, const int row, const int col,
const int label = -1)
: feature_(feature), row_(row), col_(col), label_(label){};
std::array<float, 3> feature_; // feature vector of the data 像素特征值(如255 255 255)
int label_; // corresponding center id 对应中心ID
int row_; // row in original image 图像中的行
int col_; // col in original image 图像中的列
};
struct Center {
std::array<float, 3> feature_; // center's position
};
/**
* @brief K means class
*
*/
class Kmeans {
public:
Kmeans(cv::Mat img, const int k);
std::vector<Sample> get_result_samples() const;
std::vector<Center> get_result_centers() const;
void run(int max_iteration, float smallest_convergence_rate);
private:
void initialize_centers();
void update_labels();
void update_centers();
bool is_terminate(int current_iter, int max_iteration,
float smallest_convergence_rate) const;
std::vector<Sample> samples_; // all the data to be clustered
std::vector<Center> centers_; // center of each cluster
std::vector<Center> last_centers_; // center of last iteration
};
k_means.cpp
#include "k_means.h"
#include <iostream>
#include <algorithm>
#include <vector>
// to generate random number
static std::random_device rd;
static std::mt19937 rng(rd());
/**
* @brief get_random_index, check_convergence, calc_square_distance are helper
* functions, you can use it to finish your homework:)
*
*/
std::set<int> get_random_index(int max_idx, int n);
float check_convergence(const std::vector<Center>& current_centers,
const std::vector<Center>& last_centers);
inline float calc_square_distance(const std::array<float, 3>& arr1,
const std::array<float, 3>& arr2);
/**
* @brief Construct a new Kmeans object
*
* @param img : image with 3 channels
* @param k : wanted number of cluster
*/
Kmeans::Kmeans(cv::Mat img, const int k) {
centers_.resize(k);
last_centers_.resize(k);
samples_.reserve(img.rows * img.cols);
// save each feature vector into samples
for (int r = 0; r < img.rows; r++) {
for (int c = 0; c < img.cols; c++) {
std::array<float, 3> tmp_feature;
for (int channel = 0; channel < 3; channel++) {
tmp_feature[channel] =
static_cast<float>(img.at<cv::Vec3b>(r, c)[channel]);
}
samples_.emplace_back(tmp_feature, r, c, -1);
}
}
}
/**
* @brief initialize k centers randomly, using set to ensure there are no
* repeated elements
*
*/
// TODO Try to implement a better initialization function
void Kmeans::initialize_centers() {
std::set<int> random_idx =
get_random_index(samples_.size() - 1, centers_.size());
int i_center = 0;
for (auto index : random_idx) {
centers_[i_center].feature_ = samples_[index].feature_;
i_center++;
}
}
void Kmeans::initialize_centers_plus()
{
std::cout<<"hello world"<<std::endl;
}
/**
* @brief change the label of each sample to the nearst center
*
*/
void Kmeans::update_labels() {
for (Sample& sample : samples_) {
// TODO update labels of each feature
float smallestDist2Center = FLT_MAX;
for (int i_cent = 0; i_cent < centers_.size(); ++i_cent)
{
float currDist = calc_square_distance(sample.feature_, centers_[i_cent].feature_);
if (currDist < smallestDist2Center)
{
sample.label_ = i_cent;
smallestDist2Center = currDist;
}
}
}
}
/**
* @brief move the centers according to new lables
*
*/
void Kmeans::update_centers() {
// backup centers of last iteration
last_centers_ = centers_;
// calculate the mean value of feature vectors in each cluster
// TODO complete update centers functions.
for (int i_cent = 0; i_cent < centers_.size(); i_cent++)
{
std::array<float, 3>sums = { 0,0,0 };
int n = 0;
for (auto sample : samples_)
{
if (sample.label_ == i_cent)
{
for (int channel = 0; channel < 3; channel++)
{
sums[channel] += sample.feature_[channel];
}
++n;
}
}
for (int channel = 0; channel < 3; channel++)
{
centers_[i_cent].feature_[channel] = sums[channel] / n;
}
}
}
/**
* @brief check terminate conditions, namely maximal iteration is reached or it
* convergents
*
* @param current_iter
* @param max_iteration
* @param smallest_convergence_radius
* @return true
* @return false
*/
bool Kmeans::is_terminate(int current_iter, int max_iteration,
float smallest_convergence_radius) const {
if (current_iter >= max_iteration || check_convergence(centers_, last_centers_) < smallest_convergence_radius)
{
float sum_squared = 0.f;
for (auto& sample : samples_)
{
sum_squared += calc_square_distance(sample.feature_, centers_[sample.label_].feature_);
}
std::cout << "iteration " << current_iter << "nums" << std::endl;
std::cout << "k=" << centers_.size() << "sum suared: " << sum_squared << std::endl;
return true;
}
return false;
}
std::vector<Sample> Kmeans::get_result_samples() const {
return samples_;
}
std::vector<Center> Kmeans::get_result_centers() const {
return centers_;
}
/**
* @brief Execute k means algorithm
* 1. initialize k centers randomly
* 2. assign each feature to the corresponding centers
* 3. calculate new centers
* 4. check terminate condition, if it is not fulfilled, return
* to step 2
* @param max_iteration
* @param smallest_convergence_radius
*/
void Kmeans::run(int max_iteration, float smallest_convergence_radius) {
initialize_centers();
int current_iter = 0;
while (!is_terminate(current_iter, max_iteration,
smallest_convergence_radius)) {
current_iter++;
update_labels();
update_centers();
}
}
/**
* @brief Get n random numbers from 1 to parameter max_idx
*
* @param max_idx
* @param n
* @return std::set<int> A set of random numbers, which has n elements
*/
std::set<int> get_random_index(int max_idx, int n) {
std::uniform_int_distribution<int> dist(1, max_idx + 1);
std::set<int> random_idx;
while (random_idx.size() < n) {
random_idx.insert(dist(rng) - 1);
}
return random_idx;
}
/**
* @brief Calculate the L2 norm of current centers and last centers
*
* @param current_centers current assigned centers with 3 channels
* @param last_centers last assigned centers with 3 channels
* @return float
*/
float check_convergence(const std::vector<Center>& current_centers,
const std::vector<Center>& last_centers) {
float convergence_radius = 0;
for (int i_center = 0; i_center < current_centers.size(); i_center++) {
convergence_radius +=
calc_square_distance(current_centers[i_center].feature_,
last_centers[i_center].feature_);
}
return convergence_radius;
}
/**
* @brief calculate L2 norm of two arrays
*
* @param arr1
* @param arr2
* @return float
*/
inline float calc_square_distance(const std::array<float, 3>& arr1,
const std::array<float, 3>& arr2) {
return std::pow((arr1[0] - arr2[0]), 2) + std::pow((arr1[1] - arr2[1]), 2) +
std::pow((arr1[2] - arr2[2]), 2);
}
test_k_means.cpp
#include "k_means.h"
#include <iostream>
#include <opencv2/highgui/highgui.hpp>
int main(int argc, char** argv) {
cv::Mat img = cv::imread(argv[1], CV_LOAD_IMAGE_COLOR);
if (img.empty()) {
std::cerr << "useage: ./test_k_means input_image_path k iteration\n "
"example: ./test_k_means "
"../images/test_data/lena.png 3 10"
<< std::endl;
std::exit(-1);
}
if (img.channels() != 3) {
std::cout << "please use a image with 3 channels";
std::exit(-1);
}
int k = strtol(argv[2], NULL, 10);
int iteration = strtol(argv[3], NULL, 10);
int convergence_radius = 1e-6;
Kmeans kmeans(img, k);
kmeans.run(iteration, convergence_radius);
std::vector<Sample> samples = kmeans.get_result_samples();
std::vector<Center> centers = kmeans.get_result_centers();
cv::Mat result(img.size(), img.type());
for (const Sample& sample : samples) {
for (int channel = 0; channel < 3; channel++) {
result.at<cv::Vec3b>(sample.row_, sample.col_)[channel] =
(int)centers[sample.label_].feature_[channel];
}
}
cv::Mat concat_img;
cv::hconcat(img, result, concat_img);
cv::imshow("left: original image, right: kmeans result", concat_img);
cv::waitKey(0);
}
CmakeLists.txt
cmake_minimum_required(VERSION 3.10.2)
project(kmeans_clustering)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_FLAGS "-o2")
#搜索 第三方依赖库
set(OpenCV_DIR E:\\opencv_347)
find_package(OpenCV REQUIRED)
MESSAGE ( STATUS "OpenCV library status:" )
MESSAGE ( STATUS "version ${OpenCV_VERSION}" )
MESSAGE ( STATUS "libraries ${OpenCV_LIBS}" )
MESSAGE ( STATUS "include path ${OpenCV_INCLUDE_DIRS}" )
include_directories(${OpenCV_INCLUDE_DIRS})
include_directories(${CMAKE_SOURCE_DIR}/include)
add_library(k_means ${CMAKE_SOURCE_DIR}/src/k_means.cpp ${CMAKE_SOURCE_DIR}/include/k_means.h)
add_executable(test_k_means ${CMAKE_SOURCE_DIR}/src/test_k_means.cpp)
target_link_libraries(test_k_means ${OpenCV_LIBS} k_means)