注: 本文代码及方法仅供参考,请勿直接使用
C++实现过程
算法流程
kmeans - 点作为数据,cluster是点的聚簇
BEGIN
选出来 k 个点作为中心点生成聚簇
循环
计算点与聚簇的距离
每个点加入到距离最近的聚簇中
更新聚簇中心点
聚簇中心点未变 或 循环次数足够?退出
输出聚簇
END
数据结构设计
为了设计出更为通用的结构,选择采用OOP面向对象设计,结构比较复杂,尤其是距离计算,求中心这两个函数。想要通用,那么就不能限定距离的计算方法,同理,求中心点的方法也可能是任意的,因此需要作为参数传递给算法。
结构概要
VirtualPoint - 虚拟点类(抽象类),无数据成员,定义了 == != 两个纯虚函数
Cluster - 聚簇类,数据成员: VirtualPoint的集合 和 中心点(VirtualPoint类型)
函数成员: 设置中心 更新中心 清空点...
KmeansAlg - 算法框架,run方法实现了聚类算法,提供必要参数(点之间距离计算,求平均点方法),无需重写算法即可运行
------------------
NDimenPoint - 多维点类,继承VirtualPoint,用来处理多维数据
首先是两个通用类 - 虚拟点与聚簇,实际使用的时候,继承VirtualPoint
类,实现两个运算符之后即可(当然由于avgPoints
和calcDis
两个函数,可能需要添加其它方法,不过这是另一回事儿了)。
class VirtualPoint {
private:
public:
VirtualPoint() {}
virtual ~VirtualPoint() {}
// 如下的 相等 判断主要在判断中心是否更改时用到
virtual bool operator==(const VirtualPoint &p) = 0;
virtual bool operator!=(const VirtualPoint &p) = 0;
virtual string toString() = 0;
};
typedef shared_ptr<VirtualPoint> sharedVPoint;
typedef sharedVPoint avgPointFunc(const vector<sharedVPoint> &);
// 聚簇类
class Cluster {
private:
vector<sharedVPoint> points; // 所有的点
sharedVPoint centroid; // 中心
avgPointFunc *avgPoints; // 计算所有点的中心的方法
public:
Cluster(avgPointFunc avg);
~Cluster() {}
Cluster &setCentroid(sharedVPoint p); // 设置中心
bool updateCentroid(); // 更新中心,使用 avgPoints 函数更新得到新的中心,并且返回新中心是否与旧中心不同
void clear(); // 清空点
void addPoint(sharedVPoint p); // 添加点
string toString();
// 获取中心与所有的点,输出时用
sharedVPoint getCentroid();
const vector<sharedVPoint> &getPoints();
};
然后是kmeans主要过程类,注意下面的run
方法为算法框架,已经实现,因此如果要针对其他数据类型实现kmeans,无需修改该类,而是继承VirtualPoint
然后调用该类即可。
// 计算 VirtualPoint 与 Cluster的质心 之间的距离
typedef double calcFunc(const VirtualPoint &, const Cluster &);
class KmeansAlg {
public:
KmeansAlg() {}
~KmeansAlg() {}
// 生成 k 个 位于 [0, n) 中的随机数, n < 100000000
static vector<int> randDiffNumbers(int n, int k);
static vector<Cluster> run(vector<sharedVPoint> data, int k, calcFunc calcDis, avgPointFunc avgPoints, const int maxRuond = 2000);
};
然后是一个继承VirtualPoint
的多维点类,能够处理任意维度的点
class NDimenPoint : public VirtualPoint {
private:
int dimension; // 维度
vector<double> xs; // x1 x2 x3 ...
public:
NDimenPoint(const int d);
NDimenPoint(const int d, vector<double> l);
NDimenPoint(const NDimenPoint &p);
~NDimenPoint();
bool operator==(const VirtualPoint &p) override; // 重载,需要 static_cast
bool operator!=(const VirtualPoint &p) override; // 重载,需要 static_cast
void add(const NDimenPoint &p); // 主要用来计算点的平均值
NDimenPoint operator/(const int n);
double disTo(const NDimenPoint &p); // 计算到某个点的距离
string toString() override;
// 两个静态函数,计算点到聚簇距离 以及 计算点的中心值
static double calcDisToCluster(const VirtualPoint &p, const Cluster &c);
static sharedVPoint avgPoints(const vector<sharedVPoint> &points);
};
和多维点类一样,对于其他非点类型的数据,通过继承VirtualPoint
,实现必要的函数之后即可调用前述KmeansAlg
的run
方法从而实现kmeans聚类。
代码:
kmeans_oop.h
#include <algorithm>
#include <cmath>
#include <ctime>
#include <exception>
#include <iostream>
#include <memory>
#include <random>
#include <sstream>
#include <string>
#include <vector>
using std::cerr;
using std::endl;
using std::make_shared;
using std::pow;
using std::shared_ptr;
using std::sqrt;
using std::string;
using std::stringstream;
using std::to_string;
using std::vector;
/**
* kmeans - 点作为数据,cluster是点的聚簇
* BEGIN
* 选出来 k 个点作为中心点生成聚簇
* 循环
* 计算点与聚簇的距离
* 每个点加入到距离最近的聚簇中
* 更新聚簇中心点
* 聚簇中心点未变?退出
* 输出聚簇
* END
*
* 数据结构
* 点 - ==() toString()
* 聚簇 - 计算中心点()
* calcDis(point cluster)
* kmeans() -
*/
class VirtualPoint {
private:
public:
VirtualPoint() {}
virtual ~VirtualPoint() {}
virtual bool operator==(const VirtualPoint &p) = 0;
virtual bool operator!=(const VirtualPoint &p) = 0;
virtual string toString() = 0;
};
typedef shared_ptr<VirtualPoint> sharedVPoint;
typedef sharedVPoint avgPointFunc(const vector<sharedVPoint> &);
class Cluster {
private:
vector<sharedVPoint> points;
sharedVPoint centroid;
avgPointFunc *avgPoints;
public:
Cluster(avgPointFunc avg) { avgPoints = avg; }
~Cluster() {}
Cluster &setCentroid(sharedVPoint p) {
centroid = p;
points.push_back(p);
return *this;
}
bool updateCentroid() {
sharedVPoint tmpPoint = avgPoints(points);
if (tmpPoint == nullptr) return false;
bool changed;
if (tmpPoint != nullptr && centroid != nullptr)
changed = (*tmpPoint) != (*centroid);
else
changed = true;
centroid = tmpPoint;
return changed;
}
void clear() { points.clear(); }
void addPoint(sharedVPoint p) {
points.push_back(p);
}
string toString() const {
stringstream ss;
if (centroid == nullptr || points.size() == 0) return "{}";
ss << "{\"centroid\": " << centroid->toString() << ",\"points\": [";
for (int i = 0; i < points.size(); i++) {
if (i > 0) ss << ", ";
ss << points[i]->toString();
}
ss << "]}";
return ss.str();
}
sharedVPoint getCentroid() const { return centroid; }
const vector<sharedVPoint> &getPoints() { return points; }
};
// 计算 VirtualPoint 与 Cluster的质心 之间的距离
typedef double calcFunc(const VirtualPoint &, const Cluster &);
class KmeansAlg {
public:
KmeansAlg() {}
~KmeansAlg() {}
// 生成 k 个 位于 [0, n) 中的随机数, n < 100000000
static vector<int> randDiffNumbers(int n, int k) {
const int maxn = 100000000;
vector<int> res;
if (n <= 0 || n >= maxn)
throw std::runtime_error("n is less than zero or greater than maxn(100,000,000)");
for (int i = 0; i < n; i++)
res.push_back(i);
random_shuffle(res.begin(), res.end());
res.resize(k);
return res;
}
static vector<Cluster> run(vector<sharedVPoint> data, int k, calcFunc calcDis, avgPointFunc avgPoints, const int maxRuond = 2000) {
if (k <= 1) throw std::runtime_error("k is less than 1");
vector<Cluster> clusters;
for (auto &&i : randDiffNumbers(data.size(), k))
clusters.push_back(Cluster(avgPoints).setCentroid(data[i]));
for (int round = 0; round < maxRuond; round++) {
// 清空
for (auto &&c : clusters) c.clear();
for (size_t i = 0; i < data.size(); i++) {
// 计算距离,加入到最近聚簇中
double minDis = calcDis(*(data[i]), clusters[0]);
int minIndex = 0;
for (size_t j = 1; j < clusters.size(); j++) {
double tmpDis = calcDis(*(data[i]), clusters[j]);
if (tmpDis < minDis) minDis = tmpDis, minIndex = j;
}
clusters[minIndex].addPoint(data[i]);
}
bool changed = false;
for (auto &&c : clusters) changed = changed || c.updateCentroid();
if (!changed) break;
// cerr << "debug\t\tround: " << round << " ";
// for (auto &&c : clusters)
// if (c.getPoints().size() > 0)
// cerr << c.getCentroid()->toString() << ", ";
// cerr << endl;
}
return clusters;
}
};
kmeans_h
#include "kmeans_oop.h"
using std::cin;
using std::cout;
using std::initializer_list;
using std::runtime_error;
class NDimenPoint : public VirtualPoint {
private:
int dimension;
vector<double> xs;
public:
NDimenPoint(const int d) : dimension(d) { xs.resize(d); }
NDimenPoint(const int d, vector<double> l) : dimension(d), xs(l){};
NDimenPoint(const NDimenPoint &p) : dimension(p.dimension), xs(p.xs) {}
~NDimenPoint(){};
bool operator==(const VirtualPoint &p) override {
auto pp = static_cast<const NDimenPoint &>(p);
if (dimension != pp.dimension) return false;
for (size_t i = 0; i < xs.size(); i++)
if (xs[i] != pp.xs[i]) return false;
return true;
}
bool operator!=(const VirtualPoint &p) override {
auto pp = static_cast<const NDimenPoint &>(p);
if (dimension != pp.dimension) return true;
for (size_t i = 0; i < xs.size(); i++)
if (xs[i] != pp.xs[i]) return true;
return false;
}
void add(const NDimenPoint &p) {
if (p.dimension != dimension) throw runtime_error("dimension mismatch");
for (size_t i = 0; i < xs.size(); i++)
xs[i] += p.xs[i];
}
NDimenPoint operator/(const int n) {
if (n == 0) throw std::runtime_error("divisor zero error!");
NDimenPoint res(dimension);
for (size_t i = 0; i < dimension; i++) {
res.xs[i] = xs[i] / n;
}
return res;
}
double disTo(const NDimenPoint &p) {
double tmp = 0;
for (size_t i = 0; i < dimension; i++) tmp += pow(xs[i] - p.xs[i], 2);
return sqrt(tmp);
}
string toString() override {
stringstream ss;
ss << "[";
for (size_t i = 0; i < dimension; i++) {
if (i > 0) ss << ", ";
ss << xs[i];
}
ss << "]";
return ss.str();
}
static double calcDisToCluster(const VirtualPoint &p, const Cluster &c) {
auto pp = static_cast<const NDimenPoint &>(p);
auto cp = static_cast<const NDimenPoint &>(*(c.getCentroid()));
return pp.disTo(cp);
}
static sharedVPoint avgPoints(const vector<sharedVPoint> &points) {
if (points.size() <= 0) return nullptr;
NDimenPoint resPoint(static_cast<const NDimenPoint &>(*points[0]).dimension);
for (auto &&p : points)
resPoint.add(static_cast<const NDimenPoint &>(*p));
resPoint = resPoint / points.size();
// cerr << "DEBUG\t" << resPoint.toString() << ", POINTS.SIZE " << points.size() << endl;
return make_shared<NDimenPoint>(resPoint);
};
};
vector<NDimenPoint> geneData(int num, const int dimension, double maxVal = 1000) {
std::default_random_engine generator(time(NULL));
std::uniform_real_distribution<double> distribution(0, maxVal);
vector<NDimenPoint> points;
for (size_t i = 0; i < num; i++) {
vector<double> tmpVec;
for (size_t j = 0; j < dimension; j++) tmpVec.push_back(distribution(generator));
points.push_back(NDimenPoint(dimension, tmpVec));
}
return points;
}
void output(const vector<Cluster> &clusters, const int dimension) {
cout << "{"
<< "\"dimension\":" << dimension << "," << endl
<< "\"clusters\":[";
for (int i = 0; i < clusters.size(); i++) {
if (i > 0) cout << ", ";
std::cout << clusters[i].toString() << std::endl;
}
cout << "]}" << endl;
}
void kmeans_work() {
const int maxRound = 10000;
const int pointCnt = 150;
int dimension = 1;
int k = 0;
cerr << "dimension, k: ";
cin >> dimension >> k;
vector<sharedVPoint> points;
for (auto &&p : geneData(pointCnt, dimension)) points.push_back(make_shared<NDimenPoint>(p));
auto clusters = KmeansAlg::run(points, k, NDimenPoint::calcDisToCluster, NDimenPoint::avgPoints, maxRound);
output(clusters, dimension);
}
main.cpp
int main(int argc, char const *argv[]) {
kmeans_work();
return 0;
}
Python可视化过程
原本打算使用opengl可视化,但是绘制一个三角形就需要一二百行代码实在难以接受且低效,则选择使用matplotlib
实现,支持二维和三维
实现过程的tips:
- matplotlib 绘制三维图 -
plt.figure().add_subplot(111, projection='3d')
- 二维参数 -
ax.scatter(xs=xs, ys=ys, zs=zs, zdir='z', c=color, marker=marker)
- 三维参数 -
ax.scatter(x=xs, y=ys, c=color, marker=marker)
- 二维参数 -
- 散点图scatter
- 可以在一个ax(fig.add_subplot返回值)上多次scatter
- 每次scatter的时候可以指定一个颜色’#000000’
- marker - “.”: 点, “,”:像素 , “o”: 圈, “^”: 倒三角, “+”: 加, 参考官方文档
具体实现过程与代码如下
# 运行kmeans算法
# 将结果(JSON化)输出到文件中
# 使用Python读取文件内容
# 使用pyplot可视化
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import json
import random
colors = [
"#ff0000", "#00ff00", "#0000ff", "#404040", "#ff00ff", "#00ffff", "#C0ff00", "#ffC000", "#ff00C0", "#000070", "#007000", "#700000",
]
def paint(ax, xs, ys, color, zs=None, marker='.'):
if zs != None:
ax.scatter(xs=xs, ys=ys, zs=zs, zdir='z', c=color, marker=marker)
else:
ax.scatter(x=xs, y=ys, c=color, marker=marker)
def readData():
random.shuffle(colors)
data = json.load(open("foo.json", mode="r", encoding="utf-8"))
dimension = data["dimension"]
clusters = []
clusterCnt = 0
for tmpRawCluster in data["clusters"]:
tmpCluster = {"centroid": None, "xss": [],
"color": colors[clusterCnt % 140]}
if "centroid" in tmpRawCluster:
tmpCluster["centroid"] = tmpRawCluster["centroid"]
for i in range(0, dimension):
tmpCluster["xss"].append([])
if "points" in tmpRawCluster:
for tmpRawPoint in tmpRawCluster["points"]:
for j in range(0, len(tmpRawPoint)):
tmpCluster["xss"][j].append(tmpRawPoint[j])
clusters.append(tmpCluster)
clusterCnt += 1
return {"dimension": dimension, "clusters": clusters}
def work():
data = readData()
fig = plt.figure()
if data["dimension"] == 2:
ax = fig.add_subplot(111)
for cluster in data["clusters"]:
if cluster["centroid"]:
paint(ax, cluster["xss"][0],
cluster["xss"][1], cluster["color"], marker='o')
paint(ax, [cluster["centroid"][0]], [
cluster["centroid"][1]], "#000000", marker='^')
elif data["dimension"] == 3:
ax = fig.add_subplot(111, projection='3d')
for cluster in data["clusters"]:
paint(ax, cluster["xss"][0], cluster["xss"]
[1], cluster["color"], cluster["xss"][2])
plt.show()
pass
if __name__ == "__main__":
work()
部分截图
如下效果图仅供参考,三角形为聚簇中心点,后续考虑使用更优化的算法。