原理参考:
K-means聚类算法原理及python实现
Sklearn之KMeans算法
代码传送门:https://github.com/taifyang/machine-learning
python实现:
import random
import pandas as pd
import numpy as np
class KMeans:
    """Plain k-means (Lloyd's algorithm) over an in-memory sample matrix.

    Args:
        dataSet: Array-like of shape (n_samples, n_features).
        k: Number of clusters to form.
    """

    def __init__(self, dataSet, k):
        # np.asarray returns the very same object for ndarray input, so callers
        # that passed an array still see their own array on the instance.
        self.dataSet = np.asarray(dataSet)
        self.k = k

    def calcDis(self, centroids):
        """Return the Euclidean distance from every sample to every centroid.

        Args:
            centroids: Array-like of shape (k, n_features).

        Returns:
            ndarray of shape (n_samples, k); entry [i, j] is the distance
            between sample i and centroid j.
        """
        samples = np.asarray(self.dataSet, dtype=float)
        centers = np.asarray(centroids, dtype=float)
        # Broadcasting (n, 1, d) against (1, k, d) yields all pairwise
        # differences at once instead of tiling each sample in a Python loop.
        diff = samples[:, np.newaxis, :] - centers[np.newaxis, :, :]
        return np.sum(diff ** 2, axis=2) ** 0.5

    def classify(self, centroids):
        """Assign samples to their nearest centroid and recompute the centroids.

        A centroid that attracts no samples keeps its previous position, so the
        result always has exactly as many rows as ``centroids``.

        Args:
            centroids: Array-like of shape (k, n_features).

        Returns:
            Tuple ``(changed, newCentroids)``: the per-coordinate movement of
            each centroid and the updated centroid matrix (both ndarrays).
        """
        samples = np.asarray(self.dataSet, dtype=float)
        centroids = np.asarray(centroids, dtype=float)
        # Index of the closest centroid for each sample (row-wise argmin).
        labels = np.argmin(self.calcDis(centroids), axis=1)

        newCentroids = centroids.copy()
        for index in range(len(centroids)):
            members = samples[labels == index]
            if members.shape[0] > 0:
                newCentroids[index] = members.mean(axis=0)
            # else: empty cluster, the copied previous centroid is retained.

        changed = newCentroids - centroids
        return changed, newCentroids

    def predict(self, max_iter=300):
        """Run k-means from a random start and return the final partition.

        Args:
            max_iter: Upper bound on the number of centroid updates, as a
                safety net against a start that never settles.

        Returns:
            Tuple ``(centroids, cluster)``: ``centroids`` is a list of k
            coordinate lists; ``cluster`` is a list of k lists holding the
            samples (rows of ``dataSet``) assigned to each centroid.

        Raises:
            ValueError: Raised by ``np.random.choice`` when k exceeds the
                number of samples.
        """
        # Pick k distinct samples as the initial centroids.
        seed_rows = np.random.choice(self.dataSet.shape[0], size=self.k, replace=False)
        centroids = self.dataSet[seed_rows, :]

        # Update the centroids until none of them moves any more.
        changed, newCentroids = self.classify(centroids)
        updates = 1
        while np.any(changed != 0) and updates < max_iter:
            changed, newCentroids = self.classify(newCentroids)
            updates += 1

        centroids = newCentroids.tolist()

        # Group the samples by their nearest final centroid.
        labels = np.argmin(self.calcDis(centroids), axis=1)
        cluster = [[] for _ in range(self.k)]
        for sample, label in zip(self.dataSet, labels):
            cluster[label].append(sample)
        return centroids, cluster
if __name__ == '__main__':
    # Demo: split six 2-D points into two clusters and print the outcome.
    samples = np.array([[1, 1], [1, 2], [2, 1], [6, 4], [6, 3], [5, 4]])
    centroids, cluster = KMeans(samples, 2).predict()
    print('质心为:%s' % centroids)
    print('集群为:%s' % cluster)
python调包:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn import metrics
import matplotlib.pyplot as plt
# Cluster the six-point toy data set with the imported KMeans estimator.
x = np.array([[1, 1], [1, 2], [2, 1], [6, 4], [6, 3], [5, 4]])

# Fit a two-cluster model, then label every sample it was fitted on.
k_means = KMeans(n_clusters=2)
k_means.fit(x)
y_predict = k_means.predict(x)

# Scatter the two feature columns, coloured by the predicted label.
plt.scatter(x[:, 0], x[:, 1], c=y_predict)
plt.show()

# Report the labels, the fitted centres, the model's inertia_ attribute and
# the silhouette score of the labelling.
print(k_means.predict(x))
print(k_means.cluster_centers_)
print(k_means.inertia_)
print(metrics.silhouette_score(x, y_predict))
C++实现:
#include <algorithm>
#include <climits>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <vector>
#include <time.h>
// Prints a matrix to std::cout: one line per row, every value followed by a
// single space, and one blank line after the last row.
// The matrix is taken by const reference so printing does not copy it, and
// each row is walked over its own length so ragged input is never read out of
// bounds.
void printMat(const std::vector<std::vector<float>>& mat)
{
	for (const auto& row : mat)
	{
		for (float value : row)
		{
			std::cout << value << " ";
		}
		std::cout << std::endl;
	}
	std::cout << std::endl;
}
// Returns true when every element of mat compares equal to zero (an empty
// matrix counts as all-zero), false as soon as one non-zero element is found.
// Rows are scanned over their own length, so ragged input is safe, and the
// matrix is taken by const reference to avoid copying it on every call.
bool checkZeros(const std::vector<std::vector<float>>& mat)
{
	for (const auto& row : mat)
	{
		for (float value : row)
		{
			// NaN compares unequal to zero, so it counts as "not converged".
			if (value != 0) return false;
		}
	}
	return true;
}
// For every row of clalist, returns the column index of the smallest value
// (the first one on ties). A row that is empty, or whose values never compare
// below the sentinel, yields index 0.
// The sentinel is the largest finite float rather than INT_MAX, so distances
// above 2^31 are still ranked correctly.
std::vector<int> getminDistIndices(const std::vector<std::vector<float>>& clalist)
{
	std::vector<int> minDistIndices(clalist.size(), 0);
	for (size_t i = 0; i < clalist.size(); i++)
	{
		const std::vector<float>& row = clalist[i];
		float minDist = std::numeric_limits<float>::max();
		for (size_t j = 0; j < row.size(); j++)
		{
			if (row[j] < minDist)
			{
				minDist = row[j];
				minDistIndices[i] = static_cast<int>(j);
			}
		}
	}
	return minDistIndices;
}
// K-means clustering (Lloyd's iteration) over a fixed in-memory data set.
// Every sample is one row of floats and all rows share the same dimension.
class KMeans
{
public:
	// Stores a copy of dataSet together with the requested cluster count k.
	// Throws std::invalid_argument when the data set is empty, when its rows
	// differ in length, or when k is not in the range [1, dataSet.size()].
	// Rejecting these up front prevents out-of-bounds reads and an endless
	// initial-centroid search later on.
	KMeans(std::vector<std::vector<float>> dataSet, int k) :m_dataSet(dataSet), m_k(k)
	{
		if (m_dataSet.empty())
		{
			throw std::invalid_argument("KMeans: dataSet must not be empty");
		}
		for (const auto& sample : m_dataSet)
		{
			if (sample.size() != m_dataSet[0].size())
			{
				throw std::invalid_argument("KMeans: all samples must have the same dimension");
			}
		}
		if (m_k <= 0 || static_cast<size_t>(m_k) > m_dataSet.size())
		{
			throw std::invalid_argument("KMeans: k must be between 1 and the number of samples");
		}
	}

	// Returns an (n_samples x k) matrix whose entry [i][j] is the Euclidean
	// distance between sample i and centroids[j]. Only the first k centroids
	// are used. Throws std::invalid_argument when fewer than k centroids are
	// supplied or one of them has fewer coordinates than the samples.
	std::vector<std::vector<float>> calcDis(const std::vector<std::vector<float>>& centroids)
	{
		const size_t k = static_cast<size_t>(m_k);
		const size_t dim = m_dataSet[0].size();
		if (centroids.size() < k)
		{
			throw std::invalid_argument("KMeans: fewer centroids than clusters");
		}
		for (size_t i = 0; i < k; i++)
		{
			if (centroids[i].size() < dim)
			{
				throw std::invalid_argument("KMeans: centroid dimension is smaller than sample dimension");
			}
		}

		std::vector<std::vector<float>> clalist;
		clalist.reserve(m_dataSet.size());
		for (const auto& data : m_dataSet)
		{
			std::vector<float> distances(k, 0.0f);
			for (size_t i = 0; i < k; i++)
			{
				float squaredDist = 0.0f;
				for (size_t j = 0; j < dim; j++)
				{
					const float delta = data[j] - centroids[i][j];
					squaredDist += delta * delta;
				}
				distances[i] = std::sqrt(squaredDist);
			}
			clalist.push_back(distances);
		}
		return clalist;
	}

	// Assigns every sample to its nearest centroid, then writes the mean of
	// each group to newCentroids and the per-coordinate movement
	// (newCentroids - centroids) to changed. Both outputs are overwritten.
	// A centroid that attracts no sample keeps its previous position instead
	// of becoming 0/0 = NaN.
	// centroids is deliberately taken by value so that a caller may pass the
	// same vector as both centroids and newCentroids.
	void classify(std::vector<std::vector<float>> centroids, std::vector<std::vector<float>>& newCentroids, std::vector<std::vector<float>>& changed)
	{
		const size_t k = static_cast<size_t>(m_k);
		const size_t dim = m_dataSet[0].size();
		const std::vector<int> minDistIndices = getminDistIndices(calcDis(centroids));

		// Per-cluster coordinate sums and member counts.
		std::vector<std::vector<float>> sum(k, std::vector<float>(dim, 0.0f));
		std::vector<int> num(k, 0);
		for (size_t j = 0; j < m_dataSet.size(); j++)
		{
			const size_t label = static_cast<size_t>(minDistIndices[j]);
			for (size_t i = 0; i < dim; i++)
			{
				sum[label][i] += m_dataSet[j][i];
			}
			++num[label];
		}

		// assign() rather than resize() so stale contents of the output
		// arguments can never leak into the result.
		newCentroids.assign(k, std::vector<float>(dim, 0.0f));
		changed.assign(k, std::vector<float>(dim, 0.0f));
		for (size_t j = 0; j < k; j++)
		{
			for (size_t i = 0; i < dim; i++)
			{
				newCentroids[j][i] = (num[j] > 0) ? sum[j][i] / num[j] : centroids[j][i];
				changed[j][i] = newCentroids[j][i] - centroids[j][i];
			}
		}
	}

	// Runs k-means from k distinct, randomly chosen samples.
	// On return centroids holds the k final centroids and cluster[c] holds the
	// samples assigned to centroid c; previous contents of both are discarded.
	// maxIterations bounds the number of centroid updates as a safety net.
	void predict(std::vector<std::vector<float>>& centroids, std::vector<std::vector<std::vector<float>>>& cluster, int maxIterations = 300)
	{
		const size_t k = static_cast<size_t>(m_k);

		// The generator is re-seeded from the clock on every call, so two
		// calls within the same second start from the same centroids.
		std::srand((unsigned)time(NULL));
		std::vector<size_t> random_indices;
		// Terminates because the constructor guarantees k <= sample count.
		while (random_indices.size() < k)
		{
			const size_t random_index = static_cast<size_t>(std::rand()) % m_dataSet.size();
			if (std::find(random_indices.begin(), random_indices.end(), random_index) == random_indices.end())
				random_indices.push_back(random_index);
		}

		centroids.assign(k, std::vector<float>());
		for (size_t i = 0; i < k; i++)
		{
			centroids[i] = m_dataSet[random_indices[i]];
		}

		std::vector<std::vector<float>> newCentroids;
		std::vector<std::vector<float>> changed;
		classify(centroids, newCentroids, changed);
		// Keep updating until no centroid moves any more (or the cap is hit).
		for (int iteration = 1; iteration < maxIterations && !checkZeros(changed); ++iteration)
		{
			// Safe to pass newCentroids twice: classify copies its first argument.
			classify(newCentroids, newCentroids, changed);
		}
		centroids = newCentroids;

		// Group the samples by their nearest final centroid.
		const std::vector<int> minDistIndices = getminDistIndices(calcDis(centroids));
		cluster.assign(k, std::vector<std::vector<float>>());
		for (size_t i = 0; i < minDistIndices.size(); i++)
		{
			cluster[minDistIndices[i]].push_back(m_dataSet[i]);
		}
	}

private:
	std::vector<std::vector<float>> m_dataSet;	// samples, one row each
	int m_k;									// number of clusters
};
// Demo entry point: clusters six 2-D points into two groups, then prints the
// centroids followed by the members of each cluster.
int main(int argc, char* argv[])
{
	const int k = 2;
	std::vector<std::vector<float>> dataSet = { {1, 1},{1, 2},{2, 1},{6, 4},{6, 3},{5, 4} };
	KMeans kmeans(dataSet, k);

	std::vector<std::vector<float>> centroids;
	std::vector<std::vector<std::vector<float>>> cluster;
	kmeans.predict(centroids, cluster);

	printMat(centroids);
	// predict sizes cluster to k entries, i.e. exactly two here.
	for (const auto& members : cluster)
	{
		printMat(members);
	}

	// Runs the shell command "pause" (presumably to keep a Windows console open).
	system("pause");
	return EXIT_SUCCESS;
}