KMeans算法

原理参考:
K-means聚类算法原理及python实现
Sklearn之KMeans算法

代码传送门:https://github.com/taifyang/machine-learning

python实现:

import random
import pandas as pd
import numpy as np

class KMeans:
    def __init__(self, dataSet, k):
        self.dataSet = dataSet
        self.k = k

    # 计算欧拉距离
    def calcDis(self, centroids):
        clalist=[]
        for data in self.dataSet:
            diff = np.tile(data, (self.k, 1)) - centroids  #相减  a=[0,1,2], (np.tile(a,(2,1))就是把a先沿x轴复制1倍,即没有复制,仍然是 [0,1,2]。再把结果沿y方向复制2倍得到array([[0,1,2],[0,1,2]]))
            squaredDiff = diff ** 2     #平方
            squaredDist = np.sum(squaredDiff, axis=1)   #和  (axis=1表示行)
            distance = squaredDist ** 0.5  #开根号
            clalist.append(distance) 
        clalist = np.array(clalist)  #返回一个每个点到质点的距离len(dateSet)*k的数组
        return clalist

    # 计算质心
    def classify(self, centroids):
        # 计算样本到质心的距离
        clalist = self.calcDis(centroids)
        #print(dataSet, centroids, clalist)
        # 分组并计算新的质心
        minDistIndices = np.argmin(clalist, axis=1)    #axis=1 表示求出每行的最小值的下标
        #print(clalist, minDistIndices)
        newCentroids = pd.DataFrame(self.dataSet).groupby(minDistIndices).mean() #DataFrame(dataSet)对DataSet分组,groupby(min)按照min进行统计分类,mean()对分类结果求均值
        #print(newCentroids, newCentroids.values)
        newCentroids = newCentroids.values    
        changed = newCentroids - centroids #计算变化量
        return changed, newCentroids

    # 使用k-means分类
    def predict(self):
        # 随机取质心
        centroids = self.dataSet[np.random.choice(self.dataSet.shape[0], size=self.k, replace=False), :]
        # 更新质心 直到变化量全为0
        changed, newCentroids = self.classify(centroids)    
        #print(centroids,newCentroids) 
        while np.any(changed != 0):          
            changed, newCentroids = self.classify(newCentroids)
            #print(changed)   
        centroids = newCentroids.tolist()   #tolist()将矩阵转换成列表
        # 根据质心计算每个集群
        cluster = []
        clalist = self.calcDis(centroids) #调用欧拉距离
        minDistIndices = np.argmin(clalist, axis=1)  
        for i in range(self.k):
            cluster.append([])
        for i, j in enumerate(minDistIndices):   #enumerate()可同时遍历索引和遍历元素
            cluster[j].append(self.dataSet[i])        
        return centroids, cluster
 

if __name__=='__main__': 
    x = np.array([[1, 1], [1, 2], [2, 1], [6, 4], [6, 3], [5, 4]])
    kmeans = KMeans(x, 2)
    centroids, cluster = kmeans.predict()
    print('质心为:%s' % centroids)
    print('集群为:%s' % cluster)

python调包:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn import metrics
import matplotlib.pyplot as plt

x = np.array([[1, 1], [1, 2], [2, 1], [6, 4], [6, 3], [5, 4]])
k_means = KMeans(n_clusters=2)
k_means.fit(x)

y_predict = k_means.predict(x)
plt.scatter(x[:,0],x[:,1],c=y_predict)
plt.show()
print(k_means.predict((x[:,:])))
print(k_means.cluster_centers_)
print(k_means.inertia_)
print(metrics.silhouette_score(x,y_predict))

C++实现:

#include <iostream>
#include <vector>
#include <time.h>

void printMat(std::vector<std::vector<float>> mat)
{
	for (size_t i = 0; i < mat.size(); i++)
	{
		for (size_t j = 0; j < mat[0].size(); j++)
		{
			std::cout << mat[i][j] << " ";
		}
		std::cout << std::endl;
	}
	std::cout << std::endl;
}

bool checkZeros(std::vector<std::vector<float>> mat)
{
	bool flag = true;
	for (size_t i = 0; i < mat.size(); i++)
	{
		for (size_t j = 0; j < mat[0].size(); j++)
		{
			if (mat[i][j] != 0) flag = false;
		}
	}
	return flag;
}

std::vector<int> getminDistIndices(std::vector<std::vector<float>> clalist)
{
	std::vector<int> minDistIndices(clalist.size());
	for (size_t i = 0; i < clalist.size(); i++)
	{
		float minDist = INT_MAX;
		int minDistIndex = 0;
		for (size_t j = 0; j < clalist[0].size(); j++)
		{
			if (clalist[i][j] < minDist)
			{
				minDist = clalist[i][j];
				minDistIndex = j;
			}
		}
		//std::cout << minDistIndex << std::endl;
		minDistIndices[i] = minDistIndex;
	}
	return minDistIndices;
}

class KMeans
{
public:
	KMeans(std::vector<std::vector<float>> dataSet, int k) :m_dataSet(dataSet), m_k(k) {};

	std::vector<std::vector<float>> calcDis(std::vector<std::vector<float>> centroids)
	{
		std::vector<std::vector<float>> clalist;
		for (auto data : m_dataSet)
		{
			std::vector<std::vector<float>> diff(m_k);
			for (size_t i = 0; i < diff.size(); i++)
			{
				diff[i] = data;
			}
			for (size_t i = 0; i < diff.size(); i++)
			{
				for (size_t j = 0; j < diff[0].size(); j++)
				{
					diff[i][j] -= centroids[i][j];
					diff[i][j] = pow(diff[i][j], 2);
				}
			}
			std::vector<float> squaredDist(diff.size());
			for (size_t i = 0; i < diff.size(); i++)
			{
				for (size_t j = 0; j < diff[0].size(); j++)
				{
					squaredDist[i] += diff[i][j];
				}
				squaredDist[i] = sqrt(squaredDist[i]);
			}
			clalist.push_back(squaredDist);
		}

		//printMat(clalist);
		return clalist;
	}

	void classify(std::vector<std::vector<float>> centroids, std::vector<std::vector<float>>& newCentroids, std::vector<std::vector<float>>& changed)
	{
		std::vector<std::vector<float>> clalist = calcDis(centroids);
		std::vector<int> minDistIndices = getminDistIndices(clalist);

		newCentroids.resize(m_k, std::vector<float>(m_dataSet[0].size()));
		for (size_t i = 0; i < m_dataSet[0].size(); i++)
		{
			std::vector<float> sum(m_k);
			std::vector<int> num(m_k, 0);
			for (size_t j = 0; j < m_dataSet.size(); j++)
			{
				sum[minDistIndices[j]] += m_dataSet[j][i];
				++num[minDistIndices[j]];
			}

			for (size_t j = 0; j < m_k; j++)
			{
				//std::cout << sum[j] <<" "<<num[j] << std::endl;
				newCentroids[j][i] = sum[j] / num[j];
			}
		}

		//printMat(newCentroids);

		changed.resize(m_k, std::vector<float>(m_dataSet[0].size()));
		for (size_t i = 0; i < changed.size(); i++)
		{
			for (size_t j = 0; j < changed[0].size(); j++)
			{
				changed[i][j] = newCentroids[i][j] - centroids[i][j];
			}
		}
	}

	void predict(std::vector<std::vector<float>>& centroids, std::vector<std::vector<std::vector<float>>>& cluster)
	{
		srand((unsigned)time(NULL));
		std::vector<int> random_indices;
		while (random_indices.size() < m_k)
		{
			int random_index = rand() % m_dataSet.size();
			if(find(random_indices.begin(), random_indices.end(), random_index)== random_indices.end())
				random_indices.push_back(random_index);
		}

		centroids.resize(m_k, std::vector<float>(m_dataSet[0].size()));
		for (size_t i = 0; i < m_k; i++)
		{
			centroids[i] = m_dataSet[random_indices[i]];
		}

		std::vector<std::vector<float>> newCentroids;
		std::vector<std::vector<float>> changed;
		classify(centroids, newCentroids, changed);
		//printMat(centroids); printMat(newCentroids);

		while (!checkZeros(changed))
		{
			std::vector<std::vector<float>> copyCentroids = newCentroids;
			classify(copyCentroids, newCentroids, changed);
			//printMat(changed);
		}
		centroids = newCentroids;

		std::vector<std::vector<float>> clalist = calcDis(newCentroids);
		std::vector<int> minDistIndices = getminDistIndices(clalist);
		//for (auto i : minDistIndices)	std::cout << i << std::endl;

		cluster.resize(m_k);
		for (size_t i = 0; i < minDistIndices.size(); i++)
		{
			cluster[minDistIndices[i]].push_back(m_dataSet[i]);
		}
	}

private:
	std::vector<std::vector<float>> m_dataSet;
	int m_k;
};


int main(int argc, char* argv[])
{
	std::vector<std::vector<float>> dataSet = { {1, 1},{1, 2},{2, 1},{6, 4},{6, 3},{5, 4} };
	//std::vector<std::vector<float>> centroids = { {1, 2},{6, 4} };
	int k = 2;
	KMeans kmeans = KMeans(dataSet, k);
	//kmeans.calcDis(centroids);
	//kmeans.classify(centroids);

	std::vector<std::vector<float>> centroids;
	std::vector<std::vector<std::vector<float>>> cluster;
	kmeans.predict(centroids, cluster);
	printMat(centroids);
	printMat(cluster[0]); printMat(cluster[1]);

	system("pause");
	return EXIT_SUCCESS;
}
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

给算法爸爸上香

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值