import numpy as np
import matplotlib.pyplot as plt
def euclidean_distantce(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    Both operands must support element-wise subtraction (for example NumPy
    arrays of the same shape). All squared differences are summed into a
    single scalar before the square root is taken.

    The spelling of the function name is kept as-is because other code in
    this file calls it by this name.
    """
    delta = x1 - x2
    squared_total = np.sum(delta * delta)
    return np.sqrt(squared_total)
class KMeans:
    """K-means clustering: alternate sample assignment and centroid update.

    Parameters
    ----------
    K : int
        Number of clusters (and centroids).
    max_iter : int
        Upper bound on the number of assignment/update rounds.
    plot_steps : bool
        When true, ``plot()`` is called after every assignment step and
        after every centroid update that did not converge.
    """

    def __init__(self, K=5, max_iter=100, plot_steps=False):
        self.K = K
        self.max_iter = max_iter
        self.plot_steps = plot_steps
        # clusters[i] holds the row indices of the samples assigned to centroid i
        self.clusters = [[] for _ in range(self.K)]
        # one centroid (feature vector) per cluster; filled in by predict()
        self.centroids = []

    def predict(self, X):
        """Cluster the rows of ``X`` and return one label per row.

        ``X`` is converted with ``np.asarray`` and must be two-dimensional
        (samples x features). Initial centroids are ``K`` distinct rows of
        ``X`` drawn with ``np.random.choice``, so results depend on NumPy's
        global random state.

        Returns a float array of length ``n_samples`` whose entries are the
        cluster indices ``0 .. K-1``.
        """
        self.X = np.asarray(X)
        self.n_samples, self.n_features = self.X.shape

        # initialize the centroids from K distinct samples
        chosen_rows = np.random.choice(self.n_samples, self.K, replace=False)
        self.centroids = [self.X[row] for row in chosen_rows]

        for _ in range(self.max_iter):
            # assignment step: each sample joins its nearest centroid
            self.clusters = self._create_clusters(self.centroids)
            if self.plot_steps:
                self.plot()

            # update step: each centroid moves to the mean of its cluster
            centroids_old = self.centroids
            self.centroids = self._get_centroids(self.clusters)

            # stop as soon as no centroid moved
            if self._is_converged(centroids_old, self.centroids):
                break
            if self.plot_steps:
                self.plot()

        return self._get_cluster_label(self.clusters)

    def _get_centroids(self, clusters):
        """Return a ``(K, n_features)`` array with the mean of each cluster.

        ``clusters`` is a list of ``K`` lists of sample indices into
        ``self.X``. An empty cluster keeps its previous centroid from
        ``self.centroids``: ``np.mean`` of an empty selection yields NaN,
        which would corrupt every later distance computation.
        """
        centroids = np.zeros((self.K, self.n_features))
        for idx, cluster in enumerate(clusters):
            if len(cluster) == 0:
                centroids[idx] = self.centroids[idx]
            else:
                centroids[idx] = np.mean(self.X[cluster], axis=0)
        return centroids

    def _is_converged(self, centroids, centroids_old):
        """Return True when every centroid is at distance 0 from its old position.

        The check is symmetric in its two arguments.
        """
        distances = [
            euclidean_distantce(new, old)
            for new, old in zip(centroids, centroids_old)
        ]
        return sum(distances) == 0

    def _create_clusters(self, centroids):
        """Assign every sample index to the cluster of its nearest centroid.

        Returns a list of ``K`` lists; list ``i`` contains the row indices
        of ``self.X`` whose closest entry in ``centroids`` is number ``i``.
        """
        clusters = [[] for _ in range(self.K)]
        for idx, sample in enumerate(self.X):
            distances = [euclidean_distantce(sample, point) for point in centroids]
            clusters[np.argmin(distances)].append(idx)
        return clusters

    def _get_cluster_label(self, clusters):
        """Flatten ``clusters`` into a per-sample label array.

        The array is allocated with ``np.empty`` (float dtype), so only the
        positions listed in ``clusters`` carry meaningful values.
        """
        labels = np.empty(self.n_samples)
        for cluster_idx, cluster in enumerate(clusters):
            for sample_idx in cluster:
                labels[sample_idx] = cluster_idx
        return labels

    def plot(self):
        """Scatter-plot the current clusters and mark centroids with a black x.

        Each cluster's points are transposed and unpacked into
        ``ax.scatter``, so the feature columns become its positional
        arguments.
        """
        fig, ax = plt.subplots(figsize=(12, 8))
        for index in self.clusters:
            point = self.X[index].T
            ax.scatter(*point)
        for point in self.centroids:
            ax.scatter(*point, marker="x", color="black", linewidth=2)
        plt.show()
###
# Demo: cluster three synthetic 2-D blobs and plot every step.
if __name__ == "__main__":
    np.random.seed(2)  # seeds NumPy's global random state before clustering
    from sklearn.datasets import make_blobs

    features, targets = make_blobs(
        centers=3, n_samples=500, n_features=2, shuffle=True, random_state=40
    )
    print(features.shape)

    # the number of distinct blob labels is the number of clusters to fit
    n_clusters = np.unique(targets).size
    print(n_clusters)

    model = KMeans(K=n_clusters, max_iter=100, plot_steps=True)
    predicted = model.predict(features)
    model.plot()
# KMeans from scratch -- Python implementation
# (Source page note: latest recommended article published 2024-08-06 17:45:33.)