import numpy as np
class OnlineKMeans:
    """Online (sequential) K-Means clustering.

    Centroids are updated one sample at a time: the first ``num_clusters``
    samples seen become the initial centroids; every later sample moves its
    nearest centroid either toward the running mean of that cluster
    (``lr is None``) or by a decaying learning rate ``c' / (t0 + t)``.
    """

    def __init__(self,
                 num_features: int,
                 num_clusters: int,
                 lr: tuple = None):
        """
        :param num_features: The dimension of the data.
        :param num_clusters: The number of clusters to form as well as the
            number of centroids to generate.
        :param lr: The learning rate of the online k-means as a pair
            ``(c', t0)``. If None, the simple running-mean update rule
            (equivalent to c'=1, t0=0 per cluster) is used.
        :raises ValueError: If ``num_features`` or ``num_clusters`` is < 1.
        """
        if num_features < 1:
            raise ValueError(f"num_features must be greater or equal to 1!\nGot {num_features}")
        if num_clusters < 1:
            raise ValueError(f"num_clusters must be greater or equal to 1!\nGot {num_clusters}")

        self.num_features = num_features
        self.num_clusters = num_clusters
        self.num_centroids = 0  # how many centroids have been initialized so far
        self.centroid = np.zeros((num_clusters, num_features))
        # Count how many points have been assigned to each cluster.
        self.cluster_counter = np.zeros(num_clusters)
        self.num_samples = 0  # total number of samples seen so far
        self.lr = lr

    def fit(self, X):
        """
        Receive a sample (or mini-batch of samples) online and update the
        centroids of the clusters.

        :param X: array-like of shape (num_features,) or (num_samples, num_features)
        :return: None
        """
        X = np.atleast_2d(np.asarray(X, dtype=float))
        for sample in X:
            self.num_samples += 1
            if self.num_centroids < self.num_clusters:
                # Bootstrap phase: not enough centroids yet, so this sample
                # becomes the next centroid directly.
                self.centroid[self.num_centroids] = sample
                self.cluster_counter[self.num_centroids] += 1
                self.num_centroids += 1
                continue
            # Determine the closest centroid for this sample.
            dist = np.linalg.norm(self.centroid - sample, axis=1)
            idx = np.argmin(dist)
            if self.lr is None:
                # Running-mean update: new centroid is the mean of all points
                # assigned to this cluster so far (including this one).
                count = self.cluster_counter[idx]
                self.centroid[idx] = (count * self.centroid[idx] + sample) / (count + 1)
            else:
                # Decaying learning-rate update, rate = c' / (t0 + t) where t
                # is the total number of samples seen (not per-cluster).
                c_prime, t0 = self.lr
                rate = c_prime / (t0 + self.num_samples)
                self.centroid[idx] = (1 - rate) * self.centroid[idx] + rate * sample
            self.cluster_counter[idx] += 1

    def predict(self, X):
        """
        Predict the cluster label for each sample in X.

        :param X: array-like of shape (num_features,) or (num_samples, num_features)
        :return: (num_samples,) integer array of cluster indices, starting at zero
        """
        X = np.atleast_2d(np.asarray(X, dtype=float))
        # Pairwise distances: (num_samples, num_clusters).
        dist = np.linalg.norm(X[:, np.newaxis, :] - self.centroid[np.newaxis, :, :], axis=2)
        return np.argmin(dist, axis=1)

    def fit_predict(self, X):
        """
        Compute cluster centers and predict the cluster index for each sample.

        Because the centroids may change in the online setting, the label of
        each sample cannot be determined until fitting is finished.

        :param X: array-like of shape (num_features,) or (num_samples, num_features)
        :return: (num_samples,) integer array of cluster indices
        """
        self.fit(X)
        return self.predict(X)

    def calculate_cost(self, X):
        """
        Calculate the k-means cost (sum of squared L2 distances of each
        sample to its nearest centroid) on the dataset X.

        :param X: array-like of shape (num_features,) or (num_samples, num_features)
        :return: The cost as a float.
        """
        X = np.atleast_2d(np.asarray(X, dtype=float))
        dist = np.linalg.norm(X[:, np.newaxis, :] - self.centroid[np.newaxis, :, :], axis=2)
        return float(np.sum(np.square(np.min(dist, axis=1))))
# Depending on the order of the input data stream, different clustering results may occur:
# Demo: run 30 trials, each presenting the samples in a fresh random order,
# to visualize how the online algorithm's result depends on input order.
# NOTE(review): relies on `data` (an (n, 2) array) and `show_scatter` being
# defined elsewhere — confirm against the surrounding notebook/script.
for trial in range(30):
    okm = OnlineKMeans(2, 3)
    # Shuffle the sample order for this trial.
    data = data[np.random.permutation(len(data))]
    for d in data:
        okm.fit(d)
    labels = okm.predict(data)
    label2color = ['r', 'g', 'b']
    colors = [label2color[int(i)] for i in labels]
    show_scatter(data, colors)