import numpy as np
class OnlineKMeans:
    """Online (sequential) K-Means clustering.

    Centroids are updated one sample at a time: the first ``num_clusters``
    samples seen become the initial centroids; every later sample moves its
    nearest centroid either toward the running mean of that cluster
    (``lr is None``) or by a decaying learning rate ``c' / (t0 + t)``.
    """

    def __init__(self,
                 num_features: int,
                 num_clusters: int,
                 lr: tuple = None):
        """
        :param num_features: The dimension of the data.
        :param num_clusters: The number of clusters to form as well as the
            number of centroids to generate.
        :param lr: The learning rate of the online k-means as a pair
            ``(c', t0)``. If None, the simple running-mean update rule
            (equivalent to c'=1, t0=0 per cluster) is used.
        :raises ValueError: If ``num_features`` or ``num_clusters`` is < 1.
        """
        if num_features < 1:
            raise ValueError(f"num_features must be greater or equal to 1!\nGot {num_features}")
        if num_clusters < 1:
            raise ValueError(f"num_clusters must be greater or equal to 1!\nGot {num_clusters}")

        self.num_features = num_features
        self.num_clusters = num_clusters
        self.num_centroids = 0  # how many centroids have been initialized so far
        self.centroid = np.zeros((num_clusters, num_features))
        # Count how many points have been assigned to each cluster.
        self.cluster_counter = np.zeros(num_clusters)
        self.num_samples = 0  # total number of samples seen so far
        self.lr = lr

    def fit(self, X):
        """
        Receive a sample (or mini-batch of samples) online and update the
        centroids of the clusters.

        :param X: array-like of shape (num_features,) or (num_samples, num_features)
        :return: None
        """
        X = np.atleast_2d(np.asarray(X, dtype=float))
        for sample in X:
            self.num_samples += 1
            if self.num_centroids < self.num_clusters:
                # Bootstrap phase: not enough centroids yet, so this sample
                # becomes the next centroid directly.
                self.centroid[self.num_centroids] = sample
                self.cluster_counter[self.num_centroids] += 1
                self.num_centroids += 1
                continue
            # Determine the closest centroid for this sample.
            dist = np.linalg.norm(self.centroid - sample, axis=1)
            idx = np.argmin(dist)
            if self.lr is None:
                # Running-mean update: new centroid is the mean of all points
                # assigned to this cluster so far (including this one).
                count = self.cluster_counter[idx]
                self.centroid[idx] = (count * self.centroid[idx] + sample) / (count + 1)
            else:
                # Decaying learning-rate update, rate = c' / (t0 + t) where t
                # is the total number of samples seen (not per-cluster).
                c_prime, t0 = self.lr
                rate = c_prime / (t0 + self.num_samples)
                self.centroid[idx] = (1 - rate) * self.centroid[idx] + rate * sample
            self.cluster_counter[idx] += 1

    def predict(self, X):
        """
        Predict the cluster label for each sample in X.

        :param X: array-like of shape (num_features,) or (num_samples, num_features)
        :return: (num_samples,) integer array of cluster indices, starting at zero
        """
        X = np.atleast_2d(np.asarray(X, dtype=float))
        # Pairwise distances: (num_samples, num_clusters).
        dist = np.linalg.norm(X[:, np.newaxis, :] - self.centroid[np.newaxis, :, :], axis=2)
        return np.argmin(dist, axis=1)

    def fit_predict(self, X):
        """
        Compute cluster centers and predict the cluster index for each sample.

        Because the centroids may change in the online setting, the label of
        each sample cannot be determined until fitting is finished.

        :param X: array-like of shape (num_features,) or (num_samples, num_features)
        :return: (num_samples,) integer array of cluster indices
        """
        self.fit(X)
        return self.predict(X)

    def calculate_cost(self, X):
        """
        Calculate the k-means cost (sum of squared L2 distances of each
        sample to its nearest centroid) on the dataset X.

        :param X: array-like of shape (num_features,) or (num_samples, num_features)
        :return: The cost as a float.
        """
        X = np.atleast_2d(np.asarray(X, dtype=float))
        dist = np.linalg.norm(X[:, np.newaxis, :] - self.centroid[np.newaxis, :, :], axis=2)
        return float(np.sum(np.square(np.min(dist, axis=1))))
# Depending on the order of the input data stream, different clustering results may occur:
# Demo: run 30 trials, each presenting the samples in a fresh random order,
# to visualize how the online algorithm's result depends on input order.
# NOTE(review): relies on `data` (an (n, 2) array) and `show_scatter` being
# defined elsewhere — confirm against the surrounding notebook/script.
for trial in range(30):
    okm = OnlineKMeans(2, 3)
    # Shuffle the sample order for this trial.
    data = data[np.random.permutation(len(data))]
    for d in data:
        okm.fit(d)
    labels = okm.predict(data)
    label2color = ['r', 'g', 'b']
    colors = [label2color[int(i)] for i in labels]
    show_scatter(data, colors)