Online KMeans

import numpy as np


class OnlineKMeans:
    """Online (sequential) k-means clustering.

    Centroids are updated one sample at a time. The first ``num_clusters``
    samples seen become the initial centroids; every subsequent sample moves
    its nearest centroid toward it, either by the exact running-mean update
    (``lr is None``) or by a decaying learning rate ``c' / (t0 + t)`` where
    ``t`` is the total number of samples seen so far.
    """

    def __init__(self,
                 num_features: int,
                 num_clusters: int,
                 lr: tuple = None):
        """
        :param num_features: The dimension of the data
        :param num_clusters: The number of clusters to form as well as the number of centroids to generate.
        :param lr: The learning rate of the online k-means (c', t0). If None, then we will use the simplest update
        rule (c'=1, t0=0) as described in the lecture.
        :raises ValueError: If ``num_features`` or ``num_clusters`` is < 1.
        """
        if num_features < 1:
            raise ValueError(f"num_features must be greater or equal to 1!\nGet {num_features}")
        if num_clusters < 1:
            raise ValueError(f"num_clusters must be greater or equal to 1!\nGet {num_clusters}")

        self.num_features = num_features
        self.num_clusters = num_clusters

        self.num_centroids = 0  # how many centroids have been initialized so far
        self.centroid = np.zeros((num_clusters, num_features))
        self.cluster_counter = np.zeros(num_clusters)  # Count how many points have been assigned into this cluster

        self.num_samples = 0  # total samples seen across all fit() calls
        self.lr = lr

    def _as_batch(self, X):
        """Promote a single sample to a (1, num_features) batch and validate the feature dimension.

        :param X: (num_features,) or (num_samples, num_features)
        :return: (num_samples, num_features) array
        :raises ValueError: If the trailing dimension does not equal ``self.num_features``.
        """
        X = np.asarray(X)
        if X.ndim == 1:
            X = X[np.newaxis, :]
        if X.shape[1] != self.num_features:
            raise ValueError(
                f"Expected samples with {self.num_features} features, got {X.shape[1]}")
        return X

    def fit(self, X):
        """
        Receive a sample (or mini batch of samples) online, and update the centroids of the clusters
        :param X: (num_features,) or (num_samples, num_features)
        :raises ValueError: If the feature dimension does not match ``num_features``.
        :return:
        """
        X = self._as_batch(X)

        for sample in X:
            self.num_samples += 1
            # Did not find enough samples, directly adopt the sample as a new centroid
            if self.num_centroids < self.num_clusters:
                self.centroid[self.num_centroids] = sample
                self.cluster_counter[self.num_centroids] += 1
                self.num_centroids += 1
                continue

            # Determine the closest centroid (L2 distance) for this sample
            dist = np.linalg.norm(self.centroid - sample, axis=1)
            centroid_idx = np.argmin(dist)

            if self.lr is None:
                # Exact running mean of all points assigned to this cluster
                count = self.cluster_counter[centroid_idx]
                self.centroid[centroid_idx] = (count * self.centroid[centroid_idx] +
                                               sample) / (count + 1)
            else:
                # Decaying learning rate based on the global sample count
                c_prime, t0 = self.lr
                rate = c_prime / (t0 + self.num_samples)
                self.centroid[centroid_idx] = (1 - rate) * self.centroid[centroid_idx] + rate * sample
            self.cluster_counter[centroid_idx] += 1

    def predict(self, X):
        """
        Predict the cluster labels for each sample in X
        :param X: (num_features,) or (num_samples, num_features)
        :return: (num_samples,) int array of cluster indices; index starts from zero
        """
        X = self._as_batch(X)
        # Pairwise distances, shape (num_samples, num_clusters); one vectorized
        # computation instead of a Python loop per sample.
        dist = np.linalg.norm(X[:, np.newaxis, :] - self.centroid[np.newaxis, :, :], axis=2)
        return np.argmin(dist, axis=1)

    def fit_predict(self, X):
        """
        Compute cluster centers and predict cluster index for each sample.
        :param X: (num_features,) or (num_samples, num_features)
        :return: (num_samples,) int array of cluster indices
        """
        # Because the centroid may change in the online setting, we cannot determine the cluster of each label until
        # we finish fitting.
        self.fit(X)
        return self.predict(X)

    def calculate_cost(self, X):
        """
        Calculate the KMean cost on the dataset X
        The cost is defined in the L2 distance.

        :param X: (num_features,) or (num_samples, num_features) the dataset
        :return: The cost of this KMean: sum of squared distances from each
            sample to its nearest centroid (float)
        """
        X = self._as_batch(X)
        dist = np.linalg.norm(X[:, np.newaxis, :] - self.centroid[np.newaxis, :, :], axis=2)
        return float(np.sum(np.min(dist, axis=1) ** 2))

受输入数据流的顺序影响,可能出现不同的聚类结果:

for trial in range(30):
    okm = OnlineKMeans(2, 3)

    index = np.arange(len(data))
    np.random.shuffle(index)
    data = data[index]

    for d in data:
        okm.fit(d)
    labels = okm.predict(data)
    label2color = ['r', 'g', 'b']
    colors = [label2color[int(label)] for label in labels]
    show_scatter(data, colors)

在这里插入图片描述
在这里插入图片描述

在这里插入图片描述

在这里插入图片描述

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值