python简单思维实现K-means-CSDN博客

本文链接：https://blog.csdn.net/weixin_40902563/article/details/102828267

自己定义了几个样本点；初始中心可以手动指定，也可以用 random 随机选取。

#!/usr/bin/python3
# -*- coding:utf-8 -*-
# Author:ChenYuan
import random

import numpy as np
class KMeans(object):
    """Minimal K-means clustering over the rows of a 2-D array.

    Call ``fit`` first; ``predict`` and ``evaluate`` then reuse the centers
    and the distance function chosen during fitting.
    """

    # Names accepted for the ``distance_function`` argument.
    _DISTANCE_FUNCTIONS = ('cosine', 'euclidean')

    def __init__(self):
        # Cluster centers from the last ``fit`` call (one row per cluster).
        self._final_center = None
        # Per-cluster lists of point labels ('Doc1', 'Doc2', ...).
        self._final_result = None
        # Per-cluster lists of the member points (each point as a plain list).
        self._final_point = None
        # Distance function used by the last ``fit`` call.
        self._distance_function = 'cosine'

    @staticmethod
    def _call_distance(p1, p2, distance_function='cosine'):
        """Return the distance between two 1-D points (smaller means closer).

        Args:
            p1: First point (array-like of numbers).
            p2: Second point, same length as ``p1``.
            distance_function: ``'cosine'`` for cosine distance
                (``1 - cosine similarity``) or ``'euclidean'``.

        Returns:
            The distance as a float.

        Raises:
            ValueError: If ``distance_function`` is not a supported name.
        """
        p1 = np.asarray(p1, dtype=float)
        p2 = np.asarray(p2, dtype=float)
        if distance_function == 'cosine':
            norm_product = np.linalg.norm(p1) * np.linalg.norm(p2)
            if norm_product == 0:
                # A zero vector has no direction; treat it as orthogonal
                # instead of dividing by zero.
                return 1.0
            similarity = float(np.dot(p1, p2)) / float(norm_product)
            # Clamp rounding noise so the distance stays within [0, 2].
            similarity = max(-1.0, min(1.0, similarity))
            # Callers pick the minimum, so convert similarity to a distance.
            return 1.0 - similarity
        if distance_function == 'euclidean':
            return float(np.linalg.norm(p1 - p2))
        raise ValueError(
            "distance_function must be 'cosine' or 'euclidean', got {!r}".format(
                distance_function))

    @staticmethod
    def _choice_random_center(X, k):
        """Pick ``k`` distinct rows of ``X`` at random as initial centers.

        Raises:
            ValueError: If ``k`` is negative or larger than ``len(X)``.
        """
        n_points = len(X)
        if not 0 <= k <= n_points:
            raise ValueError(
                'k must be between 0 and the number of points ({}), got {}'.format(
                    n_points, k))
        # random.sample draws without replacement, so indices are distinct.
        return [X[index] for index in random.sample(range(n_points), k)]

    @staticmethod
    def _check_stop(result, last_result):
        """Return True while iteration should continue.

        The two sets of centers are compared irrespective of row order;
        the result is False once they are identical.
        """
        current = np.asarray(result).tolist()
        previous = np.asarray(last_result).tolist()
        return sorted(current) != sorted(previous)

    def fit(self, X, k=2, start_center=None, distance_function='cosine',
            verbose=0, max_iter=300):
        """Cluster the rows of ``X`` and store the result on the instance.

        Args:
            X: 2-D array-like, one point per row.
            k: Number of clusters. Only used when ``start_center`` is None;
                otherwise the number of supplied centers is used.
            start_center: Optional initial centers (one per row). When None,
                ``k`` distinct rows of ``X`` are chosen at random.
            distance_function: ``'cosine'`` or ``'euclidean'``.
            verbose: Non-zero prints the centers and memberships of every
                iteration.
            max_iter: Upper bound on the number of iterations.

        Raises:
            ValueError: On an empty or non-2-D ``X``, an unsupported
                ``distance_function``, ``max_iter < 1``, or centers whose
                shape does not match ``X``.
        """
        X = np.asarray(X)
        if X.ndim != 2 or len(X) == 0:
            raise ValueError('X must be a non-empty 2-D array (points x features)')
        if distance_function not in self._DISTANCE_FUNCTIONS:
            raise ValueError(
                "distance_function must be 'cosine' or 'euclidean', got {!r}".format(
                    distance_function))
        if max_iter < 1:
            raise ValueError('max_iter must be at least 1')

        if start_center is None:
            start_center = self._choice_random_center(X, k)
        print(start_center)

        centers = np.array(start_center, dtype=float)
        if centers.ndim != 2 or len(centers) == 0 or centers.shape[1] != X.shape[1]:
            raise ValueError(
                'start_center must contain at least one center with {} features'.format(
                    X.shape[1]))
        # The supplied centers decide the cluster count, so a mismatching
        # ``k`` cannot cause out-of-range cluster indices.
        k = len(centers)

        labels = ['Doc{}'.format(i + 1) for i in range(len(X))]
        final_result = None
        for count in range(1, max_iter + 1):
            point_result = [[] for _ in range(k)]
            clusters = [[] for _ in range(k)]
            for label, point in zip(labels, X):
                distances = [self._call_distance(point, center, distance_function)
                             for center in centers]
                nearest = distances.index(min(distances))
                clusters[nearest].append(point.tolist())
                point_result[nearest].append(label)

            # An empty cluster keeps its previous center; averaging zero
            # points would produce NaN and the loop would never converge.
            new_centers = np.array([
                np.mean(cluster, axis=0) if cluster else previous
                for cluster, previous in zip(clusters, centers)
            ])

            if verbose != 0:
                print('第{}次循环的中心点为：{}'.format(count, new_centers))
                print('第{}次循环的中心为：{}'.format(count, point_result))
                print('\n')

            final_result = point_result
            self._final_point = clusters
            keep_going = self._check_stop(new_centers, centers)
            centers = new_centers
            if not keep_going:
                break

        print('最终的聚类结果为：')
        print(final_result)
        self._final_center = centers
        self._final_result = final_result
        # Remember the metric so predict/evaluate agree with the fit.
        self._distance_function = distance_function

    def _fitted_centers(self):
        """Return the fitted centers, or raise if ``fit`` has not run yet."""
        if self._final_center is None:
            raise RuntimeError('fit() must be called before predict() or evaluate()')
        return self._final_center

    def predict(self, x):
        """Return, for each point in ``x``, the index of its nearest center.

        Uses the distance function of the last ``fit`` call.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        centers = self._fitted_centers()
        predict_result = []
        for data in x:
            distances = [self._call_distance(data, center, self._distance_function)
                         for center in centers]
            predict_result.append(distances.index(min(distances)))
        return predict_result

    def evaluate(self):
        """Print and return the average within-cluster distance.

        The mean distance of each non-empty cluster's points to its center
        is computed first; the score is the mean of those per-cluster
        values. Uses the distance function of the last ``fit`` call.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        centers = self._fitted_centers()
        cluster_means = []
        for points, center in zip(self._final_point, centers):
            if not points:
                # Empty clusters have no members to measure.
                continue
            total = sum(self._call_distance(point, center, self._distance_function)
                        for point in points)
            cluster_means.append(total / len(points))
        score = sum(cluster_means) / len(cluster_means) if cluster_means else 0.0
        print(score)
        return score

这里提供一个例子，调用我写的类：

# Sample data: seven points (rows) with eight features each.
x = [[2, 0, 4, 3, 0, 1, 0, 100],
     [0, 2, 4, 0, 2, 3, 0, 0],
     [4, 0, 1, 3, 0, 1, 0, 1],
     [0, 1, 0, 2, 0, 0, 1, 0],
     [0, 0, 2, 0, 0, 4, 0, 0],
     [1, 1, 0, 2, 0, 1, 1, 3],
     [2, 1, 3, 4, 0, 2, 0, 2]]
x = np.array(x)
k = 2
km = KMeans()
# Pass the `k` variable itself so changing it above changes the clustering;
# start_center=None lets fit() pick the initial centers at random.
km.fit(x, k=k, start_center=None, verbose=1)
km.evaluate()