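# Note: a few sample points are defined by hand below; the initial centers are picked at random (random selection via the `random` module can also be used).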
#!/usr/bin/python3
# -*- coding:utf-8 -*-
# Author:ChenYuan
import random

import numpy as np
class KMeans(object):
    """Minimal k-means clustering with a cosine or Euclidean metric.

    Every row of the training matrix is labelled ``Doc1``, ``Doc2``, ... and
    assigned to the nearest of ``k`` centers; the centers are then moved to
    the mean of their members until they stop changing.

    After :meth:`fit` the instance keeps the final centers, the labels of the
    members of every cluster and the member points themselves.
    """

    # Metric names understood by ``_call_distance``.
    _DISTANCE_FUNCTIONS = ('cosine', 'euclidean')

    def __init__(self):
        # ndarray with one row per cluster center (set by fit()).
        self._final_center = None
        # One list of 'DocN' labels per cluster (set by fit()).
        self._final_result = None
        # One list of member points (plain lists) per cluster (set by fit()).
        self._final_point = None
        # Metric used by the last fit(); predict() and evaluate() reuse it so
        # that all three methods measure "nearest" the same way.
        self._distance_function = 'cosine'

    @staticmethod
    def _call_distance(p1, p2, distance_function='cosine'):
        """Return the distance between two vectors (smaller means closer).

        Args:
            p1: First vector (array-like).
            p2: Second vector (array-like).
            distance_function: ``'cosine'`` for cosine distance
                (``1 - cosine similarity``) or ``'euclidean'`` for the
                L2 norm of the difference.

        Returns:
            The distance as a float.

        Raises:
            ValueError: If ``distance_function`` is not a known metric.
        """
        p1 = np.asarray(p1, dtype=float)
        p2 = np.asarray(p2, dtype=float)
        if distance_function == 'cosine':
            norm_product = np.linalg.norm(p1) * np.linalg.norm(p2)
            if norm_product == 0:
                # A zero vector has no direction; treat it as unrelated
                # instead of producing NaN from a division by zero.
                return 1.0
            similarity = np.dot(p1, p2) / norm_product
            # Callers pick the *minimum*, so similarity has to be turned into
            # a distance. Clip to absorb floating point overshoot.
            return float(1.0 - np.clip(similarity, -1.0, 1.0))
        if distance_function == 'euclidean':
            return float(np.linalg.norm(p1 - p2))
        raise ValueError(
            'unknown distance_function: {!r}'.format(distance_function))

    @staticmethod
    def _choice_random_center(X, k):
        """Pick ``k`` distinct rows of ``X`` at random as initial centers.

        Args:
            X: Indexable collection of points.
            k: Number of centers to pick.

        Returns:
            A list with ``k`` rows of ``X``.

        Raises:
            ValueError: If ``k`` is negative or larger than ``len(X)``.
        """
        n_points = len(X)
        if k < 0 or k > n_points:
            # The previous retry loop never terminated in this situation.
            raise ValueError(
                'cannot pick {} distinct centers from {} points'.format(
                    k, n_points))
        return [X[index] for index in random.sample(range(n_points), k)]

    @staticmethod
    def _check_stop(result, last_result):
        """Tell whether the iteration has to go on.

        Despite the name, the return value is ``True`` while the centers are
        still changing (keep iterating) and ``False`` once ``result`` equals
        ``last_result`` (stop).

        Args:
            result: Centers produced by the current iteration.
            last_result: Centers produced by the previous iteration.

        Returns:
            ``True`` if the two sets of centers differ, else ``False``.
        """
        # array_equal is False for differing shapes, which covers the empty
        # array used as "previous centers" before the first iteration.
        return not np.array_equal(np.asarray(result), np.asarray(last_result))

    def fit(self, X, k=2, start_center=None, distance_function='cosine',
            verbose=0, max_iter=300):
        """Cluster the rows of ``X`` and store the outcome on the instance.

        Args:
            X: 2-D array-like, one point per row.
            k: Number of clusters; only used when ``start_center`` is None.
            start_center: Optional initial centers. When given, the number of
                clusters is the number of centers supplied.
            distance_function: ``'cosine'`` or ``'euclidean'``.
            verbose: Any value other than 0 prints the centers and the
                cluster members after every iteration.
            max_iter: Upper bound on the number of iterations, so that a
                non-converging run still terminates.

        Raises:
            ValueError: On empty / non 2-D input, an unknown metric, an
                invalid ``k`` or a ``max_iter`` below 1.
        """
        X = np.asarray(X)
        if X.ndim != 2 or len(X) == 0:
            raise ValueError('X must be a non-empty 2-D array of points')
        if distance_function not in self._DISTANCE_FUNCTIONS:
            raise ValueError(
                'unknown distance_function: {!r}'.format(distance_function))
        if max_iter < 1:
            raise ValueError('max_iter must be at least 1')

        if start_center is None:
            start_center = self._choice_random_center(X, k)
        print(start_center)

        label = ['Doc{}'.format(i + 1) for i in range(len(X))]
        result = np.array(start_center, dtype=float)
        # Explicit start centers decide how many clusters exist; sizing the
        # buckets from ``k`` alone could index past the end of the lists.
        k = len(result)
        last_result = np.array([])
        count = 1
        final_result = None
        while count <= max_iter and self._check_stop(result, last_result):
            point_result = [[] for _ in range(k)]
            clusters = [[] for _ in range(k)]
            for index, point in enumerate(X):
                distance = [
                    self._call_distance(point, center, distance_function)
                    for center in result
                ]
                nearest = distance.index(min(distance))
                clusters[nearest].append(point.tolist())
                point_result[nearest].append(label[index])

            # An empty cluster keeps its previous center: dividing by a
            # member count of zero would yield NaN, which never compares
            # equal and therefore never lets the loop converge.
            temp_result = [
                np.mean(cluster, axis=0) if cluster else previous
                for cluster, previous in zip(clusters, result)
            ]
            last_result = result.copy()
            result = np.array(temp_result, dtype=float)
            if verbose != 0:
                print('第{}次循环的中心点为:{}'.format(count, result))
                print('第{}次循环的中心为:{}'.format(count, point_result))
                print('\n')
            count += 1
            final_result = point_result
            self._final_point = clusters

        print('最终的聚类结果为:')
        print(final_result)
        self._final_center = result
        self._final_result = final_result
        self._distance_function = distance_function

    def predict(self, x):
        """Return the index of the nearest fitted center for every point.

        The metric passed to the last :meth:`fit` call is used.

        Args:
            x: Iterable of points.

        Returns:
            A list with one cluster index per point.

        Raises:
            RuntimeError: If :meth:`fit` has not been called yet.
        """
        if self._final_center is None:
            raise RuntimeError('fit() must be called before predict()')
        predict_result = []
        for data in x:
            distance = [
                self._call_distance(data, center, self._distance_function)
                for center in self._final_center
            ]
            predict_result.append(distance.index(min(distance)))
        return predict_result

    def evaluate(self):
        """Print and return the average within-cluster distance.

        For every cluster the mean distance between its members and its
        center is computed (0 for an empty cluster); the score is the mean
        of these per-cluster values. The metric of the last :meth:`fit`
        call is used.

        Returns:
            The score as a float (it is also printed).

        Raises:
            RuntimeError: If :meth:`fit` has not been called yet.
        """
        if self._final_point is None or self._final_center is None:
            raise RuntimeError('fit() must be called before evaluate()')
        sse = 0.0
        for points, center in zip(self._final_point, self._final_center):
            if not points:
                continue
            # Average each cluster on its own; dividing the running total
            # would re-scale the contribution of earlier clusters.
            cluster_total = sum(
                self._call_distance(point, center, self._distance_function)
                for point in points
            )
            sse += cluster_total / len(points)
        sse /= len(self._final_center)
        print(sse)
        return sse
# Example: calling the KMeans class defined above.
# Demo data: seven hand-written sample points, one 8-dimensional row each.
x = np.array([
    [2, 0, 4, 3, 0, 1, 0, 100],
    [0, 2, 4, 0, 2, 3, 0, 0],
    [4, 0, 1, 3, 0, 1, 0, 1],
    [0, 1, 0, 2, 0, 0, 1, 0],
    [0, 0, 2, 0, 0, 4, 0, 0],
    [1, 1, 0, 2, 0, 1, 1, 3],
    [2, 1, 3, 4, 0, 2, 0, 2],
])

# Number of clusters to build.
k = 2

# Fit with randomly chosen start centers, printing every iteration, then
# print the evaluation score of the resulting clustering.
km = KMeans()
km.fit(x, k=k, start_center=None, verbose=1)
km.evaluate()