```python
import numpy as np
class KMeans(object):
    """Plain k-means clustering using Euclidean distance.

    Parameters
    ----------
    n_clusters : int
        Number of clusters (k). When explicit centroids are passed through
        ``initCent`` this is replaced by the number of rows of that array.
    initCent : {'random'} or array-like
        Centroid initialisation. ``'random'`` (the default) draws each
        centroid uniformly inside the per-feature bounding box of the data;
        an array (or list/tuple of rows) supplies the starting centroids.
    max_iter : int
        Maximum number of assign/update iterations.

    Attributes
    ----------
    centroids : ndarray of shape (n_clusters, n_features) or None
        Current centroids; ``None`` until ``fit`` runs with random init.
    clusterAssment : ndarray of shape (n_samples, 2) or None
        Column 0 holds the assigned cluster index, column 1 the squared
        distance to that cluster's centroid.
    labels : ndarray or None
        Column 0 of ``clusterAssment`` (stored as floats).
    sse : float or None
        Sum of column 1 of ``clusterAssment``.
    """

    def __init__(self, n_clusters=5, initCent='random', max_iter=300):
        if hasattr(initCent, '__array__') or isinstance(initCent, (list, tuple)):
            # np.array copies, so fit() never mutates the caller's array.
            # The builtin float replaces np.float, removed in NumPy 1.24.
            self.centroids = np.array(initCent, dtype=float)
            n_clusters = self.centroids.shape[0]
        else:
            self.centroids = None
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.initCent = initCent
        self.clusterAssment = None
        self.labels = None
        self.sse = None

    def _distEclud(self, vecA, vecB):
        """Return the Euclidean distance between two vectors."""
        return np.linalg.norm(vecA - vecB)

    def _randCent(self, X, k):
        """Return ``k`` random centroids inside the bounding box of ``X``.

        The random draws are taken feature by feature (``rand(n, k).T``) so
        that a seeded run consumes the generator in the same order as a
        per-column loop would.
        """
        n = X.shape[1]
        lower = X.min(axis=0)
        span = X.max(axis=0) - lower
        return lower + span * np.random.rand(n, k).T

    @staticmethod
    def _asArray(X):
        """Coerce ``X`` to an ndarray, raising TypeError when impossible."""
        if isinstance(X, np.ndarray):
            return X
        try:
            return np.asarray(X)
        except Exception as exc:
            raise TypeError("numpy.ndarray required for X") from exc

    def _closest(self, point):
        """Return ``(index, distance)`` of the centroid nearest to ``point``.

        Ties go to the lowest index. If no distance compares below infinity
        (e.g. NaN input) the result is ``(-1, inf)``.
        """
        minDist = np.inf
        minIndex = -1
        for j in range(self.n_clusters):
            distJI = self._distEclud(self.centroids[j, :], point)
            if distJI < minDist:
                minDist = distJI
                minIndex = j
        return minIndex, minDist

    def fit(self, X):
        """Cluster ``X`` and store centroids, assignments, labels and SSE.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Raises
        ------
        TypeError
            If ``X`` cannot be converted to an ndarray.
        ValueError
            If ``initCent`` is neither ``'random'`` nor an array of centroids.
        """
        X = self._asArray(X)
        m = X.shape[0]
        # Start every sample as "unassigned" (-1) so the first pass always
        # registers a change instead of comparing against uninitialised memory.
        self.clusterAssment = np.zeros((m, 2))
        self.clusterAssment[:, 0] = -1
        # Guard with isinstance: comparing an ndarray to a str is elementwise.
        if isinstance(self.initCent, str) and self.initCent == 'random':
            self.centroids = self._randCent(X, self.n_clusters)
        if self.centroids is None:
            raise ValueError(
                "initCent must be 'random' or an array of initial centroids")
        for _ in range(self.max_iter):
            clusterChanged = False
            for i in range(m):
                minIndex, minDist = self._closest(X[i, :])
                if self.clusterAssment[i, 0] != minIndex:
                    clusterChanged = True
                # Always refresh the squared distance: centroids may have
                # moved even when the label did not change.
                self.clusterAssment[i, :] = minIndex, minDist ** 2
            if not clusterChanged:
                break
            for j in range(self.n_clusters):
                ptsInClust = X[self.clusterAssment[:, 0] == j]
                # An empty cluster keeps its previous centroid; averaging
                # zero points would turn it into NaN.
                if ptsInClust.shape[0] > 0:
                    self.centroids[j, :] = np.mean(ptsInClust, axis=0)
        self.labels = self.clusterAssment[:, 0]
        self.sse = np.sum(self.clusterAssment[:, 1])

    def predict(self, X):
        """Return the index of the nearest centroid for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Cluster indices, stored as floats.

        Raises
        ------
        TypeError
            If ``X`` cannot be converted to an ndarray.
        """
        X = self._asArray(X)
        m = X.shape[0]
        preds = np.empty((m,))
        for i in range(m):
            preds[i] = self._closest(X[i, :])[0]
        return preds
class biKMeans(object):
    """Bisecting k-means: repeatedly split one cluster in two with 2-means.

    Starting from a single cluster containing every sample, each round runs
    ``KMeans(n_clusters=2)`` on every current cluster and keeps the split
    that gives the lowest total SSE, until ``n_clusters`` clusters exist or
    no cluster can be split any further.

    Parameters
    ----------
    n_clusters : int
        Target number of clusters.

    Attributes
    ----------
    centroids : ndarray or None
        One row per cluster found by ``fit``.
    clusterAssment : ndarray of shape (n_samples, 2) or None
        Column 0 holds the assigned cluster index, column 1 the squared
        distance to that cluster's centroid.
    labels : ndarray or None
        Column 0 of ``clusterAssment`` (stored as floats).
    sse : float or None
        Sum of column 1 of ``clusterAssment``.
    """

    def __init__(self, n_clusters=5):
        self.n_clusters = n_clusters
        self.centroids = None
        self.clusterAssment = None
        self.labels = None
        self.sse = None

    def _distEclud(self, vecA, vecB):
        """Return the Euclidean distance between two vectors."""
        return np.linalg.norm(vecA - vecB)

    @staticmethod
    def _asArray(X):
        """Coerce ``X`` to an ndarray, raising TypeError when impossible."""
        if isinstance(X, np.ndarray):
            return X
        try:
            return np.asarray(X)
        except Exception as exc:
            raise TypeError("numpy.ndarray required for X") from exc

    def fit(self, X):
        """Cluster ``X`` by successive bisection.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Raises
        ------
        TypeError
            If ``X`` cannot be converted to an ndarray.
        """
        X = self._asArray(X)
        m = X.shape[0]
        self.clusterAssment = np.zeros((m, 2))
        # Everything starts in cluster 0, centred on the global mean.
        centroid0 = np.mean(X, axis=0)
        centList = [centroid0.tolist()]
        for j in range(m):
            self.clusterAssment[j, 1] = self._distEclud(centroid0, X[j, :]) ** 2
        while len(centList) < self.n_clusters:
            lowestSSE = np.inf
            best = None
            for i in range(len(centList)):
                inCluster = self.clusterAssment[:, 0] == i
                ptsInCurrCluster = X[inCluster, :]
                # A cluster with fewer than two points cannot be bisected.
                if ptsInCurrCluster.shape[0] < 2:
                    continue
                clf = KMeans(n_clusters=2)
                clf.fit(ptsInCurrCluster)
                centroidMat, splitClustAss = clf.centroids, clf.clusterAssment
                # Reject degenerate splits whose centroids are not finite.
                if not np.all(np.isfinite(centroidMat)):
                    continue
                sseSplit = np.sum(splitClustAss[:, 1])
                sseNotSplit = np.sum(self.clusterAssment[~inCluster, 1])
                totalSSE = sseSplit + sseNotSplit
                if totalSSE < lowestSSE:
                    lowestSSE = totalSSE
                    best = (i, centroidMat, splitClustAss.copy())
            if best is None:
                # No cluster can be split; stop with fewer clusters rather
                # than reusing a stale or undefined split.
                break
            bestCentToSplit, bestNewCents, bestClustAss = best
            # Compute both masks before relabelling so the two assignments
            # cannot interfere with each other.
            secondChild = bestClustAss[:, 0] == 1
            firstChild = bestClustAss[:, 0] == 0
            bestClustAss[secondChild, 0] = len(centList)
            bestClustAss[firstChild, 0] = bestCentToSplit
            toReplace = self.clusterAssment[:, 0] == bestCentToSplit
            centList[bestCentToSplit] = bestNewCents[0, :].tolist()
            centList.append(bestNewCents[1, :].tolist())
            self.clusterAssment[toReplace, :] = bestClustAss
        self.labels = self.clusterAssment[:, 0]
        self.sse = np.sum(self.clusterAssment[:, 1])
        self.centroids = np.asarray(centList)

    def predict(self, X):
        """Return the index of the nearest centroid for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Cluster indices, stored as floats.

        Raises
        ------
        TypeError
            If ``X`` cannot be converted to an ndarray.
        """
        X = self._asArray(X)
        m = X.shape[0]
        preds = np.empty((m,))
        # Iterate over the centroids that actually exist: fit() may stop
        # with fewer than n_clusters when nothing more can be split.
        nCentroids = len(self.centroids)
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(nCentroids):
                distJI = self._distEclud(self.centroids[j, :], X[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            preds[i] = minIndex
        return preds