#reference:https://github.com/fengdu78/Coursera-ML-AndrewNg-Notes/blob/master/code/ex7-kmeans%20and%20PCA/2-%202D%20kmeans.ipynb
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import scipy.io as sio
def combine_data_C(data,C):
    """Attach cluster labels to a copy of the samples.

    Args:
        data: DataFrame of samples; it is left unmodified.
        C: per-row cluster labels, stored in a column named 'C'.
    Returns:
        DataFrame: a new frame holding the columns of ``data`` plus 'C'.
    """
    # assign() works on a copy of the frame, so the caller's data is untouched.
    return data.assign(C=C)
def _find_your_cluster(x,centroids):
"""find the right cluster for x with respect to shortest distance
Args:
x: ndarray (n, ) -> n features # ???
centroids: ndarray (k, n)
Returns:
k: int
"""
distance = np.apply_along_axis(func1d=np.linalg.norm,axis=1,arr=centroids-x)
return np.argmin(distance)
def assign_cluster(data,centroids):
    """Assign every sample in data to its nearest centroid.

    Uses the same rule as ``_find_your_cluster`` (Euclidean distance, lowest
    index on ties) but evaluates all samples at once.

    Args:
        data: DataFrame (m, n) -> one sample per row
        centroids: ndarray (k, n) -> one centroid per row
    Returns:
        C: ndarray (m, ) -> for each row of ``data``, the row index of the
           closest centroid; empty when ``data`` has no rows
    """
    points = data.values
    centers = np.asarray(centroids)
    # (m, 1, n) - (1, k, n) broadcasts to (m, k, n): every sample is compared
    # against every centroid without a Python-level loop.
    offsets = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
    distances = np.linalg.norm(offsets, axis=2)
    return np.argmin(distances, axis=1)
def new_centroids(data,C):
    """Recompute each centroid as the mean of the samples assigned to it.

    Args:
        data: DataFrame (m, n) -> one sample per row; it is not modified
        C: array-like (m, ) -> cluster label of each row of ``data``
    Returns:
        ndarray (j, n): one mean row per label that occurs in ``C``, ordered
        by ascending label. A label with no samples produces no row.
    """
    # Grouping by the label array directly avoids copying the frame just to
    # add a temporary 'C' column. groupby sorts the group keys by default, so
    # the rows come out in ascending-label order.
    return data.groupby(np.asarray(C)).mean().values
def cost(data,centroids,C):
    """Average Euclidean distance between each sample and its own centroid.

    Args:
        data: DataFrame (m, n) -> one sample per row
        centroids: ndarray (k, n) -> one centroid per row
        C: ndarray (m, ) of int -> for each sample, the row of ``centroids``
           it is assigned to
    Returns:
        float: sum of the m sample-to-centroid distances divided by m
    Raises:
        ValueError: if ``data`` has no rows (the average is undefined)
    """
    m = data.shape[0]
    if m == 0:
        raise ValueError("cost is undefined for an empty data set")
    # Indexing with C repeats each sample's centroid, giving an (m, n) array
    # that lines up row by row with the data.
    expand_C_with_centroids = np.asarray(centroids)[np.asarray(C)]
    distance = np.linalg.norm(data.values - expand_C_with_centroids, axis=1)
    return distance.sum()/m
def _k_means_iter(data,k,epoch=100,tol=0.0001):
    """Run k-means once from a random start, stopping early on convergence.

    Args:
        data: DataFrame (m, n) -> one sample per row
        k: int -> number of clusters; the initial centroids are k rows
           sampled from ``data``
        epoch: int -> maximum number of assign/update rounds (at least 1)
        tol: float -> stop once the relative change of the cost between two
             consecutive rounds drops below this value
    Returns:
        (C, centroids, cost): labels ndarray (m, ), centroids ndarray (k, n)
        and the cost of the last round
    Raises:
        ValueError: if ``epoch`` is smaller than 1
    """
    if epoch < 1:
        raise ValueError("epoch must be at least 1")
    cost_progress = []
    centroids = data.sample(k).values
    for _ in range(epoch):
        C = assign_cluster(data,centroids)
        updated = new_centroids(data,C)
        occupied = np.unique(C)
        if len(occupied) < k:
            # A cluster lost all of its samples, so new_centroids returned
            # fewer than k rows. Keep the previous position of every empty
            # cluster so that label j still refers to centroid row j.
            refreshed = np.array(centroids,dtype=float)
            refreshed[occupied] = updated
            updated = refreshed
        centroids = updated
        cost_progress.append(cost(data,centroids,C))
        if len(cost_progress)>1:
            previous,current = cost_progress[-2:]
            # Compare the absolute change against tol*current instead of
            # dividing by current: this stays well defined when the cost
            # reaches 0, and a cost increase no longer counts as converged.
            change = np.abs(previous-current)
            if change == 0 or change < tol*current:
                break
    return C,centroids,cost_progress[-1]
def k_means(data,k,epoch=50,n_init=10):
    """Run k-means n_init times and keep the run with the lowest cost.

    Args:
        data: DataFrame (m, n) -> one sample per row
        k: int -> number of clusters
        epoch: int -> maximum number of rounds for each run
        n_init: int -> number of independent runs (at least 1)
    Returns:
        (C, centroids, cost) of the best run, as a tuple: cluster labels,
        cluster centers and the final cost. When several runs share the
        lowest cost, the earliest one is returned.
    Raises:
        ValueError: if ``n_init`` is smaller than 1
    """
    if n_init < 1:
        raise ValueError("n_init must be at least 1")
    tries = [_k_means_iter(data,k,epoch) for _ in range(n_init)]
    # Every try is a (labels, centroids, cost) triple whose members have
    # different shapes, so the tries stay in a plain list; packing them into
    # one ndarray fails on current NumPy because the shape is inhomogeneous.
    return min(tries,key=lambda attempt: attempt[-1])
# def random_init(data,k):
# """choose k sample from data set as init centroids
# Args:
# data: DataFrame
# k: int
# Returns:
# k samples: ndarray
# """
# return data.sample(k).as_matrix()
if __name__=='__main__':
    # Demo: cluster the 2-D points stored under key 'X' of ex7data2.mat into
    # 3 groups and show them coloured by cluster.
    mat = sio.loadmat('./ex7data2.mat')
    data2 = pd.DataFrame(mat.get('X'),columns=['X1','X2'])
    print(data2.head())

    # --- Step-by-step walkthrough of the helpers, kept for reference ---
    # sns.set()
    # sns.lmplot(x='X1',y='X2',data=data2,fit_reg=False)
    # # plt.show()
    #
    # init_centroids = data2.sample(3).values
    # print(init_centroids)
    #
    # # visualize the data first:
    # fig,ax = plt.subplots(figsize=(6,4))
    # ax.scatter(x=init_centroids[:,0],y= init_centroids[:,1])
    # for i,node in enumerate(init_centroids):
    #     ax.annotate("{}: ({},{})".format(i,node[0],node[1]),node)
    #
    # x = np.array([1,1])
    # ax.scatter(x[0],x[1],marker='x',s=200)
    # # plt.show()
    #
    # ans = _find_your_cluster(x,init_centroids)
    # # print(ans)
    # ans2 = assign_cluster(data2,init_centroids)
    # print(ans2)
    # ans2c = combine_data_C(data2,ans2)
    # print(ans2c)
    # ans3 = ans2c.groupby('C',as_index=False).mean().sort_values('C').drop('C',axis=1).values
    # print(ans3)
    #
    # print(ans3[ans2].shape)
    # # data2 = pd.DataFrame(mat.get('X'),columns=['X1','X2'])
    # ans4 = cost(data2,ans3,ans2)
    # print(ans4)

    best_C,best_centroids,least_cost = k_means(data2,3)
    data_with_c = combine_data_C(data2,best_C)
    sns.set()
    # x, y and data are passed by keyword: recent seaborn releases no longer
    # accept x and y as positional arguments.
    sns.lmplot(x='X1',y='X2',data=data_with_c,hue='C',fit_reg=False)
    plt.show()
Broadcasting
https://docs.scipy.org/doc/numpy-1.13.0/user/basics.broadcasting.html
numpy.argmax
https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.argmax.html
numpy.apply_along_axis
https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.apply_along_axis.html
numpy.linalg.norm
https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.linalg.norm.html
https://zhuanlan.zhihu.com/p/33217726
Pandas.DataFrame.sample
https://zhuanlan.zhihu.com/p/38255793
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html
pandas.DataFrame.values
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.values.html#pandas-dataframe-values
pandas.DataFrame.copy
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.copy.html?highlight=copy#pandas-dataframe-copy
pandas groupby
http://pandas.pydata.org/pandas-docs/stable/groupby.html
pandas drop
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html
pandas shape
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.shape.html
pandas mean vs numpy mean
https://stackoverflow.com/questions/31037298/pandas-get-column-average-mean
seaborn.set .lmplot()
https://seaborn.pydata.org/generated/seaborn.set.html?highlight=set#seaborn.set
https://seaborn.pydata.org/generated/seaborn.lmplot.html?highlight=lmplot#seaborn.lmplot (regression plot)
seaborn accepts data as a pandas DataFrame or as NumPy arrays, whereas matplotlib works on NumPy arrays: https://www.jianshu.com/p/4b925654f506