K-means实践作业记录

#reference:https://github.com/fengdu78/Coursera-ML-AndrewNg-Notes/blob/master/code/ex7-kmeans%20and%20PCA/2-%202D%20kmeans.ipynb
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import scipy.io as sio

def combine_data_C(data,C):
    """Attach cluster labels to a copy of the samples.

    Args:
        data: DataFrame of samples; it is left unmodified
        C: cluster labels, one per row of ``data``
    Returns:
        DataFrame: a new frame with the columns of ``data`` plus column 'C'
    """
    # assign() builds a new frame, so the caller's DataFrame is untouched.
    return data.assign(C=C)

def _find_your_cluster(x,centroids):
    """Find the cluster for x, i.e. the centroid at the shortest Euclidean distance.

    Args:
        x: ndarray (n, ) -> one sample with n features
        centroids: ndarray (k, n) -> one centroid per row
    Returns:
        k: int, row index into ``centroids`` of the nearest centroid
           (np.argmin returns the first minimum when distances tie)
    """
    # Broadcasting subtracts x from every centroid row; one vectorized norm
    # along axis 1 replaces the per-row Python callback of apply_along_axis.
    distance = np.linalg.norm(centroids - x, axis=1)
    return np.argmin(distance)

def assign_cluster(data,centroids):
    """Assign every sample in data to its nearest centroid (Euclidean distance).

    Args:
        data: DataFrame or ndarray (m, n) -> m samples with n features
        centroids: ndarray (k, n) -> one centroid per row
    Returns:
        C: ndarray (m, ) of int, the centroid row index chosen for each sample
           (empty when ``data`` has no rows)
    """
    points = np.asarray(data)
    centers = np.asarray(centroids)
    # (m, 1, n) - (1, k, n) broadcasts to (m, k, n); taking the norm over the
    # feature axis yields an (m, k) table of sample-to-centroid distances.
    distance = np.linalg.norm(
        points[:, np.newaxis, :] - centers[np.newaxis, :, :], axis=2)
    return np.argmin(distance, axis=1)

def new_centroids(data,C):
    """Recompute each centroid as the mean of the samples assigned to it.

    Args:
        data: DataFrame (m, n) -> m samples with n features
        C: ndarray (m, ) -> cluster label of each sample
    Returns:
        ndarray (j, n): one row per distinct label in ``C``, ordered by
        ascending label. A cluster that received no samples has no row,
        so j can be smaller than the number of clusters requested.
    """
    # Grouping on the label array itself avoids a temporary 'C' column;
    # groupby sorts the group keys, so row i belongs to the i-th smallest label.
    return data.groupby(np.asarray(C)).mean().values

def cost(data,centroids,C):
    """Mean Euclidean distance between each sample and its assigned centroid.

    Args:
        data: DataFrame or ndarray (m, n) -> m samples with n features
        centroids: ndarray (k, n) -> one centroid per row
        C: ndarray (m, ) of int -> centroid row index assigned to each sample
    Returns:
        float: sum of the sample-to-centroid distances divided by m
    Raises:
        ValueError: if ``data`` has no rows
    """
    points = np.asarray(data)
    m = points.shape[0]
    if m == 0:
        raise ValueError("cost is undefined for an empty data set")
    # Fancy indexing expands the (k, n) centroids into an (m, n) array holding,
    # for every sample, the centroid it was assigned to.
    expand_C_with_centroids = np.asarray(centroids)[C]
    distance = np.linalg.norm(points - expand_C_with_centroids, axis=1)
    return distance.sum()/m

def _k_means_iter(data,k,epoch=100,tol=0.0001):
    """Run k-means once from a random initialisation, with early break.

    Args:
        data: DataFrame (m, n) -> m samples with n features
        k: int, number of clusters; k rows of ``data`` are sampled as the
           initial centroids
        epoch: int, maximum number of assign/update rounds (at least 1)
        tol: float, stop once the relative cost decrease between two
             consecutive rounds drops below this value
    Returns:
        (C, centroids, cost): the labels from the last assignment step, the
        centroids recomputed from those labels, and the cost of that pair
    Raises:
        ValueError: if ``epoch`` is smaller than 1
    """
    if epoch < 1:
        # Without at least one round there would be no labels to return.
        raise ValueError("epoch must be at least 1")

    cost_progress = []
    centroids = data.sample(k).values

    for _ in range(epoch):
        C = assign_cluster(data,centroids)
        centroids = new_centroids(data,C)
        cost_progress.append(cost(data,centroids,C))

        if len(cost_progress)>1:
            previous,current = cost_progress[-2],cost_progress[-1]
            # The cost is a mean of distances, hence never negative. A cost of
            # zero cannot improve any further, and testing for it first avoids
            # the 0/0 division that used to keep the loop running.
            if current == 0 or (previous - current)/current < tol:
                break

    return C,centroids,cost_progress[-1]

def k_means(data,k,epoch=50,n_init=10,tol=0.0001):
    """Run k-means ``n_init`` times and keep the attempt with the lowest cost.

    Args:
        data: DataFrame (m, n) -> m samples with n features
        k: int, number of clusters
        epoch: int, maximum number of rounds per attempt
        n_init: int, number of independent random initialisations (at least 1)
        tol: float, relative cost decrease used as the early-stop threshold
    Returns:
        (C, centroids, cost) of the best attempt: labels, centroid array and
        final cost, returned as a tuple that unpacks into three values
    Raises:
        ValueError: if ``n_init`` is smaller than 1
    """
    if n_init < 1:
        raise ValueError("n_init must be at least 1")
    # Keep the attempts in a plain list: each one is a (labels, centroids, cost)
    # tuple of differently shaped items, which np.array cannot stack.
    tries = [_k_means_iter(data,k,epoch,tol) for _ in range(n_init)]
    # Same progress line as before: number of attempts x items per attempt.
    print("tries",(len(tries),len(tries[0])))
    # min() returns the first attempt with the smallest cost (last tuple item).
    return min(tries,key=lambda attempt: attempt[-1])



# def random_init(data,k):
#     """choose k sample from data set as init centroids
#      Args:
#          data: DataFrame
#          k: int
#      Returns:
#          k samples: ndarray
#      """
#     return data.sample(k).as_matrix()




if __name__=='__main__':
    # Demo: cluster the 2-D samples stored under key 'X' of ex7data2.mat into
    # three groups and plot them coloured by cluster label.

    mat = sio.loadmat('./ex7data2.mat')
    # Index with [] so that a missing 'X' key fails right here with a KeyError
    # instead of silently building an empty DataFrame from None.
    data2 = pd.DataFrame(mat['X'],columns=['X1','X2'])
    print(data2.head())

    # Exploratory steps kept for reference:
    # sns.set()
    # sns.lmplot(x='X1',y='X2',data=data2,fit_reg=False)
    # # plt.show()
    #
    # init_centroids = data2.sample(3).values
    # print(init_centroids)
    #
    # # visualize the data first:
    # fig,ax = plt.subplots(figsize=(6,4))
    # ax.scatter(x=init_centroids[:,0],y= init_centroids[:,1])
    # for i,node in enumerate(init_centroids):
    #     ax.annotate("{}: ({},{})".format(i,node[0],node[1]),node)
    #
    # x = np.array([1,1])
    # ax.scatter(x[0],x[1],marker='x',s=200)
    # # plt.show()
    #
    # ans = _find_your_cluster(x,init_centroids)
    # # print(ans)
    # ans2 = assign_cluster(data2,init_centroids)
    # print(ans2)
    # ans2c = combine_data_C(data2,ans2)
    # print(ans2c)
    # ans3 = ans2c.groupby('C',as_index=False).mean().sort_values('C').drop('C',axis=1).values
    # print(ans3)
    #
    # print(ans3[ans2].shape)
    # # data2 = pd.DataFrame(mat.get('X'),columns=['X1','X2'])
    # ans4 = cost(data2,ans3,ans2)
    # print(ans4)

    best_C,best_centroids,least_cost = k_means(data2,3)
    data_with_c = combine_data_C(data2,best_C)
    sns.set()
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them as positional arguments.
    sns.lmplot(x='X1',y='X2',data=data_with_c,hue='C',fit_reg=False)
    plt.show()

Broadcasting
https://docs.scipy.org/doc/numpy-1.13.0/user/basics.broadcasting.html

numpy.argmax(上面的代码实际使用的是用法相同、取最小值下标的 numpy.argmin)
https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.argmax.html

numpy.apply_along_axis
https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.apply_along_axis.html

numpy.linalg.norm
https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.linalg.norm.html
https://zhuanlan.zhihu.com/p/33217726

Pandas.DataFrame.sample
https://zhuanlan.zhihu.com/p/38255793
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html

pandas.DataFrame.values
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.values.html#pandas-dataframe-values

pandas.DataFrame.copy
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.copy.html?highlight=copy#pandas-dataframe-copy

pandas groupby
http://pandas.pydata.org/pandas-docs/stable/groupby.html

pandas drop
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

pandas shape
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.shape.html

pandas mean vs numpy mean
https://stackoverflow.com/questions/31037298/pandas-get-column-average-mean

seaborn.set .lmplot()
https://seaborn.pydata.org/generated/seaborn.set.html?highlight=set#seaborn.set
https://seaborn.pydata.org/generated/seaborn.lmplot.html?highlight=lmplot#seaborn.lmplot (回归)

seaborn 的数据可以是 pandas DataFrame 或 numpy 数组,而 matplotlib 的数据为 numpy 数组:https://www.jianshu.com/p/4b925654f506

scipy.io
https://docs.scipy.org/doc/scipy/reference/io.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值