Applied Multivariate Statistical Analysis (6) Chapter 6: Cluster Analysis

Class code, organized and annotated

import numpy as np
import pandas as pd
import scipy.spatial.distance as dis
import math
import scipy as scipy
from scipy.cluster.hierarchy import linkage , dendrogram
from sklearn.cluster import AgglomerativeClustering, KMeans



dat = pd.read_excel("F:\\基础数学课\\应用多元统计分析\\examp6.3.3.xlsx",header = 1)
dat = dat.set_index('region')

#### ---------------------------------------------
# numpy & pandas
# cosine similarity (angle cosine)
#### ---------------------------------------------

## cosine similarity (cosine of the angle between two vectors)
def cosv(x1,x2):
    return sum(x1*x2)/np.sqrt(sum(x1**2)*sum(x2**2))

cosv(dat['x1'],dat['x2'])

d_12 = 1 - cosv(dat['x1'],dat['x2'])**2
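# Formulas behind the two lines above (added note): the angle cosine between variables x1 and x2 is
#   cos(theta) = sum(x1*x2) / sqrt(sum(x1**2) * sum(x2**2)),
# and d_12 = 1 - cos(theta)**2 is the derived distance; dcosv() defined below takes
# its square root, i.e. sqrt(1 - cos(theta)**2).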

discovX = pd.DataFrame(np.zeros((8,8)))

## cosine similarity matrix between the 8 variables (columns)
for j in range(8):
    discov=[]
    for i in range(8):
        discov.append(cosv(dat.iloc[:,j],dat.iloc[:,i]))
    discovX.iloc[j,:] = discov

distance_cosine = 1 - discovX
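# A vectorized alternative to the loop above (a sketch; should reproduce distance_cosine):
# scale each column to unit length, then 1 minus the matrix of pairwise cosines.
Xn = dat.values / np.sqrt((dat.values**2).sum(axis=0))   # columns with unit Euclidean norm
distance_cosine_vec = pd.DataFrame(1 - Xn.T @ Xn)
np.allclose(distance_cosine_vec, distance_cosine)        # expected to be True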

##  the agg() function

# cosine distance (sqrt(1 - cos^2)); defined here so the agg() calls below can use it
def dcosv(x1,x2):
    return (1-(sum(x1*x2)/np.sqrt(sum(x1**2)*sum(x2**2)))**2)**0.5

dcosv(dat['x1'],dat['x2'])

dat.agg(np.std,axis=1)                       # np.std of each row
dat.agg(dcosv,axis=0,x2=dat['x1'])           # cosine distance of each column to x1
dat.agg(lambda x:dcosv(x,dat['x1']),axis=0)  # same result as the previous line

# full matrix of 1 - |cos| between all pairs of columns
dat.agg(lambda y:dat.agg(lambda x:1-abs(cosv(x,y))))

## Canberra (Lance-Williams) distance
def dlan(xi,xj):
    return np.mean(np.abs(xi-xj)/(xi+xj))

dlan(dat.iloc[0,:],dat.iloc[1,:])

# Canberra distance matrix between the 31 regions
Dlan = dat.agg(lambda x:dat.agg(lambda y:dlan(x,y),axis=1),axis=1)
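# Relation to scipy (added note): scipy's 'canberra' metric sums |xi-xj|/(|xi|+|xj|) over the
# variables, while dlan() above averages, so for this positive data scipy gives 8 * dlan.
dis.pdist(dat.iloc[:2,:],'canberra')     # should equal 8 * dlan(dat.iloc[0,:], dat.iloc[1,:])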


#### ---------------------------------------------
#### Summary of common distances (a runnable sketch follows this list)
#### ---------------------------------------------

# scipy.spatial.distance (X below stands for the data matrix)
# Minkowski distance (choose the order p)
dis.pdist(X,'minkowski',p=...)

# absolute (city-block / Manhattan) distance
dis.pdist(X,'cityblock')

# Euclidean distance
dis.pdist(X,'euclidean')

# Chebyshev distance
dis.pdist(X,'chebyshev')

# Mahalanobis distance
dis.pdist(X,'mahalanobis')

# Canberra (Lance-Williams) distance
dis.pdist(X,'canberra')

# correlation-coefficient distance
dis.pdist(X,'correlation')

# cosine distance
dis.pdist(X,'cosine')
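# A runnable sketch of the calls above (an addition; uses the example data already loaded as dat):
Xd = dat.values
d_mink = dis.pdist(Xd, 'minkowski', p=3)     # Minkowski, order p = 3 chosen only for illustration
d_city = dis.pdist(Xd, 'cityblock')          # absolute / city-block
d_eucl = dis.pdist(Xd, 'euclidean')          # Euclidean
d_cheb = dis.pdist(Xd, 'chebyshev')          # Chebyshev
d_maha = dis.pdist(Xd, 'mahalanobis')        # Mahalanobis (covariance estimated from Xd)
d_canb = dis.pdist(Xd, 'canberra')           # Canberra (Lance-Williams)
d_corr = dis.pdist(Xd, 'correlation')        # correlation distance
d_cos  = dis.pdist(Xd, 'cosine')             # cosine distance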

#### ---------------------------------------------
#### Application: cosine distance
#### ---------------------------------------------

# pdist: all pairwise distances within one matrix (condensed vector); cdist: distances between two matrices
dis.pdist(dat[['x1','x2']].T,'cosine')

dat[['x1','x2']].T
dat[['x1','x2']].values.reshape(2,31)   # note: reshape reads the values row by row, so this is NOT the same as .T

dis.cdist(dat['x1'].values.reshape((1,31)),dat['x2'].values.reshape((1,31)),'cosine')

dis.pdist(dat.T,'cosine')
pd.DataFrame(dis.cdist(dat.T,dat.T,'cosine'))
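# A sketch (an addition) for converting between pdist's condensed vector and a square matrix,
# using scipy's squareform:
cos_condensed = dis.pdist(dat.T,'cosine')                     # 8*7/2 = 28 pairwise values
cos_square = pd.DataFrame(dis.squareform(cos_condensed),
                          index=dat.columns, columns=dat.columns)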


#### ---------------------------------------------
#### Application: Euclidean distance pdist(X,'euclidean')
#### ---------------------------------------------

# Data: Example 6.3.3

# step 1: compute the distance matrix, then find the smallest nonzero distance and its position
eu_distance = pd.DataFrame(dis.cdist(dat,dat,'euclidean'))
eu_distance.index = eu_distance.columns = dat.index
np.where(eu_distance == min(eu_distance.values.reshape(-1)[eu_distance.values.reshape(-1)!=0]))[0].tolist()
eu_distance.index[[3,27]].tolist()      # the closest pair of regions sits at positions 3 and 27

# step 2: delete the rows and columns of the two merged regions
ou_D1 = np.delete(eu_distance.values,[3,27],0)    ## np.delete(): axis 0 = rows, axis 1 = columns
ou_D2 = np.delete(ou_D1,[3,27],1)
ou_D2.shape

# single-linkage (minimum distance) rule: distance from the new class to each remaining region
D_min = pd.DataFrame(np.delete(eu_distance.iloc[[3,27],:].values,[3,27],1)).min()
D_min.values.shape
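# A sketch (an addition) of finishing this merge step: append the new class and its
# single-linkage distances to the reduced matrix, giving the matrix for the next iteration.
new_row = D_min.values                                   # distances from the merged class {3,27} to the rest
D_next = np.block([[ou_D2, new_row.reshape(-1,1)],
                   [new_row.reshape(1,-1), np.zeros((1,1))]])
D_next.shape                                             # (30, 30): one class fewer than the original 31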

#### ---------------------------------------------
#### Finding the pair with the minimum distance: a summary of methods
#### ---------------------------------------------
import math
## step 1: find the minimum distance
# My approach: mask the diagonal with a value larger than every distance, then take the minimum
np.min(np.min(pd.DataFrame(np.eye(31)*math.ceil(np.max(np.max(eu_distance)))+eu_distance.values)))

# Jin Lin's method 1: the 31 diagonal zeros fill the first sorted positions and the minimum
# distance appears twice (symmetry), so the value at sorted index 32 is the smallest nonzero distance
np.partition(eu_distance.values.reshape(-1),32)[32]

# Jin Lin's method 2: drop the zeros, then take the minimum
min(eu_distance.values.reshape(-1)[eu_distance.values.reshape(-1)!=0])

# Jin Lin's method 3: the second smallest among the unique values  ## used in this class
eu_min = sorted(set(eu_distance.values.reshape(-1)))[1]

## step 2: find the position of the minimum distance
np.where(eu_distance == min(eu_distance.values.reshape(-1)[eu_distance.values.reshape(-1)!=0]))[0].tolist()

np.where(eu_distance == eu_min)[0]

eu_distance.index[[3,27]].tolist()
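# Another way to locate the closest pair (a sketch, added for reference): mask the diagonal
# with +inf and unravel the argmin of the flattened matrix.
D = eu_distance.values.copy()
np.fill_diagonal(D, np.inf)
i, j = np.unravel_index(np.argmin(D), D.shape)
eu_distance.index[[i, j]].tolist()     # should again give the pair at positions 3 and 27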

#### ---------------------------------------------
#### Hierarchical clustering: between-class distances
#### ---------------------------------------------

# Summary of between-class distance (linkage) methods; a usage sketch follows this list

# single linkage (minimum distance)
scipy.cluster.hierarchy.single

# complete linkage (maximum distance)
scipy.cluster.hierarchy.complete

# median method
scipy.cluster.hierarchy.median

# centroid method
scipy.cluster.hierarchy.centroid

# class-average method
scipy.cluster.hierarchy.average

# flexible class-average / flexible method? (scipy: WPGMA, weighted average)
scipy.cluster.hierarchy.weighted

# Ward's method (sum of squared deviations)
scipy.cluster.hierarchy.ward
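# A usage sketch for the methods above (an addition): each name is also accepted by linkage(),
# and fcluster() cuts the resulting tree into a chosen number of classes.
from scipy.cluster.hierarchy import fcluster
X_eu = dis.pdist(dat,'euclidean')
Z = linkage(X_eu, method='complete')                    # swap in 'single', 'average', 'ward', ...
labels_k3 = fcluster(Z, t=3, criterion='maxclust')      # flat labels for 3 classes
pd.DataFrame(labels_k3, dat.index)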

#### ---------------------------------------------
#### Application: hierarchical clustering with scipy
#### ---------------------------------------------

import scipy as scipy
from scipy.cluster.hierarchy import linkage , dendrogram
import matplotlib.pyplot as plt

X = dis.pdist(dat,'euclidean')

cluster = linkage(X,'single')                  # single linkage via linkage()
cluster = scipy.cluster.hierarchy.single(X)    # equivalent shortcut: same linkage matrix
dendrogram(cluster)
plt.show()

# Hierarchical clustering with sklearn: AgglomerativeClustering plus a dendrogram helper
def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack([model.children_, model.distances_,counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

# dat_std: standardized data; defined here so this block runs on its own (vq.whiten is used again below)
import scipy.cluster.vq as vq
dat_std = vq.whiten(dat)

est2 = AgglomerativeClustering(distance_threshold=0,n_clusters=None,linkage='average').fit(dat_std)
plot_dendrogram(est2 ,truncate_mode='level',p=3)

est3 = AgglomerativeClustering(distance_threshold=0,n_clusters=None,linkage='ward').fit(dat_std)
plot_dendrogram(est3,truncate_mode='level',p=3)
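# A sketch (an addition) for flat labels straight from sklearn's hierarchical clustering:
# fix the number of classes instead of a distance threshold.
labels_ward = AgglomerativeClustering(n_clusters=3, linkage='ward').fit_predict(dat_std)
pd.DataFrame(labels_ward, dat.index)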


#### ---------------------------------------------
#### k-means clustering with scipy
#### ---------------------------------------------
import scipy.cluster.vq as vq
dat = pd.read_excel("F:\\基础数学课\\应用多元统计分析\\examp6.3.3.xlsx",header = 1)
dat = dat.set_index('region')

dat_std = vq.whiten(dat)              # scale each column to unit variance

Cs, _ = vq.kmeans(dat_std,3)          # k = 3 centroids (codebook)

cluster1 = vq.vq(dat_std,Cs)[0]       # cluster assignment of each region
distance = vq.vq(dat_std,Cs)[1]       # distance of each region to its assigned centroid

pd.DataFrame(cluster1 , dat.index)
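# A related scipy routine (an addition, not used in class): vq.kmeans2 returns the labels
# directly, so the separate vq.vq() step is unnecessary.
centroids2, labels2 = vq.kmeans2(dat_std, 3)
pd.DataFrame(labels2, dat.index)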

#### ---------------------------------------------
#### k-means clustering with sklearn
#### ---------------------------------------------

from sklearn.cluster import AgglomerativeClustering, KMeans

est1 = KMeans(n_clusters=3).fit(dat_std)
cluster2 = est1.labels_
pd.DataFrame(cluster2,dat.index)

est1.labels_
est1.predict([[x1,x2,x3,x4,x5,x6,x7,x8]])   # predict a new standardized observation; x1..x8 are placeholder values
est1.cluster_centers_
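# A usage sketch of predict() on actual data (an addition): pass rows of the standardized
# matrix, e.g. the first region, and KMeans returns its cluster label.
est1.predict(dat_std[:1])      # dat_std is a numpy array, so dat_std[:1] is one 8-feature row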

np.column_stack((cluster1,cluster2,dat.index))   # scipy labels, sklearn labels, and region names side by side

Homework 6

import numpy as np
import pandas as pd
import scipy.spatial.distance as dis
import scipy.cluster.hierarchy as sch
import scipy.cluster.vq as vq
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering

dat = pd.read_excel("F:\\基础数学课\\应用多元统计分析\\examp6.3.3.xlsx",header = 1)
dat = dat.set_index('region')

#### ---------------------------------------------
####  1. Compute the Euclidean, Mahalanobis, and cosine distance matrices
#### ---------------------------------------------
# standardize the data
dat_std = vq.whiten(dat)

# (1) Euclidean distance
eu_dis = dis.pdist(dat_std,'euclidean')

# (2) Mahalanobis distance
ma_dis = dis.pdist(dat_std,'mahalanobis')
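# Side note (an added check, not part of the assignment): the Mahalanobis distance is invariant
# to the per-column scaling done by vq.whiten, so dat and dat_std should give the same result.
np.allclose(dis.pdist(dat,'mahalanobis'), ma_dis)      # expected to be True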

# (3) cosine distance matrix
def cosv(x1,x2):
    return sum(x1*x2)/np.sqrt(sum(x1**2)*sum(x2**2))

cosv(dat['x1'],dat['x2'])

d_12 = 1 - cosv(dat['x1'],dat['x2'])**2

discovX = pd.DataFrame(np.zeros((8,8)))

for j in range(8):
    discov=[]
    for i in range(8):
        discov.append(cosv(dat.iloc[:,j],dat.iloc[:,i]))
    discovX.iloc[j,:] = discov

distance_cosine = 1 - discovX

#### ---------------------------------------------
####  2. Hierarchical clustering with scipy: Euclidean distance + single linkage, and Mahalanobis distance + centroid method
#### ---------------------------------------------
import scipy as scipy
from scipy.cluster.hierarchy import linkage , dendrogram ,centroid

## (1) Euclidean distance, single linkage (minimum distance method)
X = dis.pdist(dat,'euclidean')

cluster1 = scipy.cluster.hierarchy.single(X)
dendrogram(cluster1)

## (2) Mahalanobis distance, centroid method
X = dis.pdist(dat,'mahalanobis')

cluster2 = scipy.cluster.hierarchy.centroid(X)
dendrogram(cluster2)
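# A sketch (an addition) to make the two trees easier to read: label the leaves with the
# region names and draw each dendrogram in its own figure.
plt.figure()
dendrogram(cluster1, labels=dat.index.tolist())        # Euclidean distance, single linkage
plt.figure()
dendrogram(cluster2, labels=dat.index.tolist())        # Mahalanobis distance, centroid method
plt.show()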

#### ---------------------------------------------
####  3. k-means clustering with k = 3 using sklearn
#### ---------------------------------------------

# k-means---scipy
codebook, x = vq.kmeans(dat_std,3)
cluster1 = vq.vq(dat_std,codebook)[0]
pd.DataFrame(cluster1,dat.index)

# k-means---sklearn
est = KMeans(3).fit(dat_std)
cluster2 = est.labels_
pd.DataFrame(cluster2,dat.index)

# compare k-means results: scipy vs. sklearn
np.column_stack((dat.index,cluster1,cluster2))
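# A sketch (an addition) for checking agreement: the two label vectors may number the classes
# differently, so compare them with a cross-tabulation rather than element-wise equality.
pd.crosstab(pd.Series(cluster1, name='scipy'), pd.Series(cluster2, name='sklearn'))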
