# 课堂代码整理及注释 — class code, organized and annotated
import numpy as np
import pandas as pd
import scipy.spatial.distance as dis
import math
import scipy as scipy
from scipy.cluster.hierarchy import linkage , dendrogram
from sklearn.cluster import AgglomerativeClustering, KMeans
# Load example dataset 6.3.3 (one row per region, variables x1..x8);
# header=1 skips the first spreadsheet row.  NOTE: hard-coded Windows path.
dat = pd.read_excel("F:\\基础数学课\\应用多元统计分析\\examp6.3.3.xlsx",header = 1)
# Use the 'region' column as the row index so results are labelled by region.
dat = dat.set_index('region')
def cosv(x1, x2):
    """Cosine similarity between two equal-length vectors.

    Parameters
    ----------
    x1, x2 : array-like (numpy array or pandas Series).

    Returns
    -------
    float : sum(x1*x2) / sqrt(sum(x1^2) * sum(x2^2)), in [-1, 1] for
    real vectors.  Produces nan/inf if either vector is all zeros.
    """
    # Original body had lost its indentation (syntax error); restored here.
    return sum(x1 * x2) / np.sqrt(sum(x1 ** 2) * sum(x2 ** 2))
# Cosine similarity between variable columns x1 and x2.
cosv(dat['x1'],dat['x2'])
# Squared-cosine distance between x1 and x2: 1 - cos^2.
d_12 = 1 - cosv(dat['x1'],dat['x2'])**2
# Pairwise cosine-similarity matrix over the 8 variable columns, plus the
# corresponding 1 - cos "distance" matrix.
# (Original loop body had lost its indentation — a syntax error — restored.)
discovX = pd.DataFrame(np.zeros((8, 8)))
for j in range(8):
    discov = []
    for i in range(8):
        discov.append(cosv(dat.iloc[:, j], dat.iloc[:, i]))
    discovX.iloc[j, :] = discov
distance_cosine = 1 - discovX
# Aggregation experiments with DataFrame.agg.
dat.agg(np.std,axis=1)  # per-region standard deviation across the 8 variables
# NOTE(review): dcosv is defined *below* in this file; running the script
# top-to-bottom raises NameError here — confirm the intended cell order.
dat.agg(dcosv,axis=0,x2=dat['x1'])
dat.agg(lambda x:dcosv(x,dat['x1']),axis=0)
# Full matrix of 1 - |cos| distances between every pair of columns.
dat.agg(lambda y:dat.agg(lambda x:1-abs(cosv(x,y))))
def dcosv(x1, x2):
    """Cosine-based distance: sqrt(1 - cos(x1, x2)^2).

    Equals 0 for parallel vectors and 1 for orthogonal vectors.
    (Original body had lost its indentation — a syntax error — restored.)
    """
    return (1 - (sum(x1 * x2) / np.sqrt(sum(x1 ** 2) * sum(x2 ** 2))) ** 2) ** 0.5
# Cosine distance between columns x1 and x2.
dcosv(dat['x1'],dat['x2'])
def dlan(xi, xj):
    """Lance distance: the mean of |xi - xj| / (xi + xj) over components.

    Assumes every component sum xi + xj is nonzero (typically all-positive
    data) — TODO confirm for this dataset, otherwise the result is nan/inf.
    (Original body had lost its indentation — a syntax error — restored.)
    """
    return np.mean(np.abs(xi - xj) / (xi + xj))
# Lance distance between the first two regions (rows).
dlan(dat.iloc[0,:],dat.iloc[1,:])
# Full region-by-region Lance distance matrix via nested agg.
Dlan = dat.agg(lambda x:dat.agg(lambda y:dlan(x,y),axis=1),axis=1)
# Catalogue of pdist distance metrics (lecture notes).  Two fixes versus the
# original: `p=` with no value was a syntax error (Minkowski needs a concrete
# order; p=2 shown, which reduces to Euclidean), and bare `pdist` was never
# imported — only `scipy.spatial.distance as dis` is in scope, so the calls
# are qualified with `dis.`.
# NOTE: X must be an (n_samples, n_features) array defined before this runs.
dis.pdist(X, 'minkowski', p=2)
dis.pdist(X, 'cityblock')
dis.pdist(X, 'euclidean')
dis.pdist(X, 'chebyshev')
dis.pdist(X, 'mahalanobis')
dis.pdist(X, 'canberra')
dis.pdist(X, 'correlation')
dis.pdist(X, 'cosine')
# Cosine distance between columns x1 and x2: transpose so rows = variables.
dis.pdist(dat[['x1','x2']].T,'cosine')
dat[['x1','x2']].T
# Equivalent reshape of the two columns into a 2x31 array — assumes the
# dataset has 31 rows (regions); TODO confirm.
dat[['x1','x2']].values.reshape(2,31)
# cdist on the two columns as 1x31 row vectors gives the same cosine distance.
dis.cdist(dat['x1'].values.reshape((1,31)),dat['x2'].values.reshape((1,31)),'cosine')
# Condensed vs. square cosine-distance matrices over all 8 variables.
dis.pdist(dat.T,'cosine')
pd.DataFrame(dis.cdist(dat.T,dat.T,'cosine'))
# Square Euclidean distance matrix between regions, labelled by region name.
eu_distance = pd.DataFrame(dis.cdist(dat,dat,'euclidean'))
eu_distance.index = eu_distance.columns=dat.index
# Row/column positions of the smallest nonzero entry (the closest pair).
np.where(eu_distance == min(eu_distance.values.reshape(-1)[eu_distance.values.reshape(-1)!=0]))[0].tolist()
# Hard-coded positions [3, 27] — presumably the pair found above; verify.
eu_distance.index[[3,27]].tolist()
# NOTE(review): the next four lines duplicate the four above verbatim.
eu_distance = pd.DataFrame(dis.cdist(dat,dat,'euclidean'))
eu_distance.index = eu_distance.columns=dat.index
np.where(eu_distance == min(eu_distance.values.reshape(-1)[eu_distance.values.reshape(-1)!=0]))[0].tolist()
eu_distance.index[[3,27]].tolist()
# Remove rows and columns 3 and 27 (the merged pair) from the distance matrix.
ou_D1 = np.delete(eu_distance.values,[3,27],0)
ou_D2 = np.delete(ou_D1,[3,27],1)
ou_D2.shape
# Single-linkage distance from the merged cluster {3, 27} to each remaining
# region: column-wise min over the two extracted rows (their own columns removed).
D_min = pd.DataFrame(np.delete(eu_distance.iloc[[3,27],:].values,[3,27],1)).min()
D_min.values.shape
import math
# Several equivalent ways to find the smallest off-diagonal distance:
# 1) inflate the zero diagonal by a value larger than any entry, then min
#    (the np.eye(31) assumes a 31x31 matrix — TODO confirm dat's row count);
np.min(np.min(pd.DataFrame(np.eye(31)*math.ceil(np.max(np.max(eu_distance)))+eu_distance.values)))
# 2) partial sort of the flattened matrix: the 31 diagonal zeros occupy the
#    first positions, and the smallest off-diagonal value appears twice by
#    symmetry, so index 32 still lands on it;
np.partition(eu_distance.values.reshape(-1),32)[32]
# 3) min over the nonzero flattened entries;
min(eu_distance.values.reshape(-1)[eu_distance.values.reshape(-1)!=0])
# 4) second-smallest *distinct* value (the smallest distinct value is the 0 diagonal).
eu_min = sorted(set(eu_distance.values.reshape(-1)))[1]
# Positions of that minimum, then the matching region labels.
np.where(eu_distance == min(eu_distance.values.reshape(-1)[eu_distance.values.reshape(-1)!=0]))[0].tolist()
np.where(eu_distance == eu_min)[0]
eu_distance.index[[3,27]].tolist()
# Available linkage methods in scipy (attribute references only — these
# lines evaluate the functions without calling them).
scipy.cluster.hierarchy.single
scipy.cluster.hierarchy.complete
scipy.cluster.hierarchy.median
scipy.cluster.hierarchy.centroid
scipy.cluster.hierarchy.average
scipy.cluster.hierarchy.weighted
scipy.cluster.hierarchy.ward
# Re-imports (already present at the top of the file).
import scipy as scipy
from scipy.cluster.hierarchy import linkage , dendrogram
# Condensed Euclidean distances between regions (raw, unstandardized data).
X = dis.pdist(dat,'euclidean')
# Single-linkage hierarchical clustering; the second call overwrites the
# first with the equivalent scipy.cluster.hierarchy.single(X) result.
cluster = linkage(X,'single')
cluster = scipy.cluster.hierarchy.single(X)
# Draw the dendrogram (requires matplotlib).
dendrogram(cluster)
def plot_dendrogram(model, **kwargs):
    """Plot a dendrogram from a fitted sklearn AgglomerativeClustering model.

    The original flat-indented body was a syntax error; restored here.
    Also now returns the dendrogram dict (was implicitly None) — a
    backward-compatible addition.

    Parameters
    ----------
    model : fitted AgglomerativeClustering exposing ``children_``,
        ``labels_`` and ``distances_`` (fit with ``distance_threshold``
        set, as done below, so ``distances_`` is populated).
    **kwargs : forwarded to scipy's ``dendrogram`` (e.g. truncate_mode, p).

    Returns
    -------
    dict : the structure returned by scipy.cluster.hierarchy.dendrogram.
    """
    # Count the number of original samples under each merge node.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                # Leaf node: contributes exactly one original sample.
                current_count += 1
            else:
                # Internal node: add the count accumulated for that merge.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count
    # scipy linkage matrix rows: [child1, child2, distance, sample count].
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)
    return dendrogram(linkage_matrix, **kwargs)
# Average-linkage and Ward agglomerative clustering on standardized data.
# NOTE(review): dat_std is defined further below (vq.whiten) — running the
# file top-to-bottom raises NameError here; confirm the intended cell order.
# distance_threshold=0 with n_clusters=None makes sklearn build the full
# tree and populate model.distances_, which plot_dendrogram needs.
est2 = AgglomerativeClustering(distance_threshold=0,n_clusters=None,linkage='average').fit(dat_std)
plot_dendrogram(est2 ,truncate_mode='level',p=3)
est3 = AgglomerativeClustering(distance_threshold=0,n_clusters=None,linkage='ward').fit(dat_std)
plot_dendrogram(est3,truncate_mode='level',p=3)
import scipy.cluster.vq as vq
# Reload the dataset (same statements as at the top of the file).
dat = pd.read_excel("F:\\基础数学课\\应用多元统计分析\\examp6.3.3.xlsx",header = 1)
dat = dat.set_index('region')
# Standardize each column to unit variance (vq.whiten divides by the std).
dat_std = vq.whiten(dat)
# K-means with 3 clusters: Cs is the codebook (cluster centers).
Cs, _ = vq.kmeans(dat_std,3)
# Assign each region to its nearest center; vq returns (labels, distances).
cluster1 = vq.vq(dat_std,Cs)[0]
distance = vq.vq(dat_std,Cs)[1]
# Cluster label per region, indexed by region name.
pd.DataFrame(cluster1 , dat.index)
# sklearn K-means on the standardized data, compared with the scipy result.
from sklearn.cluster import AgglomerativeClustering, KMeans  # re-import (already at top)
est1 = KMeans(n_clusters=3).fit(dat_std)
cluster2 = est1.labels_
# Cluster label per region, indexed by region name.
pd.DataFrame(cluster2, dat.index)
est1.labels_
# Predict the cluster of a new observation.  The original passed the bare
# names x1..x8, which are undefined (NameError); predict expects a 2-D array
# of shape (n_samples, 8), demonstrated here with the first region's row.
est1.predict(dat_std[:1])
est1.cluster_centers_
# Side-by-side comparison of the scipy and sklearn labels per region.
np.column_stack((cluster1, cluster2, dat.index))
# 第六次作业 — Homework 6
import numpy as np
import pandas as pd
import scipy.spatial.distance as dis
import scipy.cluster.hierarchy as sch
import scipy.cluster.vq as vq
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
# Homework: reload the dataset and precompute condensed distance vectors.
dat = pd.read_excel("F:\\基础数学课\\应用多元统计分析\\examp6.3.3.xlsx",header = 1)
dat = dat.set_index('region')
# Standardize each column to unit variance.
dat_std = vq.whiten(dat)
# Condensed Euclidean and Mahalanobis distances between regions.
eu_dis = dis.pdist(dat_std,'euclidean')
ma_dis = dis.pdist(dat_std,'mahalanobis')
def cosv(x1, x2):
    """Cosine similarity between two equal-length vectors.

    Duplicate of the definition earlier in this file.  Returns
    sum(x1*x2) / sqrt(sum(x1^2) * sum(x2^2)); nan/inf for zero vectors.
    (Original body had lost its indentation — a syntax error — restored.)
    """
    return sum(x1 * x2) / np.sqrt(sum(x1 ** 2) * sum(x2 ** 2))
# Cosine similarity between x1 and x2, and the squared-cosine distance.
cosv(dat['x1'], dat['x2'])
d_12 = 1 - cosv(dat['x1'], dat['x2']) ** 2
# Pairwise cosine-similarity matrix over the 8 variable columns, and the
# 1 - cos distance matrix.  (Original loop body had lost its indentation —
# a syntax error — restored here.)
discovX = pd.DataFrame(np.zeros((8, 8)))
for j in range(8):
    discov = []
    for i in range(8):
        discov.append(cosv(dat.iloc[:, j], dat.iloc[:, i]))
    discovX.iloc[j, :] = discov
distance_cosine = 1 - discovX
import scipy as scipy
from scipy.cluster.hierarchy import linkage , dendrogram ,centroid
# Single-linkage clustering on Euclidean distances of the raw data.
X = dis.pdist(dat,'euclidean')
cluster1 = scipy.cluster.hierarchy.single(X)
dendrogram(cluster1)
# Centroid-linkage clustering on Mahalanobis distances.
# NOTE(review): centroid linkage is normally defined for Euclidean
# distances — confirm pairing it with 'mahalanobis' is intended here.
X = dis.pdist(dat,'mahalanobis')
cluster2 = scipy.cluster.hierarchy.centroid(X)
dendrogram(cluster2)
# K-means via scipy: codebook = the 3 cluster centers, x = mean distortion.
codebook, x = vq.kmeans(dat_std,3)
# Assign each region to its nearest center (labels only).
cluster1 = vq.vq(dat_std,codebook)[0]
pd.DataFrame(cluster1,dat.index)
# K-means via sklearn with 3 clusters.
est = KMeans(3).fit(dat_std)
cluster2 = est.labels_
pd.DataFrame(cluster2,dat.index)
# Compare the two label assignments per region (cluster ids are arbitrary,
# so the two columns need not match numerically even when partitions agree).
np.column_stack((dat.index,cluster1,cluster2))