1、算法描述
2、python代码实现
import numpy as np
import matplotlib. pyplot as plt
'''
AGNES层次聚类,采用自底向上聚合策略的算法。先将数据集的每个样本看做一个初始的聚类簇,然后算法运行的每一步中找出距离最近的两个
类簇进行合并,该过程不断重复,直至达到预设的聚类簇的个数。
'''
def calDist(X1, X2):
    """Return the Euclidean distance between two coordinate sequences.

    Args:
        X1: Iterable of numeric coordinates of the first point.
        X2: Iterable of numeric coordinates of the second point.

    Returns:
        The square root of the summed squared coordinate differences.
        Coordinates are paired with ``zip``, so when the inputs differ in
        length the surplus coordinates of the longer one are ignored.
    """
    # Feed a generator to the builtin ``sum`` instead of keeping a manual
    # accumulator that shadows that builtin.
    squared_total = sum((x1 - x2) ** 2 for x1, x2 in zip(X1, X2))
    return squared_total ** 0.5
def updateClusterDis(dataset, distance, sets, cluster_i):
    """Recompute the distances between one cluster and every cluster.

    The distance between two clusters is the largest pairwise distance
    (as measured by ``calDist``) between their members, i.e. maximum
    (complete) linkage.

    Args:
        dataset: Indexable collection of samples; ``dataset[idx]`` must
            yield the coordinates of sample ``idx``.
        distance: Square NumPy array of cluster-to-cluster distances with
            one row/column per entry of ``sets``. Modified in place.
        sets: List of clusters, each a collection of sample indices.
        cluster_i: Position in ``sets`` of the cluster whose row and
            column are recomputed.

    Returns:
        The same ``distance`` array, with row and column ``cluster_i``
        refreshed and the whole diagonal set to infinity.

    Raises:
        ValueError: If a cluster taking part in a comparison is empty,
            because ``max`` then receives no values.
    """
    target_members = sets[cluster_i]
    for idx, members in enumerate(sets):
        # Evaluate the maximum once and reuse it for both mirrored cells.
        farthest = max(
            calDist(dataset[a], dataset[b])
            for a in members
            for b in target_members
        )
        distance[idx, cluster_i] = farthest
        distance[cluster_i, idx] = farthest
    # The loop also wrote the cluster's distance to itself; reset the
    # diagonal to infinity so a minimum search over the matrix never pairs
    # a cluster with itself.
    distance[np.diag_indices_from(distance)] = float('inf')
    return distance
def agens(dataset, k):
    """Cluster samples with AGNES (bottom-up hierarchical clustering).

    Every sample starts as its own cluster. The two closest clusters are
    merged repeatedly until only ``k`` clusters remain; after each merge
    the affected distances are recomputed by ``updateClusterDis``.

    Args:
        dataset: 2-D array-like with one sample per row.
        k: Target number of clusters; must be at least 1.

    Returns:
        A list of sets, each holding the row indices of the samples in one
        cluster. When ``k`` is not smaller than the number of samples, no
        merge happens and every sample stays in its own cluster.

    Raises:
        ValueError: If ``k`` is smaller than 1.

    Side effects:
        Prints the final list of clusters to standard output.
    """
    if k < 1:
        # Without this guard the merge loop would run out of cluster pairs
        # and fail with an opaque IndexError.
        raise ValueError("k must be at least 1")

    dataset = np.asarray(dataset)
    sets = [{i} for i in range(len(dataset))]

    # Pairwise Euclidean distances in a single broadcasted step:
    # delta[i, j] == dataset[i] - dataset[j].
    delta = dataset[:, np.newaxis, :] - dataset[np.newaxis, :, :]
    distance = np.sqrt(np.sum(np.square(delta), axis=2))
    # A sample must never be selected as its own nearest neighbour.
    distance[np.diag_indices_from(distance)] = float('inf')

    while len(sets) > k:
        locations = np.argwhere(distance == np.min(distance))
        # The matrix is symmetric; keep only entries above the diagonal so
        # that cluster_i < cluster_j and deleting cluster_j below does not
        # shift cluster_i.
        locations = locations[locations[:, 0] < locations[:, 1]]
        cluster_i = int(locations[0, 0])
        cluster_j = int(locations[0, 1])

        # Merge cluster_j into cluster_i, then drop cluster_j everywhere.
        sets[cluster_i].update(sets[cluster_j])
        del sets[cluster_j]
        distance = np.delete(distance, cluster_j, axis=0)
        distance = np.delete(distance, cluster_j, axis=1)
        distance = updateClusterDis(dataset, distance, sets, cluster_i)

    print(sets)
    return sets
# Load the samples from the text file and group them into 4 clusters.
dataset = np.loadtxt('data.txt')
results = agens(dataset, 4)

# Plot the first two columns of every cluster with a separate scatter call,
# then display all clusters together in one figure.
for r in results:
    # Each cluster is a set of row indices; indexing the array needs a list.
    drawpoints = list(r)
    drawdata = dataset[drawpoints]
    plt.scatter(drawdata[:, 0], drawdata[:, 1], marker='o')
plt.show()
3、结果