#!/usr/bin/env python
# -*- coding: utf-8 -*-
from numpy import *
"""
Code for hierarchical clustering, modified from
Programming Collective Intelligence by Toby Segaran
(O'Reilly Media 2007, page 33).
"""
class cluster_node:
def __init__(self, vec, left=None, right=None, distance=0.0, id=None, count=1):#面向对象oo 构造函数
self.left = left
self.right = right
self.vec = vec
self.id = id
self.distance = distance
self.count = count # only used for weighted average
def L2dist(v1, v2):
return sqrt(sum((v1 - v2) ** 2))
def L1dist(v1, v2):
return sum(abs(v1 - v2))
# def Chi2dist(v1,v2):
# return sqrt(sum((v1-v2)**2))
def hcluster(features, distance=L2dist):
# cluster the rows of the "features" matrix
distances = {}
currentclustid = -1
# clusters are initially just the individual rows
clust = [cluster_node(array(features[i]), id=i) for i in range(len(features))]#每一个实例都赋值id
while len(clust) > 1:
lowestpair = (0, 1)
closest = distance(clust[0].vec, clust[1].vec)
# loop through every pair looking for the smallest distance
for i in range(len(clust)):
for j in range(i + 1, len(clust)):
# distances is the cache of distance calculations
if (clust[i].id, clust[j].id) not in distances:
distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)
d = distances[(clust[i].id, clust[j].id)]
if d < closest:
closest = d
lowestpair = (i, j)#距离最小的一对点
# calculate the average of the two clusters
mergevec = [(clust[lowestpair[0]].vec[i] + clust[lowestpair[1]].vec[i]) / 2.0 \
for i in range(len(clust[0].vec))]#计算一个类中的两个点的距离的中间点
# create the new cluster
newcluster = cluster_node(array(mergevec), left=clust[lowestpair[0]],
right=clust[lowestpair[1]],
distance=closest, id=currentclustid)#将左右儿子已接近两点距离 两点的中心向量进行赋值,构造新的节点
# cluster ids that weren't in the original set are negative
currentclustid -= 1
del clust[lowestpair[1]]#删除掉已经合并为一个节点的左右两个(在clust里面)节点
del clust[lowestpair[0]]
clust.append(newcluster)
return clust[0]#返回一个包含所有节点的树结构
def extract_clusters(clust, dist):#
# extract list of sub-tree clusters from hcluster tree with distance<dist
clusters = {}
if clust.distance < dist:
# we have found a cluster subtree
return [clust]
else:
# check the right and left branches
cl = []
cr = []
if clust.left != None:
cl = extract_clusters(clust.left, dist=dist)
if clust.right != None:
cr = extract_clusters(clust.right, dist=dist)
return cl + cr
def get_cluster_elements(clust):
# return ids for elements in a cluster sub-tree
if clust.id >= 0:
# positive id means that this is a leaf
return [clust.id]
else:
# check the right and left branches
cl = []
cr = []
if clust.left != None:
cl = get_cluster_elements(clust.left)
if clust.right != None:
cr = get_cluster_elements(clust.right)
return cl + cr
def printclust(clust, labels=None, n=0):
# indent to make a hierarchy layout
for i in range(n): print ' ',
if clust.id < 0:
# negative id means that this is branch
print '-'
else:
# positive id means that this is an endpoint
if labels == None:
print clust.id
else:
print labels[clust.id]
# now print the right and left branches
if clust.left != None: printclust(clust.left, labels=labels, n=n + 1)
if clust.right != None: printclust(clust.right, labels=labels, n=n + 1)
def getheight(clust):
# Is this an endpoint? Then the height is just 1
if clust.left == None and clust.right == None: return 1
# Otherwise the height is the same of the heights of
# each branch
return getheight(clust.left) + getheight(clust.right)
def getdepth(clust):
# The distance of an endpoint is 0.0
if clust.left == None and clust.right == None: return 0
# The distance of a branch is the greater of its two sides
# plus its own distance
return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance