系统聚类原理
层次聚类(Hierarchical Clustering)是聚类算法的一种,通过计算不同类别数据点间的相似度来创建一棵有层次的嵌套聚类树。在聚类树中,不同类别的原始数据点是树的最低层,树的顶层是一个聚类的根节点。创建聚类树有自下而上合并和自上而下分裂两种方法。
层次聚类算法一般分为两类:
Divisive 层次聚类:又称自顶向下(top-down)的层次聚类,最开始所有的对象均属于一个cluster,每次按一定的准则将某个cluster 划分为多个cluster,如此往复,直至每个对象均是一个cluster。
Agglomerative 层次聚类:又称自底向上(bottom-up)的层次聚类,每一个对象最开始都是一个cluster,每次按一定的准则将最相近的两个cluster合并生成一个新的cluster,如此往复,直至最终所有的对象都属于一个cluster。
图片来自https://www.biaodianfu.com/hierarchical-clustering.html
这个算法的原理很简单,实现也不难。下面给出一个采用组间平均连接法(平方欧式距离)的 Scala 实现:
```scala
/**
 * Agglomerative (bottom-up) hierarchical clustering that uses between-groups
 * average linkage over squared Euclidean distances.
 *
 * Every sample starts as its own cluster. [[hierarchical]] repeatedly merges
 * the two closest clusters and records each merge in `table`, `firstCluster`,
 * `nextCluster` and `clusterHistory`.
 *
 * Cluster ids are the 1-based position, inside `initialList`, of the first
 * sample of a cluster. NOTE: samples are located by value, so data sets that
 * contain duplicate samples can report the same id for different clusters.
 *
 * @param T    number of clusters at which merging stops; merging also stops
 *             when a single cluster is left
 * @param data samples to cluster, one `Array[Double]` per sample
 */
class Hierarchical(var T: Double, var data: List[Array[Double]]) {

  /** Current clustering; starts with one singleton cluster per sample. */
  var finalResult: ListBuffer[ListBuffer[Array[Double]]] = initFinalResult
  // data=standardization01(data)
  /** The samples in their original order; used to derive cluster ids. */
  var initialList = data
  /** Per stage: id of the cluster that absorbed the other one. */
  var firstCluster = new ListBuffer[Int]()
  /**
   * Per stage: for each of the two merged clusters, the most recent earlier
   * stage (1-based) in which that id was the absorbing cluster, or 0 if none.
   */
  var nextCluster = new ListBuffer[Array[Int]]()
  /**
   * One row per stage:
   * `[cluster1 id, cluster2 id, merge distance, cluster1 stage, cluster2 stage]`.
   */
  var table = new ListBuffer[Array[Double]]()
  /** Snapshot of the clustering taken after every merge. */
  var clusterHistory = ListBuffer[ListBuffer[ListBuffer[Array[Double]]]]()
  /** Number of merges performed so far. */
  var stage = 0

  /** Builds the initial clustering: each of the N samples forms its own cluster. */
  private def initFinalResult: ListBuffer[ListBuffer[Array[Double]]] = {
    val startResult = new ListBuffer[ListBuffer[Array[Double]]]
    data.foreach(sample => startResult += ListBuffer(sample))
    startResult
  }

  /**
   * Merges the closest pair of clusters until `T` clusters (or one cluster)
   * remain.
   *
   * Implemented as a loop rather than recursion, and without materialising the
   * full distance matrix, so that memory use does not grow with the number of
   * stages. An empty data set is returned unchanged.
   *
   * @return the final clustering (the same buffer as `finalResult`)
   */
  def hierarchical: ListBuffer[ListBuffer[Array[Double]]] = {
    while (finalResult.size > 1 && finalResult.size != T) {
      // Indexed snapshot of the cluster references for O(1) lookups below.
      val clusters = finalResult.toVector

      // Between-groups linkage; start from the distance of clusters 0 and 1.
      var minDis = baverage_distance(clusters(0), clusters(1))
      var index1 = 0
      var index2 = 1
      for (i <- clusters.indices; j <- (i + 1) until clusters.size) {
        val dis = baverage_distance(clusters(i), clusters(j))
        // Strict comparison keeps the first pair found when distances tie.
        if (dis < minDis) {
          minDis = dis
          index1 = i
          index2 = j
        }
      }

      // Fold the second cluster into the first and record the stage.
      merge(clusters(index1), clusters(index2))
      table(stage - 1)(2) = minDis
      finalResult.remove(index2)

      // Store a copy: `finalResult` and its clusters keep mutating, so storing
      // the buffer itself would make every history entry show the final state.
      clusterHistory.append(finalResult.map(_.clone()))
    }
    finalResult
  }

  /**
   * Appends `list2` to `list1` and records the merge as a new stage in
   * `firstCluster`, `nextCluster` and `table` (distance column preset to -1.0;
   * the caller fills it in).
   */
  private def merge(list1: ListBuffer[Array[Double]], list2: ListBuffer[Array[Double]]): Unit = {
    list1 ++= list2
    stage += 1

    // 1-based ids; 0 when the sample is not found in `initialList`.
    // The first cluster uses the first matching sample, the second the last one.
    val cluster1 = initialList.indexWhere(_.sameElements(list1(0))) + 1
    val cluster2 = initialList.lastIndexWhere(_.sameElements(list2(0))) + 1

    // Most recent earlier stage in which each id was the absorbing cluster.
    val cluster11 = firstCluster.lastIndexOf(cluster1) + 1
    val cluster12 = firstCluster.lastIndexOf(cluster2) + 1

    firstCluster.append(cluster1)
    nextCluster.append(Array(cluster11, cluster12))
    table.append(Array(cluster1.toDouble, cluster2.toDouble, -1.0, cluster11.toDouble, cluster12.toDouble))
  }

  /** Single linkage: the smallest Euclidean distance between members of the two clusters. */
  private def min_distance(list1: ListBuffer[Array[Double]], list2: ListBuffer[Array[Double]]): Double = {
    var minDis = euclideanDistance(list1(0), list2(0))
    for (a <- list1; b <- list2) {
      val dis = euclideanDistance(a, b)
      if (dis < minDis) minDis = dis
    }
    minDis
  }

  /** Between-groups linkage: mean squared Euclidean distance over all cross-cluster pairs. */
  private def baverage_distance(list1: ListBuffer[Array[Double]], list2: ListBuffer[Array[Double]]): Double = {
    var total = 0.0
    for (a <- list1; b <- list2) {
      total += squareEuclideanDistance(a, b)
    }
    total / (list1.size * list2.size).toDouble
  }

  /** Euclidean distance between two samples. */
  private def euclideanDistance(array1: Array[Double], array2: Array[Double]): Double =
    Math.sqrt(squareEuclideanDistance(array1, array2))

  /** Squared Euclidean distance between two samples, iterating over the length of `array1`. */
  private def squareEuclideanDistance(array1: Array[Double], array2: Array[Double]): Double = {
    var distance = 0.0
    for (i <- 0 until array1.length) {
      distance += Math.pow(array1(i) - array2(i), 2)
    }
    distance
  }
}
/** Demo entry point: clusters a fixed 31-sample, 8-feature data set and prints the elapsed time. */
object Hierarchical {

  def main(args: Array[String]): Unit = {
    // Timing starts before the data set is built, so construction is included.
    val startedAt = new Date().getTime

    // Merge all the way down to a single cluster.
    val clusterCount = 1

    val samples: List[Array[Double]] = List(
      Array(2270.72, 377.81, 1162.96, 202.36, 930.33, 883.33, 709.22, 127.29),
      Array(1368.93, 292.32, 699.21, 133.61, 202.87, 322.27, 301.06, 82.73),
      Array(1192.93, 203.72, 696.12, 131.92, 326.73, 230.07, 219.32, 62.28),
      Array(1206.69, 276.23, 286.73, 138.26, 328.72, 380.70, 210.32, 69.83),
      Array(1283.61, 239.96, 369.60, 128.80, 206.72, 399.33, 320.62, 69.23),
      Array(1329.00, 298.82, 601.71, 138.91, 226.27, 387.97, 283.37, 107.78),
      Array(1362.22, 232.03, 330.69, 122.80, 333.38, 321.70, 380.71, 93.27),
      Array(1267.68, 308.29, 871.31, 130.00, 393.02, 237.37, 331.03, 83.21),
      Array(3731.27, 267.33, 1806.08, 303.96, 879.37, 833.30, 697.11, 179.06),
      Array(2202.38, 276.39, 860.33, 230.11, 612.23, 713.23, 290.93, 120.36),
      Array(2779.10, 232.79, 1639.88, 362.03, 831.06, 727.00, 332.06, 126.12),
      Array(1232.18, 180.02, 630.31, 163.33, 280.63, 292.82, 199.22, 38.92),
      Array(2162.30, 263.39, 777.31, 222.86, 332.68, 390.13, 197.83, 113.01),
      Array(1633.12, 137.73, 339.39, 133.00, 301.68, 236.01, 203.68, 60.38),
      Array(1331.77, 230.29, 802.73, 220.91, 232.33, 217.27, 280.29, 79.00),
      Array(1163.81, 209.73, 712.61, 169.61, 290.79, 212.38, 213.00, 66.27),
      Array(1711.32, 187.07, 631.30, 232.92, 290.22, 267.13, 210.36, 99.80),
      Array(1927.32, 169.06, 629.73, 171.11, 286.01, 278.67, 222.17, 78.67),
      Array(2388.91, 177.67, 962.33, 189.01, 283.66, 272.87, 239.00, 136.82),
      Array(1392.67, 91.19, 333.23, 122.01, 261.83, 172.73, 132.32, 30.81),
      Array(1337.33, 89.89, 391.02, 102.07, 261.37, 288.29, 123.82, 86.67),
      Array(1337.39, 160.32, 328.97, 167.72, 238.23, 211.83, 197.13, 22.87),
      Array(1627.38, 172.39, 269.73, 163.99, 236.08, 173.26, 209.22, 33.29),
      Array(1119.62, 112.26, 227.20, 92.36, 139.61, 122.10, 96.38, 33.73),
      Array(1283.16, 119.63, 626.12, 118.97, 228.23, 168.33, 181.97, 23.97),
      Array(1133.37, 228.68, 322.07, 120.06, 127.21, 62.26, 33.82, 70.09),
      Array(1113.66, 173.30, 398.39, 133.07, 270.63, 331.99, 231.23, 60.70),
      Array(1126.69, 218.61, 292.77, 97.38, 276.31, 168.99, 222.39, 26.22),
      Array(1132.33, 132.66, 387.83, 93.38, 232.69, 219.91, 162.72, 31.03),
      Array(1220.02, 200.26, 368.79, 110.33, 316.73, 128.86, 270.06, 61.32),
      Array(1288.27, 217.17, 382.27, 123.91, 299.29, 192.37, 318.77, 72.20)
    )

    val hier = new Hierarchical(clusterCount, samples)
    // Both values are kept at hand for optional inspection; nothing below reads them.
    val clusters = hier.hierarchical
    val history = hier.clusterHistory

    val finishedAt = new Date().getTime
    println("用时:" + (finishedAt - startedAt))
    // To dump the clusters, pass `clusters` to util_printCluster().printCluster.
  }
}
```

```scala
/** Console helper that dumps a clustering to standard output. */
case class util_printCluster() {

  /**
   * Prints every cluster of `finalresult` to `System.out`.
   *
   * For each cluster it writes a line with the cluster size, then all samples
   * on one line as `(v1,v2,...,)` groups, followed by a blank line.
   *
   * Uses plain Scala collection iteration; the deprecated implicit
   * `scala.collection.JavaConversions` is no longer needed.
   *
   * @param finalresult clusters to print, each a buffer of samples
   */
  def printCluster(finalresult: ListBuffer[ListBuffer[Array[Double]]]): Unit = {
    for (cluster <- finalresult) {
      System.out.println("个数" + cluster.size)
      for (sample <- cluster) {
        // Every value is followed by a comma, including the last one.
        System.out.print(sample.map(value => s"$value,").mkString("(", "", ")"))
      }
      System.out.println("\n")
    }
  }
}