最近学习了一些聚类方法,需要对聚类效果进行评估,于是用 Python 简单实现了 DBI(Davies-Bouldin Index,戴维森堡丁指数),在此给自己做一个小记录。
DBI的理论知识见:http://blog.sina.com.cn/s/blog_65c8baf901016flh.html
```python
import math
import pandas as pd
import os
import numpy as np
# Input tables, resolved against the current working directory. The script
# passes the array built from '1.csv' to Compute_DBI as the cluster centres
# and the array built from '数据.csv' as the labelled data points.
cluster_path = os.getcwd() + '/1.csv'
data_path = os.getcwd() + '/数据.csv'
# Read all data from csv. Each handle is opened in a `with` block so it is
# closed as soon as pandas has parsed it (the handles used to be left open).
with open(cluster_path) as f:
    cluster_data = pd.read_csv(f)
cluster_data_array = np.array(cluster_data)
with open(data_path) as f:
    data_data = pd.read_csv(f)
data_data_array = np.array(data_data)
# print(cluster_data)
# print("---------------------------")
# print(cluster_data_array)
def VD(v1, v2, start=1, stop=9):
    """Return the Euclidean distance between two rows over a column range.

    Args:
        v1: First row; must be indexable at every position in
            ``range(start, stop)``.
        v2: Second row, indexed the same way as ``v1``.
        start: Index of the first coordinate column. Defaults to 1, which
            skips column 0 (read as the cluster label elsewhere in this
            file).
        stop: One past the index of the last coordinate column. Defaults to
            9, i.e. eight coordinates in columns 1..8.

    Returns:
        The square root of the summed squared differences; 0.0 when the
        column range is empty.
    """
    return sum((v1[i] - v2[i]) ** 2 for i in range(start, stop)) ** 0.5
def Compute_decent(center1, center2):
    """Return the distance between two cluster centroids.

    Delegates to ``VD``, so only the coordinate columns that ``VD`` reads
    take part in the distance.
    """
    centroid_gap = VD(center1, center2)
    return centroid_gap
def Compute_Ci(x, clusterIndex):
    """Return the summed ``VD`` distance over all point pairs of one cluster.

    Args:
        x: 2-D array of points (``x.shape[0]`` rows); column 0 of each row
            holds the cluster label.
        clusterIndex: Label of the cluster to measure.

    Returns:
        The sum of distances over every unordered pair of rows carrying the
        label, or 0 when fewer than two rows carry it.
    """
    # Gather the rows of this cluster first, keeping their original order.
    members = [x[r] for r in range(x.shape[0]) if x[r][0] == clusterIndex]
    total = 0
    for pos, first in enumerate(members):
        # Pair each member only with the members after it, so every
        # unordered pair is counted exactly once.
        for second in members[pos + 1:]:
            total += VD(first, second)
    return total
def Ci_avg(x, ClusterIndex):
    """Return the mean pairwise distance between the points of one cluster.

    Args:
        x: 2-D array of points (``x.shape[0]`` rows); column 0 of each row
            holds the cluster label.
        ClusterIndex: Label of the cluster to measure.

    Returns:
        ``Compute_Ci(x, ClusterIndex)`` divided by the number of unordered
        pairs, n * (n - 1) / 2, where n is the cluster size. Returns 0.0
        when the cluster has fewer than two points: there is no pair to
        measure, and dividing by the zero pair count would raise
        ``ZeroDivisionError``.
    """
    cluster_size = sum(
        1 for i in range(x.shape[0]) if x[i][0] == ClusterIndex
    )
    if cluster_size < 2:
        return 0.0
    pair_count = cluster_size * (cluster_size - 1) / 2
    return Compute_Ci(x, ClusterIndex) / pair_count
def Compute_Mij(avgCi, avgCj, denominator):
    """Return the ratio that sits inside the ``max`` of the DBI formula.

    Args:
        avgCi: Scatter value of the first cluster.
        avgCj: Scatter value of the second cluster.
        denominator: Distance between the two cluster centroids.

    Returns:
        ``(avgCi + avgCj) / denominator``. When ``denominator`` is 0 (the
        two centroids coincide) the error message is printed and
        ``math.inf`` is returned, instead of falling through to the
        division, which raised ``ZeroDivisionError`` for plain Python
        numbers.
    """
    numerator = avgCi + avgCj
    if denominator == 0:
        print('error! denominator is 0')
        return math.inf
    return numerator / denominator
def Compute_DBI(x, y, nc):
    """Return the Davies-Bouldin index (DBI) of a clustering.

    For every cluster i, the largest ratio
    ``(avgCi[i] + avgCi[j]) / distance(centre i, centre j)`` over all other
    clusters j is taken; the index is the mean of those maxima over the
    ``nc`` clusters. The scatter ``avgCi`` is whatever ``Ci_avg`` returns.

    Args:
        x: 2-D array of data points; column 0 holds the cluster label.
            Labels are looked up as 1..nc (cluster i uses label ``i + 1``).
        y: Cluster centres; row i is used as the centre of the cluster
            labelled ``i + 1``.
        nc: Number of clusters.

    Returns:
        The sum of the per-cluster maximum ratios divided by ``nc``.
    """
    # The data set is fairly large, so scatters and centre distances are
    # computed once and stored in arrays instead of being recomputed.
    # avgCi      -- length-nc array of per-cluster scatter values
    # decentList -- nc x nc array of centre-to-centre distances
    avgCi = np.zeros(nc)
    decentList = np.zeros((nc, nc))
    for i in range(nc):
        for j in range(nc):
            decentList[i][j] = Compute_decent(y[i], y[j])
    print('decentList store over')
    for i in range(nc):
        avgCi[i] = Ci_avg(x, i + 1)
        print(avgCi[i])
        print('avgCi', i + 1, 'store over')
    # s accumulates the per-cluster maxima (the final result before averaging).
    s = 0
    for i in range(nc):
        print('epoc is:', i + 1)
        m = 0
        # The maximum must range over every other cluster (j != i). Scanning
        # only j > i skipped all earlier clusters and made the last cluster
        # always contribute 0.
        for j in range(nc):
            if j == i:
                continue
            temp = Compute_Mij(avgCi[i], avgCi[j], decentList[i][j])
            if m < temp:
                m = temp
        print('epoc', i + 1, 'over')
        s += m
    return s / nc
# Script entry point: print the DBI of the loaded data points against the
# loaded cluster centres; 12 is the cluster count passed as nc.
print(Compute_DBI(data_data_array, cluster_data_array, 12))