【机器学习】AGNES层次聚类算法

算法思想:

  1. 初始每个数据都是一个簇;
  2. 计算每两个簇之间的距离,得到簇间距离矩阵(距离列表);
  3. 合并距离最近的两个簇Ci,Cj,并删除Cj,更新距离列表,使簇个数减1;
  4. 重复第3步,直到簇个数等于所需个数k。

数据集来源:西瓜数据集4.0

1.读取文件:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the samples (the 'number' column becomes the row index) and keep them
# as a plain list of per-sample value lists.
data = pd.read_csv("./西瓜数据集4.0.csv", index_col='number').values.tolist()

2.使用matplotlib.pyplot绘出原始数据

# Scatter-plot the raw, unclustered samples: first column against second column.
fig, ax = plt.subplots()
plt.scatter([row[0] for row in data], [row[1] for row in data])
plt.show()

原始数据

3.计算欧氏距离

def cal_dist(a, b):
    """Return the Euclidean distance between two points.

    Args:
        a: Coordinates of the first point (any array-like of numbers).
        b: Coordinates of the second point, same length as ``a``.

    Returns:
        The straight-line distance between ``a`` and ``b``.
    """
    # Coerce to float so integer/unsigned inputs cannot wrap on subtraction.
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return np.sqrt(np.dot(diff, diff.T))

4.使用最小距离法计算簇之间的最小距离

def cal_cluster_min_dist(c1, c2):
    """Return the single-linkage (minimum) distance between two clusters.

    Args:
        c1: First cluster, an iterable of points.
        c2: Second cluster, an iterable of points.

    Returns:
        The smallest ``cal_dist`` value over every pair made of one point from
        ``c1`` and one point from ``c2``. If either cluster is empty there is
        no pair to measure and ``float("inf")`` is returned.
    """
    # An infinite default replaces the old 1e5 sentinel, which silently capped
    # the result whenever the real minimum was larger than 100000.
    return min(
        (cal_dist(vec1, vec2) for vec1 in c1 for vec2 in c2),
        default=float("inf"),
    )

5.使用平均距离法计算簇之间的平均距离

def cal_cluster_avg_dist(c1, c2):
    """Return the average-linkage distance between two clusters.

    Args:
        c1: First cluster, a sequence of points.
        c2: Second cluster, a sequence of points.

    Returns:
        The mean of ``cal_dist`` over every pair made of one point from ``c1``
        and one point from ``c2``. If either cluster is empty there is no pair
        to average and ``float("inf")`` is returned.
    """
    num = len(c1) * len(c2)
    if num == 0:
        return float("inf")
    sum_dist = sum(cal_dist(vec1, vec2) for vec1 in c1 for vec2 in c2)
    # Divide by the number of pairs: returning the raw sum would make large
    # clusters look farther apart than small ones.
    return sum_dist / num

6.获取簇之间的距离列表

def get_min_dist_list(data, method):
    """Build the symmetric matrix of pairwise distances between clusters.

    Args:
        data: List of clusters; each cluster is a list of points.
        method: ``"minDist"`` to use ``cal_cluster_min_dist`` or ``"avgDist"``
            to use ``cal_cluster_avg_dist``.

    Returns:
        A square list of lists where entry ``[i][j]`` is the distance between
        cluster ``i`` and cluster ``j``; the diagonal is left at 0.

    Raises:
        ValueError: If ``method`` is not one of the two supported names.
    """
    if method not in ("minDist", "avgDist"):
        # Previously an unknown method silently produced an all-zero matrix.
        raise ValueError("method must be 'minDist' or 'avgDist', got %r" % (method,))

    cluster_num = len(data)
    minDistList = [[0] * cluster_num for _ in range(cluster_num)]
    for i in range(cluster_num):
        # Only the upper triangle is computed; the value is mirrored below it.
        for j in range(i + 1, cluster_num):
            if method == "minDist":
                dist = cal_cluster_min_dist(data[i], data[j])
            else:
                dist = cal_cluster_avg_dist(data[i], data[j])
            minDistList[i][j] = dist
            minDistList[j][i] = dist

    return minDistList


# Backward-compatible alias for the original spelling of this function's name.
get_minDist_list = get_min_dist_list

7.寻找最小值

def find_min(minDistList):
    """Locate the closest pair of distinct clusters in a distance matrix.

    Args:
        minDistList: Square, symmetric matrix (list of lists) of cluster
            distances, as produced by the distance-list builder.

    Returns:
        A tuple ``(min_i, min_j, minDist)`` with ``min_i < min_j``: the indices
        of the closest pair and their distance. When the matrix holds fewer
        than two clusters, ``(0, 0, float("inf"))`` is returned.
    """
    min_i, min_j, minDist = 0, 0, float("inf")
    for i, row in enumerate(minDistList):
        # Scan only above the diagonal: this skips self-distances by position
        # rather than by value, so duplicate points (distance 0) are still
        # eligible, and each symmetric pair is inspected once.
        for j in range(i + 1, len(row)):
            if row[j] < minDist:
                min_i, min_j, minDist = i, j, row[j]

    return min_i, min_j, minDist

8.实现

def AGNES(data, k, method):
    """Cluster ``data`` bottom-up (AGNES) until only ``k`` clusters remain.

    Every sample starts as its own cluster. While more than ``k`` clusters
    exist, the distance matrix is rebuilt, the closest pair is merged into the
    lower-indexed cluster and the other one is removed.

    Args:
        data: Sequence of samples (points).
        k: Target number of clusters; must be at least 1. If ``k`` is not
            smaller than ``len(data)``, every sample stays in its own cluster.
        method: Linkage name forwarded to ``get_min_dist_list``
            (``"minDist"`` or ``"avgDist"``).

    Returns:
        A list of clusters, each a list of the original sample objects.

    Raises:
        ValueError: If ``k`` is smaller than 1.
        RuntimeError: If no pair of distinct clusters can be selected.
    """
    if k < 1:
        # With k < 1 the loop would end up merging the last cluster with
        # itself and deleting it, losing every sample.
        raise ValueError("k must be at least 1")

    C = [[sample] for sample in data]
    while len(C) > k:
        minDistList = get_min_dist_list(C, method)
        i, j, _ = find_min(minDistList)
        if i == j:
            # Merging a cluster with itself and then deleting it loses data.
            raise RuntimeError("no distinct pair of clusters could be selected for merging")
        C[i].extend(C[j])  # merge cluster j into cluster i
        del C[j]  # drop the absorbed cluster

    return C

9.程序入口

'''
程序入口
'''
if __name__ == "__main__":
    # Cluster the same samples into three groups with each linkage rule.
    C_min = AGNES(data, 3, 'minDist')
    C_avg = AGNES(data, 3, 'avgDist')

    # One subplot per linkage rule, stacked vertically.
    fig, ax = plt.subplots(nrows=2, ncols=1)
    panels = (
        (ax[0], C_min, "使用最小距离进行聚类"),
        (ax[1], C_avg, "使用平均距离进行聚类"),
    )
    for axis, clusters, title in panels:
        # Draw the three clusters in red, green and blue respectively.
        for idx, colour in enumerate(('r', 'g', 'b')):
            members = clusters[idx]
            axis.scatter([p[0] for p in members], [p[1] for p in members], c=colour)
        axis.set_title(title)

    fig.tight_layout()
    plt.show()

完整代码如下:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the samples (the 'number' column becomes the row index) and keep them
# as a plain list of per-sample value lists.
data = pd.read_csv("./西瓜数据集4.0.csv", index_col='number').values.tolist()

# Scatter-plot the raw, unclustered samples: first column against second column.
fig, ax = plt.subplots()
plt.scatter([row[0] for row in data], [row[1] for row in data])
plt.show()

'''
计算欧氏距离
'''

def calDist(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    Both arguments are converted to NumPy arrays; the result is the square
    root of the dot product of their difference with itself.
    """
    delta = np.array(a) - np.array(b)
    squared_length = np.dot(delta, delta.T)
    return np.sqrt(squared_length)


'''
计算簇之间的最小距离
'''


def cal_cluster_min_dist(c1, c2):
    """Return the single-linkage (minimum) distance between two clusters.

    Args:
        c1: First cluster, an iterable of points.
        c2: Second cluster, an iterable of points.

    Returns:
        The smallest ``calDist`` value over every pair made of one point from
        ``c1`` and one point from ``c2``. If either cluster is empty there is
        no pair to measure and ``float("inf")`` is returned.
    """
    # An infinite default replaces the old 1e5 sentinel, which silently capped
    # the result whenever the real minimum was larger than 100000.
    return min(
        (calDist(vec1, vec2) for vec1 in c1 for vec2 in c2),
        default=float("inf"),
    )


'''
计算簇之间的平均距离
'''


def cal_cluster_avg_dist(c1, c2):
    """Return the average-linkage distance between two clusters.

    Args:
        c1: First cluster, a sequence of points.
        c2: Second cluster, a sequence of points.

    Returns:
        The mean of ``calDist`` over every pair made of one point from ``c1``
        and one point from ``c2``. If either cluster is empty there is no pair
        to average and ``float("inf")`` is returned.
    """
    num = len(c1) * len(c2)
    if num == 0:
        return float("inf")
    sum_dist = sum(calDist(vec1, vec2) for vec1 in c1 for vec2 in c2)
    # Divide by the number of pairs: returning the raw sum would make large
    # clusters look farther apart than small ones.
    return sum_dist / num


'''
获取最小距离列表
'''


def get_min_dist_list(data, method):
    """Build the symmetric matrix of pairwise distances between clusters.

    Args:
        data: List of clusters; each cluster is a list of points.
        method: ``"minDist"`` to use ``cal_cluster_min_dist`` or ``"avgDist"``
            to use ``cal_cluster_avg_dist``.

    Returns:
        A square list of lists where entry ``[i][j]`` is the distance between
        cluster ``i`` and cluster ``j``; the diagonal is left at 0.

    Raises:
        ValueError: If ``method`` is not one of the two supported names.
    """
    if method not in ("minDist", "avgDist"):
        # Previously an unknown method silently produced an all-zero matrix.
        raise ValueError("method must be 'minDist' or 'avgDist', got %r" % (method,))

    cluster_num = len(data)
    minDistList = [[0] * cluster_num for _ in range(cluster_num)]
    for i in range(cluster_num):
        # Only the upper triangle is computed; the value is mirrored below it.
        for j in range(i + 1, cluster_num):
            if method == "minDist":
                dist = cal_cluster_min_dist(data[i], data[j])
            else:
                dist = cal_cluster_avg_dist(data[i], data[j])
            minDistList[i][j] = dist
            minDistList[j][i] = dist

    return minDistList


'''
寻找距离列表中的最小值,用于合并簇以及删除
'''


def find_min(minDistList):
    """Locate the closest pair of distinct clusters in a distance matrix.

    Args:
        minDistList: Square, symmetric matrix (list of lists) of cluster
            distances, as produced by ``get_min_dist_list``.

    Returns:
        A tuple ``(min_i, min_j, minDist)`` with ``min_i < min_j``: the indices
        of the closest pair and their distance. When the matrix holds fewer
        than two clusters, ``(0, 0, float("inf"))`` is returned.
    """
    min_i, min_j, minDist = 0, 0, float("inf")
    for i, row in enumerate(minDistList):
        # Scan only above the diagonal: this skips self-distances by position
        # rather than by value, so duplicate points (distance 0) are still
        # eligible, and each symmetric pair is inspected once.
        for j in range(i + 1, len(row)):
            if row[j] < minDist:
                min_i, min_j, minDist = i, j, row[j]

    return min_i, min_j, minDist


'''
AGNES算法实现
'''


def AGNES(data, k, method):
    """Cluster ``data`` bottom-up (AGNES) until only ``k`` clusters remain.

    Every sample starts as its own cluster. While more than ``k`` clusters
    exist, the distance matrix is rebuilt, the closest pair is merged into the
    lower-indexed cluster and the other one is removed.

    Args:
        data: Sequence of samples (points).
        k: Target number of clusters; must be at least 1. If ``k`` is not
            smaller than ``len(data)``, every sample stays in its own cluster.
        method: Linkage name forwarded to ``get_min_dist_list``
            (``"minDist"`` or ``"avgDist"``).

    Returns:
        A list of clusters, each a list of the original sample objects.

    Raises:
        ValueError: If ``k`` is smaller than 1.
        RuntimeError: If no pair of distinct clusters can be selected.
    """
    if k < 1:
        # With k < 1 the loop would end up merging the last cluster with
        # itself and deleting it, losing every sample.
        raise ValueError("k must be at least 1")

    C = [[sample] for sample in data]
    while len(C) > k:
        minDistList = get_min_dist_list(C, method)
        i, j, _ = find_min(minDistList)
        if i == j:
            # Merging a cluster with itself and then deleting it loses data.
            raise RuntimeError("no distinct pair of clusters could be selected for merging")
        C[i].extend(C[j])  # merge cluster j into cluster i
        del C[j]  # drop the absorbed cluster

    return C


'''
程序入口
'''
if __name__ == "__main__":
    # Cluster the same samples into three groups with each linkage rule.
    C_min = AGNES(data, 3, 'minDist')
    C_avg = AGNES(data, 3, 'avgDist')

    # One subplot per linkage rule, stacked vertically.
    fig, ax = plt.subplots(nrows=2, ncols=1)
    panels = (
        (ax[0], C_min, "使用最小距离进行聚类"),
        (ax[1], C_avg, "使用平均距离进行聚类"),
    )
    for axis, clusters, title in panels:
        # Draw the three clusters in red, green and blue respectively.
        for idx, colour in enumerate(('r', 'g', 'b')):
            members = clusters[idx]
            axis.scatter([p[0] for p in members], [p[1] for p in members], c=colour)
        axis.set_title(title)

    fig.tight_layout()
    plt.show()

在这里插入图片描述

  • 8
    点赞
  • 54
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

每天进步一点丶

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值