Understanding K-means

k-means partitions a batch of data, by some procedure, into a preset number of clusters, so that the data within each cluster share certain commonalities.

For example, given a batch of data that we want to partition into K clusters, the steps are as follows:

  1. Randomly pick K data points as the initial cluster centers;

  2. For each data point, compute its distance to each of the K cluster centers (assuming Euclidean distance is the chosen metric); assign each point to the cluster whose center is nearest;

  3. For each cluster, compute the center of all the data it now contains (the method is not unique) and update the cluster center to that value;

  4. Repeat steps 2-3 until the cluster centers stop changing, or change by less than a given tolerance (a library version of this loop is sketched right after this list).
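Before the hand-rolled version below, a minimal sanity check: scikit-learn's KMeans implements exactly this loop (assuming scikit-learn is available; the data here is made up):

import numpy as np
from sklearn.cluster import KMeans

data = np.random.normal(size=(180, 2))   # made-up 2-D points
km = KMeans(n_clusters=3, n_init=10).fit(data)
print(km.cluster_centers_)               # the K cluster centers
print(km.labels_[:10])                   # cluster index assigned to each point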

Data example

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

color = np.array(['blue', 'black', 'gray'])

def plot_cluster(data, cls=None, cluster=None, title=''):
    if cls is None:
        c = [color[0]] * data.shape[0]
    else:
        c = color[cls].tolist()
    plt.scatter(data[:, 0], data[:, 1], s=150, c=c)
    plt.scatter(cluster[:, 0], cluster[:, 1], s=180, c='red', marker='*')
    plt.title(title)
    plt.show()
    plt.close()

def distance(data, cluster):
    # squared Euclidean distance via broadcasting:
    # data[:, None] is [N,1,2], cluster[None] is [1,K,2] -> result is [N,K]
    # (the sqrt is omitted because it does not change the argmin)
    return np.sum(np.power(data[:, None] - cluster[None], 2), axis=-1)

def k_means(data, k):

    # randomly pick K data points as the initial cluster centers
    cluster = data[np.random.choice(data.shape[0], k, replace=False)]
    print(f'init cluster: {cluster}')

    # 1-D array with the same length as data.shape[0], recording which cluster each point was assigned to on the previous pass
    last_loc = np.zeros(data.shape[0])
    step = 0
    plot_cluster(data, cls=None, cluster=cluster, title=f'step: {step}')

    while True:
        # distance between every point and every current cluster center (squared Euclidean here; the anchor example uses 1-IoU)
        d = distance(data, cluster)
        # assign each point to the cluster center with the smallest distance
        current_loc = np.argmin(d, axis=-1)
        # stop when the assignments are unchanged, or when at most 10 points changed
        # (without this tolerance, the 1-wh_iou variant can run hundreds of iterations without converging)
        if (last_loc == current_loc).all() or np.sum(last_loc == current_loc) >= data.shape[0] - 10:
            break
        # the median of the data in each cluster becomes that cluster's new center
        for clu in range(k):
            cluster[clu] = np.median(data[current_loc == clu], axis=0)
        last_loc = current_loc
        step += 1
        plot_cluster(data, cls=current_loc, cluster=cluster, title=f'step: {step}')
    print(f'step: {step}')
    print(f'final cluster: {cluster}')
    return cluster


def wh_iou(wh1, wh2):
    # IoU between wh pairs, treating boxes as if they share a top-left corner
    wh1 = wh1[:, None]  # [N,1,2]
    wh2 = wh2[None]     # [1,M,2]
    inter = np.minimum(wh1, wh2).prod(2)  # [N,M]
    iou = inter / (wh1.prod(2) + wh2.prod(2) - inter)  # areas via prod over the wh axis
    return iou

if __name__ == '__main__':
    # generate points from a preset normal distribution, then pair them up as (x, y) coordinates
    x1 = np.random.normal(loc=1, size=180)
    y1 = np.random.normal(loc=3, size=180)
    # stack into coordinate points
    data = np.concatenate([x1[:, None], y1[:, None]], axis=-1)
    k_means(data, k=3)
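As a quick shape check of the broadcasting trick inside distance (made-up data): expanding data to [N,1,2] and cluster to [1,K,2] makes the subtraction produce an [N,K,2] array, which sums down to the [N,K] distance matrix:

import numpy as np

data = np.random.rand(5, 2)      # N=5 points
cluster = np.random.rand(3, 2)   # K=3 centers
d = np.sum((data[:, None] - cluster[None]) ** 2, axis=-1)
print(d.shape)                   # (5, 3): one squared distance per point/center pair
print(np.argmin(d, axis=-1))     # nearest center index for each of the 5 points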

Image data example

Image data works just like the data example above: take the width and height of each annotated box and divide by the width and height of its image, so every box's wh is normalized. Then scale the longer side of each image to one unified size (the image size fed to the network for training), and scale the shorter side by the same factor, so it does not end up at the unified size. Quoting the explanation from the reference article: "If you cluster on the original image size, say 1280*1280, and get a clustered anchor of 100*100, but the network input size is 640*640, then that anchor no longer applies."
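A minimal sketch of that rescaling with made-up sizes, mirroring the shapes/wh0 computation in the code below: the longer side is scaled to img_size, the shorter side by the same factor, and the normalized box wh is multiplied by the resulting shape:

import numpy as np

img_size = 640
im_wh = np.array([[1280., 960.]])      # one 1280x960 image
boxes_wh = [np.array([[0.1, 0.2]])]    # normalized box wh for that image

shapes = img_size * im_wh / im_wh.max(1, keepdims=True)
print(shapes)                           # [[640. 480.]]: longer side -> 640
wh = np.concatenate([l * s for s, l in zip(shapes, boxes_wh)])
print(wh)                               # [[64. 96.]]: box wh in resized pixels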

The code below is copied verbatim from the blogger at the reference link.

import random
import numpy as np
import os
from tqdm import tqdm
from lxml import etree


class VOCDataSet(object):
    def __init__(self, voc_root, year="2012", txt_name: str = "train.txt"):
        assert year in ["2007", "2012"], "year must be in ['2007', '2012']"
        self.root = os.path.join(voc_root, "VOCdevkit", f"VOC{year}")
        self.annotations_root = os.path.join(self.root, "Annotations")

        # read train.txt or val.txt file
        txt_path = os.path.join(self.root, "ImageSets", "Main", txt_name)
        assert os.path.exists(txt_path), "not found {} file.".format(txt_name)

        with open(txt_path) as read:
            self.xml_list = [os.path.join(self.annotations_root, line.strip() + ".xml")
                             for line in read.readlines() if len(line.strip()) > 0]

        # check file
        assert len(self.xml_list) > 0, "in '{}' file does not find any information.".format(txt_path)
        for xml_path in self.xml_list:
            assert os.path.exists(xml_path), "not found '{}' file.".format(xml_path)

    def __len__(self):
        return len(self.xml_list)

    def parse_xml_to_dict(self, xml):
        """
        Parse an XML tree into a dict, following tensorflow's recursive_parse_xml_to_dict.
        Args:
            xml: xml tree obtained by parsing XML file contents using lxml.etree
        Returns:
            Python dictionary holding XML contents.
        """

        if len(xml) == 0:  # reached a leaf node; return its tag and text directly
            return {xml.tag: xml.text}

        result = {}
        for child in xml:
            child_result = self.parse_xml_to_dict(child)  # recurse into child tags
            if child.tag != 'object':
                result[child.tag] = child_result[child.tag]
            else:
                if child.tag not in result:  # there may be multiple 'object' tags, so collect them in a list
                    result[child.tag] = []
                result[child.tag].append(child_result[child.tag])
        return {xml.tag: result}

    def get_info(self):
        im_wh_list = []
        boxes_wh_list = []
        for xml_path in tqdm(self.xml_list, desc="read data info."):
            # read xml
            with open(xml_path) as fid:
                xml_str = fid.read()
            xml = etree.fromstring(xml_str)
            data = self.parse_xml_to_dict(xml)["annotation"]
            im_height = int(data["size"]["height"])
            im_width = int(data["size"]["width"])

            wh = []
            for obj in data["object"]:
                xmin = float(obj["bndbox"]["xmin"])
                xmax = float(obj["bndbox"]["xmax"])
                ymin = float(obj["bndbox"]["ymin"])
                ymax = float(obj["bndbox"]["ymax"])
                wh.append([(xmax - xmin) / im_width, (ymax - ymin) / im_height])

            if len(wh) == 0:
                continue

            im_wh_list.append([im_width, im_height])
            boxes_wh_list.append(wh)

        return im_wh_list, boxes_wh_list


def wh_iou(wh1, wh2):
    # Returns the nxm IoU matrix. wh1 is nx2, wh2 is mx2
    wh1 = wh1[:, None]  # [N,1,2]
    wh2 = wh2[None]  # [1,M,2]
    inter = np.minimum(wh1, wh2).prod(2)  # [N,M]
    return inter / (wh1.prod(2) + wh2.prod(2) - inter)  # iou = inter / (area1 + area2 - inter)


def k_means(boxes, k, dist=np.median):
    """
    yolo k-means methods
    refer: https://github.com/qqwweee/keras-yolo3/blob/master/kmeans.py
    Args:
        boxes: bboxes to cluster
        k: number of clusters
        dist: method used to update cluster coordinates (median by default, slightly better than the mean)
    """
    box_number = boxes.shape[0]
    last_nearest = np.zeros((box_number,))
    # np.random.seed(0)  # fix the random seed

    # init k clusters
    clusters = boxes[np.random.choice(box_number, k, replace=False)]

    while True:
        distances = 1 - wh_iou(boxes, clusters)
        current_nearest = np.argmin(distances, axis=1)
        if (last_nearest == current_nearest).all():
            break  # clusters won't change
        for cluster in range(k):
            # update clusters
            clusters[cluster] = dist(boxes[current_nearest == cluster], axis=0)

        last_nearest = current_nearest

    return clusters

def anchor_fitness(k: np.ndarray, wh: np.ndarray, thr: float):  # mutation fitness
    r = wh[:, None] / k[None]
    x = np.minimum(r, 1. / r).min(2)  # ratio metric
    # x = wh_iou(wh, k)  # iou metric
    best = x.max(1)
    f = (best * (best > thr).astype(np.float32)).mean()  # fitness
    bpr = (best > thr).astype(np.float32).mean()  # best possible recall
    return f, bpr


def main(img_size=512, n=9, thr=0.25, gen=1000):
    # read the wh of every image and the wh of its bboxes from the dataset
    dataset = VOCDataSet(voc_root=r"D:\data\voc", year="2012", txt_name="train.txt")
    im_wh, boxes_wh = dataset.get_info()

    # scale the longer side of each image to img_size
    im_wh = np.array(im_wh, dtype=np.float32)
    shapes = img_size * im_wh / im_wh.max(1, keepdims=True)
    wh0 = np.concatenate([l * s for s, l in zip(shapes, boxes_wh)])  # wh

    # Filter out extremely small objects
    i = (wh0 < 3.0).any(1).sum()
    if i:
        print(f'WARNING: Extremely small objects found. {i} of {len(wh0)} labels are < 3 pixels in size.')
    wh = wh0[(wh0 >= 2.0).any(1)]  # keep boxes with at least one side >= 2 pixels

    # Kmeans calculation
    # print(f'Running kmeans for {n} anchors on {len(wh)} points...')
    # s = wh.std(0)  # sigmas for whitening
    # k, dist = kmeans(wh / s, n, iter=30)  # points, mean distance
    # assert len(k) == n, print(f'ERROR: scipy.cluster.vq.kmeans requested {n} points but returned only {len(k)}')
    # k *= s
    k = k_means(wh, n)

    # sort by area
    k = k[np.argsort(k.prod(1))]  # sort small to large
    f, bpr = anchor_fitness(k, wh, thr)
    print("kmeans: " + " ".join([f"[{int(i[0])}, {int(i[1])}]" for i in k]))
    print(f"fitness: {f:.5f}, best possible recall: {bpr:.5f}")

    # Evolve
    # genetic algorithm (mutate on top of the kmeans result)
    npr = np.random
    f, sh, mp, s = anchor_fitness(k, wh, thr)[0], k.shape, 0.9, 0.1  # fitness, anchor shape, mutation prob, sigma
    pbar = tqdm(range(gen), desc=f'Evolving anchors with Genetic Algorithm:')  # progress bar
    for _ in pbar:
        v = np.ones(sh)
        while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * random.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)
        kg = (k.copy() * v).clip(min=2.0)
        fg, bpr = anchor_fitness(kg, wh, thr)
        if fg > f:
            f, k = fg, kg.copy()
            pbar.desc = f'Evolving anchors with Genetic Algorithm: fitness = {f:.4f}'

    # sort by area
    k = k[np.argsort(k.prod(1))]  # sort small to large
    print("genetic: " + " ".join([f"[{int(i[0])}, {int(i[1])}]" for i in k]))
    print(f"fitness: {f:.5f}, best possible recall: {bpr:.5f}")


if __name__ == "__main__":
    main()

The code above uses 1-IoU as the distance rather than Euclidean distance, which gives better results for anchor clustering.
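To make the 1-IoU distance concrete, a tiny made-up example (reusing wh_iou from above): an identical wh pair gets distance 0, and the more two shapes differ, the closer the distance gets to 1:

import numpy as np

def wh_iou(wh1, wh2):  # same as above
    wh1 = wh1[:, None]
    wh2 = wh2[None]
    inter = np.minimum(wh1, wh2).prod(2)
    return inter / (wh1.prod(2) + wh2.prod(2) - inter)

boxes = np.array([[10., 20.], [100., 50.]])
anchors = np.array([[10., 20.], [90., 60.]])
print(1 - wh_iou(boxes, anchors))
# approximately [[0.    0.963]
#                [0.96  0.237]]: small distance = similar shape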

In the code, the anchor_fitness function computes the fitness and the best possible recall of the boxes against the k anchors obtained from clustering. Fitness works like this: divide each box's width and height by each anchor's width and height respectively, take min(r, 1/r) for each ratio (some ratios exceed 1; a ratio of exactly 1 means the sides match exactly), keep the smaller of the two per-pair ratios, then for each box take the maximum over the k anchors. The fitness is the mean over all boxes of these best ratios, with any ratio below the threshold thr zeroed out. Best possible recall (bpr) is computed the same way, except it averages the count above thr rather than the values: it is the fraction of boxes whose best ratio exceeds thr.

def anchor_fitness(k: np.ndarray, wh: np.ndarray, thr: float):  # mutation fitness
    r = wh[:, None] / k[None]
    x = np.minimum(r, 1. / r).min(2)  # ratio metric
    # x = wh_iou(wh, k)  # iou metric
    best = x.max(1)
    f = (best * (best > thr).astype(np.float32)).mean()  # fitness
    bpr = (best > thr).astype(np.float32).mean()  # best possible recall
    return f, bpr
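A worked example of the ratio metric with made-up numbers, one box against two anchors:

import numpy as np

box = np.array([[30., 60.]])                  # wh of a single box
anchors = np.array([[30., 60.], [60., 30.]])  # k = 2 anchors

r = box[:, None] / anchors[None]              # [[[1. 1.], [0.5 2.]]]
x = np.minimum(r, 1. / r).min(2)              # [[1.  0.5]]: worst side ratio per anchor
best = x.max(1)                               # [1.]: the first anchor fits perfectly
print(best, (best > 0.25).mean())             # per-box fitness term and bpr with thr=0.25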

Yolov5 additionally applies genetic mutation to the anchors found by clustering. I haven't fully worked out the theory behind the method, but from the code the rough idea is: draw random factors with the same shape as the k anchors, constrained to a fixed range; multiply the clustered anchors by these factors; recompute anchor_fitness on the result; and if the new fitness beats the previous best, keep the mutated anchors as the new ones. That is the genetic mutation method.

# Evolve
# genetic algorithm (mutate on top of the kmeans result)
npr = np.random
f, sh, mp, s = anchor_fitness(k, wh, thr)[0], k.shape, 0.9, 0.1  # fitness, anchor shape, mutation prob, sigma
pbar = tqdm(range(gen), desc=f'Evolving anchors with Genetic Algorithm:')  # progress bar
for _ in pbar:
    v = np.ones(sh)
    while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
        v = ((npr.random(sh) < mp) * random.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)
    kg = (k.copy() * v).clip(min=2.0)
    fg, bpr = anchor_fitness(kg, wh, thr)
    if fg > f:
        f, k = fg, kg.copy()
        pbar.desc = f'Evolving anchors with Genetic Algorithm: fitness = {f:.4f}'
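A single mutation step in isolation, on made-up anchors, to show what the factor v looks like: with mp=0.9 and s=0.1 most entries land near 1, and everything is clipped to [0.3, 3.0]:

import random
import numpy as np

k = np.array([[10., 13.], [33., 23.], [62., 45.]])  # pretend kmeans output
mp, s = 0.9, 0.1                                     # mutation probability, sigma

v = np.ones(k.shape)
while (v == 1).all():  # retry until at least one factor actually changes
    v = ((np.random.random(k.shape) < mp) * random.random()
         * np.random.randn(*k.shape) * s + 1).clip(0.3, 3.0)
kg = (k * v).clip(min=2.0)  # candidate anchors; keep every side >= 2 pixels
print(np.round(v, 3))
print(np.round(kg, 1))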

References:

  1. https://blog.csdn.net/qq_38253797/article/details/119713706

A side note on understanding array dimensions:

When we operate along one axis of a multi-dimensional array, it is easy to lose track of the result's shape. Just remember: the axis you operate over disappears, and the result's shape consists of the remaining axes.

For example, for an array of shape (3,4,5), reducing over axis 0 gives a result of shape (4,5).
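A quick check of that rule, using sums as the reduction:

import numpy as np

a = np.zeros((3, 4, 5))
print(a.sum(axis=0).shape)   # (4, 5): axis 0 disappears
print(a.sum(axis=1).shape)   # (3, 5): axis 1 disappears
print(a.min(axis=-1).shape)  # (3, 4): the last axis disappears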
