K-means的理解

写代码_不错哦

已于 2023-02-24 21:19:15 修改

阅读量349

点赞数

分类专栏：知识点整理文章标签：计算机视觉 Powered by 金山文档

于 2023-02-16 21:37:47 首次发布

本文链接：https://blog.csdn.net/shenjianhua005/article/details/129071254

版权

知识点整理专栏收录该内容

4 篇文章 0 订阅

订阅专栏

k-means是把一批数据按照一定的方法划分到预先设定好的K个簇中，使同一个簇内的数据具有一定的共性。

比如有一批数据，我想划分为K个簇，那步骤如下：

先随机选取K个数据的位置作为簇的簇心位置；

每一个数据依次与K个簇心计算欧氏距离（这里假定使用欧氏距离），并把该数据归入与它距离最近的簇心所在的簇；

接着根据每个簇内的数据计算此簇内所有数据的中心（方法不唯一），并把此中心更新为此簇的簇心；

重复步骤2-3，直到每个簇的簇心不改变或者变化在给定范围内则停止；

数据示例

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

# Palette indexed by cluster label. It is a NumPy array (not a list) so that an
# integer label array can index it directly; it has three entries, so labels
# used with it must be 0, 1 or 2.
color = np.array(['blue', 'black', 'gray'])

def plot_cluster(data, cls=None, cluster=None, title=''):
    """Scatter-plot the samples and, optionally, the current cluster centers.

    Args:
        data: 2-D array; columns 0 and 1 are used as the x / y coordinates.
        cls: optional integer label per sample, used to index the module-level
            ``color`` palette. When ``None`` every sample is drawn with
            ``color[0]``.
        cluster: optional 2-D array of cluster centers (columns 0 and 1 are
            x / y), drawn as red stars. When ``None`` no centers are drawn.
        title: title of the figure.
    """
    if cls is None:
        c = [color[0]] * data.shape[0]
    else:
        c = color[cls].tolist()
    plt.scatter(data[:, 0], data[:, 1], s=150, c=c)
    # ``cluster`` defaults to None, so it must be checked before it is indexed.
    if cluster is not None:
        plt.scatter(cluster[:, 0], cluster[:, 1], s=180, c='red', marker='*')
    plt.title(title)
    plt.show()
    plt.close()

def distance(data, cluster):
    """Return the squared Euclidean distance from every sample to every center.

    Args:
        data: array with one sample per row.
        cluster: array with one center per row (same number of columns).

    Returns:
        Array whose entry ``[i, j]`` is the sum of squared coordinate
        differences between ``data[i]`` and ``cluster[j]``. No square root is
        taken.
    """
    # A new axis on each operand lets broadcasting build every sample/center pair.
    diff = data[:, None] - cluster[None]
    squared = np.power(diff, 2)
    return squared.sum(axis=-1)

def k_means(data, k, tol=10):
    """Cluster ``data`` into ``k`` groups, plotting the state at every step.

    Centers are initialised with ``k`` distinct random samples and updated
    with the per-coordinate median of the samples assigned to them.

    Args:
        data: 2-D array with one sample per row.
        k: number of clusters.
        tol: stop as soon as at most this many samples changed cluster between
            two consecutive iterations (previously hard-coded as 10).

    Returns:
        Array holding the ``k`` final cluster centers.
    """
    n_samples = data.shape[0]

    # Pick k distinct samples as the initial centers.
    cluster = data[np.random.choice(n_samples, k, replace=False)]
    print(f'init cluster: {cluster}')

    # Previous assignment of every sample. -1 is never a valid cluster id, so
    # the first comparison cannot mistake "assigned to cluster 0" for
    # "unchanged" (which made k=1 stop before any update).
    last_loc = np.full(n_samples, -1)
    step = 0
    plot_cluster(data, cls=None, cluster=cluster, title=f'step: {step}')

    while True:
        # Distance from every sample to every current center.
        d = distance(data, cluster)
        # Each sample belongs to its closest center.
        current_loc = np.argmin(d, axis=-1)
        # Converged when nothing moved, or (after at least one update) when
        # only a handful of samples moved. Counting changes directly avoids
        # building np.ones(n_samples - 10), which fails for fewer than 10 samples.
        changed = np.count_nonzero(last_loc != current_loc)
        if changed == 0 or (step > 0 and changed <= tol):
            break
        # Move every center to the median of its members.
        for clu in range(k):
            members = data[current_loc == clu]
            if members.shape[0] == 0:
                # The median of an empty set is NaN; keep the old center instead.
                continue
            cluster[clu] = np.median(members, axis=0)
        last_loc = current_loc
        step += 1
        plot_cluster(data, cls=current_loc, cluster=cluster, title=f'step: {step}')
    print(f'step: {step}')
    print(f'final cluster: {cluster}')
    return cluster


def wh_iou(wh1, wh2):
    """Return the pairwise IoU matrix of boxes described only by (w, h).

    The intersection of two boxes is taken as ``min(w1, w2) * min(h1, h2)``,
    i.e. the boxes are compared by size only.

    Args:
        wh1: array of shape [N, 2] holding (w, h) per box.
        wh2: array of shape [M, 2] holding (w, h) per box.

    Returns:
        Array of shape [N, M] with ``inter / (area1 + area2 - inter)``.
    """
    wh1 = wh1[:, None]  # [N, 1, 2]
    wh2 = wh2[None]  # [1, M, 2]
    inter = np.minimum(wh1, wh2).prod(2)  # [N, M]

    # Areas must be reduced over the last (w, h) axis. Reducing over axis 1
    # multiplied across boxes instead and produced shapes that do not
    # broadcast against ``inter``.
    iou = inter / (wh1.prod(2) + wh2.prod(2) - inter)
    return iou

if __name__ == '__main__':
    # Draw normally distributed x values (centered on 1) and y values
    # (centered on 3), then pair them up as 2-D points.
    n_points = 180
    x1 = np.random.normal(loc=1, size=n_points)
    y1 = np.random.normal(loc=3, size=n_points)
    # One (x, y) point per row.
    data = np.concatenate((x1.reshape(-1, 1), y1.reshape(-1, 1)), axis=1)
    k_means(data, k=3)

图像数据示例

图像数据的处理其实与上面的数据示例一样：先取出图像上每个标注框的宽和高，分别除以对应图片的宽和高，得到归一化的标注框尺寸；然后把每张图片宽、高中较大的一边缩放到统一的尺寸（即传给网络训练的图片尺寸），较小的一边按相同比例缩放（因此不一定等于该统一尺寸）；最后用归一化的标注框宽高乘以缩放后的图片宽高，得到用于聚类的标注框尺寸。这样做的原因引用参考文章中的说法：“假如在原始尺寸图片上进行聚类，比如1280*1280，得到聚类anchor为100*100，但是输入网络图片尺寸为640*640，那么对应的anchor就不适用了”

以下代码完全是照搬参考地址的博主的

import random
import numpy as np
import os
from tqdm import tqdm
from lxml import etree


class VOCDataSet(object):
    """Collect image sizes and normalised box sizes from Pascal VOC annotations.

    The constructor resolves the annotation XML paths listed in an
    ``ImageSets/Main`` split file; ``get_info`` then parses every XML file.
    """

    def __init__(self, voc_root, year="2012", txt_name: str = "train.txt"):
        """Build the list of annotation files for one split.

        Args:
            voc_root: directory that contains ``VOCdevkit``.
            year: dataset year, ``"2007"`` or ``"2012"``.
            txt_name: split file inside ``ImageSets/Main`` (one image id per line).

        Raises:
            AssertionError: if ``year`` is unsupported, the split file is
                missing or empty, or a listed annotation file does not exist.
        """
        assert year in ["2007", "2012"], "year must be in ['2007', '2012']"
        self.root = os.path.join(voc_root, "VOCdevkit", f"VOC{year}")
        self.annotations_root = os.path.join(self.root, "Annotations")

        # read train.txt or val.txt file
        txt_path = os.path.join(self.root, "ImageSets", "Main", txt_name)
        assert os.path.exists(txt_path), "not found {} file.".format(txt_name)

        with open(txt_path) as read:
            image_ids = [line.strip() for line in read]
        # Blank lines are ignored; every remaining id maps to <id>.xml.
        self.xml_list = [os.path.join(self.annotations_root, image_id + ".xml")
                         for image_id in image_ids if image_id]

        # check file
        assert len(self.xml_list) > 0, "in '{}' file does not find any information.".format(txt_path)
        for xml_path in self.xml_list:
            assert os.path.exists(xml_path), "not found '{}' file.".format(xml_path)

    def __len__(self):
        """Return the number of annotation files in the split."""
        return len(self.xml_list)

    def parse_xml_to_dict(self, xml):
        """Recursively convert an XML element into nested dictionaries.

        Modelled on TensorFlow's ``recursive_parse_xml_to_dict``.

        Args:
            xml: xml tree obtained by parsing XML file contents using lxml.etree
        Returns:
            Python dictionary holding XML contents. Leaf elements map their tag
            to their text; ``object`` children are gathered into a list.
        """

        if len(xml) == 0:  # leaf element: return its tag and text directly
            return {xml.tag: xml.text}

        result = {}
        for child in xml:
            child_result = self.parse_xml_to_dict(child)  # recurse into the child
            if child.tag != 'object':
                result[child.tag] = child_result[child.tag]
            else:
                # There may be several <object> elements, so keep them in a list.
                if child.tag not in result:
                    result[child.tag] = []
                result[child.tag].append(child_result[child.tag])
        return {xml.tag: result}

    def get_info(self):
        """Parse every annotation file of the split.

        Returns:
            A pair ``(im_wh_list, boxes_wh_list)``. ``im_wh_list[i]`` is
            ``[width, height]`` of an image and ``boxes_wh_list[i]`` is the list
            of its boxes as ``[w / width, h / height]``. Images without any box
            are left out of both lists.
        """
        im_wh_list = []
        boxes_wh_list = []
        for xml_path in tqdm(self.xml_list, desc="read data info."):
            # Read raw bytes so the XML parser decodes the file itself. Passing
            # decoded text depends on the locale encoding, and lxml rejects str
            # input that carries an encoding declaration.
            with open(xml_path, "rb") as fid:
                xml_bytes = fid.read()
            xml = etree.fromstring(xml_bytes)
            data = self.parse_xml_to_dict(xml)["annotation"]
            im_height = int(data["size"]["height"])
            im_width = int(data["size"]["width"])

            # An annotation without <object> has no "object" key at all, so use
            # .get() to reach the "no boxes" skip below instead of a KeyError.
            wh = []
            for obj in data.get("object", []):
                xmin = float(obj["bndbox"]["xmin"])
                xmax = float(obj["bndbox"]["xmax"])
                ymin = float(obj["bndbox"]["ymin"])
                ymax = float(obj["bndbox"]["ymax"])
                wh.append([(xmax - xmin) / im_width, (ymax - ymin) / im_height])

            if len(wh) == 0:
                continue

            im_wh_list.append([im_width, im_height])
            boxes_wh_list.append(wh)

        return im_wh_list, boxes_wh_list


def wh_iou(wh1, wh2):
    """Return the N x M IoU matrix for boxes given only by width and height.

    The intersection of two boxes is ``min(w1, w2) * min(h1, h2)``, so boxes
    are compared by size only.

    Args:
        wh1: array of shape [N, 2].
        wh2: array of shape [M, 2].
    """
    first = wh1[:, np.newaxis]  # [N,1,2]
    second = wh2[np.newaxis]  # [1,M,2]
    overlap = np.minimum(first, second).prod(axis=2)  # [N,M]
    area_first = first.prod(axis=2)  # [N,1]
    area_second = second.prod(axis=2)  # [1,M]
    # iou = inter / (area1 + area2 - inter)
    return overlap / (area_first + area_second - overlap)


def k_means(boxes, k, dist=np.median):
    """
    yolo k-means methods
    refer: https://github.com/qqwweee/keras-yolo3/blob/master/kmeans.py

    Boxes are assigned to the cluster with the smallest ``1 - IoU`` distance
    and every cluster is then moved to ``dist`` of its members, until the
    assignment stops changing.

    Args:
        boxes: array of box sizes to cluster, one (w, h) row per box
        k: number of clusters
        dist: reduction used to update a cluster from its members (median by
            default; the original author notes it works slightly better than
            the mean)

    Returns:
        Array with the ``k`` cluster sizes.
    """
    box_number = boxes.shape[0]
    # -1 is never a valid cluster id, so the first assignment always counts as
    # a change. Starting from zeros made k=1 (or any all-zero first assignment)
    # return the random initial boxes without a single update.
    last_nearest = np.full((box_number,), -1)
    # np.random.seed(0)  # fix the random seed for reproducible anchors

    # init k clusters
    clusters = boxes[np.random.choice(box_number, k, replace=False)]

    while True:
        distances = 1 - wh_iou(boxes, clusters)
        current_nearest = np.argmin(distances, axis=1)
        if (last_nearest == current_nearest).all():
            break  # clusters won't change
        for cluster in range(k):
            members = boxes[current_nearest == cluster]
            if len(members) == 0:
                # Happens e.g. when two identical boxes were drawn as centers.
                # Reducing an empty set would turn this anchor into NaN, so
                # keep its previous value.
                continue
            # update clusters
            clusters[cluster] = dist(members, axis=0)

        last_nearest = current_nearest

    return clusters

def anchor_fitness(k: np.ndarray, wh: np.ndarray, thr: float):  # mutation fitness
    """Score how well the anchors ``k`` match the box sizes ``wh``.

    For every box/anchor pair the width and height ratios are folded into
    (0, 1] with ``min(r, 1 / r)`` and the smaller of the two is kept. Each box
    is then represented by its best anchor.

    Args:
        k: anchors, one (w, h) row per anchor.
        wh: box sizes, one (w, h) row per box.
        thr: a box counts as matched when its best score exceeds this value.

    Returns:
        ``(fitness, bpr)``: the mean over all boxes of the best score with
        unmatched boxes contributing 0, and the fraction of matched boxes
        (best possible recall).
    """
    ratio = wh[:, None] / k[None]
    # ratio metric (an IoU metric via wh_iou(wh, k) is the alternative)
    score = np.minimum(ratio, 1. / ratio).min(2)
    best = score.max(1)
    matched = (best > thr).astype(np.float32)
    fitness = (best * matched).mean()
    recall = matched.mean()
    return fitness, recall


def main(img_size=512, n=9, thr=0.25, gen=1000,
         voc_root=r"D:\data\voc", year="2012", txt_name="train.txt"):
    """Compute ``n`` anchors for a VOC split with k-means plus random mutation.

    Prints the k-means anchors and the evolved anchors, each followed by the
    fitness and best possible recall of the anchors shown.

    Args:
        img_size: size the longer image side is scaled to before clustering.
        n: number of anchors.
        thr: match threshold passed to ``anchor_fitness``.
        gen: number of mutation rounds.
        voc_root: directory containing ``VOCdevkit``.
        year: VOC year passed to ``VOCDataSet``.
        txt_name: split file passed to ``VOCDataSet``.
    """
    # Read the (w, h) of every image and the normalised (w, h) of its boxes.
    dataset = VOCDataSet(voc_root=voc_root, year=year, txt_name=txt_name)
    im_wh, boxes_wh = dataset.get_info()

    # Scale every image so that its longer side equals img_size, then turn the
    # normalised box sizes into pixels of the scaled image.
    im_wh = np.array(im_wh, dtype=np.float32)
    shapes = img_size * im_wh / im_wh.max(1, keepdims=True)
    wh0 = np.concatenate([l * s for s, l in zip(shapes, boxes_wh)])  # wh

    # Filter out tiny boxes
    small = (wh0 < 3.0).any(1).sum()
    if small:
        print(f'WARNING: Extremely small objects found. {small} of {len(wh0)} labels are < 3 pixels in size.')
    # Keeps a box when at least one of its sides is >= 2 pixels.
    # NOTE(review): the previous comment claimed both sides must be >= 2, which
    # would be .all(1) -- confirm which filter is intended.
    wh = wh0[(wh0 >= 2.0).any(1)]

    # Kmeans calculation
    k = k_means(wh, n)

    # Sort by area
    k = k[np.argsort(k.prod(1))]  # sort small to large
    f, bpr = anchor_fitness(k, wh, thr)
    print("kmeans: " + " ".join([f"[{int(i[0])}, {int(i[1])}]" for i in k]))
    print(f"fitness: {f:.5f}, best possible recall: {bpr:.5f}")

    # Evolve
    # Mutate the k-means result at random and keep a mutation only when it
    # improves the fitness. ``f`` / ``bpr`` already describe the current ``k``.
    npr = np.random
    sh, mp, s = k.shape, 0.9, 0.1  # anchor array shape, mutation prob, sigma
    pbar = tqdm(range(gen), desc=f'Evolving anchors with Genetic Algorithm:')  # progress bar
    for _ in pbar:
        v = np.ones(sh)
        while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * random.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)
        kg = (k.copy() * v).clip(min=2.0)
        fg, bpr_g = anchor_fitness(kg, wh, thr)
        if fg > f:
            # Keep the recall together with the accepted anchors; otherwise the
            # final report would show the recall of the last candidate tried.
            f, bpr, k = fg, bpr_g, kg.copy()
            pbar.desc = f'Evolving anchors with Genetic Algorithm: fitness = {f:.4f}'

    # Sort by area
    k = k[np.argsort(k.prod(1))]  # sort small to large
    print("genetic: " + " ".join([f"[{int(i[0])}, {int(i[1])}]" for i in k]))
    print(f"fitness: {f:.5f}, best possible recall: {bpr:.5f}")


# Run the anchor computation with its default arguments when executed as a script.
if __name__ == "__main__":
    main()

上述代码使用 1-IOU 作为距离，而非欧氏距离，这样聚类的效果比较好。

另外，代码中的 anchor_fitness 方法用于计算所有 box 相对于聚类得到的 k 个 anchor 的适应度（fitness）和 best possible recall。具体做法是：把每个 box 的宽和高分别除以 k 个 anchor 对应的宽和高得到比值 r，再取 min(r, 1/r)（因为有些比值可能大于1；宽高完全相等时比值等于1），并在宽、高两个比值中取较小者，作为该 box 与该 anchor 的匹配程度；然后对每个 box 取它与 k 个 anchor 匹配程度中的最大值 best。适应度是把 best 中大于给定阈值 thr 的数值求和，再除以 box 总数；best possible recall 的计算方式类似，只不过是用 best 中大于 thr 的个数除以 box 总数。

def anchor_fitness(k: np.ndarray, wh: np.ndarray, thr: float):  # mutation fitness
    """Score how well the anchors ``k`` match the box sizes ``wh``.

    For every box/anchor pair the width and height ratios are folded into
    (0, 1] with ``min(r, 1 / r)`` and the smaller of the two is kept. Each box
    is then represented by its best anchor.

    Args:
        k: anchors, one (w, h) row per anchor.
        wh: box sizes, one (w, h) row per box.
        thr: a box counts as matched when its best score exceeds this value.

    Returns:
        ``(fitness, bpr)``: the mean over all boxes of the best score with
        unmatched boxes contributing 0, and the fraction of matched boxes
        (best possible recall).
    """
    ratio = wh[:, None] / k[None]
    # ratio metric (an IoU metric via wh_iou(wh, k) is the alternative)
    score = np.minimum(ratio, 1. / ratio).min(2)
    best = score.max(1)
    matched = (best > thr).astype(np.float32)
    fitness = (best * matched).mean()
    recall = matched.mean()
    return fitness, recall

Yolov5中还对聚类得到的anchor进行遗传变异。遗传变异的具体原理我没有完全弄明白，但从代码来看，大致意思是：生成一组与 k 个 anchor 形状相同、取值被限制在一定范围内（0.3~3.0）的随机系数，用它乘以聚类得到的anchor，再对相乘的结果计算 anchor_fitness；如果其适应度大于之前的最好值，就把相乘得到的anchor更新为新的anchor。如此重复若干代，这就是遗传变异的方法。

# Evolve
# Excerpt of the mutation loop: starting from the k-means anchors ``k``, draw
# random multiplicative factors and keep a mutated anchor set only when its
# fitness improves. ``k``, ``wh``, ``thr`` and ``gen`` come from the enclosing
# code and are not defined in this excerpt.
npr = np.random
f, sh, mp, s = anchor_fitness(k, wh, thr)[0], k.shape, 0.9, 0.1  # fitness, anchor array shape, mutation prob, sigma
pbar = tqdm(range(gen), desc=f'Evolving anchors with Genetic Algorithm:')  # progress bar
for _ in pbar:
    v = np.ones(sh)
    while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
        # Each factor is 1 plus scaled noise (or exactly 1 where the random
        # mask is False), clipped to [0.3, 3.0].
        v = ((npr.random(sh) < mp) * random.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)
    # Candidate anchors; no side may drop below 2.0.
    kg = (k.copy() * v).clip(min=2.0)
    fg, bpr = anchor_fitness(kg, wh, thr)
    if fg > f:
        # Accept the candidate only when it beats the best fitness so far.
        f, k = fg, kg.copy()
        pbar.desc = f'Evolving anchors with Genetic Algorithm: fitness = {f:.4f}'

参考：