k-means 是通过一定的方法,把一批数据划分到预先设定好的 K 个簇中,使同一个簇内的数据具有一定的共性。
比如有一批数据,我想划分为K个簇,那步骤如下:
1. 先随机选取K个数据的位置作为K个簇的簇心位置;
2. 每一个数据依次与K个簇心计算欧式距离(假定用欧式距离来计算),并把该数据归入距离最小的簇心所在的簇;
3. 接着根据每个簇内的数据计算此簇内所有数据的中心(方法不唯一,例如均值或中位数),并把此中心更新为此簇的簇心;
4. 重复步骤2-3,直到每个簇的簇心不再改变,或者变化在给定范围内,则停止。
数据示例
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
# Palette indexed by cluster id; it only has three entries, so at most three
# clusters can be coloured.
color = np.array(['blue', 'black', 'gray'])


def plot_cluster(data, cls=None, cluster=None, title=''):
    """Scatter-plot the samples and, when given, the cluster centers.

    Args:
        data: 2-D array; column 0 is drawn on the x axis, column 1 on the y axis.
        cls: optional integer array holding one cluster index per sample, used
            to look up each point's colour in ``color``. When ``None`` every
            point is drawn with ``color[0]``.
        cluster: optional 2-D array of cluster centers, drawn as red stars.
            When ``None`` no centers are drawn.
        title: title of the figure.
    """
    if cls is None:
        c = [color[0]] * data.shape[0]
    else:
        c = color[cls].tolist()
    plt.scatter(data[:, 0], data[:, 1], s=150, c=c)
    # The signature advertises cluster=None, so indexing it unconditionally
    # would raise TypeError; only draw the centers when some were passed.
    if cluster is not None:
        plt.scatter(cluster[:, 0], cluster[:, 1], s=180, c='red', marker='*')
    plt.title(title)
    plt.show()
    plt.close()
def distance(data, cluster):
    """Return the squared Euclidean distance from every sample to every center.

    No square root is taken; the result is only used to find the nearest
    center, which the squared distance ranks identically.

    Args:
        data: array whose first axis indexes the samples.
        cluster: array whose first axis indexes the cluster centers.

    Returns:
        For 2-D inputs of shape (N, D) and (K, D), an (N, K) array whose
        entry [i, j] is the squared distance between sample i and center j.
    """
    # Broadcasting (N, 1, D) against (1, K, D) yields every pairwise offset.
    offsets = data[:, None] - cluster[None]
    return np.square(offsets).sum(axis=-1)
def k_means(data, k, tol=10):
    """Cluster ``data`` into ``k`` groups, plotting every iteration.

    Centers start as ``k`` distinct random samples and are moved to the
    per-dimension median of their members until at most ``tol`` samples
    change cluster between two consecutive assignments.

    Args:
        data: 2-D array with one sample per row.
        k: number of clusters.
        tol: maximum number of samples allowed to switch cluster for the
            run to count as converged. It is clamped to ``len(data) - 1`` so
            that small data sets still get at least one center update.

    Returns:
        Array with the ``k`` final cluster centers, one per row.
    """
    n_points = data.shape[0]
    # Fancy indexing copies, so updating the centers never touches ``data``.
    cluster = data[np.random.choice(n_points, k, replace=False)]
    print(f'init cluster: {cluster}')
    # -1 is never a valid cluster index, so the first assignment counts every
    # sample as changed (a zero-filled array would wrongly treat samples
    # assigned to cluster 0 as already converged).
    last_loc = np.full(n_points, -1)
    # Without the clamp a data set with <= tol samples would stop before the
    # first update and return the random initial centers.
    tol = min(tol, n_points - 1)
    step = 0
    plot_cluster(data, cls=None, cluster=cluster, title=f'step: {step}')
    while True:
        # Distance of every sample to every current center.
        d = distance(data, cluster)
        # Each sample joins the cluster of its nearest center.
        current_loc = np.argmin(d, axis=-1)
        # Converged once (almost) no sample switched cluster.
        changed = np.count_nonzero(last_loc != current_loc)
        if changed <= tol:
            break
        # Move every center to the median of its members.
        for clu in range(k):
            members = data[current_loc == clu]
            # The median of an empty selection is NaN; keep the previous
            # center when a cluster lost all of its samples.
            if len(members):
                cluster[clu] = np.median(members, axis=0)
        last_loc = current_loc
        step += 1
        plot_cluster(data, cls=current_loc, cluster=cluster, title=f'step: {step}')
        print(f'step: {step}')
    print(f'final cluster: {cluster}')
    return cluster
def wh_iou(wh1, wh2):
    """Return the pairwise IoU of boxes given only by (width, height).

    The intersection is ``min(w1, w2) * min(h1, h2)``, i.e. the two boxes are
    treated as sharing a corner.

    Args:
        wh1: array of shape (N, 2) holding (width, height) rows.
        wh2: array of shape (M, 2) holding (width, height) rows.

    Returns:
        Array of shape (N, M) with the IoU of every wh1/wh2 pair.
    """
    wh1 = wh1[:, None]  # (N, 1, 2)
    wh2 = wh2[None]  # (1, M, 2)
    inter = np.minimum(wh1, wh2).prod(2)  # (N, M)
    # Areas must be reduced over the last (w, h) axis. Reducing over axis 1
    # multiplies across boxes instead, which gives wrong values or fails to
    # broadcast against ``inter``.
    iou = inter / (wh1.prod(2) + wh2.prod(2) - inter)
    return iou
if __name__ == '__main__':
    # Demo data: 180 points whose x values are drawn from a normal
    # distribution centred on 1 and whose y values are centred on 3
    # (NumPy's default standard deviation is used for both).
    x1 = np.random.normal(loc=1, size=180)
    y1 = np.random.normal(loc=3, size=180)
    # Pair the draws up as (x, y) coordinates -> shape (180, 2).
    data = np.stack((x1, y1), axis=-1)
    k_means(data, k=3)
图像数据示例
图像数据的做法其实和上面的数据示例一样:先取出图像上每个标注框的宽和高,除以对应图片的宽和高,把标注框归一化;接着把每张图片宽、高中较大的一边resize到统一的尺寸(即传给网络训练的图片尺寸),较小的一边按相同比例缩放(因此不一定等于该统一尺寸);最后用缩放后的图片宽高乘以归一化后的标注框宽高,得到用于聚类的box宽高。这里的解释引用参考文章中所说:“假如在原始尺寸图片上进行聚类,比如1280*1280,得到聚类anchor为100*100,但是输入网络图片尺寸为640*640,那么对应的anchor就不适用了”
以下代码完全是照搬参考地址的博主的
import random
import numpy as np
import os
from tqdm import tqdm
from lxml import etree
class VOCDataSet(object):
    """Collects image sizes and box sizes from Pascal VOC XML annotations.

    The constructor resolves the annotation files listed in an image-set
    text file; ``get_info`` then parses them.
    """

    def __init__(self, voc_root, year="2012", txt_name: str = "train.txt"):
        """Build the list of annotation files to read.

        Args:
            voc_root: directory that contains the ``VOCdevkit`` folder.
            year: "2007" or "2012"; selects ``VOCdevkit/VOC{year}``.
            txt_name: file under ``ImageSets/Main`` listing one image id per line.

        Raises:
            AssertionError: if ``year`` is unsupported, the list file is
                missing or empty, or a listed annotation file does not exist.
        """
        assert year in ["2007", "2012"], "year must be in ['2007', '2012']"
        self.root = os.path.join(voc_root, "VOCdevkit", f"VOC{year}")
        self.annotations_root = os.path.join(self.root, "Annotations")

        # read train.txt or val.txt file
        txt_path = os.path.join(self.root, "ImageSets", "Main", txt_name)
        assert os.path.exists(txt_path), "not found {} file.".format(txt_name)

        # One annotation path per non-blank line of the list file.
        with open(txt_path) as read:
            self.xml_list = [os.path.join(self.annotations_root, line.strip() + ".xml")
                             for line in read.readlines() if len(line.strip()) > 0]

        # check file
        assert len(self.xml_list) > 0, "in '{}' file does not find any information.".format(txt_path)
        for xml_path in self.xml_list:
            assert os.path.exists(xml_path), "not found '{}' file.".format(xml_path)

    def __len__(self):
        """Return the number of annotation files."""
        return len(self.xml_list)

    def parse_xml_to_dict(self, xml):
        """Recursively convert an XML element into nested dictionaries.

        Modelled on TensorFlow's ``recursive_parse_xml_to_dict``.

        Args:
            xml: xml tree obtained by parsing XML file contents using lxml.etree

        Returns:
            Python dictionary holding XML contents. Leaf elements map their
            tag to their text; ``object`` children are gathered into a list
            because an annotation may contain several of them.
        """
        if len(xml) == 0:  # leaf element: return the tag's text directly
            return {xml.tag: xml.text}

        result = {}
        for child in xml:
            child_result = self.parse_xml_to_dict(child)  # recurse into the child
            if child.tag != 'object':
                result[child.tag] = child_result[child.tag]
            else:
                if child.tag not in result:  # several objects may exist, so keep a list
                    result[child.tag] = []
                result[child.tag].append(child_result[child.tag])
        return {xml.tag: result}

    def get_info(self):
        """Read every annotation and collect image and box sizes.

        Returns:
            Tuple ``(im_wh_list, boxes_wh_list)``. ``im_wh_list[i]`` is the
            ``[width, height]`` of image i; ``boxes_wh_list[i]`` is the list
            of its boxes' ``[width, height]`` divided by the image width and
            height. Images without any box are left out of both lists.
        """
        im_wh_list = []
        boxes_wh_list = []
        for xml_path in tqdm(self.xml_list, desc="read data info."):
            # read xml
            with open(xml_path) as fid:
                xml_str = fid.read()
            xml = etree.fromstring(xml_str)
            data = self.parse_xml_to_dict(xml)["annotation"]
            im_height = int(data["size"]["height"])
            im_width = int(data["size"]["width"])

            wh = []
            # parse_xml_to_dict only creates the "object" key when at least
            # one <object> exists, so fall back to an empty list instead of
            # raising KeyError on images without boxes.
            for obj in data.get("object", []):
                xmin = float(obj["bndbox"]["xmin"])
                xmax = float(obj["bndbox"]["xmax"])
                ymin = float(obj["bndbox"]["ymin"])
                ymax = float(obj["bndbox"]["ymax"])
                wh.append([(xmax - xmin) / im_width, (ymax - ymin) / im_height])

            if len(wh) == 0:
                continue

            im_wh_list.append([im_width, im_height])
            boxes_wh_list.append(wh)

        return im_wh_list, boxes_wh_list
def wh_iou(wh1, wh2):
    """Compute the N x M IoU matrix of two sets of (width, height) boxes.

    Boxes are compared by size only: the overlap of a pair is
    ``min(w1, w2) * min(h1, h2)``.

    Args:
        wh1: array of shape (N, 2).
        wh2: array of shape (M, 2).

    Returns:
        Array of shape (N, M); entry [i, j] is the IoU of wh1[i] and wh2[j].
    """
    first = wh1[:, None]  # [N,1,2]
    second = wh2[None]  # [1,M,2]
    overlap = np.minimum(first, second).prod(2)  # [N,M]
    # union = area1 + area2 - overlap
    union = first.prod(2) + second.prod(2) - overlap
    return overlap / union
def k_means(boxes, k, dist=np.median):
    """Cluster box sizes with k-means, using ``1 - IoU`` as the distance.

    refer: https://github.com/qqwweee/keras-yolo3/blob/master/kmeans.py

    Args:
        boxes: array of shape (N, 2) with the (width, height) of the boxes
            to cluster.
        k: number of clusters (anchors) to produce.
        dist: reduction used to update a cluster from its members; it is
            called as ``dist(members, axis=0)``. Defaults to the median.

    Returns:
        Array of shape (k, 2) with the final cluster (width, height) values.
    """
    box_number = boxes.shape[0]
    # -1 is never a valid cluster index, so the first assignment is always
    # treated as a change. A zero-filled array would stop immediately when
    # every box is nearest to cluster 0 (always the case for k=1) and return
    # the random initial boxes untouched.
    last_nearest = np.full((box_number,), -1)
    # np.random.seed(0)  # uncomment to make the initialisation reproducible
    # init k clusters from k distinct boxes (fancy indexing copies them)
    clusters = boxes[np.random.choice(box_number, k, replace=False)]
    while True:
        distances = 1 - wh_iou(boxes, clusters)
        current_nearest = np.argmin(distances, axis=1)
        if (last_nearest == current_nearest).all():
            break  # clusters won't change
        for cluster in range(k):
            members = boxes[current_nearest == cluster]
            # Reducing an empty selection yields NaN; keep the previous
            # value when a cluster has no members.
            if len(members) == 0:
                continue
            # update clusters
            clusters[cluster] = dist(members, axis=0)
        last_nearest = current_nearest
    return clusters
def anchor_fitness(k: np.ndarray, wh: np.ndarray, thr: float):  # mutation fitness
    """Score how well the anchors ``k`` match the box sizes ``wh``.

    For every box/anchor pair the width and height ratios are folded into
    (0, 1] with ``min(r, 1 / r)`` and the smaller of the two is kept; each
    box is then represented by its best-matching anchor.

    Args:
        k: anchors, array of shape (K, 2).
        wh: box sizes, array of shape (N, 2).
        thr: a box counts as matched when its best score is above ``thr``.

    Returns:
        Tuple ``(fitness, bpr)``: the mean best score over all boxes with
        unmatched boxes counted as 0, and the fraction of matched boxes
        (best possible recall).
    """
    ratio = wh[:, None] / k[None]
    score = np.minimum(ratio, 1. / ratio).min(2)  # ratio metric
    # score = wh_iou(wh, k)  # iou metric
    best = score.max(1)
    matched = (best > thr).astype(np.float32)
    fitness = (best * matched).mean()
    recall = matched.mean()
    return fitness, recall
def main(img_size=512, n=9, thr=0.25, gen=1000, voc_root=r"D:\data\voc"):
    """Compute anchors for a VOC data set with k-means plus random mutation.

    Args:
        img_size: size the longer side of every image is scaled to before
            box sizes are measured.
        n: number of anchors to produce.
        thr: threshold forwarded to ``anchor_fitness``.
        gen: number of mutation iterations.
        voc_root: directory containing ``VOCdevkit``; forwarded to
            ``VOCDataSet``.
    """
    # Read the wh of every image and the (normalised) wh of its boxes.
    dataset = VOCDataSet(voc_root=voc_root, year="2012", txt_name="train.txt")
    im_wh, boxes_wh = dataset.get_info()

    # Scale each image so that its longer side equals img_size.
    im_wh = np.array(im_wh, dtype=np.float32)
    shapes = img_size * im_wh / im_wh.max(1, keepdims=True)
    # Normalised box wh * scaled image wh -> box wh in pixels.
    wh0 = np.concatenate([np.array(rel_wh) * shape for shape, rel_wh in zip(shapes, boxes_wh)])  # wh

    # Filter out tiny objects
    i = (wh0 < 3.0).any(1).sum()
    if i:
        print(f'WARNING: Extremely small objects found. {i} of {len(wh0)} labels are < 3 pixels in size.')
    # Keeps a box when at least one of its sides is >= 2 pixels.
    wh = wh0[(wh0 >= 2.0).any(1)]

    # Kmeans calculation (1 - IoU distance).
    k = k_means(wh, n)

    # Sort by area.
    k = k[np.argsort(k.prod(1))]  # sort small to large
    f, bpr = anchor_fitness(k, wh, thr)
    print("kmeans: " + " ".join([f"[{int(a[0])}, {int(a[1])}]" for a in k]))
    print(f"fitness: {f:.5f}, best possible recall: {bpr:.5f}")

    # Evolve
    # Genetic algorithm: mutate the k-means result and keep improvements.
    npr = np.random
    sh, mp, s = k.shape, 0.9, 0.1  # anchor array shape, mutation prob, sigma
    pbar = tqdm(range(gen), desc='Evolving anchors with Genetic Algorithm:')  # progress bar
    for _ in pbar:
        v = np.ones(sh)
        while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * random.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)
        kg = (k.copy() * v).clip(min=2.0)
        fg, bpr_g = anchor_fitness(kg, wh, thr)
        if fg > f:
            # Update the recall only together with the accepted anchors, so
            # the value printed at the end belongs to the anchors reported
            # (it used to be overwritten by rejected candidates too).
            f, bpr, k = fg, bpr_g, kg.copy()
            pbar.desc = f'Evolving anchors with Genetic Algorithm: fitness = {f:.4f}'

    # Sort by area.
    k = k[np.argsort(k.prod(1))]  # sort small to large
    print("genetic: " + " ".join([f"[{int(a[0])}, {int(a[1])}]" for a in k]))
    print(f"fitness: {f:.5f}, best possible recall: {bpr:.5f}")


if __name__ == "__main__":
    main()
上述使用的是 1-IOU 作为距离,而非欧式距离,这个效果比较好。
另外代码中:anchor_fitness 方法是计算每个box与计算得到的k个anchor的适应度(fitness)和 best possible recall。具体做法是:把box的宽和高分别除以k个anchor对应的宽和高得到比值r,再取min(r, 1/r)(因为有些比值可能大于1;宽或高完全相等时该值等于1),并在宽、高两个值中取较小的一个,作为该box与该anchor的匹配程度;然后对每个box取其与k个anchor匹配程度中的最大值。适应度是把这些最大值中大于给定阈值thr的数值求和(不大于thr的记为0),再除以box总数取平均;best possible recall 的计算方式类似,只不过它统计的是最大值大于thr的box个数,再除以box总数。
def anchor_fitness(k: np.ndarray, wh: np.ndarray, thr: float):  # mutation fitness
    """Score how well the anchors ``k`` match the box sizes ``wh``.

    For every box/anchor pair the width and height ratios are folded into
    (0, 1] with ``min(r, 1 / r)`` and the smaller of the two is kept; each
    box is then represented by its best-matching anchor.

    Args:
        k: anchors, array of shape (K, 2).
        wh: box sizes, array of shape (N, 2).
        thr: a box counts as matched when its best score is above ``thr``.

    Returns:
        Tuple ``(fitness, bpr)``: the mean best score over all boxes with
        unmatched boxes counted as 0, and the fraction of matched boxes
        (best possible recall).
    """
    ratio = wh[:, None] / k[None]
    score = np.minimum(ratio, 1. / ratio).min(2)  # ratio metric
    # score = wh_iou(wh, k)  # iou metric
    best = score.max(1)
    matched = (best > thr).astype(np.float32)
    fitness = (best * matched).mean()
    recall = matched.mean()
    return fitness, recall
Yolov5中还对找到的anchor进行遗传突变。遗传突变的方法我没有完全明白具体是什么意思,但是从代码来看,大致意思是:生成一个与k个anchor形状相同的随机数组(数值被限制在一定范围内),用它乘以聚类得到的anchor,然后对乘积结果再计算anchor_fitness;如果其适应度大于上一次的适应度,就把乘以随机数得到的anchor更新为新的anchor。这就是遗传突变的方法。
# Evolve
# Genetic algorithm: mutate the k-means anchors and keep any improvement.
npr = np.random
# f: best fitness so far, sh: shape of the anchor array,
# mp: mutation probability, s: mutation sigma
f = anchor_fitness(k, wh, thr)[0]
sh = k.shape
mp, s = 0.9, 0.1
pbar = tqdm(range(gen), desc='Evolving anchors with Genetic Algorithm:')  # progress bar
for _ in pbar:
    v = np.ones(sh)
    while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
        v = ((npr.random(sh) < mp) * random.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)
    # Scale the current anchors by the mutation factors; no side may drop below 2.
    kg = (k * v).clip(min=2.0)
    # NOTE(review): bpr is reassigned for every candidate, accepted or not.
    fg, bpr = anchor_fitness(kg, wh, thr)
    if fg > f:
        f, k = fg, kg.copy()
        pbar.desc = f'Evolving anchors with Genetic Algorithm: fitness = {f:.4f}'
参考:
这里附带下维度的理解:
当我们对各种shape的数据沿某一维度进行操作(例如求和、求最值这类聚合操作)时,容易搞不清操作后结果的维度,其实只要记住:对哪个维度操作,哪个维度就消失,结果的维度就是没有被操作的那些维度。
比如一个数据的维度是(3,4,5),我对第0维进行操作,那结果的维度就是(4,5)。