算法原文:SSD: Single Shot MultiBox Detector
github链接:https://github.com/pierluigiferrari/ssd_keras#overview
以ssd7作为主程序一共如下几类:
- train_ssd7
- keras_ssd7
- keras_ssd_loss
- ssd_box_encode_decode_utils
- ssd_batch_generator
ssd_box_encode_decode_utils
代码主要作用:
- 对生成的default box进行IOU计算
- 对其进行非极大抑制
- 将其做成model对SSD进行输出
- 将其做成model提供给SSD主程序使用
def iou(boxes1, boxes2, coords='centroids'):
    """Compute the intersection-over-union (IoU) between two sets of boxes.

    Each argument is either one box (rank-1, 4 coordinates) or a rank-2 array
    of shape (n, 4). The coordinate columns of the two arguments are combined
    with ordinary NumPy broadcasting, so the arguments must either hold the
    same number of boxes or one of them must hold exactly one box.

    Args:
        boxes1: Array-like of shape (4,) or (n, 4).
        boxes2: Array-like of shape (4,) or (m, 4).
        coords: 'centroids' if the boxes are given as (cx, cy, w, h),
            'minmax' if they are given as (xmin, xmax, ymin, ymax).

    Returns:
        A 1-D array with the IoU of each (broadcast) pair of boxes. Pairs
        whose union area is zero (e.g. two zero-area boxes) yield 0.0
        instead of NaN.

    Raises:
        ValueError: If an argument has rank > 2, if the last axis does not
            hold exactly 4 coordinates, or if `coords` is not supported.
    """
    boxes1 = np.asarray(boxes1)
    boxes2 = np.asarray(boxes2)

    if len(boxes1.shape) > 2:
        raise ValueError("boxes1 must have rank either 1 or 2, but has rank {}.".format(len(boxes1.shape)))
    if len(boxes2.shape) > 2:
        raise ValueError("boxes2 must have rank either 1 or 2, but has rank {}.".format(len(boxes2.shape)))

    # Promote a single box to a batch of one so the column indexing below works.
    if len(boxes1.shape) == 1:
        boxes1 = np.expand_dims(boxes1, axis=0)
    if len(boxes2.shape) == 1:
        boxes2 = np.expand_dims(boxes2, axis=0)

    if not (boxes1.shape[1] == boxes2.shape[1] == 4):
        raise ValueError("It must be boxes1.shape[1] == boxes2.shape[1] == 4, but it is boxes1.shape[1] == {}, boxes2.shape[1] == {}.".format(boxes1.shape[1], boxes2.shape[1]))

    # All arithmetic below is done in (xmin, xmax, ymin, ymax) format.
    if coords == 'centroids':
        boxes1 = convert_coordinates(boxes1, start_index=0, conversion='centroids2minmax')
        boxes2 = convert_coordinates(boxes2, start_index=0, conversion='centroids2minmax')
    elif coords != 'minmax':
        raise ValueError("Unexpected value for `coords`. Supported values are 'minmax' and 'centroids'.")

    # Overlap along each axis, clamped at 0 for boxes that do not intersect.
    overlap_x = np.maximum(0, np.minimum(boxes1[:, 1], boxes2[:, 1]) - np.maximum(boxes1[:, 0], boxes2[:, 0]))
    overlap_y = np.maximum(0, np.minimum(boxes1[:, 3], boxes2[:, 3]) - np.maximum(boxes1[:, 2], boxes2[:, 2]))
    intersection = overlap_x * overlap_y

    area1 = (boxes1[:, 1] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 2])
    area2 = (boxes2[:, 1] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 2])
    union = area1 + area2 - intersection

    # A zero union would produce 0/0 = NaN, which downstream code treats as a
    # non-zero / maximal similarity. Report such pairs as "no overlap".
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = intersection / union
    return np.where(union == 0, 0.0, ratio)
def convert_coordinates(tensor, start_index, conversion='minmax2centroids'):
    """Convert box coordinates between (xmin, xmax, ymin, ymax) and (cx, cy, w, h).

    Args:
        tensor: NumPy array whose last axis contains the four box coordinates
            at positions `start_index` .. `start_index + 3`. Any other
            entries on the last axis are passed through unchanged.
        start_index: Index (on the last axis) of the first coordinate.
            A negative index works as long as all four offsets stay negative
            (e.g. -4 for "the last four entries").
        conversion: 'minmax2centroids' or 'centroids2minmax'.

    Returns:
        A float64 copy of `tensor` with the four coordinates converted.
        The input is not modified.

    Raises:
        ValueError: If `conversion` is not one of the two supported values.
    """
    ind = start_index
    source = np.asarray(tensor)
    # `np.float` was removed from NumPy; float64 is the type it aliased.
    converted = np.array(source, dtype=np.float64)

    if conversion == 'minmax2centroids':
        converted[..., ind] = (source[..., ind] + source[..., ind + 1]) / 2.0        # cx
        converted[..., ind + 1] = (source[..., ind + 2] + source[..., ind + 3]) / 2.0  # cy
        converted[..., ind + 2] = source[..., ind + 1] - source[..., ind]             # w
        converted[..., ind + 3] = source[..., ind + 3] - source[..., ind + 2]         # h
    elif conversion == 'centroids2minmax':
        converted[..., ind] = source[..., ind] - source[..., ind + 2] / 2.0          # xmin
        converted[..., ind + 1] = source[..., ind] + source[..., ind + 2] / 2.0      # xmax
        converted[..., ind + 2] = source[..., ind + 1] - source[..., ind + 3] / 2.0  # ymin
        converted[..., ind + 3] = source[..., ind + 1] + source[..., ind + 3] / 2.0  # ymax
    else:
        raise ValueError("Unexpected conversion value. Supported values are 'minmax2centroids' and 'centroids2minmax'.")

    return converted
def convert_coordinates2(tensor, start_index, conversion='minmax2centroids'):
    """Matrix-multiplication variant of `convert_coordinates`.

    Performs the same coordinate conversion by multiplying the four
    coordinates with a fixed 4x4 matrix. The original author notes that this
    form is not the one used in practice because the mostly-zero matrix
    product is wasted work.

    Args:
        tensor: Array whose last axis holds the four box coordinates at
            positions `start_index` .. `start_index + 3`.
        start_index: Index of the first coordinate on the last axis; may be
            negative (e.g. -4 for the last four entries).
        conversion: 'minmax2centroids' or 'centroids2minmax'.

    Returns:
        A float64 copy of `tensor` with the four coordinates converted.

    Raises:
        ValueError: If `conversion` is not one of the two supported values.
    """
    if conversion == 'minmax2centroids':
        # Columns produce (cx, cy, w, h) from a row (xmin, xmax, ymin, ymax).
        M = np.array([[0.5, 0. , -1.,  0.],
                      [0.5, 0. ,  1.,  0.],
                      [0. , 0.5,  0., -1.],
                      [0. , 0.5,  0.,  1.]])
    elif conversion == 'centroids2minmax':
        # Columns produce (xmin, xmax, ymin, ymax) from a row (cx, cy, w, h).
        M = np.array([[ 1. , 1. ,  0. , 0. ],
                      [ 0. , 0. ,  1. , 1. ],
                      [-0.5, 0.5,  0. , 0. ],
                      [ 0. , 0. , -0.5, 0.5]])
    else:
        raise ValueError("Unexpected conversion value. Supported values are 'minmax2centroids' and 'centroids2minmax'.")

    # `np.float` was removed from NumPy; float64 is the type it aliased.
    converted = np.array(tensor, dtype=np.float64)

    # Normalise a negative start index: the slice `-4:0` would be empty.
    ind = start_index
    if ind < 0:
        ind += converted.shape[-1]

    converted[..., ind:ind + 4] = np.dot(converted[..., ind:ind + 4], M)
    return converted
def greedy_nms(y_pred_decoded, iou_threshold=0.45, coords='minmax'):
    """Apply greedy non-maximum suppression (NMS) to each item of a batch.

    For every batch item the highest-scoring box is kept, all remaining boxes
    whose IoU with it exceeds `iou_threshold` are discarded, and the process
    repeats on what is left until no boxes remain.

    Args:
        y_pred_decoded (list): One 2-D array per batch item. Column 1 is the
            score used for ranking and columns 2 onward are the four box
            coordinates, i.e. rows look like
            [class_id, score, xmin, xmax, ymin, ymax].
        iou_threshold (float): Boxes with IoU greater than this value
            relative to an already kept box are removed.
        coords (str): Coordinate format of the boxes, passed on to `iou`
            ('minmax' or 'centroids').

    Returns:
        list: One array per batch item containing the kept boxes in order of
        decreasing score. An empty batch item yields an empty array with the
        same number of columns as the input.
    """
    y_pred_decoded_nms = []
    for batch_item in y_pred_decoded:
        boxes_left = np.copy(batch_item)  # work on a copy; the input is left untouched
        maxima = []
        while boxes_left.shape[0] > 0:
            maximum_index = np.argmax(boxes_left[:, 1])
            maximum_box = np.copy(boxes_left[maximum_index])
            maxima.append(maximum_box)
            boxes_left = np.delete(boxes_left, maximum_index, axis=0)
            if boxes_left.shape[0] == 0:
                break
            similarities = iou(boxes_left[:, 2:], maximum_box[2:], coords=coords)
            # Only boxes that do not overlap the kept box too much stay in the race.
            boxes_left = boxes_left[similarities <= iou_threshold]
        if maxima:
            y_pred_decoded_nms.append(np.array(maxima))
        else:
            # Nothing was selected, so `boxes_left` is still the empty input
            # copy; returning it keeps the (0, n_columns) shape.
            y_pred_decoded_nms.append(boxes_left)
    return y_pred_decoded_nms
def _greedy_nms(predictions, iou_threshold=0.45, coords='minmax'):
    """Greedy NMS for a single class, used by `decode_y()`.

    Same procedure as `greedy_nms`, but for one 2-D array whose column 0 is
    the score and whose columns 1 onward are the four box coordinates.

    Returns:
        Array of the kept rows, ordered by decreasing score.
    """
    candidates = np.copy(predictions)
    kept = []
    while candidates.shape[0] > 0:
        top = np.argmax(candidates[:, 0])
        winner = candidates[top].copy()
        kept.append(winner)

        # Remove the winner's row from the candidate pool.
        row_mask = np.ones(candidates.shape[0], dtype=bool)
        row_mask[top] = False
        candidates = candidates[row_mask]

        if candidates.shape[0] > 0:
            overlaps = iou(candidates[:, 1:], winner[1:], coords=coords)
            candidates = candidates[overlaps <= iou_threshold]
    return np.array(kept)
def _greedy_nms2(predictions, iou_threshold=0.45, coords='minmax'):
    """Greedy NMS over all boxes of one image, used by `decode_y2()`.

    Args:
        predictions: 2-D array; column 1 is the score used for ranking and
            columns 2 onward are the four box coordinates.
        iou_threshold (float): Boxes with IoU greater than this value
            relative to an already kept box are removed.
        coords (str): Coordinate format passed on to `iou`.

    Returns:
        Array of the kept rows, ordered by decreasing score. Empty input is
        returned as an empty array with its column dimension preserved.
    """
    boxes_left = np.copy(predictions)
    maxima = []
    while boxes_left.shape[0] > 0:
        maximum_index = np.argmax(boxes_left[:, 1])
        maximum_box = np.copy(boxes_left[maximum_index])
        maxima.append(maximum_box)
        boxes_left = np.delete(boxes_left, maximum_index, axis=0)
        if boxes_left.shape[0] == 0:
            break
        similarities = iou(boxes_left[:, 2:], maximum_box[2:], coords=coords)
        boxes_left = boxes_left[similarities <= iou_threshold]
    if not maxima:
        # The loop never ran, so `boxes_left` is still the empty input copy;
        # `np.array([])` would collapse it to shape (0,).
        return boxes_left
    return np.array(maxima)
# Converts the raw model output into a list that keeps only the useful boxes.
# Two stages: (1) decode the predicted offsets into box coordinates and apply
# the confidence threshold, (2) run per-class non-maximum suppression.
def decode_y(y_pred,
             confidence_thresh=0.01,
             iou_threshold=0.45,
             top_k=200,
             input_coords='centroids',
             normalize_coords=False,
             img_height=None,
             img_width=None):
    """Decode SSD predictions into per-image lists of detected boxes.

    Args:
        y_pred: Array of shape (batch_size, #boxes, #classes + 4 + 4 + 4).
            The last axis holds the class confidences (index 0 is treated as
            background and skipped), the 4 predicted box offsets, the 4
            anchor box coordinates and the 4 variances, in that order.
        confidence_thresh (float): A box is considered for a class only if
            its confidence for that class is strictly greater than this.
        iou_threshold (float): IoU threshold passed to the per-class NMS.
        top_k (int or 'all'): Maximum number of boxes kept per image after
            NMS (the highest-scoring ones); 'all' keeps everything.
        input_coords (str): Format in which the model predicts boxes,
            'centroids' for (cx, cy, w, h) or 'minmax' for
            (xmin, xmax, ymin, ymax).
        normalize_coords (bool): If True, the decoded coordinates are
            multiplied by `img_width` / `img_height` to turn relative
            coordinates into absolute ones.
        img_height, img_width: Image size; required if `normalize_coords`
            is True.

    Returns:
        list: One array per batch item of shape (k, 6), each row being
        [class_id, confidence, xmin, xmax, ymin, ymax]. If no box passes the
        confidence threshold the array has shape (0, 6).

    Raises:
        ValueError: If `normalize_coords` is True but the image size is
            missing, or if `input_coords` is not supported.
    """
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))

    # 1: Turn the predicted offsets into box coordinates.
    # Keep only classes + 4 offsets; anchors and variances are read from `y_pred`.
    y_pred_decoded_raw = np.copy(y_pred[:, :, :-8])

    if input_coords == 'centroids':
        # w, h: exp(offset * variance) * anchor size
        y_pred_decoded_raw[:, :, [-2, -1]] = np.exp(y_pred_decoded_raw[:, :, [-2, -1]] * y_pred[:, :, [-2, -1]])
        y_pred_decoded_raw[:, :, [-2, -1]] *= y_pred[:, :, [-6, -5]]
        # cx, cy: offset * variance * anchor size + anchor centre
        y_pred_decoded_raw[:, :, [-4, -3]] *= y_pred[:, :, [-4, -3]] * y_pred[:, :, [-6, -5]]
        y_pred_decoded_raw[:, :, [-4, -3]] += y_pred[:, :, [-8, -7]]
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2minmax')
    elif input_coords == 'minmax':
        # offset * variance * anchor extent + anchor coordinate
        y_pred_decoded_raw[:, :, -4:] *= y_pred[:, :, -4:]
        y_pred_decoded_raw[:, :, [-4, -3]] *= np.expand_dims(y_pred[:, :, -7] - y_pred[:, :, -8], axis=-1)
        y_pred_decoded_raw[:, :, [-2, -1]] *= np.expand_dims(y_pred[:, :, -5] - y_pred[:, :, -6], axis=-1)
        y_pred_decoded_raw[:, :, -4:] += y_pred[:, :, -8:-4]
    else:
        raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax' and 'centroids'.")

    # 2: Convert relative coordinates back to absolute ones if requested.
    if normalize_coords:
        y_pred_decoded_raw[:, :, -4:-2] *= img_width   # xmin, xmax
        y_pred_decoded_raw[:, :, -2:] *= img_height    # ymin, ymax

    # 3: Per-class confidence thresholding and NMS.
    n_classes = y_pred_decoded_raw.shape[-1] - 4
    y_pred_decoded = []
    for batch_item in y_pred_decoded_raw:  # batch_item: (n_boxes, n_classes + 4)
        pred = []
        for class_id in range(1, n_classes):  # class 0 (background) is skipped
            # (n_boxes, 5): this class's confidence plus the four coordinates
            single_class = batch_item[:, [class_id, -4, -3, -2, -1]]
            threshold_met = single_class[single_class[:, 0] > confidence_thresh]
            if threshold_met.shape[0] > 0:
                maxima = _greedy_nms(threshold_met, iou_threshold=iou_threshold, coords='minmax')
                # Prepend a column holding the class id.
                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1))
                maxima_output[:, 0] = class_id
                maxima_output[:, 1:] = maxima
                pred.append(maxima_output)

        if pred:
            pred = np.concatenate(pred, axis=0)
            if top_k != 'all' and pred.shape[0] > top_k:
                # argpartition puts the top_k largest confidences in the tail (unsorted).
                top_k_indices = np.argpartition(pred[:, 1], kth=pred.shape[0] - top_k, axis=0)[pred.shape[0] - top_k:]
                pred = pred[top_k_indices]
        else:
            # np.concatenate raises on an empty list; report "no detections"
            # with the same column layout as a non-empty result.
            pred = np.zeros((0, 6))
        y_pred_decoded.append(pred)

    return y_pred_decoded
def decode_y2(y_pred,
              confidence_thresh=0.5,
              iou_threshold=0.45,
              top_k='all',
              input_coords='centroids',
              normalize_coords=False,
              img_height=None,
              img_width=None):
    """Decode SSD predictions using one global NMS pass per image.

    Unlike `decode_y`, each box is first reduced to its single
    highest-confidence class; background boxes and boxes below the
    confidence threshold are dropped, and NMS is then run once over all
    remaining boxes of the image instead of once per class.

    Args:
        y_pred: Array of shape (batch_size, #boxes, #classes + 4 + 4 + 4)
            laid out as class confidences, 4 predicted offsets, 4 anchor
            coordinates, 4 variances.
        confidence_thresh (float): Boxes whose best class confidence is
            below this value are dropped.
        iou_threshold (float): IoU threshold for NMS; a falsy value
            disables NMS.
        top_k (int or 'all'): Maximum number of boxes kept per image after
            NMS; 'all' keeps everything.
        input_coords (str): 'centroids' or 'minmax', the format in which the
            model predicts boxes.
        normalize_coords (bool): If True, decoded coordinates are scaled by
            `img_width` / `img_height`.
        img_height, img_width: Image size; required if `normalize_coords`
            is True.

    Returns:
        list: One array per batch item of shape (k, 6) with rows
        [class_id, confidence, xmin, xmax, ymin, ymax]; (0, 6) if nothing
        is left.

    Raises:
        ValueError: If `normalize_coords` is True but the image size is
            missing, or if `input_coords` is not supported.
    """
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))

    # Take the 4 offsets plus the two entries in front of them; those two
    # slots are overwritten with the winning class id and its confidence.
    y_pred_converted = np.copy(y_pred[:, :, -14:-8])
    y_pred_converted[:, :, 0] = np.argmax(y_pred[:, :, :-12], axis=-1)
    y_pred_converted[:, :, 1] = np.amax(y_pred[:, :, :-12], axis=-1)

    # Turn the predicted offsets into box coordinates.
    if input_coords == 'centroids':
        y_pred_converted[:, :, [4, 5]] = np.exp(y_pred_converted[:, :, [4, 5]] * y_pred[:, :, [-2, -1]])
        y_pred_converted[:, :, [4, 5]] *= y_pred[:, :, [-6, -5]]
        y_pred_converted[:, :, [2, 3]] *= y_pred[:, :, [-4, -3]] * y_pred[:, :, [-6, -5]]
        y_pred_converted[:, :, [2, 3]] += y_pred[:, :, [-8, -7]]
        y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='centroids2minmax')
    elif input_coords == 'minmax':
        y_pred_converted[:, :, 2:] *= y_pred[:, :, -4:]
        y_pred_converted[:, :, [2, 3]] *= np.expand_dims(y_pred[:, :, -7] - y_pred[:, :, -8], axis=-1)
        y_pred_converted[:, :, [4, 5]] *= np.expand_dims(y_pred[:, :, -5] - y_pred[:, :, -6], axis=-1)
        y_pred_converted[:, :, 2:] += y_pred[:, :, -8:-4]
    else:
        # The parameter is called `input_coords`; match decode_y's message.
        raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax' and 'centroids'.")

    # Convert relative coordinates back to absolute ones if requested.
    if normalize_coords:
        y_pred_converted[:, :, 2:4] *= img_width
        y_pred_converted[:, :, 4:] *= img_height

    # Filter each image's boxes and collect the survivors.
    y_pred_decoded = []
    for batch_item in y_pred_converted:
        boxes = batch_item[np.nonzero(batch_item[:, 0])]   # drop class 0 (background)
        boxes = boxes[boxes[:, 1] >= confidence_thresh]    # drop low-confidence boxes
        # Skip NMS when nothing is left so the (0, 6) shape is preserved.
        if iou_threshold and boxes.shape[0] > 0:
            boxes = _greedy_nms2(boxes, iou_threshold=iou_threshold, coords='minmax')
        if top_k != 'all' and boxes.shape[0] > top_k:
            # argpartition puts the top_k largest confidences in the tail (unsorted).
            top_k_indices = np.argpartition(boxes[:, 1], kth=boxes.shape[0] - top_k, axis=0)[boxes.shape[0] - top_k:]
            boxes = boxes[top_k_indices]
        y_pred_decoded.append(boxes)

    return y_pred_decoded
class SSDBoxEncoder:
    """Encode ground-truth labels into the tensor format used to train SSD.

    The encoder generates the anchor boxes for every predictor layer, matches
    ground-truth boxes to anchors by IoU and writes class one-hot vectors and
    box offsets into a tensor of shape (batch_size, #boxes, #classes + 12).
    """

    def __init__(self,
                 img_height,
                 img_width,
                 n_classes,
                 predictor_sizes,
                 min_scale=0.1,
                 max_scale=0.9,
                 scales=None,
                 aspect_ratios_global=[0.5, 1.0, 2.0],
                 aspect_ratios_per_layer=None,
                 two_boxes_for_ar1=True,
                 limit_boxes=True,
                 variances=[1.0, 1.0, 1.0, 1.0],
                 pos_iou_threshold=0.5,
                 neg_iou_threshold=0.3,
                 coords='centroids',
                 normalize_coords=False):
        """Validate and store the encoder configuration.

        Args:
            img_height (int): Height of the input images.
            img_width (int): Width of the input images.
            n_classes (int): Number of classes; used as the length of the
                one-hot class vectors (index 0 is the background class).
            predictor_sizes (list): (height, width) of each predictor
                layer's output; a single pair is also accepted.
            min_scale (float): Scale of the smallest anchors, as a fraction
                of the shorter image side. Ignored if `scales` is given.
            max_scale (float): Scale of the largest anchors. Ignored if
                `scales` is given.
            scales (list or array, optional): One scale per predictor layer
                plus one extra; overrides `min_scale` / `max_scale`.
            aspect_ratios_global (list or array): Aspect ratios used for all
                layers when `aspect_ratios_per_layer` is not given.
            aspect_ratios_per_layer (list, optional): One list of aspect
                ratios per predictor layer; takes precedence over
                `aspect_ratios_global`.
            two_boxes_for_ar1 (bool): If True, aspect ratio 1 yields a
                second, larger box whose scale is the geometric mean of this
                layer's and the next layer's scale.
            limit_boxes (bool): If True, anchor coordinates are limited to
                the image boundaries.
            variances (list): Four positive values by which the encoded box
                offsets are divided.
            pos_iou_threshold (float): Minimum IoU for an anchor to be
                matched to a ground-truth box.
            neg_iou_threshold (float): Anchors whose IoU with any
                ground-truth box reaches this value are not labelled as
                background.
            coords (str): 'centroids' for (cx, cy, w, h) or 'minmax' for
                (xmin, xmax, ymin, ymax).
            normalize_coords (bool): If True, coordinates are divided by the
                image size.

        Raises:
            ValueError: If any argument fails the checks below.

        Note:
            The list defaults are never mutated; they are converted to new
            NumPy arrays before being stored.
        """
        predictor_sizes = np.array(predictor_sizes)
        if len(predictor_sizes.shape) == 1:
            # A single (height, width) pair: promote to a list of one layer.
            predictor_sizes = np.expand_dims(predictor_sizes, axis=0)

        if (min_scale is None or max_scale is None) and scales is None:
            raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")

        if self._has_values(scales):
            if (len(scales) != len(predictor_sizes)+1):
                raise ValueError("It must be either scales is None or len(scales) == len(predictor_sizes)+1, but len(scales) == {} and len(predictor_sizes)+1 == {}".format(len(scales), len(predictor_sizes)+1))
            scales = np.array(scales)
            if np.any(scales <= 0):
                raise ValueError("All values in `scales` must be greater than 0, but the passed list of scales is {}".format(scales))
        else:
            if not 0 < min_scale <= max_scale:
                raise ValueError("It must be 0 < min_scale <= max_scale, but it is min_scale = {} and max_scale = {}".format(min_scale, max_scale))

        use_per_layer = self._has_values(aspect_ratios_per_layer)
        if use_per_layer:
            if (len(aspect_ratios_per_layer) != len(predictor_sizes)):
                raise ValueError("It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == len(predictor_sizes), but len(aspect_ratios_per_layer) == {} and len(predictor_sizes) == {}".format(len(aspect_ratios_per_layer), len(predictor_sizes)))
            for aspect_ratios in aspect_ratios_per_layer:
                if np.any(np.array(aspect_ratios) <= 0):
                    raise ValueError("All aspect ratios must be greater than zero.")
        else:
            if not self._has_values(aspect_ratios_global):
                raise ValueError("At least one of `aspect_ratios_global` and `aspect_ratios_per_layer` cannot be `None`.")
            aspect_ratios_global = np.array(aspect_ratios_global)
            if np.any(aspect_ratios_global <= 0):
                raise ValueError("All aspect ratios must be greater than zero.")

        if len(variances) != 4:
            raise ValueError("4 variance values must be pased, but {} values were received.".format(len(variances)))
        variances = np.array(variances)
        if np.any(variances <= 0):
            raise ValueError("All variances must be >0, but the variances given are {}".format(variances))

        if neg_iou_threshold > pos_iou_threshold:
            raise ValueError("It cannot be `neg_iou_threshold > pos_iou_threshold`.")

        if not (coords == 'minmax' or coords == 'centroids'):
            raise ValueError("Unexpected value for `coords`. Supported values are 'minmax' and 'centroids'.")

        self.img_height = img_height
        self.img_width = img_width
        self.n_classes = n_classes
        self.predictor_sizes = predictor_sizes
        self.min_scale = min_scale
        self.max_scale = max_scale
        self.scales = scales
        self.aspect_ratios_global = aspect_ratios_global
        self.aspect_ratios_per_layer = aspect_ratios_per_layer
        self.two_boxes_for_ar1 = two_boxes_for_ar1
        self.limit_boxes = limit_boxes
        self.variances = variances
        self.pos_iou_threshold = pos_iou_threshold
        self.neg_iou_threshold = neg_iou_threshold
        self.coords = coords
        self.normalize_coords = normalize_coords

        # Number of anchor boxes per cell: a list (one entry per layer) when
        # per-layer aspect ratios are used, a single int otherwise.
        if use_per_layer:
            self.n_boxes = []
            for aspect_ratios in aspect_ratios_per_layer:
                if (1 in aspect_ratios) and two_boxes_for_ar1:
                    self.n_boxes.append(len(aspect_ratios) + 1)
                else:
                    self.n_boxes.append(len(aspect_ratios))
        else:
            if (1 in aspect_ratios_global) and two_boxes_for_ar1:
                self.n_boxes = len(aspect_ratios_global) + 1
            else:
                self.n_boxes = len(aspect_ratios_global)

    @staticmethod
    def _has_values(seq):
        """Return True if `seq` is neither None nor empty (safe for NumPy arrays)."""
        return seq is not None and len(seq) > 0

    # Anchor (default) box generation
    def generate_anchor_boxes(self,
                              batch_size,
                              feature_map_size,
                              aspect_ratios,
                              this_scale,
                              next_scale,
                              diagnostics=False):
        """Generate the anchor boxes for one predictor layer.

        Args:
            batch_size (int): Number of times the anchor tensor is repeated
                along the first axis.
            feature_map_size (tuple): (height, width) of the layer's output.
            aspect_ratios (list): Aspect ratios for which boxes are created.
            this_scale (float): Box scale for this layer, as a fraction of
                the shorter image side.
            next_scale (float): Scale of the next layer; only used for the
                extra aspect-ratio-1 box when `two_boxes_for_ar1` is True.
            diagnostics (bool): If True, additionally return the box sizes
                and the cell size.

        Returns:
            Array of shape
            (batch_size, feature_map_height * feature_map_width * n_boxes, 4)
            holding (cx, cy, w, h) if `self.coords == 'centroids'`, otherwise
            (xmin, xmax, ymin, ymax). With `diagnostics=True` a tuple
            (boxes, wh_list, (cell_height, cell_width)) is returned, where
            `wh_list` has one (w, h) row per box type and the cell size is
            truncated to ints.
        """
        aspect_ratios = np.sort(aspect_ratios)
        size = min(self.img_height, self.img_width)  # scales refer to the shorter side

        # Width and height of every box type of this layer.
        wh_list = []
        for ar in aspect_ratios:
            w = this_scale * size * np.sqrt(ar)
            h = this_scale * size / np.sqrt(ar)
            wh_list.append((w, h))
            if ar == 1 and self.two_boxes_for_ar1:
                # Second, slightly larger box using the geometric mean of
                # this and the next layer's scale.
                w = np.sqrt(this_scale * next_scale) * size * np.sqrt(ar)
                h = np.sqrt(this_scale * next_scale) * size / np.sqrt(ar)
                wh_list.append((w, h))
        n_boxes = len(wh_list)
        wh_list = np.array(wh_list)

        # Grid of box centres: one per feature-map cell, centred in the cell.
        cell_height = self.img_height / feature_map_size[0]
        cell_width = self.img_width / feature_map_size[1]
        cx = np.linspace(cell_width/2, self.img_width-cell_width/2, feature_map_size[1])
        cy = np.linspace(cell_height/2, self.img_height-cell_height/2, feature_map_size[0])
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(cx_grid, -1)  # needed so np.tile() can repeat along the box axis
        cy_grid = np.expand_dims(cy_grid, -1)

        # (feature_map_height, feature_map_width, n_boxes, 4) with (cx, cy, w, h)
        boxes_tensor = np.zeros((feature_map_size[0], feature_map_size[1], n_boxes, 4))
        boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes))
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes))
        boxes_tensor[:, :, :, 2] = wh_list[:, 0]
        boxes_tensor[:, :, :, 3] = wh_list[:, 1]

        # Clipping and normalisation are done in (xmin, xmax, ymin, ymax) format.
        boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2minmax')

        if self.limit_boxes:
            # Fancy indexing returns a copy, so the result is written back.
            x_coords = boxes_tensor[:, :, :, [0, 1]]
            x_coords[x_coords >= self.img_width] = self.img_width - 1
            x_coords[x_coords < 0] = 0
            boxes_tensor[:, :, :, [0, 1]] = x_coords
            y_coords = boxes_tensor[:, :, :, [2, 3]]
            y_coords[y_coords >= self.img_height] = self.img_height - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:, :, :, [2, 3]] = y_coords

        if self.normalize_coords:
            boxes_tensor[:, :, :, :2] /= self.img_width
            boxes_tensor[:, :, :, 2:] /= self.img_height

        if self.coords == 'centroids':
            boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='minmax2centroids')

        # Repeat for every batch item, then flatten the spatial and box axes:
        # (batch, feature_map_height * feature_map_width * n_boxes, 4)
        boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
        boxes_tensor = np.tile(boxes_tensor, (batch_size, 1, 1, 1, 1))
        boxes_tensor = np.reshape(boxes_tensor, (batch_size, -1, 4))

        if diagnostics:
            return boxes_tensor, wh_list, (int(cell_height), int(cell_width))
        else:
            return boxes_tensor

    def generate_encode_template(self, batch_size, diagnostics=False):
        """Build the template tensor that `encode_y` fills with ground truth.

        Args:
            batch_size (int): Number of images in the batch.
            diagnostics (bool): If True, also return the per-layer box sizes
                and cell sizes reported by `generate_anchor_boxes`.

        Returns:
            Array of shape (batch_size, #boxes, #classes + 12). The last axis
            holds an all-zero class vector, the anchor coordinates twice (the
            first copy is the slot later overwritten with ground-truth
            coordinates) and the four variances. With `diagnostics=True` a
            tuple (template, wh_list, cell_sizes) is returned.

        Note:
            If no scales were passed to the constructor, `self.scales` is set
            here to a linear interpolation from `min_scale` to `max_scale`.
        """
        # 1: One scale per predictor layer plus one extra for the last layer's larger box.
        if self.scales is None:
            self.scales = np.linspace(self.min_scale, self.max_scale, len(self.predictor_sizes)+1)

        # 2: Anchor boxes of every predictor layer, each (batch, n_boxes_layer, 4).
        # The diagnostics are always requested (they are cheap) and simply
        # dropped below when the caller did not ask for them.
        per_layer = self._has_values(self.aspect_ratios_per_layer)
        boxes_tensor = []
        wh_list = []     # box widths and heights per layer
        cell_sizes = []  # (cell_height, cell_width) per layer
        for i, feature_map_size in enumerate(self.predictor_sizes):
            aspect_ratios = self.aspect_ratios_per_layer[i] if per_layer else self.aspect_ratios_global
            boxes, wh, cells = self.generate_anchor_boxes(batch_size=batch_size,
                                                          feature_map_size=feature_map_size,
                                                          aspect_ratios=aspect_ratios,
                                                          this_scale=self.scales[i],
                                                          next_scale=self.scales[i+1],
                                                          diagnostics=True)
            boxes_tensor.append(boxes)
            wh_list.append(wh)
            cell_sizes.append(cells)

        boxes_tensor = np.concatenate(boxes_tensor, axis=1)  # all layers: (batch, #boxes, 4)

        # 3: All-zero one-hot class slots, (batch, #boxes, #classes).
        classes_tensor = np.zeros((batch_size, boxes_tensor.shape[1], self.n_classes))

        # 4: The four variances, broadcast to every box.
        variances_tensor = np.zeros_like(boxes_tensor)
        variances_tensor += self.variances

        # 5: classes | ground-truth slot (initialised with anchors) | anchors | variances
        y_encode_template = np.concatenate((classes_tensor, boxes_tensor, boxes_tensor, variances_tensor), axis=2)

        if diagnostics:
            return y_encode_template, wh_list, cell_sizes
        else:
            return y_encode_template

    def encode_y(self, ground_truth_labels):
        """Encode ground-truth boxes into the SSD training target tensor.

        Every ground-truth box is compared with all anchors by IoU. Anchors
        that are still unassigned and reach `pos_iou_threshold` receive the
        box's class and coordinates; if there is none, the single best
        matching available anchor is used instead. Anchors whose IoU with
        every ground-truth box stays below `neg_iou_threshold` are labelled
        as background (class 0).

        Args:
            ground_truth_labels (list): One 2-D array (or nested sequence)
                per image with one row (class_id, xmin, xmax, ymin, ymax) per
                ground-truth box. Class id 0 is background.

        Returns:
            Array of shape (batch_size, #boxes, #classes + 4 + 4 + 4): one-hot
            classes, encoded box offsets, anchor coordinates and variances.
        """
        # 1: Template with anchors and variances; ground truth is written into a copy.
        y_encode_template = self.generate_encode_template(batch_size=len(ground_truth_labels), diagnostics=False)
        y_encoded = np.copy(y_encode_template)

        # 2: Match ground-truth boxes to anchors.
        class_vector = np.eye(self.n_classes)  # row k is the one-hot vector of class k

        for i in range(y_encode_template.shape[0]):
            available_boxes = np.ones((y_encode_template.shape[1]))  # 1 = anchor not yet assigned
            negative_boxes = np.ones((y_encode_template.shape[1]))   # 1 = anchor is a background candidate
            for true_box in ground_truth_labels[i]:
                # Float copy; `np.float` was removed from NumPy, float64 is what it aliased.
                true_box = np.array(true_box, dtype=np.float64)
                # Skip bad ground truth: boxes whose width or height is (almost) zero or negative.
                if true_box[2] - true_box[1] < 0.001 or true_box[4] - true_box[3] < 0.001:
                    continue
                if self.normalize_coords:
                    true_box[1:3] /= self.img_width   # xmin, xmax -> [0, 1]
                    true_box[3:5] /= self.img_height  # ymin, ymax -> [0, 1]
                if self.coords == 'centroids':
                    true_box = convert_coordinates(true_box, start_index=1, conversion='minmax2centroids')
                similarities = iou(y_encode_template[i, :, -12:-8], true_box[1:], coords=self.coords)
                # Anchors that overlap a ground-truth box this much are not background.
                negative_boxes[similarities >= self.neg_iou_threshold] = 0
                similarities *= available_boxes  # zero out anchors that are already assigned
                available_and_thresh_met = np.copy(similarities)
                available_and_thresh_met[available_and_thresh_met < self.pos_iou_threshold] = 0
                assign_indices = np.nonzero(available_and_thresh_met)[0]
                encoded_row = np.concatenate((class_vector[int(true_box[0])], true_box[1:]), axis=0)
                if len(assign_indices) > 0:
                    # Write class and coordinates to every matched anchor.
                    y_encoded[i, assign_indices, :-8] = encoded_row
                    available_boxes[assign_indices] = 0
                else:
                    # No anchor reaches the threshold: fall back to the best available one.
                    best_match_index = np.argmax(similarities)
                    y_encoded[i, best_match_index, :-8] = encoded_row
                    available_boxes[best_match_index] = 0
                    negative_boxes[best_match_index] = 0
            # Every remaining negative anchor becomes background.
            background_class_indices = np.nonzero(negative_boxes)[0]
            y_encoded[i, background_class_indices, 0] = 1

        # 3: Convert absolute box coordinates to offsets relative to the anchors.
        if self.coords == 'centroids':
            y_encoded[:, :, [-12, -11]] -= y_encode_template[:, :, [-12, -11]]  # cx, cy minus anchor centre
            y_encoded[:, :, [-12, -11]] /= y_encode_template[:, :, [-10, -9]] * y_encode_template[:, :, [-4, -3]]  # / (anchor size * variance)
            y_encoded[:, :, [-10, -9]] /= y_encode_template[:, :, [-10, -9]]  # w, h relative to anchor size
            y_encoded[:, :, [-10, -9]] = np.log(y_encoded[:, :, [-10, -9]]) / y_encode_template[:, :, [-2, -1]]  # log, / variance
        else:
            y_encoded[:, :, -12:-8] -= y_encode_template[:, :, -12:-8]
            y_encoded[:, :, [-12, -11]] /= np.expand_dims(y_encode_template[:, :, -11] - y_encode_template[:, :, -12], axis=-1)  # / anchor width
            y_encoded[:, :, [-10, -9]] /= np.expand_dims(y_encode_template[:, :, -9] - y_encode_template[:, :, -10], axis=-1)    # / anchor height
            y_encoded[:, :, -12:-8] /= y_encode_template[:, :, -4:]  # / variances

        return y_encoded