1 CenterNet目标检测平台
CenterNet与其他目标检测方法不同的地方在于不需要先验框,根据类别确定真实框在特征图上对应的网格点,再结合预测的偏移得到预测中心点,预测中心点结合预测的宽高得到预测框。
2 Backbone
a. 结构
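主干网络下采样得到最后一层特征图(原文记作P4;对应下文ResNet50第5阶段的输出,输入为512×512时是16×16×2048),然后用三次转置卷积上采样到128×128×64,再接三个预测头:
hm(cls,类别热力图): [b,128,128,num_classes]
reg(中心点偏移): [b,128,128,2]
wh(宽高): [b,128,128,2]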
b.主干网络
#-------------------------------------------------------------#
# ResNet50的网络部分
#-------------------------------------------------------------#
import numpy as np
import tensorflow.keras.backend as K
from tensorflow.keras import layers
from tensorflow.keras.layers import (Activation, AveragePooling2D,
BatchNormalization, Conv2D,
Conv2DTranspose, Dense, Dropout, Flatten,
Input, MaxPooling2D, ZeroPadding2D)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.regularizers import l2
def identity_block(input_tensor, kernel_size, filters, stage, block):
    """Bottleneck residual block whose shortcut is the unmodified input.

    The main branch is 1x1 conv -> BN -> ReLU -> ``kernel_size`` conv
    (padding 'same') -> BN -> ReLU -> 1x1 conv -> BN. ``input_tensor`` is
    then added to the branch output and a final ReLU is applied.

    Args:
        input_tensor: Tensor entering the block. It is added element-wise to
            the branch output, so it must be shape-compatible with it.
        kernel_size: Kernel size of the middle convolution.
        filters: Sequence of three ints, the output channels of the three
            convolutions in order.
        stage: Stage label, used only to build layer names.
        block: Block label (a string), used only to build layer names.

    Returns:
        The output tensor of the block.
    """
    n_first, n_middle, n_last = filters

    # Layer names follow the 'res<stage><block>_branch<id>' /
    # 'bn<stage><block>_branch<id>' scheme.
    suffix = str(stage) + block + '_branch'
    conv_prefix = 'res' + suffix
    bn_prefix = 'bn' + suffix

    branch = Conv2D(n_first, (1, 1), name=conv_prefix + '2a', use_bias=False)(input_tensor)
    branch = BatchNormalization(name=bn_prefix + '2a')(branch)
    branch = Activation('relu')(branch)

    branch = Conv2D(n_middle, kernel_size, padding='same', name=conv_prefix + '2b', use_bias=False)(branch)
    branch = BatchNormalization(name=bn_prefix + '2b')(branch)
    branch = Activation('relu')(branch)

    branch = Conv2D(n_last, (1, 1), name=conv_prefix + '2c', use_bias=False)(branch)
    branch = BatchNormalization(name=bn_prefix + '2c')(branch)

    merged = layers.add([branch, input_tensor])
    return Activation('relu')(merged)
def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)):
    """Bottleneck residual block with a projection (1x1 conv) shortcut.

    The main branch is 1x1 conv (with ``strides``) -> BN -> ReLU ->
    ``kernel_size`` conv (padding 'same') -> BN -> ReLU -> 1x1 conv -> BN.
    The shortcut is a 1x1 conv with the same ``strides`` and ``filters[2]``
    output channels followed by BN. Both are added and passed through ReLU.

    Args:
        input_tensor: Tensor entering the block.
        kernel_size: Kernel size of the middle convolution.
        filters: Sequence of three ints, the output channels of the three
            main-branch convolutions in order.
        stage: Stage label, used only to build layer names.
        block: Block label (a string), used only to build layer names.
        strides: Strides of the first main-branch convolution and of the
            shortcut convolution.

    Returns:
        The output tensor of the block.
    """
    n_first, n_middle, n_last = filters

    suffix = str(stage) + block + '_branch'
    conv_prefix = 'res' + suffix
    bn_prefix = 'bn' + suffix

    # Main branch (created before the shortcut, as in the original, so the
    # layer construction order is unchanged).
    branch = Conv2D(n_first, (1, 1), strides=strides,
                    name=conv_prefix + '2a', use_bias=False)(input_tensor)
    branch = BatchNormalization(name=bn_prefix + '2a')(branch)
    branch = Activation('relu')(branch)

    branch = Conv2D(n_middle, kernel_size, padding='same',
                    name=conv_prefix + '2b', use_bias=False)(branch)
    branch = BatchNormalization(name=bn_prefix + '2b')(branch)
    branch = Activation('relu')(branch)

    branch = Conv2D(n_last, (1, 1), name=conv_prefix + '2c', use_bias=False)(branch)
    branch = BatchNormalization(name=bn_prefix + '2c')(branch)

    # Projection shortcut so the two operands of the addition match.
    projected = Conv2D(n_last, (1, 1), strides=strides,
                       name=conv_prefix + '1', use_bias=False)(input_tensor)
    projected = BatchNormalization(name=bn_prefix + '1')(projected)

    merged = layers.add([branch, projected])
    return Activation('relu')(merged)
def ResNet50(inputs):
    """Build the ResNet50 feature extractor (no pooling/classifier head).

    Shape comments assume a 512x512x3 input, as in the original notes.

    Args:
        inputs: Input image tensor.

    Returns:
        The output tensor of the last stage-5 block
        (16x16x2048 for a 512x512x3 input).
    """
    # Stem: 512,512,3 -> 256,256,64 -> 128,128,64
    net = ZeroPadding2D((3, 3))(inputs)
    net = Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=False)(net)
    net = BatchNormalization(name='bn_conv1')(net)
    net = Activation('relu')(net)
    net = MaxPooling2D((3, 3), strides=(2, 2), padding="same")(net)

    # (stage, bottleneck filters, names of the identity blocks, strides of
    # the leading conv_block). Resulting shapes:
    #   stage 2: 128,128,256   stage 3: 64,64,512
    #   stage 4: 32,32,1024    stage 5: 16,16,2048
    stage_specs = (
        (2, [64, 64, 256], 'bc', (1, 1)),
        (3, [128, 128, 512], 'bcd', (2, 2)),
        (4, [256, 256, 1024], 'bcdef', (2, 2)),
        (5, [512, 512, 2048], 'bc', (2, 2)),
    )
    for stage, filters, identity_names, strides in stage_specs:
        net = conv_block(net, 3, filters, stage=stage, block='a', strides=strides)
        for block_name in identity_names:
            net = identity_block(net, 3, filters, stage=stage, block=block_name)
    return net
上采样
# Excerpt of the upsampling stage (the same statements appear at the start of
# centernet_head). `x` is not defined in this excerpt -- presumably the
# backbone output; confirm against the caller.
x = Dropout(rate=0.5)(x)
#-------------------------------#
#   Decoder
#-------------------------------#
num_filters = 256
# 16, 16, 2048 -> 32, 32, 256 -> 64, 64, 128 -> 128, 128, 64
for i in range(3):
    # Upsample by 2 with a transposed convolution; the channel count is
    # halved at every step (256, 128, 64).
    x = Conv2DTranspose(num_filters // pow(2, i), (4, 4), strides=2, use_bias=False, padding='same',
                        kernel_initializer='he_normal',
                        kernel_regularizer=l2(5e-4))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
预测结果
def centernet_head(x,num_classes):
    """Decode backbone features and attach the three CenterNet prediction heads.

    The input goes through dropout (rate 0.5) and three stride-2 transposed
    convolutions with 256, 128 and 64 filters (each followed by BN + ReLU).
    Three heads of identical structure (3x3 conv with 64 filters -> BN ->
    ReLU -> 1x1 conv) are then applied to the upsampled feature map.

    Args:
        x: Feature tensor to decode (the original comments assume
            16x16x2048, giving a 128x128x64 decoded map).
        num_classes: Number of output channels of the heatmap head.

    Returns:
        Tuple ``(y1, y2, y3)``: the heatmap head output (``num_classes``
        channels, sigmoid), the width/height head output (2 channels, no
        activation) and the centre-offset head output (2 channels, no
        activation).
    """
    def _prediction_head(features, out_channels, activation=None):
        # 3x3 conv -> BN -> ReLU -> 1x1 conv producing `out_channels` maps.
        hidden = Conv2D(64, 3, padding='same', use_bias=False, kernel_initializer='he_normal', kernel_regularizer=l2(5e-4))(features)
        hidden = BatchNormalization()(hidden)
        hidden = Activation('relu')(hidden)
        return Conv2D(out_channels, 1, kernel_initializer='he_normal', kernel_regularizer=l2(5e-4), activation=activation)(hidden)

    x = Dropout(rate=0.5)(x)

    #-------------------------------#
    #   Decoder
    #-------------------------------#
    num_filters = 256
    # 16, 16, 2048 -> 32, 32, 256 -> 64, 64, 128 -> 128, 128, 64
    for step_filters in (num_filters, num_filters // 2, num_filters // 4):
        x = Conv2DTranspose(step_filters, (4, 4), strides=2, use_bias=False, padding='same',
                            kernel_initializer='he_normal',
                            kernel_regularizer=l2(5e-4))(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)

    # Heads are built in the original order: heatmap, width/height, offset.
    y1 = _prediction_head(x, num_classes, activation='sigmoid')
    y2 = _prediction_head(x, 2)
    y3 = _prediction_head(x, 2)
    return y1, y2, y3
2.1 解码
a. 结构
nms:1.对hm做3×3、步长为1的最大池化;2.将hm与最大池化结果逐点对比,数值相等(即该点是局部最大值)就保留,不相等就置0。nms相当于先筛选一次候选中心点。
topk:1.nms(hm);2.hm.reshape([b,-1])展平;3.排序得到前K个scores和indices;4.由indices还原出class_ids、xs、ys,最终返回scores, indices, class_ids, xs, ys。
decode:根据scores, indices, class_ids, xs, ys -> 取出topk_reg、topk_wh -> 计算topk_cx、topk_cy -> 得到topk_x1, topk_y1, topk_x2, topk_y2 -> 拼接成detections。
b.代码
def nms(heat, kernel=3):
    """Keep only the local maxima of a heatmap.

    A ``kernel`` x ``kernel`` max pooling with stride 1 and 'SAME' padding is
    compared with the input; positions whose value equals the pooled value
    are kept, all others are set to zero.

    Args:
        heat: Heatmap tensor.
        kernel: Side length of the max-pooling window.

    Returns:
        A tensor shaped like ``heat`` with non-maximum positions zeroed.
    """
    pooled = MaxPooling2D((kernel, kernel), strides=1, padding='SAME')(heat)
    is_peak = tf.equal(pooled, heat)
    return tf.where(is_peak, heat, tf.zeros_like(heat))
def topk(hm, max_objects=100):
    """Select the ``max_objects`` highest-scoring heatmap peaks per image.

    Args:
        hm: Heatmap tensor indexed as (batch, row, column, class).
        max_objects: Number of peaks to keep per image.

    Returns:
        Tuple ``(scores, indices, class_ids, xs, ys)``, each of shape
        (batch, max_objects): the peak scores in descending order, the flat
        spatial index ``ys * w + xs``, the class channel, the column and the
        row of every peak.
    """
    # Suppress everything that is not a local maximum (3x3 max pooling).
    peaks = nms(hm)

    dims = tf.shape(peaks)
    b, w, c = dims[0], dims[2], dims[3]

    # Flatten to (b, h * w * c) and take the k best entries.
    flat = tf.reshape(peaks, (b, -1))
    scores, indices = tf.math.top_k(flat, k=max_objects, sorted=True)

    # The class channel varies fastest in the flattened layout, then the
    # column, then the row.
    class_ids = indices % c
    cell = indices // c
    xs = cell % w
    ys = cell // w
    indices = ys * w + xs
    return scores, indices, class_ids, xs, ys
def decode(hm, wh, reg, max_objects=100,num_classes=20):
    """Turn the three head outputs into box detections.

    Args:
        hm: Heatmap tensor indexed as (batch, row, column, class).
        wh: Width/height predictions; reshaped to (batch, -1, 2).
        reg: Centre-offset predictions; reshaped to (batch, -1, 2).
        max_objects: Number of detections kept per image.
        num_classes: Unused; kept for interface compatibility.

    Returns:
        Tensor of shape (batch, max_objects, 6) holding
        ``[x1, y1, x2, y2, score, class_id]`` per detection. Coordinates are
        expressed in heatmap grid cells (column/row index plus predicted
        offset), not in input-image pixels.
    """
    scores, indices, class_ids, xs, ys = topk(hm, max_objects=max_objects)
    # Batch size
    b = tf.shape(hm)[0]

    # (b, h * w, 2)
    reg = tf.reshape(reg, [b, -1, 2])
    # (b, h * w, 2)
    wh = tf.reshape(wh, [b, -1, 2])
    length = tf.shape(wh)[1]

    # Index of every selected cell in the fully flattened (b * h * w) layout.
    # tf.shape already returns int32, so no cast is needed (the original
    # tf.to_int32 call does not exist in TensorFlow 2).
    batch_idx = tf.expand_dims(tf.range(0, b), 1)
    batch_idx = tf.tile(batch_idx, (1, max_objects))
    full_indices = tf.reshape(batch_idx, [-1]) * length + tf.reshape(indices, [-1])

    # Gather the offset and size predictions of the top-k cells.
    topk_reg = tf.gather(tf.reshape(reg, [-1,2]), full_indices)
    topk_reg = tf.reshape(topk_reg, [b, -1, 2])

    topk_wh = tf.gather(tf.reshape(wh, [-1,2]), full_indices)
    topk_wh = tf.reshape(topk_wh, [b, -1, 2])

    # Refined centre = grid cell + predicted offset.
    topk_cx = tf.cast(tf.expand_dims(xs, axis=-1), tf.float32) + topk_reg[..., 0:1]
    topk_cy = tf.cast(tf.expand_dims(ys, axis=-1), tf.float32) + topk_reg[..., 1:2]

    # (b,k,1) (b,k,1)
    topk_x1, topk_y1 = topk_cx - topk_wh[..., 0:1] / 2, topk_cy - topk_wh[..., 1:2] / 2
    # (b,k,1) (b,k,1)
    topk_x2, topk_y2 = topk_cx + topk_wh[..., 0:1] / 2, topk_cy + topk_wh[..., 1:2] / 2
    # (b,k,1)
    scores = tf.expand_dims(scores, axis=-1)
    # (b,k,1)
    class_ids = tf.cast(tf.expand_dims(class_ids, axis=-1), tf.float32)

    # (b,k,6)
    detections = tf.concat([topk_x1, topk_y1, topk_x2, topk_y2, scores, class_ids], axis=-1)
    return detections
2.2 编码
a. 结构
box
↓
box(缩放到hm尺度)->ct,ct_int->dx,dy(=ct-ct_int)
box的宽高->高斯半径->在hm上绘制高斯热力图
b.代码
class Generator(object):
    """Endless batch generator producing augmented images and CenterNet targets.

    Each annotation line is ``"<image path> x1,y1,x2,y2,cls ..."``: the first
    whitespace-separated token is the image path, every following token is a
    comma-separated integer box.

    Relies on names defined elsewhere in the project: ``rand``, ``shuffle``,
    ``Image``, ``cv2``, ``math``, ``gaussian_radius``, ``draw_gaussian`` and
    ``preprocess_image``.
    """

    def __init__(self,batch_size,train_lines,val_lines,
                 input_size,num_classes,max_objects=100):
        """Store the configuration.

        Args:
            batch_size: Number of images per yielded batch.
            train_lines: Annotation lines used when ``generate(train=True)``.
            val_lines: Annotation lines used when ``generate(train=False)``.
            input_size: ``(height, width, channels)`` of the network input.
            num_classes: Number of heatmap channels.
            max_objects: Number of regression target slots per image.
        """
        self.batch_size = batch_size
        self.train_lines = train_lines
        self.val_lines = val_lines
        self.input_size = input_size
        # Targets live on a grid 4x smaller than the network input.
        self.output_size = (int(input_size[0]/4) , int(input_size[1]/4))
        self.num_classes = num_classes
        self.max_objects = max_objects

    def _new_batch(self):
        """Allocate zero-filled buffers for one batch.

        Returns:
            Tuple ``(images, hms, whs, regs, reg_masks, indices)`` of float32
            arrays.
        """
        out_h, out_w = self.output_size
        images = np.zeros((self.batch_size, self.input_size[0], self.input_size[1], self.input_size[2]), dtype=np.float32)
        hms = np.zeros((self.batch_size, out_h, out_w, self.num_classes), dtype=np.float32)
        whs = np.zeros((self.batch_size, self.max_objects, 2), dtype=np.float32)
        regs = np.zeros((self.batch_size, self.max_objects, 2), dtype=np.float32)
        reg_masks = np.zeros((self.batch_size, self.max_objects), dtype=np.float32)
        indices = np.zeros((self.batch_size, self.max_objects), dtype=np.float32)
        return images, hms, whs, regs, reg_masks, indices

    def get_random_data(self, annotation_line, input_shape, random=True, jitter=.3, hue=.1, sat=1.5, val=1.5, proc_img=True):
        """Load one annotated image and apply random augmentation.

        The image is randomly rescaled (with aspect-ratio jitter), pasted at a
        random position on a grey canvas, randomly flipped horizontally and
        colour-distorted in HSV space; the boxes are transformed accordingly.

        Args:
            annotation_line: Annotation line (see the class docstring).
            input_shape: ``(height, width)`` of the returned image.
            random: Unused; kept for interface compatibility.
            jitter: Strength of the aspect-ratio jitter.
            hue: Maximum hue shift, as a fraction of the full hue circle.
            sat: Maximum saturation scale factor (or its inverse).
            val: Maximum value scale factor (or its inverse).
            proc_img: Unused; kept for interface compatibility.

        Returns:
            ``(image_data, box_data)`` where ``image_data`` is a float RGB
            array in the 0-255 range and ``box_data`` is an ``(n, 5)`` array
            of ``x1, y1, x2, y2, cls``, or an empty list when no valid box
            remains.
        """
        line = annotation_line.split()
        image = Image.open(line[0])
        iw, ih = image.size
        h, w = input_shape
        box = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])

        # resize image
        new_ar = w/h * rand(1-jitter,1+jitter)/rand(1-jitter,1+jitter)
        scale = rand(0.25, 2)
        if new_ar < 1:
            nh = int(scale*h)
            nw = int(nh*new_ar)
        else:
            nw = int(scale*w)
            nh = int(nw/new_ar)
        image = image.resize((nw,nh), Image.BICUBIC)

        # place image
        dx = int(rand(0, w-nw))
        dy = int(rand(0, h-nh))
        new_image = Image.new('RGB', (w,h), (128,128,128))
        new_image.paste(image, (dx, dy))
        image = new_image

        # flip image or not
        flip = rand()<.5
        if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)

        # distort image
        hue = rand(-hue, hue)
        sat = rand(1, sat) if rand()<.5 else 1/rand(1, sat)
        val = rand(1, val) if rand()<.5 else 1/rand(1, val)
        x = cv2.cvtColor(np.array(image,np.float32)/255, cv2.COLOR_RGB2HSV)
        # For float32 input OpenCV returns H in [0, 360] and S, V in [0, 1],
        # so the shifted hue must wrap around at 360 (the original wrapped
        # at 1, which only nudged the value by one degree).
        x[..., 0] += hue*360
        x[..., 0][x[..., 0]>360] -= 360
        x[..., 0][x[..., 0]<0] += 360
        x[..., 1] *= sat
        x[..., 2] *= val
        x[x[:,:, 0]>360, 0] = 360
        x[:, :, 1:][x[:, :, 1:]>1] = 1
        x[x<0] = 0
        image_data = cv2.cvtColor(x, cv2.COLOR_HSV2RGB)*255

        # correct boxes
        box_data = np.zeros((len(box),5))
        if len(box)>0:
            np.random.shuffle(box)
            box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
            box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
            if flip: box[:, [0,2]] = w - box[:, [2,0]]
            box[:, 0:2][box[:, 0:2]<0] = 0
            box[:, 2][box[:, 2]>w] = w
            box[:, 3][box[:, 3]>h] = h
            box_w = box[:, 2] - box[:, 0]
            box_h = box[:, 3] - box[:, 1]
            box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box
            box_data = np.zeros((len(box),5))
            box_data[:len(box)] = box
        if len(box) == 0:
            return image_data, []

        if (box_data[:,:4]>0).any():
            return image_data, box_data
        else:
            return image_data, []

    def generate(self, train=True):
        """Yield training batches forever.

        Args:
            train: Iterate over ``train_lines`` when true, otherwise over
                ``val_lines``. The chosen list is shuffled in place at the
                start of every pass.

        Yields:
            ``([images, hms, whs, regs, reg_masks, indices], dummy)`` where
            ``dummy`` is a zero vector of length ``batch_size``. ``whs``,
            ``regs``, ``reg_masks`` and ``indices`` have one slot per object
            (``max_objects`` slots); ``indices`` holds the row-major flat
            position of every object's centre cell on the output grid.
        """
        while True:
            lines = self.train_lines if train else self.val_lines
            shuffle(lines)

            (batch_images, batch_hms, batch_whs, batch_regs,
             batch_reg_masks, batch_indices) = self._new_batch()

            out_h, out_w = self.output_size
            b = 0
            for annotation_line in lines:
                img,y=self.get_random_data(annotation_line,self.input_size[0:2])

                if len(y)!=0:
                    # Rescale the boxes from input pixels to output grid
                    # cells and keep them inside the grid.
                    boxes = np.array(y[:,:4],dtype=np.float32)
                    boxes[:,[0, 2]] = boxes[:,[0, 2]]/self.input_size[1]*out_w
                    boxes[:,[1, 3]] = boxes[:,[1, 3]]/self.input_size[0]*out_h
                    boxes[:,[0, 2]] = np.clip(boxes[:,[0, 2]], 0, out_w - 1)
                    boxes[:,[1, 3]] = np.clip(boxes[:,[1, 3]], 0, out_h - 1)

                    for i, bbox in enumerate(boxes):
                        cls_id = int(y[i,-1])

                        h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
                        if h <= 0 or w <= 0:
                            continue

                        ct = np.array([(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
                        ct_int = ct.astype(np.int32)

                        # Draw the object's Gaussian on its class heatmap.
                        radius = gaussian_radius((math.ceil(h), math.ceil(w)))
                        radius = max(0, int(radius))
                        batch_hms[b, :, :, cls_id] = draw_gaussian(batch_hms[b, :, :, cls_id], ct_int, radius)

                        # Only max_objects regression slots exist; objects
                        # beyond that still contribute to the heatmap but get
                        # no size/offset target (the original raised
                        # IndexError here).
                        if i >= self.max_objects:
                            continue

                        batch_whs[b, i] = 1. * w, 1. * h
                        # Sub-cell offset of the centre.
                        batch_regs[b, i] = ct - ct_int
                        # Mark the slot as used so empty slots are ignored.
                        batch_reg_masks[b, i] = 1
                        # Row-major flat index: row * grid width + column.
                        # The loss and decode() flatten the (h, w) maps the
                        # same way, so the row stride must be the width.
                        batch_indices[b, i] = ct_int[1] * out_w + ct_int[0]

                batch_images[b] = preprocess_image(img)
                b = b + 1
                if b == self.batch_size:
                    b = 0
                    yield [batch_images, batch_hms, batch_whs, batch_regs, batch_reg_masks, batch_indices], np.zeros((self.batch_size,))

                    # Fresh buffers so the yielded arrays are not overwritten.
                    (batch_images, batch_hms, batch_whs, batch_regs,
                     batch_reg_masks, batch_indices) = self._new_batch()
2.3 损失
a. 结构
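1.热力图损失(focal loss;$y$ 为 y_true,$\hat{y}$ 为 y_pred,$N$ 为正样本个数,代码中 $a=2$、$b=4$):
$$L_k = -\frac{1}{N}\sum \begin{cases}(1-\hat{y})^{a}\,\log(\hat{y}), & y = 1\\ (1-y)^{b}\,\hat{y}^{a}\,\log(1-\hat{y}), & y \neq 1\end{cases}$$
2.reg中心点偏移的loss(L1):$L_{reg} = \frac{1}{N}\sum \lvert y_{pred}-y_{true}\rvert$
3.wh宽高的loss(L1):$L_{wh} = \frac{1}{N}\sum \lvert y_{pred}-y_{true}\rvert$
(2、3中只对mask为1的位置求和,$N$ 为mask之和。)
4.总损失:$L = L_k + a_1 L_{reg} + a_2 L_{wh}$(代码中 $a_1=1$,$a_2=0.1$)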
b.代码
def focal_loss(hm_pred, hm_true):
    """Penalty-reduced focal loss between predicted and target heatmaps.

    Positions where ``hm_true == 1`` are positives; every position with
    ``hm_true < 1`` is a negative whose contribution is down-weighted by
    ``(1 - hm_true) ** 4``.

    Args:
        hm_pred: Predicted heatmap with values in [0, 1].
        hm_true: Target heatmap of the same shape.

    Returns:
        Scalar loss: the summed positive and negative terms divided by the
        number of positives, or the summed negative term alone when there is
        no positive.
    """
    # Positive and negative sample masks.
    pos_mask = tf.cast(tf.equal(hm_true, 1), tf.float32)
    neg_mask = tf.cast(tf.less(hm_true, 1), tf.float32)
    neg_weights = tf.pow(1 - hm_true, 4)

    # tf.math.log is available in both TensorFlow 1.x and 2.x, whereas the
    # original tf.log alias was removed in 2.x. Inputs are clipped away from
    # zero so the logarithm stays finite.
    pos_loss = -tf.math.log(tf.clip_by_value(hm_pred, 1e-7, 1.)) * tf.pow(1 - hm_pred, 2) * pos_mask
    neg_loss = -tf.math.log(tf.clip_by_value(1 - hm_pred, 1e-7, 1.)) * tf.pow(hm_pred, 2) * neg_weights * neg_mask

    num_pos = tf.reduce_sum(pos_mask)
    pos_loss = tf.reduce_sum(pos_loss)
    neg_loss = tf.reduce_sum(neg_loss)

    # Avoid dividing by zero when the batch contains no positive sample.
    cls_loss = tf.cond(tf.greater(num_pos, 0), lambda: (pos_loss + neg_loss) / num_pos, lambda: neg_loss)
    return cls_loss
def reg_l1_loss(y_pred, y_true, indices, mask):
    """Masked L1 loss between gathered dense predictions and per-object targets.

    Args:
        y_pred: Dense prediction map; reshaped to (batch, -1, c) where ``c``
            is its last dimension.
        y_true: Per-object targets, (batch, k, 2).
        indices: (batch, k) flat positions into the reshaped prediction map;
            cast to int32 before use.
        mask: (batch, k) weights; entries equal to 0 remove the object slot
            from the loss.

    Returns:
        Scalar: the summed absolute error over masked entries divided by the
        sum of the (channel-tiled) mask plus 1e-4.
    """
    b, c = tf.shape(y_pred)[0], tf.shape(y_pred)[-1]
    k = tf.shape(indices)[1]

    y_pred = tf.reshape(y_pred, (b, -1, c))
    length = tf.shape(y_pred)[1]
    indices = tf.cast(indices, tf.int32)

    # Index of every target cell in the fully flattened (b * h * w) layout.
    # tf.shape already returns int32, so no cast is needed (the original
    # tf.to_int32 call does not exist in TensorFlow 2).
    batch_idx = tf.expand_dims(tf.range(0, b), 1)
    batch_idx = tf.tile(batch_idx, (1, k))
    full_indices = (tf.reshape(batch_idx, [-1]) * length +
                    tf.reshape(indices, [-1]))

    # Gather the predictions at the target cells.
    y_pred = tf.gather(tf.reshape(y_pred, [-1,c]),full_indices)
    y_pred = tf.reshape(y_pred, [b, -1, c])

    # Repeat the mask for both target channels.
    mask = tf.tile(tf.expand_dims(mask, axis=-1), (1, 1, 2))

    # Masked L1; the small constant avoids division by zero for empty masks.
    total_loss = tf.reduce_sum(tf.abs(y_true * mask - y_pred * mask))
    reg_loss = total_loss / (tf.reduce_sum(mask) + 1e-4)
    return reg_loss
def loss(args):
    """Combine the heatmap, size and offset losses into the training loss.

    Args:
        args: Sequence of eight tensors, in this order (shapes as given in
            the original notes):
            hm_pred  - predicted heatmap  (batch, out_h, out_w, num_classes)
            wh_pred  - predicted sizes    (batch, out_h, out_w, 2)
            reg_pred - predicted offsets  (batch, out_h, out_w, 2)
            hm_true  - target heatmap     (batch, out_h, out_w, num_classes)
            wh_true  - target sizes       (batch, max_objects, 2)
            reg_true - target offsets     (batch, max_objects, 2)
            reg_mask - target slot mask   (batch, max_objects)
            indices  - target positions   (batch, max_objects)

    Returns:
        ``focal_loss + 0.1 * size L1 loss + offset L1 loss``.
    """
    (hm_pred, wh_pred, reg_pred,
     hm_true, wh_true, reg_true, reg_mask, indices) = args

    heatmap_term = focal_loss(hm_pred, hm_true)
    # The size term is down-weighted by 0.1 relative to the other two.
    size_term = 0.1 * reg_l1_loss(wh_pred, wh_true, indices, reg_mask)
    offset_term = reg_l1_loss(reg_pred, reg_true, indices, reg_mask)

    return heatmap_term + size_term + offset_term