参考
1.目标检测——搭建更好更快的YOLO!
2.You Only Look Once: Unified, Real-Time Object Detection
模型框架
1. 论文模型框架
1.1 Backbone
1.2 Head
SPP+SAM
1.2.1 SPP
1.2.2 SAM
代码实现
1. Backbone(ResNet)
1.1 残差原理
1.2 ResNet 基本结构
1.3 BasicBlock and Bottleneck
1.4 实现思路
- 通过形如resnet*()(*代指18,34,50,101,152等)的函数返回ResNet实例
- 各resnet*()函数内部完成具体实例化并加载预训练模型参数,返回实例化后的模型(本文实现中没有单独的_resnet()辅助函数)
- 定义ResNet主体框架
- 定义BasicBlock以及Bottleneck
from typing import Any, Callable, List, Optional, Tuple, Type, Union

import torch
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
from torch import Tensor
# Names exported by a star-import of this module: the backbone class and
# one factory function per supported depth.
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152']
# Download locations of the pretrained checkpoints, keyed by architecture
# name. The resnet*() factories look their own key up here when called with
# ``pretrained=True``.
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes: int, out_planes: int, stride: int=1)->nn.Conv2d:
    """Build a bias-free 3x3 convolution with padding 1.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        stride: Convolution stride (default 1).

    Returns:
        The configured ``nn.Conv2d`` layer.
    """
    layer = nn.Conv2d(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
    )
    return layer
def conv1x1(in_planes: int, out_planes: int, stride: int=1)->nn.Conv2d:
    """Build a bias-free 1x1 (pointwise) convolution.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        stride: Convolution stride (default 1).

    Returns:
        The configured ``nn.Conv2d`` layer.
    """
    layer = nn.Conv2d(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
    return layer
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions.

    The main path is conv3x3 -> BN -> ReLU -> conv3x3 -> BN; its result is
    added to the shortcut and passed through a final ReLU.
    """

    # Ratio between the block's output channels and ``planes``.
    expansion: int = 1

    def __init__(
        self,
        inplanes: int,
        planes: int,
        stride: int=1,
        downsample: Optional[nn.Module]=None
    ) -> None:
        """Create the block.

        Args:
            inplanes: Channels of the incoming feature map.
            planes: Channels produced by both convolutions.
            stride: Stride of the first convolution.
            downsample: Optional module applied to the input to build the
                shortcut; when ``None`` the input itself is the shortcut.
        """
        super().__init__()
        # Attribute order is kept as-is: it fixes the registration order of
        # the sub-modules (state_dict layout, ``modules()`` iteration).
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        """Apply the residual block to ``x`` and return the activated sum."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
class Bottleneck(nn.Module):
    """Three-layer residual bottleneck block (1x1 -> 3x3 -> 1x1).

    The first 1x1 convolution reduces the channel count to ``planes``, the
    3x3 convolution works at that width (and carries the stride), and the
    last 1x1 convolution expands to ``planes * expansion`` channels so the
    result can be added to the shortcut built by ``ResNet._make_layer``.
    """

    # Ratio between the block's output channels and ``planes``.
    expansion: int = 4

    def __init__(
        self,
        inplanes: int,
        planes: int,
        stride: int=1,
        downsample: Optional[nn.Module] = None
    ) -> None:
        """Create the block.

        Args:
            inplanes: Channels of the incoming feature map.
            planes: Width of the bottleneck; the block outputs
                ``planes * expansion`` channels.
            stride: Stride of the 3x3 convolution.
            downsample: Optional module applied to the input to build the
                shortcut; when ``None`` the input itself is the shortcut.
        """
        super().__init__()
        out_planes = planes * self.expansion
        self.conv1 = conv1x1(inplanes, planes)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn2 = nn.BatchNorm2d(planes)
        # The last stage must expand to ``planes * expansion`` channels:
        # both the downsample shortcut and the next block's ``inplanes``
        # are sized that way, so emitting only ``planes`` channels makes
        # the residual addition fail with a shape mismatch.
        self.conv3 = conv1x1(planes, out_planes)
        self.bn3 = nn.BatchNorm2d(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        """Apply the bottleneck block to ``x`` and return the activated sum."""
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)
        return out
class ResNet(nn.Module):
    """ResNet feature extractor without the pooling / fully-connected head.

    ``forward`` returns the outputs of the last three stages instead of
    class logits, so the network can be used as a detection backbone.
    """

    def __init__(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        layers: List[int],
        zero_init_residual: bool=False
    ) -> None:
        """Build the stem and the four residual stages.

        Args:
            block: Residual block class used in every stage.
            layers: Number of blocks in each of the four stages.
            zero_init_residual: If True, zero the weight of the last
                BatchNorm of every residual block so each block starts out
                as an identity mapping.
        """
        super(ResNet, self).__init__()
        # Running channel count; updated by ``_make_layer`` after each stage.
        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        planes: int,
        blocks: int,
        stride: int=1
    ) -> nn.Sequential:
        """Stack ``blocks`` residual blocks into one stage.

        Only the first block receives ``stride`` and, when the stride or the
        channel count changes, a 1x1-conv + BN shortcut. ``self.inplanes`` is
        updated to the stage's output width.
        """
        downsample = None
        out_planes = planes * block.expansion
        if stride != 1 or self.inplanes != out_planes:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, out_planes, stride),
                nn.BatchNorm2d(out_planes),
            )

        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = out_planes
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
        """Run the backbone and return three feature maps.

        Returns:
            ``(C_3, C_4, C_5)``: the outputs of ``layer2``, ``layer3`` and
            ``layer4``. Given the strides configured above these are
            downsampled by 8, 16 and 32 relative to the input.
        """
        C_1 = self.conv1(x)
        C_1 = self.bn1(C_1)
        C_1 = self.relu(C_1)
        C_1 = self.maxpool(C_1)

        C_2 = self.layer1(C_1)
        C_3 = self.layer2(C_2)
        C_4 = self.layer3(C_3)
        C_5 = self.layer4(C_4)
        return C_3, C_4, C_5
def resnet18(pretrained: bool=False, hr_pretrained: bool=False, **kwargs: Any)->ResNet:
    """Construct a ResNet-18 backbone.

    Args:
        pretrained (bool): If True, load pretrained weights into the model.
        hr_pretrained (bool): Only consulted when ``pretrained`` is True.
            If True, weights come from the local file
            ``backbone/weights/resnet18_hr_10.pth``; otherwise they are
            fetched from ``model_urls['resnet18']``.
        **kwargs: Forwarded to the ``ResNet`` constructor.

    Returns:
        The constructed (and optionally weight-initialised) model.
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if not pretrained:
        return model

    if hr_pretrained:
        print('Loading the high resolution pretrained model ...')
        state_dict = torch.load("backbone/weights/resnet18_hr_10.pth")
    else:
        state_dict = model_zoo.load_url(model_urls['resnet18'])

    # Non-strict load: the backbone has no fc layer, so the checkpoint's
    # fc parameters must be skipped instead of raising.
    model.load_state_dict(state_dict, strict=False)
    return model
def resnet34(pretrained: bool=False, hr_pretrained: bool=False, **kwargs: Any)->ResNet:
    """Construct a ResNet-34 backbone.

    Args:
        pretrained (bool): If True, load the weights referenced by
            ``model_urls['resnet34']`` (non-strict, extra keys are ignored).
        hr_pretrained (bool): Accepted for signature parity with
            ``resnet18``; not used by this function.
        **kwargs: Forwarded to the ``ResNet`` constructor.

    Returns:
        The constructed (and optionally weight-initialised) model.
    """
    stage_depths = [3, 4, 6, 3]
    model = ResNet(BasicBlock, stage_depths, **kwargs)
    if not pretrained:
        return model
    state_dict = model_zoo.load_url(model_urls['resnet34'])
    model.load_state_dict(state_dict, strict=False)
    return model
def resnet50(pretrained: bool=False, hr_pretrained: bool=False, **kwargs: Any)->ResNet:
    """Construct a ResNet-50 backbone.

    Args:
        pretrained (bool): If True, load the weights referenced by
            ``model_urls['resnet50']`` (non-strict, extra keys are ignored).
        hr_pretrained (bool): Accepted for signature parity with
            ``resnet18``; not used by this function.
        **kwargs: Forwarded to the ``ResNet`` constructor.

    Returns:
        The constructed (and optionally weight-initialised) model.
    """
    stage_depths = [3, 4, 6, 3]
    model = ResNet(Bottleneck, stage_depths, **kwargs)
    if not pretrained:
        return model
    state_dict = model_zoo.load_url(model_urls['resnet50'])
    model.load_state_dict(state_dict, strict=False)
    return model
def resnet101(pretrained: bool=False, hr_pretrained: bool=False, **kwargs: Any)->ResNet:
    """Construct a ResNet-101 backbone.

    Args:
        pretrained (bool): If True, load the weights referenced by
            ``model_urls['resnet101']`` (non-strict, extra keys are ignored).
        hr_pretrained (bool): Accepted for signature parity with
            ``resnet18``; not used by this function.
        **kwargs: Forwarded to the ``ResNet`` constructor.

    Returns:
        The constructed (and optionally weight-initialised) model.
    """
    stage_depths = [3, 4, 23, 3]
    model = ResNet(Bottleneck, stage_depths, **kwargs)
    if not pretrained:
        return model
    state_dict = model_zoo.load_url(model_urls['resnet101'])
    model.load_state_dict(state_dict, strict=False)
    return model
def resnet152(pretrained: bool=False, hr_pretrained: bool=False, **kwargs: Any)->ResNet:
    """Construct a ResNet-152 backbone.

    Args:
        pretrained (bool): If True, load the weights referenced by
            ``model_urls['resnet152']``.
        hr_pretrained (bool): Accepted for signature parity with
            ``resnet18``; not used by this function.
        **kwargs: Forwarded to the ``ResNet`` constructor.

    Returns:
        The constructed (and optionally weight-initialised) model.
    """
    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if pretrained:
        # strict=False, like the other factories: this ResNet has no fc
        # layer, so a strict load would raise on the checkpoint's fc keys.
        model.load_state_dict(model_zoo.load_url(model_urls['resnet152']), strict=False)
    return model
# if __name__=='__main__':
# #model = torchvision.models.resnet50()
# print("found ", torch.cuda.device_count(), " GPU(s)")
# device = torch.device("cuda")
# model = resnet18(pretrained=True)#.to(device)
# print(model)
# input_data = torch.randn(1, 3, 512, 512)#.to(device)
# output = model(input_data)
# print(output[0].shape)
2. Head(SPP + SAM)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from typing import Type, Any, Callable, Union, List, Optional
class Conv2d(nn.Module):
    """Convolution followed by BatchNorm and a (Leaky)ReLU activation."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        padding: int=0,
        stride: int=1,
        dilation: int=1,
        leakyReLU: bool=False
    )->None:
        """Create the conv / norm / activation stack.

        Args:
            in_channels: Number of input channels.
            out_channels: Number of output channels.
            kernel_size: Size of the convolution kernel.
            padding: Zero-padding added on each side of the input.
            stride: Convolution stride. Defaults to 1; the previous default
                of 0 is not a valid stride for ``nn.Conv2d`` and every call
                site that relied on the default failed at run time.
            dilation: Spacing between kernel elements.
            leakyReLU: Use ``LeakyReLU(0.1)`` instead of ``ReLU``.
        """
        super(Conv2d, self).__init__()
        activation = nn.LeakyReLU(0.1, inplace=True) if leakyReLU else nn.ReLU(inplace=True)
        self.convs = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation),
            nn.BatchNorm2d(out_channels),
            activation,
        )

    def forward(self, x: Tensor)->Tensor:
        """Apply convolution, batch normalisation and activation to ``x``."""
        return self.convs(x)
class SAM(nn.Module):
    """Attention gate: a 1x1 convolution + sigmoid that rescales its input."""

    def __init__(self, in_channels: int) ->None:
        """Create the gate; it keeps the channel count at ``in_channels``."""
        super().__init__()
        gate_layers = [
            nn.Conv2d(in_channels, in_channels, 1),
            nn.Sigmoid(),
        ]
        self.conv = nn.Sequential(*gate_layers)

    def forward(self, x: Tensor)->Tensor:
        """Return ``x`` multiplied element-wise by its sigmoid attention map."""
        return x * self.conv(x)
class SPP(nn.Module):
    """Spatial Pyramid Pooling.

    The input is concatenated with three stride-1 max-pooled copies of
    itself (kernel sizes 5, 9 and 13, padded so the spatial size is kept)
    and the result is fused back to ``out_channels`` by a 1x1 ``Conv2d``.
    """

    def __init__(self, in_channels: int, out_channels: int)->None:
        """Create the fusing 1x1 convolution over the 4x-wide concatenation."""
        super().__init__()
        self.fuse_conv = Conv2d(in_channels * 4, out_channels, 1, leakyReLU=True)

    def forward(self, x: Tensor)-> Tensor:
        """Pool ``x`` at three scales, concatenate on channels, and fuse."""
        pooled = [
            F.max_pool2d(x, kernel, stride=1, padding=kernel // 2)
            for kernel in (5, 9, 13)
        ]
        stacked = torch.cat([x, *pooled], dim=1)
        return self.fuse_conv(stacked)
3. Loss
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
class BCELoss(nn.Module):
    """Binary cross-entropy split into positive and negative parts.

    Elements whose target equals 1.0 contribute to the positive loss and
    elements whose target equals 0.0 to the negative loss; any other target
    value contributes to neither.
    """

    def __init__(self, weight=None, ignore_index=-100, reduce=None, reduction='mean'):
        """Store ``reduction``; the other arguments are accepted but unused."""
        super().__init__()
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return ``(pos_loss, neg_loss)``.

        ``inputs`` are probabilities. With ``reduction='mean'`` each loss is
        summed over dim 1 and averaged over the result; otherwise the
        element-wise losses are returned.
        """
        pos_mask = (targets == 1.0).float()
        neg_mask = (targets == 0.0).float()

        # 1e-14 keeps log() away from zero.
        pos_loss = -pos_mask * torch.log(inputs + 1e-14)
        neg_loss = -neg_mask * torch.log(1.0 - inputs + 1e-14)

        if self.reduction != 'mean':
            return pos_loss, neg_loss
        return pos_loss.sum(1).mean(), neg_loss.sum(1).mean()
class MSELoss(nn.Module):
    """Squared-error loss split into positive and negative parts.

    Elements whose target equals 1.0 contribute ``(input - target)**2`` to
    the positive loss; elements whose target equals 0.0 contribute
    ``input**2`` to the negative loss.
    """

    def __init__(self, reduction='mean'):
        """Store the reduction mode (``'mean'`` or anything else for none)."""
        super().__init__()
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return ``(pos_loss, neg_loss)``.

        With ``reduction='mean'`` each loss is summed over dim 1 and averaged
        over the result; otherwise the element-wise losses are returned.
        """
        pos_mask = (targets == 1.0).float()
        neg_mask = (targets == 0.0).float()

        pos_loss = pos_mask * (inputs - targets)**2
        neg_loss = neg_mask * (inputs)**2

        if self.reduction != 'mean':
            return pos_loss, neg_loss
        return pos_loss.sum(1).mean(), neg_loss.sum(1).mean()
class BCE_focal_loss(nn.Module):
    """Binary cross-entropy with focal modulation.

    Positives (target == 1.0) are weighted by ``(1 - p)**gamma`` and every
    other element is treated as a negative weighted by ``p**gamma``.
    """

    def __init__(self, weight=None, gamma=2, reduction='mean'):
        """Store ``gamma`` and ``reduction``; ``weight`` is accepted but unused."""
        super().__init__()
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss of probabilities ``inputs`` against ``targets``.

        With ``reduction='mean'`` the loss is summed over dim 1 and averaged
        over the result; otherwise the element-wise loss is returned.
        """
        pos_mask = (targets == 1.0).float()
        neg_mask = (1 - pos_mask).float()

        # 1e-14 keeps log() away from zero.
        pos_loss = -pos_mask * (1.0-inputs)**self.gamma * torch.log(inputs + 1e-14)
        neg_loss = -neg_mask * (inputs)**self.gamma * torch.log(1.0 - inputs + 1e-14)
        combined = pos_loss+neg_loss

        if self.reduction != 'mean':
            return combined
        return torch.mean(torch.sum(combined, 1))
def generate_dxdywh(gt_label, w, h, s):
    """Encode one ground-truth box as grid-cell regression targets.

    Args:
        gt_label: ``[xmin, ymin, xmax, ymax, class]``; the coordinates are
            multiplied by ``w`` / ``h`` below, i.e. treated as fractions of
            the image size.
        w: Image width.
        h: Image height.
        s: Stride of the prediction grid.

    Returns:
        ``False`` when the scaled box width or height is below 1e-28;
        otherwise ``(grid_x, grid_y, tx, ty, tw, th, weight)`` where
        ``tx, ty`` are the centre offsets inside the cell, ``tw, th`` the
        log of the scaled width/height, and
        ``weight = 2 - (box_w / w) * (box_h / h)``.
    """
    xmin, ymin, xmax, ymax = gt_label[:-1]

    # Box size in image units; degenerate boxes are rejected.
    box_w = (xmax - xmin) * w
    box_h = (ymax - ymin) * h
    if box_w < 1e-28 or box_h < 1e-28:
        return False

    # Box centre expressed in grid units.
    c_x_s = (xmax + xmin) / 2 * w / s
    c_y_s = (ymax + ymin) / 2 * h / s
    grid_x, grid_y = int(c_x_s), int(c_y_s)

    return (
        grid_x,
        grid_y,
        c_x_s - grid_x,
        c_y_s - grid_y,
        np.log(box_w),
        np.log(box_h),
        2.0 - (box_w / w) * (box_h / h),
    )
def gt_creator(input_size, stride, label_lists=None, name='VOC'):
    """Build the per-cell training targets for a batch.

    Args:
        input_size: ``(height, width)`` of the network input.
        stride: Stride of the prediction grid.
        label_lists: One list per image; each entry is a box
            ``[xmin, ymin, xmax, ymax, class]`` as expected by
            ``generate_dxdywh``. Must be non-empty.
        name: Unused; kept for call compatibility.

    Returns:
        ``numpy.ndarray`` of shape ``[batch, hs * ws, 7]`` whose last axis
        holds ``[objectness, class, tx, ty, tw, th, box_weight]``. Cells
        without an object stay all-zero. If several boxes fall into the
        same cell, the last one wins.
    """
    # ``None`` instead of a shared mutable ``[]`` default.
    if label_lists is None:
        label_lists = []
    assert len(input_size) > 0 and len(label_lists) > 0

    batch_size = len(label_lists)
    h, w = input_size[0], input_size[1]
    ws, hs = w // stride, h // stride

    gt_tensor = np.zeros([batch_size, hs, ws, 1 + 1 + 4 + 1])

    for batch_index, labels in enumerate(label_lists):
        for gt_label in labels:
            gt_class = int(gt_label[-1])
            result = generate_dxdywh(gt_label, w, h, stride)
            if not result:
                # Degenerate box rejected by generate_dxdywh.
                continue
            grid_x, grid_y, tx, ty, tw, th, weight = result
            # Lower bound matters too: a negative index would silently
            # wrap around and mark a cell on the opposite image border.
            if 0 <= grid_x < ws and 0 <= grid_y < hs:
                gt_tensor[batch_index, grid_y, grid_x, 0] = 1.0
                gt_tensor[batch_index, grid_y, grid_x, 1] = gt_class
                gt_tensor[batch_index, grid_y, grid_x, 2:6] = np.array([tx, ty, tw, th])
                gt_tensor[batch_index, grid_y, grid_x, 6] = weight

    return gt_tensor.reshape(batch_size, -1, 1 + 1 + 4 + 1)
def loss(pred_conf, pred_cls, pred_txtytwth, label):
    """Compute the detection loss terms.

    Args:
        pred_conf: Raw objectness predictions, indexed as ``[:, :, 0]``.
        pred_cls: Raw class scores; permuted to ``(0, 2, 1)`` before the
            cross-entropy.
        pred_txtytwth: Raw box predictions; the first two channels of the
            last axis are the centre offsets, the rest the size terms.
        label: Targets whose last axis is
            ``[objectness, class, tx, ty, tw, th, box_weight]``.

    Returns:
        ``(conf_loss, cls_loss, txtytwth_loss, total_loss)``.
    """
    obj_weight, noobj_weight = 5.0, 1.0

    # Unpack the target tensor.
    gt_obj = label[:, :, 0].float()
    gt_cls = label[:, :, 1].long()
    gt_txtytwth = label[:, :, 2:-1].float()
    gt_box_scale_weight = label[:, :, -1]

    def _masked_box_term(per_element):
        # Sum over the coordinate axis, weight by box scale, keep only the
        # cells that contain an object, then sum per image and average.
        per_cell = torch.sum(per_element, 2)
        return torch.mean(torch.sum(per_cell * gt_box_scale_weight * gt_obj, 1))

    # Objectness: positives and negatives are weighted differently.
    conf_criterion = MSELoss(reduction='mean')
    pos_loss, neg_loss = conf_criterion(torch.sigmoid(pred_conf[:, :, 0]), gt_obj)
    conf_loss = obj_weight * pos_loss + noobj_weight * neg_loss

    # Classification: only cells that contain an object contribute.
    cls_criterion = nn.CrossEntropyLoss(reduction='none')
    per_cell_cls = cls_criterion(pred_cls.permute(0, 2, 1), gt_cls)
    cls_loss = torch.mean(torch.sum(per_cell_cls * gt_obj, 1))

    # Box regression: BCE-with-logits on the offsets, MSE on the sizes.
    txty_criterion = nn.BCEWithLogitsLoss(reduction='none')
    twth_criterion = nn.MSELoss(reduction='none')
    txty_loss = _masked_box_term(
        txty_criterion(pred_txtytwth[:, :, :2], gt_txtytwth[:, :, :2]))
    twth_loss = _masked_box_term(
        twth_criterion(pred_txtytwth[:, :, 2:], gt_txtytwth[:, :, 2:]))
    txtytwth_loss = txty_loss + twth_loss

    total_loss = conf_loss + cls_loss + txtytwth_loss
    return conf_loss, cls_loss, txtytwth_loss, total_loss
4. YOLO_V1
import torch
import torch.nn as nn
from torch import Tensor
import numpy as np
class CustomYOLO(nn.Module):
    """Single-scale YOLO-style detector.

    Pipeline: ResNet-18 backbone (last feature map only) -> SPP -> SAM ->
    four-layer conv set -> 1x1 prediction conv producing, per grid cell,
    ``1 + num_classes + 4`` values (objectness, class scores, tx/ty/tw/th).

    In inference mode (``trainable=False``) ``forward`` handles the first
    image of the batch only and returns post-processed detections; in
    training mode it returns the loss terms.
    """

    def __init__(
        self,
        device: torch.device,
        input_size,
        num_classes: int=20,
        trainable: bool=False,
        conf_thresh: float=0.01,
        nms_thresh: float=0.5,
    ) -> None:
        """Build the detector.

        Args:
            device: Device on which the grid and scale tensors are created.
            input_size: ``(height, width)`` of the network input.
            num_classes: Number of object classes.
            trainable: If True ``forward`` returns losses, otherwise
                detections.
            conf_thresh: Minimum class score kept by ``postprocess``.
            nms_thresh: IoU threshold used by ``nms``.
        """
        super(CustomYOLO, self).__init__()
        self.device = device
        self.num_classes = num_classes
        self.trainable = trainable
        self.conf_thresh = conf_thresh
        self.nms_thresh = nms_thresh
        self.stride = 32
        # Sets input_size, grid_cell, scale and scale_torch in one place so
        # the constructor and set_grid() cannot drift apart.
        self.set_grid(input_size)

        # we use resnet18 as backbone
        self.backbone = resnet18(pretrained=True)

        self.SPP = SPP(512, 512)
        self.SAM = SAM(512)
        self.conv_set = nn.Sequential(
            Conv2d(512, 256, 1, leakyReLU=True),
            Conv2d(256, 512, 3, padding=1, leakyReLU=True),
            Conv2d(512, 256, 1, leakyReLU=True),
            Conv2d(256, 512, 3, padding=1, leakyReLU=True),
        )
        self.pred = nn.Conv2d(512, 1 + self.num_classes + 4, 1)

    def create_grid(self, input_size)->Tensor:
        """Return the ``[1, hs * ws, 2]`` tensor of ``(x, y)`` cell indices.

        Cells are enumerated row by row (y-major), matching the flattening
        of the prediction map in ``forward``.
        """
        w, h = input_size[1], input_size[0]
        ws, hs = w // self.stride, h // self.stride
        # Broadcasting instead of torch.meshgrid: same row-major layout,
        # without depending on meshgrid's ``indexing`` default.
        grid_y = torch.arange(hs).view(hs, 1).expand(hs, ws)
        grid_x = torch.arange(ws).view(1, ws).expand(hs, ws)
        grid_xy = torch.stack([grid_x, grid_y], dim=-1).float()
        return grid_xy.reshape(1, hs * ws, 2).to(self.device)

    def set_grid(self, input_size)->None:
        """Recompute the grid and scale factors for a new ``(height, width)``."""
        self.input_size = input_size
        self.grid_cell = self.create_grid(input_size)
        # Shape (1, 4): [w, h, w, h]. The original constructor used this
        # rank while set_grid() used (1, 1, 4); both broadcast identically
        # against [B, HW, 4] boxes, so the constructor's rank is kept.
        self.scale = np.array([[input_size[1], input_size[0], input_size[1], input_size[0]]])
        self.scale_torch = torch.tensor(self.scale.copy(), device=self.device).float()

    def decode_boxes(self, pred: Tensor)-> Tensor:
        """Convert raw ``[tx, ty, tw, th]`` into ``[xmin, ymin, xmax, ymax]``.

        The centre is ``(sigmoid(t) + cell index) * stride`` and the size is
        ``exp(t)``. ``pred`` is left untouched (the previous version
        overwrote it in place, corrupting the caller's prediction tensor).
        """
        output = torch.zeros_like(pred)
        center = (torch.sigmoid(pred[:, :, :2]) + self.grid_cell) * self.stride
        half_size = torch.exp(pred[:, :, 2:]) / 2
        # [c_x, c_y, w, h] -> [xmin, ymin, xmax, ymax]
        output[:, :, :2] = center - half_size
        output[:, :, 2:] = center + half_size
        return output

    def nms(self, dets, scores)-> list:
        """Greedy non-maximum suppression on NumPy arrays.

        Args:
            dets: ``(N, 4)`` array of ``[xmin, ymin, xmax, ymax]`` boxes.
            scores: ``(N,)`` array of scores.

        Returns:
            Indices (into ``dets``) of the kept boxes, highest score first.
        """
        x1, y1, x2, y2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
        areas = (x2 - x1) * (y2 - y1)
        order = scores.argsort()[::-1]  # highest score first

        keep = []
        while order.size > 0:
            i = order[0]  # best remaining box
            keep.append(i)
            rest = order[1:]
            xx1 = np.maximum(x1[i], x1[rest])
            yy1 = np.maximum(y1[i], y1[rest])
            xx2 = np.minimum(x2[i], x2[rest])
            yy2 = np.minimum(y2[i], y2[rest])

            # Clamp to a tiny positive value so disjoint boxes never yield a
            # positive intersection from two negative extents.
            w = np.maximum(1e-28, xx2 - xx1)
            h = np.maximum(1e-28, yy2 - yy1)
            inter = w * h

            # IoU = intersection / union
            ovr = inter / (areas[i] + areas[rest] - inter)
            # keep only the boxes that overlap the winner by <= threshold
            inds = np.where(ovr <= self.nms_thresh)[0]
            order = order[inds + 1]
        return keep

    def clip_boxes(self, boxes, im_shape):
        """Clip ``boxes`` to the image rectangle and return them.

        NOTE(review): this helper was called by ``postprocess`` but missing
        from the class. It assumes ``im_shape`` is ``(height, width[, ...])``
        and that ``boxes`` are in the same pixel units - confirm against
        the callers that pass ``im_shape``.
        """
        if boxes.shape[0] == 0:
            return boxes
        boxes[:, 0::2] = np.maximum(np.minimum(boxes[:, 0::2], im_shape[1] - 1), 0)
        boxes[:, 1::2] = np.maximum(np.minimum(boxes[:, 1::2], im_shape[0] - 1), 0)
        return boxes

    def postprocess(self, all_local, all_conf, exchange=True, im_shape=None):
        """Threshold the scores and run per-class NMS for a single image.

        Args:
            all_local: ``(HxW, 4)`` NumPy array of boxes.
            all_conf: ``(HxW, num_classes)`` NumPy array of class scores.
            exchange: Unused; kept for call compatibility.
            im_shape: If given, boxes are passed through ``clip_boxes``.

        Returns:
            ``(bboxes, scores, cls_inds)`` of the surviving detections.
        """
        bbox_pred = all_local
        prob_pred = all_conf

        # Best class per cell and its score.
        cls_inds = np.argmax(prob_pred, axis=1)
        scores = prob_pred[(np.arange(prob_pred.shape[0]), cls_inds)].copy()

        # threshold
        keep = np.where(scores >= self.conf_thresh)
        bbox_pred = bbox_pred[keep]
        scores = scores[keep]
        cls_inds = cls_inds[keep]

        # NMS, class by class. A boolean mask replaces the former
        # ``dtype=np.int`` array: that alias no longer exists in NumPy.
        keep = np.zeros(len(bbox_pred), dtype=bool)
        for i in range(self.num_classes):
            inds = np.where(cls_inds == i)[0]
            if len(inds) == 0:
                continue
            c_keep = self.nms(bbox_pred[inds], scores[inds])
            keep[inds[c_keep]] = True

        bbox_pred = bbox_pred[keep]
        scores = scores[keep]
        cls_inds = cls_inds[keep]

        if im_shape is not None:
            # clip
            bbox_pred = self.clip_boxes(bbox_pred, im_shape)

        return bbox_pred, scores, cls_inds

    def forward(self, x, target=None):
        """Run the detector.

        Args:
            x: Input image batch.
            target: Training labels, forwarded to ``loss`` when
                ``trainable`` is True; ignored otherwise.

        Returns:
            Inference: ``(bboxes, scores, cls_inds)`` NumPy arrays for the
            first image of the batch, with boxes divided by the input size
            and clamped to ``[0, 1]``.
            Training: ``(conf_loss, cls_loss, txtytwth_loss, total_loss)``.
        """
        # backbone
        _, _, C_5 = self.backbone(x)

        # head
        C_5 = self.SPP(C_5)
        C_5 = self.SAM(C_5)
        C_5 = self.conv_set(C_5)

        # pred: [B, C, H, W] -> [B, H*W, C]
        prediction = self.pred(C_5)
        prediction = prediction.view(C_5.size(0), 1 + self.num_classes + 4, -1).permute(0, 2, 1)

        # Divide prediction to obj_pred, txtytwth_pred and cls_pred
        # [B, H*W, 1]
        conf_pred = prediction[:, :, :1]
        # [B, H*W, num_cls]
        cls_pred = prediction[:, :, 1 : 1 + self.num_classes]
        # [B, H*W, 4]
        txtytwth_pred = prediction[:, :, 1 + self.num_classes:]

        if self.trainable:
            conf_loss, cls_loss, txtytwth_loss, total_loss = loss(pred_conf=conf_pred, pred_cls=cls_pred,
                                                                  pred_txtytwth=txtytwth_pred,
                                                                  label=target)
            return conf_loss, cls_loss, txtytwth_loss, total_loss

        with torch.no_grad():
            # Only the first image of the batch is decoded.
            all_conf = torch.sigmoid(conf_pred)[0]
            all_bbox = torch.clamp((self.decode_boxes(txtytwth_pred) / self.scale_torch)[0], 0., 1.)
            # Class score = class probability * objectness.
            all_class = (torch.softmax(cls_pred[0, :, :], 1) * all_conf)

            all_class = all_class.to('cpu').numpy()
            all_bbox = all_bbox.to('cpu').numpy()

            bboxes, scores, cls_inds = self.postprocess(all_bbox, all_class)
            return bboxes, scores, cls_inds
if __name__ == '__main__':
    # Smoke test: build the detector on the CPU and push one random
    # 512x512 image through it in inference mode.
    gpu_count = torch.cuda.device_count()
    print("found ", gpu_count, " GPU(s)")

    device = torch.device("cpu")
    model = CustomYOLO(
        device,
        input_size=[512, 512],
        num_classes=20,
        trainable=False,
    )
    print(model)

    input_data = torch.randn(1, 3, 512, 512)
    output = model(input_data)
    # output[0] holds the detected boxes.
    print(output[0].shape)