原理篇
网络结构
1. 候选区域选择
设计label格式,设置真值。
def encoder(self, boxes, labels):
    """Encode ground-truth boxes and labels into a YOLO v1 style target grid.

    Args:
        boxes: (tensor) sized [num_boxes, 4], rows are [x1, y1, x2, y2].
            Coordinates are expected to be normalised to [0, 1], because the
            grid cell size used below is ``1 / grid_num``.
        labels: (tensor) sized [num_boxes], zero-based class indices in
            [0, 19]; label ``k`` sets channel ``10 + k``.

    Returns:
        (tensor) sized [grid_num, grid_num, 30] (14 x 14 x 30 here).
        For the cell that contains a box centre the 30 channels are::

            [dx, dy, w, h, 1,  dx, dy, w, h, 1,  one-hot class (20)]

        where (dx, dy) is the centre offset inside the cell measured in
        cell units and (w, h) is the box size in image-normalised units.
        All other cells stay zero.  If several boxes fall into the same
        cell, the later box overwrites the geometry of the earlier one.
    """
    grid_num = 14
    # Each cell predicts B boxes (x, y, w, h, confidence) plus C class
    # scores, i.e. B * 5 + C = 2 * 5 + 20 = 30 channels per cell.
    target = torch.zeros((grid_num, grid_num, 30))
    cell_size = 1. / grid_num
    wh = boxes[:, 2:] - boxes[:, :2]
    cxcy = (boxes[:, 2:] + boxes[:, :2]) / 2
    for i in range(cxcy.size()[0]):
        cxcy_sample = cxcy[i]
        # Index of the grid cell that contains the box centre.  A centre
        # coordinate of exactly 0 would give ceil(0) - 1 == -1, which Python
        # indexing silently wraps to the last row/column, so clamp to 0.
        ij = ((cxcy_sample / cell_size).ceil() - 1).clamp(min=0)
        row, col = int(ij[1]), int(ij[0])

        # Both box slots of the responsible cell get confidence 1.
        target[row, col, 4] = 1
        target[row, col, 9] = 1
        # The two boxes occupy channels 0-9; class scores start at channel 10.
        target[row, col, int(labels[i]) + 10] = 1

        # Top-left corner of the matched cell in normalised coordinates.
        xy = ij * cell_size
        # Offset of the box centre inside the cell, in units of one cell.
        delta_xy = (cxcy_sample - xy) / cell_size
        target[row, col, 2:4] = wh[i]
        target[row, col, :2] = delta_xy
        target[row, col, 7:9] = wh[i]
        target[row, col, 5:7] = delta_xy
    return target
2. Loss 函数
class yoloLoss(nn.Module):
    """YOLO v1 style detection loss for a 30-channel grid output.

    Each grid cell carries two boxes ``[x, y, w, h, c]`` in channels 0-9
    followed by 20 class scores in channels 10-29.  The loss is the sum of
    squared errors for localisation, confidence and classification, divided
    by the batch size.  It runs on whatever device the inputs live on.

    Args:
        S: grid size given by the caller (stored; the grid size used in the
            computation is read from the target tensor's shape).
        B: number of boxes per cell (stored; the layout below assumes 2).
        l_coord: weight of the localisation term.
        l_noobj: weight of the confidence term for cells without an object.
    """

    def __init__(self, S, B, l_coord, l_noobj):
        super(yoloLoss, self).__init__()
        self.S = S
        self.B = B
        self.l_coord = l_coord
        self.l_noobj = l_noobj

    def compute_iou(self, box1, box2):
        """Compute the pairwise intersection over union of two box sets.

        Args:
            box1: (tensor) bounding boxes [x1, y1, x2, y2], sized [N, 4].
            box2: (tensor) bounding boxes [x1, y1, x2, y2], sized [M, 4].

        Returns:
            (tensor) iou, sized [N, M].  Pairs whose union area is zero
            yield 0 instead of NaN.
        """
        # Broadcasting [N, 1, 2] against [1, M, 2] gives [N, M, 2].
        lt = torch.max(box1[:, None, :2], box2[None, :, :2])
        rb = torch.min(box1[:, None, 2:4], box2[None, :, 2:4])
        wh = (rb - lt).clamp(min=0)  # no overlap -> zero extent
        inter = wh[..., 0] * wh[..., 1]  # [N, M]

        area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])  # [N]
        area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])  # [M]
        union = area1[:, None] + area2[None, :] - inter  # [N, M]
        # Guard against 0 / 0 for degenerate (zero-area) box pairs.
        return inter / union.clamp(min=1e-10)

    @staticmethod
    def _xywh_to_xyxy(boxes, grid_num):
        """Convert [x, y, w, h, ...] rows to [x1, y1, x2, y2] corner rows.

        (x, y) is a cell-relative offset in cell units, so dividing by the
        grid size brings it to the same image-normalised scale as (w, h).
        The cell origin is omitted because only boxes of the same cell are
        ever compared with each other.
        """
        centre = boxes[:, :2] / grid_num
        half_wh = 0.5 * boxes[:, 2:4]
        return torch.cat([centre - half_wh, centre + half_wh], dim=1)

    def forward(self, pred_tensor, target_tensor):
        """Compute the loss for one batch.

        Args:
            pred_tensor: (tensor) sized [batch, S, S, 30], per cell
                ``[x, y, w, h, c] * 2`` followed by 20 class scores.
            target_tensor: (tensor) sized [batch, S, S, 30], same layout.

        Returns:
            Scalar tensor: weighted sum of all loss terms divided by the
            batch size.
        """
        N = pred_tensor.size(0)
        grid_num = float(target_tensor.size(1))

        # Cells that contain an object (target confidence > 0) and cells
        # that do not (target confidence == 0); both masks are [batch, S, S].
        obj_cells = target_tensor[..., 4] > 0
        noobj_cells = target_tensor[..., 4] == 0

        coo_pred = pred_tensor[obj_cells].view(-1, 30)
        box_pred = coo_pred[:, :10].contiguous().view(-1, 5)  # rows [x, y, w, h, c]
        class_pred = coo_pred[:, 10:]  # 20 class scores per cell

        coo_target = target_tensor[obj_cells].view(-1, 30)
        box_target = coo_target[:, :10].contiguous().view(-1, 5)
        class_target = coo_target[:, 10:]

        # Cells without an object: only the two confidence channels (4 and
        # 9) contribute to the loss.
        conf_channels = [4, 9]
        noo_pred_c = pred_tensor[noobj_cells].view(-1, 30)[:, conf_channels]
        noo_target_c = target_tensor[noobj_cells].view(-1, 30)[:, conf_channels]
        nooobj_loss = F.mse_loss(noo_pred_c, noo_target_c, reduction='sum')

        # Cells with an object: of the two predicted boxes, the one with the
        # higher IoU against the ground truth is "responsible".  Box rows
        # come in consecutive pairs, one pair per cell.
        num_boxes = box_target.size(0)
        response = torch.zeros(num_boxes, dtype=torch.bool, device=box_pred.device)
        box_target_iou = torch.zeros_like(box_pred)
        # The matching and the IoU used as confidence target are constants
        # with respect to the network, so no gradient is tracked here.
        with torch.no_grad():
            for i in range(0, num_boxes, 2):
                pred_xyxy = self._xywh_to_xyxy(box_pred[i:i + 2], grid_num)
                truth_xyxy = self._xywh_to_xyxy(box_target[i:i + 1], grid_num)
                iou = self.compute_iou(pred_xyxy, truth_xyxy)  # [2, 1]
                max_iou, max_index = iou.max(0)
                best = i + int(max_index)
                response[best] = True
                # The confidence target of the responsible box is its IoU
                # with the ground truth.
                box_target_iou[best, 4] = max_iou[0]
        not_response = ~response

        # 1. Responsible boxes: confidence and localisation.
        box_pred_response = box_pred[response]
        box_target_response = box_target[response]
        box_target_response_iou = box_target_iou[response]
        contain_loss = F.mse_loss(
            box_pred_response[:, 4], box_target_response_iou[:, 4], reduction='sum')
        # NOTE(review): sqrt assumes predicted w/h are non-negative (e.g. a
        # sigmoid on the network output) - confirm against the model.
        loc_loss = (
            F.mse_loss(box_pred_response[:, :2], box_target_response[:, :2],
                       reduction='sum')
            + F.mse_loss(torch.sqrt(box_pred_response[:, 2:4]),
                         torch.sqrt(box_target_response[:, 2:4]),
                         reduction='sum')
        )

        # 2. Non-responsible boxes in object cells: confidence pushed to 0.
        not_response_conf = box_pred[not_response][:, 4]
        not_contain_loss = F.mse_loss(
            not_response_conf, torch.zeros_like(not_response_conf), reduction='sum')

        # 3. Class loss: targets are one-hot vectors, compared with MSE.
        class_loss = F.mse_loss(class_pred, class_target, reduction='sum')

        return (self.l_coord * loc_loss + 2 * contain_loss + not_contain_loss
                + self.l_noobj * nooobj_loss + class_loss) / N
3. 网络结构
用到的网络结构是resnet
可以参考: ResNet 猫狗大战