YOLOV2 pytorch版本代码详解

最新推荐文章于 2024-03-10 23:08:42 发布

山居秋暝LS

最新推荐文章于 2024-03-10 23:08:42 发布

阅读量3.3k

点赞数 11

分类专栏：计算机视觉文章标签： pytorch 深度学习机器学习

本文链接：https://blog.csdn.net/qq_35732321/article/details/127511193

版权

计算机视觉专栏收录该内容

44 篇文章 3 订阅

订阅专栏

YOLOV2 Pytorch 版本

YOLOV2和YOLOV1的不同点：

backbone : darknet19,(26x26xchannels),(13x13xchannels)
loss : 在计算框的损失时，用的是预测偏移和真实偏移的mse。
anchor : 增加先验框提高预测精度，需要对数据进行编码和解码。

目录
1. 数据处理
    1.1 数据集分类
    1.2 数据转换为hdf5格式
    1.3 编码
2. 网络模型
    2.1 DarkNet19
    2.2 yolo_body+decoder
3. 损失函数
    3.1 正样本损失
    3.2 负样本损失
    3.3 类别损失
    3.4 框损失
4. 训    练
    4.1 载入数据
    4.2 载入模型
    4.3 损失函数
    4.4 更新参数
5. 预    测
    5.1 数据处理
    5.2 预测
    5.3 筛选
    5.4 画框

1. 数据处理

1.1 数据集分类

data_process/datasets_split_1.py

aim    ： 把数据集划分为训练集、测试集、验证集。每个数据集存放的是图片的名称。
input  :xml_path、base_path、trainval_radio、train_radio
output : base_path+trainval.txt、base_path+train.txt、base_path+val.txt、base_path+test.txt。
process：
        1. 根据xml_path里的文件获取总样本名称。
        2. 根据trainval_radio、train_radio获取各个数据集的样本数量。根据各个数据集的样本数量从总样本中抽取样本，获取样本的下标。
        3. 根据下标所在的数据集，把数据集的名称放在不同的数据集中。

import random,os

# Randomly split the annotated samples into trainval/train/val/test lists.
# Each output file holds one sample name (annotation file name without '.xml')
# per line.
xml_path = '../VOCdevkit/VOC2007/Annotations'      # one .xml annotation per sample
base_path = '../VOCdevkit/VOC2007/ImageSets/Main'  # directory the split lists are written to
trainval_radio = 0.9   # fraction of all samples assigned to train+val (the rest is test)
train_radio = 0.9      # fraction of train+val assigned to train (the rest is val)

img_names = os.listdir(xml_path)
names_list = [name[:-4] for name in img_names if name.endswith('.xml')]

N = len(names_list)                                   # total number of samples
trainval_num = int(N*trainval_radio)                  # size of train+val
train_num = int(trainval_num*train_radio)             # size of train
trainval_idx = random.sample(range(N),trainval_num)   # indices belonging to train+val
train_idx = random.sample(trainval_idx,train_num)     # indices belonging to train

# Sets make the membership tests in the loop below O(1) instead of O(N).
trainval_lookup = set(trainval_idx)
train_lookup = set(train_idx)

# The context manager closes all four files even if writing fails part-way.
with open(os.path.join(base_path,'trainval.txt'),'w') as ftrain_val, \
     open(os.path.join(base_path,'train.txt'),'w') as ftrain, \
     open(os.path.join(base_path,'val.txt'),'w') as fval, \
     open(os.path.join(base_path,'test.txt'),'w') as ftest:
    for i,sample in enumerate(names_list):
        name = sample + '\n'
        if i not in trainval_lookup:
            ftest.write(name)
            continue
        ftrain_val.write(name)
        if i in train_lookup:
            ftrain.write(name)
        else:
            fval.write(name)

1.2 数据转换为hdf5格式

data_process/data2hdf5_2.py

input  : 数据集
output : pascal_voc_07_12_LS.hdf5
process：
        1. 获取数据集的样本。train_set --> get_ids(voc_path,train_set) --> train_ids
        2. 生成voc_h5file，设置存储的图片数据类型和框的数据类型。划分每个数据集所属的group。voc_h5file存储'classes'。在每个group中设置train_images和train_boxes项目用来存储图片和框。
        3. train_ids ,train_images,train_boxes --> add_to_dataset();
        img_id --> get_img(voc_path,year,img_id);get_boxes(voc_path,year,img_id)  --> img_data;img_box

代码

import numpy as np
import os,h5py,argparse
import xml.etree.ElementTree as ElementTree

# (year, split) pairs naming the ImageSets/Main/<split>.txt lists to read.
sets_from_2007 = [('2007','train'),('2007','val')]
train_set = [('2007','train')]
val_set = [('2007','val')]
test_set = [('2007','test')]

# Class names; a box stores the position of its class in this list.
classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

# Command line: -p/--path_to_voc selects the VOCdevkit directory.
parser = argparse.ArgumentParser(description='Conver Pascal VOC 2007 detection dataset to HDF5')
parser.add_argument('-p','--path_to_voc',help='path to VOCdevkit directory',
                    default='../VOCdevkit')

def get_ids(voc_path,datasets):
    '''Collect the sample ids listed in the requested split files.

    voc_path: VOCdevkit directory.
    datasets: iterable of (year, split) pairs; each pair names the file
              VOC<year>/ImageSets/Main/<split>.txt under voc_path.
    Returns a list with the whitespace-separated ids of all files, in order.
    '''
    collected = []
    for year,split_name in datasets:
        id_path = os.path.join(voc_path,'VOC%s/ImageSets/Main/%s.txt'%(year,split_name))
        print(id_path)
        with open(id_path,'r') as handle:
            content = handle.read()
        collected += content.strip().split()
    return collected

def get_img(voc_path,year,img_id):
    '''Return the raw (still encoded) bytes of one JPEG as a 1-D uint8 array.

    The file read is VOC<year>/JPEGImages/<img_id>.jpg under voc_path; the
    image is not decoded here.
    '''
    relative = 'VOC%s/JPEGImages/%s.jpg'%(year,img_id)
    with open(os.path.join(voc_path,relative),'rb') as handle:
        raw = handle.read()
    encoded = np.frombuffer(raw,dtype='uint8')  # shape [n_bytes,]
    return encoded

def get_boxes(voc_path,year,img_id):
    '''Read the usable boxes of one sample from its VOC annotation file.

    The file parsed is VOC<year>/Annotations/<img_id>.xml under voc_path.
    Objects whose class is not in `classes` or that are flagged difficult
    are skipped.

    Returns a flat 1-D integer array [xmin, ymin, xmax, ymax, class_index, ...]
    with five values per kept object (empty when nothing is kept).
    '''
    boxes_path = os.path.join(voc_path,'VOC%s/Annotations/%s.xml'%(year,img_id))
    # Let ElementTree open the file itself so the encoding declared in the
    # XML prolog is honoured instead of the platform default text encoding.
    root = ElementTree.parse(boxes_path).getroot()
    boxes = []
    for obj in root.iter('object'):
        cls = obj.find('name').text
        # A missing <difficult> tag is treated as "not difficult".
        difficult = obj.findtext('difficult',default='0')
        if cls not in classes or int(difficult) == 1:
            continue
        xml_box = obj.find('bndbox')
        # int(float(...)) also accepts coordinates written like '123.0'.
        boxes.extend(int(float(xml_box.find(tag).text))
                     for tag in ('xmin','ymin','xmax','ymax'))
        boxes.append(classes.index(cls))
    # Explicit dtype keeps the empty result integer as well.
    return np.array(boxes,dtype=int)  # [5*n,]

def add_to_dataset(voc_path,year,ids,images,boxes,start = 0):
    '''Load every sample in `ids` and store it in the two containers.

    Sample number k (0-based position in `ids`) is written to
    images[start+k] and boxes[start+k].

    Returns the position of the last processed id within `ids`, or -1 when
    `ids` is empty, so that `start + result + 1` is always the next free slot.
    '''
    last = -1  # stays -1 for empty input instead of raising UnboundLocalError
    for last,img_id in enumerate(ids):
        images[start+last] = get_img(voc_path,year,img_id)
        boxes[start+last] = get_boxes(voc_path,year,img_id)
    return last

def _main(args):
    '''Build <voc_path>/pascal_voc_07_12_LS.hdf5 from the VOC2007 split lists.

    The file gets three groups ('train', 'val', 'test'), each holding a
    variable-length uint8 dataset 'images' (encoded JPEG bytes) and a
    variable-length int dataset 'boxes' (flat x1,y1,x2,y2,class values).
    '''
    voc_path = os.path.expanduser(args.path_to_voc)
    # 1 collect the sample ids of every split
    train_ids = get_ids(voc_path,train_set)
    val_ids = get_ids(voc_path,val_set)
    test_ids = get_ids(voc_path,test_set)
    train_ids_2007 = get_ids(voc_path,sets_from_2007)
    # NOTE(review): sets_from_2007 is train+val, so the 'train' group stores
    # the train samples twice plus all val samples. Left unchanged here;
    # confirm whether this is intended.
    total_train_ids = len(train_ids)+len(train_ids_2007)

    # 2 create the file, the element dtypes and the groups
    print('Creating HDF5 dataset structure.')
    fname = os.path.join(voc_path,'pascal_voc_07_12_LS.hdf5')
    # The context manager closes the file even if reading a sample fails.
    with h5py.File(fname,'w') as voc_h5file:
        uint8_dt = h5py.special_dtype(vlen = np.dtype('uint8')) # variable length uint8
        int_dt = h5py.special_dtype(vlen = np.dtype(int))
        train_group = voc_h5file.create_group('train')
        val_group = voc_h5file.create_group('val')
        test_group = voc_h5file.create_group('test')
        # Class names as one comma-separated attribute. np.bytes_ is the
        # type np.string_ used to alias; np.string_ no longer exists in NumPy 2.
        voc_h5file.attrs['classes'] = np.bytes_(','.join(classes))
        # 3 one slot per sample in every dataset
        train_images = train_group.create_dataset('images',shape=(total_train_ids,),dtype=uint8_dt)
        val_images = val_group.create_dataset('images',shape=(len(val_ids),),dtype=uint8_dt)
        test_images = test_group.create_dataset('images',shape=(len(test_ids),),dtype=uint8_dt)

        train_boxes = train_group.create_dataset('boxes',shape=(total_train_ids,),dtype=int_dt)
        val_boxes = val_group.create_dataset('boxes',shape=(len(val_ids),),dtype=int_dt)
        test_boxes = test_group.create_dataset('boxes',shape=(len(test_ids),),dtype=int_dt)
        # 4 fill the datasets (every split comes from VOC2007)
        print('Process Pascal VOC 2007 datasets for training set')
        last_2007 = add_to_dataset(voc_path,'2007',train_ids_2007,train_images,train_boxes)
        print('Processing Pascal VOC 2007 training set.')
        add_to_dataset(voc_path,'2007',train_ids,train_images,train_boxes,start=last_2007+1)
        print('Processing Pascal VOC 2007 val set.')
        add_to_dataset(voc_path, '2007', val_ids, val_images, val_boxes)
        print('Processing Pascal VOC 2007 test set.')
        add_to_dataset(voc_path, '2007', test_ids, test_images, test_boxes)
        print('Closing HDF5 file.')
    print('Done.')

if __name__ == '__main__':
    # Script entry point: parse the command line and build the HDF5 file.
    _main(parser.parse_args())
    # Disabled manual checks of the individual helpers, kept for reference:
    # voc_path = parser.parse_args().path_to_voc
    # datasets = [('2007','train')]
    # ids = get_ids(voc_path,datasets)
    # # print(ids)
    # img = get_img(voc_path,year='2007',img_id='000025')
    # box = get_boxes(voc_path,year='2007',img_id='000025')
    # print(box.reshape(-1,5))

1.3 编码

data_process/data_encoder_3.py

input  : data_path,anchors_path,idx
output : processed_images[n,3,416,416]，out[n,13,13,5,4+1+5]
process：
        1.读取图片、框、类别数据。processed_images,processed_boxes = self.process_data(idx)
        2.对框编码，得到真实偏移和cls。out = self.encoder(processed_boxes)

代码

import numpy as np
import io,os,PIL,h5py,argparse
from PIL import Image
import torch
import torch.utils.data as data

# Default anchor (width, height) pairs, returned when the anchors file cannot
# be found. Presumably expressed in grid-cell units: the encoder compares
# anchors with boxes scaled by the grid size -- TODO confirm.
YOLO_ANCHORS = np.array(
    ((0.57273, 0.677385), (1.87446, 2.06253), (3.33843, 5.47434),
     (7.88282, 3.52778), (9.77052, 9.16828)))

def get_classes(classes_path):
    '''Return the whitespace-separated class names stored in classes_path.'''
    with open(classes_path) as handle:
        text = handle.read()
    return text.strip().split()

def get_anchors(anchors_path):
    '''Load anchor (width, height) pairs from a text file.

    The values may be separated by whitespace and/or commas. Returns an
    array of shape [num_anchors, 2]. When anchors_path is not an existing
    file a warning is emitted and YOLO_ANCHORS is returned.
    '''
    if os.path.isfile(anchors_path):
        with open(anchors_path) as f:
            # Treat commas as separators too, so "1.08,1.19, 3.42,4.41" parses.
            tokens = f.read().replace(',',' ').split()
        return np.array([float(t) for t in tokens]).reshape(-1, 2)
    # Instantiating Warning(...) alone does nothing; warnings.warn reports it.
    import warnings
    warnings.warn('Could not open anchors file, using default.')
    return YOLO_ANCHORS

class yoloDataset(data.Dataset):
    '''Training dataset backed by the 'train' group of the HDF5 file.

    Every item is a pair (image, target):
      image  -- float tensor [3, 416, 416] with values in 0..1
      target -- float tensor [13, 13, num_anchors, 4+1+5] holding, per cell
                and anchor, the normalised true box (cx,cy,w,h), the
                detector mask and the regression target
                (dx, dy, log(w/anchor_w), log(h/anchor_h), class).
    '''
    image_size = [416,416]

    def __init__(self,data_path,anchors_path):
        self.anchors = self.get_anchors(anchors_path)
        # [:] copies both datasets into memory, so the file can be closed.
        with h5py.File(data_path, 'r') as h5_file:
            self.images = h5_file['train/images'][:]
            self.boxes = h5_file['train/boxes'][:]
        # Largest number of boxes in one image; used as padding length.
        self.max_num = 0
        self.num_samples = len(self.boxes)
        self.flag = self.boxes is not None
        if self.flag:
            for i in range(self.num_samples):
                self.boxes[i] = self.boxes[i].reshape(-1,5)
                self.max_num = max(self.max_num,self.boxes[i].shape[0])

    def __len__(self):
        return self.num_samples

    def __getitem__(self,idx):
        processed_images,processed_boxes = self.process_data(idx)
        out = self.encoder(processed_boxes)
        return torch.tensor(processed_images), torch.tensor(out)

    def get_anchors(self,anchors_path):
        '''Load [num_anchors, 2] anchors; fall back to YOLO_ANCHORS with a warning.'''
        if os.path.isfile(anchors_path):
            with open(anchors_path) as f:
                # Accept comma- and/or whitespace-separated values.
                tokens = f.read().replace(',',' ').split()
            return np.array([float(t) for t in tokens]).reshape(-1, 2)
        # Instantiating Warning(...) alone does nothing; warnings.warn reports it.
        import warnings
        warnings.warn('Could not open anchors file, using default.')
        return YOLO_ANCHORS

    def process_data(self,idx):
        '''Decode sample idx.

        Returns (img, boxes):
          img   -- float array [3, H, W] in 0..1 resized to image_size
          boxes -- float32 array [max_num, 5] of (cx, cy, w, h, cls) relative
                   to the original image size, zero-padded at the end
                   (None when the dataset has no boxes).
        '''
        images = self.images[idx]
        boxes = self.boxes[idx]
        img = Image.open(io.BytesIO(images))
        img_shape = np.array(img.size)           # (width, height) of the original image
        # convert('RGB') guarantees three channels for the transpose below.
        img = img.convert('RGB').resize(tuple(self.image_size), PIL.Image.BICUBIC)
        # The np.float alias no longer exists in NumPy; float64 is what it meant.
        img = np.array(img,dtype=np.float64)/255.
        img = np.transpose(img,(2,0,1))          # HWC -> CHW

        if self.flag:
            box = np.concatenate([(boxes[:,2:4] + boxes[:,:2])*0.5/img_shape,
                                  (boxes[:,2:4] - boxes[:,:2])/img_shape,
                                  boxes[:,4:5]],1)
            new_box = np.zeros((self.max_num,5),dtype=np.float32)
            new_box[:len(box),:] = box           # rows: (cx,cy,w,h,cls)
            return np.array(img),np.array(new_box)
        else:
            return np.array(img),None

    def encoder(self,boxes):
        '''Encode the boxes of one image onto the feature-map grid.

        boxes: [n, 5] rows of (cx, cy, w, h, cls) normalised to 0..1; rows
               with zero width or height are padding and are ignored.
        Returns a float32 array [grid_h, grid_w, num_anchors, 4+1+5]:
          [..., 0:4] true box (cx,cy,w,h), normalised to 0..1
          [..., 4:5] detector mask (1 where an anchor is responsible)
          [..., 5:]  (dx, dy, log(w/anchor_w), log(h/anchor_h), cls)
        The two grid axes are indexed as [x_cell, y_cell].
        '''
        # 1 empty templates
        h,w = self.image_size
        num_anchors = len(self.anchors)
        num_box_params = boxes.shape[1]
        assert h % 32 == 0,'Image sizes in YOLO_v2 must be multiples of 32.'
        assert w % 32 == 0, 'Image sizes in YOLO_v2 must be multiples of 32.'
        grid_h = h//32  # 13
        grid_w = w//32
        true_boxes = np.zeros([grid_h,grid_w,num_anchors,4],dtype=np.float32)
        detectors_mask = np.zeros([grid_h,grid_w,num_anchors,1],dtype=np.float32)
        matching_true_boxes = np.zeros([grid_h,grid_w,num_anchors,num_box_params],dtype=np.float32)

        # 2 encode the real boxes only: padded rows would otherwise mark cell
        #   (0,0) as responsible and store log(0) = -inf as its target.
        boxes = boxes[(boxes[:,2] > 0) & (boxes[:,3] > 0)]
        if len(boxes) > 0:
            box_class = boxes[:,4]
            box = boxes[:,:4]*np.array([grid_w,grid_h,grid_w,grid_h])  # grid units
            # Cell of each box centre; clip so a centre on the far border stays in range.
            i = np.clip(box[:,0].astype(int),0,grid_w-1)
            j = np.clip(box[:,1].astype(int),0,grid_h-1)
            # Anchors are matched on width/height, not on the centre.
            best_idx = np.asarray(self.iou_wh(box[:,2:4],self.anchors),dtype=int)
            # The input boxes are already normalised; store them unchanged.
            true_boxes[i, j, best_idx] = boxes[:,:4]
            detectors_mask[i,j,best_idx] = 1
            adjusted_box = np.array(
                [
                    box[:,0] - i, box[:,1] - j,
                    np.log(box[:,2] / self.anchors[best_idx][:,0]),
                    np.log(box[:,3] / self.anchors[best_idx][:,1]), box_class
                ],
                dtype=np.float32).T
            matching_true_boxes[i, j, best_idx] = adjusted_box
        out = np.concatenate([true_boxes,detectors_mask,matching_true_boxes],-1)
        return out

    def iou_wh(self,boxes_wh,anchors_wh):
        '''Best-matching anchor per box, comparing shapes only.

        boxes_wh [n,2] and anchors_wh [m,2] are treated as boxes centred on
        the origin. Returns a list of n anchor indices; a box whose best IoU
        is 0 gets index 0.
        '''
        boxes_wh=np.expand_dims(boxes_wh,1)      # [n,1,2]
        anchors_wh=np.expand_dims(anchors_wh,0)  # [1,m,2]
        box_max = boxes_wh/2.
        box_min = -box_max
        anchor_max = anchors_wh/2.
        anchor_min = -anchor_max

        inter_mins = np.maximum(box_min,anchor_min)      # [n,m,2]
        inter_maxs = np.minimum(box_max,anchor_max)
        inter_wh = np.maximum(inter_maxs-inter_mins,0.)
        inter_area = inter_wh[...,0] * inter_wh [...,1]  # [n,m]
        boxes_area = boxes_wh[...,0] * boxes_wh[...,1]
        anchors_area = anchors_wh[...,0]*anchors_wh[...,1]  # [1,m]
        iou = inter_area/(boxes_area+anchors_area-inter_area)  # [n,m]
        best_iou = np.max(iou,1)
        best_idx = np.argmax(iou,1)
        return list(best_idx*(best_iou > 0))

if __name__ == '__main__':
    # Smoke test: iterate the dataset once and print the batch shapes.
    from torch.utils.data import DataLoader
    data_path = '../VOCdevkit/pascal_voc_07_12_LS.hdf5'
    # The original value '../model_data.pascal_classes.txt' was a malformed
    # path pointing at the classes file; the anchors file is what is loaded here.
    anchors_path = '../model_data/anchors.txt'
    train_dataset = yoloDataset(data_path,anchors_path)  # items: [3, 416, 416],[13, 13, 5, 10]
    train_loader = DataLoader(train_dataset,batch_size=1,shuffle=True,num_workers=0)
    for i,(img,boxes) in enumerate(train_loader):
        print(img.shape)     # torch.Size([1, 3, 416, 416])
        print(boxes.shape)   # torch.Size([1, 13, 13, 5, 10]) 4+1+5

2. 网络模型

2.1 DarkNet19

nets/darketnet19.py

input  : img[b,3,416,416]
output : feas[b,1024,13,13]
process：
    1.features_26 = (cov_bn_leaky3  --> maxpool)*2 --> 
      (bottleneck_block*2  --> maxpool)*2 --> 
      bottleneck_x2_block  --> maxpool --> 
      bottleneck_x2_block
    2.features_13 = features_26  --> maxpool --> bbx22

代码

import torch
import torch.nn as nn
import math

def cov_bn_leaky3(inplanes,outplanes):
    '''3x3 convolution (padding 1) followed by batch norm and LeakyReLU(0.1).'''
    conv = nn.Conv2d(inplanes,outplanes,kernel_size=3,padding=1)
    norm = nn.BatchNorm2d(outplanes)
    activation = nn.LeakyReLU(0.1)
    return nn.Sequential(conv,norm,activation)
def cov_bn_leaky1(inplanes,outplanes):
    '''1x1 convolution followed by batch norm and LeakyReLU(0.1).'''
    conv = nn.Conv2d(inplanes,outplanes,kernel_size=1)
    norm = nn.BatchNorm2d(outplanes)
    activation = nn.LeakyReLU(0.1)
    return nn.Sequential(conv,norm,activation)

def bottleneck_block(inplanes,outplanes,bottleneck_filters):
    '''3x3 conv to outplanes, 1x1 squeeze to bottleneck_filters, 3x3 back to outplanes.'''
    expand_in = cov_bn_leaky3(inplanes,outplanes)
    squeeze = cov_bn_leaky1(outplanes,bottleneck_filters)
    expand_out = cov_bn_leaky3(bottleneck_filters,outplanes)
    return nn.Sequential(expand_in,squeeze,expand_out)

def bottleneck_x2_block(inplanes,outplanes,bottleneck_filters):
    '''A bottleneck_block followed by one more 1x1 squeeze / 3x3 expand pair.'''
    first = bottleneck_block(inplanes,outplanes,bottleneck_filters)
    squeeze = cov_bn_leaky1(outplanes,bottleneck_filters)
    expand = cov_bn_leaky3(bottleneck_filters,outplanes)
    return nn.Sequential(first,squeeze,expand)

class darknet_body(nn.Module):
    '''Darknet-19 feature extractor.

    features_26 applies four conv stages, each followed by a 2x2 max-pool,
    and then a fifth stage (bbx21, 512 output channels) without pooling;
    features_13 adds one more max-pool and the 1024-channel stage on top.
    '''
    def __init__(self,):
        super(darknet_body, self).__init__()
        self.cbl1 = cov_bn_leaky3(3,32)
        self.cbl2 = cov_bn_leaky3(32,64)
        self.bb1 =  bottleneck_block(64,128, 64)
        self.bb2 =  bottleneck_block(128,256, 128)
        self.bbx21 =  bottleneck_x2_block(256,512, 256)
        self.bbx22 =  bottleneck_x2_block(512,1024, 512)
        self.maxpool = nn.MaxPool2d(kernel_size=2,stride=2)
        # The same module objects are reused inside the two Sequentials.
        stage_26 = [self.cbl1,self.maxpool,
                    self.cbl2,self.maxpool,
                    self.bb1,self.maxpool,
                    self.bb2,self.maxpool,
                    self.bbx21]
        self.features_26 = nn.Sequential(*stage_26)
        self.features_13 = nn.Sequential(self.features_26 ,self.maxpool, self.bbx22)
        self._init_weights()

    def _init_weights(self):
        '''Normal(0, sqrt(2/n)) for conv weights, identity scale/shift for batch norm.'''
        for layer in self.modules():
            if isinstance(layer,nn.BatchNorm2d):
                layer.weight.data.fill_(1)
                layer.bias.data.zero_()
            elif isinstance(layer,nn.Conv2d):
                kernel_h,kernel_w = layer.kernel_size
                n = kernel_h*kernel_w*layer.out_channels
                layer.weight.data.normal_(0,math.sqrt(2./n))

    def forward(self,x):
        '''Run the full stack (features_13); for a [1,3,416,416] input the
        result is [1,1024,13,13].'''
        return self.features_13(x)

def darknet19(inputs):
    """Run a Darknet-19 classifier with 1000 outputs on `inputs`.

    A new darknet_body and a new 1x1 classification convolution are created
    on every call; the result is softmax-normalised over dimension 1.
    """
    features = darknet_body()(inputs)
    classifier = nn.Conv2d(1024,1000, (1, 1))
    probabilities = nn.Softmax(1)(classifier(features))
    return probabilities

if __name__ == '__main__':
    # Build the backbone and print two of its sub-modules for inspection.
    x = torch.randn([1,3,416,416])
    backbone = darknet_body()
    print('y.features_26 :',backbone.features_26)
    print('\n')
    print('y.bbx22 :',backbone.bbx22)

2.2 yolo_body+decoder

nets/yolo_model.py

（1）yolo_body
input  :[1,3,416,416] 
output :[1, 13, 13, 125]
process：
1.fea_26，fea_13
2.torch.cat([fea_26,fea_13],1) --> 
  cov_bn_leaky3,cov_bn_leaky1  --> 
  transpose

（2）yolo_decoder
inputs:
        feats: tensor, [None,13,13,125]（即 yolo_body 转置后的输出，通道在最后一维）,
        anchors: array-like,Anchor box widths and heights. （5，2）
        num_classes: int, Number of target classes. 20
outputs:
        box_xy[1, 13, 13, 5, 2] 
        box_wh[1, 13, 13, 5, 2]
        box_conf[1, 13, 13, 5, 1]
        box_class_pred[1, 13, 13, 5, 20]
process：
        根据公式，是编码过程的逆过程。

代码

import sys
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
from nets.darketnet19 import cov_bn_leaky1,cov_bn_leaky3,darknet_body

# Adds the parent directory to the module search path. It runs after the
# `from nets.darketnet19 import ...` line above, so it cannot help that import.
sys.path.append('..')
# Anchor (width, height) pairs passed to yolo_decoder in the examples below.
voc_anchors = np.array( [[1.08, 1.19], [3.42, 4.41],
                         [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]])
# The 20 class names.
voc_classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

def grid(h,w):
    '''Return the [h*w, 2] integer index pairs (a, b) with a in 0..h-1 and
    b in 0..w-1, ordered with b changing fastest.'''
    first = torch.arange(h).repeat_interleave(w)   # 0,0,...,1,1,...
    second = torch.arange(w).repeat(h)             # 0,1,...,0,1,...
    return torch.stack([first,second],1)

class yolo_body(nn.Module):
    '''YOLOv2 network: Darknet-19 backbone, pass-through branch and head.

    forward maps images [b,3,416,416] to [b,13,13,num_anchors*(num_classes+5)].
    '''
    def __init__(self,num_anchors=5,num_classes=20):
        super(yolo_body, self).__init__()
        self.num_anchors = num_anchors
        self.num_classes =  num_classes
        self.darknet = darknet_body()
        self.fea_13 = nn.Sequential(self.darknet.features_13,cov_bn_leaky3(1024,1024),
                                    cov_bn_leaky3(1024,1024))
        self.fea_26 = nn.Sequential(self.darknet.features_26,cov_bn_leaky1(512,64))
        # The head layers must be attributes: built inside forward() they were
        # re-created with random weights on every call and never reached the
        # optimizer. 1280 = 4*64 pass-through channels + 1024 deep channels.
        self.merge = cov_bn_leaky3(1280,1024)
        self.head = cov_bn_leaky1(1024,num_anchors*(num_classes+5))

    def pass_through(self,x):
        '''Space-to-depth: move every 2x2 spatial block into the channel axis
        ([b,c,h,w] -> [b,4c,h/2,w/2]).'''
        return torch.cat([x[:,:,::2,::2],x[:,:,::2,1::2],x[:,:,1::2,::2],x[:,:,1::2,1::2]],1)

    def forward(self,x):
        # NOTE(review): fea_13 and fea_26 both start with darknet.features_26,
        # so that part of the backbone is evaluated twice per call.
        fea_13 = self.fea_13(x)
        fea_26 = self.pass_through(self.fea_26(x))
        out = torch.cat([fea_26,fea_13],1)
        out = self.merge(out)
        out = self.head(out)
        # Swap dims 1 and 3 so the channel axis comes last.
        out = torch.transpose(out,1,3)
        return out  # inputs:[1,3,416,416] --> outputs:[1, 13, 13, 125]

'''
def yolo_body(inputs,num_anchors=5,num_classes=20):
    darknet = darknet_body()
    features_26 = darknet.features_26
    features_13 = darknet.features_13
    fea_13 = nn.Sequential(features_13,cov_bn_leaky3(1024,1024),
                       cov_bn_leaky3(1024,1024))(inputs)
    fea_26 = nn.Sequential(features_26,cov_bn_leaky1(512,64))(inputs)
    fea_26 = pass_through(fea_26)
    out = torch.cat([fea_26,fea_13],1)
    out = cov_bn_leaky3(1280,1024)(out)
    out = cov_bn_leaky1(1024,num_anchors*(num_classes+5))(out)
    out = torch.transpose(out,1,3)
    print('out.shape:',out.shape)
    return out  # inputs:[1,3,416,416] --> outputs:[1, 13, 13, 125]

'''

def yolo_decoder(feats,anchors,num_classes):
    ''' Convert final layer features to bounding box parameters.
    inputs:
        feats: tensor [b, g0, g1, num_anchors*(num_classes+5)], i.e. the
               channel-last output of yolo_body (e.g. [b,13,13,125]).
        anchors: array-like [num_anchors, 2], anchor widths and heights.
        num_classes: int, number of target classes.
    outputs (all in the dtype and on the device of feats):
        box_xy          [b,g0,g1,num_anchors,2]  centres, divided by the grid size
        box_wh          [b,g0,g1,num_anchors,2]  sizes, divided by the grid size
        box_confidence  [b,g0,g1,num_anchors,1]  sigmoid objectness
        box_class_probs [b,g0,g1,num_anchors,num_classes] softmax class scores
    '''
    grids = feats.shape[1:3]     #  torch.Size([13, 13])
    num_anchors = len(anchors)   # 5
    # Create the helpers in feats' dtype/device: float64 numpy anchors would
    # otherwise promote box_wh to float64, and CPU helpers break on GPU input.
    like = dict(dtype=feats.dtype,device=feats.device)
    anchors_wh = torch.as_tensor(anchors,**like).view(1,1,1,num_anchors,2)       # [1, 1, 1, 5, 2]
    anchors_cxy = grid(grids[0],grids[1]).view(-1,grids[0],grids[1],1,2).to(**like)  # [1, 13, 13, 1, 2]
    grid_size = torch.tensor(list(grids),**like)

    # reshape (not view) also accepts the non-contiguous transposed network output.
    feats = feats.reshape(-1,grids[0],grids[1],num_anchors,num_classes+5) # [1, 13, 13, 125]-->[1, 13, 13, 5, 25]
    box_xy = torch.sigmoid(feats[..., :2])  # offset inside the cell
    box_wh = torch.exp(feats[..., 2:4])     # scale relative to the anchor
    box_confidence = torch.sigmoid(feats[..., 4:5])
    box_class_probs = torch.softmax(feats[..., 5:],-1)

    box_xy = (box_xy + anchors_cxy) / grid_size
    box_wh = box_wh * anchors_wh / grid_size
    return box_xy, box_wh, box_confidence, box_class_probs


if __name__ == '__main__':
    # Smoke test: one forward pass on random input, printing the output shape.
    x = torch.randn([1,3,416,416])
    net = yolo_body()
    params = list()
    params_dict = {name: value for name,value in net.named_parameters()}
    prediction = net(x)
    print(prediction.shape)  # torch.Size([1, 13, 13, 125])


# x = yolo_body(inputs=x,num_anchors=5,num_classes=20)
    # box_xy, box_wh, box_confidence, box_class_probs = yolo_encoder(feats=x,anchors=voc_anchors,num_classes=20)

3. 损失函数

loss.py

input  : pred(b, 13, 13, 125),target(b, 13, 13, 5, 10)，其中 target = true_boxes(4) + detectors_mask(1) + matching_true_boxes(5)
output : total_loss
process：
    1.数据准备 
       target --> true_boxes, detectors_mask, matching_true_boxes
       pred --> sigmoid --> pred_d_boxes
       pred --> yolo_decoder() --> pred_xy, pred_wh, pred_confidence, pred_class_prob
    2.正样本损失 best_iou/1 - pred_confidence,detectors_mask --> objects_loss
    3.负样本损失 (pred_xy, pred_wh),true_boxes --> iou --> object_detections;
                object_detections,detectors_mask,pred_confidence --> no_objects_loss
    4.类别损失  matching_true_boxes[...,-1],pred_class_prob,detectors_mask --> classification_loss
    5.框损失    matching_true_boxes[...,:4],pred_d_boxes,detectors_mask --> coordinates_loss

代码

import torch
import numpy as np
import torch.nn as nn
from nets.yolo_model import yolo_decoder

# Module-level default anchor (width, height) pairs.
voc_anchors = np.array( [[1.08, 1.19], [3.42, 4.41],
                         [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]])
# The string below is an inert note listing the tensor shapes involved.
'''
model_body.output (b, 13, 13, 125)
detectors_mask_input (b, 13, 13, 5, 1)
matching_boxes_input (b, 13, 13, 5, 5)
'''
class yoloLoss(nn.Module):
    '''YOLOv2 loss: confidence + classification + box-offset terms.

    object_scale / no_object_scale / class_scale / coordinates_scale weight
    the individual terms; anchors is the [num_anchors, 2] array used to
    decode predictions; with rescore_confidence=True the confidence target of
    a responsible predictor is its best IoU instead of 1.
    '''
    def __init__(self,object_scale,no_object_scale,class_scale,
                 coordinates_scale,anchors,num_classes,
                 rescore_confidence=False,print_loss=False):   # criterion = yoloLoss(7,2,5,0.5)
        super(yoloLoss, self).__init__()
        self.object_scale = object_scale
        self.no_object_scale = no_object_scale
        self.class_scale = class_scale
        self.coordinates_scale = coordinates_scale
        self.rescore_confidence = rescore_confidence
        self.print_loss = print_loss
        self.anchors = anchors
        self.num_classes = num_classes

    def compute_iou(self,box_t,box_p):
        '''IoU of corner-format boxes (last axis = 4 corner values).

        The two inputs are broadcast against each other; the result has the
        broadcast shape without the last axis. Pairs with zero union give 0.
        '''
        lt = torch.maximum(box_t[...,:2],box_p[...,:2])
        rd = torch.minimum(box_t[...,2:],box_p[...,2:])
        wh = (rd - lt).clamp(min=0)  # out-of-place, safe for autograd
        inter = wh[...,0]*wh[...,1]
        area_t = (box_t[...,3]-box_t[...,1])*(box_t[...,2]-box_t[...,0])
        area_p = (box_p[...,3]-box_p[...,1])*(box_p[...,2]-box_p[...,0])
        union = area_t+area_p-inter
        return inter/union.clamp(min=1e-9)

    def forward(self,pred,target):
        '''nn.Module entry point; identical to yolo_loss.'''
        return self.yolo_loss(pred,target)

    def yolo_loss(self,pred,target):
        '''Return the scalar loss.

        pred:   raw network output [b, 13, 13, num_anchors*(num_classes+5)]
        target: [b, 13, 13, num_anchors, 10] = true box (4) + detector
                mask (1) + matching offsets and class (5)
        '''
        # 1 unpack
        num_anchors = len(self.anchors)

        yolo_output = pred                    # [b, 13, 13, 125  ]
        true_boxes = target[...,:4]           # [b, 13, 13,  5, 4]
        detectors_mask = target[...,4:5]      # [b, 13, 13,  5, 1]
        matching_true_boxes = target[...,5:]  # [b, 13, 13,  5, 5]
        # Decode with the anchors given to the constructor (the module-level
        # voc_anchors was used before, ignoring the argument).
        pred_xy, pred_wh, pred_confidence, pred_class_prob = yolo_decoder(
            yolo_output,anchors=np.asarray(self.anchors),num_classes=self.num_classes)

        # Raw predicted offsets in the same parametrisation as the target.
        grid_shape = yolo_output.shape[1:3]
        feats = yolo_output.reshape(-1,grid_shape[0],grid_shape[1],
                                    num_anchors,self.num_classes+5)  # [b, 13, 13, 5, 25]
        pred_d_boxes = torch.cat((torch.sigmoid(feats[...,0:2]),feats[...,2:4]),dim=-1) # [b, 13, 13, 5, 4]

        # 2 IoU of every predicted box with every true box of the same image
        batch = true_boxes.shape[0]
        true_box = torch.cat([(true_boxes[...,:2]-true_boxes[...,2:4]/2.),
                              (true_boxes[...,:2]+true_boxes[...,2:4]/2.)],-1)
        true_box = true_box.reshape(batch,1,1,1,-1,4)        # [b, 1, 1, 1, n, 4]
        true_valid = detectors_mask.reshape(batch,1,1,1,-1)  # [b, 1, 1, 1, n], 0 for empty slots
        pred_box = torch.cat([(pred_xy-pred_wh/2.),(pred_xy+pred_wh/2.)],-1).unsqueeze(4)  # [b, 13, 13, 5, 1, 4]
        if not self.rescore_confidence:
            # best_iou only feeds a threshold here, so no gradient is needed.
            pred_box = pred_box.detach()
        iou = self.compute_iou(true_box,pred_box)*true_valid  # [b, 13, 13, 5, n]
        best_iou = iou.max(-1)[0].unsqueeze(-1)               # [b, 13, 13, 5, 1]
        # Cast to float: `1 - bool_tensor` is not supported by PyTorch.
        object_detections = (best_iou > 0.6).to(pred_confidence.dtype)

        # 3 loss terms
        # 3.1 no-object: predictors that are neither responsible nor overlapping a true box
        no_objects_loss = (self.no_object_scale * (1-object_detections)
                           * (1-detectors_mask) * torch.square(-pred_confidence))

        # 3.2 object: responsible predictors
        if self.rescore_confidence:
            objects_loss = self.object_scale * detectors_mask * torch.square(best_iou - pred_confidence)
        else:
            objects_loss = self.object_scale * detectors_mask * torch.square(1 - pred_confidence)
        confidence_loss = (objects_loss + no_objects_loss).sum()

        # 3.3 classification; indices must be integers for the one-hot lookup
        matching_classes = matching_true_boxes[...,4].long()   # [b, 13, 13, 5]
        matching_classes = nn.functional.one_hot(
            matching_classes,self.num_classes).to(pred_class_prob.dtype)  # [b, 13, 13, 5, 20]
        classification_loss = (self.class_scale * detectors_mask
                               * torch.square(matching_classes - pred_class_prob)).sum()

        # 3.4 box offsets
        matching_boxes = matching_true_boxes[...,0:4]
        coordinates_loss = (self.coordinates_scale * detectors_mask
                            * torch.square(matching_boxes-pred_d_boxes)).sum()
        total_loss = 0.5 * (confidence_loss + classification_loss + coordinates_loss)
        return total_loss

if __name__ == '__main__':
    # Placeholder entry point: the module has no stand-alone demo.

    print('PyCharm')

4. 训练

train.py

process：
        1.载入数据
        2.载入模型
        3.损失函数
        4.更新参数

代码

import os
import torch,h5py
import numpy as np
from loss import yoloLoss
from torch.autograd import Variable
from nets.yolo_model import yolo_body
from torch.utils.data import DataLoader
from data_process.data_encoder_3 import get_classes,get_anchors,yoloDataset

# 1 parameters
use_gpu = False
learning_rate = 0.001
num_epochs = 1
batch_size = 1

# 2 model
net = yolo_body()
# One parameter group per tensor, all with the same learning rate (the
# original if/else on the parameter name had two identical branches).
params = [{'params':[v],'lr':learning_rate} for v in net.parameters()]

# 3 loss + optimizer
anchors_path = 'model_data/anchors.txt'
classes_path = 'model_data/pascal_classes.txt'
anchors = get_anchors(anchors_path)
classes = get_classes(classes_path)
num_classes = len(classes)
cost = yoloLoss(5,1,1,1,anchors,num_classes)
optimizer = torch.optim.SGD(params,lr=learning_rate,momentum=0.9,weight_decay=5e-4)

# 4 data (yoloDataset opens the HDF5 file itself; no extra handle is needed)
data_path = 'VOCdevkit/pascal_voc_07_12_LS.hdf5'
train_dataset = yoloDataset(data_path,anchors_path)   # items: (3, 416, 416), (13, 13, 5, 10)
train_loader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True,num_workers=0)

# 5 train
num_iter = 0
best_test_loss = np.inf
for epoch in range(num_epochs):
    net.train()
    # Step the learning rate down at fixed epochs.
    if epoch == 30:
        learning_rate = 0.0001
    if epoch == 40:
        learning_rate = 0.00001
    for params_group in optimizer.param_groups:
        params_group['lr'] = learning_rate

    print('\n\nStarting epoch %d / %d' % (epoch + 1, num_epochs))
    print('Learning Rate for this epoch: {}'.format(learning_rate))

    total_loss = 0.

    for i,(img,targets) in enumerate(train_loader):
        imgs = img.to(torch.float32)   # [b, 3, 416, 416]; targets: [b, 13, 13, 5, 10]

        pred = net(imgs)
        loss = cost.yolo_loss(pred,targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate so the printed running average is meaningful
        # (it was always 0 because total_loss was never updated).
        total_loss += loss.item()
        if(i+1)%5 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f, average_loss: %.4f'
                   %(epoch+1, num_epochs, i+1, len(train_loader), loss.item(), total_loss / (i+1)))
            num_iter += 1

5. 预测

predict.py

process：
    1.数据处理
    2.预测
    3.筛选
    4.画框

代码

'''
# 1 img process
# 2 predict --> decoder
# 3 filter_boxes
# 4 draw

'''
from torch.autograd import Variable
import torchvision.transforms as transforms
import numpy as np
from PIL import Image,ImageDraw,ImageFont
import colorsys,imghdr,os,torch,cv2
from nets.yolo_model import yolo_body,yolo_decoder
from data_process.data_encoder_3 import get_classes,get_anchors

def yolo_boxes_to_corners(box_xy,box_wh):
    '''Convert centre/size boxes to corners.

    The last axis of the result is ordered (min of coordinate 1, min of
    coordinate 0, max of coordinate 1, max of coordinate 0), i.e.
    (y_min, x_min, y_max, x_max) when the inputs are (x, y) and (w, h).
    '''
    half = box_wh/2.
    lower = box_xy - half
    upper = box_xy + half
    pieces = [lower[...,1:2],lower[...,0:1],upper[...,1:2],upper[...,0:1]]
    return torch.cat(pieces,-1)

def yolo_filter_boxes(boxes,box_confidence,box_class_probs,threshold=.6):
    '''Keep the boxes whose best class score reaches the threshold.

    inputs:
        boxes           [..., 4]
        box_confidence  [..., 1]
        box_class_probs [..., num_classes]
    The score of a box is confidence * class probability, maximised over classes.
    outputs:
        boxes[n,4], scores[n], classes[n] of the boxes with score >= threshold
    '''
    per_class_scores = box_confidence * box_class_probs
    best_scores,best_classes = per_class_scores.max(dim=-1)
    keep_mask = best_scores >= threshold
    return boxes[keep_mask], best_scores[keep_mask], best_classes[keep_mask]

def nms(bboxes, scores, threshold=0.5):
    """Greedy non-maximum suppression.

    Repeatedly keeps the highest-scoring remaining box and discards every
    other remaining box whose IoU with it exceeds ``threshold``.

    Args:
        bboxes: ``[n, 4]`` tensor of corner boxes. Columns are read as
            ``(x1, y1, x2, y2)``; a ``(y1, x1, y2, x2)`` layout gives the
            same result because IoU is symmetric in the two axes.
        scores: ``[n]`` tensor of box scores.
        threshold: IoU above which a lower-scoring box is suppressed.

    Returns:
        1-D ``int64`` tensor with the indices of the kept boxes, ordered by
        descending score. Empty when there are no input boxes.
    """
    x1 = bboxes[:, 0]
    y1 = bboxes[:, 1]
    x2 = bboxes[:, 2]
    y2 = bboxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)

    _, order = scores.sort(0, descending=True)
    order = order.reshape(-1)  # always 1-D, so slicing below is safe
    keep = []
    while order.numel() > 0:
        # Plain int so ``keep`` is a homogeneous list of Python ints.
        i = order[0].item()
        keep.append(i)
        if order.numel() == 1:
            break

        rest = order[1:]
        # Intersection rectangle of box ``i`` with each remaining box:
        # near corner is the element-wise max, far corner the element-wise
        # min (the far corner must be clamped by x2/y2 of box ``i``).
        xx1 = x1[rest].clamp(min=x1[i].item())
        yy1 = y1[rest].clamp(min=y1[i].item())
        xx2 = x2[rest].clamp(max=x2[i].item())
        yy2 = y2[rest].clamp(max=y2[i].item())
        w = (xx2 - xx1).clamp(min=0)
        h = (yy2 - yy1).clamp(min=0)
        inter = w * h

        iou = inter / (areas[i] + areas[rest] - inter)
        # Survivors are the boxes that do not overlap box ``i`` too much.
        survivors = torch.nonzero(iou <= threshold).reshape(-1)
        order = rest[survivors]
    return torch.tensor(keep, dtype=torch.long)

def yolo_eval(yolo_outputs, image_shape=(416, 416),
              score_threshold=.6, iou_threshold=.5):
    """Turn decoded network output into final detections (score filter + NMS).

    Args:
        yolo_outputs: tuple ``(box_xy, box_wh, box_confidence,
            box_class_probs)``; per the original notes the shapes are
            ``[1,13,13,5,2]``, ``[1,13,13,5,2]``, ``[1,13,13,5,1]`` and
            ``[1,13,13,5,20]``.
        image_shape: ``(height, width)`` of the image the boxes are mapped
            onto. Index 0 scales the y columns and index 1 the x columns.
        score_threshold: minimum class score for a box to survive filtering.
        iou_threshold: IoU above which NMS suppresses a lower-scoring box.

    Returns:
        ``(boxes, scores, classes)`` with shapes ``[n, 4]``, ``[n]``, ``[n]``;
        boxes are ``(y_min, x_min, y_max, x_max)`` scaled by ``image_shape``.
    """
    box_xy, box_wh, box_confidence, box_class_probs = yolo_outputs
    boxes = yolo_boxes_to_corners(box_xy, box_wh)  # [1, 13, 13, 5, 4]
    # 1 score_filter
    boxes, scores, classes = yolo_filter_boxes(
        boxes, box_confidence, box_class_probs, threshold=score_threshold)
    # Map the boxes onto the target image. The scale tensor is created with
    # the boxes' dtype and device so the product also works when the
    # predictions live on an accelerator.
    height, width = image_shape[0], image_shape[1]
    scale = torch.tensor([height, width, height, width],
                         dtype=boxes.dtype, device=boxes.device)
    boxes = boxes * scale
    # 2 NMS
    keep = nms(boxes, scores, iou_threshold)
    return boxes[keep], scores[keep], classes[keep]

def _measure_label(draw, label, font):
    """Return the ``(width, height)`` in pixels needed to render ``label``.

    ``ImageDraw.textsize`` was removed in Pillow 10, so ``textbbox`` is used
    when available; ``textsize`` only remains as a fallback for old Pillow.
    """
    if hasattr(draw, 'textbbox'):
        # Far corner of the box anchored at the origin = extent of the text.
        _, _, width, height = draw.textbbox((0, 0), label, font=font)
        return width, height
    return draw.textsize(label, font)


def detect_img(image_name='000015.jpg'):
    """Run the detector on one VOC2007 image and save the annotated result.

    Steps: (1) load and preprocess the image, (2) run the network and decode
    its output, (3) filter the boxes (score threshold + NMS), (4) draw the
    surviving boxes and labels.

    Args:
        image_name: file name inside ``VOCdevkit/VOC2007/JPEGImages/``.

    Raises:
        FileNotFoundError: if the image cannot be read.

    Side effects:
        Prints progress and detections, and writes the annotated image to
        ``image/<image_name>`` (the directory is created if missing).
    """
    # 1 img process
    image_path = 'VOCdevkit/VOC2007/JPEGImages/' + image_name
    image = cv2.imread(image_path)  # BGR uint8, (h, w, 3)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None.
        raise FileNotFoundError('cannot read image: ' + image_path)
    h, w, _ = image.shape
    img = cv2.resize(image, (416, 416))         # (416, 416, 3)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # float32 (np.float no longer exists in NumPy >= 1.24, and float64 input
    # would not match float32 model weights).
    img = np.array(img, np.float32) / 255.
    transform = transforms.Compose([transforms.ToTensor(),])
    img = transform(img)[None, :, :, :]         # torch.Size([1, 3, 416, 416])

    # 2 predict --> decoder
    net = yolo_body()
    net.eval()
    print('load model...')
    # NOTE(review): no checkpoint is loaded here, so the network runs with
    # whatever weights yolo_body() initialises -- load the trained weights
    # before relying on the predictions.

    print('predicting...')
    anchors_path = 'model_data/anchors.txt'
    classes_path = 'model_data/pascal_classes.txt'
    anchors = get_anchors(anchors_path)
    class_names = get_classes(classes_path)
    num_classes = len(class_names)
    # Inference only: torch.no_grad() replaces the removed
    # Variable(..., volatile=True) flag.
    with torch.no_grad():
        feas = net(img)
        pred = yolo_decoder(feas, anchors, num_classes)
    # box_xy[1,13,13,5,2], box_wh[1,13,13,5,2], confidence[1,13,13,5,1],
    # box_class_probs[1,13,13,5,20]

    # 3 filter_boxes
    # Scale boxes to the ORIGINAL image (height, width): that is the image
    # they are drawn on below.
    boxes, scores, classes = yolo_eval(pred, image_shape=[h, w])  # [n,4],[n],[n]
    print(boxes.shape, scores.shape, classes.shape)

    # 4 draw
    # One evenly spaced hue per class.
    hsv_tuples = [(x / len(class_names), 1., 1.)
                  for x in range(len(class_names))]
    colors = [tuple(int(channel * 255) for channel in colorsys.hsv_to_rgb(*hsv))
              for hsv in hsv_tuples]

    font = ImageFont.truetype(
        font='font/FiraMono-Medium.otf',
        size=max(1, int(np.floor(3e-2 * h + 0.5))))
    thickness = max(1, (h + w) // 300)

    # ImageDraw needs a PIL image in RGB order; cv2 loaded a BGR ndarray.
    canvas = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(canvas)

    # Reversed so that boxes appearing earlier in the result are drawn last
    # (on top).
    for idx in reversed(range(len(classes))):
        class_id = int(classes[idx])
        predicted_class = class_names[class_id]
        score = float(scores[idx])

        label = '{} {:.2f}'.format(predicted_class, score)
        label_size = _measure_label(draw, label, font)

        # Boxes are (y_min, x_min, y_max, x_max); round and clip to the image.
        top, left, bottom, right = boxes[idx].tolist()
        top = max(0, int(np.floor(top + 0.5)))
        left = max(0, int(np.floor(left + 0.5)))
        bottom = min(canvas.size[1], int(np.floor(bottom + 0.5)))
        right = min(canvas.size[0], int(np.floor(right + 0.5)))
        print(label, (left, top), (right, bottom))

        # Put the label above the box when there is room, else just inside.
        if top - label_size[1] >= 0:
            text_origin = (left, top - label_size[1])
        else:
            text_origin = (left, top + 1)

        # Thick outline = several nested 1-pixel rectangles.
        for offset in range(thickness):
            if right - offset < left + offset or bottom - offset < top + offset:
                break  # box too small for another inner rectangle
            draw.rectangle(
                [left + offset, top + offset, right - offset, bottom - offset],
                outline=colors[class_id])
        draw.rectangle(
            [text_origin,
             (text_origin[0] + label_size[0], text_origin[1] + label_size[1])],
            fill=colors[class_id])
        draw.text(text_origin, label, fill=(0, 0, 0), font=font)

    del draw
    # Save once, after all boxes are drawn (also when nothing was detected).
    os.makedirs('image', exist_ok=True)
    canvas.save(os.path.join('image', image_name), quality=90)



# Script entry point: when this file is executed directly (not imported),
# run the single-image detection demo.
if __name__ == '__main__':
    detect_img()