few-shot-learning for object detection

最新推荐文章于 2023-01-09 17:58:02 发布

高颜值的杀生丸

最新推荐文章于 2023-01-09 17:58:02 发布

阅读量496

点赞数

文章标签： cobol hevc cuda processing stack

本文链接：https://blog.csdn.net/u010970956/article/details/117377502

版权

该代码实现了一个基于PyTorch的YOLOv3目标检测模型训练过程，包括数据加载、预处理、模型定义、损失函数计算等。训练过程中，模型会根据配置文件调整学习率，并记录训练进度、损失和精度等指标。同时，代码中还包含了一个用于负样本过滤的辅助函数，以优化训练过程。

摘要由CSDN通过智能技术生成

GitHub: https://github.com/LiuXinyu12378/few-shot-learning-for-object-detection

train.py

from __future__ import print_function
import sys

import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torchvision import datasets, transforms
from torch.autograd import Variable
from tqdm import tqdm

import dataset
import random
import math
import os
from utils import *
from cfg import parse_cfg, cfg
from darknet import Darknet
import pdb

# Training settings
# The config paths are hard-coded below; the original command-line form is
# kept here for reference.
# datacfg = sys.argv[1]
# darknetcfg = parse_cfg(sys.argv[2])
# learnetcfg = parse_cfg(sys.argv[3])

datacfg = "cfg/fewyolov3_voc.data"
darknetcfg = parse_cfg("cfg/darknet_yolov3_spp.cfg")
learnetcfg = parse_cfg("cfg/reweighting_net.cfg")
weightfile = "tmp/000050.weights"
# Only the weight file can still be overridden from the command line, and only
# when exactly four arguments are supplied (argv[1..3] are ignored).
if len(sys.argv) == 5:
    weightfile = sys.argv[4]

data_options = read_data_cfg(datacfg)
# The first section of each parsed cfg is used as the option dictionary for
# that network.
net_options = darknetcfg[0]
meta_options = learnetcfg[0]

# Configure options
# Push the parsed options into the shared `cfg` object read by other modules.
cfg.config_data(data_options)
cfg.config_meta(meta_options)
cfg.config_net(net_options)

# Parameters
metadict = data_options['meta']
# NOTE: `trainlist` is rebound further down to dataset.build_dataset(...).
trainlist = data_options['train']

testlist = data_options['valid']
# NOTE: `backupdir` is overwritten by cfg.backup in the MAIN section below.
backupdir = data_options['backup']
gpus = data_options['gpus']  # e.g. 0,1,2,3
ngpus = len(gpus.split(','))
num_workers = int(data_options['num_workers'])

batch_size = int(net_options['batch'])
print("batch_size:",batch_size)
max_batches = int(net_options['max_batches'])
# Learning-rate schedule values come from the data cfg, not the net cfg.
learning_rate = float(data_options['learning_rate'])
# momentum and decay are parsed here but are not passed to the Adam optimizer
# constructed at the end of this section.
momentum = float(net_options['momentum'])
decay = float(net_options['decay'])
# `steps` are batch-count boundaries and `scales` the matching LR multipliers
# consumed by adjust_learning_rate().
steps = [float(step) for step in data_options['steps'].split(',')]
scales = [float(scale) for scale in data_options['scales'].split(',')]

# Train parameters
use_cuda = True
# Time-based seed: runs are not reproducible unless this is fixed by hand.
seed = int(time.time())

## --------------------------------------------------------------------------
## MAIN
# Replaces the 'backup' value read from data_options above.
backupdir = cfg.backup
print('logging to ' + backupdir)
if not os.path.exists(backupdir):
    os.makedirs(backupdir)

torch.manual_seed(seed)
if use_cuda:
    # Restrict the visible devices before any CUDA call is made.
    os.environ['CUDA_VISIBLE_DEVICES'] = gpus
    torch.cuda.manual_seed(seed)

model = Darknet(darknetcfg, learnetcfg)
# The loss module is owned by the model; train() calls it as region_loss(...).
region_loss = model.loss

model.print_network()
# Weights are now always loaded (the argv guard below was disabled).
# if len(sys.argv) == 5:
model.load_weights(weightfile)

###################################################
### Meta-model parameters
# Resume counters from the number of samples recorded on the model.
region_loss.seen = model.seen
# True division: on Python 3 this is a float unless cfg.tuning is set.
processed_batches = 0 if cfg.tuning else model.seen / batch_size
trainlist = dataset.build_dataset(data_options)
nsamples = len(trainlist)
init_width = model.width
init_height = model.height
# init_epoch / max_epochs may be floats here; they are cast to int right
# before the epoch loop at the bottom of the script.
init_epoch = 0 if cfg.tuning else model.seen / nsamples
max_epochs = max_batches * batch_size / nsamples + 1
# In tuning mode the epoch budget comes from cfg.max_epoch / cfg.repeat instead.
max_epochs = int(math.ceil(cfg.max_epoch * 1. / cfg.repeat)) if cfg.tuning else max_epochs
print(cfg.repeat, nsamples, max_batches, batch_size)
print(num_workers)

# Extra DataLoader keyword arguments used by train() for the image loader.
kwargs = {'num_workers': num_workers, 'pin_memory': True} if use_cuda else {}

if use_cuda:
    if ngpus > 1:
        # With DataParallel the underlying network is reached via model.module.
        model = torch.nn.DataParallel(model).cuda()
    else:
        model = model.cuda()

# Initial LR only; adjust_learning_rate() rewrites it on every batch.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


def adjust_learning_rate(optimizer, processed_batches):
    """Apply the step-wise learning-rate schedule and return the new rate.

    Starting from the module-level ``learning_rate``, the rate is multiplied
    by ``scales[k]`` for every boundary ``steps[k]`` that
    ``processed_batches`` has reached (a missing scale counts as 1).
    Scanning stops at the first boundary not yet reached, or right after a
    boundary that is hit exactly. The result is written to every parameter
    group of ``optimizer``.
    """
    lr = learning_rate
    for idx, boundary in enumerate(steps):
        if processed_batches < boundary:
            # Boundaries are scanned in order; later ones are not applied.
            break
        factor = scales[idx] if idx < len(scales) else 1
        lr = lr * factor
        if processed_batches == boundary:
            break
    for group in optimizer.param_groups:
        group['lr'] = lr
    return lr


def train(epoch):
    """Run one training epoch and optionally save a weight checkpoint.

    Builds a fresh image loader and meta-input loader, then for every image
    batch draws one meta batch, runs the forward pass, evaluates
    ``region_loss`` and takes an optimizer step. Progress and metrics are
    shown through a tqdm bar. The module-level ``processed_batches`` counter
    and the ``seen`` counters on the model and the loss are advanced as a
    side effect.

    Args:
        epoch: Zero-based epoch index; used for logging and for naming the
            checkpoint written every ``cfg.save_interval`` epochs.

    Raises:
        StopIteration: if the meta loader runs out of batches before the
            image loader does.
    """
    global processed_batches
    t0 = time.time()
    # DataParallel wraps the network; save_weights/seen live on the inner module.
    if ngpus > 1:
        cur_model = model.module
    else:
        cur_model = model

    train_loader = torch.utils.data.DataLoader(
        dataset.listDataset(trainlist, shape=(init_width, init_height),
                            shuffle=False,
                            transform=transforms.Compose([
                                transforms.ToTensor(),
                            ]),
                            train=True,
                            seen=cur_model.seen,
                            batch_size=batch_size,
                            num_workers=num_workers),
        batch_size=batch_size, shuffle=False, **kwargs)

    metaset = dataset.MetaDataset(metafiles=metadict, train=True)
    metaloader = iter(torch.utils.data.DataLoader(
        metaset,
        batch_size=metaset.batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True
    ))

    lr = adjust_learning_rate(optimizer, processed_batches)
    logging('epoch %d/%d, processed %d samples, lr %e' % (epoch, max_epochs, epoch * len(train_loader.dataset), lr))

    model.train()
    # Flip to True to print a per-stage timing breakdown for every batch.
    profile_timing = False
    t1 = time.time()
    avg_time = torch.zeros(9)
    with tqdm(total=len(train_loader)) as t:

        for batch_idx, (data, target) in enumerate(train_loader):
            # Built-in next() works on every DataLoader iterator; the
            # iterator's own .next() method is not available on recent
            # PyTorch releases.
            metax, mask = next(metaloader)
            t2 = time.time()
            adjust_learning_rate(optimizer, processed_batches)
            processed_batches = processed_batches + 1
            if use_cuda:
                data = data.cuda()
                metax = metax.cuda()
                mask = mask.cuda()
                # target stays on the CPU here; the loss moves what it needs.
            t3 = time.time()
            data, target = Variable(data), Variable(target)
            metax, mask = Variable(metax), Variable(mask)
            t4 = time.time()
            optimizer.zero_grad()
            t5 = time.time()
            output = model(data, metax, mask)
            t6 = time.time()
            # Keep the sample counters on the loss and the model in sync.
            region_loss.seen = region_loss.seen + data.data.size(0)
            cur_model.seen = region_loss.seen
            # Tell the loss the spatial size of this batch.
            region_loss.input_size = (data.data.size(2), data.data.size(3))
            loss, loss_box, loss_conf, loss_cls, cls_acc, recall50, recall75, nProposals = region_loss(output, target)
            t.set_description('Epoch %d' % epoch)
            t.set_postfix(loss=loss.item(), loss_bbox=loss_box, loss_conf=loss_conf, loss_cls=loss_cls,
                          cls_acc=cls_acc, recall50=recall50, recall75=recall75, Proposals=nProposals)
            t.update()

            t7 = time.time()
            loss.backward()
            t8 = time.time()
            optimizer.step()
            t9 = time.time()
            if profile_timing and batch_idx > 1:
                stage_times = (t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5,
                               t7 - t6, t8 - t7, t9 - t8, t9 - t1)
                for slot, elapsed in enumerate(stage_times):
                    avg_time[slot] = avg_time[slot] + elapsed
                print('-------------------------------')
                print('       load data : %f' % (avg_time[0] / (batch_idx)))
                print('     cpu to cuda : %f' % (avg_time[1] / (batch_idx)))
                print('cuda to variable : %f' % (avg_time[2] / (batch_idx)))
                print('       zero_grad : %f' % (avg_time[3] / (batch_idx)))
                print(' forward feature : %f' % (avg_time[4] / (batch_idx)))
                print('    forward loss : %f' % (avg_time[5] / (batch_idx)))
                print('        backward : %f' % (avg_time[6] / (batch_idx)))
                print('            step : %f' % (avg_time[7] / (batch_idx)))
                print('           total : %f' % (avg_time[8] / (batch_idx)))
            t1 = time.time()
        print('')
        t1 = time.time()
        logging('training with %f samples/s' % (len(train_loader.dataset) / (t1 - t0)))

        if (epoch + 1) % cfg.save_interval == 0:
            logging('save weights to %s/%06d.weights' % (backupdir, epoch + 1))
            cur_model.save_weights('%s/%06d.weights' % (backupdir, epoch + 1))

# The epoch bounds were computed with division above and may be floats;
# truncate both so they can drive range().
init_epoch, max_epochs = int(init_epoch), int(max_epochs)
print("init_epoch:", init_epoch)
print("max_epochs:", max_epochs)
# Train from the resumed epoch up to (but excluding) max_epochs.
for epoch in range(init_epoch, max_epochs):
    train(epoch)

region_loss.py

import time
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.autograd import Variable
from utils import *
from cfg import cfg
from numbers import Number
from random import random, randint
import pdb


def neg_filter(pred_boxes, target, withids=False):
    """Randomly drop negative rows so negatives stay near ``cfg.neg_ratio`` x positives.

    A row is "positive" when its target row has a non-zero sum. With
    ``cfg.neg_ratio == 'full'`` nothing is dropped. Otherwise each negative
    row is kept with probability ``neg_ratio * n_pos / n_neg``; when that
    value is >= 1, or when there are no negative rows at all, every row is
    kept.

    Args:
        pred_boxes: Tensor whose first dimension indexes rows.
        target: 2-D tensor with the same number of rows as ``pred_boxes``.
        withids: When true, also return the indices of the kept rows.

    Returns:
        ``(pred_boxes, target)`` or ``(pred_boxes, target, inds)``. ``inds``
        is a list when nothing was dropped and a 1-D NumPy index array
        otherwise.

    Raises:
        NotImplementedError: if ``cfg.neg_ratio`` is neither ``'full'`` nor a
            number.
    """
    assert pred_boxes.size(0) == target.size(0)
    if cfg.neg_ratio == 'full':
        inds = list(range(pred_boxes.size(0)))
    elif isinstance(cfg.neg_ratio, Number):
        flags = (torch.sum(target, 1) != 0).cpu().tolist()
        n_pos = sum(flags)
        n_neg = len(flags) - n_pos
        # Without negatives there is nothing to subsample, and the ratio
        # below would divide by zero.
        if n_neg == 0 or cfg.neg_ratio * n_pos * 1. / n_neg >= 1:
            inds = list(range(pred_boxes.size(0)))
        else:
            ratio = cfg.neg_ratio * n_pos * 1. / n_neg
            flags = [0 if f == 0 and random() > ratio else 1 for f in flags]
            # np.nonzero(...)[0] is always 1-D, so a single surviving row
            # still keeps its leading batch dimension when indexing.
            inds = np.nonzero(flags)[0]
            pred_boxes, target = pred_boxes[inds], target[inds]
    else:
        raise NotImplementedError('neg_ratio not recognized')
    if withids:
        return pred_boxes, target, inds
    else:
        return pred_boxes, target


def neg_filter_v2(pred_boxes, target, withids=False):
    """Subsample negative rows like ``neg_filter`` but never return an empty batch.

    A row is "positive" when its target row has a non-zero sum. With
    ``cfg.neg_ratio == 'full'`` nothing is dropped. Otherwise each negative
    row is kept with probability ``neg_ratio * n_pos / n_neg``; when that
    value is >= 1, or when there are no negative rows, every row is kept.
    If sampling would drop every row, one randomly chosen row is kept.

    Args:
        pred_boxes: Tensor whose first dimension indexes rows.
        target: 2-D tensor with the same number of rows as ``pred_boxes``.
        withids: When true, also return the indices of the kept rows.

    Returns:
        ``(pred_boxes, target)`` or ``(pred_boxes, target, inds)``. ``inds``
        is a list when nothing was dropped and a 1-D NumPy index array
        otherwise.

    Raises:
        NotImplementedError: if ``cfg.neg_ratio`` is neither ``'full'`` nor a
            number.
    """
    assert pred_boxes.size(0) == target.size(0)
    if cfg.neg_ratio == 'full':
        inds = list(range(pred_boxes.size(0)))
    elif isinstance(cfg.neg_ratio, Number):
        flags = (torch.sum(target, 1) != 0).cpu().tolist()
        n_pos = sum(flags)
        n_neg = len(flags) - n_pos
        # Without negatives there is nothing to subsample, and the ratio
        # below would divide by zero.
        if n_neg == 0 or cfg.neg_ratio * n_pos * 1. / n_neg >= 1:
            inds = list(range(pred_boxes.size(0)))
        else:
            ratio = cfg.neg_ratio * n_pos * 1. / n_neg
            flags = [0 if f == 0 and random() > ratio else 1 for f in flags]
            if sum(flags) == 0:
                # Keep one arbitrary row so downstream code never sees an
                # empty batch.
                flags[randint(0, len(flags) - 1)] = 1
            inds = np.nonzero(flags)[0]
            pred_boxes, target = pred_boxes[inds], target[inds]
    else:
        raise NotImplementedError('neg_ratio not recognized')
    if withids:
        return pred_boxes, target, inds
    else:
        return pred_boxes, target


def build_targets(pred_boxes, target, conf, anchors, num_anchors, feature_size, input_size, ignore_thresh):
    """Build regression/classification targets and masks for one feature map.

    Every ground-truth box is assigned to the anchor whose width/height gives
    the highest IoU and to the grid cell containing the box centre. All
    tensors are allocated on the GPU, so this function requires CUDA.

    Args:
        pred_boxes: Predicted boxes; reshaped here to
            (nB, nA, feature_size[0], feature_size[1], 4).
        target: Ground-truth tensor, read as rows of 5 values where column 0
            is used as the class, columns 1-2 as the centre and columns 3-4
            as the size. Presumably normalised to [0, 1] -- TODO confirm.
        conf: Predicted confidences; reshaped here to
            (nB, nA, feature_size[0], feature_size[1]).
        anchors: 2-D tensor indexed as anchors[:, 0] (width) and
            anchors[:, 1] (height).
        num_anchors: Number of anchors (nA).
        feature_size: (height, width) of the feature map.
        input_size: (height, width) used to scale box sizes and centres.
        ignore_thresh: Non-best anchors whose IoU with a ground-truth box
            exceeds this value are removed from the no-object mask.

    Returns:
        Tuple ``(nbox, iou_scores, obj_mask, noobj_mask, tx, ty, tw, th,
        tconf, tcls, detected50, detected75)`` where ``nbox`` is the number
        of ground-truth boxes kept and ``detected50``/``detected75`` hold one
        0/1 entry per ground-truth box.
    """
    nB = target.size(0)
    nA = num_anchors
    # print('anchor_step: ', anchor_step)
    # obj_mask starts all-zero, noobj_mask all-one; both are edited below.
    obj_mask = torch.cuda.ByteTensor(nB, nA, feature_size[0], feature_size[1]).fill_(0)
    noobj_mask = torch.cuda.ByteTensor(nB, nA, feature_size[0], feature_size[1]).fill_(1)
    tx = torch.zeros(nB, nA, feature_size[0], feature_size[1]).cuda()
    ty = torch.zeros(nB, nA, feature_size[0], feature_size[1]).cuda()
    tw = torch.zeros(nB, nA, feature_size[0], feature_size[1]).cuda()
    th = torch.zeros(nB, nA, feature_size[0], feature_size[1]).cuda()
    tcls = torch.zeros(nB, nA, feature_size[0], feature_size[1]).cuda()
    iou_scores = torch.zeros(nB, nA, feature_size[0], feature_size[1]).cuda()

    # Flatten to one row per box slot and keep only rows with a positive
    # value in column 3 (used as the width below).
    tboxes = target.view(-1, 5)
    nonzero_ind = tboxes[:, 3] > 0
    tboxes = tboxes[nonzero_ind.unsqueeze(1).repeat(1, 5)].view(-1, 5)
    # Batch-row index of every box slot; the hard-coded 50 assumes each
    # target row holds exactly 50 box slots.
    ind_B = torch.linspace(0, nB - 1, nB).unsqueeze(1).repeat(1, 50).view(-1).long().cuda()
    ind_B = ind_B[nonzero_ind]
    # Centres in feature-map cells, sizes scaled by input_size.
    gx = (tboxes[:, 1] * feature_size[1]).float()
    gy = (tboxes[:, 2] * feature_size[0]).float()
    gw = (tboxes[:, 3] * input_size[1]).float()
    gh = (tboxes[:, 4] * input_size[0]).float()
    aw = anchors[:, 0]
    ah = anchors[:, 1]
    nbox = tboxes.size(0)
    # Shape-only IoU: both box sets are centred at the origin so only width
    # and height influence the anchor match.
    gt_box = torch.cat([torch.zeros(1, nbox).cuda(), torch.zeros(1, nbox).cuda(), gw.unsqueeze(0), gh.unsqueeze(0)], 0)
    anchor_box = torch.cat([torch.zeros(1, nA).cuda(), torch.zeros(1, nA).cuda(), aw.unsqueeze(0), ah.unsqueeze(0)], 0)
    # presumably bbox_ious returns an (nbox, nA) tensor here -- TODO confirm against utils
    ious = bbox_ious(gt_box.unsqueeze(2).repeat(1, 1, nA), anchor_box.unsqueeze(1).repeat(1, nbox, 1), x1y1x2y2=False)
    best_ious, best_a = ious.max(1)
    # Integer cell coordinates of each box centre.
    # NOTE(review): not clamped; a centre exactly on the far edge would index
    # one cell past the map -- confirm the inputs exclude that case.
    gj = gy.long()
    gi = gx.long()
    obj_mask[ind_B, best_a, gj, gi] = 1
    noobj_mask[ind_B, best_a, gj, gi] = 0

    # Also clear the no-object mask for every anchor above ignore_thresh at
    # the box's cell, so those anchors are not penalised as background.
    for i, iou in enumerate(ious):
        if (iou > ignore_thresh).sum():
            noobj_mask[ind_B[i:i + 1], (iou > ignore_thresh).nonzero().squeeze(1), gj[i:i + 1], gi[i:i + 1]] = 0

    # Regression targets: fractional offset inside the cell and log size
    # ratio relative to the matched anchor.
    tx[ind_B, best_a, gj, gi] = gx - gx.floor()
    ty[ind_B, best_a, gj, gi] = gy - gy.floor()
    tw[ind_B, best_a, gj, gi] = torch.log(gw / anchors[best_a][:, 0])
    th[ind_B, best_a, gj, gi] = torch.log(gh / anchors[best_a][:, 1])
    tcls[ind_B, best_a, gj, gi] = tboxes[:, 0].float()
    tconf = obj_mask.float()
    pred_boxes = pred_boxes.contiguous().view(nB, nA, feature_size[0], feature_size[1], 4).cuda()
    conf = conf.contiguous().view(nB, nA, feature_size[0], feature_size[1]).data
    # Ground-truth boxes scaled by input_size, laid out as (4, nbox) for bbox_ious.
    target_boxes = torch.cat([(tboxes[:, 1] * input_size[1]).float().unsqueeze(0),
                              (tboxes[:, 2] * input_size[0]).float().unsqueeze(0),
                              gw.unsqueeze(0),
                              gh.unsqueeze(0)], 0)

    # IoU between each ground-truth box and the prediction at its assigned
    # anchor/cell; a box counts as detected when that prediction also has
    # confidence above 0.5.
    iou_scores[ind_B, best_a, gj, gi] = bbox_ious(pred_boxes[ind_B, best_a, gj, gi].t(), target_boxes, x1y1x2y2=False)
    conf50 = (conf[ind_B, best_a, gj, gi] > 0.5).float()
    detected50 = (iou_scores[ind_B, best_a, gj, gi] > 0.5).float() * conf50
    detected75 = (iou_scores[ind_B, best_a, gj, gi] > 0.75).float() * conf50

    return nbox, iou_scores, obj_mask, noobj_mask, tx, ty, tw, th, tconf, tcls, detected50, detected75


class RegionLoss(nn.Module):
    """Single-scale YOLO region loss (sum-of-squares box/conf loss + softmax class loss).

    NOTE(review): ``forward`` calls ``build_targets`` with an 11-argument /
    11-result convention that does not match the ``build_targets`` defined in
    this file (8 parameters, 12 results). This class appears to be legacy
    code kept alongside ``RegionLossV2``; confirm before using it.
    """

    def __init__(self, num_classes=0, anchors=None, num_anchors=1):
        """Store the loss hyper-parameters.

        Args:
            num_classes: Number of class scores per anchor.
            anchors: Flat list of anchor values, ``anchor_step`` values per
                anchor. Defaults to an empty list.
            num_anchors: Number of anchors per cell.
        """
        super(RegionLoss, self).__init__()
        self.num_classes = num_classes
        # A fresh list per instance avoids sharing one mutable default.
        self.anchors = [] if anchors is None else anchors
        self.num_anchors = num_anchors
        # Integer division: anchor_step is used as a view() dimension in
        # forward(), which rejects floats.
        self.anchor_step = len(self.anchors) // num_anchors
        self.coord_scale = 1
        self.noobject_scale = 1
        self.object_scale = 5
        self.class_scale = 1
        self.thresh = 0.6
        self.seen = 0

    def forward(self, output, target):
        """Compute the total region loss for one batch.

        Args:
            output: Network output of shape (B, A*(5+num_classes), H, W).
            target: Ground-truth tensor; a 3-D target is flattened to 2-D
                before negative filtering.

        Returns:
            Scalar loss tensor (sum of x, y, w, h, confidence and class terms).
        """
        # output : BxAs*(4+1+num_classes)*H*W
        if target.dim() == 3:
            target = target.view(-1, target.size(-1))
        output, target = neg_filter(output, target)

        t0 = time.time()
        nB = output.data.size(0)
        nA = self.num_anchors
        nC = self.num_classes
        nH = output.data.size(2)
        nW = output.data.size(3)

        # Split the channel axis into (anchor, 5 + nC) and pick each component.
        output = output.view(nB, nA, (5 + nC), nH, nW)
        x = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW))
        y = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW))
        w = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).view(nB, nA, nH, nW)
        h = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).view(nB, nA, nH, nW)
        conf = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).view(nB, nA, nH, nW))
        # [nB, nA, nC, nW, nH] | (bs, 5, 1, 13, 13)
        cls = output.index_select(2, Variable(torch.linspace(5, 5 + nC - 1, nC).long().cuda()))
        cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(nB * nA * nH * nW, nC)

        t1 = time.time()

        # Decode predictions into boxes in grid units: cell offset + sigmoid
        # for the centre, anchor * exp() for the size.
        pred_boxes = torch.cuda.FloatTensor(4, nB * nA * nH * nW)
        grid_x = torch.linspace(0, nW - 1, nW).repeat(nH, 1).repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda()
        grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda()
        anchor_w = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([0])).cuda()
        anchor_h = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([1])).cuda()
        anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW)
        anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW)
        pred_boxes[0] = x.data + grid_x
        pred_boxes[1] = y.data + grid_y
        pred_boxes[2] = torch.exp(w.data) * anchor_w
        pred_boxes[3] = torch.exp(h.data) * anchor_h
        pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4))
        t2 = time.time()

        # NOTE(review): this call does not match the build_targets signature
        # defined in this file -- see the class docstring.
        nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(
            pred_boxes, target.data, self.anchors, nA, nC, nH, nW,
            self.noobject_scale, self.object_scale, self.thresh, self.seen)
        cls_mask = (cls_mask == 1)
        if cfg.metayolo:
            tcls.zero_()
        # .item() reads the 0-dim sum; .data[0] fails on current PyTorch.
        nProposals = int((conf > 0.25).float().sum().item())

        tx = Variable(tx.cuda())
        ty = Variable(ty.cuda())
        tw = Variable(tw.cuda())
        th = Variable(th.cuda())
        tconf = Variable(tconf.cuda())
        tcls = Variable(tcls.view(-1)[cls_mask].long().cuda())

        coord_mask = Variable(coord_mask.cuda())
        conf_mask = Variable(conf_mask.cuda().sqrt())
        cls_mask = Variable(cls_mask.view(-1, 1).repeat(1, nC).cuda())
        cls = cls[cls_mask].view(-1, nC)

        t3 = time.time()

        # reduction='sum' is the current spelling of size_average=False.
        sum_mse = nn.MSELoss(reduction='sum')
        loss_x = self.coord_scale * sum_mse(x * coord_mask, tx * coord_mask) / 2.0
        loss_y = self.coord_scale * sum_mse(y * coord_mask, ty * coord_mask) / 2.0
        loss_w = self.coord_scale * sum_mse(w * coord_mask, tw * coord_mask) / 2.0
        loss_h = self.coord_scale * sum_mse(h * coord_mask, th * coord_mask) / 2.0
        loss_conf = sum_mse(conf * conf_mask, tconf * conf_mask) / 2.0
        loss_cls = self.class_scale * nn.CrossEntropyLoss(reduction='sum')(cls, tcls)
        loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
        t4 = time.time()
        if False:
            print('-----------------------------------')
            print('        activation : %f' % (t1 - t0))
            print(' create pred_boxes : %f' % (t2 - t1))
            print('     build targets : %f' % (t3 - t2))
            print('       create loss : %f' % (t4 - t3))
            print('             total : %f' % (t4 - t0))
        print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f' % (
            self.seen, nGT, nCorrect, nProposals, loss_x.item(), loss_y.item(), loss_w.item(), loss_h.item(),
            loss_conf.item(), loss_cls.item(), loss.item()))
        return loss


class RegionLossV2(nn.Module):
    """
    Yolo region loss + Softmax classification across meta-inputs

    The loss works on predictions from three feature maps (strides listed in
    ``feature_scale``) concatenated along the last axis. Box and confidence
    terms are evaluated per (image, meta-input) row; the classification term
    is a softmax over the meta-input axis at positions that hold exactly one
    object. All intermediate tensors are created on the GPU.
    """

    def __init__(self, num_classes=0, anchors=[], num_anchors=1, input_size=(832, 832)):
        """Store the loss hyper-parameters.

        Args:
            num_classes: Number of class channels per anchor.
            anchors: Flat list of anchor values; forward() reads 6 values per
                feature map.
            num_anchors: Number of anchors per cell and feature map.
            input_size: (height, width) of the network input.
        """
        super(RegionLossV2, self).__init__()
        self.num_classes = num_classes
        self.anchors = anchors
        self.num_anchors = num_anchors
        self.coord_scale = 1
        self.class_scale = 1
        self.obj_scale = 1
        self.noobj_scale = 100
        self.thresh = 0.5
        self.seen = 0
        self.input_size = input_size
        # Down-sampling factor of each of the three feature maps.
        self.feature_scale = [32, 16, 8]
        print('class_scale', self.class_scale)

    def forward(self, output, target):
        """Compute the loss and training metrics for one batch.

        Args:
            output: Predictions of shape (bs*cs, nA*(5+nC), N).
            target: Ground truth of shape (bs, cs, 50*5).

        Returns:
            Tuple ``(loss, box_loss, conf_loss, cls_loss, cls_acc, recall50,
            recall75, nProposals)``. ``loss`` is a tensor; the remaining
            entries are Python numbers.
        """
        # output : (bs*cs, nA*(5+1), N)
        # target : (bs, cs, 50*5)
        # Get all classification prediction
        # pdb.set_trace()
        bs = target.size(0)
        cs = target.size(1)
        nA = self.num_anchors
        nC = self.num_classes
        N = output.data.size(2)
        # feature_size = [[26, 26], [52, 52], [104, 104]]
        cls = output.view(output.size(0), nA, (5 + nC), N)
        # NOTE(review): the squeeze() followed by view(bs, cs, nA * N)
        # presumably relies on nC == 1 -- confirm.
        cls = cls.index_select(2, Variable(torch.linspace(5, 5 + nC - 1, nC).long().cuda())).squeeze()
        # One row per (image, anchor, position), one column per meta-input.
        cls = cls.view(bs, cs, nA * N).transpose(1, 2).contiguous().view(bs * nA * N, cs)
        cls_conf = F.softmax(cls, 1)
        _, cls_max_ids = torch.max(cls_conf, 1)
        cls_max_ids = cls_max_ids.data
        # One-hot mask of the arg-max meta-input, rearranged back to the
        # (bs*cs, nA, N) row layout of `output`.
        pre_cls_mask = torch.zeros(bs * nA * N, cs).cuda()
        pre_cls_mask[torch.linspace(0, bs * nA * N - 1, bs * nA * N).long().cuda(), cls_max_ids] = 1
        pre_cls_mask = pre_cls_mask.view(bs, nA * N, cs).transpose(1, 2).contiguous().view(bs * cs, nA, N)

        # Rearrange target and perform filtering operation
        target = target.view(-1, target.size(-1))
        # bef = target.size(0)
        output, target, inds = neg_filter_v2(output, target, withids=True)
        # Number of kept rows per image (each image owns cs consecutive rows).
        counts, _ = np.histogram(inds, bins=bs, range=(0, bs * cs))
        # print("{}/{}".format(target.size(0), bef))
        pre_cls_mask = pre_cls_mask[inds]

        t0 = time.time()
        nB = output.data.size(0)

        output = output.view(nB, nA, (5 + nC), N)  # (nB, nA, (5+nC), N)
        x = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).squeeze(2))  # (nB, nA, N)
        y = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).squeeze(2))
        w = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).squeeze(2)
        h = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).squeeze(2)
        conf = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).squeeze(2))
        # [nB, nA, nC, nW, nH] | (bs, 5, 1, 13, 13)
        # cls  = output.index_select(2, Variable(torch.linspace(5,5+nC-1,nC).long().cuda()))
        # cls  = cls.view(nB*nA, nC, nH*nW).transpose(1,2).contiguous().view(nB*nA*nH*nW, nC)
        t1 = time.time()

        # Decode predictions into boxes in input-pixel units. Grids, strides
        # and anchors are built per feature map and concatenated along N.
        pred_boxes = torch.cuda.FloatTensor(4, nB, nA, N)
        grid_x = []
        grid_y = []
        anchor_w = []
        anchor_h = []
        scale = []
        feature_size = []
        for fs in self.feature_scale:
            feature_h = self.input_size[0] // fs
            feature_w = self.input_size[1] // fs
            # print("feature_h:",feature_h)
            # print("feature_w:",feature_w)
            feature_size.append([feature_h, feature_w])
            grid_x.append(torch.linspace(0, feature_w - 1, feature_w).repeat(feature_h, 1) \
                          .repeat(nB * nA, 1, 1).view(nB, nA, feature_h * feature_w).cuda())
            grid_y.append(torch.linspace(0, feature_h - 1, feature_h).repeat(feature_w, 1).t() \
                          .repeat(nB * nA, 1, 1).view(nB, nA, feature_h * feature_w).cuda())
            scale.append((torch.ones(nB, nA, feature_h * feature_w) * fs).cuda())
        grid_x = torch.cat(grid_x, 2)  # (nB, nA, N)
        grid_y = torch.cat(grid_y, 2)
        scale = torch.cat(scale, 2)
        # Each feature map reads 6 consecutive anchor values, viewed as
        # (nA, -1) with column 0 = width and column 1 = height.
        # NOTE(review): the hard-coded 3 and 6 presumably assume three maps
        # with three (w, h) anchors each -- confirm.
        for i in range(3):
            aw = torch.Tensor(self.anchors[6 * i:6 * (i + 1)]).view(nA, -1) \
                .index_select(1, torch.LongTensor([0])).cuda()
            ah = torch.Tensor(self.anchors[6 * i:6 * (i + 1)]).view(nA, -1) \
                .index_select(1, torch.LongTensor([1])).cuda()
            anchor_w.append(aw.repeat(nB, feature_size[i][0] * feature_size[i][1]) \
                            .view(nB, nA, feature_size[i][0] * feature_size[i][1]))
            anchor_h.append(ah.repeat(nB, feature_size[i][0] * feature_size[i][1]) \
                            .view(nB, nA, feature_size[i][0] * feature_size[i][1]))
        anchor_w = torch.cat(anchor_w, 2)
        anchor_h = torch.cat(anchor_h, 2)
        pred_boxes[0] = (x.data + grid_x) * scale
        pred_boxes[1] = (y.data + grid_y) * scale
        pred_boxes[2] = torch.exp(w.data) * anchor_w
        pred_boxes[3] = torch.exp(h.data) * anchor_h
        pred_boxes = convert2cpu(pred_boxes.permute(1, 2, 3, 0).contiguous())  # (nB, nA, N, 4)
        t2 = time.time()
        nGT = 0
        iou_scores = []
        obj_mask = []
        noobj_mask = []
        tx = []
        ty = []
        tw = []
        th = []
        tconf = []
        tcls = []
        # Offset of the current feature map inside the concatenated N axis.
        start_N = 0
        detected50 = torch.zeros(0)
        detected75 = torch.zeros(0)
        # Build targets per feature map from the matching slice of the
        # predictions; nGT is overwritten by each call.
        for imap in range(3):
            nGT, iou_scores_temp, obj_mask_temp, noobj_mask_temp, tx_temp, ty_temp, tw_temp, th_temp, tconf_temp, \
            tcls_temp, detected50_temp, detected75_temp = build_targets(
                pred_boxes[:, :, start_N:start_N + feature_size[imap][0] * feature_size[imap][1], :],
                target.data.cuda(),
                conf[:, :, start_N:start_N + feature_size[imap][0] * feature_size[imap][1]],
                torch.Tensor(self.anchors[6 * imap:6 * (imap + 1)]).view(nA, -1).cuda(),
                nA,
                feature_size[imap],
                self.input_size,
                self.thresh)
            # Lazily allocate the per-box detection counters once nGT is known.
            if not len(detected50):
                detected50 = torch.zeros(nGT).cuda()
            if not len(detected75):
                detected75 = torch.zeros(nGT).cuda()
            # Accumulate detections across maps; thresholded to 0/1 later.
            detected50 += detected50_temp
            detected75 += detected75_temp
            start_N += feature_size[imap][0] * feature_size[imap][1]
            iou_scores.append(iou_scores_temp.view(nB, nA, feature_size[imap][0] * feature_size[imap][1]))
            obj_mask.append(obj_mask_temp.view(nB, nA, feature_size[imap][0] * feature_size[imap][1]))
            noobj_mask.append(noobj_mask_temp.view(nB, nA, feature_size[imap][0] * feature_size[imap][1]))
            tx.append(tx_temp.view(nB, nA, feature_size[imap][0] * feature_size[imap][1]))
            ty.append(ty_temp.view(nB, nA, feature_size[imap][0] * feature_size[imap][1]))
            tw.append(tw_temp.view(nB, nA, feature_size[imap][0] * feature_size[imap][1]))
            th.append(th_temp.view(nB, nA, feature_size[imap][0] * feature_size[imap][1]))
            tconf.append(tconf_temp.view(nB, nA, feature_size[imap][0] * feature_size[imap][1]))
            tcls.append(tcls_temp.view(nB, nA, feature_size[imap][0] * feature_size[imap][1]))

        # Stitch the per-map results back into (nB, nA, N) tensors.
        iou_scores = torch.cat(iou_scores, 2)
        obj_mask = torch.cat(obj_mask, 2)
        noobj_mask = torch.cat(noobj_mask, 2)
        tx = torch.cat(tx, 2)
        ty = torch.cat(ty, 2)
        tw = torch.cat(tw, 2)
        th = torch.cat(th, 2)
        tconf = torch.cat(tconf, 2)
        tcls = torch.cat(tcls, 2)

        # Take care of class mask
        # Sum object masks and class targets over the rows kept for each
        # image, giving one (nA, N) map per image.
        idx_start = 0
        cls_mask_list = []
        tcls_list = []
        for i in range(len(counts)):
            if counts[i] == 0:
                cur_mask = torch.zeros(nA, N).cuda()
                cur_tcls = torch.zeros(nA, N).cuda()
            else:
                cur_mask = torch.sum(obj_mask[idx_start:idx_start + counts[i]].float(), dim=0)
                cur_tcls = torch.sum(tcls[idx_start:idx_start + counts[i]], dim=0)
            cls_mask_list.append(cur_mask)
            tcls_list.append(cur_tcls)
            idx_start += counts[i]
        cls_mask = torch.stack(cls_mask_list)  # (bs, nA, N)
        tcls = torch.stack(tcls_list)

        # Only positions claimed by exactly one object row are classified.
        cls_mask = (cls_mask == 1)
        conf50 = (conf > 0.5).float().data
        iou50 = (iou_scores > 0.5).float()
        detected_mask = conf50 * tconf
        # precision is computed but not returned.
        precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
        detected50 = (detected50 > 0).float()
        detected75 = (detected75 > 0).float()
        recall50 = detected50.sum() / (nGT + 1e-16)
        recall75 = detected75.sum() / (nGT + 1e-16)
        nProposals = int((conf > 0.25).float().sum().item())
        tx = Variable(tx)
        ty = Variable(ty)
        tw = Variable(tw)
        th = Variable(th)
        tconf = Variable(tconf)

        obj_mask = Variable(obj_mask.bool())
        noobj_mask = Variable(noobj_mask.bool())
        # cls_mask   = Variable(cls_mask.view(-1, 1).repeat(1,cs).cuda())
        cls = cls[Variable(cls_mask.view(-1, 1).repeat(1, cs))].view(-1, cs)
        cls_max_ids = cls_max_ids[cls_mask.view(-1)]
        tcls = Variable(tcls[cls_mask].long())
        cls_acc = float(torch.sum(cls_max_ids == tcls.data)) / (cls_max_ids.numel() + 1e-16)

        ClassificationLoss = nn.CrossEntropyLoss()
        MseLoss = nn.MSELoss()
        BceLoss = nn.BCELoss()

        t3 = time.time()

        # Box terms use only object cells; confidence is split into object
        # and no-object parts that are weighted separately.
        loss_x = self.coord_scale * MseLoss(x[obj_mask], tx[obj_mask])
        loss_y = self.coord_scale * MseLoss(y[obj_mask], ty[obj_mask])
        loss_w = self.coord_scale * MseLoss(w[obj_mask], tw[obj_mask])
        loss_h = self.coord_scale * MseLoss(h[obj_mask], th[obj_mask])
        loss_conf_obj = BceLoss(conf[obj_mask], tconf[obj_mask])
        loss_conf_noobj = BceLoss(conf[noobj_mask], tconf[noobj_mask])
        loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
        # With no classified position the class term is a constant zero.
        if len(cls):
            loss_cls = self.class_scale * ClassificationLoss(cls, tcls)
        else:
            loss_cls = Variable(torch.Tensor([0]).float().cuda())

        # # pdb.set_trace()
        # ids = [9,11,12,16]
        # new_cls, new_tcls = select_classes(cls, tcls, ids)
        # new_tcls = Variable(torch.from_numpy(new_tcls).long().cuda())
        # loss_cls_new = self.class_scale * nn.CrossEntropyLoss(size_average=False)(new_cls, new_tcls)
        # loss_cls_new *= 10
        # loss_cls += loss_cls_new

        loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
        t4 = time.time()
        if False:
            print('-----------------------------------')
            print('        activation : %f' % (t1 - t0))
            print(' create pred_boxes : %f' % (t2 - t1))
            print('     build targets : %f' % (t3 - t2))
            print('       create loss : %f' % (t4 - t3))
            print('             total : %f' % (t4 - t0))
        # print(
        #     '%d: nGT %d, precision %f, recall50 %f, recall75 %f, cls_acc %f, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f' % \
        #     (self.seen, nGT, precision, recall50, recall75, cls_acc, loss_x.item(), loss_y.item(), \
        #      loss_w.item(), loss_h.item(), loss_conf.item(), loss_cls.item(), loss.item()))
        # print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, cls_new %f, total %f' % (self.seen, nGT, nCorrect, nProposals, loss_x.data[0], loss_y.data[0], loss_w.data[0], loss_h.data[0], loss_conf.data[0], loss_cls.data[0], loss_cls_new.data[0], loss.data[0]))
        # Second entry is the combined box loss (x + y + w + h) as a float.
        return loss,loss_x.item() + loss_y.item() + loss_w.item() + loss_h.item(),loss_conf.item(),loss_cls.item(),cls_acc,recall50.item(),recall75.item(),nProposals


def select_classes(pred, tgt, ids):
    """Restrict predictions and targets to a subset of classes.

    Keeps only the samples whose target class is listed in ``ids`` and
    re-labels each kept target with its position inside ``ids``.

    Args:
        pred: Tensor of per-class scores, one row per sample.
        tgt: 1-D tensor of class ids, one per sample.
        ids: Sequence of class ids to keep; also selects the score columns.

    Returns:
        ``(new_pred, new_tgt)``: ``new_pred`` holds the kept rows of ``pred``
        restricted to the columns in ``ids``; ``new_tgt`` is a NumPy integer
        array with the matching index into ``ids`` for every kept row.
    """
    # convert tgt to numpy
    tgt = tgt.cpu().data.numpy().reshape(-1)
    # matches[k, j] is True when sample j belongs to class ids[k]. Membership
    # is tracked separately from the new label, so samples of ids[0] (new
    # label 0) are kept as well.
    matches = tgt[None, :] == np.asarray(ids)[:, None]
    # Always a 1-D index array, so a single match keeps its row dimension.
    keep = np.nonzero(matches.any(axis=0))[0]
    new_tgt = matches.argmax(axis=0)[keep]
    new_pred = pred[keep]
    new_pred = new_pred[:, ids]
    return new_pred, new_tgt