# 基于MXNet gluon 的SSD模型训练

## 2. 数据集制作

## 3.模型训练

### 3.1 训练代码

import mxnet as mx
import matplotlib.pyplot as plt
import os.path as osp
import mxnet.image as image
from mxnet import gluon
from mxnet import nd
from mxnet.contrib.ndarray import MultiBoxPrior
from mxnet.gluon import nn
# step 1: generate anchor boxes for every pixel of an n x n input
n = 40
# shape: batch x channel x height x width
x = nd.random_uniform(shape=(1, 3, n, n))
y = MultiBoxPrior(x, sizes=[.5, .25, .1], ratios=[1, 2, .5])
# Regroup the anchors so they can be indexed as (row, column, anchor index);
# each anchor is stored in the format (x_min, y_min, x_max, y_max).
boxes = y.reshape((n, n, -1, 4))
# boxes[20, 20, 0] is the first anchor box generated for the pixel at (20, 20)
print('The first anchor box at row 21, column 21:', boxes[20, 20, 0, :])
# step 2
def box_to_rect(box, color, linewidth=3):
    """Build an unfilled matplotlib rectangle that outlines an anchor box.

    Args:
        box: array-like exposing ``asnumpy()`` whose first four entries are
            read as (x_min, y_min, x_max, y_max).
        color: edge colour handed to matplotlib.
        linewidth: outline width, 3 by default.

    Returns:
        A ``plt.Rectangle`` anchored at (x_min, y_min).
    """
    coords = box.asnumpy()
    origin = (coords[0], coords[1])
    width = coords[2] - coords[0]
    height = coords[3] - coords[1]
    return plt.Rectangle(origin, width, height,
                         fill=False, edgecolor=color, linewidth=linewidth)
# Draw every anchor box generated for the pixel at (20, 20) on a blank canvas.
colors = ['blue', 'green', 'red', 'black', 'magenta']
plt.imshow(nd.ones((n, n, 3)).asnumpy())
anchors = boxes[20, 20, :, :]
for i in range(anchors.shape[0]):
    # NOTE(review): the loop body was missing from the file, so nothing was
    # drawn. Anchor coordinates are presumably relative to the image size
    # (see the MultiBoxPrior docs), hence the scaling by n -- confirm.
    plt.gca().add_patch(box_to_rect(anchors[i, :] * n, colors[i]))
plt.show()
# step 3
def class_predictor(num_anchors, num_classes):
    """Create the convolution layer that predicts classes.

    The layer has ``num_anchors * (num_classes + 1)`` output channels; the
    extra slot per anchor is presumably the background class. It uses a
    3x3 kernel with padding 1.
    """
    out_channels = num_anchors * (num_classes + 1)
    return nn.Conv2D(out_channels, 3, padding=1)
# Demo: 5 anchors and 10 classes give 5 * (10 + 1) = 55 output channels.
cls_pred = class_predictor(5, 10)
cls_pred.initialize()
# dummy batch of two 3-channel 20x20 inputs
x = nd.zeros((2, 3, 20, 20))
print('Class prediction', cls_pred(x).shape)
# step 4
def box_predictor(num_anchors):
    """Create the convolution layer that predicts delta locations.

    Four output channels are produced per anchor, using a 3x3 kernel with
    padding 1.
    """
    out_channels = num_anchors * 4
    return nn.Conv2D(out_channels, 3, padding=1)
# Demo: 10 anchors give 10 * 4 = 40 output channels (4 values per anchor).
box_pred = box_predictor(10)
box_pred.initialize()
# dummy batch of two 3-channel 20x20 inputs
x = nd.zeros((2, 3, 20, 20))
print('Box prediction', box_pred(x).shape)
# step 5
def down_sample(num_filters):
    """Stack two Conv-BatchNorm-Relu blocks and then a pooling layer
    to halve the feature size.

    Args:
        num_filters: number of output channels of each convolution.

    Returns:
        An ``nn.HybridSequential`` holding the stacked layers.
    """
    out = nn.HybridSequential()
    for _ in range(2):
        # NOTE(review): these layers were missing from the file and are
        # restored to match the docstring; the 3x3 kernel with padding 1
        # mirrors the predictor layers -- confirm the hyper-parameters.
        out.add(nn.Conv2D(num_filters, 3, strides=1, padding=1))
        out.add(nn.BatchNorm(in_channels=num_filters))
        out.add(nn.Activation('relu'))
    # 2x2 max pooling halves the spatial size
    out.add(nn.MaxPool2D(2))
    return out
# Demo: compare the input shape with the shape after one down-sampling block.
blk = down_sample(10)
blk.initialize()
# dummy batch of two 3-channel 20x20 inputs
x = nd.zeros((2, 3, 20, 20))
print('Before', x.shape, 'after', blk(x).shape)
# step 6: class predictions at two different feature-map scales
# a certain feature map with 20x20 spatial shape
feat1 = nd.zeros((2, 8, 20, 20))
print('Feature map 1', feat1.shape)
cls_pred1 = class_predictor(5, 10)
cls_pred1.initialize()
y1 = cls_pred1(feat1)
print('Class prediction for feature map 1', y1.shape)
# down-sample feat1 to obtain a second feature map
ds = down_sample(16)
ds.initialize()
feat2 = ds(feat1)
print('Feature map 2', feat2.shape)
# the second scale uses a different number of anchors (3 instead of 5)
cls_pred2 = class_predictor(3, 10)
cls_pred2.initialize()
y2 = cls_pred2(feat2)
print('Class prediction for feature map 2', y2.shape)
# step 7
def flatten_prediction(pred):
    """Move axis 1 of a 4-D prediction to the end, then flatten it.

    The axes are permuted with ``(0, 2, 3, 1)`` before ``nd.flatten`` is
    applied to the result.
    """
    channels_last = nd.transpose(pred, axes=(0, 2, 3, 1))
    return nd.flatten(channels_last)
def concat_predictions(preds):
    """Concatenate a sequence of prediction arrays along axis 1."""
    merged = nd.concat(*preds, dim=1)
    return merged
# Demo: flatten the class predictions of both scales, then concatenate them
# along axis 1 so they form a single array.
flat_y1 = flatten_prediction(y1)
print('Flatten class prediction 1', flat_y1.shape)
flat_y2 = flatten_prediction(y2)
print('Flatten class prediction 2', flat_y2.shape)
print('Concat class predictions', concat_predictions([flat_y1, flat_y2]).shape)
# step 8
def body():
    """Return the body network.

    The network is an ``nn.HybridSequential`` made of one down-sampling
    stage per entry of the filter list (16, 32 and 64 filters).
    """
    out = nn.HybridSequential()
    for nfilters in [16, 32, 64]:
        # NOTE(review): the loop body was missing from the file; one
        # down_sample stage per filter count is restored -- confirm.
        out.add(down_sample(nfilters))
    return out
# Demo: build and initialize the body network; x is a dummy batch of two
# 3-channel 256x256 inputs (only used by the commented-out print below).
bnet = body()
bnet.initialize()
x = nd.zeros((2, 3, 256, 256))
#print('Body network', [y.shape for y in bnet(x)])
# step 9
def toy_ssd_model(num_anchors, num_classes):
    """Return SSD modules.

    Args:
        num_anchors: anchors per position, passed to both predictors.
        num_classes: number of object classes, passed to the class predictor.

    Returns:
        Tuple ``(body, downsamples, class_preds, box_preds)``.
        ``toy_ssd_forward`` indexes ``downsamples[0..2]`` and
        ``class_preds`` / ``box_preds`` at ``[0..4]``, so three
        down-sampling stages and five predictor pairs are created.
    """
    downsamples = nn.Sequential()
    class_preds = nn.Sequential()
    box_preds = nn.Sequential()
    # NOTE(review): the code that fills these containers was missing from the
    # file. The 128-filter down-sampling stages are reconstructed from the
    # upstream Gluon SSD tutorial -- confirm the filter count.
    for _ in range(3):
        downsamples.add(down_sample(128))
    for scale in range(5):
        class_preds.add(class_predictor(num_anchors, num_classes))
        box_preds.add(box_predictor(num_anchors))
    return body(), downsamples, class_preds, box_preds
#print(toy_ssd_model(5, 2))
# step 10
def toy_ssd_forward(x, body, downsamples, class_preds, box_preds, sizes, ratios):
    """Run the SSD forward pass over five feature scales.

    At each scale the anchors, the flattened box predictions and the
    flattened class predictions are collected, then the feature map is
    reduced for the next scale.

    Returns:
        Three lists with one entry per scale, in the order
        ``(default_anchors, predicted_classes, predicted_boxes)``.
    """
    # extract features with the body network
    feature = body(x)

    anchors_by_scale = []
    boxes_by_scale = []
    classes_by_scale = []
    for scale in range(5):
        anchors_by_scale.append(
            MultiBoxPrior(feature, sizes=sizes[scale], ratios=ratios[scale]))
        boxes_by_scale.append(flatten_prediction(box_preds[scale](feature)))
        classes_by_scale.append(flatten_prediction(class_preds[scale](feature)))

        # compute the input to the next scale
        if scale < 3:
            feature = downsamples[scale](feature)
        elif scale == 3:
            # simply use the pooling layer
            feature = nd.Pooling(feature, global_pool=True, pool_type='max', kernel=(4, 4))

    return anchors_by_scale, classes_by_scale, boxes_by_scale
# step 11
class ToySSD(gluon.Block):
    """Toy single-shot detector built from ``toy_ssd_model`` and run through
    ``toy_ssd_forward``.

    ``forward`` returns ``(anchors, class_preds, box_preds)``, each
    concatenated across the feature scales.
    """

    def __init__(self, num_classes, **kwargs):
        super(ToySSD, self).__init__(**kwargs)
        self.num_classes = num_classes
        # anchor box sizes, one pair per feature scale (5 scales)
        self.anchor_sizes = [
            [.2, .272],
            [.37, .447],
            [.54, .619],
            [.71, .79],
            [.88, .961],
        ]
        # anchor box ratios, the same three ratios at each of the 5 scales
        self.anchor_ratios = [[1, 2, .5]] * 5
        with self.name_scope():
            # 4 anchors per position -- presumably len(sizes) + len(ratios) - 1
            # as generated by MultiBoxPrior; confirm.
            modules = toy_ssd_model(4, num_classes)
            self.body, self.downsamples, self.class_preds, self.box_preds = modules

    def forward(self, x):
        """Return concatenated anchors, class predictions and box predictions."""
        scale_anchors, scale_classes, scale_boxes = toy_ssd_forward(
            x, self.body, self.downsamples, self.class_preds, self.box_preds,
            self.anchor_sizes, self.anchor_ratios)
        # concatenate anchors, class predictions and box predictions from the
        # different layers
        anchors = concat_predictions(scale_anchors)
        box_preds = concat_predictions(scale_boxes)
        flat_classes = concat_predictions(scale_classes)
        # reshape class predictions to (batch, -1, num_classes + 1) for the
        # softmax computation
        class_preds = nd.reshape(flat_classes, shape=(0, -1, self.num_classes + 1))
        return anchors, class_preds, box_preds
# step 12: instantiate the network for 2 classes and print the output shapes
net = ToySSD(2)
net.initialize()
# dummy batch with one 3-channel 256x256 input
x = nd.zeros((1, 3, 256, 256))
default_anchors, class_predictions, box_predictions = net(x)
print('Outputs:', 'anchors', default_anchors.shape, 'class prediction', class_predictions.shape, 'box prediction', box_predictions.shape)
# step 13: data loading settings
data_shape = 512  # the iterators are built with data_shape=(3, data_shape, data_shape)
batch_size = 4  # batch size passed to both iterators and to trainer.step()
def get_iterators(data_shape, batch_size):
    """Create the training and validation detection iterators.

    Both iterators read ``.lst`` image lists below the dataset root and share
    the same batch size, data shape and ``mean=True`` setting.

    Args:
        data_shape: edge length used in ``(3, data_shape, data_shape)``.
        batch_size: number of samples per batch.

    Returns:
        Tuple ``(train_iter, val_iter, class_names, num_class)``.
    """
    class_names = ['ship']
    shared_args = dict(
        batch_size=batch_size,
        data_shape=(3, data_shape, data_shape),
        mean=True,
    )
    # The image-list inputs are used here; the record-file alternative stays
    # disabled:
    #path_imgrec='./data/pikachu_train.rec',
    #path_imgidx='./data/pikachu_train.idx',
    train_iter = image.ImageDetIter(
        path_imglist='F:/interest_of_imags_for_recognation/UTS_Release/normal_data/train.lst',
        path_root='F:/interest_of_imags_for_recognation/UTS_Release/normal_data//',
        **shared_args)
    val_iter = image.ImageDetIter(
        path_imglist='F:/interest_of_imags_for_recognation/UTS_Release/normal_data/val.lst',
        path_root='F:/interest_of_imags_for_recognation/UTS_Release/normal_data/',
        **shared_args)
    return train_iter, val_iter, class_names, len(class_names)

# Build the iterators from the module-level settings and fetch one training
# batch for inspection.
train_data, test_data, class_names, num_class = get_iterators(data_shape, batch_size)
batch = train_data.next()
print(batch)
# Show the first image of the batch together with its labelled boxes.
import numpy as np

img = batch.data[0][0].asnumpy()  # grab the first image, convert to numpy array
img = img.transpose((1, 2, 0))  # we want channel to be the last dimension
# add the same per-channel constants that preprocess() subtracts as the mean
img += np.array([123, 117, 104])
img = img.astype(np.uint8)  # use uint8 (0-255)
# draw bounding boxes on image
for label in batch.label[0][0].asnumpy():
    # the first row with a negative leading entry ends the loop
    # (presumably padding rows -- confirm)
    if label[0] < 0:
        break
    print(label)
    # label[1:5] is scaled by data_shape to get pixel coordinates
    xmin, ymin, xmax, ymax = [int(x * data_shape) for x in label[1:5]]
    rect = plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, fill=False, edgecolor=(1, 0, 0), linewidth=3)
    # the rectangle was created but never attached to the axes, so no box
    # was ever shown
    plt.gca().add_patch(rect)
plt.imshow(img)
plt.show()
#
from mxnet.contrib.ndarray import MultiBoxTarget
def training_targets(default_anchors, class_predicts, labels):
    """Compute the training targets for the anchors with ``MultiBoxTarget``.

    Args:
        default_anchors: anchors returned by the network.
        class_predicts: class predictions; axes 1 and 2 are swapped before
            they are handed to ``MultiBoxTarget``.
        labels: ground-truth labels of the batch.

    Returns:
        Tuple ``(box_target, box_mask, cls_target)``. The function previously
        returned nothing, although its caller unpacks these three values.
    """
    class_predicts = nd.transpose(class_predicts, axes=(0, 2, 1))
    z = MultiBoxTarget(default_anchors, labels, class_predicts)
    box_target = z[0]  # box offset target for (x, y, width, height)
    box_mask = z[1]  # mask is used to ignore box offsets we don't want to penalize, e.g. negative samples
    cls_target = z[2]  # cls_target is an array of labels for all anchors boxes
    return box_target, box_mask, cls_target
class FocalLoss(gluon.loss.Loss):
    """Focal loss for classification.

    Computes ``-alpha * (1 - pt) ** gamma * log(pt)``, where ``pt`` is the
    softmax probability picked at the label index, and averages it over
    every axis except the batch axis.

    Args:
        axis: axis holding the class scores; used for both softmax and pick.
        alpha: scaling factor of the loss.
        gamma: exponent applied to ``(1 - pt)``.
        batch_axis: axis that represents the mini-batch.
    """

    def __init__(self, axis=-1, alpha=0.25, gamma=2, batch_axis=0, **kwargs):
        super(FocalLoss, self).__init__(None, batch_axis, **kwargs)
        self._axis = axis
        self._alpha = alpha
        self._gamma = gamma

    def hybrid_forward(self, F, output, label):
        # Normalize along the configured class axis. The softmax previously
        # always used its default axis, which only matched `pick` when
        # axis == -1.
        output = F.softmax(output, axis=self._axis)
        pt = F.pick(output, label, axis=self._axis, keepdims=True)
        loss = -self._alpha * ((1 - pt) ** self._gamma) * F.log(pt)
        return F.mean(loss, axis=self._batch_axis, exclude=True)

# Classification loss: the focal loss is used; the plain softmax
# cross-entropy alternative is left disabled.
# cls_loss = gluon.loss.SoftmaxCrossEntropyLoss()
cls_loss = FocalLoss()
print(cls_loss)
class SmoothL1Loss(gluon.loss.Loss):
    """Masked smooth-L1 loss.

    The difference between output and label is multiplied by ``mask`` before
    ``smooth_l1`` (scalar 1.0) is applied; the result is averaged over every
    axis except the batch axis.
    """

    def __init__(self, batch_axis=0, **kwargs):
        super(SmoothL1Loss, self).__init__(None, batch_axis, **kwargs)

    def hybrid_forward(self, F, output, label, mask):
        masked_diff = (output - label) * mask
        per_element = F.smooth_l1(masked_diff, scalar=1.0)
        return F.mean(per_element, self._batch_axis, exclude=True)

# Box regression loss, plus the two metrics reported during training.
box_loss = SmoothL1Loss()
print(box_loss)
cls_metric = mx.metric.Accuracy()  # accuracy of the class predictions
box_metric = mx.metric.MAE()  # measure absolute difference between prediction and target
### Set context for training
ctx = mx.gpu()  # it may take too long to train using CPU
try:
    # probe the GPU; an MXNetError here triggers the CPU fallback below
    _ = nd.zeros(1, ctx=ctx)
    # pad label for cuda implementation
    #train_data.reshape(label_shape=(3, 5))
    train_data = test_data.sync_label_shape(train_data)
except mx.base.MXNetError:
    print('No GPU enabled, fall back to CPU, sit back and be patient...')
    ctx = mx.cpu()

# training schedule
epochs = 10  # set larger to get better performance
log_interval = 1000

# network, initialized on the selected device
net = ToySSD(num_class)
net.initialize(mx.init.Xavier(magnitude=2), ctx=ctx)
net.collect_params().reset_ctx(ctx)

# plain SGD with weight decay
trainer = gluon.Trainer(
    net.collect_params(), 'sgd', {'learning_rate': 0.1, 'wd': 5e-4})
def verified(file_path, sha1hash):
    """Return True when the SHA-1 hex digest of ``file_path`` equals ``sha1hash``.

    The file is read in 1 MiB chunks so a large checkpoint is never loaded
    into memory at once.
    """
    import hashlib

    digest = hashlib.sha1()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(1048576), b''):
            digest.update(chunk)
    return digest.hexdigest() == sha1hash


from_scratch = True  # set to False to resume from the pretrained checkpoint
if from_scratch:
    start_epoch = 0
else:
    from mxnet.test_utils import download

    # NOTE(review): with start_epoch = 148 the training loop only runs when
    # `epochs` is larger than 148 -- confirm the intended schedule.
    start_epoch = 148
    pretrained = 'ssd_pretrained.params'
    sha1 = 'fbb7d872d76355fff1790d864c2238decdb452bc'
    url = 'https://apache-mxnet.s3-accelerate.amazonaws.com/gluon/models/ssd_pikachu-fbb7d872.params'
    # NOTE(review): the body of this check and the parameter loading were
    # missing from the file, and `verified` was undefined. They are
    # reconstructed from the variables defined here -- confirm that this
    # checkpoint matches the network being trained.
    if not osp.exists(pretrained) or not verified(pretrained, sha1):
        print('Downloading', pretrained)
        download(url, fname=pretrained, overwrite=True)
    net.load_params(pretrained, ctx)
import time
from mxnet import autograd as ag
# Main training loop: one pass over train_data per epoch.
for epoch in range(start_epoch, epochs):
    # reset iterator and tick
    train_data.reset()
    cls_metric.reset()
    box_metric.reset()
    tic = time.time()
    # iterate through all batch
    for i, batch in enumerate(train_data):
        btic = time.time()
        # record the forward pass so gradients can be computed
        with ag.record():
            x = batch.data[0].as_in_context(ctx)
            y = batch.label[0].as_in_context(ctx)
            default_anchors, class_predictions, box_predictions = net(x)
            box_target, box_mask, cls_target = training_targets(default_anchors, class_predictions, y)
            # losses
            loss1 = cls_loss(class_predictions, cls_target)
            # the box loss was missing, which left `loss2` undefined below
            loss2 = box_loss(box_predictions, box_target, box_mask)
            # sum all losses
            loss = loss1 + loss2
            # backpropagate
            loss.backward()
        # apply
        trainer.step(batch_size)
        # update metrics
        cls_metric.update([cls_target], [nd.transpose(class_predictions, (0, 2, 1))])
        # the box metric was reported but never updated; masked predictions
        # are compared with the box targets
        box_metric.update([box_target], [box_predictions * box_mask])
        if (i + 1) % log_interval == 0:
            name1, val1 = cls_metric.get()
            name2, val2 = box_metric.get()
            print('[Epoch %d Batch %d] speed: %f samples/s, training: %s=%f, %s=%f'
                  % (epoch, i, batch_size / (time.time() - btic), name1, val1, name2, val2))

    # end of epoch logging
    name1, val1 = cls_metric.get()
    name2, val2 = box_metric.get()
    print('[Epoch %d] training: %s=%f, %s=%f' % (epoch, name1, val1, name2, val2))
    print('[Epoch %d] time cost: %f' % (epoch, time.time() - tic))

# we can save the trained parameters to disk
net.save_params('ssd_%d.params' % epochs)

### 3.2 模型测试

import numpy as np
import cv2
def preprocess(image):
    """Take an image and apply the preprocessing used before inference.

    The image is resized to ``data_shape`` x ``data_shape``, its channel
    order is reversed (BGR to RGB), it is converted to float32, and the
    per-channel mean (123, 117, 104) is subtracted. It is then rearranged to
    [batch-channel-height-width] and returned as an MXNet ndarray.
    """
    resized = cv2.resize(image, (data_shape, data_shape))
    # swap BGR to RGB and convert to float before subtracting the mean
    rgb = resized[:, :, (2, 1, 0)].astype(np.float32)
    rgb -= np.array([123, 117, 104])
    # channel first, then a leading batch axis
    chw = np.transpose(rgb, (2, 0, 1))
    batched = chw[np.newaxis, :]
    return nd.array(batched)

# NOTE(review): no test image is loaded in the visible code before this call,
# so `image` appears to still refer to the `mxnet.image` module imported at the
# top of the file -- presumably an image should be read first (e.g. with cv2);
# confirm.
x = preprocess(image)
print('x', x.shape)
# run the network on the device selected for training
anchors, cls_preds, box_preds = net(x.as_in_context(ctx))
#print('anchors', anchors)
#print('class predictions', cls_preds)
#print('box delta predictions', box_preds)
from mxnet.contrib.ndarray import MultiBoxDetection
# convert predictions to probabilities using softmax
cls_probs = nd.SoftmaxActivation(nd.transpose(cls_preds, (0, 2, 1)), mode='channel')
# apply shifts to anchors boxes, non-maximum-suppression, etc...
output = MultiBoxDetection(*[cls_probs, box_preds, anchors], force_suppress=True, clip=False)
#print(output)
def display(img, out, thresh=0.5):
import random
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (10,10)
pens = dict()
plt.clf()
plt.imshow(img)
for det in out:
cid = int(det[0])
if cid < 0:
continue
score = det[1]
if score < thresh:
continue
if cid not in pens:
pens[cid] = (random.random(), random.random(), random.random())
scales = [img.shape[1], img.shape[0]] * 2
xmin, ymin, xmax, ymax = [int(p * s) for p, s in zip(det[2:6].tolist(), scales)]
rect = plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, fill=False,
edgecolor=pens[cid], linewidth=3)