DenseNet
相比于ResNet,DenseNet把最后的+改为concat,然后为了解决output_channels增长过快的问题使用了transition_blocks每次使用1x1 Conv来减小output_channels以及用avg_pooling来减半高和宽。
代码
import d2lzh as d2l
from mxnet import gluon, init, nd, autograd
from mxnet.gluon import nn
import time
def conv_block(num_channels):
blk = nn.Sequential()
blk.add(nn.BatchNorm(), nn.Activation('relu'),
nn.Conv2D(num_channels, kernel_size=3, padding=1))
return blk
class DenseBlock(nn.Block):
def __init__(self, num_convs, num_channels, **kwargs):
super(DenseBlock, self).__init__(**kwargs)
self.net = nn.Sequential()
for _ in range(num_convs):
self.net.add(conv_block(num_channels))
def forward(self, X):
for blk in self.net:
Y = blk(X)
X = nd.concat(X, Y, dim=1) # 在通道维上将输入和输出连结
return X
def transition_block(num_channels):
blk = nn.Sequential()
blk.add(nn.BatchNorm(), nn.Activation('relu'),
nn.Conv2D(num_channels, kernel_size=1),
nn.AvgPool2D(pool_size=2, strides=2))
return blk
# 首先是跟ResNet一样的1x1 Conv和max_pooling
net = nn.Sequential()
net.add(nn.Conv2D(64, kernel_size=7, strides=2, padding=3),
nn.BatchNorm(), nn.Activation('relu'),
nn.MaxPool2D(pool_size=3, strides=2, padding=1))
num_channels, growth_rate = 64, 32 # num_channels为当前的通道数
num_convs_in_dense_blocks = [4, 4, 4, 4]
for i, num_convs in enumerate(num_convs_in_dense_blocks):
net.add(DenseBlock(num_convs, growth_rate))
# 上一个稠密块的输出通道数
num_channels += num_convs * growth_rate
# 在稠密块之间加入通道数减半的过渡层
if i != len(num_convs_in_dense_blocks) - 1:
num_channels //= 2
net.add(transition_block(num_channels))
# 最后接上全局池化层和dense
net.add(nn.BatchNorm(), nn.Activation('relu'), nn.GlobalAvgPool2D(),
nn.Dense(10))
batch_size = 256
# 构建数据集,将原来28x28的图片放大到96x96
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
ctx = d2l.try_gpu()
net.initialize(ctx=ctx, init=init.Xavier())
print('training on', ctx)
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})
for epoch in range(3):
train_loss_sum = 0
train_acc_sum = 0
n = 0
start = time.time()
for X, y in train_iter:
X, y = X.as_in_context(ctx), y.as_in_context(ctx)
with autograd.record():
y_hat = net(X)
loss = softmax_cross_entropy(y_hat, y).sum()
loss.backward()
trainer.step(batch_size)
y = y.astype('float32')
train_loss_sum += loss.asscalar()
train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
n += y.size
test_acc = d2l.evaluate_accuracy(test_iter, net, ctx)
print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
% (epoch + 1, train_loss_sum / n, train_acc_sum / n, test_acc, time.time() - start))
结果
training on gpu(0)
epoch 1, loss 0.5401, train acc 0.808, test acc 0.881, time 93.8 sec
epoch 2, loss 0.3137, train acc 0.886, test acc 0.892, time 88.2 sec
epoch 3, loss 0.2632, train acc 0.904, test acc 0.878, time 87.1 sec