dropout最早出现在AlexNet中,它对CNN的发展起到了一定程度的推进作用,本章我们从0开始来实现一个dropout方法。
实现dropout:
def dropout(X, drop_probability):
    """Apply inverted dropout to ``X``.

    Each element is zeroed independently with probability
    ``drop_probability``; the surviving elements are multiplied by
    ``1 / (1 - drop_probability)`` so that the expected value of the result
    equals ``X``.

    Args:
        X: Array to drop elements from. It must expose ``shape`` and
            ``zeros_like()`` and support element-wise multiplication.
        drop_probability: Probability of zeroing an element, in ``[0, 1]``.

    Returns:
        An array with the same shape as ``X``.

    Raises:
        ValueError: If ``drop_probability`` is outside ``[0, 1]``.
    """
    # A value outside [0, 1] would not fail below; it would silently give an
    # all-zero or wrongly scaled result, so reject it up front.
    if not 0 <= drop_probability <= 1:
        raise ValueError(
            "drop_probability must be in [0, 1], got %r" % (drop_probability,)
        )
    keep_prob = 1 - drop_probability  # probability of keeping an element
    if keep_prob == 0:
        # Everything is dropped; returning early also avoids dividing by zero.
        return X.zeros_like()
    # A uniform sample in [0, 1) is below keep_prob with probability
    # keep_prob: the mask is 1 for kept elements and 0 for dropped ones.
    mask = nd.random.uniform(low=0, high=1, shape=X.shape) < keep_prob
    # Rescale the survivors so that E[dropout(X)] == X.
    scale = 1 / keep_prob
    return mask * X * scale
运行一个实例看看:
1)概率为0:
# Run a small example: a 5x4 matrix holding the values 0..19.
A=nd.arange(20).reshape((5,4))
print(A)
print("==========")
print(dropout(A,0)) # drop probability 0 means dropout is not applied
结果:
2)概率为0.5:
print(dropout(A,0.5)) # with probability 0.5, each element is zeroed with a 50% chance
结果:
每个元素有一半的可能性会变成0,剩下的值全部乘以2。
3)当概率为1.0时:
print(dropout(A,1.0)) # drop probability 1.0: dropout() returns X.zeros_like()
全部元素均变为0。
加入dropout的MLP
下面我们实现一个MLP,不过这个MLP引入了dropout方法,具体的变化主要在全连接层中加入了dropout:
drop_out1 = 0.2  # dropout is only used during training
drop_out2 = 0.5


def net(x, dropout1, dropout2):
    """Forward pass of a three-layer MLP with dropout after each hidden layer.

    Args:
        x: Input batch; it is reshaped to ``(-1, num_input)``.
        dropout1: Drop probability applied to the first hidden layer output.
        dropout2: Drop probability applied to the second hidden layer output.
            Pass 0 for both to disable dropout (evaluation / prediction).

    Returns:
        The output-layer result, with no activation applied.
    """
    x = x.reshape(-1, num_input)
    # First fully connected layer, followed by dropout.
    h1 = relu(nd.dot(x, w1) + b1)
    h1 = dropout(h1, dropout1)
    # Second fully connected layer. The relu is required: without it this
    # layer and the output layer are both purely linear, so the extra hidden
    # layer would add no non-linearity to the model.
    h2 = relu(nd.dot(h1, w2) + b2)
    h2 = dropout(h2, dropout2)
    # Output layer: the last layer is normally left without an activation.
    output = nd.dot(h2, w3) + b3
    return output
剩下的代码基本上与MXNET深度学习框架-07-从0开始实现多层感知机(MLP)一致。下面放上所有代码(dropout只在模型训练时使用!!!):
import mxnet.gluon as gn
import mxnet.autograd as ag
import mxnet.ndarray as nd
def dropout(X, drop_probability):
    """Apply inverted dropout to ``X``.

    Each element is zeroed independently with probability
    ``drop_probability``; the surviving elements are multiplied by
    ``1 / (1 - drop_probability)`` so that the expected value of the result
    equals ``X``.

    Args:
        X: Array to drop elements from. It must expose ``shape`` and
            ``zeros_like()`` and support element-wise multiplication.
        drop_probability: Probability of zeroing an element, in ``[0, 1]``.

    Returns:
        An array with the same shape as ``X``.

    Raises:
        ValueError: If ``drop_probability`` is outside ``[0, 1]``.
    """
    # A value outside [0, 1] would not fail below; it would silently give an
    # all-zero or wrongly scaled result, so reject it up front.
    if not 0 <= drop_probability <= 1:
        raise ValueError(
            "drop_probability must be in [0, 1], got %r" % (drop_probability,)
        )
    keep_prob = 1 - drop_probability  # probability of keeping an element
    if keep_prob == 0:
        # Everything is dropped; returning early also avoids dividing by zero.
        return X.zeros_like()
    # A uniform sample in [0, 1) is below keep_prob with probability
    # keep_prob: the mask is 1 for kept elements and 0 for dropped ones.
    mask = nd.random.uniform(low=0, high=1, shape=X.shape) < keep_prob
    # Rescale the survivors so that E[dropout(X)] == X.
    scale = 1 / keep_prob
    return mask * X * scale
#
# # 运行一个实例看看
# A=nd.arange(20).reshape((5,4))
# print(A)
# print("==========")
# # print(dropout(A,0)) # 当概率为0,就是不使用dropout时
#
# # print(dropout(A,0.5))
# print(dropout(A,1.0))
'''---模型训练实例(引入dropout)---'''
# 下面的代码完全引入<从0开始的MLP>代码,只是多加了一层隐藏层
def transform(data, label):
    """Cast a sample to float32 and normalise the data by 1/255.

    Args:
        data: Array-like with an ``astype`` method (the image data).
        label: Array-like with an ``astype`` method (the labels).

    Returns:
        A ``(data, label)`` tuple: ``data`` as float32 divided by 255, and
        ``label`` as float32.
    """
    scaled = data.astype("float32") / 255
    target = label.astype("float32")
    return scaled, target
# Load the Fashion-MNIST training and test splits through Gluon.
mnist_train = gn.data.vision.FashionMNIST(train=True)
mnist_test = gn.data.vision.FashionMNIST(train=False)
# Take the first nine training samples to look at the data.
data, label = mnist_train[0:9]
print(data.shape, label) # inspect the data dimensions
import matplotlib.pyplot as plt
def show_image(image):
    """Display a batch of images side by side in a single row.

    Args:
        image: Batch of images. ``image.shape[0]`` is the number of images;
            each ``image[i]`` must be reshapeable to ``(28, 28)`` and
            provide ``asnumpy()``.
    """
    n = image.shape[0]
    # squeeze=False always yields a 2-D array of axes. With the default,
    # plt.subplots returns a bare Axes object when n == 1, and indexing it
    # fails.
    _, figs = plt.subplots(1, n, figsize=(15, 15), squeeze=False)
    for i in range(n):
        figs[0, i].imshow(image[i].reshape((28, 28)).asnumpy())
    plt.show()
def get_fashion_mnist_labels(labels):
    """Translate numeric class ids into their clothing names.

    Args:
        labels: Iterable of values convertible with ``int()``; each is used
            as an index into the list of ten class names.

    Returns:
        A list with the text label for every entry of ``labels``.
    """
    names = (
        't-shirt', 'trouser', 'pullover', 'dress', 'coat',
        'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot',
    )
    return [names[class_id] for class_id in map(int, labels)]
#
# show_image(data)
# print(get_fashion_mnist_labels(label))
'''----数据读取----'''
batch_size = 100  # number of samples per mini-batch
# NOTE(review): `transformer` is not referenced again in the code shown here;
# batches are normalised by transform() instead. Confirm before removing.
transformer = gn.data.vision.transforms.ToTensor()
# Only the training loader shuffles its samples.
train_data = gn.data.DataLoader(dataset=mnist_train, batch_size=batch_size, shuffle=True)
test_data = gn.data.DataLoader(dataset=mnist_test, batch_size=batch_size, shuffle=False)
'''----初始化模型参数----'''
# Layer sizes: flattened 28x28x1 input, two hidden layers, 10 outputs.
num_input = 28 * 28 * 1
num_output = 10
num_hidden1 = 256  # number of units in the first hidden layer
num_hidden2 = 128  # number of units in the second hidden layer

# Weights are drawn with nd.random_normal(scale=0.01); biases start at zero.
w1 = nd.random_normal(scale=0.01, shape=(num_input, num_hidden1))
b1 = nd.zeros(shape=num_hidden1)
w2 = nd.random_normal(scale=0.01, shape=(num_hidden1, num_hidden2))
b2 = nd.zeros(shape=num_hidden2)
w3 = nd.random_normal(scale=0.01, shape=(num_hidden2, num_output))
b3 = nd.zeros(shape=num_output)
params = [w1, b1, w2, b2, w3, b3]

for param in params:
    # Attach a gradient buffer so backward() can store each parameter's grad.
    param.attach_grad()
# Activation function.
def relu(x):
    """Return the element-wise maximum of 0 and ``x`` (the ReLU activation)."""
    lower_bound = 0
    return nd.maximum(lower_bound, x)
'''----定义模型----'''
# The model chains fully connected layers with relu (dropout is used here).
drop_out1 = 0.2 # dropout is only used during training
drop_out2 = 0.5
def net(x, dropout1, dropout2):
    """Forward pass of a three-layer MLP with dropout after each hidden layer.

    Args:
        x: Input batch; it is reshaped to ``(-1, num_input)``.
        dropout1: Drop probability applied to the first hidden layer output.
        dropout2: Drop probability applied to the second hidden layer output.
            Pass 0 for both to disable dropout (evaluation / prediction).

    Returns:
        The output-layer result, with no activation applied.
    """
    x = x.reshape(-1, num_input)
    # First fully connected layer, followed by dropout.
    h1 = relu(nd.dot(x, w1) + b1)
    h1 = dropout(h1, dropout1)
    # Second fully connected layer. The relu is required: without it this
    # layer and the output layer are both purely linear, so the extra hidden
    # layer would add no non-linearity to the model.
    h2 = relu(nd.dot(h1, w2) + b2)
    h2 = dropout(h2, dropout2)
    # Output layer: the last layer is normally left without an activation.
    output = nd.dot(h2, w3) + b3
    return output
# Softmax and cross-entropy loss.
# Computing the two separately is numerically unstable (compare the results of
# the previous two posts), so use the API that Gluon provides directly.
cross_loss=gn.loss.SoftmaxCrossEntropyLoss()
# Accuracy metric.
def accuracy(output, label):
    """Return the fraction of rows whose arg-max along axis 1 equals ``label``.

    Args:
        output: Model output; the predicted class is ``argmax(axis=1)``.
        label: Labels compared element-wise with the predictions.

    Returns:
        The mean of the element-wise matches, converted with ``asscalar()``.
    """
    predicted = output.argmax(axis=1)
    hits = predicted == label
    return nd.mean(hits).asscalar()
def evaluate_accuracy(data_iter, net):
    """Compute the mean per-batch accuracy of ``net`` over ``data_iter``.

    Args:
        data_iter: Iterable of ``(data, label)`` batches that supports
            ``len()``.
        net: Model callable taking ``(data, dropout1, dropout2)``.

    Returns:
        The sum of the batch accuracies divided by ``len(data_iter)``.
    """
    total = 0
    for batch, target in data_iter:
        batch, target = transform(batch, target)
        # Dropout must be switched off (probability 0) when evaluating.
        prediction = net(batch, dropout1=0, dropout2=0)
        total += accuracy(prediction, target)
    return total / len(data_iter)
# Gradient-descent optimiser.
def SGD(params, lr):
    """Update every parameter in place with one gradient-descent step.

    Args:
        params: Iterable of parameters; each exposes ``grad`` and supports
            slice assignment.
        lr: Step size multiplied with each gradient.
    """
    for weight in params:
        # Move against the gradient; slice assignment keeps the same object.
        step = lr * weight.grad
        weight[:] = weight - step
# Training: hyper-parameters followed by the epoch loop.
lr = 0.1
epochs = 20
for epoch in range(epochs):
    # Running sums of the per-batch loss and accuracy for this epoch.
    train_loss = 0
    train_acc = 0
    for image, y in train_data:
        # Cast to float32 and normalise the data.
        image, y = transform(image, y)
        with ag.record():
            # Dropout is active only here, in the training forward pass.
            output = net(image, dropout1=drop_out1, dropout2=drop_out2)
            loss = cross_loss(output, y)
        loss.backward()
        # Average the gradient over the batch so that the learning rate is
        # less sensitive to batch_size.
        SGD(params, lr / batch_size)
        train_loss += nd.mean(loss).asscalar()
        train_acc += accuracy(output, y)
    test_acc = evaluate_accuracy(test_data, net)
    print(
        "Epoch %d, Loss:%f, Train acc:%f, Test acc:%f"
        % (
            epoch,
            train_loss / len(train_data),
            train_acc / len(train_data),
            test_acc,
        )
    )
'''----预测-------'''
# Once training has finished, the model can predict labels for samples.
image_10,label_10=mnist_test[:10] # take the first 10 test samples
show_image(image_10)
print("真实样本标签:",label_10)
print("真实数字标签对应的服饰名:",get_fashion_mnist_labels(label_10))
# Apply the same cast and normalisation that was used during training.
image_10,label_10=transform(image_10,label_10)
# Dropout is disabled (probability 0) for prediction.
predict_label=net(image_10,dropout1=0,dropout2=0).argmax(axis=1)
print("预测样本标签:",predict_label.astype("int8"))
print("预测数字标签对应的服饰名:",get_fashion_mnist_labels(predict_label.asnumpy()))
训练过程: