An MLP for binary classification with softmax regression, using a single sample (batch size 1) for one forward pass and one backward pass.
Deriving MLP backpropagation by hand
First, describe the network with mathematical formulas, using the cross-entropy loss function.
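As a sketch of the setup (notation chosen here to match the code below: input $\mathbf{x}\in\mathbb{R}^2$, three hidden units, two output classes), the forward pass and the loss are

$$
\mathbf{z}^{(1)} = W^{(1)}\mathbf{x} + \mathbf{b}^{(1)},\quad
\mathbf{h} = \mathrm{LeakyReLU}\big(\mathbf{z}^{(1)}\big),\quad
\mathbf{o} = W^{(2)}\mathbf{h} + \mathbf{b}^{(2)},\quad
\hat{\mathbf{y}} = \mathrm{softmax}(\mathbf{o}),\quad
L = -\sum_{k} y_k \log \hat{y}_k .
$$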
Forward pass. Note, for example, that the element 0.0675 falls in the $\mathrm{LeakyReLU}(x) = x$ branch; during backpropagation the activation function must be differentiated and the result multiplied element-wise, so at the position of this 0.0675 element $\mathrm{LeakyReLU}'(x) = 1$.
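Concretely, with negative slope $\alpha$ ($\alpha = 0.1$ in the code below),

$$
\mathrm{LeakyReLU}(x)=\begin{cases}x, & x \ge 0\\ \alpha x, & x < 0\end{cases}
\qquad
\mathrm{LeakyReLU}'(x)=\begin{cases}1, & x > 0\\ \alpha, & x < 0\end{cases}
$$

and since all three hidden pre-activations in this example (0.0675, 0.1225, 0.1775) are positive, the element-wise derivative is 1 at every position.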
Because the chain rule is used, the derivative can be computed segment by segment and the pieces multiplied at the end; pay attention to whether each quantity being differentiated is a scalar, a vector, or a matrix.
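For example (the standard result for softmax followed by cross-entropy, with one-hot target $\mathbf{y}$), the chain for $W^{(2)}$ collapses to

$$
\frac{\partial L}{\partial \mathbf{o}} = \hat{\mathbf{y}} - \mathbf{y}
\quad\Rightarrow\quad
\frac{\partial L}{\partial W^{(2)}} = (\hat{\mathbf{y}} - \mathbf{y})\,\mathbf{h}^{\top},
\qquad
\frac{\partial L}{\partial \mathbf{h}} = W^{(2)\top}(\hat{\mathbf{y}} - \mathbf{y}),
$$

where $L$ is a scalar, $\mathbf{o}$, $\hat{\mathbf{y}}$ and $\mathbf{h}$ are vectors, and $W^{(2)}$ is a matrix, which is exactly why the shape of each segment matters.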
Backward pass. Mainly $W^{(1)}$ and $W^{(2)}$ are updated. Note that when the activation function is piecewise (for example LeakyReLU), the derivative at each element is either 1 or $\alpha$, depending on which branch the pre-activation falls in.
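Continuing the chain one more segment (the element-wise factor below is where the 1-versus-$\alpha$ choice enters) and applying plain SGD with learning rate $\eta = 0.1$ gives

$$
\frac{\partial L}{\partial W^{(1)}} = \Big(\mathrm{LeakyReLU}'\big(\mathbf{z}^{(1)}\big)\odot W^{(2)\top}(\hat{\mathbf{y}}-\mathbf{y})\Big)\,\mathbf{x}^{\top},
\qquad
W \leftarrow W - \eta\,\frac{\partial L}{\partial W}.
$$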
Verifying with PyTorch autograd
Compare the updated weights derived by hand above with the weights obtained from PyTorch's automatic differentiation.
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
torch.set_printoptions(precision=6)
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.w1 = nn.Linear(2, 3)    # 2 inputs -> 3 hidden units; weights set by hand below
        self.w1.weight.data = torch.Tensor([[0.15, 0.2], [0.25, 0.3], [0.35, 0.4]])
        self.w1.bias.data = torch.Tensor([0.04, 0.08, 0.12])
        # self.activate = nn.ReLU()
        self.activate = nn.LeakyReLU(negative_slope=0.1)
        # self.activate = nn.Sigmoid()
        self.w2 = nn.Linear(3, 2)    # 3 hidden units -> 2 classes; weights set by hand below
        self.w2.weight.data = torch.Tensor([[0.45, 0.5, 0.55], [0.6, 0.65, 0.7]])
        self.w2.bias.data = torch.Tensor([0.16, 0.20])
        self.softmax = nn.Softmax(dim=0)

    def forward(self, x):
        h = self.activate(self.w1(x))
        o = self.w2(h)
        y = self.softmax(o)
        return h, o, y
# def init_weights(m):
# if type(m) == nn.Linear:
# nn.init.normal_(m.weight, std=0.01)
# nn.init.zeros_(m.bias)
def unrandom(seed=2022):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
unrandom()
net = MLP()
# net.apply(init_weights)
optimizer = optim.SGD(net.parameters(), lr=0.1)
loss = nn.CrossEntropyLoss()
# CrossEntropyLoss uses the natural logarithm (base e)
unrandom()
x = torch.tensor([0.05, 0.10], requires_grad=True)
y = torch.tensor([1], requires_grad=False)
print(x, y)
tensor([0.050000, 0.100000], requires_grad=True) tensor([1])
h, o, y_pred = net(x)
o.unsqueeze_(0)
print(h, o, y_pred)
tensor([0.067500, 0.122500, 0.177500], grad_fn=<LeakyReluBackward0>) tensor([[0.349250, 0.444375]], grad_fn=<UnsqueezeBackward1>) tensor([0.476237, 0.523763], grad_fn=<SoftmaxBackward>)
l = loss(o, y)
l
tensor(0.646715, grad_fn=<NllLossBackward>)
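Since CrossEntropyLoss combines log-softmax and negative log-likelihood, this value is simply $-\ln \hat{y}_1 = -\ln(0.523763) \approx 0.6467$ for target class 1.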
print(net.state_dict())
optimizer.zero_grad()
l.backward(retain_graph=True)
OrderedDict([('w1.weight', tensor([[0.150000, 0.200000],
[0.250000, 0.300000],
[0.350000, 0.400000]])), ('w1.bias', tensor([0.040000, 0.080000, 0.120000])), ('w2.weight', tensor([[0.450000, 0.500000, 0.550000],
[0.600000, 0.650000, 0.700000]])), ('w2.bias', tensor([0.160000, 0.200000]))])
optimizer.step()
print(net.state_dict())
OrderedDict([('w1.weight', tensor([[0.150357, 0.200714],
[0.250357, 0.300714],
[0.350357, 0.400714]])), ('w1.bias', tensor([0.047144, 0.087144, 0.127144])), ('w2.weight', tensor([[0.446785, 0.494166, 0.541547],
[0.603215, 0.655834, 0.708453]])), ('w2.bias', tensor([0.112376, 0.247624]))])
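As a quick cross-check (a standalone sketch using only the numbers printed above), the $W^{(2)}$ update can be reproduced from $\hat{\mathbf{y}}$, $\mathbf{h}$ and the learning rate:

import torch
# values printed above: hidden activations h, softmax output y_hat, target class 1
h = torch.tensor([0.0675, 0.1225, 0.1775])
y_hat = torch.tensor([0.476237, 0.523763])
# dL/do = y_hat - y_onehot, hence dL/dW2 = (y_hat - y_onehot) h^T
grad_w2 = torch.stack([(y_hat[0] - 0.0) * h, (y_hat[1] - 1.0) * h])
w2_old = torch.tensor([[0.45, 0.5, 0.55], [0.6, 0.65, 0.7]])
print(w2_old - 0.1 * grad_w2)  # ≈ the updated w2.weight printed above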
Backpropagation in an MLP with dropout
Mathematical description and forward pass of the MLP with dropout. The weights and other parameters are the same as above; the only difference is an added dropout layer.
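During training, PyTorch's nn.Dropout uses inverted dropout: each element is zeroed with probability $p$ and the survivors are scaled by $1/(1-p)$,

$$
d_i = \frac{m_i\, h_i}{1-p}, \qquad m_i \sim \mathrm{Bernoulli}(1-p),
$$

so with $p = 0.75$ every kept activation is multiplied by 4 (e.g. $0.0675 \times 4 = 0.27$), which matches the tensor d printed below.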
Backward pass:
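Backpropagating through the dropout layer reuses the same mask and scale, so dropped units receive zero gradient:

$$
\frac{\partial L}{\partial W^{(2)}} = (\hat{\mathbf{y}} - \mathbf{y})\,\mathbf{d}^{\top},
\qquad
\frac{\partial L}{\partial \mathbf{h}} = \frac{\mathbf{m}}{1-p} \odot W^{(2)\top}(\hat{\mathbf{y}} - \mathbf{y}).
$$

As a result, the row of $W^{(1)}$ feeding a dropped unit and the column of $W^{(2)}$ reading from it are left unchanged by the update, as the state_dict printed at the end confirms.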
Verifying with PyTorch autograd
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
torch.set_printoptions(precision=6)
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.w1 = nn.Linear(2, 3)    # 2 inputs -> 3 hidden units; weights set by hand below
        self.w1.weight.data = torch.Tensor([[0.15, 0.2], [0.25, 0.3], [0.35, 0.4]])
        self.w1.bias.data = torch.Tensor([0.04, 0.08, 0.12])
        self.activate = nn.LeakyReLU(negative_slope=0.1)
        self.dropout = nn.Dropout(p=0.75)
        self.w2 = nn.Linear(3, 2)    # 3 hidden units -> 2 classes; weights set by hand below
        self.w2.weight.data = torch.Tensor([[0.45, 0.5, 0.55], [0.6, 0.65, 0.7]])
        self.w2.bias.data = torch.Tensor([0.16, 0.20])
        self.softmax = nn.Softmax(dim=0)

    def forward(self, x):
        h = self.activate(self.w1(x))
        d = self.dropout(h)          # dropout applied to the hidden activations
        o = self.w2(d)
        y = self.softmax(o)
        return h, d, o, y
# def init_weights(m):
# if type(m) == nn.Linear:
# nn.init.normal_(m.weight, std=0.01)
# nn.init.zeros_(m.bias)
def unrandom(seed=2022):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
unrandom()
net = MLP()
# net.apply(init_weights)
optimizer = optim.SGD(net.parameters(), lr=0.1)
loss = nn.CrossEntropyLoss()
# CrossEntropyLoss uses the natural logarithm (base e)
unrandom()
x = torch.tensor([0.05, 0.10], requires_grad=True)
y = torch.tensor([1], requires_grad=False)
print(x, y)
tensor([0.050000, 0.100000], requires_grad=True) tensor([1])
h, d, o, y_pred = net(x)
o.unsqueeze_(0)
print(h, d, o, y_pred)
tensor([0.067500, 0.122500, 0.177500], grad_fn=<LeakyReluBackward0>) tensor([0.270000, 0.490000, 0.000000], grad_fn=<MulBackward0>) tensor([[0.526500, 0.680500]], grad_fn=<UnsqueezeBackward1>) tensor([0.461576, 0.538424], grad_fn=<SoftmaxBackward>)
l = loss(o, y)
l
tensor(0.619109, grad_fn=<NllLossBackward>)
net
MLP(
(w1): Linear(in_features=2, out_features=3, bias=True)
(activate): LeakyReLU(negative_slope=0.1)
(dropout): Dropout(p=0.75, inplace=False)
(w2): Linear(in_features=2, out_features=2, bias=True)
(softmax): Softmax(dim=0)
)
print(net.state_dict())
optimizer.zero_grad()
l.backward(retain_graph=True)
OrderedDict([('w1.weight', tensor([[0.150000, 0.200000],
[0.250000, 0.300000],
[0.350000, 0.400000]])), ('w1.bias', tensor([0.040000, 0.080000, 0.120000])), ('w2.weight', tensor([[0.450000, 0.500000, 0.550000],
[0.600000, 0.650000, 0.700000]])), ('w2.bias', tensor([0.160000, 0.200000]))])
optimizer.step()
print(net.state_dict())
OrderedDict([('w1.weight', tensor([[0.151385, 0.202769],
[0.251385, 0.302769],
[0.350000, 0.400000]])), ('w1.bias', tensor([0.067695, 0.107695, 0.120000])), ('w2.weight', tensor([[0.437537, 0.477383, 0.550000],
[0.612463, 0.672617, 0.700000]])), ('w2.bias', tensor([0.113842, 0.246158]))])
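To see this directly from autograd (a sketch; run it any time after l.backward(), since SGD's step() does not clear .grad), the per-parameter gradients show zeros exactly where the dropped hidden unit sits:

for name, p in net.named_parameters():
    print(name, p.grad)  # here the third hidden unit was dropped, so the third row of w1.weight and third column of w2.weight gradients are zero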