I. Code and Results of Linear Regression Training
(1) Code
import random
import numpy as np
import torch


def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)  # samples are read in random order
    for i in range(0, num_examples, batch_size):
        j = torch.LongTensor(indices[i: min(i + batch_size, num_examples)])  # the last batch may be smaller
        yield features.index_select(0, j), labels.index_select(0, j)


def linreg(X, w, b):  # this function is saved in the d2lzh package for later use
    return torch.mm(X, w) + b


def squared_loss(y_hat, y):  # this function is saved in the d2lzh_pytorch package for later use
    return (y_hat - y.view(y_hat.size())) ** 2 / 2


def sgd(params, lr, batch_size):  # this function is saved in the d2lzh_pytorch package for later use
    for param in params:
        param.data -= lr * param.grad / batch_size  # note: param.data is used when updating param
        # print(param.grad.data)


if __name__ == '__main__':
    num_inputs = 2
    num_examples = 1000
    true_w = [2, -3.4]
    true_b = 4.2
    features = torch.randn(num_examples, num_inputs)
    labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
    labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float)

    batch_size = 10
    w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)), dtype=torch.float)
    b = torch.zeros(1)
    w.requires_grad_(requires_grad=True)
    b.requires_grad_(requires_grad=True)

    lr = 0.01
    num_epochs = 5
    net = linreg
    loss = squared_loss
    for epoch in range(num_epochs):  # training takes num_epochs epochs in total
        # In each epoch, every sample in the training set is used once
        # (assuming the number of samples is divisible by the batch size).
        # X and y are the features and labels of a mini-batch.
        for X, y in data_iter(batch_size, features, labels):
            # with torch.no_grad():
            #     # test block for w
            #     w1 = 0
            #     for i in range(10):
            #         w1 += ((X[i, 0] * w[0] + X[i, 1] * w[1]) + b - y[i]) * X[i, 0]
            #     print('--------------')
            #     print(w1)
            #
            #     # test block for b
            #     b1 = 0
            #     for i in range(10):
            #         b1 += ((X[i, 0] * w[0] + X[i, 1] * w[1]) + b - y[i])
            #     print(b1)
            l = loss(net(X, w, b), y).sum()  # l is the loss for the mini-batch X and y
            l.backward()  # compute the gradient of the mini-batch loss w.r.t. the model parameters
            sgd([w, b], lr, batch_size)  # update the model parameters with mini-batch SGD

            # don't forget to zero the gradients
            w.grad.data.zero_()
            b.grad.data.zero_()
        train_l = loss(net(features, w, b), labels)
        print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))
(2) Results
epoch 1, loss 2.210815
epoch 2, loss 0.294341
epoch 3, loss 0.039363
epoch 4, loss 0.005334
epoch 5, loss 0.000763
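After training, it is worth comparing the learned parameters with the true ones used to generate the data. This check is not part of the listing above; a minimal sketch that can be appended after the training loop, reusing the true_w, true_b, w and b defined there:

    # Compare learned parameters against the ground truth used to synthesize the data
    print(true_w, '\n', w)  # w should be close to [2, -3.4]
    print(true_b, '\n', b)  # b should be close to 4.2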
II. How the Gradients of w and b Are Computed in Backpropagation
- In the program above, the step sgd([w, b], lr, batch_size) calls the sgd function, which performs w.data -= lr * w.grad / batch_size and b.data -= lr * b.grad / batch_size.
Below is a detailed explanation of how w.grad and b.grad are computed:
- Let the learning rate be $\eta$ and let $|\mathcal{B}|$ denote batch_size.
- $w$ is a $2 \times 1$ matrix with entries $w_1, w_2$, and $b$ is a scalar. In each pass through the loop, $X$ is a $|\mathcal{B}| \times 2$ matrix with entries $x_j^{(i)}$, $i = 1, 2, \ldots, |\mathcal{B}|$, $j = 1, 2$, and $y$ is a $1 \times |\mathcal{B}|$ matrix (a vector of length $|\mathcal{B}|$) with entries $y^{(i)}$, $i = 1, 2, \ldots, |\mathcal{B}|$. The prediction $\hat{y} = Xw + b$ is written out explicitly after this list.
- The quantity computed by l = loss(net(X, w, b), y).sum() is derived below.
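Before differentiating, it may help to write the prediction out in full; this expansion is not in the original post, but it follows directly from the shapes defined above:

$$\hat{y} = Xw + b = \begin{bmatrix} x_1^{(1)} & x_2^{(1)} \\ \vdots & \vdots \\ x_1^{(|\mathcal{B}|)} & x_2^{(|\mathcal{B}|)} \end{bmatrix}\begin{bmatrix} w_1 \\ w_2 \end{bmatrix} + b = \begin{bmatrix} x_1^{(1)} w_1 + x_2^{(1)} w_2 + b \\ \vdots \\ x_1^{(|\mathcal{B}|)} w_1 + x_2^{(|\mathcal{B}|)} w_2 + b \end{bmatrix}$$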
1) Partial derivatives of $l$ with respect to $w_1, w_2$
- From the definition of $loss$:
$$loss = \frac{1}{2}(\hat{y}-y)^{2} = \frac{1}{2}\left[(Xw+b)-y\right]^{2}$$
- We obtain $l$ (where $X_i$ denotes the $i$-th row of $X$):
$$l = \sum_{i \in \mathcal{B}} loss = \frac{1}{2}\sum_{i \in \mathcal{B}} \left[(X_{i}w+b)-y^{(i)}\right]^{2}$$
- Using the chain rule, the partial derivative of the per-sample $loss$ with respect to $w_1$ is:
$$\frac{\partial loss}{\partial w_{1}} = \frac{\partial loss}{\partial \hat{y}} \frac{\partial \hat{y}}{\partial w_{1}} = \left[(x_1^{(i)} w_1 + x_2^{(i)} w_2 + b) - y^{(i)}\right]x_{1}^{(i)}$$
- The partial derivative of $l$ with respect to $w_1$:
$$\frac{\partial l}{\partial w_{1}} = \sum_{i \in \mathcal{B}}\frac{\partial loss}{\partial w_{1}} = \sum_{i \in \mathcal{B}} \left(x_1^{(i)} w_1 + x_2^{(i)} w_2 + b - y^{(i)}\right) x_{1}^{(i)}$$
- The partial derivative with respect to $w_2$ is computed in the same way:
$$\frac{\partial l}{\partial w_{2}} = \sum_{i \in \mathcal{B}}\frac{\partial loss}{\partial w_{2}} = \sum_{i \in \mathcal{B}} \left(x_1^{(i)} w_1 + x_2^{(i)} w_2 + b - y^{(i)}\right) x_{2}^{(i)}$$
2) Partial derivative of $l$ with respect to $b$
- The partial derivative of the per-sample $loss$ with respect to $b$:
$$\frac{\partial loss}{\partial b} = \frac{\partial loss}{\partial \hat{y}} \frac{\partial \hat{y}}{\partial b} = \left(x_1^{(i)} w_1 + x_2^{(i)} w_2 + b\right) - y^{(i)}$$
- The partial derivative of $l$ with respect to $b$:
$$\frac{\partial l}{\partial b} = \sum_{i \in \mathcal{B}} \left(x_1^{(i)} w_1 + x_2^{(i)} w_2 + b - y^{(i)}\right)$$
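Stacking the per-sample terms, the three sums above can also be written compactly in matrix form (this compact form is not spelled out in the original derivation; here $y$ is treated as a $|\mathcal{B}| \times 1$ column vector and $\mathbf{1}$ is the all-ones vector of length $|\mathcal{B}|$):

$$\frac{\partial l}{\partial w} = X^{\top}\left(Xw + b - y\right), \qquad \frac{\partial l}{\partial b} = \mathbf{1}^{\top}\left(Xw + b - y\right)$$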
3) Update formulas for $w_1, w_2, b$
- With learning rate $\eta$ and batch size $|\mathcal{B}|$, the values of $w_1$, $w_2$ and $b$ after the backpropagation update are:
$$\begin{aligned} w_1 &\leftarrow w_1 - \frac{\eta}{|\mathcal{B}|} \frac{\partial l}{\partial w_{1}} = w_1 - \frac{\eta}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} \left(x_1^{(i)} w_1 + x_2^{(i)} w_2 + b - y^{(i)}\right) x_{1}^{(i)},\\ w_2 &\leftarrow w_2 - \frac{\eta}{|\mathcal{B}|} \frac{\partial l}{\partial w_{2}} = w_2 - \frac{\eta}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} \left(x_1^{(i)} w_1 + x_2^{(i)} w_2 + b - y^{(i)}\right) x_{2}^{(i)},\\ b &\leftarrow b - \frac{\eta}{|\mathcal{B}|} \frac{\partial l}{\partial b} = b - \frac{\eta}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} \left(x_1^{(i)} w_1 + x_2^{(i)} w_2 + b - y^{(i)}\right). \end{aligned}$$
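The same update can be written with explicit tensor operations. A minimal sketch (not from the original post) that mirrors what l.backward() followed by sgd([w, b], lr, batch_size) does for one mini-batch, reusing the X, y, w, b, lr and batch_size from the script:

    with torch.no_grad():
        err = X.mm(w) + b - y.view(-1, 1)  # (Xw + b) - y, shape (|B|, 1)
        grad_w = X.t().mm(err)             # dl/dw, shape (2, 1)
        grad_b = err.sum()                 # dl/db, a scalar
        w -= lr * grad_w / batch_size      # same effect as w.data -= lr * w.grad / batch_size
        b -= lr * grad_b / batch_size      # same effect as b.data -= lr * b.grad / batch_size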
III. Testing and Verifying the Theory
- Based on the theory above, the backpropagation values for $w_1$, $w_2$ and $b$ are computed by hand and compared against the gradients stored in w.grad and b.grad, to check whether the results agree.
(1) Relevant Code Block
with torch.no_grad():
    # test block for w: accumulate the gradient terms by hand over the mini-batch
    w1 = 0
    w2 = 0
    for i in range(10):  # batch_size = 10
        w1 += ((X[i, 0] * w[0] + X[i, 1] * w[1]) + b - y[i]) * X[i, 0]
        w2 += ((X[i, 0] * w[0] + X[i, 1] * w[1]) + b - y[i]) * X[i, 1]
    print('--------------')
    print(w1, w2)

    # test block for b
    b1 = 0
    for i in range(10):
        b1 += ((X[i, 0] * w[0] + X[i, 1] * w[1]) + b - y[i])
    print(b1)
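The per-sample loop above follows the derivation term by term. The same check can also be done in vectorized form; a sketch (not in the original post) that should print the same values as w1, w2 and b1:

    with torch.no_grad():
        err = X.mm(w) + b - y.view(-1, 1)  # per-sample residuals, shape (batch_size, 1)
        print(X.t().mm(err))               # should equal w.grad (a 2x1 tensor)
        print(err.sum())                   # should equal b.grad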
(2) Results and Analysis
- Reading the output: in each group, the first three tensors are the hand-computed backpropagation partial derivatives for $w_1$, $w_2$ and $b$. The remaining output is w.grad (a 2×1 tensor containing both components) and b.grad, printed by the print(param.grad.data) call inside sgd.
- The hand-computed partial derivatives for $w_1$, $w_2$ and $b$, namely -20.3094, 21.7094 and -46.1437, match the values in w.grad and b.grad (-20.3094, 21.7094, -46.1437) exactly, so the derivation is completely correct. Wonderful!
--------------
tensor([-20.3094]) tensor([21.7094])
tensor([-46.1437])
tensor([[-20.3094],
[ 21.7094]])
tensor([-46.1437])
--------------
tensor([-26.9922]) tensor([8.3662])
tensor([-31.1934])
tensor([[-26.9922],
[ 8.3662]])
tensor([-31.1934])
--------------
tensor([-18.5115]) tensor([74.9838])
tensor([-47.7214])
tensor([[-18.5115],
[ 74.9838]])
tensor([-47.7214])
(3) Complete Test Code
import random
import numpy as np
import torch


def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)  # samples are read in random order
    for i in range(0, num_examples, batch_size):
        j = torch.LongTensor(indices[i: min(i + batch_size, num_examples)])  # the last batch may be smaller
        yield features.index_select(0, j), labels.index_select(0, j)


def linreg(X, w, b):  # this function is saved in the d2lzh package for later use
    return torch.mm(X, w) + b


def squared_loss(y_hat, y):  # this function is saved in the d2lzh_pytorch package for later use
    return (y_hat - y.view(y_hat.size())) ** 2 / 2


def sgd(params, lr, batch_size):  # this function is saved in the d2lzh_pytorch package for later use
    for param in params:
        param.data -= lr * param.grad / batch_size  # note: param.data is used when updating param
        print(param.grad.data)  # print w.grad and b.grad for comparison with the hand-computed values


if __name__ == '__main__':
    num_inputs = 2
    num_examples = 1000
    true_w = [2, -3.4]
    true_b = 4.2
    features = torch.randn(num_examples, num_inputs)
    labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
    labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float)

    batch_size = 10
    w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)), dtype=torch.float)
    b = torch.zeros(1)
    w.requires_grad_(requires_grad=True)
    b.requires_grad_(requires_grad=True)

    lr = 0.01
    num_epochs = 5
    net = linreg
    loss = squared_loss
    for epoch in range(num_epochs):  # training takes num_epochs epochs in total
        # In each epoch, every sample in the training set is used once
        # (assuming the number of samples is divisible by the batch size).
        # X and y are the features and labels of a mini-batch.
        for X, y in data_iter(batch_size, features, labels):
            with torch.no_grad():
                # test block for w: hand-compute the gradient terms over the mini-batch
                w1 = 0
                w2 = 0
                for i in range(10):  # batch_size = 10
                    w1 += ((X[i, 0] * w[0] + X[i, 1] * w[1]) + b - y[i]) * X[i, 0]
                    w2 += ((X[i, 0] * w[0] + X[i, 1] * w[1]) + b - y[i]) * X[i, 1]
                print('--------------')
                print(w1, w2)

                # test block for b
                b1 = 0
                for i in range(10):
                    b1 += ((X[i, 0] * w[0] + X[i, 1] * w[1]) + b - y[i])
                print(b1)
            l = loss(net(X, w, b), y).sum()  # l is the loss for the mini-batch X and y
            l.backward()  # compute the gradient of the mini-batch loss w.r.t. the model parameters
            sgd([w, b], lr, batch_size)  # update the model parameters with mini-batch SGD

            # don't forget to zero the gradients
            w.grad.data.zero_()
            b.grad.data.zero_()
        train_l = loss(net(features, w, b), labels)
        print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))
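Rather than comparing the printed tensors by eye, the comparison can be automated with torch.allclose. A hypothetical addition (not in the original code): place it inside the inner loop after l.backward() and before the gradients are zeroed, so that w.grad and b.grad are still populated:

    # manual_w_grad / manual_b_grad are built from the hand-computed w1, w2, b1 above
    manual_w_grad = torch.stack([w1, w2]).view(2, 1)
    manual_b_grad = b1
    assert torch.allclose(manual_w_grad, w.grad, atol=1e-4)
    assert torch.allclose(manual_b_grad, b.grad, atol=1e-4)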
IV. References
CNN反向传播 (CNN backpropagation) - an article that explains very clearly how the values of w and b are updated during backpropagation.