只有conv里面的变量有梯度,其他的都没有
loss.backward() 每进行一次,假如梯度不归零的话,weights里面的梯度是累加的,
假如Loss是一个数组,mean和sum的区别就是一个尺度因子,就是数组的总个数
在loss中减去常数对梯度无任何影响:常数项的导数为零,反向传播时会被自动忽略。比如 x**2-10000000 与 x**2 的梯度完全相同。
单次迭代中,不同batch中的梯度,也是累加的。具体看qq的值
# Demo: the gradient PyTorch stores in w.weight.grad after one backward()
# matches the hand-derived chain-rule formula (qq / qqq below).
import torch  # fix: torch was used here before the file's first import (L27)

x = torch.randn(2, 2, 1, 1)
x1 = x  # alias to the first batch (a reference, not a copy)
w = torch.nn.Conv2d(2, 2, (1, 1))
y1 = w(x)
y = y1.mean()  # mean over all 4 output elements (2 samples x 2 channels)
gt = 1
de = gt - y
loss = de ** 2 - 100  # the -100 constant has zero derivative, so no gradient effect
loss.backward()
# Hand-derived d(loss)/d(w.weight[0, ic, 0, 0]):
#   dy/dw[0,ic] = (x[0,ic] + x[1,ic]) / 4 (mean over 4 outputs),
#   d(loss)/dy = 2*de*(-1)  -> multiply and sum the two batch samples.
qq = (de * 2 * -1 * x[0, 0, 0, 0]) / 4 + (de * 2 * -1 * x[1, 0, 0, 0]) / 4
qqq = (de * 2 * -1 * x[0, 1, 0, 0]) / 4 + (de * 2 * -1 * x[1, 1, 0, 0]) / 4
# Run a second batch through the SAME conv without zeroing gradients:
# backward() ADDS this batch's gradient onto what is already in w.weight.grad.
x = torch.randn(2, 2, 1, 1)
x2 = x  # alias to the second batch for later inspection
y1 = w(x)
y = y1.mean()
gt = 1
de = gt - y
loss = de ** 2 - 100
loss.backward()
# Analytic totals after two backward passes: this batch's contribution
# plus qq / qqq carried over from the first pass.
qq2 = sum((de * 2 * -1 * x[n, 0, 0, 0]) / 4 for n in range(2)) + qq
qqq2 = sum((de * 2 * -1 * x[n, 1, 0, 0]) / 4 for n in range(2)) + qqq
# Exploration: gradient of a Huber-like loss (de**2 below 1, de above)
# through a 3x3 conv on a 4x4 input, compared against hand-written sums.
import torch
x = torch.randn(1,1,4,4)
x1 = x
w = torch.nn.Conv2d(1,1,(3,3))
# 3x3 conv on 4x4 input -> 2x2 output (4 elements).
y1 = w(x)
# y = y1.mean()
gt = 1
de = torch.abs(gt-y1)
# loss = (de**2).mean()
cond = de<1
# Piecewise loss per element: de**2 where de < 1, otherwise de; then mean.
loss = torch.where(cond, (de**2), de).mean()
loss.backward()
# qq = (de*2*-1*x[0,0,0,0])/4 + (de*2*-1*x[1,0,0,0])/4
# qqq = (de*2*-1*x[0,1,0,0])/4+ (de*2*-1*x[1,1,0,0])/4
# Hand-written candidate for d(loss)/d(w.weight[0,0,0,0]) (top-left kernel tap):
# each output (i, j) multiplies that tap with x[0,0,i,j].
# NOTE(review): this drops the -sign(gt - y1) factor from d|gt-y1|/dy1 and
# assumes every element takes the de<1 branch, so it matches
# w.weight.grad[0,0,0,0] only in those cases — confirm numerically.
qq = 1/4 * 2 * (de[0,0,0,0] * x[0,0,0,0] + de[0,0,0,1] * x[0,0,0,1]+ \
de[0,0,1,0] * x[0,0,1,0] + de[0,0,1,1] * x[0,0,1,1])
# Same candidate for kernel tap (0, 1): input columns shift right by one.
qq2 = 1/4 * 2 * (de[0,0,0,0] * x[0,0,0,1] + de[0,0,0,1] * x[0,0,0,2]+ \
de[0,0,1,0] * x[0,0,1,1] + de[0,0,1,1] * x[0,0,1,2])
# qq = 1/4 * 2 * (de[0,0,0,0] * x[0,0,0,0] + de[0,0,0,1] * x[0,0,0,1]+ \
# de[0,0,1,0] * x[0,0,1,0] + de[0,0,1,1] * x[0,0,1,1])
print (2)
# NOTE(review): the expression below is computed and discarded (no assignment)
# — leftover notebook exploration; it has no effect when run as a script.
1/4 * (2*de[0,0,0,0] * x[0,0,0,0] + x[0,0,0,1]+ \
2*de[0,0,1,0] * x[0,0,1,0] + x[0,0,1,1])
假如用sgd,就是普通的梯度下降,weight更新:net.w.weight - lr*net.w.weight.grad(例如 lr=0.25 时即 net.w.weight - 0.25*net.w.weight.grad),
需要说明的是:scheduler1.last_epoch初始为0,进行一次scheduler1.step(),scheduler1.last_epoch加1,假如scheduler1.last_epoch=milestone1中的一个值,那么学习率会减半,
其中scheduler1.get_lr()[0]有bug,只是显示问题:虽然下面代码打印出0.01,但其实仍用0.1的学习率做更新;想查看真实学习率应使用 scheduler1.get_last_lr()[0] 或 optimizer1.param_groups[0]['lr']。
import torch
class Network(torch.nn.Module):
    """Minimal model: a single 1x1 Conv2d with 2 input and 2 output channels."""

    def __init__(self):
        super(Network, self).__init__()
        # The 1x1 conv's weight and bias are the net's only parameters.
        self.w = torch.nn.Conv2d(2, 2, (1, 1))

    def forward(self, x):
        # x: expected (N, 2, H, W); a 1x1 conv preserves the spatial size.
        out = self.w(x)
        return out
# Demo: SGD + MultiStepLR — observe scheduler1.last_epoch advancing and the
# learning rate halving (gamma=0.5) when last_epoch hits a milestone.
net = Network()
optimizer1 = torch.optim.SGD(net.parameters(), 1)  # lr=1 makes updates easy to inspect
milestone1 = [2, 4, 6, 8]
scheduler1 = torch.optim.lr_scheduler.MultiStepLR(optimizer1, milestones=milestone1, gamma=0.5)
for i in range(5):
    x = torch.randn(2, 2, 1, 1)
    y1 = net(x)
    y = y1.sum()
    gt = 1
    de = gt - y
    loss = de ** 2 - 100  # constant offset has no effect on gradients
    aa = scheduler1.last_epoch  # last_epoch BEFORE stepping (starts at 0)
    # NOTE(review): stepping the scheduler before optimizer.step() is deliberate
    # here (to watch last_epoch/get_lr()); PyTorch's documented order is
    # optimizer.step() first, then scheduler.step().
    scheduler1.step()
    # get_lr() can display a misleading value right at a milestone epoch;
    # the optimizer still updates with param_groups[0]['lr'].
    bb = scheduler1.get_lr()[0]
    loss.backward()
    optimizer1.step()
    optimizer1.zero_grad()
print (2)
# Demo: StepLR(step_size=5, gamma=0.1). The printed get_lr() value dips one
# extra decade exactly at each decay epoch — a display-only quirk; the lr the
# optimizer actually uses is optim.param_groups[0]['lr'] (or get_last_lr()).
import torch
from torch.optim.lr_scheduler import StepLR

model = torch.nn.Linear(5, 10)
optim = torch.optim.SGD(model.parameters(), lr=1)
scheduler = StepLR(optim, step_size=5, gamma=0.1)
model.train()
for epoch in range(25):
    optim.step()       # no grads populated: SGD skips parameters with grad=None
    scheduler.step()
    print(scheduler.get_lr()[0])
1
1
1
1
0.010000000000000002
0.1
0.1
0.1
0.1
0.0010000000000000002
0.010000000000000002
0.010000000000000002
0.010000000000000002
0.010000000000000002
0.00010000000000000003
0.0010000000000000002
0.0010000000000000002
0.0010000000000000002
0.0010000000000000002
1.0000000000000004e-05
0.00010000000000000003
0.00010000000000000003
0.00010000000000000003
0.00010000000000000003
1.0000000000000004e-06