代码展示:
# 测试训练模型时,模型有多余未参与传播的参数
import torch
import torch.nn as nn
# Fix the global RNG seed so weight initialisation and the random input are reproducible.
torch.manual_seed(20200910)
class Model(torch.nn.Module):
    """Small CNN classifier that also registers parameters its forward pass never uses.

    ``forward`` only runs ``conv1`` and ``dense``. The ``layer4cxq*`` sub-modules
    and ``attribute4*`` parameters are registered on the module (so they show up
    in ``parameters()``) but never take part in the computation; they exist to
    observe how autograd and the optimizer treat unused parameters.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor; expects input of shape (N, 1, 28, 28).
        self.conv1 = torch.nn.Sequential(
            torch.nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),  # -> (N, 64, 28, 28)
            torch.nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),  # -> (N, 128, 28, 28)
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(stride=2, kernel_size=2),  # -> (N, 128, 14, 14)
        )
        # Classifier head; expects flattened input of shape (N, 14*14*128).
        self.dense = torch.nn.Sequential(
            torch.nn.Linear(14 * 14 * 128, 1024),  # -> (N, 1024)
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.5),
            torch.nn.Linear(1024, 10),  # -> (N, 10)
        )

        # Deliberately unused sub-modules: registered, but never called in forward().
        self.layer4cxq1 = torch.nn.Conv2d(2, 33, 4, 4)
        self.layer4cxq2 = torch.nn.ReLU()
        self.layer4cxq3 = torch.nn.MaxPool2d(stride=2, kernel_size=2)
        self.layer4cxq4 = torch.nn.Linear(14 * 14 * 128, 1024)
        self.layer4cxq5 = torch.nn.Dropout(p=0.8)

        # Deliberately unused bare parameters: registered, but never read in forward().
        self.attribute4cxq = nn.Parameter(torch.tensor(20200910.0))
        self.attribute4lzq = nn.Parameter(torch.tensor([2.0, 3.0, 4.0, 5.0]))
        self.attribute4hh = nn.Parameter(torch.randn(3, 4, 5, 6))
        self.attribute4wyf = nn.Parameter(torch.randn(7, 8, 9, 10))

    def forward(self, x):
        """Return class scores of shape (N, 10) for input ``x`` of shape (N, 1, 28, 28)."""
        x = self.conv1(x)  # -> (N, 128, 14, 14)
        x = x.view(-1, 14 * 14 * 128)  # flatten to (N, 14*14*128)
        x = self.dense(x)  # -> (N, 10)
        return x
# Report the runtime environment.
print('cuda(GPU)是否可用:',torch.cuda.is_available())  # e.g. prints "... True" on a GPU machine
print('torch的版本:',torch.__version__)  # e.g. prints "... 1.7.1"

# Use the GPU when present, otherwise fall back to the CPU instead of crashing in .cuda().
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Model().to(device)

# All registered parameters are handed to the optimizer, including the ones
# that forward() never touches.
optimizer = torch.optim.Adam(model.parameters())
optimizer.zero_grad()

# Sample on the CPU first and then move, so the seeded random values are the
# same regardless of which device is used.
data_in = torch.randn(64, 1, 28, 28, requires_grad=True).to(device)
print(data_in.shape)  # torch.Size([64, 1, 28, 28])

loss = model(data_in).sum()
print(loss, type(loss), loss.shape)  # scalar tensor with shape torch.Size([])

print(model.conv1[0].weight.grad)  # None: backward() has not run yet
loss.backward()
optimizer.step()

print(model.conv1[0].weight.grad.shape)  # torch.Size([64, 1, 3, 3]): used parameter received a gradient
print(model.layer4cxq1.weight.grad)  # None: parameter never took part in forward()
print(model.attribute4cxq.grad)  # None: parameter never took part in forward()
print()
控制台输出结果展示:
Windows PowerShell
版权所有 (C) Microsoft Corporation。保留所有权利。
尝试新的跨平台 PowerShell https://aka.ms/pscore6
加载个人及系统配置文件用了 1141 毫秒。
(base) PS C:\Users\chenxuqi\Desktop\News4cxq\测试模型有多余参数> conda activate pytorch_1.7.1_cu102
(pytorch_1.7.1_cu102) PS C:\Users\chenxuqi\Desktop\News4cxq\测试模型有多余参数> & 'D:\Anaconda3\envs\pytorch_1.7.1_cu102\python.exe' 'c:\Users\chenxuqi\.vscode\extensions\ms-python.python-2021.1.502429796\pythonFiles\lib\python\debugpy\launcher' '53151' '--' 'c:\Users\chenxuqi\Desktop\News4cxq\测试模型
有多余参数\test.py'
cuda(GPU)是否可用: True
torch的版本: 1.7.1
torch.Size([64, 1, 28, 28])
tensor(-10.7881, device='cuda:0', grad_fn=<SumBackward0>) <class 'torch.Tensor'> torch.Size([])
None
torch.Size([64, 1, 3, 3])
None
None
(pytorch_1.7.1_cu102) PS C:\Users\chenxuqi\Desktop\News4cxq\测试模型有多余参数>
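实验结论: 当给优化器传入多余的参数时,那些在前向传播函数中参与运算的参数会在反向传播(loss.backward())中被正常计算梯度,并在 optimizer.step() 中被正常更新。但是对于那些多余的参数(比如代码中的 model.layer4cxq1.weight 和 model.attribute4cxq 等等),它们实际上并未参与到前向传播中,不在计算图里,因此反向传播(autograd)不会为它们计算梯度,它们的 .grad 属性始终是 None;优化器在 step() 时会跳过 .grad 为 None 的参数,因此它们也不会被更新。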
model.conv1[0].weight.grad反向传播之前: model._modules['conv1']._modules['0'].weight.grad
model.conv1[0].weight.grad反向传播之后: model._modules['conv1']._modules['0'].weight.grad
model.layer4cxq1.weight.grad反向传播之前: model.layer4cxq1.weight.grad
model.layer4cxq1.weight.grad反向传播之后: model.layer4cxq1.weight.grad
model.attribute4cxq.grad反向传播之前:
model.attribute4cxq.grad反向传播之后: