1. For a layer assigned to a self attribute in __init__, the attribute name becomes the prefix of its entries in the state_dict.
self.conv1 = nn.Conv2d(3, 12, kernel_size=3, stride=1, padding=1)
A convolution layer has two parameters, the weight and the bias; in the example above they are stored as conv1.weight and conv1.bias.
self.bn1 = nn.BatchNorm2d(12)
A BatchNorm layer contributes five entries: bn1.weight, bn1.bias, bn1.running_mean, bn1.running_var, and bn1.num_batches_tracked.
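Only the first two of those are learnable: weight and bias are parameters, while running_mean, running_var, and num_batches_tracked are buffers, which state_dict also includes. A minimal sketch to see the split, using a standalone BatchNorm2d:

import torch.nn as nn

bn = nn.BatchNorm2d(12)
# learnable parameters: weight, bias
for name, p in bn.named_parameters():
    print('param :', name)
# non-learnable statistics: running_mean, running_var, num_batches_tracked
for name, b in bn.named_buffers():
    print('buffer:', name)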
2. nn.Sequential numbers its submodules in the order they are passed in, starting from 0.
conv1 = nn.Conv2d(3, 12, kernel_size=3, stride=1, padding=1)
bn1 = nn.BatchNorm2d(12)
s1 = [conv1, bn1]
self.stage1 = nn.Sequential(*s1)
Note that conv1 and bn1 are not assigned to self here; only stage1 is. Since Sequential wraps conv1 and bn1 in order, conv1 is numbered stage1.0 and bn1 is numbered stage1.1, giving:
stage1.0.weight, stage1.0.bias
stage1.1.weight, stage1.1.bias, stage1.1.running_mean, stage1.1.running_var, stage1.1.num_batches_tracked
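If numeric indices are too opaque, nn.Sequential also accepts an OrderedDict, and the dict keys replace 0 and 1 in the names (a sketch; the key names conv and bn are my own choice):

from collections import OrderedDict
import torch.nn as nn

stage1 = nn.Sequential(OrderedDict([
    ('conv', nn.Conv2d(3, 12, kernel_size=3, stride=1, padding=1)),
    ('bn', nn.BatchNorm2d(12)),
]))
for name in stage1.state_dict():
    print(name)   # conv.weight, conv.bias, bn.weight, bn.bias, bn.running_mean, ...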
3. When a module is wrapped in torch.nn.DataParallel or torch.nn.parallel.DistributedDataParallel, an extra module. is inserted after the wrapping attribute's name.
conv1 = nn.Conv2d(3, 12, kernel_size=3, stride=1, padding=1)
bn1 = nn.BatchNorm2d(12)
s1 = [conv1, bn1]
stage1 = nn.Sequential(*s1)
self.stage2 = DataParallel(stage1)
Note that only stage2 is assigned to self; the resulting names are:
stage2.module.0.weight, stage2.module.0.bias
stage2.module.1.weight, stage2.module.1.bias, stage2.module.1.running_mean, stage2.module.1.running_var, stage2.module.1.num_batches_tracked
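A practical consequence: a checkpoint saved from a wrapped model will not load into the bare model, because every key carries the extra module.. A common fix is to strip the prefix before calling load_state_dict. A self-contained sketch:

import torch.nn as nn
from torch.nn import DataParallel

net = nn.Sequential(nn.Conv2d(3, 12, kernel_size=3), nn.BatchNorm2d(12))
wrapped = DataParallel(net)
sd = wrapped.state_dict()              # keys start with "module."
# drop the leading "module." that the wrapper added
clean = {k[len('module.'):] if k.startswith('module.') else k: v
         for k, v in sd.items()}
fresh = nn.Sequential(nn.Conv2d(3, 12, kernel_size=3), nn.BatchNorm2d(12))
fresh.load_state_dict(clean)           # now the keys match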
Example 1:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import DataParallel
from torch.nn.parallel import DistributedDataParallel
import torch.distributed as dist

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 12, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(12)
        self.conv2 = nn.Conv2d(12, 24, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(24)
        self.fc1 = nn.Linear(24 * 5 * 5, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = x.view(-1, 24 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

if __name__ == '__main__':
    # torch.cuda.set_device(dist.get_rank())
    # dist.init_process_group(backend="nccl", init_method="tcp://localhost:23456",
    #                         rank=dist.get_rank(), world_size=dist.get_world_size())
    model = CNN()
    # model = DataParallel(model)
    for name in model.state_dict():
        print(name)
Output: one group of names per layer assigned to self:
conv1.weight, conv1.bias
bn1.weight, bn1.bias, bn1.running_mean, bn1.running_var, bn1.num_batches_tracked
conv2.weight, conv2.bias
bn2.weight, bn2.bias, bn2.running_mean, bn2.running_var, bn2.num_batches_tracked
fc1.weight, fc1.bias
fc2.weight, fc2.bias
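Since the keys are plain strings, you can index the state_dict directly, for example to inspect shapes (continuing from the model in Example 1):

sd = model.state_dict()
print(sd['conv1.weight'].shape)      # torch.Size([12, 3, 3, 3])
print(sd['bn1.running_mean'].shape)  # torch.Size([12])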
Example 2:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import DataParallel
from torch.nn.parallel import DistributedDataParallel
import torch.distributed as dist

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 12, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(12)
        # s1 = [self.conv1, self.bn1]
        self.s1 = [self.conv1, self.bn1]
        self.stage1 = nn.Sequential(*self.s1)
        self.conv2 = nn.Conv2d(12, 24, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(24)
        self.fc1 = nn.Linear(24 * 5 * 5, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = x.view(-1, 24 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

if __name__ == '__main__':
    # torch.cuda.set_device(dist.get_rank())
    # dist.init_process_group(backend="nccl", init_method="tcp://localhost:23456",
    #                         rank=dist.get_rank(), world_size=dist.get_world_size())
    model = CNN()
    # model = DataParallel(model)
    for name in model.state_dict():
        print(name)
Result: self.conv1 and self.bn1 are passed into Sequential via self.s1, so self.stage1 numbers them by order of appearance, while the original conv1 and bn1 entries also remain (both sets of keys refer to the same underlying tensors). self.s1 itself produces no entries: although it is assigned to self, it is a plain Python list rather than an nn.Module, so PyTorch does not register it. (A registered alternative, nn.ModuleList, is sketched after the output below.)
conv1.weight, conv1.bias
bn1.weight, bn1.bias, bn1.running_mean, bn1.running_var, bn1.num_batches_tracked
stage1.0.weight, stage1.0.bias
stage1.1.weight, stage1.1.bias, stage1.1.running_mean, stage1.1.running_var, stage1.1.num_batches_tracked
conv2.weight, conv2.bias
bn2.weight, bn2.bias, bn2.running_mean, bn2.running_var, bn2.num_batches_tracked
fc1.weight, fc1.bias
fc2.weight, fc2.bias
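If the goal is to register a list of layers as a list, nn.ModuleList is the registered counterpart of a plain Python list; its children get numeric names just like Sequential's (a sketch):

import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        # unlike a plain list, ModuleList registers its children
        self.s1 = nn.ModuleList([
            nn.Conv2d(3, 12, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(12),
        ])

for name in Net().state_dict():
    print(name)   # s1.0.weight, s1.0.bias, s1.1.weight, s1.1.bias, ...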
Example 3:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import DataParallel
from torch.nn.parallel import DistributedDataParallel
import torch.distributed as dist

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        conv1 = nn.Conv2d(3, 12, kernel_size=3, stride=1, padding=1)
        bn1 = nn.BatchNorm2d(12)
        s1 = [conv1, bn1]
        self.stage1 = nn.Sequential(*s1)
        self.stage2 = DataParallel(self.stage1)
        self.conv2 = nn.Conv2d(12, 24, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(24)
        self.fc1 = nn.Linear(24 * 5 * 5, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.stage2(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = x.view(-1, 24 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

if __name__ == '__main__':
    # torch.cuda.set_device(dist.get_rank())
    # dist.init_process_group(backend="nccl", init_method="tcp://localhost:23456",
    #                         rank=dist.get_rank(), world_size=dist.get_world_size())
    model = CNN()
    model = DataParallel(model)
    for name in model.state_dict():
        print(name)
Result: self.stage1 is numbered by Sequential, and self.stage2 is wrapped in DataParallel, so an extra module. appears after stage2. Because the final model is itself wrapped in DataParallel as well, every name inside CNN additionally gains a leading module.:
module.stage1.0.weight, module.stage1.0.bias
module.stage1.1.weight, module.stage1.1.bias, module.stage1.1.running_mean, module.stage1.1.running_var, module.stage1.1.num_batches_tracked
module.stage2.module.0.weight, module.stage2.module.0.bias
module.stage2.module.1.weight, module.stage2.module.1.bias, module.stage2.module.1.running_mean, module.stage2.module.1.running_var, module.stage2.module.1.num_batches_tracked
module.conv2.weight, module.conv2.bias
module.bn2.weight, module.bn2.bias, module.bn2.running_mean, module.bn2.running_var, module.bn2.num_batches_tracked
module.fc1.weight, module.fc1.bias
module.fc2.weight, module.fc2.bias
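To keep the outer module. prefix out of checkpoints in the first place, a common pattern is to save the inner module's state_dict rather than the wrapper's (a sketch continuing Example 3; cnn.pth is an illustrative file name, and the internal stage2.module. prefix would still remain in the keys):

import torch

# model is the DataParallel-wrapped CNN from Example 3
torch.save(model.module.state_dict(), 'cnn.pth')   # keys without the leading "module."

# later: load into a freshly built CNN with no prefix surgery needed
bare = CNN()
bare.load_state_dict(torch.load('cnn.pth', map_location='cpu'))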