pytorch2维模型转换为3维模型——以resnext为例
最近在整理视频理解相关领域的算法,看了I3D的文章,觉得使用2D模型的预训练初始化3D网络的模型参数是一个非常重要的步骤(提点),因此写一篇博客进行记录,希望对大家有帮助。
2D网络结构转换到3D网络结构
3D网络相比于2D网络,主要区别是输入数据多了一个时间维度:由 B×C×H×W 变为 B×C×T×H×W
因此将2D网络中的2D结构转换为3D结构,注意要设计好3D的卷积核、stride、padding等参数,主要有:
nn.BatchNorm2d -> nn.BatchNorm3d
nn.Conv2d -> nn.Conv3d
nn.AdaptiveAvgPool2d -> nn.AdaptiveAvgPool3d
Inflate 初始化3D网络的参数
在之前的博客基础上进行了修改(https://blog.csdn.net/Abo_luo/article/details/117304751?spm=1001.2014.3001.5502)
def transfer_model(pretrained_file, model):
    """Load the weights stored in `pretrained_file` into `model`.

    Shape mismatches are resolved by `transfer_state_dict` (which inflates
    2D kernels to 3D); the merged state dict is then loaded back.
    """
    source_state = torch.load(pretrained_file)  # pretrained (2D) weights
    target_state = model.state_dict()           # the model's own parameters
    # Remap/inflate the pretrained entries so every one fits the target model.
    mapped_state = transfer_state_dict(source_state, target_state)
    target_state.update(mapped_state)
    model.load_state_dict(target_state)
    return model
def transfer_state_dict(pretrained_dict, model_dict):
    """Copy parameters from `pretrained_dict` into `model_dict` by position.

    Entries whose shapes already match are copied verbatim.  Mismatched
    entries are assumed to be 2D conv kernels (O, I, H, W) that must become
    3D kernels (O, I, T, H, W): the kernel is replicated T times along the
    new temporal axis and divided by T so activation magnitudes are kept
    (the I3D "inflation" trick).
    """
    loaded = 0
    pairs = zip(pretrained_dict.items(), model_dict.items())
    for (src_key, src_val), (dst_key, dst_val) in pairs:
        if src_val.shape == dst_val.shape:
            model_dict[dst_key] = pretrained_dict[src_key]
        else:
            # Inflate the 2D kernel along a new temporal dimension.
            t = dst_val.shape[2]  # temporal extent of the 3D kernel
            inflated = src_val.unsqueeze(2).repeat(1, 1, t, 1, 1)
            model_dict[dst_key] = inflated / t
        loaded += 1
    print('模型总结构数%d个,成功加载参数%d个' % (len(model_dict), loaded))
    return model_dict
完整结构主要分为2步骤
1、找到2d网络的代码,并修改为3d的代码
2、找到对应的2d网络权重(下面代码中用的是timm库提供好的预训练权重,但是通常要模型适配,可以修改上面的transfer_state_dict函数),并迁移到3d网络中
2维resnext网络结构
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm
# ResNeXt 32x4d hyper-parameters ("CxD" notation from the ResNeXt paper).
CARDINALITY = 32  # number of groups in the grouped 3x3 convolution
DEPTH = 4  # base number of channels per group
BASEWIDTH = 64  # reference width used to scale DEPTH with each stage's width
def transfer_model(pretrained_file, model):
    """Initialise `model` with the weights stored in `pretrained_file`.

    Only parameters whose shapes match are transferred (see
    `transfer_state_dict`); everything else keeps its random init.
    """
    source_state = torch.load(pretrained_file)
    target_state = model.state_dict()
    # Keep only the pretrained entries that actually fit the target model.
    usable = transfer_state_dict(source_state, target_state)
    print('成功加载参数%d个' % len(usable))
    target_state.update(usable)
    model.load_state_dict(target_state)
    return model
def transfer_state_dict(pretrained_dict, model_dict):
    """Positionally match pretrained parameters against `model_dict`.

    Returns a dict keyed by the model's own parameter names that contains
    only the pretrained tensors whose shapes match; mismatching entries
    are reported and skipped.
    """
    matched = {}
    pairs = zip(pretrained_dict.items(), model_dict.items())
    for (src_key, src_val), (dst_key, dst_val) in pairs:
        if src_val.shape != dst_val.shape:
            print('%s形状不匹配' % dst_key)
            continue
        matched[dst_key] = pretrained_dict[src_key]
    return matched
class ResNextBottleNeckC(nn.Module):
    """2D ResNeXt bottleneck block ("aggregated transforms" form C).

    1x1 reduce -> 3x3 grouped conv (CARDINALITY groups) -> 1x1 expand (x4),
    plus a projection shortcut whenever the residual branch changes the
    spatial size or the channel count.
    """

    def __init__(self, in_channels, out_channels, stride):
        super().__init__()
        groups = CARDINALITY
        # Per-group width, scaled with this block's output width.
        group_width = int(DEPTH * out_channels / BASEWIDTH)
        inner = groups * group_width
        expanded = out_channels * 4
        self.split_transforms = nn.Sequential(
            nn.Conv2d(in_channels, inner, kernel_size=1, groups=1, bias=False),
            nn.BatchNorm2d(inner),
            nn.ReLU(inplace=True),
            nn.Conv2d(inner, inner, kernel_size=3, stride=stride,
                      groups=groups, padding=1, bias=False),
            nn.BatchNorm2d(inner),
            nn.ReLU(inplace=True),
            nn.Conv2d(inner, expanded, kernel_size=1, bias=False),
            nn.BatchNorm2d(expanded),
        )
        # Identity shortcut unless the residual branch changes shape.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != expanded:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, expanded, stride=stride,
                          kernel_size=1, bias=False),
                nn.BatchNorm2d(expanded),
            )

    def forward(self, x):
        residual = self.split_transforms(x)
        return F.relu(residual + self.shortcut(x))
class ResNext(nn.Module):
    """2D ResNeXt backbone: stem conv + four residual stages + linear head.

    Args:
        block: residual block class, called as block(in_ch, out_ch, stride).
        num_blocks: per-stage block counts (length 4).
        class_names: number of output classes.
    """

    def __init__(self, block, num_blocks, class_names=100):
        super().__init__()
        self.in_channels = 64
        # Stem: 7x7 conv (stride 1 here, unlike ImageNet ResNets) + BN + ReLU.
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, 7, stride=1, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
        )
        # Four stages; every stage after the first halves the resolution.
        self.conv2 = self._make_layer(block, num_blocks[0], 64, 1)
        self.conv3 = self._make_layer(block, num_blocks[1], 128, 2)
        self.conv4 = self._make_layer(block, num_blocks[2], 256, 2)
        self.conv5 = self._make_layer(block, num_blocks[3], 512, 2)
        self.avg = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * 4, class_names)

    def forward(self, x):
        for stage in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5):
            x = stage(x)
        x = self.avg(x)
        flat = x.view(x.size(0), -1)
        return self.fc(flat)

    def _make_layer(self, block, num_block, out_channels, stride):
        """Stack `num_block` blocks; only the first one uses `stride`."""
        layers = []
        for s in [stride] + [1] * (num_block - 1):
            layers.append(block(self.in_channels, out_channels, s))
            self.in_channels = out_channels * 4
        return nn.Sequential(*layers)
def resnext50(num_class):
    """Build a 2D ResNeXt-50 (32x4d) with `num_class` output classes."""
    stage_sizes = [3, 4, 6, 3]
    return ResNext(ResNextBottleNeckC, stage_sizes, class_names=num_class)
def resnext101(num_class=100):
    """Build a 2D ResNeXt-101 (32x4d) network.

    Args:
        num_class: number of output classes.  Defaults to 100, which is the
            ResNext default previously hard-coded here, so existing
            zero-argument calls behave identically.
    """
    return ResNext(ResNextBottleNeckC, [3, 4, 23, 3], class_names=num_class)
def resnext152(num_class=100):
    """Build a 2D ResNeXt-152 (32x4d) network.

    (The original docstring mistakenly said resnext101.)

    Args:
        num_class: number of output classes.  Defaults to 100, the ResNext
            default previously hard-coded here, so existing zero-argument
            calls behave identically.
    """
    return ResNext(ResNextBottleNeckC, [3, 4, 36, 3], class_names=num_class)
if __name__ == '__main__':
    # Use timm's pretrained ResNeXt weights as the 2D source checkpoint.
    model_names = timm.list_models(pretrained=True)
    # The message promises a count ("数量") but the original printed the whole
    # list — print the length instead.
    print("支持的预训练模型数量:%s" % len(model_names))
    model = timm.create_model('resnext50d_32x4d', pretrained=True)
    torch.save(model.state_dict(), './resnext50.pth')
    # Build our own 2D ResNeXt and load the timm weights into it.
    model = resnext50(num_class=2)
    model = transfer_model("./resnext50.pth", model)
    # Save the remapped weights so the 3D script can load them without doing
    # any key matching of its own.
    torch.save(model.state_dict(), './resnext50_after_timm.pth')
    # NOTE: renamed from `input` to avoid shadowing the builtin.
    sample = torch.randn(2, 3, 224, 224)
    output = model(sample)
    print(output.shape)
3维resnext网络结构
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm
# ResNeXt 32x4d hyper-parameters ("CxD" notation from the ResNeXt paper).
CARDINALITY = 32  # number of groups in the grouped 3x3x3 convolution
DEPTH = 4  # base number of channels per group
BASEWIDTH = 64  # reference width used to scale DEPTH with each stage's width
def transfer_model(pretrained_file, model):
    """Load 2D weights from `pretrained_file` into the 3D `model`.

    `transfer_state_dict` inflates every 2D kernel to 3D, so the merged
    state dict always fits the 3D architecture.
    """
    source_state = torch.load(pretrained_file)  # the 2D checkpoint
    target_state = model.state_dict()           # the 3D model's parameters
    # Inflate/remap the 2D entries so they match the 3D parameter shapes.
    mapped_state = transfer_state_dict(source_state, target_state)
    target_state.update(mapped_state)
    model.load_state_dict(target_state)
    return model
def transfer_state_dict(pretrained_dict, model_dict):
    """Transfer 2D weights into a 3D state dict, pairing entries by order.

    Matching shapes are copied directly.  Mismatches are treated as 2D conv
    kernels (O, I, H, W) to inflate into (O, I, T, H, W): replicate along a
    new temporal axis of size T and divide by T so the layer's output scale
    is preserved (I3D-style inflation).
    """
    loaded = 0
    pairs = zip(pretrained_dict.items(), model_dict.items())
    for (src_key, src_val), (dst_key, dst_val) in pairs:
        if src_val.shape == dst_val.shape:
            model_dict[dst_key] = pretrained_dict[src_key]
        else:
            t = dst_val.shape[2]  # temporal extent of the 3D kernel
            inflated = src_val.unsqueeze(2).repeat(1, 1, t, 1, 1)
            model_dict[dst_key] = inflated / t
        loaded += 1
    print('模型总结构数%d个,成功加载参数%d个' % (len(model_dict), loaded))
    return model_dict
class ResNextBottleNeckC(nn.Module):
    """3D ResNeXt bottleneck block.

    Same structure as the 2D block, but with Conv3d/BatchNorm3d and a
    stride of (1, s, s): the temporal dimension is never strided inside a
    stage, only the spatial dimensions are.
    """

    def __init__(self, in_channels, out_channels, stride):
        super().__init__()
        groups = CARDINALITY
        # Per-group width, scaled with this block's output width.
        group_width = int(DEPTH * out_channels / BASEWIDTH)
        inner = groups * group_width
        expanded = out_channels * 4
        spatial_stride = (1, stride, stride)  # keep full temporal resolution
        self.split_transforms = nn.Sequential(
            nn.Conv3d(in_channels, inner, kernel_size=1, groups=1, bias=False),
            nn.BatchNorm3d(inner),
            nn.ReLU(inplace=True),
            nn.Conv3d(inner, inner, kernel_size=3, stride=spatial_stride,
                      groups=groups, padding=1, bias=False),
            nn.BatchNorm3d(inner),
            nn.ReLU(inplace=True),
            nn.Conv3d(inner, expanded, kernel_size=1, bias=False),
            nn.BatchNorm3d(expanded),
        )
        # Identity shortcut unless the residual branch changes shape.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != expanded:
            self.shortcut = nn.Sequential(
                nn.Conv3d(in_channels, expanded, stride=spatial_stride,
                          kernel_size=1, bias=False),
                nn.BatchNorm3d(expanded),
            )

    def forward(self, x):
        residual = self.split_transforms(x)
        return F.relu(residual + self.shortcut(x))
class ResNext(nn.Module):
    """3D ResNeXt backbone for clips shaped (B, C, T, H, W).

    Args:
        block: residual block class, called as block(in_ch, out_ch, stride).
        num_blocks: per-stage block counts (length 4).
        class_names: number of output classes.
    """

    def __init__(self, block, num_blocks, class_names=100):
        super().__init__()
        self.in_channels = 64
        # Stem: 7x7x7 conv with stride (2,2,2) — halves T, H and W.
        self.conv1 = nn.Sequential(
            nn.Conv3d(3, 64, 7, stride=(2, 2, 2), padding=3, bias=False),
            nn.BatchNorm3d(64),
            nn.ReLU(inplace=True),
        )
        # Four stages; every stage after the first halves H and W.
        self.conv2 = self._make_layer(block, num_blocks[0], 64, 1)
        self.conv3 = self._make_layer(block, num_blocks[1], 128, 2)
        self.conv4 = self._make_layer(block, num_blocks[2], 256, 2)
        self.conv5 = self._make_layer(block, num_blocks[3], 512, 2)
        self.avg = nn.AdaptiveAvgPool3d((1, 1, 1))
        self.fc = nn.Linear(512 * 4, class_names)

    def forward(self, x):
        for stage in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5):
            x = stage(x)
        x = self.avg(x)
        flat = x.view(x.size(0), -1)
        return self.fc(flat)

    def _make_layer(self, block, num_block, out_channels, stride):
        """Stack `num_block` blocks; only the first one uses `stride`."""
        layers = []
        for s in [stride] + [1] * (num_block - 1):
            layers.append(block(self.in_channels, out_channels, s))
            self.in_channels = out_channels * 4
        return nn.Sequential(*layers)
def resnext50(num_class):
    """Build a 3D ResNeXt-50 (32x4d) with `num_class` output classes."""
    stage_sizes = [3, 4, 6, 3]
    return ResNext(ResNextBottleNeckC, stage_sizes, class_names=num_class)
def resnext101(num_class=100):
    """Build a 3D ResNeXt-101 (32x4d) network.

    Args:
        num_class: number of output classes.  Defaults to 100, which is the
            ResNext default previously hard-coded here, so existing
            zero-argument calls behave identically.
    """
    return ResNext(ResNextBottleNeckC, [3, 4, 23, 3], class_names=num_class)
def resnext152(num_class=100):
    """Build a 3D ResNeXt-152 (32x4d) network.

    (The original docstring mistakenly said resnext101.)

    Args:
        num_class: number of output classes.  Defaults to 100, the ResNext
            default previously hard-coded here, so existing zero-argument
            calls behave identically.
    """
    return ResNext(ResNextBottleNeckC, [3, 4, 36, 3], class_names=num_class)
if __name__ == '__main__':
    # Fall back to CPU so the script also runs on machines without CUDA
    # (the original hard-coded .cuda() and crashed on CPU-only hosts).
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = resnext50(2).to(device)
    # Inflate the 2D weights saved by the previous (2D) script.
    model = transfer_model("./resnext50_after_timm.pth", model)
    # A dummy clip: batch 2, 3 channels, 10 frames, 224x224 spatial.
    clip = torch.randn(2, 3, 10, 224, 224, device=device)
    output = model(clip)
    print(output.shape)
总结
上述第一段程序中主要需要修改的是transfer_state_dict函数。第二段程序(3D)和第一段程序(2D)模型的num_class需要相等:因为3D网络的参数形状本来就与2D权重不完全一致,代码没有按key名称做匹配判断,而是直接按key的顺序进行迁移。模型权重链接:https://pan.baidu.com/s/1AH1yJP5DWrEL7hqQByVStQ?pwd=x9l2
提取码:x9l2