FCN in paper
Convolutional networks are powerful visual models that yield hierarchies of features. Convolutional networks trained end-to-end, pixels-to-pixels, exceed the state-of-the-art in semantic segmentation.
Fully convolutional versions of existing networks predict dense outputs from arbitrary-sized inputs. Both learning and inference are performed whole-image-at-a-time by dense feedforward computation and backpropagation. The architecture combines semantic information from a deep, coarse layer with appearance information from a shallow, fine layer to produce accurate and detailed segmentations.
Semantic segmentation faces an inherent tension between semantics and location: global information resolves
what while local information resolves where.
Each layer of data in a convnet is a three-dimensional array of size h × w × d, where h and w are spatial dimensions, and d is the feature or channel dimension. The first layer is the image, with pixel size h × w, and d color channels. Locations in higher layers correspond to the locations in the image they are path-connected to, which are called
their receptive fields.
自定义FCN
使用VGG16作为backbone,并自定义了VGG16的网络结构只创建到pooling5,用来作为卷积部分的特征提取工具。并加载预训练的参数。然后再创建FCN网络,在FCN网络中实现剩余的操作:fully_layer 和 transpose_conv layer.并将其进行卷积化改造。
这里给出转置卷积的尺寸计算公式:
$$n_{out}=(n_{in}-1) \times s-2\times p+k+out_{pad}$$
其中 $n_{in}$ 为输入尺寸,$n_{out}$ 为输出尺寸,$s$ 为步长(stride),$p$ 为填充(padding),$k$ 为卷积核大小(kernel_size),$out_{pad}$ 为输出填充(output_padding)。
由于进行下采样时会让featuremap的尺寸变小,当输入图像尺寸为224x224时,最后一层会变成1x1,这样在上采样的时候就不太方便;若输入尺寸小于224那么网络将无法成功运行。针对这个问题有两种思路:
- 在数据输入时先做一个100的padding,这样能保证任何尺寸的输入都能够成功的运行,但是这样会引入一些问题,如:增加了计算量;最后还要把多余的部分修剪掉。
- 在数据输入时,设定输入尺寸大于224.
我使用了第二种方式,但这种方式在 transpose_conv 部分也同样需要进行crop,而且最后上采样输出的尺寸永远比输入的尺寸小。
这里就有了疑问:对于任何尺寸的输入,怎样处理才能保证上采样后输出的尺寸与输入一致呢?
代码实例:
1.创建VGG16网络,输出pool3,pool4,pool5
class VGG16(torch.nn.Module):
    """Convolutional part of VGG16 (conv1_1 .. pool5) used as a feature extractor.

    Only the layers up to ``pool5`` are created. ``forward`` returns the
    outputs of ``pool3``, ``pool4`` and ``pool5`` so that a caller can build
    skip connections from them.
    """

    def __init__(self):
        super().__init__()
        # Convolutional layers based on the pretrained VGG16.
        # NOTE: layers are registered in network order on purpose;
        # load_pretrained_weights() maps weights by position.
        # conv1
        # The padding of the first convolution is not enlarged (kept at 1).
        self.conv1_1 = torch.nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.relu1_1 = torch.nn.ReLU(inplace=True)
        self.conv1_2 = torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.relu1_2 = torch.nn.ReLU(inplace=True)
        self.pool1 = torch.nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)  # 1/2
        # conv2
        self.conv2_1 = torch.nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.relu2_1 = torch.nn.ReLU(inplace=True)
        self.conv2_2 = torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.relu2_2 = torch.nn.ReLU(inplace=True)
        self.pool2 = torch.nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)  # 1/4
        # conv3
        self.conv3_1 = torch.nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.relu3_1 = torch.nn.ReLU(inplace=True)
        self.conv3_2 = torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.relu3_2 = torch.nn.ReLU(inplace=True)
        self.conv3_3 = torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.relu3_3 = torch.nn.ReLU(inplace=True)
        self.pool3 = torch.nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)  # 1/8
        # conv4
        self.conv4_1 = torch.nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.relu4_1 = torch.nn.ReLU(inplace=True)
        self.conv4_2 = torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.relu4_2 = torch.nn.ReLU(inplace=True)
        self.conv4_3 = torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.relu4_3 = torch.nn.ReLU(inplace=True)
        self.pool4 = torch.nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)  # 1/16
        # conv5
        self.conv5_1 = torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.relu5_1 = torch.nn.ReLU(inplace=True)
        self.conv5_2 = torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.relu5_2 = torch.nn.ReLU(inplace=True)
        self.conv5_3 = torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.relu5_3 = torch.nn.ReLU(inplace=True)
        self.pool5 = torch.nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)  # 1/32
        # Snapshot of this module's state dict; filled with pretrained
        # values by load_pretrained_weights().
        self.conv_model_dict = self.state_dict()

    def load_pretrained_weights(self):
        """Load pretrained VGG16 convolution weights into this module.

        A pickle cache at ``model/pretrained_vgg16_conv_weights.pkl`` is
        tried first. If it cannot be read, the weights are taken from
        ``models.vgg16(pretrained=True)``, matched to this module's
        parameters by position, and written back to the cache file.
        """
        import os

        cache_path = 'model/pretrained_vgg16_conv_weights.pkl'
        try:
            # SECURITY NOTE: pickle.load executes arbitrary code from the
            # file; only use a cache file that this program wrote itself.
            with open(cache_path, 'rb') as f:
                self.conv_model_dict.update(pickle.load(f))
        except Exception:
            # Any failure to read the cache (missing file, truncated or
            # incompatible pickle, ...) is treated as a cache miss.
            # Unlike a bare ``except:``, KeyboardInterrupt and SystemExit
            # are no longer swallowed here.
            vgg = models.vgg16(pretrained=True)
            pretrained_dict = vgg.state_dict()
            # Keys differ between the two models, so weights are matched by
            # position: the i-th entry of the pretrained state dict goes to
            # the i-th entry of this module's state dict. zip() stops after
            # this module's last key, ignoring the remaining pretrained ones.
            model_keys = list(self.conv_model_dict.keys())
            for own_key, pretrained_key in zip(model_keys, pretrained_dict.keys()):
                self.conv_model_dict[own_key] = pretrained_dict[pretrained_key]
            # Make sure the cache directory exists before writing to it.
            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
            with open(cache_path, 'wb') as f:
                pickle.dump(self.conv_model_dict, f, pickle.HIGHEST_PROTOCOL)
        self.load_state_dict(self.conv_model_dict)

    def forward(self, x):
        """Run the five convolution stages and return ``(pool3, pool4, pool5)``.

        Each pooling layer halves the spatial size (rounding up because of
        ``ceil_mode=True``), so the three outputs are at roughly 1/8, 1/16
        and 1/32 of the input resolution.
        """
        h = self.relu1_1(self.conv1_1(x))
        h = self.relu1_2(self.conv1_2(h))
        h = self.pool1(h)

        h = self.relu2_1(self.conv2_1(h))
        h = self.relu2_2(self.conv2_2(h))
        h = self.pool2(h)

        h = self.relu3_1(self.conv3_1(h))
        h = self.relu3_2(self.conv3_2(h))
        h = self.relu3_3(self.conv3_3(h))
        pool3 = self.pool3(h)

        h = self.relu4_1(self.conv4_1(pool3))
        h = self.relu4_2(self.conv4_2(h))
        h = self.relu4_3(self.conv4_3(h))
        pool4 = self.pool4(h)

        h = self.relu5_1(self.conv5_1(pool4))
        h = self.relu5_2(self.conv5_2(h))
        h = self.relu5_3(self.conv5_3(h))
        pool5 = self.pool5(h)
        return pool3, pool4, pool5
2.创建FCNNet,将全连接层改造成卷积层
class FCN_Net(torch.nn.Module):
    """Fully convolutional segmentation network on top of a VGG16 backbone.

    The fully connected layers are expressed as convolutions (``fc_6``,
    ``fc_7``), followed by a 1x1 scoring layer and transposed convolutions
    that upsample the class scores.

    Args:
        class_num: number of output channels (one score map per class).
        net_type: ``'32s'``, ``'16s'`` or ``'8s'``. ``'32s'`` upsamples the
            coarse score directly; ``'16s'`` first fuses it with ``pool4``;
            ``'8s'`` additionally fuses with ``pool3``.
    """

    def __init__(self, class_num, net_type='32s'):
        super().__init__()
        self.net_type = net_type
        self.vgg = VGG16()
        self.vgg.load_pretrained_weights()
        # Fully connected layers rewritten as convolutions.
        # fc6
        self.fc_6 = torch.nn.Conv2d(512, 4096, 7)
        self.relu6 = torch.nn.ReLU(inplace=True)
        self.dropout6 = torch.nn.Dropout2d()
        # fc7
        self.fc_7 = torch.nn.Conv2d(4096, 4096, 1)
        self.relu7 = torch.nn.ReLU(inplace=True)
        self.dropout7 = torch.nn.Dropout2d()
        # coarse output layer
        self.score = torch.nn.Conv2d(4096, class_num, 1)
        # transposed convolution (upsampling) layers: kernel = 2 * stride
        self.trans_conv_32time = torch.nn.ConvTranspose2d(class_num, class_num, 64, 32)
        self.trans_conv_16time = torch.nn.ConvTranspose2d(class_num, class_num, 32, 16)
        self.trans_conv_8time = torch.nn.ConvTranspose2d(class_num, class_num, 16, 8)
        self.trans_conv_2time = torch.nn.ConvTranspose2d(class_num, class_num, 4, 2)
        # 1x1 convolutions that reduce pool3 / pool4 to class_num channels
        self.crop_channel_pool3 = torch.nn.Conv2d(256, class_num, 1)
        self.crop_channel_pool4 = torch.nn.Conv2d(512, class_num, 1)
        for m in self.modules():
            if isinstance(m, torch.nn.ConvTranspose2d):
                # Initialise the transposed-conv weights with a bilinear
                # kernel; copy in place so the Parameter object is kept.
                with torch.no_grad():
                    m.weight.copy_(bilinear_kernel(class_num, class_num, m.kernel_size[0]))

    @staticmethod
    def _center_crop(feat, target):
        """Crop ``feat`` around its spatial centre to ``target``'s height and width."""
        out_h, out_w = target.size()[2], target.size()[3]
        # Height and width are handled independently so that non-square
        # feature maps are cropped correctly.
        top = max((feat.size()[2] - out_h) // 2, 0)
        left = max((feat.size()[3] - out_w) // 2, 0)
        return feat[:, :, top:top + out_h, left:left + out_w]

    def forward(self, x):
        """Return upsampled class scores for input batch ``x``.

        Raises:
            ValueError: if ``self.net_type`` is not ``'32s'``, ``'16s'`` or
                ``'8s'``.
        """
        if self.net_type not in ('32s', '16s', '8s'):
            raise ValueError("net_type should be in ['32s','16s','8s']")

        # Call the module (not .forward) so that registered hooks run.
        pool3, pool4, pool5 = self.vgg(x)
        h = self.dropout6(self.relu6(self.fc_6(pool5)))
        h = self.dropout7(self.relu7(self.fc_7(h)))
        h = self.score(h)

        if self.net_type == '32s':
            return self.trans_conv_32time(h)

        # Fuse the 2x-upsampled coarse score with the pool4 scores.
        up_score = self.trans_conv_2time(h)
        crop_pool4 = self._center_crop(self.crop_channel_pool4(pool4), up_score)
        fused = crop_pool4 + up_score
        if self.net_type == '16s':
            return self.trans_conv_16time(fused)

        # '8s': upsample once more and fuse with the pool3 scores.
        # NOTE(review): the same trans_conv_2time layer (shared weights) is
        # used for both 2x upsampling steps, as in the original code;
        # confirm whether two separate layers were intended.
        out = self.trans_conv_2time(fused)
        crop_pool3 = self._center_crop(self.crop_channel_pool3(pool3), out)
        return self.trans_conv_8time(crop_pool3 + out)
使用双线性插值对转置卷积的权重进行初始化,可以提高其训练速度。
def bilinear_kernel(in_channels, out_channels, kernel_size):
    """Build transposed-convolution weights that perform bilinear upsampling.

    Args:
        in_channels: size of the first weight dimension.
        out_channels: size of the second weight dimension.
        kernel_size: side length of the square kernel.

    Returns:
        A float32 tensor of shape
        ``(in_channels, out_channels, kernel_size, kernel_size)`` in which
        entry ``[i, i]`` holds the 2-D bilinear filter and every other
        entry is zero.
    """
    factor = (kernel_size + 1) // 2
    # The filter must be symmetric about the kernel's midpoint: an odd
    # kernel is centred on a tap, an even kernel between two taps.
    # (Using kernel_size / 2 here shifts the filter by half a tap.)
    if kernel_size % 2 == 1:
        center = factor - 1
    else:
        center = factor - 0.5
    og = np.ogrid[:kernel_size, :kernel_size]
    # Outer product of two 1-D triangular (linear interpolation) filters.
    filt = (1 - abs(og[0] - center) / factor) * (1 - abs(og[1] - center) / factor)
    weight = np.zeros((in_channels, out_channels, kernel_size, kernel_size), dtype='float32')
    # Paired index ranges address the "diagonal" (channel i -> channel i),
    # so channels are upsampled independently of each other.
    weight[range(in_channels), range(out_channels), :, :] = filt
    return torch.from_numpy(weight)
下面是使用input size为[3,256,256],fcn-8s结构,运行得到的结果:上采样后的尺寸为[class_num,120,120]
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
Conv2d-1 [-1, 64, 256, 256] 1,792
ReLU-2 [-1, 64, 256, 256] 0
Conv2d-3 [-1, 64, 256, 256] 36,928
ReLU-4 [-1, 64, 256, 256] 0
MaxPool2d-5 [-1, 64, 128, 128] 0
Conv2d-6 [-1, 128, 128, 128] 73,856
ReLU-7 [-1, 128, 128, 128] 0
Conv2d-8 [-1, 128, 128, 128] 147,584
ReLU-9 [-1, 128, 128, 128] 0
MaxPool2d-10 [-1, 128, 64, 64] 0
Conv2d-11 [-1, 256, 64, 64] 295,168
ReLU-12 [-1, 256, 64, 64] 0
Conv2d-13 [-1, 256, 64, 64] 590,080
ReLU-14 [-1, 256, 64, 64] 0
Conv2d-15 [-1, 256, 64, 64] 590,080
ReLU-16 [-1, 256, 64, 64] 0
MaxPool2d-17 [-1, 256, 32, 32] 0
Conv2d-18 [-1, 512, 32, 32] 1,180,160
ReLU-19 [-1, 512, 32, 32] 0
Conv2d-20 [-1, 512, 32, 32] 2,359,808
ReLU-21 [-1, 512, 32, 32] 0
Conv2d-22 [-1, 512, 32, 32] 2,359,808
ReLU-23 [-1, 512, 32, 32] 0
MaxPool2d-24 [-1, 512, 16, 16] 0
Conv2d-25 [-1, 512, 16, 16] 2,359,808
ReLU-26 [-1, 512, 16, 16] 0
Conv2d-27 [-1, 512, 16, 16] 2,359,808
ReLU-28 [-1, 512, 16, 16] 0
Conv2d-29 [-1, 512, 16, 16] 2,359,808
ReLU-30 [-1, 512, 16, 16] 0
MaxPool2d-31 [-1, 512, 8, 8] 0
Conv2d-32 [-1, 4096, 2, 2] 102,764,544
ReLU-33 [-1, 4096, 2, 2] 0
Dropout2d-34 [-1, 4096, 2, 2] 0
Conv2d-35 [-1, 4096, 2, 2] 16,781,312
ReLU-36 [-1, 4096, 2, 2] 0
Dropout2d-37 [-1, 4096, 2, 2] 0
Conv2d-38 [-1, 12, 2, 2] 49,164
ConvTranspose2d-39 [-1, 12, 6, 6] 2,316
Conv2d-40 [-1, 12, 16, 16] 6,156
ConvTranspose2d-41 [-1, 12, 14, 14] 2,316
Conv2d-42 [-1, 12, 32, 32] 3,084
ConvTranspose2d-43 [-1, 12, 120, 120] 36,876
================================================================
Total params: 134,360,456
Trainable params: 134,360,456
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.75
Forward/backward pass size (MB): 287.46
Params size (MB): 512.54
Estimated Total Size (MB): 800.75
----------------------------------------------------------------