最近在看 slim 的代码,发现其中有一处注释 "# Use conv2d instead of fully_connected layers."(用 conv2d 代替全连接层),在此做下记录。
# vgg16
net = slim.max_pool2d(net, kernel_size=[2, 2], scope='pool5')
# fc6 is built as a convolution with 4096 filters and a 7x7 kernel instead of
# a fully_connected layer. NOTE(review): this matches a dense layer only when
# the pool5 output is 7x7 and fc_conv_padding is 'VALID' -- confirm at the
# call site, which is outside this excerpt.
net = slim.conv2d(
    net,
    num_outputs=4096,
    kernel_size=[7, 7],
    padding=fc_conv_padding,
    scope='fc6')
# alexnet
net = slim.max_pool2d(net, kernel_size=[3, 3], stride=2, scope='pool5')
# fc6 is built as a 5x5 'VALID' convolution with 4096 filters instead of a
# fully_connected layer; the arg_scope overrides the weight and bias
# initializers used by slim.conv2d inside it.
with slim.arg_scope(
        [slim.conv2d],
        weights_initializer=trunc_normal(0.005),
        biases_initializer=tf.constant_initializer(0.1)):
    net = slim.conv2d(
        net,
        num_outputs=4096,
        kernel_size=[5, 5],
        padding='VALID',
        scope='fc6')
我在百度找到了两个讨论相同问题的帖子。
第一个人
前段时间校招面试的问题有一个就是如何使用卷积层来替代全连接层。以前没有认真考虑过,所以第一时间的想法是用1×1的卷积核去实现。后来仔细想一想,当输入特征图的空间尺寸大于1×1时,1×1卷积核并不能实现这样的功能:1×1卷积在各个空间位置上共享参数,而全连接层对每个位置使用独立的权重,因此卷积核必须与输入特征图一样大。昨天,看到PixelLink文字定位网络,该网络使用了VGG网络的全连接层。所以就想着今天使用keras实现下VGG16网络卷积层替换全连接层。参数依然使用预训练参数,最后保存参数。查看对比.h5文件,全连接层和转换成卷积层后的权重一致。下面简单介绍下代码:
from keras.applications.vgg16 import VGG16
from keras.layers import Conv2D
from keras.models import Input,Model
import numpy as np
# Load the complete VGG16 model, including its three fully connected layers.
model = VGG16(weights='imagenet', include_top=True)
# Output tensor of block5_pool, i.e. the input of Flatten in the original model.
block5_pool = model.get_layer('block5_pool').output
model.summary()  # block5_pool output shape is (7, 7, 512)

# [kernel, bias] of the three dense layers:
# fc1 has 4096 units, fc2 has 4096 units, predictions has 1000 units.
fc1_layer = model.get_layer('fc1')
fc2_layer = model.get_layer('fc2')
predictions_layer = model.get_layer('predictions')
fc1_weight = fc1_layer.get_weights()
fc2_weight = fc2_layer.get_weights()
predictions_weight = predictions_layer.get_weights()

# In the original network block5_pool is flattened to 25088 values and fed to
# 4096 dense units: 25088 * 4096 + 4096 = 102764544 parameters.
# The convolutional equivalent uses 4096 kernels of size 7 x 7 x 512 and
# outputs (None, 1, 1, 4096); the parameter count is unchanged:
# 7 * 7 * 512 * 4096 + 4096 = 102764544.
# Each Conv2D reuses the activation of the Dense layer it replaces; without
# it the conv layers would be linear and the converted model would not
# compute the same function as the original.
fc1to_conv = Conv2D(filters=4096, kernel_size=(7, 7),
                    activation=fc1_layer.activation,
                    name='fc1to_conv')(block5_pool)
# The dense kernel can be reshaped directly into the conv kernel shape.
# NOTE(review): assumes the channels_last layout, where Flatten orders the
# values as (h, w, c) -- confirm if another data format is used.
fc1to_conv_w = fc1_weight[0].reshape(7, 7, 512, 4096)
fc1_weight[0] = fc1to_conv_w

# The previous layer outputs (None, 1, 1, 4096) and fc2 has 4096 units, so
# this layer uses 4096 kernels whose size equals the input h, w (1 x 1).
# Output: (None, 1, 1, 4096).
fc2to_conv = Conv2D(filters=4096, kernel_size=(1, 1),
                    activation=fc2_layer.activation,
                    name='fc2to_conv')(fc1to_conv)
fc2to_conv_w = fc2_weight[0].reshape(1, 1, 4096, 4096)
fc2_weight[0] = fc2to_conv_w

# The previous layer outputs (None, 1, 1, 4096) and predictions has 1000
# units, so this layer uses 1000 kernels of size 1 x 1.
# Output: (None, 1, 1, 1000).
predictionsto_conv = Conv2D(filters=1000, kernel_size=(1, 1),
                            activation=predictions_layer.activation,
                            name='predictionsto_conv')(fc2to_conv)
predictionsto_conv_w = predictions_weight[0].reshape(1, 1, 4096, 1000)
predictions_weight[0] = predictionsto_conv_w

new_model = Model(inputs=model.input, outputs=predictionsto_conv)
# Load the reshaped dense weights into the corresponding conv layers.
new_model.get_layer('fc1to_conv').set_weights(fc1_weight)
new_model.get_layer('fc2to_conv').set_weights(fc2_weight)
new_model.get_layer('predictionsto_conv').set_weights(predictions_weight)
new_model.summary()
# Save the weights of the model whose dense layers were replaced by convs.
new_model.save_weights('./my_weight.h5')
总之,用卷积层替换全连接层时,卷积层的filters数量对应全连接层的神经元个数,kernel_size对应输入张量的h、w大小。
第二个人
As a workaround (besides just using GammaFlopsRegularizer) and for future reference, most modern convolutional networks forgo the flatten/fully_connected pattern, and instead use 1x1conv/reduce_mean.
原模型
def base_model(x_ph, is_training_ph, scope, channels=[32, 64, 64], reuse=False):
    """Build a three-convolution classifier with a fully connected output.

    The network is conv1 -> pool1 -> conv2 -> pool2 -> conv3 -> flatten ->
    fully_connected with 10 outputs, all created under ``scope``.

    Args:
        x_ph: Input tensor passed to the first convolution.
        is_training_ph: Accepted but not used by this function.
        scope: Variable scope under which the layers are created.
        channels: Number of outputs of conv1, conv2 and conv3.
        reuse: Forwarded to ``tf.variable_scope``.

    Returns:
        A tuple ``(out, pred)``: the output of the final layer and its
        argmax along axis 1.
    """
    # NOTE(review): batch norm is hard-wired to is_training=False and
    # is_training_ph is ignored -- confirm that this is intentional.
    bn_settings = {'is_training': False, 'scale': True, 'center': False}

    # Network Definition
    with tf.variable_scope(scope, reuse=reuse), slim.arg_scope(
            [slim.conv2d, slim.fully_connected],
            normalizer_fn=slim.batch_norm,
            normalizer_params=bn_settings,
            weights_initializer=tf.truncated_normal_initializer(0.0, 0.01),
            weights_regularizer=slim.l2_regularizer(0.0005)):
        net = slim.conv2d(x_ph, num_outputs=channels[0], kernel_size=3,
                          scope='conv1')
        net = slim.max_pool2d(net, kernel_size=2, scope='pool1')
        net = slim.conv2d(net, num_outputs=channels[1], kernel_size=3,
                          scope='conv2')
        net = slim.max_pool2d(net, kernel_size=2, scope='pool2')
        net = slim.conv2d(net, num_outputs=channels[2], kernel_size=3,
                          scope='conv3')
        flattened = slim.flatten(net)
        # The output layer has no activation and no normalizer.
        out = slim.fully_connected(
            flattened,
            num_outputs=10,
            normalizer_fn=None,
            normalizer_params=None,
            activation_fn=None,
            scope='output')
        pred = tf.argmax(out, axis=1)
    return out, pred
替换全连接
def base_model(x_ph, is_training_ph, scope, channels=[32, 64, 64], reuse=False):
    """Build a three-convolution classifier with a 1x1-conv / mean output.

    The network is conv1 -> pool1 -> conv2 -> pool2 -> conv3 -> 1x1 conv with
    10 outputs -> mean over axes 1 and 2, all created under ``scope``.

    Args:
        x_ph: Input tensor passed to the first convolution.
        is_training_ph: Accepted but not used by this function.
        scope: Variable scope under which the layers are created.
        channels: Number of outputs of conv1, conv2 and conv3.
        reuse: Forwarded to ``tf.variable_scope``.

    Returns:
        A tuple ``(out, pred)``: the averaged output of the 1x1 convolution
        and its argmax along axis 1.
    """
    # NOTE(review): batch norm is hard-wired to is_training=False and
    # is_training_ph is ignored -- confirm that this is intentional.
    bn_settings = {'is_training': False, 'scale': True, 'center': False}

    # Network Definition
    with tf.variable_scope(scope, reuse=reuse), slim.arg_scope(
            [slim.conv2d, slim.fully_connected],
            normalizer_fn=slim.batch_norm,
            normalizer_params=bn_settings,
            weights_initializer=tf.truncated_normal_initializer(0.0, 0.01),
            weights_regularizer=slim.l2_regularizer(0.0005)):
        net = slim.conv2d(x_ph, num_outputs=channels[0], kernel_size=3,
                          scope='conv1')
        net = slim.max_pool2d(net, kernel_size=2, scope='pool1')
        net = slim.conv2d(net, num_outputs=channels[1], kernel_size=3,
                          scope='conv2')
        net = slim.max_pool2d(net, kernel_size=2, scope='pool2')
        net = slim.conv2d(net, num_outputs=channels[2], kernel_size=3,
                          scope='conv3')
        # A 1x1 convolution with 10 outputs, no activation and no normalizer,
        # takes the place of flatten + fully_connected.
        class_map = slim.conv2d(
            net, 10, [1, 1],
            activation_fn=None,
            normalizer_fn=None,
            scope='output_conv')
        # Average over axes 1 and 2 and drop those axes.
        out = tf.reduce_mean(class_map, [1, 2], name='output', keepdims=False)
        pred = tf.argmax(out, axis=1)
    return out, pred
参考 https://blog.csdn.net/weixin_43194555/article/details/90476338