Paper: Geometry Guided Pose-Invariant Facial Expression Recognition
GitHub: https://github.com/FFZhang1231/Facial-expression-recognition
self.f = self.encoder(
    image=self.input_image
)
self.PE = self.encoder_pose(
    pose=self.pose,
    is_training=self.is_training
)
self.G = self.generator(
    f=self.f,
    latent_variable=self.PE,
    # y=self.expression,
    enable_tile_label=self.enable_tile_label,
    tile_ratio=self.tile_ratio
)
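For context, here is a minimal sketch of the placeholders these calls assume. The names follow the snippet above, but the exact shapes, the 136-dimensional landmark vector, and is_training being a plain Python flag are assumptions, not taken from the repo; the imports are the ones all the excerpts below rely on.

import numpy as np
import tensorflow as tf

# Hypothetical placeholder setup matching the calls above; shapes are assumptions.
self.input_image = tf.placeholder(
    tf.float32,
    [self.size_batch, self.size_image, self.size_image, self.num_input_channels],
    name='input_image'
)
# 68 landmarks flattened to (x, y) pairs -> 136 values (inferred from decoder_pose's final layer).
self.pose = tf.placeholder(tf.float32, [self.size_batch, 136], name='pose')
self.is_training = True  # assumed to be a plain Python bool, since discriminator_acc branches on it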
f(x): the identity encoder maps the input image to the feature vector f(x).
def encoder(self, image, reuse_variables=False):
    with tf.variable_scope("", reuse=reuse_variables):
        num_layers = int(np.log2(self.size_image)) - int(self.size_kernel / 2)
        current = image
        # conv layers with stride 2
        for i in range(num_layers):
            name = 'E_conv' + str(i)
            current = conv2d(
                input_map=current,
                num_output_channels=self.num_encoder_channels * (2 ** i),
                size_kernel=self.size_kernel,
                name=name
            )
            current = tf.nn.relu(current)
        # fully connected layer
        name = 'E_fc'
        current = fc(
            input_vector=tf.reshape(current, [self.size_batch, -1]),
            num_output_length=self.num_fx,
            name=name
        )
        # output: identity feature f(x), squashed to [-1, 1]
        return tf.nn.tanh(current)
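The helpers conv2d and fc are not shown in this post. A hedged sketch of what they plausibly look like, with signatures inferred from the call sites above (the repo's actual ops may differ in initialization and defaults):

# Hedged sketch of the custom ops used throughout; not the repo's definitive implementation.
def conv2d(input_map, num_output_channels, size_kernel=5, stride=2, name='conv2d'):
    with tf.variable_scope(name):
        w = tf.get_variable(
            'w', [size_kernel, size_kernel, input_map.get_shape()[-1], num_output_channels],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        b = tf.get_variable('b', [num_output_channels], initializer=tf.constant_initializer(0.0))
        conv = tf.nn.conv2d(input_map, w, strides=[1, stride, stride, 1], padding='SAME')
        return tf.nn.bias_add(conv, b)

def fc(input_vector, num_output_length, name='fc'):
    with tf.variable_scope(name):
        w = tf.get_variable(
            'w', [input_vector.get_shape()[1], num_output_length],
            initializer=tf.random_normal_initializer(stddev=0.02))
        b = tf.get_variable('b', [num_output_length], initializer=tf.constant_initializer(0.0))
        return tf.matmul(input_vector, w) + b

As a worked example of the layer count: with size_image = 128 and size_kernel = 5, num_layers = log2(128) - 2 = 5 stride-2 convolutions, so a 128x128 input is reduced to a 4x4 map before E_fc.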
f(g'): the geometry encoder maps the facial-landmark vector g' to the geometry feature f(g').
def encoder_pose(self, pose, is_training=True, reuse_variables=False,
                 num_hidden_layer_channels=(128, 64, 32), enable_bn=True):
    with tf.variable_scope("", reuse=reuse_variables):
        current = pose
        for i in range(len(num_hidden_layer_channels)):
            name = 'E_p_fc' + str(i)
            current = fc(
                input_vector=current,
                num_output_length=num_hidden_layer_channels[i],
                name=name
            )
            if enable_bn:
                name = 'E_p_bn' + str(i)
                current = tf.contrib.layers.batch_norm(
                    current,
                    scale=False,
                    is_training=is_training,
                    scope=name,
                    reuse=reuse_variables
                )
            current = tf.nn.relu(current)
        latent_variable = current
        return latent_variable
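Assuming 68 facial landmarks, the pose input is a 136-dimensional vector (one (x, y) pair per point); the three fc layers compress it to a 32-dimensional geometry code, and decoder_pose below mirrors the path back to 136 dimensions.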
Generator network G
On top of the adversarial network, geometric priors such as the facial-landmark feature f(g') are introduced.
The generator's inputs are the image feature vector f(x) and the geometry feature f(g'), which play the roles of [z, y] (latent code and condition).
def generator(self, f, latent_variable, reuse_variables=False, enable_tile_label=True, tile_ratio=1.0):
    with tf.variable_scope("", reuse=reuse_variables):
        num_layers = int(np.log2(self.size_image)) - int(self.size_kernel / 2)
        if enable_tile_label:
            duplicate = int(self.num_fx * tile_ratio / self.num_poses)
        else:
            duplicate = 1
        f = concat_label(f, latent_variable, duplicate=duplicate)
        size_mini_map = int(self.size_image / 2 ** num_layers)
        # fc layer
        name = 'G_fc'
        current = fc(
            input_vector=f,
            num_output_length=self.num_gen_channels * size_mini_map * size_mini_map,
            name=name
        )
        # reshape to cube for deconv
        current = tf.reshape(current, [-1, size_mini_map, size_mini_map, self.num_gen_channels])
        current = tf.nn.relu(current)
        # deconv layers with stride 2
        for i in range(num_layers):
            name = 'G_deconv' + str(i)
            current = deconv2d(
                input_map=current,
                output_shape=[self.size_batch,
                              size_mini_map * 2 ** (i + 1),
                              size_mini_map * 2 ** (i + 1),
                              int(self.num_gen_channels / 2 ** (i + 1))],
                size_kernel=self.size_kernel,
                name=name
            )
            current = tf.nn.relu(current)
        # two extra stride-1 deconv layers at full resolution
        name = 'G_deconv' + str(i + 1)
        current = deconv2d(
            input_map=current,
            output_shape=[self.size_batch,
                          self.size_image,
                          self.size_image,
                          int(self.num_gen_channels / 2 ** (i + 2))],
            size_kernel=self.size_kernel,
            stride=1,
            name=name
        )
        current = tf.nn.relu(current)
        name = 'G_deconv' + str(i + 2)
        current = deconv2d(
            input_map=current,
            output_shape=[self.size_batch,
                          self.size_image,
                          self.size_image,
                          self.num_input_channels],
            size_kernel=self.size_kernel,
            stride=1,
            name=name
        )
        # output: synthesized image in [-1, 1]
        return tf.nn.tanh(current)
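concat_label is another repo helper not reproduced in this post. From its two call sites (a 2-D feature vector here, a 4-D feature map in discriminator_att below), it plausibly tiles the label `duplicate` times and concatenates it onto the features; a hedged reconstruction:

# Hedged sketch of concat_label, inferred from its call sites; the repo's version may differ.
def concat_label(x, label, duplicate=1):
    label = tf.tile(label, [1, duplicate])  # repeat the label vector 'duplicate' times
    if len(x.get_shape()) == 2:
        # vector case: plain feature-axis concatenation
        return tf.concat([x, label], axis=1)
    elif len(x.get_shape()) == 4:
        # feature-map case: broadcast the label over spatial positions, then concat on channels
        batch, height, width = x.get_shape().as_list()[:3]
        label = tf.reshape(label, [batch, 1, 1, -1])
        label = tf.tile(label, [1, height, width, 1])
        return tf.concat([x, label], axis=3)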
Reconstructing the geometric landmarks
self.PD = self.decoder_pose(
    latent_variable=self.PE,
    is_training=self.is_training
)
def decoder_pose(self, latent_variable, is_training=True, reuse_variables=False,
                 num_hidden_layer_channels=(64, 128, 136), enable_bn=True):
    with tf.variable_scope("", reuse=reuse_variables):
        current = latent_variable
        for i in range(len(num_hidden_layer_channels)):
            name = 'D_p_fc' + str(i)
            current = fc(
                input_vector=current,
                num_output_length=num_hidden_layer_channels[i],
                name=name
            )
            if enable_bn:
                name = 'D_p_bn' + str(i)
                current = tf.contrib.layers.batch_norm(
                    current,
                    scale=False,
                    is_training=is_training,
                    scope=name,
                    reuse=reuse_variables
                )
            current = tf.nn.relu(current)
        p_output = current  # 136 values: the reconstructed 68 (x, y) landmarks
        return p_output
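The decoder exists so that the geometry code PE stays informative: PD should reproduce the input landmarks. The repo's exact loss is not shown in this post; a plausible L2 reconstruction term would look like this (hypothetical name):

# Hypothetical landmark-reconstruction loss; the repo may use a different norm or weighting.
self.loss_pose_reconstruction = tf.reduce_mean(tf.square(self.PD - self.pose))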
3. Classifying the expression and pose categories of the input image
self.D_input_ex_logits, self.D_input_pose_logits = self.discriminator_acc(
    image=self.input_image,
    is_training=self.is_training
)
def discriminator_acc(self, image, is_training=True, reuse_variables=False):
    current = image
    with tf.variable_scope("RS", reuse=reuse_variables):
        with tf.variable_scope("my_resnet"):
            # NOTE: the original passed pooling=max (the Python builtin); Keras expects the string 'max'.
            base_model = tf.keras.applications.ResNet50(
                include_top=False, weights='imagenet', input_tensor=current,
                input_shape=None, pooling='max')
            my_output = base_model(current)
        name = 'D_acc_fc1'
        current1 = fc(
            input_vector=tf.reshape(my_output, [self.size_batch, -1]),
            num_output_length=1024,
            name=name
        )
        current1 = lrelu(current1)
        if self.is_training:
            current1 = tf.nn.dropout(current1, 0.5)
        # expression logits
        name = 'D_acc_fc2'
        current2 = fc(
            input_vector=current1,
            num_output_length=self.num_categories,
            name=name
        )
        # pose logits
        name = 'D_acc_fc3'
        current3 = fc(
            input_vector=current1,
            num_output_length=self.num_poses,
            name=name
        )
        return current2, current3
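The two heads would be trained with standard softmax cross-entropy against the ground-truth expression and pose labels; a hedged sketch (the label tensors self.expression and self.pose_label are assumed names, not from the snippet):

# Hypothetical classification losses for the two heads above.
self.loss_expression = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
    labels=self.expression, logits=self.D_input_ex_logits))
self.loss_pose_cls = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
    labels=self.pose_label, logits=self.D_input_pose_logits))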
Deciding the identity class (0 or 1, fake or real) from f(x)
# discriminator on identity
self.D_f, self.D_f_logits = self.discriminator_i(
    f=self.f,
    is_training=self.is_training
)
def discriminator_i(self, f, is_training=True, reuse_variables=False,
                    num_hidden_layer_channels=(64, 32, 16), enable_bn=True):
    with tf.variable_scope("", reuse=reuse_variables):
        current = f
        # fully connected layers
        for i in range(len(num_hidden_layer_channels)):
            name = 'D_f_fc' + str(i)
            current = fc(
                input_vector=current,
                num_output_length=num_hidden_layer_channels[i],
                name=name
            )
            if enable_bn:
                name = 'D_f_bn' + str(i)
                current = tf.contrib.layers.batch_norm(
                    current,
                    scale=False,
                    is_training=is_training,
                    scope=name,
                    reuse=reuse_variables
                )
            current = tf.nn.relu(current)
        # output layer: single logit
        name = 'D_f_fc' + str(i + 1)
        current = fc(
            input_vector=current,
            num_output_length=1,
            name=name
        )
        return tf.nn.sigmoid(current), current
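Di regularizes the distribution of f(x) adversarially, in the style of an adversarial autoencoder: samples from a chosen prior count as "real" and encoder outputs as "fake". A hedged sketch, where the uniform-style prior placeholder and loss names are assumptions:

# Hypothetical prior-matching setup for the identity feature; the prior choice is an assumption.
self.prior_f = tf.placeholder(tf.float32, [self.size_batch, self.num_fx], name='prior_f')
D_prior, D_prior_logits = self.discriminator_i(f=self.prior_f, is_training=self.is_training,
                                               reuse_variables=True)
self.D_i_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(logits=D_prior_logits, labels=tf.ones_like(D_prior_logits)) +
    tf.nn.sigmoid_cross_entropy_with_logits(logits=self.D_f_logits, labels=tf.zeros_like(self.D_f_logits)))
self.E_i_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(logits=self.D_f_logits, labels=tf.ones_like(self.D_f_logits)))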
self.D_G, self.D_G_logits = self.discriminator_att(
    image=self.G,
    pose=self.pose,
    is_training=self.is_training
)
def discriminator_att(self, image, pose, is_training=True, reuse_variables=False,
                      num_hidden_layer_channels=(16, 32, 64, 128), enable_bn=True):
    with tf.variable_scope("", reuse=reuse_variables):
        num_layers = len(num_hidden_layer_channels)
        current = image
        # conv layers with stride 2
        for i in range(num_layers):
            name = 'D_img_conv' + str(i)
            current = conv2d(
                input_map=current,
                num_output_channels=num_hidden_layer_channels[i],
                size_kernel=self.size_kernel,
                name=name
            )
            if enable_bn:
                name = 'D_img_bn' + str(i)
                current = tf.contrib.layers.batch_norm(
                    current,
                    scale=False,
                    is_training=is_training,
                    scope=name,
                    reuse=reuse_variables
                )
            current = tf.nn.relu(current)
            if i == 0:
                # condition the first feature map on the pose label
                current = concat_label(current, pose, int(self.num_categories / self.num_poses))
        # fully connected layers
        name = 'D_img_fc1'
        current = fc(
            input_vector=tf.reshape(current, [self.size_batch, -1]),
            num_output_length=1024,
            name=name
        )
        current = lrelu(current)
        name = 'D_img_fc2'
        current1 = fc(
            input_vector=current,
            num_output_length=1,
            name=name
        )
        # return probability and logit; the original returned the 1024-d 'current' as the
        # second value, which looks like a bug, so the single logit current1 is returned here
        return tf.nn.sigmoid(current1), current1
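Datt is then evaluated on both real inputs and generated images to form the usual GAN objective; a hedged sketch (variable names are assumptions):

# Hypothetical adversarial losses for the attribute discriminator.
D_real, D_real_logits = self.discriminator_att(image=self.input_image, pose=self.pose,
                                               is_training=self.is_training, reuse_variables=True)
self.D_att_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(logits=D_real_logits, labels=tf.ones_like(D_real_logits)) +
    tf.nn.sigmoid_cross_entropy_with_logits(logits=self.D_G_logits, labels=tf.zeros_like(self.D_G_logits)))
self.G_att_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(logits=self.D_G_logits, labels=tf.ones_like(self.D_G_logits)))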
The geometry embedding network E extracts the geometric feature vector.
The generator G has an encoder-decoder structure and synthesizes new face images. The encoder Genc takes a face image with an arbitrary expression and pose and learns a mapping from it to an identity representation. This representation is then concatenated with the geometric information and fed into Gdec. The output of the decoder Gdec is a synthesized face image with the target expression and pose; the learned identity representation is the bridge between Genc and Gdec.
Datt disentangles pose, expression, and identity from the face image in a latent space, so that the attributes (expression, pose) can be changed while the identity is preserved.
Di controls the distribution of the identity features.
The classifier Cexp pushes the generated face image to have the same expression as the real input face image.
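Cexp can be realized by running the expression head above on the generated image and penalizing disagreement with the input's expression label; a hedged sketch reusing discriminator_acc (the loss name and self.expression label are assumptions):

# Hypothetical expression-consistency loss on the generated image.
G_ex_logits, _ = self.discriminator_acc(image=self.G, is_training=self.is_training,
                                        reuse_variables=True)
self.C_exp_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
    labels=self.expression, logits=G_ex_logits))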