源码地址:https://github.com/davidsandberg/facenet
Facenet 提出了triplet loss 一种新的损失函数计算方式。引入三元组概念 anchor positive negative
损失函数定义为: $L = \max\big(\mathrm{dist}(a,p) - \mathrm{dist}(a,n) + \alpha,\ 0\big)$。训练就是为了尽可能减小 loss,也就是对还不满足 $\mathrm{dist}(a,p) + \alpha < \mathrm{dist}(a,n)$ 的三元组进行优化。减小 loss 的同时,同类样本之间的距离被缩小,不同类样本之间的距离被尽可能拉大
下面分析代码
优化的最终目标是 dist(a, n) - dist(a, p) >= alpha,也就是同类样本距离近、不同类样本距离远。alpha 用来控制不同类样本之间的间隔(margin)。如果没有 alpha,
会出现这样的情况:就算样本完全区分不出来,loss 也会很小,也就是所有样本都落在同一区间,模型没有学到任何类别间的空间表示
def triplet_loss(anchor, positive, negative, alpha):
    """Compute the FaceNet triplet loss for a batch of embedding triplets.

    Args:
      anchor: the embeddings for the anchor images.
      positive: the embeddings for the positive images.
      negative: the embeddings for the negative images.
      alpha: margin added to (positive distance - negative distance) before
        the hinge is applied.

    Returns:
      the triplet loss according to the FaceNet paper as a float tensor.
    """
    with tf.variable_scope('triplet_loss'):
        # Squared Euclidean distance anchor -> positive, summed over axis 1.
        anchor_to_pos = tf.subtract(anchor, positive)
        pos_dist = tf.reduce_sum(tf.square(anchor_to_pos), 1)

        # Squared Euclidean distance anchor -> negative, summed over axis 1.
        anchor_to_neg = tf.subtract(anchor, negative)
        neg_dist = tf.reduce_sum(tf.square(anchor_to_neg), 1)

        # Hinge: only triplets with pos_dist - neg_dist + alpha > 0 contribute.
        margin_violation = tf.add(tf.subtract(pos_dist, neg_dist), alpha)
        hinged = tf.maximum(margin_violation, 0.0)
        loss = tf.reduce_mean(hinged, 0)

    return loss
1. 读取数据
# Load the training data as a list of the form [ImageClass, ImageClass, ...]
train_set = facenet.get_dataset(args.data_dir)
def get_dataset(path, has_class_directories=True):
    """Build one ImageClass entry per sub-directory found under ``path``.

    Args:
      path: root data directory (``~`` is expanded). Every immediate
        sub-directory is treated as one class, i.e. one person.
      has_class_directories: accepted for interface compatibility; this
        function does not read it.

    Returns:
      A list of ImageClass objects, ordered by class (directory) name.
    """
    root_dir = os.path.expanduser(path)

    # Only directories count as classes; plain files in the root are ignored.
    class_names = sorted(
        entry for entry in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, entry))
    )

    dataset = []
    for class_name in class_names:
        class_dir = os.path.join(root_dir, class_name)
        dataset.append(ImageClass(class_name, get_image_paths(class_dir)))
    return dataset
2 然后分析facenet是如何训练的
train方法 最主要做了两件事,
1 从样本中挑选出1800张图片,是1800张图片 不是1800个人。因为每个人可能有多张。然后进行前向运算,得到1800个embedding。
2 然后根据这1800张图片的 embedding 挑选出用于 triplet loss 训练的三元组(每次采样 people_per_batch=45 个人),再分 batch 进行训练
def train(args, sess, dataset, epoch, image_paths_placeholder, labels_placeholder, labels_batch,
          batch_size_placeholder, learning_rate_placeholder, phase_train_placeholder, enqueue_op, input_queue, global_step,
          embeddings, loss, train_op, summary_op, summary_writer, learning_rate_schedule_file,
          embedding_size, anchor, positive, negative, triplet_loss):
    """Run one epoch of triplet-loss training.

    Each pass of the outer loop does two things:
      1. samples images with sample_people, runs them through the network
         and collects their embeddings;
      2. picks triplets from those embeddings with select_triplets and runs
         train_op on them batch by batch.
    The outer loop ends once args.epoch_size training batches have been run.

    Args:
      args: parsed options; this function reads learning_rate, epoch_size,
        people_per_batch, images_per_person, batch_size and alpha.
      sess: session used for every sess.run call.
      dataset: passed straight to sample_people.
      epoch: epoch number, used for the learning-rate schedule lookup and
        for the progress print.
      image_paths_placeholder, labels_placeholder: fed when running
        enqueue_op.
      labels_batch: fetched together with the embeddings; its values are
        used as row indices into emb_array.
      batch_size_placeholder, learning_rate_placeholder,
      phase_train_placeholder: fed on every forward/training run.
      enqueue_op: op that receives the image paths and their labels.
      global_step: fetched during training; its last value is returned.
      embeddings, loss, train_op: tensors/ops fetched by sess.run.
      summary_writer: receives the per-iteration loss/time summary.
      learning_rate_schedule_file: consulted only when
        args.learning_rate <= 0.
      embedding_size: number of columns of the embedding buffer.
      input_queue, summary_op, anchor, positive, negative, triplet_loss:
        not referenced in this function body.

    Returns:
      The global step fetched by the last training batch.
    """
    batch_number = 0
    # A positive args.learning_rate wins; otherwise look the rate up in the
    # schedule file for this epoch.
    if args.learning_rate>0.0:
        lr = args.learning_rate
    else:
        lr = facenet.get_learning_rate_from_file(learning_rate_schedule_file, epoch)
    while batch_number < args.epoch_size:
        # Sample people randomly from the dataset
        # len(image_paths) == np.sum(num_per_class); e.g. 45 people x 40
        # images = 1800 (assumed default settings -- TODO confirm).
        image_paths, num_per_class = sample_people(dataset, args.people_per_batch, args.images_per_person)
        print('Running forward pass on sampled images: ', end='')
        start_time = time.time()
        # Number of sampled examples (1800 in the example above).
        nrof_examples = args.people_per_batch * args.images_per_person
        # Labels are just running indices, laid out three per row:
        # shape (nrof_examples / 3, 3), e.g. (600, 3).
        labels_array = np.reshape(np.arange(nrof_examples),(-1,3))
        # Same three-per-row layout for the image paths.
        image_paths_array = np.reshape(np.expand_dims(np.array(image_paths),1), (-1,3))
        sess.run(enqueue_op, {image_paths_placeholder: image_paths_array, labels_placeholder: labels_array})
        emb_array = np.zeros((nrof_examples, embedding_size))
        nrof_batches = int(np.ceil(nrof_examples / args.batch_size))
        # Forward pass: collect the embeddings of all sampled examples.
        for i in range(nrof_batches):
            # The last batch may be smaller than args.batch_size.
            batch_size = min(nrof_examples-i*args.batch_size, args.batch_size)
            emb, lab = sess.run([embeddings, labels_batch], feed_dict={batch_size_placeholder: batch_size,
                learning_rate_placeholder: lr, phase_train_placeholder: True})
            # The fetched labels are the running indices enqueued above, so
            # they place each embedding in its own row.
            emb_array[lab,:] = emb
        print('%.3f' % (time.time()-start_time))
        # Select triplets based on the embeddings
        # Not every triplet among the sampled images violates the margin, so
        # only the ones that still need optimising are selected.
        print('Selecting suitable triplets for training')
        triplets, nrof_random_negs, nrof_triplets = select_triplets(emb_array, num_per_class,
            image_paths, args.people_per_batch, args.alpha)
        selection_time = time.time() - start_time
        print('(nrof_random_negs, nrof_triplets) = (%d, %d): time=%.3f seconds' %
            (nrof_random_negs, nrof_triplets, selection_time))
        # Perform training on the selected triplets
        nrof_batches = int(np.ceil(nrof_triplets*3/args.batch_size))
        # Flatten [(a, p, n), ...] into one list of paths, then lay it out
        # three per row again for the enqueue op.
        triplet_paths = list(itertools.chain(*triplets))
        labels_array = np.reshape(np.arange(len(triplet_paths)),(-1,3))
        triplet_paths_array = np.reshape(np.expand_dims(np.array(triplet_paths),1), (-1,3))
        sess.run(enqueue_op, {image_paths_placeholder: triplet_paths_array, labels_placeholder: labels_array})
        nrof_examples = len(triplet_paths)
        train_time = 0
        i = 0
        emb_array = np.zeros((nrof_examples, embedding_size))
        loss_array = np.zeros((nrof_triplets,))
        summary = tf.Summary()
        step = 0
        while i < nrof_batches:
            start_time = time.time()
            batch_size = min(nrof_examples-i*args.batch_size, args.batch_size)
            feed_dict = {batch_size_placeholder: batch_size, learning_rate_placeholder: lr, phase_train_placeholder: True}
            err, _, step, emb, lab = sess.run([loss, train_op, global_step, embeddings, labels_batch], feed_dict=feed_dict)
            emb_array[lab,:] = emb
            loss_array[i] = err
            duration = time.time() - start_time
            print('Epoch: [%d][%d/%d]\tTime %.3f\tLoss %2.3f' %
                  (epoch, batch_number+1, args.epoch_size, duration, err))
            batch_number += 1
            i += 1
            train_time += duration
            summary.value.add(tag='loss', simple_value=err)
        # Add validation loss and accuracy to summary
        #pylint: disable=maybe-no-member
        summary.value.add(tag='time/selection', simple_value=selection_time)
        summary_writer.add_summary(summary, step)
    return step
下面分析具体的实现
def sample_people(dataset, people_per_batch, images_per_person):
    """Randomly draw people_per_batch * images_per_person image paths.

    Classes are visited in random order. From each visited class at most
    images_per_person randomly chosen images are taken, and never more than
    are still needed to reach the target count.

    Args:
      dataset: sequence of class entries; each must support len() and expose
        an ``image_paths`` sequence.
      people_per_batch: with images_per_person, fixes the total image count.
      images_per_person: upper bound on images taken from one class.

    Returns:
      (image_paths, num_per_class): the sampled paths grouped class by
      class, and the number of images taken from each visited class, so
      that sum(num_per_class) == len(image_paths).

    Raises:
      IndexError: if the classes run out before enough images are collected
        (the class cursor is not bounds-checked).
    """
    target_count = people_per_batch * images_per_person

    # Random visiting order over all classes (people).
    class_order = np.arange(len(dataset))
    np.random.shuffle(class_order)

    image_paths = []
    num_per_class = []
    cursor = 0
    while len(image_paths) < target_count:
        person = dataset[class_order[cursor]]

        # Random order over this person's images.
        shuffled = np.arange(len(person))
        np.random.shuffle(shuffled)

        still_needed = target_count - len(image_paths)
        take = min(len(person), images_per_person, still_needed)

        image_paths.extend(person.image_paths[k] for k in shuffled[:take])
        num_per_class.append(take)
        cursor += 1

    return image_paths, num_per_class
但是这1800张图片中并不是所有的三元组都适合用来训练 triplet loss,所以需要进行挑选
def select_triplets(embeddings, nrof_images_per_class, image_paths, people_per_batch, alpha):
    """Select the triplets for training.

    For every (anchor, positive) pair inside each of the first
    people_per_batch classes, one negative is drawn at random from the
    images of other classes that satisfy
    ``dist(a, n) - dist(a, p) < alpha`` (squared Euclidean distances), i.e.
    from the negatives that still violate the margin. Pairs with no such
    negative produce no triplet.

    Args:
      embeddings: 2-D array with one embedding per row, e.g. (1800, 128).
        Rows must be grouped by class, in the order described by
        nrof_images_per_class.
      nrof_images_per_class: number of consecutive rows belonging to each
        class. Classes with a single image yield no pairs and are only ever
        used as negatives.
      image_paths: one path per embedding row.
      people_per_batch: number of leading classes to build pairs from.
      alpha: triplet-loss margin.

    Returns:
      (triplets, num_trips, nrof_triplets): the shuffled list of
      (anchor_path, positive_path, negative_path) tuples, the number of
      (anchor, positive) pairs that were examined, and len(triplets).
    """
    emb_start_idx = 0
    num_trips = 0
    triplets = []

    # VGG Face: Choosing good triplets is crucial and should strike a balance between
    #  selecting informative (i.e. challenging) examples and swamping training with examples that
    #  are too hard. This is achieve by extending each pair (a, p) to a triplet (a, p, n) by sampling
    #  the image n at random, but only between the ones that violate the triplet loss margin. The
    #  latter is a form of hard-negative mining, but it is not as aggressive (and much cheaper) than
    #  choosing the maximally violating example, as often done in structured output learning.

    # range (not xrange) so the function does not rely on a six/Python 2 name.
    for i in range(people_per_batch):
        nrof_images = int(nrof_images_per_class[i])
        same_class = slice(emb_start_idx, emb_start_idx + nrof_images)
        # Starting at 1 skips classes that only have a single image.
        for j in range(1, nrof_images):
            a_idx = emb_start_idx + j - 1
            # Squared distance from the anchor to every embedding.
            neg_dists_sqr = np.sum(np.square(embeddings[a_idx] - embeddings), 1)
            # Images of the anchor's own class can never be negatives: NaN
            # makes the comparison below False for them. This does not
            # depend on the positive, so it is done once per anchor.
            # np.nan is used because the np.NaN alias no longer exists in
            # NumPy 2.0.
            neg_dists_sqr[same_class] = np.nan
            for pair in range(j, nrof_images):  # For every possible positive pair.
                p_idx = emb_start_idx + pair
                pos_dist_sqr = np.sum(np.square(embeddings[a_idx] - embeddings[p_idx]))
                # Negatives that are not yet at least alpha further away
                # than the positive (VGG Face selection).
                all_neg = np.where(neg_dists_sqr - pos_dist_sqr < alpha)[0]
                nrof_random_negs = all_neg.shape[0]
                if nrof_random_negs > 0:
                    # Pick one of the violating negatives at random.
                    rnd_idx = np.random.randint(nrof_random_negs)
                    n_idx = all_neg[rnd_idx]
                    triplets.append((image_paths[a_idx], image_paths[p_idx], image_paths[n_idx]))
                # Counted for every pair, whether or not a negative was found.
                num_trips += 1
        emb_start_idx += nrof_images

    np.random.shuffle(triplets)
    return triplets, num_trips, len(triplets)
最后分析下 facenet的另外一种训练方式 center loss
注意 center loss 只能缩小类内距离(同类样本到其类中心的距离),不能增大不同类之间的距离。其中 centers 不是通过梯度训练出来的,而是先给一个初始值(0),然后在每个 batch 中根据该 batch 的特征进行更新,细节看下面代码
def center_loss(features, label, alfa, nrof_classes):
    """Center loss based on the paper "A Discriminative Feature Learning Approach for Deep Face Recognition"
       (http://ydwen.github.io/papers/WenECCV16.pdf)

    One center vector per class is kept in the non-trainable, zero-initialised
    variable 'centers' of shape [nrof_classes, nrof_features]. It is not
    updated by gradients: each call moves the centers of the classes present
    in the batch towards the batch features via scatter_sub, by a fraction
    (1 - alfa) of their difference.

    Args:
      features: 2-D tensor of features; its second dimension gives the
        center size.
      label: class index of each feature row (flattened before use).
      alfa: center update factor; (1 - alfa) scales each update.
      nrof_classes: total number of classes, e.g. 100 for a data set with
        100 people regardless of how many images each person has.

    Returns:
      (loss, centers): the mean squared difference between the features and
      the centers gathered for their labels before the update, and the
      output of the center update op.
    """
    nrof_features = features.get_shape()[1]
    center_var = tf.get_variable('centers', [nrof_classes, nrof_features], dtype=tf.float32,
                                 initializer=tf.constant_initializer(0), trainable=False)
    flat_label = tf.reshape(label, [-1])

    # Look up the current center of each sample's class.
    centers_batch = tf.gather(center_var, flat_label)

    # Pull those centers towards the features of this batch.
    center_delta = (1 - alfa) * (centers_batch - features)
    updated_centers = tf.scatter_sub(center_var, flat_label, center_delta)

    # The control dependency makes evaluating the loss also run the update.
    with tf.control_dependencies([updated_centers]):
        loss = tf.reduce_mean(tf.square(features - centers_batch))
    return loss, updated_centers