参考代码:Github: davidsandberg/facenet
train_tripletloss.py 代码解析
1. main函数
主要目标:构建整个训练的总体流程。
流程简单介绍:
(1)训练准备工作。
(2)构建计算图:主要包括数据集构建,以及模型构建。
(3)运行计算图(通过调用train函数实现)。
(4)评估模型性能(通过调用evaluate函数实现)。
难点介绍:
为什么构建数据集时,输入的图片路径与标签的shape均为(3,)?
答:因为计算 triplet loss 需要 anchor positive negative三个数据。
代码:
def main(args):
    """Build the triplet-loss training graph and run the train/save/evaluate loop.

    In order: create timestamped log and model directories, record the
    arguments and revision info, load the training set, build the input
    queue, preprocessing pipeline and model graph, then repeatedly call
    train(), save_variables_and_metagraph() and, when args.lfw_dir is set,
    evaluate().

    Args:
        args: namespace of run options. Attributes read here include
            model_def, logs_base_dir, models_base_dir, seed, data_dir,
            pretrained_model, lfw_dir, lfw_pairs, lfw_nrof_folds, image_size,
            random_crop, random_flip, batch_size, keep_probability,
            embedding_size, weight_decay, alpha, learning_rate_decay_epochs,
            learning_rate_decay_factor, epoch_size, optimizer,
            moving_average_decay, gpu_memory_fraction, max_nrof_epochs and
            learning_rate_schedule_file.

    Returns:
        The model directory created for this run.
    """
    # Import the module that defines the network architecture
    network = importlib.import_module(args.model_def)
    # Create the log directory and the model directory, both named after the start time
    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.path.expanduser(args.logs_base_dir), subdir)
    if not os.path.isdir(log_dir): # Create the log directory if it doesn't exist
        os.makedirs(log_dir)
    model_dir = os.path.join(os.path.expanduser(args.models_base_dir), subdir)
    if not os.path.isdir(model_dir): # Create the model directory if it doesn't exist
        os.makedirs(model_dir)
    # Write arguments to a text file so the run's settings are kept with its logs
    facenet.write_arguments_to_file(args, os.path.join(log_dir, 'arguments.txt'))
    # Store some git revision info in a text file in the log directory
    src_path,_ = os.path.split(os.path.realpath(__file__))
    facenet.store_revision_info(src_path, log_dir, ' '.join(sys.argv))
    # Seed NumPy, then load the training set
    # (presumably a list of per-class image collections -- confirm against facenet.get_dataset)
    np.random.seed(seed=args.seed)
    train_set = facenet.get_dataset(args.data_dir)
    # Report the pre-trained model path, if one was given
    if args.pretrained_model:
        print('Pre-trained model: %s' % os.path.expanduser(args.pretrained_model))
    # LFW evaluation set; lfw_paths and actual_issame exist only when args.lfw_dir is set
    if args.lfw_dir:
        print('LFW directory: %s' % args.lfw_dir)
        # Read the file containing the pairs used for testing
        pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs))
        # Get the paths for the corresponding images
        lfw_paths, actual_issame = lfw.get_paths(os.path.expanduser(args.lfw_dir), pairs)
    with tf.Graph().as_default():
        tf.set_random_seed(args.seed)
        global_step = tf.Variable(0, trainable=False)
        # Placeholders fed on every sess.run call
        learning_rate_placeholder = tf.placeholder(tf.float32, name='learning_rate')
        batch_size_placeholder = tf.placeholder(tf.int32, name='batch_size')
        phase_train_placeholder = tf.placeholder(tf.bool, name='phase_train')
        # shape=(None,3): every row carries three entries, matching the three images of a triplet
        image_paths_placeholder = tf.placeholder(tf.string, shape=(None,3), name='image_paths')
        labels_placeholder = tf.placeholder(tf.int64, shape=(None,3), name='labels')
        # Input queue whose elements are (three paths, three labels)
        input_queue = data_flow_ops.FIFOQueue(capacity=100000,
                                    dtypes=[tf.string, tf.int64],
                                    shapes=[(3,), (3,)],
                                    shared_name=None, name=None)
        enqueue_op = input_queue.enqueue_many([image_paths_placeholder, labels_placeholder])
        # Preprocessing: one dequeue + decode pipeline per preprocessing thread
        nrof_preprocess_threads = 4
        images_and_labels = []
        for _ in range(nrof_preprocess_threads):
            filenames, label = input_queue.dequeue()
            images = []
            for filename in tf.unstack(filenames):
                file_contents = tf.read_file(filename)
                image = tf.image.decode_image(file_contents, channels=3)
                if args.random_crop:
                    image = tf.random_crop(image, [args.image_size, args.image_size, 3])
                else:
                    image = tf.image.resize_image_with_crop_or_pad(image, args.image_size, args.image_size)
                if args.random_flip:
                    image = tf.image.random_flip_left_right(image)
                #pylint: disable=no-member
                image.set_shape((args.image_size, args.image_size, 3))
                images.append(tf.image.per_image_standardization(image))
            # Each element is [images, label]: images is a list of three
            # (image_size, image_size, 3) tensors and label has shape (3,)
            images_and_labels.append([images, label])
        # Because enqueue_many=True, every [images, label] element is split along
        # axis 0 into individual examples, so:
        #   image_batch has shape (batch_size, image_size, image_size, 3)
        #   labels_batch has shape (batch_size,)
        image_batch, labels_batch = tf.train.batch_join(
            images_and_labels, batch_size=batch_size_placeholder,
            shapes=[(args.image_size, args.image_size, 3), ()], enqueue_many=True,
            capacity=4 * nrof_preprocess_threads * args.batch_size,
            allow_smaller_final_batch=True)
        image_batch = tf.identity(image_batch, 'image_batch')
        image_batch = tf.identity(image_batch, 'input')
        labels_batch = tf.identity(labels_batch, 'label_batch')
        # Forward pass producing embedding_size-dimensional features, then L2-normalize along axis 1
        prelogits, _ = network.inference(image_batch, args.keep_probability,
            phase_train=phase_train_placeholder, bottleneck_layer_size=args.embedding_size,
            weight_decay=args.weight_decay)
        embeddings = tf.nn.l2_normalize(prelogits, 1, 1e-10, name='embeddings')
        # Split embeddings into anchor, positive and negative and calculate triplet loss
        anchor, positive, negative = tf.unstack(tf.reshape(embeddings, [-1,3,args.embedding_size]), 3, 1)
        triplet_loss = facenet.triplet_loss(anchor, positive, negative, args.alpha)
        learning_rate = tf.train.exponential_decay(learning_rate_placeholder, global_step,
            args.learning_rate_decay_epochs*args.epoch_size, args.learning_rate_decay_factor, staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)
        # Calculate the total losses
        regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = tf.add_n([triplet_loss] + regularization_losses, name='total_loss')
        # Build a Graph that trains the model with one batch of examples and updates the model parameters
        train_op = facenet.train(total_loss, global_step, args.optimizer,
            learning_rate, args.moving_average_decay, tf.global_variables())
        # Create a saver over the trainable variables, keeping at most 3 checkpoints
        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=3)
        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()
        # Start running operations on the Graph: create the session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        # Initialize variables
        sess.run(tf.global_variables_initializer(), feed_dict={phase_train_placeholder:True})
        sess.run(tf.local_variables_initializer(), feed_dict={phase_train_placeholder:True})
        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord, sess=sess)
        with sess.as_default():
            # Restore the pre-trained model, if one was given
            if args.pretrained_model:
                print('Restoring pretrained model: %s' % args.pretrained_model)
                saver.restore(sess, os.path.expanduser(args.pretrained_model))
            # Training and validation loop
            # NOTE(review): epoch is recomputed from global_step before train()
            # runs, so the while condition is tested against the value from the
            # previous iteration -- confirm the resulting epoch count is intended.
            epoch = 0
            while epoch < args.max_nrof_epochs:
                step = sess.run(global_step, feed_dict=None)
                epoch = step // args.epoch_size
                # Train for one epoch: sample data, select triplets and train on them
                train(args, sess, train_set, epoch, image_paths_placeholder, labels_placeholder, labels_batch,
                    batch_size_placeholder, learning_rate_placeholder, phase_train_placeholder, enqueue_op, input_queue, global_step,
                    embeddings, total_loss, train_op, summary_op, summary_writer, args.learning_rate_schedule_file,
                    args.embedding_size, anchor, positive, negative, triplet_loss)
                # Save variables and the metagraph if it doesn't exist already
                # (step here is the value read before this epoch's training)
                save_variables_and_metagraph(sess, saver, summary_writer, model_dir, subdir, step)
                # Evaluate on LFW
                if args.lfw_dir:
                    evaluate(sess, lfw_paths, embeddings, labels_batch, image_paths_placeholder, labels_placeholder,
                            batch_size_placeholder, learning_rate_placeholder, phase_train_placeholder, enqueue_op, actual_issame, args.batch_size,
                            args.lfw_nrof_folds, log_dir, step, summary_writer, args.embedding_size)
    return model_dir
2. train函数
目标:训练一个epoch。
主要流程:
(1)根据 people_per_batch(每次抽样至少包含多少个类别)和 images_per_person(每个类别最多抽取多少个样本)随机抽取基础数据集,共 people_per_batch * images_per_person 张图片(调用sample_people函数实现)。
(2)获取选中数据对应的 embedding 向量(通过tensorflow实现)。
(3)根据数据集信息以及对应的 embedding 向量,进行 select triplets,构建 triplets 数据(调用select_triplets函数实现,使用numpy而不是tensorflow)。
(4)以 triplets 数据作为输入,训练模型(通过tensorflow实现)。
代码:
def train(args, sess, dataset, epoch, image_paths_placeholder, labels_placeholder, labels_batch,
          batch_size_placeholder, learning_rate_placeholder, phase_train_placeholder, enqueue_op, input_queue, global_step,
          embeddings, loss, train_op, summary_op, summary_writer, learning_rate_schedule_file,
          embedding_size, anchor, positive, negative, triplet_loss):
    """Train for one epoch, i.e. until at least args.epoch_size batches have run.

    Each outer iteration samples images with sample_people(), runs a forward
    pass to get their embeddings, picks triplets with select_triplets(), and
    then trains on those triplets batch by batch.

    Args:
        args: run options; reads learning_rate, epoch_size, people_per_batch,
            images_per_person, batch_size and alpha.
        sess: session used for every enqueue, forward and training run.
        dataset: passed straight to sample_people().
        epoch: current epoch number, used for the learning-rate schedule and logging.
        image_paths_placeholder, labels_placeholder: fed to enqueue_op.
        labels_batch: tensor of the labels belonging to the dequeued batch.
        batch_size_placeholder, learning_rate_placeholder,
            phase_train_placeholder: fed on every forward/training run.
        enqueue_op: op that pushes paths and labels into the input queue.
        global_step: tensor fetched together with the training op.
        embeddings: embedding tensor fetched for each batch.
        loss: loss tensor fetched for each training batch.
        train_op: op that performs one optimization step.
        summary_writer: receives the per-iteration summary.
        learning_rate_schedule_file: used when args.learning_rate is not positive.
        embedding_size: width of the embedding arrays allocated here.
        input_queue, summary_op, anchor, positive, negative, triplet_loss:
            accepted but not used in this function.

    Returns:
        The global step fetched by the last training run of the final outer
        iteration, or 0 if that iteration selected no triplets.
    """
    batch_number = 0
    # Pick the learning rate: a positive args.learning_rate wins, otherwise read the schedule file
    if args.learning_rate>0.0:
        lr = args.learning_rate
    else:
        lr = facenet.get_learning_rate_from_file(learning_rate_schedule_file, epoch)
    # Each epoch trains on epoch_size batches (the count can overshoot, since
    # batch_number is only checked between sampling rounds)
    while batch_number < args.epoch_size:
        # Sample people randomly from the dataset
        image_paths, num_per_class = sample_people(dataset, args.people_per_batch, args.images_per_person)
        print('Running forward pass on sampled images: ', end='')
        start_time = time.time()
        nrof_examples = args.people_per_batch * args.images_per_person
        # Reshape to (-1, 3) to match the (None, 3) placeholders; the labels are
        # just running indices used to put the embeddings back in order
        labels_array = np.reshape(np.arange(nrof_examples),(-1,3))
        image_paths_array = np.reshape(np.expand_dims(np.array(image_paths),1), (-1,3))
        # Push the sampled images into the input queue
        sess.run(enqueue_op, {image_paths_placeholder: image_paths_array, labels_placeholder: labels_array})
        emb_array = np.zeros((nrof_examples, embedding_size)) # embedding matrix, one row per sampled image
        nrof_batches = int(np.ceil(nrof_examples / args.batch_size)) # round up so the remainder gets its own batch
        for i in range(nrof_batches):
            # The last batch takes whatever is left, which may be fewer than batch_size
            batch_size = min(nrof_examples-i*args.batch_size, args.batch_size)
            emb, lab = sess.run([embeddings, labels_batch], feed_dict={batch_size_placeholder: batch_size,
                learning_rate_placeholder: lr, phase_train_placeholder: True}) # forward pass: extract embeddings
            emb_array[lab,:] = emb # store each embedding in the row given by its label
        print('%.3f' % (time.time()-start_time))
        # Select triplets based on the embeddings
        print('Selecting suitable triplets for training')
        # NOTE(review): the second value returned by select_triplets is its
        # num_trips counter, although it is named nrof_random_negs here.
        triplets, nrof_random_negs, nrof_triplets = select_triplets(emb_array, num_per_class,
            image_paths, args.people_per_batch, args.alpha)
        selection_time = time.time() - start_time
        print('(nrof_random_negs, nrof_triplets) = (%d, %d): time=%.3f seconds' %
            (nrof_random_negs, nrof_triplets, selection_time))
        # Perform training on the selected triplets
        nrof_batches = int(np.ceil(nrof_triplets*3/args.batch_size))
        triplet_paths = list(itertools.chain(*triplets))
        labels_array = np.reshape(np.arange(len(triplet_paths)),(-1,3))
        triplet_paths_array = np.reshape(np.expand_dims(np.array(triplet_paths),1), (-1,3))
        sess.run(enqueue_op, {image_paths_placeholder: triplet_paths_array, labels_placeholder: labels_array})
        nrof_examples = len(triplet_paths)
        train_time = 0
        i = 0
        emb_array = np.zeros((nrof_examples, embedding_size))
        loss_array = np.zeros((nrof_triplets,))
        summary = tf.Summary()
        step = 0
        while i < nrof_batches:
            start_time = time.time()
            batch_size = min(nrof_examples-i*args.batch_size, args.batch_size) # dynamic batch size: the final batch takes the remainder
            # Run one training step
            feed_dict = {batch_size_placeholder: batch_size, learning_rate_placeholder: lr, phase_train_placeholder: True}
            err, _, step, emb, lab = sess.run([loss, train_op, global_step, embeddings, labels_batch], feed_dict=feed_dict)
            emb_array[lab,:] = emb
            loss_array[i] = err
            duration = time.time() - start_time
            print('Epoch: [%d][%d/%d]\tTime %.3f\tLoss %2.3f' %
                  (epoch, batch_number+1, args.epoch_size, duration, err))
            batch_number += 1
            i += 1
            train_time += duration
            summary.value.add(tag='loss', simple_value=err)
        # Add validation loss and accuracy to summary
        #pylint: disable=maybe-no-member
        summary.value.add(tag='time/selection', simple_value=selection_time)
        summary_writer.add_summary(summary, step)
    return step
3. select_triplets 函数
作用:三元组的选择
总体流程:
(1)枚举所有可能的 anchor positive 对。
(2)计算anchor与所有数据的欧几里德距离的平方。
(3)根据得到的距离,排除与 anchor 同类的样本后,筛选出满足 neg_dist - pos_dist < alpha 的 negative,并随机选择其中之一构建 triplet。
输入介绍:
(1)embedding:所有输入样本的 embedding 向量
(2)nrof_images_per_class:sample_people函数的输出
(3)image_paths:sample_people的输出
(4)people_per_batch:每个batch中的类型数量
(5)alpha:triplet的参数
输出介绍:
(1)triplets:生成的 triplets loss 输入,列表,每个元素为三元组,分别代表 anchor positive negative 的 image_path。
(2)num_trips:尝试生成 triplet 的次数,即枚举过的 (anchor, positive) 对的数量。
(3)len(triplets):生成triplets的数量。
注:使用numpy实现,而非tensorflow。
代码:
def select_triplets(embeddings, nrof_images_per_class, image_paths, people_per_batch, alpha):
    """Select (anchor, positive, negative) image-path triplets for training.

    Every (anchor, positive) pair inside each of the first
    ``people_per_batch`` classes is extended to a triplet by drawing one
    negative uniformly at random from the images of other classes that
    violate the margin, i.e. ``neg_dist_sqr - pos_dist_sqr < alpha``. A pair
    with no such negative yields no triplet. Classes beyond the first
    ``people_per_batch`` only ever serve as negatives.

    Args:
        embeddings: 2-D array-like with one embedding per row; rows of the
            same class must be contiguous and in class order.
        nrof_images_per_class: number of rows belonging to each class.
        image_paths: sequence indexed like the rows of ``embeddings``.
        people_per_batch: number of leading classes to draw anchors from.
        alpha: margin used in the violation test.

    Returns:
        A tuple ``(triplets, num_trips, nrof_triplets)``: the shuffled list of
        ``(anchor_path, positive_path, negative_path)`` tuples, the number of
        (anchor, positive) pairs examined, and ``len(triplets)``.
    """
    embeddings = np.asarray(embeddings)
    num_trips = 0
    triplets = []
    emb_start_idx = 0

    # VGG Face: Choosing good triplets is crucial and should strike a balance between
    # selecting informative (i.e. challenging) examples and swamping training with examples that
    # are too hard. This is achieved by extending each pair (a, p) to a triplet (a, p, n) by sampling
    # the image n at random, but only between the ones that violate the triplet loss margin. The
    # latter is a form of hard-negative mining, but it is not as aggressive (and much cheaper) than
    # choosing the maximally violating example, as often done in structured output learning.
    for i in range(people_per_batch):
        nrof_images = int(nrof_images_per_class[i])
        class_end_idx = emb_start_idx + nrof_images
        for j in range(1, nrof_images):
            a_idx = emb_start_idx + j - 1
            # Squared Euclidean distance from the anchor to every image
            neg_dists_sqr = np.sum(np.square(embeddings[a_idx] - embeddings), 1)
            # Same-class images can never be negatives: NaN fails every
            # comparison below. This does not depend on the positive, so it
            # is done once per anchor.
            neg_dists_sqr[emb_start_idx:class_end_idx] = np.nan
            for pair in range(j, nrof_images):  # For every possible positive pair.
                p_idx = emb_start_idx + pair
                pos_dist_sqr = np.sum(np.square(embeddings[a_idx] - embeddings[p_idx]))
                # VGG Face selection. The stricter FaceNet rule would
                # additionally require pos_dist_sqr < neg_dists_sqr.
                with np.errstate(invalid='ignore'):
                    all_neg = np.where(neg_dists_sqr - pos_dist_sqr < alpha)[0]
                nrof_random_negs = all_neg.shape[0]
                if nrof_random_negs > 0:
                    # Pick one violating negative at random
                    rnd_idx = np.random.randint(nrof_random_negs)
                    n_idx = all_neg[rnd_idx]
                    triplets.append((image_paths[a_idx], image_paths[p_idx], image_paths[n_idx]))
                num_trips += 1
        emb_start_idx = class_end_idx

    np.random.shuffle(triplets)
    return triplets, num_trips, len(triplets)
4. sample_people 函数
目标:根据给定的条件,从原始数据集中提取一部分数据作为后续输入。
输入介绍:
(1)dataset:本质是一个ImageClass列表,每个ImageClass代表一个类型,主要属性有类型名称以及对应的image paths。
(2)people_per_batch:在生成的数据集中,类别的最少数量。
(3)images_per_person:在生成的数据集中,每个类别样本的最多数量。
输出介绍:
(1)image_paths: 列表(一维),长度为抽取到的样本总数(people_per_batch * images_per_person),每个元素为一个 image_path,同一类别的样本连续存放。
(2)num_per_class:列表,长度为类别数量,每个元素为整数,代表该类样本数量。
注:最终输出的结果中,类别数量可能会多于输入指定的people_per_batch(当某些类别的样本数少于images_per_person时)。
代码:
def sample_people(dataset, people_per_batch, images_per_person):
    """Randomly sample ``people_per_batch * images_per_person`` image paths.

    Classes are visited in random order; from each one up to
    ``images_per_person`` randomly chosen images are taken, until the
    requested total is reached. Classes with fewer images contribute all they
    have, so more than ``people_per_batch`` classes may be visited.

    Args:
        dataset: sequence of class objects; each must support ``len()``
            (its image count) and expose an ``image_paths`` sequence.
        people_per_batch: together with ``images_per_person`` fixes the total
            number of images to sample.
        images_per_person: maximum number of images taken from one class.

    Returns:
        A tuple ``(image_paths, num_per_class)``: the flat list of sampled
        paths with each class's images stored contiguously, and the number of
        images taken from each visited class, in the same order.

    Raises:
        IndexError: if the dataset cannot supply the requested number of
            images under the per-class limit.
    """
    # Total number of images to collect
    nrof_images = people_per_batch * images_per_person

    # Visit the classes in random order
    class_indices = np.arange(len(dataset))
    np.random.shuffle(class_indices)

    image_paths = []
    num_per_class = []
    for class_index in class_indices:
        remaining = nrof_images - len(image_paths)
        if remaining <= 0:
            break
        image_class = dataset[class_index]
        nrof_images_in_class = len(image_class)
        # Random order of the images inside this class
        image_indices = np.arange(nrof_images_in_class)
        np.random.shuffle(image_indices)
        # Take no more than the class has, the per-class cap allows, or is still needed
        nrof_images_from_class = min(nrof_images_in_class, images_per_person, remaining)
        image_paths += [image_class.image_paths[j]
                        for j in image_indices[0:nrof_images_from_class]]
        num_per_class.append(nrof_images_from_class)

    if len(image_paths) < nrof_images:
        # Same exception type as the out-of-range class lookup this replaces,
        # but with a message that explains the cause.
        raise IndexError(
            'could only sample %d of the %d requested images '
            '(at most %d per class) from %d classes'
            % (len(image_paths), nrof_images, images_per_person, len(dataset)))

    return image_paths, num_per_class
5. 损失函数triplet_loss
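训练目标:使anchor与positive的距离比anchor与negative的距离至少小 alpha(同类更相似),即最小化 $L = \frac{1}{N}\sum_{i=1}^{N}\max\left(\lVert a_i - p_i\rVert_2^2 - \lVert a_i - n_i\rVert_2^2 + \alpha,\ 0\right)$。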
代码:
def triplet_loss(anchor, positive, negative, alpha):
    """Compute the batch-averaged triplet loss from the FaceNet paper.

    Args:
      anchor: the embeddings for the anchor images.
      positive: the embeddings for the positive images.
      negative: the embeddings for the negative images.
      alpha: margin added to the difference between the anchor-positive and
        anchor-negative squared distances.

    Returns:
      A tensor holding the mean over axis 0 of
      max(pos_dist - neg_dist + alpha, 0).
    """
    with tf.variable_scope('triplet_loss'):
        # Squared distances, summed over axis 1
        dist_anchor_pos = tf.reduce_sum(tf.square(tf.subtract(anchor, positive)), 1)
        dist_anchor_neg = tf.reduce_sum(tf.square(tf.subtract(anchor, negative)), 1)
        # Amount by which each triplet violates the margin (negative if satisfied)
        margin_violation = tf.add(tf.subtract(dist_anchor_pos, dist_anchor_neg), alpha)
        # Only violating triplets contribute; average the hinge over axis 0
        hinge = tf.maximum(margin_violation, 0.0)
        return tf.reduce_mean(hinge, 0)