代码地址 https://github.com/AITTSMD/MTCNN-Tensorflow
MTCNN源码详细解读(1)- PNet/RNet/ONet的网络结构和损失函数
MTCNN源码详细解读(3)- RNet的训练和数据集的构建
这篇博客主要分析PNet的数据准备和训练
从下面几个文件入手:
Run prepare_data/gen_12net_data.py
Run gen_landmark_aug_12.py
Run gen_imglist_pnet.py
Run gen_PNet_tfrecords.py
首先分析 gen_12net_data.py
这个文件主要做了一件事:准备 positive/negative/part 数据,图片对应的 label 分别为 {1, 0, -1}。label 为 1、0 的参与分类损失的计算;label 为 1、-1 的参与边框回归损失的计算,上篇博客在损失函数中已经说明了。
主要代码如下
# Build the negative dataset: random crops whose IoU with every
# ground-truth box stays below 0.3 are saved with label 0.
while neg_num < 50:
    # Random square crop size in [12, min(width, height) / 2);
    # 12 is PNet's input resolution, so saved crops are never upscaled.
    size = npr.randint(12, min(width, height) / 2)
    # Random top-left corner, keeping the crop fully inside the image.
    nx = npr.randint(0, width - size)
    ny = npr.randint(0, height - size)
    crop_box = np.array([nx, ny, nx + size, ny + size])
    # IoU of this crop against every ground-truth box in the image.
    Iou = IoU(crop_box, boxes)
    if np.max(Iou) < 0.3:
        # IoU < 0.3 with all ground truths -> negative, label 0.
        # Crop and resize only here: the original resized every random
        # candidate before the IoU test, wasting work on rejected crops.
        cropped_im = img[ny : ny + size, nx : nx + size, :]
        resized_im = cv2.resize(cropped_im, (12, 12), interpolation=cv2.INTER_LINEAR)
        save_file = os.path.join(neg_save_dir, "%s.jpg"%n_idx)
        f2.write("12/negative/%s.jpg"%n_idx + ' 0\n')
        cv2.imwrite(save_file, resized_im)
        n_idx += 1
        neg_num += 1
# Generate positive (label 1) and part-face (label -1) samples:
# 20 jittered square crops around each ground-truth box.
for i in range(20):
    # Crop size in [0.8 * min(w, h), 1.25 * max(w, h)).
    size = npr.randint(int(min(w, h) * 0.8), np.ceil(1.25 * max(w, h)))
    # Random shift of the crop center by up to +/- 20% of the box size.
    delta_x = npr.randint(-w * 0.2, w * 0.2)
    delta_y = npr.randint(-h * 0.2, h * 0.2)
    # Center the crop on the shifted box center; clamp top-left at 0.
    nx1 = int(max(x1 + w / 2 + delta_x - size / 2, 0))
    ny1 = int(max(y1 + h / 2 + delta_y - size / 2, 0))
    nx2 = nx1 + size
    ny2 = ny1 + size
    # Discard crops that run past the right/bottom image border.
    if nx2 > width or ny2 > height:
        continue
    crop_box = np.array([nx1, ny1, nx2, ny2])
    # Bounding-box regression targets: ground-truth corner offsets
    # normalized by the crop size.
    offset_x1 = (x1 - nx1) / float(size)
    offset_y1 = (y1 - ny1) / float(size)
    offset_x2 = (x2 - nx2) / float(size)
    offset_y2 = (y2 - ny2) / float(size)
    cropped_im = img[ny1 : ny2, nx1 : nx2, :]
    resized_im = cv2.resize(cropped_im, (12, 12), interpolation=cv2.INTER_LINEAR)
    box_ = box.reshape(1, -1)
    # Compute the IoU once (the original evaluated IoU(crop_box, box_)
    # twice, once per branch condition).
    iou = IoU(crop_box, box_)
    if iou >= 0.65:
        # IoU >= 0.65 -> positive, label 1, with regression targets.
        save_file = os.path.join(pos_save_dir, "%s.jpg"%p_idx)
        f1.write("12/positive/%s.jpg"%p_idx + ' 1 %.2f %.2f %.2f %.2f\n'%(offset_x1, offset_y1, offset_x2, offset_y2))
        cv2.imwrite(save_file, resized_im)
        p_idx += 1
    elif iou >= 0.4:
        # 0.4 <= IoU < 0.65 -> part face, label -1, with regression targets.
        save_file = os.path.join(part_save_dir, "%s.jpg"%d_idx)
        f3.write("12/part/%s.jpg"%d_idx + ' -1 %.2f %.2f %.2f %.2f\n'%(offset_x1, offset_y1, offset_x2, offset_y2))
        cv2.imwrite(save_file, resized_im)
        # Counter increment restored; the excerpt was truncated and lost
        # the d_idx bump matching the p_idx one above.
        d_idx += 1
下面分析 gen_landmark_aug_12.py 这个文件主要获取landmark坐标
下面分析主要代码 去掉了augment数据增广的操作
# Landmark samples are appended to this txt file with label -2.
f = open(join(OUTPUT,"landmark_%s_aug.txt" %(size)),'w')
#dstdir = "train_landmark_few"
# getDataFromTxt yields (image path, ground-truth bbox, 5-point landmark).
data = getDataFromTxt(ftxt)
idx = 0
for (imgPath, bbox, landmarkGt) in data:
    F_imgs = []
    F_landmarks = []
    img = cv2.imread(imgPath)
    if img is None:
        # Unreadable image: skip it entirely.
        continue
    img_h, img_w, img_c = img.shape
    gt_box = np.array([bbox.left, bbox.top, bbox.right, bbox.bottom])
    # Crop the face patch and scale it to the PNet input size (12x12).
    f_face = img[bbox.top:bbox.bottom+1, bbox.left:bbox.right+1]
    f_face = cv2.resize(f_face, (size, size))
    landmark = np.zeros((5, 2))
    for index, one in enumerate(landmarkGt):
        # Normalize each landmark relative to the ground-truth box:
        # (left, top) becomes the origin and both coordinates are scaled
        # by the box width/height into [0, 1].
        rv = ((one[0]-gt_box[0])/(gt_box[2]-gt_box[0]), (one[1]-gt_box[1])/(gt_box[3]-gt_box[1]))
        landmark[index] = rv
    # NOTE(review): the augmentation code that fills F_imgs/F_landmarks was
    # removed from this excerpt, so both arrays are empty at this point.
    F_imgs, F_landmarks = np.asarray(F_imgs), np.asarray(F_landmarks)
    for i in range(len(F_imgs)):
        # Drop samples whose normalized landmarks fall outside (0, 1).
        if np.any(F_landmarks[i] <= 0) or np.any(F_landmarks[i] >= 1):
            continue
        # Write image to disk and its label line (-2) plus 10 coordinates.
        cv2.imwrite(join(dstdir,"%d.jpg" %(image_id)), F_imgs[i])
        landmarks = [str(v) for v in list(F_landmarks[i])]
        f.write(join(dstdir,"%d.jpg" %(image_id))+" -2 "+" ".join(landmarks)+"\n")
        image_id = image_id + 1
gen_imglist_pnet.py 这里就是开始构建PNet训练数据集了直接看代码
# Collect the four annotation lists (pos / neg / part / landmark) produced
# by the generation scripts that ran earlier.
def _read_lines(path):
    # Small helper replacing four copy-pasted with-blocks.
    with open(path, 'r') as fh:
        return fh.readlines()

pos = _read_lines(os.path.join(data_dir, '%s/pos_%s.txt' % (size, size)))
neg = _read_lines(os.path.join(data_dir, '%s/neg_%s.txt' % (size, size)))
part = _read_lines(os.path.join(data_dir, '%s/part_%s.txt' % (size, size)))
landmark = _read_lines(os.path.join(data_dir, '%s/landmark_%s_aug.txt' % (size, size)))

dir_path = os.path.join(data_dir, 'imglists')
if not os.path.exists(dir_path):
    os.makedirs(dir_path)
net_dir = os.path.join(dir_path, "%s" % (net))
if not os.path.exists(net_dir):
    os.makedirs(net_dir)
# Merge everything into one training list with a neg:pos:part ratio of 3:1:1.
with open(os.path.join(net_dir, "train_%s_landmark.txt" % (net)), "w") as f:
    nums = [len(neg), len(pos), len(part)]
    ratio = [3, 1, 1]
    #base_num = min(nums)
    base_num = 250000
    print(len(neg), len(pos), len(part), base_num)
    # Negatives are capped at 3 * base_num; pos/part are sampled with
    # replacement up to base_num each (oversampling when scarce).
    if len(neg) > base_num * 3:
        neg_keep = npr.choice(len(neg), size=base_num * 3, replace=True)
    else:
        neg_keep = npr.choice(len(neg), size=len(neg), replace=True)
    pos_keep = npr.choice(len(pos), size=base_num, replace=True)
    part_keep = npr.choice(len(part), size=base_num, replace=True)
    print(len(neg_keep), len(pos_keep), len(part_keep))
    for idx in pos_keep:
        f.write(pos[idx])
    for idx in neg_keep:
        f.write(neg[idx])
    for idx in part_keep:
        f.write(part[idx])
    # Every landmark sample is kept.
    f.writelines(landmark)
gen_PNet_tfrecords.py 进行tfrecord打包
首先看下数据存储结构
def get_dataset(dir, net='PNet'):
    """Parse the PNet image-list file into a list of example dicts.

    Each line of imglists/PNet/train_<net>_landmark.txt is
    "<path> <label> [...]" where the optional tail is either 4
    bbox-regression offsets (pos/neg/part samples, 6 fields total) or
    10 normalized landmark coordinates (landmark samples, 12 fields).

    :param dir: root data directory containing imglists/PNet/.
    :param net: network name used to build the list filename.
    :return: list of dicts with 'filename', 'label' (int) and a 'bbox'
             sub-dict of 14 float fields; fields not present on the
             line stay 0.
    """
    #item = 'imglists/PNet/train_%s_raw.txt' % net
    item = 'imglists/PNet/train_%s_landmark.txt' % net
    dataset_dir = os.path.join(dir, item)
    # Ordered keys of the per-example regression dict: 4 bbox offsets
    # followed by the 10 landmark coordinates.
    bbox_keys = ('xmin', 'ymin', 'xmax', 'ymax',
                 'xlefteye', 'ylefteye', 'xrighteye', 'yrighteye',
                 'xnose', 'ynose',
                 'xleftmouth', 'yleftmouth', 'xrightmouth', 'yrightmouth')
    dataset = []
    # Context manager so the file handle is closed (the original opened
    # the file and never closed it).
    with open(dataset_dir, 'r') as imagelist:
        for line in imagelist:
            info = line.strip().split(' ')
            data_example = dict()
            # Every regression field defaults to 0 and is overwritten
            # below depending on the sample type.
            bbox = dict.fromkeys(bbox_keys, 0)
            data_example['filename'] = info[0]
            data_example['label'] = int(info[1])
            # 6 fields -> pos/neg/part sample: path, label, 4 bbox offsets.
            if len(info) == 6:
                bbox['xmin'] = float(info[2])
                bbox['ymin'] = float(info[3])
                bbox['xmax'] = float(info[4])
                bbox['ymax'] = float(info[5])
            # 12 fields -> landmark sample: path, label, 10 coordinates.
            if len(info) == 12:
                for key, value in zip(bbox_keys[4:], info[2:12]):
                    bbox[key] = float(value)
            data_example['bbox'] = bbox
            dataset.append(data_example)
    return dataset
打包tfrecord的结构为:
# Build a tf.train.Example holding the encoded image bytes, the class
# label, the 4 bbox-regression targets (roi) and the 10 landmark targets,
# ready to be serialized into the tfrecord file.
example = tf.train.Example(features=tf.train.Features(feature={
'image/encoded': _bytes_feature(image_buffer),
'image/label': _int64_feature(class_label),
'image/roi': _float_feature(roi),
'image/landmark': _float_feature(landmark)
}))
最后分析下PNet的训练过程
train_PNet.py 调用 train.py 里面的train()方法
def train(net_factory, prefix, end_epoch, base_dir,
display=200, base_lr=0.01):
"""
Train PNet/RNet/ONet.
:param net_factory: builds the network graph and returns the loss ops
:param prefix: checkpoint path prefix; its last component names the net
:param end_epoch: number of epochs to train
:param base_dir: directory holding the label txt and tfrecord files
:param display: logging interval in steps
:param base_lr: initial learning rate
:return:
"""
# The net name (PNet/RNet/ONet) is the last component of the prefix path.
net = prefix.split('/')[-1]
#label file
label_file = os.path.join(base_dir,'train_%s_landmark.txt' % net)
#label_file = os.path.join(base_dir,'landmark_12_few.txt')
# NOTE(review): this file handle is never closed; a with-block would fix it.
f = open(label_file, 'r')
# Number of training samples = number of lines in the label file.
num = len(f.readlines())
print("Total datasets is: ", num)
#PNet use this method to get data
if net == 'PNet':
#dataset_dir = os.path.join(base_dir,'train_%s_ALL.tfrecord_shuffle' % net)
dataset_dir = os.path.join(base_dir,'train_%s_landmark.tfrecord_shuffle' % net)
# Read batched (image, label, bbox, landmark) tensors from the tfrecord.
image_batch, label_batch, bbox_batch,landmark_batch = read_single_tfrecord(dataset_dir, config.BATCH_SIZE, net)
pass
#landmark_dir
# Per-net input size and loss weights; only ONet raises the landmark
# loss weight to 1.0.
if net == 'PNet':
image_size = 12
radio_cls_loss = 1.0;radio_bbox_loss = 0.5;radio_landmark_loss = 0.5;
elif net == 'RNet':
image_size = 24
radio_cls_loss = 1.0;radio_bbox_loss = 0.5;radio_landmark_loss = 0.5;
else:
radio_cls_loss = 1.0;radio_bbox_loss = 0.5;radio_landmark_loss = 1.0;
image_size = 48
#define placeholder
# Placeholders feeding each training step of the network.
input_image = tf.placeholder(tf.float32, shape=[config.BATCH_SIZE, image_size, image_size, 3], name='input_image')
label = tf.placeholder(tf.float32, shape=[config.BATCH_SIZE], name='label')
bbox_target = tf.placeholder(tf.float32, shape=[config.BATCH_SIZE, 4], name='bbox_target')
landmark_target = tf.placeholder(tf.float32,shape=[config.BATCH_SIZE,10],name='landmark_target')
#class,regression
# Build the network; get the classification, bbox, landmark and L2
# loss ops plus the classification accuracy op.
cls_loss_op,bbox_loss_op,landmark_loss_op,L2_loss_op,accuracy_op = net_factory(input_image, label, bbox_target,landmark_target,training=True)
#train,update learning rate(3 loss)
# Total loss is the weighted sum of the three task losses (weights set
# above) plus L2 regularization.
train_op, lr_op = train_model(base_lr, radio_cls_loss*cls_loss_op + radio_bbox_loss*bbox_loss_op + radio_landmark_loss*landmark_loss_op + L2_loss_op, num)
# init
init = tf.global_variables_initializer()
sess = tf.Session()
#save model
saver = tf.train.Saver(max_to_keep=0)
sess.run(init)
#visualize some variables
tf.summary.scalar("cls_loss",cls_loss_op)#cls_loss
tf.summary.scalar("bbox_loss",bbox_loss_op)#bbox_loss
tf.summary.scalar("landmark_loss",landmark_loss_op)#landmark_loss
tf.summary.scalar("cls_accuracy",accuracy_op)#cls_acc
summary_op = tf.summary.merge_all()
logs_dir = "../logs/%s" %(net)
if os.path.exists(logs_dir) == False:
os.mkdir(logs_dir)
writer = tf.summary.FileWriter(logs_dir,sess.graph)
#begin
coord = tf.train.Coordinator()
#begin enqueue thread
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
i = 0
#total steps
MAX_STEP = int(num / config.BATCH_SIZE + 1) * end_epoch
print("Train steps: %d" % MAX_STEP)
epoch = 0
sess.graph.finalize()
try:
for step in range(MAX_STEP):
i = i + 1
if coord.should_stop():
break
# Pull the next training batch from the tfrecord queue.
image_batch_array, label_batch_array, bbox_batch_array,landmark_batch_array = sess.run([image_batch, label_batch, bbox_batch,landmark_batch])
#random flip
# Randomly flip images (and mirror their landmarks) for augmentation.
image_batch_array,landmark_batch_array = random_flip_images(image_batch_array,label_batch_array,landmark_batch_array)
'''
print image_batch_array.shape
print label_batch_array.shape
print bbox_batch_array.shape
print landmark_batch_array.shape
print label_batch_array[0]
print bbox_batch_array[0]
print landmark_batch_array[0]
'''
# Run one optimization step (backprop) with the fed batch.
_,_,summary = sess.run([train_op, lr_op ,summary_op], feed_dict={input_image: image_batch_array, label: label_batch_array, bbox_target: bbox_batch_array,landmark_target:landmark_batch_array})
# NOTE(review): the excerpt ends here; the original function's except/
# finally blocks and periodic logging/checkpointing are omitted.
现在看到这我相信对应PNet网络的训练已经很清楚了