MTCNN Source Code Walkthrough (1) - Network Structure and Loss Functions of PNet/RNet/ONet
MTCNN Source Code Walkthrough (2) - Training PNet and Building Its Dataset
Before diving in, let's first look at the differences between training RNet and training PNet:
1 When training PNet, the candidate boxes are randomly generated.
2 Before training RNet, we need the trained PNet to predict candidate boxes (keeping those whose confidence exceeds a threshold), and then compute the offsets between PNet's predicted boxes and the gt boxes.
3 Each image is turned into an image pyramid. The concrete implementation follows:
# Compute the initial scale; it depends on min_face_size, which is 24 in the code, so the initial scale = 12/24 = 0.5
current_scale = float(net_size) / self.min_face_size  # find initial scale
pass
while min(current_height, current_width) > net_size:
    pass
    # This line implements the image pyramid: the image is scaled down by a fixed ratio each iteration
    # scale_factor is typically between 0.7 and 0.8
    current_scale *= self.scale_factor
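To build intuition, here is a standalone sketch (illustrative numbers, not taken from the repo) that prints the scale sequence this loop produces for a 640*480 image with min_face_size = 24 and scale_factor = 0.79:

net_size, min_face_size, scale_factor = 12, 24, 0.79
height, width = 480, 640
current_scale = float(net_size) / min_face_size          # 0.5
while min(height * current_scale, width * current_scale) > net_size:
    print(round(current_scale, 4))                       # 0.5, 0.395, 0.312, ...
    current_scale *= scale_factor

Each scale lets the fixed 12*12 PNet window match faces of a different absolute size in the original image.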
Training RNet then consists of the following steps:
1 Run gen_hard_example.py to generate training data (face detection part) for RNet.
2 Run gen_landmark_aug_24.py to generate training data (face landmark detection part) for RNet.
3 Run gen_imglist_rnet.py to merge the two parts of training data.
4 Run gen_RNet_tfrecords.py to generate the TFRecord files.
The last three steps are essentially the same as for PNet, so I won't expand on them here; the focus of this post is gen_hard_example.py.
Next, let's analyze gen_hard_example.py, which uses the trained PNet to generate the face-detection training data for RNet.
def t_net(prefix, epoch,
          batch_size, test_mode="PNet",
          thresh=[0.6, 0.6, 0.7], min_face_size=25,
          stride=2, slide_window=False, shuffle=False, vis=False):
    detectors = [None, None, None]
    print("Test model: ", test_mode)
    # PNet-echo
    model_path = ['%s-%s' % (x, y) for x, y in zip(prefix, epoch)]
    print(model_path[0])
    # load pnet model
    # Load the PNet trained in the previous step and use it for inference
    if slide_window:
        PNet = Detector(P_Net, 12, batch_size[0], model_path[0])
    else:
        PNet = FcnDetector(P_Net, model_path[0])
    detectors[0] = PNet
    # load rnet model
    if test_mode in ["RNet", "ONet"]:
        print("==================================", test_mode)
        # Load RNet; when generating data for RNet (test_mode == "PNet") this branch is skipped
        RNet = Detector(R_Net, 24, batch_size[1], model_path[1])
        detectors[1] = RNet
    # load onet model
    if test_mode == "ONet":
        print("==================================", test_mode)
        ONet = Detector(O_Net, 48, batch_size[2], model_path[2])
        detectors[2] = ONet
    basedir = '.'
    # anno_file
    filename = './wider_face_train_bbx_gt.txt'
    # read annotation (type: dict)
    # Simply read the images and boxes of the dataset:
    # data['images'] holds the image paths and data['bboxes'] the gt boxes;
    # each image may correspond to multiple gt boxes
    data = read_annotation(basedir, filename)
    # Build the wrapper class used for inference
    mtcnn_detector = MtcnnDetector(detectors=detectors, min_face_size=min_face_size,
                                   stride=stride, threshold=thresh, slide_window=slide_window)
    print("==================================")
    # Note that this runs in "test" mode
    # imdb = IMDB("wider", image_set, root_path, dataset_path, 'test')
    # gt_imdb = imdb.gt_imdb()
    test_data = TestLoader(data['images'])
    # list
    # Predict the candidate boxes for every image; the result has the same length
    # as data, with multiple candidate boxes per image
    detections, _ = mtcnn_detector.detect_face(test_data)
    save_net = 'RNet'
    if test_mode == "PNet":
        save_net = "RNet"
    elif test_mode == "RNet":
        save_net = "ONet"
    # save detect result
    save_path = os.path.join(data_dir, save_net)
    print(save_path)
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    # Save the candidate boxes
    save_file = os.path.join(save_path, "detections.pkl")
    with open(save_file, 'wb') as f:
        pickle.dump(detections, f, 1)
    print("%s detection done, start OHEM" % image_size)
    # Build the training dataset and save it
    save_hard_example(image_size, data, save_path)
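As a quick sanity check, the saved pickle can be inspected directly (a minimal sketch; the path assumes save_net == 'RNet' and data_dir == '.'):

import pickle
with open('./RNet/detections.pkl', 'rb') as f:
    detections = pickle.load(f)
print(len(detections))      # one entry per training image
print(detections[0].shape)  # the candidate boxes of the first image, one box per row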
Next, let's analyze the two methods used above:
mtcnn_detector.detect_face(test_data)
save_hard_example(image_size, data, save_path)
def detect_face(self, test_data):
    all_boxes = []  # save each image's bboxes
    landmarks = []
    batch_idx = 0
    sum_time = 0
    # test_data is iter_
    # Iterate over the dataset
    for databatch in test_data:
        # databatch (image returned)
        if batch_idx % 100 == 0:
            print("%d images done" % batch_idx)
        im = databatch
        # Run PNet inference
        # pnet
        t1 = 0
        if self.pnet_detector:
            t = time.time()
            # ignore landmark
            # The actual inference, analyzed in detail below;
            # the image pyramid is built inside this method
            boxes, boxes_c, landmark = self.detect_pnet(im)
            t1 = time.time() - t
            sum_time += t1
            if boxes_c is None:
                print("boxes_c is None...")
                all_boxes.append(np.array([]))
                # pay attention
                landmarks.append(np.array([]))
                batch_idx += 1
                continue
        # When generating data for RNet, the RNet and ONet branches below are not executed
        # rnet
        t2 = 0
        if self.rnet_detector:
            t = time.time()
            # ignore landmark
            boxes, boxes_c, landmark = self.detect_rnet(im, boxes_c)
            t2 = time.time() - t
            sum_time += t2
            if boxes_c is None:
                all_boxes.append(np.array([]))
                landmarks.append(np.array([]))
                batch_idx += 1
                continue
        # onet
        t3 = 0
        if self.onet_detector:
            t = time.time()
            boxes, boxes_c, landmark = self.detect_onet(im, boxes_c)
            t3 = time.time() - t
            sum_time += t3
            if boxes_c is None:
                all_boxes.append(np.array([]))
                landmarks.append(np.array([]))
                batch_idx += 1
                continue
        print(
            "time cost " + '{:.3f}'.format(sum_time) + ' pnet {:.3f} rnet {:.3f} onet {:.3f}'.format(t1, t2, t3))
        # Collect the candidate boxes inferred above
        all_boxes.append(boxes_c)
        landmarks.append(landmark)
        batch_idx += 1
    # num_of_data*9, num_of_data*10
    return all_boxes, landmarks
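For the PNet-only stage analyzed here, the return value can be consumed like this (per the boxes_c construction shown in detect_pnet below, each non-empty entry is an n x 5 array):

all_boxes, landmarks = mtcnn_detector.detect_face(test_data)
# all_boxes[i] is an empty array when nothing was detected in image i,
# otherwise an n_i x 5 array of calibrated boxes (x1, y1, x2, y2, score)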
Next, let's analyze detect_pnet():
def detect_pnet(self, im):
    """Get face candidates through pnet

    Parameters:
    ----------
    im: numpy array
        input image array
    Returns:
    -------
    boxes: numpy array
        detected boxes before calibration
    boxes_c: numpy array
        boxes after calibration
    """
    h, w, c = im.shape
    net_size = 12
    # Compute the initial scaling factor
    current_scale = float(net_size) / self.min_face_size  # find initial scale
    # print("current_scale", net_size, self.min_face_size, current_scale)
    im_resized = self.processed_image(im, current_scale)
    current_height, current_width, _ = im_resized.shape
    # fcn
    # Collect the boxes from all pyramid levels
    all_boxes = list()
    # Loop until the scaled image is no larger than 12 on its shorter side
    while min(current_height, current_width) > net_size:
        # return the result predicted by pnet
        # cls_cls_map : H*W*2
        # reg: H*W*4
        # Run PNet and get cls_prob and bbox_pred
        cls_cls_map, reg = self.pnet_detector.predict(im_resized)
        # boxes: num*9 (x1, y1, x2, y2, score, x1_offset, y1_offset, x2_offset, y2_offset)
        # Turn the predictions into boxes; analyzed in detail below
        boxes = self.generate_bbox(cls_cls_map[:, :, 1], reg, current_scale, self.thresh[0])
        # These lines implement the image pyramid: scale the image down by a fixed ratio each iteration
        current_scale *= self.scale_factor
        im_resized = self.processed_image(im, current_scale)
        current_height, current_width, _ = im_resized.shape
        if boxes.size == 0:
            continue
        # Non-maximum suppression within this pyramid level
        keep = py_nms(boxes[:, :5], 0.5, 'Union')
        boxes = boxes[keep]
        all_boxes.append(boxes)
    if len(all_boxes) == 0:
        return None, None, None
    all_boxes = np.vstack(all_boxes)
    # merge the detection from first stage
    # Another round of NMS across all pyramid levels, with IoU threshold 0.7
    keep = py_nms(all_boxes[:, 0:5], 0.7, 'Union')
    all_boxes = all_boxes[keep]
    # The final candidate boxes
    boxes = all_boxes[:, :5]
    # Width and height of the candidate boxes
    bbw = all_boxes[:, 2] - all_boxes[:, 0] + 1
    bbh = all_boxes[:, 3] - all_boxes[:, 1] + 1
    # refine the boxes
    # Calibrate the boxes with the predicted regression values:
    # x1, y1, x2, y2 are each shifted by the predicted offset times the box width/height
    boxes_c = np.vstack([all_boxes[:, 0] + all_boxes[:, 5] * bbw,
                         all_boxes[:, 1] + all_boxes[:, 6] * bbh,
                         all_boxes[:, 2] + all_boxes[:, 7] * bbw,
                         all_boxes[:, 3] + all_boxes[:, 8] * bbh,
                         all_boxes[:, 4]])
    boxes_c = boxes_c.T
    return boxes, boxes_c, None
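To make the calibration step concrete, here is a worked example with made-up numbers:

import numpy as np
# one uncalibrated box: x1, y1, x2, y2, score, followed by the four predicted offsets
box = np.array([50., 40., 149., 139., 0.9, 0.1, -0.05, 0.02, 0.0])
bbw = box[2] - box[0] + 1       # width  = 100
bbh = box[3] - box[1] + 1       # height = 100
x1_c = box[0] + box[5] * bbw    # 50  + 0.10 * 100 = 60
y1_c = box[1] + box[6] * bbh    # 40  - 0.05 * 100 = 35
x2_c = box[2] + box[7] * bbw    # 149 + 0.02 * 100 = 151
y2_c = box[3] + box[8] * bbh    # 139 + 0.00 * 100 = 139

The offsets are fractions of the box width/height, so the same predicted values move a large box further than a small one.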
Let's continue with what generate_bbox() does. A premise for understanding this method: PNet does not produce precise candidate-box locations; it merely proposes a (rough) box wherever the image is likely to contain a face.
def generate_bbox(self, cls_map, reg, scale, threshold):
    """
    generate bbox from feature cls_map

    Parameters:
    ----------
    cls_map: numpy array , n x m
        detect score for each position
    reg: numpy array , n x m x 4
        bbox
    scale: float number
        scale of this detection
    threshold: float number
        detect threshold
    Returns:
    -------
    bbox array
    """
    # Remember the premise: PNet does not give precise box locations,
    # it only proposes a (rough) box wherever a face may be.
    # stride is the effective stride of PNet's output feature map
    stride = 2
    # stride = 4
    # cellsize = 12 corresponds to a 12 * 12 window
    cellsize = 12
    # cellsize = 25
    # Find the positions in the output map whose face confidence exceeds the threshold.
    # During training PNet takes a 12 * 12 * 3 input and outputs 1 * 1 * 2;
    # at inference time the image is larger than 12 * 12, so the output is H * W * 2.
    # Here cls_map = pre_cls_map[:, :, 1] keeps only the face confidence, so cls_map is H * W.
    # Think of it as sliding a 12 * 12 window over the image, each window predicting
    # face confidence / bbox regression / landmarks. Each window corresponds to one point
    # of the H * W feature map produced by the fully convolutional PNet; each point is the
    # prediction for one 12 * 12 window on the (scaled) input image.
    # np.where returns two arrays: the first holds the indices along H, the second along W
    t_index = np.where(cls_map > threshold)
    # find nothing
    if t_index[0].size == 0:
        return np.array([])
    # offset
    # The predicted regression offsets
    dx1, dy1, dx2, dy2 = [reg[t_index[0], t_index[1], i] for i in range(4)]
    reg = np.array([dx1, dy1, dx2, dy2])
    # The confidence scores
    score = cls_map[t_index[0], t_index[1]]
    # Here we recover the candidate boxes. What follows is my personal understanding;
    # judge for yourself, and leave a comment if you have a better explanation.
    # First ignore the stride:
    #   x1, y1, x2, y2 = t_index[1], t_index[0], t_index[1] + 12, t_index[0] + 12
    # i.e. at every output position whose face confidence exceeds the threshold,
    # place a 12 * 12 box. Multiplying the indices by stride converts feature-map
    # coordinates into coordinates on the scaled image, and dividing by scale maps
    # them back onto the original image.
    # Example: the original image P of size H * W is scaled to P1 of size
    # (H * scale, W * scale), and PNet turns P1 into an (h, w) feature map.
    # The receptive field of each feature-map point is a 12 * 12 patch on P1.
    # We build a 12 * 12 candidate box at each point, keep only the points whose
    # face confidence exceeds the threshold, and divide by scale to map back to the original image.
    # The return value is an n * 9 array: the first four columns are the box coordinates,
    # the fifth the confidence, and the last four the predicted offsets
    boundingbox = np.vstack([np.round((stride * t_index[1]) / scale),
                             np.round((stride * t_index[0]) / scale),
                             np.round((stride * t_index[1] + cellsize) / scale),
                             np.round((stride * t_index[0] + cellsize) / scale),
                             score,
                             reg])
    return boundingbox.T
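A worked example of this coordinate mapping (made-up numbers): suppose scale = 0.5 and the face confidence at feature-map position row 3, column 5 exceeds the threshold.

stride, cellsize, scale = 2, 12, 0.5
row, col = 3, 5                                # t_index[0][k], t_index[1][k]
x1 = round((stride * col) / scale)             # (2*5) / 0.5      = 20
y1 = round((stride * row) / scale)             # (2*3) / 0.5      = 12
x2 = round((stride * col + cellsize) / scale)  # (2*5 + 12) / 0.5 = 44
y2 = round((stride * row + cellsize) / scale)  # (2*3 + 12) / 0.5 = 36
# the 12*12 window on the scaled image becomes a 24*24 box (cellsize / scale) on the original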
That completes the analysis of how PNet's candidate boxes are obtained.
Next, the save_hard_example() method: it saves the positive / negative / part samples and writes their label files. It is straightforward, so I won't analyze it line by line.
def save_hard_example(net, data, save_path):
    # load ground truth from annotation file
    # format of each line: image/path [x1, y1, x2, y2] for each gt_box in this image
    im_idx_list = data['images']
    # print(images[0])
    gt_boxes_list = data['bboxes']
    num_of_images = len(im_idx_list)
    print("processing %d images in total" % num_of_images)
    # save files
    neg_label_file = "%d/neg_%d.txt" % (net, net)
    neg_file = open(neg_label_file, 'w')
    pos_label_file = "%d/pos_%d.txt" % (net, net)
    pos_file = open(pos_label_file, 'w')
    part_label_file = "%d/part_%d.txt" % (net, net)
    part_file = open(part_label_file, 'w')
    # read detect result
    det_boxes = pickle.load(open(os.path.join(save_path, 'detections.pkl'), 'rb'))
    # print(len(det_boxes), num_of_images)
    assert len(det_boxes) == num_of_images, "incorrect detections or ground truths"
    # index of neg, pos and part face, used as their image names
    n_idx = 0
    p_idx = 0
    d_idx = 0
    image_done = 0
    # im_idx_list image index(list)
    # det_boxes detect result(list)
    # gt_boxes_list gt(list)
    for im_idx, dets, gts in zip(im_idx_list, det_boxes, gt_boxes_list):
        gts = np.array(gts, dtype=np.float32).reshape(-1, 4)
        if image_done % 100 == 0:
            print("%d images done" % image_done)
        image_done += 1
        if dets.shape[0] == 0:
            continue
        img = cv2.imread(im_idx)
        # change to square
        dets = convert_to_square(dets)
        dets[:, 0:4] = np.round(dets[:, 0:4])
        neg_num = 0
        for box in dets:
            x_left, y_top, x_right, y_bottom, _ = box.astype(int)
            width = x_right - x_left + 1
            height = y_bottom - y_top + 1
            # ignore box that is too small or beyond image border
            if width < 20 or x_left < 0 or y_top < 0 or x_right > img.shape[1] - 1 or y_bottom > img.shape[0] - 1:
                continue
            # compute intersection over union (IoU) between current box and all gt boxes
            Iou = IoU(box, gts)
            cropped_im = img[y_top:y_bottom + 1, x_left:x_right + 1, :]
            resized_im = cv2.resize(cropped_im, (image_size, image_size),
                                    interpolation=cv2.INTER_LINEAR)
            # save negative images and write label
            # IoU with all gts must be below 0.3
            if np.max(Iou) < 0.3 and neg_num < 60:
                # save the examples
                save_file = get_path(neg_dir, "%s.jpg" % n_idx)
                # print(save_file)
                neg_file.write(save_file + ' 0\n')
                cv2.imwrite(save_file, resized_im)
                n_idx += 1
                neg_num += 1
            else:
                # find gt_box with the highest iou
                idx = np.argmax(Iou)
                assigned_gt = gts[idx]
                x1, y1, x2, y2 = assigned_gt
                # compute bbox reg label
                offset_x1 = (x1 - x_left) / float(width)
                offset_y1 = (y1 - y_top) / float(height)
                offset_x2 = (x2 - x_right) / float(width)
                offset_y2 = (y2 - y_bottom) / float(height)
                # save positive and part-face images and write labels
                if np.max(Iou) >= 0.65:
                    save_file = get_path(pos_dir, "%s.jpg" % p_idx)
                    pos_file.write(save_file + ' 1 %.2f %.2f %.2f %.2f\n' % (
                        offset_x1, offset_y1, offset_x2, offset_y2))
                    cv2.imwrite(save_file, resized_im)
                    p_idx += 1
                elif np.max(Iou) >= 0.4:
                    save_file = os.path.join(part_dir, "%s.jpg" % d_idx)
                    part_file.write(save_file + ' -1 %.2f %.2f %.2f %.2f\n' % (
                        offset_x1, offset_y1, offset_x2, offset_y2))
                    cv2.imwrite(save_file, resized_im)
                    d_idx += 1
    neg_file.close()
    part_file.close()
    pos_file.close()
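To summarize the labeling rule implemented above (label_for is a hypothetical helper of mine; the thresholds are taken from the code):

def label_for(iou_max):
    """Map a candidate box's highest IoU against the gt boxes to its sample label."""
    if iou_max >= 0.65:
        return 1     # positive: label 1 plus bbox-regression offsets
    if iou_max >= 0.4:
        return -1    # part face: label -1 plus bbox-regression offsets
    if iou_max < 0.3:
        return 0     # negative: label 0 (at most 60 per image in the code above)
    return None      # 0.3 <= IoU < 0.4: the candidate is simply discarded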
By this point you should have a clear picture of how MTCNN works. The subsequent steps for ONet and RNet are essentially the same, so I won't expand on them here.