MTCNN Source Code Walkthrough (1) - Network Structure and Loss Functions of PNet/RNet/ONet
MTCNN Source Code Walkthrough (2) - Training PNet and Building Its Dataset
Before diving in, let's first look at the differences between training RNet and training PNet:
1 When training PNet, the candidate boxes are randomly generated.
2 Before training RNet, we need the trained PNet to predict candidate boxes (keeping those whose confidence exceeds a threshold), and then compute the offsets between PNet's predicted boxes and the gt boxes.
3 Each image is turned into an image pyramid. The concrete implementation follows:
# Compute the initial scale; it depends on min_face_size, which is 24 in the code, so the initial scale = 12/24 = 0.5
current_scale = float(net_size) / self.min_face_size  # find initial scale
pass
while min(current_height, current_width) > net_size:
    pass
    # This line implements the image pyramid: the image is scaled down by a fixed ratio each iteration
    # scale_factor is typically between 0.7 and 0.8
    current_scale *= self.scale_factor
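To build intuition, here is a standalone sketch (illustrative numbers, not taken from the repo) that prints the scale sequence this loop produces for a 640*480 image with min_face_size = 24 and scale_factor = 0.79:

net_size, min_face_size, scale_factor = 12, 24, 0.79
height, width = 480, 640
current_scale = float(net_size) / min_face_size          # 0.5
while min(height * current_scale, width * current_scale) > net_size:
    print(round(current_scale, 4))                       # 0.5, 0.395, 0.312, ...
    current_scale *= scale_factor

Each scale lets the fixed 12*12 PNet window match faces of a different absolute size in the original image.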
Training RNet then consists of the following steps:
1 Run gen_hard_example.py to generate training data (face detection part) for RNet.
2 Run gen_landmark_aug_24.py to generate training data (face landmark detection part) for RNet.
3 Run gen_imglist_rnet.py to merge the two parts of training data.
4 Run gen_RNet_tfrecords.py to generate the TFRecord files.
The last three steps are essentially the same as for PNet, so I won't expand on them here; the focus of this post is gen_hard_example.py.
Next, let's analyze gen_hard_example.py, which uses the trained PNet to generate the face-detection training data for RNet.
def t_net(prefix, epoch,
          batch_size, test_mode="PNet",
          thresh=[0.6, 0.6, 0.7], min_face_size=25,
          stride=2, slide_window=False, shuffle=False, vis=False):
    detectors = [None, None, None]
    print("Test model: ", test_mode)
    # PNet-echo
    model_path = ['%s-%s' % (x, y) for x, y in zip(prefix, epoch)]
    print(model_path[0])
    # load pnet model
    # Load the PNet trained in the previous step and use it for inference
    if slide_window:
        PNet = Detector(P_Net, 12, batch_size[0], model_path[0])
    else:
        PNet = FcnDetector(P_Net, model_path[0])
    detectors[0] = PNet
    # load rnet model
    if test_mode in ["RNet", "ONet"]:
        print("==================================", test_mode)
        # Load RNet; when generating data for RNet (test_mode == "PNet") this branch is skipped
        RNet = Detector(R_Net, 24, batch_size[1], model_path[1])
        detectors[1] = RNet
    # load onet model
    if test_mode == "ONet":
        print("==================================", test_mode)
        ONet = Detector(O_Net, 48, batch_size[2], model_path[2])
        detectors[2] = ONet
    basedir = '.'
    # anno_file
    filename = './wider_face_train_bbx_gt.txt'
    # read annotation (type: dict)
    # Simply read the images and boxes of the dataset:
    # data['images'] holds the image paths and data['bboxes'] the gt boxes;
    # each image may correspond to multiple gt boxes
    data = read_annotation(basedir, filename)
    # Build the wrapper class used for inference
    mtcnn_detector = MtcnnDetector(detectors=detectors, min_face_size=min_face_size,
                                   stride=stride, threshold=thresh, slide_window=slide_window)
    print("==================================")
    # Note that this runs in "test" mode
    # imdb = IMDB("wider", image_set, root_path, dataset_path, 'test')
    # gt_imdb = imdb.gt_imdb()
    test_data = TestLoader(data['images'])
    # list
    # Predict the candidate boxes for every image; the result has the same length
    # as data, with multiple candidate boxes per image
    detections, _ = mtcnn_detector.detect_face(test_data)
    save_net = 'RNet'
    if test_mode == "PNet":
        save_net = "RNet"
    elif test_mode == "RNet":
        save_net = "ONet"
    # save detect result
    save_path = os.path.join(data_dir, save_net)
    print(save_path)
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    # Save the candidate boxes
    save_file = os.path.join(save_path, "detections.pkl")
    with open(save_file, 'wb') as f:
        pickle.dump(detections, f, 1)
    print("%s detection done, start OHEM" % image_size)
    # Build the training dataset and save it
    save_hard_example(image_size, data, save_path)
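As a quick sanity check, the saved pickle can be inspected directly (a minimal sketch; the path assumes save_net == 'RNet' and data_dir == '.'):

import pickle
with open('./RNet/detections.pkl', 'rb') as f:
    detections = pickle.load(f)
print(len(detections))      # one entry per training image
print(detections[0].shape)  # the candidate boxes of the first image, one box per row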
Next, let's analyze the two methods used above:
mtcnn_detector.detect_face(test_data)
save_hard_example(image_size, data, save_path)
def detect_face(self, test_data):
    all_boxes = []  # save each image's bboxes
    landmarks = []
    batch_idx = 0
    sum_time = 0
    # test_data is iter_
    # Iterate over the dataset
    for databatch in test_data:
        # databatch (image returned)
        if batch_idx % 100 == 0:
            print("%d images done" % batch_idx)
        im = databatch
        # Run PNet inference
        # pnet
        t1 = 0
        if self.pnet_detector:
            t = time.time()
            # ignore landmark
            # The actual inference, analyzed in detail below;
            # the image pyramid is built inside this method
            boxes, boxes_c, landmark = self.detect_pnet(im)
            t1 = time.time() - t
            sum_time += t1
            if boxes_c is None:
                print("boxes_c is None...")
                all_boxes.append(np.array([]))
                # pay attention
                landmarks.append(np.array([]))
                batch_idx += 1
                continue
        # When generating data for RNet, the RNet and ONet branches below are not executed
        # rnet
        t2 = 0
        if self.rnet_detector:
            t = time.time()
            # ignore landmark
            boxes, boxes_c, landmark = self.detect_rnet(im, boxes_c)
            t2 = time.time() - t
            sum_time += t2
            if boxes_c is None:
                all_boxes.append(np.array([]))
                landmarks.append(np.array([]))
                batch_idx += 1
                continue
        # onet
        t3 = 0
        if self.onet_detector:
            t = time.time()
            boxes, boxes_c, landmark = self.detect_onet(im, boxes_c)
            t3 = time.time() - t
            sum_time += t3
            if boxes_c is None:
                all_boxes.append(np.array([]))
                landmarks.append(np.array([]))
                batch_idx += 1
                continue
        print(
            "time cost " + '{:.3f}'.format(sum_time) + ' pnet {:.3f} rnet {:.3f} onet {:.3f}'.format(t1, t2, t3))
        # Collect the candidate boxes inferred above
        all_boxes.append(boxes_c)
        landmarks.append(landmark)
        batch_idx += 1
    # num_of_data*9, num_of_data*10
    return all_boxes, landmarks
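For the PNet-only stage analyzed here, the return value can be consumed like this (per the boxes_c construction shown in detect_pnet below, each non-empty entry is an n x 5 array):

all_boxes, landmarks = mtcnn_detector.detect_face(test_data)
# all_boxes[i] is an empty array when nothing was detected in image i,
# otherwise an n_i x 5 array of calibrated boxes (x1, y1, x2, y2, score)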
Next, let's analyze detect_pnet():
def detect_pnet(self, im):
    """Get face candidates through pnet

    Parameters:
    ----------
    im: numpy array
        input image array
    Returns:
    -------
    boxes: numpy array
        detected boxes before calibration
    boxes_c: numpy array
        boxes after calibration
    """
    h, w, c = im.shape
    net_size = 12
    # Compute the initial scaling factor
    current_scale = float(net_size) / self.min_face_size  # find initial scale
    # print("current_scale", net_size, self.min_face_size, current_scale)
    im_resized = self.processed_image(im, current_scale)
    current_height, current_width, _ = im_resized.shape
    # fcn
    # Collect the boxes from all pyramid levels
    all_boxes = list()
    # Loop until the scaled image is no larger than 12 on its shorter side
    while min(current_height, current_width) > net_size:
        # return the result predicted by pnet
        # cls_cls_map : H*W*2
        # reg: H*W*4
        # Run PNet and get cls_prob and bbox_pred
        cls_cls_map, reg = self.pnet_detector.predict(im_resized)
        # boxes: num*9 (x1, y1, x2, y2, score, x1_offset, y1_offset, x2_offset, y2_offset)
        # Turn the predictions into boxes; analyzed in detail below
        boxes = self.generate_bbox(cls_cls_map[:, :, 1], reg, current_scale, self.thresh[0])
        # These lines implement the image pyramid: scale the image down by a fixed ratio each iteration
        current_scale *= self.scale_factor
        im_resized = self.processed_image(im, current_scale)
        current_height, current_width, _ = im_resized.shape
        if boxes.size == 0:
            continue
        # Non-maximum suppression within this pyramid level
        keep = py_nms(boxes[:, :5], 0.5, 'Union')
        boxes = boxes[keep]
        all_boxes.append(boxes)
    if len(all_boxes) == 0:
        return None, None, None
    all_boxes = np.vstack(all_boxes)
    # merge the detection from first stage
    # Another round of NMS across all pyramid levels, with IoU threshold 0.7
    keep = py_nms(all_boxes[:, 0:5], 0.7, 'Union')
    all_boxes = all_boxes[keep]
    # The final candidate boxes
    boxes = all_boxes[:, :5]
    # Width and height of the candidate boxes
    bbw = all_boxes[:, 2] - all_boxes[:, 0] + 1
    bbh = all_boxes[:, 3] - all_boxes[:, 1] + 1
    # refine the boxes
    # Calibrate the boxes with the predicted regression values:
    # x1, y1, x2, y2 are each shifted by the predicted offset times the box width/height
    boxes_c = np.vstack([all_boxes[:, 0] + all_boxes[:, 5] * bbw,
                         all_boxes[:, 1] + all_boxes[:, 6] * bbh,
                         all_boxes[:, 2] + all_boxes[:, 7] * bbw,
                         all_boxes[:, 3] + all_boxes[:, 8] * bbh,
                         all_boxes[:, 4]])
    boxes_c = boxes_c.T
    return boxes, boxes_c, None
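To make the calibration step concrete, here is a worked example with made-up numbers:

import numpy as np
# one uncalibrated box: x1, y1, x2, y2, score, followed by the four predicted offsets
box = np.array([50., 40., 149., 139., 0.9, 0.1, -0.05, 0.02, 0.0])
bbw = box[2] - box[0] + 1       # width  = 100
bbh = box[3] - box[1] + 1       # height = 100
x1_c = box[0] + box[5] * bbw    # 50  + 0.10 * 100 = 60
y1_c = box[1] + box[6] * bbh    # 40  - 0.05 * 100 = 35
x2_c = box[2] + box[7] * bbw    # 149 + 0.02 * 100 = 151
y2_c = box[3] + box[8] * bbh    # 139 + 0.00 * 100 = 139

The offsets are fractions of the box width/height, so the same predicted values move a large box further than a small one.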
Let's continue with what generate_bbox() does. A premise for understanding this method: PNet does not produce precise candidate-box locations; it merely proposes a (rough) box wherever the image is likely to contain a face.
def generate_bbox(self, cls_map, reg, scale, threshold):
    """
    generate bbox from feature cls_map

    Parameters:
    ----------
    cls_map: numpy array , n x m
        detect score for each position
    reg: numpy array , n x m x 4
        bbox
    scale: float number
        scale of this detection
    threshold: float number
        detect threshold
    Returns:
    -------
    bbox array
    """
    # Remember the premise: PNet does not give precise box locations,
    # it only proposes a (rough) box wherever a face may be.
    # stride is the effective stride of PNet's output feature map
    stride = 2
    # stride = 4
    # cellsize = 12 corresponds to a 12 * 12 window
    cellsize = 12
    # cellsize = 25
    # Find the positions in the output map whose face confidence exceeds the threshold.
    # During training PNet takes a 12 * 12 * 3 input and outputs 1 * 1 * 2;
    # at inference time the image is larger than 12 * 12, so the output is H * W * 2.
    # Here cls_map = pre_cls_map[:, :, 1] keeps only the face confidence, so cls_map is H * W.
    # Think of it as sliding a 12 * 12 window over the image, each window predicting
    # face confidence / bbox regression / landmarks. Each window corresponds to one point
    # of the H * W feature map produced by the fully convolutional PNet; each point is the
    # prediction for one 12 * 12 window on the (scaled) input image.
    # np.where returns two arrays: the first holds the indices along H, the second along W
    t_index = np.where(cls_map > threshold)
    # find nothing
    if t_index[0].size == 0:
        return np.array([])
    # offset
    # The predicted regression offsets
    dx1, dy1, dx2, dy2 = [reg[t_index[0], t_index[1], i] for i in range(4)]
    reg = np.array([dx1, dy1, dx2, dy2])
    # The confidence scores
    score = cls_map[t_index[0], t_index[1]]
    # Here we recover the candidate boxes. What follows is my personal understanding;
    # judge for yourself, and leave a comment if you have a better explanation.
    # First ignore the stride:
    #   x1, y1, x2, y2 = t_index[1], t_index[0], t_index[1] + 12, t_index[0] + 12
    # i.e. at every output position whose face confidence exceeds the threshold,
    # place a 12 * 12 box. Multiplying the indices by stride converts feature-map
    # coordinates into coordinates on the scaled image, and dividing by scale maps
    # them back onto the original image.
    # Example: the original image P of size H * W is scaled to P1 of size
    # (H * scale, W * scale), and PNet turns P1 into an (h, w) feature map.
    # The receptive field of each feature-map point is a 12 * 12 patch on P1.
    # We build a 12 * 12 candidate box at each point, keep only the points whose
    # face confidence exceeds the threshold, and divide by scale to map back to the original image.
    # The return value is an n * 9 array: the first four columns are the box coordinates,
    # the fifth the confidence, and the last four the predicted offsets
    boundingbox = np.vstack([np.round((stride * t_index[1]) / scale),
                             np.round((stride * t_index[0]) / scale),
                             np.round((stride * t_index[1] + cellsize) / scale),
                             np.round((stride * t_index[0] + cellsize) / scale),
                             score,
                             reg])
    return boundingbox.T
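A worked example of this coordinate mapping (made-up numbers): suppose scale = 0.5 and the face confidence at feature-map position row 3, column 5 exceeds the threshold.

stride, cellsize, scale = 2, 12, 0.5
row, col = 3, 5                                # t_index[0][k], t_index[1][k]
x1 = round((stride * col) / scale)             # (2*5) / 0.5      = 20
y1 = round((stride * row) / scale)             # (2*3) / 0.5      = 12
x2 = round((stride * col + cellsize) / scale)  # (2*5 + 12) / 0.5 = 44
y2 = round((stride * row + cellsize) / scale)  # (2*3 + 12) / 0.5 = 36
# the 12*12 window on the scaled image becomes a 24*24 box (cellsize / scale) on the original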
That completes the analysis of how PNet's candidate boxes are obtained.
Next, the save_hard_example() method: it saves the positive / negative / part samples and writes their label files. It is straightforward, so I won't analyze it line by line.
def save_hard_example(net, data, save_path):
    # load ground truth from annotation file
    # format of each line: image/path [x1, y1, x2, y2] for each gt_box in this image
    im_idx_list = data['images']
    # print(images[0])
    gt_boxes_list = data['bboxes']
    num_of_images = len(im_idx_list)
    print("processing %d images in total" % num_of_images)
    # save files
    neg_label_file = "%d/neg_%d.txt" % (net, net)
    neg_file = open(neg_label_file, 'w')
    pos_label_file = "%d/pos_%d.txt" % (net, net)
    pos_file = open(pos_label_file, 'w')
    part_label_file = "%d/part_%d.txt" % (net, net)
    part_file = open(part_label_file, 'w')
    # read detect result
    det_boxes = pickle.load(open(os.path.join(save_path, 'detections.pkl'), 'rb'))
    # print(len(det_boxes), num_of_images)
    assert len(det_boxes) == num_of_images, "incorrect detections or ground truths"
    # index of neg, pos and part face, used as their image names
    n_idx = 0
    p_idx = 0
    d_idx = 0
    image_done = 0
    # im_idx_list image index(list)
    # det_boxes detect result(list)
    # gt_boxes_list gt(list)
    for im_idx, dets, gts in zip(im_idx_list, det_boxes, gt_boxes_list):
        gts = np.array(gts, dtype=np.float32).reshape(-1, 4)
        if image_done % 100 == 0:
            print("%d images done" % image_done)
        image_done += 1
        if dets.shape[0] == 0:
            continue
        img = cv2.imread(im_idx)
        # change to square
        dets = convert_to_square(dets)
        dets[:, 0:4] = np.round(dets[:, 0:4])
        neg_num = 0
        for box in dets:
            x_left, y_top, x_right, y_bottom, _ = box.astype(int)
            width = x_right - x_left + 1
            height = y_bottom - y_top + 1
            # ignore box that is too small or beyond image border
            if width < 20 or x_left < 0 or y_top < 0 or x_right > img.shape[1] - 1 or y_bottom > img.shape[0] - 1:
                continue
            # compute intersection over union (IoU) between current box and all gt boxes
            Iou = IoU(box, gts)
            cropped_im = img[y_top:y_bottom + 1, x_left:x_right + 1, :]
            resized_im = cv2.resize(cropped_im, (image_size, image_size),
                                    interpolation=cv2.INTER_LINEAR)
            # save negative images and write label
            # IoU with all gts must be below 0.3
            if np.max(Iou) < 0.3 and neg_num < 60:
                # save the examples
                save_file = get_path(neg_dir, "%s.jpg" % n_idx)
                # print(save_file)
                neg_file.write(save_file + ' 0\n')
                cv2.imwrite(save_file, resized_im)
                n_idx += 1
                neg_num += 1
            else:
                # find gt_box with the highest iou
                idx = np.argmax(Iou)
                assigned_gt = gts[idx]
                x1, y1, x2, y2 = assigned_gt
                # compute bbox reg label
                offset_x1 = (x1 - x_left) / float(width)
                offset_y1 = (y1 - y_top) / float(height)
                offset_x2 = (x2 - x_right) / float(width)
                offset_y2 = (y2 - y_bottom) / float(height)
                # save positive and part-face images and write labels
                if np.max(Iou) >= 0.65:
                    save_file = get_path(pos_dir, "%s.jpg" % p_idx)
                    pos_file.write(save_file + ' 1 %.2f %.2f %.2f %.2f\n' % (
                        offset_x1, offset_y1, offset_x2, offset_y2))
                    cv2.imwrite(save_file, resized_im)
                    p_idx += 1
                elif np.max(Iou) >= 0.4:
                    save_file = os.path.join(part_dir, "%s.jpg" % d_idx)
                    part_file.write(save_file + ' -1 %.2f %.2f %.2f %.2f\n' % (
                        offset_x1, offset_y1, offset_x2, offset_y2))
                    cv2.imwrite(save_file, resized_im)
                    d_idx += 1
    neg_file.close()
    part_file.close()
    pos_file.close()
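To summarize the labeling rule implemented above (label_for is a hypothetical helper of mine; the thresholds are taken from the code):

def label_for(iou_max):
    """Map a candidate box's highest IoU against the gt boxes to its sample label."""
    if iou_max >= 0.65:
        return 1     # positive: label 1 plus bbox-regression offsets
    if iou_max >= 0.4:
        return -1    # part face: label -1 plus bbox-regression offsets
    if iou_max < 0.3:
        return 0     # negative: label 0 (at most 60 per image in the code above)
    return None      # 0.3 <= IoU < 0.4: the candidate is simply discarded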
By this point you should have a clear picture of how MTCNN works. The subsequent steps for ONet and RNet are essentially the same, so I won't expand on them here.