代码地址 https://github.com/AITTSMD/MTCNN-Tensorflow
MTCNN源码详细解读(1)- PNet/RNet/ONet的网络结构和损失函数
MTCNN源码详细解读(3)- RNet的训练和数据集的构建
这篇博客主要分析PNet的数据准备和训练
从下面几个文件入手:
Run prepare_data/gen_12net_data.py
Run gen_landmark_aug_12.py
Run gen_imglist_pnet.py
Run gen_PNet_tfrecords.py
首先分析 gen_12net_data.py
这个文件主要做了一件事:准备 positive/negative/part 数据,图片对应的 label 分别为 {1, 0, -1}。label 为 1、0 的参与分类损失的计算;label 为 1、-1 的参与边框回归损失的计算,上篇博客在损失函数中已经说明了。
主要代码如下
# Build the negative dataset: random crops whose IoU with every
# ground-truth box stays below 0.3 are saved with label 0.
while neg_num < 50:
    # Random square crop size in [12, min(width, height) / 2);
    # 12 is PNet's input resolution, so saved crops are never upscaled.
    size = npr.randint(12, min(width, height) / 2)
    # Random top-left corner, keeping the crop fully inside the image.
    nx = npr.randint(0, width - size)
    ny = npr.randint(0, height - size)
    crop_box = np.array([nx, ny, nx + size, ny + size])
    # IoU of this crop against every ground-truth box in the image.
    Iou = IoU(crop_box, boxes)
    if np.max(Iou) < 0.3:
        # IoU < 0.3 with all ground truths -> negative, label 0.
        # Crop and resize only here: the original resized every random
        # candidate before the IoU test, wasting work on rejected crops.
        cropped_im = img[ny : ny + size, nx : nx + size, :]
        resized_im = cv2.resize(cropped_im, (12, 12), interpolation=cv2.INTER_LINEAR)
        save_file = os.path.join(neg_save_dir, "%s.jpg"%n_idx)
        f2.write("12/negative/%s.jpg"%n_idx + ' 0\n')
        cv2.imwrite(save_file, resized_im)
        n_idx += 1
        neg_num += 1
# Generate positive (label 1) and part-face (label -1) samples:
# 20 jittered square crops around each ground-truth box.
for i in range(20):
    # Crop size in [0.8 * min(w, h), 1.25 * max(w, h)).
    size = npr.randint(int(min(w, h) * 0.8), np.ceil(1.25 * max(w, h)))
    # Random shift of the crop center by up to +/- 20% of the box size.
    delta_x = npr.randint(-w * 0.2, w * 0.2)
    delta_y = npr.randint(-h * 0.2, h * 0.2)
    # Center the crop on the shifted box center; clamp top-left at 0.
    nx1 = int(max(x1 + w / 2 + delta_x - size / 2, 0))
    ny1 = int(max(y1 + h / 2 + delta_y - size / 2, 0))
    nx2 = nx1 + size
    ny2 = ny1 + size
    # Discard crops that run past the right/bottom image border.
    if nx2 > width or ny2 > height:
        continue
    crop_box = np.array([nx1, ny1, nx2, ny2])
    # Bounding-box regression targets: ground-truth corner offsets
    # normalized by the crop size.
    offset_x1 = (x1 - nx1) / float(size)
    offset_y1 = (y1 - ny1) / float(size)
    offset_x2 = (x2 - nx2) / float(size)
    offset_y2 = (y2 - ny2) / float(size)
    cropped_im = img[ny1 : ny2, nx1 : nx2, :]
    resized_im = cv2.resize(cropped_im, (12, 12), interpolation=cv2.INTER_LINEAR)
    box_ = box.reshape(1, -1)
    # Compute the IoU once (the original evaluated IoU(crop_box, box_)
    # twice, once per branch condition).
    iou = IoU(crop_box, box_)
    if iou >= 0.65:
        # IoU >= 0.65 -> positive, label 1, with regression targets.
        save_file = os.path.join(pos_save_dir, "%s.jpg"%p_idx)
        f1.write("12/positive/%s.jpg"%p_idx + ' 1 %.2f %.2f %.2f %.2f\n'%(offset_x1, offset_y1, offset_x2, offset_y2))
        cv2.imwrite(save_file, resized_im)
        p_idx += 1
    elif iou >= 0.4:
        # 0.4 <= IoU < 0.65 -> part face, label -1, with regression targets.
        save_file = os.path.join(part_save_dir, "%s.jpg"%d_idx)
        f3.write("12/part/%s.jpg"%d_idx + ' -1 %.2f %.2f %.2f %.2f\n'%(offset_x1, offset_y1, offset_x2, offset_y2))
        cv2.imwrite(save_file, resized_im)
        # Counter increment restored; the excerpt was truncated and lost
        # the d_idx bump matching the p_idx one above.
        d_idx += 1
下面分析 gen_landmark_aug_12.py 这个文件主要获取landmark坐标
下面分析主要代码 去掉了augment数据增广的操作
# Landmark samples are appended to this txt file with label -2.
f = open(join(OUTPUT,"landmark_%s_aug.txt" %(size)),'w')
#dstdir = "train_landmark_few"
# getDataFromTxt yields (image path, ground-truth bbox, 5-point landmark).
data = getDataFromTxt(ftxt)
idx = 0
for (imgPath, bbox, landmarkGt) in data:
    F_imgs = []
    F_landmarks = []
    img = cv2.imread(imgPath)
    if img is None:
        # Unreadable image: skip it entirely.
        continue
    img_h, img_w, img_c = img.shape
    gt_box = np.array([bbox.left, bbox.top, bbox.right, bbox.bottom])
    # Crop the face patch and scale it to the PNet input size (12x12).
    f_face = img[bbox.top:bbox.bottom+1, bbox.left:bbox.right+1]
    f_face = cv2.resize(f_face, (size, size))
    landmark = np.zeros((5, 2))
    for index, one in enumerate(landmarkGt):
        # Normalize each landmark relative to the ground-truth box:
        # (left, top) becomes the origin and both coordinates are scaled
        # by the box width/height into [0, 1].
        rv = ((one[0]-gt_box[0])/(gt_box[2]-gt_box[0]), (one[1]-gt_box[1])/(gt_box[3]-gt_box[1]))
        landmark[index] = rv
    # NOTE(review): the augmentation code that fills F_imgs/F_landmarks was
    # removed from this excerpt, so both arrays are empty at this point.
    F_imgs, F_landmarks = np.asarray(F_imgs), np.asarray(F_landmarks)
    for i in range(len(F_imgs)):
        # Drop samples whose normalized landmarks fall outside (0, 1).
        if np.any(F_landmarks[i] <= 0) or np.any(F_landmarks[i] >= 1):
            continue
        # Write image to disk and its label line (-2) plus 10 coordinates.
        cv2.imwrite(join(dstdir,"%d.jpg" %(image_id)), F_imgs[i])
        landmarks = [str(v) for v in list(F_landmarks[i])]
        f.write(join(dstdir,"%d.jpg" %(image_id))+" -2 "+" ".join(landmarks)+"\n")
        image_id = image_id + 1
gen_imglist_pnet.py 这里就是开始构建PNet训练数据集了直接看代码
# Collect the four annotation lists (pos / neg / part / landmark) produced
# by the generation scripts that ran earlier.
def _read_lines(path):
    # Small helper replacing four copy-pasted with-blocks.
    with open(path, 'r') as fh:
        return fh.readlines()

pos = _read_lines(os.path.join(data_dir, '%s/pos_%s.txt' % (size, size)))
neg = _read_lines(os.path.join(data_dir, '%s/neg_%s.txt' % (size, size)))
part = _read_lines(os.path.join(data_dir, '%s/part_%s.txt' % (size, size)))
landmark = _read_lines(os.path.join(data_dir, '%s/landmark_%s_aug.txt' % (size, size)))

dir_path = os.path.join(data_dir, 'imglists')
if not os.path.exists(dir_path):
    os.makedirs(dir_path)
net_dir = os.path.join(dir_path, "%s" % (net))
if not os.path.exists(net_dir):
    os.makedirs(net_dir)
# Merge everything into one training list with a neg:pos:part ratio of 3:1:1.
with open(os.path.join(net_dir, "train_%s_landmark.txt" % (net)), "w") as f:
    nums = [len(neg), len(pos), len(part)]
    ratio = [3, 1, 1]
    #base_num = min(nums)
    base_num = 250000
    print(len(neg), len(pos), len(part), base_num)
    # Negatives are capped at 3 * base_num; pos/part are sampled with
    # replacement up to base_num each (oversampling when scarce).
    if len(neg) > base_num * 3:
        neg_keep = npr.choice(len(neg), size=base_num * 3, replace=True)
    else:
        neg_keep = npr.choice(len(neg), size=len(neg), replace=True)
    pos_keep = npr.choice(len(pos), size=base_num, replace=True)
    part_keep = npr.choice(len(part), size=base_num, replace=True)
    print(len(neg_keep), len(pos_keep), len(part_keep))
    for idx in pos_keep:
        f.write(pos[idx])
    for idx in neg_keep:
        f.write(neg[idx])
    for idx in part_keep:
        f.write(part[idx])
    # Every landmark sample is kept.
    f.writelines(landmark)
gen_PNet_tfrecords.py 进行tfrecord打包
首先看下数据存储结构
def get_dataset(dir, net='PNet'):
    """Parse the PNet image-list file into a list of example dicts.

    Each line of imglists/PNet/train_<net>_landmark.txt is
    "<path> <label> [...]" where the optional tail is either 4
    bbox-regression offsets (pos/neg/part samples, 6 fields total) or
    10 normalized landmark coordinates (landmark samples, 12 fields).

    :param dir: root data directory containing imglists/PNet/.
    :param net: network name used to build the list filename.
    :return: list of dicts with 'filename', 'label' (int) and a 'bbox'
             sub-dict of 14 float fields; fields not present on the
             line stay 0.
    """
    #item = 'imglists/PNet/train_%s_raw.txt' % net
    item = 'imglists/PNet/train_%s_landmark.txt' % net
    dataset_dir = os.path.join(dir, item)
    # Ordered keys of the per-example regression dict: 4 bbox offsets
    # followed by the 10 landmark coordinates.
    bbox_keys = ('xmin', 'ymin', 'xmax', 'ymax',
                 'xlefteye', 'ylefteye', 'xrighteye', 'yrighteye',
                 'xnose', 'ynose',
                 'xleftmouth', 'yleftmouth', 'xrightmouth', 'yrightmouth')
    dataset = []
    # Context manager so the file handle is closed (the original opened
    # the file and never closed it).
    with open(dataset_dir, 'r') as imagelist:
        for line in imagelist:
            info = line.strip().split(' ')
            data_example = dict()
            # Every regression field defaults to 0 and is overwritten
            # below depending on the sample type.
            bbox = dict.fromkeys(bbox_keys, 0)
            data_example['filename'] = info[0]
            data_example['label'] = int(info[1])
            # 6 fields -> pos/neg/part sample: path, label, 4 bbox offsets.
            if len(info) == 6:
                bbox['xmin'] = float(info[2])
                bbox['ymin'] = float(info[3])
                bbox['xmax'] = float(info[4])
                bbox['ymax'] = float(info[5])
            # 12 fields -> landmark sample: path, label, 10 coordinates.
            if len(info) == 12:
                for key, value in zip(bbox_keys[4:], info[2:12]):
                    bbox[key] = float(value)
            data_example['bbox'] = bbox
            dataset.append(data_example)
    return dataset
打包tfrecord的结构为:
# Build a tf.train.Example holding the encoded image bytes, the class
# label, the 4 bbox-regression targets (roi) and the 10 landmark targets,
# ready to be serialized into the tfrecord file.
example = tf.train.Example(features=tf.train.Features(feature={
'image/encoded': _bytes_feature(image_buffer),
'image/label': _int64_feature(class_label),
'image/roi': _float_feature(roi),
'image/landmark': _float_feature(landmark)
}))
最后分析下PNet的训练过程
train_PNet.py 调用 train.py 里面的train()方法
def train(net_factory, prefix, end_epoch, base_dir,
display=200, base_lr=0.01):
"""
Train PNet/RNet/ONet.
:param net_factory: builds the network graph and returns the loss ops
:param prefix: checkpoint path prefix; its last component names the net
:param end_epoch: number of epochs to train
:param base_dir: directory holding the label txt and tfrecord files
:param display: logging interval in steps
:param base_lr: initial learning rate
:return:
"""
# The net name (PNet/RNet/ONet) is the last component of the prefix path.
net = prefix.split('/')[-1]
#label file
label_file = os.path.join(base_dir,'train_%s_landmark.txt' % net)
#label_file = os.path.join(base_dir,'landmark_12_few.txt')
# NOTE(review): this file handle is never closed; a with-block would fix it.
f = open(label_file, 'r')
# Number of training samples = number of lines in the label file.
num = len(f.readlines())
print("Total datasets is: ", num)
#PNet use this method to get data
if net == 'PNet':
#dataset_dir = os.path.join(base_dir,'train_%s_ALL.tfrecord_shuffle' % net)
dataset_dir = os.path.join(base_dir,'train_%s_landmark.tfrecord_shuffle' % net)
# Read batched (image, label, bbox, landmark) tensors from the tfrecord.
image_batch, label_batch, bbox_batch,landmark_batch = read_single_tfrecord(dataset_dir, config.BATCH_SIZE, net)
pass
#landmark_dir
# Per-net input size and loss weights; only ONet raises the landmark
# loss weight to 1.0.
if net == 'PNet':
image_size = 12
radio_cls_loss = 1.0;radio_bbox_loss = 0.5;radio_landmark_loss = 0.5;
elif net == 'RNet':
image_size = 24
radio_cls_loss = 1.0;radio_bbox_loss = 0.5;radio_landmark_loss = 0.5;
else:
radio_cls_loss = 1.0;radio_bbox_loss = 0.5;radio_landmark_loss = 1.0;
image_size = 48
#define placeholder
# Placeholders feeding each training step of the network.
input_image = tf.placeholder(tf.float32, shape=[config.BATCH_SIZE, image_size, image_size, 3], name='input_image')
label = tf.placeholder(tf.float32, shape=[config.BATCH_SIZE], name='label')
bbox_target = tf.placeholder(tf.float32, shape=[config.BATCH_SIZE, 4], name='bbox_target')
landmark_target = tf.placeholder(tf.float32,shape=[config.BATCH_SIZE,10],name='landmark_target')
#class,regression
# Build the network; get the classification, bbox, landmark and L2
# loss ops plus the classification accuracy op.
cls_loss_op,bbox_loss_op,landmark_loss_op,L2_loss_op,accuracy_op = net_factory(input_image, label, bbox_target,landmark_target,training=True)
#train,update learning rate(3 loss)
# Total loss is the weighted sum of the three task losses (weights set
# above) plus L2 regularization.
train_op, lr_op = train_model(base_lr, radio_cls_loss*cls_loss_op + radio_bbox_loss*bbox_loss_op + radio_landmark_loss*landmark_loss_op + L2_loss_op, num)
# init
init = tf.global_variables_initializer()
sess = tf.Session()
#save model
saver = tf.train.Saver(max_to_keep=0)
sess.run(init)
#visualize some variables
tf.summary.scalar("cls_loss",cls_loss_op)#cls_loss
tf.summary.scalar("bbox_loss",bbox_loss_op)#bbox_loss
tf.summary.scalar("landmark_loss",landmark_loss_op)#landmark_loss
tf.summary.scalar("cls_accuracy",accuracy_op)#cls_acc
summary_op = tf.summary.merge_all()
logs_dir = "../logs/%s" %(net)
if os.path.exists(logs_dir) == False:
os.mkdir(logs_dir)
writer = tf.summary.FileWriter(logs_dir,sess.graph)
#begin
coord = tf.train.Coordinator()
#begin enqueue thread
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
i = 0
#total steps
MAX_STEP = int(num / config.BATCH_SIZE + 1) * end_epoch
print("Train steps: %d" % MAX_STEP)
epoch = 0
sess.graph.finalize()
try:
for step in range(MAX_STEP):
i = i + 1
if coord.should_stop():
break
# Pull the next training batch from the tfrecord queue.
image_batch_array, label_batch_array, bbox_batch_array,landmark_batch_array = sess.run([image_batch, label_batch, bbox_batch,landmark_batch])
#random flip
# Randomly flip images (and mirror their landmarks) for augmentation.
image_batch_array,landmark_batch_array = random_flip_images(image_batch_array,label_batch_array,landmark_batch_array)
'''
print image_batch_array.shape
print label_batch_array.shape
print bbox_batch_array.shape
print landmark_batch_array.shape
print label_batch_array[0]
print bbox_batch_array[0]
print landmark_batch_array[0]
'''
# Run one optimization step (backprop) with the fed batch.
_,_,summary = sess.run([train_op, lr_op ,summary_op], feed_dict={input_image: image_batch_array, label: label_batch_array, bbox_target: bbox_batch_array,landmark_target:landmark_batch_array})
# NOTE(review): the excerpt ends here; the original function's except/
# finally blocks and periodic logging/checkpointing are omitted.
现在看到这我相信对应PNet网络的训练已经很清楚了