Merge the generated PNet data (combine the face and facial-landmark sample lists into a single file)
import numpy as np
import numpy.random as npr
import os

data_dir = 'E:/MTCNN'
#anno_file = os.path.join(data_dir, "anno.txt")
size = 12  # 12 -> PNet, 24 -> RNet, 48 -> ONet
if size == 12:
    net = "PNet"
elif size == 24:
    net = "RNet"
elif size == 48:
    net = "ONet"
# Open the three bounding-box sample lists (positive, negative, part faces)
# under data_dir, plus the augmented landmark sample list
with open(os.path.join(data_dir, '%s/pos_%s.txt' % (size, size)), 'r') as f:
    pos = f.readlines()
with open(os.path.join(data_dir, '%s/neg_%s.txt' % (size, size)), 'r') as f:
    neg = f.readlines()
with open(os.path.join(data_dir, '%s/part_%s.txt' % (size, size)), 'r') as f:
    part = f.readlines()
with open(os.path.join(data_dir, '%s/landmark_%s_aug.txt' % (size, size)), 'r') as f:
    landmark = f.readlines()
# Create the output directory imglists/<net> if it does not exist yet
dir_path = os.path.join(data_dir, 'imglists')
if not os.path.exists(dir_path):
    os.makedirs(dir_path)
if not os.path.exists(os.path.join(dir_path, "%s" % (net))):
    os.makedirs(os.path.join(dir_path, "%s" % (net)))
# Write the merged list to imglists/PNet/train_PNet_landmark.txt
with open(os.path.join(dir_path, "%s" % (net), "train_%s_landmark.txt" % (net)), "w") as f:
    nums = [len(neg), len(pos), len(part)]
    ratio = [3, 1, 1]  # target neg : pos : part ratio
    #base_num = min(nums)
    base_num = 250000
    # Print how many samples of each kind are available
    print(len(neg), len(pos), len(part), base_num)
    # Shuffle the order of the initial data.
    # If there are more than 750k negative samples, keep only 750k of them;
    # otherwise draw as many indices as there are negatives.
    # Note that what is drawn here are indices, not the samples themselves;
    # the indices are used below to reorder and subsample the three lists.
    if len(neg) > base_num * 3:
        neg_keep = npr.choice(len(neg), size=base_num * 3, replace=True)
    else:
        neg_keep = npr.choice(len(neg), size=len(neg), replace=True)
    # Randomly draw base_num indices for the positive samples
    pos_keep = npr.choice(len(pos), size=base_num, replace=True)
    # Randomly draw base_num indices for the part samples
    part_keep = npr.choice(len(part), size=base_num, replace=True)
    # Print how many indices were drawn for each kind
    print(len(neg_keep), len(pos_keep), len(part_keep))
    # Write the samples out in the shuffled index order; replace=True means
    # an index can be drawn more than once, so lines may repeat when a list
    # is shorter than the requested size
    for i in pos_keep:
        f.write(pos[i])
    for i in neg_keep:
        f.write(neg[i])
    for i in part_keep:
        f.write(part[i])
    # The landmark samples keep their original count and order
    for item in landmark:
        f.write(item)
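
After the script runs, a quick way to check that the merged file has the expected composition is to count lines by their label field. The sketch below is a minimal sanity check, not part of the original script; it assumes the usual MTCNN annotation layout, where the second whitespace-separated field of each line is the class label (1 = positive, 0 = negative, -1 = part face, -2 = landmark sample), and it reuses dir_path and net from above.

from collections import Counter

# Hypothetical sanity check: tally lines per label in the merged file.
# Assumes the label is the second whitespace-separated field of each line.
counts = Counter()
with open(os.path.join(dir_path, net, "train_%s_landmark.txt" % net), "r") as check_f:
    for line in check_f:
        counts[line.split()[1]] += 1
print(counts)  # expect roughly a 3:1:1 neg:pos:part ratio plus all landmark lines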