实习的时候,主要是做物体检测,需要将XML文件划分进训练集(train)、验证集(val)和测试集(test)。
于是mentor给了个任务,要求写一个Python脚本实现对XML文件的随机划分,具体要求如下:
1.新建两个文件夹test和trainval,其中test文件夹中保留划分进测试集的XML文件,并在其中新建一个txt文档记录文件名;trainval文件夹中保留划分进训练集和验证集的XML文件,并在其中新建一个txt文档记录文件名。
2.在trainval文件夹中新建两个文件夹train和val,其中train文件夹中保留trainval文件夹中被划分进训练集的XML文件,并在其中新建一个txt文档记录文件名;val文件夹中保留trainval文件夹中被划分进验证集的XML文件,并在其中新建一个txt文档记录文件名。
3.要求对文件进行随机划分。
4.要求test文件夹中的文件数目为总文件数目的1/2,train文件夹中的文件数目为总文件数目的1/4,val文件夹中的文件数目为总文件数目的1/4。
即,总文件 = test文件夹中的文件 + trainval文件夹中的文件,二者各占1/2。
其中,trainval文件夹中的文件 = train文件夹中的文件 + val文件夹中的文件,二者各占1/2。
我用了两个小时完成该脚本,代码如下:
#coding:utf-8
import os
import random
import shutil
import sys
import getopt
def _print_usage(prefix_error):
    """Print the usage text; *prefix_error* selects the error-style wording."""
    if prefix_error:
        print('Error: fileSeperate.py -d <targetDir>')
        print(' or: fileSeperate.py --targetDir=<targetDir>')
    else:
        print("fileSeperate.py -d <targetDir>")
        print("or:fileSeperate.py --targetDir=<targetDir>")


def _ensure_dir(path):
    """Create directory *path* unless it is already there."""
    if not os.path.isdir(path):
        os.mkdir(path)


def _list_xml_files(directory):
    """Return the names of the regular ``*.xml`` files directly in *directory*."""
    return [
        name for name in os.listdir(directory)
        if os.path.splitext(name)[1].lower() == ".xml"
        and os.path.isfile(os.path.join(directory, name))
    ]


def _split_in_half(names):
    """Randomly split *names* into (picked, rest); picked has len(names) // 2 items."""
    picked = set(random.sample(names, len(names) // 2))
    first = [name for name in names if name in picked]
    second = [name for name in names if name not in picked]
    return first, second


def _copy_and_index(names, src_dir, dst_dir, index_name):
    """Copy *names* from *src_dir* to *dst_dir* and record their stems in *index_name*."""
    # Append mode is kept from the original script, so a re-run adds to an
    # existing index instead of truncating it.  The handle is opened once
    # and always closed, unlike the original per-iteration open().
    with open(os.path.join(dst_dir, index_name), "a") as index_file:
        for name in names:
            shutil.copy(os.path.join(src_dir, name), dst_dir)
            index_file.write(os.path.splitext(name)[0])
            index_file.write("\n")


def main(argv):
    """Randomly split the XML files of a directory into test/train/val sets.

    Inside the target directory this creates ``test`` and ``trainval``
    (and ``trainval/train`` and ``trainval/val``), copies the XML files
    into them and writes one index file per folder (``test.txt``,
    ``trainval.txt``, ``train.txt``, ``val.txt``) listing the file names
    without extension, one per line.

    Half of the XML files (rounded down) go to ``test``; the rest go to
    ``trainval``.  Half of the ``trainval`` files (rounded down) go to
    ``train`` and the remainder to ``val``.

    Args:
        argv: Command-line arguments without the program name.  Accepts
            ``-d <dir>``, ``--targetDir=<dir>`` or ``--dir=<dir>``, and
            ``-h`` for help.

    Raises:
        SystemExit: With status 2 on a bad or missing option or when the
            target is not a directory; with status 0 after ``-h``.
    """
    try:
        # Both long spellings are registered: the usage text advertises
        # --targetDir while the original option table only knew --dir.
        opts, _ = getopt.getopt(argv, "hd:", ["dir=", "targetDir="])
    except getopt.GetoptError:
        _print_usage(True)
        sys.exit(2)

    root = None
    for opt, arg in opts:
        if opt == "-h":
            _print_usage(False)
            sys.exit()
        elif opt in ("-d", "--dir", "--targetDir"):
            root = arg

    # Without a target directory there is nothing to do; fail with the
    # usage text instead of a NameError further down.
    if root is None:
        _print_usage(True)
        sys.exit(2)
    if not os.path.isdir(root):
        print("Error: not a directory: " + root)
        sys.exit(2)

    # Create the output folders under the target directory (not the
    # current working directory).
    trainval = os.path.join(root, "trainval")
    test = os.path.join(root, "test")
    _ensure_dir(trainval)
    _ensure_dir(test)

    # Only XML files take part in the split, so unrelated files in the
    # directory can no longer skew the proportions or stall the selection.
    all_xml_list = _list_xml_files(root)
    test_list, trainval_list = _split_in_half(all_xml_list)

    _copy_and_index(test_list, root, test, "test.txt")
    _copy_and_index(trainval_list, root, trainval, "trainval.txt")

    # Second stage: split the trainval files again into train and val.
    train = os.path.join(trainval, "train")
    val = os.path.join(trainval, "val")
    _ensure_dir(train)
    _ensure_dir(val)

    train_list, val_list = _split_in_half(trainval_list)
    _copy_and_index(train_list, trainval, train, "train.txt")
    _copy_and_index(val_list, trainval, val, "val.txt")
# Script entry point: hand the command-line arguments (without the program
# name) to main() when the file is executed directly.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)