数据集划分是炼丹的准备工作。附上一次性划分的步骤。
Step1: 获取所有文件名
ls >a.txt
注意:生成的 a.txt 中也会包含 a.txt 自己的文件名,需要手动把这一行删掉。
Step2:按照比例划分
这里使用的是 train:test:val = 7:2:1 的比例划分:按行号(从 1 开始)取模分类,行号 mod 10 == 1 的归入 val,行号 mod 5 == 0 的归入 test,其余归入 train。
# Split the file names listed in a.txt into train/test/val lists (7:2:1).


def split_dataset(list_path="a.txt", train_path="train.txt",
                  test_path="test.txt", val_path="val.txt"):
    """Distribute the lines of *list_path* over three split files.

    Lines are numbered from 1. Within every run of ten lines, the line
    whose number satisfies ``n % 10 == 1`` goes to the validation file,
    lines with ``n % 5 == 0`` go to the test file, and the remaining seven
    go to the training file, giving train:test:val = 7:2:1.

    Args:
        list_path: Text file with one entry per line (UTF-8).
        train_path: Output file for the training entries.
        test_path: Output file for the test entries.
        val_path: Output file for the validation entries.

    Returns:
        A dict mapping ``"train"``, ``"test"`` and ``"val"`` to the number
        of lines written to each file.

    Raises:
        FileNotFoundError: If *list_path* does not exist. The input is
            opened first, so existing output files are left untouched in
            that case.
    """
    sizes = {"train": 0, "test": 0, "val": 0}
    # Mode "w" empties any previous content, so no separate truncate step is
    # needed. Every file is opened once, with the same UTF-8 encoding as the
    # input, so non-ASCII names survive regardless of the locale default.
    with open(list_path, encoding="utf-8") as f, \
            open(val_path, "w", encoding="utf-8") as f_val, \
            open(test_path, "w", encoding="utf-8") as f_test, \
            open(train_path, "w", encoding="utf-8") as f_train:
        outputs = {"train": f_train, "test": f_test, "val": f_val}
        for count, line in enumerate(f, start=1):
            if count % 10 == 1:
                split = "val"
            elif count % 5 == 0:
                split = "test"
            else:
                split = "train"
            outputs[split].write(line)
            sizes[split] += 1
    return sizes


if __name__ == "__main__":
    split_dataset()
结果: