自从入坑深度学习,一直都是用现有的数据集进行训练网络,今天想自己制作自己的数据集,因此将收集到的图片进行数据集制作。
我的图片是人眼图片,平均每个人有40张。命名规则是:同一个人的图片名称前缀(第一个下划线之前的部分)相同,后面的编号按顺序递增。全部图片放在同一个文件夹下,并有一个txt文档列出全部图片。
目标:将图片分成3大部分,按照7:2:1的比例,分别作为训练集、验证集以及测试集,并实现一个人一个文件夹,文件夹名称是此人图片名称中一样的部分。
话不多说,先上代码~ 好东西要大家分享!
第一步:按比例划分图片,生成相应的txt文件,并从图片名称中提取出每个人的名称(标签):
import os
import numpy as np
# Input listing: one image per line, tab-separated, first field is the image
# path (only the part after the last '/' is used as the image name).
file_path = r'E:\datasets\DataPrePro\12.04\box.txt'
# Output listings, one "<image name>\t<person label>" line per image.
save_train_path = r'E:\datasets\DataPrePro\12.04\label_train.txt'
save_valid_path = r'E:\datasets\DataPrePro\12.04\label_valid.txt'
save_test_path = r'E:\datasets\DataPrePro\12.04\label_test.txt'


def group_images_by_person(lines):
    """Group image names by person label.

    Each line is tab-separated; the first field is an image path whose last
    '/'-separated component is the image name. The person label is the part
    of the image name before the first '_'.

    Args:
        lines: iterable of text lines (e.g. an open file).

    Returns:
        (person_dict, label_dict): ``person_dict`` maps label -> list of image
        names in input order; ``label_dict`` maps label -> running index in
        order of first appearance.
    """
    person_dict = {}
    label_dict = {}
    for line in lines:
        fields = line.strip().split('\t')
        name = fields[0].split('/')[-1]
        if not name:
            # Blank line: skip it instead of inventing an empty-named person.
            continue
        label = name.split('_')[0]
        person_dict.setdefault(label, []).append(name)
        if label not in label_dict:
            label_dict[label] = len(label_dict)
    return person_dict, label_dict


def split_names(img_names, train_ratio=0.7, valid_ratio=0.2):
    """Shuffle one person's images and split them into train/valid/test.

    The train and valid sizes are ``int(n * ratio)``; the test part receives
    whatever remains, so no image is dropped.

    Returns:
        (train, valid, test) as three lists of image names.
    """
    num_img = len(img_names)
    num_train = int(num_img * train_ratio)
    num_valid = int(num_img * valid_ratio)
    shuffled = np.array(img_names)
    np.random.shuffle(shuffled)
    shuffled = shuffled.tolist()
    return (shuffled[:num_train],
            shuffled[num_train:num_train + num_valid],
            shuffled[num_train + num_valid:])


def write_labels(fp, img_names):
    """Write one "<image name>\\t<label>" line per image to the open file."""
    for img_name in img_names:
        label = img_name.split('_')[0]
        fp.write(img_name + '\t' + label + '\n')


def make_label_files(box_path=file_path, train_txt=save_train_path,
                     valid_txt=save_valid_path, test_txt=save_test_path):
    """Read the image listing and write the three per-split label files.

    Every person's images are shuffled and split 7:2:1 independently, so each
    person is represented in every split (given enough images).
    """
    with open(box_path, 'r') as fp:
        person_dict, label_dict = group_images_by_person(fp)
    print(label_dict)
    print(label_dict.values())

    with open(train_txt, 'w') as strp, open(valid_txt, 'w') as svp, \
            open(test_txt, 'w') as step:
        for img_names in person_dict.values():
            img_train, img_valid, img_test = split_names(img_names)
            write_labels(strp, img_train)
            write_labels(svp, img_valid)
            write_labels(step, img_test)


if __name__ == '__main__':
    make_label_files()
下面的部分就添加了将图片按照txt文档中的情况移动到相应的文件夹中:
import os
import numpy as np
import shutil
# Input listing: one image per line, tab-separated, first field is the image
# path (only the part after the last '/' is used as the image name).
file_path = r'E:\datasets\DataPrePro\12.04\box.txt'
# Folder holding all source images.
path = r'E:\datasets\DataPrePro\12.04\imgs'
# Output listings, one "<image name>\t<person label>" line per image.
save_train_path = r'E:\datasets\DataPrePro\12.04\label_train.txt'
save_valid_path = r'E:\datasets\DataPrePro\12.04\label_valid.txt'
save_test_path = r'E:\datasets\DataPrePro\12.04\label_test.txt'
# Output folders that receive the copied images of each split.
train_path = r'E:\datasets\DataPrePro\12.04\train'
valid_path = r'E:\datasets\DataPrePro\12.04\valid'
test_path = r'E:\datasets\DataPrePro\12.04\test'


def group_images_by_person(lines):
    """Group image names by person label.

    Each line is tab-separated; the first field is an image path whose last
    '/'-separated component is the image name. The person label is the part
    of the image name before the first '_'.

    Returns:
        (person_dict, label_dict): ``person_dict`` maps label -> list of image
        names in input order; ``label_dict`` maps label -> running index in
        order of first appearance.
    """
    person_dict = {}
    label_dict = {}
    for line in lines:
        fields = line.strip().split('\t')
        name = fields[0].split('/')[-1]
        if not name:
            # Blank line: skip it instead of inventing an empty-named person.
            continue
        label = name.split('_')[0]
        person_dict.setdefault(label, []).append(name)
        if label not in label_dict:
            label_dict[label] = len(label_dict)
    return person_dict, label_dict


def split_names(img_names, train_ratio=0.7, valid_ratio=0.2):
    """Shuffle one person's images and split them into train/valid/test.

    The train and valid sizes are ``int(n * ratio)``; the test part receives
    whatever remains, so no image is dropped.

    Returns:
        (train, valid, test) as three lists of image names.
    """
    num_img = len(img_names)
    num_train = int(num_img * train_ratio)
    num_valid = int(num_img * valid_ratio)
    shuffled = np.array(img_names)
    np.random.shuffle(shuffled)
    shuffled = shuffled.tolist()
    return (shuffled[:num_train],
            shuffled[num_train:num_train + num_valid],
            shuffled[num_train + num_valid:])


def index_files(directory):
    """Index the regular files of ``directory`` for constant-time lookup.

    Every file is registered under its full name and under its name without
    extension, so a listing entry matches whether or not it carries the
    extension. Matching is exact: a plain substring test would also pick up
    unrelated files (e.g. 'B_0.jpg' is contained in 'AB_0.jpg') and leak
    images between splits.

    Returns:
        dict mapping lookup key -> list of file names in ``directory``.
    """
    index = {}
    for entry in sorted(os.listdir(directory)):
        if not os.path.isfile(os.path.join(directory, entry)):
            continue
        for key in {entry, os.path.splitext(entry)[0]}:
            index.setdefault(key, []).append(entry)
    return index


def export_split(img_names, label_fp, src_dir, dst_dir, file_index):
    """Write the label lines of one split and copy its images to ``dst_dir``.

    ``dst_dir`` is created (with a console message) when it does not exist.
    """
    if not os.path.exists(dst_dir):
        print("Create new folder:" + dst_dir)
        os.makedirs(dst_dir)
    for img_name in img_names:
        label = img_name.split('_')[0]
        label_fp.write(img_name + '\t' + label + '\n')
        for entry in file_index.get(img_name, ()):
            shutil.copy(os.path.join(src_dir, entry),
                        os.path.join(dst_dir, entry))


def split_and_copy(box_path=file_path, image_dir=path,
                   train_txt=save_train_path, valid_txt=save_valid_path,
                   test_txt=save_test_path, train_dir=train_path,
                   valid_dir=valid_path, test_dir=test_path):
    """Split every person's images 7:2:1 and materialise the three splits.

    For each split a label file is written and the matching images are copied
    from ``image_dir`` into the split's folder. The source folder is listed
    once up front; it is only read from, so the listing stays valid.
    """
    with open(box_path, 'r') as fp:
        person_dict, label_dict = group_images_by_person(fp)
    print(label_dict)
    print(label_dict.values())

    file_index = index_files(image_dir)
    with open(train_txt, 'w') as strp, open(valid_txt, 'w') as svp, \
            open(test_txt, 'w') as step:
        for img_names in person_dict.values():
            img_train, img_valid, img_test = split_names(img_names)
            export_split(img_train, strp, image_dir, train_dir, file_index)
            export_split(img_valid, svp, image_dir, valid_dir, file_index)
            export_split(img_test, step, image_dir, test_dir, file_index)


if __name__ == '__main__':
    split_and_copy()
为了实现将每个人的图片放到同一个文件夹下,代码如下:
import os
import shutil
# Label listings, one "<image name>\t<person label>" line per image.
train_path = r'E:\datasets\DataPrePro\12.04\label_train.txt'
valid_path = r'E:\datasets\DataPrePro\12.04\label_valid.txt'
test_path = r'E:\datasets\DataPrePro\12.04\label_test.txt'
# Image folders of the three splits, matching the listings above in order.
path_01 = r'E:\datasets\DataPrePro\12.04\train'
path_02 = r'E:\datasets\DataPrePro\12.04\valid'
path_03 = r'E:\datasets\DataPrePro\12.04\test'


def read_labels(label_file):
    """Return the distinct person labels (second column) of a label file.

    Labels are returned in order of first appearance. Lines without a second
    tab-separated field (e.g. blank lines) are ignored.
    """
    labels = []
    seen = set()
    with open(label_file, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                continue
            name = fields[1]
            if name not in seen:
                seen.add(name)
                labels.append(name)
    return labels


def move_into_person_folders(label_file, image_dir):
    """Move the images of ``image_dir`` into one sub-folder per person.

    A sub-folder is created for every label listed in ``label_file``. A file
    belongs to a person when the part of its name before the first '_' equals
    the label. Equality is used rather than a substring test, because with a
    substring test the label 'S1' would also capture the files of 'S10'.
    Files whose prefix is not a listed label are left where they are.
    """
    labels = read_labels(label_file)
    for name in labels:
        person_dir = os.path.join(image_dir, name)
        if not os.path.exists(person_dir):
            os.mkdir(person_dir)

    known = set(labels)
    # One directory listing per split is enough: each file is moved at most once.
    for entry in os.listdir(image_dir):
        src = os.path.join(image_dir, entry)
        if not os.path.isfile(src):
            continue
        label = entry.split('_')[0]
        if label in known:
            shutil.move(src, os.path.join(image_dir, label, entry))


def sort_all_splits():
    """Sort the train, valid and test folders into per-person sub-folders."""
    for label_file, image_dir in ((train_path, path_01),
                                  (valid_path, path_02),
                                  (test_path, path_03)):
        move_into_person_folders(label_file, image_dir)


if __name__ == '__main__':
    sort_all_splits()
这是本人第一次制作数据集,如有不妥之处,希望大家多多指教,一起进步~