1、数据集介绍
1.1、数据集图片组成
- 【有害垃圾】:电池(1 号、2 号、5 号)、过期药品或内包装等;
- 【可回收垃圾】:易拉罐、小号矿泉水瓶;
- 【厨余垃圾】:小土豆、切过的白萝卜、胡萝卜,尺寸为电池大小;
- 【其他垃圾】:瓷片、鹅卵石(小土豆大小)、砖块等。
部分类别图片展示:
|
|
|
文件结构:
----ImageSet\
|----classes.txt # 标签种类
|----data-txt\ # 数据集文件集合
| |----test.txt
| |----train.txt
| |----val.txt
|----images\ # 数据集图片
| |----test\
| | |----fimg_23.jpg
| | |----fimg_38.jpg
| | |----.....
| |----train\
| | |----fimg_1.jpg
| | |----fimg_2.jpg
| | |----.....
| |----val\
| | |----fimg_4.jpg
| | |----fimg_6.jpg
| | |----.....
|----labels\ # yolo标签
| |----test\
| | |----fimg_23.txt
| | |----fimg_38.txt
| | |----.....
| |----train\
| | |----fimg_1.txt
| | |----fimg_2.txt
| | |----.....
| |----val\
| | |----fimg_4.txt
| | |----fimg_6.txt
| | |----.....
1.2、获取数据集方式
2、扩展代码
2.1、文件结构树
draw-tree.py
from pathlib import Path
import os
tree_str = ''  # accumulated tree text; generate_tree() appends to it


def _tree_lines(pathname, n):
    """Return the rendered lines for *pathname* and everything below it."""
    prefix = ' |' * n + '-' * 4
    if pathname.is_file():
        return [prefix + pathname.name + '\n']
    if not pathname.is_dir():
        # Neither a regular file nor a directory: render nothing.
        return []
    # Directories are shown with a trailing backslash.
    lines = [prefix + str(pathname.relative_to(pathname.parent)) + '\\' + '\n']
    # Sort the children so the output does not depend on filesystem order.
    for child in sorted(pathname.iterdir()):
        lines.extend(_tree_lines(child, n + 1))
    return lines


def generate_tree(pathname, n=0):
    """Render the file tree rooted at *pathname* as text.

    The rendered text is appended to the module-level ``tree_str`` (as before)
    and is also returned, so callers do not have to read the global.

    Args:
        pathname: ``pathlib.Path`` of the file or directory to render.
        n: Nesting depth of *pathname*; controls the `` |`` prefix repetition.

    Returns:
        The text rendered by this call (one line per file/directory).
    """
    global tree_str
    rendered = ''.join(_tree_lines(pathname, n))
    tree_str += rendered
    return rendered
if __name__ == '__main__':
    # Render the tree of the ImageSet folder found under the current working directory.
    dataset_root = Path(os.getcwd() + '/ImageSet')
    generate_tree(dataset_root, 0)
    print(tree_str)
2.2、划分数据集
split-data.py
import os, shutil, random
from tqdm import tqdm
"""
标注文件是yolo格式(txt文件)
训练集:验证集:测试集 (7:2:1)
"""
def split_img(current_path, img_path, label_path, split_list, seed=None):
    """Randomly split a YOLO dataset into train/val/test folders.

    Every entry of ``img_path`` is copied, together with its label file from
    ``label_path``, into
    ``<current_path>/ImageSets/{images,labels}/{train,val,test}``.

    Args:
        current_path: Directory under which ``ImageSets`` is created.
        img_path: Directory holding the source images.
        label_path: Directory holding the YOLO ``.txt`` label files.
        split_list: Three ratios ``[train, val, test]``, e.g. ``[0.7, 0.2, 0.1]``.
            ``train`` is the fraction of all images; the rest is divided
            between val and test in the proportion ``val : test``.
        seed: Optional seed for a reproducible split. When ``None`` the
            module-level ``random`` generator is used.

    Raises:
        ValueError: If a ratio is negative or ``train`` exceeds 1.
    """
    train, val, test = split_list
    if min(train, val, test) < 0 or train > 1:
        raise ValueError('split ratios must be non-negative and train must be <= 1')

    data_root = current_path + '/ImageSets'
    # (subset folder name, progress-bar label), in the order the subsets are filled.
    subsets = (('train', 'train '), ('val', 'val '), ('test', 'test '))
    for subset, _ in subsets:
        # exist_ok: an already-present folder must not stop the others being created.
        os.makedirs(data_root + '/images/' + subset, exist_ok=True)
        os.makedirs(data_root + '/labels/' + subset, exist_ok=True)

    # Sort before shuffling so a given seed always yields the same split.
    images = [os.path.join(img_path, name) for name in sorted(os.listdir(img_path))]
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(images)

    n_train = int(train * len(images))
    held_out = val + test
    # Guard the division: with val == test == 0 nothing goes to val.
    n_val = int(val / held_out * (len(images) - n_train)) if held_out else 0
    bounds = (0, n_train, n_train + n_val, len(images))

    for (subset, bar_label), start, stop in zip(subsets, bounds, bounds[1:]):
        img_dir = data_root + '/images/' + subset
        label_dir = data_root + '/labels/' + subset
        for img in tqdm(images[start:stop], desc=bar_label, ncols=80, unit='img'):
            _copy(img, img_dir)
            _copy(toLabelPath(img, label_path), label_dir)
def _copy(from_path, to_path):
    """Copy the file at *from_path* to *to_path* (a file path or a directory)."""
    shutil.copy(src=from_path, dst=to_path)
def toLabelPath(img_path, label_path):
    """Return the path of the label file that belongs to an image.

    Args:
        img_path: Path of the image; both ``/`` and ``\\`` separators are accepted.
        label_path: Directory that holds the ``.txt`` label files.

    Returns:
        ``label_path`` joined with the image's base name, its extension
        replaced by ``.txt``.
    """
    # Normalise separators so the base name is found on every platform.
    file_name = img_path.replace('\\', '/').rsplit('/', 1)[-1]
    stem = os.path.splitext(file_name)[0]
    return os.path.join(label_path, stem + '.txt')
if __name__ == '__main__':
    # os.current_path() does not exist; the working directory comes from os.getcwd().
    current_path = os.getcwd()
    img_path = current_path + '/images'  # source images, under the working directory
    label_path = current_path + '/labels'  # YOLO .txt labels, under the working directory
    split_list = [0.7, 0.2, 0.1]  # split ratios [train, val, test]
    split_img(current_path, img_path, label_path, split_list)
2.3、获取数据集文件名字
list_name.py
import os
"""
整体目录
----ImageSets\
|----images\
| |----test\
| |----train\
| |----val\
|----labels\
| |----test\
| |----train\
| |----val\
"""
def generate(current_path):
    """Write one index file per sub-folder of *current_path*.

    For every directory ``<item>`` inside *current_path* (e.g. ``train``,
    ``val``, ``test``) the sorted file names are appended to
    ``ImageSets/data-txt/<item>.txt`` (relative to the working directory),
    one ``data/images/<item>/<file>`` entry per line.

    Args:
        current_path: Directory whose sub-folders hold the images.
    """
    # The index folder may not exist yet; open() would fail without it.
    os.makedirs('ImageSets/data-txt', exist_ok=True)
    for item in os.listdir(current_path):
        files_path = current_path + '/' + item
        if not os.path.isdir(files_path):
            # Stray files next to the subset folders cannot be listed.
            continue
        files = sorted(os.listdir(files_path))  # all image names of this subset
        print('****************')
        print('input :', item)
        print('start...')
        item_txt = 'ImageSets/data-txt/' + item + '.txt'
        # NOTE(review): append mode means re-running adds duplicate entries;
        # confirm whether 'w' was intended.
        with open(item_txt, 'a') as listText:
            # Entry format, e.g. data/images/train/1.jpg
            listText.writelines(
                'data/images/' + item + '/' + file + '\n' for file in files
            )
        print('down!')
        print('****************')
if __name__ == '__main__':
    # Index every subset folder below <cwd>/ImageSets/images.
    images_root = os.getcwd() + '/ImageSets/images'
    generate(images_root)
2.4、文件成功对应检测
del-error-files.py
用于获取数据集文件后,检测images文件和labels文件是否都一一对应上。
创建.py文件,放在与images文件夹和labels文件夹相同的目录下运行
import os
# Working directory; the deletion helpers build their paths from it.
currentPath = os.getcwd()
print(currentPath)
# Names of label / image files queued for deletion.
labelArray, imagesArray = [], []
def split_extension(fileName):
    """Return *fileName* without its final extension.

    Only the last ``.ext`` is stripped, so names that contain dots
    (``fimg.v2.jpg`` -> ``fimg.v2``) are not truncated at the first dot.
    """
    return os.path.splitext(fileName)[0]
# List each folder once instead of re-reading it for every file of the other.
label_files = os.listdir('labels')
image_files = os.listdir('images')
label_stems = {split_extension(name) for name in label_files}
image_stems = {split_extension(name) for name in image_files}

print('自检程序启动!查找到labels目录下异常文件:')
# Label files with no image of the same base name. The real file name is
# recorded so that exactly the orphaned file is reported and later deleted.
error_label_names = [
    name for name in label_files if split_extension(name) not in image_stems
]
for error_la_name in error_label_names:
    labelArray.append(error_la_name)
    print(error_la_name)

print('自检程序启动!查找到images目录下异常文件:')
# Image files with no label of the same base name (any extension, not only .jpg).
error_image_names = [
    name for name in image_files if split_extension(name) not in label_stems
]
for error_im_name in error_image_names:
    print(error_im_name)
    imagesArray.append(error_im_name)
# 自动删除文件
def delLabels():
    """Delete every label file queued in ``labelArray`` and report each removal."""
    labels_dir = currentPath + '/labels/'
    for file_name in labelArray:
        target = labels_dir + file_name
        os.remove(target)
        print(target + '\t' + '删除成功')
def delImages():
    """Delete every image file queued in ``imagesArray`` and report each removal."""
    images_dir = currentPath + '/images/'
    for file_name in imagesArray:
        target = images_dir + file_name
        os.remove(target)
        print(target + '\t' + '删除成功')
if __name__ == '__main__':
    # Remove orphaned labels first, then orphaned images.
    for cleanup in (delLabels, delImages):
        cleanup()
3、其他文章
参考文献