1. 假设所有数据都已用 labelme 标注完成,将标注得到的 json 文件转换成 VOC 分割数据集格式
from __future__ import print_function
import argparse
import glob
import os
import os.path as osp
import sys
import imgviz
import numpy as np
import labelme
def main():
    """Convert a directory of labelme JSON annotations into a VOC-style dataset.

    Command-line options:
        --input_dir   directory searched (non-recursively) for ``*.json`` files
        --output_dir  dataset directory to create; it must not exist yet
        --labels      text file with one class name per line; the first line
                      must be ``__ignore__`` (id -1) and the second
                      ``_background_`` (id 0)
        --noviz       skip writing the visualization overlays

    Layout written under ``output_dir``:
        JPEGImages/                      image bytes embedded in each JSON,
                                         written unchanged under a .jpg name
        SegmentationClass/               label array saved with ``np.save``
        SegmentationClassPNG/            label saved with ``labelme.utils.lblsave``
        SegmentationClassVisualization/  ``imgviz.label2rgb`` overlay
                                         (omitted with ``--noviz``)
        class_names.txt                  class names, without ``__ignore__``

    Raises:
        SystemExit: with status 1 when ``output_dir`` already exists.
        ValueError: when the labels file does not start with ``__ignore__``
            followed by ``_background_``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--input_dir", default="label", help="input annotated directory"
    )
    parser.add_argument(
        "--output_dir", default="data_dataset_voc", help="output dataset directory"
    )
    parser.add_argument("--labels", default="label.txt", help="labels file")
    parser.add_argument(
        "--noviz", help="no visualization", action="store_true"
    )
    args = parser.parse_args()

    # Refuse to write into an existing directory so earlier output is never
    # mixed with (or overwritten by) a new run.
    if osp.exists(args.output_dir):
        print("Output directory already exists:", args.output_dir)
        sys.exit(1)
    os.makedirs(args.output_dir)
    os.makedirs(osp.join(args.output_dir, "JPEGImages"))
    os.makedirs(osp.join(args.output_dir, "SegmentationClass"))
    os.makedirs(osp.join(args.output_dir, "SegmentationClassPNG"))
    if not args.noviz:
        os.makedirs(
            osp.join(args.output_dir, "SegmentationClassVisualization")
        )
    print("Creating dataset:", args.output_dir)

    class_names = []
    class_name_to_id = {}
    # Read the labels inside a context manager so the file handle is closed.
    with open(args.labels) as labels_file:
        label_lines = labels_file.readlines()
    for i, line in enumerate(label_lines):
        class_id = i - 1  # starts with -1
        class_name = line.strip()
        class_name_to_id[class_name] = class_id
        # Validate with explicit exceptions: `assert` statements are removed
        # under `python -O`, which would let a malformed label file through.
        if class_id == -1:
            if class_name != "__ignore__":
                raise ValueError(
                    "first line of the labels file must be '__ignore__', "
                    "got %r" % class_name
                )
            # __ignore__ keeps id -1 in the mapping but is not a listed class.
            continue
        if class_id == 0 and class_name != "_background_":
            raise ValueError(
                "second line of the labels file must be '_background_', "
                "got %r" % class_name
            )
        class_names.append(class_name)
    class_names = tuple(class_names)
    print("class_names:", class_names)

    out_class_names_file = osp.join(args.output_dir, "class_names.txt")
    with open(out_class_names_file, "w") as f:
        f.write("\n".join(class_names))
    print("Saved class_names:", out_class_names_file)

    for filename in glob.glob(osp.join(args.input_dir, "*.json")):
        print("Generating dataset from:", filename)
        label_file = labelme.LabelFile(filename=filename)

        # Every output for this annotation shares the JSON file's base name.
        base = osp.splitext(osp.basename(filename))[0]
        out_img_file = osp.join(args.output_dir, "JPEGImages", base + ".jpg")
        out_lbl_file = osp.join(
            args.output_dir, "SegmentationClass", base + ".npy"
        )
        out_png_file = osp.join(
            args.output_dir, "SegmentationClassPNG", base + ".png"
        )
        if not args.noviz:
            out_viz_file = osp.join(
                args.output_dir,
                "SegmentationClassVisualization",
                base + ".jpg",
            )

        # The image bytes stored in the JSON are written out unchanged.
        with open(out_img_file, "wb") as f:
            f.write(label_file.imageData)
        img = labelme.utils.img_data_to_arr(label_file.imageData)

        lbl, _ = labelme.utils.shapes_to_label(
            img_shape=img.shape,
            shapes=label_file.shapes,
            label_name_to_value=class_name_to_id,
        )
        labelme.utils.lblsave(out_png_file, lbl)
        np.save(out_lbl_file, lbl)

        if not args.noviz:
            # imgviz.label2rgb receives the underlying picture through the
            # `image=` keyword; per the original author's note, passing it as
            # `img=` raises an error.
            viz = imgviz.label2rgb(
                label=lbl,
                image=imgviz.rgb2gray(img),
                font_size=15,
                label_names=class_names,
                loc="rb",
            )
            imgviz.io.imsave(out_viz_file, viz)


if __name__ == "__main__":
    main()
label.txt 里按如下格式放类别名(每行一个,前两行必须是 __ignore__ 和 _background_):
__ignore__
_background_
class1
class2
...
转换成功后有以下几个文件
JPEGImages 里放的是原图;SegmentationClass 文件夹下是 npy 格式的分割数据;SegmentationClassPNG 里是分割训练需要的 mask;SegmentationClassVisualization 里是图像与标签叠加后的可视化图像,主要用来检查标注有没有出现错误。
2.划分训练验证测试集
from sklearn.model_selection import train_test_split
import os
# Build train/val/test name lists (8:1:1) from the images in `imagedir` and
# write them as train.txt / val.txt / test.txt inside `outdir`.
imagedir = './XXXX/JPEGImages'
outdir = 'voc'
os.makedirs(outdir, exist_ok=True)

# Collect the image names without their extension.  os.path.splitext removes
# only the final suffix, so a name such as "scan.v2.jpg" stays "scan.v2"
# instead of being cut at the first dot.  The names are sorted because
# os.listdir returns entries in arbitrary order, which would otherwise make
# the split differ between machines even with a fixed random_state.
images = sorted(os.path.splitext(file)[0] for file in os.listdir(imagedir))

# Split the data into training, validation, and test sets (8:1:1 ratio)
train_size = 0.8
val_size = 0.1
test_size = 0.1
# First hold out val+test together, then divide the held-out part between
# validation and test.
train, temp = train_test_split(images, test_size=(val_size + test_size), random_state=0)
val, test = train_test_split(temp, test_size=(test_size / (val_size + test_size)), random_state=0)

# Write the lists to text files, one name per line (no trailing newline).
for split_name, names in (("train", train), ("val", val), ("test", test)):
    with open(os.path.join(outdir, split_name + ".txt"), 'w') as f:
        f.write('\n'.join(names))
3. Cityscapes 格式数据集划分脚本
import os
import shutil
import random
def _label_name_for(image_filename):
    """Return the label file name paired with *image_filename*.

    Only a trailing ``.jpg`` extension is swapped for ``.png``; any other
    name is returned unchanged.
    """
    if image_filename.endswith('.jpg'):
        return image_filename[:-len('.jpg')] + '.png'
    return image_filename


def _copy_pairs(filenames, image_dir, label_dir, dest_dir):
    """Copy each image and its label into dest_dir/images and dest_dir/labels."""
    for image_filename in filenames:
        label_filename = _label_name_for(image_filename)
        shutil.copy(
            os.path.join(image_dir, image_filename),
            os.path.join(dest_dir, 'images', image_filename),
        )
        shutil.copy(
            os.path.join(label_dir, label_filename),
            os.path.join(dest_dir, 'labels', label_filename),
        )


def split_data(source_dir, dest_train_dir, dest_val_dir, split_ratio, seed=None):
    """Randomly split an images/labels dataset into train and validation copies.

    Files are read from ``source_dir/images`` and ``source_dir/labels`` and
    copied (not moved) into ``images`` and ``labels`` sub-directories of the
    two destination directories, which are created when missing.

    Args:
        source_dir: directory containing the ``images`` and ``labels`` folders.
        dest_train_dir: destination directory for the training part.
        dest_val_dir: destination directory for the validation part.
        split_ratio: fraction of the images that goes to the training part;
            the count is ``int(number_of_images * split_ratio)``.
        seed: optional seed for a reproducible shuffle.  When ``None``
            (the default) the module-level ``random`` state is used.

    Raises:
        FileNotFoundError: propagated from ``os.listdir`` / ``shutil.copy``
            when a source directory, an image, or its label file is missing.
    """
    image_dir = os.path.join(source_dir, 'images')
    label_dir = os.path.join(source_dir, 'labels')

    for dest_dir in (dest_train_dir, dest_val_dir):
        os.makedirs(os.path.join(dest_dir, 'images'), exist_ok=True)
        os.makedirs(os.path.join(dest_dir, 'labels'), exist_ok=True)

    # os.listdir order is arbitrary; sorting first makes a seeded shuffle
    # give the same split on every machine.
    image_files = sorted(os.listdir(image_dir))
    if seed is None:
        random.shuffle(image_files)
    else:
        random.Random(seed).shuffle(image_files)

    num_train = int(len(image_files) * split_ratio)
    _copy_pairs(image_files[:num_train], image_dir, label_dir, dest_train_dir)
    _copy_pairs(image_files[num_train:], image_dir, label_dir, dest_val_dir)
# Configure the dataset paths and the split ratio, then run the split.
source_dir = 'D:/XXXDataSet/dataset' # dataset root path
dest_train_dir = 'D:/XXXDataSet/dataset/train' # folder that receives the training set
dest_val_dir = 'D:/XXXDataSet/dataset/test' # folder that receives the validation set (note: the folder is named "test")
split_ratio = 0.8 # 80% training set, 20% validation set
split_data(source_dir, dest_train_dir, dest_val_dir, split_ratio)