一、原始数据集【未划分,包含全部图片、标签】
在yolo系列的根目录下(ultralytics_main)创建datasets文件夹,在datasets文件夹中放入coco128数据集,并在coco128文件夹下创建images文件夹,用于存放数据集的全部图片和标签文件。
二、数据集分割
在datasets文件夹下创建DatasetProduction.py文件,用来将images中的图片和标签进行分割,DatasetProduction.py代码如下:(!该代码一定放在datasets文件夹下,不要放在images里)
import os
import random
import shutil
def _copy_with_labels(files, src_dir, dst_dir):
    """Copy each image in *files* and its label file from *src_dir* to *dst_dir*.

    The label of ``name.ext`` is ``name.txt`` in the same folder. Images that
    have no label file are still copied.

    Returns:
        The number of images for which no label file was found.
    """
    missing_labels = 0
    for name in files:
        shutil.copy(os.path.join(src_dir, name), os.path.join(dst_dir, name))
        # Swap only the real extension; a chained str.replace would also
        # rewrite '.jpg'/'.png' occurring in the middle of the file name.
        label_name = os.path.splitext(name)[0] + '.txt'
        src_label = os.path.join(src_dir, label_name)
        if os.path.isfile(src_label):
            shutil.copy(src_label, os.path.join(dst_dir, label_name))
        else:
            missing_labels += 1
    return missing_labels


def split_dataset(data_dir, train_val_test_dir, train_ratio, val_ratio, test_ratio, seed=None):
    """Randomly split a flat image/label folder into train, val and test folders.

    Every image found directly in ``data_dir`` is copied, together with its
    same-stem ``.txt`` label file (when one exists), into
    ``<train_val_test_dir>/train``, ``/val`` or ``/test``. The source folder
    is left untouched.

    Args:
        data_dir: Folder holding the images and their label files side by side.
        train_val_test_dir: Folder under which ``train``, ``val`` and ``test``
            are created.
        train_ratio: Fraction of the images assigned to the training split.
        val_ratio: Fraction of the images assigned to the validation split.
        test_ratio: Kept for interface compatibility; it is not used in the
            computation. The test split always receives every image left over
            after the train and val splits have been taken.
        seed: Optional seed for a reproducible shuffle. ``None`` shuffles with
            the global ``random`` state.

    Raises:
        ValueError: If a ratio is negative or ``train_ratio + val_ratio``
            exceeds 1.
    """
    # Small tolerance so float sums such as 0.7 + 0.3 are never rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1, "
            f"got train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    image_exts = ('.jpg', '.jpeg', '.png')

    # Create the destination folders.
    split_dirs = {
        split: os.path.join(train_val_test_dir, split)
        for split in ('train', 'val', 'test')
    }
    for split_dir in split_dirs.values():
        os.makedirs(split_dir, exist_ok=True)

    # Keep image files only (case-insensitive extension match). Sorting first
    # makes a seeded shuffle independent of the order os.listdir returns.
    image_files = sorted(
        f for f in os.listdir(data_dir)
        if f.lower().endswith(image_exts) and os.path.isfile(os.path.join(data_dir, f))
    )
    if seed is None:
        random.shuffle(image_files)
    else:
        random.Random(seed).shuffle(image_files)

    # Cut points; truncation means rounding leftovers land in the test split.
    num_files = len(image_files)
    num_train = int(num_files * train_ratio)
    num_val = int(num_files * val_ratio)

    train_files = image_files[:num_train]
    val_files = image_files[num_train:num_train + num_val]
    test_files = image_files[num_train + num_val:]

    missing_labels = 0
    for split, files in (('train', train_files), ('val', val_files), ('test', test_files)):
        missing_labels += _copy_with_labels(files, data_dir, split_dirs[split])

    print("数据集分离完成!")
    print(f"训练集数量:{len(train_files)}")
    print(f"验证集数量:{len(val_files)}")
    print(f"测试集数量:{len(test_files)}")
    if missing_labels:
        print(f"警告:{missing_labels} 张图片没有对应的标签文件,仅复制了图片")
def move_files(data_dir):
    """Sort the files of one split folder into ``images`` and ``labels`` subfolders.

    Image files found directly in ``data_dir`` are moved to
    ``<data_dir>/images`` and ``.txt`` files to ``<data_dir>/labels``. Both
    subfolders are created when missing; any other entry is left in place.

    Args:
        data_dir: The split folder (for example ``.../train``) to reorganise.
    """
    # Cover every image type the split step may have copied, not just '.jpg',
    # so that no image is left behind in the split root.
    image_exts = ('.jpg', '.jpeg', '.png')

    # Create the destination folders.
    images_dir = os.path.join(data_dir, 'images')
    labels_dir = os.path.join(data_dir, 'labels')
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(labels_dir, exist_ok=True)

    moved_images = 0
    moved_labels = 0
    for name in os.listdir(data_dir):
        src_path = os.path.join(data_dir, name)
        # Skip sub-directories, including the images/ and labels/ folders.
        if not os.path.isfile(src_path):
            continue
        lowered = name.lower()
        if lowered.endswith(image_exts):
            shutil.move(src_path, os.path.join(images_dir, name))
            moved_images += 1
        elif lowered.endswith('.txt'):
            shutil.move(src_path, os.path.join(labels_dir, name))
            moved_labels += 1

    print(f"{data_dir}文件移动完成!")
    print(f"总共移动了 {moved_images} 个图片文件到images文件夹")
    print(f"总共移动了 {moved_labels} 个TXT文件到labels文件夹")
# Dataset location and split ratios -- adjust these to match your own data.
data_dir = './coco128/images'  # folder holding the images and their label files
train_val_test_dir = './coco128'  # destination folder passed to split_dataset
train_ratio, val_ratio, test_ratio = 0.7, 0.2, 0.1  # train / val / test fractions

# Split the dataset.
split_dataset(data_dir, train_val_test_dir, train_ratio, val_ratio, test_ratio)

# Reorganise the files of each split folder.
for split_name in ('train', 'val', 'test'):
    move_files(os.path.join(train_val_test_dir, split_name))
!在代码中需要将数据路径及分割比例进行相应的修改。【#设置数据集路径和切分比例】
三、数据集分割完成
(1)检查images文件夹中的图片和标签是否一一对应;
(2)运行DatasetProduction.py,生成相应的train、test、val文件夹,并按设置的比例分割好。
四、制作数据集的yaml文件
ultralytics_main\ultralytics\cfg\datasets中存放数据集的yaml文件
(1)train、test、val是图片的位置(相对于path的相对路径)
(2)nc是数据集中分类的总数
(3)names 中要写入数据集的各个标签对应的名字
path: ../datasets/coco128 # dataset root dir
train: train/images # train images (relative to 'path')
val: val/images # val images (relative to 'path')
test: test/images # test images (relative to 'path') (optional)

# Number of classes
nc: 80

# Classes (class index -> class name); entries must be nested under 'names'
names:
  0: person
  1: bicycle
  2: car
  3: motorcycle
  4: airplane
  5: bus
  6: train
  7: truck
  8: boat
  9: traffic light
  10: fire hydrant
  11: stop sign
  12: parking meter
  13: bench
  14: bird
  15: cat
  16: dog
  17: horse
  18: sheep
  19: cow
  20: elephant
  21: bear
  22: zebra
  23: giraffe
  24: backpack
  25: umbrella
  26: handbag
  27: tie
  28: suitcase
  29: frisbee
  30: skis
  31: snowboard
  32: sports ball
  33: kite
  34: baseball bat
  35: baseball glove
  36: skateboard
  37: surfboard
  38: tennis racket
  39: bottle
  40: wine glass
  41: cup
  42: fork
  43: knife
  44: spoon
  45: bowl
  46: banana
  47: apple
  48: sandwich
  49: orange
  50: broccoli
  51: carrot
  52: hot dog
  53: pizza
  54: donut
  55: cake
  56: chair
  57: couch
  58: potted plant
  59: bed
  60: dining table
  61: toilet
  62: tv
  63: laptop
  64: mouse
  65: remote
  66: keyboard
  67: cell phone
  68: microwave
  69: oven
  70: toaster
  71: sink
  72: refrigerator
  73: book
  74: clock
  75: vase
  76: scissors
  77: teddy bear
  78: hair drier
  79: toothbrush
五、训练数据集
以yolov8为例,在train.py中,使用自己的数据集进行训练。
# Example of the command-line form of a training run:
# yolo task=detect mode=train model=yolov8n.pt data=data/fall.yaml batch=32 epochs=100 imgsz=640 workers=16 device=0
from ultralytics import YOLO


def main():
    """Train a YOLOv8n model on 'coco128.yaml', validate it and export it to ONNX."""
    # Create a new YOLO model from its architecture definition
    model = YOLO('yolov8n.yaml')
    # Load a pretrained YOLO model (recommended for training)
    # model = YOLO('E:/ultralytics-8.1.0/yolov8n.pt')

    # Train the model using the 'coco128.yaml' dataset for 3 epochs
    model.train(data='coco128.yaml', batch=32, epochs=3, imgsz=640)

    # Evaluate the model's performance on the validation set
    model.val()

    # Perform object detection on an image using the model
    # results = model('https://ultralytics.com/images/bus.jpg')

    # Export the model to ONNX format
    model.export(format='onnx')


# Keep the entry point guarded: worker processes started with the "spawn"
# method (the default on Windows) re-import this script, and unguarded
# top-level code would start training again in every worker.
if __name__ == '__main__':
    main()
运行python train.py开始训练,训练后的结果默认会保存在runs/detect/train文件夹中。