如果数据集是 VOC 格式的,下面的代码可以将其标注转换为 YOLOv5 所需的格式:
import os
import xml.etree.ElementTree as ET
# Class names for this dataset; a label's position in this list becomes its
# numeric class id in the generated YOLOv5 annotation files.
classes = [
    "bj_bpmh", "bj_bpps", "bj_wkps", "jyz_pl", "sly_dmyw", "hxq_gjtps",
    "xmbhyc", "yw_gkxfw", "yw_nc", "gbps", "wcaqm", "wcgz",
    "xy", "bjdsyc", "ywzt_yfyc", "hxq_gjbs", "kgg_ybh", "badge",
    "person", "glove", "wrongglove", "operatingbar", "powerchecker",
    "clothes", "wrongclothes", "offground", "ground", "safebelt",
]
# Convert a corner-format box (x1, y1, x2, y2) into the normalized
# center-format (x, y, w, h) that YOLOv5 expects.
def xyxy2xywh(size, box):
    """Return (x_center, y_center, width, height), each normalized to [0, 1].

    size: (image_width, image_height) in pixels.
    box:  (x1, y1, x2, y2) corner coordinates in pixels.
    """
    inv_w = 1. / size[0]
    inv_h = 1. / size[1]
    x_center = (box[0] + box[2]) / 2 * inv_w
    y_center = (box[1] + box[3]) / 2 * inv_h
    width = (box[2] - box[0]) * inv_w
    height = (box[3] - box[1]) * inv_h
    # All four values are already normalized by the image dimensions.
    return (x_center, y_center, width, height)
def voc2yolo(path):
    """Convert every Pascal-VOC xml annotation under *path* to a YOLOv5 txt file.

    path: the VOC ``Annotations`` directory.  Output txt files are written to
    the sibling ``labels`` directory (derived by replacing 'Annotations' with
    'labels' in *path*); that directory must already exist, otherwise the
    open() below raises FileNotFoundError.
    """
    # Sanity check: print how many entries we are about to process.
    print(len(os.listdir(path)))
    out_dir = path.replace('Annotations', 'labels')
    for file in os.listdir(path):
        # splitext (instead of str.replace('xml', 'txt')) only touches the
        # extension, so a filename that happens to contain "xml" stays intact.
        stem, ext = os.path.splitext(file)
        # Skip anything that is not an xml annotation (hidden files, etc.).
        if ext.lower() != '.xml':
            continue
        label_file = os.path.join(path, file)
        # Parse the VOC annotation.
        tree = ET.parse(label_file)
        root = tree.getroot()
        size = root.find('size')  # image shape stored in the annotation
        w = int(size.find('width').text)
        h = int(size.find('height').text)
        # 'with' guarantees the txt file is closed even if parsing fails below.
        with open(os.path.join(out_dir, stem + '.txt'), 'w') as out_file:
            for obj in root.iter('object'):
                # <difficult> is optional in VOC; treat a missing tag as 0.
                difficult_tag = obj.find('difficult')
                difficult = int(difficult_tag.text) if difficult_tag is not None else 0
                cls = obj.find('name').text
                if cls not in classes or difficult == 1:
                    continue
                # Map the class name to its numeric id.
                cls_id = classes.index(cls)
                bndbox = obj.find('bndbox')
                # VOC stores corner coordinates x1, y1, x2, y2.
                box = [float(bndbox.find('xmin').text), float(bndbox.find('ymin').text),
                       float(bndbox.find('xmax').text), float(bndbox.find('ymax').text)]
                # Convert to normalized x_center, y_center, w, h for YOLOv5.
                bbox = xyxy2xywh((w, h), box)
                # One object per line: "<id> <x> <y> <w> <h>"
                out_file.write(str(cls_id) + " " + " ".join(str(x) for x in bbox) + '\n')
if __name__ == '__main__':
    # Point this at the Annotations directory of your own dataset.
    annotations_dir = '/home/yuzun/Annotations/'
    voc2yolo(annotations_dir)
    # The converted txt files end up in the sibling labels/ directory.
YOLOv5 还需要指定训练集和验证集的图片路径。可以先把所有图片路径读入一个列表,再用 sklearn 的 train_test_split 将其按指定比例随机划分为训练集和验证集:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
# Build the paired (image, label) path lists and split them into
# train/validation file lists that YOLOv5 can consume.
images_dir = "/home/yuzun/dian_data/images"  # directory with all images
labels_dir = "/home/yuzun/dian_data/labels"  # directory with matching txt labels
images_path = []
labels_path = []
for name in os.listdir(images_dir):
    images_path.append(os.path.join(images_dir, name))
    # splitext (not split('.')) keeps filenames that contain extra dots intact.
    stem = os.path.splitext(name)[0]
    labels_path.append(os.path.join(labels_dir, stem + '.txt'))
# Pair every image with its label file in a single DataFrame.
df = pd.DataFrame({"images": images_path, "labels": labels_path})
# print(df)
# 85/15 train/validation split; random_state makes the split reproducible
# so image and label lists generated in separate runs stay consistent.
df_train, df_val = train_test_split(df, test_size=0.15, random_state=0)
print(df_val["images"])
# header=False is required: with the default header=True, to_csv writes the
# column name ("images"/"labels") as the first line, which YOLOv5 would then
# try to open as a (nonexistent) file path.
df_val["images"].to_csv("/home/yuzun/dian_data/valimage.txt", index=False, header=False)
df_val["labels"].to_csv("/home/yuzun/dian_data/vallabel.txt", index=False, header=False)
df_train["images"].to_csv("/home/yuzun/dian_data/trainimage.txt", index=False, header=False)
df_train["labels"].to_csv("/home/yuzun/dian_data/trainlabel.txt", index=False, header=False)
# print(df_train.values.shape)  # number of training samples
# print(df_val.values.shape)