按比例划分数据集coco。更改标签类别号

最新推荐文章于 2024-07-20 17:12:48 发布
TUOLONGcsdn
最新推荐文章于 2024-07-20 17:12:48 发布
阅读量532
点赞数
文章标签： python 开发语言
本文链接：https://blog.csdn.net/TUOLONGcsdn/article/details/128954730
版权
import os,shutil
import math
import random

#混合数据集划分
# path = ""
# destinationPath = ""
# os.chdir(path)
# datapath = os.getcwd()




class DataProcess:
    def __init__(self):
        self.sourcepath = datapath
        # dataname 当前文件的文件名列表
        # self.desPath = destinationPath
        self.train = 0.6
        self.valid = 0.2
        self.test = 0.2
        # 更改类别号方法参数
        # self.NewClsNum = NewClassNum
        # self.ChangeClsNum_path = Path2

    def move_file(self,key, desfile):
        if not os.path.exists(desfile):
            print("desfile not exist,so create the dir")
            os.makedirs(desfile, 1)
        if os.path.exists(desfile):
            for file in os.listdir(self.sourcepath):
                if key in file:
                    shutil.copy(self.sourcepath + '/' + file, desfile + '/' + file)

    '''
    混合数据集划分，根据 .txt 文件划分
    '''
    def split_to_single_class(self,sourcepath,destpath):
        datanames = os.listdir(sourcepath)
        for file in datanames:
            if file.endswith('.txt'):
                filepath = os.path.join(sourcepath, file)
                with open(filepath, 'r') as f:
                    destinationfile = f.read(1)
                    # print(destinationfile)
                    f.close()
            print(destinationfile)
            desfile = os.path.join(destpath, destinationfile)
            key = file[:-5]  # 去除文件名后缀
            self.move_file(key, desfile)
            pass
        pass

    '''
    更改数据集类别号
    '''
    def change_cluss_num(self,path,newclassnum):
        datanames = os.listdir(path)
        for file in datanames:
            if file.endswith('.txt'):
                txt_item = os.path.join(path, file)
                with open(txt_item,'r') as f:
                    lines = f.readlines()
                    # print(lines)
                    f.close()
                with open(txt_item,'w') as f:
                    for line in lines:
                        # line.strip()删除首尾空格和换行符，.split()按空格分割字符串
                        line_split = line.strip().split()
                        line_split[0] = newclassnum
                        f.write(
                            line_split[0] + ' ' +
                            line_split[1] + ' ' +
                            line_split[2] + ' ' +
                            line_split[3] + ' ' +
                            line_split[4] + '\n'
                        )
                f.close()
    '''
    按比例划分数据集,
    输入为各个单个类的数据集，
    '''
    def make_dirs(self , dest_path , images_or_labels):
        dirlist = ['train','valid','test']
        if not os.path.exists(dest_path + '/' + str(images_or_labels)):
            print("desfile not exist,so create the dir")
            for dir in dirlist:
                os.makedirs(dest_path + '/' + str(images_or_labels) +'/'+ dir)
    def copy_splited_data(self,filelist,sourcepath,destpath):
        # sourcelist = os.listdir(sourcepath)
        for file in filelist:
            filepath = os.path.join(sourcepath, file)
            if filepath:
                shutil.copy(filepath,destpath)

    '''
    按比例划分数据集，输入为图片路径，标签路径，目标文件夹。图片和标签路径可相同
    '''
    def split_train_dataset(self,source_img_path,source_label_path,dest_path):
        #按比例划分训练集0.6，验证集0.2，和测试集0.2
        sourceimgnames = os.listdir(source_img_path)
        sourcelabelnames = os.listdir(source_label_path)
        key = []
        imgnames = []
        labelnames = []
        for file in sourceimgnames:
            # 获取源图片目录下的图片名字列表
            filename = os.path.join(source_img_path,file)
            if os.path.isfile(filename):
                if file.endswith('.txt'):
                    labelnames.append(file)
                    key.append(file[:-4])
        for file2 in sourcelabelnames:
            file2names = os.path.join(source_label_path,file2)
            if os.path.isfile(file2names):
                if file2[-4:] != '.txt':
                    imgnames.append(file2)

        if imgnames:self.make_dirs(dest_path,'images')
        if labelnames:
            self.make_dirs(dest_path,'labels')
            rantestlabels = []
            rantestimages = []
            rantrainlabels = []
            rantrainimages = []
            ranvalidimages = []
            rantwolabels = random.sample(labelnames,int((self.valid+self.test)*len(labelnames)))
            num = int((self.valid / (self.valid + self.test)) * len(rantwolabels))
            ranvalidlabels = random.sample(rantwolabels,num)#随机验证集列表
            for ranname3 in ranvalidlabels:
                rankey3 = ranname3[:-4]
                for ranimg3 in imgnames:
                    if rankey3 in ranimg3:
                        ranvalidimages.append(ranimg3)#随机验证集标签列表

            for ranname1 in rantwolabels:
                if not ranname1 in ranvalidlabels:
                    rankey1 = ranname1[:-4]
                    for ranimg1 in imgnames:
                        if rankey1 in ranimg1:
                            rantestimages.append(ranimg1)#随机测试集图片列表
                    rantestlabels.append(ranname1)#随机测试集标签列表

            for ranname2 in labelnames:
                if not ranname2 in rantwolabels:
                    rankey2 = ranname2[:-4]
                    for ranimg2 in imgnames:
                        if rankey2 in ranimg2:
                            rantrainimages.append(ranimg2)
                    rantrainlabels.append(ranname2)#随机训练集列表
            self.copy_splited_data(rantrainimages,source_img_path,dest_path+'/'+'images'+'/'+'train')
            self.copy_splited_data(ranvalidimages,source_img_path,dest_path+'/'+'images'+'/'+'valid')
            self.copy_splited_data(rantestimages,source_img_path,dest_path+'/'+'images'+'/'+'test')
            self.copy_splited_data(rantrainlabels, source_label_path, dest_path + '/' + 'labels' + '/' + 'train')
            self.copy_splited_data(ranvalidlabels, source_label_path, dest_path + '/' + 'labels' + '/' + 'valid')
            self.copy_splited_data(rantestlabels, source_label_path, dest_path + '/' + 'labels' + '/' + 'test')






if __name__ == '__main__':
    '''
    datapro = DataProcess()
    # 更改数据集类别号 例子
    NewClassNum = '1'
    Path2 = "D:\Learning_software\deeplearning\learnV2_Pytorch\chapter3"
    datapro.change_cluss_num(Path2,NewClassNum)
    # datapro.split_data() 例子
    dest_path =r"D:\masterPROJECT\laser＿weeding\weed_dataset\testsplitAG\dest"
    source_img_path =r"D:\masterPROJECT\laser＿weeding\weed_dataset\testsplitAG\1"
    source_label_path =r"D:\masterPROJECT\laser＿weeding\weed_dataset\testsplitAG\1"
    datapro.split_train_dataset(source_img_path,source_label_path,dest_path)
    '''

    classlist = []
    datapath = r"D:\masterPROJECT\laser＿weeding\weed_dataset\dataclassfication"
    classdirs = os.listdir(datapath)
    for dir in classdirs:
        classdir = os.path.join(datapath,dir)
        if os.path.isdir(classdir):
            classlist.append(dir)
    class_txt = os.path.join(datapath,'classes.txt')
    with open(class_txt,'w') as f:
        for line in classlist:
            f.write(line+'\n')
        f.close()

    dataprocess = DataProcess()
    #根据classlist索引值,更改类别号
    destpath = r"D:\masterPROJECT\laser＿weeding\weed_dataset\testsplitAG"
    for cls in classlist:
        num = classlist.index(cls)
        num = str(num)
        clspath = os.path.join(datapath,cls)
        print(num)
        dataprocess.change_cluss_num(clspath,num)
        dataprocess.split_train_dataset(clspath,clspath,destpath)