理论上用于分类的图像一般都不需要用labelme来标注的,笔者是因为刚好手上有这么一组数据,所以就顺带处理了。labelme标注完的数据每张还包含了一个json文件,这个在分类任务中用不上。具体的mmclassification使用方法在我的另一篇文章里有,需要注意的是现在分类任务被集合在mmpretrain里了。用法也有点区别,不过都是细微的修改,都还能用。
数据处理代码如下:
import os, random, shutil

# Image extensions accepted by the splitter (same three the original checked).
_IMG_EXTS = ('.jpg', '.jpeg', '.png')


def _list_images(folder):
    """Return the image file names (not full paths) found directly in *folder*."""
    return [f for f in os.listdir(folder) if f.endswith(_IMG_EXTS)]


def _copy_and_record(names, src_dir, dst_dir, label_index, lines):
    """Copy each file in *names* from src_dir to dst_dir and append a
    '<dst_path> <label_index>' line to *lines* (mmclassification meta format)."""
    for name in names:
        dst = os.path.join(dst_dir, name)
        shutil.copy(os.path.join(src_dir, name), dst)
        lines.append(f'{dst} {label_index}')


def split_dataset(ori_path, label_list, val_ratio=0.1, test_ratio=0.2, seed=None):
    """Split a folder-per-label image dataset into train/val/test.

    Expects ``ori_path/<label>/`` to hold the images of each class. Creates
    ``ori_path/result/{train,val,test}/<label>/`` with the copied images and
    ``ori_path/result/meta/{train,val,test}.txt`` whose lines are
    ``<image path> <label index>`` — the layout mmclassification /
    mmpretrain expects.

    Parameters:
        ori_path: root folder containing one sub-folder per label.
        label_list: class names; a name's index in this list is its numeric label.
        val_ratio / test_ratio: fraction of each class sampled for val / test.
            Both counts are computed from the *full* class size (as in the
            original script), so train receives 1 - val_ratio - test_ratio.
        seed: optional seed for reproducible sampling.

    Returns:
        (train_lines, test_lines, val_lines) — the lines written to the
        three meta txt files.
    """
    rng = random.Random(seed)

    result_path = os.path.join(ori_path, 'result')
    train_path = os.path.join(result_path, 'train')
    test_path = os.path.join(result_path, 'test')
    val_path = os.path.join(result_path, 'val')
    meta_path = os.path.join(result_path, 'meta')
    # makedirs(..., exist_ok=True) replaces the original's explicit
    # os.path.exists checks and also creates result_path itself.
    os.makedirs(meta_path, exist_ok=True)

    traintxt, testtxt, valtxt = [], [], []

    for label_idx, label in enumerate(label_list):
        ori_pic_path = os.path.join(ori_path, label)
        goal_train = os.path.join(train_path, label)
        goal_test = os.path.join(test_path, label)
        goal_val = os.path.join(val_path, label)
        for d in (goal_train, goal_test, goal_val):
            os.makedirs(d, exist_ok=True)

        pics = _list_images(ori_pic_path)
        # Both counts are taken from the full class size, so the ratios
        # refer to the whole class, not to what is left after sampling.
        val_number = int(len(pics) * val_ratio)
        test_number = int(len(pics) * test_ratio)

        # Draw the validation sample first and remove it from the pool.
        val_sample = rng.sample(pics, val_number)
        pics = [p for p in pics if p not in val_sample]
        _copy_and_record(val_sample, ori_pic_path, goal_val, label_idx, valtxt)

        # Then the test sample from what remains.
        test_sample = rng.sample(pics, test_number)
        pics = [p for p in pics if p not in test_sample]
        _copy_and_record(test_sample, ori_pic_path, goal_test, label_idx, testtxt)

        # Everything not sampled goes to the training split.
        _copy_and_record(pics, ori_pic_path, goal_train, label_idx, traintxt)

    # Write the three meta files, one record per line.
    for txt_name, lines in (('train.txt', traintxt),
                            ('test.txt', testtxt),
                            ('val.txt', valtxt)):
        with open(os.path.join(meta_path, txt_name), 'w') as f:
            for item in lines:
                f.write(item + '\n')

    return traintxt, testtxt, valtxt


if __name__ == '__main__':
    # Original hard-coded configuration — adjust the ratios/paths to taste.
    split_dataset(r'F:\Data\doctor_research\Arthritis_mini',
                  ['arthritis', 'normal'],
                  val_ratio=0.1, test_ratio=0.2)
转imagenet(用于mmclassification)
1412

被折叠的 条评论
为什么被折叠?



