参考博客:
https://blog.csdn.net/weixin_42749767/article/details/82772149
https://blog.csdn.net/chris_pei/article/details/79087284
由于制作VOC数据集比较麻烦,参考了别人很多的教程,所以记录自己成功的方法,防止自己遗忘。本着分享的精神,也希望能够帮到别人。
1. 介绍
不要自己再新建什么文件夹,仿照VOC的数据格式制作lmdb数据最方便,因为最终的目的是为了训练,不要在制作数据集上面花太多时间。
2. 检查xml和pic
需要将xml和pic文件对应,去掉多余的xml或是pic
#encoding:utf-8
# !/usr/bin/python
# Pair-check script: moves images that lack an annotation, and
# annotations that lack an image, into separate holding directories.
import os
import sys
import argparse
import os.path
import shutil
path = os.path.abspath('.')  # base directory: resolved from the current working directory
def parse_args():
    """Parse command-line arguments.

    Returns:
        argparse.Namespace with:
            xml: name of the directory holding annotation .xml files
            pic: name of the directory holding image files

    Exits with status 1 (after printing usage) when called with no
    arguments, since the script cannot do anything useful with
    xml=None / pic=None.
    """
    parser = argparse.ArgumentParser(description='cut pic and xml 1 to 2')
    parser.add_argument('--xml', dest='xml', type=str)
    parser.add_argument('--pic', dest='pic', type=str)
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()
    return args
if __name__ == '__main__':
    args = parse_args()
    print('Called with args:')
    print(args)
    xml_path = os.path.join(path, args.xml)
    pic_path = os.path.join(path, args.pic)
    # Orphans are moved aside rather than deleted:
    # annotations with no image -> not_pic/, images with no xml -> not_xml/.
    dst_xml_path = os.path.join(path, "not_pic")
    dst_pic_path = os.path.join(path, "not_xml")
    # Refuse to run into pre-existing output directories so a previous
    # run's results are never mixed with this one's.
    for dst in (dst_pic_path, dst_xml_path):
        if os.path.exists(dst):
            print("此目录已存在!")
            sys.exit(1)
        os.mkdir(dst)
    # Pass 1: move every image that has no matching .xml annotation.
    for fname in os.listdir(pic_path):
        xml_name = os.path.join(xml_path, os.path.splitext(fname)[0] + ".xml")
        if os.path.exists(xml_name):
            print("This file", fname, "has xml !")
        else:
            print("This file", fname, "has not xml !")
            shutil.move(os.path.join(pic_path, fname), dst_pic_path)
    # Pass 2: move every annotation that has no matching .jpg image.
    # NOTE(review): only the .jpg extension is checked — confirm the
    # dataset contains no .png/.jpeg images before relying on this.
    for fname in os.listdir(xml_path):
        pic_name = os.path.join(pic_path, os.path.splitext(fname)[0] + ".jpg")
        if os.path.exists(pic_name):
            print("This file", fname, "has pic !")
        else:
            print("This file", fname, "has not pic !")
            shutil.move(os.path.join(xml_path, fname), dst_xml_path)
    print("Done!")
运行该python文件:python 脚本名.py --pic pic --xml xml(脚本名换成你保存的文件名),就能去掉多余的pic或是xml文件。
3.xml中删除多余的标签
删除xml中不需要的标签
#coding=utf-8
# cElementTree was removed in Python 3.9; ElementTree is the supported module.
import xml.etree.ElementTree as ET
import os

# Source directory of raw annotations and destination for the cleaned ones.
# Adjust both for your own layout.
path_root = '/home/caffe-ssd/data/VOCdevkit/VOC2007/Annotations11'
dst_root = '/home/caffe-ssd/data/VOCdevkit/VOC2007/Annotations'
CLASSES = ["box"]


def filter_annotations(src_dir, dst_dir, keep_classes):
    """Copy every .xml annotation from src_dir to dst_dir, dropping
    each <object> whose <name> is not in keep_classes.

    Args:
        src_dir: directory holding the original annotation files.
        dst_dir: directory the filtered annotations are written to.
        keep_classes: iterable of class names to keep.
    """
    for fname in os.listdir(src_dir):
        if not fname.endswith('.xml'):
            continue  # ignore stray non-annotation files
        tree = ET.parse(os.path.join(src_dir, fname))
        root = tree.getroot()
        # findall() returns a snapshot list, so removing children from
        # root while looping over it is safe.
        for obj in root.findall('object'):
            if obj.find('name').text not in keep_classes:
                root.remove(obj)
        tree.write(os.path.join(dst_dir, fname))


if __name__ == '__main__':
    filter_annotations(path_root, dst_root, CLASSES)
运行该代码,只保留CLASSES中的box,其余的都删除
4. 生成txt文件
在/caffe-ssd/data/VOCdevkit/VOC2007下新建Annotations,ImageSets,JPEGImages三个文件夹(如果不存在的话)。将xml放入Annotations,pic放入JPEGImages,在ImageSets下新建Main文件夹,生成制作lmdb需要的txt文件。
#encoding:utf-8
import os
import random

# 下面两个目录改成自己的目录 (adjust these two directories for your layout)
xmlfilepath = r'/home/caffe-ssd/data/VOCdevkit/VOC2007/Annotations'
saveBasePath = r"/home/caffe-ssd/data/VOCdevkit"
trainval_percent = 0.9  # fraction of all samples used for trainval (rest -> test)
train_percent = 0.9     # fraction of trainval used for train (rest -> val)


def split_dataset(xml_dir, save_base, trainval_ratio=0.9, train_ratio=0.9):
    """Randomly split the annotations in xml_dir and write the four
    VOC list files (trainval/train/val/test .txt) under
    <save_base>/VOC2007/ImageSets/Main/.

    Each output line is a file stem (annotation name minus ".xml").

    Args:
        xml_dir: directory containing the .xml annotation files.
        save_base: VOCdevkit root the list files are written under.
        trainval_ratio: share of all samples placed in trainval.
        train_ratio: share of trainval placed in train.
    """
    total_xml = os.listdir(xml_dir)
    num = len(total_xml)
    indices = range(num)
    tv = int(num * trainval_ratio)
    tr = int(tv * train_ratio)
    trainval = random.sample(indices, tv)
    train = random.sample(trainval, tr)
    print("train and val size", tv)
    print("train size", tr)
    main_dir = os.path.join(save_base, 'VOC2007/ImageSets/Main')
    os.makedirs(main_dir, exist_ok=True)  # don't require the dir to pre-exist
    # Sets make the per-index membership tests O(1) instead of O(n).
    trainval_set = set(trainval)
    train_set = set(train)
    with open(os.path.join(main_dir, 'trainval.txt'), 'w') as ftrainval, \
         open(os.path.join(main_dir, 'test.txt'), 'w') as ftest, \
         open(os.path.join(main_dir, 'train.txt'), 'w') as ftrain, \
         open(os.path.join(main_dir, 'val.txt'), 'w') as fval:
        for i in indices:
            name = total_xml[i][:-4] + '\n'  # strip the ".xml" suffix
            if i in trainval_set:
                ftrainval.write(name)
                if i in train_set:
                    ftrain.write(name)
                else:
                    fval.write(name)
            else:
                ftest.write(name)


if __name__ == '__main__':
    split_dataset(xmlfilepath, saveBasePath, trainval_percent, train_percent)
会在Main文件夹下生成4个txt文件:
- train.txt :用来训练的图片文件的文件名列表
- val.txt:用来验证的图片文件的文件名列表
- trainval.txt:用来训练和验证的图片文件的文件名列表
- test.txt :用来测试的图片文件的文件名列表
5.制作LMDB数据集
分别更改labelmap_voc.prototxt中需要训练的类别,之后运行create_list.sh和create_data.sh便可以生成LMDB数据集。
- create_list.sh
#!/bin/bash
# Build the list files that create_annoset.py consumes: for each split,
# one line per sample pairing "<name>/JPEGImages/<id>.jpg" with
# "<name>/Annotations/<id>.xml".
root_dir=/home/caffe-ssd/data/VOCdevkit/
sub_dir=ImageSets/Main
# Directory this script lives in; output lists are written next to it.
bash_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
for dataset in trainval test
do
# Start from a clean output file so reruns don't append duplicates.
dst_file=$bash_dir/$dataset.txt
if [ -f $dst_file ]
then
rm -f $dst_file
fi
for name in VOC2007
do
#  if [[ $dataset == "test" && $name == "VOC2012" ]]
#  then
#    continue
#  fi
echo "Create list for $name $dataset..."
dataset_file=$root_dir/$name/$sub_dir/$dataset.txt
# Build the image column: prefix each id with the JPEGImages path,
# suffix with .jpg.
img_file=$bash_dir/$dataset"_img.txt"
cp $dataset_file $img_file
sed -i "s/^/$name\/JPEGImages\//g" $img_file
sed -i "s/$/.jpg/g" $img_file
# Build the label column the same way for Annotations/*.xml.
label_file=$bash_dir/$dataset"_label.txt"
cp $dataset_file $label_file
sed -i "s/^/$name\/Annotations\//g" $label_file
sed -i "s/$/.xml/g" $label_file
# Join the two columns with a space and drop the temporaries.
paste -d' ' $img_file $label_file >> $dst_file
rm -f $label_file
rm -f $img_file
done
# Generate image name and size infomation.
if [ $dataset == "test" ]
then
$bash_dir/../../build/tools/get_image_size $root_dir $dst_file $bash_dir/$dataset"_name_size.txt"
fi
# Shuffle trainval file.
if [ $dataset == "trainval" ]
then
rand_file=$dst_file.random
cat $dst_file | perl -MList::Util=shuffle -e 'print shuffle(<STDIN>);' > $rand_file
mv $rand_file $dst_file
fi
done
- create_data.sh
#!/bin/bash
# Convert the list files produced by create_list.sh into LMDB databases
# using caffe-ssd's scripts/create_annoset.py.
cur_dir=$(cd $( dirname ${BASH_SOURCE[0]} ) && pwd )
root_dir="/home/xionglin/NewDisk/274G1/programfiles/caffe-ssd"  # caffe-ssd checkout
#root_dir=$cur_dir/../..
cd $root_dir

redo=1  # set to 0 to keep an existing lmdb instead of rebuilding it
data_root_dir="/home/caffe-ssd/data/VOCdevkit"
dataset_name="VOC2007"
mapfile="$root_dir/data/$dataset_name/labelmap_voc.prototxt"
anno_type="detection"
db="lmdb"
# 0 disables resizing / dimension checks in create_annoset.py.
min_dim=0
max_dim=0
width=0
height=0

extra_cmd="--encode-type=jpg --encoded"
# BUG FIX: the original tested `[ $redo ]`, which is true for ANY
# non-empty value, so redo=0 still forced a rebuild. Compare the value.
if [ "$redo" = 1 ]
then
extra_cmd="$extra_cmd --redo"
fi
for subset in test trainval
do
python $root_dir/scripts/create_annoset.py --anno-type=$anno_type --label-map-file=$mapfile --min-dim=$min_dim --max-dim=$max_dim --resize-width=$width --resize-height=$height --check-label $extra_cmd $data_root_dir $root_dir/data/$dataset_name/$subset.txt $data_root_dir/$dataset_name/$db/$dataset_name"_"$subset"_"$db examples/$dataset_name
done