VOC 2007[1] 是一个多标签数据集,有 20 类。这里为 multi-label classification 任务做预处理,包括:
- 将图片移到同一个目录(方便读取);
- 数据划分(本身就已经分好 train/val 和 test 两部分);
- 处理标签。
Prepare
[1] 有下载链接,train/val 450M,test 430M。下载下来就是 VOCtrainval_06-Nov-2007.tar 和 VOCtest_06-Nov-2007.tar 两个文件。以 test set 的文件为例,解压之后在 VOCtest_06-Nov-2007/VOCdevkit/VOC2007/ 下可以见到:
- Annotations/:各样本对应的 .xml 标注文件,可以从中提取 label 信息,解析可参考 [5]。其中
<object>
标签下的<difficult>
子标签与下一条的 0 tag 有对应关系,见 [2]; - ImageSets/:只用到其中 Main/ 目录,里面是按类组织的 .txt 文件,标注每幅 image 样本是否包含此类物体,有 1/0/-1 三种标记(解释见 [2]):1 是含有,-1 是不含,0 表示 difficult。
- JPEGImages/:图片;
- SegmentationClass/:其它任务的,用不到;
- SegmentationObject/:其它任务的,用不到;
ID, Label
JPEGImages/ 下的图片是用 ID 命名的,可以从此获取样本 ID;而在 ImageSets/Main/ 中,又有 test.txt、train.txt、val.txt、trainval.txt 这 4 个 ID 划分文件。经验证,以两种方式获得的 ID 划分是一致的,且 train/val 与 test 无重合。
处理 label 时,参照 [4],将 0 当成 -1,即只有 1 表示正例,0/-1 都表示负例,结果与 [3] 里每类正例数统计是对得上的。 获取 label 又有两种方式:通过 Annotations/ 中的 .xml 文件,或通过 ImageSets/Main/(除了刚才的 ID 划分文件之外的).txt 文件。经验证,将 .txt 中的 0 当成 -1 处理与忽略 .xml 中 <difficult>
为 1 的效果相同。
Code
- update 2020.12.5:label 改为用
scipy.io.savemat
存,加压缩参数,见 [7]。 - notes 2020.7.22:ML-GCN[6] 会把 ID、label 写进两个 csv 文件:classification_trainval.csv、classification_test.csv,后续读数据也是按这两个 csv 文件来。经验证,所得 label 信息与本文处理的结果一致。
- update 2020.7.5:由于 ML-GCN 对 0 标签的处理是当成 1 而不是上述的 -1,所以改变处理标签的做法:保留 -1/0/1 原貌,读入后再按需处理 0 的问题。
import os
import shutil
from os.path import join
from xml.dom import minidom

import numpy as np
import scipy.io as sio
# http://host.robots.ox.ac.uk/pascal/VOC/voc2007/index.html
# http://host.robots.ox.ac.uk/pascal/VOC/voc2007/htmldoc/voc.html#SECTION00090000000000000000
P = "E:/iTom/dataset/VOC2007" # download/root directory
ALL_IMAGE_P = join(P, "images") # every image gets a copy in this single directory
# extraction directory of the train/val archive
TRAIN_P = join(P, "VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007")
TRAIN_IMAGE_P = join(TRAIN_P, "JPEGImages")
TRAIN_LABEL_P = join(TRAIN_P, "ImageSets/Main")
TRAIN_ANNO_P = join(TRAIN_P, "Annotations")
# extraction directory of the test archive
TEST_P = join(P, "VOCtest_06-Nov-2007/VOCdevkit/VOC2007")
TEST_IMAGE_P = join(TEST_P, "JPEGImages")
TEST_LABEL_P = join(TEST_P, "ImageSets/Main")
TEST_ANNO_P = join(TEST_P, "Annotations")
# official ID split files
SPLIT_TRAIN = join(TRAIN_LABEL_P, "train.txt")
SPLIT_VAL = join(TRAIN_LABEL_P, "val.txt")
SPLIT_TRAIN_VAL = join(TRAIN_LABEL_P, "trainval.txt")
SPLIT_TEST = join(TEST_LABEL_P, "test.txt")
"""处理 ID 划分"""
# print("--- Method 1: extract IDs from the JPEGImages/ directory ---")
def file_key(s):
    """Return the integer sample ID encoded in a filename like '000005.jpg'.

    Defined as a proper function instead of a lambda assignment (PEP 8 E731).
    """
    return int(s.split('.')[0])
# def get_id_list(path):
# id_list = os.listdir(path)
# id_list = list(map(file_key, id_list))
# print("#files:", len(id_list))
# id_set = set(id_list)
# print("#unique:", len(id_set))
# return id_list
# print("- train -")
# train_img_id = get_id_list(TRAIN_IMAGE_P) # 5011
# print("- test -")
# test_img_id = get_id_list(TEST_IMAGE_P) # 4952
# print("- 验证 train/val 与 test 无重复 ID -")
# train_img_id_set = set(train_img_id)
# test_img_id_set = set(test_img_id)
# # no intersection in id of train/val & test
# print("#common in train & test:", len(train_img_id_set.intersection(test_img_id_set))) # 0
print("--- 第二种方式:从 ID 划分文件提取 ID ---")
def get_id_list_from_file(_file):
    """Read sample IDs (one decimal ID per line) from an ID split file.

    Prints the total and unique counts, then returns the IDs as a list of
    ints, in file order.  Blank/whitespace-only lines (e.g. a stray trailing
    newline) are skipped instead of crashing `int()`.
    """
    id_list = []
    with open(_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line: # tolerate blank lines
                continue
            id_list.append(int(line))
    print("#id:", len(id_list))
    id_set = set(id_list)
    print("#unique id:", len(id_set))
    return id_list
# read the four official ID split files (expected counts in trailing comments)
print("- train -")
id_train = get_id_list_from_file(SPLIT_TRAIN) # 2501
print("- val -")
id_val = get_id_list_from_file(SPLIT_VAL) # 2510
print("- train-val -")
id_train_val = get_id_list_from_file(SPLIT_TRAIN_VAL) # 5011
print("- test -")
id_test = get_id_list_from_file(SPLIT_TEST) # 4952
# print("- verify that train/val and test share no IDs -")
# train_val_id_set = set(id_train_val)
# test_id_set = set(id_test)
# # train/val and test have no overlapping IDs
# print("#common in train & test:", len(train_val_id_set.intersection(test_id_set))) # 0
# print("- verify that both ID-extraction methods give the same split -")
# print("#common in train:", len(train_img_id_set.intersection(train_val_id_set))) # 5011
# print("#common in test:", len(test_img_id_set.intersection(test_id_set))) # 4952
# print("- check id complete -")
id_all = id_train_val + id_test # all sample IDs: train/val + test
print("#id:", len(id_all), ", max id:", max(id_all), ", min id:", min(id_all))
# highest sample ID; used below as the row count of the label matrices.
# Assumes IDs run contiguously from 1 -- the commented-out check below verifies it.
n_id = max(id_all)
print("- save indices -")
# convert the 1-based VOC sample IDs to 0-based row indices before saving
id_train = np.array(id_train) - 1
id_val = np.array(id_val) - 1
id_train_val = np.array(id_train_val) - 1
id_test = np.array(id_test) - 1
print("id train-val:", id_train_val.max(), id_train_val.min())
print("id test:", id_test.max(), id_test.min())
np.save(join(P, "idx_train.npy"), id_train)
np.save(join(P, "idx_val.npy"), id_val)
np.save(join(P, "idx_train_val.npy"), id_train_val)
np.save(join(P, "idx_test.npy"), id_test)
"""将全部 image 移到同一个目录"""
# (section: move all images into one directory)
# since all IDs are distinct
# we can move all image into one dir
if not os.path.exists(ALL_IMAGE_P):
    os.makedirs(ALL_IMAGE_P)
def copy_image(path, dst=None):
    """Copy every file under `path` into `dst` (default: ALL_IMAGE_P).

    Uses shutil.copy2 instead of shelling out to `cp`/`copy`: it is
    cross-platform and safe with paths containing spaces, and preserves
    file timestamps.  Prints a progress counter every 100 files.
    """
    if dst is None:
        dst = ALL_IMAGE_P
    img_ls = os.listdir(path)
    for i, f in enumerate(img_ls):
        shutil.copy2(join(path, f), dst)
        if i % 100 == 0:
            print(i)
copy_image(TRAIN_IMAGE_P)
copy_image(TEST_IMAGE_P)
"""处理 label"""
# (section: process labels)
# Two methods of extracting labels are implemented below and cross-checked.
# NOTE: despite the original "treat 0 as -1" plan, the code now keeps the raw
# -1/0/1 tags (see the 2020.7.5 update note above) and lets the reader decide.
# http://host.robots.ox.ac.uk/pascal/VOC/voc2007/htmldoc/voc.html#SECTION00031000000000000000
test_ls = os.listdir(TEST_LABEL_P)
# per-class files are named "<class>_test.txt"; counting them gives #classes
test_ls = [f for f in test_ls if "_test" in f]
N_CLASS = len(test_ls)
print("#class:", N_CLASS)
# map id: name -> num
test_ls = [f.split("_test")[0] for f in test_ls] # keep only the class name
id_map = {name: num for num, name in enumerate(test_ls)} # class name -> class ID
print(id_map)
print("--- 第一种方式:从 ImageSets/Main/ 提取 label ---")
L_label = np.zeros((n_id, N_CLASS)) # one row per sample ID, one column per class
def proc_label(path, suffix):
    """Fill the global L_label matrix from the per-class split files.

    Walks every "<class><suffix>.txt" under `path`; each line has the form
    "<sample-id> <tag>" with tag in {1, 0, -1}.  Tags of 0 leave L_label
    untouched (it keeps its initial value); 1/-1 are written as-is.
    Prints the number of positives per class.

    path: {TRAIN_LABEL_P, TEST_LABEL_P}
    suffix: {"_trainval", "_test"}
    """
    for fname in os.listdir(path):
        if suffix not in fname:
            continue
        cls_name = fname.split(suffix)[0]
        assert cls_name in id_map
        cls_idx = id_map[cls_name]
        n_pos = 0
        with open(join(path, fname), "r") as fin:
            for row in fin: # format: "<id> <1/0/-1>"
                fields = row.split()
                tag = int(fields[1])
                if tag == 0: # keep -1/0/1: rows tagged 0 are simply skipped
                    continue
                # sample IDs are 1-based -> 0-based row index
                L_label[int(fields[0]) - 1][cls_idx] = tag
                if tag == 1:
                    n_pos += 1
        print("#{}: {}".format(cls_name, n_pos))
print("- train-val label -")
proc_label(TRAIN_LABEL_P, "_trainval")
print("- test label -")
proc_label(TEST_LABEL_P, "_test")
sum_label = L_label.sum(0)
print("label statistics:", sum_label)
# BUGFIX: use a *signed* int8, not uint8 -- the labels take values in
# {-1, 0, 1}, and casting -1 to uint8 silently wraps it to 255 in the
# saved .mat file.
L_label = L_label.astype(np.int8)
# np.save(join(P, "labels.l.npy"), L_label)
sio.savemat(join(P, "labels.l.mat"), {"labels": L_label}, do_compression=True)
print("--- 第二种方式:从 Annotations/ 提取 label ---")
# https://github.com/HCPLab-SYSU/SSGRL/blob/master/datasets/voc07dataset.py
L_anno = np.zeros((n_id, N_CLASS)) - 1 # initialise every entry to -1 (negative)
def proc_annotation(path):
    """Fill the global L_anno matrix sample-by-sample from the .xml files.

    Each <object> contributes a tag for its class: 1 normally, 0 when its
    <difficult> flag is '1'.  A stored 1 is never overwritten, so the tag
    kept per (sample, class) is max-like over the sample's objects.
    Prints the per-class count of positives.

    path: {TRAIN_ANNO_P, TEST_ANNO_P}
    """
    pos_cnt = {k: 0 for k in id_map.keys()}
    file_ls = os.listdir(path)
    for _f in file_ls:
        sid = file_key(_f) - 1 # 0-based sample index
        DOMTree = minidom.parse(join(path, _f))
        root = DOMTree.documentElement
        objects = root.getElementsByTagName('object')
        for obj in objects:
            lb = 1
            if '1' == obj.getElementsByTagName('difficult')[0].firstChild.data: # keep "difficult" as 0
                # continue # (alternative: ignore difficult objects entirely)
                lb = 0
            class_name = obj.getElementsByTagName('name')[0].firstChild.data.lower()
            assert class_name in id_map
            c = id_map[class_name]
            # if 0 == L_anno[sid][c]:
            if 1 != L_anno[sid][c]: # keep -1/0/1; never downgrade a stored 1
                L_anno[sid][c] = lb
                if 1 == lb:
                    pos_cnt[class_name] += 1
    print("pos count:", pos_cnt)
print("- train-val annotation -")
proc_annotation(TRAIN_ANNO_P)
print("- test annotation -")
proc_annotation(TEST_ANNO_P)
sum_label = L_anno.sum(0)
print("label statistics:", sum_label)
L_anno = L_anno.astype(np.int8) # signed int8, same reason as for L_label above
# np.save(join(P, "labels.a.npy"), L_anno)
sio.savemat(join(P, "labels.a.mat"), {"labels": L_anno}, do_compression=True)
# cross-check: both extraction methods must agree on every entry
print("#diff:", (L_label != L_anno).astype(np.int8).sum()) # 0
Cloud Drive
链接:https://pan.baidu.com/s/1Mh_nX-y-ijvZEmy3lzTaNw,提取码:oq10。