针对 COCO 2017 数据集的 val2017 子集。功能1:提取物体数量小于 N 的图片及其标注,生成子集。功能2:提取物体数量大于等于 N 的图片及其标注,生成子集。
时间:2023.9.7
功能1:针对coco2017中val2017数据集,将图片中物体数量小于N的图片和标签提取,生成子集
from pycocotools.coco import COCO
import os
import shutil
import json
# Object-count threshold: images with strictly fewer annotations than this are kept.
obj_num_per_img = 5
# Location of the COCO 2017 dataset and the split to filter.
dataDir = '/root/autodl-tmp/coco2017/'
dataType = 'val2017'
annFile = os.path.join(dataDir, 'annotations', 'instances_{}.json'.format(dataType))
# Output directories for the subset; with the threshold above these resolve to
# /root/coco2017_less_than_5/val2017 and /root/coco2017_less_than_5/annotations.
output_img = '/root/coco2017_less_than_' + str(obj_num_per_img) + '/val2017'
output_ann = '/root/coco2017_less_than_' + str(obj_num_per_img) + '/annotations'


def extract_subset_below(coco, max_objects, src_img_dir, dst_img_dir, dst_ann_dir):
    """Copy every image with fewer than ``max_objects`` annotations into a subset.

    Images without any annotation also qualify, since zero is below the bound.

    Args:
        coco: Loaded ``pycocotools.coco.COCO`` instance, or any object exposing
            ``dataset``, ``getImgIds``, ``getAnnIds``, ``loadAnns`` and
            ``loadImgs`` in the same way.
        max_objects: Exclusive upper bound on the number of annotations per image.
        src_img_dir: Directory that holds the source image files.
        dst_img_dir: Directory the selected images are copied into (created if missing).
        dst_ann_dir: Directory the subset annotation JSON is written into
            (created if missing).

    Returns:
        Path of the written ``instances_val2017.json`` file.
    """
    os.makedirs(dst_img_dir, exist_ok=True)
    os.makedirs(dst_ann_dir, exist_ok=True)

    subset = {
        # "info" and "licenses" are carried over when present; annotation files
        # that omit them get empty placeholders instead of raising KeyError.
        "info": coco.dataset.get('info', {}),
        "licenses": coco.dataset.get('licenses', []),
        "categories": coco.dataset['categories'],
        "images": [],
        "annotations": [],
    }

    for image_id in coco.getImgIds():
        print(f'Processing {image_id}')
        annotations = coco.loadAnns(coco.getAnnIds(imgIds=image_id))
        print(f'There are {len(annotations)} in {image_id}')
        if len(annotations) >= max_objects:
            continue

        image_info = coco.loadImgs(image_id)[0]
        file_name = image_info['file_name']
        copied_to = os.path.join(dst_img_dir, file_name)
        shutil.copy(os.path.join(src_img_dir, file_name), copied_to)
        print(f'Saved {copied_to}')

        subset['images'].append(image_info)
        subset['annotations'].extend(annotations)

    # The number of selected images is simply the length of the image list.
    print(f'Total {len(subset["images"])}')

    output_ann_json_path = os.path.join(dst_ann_dir, 'instances_val2017.json')
    with open(output_ann_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(subset, json_file, indent=4)
    print(f'Saved subset JSON to {output_ann_json_path}')
    return output_ann_json_path


if __name__ == '__main__':
    coco = COCO(annFile)
    extract_subset_below(
        coco,
        obj_num_per_img,
        os.path.join(dataDir, dataType),
        output_img,
        output_ann,
    )
功能2:针对 COCO 2017 中的 val2017 数据集,将物体数量大于等于 N 的图片及其标注提取出来,生成子集
from pycocotools.coco import COCO
import os
import shutil
import json
# Object-count threshold: images with at least this many annotations are kept.
obj_num_per_img = 30
# Location of the COCO 2017 dataset and the split to filter.
dataDir = '/root/autodl-tmp/coco2017/'
dataType = 'val2017'
annFile = os.path.join(dataDir, 'annotations', 'instances_{}.json'.format(dataType))
# Output directories for the subset; with the threshold above these resolve to
# /root/coco2017_more_than_30/val2017 and /root/coco2017_more_than_30/annotations.
# NOTE: despite the "more_than" directory name, the filter is inclusive (>= threshold).
output_img = '/root/coco2017_more_than_' + str(obj_num_per_img) + '/val2017'
output_ann = '/root/coco2017_more_than_' + str(obj_num_per_img) + '/annotations'


def extract_subset_at_least(coco, min_objects, src_img_dir, dst_img_dir, dst_ann_dir):
    """Copy every image with at least ``min_objects`` annotations into a subset.

    Args:
        coco: Loaded ``pycocotools.coco.COCO`` instance, or any object exposing
            ``dataset``, ``getImgIds``, ``getAnnIds``, ``loadAnns`` and
            ``loadImgs`` in the same way.
        min_objects: Inclusive lower bound on the number of annotations per image.
        src_img_dir: Directory that holds the source image files.
        dst_img_dir: Directory the selected images are copied into (created if missing).
        dst_ann_dir: Directory the subset annotation JSON is written into
            (created if missing).

    Returns:
        Path of the written ``instances_val2017.json`` file.
    """
    os.makedirs(dst_img_dir, exist_ok=True)
    os.makedirs(dst_ann_dir, exist_ok=True)

    subset = {
        # "info" and "licenses" are carried over when present; annotation files
        # that omit them get empty placeholders instead of raising KeyError.
        "info": coco.dataset.get('info', {}),
        "licenses": coco.dataset.get('licenses', []),
        "categories": coco.dataset['categories'],
        "images": [],
        "annotations": [],
    }

    for image_id in coco.getImgIds():
        print(f'Processing {image_id}')
        annotations = coco.loadAnns(coco.getAnnIds(imgIds=image_id))
        print(f'There are {len(annotations)} in {image_id}')
        if len(annotations) < min_objects:
            continue

        image_info = coco.loadImgs(image_id)[0]
        file_name = image_info['file_name']
        copied_to = os.path.join(dst_img_dir, file_name)
        shutil.copy(os.path.join(src_img_dir, file_name), copied_to)
        print(f'Saved {copied_to}')

        subset['images'].append(image_info)
        subset['annotations'].extend(annotations)

    # The number of selected images is simply the length of the image list.
    print(f'Total {len(subset["images"])}')

    output_ann_json_path = os.path.join(dst_ann_dir, 'instances_val2017.json')
    with open(output_ann_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(subset, json_file, indent=4)
    print(f'Saved subset JSON to {output_ann_json_path}')
    return output_ann_json_path


if __name__ == '__main__':
    coco = COCO(annFile)
    extract_subset_at_least(
        coco,
        obj_num_per_img,
        os.path.join(dataDir, dataType),
        output_img,
        output_ann,
    )