文章目录
文件处理
重命名的处理
taco数据集中多个batch文件夹(每个batch里面存储的是图片)合并,并修改对应json文件的值
import os

# Merge the TACO dataset's batch folders (each holds images): prefix every
# image file with its batch folder name so the names stay unique after the
# folders are merged, then the matching json entries can be updated.
base_path = r'/Project/dataset/taco'
batch_directories = os.listdir(base_path)
print(batch_directories)

# For each batch folder, rename every file inside it.
for batch_directory in batch_directories:
    batch_path = os.path.join(base_path, batch_directory)
    for filename in os.listdir(batch_path):
        # e.g. batch1/000001.jpg -> batch1/batch1_000001.jpg
        new_filename = f"{batch_directory}_{filename}"
        old_path = os.path.join(batch_path, filename)
        new_path = os.path.join(batch_path, new_filename)
        os.rename(old_path, new_path)
json文件
查询
当前垃圾检测在测试集(coco_garbage)中把cup错检为orange,目的是测试是否存在标注错误。
在json文件的annotations这一key中查询category_id为2(即为cup)并找到对应唯一标识的image_id,在images中找到对应id的图片名称,结果表明未标注错误。
其中需要注意json文件排序的list从0开始。
原因:训练数据量及其多样性不足,train仅有400张,cup类81张,orange类77张。过拟合(在训练集上测试)
import json

# Check whether cup (category_id 2) images were mislabeled: collect every
# annotation with category_id == 2, take its image_id, and print the matching
# file name from the images list.
with open('/data/usrname/PaddleDetection/dataset/coco_garbage/annotations/train.json') as f:
    superHeroSquad = json.load(f)
print(type(superHeroSquad))  # Output: dict
print(superHeroSquad['annotations'][1].keys())

# 1. anno category_id == 2 (cup) -> collect image_id -> inspect the images.
# (renamed from `list`/`id` to avoid shadowing the builtins)
image_ids = []
for anno in superHeroSquad['annotations']:
    if anno['category_id'] == 2:
        image_ids.append(anno['image_id'])
        # print(anno['image_id'])

for image_id in image_ids:
    print(image_id)
    # Python lists are 0-based, hence the - 1. NOTE: this assumes the images
    # list is stored in id order, which is not guaranteed (see below).
    print(superHeroSquad['images'][image_id - 1]['file_name'])

# The list position and the real id often do NOT line up one-to-one, so the
# robust lookup searches for the entry whose 'id' field matches:
num = 0
for image_id in image_ids:
    # print(image_id)
    for idx, image in enumerate(superHeroSquad['images']):
        if image['id'] == image_id:
            print(superHeroSquad['images'][idx]['file_name'])
            num += 1
# num will not equal the image count: one image can carry several annotations.
print(f'num:{num}')
合并
json文件的合并需要根据具体情况具体处理,下列列举的为一个coco格式的文件处理。
内容参考:合并多个coco格式数据的json标注文件
同一目录下处理
'''
Concatenate several COCO-format jsons (a single COCO json may contain many
images and annotations).
Only a single category set is supported.
Image ids and bbox ids are automatically renumbered in sequence.
'''
import json
import os
import shutil
from pathlib import Path


def merge_coco_datasets(datasets, categories):
    """Merge already-loaded COCO dicts into one, renumbering ids.

    datasets:   iterable of dicts each holding 'images' and 'annotations'
    categories: shared categories list placed into the result unchanged
    returns:    {"images": [...], "annotations": [...], "categories": categories}
                with image ids and annotation ids renumbered from 0 and each
                annotation's image_id pointing at the renumbered image.
    The input dicts are not modified (entries are copied before editing).
    """
    total_data = {"images": [], "annotations": [], "categories": categories}
    img_id_count, bbox_id_count = 0, 0
    for js_data in datasets:
        # Group annotations by their original image_id once, instead of
        # rescanning the whole annotation list for every image (was O(I*A)).
        annos_by_image = {}
        for anno in js_data['annotations']:
            annos_by_image.setdefault(anno['image_id'], []).append(anno)
        for image in js_data['images']:
            image_new = image.copy()
            origin_img_id = image['id']
            # New sequential id; the file_name is kept as-is.
            image_new['id'] = img_id_count
            total_data['images'].append(image_new)
            for anno in annos_by_image.get(origin_img_id, []):
                anno_new = anno.copy()
                anno_new['id'] = bbox_id_count
                anno_new['image_id'] = img_id_count
                total_data['annotations'].append(anno_new)
                bbox_id_count += 1
            img_id_count += 1
    return total_data


if __name__ == '__main__':
    # Directory holding the json files to merge.
    json_root = Path('/data/usrname/PaddleDetection/dataset/combine_data/coco_garbage_AND_bottle_AND_LaJiJianCheVoc/zanshi')
    output_file = r'/data/usrname/PaddleDetection/dataset/combine_data/coco_garbage_AND_bottle_AND_LaJiJianCheVoc/output.json'
    # images_path = r'/storage/user/yanyan/dataset_trash/coco_garbage/images'
    # Shared category list for the merged file.
    categories = [
        {"supercategory": "none", "id": 1, "name": "battery"},
        {"supercategory": "none", "id": 2, "name": "cup"},
        {"supercategory": "none", "id": 3, "name": "bottle"},
        {"supercategory": "none", "id": 4, "name": "paper"},       # paper ball
        {"supercategory": "none", "id": 5, "name": "fruit_peel"},  # renamed; images renamed too
        {"supercategory": "none", "id": 6, "name": "plastic_bag"}, # new class
        {"supercategory": "none", "id": 7, "name": "cigarette_end"},
        {"supercategory": "none", "id": 8, "name": "can"},
    ]
    # Collect the json files in the directory.
    file_list = [str(i) for i in json_root.iterdir() if i.suffix == '.json']
    print(file_list)
    datasets = []
    for js_file in file_list:
        print(js_file)
        with open(js_file, 'r') as f:
            datasets.append(json.load(f))
    total_data = merge_coco_datasets(datasets, categories)
    with open(output_file, 'w') as f:
        json.dump(total_data, f)
针对每一个json文件处理
'''
coco_garbage has 5 classes.
The bottle dataset has 4 classes, all of which should be collapsed into the
single class "bottle" in the combined dataset.
'''
import json
import os


def rename_images_file(origin_file_name, bottle_number):
    """Rename origin_file_name inside bottle_images to 'bottle_<n>.jpg'.

    Does nothing when the file is not present in the directory.
    """
    filelist = os.listdir(bottle_images)
    if origin_file_name in filelist:
        new_filename = f'bottle_{bottle_number}.jpg'
        old_path = os.path.join(bottle_images, origin_file_name)
        new_path = os.path.join(bottle_images, new_filename)
        os.rename(old_path, new_path)


bottle_images = r'/data/usrname/PaddleDetection/dataset/PlasticPaperGarbageBagSyntheticImages/bottle_train2017'
file_coco = r'/storage/user/usrname/dataset_trash/coco_garbage/annotations/output.json'
file_bottle = r'/data/usrname/PaddleDetection/dataset/PlasticPaperGarbageBagSyntheticImages/bottle_anno/instances_train2017.json'
output_file = r'/data/usrname/PaddleDetection/dataset/combine_data/output.json'

# Plan:
# - images: copied directly (id, width, height, file_name), ids renumbered
# - annotations: copied; bottle-dataset entries get image_id remapped and
#   category_id forced to 3
# - categories: keep coco_garbage's 5 classes unchanged
# - the bottle image files themselves are renamed on disk to match
categories = [
    {"supercategory": "none", "id": 1, "name": "battery"},
    {"supercategory": "none", "id": 2, "name": "cup"},
    {"supercategory": "none", "id": 3, "name": "bottle"},
    {"supercategory": "none", "id": 4, "name": "paper"},
    {"supercategory": "none", "id": 5, "name": "orange"}
]
total_data = {"images": [], "annotations": [], "categories": categories}

# Renumber ids sequentially across both source files.
img_id_count, bbox_id_count = 0, 0

with open(file_coco, 'r') as f:
    js_coco = json.load(f)
images = js_coco['images']
annotations = js_coco['annotations']
# Group annotations by image_id once instead of rescanning the whole
# annotation list per image (was O(images * annotations)).
annos_by_image = {}
for anno in annotations:
    annos_by_image.setdefault(anno['image_id'], []).append(anno)
for image in images:
    image_new = image.copy()
    origin_img_id = image['id']
    image_new['id'] = img_id_count
    total_data['images'].append(image_new)
    for anno in annos_by_image.get(origin_img_id, []):
        anno_new = anno.copy()
        anno_new['id'] = bbox_id_count
        anno_new['image_id'] = img_id_count
        total_data['annotations'].append(anno_new)
        bbox_id_count += 1
    img_id_count += 1
print(img_id_count)
print(bbox_id_count)

# Process the second (bottle) dataset.
with open(file_bottle, 'r') as f2:
    js_bottle = json.load(f2)
images = js_bottle['images']
annotations = js_bottle['annotations']
annos_by_image = {}
for anno in annotations:
    annos_by_image.setdefault(anno['image_id'], []).append(anno)
bottle_number = 100  # bottle image names start at bottle_100.jpg
for image in images:
    image_new = image.copy()
    origin_img_id = image['id']
    image_new['id'] = img_id_count
    # Rename the image both on disk and in the json.
    origin_file_name = image['file_name']
    rename_images_file(origin_file_name, bottle_number)
    image_new['file_name'] = f'bottle_{bottle_number}.jpg'
    bottle_number += 1
    total_data['images'].append(image_new)
    for anno in annos_by_image.get(origin_img_id, []):
        anno_new = anno.copy()
        anno_new['id'] = bbox_id_count
        anno_new['image_id'] = img_id_count
        # Every bottle-dataset class collapses into category 3 (bottle).
        anno_new['category_id'] = 3
        total_data['annotations'].append(anno_new)
        bbox_id_count += 1
    img_id_count += 1

with open(output_file, 'w') as f:
    json.dump(total_data, f)
VOC格式(.xml)转COCO格式(.json)
VOC格式的数据为每张图片对应一个.xml文件记录标注信息,现根据其信息转化为COCO格式的json文件。
'''
Convert VOC-style annotations (one .xml per image) into a single
COCO-format json file.
'''
import os
import xml.etree.ElementTree as ET
import json


def voc_root_to_coco(root, image_id, ann_id_start, class_names):
    """Convert one parsed VOC annotation tree to COCO entries.

    root:         ElementTree root of a VOC annotation file
    image_id:     COCO id to assign to this image
    ann_id_start: id for the first annotation produced here (sequential after)
    class_names:  ordered class list; category_id is 1-based index therein
    returns:      (image_dict, list_of_annotation_dicts)
    raises:       ValueError if an <object> name is not in class_names
    """
    image = {
        "id": image_id,
        "file_name": root.find("filename").text,
        "width": int(root.find("size/width").text),
        "height": int(root.find("size/height").text),
        "date_captured": "",
        "license": 0,
    }
    annotations = []
    for obj in root.findall("object"):
        category_id = class_names.index(obj.find("name").text) + 1
        bndbox = obj.find("bndbox")
        xmin = int(bndbox.find("xmin").text)
        ymin = int(bndbox.find("ymin").text)
        xmax = int(bndbox.find("xmax").text)
        ymax = int(bndbox.find("ymax").text)
        annotations.append({
            "id": ann_id_start + len(annotations),
            "image_id": image_id,
            "category_id": category_id,
            "segmentation": [],
            "area": (xmax - xmin) * (ymax - ymin),
            # COCO bbox format: [x, y, width, height]
            "bbox": [xmin, ymin, xmax - xmin, ymax - ymin],
            "iscrowd": 0,
        })
    return image, annotations


if __name__ == '__main__':
    # Root directory of the VOC dataset.
    voc_root = '/data/usrname/PaddleDetection/dataset/LaJiJianCheVoc'
    # Output path for the COCO-format json.
    coco_output_file = '/data/usrname/PaddleDetection/dataset/LaJiJianCheVoc/LaJiJianCheVocCOCO.json'
    # Replace with the VOC dataset's class names; ids are 1-based positions.
    voc_classes = ['ChaHe', 'ShuLiaoDai', 'YanTou', 'KuaiZhi', 'ShuiGuoPi',
                   'HuaZhuangPingPin', 'YiLaGuan', 'ShuLiaoPin', 'GanDianChi']
    coco_data = {
        "images": [],
        "annotations": [],
        "categories": [
            {"id": i + 1, "name": voc_class, "supercategory": ""}
            for i, voc_class in enumerate(voc_classes)
        ],
    }
    xml_folder = os.path.join(voc_root, 'annotations')
    for xml_filename in os.listdir(xml_folder):
        if not xml_filename.endswith('.xml'):
            continue
        tree = ET.parse(os.path.join(xml_folder, xml_filename))
        image_id = len(coco_data["images"]) + 1
        image, annotations = voc_root_to_coco(
            tree.getroot(), image_id,
            len(coco_data["annotations"]) + 1, voc_classes)
        coco_data["images"].append(image)
        coco_data["annotations"].extend(annotations)
    # Save the COCO-format json.
    with open(coco_output_file, 'w') as output:
        json.dump(coco_data, output)
修改图片名称及同时更改json文件
'''
1. Rename the image files on disk.
2. Update the matching file_name entries in the json's images list.
'''
import os
import json

file_path = r'/data/usrname/PaddleDetection/dataset/combine_data/coco_garbage_AND_bottle_AND_LaJiJianCheVoc/images'
json_file_path = r'/data/usrname/PaddleDetection/dataset/combine_data/coco_garbage_AND_bottle_AND_LaJiJianCheVoc/output.json'
output_file = r'/data/usrname/PaddleDetection/dataset/combine_data/coco_garbage_AND_bottle_AND_LaJiJianCheVoc/output2.json'

with open(json_file_path, 'r') as json_file:
    data = json.load(json_file)

# Starting number for each renamed class (category id in the comment).
category_counts = {
    'plastic_bag_number': 0,       # 6
    'cigarette_end_number': 0,     # 7
    'fruit_peel_number': 100,      # 5
    'can_number': 0,               # 8
    'bottle_number': 1300,         # 3
    'battery_number': 100          # 1
}


def rename_images_file(origin_file_name, name):
    """Rename origin_file_name to name inside file_path (if present there)."""
    filelist = os.listdir(file_path)
    if origin_file_name in filelist:
        old_path = os.path.join(file_path, origin_file_name)
        new_path = os.path.join(file_path, name)
        os.rename(old_path, new_path)


# category_id -> (file-name prefix, counter key in category_counts);
# replaces the original if/elif chain. Unknown categories are skipped.
rename_rules = {
    6: ('plastic_bag', 'plastic_bag_number'),
    7: ('cigarette_end', 'cigarette_end_number'),
    5: ('fruit_peel', 'fruit_peel_number'),
    8: ('can', 'can_number'),
    3: ('bottle', 'bottle_number'),
    1: ('battery', 'battery_number'),
}

# Index images by id once instead of rescanning the whole image list for
# every annotation (image ids are assumed unique, as in valid COCO).
images = data.get('images', [])
image_idx_by_id = {image.get('id'): i for i, image in enumerate(images)}

for anno in data.get('annotations', []):
    img_idx = image_idx_by_id.get(anno.get('image_id'))
    if img_idx is None:
        continue
    image_info = images[img_idx]
    # Only files still carrying their original 'img...' name get renamed;
    # after the first rename, later annotations of the same image are skipped.
    original_filename = image_info.get('file_name')
    if not original_filename.startswith('img'):
        continue
    rule = rename_rules.get(anno.get('category_id'))
    if rule is None:
        continue  # unknown category
    prefix, counter_key = rule
    new_filename = f'{prefix}_{category_counts[counter_key]}.jpg'
    category_counts[counter_key] += 1
    # Update the json entry and the file on disk together.
    print(f'Matching: {original_filename} -> {new_filename}')
    data['images'][img_idx]['file_name'] = new_filename
    rename_images_file(original_filename, new_filename)

with open(output_file, 'w') as f:
    json.dump(data, f)
pandas库
查看所有特征值
train1_data.columns
查看缺失值
用isnull方法来查看空值,得到的结果是布尔值。
df_list.isnull()
较大数据集用info()方法
df_list.info()
#计数
df_list.isnull().sum()
#单独某列查看空值
df_list["name"].isnull().sum()
缺失值删除
只要一行中有一个缺失值这一行就全都删除
df_list.dropna()
#整行为空的前提下才删除
df_list.dropna(how = "all")
缺失值填充 用众数/均值
众数可能存在多个,pandas返回的是一个series,默认取第一个众数。
import pandas as pd

# Fill NaNs in column f0 with its mode. mode() may return several values
# (pandas gives back a Series); [0] takes the first. Assign back instead of
# calling `train1_data.f0.fillna(..., inplace=True)`: inplace on a column
# selected from the frame is chained assignment, may act on a copy, and is
# deprecated in recent pandas.
train1_data['f0'] = train1_data['f0'].fillna(train1_data['f0'].mode()[0])
用均值填充
train1_data[f_feas] = train1_data[f_feas].fillna(train1_data[f_feas].median())
查看某变量取值范围
print(train_inte['sub_class'].unique())
print(train_data['class'].unique())
print(test_public['class'].unique())