文章目录
文件处理
重命名的处理
taco数据集中多个batch文件夹(每个batch里面存储的是图片)合并,并修改对应json文件的值
import os

# Merge the TACO dataset's batch folders (each holds images): prefix every
# image file with its batch folder name so the names stay unique after the
# folders are merged, then the matching json entries can be updated.
base_path = r'/Project/dataset/taco'
batch_directories = os.listdir(base_path)
print(batch_directories)

# For each batch folder, rename every file inside it.
for batch_directory in batch_directories:
    batch_path = os.path.join(base_path, batch_directory)
    for filename in os.listdir(batch_path):
        # e.g. batch1/000001.jpg -> batch1/batch1_000001.jpg
        new_filename = f"{batch_directory}_{filename}"
        old_path = os.path.join(batch_path, filename)
        new_path = os.path.join(batch_path, new_filename)
        os.rename(old_path, new_path)
json文件
查询
当前垃圾检测在测试集(coco_garbage)中把cup错检为orange,目的是测试是否存在标注错误。
在json文件的annotations这一key中查询category_id为2(即为cup)并找到对应唯一标识的image_id,在images中找到对应id的图片名称,结果表明未标注错误。
其中需要注意json文件排序的list从0开始。
原因:训练数据量及其多样性不足,train仅有400张,cup类81张,orange类77张。过拟合(在训练集上测试)
import json

# Check whether cup (category_id 2) images were mislabeled: collect every
# annotation with category_id == 2, take its image_id, and print the matching
# file name from the images list.
with open('/data/usrname/PaddleDetection/dataset/coco_garbage/annotations/train.json') as f:
    superHeroSquad = json.load(f)
print(type(superHeroSquad))  # Output: dict
print(superHeroSquad['annotations'][1].keys())

# 1. anno category_id == 2 (cup) -> collect image_id -> inspect the images.
# (renamed from `list`/`id` to avoid shadowing the builtins)
image_ids = []
for anno in superHeroSquad['annotations']:
    if anno['category_id'] == 2:
        image_ids.append(anno['image_id'])
        # print(anno['image_id'])

for image_id in image_ids:
    print(image_id)
    # Python lists are 0-based, hence the - 1. NOTE: this assumes the images
    # list is stored in id order, which is not guaranteed (see below).
    print(superHeroSquad['images'][image_id - 1]['file_name'])

# The list position and the real id often do NOT line up one-to-one, so the
# robust lookup searches for the entry whose 'id' field matches:
num = 0
for image_id in image_ids:
    # print(image_id)
    for idx, image in enumerate(superHeroSquad['images']):
        if image['id'] == image_id:
            print(superHeroSquad['images'][idx]['file_name'])
            num += 1
# num will not equal the image count: one image can carry several annotations.
print(f'num:{num}')
合并
json文件的合并需要根据具体情况具体处理,下列列举的为一个coco格式的文件处理。
内容参考:合并多个coco格式数据的json标注文件
同一目录下处理
'''
Concatenate several COCO-format jsons (a single COCO json may contain many
images and annotations).
Only a single category set is supported.
Image ids and bbox ids are automatically renumbered in sequence.
'''
import json
import os
import shutil
from pathlib import Path


def merge_coco_datasets(datasets, categories):
    """Merge already-loaded COCO dicts into one, renumbering ids.

    datasets:   iterable of dicts each holding 'images' and 'annotations'
    categories: shared categories list placed into the result unchanged
    returns:    {"images": [...], "annotations": [...], "categories": categories}
                with image ids and annotation ids renumbered from 0 and each
                annotation's image_id pointing at the renumbered image.
    The input dicts are not modified (entries are copied before editing).
    """
    total_data = {"images": [], "annotations": [], "categories": categories}
    img_id_count, bbox_id_count = 0, 0
    for js_data in datasets:
        # Group annotations by their original image_id once, instead of
        # rescanning the whole annotation list for every image (was O(I*A)).
        annos_by_image = {}
        for anno in js_data['annotations']:
            annos_by_image.setdefault(anno['image_id'], []).append(anno)
        for image in js_data['images']:
            image_new = image.copy()
            origin_img_id = image['id']
            # New sequential id; the file_name is kept as-is.
            image_new['id'] = img_id_count
            total_data['images'].append(image_new)
            for anno in annos_by_image.get(origin_img_id, []):
                anno_new = anno.copy()
                anno_new['id'] = bbox_id_count
                anno_new['image_id'] = img_id_count
                total_data['annotations'].append(anno_new)
                bbox_id_count += 1
            img_id_count += 1
    return total_data


if __name__ == '__main__':
    # Directory holding the json files to merge.
    json_root = Path('/data/usrname/PaddleDetection/dataset/combine_data/coco_garbage_AND_bottle_AND_LaJiJianCheVoc/zanshi')
    output_file = r'/data/usrname/PaddleDetection/dataset/combine_data/coco_garbage_AND_bottle_AND_LaJiJianCheVoc/output.json'
    # images_path = r'/storage/user/yanyan/dataset_trash/coco_garbage/images'
    # Shared category list for the merged file.
    categories = [
        {"supercategory": "none", "id": 1, "name": "battery"},
        {"supercategory": "none", "id": 2, "name": "cup"},
        {"supercategory": "none", "id": 3, "name": "bottle"},
        {"supercategory": "none", "id": 4, "name": "paper"},       # paper ball
        {"supercategory": "none", "id": 5, "name": "fruit_peel"},  # renamed; images renamed too
        {"supercategory": "none", "id": 6, "name": "plastic_bag"}, # new class
        {"supercategory": "none", "id": 7, "name": "cigarette_end"},
        {"supercategory": "none", "id": 8, "name": "can"},
    ]
    # Collect the json files in the directory.
    file_list = [str(i) for i in json_root.iterdir() if i.suffix == '.json']
    print(file_list)
    datasets = []
    for js_file in file_list:
        print(js_file)
        with open(js_file, 'r') as f:
            datasets.append(json.load(f))
    total_data = merge_coco_datasets(datasets, categories)
    with open(output_file, 'w') as f:
        json.dump(total_data, f)
针对每一个json文件处理
'''
coco_garbage has 5 classes.
The bottle dataset has 4 classes, all of which should be collapsed into the
single class "bottle" in the combined dataset.
'''
import json
import os


def rename_images_file(origin_file_name, bottle_number):
    """Rename origin_file_name inside bottle_images to 'bottle_<n>.jpg'.

    Does nothing when the file is not present in the directory.
    """
    filelist = os.listdir(bottle_images)
    if origin_file_name in filelist:
        new_filename = f'bottle_{bottle_number}.jpg'
        old_path = os.path.join(bottle_images, origin_file_name)
        new_path = os.path.join(bottle_images, new_filename)
        os.rename(old_path, new_path)


bottle_images = r'/data/usrname/PaddleDetection/dataset/PlasticPaperGarbageBagSyntheticImages/bottle_train2017'
file_coco = r'/storage/user/usrname/dataset_trash/coco_garbage/annotations/output.json'
file_bottle = r'/data/usrname/PaddleDetection/dataset/PlasticPaperGarbageBagSyntheticImages/bottle_anno/instances_train2017.json'
output_file = r'/data/usrname/PaddleDetection/dataset/combine_data/output.json'

# Plan:
# - images: copied directly (id, width, height, file_name), ids renumbered
# - annotations: copied; bottle-dataset entries get image_id remapped and
#   category_id forced to 3
# - categories: keep coco_garbage's 5 classes unchanged
# - the bottle image files themselves are renamed on disk to match
categories = [
    {"supercategory": "none", "id": 1, "name": "battery"},
    {"supercategory": "none", "id": 2, "name": "cup"},
    {"supercategory": "none", "id": 3, "name": "bottle"},
    {"supercategory": "none", "id": 4, "name": "paper"},
    {"supercategory": "none", "id": 5, "name": "orange"}
]
total_data = {"images": [], "annotations": [], "categories": categories}

# Renumber ids sequentially across both source files.
img_id_count, bbox_id_count = 0, 0

with open(file_coco, 'r') as f:
    js_coco = json.load(f)
images = js_coco['images']
annotations = js_coco['annotations']
# Group annotations by image_id once instead of rescanning the whole
# annotation list per image (was O(images * annotations)).
annos_by_image = {}
for anno in annotations:
    annos_by_image.setdefault(anno['image_id'], []).append(anno)
for image in images:
    image_new = image.copy()
    origin_img_id = image['id']
    image_new['id'] = img_id_count
    total_data['images'].append(image_new)
    for anno in annos_by_image.get(origin_img_id, []):
        anno_new = anno.copy()
        anno_new['id'] = bbox_id_count
        anno_new['image_id'] = img_id_count
        total_data['annotations'].append(anno_new)
        bbox_id_count += 1
    img_id_count += 1
print(img_id_count)
print(bbox_id_count)

# Process the second (bottle) dataset.
with open(file_bottle, 'r') as f2:
    js_bottle = json.load(f2)
images = js_bottle['images']
annotations = js_bottle['annotations']
annos_by_image = {}
for anno in annotations:
    annos_by_image.setdefault(anno['image_id'], []).append(anno)
bottle_number = 100  # bottle image names start at bottle_100.jpg
for image in images:
    image_new = image.copy()
    origin_img_id = image['id']
    image_new['id'] = img_id_count
    # Rename the image both on disk and in the json.
    origin_file_name = image['file_name']
    rename_images_file(origin_file_name, bottle_number)
    image_new['file_name'] = f'bottle_{bottle_number}.jpg'
    bottle_number += 1
    total_data['images'].append(image_new)
    for anno in annos_by_image.get(origin_img_id, []):
        anno_new = anno.copy()
        anno_new['id'] = bbox_id_count
        anno_new['image_id'] = img_id_count
        # Every bottle-dataset class collapses into category 3 (bottle).
        anno_new['category_id'] = 3
        total_data['annotations'].append(anno_new)
        bbox_id_count += 1
    img_id_count += 1

with open(output_file, 'w') as f:
    json.dump(total_data, f)
VOC格式(.xml)转COCO格式(.json)
VOC格式的数据为每张图片对应一个.xml文件记录标注信息,现根据其信息转化为COCO格式的json文件。
'''
Convert VOC-style annotations (one .xml per image) into a single
COCO-format json file.
'''
import os
import xml.etree.ElementTree as ET
import json


def voc_root_to_coco(root, image_id, ann_id_start, class_names):
    """Convert one parsed VOC annotation tree to COCO entries.

    root:         ElementTree root of a VOC annotation file
    image_id:     COCO id to assign to this image
    ann_id_start: id for the first annotation produced here (sequential after)
    class_names:  ordered class list; category_id is 1-based index therein
    returns:      (image_dict, list_of_annotation_dicts)
    raises:       ValueError if an <object> name is not in class_names
    """
    image = {
        "id": image_id,
        "file_name": root.find("filename").text,
        "width": int(root.find("size/width").text),
        "height": int(root.find("size/height").text),
        "date_captured": "",
        "license": 0,
    }
    annotations = []
    for obj in root.findall("object"):
        category_id = class_names.index(obj.find("name").text) + 1
        bndbox = obj.find("bndbox")
        xmin = int(bndbox.find("xmin").text)
        ymin = int(bndbox.find("ymin").text)
        xmax = int(bndbox.find("xmax").text)
        ymax = int(bndbox.find("ymax").text)
        annotations.append({
            "id": ann_id_start + len(annotations),
            "image_id": image_id,
            "category_id": category_id,
            "segmentation": [],
            "area": (xmax - xmin) * (ymax - ymin),
            # COCO bbox format: [x, y, width, height]
            "bbox": [xmin, ymin, xmax - xmin, ymax - ymin],
            "iscrowd": 0,
        })
    return image, annotations


if __name__ == '__main__':
    # Root directory of the VOC dataset.
    voc_root = '/data/usrname/PaddleDetection/dataset/LaJiJianCheVoc'
    # Output path for the COCO-format json.
    coco_output_file = '/data/usrname/PaddleDetection/dataset/LaJiJianCheVoc/LaJiJianCheVocCOCO.json'
    # Replace with the VOC dataset's class names; ids are 1-based positions.
    voc_classes = ['ChaHe', 'ShuLiaoDai', 'YanTou', 'KuaiZhi', 'ShuiGuoPi',
                   'HuaZhuangPingPin', 'YiLaGuan', 'ShuLiaoPin', 'GanDianChi']
    coco_data = {
        "images": [],
        "annotations": [],
        "categories": [
            {"id": i + 1, "name": voc_class, "supercategory": ""}
            for i, voc_class in enumerate(voc_classes)
        ],
    }
    xml_folder = os.path.join(voc_root, 'annotations')
    for xml_filename in os.listdir(xml_folder):
        if not xml_filename.endswith('.xml'):
            continue
        tree = ET.parse(os.path.join(xml_folder, xml_filename))
        image_id = len(coco_data["images"]) + 1
        image, annotations = voc_root_to_coco(
            tree.getroot(), image_id,
            len(coco_data["annotations"]) + 1, voc_classes)
        coco_data["images"].append(image)
        coco_data["annotations"].extend(annotations)
    # Save the COCO-format json.
    with open(coco_output_file, 'w') as output:
        json.dump(coco_data, output)
修改图片名称及同时更改json文件
'''
1. Rename the image files on disk.
2. Update the matching file_name entries in the json's images list.
'''
import os
import json

file_path = r'/data/usrname/PaddleDetection/dataset/combine_data/coco_garbage_AND_bottle_AND_LaJiJianCheVoc/images'
json_file_path = r'/data/usrname/PaddleDetection/dataset/combine_data/coco_garbage_AND_bottle_AND_LaJiJianCheVoc/output.json'
output_file = r'/data/usrname/PaddleDetection/dataset/combine_data/coco_garbage_AND_bottle_AND_LaJiJianCheVoc/output2.json'

with open(json_file_path, 'r') as json_file:
    data = json.load(json_file)

# Starting number for each renamed class (category id in the comment).
category_counts = {
    'plastic_bag_number': 0,       # 6
    'cigarette_end_number': 0,     # 7
    'fruit_peel_number': 100,      # 5
    'can_number': 0,               # 8
    'bottle_number': 1300,         # 3
    'battery_number': 100          # 1
}


def rename_images_file(origin_file_name, name):
    """Rename origin_file_name to name inside file_path (if present there)."""
    filelist = os.listdir(file_path)
    if origin_file_name in filelist:
        old_path = os.path.join(file_path, origin_file_name)
        new_path = os.path.join(file_path, name)
        os.rename(old_path, new_path)


# category_id -> (file-name prefix, counter key in category_counts);
# replaces the original if/elif chain. Unknown categories are skipped.
rename_rules = {
    6: ('plastic_bag', 'plastic_bag_number'),
    7: ('cigarette_end', 'cigarette_end_number'),
    5: ('fruit_peel', 'fruit_peel_number'),
    8: ('can', 'can_number'),
    3: ('bottle', 'bottle_number'),
    1: ('battery', 'battery_number'),
}

# Index images by id once instead of rescanning the whole image list for
# every annotation (image ids are assumed unique, as in valid COCO).
images = data.get('images', [])
image_idx_by_id = {image.get('id'): i for i, image in enumerate(images)}

for anno in data.get('annotations', []):
    img_idx = image_idx_by_id.get(anno.get('image_id'))
    if img_idx is None:
        continue
    image_info = images[img_idx]
    # Only files still carrying their original 'img...' name get renamed;
    # after the first rename, later annotations of the same image are skipped.
    original_filename = image_info.get('file_name')
    if not original_filename.startswith('img'):
        continue
    rule = rename_rules.get(anno.get('category_id'))
    if rule is None:
        continue  # unknown category
    prefix, counter_key = rule
    new_filename = f'{prefix}_{category_counts[counter_key]}.jpg'
    category_counts[counter_key] += 1
    # Update the json entry and the file on disk together.
    print(f'Matching: {original_filename} -> {new_filename}')
    data['images'][img_idx]['file_name'] = new_filename
    rename_images_file(original_filename, new_filename)

with open(output_file, 'w') as f:
    json.dump(data, f)
pandas库
查看所有特征值
train1_data.columns
查看缺失值
用isnull方法来查看空值,得到的结果是布尔值。
df_list.isnull()
较大数据集用info()方法
df_list.info()
#计数
df_list.isnull().sum()
#单独某列查看空值
df_list["name"].isnull().sum()
缺失值删除
只要一行中有一个缺失值这一行就全都删除
df_list.dropna()
#整行为空的前提下才删除
df_list.dropna(how = "all")
缺失值填充 用众数/均值
众数可能存在多个,pandas返回的是一个series,默认取第一个众数。
import pandas as pd

# Fill NaNs in column f0 with its mode. mode() may return several values
# (pandas gives back a Series); [0] takes the first. Assign back instead of
# calling `train1_data.f0.fillna(..., inplace=True)`: inplace on a column
# selected from the frame is chained assignment, may act on a copy, and is
# deprecated in recent pandas.
train1_data['f0'] = train1_data['f0'].fillna(train1_data['f0'].mode()[0])
用均值填充
train1_data[f_feas] = train1_data[f_feas].fillna(train1_data[f_feas].median())
查看某变量取值范围
print(train_inte['sub_class'].unique())
print(train_data['class'].unique())
print(test_public['class'].unique())