【自用代码】yolov5数据预处理代码:python批量修改xml某字段,Coco格式转VOC格式

情景1:使用labelimg标注时,标签名写错了,需要批量改名,代码如下:

'''
task:改变name字段由WuLongTea-sandeliwww 改为WuLongTea-sandeli
date:2022-4-15
author:epiphany
ref:https://blog.csdn.net/weixin_42384743/article/details/113982063
'''
import os
import os.path
import xml.dom.minidom
path="H:\\Acolab\\goods\\WuLongTea-sandeli\\annotations"  # directory holding the XML files (use \\ on Windows)
files=os.listdir(path)                                    # bare file names inside `path`
#print(files)

for xmlFile in files:
    # os.listdir returns bare names, so the path must be joined BEFORE the
    # isdir check; testing the bare name would be relative to the current
    # working directory and would never actually skip sub-directories.
    full_path = os.path.join(path, xmlFile)
    if not os.path.isdir(full_path):                       # skip sub-directories; parse plain files only

        print(xmlFile)
        # Open the document and parse it into a DOM tree.
        dom=xml.dom.minidom.parse(full_path)
        root=dom.documentElement
        #print(dom)

        # Rewrite the text of every <name> element to the corrected label.
        name = root.getElementsByTagName('name')
        for i in range(len(name)):
            print(name[i].firstChild.data)                 # old label, for the log
            name[i].firstChild.data='WuLongTea-sandeli'    # assign the corrected label
            print(name[i].firstChild.data)                 # new label, for the log
        # Write the modified DOM back over the original file; the context
        # manager guarantees the handle is closed even if writexml raises.
        with open(full_path,'w') as fh:
            dom.writexml(fh)
            print("ok!!!!")

情景2:公开数据集RPC官方给定的为COCO,需要改为VOC格式,才可以训练yolov5模型,代码如下:

'''
一个coco的json文件-->N个voc的xml
'''
import os
import json
import cv2
from lxml import etree
import xml.etree.cElementTree as ET
import time
import pandas as pd
from tqdm import tqdm
from xml.dom.minidom import Document
# anno:    path to the COCO-style annotation JSON of the RPC dataset
# xml_dir: output directory for the generated Pascal-VOC XML files
anno = "H:/GOODSDATASET/dataset/Annotations/instances_val2019.json"
xml_dir = "H:/GOODSDATASET/dataset/voc/annotations"
# dttm = time.strftime("%Y%m%d%H%M%S", time.localtime())
# if os.path.exists(xml_dir):
#     os.rename(xml_dir,xml_dir+dttm)
# os.mkdir(xml_dir)
import json  # NOTE(review): duplicate import — json is already imported above



# Load the whole COCO JSON once.  The annotations go into a DataFrame so they
# can be filtered per image id later; `imgs` keeps the raw image records.
with open(anno, 'r') as load_f:
    f = json.load(load_f)
df_anno = pd.DataFrame(f['annotations'])
imgs = f['images']
cata={}  # category_id -> category name, filled by createCate()
# The 200 RPC category names allowed in the output; annotations whose category
# name is not in this list are skipped by json2xml().
nameList=[ '1_puffed_food', '2_puffed_food', '3_puffed_food', '4_puffed_food', '5_puffed_food', '6_puffed_food', '7_puffed_food',
           '8_puffed_food', '9_puffed_food', '10_puffed_food', '11_puffed_food', '12_puffed_food', '13_dried_fruit', '14_dried_fruit', '15_dried_fruit',
           '16_dried_fruit', '17_dried_fruit', '18_dried_fruit', '19_dried_fruit', '20_dried_fruit', '21_dried_fruit', '22_dried_food', '23_dried_food',
           '24_dried_food', '25_dried_food', '26_dried_food', '27_dried_food', '28_dried_food', '29_dried_food', '30_dried_food', '31_instant_drink',
           '32_instant_drink', '33_instant_drink', '34_instant_drink', '35_instant_drink', '36_instant_drink', '37_instant_drink', '38_instant_drink',
           '39_instant_drink', '40_instant_drink', '41_instant_drink', '42_instant_noodles', '43_instant_noodles', '44_instant_noodles',
           '45_instant_noodles', '46_instant_noodles', '47_instant_noodles', '48_instant_noodles', '49_instant_noodles', '50_instant_noodles',
           '51_instant_noodles', '52_instant_noodles', '53_instant_noodles', '54_dessert', '55_dessert', '56_dessert', '57_dessert', '58_dessert',
           '59_dessert', '60_dessert', '61_dessert', '62_dessert', '63_dessert', '64_dessert', '65_dessert', '66_dessert', '67_dessert', '68_dessert',
           '69_dessert', '70_dessert', '71_drink', '72_drink', '73_drink', '74_drink', '75_drink', '76_drink', '77_drink', '78_drink', '79_alcohol',
           '80_alcohol', '81_drink', '82_drink', '83_drink', '84_drink', '85_drink', '86_drink', '87_drink', '88_alcohol', '89_alcohol', '90_alcohol',
           '91_alcohol', '92_alcohol', '93_alcohol', '94_alcohol', '95_alcohol', '96_alcohol', '97_milk', '98_milk', '99_milk', '100_milk', '101_milk',
           '102_milk', '103_milk', '104_milk', '105_milk', '106_milk', '107_milk', '108_canned_food', '109_canned_food', '110_canned_food',
           '111_canned_food', '112_canned_food', '113_canned_food', '114_canned_food', '115_canned_food', '116_canned_food', '117_canned_food',
           '118_canned_food', '119_canned_food', '120_canned_food', '121_canned_food', '122_chocolate', '123_chocolate', '124_chocolate', '125_chocolate',
           '126_chocolate', '127_chocolate', '128_chocolate', '129_chocolate', '130_chocolate', '131_chocolate', '132_chocolate', '133_chocolate', '134_gum',
           '135_gum', '136_gum', '137_gum', '138_gum', '139_gum', '140_gum', '141_gum', '142_candy', '143_candy', '144_candy', '145_candy', '146_candy',
           '147_candy', '148_candy', '149_candy', '150_candy', '151_candy', '152_seasoner', '153_seasoner', '154_seasoner', '155_seasoner', '156_seasoner',
           '157_seasoner', '158_seasoner', '159_seasoner', '160_seasoner', '161_seasoner', '162_seasoner', '163_seasoner', '164_personal_hygiene',
           '165_personal_hygiene', '166_personal_hygiene', '167_personal_hygiene', '168_personal_hygiene', '169_personal_hygiene', '170_personal_hygiene',
           '171_personal_hygiene', '172_personal_hygiene', '173_personal_hygiene', '174_tissue', '175_tissue', '176_tissue', '177_tissue', '178_tissue',
           '179_tissue', '180_tissue', '181_tissue', '182_tissue', '183_tissue', '184_tissue', '185_tissue', '186_tissue', '187_tissue', '188_tissue',
           '189_tissue', '190_tissue', '191_tissue', '192_tissue', '193_tissue', '194_stationery', '195_stationery', '196_stationery', '197_stationery',
           '198_stationery', '199_stationery', '200_stationery']
flag=0  # per-image marker: set to 1 when an image has at least one valid object, gating the XML write
def createCate():
    """Fill the global ``cata`` mapping (category_id -> category name).

    Reads the ``categories`` section of the loaded COCO JSON (global ``f``)
    and records one entry per category.
    """
    global cata
    for entry in f['categories']:
        cata[entry['id']] = entry['name']

def _append_text_element(doc, parent, tag, text):
    """Append ``<tag>text</tag>`` under ``parent`` and return the new element."""
    node = doc.createElement(tag)
    parent.appendChild(node)
    node.appendChild(doc.createTextNode(text))
    return node

def json2xml():
    """Convert the loaded COCO annotations into one Pascal-VOC XML per image.

    For every record in the global ``imgs`` list, its annotations are selected
    from the global ``df_anno`` DataFrame by ``image_id`` and assembled into a
    VOC ``<annotation>`` document: ``filename``, ``size`` (width/height) and
    one ``<object>`` per bounding box.  Annotations whose category name (from
    the global ``cata`` map) is not in ``nameList`` are skipped; if an image
    ends up with no valid object, no file is written for it.

    COCO bboxes are ``[x, y, w, h]``; VOC wants corner coordinates, so
    xmax/ymax are derived as x+w and y+h (truncated to int).
    """
    global cata
    global flag
    for im in imgs:
        flag = 0  # becomes 1 once this image has at least one valid object
        filename = im['file_name']
        height = im['height']
        img_id = im['id']
        width = im['width']

        doc = Document()
        annotation = doc.createElement('annotation')
        doc.appendChild(annotation)
        _append_text_element(doc, annotation, "filename", filename)

        size = doc.createElement("size")
        annotation.appendChild(size)
        _append_text_element(doc, size, "width", str(width))
        _append_text_element(doc, size, "height", str(height))

        # All COCO annotations belonging to this image.
        annos = df_anno[df_anno["image_id"].isin([img_id])]
        for index, row in annos.iterrows():
            bbox = row["bbox"]                 # COCO format: [x, y, w, h]
            category_id = row["category_id"]
            cate_name = cata[category_id]
            if cate_name not in nameList:
                print(cate_name+",don`t in namelis")
                continue
            flag = 1

            # `obj` instead of the original `object`, which shadowed the builtin.
            obj = doc.createElement('object')
            annotation.appendChild(obj)
            _append_text_element(doc, obj, 'name', cate_name)
            _append_text_element(doc, obj, 'pose', 'Unspecified')
            _append_text_element(doc, obj, 'truncated', '0')
            _append_text_element(doc, obj, 'difficult', '0')

            bndbox = doc.createElement('bndbox')
            obj.appendChild(bndbox)
            _append_text_element(doc, bndbox, 'xmin', str(int(bbox[0])))
            _append_text_element(doc, bndbox, 'ymin', str(int(bbox[1])))
            _append_text_element(doc, bndbox, 'xmax', str(int(bbox[0]+bbox[2])))
            _append_text_element(doc, bndbox, 'ymax', str(int(bbox[1]+bbox[3])))

        if flag==1:
            # splitext handles any image extension (the original
            # str.replace('.jpg', ...) only worked for .jpg names).
            xml_path = os.path.join(xml_dir, os.path.splitext(filename)[0] + '.xml')
            # Context manager closes the handle even if the write raises; it
            # also avoids reusing the name `f`, which is the module-level
            # JSON dict.
            with open(xml_path, "w") as out:
                out.write(doc.toprettyxml(indent="  "))

# Driver: build the category-id -> name lookup first, then emit one VOC XML
# file per image.
createCate()

json2xml()
#print('imagenum:',imageSum)
#print(nameNum)

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值