在重写 XML 标注文件中的 label 时,如果写回文件时没有指定编码,往往会出现编码错误 “Invalid byte 2 of 2-byte UTF-8 sequence”。因此在写入 label 的同时要显式指定编码(encoding='utf-8'),脚本如下:
import os
import xml.etree.ElementTree as ET
def changelabelname(inputpath, old_label='你的label', new_label='你的新label'):
    """Batch-rename an object label in the VOC-style XML files of a folder.

    Every entry of ``inputpath`` whose name ends with ``.xml`` is parsed.
    Each ``<object>/<name>`` element whose text equals ``old_label`` is set
    to ``new_label``.  A file is rewritten in place only when at least one
    label actually changed.

    Args:
        inputpath: Directory containing the XML annotation files.
        old_label: Label text to search for.  Defaults to the placeholder
            used by the original script; pass your real label.
        new_label: Replacement label text.  Defaults to the placeholder
            used by the original script.
    """
    for entry in os.listdir(inputpath):
        # Require the dot so names such as "fooxml" are not parsed.
        if not entry.endswith('.xml'):
            continue
        xml_path = os.path.join(inputpath, entry)
        tree = ET.parse(xml_path)
        changed = False
        for obj in tree.getroot().findall('object'):
            for name_node in obj.findall('name'):
                if name_node.text == old_label:
                    name_node.text = new_label
                    changed = True
        if changed:
            # Write once per file (not once per matching label) and state
            # the encoding explicitly so non-ASCII labels are stored as
            # UTF-8 text instead of being garbled.
            tree.write(xml_path, encoding='utf-8')
if __name__ == '__main__':
    # Replace this with the path to your own annotation folder.
    inputpath = 'C:/Users/201901/Desktop/anotations'
    changelabelname(inputpath)
改好之后,如果训练时遇到除数为零的错误(“division by zero …”),有可能是图片的序号不规范,这时就需要对数据重新编号。编号时要注意 jpg 与 xml 必须一一对应,脚本如下:
import numpy as np
import glob
import os
import xml.etree.ElementTree as ET
import xml.dom.minidom
'''
第一步,将xml文件和图片重新命名
'''
def getDirImageNum(path):
    """Return how many entries in ``path`` have the exact extension ``.jpg``.

    The comparison is case-sensitive, so ``.JPG`` files are not counted.

    Args:
        path: Directory to scan (non-recursive).

    Returns:
        The number of directory entries whose extension is ``.jpg``.
    """
    return sum(
        1 for name in os.listdir(path)
        if os.path.splitext(name)[1] == '.jpg'
    )
def getDirXmlNum(path):
    """Return how many entries in ``path`` have the exact extension ``.xml``.

    The comparison is case-sensitive, so ``.XML`` files are not counted.

    Args:
        path: Directory to scan (non-recursive).

    Returns:
        The number of directory entries whose extension is ``.xml``.
    """
    return sum(
        1 for name in os.listdir(path)
        if os.path.splitext(name)[1] == '.xml'
    )
import shutil  # os.rename cannot move files across drives (C: -> D:)

# Source folders (images / annotations) and destination folders.
# Trailing slashes are optional: every path below is built with os.path.join.
inputpath1 = "C:/Users/lindsay/Desktop/datasets/images"
inputpath2 = 'C:/Users/lindsay/Desktop/datasets/annotations'
outpath1 = "D:/datasets/img"
outpath2 = "D:/datasets/anno"

# The output folders are listed and written to below, so they must exist.
for out_dir in (outpath1, outpath2):
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

file_name = os.listdir(inputpath2)
error = []  # annotation files that could not be converted
for item in file_name:
    print(item)
    # splitext keeps dots inside the name ("img.v2.xml" -> "img.v2").
    stem = os.path.splitext(item)[0]
    o_imap = os.path.join(inputpath1, stem + ".jpg")
    o_xmlp = os.path.join(inputpath2, stem + ".xml")
    # Only renumber complete image/annotation pairs.
    if not (os.path.exists(o_imap) and os.path.exists(o_xmlp)):
        continue

    # Numbering continues from the jpg files already in the output folder.
    i = getDirImageNum(outpath1) + 1
    base = format(i, '06d')  # zero-padded to six digits, e.g. 000001
    new_name = base + '.jpg'
    dst1 = os.path.join(os.path.abspath(outpath1), new_name)
    dst2 = os.path.join(os.path.abspath(outpath2), base + '.xml')

    # Convert the annotation first: if it is broken, the image stays where
    # it is, so image and annotation never get out of sync.
    try:
        dom = xml.dom.minidom.parse(o_xmlp)
        root = dom.documentElement
        # Overwrite the <folder> text.
        root.getElementsByTagName('folder')[0].firstChild.data = "VOC2007"
        # Point <filename> at the renamed image.
        root.getElementsByTagName('filename')[0].firstChild.data = new_name
        # Write the updated annotation straight to its new location, as
        # UTF-8 regardless of the platform's default encoding.
        with open(dst2, 'w', encoding='utf-8') as fh:
            dom.writexml(fh, encoding='utf-8')
    except Exception as exc:
        # Best effort: report the file, remember it, and keep going.
        print('failed to convert %s: %s' % (o_xmlp, exc))
        error.append(item)
        continue

    shutil.move(o_imap, dst1)
    os.remove(o_xmlp)
    print('converting %s to %s ...' % (o_xmlp, dst2))

# Number of annotation files that failed to convert.
print(len(error))
注意:如果脚本是用字符串直接拼接路径(例如 inputpath1 + 文件名),输入、输出文件夹千万不能像下面这样写(路径末尾缺少斜杠):
inputpath1 = "C:/Users/lindsay/Desktop/datasets/images"
inputpath2 = 'C:/Users/lindsay/Desktop/datasets/annotations'
outpath1 = "D:/datasets/img"
outpath2 = "D:/datasets/anno"
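否则代码不会报错,但也不会处理任何文件,只是文件夹被刷新了一下:因为拼接出来的路径变成了 “…/images文件名.jpg”,os.path.exists 永远返回 False。解决办法是在路径末尾加一个斜杠 “/”(注意是正斜杠,不是反斜杠);输出路径是用 os.path.join 拼接的,末尾的斜杠可以不加: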
inputpath1 = "C:/Users/lindsay/Desktop/datasets/images/"
inputpath2 = 'C:/Users/lindsay/Desktop/datasets/annotations/'
outpath1 = "D:/datasets/img/"
outpath2 = "D:/datasets/anno/"
一起开启新世界的大门吧😊