工作中需要把PPOCRLabel标注格式转化成需要的xml格式。
遇到的问题:1、之前只用minidom读过xml,还没有用minidom写过xml。2、minidom写出的xml中,节点属性的顺序与代码中设置属性的顺序不一致。
minidom源码在我本地路径:D:\Anaconda3\Lib\xml\dom\minidom.py
参考:Python3.7写入xml文件 保持节点属性顺序不变
首先导入模块 from xml.dom.minidom import Document,然后按住Ctrl键并用鼠标点击minidom,进入源码
ctrl+F 搜索 a_names = sorted(attrs.keys())。然后注释掉
添加 a_names = attrs.keys() 不使用排序
下面是我的PPOCRLabel标注格式转化成需要的xml格式的代码。
# -*- coding : UTF-8 -*-
# @file : conver_json_icdar.py
# @Time : 2021/4/9 11:24
# @Author : wmz
import os
import json
import xml.dom.minidom as minidom
def json_2_icdar(js_path, ic_path):
    """Convert a PPOCRLabel label file into ICDAR-style text files.

    Every non-blank line of ``js_path`` is expected to hold an image name,
    a tab character, and a JSON list.  Each item of that list is a dict
    with a ``transcription`` string and a ``points`` list of ``[x, y]``
    pairs.  One text file per line is written below ``ic_path``; it is
    named after the image with its extension replaced by ``.txt`` and
    contains one ``x1,y1,...,xn,yn,transcription`` row per annotation.
    Coordinates are truncated to integers with ``int()``.

    Args:
        js_path: Path of the UTF-8 encoded label file to read.
        ic_path: Directory that receives the generated ``.txt`` files.
            Missing directories (including sub-directories taken from the
            image name) are created.

    Raises:
        IndexError: A non-blank line has no tab-separated second field.
        json.JSONDecodeError: The second field is not valid JSON.
        KeyError: An annotation lacks ``transcription`` or ``points``.
    """
    with open(js_path, 'r', encoding='utf-8') as label_file:
        for line in label_file:
            # Blank lines carry no annotation; skip them instead of crashing.
            if not line.strip():
                continue
            fields = line.split('\t')
            image_name = fields[0]
            annotations = json.loads(fields[1])

            # Swap only the real file extension.  A plain
            # str.replace('jpg', 'txt') would also rewrite directory names
            # containing "jpg" and would leave non-jpg names untouched.
            txt_name = os.path.splitext(image_name)[0] + '.txt'
            dst_file = os.path.join(ic_path, txt_name)
            out_dir = os.path.dirname(dst_file)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)

            rows = []
            for item in annotations:
                # Flatten [[x1, y1], [x2, y2], ...] into x1, y1, x2, y2, ...
                coords = [int(v) for point in item['points'] for v in point]
                rows.append(
                    ','.join(map(str, coords)) + ',' + item['transcription'] + '\n'
                )

            # The context manager closes the file even if writing fails.
            with open(dst_file, 'w', encoding='utf-8') as out_file:
                out_file.writelines(rows)
def json_2_xml(js_path, xml_path):
    """Convert a PPOCRLabel label file into one XML file per image.

    Every non-blank line of ``js_path`` is expected to hold an image name,
    a tab character, and a JSON list.  Each item of that list is a dict
    with a ``transcription`` string and a ``points`` list of ``[x, y]``
    pairs.  For each line an XML document is written below ``xml_path``,
    named after the image with its extension replaced by ``.xml``::

        <ImageInfo bModify="3">
            <LineInfo ptLTX=".." ptLTY=".." ptRTX=".." ptRTY=".."
                      ptRBX=".." ptRBY=".." ptLBX=".." ptLBY=".."
                      Chars="<transcription>" bModify="3"/>
        </ImageInfo>

    The first eight flattened coordinates are used, in the order they
    appear in ``points``; values are truncated to integers with ``int()``.

    Args:
        js_path: Path of the UTF-8 encoded label file to read.
        xml_path: Directory that receives the generated ``.xml`` files.
            Missing directories (including sub-directories taken from the
            image name) are created.

    Raises:
        IndexError: A non-blank line has no tab-separated second field, or
            an annotation has fewer than eight coordinate values.
        json.JSONDecodeError: The second field is not valid JSON.
        KeyError: An annotation lacks ``transcription`` or ``points``.
    """
    # Attribute names for the flattened coordinates, in the order they are set.
    corner_attrs = ('ptLTX', 'ptLTY', 'ptRTX', 'ptRTY',
                    'ptRBX', 'ptRBY', 'ptLBX', 'ptLBY')

    with open(js_path, 'r', encoding='utf-8') as label_file:
        for line in label_file:
            # Blank lines carry no annotation; skip them instead of crashing.
            if not line.strip():
                continue
            fields = line.split('\t')
            image_name = fields[0]
            annotations = json.loads(fields[1])

            # Build the DOM: every node is created through the document
            # object and then attached to its parent.
            dom = minidom.Document()
            root_node = dom.createElement('ImageInfo')
            dom.appendChild(root_node)
            root_node.setAttribute('bModify', '3')

            for item in annotations:
                # Flatten [[x1, y1], [x2, y2], ...] into x1, y1, x2, y2, ...
                coords = [int(v) for point in item['points'] for v in point]
                info_node = dom.createElement('LineInfo')
                root_node.appendChild(info_node)
                # NOTE: minidom writes attributes in insertion order on
                # Python 3.8+; older versions sort them by name on output.
                for index, attr in enumerate(corner_attrs):
                    info_node.setAttribute(attr, str(coords[index]))
                info_node.setAttribute('Chars', item['transcription'])
                info_node.setAttribute('bModify', '3')

            # Swap only the real file extension.  A plain
            # str.replace('jpg', 'xml') would also rewrite directory names
            # containing "jpg" and would leave non-jpg names untouched.
            xml_name = os.path.splitext(image_name)[0] + '.xml'
            dst_xml_file = os.path.join(xml_path, xml_name)
            out_dir = os.path.dirname(dst_xml_file)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)

            with open(dst_xml_file, 'w', encoding='UTF-8') as fh:
                dom.writexml(fh, indent='', addindent='\t', newl='\n',
                             encoding='UTF-8')
if __name__ == "__main__":
    # Script entry point: convert one PPOCRLabel cache file to XML.
    # The paths are hard-coded sample locations; an earlier run used
    # C:\Users\WT\Desktop\hkb-bz\Label.txt as the input instead.
    label_file = r"C:\Users\WT\Desktop\hkb\Cache.cach"
    output_dir = r"C:\Users\WT\Desktop"
    json_2_xml(js_path=label_file, xml_path=output_dir)