一、背景
上文工作将xml另存为jpg文件后,
利用draw.io将xml转化为JPG_xml怎么变图片-CSDN博客
算是得到了数据,那么后面我们如何得到准确的标签呢?本文将总结如何将JPG流程图与xml的数据进行准确映射,最后得到数据标签存储在json文件中。
二、思路
从xml中提取element、edge等关键信息并存储在json文件中;再利用这些数据,通过PIL库在得到的jpg图片上重新绘制,观察偏移和箭头等信息,据此调整偏移量和其他参数。
三、方法
xml_to_json
# -*- coding:utf-8 -*-
import os
import json
import xml.etree.ElementTree as ET
from tqdm import tqdm
# Input directory (scanned for .xml files) and output directory (receives .json files).
# Both paths are left blank in the article and must be filled in before running.
base_dir = r''
output_dir = r''
# Create the output directory up front if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)
def parse_xml(xml_path):
    """Extract vertices, edges and the overall bounding box from an mxGraph XML file.

    Args:
        xml_path: Source of the XML document (anything ``ET.parse`` accepts).

    Returns:
        A dict with three keys:

        * ``elements`` -- one dict per ``mxCell`` with ``vertex="1"``, holding
          ``id``, ``value``, ``style`` and, when the cell has an
          ``mxGeometry`` child, ``x``, ``y``, ``width`` and ``height``.
        * ``edges`` -- one dict per ``mxCell`` with ``edge="1"``, holding
          ``id``, ``value``, ``style``, ``source``, ``target`` and, when the
          cell has an ``mxGeometry`` child, ``points`` (the ``mxPoint``
          entries found under ``Array[@as="points"]``).
        * ``max_rectangle`` -- ``min_x``, ``min_y``, ``max_x``, ``max_y``,
          ``width`` and ``height`` of the box enclosing every coordinate
          seen. All six values are ``0.0`` when the file has no coordinates.
    """
    root = ET.parse(xml_path).getroot()

    min_x = min_y = float('inf')
    max_x = max_y = float('-inf')

    def update_bounds(x, y):
        """Grow the running bounding box so that it contains (x, y)."""
        nonlocal min_x, min_y, max_x, max_y
        min_x = min(min_x, x)
        min_y = min(min_y, y)
        max_x = max(max_x, x)
        max_y = max(max_y, y)

    def read_box(geometry):
        """Return (x, y, width, height) of an mxGeometry and record its corners."""
        x = float(geometry.get('x', 0))
        y = float(geometry.get('y', 0))
        width = float(geometry.get('width', 0))
        height = float(geometry.get('height', 0))
        update_bounds(x, y)
        update_bounds(x + width, y + height)
        return x, y, width, height

    # Vertices (nodes).
    elements = []
    for cell in root.findall(".//mxCell[@vertex='1']"):
        element = {'id': cell.get('id'), 'value': cell.get('value')}
        geometry = cell.find('mxGeometry')
        if geometry is not None:
            x, y, width, height = read_box(geometry)
            element['x'] = x
            element['y'] = y
            element['width'] = width
            element['height'] = height
        element['style'] = cell.get('style')
        elements.append(element)

    # Edges (connectors) and their explicit way-points.
    edges = []
    for edge in root.findall(".//mxCell[@edge='1']"):
        edge_element = {
            'id': edge.get('id'),
            'value': edge.get('value'),
            'style': edge.get('style'),
            'source': edge.get('source'),
            'target': edge.get('target'),
        }
        geometry = edge.find('mxGeometry')
        if geometry is not None:
            points = []
            for point in geometry.findall('Array[@as="points"]/mxPoint'):
                x = float(point.get('x', 0))
                y = float(point.get('y', 0))
                points.append({'x': x, 'y': y})
                update_bounds(x, y)
            edge_element['points'] = points
        edges.append(edge_element)

    # Cells that are neither vertex nor edge only contribute to the bounds.
    for cell in root.findall(".//mxCell"):
        if cell.get('vertex') == '1' or cell.get('edge') == '1':
            continue
        geometry = cell.find('mxGeometry')
        if geometry is not None:
            read_box(geometry)

    if min_x > max_x:
        # No coordinate was seen at all. Leaving +/-inf here would make
        # json.dump emit the non-standard tokens Infinity / -Infinity.
        min_x = min_y = max_x = max_y = 0.0

    max_rectangle = {
        'min_x': min_x,
        'min_y': min_y,
        'max_x': max_x,
        'max_y': max_y,
        'width': max_x - min_x,
        'height': max_y - min_y,
    }
    return {'elements': elements, 'edges': edges, 'max_rectangle': max_rectangle}
# Walk the input directory in reverse listing order; every *.xml entry is
# parsed and written out as a same-named *.json file in the output directory.
for name in tqdm(list(reversed(os.listdir(base_dir)))):
    if name.endswith('.xml'):
        source_path = os.path.join(base_dir, name)
        # Drop the 4-character ".xml" suffix before appending ".json".
        target_path = os.path.join(output_dir, f'{name[:-4]}.json')
        parsed = parse_xml(source_path)
        with open(target_path, mode='w', encoding='utf-8') as out:
            json.dump(parsed, out, ensure_ascii=False, indent=4)
        print(f'Converted {source_path} to {target_path}')
match_json_xml
得到初始的json文件后,要判断它能否作为标签,需要利用PIL将其与JPG进行匹配,观察匹配度。
黑色是原图,红色是element中的参数,蓝色是edge中的参数。
# -*- coding=utf-8 -*-
import json
import os
from PIL import Image, ImageDraw, ImageFont
from tqdm import tqdm
def load_json(json_file_path):
    """Read the UTF-8 encoded JSON file at *json_file_path* and return its decoded content."""
    with open(json_file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)
def parse_style(style):
    """Parse a ``key=value;key=value`` style string into a dict.

    Args:
        style: The style string, or ``None`` / ``""`` when there is no style.

    Returns:
        A dict mapping each key to ``float(value)`` when the value converts
        to a float, otherwise to the raw string. Items without ``=`` are
        ignored. An empty dict is returned for a missing style.
    """
    style_dict = {}
    if not style:
        return style_dict
    for item in style.split(';'):
        # Split on the first '=' only, so values that contain '=' themselves
        # (URLs with query strings, base64 padding) are kept whole.
        key, separator, value = item.partition('=')
        if not separator:
            continue
        try:
            style_dict[key] = float(value)
        except ValueError:
            style_dict[key] = value  # keep as string if it is not numeric
    return style_dict
def get_edge_points(source, target, style, max_rectangle):
    """Compute where an edge leaves its source and enters its target.

    Both boxes are first shifted by ``max_rectangle['min_x']`` /
    ``max_rectangle['min_y']``. The anchor inside each box is given as a
    fraction of its width and height: ``exitX`` / ``exitY`` for the source,
    ``entryX`` / ``entryY`` for the target, each defaulting to 0.5.

    Returns:
        The tuple ``(exit_x, exit_y, entry_x, entry_y)``.
    """
    offset_x = max_rectangle['min_x']
    offset_y = max_rectangle['min_y']

    def anchor(box, x_key, y_key):
        # Shift the box origin, then move by the fractional anchor position.
        left = box['x'] - offset_x
        top = box['y'] - offset_y
        return (
            left + style.get(x_key, 0.5) * box['width'],
            top + style.get(y_key, 0.5) * box['height'],
        )

    exit_x, exit_y = anchor(source, 'exitX', 'exitY')
    entry_x, entry_y = anchor(target, 'entryX', 'entryY')
    return exit_x, exit_y, entry_x, entry_y
def validate_elements(json_data, image_path, output_path):
    """Overlay the JSON annotations on the image and save the result.

    Every element with geometry is drawn as a red rectangle with its value as
    red text. Every edge whose source and target both resolve to such an
    element is drawn as a blue straight line between the anchor points
    returned by ``get_edge_points``, with its value as blue text at the
    midpoint. All coordinates are shifted by the ``max_rectangle`` minimum.

    Args:
        json_data: Dict with ``elements``, ``edges`` and ``max_rectangle``.
        image_path: Image to draw on.
        output_path: Where the annotated image is saved. Its parent directory
            is created when missing.
    """
    elements = json_data['elements']
    edges = json_data['edges']
    max_rectangle = json_data['max_rectangle']
    # NOTE(review): the default font may not cover non-Latin labels -- confirm
    # against the installed Pillow version.
    font = ImageFont.load_default()

    # Elements saved without geometry can neither be drawn nor used as an
    # edge endpoint. Skipping them avoids a KeyError on 'x' / 'y'.
    geometry_keys = ('x', 'y', 'width', 'height')
    drawable = [el for el in elements if all(key in el for key in geometry_keys)]

    # Index by id once instead of scanning the list for every edge.
    # setdefault keeps the first element when ids repeat.
    by_id = {}
    for element in drawable:
        by_id.setdefault(element['id'], element)

    # The context manager closes the underlying file handle when done.
    with Image.open(image_path) as image:
        draw = ImageDraw.Draw(image)

        for element in drawable:
            x = element['x'] - max_rectangle['min_x']
            y = element['y'] - max_rectangle['min_y']
            width = element['width']
            height = element['height']
            draw.rectangle([x, y, x + width, y + height], outline="red", width=2)
            if element['value'] is not None:
                draw.text((x + 2, y + 2), element['value'], fill="red", font=font)

        for edge in edges:
            source = by_id.get(edge['source'])
            target = by_id.get(edge['target'])
            if source is None or target is None:
                continue
            # An edge may have no style attribute at all (stored as None).
            style = parse_style(edge['style'] or '')
            start_x, start_y, end_x, end_y = get_edge_points(source, target, style, max_rectangle)
            draw.line([start_x, start_y, end_x, end_y], fill="blue", width=2)
            if edge['value'] is not None:
                midpoint = ((start_x + end_x) / 2, (start_y + end_y) / 2)
                draw.text(midpoint, edge['value'], fill="blue", font=font)

        # os.makedirs('') raises, so only create a directory when the output
        # path actually has a directory component.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # Save the image with annotations
        image.save(output_path)
def batch_process(json_directory, jpg_directory, output_directory):
    """Annotate the matching image for every ``.json`` file in a directory.

    For ``NAME.json`` in *json_directory* the image ``NAME.jpg`` is looked up
    in *jpg_directory* and the annotated copy is written to
    ``NAME_annotated.jpg`` in *output_directory*. Files without a matching
    image are reported and skipped.
    """
    suffix = '.json'
    files = [f for f in os.listdir(json_directory) if f.endswith(suffix)]
    for filename in tqdm(files, total=len(files)):
        # Strip only the trailing suffix. str.replace would also rewrite any
        # earlier ".json" inside the name.
        stem = filename[:-len(suffix)]
        json_file_path = os.path.join(json_directory, filename)
        image_file_path = os.path.join(jpg_directory, stem + '.jpg')
        output_file_path = os.path.join(output_directory, stem + '_annotated.jpg')
        if not os.path.exists(image_file_path):
            print(f"Image file not found for {json_file_path}")
            continue
        json_data = load_json(json_file_path)
        validate_elements(json_data, image_file_path, output_file_path)
        print(f"Processed and saved: {output_file_path}")
if __name__ == "__main__":
    # The two input folders are left blank in the article; fill them in first.
    json_directory, jpg_directory = r'', r''
    # Annotated previews are written to this folder, created if missing.
    output_directory = r'match_jpg'
    os.makedirs(output_directory, exist_ok=True)
    batch_process(
        json_directory=json_directory,
        jpg_directory=jpg_directory,
        output_directory=output_directory,
    )
remove_offset
匹配效果好的话,就可以进行下一步,去除初始json的偏移量,得到最后的JSON标签。
# -*- coding=utf-8 -*-
import json
import os
from PIL import Image
from tqdm import tqdm
def load_json(json_file_path):
    """Return the object decoded from the UTF-8 JSON file at *json_file_path*."""
    with open(json_file_path, 'r', encoding='utf-8') as source:
        decoded = json.load(source)
    return decoded
def save_json(data, json_file_path):
    """Write *data* to *json_file_path* as UTF-8 JSON.

    The output uses a 4-space indent and keeps non-ASCII characters unescaped.
    """
    dump_options = {'ensure_ascii': False, 'indent': 4}
    with open(json_file_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, **dump_options)
def update_json_values(json_data, image_width, image_height):
    """Remove the ``max_rectangle`` offset from all coordinates, in place.

    * Every element with geometry has ``min_x`` / ``min_y`` subtracted from
      its ``x`` / ``y``.
    * An edge whose source and target both resolve to such an element gets
      its ``points`` replaced by two points: the middle of the source's right
      side and the middle of the target's left side.
    * Any other edge keeps its existing ``points``, shifted by the same
      offset so they stay in the same coordinate system as the elements.
    * ``max_rectangle`` is reset to ``(0, 0)``-``(image_width, image_height)``.

    Args:
        json_data: Dict with ``elements``, ``edges`` and ``max_rectangle``.
        image_width: Width written into ``max_rectangle``.
        image_height: Height written into ``max_rectangle``.

    Returns:
        The same *json_data* object, modified in place.
    """
    max_rectangle = json_data['max_rectangle']
    min_x = max_rectangle['min_x']
    min_y = max_rectangle['min_y']

    # Shift elements and index the positioned ones by id in a single pass.
    # Elements stored without geometry are left untouched instead of raising
    # a KeyError. setdefault keeps the first element when ids repeat.
    geometry_keys = ('x', 'y', 'width', 'height')
    positioned = {}
    for element in json_data['elements']:
        if not all(key in element for key in geometry_keys):
            continue
        element['x'] -= min_x
        element['y'] -= min_y
        positioned.setdefault(element['id'], element)

    for edge in json_data['edges']:
        source = positioned.get(edge.get('source'))
        target = positioned.get(edge.get('target'))
        if source is not None and target is not None:
            edge['points'] = [
                {'x': source['x'] + source['width'], 'y': source['y'] + source['height'] / 2},
                {'x': target['x'], 'y': target['y'] + target['height'] / 2},
            ]
        else:
            # Unresolved edge: keep its way-points, but apply the same offset
            # that was applied to the elements.
            for point in edge.get('points') or []:
                point['x'] -= min_x
                point['y'] -= min_y

    # Update max_rectangle
    max_rectangle['min_x'] = 0
    max_rectangle['min_y'] = 0
    max_rectangle['max_x'] = image_width
    max_rectangle['max_y'] = image_height
    max_rectangle['width'] = image_width
    max_rectangle['height'] = image_height
    return json_data
def process_file(json_file_path, image_file_path, output_json_path):
    """Rewrite one JSON label file with the offset removed.

    Loads the JSON, reads the image size, passes both to
    ``update_json_values`` and saves the result to *output_json_path*.
    """
    json_data = load_json(json_file_path)
    # Only the size is needed; the context manager closes the file handle
    # instead of leaking one per processed image.
    with Image.open(image_file_path) as image:
        image_width, image_height = image.size
    updated_json_data = update_json_values(json_data, image_width, image_height)
    save_json(updated_json_data, output_json_path)
def batch_process(json_directory, jpg_directory, output_directory):
    """Run ``process_file`` for every ``.json`` file that has a matching image.

    For ``NAME.json`` in *json_directory* the image ``NAME.jpg`` is looked up
    in *jpg_directory* and the updated JSON is written under the same file
    name in *output_directory*. Files without a matching image are reported
    and skipped.
    """
    suffix = '.json'
    files = [f for f in os.listdir(json_directory) if f.endswith(suffix)]
    for filename in tqdm(files, total=len(files)):
        # Strip only the trailing suffix. str.replace would also rewrite any
        # earlier ".json" inside the name.
        stem = filename[:-len(suffix)]
        json_file_path = os.path.join(json_directory, filename)
        image_file_path = os.path.join(jpg_directory, stem + '.jpg')
        output_json_path = os.path.join(output_directory, filename)
        if not os.path.exists(image_file_path):
            print(f"Image file not found for {json_file_path}")
            continue
        process_file(json_file_path, image_file_path, output_json_path)
        print(f"Processed and saved: {output_json_path}")
if __name__ == "__main__":
    # All three folders are left blank in the article; fill them in first.
    json_directory, jpg_directory, output_directory = r'', r'', r''
    os.makedirs(output_directory, exist_ok=True)
    batch_process(
        json_directory=json_directory,
        jpg_directory=jpg_directory,
        output_directory=output_directory,
    )
得到最后的标签数据。