labelme数据集转sharegpt数据集（Qwen-VL）-CSDN博客

本文链接：https://blog.csdn.net/qq_44908396/article/details/141687726

最近在鼓捣QwenVL的Grounding能力微调，但是公司标注平台提供的数据集格式为json文件，类似于labelme格式的数据集；而sharegpt数据集格式需要添加提问和回答，因此在这里编写了一个脚本来自动完成该转换。

import jsonlines
import json
import os
import cv2

def normalize_coordinates(box, image_width, image_height):
    """Scale a pixel-space box onto a 0-1000 integer grid.

    Args:
        box: Four numbers ``(x1, y1, x2, y2)`` in pixels.
        image_width: Image width in pixels; the x values are divided by it.
        image_height: Image height in pixels; the y values are divided by it.

    Returns:
        A list ``[x1, y1, x2, y2]`` in which every value is
        ``round(value / extent * 1000)``.
    """
    left, top, right, bottom = box
    # Pair each coordinate with the image extent it is measured against.
    scaled_against = (
        (left, image_width),
        (top, image_height),
        (right, image_width),
        (bottom, image_height),
    )
    return [round((value / extent) * 1000) for value, extent in scaled_against]

def getFileList(dir, Filelist, ext=None):
    """Recursively collect file paths under ``dir`` into ``Filelist``.

    Args:
        dir: Path of a single file or of a directory root to walk.
        Filelist: List that matching paths are appended to (mutated in place).
        ext: Optional file-name suffix such as ``'.json'``. When ``None``,
            every file is collected.

    Returns:
        The same ``Filelist`` object that was passed in.
    """
    if os.path.isfile(dir):
        # Compare the suffix only. A substring test on the whole path would
        # also accept names such as "a.json.bak", or any file located below
        # a directory whose name happens to contain the extension.
        if ext is None or dir.endswith(ext):
            Filelist.append(dir)
    elif os.path.isdir(dir):
        for entry in os.listdir(dir):
            getFileList(os.path.join(dir, entry), Filelist, ext)

    return Filelist

def get_json_label(json_file):
    """Read the label-id to label-name mapping stored in an annotation file.

    Args:
        json_file: Path to a JSON file whose top-level ``'dicts'`` entry is a
            list of objects carrying ``'label'`` and ``'labelName'`` keys.

    Returns:
        A dict mapping ``int(label)`` to ``labelName``.
    """
    # Read as UTF-8 explicitly: the label names may be non-ASCII, and the
    # platform default encoding is not UTF-8 everywhere.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return {int(entry['label']): entry['labelName'] for entry in data['dicts']}

def get_ever_json_label_num(json_file):
    """Count how many shapes carry each label in one annotation file.

    Args:
        json_file: Path to a JSON file whose top-level ``'shapes'`` entry is a
            list of objects with a ``'label'`` key.

    Returns:
        A dict mapping each label (exactly as stored in the file) to the
        number of shapes that use it.
    """
    print(json_file)  # trace which annotation file is being processed
    # Read as UTF-8 explicitly so the result does not depend on the platform
    # default encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    res = {}
    for shape in data['shapes']:
        res[shape['label']] = res.get(shape['label'], 0) + 1
    return res

def slim_json(json_file):
    """Reduce one annotation file to image metadata plus normalized boxes.

    The image is looked up next to the annotation file, under the same base
    name with a ``.jpg`` extension.

    Args:
        json_file: Path to an annotation JSON file with ``'imagePath'`` and
            ``'shapes'`` entries; every shape has a ``'label'`` and at least
            two ``'points'``.

    Returns:
        A dict with the keys ``'image'`` (the file's ``imagePath`` value),
        ``'width'`` and ``'height'`` (read from the image itself), plus one
        key per label holding a list of boxes on the 0-1000 grid produced by
        ``normalize_coordinates``. Returns ``None``, after printing the image
        path, when the image is missing or cannot be decoded.
    """
    # Swap only the trailing extension; str.replace would also rewrite any
    # ".json" that appears in a directory name along the path.
    image_path = os.path.splitext(json_file)[0] + '.jpg'
    if not os.path.exists(image_path):
        print(image_path)
        return None
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals an undecodable file by returning None rather
        # than raising; report it the same way as a missing image.
        print(image_path)
        return None
    image_height, image_width = img.shape[:2]

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    temp = {
        'image': data['imagePath'],
        'width': image_width,
        'height': image_height,
    }
    for shape in data['shapes']:
        # Assumes the first two points are opposite corners of a rectangle
        # -- TODO confirm for shapes with more than two points.
        xa, ya = shape['points'][0][0], shape['points'][0][1]
        xb, yb = shape['points'][1][0], shape['points'][1][1]
        # The corners are stored in drawing order, so sort them into
        # (top-left, bottom-right) before normalizing.
        corners = [min(xa, xb), min(ya, yb), max(xa, xb), max(ya, yb)]
        temp.setdefault(shape['label'], []).append(
            normalize_coordinates(corners, image_width, image_height)
        )
    return temp

def get_jsonl(json_path,json_file,label_dict,id):
    """Build one Qwen-VL grounding conversation record from an annotation file.

    For every label present in the file a user question and an assistant
    answer are appended. Only the first question embeds the image path inside
    ``<img>...</img>``; later questions are text only.

    Args:
        json_path: Folder that is joined with the annotation's image name to
            form the path placed inside ``<img>...</img>``.
        json_file: Path to the annotation JSON file.
        label_dict: Mapping from ``int(label)`` to the label's display name.
        id: Value stored under the record's ``'id'`` key.

    Returns:
        A dict ``{'id': id, 'conversations': [...]}`` whose conversation
        entries are ``{'from': ..., 'value': ...}`` dicts.

    Raises:
        FileNotFoundError: If ``slim_json`` returned no data for the file.
        KeyError: If a label in the file has no entry in ``label_dict``.
    """
    data = slim_json(json_file)
    if data is None:
        # Without this guard the loop below fails with an opaque
        # "'NoneType' object is not iterable".
        raise FileNotFoundError(
            'image for annotation file {} is missing or unreadable'.format(json_file)
        )
    ever_label_num = get_ever_json_label_num(json_file)
    image = os.path.join(json_path, data['image'])

    temp = {'id': id, 'conversations': []}
    for key, boxes in data.items():
        # Keys that are not labels of this file ('image', 'width', 'height')
        # carry metadata, not boxes.
        if key not in ever_label_num:
            continue
        label_name = label_dict[int(key)]

        # NOTE(review): str() of a tuple puts a space after the comma, so
        # boxes come out as "(x1, y1),(x2, y2)" -- confirm this matches the
        # box syntax the target model expects.
        answer = '<ref>' + label_name + '</ref>' + ''.join(
            '<box>' + str((b[0], b[1])) + ',' + str((b[2], b[3])) + '</box>'
            for b in boxes
        )

        if not temp['conversations']:
            # The first question carries the image.
            question = '<img>' + image + '</img>\n框出图中的' + label_name
        else:
            # Follow-up questions are text only.
            question = '框出图中的' + label_name
        temp['conversations'].append({'from': 'user', 'value': question})
        temp['conversations'].append({'from': 'assistant', 'value': answer})
    return temp

def write_jsonl(json_path,json_file_list,label_dict,output_path='qwenvl_dataset_all.jsonl'):
    """Convert annotation files to conversation records and write them out.

    Args:
        json_path: Folder containing the images; forwarded to ``get_jsonl``.
        json_file_list: Annotation JSON paths to convert, in output order.
        label_dict: Mapping from ``int(label)`` to label name; forwarded to
            ``get_jsonl``.
        output_path: Destination file. Defaults to the name that used to be
            hard-coded, so existing callers are unaffected.
    """
    # Records are numbered 0, 1, 2, ... in list order.
    res = [
        get_jsonl(json_path, json_file, label_dict, sample_id)
        for sample_id, json_file in enumerate(json_file_list)
    ]
    # NOTE(review): the whole list is written as ONE JSON value, i.e. a single
    # line holding an array, not one record per line. Kept as-is because the
    # downstream loader may expect a JSON array -- confirm before changing.
    with jsonlines.open(output_path, mode='w') as writer:
        writer.write(res)



if __name__ == '__main__':
    # The images and their annotation JSON files must sit in this one folder.
    json_path = r'your_json_and_pic_path'
    json_file_list = getFileList(json_path, [], ext='.json')
    if not json_file_list:
        # Indexing element 0 below would otherwise fail with a bare IndexError.
        raise SystemExit('no .json annotation files found under {}'.format(json_path))
    # The label id -> name mapping is taken from the first annotation file.
    label_dict = get_json_label(json_file_list[0])
    # NOTE(review): only the first six files are converted; widen the slice to
    # convert the full dataset -- confirm whether this limit is intentional.
    write_jsonl(json_path,json_file_list[0:6], label_dict)

这里需要将图像和标注的json文件放在同一文件夹内。另外，公司标注平台产生的标注数据里会记录类别对应的中文名称（脚本中的get_json_label就是从json的dicts字段里读取它们的），因此如果直接拷贝代码去处理不含该字段的json文件是跑不起来的，需要自己在代码倒数第二行定义一个label_dict变量，如{0:"dog",1:"cat"}等。还有就是此脚本仅适用于QwenVL的微调，如果微调其他大模型，需要根据对应大模型的数据格式进行修改。