labelme数据集转sharegpt数据集（InternVL2）

魔障阿Q

已于 2024-08-29 20:25:49 修改

阅读量182

点赞数 3

文章标签：深度学习人工智能

于 2024-08-26 15:16:29 首次发布

本文链接：https://blog.csdn.net/qq_44908396/article/details/141564632

版权

最近在鼓捣InternVL2的Grounding能力微调，但是公司标注平台提供的数据集是json文件，格式类似于labelme数据集，而sharegpt格式的数据集还需要为每个样本添加提问和回答，因此在这里编写了一个脚本来自动完成这一转换。

import jsonlines
import json
import os
import cv2

def normalize_coordinates(box, image_width, image_height):
    """Rescale a pixel-space box onto an integer 0-1000 grid.

    ``box`` is unpacked as ``(x1, y1, x2, y2)``. Horizontal values are divided
    by ``image_width`` and vertical values by ``image_height``; each ratio is
    multiplied by 1000 and rounded with the built-in ``round``.

    Returns:
        A list ``[x1, y1, x2, y2]`` of rounded, rescaled coordinates.
    """
    left, top, right, bottom = box
    # Pair every coordinate with the image dimension it is measured against.
    paired = (
        (left, image_width),
        (top, image_height),
        (right, image_width),
        (bottom, image_height),
    )
    return [round((value / extent) * 1000) for value, extent in paired]

def getFileList(dir, Filelist, ext=None):
    """Recursively collect file paths under ``dir`` into ``Filelist``.

    Args:
        dir: A file path, or a directory that is walked recursively.
        Filelist: List that matching paths are appended to (mutated in place).
        ext: Optional file-name suffix such as ``'.json'``. When ``None``,
            every file is collected.

    Returns:
        The same ``Filelist`` object that was passed in.
    """
    if os.path.isfile(dir):
        # Match on the END of the path. A substring test would also accept
        # 'a.json.bak' or any file that lives below a directory whose name
        # happens to contain the extension.
        if ext is None or dir.endswith(ext):
            Filelist.append(dir)
    elif os.path.isdir(dir):
        # os.listdir order is arbitrary; sort so repeated runs agree.
        for entry in sorted(os.listdir(dir)):
            getFileList(os.path.join(dir, entry), Filelist, ext)
    return Filelist

def get_json_label(json_file):
    """Read the label-id to label-name table stored in an annotation file.

    The file must contain a top-level ``'dicts'`` list whose items carry a
    ``'label'`` (converted with ``int``) and a ``'labelName'``.

    Args:
        json_file: Path of the annotation JSON file.

    Returns:
        A dict mapping ``int(label)`` to ``labelName``.

    Raises:
        KeyError: If ``'dicts'`` or one of the per-item keys is missing.
    """
    # Read as UTF-8 explicitly: label names may be non-ASCII, and the
    # platform default encoding (e.g. GBK on Windows) would fail to decode.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return {int(item['label']): item['labelName'] for item in data['dicts']}

def get_ever_json_label_num(json_file):
    """Count how many shapes of each label one annotation file contains.

    Args:
        json_file: Path of the annotation JSON file; it must contain a
            top-level ``'shapes'`` list whose items carry a ``'label'``.

    Returns:
        A dict mapping each label (as stored in the file) to the number of
        shapes that carry it.
    """
    print(json_file)  # progress trace, kept from the original script
    # Explicit UTF-8 so the result does not depend on the platform encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    counts = {}
    for shape in data['shapes']:
        label = shape['label']
        counts[label] = counts.get(label, 0) + 1
    return counts

def slim_json(json_file):
    """Condense one annotation file into image info plus normalized boxes.

    The image is expected next to the annotation, with the same base name and
    a ``.jpg`` extension. Its size is read from the decoded image itself.

    Args:
        json_file: Path of the annotation JSON file; it must contain
            ``'imagePath'`` and a ``'shapes'`` list with ``'label'`` and
            ``'points'`` per shape.

    Returns:
        A dict with the keys ``'image'``, ``'width'`` and ``'height'`` plus
        one key per label, whose value is the list of boxes for that label as
        produced by ``normalize_coordinates``. ``None`` (after printing the
        image path) when the image is missing or cannot be decoded.
    """
    # splitext only swaps the real extension; str.replace would rewrite every
    # '.json' occurrence in the path, including ones in directory names.
    image_path = os.path.splitext(json_file)[0] + '.jpg'
    if not os.path.exists(image_path):
        print(image_path)
        return None
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports an unreadable image by returning None.
        print(image_path)
        return None
    image_height, image_width = img.shape[:2]

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    record = {
        'image': data['imagePath'],
        'width': image_width,
        'height': image_height,
    }
    for shape in data['shapes']:
        xs = [point[0] for point in shape['points']]
        ys = [point[1] for point in shape['points']]
        # Use the bounding extremes rather than the first two points in
        # drawing order, so the box is always [left, top, right, bottom].
        box = [min(xs), min(ys), max(xs), max(ys)]
        record.setdefault(shape['label'], []).append(
            normalize_coordinates(box, image_width, image_height)
        )
    return record

def get_jsonl(json_file,label_dict,id):
    """Build one sharegpt-style grounding sample from an annotation file.

    Args:
        json_file: Path of the annotation JSON file.
        label_dict: Maps ``int(label)`` to the label's display name.
        id: Identifier stored under the sample's ``'id'`` key.

    Returns:
        A dict with the keys ``'id'``, ``'image'`` (a one-element list),
        ``'width'``, ``'height'`` and ``'conversations'`` (one human/gpt pair
        per label; only the first human turn carries the ``<image>`` tag).
        ``None`` when ``slim_json`` could not load the sample.

    Raises:
        KeyError: If a label has no entry in ``label_dict``.
    """
    first_question = "<image>\n请提供这句话描述的区域的边界框坐标:<ref>*****</ref>"  # first turn carries the image
    later_question = "请提供这句话描述的区域的边界框坐标:<ref>*****</ref>"  # follow-up turns carry no image
    answer_template = "<ref>*****共有%%个,区域分别为</ref><box>&&&&&</box>"

    data = slim_json(json_file)
    if data is None:
        # Image missing or unreadable: there is nothing to build a sample from.
        return None
    ever_label_num = get_ever_json_label_num(json_file)

    conversations = []
    sample = {
        'id': id,
        'image': [data['image']],
        'width': data['width'],
        'height': data['height'],
        'conversations': conversations,
    }
    for label, boxes in data.items():
        if label in ('image', 'width', 'height') or label not in ever_label_num:
            continue
        name = label_dict[int(label)]
        question_template = later_question if conversations else first_question
        # Fill in the count and the boxes before the name, so a name that
        # happens to contain '%%' or '&&&&&' cannot be rewritten afterwards.
        answer = (
            answer_template
            .replace('%%', str(ever_label_num[label]))
            .replace('&&&&&', str(boxes))
            .replace('*****', name)
        )
        conversations.append({"from": "human", "value": question_template.replace('*****', name)})
        conversations.append({"from": "gpt", "value": answer})
    return sample

def write_jsonl(json_file_list,label_dict,output_path='internvl_dataset_100.jsonl'):
    """Convert annotation files and write them as a JSON Lines dataset.

    Args:
        json_file_list: Annotation JSON paths; each one's position in this
            list becomes the sample ``id``.
        label_dict: Maps ``int(label)`` to the label's display name.
        output_path: Destination file; defaults to the name the script has
            always used.
    """
    records = []
    for index, json_file in enumerate(json_file_list):
        record = get_jsonl(json_file, label_dict, index)
        # Defensive: never emit a `null` line for a sample that produced nothing.
        if record is not None:
            records.append(record)
    with jsonlines.open(output_path, mode='w') as writer:
        # write_all emits one JSON object per line; write(records) would dump
        # the whole list as a single array line, which is not JSON Lines.
        writer.write_all(records)



if __name__ == '__main__':
    # Folder holding both the images and their annotation JSON files.
    json_path = r'your_data_and_label_path'
    json_file_list = getFileList(json_path, [], ext='.json')
    if not json_file_list:
        # Fail with a clear message instead of an IndexError on [0] below.
        raise SystemExit('no .json annotation files found under: ' + json_path)
    # The label table is read from the first file's 'dicts' entry; only the
    # first 100 annotation files are converted.
    label_dict = get_json_label(json_file_list[0])
    write_jsonl(json_file_list[0:100], label_dict)

这里需要将图像和标注的json文件放在同一文件夹内。另外，公司标注平台产生的标注数据里会记录类别对应的中文名称（脚本中的get_json_label就是从json的dicts字段读取这张类别表），而标准的labelme数据没有这个字段，所以直接拷贝代码是跑不起来的，需要自己在代码倒数第二行定义一个label_dict变量，如{0:"dog",1:"cat"}等。还有就是此脚本仅适用于InternVL的微调，因为其使用normalize_coordinates函数把坐标归一化到了0~1000的范围；如果微调其他大模型，需要根据对应大模型的转换公式进行修改。