labelme数据集转sharegpt数据集(InternVL2)

最近在鼓捣InternVL2的Grounding能力微调,但是公司标注平台提供的数据集格式为json文件,类似于labelme格式数据集,而sharegpt数据集格式需要添加提问和回答,因此在这里编写了一个脚本自动实现该功能

import jsonlines
import json
import os
import cv2

def normalize_coordinates(box, image_width, image_height):
    """Scale a pixel-space box onto an integer 0-1000 grid.

    Args:
        box: Four values ``(x1, y1, x2, y2)`` in pixels.
        image_width: Width used to scale the two x values.
        image_height: Height used to scale the two y values.

    Returns:
        A list ``[x1, y1, x2, y2]`` in which every value has been divided by
        the matching image dimension, multiplied by 1000 and rounded.
    """
    left, top, right, bottom = box
    scaled = []
    # Pair every coordinate with the dimension it is measured against.
    for value, extent in (
        (left, image_width),
        (top, image_height),
        (right, image_width),
        (bottom, image_height),
    ):
        scaled.append(round((value / extent) * 1000))
    return scaled

def getFileList(dir, Filelist, ext=None):
    """Recursively collect file paths under ``dir`` into ``Filelist``.

    Args:
        dir: A file path, or a directory that is walked recursively.  (The
            name shadows the builtin but is kept so keyword callers still
            work.)
        Filelist: List that matching paths are appended to; it is mutated in
            place and also returned.
        ext: Optional file-name suffix such as ``'.json'``.  When ``None``
            every file is collected.

    Returns:
        The same ``Filelist`` object, extended with the matching paths.
    """
    if os.path.isfile(dir):
        # Compare against the END of the name: a substring test would also
        # pick up paths such as ``a.json.bak`` or ``labels.json_old/x.txt``.
        if ext is None or dir.endswith(ext):
            Filelist.append(dir)
    elif os.path.isdir(dir):
        # os.listdir returns entries in arbitrary order; sort them so the
        # resulting list is reproducible across runs and machines.
        for entry in sorted(os.listdir(dir)):
            getFileList(os.path.join(dir, entry), Filelist, ext)

    return Filelist

def get_json_label(json_file):
    """Read the class-id to class-name mapping stored in an annotation file.

    The file is expected to contain a top-level ``'dicts'`` list whose items
    carry a ``'label'`` (converted with ``int``) and a ``'labelName'``.

    Args:
        json_file: Path to the annotation JSON file.

    Returns:
        A dict mapping ``int(label)`` to ``labelName``.

    Raises:
        KeyError: If ``'dicts'``, ``'label'`` or ``'labelName'`` is missing.
        ValueError: If a label cannot be converted to ``int``.
    """
    # Open explicitly as UTF-8: label names may be non-ASCII, and the
    # platform default encoding is not UTF-8 everywhere.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return {int(entry['label']): entry['labelName'] for entry in data['dicts']}

def get_ever_json_label_num(json_file):
    """Count how many shapes each label has in one annotation file.

    The path is printed before the file is read, which serves as simple
    progress output when many files are processed.

    Args:
        json_file: Path to an annotation JSON file with a ``'shapes'`` list
            whose items each carry a ``'label'``.

    Returns:
        A dict mapping each label (kept exactly as stored in the file) to
        the number of shapes carrying it, in first-seen order.
    """
    print(json_file)
    counts = {}
    # Open explicitly as UTF-8 so the platform default encoding cannot
    # break files that contain non-ASCII text.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    for shape in data['shapes']:
        label = shape['label']
        counts[label] = counts.get(label, 0) + 1
    return counts

def slim_json(json_file):
    """Condense one annotation file into image info plus normalized boxes.

    The image is looked up next to the annotation file, using the same name
    with a ``.jpg`` extension.

    Args:
        json_file: Path to an annotation JSON file containing ``'imagePath'``
            and a ``'shapes'`` list; each shape has a ``'label'`` and at
            least two ``'points'`` (two corners of a rectangle).

    Returns:
        A dict with the keys ``'image'``, ``'width'`` and ``'height'`` plus
        one key per label, each mapping to a list of boxes produced by
        ``normalize_coordinates``.  ``None`` is returned (after printing the
        image path) when the image is missing or cannot be decoded.

    Note:
        Labels share the dict with ``'image'``/``'width'``/``'height'``, so a
        label with one of those names is not supported.
    """
    # Swap only the extension; str.replace would also rewrite any '.json'
    # that happens to appear in a directory name.
    stem, _ = os.path.splitext(json_file)
    image_path = stem + '.jpg'
    if not os.path.exists(image_path):
        print(image_path)
        return
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports an unreadable image by returning None instead
        # of raising; treat it the same way as a missing image.
        print(image_path)
        return
    image_height, image_width = img.shape[0], img.shape[1]

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    temp = {
        'image': data['imagePath'],
        'width': image_width,
        'height': image_height,
    }
    for shape in data['shapes']:
        first, second = shape['points'][0], shape['points'][1]
        # The two corners may have been drawn in any order; sort them so the
        # box is always [x_min, y_min, x_max, y_max].
        box = [
            min(first[0], second[0]),
            min(first[1], second[1]),
            max(first[0], second[0]),
            max(first[1], second[1]),
        ]
        temp.setdefault(shape['label'], []).append(
            normalize_coordinates(box, image_width, image_height)
        )
    return temp

def get_jsonl(json_file, label_dict, id):
    """Build one sharegpt-style grounding sample from an annotation file.

    For every label found in the file a human question and a gpt answer are
    appended to ``'conversations'``.  Only the first question carries the
    ``<image>`` placeholder.

    Args:
        json_file: Path to the annotation JSON file.
        label_dict: Mapping from ``int(label)`` to the class name inserted
            into the prompts.
        id: Identifier stored under ``'id'``.  (The name shadows the builtin
            but is kept so keyword callers still work.)

    Returns:
        A dict with the keys ``'id'``, ``'image'`` (a one-element list),
        ``'width'``, ``'height'`` and ``'conversations'``.

    Raises:
        FileNotFoundError: If ``slim_json`` could not load the image that
            belongs to ``json_file``.
        KeyError: If a label has no entry in ``label_dict``.
    """
    data = slim_json(json_file)
    if data is None:
        # slim_json returns None when it has no usable image; fail with a
        # clear message instead of a TypeError from iterating over None.
        raise FileNotFoundError(
            'no usable image found for annotation file: %s' % json_file
        )
    ever_label_num = get_ever_json_label_num(json_file)

    record = {
        'id': id,
        'image': None,
        'width': None,
        'height': None,
        'conversations': [],
    }
    for key, value in data.items():
        if key == 'image':
            record['image'] = [value]
        elif key == 'width' or key == 'height':
            record[key] = value
        elif key in ever_label_num:
            name = label_dict[int(key)]
            # Look the count up with the same key that passed the membership
            # test above (no str() conversion), and build each sentence in
            # one step so a label name can never be mistaken for a
            # placeholder.
            count = ever_label_num[key]
            question = f"请提供这句话描述的区域的边界框坐标:<ref>{name}</ref>"
            if not record['conversations']:
                # Only the first turn of a sample references the image.
                question = "<image>\n" + question
            answer = f"<ref>{name}共有{count}个,区域分别为</ref><box>{value}</box>"
            record['conversations'].append({"from": "human", "value": question})
            record['conversations'].append({"from": "gpt", "value": answer})
    return record

def write_jsonl(json_file_list, label_dict, output_path='internvl_dataset_100.jsonl'):
    """Convert annotation files to samples and write them as JSON Lines.

    Args:
        json_file_list: Paths of the annotation JSON files to convert.
        label_dict: Mapping from ``int(label)`` to class name, forwarded to
            ``get_jsonl``.
        output_path: Destination file; defaults to the file name the script
            has always used.

    Sample ids are assigned sequentially starting at 0, following the order
    of ``json_file_list``.
    """
    records = [
        get_jsonl(json_file, label_dict, index)
        for index, json_file in enumerate(json_file_list)
    ]
    with jsonlines.open(output_path, mode='w') as writer:
        # write_all emits one JSON object per line.  writer.write(records)
        # would dump the whole list as a single line, which is not JSONL.
        writer.write_all(records)



if __name__ == '__main__':
    # Folder that holds the images together with their annotation files.
    json_path = r'your_data_and_label_path'
    json_file_list = []
    json_file_list = getFileList(json_path, json_file_list, ext='.json')
    if not json_file_list:
        # Fail with a clear message instead of an IndexError below.
        raise SystemExit('No .json annotation files found under %r' % json_path)
    # The class-id -> name mapping is read from the first annotation file.
    # If your files have no 'dicts' section, define label_dict by hand,
    # e.g. {0: "dog", 1: "cat"}.
    label_dict = get_json_label(json_file_list[0])
    # Only the first 100 annotation files are converted.
    write_jsonl(json_file_list[0:100], label_dict)

这里需要将图像和标注的json文件放在同一文件夹内。另外,公司的标注平台产生的标注数据里会记录类别对应的中文名称(代码中通过data['dicts']读取),如果你的数据中没有该字段,直接拷贝代码是跑不起来的,需要自己在代码倒数第二行定义一个label_dict变量,如{0:"dog",1:"cat"}等。还有就是此脚本仅适用于InternVL的微调,因为其使用normalize_coordinates函数将坐标归一化到0~1000的范围;如果微调其他大模型,需要根据对应大模型的转换公式进行修改。

  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值