Fine-tuning Qwen2-VL with unsloth for OCR on Complex Game Equipment

Preliminary Preparation

Dataset preparation:

1. Image collection: first, capture screenshots of the in-game items of the target game you are studying.

2. Label preparation

Since no ground-truth labels exist, we can bootstrap the dataset quickly by running the Qwen2-VL-2B-OCR model (https://huggingface.co/JackChew/Qwen2-VL-2B-OCR/tree/main) over every image and taking its output as the initial label. Because fine-tuning quality depends directly on label quality, these labels then need to be vetted: for your specific use case, work out the fixed fields a game item always carries, any special fields, the valid field combinations, each field's value range, and so on, and use that domain knowledge to correct erroneous labels until every label is accurate.
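For reference, here is a minimal auto-labeling sketch using the standard transformers inference path for Qwen2-VL; the file name item_001.png and the max_new_tokens value are illustrative assumptions:

import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "JackChew/Qwen2-VL-2B-OCR", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("JackChew/Qwen2-VL-2B-OCR")

image = Image.open("item_001.png").convert("RGB")  # hypothetical screenshot
messages = [{"role": "user", "content": [
    {"type": "image"},
    # "Recognize the text in the image and output it in order"
    {"type": "text", "text": "识别图像中的文字内容,并按照顺序输出结果"}]}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=512)
# Drop the prompt tokens and keep only the generated label text
label = processor.batch_decode(
    out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
print(label)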

Finally, the data has to be preprocessed into the conversation format that Qwen2-VL expects for training; the conversion function is shown in the "Load the dataset" step of the walkthrough below.

The images also go through a series of operations such as resizing and binarization; the final dataset contains roughly 60,000 samples.
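As a rough illustration, here is a preprocessing sketch with PIL; the 640 px target width and the fixed threshold of 180 are assumed values you would tune for your game's screenshots:

from PIL import Image

def preprocess(path, target_width=640, threshold=180):
    img = Image.open(path).convert("L")  # grayscale
    scale = target_width / img.width
    img = img.resize((target_width, int(img.height * scale)), Image.LANCZOS)
    # Binarize: pixels above the threshold become white, the rest black
    img = img.point(lambda p: 255 if p > threshold else 0)
    return img.convert("RGB")  # Qwen2-VL expects 3-channel input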

Environment Setup

Given limited GPU resources, the need to control training-time cost, and other people's fine-tuning write-ups, I settled on the unsloth framework. Install it following the official unsloth documentation (at the time of writing, pip install unsloth plus a matching PyTorch/CUDA build; the process can be bumpy, so be prepared to troubleshoot).

Fine-tuning

  • Import the required libraries
import os

# Set the Hugging Face cache paths to the D drive. These must be set
# before importing unsloth/transformers, or they will not take effect.
os.environ["HF_DATASETS_CACHE"] = "D:/huggingface/datasets"
os.environ["TRANSFORMERS_CACHE"] = "D:/huggingface/transformers"

from unsloth import FastVisionModel # use FastLanguageModel instead for text-only LLMs
import torch
import json
  • Load the model

One thing to note here: when you point from_pretrained at a locally downloaded model, load_in_4bit must be set to False, because Qwen2-VL-2B-Instruct is a 16-bit model. If you leave it at True, unsloth will automatically download a 4-bit model into the default cache on the C drive, wasting disk space.

model, tokenizer = FastVisionModel.from_pretrained(
    # "unsloth/Qwen2-VL-2B-Instruct",
    'D:/model/Qwen2-VL-2B-Instruct',
    load_in_4bit = False, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

Side note: if you want to change Hugging Face's default download location permanently, find the file xxx\env\rag\Lib\site-packages\huggingface_hub\constants.py in your virtual environment and point the paths inside it at your target directory.
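Setting the HF_HOME environment variable before any Hugging Face import achieves the same thing without editing library code (documented Hugging Face behavior; the path is an example):

import os
# HF_HOME moves the entire Hugging Face cache, hub downloads included
os.environ["HF_HOME"] = "D:/huggingface"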

  • Configure LoRA fine-tuning
# =============================================== Configure LoRA fine-tuning ========================================
model = FastVisionModel.get_peft_model(
    model,
    # Which parts of the model to fine-tune:
    finetune_vision_layers     = True, # fine-tune the vision layers
    finetune_language_layers   = True, # fine-tune the language layers
    finetune_attention_modules = True, # fine-tune the attention modules
    finetune_mlp_modules       = True, # fine-tune the MLP modules
    # LoRA parameters
    r = 16,           # rank; larger means more capacity but a higher risk of overfitting
    lora_alpha = 16,  # LoRA scaling factor; a common choice is alpha == r
    lora_dropout = 0, # dropout rate; 0 disables dropout
    bias = "none",    # do not add extra bias parameters
    random_state = 3407, # random seed so the run is reproducible
    use_rslora = False,  # whether to use rank-stabilized LoRA (an improved LoRA variant)
    loftq_config = None, # whether to use LoftQ (a quantization-aware initialization)
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)
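Since get_peft_model returns a standard PEFT-wrapped model, a quick sanity check (assuming the usual peft API is exposed) is to print how much of the model LoRA actually trains:

# Prints trainable vs. total parameter counts for the LoRA-wrapped model
model.print_trainable_parameters()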
  • Load the dataset
# Build the conversation-format samples Qwen2-VL expects for training
# The prompt (in Chinese) reads: "Recognize the text in the image and output it in order"
instruction = "识别图像中的文字内容,并按照顺序输出结果"
from PIL import Image

def convert_to_conversation(sample):
    # Each source sample stores the image path in conversations[0] and the
    # ground-truth text in conversations[1]. Only the file name is kept,
    # so the images must sit in the current working directory.
    image_name = sample['conversations'][0]['value'].split('/')[-1]
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : instruction},
            {"type" : "image", "image" : Image.open(image_name).convert("RGB")} ]
        },
        { "role" : "assistant",
          "content" : [
            {"type" : "text",  "text"  : sample['conversations'][1]['value']} ]
        },
    ]
    return { "messages" : conversation }


import json

train_dataset_json_path = r"D:\work\ocr\image_ocr\test\linshi\new_linshi_train.json"
# Read the annotation JSON and convert every sample to the conversation format
with open(train_dataset_json_path, 'r', encoding="utf-8") as f:
    train_data = json.load(f)
converted_dataset = [convert_to_conversation(sample) for sample in train_data]
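A quick sanity check on the converted dataset (index 0 is arbitrary):

print(f"{len(converted_dataset)} training samples")
print(converted_dataset[0]["messages"][0]["content"][0])  # the user-turn instruction
print(converted_dataset[0]["messages"][1]["content"][0])  # the expected label text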

  • Configure the fine-tuning hyperparameters
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model) # Enable for training!

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = converted_dataset,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # effective batch size = 2 * 4 = 8
        warmup_steps = 5,
        # max_steps = 60,
        num_train_epochs = 2, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",     # change to "wandb" for Weights and Biases

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    ),
)
  • Check GPU memory
# Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
  • Start training
trainer_stats = trainer.train()
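trainer.train() returns a TrainOutput object whose metrics dict holds the run statistics, for example:

print(f"Training took {trainer_stats.metrics['train_runtime']:.0f} seconds")
print(f"Final training loss: {trainer_stats.metrics['train_loss']:.4f}")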
  • Save the LoRA model
model.save_pretrained("new_lora_model")  # Local saving
tokenizer.save_pretrained("new_lora_model")
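Before merging, it is worth spot-checking the adapter. The sketch below follows the inference pattern from the unsloth vision notebook linked in the references; test_item.png and the generation settings are illustrative:

from PIL import Image

FastVisionModel.for_inference(model)  # switch the model to inference mode

image = Image.open("test_item.png").convert("RGB")  # hypothetical test screenshot
messages = [{"role": "user", "content": [
    {"type": "image"},
    {"type": "text", "text": instruction}]}]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(image, input_text, add_special_tokens=False,
                   return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=512, use_cache=True)
print(tokenizer.batch_decode(output, skip_special_tokens=True)[0])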
  • Merge the model
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from peft import PeftModel
import torch

import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model_name_or_path", type=str,default='D:\\model\\Qwen2-VL-2B-Instruct')
    parser.add_argument("--peft_model_path", type=str,default='D:\\work\\ocr\\qwen2_loar\\unsloth_try\\new_lora_model')
    parser.add_argument("--output_dir", type=str,default='D:\\work\\ocr\\qwen2_loar\\unsloth_try\\lingshi_qwen2_ocr_new')
    parser.add_argument("--device", type=str, default="auto")
    parser.add_argument("--push_to_hub", action="store_false")

    return parser.parse_args()

def main():
    args = get_args()

    if args.device == 'auto':
        device_arg = { 'device_map': 'auto' }
    else:
        device_arg = { 'device_map': { "": args.device} }

    print(f"Loading base model: {args.base_model_name_or_path}")
    base_model = Qwen2VLForConditionalGeneration.from_pretrained(
        args.base_model_name_or_path,
        return_dict=True,
        torch_dtype=torch.float16,
        **device_arg
    )

    print(f"Loading PEFT: {args.peft_model_path}")
    model = PeftModel.from_pretrained(base_model, args.peft_model_path, **device_arg)
    print(f"Running merge_and_unload")
    model = model.merge_and_unload()

    tokenizer = AutoProcessor.from_pretrained(args.base_model_name_or_path)

    # if args.push_to_hub:
    #     print(f"Saving to hub ...")
    #     model.push_to_hub(f"{args.output_dir}", use_temp_dir=False)
    #     tokenizer.push_to_hub(f"{args.output_dir}", use_temp_dir=False)
    # else:
    model.save_pretrained(f"{args.output_dir}")
    tokenizer.save_pretrained(f"{args.output_dir}")
    print(f"Model saved to {args.output_dir}")

if __name__ == "__main__" :
    main()
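After the merge, the output directory contains a standalone 16-bit model that loads without PEFT. A quick load check, reusing the output path from above:

from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import torch

merged_dir = "D:/work/ocr/qwen2_loar/unsloth_try/lingshi_qwen2_ocr_new"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    merged_dir, torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained(merged_dir)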

References:

Models:

https://huggingface.co/JackChew/Qwen2-VL-2B-OCR

https://huggingface.co/unsloth/Qwen2-VL-2B-Instruct

Training code:

https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_VL_(7B)-Vision.ipynb

Model merge code:

https://github.com/unslothai/unsloth/issues/1352
