Fine-tuning Qwen2-VL with unsloth for OCR on Complex Game Equipment

Preliminary Preparation

Dataset preparation:

1. Image collection: first, capture screenshots of the in-game items of the target game you are studying.

2. Label preparation

Since no ground-truth labels exist, we can bootstrap the dataset quickly by running the Qwen2-VL-2B-OCR model (https://huggingface.co/JackChew/Qwen2-VL-2B-OCR/tree/main) over every image and taking its output as the initial label. Because fine-tuning quality depends directly on label quality, these labels then need to be vetted: for your specific use case, work out the fixed fields a game item always carries, any special fields, the valid field combinations, each field's value range, and so on, and use that domain knowledge to correct erroneous labels until every label is accurate.
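For reference, here is a minimal auto-labeling sketch using the standard transformers inference path for Qwen2-VL; the file name item_001.png and the max_new_tokens value are illustrative assumptions:

import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "JackChew/Qwen2-VL-2B-OCR", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("JackChew/Qwen2-VL-2B-OCR")

image = Image.open("item_001.png").convert("RGB")  # hypothetical screenshot
messages = [{"role": "user", "content": [
    {"type": "image"},
    # "Recognize the text in the image and output it in order"
    {"type": "text", "text": "识别图像中的文字内容,并按照顺序输出结果"}]}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=512)
# Drop the prompt tokens and keep only the generated label text
label = processor.batch_decode(
    out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
print(label)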

Finally, the data has to be preprocessed into the conversation format that Qwen2-VL expects for training; the conversion function is shown in the "Load the dataset" step of the walkthrough below.

The images also go through a series of operations such as resizing and binarization; the final dataset contains roughly 60,000 samples.
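As a rough illustration, here is a preprocessing sketch with PIL; the 640 px target width and the fixed threshold of 180 are assumed values you would tune for your game's screenshots:

from PIL import Image

def preprocess(path, target_width=640, threshold=180):
    img = Image.open(path).convert("L")  # grayscale
    scale = target_width / img.width
    img = img.resize((target_width, int(img.height * scale)), Image.LANCZOS)
    # Binarize: pixels above the threshold become white, the rest black
    img = img.point(lambda p: 255 if p > threshold else 0)
    return img.convert("RGB")  # Qwen2-VL expects 3-channel input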

Environment Setup

Given limited GPU resources, the need to control training-time cost, and other people's fine-tuning write-ups, I settled on the unsloth framework. Install it following the official unsloth documentation (at the time of writing, pip install unsloth plus a matching PyTorch/CUDA build; the process can be bumpy, so be prepared to troubleshoot).

Fine-tuning

  • Import the required libraries
import os

# Set the Hugging Face cache paths to the D drive. These must be set
# before importing unsloth/transformers, or they will not take effect.
os.environ["HF_DATASETS_CACHE"] = "D:/huggingface/datasets"
os.environ["TRANSFORMERS_CACHE"] = "D:/huggingface/transformers"

from unsloth import FastVisionModel # use FastLanguageModel instead for text-only LLMs
import torch
import json
  • Load the model

One thing to note here: when you point from_pretrained at a locally downloaded model, load_in_4bit must be set to False, because Qwen2-VL-2B-Instruct is a 16-bit model. If you leave it at True, unsloth will automatically download a 4-bit model into the default cache on the C drive, wasting disk space.

model, tokenizer = FastVisionModel.from_pretrained(
    # "unsloth/Qwen2-VL-2B-Instruct",
    'D:/model/Qwen2-VL-2B-Instruct',
    load_in_4bit = False, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

Side note: if you want to change Hugging Face's default download location permanently, find the file xxx\env\rag\Lib\site-packages\huggingface_hub\constants.py in your virtual environment and point the paths inside it at your target directory.
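Setting the HF_HOME environment variable before any Hugging Face import achieves the same thing without editing library code (documented Hugging Face behavior; the path is an example):

import os
# HF_HOME moves the entire Hugging Face cache, hub downloads included
os.environ["HF_HOME"] = "D:/huggingface"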

  • Configure LoRA fine-tuning
# =============================================== Configure LoRA fine-tuning ========================================
model = FastVisionModel.get_peft_model(
    model,
    # Which parts of the model to fine-tune:
    finetune_vision_layers     = True, # fine-tune the vision layers
    finetune_language_layers   = True, # fine-tune the language layers
    finetune_attention_modules = True, # fine-tune the attention modules
    finetune_mlp_modules       = True, # fine-tune the MLP modules
    # LoRA parameters
    r = 16,           # rank; larger means more capacity but a higher risk of overfitting
    lora_alpha = 16,  # LoRA scaling factor; a common choice is alpha == r
    lora_dropout = 0, # dropout rate; 0 disables dropout
    bias = "none",    # do not add extra bias parameters
    random_state = 3407, # random seed so the run is reproducible
    use_rslora = False,  # whether to use rank-stabilized LoRA (an improved LoRA variant)
    loftq_config = None, # whether to use LoftQ (a quantization-aware initialization)
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)
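Since get_peft_model returns a standard PEFT-wrapped model, a quick sanity check (assuming the usual peft API is exposed) is to print how much of the model LoRA actually trains:

# Prints trainable vs. total parameter counts for the LoRA-wrapped model
model.print_trainable_parameters()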
  • Load the dataset
# Build the conversation-format samples Qwen2-VL expects for training
# The prompt (in Chinese) reads: "Recognize the text in the image and output it in order"
instruction = "识别图像中的文字内容,并按照顺序输出结果"
from PIL import Image

def convert_to_conversation(sample):
    # Each source sample stores the image path in conversations[0] and the
    # ground-truth text in conversations[1]. Only the file name is kept,
    # so the images must sit in the current working directory.
    image_name = sample['conversations'][0]['value'].split('/')[-1]
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : instruction},
            {"type" : "image", "image" : Image.open(image_name).convert("RGB")} ]
        },
        { "role" : "assistant",
          "content" : [
            {"type" : "text",  "text"  : sample['conversations'][1]['value']} ]
        },
    ]
    return { "messages" : conversation }


import json

train_dataset_json_path = r"D:\work\ocr\image_ocr\test\linshi\new_linshi_train.json"
# Read the annotation JSON and convert every sample to the conversation format
with open(train_dataset_json_path, 'r', encoding="utf-8") as f:
    train_data = json.load(f)
converted_dataset = [convert_to_conversation(sample) for sample in train_data]
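A quick sanity check on the converted dataset (index 0 is arbitrary):

print(f"{len(converted_dataset)} training samples")
print(converted_dataset[0]["messages"][0]["content"][0])  # the user-turn instruction
print(converted_dataset[0]["messages"][1]["content"][0])  # the expected label text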

  • Configure the fine-tuning hyperparameters
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model) # Enable for training!

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = converted_dataset,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # effective batch size = 2 * 4 = 8
        warmup_steps = 5,
        # max_steps = 60,
        num_train_epochs = 2, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",     # change to "wandb" for Weights and Biases

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    ),
)
  • Check GPU memory
# Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
  • Start training
trainer_stats = trainer.train()
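trainer.train() returns a TrainOutput object whose metrics dict holds the run statistics, for example:

print(f"Training took {trainer_stats.metrics['train_runtime']:.0f} seconds")
print(f"Final training loss: {trainer_stats.metrics['train_loss']:.4f}")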
  • Save the LoRA model
model.save_pretrained("new_lora_model")  # Local saving
tokenizer.save_pretrained("new_lora_model")
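Before merging, it is worth spot-checking the adapter. The sketch below follows the inference pattern from the unsloth vision notebook linked in the references; test_item.png and the generation settings are illustrative:

from PIL import Image

FastVisionModel.for_inference(model)  # switch the model to inference mode

image = Image.open("test_item.png").convert("RGB")  # hypothetical test screenshot
messages = [{"role": "user", "content": [
    {"type": "image"},
    {"type": "text", "text": instruction}]}]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(image, input_text, add_special_tokens=False,
                   return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=512, use_cache=True)
print(tokenizer.batch_decode(output, skip_special_tokens=True)[0])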
  • Merge the model
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from peft import PeftModel
import torch

import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model_name_or_path", type=str,default='D:\\model\\Qwen2-VL-2B-Instruct')
    parser.add_argument("--peft_model_path", type=str,default='D:\\work\\ocr\\qwen2_loar\\unsloth_try\\new_lora_model')
    parser.add_argument("--output_dir", type=str,default='D:\\work\\ocr\\qwen2_loar\\unsloth_try\\lingshi_qwen2_ocr_new')
    parser.add_argument("--device", type=str, default="auto")
    parser.add_argument("--push_to_hub", action="store_false")

    return parser.parse_args()

def main():
    args = get_args()

    if args.device == 'auto':
        device_arg = { 'device_map': 'auto' }
    else:
        device_arg = { 'device_map': { "": args.device} }

    print(f"Loading base model: {args.base_model_name_or_path}")
    base_model = Qwen2VLForConditionalGeneration.from_pretrained(
        args.base_model_name_or_path,
        return_dict=True,
        torch_dtype=torch.float16,
        **device_arg
    )

    print(f"Loading PEFT: {args.peft_model_path}")
    model = PeftModel.from_pretrained(base_model, args.peft_model_path, **device_arg)
    print(f"Running merge_and_unload")
    model = model.merge_and_unload()

    tokenizer = AutoProcessor.from_pretrained(args.base_model_name_or_path)

    # if args.push_to_hub:
    #     print(f"Saving to hub ...")
    #     model.push_to_hub(f"{args.output_dir}", use_temp_dir=False)
    #     tokenizer.push_to_hub(f"{args.output_dir}", use_temp_dir=False)
    # else:
    model.save_pretrained(f"{args.output_dir}")
    tokenizer.save_pretrained(f"{args.output_dir}")
    print(f"Model saved to {args.output_dir}")

if __name__ == "__main__" :
    main()
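After the merge, the output directory contains a standalone 16-bit model that loads without PEFT. A quick load check, reusing the output path from above:

from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import torch

merged_dir = "D:/work/ocr/qwen2_loar/unsloth_try/lingshi_qwen2_ocr_new"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    merged_dir, torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained(merged_dir)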

References:

Models:

https://huggingface.co/JackChew/Qwen2-VL-2B-OCR

https://huggingface.co/unsloth/Qwen2-VL-2B-Instruct

Training code:

https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_VL_(7B)-Vision.ipynb

Model merge code:

https://github.com/unslothai/unsloth/issues/1352
