I. Contents
- Blip2 model
- Trying out the model
- no trainer + LoRA fine-tuning
- trainer + LoRA fine-tuning
II. Implementation
- Blip2 model
A primer on multimodal image-to-text:
Core idea: fuse the image's vector features with the text's vectors.
Architecture: an image encoder + a Q-Former + a GPT-style decoder.
Fusion objectives (a small attention-mask sketch follows this list):
1. Vision-to-text generation
The text and image vectors learned by the Q-Former are passed through a fully connected layer (a single Linear, 768 dims to 2560 dims) and then fed into the large language model, which predicts the text output.
Common LLM architectures are decoder-only (ChatGPT, LLaMA, etc., currently the more common choice) and encoder-decoder (Google's T5, Tsinghua's GLM).
2. Image-Text Matching (ITM)
A binary classification task: predict 1 if the image and the text description are a matching pair, 0 otherwise.
In this step the image and text pass through the Q-Former's BERT together: the text token ids and the queries are embedded jointly and interact with the image. The queries and the text can see each other here, so the queries learn multimodal information.
3. Image-Text Contrastive learning (ITC)
This part works like CLIP: the image and the text are each encoded separately by the BERT, the resulting features each pass through a linear layer, and a contrastive alignment loss is computed; see CLIP for details. For the text branch, the cross-attention (CA) layers are inactive here.
In this step the queries and the text cannot see each other, which stops the queries from reading information straight from the text. If the queries could copy from the text, they would learn nothing about the image and the contrastive objective would be meaningless.
4. Image-Grounded Text Generation (ITG)
During decoding, the text can attend to the image information and to the already-decoded text, but the image cannot see the text. Decoding conditions on the text and the image together, so the predicted text can learn information from the image.
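The three training objectives above differ mainly in which attention mask is applied between the learned queries and the text tokens inside the Q-Former. A minimal sketch of those masks (illustrative sizes, not the actual BLIP-2 code):

# Sketch of the Q-Former attention masks; True means "may attend to".
# The joint sequence is [32 learned queries | text tokens].
import torch

num_query, num_text = 32, 8
n = num_query + num_text

def itm_mask():
    # ITM: fully bidirectional -- queries and text all see each other,
    # so the queries pick up multimodal information.
    return torch.ones(n, n, dtype=torch.bool)

def itc_mask():
    # ITC: unimodal -- queries only see queries, text only sees text,
    # so the queries cannot copy information from the caption.
    m = torch.zeros(n, n, dtype=torch.bool)
    m[:num_query, :num_query] = True
    m[num_query:, num_query:] = True
    return m

def itg_mask():
    # ITG: multimodal causal -- every text position sees all queries plus
    # the already-decoded text, but the queries never see the text.
    m = torch.zeros(n, n, dtype=torch.bool)
    m[:num_query, :num_query] = True
    m[num_query:, :num_query] = True
    m[num_query:, num_query:] = torch.tril(torch.ones(num_text, num_text, dtype=torch.bool))
    return m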
References:
https://zhuanlan.zhihu.com/p/649132737?utm_id=0
https://huggingface.co/docs/transformers/main/en/model_doc/blip-2
GitHub: llm-action
- Trying out the model
from PIL import Image
import requests
import torch
import datasets
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from transformers import AutoModelForVision2Seq, AutoProcessor

# dataset = datasets.load_dataset("ybelkada/football-dataset")
# dataset.save_to_disk("football-dataset")
dataset = datasets.load_from_disk("football-dataset")
print(dataset["train"][:2])
device = "cuda" if torch.cuda.is_available() else "cpu"
def test():
    # Load the model and processor
    processor = Blip2Processor.from_pretrained("/home/blip2-opt-2.7b")
    model = Blip2ForConditionalGeneration.from_pretrained(
        "/home/blip2-opt-2.7b", device_map={"": 0}, torch_dtype=torch.float16
    )
    # List any frozen parameters (for the base model, none are frozen)
    for k, v in model.named_parameters():
        if v.requires_grad == False:
            print(k, v.requires_grad)

    # Unprompted captioning
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    image.show()
    inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
    print(inputs)
    generated_ids = model.generate(**inputs)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    print(generated_text)

    # Prompted visual question answering
    prompt = "Question: how many cats are there? Answer:"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device="cuda", dtype=torch.float16)
    generated_ids = model.generate(**inputs)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    print(generated_text)
def lora_test():
    from transformers import Blip2Processor, Blip2Model
    import torch
    from peft import LoraConfig, get_peft_model

    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = Blip2Processor.from_pretrained("/home/blip2-opt-2.7b")
    model = Blip2Model.from_pretrained("/home/blip2-opt-2.7b", torch_dtype=torch.float16)
    # With LoRA, all of the original parameters are frozen during training.
    # Note: depending on your peft version you may need to pass target_modules
    # explicitly (e.g. ["q_proj", "k_proj"]) for BLIP-2.
    config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
    )
    # Wrap the model with PEFT and print the number of trainable parameters
    model = get_peft_model(model, config)
    model.print_trainable_parameters()
    print("============================")
    model.to(device)
    # Inspect the parameters: after LoRA wrapping, the original weights are frozen
    for k, v in model.named_parameters():
        if v.requires_grad == False:
            print(k, v.requires_grad)
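As a quick sanity check (a sketch, assuming model is the PEFT-wrapped model from lora_test above), the trainable/total counts can be recomputed by hand and should match print_trainable_parameters():

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable: {trainable} / {total} ({100 * trainable / total:.4f}%)")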
- no trainer + LoRA fine-tuning
Inputs: input_ids = labels (the caption is both input and target); pixel_values: the image's pixel features.
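Why input_ids can double as labels: causal language models shift the labels internally by one position, so token t is predicted from the tokens before t (plus, in BLIP-2, the projected image queries). A rough illustration of that shifted loss (not the library's exact internals):

import torch
import torch.nn.functional as F

vocab = 100
logits = torch.randn(1, 5, vocab)         # (batch, seq, vocab) from the LM head
labels = torch.randint(0, vocab, (1, 5))  # the same ids that were fed in
shift_logits = logits[:, :-1, :]          # predictions for positions 1..T-1
shift_labels = labels[:, 1:]              # targets are the next tokens
loss = F.cross_entropy(shift_logits.reshape(-1, vocab), shift_labels.reshape(-1))

The full fine-tuning script: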
import torch
import datasets
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForVision2Seq, AutoProcessor
import argparse
from peft import LoraConfig, get_peft_model
import os


class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.processor(images=item["image"], padding="max_length", return_tensors="pt")
        # remove batch dimension
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        encoding["text"] = item["text"]
        return encoding
def main():
    parser = argparse.ArgumentParser(description="BLIP-2 LoRA fine-tuning example")
    parser.add_argument("--pretrain-model-path", dest="pretrain_model_path", required=False, type=str,
                        default="/home/blip2-opt-2.7b", help="path to the pretrained model")
    parser.add_argument("--output-path", type=str, default="output", help="model output path")
    args = parser.parse_args()
    output_path = args.output_path
    pretrain_model_path = args.pretrain_model_path
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    peft_model_id = output_path

    # Load the model and processor using `transformers`
    model = AutoModelForVision2Seq.from_pretrained(pretrain_model_path)
    processor = AutoProcessor.from_pretrained(pretrain_model_path)

    # Define the LoraConfig
    config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
    )
    # Wrap the model with PEFT and print the number of trainable parameters
    model = get_peft_model(model, config)
    model.print_trainable_parameters()
    model.cuda()

    # Load the dataset
    # dataset = load_dataset("ybelkada/football-dataset", split="train")
    def collator(batch):
        # stack the image tensors; tokenize and pad the raw texts
        processed_batch = {}
        for key in batch[0].keys():
            if key != "text":
                processed_batch[key] = torch.stack([example[key] for example in batch])
            else:
                text_inputs = processor.tokenizer(
                    [example["text"] for example in batch], padding=True, return_tensors="pt"
                )
                processed_batch["input_ids"] = text_inputs["input_ids"]
                processed_batch["attention_mask"] = text_inputs["attention_mask"]
        return processed_batch

    dataset = datasets.load_from_disk("football-dataset")["train"]
    train_dataset = ImageCaptioningDataset(dataset, processor)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=2, collate_fn=collator)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model.train()
    loss_list = []
    for epoch in range(11):
        print("Epoch:", epoch)
        sum_loss_list = []
        for idx, batch in enumerate(train_dataloader):
            input_ids = batch.pop("input_ids").to(device)
            # cast pixel_values to the model's dtype (fp32 here; a float16 cast
            # only matches a model that was loaded in half precision)
            pixel_values = batch.pop("pixel_values").to(device, model.dtype)
            # the language model's input doubles as its target
            outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
            loss = outputs.loss
            print("Loss:", loss.item())
            sum_loss_list.append(float(loss.item()))
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if idx % 10 == 0:
                generated_output = model.generate(pixel_values=pixel_values)
                print(processor.batch_decode(generated_output, skip_special_tokens=True))
        avg_sum_loss = sum(sum_loss_list) / len(sum_loss_list)
        print("epoch: ", epoch, "loss: ", float(avg_sum_loss))
        loss_list.append(float(avg_sum_loss))

    print("model_output:", peft_model_id)
    model.save_pretrained(peft_model_id)


if __name__ == "__main__":
    main()
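Assuming the script above is saved as tune_lora.py (the filename is not given here), a minimal run looks like:
python tune_lora.py --pretrain-model-path /home/blip2-opt-2.7b --output-path output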
Inference:
from transformers import AutoModelForVision2Seq, Blip2Processor
from peft import PeftModel, PeftConfig
import torch
import datasets

peft_model_id = "output"
config = PeftConfig.from_pretrained(peft_model_id)
processor = Blip2Processor.from_pretrained(config.base_model_name_or_path)
model = AutoModelForVision2Seq.from_pretrained(config.base_model_name_or_path, device_map="auto")
model = PeftModel.from_pretrained(model, peft_model_id)

dataset = datasets.load_from_disk("football-dataset")["train"]
item = dataset[0]
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval()

encoding = processor(images=item["image"], padding="max_length", return_tensors="pt")
encoding = {k: v.squeeze() for k, v in encoding.items()}
encoding["text"] = item["text"]
print(encoding.keys())

# rebuild a batch of size 1, mirroring the training collator
processed_batch = {}
for key in encoding.keys():
    if key != "text":
        processed_batch[key] = torch.stack([example[key] for example in [encoding]])
    else:
        text_inputs = processor.tokenizer(
            [example["text"] for example in [encoding]], padding=True, return_tensors="pt"
        )
        processed_batch["input_ids"] = text_inputs["input_ids"]
        processed_batch["attention_mask"] = text_inputs["attention_mask"]

# cast to the model's dtype (the base model is loaded in fp32 here)
pixel_values = processed_batch.pop("pixel_values").to(device, model.dtype)
print("----------")
generated_output = model.generate(pixel_values=pixel_values)
print(processor.batch_decode(generated_output, skip_special_tokens=True))
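Optionally, recent peft versions can fold the LoRA weights back into the base model for standalone deployment (a sketch; verify your peft version provides merge_and_unload):

merged_model = model.merge_and_unload()
merged_model.save_pretrained("output-merged")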
- trainer + LoRA fine-tuning
#!/usr/bin/env bash
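# Note: --remove_unused_columns=False is required; the Trainer would otherwise
# drop the raw "text" column that the collator tokenizes.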
python tune_lora_trainer.py \
--output_dir output \
--model_name_or_path /home/blip2-opt-2.7b \
--dataset_name football-dataset \
--remove_unused_columns=False \
--do_train \
--do_eval \
--per_device_train_batch_size=2 \
--per_device_eval_batch_size=2 \
--learning_rate="5e-5" \
--warmup_steps="0" \
--weight_decay 0.1 \
--overwrite_output_dir \
--max_steps 70 \
--logging_steps=5
Training script:
#!/usr/bin/env python
"""
Fine-tune BLIP-2 with LoRA for image captioning using the Hugging Face Trainer.
Adapted from the transformers contrastive-image-text example, which is why some
of that example's arguments survive below.
"""
import logging
import os
import sys
import warnings
from dataclasses import dataclass, field
from typing import Optional
import datasets
import torch
import transformers
from transformers import (
HfArgumentParser,
Trainer,
TrainingArguments,
set_seed,
AutoProcessor,
AutoModelForVision2Seq
)
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
from peft import LoraConfig, get_peft_model
logger = logging.getLogger(__name__)
#check_min_version("4.36.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    token: str = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    use_auth_token: bool = field(
        default=None,
        metadata={
            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
                "execute code present on the Hub on your local machine."
            )
        },
    )
    freeze_vision_model: bool = field(
        default=False, metadata={"help": "Whether to freeze the vision model parameters or not."}
    )
    freeze_text_model: bool = field(
        default=False, metadata={"help": "Whether to freeze the text model parameters or not."}
    )
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    data_dir: Optional[str] = field(default=None, metadata={"help": "The data directory containing input files."})
    image_column: Optional[str] = field(
        default="image_path",
        metadata={"help": "The name of the column in the datasets containing the full image file paths."},
    )
    caption_column: Optional[str] = field(
        default="caption",
        metadata={"help": "The name of the column in the datasets containing the image captions."},
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a jsonlines file)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file (a jsonlines file)."},
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input testing data file (a jsonlines file)."},
    )
    max_seq_length: Optional[int] = field(
        default=128,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")
        else:
            if self.train_file is not None:
                extension = self.train_file.split(".")[-1]
                assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            if self.validation_file is not None:
                extension = self.validation_file.split(".")[-1]
                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
def main():
    # 1. Parse input arguments
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    print(training_args)

    # 2. Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    if training_args.should_log:
        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
        transformers.utils.logging.set_verbosity_info()
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log the small summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # 3. Load the dataset
    dataset = datasets.load_from_disk("football-dataset")

    # 4. Load the pretrained model and processor
    processor = AutoProcessor.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForVision2Seq.from_pretrained(model_args.model_name_or_path)
    print(model.config)
    # 5. Apply LoRA
    config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
    )
    # Wrap the model with PEFT and print the number of trainable parameters
    model = get_peft_model(model, config)
    model.print_trainable_parameters()
    model.cuda()

    def _freeze_params(module):
        for param in module.parameters():
            param.requires_grad = False

    if model_args.freeze_vision_model:
        _freeze_params(model.vision_model)
    if model_args.freeze_text_model:
        _freeze_params(model.text_model)

    # set seed for torch dataloaders
    set_seed(training_args.seed)
    # 6. Preprocess the dataset: encode the images here; the captions are tokenized later in the collator.
    def tokenize_captions(examples):
        results = {"pixel_values": [], "text": []}
        for idx in range(len(examples["image"])):
            encoding = processor(images=examples["image"][idx], padding="max_length", return_tensors="pt")
            # remove batch dimension
            encoding = {k: v.squeeze() for k, v in encoding.items()}
            results["pixel_values"].append(encoding["pixel_values"])
            results["text"].append(examples["text"][idx])
        return results

    train_dataset = dataset["train"].map(
        function=tokenize_captions,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Running tokenizer on train dataset",
        remove_columns=["image"],
    )
    def collate_fn(examples):
        batch = examples
        processed_batch = {}
        for key in batch[0].keys():
            if key != "text":
                # datasets.map stored pixel_values as nested lists; turn them back into tensors
                processed_batch[key] = torch.stack([torch.tensor(example[key]) for example in batch])
            else:
                text_inputs = processor.tokenizer(
                    [example[key] for example in batch], padding=True, return_tensors="pt"
                )
                processed_batch["input_ids"] = text_inputs["input_ids"]
                processed_batch["attention_mask"] = text_inputs["attention_mask"]
                # as in the no-trainer script, the caption ids double as the labels
                processed_batch["labels"] = text_inputs["input_ids"]
        return processed_batch
    # 7. Initialize our trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=None,
        data_collator=collate_fn,
    )

    # 8. Training
    if training_args.do_train:
        checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()
        print("============================")
        model.save_pretrained(training_args.output_dir)


if __name__ == "__main__":
    main()
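The LoRA adapter saved to training_args.output_dir can be loaded for inference exactly as in the no-trainer section above, via PeftConfig.from_pretrained plus PeftModel.from_pretrained.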