I. Contents
- Blip2 model
- Trying out the model
- no trainer + LoRA fine-tuning
- trainer + LoRA fine-tuning
II. Implementation
- Blip2 model
A primer on multimodal image-to-text:
Core idea: fuse the image's vector features with the text's vectors.
Architecture: an image encoder + a Q-Former + a GPT-style decoder.
Fusion objectives (a small attention-mask sketch follows this list):
1. Vision-to-text generation
The text and image vectors learned by the Q-Former are passed through a fully connected layer (a single Linear, 768 dims to 2560 dims) and then fed into the large language model, which predicts the text output.
Common LLM architectures are decoder-only (ChatGPT, LLaMA, etc., currently the more common choice) and encoder-decoder (Google's T5, Tsinghua's GLM).
2. Image-Text Matching (ITM)
A binary classification task: predict 1 if the image and the text description are a matching pair, 0 otherwise.
In this step the image and text pass through the Q-Former's BERT together: the text token ids and the queries are embedded jointly and interact with the image. The queries and the text can see each other here, so the queries learn multimodal information.
3. Image-Text Contrastive learning (ITC)
This part works like CLIP: the image and the text are each encoded separately by the BERT, the resulting features each pass through a linear layer, and a contrastive alignment loss is computed; see CLIP for details. For the text branch, the cross-attention (CA) layers are inactive here.
In this step the queries and the text cannot see each other, which stops the queries from reading information straight from the text. If the queries could copy from the text, they would learn nothing about the image and the contrastive objective would be meaningless.
4. Image-Grounded Text Generation (ITG)
During decoding, the text can attend to the image information and to the already-decoded text, but the image cannot see the text. Decoding conditions on the text and the image together, so the predicted text can learn information from the image.
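The three training objectives above differ mainly in which attention mask is applied between the learned queries and the text tokens inside the Q-Former. A minimal sketch of those masks (illustrative sizes, not the actual BLIP-2 code):

# Sketch of the Q-Former attention masks; True means "may attend to".
# The joint sequence is [32 learned queries | text tokens].
import torch

num_query, num_text = 32, 8
n = num_query + num_text

def itm_mask():
    # ITM: fully bidirectional -- queries and text all see each other,
    # so the queries pick up multimodal information.
    return torch.ones(n, n, dtype=torch.bool)

def itc_mask():
    # ITC: unimodal -- queries only see queries, text only sees text,
    # so the queries cannot copy information from the caption.
    m = torch.zeros(n, n, dtype=torch.bool)
    m[:num_query, :num_query] = True
    m[num_query:, num_query:] = True
    return m

def itg_mask():
    # ITG: multimodal causal -- every text position sees all queries plus
    # the already-decoded text, but the queries never see the text.
    m = torch.zeros(n, n, dtype=torch.bool)
    m[:num_query, :num_query] = True
    m[num_query:, :num_query] = True
    m[num_query:, num_query:] = torch.tril(torch.ones(num_text, num_text, dtype=torch.bool))
    return m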
References:
https://zhuanlan.zhihu.com/p/649132737?utm_id=0
https://huggingface.co/docs/transformers/main/en/model_doc/blip-2
GitHub: llm-action
- Trying out the model
from PIL import Image
import requests
import torch
import datasets
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from transformers import AutoModelForVision2Seq, AutoProcessor

# dataset = datasets.load_dataset("ybelkada/football-dataset")
# dataset.save_to_disk("football-dataset")
dataset = datasets.load_from_disk("football-dataset")
print(dataset["train"][:2])
device = "cuda" if torch.cuda.is_available() else "cpu"
def test():
    # Load the model and processor
    processor = Blip2Processor.from_pretrained("/home/blip2-opt-2.7b")
    model = Blip2ForConditionalGeneration.from_pretrained(
        "/home/blip2-opt-2.7b", device_map={"": 0}, torch_dtype=torch.float16
    )
    # List any frozen parameters (for the base model, none are frozen)
    for k, v in model.named_parameters():
        if v.requires_grad == False:
            print(k, v.requires_grad)

    # Unprompted captioning
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    image.show()
    inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
    print(inputs)
    generated_ids = model.generate(**inputs)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    print(generated_text)

    # Prompted visual question answering
    prompt = "Question: how many cats are there? Answer:"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device="cuda", dtype=torch.float16)
    generated_ids = model.generate(**inputs)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    print(generated_text)
def lora_test():
    from transformers import Blip2Processor, Blip2Model
    import torch
    from peft import LoraConfig, get_peft_model

    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = Blip2Processor.from_pretrained("/home/blip2-opt-2.7b")
    model = Blip2Model.from_pretrained("/home/blip2-opt-2.7b", torch_dtype=torch.float16)
    # With LoRA, all of the original parameters are frozen during training.
    # Note: depending on your peft version you may need to pass target_modules
    # explicitly (e.g. ["q_proj", "k_proj"]) for BLIP-2.
    config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
    )
    # Wrap the model with PEFT and print the number of trainable parameters
    model = get_peft_model(model, config)
    model.print_trainable_parameters()
    print("============================")
    model.to(device)
    # Inspect the parameters: after LoRA wrapping, the original weights are frozen
    for k, v in model.named_parameters():
        if v.requires_grad == False:
            print(k, v.requires_grad)
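As a quick sanity check (a sketch, assuming model is the PEFT-wrapped model from lora_test above), the trainable/total counts can be recomputed by hand and should match print_trainable_parameters():

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable: {trainable} / {total} ({100 * trainable / total:.4f}%)")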
- no trainer + LoRA fine-tuning
Inputs: input_ids = labels (the caption is both input and target); pixel_values: the image's pixel features.
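Why input_ids can double as labels: causal language models shift the labels internally by one position, so token t is predicted from the tokens before t (plus, in BLIP-2, the projected image queries). A rough illustration of that shifted loss (not the library's exact internals):

import torch
import torch.nn.functional as F

vocab = 100
logits = torch.randn(1, 5, vocab)         # (batch, seq, vocab) from the LM head
labels = torch.randint(0, vocab, (1, 5))  # the same ids that were fed in
shift_logits = logits[:, :-1, :]          # predictions for positions 1..T-1
shift_labels = labels[:, 1:]              # targets are the next tokens
loss = F.cross_entropy(shift_logits.reshape(-1, vocab), shift_labels.reshape(-1))

The full fine-tuning script: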
import torch
import datasets
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForVision2Seq, AutoProcessor
import argparse
from peft import LoraConfig, get_peft_model
import os


class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.processor(images=item["image"], padding="max_length", return_tensors="pt")
        # remove batch dimension
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        encoding["text"] = item["text"]
        return encoding
def main():
    parser = argparse.ArgumentParser(description="BLIP-2 LoRA fine-tuning example")
    parser.add_argument("--pretrain-model-path", dest="pretrain_model_path", required=False, type=str,
                        default="/home/blip2-opt-2.7b", help="path to the pretrained model")
    parser.add_argument("--output-path", type=str, default="output", help="model output path")
    args = parser.parse_args()
    output_path = args.output_path
    pretrain_model_path = args.pretrain_model_path
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    peft_model_id = output_path

    # Load the model and processor using `transformers`
    model = AutoModelForVision2Seq.from_pretrained(pretrain_model_path)
    processor = AutoProcessor.from_pretrained(pretrain_model_path)

    # Define the LoraConfig
    config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
    )
    # Wrap the model with PEFT and print the number of trainable parameters
    model = get_peft_model(model, config)
    model.print_trainable_parameters()
    model.cuda()

    # Load the dataset
    # dataset = load_dataset("ybelkada/football-dataset", split="train")
    def collator(batch):
        # stack the image tensors; tokenize and pad the raw texts
        processed_batch = {}
        for key in batch[0].keys():
            if key != "text":
                processed_batch[key] = torch.stack([example[key] for example in batch])
            else:
                text_inputs = processor.tokenizer(
                    [example["text"] for example in batch], padding=True, return_tensors="pt"
                )
                processed_batch["input_ids"] = text_inputs["input_ids"]
                processed_batch["attention_mask"] = text_inputs["attention_mask"]
        return processed_batch

    dataset = datasets.load_from_disk("football-dataset")["train"]
    train_dataset = ImageCaptioningDataset(dataset, processor)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=2, collate_fn=collator)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model.train()
    loss_list = []
    for epoch in range(11):
        print("Epoch:", epoch)
        sum_loss_list = []
        for idx, batch in enumerate(train_dataloader):
            input_ids = batch.pop("input_ids").to(device)
            # cast pixel_values to the model's dtype (fp32 here; a float16 cast
            # only matches a model that was loaded in half precision)
            pixel_values = batch.pop("pixel_values").to(device, model.dtype)
            # the language model's input doubles as its target
            outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
            loss = outputs.loss
            print("Loss:", loss.item())
            sum_loss_list.append(float(loss.item()))
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if idx % 10 == 0:
                generated_output = model.generate(pixel_values=pixel_values)
                print(processor.batch_decode(generated_output, skip_special_tokens=True))
        avg_sum_loss = sum(sum_loss_list) / len(sum_loss_list)
        print("epoch: ", epoch, "loss: ", float(avg_sum_loss))
        loss_list.append(float(avg_sum_loss))

    print("model_output:", peft_model_id)
    model.save_pretrained(peft_model_id)


if __name__ == "__main__":
    main()
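Assuming the script above is saved as tune_lora.py (the filename is not given here), a minimal run looks like:
python tune_lora.py --pretrain-model-path /home/blip2-opt-2.7b --output-path output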
Inference:
from transformers import AutoModelForVision2Seq, Blip2Processor
from peft import PeftModel, PeftConfig
import torch
import datasets

peft_model_id = "output"
config = PeftConfig.from_pretrained(peft_model_id)
processor = Blip2Processor.from_pretrained(config.base_model_name_or_path)
model = AutoModelForVision2Seq.from_pretrained(config.base_model_name_or_path, device_map="auto")
model = PeftModel.from_pretrained(model, peft_model_id)

dataset = datasets.load_from_disk("football-dataset")["train"]
item = dataset[0]
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval()

encoding = processor(images=item["image"], padding="max_length", return_tensors="pt")
encoding = {k: v.squeeze() for k, v in encoding.items()}
encoding["text"] = item["text"]
print(encoding.keys())

# rebuild a batch of size 1, mirroring the training collator
processed_batch = {}
for key in encoding.keys():
    if key != "text":
        processed_batch[key] = torch.stack([example[key] for example in [encoding]])
    else:
        text_inputs = processor.tokenizer(
            [example["text"] for example in [encoding]], padding=True, return_tensors="pt"
        )
        processed_batch["input_ids"] = text_inputs["input_ids"]
        processed_batch["attention_mask"] = text_inputs["attention_mask"]

# cast to the model's dtype (the base model is loaded in fp32 here)
pixel_values = processed_batch.pop("pixel_values").to(device, model.dtype)
print("----------")
generated_output = model.generate(pixel_values=pixel_values)
print(processor.batch_decode(generated_output, skip_special_tokens=True))
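Optionally, recent peft versions can fold the LoRA weights back into the base model for standalone deployment (a sketch; verify your peft version provides merge_and_unload):

merged_model = model.merge_and_unload()
merged_model.save_pretrained("output-merged")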
- trainer + LoRA fine-tuning
#!/usr/bin/env bash
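# Note: --remove_unused_columns=False is required; the Trainer would otherwise
# drop the raw "text" column that the collator tokenizes.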
python tune_lora_trainer.py \
--output_dir output \
--model_name_or_path /home/blip2-opt-2.7b \
--dataset_name football-dataset \
--remove_unused_columns=False \
--do_train \
--do_eval \
--per_device_train_batch_size=2 \
--per_device_eval_batch_size=2 \
--learning_rate="5e-5" \
--warmup_steps="0" \
--weight_decay 0.1 \
--overwrite_output_dir \
--max_steps 70 \
--logging_steps=5
Training script:
#!/usr/bin/env python
"""
Fine-tune BLIP-2 with LoRA for image captioning using the Hugging Face Trainer.
Adapted from the transformers contrastive-image-text example, which is why some
of that example's arguments survive below.
"""
import logging
import os
import sys
import warnings
from dataclasses import dataclass, field
from typing import Optional
import datasets
import torch
import transformers
from transformers import (
HfArgumentParser,
Trainer,
TrainingArguments,
set_seed,
AutoProcessor,
AutoModelForVision2Seq
)
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
from peft import LoraConfig, get_peft_model
logger = logging.getLogger(__name__)
#check_min_version("4.36.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    token: str = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    use_auth_token: bool = field(
        default=None,
        metadata={
            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
                "execute code present on the Hub on your local machine."
            )
        },
    )
    freeze_vision_model: bool = field(
        default=False, metadata={"help": "Whether to freeze the vision model parameters or not."}
    )
    freeze_text_model: bool = field(
        default=False, metadata={"help": "Whether to freeze the text model parameters or not."}
    )
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    data_dir: Optional[str] = field(default=None, metadata={"help": "The data directory containing input files."})
    image_column: Optional[str] = field(
        default="image_path",
        metadata={"help": "The name of the column in the datasets containing the full image file paths."},
    )
    caption_column: Optional[str] = field(
        default="caption",
        metadata={"help": "The name of the column in the datasets containing the image captions."},
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a jsonlines file)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file (a jsonlines file)."},
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input testing data file (a jsonlines file)."},
    )
    max_seq_length: Optional[int] = field(
        default=128,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")
        else:
            if self.train_file is not None:
                extension = self.train_file.split(".")[-1]
                assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            if self.validation_file is not None:
                extension = self.validation_file.split(".")[-1]
                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
def main():
    # 1. Parse input arguments
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    print(training_args)

    # 2. Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    if training_args.should_log:
        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
        transformers.utils.logging.set_verbosity_info()
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log the small summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # 3. Load the dataset
    dataset = datasets.load_from_disk("football-dataset")

    # 4. Load the pretrained model and processor
    processor = AutoProcessor.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForVision2Seq.from_pretrained(model_args.model_name_or_path)
    print(model.config)
    # 5. Apply LoRA
    config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
    )
    # Wrap the model with PEFT and print the number of trainable parameters
    model = get_peft_model(model, config)
    model.print_trainable_parameters()
    model.cuda()

    def _freeze_params(module):
        for param in module.parameters():
            param.requires_grad = False

    if model_args.freeze_vision_model:
        _freeze_params(model.vision_model)
    if model_args.freeze_text_model:
        _freeze_params(model.text_model)

    # set seed for torch dataloaders
    set_seed(training_args.seed)
    # 6. Preprocess the dataset: encode the images here; the captions are tokenized later in the collator.
    def tokenize_captions(examples):
        results = {"pixel_values": [], "text": []}
        for idx in range(len(examples["image"])):
            encoding = processor(images=examples["image"][idx], padding="max_length", return_tensors="pt")
            # remove batch dimension
            encoding = {k: v.squeeze() for k, v in encoding.items()}
            results["pixel_values"].append(encoding["pixel_values"])
            results["text"].append(examples["text"][idx])
        return results

    train_dataset = dataset["train"].map(
        function=tokenize_captions,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Running tokenizer on train dataset",
        remove_columns=["image"],
    )
    def collate_fn(examples):
        batch = examples
        processed_batch = {}
        for key in batch[0].keys():
            if key != "text":
                # datasets.map stored pixel_values as nested lists; turn them back into tensors
                processed_batch[key] = torch.stack([torch.tensor(example[key]) for example in batch])
            else:
                text_inputs = processor.tokenizer(
                    [example[key] for example in batch], padding=True, return_tensors="pt"
                )
                processed_batch["input_ids"] = text_inputs["input_ids"]
                processed_batch["attention_mask"] = text_inputs["attention_mask"]
                # as in the no-trainer script, the caption ids double as the labels
                processed_batch["labels"] = text_inputs["input_ids"]
        return processed_batch
    # 7. Initialize our trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=None,
        data_collator=collate_fn,
    )

    # 8. Training
    if training_args.do_train:
        checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()
        print("============================")
        model.save_pretrained(training_args.output_dir)


if __name__ == "__main__":
    main()
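The LoRA adapter saved to training_args.output_dir can be loaded for inference exactly as in the no-trainer section above, via PeftConfig.from_pretrained plus PeftModel.from_pretrained.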