CoCa script
import open_clip
import torch
from PIL import Image

# Load CoCa (ViT-L/14) with weights fine-tuned for captioning on MSCOCO.
model, _, transform = open_clip.create_model_and_transforms(
    model_name="coca_ViT-L-14",
    pretrained="mscoco_finetuned_laion2B-s13B-b90k"
)

# Preprocess the image and add a batch dimension.
im = Image.open("woman_in_gym.png").convert("RGB")
im = transform(im).unsqueeze(0)

# Generate a caption; autocast is effectively a no-op without CUDA.
with torch.no_grad(), torch.cuda.amp.autocast():
    generated = model.generate(im)

# Strip the special tokens from the decoded caption.
print(open_clip.decode(generated[0]).split("<end_of_text>")[0].replace("<start_of_text>", ""))
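model.generate defaults to beam search. open_clip's CoCa implementation also exposes sampling controls on generate; the argument names below match the implementation at the time of writing, but the signature has changed between releases, so verify against your installed version. A sketch, reusing model and im from the script above:

# Sketch: nucleus sampling instead of the default beam search.
# Argument names follow open_clip's CoCa generate(); verify them against
# your installed open_clip version.
with torch.no_grad():
    generated = model.generate(
        im,
        generation_type="top_p",  # also accepts "beam_search" (default) and "top_k"
        top_p=0.9,                # nucleus-sampling probability mass
        seq_len=30,               # maximum caption length in tokens
    )
print(open_clip.decode(generated[0]).split("<end_of_text>")[0].replace("<start_of_text>", ""))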
BLIP script
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

img_path = "woman_in_gym.png"
# Adjust the GPU index to your machine; falls back to CPU if CUDA is unavailable.
device = torch.device("cuda:6" if torch.cuda.is_available() else "cpu")

# Load the BLIP captioning processor and model.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)

# Unconditional captioning: image only, no text prompt.
raw_image = Image.open(img_path).convert("RGB")
inputs = processor(raw_image, return_tensors="pt").to(device)
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
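BLIP also supports conditional captioning: passing a text prefix steers the generated caption, which continues from the prompt. A minimal sketch, reusing processor, model, raw_image, and device from the script above; the prefix string is just an example:

# Conditional captioning: the caption is generated as a continuation of the prompt.
prompt = "a photography of"  # example prefix; any short phrase works
inputs = processor(raw_image, text=prompt, return_tensors="pt").to(device)
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))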
BLIP-2 script
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Adjust the GPU index to your machine; falls back to CPU if CUDA is unavailable.
device = torch.device("cuda:6" if torch.cuda.is_available() else "cpu")

# Load the BLIP-2 processor and the OPT-2.7B variant of the model.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b").to(device)

# Image-only input produces a plain caption.
image = Image.open("woman_in_gym.png").convert("RGB")
inputs = processor(images=image, return_tensors="pt").to(device)
generated_ids = model.generate(**inputs, max_new_tokens=100)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(generated_text)
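Because blip2-opt-2.7b pairs the vision encoder with an OPT language model, the same checkpoint can answer free-form questions about the image when given a text prompt. A minimal sketch, reusing processor, model, image, and device from above; the question is an example, and the "Question: ... Answer:" template follows the Hugging Face BLIP-2 examples:

# Visual question answering with the same model: prepend a text prompt.
prompt = "Question: what is the person doing? Answer:"  # example question
inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
generated_ids = model.generate(**inputs, max_new_tokens=30)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())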
Output comparison