from transformers import CLIPModel, CLIPProcessor
from PIL import Image
import requests
import torch
# Local path to the model directory
local_model_path = "clip-vit-large-patch14"
# Load the model and processor from local files only (no network access)
model = CLIPModel.from_pretrained(local_model_path, local_files_only=True)
processor = CLIPProcessor.from_pretrained(local_model_path, local_files_only=True)
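# If the weights are not on disk yet, they can be fetched once beforehand.
# A minimal sketch using huggingface_hub (assumes the openai/clip-vit-large-patch14 repo):
#   from huggingface_hub import snapshot_download
#   snapshot_download(repo_id="openai/clip-vit-large-patch14", local_dir=local_model_path)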
# Download and open the example image
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
# Preprocess the candidate captions and the image into model inputs
inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
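# inputs is a dict holding input_ids and attention_mask for the text
# plus pixel_values for the image, ready to be passed to the model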
# Run the model; this is inference only, so gradient tracking is disabled
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # image-text similarity scores
probs = logits_per_image.softmax(dim=1)  # softmax over the candidate captions
# Print the results
print("Logits per image:", logits_per_image)
print("Probabilities:", probs)