# "openai/clip-vit-base-patch32": [1, 512]
# "http://bj.bcebos.com/paddlenlp/models/community/openai/clip-vit-base-patch32/model_state.pdparams",
# "openai/clip-rn50": [1, 1024]
# "http://bj.bcebos.com/paddlenlp/models/community/openai/clip-rn50/model_state.pdparams",
# "openai/clip-rn101": [1, 512]
# "http://bj.bcebos.com/paddlenlp/models/community/openai/clip-rn101/model_state.pdparams",
# "openai/clip-vit-large-patch14": [1, 768]
# "http://bj.bcebos.com/paddlenlp/models/community/openai/clip-vit-large-patch14/model_state.pdparams",
Paddle version
from PIL import Image
from paddlenlp.transformers import CLIPProcessor, CLIPModel

# Load the CLIP model and its matching processor
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Preprocess the image and extract image features, shape [1, 768] for ViT-L/14
image = Image.open("xxx.jpg")
inputs = processor(images=image, return_tensors="pd")
image_features = model.get_image_features(**inputs)
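
To score captions against the image, text features come from the same model. A minimal sketch, assuming PaddleNLP's CLIPProcessor mirrors the Hugging Face text API and using a hypothetical candidate_captions list (not in the original):

import paddle
import paddle.nn.functional as F

# Hypothetical captions to rank against the image (illustration only)
candidate_captions = ["a photo of a cat", "a photo of a dog"]

text_inputs = processor(text=candidate_captions, return_tensors="pd", padding=True)
text_features = model.get_text_features(**text_inputs)

# L2-normalize both sides so the dot product is cosine similarity
img = F.normalize(image_features, axis=-1)
txt = F.normalize(text_features, axis=-1)
similarity = paddle.matmul(img, txt, transpose_y=True)  # shape [1, num_captions]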
PyTorch version
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Load the CLIP model and processor, moving the model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model_ID = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_ID).to(device)
processor = CLIPProcessor.from_pretrained(model_ID)

# Preprocess the image and extract image features, shape [1, 512] for ViT-B/32
image = Image.open('frames_60.jpg')
inputs = processor(images=image, return_tensors="pt")
image_features = model.get_image_features(**inputs.to(device))
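
The same cosine-similarity scoring works on the PyTorch side. A minimal sketch, again with a hypothetical candidate_captions list (not in the original):

import torch.nn.functional as F

# Hypothetical captions to rank against the image (illustration only)
candidate_captions = ["a photo of a cat", "a photo of a dog"]

text_inputs = processor(text=candidate_captions, return_tensors="pt", padding=True)
with torch.no_grad():
    text_features = model.get_text_features(**text_inputs.to(device))

# L2-normalize both sides so the dot product is cosine similarity
img = F.normalize(image_features, dim=-1)
txt = F.normalize(text_features, dim=-1)
similarity = img @ txt.T  # shape [1, num_captions]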