from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from IPython import embed
from transformers import CLIPTokenizer
import torch
model_name = "/public_bme/data/breast-10-12/CausalFromText/clip-vit-b-16/"
model = CLIPModel.from_pretrained(model_name)
tokenizer = CLIPTokenizer.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)
texts = ["This is a description of the first image.", "This is a description of the second image."]
images = [Image.open("CLIP.png"), Image.open("CLIP.png")]
inputs_text = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=77)  # CLIP's text encoder accepts at most 77 tokens
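# A possible continuation (sketch, not part of the original script): encode the images
# with the processor and run the full CLIP model to get image-text similarity scores.
# The variable names below (inputs_image, outputs, probs) are illustrative.
inputs_image = processor(images=images, return_tensors="pt")
with torch.no_grad():
    outputs = model(
        input_ids=inputs_text["input_ids"],
        attention_mask=inputs_text["attention_mask"],
        pixel_values=inputs_image["pixel_values"],
    )
probs = outputs.logits_per_image.softmax(dim=-1)  # one row per image, one column per text
print(probs)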