from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# Load the model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load and preprocess an image
img = Image.open("data/input_image.png").convert("RGB")
inputs = processor(img, return_tensors="pt")

# Generate caption
out = model.generate(**inputs)
caption = processor.decode(out[0], skip_special_tokens=True)

# Print the generated caption
print(caption)
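
# Optionally, BLIP also supports conditional captioning: passing a short text
# prompt along with the image steers the generated caption. This is a minimal
# sketch reusing the processor and model loaded above; the prompt string here
# is only an illustrative example.
prompt = "a photography of"
conditional_inputs = processor(img, prompt, return_tensors="pt")
conditional_out = model.generate(**conditional_inputs)
print(processor.decode(conditional_out[0], skip_special_tokens=True))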