import gc
from os import getenv

from PIL.Image import Image
from transformers import AutoProcessor, BlipForConditionalGeneration

from utils import get_pytorch_device, spaces_gpu


@spaces_gpu
def image_to_text(image: Image) -> list[str]:
    """Generate a short text caption for *image* with a BLIP captioning model.

    The model checkpoint is selected via the ``IMAGE_TO_TEXT_MODEL``
    environment variable and loaded onto the device reported by
    ``get_pytorch_device()``.

    Args:
        image: The PIL image to caption.

    Returns:
        The decoded caption strings produced by beam-search generation
        (one entry per generated sequence).

    Raises:
        RuntimeError: If the ``IMAGE_TO_TEXT_MODEL`` environment variable
            is not set.
    """
    model_id = getenv("IMAGE_TO_TEXT_MODEL")
    if not model_id:
        # Fail fast with a clear message instead of passing None into
        # from_pretrained, which would raise a confusing library error.
        raise RuntimeError("IMAGE_TO_TEXT_MODEL environment variable is not set")

    device = get_pytorch_device()
    processor = AutoProcessor.from_pretrained(model_id)
    model = BlipForConditionalGeneration.from_pretrained(model_id).to(device)

    inputs = processor(images=image, return_tensors="pt").to(device)
    # Short captions: beam search (3 beams) bounded to 5-20 tokens.
    generated_ids = model.generate(
        pixel_values=inputs.pixel_values,
        num_beams=3,
        max_length=20,
        min_length=5,
    )
    results = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Drop the large tensors/model (including generated_ids, which may live
    # on the accelerator) before returning so memory is reclaimed promptly.
    del model, inputs, generated_ids
    gc.collect()
    return results