import gc
from os import getenv

from PIL.Image import Image
from transformers import AutoProcessor, BlipForConditionalGeneration

from utils import get_pytorch_device, spaces_gpu


@spaces_gpu
def image_to_text(image: Image) -> list[str]:
    """Generate a short text caption for *image* with a BLIP captioning model.

    The model checkpoint is selected via the ``IMAGE_TO_TEXT_MODEL``
    environment variable and loaded onto the device reported by
    ``get_pytorch_device()``.

    Args:
        image: The PIL image to caption.

    Returns:
        The decoded caption strings produced by beam-search generation
        (one entry per generated sequence).

    Raises:
        RuntimeError: If the ``IMAGE_TO_TEXT_MODEL`` environment variable
            is not set.
    """
    model_id = getenv("IMAGE_TO_TEXT_MODEL")
    if not model_id:
        # Fail fast with a clear message instead of passing None into
        # from_pretrained, which would raise a confusing library error.
        raise RuntimeError("IMAGE_TO_TEXT_MODEL environment variable is not set")

    device = get_pytorch_device()
    processor = AutoProcessor.from_pretrained(model_id)
    model = BlipForConditionalGeneration.from_pretrained(model_id).to(device)

    inputs = processor(images=image, return_tensors="pt").to(device)
    # Short captions: beam search (3 beams) bounded to 5-20 tokens.
    generated_ids = model.generate(
        pixel_values=inputs.pixel_values,
        num_beams=3,
        max_length=20,
        min_length=5,
    )
    results = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Drop the large tensors/model (including generated_ids, which may live
    # on the accelerator) before returning so memory is reclaimed promptly.
    del model, inputs, generated_ids
    gc.collect()
    return results