import gradio as gr from transformers import AutoProcessor, AutoModelForVision2Seq from PIL import Image import torch # Load the model and processor model_name = "ds4sd/SmolDocling-256M-preview" processor = AutoProcessor.from_pretrained(model_name) model = AutoModelForVision2Seq.from_pretrained( model_name, torch_dtype=torch.bfloat16 ).to("cuda" if torch.cuda.is_available() else "cpu") # Define the inference function def process_image(image): inputs = processor(images=image, return_tensors="pt").to(model.device) outputs = model.generate(**inputs, max_new_tokens=1024) result = processor.batch_decode(outputs, skip_special_tokens=True)[0] return result # Create the Gradio interface iface = gr.Interface( fn=process_image, inputs=gr.inputs.Image(type="pil"), outputs="text", title="SmolDocling Document Conversion", description="Upload an image of a document page to convert it to structured text." ) if __name__ == "__main__": iface.launch()