File size: 2,034 Bytes
e291546
 
 
 
 
 
6664cab
112d29e
c4a118e
e291546
2361b06
6664cab
c4a118e
e291546
c4a118e
 
 
 
8157279
6664cab
c4a118e
 
6664cab
c4a118e
e291546
6664cab
c4a118e
e291546
c4a118e
 
 
38932cd
c4a118e
 
38932cd
c4a118e
cb872a6
68bcd32
c4a118e
 
 
 
 
 
cb872a6
 
c4a118e
 
 
cb872a6
 
 
 
 
 
c4a118e
cb872a6
6664cab
c4a118e
e291546
 
 
 
 
 
 
c4a118e
e291546
 
c4a118e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import torch
from unsloth import FastVisionModel
from peft import PeftModel
from transformers import AutoProcessor
from PIL import Image
import gradio as gr

# Base LLaMA 3.2 vision model and the LoRA adapter repo to apply on top of it.
model_name = "unsloth/Llama-3.2-11B-Vision-Instruct"
lora_repo = "alinasdkey/unsloth-pret-lora"

# Load base model and processor.
# unsloth's FastVisionModel.from_pretrained returns a (model, processor) pair;
# the model is loaded 8-bit quantized (load_in_8bit=True) and sharded across
# available devices by accelerate (device_map="auto").
model, processor = FastVisionModel.from_pretrained(
    model_name=model_name,
    device_map="auto",
    load_in_4bit=False,   # 4-bit explicitly disabled in favor of 8-bit below
    load_in_8bit=True,
)

# Apply the fine-tuned LoRA adapter weights onto the base model via PEFT.
model = PeftModel.from_pretrained(model, model_id=lora_repo)

# Switch unsloth's optimized inference mode on (disables training-only paths).
FastVisionModel.for_inference(model)

# Inference function
def describe_image(image, instruction):
    """Generate a text description of an uploaded graph image.

    Args:
        image: PIL image from the Gradio image widget; Gradio passes None
            when nothing was uploaded.
        instruction: Free-text prompt from the textbox; an empty/None value
            falls back to a default instruction.

    Returns:
        The model's decoded answer as a stripped string (the input prompt
        is not echoed back).
    """
    # Gradio sends None when no image is uploaded; fail gracefully instead
    # of raising AttributeError on .convert().
    if image is None:
        return "Please upload an image."

    image = image.convert("RGB")

    # Default instruction when the textbox is empty.
    prompt = instruction if instruction else "Describe this graph."

    # Llama 3.2 Vision (Mllama) needs the <|image|> placeholder in the text
    # so the image features are cross-attended with the prompt. Build the
    # prompt with the processor's chat template rather than raw text.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt},
        ],
    }]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

    # Tokenize text and image together so pixel values, aspect-ratio ids/masks
    # and input ids are aligned in one BatchFeature.
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,  # the chat template already added them
        return_tensors="pt",
    ).to(model.device)

    # Greedy decoding. temperature/top_p were removed: they are ignored
    # (and warned about) by transformers when do_sample=False.
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
    )

    # Slice off the prompt tokens so only newly generated text is returned;
    # decoding outputs[0] whole would echo the prompt before the answer.
    generated = outputs[0][inputs["input_ids"].shape[-1]:]
    return processor.tokenizer.decode(generated, skip_special_tokens=True).strip()

# Wire the inference function into a Gradio web UI and start serving.
demo = gr.Interface(
    fn=describe_image,
    inputs=[
        gr.Image(type="pil", label="Upload a Graph Image"),
        gr.Textbox(label="Instruction (e.g. Summarize this graph)"),
    ],
    outputs="text",
    title="Welcome to the Graph Description AI: Pret",
    description="Upload a graph and get insightful analysis!",
)
demo.launch()