alinasdkey committed (verified)
Commit 7382fb8 · 1 Parent(s): 38932cd

Update app.py

Files changed (1)
  1. app.py +17 -11
app.py CHANGED
@@ -33,35 +33,41 @@ FastVisionModel.for_inference(model)
 #Inference function
 def describe_image(image, instruction):
     try:
-        # Step 1: Prepare multimodal prompt
+        # Step 1: Prepare prompt
+        prompt = instruction.strip() if instruction else "Describe this image."
+
         messages = [
             {
                 "role": "user",
                 "content": [
                     {"type": "image"},
-                    {"type": "text", "text": instruction.strip() if instruction else "Describe this image"}
+                    {"type": "text", "text": prompt}
                 ]
             }
         ]
-        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
 
-        # Step 2: Tokenize prompt
-        input_ids = processor.tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+        prompt_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+
+        # Step 2: Tokenize the prompt text
+        input_ids = processor.tokenizer(prompt_text, return_tensors="pt").input_ids.to(model.device)
 
-        # Step 3: Process image to get pixel values
-        image_inputs = processor(images=image, return_tensors="pt").to(model.device)
+        # Step 3: Process image to get pixel values + aspect ratio IDs
+        image_inputs = processor(image, return_tensors="pt").to(model.device)
+        pixel_values = image_inputs["pixel_values"]
+        aspect_ratio_ids = image_inputs["aspect_ratio_ids"]
 
-        # Step 4: Generate output using correct arguments
+        # Step 4: Generate
         outputs = model.generate(
             input_ids=input_ids,
-            pixel_values=image_inputs["pixel_values"],
+            pixel_values=pixel_values,
+            aspect_ratio_ids=aspect_ratio_ids,
             max_new_tokens=256,
             do_sample=False,
             temperature=0.2,
-            top_p=0.95,
+            top_p=0.95
         )
 
-        # Step 5: Decode the output
+        # Step 5: Decode
         return processor.tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
 
     except Exception as e:
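
For reference, the same flow can also be written with a single processor call. The sketch below is a minimal alternative, not the code in this repo: it assumes the checkpoint is a Llama 3.2 Vision (Mllama-style) model loaded through Unsloth's FastVisionModel, reuses the module-level model and processor from app.py, and the function name describe_image_single_call is purely illustrative. For such processors, one processor(image, prompt_text, ...) call returns input_ids, attention_mask, pixel_values, aspect_ratio_ids, aspect_ratio_mask and cross_attention_mask together, so model.generate(**inputs, ...) receives every image-conditioning tensor without picking keys by hand.

def describe_image_single_call(image, instruction):
    # Hypothetical helper, assuming a Llama 3.2 Vision (Mllama) processor;
    # `model` and `processor` are the module-level objects from app.py.
    prompt = instruction.strip() if instruction else "Describe this image."
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt}
            ]
        }
    ]
    prompt_text = processor.apply_chat_template(messages, add_generation_prompt=True)

    # One call builds input_ids, attention_mask, pixel_values, aspect_ratio_ids,
    # aspect_ratio_mask and cross_attention_mask; add_special_tokens=False avoids
    # a duplicate BOS token, since apply_chat_template already inserted one.
    inputs = processor(image, prompt_text, add_special_tokens=False, return_tensors="pt").to(model.device)

    # Greedy decoding; temperature/top_p only apply when do_sample=True.
    outputs = model.generate(**inputs, max_new_tokens=256, do_sample=False)
    return processor.tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

The messages/apply_chat_template step is kept so the prompt format matches the commit; only the tokenization and image-processing steps are merged. Note that with do_sample=False the temperature and top_p values kept by this commit are ignored by greedy decoding in transformers, which is why the sketch leaves them out.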