Voxtral_Mini_Evaluation

Running

App Files Community

Loren committed on Jul 25

Commit

454244f

verified ·

1 Parent(s): 139db11

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -32

app.py CHANGED Viewed

@@ -50,7 +50,7 @@ def process_translate(language: str, audio_path: str) -> str:
     outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
     decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-    return decoded_outputs
 def disable_buttons():
@@ -86,15 +86,22 @@ dict_languages = {"English": "en",
 #### Gradio interface
 with gr.Blocks(title="Voxtral") as voxtral:
     gr.Markdown("# Voxtral Mini Evaluation")
-    gr.Markdown("Voxtral Mini is an enhancement of Ministral 3B, incorporating state-of-the-art audio input \
-    capabilities while retaining best-in-class text performance. It excels at speech transcription, \
-    translation and audio understanding.")
-    btn = gr.Button("🔎 More on Voxtral", variant="huggingface")
-    with Modal(visible=False, allow_user_close=True) as modal:
         gr.Markdown("## Key features:")
-        gr.Markdown("Voici comment utiliser l’interface…")
-        # placez ici des composants supplémentaires selon besoin
-    btn.click(lambda: Modal(visible=True), None, modal)
     gr.Markdown("## Upload an audio file, record via microphone, or select a demo file:")
     gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription)*")
@@ -114,31 +121,31 @@ with gr.Blocks(title="Voxtral") as voxtral:
     with gr.Row():
         with gr.Column():
-            gr.Button("📝 Transcription", variant="huggingface", interactive=False)
-            sel_language = gr.Dropdown(
-                choices=list(dict_languages.keys()),
-                value="English",
-                label="Select the language of the audio file:"
-            )
-            submit_transcript = gr.Button("Extract transcription", variant="primary")
-            text_transcript = gr.Textbox(label="💬 Generated transcription", lines=10)
         with gr.Column():
-            gr.Button("🔁 Translation", variant="huggingface", interactive=False)
-            sel_translate_language = gr.Dropdown(
-                choices=list(dict_languages.keys()),
-                value="English",
-                label="Select the language for translation:"
-            )
-            submit_translate = gr.Button("Translate audio file", variant="primary")
-            text_translate = gr.Textbox(label="💬 Generated translation", lines=10)
         with gr.Column():
-            gr.Button("🤖 Ask audio file", variant="huggingface", interactive=False)
-            question = gr.Textbox(label="Ask audio file", placeholder="Enter your question about audio file")
-            submit_chat = gr.Button("Ask audio file:", variant="primary")
-            text_chat = gr.Textbox(label="💬 Model answer", lines=10)
 ### Processing
@@ -162,9 +169,9 @@ with gr.Blocks(title="Voxtral") as voxtral:
         outputs=[submit_transcript, submit_translate, submit_chat],
         trigger_mode="once",
     ).then(
-        fn=process_transcript,
         inputs=[sel_translate_language, sel_audio],
-        outputs=text_transcript
     ).then(
         enable_buttons,
         outputs=[submit_transcript, submit_translate, submit_chat],

     outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
     decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+    return decoded_outputs[0]
 def disable_buttons():
 #### Gradio interface
 with gr.Blocks(title="Voxtral") as voxtral:
     gr.Markdown("# Voxtral Mini Evaluation")
+    gr.Markdown("""### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
+    capabilities while retaining best-in-class text performance.
+    It excels at speech transcription, translation and audio understanding.""")
+    with gr.Accordion("🔎 More on Voxtral", open=False):
         gr.Markdown("## Key features:")
+        gr.Markdown("""## **Key Features:**
+Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
+- **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
+- **Long-form context**: With a 32k token context length, Voxtral handles audios up to 30 minutes for transcription, or 40 minutes for understanding
+- **Built-in Q&A and summarization**: Supports asking questions directly through audio. Analyze audio and generate structured summaries without the need for separate ASR and language models
+- **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world’s most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)
+- **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
+- **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")
     gr.Markdown("## Upload an audio file, record via microphone, or select a demo file:")
     gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription)*")
     with gr.Row():
         with gr.Column():
+            with gr.Accordion("📝 Transcription", open=True):
+                sel_language = gr.Dropdown(
+                    choices=list(dict_languages.keys()),
+                    value="English",
+                    label="Select the language of the audio file:"
+                )
+                submit_transcript = gr.Button("Extract transcription", variant="primary")
+                text_transcript = gr.Textbox(label="💬 Generated transcription", lines=10)
         with gr.Column():
+            with gr.Accordion("🔁 Translation", open=True):
+                sel_translate_language = gr.Dropdown(
+                    choices=list(dict_languages.keys()),
+                    value="English",
+                    label="Select the language for translation:"
+                )
+                submit_translate = gr.Button("Translate audio file", variant="primary")
+                text_translate = gr.Textbox(label="💬 Generated translation", lines=10)
         with gr.Column():
+            with gr.Accordion("🤖 Ask audio file", open=True):
+                question = gr.Textbox(label="Ask audio file", placeholder="Enter your question about audio file")
+                submit_chat = gr.Button("Ask audio file:", variant="primary")
+                text_chat = gr.Textbox(label="💬 Model answer", lines=10)
 ### Processing
         outputs=[submit_transcript, submit_translate, submit_chat],
         trigger_mode="once",
     ).then(
+        fn=process_translate,
         inputs=[sel_translate_language, sel_audio],
+        outputs=text_translate
     ).then(
         enable_buttons,
         outputs=[submit_transcript, submit_translate, submit_chat],