Loren committed on
Commit
454244f
·
verified ·
1 Parent(s): 139db11

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -32
app.py CHANGED
@@ -50,7 +50,7 @@ def process_translate(language: str, audio_path: str) -> str:
50
  outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
51
  decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
52
 
53
- return decoded_outputs
54
 
55
 
56
  def disable_buttons():
@@ -86,15 +86,22 @@ dict_languages = {"English": "en",
86
  #### Gradio interface
87
  with gr.Blocks(title="Voxtral") as voxtral:
88
  gr.Markdown("# Voxtral Mini Evaluation")
89
- gr.Markdown("Voxtral Mini is an enhancement of Ministral 3B, incorporating state-of-the-art audio input \
90
- capabilities while retaining best-in-class text performance. It excels at speech transcription, \
91
- translation and audio understanding.")
92
- btn = gr.Button("🔎 More on Voxtral", variant="huggingface")
93
- with Modal(visible=False, allow_user_close=True) as modal:
94
  gr.Markdown("## Key features:")
95
- gr.Markdown("Voici comment utiliser l’interface…")
96
- # placez ici des composants supplémentaires selon besoin
97
- btn.click(lambda: Modal(visible=True), None, modal)
 
 
 
 
 
 
 
98
 
99
  gr.Markdown("## Upload an audio file, record via microphone, or select a demo file:")
100
  gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription)*")
@@ -114,31 +121,31 @@ with gr.Blocks(title="Voxtral") as voxtral:
114
 
115
  with gr.Row():
116
  with gr.Column():
117
- gr.Button("📝 Transcription", variant="huggingface", interactive=False)
118
- sel_language = gr.Dropdown(
119
- choices=list(dict_languages.keys()),
120
- value="English",
121
- label="Select the language of the audio file:"
122
- )
123
- submit_transcript = gr.Button("Extract transcription", variant="primary")
124
- text_transcript = gr.Textbox(label="💬 Generated transcription", lines=10)
125
 
126
  with gr.Column():
127
- gr.Button("🔁 Translation", variant="huggingface", interactive=False)
128
- sel_translate_language = gr.Dropdown(
129
- choices=list(dict_languages.keys()),
130
- value="English",
131
- label="Select the language for translation:"
132
- )
133
-
134
- submit_translate = gr.Button("Translate audio file", variant="primary")
135
- text_translate = gr.Textbox(label="💬 Generated translation", lines=10)
136
 
137
  with gr.Column():
138
- gr.Button("🤖 Ask audio file", variant="huggingface", interactive=False)
139
- question = gr.Textbox(label="Ask audio file", placeholder="Enter your question about audio file")
140
- submit_chat = gr.Button("Ask audio file:", variant="primary")
141
- text_chat = gr.Textbox(label="💬 Model answer", lines=10)
142
 
143
  ### Processing
144
 
@@ -162,9 +169,9 @@ with gr.Blocks(title="Voxtral") as voxtral:
162
  outputs=[submit_transcript, submit_translate, submit_chat],
163
  trigger_mode="once",
164
  ).then(
165
- fn=process_transcript,
166
  inputs=[sel_translate_language, sel_audio],
167
- outputs=text_transcript
168
  ).then(
169
  enable_buttons,
170
  outputs=[submit_transcript, submit_translate, submit_chat],
 
50
  outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
51
  decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
52
 
53
+ return decoded_outputs[0]
54
 
55
 
56
  def disable_buttons():
 
86
  #### Gradio interface
87
  with gr.Blocks(title="Voxtral") as voxtral:
88
  gr.Markdown("# Voxtral Mini Evaluation")
89
+ gr.Markdown("""### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
90
+ capabilities while retaining best-in-class text performance.
91
+ It excels at speech transcription, translation and audio understanding.""")
92
+
93
+ with gr.Accordion("🔎 More on Voxtral", open=False):
94
  gr.Markdown("## Key features:")
95
+ gr.Markdown("""## **Key Features:**
96
+
97
+ Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
98
+ - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
99
+ - **Long-form context**: With a 32k token context length, Voxtral handles audios up to 30 minutes for transcription, or 40 minutes for understanding
100
+ - **Built-in Q&A and summarization**: Supports asking questions directly through audio. Analyze audio and generate structured summaries without the need for separate ASR and language models
101
+ - **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world’s most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)
102
+ - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
103
+ - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")
104
+
105
 
106
  gr.Markdown("## Upload an audio file, record via microphone, or select a demo file:")
107
  gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription)*")
 
121
 
122
  with gr.Row():
123
  with gr.Column():
124
+ with gr.Accordion("📝 Transcription", open=True):
125
+ sel_language = gr.Dropdown(
126
+ choices=list(dict_languages.keys()),
127
+ value="English",
128
+ label="Select the language of the audio file:"
129
+ )
130
+ submit_transcript = gr.Button("Extract transcription", variant="primary")
131
+ text_transcript = gr.Textbox(label="💬 Generated transcription", lines=10)
132
 
133
  with gr.Column():
134
+ with gr.Accordion("🔁 Translation", open=True):
135
+ sel_translate_language = gr.Dropdown(
136
+ choices=list(dict_languages.keys()),
137
+ value="English",
138
+ label="Select the language for translation:"
139
+ )
140
+
141
+ submit_translate = gr.Button("Translate audio file", variant="primary")
142
+ text_translate = gr.Textbox(label="💬 Generated translation", lines=10)
143
 
144
  with gr.Column():
145
+ with gr.Accordion("🤖 Ask audio file", open=True):
146
+ question = gr.Textbox(label="Ask audio file", placeholder="Enter your question about audio file")
147
+ submit_chat = gr.Button("Ask audio file:", variant="primary")
148
+ text_chat = gr.Textbox(label="💬 Model answer", lines=10)
149
 
150
  ### Processing
151
 
 
169
  outputs=[submit_transcript, submit_translate, submit_chat],
170
  trigger_mode="once",
171
  ).then(
172
+ fn=process_translate,
173
  inputs=[sel_translate_language, sel_audio],
174
+ outputs=text_translate
175
  ).then(
176
  enable_buttons,
177
  outputs=[submit_transcript, submit_translate, submit_chat],