Loren committed
Commit 7edddd6 · verified · 1 Parent(s): 5cb72ae

Upload 2 files

Files changed (2)
  1. app.py +79 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,79 @@
+ import gradio as gr
+ import spaces  # provided on Hugging Face Spaces; needed for the @spaces.GPU decorator
+ import torch
+ from transformers import AutoProcessor, VoxtralForConditionalGeneration
+
+ MAX_TOKENS = 32000
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"*** Device: {device}")
+
+ # Available models (display name -> Hub repo id)
+ dict_models = {'Voxtral-Mini-3B-2507': 'Loren/Voxtral-Mini-3B-2507-dup',
+                'Voxtral-Small-24B-2507': 'Loren/Voxtral-Small-24B-2507-dup'}
+
+ # Load a processor and a model for each repo, in the same order as dict_models
+ list_processor = []
+ list_model = []
+ for repo_id in dict_models.values():
+     list_processor.append(AutoProcessor.from_pretrained(repo_id))
+     list_model.append(VoxtralForConditionalGeneration.from_pretrained(repo_id,
+                                                                       torch_dtype=torch.bfloat16,
+                                                                       device_map=device))
+
+ # Supported languages (display name -> ISO 639-1 code)
+ dict_languages = {"English": "en",
+                   "French": "fr",
+                   "German": "de",
+                   "Spanish": "es",
+                   "Italian": "it",
+                   "Portuguese": "pt",
+                   "Dutch": "nl",
+                   "Hindi": "hi"}
+
+
+ @spaces.GPU
+ def process_transcript(audio_path, model_name, language):
+     """Transcribe the audio file with the selected Voxtral model and return the generated text."""
+     # Look up the processor/model matching the selected display name
+     idx = list(dict_models.keys()).index(model_name)
+     processor = list_processor[idx]
+     model = list_model[idx]
+
+     # Build the transcription request (method name spelling follows the transformers API)
+     inputs = processor.apply_transcrition_request(language=dict_languages[language],
+                                                   audio=audio_path,
+                                                   model_id=dict_models[model_name])
+     inputs = inputs.to(device, dtype=torch.bfloat16)
+
+     outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
+     # Decode only the newly generated tokens (skip the prompt part of the sequence)
+     decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+
+     return decoded_outputs[0]
+
+
+ # Define the Gradio interface
+ with gr.Blocks(title="Transcription") as transcript:
+     gr.Markdown("# Audio Transcription")
+     gr.Markdown("#### Choose the language of the audio and the model, then provide an audio file to get its transcription.")
+     gr.Markdown("#### **(Voxtral handles audio up to 30 minutes for transcription)**")
+
+     with gr.Row():
+         with gr.Column():
+             sel_language = gr.Dropdown(
+                 choices=list(dict_languages.keys()),
+                 value="English",
+                 label="Select the language of the audio file:"
+             )
+
+             sel_model = gr.Radio(choices=list(dict_models.keys()),
+                                  value="Voxtral-Mini-3B-2507",
+                                  label="Select the model:")
+
+         with gr.Column():
+             sel_audio = gr.Audio(sources=["microphone", "upload"], type="filepath",
+                                  label="Upload an audio file or record via microphone:")
+
+             submit_transcript = gr.Button("Extract Transcription", variant="primary")
+
+         with gr.Column():
+             text_transcript = gr.Textbox(label="Generated Response", lines=10)
+
+     # Pass the components themselves; their current values are mapped inside process_transcript
+     submit_transcript.click(
+         fn=process_transcript,
+         inputs=[sel_audio, sel_model, sel_language],
+         outputs=text_transcript
+     )
+
+
+ # Launch the app
+ if __name__ == "__main__":
+     transcript.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ mistral-common
+ git+https://github.com/huggingface/transformers
+ gradio
+ torch
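
Note: with the requirements above installed (plus the spaces package when running outside a Hugging Face Space), the transcription callback can be sanity-checked without launching the Gradio UI. A minimal sketch, assuming the models above have finished loading and a local recording at the hypothetical path sample.wav:

    text = process_transcript("sample.wav", "Voxtral-Mini-3B-2507", "English")
    print(text)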