ai-building-blocks / automatic_speech_recognition.py
LiKenun's picture
Move environment variable querying code out of the Gradio UI-construction functions all the way to the root of the application, `app.py`
55d79e2
raw
history blame
3.14 kB
from functools import partial
from huggingface_hub import InferenceClient
from os import path, unlink
import gradio as gr
from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio
def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes]) -> str:
    """Transcribe audio to text using the Hugging Face Inference API.

    The audio is resampled to the model's expected sample rate, written to a
    temporary WAV file (the InferenceClient accepts a file path), sent to the
    Inference API for transcription, and the temporary file is removed.

    Args:
        client: Hugging Face InferenceClient instance for API calls.
        model: Hugging Face model ID to use for automatic speech recognition.
        audio: Tuple containing:
            - int: Sample rate of the input audio (e.g., 44100 Hz)
            - bytes: Raw audio data as bytes

    Returns:
        String containing the transcribed text from the audio.

    Note:
        - Audio is automatically resampled to match the model's expected sample rate.
        - The temporary file is removed on a best-effort basis even when the
          API call raises.
    """
    temp_file_path: str | None = None
    try:
        sample_rate = get_model_sample_rate(model)
        temp_file_path = save_audio_to_temp_file(sample_rate, audio)
        result = client.automatic_speech_recognition(temp_file_path, model=model)
        return result["text"]
    finally:
        # Clean up the temporary file; `temp_file_path` is still None if
        # get_model_sample_rate or save_audio_to_temp_file raised.
        if temp_file_path and path.exists(temp_file_path):
            try:
                unlink(temp_file_path)
            except OSError:
                # Narrowed from `except Exception`: only filesystem errors are
                # expected here, and cleanup failure must not mask the result
                # (or the original exception) from the try block.
                pass
def create_asr_tab(client: InferenceClient, model: str):
    """Build the automatic speech recognition tab of the Gradio interface.

    Components created, in order:
    - A textbox for an audio URL plus a button that fetches the audio into
      the audio component via `request_audio`.
    - An audio component for uploaded, recorded, or fetched audio.
    - A "Transcribe" button wired to `automatic_speech_recognition` with the
      given client and model bound in, writing into the output textbox.

    Args:
        client: Hugging Face InferenceClient instance to pass to the automatic_speech_recognition function.
        model: Hugging Face model ID to use for automatic speech recognition.
    """
    gr.Markdown("Transcribe audio to text.")

    url_box = gr.Textbox(label="Audio URL")
    fetch_button = gr.Button("Get Audio")
    audio_widget = gr.Audio(label="Audio")
    transcribe_button = gr.Button("Transcribe")
    text_out = gr.Textbox(label="Text")

    # Fetching a URL populates the audio component.
    fetch_button.click(
        fn=request_audio,
        inputs=url_box,
        outputs=audio_widget,
    )
    # Transcription runs with the client and model pre-bound.
    transcribe_button.click(
        fn=partial(automatic_speech_recognition, client, model),
        inputs=audio_widget,
        outputs=text_out,
    )