from functools import partial
from os import path, unlink

import gradio as gr
import numpy as np
from huggingface_hub import InferenceClient

from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio


def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
    """Transcribe audio to text using the Hugging Face Inference API.

    This function converts speech audio into a text transcription. The audio is
    resampled to match the model's expected sample rate, saved to a temporary
    file, and then sent to the Inference API for transcription.

    Args:
        client: Hugging Face InferenceClient instance for API calls.
        model: Hugging Face model ID to use for automatic speech recognition.
        audio: Tuple containing:
            - int: Sample rate of the input audio (e.g., 44100 Hz)
            - bytes | np.ndarray: Raw audio data as bytes or a numpy array

    Returns:
        String containing the transcribed text from the audio.

    Note:
        - Audio is automatically resampled to match the model's expected sample rate.
        - Audio is saved as a WAV file for InferenceClient compatibility.
        - Temporary files are cleaned up automatically after transcription.
        - Uses the Inference API to offload model loading and inference to
          Hugging Face's infrastructure, which is more suitable for environments
          with limited GPU memory or time constraints (like Hugging Face Spaces
          with Zero GPU).
    """
    temp_file_path = None
    try:
        target_sample_rate = get_model_sample_rate(model)
        temp_file_path = save_audio_to_temp_file(target_sample_rate, audio)
        result = client.automatic_speech_recognition(temp_file_path, model=model)
        return result["text"]
    finally:
        if temp_file_path and path.exists(temp_file_path):
            try:
                unlink(temp_file_path)
            except Exception:
                pass  # Ignore clean-up errors.


def create_asr_tab(client: InferenceClient, model: str):
    """Create the automatic speech recognition tab in the Gradio interface.

    This function sets up all UI components for automatic speech recognition, including:
    - URL input textbox for fetching audio files from the web
    - Button to retrieve audio from the URL
    - Audio input component for uploading or recording audio
    - Transcribe button and output textbox

    Args:
        client: Hugging Face InferenceClient instance to pass to the
            automatic_speech_recognition function.
        model: Hugging Face model ID to use for automatic speech recognition.
    """
    gr.Markdown("Transcribe audio to text.")
    audio_transcription_url_input = gr.Textbox(label="Audio URL")
    audio_transcription_audio_request_button = gr.Button("Get Audio")
    audio_transcription_audio_input = gr.Audio(label="Audio")
    audio_transcription_audio_request_button.click(
        fn=request_audio,
        inputs=audio_transcription_url_input,
        outputs=audio_transcription_audio_input,
    )
    audio_transcription_generate_button = gr.Button("Transcribe")
    audio_transcription_output = gr.Textbox(label="Text")
    audio_transcription_generate_button.click(
        fn=partial(automatic_speech_recognition, client, model),
        inputs=audio_transcription_audio_input,
        outputs=audio_transcription_output,
    )
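
# --- Hedged usage sketch (illustration only, not part of the original module) ---
# One way create_asr_tab might be mounted inside a gr.Blocks app. The model ID,
# the HF_TOKEN environment variable, and the gr.Tab layout below are assumptions
# made for this sketch; adjust them to match the actual app entry point.
if __name__ == "__main__":
    from os import environ

    # Hypothetical defaults; swap in the model and token handling the app actually uses.
    demo_model = "openai/whisper-large-v3"
    demo_client = InferenceClient(token=environ.get("HF_TOKEN"))

    with gr.Blocks() as demo:
        # create_asr_tab builds its components inside the current Blocks context.
        with gr.Tab("Automatic Speech Recognition"):
            create_asr_tab(demo_client, demo_model)

    demo.launch()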