from functools import partial
from os import path, unlink

import gradio as gr
from huggingface_hub import InferenceClient

from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio


def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes]) -> str:
    """Transcribe audio to text using the Hugging Face Inference API.

    This function converts speech audio into a text transcription. The audio is
    resampled to match the model's expected sample rate, saved to a temporary
    file, and then sent to the Inference API for transcription.

    Args:
        client: Hugging Face InferenceClient instance for API calls.
        model: Hugging Face model ID to use for automatic speech recognition.
        audio: Tuple containing:
            - int: Sample rate of the input audio (e.g., 44100 Hz)
            - bytes: Raw audio data as bytes

    Returns:
        String containing the transcribed text from the audio.

    Note:
        - Audio is automatically resampled to match the model's expected sample rate.
        - Audio is saved as a WAV file for InferenceClient compatibility.
        - Temporary files are cleaned up automatically after transcription.
    """
    temp_file_path = None
    try:
        sample_rate = get_model_sample_rate(model)
        temp_file_path = save_audio_to_temp_file(sample_rate, audio)
        result = client.automatic_speech_recognition(temp_file_path, model=model)
        return result["text"]
    finally:
        # Clean up the temporary file.
        if temp_file_path and path.exists(temp_file_path):
            try:
                unlink(temp_file_path)
            except Exception:
                pass  # Ignore clean-up errors.


def create_asr_tab(client: InferenceClient, model: str):
    """Create the automatic speech recognition tab in the Gradio interface.

    This function sets up all UI components for automatic speech recognition,
    including:
        - URL input textbox for fetching audio files from the web
        - Button to retrieve audio from the URL
        - Audio input component for uploading or recording audio
        - Transcribe button and output textbox

    Args:
        client: Hugging Face InferenceClient instance to pass to the
            automatic_speech_recognition function.
        model: Hugging Face model ID to use for automatic speech recognition.
    """
    gr.Markdown("Transcribe audio to text.")
    audio_transcription_url_input = gr.Textbox(label="Audio URL")
    audio_transcription_audio_request_button = gr.Button("Get Audio")
    audio_transcription_audio_input = gr.Audio(label="Audio")
    audio_transcription_audio_request_button.click(
        fn=request_audio,
        inputs=audio_transcription_url_input,
        outputs=audio_transcription_audio_input,
    )
    audio_transcription_generate_button = gr.Button("Transcribe")
    audio_transcription_output = gr.Textbox(label="Text")
    audio_transcription_generate_button.click(
        fn=partial(automatic_speech_recognition, client, model),
        inputs=audio_transcription_audio_input,
        outputs=audio_transcription_output,
    )
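

# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal way to mount the ASR tab in a standalone Gradio app, shown here as
# a guarded __main__ block so importing this module is unaffected. The model ID
# and the HF_TOKEN environment variable are assumptions chosen for the example;
# substitute whatever your deployment actually uses.
if __name__ == "__main__":
    import os

    # A token is optional for public models but helps avoid rate limits.
    example_client = InferenceClient(token=os.environ.get("HF_TOKEN"))
    example_model = "openai/whisper-large-v3"  # Assumed example model ID.

    with gr.Blocks() as demo:
        # create_asr_tab must run inside a Blocks (and here, Tab) context so
        # the components it creates are attached to the page.
        with gr.Tab("Speech to Text"):
            create_asr_tab(example_client, example_model)

    demo.launch()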