ai-building-blocks / automatic_speech_recognition.py
LiKenun's picture
Move environment variable querying code out of the Gradio UI-construction functions all the way to the root of the application, `app.py`
55d79e2
raw
history blame
3.14 kB
from functools import partial
from huggingface_hub import InferenceClient
from os import path, unlink
import gradio as gr
from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio
def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes]) -> str:
    """Transcribe audio to text using the Hugging Face Inference API.

    The audio is resampled to the model's expected sample rate, written to a
    temporary WAV file (the InferenceClient accepts a file path), sent to the
    Inference API for transcription, and the temporary file is removed.

    Args:
        client: Hugging Face InferenceClient instance for API calls.
        model: Hugging Face model ID to use for automatic speech recognition.
        audio: Tuple containing:
            - int: Sample rate of the input audio (e.g., 44100 Hz)
            - bytes: Raw audio data as bytes

    Returns:
        String containing the transcribed text from the audio.

    Note:
        - Audio is automatically resampled to match the model's expected sample rate.
        - The temporary file is removed on a best-effort basis even when the
          API call raises.
    """
    temp_file_path: str | None = None
    try:
        sample_rate = get_model_sample_rate(model)
        temp_file_path = save_audio_to_temp_file(sample_rate, audio)
        result = client.automatic_speech_recognition(temp_file_path, model=model)
        return result["text"]
    finally:
        # Clean up the temporary file; `temp_file_path` is still None if
        # get_model_sample_rate or save_audio_to_temp_file raised.
        if temp_file_path and path.exists(temp_file_path):
            try:
                unlink(temp_file_path)
            except OSError:
                # Narrowed from `except Exception`: only filesystem errors are
                # expected here, and cleanup failure must not mask the result
                # (or the original exception) from the try block.
                pass
def create_asr_tab(client: InferenceClient, model: str):
    """Build the automatic speech recognition tab of the Gradio interface.

    Components created, in order:
    - A textbox for an audio URL plus a button that fetches the audio into
      the audio component via `request_audio`.
    - An audio component for uploaded, recorded, or fetched audio.
    - A "Transcribe" button wired to `automatic_speech_recognition` with the
      given client and model bound in, writing into the output textbox.

    Args:
        client: Hugging Face InferenceClient instance to pass to the automatic_speech_recognition function.
        model: Hugging Face model ID to use for automatic speech recognition.
    """
    gr.Markdown("Transcribe audio to text.")

    url_box = gr.Textbox(label="Audio URL")
    fetch_button = gr.Button("Get Audio")
    audio_widget = gr.Audio(label="Audio")
    transcribe_button = gr.Button("Transcribe")
    text_out = gr.Textbox(label="Text")

    # Fetching a URL populates the audio component.
    fetch_button.click(
        fn=request_audio,
        inputs=url_box,
        outputs=audio_widget,
    )
    # Transcription runs with the client and model pre-bound.
    transcribe_button.click(
        fn=partial(automatic_speech_recognition, client, model),
        inputs=audio_widget,
        outputs=text_out,
    )