import gc
from os import getenv

import gradio as gr
from transformers import pipeline

from utils import spaces_gpu


@spaces_gpu
def text_to_speech(text: str) -> tuple[int, bytes]:
    """Convert text to speech audio using a text-to-speech pipeline.

    A fresh ``transformers`` pipeline is built on every call, used for a
    single inference, and then released so the model does not stay resident
    between requests.

    Args:
        text: Input text to convert to speech.

    Returns:
        A ``(sampling_rate, audio)`` tuple built from the pipeline result:
        ``result["sampling_rate"]`` and the first element of
        ``result["audio"]``.

    Note:
        - The model ID is read from the ``TEXT_TO_SPEECH_MODEL`` environment
          variable. If it is unset, ``None`` is passed to ``pipeline`` and the
          choice of model is left to ``transformers``.
        - ``use_safetensors=True`` is forwarded through ``model_kwargs``.
        - No ``device`` argument is passed, so device placement is whatever
          the pipeline (and the ``spaces_gpu`` decorator) default to.
        - NOTE(review): the return annotation says ``bytes``, but the
          pipeline presumably yields array-like audio samples rather than a
          ``bytes`` object; confirm against the transformers documentation
          before relying on the annotation.
        - NOTE(review): indexing ``result["audio"][0]`` assumes the audio has
          a leading batch/channel axis; confirm for the configured model.
    """
    narrator = pipeline(
        "text-to-speech",
        getenv("TEXT_TO_SPEECH_MODEL"),
        model_kwargs={"use_safetensors": True},  # Use safetensors to avoid torch.load restriction.
    )
    try:
        result = narrator(text)
    finally:
        # Drop this frame's reference to the pipeline and run a collection
        # pass even when inference raises, so a failed request goes through
        # the same cleanup as a successful one.
        del narrator
        gc.collect()
    return (result["sampling_rate"], result["audio"][0])


def create_text_to_speech_tab() -> None:
    """Create the text-to-speech tab contents in the Gradio interface.

    Instantiates a short description, a text input, a "Generate" button and
    an audio output, then wires the button so a click calls
    ``text_to_speech`` with the textbox value and sends the result to the
    audio component.

    Presumably intended to be called inside an active Gradio layout context
    (for example a ``gr.Blocks`` or tab) so the components attach to it;
    verify against the caller.
    """
    gr.Markdown("Generate speech from text.")
    text_to_speech_text = gr.Textbox(label="Text")
    text_to_speech_generate_button = gr.Button("Generate")
    text_to_speech_output = gr.Audio(label="Speech")
    text_to_speech_generate_button.click(
        fn=text_to_speech,
        inputs=text_to_speech_text,
        outputs=text_to_speech_output,
    )