import gc
from functools import partial

import gradio as gr
import torch
from transformers import pipeline

from utils import get_pytorch_device, spaces_gpu, get_torch_dtype


@spaces_gpu
def text_to_speech(model: str, text: str) -> tuple[int, bytes]:
    """Convert text to speech audio using a TTS (Text-to-Speech) model.

    A fresh ``transformers`` "text-to-speech" pipeline is built for every call,
    run once on ``text`` with gradient tracking disabled, and then released so
    that the model does not stay resident between requests.

    Args:
        model: Hugging Face model ID to use for text-to-speech.
        text: Input text string to convert to speech.

    Returns:
        A ``(sampling_rate, audio)`` tuple built from the pipeline result:
        ``result["sampling_rate"]`` and ``result["audio"][0]``.
        NOTE(review): the annotation says ``bytes``, but the second element is
        whatever the pipeline stores at ``result["audio"][0]`` (presumably a
        NumPy waveform array) -- confirm and adjust the annotation if so.

    Note:
        - The model is loaded with ``use_safetensors=True``.
        - The pipeline is placed on CUDA device 0 only when
          ``get_pytorch_device()`` returns ``"cuda"``; any other value runs on
          CPU (``device=-1``).
        - The pipeline is released and memory is reclaimed after inference,
          including when inference raises.
    """
    pytorch_device = get_pytorch_device()
    dtype = get_torch_dtype()

    model_kwargs = {"use_safetensors": True}  # Use safetensors to avoid torch.load restriction.
    if dtype is not None:
        model_kwargs["dtype"] = dtype

    narrator = pipeline(
        "text-to-speech",
        model,
        device=0 if pytorch_device == "cuda" else -1,
        model_kwargs=model_kwargs,
    )

    try:
        # During inference or evaluation, gradient calculations are unnecessary.
        # torch.no_grad() avoids storing gradients, which reduces memory use.
        with torch.no_grad():
            result = narrator(text)
    finally:
        # Release the pipeline even if inference failed, so a bad request does
        # not leave the model occupying memory.
        del narrator
        # Collect garbage *before* flushing the CUDA cache: empty_cache() can
        # only return blocks that are no longer occupied, so tensors kept alive
        # by reference cycles must be collected first.
        gc.collect()
        if pytorch_device == "cuda":
            torch.cuda.empty_cache()

    return (result["sampling_rate"], result["audio"][0])


def create_text_to_speech_tab(model: str) -> None:
    """Create the text-to-speech tab in the Gradio interface.

    Builds the UI components for text-to-speech generation: a Markdown
    heading, an input textbox, a "Generate" button, and an output audio
    player. Clicking the button calls ``text_to_speech`` with ``model``
    pre-bound, so the handler only receives the textbox contents.

    Presumably this is called inside an active Gradio layout context (for
    example a ``gr.Blocks``/tab body) -- confirm against the caller.

    Args:
        model: Hugging Face model ID to use for text-to-speech.
    """
    gr.Markdown("Generate speech from text.")
    text_to_speech_text = gr.Textbox(label="Text")
    text_to_speech_generate_button = gr.Button("Generate")
    text_to_speech_output = gr.Audio(label="Speech")
    text_to_speech_generate_button.click(
        # Bind the model ID up front; Gradio supplies only the text input.
        fn=partial(text_to_speech, model),
        inputs=text_to_speech_text,
        outputs=text_to_speech_output,
    )