Spaces: Running on Zero
Switch text-to-image and automatic speech recognition (ASR) back to using the Hugging Face inference client; Zero GPU cannot accommodate the time it takes for those tasks
- README.md +12 -0
- app.py +9 -3
- automatic_speech_recognition.py +27 -60
- text_to_image.py +12 -39
README.md
CHANGED

@@ -26,6 +26,18 @@ This application provides the following AI building blocks:
 - **Automatic Speech Recognition (ASR)**: Transcribe audio to text using Whisper models
 - **Chatbot**: Have conversations with AI chatbots supporting both modern chat models and seq2seq models
 
+### Architecture: Local Models vs. Inference API
+
+This application uses a hybrid approach:
+
+- **Text-to-image Generation** and **Automatic Speech Recognition (ASR)** use the **Hugging Face Inference API** (via `InferenceClient`) instead of loading models locally. This is because:
+  - Text-to-image models (like FLUX.1-dev) are extremely large and memory-intensive
+  - ASR models (like Whisper-large-v3) are also large and can cause timeouts in constrained environments
+  - Loading them locally can cause timeouts or out-of-memory errors, especially in constrained environments like Hugging Face Spaces with Zero GPU
+  - Using the Inference API offloads the model loading and inference to Hugging Face's infrastructure, ensuring reliable operation
+
+- **All other tasks** (image classification, translation, image-to-text, text-to-speech, chatbot) load models **locally** to take advantage of Hugging Face Zero GPU for cost-effective hosting. These models are smaller and can be loaded efficiently within memory constraints.
+
 ## Prerequisites
 
 - Python 3.8 or higher
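In code, the hybrid approach described in the new README section looks roughly like the following sketch. It is illustrative only: the model IDs are examples drawn from the README, HF_TOKEN handling is assumed, and gated models such as FLUX.1-dev may also require accepting the model license.

from huggingface_hub import InferenceClient
from transformers import pipeline

# Heavy tasks call the hosted Inference API; nothing is downloaded locally.
client = InferenceClient()  # picks up HF_TOKEN from the environment if it is set
image = client.text_to_image(
    "a watercolor lighthouse at dawn",
    model="black-forest-labs/FLUX.1-dev",  # example model ID, not necessarily the Space's
)
image.save("lighthouse.png")

# Lighter tasks load locally, where Zero GPU can handle them.
classifier = pipeline("image-classification", model="google/vit-base-patch16-224")
print(classifier("lighthouse.png")[0])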
app.py
CHANGED

@@ -1,6 +1,7 @@
 from dotenv import load_dotenv
 from os import getenv
 import gradio as gr
+from huggingface_hub import InferenceClient
 from automatic_speech_recognition import create_asr_tab
 from chatbot import create_chatbot_tab
 from image_classification import create_image_classification_tab
@@ -19,6 +20,7 @@ class App:
 
     def __init__(
         self,
+        client: InferenceClient,
         text_to_image_model: str,
         image_to_text_model: str,
         image_classification_model: str,
@@ -27,9 +29,11 @@ class App:
         chat_model: str,
         fallback_translation_model: str
     ):
-        """Initialize the App with model IDs.
+        """Initialize the App with an InferenceClient instance and model IDs.
 
         Args:
+            client: Hugging Face InferenceClient instance for making API calls
+                to Hugging Face's inference endpoints (used for text-to-image and ASR).
            text_to_image_model: Model ID for text-to-image generation.
            image_to_text_model: Model ID for image captioning.
            image_classification_model: Model ID for image classification.
@@ -39,6 +43,7 @@ class App:
            fallback_translation_model: Fallback translation model ID for languages
                without specific translation models.
        """
+        self.client = client
        self.text_to_image_model = text_to_image_model
        self.image_to_text_model = image_to_text_model
        self.image_classification_model = image_classification_model
@@ -59,7 +64,7 @@ class App:
            gr.Markdown("A gallery of building blocks for building AI applications")
            with gr.Tabs():
                with gr.Tab("Text-to-image Generation"):
-                    create_text_to_image_tab(self.text_to_image_model)
+                    create_text_to_image_tab(self.client, self.text_to_image_model)
                with gr.Tab("Image-to-text or Image Captioning"):
                    create_image_to_text_tab(self.image_to_text_model)
                with gr.Tab("Image Classification"):
@@ -67,7 +72,7 @@ class App:
                with gr.Tab("Text-to-speech (TTS)"):
                    create_text_to_speech_tab(self.text_to_speech_model)
                with gr.Tab("Automatic Speech Recognition (ASR)"):
-                    create_asr_tab(self.audio_transcription_model)
+                    create_asr_tab(self.client, self.audio_transcription_model)
                with gr.Tab("Chat"):
                    create_chatbot_tab(self.chat_model)
                with gr.Tab("Translation to English"):
@@ -79,6 +84,7 @@ class App:
 if __name__ == "__main__":
    load_dotenv()
    app = App(
+        client=InferenceClient(),
        text_to_image_model=getenv("TEXT_TO_IMAGE_MODEL"),
        image_to_text_model=getenv("IMAGE_TO_TEXT_MODEL"),
        image_classification_model=getenv("IMAGE_CLASSIFICATION_MODEL"),
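Because the client is injected once in the entry point, authentication and timeouts can be configured in a single place. A hypothetical variant of the __main__ block is sketched below; the commit itself uses InferenceClient() with defaults, and HF_TOKEN is an assumed environment variable name.

from os import getenv

from dotenv import load_dotenv
from huggingface_hub import InferenceClient

load_dotenv()
client = InferenceClient(
    token=getenv("HF_TOKEN"),  # assumed variable name; None falls back to the cached login
    timeout=120,               # extra headroom for slow text-to-image generations
)
# ...then pass client=client to App(...) exactly as in the diff above.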
automatic_speech_recognition.py
CHANGED

@@ -1,19 +1,19 @@
-import gc
 from functools import partial
+from os import path, unlink
 import gradio as gr
 import numpy as np
-import torch
-from transformers import pipeline
-from utils import get_pytorch_device, spaces_gpu, resample_audio, get_model_sample_rate, request_audio, get_torch_dtype
+from huggingface_hub import InferenceClient
+from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio
 
-...
-def automatic_speech_recognition(model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
-    """Transcribe audio to text using a Whisper or similar ASR model.
+def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
+    """Transcribe audio to text using Hugging Face Inference API.
 
     This function converts speech audio into text transcription. The audio is
-    resampled to match the model's expected sample rate, ...
+    resampled to match the model's expected sample rate, saved to a temporary
+    file, and then sent to the Inference API for transcription.
 
     Args:
+        client: Hugging Face InferenceClient instance for API calls.
         model: Hugging Face model ID to use for automatic speech recognition.
         audio: Tuple containing:
             - int: Sample rate of the input audio (e.g., 44100 Hz)
@@ -24,61 +24,27 @@ def automatic_speech_recognition(model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
 
     Note:
         - Audio is automatically resampled to match the model's expected sample rate.
-        - Automatically ...
-        ...
+        - Audio is saved as a WAV file for InferenceClient compatibility.
+        - Automatically cleans up temporary files after transcription.
+        - Uses Inference API to offload model loading and inference to Hugging Face's
+          infrastructure, which is more suitable for environments with limited GPU memory
+          or time constraints (like Hugging Face Spaces with Zero GPU).
     """
-    ...
-    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
-    # reduces memory consumption by not storing gradients. This can significantly reduce the
-    # amount of memory used during the inference phase.
-    dtype = get_torch_dtype()
-    model_kwargs = {"use_safetensors": True}
-    if dtype is not None:
-        model_kwargs["dtype"] = dtype
-
-    # Load and run ASR pipeline
-    asr_pipeline = pipeline(
-        "automatic-speech-recognition",
-        model=model,
-        device=0 if pytorch_device == "cuda" else -1,
-        model_kwargs=model_kwargs
-    )
-    # Use return_timestamps="word" for long audio (>30 seconds) to avoid errors
-    # Using "word" ensures WhisperTimeStampLogitsProcessor is properly used during generation
-    # Set task='transcribe' and language='en' to avoid deprecation warnings and language detection
-    # Note: sampling_rate is not passed here since audio is already resampled to the model's expected rate
-    with torch.no_grad():
-        result = asr_pipeline(
-            audio_array,
-            return_timestamps="word",
-            task="transcribe",
-            language="en"
-        )
-
-    # Clean up GPU memory
-    del asr_pipeline
-    if pytorch_device == "cuda":
-        torch.cuda.empty_cache()
-    gc.collect()
-    # Extract text from result (works for both short and long audio)
-    if isinstance(result, dict) and "text" in result:
+    temp_file_path = None
+    try:
+        target_sample_rate = get_model_sample_rate(model)
+        temp_file_path = save_audio_to_temp_file(target_sample_rate, audio)
+        result = client.automatic_speech_recognition(temp_file_path, model=model)
         return result["text"]
-    ...
-    return str(result)
+    finally:
+        if temp_file_path and path.exists(temp_file_path):
+            try:
+                unlink(temp_file_path)
+            except Exception:
+                pass  # Ignore clean-up errors.
 
 
-def create_asr_tab(model: str):
+def create_asr_tab(client: InferenceClient, model: str):
     """Create the automatic speech recognition tab in the Gradio interface.
 
     This function sets up all UI components for automatic speech recognition, including:
@@ -88,6 +54,7 @@ def create_asr_tab(model: str):
     - Transcribe button and output textbox
 
     Args:
+        client: Hugging Face InferenceClient instance to pass to the automatic_speech_recognition function.
        model: Hugging Face model ID to use for automatic speech recognition.
    """
    gr.Markdown("Transcribe audio to text.")
@@ -102,7 +69,7 @@ def create_asr_tab(model: str):
    audio_transcription_generate_button = gr.Button("Transcribe")
    audio_transcription_output = gr.Textbox(label="Text")
    audio_transcription_generate_button.click(
-        fn=partial(automatic_speech_recognition, model),
+        fn=partial(automatic_speech_recognition, client, model),
        inputs=audio_transcription_audio_input,
        outputs=audio_transcription_output
    )
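The new code path depends on save_audio_to_temp_file from utils, which is not part of this diff. A hypothetical sketch of what such a helper might do follows; the real implementation may differ in its resampling method, normalization, and file handling.

import tempfile

import numpy as np
import soundfile as sf
from scipy.signal import resample

def save_audio_to_temp_file(target_sample_rate: int, audio: tuple[int, np.ndarray]) -> str:
    """Write Gradio audio (sample_rate, samples) to a temp WAV at the model's rate (sketch)."""
    source_rate, samples = audio
    samples = np.asarray(samples, dtype=np.float32)
    # gr.Audio(type="numpy") yields 16-bit PCM; scale it into the [-1, 1] float range.
    if np.abs(samples).max() > 1.0:
        samples = samples / 32768.0
    if source_rate != target_sample_rate:
        # Naive whole-clip resampling; the real helper may use a higher-quality method.
        num_samples = int(len(samples) * target_sample_rate / source_rate)
        samples = resample(samples, num_samples)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        sf.write(tmp.name, samples, target_sample_rate)
        return tmp.name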
text_to_image.py
CHANGED

@@ -1,63 +1,36 @@
-import gc
 from functools import partial
 import gradio as gr
-import torch
-from os import getenv
 from PIL.Image import Image
-from diffusers import DiffusionPipeline
-from utils import get_pytorch_device, spaces_gpu, get_torch_dtype
+from huggingface_hub import InferenceClient
 
 
-...
-def text_to_image(model: str, prompt: str) -> Image:
-    """Generate an image from a text prompt using a diffusion model.
+def text_to_image(client: InferenceClient, model: str, prompt: str) -> Image:
+    """Generate an image from a text prompt using Hugging Face Inference API.
 
-    This function uses ...
-    ...
+    This function uses the Hugging Face Inference API to generate images from text prompts.
+    This approach offloads the model loading and inference to Hugging Face's infrastructure,
+    which is more suitable for environments with limited GPU memory or time constraints
+    (like Hugging Face Spaces with Zero GPU).
 
     Args:
+        client: Hugging Face InferenceClient instance for API calls.
        model: Hugging Face model ID to use for text-to-image generation.
        prompt: Text description of the desired image.
 
    Returns:
        PIL Image object representing the generated image.
-
-    Note:
-        - Uses safetensors for secure model loading.
-        - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
-        - Cleans up model and GPU memory after inference.
    """
-    ...
-    dtype = get_torch_dtype()
-
-    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
-    # reduces memory consumption by not storing gradients. This can significantly reduce the
-    # amount of memory used during the inference phase.
-    pipe = DiffusionPipeline.from_pretrained(
-        model,
-        use_safetensors=True,
-        dtype=dtype
-    )
-    pipe = pipe.to(pytorch_device)
-    with torch.no_grad():
-        result = pipe(prompt).images[0]
-
-    # Clean up GPU memory
-    del pipe
-    if pytorch_device == "cuda":
-        torch.cuda.empty_cache()
-    gc.collect()
-    return result
+    return client.text_to_image(prompt, model=model)
 
 
-def create_text_to_image_tab(model: str):
+def create_text_to_image_tab(client: InferenceClient, model: str):
    """Create the text-to-image generation tab in the Gradio interface.
 
    This function sets up all UI components for text-to-image generation,
    including input textbox, generate button, and output image display.
 
    Args:
+        client: Hugging Face InferenceClient instance to pass to the text_to_image function.
        model: Hugging Face model ID to use for text-to-image generation.
    """
    gr.Markdown("Generate an image from a text prompt.")
@@ -65,7 +38,7 @@ def create_text_to_image_tab(model: str):
    text_to_image_generate_button = gr.Button("Generate")
    text_to_image_output = gr.Image(label="Image", type="pil")
    text_to_image_generate_button.click(
-        fn=partial(text_to_image, model),
+        fn=partial(text_to_image, client, model),
        inputs=text_to_image_prompt,
        outputs=text_to_image_output
    )
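For reference, the new function can also be exercised outside the Gradio UI. A small usage sketch follows; the model ID is only an example, since the Space reads its actual model from the TEXT_TO_IMAGE_MODEL environment variable.

from huggingface_hub import InferenceClient

from text_to_image import text_to_image

client = InferenceClient()
image = text_to_image(client, "black-forest-labs/FLUX.1-dev", "a cozy cabin in a snowy forest")
image.save("cabin.png")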