Spaces: Running on Zero
Switch text-to-image and automatic speech recognition (ASR) back to using the Hugging Face inference client; Zero GPU cannot accommodate the time it takes for those tasks
- README.md +12 -0
- app.py +9 -3
- automatic_speech_recognition.py +27 -60
- text_to_image.py +12 -39
README.md
CHANGED

@@ -26,6 +26,18 @@ This application provides the following AI building blocks:
 - **Automatic Speech Recognition (ASR)**: Transcribe audio to text using Whisper models
 - **Chatbot**: Have conversations with AI chatbots supporting both modern chat models and seq2seq models
 
+### Architecture: Local Models vs. Inference API
+
+This application uses a hybrid approach:
+
+- **Text-to-image Generation** and **Automatic Speech Recognition (ASR)** use the **Hugging Face Inference API** (via `InferenceClient`) instead of loading models locally. This is because:
+  - Text-to-image models (like FLUX.1-dev) are extremely large and memory-intensive
+  - ASR models (like Whisper-large-v3) are also large and can cause timeouts in constrained environments
+  - Loading them locally can cause timeouts or out-of-memory errors, especially in constrained environments like Hugging Face Spaces with Zero GPU
+  - Using the Inference API offloads the model loading and inference to Hugging Face's infrastructure, ensuring reliable operation
+
+- **All other tasks** (image classification, translation, image-to-text, text-to-speech, chatbot) load models **locally** to take advantage of Hugging Face Zero GPU for cost-effective hosting. These models are smaller and can be loaded efficiently within memory constraints.
+
 ## Prerequisites
 
 - Python 3.8 or higher
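In code, the hybrid approach described in the new README section looks roughly like the following sketch. It is illustrative only: the model IDs are examples drawn from the README, HF_TOKEN handling is assumed, and gated models such as FLUX.1-dev may also require accepting the model license.

from huggingface_hub import InferenceClient
from transformers import pipeline

# Heavy tasks call the hosted Inference API; nothing is downloaded locally.
client = InferenceClient()  # picks up HF_TOKEN from the environment if it is set
image = client.text_to_image(
    "a watercolor lighthouse at dawn",
    model="black-forest-labs/FLUX.1-dev",  # example model ID, not necessarily the Space's
)
image.save("lighthouse.png")

# Lighter tasks load locally, where Zero GPU can handle them.
classifier = pipeline("image-classification", model="google/vit-base-patch16-224")
print(classifier("lighthouse.png")[0])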
app.py
CHANGED

@@ -1,6 +1,7 @@
 from dotenv import load_dotenv
 from os import getenv
 import gradio as gr
+from huggingface_hub import InferenceClient
 from automatic_speech_recognition import create_asr_tab
 from chatbot import create_chatbot_tab
 from image_classification import create_image_classification_tab
@@ -19,6 +20,7 @@ class App:
 
     def __init__(
         self,
+        client: InferenceClient,
         text_to_image_model: str,
         image_to_text_model: str,
         image_classification_model: str,
@@ -27,9 +29,11 @@ class App:
         chat_model: str,
         fallback_translation_model: str
     ):
-        """Initialize the App with model IDs.
+        """Initialize the App with an InferenceClient instance and model IDs.
 
         Args:
+            client: Hugging Face InferenceClient instance for making API calls
+                to Hugging Face's inference endpoints (used for text-to-image and ASR).
            text_to_image_model: Model ID for text-to-image generation.
            image_to_text_model: Model ID for image captioning.
            image_classification_model: Model ID for image classification.
@@ -39,6 +43,7 @@ class App:
            fallback_translation_model: Fallback translation model ID for languages
                without specific translation models.
        """
+        self.client = client
        self.text_to_image_model = text_to_image_model
        self.image_to_text_model = image_to_text_model
        self.image_classification_model = image_classification_model
@@ -59,7 +64,7 @@ class App:
            gr.Markdown("A gallery of building blocks for building AI applications")
            with gr.Tabs():
                with gr.Tab("Text-to-image Generation"):
-                    create_text_to_image_tab(self.text_to_image_model)
+                    create_text_to_image_tab(self.client, self.text_to_image_model)
                with gr.Tab("Image-to-text or Image Captioning"):
                    create_image_to_text_tab(self.image_to_text_model)
                with gr.Tab("Image Classification"):
@@ -67,7 +72,7 @@ class App:
                with gr.Tab("Text-to-speech (TTS)"):
                    create_text_to_speech_tab(self.text_to_speech_model)
                with gr.Tab("Automatic Speech Recognition (ASR)"):
-                    create_asr_tab(self.audio_transcription_model)
+                    create_asr_tab(self.client, self.audio_transcription_model)
                with gr.Tab("Chat"):
                    create_chatbot_tab(self.chat_model)
                with gr.Tab("Translation to English"):
@@ -79,6 +84,7 @@ class App:
 if __name__ == "__main__":
    load_dotenv()
    app = App(
+        client=InferenceClient(),
        text_to_image_model=getenv("TEXT_TO_IMAGE_MODEL"),
        image_to_text_model=getenv("IMAGE_TO_TEXT_MODEL"),
        image_classification_model=getenv("IMAGE_CLASSIFICATION_MODEL"),
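Because the client is injected once in the entry point, authentication and timeouts can be configured in a single place. A hypothetical variant of the __main__ block is sketched below; the commit itself uses InferenceClient() with defaults, and HF_TOKEN is an assumed environment variable name.

from os import getenv

from dotenv import load_dotenv
from huggingface_hub import InferenceClient

load_dotenv()
client = InferenceClient(
    token=getenv("HF_TOKEN"),  # assumed variable name; None falls back to the cached login
    timeout=120,               # extra headroom for slow text-to-image generations
)
# ...then pass client=client to App(...) exactly as in the diff above.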
automatic_speech_recognition.py
CHANGED

@@ -1,19 +1,19 @@
-import gc
 from functools import partial
+from os import path, unlink
 import gradio as gr
 import numpy as np
-import torch
-from transformers import pipeline
-from utils import get_pytorch_device, spaces_gpu, resample_audio, get_model_sample_rate, request_audio, get_torch_dtype
+from huggingface_hub import InferenceClient
+from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio
 
-...
-def automatic_speech_recognition(model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
-    """Transcribe audio to text using a Whisper or similar ASR model.
+def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
+    """Transcribe audio to text using Hugging Face Inference API.
 
     This function converts speech audio into text transcription. The audio is
-    resampled to match the model's expected sample rate, ...
+    resampled to match the model's expected sample rate, saved to a temporary
+    file, and then sent to the Inference API for transcription.
 
     Args:
+        client: Hugging Face InferenceClient instance for API calls.
         model: Hugging Face model ID to use for automatic speech recognition.
         audio: Tuple containing:
             - int: Sample rate of the input audio (e.g., 44100 Hz)
@@ -24,61 +24,27 @@ def automatic_speech_recognition(model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
 
     Note:
         - Audio is automatically resampled to match the model's expected sample rate.
-        - Automatically ...
-        ...
+        - Audio is saved as a WAV file for InferenceClient compatibility.
+        - Automatically cleans up temporary files after transcription.
+        - Uses Inference API to offload model loading and inference to Hugging Face's
+          infrastructure, which is more suitable for environments with limited GPU memory
+          or time constraints (like Hugging Face Spaces with Zero GPU).
     """
-    ...
-    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
-    # reduces memory consumption by not storing gradients. This can significantly reduce the
-    # amount of memory used during the inference phase.
-    dtype = get_torch_dtype()
-    model_kwargs = {"use_safetensors": True}
-    if dtype is not None:
-        model_kwargs["dtype"] = dtype
-
-    # Load and run ASR pipeline
-    asr_pipeline = pipeline(
-        "automatic-speech-recognition",
-        model=model,
-        device=0 if pytorch_device == "cuda" else -1,
-        model_kwargs=model_kwargs
-    )
-    # Use return_timestamps="word" for long audio (>30 seconds) to avoid errors
-    # Using "word" ensures WhisperTimeStampLogitsProcessor is properly used during generation
-    # Set task='transcribe' and language='en' to avoid deprecation warnings and language detection
-    # Note: sampling_rate is not passed here since audio is already resampled to the model's expected rate
-    with torch.no_grad():
-        result = asr_pipeline(
-            audio_array,
-            return_timestamps="word",
-            task="transcribe",
-            language="en"
-        )
-
-    # Clean up GPU memory
-    del asr_pipeline
-    if pytorch_device == "cuda":
-        torch.cuda.empty_cache()
-    gc.collect()
-    # Extract text from result (works for both short and long audio)
-    if isinstance(result, dict) and "text" in result:
+    temp_file_path = None
+    try:
+        target_sample_rate = get_model_sample_rate(model)
+        temp_file_path = save_audio_to_temp_file(target_sample_rate, audio)
+        result = client.automatic_speech_recognition(temp_file_path, model=model)
         return result["text"]
-    ...
-    return str(result)
+    finally:
+        if temp_file_path and path.exists(temp_file_path):
+            try:
+                unlink(temp_file_path)
+            except Exception:
+                pass  # Ignore clean-up errors.
 
 
-def create_asr_tab(model: str):
+def create_asr_tab(client: InferenceClient, model: str):
     """Create the automatic speech recognition tab in the Gradio interface.
 
     This function sets up all UI components for automatic speech recognition, including:
@@ -88,6 +54,7 @@ def create_asr_tab(model: str):
     - Transcribe button and output textbox
 
     Args:
+        client: Hugging Face InferenceClient instance to pass to the automatic_speech_recognition function.
        model: Hugging Face model ID to use for automatic speech recognition.
    """
    gr.Markdown("Transcribe audio to text.")
@@ -102,7 +69,7 @@ def create_asr_tab(model: str):
    audio_transcription_generate_button = gr.Button("Transcribe")
    audio_transcription_output = gr.Textbox(label="Text")
    audio_transcription_generate_button.click(
-        fn=partial(automatic_speech_recognition, model),
+        fn=partial(automatic_speech_recognition, client, model),
        inputs=audio_transcription_audio_input,
        outputs=audio_transcription_output
    )
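The new code path depends on save_audio_to_temp_file from utils, which is not part of this diff. A hypothetical sketch of what such a helper might do follows; the real implementation may differ in its resampling method, normalization, and file handling.

import tempfile

import numpy as np
import soundfile as sf
from scipy.signal import resample

def save_audio_to_temp_file(target_sample_rate: int, audio: tuple[int, np.ndarray]) -> str:
    """Write Gradio audio (sample_rate, samples) to a temp WAV at the model's rate (sketch)."""
    source_rate, samples = audio
    samples = np.asarray(samples, dtype=np.float32)
    # gr.Audio(type="numpy") yields 16-bit PCM; scale it into the [-1, 1] float range.
    if np.abs(samples).max() > 1.0:
        samples = samples / 32768.0
    if source_rate != target_sample_rate:
        # Naive whole-clip resampling; the real helper may use a higher-quality method.
        num_samples = int(len(samples) * target_sample_rate / source_rate)
        samples = resample(samples, num_samples)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        sf.write(tmp.name, samples, target_sample_rate)
        return tmp.name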
text_to_image.py
CHANGED

@@ -1,63 +1,36 @@
-import gc
 from functools import partial
 import gradio as gr
-import torch
-from os import getenv
 from PIL.Image import Image
-from diffusers import DiffusionPipeline
-from utils import get_pytorch_device, spaces_gpu, get_torch_dtype
+from huggingface_hub import InferenceClient
 
 
-...
-def text_to_image(model: str, prompt: str) -> Image:
-    """Generate an image from a text prompt using a diffusion model.
+def text_to_image(client: InferenceClient, model: str, prompt: str) -> Image:
+    """Generate an image from a text prompt using Hugging Face Inference API.
 
-    This function uses ...
-    ...
+    This function uses the Hugging Face Inference API to generate images from text prompts.
+    This approach offloads the model loading and inference to Hugging Face's infrastructure,
+    which is more suitable for environments with limited GPU memory or time constraints
+    (like Hugging Face Spaces with Zero GPU).
 
     Args:
+        client: Hugging Face InferenceClient instance for API calls.
        model: Hugging Face model ID to use for text-to-image generation.
        prompt: Text description of the desired image.
 
    Returns:
        PIL Image object representing the generated image.
-
-    Note:
-        - Uses safetensors for secure model loading.
-        - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
-        - Cleans up model and GPU memory after inference.
    """
-    ...
-    dtype = get_torch_dtype()
-
-    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
-    # reduces memory consumption by not storing gradients. This can significantly reduce the
-    # amount of memory used during the inference phase.
-    pipe = DiffusionPipeline.from_pretrained(
-        model,
-        use_safetensors=True,
-        dtype=dtype
-    )
-    pipe = pipe.to(pytorch_device)
-    with torch.no_grad():
-        result = pipe(prompt).images[0]
-
-    # Clean up GPU memory
-    del pipe
-    if pytorch_device == "cuda":
-        torch.cuda.empty_cache()
-    gc.collect()
-    return result
+    return client.text_to_image(prompt, model=model)
 
 
-def create_text_to_image_tab(model: str):
+def create_text_to_image_tab(client: InferenceClient, model: str):
    """Create the text-to-image generation tab in the Gradio interface.
 
    This function sets up all UI components for text-to-image generation,
    including input textbox, generate button, and output image display.
 
    Args:
+        client: Hugging Face InferenceClient instance to pass to the text_to_image function.
        model: Hugging Face model ID to use for text-to-image generation.
    """
    gr.Markdown("Generate an image from a text prompt.")
@@ -65,7 +38,7 @@ def create_text_to_image_tab(model: str):
    text_to_image_generate_button = gr.Button("Generate")
    text_to_image_output = gr.Image(label="Image", type="pil")
    text_to_image_generate_button.click(
-        fn=partial(text_to_image, model),
+        fn=partial(text_to_image, client, model),
        inputs=text_to_image_prompt,
        outputs=text_to_image_output
    )
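For reference, the new function can also be exercised outside the Gradio UI. A small usage sketch follows; the model ID is only an example, since the Space reads its actual model from the TEXT_TO_IMAGE_MODEL environment variable.

from huggingface_hub import InferenceClient

from text_to_image import text_to_image

client = InferenceClient()
image = text_to_image(client, "black-forest-labs/FLUX.1-dev", "a cozy cabin in a snowy forest")
image.save("cabin.png")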