LiKenun committed
Commit b71a3ad · 1 Parent(s): 0b93b56

Switch text-to-image and automatic speech recognition (ASR) back to using the Hugging Face inference client; Zero GPU cannot accommodate the time it takes for those tasks

Files changed (4):
  1. README.md +12 -0
  2. app.py +9 -3
  3. automatic_speech_recognition.py +27 -60
  4. text_to_image.py +12 -39
README.md CHANGED
@@ -26,6 +26,18 @@ This application provides the following AI building blocks:
  - **Automatic Speech Recognition (ASR)**: Transcribe audio to text using Whisper models
  - **Chatbot**: Have conversations with AI chatbots supporting both modern chat models and seq2seq models
 
+ ### Architecture: Local Models vs. Inference API
+
+ This application uses a hybrid approach:
+
+ - **Text-to-image Generation** and **Automatic Speech Recognition (ASR)** use the **Hugging Face Inference API** (via `InferenceClient`) instead of loading models locally. This is because:
+   - Text-to-image models (like FLUX.1-dev) are extremely large and memory-intensive
+   - ASR models (like Whisper-large-v3) are also large and can cause timeouts in constrained environments
+   - Loading them locally can cause timeouts or out-of-memory errors, especially in constrained environments like Hugging Face Spaces with Zero GPU
+   - Using the Inference API offloads model loading and inference to Hugging Face's infrastructure, ensuring reliable operation
+
+ - **All other tasks** (image classification, translation, image-to-text, text-to-speech, chatbot) load models **locally** to take advantage of Hugging Face Zero GPU for cost-effective hosting. These models are smaller and can be loaded efficiently within memory constraints.
+
  ## Prerequisites
 
  - Python 3.8 or higher
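
For readers skimming the README diff, here is a minimal sketch of the Inference API path it describes. This is not part of the commit; the model IDs and the presence of an `HF_TOKEN` are illustrative assumptions.

```python
# Illustrative only: the InferenceClient calls the README section above refers to.
# Model IDs are examples; FLUX.1-dev is gated and whisper-large-v3 is just one ASR choice.
from huggingface_hub import InferenceClient

client = InferenceClient()  # uses HF_TOKEN / a cached login token if one is available

# Text-to-image via the Inference API: returns a PIL.Image.Image
image = client.text_to_image("a watercolor fox", model="black-forest-labs/FLUX.1-dev")
image.save("fox.png")

# ASR via the Inference API: accepts a file path or raw audio bytes
transcription = client.automatic_speech_recognition("sample.wav", model="openai/whisper-large-v3")
print(transcription["text"])
```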
app.py CHANGED
@@ -1,6 +1,7 @@
  from dotenv import load_dotenv
  from os import getenv
  import gradio as gr
+ from huggingface_hub import InferenceClient
  from automatic_speech_recognition import create_asr_tab
  from chatbot import create_chatbot_tab
  from image_classification import create_image_classification_tab
@@ -19,6 +20,7 @@ class App:
 
  def __init__(
  self,
+ client: InferenceClient,
  text_to_image_model: str,
  image_to_text_model: str,
  image_classification_model: str,
@@ -27,9 +29,11 @@ class App:
  chat_model: str,
  fallback_translation_model: str
  ):
- """Initialize the App with model IDs.
+ """Initialize the App with an InferenceClient instance and model IDs.
 
  Args:
+ client: Hugging Face InferenceClient instance for making API calls
+ to Hugging Face's inference endpoints (used for text-to-image and ASR).
  text_to_image_model: Model ID for text-to-image generation.
  image_to_text_model: Model ID for image captioning.
  image_classification_model: Model ID for image classification.
@@ -39,6 +43,7 @@ class App:
  fallback_translation_model: Fallback translation model ID for languages
  without specific translation models.
  """
+ self.client = client
  self.text_to_image_model = text_to_image_model
  self.image_to_text_model = image_to_text_model
  self.image_classification_model = image_classification_model
@@ -59,7 +64,7 @@ class App:
  gr.Markdown("A gallery of building blocks for building AI applications")
  with gr.Tabs():
  with gr.Tab("Text-to-image Generation"):
- create_text_to_image_tab(self.text_to_image_model)
+ create_text_to_image_tab(self.client, self.text_to_image_model)
  with gr.Tab("Image-to-text or Image Captioning"):
  create_image_to_text_tab(self.image_to_text_model)
  with gr.Tab("Image Classification"):
@@ -67,7 +72,7 @@ class App:
  with gr.Tab("Text-to-speech (TTS)"):
  create_text_to_speech_tab(self.text_to_speech_model)
  with gr.Tab("Automatic Speech Recognition (ASR)"):
- create_asr_tab(self.audio_transcription_model)
+ create_asr_tab(self.client, self.audio_transcription_model)
  with gr.Tab("Chat"):
  create_chatbot_tab(self.chat_model)
  with gr.Tab("Translation to English"):
@@ -79,6 +84,7 @@ class App:
  if __name__ == "__main__":
  load_dotenv()
  app = App(
+ client=InferenceClient(),
  text_to_image_model=getenv("TEXT_TO_IMAGE_MODEL"),
  image_to_text_model=getenv("IMAGE_TO_TEXT_MODEL"),
  image_classification_model=getenv("IMAGE_CLASSIFICATION_MODEL"),
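
A note on authentication, since the diff constructs `InferenceClient()` with no arguments: the client then falls back to a cached `huggingface-cli login` token or the `HF_TOKEN` environment variable. A hedged sketch of passing the token explicitly instead (the `HF_TOKEN` entry in `.env` is an assumption, not something this commit adds):

```python
# Sketch under the assumption that HF_TOKEN is defined in .env; the commit itself
# relies on InferenceClient() resolving a token on its own.
from os import getenv
from dotenv import load_dotenv
from huggingface_hub import InferenceClient

load_dotenv()
client = InferenceClient(token=getenv("HF_TOKEN"))  # token=None behaves like the no-arg form
```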
automatic_speech_recognition.py CHANGED
@@ -1,19 +1,19 @@
- import gc
  from functools import partial
+ from os import path, unlink
  import gradio as gr
  import numpy as np
- import torch
- from transformers import pipeline
- from utils import get_pytorch_device, spaces_gpu, resample_audio, get_model_sample_rate, request_audio, get_torch_dtype
+ from huggingface_hub import InferenceClient
+ from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio
 
- @spaces_gpu
- def automatic_speech_recognition(model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
- """Transcribe audio to text using a Whisper or similar ASR model.
+ def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
+ """Transcribe audio to text using Hugging Face Inference API.
 
  This function converts speech audio into text transcription. The audio is
- resampled to match the model's expected sample rate, then processed locally.
+ resampled to match the model's expected sample rate, saved to a temporary
+ file, and then sent to the Inference API for transcription.
 
  Args:
+ client: Hugging Face InferenceClient instance for API calls.
  model: Hugging Face model ID to use for automatic speech recognition.
  audio: Tuple containing:
  - int: Sample rate of the input audio (e.g., 44100 Hz)
@@ -24,61 +24,27 @@ def automatic_speech_recognition(model: str, audio: tuple[int, bytes | np.ndarra
 
  Note:
  - Audio is automatically resampled to match the model's expected sample rate.
- - Uses safetensors for secure model loading.
- - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
- - Cleans up model and GPU memory after inference.
+ - Audio is saved as a WAV file for InferenceClient compatibility.
+ - Automatically cleans up temporary files after transcription.
+ - Uses Inference API to offload model loading and inference to Hugging Face's
+ infrastructure, which is more suitable for environments with limited GPU memory
+ or time constraints (like Hugging Face Spaces with Zero GPU).
  """
- pytorch_device = get_pytorch_device()
- target_sample_rate = get_model_sample_rate(model)
-
- # Resample audio to target sample rate
- audio_array = resample_audio(target_sample_rate, audio)
-
- # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
- # reduces memory consumption by not storing gradients. This can significantly reduce the
- # amount of memory used during the inference phase.
- dtype = get_torch_dtype()
- model_kwargs = {"use_safetensors": True}
- if dtype is not None:
- model_kwargs["dtype"] = dtype
-
- # Load and run ASR pipeline
- asr_pipeline = pipeline(
- "automatic-speech-recognition",
- model=model,
- device=0 if pytorch_device == "cuda" else -1,
- model_kwargs=model_kwargs
- )
- # Use return_timestamps="word" for long audio (>30 seconds) to avoid errors
- # Using "word" ensures WhisperTimeStampLogitsProcessor is properly used during generation
- # Set task='transcribe' and language='en' to avoid deprecation warnings and language detection
- # Note: sampling_rate is not passed here since audio is already resampled to the model's expected rate
- with torch.no_grad():
- result = asr_pipeline(
- audio_array,
- return_timestamps="word",
- task="transcribe",
- language="en"
- )
-
- # Clean up GPU memory
- del asr_pipeline
- if pytorch_device == "cuda":
- torch.cuda.empty_cache()
- gc.collect()
- # Extract text from result (works for both short and long audio)
- if isinstance(result, dict) and "text" in result:
+ temp_file_path = None
+ try:
+ target_sample_rate = get_model_sample_rate(model)
+ temp_file_path = save_audio_to_temp_file(target_sample_rate, audio)
+ result = client.automatic_speech_recognition(temp_file_path, model=model)
  return result["text"]
- elif isinstance(result, str):
- return result
- else:
- # Fallback: try to extract text from chunks if present
- if isinstance(result, dict) and "chunks" in result:
- return " ".join(chunk.get("text", "") for chunk in result["chunks"] if isinstance(chunk, dict))
- return str(result)
+ finally:
+ if temp_file_path and path.exists(temp_file_path):
+ try:
+ unlink(temp_file_path)
+ except Exception:
+ pass # Ignore clean-up errors.
 
 
- def create_asr_tab(model: str):
+ def create_asr_tab(client: InferenceClient, model: str):
  """Create the automatic speech recognition tab in the Gradio interface.
 
  This function sets up all UI components for automatic speech recognition, including:
@@ -88,6 +54,7 @@ def create_asr_tab(model: str):
  - Transcribe button and output textbox
 
  Args:
+ client: Hugging Face InferenceClient instance to pass to the automatic_speech_recognition function.
  model: Hugging Face model ID to use for automatic speech recognition.
  """
  gr.Markdown("Transcribe audio to text.")
@@ -102,7 +69,7 @@ def create_asr_tab(model: str):
  audio_transcription_generate_button = gr.Button("Transcribe")
  audio_transcription_output = gr.Textbox(label="Text")
  audio_transcription_generate_button.click(
- fn=partial(automatic_speech_recognition, model),
+ fn=partial(automatic_speech_recognition, client, model),
  inputs=audio_transcription_audio_input,
  outputs=audio_transcription_output
  )
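
The new code path depends on `save_audio_to_temp_file` from `utils.py`, which is not among the changed files, so its exact behavior is not visible here. A hypothetical sketch of what such a helper might look like (the use of `soundfile` and the elided resampling step are assumptions):

```python
# Hypothetical stand-in for utils.save_audio_to_temp_file; the real helper may differ.
import tempfile
import numpy as np
import soundfile as sf  # assumption: soundfile (or an equivalent) is used to write WAV files

def save_audio_to_temp_file(target_sample_rate: int, audio: tuple[int, bytes | np.ndarray]) -> str:
    """Write the (sample_rate, data) tuple to a temporary WAV file and return its path."""
    source_sample_rate, data = audio
    if isinstance(data, bytes):
        data = np.frombuffer(data, dtype=np.int16)
    # The real helper presumably resamples from source_sample_rate to target_sample_rate
    # (the removed code above called utils.resample_audio for this); that step is elided here.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        sf.write(tmp.name, data, target_sample_rate)
        return tmp.name  # caller is responsible for deleting the file
```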
text_to_image.py CHANGED
@@ -1,63 +1,36 @@
- import gc
  from functools import partial
  import gradio as gr
- import torch
- from os import getenv
  from PIL.Image import Image
- from diffusers import DiffusionPipeline
- from utils import get_pytorch_device, spaces_gpu, get_torch_dtype
+ from huggingface_hub import InferenceClient
 
 
- @spaces_gpu
- def text_to_image(model: str, prompt: str) -> Image:
- """Generate an image from a text prompt using a diffusion model.
+ def text_to_image(client: InferenceClient, model: str, prompt: str) -> Image:
+ """Generate an image from a text prompt using Hugging Face Inference API.
 
- This function uses a diffusion pipeline (e.g., Stable Diffusion, FLUX) to generate
- images from text prompts. The model is loaded, inference is performed, and then
- cleaned up to free GPU memory.
+ This function uses the Hugging Face Inference API to generate images from text prompts.
+ This approach offloads the model loading and inference to Hugging Face's infrastructure,
+ which is more suitable for environments with limited GPU memory or time constraints
+ (like Hugging Face Spaces with Zero GPU).
 
  Args:
+ client: Hugging Face InferenceClient instance for API calls.
  model: Hugging Face model ID to use for text-to-image generation.
  prompt: Text description of the desired image.
 
  Returns:
  PIL Image object representing the generated image.
-
- Note:
- - Uses safetensors for secure model loading.
- - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
- - Cleans up model and GPU memory after inference.
  """
- pytorch_device = get_pytorch_device()
- dtype = get_torch_dtype()
-
- # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
- # reduces memory consumption by not storing gradients. This can significantly reduce the
- # amount of memory used during the inference phase.
- pipe = DiffusionPipeline.from_pretrained(
- model,
- use_safetensors=True,
- dtype=dtype
- )
- pipe = pipe.to(pytorch_device)
- with torch.no_grad():
- result = pipe(prompt).images[0]
-
- # Clean up GPU memory
- del pipe
- if pytorch_device == "cuda":
- torch.cuda.empty_cache()
- gc.collect()
- return result
+ return client.text_to_image(prompt, model=model)
 
 
- def create_text_to_image_tab(model: str):
+ def create_text_to_image_tab(client: InferenceClient, model: str):
  """Create the text-to-image generation tab in the Gradio interface.
 
  This function sets up all UI components for text-to-image generation,
  including input textbox, generate button, and output image display.
 
  Args:
+ client: Hugging Face InferenceClient instance to pass to the text_to_image function.
  model: Hugging Face model ID to use for text-to-image generation.
  """
  gr.Markdown("Generate an image from a text prompt.")
@@ -65,7 +38,7 @@ def create_text_to_image_tab(model: str):
  text_to_image_generate_button = gr.Button("Generate")
  text_to_image_output = gr.Image(label="Image", type="pil")
  text_to_image_generate_button.click(
- fn=partial(text_to_image, model),
+ fn=partial(text_to_image, client, model),
  inputs=text_to_image_prompt,
  outputs=text_to_image_output
  )
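
As a usage example, the rewritten `text_to_image` can be exercised outside the Gradio tab roughly like this (illustrative only; in the app the model ID comes from the `TEXT_TO_IMAGE_MODEL` environment variable):

```python
# Illustrative call of the new Inference API-backed function; the model ID is an example.
from huggingface_hub import InferenceClient
from text_to_image import text_to_image

client = InferenceClient()
image = text_to_image(client, "black-forest-labs/FLUX.1-dev", "a lighthouse at dusk, oil painting")
image.save("lighthouse.png")  # the function returns a PIL.Image.Image
```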