File size: 4,306 Bytes
979bbdf
55d79e2
b4d819b
b71a3ad
39d9406
 
 
 
 
 
24f37c6
dc382c8
 
 
5bebd85
 
 
 
 
dc382c8
55d79e2
 
b71a3ad
55d79e2
 
 
 
 
24f37c6
 
55d79e2
b71a3ad
5bebd85
 
b71a3ad
 
55d79e2
 
 
 
 
 
24f37c6
 
5bebd85
b71a3ad
55d79e2
 
 
 
 
 
24f37c6
dc382c8
 
5bebd85
 
 
 
 
 
dc382c8
 
 
 
 
b71a3ad
d56b9d9
55d79e2
dc382c8
5c395b2
caf2559
55d79e2
bb6107f
b71a3ad
4c71b8b
55d79e2
24f37c6
5c395b2
979bbdf
dc382c8
979bbdf
 
dc382c8
 
55d79e2
b71a3ad
55d79e2
 
 
 
 
24f37c6
 
55d79e2
dc382c8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from dotenv import load_dotenv
from os import getenv
import gradio as gr
from huggingface_hub import InferenceClient
from automatic_speech_recognition import create_asr_tab
from chatbot import create_chatbot_tab
from image_classification import create_image_classification_tab
from image_to_text import create_image_to_text_tab
from text_to_image import create_text_to_image_tab
from text_to_speech import create_text_to_speech_tab
from translation import create_translation_tab


class App:
    """Tabbed Gradio front end for the AI Building Blocks gallery.

    An instance holds the shared inference client plus one model ID per
    building block; ``run`` assembles the interface from the individual
    tab builders and launches it.
    """

    def __init__(
        self,
        client: InferenceClient,
        text_to_image_model: str,
        image_to_text_model: str,
        image_classification_model: str,
        text_to_speech_model: str,
        audio_transcription_model: str,
        chat_model: str,
        fallback_translation_model: str
    ):
        """Store the inference client and the model ID used by each tab.

        Args:
            client: Hugging Face InferenceClient instance; it is handed to
                the text-to-image and ASR tab builders.
            text_to_image_model: Model ID for text-to-image generation.
            image_to_text_model: Model ID for image captioning.
            image_classification_model: Model ID for image classification.
            text_to_speech_model: Model ID for text-to-speech.
            audio_transcription_model: Model ID for automatic speech
                recognition.
            chat_model: Model ID for the chatbot.
            fallback_translation_model: Translation model ID passed to the
                translation tab as its fallback.
        """
        # Shared client (only some tab builders receive it).
        self.client = client

        # Image-related models.
        self.text_to_image_model = text_to_image_model
        self.image_to_text_model = image_to_text_model
        self.image_classification_model = image_classification_model

        # Audio-related models.
        self.text_to_speech_model = text_to_speech_model
        self.audio_transcription_model = audio_transcription_model

        # Text-related models.
        self.chat_model = chat_model
        self.fallback_translation_model = fallback_translation_model

    def run(self):
        """Build the Gradio Blocks interface and launch it.

        A heading and a short description are rendered first, followed by
        one tab per building block. Each tab's content is produced by the
        matching ``create_*_tab`` builder. The interface is launched as the
        last step, while the Blocks context is still open.
        """
        # One entry per tab, in creation order:
        # (tab label, builder function, positional arguments for the builder).
        tab_specs = [
            (
                "Text-to-image Generation",
                create_text_to_image_tab,
                (self.client, self.text_to_image_model),
            ),
            (
                "Image-to-text or Image Captioning",
                create_image_to_text_tab,
                (self.image_to_text_model,),
            ),
            (
                "Image Classification",
                create_image_classification_tab,
                (self.image_classification_model,),
            ),
            (
                "Text-to-speech (TTS)",
                create_text_to_speech_tab,
                (self.text_to_speech_model,),
            ),
            (
                "Automatic Speech Recognition (ASR)",
                create_asr_tab,
                (self.client, self.audio_transcription_model),
            ),
            (
                "Chat",
                create_chatbot_tab,
                (self.chat_model,),
            ),
            (
                "Translation to English",
                create_translation_tab,
                (self.fallback_translation_model,),
            ),
        ]

        with gr.Blocks(title="AI Building Blocks") as demo:
            gr.Markdown("# AI Building Blocks")
            gr.Markdown("A gallery of building blocks for building AI applications")
            with gr.Tabs():
                for label, build_tab, build_args in tab_specs:
                    # The builder must run inside its own Tab context so its
                    # components are attached to that tab.
                    with gr.Tab(label):
                        build_tab(*build_args)

            demo.launch()


if __name__ == "__main__":
    # Script entry point: configure the app from environment variables and
    # start it. load_dotenv() runs before any getenv() lookup below.
    load_dotenv()

    # The client is created before the model IDs are looked up.
    client = InferenceClient()

    # App keyword argument -> environment variable that supplies its model ID.
    model_env_vars = {
        "text_to_image_model": "TEXT_TO_IMAGE_MODEL",
        "image_to_text_model": "IMAGE_TO_TEXT_MODEL",
        "image_classification_model": "IMAGE_CLASSIFICATION_MODEL",
        "text_to_speech_model": "TEXT_TO_SPEECH_MODEL",
        "audio_transcription_model": "AUDIO_TRANSCRIPTION_MODEL",
        "chat_model": "CHAT_MODEL",
        "fallback_translation_model": "FALLBACK_TRANSLATION_MODEL",
    }

    # getenv() returns None for a variable that is not set; that value is
    # passed through to App unchanged.
    model_ids = {
        param: getenv(env_var) for param, env_var in model_env_vars.items()
    }

    app = App(client=client, **model_ids)
    app.run()