import os
import gradio as gr
from huggingface_hub import InferenceClient

# Read the Hugging Face token from the environment (set as a Space Secret)
HF_TOKEN = os.environ.get("HF_TOKEN")
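# Fail fast if the token is missing (a small guard added here as a sketch;
# the original assumes the HF_TOKEN secret is always configured)
if not HF_TOKEN:
    raise RuntimeError("HF_TOKEN is not set; add it to your environment or Space Secrets.")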

# Initialize one inference client per model in the cascade
client_main = InferenceClient(token=HF_TOKEN, model="meta-llama/Llama-3.1-8B-Instruct")
client_aux1 = InferenceClient(token=HF_TOKEN, model="google/flan-t5-large")
client_aux2 = InferenceClient(token=HF_TOKEN, model="facebook/bart-large-cnn")

# Main response function: chains the three models in sequence
def respond(message, history, system_message, max_tokens, temperature, top_p):
    try:
        # --- Step 1: Llama 3.1 (chat completion) ---
        # InferenceClient.conversational was deprecated and later removed;
        # chat_completion is the current API and takes an OpenAI-style
        # message list, so the system message goes in directly.
        result_main = client_main.chat_completion(
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": message},
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p
        )
        response_main = result_main.choices[0].message.content

        # --- Step 2: FLAN-T5 rewrites the answer ---
        # text_generation returns the generated string directly
        # (it only returns an output object when details=True)
        response_aux1 = client_aux1.text_generation(
            prompt=f"Rewrite this text clearly:\n{response_main}",
            max_new_tokens=max_tokens
        )

        # --- Step 3: BART condenses the result ---
        # facebook/bart-large-cnn is served as a summarization model, so call
        # the summarization task rather than text_generation; an instruction
        # prefix would only be treated as part of the input text anyway
        result_aux2 = client_aux2.summarization(response_aux1)
        response_aux2 = result_aux2.summary_text

    except Exception as e:
        response_aux2 = f"Error while generating the response: {e}"

    # Append the turn in the messages format used by gr.Chatbot(type="messages")
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": response_aux2})

    return history, history
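
# Quick sanity check for respond outside the Gradio UI (a sketch with
# hypothetical example values, useful to verify the cascade end to end):
#   out, _ = respond("Hi there!", [], "You are a helpful bot.", 256, 0.7, 0.95)
#   print(out[-1]["content"])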

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 Cascading Chatbot (Llama 3.1 + FLAN-T5 + BART)")

    system_message = gr.Textbox(
        value="You are a friendly and helpful chatbot.",
        label="System Message"
    )

    chatbot = gr.Chatbot(type="messages")  # matches the role/content dicts built in respond
    msg = gr.Textbox(label="Type your message")
    max_tokens = gr.Slider(50, 2048, 512, step=50, label="Max Tokens")
    temperature = gr.Slider(0.1, 1.0, 0.7, step=0.05, label="Temperature")
    top_p = gr.Slider(0.1, 1.0, 0.95, step=0.05, label="Top-p (nucleus sampling)")
    
    history = gr.State([])

    # respond already matches the submit signature, so wire it up directly
    msg.submit(
        respond,
        inputs=[msg, history, system_message, max_tokens, temperature, top_p],
        outputs=[chatbot, history]
    )

if __name__ == "__main__":
    demo.launch()
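
# To run outside a Space (assumes only that gradio and huggingface_hub are
# installed and that HF_TOKEN grants Inference API access):
#   pip install gradio huggingface_hub
#   export HF_TOKEN=hf_...   # your token here
#   python app.py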