# app.py # Chatbot em cascata para Hugging Face Space / execução local # - Llama 3.1 (entrada) # - FLAN-T5 (reformulação) # - BART (resumo em 3 frases) # # Requisitos (no Space): defina HF_TOKEN nos Secrets. # Variáveis opcionais para troca de modelos: # - LLAMA_MODEL (padrao: meta-llama/Llama-3.1-8B-Instruct) # - AUX1_MODEL (padrao: google/flan-t5-large) # - AUX2_MODEL (padrao: facebook/bart-large-cnn) # # Use: python app.py # Recomendações: requirements.txt com gradio, huggingface-hub, transformers, accelerate, etc. import os import traceback import logging from typing import List, Dict, Any, Tuple import gradio as gr from huggingface_hub import InferenceClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger("cascade_chatbot") HF_TOKEN = os.environ.get("HF_TOKEN") DEFAULT_LLAMA_MODEL = os.environ.get("LLAMA_MODEL", "meta-llama/Llama-3.1-8B-Instruct") DEFAULT_AUX1 = os.environ.get("AUX1_MODEL", "google/flan-t5-large") DEFAULT_AUX2 = os.environ.get("AUX2_MODEL", "facebook/bart-large-cnn") if not HF_TOKEN: logger.warning("HF_TOKEN não encontrado nas variáveis de ambiente. Configure nos Secrets do Space ou no ambiente local.") # ------------------------- # Inicializa clientes HF # ------------------------- try: client_main = InferenceClient(token=HF_TOKEN, model=DEFAULT_LLAMA_MODEL) client_aux1 = InferenceClient(token=HF_TOKEN, model=DEFAULT_AUX1) client_aux2 = InferenceClient(token=HF_TOKEN, model=DEFAULT_AUX2) except Exception: logger.exception("Falha ao inicializar InferenceClient(s). Verifique HF_TOKEN e nomes dos modelos.") # Criar objetos None para evitar crash imediato; erros aparecerão ao tentar usar client_main = None client_aux1 = None client_aux2 = None # ------------------------- # Helpers # ------------------------- def _messages_to_prompt(messages: List[Dict[str, str]]) -> str: lines = [] for m in messages: role = m.get("role", "user") content = m.get("content", "") lines.append(f"{role.upper()}: {content}") lines.append("ASSISTANT:") return "\n".join(lines) def _extract_text_from_response(obj: Any) -> str: if obj is None: return "" # Common atributos for attr in ("content", "text", "generated_text", "generation_text"): if hasattr(obj, attr): try: v = getattr(obj, attr) if isinstance(v, str): return v return str(v) except Exception: pass try: choices = None if hasattr(obj, "choices"): choices = obj.choices elif isinstance(obj, dict) and "choices" in obj: choices = obj["choices"] if choices: first = choices[0] if isinstance(first, dict): if "message" in first and isinstance(first["message"], dict) and "content" in first["message"]: return first["message"]["content"] if "text" in first: return first["text"] if "content" in first: return first["content"] if hasattr(first, "message"): msg = first.message if isinstance(msg, dict) and "content" in msg: return msg["content"] if hasattr(first, "text"): return first.text except Exception: pass try: if hasattr(obj, "generations") and len(obj.generations) > 0: g = obj.generations[0] if isinstance(g, dict) and "text" in g: return g["text"] if hasattr(g, "text"): return g.text except Exception: pass try: if isinstance(obj, dict): for k in ("text", "content", "generated_text"): if k in obj and isinstance(obj[k], str): return obj[k] except Exception: pass try: return str(obj) except Exception: return "" # ------------------------- # Chamadas robustas ao InferenceClient # ------------------------- def call_model_with_messages(client: InferenceClient, messages: List[Dict[str, str]], max_new_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.95) -> Any: """ Tenta múltiplas assinaturas (chat_completion, client.chat, text_generation, etc). Registra exceções completas para diagnóstico. """ def try_call(method, /, *pos_args, **kw_args): try: # Não imprimir todo messages no log — resumir safe_kw = {k: ("[MESSAGES]" if k == "messages" else v) for k, v in kw_args.items()} logger.info("Tentando %s pos=%s kwargs=%s", getattr(method, "__name__", str(method)), pos_args, safe_kw) return method(*pos_args, **kw_args) except Exception: logger.exception("Falha ao chamar %s", getattr(method, "__name__", str(method))) return None # Tentar obter nome do modelo model_name = getattr(client, "model", None) or DEFAULT_LLAMA_MODEL # 1) chat_completion try: cc = getattr(client, "chat_completion", None) if cc: # a) cc(model=..., messages=...) res = try_call(cc, model=model_name, messages=messages, max_new_tokens=max_new_tokens, temperature=temperature) if res is not None: return res # b) cc(messages=..., model=...) res = try_call(cc, messages=messages, model=model_name, max_new_tokens=max_new_tokens, temperature=temperature) if res is not None: return res # c) cc.create(...) if hasattr(cc, "create"): res = try_call(cc.create, model=model_name, messages=messages, max_new_tokens=max_new_tokens, temperature=temperature) if res is not None: return res # d) positional res = try_call(cc, messages) if res is not None: return res except Exception: logger.exception("Erro no bloco chat_completion") # 2) client.chat namespace try: chat_ns = getattr(client, "chat", None) if chat_ns: if hasattr(chat_ns, "create"): res = try_call(chat_ns.create, model=model_name, messages=messages, max_new_tokens=max_new_tokens, temperature=temperature) if res is not None: return res if hasattr(chat_ns, "chat_completion") and hasattr(chat_ns.chat_completion, "create"): res = try_call(chat_ns.chat_completion.create, model=model_name, messages=messages, max_new_tokens=max_new_tokens, temperature=temperature) if res is not None: return res res = try_call(chat_ns, model_name, messages) if res is not None: return res except Exception: logger.exception("Erro no bloco chat namespace") # 3) text_generation prompt = _messages_to_prompt(messages) try: if hasattr(client, "text_generation"): res = try_call(client.text_generation, prompt=prompt, max_new_tokens=max_new_tokens, temperature=temperature) if res is not None: return res if hasattr(client, "generate") and callable(client.generate): res = try_call(client.generate, prompt=prompt, max_new_tokens=max_new_tokens) if res is not None: return res except Exception: logger.exception("Erro no bloco text_generation/generate") # 4) última tentativa: explorar métodos candidatos candidate_methods = [m for m in dir(client) if any(k in m for k in ("create", "generate", "complete", "run"))] for name in candidate_methods: try: method = getattr(client, name) if callable(method): res = try_call(method, messages=messages) if res is not None: return res res = try_call(method, prompt) if res is not None: return res res = try_call(method, messages) if res is not None: return res except Exception: logger.exception("Erro testando candidato %s", name) # falhou todas as tentativas debug = {"available_attrs": dir(client), "messages_sample": messages[:3]} logger.error("Todas as tentativas falharam. Debug: %s", debug) raise RuntimeError(f"Não foi possível chamar o cliente HF com as assinaturas testadas. Debug: {debug}") # ------------------------- # Pipeline: Llama -> FLAN -> BART # ------------------------- def pipeline_cascade(user_message: str, system_message: str, max_tokens: int, temperature: float, top_p: float) -> Tuple[str, List[str]]: """ Executa a cascata: Llama (client_main) -> FLAN (client_aux1) -> BART (client_aux2). Retorna o texto final e um log de passos. """ logs = [] # Monta mensagens messages = [{"role": "system", "content": system_message or ""}, {"role": "user", "content": user_message}] try: logs.append("1) Chamando Llama (entrada)") response_main_obj = call_model_with_messages(client_main, messages, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p) response_main = _extract_text_from_response(response_main_obj) logs.append(f"-> Llama respondeu (resumo): {response_main[:300]}") # Aux1: FLAN-T5 - reformular logs.append("2) Chamando FLAN-T5 (reformular)") prompt_aux1 = f"Reformule este texto de forma clara e concisa:\n{response_main}" try: if client_aux1 and hasattr(client_aux1, "text_generation"): res_a1 = client_aux1.text_generation(prompt=prompt_aux1, max_new_tokens=max(128, max_tokens // 4)) elif client_aux1 and hasattr(client_aux1, "completions") and hasattr(client_aux1.completions, "create"): res_a1 = client_aux1.completions.create(prompt=prompt_aux1, max_new_tokens=max(128, max_tokens // 4)) else: res_a1 = None response_aux1 = _extract_text_from_response(res_a1) if res_a1 is not None else response_main logs.append(f"-> FLAN-T5 respondeu (resumo): {response_aux1[:300]}") except Exception: logs.append("FLAN-T5 falhou; usando resposta do Llama") response_aux1 = response_main # Aux2: BART - resumo em 3 frases logs.append("3) Chamando BART (resumo em 3 frases)") prompt_aux2 = f"Resuma este texto em 3 frases:\n{response_aux1}" try: if client_aux2 and hasattr(client_aux2, "text_generation"): res_a2 = client_aux2.text_generation(prompt=prompt_aux2, max_new_tokens=150) elif client_aux2 and hasattr(client_aux2, "completions") and hasattr(client_aux2.completions, "create"): res_a2 = client_aux2.completions.create(prompt=prompt_aux2, max_new_tokens=150) else: res_a2 = None response_aux2 = _extract_text_from_response(res_a2) if res_a2 is not None else response_aux1 logs.append(f"-> BART respondeu (resumo): {response_aux2[:300]}") except Exception: logs.append("BART falhou; usando resposta do passo anterior") response_aux2 = response_aux1 except Exception as e: tb = traceback.format_exc(limit=5) logger.exception("Erro pipeline principal: %s", e) response_aux2 = f"Erro ao gerar resposta: {e}\n\nTraceback (curto):\n{tb}" logs.append("Erro no pipeline: " + str(e)) return response_aux2, logs # ------------------------- # Gradio App # ------------------------- with gr.Blocks(title="Chatbot em Cascata - Llama + FLAN + BART") as demo: gr.Markdown("## Trabalho Acadêmico FMU - Chatbot em Cascata\n" "Fluxo: **Llama (entrada)** → **FLAN-T5 (reformulação)** → **BART(resumo)**\n\n" "Disciplina: INTELIGÊNCIA ARTIFICIAL E APRENDIZADO DE MÁQUINA") with gr.Row(): with gr.Column(scale=2): system_message = gr.Textbox(value="Você é um chatbot racional e alegre.", label="System Message", lines=2) chatbot = gr.Chatbot(label="Chat") user_input = gr.Textbox(label="Digite sua mensagem", placeholder="Digite aqui...") max_tokens = gr.Slider(50, 2048, value=512, step=50, label="Max Tokens") temperature = gr.Slider(0.0, 1.0, value=0.7, step=0.05, label="Temperature") top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)") history = gr.State([]) def submit_handler(msg, history, system_message, max_tokens, temperature, top_p): # roda pipeline e atualiza histórico out_text, logs = pipeline_cascade(msg, system_message, int(max_tokens), float(temperature), float(top_p)) history.append({"role": "user", "content": msg}) history.append({"role": "assistant", "content": out_text}) # exibimos também logs no console (útil) logger.info("Pipeline logs:\n%s", "\n".join(logs)) return history, history user_input.submit(submit_handler, inputs=[user_input, history, system_message, max_tokens, temperature, top_p], outputs=[chatbot, history]) btn_send = gr.Button("Enviar") btn_send.click(submit_handler, inputs=[user_input, history, system_message, max_tokens, temperature, top_p], outputs=[chatbot, history]) with gr.Column(scale=1): gr.Markdown("### Informações sobre o Projeto\n" "Painel feito para descrever as **configurações**, **testar a geração** e sobre os **envolvidos**:") model_info_md = f""" **Modelos usados:** - Llama (input): `{DEFAULT_LLAMA_MODEL}` - Aux 1 (reformulação): `{DEFAULT_AUX1}` - Aux 2 (resumo): `{DEFAULT_AUX2}` **Como foram configurados:** - Cada modelo é instanciado via `InferenceClient(token=HF_TOKEN, model=)`. - Chamadas preferenciais: - Para chat: `client.chat_completion(messages=..., model=...)` (quando disponível) - Fallback: `client.text_generation(prompt=...)` - Ajustes de inferência controlados pelo usuário: `max_tokens`, `temperature`, `top_p`. - Logs de diagnóstico são gravados (úteis se houver erros de assinatura/permissão). """ gr.Markdown(model_info_md) # Self-test: roda testes com mensagens predefinidas e mostra o resultado test_output = gr.Textbox(label="Resultado do Self-Test", lines=12, interactive=False) def run_self_test(system_message, max_tokens, temperature, top_p): msgs = [ "Explique resumidamente o que é a técnica de regressão linear.", "Resuma em 1 frase as vantagens de usar validação cruzada.", "Como posso autenticar usuários em uma aplicação web?" ] accumulated = [] for m in msgs: out, logs = pipeline_cascade(m, system_message, int(max_tokens), float(temperature), float(top_p)) accumulated.append("INPUT: " + m) accumulated.append("OUTPUT: " + out) accumulated.append("LOGS: " + " | ".join(logs)) accumulated.append("-" * 40) return "\n".join(accumulated) btn_test = gr.Button("Run self-test") btn_test.click(run_self_test, inputs=[system_message, max_tokens, temperature, top_p], outputs=[test_output]) gr.Markdown( "### Disciplina: INTELIGÊNCIA ARTIFICIAL E APRENDIZADO DE MÁQUINA\n" "- Trabalho N2\n" "- Turma Noturna de Bacharelado em Ciências da Computação 2025.\n" "- Integrantes:\n " "- Lucas Antonini - 1722631\n " "- Carlos Eduardo da Silva - 1961011\n " "- Felipe Rios Amaral - 1847080 \n" "- Kawrê Britto de Oliveira - 2260931\n" "- Miguel Putini Alfano - 2879347 ") if __name__ == "__main__": demo.launch()