apply new backend

- .gitignore +2 -0
- src/content/agent.py +13 -2
- src/content/common.py +97 -88
- src/exceptions.py +0 -4
- src/generation.py +0 -140
- src/retrieval.py +0 -75
- src/tunnel.py +0 -72
.gitignore ADDED
@@ -0,0 +1,2 @@
+.venv/
+__pycache__/
src/content/agent.py CHANGED
@@ -1,7 +1,10 @@
+import os
+import requests
+
 import numpy as np
 import streamlit as st
 
-from src.retrieval import STANDARD_QUERIES
+from src.retrieval import STANDARD_QUERIES
 from src.content.common import (
     MODEL_NAMES,
     AUDIO_SAMPLES_W_INSTRUCT,
@@ -17,6 +20,9 @@ from src.content.common import (
 )
 
 
+API_BASE_URL = os.getenv('API_BASE_URL')
+
+
 LLM_NO_AUDIO_PROMPT_TEMPLATE = """{user_question}"""
 
 
@@ -96,7 +102,12 @@ def _prepare_final_prompt_with_ui(one_time_prompt):
         return LLM_NO_AUDIO_PROMPT_TEMPLATE.format(user_question=one_time_prompt)
 
     with st.spinner("Searching appropriate querys..."):
-        [… 1 removed line illegible in source view …]
+        response = requests.get(
+            f"{API_BASE_URL}retrieve_relevant_docs",
+            params={"user_question": one_time_prompt}
+        )
+        relevant_query_indices = response.json()
+
         if len(st.session_state.ag_messages) <= 2:
             relevant_query_indices.append(0)
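The spinner block above now delegates query retrieval to an HTTP backend: agent.py issues GET {API_BASE_URL}retrieve_relevant_docs?user_question=... and expects a JSON list of indices into STANDARD_QUERIES. Note that both call sites build URLs by plain concatenation (f"{API_BASE_URL}retrieve_relevant_docs"), so the API_BASE_URL environment variable must end with a trailing slash (e.g. http://backend:8000/). The backend itself is not part of this commit; the following is a hypothetical FastAPI sketch of the contract the frontend assumes, with a trivial stand-in for the QueryRetriever deleted from src/retrieval.py further down.

# Hypothetical backend sketch (not part of this commit): serves the
# GET {API_BASE_URL}retrieve_relevant_docs call issued in agent.py above.
from typing import List

from fastapi import FastAPI

app = FastAPI()

def get_relevant_doc_indices(user_question: str) -> List[int]:
    # Stand-in for the QueryRetriever removed from src/retrieval.py below;
    # a real backend would score the question against STANDARD_QUERIES embeddings.
    keywords = {0: "transcribe", 1: "translate"}  # illustrative only
    return [i for i, kw in keywords.items() if kw in user_question.lower()]

@app.get("/retrieve_relevant_docs")
def retrieve_relevant_docs(user_question: str) -> List[int]:
    # The frontend reads this JSON array directly via response.json().
    return get_relevant_doc_indices(user_question)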
src/content/common.py CHANGED
@@ -1,6 +1,8 @@
 import os
+import re
 import copy
 import base64
+import requests
 import itertools
 from collections import OrderedDict
 from typing import List, Optional
@@ -8,17 +10,11 @@ from typing import List, Optional
 import numpy as np
 import streamlit as st
 
-from src.tunnel import start_server
-from src.retrieval import load_retriever
 from src.logger import load_logger
 from src.utils import array_to_bytes, bytes_to_array, postprocess_voice_transcription
-from src.generation import (
-    FIXED_GENERATION_CONFIG,
-    MAX_AUDIO_LENGTH,
-    load_model,
-    retrive_response
-)
+from src.generation import FIXED_GENERATION_CONFIG, MAX_AUDIO_LENGTH
 
+API_BASE_URL = os.getenv('API_BASE_URL')
 
 PLAYGROUND_DIALOGUE_STATES = dict(
     pg_audio_base64='',
@@ -65,46 +61,26 @@ DEFAULT_DIALOGUE_STATE_DICTS = [
 ]
 
 
-MODEL_NAMES = OrderedDict({
+MODEL_NAMES = OrderedDict({
+    "llm": {
+        "vllm_name": "MERaLiON-Gemma",
+        "model_name": "MERaLiON-Gemma",
+        "ui_name": "MERaLiON-Gemma"
+    },
+    "audiollm": {
+        "vllm_name": "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION",
+        "model_name": "MERaLiON-AudioLLM-Whisper-SEA-LION",
+        "ui_name": "MERaLiON-AudioLLM"
+    },
+    "audiollm-it": {
+        "vllm_name": "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION-it",
+        "model_name": "MERaLiON-AudioLLM-Whisper-SEA-LION-it",
+        "ui_name": "MERaLiON-AudioLLM-Instruction-Tuning"
+    }
+})
 
 
 AUDIO_SAMPLES_W_INSTRUCT = {
-    "female_pilot#1": {
-        "apperance": "Female Pilot Interview: Transcription",
-        "instructions": [
-            "Please transcribe the speech"
-        ]
-    },
-    "female_pilot#2": {
-        "apperance": "Female Pilot Interview: Aircraft name",
-        "instructions": [
-            "What does 大力士 mean in the conversation"
-        ]
-    },
-    "female_pilot#3": {
-        "apperance": "Female Pilot Interview: Air Force Personnel Count",
-        "instructions": [
-            "How many air force personnel are there?"
-        ]
-    },
-    "female_pilot#4": {
-        "apperance": "Female Pilot Interview: Air Force Personnel Name",
-        "instructions": [
-            "Can you tell me the names of the two pilots?"
-        ]
-    },
-    "female_pilot#5": {
-        "apperance": "Female Pilot Interview: Pilot Seat Restriction",
-        "instructions": [
-            "What is the concern of having a big butt for pilot?"
-        ]
-    },
-    "female_pilot#6": {
-        "apperance": "Female Pilot Interview: Conversation Mood",
-        "instructions": [
-            "What is the mood of the conversation?"
-        ]
-    },
     "7_ASR_IMDA_PART3_30_ASR_v2_2269": {
         "apperance": "7. Automatic Speech Recognition task: conversation in Singapore accent",
         "instructions": [
@@ -358,13 +334,40 @@ AUDIO_SAMPLES_W_INSTRUCT = {
         "instructions": [
             "Please follow the instruction in the speech."
         ]
+    },
+    "female_pilot#1": {
+        "apperance": "Female Pilot Interview: Transcription",
+        "instructions": [
+            "Please transcribe the speech"
+        ]
+    },
+    "female_pilot#2": {
+        "apperance": "Female Pilot Interview: Aircraft name",
+        "instructions": [
+            "What does 大力士 mean in the conversation"
+        ]
+    },
+    "female_pilot#3": {
+        "apperance": "Female Pilot Interview: Air Force Personnel Count",
+        "instructions": [
+            "How many air force personnel are there?"
+        ]
+    },
+    "female_pilot#4": {
+        "apperance": "Female Pilot Interview: Air Force Personnel Name",
+        "instructions": [
+            "Can you tell me the names of the two pilots?"
+        ]
+    },
+    "female_pilot#5": {
+        "apperance": "Female Pilot Interview: Conversation Mood",
+        "instructions": [
+            "What is the mood of the conversation?"
+        ]
     }
 }
 
 
-exec(os.getenv('APP_CONFIGS'))
-
-
 def reset_states(*state_dicts):
     for states in state_dicts:
         st.session_state.update(copy.deepcopy(states))
@@ -403,14 +406,6 @@ def init_state_section():
         st.session_state.logger = load_logger()
         st.session_state.session_id = st.session_state.logger.register_session()
 
-    if "server" not in st.session_state:
-        st.session_state.server = start_server()
-
-    if "client_mapper" not in st.session_state:
-        st.session_state.client_mapper = load_model()
-
-    if "retriever" not in st.session_state:
-        st.session_state.retriever = load_retriever()
 
     for key, value in FIXED_GENERATION_CONFIG.items():
         if key not in st.session_state:
@@ -551,54 +546,68 @@ def retrive_response_with_ui(
     if history is None:
         history = []
 
-    [… 17 removed lines illegible in source view …]
-        base64_audio_input=base64_audio_input,
-        history=history,
-        **generation_params,
-        **kwargs
-    )
-
-    if error_msg:
-        st.error(error_msg)
+    # Prepare request data
+    request_data = {
+        "text_input": str(text_input),
+        "model_name": str(model_name),
+        "array_audio_input": array_audio_input.tolist(),  # Convert numpy array to list
+        "base64_audio_input": str(base64_audio_input) if base64_audio_input else None,
+        "history": list(history) if history else None,
+        "stream": bool(stream),
+        "max_completion_tokens": int(st.session_state.max_completion_tokens),
+        "temperature": float(st.session_state.temperature),
+        "top_p": float(st.session_state.top_p),
+        "repetition_penalty": float(st.session_state.repetition_penalty),
+        "top_k": int(st.session_state.top_k),
+        "length_penalty": float(st.session_state.length_penalty),
+        "seed": int(st.session_state.seed),
+        "extra_params": {}
+    }
 
-    [… 2 removed lines illegible in source view …]
-            st.warning(warning_msg)
+    # print(request_data)
+    # print(model_name)
 
+    error_msg = ""
+    warnings = []
     response = ""
-    [… 1 removed line illegible in source view …]
+
+    try:
         if stream:
-            [… 1 removed line illegible in source view …]
+            # Streaming response
+            response_stream = requests.post(f"{API_BASE_URL}chat", json=request_data, stream=True)
+            response_stream.raise_for_status()
+
+            response_obj = itertools.chain([prefix], (chunk.decode() for chunk in response_stream))
             response = st.write_stream(response_obj)
         else:
-            [… 1 removed line illegible in source view …]
+            # Non-streaming response
+            api_response = requests.post(f"{API_BASE_URL}chat", json=request_data)
+            api_response.raise_for_status()
+            result = api_response.json()
+
+            if "warnings" in result:
+                warnings = result["warnings"]
+
+            response = result.get("response", "")
             if normalise_response:
                 response = postprocess_voice_transcription(response)
             response = prefix + response
            st.write(response)
 
+    except requests.exceptions.RequestException as e:
+        error_msg = f"API request failed: {str(e)}"
+        st.error(error_msg)
+
+    if show_warning:
+        for warning_msg in warnings:
+            st.warning(warning_msg)
+
     st.session_state.logger.register_query(
         session_id=st.session_state.session_id,
         base64_audio=base64_audio_input,
         text_input=text_input,
         history=history,
-        params=[… truncated in source view …]
+        params=request_data["extra_params"],
         response=response,
         warnings=warnings,
         error_msg=error_msg
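The rewritten retrive_response_with_ui therefore assumes a backend exposing POST {API_BASE_URL}chat: when "stream" is true the reply body is consumed as raw text chunks via st.write_stream, otherwise a JSON object with a "response" string and an optional "warnings" list is expected. That backend is not included in this commit; the sketch below is a hypothetical FastAPI endpoint satisfying this contract, with an echo standing in for model inference.

# Hypothetical /chat endpoint matching the request_data payload built above.
from typing import Any, Dict

from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.post("/chat")
def chat(payload: Dict[str, Any]):
    # Placeholder for real inference on payload["text_input"] and the audio fields.
    answer = f"Echo: {payload.get('text_input', '')}"
    if payload.get("stream"):
        # Each yielded string reaches the client as one chunk of response_stream.
        def token_stream():
            for token in answer.split():
                yield token + " "
        return StreamingResponse(token_stream(), media_type="text/plain")
    # Non-streaming shape read by result.get("response", "") / result["warnings"].
    return {"response": answer, "warnings": []}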
src/exceptions.py CHANGED
@@ -1,6 +1,2 @@
 class NoAudioException(Exception):
-    pass
-
-
-class TunnelNotRunningException(Exception):
     pass
src/generation.py CHANGED
@@ -1,15 +1,3 @@
-import os
-import re
-import time
-from typing import List, Dict, Optional
-
-import numpy as np
-import streamlit as st
-from openai import OpenAI, APIConnectionError
-
-from src.exceptions import TunnelNotRunningException
-
-
 FIXED_GENERATION_CONFIG = dict(
     max_completion_tokens=1024,
     top_k=50,
@@ -20,25 +8,6 @@ FIXED_GENERATION_CONFIG = dict(
 MAX_AUDIO_LENGTH = 120
 
 
-def load_model() -> Dict:
-    """
-    Create an OpenAI client with connection to vllm server.
-    """
-    openai_api_key = os.getenv('API_KEY')
-    local_ports = os.getenv('LOCAL_PORTS').split(" ")
-
-    name_to_client_mapper = {}
-    for port in local_ports:
-        client = OpenAI(
-            api_key=openai_api_key,
-            base_url=f"http://localhost:{port}/v1",
-        )
-
-        for model in client.models.list().data:
-            name_to_client_mapper[model.id] = client
-
-    return name_to_client_mapper
-
 
 def prepare_multimodal_content(text_input, base64_audio_input):
     return [
@@ -76,112 +45,3 @@ def change_multimodal_content(
     }
 
     return original_content
-
-
-
-def _retrive_response(
-    model: str,
-    text_input: str,
-    base64_audio_input: str,
-    history: Optional[List] = None,
-    **kwargs):
-    """
-    Send request through OpenAI client.
-    """
-    if history is None:
-        history = []
-
-    if base64_audio_input:
-        content = [
-            {
-                "type": "text",
-                "text": f"Text instruction: {text_input}"
-            },
-            {
-                "type": "audio_url",
-                "audio_url": {
-                    "url": f"data:audio/ogg;base64,{base64_audio_input}"
-                },
-            },
-        ]
-    else:
-        content = text_input
-
-    current_client = st.session_state.client_mapper[model]
-
-    return current_client.chat.completions.create(
-        messages=history + [{"role": "user", "content": content}],
-        model=model,
-        **kwargs
-    )
-
-
-def _retry_retrive_response_throws_exception(retry=3, **kwargs):
-    try:
-        response_object = _retrive_response(**kwargs)
-    except APIConnectionError as e:
-        if not st.session_state.server.is_running():
-            if retry == 0:
-                raise TunnelNotRunningException()
-
-            st.toast(f":warning: Internet connection is down. Trying to re-establish connection ({retry}).")
-
-            if st.session_state.server.is_down():
-                st.session_state.server.restart()
-            elif st.session_state.server.is_starting():
-                time.sleep(2)
-
-            return _retry_retrive_response_throws_exception(retry-1, **kwargs)
-        raise e
-
-    return response_object
-
-
-def _validate_input(text_input, array_audio_input) -> List[str]:
-    """
-    TODO: improve the input validation regex.
-    """
-    warnings = []
-    if re.search("tool|code|python|java|math|calculate", text_input):
-        warnings.append("WARNING: MERaLiON-AudioLLM is not intended for use in tool calling, math, and coding tasks.")
-
-    if re.search(r'[\u4e00-\u9fff]+', text_input):
-        warnings.append("NOTE: Please try to prompt in English for the best performance.")
-
-    if array_audio_input.shape[0] == 0:
-        warnings.append("NOTE: Please specify audio from examples or local files.")
-
-    if array_audio_input.shape[0] / 16000 > 30.0:
-        warnings.append((
-            "WARNING: MERaLiON-AudioLLM is trained to process audio up to **30 seconds**."
-            f" Audio longer than **{MAX_AUDIO_LENGTH} seconds** will be truncated."
-        ))
-
-    return warnings
-
-
-def retrive_response(
-    text_input: str,
-    array_audio_input: np.ndarray,
-    **kwargs
-):
-    warnings = _validate_input(text_input, array_audio_input)
-
-    response_object, error_msg = None, ""
-    try:
-        response_object = _retry_retrive_response_throws_exception(
-            text_input=text_input,
-            **kwargs
-        )
-    except TunnelNotRunningException:
-        error_msg = "Internet connection cannot be established. Please contact the administrator."
-    except Exception as e:
-        error_msg = f"Caught Exception: {repr(e)}. Please contact the administrator."
-
-    return error_msg, warnings, response_object
-
-
-def postprocess_voice_transcription(text):
-    text = re.sub("<.*>:?|\(.*\)|\[.*\]", "", text)
-    text = re.sub("\s+", " ", text).strip()
-    return text
src/retrieval.py CHANGED
@@ -1,10 +1,3 @@
-from typing import List
-
-import numpy as np
-import streamlit as st
-from FlagEmbedding import BGEM3FlagModel
-
-
 STANDARD_QUERIES = [
     {
         "query_text": "Please transcribe this speech.",
@@ -43,71 +36,3 @@ STANDARD_QUERIES = [
         "ui_text": "emotion recognition"
     },
 ]
-
-
-def _colbert_score(q_reps, p_reps):
-    """Compute colbert scores of input queries and passages.
-
-    Args:
-        q_reps (np.ndarray): Multi-vector embeddings for queries.
-        p_reps (np.ndarray): Multi-vector embeddings for passages/corpus.
-
-    Returns:
-        torch.Tensor: Computed colbert scores.
-    """
-    # q_reps, p_reps = torch.from_numpy(q_reps), torch.from_numpy(p_reps)
-    token_scores = np.einsum('in,jn->ij', q_reps, p_reps)
-    scores = token_scores.max(-1)
-    scores = np.sum(scores) / q_reps.shape[0]
-    return scores
-
-class QueryRetriever:
-    def __init__(self, docs):
-        self.model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
-        self.docs = docs
-        self.doc_vectors = self.model.encode(
-            [d["doc_text"] for d in self.docs],
-            return_sparse=True,
-            return_colbert_vecs=True
-        )
-        self.scorer_attrs = {
-            "lexical_weights": {
-                "method": self.model.compute_lexical_matching_score,
-                "weight": 0.2
-            },
-            "colbert_vecs": {
-                "method": _colbert_score,
-                "weight": 0.8
-            },
-        }
-
-    def get_relevant_doc_indices(self, prompt, normalize=False) -> np.ndarray:
-        scores = np.zeros(len(self.docs))
-
-        if not prompt:
-            return scores
-
-        prompt_vector = self.model.encode(
-            prompt,
-            return_sparse=True,
-            return_colbert_vecs=True
-        )
-
-        for scorer_name, scorer_attrs in self.scorer_attrs.items():
-            for i, doc_vec in enumerate(self.doc_vectors[scorer_name]):
-                scores[i] += scorer_attrs["method"](prompt_vector[scorer_name], doc_vec)
-
-        if normalize:
-            scores = scores / np.sum(scores)
-        return scores
-
-
-@st.cache_resource()
-def load_retriever():
-    return QueryRetriever(docs=STANDARD_QUERIES)
-
-
-def retrieve_relevant_docs(user_question: str) -> List[int]:
-    scores = st.session_state.retriever.get_relevant_doc_indices(user_question, normalize=True)
-    selected_indices = np.where(scores > 0.2)[0]
-    return selected_indices.tolist()
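For reference, the deleted _colbert_score takes each query token vector's maximum dot product over the passage token vectors and averages over query tokens (the scoring now presumably lives behind the retrieve_relevant_docs endpoint). A small self-contained numpy check of that formula:

# Minimal numpy check of the deleted _colbert_score formula:
# score = mean over query tokens of (max over passage tokens of dot product).
import numpy as np

q_reps = np.array([[1.0, 0.0], [0.0, 1.0]])              # 2 query token vectors
p_reps = np.array([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]])  # 3 passage token vectors

token_scores = np.einsum('in,jn->ij', q_reps, p_reps)    # pairwise dot products, shape (2, 3)
score = np.sum(token_scores.max(-1)) / q_reps.shape[0]
print(score)  # (0.9 + 0.8) / 2 = 0.85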
src/tunnel.py DELETED
@@ -1,72 +0,0 @@
-import io
-import os
-
-import paramiko
-import streamlit as st
-from sshtunnel import SSHTunnelForwarder
-
-
-DEFAULT_LOCAL_PORTS = "8000 8001"
-DEFAULT_REMOTE_PORTS = "8000 8001"
-
-
-@st.cache_resource()
-def start_server():
-    server = SSHTunnelManager()
-    server.start()
-    return server
-
-
-class SSHTunnelManager:
-    def __init__(self):
-        pkey = paramiko.RSAKey.from_private_key(io.StringIO(os.getenv('PRIVATE_KEY')))
-
-        self.server = SSHTunnelForwarder(
-            ssh_address_or_host=os.getenv('SERVER_DNS_NAME'),
-            ssh_username="ec2-user",
-            ssh_pkey=pkey,
-            local_bind_addresses=[
-                ("127.0.0.1", int(port))
-                for port in os.getenv('LOCAL_PORTS', DEFAULT_LOCAL_PORTS).split(" ")
-            ],
-            remote_bind_addresses=[
-                ("127.0.0.1", int(port))
-                for port in os.getenv('REMOTE_PORTS', DEFAULT_REMOTE_PORTS).split(" ")
-            ]
-        )
-
-        self._is_starting = False
-        self._is_running = False
-
-    def update_status(self):
-        if not self._is_starting:
-            self.server.check_tunnels()
-            self._is_running = all(
-                list(self.server.tunnel_is_up.values())
-            )
-        else:
-            self._is_running = False
-
-    def is_starting(self):
-        self.update_status()
-        return self._is_starting
-
-    def is_running(self):
-        self.update_status()
-        return self._is_running
-
-    def is_down(self):
-        self.update_status()
-        return (not self._is_running) and (not self._is_starting)
-
-    def start(self, *args, **kwargs):
-        if not self._is_starting:
-            self._is_starting = True
-            self.server.start(*args, **kwargs)
-            self._is_starting = False
-
-    def restart(self, *args, **kwargs):
-        if not self._is_starting:
-            self._is_starting = True
-            self.server.restart(*args, **kwargs)
-            self._is_starting = False