Commit 3bd411c by Alina Lozovskaya
Parent(s): 21147c0

Handle abrupt websocket closures with retry and safe shutdown
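start_up now wraps the realtime connection in a small retry loop: when the underlying websocket closes abruptly (websockets' ConnectionClosedError, e.g. "no close frame received or sent"), it reconnects up to three times with a short jittered backoff, while a clean exit from the receive loop stops retrying; a try/finally clears the stale self.connection reference in every case, and shutdown now closes the connection defensively. The control flow, as a minimal sketch (the connect and consume callables here are stand-ins for client.realtime.connect(...) and the event-handling loop in the diff below):

    import asyncio
    import random

    from websockets.exceptions import ConnectionClosedError

    async def run_with_retry(connect, consume, max_attempts: int = 3) -> None:
        # connect(): async context manager yielding a connection
        # consume(conn): iterates events; raises ConnectionClosedError on abrupt close
        for attempt in range(1, max_attempts + 1):
            async with connect() as conn:
                try:
                    await consume(conn)
                except ConnectionClosedError:
                    if attempt < max_attempts:
                        # small jittered backoff before reconnecting
                        await asyncio.sleep(1.0 + random.uniform(0, 0.5))
                        continue
                    raise  # attempts exhausted: propagate
                return  # receive loop ended normally: stop retrying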
src/reachy_mini_conversation_app/openai_realtime.py
CHANGED
@@ -1,5 +1,6 @@
 import json
 import base64
+import random
 import asyncio
 import logging
 from typing import Any, Tuple, Literal, cast
@@ -10,6 +11,7 @@ import gradio as gr
 from openai import AsyncOpenAI
 from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
 from numpy.typing import NDArray
+from websockets.exceptions import ConnectionClosedError
 
 from reachy_mini_conversation_app.tools import (
     ALL_TOOL_SPECS,
@@ -68,206 +70,227 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         return cast(NDArray[np.int16], resampled.astype(np.int16))
 
     async def start_up(self) -> None:
-        """Start the handler."""
+        """Start the handler with minimal retries on unexpected websocket closure."""
         self.client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
-        async with self.client.realtime.connect(model=config.MODEL_NAME) as conn:
-            try:
-                await conn.session.update(
-                    session={
-                        "type": "realtime",
-                        "instructions": SESSION_INSTRUCTIONS,
-                        "audio": {
-                            "input": {
-                                "format": {
-                                    "type": "audio/pcm",
-                                    "rate": self.target_input_rate,
-                                },
-                                "transcription": {
-                                    "model": "whisper-1",
-                                    "language": "en"
-                                },
-                                "turn_detection": {
-                                    "type": "server_vad",
-                                    "interrupt_response": True,
-                                },
-                            },
-                            "output": {
-                                "format": {
-                                    "type": "audio/pcm",
-                                    "rate": self.output_sample_rate,
-                                },
-                                "voice": "cedar",
-                            },
-                        },
-                        "tools": ALL_TOOL_SPECS,  # type: ignore[typeddict-item]
-                        "tool_choice": "auto",
-                    },
-                )
-            except Exception:
-                logger.exception("Realtime session.update failed; aborting startup")
-                return
-
-            logger.info("Realtime session updated successfully")
-
-            # Manage event received from the openai server
-            self.connection = conn
-            async for event in self.connection:
-                logger.debug(f"OpenAI event: {event.type}")
-                if event.type == "input_audio_buffer.speech_started":
-                    if hasattr(self, "_clear_queue") and callable(self._clear_queue):
-                        self._clear_queue()
-                    if self.deps.head_wobbler is not None:
-                        self.deps.head_wobbler.reset()
-                    self.deps.movement_manager.set_listening(True)
-                    logger.debug("User speech started")
-
-                if event.type == "input_audio_buffer.speech_stopped":
-                    self.deps.movement_manager.set_listening(False)
-                    logger.debug("User speech stopped - server will auto-commit with VAD")
-
-                if event.type in (
-                    "response.audio.done",  # GA
-                    "response.output_audio.done",  # GA alias
-                    "response.audio.completed",  # legacy (for safety)
-                    "response.completed",  # text-only completion
-                ):
-                    logger.debug("response completed")
-
-                if event.type == "response.created":
-                    logger.debug("Response created")
-
-                if event.type == "response.done":
-                    # Doesn't mean the audio is done playing
-                    logger.debug("Response done")
-
-                # Handle partial transcription (user speaking in real-time)
-                if event.type == "conversation.item.input_audio_transcription.partial":
-                    logger.debug(f"User partial transcript: {event.transcript}")
-                    await self.output_queue.put(
-                        AdditionalOutputs({"role": "user_partial", "content": event.transcript})
-                    )
-
-                # Handle completed transcription (user finished speaking)
-                if event.type == "conversation.item.input_audio_transcription.completed":
-                    logger.debug(f"User transcript: {event.transcript}")
-                    await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))
-
-                # Handle assistant transcription
-                if event.type in ("response.audio_transcript.done", "response.output_audio_transcript.done"):
-                    logger.debug(f"Assistant transcript: {event.transcript}")
-                    await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
-
-                # Handle audio delta
-                if event.type in ("response.audio.delta", "response.output_audio.delta"):
-                    if self.deps.head_wobbler is not None:
-                        self.deps.head_wobbler.feed(event.delta)
-                    self.last_activity_time = asyncio.get_event_loop().time()
-                    logger.debug("last activity time updated to %s", self.last_activity_time)
-                    await self.output_queue.put(
-                        (
-                            self.output_sample_rate,
-                            np.frombuffer(base64.b64decode(event.delta), dtype=np.int16).reshape(1, -1),
-                        ),
-                    )
-
-
-                # ---- tool-calling plumbing ----
-                if event.type == "response.function_call_arguments.done":
-                    tool_name = getattr(event, "name", None)
-                    args_json_str = getattr(event, "arguments", None)
-                    call_id = getattr(event, "call_id", None)
-
-                    if not isinstance(tool_name, str) or not isinstance(args_json_str, str):
-                        logger.error("Invalid tool call: tool_name=%s, args=%s", tool_name, args_json_str)
-                        continue
-
-                    try:
-                        tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
-                        logger.debug("Tool '%s' executed successfully", tool_name)
-                        logger.debug("Tool result: %s", tool_result)
-                    except Exception as e:
-                        logger.error("Tool '%s' failed", tool_name)
-                        tool_result = {"error": str(e)}
-
-                    # send the tool result back
-                    if isinstance(call_id, str):
-                        await self.connection.conversation.item.create(
-                            item={
-                                "type": "function_call_output",
-                                "call_id": call_id,
-                                "output": json.dumps(tool_result),
-                            },
-                        )
-
-                    await self.output_queue.put(
-                        AdditionalOutputs(
-                            {
-                                "role": "assistant",
-                                "content": json.dumps(tool_result),
-                                "metadata": {"title": f"🛠️ Used tool {tool_name}", "status": "done"},
-                            },
-                        ),
-                    )
-
-                    if tool_name == "camera" and "b64_im" in tool_result:
-                        # use raw base64, don't json.dumps (which adds quotes)
-                        b64_im = tool_result["b64_im"]
-                        if not isinstance(b64_im, str):
-                            logger.warning("Unexpected type for b64_im: %s", type(b64_im))
-                            b64_im = str(b64_im)
-                        await self.connection.conversation.item.create(
-                            item={
-                                "type": "message",
-                                "role": "user",
-                                "content": [
-                                    {
-                                        "type": "input_image",
-                                        "image_url": f"data:image/jpeg;base64,{b64_im}",
-                                    },
-                                ],
-                            },
-                        )
-                        logger.info("Added camera image to conversation")
-
-
-                        np_img = self.deps.camera_worker.get_latest_frame()
-                        img = gr.Image(value=np_img)
-
-                        await self.output_queue.put(
-                            AdditionalOutputs(
-                                {
-                                    "role": "assistant",
-                                    "content": img,
-                                },
-                            ),
-                        )
-
-                    # if this tool call was triggered by an idle signal, don't make the robot speak
-                    # for other tool calls, let the robot reply out loud
-                    if self.is_idle_tool_call:
-                        self.is_idle_tool_call = False
-                    else:
-                        await self.connection.response.create(
-                            response={
-                                "instructions": "Use the tool result just returned and answer concisely in speech.",
-                            },
-                        )
-
-                    # re synchronize the head wobble after a tool call that may have taken some time
-                    if self.deps.head_wobbler is not None:
-                        self.deps.head_wobbler.reset()
-
-                # server error
-                if event.type == "error":
-                    err = getattr(event, "error", None)
-                    msg = getattr(err, "message", str(err) if err else "unknown error")
-                    code = getattr(err, "code", "")
-
-                    logger.error("Realtime error [%s]: %s (raw=%s)", code, msg, err)
-
-                    # Only show user-facing errors, not internal state errors
-                    if code not in ("input_audio_buffer_commit_empty", "conversation_already_has_active_response"):
-                        await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))
+
+        max_attempts = 3
+        for attempt in range(1, max_attempts + 1):
+            try:
+                async with self.client.realtime.connect(model=config.MODEL_NAME) as conn:
+                    try:
+                        await conn.session.update(
+                            session={
+                                "type": "realtime",
+                                "instructions": SESSION_INSTRUCTIONS,
+                                "audio": {
+                                    "input": {
+                                        "format": {
+                                            "type": "audio/pcm",
+                                            "rate": self.target_input_rate,
+                                        },
+                                        "transcription": {
+                                            "model": "whisper-1",
+                                            "language": "en"
+                                        },
+                                        "turn_detection": {
+                                            "type": "server_vad",
+                                            "interrupt_response": True,
+                                        },
+                                    },
+                                    "output": {
+                                        "format": {
+                                            "type": "audio/pcm",
+                                            "rate": self.output_sample_rate,
+                                        },
+                                        "voice": "cedar",
+                                    },
+                                },
+                                "tools": ALL_TOOL_SPECS,  # type: ignore[typeddict-item]
+                                "tool_choice": "auto",
+                            },
+                        )
+                    except Exception:
+                        logger.exception("Realtime session.update failed; aborting startup")
+                        return
+
+                    logger.info("Realtime session updated successfully")
+
+                    # Manage event received from the openai server
+                    self.connection = conn
+                    try:
+                        async for event in self.connection:
+                            logger.debug(f"OpenAI event: {event.type}")
+                            if event.type == "input_audio_buffer.speech_started":
+                                if hasattr(self, "_clear_queue") and callable(self._clear_queue):
+                                    self._clear_queue()
+                                if self.deps.head_wobbler is not None:
+                                    self.deps.head_wobbler.reset()
+                                self.deps.movement_manager.set_listening(True)
+                                logger.debug("User speech started")
+
+                            if event.type == "input_audio_buffer.speech_stopped":
+                                self.deps.movement_manager.set_listening(False)
+                                logger.debug("User speech stopped - server will auto-commit with VAD")
+
+                            if event.type in (
+                                "response.audio.done",  # GA
+                                "response.output_audio.done",  # GA alias
+                                "response.audio.completed",  # legacy (for safety)
+                                "response.completed",  # text-only completion
+                            ):
+                                logger.debug("response completed")
+
+                            if event.type == "response.created":
+                                logger.debug("Response created")
+
+                            if event.type == "response.done":
+                                # Doesn't mean the audio is done playing
+                                logger.debug("Response done")
+
+                            # Handle partial transcription (user speaking in real-time)
+                            if event.type == "conversation.item.input_audio_transcription.partial":
+                                logger.debug(f"User partial transcript: {event.transcript}")
+                                await self.output_queue.put(
+                                    AdditionalOutputs({"role": "user_partial", "content": event.transcript})
+                                )
+
+                            # Handle completed transcription (user finished speaking)
+                            if event.type == "conversation.item.input_audio_transcription.completed":
+                                logger.debug(f"User transcript: {event.transcript}")
+                                await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))
+
+                            # Handle assistant transcription
+                            if event.type in ("response.audio_transcript.done", "response.output_audio_transcript.done"):
+                                logger.debug(f"Assistant transcript: {event.transcript}")
+                                await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
+
+                            # Handle audio delta
+                            if event.type in ("response.audio.delta", "response.output_audio.delta"):
+                                if self.deps.head_wobbler is not None:
+                                    self.deps.head_wobbler.feed(event.delta)
+                                self.last_activity_time = asyncio.get_event_loop().time()
+                                logger.debug("last activity time updated to %s", self.last_activity_time)
+                                await self.output_queue.put(
+                                    (
+                                        self.output_sample_rate,
+                                        np.frombuffer(base64.b64decode(event.delta), dtype=np.int16).reshape(1, -1),
+                                    ),
+                                )
+
+                            # ---- tool-calling plumbing ----
+                            if event.type == "response.function_call_arguments.done":
+                                tool_name = getattr(event, "name", None)
+                                args_json_str = getattr(event, "arguments", None)
+                                call_id = getattr(event, "call_id", None)
+
+                                if not isinstance(tool_name, str) or not isinstance(args_json_str, str):
+                                    logger.error("Invalid tool call: tool_name=%s, args=%s", tool_name, args_json_str)
+                                    continue
+
+                                try:
+                                    tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
+                                    logger.debug("Tool '%s' executed successfully", tool_name)
+                                    logger.debug("Tool result: %s", tool_result)
+                                except Exception as e:
+                                    logger.error("Tool '%s' failed", tool_name)
+                                    tool_result = {"error": str(e)}
+
+                                # send the tool result back
+                                if isinstance(call_id, str):
+                                    await self.connection.conversation.item.create(
+                                        item={
+                                            "type": "function_call_output",
+                                            "call_id": call_id,
+                                            "output": json.dumps(tool_result),
+                                        },
+                                    )
+
+                                await self.output_queue.put(
+                                    AdditionalOutputs(
+                                        {
+                                            "role": "assistant",
+                                            "content": json.dumps(tool_result),
+                                            "metadata": {"title": f"🛠️ Used tool {tool_name}", "status": "done"},
+                                        },
+                                    ),
+                                )
+
+                                if tool_name == "camera" and "b64_im" in tool_result:
+                                    # use raw base64, don't json.dumps (which adds quotes)
+                                    b64_im = tool_result["b64_im"]
+                                    if not isinstance(b64_im, str):
+                                        logger.warning("Unexpected type for b64_im: %s", type(b64_im))
+                                        b64_im = str(b64_im)
+                                    await self.connection.conversation.item.create(
+                                        item={
+                                            "type": "message",
+                                            "role": "user",
+                                            "content": [
+                                                {
+                                                    "type": "input_image",
+                                                    "image_url": f"data:image/jpeg;base64,{b64_im}",
+                                                },
+                                            ],
+                                        },
+                                    )
+                                    logger.info("Added camera image to conversation")
+
+                                    if self.deps.camera_worker is not None:
+                                        np_img = self.deps.camera_worker.get_latest_frame()
+                                        img = gr.Image(value=np_img)
+
+                                        await self.output_queue.put(
+                                            AdditionalOutputs(
+                                                {
+                                                    "role": "assistant",
+                                                    "content": img,
+                                                },
+                                            ),
+                                        )
+
+                                # if this tool call was triggered by an idle signal, don't make the robot speak
+                                # for other tool calls, let the robot reply out loud
+                                if self.is_idle_tool_call:
+                                    self.is_idle_tool_call = False
+                                else:
+                                    await self.connection.response.create(
+                                        response={
+                                            "instructions": "Use the tool result just returned and answer concisely in speech.",
+                                        },
+                                    )
+
+                                # re synchronize the head wobble after a tool call that may have taken some time
+                                if self.deps.head_wobbler is not None:
+                                    self.deps.head_wobbler.reset()
+
+                            # server error
+                            if event.type == "error":
+                                err = getattr(event, "error", None)
+                                msg = getattr(err, "message", str(err) if err else "unknown error")
+                                code = getattr(err, "code", "")
+
+                                logger.error("Realtime error [%s]: %s (raw=%s)", code, msg, err)
+
+                                # Only show user-facing errors, not internal state errors
+                                if code not in ("input_audio_buffer_commit_empty", "conversation_already_has_active_response"):
+                                    await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))
+
+                    except ConnectionClosedError as e:
+                        # Abrupt close (e.g., "no close frame received or sent") → retry
+                        logger.warning(
+                            "Realtime websocket closed unexpectedly (attempt %d/%d): %s",
+                            attempt, max_attempts, e
+                        )
+                        if attempt < max_attempts:
+                            # small jittered backoff
+                            await asyncio.sleep(1.0 + random.uniform(0, 0.5))
+                            continue
+                        raise
+                    # Normal exit from the receive loop, stop retrying
+                    return
+            finally:
+                # never keep a stale reference
+                self.connection = None
 
     # Microphone receive
     async def receive(self, frame: Tuple[int, NDArray[np.int16]]) -> None:
@@ -305,8 +328,14 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
     async def shutdown(self) -> None:
         """Shutdown the handler."""
        if self.connection:
-            await self.connection.close()
-            self.connection = None
+            try:
+                await self.connection.close()
+            except ConnectionClosedError:
+                pass
+            except Exception as e:
+                logger.debug(f"connection.close() ignored: {e}")
+            finally:
+                self.connection = None
 
         # Clear any remaining items in the output queue
         while not self.output_queue.empty():
tests/test_openai_realtime.py
CHANGED
@@ -1,7 +1,12 @@
 import asyncio
+import logging
+from typing import Any
 from datetime import datetime, timezone
 from unittest.mock import MagicMock
 
+import pytest
+
+import reachy_mini_conversation_app.openai_realtime as rt_mod
 from reachy_mini_conversation_app.tools import ToolDependencies
 from reachy_mini_conversation_app.openai_realtime import OpenaiRealtimeHandler
 
@@ -27,3 +32,86 @@ def test_format_timestamp_uses_wall_clock() -> None:
     # Extract year from "[YYYY-MM-DD ...]"
     year = int(formatted[1:5])
     assert year == datetime.now(timezone.utc).year
+
+@pytest.mark.asyncio
+async def test_start_up_retries_on_abrupt_close(monkeypatch: Any, caplog: Any) -> None:
+    """First connection dies with ConnectionClosedError during iteration -> retried.
+
+    Second connection iterates cleanly (no events) -> start_up returns without raising.
+    Ensures handler clears self.connection at the end.
+    """
+    caplog.set_level(logging.WARNING)
+
+    # Use a local Exception as the module's ConnectionClosedError to avoid ws dependency
+    FakeCCE = type("FakeCCE", (Exception,), {})
+    monkeypatch.setattr(rt_mod, "ConnectionClosedError", FakeCCE)
+
+    # Make asyncio.sleep return immediately (for backoff)
+    async def _fast_sleep(*_a: Any, **_kw: Any) -> None: return None
+    monkeypatch.setattr(asyncio, "sleep", _fast_sleep, raising=False)
+
+    attempt_counter = {"n": 0}
+
+    class FakeConn:
+        """Minimal realtime connection stub."""
+
+        def __init__(self, mode: str):
+            self._mode = mode
+
+            class _Session:
+                async def update(self, **_kw: Any) -> None: return None
+            self.session = _Session()
+
+            class _InputAudioBuffer:
+                async def append(self, **_kw: Any) -> None: return None
+            self.input_audio_buffer = _InputAudioBuffer()
+
+            class _Item:
+                async def create(self, **_kw: Any) -> None: return None
+
+            class _Conversation:
+                item = _Item()
+            self.conversation = _Conversation()
+
+            class _Response:
+                async def create(self, **_kw: Any) -> None: return None
+                async def cancel(self, **_kw: Any) -> None: return None
+            self.response = _Response()
+
+        async def __aenter__(self) -> "FakeConn": return self
+        async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool: return False
+        async def close(self) -> None: return None
+
+        # Async iterator protocol
+        def __aiter__(self) -> "FakeConn": return self
+        async def __anext__(self) -> None:
+            if self._mode == "raise_on_iter":
+                raise FakeCCE("abrupt close (simulated)")
+            raise StopAsyncIteration  # clean exit (no events)
+
+    class FakeRealtime:
+        def connect(self, **_kw: Any) -> FakeConn:
+            attempt_counter["n"] += 1
+            mode = "raise_on_iter" if attempt_counter["n"] == 1 else "clean"
+            return FakeConn(mode)
+
+    class FakeClient:
+        def __init__(self, **_kw: Any) -> None: self.realtime = FakeRealtime()
+
+    # Patch the OpenAI client used by the handler
+    monkeypatch.setattr(rt_mod, "AsyncOpenAI", FakeClient)
+
+    # Build handler with minimal deps
+    deps = ToolDependencies(reachy_mini=MagicMock(), movement_manager=MagicMock())
+    handler = rt_mod.OpenaiRealtimeHandler(deps)
+
+    # Run: should retry once and exit cleanly
+    await handler.start_up()
+
+    # Validate: two attempts total (fail -> retry -> succeed), and connection cleared
+    assert attempt_counter["n"] == 2
+    assert handler.connection is None
+
+    # Optional: confirm we logged the unexpected close once
+    warnings = [r for r in caplog.records if r.levelname == "WARNING" and "closed unexpectedly" in r.msg]
+    assert len(warnings) == 1