Alina Lozovskaya committed · Commit 5593ee6 · Parent(s): 76ee3b3

Cleanup, fixes, logger

Changed files:
- src/reachy_mini_conversation_demo/audio/speech_tapper.py  +0 -13
- src/reachy_mini_conversation_demo/camera_worker.py  +1 -1
- src/reachy_mini_conversation_demo/config.py  +30 -9
- src/reachy_mini_conversation_demo/console.py  +11 -6
- src/reachy_mini_conversation_demo/main.py  +4 -3
- src/reachy_mini_conversation_demo/moves.py  +3 -22
- src/reachy_mini_conversation_demo/openai_realtime.py  +69 -19
- src/reachy_mini_conversation_demo/tools.py  +14 -221
- src/reachy_mini_conversation_demo/utils.py  +9 -7
- src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py  +0 -86
src/reachy_mini_conversation_demo/audio/speech_tapper.py
CHANGED
@@ -120,7 +120,6 @@ class SwayRollRT:
         self._seed = int(rng_seed)
         self.samples = deque(maxlen=10 * SR)  # sliding window for VAD/env
         self.carry = np.zeros(0, dtype=np.float32)
-        self.frame_idx = 0
 
         self.vad_on = False
         self.vad_above = 0

@@ -143,7 +142,6 @@ class SwayRollRT:
         """Reset state (VAD/env/buffers/time) but keep initial phases/seed."""
         self.samples.clear()
         self.carry = np.zeros(0, dtype=np.float32)
-        self.frame_idx = 0
         self.vad_on = False
         self.vad_above = 0
         self.vad_below = 0

@@ -152,16 +150,6 @@ class SwayRollRT:
         self.sway_down = 0
         self.t = 0.0
 
-    def reset_phases(self) -> None:
-        """Re-randomize phases deterministically from stored seed (Optional)."""
-        rng = np.random.default_rng(self._seed)
-        self.phase_pitch = float(rng.random() * 2 * math.pi)
-        self.phase_yaw = float(rng.random() * 2 * math.pi)
-        self.phase_roll = float(rng.random() * 2 * math.pi)
-        self.phase_x = float(rng.random() * 2 * math.pi)
-        self.phase_y = float(rng.random() * 2 * math.pi)
-        self.phase_z = float(rng.random() * 2 * math.pi)
-
     def feed(self, pcm: np.ndarray, sr: Optional[int]) -> List[Dict[str, float]]:
         """Stream in PCM chunk. Returns a list of sway dicts, one per hop (HOP_MS).

@@ -196,7 +184,6 @@ class SwayRollRT:
             self.samples.extend(hop.tolist())
             if len(self.samples) < FRAME:
                 self.t += HOP_MS / 1000.0
-                self.frame_idx += 1
                 continue
 
             frame = np.fromiter(
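For context on what `feed` is doing around the removed `frame_idx` counter: incoming PCM is accumulated into fixed-size hops, a carry buffer holds the remainder between calls, and a bounded deque keeps the sliding window used for VAD/energy analysis. Below is a minimal, self-contained sketch of that chunking pattern; the constants and the class name are illustrative stand-ins, not the module's actual values.

from collections import deque

import numpy as np

SR = 16000                     # assumed sample rate (Hz)
HOP_MS = 20                    # assumed hop size (ms)
HOP = SR * HOP_MS // 1000      # samples per hop
FRAME = 4 * HOP                # assumed analysis frame length


class HopBuffer:
    """Accumulate arbitrary-sized PCM chunks into fixed hops (illustrative only)."""

    def __init__(self) -> None:
        self.carry = np.zeros(0, dtype=np.float32)   # leftover samples between calls
        self.samples = deque(maxlen=10 * SR)         # sliding window for analysis

    def feed(self, pcm: np.ndarray) -> list:
        data = np.concatenate([self.carry, pcm.astype(np.float32)])
        ready = []
        while len(data) >= HOP:
            hop, data = data[:HOP], data[HOP:]
            self.samples.extend(hop.tolist())
            if len(self.samples) >= FRAME:           # skip hops until a full frame of context exists
                ready.append(hop)
        self.carry = data                            # remainder waits for the next call
        return ready


buf = HopBuffer()
hops = buf.feed(np.random.randn(5 * HOP + 7))
print(len(hops), len(buf.carry))                     # hops start flowing once FRAME samples accumulated; 7 samples carried over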
src/reachy_mini_conversation_demo/camera_worker.py
CHANGED
@@ -229,7 +229,7 @@ class CameraWorker:
                 time.sleep(0.01)
 
             except Exception as e:
-                logger.
+                logger.exception(f"Camera worker error: {e}")
                 time.sleep(0.1)  # Longer sleep on error
 
         logger.debug("Camera worker thread exited")
src/reachy_mini_conversation_demo/config.py
CHANGED
@@ -1,17 +1,28 @@
 import os
+import logging
+from pathlib import Path
 
 from dotenv import load_dotenv
 
+logger = logging.getLogger(__name__)
+
+# Check if .env file exists
+env_file = Path(".env")
+if not env_file.exists():
+    raise RuntimeError(
+        ".env file not found. Please create one based on .env.example:\n"
+        "  cp .env.example .env\n"
+        "Then add your OPENAI_API_KEY to the .env file."
+    )
+
+# Load .env and verify it was loaded successfully
+if not load_dotenv():
+    raise RuntimeError(
+        "Failed to load .env file. Please ensure the file is readable and properly formatted."
+    )
+
+logger.info("Configuration loaded from .env file")
 
 
 class Config:

@@ -19,13 +30,23 @@ class Config:
 
     # Required
     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-    if
-        raise RuntimeError(
+    if OPENAI_API_KEY is None:
+        raise RuntimeError(
+            "OPENAI_API_KEY is not set in .env file. Please add it:\n"
+            "  OPENAI_API_KEY=your_api_key_here"
+        )
+    if not OPENAI_API_KEY.strip():
+        raise RuntimeError(
+            "OPENAI_API_KEY is empty in .env file. Please provide a valid API key."
+        )
 
     # Optional
     MODEL_NAME = os.getenv("MODEL_NAME", "gpt-realtime")
     HF_HOME = os.getenv("HF_HOME", "./cache")
+    LOCAL_VISION_MODEL = os.getenv("LOCAL_VISION_MODEL", "HuggingFaceTB/SmolVLM2-2.2B-Instruct")
    HF_TOKEN = os.getenv("HF_TOKEN")  # Optional, falls back to hf auth login if not set
 
+    logger.debug(f"Model: {MODEL_NAME}, HF_HOME: {HF_HOME}, Vision Model: {LOCAL_VISION_MODEL}")
+
 
 config = Config()
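The net effect is that configuration problems now surface at import time, before any realtime connection is attempted. Below is a rough sketch of the same fail-fast idea in isolation, with an inline stand-in for python-dotenv so it runs on its own; the file name, helper name, and messages are illustrative, not the demo's actual API.

import os
from pathlib import Path


def load_settings(env_file: str = ".env") -> dict:
    """Fail fast with actionable errors instead of deferring to a runtime auth failure."""
    path = Path(env_file)
    if not path.exists():
        raise RuntimeError(f"{env_file} not found. Copy the example file and fill in your key.")

    # Minimal stand-in for python-dotenv: parse KEY=VALUE lines into the environment.
    for line in path.read_text().splitlines():
        line = line.strip()
        if line and not line.startswith("#") and "=" in line:
            key, _, value = line.partition("=")
            os.environ.setdefault(key.strip(), value.strip())

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key or not api_key.strip():
        raise RuntimeError("OPENAI_API_KEY is missing or empty in the .env file.")
    return {"OPENAI_API_KEY": api_key}

Any module that imports the resulting settings object gets the same guarantee the new config.py provides: either the key is present and non-empty, or startup stops with a message that says exactly what to fix.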
src/reachy_mini_conversation_demo/console.py
CHANGED
@@ -19,14 +19,19 @@ logger = logging.getLogger(__name__)
 class LocalStream:
     """LocalStream using Reachy Mini's recorder/player."""
 
-    def __init__(self,
-        """Initialize the stream with
-
+    def __init__(self, deps, robot: ReachyMini):
+        """Initialize the stream with tool dependencies and robot.
+
+        Args:
+            deps: ToolDependencies for the handler
+            robot: ReachyMini robot instance
+
+        """
         self._robot = robot
         self._stop_event = asyncio.Event()
         self._tasks = []
-        #
-        self.handler
+        # Create handler with callback to this instance's clear_audio_queue method
+        self.handler = OpenaiRealtimeHandler(deps, clear_audio_queue_callback=self.clear_audio_queue)
 
     def launch(self) -> None:
         """Start the recorder/player and run the async processing loops."""

@@ -69,7 +74,7 @@ class LocalStream:
         self._robot.media.stop_recording()
         self._robot.media.stop_playing()
 
-    def
+    def clear_audio_queue(self) -> None:
         """Flush the player's appsrc to drop any queued audio immediately."""
         logger.info("User intervention: flushing player queue")
         self.handler.output_queue = asyncio.Queue()
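The handler never imports LocalStream; LocalStream hands it a bound method instead, so a speech-start event can flush queued playback without a circular dependency. Below is a stripped-down sketch of that wiring, using hypothetical class names that only mirror the shape of the real ones.

import asyncio
from typing import Callable, Optional


class Handler:
    """Receives realtime events; knows nothing about the audio player."""

    def __init__(self, clear_audio_queue_callback: Optional[Callable[[], None]] = None) -> None:
        self._clear_audio_queue_callback = clear_audio_queue_callback
        self.output_queue: asyncio.Queue = asyncio.Queue()

    def on_speech_started(self) -> None:
        # Barge-in: let the owner drop any audio still queued for playback
        if self._clear_audio_queue_callback:
            self._clear_audio_queue_callback()


class Player:
    """Owns the handler and the playback queue."""

    def __init__(self) -> None:
        self.handler = Handler(clear_audio_queue_callback=self.clear_audio_queue)

    def clear_audio_queue(self) -> None:
        self.handler.output_queue = asyncio.Queue()   # drop everything queued so far


player = Player()
player.handler.on_speech_started()   # flushes the queue with no circular import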
src/reachy_mini_conversation_demo/main.py
CHANGED
@@ -74,11 +74,11 @@ def main():
     )
     logger.debug(f"Chatbot avatar images: {chatbot.avatar_images}")
 
-    handler = OpenaiRealtimeHandler(deps)
-
     stream_manager = None
 
     if args.gradio:
+        # Gradio mode: no LocalStream, so no audio queue callback needed
+        handler = OpenaiRealtimeHandler(deps)
         stream = Stream(
             handler=handler,
             mode="send-receive",

@@ -92,7 +92,8 @@ def main():
         app = FastAPI()
         app = gr.mount_gradio_app(app, stream.ui, path="/")
     else:
+        # Console mode: LocalStream creates handler internally with proper callback
+        stream_manager = LocalStream(deps, robot)
 
     # Each async service → its own thread/loop
     movement_manager.start()
src/reachy_mini_conversation_demo/moves.py
CHANGED
@@ -190,13 +190,7 @@ class MovementState:
         0.0,
     )
 
-    # Legacy movement state (for goto moves)
-    moving_start: float = 0.0
-    moving_for: float = 0.0
-
     # Status flags
-    is_playing_move: bool = False
-    is_moving: bool = False
     last_primary_pose: Optional[FullBodyPose] = None
 
     def update_activity(self) -> None:

@@ -325,7 +319,7 @@ class MovementManager:
         """
         self._command_queue.put(("queue_move", move))
 
-    def
+    def clear_move_queue(self) -> None:
         """Stop the active move and discard any queued primary moves.
 
         Thread-safe: executed by the worker thread via the command queue.

@@ -361,10 +355,6 @@ class MovementManager:
 
         return self._now() - last_activity >= self.idle_inactivity_delay
 
-    def mark_user_activity(self) -> None:
-        """Record external activity and postpone idle behaviours (thread-safe)."""
-        self._command_queue.put(("mark_activity", None))
-
     def set_listening(self, listening: bool) -> None:
         """Enable or disable listening mode without touching shared state directly.

@@ -427,7 +417,7 @@ class MovementManager:
             duration_str = str(duration)
         else:
             duration_str = "?"
-        logger.
+        logger.debug(
             "Queued move with duration %ss, queue size: %s",
             duration_str,
             len(self.move_queue),

@@ -438,7 +428,6 @@ class MovementManager:
             self.move_queue.clear()
             self.state.current_move = None
             self.state.move_start_time = None
-            self.state.is_playing_move = False
             self._breathing_active = False
             logger.info("Cleared move queue and stopped current move")
         elif command == "set_moving_state":

@@ -447,8 +436,6 @@ class MovementManager:
             except (TypeError, ValueError):
                 logger.warning("Invalid moving state duration: %s", payload)
                 return
-            self.state.moving_start = current_time
-            self.state.moving_for = max(0.0, duration)
             self.state.update_activity()
         elif command == "mark_activity":
             self.state.update_activity()

@@ -534,7 +521,7 @@ class MovementManager:
             self.state.current_move = None
             self.state.move_start_time = None
             self._breathing_active = False
-            logger.
+            logger.debug("Stopping breathing due to new move activity")
 
         if self.state.current_move is not None and not isinstance(self.state.current_move, BreathingMove):
             self._breathing_active = False

@@ -561,14 +548,9 @@ class MovementManager:
                 float(body_yaw),
             )
 
-            self.state.is_playing_move = True
-            self.state.is_moving = True
             self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
         else:
             # Otherwise reuse the last primary pose so we avoid jumps between moves
-            self.state.is_playing_move = False
-            self.state.is_moving = current_time - self.state.moving_start < self.state.moving_for
-
             if self.state.last_primary_pose is not None:
                 primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
             else:

@@ -746,7 +728,6 @@ class MovementManager:
             self._thread.join()
             self._thread = None
             logger.debug("Move worker stopped")
-            logger.debug("Move worker stopped")
 
     def get_status(self) -> dict[str, Any]:
         """Return a lightweight status snapshot for observability."""
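Note that clear_move_queue and its siblings never mutate MovementState from the calling thread; they post a command that the worker thread applies, which is what the "Thread-safe: executed by the worker thread via the command queue" docstrings refer to. Here is a minimal sketch of that pattern with made-up names, just to show the shape of the mechanism.

import queue
import threading


class Manager:
    """All state mutations happen on the worker thread via a command queue (illustrative)."""

    def __init__(self) -> None:
        self._commands: queue.Queue = queue.Queue()
        self.move_queue: list = []
        self._running = True
        self._thread = threading.Thread(target=self._worker, daemon=True)
        self._thread.start()

    # Public API: thread-safe, only enqueues commands
    def queue_move(self, move) -> None:
        self._commands.put(("queue_move", move))

    def clear_move_queue(self) -> None:
        self._commands.put(("clear_queue", None))

    def _worker(self) -> None:
        while self._running:
            try:
                command, payload = self._commands.get(timeout=0.1)
            except queue.Empty:
                continue
            if command == "queue_move":
                self.move_queue.append(payload)
            elif command == "clear_queue":
                self.move_queue.clear()


manager = Manager()
manager.queue_move("wave")       # safe from any thread
manager.clear_move_queue()       # applied by the worker, in order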
src/reachy_mini_conversation_demo/openai_realtime.py
CHANGED
@@ -15,6 +15,7 @@ from reachy_mini_conversation_demo.tools import (
     dispatch_tool_call,
 )
 from reachy_mini_conversation_demo.config import config
+from reachy_mini_conversation_demo.prompts import SESSION_INSTRUCTIONS
 
 
 logger = logging.getLogger(__name__)

@@ -23,19 +24,28 @@ logger = logging.getLogger(__name__)
 class OpenaiRealtimeHandler(AsyncStreamHandler):
     """An OpenAI realtime handler for fastrtc Stream."""
 
-    def __init__(self, deps: ToolDependencies):
-        """Initialize the handler.
+    def __init__(self, deps: ToolDependencies, clear_audio_queue_callback=None):
+        """Initialize the handler.
+
+        Args:
+            deps: Tool dependencies for executing tools
+            clear_audio_queue_callback: Optional callback to clear the audio queue when speech starts
+
+        """
         super().__init__(
             expected_layout="mono",
             output_sample_rate=24000,  # openai outputs
             input_sample_rate=16000,  # respeaker output
         )
         self.deps = deps
+        self._clear_audio_queue_callback = clear_audio_queue_callback
 
         self.connection = None
         self.output_queue = asyncio.Queue()
 
         self._pending_calls: dict[str, dict] = {}
+        self._response_in_progress = False
+        self._pending_response_queue = asyncio.Queue()
 
         self.last_activity_time = asyncio.get_event_loop().time()
         self.start_time = asyncio.get_event_loop().time()

@@ -43,7 +53,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 
     def copy(self):
         """Create a copy of the handler."""
-        return OpenaiRealtimeHandler(self.deps)
+        return OpenaiRealtimeHandler(self.deps, self._clear_audio_queue_callback)
 
     async def start_up(self):
         """Start the handler."""

@@ -59,7 +69,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 "language": "en",
             },
             "voice": "ballad",
-            "instructions":
+            "instructions": SESSION_INSTRUCTIONS,
             "tools": ALL_TOOL_SPECS,
             "tool_choice": "auto",
             "temperature": 0.7,

@@ -71,14 +81,15 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         async for event in self.connection:
             logger.debug(f"OpenAI event: {event.type}")
             if event.type == "input_audio_buffer.speech_started":
-                self.
+                if self._clear_audio_queue_callback:
+                    self._clear_audio_queue_callback()
                 self.deps.head_wobbler.reset()
                 self.deps.movement_manager.set_listening(True)
-                logger.debug("
+                logger.debug("User speech started")
 
             if event.type == "input_audio_buffer.speech_stopped":
                 self.deps.movement_manager.set_listening(False)
-                logger.debug("
+                logger.debug("User speech stopped")
 
             if event.type in ("response.audio.completed", "response.completed"):
                 # Doesn't seem to be called

@@ -87,19 +98,27 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 
             if event.type == "response.created":
                 logger.debug("response created")
+                self._response_in_progress = True
 
             if event.type == "response.done":
                 # Doesn't mean the audio is done playing
                 logger.debug("response done")
-
-                #
+                self._response_in_progress = False
+                # Process any queued response requests
+                if not self._pending_response_queue.empty():
+                    queued_params = await self._pending_response_queue.get()
+                    logger.debug("Processing queued response request")
+                    try:
+                        await self.connection.response.create(**queued_params)
+                    except Exception as e:
+                        logger.error(f"Failed to create queued response: {e}")
 
             if event.type == "conversation.item.input_audio_transcription.completed":
-                logger.debug(f"
+                logger.debug(f"User transcript: {event.transcript}")
                 await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))
 
             if event.type == "response.audio_transcript.done":
-                logger.debug(f"
+                logger.debug(f"Assistant transcript: {event.transcript}")
                 await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
 
             if event.type == "response.audio.delta":

@@ -144,10 +163,10 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 
                 try:
                     tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
-                    logger.debug("
+                    logger.debug("Tool '%s' executed successfully", tool_name)
                     logger.debug("Tool result: %s", tool_result)
                 except Exception as e:
-                    logger.
+                    logger.exception("Tool '%s' failed", tool_name)
                     tool_result = {"error": str(e)}
 
                 # send the tool result back

@@ -183,7 +202,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                     ],
                 }
             )
-            logger.info("
+            logger.info("Added camera image to conversation")
 
             np_img = self.deps.camera_worker.get_latest_frame()
             img = gr.Image(value=np_img)

@@ -198,7 +217,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
             )
 
             if not self.is_idle_tool_call:
-                await self.
+                await self._safe_create_response(
                     response={
                         "instructions": "Use the tool result just returned and answer concisely in speech."
                     }

@@ -215,8 +234,18 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
             if event.type == "error":
                 err = getattr(event, "error", None)
                 msg = getattr(err, "message", str(err) if err else "unknown error")
-
-
+                err_code = getattr(err, "code", None)
+
+                # Handle concurrent response error gracefully
+                if err_code == "conversation_already_has_active_response":
+                    logger.warning(
+                        "Attempted to create response while one is in progress. "
+                        "This is expected during rapid tool calls and will be handled automatically."
+                    )
+                    # Don't send error to user for this specific case
+                else:
+                    logger.error("Realtime error: %s (raw=%s)", msg, err)
+                    await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))
 
     # Microphone receive
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:

@@ -256,6 +285,28 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         dt = datetime.fromtimestamp(current_time)
         return f"[{dt.strftime('%Y-%m-%d %H:%M:%S')} | +{elapsed_seconds:.1f}s]"
 
+    async def _safe_create_response(self, **kwargs) -> None:
+        """Safely create a response, queuing if one is already in progress.
+
+        Args:
+            **kwargs: Arguments to pass to connection.response.create()
+
+        """
+        if self._response_in_progress:
+            logger.debug("Response already in progress, queuing request (expected during rapid tool calls)")
+            await self._pending_response_queue.put(kwargs)
+        else:
+            try:
+                await self.connection.response.create(**kwargs)
+            except Exception as e:
+                error_msg = str(e)
+                if "conversation_already_has_active_response" in error_msg:
+                    logger.warning("Race condition detected, queuing response request")
+                    await self._pending_response_queue.put(kwargs)
+                else:
+                    logger.error(f"Failed to create response: {e}")
+                    raise
+
     async def send_idle_signal(self, idle_duration) -> None:
         """Send an idle signal to the openai server."""
         logger.debug("Sending idle signal")

@@ -271,11 +322,10 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 "content": [{"type": "input_text", "text": timestamp_msg}],
             }
         )
-        await self.
+        await self._safe_create_response(
             response={
                 "modalities": ["text"],
                 "instructions": "You MUST respond with function calls only - no speech or text. Choose appropriate actions for idle behavior.",
                 "tool_choice": "required",
             }
         )
-        # TODO additional inputs
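The motivation for _safe_create_response and the new error branch is that the Realtime API rejects a response.create issued while another response is still active (the conversation_already_has_active_response error). The handler therefore tracks the in-flight state from the response.created / response.done events and defers extra requests. Below is a simplified, standalone sketch of that serialization logic, decoupled from the OpenAI client; the class and callable names are illustrative.

import asyncio


class ResponseGate:
    """Allow at most one in-flight response; queue the rest until 'done' arrives."""

    def __init__(self, create_fn) -> None:
        self._create_fn = create_fn            # async callable that actually starts a response
        self._in_progress = False
        self._pending: asyncio.Queue = asyncio.Queue()

    async def create(self, **kwargs) -> None:
        if self._in_progress:
            await self._pending.put(kwargs)    # defer until the active response finishes
        else:
            self._in_progress = True
            await self._create_fn(**kwargs)

    async def on_done(self) -> None:
        """Call when the server reports the active response is done."""
        self._in_progress = False
        if not self._pending.empty():
            queued = await self._pending.get()
            self._in_progress = True
            await self._create_fn(**queued)


async def demo() -> None:
    async def fake_create(**kwargs):
        print("creating response with", kwargs)

    gate = ResponseGate(fake_create)
    await gate.create(instructions="first")     # runs immediately
    await gate.create(instructions="second")    # queued
    await gate.on_done()                        # drains the queue


asyncio.run(demo())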
src/reachy_mini_conversation_demo/tools.py
CHANGED
@@ -1,7 +1,6 @@
 from __future__ import annotations
 import abc
 import json
-import time
 import asyncio
 import inspect
 import logging

@@ -12,12 +11,8 @@ from reachy_mini import ReachyMini
 from reachy_mini.utils import create_head_pose
 
 
-# from reachy_mini_conversation_demo.vision.processors import VisionManager
-
 logger = logging.getLogger(__name__)
 
-ENABLE_FACE_RECOGNITION = False
-
 # Initialize dance and emotion libraries
 try:
     from reachy_mini.motion.recorded_move import RecordedMoves

@@ -40,16 +35,6 @@ except ImportError as e:
     DANCE_AVAILABLE = False
     EMOTION_AVAILABLE = False
 
-FACE_RECOGNITION_AVAILABLE = False
-if ENABLE_FACE_RECOGNITION:
-    # Initialize face recognition
-    try:
-        from deepface import DeepFace
-
-        FACE_RECOGNITION_AVAILABLE = True
-    except ImportError as e:
-        logger.warning(f"DeepFace not available: {e}")
-
 
 def all_concrete_subclasses(base):
     """Recursively find all concrete (non-abstract) subclasses of a base class."""

@@ -76,30 +61,9 @@ class ToolDependencies:
     camera_worker: Optional[Any] = None  # CameraWorker for frame buffering
     vision_manager: Optional[Any] = None
     head_wobbler: Optional[Any] = None  # HeadWobbler for audio-reactive motion
-    camera_retry_attempts: int = 5
-    camera_retry_delay_s: float = 0.10
-    vision_timeout_s: float = 8.0
     motion_duration_s: float = 1.0
 
 
-# Helpers - removed _read_frame as it's no longer needed with camera worker
-
-
-def _execute_motion(deps: ToolDependencies, target: Any) -> Dict[str, Any]:
-    """Apply motion to reachy_mini and update movement_manager state."""
-    movement_manager = deps.movement_manager
-    movement_manager.moving_start = time.monotonic()
-    movement_manager.moving_for = deps.motion_duration_s
-    movement_manager.current_head_pose = target
-    try:
-        deps.reachy_mini.goto_target(target, duration=deps.motion_duration_s)
-    except Exception as e:
-        logger.exception("motion failed")
-        return {"error": f"motion failed: {type(e).__name__}: {e}"}
-
-    return {"status": "ok"}
-
-
 # Tool base class
 class Tool(abc.ABC):
     """Base abstraction for tools used in function-calling.

@@ -277,100 +241,6 @@ class HeadTracking(Tool):
         return {"status": f"head tracking {status}"}
 
 
-# class DescribeCurrentScene(Tool):
-#     name = "describe_current_scene"
-#     description = "Get a detailed description of the current scene."
-#     parameters_schema = {"type": "object", "properties": {}, "required": []}
-
-#     async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
-#         logger.info("Tool call: describe_current_scene")
-
-#         result = await deps.vision_manager.process_current_frame(
-#             "Describe what you currently see in detail, focusing on people, objects, and activities."
-#         )
-
-#         if isinstance(result, dict) and "error" in result:
-#             return result
-#         return result
-
-
-# class GetSceneContext(Tool):
-#     name = "get_scene_context"
-#     description = (
-#         "Get the most recent automatic scene description for conversational context."
-#     )
-#     parameters_schema = {"type": "object", "properties": {}, "required": []}
-
-#     async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
-#         logger.info("Tool call: get_scene_context")
-#         vision_manager = deps.vision_manager
-#         if not vision_manager:
-#             return {"error": "Vision processing not available"}
-
-#         try:
-#             description = await deps.vision_manager.get_current_description()
-
-#             if not description:
-#                 return {
-#                     "context": "No scene description available yet",
-#                     "note": "Vision processing may still be initializing",
-#                 }
-#             return {
-#                 "context": description,
-#                 "note": "This comes from periodic automatic analysis",
-#             }
-#         except Exception as e:
-#             logger.exception("Failed to get scene context")
-#             return {"error": f"Scene context failed: {type(e).__name__}: {e}"}
-
-
-# class AnalyzeSceneFor(Tool):
-#     name = "analyze_scene_for"
-#     description = "Analyze the current scene for a specific purpose."
-#     parameters_schema = {
-#         "type": "object",
-#         "properties": {
-#             "purpose": {
-#                 "type": "string",
-#                 "enum": [
-#                     "safety",
-#                     "people",
-#                     "objects",
-#                     "activity",
-#                     "navigation",
-#                     "general",
-#                 ],
-#                 "default": "general",
-#             }
-#         },
-#         "required": [],
-#     }
-
-#     async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
-#         purpose = (kwargs.get("purpose") or "general").lower()
-#         logger.info("Tool call: analyze_scene_for purpose=%s", purpose)
-
-#         prompts = {
-#             "safety": "Look for safety concerns, obstacles, or hazards.",
-#             "people": "Describe people, their positions and actions.",
-#             "objects": "Identify and describe main visible objects.",
-#             "activity": "Describe ongoing activities or actions.",
-#             "navigation": "Describe the space for navigation: obstacles, pathways, layout.",
-#             "general": "Give a general description of the scene including people, objects, and activities.",
-#         }
-#         prompt = prompts.get(purpose, prompts["general"])
-
-#         result = await deps.vision_manager.process_current_frame(prompt)
-
-#         if isinstance(result, dict) and "error" in result:
-#             return result
-
-#         if not isinstance(result, dict):
-#             return {"error": "vision returned non-dict"}
-
-#         result["analysis_purpose"] = purpose
-#         return result
-
 
 class Dance(Tool):
     """Play a named or random dance move once (or repeat). Non-blocking."""

@@ -461,25 +331,24 @@ class StopDance(Tool):
         """Stop the current dance move."""
         logger.info("Tool call: stop_dance")
         movement_manager = deps.movement_manager
-        movement_manager.
+        movement_manager.clear_move_queue()
         return {"status": "stopped dance and cleared queue"}
 
 
-def get_available_emotions_and_descriptions():
+def get_available_emotions_and_descriptions() -> str:
     """Get formatted list of available emotions with descriptions."""
-
-
-    ret = """
-Available emotions:
-
-"""
-
-    for name in names:
-        description = RECORDED_MOVES.get(name).description
-        ret += f"  - {name}: {description}\n"
-
-    return ret
+    if not EMOTION_AVAILABLE:
+        return "Emotions not available"
+
+    try:
+        names = RECORDED_MOVES.list_moves()
+        ret = "Available emotions:\n"
+        for name in names:
+            description = RECORDED_MOVES.get(name).description
+            ret += f"  - {name}: {description}\n"
+        return ret
+    except Exception as e:
+        return f"Error getting emotions: {e}"
 
 
 class PlayEmotion(Tool):
     """Play a pre-recorded emotion."""

@@ -549,70 +418,10 @@ class StopEmotion(Tool):
         """Stop the current emotion."""
         logger.info("Tool call: stop_emotion")
         movement_manager = deps.movement_manager
-        movement_manager.
+        movement_manager.clear_move_queue()
         return {"status": "stopped emotion and cleared queue"}
 
 
-class FaceRecognition(Tool):
-    """Get the name of the person you are talking to."""
-
-    name = "get_person_name"
-    description = "Get the name of the person you are talking to"
-    parameters_schema = {
-        "type": "object",
-        "properties": {
-            "dummy": {
-                "type": "boolean",
-                "description": "dummy boolean, set it to true",
-            }
-        },
-        "required": ["dummy"],
-    }
-
-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
-        """Get the name of the person you are talking to."""
-        if not FACE_RECOGNITION_AVAILABLE:
-            return {"error": "Face recognition not available"}
-
-        logger.info("Tool call: face_recognition")
-
-        try:
-            # Get frame from camera worker buffer (like main_works.py)
-            if deps.camera_worker is not None:
-                frame = deps.camera_worker.get_latest_frame()
-                if frame is None:
-                    logger.error("No frame available from camera worker")
-                    return {"error": "No frame available"}
-            else:
-                logger.error("Camera worker not available")
-                return {"error": "Camera worker not available"}
-
-            # Save frame temporarily (same as main_works.py pattern)
-            temp_path = "/tmp/face_recognition.jpg"
-            import cv2
-
-            cv2.imwrite(temp_path, frame)
-
-            # Use DeepFace to find face
-            results = await asyncio.to_thread(DeepFace.find, img_path=temp_path, db_path="./pollen_faces")
-
-            if len(results) == 0:
-                return {"error": "Didn't recognize the face"}
-
-            # Extract name from results
-            name = "Unknown"
-            for index, row in results[0].iterrows():
-                file_path = row["identity"]
-                name = file_path.split("/")[-2]
-                break
-
-            return {"answer": f"The name is {name}"}
-
-        except Exception as e:
-            logger.exception("Face recognition failed")
-            return {"error": f"Face recognition failed: {str(e)}"}
-
-
 class DoNothing(Tool):
     """Choose to do nothing - stay still and silent. Use when you want to be contemplative or just chill."""

@@ -636,22 +445,6 @@ class DoNothing(Tool):
         return {"status": "doing nothing", "reason": reason}
 
 
-def get_available_emotions_and_descriptions() -> str:
-    """Get formatted list of available emotions with descriptions."""
-    if not EMOTION_AVAILABLE:
-        return "Emotions not available"
-
-    try:
-        names = RECORDED_MOVES.list_moves()
-        ret = "Available emotions:\n"
-        for name in names:
-            description = RECORDED_MOVES.get(name).description
-            ret += f"  - {name}: {description}\n"
-        return ret
-    except Exception as e:
-        return f"Error getting emotions: {e}"
-
-
 # Registry & specs (dynamic)
 
 # List of available tool classes
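Deleting FaceRecognition and the commented-out vision tools requires no registry edits because tools are registered by subclassing Tool: all_concrete_subclasses walks the class tree and the function-calling specs are generated from each tool's name, description, and parameters_schema. Below is a minimal sketch of that discovery pattern with simplified signatures; the spec layout shown is an assumption, not a copy of ALL_TOOL_SPECS.

import abc
import inspect
from typing import Any, Dict, List


class Tool(abc.ABC):
    name: str
    description: str
    parameters_schema: Dict[str, Any]

    @abc.abstractmethod
    async def __call__(self, **kwargs) -> Dict[str, Any]: ...


def all_concrete_subclasses(base) -> List[type]:
    """Recursively collect non-abstract subclasses."""
    found = []
    for sub in base.__subclasses__():
        if not inspect.isabstract(sub):
            found.append(sub)
        found.extend(all_concrete_subclasses(sub))
    return found


class DoNothing(Tool):
    name = "do_nothing"
    description = "Stay still and silent."
    parameters_schema = {"type": "object", "properties": {}, "required": []}

    async def __call__(self, **kwargs) -> Dict[str, Any]:
        return {"status": "doing nothing"}


# Specs are derived from whatever subclasses exist at import time
TOOL_SPECS = [
    {"type": "function", "name": t.name, "description": t.description, "parameters": t.parameters_schema}
    for t in all_concrete_subclasses(Tool)
]
print([spec["name"] for spec in TOOL_SPECS])   # -> ['do_nothing']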
src/reachy_mini_conversation_demo/utils.py
CHANGED
@@ -3,6 +3,7 @@ import argparse
 import warnings
 
 from reachy_mini_conversation_demo.camera_worker import CameraWorker
+from reachy_mini_conversation_demo.vision.processors import initialize_vision_manager
 
 
 def parse_args():

@@ -21,26 +22,27 @@ def parse_args():
 
 
 def handle_vision_stuff(args, current_robot):
-    """Initialize camera, head tracker and
+    """Initialize camera, head tracker, camera worker, and vision manager."""
     camera_worker = None
     head_tracker = None
     vision_manager = None
+
     if not args.no_camera:
+        # Initialize head tracker if specified
         if args.head_tracker is not None:
             if args.head_tracker == "yolo":
-                from reachy_mini_conversation_demo.vision.yolo_head_tracker import (
-                    HeadTracker,
-                )
-
+                from reachy_mini_conversation_demo.vision.yolo_head_tracker import HeadTracker
                 head_tracker = HeadTracker()
-
             elif args.head_tracker == "mediapipe":
                 from reachy_mini_toolbox.vision import HeadTracker
-
                 head_tracker = HeadTracker()
 
+        # Initialize camera worker
         camera_worker = CameraWorker(current_robot, head_tracker)
 
+        # Initialize vision manager (handles model download and configuration)
+        vision_manager = initialize_vision_manager(camera_worker)
+
     return camera_worker, head_tracker, vision_manager
src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py
CHANGED
@@ -94,77 +94,6 @@ class HeadTracker:
 
         return np.array([norm_x, norm_y], dtype=np.float32)
 
-    def get_eyes(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
-        """Get eye positions (approximated from face bbox).
-
-        Note: YOLO only provides face bbox, so we estimate eye positions
-
-        Args:
-            img: Input image
-
-        Returns:
-            Tuple of (left_eye, right_eye) in [-1, 1] coordinates
-
-        """
-        h, w = img.shape[:2]
-
-        # Run YOLO inference
-        results = self.model(img, verbose=False)
-        detections = Detections.from_ultralytics(results[0])
-
-        # Select best face
-        face_idx = self._select_best_face(detections)
-        if face_idx is None:
-            return None, None
-
-        bbox = detections.xyxy[face_idx]
-
-        # Estimate eye positions from face bbox (approximate locations)
-        face_width = bbox[2] - bbox[0]
-        face_height = bbox[3] - bbox[1]
-
-        # Eye positions are roughly at 1/3 and 2/3 of face width, 1/3 of face height
-        eye_y = bbox[1] + face_height * 0.35
-        left_eye_x = bbox[0] + face_width * 0.35
-        right_eye_x = bbox[0] + face_width * 0.65
-
-        # Convert to MediaPipe coordinates
-        left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
-        right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
-
-        return left_eye, right_eye
-
-    def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
-        """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
-        if not hasattr(face_landmarks, "_bbox") or not hasattr(face_landmarks, "_img_shape"):
-            raise ValueError("Face landmarks object missing required attributes")
-
-        bbox = face_landmarks._bbox
-        h, w = face_landmarks._img_shape[:2]
-
-        # Estimate eyes from stored bbox
-        face_width = bbox[2] - bbox[0]
-        face_height = bbox[3] - bbox[1]
-
-        eye_y = bbox[1] + face_height * 0.35
-        left_eye_x = bbox[0] + face_width * 0.35
-        right_eye_x = bbox[0] + face_width * 0.65
-
-        left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
-        right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
-
-        return left_eye, right_eye
-
-    def get_eye_center(self, face_landmarks) -> np.ndarray:
-        """Get center point between estimated eyes."""
-        left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
-        return np.mean([left_eye, right_eye], axis=0)
-
-    def get_roll(self, face_landmarks) -> float:
-        """Estimate roll from eye positions (will be 0 for YOLO since we estimate symmetric eyes)."""
-        left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
-        return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))
-
     def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
         """Get head position from face detection.

@@ -204,18 +133,3 @@ class HeadTracker:
         except Exception as e:
             logger.error(f"Error in head position detection: {e}")
             return None, None
-
-    def cleanup(self):
-        """Clean up resources."""
-        if hasattr(self, "model"):
-            del self.model
-            logger.info("YOLO model cleaned up")
-
-
-class FaceLandmarks:
-    """Simple container for face detection results to maintain API compatibility."""
-
-    def __init__(self, bbox: np.ndarray, img_shape: tuple):
-        """Initialize with bounding box and image shape."""
-        self._bbox = bbox
-        self._img_shape = img_shape