Alina Lozovskaya committed
Commit 5593ee6 · 1 Parent(s): 76ee3b3

Cleanup, fixes, logger

src/reachy_mini_conversation_demo/audio/speech_tapper.py CHANGED
@@ -120,7 +120,6 @@ class SwayRollRT:
         self._seed = int(rng_seed)
         self.samples = deque(maxlen=10 * SR)  # sliding window for VAD/env
         self.carry = np.zeros(0, dtype=np.float32)
-        self.frame_idx = 0
 
         self.vad_on = False
         self.vad_above = 0
@@ -143,7 +142,6 @@ class SwayRollRT:
         """Reset state (VAD/env/buffers/time) but keep initial phases/seed."""
         self.samples.clear()
         self.carry = np.zeros(0, dtype=np.float32)
-        self.frame_idx = 0
         self.vad_on = False
         self.vad_above = 0
         self.vad_below = 0
@@ -152,16 +150,6 @@ class SwayRollRT:
         self.sway_down = 0
         self.t = 0.0
 
-    def reset_phases(self) -> None:
-        """Re-randomize phases deterministically from stored seed (Optional)."""
-        rng = np.random.default_rng(self._seed)
-        self.phase_pitch = float(rng.random() * 2 * math.pi)
-        self.phase_yaw = float(rng.random() * 2 * math.pi)
-        self.phase_roll = float(rng.random() * 2 * math.pi)
-        self.phase_x = float(rng.random() * 2 * math.pi)
-        self.phase_y = float(rng.random() * 2 * math.pi)
-        self.phase_z = float(rng.random() * 2 * math.pi)
-
     def feed(self, pcm: np.ndarray, sr: Optional[int]) -> List[Dict[str, float]]:
         """Stream in PCM chunk. Returns a list of sway dicts, one per hop (HOP_MS).
 
@@ -196,7 +184,6 @@ class SwayRollRT:
             self.samples.extend(hop.tolist())
             if len(self.samples) < FRAME:
                 self.t += HOP_MS / 1000.0
-                self.frame_idx += 1
                 continue
 
             frame = np.fromiter(
src/reachy_mini_conversation_demo/camera_worker.py CHANGED
@@ -229,7 +229,7 @@ class CameraWorker:
                     time.sleep(0.01)
 
             except Exception as e:
-                logger.error(f"Camera worker error: {e}")
+                logger.exception(f"Camera worker error: {e}")
                 time.sleep(0.1)  # Longer sleep on error
 
         logger.debug("Camera worker thread exited")
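
Note: `logger.exception` has to be called from inside an exception handler; it logs at ERROR level and also records the active traceback, which `logger.error` alone does not. A minimal standalone sketch of the difference (the logger name here is hypothetical, not the project's module):

```python
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("camera_worker_demo")  # hypothetical name, for illustration only

def read_frame():
    raise RuntimeError("device busy")  # stand-in for a camera failure

try:
    read_frame()
except Exception as e:
    # Unlike logger.error, logger.exception appends the traceback,
    # so the log shows where the camera call actually failed.
    logger.exception(f"Camera worker error: {e}")
```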
src/reachy_mini_conversation_demo/config.py CHANGED
@@ -1,17 +1,28 @@
1
  import os
 
 
2
 
3
  from dotenv import load_dotenv
4
 
5
 
6
- load_dotenv()
7
 
 
 
 
 
 
 
 
 
8
 
9
- def getenv_bool(key: str, default: bool = False) -> bool:
10
- """Read env var as a Python bool (case-insensitive)."""
11
- val = os.getenv(key)
12
- if val is None:
13
- return default
14
- return val.strip().lower() in {"true", "1", "yes", "on"}
 
15
 
16
 
17
  class Config:
@@ -19,13 +30,23 @@ class Config:
19
 
20
  # Required
21
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
22
- if not OPENAI_API_KEY:
23
- raise RuntimeError("OPENAI_API_KEY is missing in .env")
 
 
 
 
 
 
 
24
 
25
  # Optional
26
  MODEL_NAME = os.getenv("MODEL_NAME", "gpt-realtime")
27
  HF_HOME = os.getenv("HF_HOME", "./cache")
 
28
  HF_TOKEN = os.getenv("HF_TOKEN") # Optional, falls back to hf auth login if not set
29
 
 
 
30
 
31
  config = Config()
 
1
  import os
2
+ import logging
3
+ from pathlib import Path
4
 
5
  from dotenv import load_dotenv
6
 
7
 
8
+ logger = logging.getLogger(__name__)
9
 
10
+ # Check if .env file exists
11
+ env_file = Path(".env")
12
+ if not env_file.exists():
13
+ raise RuntimeError(
14
+ ".env file not found. Please create one based on .env.example:\n"
15
+ " cp .env.example .env\n"
16
+ "Then add your OPENAI_API_KEY to the .env file."
17
+ )
18
 
19
+ # Load .env and verify it was loaded successfully
20
+ if not load_dotenv():
21
+ raise RuntimeError(
22
+ "Failed to load .env file. Please ensure the file is readable and properly formatted."
23
+ )
24
+
25
+ logger.info("Configuration loaded from .env file")
26
 
27
 
28
  class Config:
 
30
 
31
  # Required
32
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
33
+ if OPENAI_API_KEY is None:
34
+ raise RuntimeError(
35
+ "OPENAI_API_KEY is not set in .env file. Please add it:\n"
36
+ " OPENAI_API_KEY=your_api_key_here"
37
+ )
38
+ if not OPENAI_API_KEY.strip():
39
+ raise RuntimeError(
40
+ "OPENAI_API_KEY is empty in .env file. Please provide a valid API key."
41
+ )
42
 
43
  # Optional
44
  MODEL_NAME = os.getenv("MODEL_NAME", "gpt-realtime")
45
  HF_HOME = os.getenv("HF_HOME", "./cache")
46
+ LOCAL_VISION_MODEL = os.getenv("LOCAL_VISION_MODEL", "HuggingFaceTB/SmolVLM2-2.2B-Instruct")
47
  HF_TOKEN = os.getenv("HF_TOKEN") # Optional, falls back to hf auth login if not set
48
 
49
+ logger.debug(f"Model: {MODEL_NAME}, HF_HOME: {HF_HOME}, Vision Model: {LOCAL_VISION_MODEL}")
50
+
51
 
52
  config = Config()
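
With this change the module validates its environment at import time: a missing `.env`, an unreadable `.env`, or an unset/empty `OPENAI_API_KEY` all raise `RuntimeError` before anything else starts. A minimal sketch of how a caller might surface that failure cleanly; the `load_config_or_exit` helper is hypothetical and not part of the commit:

```python
import sys

def load_config_or_exit():
    """Hypothetical helper: import the validated config and turn the
    module-level RuntimeError into a clean exit message."""
    try:
        from reachy_mini_conversation_demo.config import config
    except RuntimeError as err:
        # config.py raises RuntimeError when .env is missing or unreadable,
        # or when OPENAI_API_KEY is unset/empty.
        print(f"Configuration error: {err}", file=sys.stderr)
        sys.exit(1)
    return config

if __name__ == "__main__":
    cfg = load_config_or_exit()
    print(f"Using model: {cfg.MODEL_NAME}")
```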
src/reachy_mini_conversation_demo/console.py CHANGED
@@ -19,14 +19,19 @@ logger = logging.getLogger(__name__)
 class LocalStream:
     """LocalStream using Reachy Mini's recorder/player."""
 
-    def __init__(self, handler: OpenaiRealtimeHandler, robot: ReachyMini):
-        """Initialize the stream with an OpenAI realtime handler and pipelines."""
-        self.handler = handler
+    def __init__(self, deps, robot: ReachyMini):
+        """Initialize the stream with tool dependencies and robot.
+
+        Args:
+            deps: ToolDependencies for the handler
+            robot: ReachyMini robot instance
+
+        """
         self._robot = robot
         self._stop_event = asyncio.Event()
         self._tasks = []
-        # Allow the handler to flush the player queue when appropriate.
-        self.handler._clear_queue = self.clear_queue  # type: ignore[assignment]
+        # Create handler with callback to this instance's clear_audio_queue method
+        self.handler = OpenaiRealtimeHandler(deps, clear_audio_queue_callback=self.clear_audio_queue)
 
     def launch(self) -> None:
         """Start the recorder/player and run the async processing loops."""
@@ -69,7 +74,7 @@ class LocalStream:
         self._robot.media.stop_recording()
         self._robot.media.stop_playing()
 
-    def clear_queue(self) -> None:
+    def clear_audio_queue(self) -> None:
         """Flush the player's appsrc to drop any queued audio immediately."""
         logger.info("User intervention: flushing player queue")
         self.handler.output_queue = asyncio.Queue()
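
The monkey-patched `handler._clear_queue = self.clear_queue` is replaced by explicit constructor injection: `LocalStream` now owns the handler and passes its own flush method as `clear_audio_queue_callback`. A minimal sketch of the pattern with simplified stand-in classes (not the project's full `LocalStream`/`OpenaiRealtimeHandler`):

```python
import asyncio
from typing import Callable, Optional

class Handler:
    """Simplified stand-in for OpenaiRealtimeHandler."""

    def __init__(self, clear_audio_queue_callback: Optional[Callable[[], None]] = None):
        self._clear_audio_queue_callback = clear_audio_queue_callback
        self.output_queue: asyncio.Queue = asyncio.Queue()

    def on_speech_started(self) -> None:
        # Call the injected callback instead of relying on a monkey-patched attribute.
        if self._clear_audio_queue_callback:
            self._clear_audio_queue_callback()

class Stream:
    """Simplified stand-in for LocalStream: it owns the handler and wires in its own flush method."""

    def __init__(self):
        self.handler = Handler(clear_audio_queue_callback=self.clear_audio_queue)

    def clear_audio_queue(self) -> None:
        print("flushing queued audio")
        self.handler.output_queue = asyncio.Queue()

Stream().handler.on_speech_started()  # prints "flushing queued audio"
```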
src/reachy_mini_conversation_demo/main.py CHANGED
@@ -74,11 +74,11 @@ def main():
     )
     logger.debug(f"Chatbot avatar images: {chatbot.avatar_images}")
 
-    handler = OpenaiRealtimeHandler(deps)
-
     stream_manager = None
 
     if args.gradio:
+        # Gradio mode: no LocalStream, so no audio queue callback needed
+        handler = OpenaiRealtimeHandler(deps)
         stream = Stream(
             handler=handler,
             mode="send-receive",
@@ -92,7 +92,8 @@ def main():
         app = FastAPI()
         app = gr.mount_gradio_app(app, stream.ui, path="/")
     else:
-        stream_manager = LocalStream(handler, robot)
+        # Console mode: LocalStream creates handler internally with proper callback
+        stream_manager = LocalStream(deps, robot)
 
     # Each async service → its own thread/loop
     movement_manager.start()
src/reachy_mini_conversation_demo/moves.py CHANGED
@@ -190,13 +190,7 @@ class MovementState:
         0.0,
     )
 
-    # Legacy movement state (for goto moves)
-    moving_start: float = 0.0
-    moving_for: float = 0.0
-
     # Status flags
-    is_playing_move: bool = False
-    is_moving: bool = False
     last_primary_pose: Optional[FullBodyPose] = None
 
     def update_activity(self) -> None:
@@ -325,7 +319,7 @@ class MovementManager:
         """
         self._command_queue.put(("queue_move", move))
 
-    def clear_queue(self) -> None:
+    def clear_move_queue(self) -> None:
        """Stop the active move and discard any queued primary moves.
 
        Thread-safe: executed by the worker thread via the command queue.
@@ -361,10 +355,6 @@ class MovementManager:
 
         return self._now() - last_activity >= self.idle_inactivity_delay
 
-    def mark_user_activity(self) -> None:
-        """Record external activity and postpone idle behaviours (thread-safe)."""
-        self._command_queue.put(("mark_activity", None))
-
     def set_listening(self, listening: bool) -> None:
         """Enable or disable listening mode without touching shared state directly.
 
@@ -427,7 +417,7 @@ class MovementManager:
                 duration_str = str(duration)
             else:
                 duration_str = "?"
-            logger.info(
+            logger.debug(
                 "Queued move with duration %ss, queue size: %s",
                 duration_str,
                 len(self.move_queue),
@@ -438,7 +428,6 @@ class MovementManager:
             self.move_queue.clear()
             self.state.current_move = None
             self.state.move_start_time = None
-            self.state.is_playing_move = False
             self._breathing_active = False
             logger.info("Cleared move queue and stopped current move")
         elif command == "set_moving_state":
@@ -447,8 +436,6 @@ class MovementManager:
             except (TypeError, ValueError):
                 logger.warning("Invalid moving state duration: %s", payload)
                 return
-            self.state.moving_start = current_time
-            self.state.moving_for = max(0.0, duration)
             self.state.update_activity()
         elif command == "mark_activity":
             self.state.update_activity()
@@ -534,7 +521,7 @@ class MovementManager:
             self.state.current_move = None
             self.state.move_start_time = None
             self._breathing_active = False
-            logger.info("Stopping breathing due to new move activity")
+            logger.debug("Stopping breathing due to new move activity")
 
         if self.state.current_move is not None and not isinstance(self.state.current_move, BreathingMove):
             self._breathing_active = False
@@ -561,14 +548,9 @@ class MovementManager:
                 float(body_yaw),
             )
 
-            self.state.is_playing_move = True
-            self.state.is_moving = True
             self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
         else:
             # Otherwise reuse the last primary pose so we avoid jumps between moves
-            self.state.is_playing_move = False
-            self.state.is_moving = current_time - self.state.moving_start < self.state.moving_for
-
             if self.state.last_primary_pose is not None:
                 primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
             else:
@@ -746,7 +728,6 @@ class MovementManager:
             self._thread.join()
             self._thread = None
         logger.debug("Move worker stopped")
-        logger.debug("Move worker stopped")
 
     def get_status(self) -> dict[str, Any]:
         """Return a lightweight status snapshot for observability."""
src/reachy_mini_conversation_demo/openai_realtime.py CHANGED
@@ -15,6 +15,7 @@ from reachy_mini_conversation_demo.tools import (
     dispatch_tool_call,
 )
 from reachy_mini_conversation_demo.config import config
+from reachy_mini_conversation_demo.prompts import SESSION_INSTRUCTIONS
 
 
 logger = logging.getLogger(__name__)
@@ -23,19 +24,28 @@ logger = logging.getLogger(__name__)
 class OpenaiRealtimeHandler(AsyncStreamHandler):
     """An OpenAI realtime handler for fastrtc Stream."""
 
-    def __init__(self, deps: ToolDependencies):
-        """Initialize the handler."""
+    def __init__(self, deps: ToolDependencies, clear_audio_queue_callback=None):
+        """Initialize the handler.
+
+        Args:
+            deps: Tool dependencies for executing tools
+            clear_audio_queue_callback: Optional callback to clear the audio queue when speech starts
+
+        """
         super().__init__(
             expected_layout="mono",
             output_sample_rate=24000,  # openai outputs
             input_sample_rate=16000,  # respeaker output
         )
         self.deps = deps
+        self._clear_audio_queue_callback = clear_audio_queue_callback
 
         self.connection = None
         self.output_queue = asyncio.Queue()
 
         self._pending_calls: dict[str, dict] = {}
+        self._response_in_progress = False
+        self._pending_response_queue = asyncio.Queue()
 
         self.last_activity_time = asyncio.get_event_loop().time()
         self.start_time = asyncio.get_event_loop().time()
@@ -43,7 +53,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 
     def copy(self):
         """Create a copy of the handler."""
-        return OpenaiRealtimeHandler(self.deps)
+        return OpenaiRealtimeHandler(self.deps, self._clear_audio_queue_callback)
 
     async def start_up(self):
         """Start the handler."""
@@ -59,7 +69,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                     "language": "en",
                 },
                 "voice": "ballad",
-                "instructions": "We speak in English",
+                "instructions": SESSION_INSTRUCTIONS,
                 "tools": ALL_TOOL_SPECS,
                 "tool_choice": "auto",
                 "temperature": 0.7,
@@ -71,14 +81,15 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         async for event in self.connection:
             logger.debug(f"OpenAI event: {event.type}")
             if event.type == "input_audio_buffer.speech_started":
-                self.clear_queue()
+                if self._clear_audio_queue_callback:
+                    self._clear_audio_queue_callback()
                 self.deps.head_wobbler.reset()
                 self.deps.movement_manager.set_listening(True)
-                logger.debug("user speech started")
+                logger.debug("User speech started")
 
             if event.type == "input_audio_buffer.speech_stopped":
                 self.deps.movement_manager.set_listening(False)
-                logger.debug("user speech stopped")
+                logger.debug("User speech stopped")
 
             if event.type in ("response.audio.completed", "response.completed"):
                 # Doesn't seem to be called
@@ -87,19 +98,27 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 
             if event.type == "response.created":
                 logger.debug("response created")
+                self._response_in_progress = True
 
             if event.type == "response.done":
                 # Doesn't mean the audio is done playing
                 logger.debug("response done")
-                pass
-                # self.deps.head_wobbler.reset()
+                self._response_in_progress = False
+                # Process any queued response requests
+                if not self._pending_response_queue.empty():
+                    queued_params = await self._pending_response_queue.get()
+                    logger.debug("Processing queued response request")
+                    try:
+                        await self.connection.response.create(**queued_params)
+                    except Exception as e:
+                        logger.error(f"Failed to create queued response: {e}")
 
             if event.type == "conversation.item.input_audio_transcription.completed":
-                logger.debug(f"user transcript: {event.transcript}")
+                logger.debug(f"User transcript: {event.transcript}")
                 await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))
 
             if event.type == "response.audio_transcript.done":
-                logger.debug(f"assistant transcript: {event.transcript}")
+                logger.debug(f"Assistant transcript: {event.transcript}")
                 await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
 
             if event.type == "response.audio.delta":
@@ -144,10 +163,10 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
 
                 try:
                     tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
-                    logger.debug("[Tool %s executed]", tool_name)
+                    logger.debug("Tool '%s' executed successfully", tool_name)
                     logger.debug("Tool result: %s", tool_result)
                 except Exception as e:
-                    logger.error("Tool %s failed", tool_name)
+                    logger.exception("Tool '%s' failed", tool_name)
                     tool_result = {"error": str(e)}
 
                 # send the tool result back
@@ -183,7 +202,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                             ],
                         }
                     )
-                    logger.info("additional input camera")
+                    logger.info("Added camera image to conversation")
 
                     np_img = self.deps.camera_worker.get_latest_frame()
                     img = gr.Image(value=np_img)
@@ -198,7 +217,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 )
 
                 if not self.is_idle_tool_call:
-                    await self.connection.response.create(
+                    await self._safe_create_response(
                         response={
                             "instructions": "Use the tool result just returned and answer concisely in speech."
                         }
@@ -215,8 +234,18 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
             if event.type == "error":
                 err = getattr(event, "error", None)
                 msg = getattr(err, "message", str(err) if err else "unknown error")
-                logger.error("Realtime error: %s (raw=%s)", msg, err)
-                await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))
+                err_code = getattr(err, "code", None)
+
+                # Handle concurrent response error gracefully
+                if err_code == "conversation_already_has_active_response":
+                    logger.warning(
+                        "Attempted to create response while one is in progress. "
+                        "This is expected during rapid tool calls and will be handled automatically."
+                    )
+                    # Don't send error to user for this specific case
+                else:
+                    logger.error("Realtime error: %s (raw=%s)", msg, err)
+                    await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))
 
     # Microphone receive
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
@@ -256,6 +285,28 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         dt = datetime.fromtimestamp(current_time)
         return f"[{dt.strftime('%Y-%m-%d %H:%M:%S')} | +{elapsed_seconds:.1f}s]"
 
+    async def _safe_create_response(self, **kwargs) -> None:
+        """Safely create a response, queuing if one is already in progress.
+
+        Args:
+            **kwargs: Arguments to pass to connection.response.create()
+
+        """
+        if self._response_in_progress:
+            logger.debug("Response already in progress, queuing request (expected during rapid tool calls)")
+            await self._pending_response_queue.put(kwargs)
+        else:
+            try:
+                await self.connection.response.create(**kwargs)
+            except Exception as e:
+                error_msg = str(e)
+                if "conversation_already_has_active_response" in error_msg:
+                    logger.warning("Race condition detected, queuing response request")
+                    await self._pending_response_queue.put(kwargs)
+                else:
+                    logger.error(f"Failed to create response: {e}")
+                    raise
+
     async def send_idle_signal(self, idle_duration) -> None:
         """Send an idle signal to the openai server."""
         logger.debug("Sending idle signal")
@@ -271,11 +322,10 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 "content": [{"type": "input_text", "text": timestamp_msg}],
             }
         )
-        await self.connection.response.create(
+        await self._safe_create_response(
             response={
                 "modalities": ["text"],
                 "instructions": "You MUST respond with function calls only - no speech or text. Choose appropriate actions for idle behavior.",
                 "tool_choice": "required",
            }
        )
-        # TODO additional inputs
src/reachy_mini_conversation_demo/tools.py CHANGED
@@ -1,7 +1,6 @@
 from __future__ import annotations
 import abc
 import json
-import time
 import asyncio
 import inspect
 import logging
@@ -12,12 +11,8 @@ from reachy_mini import ReachyMini
 from reachy_mini.utils import create_head_pose
 
 
-# from reachy_mini_conversation_demo.vision.processors import VisionManager
-
 logger = logging.getLogger(__name__)
 
-ENABLE_FACE_RECOGNITION = False
-
 # Initialize dance and emotion libraries
 try:
     from reachy_mini.motion.recorded_move import RecordedMoves
@@ -40,16 +35,6 @@ except ImportError as e:
     DANCE_AVAILABLE = False
     EMOTION_AVAILABLE = False
 
-FACE_RECOGNITION_AVAILABLE = False
-if ENABLE_FACE_RECOGNITION:
-    # Initialize face recognition
-    try:
-        from deepface import DeepFace
-
-        FACE_RECOGNITION_AVAILABLE = True
-    except ImportError as e:
-        logger.warning(f"DeepFace not available: {e}")
-
 
 def all_concrete_subclasses(base):
     """Recursively find all concrete (non-abstract) subclasses of a base class."""
@@ -76,30 +61,9 @@ class ToolDependencies:
     camera_worker: Optional[Any] = None  # CameraWorker for frame buffering
     vision_manager: Optional[Any] = None
     head_wobbler: Optional[Any] = None  # HeadWobbler for audio-reactive motion
-    camera_retry_attempts: int = 5
-    camera_retry_delay_s: float = 0.10
-    vision_timeout_s: float = 8.0
     motion_duration_s: float = 1.0
 
 
-# Helpers - removed _read_frame as it's no longer needed with camera worker
-
-
-def _execute_motion(deps: ToolDependencies, target: Any) -> Dict[str, Any]:
-    """Apply motion to reachy_mini and update movement_manager state."""
-    movement_manager = deps.movement_manager
-    movement_manager.moving_start = time.monotonic()
-    movement_manager.moving_for = deps.motion_duration_s
-    movement_manager.current_head_pose = target
-    try:
-        deps.reachy_mini.goto_target(target, duration=deps.motion_duration_s)
-    except Exception as e:
-        logger.exception("motion failed")
-        return {"error": f"motion failed: {type(e).__name__}: {e}"}
-
-    return {"status": "ok"}
-
-
 # Tool base class
 class Tool(abc.ABC):
     """Base abstraction for tools used in function-calling.
@@ -277,100 +241,6 @@ class HeadTracking(Tool):
         return {"status": f"head tracking {status}"}
 
 
-# class DescribeCurrentScene(Tool):
-#     name = "describe_current_scene"
-#     description = "Get a detailed description of the current scene."
-#     parameters_schema = {"type": "object", "properties": {}, "required": []}
-
-#     async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
-#         logger.info("Tool call: describe_current_scene")
-
-#         result = await deps.vision_manager.process_current_frame(
-#             "Describe what you currently see in detail, focusing on people, objects, and activities."
-#         )
-
-#         if isinstance(result, dict) and "error" in result:
-#             return result
-#         return result
-
-
-# class GetSceneContext(Tool):
-#     name = "get_scene_context"
-#     description = (
-#         "Get the most recent automatic scene description for conversational context."
-#     )
-#     parameters_schema = {"type": "object", "properties": {}, "required": []}
-
-#     async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
-#         logger.info("Tool call: get_scene_context")
-#         vision_manager = deps.vision_manager
-#         if not vision_manager:
-#             return {"error": "Vision processing not available"}
-
-#         try:
-#             description = await deps.vision_manager.get_current_description()
-
-#             if not description:
-#                 return {
-#                     "context": "No scene description available yet",
-#                     "note": "Vision processing may still be initializing",
-#                 }
-#             return {
-#                 "context": description,
-#                 "note": "This comes from periodic automatic analysis",
-#             }
-#         except Exception as e:
-#             logger.exception("Failed to get scene context")
-#             return {"error": f"Scene context failed: {type(e).__name__}: {e}"}
-
-
-# class AnalyzeSceneFor(Tool):
-#     name = "analyze_scene_for"
-#     description = "Analyze the current scene for a specific purpose."
-#     parameters_schema = {
-#         "type": "object",
-#         "properties": {
-#             "purpose": {
-#                 "type": "string",
-#                 "enum": [
-#                     "safety",
-#                     "people",
-#                     "objects",
-#                     "activity",
-#                     "navigation",
-#                     "general",
-#                 ],
-#                 "default": "general",
-#             }
-#         },
-#         "required": [],
-#     }
-
-#     async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
-#         purpose = (kwargs.get("purpose") or "general").lower()
-#         logger.info("Tool call: analyze_scene_for purpose=%s", purpose)
-
-#         prompts = {
-#             "safety": "Look for safety concerns, obstacles, or hazards.",
-#             "people": "Describe people, their positions and actions.",
-#             "objects": "Identify and describe main visible objects.",
-#             "activity": "Describe ongoing activities or actions.",
-#             "navigation": "Describe the space for navigation: obstacles, pathways, layout.",
-#             "general": "Give a general description of the scene including people, objects, and activities.",
-#         }
-#         prompt = prompts.get(purpose, prompts["general"])
-
-#         result = await deps.vision_manager.process_current_frame(prompt)
-
-#         if isinstance(result, dict) and "error" in result:
-#             return result
-
-#         if not isinstance(result, dict):
-#             return {"error": "vision returned non-dict"}
-
-#         result["analysis_purpose"] = purpose
-#         return result
-
 
 class Dance(Tool):
     """Play a named or random dance move once (or repeat). Non-blocking."""
@@ -461,25 +331,24 @@ class StopDance(Tool):
         """Stop the current dance move."""
         logger.info("Tool call: stop_dance")
         movement_manager = deps.movement_manager
-        movement_manager.clear_queue()
+        movement_manager.clear_move_queue()
         return {"status": "stopped dance and cleared queue"}
 
 
-def get_available_emotions_and_descriptions():
+def get_available_emotions_and_descriptions() -> str:
     """Get formatted list of available emotions with descriptions."""
-    names = RECORDED_MOVES.list_moves()
-
-    ret = """
-    Available emotions:
-
-    """
-
-    for name in names:
-        description = RECORDED_MOVES.get(name).description
-        ret += f" - {name}: {description}\n"
-
-    return ret
+    if not EMOTION_AVAILABLE:
+        return "Emotions not available"
+
+    try:
+        names = RECORDED_MOVES.list_moves()
+        ret = "Available emotions:\n"
+        for name in names:
+            description = RECORDED_MOVES.get(name).description
+            ret += f" - {name}: {description}\n"
+        return ret
+    except Exception as e:
+        return f"Error getting emotions: {e}"
 
 class PlayEmotion(Tool):
     """Play a pre-recorded emotion."""
@@ -549,70 +418,10 @@ class StopEmotion(Tool):
         """Stop the current emotion."""
         logger.info("Tool call: stop_emotion")
         movement_manager = deps.movement_manager
-        movement_manager.clear_queue()
+        movement_manager.clear_move_queue()
         return {"status": "stopped emotion and cleared queue"}
 
 
-class FaceRecognition(Tool):
-    """Get the name of the person you are talking to."""
-
-    name = "get_person_name"
-    description = "Get the name of the person you are talking to"
-    parameters_schema = {
-        "type": "object",
-        "properties": {
-            "dummy": {
-                "type": "boolean",
-                "description": "dummy boolean, set it to true",
-            }
-        },
-        "required": ["dummy"],
-    }
-
-    async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
-        """Get the name of the person you are talking to."""
-        if not FACE_RECOGNITION_AVAILABLE:
-            return {"error": "Face recognition not available"}
-
-        logger.info("Tool call: face_recognition")
-
-        try:
-            # Get frame from camera worker buffer (like main_works.py)
-            if deps.camera_worker is not None:
-                frame = deps.camera_worker.get_latest_frame()
-                if frame is None:
-                    logger.error("No frame available from camera worker")
-                    return {"error": "No frame available"}
-            else:
-                logger.error("Camera worker not available")
-                return {"error": "Camera worker not available"}
-
-            # Save frame temporarily (same as main_works.py pattern)
-            temp_path = "/tmp/face_recognition.jpg"
-            import cv2
-
-            cv2.imwrite(temp_path, frame)
-
-            # Use DeepFace to find face
-            results = await asyncio.to_thread(DeepFace.find, img_path=temp_path, db_path="./pollen_faces")
-
-            if len(results) == 0:
-                return {"error": "Didn't recognize the face"}
-
-            # Extract name from results
-            name = "Unknown"
-            for index, row in results[0].iterrows():
-                file_path = row["identity"]
-                name = file_path.split("/")[-2]
-                break
-
-            return {"answer": f"The name is {name}"}
-
-        except Exception as e:
-            logger.exception("Face recognition failed")
-            return {"error": f"Face recognition failed: {str(e)}"}
-
-
 class DoNothing(Tool):
     """Choose to do nothing - stay still and silent. Use when you want to be contemplative or just chill."""
 
@@ -636,22 +445,6 @@ class DoNothing(Tool):
         return {"status": "doing nothing", "reason": reason}
 
 
-def get_available_emotions_and_descriptions() -> str:
-    """Get formatted list of available emotions with descriptions."""
-    if not EMOTION_AVAILABLE:
-        return "Emotions not available"
-
-    try:
-        names = RECORDED_MOVES.list_moves()
-        ret = "Available emotions:\n"
-        for name in names:
-            description = RECORDED_MOVES.get(name).description
-            ret += f" - {name}: {description}\n"
-        return ret
-    except Exception as e:
-        return f"Error getting emotions: {e}"
-
-
 # Registry & specs (dynamic)
 
 # List of available tool classes
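
Deleting the commented-out vision tools and the `FaceRecognition` class is enough to remove them from the registry, because the tool specs are built dynamically from `all_concrete_subclasses(Tool)`. A minimal sketch of that discovery pattern; the spec format shown here is simplified, not the project's exact schema:

```python
import abc
import inspect

class Tool(abc.ABC):
    """Simplified Tool base: name + description + async __call__."""

    name: str = ""
    description: str = ""

    @abc.abstractmethod
    async def __call__(self, **kwargs): ...

def all_concrete_subclasses(base):
    """Recursively find all concrete (non-abstract) subclasses of a base class."""
    found = []
    for sub in base.__subclasses__():
        if not inspect.isabstract(sub):
            found.append(sub)
        found.extend(all_concrete_subclasses(sub))
    return found

class DoNothing(Tool):
    name = "do_nothing"
    description = "Stay still and silent."

    async def __call__(self, **kwargs):
        return {"status": "doing nothing"}

# Hypothetical spec builder: any class deleted from the module simply
# disappears from the generated list on the next import.
ALL_TOOL_SPECS = [
    {"type": "function", "name": tool.name, "description": tool.description}
    for tool in all_concrete_subclasses(Tool)
]
print(ALL_TOOL_SPECS)
```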
src/reachy_mini_conversation_demo/utils.py CHANGED
@@ -3,6 +3,7 @@ import argparse
 import warnings
 
 from reachy_mini_conversation_demo.camera_worker import CameraWorker
+from reachy_mini_conversation_demo.vision.processors import initialize_vision_manager
 
 
 def parse_args():
@@ -21,26 +22,27 @@ def parse_args():
 
 
 def handle_vision_stuff(args, current_robot):
-    """Initialize camera, head tracker and camera worker."""
+    """Initialize camera, head tracker, camera worker, and vision manager."""
     camera_worker = None
     head_tracker = None
     vision_manager = None
+
     if not args.no_camera:
+        # Initialize head tracker if specified
         if args.head_tracker is not None:
             if args.head_tracker == "yolo":
-                from reachy_mini_conversation_demo.vision.yolo_head_tracker import (
-                    HeadTracker,
-                )
-
+                from reachy_mini_conversation_demo.vision.yolo_head_tracker import HeadTracker
                 head_tracker = HeadTracker()
-
             elif args.head_tracker == "mediapipe":
                 from reachy_mini_toolbox.vision import HeadTracker
-
                 head_tracker = HeadTracker()
 
+        # Initialize camera worker
         camera_worker = CameraWorker(current_robot, head_tracker)
 
+        # Initialize vision manager (handles model download and configuration)
+        vision_manager = initialize_vision_manager(camera_worker)
+
     return camera_worker, head_tracker, vision_manager
 
 
src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py CHANGED
@@ -94,77 +94,6 @@ class HeadTracker:
 
         return np.array([norm_x, norm_y], dtype=np.float32)
 
-    def get_eyes(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
-        """Get eye positions (approximated from face bbox).
-
-        Note: YOLO only provides face bbox, so we estimate eye positions
-
-        Args:
-            img: Input image
-
-        Returns:
-            Tuple of (left_eye, right_eye) in [-1, 1] coordinates
-
-        """
-        h, w = img.shape[:2]
-
-        # Run YOLO inference
-        results = self.model(img, verbose=False)
-        detections = Detections.from_ultralytics(results[0])
-
-        # Select best face
-        face_idx = self._select_best_face(detections)
-        if face_idx is None:
-            return None, None
-
-        bbox = detections.xyxy[face_idx]
-
-        # Estimate eye positions from face bbox (approximate locations)
-        face_width = bbox[2] - bbox[0]
-        face_height = bbox[3] - bbox[1]
-
-        # Eye positions are roughly at 1/3 and 2/3 of face width, 1/3 of face height
-        eye_y = bbox[1] + face_height * 0.35
-        left_eye_x = bbox[0] + face_width * 0.35
-        right_eye_x = bbox[0] + face_width * 0.65
-
-        # Convert to MediaPipe coordinates
-        left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
-        right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
-
-        return left_eye, right_eye
-
-    def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
-        """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
-        if not hasattr(face_landmarks, "_bbox") or not hasattr(face_landmarks, "_img_shape"):
-            raise ValueError("Face landmarks object missing required attributes")
-
-        bbox = face_landmarks._bbox
-        h, w = face_landmarks._img_shape[:2]
-
-        # Estimate eyes from stored bbox
-        face_width = bbox[2] - bbox[0]
-        face_height = bbox[3] - bbox[1]
-
-        eye_y = bbox[1] + face_height * 0.35
-        left_eye_x = bbox[0] + face_width * 0.35
-        right_eye_x = bbox[0] + face_width * 0.65
-
-        left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
-        right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
-
-        return left_eye, right_eye
-
-    def get_eye_center(self, face_landmarks) -> np.ndarray:
-        """Get center point between estimated eyes."""
-        left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
-        return np.mean([left_eye, right_eye], axis=0)
-
-    def get_roll(self, face_landmarks) -> float:
-        """Estimate roll from eye positions (will be 0 for YOLO since we estimate symmetric eyes)."""
-        left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
-        return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))
-
     def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
         """Get head position from face detection.
 
@@ -204,18 +133,3 @@ class HeadTracker:
         except Exception as e:
             logger.error(f"Error in head position detection: {e}")
             return None, None
-
-    def cleanup(self):
-        """Clean up resources."""
-        if hasattr(self, "model"):
-            del self.model
-            logger.info("YOLO model cleaned up")
-
-
-class FaceLandmarks:
-    """Simple container for face detection results to maintain API compatibility."""
-
-    def __init__(self, bbox: np.ndarray, img_shape: tuple):
-        """Initialize with bounding box and image shape."""
-        self._bbox = bbox
-        self._img_shape = img_shape