Caroline Pascal committed on
Commit
fdb5f7f
·
unverified ·
1 Parent(s): 5e2eb4f

fix(audio channels): make the play_loop and receive methods robust to audio input/output shapes (#132)

Browse files
src/reachy_mini_conversation_app/console.py CHANGED
@@ -108,7 +108,12 @@ class LocalStream:
108
 
109
  # Reshape if needed
110
  if audio_data.ndim == 2:
111
- audio_data = audio_data.squeeze()
 
 
 
 
 
112
 
113
  # Cast if needed
114
  audio_frame = audio_to_float32(audio_data)
 
108
 
109
  # Reshape if needed
110
  if audio_data.ndim == 2:
111
+ # Scipy channels last convention
112
+ if audio_data.shape[1] > audio_data.shape[0]:
113
+ audio_data = audio_data.T
114
+ # Multiple channels -> Mono channel
115
+ if audio_data.shape[1] > 1:
116
+ audio_data = audio_data[:, 0]
117
 
118
  # Cast if needed
119
  audio_frame = audio_to_float32(audio_data)
src/reachy_mini_conversation_app/openai_realtime.py CHANGED
@@ -349,13 +349,22 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
349
  if not self.connection:
350
  return
351
  input_sample_rate, audio_frame = frame
352
- # Make mono if it's stereo
 
353
  if audio_frame.ndim == 2:
354
- audio_frame = audio_frame[:, 0]
 
 
 
 
 
355
 
356
  # Resample if needed
357
  if self.input_sample_rate != input_sample_rate:
358
- audio_frame = resample(audio_frame, int(len(audio_frame) * self.input_sample_rate / input_sample_rate))
 
 
 
359
 
360
  # Cast if needed
361
  audio_frame = audio_to_int16(audio_frame)
 
349
  if not self.connection:
350
  return
351
  input_sample_rate, audio_frame = frame
352
+
353
+ #Reshape if needed
354
  if audio_frame.ndim == 2:
355
+ # Scipy channels last convention
356
+ if audio_frame.shape[1] > audio_frame.shape[0]:
357
+ audio_frame = audio_frame.T
358
+ # Multiple channels -> Mono channel
359
+ if audio_frame.shape[1] > 1:
360
+ audio_frame = audio_frame[:, 0]
361
 
362
  # Resample if needed
363
  if self.input_sample_rate != input_sample_rate:
364
+ audio_frame = resample(
365
+ audio_frame,
366
+ int(len(audio_frame) * self.input_sample_rate / input_sample_rate)
367
+ )
368
 
369
  # Cast if needed
370
  audio_frame = audio_to_int16(audio_frame)