Caroline Pascal
committed on
fix(audio channels): make the play_loop and receive methods robust to audio input/output shapes (#132)
Browse files
src/reachy_mini_conversation_app/console.py
CHANGED
|
@@ -108,7 +108,12 @@ class LocalStream:
|
|
| 108 |
|
| 109 |
# Reshape if needed
|
| 110 |
if audio_data.ndim == 2:
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
# Cast if needed
|
| 114 |
audio_frame = audio_to_float32(audio_data)
|
|
|
|
| 108 |
|
| 109 |
# Reshape if needed
|
| 110 |
if audio_data.ndim == 2:
|
| 111 |
+
# Scipy channels last convention
|
| 112 |
+
if audio_data.shape[1] > audio_data.shape[0]:
|
| 113 |
+
audio_data = audio_data.T
|
| 114 |
+
# Multiple channels -> Mono channel
|
| 115 |
+
if audio_data.shape[1] > 1:
|
| 116 |
+
audio_data = audio_data[:, 0]
|
| 117 |
|
| 118 |
# Cast if needed
|
| 119 |
audio_frame = audio_to_float32(audio_data)
|
src/reachy_mini_conversation_app/openai_realtime.py
CHANGED
|
@@ -349,13 +349,22 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
|
|
| 349 |
if not self.connection:
|
| 350 |
return
|
| 351 |
input_sample_rate, audio_frame = frame
|
| 352 |
-
|
|
|
|
| 353 |
if audio_frame.ndim == 2:
|
| 354 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
|
| 356 |
# Resample if needed
|
| 357 |
if self.input_sample_rate != input_sample_rate:
|
| 358 |
-
audio_frame = resample(
|
|
|
|
|
|
|
|
|
|
| 359 |
|
| 360 |
# Cast if needed
|
| 361 |
audio_frame = audio_to_int16(audio_frame)
|
|
|
|
| 349 |
if not self.connection:
|
| 350 |
return
|
| 351 |
input_sample_rate, audio_frame = frame
|
| 352 |
+
|
| 353 |
+
# Reshape if needed
|
| 354 |
if audio_frame.ndim == 2:
|
| 355 |
+
# Scipy channels last convention
|
| 356 |
+
if audio_frame.shape[1] > audio_frame.shape[0]:
|
| 357 |
+
audio_frame = audio_frame.T
|
| 358 |
+
# Multiple channels -> Mono channel
|
| 359 |
+
if audio_frame.shape[1] > 1:
|
| 360 |
+
audio_frame = audio_frame[:, 0]
|
| 361 |
|
| 362 |
# Resample if needed
|
| 363 |
if self.input_sample_rate != input_sample_rate:
|
| 364 |
+
audio_frame = resample(
|
| 365 |
+
audio_frame,
|
| 366 |
+
int(len(audio_frame) * self.input_sample_rate / input_sample_rate)
|
| 367 |
+
)
|
| 368 |
|
| 369 |
# Cast if needed
|
| 370 |
audio_frame = audio_to_int16(audio_frame)
|