Spaces:

pollen-robotics
/

reachy_mini_conversation_app

Running

Caroline Pascal committed 13 days ago

Commit

fdb5f7f

unverified ·

1 Parent(s): 5e2eb4f

fix(audio channels): make the play_loop and receive methods robust to audio input/output shapes (#132)

Files changed (2) hide show

src/reachy_mini_conversation_app/console.py CHANGED Viewed

@@ -108,7 +108,12 @@ class LocalStream:
                 # Reshape if needed
                 if audio_data.ndim == 2:
-                    audio_data = audio_data.squeeze()
                 # Cast if needed
                 audio_frame = audio_to_float32(audio_data)

                 # Reshape if needed
                 if audio_data.ndim == 2:
+                    # Scipy channels last convention
+                    if audio_data.shape[1] > audio_data.shape[0]:
+                        audio_data = audio_data.T
+                    # Multiple channels -> Mono channel
+                    if audio_data.shape[1] > 1:
+                        audio_data = audio_data[:, 0]
                 # Cast if needed
                 audio_frame = audio_to_float32(audio_data)

src/reachy_mini_conversation_app/openai_realtime.py CHANGED Viewed

@@ -349,13 +349,22 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         if not self.connection:
             return
         input_sample_rate, audio_frame = frame
-        # Make mono if it's stereo
         if audio_frame.ndim == 2:
-            audio_frame = audio_frame[:, 0]
         # Resample if needed
         if self.input_sample_rate != input_sample_rate:
-            audio_frame = resample(audio_frame, int(len(audio_frame) * self.input_sample_rate / input_sample_rate))
         # Cast if needed
         audio_frame = audio_to_int16(audio_frame)

         if not self.connection:
             return
         input_sample_rate, audio_frame = frame
+        #Reshape if needed
         if audio_frame.ndim == 2:
+            # Scipy channels last convention
+            if audio_frame.shape[1] > audio_frame.shape[0]:
+                audio_frame = audio_frame.T
+            # Multiple channels -> Mono channel
+            if audio_frame.shape[1] > 1:
+                audio_frame = audio_frame[:, 0]
         # Resample if needed
         if self.input_sample_rate != input_sample_rate:
+            audio_frame = resample(
+                audio_frame,
+                int(len(audio_frame) * self.input_sample_rate / input_sample_rate)
+            )
         # Cast if needed
         audio_frame = audio_to_int16(audio_frame)