Spaces:

Rcarvalo
/

speech-to-speech

Runtime error

App Files Files Community

Rcarvalo committed 8 days ago

Commit

fc67d54

verified ·

1 Parent(s): eec54dd

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +141 -129

app.py CHANGED Viewed

@@ -1,12 +1,15 @@
 """
-Gradio app for LFM2-Audio speech-to-speech demo
-Compatible with Hugging Face Spaces
 """
 import gradio as gr
 import numpy as np
 import torch
-import torchaudio
 from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality
@@ -28,132 +31,138 @@ mimi = mimi.to(device)
 print(f"Models loaded on {device}")
-def generate_response(audio_input, temperature, top_k, chat_state):
-    """Generate speech-to-speech response"""
-    if audio_input is None:
-        return None, "Please record audio first", chat_state
-    # Parse audio input
-    rate, wav = audio_input
-    # Convert to torch tensor
-    if wav.dtype == np.int16:
-        wav_tensor = torch.tensor(wav / 32768.0, dtype=torch.float32)
-    else:
-        wav_tensor = torch.tensor(wav, dtype=torch.float32)
-    # Ensure mono and correct shape (channels, samples)
-    if len(wav_tensor.shape) > 1:
-        wav_tensor = wav_tensor.mean(dim=-1)
-    # add_audio expects shape (channels, samples), so add channel dimension
-    if len(wav_tensor.shape) == 1:
-        wav_tensor = wav_tensor.unsqueeze(0)
-    # Initialize chat state if empty
-    if len(chat_state.text) == 1:
-        chat_state.new_turn("system")
-        chat_state.add_text("Respond with interleaved text and audio.")
-        chat_state.end_turn()
-    # Add user audio
-    chat_state.new_turn("user")
-    chat_state.add_audio(wav_tensor, rate)
-    chat_state.end_turn()
-    # Start assistant turn
-    chat_state.new_turn("assistant")
-    # Set generation parameters
-    temp = None if temperature == 0 else float(temperature)
-    topk = None if top_k == 0 else int(top_k)
-    # Generate response
-    text_out = []
-    audio_out = []
-    modality_out = []
-    full_text = ""
-    print("Generating response...")
-    with torch.no_grad():
         for t in model.generate_interleaved(
-            **chat_state,
             max_new_tokens=1024,
             audio_temperature=temp,
             audio_top_k=topk,
         ):
-            if t.numel() == 1:  # Text token
-                text_out.append(t)
-                modality_out.append(LFMModality.TEXT)
-                decoded = processor.text.decode(t)
-                full_text += decoded
-                print(decoded, end="", flush=True)
-            elif t.numel() == 8:  # Audio token
-                audio_out.append(t)
-                modality_out.append(LFMModality.AUDIO_OUT)
-    print("\nGeneration complete")
-    # Clean up text
-    full_text = full_text.replace("<|text_end|>", "").strip()
-    # Decode audio (remove last end-of-audio token)
-    if len(audio_out) > 1:
-        mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0).to(device)
-        with torch.no_grad():
-            waveform = mimi.decode(mimi_codes)[0]
-        # Convert to numpy for Gradio
-        audio_np = waveform.cpu().numpy()
-        audio_output = (24000, audio_np.T)  # Gradio expects (rate, data)
-    else:
-        audio_output = None
-    # Update chat state
-    if text_out and audio_out:
-        chat_state.append(
-            text=torch.stack(text_out, 1),
-            audio_out=torch.stack(audio_out, 1),
-            modality_flag=torch.tensor(modality_out, device=device),
-        )
-    chat_state.end_turn()
-    chat_state.new_turn("user")
-    return audio_output, full_text, chat_state
-def reset_chat():
-    """Reset chat state"""
-    return ChatState(processor), "", None
 # Create Gradio interface
-with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
     gr.Markdown("""
-    # LFM2-Audio Speech-to-Speech Chat
-    Talk to LFM2-Audio! Record your voice and get a response with both text and audio.
     **How to use:**
-    1. Click the microphone button to record your voice
-    2. Adjust temperature and top-k parameters if needed (or leave defaults)
-    3. Click "Generate Response"
-    4. Listen to the audio response and read the text transcription
-    **Note:** This model runs on GPU. If you experience long wait times, the Space might be on CPU or heavily loaded.
     """)
     chat_state = gr.State(ChatState(processor))
     with gr.Row():
         with gr.Column():
-            audio_input = gr.Audio(
-                sources=["microphone"],
-                type="numpy",
-                label="Record your voice"
             )
             with gr.Row():
@@ -163,7 +172,7 @@ with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
                     value=1.0,
                     step=0.1,
                     label="Temperature (0 for greedy)",
-                    info="Higher = more creative, lower = more deterministic"
                 )
                 top_k = gr.Slider(
                     minimum=0,
@@ -171,47 +180,50 @@ with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
                     value=4,
                     step=1,
                     label="Top-k (0 for no filtering)",
-                    info="Number of top tokens to sample from"
                 )
-            generate_btn = gr.Button("Generate Response", variant="primary")
-            reset_btn = gr.Button("Reset Chat")
         with gr.Column():
-            text_output = gr.Textbox(
-                label="Assistant Response (Text)",
-                lines=4,
-                interactive=False
-            )
-            audio_output = gr.Audio(
-                label="Assistant Response (Audio)",
-                type="numpy",
                 interactive=False
             )
     gr.Markdown("""
-    ### About LFM2-Audio
-    LFM2-Audio-1.5B is Liquid AI's first end-to-end audio foundation model. It supports:
-    - Real-time speech-to-speech conversations
-    - Low-latency interleaved text and audio generation
-    - Natural flowing conversations
-    [Learn more](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/)
     """)
-    # Event handlers
-    generate_btn.click(
-        fn=generate_response,
-        inputs=[audio_input, temperature, top_k, chat_state],
-        outputs=[audio_output, text_output, chat_state]
     )
-    reset_btn.click(
-        fn=reset_chat,
-        outputs=[chat_state, text_output, audio_output]
     )
 if __name__ == "__main__":
-    demo.launch()

 """
+Real-time WebRTC speech-to-speech demo with fastrtc
+Based on the original liquid-audio demo
 """
+from queue import Queue
+from threading import Thread
 import gradio as gr
 import numpy as np
 import torch
+from fastrtc import AdditionalOutputs, ReplyOnPause, WebRTC
 from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality
 print(f"Models loaded on {device}")
+def chat_producer(
+    q: Queue[torch.Tensor | None],
+    chat: ChatState,
+    temp: float | None,
+    topk: int | None,
+):
+    """Producer thread that generates tokens"""
+    print(f"Starting generation with state {chat}.")
+    with torch.no_grad(), mimi.streaming(1):
         for t in model.generate_interleaved(
+            **chat,
             max_new_tokens=1024,
             audio_temperature=temp,
             audio_top_k=topk,
         ):
+            q.put(t)
+            if t.numel() > 1:
+                if (t == 2048).any():
+                    continue
+                wav_chunk = mimi.decode(t[None, :, None])[0]
+                q.put(wav_chunk)
+    q.put(None)
+def chat_response(audio: tuple[int, np.ndarray], _id: str, chat: ChatState, temp: float | None = 1.0, topk: int | None = 4):
+    """Handle incoming audio and generate streaming response"""
+    if temp == 0:
+        temp = None
+    if topk == 0:
+        topk = None
+    if temp is not None:
+        temp = float(temp)
+    if topk is not None:
+        topk = int(topk)
+    if len(chat.text) == 1:
+        chat.new_turn("system")
+        chat.add_text("Respond with interleaved text and audio.")
+        chat.end_turn()
+        chat.new_turn("user")
+    rate, wav = audio
+    # Convert to tensor with proper shape (channels, samples)
+    wav_tensor = torch.tensor(wav / 32_768, dtype=torch.float)
+    # Ensure correct shape
+    if len(wav_tensor.shape) == 1:
+        wav_tensor = wav_tensor.unsqueeze(0)
+    elif len(wav_tensor.shape) > 1:
+        # If stereo, convert to mono
+        wav_tensor = wav_tensor.mean(dim=-1, keepdim=True).T
+    chat.add_audio(wav_tensor, rate)
+    chat.end_turn()
+    chat.new_turn("assistant")
+    q: Queue[torch.Tensor | None] = Queue()
+    chat_thread = Thread(target=chat_producer, args=(q, chat, temp, topk))
+    chat_thread.start()
+    out_text: list[torch.Tensor] = []
+    out_audio: list[torch.Tensor] = []
+    out_modality: list[LFMModality] = []
+    while True:
+        t = q.get()
+        if t is None:
+            break
+        elif t.numel() == 1:  # text
+            out_text.append(t)
+            out_modality.append(LFMModality.TEXT)
+            print(processor.text.decode(t), end="")
+            cur_string = processor.text.decode(torch.cat(out_text)).removesuffix("<|text_end|>")
+            yield AdditionalOutputs(cur_string)
+        elif t.numel() == 8:
+            out_audio.append(t)
+            out_modality.append(LFMModality.AUDIO_OUT)
+        elif t.numel() == 1920:
+            np_chunk = (t.cpu().numpy() * 32_767).astype(np.int16)
+            yield (24_000, np_chunk)
+        else:
+            raise RuntimeError(f"unexpected shape: {t.shape}")
+    chat.append(
+        text=torch.stack(out_text, 1),
+        audio_out=torch.stack(out_audio, 1),
+        modality_flag=torch.tensor(out_modality, device=device),
+    )
+    chat.end_turn()
+    chat.new_turn("user")
+def clear():
+    """Clear chat history"""
+    gr.Info("Cleared chat history", duration=3)
+    return ChatState(processor), None
 # Create Gradio interface
+with gr.Blocks(title="LFM2-Audio Real-time Speech-to-Speech") as demo:
     gr.Markdown("""
+    # LFM2-Audio Real-time Speech-to-Speech Chat
+    **Real-time WebRTC streaming** powered by fastrtc - Talk naturally and get instant responses!
     **How to use:**
+    1. Click "Allow" when prompted for microphone access
+    2. Start speaking - the model listens and responds in real-time
+    3. The conversation flows naturally with minimal latency
+    **Features:**
+    - 🎙️ Real-time WebRTC streaming
+    - ⚡ Low latency response
+    - 💬 Interleaved text and audio output
+    - 🔄 Multi-turn conversations
     """)
     chat_state = gr.State(ChatState(processor))
     with gr.Row():
         with gr.Column():
+            webrtc = WebRTC(
+                modality="audio",
+                mode="send-receive",
+                full_screen=False,
             )
             with gr.Row():
                     value=1.0,
                     step=0.1,
                     label="Temperature (0 for greedy)",
+                    info="Higher = more creative"
                 )
                 top_k = gr.Slider(
                     minimum=0,
                     value=4,
                     step=1,
                     label="Top-k (0 for no filtering)",
+                    info="Sampling diversity"
                 )
+            clear_btn = gr.Button("Reset Chat")
         with gr.Column():
+            text_out = gr.Textbox(
+                lines=10,
+                label="Conversation Text",
                 interactive=False
             )
     gr.Markdown("""
+    ### About this demo
+    This demo uses **fastrtc** for WebRTC streaming, enabling real-time speech-to-speech interaction with minimal latency.
+    The model processes your speech and generates both text and audio responses simultaneously.
+    **Model**: LFM2-Audio-1.5B by Liquid AI
+    **Mode**: Interleaved generation (optimized for real-time)
+    **Audio Codec**: Mimi (24kHz)
+    [Liquid AI](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/) | [Model Card](https://huggingface.co/LiquidAI/LFM2-Audio-1.5B)
     """)
+    # Setup WebRTC streaming
+    webrtc.stream(
+        ReplyOnPause(
+            chat_response,  # type: ignore[arg-type]
+            input_sample_rate=24_000,
+            output_sample_rate=24_000,
+            can_interrupt=False,
+        ),
+        inputs=[webrtc, chat_state, temperature, top_k],
+        outputs=[webrtc],
     )
+    webrtc.on_additional_outputs(
+        lambda s: s,
+        outputs=[text_out],
     )
+    clear_btn.click(clear, outputs=[chat_state, text_out])
 if __name__ == "__main__":
+    demo.launch()