"""
Real-time WebRTC speech-to-speech demo with fastrtc
Based on the original liquid-audio demo
"""
import os
from queue import Queue
from threading import Thread

import gradio as gr
import numpy as np
import torch
from fastrtc import AdditionalOutputs, ReplyOnPause, WebRTC

from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality
# Configure WebRTC with public STUN servers (a TURN relay can be added below).
# ICE configuration is critical for WebRTC connections to traverse firewalls/NAT.
rtc_configuration = {
    "iceServers": [
        {
            "urls": [
                "stun:stun.l.google.com:19302",
                "stun:stun1.l.google.com:19302",
            ]
        }
    ]
}
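# STUN only lets peers discover their public address; clients behind symmetric
# NATs or strict firewalls additionally need a TURN relay to forward media.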
# For production deployment on Hugging Face Spaces, you can use Cloudflare TURN:
# uncomment these lines and set TURN_KEY_ID and TURN_KEY_API_TOKEN as Secrets.
# Pass the async function itself (not a call) so fastrtc can fetch fresh
# credentials per connection:
# from fastrtc import get_cloudflare_turn_credentials_async
# if os.getenv("TURN_KEY_ID") and os.getenv("TURN_KEY_API_TOKEN"):
#     rtc_configuration = get_cloudflare_turn_credentials_async
# Load models
HF_REPO = "LiquidAI/LFM2-Audio-1.5B"
print("Loading processor...")
processor = LFM2AudioProcessor.from_pretrained(HF_REPO).eval()
print("Loading model...")
model = LFM2AudioModel.from_pretrained(HF_REPO).eval()
print("Loading audio codec...")
mimi = processor.mimi.eval()
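# Mimi is the streaming neural audio codec used for audio tokens: at 24 kHz it
# emits one 8-codebook frame per 80 ms of audio (1,920 samples), which is why
# the consumer below dispatches on tensor sizes 8 and 1920.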
# Move to CUDA if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
mimi = mimi.to(device)
print(f"Models loaded on {device}")
def chat_producer(
    q: Queue[torch.Tensor | None],
    chat: ChatState,
    temp: float | None,
    topk: int | None,
):
    """Producer thread that generates tokens and decoded audio chunks."""
    print(f"Starting generation with state {chat}.")
    with torch.no_grad(), mimi.streaming(1):
        for t in model.generate_interleaved(
            **chat,
            max_new_tokens=1024,
            audio_temperature=temp,
            audio_top_k=topk,
        ):
            q.put(t)
            if t.numel() > 1:
                # Frames containing the out-of-band marker token (2048) are not
                # decodable audio; skip them.
                if (t == 2048).any():
                    continue
                wav_chunk = mimi.decode(t[None, :, None])[0]
                q.put(wav_chunk)
    # Sentinel: tell the consumer that generation is finished.
    q.put(None)
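# Queue protocol between producer and consumer, inferred from tensor sizes:
#   numel == 1    -> one text token id
#   numel == 8    -> one Mimi frame (8 codebook ids), kept for chat history
#   numel == 1920 -> one decoded PCM chunk (80 ms at 24 kHz)
#   None          -> end of generation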
def chat_response(
    audio: tuple[int, np.ndarray],
    _id: str,
    chat: ChatState,
    temp: float | None = 1.0,
    topk: int | None = 4,
):
    """Handle incoming audio and stream back text and audio responses."""
    # Slider value 0 means "disabled": greedy decoding / no top-k filtering.
    if temp == 0:
        temp = None
    if topk == 0:
        topk = None
    if temp is not None:
        temp = float(temp)
    if topk is not None:
        topk = int(topk)
    # On the very first turn, prepend the system prompt and open a user turn.
    if len(chat.text) == 1:
        chat.new_turn("system")
        chat.add_text("Respond with interleaved text and audio.")
        chat.end_turn()
        chat.new_turn("user")
    rate, wav = audio
    # Convert int16 PCM to a float tensor in [-1, 1] with shape (channels, samples).
    wav_tensor = torch.tensor(wav / 32_768, dtype=torch.float)
    if len(wav_tensor.shape) == 1:
        wav_tensor = wav_tensor.unsqueeze(0)
    elif len(wav_tensor.shape) > 1:
        # Stereo (samples, channels): average to mono, then transpose to (1, samples).
        wav_tensor = wav_tensor.mean(dim=-1, keepdim=True).T
    chat.add_audio(wav_tensor, rate)
    chat.end_turn()
    chat.new_turn("assistant")
    # Generate on a background thread so results can be streamed as they arrive.
    q: Queue[torch.Tensor | None] = Queue()
    chat_thread = Thread(target=chat_producer, args=(q, chat, temp, topk))
    chat_thread.start()
    out_text: list[torch.Tensor] = []
    out_audio: list[torch.Tensor] = []
    out_modality: list[LFMModality] = []
    while True:
        t = q.get()
        if t is None:
            break
        elif t.numel() == 1:  # text token
            out_text.append(t)
            out_modality.append(LFMModality.TEXT)
            print(processor.text.decode(t), end="")
            cur_string = processor.text.decode(torch.cat(out_text)).removesuffix("<|text_end|>")
            yield AdditionalOutputs(cur_string)
        elif t.numel() == 8:  # Mimi codebook frame
            out_audio.append(t)
            out_modality.append(LFMModality.AUDIO_OUT)
        elif t.numel() == 1920:  # decoded PCM chunk
            np_chunk = (t.cpu().numpy() * 32_767).astype(np.int16)
            yield (24_000, np_chunk)
        else:
            raise RuntimeError(f"unexpected shape: {t.shape}")
    # Store the assistant turn so follow-up turns keep the full conversation context.
    chat.append(
        text=torch.stack(out_text, 1),
        audio_out=torch.stack(out_audio, 1),
        modality_flag=torch.tensor(out_modality, device=device),
    )
    chat.end_turn()
    chat.new_turn("user")
def clear():
    """Clear chat history"""
    gr.Info("Cleared chat history", duration=3)
    return ChatState(processor), None
# Create Gradio interface
with gr.Blocks(title="LFM2-Audio Real-time Speech-to-Speech") as demo:
    gr.Markdown("""
    # LFM2-Audio Real-time Speech-to-Speech Chat

    **Real-time WebRTC streaming** powered by fastrtc - talk naturally and get instant responses!

    **How to use:**
    1. Click "Allow" when prompted for microphone access
    2. Start speaking - the model listens and responds in real time
    3. The conversation flows naturally with minimal latency

    **Features:**
    - 🎙️ Real-time WebRTC streaming
    - ⚡ Low-latency responses
    - 💬 Interleaved text and audio output
    - 🔄 Multi-turn conversations
    """)
    chat_state = gr.State(ChatState(processor))
    with gr.Row():
        with gr.Column():
            webrtc = WebRTC(
                modality="audio",
                mode="send-receive",
                full_screen=False,
                rtc_configuration=rtc_configuration,
            )
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Temperature (0 for greedy)",
                    info="Higher = more creative",
                )
                top_k = gr.Slider(
                    minimum=0,
                    maximum=100,
                    value=4,
                    step=1,
                    label="Top-k (0 for no filtering)",
                    info="Sampling diversity",
                )
            clear_btn = gr.Button("Reset Chat")
        with gr.Column():
            text_out = gr.Textbox(
                lines=10,
                label="Conversation Text",
                interactive=False,
            )
    gr.Markdown("""
    ### About this demo

    This demo uses **fastrtc** for WebRTC streaming, enabling real-time speech-to-speech
    interaction with minimal latency. The model processes your speech and generates both
    text and audio responses simultaneously.

    **Model**: LFM2-Audio-1.5B by Liquid AI
    **Mode**: Interleaved generation (optimized for real-time)
    **Audio Codec**: Mimi (24 kHz)

    [Liquid AI](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/) | [Model Card](https://huggingface.co/LiquidAI/LFM2-Audio-1.5B)
    """)
    # Setup WebRTC streaming
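    # ReplyOnPause buffers microphone audio and invokes chat_response once the
    # speaker pauses, then streams the generator's (rate, chunk) yields back.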
    webrtc.stream(
        ReplyOnPause(
            chat_response,  # type: ignore[arg-type]
            input_sample_rate=24_000,
            output_sample_rate=24_000,
            can_interrupt=False,
        ),
        inputs=[webrtc, chat_state, temperature, top_k],
        outputs=[webrtc],
    )
    webrtc.on_additional_outputs(
        lambda s: s,
        outputs=[text_out],
    )
    clear_btn.click(clear, outputs=[chat_state, text_out])
if __name__ == "__main__":
    demo.launch()
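# Note (an assumption, not part of the original demo): when self-hosting in a
# container you may need to bind to all interfaces, e.g.:
# demo.launch(server_name="0.0.0.0", server_port=7860)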