Spaces:

WJ88
/

NVIDIA-Parakeet-TDT-0.6B-v2-INT8-Real-Time-Mic-Transcription

Running

App Files Files Community

WJ88 committed on May 22

Commit

41d27d9

verified ·

1 Parent(s): d22d81d

Create app.py

Browse files

Files changed (1) hide show

app.py +153 -0

app.py ADDED Viewed

	@@ -0,0 +1,153 @@

+"""
+Optimised NeMo Parakeet-TDT streaming demo for CPU-only Hugging Face Spaces
+"""
+import os, time, threading, queue, logging
+import numpy as np
+import gradio as gr
+from scipy import signal
+import torch
+from nemo.collections.asr.models import ASRModel
+# ────────────────────────────────────────────────
+# General CPU settings (2 vCPU space)
+# ────────────────────────────────────────────────
+os.environ["OMP_NUM_THREADS"] = "2"          # One MKL/OpenMP thread per vCPU. NOTE(review): set after `import torch`/numpy above, so it may be too late to take effect — confirm
+torch.set_num_threads(2)                     # Cap PyTorch's intra-op CPU thread pool at 2 threads
+torch.backends.quantized.engine = "fbgemm"   # Fastest INT8 kernels on x86
+# ────────────────────────────────────────────────
+# Logging
+# ────────────────────────────────────────────────
+logging.basicConfig(
+    level=logging.INFO,                      # emit INFO and above
+    format="%(asctime)s - %(message)s",      # timestamp, then the message text
+    datefmt="%H:%M:%S",                      # time of day only (no date)
+)
+logger = logging.getLogger("asr_app")        # logger written to by ASRApp._log
+# ────────────────────────────────────────────────
+# Constants
+# ────────────────────────────────────────────────
+SR              = 16_000            # Model sample-rate (Hz); incoming audio is resampled to this in ASRApp._preprocess
+CHUNK_SECONDS   = 4                 # seconds per inference window
+CHUNK_SAMPLES   = SR * CHUNK_SECONDS  # samples the worker accumulates before each transcribe call
+# ────────────────────────────────────────────────
+# ASR Application
+# ────────────────────────────────────────────────
+class ASRApp:
+    def __init__(self):
+        self.audio_queue      = queue.Queue(maxsize=100)
+        self.transcript_queue = queue.Queue()
+        self.transcript_list  = []
+        self._load_model()
+        self._start_worker()
+    # ---------- helpers ----------
+    def _log(self, func: str, msg: str):
+        logger.info(
+            f"{func} | audio_q={self.audio_queue.qsize():02}, "
+            f"txt_q={self.transcript_queue.qsize():02} | {msg}"
+        )
+    # ---------- model ----------
+    def _load_model(self):
+        self._log("load_model", "loading Parakeet-TDT-0.6B-V2 (CPU)…")
+        t0 = time.time()
+        model = ASRModel.from_pretrained(
+            model_name="nvidia/parakeet-tdt-0.6b-v2",
+            map_location="cpu",
+        )
+        model.eval()                         # inference mode
+        # ---- dynamic INT8 quantisation ----
+        try:
+            model = torch.quantization.quantize_dynamic(
+                model,
+                {torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU},
+                dtype=torch.qint8,
+            )
+            self._log("load_model", "INT8 quantisation applied")
+        except Exception as e:
+            self._log("load_model", f"quantisation skipped ({e})")
+        self.asr_model = model
+        self._log("load_model", f"model ready in {time.time()-t0:.1f}s")
+        # warm-up (1 × 1 s of zeros)
+        with torch.inference_mode():
+            _ = self.asr_model.transcribe(
+                [np.zeros(SR, dtype=np.float32)]
+            )
+        self._log("load_model", "warm-up done")
+    # ---------- threading ----------
+    def _start_worker(self):
+        threading.Thread(
+            target=self._worker,
+            daemon=True,
+        ).start()
+    def _worker(self):
+        buf = np.array([], dtype=np.float32)
+        while True:
+            try:
+                # accumulate until CHUNK_SAMPLES
+                while len(buf) < CHUNK_SAMPLES:
+                    buf = np.concatenate([buf, self.audio_queue.get()])
+                    self._log("_worker", f"buffer={len(buf)}")
+                chunk, buf = buf[:CHUNK_SAMPLES], buf[CHUNK_SAMPLES:]
+                self._log("_worker", f"→ transcribe {len(chunk)} samples")
+                t0 = time.time()
+                with torch.inference_mode():
+                    out = self.asr_model.transcribe([chunk])
+                dur = time.time() - t0
+                text = out[0].text
+                self._log("_worker", f"inference {dur:.2f}s → “{text}”")
+                self.transcript_queue.put(text)
+            except Exception as e:
+                self._log("_worker", f"ASR error: {e}")
+    # ---------- audio preprocessing ----------
+    def _preprocess(self, audio):
+        sr, y = audio
+        if y.ndim > 1:
+            y = y.mean(axis=1)
+        if sr != SR:
+            # resample faster with polyphase filter
+            y = signal.resample_poly(y, SR, sr)
+        y = y.astype(np.float32)
+        y /= (np.abs(y).max() + 1e-9)
+        return y
+    # ---------- Gradio stream callback ----------
+    def stream_fn(self, audio):
+        self._log("stream_fn", "audio arrived")
+        self.audio_queue.put(self._preprocess(audio))
+        while not self.transcript_queue.empty():
+            self.transcript_list.append(self.transcript_queue.get())
+        return (
+            " ".join(self.transcript_list)
+            if self.transcript_list
+            else "…listening…"
+        )
+# ────────────────────────────────────────────────
+# Gradio UI
+# ────────────────────────────────────────────────
+asr_app = ASRApp()
+with gr.Blocks() as demo:
+    mic = gr.Audio(
+        sources=["microphone"],
+        type="numpy",
+        streaming=True,
+        label="Microphone",
+    )
+    out = gr.Textbox(label="Transcription")
+    mic.stream(
+        fn=asr_app.stream_fn,
+        inputs=mic,
+        outputs=out,
+        stream_every=0.5,        # ↓ UI calls per second
+    )
+asr_app._log("main", "launching UI")
+demo.launch()