From 01fa12e0dee93fedf777922996aa63995fb7495a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=95=AA=E8=8C=84=E6=91=94=E6=88=90=E7=95=AA=E8=8C=84?=
 =?UTF-8?q?=E9=85=B1?= <68098251+fqscfqj@users.noreply.github.com>
Date: Sun, 21 Jun 2026 23:04:19 +0800
Subject: [PATCH] feat(nemo): enable word-level timestamps for ASR models
 (#10297)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(nemo): enable word-level timestamps for ASR models

The nemo backend ignored timestamp_granularities and always returned a
single segment with start=0 end=0, making word-level timestamps
impossible to obtain even though the NeMo models (parakeet-tdt, etc.)
fully support them.

Changes:
- Add _get_stride_seconds() to compute frame duration from the model's
  preprocessor window_stride and encoder subsampling_factor.
- Add _build_segments_with_words() that extracts word offsets from the
  NeMo Hypothesis.timestamp dict and converts frame indices to
  nanosecond timestamps.
- Support 'word' granularity (one segment per word) and 'segment'
  granularity (merge at time-gap boundaries using a dynamic threshold).
- Populate TranscriptSegment.words with TranscriptWord entries so
  callers get both segment-level and word-level timing.
- Only request timestamps from NeMo when the caller actually asks for
  them (timestamp_granularities is non-empty), keeping the fast path
  unchanged for callers that don't need timestamps.

Tested with nvidia/parakeet-tdt-0.6b-v3 on the JFK "ask not" clip:
  curl -X POST /v1/audio/transcriptions \
    -F file=@jfk.wav -F model=nemo-parakeet-tdt-0.6b \
    -F 'timestamp_granularities[]=word' -F response_format=verbose_json
  → each word has correct start/end times in seconds.

Signed-off-by: fqscfqj <fqscfqj@outlook.com>

* fix(nemo): address Copilot review feedback

- Narrow exception handling in _get_stride_seconds to catch only
  AttributeError, KeyError, TypeError instead of bare Exception, and
  emit a warning when falling back to the hardcoded stride.
- Remove explicit return_hypotheses=False when timestamps are requested;
  timestamps=True already forces NeMo to return Hypothesis objects.
- Add a warning when NeMo does not return Hypothesis objects despite
  timestamps being requested.

Signed-off-by: fqscfqj <fqscfqj@outlook.com>

---------

Signed-off-by: fqscfqj <fqscfqj@outlook.com>
---
 backend/python/nemo/backend.py | 202 ++++++++++++++++++++++++++++++---
 1 file changed, 186 insertions(+), 16 deletions(-)

diff --git a/backend/python/nemo/backend.py b/backend/python/nemo/backend.py
index ccbff7cd2..a5c30694e 100644
--- a/backend/python/nemo/backend.py
+++ b/backend/python/nemo/backend.py
@@ -84,6 +84,135 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
         return backend_pb2.Result(message="Model loaded successfully", success=True)
 
+    def _get_stride_seconds(self):
+        """Compute the seconds-per-frame stride for the loaded model.
+
+        stride = preprocessor_window_stride * encoder_subsampling_factor
+        """
+        try:
+            preprocessor = self.model.preprocessor
+            window_stride = preprocessor._cfg.get('window_stride', 0.01)
+            subsampling_factor = getattr(self.model.encoder, 'subsampling_factor', 8)
+            return window_stride * subsampling_factor
+        except (AttributeError, KeyError, TypeError) as err:
+            print(
+                f"Warning: could not compute stride from model config ({err}), "
+                f"falling back to 0.08s/frame",
+                file=sys.stderr,
+            )
+            return 0.08
+
+    def _build_segments_with_words(self, hypothesis, stride, timestamp_granularities=None):
+        """Build TranscriptSegment list from a NeMo Hypothesis with timestamps.
+
+        Supports two granularity modes:
+          - "word": one TranscriptSegment per word, each with a single TranscriptWord entry
+          - "segment" (default): merge consecutive words into sentence-level segments,
+            splitting at word-level time gaps that exceed a dynamic threshold.
+        """
+        if not hypothesis or not isinstance(hypothesis.timestamp, dict):
+            return []
+
+        word_offsets = hypothesis.timestamp.get('word', [])
+        if not word_offsets:
+            return []
+
+        granularities = list(timestamp_granularities) if timestamp_granularities else []
+        granularity = "word" if "word" in granularities else "segment"
+
+        # Build a flat list of (text, start_ns, end_ns) from NeMo word offsets
+        transcript_words = []
+        for wo in word_offsets:
+            word_text = wo.get('word', '')
+            if not word_text:
+                continue
+            start_offset = wo.get('start_offset', 0)
+            end_offset = wo.get('end_offset', start_offset)
+            start_ns = int(start_offset * stride * 1_000_000_000)
+            end_ns = int(end_offset * stride * 1_000_000_000)
+            transcript_words.append({
+                'text': word_text,
+                'start': start_ns,
+                'end': end_ns,
+            })
+
+        if not transcript_words:
+            return []
+
+        if granularity == "word":
+            # One segment per word
+            result = []
+            for idx, tw in enumerate(transcript_words):
+                word = backend_pb2.TranscriptWord(
+                    start=tw['start'], end=tw['end'], text=tw['text']
+                )
+                result.append(backend_pb2.TranscriptSegment(
+                    id=idx,
+                    start=tw['start'],
+                    end=tw['end'],
+                    text=tw['text'],
+                    words=[word],
+                ))
+            return result
+
+        # segment mode — merge at word-level time-gap boundaries
+        # Compute gap threshold: median inter-word gap * 3, clamped to [0.3, 2.0]s
+        gaps = []
+        for i in range(1, len(transcript_words)):
+            gap = (transcript_words[i]['start'] - transcript_words[i - 1]['end']) / 1_000_000_000
+            if gap > 0:
+                gaps.append(gap)
+        if gaps:
+            gaps.sort()
+            median_gap = gaps[len(gaps) // 2]
+            threshold_ns = int(max(0.3, min(median_gap * 3, 2.0)) * 1_000_000_000)
+        else:
+            threshold_ns = int(0.5 * 1_000_000_000)
+
+        result = []
+        buf_words = []  # list of TranscriptWord protobuf
+        buf_start = None
+        buf_end = 0
+        buf_text = []
+        prev_end = None
+
+        for tw in transcript_words:
+            # Detect word-level time gap
+            if prev_end is not None and (tw['start'] - prev_end) >= threshold_ns and buf_text:
+                seg_text = ' '.join(buf_text)
+                result.append(backend_pb2.TranscriptSegment(
+                    id=len(result),
+                    start=buf_start,
+                    end=buf_end,
+                    text=seg_text,
+                    words=list(buf_words),
+                ))
+                buf_words = []
+                buf_text = []
+                buf_start = None
+
+            if buf_start is None:
+                buf_start = tw['start']
+            buf_end = tw['end']
+            buf_text.append(tw['text'])
+            buf_words.append(backend_pb2.TranscriptWord(
+                start=tw['start'], end=tw['end'], text=tw['text']
+            ))
+            prev_end = tw['end']
+
+        # flush remaining
+        if buf_text and buf_start is not None:
+            seg_text = ' '.join(buf_text)
+            result.append(backend_pb2.TranscriptSegment(
+                id=len(result),
+                start=buf_start,
+                end=buf_end,
+                text=seg_text,
+                words=list(buf_words),
+            ))
+
+        return result
+
     def AudioTranscription(self, request, context):
         result_segments = []
         text = ""
@@ -93,26 +222,67 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 print(f"Error: Audio file not found: {audio_path}", file=sys.stderr)
                 return backend_pb2.TranscriptResult(segments=[], text="")
 
-            # NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
-            results = self.model.transcribe([audio_path])
+            # Determine requested timestamp granularity
+            timestamp_granularities = list(request.timestamp_granularities) if request.timestamp_granularities else []
+            want_timestamps = bool(timestamp_granularities)
 
-            if not results or len(results) == 0:
-                return backend_pb2.TranscriptResult(segments=[], text="")
+            if want_timestamps:
+                # Request timestamps from NeMo.
+                # timestamps=True forces NeMo to return Hypothesis objects with
+                # the timestamp dict populated, so we omit return_hypotheses to
+                # let NeMo choose the correct return type.
+                results = self.model.transcribe([audio_path], timestamps=True)
 
-            # Get the transcript text from the first result.
-            # CTC models return List[str], TDT/RNNT models return List[Hypothesis]
-            # where the actual text lives in Hypothesis.text.
-            result = results[0]
-            if isinstance(result, str):
-                text = result
+                if results and len(results) > 0:
+                    hypotheses = results[0] if isinstance(results[0], list) else results
+                    if hypotheses and len(hypotheses) > 0:
+                        hypothesis = hypotheses[0]
+
+                        # Hypothesis object should have .timestamp populated
+                        if not hasattr(hypothesis, 'timestamp') or not isinstance(hypothesis.timestamp, dict):
+                            print(
+                                "Warning: timestamps were requested but NeMo did not return "
+                                "Hypothesis objects; falling back to untimestamped output",
+                                file=sys.stderr,
+                            )
+
+                        # Extract text
+                        if hasattr(hypothesis, 'text'):
+                            text = hypothesis.text or ""
+                        elif isinstance(hypothesis, str):
+                            text = hypothesis
+
+                        # Build segments with word-level timestamps
+                        stride = self._get_stride_seconds()
+                        result_segments = self._build_segments_with_words(
+                            hypothesis, stride, timestamp_granularities
+                        )
+
+                        # If no word offsets but we have text, fall back to single segment
+                        if not result_segments and text:
+                            result_segments.append(backend_pb2.TranscriptSegment(
+                                id=0, start=0, end=0, text=text
+                            ))
             else:
-                text = getattr(result, 'text', None) or ""
+                # Simple transcription without timestamps
+                # NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
+                results = self.model.transcribe([audio_path])
 
-            if text:
-                # Create a single segment with the full transcription
-                result_segments.append(backend_pb2.TranscriptSegment(
-                    id=0, start=0, end=0, text=text
-                ))
+                if results and len(results) > 0:
+                    # Get the transcript text from the first result.
+                    # CTC models return List[str], TDT/RNNT models return List[Hypothesis]
+                    # where the actual text lives in Hypothesis.text.
+                    result = results[0]
+                    if isinstance(result, str):
+                        text = result
+                    else:
+                        text = getattr(result, 'text', None) or ""
+
+                    if text:
+                        # Create a single segment with the full transcription
+                        result_segments.append(backend_pb2.TranscriptSegment(
+                            id=0, start=0, end=0, text=text
+                        ))
 
         except Exception as err:
             print(f"Error in AudioTranscription: {err}", file=sys.stderr)