From 071872bb536acfb04934d186a8589eb480f9bd21 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Sun, 7 Jun 2026 08:47:12 +0000
Subject: [PATCH] feat(parakeet-cpp): real segment timestamps (NeMo-faithful)

Offline: replace the single synthetic whole-clip segment with multiple
segments grouped exactly like NeMo's get_segment_offsets - a new segment
after sentence-ending punctuation ('. ? !'), each carrying start/end and
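its time-window token ids. The optional model option segment_gap_threshold
(NeMo's unit: encoder FRAMES, default 0=off) enables NeMo's silence-gap split,
converted to seconds via the JSON frame_sec the engine now reports. As in
NeMo's if/elif, while the gap rule is on punctuation is only checked for a
word that opens a segment; inside a segment only a silence gap splits.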
Per-segment words are still gated behind timestamp_granularities=["word"];
a zero-word document falls back to a single text segment.

Streaming: when libparakeet.so exposes the ABI v4 JSON entry points
(probed), drive parakeet_capi_stream_feed_json / _finalize_json and
accumulate the streamed per-word timestamps into per-utterance segments
(EOU stays the boundary), so streaming FinalResult segments now carry
start/end. Falls back to the text-only feed against an older library.

Pure-Go specs cover splitWordsIntoSegments (punctuation + gap rules, NeMo
elif order, fallback), transcriptResultFromDoc (multi-segment, token
windows, word-granularity gate), and the streaming segmenter.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/go/parakeet-cpp/goparakeetcpp.go      | 287 ++++++++++++++++--
 backend/go/parakeet-cpp/goparakeetcpp_test.go |   4 +
 backend/go/parakeet-cpp/main.go               |   8 +
 backend/go/parakeet-cpp/segments_test.go      | 127 ++++++++
 4 files changed, 403 insertions(+), 23 deletions(-)
 create mode 100644 backend/go/parakeet-cpp/segments_test.go
diff --git a/backend/go/parakeet-cpp/goparakeetcpp.go b/backend/go/parakeet-cpp/goparakeetcpp.go
index 4821b9c40..a65b4bca0 100644
--- a/backend/go/parakeet-cpp/goparakeetcpp.go
+++ b/backend/go/parakeet-cpp/goparakeetcpp.go
@@ -67,6 +67,13 @@ var (
 	// plus a trailing target_lang ("" means the model default). Present only in
 	// newer libparakeet.so; nil falls back to CppStreamBegin.
 	CppStreamBeginLang func(ctx uintptr, targetLang string) uintptr
+
+	// Streaming JSON variants (ABI v4): feed/finalize returning a malloc'd char*
+	// JSON document {text,eou,frame_sec,words} (uintptr, freed via CppFreeString)
+	// so streaming segments can carry per-word timestamps. Present only in newer
+	// libparakeet.so; nil falls back to the text-only CppStreamFeed/Finalize path.
+	CppStreamFeedJSON     func(s uintptr, pcm []float32, nSamples int32) uintptr
+	CppStreamFinalizeJSON func(s uintptr) uintptr
 )
 
 // streamChunkSamples is how much 16 kHz mono PCM we hand to stream_feed per
@@ -84,9 +91,26 @@ const streamChunkSamples = 16000
 //
 // "start"/"end"/"t" are seconds; "conf" is confidence in (0,1].
 type transcriptJSON struct {
-	Text   string            `json:"text"`
-	Words  []transcriptWord  `json:"words"`
-	Tokens []transcriptToken `json:"tokens"`
+	Text     string            `json:"text"`
+	FrameSec float64           `json:"frame_sec"`
+	Words    []transcriptWord  `json:"words"`
+	Tokens   []transcriptToken `json:"tokens"`
+}
+
+// streamFeedJSON mirrors the document returned by
+// parakeet_capi_stream_feed_json / parakeet_capi_stream_finalize_json (ABI v4):
+//
+//	{"text":"...","eou":0,"frame_sec":0.080000,
+//	 "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]}
+//
+// "text" is the newly-finalized text since the last call; "eou" is 1 when an
+// <EOU>/<EOB> fired this feed; "words" are the words finalized this call with
+// absolute (stream-relative) start/end seconds.
+type streamFeedJSON struct {
+	Text     string           `json:"text"`
+	Eou      int              `json:"eou"`
+	FrameSec float64          `json:"frame_sec"`
+	Words    []transcriptWord `json:"words"`
 }
 
 type transcriptWord struct {
@@ -115,6 +139,10 @@ type ParakeetCpp struct {
 	engineMu sync.Mutex // sole guard of the one C engine (dispatcher + streaming)
 	bat      *batcher
 	batStop  chan struct{}
+	// segmentGapFrames is NeMo's segment_gap_threshold in ENCODER FRAMES (model
+	// YAML option, default 0=off), converted to seconds via the JSON frame_sec.
+	// When >0, silence gaps split segments and punctuation only splits on a segment's first word.
+	segmentGapFrames int
 }
 
 // Load is the LocalAI gRPC entry point for LoadModel: it calls
@@ -144,6 +172,11 @@ func (p *ParakeetCpp) Load(opts *pb.ModelOptions) error {
 	if maxWaitMs < 0 {
 		maxWaitMs = 0
 	}
+
+	// NeMo's segment_gap_threshold (encoder frames, default 0=off). Off by default
+	// matches NeMo's default (punctuation-only segments); when set, inter-word silence
+	// splits segments and punctuation only splits on a segment's first word (see splitWordsIntoSegments).
+	p.segmentGapFrames = optInt(opts, "segment_gap_threshold", 0)
 	if CppTranscribePcmBatchJSON != nil {
 		p.batStop = make(chan struct{})
 		p.bat = newBatcher(maxSize, time.Duration(maxWaitMs)*time.Millisecond, p.runBatch)
@@ -283,7 +316,7 @@ func (p *ParakeetCpp) AudioTranscription(ctx context.Context, opts *pb.Transcrip
 		if err := json.Unmarshal([]byte(raw), &doc); err != nil {
 			return pb.TranscriptResult{}, fmt.Errorf("parakeet-cpp: decode transcript json: %w", err)
 		}
-		return transcriptResultFromDoc(doc, opts), nil
+		return transcriptResultFromDoc(doc, opts, p.segmentGapFrames), nil
 	}
 
 	// Batched path: decode to PCM, submit to the batcher, wait for this request's
@@ -312,34 +345,169 @@ func (p *ParakeetCpp) AudioTranscription(ctx context.Context, opts *pb.Transcrip
 	if err := json.Unmarshal([]byte(res.json), &doc); err != nil {
 		return pb.TranscriptResult{}, fmt.Errorf("parakeet-cpp: decode transcript json: %w", err)
 	}
-	return transcriptResultFromDoc(doc, opts), nil
+	return transcriptResultFromDoc(doc, opts, p.segmentGapFrames), nil
 }
 
+// segmentSeparators is NeMo's default segment_seperators (sentence-ending
+// punctuation). Splitting on these matches NeMo's default segment timestamps.
+var segmentSeparators = []rune{'.', '?', '!'}
+
 // transcriptResultFromDoc maps a decoded transcriptJSON to a TranscriptResult,
-// synthesising a single whole-clip segment and attaching word timings only when
-// the caller requested word granularity. Shared by the batched and direct paths.
-func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest) pb.TranscriptResult {
+// grouping words into NeMo-faithful segments (see splitWordsIntoSegments). The
+// optional gapFrames (NeMo's segment_gap_threshold, in encoder FRAMES; 0=off)
+// enables the inter-word silence split; it is converted to a seconds gap with
+// the document's frame_sec (no frame_sec: warn, punctuation-only). Per-segment
+// word timings are attached only when the caller requested word granularity;
+// token ids fill Tokens by time window. Shared by the batched and direct paths.
+func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gapFrames int) pb.TranscriptResult {
 	text := strings.TrimSpace(doc.Text)
-	words := make([]*pb.TranscriptWord, 0, len(doc.Words))
-	for _, w := range doc.Words {
-		words = append(words, &pb.TranscriptWord{Start: secondsToNanos(w.Start), End: secondsToNanos(w.End), Text: w.W})
+
+	// Frame-unit gap threshold -> seconds (NeMo segment_gap_threshold). 0 = off.
+	gapSeconds := 0.0
+	if gapFrames > 0 {
+		if doc.FrameSec > 0 {
+			gapSeconds = float64(gapFrames) * doc.FrameSec
+		} else {
+			xlog.Warn("parakeet-cpp: segment_gap_threshold set but libparakeet.so " +
+				"did not report frame_sec; falling back to punctuation-only segments")
+		}
 	}
-	tokens := make([]int32, 0, len(doc.Tokens))
-	for _, t := range doc.Tokens {
-		tokens = append(tokens, t.ID)
+
+	groups := splitWordsIntoSegments(doc.Words, segmentSeparators, gapSeconds)
+	if len(groups) == 0 {
+		// No words (edge case): single whole-clip text segment.
+		return pb.TranscriptResult{
+			Text:     text,
+			Segments: []*pb.TranscriptSegment{{Id: 0, Text: text}},
+		}
 	}
-	var segStart, segEnd int64
-	if len(words) > 0 {
-		segStart = words[0].Start
-		segEnd = words[len(words)-1].End
+
+	wantWords := wordsRequested(opts.TimestampGranularities)
+	segments := make([]*pb.TranscriptSegment, 0, len(groups))
+	for id, group := range groups {
+		parts := make([]string, len(group))
+		for i, gw := range group {
+			parts[i] = gw.W
+		}
+		seg := &pb.TranscriptSegment{
+			Id:     int32(id),
+			Start:  secondsToNanos(group[0].Start),
+			End:    secondsToNanos(group[len(group)-1].End),
+			Text:   strings.TrimSpace(strings.Join(parts, " ")),
+			Tokens: tokensInWindow(doc.Tokens, group[0].Start, group[len(group)-1].End),
+		}
+		if wantWords {
+			ws := make([]*pb.TranscriptWord, len(group))
+			for i, gw := range group {
+				ws[i] = &pb.TranscriptWord{Start: secondsToNanos(gw.Start), End: secondsToNanos(gw.End), Text: gw.W}
+			}
+			seg.Words = ws
+		}
+		segments = append(segments, seg)
 	}
-	seg := &pb.TranscriptSegment{Id: 0, Start: segStart, End: segEnd, Text: text, Tokens: tokens}
-	if wordsRequested(opts.TimestampGranularities) {
-		seg.Words = words
-	}
-	return pb.TranscriptResult{Text: text, Segments: []*pb.TranscriptSegment{seg}}
+	return pb.TranscriptResult{Text: text, Segments: segments}
 }
 
+// splitWordsIntoSegments groups words into segments exactly as NeMo's
+// get_segment_offsets does (nemo/collections/asr/parts/utils/timestamp_utils.py).
+// For each word, when the gap rule is enabled (gapSeconds > 0) AND the current
+// segment already has words, only the gap is tested: a gap from the previous
+// word's end to this word's start >= gapSeconds closes the segment and the word
+// STARTS a new one; otherwise the word is appended, even if it ends a sentence.
+// Only when that branch is skipped (gap rule off, or empty segment - NeMo's
+// elif) does a word ending with (or being) a separator close the segment
+// INCLUDING that word. Trailing words flush into a final segment. gapSeconds <= 0
+// matches NeMo's default segment_gap_threshold=None (punctuation-only segments).
+func splitWordsIntoSegments(words []transcriptWord, separators []rune, gapSeconds float64) [][]transcriptWord {
+	var segments [][]transcriptWord
+	var cur []transcriptWord
+	for i, word := range words {
+		gapActive := gapSeconds > 0 && len(cur) > 0
+		if gapActive && (word.Start-words[i-1].End) >= gapSeconds {
+			segments = append(segments, cur)
+			cur = []transcriptWord{word}
+			continue
+		}
+		if !gapActive && endsWithSeparator(word.W, separators) {
+			cur = append(cur, word)
+			segments = append(segments, cur)
+			cur = nil
+			continue
+		}
+		cur = append(cur, word)
+	}
+	if len(cur) > 0 {
+		segments = append(segments, cur)
+	}
+	return segments
+}
+
+// endsWithSeparator reports whether w's last rune, after trimming surrounding
+// whitespace, is in separators (NeMo's `word[-1] in delims or word in delims`).
+func endsWithSeparator(w string, separators []rune) bool {
+	r := []rune(strings.TrimSpace(w))
+	if len(r) == 0 {
+		return false
+	}
+	last := r[len(r)-1]
+	for _, s := range separators {
+		if last == s {
+			return true
+		}
+	}
+	return false
+}
+
+// tokensInWindow returns the ids of tokens whose timestamp t falls in
+// [start, end] (both ends inclusive): a token on a boundary shared by adjacent
+// segments lands in both, one outside every window in none. The last segment's
+// end is the last word end, so the final token is included.
+func tokensInWindow(tokens []transcriptToken, start, end float64) []int32 {
+	var ids []int32
+	for _, t := range tokens {
+		if t.T >= start && t.T <= end {
+			ids = append(ids, t.ID)
+		}
+	}
+	return ids
+}
+
+// streamSegmenter accumulates streaming words into per-utterance segments. EOU
+// is the model's own utterance boundary; each closed segment takes its start/end
+// from its first/last accumulated word. An EOU with no pending words adds nothing.
+type streamSegmenter struct {
+	segs   []*pb.TranscriptSegment
+	cur    []transcriptWord
+	nextID int32
+}
+
+func (s *streamSegmenter) add(doc streamFeedJSON) {
+	s.cur = append(s.cur, doc.Words...)
+	if doc.Eou != 0 {
+		s.flush()
+	}
+}
+
+func (s *streamSegmenter) flush() {
+	if len(s.cur) == 0 {
+		return
+	}
+	parts := make([]string, len(s.cur))
+	for i, w := range s.cur {
+		parts[i] = w.W
+	}
+	s.segs = append(s.segs, &pb.TranscriptSegment{
+		Id:    s.nextID,
+		Start: secondsToNanos(s.cur[0].Start),
+		End:   secondsToNanos(s.cur[len(s.cur)-1].End),
+		Text:  strings.TrimSpace(strings.Join(parts, " ")),
+	})
+	s.nextID++
+	s.cur = nil
+}
+
+func (s *streamSegmenter) segments() []*pb.TranscriptSegment { return s.segs }
+
 // wordsRequested reports whether the caller asked for word-level timestamps.
 // The OpenAI transcription API gates word timings behind
 // timestamp_granularities[] containing "word" and defaults to segment-level
@@ -419,6 +587,14 @@ func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.Tra
 		return err
 	}
 
+	// ABI v4: when the streaming JSON entry points are present, drive them so the
+	// per-utterance segments carry per-word start/end timestamps. Falls through to
+	// the text-only loop below against an older libparakeet.so. Runs under the
+	// engineMu already held above.
+	if CppStreamFeedJSON != nil {
+		return p.streamJSON(ctx, stream, data, duration, results)
+	}
+
 	var (
 		full     strings.Builder
 		segText  strings.Builder
@@ -495,6 +671,71 @@ func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.Tra
 	return nil
 }
 
+// streamJSON drives the ABI v4 streaming JSON entry points: each feed/finalize
+// returns a {text,eou,frame_sec,words} document. The newly-finalized text is
+// emitted as a delta (unchanged streaming contract) while words are accumulated
+// into per-utterance segments (closed on EOU) so the closing FinalResult carries
+// timestamped segments. Runs under engineMu (already held by the caller).
+func (p *ParakeetCpp) streamJSON(ctx context.Context, stream uintptr, data []float32,
+	duration float32, results chan *pb.TranscriptStreamResponse) error {
+	var (
+		full strings.Builder
+		seg  streamSegmenter
+	)
+	// consume frees the malloc'd char* (a 0 return is an error), parses the JSON,
+	// emits the delta, and routes words through the segmenter.
+	consume := func(ret uintptr) error {
+		if ret == 0 {
+			msg := CppLastError(p.ctxPtr)
+			if msg == "" {
+				msg = "unknown error"
+			}
+			return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
+		}
+		raw := goStringFromCPtr(ret)
+		CppFreeString(ret)
+		var doc streamFeedJSON
+		if err := json.Unmarshal([]byte(raw), &doc); err != nil {
+			return fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
+		}
+		if doc.Text != "" {
+			full.WriteString(doc.Text)
+			results <- &pb.TranscriptStreamResponse{Delta: doc.Text}
+		}
+		seg.add(doc)
+		return nil
+	}
+
+	for off := 0; off < len(data); off += streamChunkSamples {
+		if err := ctx.Err(); err != nil {
+			return status.Error(codes.Canceled, "transcription cancelled")
+		}
+		end := min(off+streamChunkSamples, len(data))
+		chunk := data[off:end]
+		if err := consume(CppStreamFeedJSON(stream, chunk, int32(len(chunk)))); err != nil {
+			return err
+		}
+	}
+	if err := consume(CppStreamFinalizeJSON(stream)); err != nil {
+		return err
+	}
+	seg.flush() // close any trailing utterance that never saw an EOU
+
+	text := strings.TrimSpace(full.String())
+	segments := seg.segments()
+	if len(segments) == 0 && text != "" {
+		segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
+	}
+	results <- &pb.TranscriptStreamResponse{
+		FinalResult: &pb.TranscriptResult{
+			Text:     text,
+			Segments: segments,
+			Duration: duration,
+		},
+	}
+	return nil
+}
+
 // decodeWavMono16k converts any input audio to 16 kHz mono PCM and returns the
 // float samples plus the clip duration in seconds. Mirrors the whisper
 // backend: utils.AudioToWav (ffmpeg) normalises rate/channels, go-audio
diff --git a/backend/go/parakeet-cpp/goparakeetcpp_test.go b/backend/go/parakeet-cpp/goparakeetcpp_test.go
index d72e10d05..f595ecb31 100644
--- a/backend/go/parakeet-cpp/goparakeetcpp_test.go
+++ b/backend/go/parakeet-cpp/goparakeetcpp_test.go
@@ -53,6 +53,10 @@ func ensureLibLoaded() {
 		purego.RegisterLibFunc(&CppStreamFeed, lib, "parakeet_capi_stream_feed")
 		purego.RegisterLibFunc(&CppStreamFinalize, lib, "parakeet_capi_stream_finalize")
 		purego.RegisterLibFunc(&CppStreamFree, lib, "parakeet_capi_stream_free")
+		if sym, err := purego.Dlsym(lib, "parakeet_capi_stream_feed_json"); err == nil && sym != 0 {
+			purego.RegisterLibFunc(&CppStreamFeedJSON, lib, "parakeet_capi_stream_feed_json")
+			purego.RegisterLibFunc(&CppStreamFinalizeJSON, lib, "parakeet_capi_stream_finalize_json")
+		}
 		purego.RegisterLibFunc(&CppFreeString, lib, "parakeet_capi_free_string")
 		purego.RegisterLibFunc(&CppLastError, lib, "parakeet_capi_last_error")
 	})
diff --git a/backend/go/parakeet-cpp/main.go b/backend/go/parakeet-cpp/main.go
index 23b4ec8a1..963056e23 100644
--- a/backend/go/parakeet-cpp/main.go
+++ b/backend/go/parakeet-cpp/main.go
@@ -76,6 +76,14 @@ func main() {
 		purego.RegisterLibFunc(&CppStreamBeginLang, lib, "parakeet_capi_stream_begin_lang")
 	}
 
+	// Streaming JSON entry points (ABI v4): surface per-word timestamps on the streaming
+	// path. Same probe pattern, keyed on feed_json only (finalize_json is registered with
+	// it); absent in older libparakeet.so, where the backend falls back to the text-only streaming feed.
+	if sym, err := purego.Dlsym(lib, "parakeet_capi_stream_feed_json"); err == nil && sym != 0 {
+		purego.RegisterLibFunc(&CppStreamFeedJSON, lib, "parakeet_capi_stream_feed_json")
+		purego.RegisterLibFunc(&CppStreamFinalizeJSON, lib, "parakeet_capi_stream_finalize_json")
+	}
+
 	fmt.Fprintf(os.Stderr, "[parakeet-cpp] ABI=%d\n", CppAbiVersion())
 
 	flag.Parse()
diff --git a/backend/go/parakeet-cpp/segments_test.go b/backend/go/parakeet-cpp/segments_test.go
new file mode 100644
index 000000000..147c151cc
--- /dev/null
+++ b/backend/go/parakeet-cpp/segments_test.go
@@ -0,0 +1,127 @@
+package main
+
+import (
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func tw(text string, start, end float64) transcriptWord {
+	return transcriptWord{W: text, Start: start, End: end}
+}
+
+var _ = Describe("splitWordsIntoSegments (NeMo get_segment_offsets parity)", func() {
+	seps := []rune{'.', '?', '!'}
+
+	It("splits on sentence-ending punctuation, including the delimiter word", func() {
+		words := []transcriptWord{tw("hello", 0, 0.4), tw("world.", 0.4, 0.8), tw("bye", 1.0, 1.3)}
+		segs := splitWordsIntoSegments(words, seps, 0)
+		Expect(segs).To(HaveLen(2))
+		Expect(segs[0]).To(HaveLen(2))
+		Expect(segs[0][1].W).To(Equal("world."))
+		Expect(segs[1]).To(HaveLen(1))
+		Expect(segs[1][0].W).To(Equal("bye"))
+	})
+
+	It("keeps a single segment with no terminal punctuation and gap off", func() {
+		words := []transcriptWord{tw("a", 0, 0.2), tw("b", 0.2, 0.4), tw("c", 5.0, 5.2)}
+		segs := splitWordsIntoSegments(words, seps, 0)
+		Expect(segs).To(HaveLen(1))
+	})
+
+	It("splits on the gap rule when enabled, the gapped word starting the next segment", func() {
+		words := []transcriptWord{tw("a", 0, 0.2), tw("b", 0.2, 0.4), tw("c", 5.0, 5.2)}
+		segs := splitWordsIntoSegments(words, seps, 1.0) // c is 4.6s after b
+		Expect(segs).To(HaveLen(2))
+		Expect(segs[0]).To(HaveLen(2)) // a b
+		Expect(segs[1]).To(HaveLen(1)) // c
+		Expect(segs[1][0].W).To(Equal("c"))
+	})
+
+	It("checks the gap rule before punctuation (NeMo elif order)", func() {
+		// "b." would terminate, but c is far after it -> gap closes [a b.] at b.
+		words := []transcriptWord{tw("a", 0, 0.2), tw("b.", 0.2, 0.4), tw("c", 9.0, 9.2)}
+		segs := splitWordsIntoSegments(words, seps, 1.0)
+		Expect(segs).To(HaveLen(2))
+		Expect(segs[0]).To(HaveLen(2))
+		Expect(segs[1][0].W).To(Equal("c"))
+	})
+
+	It("still splits on punctuation when the gap rule is enabled but does not fire", func() {
+		words := []transcriptWord{tw("hi.", 0, 0.4), tw("bye", 0.4, 0.8)}
+		segs := splitWordsIntoSegments(words, seps, 5.0) // gap never reached
+		Expect(segs).To(HaveLen(2))
+		Expect(segs[0][0].W).To(Equal("hi."))
+	})
+
+	It("returns nothing for empty input", func() {
+		Expect(splitWordsIntoSegments(nil, seps, 0)).To(BeEmpty())
+	})
+})
+
+var _ = Describe("transcriptResultFromDoc (multi-segment)", func() {
+	doc := transcriptJSON{
+		Text:     "hello world. bye now",
+		FrameSec: 0.08,
+		Words: []transcriptWord{
+			{W: "hello", Start: 0.0, End: 0.4},
+			{W: "world.", Start: 0.4, End: 0.8},
+			{W: "bye", Start: 1.0, End: 1.3},
+			{W: "now", Start: 1.3, End: 1.6},
+		},
+		Tokens: []transcriptToken{{ID: 1, T: 0.1}, {ID: 2, T: 0.5}, {ID: 3, T: 1.1}, {ID: 4, T: 1.4}},
+	}
+
+	It("emits one segment per punctuation-delimited group with start/end", func() {
+		res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
+		Expect(res.Segments).To(HaveLen(2))
+		Expect(res.Segments[0].Text).To(Equal("hello world."))
+		Expect(res.Segments[0].Start).To(Equal(int64(0)))
+		Expect(res.Segments[0].End).To(Equal(secondsToNanos(0.8)))
+		Expect(res.Segments[1].Text).To(Equal("bye now"))
+		Expect(res.Segments[1].Start).To(Equal(secondsToNanos(1.0)))
+		Expect(res.Segments[1].Id).To(Equal(int32(1)))
+	})
+
+	It("assigns tokens to the segment whose time window contains them", func() {
+		res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
+		Expect(res.Segments[0].Tokens).To(Equal([]int32{1, 2}))
+		Expect(res.Segments[1].Tokens).To(Equal([]int32{3, 4}))
+	})
+
+	It("attaches per-segment words only when word granularity requested", func() {
+		plain := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
+		Expect(plain.Segments[0].Words).To(BeEmpty())
+		withWords := transcriptResultFromDoc(doc, &pb.TranscriptRequest{TimestampGranularities: []string{"word"}}, 0)
+		Expect(withWords.Segments[0].Words).To(HaveLen(2))
+	})
+
+	It("falls back to a single text segment when there are no words", func() {
+		res := transcriptResultFromDoc(transcriptJSON{Text: "hi"}, &pb.TranscriptRequest{}, 0)
+		Expect(res.Segments).To(HaveLen(1))
+		Expect(res.Segments[0].Text).To(Equal("hi"))
+	})
+})
+
+var _ = Describe("streaming segment assembly", func() {
+	It("closes a segment with start/end from its words on EOU", func() {
+		acc := &streamSegmenter{}
+		acc.add(streamFeedJSON{Text: "hello world", Eou: 1, Words: []transcriptWord{
+			{W: "hello", Start: 0.0, End: 0.4}, {W: "world", Start: 0.4, End: 0.9},
+		}})
+		segs := acc.segments()
+		Expect(segs).To(HaveLen(1))
+		Expect(segs[0].Text).To(Equal("hello world"))
+		Expect(segs[0].Start).To(Equal(int64(0)))
+		Expect(segs[0].End).To(Equal(secondsToNanos(0.9)))
+	})
+
+	It("buffers words across feeds until EOU", func() {
+		acc := &streamSegmenter{}
+		acc.add(streamFeedJSON{Text: "hi", Eou: 0, Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
+		Expect(acc.segments()).To(BeEmpty())
+		acc.add(streamFeedJSON{Text: "there", Eou: 1, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
+		Expect(acc.segments()).To(HaveLen(1))
+		Expect(acc.segments()[0].Text).To(Equal("hi there"))
+	})
+})