From 071872bb536acfb04934d186a8589eb480f9bd21 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 7 Jun 2026 08:47:12 +0000 Subject: [PATCH] feat(parakeet-cpp): real segment timestamps (NeMo-faithful) Offline: replace the single synthetic whole-clip segment with multiple segments grouped exactly like NeMo's get_segment_offsets - a new segment after sentence-ending punctuation ('. ? !'), each carrying start/end and its time-window token ids. The optional model option segment_gap_threshold (NeMo's unit: encoder FRAMES, default 0=off) adds NeMo's silence-gap split, converted to seconds via the JSON frame_sec the engine now reports. Per-segment words are still gated behind timestamp_granularities=["word"]; a zero-word document falls back to a single text segment. Streaming: when libparakeet.so exposes the ABI v4 JSON entry points (probed), drive parakeet_capi_stream_feed_json / _finalize_json and accumulate the streamed per-word timestamps into per-utterance segments (EOU stays the boundary), so streaming FinalResult segments now carry start/end. Falls back to the text-only feed against an older library. Pure-Go specs cover splitWordsIntoSegments (punctuation + gap rules, NeMo elif order, fallback), transcriptResultFromDoc (multi-segment, token windows, word-granularity gate), and the streaming segmenter. 
Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto --- backend/go/parakeet-cpp/goparakeetcpp.go | 287 ++++++++++++++++-- backend/go/parakeet-cpp/goparakeetcpp_test.go | 4 + backend/go/parakeet-cpp/main.go | 8 + backend/go/parakeet-cpp/segments_test.go | 127 ++++++++ 4 files changed, 403 insertions(+), 23 deletions(-) create mode 100644 backend/go/parakeet-cpp/segments_test.go diff --git a/backend/go/parakeet-cpp/goparakeetcpp.go b/backend/go/parakeet-cpp/goparakeetcpp.go index 4821b9c40..a65b4bca0 100644 --- a/backend/go/parakeet-cpp/goparakeetcpp.go +++ b/backend/go/parakeet-cpp/goparakeetcpp.go @@ -67,6 +67,13 @@ var ( // plus a trailing target_lang ("" means the model default). Present only in // newer libparakeet.so; nil falls back to CppStreamBegin. CppStreamBeginLang func(ctx uintptr, targetLang string) uintptr + + // Streaming JSON variants (ABI v4): feed/finalize returning a malloc'd char* + // JSON document {text,eou,frame_sec,words} (uintptr, freed via CppFreeString) + // so streaming segments can carry per-word timestamps. Present only in newer + // libparakeet.so; nil falls back to the text-only CppStreamFeed/Finalize path. + CppStreamFeedJSON func(s uintptr, pcm []float32, nSamples int32) uintptr + CppStreamFinalizeJSON func(s uintptr) uintptr ) // streamChunkSamples is how much 16 kHz mono PCM we hand to stream_feed per @@ -84,9 +91,26 @@ const streamChunkSamples = 16000 // // "start"/"end"/"t" are seconds; "conf" is confidence in (0,1]. 
type transcriptJSON struct { - Text string `json:"text"` - Words []transcriptWord `json:"words"` - Tokens []transcriptToken `json:"tokens"` + Text string `json:"text"` + FrameSec float64 `json:"frame_sec"` + Words []transcriptWord `json:"words"` + Tokens []transcriptToken `json:"tokens"` +} + +// streamFeedJSON mirrors the document returned by +// parakeet_capi_stream_feed_json / parakeet_capi_stream_finalize_json (ABI v4): +// +// {"text":"...","eou":0,"frame_sec":0.080000, +// "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]} +// +// "text" is the newly-finalized text since the last call; "eou" is 1 when an +// end-of-utterance (EOU) boundary fired this feed; "words" are the words finalized this call with +// absolute (stream-relative) start/end seconds. +type streamFeedJSON struct { + Text string `json:"text"` + Eou int `json:"eou"` + FrameSec float64 `json:"frame_sec"` + Words []transcriptWord `json:"words"` } type transcriptWord struct { @@ -115,6 +139,10 @@ type ParakeetCpp struct { engineMu sync.Mutex // sole guard of the one C engine (dispatcher + streaming) bat *batcher batStop chan struct{} + // segmentGapFrames is NeMo's segment_gap_threshold in ENCODER FRAMES (model + // YAML option, default 0=off). When >0 it adds NeMo's silence-gap split on + // top of the punctuation split; converted to seconds via the JSON frame_sec. + segmentGapFrames int } // Load is the LocalAI gRPC entry point for LoadModel: it calls @@ -144,6 +172,11 @@ func (p *ParakeetCpp) Load(opts *pb.ModelOptions) error { if maxWaitMs < 0 { maxWaitMs = 0 } + + // NeMo's segment_gap_threshold (encoder frames, default 0=off). Off by + // default matches NeMo's default (punctuation-only segments); when set it + // additionally splits segments on inter-word silence (see transcriptResultFromDoc).
+ p.segmentGapFrames = optInt(opts, "segment_gap_threshold", 0) if CppTranscribePcmBatchJSON != nil { p.batStop = make(chan struct{}) p.bat = newBatcher(maxSize, time.Duration(maxWaitMs)*time.Millisecond, p.runBatch) @@ -283,7 +316,7 @@ func (p *ParakeetCpp) AudioTranscription(ctx context.Context, opts *pb.Transcrip if err := json.Unmarshal([]byte(raw), &doc); err != nil { return pb.TranscriptResult{}, fmt.Errorf("parakeet-cpp: decode transcript json: %w", err) } - return transcriptResultFromDoc(doc, opts), nil + return transcriptResultFromDoc(doc, opts, p.segmentGapFrames), nil } // Batched path: decode to PCM, submit to the batcher, wait for this request's @@ -312,34 +345,169 @@ func (p *ParakeetCpp) AudioTranscription(ctx context.Context, opts *pb.Transcrip if err := json.Unmarshal([]byte(res.json), &doc); err != nil { return pb.TranscriptResult{}, fmt.Errorf("parakeet-cpp: decode transcript json: %w", err) } - return transcriptResultFromDoc(doc, opts), nil + return transcriptResultFromDoc(doc, opts, p.segmentGapFrames), nil } +// segmentSeparators is NeMo's default segment_seperators (sentence-ending +// punctuation). Splitting on these matches NeMo's default segment timestamps. +var segmentSeparators = []rune{'.', '?', '!'} + // transcriptResultFromDoc maps a decoded transcriptJSON to a TranscriptResult, -// synthesising a single whole-clip segment and attaching word timings only when -// the caller requested word granularity. Shared by the batched and direct paths. -func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest) pb.TranscriptResult { +// grouping words into NeMo-faithful segments (see splitWordsIntoSegments). The +// optional gapFrames (NeMo's segment_gap_threshold, in encoder FRAMES; 0=off) +// additionally splits on inter-word silence; it is converted to a seconds gap +// with the document's frame_sec. 
Per-segment word timings are attached only when +// the caller requested word granularity; token ids populate each segment's +// Tokens by time-window membership. Shared by the batched and direct paths. +func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gapFrames int) pb.TranscriptResult { text := strings.TrimSpace(doc.Text) - words := make([]*pb.TranscriptWord, 0, len(doc.Words)) - for _, w := range doc.Words { - words = append(words, &pb.TranscriptWord{Start: secondsToNanos(w.Start), End: secondsToNanos(w.End), Text: w.W}) + + // Frame-unit gap threshold -> seconds (NeMo segment_gap_threshold). 0 = off. + gapSeconds := 0.0 + if gapFrames > 0 { + if doc.FrameSec > 0 { + gapSeconds = float64(gapFrames) * doc.FrameSec + } else { + xlog.Warn("parakeet-cpp: segment_gap_threshold set but libparakeet.so " + + "did not report frame_sec; falling back to punctuation-only segments") + } } - tokens := make([]int32, 0, len(doc.Tokens)) - for _, t := range doc.Tokens { - tokens = append(tokens, t.ID) + + groups := splitWordsIntoSegments(doc.Words, segmentSeparators, gapSeconds) + if len(groups) == 0 { + // No words (edge case): single whole-clip text segment. 
+ return pb.TranscriptResult{ + Text: text, + Segments: []*pb.TranscriptSegment{{Id: 0, Text: text}}, + } } - var segStart, segEnd int64 - if len(words) > 0 { - segStart = words[0].Start - segEnd = words[len(words)-1].End + + wantWords := wordsRequested(opts.TimestampGranularities) + segments := make([]*pb.TranscriptSegment, 0, len(groups)) + for id, group := range groups { + parts := make([]string, len(group)) + for i, gw := range group { + parts[i] = gw.W + } + seg := &pb.TranscriptSegment{ + Id: int32(id), + Start: secondsToNanos(group[0].Start), + End: secondsToNanos(group[len(group)-1].End), + Text: strings.TrimSpace(strings.Join(parts, " ")), + Tokens: tokensInWindow(doc.Tokens, group[0].Start, group[len(group)-1].End), + } + if wantWords { + ws := make([]*pb.TranscriptWord, len(group)) + for i, gw := range group { + ws[i] = &pb.TranscriptWord{Start: secondsToNanos(gw.Start), End: secondsToNanos(gw.End), Text: gw.W} + } + seg.Words = ws + } + segments = append(segments, seg) } - seg := &pb.TranscriptSegment{Id: 0, Start: segStart, End: segEnd, Text: text, Tokens: tokens} - if wordsRequested(opts.TimestampGranularities) { - seg.Words = words - } - return pb.TranscriptResult{Text: text, Segments: []*pb.TranscriptSegment{seg}} + return pb.TranscriptResult{Text: text, Segments: segments} } +// splitWordsIntoSegments groups words into segments exactly as NeMo's +// get_segment_offsets does (nemo/collections/asr/parts/utils/timestamp_utils.py). +// Walking the words, it closes a segment when (1) the gap rule is enabled +// (gapSeconds > 0) and the segment already has words and the gap from the +// previous word's end to this word's start is >= gapSeconds - the current word +// then STARTS a new segment - or, checked only when rule (1) was not evaluated at all +// (gap rule disabled or segment still empty; NeMo's elif), (2) the word ends with (or is) a separator, which closes the +// segment INCLUDING that word. Trailing words flush into a final segment.
+// gapSeconds <= 0 disables the gap rule, matching NeMo's default +// segment_gap_threshold=None (punctuation-only segments). +func splitWordsIntoSegments(words []transcriptWord, separators []rune, gapSeconds float64) [][]transcriptWord { + var segments [][]transcriptWord + var cur []transcriptWord + for i, word := range words { + gapActive := gapSeconds > 0 && len(cur) > 0 + if gapActive && (word.Start-words[i-1].End) >= gapSeconds { + segments = append(segments, cur) + cur = []transcriptWord{word} + continue + } + if !gapActive && endsWithSeparator(word.W, separators) { + cur = append(cur, word) + segments = append(segments, cur) + cur = nil + continue + } + cur = append(cur, word) + } + if len(cur) > 0 { + segments = append(segments, cur) + } + return segments +} + +// endsWithSeparator reports whether w's last rune is in separators (matching +// NeMo's `word[-1] in delims or word in delims`). +func endsWithSeparator(w string, separators []rune) bool { + r := []rune(strings.TrimSpace(w)) + if len(r) == 0 { + return false + } + last := r[len(r)-1] + for _, s := range separators { + if last == s { + return true + } + } + return false +} + +// tokensInWindow returns the ids of tokens whose timestamp t falls in +// [start, end] (inclusive), assigning each token to the segment that spans its +// time. The last segment's end is the last word end, so the final token is +// included. +func tokensInWindow(tokens []transcriptToken, start, end float64) []int32 { + var ids []int32 + for _, t := range tokens { + if t.T >= start && t.T <= end { + ids = append(ids, t.ID) + } + } + return ids +} + +// streamSegmenter accumulates streaming words into per-utterance segments. EOU +// is the model's own utterance boundary; each closed segment takes its start/end +// from its first/last accumulated word. 
+type streamSegmenter struct { + segs []*pb.TranscriptSegment + cur []transcriptWord + nextID int32 +} + +func (s *streamSegmenter) add(doc streamFeedJSON) { + s.cur = append(s.cur, doc.Words...) + if doc.Eou != 0 { + s.flush() + } +} + +func (s *streamSegmenter) flush() { + if len(s.cur) == 0 { + return + } + parts := make([]string, len(s.cur)) + for i, w := range s.cur { + parts[i] = w.W + } + s.segs = append(s.segs, &pb.TranscriptSegment{ + Id: s.nextID, + Start: secondsToNanos(s.cur[0].Start), + End: secondsToNanos(s.cur[len(s.cur)-1].End), + Text: strings.TrimSpace(strings.Join(parts, " ")), + }) + s.nextID++ + s.cur = nil +} + +func (s *streamSegmenter) segments() []*pb.TranscriptSegment { return s.segs } + // wordsRequested reports whether the caller asked for word-level timestamps. // The OpenAI transcription API gates word timings behind // timestamp_granularities[] containing "word" and defaults to segment-level @@ -419,6 +587,14 @@ func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.Tra return err } + // ABI v4: when the streaming JSON entry points are present, drive them so the + // per-utterance segments carry per-word start/end timestamps. Falls through to + // the text-only loop below against an older libparakeet.so. Runs under the + // engineMu already held above. + if CppStreamFeedJSON != nil { + return p.streamJSON(ctx, stream, data, duration, results) + } + var ( full strings.Builder segText strings.Builder @@ -495,6 +671,71 @@ func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.Tra return nil } +// streamJSON drives the ABI v4 streaming JSON entry points: each feed/finalize +// returns a {text,eou,frame_sec,words} document. The newly-finalized text is +// emitted as a delta (unchanged streaming contract) while words are accumulated +// into per-utterance segments (closed on EOU) so the closing FinalResult carries +// timestamped segments. Runs under engineMu (already held by the caller). 
+func (p *ParakeetCpp) streamJSON(ctx context.Context, stream uintptr, data []float32, + duration float32, results chan *pb.TranscriptStreamResponse) error { + var ( + full strings.Builder + seg streamSegmenter + ) + // consume frees the malloc'd char* (a 0 return is an error), parses the JSON, + // emits the delta, and routes words through the segmenter. + consume := func(ret uintptr) error { + if ret == 0 { + msg := CppLastError(p.ctxPtr) + if msg == "" { + msg = "unknown error" + } + return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg) + } + raw := goStringFromCPtr(ret) + CppFreeString(ret) + var doc streamFeedJSON + if err := json.Unmarshal([]byte(raw), &doc); err != nil { + return fmt.Errorf("parakeet-cpp: decode stream json: %w", err) + } + if doc.Text != "" { + full.WriteString(doc.Text) + results <- &pb.TranscriptStreamResponse{Delta: doc.Text} + } + seg.add(doc) + return nil + } + + for off := 0; off < len(data); off += streamChunkSamples { + if err := ctx.Err(); err != nil { + return status.Error(codes.Canceled, "transcription cancelled") + } + end := min(off+streamChunkSamples, len(data)) + chunk := data[off:end] + if err := consume(CppStreamFeedJSON(stream, chunk, int32(len(chunk)))); err != nil { + return err + } + } + if err := consume(CppStreamFinalizeJSON(stream)); err != nil { + return err + } + seg.flush() // close any trailing utterance that never saw an EOU + + text := strings.TrimSpace(full.String()) + segments := seg.segments() + if len(segments) == 0 && text != "" { + segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text}) + } + results <- &pb.TranscriptStreamResponse{ + FinalResult: &pb.TranscriptResult{ + Text: text, + Segments: segments, + Duration: duration, + }, + } + return nil +} + // decodeWavMono16k converts any input audio to 16 kHz mono PCM and returns the // float samples plus the clip duration in seconds. 
Mirrors the whisper // backend: utils.AudioToWav (ffmpeg) normalises rate/channels, go-audio diff --git a/backend/go/parakeet-cpp/goparakeetcpp_test.go b/backend/go/parakeet-cpp/goparakeetcpp_test.go index d72e10d05..f595ecb31 100644 --- a/backend/go/parakeet-cpp/goparakeetcpp_test.go +++ b/backend/go/parakeet-cpp/goparakeetcpp_test.go @@ -53,6 +53,10 @@ func ensureLibLoaded() { purego.RegisterLibFunc(&CppStreamFeed, lib, "parakeet_capi_stream_feed") purego.RegisterLibFunc(&CppStreamFinalize, lib, "parakeet_capi_stream_finalize") purego.RegisterLibFunc(&CppStreamFree, lib, "parakeet_capi_stream_free") + if sym, err := purego.Dlsym(lib, "parakeet_capi_stream_feed_json"); err == nil && sym != 0 { + purego.RegisterLibFunc(&CppStreamFeedJSON, lib, "parakeet_capi_stream_feed_json") + purego.RegisterLibFunc(&CppStreamFinalizeJSON, lib, "parakeet_capi_stream_finalize_json") + } purego.RegisterLibFunc(&CppFreeString, lib, "parakeet_capi_free_string") purego.RegisterLibFunc(&CppLastError, lib, "parakeet_capi_last_error") }) diff --git a/backend/go/parakeet-cpp/main.go b/backend/go/parakeet-cpp/main.go index 23b4ec8a1..963056e23 100644 --- a/backend/go/parakeet-cpp/main.go +++ b/backend/go/parakeet-cpp/main.go @@ -76,6 +76,14 @@ func main() { purego.RegisterLibFunc(&CppStreamBeginLang, lib, "parakeet_capi_stream_begin_lang") } + // Streaming JSON entry points (ABI v4): surface per-word timestamps on the + // streaming path. Same probe pattern; absent in older libparakeet.so, where + // the backend falls back to the text-only streaming feed. 
+ if sym, err := purego.Dlsym(lib, "parakeet_capi_stream_feed_json"); err == nil && sym != 0 { + purego.RegisterLibFunc(&CppStreamFeedJSON, lib, "parakeet_capi_stream_feed_json") + purego.RegisterLibFunc(&CppStreamFinalizeJSON, lib, "parakeet_capi_stream_finalize_json") + } + fmt.Fprintf(os.Stderr, "[parakeet-cpp] ABI=%d\n", CppAbiVersion()) flag.Parse() diff --git a/backend/go/parakeet-cpp/segments_test.go b/backend/go/parakeet-cpp/segments_test.go new file mode 100644 index 000000000..147c151cc --- /dev/null +++ b/backend/go/parakeet-cpp/segments_test.go @@ -0,0 +1,127 @@ +package main + +import ( + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func tw(text string, start, end float64) transcriptWord { + return transcriptWord{W: text, Start: start, End: end} +} + +var _ = Describe("splitWordsIntoSegments (NeMo get_segment_offsets parity)", func() { + seps := []rune{'.', '?', '!'} + + It("splits on sentence-ending punctuation, including the delimiter word", func() { + words := []transcriptWord{tw("hello", 0, 0.4), tw("world.", 0.4, 0.8), tw("bye", 1.0, 1.3)} + segs := splitWordsIntoSegments(words, seps, 0) + Expect(segs).To(HaveLen(2)) + Expect(segs[0]).To(HaveLen(2)) + Expect(segs[0][1].W).To(Equal("world.")) + Expect(segs[1]).To(HaveLen(1)) + Expect(segs[1][0].W).To(Equal("bye")) + }) + + It("keeps a single segment with no terminal punctuation and gap off", func() { + words := []transcriptWord{tw("a", 0, 0.2), tw("b", 0.2, 0.4), tw("c", 5.0, 5.2)} + segs := splitWordsIntoSegments(words, seps, 0) + Expect(segs).To(HaveLen(1)) + }) + + It("splits on the gap rule when enabled, the gapped word starting the next segment", func() { + words := []transcriptWord{tw("a", 0, 0.2), tw("b", 0.2, 0.4), tw("c", 5.0, 5.2)} + segs := splitWordsIntoSegments(words, seps, 1.0) // c is 4.6s after b + Expect(segs).To(HaveLen(2)) + Expect(segs[0]).To(HaveLen(2)) // a b + Expect(segs[1]).To(HaveLen(1)) // c + 
Expect(segs[1][0].W).To(Equal("c")) + }) + + It("checks the gap rule before punctuation (NeMo elif order)", func() { + // "b." would terminate, but c is far after it -> gap closes [a b.] at b. + words := []transcriptWord{tw("a", 0, 0.2), tw("b.", 0.2, 0.4), tw("c", 9.0, 9.2)} + segs := splitWordsIntoSegments(words, seps, 1.0) + Expect(segs).To(HaveLen(2)) + Expect(segs[0]).To(HaveLen(2)) + Expect(segs[1][0].W).To(Equal("c")) + }) + + It("still splits on punctuation when the gap rule is enabled but does not fire", func() { + words := []transcriptWord{tw("hi.", 0, 0.4), tw("bye", 0.4, 0.8)} + segs := splitWordsIntoSegments(words, seps, 5.0) // gap never reached + Expect(segs).To(HaveLen(2)) + Expect(segs[0][0].W).To(Equal("hi.")) + }) + + It("returns nothing for empty input", func() { + Expect(splitWordsIntoSegments(nil, seps, 0)).To(BeEmpty()) + }) +}) + +var _ = Describe("transcriptResultFromDoc (multi-segment)", func() { + doc := transcriptJSON{ + Text: "hello world. bye now", + FrameSec: 0.08, + Words: []transcriptWord{ + {W: "hello", Start: 0.0, End: 0.4}, + {W: "world.", Start: 0.4, End: 0.8}, + {W: "bye", Start: 1.0, End: 1.3}, + {W: "now", Start: 1.3, End: 1.6}, + }, + Tokens: []transcriptToken{{ID: 1, T: 0.1}, {ID: 2, T: 0.5}, {ID: 3, T: 1.1}, {ID: 4, T: 1.4}}, + } + + It("emits one segment per punctuation-delimited group with start/end", func() { + res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0) + Expect(res.Segments).To(HaveLen(2)) + Expect(res.Segments[0].Text).To(Equal("hello world.")) + Expect(res.Segments[0].Start).To(Equal(int64(0))) + Expect(res.Segments[0].End).To(Equal(secondsToNanos(0.8))) + Expect(res.Segments[1].Text).To(Equal("bye now")) + Expect(res.Segments[1].Start).To(Equal(secondsToNanos(1.0))) + Expect(res.Segments[1].Id).To(Equal(int32(1))) + }) + + It("assigns tokens to the segment whose time window contains them", func() { + res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0) + 
Expect(res.Segments[0].Tokens).To(Equal([]int32{1, 2})) + Expect(res.Segments[1].Tokens).To(Equal([]int32{3, 4})) + }) + + It("attaches per-segment words only when word granularity requested", func() { + plain := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0) + Expect(plain.Segments[0].Words).To(BeEmpty()) + withWords := transcriptResultFromDoc(doc, &pb.TranscriptRequest{TimestampGranularities: []string{"word"}}, 0) + Expect(withWords.Segments[0].Words).To(HaveLen(2)) + }) + + It("falls back to a single text segment when there are no words", func() { + res := transcriptResultFromDoc(transcriptJSON{Text: "hi"}, &pb.TranscriptRequest{}, 0) + Expect(res.Segments).To(HaveLen(1)) + Expect(res.Segments[0].Text).To(Equal("hi")) + }) +}) + +var _ = Describe("streaming segment assembly", func() { + It("closes a segment with start/end from its words on EOU", func() { + acc := &streamSegmenter{} + acc.add(streamFeedJSON{Text: "hello world", Eou: 1, Words: []transcriptWord{ + {W: "hello", Start: 0.0, End: 0.4}, {W: "world", Start: 0.4, End: 0.9}, + }}) + segs := acc.segments() + Expect(segs).To(HaveLen(1)) + Expect(segs[0].Text).To(Equal("hello world")) + Expect(segs[0].Start).To(Equal(int64(0))) + Expect(segs[0].End).To(Equal(secondsToNanos(0.9))) + }) + + It("buffers words across feeds until EOU", func() { + acc := &streamSegmenter{} + acc.add(streamFeedJSON{Text: "hi", Eou: 0, Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}}) + Expect(acc.segments()).To(BeEmpty()) + acc.add(streamFeedJSON{Text: "there", Eou: 1, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}}) + Expect(acc.segments()).To(HaveLen(1)) + Expect(acc.segments()[0].Text).To(Equal("hi there")) + }) +})