diff --git a/backend/go/parakeet-cpp/Makefile b/backend/go/parakeet-cpp/Makefile index eea251cb1..83ddb27c0 100644 --- a/backend/go/parakeet-cpp/Makefile +++ b/backend/go/parakeet-cpp/Makefile @@ -1,6 +1,6 @@ # parakeet-cpp backend Makefile. # -# Upstream pin lives below as PARAKEET_VERSION?=9db92be63179a27201d3b88d5d40c545b2ac48ae +# Upstream pin lives below as PARAKEET_VERSION?=b8012f11e5269126eddb7f4fd02f891a2ccc29b0 # (.github/bump_deps.sh) can find and update it - matches the # whisper.cpp / ds4 / vibevoice-cpp convention. # @@ -15,7 +15,7 @@ # That's what the L0 smoke test uses. The default target below does the # proper clone-at-pin + cmake build so CI doesn't need a side-checkout. -PARAKEET_VERSION?=9db92be63179a27201d3b88d5d40c545b2ac48ae +PARAKEET_VERSION?=b8012f11e5269126eddb7f4fd02f891a2ccc29b0 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp GOCMD?=go diff --git a/backend/go/parakeet-cpp/goparakeetcpp.go b/backend/go/parakeet-cpp/goparakeetcpp.go index a65b4bca0..e87409255 100644 --- a/backend/go/parakeet-cpp/goparakeetcpp.go +++ b/backend/go/parakeet-cpp/goparakeetcpp.go @@ -98,17 +98,21 @@ type transcriptJSON struct { } // streamFeedJSON mirrors the document returned by -// parakeet_capi_stream_feed_json / parakeet_capi_stream_finalize_json (ABI v4): +// parakeet_capi_stream_feed_json / parakeet_capi_stream_finalize_json (ABI v5): // -// {"text":"...","eou":0,"frame_sec":0.080000, +// {"text":"...","eou":0,"eob":0,"frame_sec":0.080000, // "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]} // // "text" is the newly-finalized text since the last call; "eou" is 1 when an -// <EOU>/<EOB> fired this feed; "words" are the words finalized this call with -// absolute (stream-relative) start/end seconds. +// <EOU> (end of utterance) fired this feed and "eob" is 1 when an +// <EOB> (backchannel) fired. ABI v4 conflated the two into "eou"; v5 split them, so +// we read both and treat either as an utterance boundary for segmentation. 
+// "words" are the words finalized this call with absolute (stream-relative) +// start/end seconds. type streamFeedJSON struct { Text string `json:"text"` Eou int `json:"eou"` + Eob int `json:"eob"` FrameSec float64 `json:"frame_sec"` Words []transcriptWord `json:"words"` } @@ -483,7 +487,10 @@ type streamSegmenter struct { func (s *streamSegmenter) add(doc streamFeedJSON) { s.cur = append(s.cur, doc.Words...) - if doc.Eou != 0 { + // Close the segment on either turn signal: <EOU> (end of utterance) or + // <EOB> (backchannel). ABI v4 reported both via "eou"; v5 split them, so we + // OR them here to keep the v4 segmentation boundaries. + if doc.Eou != 0 || doc.Eob != 0 { s.flush() } } @@ -671,11 +678,12 @@ func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.Tra return nil } -// streamJSON drives the ABI v4 streaming JSON entry points: each feed/finalize -// returns a {text,eou,frame_sec,words} document. The newly-finalized text is -// emitted as a delta (unchanged streaming contract) while words are accumulated -// into per-utterance segments (closed on EOU) so the closing FinalResult carries -// timestamped segments. Runs under engineMu (already held by the caller). +// streamJSON drives the streaming JSON entry points (present since ABI v4): each +// feed/finalize returns a {text,eou,eob,frame_sec,words} document. The +// newly-finalized text is emitted as a delta (unchanged streaming contract) +// while words are accumulated into per-utterance segments (closed on <EOU> or +// <EOB>) so the closing FinalResult carries timestamped segments. Runs under +// engineMu (already held by the caller). 
func (p *ParakeetCpp) streamJSON(ctx context.Context, stream uintptr, data []float32, duration float32, results chan *pb.TranscriptStreamResponse) error { var ( diff --git a/backend/go/parakeet-cpp/segments_test.go b/backend/go/parakeet-cpp/segments_test.go index 147c151cc..9d8e9f8d5 100644 --- a/backend/go/parakeet-cpp/segments_test.go +++ b/backend/go/parakeet-cpp/segments_test.go @@ -124,4 +124,17 @@ var _ = Describe("streaming segment assembly", func() { Expect(acc.segments()).To(HaveLen(1)) Expect(acc.segments()[0].Text).To(Equal("hi there")) }) + + // ABI v5 split <EOB> (backchannel) out of the "eou" flag into its own "eob" + // field; a backchannel must still close the segment as it did in v4. + It("closes a segment on EOB (backchannel) too", func() { + acc := &streamSegmenter{} + acc.add(streamFeedJSON{Text: "uh huh", Eou: 0, Eob: 1, Words: []transcriptWord{ + {W: "uh", Start: 0.0, End: 0.2}, {W: "huh", Start: 0.2, End: 0.5}, + }}) + segs := acc.segments() + Expect(segs).To(HaveLen(1)) + Expect(segs[0].Text).To(Equal("uh huh")) + Expect(segs[0].End).To(Equal(secondsToNanos(0.5))) + }) })