diff --git a/.githooks/pre-commit b/.githooks/pre-commit index c09f68772..025ecfafd 100755 --- a/.githooks/pre-commit +++ b/.githooks/pre-commit @@ -7,8 +7,11 @@ # Runs only the checks relevant to what's staged: # - Go files -> make lint + make test-coverage-check # - core/http/react-ui -> make test-ui-coverage-check (Playwright e2e + gate) -# A commit touching neither is skipped entirely (docs/YAML/etc. can't change -# lint findings, Go coverage, or the UI). +# - realtime state machines / specs -> make test-realtime-conformance +# (respcoord/**, turncoord/**, or formal-verification/** -- a pure .fizz +# spec edit must still re-verify the design, detected separately from Go) +# A commit touching none of these is skipped entirely (other docs/YAML can't +# change lint findings, Go coverage, the UI, or the realtime conformance gate). # # To bypass for a single commit (e.g. a WIP checkpoint): git commit --no-verify set -eu @@ -20,11 +23,13 @@ staged="$(git diff --cached --name-only --diff-filter=ACMRD)" go_changed=0 ui_changed=0 +rt_changed=0 if echo "$staged" | grep -qE '\.go$'; then go_changed=1; fi if echo "$staged" | grep -qE '^core/http/react-ui/'; then ui_changed=1; fi +if echo "$staged" | grep -qE '^(core/http/endpoints/openai/(coordinator|respcoord|turncoord|conncoord|compactcoord|ttscoord)/|formal-verification/)'; then rt_changed=1; fi -if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ]; then - echo "pre-commit: no Go or React UI changes staged — skipping." +if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ] && [ "$rt_changed" -eq 0 ]; then + echo "pre-commit: no Go, React UI, or realtime-spec changes staged — skipping." exit 0 fi @@ -57,4 +62,11 @@ if [ "$ui_changed" -eq 1 ]; then make test-ui-coverage-check fi +if [ "$rt_changed" -eq 1 ]; then + echo "pre-commit ▶ realtime state-machine conformance (make test-realtime-conformance) —" + echo " Go transition/rapid tests under -race + FizzBee model check of the" + echo " authoritative specs. 
Fail-closed: needs FizzBee (make install-fizzbee)." + make test-realtime-conformance +fi + echo "pre-commit ✓ all relevant checks passed" diff --git a/.github/workflows/realtime-conformance.yml b/.github/workflows/realtime-conformance.yml new file mode 100644 index 000000000..c844a3003 --- /dev/null +++ b/.github/workflows/realtime-conformance.yml @@ -0,0 +1,69 @@ +--- +name: 'realtime-conformance' + +# Verifies the realtime state-machine implementations conform to their formal +# designs (docs/design/realtime-state-machines.md, formal-verification/). BOTH +# layers are enforced and the gate is fail-closed: the Go conformance layer +# (respcoord + turncoord transition/rapid tests under -race) AND the FizzBee model check of +# the authoritative specs. FizzBee is pinned + checksum-verified +# (formal-verification/fizzbee.sha256), so a failed install fails the job rather +# than silently skipping verification. + +on: + pull_request: + paths: + - 'core/http/endpoints/openai/coordinator/**' + - 'core/http/endpoints/openai/respcoord/**' + - 'core/http/endpoints/openai/turncoord/**' + - 'core/http/endpoints/openai/conncoord/**' + - 'core/http/endpoints/openai/compactcoord/**' + - 'core/http/endpoints/openai/ttscoord/**' + - 'formal-verification/**' + - 'scripts/realtime-conformance.sh' + - 'scripts/install-fizzbee.sh' + - '.github/workflows/realtime-conformance.yml' + push: + branches: + - master + paths: + - 'core/http/endpoints/openai/coordinator/**' + - 'core/http/endpoints/openai/respcoord/**' + - 'core/http/endpoints/openai/turncoord/**' + - 'core/http/endpoints/openai/conncoord/**' + - 'core/http/endpoints/openai/compactcoord/**' + - 'core/http/endpoints/openai/ttscoord/**' + - 'formal-verification/**' + - 'scripts/realtime-conformance.sh' + +concurrency: + group: realtime-conformance-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +jobs: + conformance: + runs-on: 
ubuntu-latest + strategy: + matrix: + go-version: ['1.26.x'] + steps: + - name: Clone + uses: actions/checkout@v7 + - name: Setup Go ${{ matrix.go-version }} + uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go-version }} + cache: false + - name: Cache FizzBee + uses: actions/cache@v4 + with: + path: .tools/fizzbee + key: fizzbee-v0.5.2-${{ runner.os }}-${{ hashFiles('formal-verification/fizzbee.sha256') }} + - name: Install FizzBee (pinned, checksum-verified) + # No `|| true`: a failed/forged download must fail the job, not silently + # drop the design verification. install-fizzbee.sh is a no-op if the + # cached binary is already present and valid. + run: ./scripts/install-fizzbee.sh + - name: Run conformance gate (fail-closed) + # No skip env: both the Go conformance and the FizzBee model check are + # required. The gate auto-detects .tools/fizzbee/fizz. + run: make test-realtime-conformance diff --git a/.gitignore b/.gitignore index 91582c006..666b81df9 100644 --- a/.gitignore +++ b/.gitignore @@ -97,3 +97,12 @@ core/http/react-ui/test-results/ # Local Apple signing material (never commit) .certs/ + +# Pinned dev tools (e.g. FizzBee for the realtime-conformance gate) +.tools/ + +# FizzBee model-check artifacts: the parser emits .json next to each +# .fizz and the checker writes run dirs under out/. Both are regenerated by +# the realtime-conformance gate; only the .fizz sources are authoritative. 
+formal-verification/*.json +formal-verification/out/ diff --git a/Makefile b/Makefile index 2a8edc3fc..3e640a3b7 100644 --- a/Makefile +++ b/Makefile @@ -405,6 +405,18 @@ test-realtime: build-mock-backend @echo 'Running realtime e2e tests (mock backend)' $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="Realtime && !real-models" --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e +# Verify the realtime state-machine implementations conform to their formal +# designs (Go transition/rapid tests under -race + FizzBee model check of the +# authoritative specs). See docs/design/realtime-state-machines.md (Part 6) and +# docs/design/specs/README.md. +test-realtime-conformance: + GOCMD=$(GOCMD) ./scripts/realtime-conformance.sh + +# Install the pinned, checksum-verified FizzBee model checker (into .tools/, +# gitignored) used by test-realtime-conformance. Idempotent; no-op if present. +install-fizzbee: + ./scripts/install-fizzbee.sh + # Container-based real-model realtime testing. Build env vars / pipeline # definition kept here so test-realtime-models-docker can drive a fully wired # pipeline (VAD + STT + LLM + TTS) from inside a containerised runner. @@ -1027,7 +1039,7 @@ test-extra-backend-whisper-transcription: docker-build-whisper ## is reachable. 
test-extra-backend-parakeet-cpp-transcription: docker-build-parakeet-cpp BACKEND_IMAGE=local-ai-backend:parakeet-cpp \ - BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/tdt_ctc-110m-f16.gguf \ + BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/realtime_eou_120m-v1-f16.gguf \ BACKEND_TEST_AUDIO_URL=https://github.com/ggml-org/whisper.cpp/raw/master/samples/jfk.wav \ BACKEND_TEST_CAPS=health,load,transcription \ $(MAKE) test-extra-backend diff --git a/backend/backend.proto b/backend/backend.proto index 2a575426e..01c5b63a7 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -18,6 +18,18 @@ service Backend { rpc GenerateVideo(GenerateVideoRequest) returns (Result) {} rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {} rpc AudioTranscriptionStream(TranscriptRequest) returns (stream TranscriptStreamResponse) {} + // AudioTranscriptionLive is the bidirectional live-microphone ASR RPC. The + // first message MUST carry a Config; subsequent messages carry Audio frames + // (mono float PCM at config.sample_rate, 16 kHz default). After a + // successful open the backend replies with a single ready ack + // (TranscriptLiveResponse{ready:true}); backends or models without + // cache-aware streaming support return UNIMPLEMENTED instead. Newly + // finalized text streams back as deltas; eou=true marks the model's + // end-of-utterance token. One stream spans many utterances (the decoder + // resets itself after each EOU). Closing the send side finalizes: the + // backend flushes the decoder tail and emits a terminal message carrying + // final_result. A second Config mid-stream resets the decode session. 
+ rpc AudioTranscriptionLive(stream TranscriptLiveRequest) returns (stream TranscriptLiveResponse) {} rpc TTS(TTSRequest) returns (Result) {} rpc TTSStream(TTSRequest) returns (stream Reply) {} rpc SoundGeneration(SoundGenerationRequest) returns (Result) {} @@ -479,6 +491,10 @@ message TranscriptResult { string text = 2; string language = 3; float duration = 4; + // True when the decode ended on the model's end-of-utterance special token + // (<EOU>, emitted by cache-aware streaming models such as + // parakeet_realtime_eou_120m-v1). The marker itself is stripped from text. + bool eou = 5; } message TranscriptStreamResponse { @@ -486,6 +502,34 @@ message TranscriptStreamResponse { TranscriptResult final_result = 2; } +// === AudioTranscriptionLive messages ===================================== + +message TranscriptLiveRequest { + oneof payload { + TranscriptLiveConfig config = 1; + TranscriptLiveAudio audio = 2; + } +} + +message TranscriptLiveConfig { + string language = 1; // "" => model default + int32 sample_rate = 2; // 0 => 16000; backends may reject others + map<string, string> params = 3; // backend-specific tuning +} + +message TranscriptLiveAudio { + repeated float pcm = 1; // mono PCM in [-1,1] at config.sample_rate +} + +message TranscriptLiveResponse { + bool ready = 1; // open ack: sent once, before any delta + string delta = 2; // newly-finalized text since previous response + bool eou = 3; // <EOU> fired during this feed (the user yielded the turn) + repeated TranscriptWord words = 4; // words finalized by this feed (stream-relative ns) + TranscriptResult final_result = 5; // terminal message only, after the send side closes + bool eob = 6; // <EOB> fired: a backchannel ("uh-huh") ended — NOT a turn boundary +} + message TranscriptWord { int64 start = 1; int64 end = 2; diff --git a/backend/go/parakeet-cpp/boundary.go b/backend/go/parakeet-cpp/boundary.go new file mode 100644 index 000000000..9c960cbc7 --- /dev/null +++ b/backend/go/parakeet-cpp/boundary.go @@ -0,0 +1,81 @@ 
+package main + +// utteranceBoundary is the single definition of a small state machine that was +// previously open-coded three times — as a bare `finalEou` bool with an ad-hoc +// toggle — in the live feed (live.go), the file-stream text path, and the +// file-stream JSON path (goparakeetcpp.go). +// +// It answers one running question: does the decode currently rest on an +// end-of-utterance boundary? That is the value a closing FinalResult reports as +// .Eou and the realtime turn detector treats as a commit point. +// +// parakeet auto-resets its decoder after every <EOU>/<EOB>, so one streaming +// session is a sequence of utterances and this is a LATCH, not a monotonic +// flag: it closes on an <EOU> and reopens as soon as the next utterance starts. +// (Contrast the realtime API's per-turn `eouSeen`, which only ever goes +// false->true because each turn gets a fresh stream. Here the stream outlives +// the turn, so the boundary status must be able to reopen.) +// +// The only transitions, over the events one streamFeedResult carries — an +// <EOU>, an <EOB> (backchannel), or plain speech output (text and/or words): +// +// <EOU> +// open ───────────► closed +// ▲ ▲ │ │ │ +// │ └─┘ <EOB>|speech │ │ <EOU> +// │ (stay open) │ └─┘ (stay closed) +// └──────────────────┘ +// <EOB>|speech +// +// open = NOT on an utterance boundary: mid-utterance, the last boundary was +// a backchannel <EOB>, or the stream just began (the initial state). +// closed = the last meaningful event was an <EOU> with no later speech: a real +// turn boundary. +// +// A feed that carries nothing (no eou/eob/text/words — e.g. a finalize flush +// that produced no tail) is a no-op and leaves the state unchanged, matching +// the legacy "leave finalEou as it was" behaviour. 
+// +// The state carries no data, so it is modelled as a two-valued type (a named +// bool) rather than an int enum: every inhabitant is legal, so illegal states +// are unrepresentable — the payload-free analog of the sealed sum types the +// realtime machines use (those need interfaces because their states carry data, +// e.g. Active{ID}, where "Active with no ID" is the illegal combination a scalar +// cannot even express). +type utteranceBoundary bool + +const ( + // boundaryOpen is the zero value (false), so a fresh decode starts open — + // exactly the legacy `var finalEou bool` (false) initial condition. + boundaryOpen utteranceBoundary = false + boundaryClosed utteranceBoundary = true +) + +// observe folds one decode increment into the latch and returns the new state. +// +// <EOU> takes priority when a single feed carries both an <EOU> and speech +// (e.g. {"text":"hello","eou":1}): the utterance both produced that text AND +// ended, so the decode rests on the boundary. This matches the legacy +// eou-checked-first ordering at every call site. +func (b utteranceBoundary) observe(r streamFeedResult) utteranceBoundary { + switch { + case r.Eou: + return boundaryClosed + case r.Eob || r.Delta != "" || len(r.Words) > 0: + return boundaryOpen + default: + return b + } +} + +// ended reports whether the decode currently rests on an end-of-utterance +// boundary (a real <EOU>, not a backchannel <EOB>). This is what a closing +// FinalResult carries as .Eou. +func (b utteranceBoundary) ended() bool { return b == boundaryClosed } + +func (b utteranceBoundary) String() string { + if b == boundaryClosed { + return "closed" + } + return "open" +} diff --git a/backend/go/parakeet-cpp/boundary_test.go b/backend/go/parakeet-cpp/boundary_test.go new file mode 100644 index 000000000..affd79bf0 --- /dev/null +++ b/backend/go/parakeet-cpp/boundary_test.go @@ -0,0 +1,92 @@ +package main + +import ( + "math/rand/v2" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("utteranceBoundary (decode end-of-utterance latch)", func() { + It("starts open: a fresh decode is not on a boundary", func() { + var b utteranceBoundary + Expect(b).To(Equal(boundaryOpen)) + Expect(b.ended()).To(BeFalse()) + }) + + DescribeTable("single feed transition from the open state", + func(r streamFeedResult, wantEnded bool) { + Expect(boundaryOpen.observe(r).ended()).To(Equal(wantEnded)) + }, + Entry(" closes it", streamFeedResult{Eou: true}, true), + Entry(" with text closes it (eou wins)", streamFeedResult{Delta: "hi", Eou: true}, true), + Entry(" stays open (backchannel is not a turn boundary)", streamFeedResult{Eob: true}, false), + Entry("plain text stays open", streamFeedResult{Delta: "hello"}, false), + Entry("words-only stays open", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false), + Entry("empty feed is a no-op (stays open)", streamFeedResult{}, false), + ) + + DescribeTable("single feed transition from the closed state", + func(r streamFeedResult, wantEnded bool) { + Expect(boundaryClosed.observe(r).ended()).To(Equal(wantEnded)) + }, + Entry("another stays closed", streamFeedResult{Eou: true}, true), + Entry("trailing speech reopens it", streamFeedResult{Delta: "and more"}, false), + Entry("words reopen it", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false), + Entry("a backchannel reopens it", streamFeedResult{Eob: true}, false), + Entry("empty feed is a no-op (stays closed)", streamFeedResult{}, true), + ) + + It("is a latch: then trailing speech reopens, then closes again", func() { + b := boundaryOpen + b = b.observe(streamFeedResult{Delta: "turn one", Eou: true}) + Expect(b.ended()).To(BeTrue()) + b = b.observe(streamFeedResult{Delta: " and more"}) + Expect(b.ended()).To(BeFalse(), "trailing speech without an EOU is an open utterance") + b = b.observe(streamFeedResult{Eou: true}) + Expect(b.ended()).To(BeTrue()) + }) + + It("treats a backchannel before a real EOU 
correctly", func() { + b := boundaryOpen + b = b.observe(streamFeedResult{Delta: "uh huh", Eob: true}) + Expect(b.ended()).To(BeFalse(), "a backchannel must not masquerade as a turn boundary") + b = b.observe(streamFeedResult{Delta: "done", Eou: true}) + Expect(b.ended()).To(BeTrue()) + }) + + It("matches the reference fold over seeded random feed sequences", func() { + // The invariant: after any sequence of feeds, ended() is true iff the + // last feed that carried ANY event was an . takes priority + // when a feed carries both an EOU and speech; empty feeds are ignored. + for seed := uint64(1); seed <= 200; seed++ { + rng := rand.New(rand.NewPCG(seed, seed*2654435761)) + b := boundaryOpen + lastWasEou := false // reference: did the last meaningful feed end on EOU? + steps := rng.IntN(30) + for i := 0; i < steps; i++ { + var r streamFeedResult + switch rng.IntN(5) { + case 0: + r = streamFeedResult{Eou: true} + case 1: + r = streamFeedResult{Eob: true} + case 2: + r = streamFeedResult{Delta: "w"} + case 3: + r = streamFeedResult{Delta: "w", Eou: true} // eou + speech, eou wins + case 4: + r = streamFeedResult{} // empty: no-op + } + b = b.observe(r) + if r.Eou { + lastWasEou = true + } else if r.Eob || r.Delta != "" || len(r.Words) > 0 { + lastWasEou = false + } + } + Expect(b.ended()).To(Equal(lastWasEou), + "seed %d: latch disagreed with the reference fold", seed) + } + }) +}) diff --git a/backend/go/parakeet-cpp/driver.go b/backend/go/parakeet-cpp/driver.go new file mode 100644 index 000000000..cf832b165 --- /dev/null +++ b/backend/go/parakeet-cpp/driver.go @@ -0,0 +1,82 @@ +package main + +import ( + "context" + + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +// streamFeedResult is one decode increment from a cache-aware streaming session: +// the newly-finalized text plus the model's own per-feed boundary tokens +// (/) and word timings. 
It is the single event type both the live +// (bidi) and file (server-stream) paths fold over, hiding the ABI v4 JSON vs +// older text-only entry-point split behind one shape. +type streamFeedResult struct { + Delta string + Eou bool + Eob bool + Words []transcriptWord +} + +// feedChunk feeds one PCM chunk to the streaming session (or finalizes it, when +// finalize is true) and returns the unified decode increment. It prefers the +// ABI v4 JSON entry points (which also carry per-word timestamps) and falls +// back to the older text-only entry points against an older libparakeet.so. +// +// This is the one place the JSON-vs-text choice is made; every consumer works +// in terms of streamFeedResult. +func (p *ParakeetCpp) feedChunk(stream uintptr, pcm []float32, finalize bool) (streamFeedResult, error) { + if CppStreamFeedJSON != nil { + doc, err := p.streamFeedDoc(stream, pcm, finalize) + if err != nil { + return streamFeedResult{}, err + } + return streamFeedResult{Delta: doc.Text, Eou: doc.Eou != 0, Eob: doc.Eob != 0, Words: doc.Words}, nil + } + delta, eou, eob, err := p.streamFeedText(stream, pcm, finalize) + if err != nil { + return streamFeedResult{}, err + } + return streamFeedResult{Delta: delta, Eou: eou, Eob: eob}, nil +} + +// feedSlices feeds pcm through the session in streamChunkSamples slices, +// invoking onFeed for each decode increment. It does NOT finalize: callers +// decide when the send side is done. The file path finalizes after the whole +// file; the live path finalizes only when its request channel closes, never +// between audio messages. Slicing keeps each per-call engineMu hold short so +// concurrent unary transcription interleaves fairly (the C session buffers +// internally). +// +// If ctx is non-nil it is checked before each slice so a cancelled file +// transcription stops promptly; the live path passes nil (it is bounded by its +// request channel instead of a ctx). 
+func (p *ParakeetCpp) feedSlices(ctx context.Context, stream uintptr, pcm []float32, onFeed func(streamFeedResult) error) error { + for off := 0; off < len(pcm); off += streamChunkSamples { + if ctx != nil { + if err := ctx.Err(); err != nil { + return status.Error(codes.Canceled, "transcription cancelled") + } + } + end := min(off+streamChunkSamples, len(pcm)) + res, err := p.feedChunk(stream, pcm[off:end], false) + if err != nil { + return err + } + if err := onFeed(res); err != nil { + return err + } + } + return nil +} + +// flushTail finalizes the session once and folds the flushed tail (the last +// ~2 encoder frames of text, which only appear on finalize) through onFeed. +func (p *ParakeetCpp) flushTail(stream uintptr, onFeed func(streamFeedResult) error) error { + res, err := p.feedChunk(stream, nil, true) + if err != nil { + return err + } + return onFeed(res) +} diff --git a/backend/go/parakeet-cpp/goparakeetcpp.go b/backend/go/parakeet-cpp/goparakeetcpp.go index e87409255..5e023e927 100644 --- a/backend/go/parakeet-cpp/goparakeetcpp.go +++ b/backend/go/parakeet-cpp/goparakeetcpp.go @@ -103,12 +103,13 @@ type transcriptJSON struct { // {"text":"...","eou":0,"eob":0,"frame_sec":0.080000, // "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]} // -// "text" is the newly-finalized text since the last call; "eou" is 1 when an -// <EOU> (end of utterance) fired this feed and "eob" is 1 when an <EOB> -// (backchannel) fired. ABI v4 conflated the two into "eou"; v5 split them, so -// we read both and treat either as an utterance boundary for segmentation. -// "words" are the words finalized this call with absolute (stream-relative) -// start/end seconds. +// "text" is the newly-finalized text since the last call. Under ABI v5 "eou" +// is 1 iff an <EOU> fired this feed (the user yielded the turn) and "eob" 1 +// iff an <EOB> fired (a backchannel like "uh-huh" ended — NOT a turn +// boundary). 
A v4 library has no "eob" field and its "eou" conflates both +// tokens: Eob stays 0 and Eou keeps the old any-event meaning. "words" are +// the words finalized this call with absolute (stream-relative) start/end +// seconds. type streamFeedJSON struct { Text string `json:"text"` Eou int `json:"eou"` @@ -364,7 +365,7 @@ var segmentSeparators = []rune{'.', '?', '!'} // the caller requested word granularity; token ids populate each segment's // Tokens by time-window membership. Shared by the batched and direct paths. func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gapFrames int) pb.TranscriptResult { - text := strings.TrimSpace(doc.Text) + text, eou := stripEouMarker(strings.TrimSpace(doc.Text)) // Frame-unit gap threshold -> seconds (NeMo segment_gap_threshold). 0 = off. gapSeconds := 0.0 @@ -383,6 +384,7 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap return pb.TranscriptResult{ Text: text, Segments: []*pb.TranscriptSegment{{Id: 0, Text: text}}, + Eou: eou, } } @@ -409,7 +411,25 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap } segments = append(segments, seg) } - return pb.TranscriptResult{Text: text, Segments: segments} + return pb.TranscriptResult{Text: text, Segments: segments, Eou: eou} +} + +// stripEouMarker removes a trailing literal <EOU>/<EOB> from offline-decode +// text and reports whether the decode ended on an end-of-UTTERANCE token. The +// realtime EOU model's offline decode keeps the special token in the +// detokenized text (the streaming path strips it and surfaces it as flags +// instead); user-visible transcripts must never carry either marker, but only +// <EOU> may confirm the semantic_vad retranscribe cross-check — a decode +// ending on <EOB> means the last thing heard was a backchannel, not the user +// yielding the turn. 
+func stripEouMarker(text string) (string, bool) { + if strings.HasSuffix(text, "<EOU>") { + return strings.TrimSpace(strings.TrimSuffix(text, "<EOU>")), true + } + if strings.HasSuffix(text, "<EOB>") { + return strings.TrimSpace(strings.TrimSuffix(text, "<EOB>")), false + } + return text, false } // splitWordsIntoSegments groups words into segments exactly as NeMo's @@ -476,41 +496,55 @@ func tokensInWindow(tokens []transcriptToken, start, end float64) []int32 { return ids } -// streamSegmenter accumulates streaming words into per-utterance segments. EOU -// is the model's own utterance boundary; each closed segment takes its start/end -// from its first/last accumulated word. +// streamSegmenter accumulates streaming decode increments into per-utterance +// segments. <EOU>/<EOB> are the model's own utterance boundaries; each closes a +// segment. When the feed carries per-word timings (ABI v4 JSON), a closed +// segment takes its start/end from its first/last word; against an older +// text-only library (no words) it falls back to segmenting the delta text, so +// the same assembler serves both paths. type streamSegmenter struct { - segs []*pb.TranscriptSegment - cur []transcriptWord - nextID int32 + segs []*pb.TranscriptSegment + cur []transcriptWord // words for the open segment (ABI v4 JSON path) + curText []string // delta text for the open segment (text-only path) + nextID int32 } -func (s *streamSegmenter) add(doc streamFeedJSON) { - s.cur = append(s.cur, doc.Words...) - // Close the segment on either turn signal: <EOU> (end of utterance) or - // <EOB> (backchannel). ABI v4 reported both via "eou"; v5 split them, so we - // OR them here to keep the v4 segmentation boundaries. - if doc.Eou != 0 || doc.Eob != 0 { +func (s *streamSegmenter) add(r streamFeedResult) { + s.cur = append(s.cur, r.Words...) + if len(r.Words) == 0 && r.Delta != "" { + // Older libparakeet.so with no per-word timing: segment from the text. 
+ s.curText = append(s.curText, r.Delta) + } + // Both and reset the decoder, so both close a segment. + if r.Eou || r.Eob { s.flush() } } func (s *streamSegmenter) flush() { - if len(s.cur) == 0 { - return + switch { + case len(s.cur) > 0: + parts := make([]string, len(s.cur)) + for i, w := range s.cur { + parts[i] = w.W + } + s.segs = append(s.segs, &pb.TranscriptSegment{ + Id: s.nextID, + Start: secondsToNanos(s.cur[0].Start), + End: secondsToNanos(s.cur[len(s.cur)-1].End), + Text: strings.TrimSpace(strings.Join(parts, " ")), + }) + s.nextID++ + case len(s.curText) > 0: + // No words this segment: emit a text-only segment (no timestamps), + // skipping a purely-whitespace one as the legacy text path did. + if t := strings.TrimSpace(strings.Join(s.curText, "")); t != "" { + s.segs = append(s.segs, &pb.TranscriptSegment{Id: s.nextID, Text: t}) + s.nextID++ + } } - parts := make([]string, len(s.cur)) - for i, w := range s.cur { - parts[i] = w.W - } - s.segs = append(s.segs, &pb.TranscriptSegment{ - Id: s.nextID, - Start: secondsToNanos(s.cur[0].Start), - End: secondsToNanos(s.cur[len(s.cur)-1].End), - Text: strings.TrimSpace(strings.Join(parts, " ")), - }) - s.nextID++ s.cur = nil + s.curText = nil } func (s *streamSegmenter) segments() []*pb.TranscriptSegment { return s.segs } @@ -535,18 +569,119 @@ func secondsToNanos(sec float64) int64 { return int64(sec * 1e9) } +// Per-C-call engine serialization for the streaming paths. +// +// Every individual C call (begin / feed / finalize / free) takes engineMu and +// re-checks ctxPtr under the lock; the lock is NEVER held across a stream's +// lifetime. This is safe because each parakeet.cpp call builds its own ggml +// graph and all streaming caches live in the session object, not the ctx — +// the only ctx-shared mutable state is last_error, which is why it is read +// under the same lock as the failing call. 
Holding the lock per call (rather +// than per stream, as this file previously did) keeps a long-lived live +// session from starving batched unary transcription and vice versa. +// +// A stream must not outlive its ctx (C-API contract). Free() takes engineMu +// and zeroes ctxPtr, so a racing per-call helper returns ModelNotLoaded +// instead of feeding a freed engine; streamFree of an orphaned session only +// runs the session destructor, which does not touch the ctx. + +// streamBegin opens a cache-aware streaming session. A 0 stream with nil +// error means the loaded model is not a streaming model. +func (p *ParakeetCpp) streamBegin(lang string) (uintptr, error) { + p.engineMu.Lock() + defer p.engineMu.Unlock() + if p.ctxPtr == 0 { + return 0, grpcerrors.ModelNotLoaded("parakeet-cpp") + } + if CppStreamBeginLang != nil { + return CppStreamBeginLang(p.ctxPtr, lang), nil + } + return CppStreamBegin(p.ctxPtr), nil +} + +func (p *ParakeetCpp) streamFree(stream uintptr) { + if stream == 0 { + return + } + p.engineMu.Lock() + defer p.engineMu.Unlock() + CppStreamFree(stream) +} + +// streamFeedText runs one text-mode feed (or the finalize flush when +// finalize is true) under engineMu, returning the newly-finalized delta and +// whether an / fired during the call. +func (p *ParakeetCpp) streamFeedText(stream uintptr, pcm []float32, finalize bool) (delta string, eou, eob bool, err error) { + p.engineMu.Lock() + defer p.engineMu.Unlock() + if p.ctxPtr == 0 { + return "", false, false, grpcerrors.ModelNotLoaded("parakeet-cpp") + } + var ret uintptr + var events int32 + if finalize { + ret = CppStreamFinalize(stream) + } else { + ret = CppStreamFeed(stream, pcm, int32(len(pcm)), unsafe.Pointer(&events)) + } + if ret == 0 { + // last_error is ctx-shared: read it under the same lock as the call. 
+ msg := CppLastError(p.ctxPtr) + if msg == "" { + msg = "unknown error" + } + return "", false, false, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg) + } + delta = goStringFromCPtr(ret) + CppFreeString(ret) + // ABI v5: eou_out is a bitmask (bit 0 = <EOU>, bit 1 = <EOB>). A v4 + // library sets 0/1 for either token, which the bit-0 test reads as the + // old conflated eou — the EOB distinction simply isn't available there. + return delta, events&1 != 0, events&2 != 0, nil +} + +// streamFeedDoc runs one ABI v4 JSON feed (or finalize) under engineMu and +// returns the parsed {text,eou,eob,frame_sec,words} document. +func (p *ParakeetCpp) streamFeedDoc(stream uintptr, pcm []float32, finalize bool) (streamFeedJSON, error) { + p.engineMu.Lock() + defer p.engineMu.Unlock() + if p.ctxPtr == 0 { + return streamFeedJSON{}, grpcerrors.ModelNotLoaded("parakeet-cpp") + } + var ret uintptr + if finalize { + ret = CppStreamFinalizeJSON(stream) + } else { + ret = CppStreamFeedJSON(stream, pcm, int32(len(pcm))) + } + if ret == 0 { + msg := CppLastError(p.ctxPtr) + if msg == "" { + msg = "unknown error" + } + return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg) + } + raw := goStringFromCPtr(ret) + CppFreeString(ret) + var doc streamFeedJSON + if err := json.Unmarshal([]byte(raw), &doc); err != nil { + return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: decode stream json: %w", err) + } + return doc, nil +} + // AudioTranscriptionStream drives the cache-aware streaming RNN-T over the -// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it in -// chunks to parakeet_capi_stream_feed, and emits each newly-finalized text -// run as a TranscriptStreamResponse delta. <EOU>/<EOB> events close the -// current segment; a closing FinalResult carries the full transcript and the -// per-utterance segments. 
+// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it through +// the shared decode driver (feedSlices/flushTail), and emits each +// newly-finalized text run as a TranscriptStreamResponse delta. / +// events close the current segment; a closing FinalResult carries the full +// transcript, the per-utterance segments, and whether the file ended on an +// utterance boundary. // // stream_begin returns 0 for models that are not cache-aware streaming models -// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those we fall -// back to a single offline transcription emitted as one delta plus a closing -// FinalResult, matching LocalAI's non-streaming streaming contract (and the -// whisper backend), so the streaming endpoint works for every model. +// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those this +// returns codes.Unimplemented rather than faking a stream from an offline +// decode — see the stream==0 branch and grpcerrors.StreamTranscriptionUnsupported. func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.TranscriptRequest, results chan *pb.TranscriptStreamResponse) error { defer close(results) @@ -560,185 +695,73 @@ func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.Tra return status.Error(codes.Canceled, "transcription cancelled") } - var stream uintptr - if CppStreamBeginLang != nil { - stream = CppStreamBeginLang(p.ctxPtr, opts.GetLanguage()) - } else { - stream = CppStreamBegin(p.ctxPtr) + stream, err := p.streamBegin(opts.GetLanguage()) + if err != nil { + return err } if stream == 0 { - // Not a cache-aware streaming model: run a normal offline - // transcription and emit it as one delta + a closing final result. 
- res, err := p.AudioTranscription(ctx, opts) - if err != nil { - return err - } - if t := strings.TrimSpace(res.Text); t != "" { - results <- &pb.TranscriptStreamResponse{Delta: t} - } - results <- &pb.TranscriptStreamResponse{FinalResult: &res} - return nil + // Not a cache-aware streaming model. Report the missing capability + // honestly instead of decoding offline and emitting it as one "delta" + // + final: a client that asked for streaming must learn the model + // cannot stream, not receive a batch result dressed as a stream (which + // is indistinguishable except qualitatively, and silently breaks any + // feature that genuinely needs incremental output). Callers wanting a + // plain transcript use the unary AudioTranscription path. This mirrors + // AudioTranscriptionLive, which already returns Unimplemented here. + return grpcerrors.StreamTranscriptionUnsupported("parakeet-cpp", + "loaded model is not a cache-aware streaming model") } - defer CppStreamFree(stream) - // The C engine is a single shared context: a streaming session and a batched - // unary dispatch must never touch it at once, so hold engineMu for the whole - // stream. This lock is intentionally taken AFTER the non-streaming fallback - // above returns: that fallback goes through AudioTranscription -> the batcher - // -> runBatch, which itself acquires engineMu, so locking here first would - // deadlock. Do not hoist this lock above the fallback. - p.engineMu.Lock() - defer p.engineMu.Unlock() + defer p.streamFree(stream) data, duration, err := decodeWavMono16k(opts.Dst) if err != nil { return err } - // ABI v4: when the streaming JSON entry points are present, drive them so the - // per-utterance segments carry per-word start/end timestamps. Falls through to - // the text-only loop below against an older libparakeet.so. Runs under the - // engineMu already held above. 
- if CppStreamFeedJSON != nil { - return p.streamJSON(ctx, stream, data, duration, results) - } - + // Fold the shared decode driver's per-feed increments into the streamed + // deltas and the closing batch result: words/text accumulate into + // per-utterance segments (streamSegmenter), and the utterance-boundary + // latch (boundary.go) records whether the file ended on an <EOU>. These + // are the offline path's concern — the live RPC carries none of them. var ( full strings.Builder - segText strings.Builder - segments []*pb.TranscriptSegment - segID int32 + seg streamSegmenter + boundary utteranceBoundary ) - - flushSegment := func() { - t := strings.TrimSpace(segText.String()) - segText.Reset() - if t == "" { - return + emit := func(r streamFeedResult) error { + if r.Delta != "" { + full.WriteString(r.Delta) + results <- &pb.TranscriptStreamResponse{Delta: r.Delta} } - segments = append(segments, &pb.TranscriptSegment{Id: segID, Text: t}) - segID++ - } - - // emitDelta consumes the malloc'd char* returned by feed/finalize: frees - // it, accumulates the text, and sends a delta when non-empty. A 0 return - // is an error (vs the "" empty-but-non-NULL no-new-text case).
- emitDelta := func(ret uintptr) error { - if ret == 0 { - msg := CppLastError(p.ctxPtr) - if msg == "" { - msg = "unknown error" - } - return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg) - } - delta := goStringFromCPtr(ret) - CppFreeString(ret) - if delta == "" { - return nil - } - full.WriteString(delta) - segText.WriteString(delta) - results <- &pb.TranscriptStreamResponse{Delta: delta} + seg.add(r) + boundary = boundary.observe(r) return nil } - for off := 0; off < len(data); off += streamChunkSamples { - if err := ctx.Err(); err != nil { - return status.Error(codes.Canceled, "transcription cancelled") - } - end := min(off+streamChunkSamples, len(data)) - chunk := data[off:end] - - var eou int32 - ret := CppStreamFeed(stream, chunk, int32(len(chunk)), unsafe.Pointer(&eou)) - if err := emitDelta(ret); err != nil { - return err - } - if eou != 0 { - flushSegment() - } - } - - // Flush the streaming tail (final encoder chunk). - if err := emitDelta(CppStreamFinalize(stream)); err != nil { + if err := p.feedSlices(ctx, stream, data, emit); err != nil { return err } - flushSegment() - - text := strings.TrimSpace(full.String()) - if len(segments) == 0 && text != "" { - segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text}) - } - results <- &pb.TranscriptStreamResponse{ - FinalResult: &pb.TranscriptResult{ - Text: text, - Segments: segments, - Duration: duration, - }, - } - return nil -} - -// streamJSON drives the streaming JSON entry points (present since ABI v4): each -// feed/finalize returns a {text,eou,eob,frame_sec,words} document. The -// newly-finalized text is emitted as a delta (unchanged streaming contract) -// while words are accumulated into per-utterance segments (closed on or -// ) so the closing FinalResult carries timestamped segments. Runs under -// engineMu (already held by the caller). 
-func (p *ParakeetCpp) streamJSON(ctx context.Context, stream uintptr, data []float32, - duration float32, results chan *pb.TranscriptStreamResponse) error { - var ( - full strings.Builder - seg streamSegmenter - ) - // consume frees the malloc'd char* (a 0 return is an error), parses the JSON, - // emits the delta, and routes words through the segmenter. - consume := func(ret uintptr) error { - if ret == 0 { - msg := CppLastError(p.ctxPtr) - if msg == "" { - msg = "unknown error" - } - return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg) - } - raw := goStringFromCPtr(ret) - CppFreeString(ret) - var doc streamFeedJSON - if err := json.Unmarshal([]byte(raw), &doc); err != nil { - return fmt.Errorf("parakeet-cpp: decode stream json: %w", err) - } - if doc.Text != "" { - full.WriteString(doc.Text) - results <- &pb.TranscriptStreamResponse{Delta: doc.Text} - } - seg.add(doc) - return nil - } - - for off := 0; off < len(data); off += streamChunkSamples { - if err := ctx.Err(); err != nil { - return status.Error(codes.Canceled, "transcription cancelled") - } - end := min(off+streamChunkSamples, len(data)) - chunk := data[off:end] - if err := consume(CppStreamFeedJSON(stream, chunk, int32(len(chunk)))); err != nil { - return err - } - } - if err := consume(CppStreamFinalizeJSON(stream)); err != nil { + if err := p.flushTail(stream, emit); err != nil { return err } - seg.flush() // close any trailing utterance that never saw an EOU + seg.flush() // close a trailing utterance that never saw an - text := strings.TrimSpace(full.String()) + // final.Text is the exact concatenation of the streamed deltas (full is + // their accumulation), so concat(deltas) == FinalResult.Text holds even + // when the model prepends a leading space to the first word (SentencePiece + // detokenization). This matches the whisper backend's streaming contract. + // The single-segment fallback stays trimmed. 
+ fullText := full.String() segments := seg.segments() - if len(segments) == 0 && text != "" { - segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text}) + if trimmed := strings.TrimSpace(fullText); len(segments) == 0 && trimmed != "" { + segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: trimmed}) } results <- &pb.TranscriptStreamResponse{ FinalResult: &pb.TranscriptResult{ - Text: text, + Text: fullText, Segments: segments, Duration: duration, + Eou: boundary.ended(), }, } return nil @@ -803,6 +826,10 @@ func (p *ParakeetCpp) Free() error { close(p.batStop) p.batStop = nil } + // engineMu so an in-flight streaming call (which locks per C call and + // re-checks ctxPtr under the lock) can never feed into a freed ctx. + p.engineMu.Lock() + defer p.engineMu.Unlock() if p.ctxPtr != 0 { CppFree(p.ctxPtr) p.ctxPtr = 0 diff --git a/backend/go/parakeet-cpp/goparakeetcpp_test.go b/backend/go/parakeet-cpp/goparakeetcpp_test.go index 0cfcc37e5..a6f6af1f0 100644 --- a/backend/go/parakeet-cpp/goparakeetcpp_test.go +++ b/backend/go/parakeet-cpp/goparakeetcpp_test.go @@ -14,6 +14,8 @@ import ( pb "github.com/mudler/LocalAI/pkg/grpc/proto" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" ) func TestParakeetCpp(t *testing.T) { @@ -201,6 +203,29 @@ var _ = Describe("ParakeetCpp", func() { }) Context("AudioTranscriptionStream", func() { + It("returns the typed Unimplemented signal for non-streaming models (no offline fallback)", func() { + // stream_begin == 0 means the loaded model is not a cache-aware + // streaming model. The backend must surface that, not silently + // decode offline and fake a one-shot "stream". 
+ savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang + defer func() { CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang }() + CppStreamBeginLang = nil + CppStreamBegin = func(ctx uintptr) uintptr { return 0 } + + p := &ParakeetCpp{ctxPtr: 1} + results := make(chan *pb.TranscriptStreamResponse, 8) + err := p.AudioTranscriptionStream(context.Background(), + &pb.TranscriptRequest{Dst: "ignored.wav"}, results) + Expect(status.Code(err)).To(Equal(codes.Unimplemented)) + + // Honest signal: nothing was emitted — no faked batch result. + var emitted []*pb.TranscriptStreamResponse + for r := range results { + emitted = append(emitted, r) + } + Expect(emitted).To(BeEmpty()) + }) + It("streams deltas and a closing FinalResult from a cache-aware model", func() { // Streaming needs a cache-aware streaming model (e.g. // realtime_eou); the offline test model would fail stream_begin. diff --git a/backend/go/parakeet-cpp/live.go b/backend/go/parakeet-cpp/live.go new file mode 100644 index 000000000..3d68a2914 --- /dev/null +++ b/backend/go/parakeet-cpp/live.go @@ -0,0 +1,186 @@ +package main + +import ( + "strings" + "time" + + "github.com/mudler/LocalAI/pkg/grpc/grpcerrors" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + "github.com/mudler/xlog" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +// liveSampleRate is the only PCM rate the parakeet C streaming API accepts. +const liveSampleRate = 16000 + +// AudioTranscriptionLive drives one cache-aware streaming session over audio +// fed incrementally by the caller (the realtime API's semantic_vad turn +// detection). 
Contract: +// +// - the first request must carry a Config; a Config mid-stream resets the +// decode session (free + begin) and drops accumulated transcript state; +// - a Ready ack is sent right after a successful stream_begin so callers +// can degrade synchronously when the model has no streaming support +// (LiveTranscriptionUnsupported, codes.Unimplemented); +// - every feed that produced output is forwarded as {delta, eou, eob, words}; +// the <EOU>/<EOB> flag is the model's own utterance boundary and the +// decoder auto-resets after it, so one session spans many utterances; +// - closing the send side finalizes: the held-back tail chunk is flushed +// (the last ~2 encoder frames of words only appear here) and a terminal +// FinalResult carries the full transcript Text only. Per-utterance +// segments, duration, and the terminal <EOU> flag are NOT produced here — +// the realtime core consumes the streamed per-feed tokens and the final +// Text; those batch fields are the file path's concern (see +// AudioTranscriptionStream). +// +// Engine access is serialized per C call (streamBegin/streamFeed*/streamFree +// take engineMu internally), never for the session lifetime — unary +// transcription keeps flowing between feeds.
+func (p *ParakeetCpp) AudioTranscriptionLive(in <-chan *pb.TranscriptLiveRequest, out chan<- *pb.TranscriptLiveResponse) error { + defer close(out) + + if p.ctxPtr == 0 { + return grpcerrors.ModelNotLoaded("parakeet-cpp") + } + + first, ok := <-in + if !ok { + return nil // caller closed without sending anything + } + cfg := first.GetConfig() + if cfg == nil { + return status.Error(codes.InvalidArgument, "parakeet-cpp: first live message must carry a config") + } + if err := validateLiveConfig(cfg); err != nil { + return err + } + + stream, err := p.streamBegin(cfg.GetLanguage()) + if err != nil { + return err + } + if stream == 0 { + return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp", + "loaded model is not a cache-aware streaming model") + } + // stream is reassigned on a mid-stream Config reset; free whatever is + // current when the RPC unwinds. + defer func() { p.streamFree(stream) }() + + out <- &pb.TranscriptLiveResponse{Ready: true} + + var ( + full strings.Builder + fedSecs float64 + + // behindSec accumulates how far decode wall time has fallen behind + // the audio it was fed. A live caller feeds in real time, so a + // persistent positive backlog means every downstream signal — + // including the <EOU> the turn detector waits on — arrives that many + // seconds late. Warned once per session. NOTE(review): the Config-reset branch clears only full/fedSecs, not behindSec/behindWarned — confirm intended. + behindSec float64 + behindWarned bool + ) + + // emit forwards one decode increment: it streams the per-feed tokens the + // realtime turn detector consumes (delta/eou/eob/words) and accumulates the + // running transcript for the closing FinalResult. No segmentation or + // boundary latch here — the live consumer reads only the streamed tokens + // and the final Text; per-utterance segments and the terminal <EOU> flag + // are an offline-path concern (see AudioTranscriptionStream / boundary.go).
+ emit := func(r streamFeedResult) error { + if r.Delta != "" { + full.WriteString(r.Delta) + } + if r.Delta != "" || r.Eou || r.Eob || len(r.Words) > 0 { + out <- &pb.TranscriptLiveResponse{ + Delta: r.Delta, + Eou: r.Eou, + Eob: r.Eob, + Words: liveWordsToProto(r.Words), + } + } + return nil + } + + for req := range in { + switch payload := req.GetPayload().(type) { + case *pb.TranscriptLiveRequest_Config: + if err := validateLiveConfig(payload.Config); err != nil { + return err + } + // Reset: a fresh decode session, dropping accumulated state. + p.streamFree(stream) + stream, err = p.streamBegin(payload.Config.GetLanguage()) + if err != nil { + return err + } + if stream == 0 { + return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp", + "loaded model is not a cache-aware streaming model") + } + full.Reset() + fedSecs = 0 + case *pb.TranscriptLiveRequest_Audio: + pcm := payload.Audio.GetPcm() + audioSec := float64(len(pcm)) / liveSampleRate + fedSecs += audioSec + start := time.Now() + // nil ctx: a live session is bounded by this request channel, not a + // context — cancellation is the caller closing the stream. + if err := p.feedSlices(nil, stream, pcm, emit); err != nil { + return err + } + wallSec := time.Since(start).Seconds() + behindSec += wallSec - audioSec + if behindSec < 0 { + behindSec = 0 + } + xlog.Debug("parakeet-cpp: live feed", + "audio_ms", int(audioSec*1000), "wall_ms", int(wallSec*1000), + "behind_ms", int(behindSec*1000), "fed_s", fedSecs) + if behindSec > 1 && !behindWarned { + behindWarned = true + xlog.Warn("parakeet-cpp: live decode is falling behind real time; "+ + "end-of-utterance signals will arrive late", + "behind_s", behindSec, "fed_s", fedSecs) + } + } + } + + // Send side closed: flush the streaming tail and emit the final transcript. + // The live FinalResult carries only Text — the authoritative full-turn + // transcript the realtime core commits. 
Per-utterance segments, duration, + // and the terminal flag are not produced on the live path. + if err := p.flushTail(stream, emit); err != nil { + return err + } + out <- &pb.TranscriptLiveResponse{ + FinalResult: &pb.TranscriptResult{Text: strings.TrimSpace(full.String())}, + } + return nil +} + +func validateLiveConfig(cfg *pb.TranscriptLiveConfig) error { + if sr := cfg.GetSampleRate(); sr != 0 && sr != liveSampleRate { + return status.Errorf(codes.InvalidArgument, + "parakeet-cpp: unsupported live sample_rate %d (only %d)", sr, liveSampleRate) + } + return nil +} + +func liveWordsToProto(words []transcriptWord) []*pb.TranscriptWord { + if len(words) == 0 { + return nil + } + out := make([]*pb.TranscriptWord, len(words)) + for i, w := range words { + out[i] = &pb.TranscriptWord{ + Start: secondsToNanos(w.Start), + End: secondsToNanos(w.End), + Text: w.W, + } + } + return out +} diff --git a/backend/go/parakeet-cpp/live_test.go b/backend/go/parakeet-cpp/live_test.go new file mode 100644 index 000000000..0462ee521 --- /dev/null +++ b/backend/go/parakeet-cpp/live_test.go @@ -0,0 +1,417 @@ +package main + +import ( + "sync" + "time" + "unsafe" + + "github.com/mudler/LocalAI/pkg/grpc/grpcerrors" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +// The live-RPC specs drive AudioTranscriptionLive entirely against stubbed +// Cpp* package vars (the same seam batcher_test.go uses), so they run +// without libparakeet.so. + +// liveCstrPool hands out NUL-terminated C-style strings backed by Go memory +// and keeps them alive for the duration of a spec (goStringFromCPtr reads +// through the raw pointer; Go's GC must not collect the backing array while +// a stub's return value is in flight). 
+type liveCstrPool struct { + mu sync.Mutex + bufs [][]byte +} + +func (p *liveCstrPool) cstr(s string) uintptr { + p.mu.Lock() + defer p.mu.Unlock() + b := append([]byte(s), 0) + p.bufs = append(p.bufs, b) + return uintptr(unsafe.Pointer(&b[0])) +} + +// liveStubs swaps every C entry point the live path touches and returns a +// restore func for AfterEach. +func liveStubs() (restore func()) { + savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang + savedFeed, savedFeedJSON := CppStreamFeed, CppStreamFeedJSON + savedFinalize, savedFinalizeJSON := CppStreamFinalize, CppStreamFinalizeJSON + savedFree, savedLastError := CppStreamFree, CppLastError + savedFreeString := CppFreeString + return func() { + CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang + CppStreamFeed, CppStreamFeedJSON = savedFeed, savedFeedJSON + CppStreamFinalize, CppStreamFinalizeJSON = savedFinalize, savedFinalizeJSON + CppStreamFree, CppLastError = savedFree, savedLastError + CppFreeString = savedFreeString + } +} + +// runLive starts the RPC on its own goroutine and returns the request +// channel plus a collector for everything the backend emitted. 
+func runLive(p *ParakeetCpp) (chan *pb.TranscriptLiveRequest, chan *pb.TranscriptLiveResponse, chan error) { + in := make(chan *pb.TranscriptLiveRequest) + out := make(chan *pb.TranscriptLiveResponse, 32) + errCh := make(chan error, 1) + go func() { errCh <- p.AudioTranscriptionLive(in, out) }() + return in, out, errCh +} + +func liveConfig(lang string) *pb.TranscriptLiveRequest { + return &pb.TranscriptLiveRequest{ + Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{Language: lang}}, + } +} + +func liveAudio(pcm []float32) *pb.TranscriptLiveRequest { + return &pb.TranscriptLiveRequest{ + Payload: &pb.TranscriptLiveRequest_Audio{Audio: &pb.TranscriptLiveAudio{Pcm: pcm}}, + } +} + +func collectLive(out chan *pb.TranscriptLiveResponse) []*pb.TranscriptLiveResponse { + var got []*pb.TranscriptLiveResponse + for r := range out { + got = append(got, r) + } + return got +} + +var _ = Describe("AudioTranscriptionLive (stubbed C API)", func() { + var ( + pool *liveCstrPool + restore func() + p *ParakeetCpp + ) + + BeforeEach(func() { + pool = &liveCstrPool{} + restore = liveStubs() + p = &ParakeetCpp{ctxPtr: 1} + + CppStreamBeginLang = nil + CppStreamBegin = func(ctx uintptr) uintptr { return 7 } + CppStreamFree = func(s uintptr) {} + CppFreeString = func(s uintptr) {} + CppLastError = func(ctx uintptr) string { return "stub error" } + CppStreamFeed = nil + CppStreamFeedJSON = nil + CppStreamFinalize = nil + CppStreamFinalizeJSON = nil + }) + + AfterEach(func() { restore() }) + + It("rejects a stream whose first message is not a config", func() { + in, out, errCh := runLive(p) + in <- liveAudio([]float32{0.1}) + close(in) + + err := <-errCh + Expect(status.Code(err)).To(Equal(codes.InvalidArgument)) + Expect(collectLive(out)).To(BeEmpty()) + }) + + It("rejects a non-16k sample rate", func() { + in, _, errCh := runLive(p) + in <- &pb.TranscriptLiveRequest{ + Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{SampleRate: 
8000}}, + } + close(in) + Expect(status.Code(<-errCh)).To(Equal(codes.InvalidArgument)) + }) + + It("returns the typed Unimplemented signal for non-streaming models, before any ack", func() { + CppStreamBegin = func(ctx uintptr) uintptr { return 0 } + + in, out, errCh := runLive(p) + in <- liveConfig("") + close(in) + + err := <-errCh + Expect(grpcerrors.IsLiveTranscriptionUnsupported(err)).To(BeTrue()) + Expect(collectLive(out)).To(BeEmpty()) + }) + + It("streams deltas, eou flags and words on the JSON path and finalizes on close", func() { + var freed []uintptr + CppStreamFree = func(s uintptr) { freed = append(freed, s) } + feeds := 0 + CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr { + feeds++ + switch feeds { + case 1: + return pool.cstr(`{"text":"hello ","eou":0,"frame_sec":0.08,` + + `"words":[{"w":"hello","start":0.1,"end":0.4,"conf":0.9}]}`) + default: + return pool.cstr(`{"text":"world","eou":1,"frame_sec":0.08,` + + `"words":[{"w":"world","start":0.5,"end":0.8,"conf":0.9}]}`) + } + } + CppStreamFinalizeJSON = func(s uintptr) uintptr { + return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`) + } + + in, out, errCh := runLive(p) + in <- liveConfig("en") + in <- liveAudio(make([]float32, 100)) + in <- liveAudio(make([]float32, 200)) + close(in) + Expect(<-errCh).NotTo(HaveOccurred()) + + got := collectLive(out) + Expect(got).To(HaveLen(4)) // ready, two deltas, final + + Expect(got[0].Ready).To(BeTrue()) + + Expect(got[1].Delta).To(Equal("hello ")) + Expect(got[1].Eou).To(BeFalse()) + Expect(got[1].Words).To(HaveLen(1)) + Expect(got[1].Words[0].Text).To(Equal("hello")) + + Expect(got[2].Delta).To(Equal("world")) + Expect(got[2].Eou).To(BeTrue()) + + final := got[3].FinalResult + Expect(final).NotTo(BeNil()) + Expect(final.Text).To(Equal("hello world")) + // The live FinalResult carries only Text. 
Per-utterance segments, + // duration and the terminal eou flag are an offline-path concern (see + // boundary.go / AudioTranscriptionStream); the realtime core reads the + // streamed per-feed tokens above plus this Text. + Expect(final.Eou).To(BeFalse()) + Expect(final.Segments).To(BeEmpty()) + Expect(final.Duration).To(BeZero()) + + Expect(freed).To(Equal([]uintptr{7})) + }) + + It("falls back to the text feed (eou out-param) when the JSON entry points are absent", func() { + feeds := 0 + CppStreamFeed = func(s uintptr, pcm []float32, n int32, eouOut unsafe.Pointer) uintptr { + feeds++ + if feeds == 2 { + *(*int32)(eouOut) = 1 + return pool.cstr("done") + } + return pool.cstr("first ") + } + CppStreamFinalize = func(s uintptr) uintptr { return pool.cstr("") } + + in, out, errCh := runLive(p) + in <- liveConfig("") + in <- liveAudio(make([]float32, 10)) + in <- liveAudio(make([]float32, 10)) + close(in) + Expect(<-errCh).NotTo(HaveOccurred()) + + got := collectLive(out) + Expect(got).To(HaveLen(4)) + Expect(got[1].Delta).To(Equal("first ")) + Expect(got[1].Eou).To(BeFalse()) + Expect(got[2].Delta).To(Equal("done")) + Expect(got[2].Eou).To(BeTrue()) + Expect(got[3].FinalResult.Text).To(Equal("first done")) + }) + + It("forwards as eob — a backchannel, never an eou (ABI v5 JSON)", func() { + feeds := 0 + CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr { + feeds++ + if feeds == 1 { + return pool.cstr(`{"text":"uh-huh","eou":0,"eob":1,"frame_sec":0.08,` + + `"words":[{"w":"uh-huh","start":0.1,"end":0.3,"conf":0.9}]}`) + } + return pool.cstr(`{"text":"the turn","eou":1,"eob":0,"frame_sec":0.08,` + + `"words":[{"w":"the","start":0.5,"end":0.6,"conf":0.9},{"w":"turn","start":0.6,"end":0.8,"conf":0.9}]}`) + } + CppStreamFinalizeJSON = func(s uintptr) uintptr { + return pool.cstr(`{"text":"","eou":0,"eob":0,"frame_sec":0.08,"words":[]}`) + } + + in, out, errCh := runLive(p) + in <- liveConfig("") + in <- liveAudio(make([]float32, 10)) + in <- 
liveAudio(make([]float32, 10)) + close(in) + Expect(<-errCh).NotTo(HaveOccurred()) + + got := collectLive(out) + Expect(got).To(HaveLen(4)) + Expect(got[1].Eob).To(BeTrue()) + Expect(got[1].Eou).To(BeFalse(), "a backchannel must not masquerade as a turn boundary") + Expect(got[2].Eou).To(BeTrue()) + }) + + It("maps the v5 eou_out bitmask on the text path (bit0 , bit1 )", func() { + feeds := 0 + CppStreamFeed = func(s uintptr, pcm []float32, n int32, eouOut unsafe.Pointer) uintptr { + feeds++ + if feeds == 1 { + *(*int32)(eouOut) = 2 // only + return pool.cstr("uh-huh") + } + *(*int32)(eouOut) = 1 // + return pool.cstr(" done") + } + CppStreamFinalize = func(s uintptr) uintptr { return pool.cstr("") } + + in, out, errCh := runLive(p) + in <- liveConfig("") + in <- liveAudio(make([]float32, 10)) + in <- liveAudio(make([]float32, 10)) + close(in) + Expect(<-errCh).NotTo(HaveOccurred()) + + got := collectLive(out) + Expect(got).To(HaveLen(4)) + Expect(got[1].Eob).To(BeTrue()) + Expect(got[1].Eou).To(BeFalse()) + Expect(got[2].Eou).To(BeTrue()) + Expect(got[2].Eob).To(BeFalse()) + }) + + It("accumulates trailing text after an EOU into the final transcript", func() { + feeds := 0 + CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr { + feeds++ + if feeds == 1 { + return pool.cstr(`{"text":"turn one","eou":1,"frame_sec":0.08,"words":[]}`) + } + return pool.cstr(`{"text":" and more","eou":0,"frame_sec":0.08,"words":[]}`) + } + CppStreamFinalizeJSON = func(s uintptr) uintptr { + return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`) + } + + in, out, errCh := runLive(p) + in <- liveConfig("") + in <- liveAudio(make([]float32, 10)) + in <- liveAudio(make([]float32, 10)) + close(in) + Expect(<-errCh).NotTo(HaveOccurred()) + + got := collectLive(out) + final := got[len(got)-1].FinalResult + Expect(final.Text).To(Equal("turn one and more")) + }) + + It("resets the decode session on a mid-stream config", func() { + var begun, freed int + 
CppStreamBegin = func(ctx uintptr) uintptr { begun++; return uintptr(10 + begun) } + CppStreamFree = func(s uintptr) { freed++ } + CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr { + return pool.cstr(`{"text":"x","eou":0,"frame_sec":0.08,"words":[]}`) + } + CppStreamFinalizeJSON = func(s uintptr) uintptr { + return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`) + } + + in, out, errCh := runLive(p) + in <- liveConfig("") + in <- liveAudio(make([]float32, 10)) + in <- liveConfig("") // reset + in <- liveAudio(make([]float32, 10)) + close(in) + Expect(<-errCh).NotTo(HaveOccurred()) + + got := collectLive(out) + final := got[len(got)-1].FinalResult + Expect(final.Text).To(Equal("x"), "pre-reset transcript dropped") + Expect(begun).To(Equal(2)) + Expect(freed).To(Equal(2), "old session freed on reset, new one on unwind") + }) + + It("does not hold engineMu between feeds (unary work interleaves with a live session)", func() { + CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr { + return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`) + } + CppStreamFinalizeJSON = func(s uintptr) uintptr { + return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`) + } + + in, out, errCh := runLive(p) + in <- liveConfig("") + in <- liveAudio(make([]float32, 10)) + + // The session is open and idle between feeds: the engine lock must be + // acquirable, which is what lets batched unary transcription proceed + // mid-session. Under stream-lifetime locking this probe would block + // until the stream ended and the Eventually would time out. 
+ locked := make(chan struct{}) + go func() { + p.engineMu.Lock() + p.engineMu.Unlock() //nolint:staticcheck // probe: acquire-release proves availability + close(locked) + }() + Eventually(locked, time.Second).Should(BeClosed()) + + close(in) + Expect(<-errCh).NotTo(HaveOccurred()) + collectLive(out) + }) + + It("errors out and reads last_error under the lock when a feed fails", func() { + CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr { return 0 } + + in, out, errCh := runLive(p) + in <- liveConfig("") + in <- liveAudio(make([]float32, 10)) + + err := <-errCh + Expect(err).To(MatchError(ContainSubstring("stub error"))) + got := collectLive(out) + Expect(got).To(HaveLen(1)) // just the ready ack + close(in) + }) +}) + +var _ = Describe("stripEouMarker", func() { + It("strips a trailing and reports it", func() { + text, eou := stripEouMarker("it is certainly very like the old portrait") + Expect(text).To(Equal("it is certainly very like the old portrait")) + Expect(eou).To(BeTrue()) + }) + + It("strips a trailing WITHOUT reporting an utterance end", func() { + // A decode ending on a backchannel must not confirm the + // retranscribe gate — the user was acknowledging, not yielding. 
+ text, eou := stripEouMarker("uh-huh") + Expect(text).To(Equal("uh-huh")) + Expect(eou).To(BeFalse()) + }) + + It("leaves marker-free text alone", func() { + text, eou := stripEouMarker("plain transcript") + Expect(text).To(Equal("plain transcript")) + Expect(eou).To(BeFalse()) + }) + + It("does not strip a marker in the middle of the text", func() { + text, eou := stripEouMarker("ab") + Expect(text).To(Equal("ab")) + Expect(eou).To(BeFalse()) + }) +}) + +var _ = Describe("transcriptResultFromDoc EOU handling", func() { + It("strips the offline marker from text and sets the result flag", func() { + doc := transcriptJSON{Text: "the old portrait"} + res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0) + Expect(res.Text).To(Equal("the old portrait")) + Expect(res.Eou).To(BeTrue()) + Expect(res.Segments).To(HaveLen(1)) + Expect(res.Segments[0].Text).To(Equal("the old portrait")) + }) + + It("reports eou=false for marker-free decodes", func() { + doc := transcriptJSON{Text: "no marker here"} + res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0) + Expect(res.Text).To(Equal("no marker here")) + Expect(res.Eou).To(BeFalse()) + }) +}) diff --git a/backend/go/parakeet-cpp/segments_test.go b/backend/go/parakeet-cpp/segments_test.go index 9d8e9f8d5..0295e771f 100644 --- a/backend/go/parakeet-cpp/segments_test.go +++ b/backend/go/parakeet-cpp/segments_test.go @@ -106,7 +106,7 @@ var _ = Describe("transcriptResultFromDoc (multi-segment)", func() { var _ = Describe("streaming segment assembly", func() { It("closes a segment with start/end from its words on EOU", func() { acc := &streamSegmenter{} - acc.add(streamFeedJSON{Text: "hello world", Eou: 1, Words: []transcriptWord{ + acc.add(streamFeedResult{Delta: "hello world", Eou: true, Words: []transcriptWord{ {W: "hello", Start: 0.0, End: 0.4}, {W: "world", Start: 0.4, End: 0.9}, }}) segs := acc.segments() @@ -118,9 +118,9 @@ var _ = Describe("streaming segment assembly", func() { It("buffers words across 
feeds until EOU", func() { acc := &streamSegmenter{} - acc.add(streamFeedJSON{Text: "hi", Eou: 0, Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}}) + acc.add(streamFeedResult{Delta: "hi", Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}}) Expect(acc.segments()).To(BeEmpty()) - acc.add(streamFeedJSON{Text: "there", Eou: 1, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}}) + acc.add(streamFeedResult{Delta: "there", Eou: true, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}}) Expect(acc.segments()).To(HaveLen(1)) Expect(acc.segments()[0].Text).To(Equal("hi there")) }) @@ -129,7 +129,7 @@ var _ = Describe("streaming segment assembly", func() { // field; a backchannel must still close the segment as it did in v4. It("closes a segment on EOB (backchannel) too", func() { acc := &streamSegmenter{} - acc.add(streamFeedJSON{Text: "uh huh", Eou: 0, Eob: 1, Words: []transcriptWord{ + acc.add(streamFeedResult{Delta: "uh huh", Eob: true, Words: []transcriptWord{ {W: "uh", Start: 0.0, End: 0.2}, {W: "huh", Start: 0.2, End: 0.5}, }}) segs := acc.segments() @@ -137,4 +137,18 @@ var _ = Describe("streaming segment assembly", func() { Expect(segs[0].Text).To(Equal("uh huh")) Expect(segs[0].End).To(Equal(secondsToNanos(0.5))) }) + + // Older text-only libparakeet.so: no per-word timings, so a segment is cut + // from the delta text on each / (no timestamps), one per utterance. 
+ It("falls back to text segments when the feed carries no words", func() { + acc := &streamSegmenter{} + acc.add(streamFeedResult{Delta: "first turn", Eou: true}) + acc.add(streamFeedResult{Delta: "second turn", Eou: true}) + segs := acc.segments() + Expect(segs).To(HaveLen(2)) + Expect(segs[0].Text).To(Equal("first turn")) + Expect(segs[1].Text).To(Equal("second turn")) + Expect(segs[0].Start).To(Equal(int64(0)), "no per-word timing on the text path") + Expect(segs[0].End).To(Equal(int64(0))) + }) }) diff --git a/core/application/application.go b/core/application/application.go index 52f8618f1..83057c9cd 100644 --- a/core/application/application.go +++ b/core/application/application.go @@ -103,6 +103,11 @@ func newApplication(appConfig *config.ApplicationConfig) *Application { mcpTools.CloseMCPSessions(modelName) }) + // Record a model_load backend trace for every real backend load, so the + // Traces UI shows which backend runtime served each model and how long + // the load took. Load failures are traced by the modality wrappers. + ml.SetLoadObserver(corebackend.ModelLoadTraceObserver(appConfig)) + app := &Application{ backendLoader: config.NewModelConfigLoader(appConfig.SystemState.Model.ModelsPath), modelLoader: ml, diff --git a/core/backend/model_load_trace_test.go b/core/backend/model_load_trace_test.go new file mode 100644 index 000000000..1cce5da26 --- /dev/null +++ b/core/backend/model_load_trace_test.go @@ -0,0 +1,72 @@ +package backend_test + +import ( + "errors" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/mudler/LocalAI/core/backend" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/trace" + "github.com/mudler/LocalAI/pkg/model" +) + +// ModelLoadTraceObserver is what makes successful loads visible on the +// Traces page: one model_load row per real backend load, carrying the +// resolved backend runtime. 
Failures must NOT be recorded here — the +// modality wrappers own those — and the observer must respect the runtime +// tracing toggle. +var _ = Describe("ModelLoadTraceObserver", func() { + var appConfig *config.ApplicationConfig + + successEvent := model.BackendLoadEvent{ + ModelID: "parakeet-cpp-realtime_eou_120m-v1", + ModelName: "realtime_eou_120m.gguf", + Backend: "parakeet-cpp", + BackendURI: "/backends/intel-sycl-f16-parakeet-cpp-development/run.sh", + Duration: 1500 * time.Millisecond, + } + + BeforeEach(func() { + appConfig = &config.ApplicationConfig{ + EnableTracing: true, + TracingMaxItems: 64, + } + trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes) + trace.ClearBackendTraces() + }) + + It("records a model_load trace with the backend runtime on success", func() { + backend.ModelLoadTraceObserver(appConfig)(successEvent) + + Eventually(trace.GetBackendTraces).Should(HaveLen(1)) + got := trace.GetBackendTraces()[0] + Expect(got.Type).To(Equal(trace.BackendTraceModelLoad)) + Expect(got.Summary).To(Equal("Model loaded")) + Expect(got.ModelName).To(Equal("parakeet-cpp-realtime_eou_120m-v1")) + Expect(got.Backend).To(Equal("parakeet-cpp")) + Expect(got.Duration).To(Equal(1500 * time.Millisecond)) + Expect(got.Data["backend_runtime"]).To(Equal("/backends/intel-sycl-f16-parakeet-cpp-development/run.sh")) + Expect(got.Data["model_file"]).To(Equal("realtime_eou_120m.gguf")) + Expect(got.Error).To(BeEmpty()) + }) + + It("skips failed loads — the modality wrappers trace those with request context", func() { + failed := successEvent + failed.Err = errors.New("grpc service not ready") + + backend.ModelLoadTraceObserver(appConfig)(failed) + + Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty()) + }) + + It("records nothing when tracing is disabled", func() { + appConfig.EnableTracing = false + + backend.ModelLoadTraceObserver(appConfig)(successEvent) + + Consistently(trace.GetBackendTraces, "100ms", 
"20ms").Should(BeEmpty()) + }) +}) diff --git a/core/backend/options.go b/core/backend/options.go index 528c10e52..9ae22dd22 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -19,6 +19,39 @@ import ( "github.com/mudler/xlog" ) +// ModelLoadTraceObserver returns the ModelLoader load observer that records +// a model_load backend trace for every successful real load (backend process +// spawn + LoadModel RPC; cache hits never reach the observer). Failures are +// deliberately skipped here: the modality wrappers already record them via +// recordModelLoadFailure with request context, and the backend auto-discovery +// scan probes several backends before one succeeds — tracing every probe +// failure would bury the buffer in noise. +// +// The traced data includes the resolved backend runtime (the installed +// backend's launcher path, which names the variant directory) — that is what +// identifies WHICH build served the load. A stale installed backend is +// invisible in the model config but obvious here. +func ModelLoadTraceObserver(appConfig *config.ApplicationConfig) func(model.BackendLoadEvent) { + return func(ev model.BackendLoadEvent) { + if ev.Err != nil || !appConfig.EnableTracing { + return + } + trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes) + trace.RecordBackendTrace(trace.BackendTrace{ + Timestamp: time.Now(), + Duration: ev.Duration, + Type: trace.BackendTraceModelLoad, + ModelName: ev.ModelID, + Backend: ev.Backend, + Summary: "Model loaded", + Data: map[string]any{ + "model_file": ev.ModelName, + "backend_runtime": ev.BackendURI, + }, + }) + } +} + // recordModelLoadFailure records a backend trace when model loading fails. 
func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, backend string, err error, data map[string]any) { if !appConfig.EnableTracing { diff --git a/core/backend/transcript.go b/core/backend/transcript.go index e6da923cc..211269160 100644 --- a/core/backend/transcript.go +++ b/core/backend/transcript.go @@ -181,6 +181,7 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR Text: r.Text, Language: r.Language, Duration: float64(r.Duration), + Eou: r.Eou, } for _, s := range r.Segments { diff --git a/core/backend/transcript_live.go b/core/backend/transcript_live.go new file mode 100644 index 000000000..956e7e717 --- /dev/null +++ b/core/backend/transcript_live.go @@ -0,0 +1,297 @@ +package backend + +import ( + "context" + "errors" + "fmt" + "io" + "maps" + "sync" + "time" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/core/trace" + grpcPkg "github.com/mudler/LocalAI/pkg/grpc" + "github.com/mudler/LocalAI/pkg/grpc/proto" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/sound" + "github.com/mudler/xlog" +) + +// LiveTranscriptionEvent is one streamed event from a live (bidirectional) +// transcription session. Delta/Eou/Eob/Words arrive as the user speaks; Final +// is set exactly once, on the terminal event after Close flushes the decode +// tail. Eou means the model judged the user yielded the turn; Eob means a +// backchannel ("uh-huh") ended — callers must NOT treat Eob as a turn +// boundary. +type LiveTranscriptionEvent struct { + Delta string + Eou bool + Eob bool + Words []schema.TranscriptionWord + Final *schema.TranscriptionResult +} + +// LiveTranscriptionSession is a handle on an open live transcription stream. +// Feed pushes 16 kHz mono float PCM; Close signals end-of-audio, waits for +// the backend's terminal Final event to be delivered, and releases the +// stream. 
+type LiveTranscriptionSession interface { + Feed(pcm []float32) error + Close() error +} + +// liveCloseDrainTimeout bounds how long Close waits for the backend to flush +// the decode tail before force-cancelling the stream. Finalize is one short +// engine call; seconds here means the backend is wedged. +const liveCloseDrainTimeout = 10 * time.Second + +type liveTranscriptionSession struct { + stream grpcPkg.AudioTranscriptionLiveClient + cancel context.CancelFunc + recvDone chan struct{} + recvErr error // written by the recv goroutine before recvDone closes + closeOnce sync.Once + closeErr error + trace *liveTraceState // nil when tracing was disabled at open +} + +func (s *liveTranscriptionSession) Feed(pcm []float32) error { + s.trace.addPCM(pcm) + return s.stream.Send(&proto.TranscriptLiveRequest{ + Payload: &proto.TranscriptLiveRequest_Audio{Audio: &proto.TranscriptLiveAudio{Pcm: pcm}}, + }) +} + +func (s *liveTranscriptionSession) Close() error { + s.closeOnce.Do(func() { + err := s.stream.CloseSend() + select { + case <-s.recvDone: + case <-time.After(liveCloseDrainTimeout): + xlog.Warn("live transcription: backend did not finalize in time; cancelling stream") + s.cancel() + <-s.recvDone + } + s.cancel() + if err == nil { + err = s.recvErr + } + s.closeErr = err + s.trace.record(err) + }) + return s.closeErr +} + +// liveSampleRate is the PCM rate of a live transcription session, fixed by +// the session config sent in ModelTranscriptionLive. +const liveSampleRate = 16000 + +// liveTraceState accumulates what the per-turn backend trace needs while a +// live session runs: a bounded copy of the fed PCM for the audio snippet, +// the decode outputs, and timing. One trace is recorded at Close — the live +// path never touches the unary transcription wrapper, so without this a +// streaming-only pipeline produced no transcription traces at all. Feed and +// the recv goroutine run concurrently; mu guards the accumulators. 
+type liveTraceState struct { + appConfig *config.ApplicationConfig + modelName string + backend string + language string + started time.Time + + mu sync.Mutex + pcm []byte // first trace.MaxSnippetSeconds of fed audio, int16 LE + fedSamples int // ALL samples fed, beyond the snippet cap + deltaEvents int + eouEvents int + eobEvents int + finalText string +} + +func newLiveTraceState(modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, language string) *liveTraceState { + if !appConfig.EnableTracing { + return nil + } + trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes) + return &liveTraceState{ + appConfig: appConfig, + modelName: modelConfig.Name, + backend: modelConfig.Backend, + language: language, + started: time.Now(), + } +} + +func (ts *liveTraceState) addPCM(pcm []float32) { + if ts == nil { + return + } + ts.mu.Lock() + defer ts.mu.Unlock() + ts.fedSamples += len(pcm) + maxBytes := trace.MaxSnippetSeconds * liveSampleRate * 2 + if room := (maxBytes - len(ts.pcm)) / 2; room > 0 { + if len(pcm) > room { + pcm = pcm[:room] + } + ts.pcm = append(ts.pcm, sound.Float32sToInt16LEBytes(pcm)...) 
+ } +} + +func (ts *liveTraceState) observe(ev LiveTranscriptionEvent) { + if ts == nil { + return + } + ts.mu.Lock() + defer ts.mu.Unlock() + if ev.Delta != "" { + ts.deltaEvents++ + } + if ev.Eou { + ts.eouEvents++ + } + if ev.Eob { + ts.eobEvents++ + } + if ev.Final != nil { + ts.finalText = ev.Final.Text + } +} + +func (ts *liveTraceState) record(closeErr error) { + if ts == nil || !ts.appConfig.EnableTracing { + return + } + ts.mu.Lock() + data := map[string]any{ + "source": "live_stream", + "language": ts.language, + "result_text": ts.finalText, + "eou_events": ts.eouEvents, + "eob_events": ts.eobEvents, + "delta_events": ts.deltaEvents, + } + if snippet := trace.AudioSnippetFromPCM(ts.pcm, liveSampleRate, ts.fedSamples*2, ts.appConfig.TracingMaxBodyBytes); snippet != nil { + maps.Copy(data, snippet) + } + summary := "live -> " + ts.finalText + ts.mu.Unlock() + + bt := trace.BackendTrace{ + Timestamp: ts.started, + Duration: time.Since(ts.started), + Type: trace.BackendTraceTranscription, + ModelName: ts.modelName, + Backend: ts.backend, + Summary: trace.TruncateString(summary, 200), + Data: data, + } + if closeErr != nil { + bt.Error = closeErr.Error() + } + trace.RecordBackendTrace(bt) +} + +// ModelTranscriptionLive loads the transcription backend, opens the +// bidirectional AudioTranscriptionLive RPC, sends the session config, and +// BLOCKS until the backend's ready ack. A grpcerrors. +// IsLiveTranscriptionUnsupported error means the backend (or the loaded +// model) cannot do live transcription and the caller should degrade to the +// unary/file path. After a successful return, onEvent is invoked from a +// background goroutine — in order, one event at a time — for every response +// the backend streams, ending with the Final event triggered by Close. 
+func ModelTranscriptionLive(ctx context.Context, language string, + ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, + onEvent func(LiveTranscriptionEvent)) (LiveTranscriptionSession, error) { + + transcriptionModel, err := loadTranscriptionModel(ctx, ml, modelConfig, appConfig) + if err != nil { + return nil, err + } + + // The derived cancel out-lives this call inside the session: Close uses + // it to unwind the stream (and, in embed mode, the server-side recv + // pump, which only stops on send-close or context cancellation). + streamCtx, cancel := context.WithCancel(ctx) + stream, err := transcriptionModel.AudioTranscriptionLive(streamCtx) + if err != nil { + cancel() + return nil, err + } + + fail := func(err error) (LiveTranscriptionSession, error) { + _ = stream.CloseSend() + cancel() + return nil, err + } + + if err := stream.Send(&proto.TranscriptLiveRequest{ + Payload: &proto.TranscriptLiveRequest_Config{Config: &proto.TranscriptLiveConfig{ + Language: language, + SampleRate: liveSampleRate, + }}, + }); err != nil { + return fail(err) + } + + // Ready-ack contract: the backend answers a successful open with a + // {ready:true} response before any transcript data; unsupported + // backends surface Unimplemented here instead. 
+ ack, err := stream.Recv() + if err != nil { + return fail(err) + } + if !ack.GetReady() { + return fail(fmt.Errorf("live transcription: backend %q broke the ready-ack contract (first response carried data)", modelConfig.Backend)) + } + + s := &liveTranscriptionSession{ + stream: stream, + cancel: cancel, + recvDone: make(chan struct{}), + trace: newLiveTraceState(modelConfig, appConfig, language), + } + + go func() { + defer close(s.recvDone) + for { + resp, err := stream.Recv() + if err != nil { + if !errors.Is(err, io.EOF) && streamCtx.Err() == nil { + xlog.Warn("live transcription stream ended unexpectedly", "error", err) + s.recvErr = err + } + return + } + ev := liveEventFromProto(resp) + if ev.Delta == "" && !ev.Eou && !ev.Eob && len(ev.Words) == 0 && ev.Final == nil { + continue // duplicate ready ack / keep-alive: nothing to deliver + } + s.trace.observe(ev) + onEvent(ev) + } + }() + + return s, nil +} + +func liveEventFromProto(r *proto.TranscriptLiveResponse) LiveTranscriptionEvent { + ev := LiveTranscriptionEvent{ + Delta: r.GetDelta(), + Eou: r.GetEou(), + Eob: r.GetEob(), + } + for _, w := range r.GetWords() { + ev.Words = append(ev.Words, schema.TranscriptionWord{ + Start: time.Duration(w.Start), + End: time.Duration(w.End), + Text: w.Text, + }) + } + if r.GetFinalResult() != nil { + ev.Final = transcriptResultFromProto(r.GetFinalResult()) + } + return ev +} diff --git a/core/backend/transcript_live_internal_test.go b/core/backend/transcript_live_internal_test.go new file mode 100644 index 000000000..cbd7fac54 --- /dev/null +++ b/core/backend/transcript_live_internal_test.go @@ -0,0 +1,162 @@ +package backend + +import ( + "errors" + "time" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/core/trace" + "github.com/mudler/LocalAI/pkg/grpc/proto" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("liveEventFromProto", func() { + It("maps deltas, eou flags and words (ns -> duration)", func() { + ev := liveEventFromProto(&proto.TranscriptLiveResponse{ + Delta: "hello ", + Eou: true, + Words: []*proto.TranscriptWord{ + {Start: int64(100 * time.Millisecond), End: int64(400 * time.Millisecond), Text: "hello"}, + }, + }) + Expect(ev.Delta).To(Equal("hello ")) + Expect(ev.Eou).To(BeTrue()) + Expect(ev.Words).To(HaveLen(1)) + Expect(ev.Words[0].Text).To(Equal("hello")) + Expect(ev.Words[0].Start).To(Equal(100 * time.Millisecond)) + Expect(ev.Words[0].End).To(Equal(400 * time.Millisecond)) + Expect(ev.Final).To(BeNil()) + }) + + It("maps the terminal final result including the eou flag", func() { + ev := liveEventFromProto(&proto.TranscriptLiveResponse{ + FinalResult: &proto.TranscriptResult{ + Text: "hello world", + Duration: 1.5, + Eou: true, + Segments: []*proto.TranscriptSegment{{Id: 0, Text: "hello world"}}, + }, + }) + Expect(ev.Final).NotTo(BeNil()) + Expect(ev.Final.Text).To(Equal("hello world")) + Expect(ev.Final.Duration).To(BeNumerically("~", 1.5, 1e-6)) + Expect(ev.Final.Eou).To(BeTrue()) + Expect(ev.Final.Segments).To(HaveLen(1)) + }) + + It("yields an empty event for a bare ready ack (filtered by the recv loop)", func() { + ev := liveEventFromProto(&proto.TranscriptLiveResponse{Ready: true}) + Expect(ev.Delta).To(BeEmpty()) + Expect(ev.Eou).To(BeFalse()) + Expect(ev.Words).To(BeEmpty()) + Expect(ev.Final).To(BeNil()) + }) + + It("maps the eob backchannel flag separately from eou", func() { + ev := liveEventFromProto(&proto.TranscriptLiveResponse{Delta: "uh-huh", Eob: true}) + Expect(ev.Eob).To(BeTrue()) + Expect(ev.Eou).To(BeFalse()) + }) +}) + +// liveTraceState is what makes streaming-only pipelines visible on the +// Traces page: without it a semantic_vad session with retranscribe off +// produced no transcription trace at all. One trace per session (= one per +// realtime turn), recorded at Close. 
+var _ = Describe("liveTraceState", func() { + var appConfig *config.ApplicationConfig + + BeforeEach(func() { + appConfig = &config.ApplicationConfig{ + EnableTracing: true, + TracingMaxItems: 64, + } + trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes) + trace.ClearBackendTraces() + }) + + modelCfg := func() config.ModelConfig { + cfg := config.ModelConfig{Backend: "parakeet-cpp"} + cfg.Name = "parakeet-live" + return cfg + } + + It("is disabled (nil) when tracing is off, and nil receivers are no-ops", func() { + appConfig.EnableTracing = false + ts := newLiveTraceState(modelCfg(), appConfig, "en") + Expect(ts).To(BeNil()) + + // The session calls these unconditionally; nil must be safe. + ts.addPCM([]float32{0.5}) + ts.observe(LiveTranscriptionEvent{Eou: true}) + ts.record(nil) + Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty()) + }) + + It("records one transcription trace with text, eou event counts and audio snippet at Close", func() { + ts := newLiveTraceState(modelCfg(), appConfig, "en") + Expect(ts).NotTo(BeNil()) + + // One second of a loud-ish constant tone so the snippet has signal. 
+ pcm := make([]float32, liveSampleRate) + for i := range pcm { + pcm[i] = 0.25 + } + ts.addPCM(pcm) + ts.observe(LiveTranscriptionEvent{Delta: "hello "}) + ts.observe(LiveTranscriptionEvent{Delta: "world", Eou: true}) + ts.observe(LiveTranscriptionEvent{Final: &schema.TranscriptionResult{Text: "hello world", Eou: true}}) + + ts.record(nil) + + Eventually(trace.GetBackendTraces).Should(HaveLen(1)) + got := trace.GetBackendTraces()[0] + Expect(got.Type).To(Equal(trace.BackendTraceTranscription)) + Expect(got.ModelName).To(Equal("parakeet-live")) + Expect(got.Backend).To(Equal("parakeet-cpp")) + Expect(got.Summary).To(ContainSubstring("hello world")) + Expect(got.Data["source"]).To(Equal("live_stream")) + Expect(got.Data["result_text"]).To(Equal("hello world")) + // The live FinalResult no longer carries a terminal eou flag; the + // per-feed eou_events count is what the trace records instead. + Expect(got.Data).NotTo(HaveKey("eou")) + Expect(got.Data["eou_events"]).To(Equal(1)) + Expect(got.Data["delta_events"]).To(Equal(2)) + Expect(got.Data["audio_duration_s"]).To(BeNumerically("~", 1.0, 0.01)) + Expect(got.Data["audio_wav_base64"]).NotTo(BeEmpty()) + Expect(got.Error).To(BeEmpty()) + }) + + It("caps the stored snippet but keeps counting the full fed duration", func() { + ts := newLiveTraceState(modelCfg(), appConfig, "") + + // Feed past the snippet cap in two chunks (cap + one extra second). 
+ ts.addPCM(make([]float32, trace.MaxSnippetSeconds*liveSampleRate)) + ts.addPCM(make([]float32, liveSampleRate)) + + Expect(len(ts.pcm)).To(Equal(trace.MaxSnippetSeconds * liveSampleRate * 2)) + Expect(ts.fedSamples).To(Equal((trace.MaxSnippetSeconds + 1) * liveSampleRate)) + + ts.record(nil) + Eventually(trace.GetBackendTraces).Should(HaveLen(1)) + got := trace.GetBackendTraces()[0] + Expect(got.Data["audio_duration_s"]).To(BeNumerically("~", float64(trace.MaxSnippetSeconds+1), 0.01)) + Expect(got.Data["audio_snippet_s"]).To(BeNumerically("~", float64(trace.MaxSnippetSeconds), 0.01)) + }) + + It("clamps out-of-range float samples instead of wrapping", func() { + ts := newLiveTraceState(modelCfg(), appConfig, "") + ts.addPCM([]float32{2.0, -2.0}) + Expect(ts.pcm).To(Equal([]byte{0xff, 0x7f, 0x00, 0x80})) // 32767, -32768 + }) + + It("stamps the close error on the trace", func() { + ts := newLiveTraceState(modelCfg(), appConfig, "") + ts.record(errors.New("stream torn down")) + + Eventually(trace.GetBackendTraces).Should(HaveLen(1)) + Expect(trace.GetBackendTraces()[0].Error).To(Equal("stream torn down")) + }) +}) diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go index 3476076e1..b8200cd41 100644 --- a/core/config/meta/registry.go +++ b/core/config/meta/registry.go @@ -567,6 +567,38 @@ func DefaultRegistry() map[string]FieldMetaOverride { Advanced: true, Order: 83, }, + "pipeline.turn_detection.type": { + Section: "pipeline", + Label: "Turn Detection", + Description: "Default turn-detection mode for realtime sessions on this pipeline. server_vad commits after a fixed silence window; semantic_vad lets the transcription model's end-of-utterance token drive a dynamic window (fast commit after the token, long eagerness fallback without it). semantic_vad requires a streaming-EOU transcription model (e.g. parakeet-cpp-realtime_eou_120m-v1) and degrades to silence-only otherwise. 
Clients can override per session via session.update.", + Component: "select", + Options: []FieldOption{ + {Value: "", Label: "Default (server_vad)"}, + {Value: "server_vad", Label: "server_vad (silence-based)"}, + {Value: "semantic_vad", Label: "semantic_vad (end-of-utterance token)"}, + }, + Order: 87, + }, + "pipeline.turn_detection.eagerness": { + Section: "pipeline", + Label: "Eagerness", + Description: "semantic_vad fallback silence window used when no end-of-utterance token was seen: low waits 8s, medium/auto 4s, high 2s.", + Component: "select", + Options: []FieldOption{ + {Value: "", Label: "Default (auto)"}, + {Value: "low", Label: "low (8s)"}, + {Value: "medium", Label: "medium (4s)"}, + {Value: "high", Label: "high (2s)"}, + }, + Order: 88, + }, + "pipeline.turn_detection.retranscribe": { + Section: "pipeline", + Label: "Retranscribe on Commit", + Description: "Cross-check every semantic_vad commit with an offline decode of the buffered turn: commit only proceeds when the batch decode also ends in the end-of-utterance token, and its transcript is used. Logs a streamed-vs-batch comparison — useful to gauge streaming/batch alignment — at the cost of one extra decode per turn.", + Component: "toggle", + Order: 89, + }, // --- Functions --- "function.grammar.parallel_calls": { diff --git a/core/config/model_config.go b/core/config/model_config.go index 2d1e18cc7..69dda331b 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -650,6 +650,12 @@ type Pipeline struct { // VoiceRecognition gates the pipeline behind speaker verification. Nil // (block absent) means no gate, preserving existing behavior. VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"` + + // TurnDetection sets the server-side default turn-detection mode for + // realtime sessions on this pipeline, so clients need no session.update + // to benefit. 
A client session.update still overrides type and eagerness + // per session; retranscribe is server-side only. Unset keeps server_vad. + TurnDetection PipelineTurnDetection `yaml:"turn_detection,omitempty" json:"turn_detection,omitempty"` } // PipelineCompaction configures summarize-then-drop for a realtime pipeline. @@ -934,6 +940,38 @@ func (v PipelineVoiceRecognition) Validate(registryAvailable bool) error { return nil } +// @Description PipelineTurnDetection sets realtime turn-detection defaults. +type PipelineTurnDetection struct { + // Type selects the default turn_detection mode for sessions on this + // pipeline: "server_vad" (silence-based) or "semantic_vad" (the + // transcription model's end-of-utterance token drives a dynamic silence + // window; needs a streaming-EOU transcription model such as + // parakeet_realtime_eou_120m-v1, degrades to silence-only otherwise). + Type string `yaml:"type,omitempty" json:"type,omitempty"` + // Eagerness is the semantic_vad fallback when no end-of-utterance token + // was seen: low waits 8s of silence, medium/auto 4s, high 2s. + Eagerness string `yaml:"eagerness,omitempty" json:"eagerness,omitempty"` + // Retranscribe (semantic_vad only) cross-checks every EOU-triggered + // commit with an offline decode of the buffered turn: the commit only + // proceeds when the batch decode also ends in the end-of-utterance token, + // and its transcript is the one used. The streamed and batch transcripts + // are compared in the logs — a diagnostic for streaming/batch alignment + // at the cost of one extra decode per turn. + Retranscribe *bool `yaml:"retranscribe,omitempty" json:"retranscribe,omitempty"` +} + +// TurnDetectionSemantic reports whether this pipeline defaults sessions to +// semantic (EOU-driven) turn detection. 
+func (p Pipeline) TurnDetectionSemantic() bool { + return strings.EqualFold(strings.TrimSpace(p.TurnDetection.Type), "semantic_vad") +} + +// TurnDetectionRetranscribe reports whether semantic_vad commits should be +// cross-checked (and transcribed) by an offline decode of the buffered turn. +func (p Pipeline) TurnDetectionRetranscribe() bool { + return p.TurnDetection.Retranscribe != nil && *p.TurnDetection.Retranscribe +} + // @Description File configuration for model downloads type File struct { Filename string `yaml:"filename,omitempty" json:"filename,omitempty"` diff --git a/core/config/pipeline_turn_detection_test.go b/core/config/pipeline_turn_detection_test.go new file mode 100644 index 000000000..d2b11a115 --- /dev/null +++ b/core/config/pipeline_turn_detection_test.go @@ -0,0 +1,61 @@ +package config + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "gopkg.in/yaml.v3" +) + +// pipeline.turn_detection sets the server-side default turn-detection mode +// for realtime sessions. Unset keeps server_vad, so existing configs are +// unaffected; retranscribe is opt-in. 
+var _ = Describe("Pipeline turn_detection config", func() { + It("defaults to non-semantic with retranscribe off when unset", func() { + var p Pipeline + Expect(p.TurnDetectionSemantic()).To(BeFalse()) + Expect(p.TurnDetectionRetranscribe()).To(BeFalse()) + }) + + It("parses the nested turn_detection block from YAML", func() { + var c ModelConfig + err := yaml.Unmarshal([]byte(` +name: gpt-realtime +pipeline: + transcription: parakeet-cpp-realtime_eou_120m-v1 + turn_detection: + type: semantic_vad + eagerness: high + retranscribe: true +`), &c) + Expect(err).ToNot(HaveOccurred()) + Expect(c.Pipeline.TurnDetectionSemantic()).To(BeTrue()) + Expect(c.Pipeline.TurnDetection.Eagerness).To(Equal("high")) + Expect(c.Pipeline.TurnDetectionRetranscribe()).To(BeTrue()) + }) + + It("treats server_vad and unknown types as non-semantic", func() { + var p Pipeline + p.TurnDetection.Type = "server_vad" + Expect(p.TurnDetectionSemantic()).To(BeFalse()) + p.TurnDetection.Type = "something_else" + Expect(p.TurnDetectionSemantic()).To(BeFalse()) + }) + + It("matches semantic_vad case-insensitively with surrounding space", func() { + var p Pipeline + p.TurnDetection.Type = " Semantic_VAD " + Expect(p.TurnDetectionSemantic()).To(BeTrue()) + }) + + It("treats an explicit retranscribe false as off", func() { + var c ModelConfig + err := yaml.Unmarshal([]byte(` +pipeline: + turn_detection: + type: semantic_vad + retranscribe: false +`), &c) + Expect(err).ToNot(HaveOccurred()) + Expect(c.Pipeline.TurnDetectionRetranscribe()).To(BeFalse()) + }) +}) diff --git a/core/http/endpoints/openai/compactcoord/compactcoord.go b/core/http/endpoints/openai/compactcoord/compactcoord.go new file mode 100644 index 000000000..62b7156ed --- /dev/null +++ b/core/http/endpoints/openai/compactcoord/compactcoord.go @@ -0,0 +1,149 @@ +// Package compactcoord is the explicit state machine for the realtime API's +// conversation-compaction concern (machine "M4" in +// docs/design/realtime-state-machines.md). 
+// +// In the legacy code this machine is an implicit single-flight guard: a +// per-conversation `compacting atomic.Bool` that maybeCompact CAS-flips to start +// a background summarize+evict and a deferred Store(false) clears. The intent — +// at most one compaction running per conversation at a time, so two goroutines +// never summarize and evict the same overflow concurrently (Part 4, invariant +// #9) — is correct but implicit in a bare atomic. +// +// This package makes it explicit: +// - a sealed sum type for State (Idle | Running | Terminated) — "two compactions running" is +// unrepresentable, +// - a total, pure transition function Next(state, event) -> (state, effects), +// - a single-writer Coordinator that serializes every transition. +// +// Unlike respcoord (M3), a Trigger while Running is NOT a supersede: compaction +// is idempotent work on the same overflow, so a concurrent trigger is simply +// dropped (matching the legacy CAS-fails-so-skip), not queued or restarted. +package compactcoord + +import ( + "fmt" + + "github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator" +) + +// State is the sealed sum type of compaction states. Exhaustively: +// Idle | Running | Terminated. +type State interface { + isState() + String() string +} + +// Idle: no compaction is running. +type Idle struct{} + +// Running: exactly one compaction is in flight. +type Running struct{} + +// Terminated: the conversation/session is torn down. Absorbing — no compaction +// can start from here, so the M1 (connection) parent's teardown can cancel + +// join the in-flight compaction and guarantee none outlives the session (see +// formal-verification/session_lifecycle.fizz). This closes the legacy gap where +// the fire-and-forget compaction goroutine could outlive the session. 
+type Terminated struct{} + +func (Idle) isState() {} +func (Running) isState() {} +func (Terminated) isState() {} + +func (Idle) String() string { return "Idle" } +func (Running) String() string { return "Running" } +func (Terminated) String() string { return "Terminated" } + +// Event is the sealed sum type of inputs. Exhaustively: +// Trigger | Finished | Shutdown. +type Event interface { + isEvent() + String() string +} + +// Trigger requests a compaction (the live buffer grew past the trigger). It +// starts one only when Idle; while Running it is a no-op (single-flight). +type Trigger struct{} + +// Finished reports that the running compaction goroutine finished (success, error, or +// timeout — it always reports Finished so the flag can never stick). +type Finished struct{} + +// Shutdown terminates the coordinator at teardown: the in-flight compaction is +// cancelled + joined by the sink, and no compaction can start afterwards. +type Shutdown struct{} + +func (Trigger) isEvent() {} +func (Finished) isEvent() {} +func (Shutdown) isEvent() {} + +func (Trigger) String() string { return "Trigger" } +func (Finished) String() string { return "Finished" } +func (Shutdown) String() string { return "Shutdown" } + +// Effect is a side effect returned by Next as data. Exhaustively: StartCompaction. +type Effect interface { + isEffect() + String() string +} + +// StartCompaction: spawn the background summarize+evict goroutine. +type StartCompaction struct{} + +func (StartCompaction) isEffect() {} + +func (StartCompaction) String() string { return "StartCompaction" } + +// Next is the total, pure transition function. For every (state, event) it +// returns the next state and the ordered effects. It returns a non-nil error +// only for an unknown State/Event implementation. Every in-domain pair is +// defined; there are no forbidden transitions, only no-ops. 
+// +// Single-flight crux: StartCompaction is emitted only on Idle+Trigger, and a +// Trigger while Running is a no-op — so at most one compaction runs at a time. +func Next(s State, e Event) (State, []Effect, error) { + switch s.(type) { + case Idle: + switch e.(type) { + case Trigger: + return Running{}, []Effect{StartCompaction{}}, nil + case Finished: + // No compaction to finish: stale/idempotent no-op. + return Idle{}, nil, nil + case Shutdown: + return Terminated{}, nil, nil + } + case Running: + switch e.(type) { + case Trigger: + // Already compacting: drop (single-flight). + return Running{}, nil, nil + case Finished: + return Idle{}, nil, nil + case Shutdown: + // Teardown while compacting: the sink cancels + joins the goroutine, + // so its later Finished is absorbed by the Terminated case below. + return Terminated{}, nil, nil + } + case Terminated: + // Absorbing: a Trigger after teardown is rejected (no StartCompaction), so + // no compaction outlives the session. + switch e.(type) { + case Trigger, Finished, Shutdown: + return Terminated{}, nil, nil + } + } + return s, nil, fmt.Errorf("compactcoord: unhandled transition %s <- %s", s, e) +} + +// EffectSink performs the effects produced by a transition. See coordinator.Sink: +// StartCompaction spawns a goroutine, so Perform does not block under the lock. +type EffectSink = coordinator.Sink[Effect] + +// Coordinator serializes the compaction transitions. See coordinator.Coordinator. +type Coordinator = coordinator.Coordinator[State, Event, Effect] + +// New returns an idle Coordinator that performs effects via sink. 
+func New(sink EffectSink) *Coordinator { + return coordinator.New[State, Event, Effect](Idle{}, Next, sink) +} diff --git a/core/http/endpoints/openai/compactcoord/compactcoord_suite_test.go b/core/http/endpoints/openai/compactcoord/compactcoord_suite_test.go new file mode 100644 index 000000000..0dae15f80 --- /dev/null +++ b/core/http/endpoints/openai/compactcoord/compactcoord_suite_test.go @@ -0,0 +1,13 @@ +package compactcoord + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestCompactcoord(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "compactcoord (realtime M4) Suite") +} diff --git a/core/http/endpoints/openai/compactcoord/compactcoord_test.go b/core/http/endpoints/openai/compactcoord/compactcoord_test.go new file mode 100644 index 000000000..caba28ecd --- /dev/null +++ b/core/http/endpoints/openai/compactcoord/compactcoord_test.go @@ -0,0 +1,202 @@ +package compactcoord + +import ( + "math/rand/v2" + "sync" + "sync/atomic" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// recordingSink captures the ordered stream of effects. Perform is called under +// the coordinator lock; the mutex here guards reads from the spec goroutine. 
+type recordingSink struct { + mu sync.Mutex + log []Effect +} + +func (s *recordingSink) Perform(e Effect) { + s.mu.Lock() + s.log = append(s.log, e) + s.mu.Unlock() +} + +func (s *recordingSink) count() int { + s.mu.Lock() + defer s.mu.Unlock() + return len(s.log) +} + +type unknownEvent struct{} + +func (unknownEvent) isEvent() {} +func (unknownEvent) String() string { return "unknownEvent" } + +type unknownState struct{} + +func (unknownState) isState() {} +func (unknownState) String() string { return "unknownState" } + +var _ = Describe("compactcoord.Next", func() { + DescribeTable("transitions", + func(state State, event Event, wantState State, wantEff []Effect) { + gotState, gotEff, err := Next(state, event) + Expect(err).NotTo(HaveOccurred()) + Expect(gotState).To(Equal(wantState)) + Expect(gotEff).To(Equal(wantEff)) + }, + Entry("idle+trigger -> running: start", + Idle{}, Trigger{}, Running{}, []Effect{StartCompaction{}}), + Entry("idle+finished -> idle, no-op (stale)", + Idle{}, Finished{}, Idle{}, []Effect(nil)), + Entry("running+trigger -> running, no-op (single-flight)", + Running{}, Trigger{}, Running{}, []Effect(nil)), + Entry("running+finished -> idle", + Running{}, Finished{}, Idle{}, []Effect(nil)), + Entry("idle+shutdown -> terminated", + Idle{}, Shutdown{}, Terminated{}, []Effect(nil)), + Entry("running+shutdown -> terminated", + Running{}, Shutdown{}, Terminated{}, []Effect(nil)), + Entry("terminated+trigger -> terminated, REJECTED", + Terminated{}, Trigger{}, Terminated{}, []Effect(nil)), + Entry("terminated+finished -> terminated, no-op (stale)", + Terminated{}, Finished{}, Terminated{}, []Effect(nil)), + Entry("terminated+shutdown -> terminated, idempotent", + Terminated{}, Shutdown{}, Terminated{}, []Effect(nil)), + ) + + It("is total over the defined (state, event) pairs", func() { + for _, s := range []State{Idle{}, Running{}, Terminated{}} { + for _, e := range []Event{Trigger{}, Finished{}, Shutdown{}} { + _, _, err := Next(s, e) + 
Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e) + } + } + }) + + It("errors on an unknown event type", func() { + _, _, err := Next(Idle{}, unknownEvent{}) + Expect(err).To(HaveOccurred()) + }) + + It("errors on an unknown state type", func() { + _, _, err := Next(unknownState{}, Trigger{}) + Expect(err).To(HaveOccurred()) + }) +}) + +var _ = Describe("compactcoord.Coordinator", func() { + // A StartCompaction is only ever produced while Idle (verified by checking the + // effect count grows exactly when the model transitions Idle->Running), so at + // most one compaction is ever in flight. + It("starts at most one compaction at a time over random sequences", func() { + seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE} + for _, seed := range seeds { + r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5)) + sink := &recordingSink{} + c := New(sink) + running := false + starts := 0 + + for range 5000 { + if r.IntN(2) == 0 { + before := sink.count() + Expect(c.Apply(Trigger{})).To(Succeed()) + if sink.count() > before { + // A StartCompaction was produced: must have been Idle. + Expect(running).To(BeFalse(), "seed=%d: started while already running", seed) + running = true + starts++ + } + } else { + Expect(c.Apply(Finished{})).To(Succeed()) + running = false + } + if running { + Expect(c.State()).To(Equal(State(Running{})), "seed=%d", seed) + } else { + Expect(c.State()).To(Equal(State(Idle{})), "seed=%d", seed) + } + } + Expect(starts).To(BeNumerically(">", 0), "seed=%d: walk should have started at least one", seed) + } + }) + + // Faithful concurrent test: StartCompaction spawns "work" that bumps an active + // counter, runs, and reports Finished back to the coordinator (exactly how the + // real sink behaves). Single-flight must hold even under many concurrent + // Triggers: the active counter never exceeds 1. Run under -race. 
+ It("never runs two compactions concurrently", func() { + var active, maxActive int32 + var c *Coordinator + var work sync.WaitGroup + sink := &spawnSink{onStart: func() { + work.Add(1) + go func() { + defer work.Done() + n := atomic.AddInt32(&active, 1) + for { + m := atomic.LoadInt32(&maxActive) + if n <= m || atomic.CompareAndSwapInt32(&maxActive, m, n) { + break + } + } + atomic.AddInt32(&active, -1) + _ = c.Apply(Finished{}) + }() + }} + c = New(sink) + + var wg sync.WaitGroup + for g := 0; g < 8; g++ { + wg.Add(1) + go func() { + defer wg.Done() + for range 1000 { + _ = c.Apply(Trigger{}) + } + }() + } + wg.Wait() + work.Wait() // let any in-flight compaction report Finished + + Expect(atomic.LoadInt32(&maxActive)).To(BeNumerically("<=", 1)) + Expect(c.State()).To(Equal(State(Idle{}))) + }) + + It("terminates on shutdown and rejects later triggers", func() { + sink := &recordingSink{} + c := New(sink) + Expect(c.Apply(Trigger{})).To(Succeed()) // Idle -> Running (StartCompaction) + Expect(c.Apply(Shutdown{})).To(Succeed()) + Expect(c.State()).To(Equal(State(Terminated{}))) + + before := sink.count() + Expect(c.Apply(Trigger{})).To(Succeed()) // rejected + Expect(sink.count()).To(Equal(before), "no StartCompaction after shutdown") + Expect(c.Apply(Finished{})).To(Succeed()) // stale, absorbed + Expect(c.State()).To(Equal(State(Terminated{}))) + }) +}) + +// spawnSink invokes onStart for each StartCompaction (called under the coord lock; +// onStart must be non-blocking — it spawns the work goroutine). 
+type spawnSink struct{ onStart func() } + +func (s *spawnSink) Perform(e Effect) { + if _, ok := e.(StartCompaction); ok { + s.onStart() + } +} + +var _ = DescribeTable("compactcoord stringers", + func(got, want string) { Expect(got).To(Equal(want)) }, + Entry(nil, Idle{}.String(), "Idle"), + Entry(nil, Running{}.String(), "Running"), + Entry(nil, Terminated{}.String(), "Terminated"), + Entry(nil, Trigger{}.String(), "Trigger"), + Entry(nil, Finished{}.String(), "Finished"), + Entry(nil, Shutdown{}.String(), "Shutdown"), + Entry(nil, StartCompaction{}.String(), "StartCompaction"), +) diff --git a/core/http/endpoints/openai/conncoord/conncoord.go b/core/http/endpoints/openai/conncoord/conncoord.go new file mode 100644 index 000000000..f6e7e0e03 --- /dev/null +++ b/core/http/endpoints/openai/conncoord/conncoord.go @@ -0,0 +1,164 @@ +// Package conncoord is the explicit state machine for the realtime API's +// connection lifecycle (machine "M1" in docs/design/realtime-state-machines.md). +// +// In the legacy code this machine is implicit and fragile. The session handler +// keeps a `vadServerStarted` bool plus a `done` channel that is REASSIGNED to a +// fresh channel every time turn detection is toggled on (session.update) and +// closed both at toggle-off and at teardown (Part 2, failure mode 6). It is +// correct today only because one goroutine owns it; "one variable name meaning +// different channels over time, closed from two sites guarded by a bool" is a +// structural hazard, not an explicit lifecycle. Teardown likewise depends on the +// bool to avoid closing an already-closed channel. +// +// This package makes the lifecycle explicit: +// - a sealed sum type for State (Live{VADRunning} | Torn) — illegal states +// such as "running after teardown" are unrepresentable, +// - a total, pure transition function Next(state, event) -> (state, effects), +// - a single-writer Coordinator that serializes every transition. 
+// +// The guarantees the spec checks: +// - the VAD goroutine's done channel is closed exactly once per start (StopVAD +// is emitted only while running, so never a double close / close of nil), +// - teardown runs exactly once (Close from Live; any later Close is a no-op), +// - nothing is started after teardown (no resurrection / no send-after-close). +// +// Like turncoord (M2), the connection machine is driven by the single session +// goroutine; the Coordinator's lock keeps State() race-free and guards against a +// future second writer. The effects are performed by a sink that owns the actual +// channels/goroutines (see realtime_conncoord.go). +package conncoord + +import ( + "fmt" + + "github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator" +) + +// State is the sealed sum type of connection states. The only implementations +// are the marker-method structs in this file. Exhaustively: Live | Torn. +type State interface { + isState() + String() string +} + +// Live: the session is active. VADRunning records whether the turn-detection +// (handleVAD) goroutine is currently running — the single source of truth that +// replaces the legacy vadServerStarted bool, so the per-run done channel is +// closed exactly once. +type Live struct{ VADRunning bool } + +// Torn: the session has been torn down. Terminal — no effect is ever produced +// from here again. +type Torn struct{} + +func (Live) isState() {} +func (Torn) isState() {} + +func (s Live) String() string { return fmt.Sprintf("Live(vad=%t)", s.VADRunning) } +func (Torn) String() string { return "Torn" } + +// Event is the sealed sum type of inputs. Exhaustively: SetVAD | Close. +type Event interface { + isEvent() + String() string +} + +// SetVAD requests the turn-detection goroutine be running (Active) or not. It is +// raised whenever session.update changes whether turn detection is active. It is +// idempotent: setting the state it is already in is a no-op. 
+type SetVAD struct{ Active bool } + +// Close requests teardown (the transport read loop ended, or the session is +// closing). It is idempotent — only the first Close from Live tears down. +type Close struct{} + +func (SetVAD) isEvent() {} +func (Close) isEvent() {} + +func (e SetVAD) String() string { return fmt.Sprintf("SetVAD(%t)", e.Active) } +func (Close) String() string { return "Close" } + +// Effect is a side effect returned by Next as data for the caller to perform. +// Exhaustively: StartVAD | StopVAD | Teardown. +type Effect interface { + isEffect() + String() string +} + +// StartVAD: create a fresh done channel and spawn the handleVAD goroutine on it. +type StartVAD struct{} + +// StopVAD: close the running VAD goroutine's done channel (signal it to exit). +type StopVAD struct{} + +// Teardown: the once-only teardown — stop the remaining input goroutines (opus +// decode, sound window), join them, cancel in-flight responses, and remove the +// session from the registry. Emitted at most once: only by the first Close from Live. +type Teardown struct{} + +func (StartVAD) isEffect() {} +func (StopVAD) isEffect() {} +func (Teardown) isEffect() {} + +func (StartVAD) String() string { return "StartVAD" } +func (StopVAD) String() string { return "StopVAD" } +func (Teardown) String() string { return "Teardown" } + +// Next is the total, pure transition function. For every (state, event) it +// returns the next state and the ordered effects to perform. It returns a +// non-nil error only for an unknown State/Event implementation. Every in-domain +// pair is defined; there are no forbidden transitions, only no-ops. +// +// The crux: Close moves to Torn, which absorbs every later event with no +// effects. So teardown's channel closes happen exactly once even if Close is +// raised again (e.g. an error path and the normal return both reaching it), and +// no StartVAD can resurrect a torn session. 
+func Next(s State, e Event) (State, []Effect, error) { + switch st := s.(type) { + case Live: + switch ev := e.(type) { + case SetVAD: + switch { + case ev.Active && !st.VADRunning: + return Live{VADRunning: true}, []Effect{StartVAD{}}, nil + case !ev.Active && st.VADRunning: + return Live{VADRunning: false}, []Effect{StopVAD{}}, nil + default: + // Already in the requested state: idempotent no-op. + return Live{VADRunning: st.VADRunning}, nil, nil + } + case Close: + if st.VADRunning { + return Torn{}, []Effect{StopVAD{}, Teardown{}}, nil + } + return Torn{}, []Effect{Teardown{}}, nil + } + case Torn: + switch e.(type) { + case SetVAD: + // No resurrection: a toggle after teardown is ignored. + return Torn{}, nil, nil + case Close: + // Idempotent: teardown already ran. + return Torn{}, nil, nil + } + } + return s, nil, fmt.Errorf("conncoord: unhandled transition %s <- %s", s, e) +} + +// EffectSink performs the effects produced by a transition. See coordinator.Sink: +// Perform runs under the coordinator lock. The Teardown effect does join +// goroutines (which can block) — acceptable here because the connection +// coordinator is single-writer and torn down exactly once at the end of the +// session goroutine, so no other Apply is contending the lock. +type EffectSink = coordinator.Sink[Effect] + +// Coordinator serializes the connection-lifecycle transitions. +// See coordinator.Coordinator. +type Coordinator = coordinator.Coordinator[State, Event, Effect] + +// New returns a Coordinator in Live{VADRunning:false} that performs effects via +// sink. 
+func New(sink EffectSink) *Coordinator { + return coordinator.New[State, Event, Effect](Live{VADRunning: false}, Next, sink) +} diff --git a/core/http/endpoints/openai/conncoord/conncoord_suite_test.go b/core/http/endpoints/openai/conncoord/conncoord_suite_test.go new file mode 100644 index 000000000..3344a2355 --- /dev/null +++ b/core/http/endpoints/openai/conncoord/conncoord_suite_test.go @@ -0,0 +1,13 @@ +package conncoord + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestConncoord(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "conncoord (realtime M1) Suite") +} diff --git a/core/http/endpoints/openai/conncoord/conncoord_test.go b/core/http/endpoints/openai/conncoord/conncoord_test.go new file mode 100644 index 000000000..8fb3c5051 --- /dev/null +++ b/core/http/endpoints/openai/conncoord/conncoord_test.go @@ -0,0 +1,212 @@ +package conncoord + +import ( + "math/rand/v2" + "sync" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// recordingSink captures the ordered stream of effects so the invariants can be +// checked independently of the transition function. Perform is called by +// Coordinator.Apply under the coordinator lock; the mutex here only guards reads +// from the spec goroutine. 
+type recordingSink struct { + mu sync.Mutex + log []Effect +} + +func (s *recordingSink) Perform(e Effect) { + s.mu.Lock() + s.log = append(s.log, e) + s.mu.Unlock() +} + +func (s *recordingSink) snapshot() []Effect { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]Effect, len(s.log)) + copy(out, s.log) + return out +} + +// checkLog replays the effect log and asserts the lifecycle safety properties +// from docs/design/realtime-state-machines.md, Part 4 (invariants #8, #10 and +// failure mode 6): +// +// (1) the VAD done channel is closed exactly once per start -- StartVAD only +// while stopped, StopVAD only while running (no double close / close-of-nil); +// (2) teardown runs at most once; +// (3) no resurrection -- no StartVAD after Teardown. +func checkLog(log []Effect) { + running := false + torn := false + teardowns := 0 + for i, eff := range log { + switch eff.(type) { + case StartVAD: + Expect(torn).To(BeFalse(), "invariant (3): StartVAD after teardown (effect #%d)\nlog=%v", i, log) + Expect(running).To(BeFalse(), "invariant (1): StartVAD while already running (effect #%d)\nlog=%v", i, log) + running = true + case StopVAD: + Expect(running).To(BeTrue(), "invariant (1): StopVAD while not running (effect #%d)\nlog=%v", i, log) + running = false + case Teardown: + Expect(torn).To(BeFalse(), "invariant (2): Teardown twice (effect #%d)\nlog=%v", i, log) + torn = true + teardowns++ + } + } + Expect(teardowns).To(BeNumerically("<=", 1), "invariant (2): teardown ran %d times\nlog=%v", teardowns, log) +} + +type unknownEvent struct{} + +func (unknownEvent) isEvent() {} +func (unknownEvent) String() string { return "unknownEvent" } + +type unknownState struct{} + +func (unknownState) isState() {} +func (unknownState) String() string { return "unknownState" } + +var _ = Describe("conncoord.Next", func() { + DescribeTable("transitions", + func(state State, event Event, wantState State, wantEff []Effect) { + gotState, gotEff, err := Next(state, event) + 
Expect(err).NotTo(HaveOccurred()) + Expect(gotState).To(Equal(wantState)) + Expect(gotEff).To(Equal(wantEff)) + }, + Entry("stopped+setvad(on) -> running: start", + Live{VADRunning: false}, SetVAD{Active: true}, + Live{VADRunning: true}, []Effect{StartVAD{}}), + Entry("running+setvad(on) -> running, no-op", + Live{VADRunning: true}, SetVAD{Active: true}, + Live{VADRunning: true}, []Effect(nil)), + Entry("stopped+setvad(off) -> stopped, no-op", + Live{VADRunning: false}, SetVAD{Active: false}, + Live{VADRunning: false}, []Effect(nil)), + Entry("running+setvad(off) -> stopped: stop", + Live{VADRunning: true}, SetVAD{Active: false}, + Live{VADRunning: false}, []Effect{StopVAD{}}), + Entry("stopped+close -> torn: teardown", + Live{VADRunning: false}, Close{}, + Torn{}, []Effect{Teardown{}}), + Entry("running+close -> torn: stop + teardown", + Live{VADRunning: true}, Close{}, + Torn{}, []Effect{StopVAD{}, Teardown{}}), + Entry("torn+setvad(on) -> torn, no-op (no resurrection)", + Torn{}, SetVAD{Active: true}, + Torn{}, []Effect(nil)), + Entry("torn+close -> torn, no-op (idempotent)", + Torn{}, Close{}, + Torn{}, []Effect(nil)), + ) + + It("is total over the defined (state, event) pairs", func() { + states := []State{Live{VADRunning: false}, Live{VADRunning: true}, Torn{}} + events := []Event{SetVAD{Active: true}, SetVAD{Active: false}, Close{}} + for _, s := range states { + for _, e := range events { + _, _, err := Next(s, e) + Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e) + } + } + }) + + It("errors on an unknown event type", func() { + _, _, err := Next(Live{}, unknownEvent{}) + Expect(err).To(HaveOccurred()) + }) + + It("errors on an unknown state type", func() { + _, _, err := Next(unknownState{}, Close{}) + Expect(err).To(HaveOccurred()) + }) +}) + +var _ = Describe("conncoord.Coordinator", func() { + It("upholds the lifecycle invariants over random event sequences", func() { + seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE} + for _, seed := range seeds { 
+ r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5)) + sink := &recordingSink{} + c := New(sink) + running := false + torn := false + + for range 5000 { + switch r.IntN(3) { + case 0: + Expect(c.Apply(SetVAD{Active: true})).To(Succeed()) + if !torn { + running = true + } + case 1: + Expect(c.Apply(SetVAD{Active: false})).To(Succeed()) + if !torn { + running = false + } + case 2: + Expect(c.Apply(Close{})).To(Succeed()) + torn = true + running = false + } + if torn { + Expect(c.State()).To(Equal(State(Torn{})), "seed=%d", seed) + } else { + Expect(c.State()).To(Equal(State(Live{VADRunning: running})), "seed=%d", seed) + } + } + checkLog(sink.snapshot()) + } + }) + + It("tears down at most once under concurrent SetVAD/Close from two goroutines", func() { + const perGoroutine = 2000 + sink := &recordingSink{} + c := New(sink) + + var wg sync.WaitGroup + drive := func(active bool) { + defer wg.Done() + for i := range perGoroutine { + switch i % 3 { + case 0: + _ = c.Apply(SetVAD{Active: active}) + case 1: + _ = c.Apply(SetVAD{Active: !active}) + case 2: + if i > perGoroutine/2 { + _ = c.Apply(Close{}) + } + } + } + } + + wg.Add(2) + go drive(true) + go drive(false) + wg.Wait() + _ = c.Apply(Close{}) + + checkLog(sink.snapshot()) + Expect(c.State()).To(Equal(State(Torn{}))) + }) +}) + +var _ = DescribeTable("conncoord stringers", + func(got, want string) { Expect(got).To(Equal(want)) }, + Entry(nil, Live{VADRunning: true}.String(), "Live(vad=true)"), + Entry(nil, Live{VADRunning: false}.String(), "Live(vad=false)"), + Entry(nil, Torn{}.String(), "Torn"), + + Entry(nil, SetVAD{Active: true}.String(), "SetVAD(true)"), + Entry(nil, Close{}.String(), "Close"), + + Entry(nil, StartVAD{}.String(), "StartVAD"), + Entry(nil, StopVAD{}.String(), "StopVAD"), + Entry(nil, Teardown{}.String(), "Teardown"), +) diff --git a/core/http/endpoints/openai/coordinator/coordinator.go b/core/http/endpoints/openai/coordinator/coordinator.go new file mode 100644 index 000000000..d8ae2fa43 --- 
/dev/null +++ b/core/http/endpoints/openai/coordinator/coordinator.go @@ -0,0 +1,82 @@ +// Package coordinator is the shared single-writer state-machine runtime for the +// realtime API's explicit coordinators (machines M1–M5 in +// docs/design/realtime-state-machines.md). +// +// Each machine package (respcoord, turncoord, conncoord, compactcoord, ttscoord) +// defines its OWN sealed sum types for State/Event/Effect and a total, pure +// transition function Next(state, event) -> (state, []effect, error). The +// plumbing around that — a single-writer Coordinator that serializes every +// transition behind one lock and performs the returned effects in order — is +// identical across all five, so it lives here once instead of being copied. +// +// A machine package wires itself up with three lines: +// +// type EffectSink = coordinator.Sink[Effect] +// type Coordinator = coordinator.Coordinator[State, Event, Effect] +// func New(sink EffectSink) *Coordinator { return coordinator.New[State, Event, Effect](Idle{}, Next, sink) } +// +// The aliases keep each package's public API (Coordinator, New, EffectSink, +// Apply, State) unchanged. The single-writer serialization — the load-bearing +// concurrency guarantee the FizzBee specs check — is therefore implemented and +// reasoned about in exactly one place. +package coordinator + +import "sync" + +// TransitionFunc is a machine's total, pure transition: given the current state +// and an event it returns the next state, the ordered effects to perform, and a +// non-nil error ONLY for an unhandled (programmer-error) state/event pair. It +// must not perform I/O or block; side effects are returned as data (F) for the +// Coordinator to hand to the Sink. +type TransitionFunc[S, E, F any] func(state S, event E) (S, []F, error) + +// Sink performs the effects a transition produces. 
Implementations MUST be +// non-blocking: Perform is called while the Coordinator holds its lock, so it +// must not block (it should spawn a goroutine, call a cancel func, or do a +// non-blocking channel send) and MUST NOT call back into the same Coordinator's +// Apply or State (both take mu, a non-reentrant sync.Mutex, so a nested call would deadlock). +type Sink[F any] interface { + Perform(F) +} + +// Coordinator is the single-writer wrapper around a pure transition function. +// Every Apply is serialized by mu, so multiple goroutines can drive the machine +// without racing, and a transition's effects are performed in order under the +// lock (before any subsequent Apply can observe the new state). +type Coordinator[S, E, F any] struct { + mu sync.Mutex + state S + next TransitionFunc[S, E, F] + sink Sink[F] +} + +// New returns a Coordinator in the given initial state that transitions via next +// and performs effects via sink. +func New[S, E, F any](initial S, next TransitionFunc[S, E, F], sink Sink[F]) *Coordinator[S, E, F] { + return &Coordinator[S, E, F]{state: initial, next: next, sink: sink} +} + +// Apply runs one transition under the lock and performs its effects in order. If +// the transition function returns an error (an unhandled state/event), the state +// is left unchanged, no effects are performed, and the error is returned to the caller — never silently +// swallowed. On success the new state is stored before the effects are performed. +func (c *Coordinator[S, E, F]) Apply(e E) error { + c.mu.Lock() + defer c.mu.Unlock() + ns, effects, err := c.next(c.state, e) + if err != nil { + return err + } + c.state = ns + for _, eff := range effects { + c.sink.Perform(eff) + } + return nil +} + +// State returns the current state (a value; safe to call concurrently). 
+func (c *Coordinator[S, E, F]) State() S { + c.mu.Lock() + defer c.mu.Unlock() + return c.state +} diff --git a/core/http/endpoints/openai/coordinator/coordinator_suite_test.go b/core/http/endpoints/openai/coordinator/coordinator_suite_test.go new file mode 100644 index 000000000..8ea84eeea --- /dev/null +++ b/core/http/endpoints/openai/coordinator/coordinator_suite_test.go @@ -0,0 +1,13 @@ +package coordinator + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestCoordinator(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "coordinator (shared runtime) Suite") +} diff --git a/core/http/endpoints/openai/coordinator/coordinator_test.go b/core/http/endpoints/openai/coordinator/coordinator_test.go new file mode 100644 index 000000000..2eec77124 --- /dev/null +++ b/core/http/endpoints/openai/coordinator/coordinator_test.go @@ -0,0 +1,124 @@ +package coordinator + +import ( + "errors" + "fmt" + "sync" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// A tiny toy machine exercises the generic runtime directly (the five real +// machines exercise it via their aliases, but the gate measures this package's +// own coverage). off <-toggle-> on; burst emits three ordered effects; boom is +// the unhandled/error path. 
+type tstate int + +const ( + off tstate = iota + on +) + +type tevent int + +const ( + toggle tevent = iota + burst + boom +) + +type teffect string + +func tnext(s tstate, e tevent) (tstate, []teffect, error) { + switch e { + case toggle: + if s == off { + return on, []teffect{"on"}, nil + } + return off, []teffect{"off"}, nil + case burst: + return s, []teffect{"a", "b", "c"}, nil + case boom: + return s, nil, errors.New("boom: unhandled") + } + return s, nil, fmt.Errorf("unknown event %d", int(e)) +} + +type recordingSink struct { + mu sync.Mutex + log []teffect +} + +func (s *recordingSink) Perform(e teffect) { + s.mu.Lock() + s.log = append(s.log, e) + s.mu.Unlock() +} + +func (s *recordingSink) snapshot() []teffect { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]teffect, len(s.log)) + copy(out, s.log) + return out +} + +var _ = Describe("coordinator.Coordinator", func() { + It("starts in the initial state", func() { + c := New[tstate, tevent, teffect](off, tnext, &recordingSink{}) + Expect(c.State()).To(Equal(off)) + }) + + It("advances state and performs the transition's effects", func() { + sink := &recordingSink{} + c := New[tstate, tevent, teffect](off, tnext, sink) + + Expect(c.Apply(toggle)).To(Succeed()) + Expect(c.State()).To(Equal(on)) + Expect(c.Apply(toggle)).To(Succeed()) + Expect(c.State()).To(Equal(off)) + + Expect(sink.snapshot()).To(Equal([]teffect{"on", "off"})) + }) + + It("performs multiple effects in order", func() { + sink := &recordingSink{} + c := New[tstate, tevent, teffect](off, tnext, sink) + Expect(c.Apply(burst)).To(Succeed()) + Expect(sink.snapshot()).To(Equal([]teffect{"a", "b", "c"})) + }) + + It("returns the transition error and leaves state unchanged", func() { + sink := &recordingSink{} + c := New[tstate, tevent, teffect](on, tnext, sink) + err := c.Apply(boom) + Expect(err).To(HaveOccurred()) + Expect(c.State()).To(Equal(on), "state unchanged on error") + Expect(sink.snapshot()).To(BeEmpty(), "no effects performed on 
error") + }) + + It("serializes concurrent Apply from many goroutines (run with -race)", func() { + const goroutines = 8 + const each = 1000 + sink := &recordingSink{} + c := New[tstate, tevent, teffect](off, tnext, sink) + + var wg sync.WaitGroup + wg.Add(goroutines) + for range goroutines { + go func() { + defer wg.Done() + for range each { + _ = c.Apply(toggle) + } + }() + } + wg.Wait() + + // goroutines*each toggles from off; an even total returns to off. The + // point is race-freedom + a consistent final state, not the value itself. + Expect(c.State()).To(Equal(off)) + Expect(sink.snapshot()).To(HaveLen(goroutines * each)) + }) +}) diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index d4d6a0ac4..94c8a1a65 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -12,7 +12,6 @@ import ( "os" "strconv" "sync" - "sync/atomic" "time" "net/http" @@ -26,6 +25,8 @@ import ( "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/http/auth" mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp" + "github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord" + "github.com/mudler/LocalAI/core/http/endpoints/openai/turncoord" "github.com/mudler/LocalAI/core/http/endpoints/openai/types" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/core/templates" @@ -168,44 +169,12 @@ type Session struct { gateMu sync.Mutex voiceVerified bool - // Response cancellation: protects activeResponseCancel/activeResponseDone - responseMu sync.Mutex - activeResponseCancel context.CancelFunc - activeResponseDone chan struct{} -} - -// cancelActiveResponse cancels any in-flight response and waits for its -// goroutine to exit. This ensures we never have overlapping responses and -// that interrupted responses are fully cleaned up before starting a new one. 
-func (s *Session) cancelActiveResponse() { - s.responseMu.Lock() - cancel := s.activeResponseCancel - done := s.activeResponseDone - s.responseMu.Unlock() - - if cancel != nil { - cancel() - } - if done != nil { - <-done - } -} - -// startResponse cancels any active response and returns a new context for -// the replacement response. The caller MUST close the returned done channel -// when the response goroutine exits. -func (s *Session) startResponse(parent context.Context) (context.Context, chan struct{}) { - s.cancelActiveResponse() - - ctx, cancel := context.WithCancel(parent) - done := make(chan struct{}) - - s.responseMu.Lock() - s.activeResponseCancel = cancel - s.activeResponseDone = done - s.responseMu.Unlock() - - return ctx, done + // respSink is the explicit response-coordination state machine (respcoord, + // machine M3). It replaces the legacy startResponse/cancelActiveResponse + // pair and its dual-writer activeResponse* fields: every start/cancel/finish + // decision is serialized through respcoord.Coordinator, guaranteeing at most + // one live response. See realtime_respcoord.go. + respSink *responseSink } func (s *Session) FromClient(session *types.SessionUnion) { @@ -258,8 +227,10 @@ type Conversation struct { // is kept out of Items (so trimRealtimeItems never drops it) and rendered // as a system message right after the session instructions. Memory string - // compacting ensures at most one background compaction runs per conversation. - compacting atomic.Bool + // compaction is the explicit single-flight compaction coordinator (M4): at + // most one background summarize+evict runs per conversation at a time. It + // replaces the legacy `compacting atomic.Bool`. See realtime_compactcoord.go. + compaction *compactionSink } func (c *Conversation) ToServer() types.Conversation { @@ -288,6 +259,12 @@ type Model interface { // sound-event tags. 
topK caps the number of returned tags (0 = backend // default), threshold drops tags below the given score (0 = keep all). SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error) + // TranscribeLive opens a live (bidirectional) transcription session on the + // pipeline's transcription backend, used by semantic_vad turn detection; + // onEvent fires from a background goroutine for every delta/EOU/final + // event. Backends without live support fail with an error satisfying + // grpcerrors.IsLiveTranscriptionUnsupported. + TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) PredictConfig() *config.ModelConfig } @@ -513,14 +490,10 @@ func runRealtimeSession(application *application.Application, t Transport, model // input_audio_buffer.commit. There is no transcription stage in that case. soundOnly := cfg.Pipeline.SoundDetection != "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.LLM == "" - turnDetection := &types.TurnDetectionUnion{ - ServerVad: &types.ServerVad{ - Threshold: 0.5, - PrefixPaddingMs: 300, - SilenceDurationMs: 500, - CreateResponse: true, - }, - } + // defaultTurnDetection seeds server_vad by default, or semantic_vad when the + // pipeline opts in (turn_detection.type: semantic_vad); clients can still + // override per session via session.update. + turnDetection := defaultTurnDetection(cfg) inputAudioTranscription := &types.AudioTranscription{Model: sttModel} if soundOnly { turnDetection = nil // turn_detection none: no VAD @@ -561,12 +534,27 @@ func runRealtimeSession(application *application.Application, t Transport, model } session.CompactionEnabled, session.CompactionTrigger, session.MaxSummaryTokens, session.SummaryModel = resolveCompaction(cfg, session.MaxHistoryItems) + // Single-writer response coordinator (machine M3). 
All response starts and + // cancels go through this, so the read-loop and VAD goroutine can never race + // into two overlapping responses (see realtime_respcoord.go). + session.respSink = newResponseSink() + // Create a default conversation conversationID := generateConversationID() conversation := &Conversation{ ID: conversationID, Items: []*types.MessageItemUnion{}, } + // The compaction coordinator's work closure resolves the summarizer (lazily + // loading a configured summary_model) and runs the summarize+evict off the + // response path — only when a compaction actually starts. + conversation.compaction = newCompactionSink(func(ctx context.Context) { + model := session.summarizerModel() + if model == nil { + return + } + session.compact(ctx, conversation, model) + }) session.Conversations[conversationID] = conversation session.DefaultConversationID = conversationID @@ -648,34 +636,22 @@ func runRealtimeSession(application *application.Application, t Transport, model }) var ( - msg []byte - wg sync.WaitGroup - done = make(chan struct{}) + msg []byte + wg sync.WaitGroup ) - vadServerStarted := false - toggleVAD := func() { - if session.TurnDetection != nil && session.TurnDetection.ServerVad != nil && !vadServerStarted { - xlog.Debug("Starting VAD goroutine...") - done = make(chan struct{}) - wg.Go(func() { - conversation := session.Conversations[session.DefaultConversationID] - handleVAD(session, conversation, t, done) - }) - vadServerStarted = true - } else if (session.TurnDetection == nil || session.TurnDetection.ServerVad == nil) && vadServerStarted { - xlog.Debug("Stopping VAD goroutine...") - close(done) - vadServerStarted = false - } - } + // M1 connection lifecycle. The VAD goroutine's run/stop (and its done channel) + // and the once-only teardown are owned by this coordinator, so the channel is + // closed exactly once and never resurrected after teardown (Part 2, failure + // mode 6; invariants #8, #10). See realtime_conncoord.go and conncoord/. 
+ conn := newConnSink(session, sessionID, t, &wg) + toggleVAD := func() { conn.setVAD(turnDetectionActive(session.TurnDetection)) } // For WebRTC sessions, start the Opus decode loop before VAD so that // decoded PCM is already flowing when VAD's first tick fires. - var decodeDone chan struct{} if wt, ok := t.(*WebRTCTransport); ok { - decodeDone = make(chan struct{}) - go decodeOpusLoop(session, wt.opusBackend, decodeDone) + conn.decodeDone = make(chan struct{}) + go decodeOpusLoop(session, wt.opusBackend, conn.decodeDone) } toggleVAD() @@ -684,9 +660,9 @@ func runRealtimeSession(application *application.Application, t Transport, model // with window/hop configured, the server classifies the last window of // streamed audio on a timer, so the client only has to stream (no commits). // This runs independent of VAD (sound events are not speech). - var soundWindowDone chan struct{} if soundOnly && session.SoundDetectionWindowMs > 0 && session.SoundDetectionHopMs > 0 { - soundWindowDone = make(chan struct{}) + conn.soundWindowDone = make(chan struct{}) + soundWindowDone := conn.soundWindowDone wg.Go(func() { handleSoundWindow(session, t, soundWindowDone) }) @@ -811,11 +787,11 @@ func runRealtimeSession(application *application.Application, t Transport, model xlog.Debug("recv", "message", string(msg)) sessionLock.Lock() - isServerVAD := session.TurnDetection != nil && session.TurnDetection.ServerVad != nil + autoTurnDetection := turnDetectionActive(session.TurnDetection) sessionLock.Unlock() // TODO: At the least need to check locking and timer state in the VAD Go routine before allowing this - if isServerVAD { + if autoTurnDetection { sendNotImplemented(t, "input_audio_buffer.commit in conjunction with VAD") continue } @@ -831,11 +807,9 @@ func runRealtimeSession(application *application.Application, t Transport, model ItemID: generateItemID(), }) - respCtx, respDone := session.startResponse(context.Background()) - go func() { - defer close(respDone) - 
commitUtterance(respCtx, allAudio, session, conversation, t) - }() + session.respSink.issue(context.Background(), respcoord.SourceClient, func(ctx context.Context) { + commitUtterance(ctx, allAudio, session, conversation, t) + }) case types.InputAudioBufferClearEvent: xlog.Debug("recv", "message", string(msg)) @@ -968,15 +942,14 @@ func runRealtimeSession(application *application.Application, t Transport, model conversation.Lock.Unlock() } - respCtx, respDone := session.startResponse(context.Background()) - go func() { - defer close(respDone) - triggerResponse(respCtx, session, conversation, t, &e.Response) - }() + resp := e.Response + session.respSink.issue(context.Background(), respcoord.SourceClient, func(ctx context.Context) { + triggerResponse(ctx, session, conversation, t, &resp) + }) case types.ResponseCancelEvent: xlog.Debug("recv", "message", string(msg)) - session.cancelActiveResponse() + session.respSink.cancel(respcoord.SourceClient) default: xlog.Error("unknown message type") @@ -984,28 +957,11 @@ func runRealtimeSession(application *application.Application, t Transport, model } } - // Cancel any in-flight response before tearing down - session.cancelActiveResponse() - - // Stop the Opus decode goroutine (if running) - if decodeDone != nil { - close(decodeDone) - } - - // Signal any running VAD goroutine to exit. - if vadServerStarted { - close(done) - } - // Stop the server-side sound-detection windowing goroutine (if running). - if soundWindowDone != nil { - close(soundWindowDone) - } - wg.Wait() - - // Remove the session from the sessions map - sessionLock.Lock() - delete(sessions, sessionID) - sessionLock.Unlock() + // Tear down through the connection coordinator (once). It stops any running + // VAD goroutine, then the opus-decode and sound-window goroutines, joins them, + // cancels the in-flight response and drains all response goroutines, and + // finally removes the session — all in dependency order, exactly once. 
+ conn.close() } // sendEvent sends a server event via the transport, logging any errors. @@ -1285,8 +1241,38 @@ func decodeOpusLoop(session *Session, opusBackend grpc.Backend, done chan struct } } +// noSpeechHoldbackSec is how much of the tail of an inspected, segment-free +// buffer survives the periodic no-speech clear. It must cover the VAD's +// onset-detection latency: a word can already be underway in the newest part +// of the window without silero having crossed its threshold yet, and clearing +// it cuts the start of the utterance the next tick will detect. +const noSpeechHoldbackSec = 0.5 + +// dropInspectedPrefix removes the head of the audio buffer that a VAD tick +// inspected (the first inspected bytes), keeping the newest holdbackBytes of +// that window plus everything appended while the tick ran — audio the VAD +// never saw. When something is dropped the result is a fresh copy, never a +// sub-slice, so later appends can't scribble on memory shared with the old +// backing array; when nothing is dropped buf is returned unchanged. +func dropInspectedPrefix(buf []byte, inspected, holdbackBytes int) []byte { + cut := inspected - holdbackBytes + if cut <= 0 { + return buf + } + if cut > len(buf) { + cut = len(buf) + } + return append([]byte(nil), buf[cut:]...) +} + // handleVAD is a goroutine that listens for audio data from the client, -// runs VAD on the audio data, and commits utterances to the conversation +// runs VAD on the audio data, and commits utterances to the conversation. +// +// With turn_detection.type == "semantic_vad" (sv != nil below) the silero +// loop is augmented by a live transcription stream: the buffer's new audio +// is fed to the transcription model every tick and its end-of-utterance +// token switches the commit threshold between a short post-EOU window and +// the long eagerness fallback. The server_vad path is untouched. 
func handleVAD(session *Session, conv *Conversation, t Transport, done chan struct{}) { vadContext, cancel := context.WithCancel(context.Background()) go func() { @@ -1299,9 +1285,22 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru silenceThreshold = float64(session.TurnDetection.ServerVad.SilenceDurationMs) / 1000 } - speechStarted := false + lts := newLiveTurnState(session, t) startTime := time.Now() + // M2 turn-detection state machine. "Speech started" and "a turn's live ASR + // stream is open" are ONE coordinator state (Idle/Speaking), so they cannot + // desync the way the legacy speechStarted bool and lts.open() could (Part 2, + // failure mode 4). See realtime_turncoord.go and turncoord/. + sink := newTurnSink(session, conv, t, lts, vadContext, startTime) + // Teardown: end any open turn through the coordinator (DiscardTurn closes the + // live stream; no-op if already idle). Replaces the bare lts.discardTurn(). + defer func() { + if err := sink.coord.Apply(turncoord.Abort{Reason: turncoord.AbortTeardown}); err != nil { + xlog.Error("turncoord: abort(teardown) failed", "error", err) + } + }() + ticker := time.NewTicker(300 * time.Millisecond) defer ticker.Stop() @@ -1310,6 +1309,30 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru case <-done: return case <-ticker.C: + // Semantic mode is re-read each tick: session.update can switch + // turn-detection modes (and the retranscribe gate) mid-session. + sessionLock.Lock() + var sv *types.RealtimeSessionSemanticVad + if session.TurnDetection != nil { + sv = session.TurnDetection.SemanticVad + } + retranscribe := sv != nil && session.ModelConfig != nil && + session.ModelConfig.Pipeline.TurnDetectionRetranscribe() + sessionLock.Unlock() + + // The turn coordinator's data-heavy effects (OpenTurn/CommitTurn) + // need this tick's mode; set it before any Apply below. 
+ sink.sv = sv + + // session.update switched semantic -> server mid-turn: drop the + // orphaned live stream. This is NOT a turn abort — the turn continues + // under server_vad (a config change must not cut off a mid-utterance + // speaker), so the coordinator stays Speaking; only the orphaned live + // stream is closed. + if sv == nil && lts.open() { + lts.discardTurn() + } + session.AudioBufferLock.Lock() allAudio := make([]byte, len(session.InputAudioBuffer)) copy(allAudio, session.InputAudioBuffer) @@ -1323,6 +1346,13 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru // Resample from InputSampleRate to 16kHz aints = sound.ResampleInt16(aints, session.InputSampleRate, localSampleRate) + audioLength := float64(len(aints)) / localSampleRate + + if sv != nil && lts.open() { + lts.feedNewAudio(aints) + lts.drainEvents(audioLength) + } + segments, err := runVAD(vadContext, session, aints) if err != nil { if err.Error() == "unexpected speech end" { @@ -1334,31 +1364,52 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru continue } - audioLength := float64(len(aints)) / localSampleRate - - // TODO: When resetting the buffer we should retain a small postfix + // NOTE: the no-speech clear and the min-buffer gate above stay on + // the short silenceThreshold even in semantic mode — the eagerness + // fallback applies only to the end-of-speech commit decision, or a + // low eagerness would delay speech_started/barge-in by seconds. if len(segments) == 0 && audioLength > silenceThreshold { + // "No segments" is not "no speech": silero (threshold 0.5) + // crosses up to a few hundred ms into a soft word onset, so + // the newest audio in the inspected window may be the start + // of a word the next tick will recognize — and more audio + // arrived while this tick ran. Keep both; drop only the + // older, confirmed-silent head, or utterance onsets get cut. 
+ holdback := int(noSpeechHoldbackSec*float64(session.InputSampleRate)) * 2 session.AudioBufferLock.Lock() - session.InputAudioBuffer = nil + session.InputAudioBuffer = dropInspectedPrefix(session.InputAudioBuffer, len(allAudio), holdback) session.AudioBufferLock.Unlock() + // No-speech clear: end any open turn (Speaking -> Idle, discarding + // the partial). Returning to Idle is the fix for failure mode 4 — + // the legacy discardTurn left speechStarted true, suppressing the + // next onset. Idle while not speaking is a no-op. + if err := sink.coord.Apply(turncoord.Abort{Reason: turncoord.AbortNoSpeech}); err != nil { + xlog.Error("turncoord: abort(no_speech) failed", "error", err) + } continue } else if len(segments) == 0 { continue } - if !speechStarted { - // Barge-in: cancel any in-flight response so we stop - // sending audio and don't keep the interrupted reply in history. - session.cancelActiveResponse() + // Speech detected this tick: open the turn (Idle -> Speaking) through + // the coordinator. On that transition it opens the turn's live ASR + // stream + feeds the buffered prefix (OpenTurn), cancels any in-flight + // response (BargeIn, non-blocking — the VAD tick is never stalled), and + // emits speech_started. While already Speaking it is a no-op, so "turn + // open" and "speech started" can never disagree. The turn id is minted + // here and carried by the coordinator through to the committed event. 
+ sink.onsetAudio = aints + if err := sink.coord.Apply(turncoord.Onset{Turn: turncoord.TurnID(generateItemID())}); err != nil { + xlog.Error("turncoord: onset failed", "error", err) + } - sendEvent(t, types.InputAudioBufferSpeechStartedEvent{ - ServerEventBase: types.ServerEventBase{ - EventID: "event_TODO", - }, - AudioStartMs: time.Since(startTime).Milliseconds(), - }) - speechStarted = true + if sv != nil { + // Drain again: events produced by THIS tick's feed have + // usually arrived by the time runVAD returns, and leaving + // them for the next tick adds 300ms to every EOU-triggered + // commit. + lts.drainEvents(audioLength) } // Segment still in progress when audio ended @@ -1367,41 +1418,90 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru continue } - if float32(audioLength)-segEndTime > float32(silenceThreshold) { + threshold := silenceThreshold + eouPending := false + if sv != nil { + eouPending = lts.eouPending(segments) + threshold = lts.thresholdSec(eouPending, sv) + } + + if float32(audioLength)-segEndTime > float32(threshold) { + if sv != nil { + trigger, eouLag := lts.commitTrigger(eouPending, float64(segEndTime)) + xlog.Info("semantic_vad: committing turn", + "trigger", trigger, + "speech_end_s", segEndTime, + "eou_lag_s", eouLag, + "silence_s", audioLength-float64(segEndTime), + "audio_s", audioLength) + } + // Retranscribe gate (semantic mode, EOU-triggered commits + // only): cross-check the streamed EOU with an offline decode + // of the buffered turn before committing. Runs synchronously + // on the tick — the engine would serialize a concurrent feed + // against it anyway. Timeout-triggered commits skip the gate. 
+ var gated *schema.TranscriptionResult + if retranscribe && eouPending { + batch, gerr := transcribeUtterance(vadContext, sound.Int16toBytesLE(aints), session) + switch { + case gerr != nil: + xlog.Warn("semantic_vad: retranscribe gate failed; committing via the file path", "error", gerr) + case !batch.Eou: + xlog.Info("semantic_vad: batch decode did not confirm the streamed EOU; continuing to listen", + "streamed", lts.previewText(), "batch", batch.Text) + // The batch decode rejected the streamed EOU as a false + // positive: consume the recorded EOU so the next tick + // falls back to the eagerness window instead of + // re-triggering on the same token. + lts.eouAtSec = 0 + continue + default: + xlog.Info("semantic_vad: batch decode confirmed the streamed EOU", + "streamed", lts.previewText(), "batch", batch.Text) + gated = batch + } + } + xlog.Debug("Detected end of speech segment") session.AudioBufferLock.Lock() - session.InputAudioBuffer = nil + // Keep audio appended while this tick ran — it belongs to + // the next turn (in any mode: nil-ing it dropped the onset + // of an utterance started right after a commit). 
+ session.InputAudioBuffer = dropInspectedPrefix(session.InputAudioBuffer, len(allAudio), 0) session.AudioBufferLock.Unlock() - sendEvent(t, types.InputAudioBufferSpeechStoppedEvent{ - ServerEventBase: types.ServerEventBase{ - EventID: "event_TODO", - }, - AudioEndMs: time.Since(startTime).Milliseconds(), - }) - speechStarted = false - - sendEvent(t, types.InputAudioBufferCommittedEvent{ - ServerEventBase: types.ServerEventBase{ - EventID: "event_TODO", - }, - ItemID: generateItemID(), - PreviousItemID: "TODO", - }) - - abytes := sound.Int16toBytesLE(aints) - // TODO: Remove prefix silence that is is over TurnDetectionParams.PrefixPaddingMs - respCtx, respDone := session.startResponse(vadContext) - go func() { - defer close(respDone) - commitUtterance(respCtx, abytes, session, conv, t) - }() + // Commit the turn through the coordinator: it emits speech_stopped + // (EmitSpeechStopped) then the committed event, finalizes the live + // stream, and issues the response (CommitTurn). The committed item + // id is the coordinator's turn id (== the id the live captions + // streamed under), so the client replaces the partial text. + sink.commitAudio = sound.Int16toBytesLE(aints) + sink.commitAudioLength = audioLength + sink.commitRetranscribe = retranscribe + sink.commitGated = gated + // TODO: Remove prefix silence that is over TurnDetectionParams.PrefixPaddingMs + if err := sink.coord.Apply(turncoord.Silence{}); err != nil { + xlog.Error("turncoord: commit failed", "error", err) + } } } } } func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Conversation, t Transport) { + commitUtteranceWithTranscript(ctx, utt, nil, nil, "", session, conv, t) +} + +// commitUtteranceWithTranscript commits one user turn. 
live carries the +// transcript semantic_vad's live stream already produced (its caption deltas +// were streamed to the client during the turn, so only the completed event +// is emitted here); gated carries the retranscribe gate's batch decode (the +// authoritative transcript in that mode). With neither — server_vad, manual +// commits, semantic degrade, or a live stream that heard nothing — the audio +// is written to a temp WAV and transcribed via the file path as before. +// itemID is the turn's conversation item id ("" mints a fresh one); it must +// match the id any live deltas were sent under. +func commitUtteranceWithTranscript(ctx context.Context, utt []byte, live *liveUtterance, gated *schema.TranscriptionResult, itemID string, session *Session, conv *Conversation, t Transport) { if len(utt) == 0 { return } @@ -1466,14 +1566,37 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co } // TODO: If we have a real any-to-any model then transcription is optional + + // The turn's live captions (semantic_vad) already streamed under this + // itemID; the completed event below reuses it so the client replaces the + // partial text. server_vad / manual commits arrive with no itemID, so mint + // one here. + if itemID == "" { + itemID = generateItemID() + } + var transcript string switch { + case gated != nil: + // semantic_vad retranscribe gate: the batch decode is authoritative. + transcript = gated.Text + if err := emitPrecomputedTranscription(t, itemID, nil, transcript); err != nil { + sendError(t, "transcription_failed", err.Error(), "", "event_TODO") + return + } + case live != nil && live.Text != "": + // The caption deltas already streamed during the turn under this + // itemID; the completed event replaces the partial text client-side. 
+ transcript = live.Text + if err := emitPrecomputedTranscription(t, itemID, nil, transcript); err != nil { + sendError(t, "transcription_failed", err.Error(), "", "event_TODO") + return + } case session.InputAudioTranscription != nil: // emitTranscription streams transcript deltas when // pipeline.streaming.transcription is set, otherwise emits a single // completed event; either way it returns the final transcript text. - var err error - transcript, err = emitTranscription(ctx, t, session, generateItemID(), f.Name()) + transcript, err = emitTranscription(ctx, t, session, itemID, f.Name()) if err != nil { // Drain the gate goroutine before returning so its in-flight read of // the temp WAV finishes before the deferred os.Remove fires. @@ -1642,6 +1765,56 @@ func writeWindowWAV(pcm []byte, sampleRate int) (string, error) { return f.Name(), nil } +// writeUtteranceWAV persists raw 16 kHz mono PCM to a temp WAV for the +// file-based transcription paths. The caller must invoke cleanup. +func writeUtteranceWAV(utt []byte) (string, func(), error) { + f, err := os.CreateTemp("", "realtime-audio-chunk-*.wav") + if err != nil { + return "", nil, err + } + cleanup := func() { + _ = f.Close() + _ = os.Remove(f.Name()) + } + xlog.Debug("Writing to file", "file", f.Name()) + + hdr := laudio.NewWAVHeader(uint32(len(utt))) + if err := hdr.Write(f); err != nil { + cleanup() + return "", nil, err + } + if _, err := f.Write(utt); err != nil { + cleanup() + return "", nil, err + } + _ = f.Sync() + return f.Name(), cleanup, nil +} + +// transcribeUtterance runs one offline (unary) decode of the buffered turn — +// the semantic_vad retranscribe gate. The result's Eou flag reports whether +// the batch decode also ended on the end-of-utterance token. 
+func transcribeUtterance(ctx context.Context, utt []byte, session *Session) (*schema.TranscriptionResult, error) { + path, cleanup, err := writeUtteranceWAV(utt) + if err != nil { + return nil, err + } + defer cleanup() + + language, prompt := "", "" + if cfg := session.InputAudioTranscription; cfg != nil { + language, prompt = cfg.Language, cfg.Prompt + } + tr, err := session.ModelInterface.Transcribe(ctx, path, language, false, false, prompt) + if err != nil { + return nil, err + } + if tr == nil { + return nil, fmt.Errorf("transcribe result is nil") + } + return tr, nil +} + func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADSegment, error) { soundIntBuffer := &audio.IntBuffer{ Format: &audio.Format{SampleRate: localSampleRate, NumChannels: 1}, @@ -1721,14 +1894,100 @@ func generateResponse(ctx context.Context, session *Session, utt []byte, transcr // without another response cycle. const maxAssistantToolTurns = 10 +// responseOutcome is how a response ended, decided by the response body and +// read once by triggerResponse to emit the single terminal event. +type responseOutcome int + +const ( + outcomeCompleted responseOutcome = iota + outcomeCancelled + outcomeFailed // an error event was already sent; emit no terminal (legacy behavior) +) + +// liveResponse accumulates the wire-visible result of ONE response.create across +// the whole agentic tool-turn recursion: a single id, the output items as they +// complete, the summed token usage, and the final outcome. triggerResponse owns +// it; triggerResponseAtTurn / streamLLMResponse / emitToolCallItems fill it in. +// This is what makes "exactly one response.done per response.create, with Output +// and Usage populated" true — the body no longer emits per-turn terminals. 
+type liveResponse struct { + id string + output []types.MessageItemUnion + usage backend.TokenUsage + outcome responseOutcome +} + +func (r *liveResponse) addItem(it types.MessageItemUnion) { r.output = append(r.output, it) } + +func (r *liveResponse) addUsage(u backend.TokenUsage) { + r.usage.Prompt += u.Prompt + r.usage.Completion += u.Completion +} + +// responseUsage maps the backend's token counts onto the OpenAI Realtime +// response.usage shape. Returns nil when there is nothing to report so the +// field is omitted rather than sent as zeros. +func responseUsage(u backend.TokenUsage) *types.TokenUsage { + if u.Prompt == 0 && u.Completion == 0 { + return nil + } + return &types.TokenUsage{ + InputTokens: u.Prompt, + OutputTokens: u.Completion, + TotalTokens: u.Prompt + u.Completion, + } +} + func triggerResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams) { - triggerResponseAtTurn(ctx, session, conv, t, overrides, 0) + // One response.created and one response.done per response.create — even when + // the server-side tool loop runs several inference turns. The per-turn + // terminals the legacy code emitted (one response.done per turn, with empty + // Output/Usage) are gone; tool turns are now internal to this single response. 
+ r := &liveResponse{id: generateUniqueID()} + sendEvent(t, types.ResponseCreatedEvent{ + ServerEventBase: types.ServerEventBase{}, + Response: types.Response{ + ID: r.id, + Object: "realtime.response", + Status: types.ResponseStatusInProgress, + }, + }) + + triggerResponseAtTurn(ctx, session, conv, t, overrides, 0, r) + + switch r.outcome { + case outcomeCancelled: + sendEvent(t, types.ResponseDoneEvent{ + ServerEventBase: types.ServerEventBase{}, + Response: types.Response{ + ID: r.id, + Object: "realtime.response", + Status: types.ResponseStatusCancelled, + Output: r.output, + }, + }) + case outcomeFailed: + // A specific error event was already sent; emit no terminal (matches the + // legacy behavior where failed responses had no response.done). + default: + sendEvent(t, types.ResponseDoneEvent{ + ServerEventBase: types.ServerEventBase{}, + Response: types.Response{ + ID: r.id, + Object: "realtime.response", + Status: types.ResponseStatusCompleted, + Output: r.output, + Usage: responseUsage(r.usage), + }, + }) + } + // Fold aged-out turns into the rolling memory off the critical path; the // next turn reaps the smaller buffer. session.maybeCompact(conv) } -func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams, toolTurn int) { +func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams, toolTurn int, r *liveResponse) { config := session.ModelInterface.PredictConfig() // Default values @@ -1891,15 +2150,9 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa images = append(images, m.StringImages...) 
} - responseID := generateUniqueID() - sendEvent(t, types.ResponseCreatedEvent{ - ServerEventBase: types.ServerEventBase{}, - Response: types.Response{ - ID: responseID, - Object: "realtime.response", - Status: types.ResponseStatusInProgress, - }, - }) + // response.created/done are emitted once per response.create by triggerResponse; + // every turn (including agentic recursion) shares this id. + responseID := r.id // Streamed LLM path: when the pipeline opts into LLM streaming, stream the // transcript to the client as it is generated and synthesize the buffered @@ -1915,7 +2168,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa respMods = overrides.OutputModalities } if canStream && modalitiesContainAudio(resolveOutputModalities(session.OutputModalities, respMods)) { - if streamLLMResponse(ctx, session, conv, t, responseID, conversationHistory, images, config, tools, toolChoice, toolTurn) { + if streamLLMResponse(ctx, session, conv, t, r, conversationHistory, images, config, tools, toolChoice, toolTurn) { return } } @@ -1924,26 +2177,22 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa predFunc, err := session.ModelInterface.Predict(ctx, conversationHistory, images, nil, nil, nil, tools, toolChoice, nil, nil, nil) if err != nil { sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", "") // item.Assistant.ID is unknown here + r.outcome = outcomeFailed return } pred, err := predFunc() if err != nil { sendError(t, "prediction_failed", fmt.Sprintf("backend error: %v", err), "", "") + r.outcome = outcomeFailed return } + r.addUsage(pred.Usage) // Check for cancellation after LLM inference (barge-in may have fired) if ctx.Err() != nil { xlog.Debug("Response cancelled after LLM inference (barge-in)") - sendEvent(t, types.ResponseDoneEvent{ - ServerEventBase: types.ServerEventBase{}, - Response: types.Response{ - ID: responseID, - Object: "realtime.response", - Status: 
types.ResponseStatusCancelled, - }, - }) + r.outcome = outcomeCancelled return } @@ -2103,18 +2352,12 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa conv.Lock.Unlock() } - // sendCancelledResponse emits the cancelled status and cleans up the - // assistant item so the interrupted reply is not in chat history. + // sendCancelledResponse records the cancelled outcome (triggerResponse + // emits the single terminal) and cleans up the partial assistant item so + // the interrupted reply is not in chat history. sendCancelledResponse := func() { removeItemFromConv(item.Assistant.ID) - sendEvent(t, types.ResponseDoneEvent{ - ServerEventBase: types.ServerEventBase{}, - Response: types.Response{ - ID: responseID, - Object: "realtime.response", - Status: types.ResponseStatusCancelled, - }, - }) + r.outcome = outcomeCancelled } var audioString string @@ -2163,6 +2406,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa } xlog.Error("TTS failed", "error", err) sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID) + r.outcome = outcomeFailed return } if !isWebRTC { @@ -2220,12 +2464,13 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa OutputIndex: 0, Item: item, }) + r.addItem(item) } - // Emit the parsed tool calls, the terminal response.done, and (for - // server-side assistant tools) the follow-up response. Shared with the - // streamed path so both finalize tool calls identically. - emitToolCallItems(ctx, session, conv, t, responseID, finalToolCalls, finalSpeech != "", toolTurn) + // Emit the parsed tool calls and (for server-side assistant tools) the + // follow-up turn. Shared with the streamed path so both finalize tool calls + // identically. The single terminal is emitted by triggerResponse. 
+ emitToolCallItems(ctx, session, conv, t, r, finalToolCalls, finalSpeech != "", toolTurn) } // emitToolCallItems emits the realtime function_call items for the parsed tool @@ -2239,7 +2484,8 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa // - All other tools follow the standard OpenAI flow: emit // function_call_arguments.done and wait for the client to send // conversation.item.create back. -func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation, t Transport, responseID string, toolCalls []functions.FuncCallResults, hasContent bool, toolTurn int) { +func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation, t Transport, r *liveResponse, toolCalls []functions.FuncCallResults, hasContent bool, toolTurn int) { + responseID := r.id xlog.Debug("About to handle tool calls", "finalToolCallsCount", len(toolCalls)) executedAssistantTool := false for i, tc := range toolCalls { @@ -2302,6 +2548,7 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation OutputIndex: outputIndex, Item: fcItem, }) + r.addItem(fcItem) sendEvent(t, types.ResponseOutputItemAddedEvent{ ServerEventBase: types.ServerEventBase{}, ResponseID: responseID, @@ -2314,6 +2561,7 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation OutputIndex: outputIndex, Item: foItem, }) + r.addItem(foItem) executedAssistantTool = true continue } @@ -2343,28 +2591,25 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation OutputIndex: outputIndex, Item: fcItem, }) + r.addItem(fcItem) } - sendEvent(t, types.ResponseDoneEvent{ - ServerEventBase: types.ServerEventBase{}, - Response: types.Response{ - ID: responseID, - Object: "realtime.response", - Status: types.ResponseStatusCompleted, - }, - }) + // No terminal here: triggerResponse emits the single response.done once the + // whole turn (including the agentic recursion below) completes. 
// If we executed any assistant tools inproc, run another response cycle // so the model can speak the result. Mirrors the chat-side agentic loop // but driven server-side rather than by client round-trip. Bounded so a - // degenerate "model keeps calling tools" doesn't blow the stack. + // degenerate "model keeps calling tools" doesn't blow the stack. The + // follow-up turn shares the same liveResponse, so its output accumulates + // into the one response.done. if executedAssistantTool { if toolTurn+1 >= maxAssistantToolTurns { xlog.Warn("realtime: assistant tool-turn limit reached, stopping the agentic loop", "limit", maxAssistantToolTurns, "model", session.Model) return } - triggerResponseAtTurn(ctx, session, conv, t, nil, toolTurn+1) + triggerResponseAtTurn(ctx, session, conv, t, nil, toolTurn+1, r) } } diff --git a/core/http/endpoints/openai/realtime_compactcoord.go b/core/http/endpoints/openai/realtime_compactcoord.go new file mode 100644 index 000000000..10c6304f4 --- /dev/null +++ b/core/http/endpoints/openai/realtime_compactcoord.go @@ -0,0 +1,79 @@ +package openai + +import ( + "context" + "sync" + + "github.com/mudler/LocalAI/core/http/endpoints/openai/compactcoord" + "github.com/mudler/xlog" +) + +// compactionSink wires the explicit compaction state machine +// (compactcoord.Coordinator — machine "M4" in docs/design/realtime-state-machines.md) +// into a conversation. +// +// It replaces the legacy `compacting atomic.Bool` single-flight guard: the +// coordinator owns whether a compaction is running, so a Trigger while one is +// already in flight is dropped (single-flight) and the background goroutine +// always reports Finished — the flag can never stick (invariant #9). +// +// run is the summarize+evict work for this conversation (captured at +// construction); StartCompaction spawns it and reports Finished when it returns. +// It takes a context derived from the sink's session-scoped ctx, so shutdown() +// can cancel an in-flight compaction. 
+type compactionSink struct {
+	// coord is the compaction state machine whose effects this sink performs.
+	coord *compactcoord.Coordinator
+	// run is the summarize+evict work; Perform invokes it once per started
+	// compaction. May be nil, in which case a started compaction does nothing.
+	run func(ctx context.Context)
+
+	// ctx is handed to every run; cancel ends it (see shutdown).
+	ctx    context.Context
+	cancel context.CancelFunc
+
+	// wg tracks the compaction goroutines so shutdown can join them.
+	wg sync.WaitGroup
+}
+
+// newCompactionSink builds a sink around run, with its own coordinator and a
+// cancelable context. The context is rooted at context.Background(), so the
+// only thing that cancels it is shutdown().
+func newCompactionSink(run func(ctx context.Context)) *compactionSink {
+	ctx, cancel := context.WithCancel(context.Background())
+	s := &compactionSink{
+		run:    run,
+		ctx:    ctx,
+		cancel: cancel,
+	}
+	// The coordinator calls back into s.Perform, so it is wired last.
+	s.coord = compactcoord.New(s)
+	return s
+}
+
+// trigger requests a compaction from the coordinator. It never blocks: the
+// coordinator decides whether one actually starts (it does not while one is
+// already running, nor after shutdown). A rejected event is only logged.
+func (s *compactionSink) trigger() {
+	err := s.coord.Apply(compactcoord.Trigger{})
+	if err == nil {
+		return
+	}
+	xlog.Error("compactcoord: trigger failed", "error", err)
+}
+
+// shutdown is the teardown hook used by the connection (M1) parent. In order:
+// cancel the shared context (so an in-flight summarizer call returns promptly
+// and the join is bounded), join the compaction goroutine, then drive the
+// coordinator to Terminated so nothing can start afterwards. This closes the
+// legacy gap where the fire-and-forget compaction goroutine could outlive the
+// session.
+//
+// NOTE(review): a trigger() racing with shutdown() between the join and the
+// Shutdown event could still spawn a goroutine after the join; this relies on
+// the callers of trigger() having been joined first — confirm.
+func (s *compactionSink) shutdown() {
+	s.cancel()
+	s.wg.Wait()
+
+	err := s.coord.Apply(compactcoord.Shutdown{})
+	if err != nil {
+		xlog.Error("compactcoord: shutdown apply failed", "error", err)
+	}
+}
+
+// Perform executes one effect. Called under the coordinator lock; StartCompaction
+// only spawns a goroutine, so it does not block.
+func (s *compactionSink) Perform(e compactcoord.Effect) { + switch e.(type) { + case compactcoord.StartCompaction: + s.wg.Add(1) + go func() { + defer s.wg.Done() + defer func() { + if err := s.coord.Apply(compactcoord.Finished{}); err != nil { + xlog.Error("compactcoord: finished apply failed", "error", err) + } + }() + if s.run != nil { + s.run(s.ctx) + } + }() + } +} diff --git a/core/http/endpoints/openai/realtime_compaction.go b/core/http/endpoints/openai/realtime_compaction.go index f79a2d7a2..3b1967465 100644 --- a/core/http/endpoints/openai/realtime_compaction.go +++ b/core/http/endpoints/openai/realtime_compaction.go @@ -222,7 +222,7 @@ func prefixMatches(items, snapshot []*types.MessageItemUnion) bool { // conv.Lock across the summarizer call: snapshot under lock, summarize unlocked, // commit under lock (re-validating the head is unchanged). On any error it // leaves the conversation untouched — items are never dropped without a summary. -func (s *Session) compact(conv *Conversation, model Model) { +func (s *Session) compact(ctx context.Context, conv *Conversation, model Model) { if model == nil { return } @@ -241,9 +241,10 @@ func (s *Session) compact(conv *Conversation, model Model) { prior := conv.Memory conv.Lock.Unlock() - // Summarize (unlocked). + // Summarize (unlocked). The timeout is derived from the caller's ctx so the + // connection teardown can cancel an in-flight summary (bounding the join). msgs := buildSummaryMessages(prior, renderItemsTranscript(overflow), s.MaxSummaryTokens) - ctx, cancel := context.WithTimeout(context.Background(), compactionTimeout) + ctx, cancel := context.WithTimeout(ctx, compactionTimeout) defer cancel() predFunc, err := model.Predict(ctx, msgs, nil, nil, nil, nil, nil, nil, nil, nil, nil) if err != nil { @@ -298,9 +299,13 @@ func (s *Session) summarizerModel() Model { } // maybeCompact schedules a background compaction when the live buffer has grown -// past the trigger and none is already running. 
Returns immediately. +// past the trigger and none is already running. Returns immediately. The +// single-flight guarantee (at most one compaction per conversation) is owned by +// the compaction coordinator (M4); see realtime_compactcoord.go. The actual +// summarize+evict work (and the lazy summary_model load) is the conversation's +// compaction-sink run closure, so it stays off the response path. func (s *Session) maybeCompact(conv *Conversation) { - if !s.CompactionEnabled { + if !s.CompactionEnabled || conv.compaction == nil { return } conv.Lock.Lock() @@ -309,18 +314,5 @@ func (s *Session) maybeCompact(conv *Conversation) { if !over { return } - if !conv.compacting.CompareAndSwap(false, true) { - return - } - go func() { - defer conv.compacting.Store(false) - // Resolve (and, for a configured summary_model, lazily load) the - // summarizer only when a compaction actually runs, off the response - // path — so the model load never blocks a user turn. - model := s.summarizerModel() - if model == nil { - return - } - s.compact(conv, model) - }() + conv.compaction.trigger() } diff --git a/core/http/endpoints/openai/realtime_compaction_test.go b/core/http/endpoints/openai/realtime_compaction_test.go index 5b19a8259..dd8180497 100644 --- a/core/http/endpoints/openai/realtime_compaction_test.go +++ b/core/http/endpoints/openai/realtime_compaction_test.go @@ -1,6 +1,7 @@ package openai import ( + "context" "errors" . 
"github.com/onsi/ginkgo/v2" @@ -198,7 +199,7 @@ var _ = Describe("compact", func() { s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512} m := &fakeModel{predictResp: backend.LLMResponse{Response: "ROLLED UP"}} - s.compact(conv, m) + s.compact(context.Background(), conv, m) Expect(conv.Memory).To(Equal("ROLLED UP")) Expect(len(conv.Items)).To(Equal(4)) @@ -213,7 +214,7 @@ var _ = Describe("compact", func() { s := &Session{CompactionEnabled: true, CompactionTrigger: 2, MaxHistoryItems: 1, MaxSummaryTokens: 512} m := &fakeModel{predictErr: errors.New("boom")} - s.compact(conv, m) + s.compact(context.Background(), conv, m) Expect(conv.Memory).To(Equal("")) Expect(len(conv.Items)).To(Equal(3)) @@ -227,7 +228,7 @@ var _ = Describe("compact", func() { s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512} m := &fakeModel{predictResp: backend.LLMResponse{Response: "planning the summaryCLEAN SUMMARY"}} - s.compact(conv, m) + s.compact(context.Background(), conv, m) Expect(conv.Memory).To(Equal("CLEAN SUMMARY")) Expect(conv.Memory).ToNot(ContainSubstring("planning")) @@ -236,7 +237,7 @@ var _ = Describe("compact", func() { It("does nothing when items are at or below the trigger", func() { conv := &Conversation{Items: []*types.MessageItemUnion{user("1", "a")}} s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4} - s.compact(conv, &fakeModel{predictResp: backend.LLMResponse{Response: "x"}}) + s.compact(context.Background(), conv, &fakeModel{predictResp: backend.LLMResponse{Response: "x"}}) Expect(conv.Memory).To(Equal("")) Expect(len(conv.Items)).To(Equal(1)) }) diff --git a/core/http/endpoints/openai/realtime_conncoord.go b/core/http/endpoints/openai/realtime_conncoord.go new file mode 100644 index 000000000..0dc6016bf --- /dev/null +++ b/core/http/endpoints/openai/realtime_conncoord.go @@ -0,0 +1,122 @@ +package openai + +import ( + "sync" + + 
"github.com/mudler/LocalAI/core/http/endpoints/openai/conncoord" + "github.com/mudler/xlog" +) + +// connSink wires the explicit connection-lifecycle state machine +// (conncoord.Coordinator — machine "M1" in docs/design/realtime-state-machines.md) +// into the realtime session handler. +// +// It replaces the legacy vadServerStarted bool + the `done` channel that was +// reassigned on every turn-detection toggle and closed from two sites (Part 2, +// failure mode 6). The coordinator owns whether the VAD goroutine is running, so +// the per-run done channel is created and closed in lockstep with that one state +// — closed exactly once, never resurrected after teardown. +// +// The connection machine is driven by the single session goroutine (the handler +// loop and its teardown), so this sink and its coordinator are loop-local; the +// Coordinator's lock only keeps State() race-free. +// +// Effects: +// - StartVAD: create a fresh done channel and spawn handleVAD on it (joined via wg). +// - StopVAD: close that done channel. +// - Teardown: stop the remaining input goroutines (opus decode, sound window), +// join everything, cancel in-flight responses, and remove the session — once. +type connSink struct { + session *Session + sessionID string + transport Transport + wg *sync.WaitGroup + + coord *conncoord.Coordinator + + // vadDone is the current VAD run's stop signal — recreated on each StartVAD, + // closed by StopVAD / Teardown. Owned solely by Perform (single goroutine). + vadDone chan struct{} + + // One-shot stop signals for the other input goroutines, registered by the + // handler when it starts them; closed once by Teardown. 
+ decodeDone chan struct{} + soundWindowDone chan struct{} +} + +func newConnSink(session *Session, sessionID string, t Transport, wg *sync.WaitGroup) *connSink { + s := &connSink{ + session: session, + sessionID: sessionID, + transport: t, + wg: wg, + } + s.coord = conncoord.New(s) + return s +} + +// setVAD requests the turn-detection goroutine match active. Idempotent. +func (s *connSink) setVAD(active bool) { + if err := s.coord.Apply(conncoord.SetVAD{Active: active}); err != nil { + xlog.Error("conncoord: setVAD failed", "error", err) + } +} + +// close tears the session down (once). Safe to call from multiple exit paths. +func (s *connSink) close() { + if err := s.coord.Apply(conncoord.Close{}); err != nil { + xlog.Error("conncoord: close failed", "error", err) + } +} + +// Perform executes one effect. Called by Coordinator.Apply under the coordinator +// lock; the connection coordinator is single-writer and torn down exactly once at +// the end of the session goroutine, so the blocking joins in Teardown never +// contend the lock. +func (s *connSink) Perform(e conncoord.Effect) { + switch e.(type) { + case conncoord.StartVAD: + xlog.Debug("Starting VAD goroutine...") + s.vadDone = make(chan struct{}) + done := s.vadDone + s.wg.Go(func() { + conversation := s.session.Conversations[s.session.DefaultConversationID] + handleVAD(s.session, conversation, s.transport, done) + }) + case conncoord.StopVAD: + xlog.Debug("Stopping VAD goroutine...") + close(s.vadDone) + s.vadDone = nil + case conncoord.Teardown: + // Tear down in dependency order, driving every child machine to its + // terminal state so none outlives the session (the hierarchy invariant in + // formal-verification/session_lifecycle.fizz: conn Torn => children terminal). + // + // 1. Stop the remaining input goroutines and join them (this joins the VAD + // goroutine, M2, via the StopVAD above + wg). 
+ if s.decodeDone != nil { + close(s.decodeDone) + } + if s.soundWindowDone != nil { + close(s.soundWindowDone) + } + s.wg.Wait() + + // 2. Terminate the response coordinator (M3): cancel the in-flight response + // and join all response goroutines (which also closes their TTS + // pipelines, M5). After this no response can start. + s.session.respSink.shutdown() + + // 3. Terminate every conversation's compaction coordinator (M4): cancel + + // join any in-flight summarize+evict so it cannot outlive the session. + for _, conv := range s.session.Conversations { + if conv.compaction != nil { + conv.compaction.shutdown() + } + } + + sessionLock.Lock() + delete(sessions, s.sessionID) + sessionLock.Unlock() + } +} diff --git a/core/http/endpoints/openai/realtime_doubles_test.go b/core/http/endpoints/openai/realtime_doubles_test.go index 10e608c17..6dc1c6ca5 100644 --- a/core/http/endpoints/openai/realtime_doubles_test.go +++ b/core/http/endpoints/openai/realtime_doubles_test.go @@ -74,6 +74,16 @@ type fakeModel struct { transcribeDeltas []string transcribeFinal *schema.TranscriptionResult + transcribeErr error + + // TranscribeLive scripting: liveErr makes the open fail (degrade path); + // liveEvents are delivered to onEvent synchronously at open; + // liveCloseEvents are delivered during Close (the finalize flush). + liveErr error + liveEvents []backend.LiveTranscriptionEvent + liveCloseEvents []backend.LiveTranscriptionEvent + liveOpened int + liveSession *fakeLiveSession // soundDetectionResult/soundDetectionErr drive the SoundDetection double so // the sound-event path can be exercised deterministically. 
@@ -97,7 +107,7 @@ func (m *fakeModel) VAD(context.Context, *schema.VADRequest) (*schema.VADRespons } func (m *fakeModel) Transcribe(context.Context, string, string, bool, bool, string) (*schema.TranscriptionResult, error) { - return m.transcribeFinal, nil + return m.transcribeFinal, m.transcribeErr } func (m *fakeModel) SoundDetection(context.Context, string, int, float32) (*schema.SoundClassificationResult, error) { @@ -150,4 +160,43 @@ func (m *fakeModel) TranscribeStream(_ context.Context, _, _ string, _, _ bool, return m.transcribeFinal, nil } +func (m *fakeModel) TranscribeLive(_ context.Context, _ string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) { + if m.liveErr != nil { + return nil, m.liveErr + } + m.liveOpened++ + for _, ev := range m.liveEvents { + onEvent(ev) + } + m.liveSession = &fakeLiveSession{onEvent: onEvent, closeEvents: m.liveCloseEvents} + return m.liveSession, nil +} + func (m *fakeModel) PredictConfig() *config.ModelConfig { return m.cfg } + +// fakeLiveSession records what semantic_vad fed and closed; closeEvents are +// replayed through onEvent during Close, mimicking the backend's finalize +// flush (trailing delta + Final) landing before Close returns. 
+type fakeLiveSession struct { + onEvent func(backend.LiveTranscriptionEvent) + closeEvents []backend.LiveTranscriptionEvent + fed [][]float32 + feedErr error + closed int +} + +func (s *fakeLiveSession) Feed(pcm []float32) error { + if s.feedErr != nil { + return s.feedErr + } + s.fed = append(s.fed, append([]float32(nil), pcm...)) + return nil +} + +func (s *fakeLiveSession) Close() error { + s.closed++ + for _, ev := range s.closeEvents { + s.onEvent(ev) + } + return nil +} diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go index 0dafa0a35..71f553980 100644 --- a/core/http/endpoints/openai/realtime_model.go +++ b/core/http/endpoints/openai/realtime_model.go @@ -102,6 +102,10 @@ func (m *transcriptOnlyModel) TranscribeStream(ctx context.Context, audio, langu return transcribeStream(ctx, m.modelLoader, *m.TranscriptionConfig, m.appConfig, audio, language, translate, diarize, prompt, onDelta) } +func (m *transcriptOnlyModel) TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) { + return backend.ModelTranscriptionLive(ctx, language, m.modelLoader, *m.TranscriptionConfig, m.appConfig, onEvent) +} + func (m *transcriptOnlyModel) PredictConfig() *config.ModelConfig { return nil } @@ -348,6 +352,10 @@ func (m *wrappedModel) TranscribeStream(ctx context.Context, audio, language str return transcribeStream(ctx, m.modelLoader, *m.TranscriptionConfig, m.appConfig, audio, language, translate, diarize, prompt, onDelta) } +func (m *wrappedModel) TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) { + return backend.ModelTranscriptionLive(ctx, language, m.modelLoader, *m.TranscriptionConfig, m.appConfig, onEvent) +} + func (m *wrappedModel) PredictConfig() *config.ModelConfig { return m.LLMConfig } diff --git 
a/core/http/endpoints/openai/realtime_respcoord.go b/core/http/endpoints/openai/realtime_respcoord.go new file mode 100644 index 000000000..c34ef8bd4 --- /dev/null +++ b/core/http/endpoints/openai/realtime_respcoord.go @@ -0,0 +1,143 @@ +package openai + +import ( + "context" + "sync" + "sync/atomic" + + "github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord" + "github.com/mudler/xlog" +) + +// responseSink wires the explicit response-coordination state machine +// (respcoord.Coordinator — machine "M3" in docs/design/realtime-state-machines.md) +// into a realtime session. +// +// It replaces the legacy startResponse/cancelActiveResponse pair, whose +// activeResponse* fields were written from two goroutines (the client read-loop +// and the VAD goroutine) with the <-done wait performed outside the lock — the +// dual-writer race documented in Part 2 (failure mode 2). The coordinator +// serializes every start/cancel/finish decision behind one lock and guarantees +// at most one live response, so the two callers can no longer interleave into +// two overlapping responses. +// +// Each response runs as a goroutine spawned here. The effects map as: +// - StartResponse: spawn the registered body with a fresh cancelable context. +// - CancelResponse: cancel that context (cooperative — the body stops at its +// next ctx checkpoint and emits its own response.done{cancelled}). +// - EmitTerminal: currently a no-op. response.done is still emitted by the +// response body itself; making this the single authoritative terminal (one +// response.done per response.create, with Output+Usage populated) is the +// next step and does not change the coordination guarantees here. 
+type responseSink struct {
+	// mu guards cancels and bodies.
+	mu sync.Mutex
+	// coord is the response state machine whose effects this sink performs.
+	coord *respcoord.Coordinator
+	// cancels holds the cancel func of each running response, keyed by id.
+	cancels map[respcoord.ResponseID]context.CancelFunc
+	// bodies holds registered-but-not-yet-started response bodies, keyed by id.
+	bodies map[respcoord.ResponseID]responseBody
+	// seq mints response ids.
+	seq atomic.Uint64
+	// wg tracks every response goroutine so wait/shutdown can join them.
+	wg sync.WaitGroup
+}
+
+// responseBody is what issue registers for one response: the work to run and
+// the context its cancelable context is derived from.
+type responseBody struct {
+	parent context.Context
+	run    func(ctx context.Context)
+}
+
+// newResponseSink builds a sink with empty registries and its own coordinator.
+func newResponseSink() *responseSink {
+	s := &responseSink{
+		cancels: map[respcoord.ResponseID]context.CancelFunc{},
+		bodies:  map[respcoord.ResponseID]responseBody{},
+	}
+	s.coord = respcoord.New(s)
+	return s
+}
+
+// issue registers a response body and asks the coordinator to start it. Any
+// in-flight response is superseded (cancelled, with its own terminal) first,
+// atomically inside the coordinator — no caller-side locking, no dual-writer
+// race. Non-blocking: the superseded response drains concurrently and its later
+// Finished is ignored as stale.
+//
+// The registration never outlives this call: Perform(StartResponse) consumes
+// the entry when the start is performed, and whatever is left afterwards (the
+// coordinator rejected or ignored the Start, e.g. after Shutdown) is removed
+// here so the closure and its parent context are not retained forever.
+func (s *responseSink) issue(parent context.Context, source respcoord.Source, run func(ctx context.Context)) {
+	id := respcoord.ResponseID(s.seq.Add(1))
+	s.mu.Lock()
+	s.bodies[id] = responseBody{parent: parent, run: run}
+	s.mu.Unlock()
+
+	err := s.coord.Apply(respcoord.Start{ID: id, Source: source})
+
+	// No-op when StartResponse already consumed the entry. Taken after Apply
+	// returned, so the coordinator lock is never held together with s.mu here.
+	// NOTE(review): assumes the coordinator performs StartResponse for this id
+	// synchronously inside the Apply above — confirm against respcoord.
+	s.mu.Lock()
+	delete(s.bodies, id)
+	s.mu.Unlock()
+
+	if err != nil {
+		xlog.Error("respcoord: start failed", "error", err)
+	}
+}
+
+// cancel cancels the in-flight response, if any. Non-blocking (barge-in must not
+// stall the VAD tick).
+func (s *responseSink) cancel(source respcoord.Source) {
+	if err := s.coord.Apply(respcoord.Cancel{Source: source}); err != nil {
+		xlog.Error("respcoord: cancel failed", "error", err)
+	}
+}
+
+// wait blocks until every response goroutine (the active one plus any draining
+// superseded ones) has exited. Used at teardown so the session is never deleted
+// out from under a running response.
+func (s *responseSink) wait() {
+	s.wg.Wait()
+}
+
+// shutdown terminates the coordinator (cancelling any in-flight response) and
+// then joins all response goroutines.
After this the coordinator is in its +// absorbing Terminated state, so no further response can be issued — the +// connection (M1) parent's teardown uses this to guarantee no response outlives +// the session (see formal-verification/session_lifecycle.fizz). +func (s *responseSink) shutdown() { + if err := s.coord.Apply(respcoord.Shutdown{}); err != nil { + xlog.Error("respcoord: shutdown failed", "error", err) + } + s.wait() +} + +// Perform executes one effect. It is called by Coordinator.Apply while the +// coordinator lock is held, so it must not block. It briefly takes s.mu but +// never acquires the coordinator lock while holding s.mu; the spawned +// goroutine's Finished apply takes the coordinator lock only AFTER releasing +// s.mu, so there is no lock cycle. +func (s *responseSink) Perform(e respcoord.Effect) { + switch eff := e.(type) { + case respcoord.StartResponse: + s.mu.Lock() + body := s.bodies[eff.ID] + delete(s.bodies, eff.ID) + parent := body.parent + if parent == nil { + parent = context.Background() + } + ctx, cancel := context.WithCancel(parent) + s.cancels[eff.ID] = cancel + s.mu.Unlock() + + s.wg.Go(func() { + defer func() { + s.mu.Lock() + delete(s.cancels, eff.ID) + s.mu.Unlock() + // Report completion. If this response was superseded/cancelled + // the id is stale and the coordinator ignores it (so the + // terminal is never emitted twice). + if err := s.coord.Apply(respcoord.Finished{ID: eff.ID}); err != nil { + xlog.Error("respcoord: finished apply failed", "error", err) + } + }() + if body.run != nil { + body.run(ctx) + } + }) + case respcoord.CancelResponse: + s.mu.Lock() + cancel := s.cancels[eff.ID] + s.mu.Unlock() + if cancel != nil { + cancel() + } + case respcoord.EmitTerminal: + // No-op for now: the response body still emits its own response.done. + // Wiring the authoritative single terminal here is the next step. 
+ } +} diff --git a/core/http/endpoints/openai/realtime_semantic_vad.go b/core/http/endpoints/openai/realtime_semantic_vad.go new file mode 100644 index 000000000..66dfc6efe --- /dev/null +++ b/core/http/endpoints/openai/realtime_semantic_vad.go @@ -0,0 +1,350 @@ +package openai + +import ( + "context" + "strings" + + "github.com/mudler/LocalAI/core/backend" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/endpoints/openai/types" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/xlog" +) + +// Semantic (EOU-driven) turn detection. +// +// With turn_detection.type == "semantic_vad", the transcription model is fed +// the microphone audio live while the user speaks and its end-of-utterance +// token turns the silence window dynamic: an immediate commit once the +// token fires (the model judged the user finished and expects a reply), the +// much longer eagerness fallback when it does not (mid-thought pause). The +// silero VAD stays in charge of speech_started/barge-in and the actual +// silence measurement, so a spurious EOU mid-speech cannot cut the user off +// — the commit still requires real silence. + +const ( + // semanticEouSilenceSec is the extra silence required to commit once the + // end-of-utterance token has fired. Zero: the token already trails the + // audio by the encoder chunk schedule plus a VAD tick (~0.3-0.9s), and + // the commit check only runs after silero closes the speech segment — + // which itself takes real silence — so any window on top is pure added + // response delay. + semanticEouSilenceSec = 0.0 + + // liveEventsBuffer sizes the recv-callback → VAD-tick handoff channel. + // Events arrive at a few per second and the ticker drains every 300ms; + // a full channel means the loop is wedged, and dropping (with a warning) + // beats blocking the backend's recv goroutine. 
+ liveEventsBuffer = 64 +) + +// eagernessMaxSilenceSec maps the OpenAI semantic_vad eagerness to the +// fallback silence window used when no end-of-utterance token was seen: +// low waits longest, high responds fastest, auto/empty equals medium — +// the same 8s/4s/2s max timeouts OpenAI documents. +func eagernessMaxSilenceSec(eagerness string) float64 { + switch strings.ToLower(strings.TrimSpace(eagerness)) { + case "low": + return 8 + case "high": + return 2 + default: // "medium", "auto", "" + return 4 + } +} + +// liveUtterance is one committed turn's transcript as produced by the live +// stream. Its delta events were already streamed to the client as they +// arrived (keyed by the turn's item id), so only the final text travels here. +type liveUtterance struct { + Text string +} + +// liveTurnState is handleVAD's per-session live-ASR companion for +// semantic_vad. One live stream is opened per user turn (begun when the VAD +// first reports speech, finalized at commit) — the underlying decode session +// grows with fed audio, so per-turn streams keep it bounded. All fields are +// owned by the handleVAD goroutine; the backend's recv callback only writes +// into the buffered events channel. +type liveTurnState struct { + session *Session + transport Transport // live caption deltas are sent here as they drain + events chan backend.LiveTranscriptionEvent + + live backend.LiveTranscriptionSession // nil between turns + unavailable bool // sticky: backend can't do live ASR, degrade for the session + + fed16k int // 16k samples of the current buffer already fed + // eouAtSec is the audio time of the most recent EOU this turn (0 = none). + // It is a recorded fact: set when an EOU drains and never toggled off + // mid-turn. Whether it still governs the trailing silence is derived + // purely by eouPending() from this plus the live VAD segments. 
+ eouAtSec float64 + parts []string // deltas accumulated for the current turn + finalText string // authoritative full-turn text from the Final event + itemID string // the turn's conversation item id, allocated at openTurn + deltasSent bool // at least one caption delta reached the client this turn +} + +func newLiveTurnState(session *Session, transport Transport) *liveTurnState { + return &liveTurnState{ + session: session, + transport: transport, + events: make(chan backend.LiveTranscriptionEvent, liveEventsBuffer), + } +} + +func (l *liveTurnState) open() bool { return l.live != nil } + +// openTurn starts the turn's live stream under the caller-supplied item id. A +// failure (most commonly the backend's typed "live transcription unsupported" +// signal) degrades the whole session to silence-only detection — warned once, +// then sticky. +// +// The item id is supplied by the turn coordinator (turncoord) rather than minted +// here: it is allocated when the turn STARTS so caption deltas can stream to the +// client while the user is still speaking, and the committed event and final +// transcript reuse it (replacing the partial text). The coordinator carries the +// same id on its CommitTurn/DiscardTurn effects, so the committed event always +// matches the captions. 
+func (l *liveTurnState) openTurn(ctx context.Context, itemID string) bool {
+	switch {
+	case l.live != nil:
+		// A stream is already open for this turn; keep it and its item id.
+		return true
+	case l.unavailable:
+		// Sticky degrade: an earlier open or feed failed.
+		return false
+	}
+
+	var language string
+	if cfg := l.session.InputAudioTranscription; cfg != nil {
+		language = cfg.Language
+	}
+
+	// Event handoff to the VAD tick: never block the caller of this callback —
+	// when the buffer is full the event is dropped with a warning.
+	forward := func(ev backend.LiveTranscriptionEvent) {
+		select {
+		case l.events <- ev:
+		default:
+			xlog.Warn("semantic_vad: live transcription event dropped (event channel full)")
+		}
+	}
+
+	stream, err := l.session.ModelInterface.TranscribeLive(ctx, language, forward)
+	if err != nil {
+		l.unavailable = true
+		xlog.Warn("semantic_vad: live transcription unavailable; degrading to silence-only turn detection", "error", err)
+		return false
+	}
+
+	// Fresh per-turn bookkeeping first, then bind the stream and item id.
+	l.resetTurn()
+	l.live, l.itemID = stream, itemID
+	return true
+}
+
+// feedNewAudio sends the live stream whatever part of the resampled buffer it
+// has not seen yet, minus the very last sample. ResampleInt16 is prefix-stable
+// except for its last output sample, so holding that one back keeps successive
+// whole-buffer resamples bit-identical over the range already fed. A failed
+// feed discards the turn and degrades the session to silence-only detection.
+func (l *liveTurnState) feedNewAudio(aints16k []int16) {
+	if !l.open() {
+		return
+	}
+	limit := len(aints16k) - 1
+	if limit <= l.fed16k {
+		// Nothing new beyond the held-back sample.
+		return
+	}
+
+	chunk := int16sToFloat32(aints16k[l.fed16k:limit])
+	err := l.live.Feed(chunk)
+	if err == nil {
+		l.fed16k = limit
+		return
+	}
+
+	xlog.Warn("semantic_vad: live feed failed; degrading to silence-only turn detection", "error", err)
+	l.discardTurn()
+	l.unavailable = true
+}
+
+// drainEvents folds everything the live stream produced since the last tick
+// into the turn state. audioSec (the current buffer length in seconds) marks
+// WHEN an EOU was observed, so later VAD segments can distinguish speech
+// that resumed after it.
+func (l *liveTurnState) drainEvents(audioSec float64) {
+	for {
+		var ev backend.LiveTranscriptionEvent
+		select {
+		case ev = <-l.events:
+		default:
+			// Channel empty: everything produced so far has been folded in.
+			return
+		}
+
+		if delta := ev.Delta; delta != "" {
+			l.parts = append(l.parts, delta)
+			// Live captions: the delta goes out right away under the turn's
+			// item id, so the browser shows text while the user is still
+			// speaking; the completed event at commit replaces it with the
+			// authoritative transcript.
+			if l.transport != nil && l.itemID != "" {
+				sendEvent(l.transport, types.ConversationItemInputAudioTranscriptionDeltaEvent{
+					ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+					ItemID:          l.itemID,
+					ContentIndex:    0,
+					Delta:           delta,
+				})
+				l.deltasSent = true
+			}
+		}
+
+		if ev.Eou {
+			// Only the position is recorded — no flag is flipped. Whether
+			// this EOU still governs the trailing silence is decided later
+			// by eouPending(), from this value and the live VAD segments.
+			l.eouAtSec = audioSec
+			xlog.Debug("semantic_vad: EOU token observed", "audio_s", audioSec)
+		}
+
+		if ev.Eob {
+			// End of a backchannel ("uh-huh"): the user is still listening,
+			// not yielding the turn. Deliberately NOT a commit trigger.
+			xlog.Debug("semantic_vad: EOB (backchannel) observed", "audio_s", audioSec)
+		}
+
+		if final := ev.Final; final != nil && strings.TrimSpace(final.Text) != "" {
+			l.finalText = final.Text
+		}
+	}
+}
+
+// eouPending reports whether the recorded EOU still applies to the current
+// trailing silence. It is a pure function of the recorded EOU position and the
+// VAD's live view — there is no stored boolean that can fall out of sync.
+//
+// An EOU stops applying only once the user has STARTED a new utterance after
+// it (a segment whose start is past the EOU): that is genuine resumed speech,
+// so the earlier yield no longer holds. An in-progress segment whose speech
+// began BEFORE the EOU is NOT resumed speech — it is just silero still padding
+// before it closes the segment, which is the normal state at the instant the
+// (predictive) EOU fires. Treating that as resumed speech was the bug that
+// cleared the flag on the very tick the token arrived, dropping almost every
+// EOU to the eagerness timeout.
+func (l *liveTurnState) eouPending(segments []schema.VADSegment) bool {
+	if l.eouAtSec == 0 {
+		// Zero is the "no EOU recorded this turn" sentinel.
+		return false
+	}
+	n := len(segments)
+	if n == 0 {
+		return false
+	}
+	lastStart := float64(segments[n-1].Start)
+	return lastStart <= l.eouAtSec
+}
+
+// thresholdSec picks the silence window required to commit: the eagerness
+// fallback while no EOU is pending, and semanticEouSilenceSec (zero — any
+// VAD-confirmed silence commits) once the model said the utterance is over.
+func (l *liveTurnState) thresholdSec(eouPending bool, sv *types.RealtimeSessionSemanticVad) float64 {
+	if !eouPending {
+		return eagernessMaxSilenceSec(sv.Eagerness)
+	}
+	return semanticEouSilenceSec
+}
+
+// commitTrigger labels how a commit decision was reached, for the per-turn
+// timing log: "eou" together with the token's lag behind the VAD's speech end,
+// or "timeout" (lag 0) when the eagerness fallback elapsed without one. The lag
+// is the number the user needs to tell a slow EOU emission apart from loop
+// overhead.
+func (l *liveTurnState) commitTrigger(eouPending bool, speechEndSec float64) (trigger string, eouLagSec float64) {
+	if eouPending {
+		return "eou", l.eouAtSec - speechEndSec
+	}
+	return "timeout", 0
+}
+
+// finishTurn finalizes the live stream (flushing the decode tail — the last
+// ~2 encoder frames of text only appear here), folds the terminal events in,
+// and returns the turn's transcript. Returns nil when the stream never
+// produced text (the VAD triggered on something the model heard nothing in).
+func (l *liveTurnState) finishTurn(audioSec float64) *liveUtterance {
+	stream := l.live
+	if stream == nil {
+		return nil
+	}
+	// A failed finalize is only logged: whatever text already arrived is
+	// still collected below.
+	closeErr := stream.Close()
+	if closeErr != nil {
+		xlog.Warn("semantic_vad: live transcription finalize failed", "error", closeErr)
+	}
+	l.live = nil
+	l.drainEvents(audioSec)
+
+	// Prefer the Final text; otherwise use the joined deltas. Both must be
+	// read before resetTurn wipes them.
+	transcript := strings.TrimSpace(l.finalText)
+	if transcript == "" {
+		transcript = l.previewText()
+	}
+	l.resetTurn()
+	if transcript == "" {
+		return nil
+	}
+	return &liveUtterance{Text: transcript}
+}
+
+// discardTurn abandons the turn in progress (no-speech buffer clear, feed
+// failure, session teardown). The stream is closed, pending events are
+// drained, and the transcript is dropped. If caption deltas reached the
+// client, a failed event retracts them so no partial entry is left behind.
+func (l *liveTurnState) discardTurn() {
+	if stream := l.live; stream != nil {
+		_ = stream.Close()
+		l.live = nil
+	}
+	l.drainEvents(0)
+
+	// Evaluated after the drain, which may itself have forwarded deltas.
+	captionsShown := l.deltasSent && l.transport != nil && l.itemID != ""
+	if captionsShown {
+		retraction := types.ConversationItemInputAudioTranscriptionFailedEvent{
+			ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
+			ItemID:          l.itemID,
+			ContentIndex:    0,
+			Error: types.Error{
+				Type:    "transcription_discarded",
+				Message: "turn discarded before commit",
+			},
+		}
+		sendEvent(l.transport, retraction)
+	}
+	l.resetTurn()
+}
+
+// resetTurn returns every per-turn field to its zero value so the next turn
+// starts clean. It does not touch the live stream handle.
+func (l *liveTurnState) resetTurn() {
+	l.fed16k, l.eouAtSec = 0, 0
+	l.parts, l.finalText = nil, ""
+	l.itemID, l.deltasSent = "", false
+}
+
+// previewText returns the deltas received so far, joined and trimmed. It
+// feeds the retranscribe comparison log and is finishTurn's fallback when no
+// Final event arrived.
+func (l *liveTurnState) previewText() string {
+	joined := strings.Join(l.parts, "")
+	return strings.TrimSpace(joined)
+}
+
+// int16sToFloat32 converts PCM to the [-1,1] float form the live stream
+// feeds the model (the same scaling runVAD's go-audio conversion applies).
+func int16sToFloat32(samples []int16) []float32 {
+	out := make([]float32, len(samples))
+	for i, s := range samples {
+		out[i] = float32(s) / 32768.0 // 32768 = 2^15: the int16 minimum (-32768) maps to exactly -1
+	}
+	return out
+}
+
+// turnDetectionActive reports whether the session has any automatic turn
+// detection (server or semantic VAD) that should run the handleVAD loop.
+func turnDetectionActive(td *types.TurnDetectionUnion) bool {
+	return td != nil && (td.ServerVad != nil || td.SemanticVad != nil)
+}
+
+// defaultTurnDetection seeds a new session's turn detection from the
+// pipeline's server-side default: semantic_vad pipelines start sessions in
+// semantic mode (clients can still override via session.update); everything
+// else keeps the historical server_vad defaults.
+func defaultTurnDetection(cfg *config.ModelConfig) *types.TurnDetectionUnion {
+	if cfg != nil && cfg.Pipeline.TurnDetectionSemantic() { // a nil cfg takes the server_vad path below
+		return &types.TurnDetectionUnion{
+			SemanticVad: &types.RealtimeSessionSemanticVad{
+				CreateResponse: true,
+				Eagerness:      cfg.Pipeline.TurnDetection.Eagerness,
+			},
+		}
+	}
+	return &types.TurnDetectionUnion{
+		ServerVad: &types.ServerVad{
+			Threshold:         0.5,
+			PrefixPaddingMs:   300,
+			SilenceDurationMs: 500,
+			CreateResponse:    true,
+		},
+	}
+}
diff --git a/core/http/endpoints/openai/realtime_semantic_vad_test.go b/core/http/endpoints/openai/realtime_semantic_vad_test.go
new file mode 100644
index 000000000..c3f5d7ef8
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_semantic_vad_test.go
@@ -0,0 +1,414 @@
+package openai
+
+import (
+	"context"
+	"errors"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/schema"
+)
+
+var _ = Describe("eagernessMaxSilenceSec", func() {
+	DescribeTable("maps eagerness to the no-EOU fallback window",
+		func(eagerness string, want float64) { // want is the fallback window in seconds
+			Expect(eagernessMaxSilenceSec(eagerness)).To(Equal(want))
+		},
+		Entry("low", "low", 8.0),
+		Entry("medium", "medium", 4.0),
+		Entry("high", "high", 2.0),
+		Entry("auto equals medium", "auto", 4.0),
+		Entry("empty equals medium", "", 4.0),
+		Entry("case and space insensitive", " High ", 2.0),
+		Entry("unknown equals medium", "frantic", 4.0),
+	)
+})
+
+var _ = Describe("turnDetectionActive", func() {
+	It("is active for server and semantic VAD, inactive otherwise", func() {
+		Expect(turnDetectionActive(nil)).To(BeFalse())
+		Expect(turnDetectionActive(&types.TurnDetectionUnion{})).To(BeFalse()) // non-nil union with neither mode set
+		Expect(turnDetectionActive(&types.TurnDetectionUnion{ServerVad: &types.ServerVad{}})).To(BeTrue())
+		Expect(turnDetectionActive(&types.TurnDetectionUnion{SemanticVad: &types.RealtimeSessionSemanticVad{}})).To(BeTrue())
+	})
+})
+
+var _ = Describe("defaultTurnDetection", func() {
+	It("keeps the historical server_vad defaults for non-semantic pipelines", func() {
+		td := defaultTurnDetection(&config.ModelConfig{})
+		Expect(td.ServerVad).NotTo(BeNil())
+		Expect(td.SemanticVad).To(BeNil())
+		Expect(td.ServerVad.SilenceDurationMs).To(Equal(int64(500)))
+		Expect(td.ServerVad.CreateResponse).To(BeTrue())
+	})
+
+	It("seeds semantic_vad with the pipeline's eagerness", func() {
+		cfg := &config.ModelConfig{}
+		cfg.Pipeline.TurnDetection.Type = "semantic_vad"
+		cfg.Pipeline.TurnDetection.Eagerness = "high"
+		td := defaultTurnDetection(cfg)
+		Expect(td.SemanticVad).NotTo(BeNil())
+		Expect(td.ServerVad).To(BeNil())
+		Expect(td.SemanticVad.Eagerness).To(Equal("high"))
+		Expect(td.SemanticVad.CreateResponse).To(BeTrue())
+	})
+
+	It("treats a nil config as server_vad", func() {
+		Expect(defaultTurnDetection(nil).ServerVad).NotTo(BeNil())
+	})
+})
+
+var _ = Describe("int16sToFloat32", func() {
+	It("scales like the VAD conversion", func() {
+		out := int16sToFloat32([]int16{0, 16384, -32768})
+		Expect(out).To(HaveLen(3))
+		Expect(out[0]).To(BeNumerically("~", 0.0, 1e-6))
+		Expect(out[1]).To(BeNumerically("~", 0.5, 1e-6)) // 16384 / 32768
+		Expect(out[2]).To(BeNumerically("~", -1.0, 1e-6))
+	})
+})
+
+var _ = Describe("liveTurnState", func() {
+	var (
+		m   *fakeModel
+		lts *liveTurnState
+		ftr *fakeTransport
+	)
+
+	newSemanticSession := func(m *fakeModel) *Session {
+		return &Session{
+			InputAudioTranscription: &types.AudioTranscription{},
+			ModelInterface:          m,
+		}
+	}
+
+	BeforeEach(func() { // fresh fake model, transport and turn state for every spec
+		m = &fakeModel{}
+		ftr = &fakeTransport{}
+		lts = newLiveTurnState(newSemanticSession(m), ftr)
+	})
+
+	Describe("openTurn", func() {
+		It("opens once per turn and reports open()", func() {
+			Expect(lts.open()).To(BeFalse())
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			Expect(lts.open()).To(BeTrue())
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue(), "idempotent while open")
+			Expect(m.liveOpened).To(Equal(1)) // the second openTurn did not open another stream
+		})
+
+		It("degrades stickily when the backend cannot do live transcription", func() {
+			m.liveErr = errors.New("rpc error: code = Unimplemented desc = live transcription unsupported")
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeFalse())
+			Expect(lts.unavailable).To(BeTrue())
+
+			// Later turns never retry: the failure is per-session sticky.
+			m.liveErr = nil
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeFalse())
+			Expect(m.liveOpened).To(Equal(0))
+		})
+	})
+
+	Describe("feedNewAudio", func() {
+		It("feeds only the unfed tail and holds back the final resampled sample", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+
+			lts.feedNewAudio([]int16{1, 2, 3, 4})
+			Expect(m.liveSession.fed).To(HaveLen(1))
+			Expect(m.liveSession.fed[0]).To(HaveLen(3), "last sample held back")
+
+			// Same buffer grown by two samples: only the delta is fed.
+			lts.feedNewAudio([]int16{1, 2, 3, 4, 5, 6})
+			Expect(m.liveSession.fed).To(HaveLen(2))
+			Expect(m.liveSession.fed[1]).To(HaveLen(2))
+
+			// No growth past the holdback: nothing fed.
+			lts.feedNewAudio([]int16{1, 2, 3, 4, 5, 6})
+			Expect(m.liveSession.fed).To(HaveLen(2))
+		})
+
+		It("degrades and closes the turn when a feed fails", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			m.liveSession.feedErr = errors.New("backend gone")
+			sess := m.liveSession // captured up front so closed is asserted on this stream
+
+			lts.feedNewAudio([]int16{1, 2, 3, 4})
+
+			Expect(lts.open()).To(BeFalse())
+			Expect(lts.unavailable).To(BeTrue())
+			Expect(sess.closed).To(Equal(1))
+		})
+	})
+
+	Describe("event handling and the dynamic threshold", func() {
+		sv := &types.RealtimeSessionSemanticVad{Eagerness: "high"}
+
+		It("uses the eagerness fallback until an EOU is recorded, then commits without an extra window", func() {
+			Expect(lts.thresholdSec(false, sv)).To(Equal(2.0)) // "high" eagerness fallback
+			Expect(lts.thresholdSec(true, sv)).To(Equal(semanticEouSilenceSec))
+
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			lts.session.ModelInterface.(*fakeModel).liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hello ", Eou: false})
+			lts.session.ModelInterface.(*fakeModel).liveSession.onEvent(backend.LiveTranscriptionEvent{Eou: true})
+			lts.drainEvents(3.3) // 3.3 is the audio position recorded as eouAtSec
+
+			Expect(lts.eouAtSec).To(BeNumerically("~", 3.3, 1e-9))
+			Expect(lts.previewText()).To(Equal("hello")) // previewText trims the delta's trailing space
+		})
+
+		// The bug this replaces: the (predictive) EOU routinely arrives while
+		// silero is still padding the speech segment open. eouPending must NOT
+		// read that as resumed speech.
+		It("keeps the EOU pending while silero is still closing the same segment", func() {
+			lts.eouAtSec = 3.3
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 0}})).To(BeTrue(), "segment began before the EOU and is merely unclosed")
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}})).To(BeTrue(), "and still pending once it closes")
+		})
+
+		It("drops the EOU only when a new utterance starts after it (resumed speech)", func() {
+			lts.eouAtSec = 3.3
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}, {Start: 4.0, End: 0}})).To(BeFalse())
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}, {Start: 4.0, End: 5.0}})).To(BeFalse())
+		})
+
+		It("has no pending EOU before one is recorded", func() {
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}})).To(BeFalse())
+			Expect(lts.eouPending(nil)).To(BeFalse())
+		})
+
+		It("does not arm the commit threshold on an EOB backchannel", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			lts.session.ModelInterface.(*fakeModel).liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "uh-huh", Eob: true})
+			lts.drainEvents(2.0)
+
+			Expect(lts.eouAtSec).To(BeZero(), "a backchannel is not the user yielding the turn")
+			Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 1.8}})).To(BeFalse(), "still on the eagerness fallback")
+			Expect(lts.previewText()).To(Equal("uh-huh"), "the backchannel text still lands in the transcript")
+		})
+
+		It("reports the commit trigger and the EOU token's lag behind speech end", func() {
+			trigger, lag := lts.commitTrigger(false, 3.2)
+			Expect(trigger).To(Equal("timeout"))
+			Expect(lag).To(BeZero())
+
+			lts.eouAtSec = 3.5
+			trigger, lag = lts.commitTrigger(true, 3.2)
+			Expect(trigger).To(Equal("eou"))
+			Expect(lag).To(BeNumerically("~", 0.3, 1e-9)) // 3.5 - 3.2
+		})
+	})
+
+	Describe("finishTurn", func() {
+		It("finalizes the stream, prefers the Final text, and resets for the next turn", func() {
+			m.liveCloseEvents = []backend.LiveTranscriptionEvent{
+				{Delta: " world"},
+				{Final: &schema.TranscriptionResult{Text: "hello world", Eou: true}},
+			}
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			sess := m.liveSession
+			sess.onEvent(backend.LiveTranscriptionEvent{Delta: "hello", Eou: true})
+			lts.drainEvents(2.0)
+
+			ut := lts.finishTurn(2.5)
+
+			Expect(sess.closed).To(Equal(1))
+			Expect(ut).NotTo(BeNil())
+			Expect(ut.Text).To(Equal("hello world"), "Final event text wins over joined deltas")
+			Expect(lts.open()).To(BeFalse())
+			Expect(lts.eouAtSec).To(BeZero()) // per-turn state cleared by resetTurn
+			Expect(lts.parts).To(BeEmpty())
+			Expect(lts.fed16k).To(BeZero())
+		})
+
+		It("returns nil when the stream heard nothing", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			Expect(lts.finishTurn(1.0)).To(BeNil())
+			Expect(m.liveSession.closed).To(Equal(1))
+		})
+
+		It("is a no-op without an open stream", func() {
+			Expect(lts.finishTurn(1.0)).To(BeNil())
+		})
+	})
+
+	Describe("discardTurn", func() {
+		It("closes the stream, drops the transcript and retracts streamed captions", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			sess := m.liveSession
+			sess.onEvent(backend.LiveTranscriptionEvent{Delta: "noise"})
+			lts.drainEvents(1.0) // forwards the delta, so deltasSent is set before the discard
+
+			lts.discardTurn()
+
+			Expect(sess.closed).To(Equal(1))
+			Expect(lts.open()).To(BeFalse())
+			Expect(lts.parts).To(BeEmpty())
+			Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(1),
+				"the client saw caption deltas for this turn — it must be told to drop them")
+		})
+
+		It("sends no failed event when no captions ever reached the client", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			lts.discardTurn()
+			Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(0))
+		})
+	})
+
+	Describe("live captions", func() {
+		It("streams each delta to the client under the turn's item id as it drains", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			turnID := lts.itemID
+			Expect(turnID).NotTo(BeEmpty(), "the item id exists from turn open so captions can reference it")
+
+			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hel"})
+			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "lo"})
+			lts.drainEvents(1.0)
+
+			var got []types.ConversationItemInputAudioTranscriptionDeltaEvent
+			for _, e := range ftr.events { // keep only the caption delta events, in send order
+				if d, ok := e.(types.ConversationItemInputAudioTranscriptionDeltaEvent); ok {
+					got = append(got, d)
+				}
+			}
+			Expect(got).To(HaveLen(2))
+			Expect(got[0].Delta).To(Equal("hel"))
+			Expect(got[1].Delta).To(Equal("lo"))
+			Expect(got[0].ItemID).To(Equal(turnID))
+			Expect(got[1].ItemID).To(Equal(turnID))
+			Expect(lts.deltasSent).To(BeTrue())
+		})
+
+		It("finishTurn does not retract captions — the commit's completed event supersedes them", func() {
+			Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
+			m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hello"})
+			lts.drainEvents(1.0)
+
+			Expect(lts.finishTurn(1.5)).NotTo(BeNil())
+			Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(0))
+		})
+	})
+})
+
+// commitUtteranceWithTranscript routes the three transcript sources: the
+// retranscribe gate's batch decode, the live stream's accumulated text, and
+// the historical file path.
+var _ = Describe("commitUtteranceWithTranscript", func() {
+	newTranscriptionOnlySession := func(m *fakeModel, streamTranscription bool) *Session {
+		cfg := &config.ModelConfig{}
+		if streamTranscription { // opt in to pipeline.streaming.transcription
+			on := true
+			cfg.Pipeline.Streaming.Transcription = &on
+		}
+		return &Session{
+			TranscriptionOnly:       true, // stop after the transcript: no LLM/TTS in these specs
+			InputAudioTranscription: &types.AudioTranscription{},
+			ModelConfig:             cfg,
+			ModelInterface:          m,
+		}
+	}
+
+	It("uses the gate's batch transcript and never re-runs the backend", func() {
+		m := &fakeModel{transcribeErr: errors.New("must not be called")} // NOTE(review): presumably returned if the backend is invoked — confirm against fakeModel
+		session := newTranscriptionOnlySession(m, true)
+		tr := &fakeTransport{}
+
+		commitUtteranceWithTranscript(context.Background(), []byte{1, 2}, nil, // nil live utterance: the batch result is the source
+			&schema.TranscriptionResult{Text: "batch text", Eou: true}, "item_turn", session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+	})
+
+	It("emits only the completed event for a live transcript — captions already streamed during the turn", func() {
+		m := &fakeModel{transcribeErr: errors.New("must not be called")}
+		session := newTranscriptionOnlySession(m, true)
+		tr := &fakeTransport{}
+
+		commitUtteranceWithTranscript(context.Background(), []byte{1, 2},
+			&liveUtterance{Text: "hello"}, nil, "item_turn", session, &Conversation{}, tr)
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+
+		var completed types.ConversationItemInputAudioTranscriptionCompletedEvent
+		for _, e := range tr.events {
+			if c, ok := e.(types.ConversationItemInputAudioTranscriptionCompletedEvent); ok {
+				completed = c // keeps the last completed event seen
+			}
+		}
+		Expect(completed.ItemID).To(Equal("item_turn"),
+			"completed must reuse the caption deltas' item id so the client replaces, not duplicates")
+		Expect(completed.Transcript).To(Equal("hello"))
+	})
+
+	It("falls back to the file path when the live stream heard nothing", func() {
+		m := &fakeModel{transcribeFinal: &schema.TranscriptionResult{Text: "from file"}}
+		session := newTranscriptionOnlySession(m, false)
+		tr := &fakeTransport{}
+
+		commitUtteranceWithTranscript(context.Background(), []byte{1, 2},
+			&liveUtterance{}, nil, "", session, &Conversation{}, tr) // empty live text, no batch result, no item id
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+	})
+})
+
+// transcribeUtterance is the retranscribe gate's offline decode of the
+// buffered turn.
+var _ = Describe("transcribeUtterance", func() {
+	It("returns the batch decode with its Eou flag", func() {
+		m := &fakeModel{transcribeFinal: &schema.TranscriptionResult{Text: "confirmed", Eou: true}}
+		session := &Session{
+			InputAudioTranscription: &types.AudioTranscription{},
+			ModelInterface:          m,
+		}
+
+		tr, err := transcribeUtterance(context.Background(), []byte{0, 0, 1, 1}, session)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(tr.Text).To(Equal("confirmed"))
+		Expect(tr.Eou).To(BeTrue())
+	})
+
+	It("propagates backend errors", func() {
+		m := &fakeModel{transcribeErr: errors.New("engine fell over")}
+		session := &Session{
+			InputAudioTranscription: &types.AudioTranscription{},
+			ModelInterface:          m,
+		}
+
+		_, err := transcribeUtterance(context.Background(), []byte{0, 0}, session)
+		Expect(err).To(MatchError(ContainSubstring("engine fell over"))) // substring match on the error text
+	})
+})
+
+// emitPrecomputedTranscription replays an already-produced transcript as the
+// standard delta/completed event sequence.
+var _ = Describe("emitPrecomputedTranscription", func() {
+	It("emits deltas then completed, sharing the item id", func() {
+		tr := &fakeTransport{}
+		Expect(emitPrecomputedTranscription(tr, "item42", []string{"a", "", "b"}, "ab")).To(Succeed())
+
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(2), "empty deltas skipped")
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+		for _, e := range tr.events {
+			switch ev := e.(type) { // any other event type in tr.events is ignored
+			case types.ConversationItemInputAudioTranscriptionDeltaEvent:
+				Expect(ev.ItemID).To(Equal("item42"))
+			case types.ConversationItemInputAudioTranscriptionCompletedEvent:
+				Expect(ev.ItemID).To(Equal("item42"))
+				Expect(ev.Transcript).To(Equal("ab"))
+			}
+		}
+	})
+
+	It("emits only the completed event with no deltas", func() {
+		tr := &fakeTransport{}
+		Expect(emitPrecomputedTranscription(tr, "item1", nil, "hi")).To(Succeed()) // nil delta slice
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
+		Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
+	})
+})
diff --git a/core/http/endpoints/openai/realtime_stream.go b/core/http/endpoints/openai/realtime_stream.go
index 909fc50dc..7c37f7aff 100644
--- a/core/http/endpoints/openai/realtime_stream.go
+++ b/core/http/endpoints/openai/realtime_stream.go
@@ -86,7 +86,8 @@ func (s *transcriptStreamer) content() string {
 // tool calls. It returns true when it has fully handled the response so the
 // caller can return; callers must only invoke it for an audio modality, and with
 // tools only when the model uses its tokenizer template (see triggerResponseAtTurn).
-func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, responseID string, history schema.Messages, images []string, llmCfg *config.ModelConfig, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, toolTurn int) bool { +func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, r *liveResponse, history schema.Messages, images []string, llmCfg *config.ModelConfig, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, toolTurn int) bool { + responseID := r.id itemID := generateItemID() item := types.MessageItemUnion{ Assistant: &types.MessageItemAssistant{ @@ -121,6 +122,8 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation }) } + // cancel rolls back the partial item and records the cancelled outcome; the + // single terminal is emitted by triggerResponse. cancel := func() { if announced { conv.Lock.Lock() @@ -132,10 +135,7 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation } conv.Lock.Unlock() } - sendEvent(t, types.ResponseDoneEvent{ - ServerEventBase: types.ServerEventBase{}, - Response: types.Response{ID: responseID, Object: "realtime.response", Status: types.ResponseStatusCancelled}, - }) + r.outcome = outcomeCancelled } var template string @@ -161,24 +161,30 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation streamer.announce = announce // Clause chunking (opt-in): synthesize each clause as soon as it completes - // instead of buffering the whole reply. streamedAudio accumulates the PCM - // across clauses for the conversation item record; ttsErr captures the first - // synthesis failure so the token callback can stop the prediction. emitSpeech - // runs synchronously here — the LLM keeps generating into the gRPC stream - // while a clause is synthesized, so audio still starts mid-generation. + // instead of buffering the whole reply. 
Synthesis runs on a worker goroutine + // (ttsPipeline) rather than inline in the token callback: emitSpeech blocks + // until the whole clause is synthesized (and, for WebRTC, played back at + // real time), and the callback runs on the goroutine that drains the LLM + // gRPC stream — so speaking inline stalls generation and freezes the + // assistant transcript at every clause boundary. The worker lets generation + // and the transcript stream keep flowing while audio is produced behind them. var chunker *clauseChunker + var ttsPipe *ttsPipeline if session.ModelConfig != nil && session.ModelConfig.Pipeline.ChunkClauses() { chunker = newClauseChunker(defaultClauseMinRunes, defaultClauseMaxRunes) + ttsPipe = newTTSPipeline(func(clause string) ([]byte, error) { + return emitSpeech(ctx, t, session, responseID, itemID, clause) + }) } var streamedAudio []byte var ttsErr error - speakClause := func(clause string) error { - a, err := emitSpeech(ctx, t, session, responseID, itemID, clause) - if err != nil { - return err - } - streamedAudio = append(streamedAudio, a...) - return nil + + // Backstop: always join the TTS worker, even on an unexpected early return. + // wait() is idempotent, so the explicit drain below (which captures the + // streamed audio and first error) stays authoritative; this only guarantees + // the goroutine can never leak if a new return path is added. + if ttsPipe != nil { + defer func() { _, _ = ttsPipe.wait() }() } // fail reports a mid-stream failure. 
A cancelled context means the client @@ -188,6 +194,7 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation cancel() } else { sendError(t, code, fmt.Sprintf("%s: %v", msg, err), "", itemID) + r.outcome = outcomeFailed } return true } @@ -207,8 +214,12 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation delta := streamer.onToken(text) if chunker != nil && delta != "" { for _, clause := range chunker.push(delta) { - if ttsErr = speakClause(clause); ttsErr != nil { - return false // stop the prediction; reported after predFunc returns + // Hand the clause to the worker and keep going — never block the + // recv loop on synthesis. A false return means a prior clause + // already failed; stop the prediction (the error is collected + // from the pipeline after predFunc returns). + if !ttsPipe.enqueue(clause) { + return false } } } @@ -217,10 +228,27 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation predFunc, err := session.ModelInterface.Predict(ctx, history, images, nil, nil, cb, tools, toolChoice, nil, nil, nil) if err != nil { + // The deferred wait() joins the (idle) worker. sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", itemID) return true } pred, err := predFunc() + + // Drain the TTS worker. On a clean finish, enqueue the trailing clause(s) the + // chunker was still holding; on an error or barge-in, stop synthesizing. + // wait() runs on every path so the worker goroutine never leaks, and it + // returns the audio streamed so far plus the first synthesis failure. + if ttsPipe != nil { + if err == nil && ctx.Err() == nil { + for _, clause := range chunker.flush() { + if !ttsPipe.enqueue(clause) { + break + } + } + } + streamedAudio, ttsErr = ttsPipe.wait() + } + // A clause synthesis failed mid-stream (the callback stopped the prediction); // report it as a TTS error rather than a prediction error. 
if ttsErr != nil { @@ -233,6 +261,7 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation cancel() return true } + r.addUsage(pred.Usage) content := streamer.content() toolCalls := functions.ToolCallsFromChatDeltas(pred.ChatDeltas) @@ -244,24 +273,19 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation announce() } - // Synthesize the audio. With clause chunking the completed clauses were - // already spoken inside the token callback; flush the trailing clause(s) - // the segmenter was still holding. Otherwise buffer the whole message and - // synthesize it once. emitSpeech streams the audio chunks when the TTS - // backend supports TTSStream, otherwise it sends a single unary delta. + // With clause chunking the clauses were synthesized on the worker as the + // reply streamed (including the trailing flush drained above), so the + // audio is already accumulated. Otherwise buffer the whole message and + // synthesize it once now — emitSpeech streams the audio chunks when the + // TTS backend supports TTSStream, otherwise it sends a single unary delta. var audio []byte if chunker != nil { - for _, clause := range chunker.flush() { - if ttsErr = speakClause(clause); ttsErr != nil { - break - } - } audio = streamedAudio } else { audio, ttsErr = emitSpeech(ctx, t, session, responseID, itemID, content) - } - if ttsErr != nil { - return fail("tts_error", "TTS generation failed", ttsErr) + if ttsErr != nil { + return fail("tts_error", "TTS generation failed", ttsErr) + } } _, isWebRTC := t.(*WebRTCTransport) @@ -306,10 +330,12 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation OutputIndex: 0, Item: item, }) + r.addItem(item) } - // Emit any tool calls, the terminal response.done, and (for server-side - // assistant tools) the follow-up turn — shared with the buffered path. 
- emitToolCallItems(ctx, session, conv, t, responseID, toolCalls, content != "", toolTurn) + // Emit any tool calls and (for server-side assistant tools) the follow-up + // turn — shared with the buffered path. The single terminal is emitted by + // triggerResponse. + emitToolCallItems(ctx, session, conv, t, r, toolCalls, content != "", toolTurn) return true } diff --git a/core/http/endpoints/openai/realtime_stream_test.go b/core/http/endpoints/openai/realtime_stream_test.go index 5150feb21..439f3240e 100644 --- a/core/http/endpoints/openai/realtime_stream_test.go +++ b/core/http/endpoints/openai/realtime_stream_test.go @@ -102,7 +102,8 @@ var _ = Describe("streamLLMResponse", func() { t := &fakeTransport{} llmCfg := &config.ModelConfig{} - handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0) + r := &liveResponse{id: "resp1"} + handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0) Expect(handled).To(BeTrue()) // One live transcript delta per streamed token. @@ -132,7 +133,8 @@ var _ = Describe("streamLLMResponse", func() { t := &fakeTransport{} llmCfg := &config.ModelConfig{} - handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0) + r := &liveResponse{id: "resp1"} + handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0) Expect(handled).To(BeTrue()) // Two clauses ("Hello world." mid-stream, "How are you?" on flush) → two @@ -140,8 +142,10 @@ var _ = Describe("streamLLMResponse", func() { Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioDelta)).To(Equal(2)) // The full transcript still streams verbatim. Expect(t.transcriptDeltaText()).To(Equal("Hello world. How are you?")) - // Exactly one terminal response.done. 
- Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1)) + // The terminal response.done is emitted by triggerResponse, not by + // streamLLMResponse — so at this layer there are none. + Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0)) + Expect(r.outcome).To(Equal(outcomeCompleted)) }) It("streams content deltas and emits tool-call items (autoparser tool turn)", func() { @@ -169,15 +173,18 @@ var _ = Describe("streamLLMResponse", func() { llmCfg := &config.ModelConfig{} llmCfg.TemplateConfig.UseTokenizerTemplate = true - handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0) + r := &liveResponse{id: "resp1"} + handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0) Expect(handled).To(BeTrue()) // The spoken content was streamed live. Expect(t.transcriptDeltaText()).To(Equal("Let me check.")) // The tool call is emitted as a function_call item. Expect(t.countEvents(types.ServerEventTypeResponseFunctionCallArgumentsDone)).To(Equal(1)) - // Exactly one terminal response.done. - Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1)) + // The terminal response.done is emitted by triggerResponse, not by + // streamLLMResponse — so at this layer there are none. 
+ Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0)) + Expect(r.outcome).To(Equal(outcomeCompleted)) }) It("emits only tool-call items for a content-less tool turn (no empty assistant item)", func() { @@ -200,7 +207,8 @@ var _ = Describe("streamLLMResponse", func() { llmCfg := &config.ModelConfig{} llmCfg.TemplateConfig.UseTokenizerTemplate = true - handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0) + r := &liveResponse{id: "resp1"} + handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0) Expect(handled).To(BeTrue()) // No content → no transcript deltas and no spurious assistant content item. @@ -208,6 +216,51 @@ var _ = Describe("streamLLMResponse", func() { Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioTranscriptDelta)).To(Equal(0)) // The tool call is still emitted. Expect(t.countEvents(types.ServerEventTypeResponseFunctionCallArgumentsDone)).To(Equal(1)) - Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1)) + Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0)) + Expect(r.outcome).To(Equal(outcomeCompleted)) + }) +}) + +var _ = Describe("triggerResponse", func() { + It("emits exactly one response.created and one response.done with output and usage", func() { + m := &fakeModel{ + cfg: &config.ModelConfig{}, + predictResp: backend.LLMResponse{ + Response: "Hi there.", + Usage: backend.TokenUsage{Prompt: 5, Completion: 3}, + }, + } + session := &Session{ + OutputSampleRate: 24000, + ModelInterface: m, + ModelConfig: &config.ModelConfig{}, + // Text-only so the buffered path skips TTS and the assertion focuses + // on the terminal's Output + Usage. 
+ OutputModalities: []types.Modality{types.ModalityText}, + } + conv := &Conversation{} + t := &fakeTransport{} + + triggerResponse(context.Background(), session, conv, t, nil) + + // Exactly one of each lifecycle event for the whole response.create. + Expect(t.countEvents(types.ServerEventTypeResponseCreated)).To(Equal(1)) + Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1)) + + // The single terminal carries the produced output item and the usage — + // both empty in the legacy code. + var done *types.ResponseDoneEvent + for i := range t.events { + if d, ok := t.events[i].(types.ResponseDoneEvent); ok { + done = &d + } + } + Expect(done).NotTo(BeNil()) + Expect(done.Response.Status).To(Equal(types.ResponseStatusCompleted)) + Expect(done.Response.Output).To(HaveLen(1)) + Expect(done.Response.Usage).NotTo(BeNil()) + Expect(done.Response.Usage.InputTokens).To(Equal(5)) + Expect(done.Response.Usage.OutputTokens).To(Equal(3)) + Expect(done.Response.Usage.TotalTokens).To(Equal(8)) }) }) diff --git a/core/http/endpoints/openai/realtime_transcription.go b/core/http/endpoints/openai/realtime_transcription.go index 44456101c..28a5147c1 100644 --- a/core/http/endpoints/openai/realtime_transcription.go +++ b/core/http/endpoints/openai/realtime_transcription.go @@ -7,6 +7,33 @@ import ( "github.com/mudler/LocalAI/core/http/endpoints/openai/types" ) +// emitPrecomputedTranscription emits the transcription events for a turn +// whose transcript already exists (semantic_vad's live stream, or the +// retranscribe gate's batch decode): optional delta replays followed by the +// completed event — the same contract emitTranscription produces, sharing +// one itemID — without running the backend again. 
+func emitPrecomputedTranscription(t Transport, itemID string, deltas []string, transcript string) error { + for _, d := range deltas { + if d == "" { + continue + } + if err := t.SendEvent(types.ConversationItemInputAudioTranscriptionDeltaEvent{ + ServerEventBase: types.ServerEventBase{EventID: "event_TODO"}, + ItemID: itemID, + ContentIndex: 0, + Delta: d, + }); err != nil { + return err + } + } + return t.SendEvent(types.ConversationItemInputAudioTranscriptionCompletedEvent{ + ServerEventBase: types.ServerEventBase{EventID: "event_TODO"}, + ItemID: itemID, + ContentIndex: 0, + Transcript: transcript, + }) +} + // emitTranscription transcribes a committed utterance and emits the transcription // events for it, returning the final transcript text. With // pipeline.streaming.transcription enabled it streams each transcript fragment as diff --git a/core/http/endpoints/openai/realtime_tts_pipeline.go b/core/http/endpoints/openai/realtime_tts_pipeline.go new file mode 100644 index 000000000..c9828b0aa --- /dev/null +++ b/core/http/endpoints/openai/realtime_tts_pipeline.go @@ -0,0 +1,153 @@ +package openai + +import ( + "sync" + "sync/atomic" + + "github.com/mudler/LocalAI/core/http/endpoints/openai/ttscoord" +) + +// ttsPipeline decouples speech synthesis from LLM token generation. +// +// The LLM token callback runs on the same goroutine that drains the model's +// gRPC stream, so anything it does serially — including a blocking TTS call — +// stops the stream from being read and stalls generation (and, since the same +// goroutine also sends the assistant transcript, freezes the transcript the +// client sees). ttsPipeline lets the callback hand each completed clause to a +// single worker goroutine that synthesizes them in order, concurrently with +// continued generation. One worker preserves clause — and therefore audio — +// ordering. 
+// +// The clause queue is intentionally unbounded: clauses are short strings and a +// reply has a bounded number of them, while the expensive product (audio) is +// paced by the TTS backend regardless. So enqueue never blocks the callback, +// and the transcript streams to the client at generation speed while audio is +// produced behind it. +type ttsPipeline struct { + speak func(clause string) ([]byte, error) + + mu sync.Mutex + queue []string + wake chan struct{} // buffered(1) wakeup signal for the worker + + // coord owns the open->closing->closed lifecycle (machine M5). It replaces the + // legacy `closed bool`: the producer raises Close (wait()), the worker raises + // WorkerExited. See ttscoord/ and realtime-state-machines.md. + coord *ttscoord.Coordinator + + done chan struct{} + failed atomic.Bool + + // audio and firstErr are owned by the worker goroutine and only safe to + // read after wait() has returned (it joins on the worker via done). + audio []byte + firstErr error +} + +// newTTSPipeline starts the worker. speak performs the actual synthesis and +// returns the PCM accumulated for the conversation-item record (empty for +// transports that stream audio out-of-band, e.g. WebRTC). +func newTTSPipeline(speak func(clause string) ([]byte, error)) *ttsPipeline { + p := &ttsPipeline{ + speak: speak, + wake: make(chan struct{}, 1), + done: make(chan struct{}), + } + p.coord = ttscoord.New(p) + go p.run() + return p +} + +// closing reports whether wait() has been called (lifecycle past Open). Read +// under p.mu in the worker so the queue-empty check and the close check are +// consistent. +func (p *ttsPipeline) closing() bool { + _, open := p.coord.State().(ttscoord.Open) + return !open +} + +// Perform executes a coordinator effect. Wake nudges the worker (non-blocking). 
+func (p *ttsPipeline) Perform(e ttscoord.Effect) { + if _, ok := e.(ttscoord.Wake); ok { + p.signal() + } +} + +func (p *ttsPipeline) run() { + defer close(p.done) + for { + p.mu.Lock() + for len(p.queue) == 0 && !p.closing() { + p.mu.Unlock() + <-p.wake + p.mu.Lock() + } + if len(p.queue) == 0 && p.closing() { + p.mu.Unlock() + // Drained and closed: advance the lifecycle to Closed, then exit + // (the deferred close(p.done) joins the producer's wait()). + _ = p.coord.Apply(ttscoord.WorkerExited{}) + return + } + clause := p.queue[0] + p.queue = p.queue[1:] + p.mu.Unlock() + + // Once a clause has failed, keep draining the queue without speaking so + // the producer's wait() returns promptly and the first error is kept. + if p.failed.Load() { + continue + } + a, err := p.speak(clause) + if err != nil { + p.firstErr = err + p.failed.Store(true) + continue + } + p.audio = append(p.audio, a...) + } +} + +// enqueue offers a clause for synthesis. It never blocks; it returns false once +// synthesis has failed, signalling the caller to stop the prediction. +func (p *ttsPipeline) enqueue(clause string) bool { + if p.failed.Load() { + return false + } + p.mu.Lock() + // Reject once closing/closed: the worker may have already drained and exited, + // so a clause queued now would be silently dropped. The lifecycle (Open) and + // the append are checked under the same lock, so the worker cannot exit between + // the gate and the enqueue (it takes p.mu to observe the empty queue). + if p.closing() { + p.mu.Unlock() + return false + } + p.queue = append(p.queue, clause) + p.mu.Unlock() + p.signal() + return true +} + +// signal wakes the worker without blocking; the buffered channel coalesces +// signals, which is safe because the worker drains the whole queue per wake. 
+func (p *ttsPipeline) signal() { + select { + case p.wake <- struct{}{}: + default: + } +} + +// wait closes the queue and blocks until the worker has spoken every enqueued +// clause, then returns the accumulated audio and the first synthesis error. It +// is idempotent: calling it again returns the same result without blocking, so +// callers can drain it explicitly to read the audio and still defer a wait() as +// a leak-proof backstop. No clause may be enqueued after the first wait(). +func (p *ttsPipeline) wait() ([]byte, error) { + // Close the lifecycle (Open->Closing) and wake the worker. Idempotent: a + // second Close is absorbed (no second wake), and <-p.done returns immediately + // once the worker has exited. + _ = p.coord.Apply(ttscoord.Close{}) + <-p.done + return p.audio, p.firstErr +} diff --git a/core/http/endpoints/openai/realtime_tts_pipeline_test.go b/core/http/endpoints/openai/realtime_tts_pipeline_test.go new file mode 100644 index 000000000..a5e070248 --- /dev/null +++ b/core/http/endpoints/openai/realtime_tts_pipeline_test.go @@ -0,0 +1,114 @@ +package openai + +import ( + "errors" + "sync" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("ttsPipeline", func() { + It("synthesizes clauses in order and accumulates their audio", func() { + p := newTTSPipeline(func(clause string) ([]byte, error) { + return []byte(clause), nil + }) + Expect(p.enqueue("a")).To(BeTrue()) + Expect(p.enqueue("b")).To(BeTrue()) + Expect(p.enqueue("c")).To(BeTrue()) + + audio, err := p.wait() + Expect(err).NotTo(HaveOccurred()) + Expect(string(audio)).To(Equal("abc")) + }) + + It("never blocks the producer even when synthesis is slow", func() { + var started sync.WaitGroup + started.Add(1) + release := make(chan struct{}) + first := true + p := newTTSPipeline(func(clause string) ([]byte, error) { + if first { + first = false + started.Done() + <-release // hold the worker on the first clause + } + return []byte(clause), nil + }) + + Expect(p.enqueue("1")).To(BeTrue()) + started.Wait() // worker is now blocked synthesizing the first clause + + // Enqueuing many more clauses must return immediately, not block on the + // stalled worker — this is what keeps the LLM recv loop flowing. + done := make(chan struct{}) + go func() { + defer close(done) + for _, c := range []string{"2", "3", "4", "5"} { + p.enqueue(c) + } + }() + Eventually(done, time.Second).Should(BeClosed()) + + close(release) + audio, err := p.wait() + Expect(err).NotTo(HaveOccurred()) + Expect(string(audio)).To(Equal("12345")) + }) + + It("keeps the first error, stops speaking, and signals the producer to stop", func() { + boom := errors.New("backend gone") + var spoken []string + var mu sync.Mutex + p := newTTSPipeline(func(clause string) ([]byte, error) { + mu.Lock() + spoken = append(spoken, clause) + mu.Unlock() + if clause == "b" { + return nil, boom + } + return []byte(clause), nil + }) + + Expect(p.enqueue("a")).To(BeTrue()) + Expect(p.enqueue("b")).To(BeTrue()) + + // Once the failure is observed, enqueue reports it so the caller stops + // the prediction; any further clauses are dropped, not spoken. 
+ Eventually(func() bool { return !p.enqueue("c") }, time.Second).Should(BeTrue()) + + _, err := p.wait() + Expect(err).To(MatchError(boom)) + + mu.Lock() + defer mu.Unlock() + Expect(spoken).NotTo(ContainElement("c"), "clauses after the failure are not synthesized") + }) + + It("is idempotent: a second wait returns the same result without blocking", func() { + p := newTTSPipeline(func(clause string) ([]byte, error) { + return []byte(clause), nil + }) + Expect(p.enqueue("x")).To(BeTrue()) + + audio1, err1 := p.wait() + // A deferred backstop wait() in the caller runs after the explicit one; + // it must not block or change the result. + audio2, err2 := p.wait() + + Expect(err1).NotTo(HaveOccurred()) + Expect(err2).NotTo(HaveOccurred()) + Expect(string(audio1)).To(Equal("x")) + Expect(string(audio2)).To(Equal("x")) + }) + + It("returns cleanly when no clause was ever enqueued", func() { + p := newTTSPipeline(func(clause string) ([]byte, error) { + return []byte(clause), nil + }) + audio, err := p.wait() + Expect(err).NotTo(HaveOccurred()) + Expect(audio).To(BeEmpty()) + }) +}) diff --git a/core/http/endpoints/openai/realtime_turncoord.go b/core/http/endpoints/openai/realtime_turncoord.go new file mode 100644 index 000000000..30ffffc66 --- /dev/null +++ b/core/http/endpoints/openai/realtime_turncoord.go @@ -0,0 +1,127 @@ +package openai + +import ( + "context" + "time" + + "github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord" + "github.com/mudler/LocalAI/core/http/endpoints/openai/turncoord" + "github.com/mudler/LocalAI/core/http/endpoints/openai/types" + "github.com/mudler/LocalAI/core/schema" +) + +// turnSink wires the explicit turn-detection state machine (turncoord.Coordinator +// — machine "M2" in docs/design/realtime-state-machines.md) into handleVAD. 
+// +// In the legacy code the turn lifecycle was split across two variables that could +// disagree: handleVAD's goroutine-local speechStarted bool and the semantic_vad +// liveTurnState's "is the live stream open" flag (lts.open()). A discardTurn (the +// no-speech clear, or teardown) closed the live stream but left speechStarted +// true, so the next speech onset was suppressed by `if !speechStarted` — no +// speech_started, no barge-in, no commit (Part 2, failure mode 4). Here "speech +// started" and "a turn is open" are ONE coordinator state, so they cannot desync. +// +// Unlike responseSink (M3), which is a genuine dual-writer race, the turn machine +// is owned by the single handleVAD goroutine; this sink and its coordinator are +// loop-local. The coordinator's lock only matters for the teardown-time Abort and +// for keeping State() readable — there is no second writer. +// +// The effects map onto the existing turn I/O: +// - OpenTurn: open the live ASR stream (semantic_vad) + feed the onset +// audio. A failed open degrades the turn to silence-only — the turn still +// proceeds (server_vad-like), matching the legacy behaviour. +// - BargeIn: cancel any in-flight response (non-blocking). +// - EmitSpeechStarted: input_audio_buffer.speech_started. +// - EmitSpeechStopped: input_audio_buffer.speech_stopped. +// - CommitTurn: committed event + finalize the live stream + issue the +// response (via responseSink/respcoord). +// - DiscardTurn: close the live stream and retract any captions. +// +// The data-heavy effects (OpenTurn, CommitTurn) need the current tick's audio and +// transcription context. Because Apply performs effects synchronously on the same +// (handleVAD) goroutine, the loop sets the relevant scratch fields immediately +// before each Apply; there is no cross-goroutine sharing. 
+type turnSink struct { + session *Session + conv *Conversation + transport Transport + lts *liveTurnState + vadContext context.Context + startTime time.Time + + coord *turncoord.Coordinator + + // per-tick context, set by handleVAD before each Apply (single goroutine). + sv *types.RealtimeSessionSemanticVad // nil = server_vad + onsetAudio []int16 // OpenTurn feeds this + commitAudio []byte // CommitTurn issues this + commitAudioLength float64 // for finishTurn (flush tail) + commitRetranscribe bool // gated batch is authoritative + commitGated *schema.TranscriptionResult // retranscribe batch decode +} + +func newTurnSink(session *Session, conv *Conversation, t Transport, lts *liveTurnState, vadContext context.Context, startTime time.Time) *turnSink { + s := &turnSink{ + session: session, + conv: conv, + transport: t, + lts: lts, + vadContext: vadContext, + startTime: startTime, + } + s.coord = turncoord.New(s) + return s +} + +// Perform executes one effect. It is called by Coordinator.Apply while the +// coordinator lock is held. The turn coordinator is single-writer (handleVAD), so +// the synchronous network writes / lts operations here are the same ones the +// legacy loop did inline on this goroutine; they never contend the lock. 
+func (s *turnSink) Perform(e turncoord.Effect) { + switch eff := e.(type) { + case turncoord.OpenTurn: + if s.sv != nil && s.lts.openTurn(s.vadContext, string(eff.Turn)) { + s.lts.feedNewAudio(s.onsetAudio) + } + case turncoord.BargeIn: + s.session.respSink.cancel(respcoord.SourceVAD) + case turncoord.EmitSpeechStarted: + sendEvent(s.transport, types.InputAudioBufferSpeechStartedEvent{ + ServerEventBase: types.ServerEventBase{EventID: "event_TODO"}, + AudioStartMs: time.Since(s.startTime).Milliseconds(), + }) + case turncoord.EmitSpeechStopped: + sendEvent(s.transport, types.InputAudioBufferSpeechStoppedEvent{ + ServerEventBase: types.ServerEventBase{EventID: "event_TODO"}, + AudioEndMs: time.Since(s.startTime).Milliseconds(), + }) + case turncoord.CommitTurn: + // The committed item id is the coordinator's turn id (== the live caption + // id), so the client's completed event replaces the partial text. + itemID := string(eff.Turn) + sendEvent(s.transport, types.InputAudioBufferCommittedEvent{ + ServerEventBase: types.ServerEventBase{EventID: "event_TODO"}, + ItemID: itemID, + PreviousItemID: "TODO", + }) + // Finalize the turn's live stream (flushes the decode tail). In + // retranscribe mode the batch decode is authoritative, so the streamed + // transcript is dropped. + var live *liveUtterance + if s.sv != nil { + ut := s.lts.finishTurn(s.commitAudioLength) + if !s.commitRetranscribe { + live = ut + } + } + audio := s.commitAudio + gated := s.commitGated + conv := s.conv + s.session.respSink.issue(s.vadContext, respcoord.SourceVAD, func(ctx context.Context) { + commitUtteranceWithTranscript(ctx, audio, live, gated, itemID, s.session, conv, s.transport) + }) + case turncoord.DiscardTurn: + // No-op if the stream was never open (server_vad / already idle). 
+ s.lts.discardTurn() + } +} diff --git a/core/http/endpoints/openai/realtime_vad_buffer_test.go b/core/http/endpoints/openai/realtime_vad_buffer_test.go new file mode 100644 index 000000000..0fbef3e6b --- /dev/null +++ b/core/http/endpoints/openai/realtime_vad_buffer_test.go @@ -0,0 +1,54 @@ +package openai + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// dropInspectedPrefix is what stands between the VAD loop's buffer clears and +// cutting the first word off an utterance: the no-speech clear must keep the +// holdback tail (silero hasn't crossed its onset threshold yet) and both +// clears must keep audio appended while the tick ran (the VAD never saw it). +var _ = Describe("dropInspectedPrefix", func() { + It("keeps the holdback tail of the inspected window and everything appended mid-tick", func() { + inspected := []byte{1, 2, 3, 4, 5, 6} + appended := []byte{7, 8} + buf := append(append([]byte(nil), inspected...), appended...) + + out := dropInspectedPrefix(buf, len(inspected), 2) + + Expect(out).To(Equal([]byte{5, 6, 7, 8}), "older confirmed-silent head dropped, possible onset + fresh audio kept") + }) + + It("returns the buffer unchanged when the inspected window fits in the holdback", func() { + buf := []byte{1, 2, 3} + + Expect(dropInspectedPrefix(buf, len(buf), 4)).To(Equal(buf)) + Expect(dropInspectedPrefix(buf, len(buf), len(buf))).To(Equal(buf)) + }) + + It("drops the whole inspected window with zero holdback, keeping only mid-tick appends", func() { + // The commit-time clear: the inspected audio was committed, audio + // appended while the tick ran belongs to the next turn. 
+ buf := []byte{1, 2, 3, 4} + + Expect(dropInspectedPrefix(buf, 4, 0)).To(BeEmpty()) + Expect(dropInspectedPrefix(append(buf, 9), 4, 0)).To(Equal([]byte{9})) + }) + + It("clamps when told more was inspected than the buffer holds", func() { + buf := []byte{1, 2} + + Expect(dropInspectedPrefix(buf, 10, 0)).To(BeEmpty()) + }) + + It("returns a copy, not a sub-slice, when bytes are dropped", func() { + buf := []byte{1, 2, 3, 4} + + out := dropInspectedPrefix(buf, 4, 2) + + Expect(out).To(Equal([]byte{3, 4})) + buf[2] = 99 + Expect(out).To(Equal([]byte{3, 4}), "mutating the old backing array must not leak into the published buffer") + }) +}) diff --git a/core/http/endpoints/openai/respcoord/respcoord.go b/core/http/endpoints/openai/respcoord/respcoord.go new file mode 100644 index 000000000..6c8c6d80f --- /dev/null +++ b/core/http/endpoints/openai/respcoord/respcoord.go @@ -0,0 +1,267 @@ +// Package respcoord is the explicit state machine for the realtime API's +// response-coordination concern (machine "M3" in +// docs/design/realtime-state-machines.md). +// +// In the legacy code this machine is implicit: a response is "active" iff +// Session.activeResponseDone is a non-nil, unclosed channel, and the lifecycle +// is driven from TWO goroutines (the client read-loop and the VAD goroutine) +// that both call startResponse/cancelActiveResponse. responseMu guards only the +// field swap, while the <-done wait happens outside the lock, so two concurrent +// starts can briefly leave two live response goroutines both appending to the +// conversation. See docs/design/realtime-state-machines.md, Part 2 (failure +// mode 2) and the ResponseLifecycle spec under formal-verification/. +// +// This package replaces that with: +// - a sealed sum type for State (illegal states are unrepresentable), +// - a total, pure transition function Next(state, event) -> (state, effects), +// - a single-writer Coordinator that serializes every transition. 
+// +// The design guarantees the invariants the specs check: +// - at most one live response at any instant, +// - exactly one terminal (response.done) per started response, +// - no response is started after its terminal (no resurrection). +package respcoord + +import ( + "fmt" + + "github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator" +) + +// ResponseID identifies a single response attempt. The caller mints a fresh, +// monotonically increasing id for every Start; ids are never reused. The +// monotonic id is what lets the machine ignore "stale" Finished events from a +// response that was already superseded or cancelled. +type ResponseID uint64 + +// Source records which goroutine drove an event. It is carried for +// observability/logging only; it never affects a transition (both sources are +// equal authority). Keeping it in the event type makes the dual-writer reality +// explicit rather than hidden. +type Source int + +const ( + // SourceClient is the read-loop: response.create or a manual + // input_audio_buffer.commit. + SourceClient Source = iota + // SourceVAD is the turn-detection goroutine: end-of-speech commit or a + // barge-in cancel. + SourceVAD +) + +func (s Source) String() string { + switch s { + case SourceClient: + return "client" + case SourceVAD: + return "vad" + default: + return fmt.Sprintf("Source(%d)", int(s)) + } +} + +// Status is the terminal status reported on response.done. +type Status int + +const ( + // StatusCompleted is a response that finished on its own. + StatusCompleted Status = iota + // StatusCancelled is a response cut short by a barge-in, an explicit + // response.cancel, or by being superseded by a newer response. + StatusCancelled +) + +func (s Status) String() string { + switch s { + case StatusCompleted: + return "completed" + case StatusCancelled: + return "cancelled" + default: + return fmt.Sprintf("Status(%d)", int(s)) + } +} + +// State is the sealed sum type of coordinator states. 
The only implementations +// are the unexported-method-bearing structs in this file, so callers outside +// the package cannot fabricate an out-of-band state. Exhaustively: +// Idle | Active | Terminated. +type State interface { + isState() + String() string +} + +// Idle: no response is in flight. +type Idle struct{} + +// Active: exactly one response (ID) is in flight. The struct holds a single id, +// so "two active responses" is not representable. +type Active struct{ ID ResponseID } + +// Terminated: the session is torn down. Absorbing — no response can start from +// here, so the M1 (connection) parent's teardown can guarantee no response +// outlives the session (see formal-verification/session_lifecycle.fizz). +type Terminated struct{} + +func (Idle) isState() {} +func (Active) isState() {} +func (Terminated) isState() {} + +func (Idle) String() string { return "Idle" } +func (a Active) String() string { return fmt.Sprintf("Active(%d)", a.ID) } +func (Terminated) String() string { return "Terminated" } + +// Event is the sealed sum type of inputs. Exhaustively: +// Start | Finished | Cancel | Shutdown. +type Event interface { + isEvent() + String() string +} + +// Start requests a new response. ID must be a fresh, never-before-used id. +type Start struct { + ID ResponseID + Source Source +} + +// Finished reports that the response goroutine for ID reached its own terminal. +// If ID is not the currently-active response it is "stale" (the response was +// already superseded/cancelled) and is ignored. +type Finished struct{ ID ResponseID } + +// Cancel requests cancellation of the in-flight response (barge-in or explicit +// response.cancel). It is a no-op when idle. +type Cancel struct{ Source Source } + +// Shutdown terminates the coordinator at session teardown: it cancels any +// in-flight response and moves to the absorbing Terminated state, after which no +// response can start. Raised by the connection (M1) parent's teardown. 
+type Shutdown struct{} + +func (Start) isEvent() {} +func (Finished) isEvent() {} +func (Cancel) isEvent() {} +func (Shutdown) isEvent() {} + +func (e Start) String() string { return fmt.Sprintf("Start(%d,%s)", e.ID, e.Source) } +func (e Finished) String() string { return fmt.Sprintf("Finished(%d)", e.ID) } +func (e Cancel) String() string { return fmt.Sprintf("Cancel(%s)", e.Source) } +func (Shutdown) String() string { return "Shutdown" } + +// Effect is a side effect returned by Next as data for the caller to perform. +// Returning effects as data (rather than firing callbacks inside the +// transition) keeps Next pure and exhaustively testable, and lets the +// Coordinator decide how/when to perform them. Exhaustively: +// CancelResponse | StartResponse | EmitTerminal. +type Effect interface { + isEffect() + String() string +} + +// CancelResponse: cancel the context of the running response ID. +type CancelResponse struct{ ID ResponseID } + +// StartResponse: spawn the response goroutine for ID. +type StartResponse struct{ ID ResponseID } + +// EmitTerminal: send response.done for ID with Status. +type EmitTerminal struct { + ID ResponseID + Status Status +} + +func (CancelResponse) isEffect() {} +func (StartResponse) isEffect() {} +func (EmitTerminal) isEffect() {} + +func (e CancelResponse) String() string { return fmt.Sprintf("CancelResponse(%d)", e.ID) } +func (e StartResponse) String() string { return fmt.Sprintf("StartResponse(%d)", e.ID) } +func (e EmitTerminal) String() string { + return fmt.Sprintf("EmitTerminal(%d,%s)", e.ID, e.Status) +} + +// Next is the total, pure transition function. For every (state, event) it +// returns the next state and the ordered effects to perform. It returns a +// non-nil error only for an unknown State/Event implementation (a programmer +// error / future type added without updating this function) — callers must +// surface that, never silently ignore it. 
Every in-domain (state, event) pair +// is defined; there are no "forbidden" transitions, only no-ops for stale or +// idle inputs. +// +// The supersede rule (Active + Start) is the crux of the fix: starting a new +// response while one is active emits the old response's cancelled terminal and +// cancels it BEFORE the replacement starts, all within one serialized +// transition. The old goroutine's later Finished is therefore stale and +// ignored — so each id gets exactly one terminal and there is never more than +// one live response. +func Next(s State, e Event) (State, []Effect, error) { + switch st := s.(type) { + case Idle: + switch ev := e.(type) { + case Start: + return Active{ID: ev.ID}, []Effect{StartResponse{ID: ev.ID}}, nil + case Cancel: + // Nothing in flight: idempotent no-op. + return Idle{}, nil, nil + case Finished: + // Stale terminal from an already-superseded/cancelled response. + return Idle{}, nil, nil + case Shutdown: + // Teardown with nothing in flight: go terminal. + return Terminated{}, nil, nil + } + case Active: + switch ev := e.(type) { + case Start: + return Active{ID: ev.ID}, []Effect{ + CancelResponse{ID: st.ID}, + EmitTerminal{ID: st.ID, Status: StatusCancelled}, + StartResponse{ID: ev.ID}, + }, nil + case Finished: + if ev.ID == st.ID { + return Idle{}, []Effect{EmitTerminal{ID: st.ID, Status: StatusCompleted}}, nil + } + // Stale finish from a superseded response — already terminal-ed. + return Active{ID: st.ID}, nil, nil + case Cancel: + return Idle{}, []Effect{ + CancelResponse{ID: st.ID}, + EmitTerminal{ID: st.ID, Status: StatusCancelled}, + }, nil + case Shutdown: + // Teardown while a response is live: cancel it (with its terminal) and + // go terminal so nothing can start afterwards. + return Terminated{}, []Effect{ + CancelResponse{ID: st.ID}, + EmitTerminal{ID: st.ID, Status: StatusCancelled}, + }, nil + } + case Terminated: + // Absorbing: every event is a no-op. 
A Start after teardown is rejected + // (no StartResponse), so no response can outlive the session. + switch e.(type) { + case Start, Finished, Cancel, Shutdown: + return Terminated{}, nil, nil + } + } + return s, nil, fmt.Errorf("respcoord: unhandled transition %s <- %s", s, e) +} + +// EffectSink performs the effects produced by a transition. See coordinator.Sink +// for the non-blocking contract: Perform runs under the coordinator lock, so it +// must not block and must not re-enter Apply (the spawned response goroutine's +// Finished apply happens only after the sink returns). +type EffectSink = coordinator.Sink[Effect] + +// Coordinator serializes every Start/Finished/Cancel/Shutdown transition behind +// one lock, so the two driving goroutines (read-loop and VAD) can call Apply +// concurrently without the legacy dual-writer race. Effects are performed in +// order under the lock — preserving the (cancel old, emit old terminal, start +// new) supersede ordering. See coordinator.Coordinator. +type Coordinator = coordinator.Coordinator[State, Event, Effect] + +// New returns an idle Coordinator that performs effects via sink. +func New(sink EffectSink) *Coordinator { + return coordinator.New[State, Event, Effect](Idle{}, Next, sink) +} diff --git a/core/http/endpoints/openai/respcoord/respcoord_suite_test.go b/core/http/endpoints/openai/respcoord/respcoord_suite_test.go new file mode 100644 index 000000000..df26a1813 --- /dev/null +++ b/core/http/endpoints/openai/respcoord/respcoord_suite_test.go @@ -0,0 +1,13 @@ +package respcoord + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestRespcoord(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "respcoord (realtime M3) Suite") +} diff --git a/core/http/endpoints/openai/respcoord/respcoord_test.go b/core/http/endpoints/openai/respcoord/respcoord_test.go new file mode 100644 index 000000000..6a3c7c297 --- /dev/null +++ b/core/http/endpoints/openai/respcoord/respcoord_test.go @@ -0,0 +1,370 @@ +package respcoord + +import ( + "math/rand/v2" + "sync" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// recordingSink captures the ordered stream of effects so the invariants can be +// checked independently of the transition function's internals. Perform is +// called by Coordinator.Apply under the coordinator lock, so it is already +// serialized; the mutex here only guards reads from the spec goroutine. +type recordingSink struct { + mu sync.Mutex + log []Effect +} + +func (s *recordingSink) Perform(e Effect) { + s.mu.Lock() + s.log = append(s.log, e) + s.mu.Unlock() +} + +func (s *recordingSink) snapshot() []Effect { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]Effect, len(s.log)) + copy(out, s.log) + return out +} + +// checkInvariants replays the effect log and asserts the three core safety +// properties from docs/design/realtime-state-machines.md, Part 4: +// +// (1) at most one live response at any instant +// -- after every effect, the number of started-but-not-terminated ids <= 1; +// (2) exactly one terminal per started response +// -- each id is started at most once and terminated at most once; +// (3) no resurrection +// -- an id is never started after it has been terminated. 
+func checkInvariants(log []Effect) { + started := map[ResponseID]int{} + terminated := map[ResponseID]int{} + live := map[ResponseID]bool{} + + for i, eff := range log { + switch e := eff.(type) { + case StartResponse: + Expect(terminated[e.ID]).To(Equal(0), "invariant (3): StartResponse(%d) after it was terminated (effect #%d)\nlog=%v", e.ID, i, log) + started[e.ID]++ + Expect(started[e.ID]).To(Equal(1), "invariant (2): id %d started %d times (effect #%d)\nlog=%v", e.ID, started[e.ID], i, log) + live[e.ID] = true + case EmitTerminal: + terminated[e.ID]++ + Expect(terminated[e.ID]).To(Equal(1), "invariant (2): id %d terminated %d times (effect #%d)\nlog=%v", e.ID, terminated[e.ID], i, log) + delete(live, e.ID) + case CancelResponse: + // no count assertion; cancellation is paired with a terminal + } + Expect(len(live)).To(BeNumerically("<=", 1), "invariant (1): %d live responses after effect #%d (%s)\nlog=%v", len(live), i, eff, log) + } +} + +// unknownEvent is an Event implementation Next does not know about, to exercise +// the defensive error path. +type unknownEvent struct{} + +func (unknownEvent) isEvent() {} +func (unknownEvent) String() string { return "unknownEvent" } + +var _ = Describe("respcoord.Next", func() { + // DescribeTable exhaustively pins every (state, event) cell of the pure + // transition function, including the stale / idle no-op cells. This is the + // practical stand-in for "no transition leads to an inconsistent state": if a + // cell changes, this table must change with it. 
+ DescribeTable("transitions", + func(state State, event Event, wantState State, wantEff []Effect) { + gotState, gotEff, err := Next(state, event) + Expect(err).NotTo(HaveOccurred()) + Expect(gotState).To(Equal(wantState)) + Expect(gotEff).To(Equal(wantEff)) + }, + Entry("idle+start -> active, spawns response", + Idle{}, Start{ID: 1, Source: SourceClient}, + Active{ID: 1}, []Effect{StartResponse{ID: 1}}), + Entry("idle+cancel -> idle, no-op", + Idle{}, Cancel{Source: SourceVAD}, + Idle{}, []Effect(nil)), + Entry("idle+finished(stale) -> idle, no-op", + Idle{}, Finished{ID: 7}, + Idle{}, []Effect(nil)), + Entry("active+start -> supersede: cancel+terminal(old)+start(new)", + Active{ID: 1}, Start{ID: 2, Source: SourceVAD}, + Active{ID: 2}, + []Effect{ + CancelResponse{ID: 1}, + EmitTerminal{ID: 1, Status: StatusCancelled}, + StartResponse{ID: 2}, + }), + Entry("active+finished(current) -> idle, completed terminal", + Active{ID: 3}, Finished{ID: 3}, + Idle{}, []Effect{EmitTerminal{ID: 3, Status: StatusCompleted}}), + Entry("active+finished(stale) -> stay active, no-op", + Active{ID: 3}, Finished{ID: 2}, + Active{ID: 3}, []Effect(nil)), + Entry("active+cancel -> idle, cancel+cancelled terminal", + Active{ID: 5}, Cancel{Source: SourceClient}, + Idle{}, + []Effect{ + CancelResponse{ID: 5}, + EmitTerminal{ID: 5, Status: StatusCancelled}, + }), + Entry("idle+shutdown -> terminated, no-op", + Idle{}, Shutdown{}, + Terminated{}, []Effect(nil)), + Entry("active+shutdown -> terminated: cancel+cancelled terminal", + Active{ID: 6}, Shutdown{}, + Terminated{}, + []Effect{ + CancelResponse{ID: 6}, + EmitTerminal{ID: 6, Status: StatusCancelled}, + }), + Entry("terminated+start -> terminated, REJECTED (no resurrection)", + Terminated{}, Start{ID: 9, Source: SourceClient}, + Terminated{}, []Effect(nil)), + Entry("terminated+finished -> terminated, no-op (stale)", + Terminated{}, Finished{ID: 9}, + Terminated{}, []Effect(nil)), + Entry("terminated+cancel -> terminated, no-op", + 
Terminated{}, Cancel{Source: SourceVAD}, + Terminated{}, []Effect(nil)), + Entry("terminated+shutdown -> terminated, idempotent", + Terminated{}, Shutdown{}, + Terminated{}, []Effect(nil)), + ) + + It("is total: every defined (state, event) pair is handled without error", func() { + states := []State{Idle{}, Active{ID: 1}, Terminated{}} + events := []Event{ + Start{ID: 2, Source: SourceClient}, + Finished{ID: 1}, + Finished{ID: 99}, + Cancel{Source: SourceVAD}, + Shutdown{}, + } + for _, s := range states { + for _, e := range events { + _, _, err := Next(s, e) + Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e) + } + } + }) + + It("errors on an unknown event type", func() { + _, _, err := Next(Active{ID: 1}, unknownEvent{}) + Expect(err).To(HaveOccurred()) + }) +}) + +var _ = Describe("respcoord.Coordinator", func() { + // This replaces the previous rapid stateful test: a seeded random walk over + // the event space, asserting the invariants hold after every step. Seeds are + // fixed so any failure reproduces deterministically. + It("upholds the safety invariants over random event sequences", func() { + seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE} + for _, seed := range seeds { + r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5)) + sink := &recordingSink{} + c := New(sink) + var nextID uint64 + + for range 3000 { + switch r.IntN(4) { + case 0: // start from client + nextID++ + Expect(c.Apply(Start{ID: ResponseID(nextID), Source: SourceClient})).To(Succeed()) + case 1: // start from VAD + nextID++ + Expect(c.Apply(Start{ID: ResponseID(nextID), Source: SourceVAD})).To(Succeed()) + case 2: // possibly-stale finish from any plausible id (incl. 
future) + id := r.Uint64N(nextID + 3) + Expect(c.Apply(Finished{ID: ResponseID(id)})).To(Succeed()) + case 3: // explicit cancel + Expect(c.Apply(Cancel{Source: SourceClient})).To(Succeed()) + } + } + // One full-log replay per seed: it iterates the whole sequence, so + // it catches a violation at any step without the O(n^2) cost of + // re-replaying after every Apply. + checkInvariants(sink.snapshot()) + } + }) + + // Hammer Apply from two goroutines -- the read-loop and the VAD goroutine, + // the exact dual-writer scenario that races in the legacy code -- and assert + // the invariants still hold. Run under -race to also catch any data race in + // the coordinator itself. + It("upholds the invariants under concurrent dual-writer Apply", func() { + const perGoroutine = 2000 + sink := &recordingSink{} + c := New(sink) + + var idCounter uint64 + var idMu sync.Mutex + nextID := func() ResponseID { + idMu.Lock() + defer idMu.Unlock() + idCounter++ + return ResponseID(idCounter) + } + + var wg sync.WaitGroup + drive := func(src Source) { + defer wg.Done() + for i := range perGoroutine { + switch i % 3 { + case 0: + _ = c.Apply(Start{ID: nextID(), Source: src}) + case 1: + if a, ok := c.State().(Active); ok { + _ = c.Apply(Finished{ID: a.ID}) + } + case 2: + _ = c.Apply(Cancel{Source: src}) + } + } + } + + wg.Add(2) + go drive(SourceClient) + go drive(SourceVAD) + wg.Wait() + + checkInvariants(sink.snapshot()) + }) + + It("rejects the dual-writer interleaving the legacy mechanism allowed", func() { + // Equivalent sequence to the legacy double-start race: start id1, then two + // superseding starts (id2, id3) such as the read-loop and VAD would each + // issue. Each Start is serialized by the coordinator, so each supersede + // cancels+terminates the previous -- never two live at once. 
+ sink := &recordingSink{} + c := New(sink) + + Expect(c.Apply(Start{ID: 1, Source: SourceClient})).To(Succeed()) + Expect(c.Apply(Start{ID: 2, Source: SourceVAD})).To(Succeed()) + Expect(c.Apply(Start{ID: 3, Source: SourceClient})).To(Succeed()) + + checkInvariants(sink.snapshot()) + + got, ok := c.State().(Active) + Expect(ok).To(BeTrue(), "state = %s, want Active(3)", c.State()) + Expect(got.ID).To(Equal(ResponseID(3))) + }) + + It("terminates on shutdown and rejects any later response (no resurrection)", func() { + sink := &recordingSink{} + c := New(sink) + + Expect(c.Apply(Start{ID: 1, Source: SourceClient})).To(Succeed()) + Expect(c.Apply(Shutdown{})).To(Succeed()) // cancels id 1 + goes terminal + Expect(c.State()).To(Equal(State(Terminated{}))) + + // A late response.create after teardown is structurally rejected. + Expect(c.Apply(Start{ID: 2, Source: SourceClient})).To(Succeed()) + Expect(c.State()).To(Equal(State(Terminated{}))) + // And a stale Finished from the cancelled response is absorbed. + Expect(c.Apply(Finished{ID: 1})).To(Succeed()) + + checkInvariants(sink.snapshot()) + starts := 0 + for _, e := range sink.snapshot() { + if _, ok := e.(StartResponse); ok { + starts++ + } + } + Expect(starts).To(Equal(1), "only id 1 ever started; the post-shutdown Start was rejected") + }) +}) + +// legacyCoord models the LEGACY startResponse/cancelActiveResponse mechanism, in +// which the snapshot ("lock" read), the cancel-and-wait, and the spawn are NOT +// atomic with respect to each other across the two driving goroutines. It exists +// only to demonstrate the dual-writer race (Part 2, failure mode 2) that +// respcoord.Coordinator eliminates. It is not used in production. 
+// +// Mapping to the legacy code: +// - startStep1 = snapshot Session.activeResponse* under responseMu +// - startStep2 = cancelActiveResponse: cancel() then <-done (outside the lock); +// a second waiter on an already-closed done returns immediately and does NOT +// decrement again (modeled by the snap==registered guard) +// - startStep3 = store the new cancel/done pair and spawn the goroutine +type legacyCoord struct { + live int // # of live response goroutines (the bug: can exceed 1) + registered uint64 // id of the currently-registered response (0 = none) + nextID uint64 +} + +func (l *legacyCoord) startStep1() uint64 { return l.registered } // snapshot + +func (l *legacyCoord) startStep2(snap uint64) { // cancel-and-wait + if snap != 0 && snap == l.registered { + l.live-- + l.registered = 0 + } +} + +func (l *legacyCoord) startStep3() { // spawn + register + l.nextID++ + l.live++ + l.registered = l.nextID +} + +var _ = DescribeTable("respcoord stringers", + func(got, want string) { Expect(got).To(Equal(want)) }, + Entry(nil, SourceClient.String(), "client"), + Entry(nil, SourceVAD.String(), "vad"), + Entry(nil, Source(99).String(), "Source(99)"), + + Entry(nil, StatusCompleted.String(), "completed"), + Entry(nil, StatusCancelled.String(), "cancelled"), + Entry(nil, Status(99).String(), "Status(99)"), + + Entry(nil, Idle{}.String(), "Idle"), + Entry(nil, Active{ID: 7}.String(), "Active(7)"), + Entry(nil, Terminated{}.String(), "Terminated"), + + Entry(nil, Start{ID: 1, Source: SourceVAD}.String(), "Start(1,vad)"), + Entry(nil, Finished{ID: 2}.String(), "Finished(2)"), + Entry(nil, Cancel{Source: SourceClient}.String(), "Cancel(client)"), + Entry(nil, Shutdown{}.String(), "Shutdown"), + + Entry(nil, CancelResponse{ID: 3}.String(), "CancelResponse(3)"), + Entry(nil, StartResponse{ID: 4}.String(), "StartResponse(4)"), + Entry(nil, EmitTerminal{ID: 5, Status: StatusCompleted}.String(), "EmitTerminal(5,completed)"), +) + +var _ = Describe("legacy dual-writer 
characterization", func() { + // Pins the exact interleaving in which the read-loop and the VAD goroutine + // both start a response and the machine ends up with TWO live responses. This + // is a characterization test for the bug: if a future change to the legacy + // model accidentally fixes it, this spec flips and we delete the legacy model. + // The production path uses respcoord.Coordinator, proven safe above. + It("can reach two live responses (the bug respcoord eliminates)", func() { + l := &legacyCoord{} + + // First response established normally. + s := l.startStep1() + l.startStep2(s) + l.startStep3() // live=1, registered=1 + Expect(l.live).To(Equal(1), "setup") + + // The race: both goroutines snapshot the SAME active response (id 1)... + snapVAD := l.startStep1() // 1 + snapClient := l.startStep1() // 1 + + // ...both "cancel-and-wait" it. The first decrements; the second finds it + // already gone and does nothing. + l.startStep2(snapVAD) // live=0, registered=0 + l.startStep2(snapClient) // no-op (already 0) + + // ...then both spawn their replacement. + l.startStep3() // live=1 + l.startStep3() // live=2 <-- two live responses + + Expect(l.live).To(Equal(2), "expected the legacy race to reach 2 live responses") + }) +}) diff --git a/core/http/endpoints/openai/ttscoord/ttscoord.go b/core/http/endpoints/openai/ttscoord/ttscoord.go new file mode 100644 index 000000000..9b4510347 --- /dev/null +++ b/core/http/endpoints/openai/ttscoord/ttscoord.go @@ -0,0 +1,150 @@ +// Package ttscoord is the explicit state machine for the realtime API's +// TTS-pipeline lifecycle (machine "M5" in docs/design/realtime-state-machines.md). +// +// The realtime TTS pipeline (realtime_tts_pipeline.go) decouples synthesis from +// LLM token generation: the token callback enqueues clauses, a single worker +// goroutine synthesizes them in order, and wait() closes the queue and joins the +// worker. 
In the legacy code the lifecycle is an implicit `closed bool` (guarded +// by the pipeline mutex) plus a `done` channel closed once by the worker. Two +// gaps: enqueue does NOT check `closed`, so a clause offered after wait() is +// silently appended to a worker that may have already exited (dropped); and the +// open/closed lifecycle is inferred from a bool rather than stored. +// +// This package makes the lifecycle explicit: +// - a sealed sum type for State (Open | Closing | Closed) — monotonic; illegal +// reversals are unrepresentable, +// - a total, pure transition function Next(state, event) -> (state, effects), +// - a single-writer Coordinator that serializes every transition. +// +// It is a genuine two-writer machine: the producer goroutine raises Close (from +// wait()), and the worker goroutine raises WorkerExited when it has drained the +// queue and seen the close — so serializing the transition matters. The poison +// `failed` latch stays a lock-free atomic.Bool in the pipeline (it is read per +// clause on the worker's hot path and is orthogonal to open/closed); this machine +// owns only the open->closing->closed lifecycle. +// +// Guarantees the spec checks: +// - Close wakes the worker to exit exactly once (idempotent wait(); invariant +// #10), +// - the lifecycle is monotonic and Closed is terminal — so a clause is never +// accepted after close (enqueue is gated on Open) and the worker is joined +// exactly once (no leak; invariant #8). +package ttscoord + +import ( + "fmt" + + "github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator" +) + +// State is the sealed sum type of TTS-pipeline lifecycle states. Exhaustively: +// Open | Closing | Closed. +type State interface { + isState() + String() string +} + +// Open: the worker is running and accepting clauses. +type Open struct{} + +// Closing: wait() has been called; the worker is draining the remaining queue and +// will exit. No new clause is accepted. 
+type Closing struct{} + +// Closed: the worker has exited (its done channel is closed). Terminal. +type Closed struct{} + +func (Open) isState() {} +func (Closing) isState() {} +func (Closed) isState() {} + +func (Open) String() string { return "Open" } +func (Closing) String() string { return "Closing" } +func (Closed) String() string { return "Closed" } + +// Event is the sealed sum type of inputs. Exhaustively: Close | WorkerExited. +type Event interface { + isEvent() + String() string +} + +// Close is raised by the producer goroutine (wait()): close the queue and ask +// the worker to finish. Idempotent. +type Close struct{} + +// WorkerExited is raised by the worker goroutine when it has drained the queue +// and observed the close, just before it closes its done channel. +type WorkerExited struct{} + +func (Close) isEvent() {} +func (WorkerExited) isEvent() {} + +func (Close) String() string { return "Close" } +func (WorkerExited) String() string { return "WorkerExited" } + +// Effect is a side effect returned by Next as data. Exhaustively: Wake. +type Effect interface { + isEffect() + String() string +} + +// Wake: signal the worker (via the buffered wake channel) so it re-checks the +// lifecycle and exits. Emitted once, on the Open->Closing transition. +type Wake struct{} + +func (Wake) isEffect() {} + +func (Wake) String() string { return "Wake" } + +// Next is the total, pure transition function. For every (state, event) it +// returns the next state and the ordered effects. It returns a non-nil error +// only for an unknown State/Event implementation. Every in-domain pair is +// defined; there are no forbidden transitions, only no-ops. +// +// The lifecycle is monotonic Open -> Closing -> Closed. Close wakes the worker +// only on the first Open->Closing transition (idempotent wait()); a later Close +// is absorbed. WorkerExited only advances Closing -> Closed. 
+func Next(s State, e Event) (State, []Effect, error) { + switch s.(type) { + case Open: + switch e.(type) { + case Close: + return Closing{}, []Effect{Wake{}}, nil + case WorkerExited: + // Worker exited while still Open (e.g. never any clause and an early + // close race) -- treat as fully closed; defensive, keeps Next total. + return Closed{}, nil, nil + } + case Closing: + switch e.(type) { + case Close: + // Idempotent wait(): already closing, no second wake. + return Closing{}, nil, nil + case WorkerExited: + return Closed{}, nil, nil + } + case Closed: + switch e.(type) { + case Close: + return Closed{}, nil, nil + case WorkerExited: + return Closed{}, nil, nil + } + } + return s, nil, fmt.Errorf("ttscoord: unhandled transition %s <- %s", s, e) +} + +// EffectSink performs the effects produced by a transition. See coordinator.Sink: +// Wake does a non-blocking send on a buffered channel, so Perform does not block +// under the lock. +type EffectSink = coordinator.Sink[Effect] + +// Coordinator serializes the TTS-pipeline transitions. The producer (Close) and +// worker (WorkerExited) goroutines both call Apply, so the lock serializes the +// two writers. See coordinator.Coordinator. +type Coordinator = coordinator.Coordinator[State, Event, Effect] + +// New returns an Open Coordinator that performs effects via sink. +func New(sink EffectSink) *Coordinator { + return coordinator.New[State, Event, Effect](Open{}, Next, sink) +} diff --git a/core/http/endpoints/openai/ttscoord/ttscoord_suite_test.go b/core/http/endpoints/openai/ttscoord/ttscoord_suite_test.go new file mode 100644 index 000000000..3f58e120d --- /dev/null +++ b/core/http/endpoints/openai/ttscoord/ttscoord_suite_test.go @@ -0,0 +1,13 @@ +package ttscoord + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestTtscoord(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "ttscoord (realtime M5) Suite") +} diff --git a/core/http/endpoints/openai/ttscoord/ttscoord_test.go b/core/http/endpoints/openai/ttscoord/ttscoord_test.go new file mode 100644 index 000000000..97524b816 --- /dev/null +++ b/core/http/endpoints/openai/ttscoord/ttscoord_test.go @@ -0,0 +1,165 @@ +package ttscoord + +import ( + "math/rand/v2" + "sync" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// recordingSink captures the ordered stream of effects. +type recordingSink struct { + mu sync.Mutex + log []Effect +} + +func (s *recordingSink) Perform(e Effect) { + s.mu.Lock() + s.log = append(s.log, e) + s.mu.Unlock() +} + +func (s *recordingSink) wakes() int { + s.mu.Lock() + defer s.mu.Unlock() + n := 0 + for _, e := range s.log { + if _, ok := e.(Wake); ok { + n++ + } + } + return n +} + +type unknownEvent struct{} + +func (unknownEvent) isEvent() {} +func (unknownEvent) String() string { return "unknownEvent" } + +type unknownState struct{} + +func (unknownState) isState() {} +func (unknownState) String() string { return "unknownState" } + +var _ = Describe("ttscoord.Next", func() { + DescribeTable("transitions", + func(state State, event Event, wantState State, wantEff []Effect) { + gotState, gotEff, err := Next(state, event) + Expect(err).NotTo(HaveOccurred()) + Expect(gotState).To(Equal(wantState)) + Expect(gotEff).To(Equal(wantEff)) + }, + Entry("open+close -> closing: wake", + Open{}, Close{}, Closing{}, []Effect{Wake{}}), + Entry("open+workerexited -> closed (defensive)", + Open{}, WorkerExited{}, Closed{}, []Effect(nil)), + Entry("closing+close -> closing, no-op (idempotent wait)", + Closing{}, Close{}, Closing{}, []Effect(nil)), + Entry("closing+workerexited -> closed", + Closing{}, WorkerExited{}, Closed{}, []Effect(nil)), + Entry("closed+close -> closed, no-op", + Closed{}, Close{}, Closed{}, []Effect(nil)), + 
Entry("closed+workerexited -> closed, no-op", + Closed{}, WorkerExited{}, Closed{}, []Effect(nil)), + ) + + It("is total over the defined (state, event) pairs", func() { + for _, s := range []State{Open{}, Closing{}, Closed{}} { + for _, e := range []Event{Close{}, WorkerExited{}} { + _, _, err := Next(s, e) + Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e) + } + } + }) + + It("errors on an unknown event type", func() { + _, _, err := Next(Open{}, unknownEvent{}) + Expect(err).To(HaveOccurred()) + }) + + It("errors on an unknown state type", func() { + _, _, err := Next(unknownState{}, Close{}) + Expect(err).To(HaveOccurred()) + }) +}) + +// phaseOf maps a state to a monotonic rank for the "never goes backwards" check. +func phaseOf(s State) int { + switch s.(type) { + case Open: + return 0 + case Closing: + return 1 + case Closed: + return 2 + default: + return -1 + } +} + +var _ = Describe("ttscoord.Coordinator", func() { + It("keeps the lifecycle monotonic and wakes at most once over random sequences", func() { + seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE} + for _, seed := range seeds { + r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5)) + sink := &recordingSink{} + c := New(sink) + prev := 0 + + for range 5000 { + if r.IntN(2) == 0 { + Expect(c.Apply(Close{})).To(Succeed()) + } else { + Expect(c.Apply(WorkerExited{})).To(Succeed()) + } + cur := phaseOf(c.State()) + Expect(cur).To(BeNumerically(">=", prev), "seed=%d: lifecycle went backwards", seed) + prev = cur + } + Expect(sink.wakes()).To(BeNumerically("<=", 1), "seed=%d: woke more than once", seed) + } + }) + + // Two-writer test: a producer raises Close while the "worker" raises + // WorkerExited, the real concurrency. The lifecycle must stay monotonic and + // Wake must fire at most once. Run under -race. 
+ It("is two-writer safe (producer Close vs worker WorkerExited)", func() { + const iterations = 200 + for range iterations { + sink := &recordingSink{} + c := New(sink) + var wg sync.WaitGroup + wg.Add(2) + go func() { defer wg.Done(); _ = c.Apply(Close{}) }() + go func() { defer wg.Done(); _ = c.Apply(WorkerExited{}) }() + wg.Wait() + // After both, drive to terminal and assert idempotence. + _ = c.Apply(Close{}) + _ = c.Apply(WorkerExited{}) + Expect(c.State()).To(Equal(State(Closed{}))) + Expect(sink.wakes()).To(BeNumerically("<=", 1)) + } + }) + + It("only Open accepts (a gate query never panics across states)", func() { + // Mirrors the pipeline's enqueue gate: accepted iff Open. + sink := &recordingSink{} + c := New(sink) + _, open := c.State().(Open) + Expect(open).To(BeTrue()) + Expect(c.Apply(Close{})).To(Succeed()) + _, open = c.State().(Open) + Expect(open).To(BeFalse()) + }) +}) + +var _ = DescribeTable("ttscoord stringers", + func(got, want string) { Expect(got).To(Equal(want)) }, + Entry(nil, Open{}.String(), "Open"), + Entry(nil, Closing{}.String(), "Closing"), + Entry(nil, Closed{}.String(), "Closed"), + Entry(nil, Close{}.String(), "Close"), + Entry(nil, WorkerExited{}.String(), "WorkerExited"), + Entry(nil, Wake{}.String(), "Wake"), +) diff --git a/core/http/endpoints/openai/turncoord/turncoord.go b/core/http/endpoints/openai/turncoord/turncoord.go new file mode 100644 index 000000000..ac9e85052 --- /dev/null +++ b/core/http/endpoints/openai/turncoord/turncoord.go @@ -0,0 +1,255 @@ +// Package turncoord is the explicit state machine for the realtime API's +// turn-detection concern (machine "M2" in +// docs/design/realtime-state-machines.md). +// +// In the legacy code this machine is implicit and, worse, split across TWO +// variables that can disagree: handleVAD's goroutine-local speechStarted bool +// and the semantic_vad liveTurnState's "is the live stream open" flag +// (lts.open()). 
They are set and cleared at separate points, so a discardTurn +// (no-speech clear, a semantic->server mode switch mid-turn, or teardown) +// closes the live stream but leaves speechStarted true. The two then disagree, +// and the next speech onset is suppressed because `if !speechStarted` is false +// — the user's next utterance silently produces no speech_started, no barge-in, +// and no commit. See docs/design/realtime-state-machines.md, Part 2 (failure +// mode 4) and the turn_lifecycle spec under formal-verification/. +// +// This package replaces that with: +// - a sealed sum type for State (illegal states are unrepresentable), +// - a total, pure transition function Next(state, event) -> (state, effects), +// - a single-writer Coordinator that serializes every transition. +// +// "Speech detected" and "a turn is open" become ONE state (Speaking), so they +// can no longer fall out of sync: every path that ends a turn returns to Idle +// and necessarily clears both. The design guarantees the invariants the specs +// check: +// - speechStarted ⟺ a turn is open (Part 4, invariant #4) — structural here, +// - a barge-in cancel precedes the next turn's commit (you must pass through +// Speaking, which barges in on entry, before a Silence can commit), +// - every opened turn is finished (commit) or discarded (abort) exactly once. +// +// Unlike M3 (respcoord), which is a genuine dual-writer race, M2's turn +// lifecycle is driven by the single handleVAD goroutine: the value here is +// making the speechStarted/turn-open desync unrepresentable, not serializing +// concurrent writers. The Coordinator still serializes transitions so that +// State() is race-free and a teardown-time Abort from another goroutine (or a +// future second writer) stays safe. +// +// Mode note: in server_vad mode there is no live ASR stream, so OpenTurn / +// DiscardTurn have nothing to open or close — the sink performs them as no-ops +// and "turn open" is satisfied vacuously. 
The state coupling (Speaking ⟺ turn +// open) still holds; it is only semantic_vad that had two real variables to +// desync. +package turncoord + +import ( + "fmt" + + "github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator" +) + +// TurnID identifies one user turn. The caller mints it when speech begins (it +// is the conversation item id the live caption deltas stream under, reused by +// the committed event so the client replaces the partial text). Carrying it in +// the state makes "commit/discard refer to the turn that was opened" explicit. +type TurnID string + +// AbortReason records why a turn was dropped without committing. Like +// respcoord.Source it is observability only — every reason aborts the same way; +// keeping it in the event makes the distinct legacy discardTurn sites explicit +// rather than collapsed into one anonymous code path. +type AbortReason int + +const ( + // AbortNoSpeech: the no-speech clear — the VAD found no segments and the + // buffer is past the holdback, so the inspected audio was not speech. + AbortNoSpeech AbortReason = iota + // AbortTeardown: the session is closing. + AbortTeardown +) + +// NOTE: a semantic->server turn-detection switch mid-turn is deliberately NOT an +// Abort: it only drops the orphaned live ASR stream and lets the turn continue +// under server_vad (so a config change can't cut off a mid-utterance speaker). +// That orphan cleanup stays inline in handleVAD; only the two reasons above end +// a turn (return to Idle). + +func (r AbortReason) String() string { + switch r { + case AbortNoSpeech: + return "no_speech" + case AbortTeardown: + return "teardown" + default: + return fmt.Sprintf("AbortReason(%d)", int(r)) + } +} + +// State is the sealed sum type of turn-detection states. The only +// implementations are the marker-method structs in this file, so callers +// outside the package cannot fabricate an out-of-band state. Exhaustively: +// Idle | Speaking. 
+type State interface { + isState() + String() string +} + +// Idle: no turn is open and no speech is in progress (legacy: speechStarted == +// false AND the live stream is closed — here a single state, so they cannot +// disagree). +type Idle struct{} + +// Speaking: a turn is open and speech is in progress (legacy: speechStarted == +// true AND, in semantic mode, the live stream open). Turn is the open turn's id. +type Speaking struct{ Turn TurnID } + +func (Idle) isState() {} +func (Speaking) isState() {} + +func (Idle) String() string { return "Idle" } +func (s Speaking) String() string { return fmt.Sprintf("Speaking(%s)", s.Turn) } + +// Event is the sealed sum type of inputs. Exhaustively: Onset | Silence | Abort. +type Event interface { + isEvent() + String() string +} + +// Onset reports that the VAD found speech this tick. Turn is the id to open the +// turn under (allocated by the caller so caption deltas can stream immediately). +// While already Speaking it is a no-op: re-detection of ongoing speech does not +// reopen a turn (legacy `if !speechStarted`). +type Onset struct{ Turn TurnID } + +// Silence reports VAD-confirmed silence past the dynamic commit threshold (the +// end-of-speech commit trigger). The threshold itself — semantic_vad's EOU vs +// eagerness fallback — is computed by the caller before raising this event; the +// machine only sequences the commit. It is a no-op while Idle (nothing to +// commit). +type Silence struct{} + +// Abort drops the open turn without committing (no-speech clear or teardown; +// see AbortReason). A mid-turn semantic->server mode switch is NOT an Abort. It +// is a no-op while Idle (nothing open).
+type Abort struct{ Reason AbortReason } + +func (Onset) isEvent() {} +func (Silence) isEvent() {} +func (Abort) isEvent() {} + +func (e Onset) String() string { return fmt.Sprintf("Onset(%s)", e.Turn) } +func (Silence) String() string { return "Silence" } +func (e Abort) String() string { return fmt.Sprintf("Abort(%s)", e.Reason) } + +// Effect is a side effect returned by Next as data for the caller to perform. +// Returning effects as data (rather than firing callbacks inside the +// transition) keeps Next pure and exhaustively testable. Exhaustively: +// BargeIn | OpenTurn | EmitSpeechStarted | EmitSpeechStopped | CommitTurn | +// DiscardTurn. +type Effect interface { + isEffect() + String() string +} + +// BargeIn: cancel any in-flight response (the M2->M3 edge). Emitted on the +// Idle->Speaking onset, before the new turn can ever commit — so a barge-in +// always precedes the next commit. +type BargeIn struct{} + +// OpenTurn: open the live ASR stream for Turn (semantic_vad). No-op in +// server_vad mode. +type OpenTurn struct{ Turn TurnID } + +// EmitSpeechStarted: send input_audio_buffer.speech_started. +type EmitSpeechStarted struct{} + +// EmitSpeechStopped: send input_audio_buffer.speech_stopped. +type EmitSpeechStopped struct{} + +// CommitTurn: finalize the turn's live stream, emit input_audio_buffer.committed +// for Turn, and issue the response (via respcoord). The completion of one turn. +type CommitTurn struct{ Turn TurnID } + +// DiscardTurn: close the turn's live stream and retract any caption deltas +// already shown for Turn (the failed transcription event). No commit, no +// response. 
+type DiscardTurn struct{ Turn TurnID } + +func (BargeIn) isEffect() {} +func (OpenTurn) isEffect() {} +func (EmitSpeechStarted) isEffect() {} +func (EmitSpeechStopped) isEffect() {} +func (CommitTurn) isEffect() {} +func (DiscardTurn) isEffect() {} + +func (BargeIn) String() string { return "BargeIn" } +func (e OpenTurn) String() string { return fmt.Sprintf("OpenTurn(%s)", e.Turn) } +func (EmitSpeechStarted) String() string { return "EmitSpeechStarted" } +func (EmitSpeechStopped) String() string { return "EmitSpeechStopped" } +func (e CommitTurn) String() string { return fmt.Sprintf("CommitTurn(%s)", e.Turn) } +func (e DiscardTurn) String() string { return fmt.Sprintf("DiscardTurn(%s)", e.Turn) } + +// Next is the total, pure transition function. For every (state, event) it +// returns the next state and the ordered effects to perform. It returns a +// non-nil error only for an unknown State/Event implementation (a programmer +// error / future type added without updating this function) — callers must +// surface that, never silently ignore it. Every in-domain (state, event) pair +// is defined; there are no "forbidden" transitions, only no-ops for events that +// don't apply to the current state. +// +// The crux of the fix is that both turn-ending transitions (Silence commit and +// Abort) go to Idle, which carries no turn data: there is no way to clear "turn +// open" while leaving "speech started" set, because they are the same state. +// The legacy desync (discardTurn closed the live stream but left speechStarted +// true) is therefore unrepresentable. +// +// Effect ordering on onset mirrors the live handleVAD: OpenTurn (start the live +// stream), then BargeIn (cancel the prior response), then EmitSpeechStarted. 
+func Next(s State, e Event) (State, []Effect, error) { + switch st := s.(type) { + case Idle: + switch ev := e.(type) { + case Onset: + return Speaking{Turn: ev.Turn}, []Effect{ + OpenTurn{Turn: ev.Turn}, + BargeIn{}, + EmitSpeechStarted{}, + }, nil + case Silence: + // Nothing in flight to commit: idempotent no-op. + return Idle{}, nil, nil + case Abort: + // No open turn: idempotent no-op (discardTurn on a closed stream). + return Idle{}, nil, nil + } + case Speaking: + switch e.(type) { + case Onset: + // Speech already in progress: re-detection does not reopen a turn + // or re-emit speech_started (legacy `if !speechStarted`). The turn + // id stays the one allocated at onset. + return Speaking{Turn: st.Turn}, nil, nil + case Silence: + return Idle{}, []Effect{ + EmitSpeechStopped{}, + CommitTurn{Turn: st.Turn}, + }, nil + case Abort: + return Idle{}, []Effect{DiscardTurn{Turn: st.Turn}}, nil + } + } + return s, nil, fmt.Errorf("turncoord: unhandled transition %s <- %s", s, e) +} + +// EffectSink performs the effects produced by a transition. See coordinator.Sink +// for the non-blocking contract: Perform runs under the coordinator lock, so it +// must not block and must not re-enter Apply. +type EffectSink = coordinator.Sink[Effect] + +// Coordinator serializes turn transitions. In practice the handleVAD goroutine is +// the only writer, but serializing keeps State() race-free and a teardown-time +// Abort from another goroutine safe. See coordinator.Coordinator. +type Coordinator = coordinator.Coordinator[State, Event, Effect] + +// New returns an idle Coordinator that performs effects via sink. 
+func New(sink EffectSink) *Coordinator {
+	// The machine starts in Idle; Next supplies every transition.
+	initial := Idle{}
+	return coordinator.New[State, Event, Effect](initial, Next, sink)
+}
diff --git a/core/http/endpoints/openai/turncoord/turncoord_suite_test.go b/core/http/endpoints/openai/turncoord/turncoord_suite_test.go
new file mode 100644
index 000000000..8e34feb74
--- /dev/null
+++ b/core/http/endpoints/openai/turncoord/turncoord_suite_test.go
@@ -0,0 +1,13 @@
+package turncoord
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestTurncoord(t *testing.T) { // go test entry point for the Ginkgo suite
+	RegisterFailHandler(Fail) // Gomega assertion failures are reported via Ginkgo's Fail
+	RunSpecs(t, "turncoord (realtime M2) Suite")
+}
diff --git a/core/http/endpoints/openai/turncoord/turncoord_test.go b/core/http/endpoints/openai/turncoord/turncoord_test.go
new file mode 100644
index 000000000..a3c342187
--- /dev/null
+++ b/core/http/endpoints/openai/turncoord/turncoord_test.go
@@ -0,0 +1,242 @@
+package turncoord
+
+import (
+	"fmt"
+	"math/rand/v2"
+	"sync"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// recordingSink captures the ordered stream of effects so the invariants can be
+// checked independently of the transition function's internals. Perform is
+// called by Coordinator.Apply under the coordinator lock, so it is already
+// serialized; the mutex here only guards reads from the spec goroutine.
+type recordingSink struct {
+	mu  sync.Mutex // guards log
+	log []Effect   // effects in the order Perform received them
+}
+
+// Perform appends e to the log under the sink's mutex.
+func (rs *recordingSink) Perform(e Effect) {
+	rs.mu.Lock()
+	defer rs.mu.Unlock()
+	rs.log = append(rs.log, e)
+}
+
+// snapshot returns a copy of the log taken under the mutex, so the caller can
+// walk it without holding the lock. The result is never nil.
+func (rs *recordingSink) snapshot() []Effect {
+	rs.mu.Lock()
+	defer rs.mu.Unlock()
+	return append(make([]Effect, 0, len(rs.log)), rs.log...)
+}
+
+// checkLog replays the effect log and asserts the turn-lifecycle safety
+// properties from docs/design/realtime-state-machines.md, Part 4 (invariant #4
+// and the discardTurn/speechStarted desync, failure mode 4):
+//
+//	(1) at most one turn open at any instant -- OpenTurn never fires while a
+//	    turn is already open;
+//	(2) every turn id is opened at most once;
+//	(3) no orphan close -- CommitTurn/DiscardTurn only fire on an open turn.
+//
+// The wire pairing of speech_started/speech_stopped is intentionally NOT
+// reconstructed here: like the legacy no-speech clear, an Abort discards the
+// turn without a speech_stopped (the failed-transcription event is its closure
+// signal). The guarantee this package adds is the *state* coupling (Speaking
+// <=> a turn is open), checked inline in the property spec below.
+func checkLog(log []Effect) {
+	open := false
+	// current is the id carried by the most recent OpenTurn; it is only
+	// meaningful while open is true. Invariant (3) is checked in its strict
+	// form: a close must name the turn that is actually open, not merely
+	// arrive while some turn is open.
+	var current TurnID
+	opens := map[TurnID]int{}
+	for i, eff := range log {
+		switch e := eff.(type) {
+		case OpenTurn:
+			Expect(open).To(BeFalse(), "invariant (1): OpenTurn(%s) while a turn is already open (effect #%d)\nlog=%v", e.Turn, i, log)
+			open = true
+			current = e.Turn
+			opens[e.Turn]++
+			Expect(opens[e.Turn]).To(Equal(1), "invariant (2): turn %s opened %d times (effect #%d)\nlog=%v", e.Turn, opens[e.Turn], i, log)
+		case CommitTurn:
+			Expect(open).To(BeTrue(), "invariant (3): CommitTurn(%s) with no open turn (effect #%d)\nlog=%v", e.Turn, i, log)
+			Expect(e.Turn).To(Equal(current), "invariant (3): CommitTurn(%s) does not match the open turn %s (effect #%d)\nlog=%v", e.Turn, current, i, log)
+			open = false
+		case DiscardTurn:
+			Expect(open).To(BeTrue(), "invariant (3): DiscardTurn(%s) with no open turn (effect #%d)\nlog=%v", e.Turn, i, log)
+			Expect(e.Turn).To(Equal(current), "invariant (3): DiscardTurn(%s) does not match the open turn %s (effect #%d)\nlog=%v", e.Turn, current, i, log)
+			open = false
+		}
+	}
+}
+
+// unknownEvent / unknownState exercise the defensive error path for a type that
+// Next does not know about (a future variant added without updating Next).
+type unknownEvent struct{}
+
+func (unknownEvent) isEvent()       {}
+func (unknownEvent) String() string { return "unknownEvent" }
+
+type unknownState struct{}
+
+func (unknownState) isState()       {}
+func (unknownState) String() string { return "unknownState" }
+
+var _ = Describe("turncoord.Next", func() {
+	// DescribeTable exhaustively pins every (state, event) cell of the pure
+	// transition function, including the idle no-op cells. This is the practical
+	// stand-in for "no transition leads to an inconsistent state": if a cell
+	// changes, this table must change with it.
+	DescribeTable("transitions",
+		func(state State, event Event, wantState State, wantEff []Effect) {
+			gotState, gotEff, err := Next(state, event)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(gotState).To(Equal(wantState))
+			Expect(gotEff).To(Equal(wantEff))
+		},
+		Entry("idle+onset -> speaking: open, barge-in, speech_started",
+			Idle{}, Onset{Turn: "t1"},
+			Speaking{Turn: "t1"},
+			[]Effect{OpenTurn{Turn: "t1"}, BargeIn{}, EmitSpeechStarted{}}),
+		Entry("idle+silence -> idle, no-op (nothing to commit)",
+			Idle{}, Silence{},
+			Idle{}, []Effect(nil)),
+		Entry("idle+abort -> idle, no-op (nothing open)",
+			Idle{}, Abort{Reason: AbortNoSpeech},
+			Idle{}, []Effect(nil)),
+		Entry("speaking+onset -> stay speaking, no-op (already speaking)",
+			Speaking{Turn: "t1"}, Onset{Turn: "t2"}, // a fresh id is ignored mid-turn
+			Speaking{Turn: "t1"}, []Effect(nil)),
+		Entry("speaking+silence -> idle: speech_stopped + commit",
+			Speaking{Turn: "t1"}, Silence{},
+			Idle{}, []Effect{EmitSpeechStopped{}, CommitTurn{Turn: "t1"}}),
+		Entry("speaking+abort(no_speech) -> idle: discard",
+			Speaking{Turn: "t1"}, Abort{Reason: AbortNoSpeech},
+			Idle{}, []Effect{DiscardTurn{Turn: "t1"}}),
+		Entry("speaking+abort(teardown) -> idle: discard",
+			Speaking{Turn: "t9"}, Abort{Reason: AbortTeardown},
+			Idle{}, []Effect{DiscardTurn{Turn: "t9"}}),
+	)
+
+	It("is total: every defined (state, event) pair is handled without error", func() {
+		states := []State{Idle{}, Speaking{Turn: "t1"}}
+		events := []Event{
+			Onset{Turn: "t2"},
+			Silence{},
+			Abort{Reason: AbortNoSpeech},
+			Abort{Reason: AbortTeardown},
+		}
+		for _, s := range states {
+			for _, e := range events {
+				_, _, err := Next(s, e)
+				Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
+			}
+		}
+	})
+
+	It("errors on an unknown event type", func() {
+		_, _, err := Next(Speaking{Turn: "t1"}, unknownEvent{})
+		Expect(err).To(HaveOccurred())
+	})
+
+	It("errors on an unknown state type", func() {
+		_, _, err := Next(unknownState{}, Onset{Turn: "t1"})
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+var _ = Describe("turncoord.Coordinator", func() {
+	// This replaces the previous rapid stateful test: a seeded random walk over
+	// the event space, asserting after every step the core state coupling --
+	// the machine is in Speaking IFF a turn is currently open -- and, once each
+	// walk has finished, the log invariants. That coupling is the whole point
+	// of M2: in the legacy code speechStarted and the live-stream-open flag
+	// were separate variables a discard could desync; here they are one state
+	// and cannot. Seeds are fixed so any failure reproduces deterministically
+	// (the failing seed/step is in the assertion message).
+	It("keeps state coupled to turn-open over random event sequences", func() {
+		seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
+		for _, seed := range seeds {
+			r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
+			sink := &recordingSink{}
+			c := New(sink)
+			var nextTurn uint64
+			open := false // independent model of "is a turn open"
+
+			for step := range 5000 {
+				switch r.IntN(3) {
+				case 0:
+					nextTurn++
+					Expect(c.Apply(Onset{Turn: TurnID(fmt.Sprintf("t%d", nextTurn))})).To(Succeed())
+					open = true // onset opens a turn (or is a no-op if already open)
+				case 1:
+					Expect(c.Apply(Silence{})).To(Succeed())
+					open = false // commit (or no-op if already idle)
+				case 2:
+					Expect(c.Apply(Abort{Reason: AbortReason(r.IntN(2))})).To(Succeed())
+					open = false // discard (or no-op if already idle)
+				}
+				_, speaking := c.State().(Speaking)
+				Expect(speaking).To(Equal(open), "coupling: seed=%d step=%d state=%s", seed, step, c.State())
+			}
+			checkLog(sink.snapshot())
+		}
+	})
+
+	// M2 is single-writer in practice (handleVAD), but teardown can Abort from
+	// another goroutine, so the Coordinator must be race-safe. Run under -race;
+	// the log invariants must hold regardless of interleaving, and no Apply may
+	// report an error (every event driven here is in-domain).
+	It("is race-safe under concurrent Apply from two goroutines", func() {
+		const perGoroutine = 2000
+		sink := &recordingSink{}
+		c := New(sink)
+
+		var idCounter uint64
+		var idMu sync.Mutex
+		nextTurn := func() TurnID {
+			idMu.Lock()
+			defer idMu.Unlock()
+			idCounter++
+			return TurnID(fmt.Sprintf("t%d", idCounter))
+		}
+
+		// Workers only record Apply failures; the assertions run on the spec
+		// goroutine after the join, because a failed Gomega assertion inside a
+		// bare goroutine (no GinkgoRecover) would panic outside Ginkgo's
+		// control. The buffer holds one slot per Apply call, so a send can
+		// never block a worker.
+		errs := make(chan error, 2*perGoroutine)
+
+		var wg sync.WaitGroup
+		drive := func(reason AbortReason) {
+			defer wg.Done()
+			for i := range perGoroutine {
+				var ev Event
+				switch i % 3 {
+				case 0:
+					ev = Onset{Turn: nextTurn()}
+				case 1:
+					ev = Silence{}
+				case 2:
+					ev = Abort{Reason: reason}
+				}
+				if err := c.Apply(ev); err != nil {
+					errs <- fmt.Errorf("Apply(%s): %w", ev, err)
+				}
+			}
+		}
+
+		wg.Add(2)
+		go drive(AbortNoSpeech)
+		go drive(AbortTeardown)
+		wg.Wait()
+		close(errs)
+
+		for err := range errs {
+			Expect(err).NotTo(HaveOccurred())
+		}
+		checkLog(sink.snapshot())
+	})
+})
+
+var _ = DescribeTable("turncoord stringers",
+	func(got, want string) { Expect(got).To(Equal(want)) },
+	Entry(nil, AbortNoSpeech.String(), "no_speech"),
+	Entry(nil, AbortTeardown.String(), "teardown"),
+	Entry(nil, AbortReason(99).String(), "AbortReason(99)"),
+
+	Entry(nil, Idle{}.String(), "Idle"),
+	Entry(nil, Speaking{Turn: "t7"}.String(), "Speaking(t7)"),
+
+	Entry(nil, Onset{Turn: "t1"}.String(), "Onset(t1)"),
+	Entry(nil, Silence{}.String(), "Silence"),
+	Entry(nil, Abort{Reason: AbortTeardown}.String(), "Abort(teardown)"),
+
+	Entry(nil, BargeIn{}.String(), "BargeIn"),
+	Entry(nil, OpenTurn{Turn: "t2"}.String(), "OpenTurn(t2)"),
+	Entry(nil, EmitSpeechStarted{}.String(), "EmitSpeechStarted"),
+	Entry(nil, EmitSpeechStopped{}.String(), "EmitSpeechStopped"),
+	Entry(nil, CommitTurn{Turn: "t3"}.String(), "CommitTurn(t3)"),
+	Entry(nil, DiscardTurn{Turn: "t4"}.String(), "DiscardTurn(t4)"),
+)
diff --git a/core/http/react-ui/e2e/traces-audio.spec.js b/core/http/react-ui/e2e/traces-audio.spec.js
new file mode 100644
index 000000000..567fd56c2
--- /dev/null
+++ b/core/http/react-ui/e2e/traces-audio.spec.js
@@ -0,0 +1,87 @@
+import { test, expect } from './coverage-fixtures.js'
+
+// Audio snippets on the Traces page must play through a blob: object URL —
+//
the CSP's connect-src allows blob: but not data:, and the waveform peaks
+// renderer fetch()es the player src — and must degrade to a readable note
+// (not a broken player) when the stored payload is the ""
+// marker an older server stamped into oversized fields.
+
+// Minimal valid 16 kHz mono 16-bit PCM WAV (0.1s 440 Hz sine), base64-encoded.
+// `samples` is the number of 16-bit samples written; `rate` is the sample rate
+// recorded in the header and used to generate the sine.
+function wavBase64(samples = 1600, rate = 16000) {
+  const dataSize = samples * 2 // two bytes per 16-bit sample
+  const buf = Buffer.alloc(44 + dataSize) // 44-byte header followed by the samples
+  buf.write('RIFF', 0)
+  buf.writeUInt32LE(36 + dataSize, 4) // RIFF chunk size
+  buf.write('WAVE', 8)
+  buf.write('fmt ', 12)
+  buf.writeUInt32LE(16, 16) // fmt chunk size
+  buf.writeUInt16LE(1, 20) // PCM
+  buf.writeUInt16LE(1, 22) // mono
+  buf.writeUInt32LE(rate, 24) // sample rate
+  buf.writeUInt32LE(rate * 2, 28) // byte rate (rate * 2 bytes per sample)
+  buf.writeUInt16LE(2, 32) // block align
+  buf.writeUInt16LE(16, 34) // bits per sample
+  buf.write('data', 36)
+  buf.writeUInt32LE(dataSize, 40)
+  for (let i = 0; i < samples; i++) {
+    buf.writeInt16LE(Math.round(8000 * Math.sin((2 * Math.PI * 440 * i) / rate)), 44 + i * 2)
+  }
+  return buf.toString('base64')
+}
+
+// Builds one backend-trace fixture of type 'transcription' whose
+// data.audio_wav_base64 is the given payload; every other field is fixed.
+function transcriptionTrace(audioWavBase64) {
+  return {
+    type: 'transcription',
+    timestamp: Date.now() * 1_000_000,
+    model_name: 'parakeet-test',
+    summary: 'transcribed utterance',
+    duration: 500_000_000,
+    error: null,
+    data: {
+      audio_wav_base64: audioWavBase64,
+      audio_duration_s: 0.1,
+      audio_snippet_s: 0.1,
+      audio_sample_rate: 16000,
+      audio_samples: 1600,
+      audio_rms_dbfs: -12.0,
+      audio_peak_dbfs: -6.0,
+      audio_dc_offset: 0,
+    },
+  }
+}
+
+// Stubs both trace endpoints (API traces empty, backend traces = `traces`),
+// opens the Traces page, switches to the Backend Traces tab and expands the
+// first 'parakeet-test' row. The route handlers await route.fulfill() so a
+// fulfilment failure is reported through the handler instead of being dropped
+// as an unhandled rejection.
+async function openBackendTraceRow(page, traces) {
+  await page.route('**/api/traces', async (route) => {
+    await route.fulfill({ contentType: 'application/json', body: JSON.stringify([]) })
+  })
+  await page.route('**/api/backend-traces', async (route) => {
+    await route.fulfill({ contentType: 'application/json', body: JSON.stringify(traces) })
+  })
+  await page.goto('/app/traces')
+  await expect(page.locator('text=Tracing is')).toBeVisible({ timeout: 10_000 })
+  await page.locator('button', { hasText: 'Backend Traces' }).click()
+  await page.locator('td', { hasText: 'parakeet-test' }).first().click()
+}
+
+test.describe('Traces - Audio Snippets', () => {
+  test('plays a clip through a blob: URL, not a CSP-blocked data: URL', async ({ page }) => {
+    await openBackendTraceRow(page, [transcriptionTrace(wavBase64())])
+
+    // The expanded row carries the snippet metrics and a player whose source
+    // is an object URL (connect-src allows blob:, so the peaks fetch works).
+    await expect(page.locator('text=Audio Snippet')).toBeVisible()
+    const audio = page.locator('audio')
+    await expect(audio).toHaveCount(1)
+    const src = await audio.getAttribute('src')
+    expect(src).toMatch(/^blob:/)
+    await expect(page.getByTestId('audio-snippet-unavailable')).toHaveCount(0)
+  })
+
+  test('shows a readable note instead of a broken player for truncated payloads', async ({ page }) => {
+    await openBackendTraceRow(page, [transcriptionTrace('')])
+
+    await expect(page.locator('text=Audio Snippet')).toBeVisible()
+    await expect(page.getByTestId('audio-snippet-unavailable')).toBeVisible()
+    await expect(page.locator('audio')).toHaveCount(0)
+  })
+})
diff --git a/core/http/react-ui/src/pages/Talk.jsx b/core/http/react-ui/src/pages/Talk.jsx
index 5a6857a9e..b25643aa7 100644
--- a/core/http/react-ui/src/pages/Talk.jsx
+++ b/core/http/react-ui/src/pages/Talk.jsx
@@ -19,24 +19,31 @@ const STATUS_STYLES = {
   error: { icon: 'fa-solid fa-circle', color: 'var(--color-error)', bg: 'var(--color-error-light)' },
 }
 
-// upsertAssistant merges a streamed transcript fragment into the assistant entry
-// identified by the server's item_id, or appends a new entry if none exists yet.
-// Keying by item_id (not a mutable index tracked across handler/updater
-// boundaries) makes streamed deltas idempotent and order-independent, so React's
-// batching of non-React data-channel events cannot produce a duplicate bubble.
-// mode 'append' adds to the running text; 'replace' sets the final transcript.
-function upsertAssistant(prev, itemId, text, mode) {
-  // Only assistant entries carry an id, and the streaming entry is almost
-  // always the newest — search from the tail so per-delta cost stays constant.
+// upsertEntry merges a streamed transcript fragment into the entry identified
+// by the server's item_id, or appends a new entry (with the given role) if
+// none exists yet. Keying by item_id (not a mutable index tracked across
+// handler/updater boundaries) makes streamed deltas idempotent and
+// order-independent, so React's batching of non-React data-channel events
+// cannot produce a duplicate bubble. mode 'append' adds to the running text;
+// 'replace' sets the final transcript — the server sends a completed event
+// whose authoritative text supersedes any live captions (e.g. the
+// semantic_vad retranscribe gate's batch decode).
+function upsertEntry(prev, itemId, role, text, mode) {
+  // The streaming entry is almost always the newest — search from the tail
+  // so per-delta cost stays constant.
   const i = prev.findLastIndex(e => e.id === itemId)
   if (i === -1) {
-    return [...prev, { role: 'assistant', id: itemId, text }]
+    return [...prev, { role, id: itemId, text }]
   }
   const next = [...prev]
   next[i] = { ...next[i], text: mode === 'append' ? next[i].text + text : text }
   return next
 }
 
+function upsertAssistant(prev, itemId, text, mode) { // assistant-role shorthand for upsertEntry
+  return upsertEntry(prev, itemId, 'assistant', text, mode)
+}
+
 export default function Talk() {
   const { addToast } = useOutletContext()
   const navigate = useNavigate()
@@ -252,12 +259,33 @@ export default function Talk() {
       case 'input_audio_buffer.speech_stopped':
         updateStatus('thinking', 'Processing...')
         break
+      case 'conversation.item.input_audio_transcription.delta':
+        // Live captions: semantic_vad streams the user's words while they
+        // are still speaking, keyed by the item id the commit will reuse.
+        if (event.delta && event.item_id) {
+          setTranscript(prev => upsertEntry(prev, event.item_id, 'user', event.delta, 'append'))
+        }
+        break
       case 'conversation.item.input_audio_transcription.completed':
         if (event.transcript) {
-          setTranscript(prev => [...prev, { role: 'user', text: event.transcript }])
+          if (event.item_id) {
+            // Replaces any live captions with the authoritative transcript
+            // (which may differ, e.g. the retranscribe gate's batch decode);
+            // creates the entry when there were none (server_vad).
+            setTranscript(prev => upsertEntry(prev, event.item_id, 'user', event.transcript, 'replace'))
+          } else {
+            setTranscript(prev => [...prev, { role: 'user', text: event.transcript }])
+          }
         }
         updateStatus('thinking', 'Generating response...')
         break
+      case 'conversation.item.input_audio_transcription.failed':
+        // The turn was discarded after captions were shown (e.g. the buffer
+        // was cleared as silence) — retract the partial entry.
+        if (event.item_id) {
+          setTranscript(prev => prev.filter(e => e.id !== event.item_id))
+        }
+        break
       case 'response.output_audio_transcript.delta':
         if (event.delta) {
           inProgressIdRef.current = event.item_id
@@ -712,7 +740,7 @@ export default function Talk() {
             )}
             {selectedModelInfo && !selectedModelInfo.self_contained && (
{[ @@ -724,9 +752,12 @@ export default function Talk() {
-
{item.label}
-
{item.value}
+
{item.label}
+ {/* full width for the value; wrap rather than overflow when the + model name is long (minWidth:0 lets the flex item shrink) */} +
{item.value || '—'}
))}
diff --git a/core/http/react-ui/src/pages/Traces.jsx b/core/http/react-ui/src/pages/Traces.jsx index 85387f815..933acf344 100644 --- a/core/http/react-ui/src/pages/Traces.jsx +++ b/core/http/react-ui/src/pages/Traces.jsx @@ -86,8 +86,40 @@ function typeBadgeStyle(type) { return { background: c.bg, color: c.color, padding: '2px 8px', borderRadius: 'var(--radius-sm)', fontSize: '0.75rem', fontWeight: 500 } } +// useWavObjectURL — decode a base64 WAV payload into a blob: object URL for +// the waveform player. A data: URL would render in