mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-30 03:17:01 -04:00
Compare commits
1 Commits
master
...
fix/distri
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a73516f9b4 |
@@ -7,11 +7,8 @@
|
|||||||
# Runs only the checks relevant to what's staged:
|
# Runs only the checks relevant to what's staged:
|
||||||
# - Go files -> make lint + make test-coverage-check
|
# - Go files -> make lint + make test-coverage-check
|
||||||
# - core/http/react-ui -> make test-ui-coverage-check (Playwright e2e + gate)
|
# - core/http/react-ui -> make test-ui-coverage-check (Playwright e2e + gate)
|
||||||
# - realtime state machines / specs -> make test-realtime-conformance
|
# A commit touching neither is skipped entirely (docs/YAML/etc. can't change
|
||||||
# (respcoord/**, turncoord/**, or formal-verification/** -- a pure .fizz
|
# lint findings, Go coverage, or the UI).
|
||||||
# spec edit must still re-verify the design, detected separately from Go)
|
|
||||||
# A commit touching none of these is skipped entirely (other docs/YAML can't
|
|
||||||
# change lint findings, Go coverage, the UI, or the realtime conformance gate).
|
|
||||||
#
|
#
|
||||||
# To bypass for a single commit (e.g. a WIP checkpoint): git commit --no-verify
|
# To bypass for a single commit (e.g. a WIP checkpoint): git commit --no-verify
|
||||||
set -eu
|
set -eu
|
||||||
@@ -23,13 +20,11 @@ staged="$(git diff --cached --name-only --diff-filter=ACMRD)"
|
|||||||
|
|
||||||
go_changed=0
|
go_changed=0
|
||||||
ui_changed=0
|
ui_changed=0
|
||||||
rt_changed=0
|
|
||||||
if echo "$staged" | grep -qE '\.go$'; then go_changed=1; fi
|
if echo "$staged" | grep -qE '\.go$'; then go_changed=1; fi
|
||||||
if echo "$staged" | grep -qE '^core/http/react-ui/'; then ui_changed=1; fi
|
if echo "$staged" | grep -qE '^core/http/react-ui/'; then ui_changed=1; fi
|
||||||
if echo "$staged" | grep -qE '^(core/http/endpoints/openai/(coordinator|respcoord|turncoord|conncoord|compactcoord|ttscoord)/|formal-verification/)'; then rt_changed=1; fi
|
|
||||||
|
|
||||||
if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ] && [ "$rt_changed" -eq 0 ]; then
|
if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ]; then
|
||||||
echo "pre-commit: no Go, React UI, or realtime-spec changes staged — skipping."
|
echo "pre-commit: no Go or React UI changes staged — skipping."
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -62,11 +57,4 @@ if [ "$ui_changed" -eq 1 ]; then
|
|||||||
make test-ui-coverage-check
|
make test-ui-coverage-check
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$rt_changed" -eq 1 ]; then
|
|
||||||
echo "pre-commit ▶ realtime state-machine conformance (make test-realtime-conformance) —"
|
|
||||||
echo " Go transition/rapid tests under -race + FizzBee model check of the"
|
|
||||||
echo " authoritative specs. Fail-closed: needs FizzBee (make install-fizzbee)."
|
|
||||||
make test-realtime-conformance
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "pre-commit ✓ all relevant checks passed"
|
echo "pre-commit ✓ all relevant checks passed"
|
||||||
|
|||||||
69
.github/workflows/realtime-conformance.yml
vendored
69
.github/workflows/realtime-conformance.yml
vendored
@@ -1,69 +0,0 @@
|
|||||||
---
|
|
||||||
name: 'realtime-conformance'
|
|
||||||
|
|
||||||
# Verifies the realtime state-machine implementations conform to their formal
|
|
||||||
# designs (docs/design/realtime-state-machines.md, formal-verification/). BOTH
|
|
||||||
# layers are enforced and the gate is fail-closed: the Go conformance layer
|
|
||||||
# (respcoord + turncoord transition/rapid tests under -race) AND the FizzBee model check of
|
|
||||||
# the authoritative specs. FizzBee is pinned + checksum-verified
|
|
||||||
# (formal-verification/fizzbee.sha256), so a failed install fails the job rather
|
|
||||||
# than silently skipping verification.
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
paths:
|
|
||||||
- 'core/http/endpoints/openai/coordinator/**'
|
|
||||||
- 'core/http/endpoints/openai/respcoord/**'
|
|
||||||
- 'core/http/endpoints/openai/turncoord/**'
|
|
||||||
- 'core/http/endpoints/openai/conncoord/**'
|
|
||||||
- 'core/http/endpoints/openai/compactcoord/**'
|
|
||||||
- 'core/http/endpoints/openai/ttscoord/**'
|
|
||||||
- 'formal-verification/**'
|
|
||||||
- 'scripts/realtime-conformance.sh'
|
|
||||||
- 'scripts/install-fizzbee.sh'
|
|
||||||
- '.github/workflows/realtime-conformance.yml'
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- master
|
|
||||||
paths:
|
|
||||||
- 'core/http/endpoints/openai/coordinator/**'
|
|
||||||
- 'core/http/endpoints/openai/respcoord/**'
|
|
||||||
- 'core/http/endpoints/openai/turncoord/**'
|
|
||||||
- 'core/http/endpoints/openai/conncoord/**'
|
|
||||||
- 'core/http/endpoints/openai/compactcoord/**'
|
|
||||||
- 'core/http/endpoints/openai/ttscoord/**'
|
|
||||||
- 'formal-verification/**'
|
|
||||||
- 'scripts/realtime-conformance.sh'
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: realtime-conformance-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
|
|
||||||
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
conformance:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
matrix:
|
|
||||||
go-version: ['1.26.x']
|
|
||||||
steps:
|
|
||||||
- name: Clone
|
|
||||||
uses: actions/checkout@v7
|
|
||||||
- name: Setup Go ${{ matrix.go-version }}
|
|
||||||
uses: actions/setup-go@v5
|
|
||||||
with:
|
|
||||||
go-version: ${{ matrix.go-version }}
|
|
||||||
cache: false
|
|
||||||
- name: Cache FizzBee
|
|
||||||
uses: actions/cache@v4
|
|
||||||
with:
|
|
||||||
path: .tools/fizzbee
|
|
||||||
key: fizzbee-v0.5.2-${{ runner.os }}-${{ hashFiles('formal-verification/fizzbee.sha256') }}
|
|
||||||
- name: Install FizzBee (pinned, checksum-verified)
|
|
||||||
# No `|| true`: a failed/forged download must fail the job, not silently
|
|
||||||
# drop the design verification. install-fizzbee.sh is a no-op if the
|
|
||||||
# cached binary is already present and valid.
|
|
||||||
run: ./scripts/install-fizzbee.sh
|
|
||||||
- name: Run conformance gate (fail-closed)
|
|
||||||
# No skip env: both the Go conformance and the FizzBee model check are
|
|
||||||
# required. The gate auto-detects .tools/fizzbee/fizz.
|
|
||||||
run: make test-realtime-conformance
|
|
||||||
9
.gitignore
vendored
9
.gitignore
vendored
@@ -97,12 +97,3 @@ core/http/react-ui/test-results/
|
|||||||
|
|
||||||
# Local Apple signing material (never commit)
|
# Local Apple signing material (never commit)
|
||||||
.certs/
|
.certs/
|
||||||
|
|
||||||
# Pinned dev tools (e.g. FizzBee for the realtime-conformance gate)
|
|
||||||
.tools/
|
|
||||||
|
|
||||||
# FizzBee model-check artifacts: the parser emits <spec>.json next to each
|
|
||||||
# .fizz and the checker writes run dirs under out/. Both are regenerated by
|
|
||||||
# the realtime-conformance gate; only the .fizz sources are authoritative.
|
|
||||||
formal-verification/*.json
|
|
||||||
formal-verification/out/
|
|
||||||
|
|||||||
14
Makefile
14
Makefile
@@ -405,18 +405,6 @@ test-realtime: build-mock-backend
|
|||||||
@echo 'Running realtime e2e tests (mock backend)'
|
@echo 'Running realtime e2e tests (mock backend)'
|
||||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="Realtime && !real-models" --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
|
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="Realtime && !real-models" --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
|
||||||
|
|
||||||
# Verify the realtime state-machine implementations conform to their formal
|
|
||||||
# designs (Go transition/rapid tests under -race + FizzBee model check of the
|
|
||||||
# authoritative specs). See docs/design/realtime-state-machines.md (Part 6) and
|
|
||||||
# docs/design/specs/README.md.
|
|
||||||
test-realtime-conformance:
|
|
||||||
GOCMD=$(GOCMD) ./scripts/realtime-conformance.sh
|
|
||||||
|
|
||||||
# Install the pinned, checksum-verified FizzBee model checker (into .tools/,
|
|
||||||
# gitignored) used by test-realtime-conformance. Idempotent; no-op if present.
|
|
||||||
install-fizzbee:
|
|
||||||
./scripts/install-fizzbee.sh
|
|
||||||
|
|
||||||
# Container-based real-model realtime testing. Build env vars / pipeline
|
# Container-based real-model realtime testing. Build env vars / pipeline
|
||||||
# definition kept here so test-realtime-models-docker can drive a fully wired
|
# definition kept here so test-realtime-models-docker can drive a fully wired
|
||||||
# pipeline (VAD + STT + LLM + TTS) from inside a containerised runner.
|
# pipeline (VAD + STT + LLM + TTS) from inside a containerised runner.
|
||||||
@@ -1039,7 +1027,7 @@ test-extra-backend-whisper-transcription: docker-build-whisper
|
|||||||
## is reachable.
|
## is reachable.
|
||||||
test-extra-backend-parakeet-cpp-transcription: docker-build-parakeet-cpp
|
test-extra-backend-parakeet-cpp-transcription: docker-build-parakeet-cpp
|
||||||
BACKEND_IMAGE=local-ai-backend:parakeet-cpp \
|
BACKEND_IMAGE=local-ai-backend:parakeet-cpp \
|
||||||
BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/realtime_eou_120m-v1-f16.gguf \
|
BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/tdt_ctc-110m-f16.gguf \
|
||||||
BACKEND_TEST_AUDIO_URL=https://github.com/ggml-org/whisper.cpp/raw/master/samples/jfk.wav \
|
BACKEND_TEST_AUDIO_URL=https://github.com/ggml-org/whisper.cpp/raw/master/samples/jfk.wav \
|
||||||
BACKEND_TEST_CAPS=health,load,transcription \
|
BACKEND_TEST_CAPS=health,load,transcription \
|
||||||
$(MAKE) test-extra-backend
|
$(MAKE) test-extra-backend
|
||||||
|
|||||||
@@ -18,18 +18,6 @@ service Backend {
|
|||||||
rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
|
rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
|
||||||
rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
|
rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
|
||||||
rpc AudioTranscriptionStream(TranscriptRequest) returns (stream TranscriptStreamResponse) {}
|
rpc AudioTranscriptionStream(TranscriptRequest) returns (stream TranscriptStreamResponse) {}
|
||||||
// AudioTranscriptionLive is the bidirectional live-microphone ASR RPC. The
|
|
||||||
// first message MUST carry a Config; subsequent messages carry Audio frames
|
|
||||||
// (mono float PCM at config.sample_rate, 16 kHz default). After a
|
|
||||||
// successful open the backend replies with a single ready ack
|
|
||||||
// (TranscriptLiveResponse{ready:true}); backends or models without
|
|
||||||
// cache-aware streaming support return UNIMPLEMENTED instead. Newly
|
|
||||||
// finalized text streams back as deltas; eou=true marks the model's
|
|
||||||
// end-of-utterance token. One stream spans many utterances (the decoder
|
|
||||||
// resets itself after each EOU). Closing the send side finalizes: the
|
|
||||||
// backend flushes the decoder tail and emits a terminal message carrying
|
|
||||||
// final_result. A second Config mid-stream resets the decode session.
|
|
||||||
rpc AudioTranscriptionLive(stream TranscriptLiveRequest) returns (stream TranscriptLiveResponse) {}
|
|
||||||
rpc TTS(TTSRequest) returns (Result) {}
|
rpc TTS(TTSRequest) returns (Result) {}
|
||||||
rpc TTSStream(TTSRequest) returns (stream Reply) {}
|
rpc TTSStream(TTSRequest) returns (stream Reply) {}
|
||||||
rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
|
rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
|
||||||
@@ -491,10 +479,6 @@ message TranscriptResult {
|
|||||||
string text = 2;
|
string text = 2;
|
||||||
string language = 3;
|
string language = 3;
|
||||||
float duration = 4;
|
float duration = 4;
|
||||||
// True when the decode ended on the model's end-of-utterance special token
|
|
||||||
// (<EOU>/<EOB>, emitted by cache-aware streaming models such as
|
|
||||||
// parakeet_realtime_eou_120m-v1). The marker itself is stripped from text.
|
|
||||||
bool eou = 5;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
message TranscriptStreamResponse {
|
message TranscriptStreamResponse {
|
||||||
@@ -502,34 +486,6 @@ message TranscriptStreamResponse {
|
|||||||
TranscriptResult final_result = 2;
|
TranscriptResult final_result = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
// === AudioTranscriptionLive messages =====================================
|
|
||||||
|
|
||||||
message TranscriptLiveRequest {
|
|
||||||
oneof payload {
|
|
||||||
TranscriptLiveConfig config = 1;
|
|
||||||
TranscriptLiveAudio audio = 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
message TranscriptLiveConfig {
|
|
||||||
string language = 1; // "" => model default
|
|
||||||
int32 sample_rate = 2; // 0 => 16000; backends may reject others
|
|
||||||
map<string, string> params = 3; // backend-specific tuning
|
|
||||||
}
|
|
||||||
|
|
||||||
message TranscriptLiveAudio {
|
|
||||||
repeated float pcm = 1; // mono PCM in [-1,1] at config.sample_rate
|
|
||||||
}
|
|
||||||
|
|
||||||
message TranscriptLiveResponse {
|
|
||||||
bool ready = 1; // open ack: sent once, before any delta
|
|
||||||
string delta = 2; // newly-finalized text since previous response
|
|
||||||
bool eou = 3; // <EOU> fired during this feed (the user yielded the turn)
|
|
||||||
repeated TranscriptWord words = 4; // words finalized by this feed (stream-relative ns)
|
|
||||||
TranscriptResult final_result = 5; // terminal message only, after the send side closes
|
|
||||||
bool eob = 6; // <EOB> fired: a backchannel ("uh-huh") ended — NOT a turn boundary
|
|
||||||
}
|
|
||||||
|
|
||||||
message TranscriptWord {
|
message TranscriptWord {
|
||||||
int64 start = 1;
|
int64 start = 1;
|
||||||
int64 end = 2;
|
int64 end = 2;
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
LLAMA_VERSION?=6f4f53f2b7da54fcdbbecaaa734337c337ad6176
|
LLAMA_VERSION?=dbdaece23de9ac63f2e7ca9e6bfcdc4fc156a3fa
|
||||||
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
||||||
|
|
||||||
CMAKE_ARGS?=
|
CMAKE_ARGS?=
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
# Local development: point at a working checkout instead of cloning, e.g.
|
# Local development: point at a working checkout instead of cloning, e.g.
|
||||||
# make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
|
# make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
|
||||||
|
|
||||||
PRIVACY_FILTER_VERSION?=595f59630c69d361b5196f2aba2c71c873d0c13c
|
PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
|
||||||
PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
|
PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
|
||||||
PRIVACY_FILTER_SRC?=
|
PRIVACY_FILTER_SRC?=
|
||||||
|
|
||||||
|
|||||||
@@ -1,81 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
// utteranceBoundary is the single definition of a small state machine that was
|
|
||||||
// previously open-coded three times — as a bare `finalEou` bool with an ad-hoc
|
|
||||||
// toggle — in the live feed (live.go), the file-stream text path, and the
|
|
||||||
// file-stream JSON path (goparakeetcpp.go).
|
|
||||||
//
|
|
||||||
// It answers one running question: does the decode currently rest on an
|
|
||||||
// end-of-utterance boundary? That is the value a closing FinalResult reports as
|
|
||||||
// .Eou and the realtime turn detector treats as a commit point.
|
|
||||||
//
|
|
||||||
// parakeet auto-resets its decoder after every <EOU>/<EOB>, so one streaming
|
|
||||||
// session is a sequence of utterances and this is a LATCH, not a monotonic
|
|
||||||
// flag: it closes on an <EOU> and reopens as soon as the next utterance starts.
|
|
||||||
// (Contrast the realtime API's per-turn `eouSeen`, which only ever goes
|
|
||||||
// false->true because each turn gets a fresh stream. Here the stream outlives
|
|
||||||
// the turn, so the boundary status must be able to reopen.)
|
|
||||||
//
|
|
||||||
// The only transitions, over the events one streamFeedResult carries — an
|
|
||||||
// <EOU>, an <EOB> (backchannel), or plain speech output (text and/or words):
|
|
||||||
//
|
|
||||||
// <EOU>
|
|
||||||
// open ───────────► closed
|
|
||||||
// ▲ ▲ │ │ │
|
|
||||||
// │ └─┘ <EOB>|speech │ │ <EOU>
|
|
||||||
// │ (stay open) │ └─┘ (stay closed)
|
|
||||||
// └──────────────────┘
|
|
||||||
// <EOB>|speech
|
|
||||||
//
|
|
||||||
// open = NOT on an utterance boundary: mid-utterance, the last boundary was
|
|
||||||
// a backchannel <EOB>, or the stream just began (the initial state).
|
|
||||||
// closed = the last meaningful event was an <EOU> with no later speech: a real
|
|
||||||
// turn boundary.
|
|
||||||
//
|
|
||||||
// A feed that carries nothing (no eou/eob/text/words — e.g. a finalize flush
|
|
||||||
// that produced no tail) is a no-op and leaves the state unchanged, matching
|
|
||||||
// the legacy "leave finalEou as it was" behaviour.
|
|
||||||
//
|
|
||||||
// The state carries no data, so it is modelled as a two-valued type (a named
|
|
||||||
// bool) rather than an int enum: every inhabitant is legal, so illegal states
|
|
||||||
// are unrepresentable — the payload-free analog of the sealed sum types the
|
|
||||||
// realtime machines use (those need interfaces because their states carry data,
|
|
||||||
// e.g. Active{ID}, where "Active with no ID" is the illegal combination a scalar
|
|
||||||
// cannot even express).
|
|
||||||
type utteranceBoundary bool
|
|
||||||
|
|
||||||
const (
|
|
||||||
// boundaryOpen is the zero value (false), so a fresh decode starts open —
|
|
||||||
// exactly the legacy `var finalEou bool` (false) initial condition.
|
|
||||||
boundaryOpen utteranceBoundary = false
|
|
||||||
boundaryClosed utteranceBoundary = true
|
|
||||||
)
|
|
||||||
|
|
||||||
// observe folds one decode increment into the latch and returns the new state.
|
|
||||||
//
|
|
||||||
// <EOU> takes priority when a single feed carries both an <EOU> and speech
|
|
||||||
// (e.g. {"text":"hello","eou":1}): the utterance both produced that text AND
|
|
||||||
// ended, so the decode rests on the boundary. This matches the legacy
|
|
||||||
// eou-checked-first ordering at every call site.
|
|
||||||
func (b utteranceBoundary) observe(r streamFeedResult) utteranceBoundary {
|
|
||||||
switch {
|
|
||||||
case r.Eou:
|
|
||||||
return boundaryClosed
|
|
||||||
case r.Eob || r.Delta != "" || len(r.Words) > 0:
|
|
||||||
return boundaryOpen
|
|
||||||
default:
|
|
||||||
return b
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ended reports whether the decode currently rests on an end-of-utterance
|
|
||||||
// boundary (a real <EOU>, not a backchannel <EOB>). This is what a closing
|
|
||||||
// FinalResult carries as .Eou.
|
|
||||||
func (b utteranceBoundary) ended() bool { return b == boundaryClosed }
|
|
||||||
|
|
||||||
func (b utteranceBoundary) String() string {
|
|
||||||
if b == boundaryClosed {
|
|
||||||
return "closed"
|
|
||||||
}
|
|
||||||
return "open"
|
|
||||||
}
|
|
||||||
@@ -1,92 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"math/rand/v2"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
var _ = Describe("utteranceBoundary (decode end-of-utterance latch)", func() {
|
|
||||||
It("starts open: a fresh decode is not on a boundary", func() {
|
|
||||||
var b utteranceBoundary
|
|
||||||
Expect(b).To(Equal(boundaryOpen))
|
|
||||||
Expect(b.ended()).To(BeFalse())
|
|
||||||
})
|
|
||||||
|
|
||||||
DescribeTable("single feed transition from the open state",
|
|
||||||
func(r streamFeedResult, wantEnded bool) {
|
|
||||||
Expect(boundaryOpen.observe(r).ended()).To(Equal(wantEnded))
|
|
||||||
},
|
|
||||||
Entry("<EOU> closes it", streamFeedResult{Eou: true}, true),
|
|
||||||
Entry("<EOU> with text closes it (eou wins)", streamFeedResult{Delta: "hi", Eou: true}, true),
|
|
||||||
Entry("<EOB> stays open (backchannel is not a turn boundary)", streamFeedResult{Eob: true}, false),
|
|
||||||
Entry("plain text stays open", streamFeedResult{Delta: "hello"}, false),
|
|
||||||
Entry("words-only stays open", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false),
|
|
||||||
Entry("empty feed is a no-op (stays open)", streamFeedResult{}, false),
|
|
||||||
)
|
|
||||||
|
|
||||||
DescribeTable("single feed transition from the closed state",
|
|
||||||
func(r streamFeedResult, wantEnded bool) {
|
|
||||||
Expect(boundaryClosed.observe(r).ended()).To(Equal(wantEnded))
|
|
||||||
},
|
|
||||||
Entry("another <EOU> stays closed", streamFeedResult{Eou: true}, true),
|
|
||||||
Entry("trailing speech reopens it", streamFeedResult{Delta: "and more"}, false),
|
|
||||||
Entry("words reopen it", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false),
|
|
||||||
Entry("a backchannel <EOB> reopens it", streamFeedResult{Eob: true}, false),
|
|
||||||
Entry("empty feed is a no-op (stays closed)", streamFeedResult{}, true),
|
|
||||||
)
|
|
||||||
|
|
||||||
It("is a latch: <EOU> then trailing speech reopens, then <EOU> closes again", func() {
|
|
||||||
b := boundaryOpen
|
|
||||||
b = b.observe(streamFeedResult{Delta: "turn one", Eou: true})
|
|
||||||
Expect(b.ended()).To(BeTrue())
|
|
||||||
b = b.observe(streamFeedResult{Delta: " and more"})
|
|
||||||
Expect(b.ended()).To(BeFalse(), "trailing speech without an EOU is an open utterance")
|
|
||||||
b = b.observe(streamFeedResult{Eou: true})
|
|
||||||
Expect(b.ended()).To(BeTrue())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("treats a backchannel before a real EOU correctly", func() {
|
|
||||||
b := boundaryOpen
|
|
||||||
b = b.observe(streamFeedResult{Delta: "uh huh", Eob: true})
|
|
||||||
Expect(b.ended()).To(BeFalse(), "a backchannel must not masquerade as a turn boundary")
|
|
||||||
b = b.observe(streamFeedResult{Delta: "done", Eou: true})
|
|
||||||
Expect(b.ended()).To(BeTrue())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("matches the reference fold over seeded random feed sequences", func() {
|
|
||||||
// The invariant: after any sequence of feeds, ended() is true iff the
|
|
||||||
// last feed that carried ANY event was an <EOU>. <EOU> takes priority
|
|
||||||
// when a feed carries both an EOU and speech; empty feeds are ignored.
|
|
||||||
for seed := uint64(1); seed <= 200; seed++ {
|
|
||||||
rng := rand.New(rand.NewPCG(seed, seed*2654435761))
|
|
||||||
b := boundaryOpen
|
|
||||||
lastWasEou := false // reference: did the last meaningful feed end on EOU?
|
|
||||||
steps := rng.IntN(30)
|
|
||||||
for i := 0; i < steps; i++ {
|
|
||||||
var r streamFeedResult
|
|
||||||
switch rng.IntN(5) {
|
|
||||||
case 0:
|
|
||||||
r = streamFeedResult{Eou: true}
|
|
||||||
case 1:
|
|
||||||
r = streamFeedResult{Eob: true}
|
|
||||||
case 2:
|
|
||||||
r = streamFeedResult{Delta: "w"}
|
|
||||||
case 3:
|
|
||||||
r = streamFeedResult{Delta: "w", Eou: true} // eou + speech, eou wins
|
|
||||||
case 4:
|
|
||||||
r = streamFeedResult{} // empty: no-op
|
|
||||||
}
|
|
||||||
b = b.observe(r)
|
|
||||||
if r.Eou {
|
|
||||||
lastWasEou = true
|
|
||||||
} else if r.Eob || r.Delta != "" || len(r.Words) > 0 {
|
|
||||||
lastWasEou = false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Expect(b.ended()).To(Equal(lastWasEou),
|
|
||||||
"seed %d: latch disagreed with the reference fold", seed)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
})
|
|
||||||
@@ -1,82 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
|
|
||||||
"google.golang.org/grpc/codes"
|
|
||||||
"google.golang.org/grpc/status"
|
|
||||||
)
|
|
||||||
|
|
||||||
// streamFeedResult is one decode increment from a cache-aware streaming session:
|
|
||||||
// the newly-finalized text plus the model's own per-feed boundary tokens
|
|
||||||
// (<EOU>/<EOB>) and word timings. It is the single event type both the live
|
|
||||||
// (bidi) and file (server-stream) paths fold over, hiding the ABI v4 JSON vs
|
|
||||||
// older text-only entry-point split behind one shape.
|
|
||||||
type streamFeedResult struct {
|
|
||||||
Delta string
|
|
||||||
Eou bool
|
|
||||||
Eob bool
|
|
||||||
Words []transcriptWord
|
|
||||||
}
|
|
||||||
|
|
||||||
// feedChunk feeds one PCM chunk to the streaming session (or finalizes it, when
|
|
||||||
// finalize is true) and returns the unified decode increment. It prefers the
|
|
||||||
// ABI v4 JSON entry points (which also carry per-word timestamps) and falls
|
|
||||||
// back to the older text-only entry points against an older libparakeet.so.
|
|
||||||
//
|
|
||||||
// This is the one place the JSON-vs-text choice is made; every consumer works
|
|
||||||
// in terms of streamFeedResult.
|
|
||||||
func (p *ParakeetCpp) feedChunk(stream uintptr, pcm []float32, finalize bool) (streamFeedResult, error) {
|
|
||||||
if CppStreamFeedJSON != nil {
|
|
||||||
doc, err := p.streamFeedDoc(stream, pcm, finalize)
|
|
||||||
if err != nil {
|
|
||||||
return streamFeedResult{}, err
|
|
||||||
}
|
|
||||||
return streamFeedResult{Delta: doc.Text, Eou: doc.Eou != 0, Eob: doc.Eob != 0, Words: doc.Words}, nil
|
|
||||||
}
|
|
||||||
delta, eou, eob, err := p.streamFeedText(stream, pcm, finalize)
|
|
||||||
if err != nil {
|
|
||||||
return streamFeedResult{}, err
|
|
||||||
}
|
|
||||||
return streamFeedResult{Delta: delta, Eou: eou, Eob: eob}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// feedSlices feeds pcm through the session in streamChunkSamples slices,
|
|
||||||
// invoking onFeed for each decode increment. It does NOT finalize: callers
|
|
||||||
// decide when the send side is done. The file path finalizes after the whole
|
|
||||||
// file; the live path finalizes only when its request channel closes, never
|
|
||||||
// between audio messages. Slicing keeps each per-call engineMu hold short so
|
|
||||||
// concurrent unary transcription interleaves fairly (the C session buffers
|
|
||||||
// internally).
|
|
||||||
//
|
|
||||||
// If ctx is non-nil it is checked before each slice so a cancelled file
|
|
||||||
// transcription stops promptly; the live path passes nil (it is bounded by its
|
|
||||||
// request channel instead of a ctx).
|
|
||||||
func (p *ParakeetCpp) feedSlices(ctx context.Context, stream uintptr, pcm []float32, onFeed func(streamFeedResult) error) error {
|
|
||||||
for off := 0; off < len(pcm); off += streamChunkSamples {
|
|
||||||
if ctx != nil {
|
|
||||||
if err := ctx.Err(); err != nil {
|
|
||||||
return status.Error(codes.Canceled, "transcription cancelled")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
end := min(off+streamChunkSamples, len(pcm))
|
|
||||||
res, err := p.feedChunk(stream, pcm[off:end], false)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if err := onFeed(res); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// flushTail finalizes the session once and folds the flushed tail (the last
|
|
||||||
// ~2 encoder frames of text, which only appear on finalize) through onFeed.
|
|
||||||
func (p *ParakeetCpp) flushTail(stream uintptr, onFeed func(streamFeedResult) error) error {
|
|
||||||
res, err := p.feedChunk(stream, nil, true)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return onFeed(res)
|
|
||||||
}
|
|
||||||
@@ -103,13 +103,12 @@ type transcriptJSON struct {
|
|||||||
// {"text":"...","eou":0,"eob":0,"frame_sec":0.080000,
|
// {"text":"...","eou":0,"eob":0,"frame_sec":0.080000,
|
||||||
// "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]}
|
// "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]}
|
||||||
//
|
//
|
||||||
// "text" is the newly-finalized text since the last call. Under ABI v5 "eou"
|
// "text" is the newly-finalized text since the last call; "eou" is 1 when an
|
||||||
// is 1 iff an <EOU> fired this feed (the user yielded the turn) and "eob" 1
|
// <EOU> (end of utterance) fired this feed and "eob" is 1 when an <EOB>
|
||||||
// iff an <EOB> fired (a backchannel like "uh-huh" ended — NOT a turn
|
// (backchannel) fired. ABI v4 conflated the two into "eou"; v5 split them, so
|
||||||
// boundary). A v4 library has no "eob" field and its "eou" conflates both
|
// we read both and treat either as an utterance boundary for segmentation.
|
||||||
// tokens: Eob stays 0 and Eou keeps the old any-event meaning. "words" are
|
// "words" are the words finalized this call with absolute (stream-relative)
|
||||||
// the words finalized this call with absolute (stream-relative) start/end
|
// start/end seconds.
|
||||||
// seconds.
|
|
||||||
type streamFeedJSON struct {
|
type streamFeedJSON struct {
|
||||||
Text string `json:"text"`
|
Text string `json:"text"`
|
||||||
Eou int `json:"eou"`
|
Eou int `json:"eou"`
|
||||||
@@ -365,7 +364,7 @@ var segmentSeparators = []rune{'.', '?', '!'}
|
|||||||
// the caller requested word granularity; token ids populate each segment's
|
// the caller requested word granularity; token ids populate each segment's
|
||||||
// Tokens by time-window membership. Shared by the batched and direct paths.
|
// Tokens by time-window membership. Shared by the batched and direct paths.
|
||||||
func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gapFrames int) pb.TranscriptResult {
|
func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gapFrames int) pb.TranscriptResult {
|
||||||
text, eou := stripEouMarker(strings.TrimSpace(doc.Text))
|
text := strings.TrimSpace(doc.Text)
|
||||||
|
|
||||||
// Frame-unit gap threshold -> seconds (NeMo segment_gap_threshold). 0 = off.
|
// Frame-unit gap threshold -> seconds (NeMo segment_gap_threshold). 0 = off.
|
||||||
gapSeconds := 0.0
|
gapSeconds := 0.0
|
||||||
@@ -384,7 +383,6 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap
|
|||||||
return pb.TranscriptResult{
|
return pb.TranscriptResult{
|
||||||
Text: text,
|
Text: text,
|
||||||
Segments: []*pb.TranscriptSegment{{Id: 0, Text: text}},
|
Segments: []*pb.TranscriptSegment{{Id: 0, Text: text}},
|
||||||
Eou: eou,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -411,25 +409,7 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap
|
|||||||
}
|
}
|
||||||
segments = append(segments, seg)
|
segments = append(segments, seg)
|
||||||
}
|
}
|
||||||
return pb.TranscriptResult{Text: text, Segments: segments, Eou: eou}
|
return pb.TranscriptResult{Text: text, Segments: segments}
|
||||||
}
|
|
||||||
|
|
||||||
// stripEouMarker removes a trailing literal <EOU>/<EOB> from offline-decode
|
|
||||||
// text and reports whether the decode ended on an end-of-UTTERANCE token. The
|
|
||||||
// realtime EOU model's offline decode keeps the special token in the
|
|
||||||
// detokenized text (the streaming path strips it and surfaces it as flags
|
|
||||||
// instead); user-visible transcripts must never carry either marker, but only
|
|
||||||
// <EOU> may confirm the semantic_vad retranscribe cross-check — a decode
|
|
||||||
// ending on <EOB> means the last thing heard was a backchannel, not the user
|
|
||||||
// yielding the turn.
|
|
||||||
func stripEouMarker(text string) (string, bool) {
|
|
||||||
if strings.HasSuffix(text, "<EOU>") {
|
|
||||||
return strings.TrimSpace(strings.TrimSuffix(text, "<EOU>")), true
|
|
||||||
}
|
|
||||||
if strings.HasSuffix(text, "<EOB>") {
|
|
||||||
return strings.TrimSpace(strings.TrimSuffix(text, "<EOB>")), false
|
|
||||||
}
|
|
||||||
return text, false
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// splitWordsIntoSegments groups words into segments exactly as NeMo's
|
// splitWordsIntoSegments groups words into segments exactly as NeMo's
|
||||||
@@ -496,55 +476,41 @@ func tokensInWindow(tokens []transcriptToken, start, end float64) []int32 {
|
|||||||
return ids
|
return ids
|
||||||
}
|
}
|
||||||
|
|
||||||
// streamSegmenter accumulates streaming decode increments into per-utterance
|
// streamSegmenter accumulates streaming words into per-utterance segments. EOU
|
||||||
// segments. <EOU>/<EOB> are the model's own utterance boundaries; each closes a
|
// is the model's own utterance boundary; each closed segment takes its start/end
|
||||||
// segment. When the feed carries per-word timings (ABI v4 JSON), a closed
|
// from its first/last accumulated word.
|
||||||
// segment takes its start/end from its first/last word; against an older
|
|
||||||
// text-only library (no words) it falls back to segmenting the delta text, so
|
|
||||||
// the same assembler serves both paths.
|
|
||||||
type streamSegmenter struct {
|
type streamSegmenter struct {
|
||||||
segs []*pb.TranscriptSegment
|
segs []*pb.TranscriptSegment
|
||||||
cur []transcriptWord // words for the open segment (ABI v4 JSON path)
|
cur []transcriptWord
|
||||||
curText []string // delta text for the open segment (text-only path)
|
nextID int32
|
||||||
nextID int32
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *streamSegmenter) add(r streamFeedResult) {
|
func (s *streamSegmenter) add(doc streamFeedJSON) {
|
||||||
s.cur = append(s.cur, r.Words...)
|
s.cur = append(s.cur, doc.Words...)
|
||||||
if len(r.Words) == 0 && r.Delta != "" {
|
// Close the segment on either turn signal: <EOU> (end of utterance) or
|
||||||
// Older libparakeet.so with no per-word timing: segment from the text.
|
// <EOB> (backchannel). ABI v4 reported both via "eou"; v5 split them, so we
|
||||||
s.curText = append(s.curText, r.Delta)
|
// OR them here to keep the v4 segmentation boundaries.
|
||||||
}
|
if doc.Eou != 0 || doc.Eob != 0 {
|
||||||
// Both <EOU> and <EOB> reset the decoder, so both close a segment.
|
|
||||||
if r.Eou || r.Eob {
|
|
||||||
s.flush()
|
s.flush()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *streamSegmenter) flush() {
|
func (s *streamSegmenter) flush() {
|
||||||
switch {
|
if len(s.cur) == 0 {
|
||||||
case len(s.cur) > 0:
|
return
|
||||||
parts := make([]string, len(s.cur))
|
|
||||||
for i, w := range s.cur {
|
|
||||||
parts[i] = w.W
|
|
||||||
}
|
|
||||||
s.segs = append(s.segs, &pb.TranscriptSegment{
|
|
||||||
Id: s.nextID,
|
|
||||||
Start: secondsToNanos(s.cur[0].Start),
|
|
||||||
End: secondsToNanos(s.cur[len(s.cur)-1].End),
|
|
||||||
Text: strings.TrimSpace(strings.Join(parts, " ")),
|
|
||||||
})
|
|
||||||
s.nextID++
|
|
||||||
case len(s.curText) > 0:
|
|
||||||
// No words this segment: emit a text-only segment (no timestamps),
|
|
||||||
// skipping a purely-whitespace one as the legacy text path did.
|
|
||||||
if t := strings.TrimSpace(strings.Join(s.curText, "")); t != "" {
|
|
||||||
s.segs = append(s.segs, &pb.TranscriptSegment{Id: s.nextID, Text: t})
|
|
||||||
s.nextID++
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
parts := make([]string, len(s.cur))
|
||||||
|
for i, w := range s.cur {
|
||||||
|
parts[i] = w.W
|
||||||
|
}
|
||||||
|
s.segs = append(s.segs, &pb.TranscriptSegment{
|
||||||
|
Id: s.nextID,
|
||||||
|
Start: secondsToNanos(s.cur[0].Start),
|
||||||
|
End: secondsToNanos(s.cur[len(s.cur)-1].End),
|
||||||
|
Text: strings.TrimSpace(strings.Join(parts, " ")),
|
||||||
|
})
|
||||||
|
s.nextID++
|
||||||
s.cur = nil
|
s.cur = nil
|
||||||
s.curText = nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *streamSegmenter) segments() []*pb.TranscriptSegment { return s.segs }
|
func (s *streamSegmenter) segments() []*pb.TranscriptSegment { return s.segs }
|
||||||
@@ -569,119 +535,18 @@ func secondsToNanos(sec float64) int64 {
|
|||||||
return int64(sec * 1e9)
|
return int64(sec * 1e9)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Per-C-call engine serialization for the streaming paths.
|
|
||||||
//
|
|
||||||
// Every individual C call (begin / feed / finalize / free) takes engineMu and
|
|
||||||
// re-checks ctxPtr under the lock; the lock is NEVER held across a stream's
|
|
||||||
// lifetime. This is safe because each parakeet.cpp call builds its own ggml
|
|
||||||
// graph and all streaming caches live in the session object, not the ctx —
|
|
||||||
// the only ctx-shared mutable state is last_error, which is why it is read
|
|
||||||
// under the same lock as the failing call. Holding the lock per call (rather
|
|
||||||
// than per stream, as this file previously did) keeps a long-lived live
|
|
||||||
// session from starving batched unary transcription and vice versa.
|
|
||||||
//
|
|
||||||
// A stream must not outlive its ctx (C-API contract). Free() takes engineMu
|
|
||||||
// and zeroes ctxPtr, so a racing per-call helper returns ModelNotLoaded
|
|
||||||
// instead of feeding a freed engine; streamFree of an orphaned session only
|
|
||||||
// runs the session destructor, which does not touch the ctx.
|
|
||||||
|
|
||||||
// streamBegin opens a cache-aware streaming session. A 0 stream with nil
|
|
||||||
// error means the loaded model is not a streaming model.
|
|
||||||
func (p *ParakeetCpp) streamBegin(lang string) (uintptr, error) {
|
|
||||||
p.engineMu.Lock()
|
|
||||||
defer p.engineMu.Unlock()
|
|
||||||
if p.ctxPtr == 0 {
|
|
||||||
return 0, grpcerrors.ModelNotLoaded("parakeet-cpp")
|
|
||||||
}
|
|
||||||
if CppStreamBeginLang != nil {
|
|
||||||
return CppStreamBeginLang(p.ctxPtr, lang), nil
|
|
||||||
}
|
|
||||||
return CppStreamBegin(p.ctxPtr), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *ParakeetCpp) streamFree(stream uintptr) {
|
|
||||||
if stream == 0 {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
p.engineMu.Lock()
|
|
||||||
defer p.engineMu.Unlock()
|
|
||||||
CppStreamFree(stream)
|
|
||||||
}
|
|
||||||
|
|
||||||
// streamFeedText runs one text-mode feed (or the finalize flush when
|
|
||||||
// finalize is true) under engineMu, returning the newly-finalized delta and
|
|
||||||
// whether an <EOU>/<EOB> fired during the call.
|
|
||||||
func (p *ParakeetCpp) streamFeedText(stream uintptr, pcm []float32, finalize bool) (delta string, eou, eob bool, err error) {
|
|
||||||
p.engineMu.Lock()
|
|
||||||
defer p.engineMu.Unlock()
|
|
||||||
if p.ctxPtr == 0 {
|
|
||||||
return "", false, false, grpcerrors.ModelNotLoaded("parakeet-cpp")
|
|
||||||
}
|
|
||||||
var ret uintptr
|
|
||||||
var events int32
|
|
||||||
if finalize {
|
|
||||||
ret = CppStreamFinalize(stream)
|
|
||||||
} else {
|
|
||||||
ret = CppStreamFeed(stream, pcm, int32(len(pcm)), unsafe.Pointer(&events))
|
|
||||||
}
|
|
||||||
if ret == 0 {
|
|
||||||
// last_error is ctx-shared: read it under the same lock as the call.
|
|
||||||
msg := CppLastError(p.ctxPtr)
|
|
||||||
if msg == "" {
|
|
||||||
msg = "unknown error"
|
|
||||||
}
|
|
||||||
return "", false, false, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
|
|
||||||
}
|
|
||||||
delta = goStringFromCPtr(ret)
|
|
||||||
CppFreeString(ret)
|
|
||||||
// ABI v5: eou_out is a bitmask (bit 0 = <EOU>, bit 1 = <EOB>). A v4
|
|
||||||
// library sets 0/1 for either token, which the bit-0 test reads as the
|
|
||||||
// old conflated eou — the EOB distinction simply isn't available there.
|
|
||||||
return delta, events&1 != 0, events&2 != 0, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// streamFeedDoc runs one ABI v4 JSON feed (or finalize) under engineMu and
|
|
||||||
// returns the parsed {text,eou,frame_sec,words} document.
|
|
||||||
func (p *ParakeetCpp) streamFeedDoc(stream uintptr, pcm []float32, finalize bool) (streamFeedJSON, error) {
|
|
||||||
p.engineMu.Lock()
|
|
||||||
defer p.engineMu.Unlock()
|
|
||||||
if p.ctxPtr == 0 {
|
|
||||||
return streamFeedJSON{}, grpcerrors.ModelNotLoaded("parakeet-cpp")
|
|
||||||
}
|
|
||||||
var ret uintptr
|
|
||||||
if finalize {
|
|
||||||
ret = CppStreamFinalizeJSON(stream)
|
|
||||||
} else {
|
|
||||||
ret = CppStreamFeedJSON(stream, pcm, int32(len(pcm)))
|
|
||||||
}
|
|
||||||
if ret == 0 {
|
|
||||||
msg := CppLastError(p.ctxPtr)
|
|
||||||
if msg == "" {
|
|
||||||
msg = "unknown error"
|
|
||||||
}
|
|
||||||
return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
|
|
||||||
}
|
|
||||||
raw := goStringFromCPtr(ret)
|
|
||||||
CppFreeString(ret)
|
|
||||||
var doc streamFeedJSON
|
|
||||||
if err := json.Unmarshal([]byte(raw), &doc); err != nil {
|
|
||||||
return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
|
|
||||||
}
|
|
||||||
return doc, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// AudioTranscriptionStream drives the cache-aware streaming RNN-T over the
|
// AudioTranscriptionStream drives the cache-aware streaming RNN-T over the
|
||||||
// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it through
|
// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it in
|
||||||
// the shared decode driver (feedSlices/flushTail), and emits each
|
// chunks to parakeet_capi_stream_feed, and emits each newly-finalized text
|
||||||
// newly-finalized text run as a TranscriptStreamResponse delta. <EOU>/<EOB>
|
// run as a TranscriptStreamResponse delta. <EOU>/<EOB> events close the
|
||||||
// events close the current segment; a closing FinalResult carries the full
|
// current segment; a closing FinalResult carries the full transcript and the
|
||||||
// transcript, the per-utterance segments, and whether the file ended on an
|
// per-utterance segments.
|
||||||
// utterance boundary.
|
|
||||||
//
|
//
|
||||||
// stream_begin returns 0 for models that are not cache-aware streaming models
|
// stream_begin returns 0 for models that are not cache-aware streaming models
|
||||||
// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those this
|
// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those we fall
|
||||||
// returns codes.Unimplemented rather than faking a stream from an offline
|
// back to a single offline transcription emitted as one delta plus a closing
|
||||||
// decode — see the stream==0 branch and grpcerrors.StreamTranscriptionUnsupported.
|
// FinalResult, matching LocalAI's non-streaming streaming contract (and the
|
||||||
|
// whisper backend), so the streaming endpoint works for every model.
|
||||||
func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.TranscriptRequest, results chan *pb.TranscriptStreamResponse) error {
|
func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.TranscriptRequest, results chan *pb.TranscriptStreamResponse) error {
|
||||||
defer close(results)
|
defer close(results)
|
||||||
|
|
||||||
@@ -695,73 +560,185 @@ func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.Tra
|
|||||||
return status.Error(codes.Canceled, "transcription cancelled")
|
return status.Error(codes.Canceled, "transcription cancelled")
|
||||||
}
|
}
|
||||||
|
|
||||||
stream, err := p.streamBegin(opts.GetLanguage())
|
var stream uintptr
|
||||||
if err != nil {
|
if CppStreamBeginLang != nil {
|
||||||
return err
|
stream = CppStreamBeginLang(p.ctxPtr, opts.GetLanguage())
|
||||||
|
} else {
|
||||||
|
stream = CppStreamBegin(p.ctxPtr)
|
||||||
}
|
}
|
||||||
if stream == 0 {
|
if stream == 0 {
|
||||||
// Not a cache-aware streaming model. Report the missing capability
|
// Not a cache-aware streaming model: run a normal offline
|
||||||
// honestly instead of decoding offline and emitting it as one "delta"
|
// transcription and emit it as one delta + a closing final result.
|
||||||
// + final: a client that asked for streaming must learn the model
|
res, err := p.AudioTranscription(ctx, opts)
|
||||||
// cannot stream, not receive a batch result dressed as a stream (which
|
if err != nil {
|
||||||
// is indistinguishable except qualitatively, and silently breaks any
|
return err
|
||||||
// feature that genuinely needs incremental output). Callers wanting a
|
}
|
||||||
// plain transcript use the unary AudioTranscription path. This mirrors
|
if t := strings.TrimSpace(res.Text); t != "" {
|
||||||
// AudioTranscriptionLive, which already returns Unimplemented here.
|
results <- &pb.TranscriptStreamResponse{Delta: t}
|
||||||
return grpcerrors.StreamTranscriptionUnsupported("parakeet-cpp",
|
}
|
||||||
"loaded model is not a cache-aware streaming model")
|
results <- &pb.TranscriptStreamResponse{FinalResult: &res}
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
defer p.streamFree(stream)
|
defer CppStreamFree(stream)
|
||||||
|
// The C engine is a single shared context: a streaming session and a batched
|
||||||
|
// unary dispatch must never touch it at once, so hold engineMu for the whole
|
||||||
|
// stream. This lock is intentionally taken AFTER the non-streaming fallback
|
||||||
|
// above returns: that fallback goes through AudioTranscription -> the batcher
|
||||||
|
// -> runBatch, which itself acquires engineMu, so locking here first would
|
||||||
|
// deadlock. Do not hoist this lock above the fallback.
|
||||||
|
p.engineMu.Lock()
|
||||||
|
defer p.engineMu.Unlock()
|
||||||
|
|
||||||
data, duration, err := decodeWavMono16k(opts.Dst)
|
data, duration, err := decodeWavMono16k(opts.Dst)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fold the shared decode driver's per-feed increments into the streamed
|
// ABI v4: when the streaming JSON entry points are present, drive them so the
|
||||||
// deltas and the closing batch result: words/text accumulate into
|
// per-utterance segments carry per-word start/end timestamps. Falls through to
|
||||||
// per-utterance segments (streamSegmenter), and the utterance-boundary
|
// the text-only loop below against an older libparakeet.so. Runs under the
|
||||||
// latch (boundary.go) records whether the file ended on an <EOU>. These
|
// engineMu already held above.
|
||||||
// are the offline path's concern — the live RPC carries none of them.
|
if CppStreamFeedJSON != nil {
|
||||||
|
return p.streamJSON(ctx, stream, data, duration, results)
|
||||||
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
full strings.Builder
|
full strings.Builder
|
||||||
seg streamSegmenter
|
segText strings.Builder
|
||||||
boundary utteranceBoundary
|
segments []*pb.TranscriptSegment
|
||||||
|
segID int32
|
||||||
)
|
)
|
||||||
emit := func(r streamFeedResult) error {
|
|
||||||
if r.Delta != "" {
|
flushSegment := func() {
|
||||||
full.WriteString(r.Delta)
|
t := strings.TrimSpace(segText.String())
|
||||||
results <- &pb.TranscriptStreamResponse{Delta: r.Delta}
|
segText.Reset()
|
||||||
|
if t == "" {
|
||||||
|
return
|
||||||
}
|
}
|
||||||
seg.add(r)
|
segments = append(segments, &pb.TranscriptSegment{Id: segID, Text: t})
|
||||||
boundary = boundary.observe(r)
|
segID++
|
||||||
|
}
|
||||||
|
|
||||||
|
// emitDelta consumes the malloc'd char* returned by feed/finalize: frees
|
||||||
|
// it, accumulates the text, and sends a delta when non-empty. A 0 return
|
||||||
|
// is an error (vs the "" empty-but-non-NULL no-new-text case).
|
||||||
|
emitDelta := func(ret uintptr) error {
|
||||||
|
if ret == 0 {
|
||||||
|
msg := CppLastError(p.ctxPtr)
|
||||||
|
if msg == "" {
|
||||||
|
msg = "unknown error"
|
||||||
|
}
|
||||||
|
return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
|
||||||
|
}
|
||||||
|
delta := goStringFromCPtr(ret)
|
||||||
|
CppFreeString(ret)
|
||||||
|
if delta == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
full.WriteString(delta)
|
||||||
|
segText.WriteString(delta)
|
||||||
|
results <- &pb.TranscriptStreamResponse{Delta: delta}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := p.feedSlices(ctx, stream, data, emit); err != nil {
|
for off := 0; off < len(data); off += streamChunkSamples {
|
||||||
return err
|
if err := ctx.Err(); err != nil {
|
||||||
}
|
return status.Error(codes.Canceled, "transcription cancelled")
|
||||||
if err := p.flushTail(stream, emit); err != nil {
|
}
|
||||||
return err
|
end := min(off+streamChunkSamples, len(data))
|
||||||
}
|
chunk := data[off:end]
|
||||||
seg.flush() // close a trailing utterance that never saw an <EOU>
|
|
||||||
|
|
||||||
// final.Text is the exact concatenation of the streamed deltas (full is
|
var eou int32
|
||||||
// their accumulation), so concat(deltas) == FinalResult.Text holds even
|
ret := CppStreamFeed(stream, chunk, int32(len(chunk)), unsafe.Pointer(&eou))
|
||||||
// when the model prepends a leading space to the first word (SentencePiece
|
if err := emitDelta(ret); err != nil {
|
||||||
// detokenization). This matches the whisper backend's streaming contract.
|
return err
|
||||||
// The single-segment fallback stays trimmed.
|
}
|
||||||
fullText := full.String()
|
if eou != 0 {
|
||||||
segments := seg.segments()
|
flushSegment()
|
||||||
if trimmed := strings.TrimSpace(fullText); len(segments) == 0 && trimmed != "" {
|
}
|
||||||
segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: trimmed})
|
}
|
||||||
|
|
||||||
|
// Flush the streaming tail (final encoder chunk).
|
||||||
|
if err := emitDelta(CppStreamFinalize(stream)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
flushSegment()
|
||||||
|
|
||||||
|
text := strings.TrimSpace(full.String())
|
||||||
|
if len(segments) == 0 && text != "" {
|
||||||
|
segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
|
||||||
}
|
}
|
||||||
results <- &pb.TranscriptStreamResponse{
|
results <- &pb.TranscriptStreamResponse{
|
||||||
FinalResult: &pb.TranscriptResult{
|
FinalResult: &pb.TranscriptResult{
|
||||||
Text: fullText,
|
Text: text,
|
||||||
|
Segments: segments,
|
||||||
|
Duration: duration,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// streamJSON drives the streaming JSON entry points (present since ABI v4): each
|
||||||
|
// feed/finalize returns a {text,eou,eob,frame_sec,words} document. The
|
||||||
|
// newly-finalized text is emitted as a delta (unchanged streaming contract)
|
||||||
|
// while words are accumulated into per-utterance segments (closed on <EOU> or
|
||||||
|
// <EOB>) so the closing FinalResult carries timestamped segments. Runs under
|
||||||
|
// engineMu (already held by the caller).
|
||||||
|
func (p *ParakeetCpp) streamJSON(ctx context.Context, stream uintptr, data []float32,
|
||||||
|
duration float32, results chan *pb.TranscriptStreamResponse) error {
|
||||||
|
var (
|
||||||
|
full strings.Builder
|
||||||
|
seg streamSegmenter
|
||||||
|
)
|
||||||
|
// consume frees the malloc'd char* (a 0 return is an error), parses the JSON,
|
||||||
|
// emits the delta, and routes words through the segmenter.
|
||||||
|
consume := func(ret uintptr) error {
|
||||||
|
if ret == 0 {
|
||||||
|
msg := CppLastError(p.ctxPtr)
|
||||||
|
if msg == "" {
|
||||||
|
msg = "unknown error"
|
||||||
|
}
|
||||||
|
return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
|
||||||
|
}
|
||||||
|
raw := goStringFromCPtr(ret)
|
||||||
|
CppFreeString(ret)
|
||||||
|
var doc streamFeedJSON
|
||||||
|
if err := json.Unmarshal([]byte(raw), &doc); err != nil {
|
||||||
|
return fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
|
||||||
|
}
|
||||||
|
if doc.Text != "" {
|
||||||
|
full.WriteString(doc.Text)
|
||||||
|
results <- &pb.TranscriptStreamResponse{Delta: doc.Text}
|
||||||
|
}
|
||||||
|
seg.add(doc)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
for off := 0; off < len(data); off += streamChunkSamples {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return status.Error(codes.Canceled, "transcription cancelled")
|
||||||
|
}
|
||||||
|
end := min(off+streamChunkSamples, len(data))
|
||||||
|
chunk := data[off:end]
|
||||||
|
if err := consume(CppStreamFeedJSON(stream, chunk, int32(len(chunk)))); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := consume(CppStreamFinalizeJSON(stream)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
seg.flush() // close any trailing utterance that never saw an EOU
|
||||||
|
|
||||||
|
text := strings.TrimSpace(full.String())
|
||||||
|
segments := seg.segments()
|
||||||
|
if len(segments) == 0 && text != "" {
|
||||||
|
segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
|
||||||
|
}
|
||||||
|
results <- &pb.TranscriptStreamResponse{
|
||||||
|
FinalResult: &pb.TranscriptResult{
|
||||||
|
Text: text,
|
||||||
Segments: segments,
|
Segments: segments,
|
||||||
Duration: duration,
|
Duration: duration,
|
||||||
Eou: boundary.ended(),
|
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
@@ -826,10 +803,6 @@ func (p *ParakeetCpp) Free() error {
|
|||||||
close(p.batStop)
|
close(p.batStop)
|
||||||
p.batStop = nil
|
p.batStop = nil
|
||||||
}
|
}
|
||||||
// engineMu so an in-flight streaming call (which locks per C call and
|
|
||||||
// re-checks ctxPtr under the lock) can never feed into a freed ctx.
|
|
||||||
p.engineMu.Lock()
|
|
||||||
defer p.engineMu.Unlock()
|
|
||||||
if p.ctxPtr != 0 {
|
if p.ctxPtr != 0 {
|
||||||
CppFree(p.ctxPtr)
|
CppFree(p.ctxPtr)
|
||||||
p.ctxPtr = 0
|
p.ctxPtr = 0
|
||||||
|
|||||||
@@ -14,8 +14,6 @@ import (
|
|||||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||||
. "github.com/onsi/ginkgo/v2"
|
. "github.com/onsi/ginkgo/v2"
|
||||||
. "github.com/onsi/gomega"
|
. "github.com/onsi/gomega"
|
||||||
"google.golang.org/grpc/codes"
|
|
||||||
"google.golang.org/grpc/status"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestParakeetCpp(t *testing.T) {
|
func TestParakeetCpp(t *testing.T) {
|
||||||
@@ -203,29 +201,6 @@ var _ = Describe("ParakeetCpp", func() {
|
|||||||
})
|
})
|
||||||
|
|
||||||
Context("AudioTranscriptionStream", func() {
|
Context("AudioTranscriptionStream", func() {
|
||||||
It("returns the typed Unimplemented signal for non-streaming models (no offline fallback)", func() {
|
|
||||||
// stream_begin == 0 means the loaded model is not a cache-aware
|
|
||||||
// streaming model. The backend must surface that, not silently
|
|
||||||
// decode offline and fake a one-shot "stream".
|
|
||||||
savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang
|
|
||||||
defer func() { CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang }()
|
|
||||||
CppStreamBeginLang = nil
|
|
||||||
CppStreamBegin = func(ctx uintptr) uintptr { return 0 }
|
|
||||||
|
|
||||||
p := &ParakeetCpp{ctxPtr: 1}
|
|
||||||
results := make(chan *pb.TranscriptStreamResponse, 8)
|
|
||||||
err := p.AudioTranscriptionStream(context.Background(),
|
|
||||||
&pb.TranscriptRequest{Dst: "ignored.wav"}, results)
|
|
||||||
Expect(status.Code(err)).To(Equal(codes.Unimplemented))
|
|
||||||
|
|
||||||
// Honest signal: nothing was emitted — no faked batch result.
|
|
||||||
var emitted []*pb.TranscriptStreamResponse
|
|
||||||
for r := range results {
|
|
||||||
emitted = append(emitted, r)
|
|
||||||
}
|
|
||||||
Expect(emitted).To(BeEmpty())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("streams deltas and a closing FinalResult from a cache-aware model", func() {
|
It("streams deltas and a closing FinalResult from a cache-aware model", func() {
|
||||||
// Streaming needs a cache-aware streaming model (e.g.
|
// Streaming needs a cache-aware streaming model (e.g.
|
||||||
// realtime_eou); the offline test model would fail stream_begin.
|
// realtime_eou); the offline test model would fail stream_begin.
|
||||||
|
|||||||
@@ -1,186 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
|
|
||||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
|
||||||
"github.com/mudler/xlog"
|
|
||||||
"google.golang.org/grpc/codes"
|
|
||||||
"google.golang.org/grpc/status"
|
|
||||||
)
|
|
||||||
|
|
||||||
// liveSampleRate is the only PCM rate the parakeet C streaming API accepts.
|
|
||||||
const liveSampleRate = 16000
|
|
||||||
|
|
||||||
// AudioTranscriptionLive drives one cache-aware streaming session over audio
|
|
||||||
// fed incrementally by the caller (the realtime API's semantic_vad turn
|
|
||||||
// detection). Contract:
|
|
||||||
//
|
|
||||||
// - the first request must carry a Config; a Config mid-stream resets the
|
|
||||||
// decode session (free + begin) and drops accumulated transcript state;
|
|
||||||
// - a Ready ack is sent right after a successful stream_begin so callers
|
|
||||||
// can degrade synchronously when the model has no streaming support
|
|
||||||
// (LiveTranscriptionUnsupported, codes.Unimplemented);
|
|
||||||
// - every feed that produced output is forwarded as {delta, eou, words};
|
|
||||||
// the <EOU>/<EOB> flag is the model's own utterance boundary and the
|
|
||||||
// decoder auto-resets after it, so one session spans many utterances;
|
|
||||||
// - closing the send side finalizes: the held-back tail chunk is flushed
|
|
||||||
// (the last ~2 encoder frames of words only appear here) and a terminal
|
|
||||||
// FinalResult carries the full transcript Text only. Per-utterance
|
|
||||||
// segments, duration, and the terminal <EOU> flag are NOT produced here —
|
|
||||||
// the realtime core consumes the streamed per-feed tokens and the final
|
|
||||||
// Text; those batch fields are the file path's concern (see
|
|
||||||
// AudioTranscriptionStream).
|
|
||||||
//
|
|
||||||
// Engine access is serialized per C call (streamBegin/streamFeed*/streamFree
|
|
||||||
// take engineMu internally), never for the session lifetime — unary
|
|
||||||
// transcription keeps flowing between feeds.
|
|
||||||
func (p *ParakeetCpp) AudioTranscriptionLive(in <-chan *pb.TranscriptLiveRequest, out chan<- *pb.TranscriptLiveResponse) error {
|
|
||||||
defer close(out)
|
|
||||||
|
|
||||||
if p.ctxPtr == 0 {
|
|
||||||
return grpcerrors.ModelNotLoaded("parakeet-cpp")
|
|
||||||
}
|
|
||||||
|
|
||||||
first, ok := <-in
|
|
||||||
if !ok {
|
|
||||||
return nil // caller closed without sending anything
|
|
||||||
}
|
|
||||||
cfg := first.GetConfig()
|
|
||||||
if cfg == nil {
|
|
||||||
return status.Error(codes.InvalidArgument, "parakeet-cpp: first live message must carry a config")
|
|
||||||
}
|
|
||||||
if err := validateLiveConfig(cfg); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
stream, err := p.streamBegin(cfg.GetLanguage())
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if stream == 0 {
|
|
||||||
return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp",
|
|
||||||
"loaded model is not a cache-aware streaming model")
|
|
||||||
}
|
|
||||||
// stream is reassigned on a mid-stream Config reset; free whatever is
|
|
||||||
// current when the RPC unwinds.
|
|
||||||
defer func() { p.streamFree(stream) }()
|
|
||||||
|
|
||||||
out <- &pb.TranscriptLiveResponse{Ready: true}
|
|
||||||
|
|
||||||
var (
|
|
||||||
full strings.Builder
|
|
||||||
fedSecs float64
|
|
||||||
|
|
||||||
// behindSec accumulates how far decode wall time has fallen behind
|
|
||||||
// the audio it was fed. A live caller feeds in real time, so a
|
|
||||||
// persistent positive backlog means every downstream signal —
|
|
||||||
// including the <EOU> the turn detector waits on — arrives that many
|
|
||||||
// seconds late. Warned once per session; reset by a Config reset.
|
|
||||||
behindSec float64
|
|
||||||
behindWarned bool
|
|
||||||
)
|
|
||||||
|
|
||||||
// emit forwards one decode increment: it streams the per-feed tokens the
|
|
||||||
// realtime turn detector consumes (delta/eou/eob/words) and accumulates the
|
|
||||||
// running transcript for the closing FinalResult. No segmentation or
|
|
||||||
// boundary latch here — the live consumer reads only the streamed tokens
|
|
||||||
// and the final Text; per-utterance segments and the terminal <EOU> flag
|
|
||||||
// are an offline-path concern (see AudioTranscriptionStream / boundary.go).
|
|
||||||
emit := func(r streamFeedResult) error {
|
|
||||||
if r.Delta != "" {
|
|
||||||
full.WriteString(r.Delta)
|
|
||||||
}
|
|
||||||
if r.Delta != "" || r.Eou || r.Eob || len(r.Words) > 0 {
|
|
||||||
out <- &pb.TranscriptLiveResponse{
|
|
||||||
Delta: r.Delta,
|
|
||||||
Eou: r.Eou,
|
|
||||||
Eob: r.Eob,
|
|
||||||
Words: liveWordsToProto(r.Words),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
for req := range in {
|
|
||||||
switch payload := req.GetPayload().(type) {
|
|
||||||
case *pb.TranscriptLiveRequest_Config:
|
|
||||||
if err := validateLiveConfig(payload.Config); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
// Reset: a fresh decode session, dropping accumulated state.
|
|
||||||
p.streamFree(stream)
|
|
||||||
stream, err = p.streamBegin(payload.Config.GetLanguage())
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if stream == 0 {
|
|
||||||
return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp",
|
|
||||||
"loaded model is not a cache-aware streaming model")
|
|
||||||
}
|
|
||||||
full.Reset()
|
|
||||||
fedSecs = 0
|
|
||||||
case *pb.TranscriptLiveRequest_Audio:
|
|
||||||
pcm := payload.Audio.GetPcm()
|
|
||||||
audioSec := float64(len(pcm)) / liveSampleRate
|
|
||||||
fedSecs += audioSec
|
|
||||||
start := time.Now()
|
|
||||||
// nil ctx: a live session is bounded by this request channel, not a
|
|
||||||
// context — cancellation is the caller closing the stream.
|
|
||||||
if err := p.feedSlices(nil, stream, pcm, emit); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
wallSec := time.Since(start).Seconds()
|
|
||||||
behindSec += wallSec - audioSec
|
|
||||||
if behindSec < 0 {
|
|
||||||
behindSec = 0
|
|
||||||
}
|
|
||||||
xlog.Debug("parakeet-cpp: live feed",
|
|
||||||
"audio_ms", int(audioSec*1000), "wall_ms", int(wallSec*1000),
|
|
||||||
"behind_ms", int(behindSec*1000), "fed_s", fedSecs)
|
|
||||||
if behindSec > 1 && !behindWarned {
|
|
||||||
behindWarned = true
|
|
||||||
xlog.Warn("parakeet-cpp: live decode is falling behind real time; "+
|
|
||||||
"end-of-utterance signals will arrive late",
|
|
||||||
"behind_s", behindSec, "fed_s", fedSecs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Send side closed: flush the streaming tail and emit the final transcript.
|
|
||||||
// The live FinalResult carries only Text — the authoritative full-turn
|
|
||||||
// transcript the realtime core commits. Per-utterance segments, duration,
|
|
||||||
// and the terminal <EOU> flag are not produced on the live path.
|
|
||||||
if err := p.flushTail(stream, emit); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
out <- &pb.TranscriptLiveResponse{
|
|
||||||
FinalResult: &pb.TranscriptResult{Text: strings.TrimSpace(full.String())},
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func validateLiveConfig(cfg *pb.TranscriptLiveConfig) error {
|
|
||||||
if sr := cfg.GetSampleRate(); sr != 0 && sr != liveSampleRate {
|
|
||||||
return status.Errorf(codes.InvalidArgument,
|
|
||||||
"parakeet-cpp: unsupported live sample_rate %d (only %d)", sr, liveSampleRate)
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func liveWordsToProto(words []transcriptWord) []*pb.TranscriptWord {
|
|
||||||
if len(words) == 0 {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
out := make([]*pb.TranscriptWord, len(words))
|
|
||||||
for i, w := range words {
|
|
||||||
out[i] = &pb.TranscriptWord{
|
|
||||||
Start: secondsToNanos(w.Start),
|
|
||||||
End: secondsToNanos(w.End),
|
|
||||||
Text: w.W,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
@@ -1,417 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"sync"
|
|
||||||
"time"
|
|
||||||
"unsafe"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
|
|
||||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
"google.golang.org/grpc/codes"
|
|
||||||
"google.golang.org/grpc/status"
|
|
||||||
)
|
|
||||||
|
|
||||||
// The live-RPC specs drive AudioTranscriptionLive entirely against stubbed
|
|
||||||
// Cpp* package vars (the same seam batcher_test.go uses), so they run
|
|
||||||
// without libparakeet.so.
|
|
||||||
|
|
||||||
// liveCstrPool hands out NUL-terminated C-style strings backed by Go memory
|
|
||||||
// and keeps them alive for the duration of a spec (goStringFromCPtr reads
|
|
||||||
// through the raw pointer; Go's GC must not collect the backing array while
|
|
||||||
// a stub's return value is in flight).
|
|
||||||
type liveCstrPool struct {
|
|
||||||
mu sync.Mutex
|
|
||||||
bufs [][]byte
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *liveCstrPool) cstr(s string) uintptr {
|
|
||||||
p.mu.Lock()
|
|
||||||
defer p.mu.Unlock()
|
|
||||||
b := append([]byte(s), 0)
|
|
||||||
p.bufs = append(p.bufs, b)
|
|
||||||
return uintptr(unsafe.Pointer(&b[0]))
|
|
||||||
}
|
|
||||||
|
|
||||||
// liveStubs swaps every C entry point the live path touches and returns a
|
|
||||||
// restore func for AfterEach.
|
|
||||||
func liveStubs() (restore func()) {
|
|
||||||
savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang
|
|
||||||
savedFeed, savedFeedJSON := CppStreamFeed, CppStreamFeedJSON
|
|
||||||
savedFinalize, savedFinalizeJSON := CppStreamFinalize, CppStreamFinalizeJSON
|
|
||||||
savedFree, savedLastError := CppStreamFree, CppLastError
|
|
||||||
savedFreeString := CppFreeString
|
|
||||||
return func() {
|
|
||||||
CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang
|
|
||||||
CppStreamFeed, CppStreamFeedJSON = savedFeed, savedFeedJSON
|
|
||||||
CppStreamFinalize, CppStreamFinalizeJSON = savedFinalize, savedFinalizeJSON
|
|
||||||
CppStreamFree, CppLastError = savedFree, savedLastError
|
|
||||||
CppFreeString = savedFreeString
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// runLive starts the RPC on its own goroutine and returns the request
|
|
||||||
// channel plus a collector for everything the backend emitted.
|
|
||||||
func runLive(p *ParakeetCpp) (chan *pb.TranscriptLiveRequest, chan *pb.TranscriptLiveResponse, chan error) {
|
|
||||||
in := make(chan *pb.TranscriptLiveRequest)
|
|
||||||
out := make(chan *pb.TranscriptLiveResponse, 32)
|
|
||||||
errCh := make(chan error, 1)
|
|
||||||
go func() { errCh <- p.AudioTranscriptionLive(in, out) }()
|
|
||||||
return in, out, errCh
|
|
||||||
}
|
|
||||||
|
|
||||||
func liveConfig(lang string) *pb.TranscriptLiveRequest {
|
|
||||||
return &pb.TranscriptLiveRequest{
|
|
||||||
Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{Language: lang}},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func liveAudio(pcm []float32) *pb.TranscriptLiveRequest {
|
|
||||||
return &pb.TranscriptLiveRequest{
|
|
||||||
Payload: &pb.TranscriptLiveRequest_Audio{Audio: &pb.TranscriptLiveAudio{Pcm: pcm}},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func collectLive(out chan *pb.TranscriptLiveResponse) []*pb.TranscriptLiveResponse {
|
|
||||||
var got []*pb.TranscriptLiveResponse
|
|
||||||
for r := range out {
|
|
||||||
got = append(got, r)
|
|
||||||
}
|
|
||||||
return got
|
|
||||||
}
|
|
||||||
|
|
||||||
var _ = Describe("AudioTranscriptionLive (stubbed C API)", func() {
|
|
||||||
var (
|
|
||||||
pool *liveCstrPool
|
|
||||||
restore func()
|
|
||||||
p *ParakeetCpp
|
|
||||||
)
|
|
||||||
|
|
||||||
BeforeEach(func() {
|
|
||||||
pool = &liveCstrPool{}
|
|
||||||
restore = liveStubs()
|
|
||||||
p = &ParakeetCpp{ctxPtr: 1}
|
|
||||||
|
|
||||||
CppStreamBeginLang = nil
|
|
||||||
CppStreamBegin = func(ctx uintptr) uintptr { return 7 }
|
|
||||||
CppStreamFree = func(s uintptr) {}
|
|
||||||
CppFreeString = func(s uintptr) {}
|
|
||||||
CppLastError = func(ctx uintptr) string { return "stub error" }
|
|
||||||
CppStreamFeed = nil
|
|
||||||
CppStreamFeedJSON = nil
|
|
||||||
CppStreamFinalize = nil
|
|
||||||
CppStreamFinalizeJSON = nil
|
|
||||||
})
|
|
||||||
|
|
||||||
AfterEach(func() { restore() })
|
|
||||||
|
|
||||||
It("rejects a stream whose first message is not a config", func() {
|
|
||||||
in, out, errCh := runLive(p)
|
|
||||||
in <- liveAudio([]float32{0.1})
|
|
||||||
close(in)
|
|
||||||
|
|
||||||
err := <-errCh
|
|
||||||
Expect(status.Code(err)).To(Equal(codes.InvalidArgument))
|
|
||||||
Expect(collectLive(out)).To(BeEmpty())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("rejects a non-16k sample rate", func() {
|
|
||||||
in, _, errCh := runLive(p)
|
|
||||||
in <- &pb.TranscriptLiveRequest{
|
|
||||||
Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{SampleRate: 8000}},
|
|
||||||
}
|
|
||||||
close(in)
|
|
||||||
Expect(status.Code(<-errCh)).To(Equal(codes.InvalidArgument))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("returns the typed Unimplemented signal for non-streaming models, before any ack", func() {
|
|
||||||
CppStreamBegin = func(ctx uintptr) uintptr { return 0 }
|
|
||||||
|
|
||||||
in, out, errCh := runLive(p)
|
|
||||||
in <- liveConfig("")
|
|
||||||
close(in)
|
|
||||||
|
|
||||||
err := <-errCh
|
|
||||||
Expect(grpcerrors.IsLiveTranscriptionUnsupported(err)).To(BeTrue())
|
|
||||||
Expect(collectLive(out)).To(BeEmpty())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("streams deltas, eou flags and words on the JSON path and finalizes on close", func() {
|
|
||||||
var freed []uintptr
|
|
||||||
CppStreamFree = func(s uintptr) { freed = append(freed, s) }
|
|
||||||
feeds := 0
|
|
||||||
CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
|
|
||||||
feeds++
|
|
||||||
switch feeds {
|
|
||||||
case 1:
|
|
||||||
return pool.cstr(`{"text":"hello ","eou":0,"frame_sec":0.08,` +
|
|
||||||
`"words":[{"w":"hello","start":0.1,"end":0.4,"conf":0.9}]}`)
|
|
||||||
default:
|
|
||||||
return pool.cstr(`{"text":"world","eou":1,"frame_sec":0.08,` +
|
|
||||||
`"words":[{"w":"world","start":0.5,"end":0.8,"conf":0.9}]}`)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
CppStreamFinalizeJSON = func(s uintptr) uintptr {
|
|
||||||
return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
|
|
||||||
}
|
|
||||||
|
|
||||||
in, out, errCh := runLive(p)
|
|
||||||
in <- liveConfig("en")
|
|
||||||
in <- liveAudio(make([]float32, 100))
|
|
||||||
in <- liveAudio(make([]float32, 200))
|
|
||||||
close(in)
|
|
||||||
Expect(<-errCh).NotTo(HaveOccurred())
|
|
||||||
|
|
||||||
got := collectLive(out)
|
|
||||||
Expect(got).To(HaveLen(4)) // ready, two deltas, final
|
|
||||||
|
|
||||||
Expect(got[0].Ready).To(BeTrue())
|
|
||||||
|
|
||||||
Expect(got[1].Delta).To(Equal("hello "))
|
|
||||||
Expect(got[1].Eou).To(BeFalse())
|
|
||||||
Expect(got[1].Words).To(HaveLen(1))
|
|
||||||
Expect(got[1].Words[0].Text).To(Equal("hello"))
|
|
||||||
|
|
||||||
Expect(got[2].Delta).To(Equal("world"))
|
|
||||||
Expect(got[2].Eou).To(BeTrue())
|
|
||||||
|
|
||||||
final := got[3].FinalResult
|
|
||||||
Expect(final).NotTo(BeNil())
|
|
||||||
Expect(final.Text).To(Equal("hello world"))
|
|
||||||
// The live FinalResult carries only Text. Per-utterance segments,
|
|
||||||
// duration and the terminal eou flag are an offline-path concern (see
|
|
||||||
// boundary.go / AudioTranscriptionStream); the realtime core reads the
|
|
||||||
// streamed per-feed tokens above plus this Text.
|
|
||||||
Expect(final.Eou).To(BeFalse())
|
|
||||||
Expect(final.Segments).To(BeEmpty())
|
|
||||||
Expect(final.Duration).To(BeZero())
|
|
||||||
|
|
||||||
Expect(freed).To(Equal([]uintptr{7}))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("falls back to the text feed (eou out-param) when the JSON entry points are absent", func() {
|
|
||||||
feeds := 0
|
|
||||||
CppStreamFeed = func(s uintptr, pcm []float32, n int32, eouOut unsafe.Pointer) uintptr {
|
|
||||||
feeds++
|
|
||||||
if feeds == 2 {
|
|
||||||
*(*int32)(eouOut) = 1
|
|
||||||
return pool.cstr("done")
|
|
||||||
}
|
|
||||||
return pool.cstr("first ")
|
|
||||||
}
|
|
||||||
CppStreamFinalize = func(s uintptr) uintptr { return pool.cstr("") }
|
|
||||||
|
|
||||||
in, out, errCh := runLive(p)
|
|
||||||
in <- liveConfig("")
|
|
||||||
in <- liveAudio(make([]float32, 10))
|
|
||||||
in <- liveAudio(make([]float32, 10))
|
|
||||||
close(in)
|
|
||||||
Expect(<-errCh).NotTo(HaveOccurred())
|
|
||||||
|
|
||||||
got := collectLive(out)
|
|
||||||
Expect(got).To(HaveLen(4))
|
|
||||||
Expect(got[1].Delta).To(Equal("first "))
|
|
||||||
Expect(got[1].Eou).To(BeFalse())
|
|
||||||
Expect(got[2].Delta).To(Equal("done"))
|
|
||||||
Expect(got[2].Eou).To(BeTrue())
|
|
||||||
Expect(got[3].FinalResult.Text).To(Equal("first done"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("forwards <EOB> as eob — a backchannel, never an eou (ABI v5 JSON)", func() {
|
|
||||||
feeds := 0
|
|
||||||
CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
|
|
||||||
feeds++
|
|
||||||
if feeds == 1 {
|
|
||||||
return pool.cstr(`{"text":"uh-huh","eou":0,"eob":1,"frame_sec":0.08,` +
|
|
||||||
`"words":[{"w":"uh-huh","start":0.1,"end":0.3,"conf":0.9}]}`)
|
|
||||||
}
|
|
||||||
return pool.cstr(`{"text":"the turn","eou":1,"eob":0,"frame_sec":0.08,` +
|
|
||||||
`"words":[{"w":"the","start":0.5,"end":0.6,"conf":0.9},{"w":"turn","start":0.6,"end":0.8,"conf":0.9}]}`)
|
|
||||||
}
|
|
||||||
CppStreamFinalizeJSON = func(s uintptr) uintptr {
|
|
||||||
return pool.cstr(`{"text":"","eou":0,"eob":0,"frame_sec":0.08,"words":[]}`)
|
|
||||||
}
|
|
||||||
|
|
||||||
in, out, errCh := runLive(p)
|
|
||||||
in <- liveConfig("")
|
|
||||||
in <- liveAudio(make([]float32, 10))
|
|
||||||
in <- liveAudio(make([]float32, 10))
|
|
||||||
close(in)
|
|
||||||
Expect(<-errCh).NotTo(HaveOccurred())
|
|
||||||
|
|
||||||
got := collectLive(out)
|
|
||||||
Expect(got).To(HaveLen(4))
|
|
||||||
Expect(got[1].Eob).To(BeTrue())
|
|
||||||
Expect(got[1].Eou).To(BeFalse(), "a backchannel must not masquerade as a turn boundary")
|
|
||||||
Expect(got[2].Eou).To(BeTrue())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("maps the v5 eou_out bitmask on the text path (bit0 <EOU>, bit1 <EOB>)", func() {
|
|
||||||
feeds := 0
|
|
||||||
CppStreamFeed = func(s uintptr, pcm []float32, n int32, eouOut unsafe.Pointer) uintptr {
|
|
||||||
feeds++
|
|
||||||
if feeds == 1 {
|
|
||||||
*(*int32)(eouOut) = 2 // <EOB> only
|
|
||||||
return pool.cstr("uh-huh")
|
|
||||||
}
|
|
||||||
*(*int32)(eouOut) = 1 // <EOU>
|
|
||||||
return pool.cstr(" done")
|
|
||||||
}
|
|
||||||
CppStreamFinalize = func(s uintptr) uintptr { return pool.cstr("") }
|
|
||||||
|
|
||||||
in, out, errCh := runLive(p)
|
|
||||||
in <- liveConfig("")
|
|
||||||
in <- liveAudio(make([]float32, 10))
|
|
||||||
in <- liveAudio(make([]float32, 10))
|
|
||||||
close(in)
|
|
||||||
Expect(<-errCh).NotTo(HaveOccurred())
|
|
||||||
|
|
||||||
got := collectLive(out)
|
|
||||||
Expect(got).To(HaveLen(4))
|
|
||||||
Expect(got[1].Eob).To(BeTrue())
|
|
||||||
Expect(got[1].Eou).To(BeFalse())
|
|
||||||
Expect(got[2].Eou).To(BeTrue())
|
|
||||||
Expect(got[2].Eob).To(BeFalse())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("accumulates trailing text after an EOU into the final transcript", func() {
|
|
||||||
feeds := 0
|
|
||||||
CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
|
|
||||||
feeds++
|
|
||||||
if feeds == 1 {
|
|
||||||
return pool.cstr(`{"text":"turn one","eou":1,"frame_sec":0.08,"words":[]}`)
|
|
||||||
}
|
|
||||||
return pool.cstr(`{"text":" and more","eou":0,"frame_sec":0.08,"words":[]}`)
|
|
||||||
}
|
|
||||||
CppStreamFinalizeJSON = func(s uintptr) uintptr {
|
|
||||||
return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
|
|
||||||
}
|
|
||||||
|
|
||||||
in, out, errCh := runLive(p)
|
|
||||||
in <- liveConfig("")
|
|
||||||
in <- liveAudio(make([]float32, 10))
|
|
||||||
in <- liveAudio(make([]float32, 10))
|
|
||||||
close(in)
|
|
||||||
Expect(<-errCh).NotTo(HaveOccurred())
|
|
||||||
|
|
||||||
got := collectLive(out)
|
|
||||||
final := got[len(got)-1].FinalResult
|
|
||||||
Expect(final.Text).To(Equal("turn one and more"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("resets the decode session on a mid-stream config", func() {
|
|
||||||
var begun, freed int
|
|
||||||
CppStreamBegin = func(ctx uintptr) uintptr { begun++; return uintptr(10 + begun) }
|
|
||||||
CppStreamFree = func(s uintptr) { freed++ }
|
|
||||||
CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
|
|
||||||
return pool.cstr(`{"text":"x","eou":0,"frame_sec":0.08,"words":[]}`)
|
|
||||||
}
|
|
||||||
CppStreamFinalizeJSON = func(s uintptr) uintptr {
|
|
||||||
return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
|
|
||||||
}
|
|
||||||
|
|
||||||
in, out, errCh := runLive(p)
|
|
||||||
in <- liveConfig("")
|
|
||||||
in <- liveAudio(make([]float32, 10))
|
|
||||||
in <- liveConfig("") // reset
|
|
||||||
in <- liveAudio(make([]float32, 10))
|
|
||||||
close(in)
|
|
||||||
Expect(<-errCh).NotTo(HaveOccurred())
|
|
||||||
|
|
||||||
got := collectLive(out)
|
|
||||||
final := got[len(got)-1].FinalResult
|
|
||||||
Expect(final.Text).To(Equal("x"), "pre-reset transcript dropped")
|
|
||||||
Expect(begun).To(Equal(2))
|
|
||||||
Expect(freed).To(Equal(2), "old session freed on reset, new one on unwind")
|
|
||||||
})
|
|
||||||
|
|
||||||
It("does not hold engineMu between feeds (unary work interleaves with a live session)", func() {
|
|
||||||
CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
|
|
||||||
return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
|
|
||||||
}
|
|
||||||
CppStreamFinalizeJSON = func(s uintptr) uintptr {
|
|
||||||
return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
|
|
||||||
}
|
|
||||||
|
|
||||||
in, out, errCh := runLive(p)
|
|
||||||
in <- liveConfig("")
|
|
||||||
in <- liveAudio(make([]float32, 10))
|
|
||||||
|
|
||||||
// The session is open and idle between feeds: the engine lock must be
|
|
||||||
// acquirable, which is what lets batched unary transcription proceed
|
|
||||||
// mid-session. Under stream-lifetime locking this probe would block
|
|
||||||
// until the stream ended and the Eventually would time out.
|
|
||||||
locked := make(chan struct{})
|
|
||||||
go func() {
|
|
||||||
p.engineMu.Lock()
|
|
||||||
p.engineMu.Unlock() //nolint:staticcheck // probe: acquire-release proves availability
|
|
||||||
close(locked)
|
|
||||||
}()
|
|
||||||
Eventually(locked, time.Second).Should(BeClosed())
|
|
||||||
|
|
||||||
close(in)
|
|
||||||
Expect(<-errCh).NotTo(HaveOccurred())
|
|
||||||
collectLive(out)
|
|
||||||
})
|
|
||||||
|
|
||||||
It("errors out and reads last_error under the lock when a feed fails", func() {
|
|
||||||
CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr { return 0 }
|
|
||||||
|
|
||||||
in, out, errCh := runLive(p)
|
|
||||||
in <- liveConfig("")
|
|
||||||
in <- liveAudio(make([]float32, 10))
|
|
||||||
|
|
||||||
err := <-errCh
|
|
||||||
Expect(err).To(MatchError(ContainSubstring("stub error")))
|
|
||||||
got := collectLive(out)
|
|
||||||
Expect(got).To(HaveLen(1)) // just the ready ack
|
|
||||||
close(in)
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
var _ = Describe("stripEouMarker", func() {
|
|
||||||
It("strips a trailing <EOU> and reports it", func() {
|
|
||||||
text, eou := stripEouMarker("it is certainly very like the old portrait<EOU>")
|
|
||||||
Expect(text).To(Equal("it is certainly very like the old portrait"))
|
|
||||||
Expect(eou).To(BeTrue())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("strips a trailing <EOB> WITHOUT reporting an utterance end", func() {
|
|
||||||
// A decode ending on a backchannel must not confirm the
|
|
||||||
// retranscribe gate — the user was acknowledging, not yielding.
|
|
||||||
text, eou := stripEouMarker("uh-huh<EOB>")
|
|
||||||
Expect(text).To(Equal("uh-huh"))
|
|
||||||
Expect(eou).To(BeFalse())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("leaves marker-free text alone", func() {
|
|
||||||
text, eou := stripEouMarker("plain transcript")
|
|
||||||
Expect(text).To(Equal("plain transcript"))
|
|
||||||
Expect(eou).To(BeFalse())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("does not strip a marker in the middle of the text", func() {
|
|
||||||
text, eou := stripEouMarker("a<EOU>b")
|
|
||||||
Expect(text).To(Equal("a<EOU>b"))
|
|
||||||
Expect(eou).To(BeFalse())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
var _ = Describe("transcriptResultFromDoc EOU handling", func() {
|
|
||||||
It("strips the offline marker from text and sets the result flag", func() {
|
|
||||||
doc := transcriptJSON{Text: "the old portrait<EOU>"}
|
|
||||||
res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
|
|
||||||
Expect(res.Text).To(Equal("the old portrait"))
|
|
||||||
Expect(res.Eou).To(BeTrue())
|
|
||||||
Expect(res.Segments).To(HaveLen(1))
|
|
||||||
Expect(res.Segments[0].Text).To(Equal("the old portrait"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("reports eou=false for marker-free decodes", func() {
|
|
||||||
doc := transcriptJSON{Text: "no marker here"}
|
|
||||||
res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
|
|
||||||
Expect(res.Text).To(Equal("no marker here"))
|
|
||||||
Expect(res.Eou).To(BeFalse())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
@@ -106,7 +106,7 @@ var _ = Describe("transcriptResultFromDoc (multi-segment)", func() {
|
|||||||
var _ = Describe("streaming segment assembly", func() {
|
var _ = Describe("streaming segment assembly", func() {
|
||||||
It("closes a segment with start/end from its words on EOU", func() {
|
It("closes a segment with start/end from its words on EOU", func() {
|
||||||
acc := &streamSegmenter{}
|
acc := &streamSegmenter{}
|
||||||
acc.add(streamFeedResult{Delta: "hello world", Eou: true, Words: []transcriptWord{
|
acc.add(streamFeedJSON{Text: "hello world", Eou: 1, Words: []transcriptWord{
|
||||||
{W: "hello", Start: 0.0, End: 0.4}, {W: "world", Start: 0.4, End: 0.9},
|
{W: "hello", Start: 0.0, End: 0.4}, {W: "world", Start: 0.4, End: 0.9},
|
||||||
}})
|
}})
|
||||||
segs := acc.segments()
|
segs := acc.segments()
|
||||||
@@ -118,9 +118,9 @@ var _ = Describe("streaming segment assembly", func() {
|
|||||||
|
|
||||||
It("buffers words across feeds until EOU", func() {
|
It("buffers words across feeds until EOU", func() {
|
||||||
acc := &streamSegmenter{}
|
acc := &streamSegmenter{}
|
||||||
acc.add(streamFeedResult{Delta: "hi", Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
|
acc.add(streamFeedJSON{Text: "hi", Eou: 0, Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
|
||||||
Expect(acc.segments()).To(BeEmpty())
|
Expect(acc.segments()).To(BeEmpty())
|
||||||
acc.add(streamFeedResult{Delta: "there", Eou: true, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
|
acc.add(streamFeedJSON{Text: "there", Eou: 1, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
|
||||||
Expect(acc.segments()).To(HaveLen(1))
|
Expect(acc.segments()).To(HaveLen(1))
|
||||||
Expect(acc.segments()[0].Text).To(Equal("hi there"))
|
Expect(acc.segments()[0].Text).To(Equal("hi there"))
|
||||||
})
|
})
|
||||||
@@ -129,7 +129,7 @@ var _ = Describe("streaming segment assembly", func() {
|
|||||||
// field; a backchannel must still close the segment as it did in v4.
|
// field; a backchannel must still close the segment as it did in v4.
|
||||||
It("closes a segment on EOB (backchannel) too", func() {
|
It("closes a segment on EOB (backchannel) too", func() {
|
||||||
acc := &streamSegmenter{}
|
acc := &streamSegmenter{}
|
||||||
acc.add(streamFeedResult{Delta: "uh huh", Eob: true, Words: []transcriptWord{
|
acc.add(streamFeedJSON{Text: "uh huh", Eou: 0, Eob: 1, Words: []transcriptWord{
|
||||||
{W: "uh", Start: 0.0, End: 0.2}, {W: "huh", Start: 0.2, End: 0.5},
|
{W: "uh", Start: 0.0, End: 0.2}, {W: "huh", Start: 0.2, End: 0.5},
|
||||||
}})
|
}})
|
||||||
segs := acc.segments()
|
segs := acc.segments()
|
||||||
@@ -137,18 +137,4 @@ var _ = Describe("streaming segment assembly", func() {
|
|||||||
Expect(segs[0].Text).To(Equal("uh huh"))
|
Expect(segs[0].Text).To(Equal("uh huh"))
|
||||||
Expect(segs[0].End).To(Equal(secondsToNanos(0.5)))
|
Expect(segs[0].End).To(Equal(secondsToNanos(0.5)))
|
||||||
})
|
})
|
||||||
|
|
||||||
// Older text-only libparakeet.so: no per-word timings, so a segment is cut
|
|
||||||
// from the delta text on each <EOU>/<EOB> (no timestamps), one per utterance.
|
|
||||||
It("falls back to text segments when the feed carries no words", func() {
|
|
||||||
acc := &streamSegmenter{}
|
|
||||||
acc.add(streamFeedResult{Delta: "first turn", Eou: true})
|
|
||||||
acc.add(streamFeedResult{Delta: "second turn", Eou: true})
|
|
||||||
segs := acc.segments()
|
|
||||||
Expect(segs).To(HaveLen(2))
|
|
||||||
Expect(segs[0].Text).To(Equal("first turn"))
|
|
||||||
Expect(segs[1].Text).To(Equal("second turn"))
|
|
||||||
Expect(segs[0].Start).To(Equal(int64(0)), "no per-word timing on the text path")
|
|
||||||
Expect(segs[0].End).To(Equal(int64(0)))
|
|
||||||
})
|
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
|||||||
|
|
||||||
# stablediffusion.cpp (ggml)
|
# stablediffusion.cpp (ggml)
|
||||||
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
|
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
|
||||||
STABLEDIFFUSION_GGML_VERSION?=3b6c9ca97cfcda8e68e719e6670d06379fcbe943
|
STABLEDIFFUSION_GGML_VERSION?=c1790754d31bec0731ed5fddc9d5b9ff22ee19cd
|
||||||
|
|
||||||
CMAKE_ARGS+=-DGGML_MAX_NAME=128
|
CMAKE_ARGS+=-DGGML_MAX_NAME=128
|
||||||
|
|
||||||
|
|||||||
@@ -147,25 +147,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||||||
d["reasoning_content"] = msg.reasoning_content
|
d["reasoning_content"] = msg.reasoning_content
|
||||||
if msg.tool_calls:
|
if msg.tool_calls:
|
||||||
try:
|
try:
|
||||||
tool_calls = json.loads(msg.tool_calls)
|
d["tool_calls"] = json.loads(msg.tool_calls)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
else:
|
|
||||||
# OpenAI wire format carries function.arguments as a
|
|
||||||
# JSON-encoded string, but chat templates (e.g. Qwen3)
|
|
||||||
# iterate over it as a mapping. The vllm backend
|
|
||||||
# already parses arguments before applying the chat
|
|
||||||
# template (PR #10256); mirror that here so the
|
|
||||||
# sglang backend works with the same wire format.
|
|
||||||
if isinstance(tool_calls, list):
|
|
||||||
for tc in tool_calls:
|
|
||||||
func = tc.get("function") if isinstance(tc, dict) else None
|
|
||||||
if isinstance(func, dict) and isinstance(func.get("arguments"), str):
|
|
||||||
try:
|
|
||||||
func["arguments"] = json.loads(func["arguments"])
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
pass
|
|
||||||
d["tool_calls"] = tool_calls
|
|
||||||
result.append(d)
|
result.append(d)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@@ -103,11 +103,6 @@ func newApplication(appConfig *config.ApplicationConfig) *Application {
|
|||||||
mcpTools.CloseMCPSessions(modelName)
|
mcpTools.CloseMCPSessions(modelName)
|
||||||
})
|
})
|
||||||
|
|
||||||
// Record a model_load backend trace for every real backend load, so the
|
|
||||||
// Traces UI shows which backend runtime served each model and how long
|
|
||||||
// the load took. Load failures are traced by the modality wrappers.
|
|
||||||
ml.SetLoadObserver(corebackend.ModelLoadTraceObserver(appConfig))
|
|
||||||
|
|
||||||
app := &Application{
|
app := &Application{
|
||||||
backendLoader: config.NewModelConfigLoader(appConfig.SystemState.Model.ModelsPath),
|
backendLoader: config.NewModelConfigLoader(appConfig.SystemState.Model.ModelsPath),
|
||||||
modelLoader: ml,
|
modelLoader: ml,
|
||||||
|
|||||||
@@ -356,6 +356,12 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
|
|||||||
PrefixConfig: prefixCfg,
|
PrefixConfig: prefixCfg,
|
||||||
Pressure: pressure,
|
Pressure: pressure,
|
||||||
SharedModels: cfg.Distributed.SharedModels,
|
SharedModels: cfg.Distributed.SharedModels,
|
||||||
|
// Cap how long a cold load may hold the per-model advisory lock: the
|
||||||
|
// configured backend.install deadline plus a margin for file staging and
|
||||||
|
// the remote LoadModel. Derived from the install timeout so raising it
|
||||||
|
// (for slow links pulling multi-GB images) widens the ceiling too,
|
||||||
|
// instead of letting the static default cut a legitimately slow load.
|
||||||
|
ModelLoadCeiling: cfg.Distributed.BackendInstallTimeoutOrDefault() + 10*time.Minute,
|
||||||
})
|
})
|
||||||
|
|
||||||
// Wire staging-progress broadcasting so file-staging shows up on every
|
// Wire staging-progress broadcasting so file-staging shows up on every
|
||||||
|
|||||||
@@ -1,72 +0,0 @@
|
|||||||
package backend_test
|
|
||||||
|
|
||||||
import (
|
|
||||||
"errors"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/backend"
|
|
||||||
"github.com/mudler/LocalAI/core/config"
|
|
||||||
"github.com/mudler/LocalAI/core/trace"
|
|
||||||
"github.com/mudler/LocalAI/pkg/model"
|
|
||||||
)
|
|
||||||
|
|
||||||
// ModelLoadTraceObserver is what makes successful loads visible on the
|
|
||||||
// Traces page: one model_load row per real backend load, carrying the
|
|
||||||
// resolved backend runtime. Failures must NOT be recorded here — the
|
|
||||||
// modality wrappers own those — and the observer must respect the runtime
|
|
||||||
// tracing toggle.
|
|
||||||
var _ = Describe("ModelLoadTraceObserver", func() {
|
|
||||||
var appConfig *config.ApplicationConfig
|
|
||||||
|
|
||||||
successEvent := model.BackendLoadEvent{
|
|
||||||
ModelID: "parakeet-cpp-realtime_eou_120m-v1",
|
|
||||||
ModelName: "realtime_eou_120m.gguf",
|
|
||||||
Backend: "parakeet-cpp",
|
|
||||||
BackendURI: "/backends/intel-sycl-f16-parakeet-cpp-development/run.sh",
|
|
||||||
Duration: 1500 * time.Millisecond,
|
|
||||||
}
|
|
||||||
|
|
||||||
BeforeEach(func() {
|
|
||||||
appConfig = &config.ApplicationConfig{
|
|
||||||
EnableTracing: true,
|
|
||||||
TracingMaxItems: 64,
|
|
||||||
}
|
|
||||||
trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
|
|
||||||
trace.ClearBackendTraces()
|
|
||||||
})
|
|
||||||
|
|
||||||
It("records a model_load trace with the backend runtime on success", func() {
|
|
||||||
backend.ModelLoadTraceObserver(appConfig)(successEvent)
|
|
||||||
|
|
||||||
Eventually(trace.GetBackendTraces).Should(HaveLen(1))
|
|
||||||
got := trace.GetBackendTraces()[0]
|
|
||||||
Expect(got.Type).To(Equal(trace.BackendTraceModelLoad))
|
|
||||||
Expect(got.Summary).To(Equal("Model loaded"))
|
|
||||||
Expect(got.ModelName).To(Equal("parakeet-cpp-realtime_eou_120m-v1"))
|
|
||||||
Expect(got.Backend).To(Equal("parakeet-cpp"))
|
|
||||||
Expect(got.Duration).To(Equal(1500 * time.Millisecond))
|
|
||||||
Expect(got.Data["backend_runtime"]).To(Equal("/backends/intel-sycl-f16-parakeet-cpp-development/run.sh"))
|
|
||||||
Expect(got.Data["model_file"]).To(Equal("realtime_eou_120m.gguf"))
|
|
||||||
Expect(got.Error).To(BeEmpty())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("skips failed loads — the modality wrappers trace those with request context", func() {
|
|
||||||
failed := successEvent
|
|
||||||
failed.Err = errors.New("grpc service not ready")
|
|
||||||
|
|
||||||
backend.ModelLoadTraceObserver(appConfig)(failed)
|
|
||||||
|
|
||||||
Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("records nothing when tracing is disabled", func() {
|
|
||||||
appConfig.EnableTracing = false
|
|
||||||
|
|
||||||
backend.ModelLoadTraceObserver(appConfig)(successEvent)
|
|
||||||
|
|
||||||
Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
@@ -19,39 +19,6 @@ import (
|
|||||||
"github.com/mudler/xlog"
|
"github.com/mudler/xlog"
|
||||||
)
|
)
|
||||||
|
|
||||||
// ModelLoadTraceObserver returns the ModelLoader load observer that records
|
|
||||||
// a model_load backend trace for every successful real load (backend process
|
|
||||||
// spawn + LoadModel RPC; cache hits never reach the observer). Failures are
|
|
||||||
// deliberately skipped here: the modality wrappers already record them via
|
|
||||||
// recordModelLoadFailure with request context, and the backend auto-discovery
|
|
||||||
// scan probes several backends before one succeeds — tracing every probe
|
|
||||||
// failure would bury the buffer in noise.
|
|
||||||
//
|
|
||||||
// The traced data includes the resolved backend runtime (the installed
|
|
||||||
// backend's launcher path, which names the variant directory) — that is what
|
|
||||||
// identifies WHICH build served the load. A stale installed backend is
|
|
||||||
// invisible in the model config but obvious here.
|
|
||||||
func ModelLoadTraceObserver(appConfig *config.ApplicationConfig) func(model.BackendLoadEvent) {
|
|
||||||
return func(ev model.BackendLoadEvent) {
|
|
||||||
if ev.Err != nil || !appConfig.EnableTracing {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
|
|
||||||
trace.RecordBackendTrace(trace.BackendTrace{
|
|
||||||
Timestamp: time.Now(),
|
|
||||||
Duration: ev.Duration,
|
|
||||||
Type: trace.BackendTraceModelLoad,
|
|
||||||
ModelName: ev.ModelID,
|
|
||||||
Backend: ev.Backend,
|
|
||||||
Summary: "Model loaded",
|
|
||||||
Data: map[string]any{
|
|
||||||
"model_file": ev.ModelName,
|
|
||||||
"backend_runtime": ev.BackendURI,
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// recordModelLoadFailure records a backend trace when model loading fails.
|
// recordModelLoadFailure records a backend trace when model loading fails.
|
||||||
func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, backend string, err error, data map[string]any) {
|
func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, backend string, err error, data map[string]any) {
|
||||||
if !appConfig.EnableTracing {
|
if !appConfig.EnableTracing {
|
||||||
|
|||||||
@@ -181,7 +181,6 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR
|
|||||||
Text: r.Text,
|
Text: r.Text,
|
||||||
Language: r.Language,
|
Language: r.Language,
|
||||||
Duration: float64(r.Duration),
|
Duration: float64(r.Duration),
|
||||||
Eou: r.Eou,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, s := range r.Segments {
|
for _, s := range r.Segments {
|
||||||
|
|||||||
@@ -1,297 +0,0 @@
|
|||||||
package backend
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"errors"
|
|
||||||
"fmt"
|
|
||||||
"io"
|
|
||||||
"maps"
|
|
||||||
"sync"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/config"
|
|
||||||
"github.com/mudler/LocalAI/core/schema"
|
|
||||||
"github.com/mudler/LocalAI/core/trace"
|
|
||||||
grpcPkg "github.com/mudler/LocalAI/pkg/grpc"
|
|
||||||
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
|
||||||
"github.com/mudler/LocalAI/pkg/model"
|
|
||||||
"github.com/mudler/LocalAI/pkg/sound"
|
|
||||||
"github.com/mudler/xlog"
|
|
||||||
)
|
|
||||||
|
|
||||||
// LiveTranscriptionEvent is one streamed event from a live (bidirectional)
// transcription session. Delta/Eou/Eob/Words arrive as the user speaks; Final
// is set exactly once, on the terminal event after Close flushes the decode
// tail. Eou means the model judged the user yielded the turn; Eob means a
// backchannel ("uh-huh") ended — callers must NOT treat Eob as a turn
// boundary.
type LiveTranscriptionEvent struct {
	// Delta is the text carried by this event; empty when the event has none.
	Delta string
	// Eou is the end-of-utterance flag copied from the backend response.
	Eou bool
	// Eob is the end-of-backchannel flag copied from the backend response.
	Eob bool
	// Words holds the per-word entries of this event, with Start/End converted
	// from the proto integer fields to time.Duration.
	Words []schema.TranscriptionWord
	// Final is non-nil only when the response carried a final result.
	Final *schema.TranscriptionResult
}
|
|
||||||
|
|
||||||
// LiveTranscriptionSession is a handle on an open live transcription stream.
// Feed pushes 16 kHz mono float PCM; Close signals end-of-audio, waits for
// the backend's terminal Final event to be delivered, and releases the
// stream.
type LiveTranscriptionSession interface {
	// Feed sends one chunk of PCM samples to the backend and returns the
	// send error, if any.
	Feed(pcm []float32) error
	// Close ends the audio stream and tears the session down. The
	// implementation in this file runs the teardown once and returns the same
	// result on every subsequent call.
	Close() error
}
|
|
||||||
|
|
||||||
// liveCloseDrainTimeout bounds how long Close waits for the backend to flush
// the decode tail before force-cancelling the stream. Finalize is one short
// engine call, so a wait measured in seconds means the backend is wedged.
const liveCloseDrainTimeout = 10 * time.Second
|
|
||||||
|
|
||||||
// liveTranscriptionSession is the LiveTranscriptionSession implementation
// returned by ModelTranscriptionLive. It owns the client side of the live
// RPC stream plus the goroutine that receives the backend's responses.
type liveTranscriptionSession struct {
	// stream is the open live RPC; Feed sends on it and Close half-closes it.
	stream grpcPkg.AudioTranscriptionLiveClient
	// cancel cancels the stream context derived in ModelTranscriptionLive.
	cancel context.CancelFunc
	// recvDone is closed by the recv goroutine when it exits.
	recvDone chan struct{}
	recvErr  error // written by the recv goroutine before recvDone closes
	// closeOnce makes the teardown in Close run a single time; closeErr is
	// the result of that teardown, returned by every Close call.
	closeOnce sync.Once
	closeErr  error
	trace     *liveTraceState // nil when tracing was disabled at open
}
|
|
||||||
|
|
||||||
func (s *liveTranscriptionSession) Feed(pcm []float32) error {
|
|
||||||
s.trace.addPCM(pcm)
|
|
||||||
return s.stream.Send(&proto.TranscriptLiveRequest{
|
|
||||||
Payload: &proto.TranscriptLiveRequest_Audio{Audio: &proto.TranscriptLiveAudio{Pcm: pcm}},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *liveTranscriptionSession) Close() error {
|
|
||||||
s.closeOnce.Do(func() {
|
|
||||||
err := s.stream.CloseSend()
|
|
||||||
select {
|
|
||||||
case <-s.recvDone:
|
|
||||||
case <-time.After(liveCloseDrainTimeout):
|
|
||||||
xlog.Warn("live transcription: backend did not finalize in time; cancelling stream")
|
|
||||||
s.cancel()
|
|
||||||
<-s.recvDone
|
|
||||||
}
|
|
||||||
s.cancel()
|
|
||||||
if err == nil {
|
|
||||||
err = s.recvErr
|
|
||||||
}
|
|
||||||
s.closeErr = err
|
|
||||||
s.trace.record(err)
|
|
||||||
})
|
|
||||||
return s.closeErr
|
|
||||||
}
|
|
||||||
|
|
||||||
// liveSampleRate is the PCM rate, in samples per second, of a live
// transcription session. It is fixed by the session config sent in
// ModelTranscriptionLive and is also used to size the trace audio snippet.
const liveSampleRate = 16000
|
|
||||||
|
|
||||||
// liveTraceState accumulates what the per-turn backend trace needs while a
// live session runs: a bounded copy of the fed PCM for the audio snippet,
// the decode outputs, and timing. One trace is recorded at Close — the live
// path never touches the unary transcription wrapper, so without this a
// streaming-only pipeline produced no transcription traces at all. Feed and
// the recv goroutine run concurrently; mu guards the accumulators.
type liveTraceState struct {
	// Set once by newLiveTraceState and only read afterwards.
	appConfig *config.ApplicationConfig
	modelName string
	backend   string
	language  string
	started   time.Time // construction time; trace timestamp and duration base

	// mu guards every field below.
	mu          sync.Mutex
	pcm         []byte // first trace.MaxSnippetSeconds of fed audio, int16 LE
	fedSamples  int    // ALL samples fed, beyond the snippet cap
	deltaEvents int    // events observed with a non-empty Delta
	eouEvents   int    // events observed with Eou set
	eobEvents   int    // events observed with Eob set
	finalText   string // Text of the most recent event that carried Final
}
|
|
||||||
|
|
||||||
func newLiveTraceState(modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, language string) *liveTraceState {
|
|
||||||
if !appConfig.EnableTracing {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
|
|
||||||
return &liveTraceState{
|
|
||||||
appConfig: appConfig,
|
|
||||||
modelName: modelConfig.Name,
|
|
||||||
backend: modelConfig.Backend,
|
|
||||||
language: language,
|
|
||||||
started: time.Now(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (ts *liveTraceState) addPCM(pcm []float32) {
|
|
||||||
if ts == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
ts.mu.Lock()
|
|
||||||
defer ts.mu.Unlock()
|
|
||||||
ts.fedSamples += len(pcm)
|
|
||||||
maxBytes := trace.MaxSnippetSeconds * liveSampleRate * 2
|
|
||||||
if room := (maxBytes - len(ts.pcm)) / 2; room > 0 {
|
|
||||||
if len(pcm) > room {
|
|
||||||
pcm = pcm[:room]
|
|
||||||
}
|
|
||||||
ts.pcm = append(ts.pcm, sound.Float32sToInt16LEBytes(pcm)...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (ts *liveTraceState) observe(ev LiveTranscriptionEvent) {
|
|
||||||
if ts == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
ts.mu.Lock()
|
|
||||||
defer ts.mu.Unlock()
|
|
||||||
if ev.Delta != "" {
|
|
||||||
ts.deltaEvents++
|
|
||||||
}
|
|
||||||
if ev.Eou {
|
|
||||||
ts.eouEvents++
|
|
||||||
}
|
|
||||||
if ev.Eob {
|
|
||||||
ts.eobEvents++
|
|
||||||
}
|
|
||||||
if ev.Final != nil {
|
|
||||||
ts.finalText = ev.Final.Text
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (ts *liveTraceState) record(closeErr error) {
|
|
||||||
if ts == nil || !ts.appConfig.EnableTracing {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
ts.mu.Lock()
|
|
||||||
data := map[string]any{
|
|
||||||
"source": "live_stream",
|
|
||||||
"language": ts.language,
|
|
||||||
"result_text": ts.finalText,
|
|
||||||
"eou_events": ts.eouEvents,
|
|
||||||
"eob_events": ts.eobEvents,
|
|
||||||
"delta_events": ts.deltaEvents,
|
|
||||||
}
|
|
||||||
if snippet := trace.AudioSnippetFromPCM(ts.pcm, liveSampleRate, ts.fedSamples*2, ts.appConfig.TracingMaxBodyBytes); snippet != nil {
|
|
||||||
maps.Copy(data, snippet)
|
|
||||||
}
|
|
||||||
summary := "live -> " + ts.finalText
|
|
||||||
ts.mu.Unlock()
|
|
||||||
|
|
||||||
bt := trace.BackendTrace{
|
|
||||||
Timestamp: ts.started,
|
|
||||||
Duration: time.Since(ts.started),
|
|
||||||
Type: trace.BackendTraceTranscription,
|
|
||||||
ModelName: ts.modelName,
|
|
||||||
Backend: ts.backend,
|
|
||||||
Summary: trace.TruncateString(summary, 200),
|
|
||||||
Data: data,
|
|
||||||
}
|
|
||||||
if closeErr != nil {
|
|
||||||
bt.Error = closeErr.Error()
|
|
||||||
}
|
|
||||||
trace.RecordBackendTrace(bt)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ModelTranscriptionLive loads the transcription backend, opens the
|
|
||||||
// bidirectional AudioTranscriptionLive RPC, sends the session config, and
|
|
||||||
// BLOCKS until the backend's ready ack. A grpcerrors.
|
|
||||||
// IsLiveTranscriptionUnsupported error means the backend (or the loaded
|
|
||||||
// model) cannot do live transcription and the caller should degrade to the
|
|
||||||
// unary/file path. After a successful return, onEvent is invoked from a
|
|
||||||
// background goroutine — in order, one event at a time — for every response
|
|
||||||
// the backend streams, ending with the Final event triggered by Close.
|
|
||||||
func ModelTranscriptionLive(ctx context.Context, language string,
|
|
||||||
ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig,
|
|
||||||
onEvent func(LiveTranscriptionEvent)) (LiveTranscriptionSession, error) {
|
|
||||||
|
|
||||||
transcriptionModel, err := loadTranscriptionModel(ctx, ml, modelConfig, appConfig)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// The derived cancel out-lives this call inside the session: Close uses
|
|
||||||
// it to unwind the stream (and, in embed mode, the server-side recv
|
|
||||||
// pump, which only stops on send-close or context cancellation).
|
|
||||||
streamCtx, cancel := context.WithCancel(ctx)
|
|
||||||
stream, err := transcriptionModel.AudioTranscriptionLive(streamCtx)
|
|
||||||
if err != nil {
|
|
||||||
cancel()
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
fail := func(err error) (LiveTranscriptionSession, error) {
|
|
||||||
_ = stream.CloseSend()
|
|
||||||
cancel()
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := stream.Send(&proto.TranscriptLiveRequest{
|
|
||||||
Payload: &proto.TranscriptLiveRequest_Config{Config: &proto.TranscriptLiveConfig{
|
|
||||||
Language: language,
|
|
||||||
SampleRate: liveSampleRate,
|
|
||||||
}},
|
|
||||||
}); err != nil {
|
|
||||||
return fail(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ready-ack contract: the backend answers a successful open with a
|
|
||||||
// {ready:true} response before any transcript data; unsupported
|
|
||||||
// backends surface Unimplemented here instead.
|
|
||||||
ack, err := stream.Recv()
|
|
||||||
if err != nil {
|
|
||||||
return fail(err)
|
|
||||||
}
|
|
||||||
if !ack.GetReady() {
|
|
||||||
return fail(fmt.Errorf("live transcription: backend %q broke the ready-ack contract (first response carried data)", modelConfig.Backend))
|
|
||||||
}
|
|
||||||
|
|
||||||
s := &liveTranscriptionSession{
|
|
||||||
stream: stream,
|
|
||||||
cancel: cancel,
|
|
||||||
recvDone: make(chan struct{}),
|
|
||||||
trace: newLiveTraceState(modelConfig, appConfig, language),
|
|
||||||
}
|
|
||||||
|
|
||||||
go func() {
|
|
||||||
defer close(s.recvDone)
|
|
||||||
for {
|
|
||||||
resp, err := stream.Recv()
|
|
||||||
if err != nil {
|
|
||||||
if !errors.Is(err, io.EOF) && streamCtx.Err() == nil {
|
|
||||||
xlog.Warn("live transcription stream ended unexpectedly", "error", err)
|
|
||||||
s.recvErr = err
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
ev := liveEventFromProto(resp)
|
|
||||||
if ev.Delta == "" && !ev.Eou && !ev.Eob && len(ev.Words) == 0 && ev.Final == nil {
|
|
||||||
continue // duplicate ready ack / keep-alive: nothing to deliver
|
|
||||||
}
|
|
||||||
s.trace.observe(ev)
|
|
||||||
onEvent(ev)
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
return s, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func liveEventFromProto(r *proto.TranscriptLiveResponse) LiveTranscriptionEvent {
|
|
||||||
ev := LiveTranscriptionEvent{
|
|
||||||
Delta: r.GetDelta(),
|
|
||||||
Eou: r.GetEou(),
|
|
||||||
Eob: r.GetEob(),
|
|
||||||
}
|
|
||||||
for _, w := range r.GetWords() {
|
|
||||||
ev.Words = append(ev.Words, schema.TranscriptionWord{
|
|
||||||
Start: time.Duration(w.Start),
|
|
||||||
End: time.Duration(w.End),
|
|
||||||
Text: w.Text,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
if r.GetFinalResult() != nil {
|
|
||||||
ev.Final = transcriptResultFromProto(r.GetFinalResult())
|
|
||||||
}
|
|
||||||
return ev
|
|
||||||
}
|
|
||||||
@@ -1,162 +0,0 @@
|
|||||||
package backend
|
|
||||||
|
|
||||||
import (
|
|
||||||
"errors"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/config"
|
|
||||||
"github.com/mudler/LocalAI/core/schema"
|
|
||||||
"github.com/mudler/LocalAI/core/trace"
|
|
||||||
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
var _ = Describe("liveEventFromProto", func() {
|
|
||||||
It("maps deltas, eou flags and words (ns -> duration)", func() {
|
|
||||||
ev := liveEventFromProto(&proto.TranscriptLiveResponse{
|
|
||||||
Delta: "hello ",
|
|
||||||
Eou: true,
|
|
||||||
Words: []*proto.TranscriptWord{
|
|
||||||
{Start: int64(100 * time.Millisecond), End: int64(400 * time.Millisecond), Text: "hello"},
|
|
||||||
},
|
|
||||||
})
|
|
||||||
Expect(ev.Delta).To(Equal("hello "))
|
|
||||||
Expect(ev.Eou).To(BeTrue())
|
|
||||||
Expect(ev.Words).To(HaveLen(1))
|
|
||||||
Expect(ev.Words[0].Text).To(Equal("hello"))
|
|
||||||
Expect(ev.Words[0].Start).To(Equal(100 * time.Millisecond))
|
|
||||||
Expect(ev.Words[0].End).To(Equal(400 * time.Millisecond))
|
|
||||||
Expect(ev.Final).To(BeNil())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("maps the terminal final result including the eou flag", func() {
|
|
||||||
ev := liveEventFromProto(&proto.TranscriptLiveResponse{
|
|
||||||
FinalResult: &proto.TranscriptResult{
|
|
||||||
Text: "hello world",
|
|
||||||
Duration: 1.5,
|
|
||||||
Eou: true,
|
|
||||||
Segments: []*proto.TranscriptSegment{{Id: 0, Text: "hello world"}},
|
|
||||||
},
|
|
||||||
})
|
|
||||||
Expect(ev.Final).NotTo(BeNil())
|
|
||||||
Expect(ev.Final.Text).To(Equal("hello world"))
|
|
||||||
Expect(ev.Final.Duration).To(BeNumerically("~", 1.5, 1e-6))
|
|
||||||
Expect(ev.Final.Eou).To(BeTrue())
|
|
||||||
Expect(ev.Final.Segments).To(HaveLen(1))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("yields an empty event for a bare ready ack (filtered by the recv loop)", func() {
|
|
||||||
ev := liveEventFromProto(&proto.TranscriptLiveResponse{Ready: true})
|
|
||||||
Expect(ev.Delta).To(BeEmpty())
|
|
||||||
Expect(ev.Eou).To(BeFalse())
|
|
||||||
Expect(ev.Words).To(BeEmpty())
|
|
||||||
Expect(ev.Final).To(BeNil())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("maps the eob backchannel flag separately from eou", func() {
|
|
||||||
ev := liveEventFromProto(&proto.TranscriptLiveResponse{Delta: "uh-huh", Eob: true})
|
|
||||||
Expect(ev.Eob).To(BeTrue())
|
|
||||||
Expect(ev.Eou).To(BeFalse())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
// liveTraceState is what makes streaming-only pipelines visible on the
|
|
||||||
// Traces page: without it a semantic_vad session with retranscribe off
|
|
||||||
// produced no transcription trace at all. One trace per session (= one per
|
|
||||||
// realtime turn), recorded at Close.
|
|
||||||
var _ = Describe("liveTraceState", func() {
|
|
||||||
var appConfig *config.ApplicationConfig
|
|
||||||
|
|
||||||
BeforeEach(func() {
|
|
||||||
appConfig = &config.ApplicationConfig{
|
|
||||||
EnableTracing: true,
|
|
||||||
TracingMaxItems: 64,
|
|
||||||
}
|
|
||||||
trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
|
|
||||||
trace.ClearBackendTraces()
|
|
||||||
})
|
|
||||||
|
|
||||||
modelCfg := func() config.ModelConfig {
|
|
||||||
cfg := config.ModelConfig{Backend: "parakeet-cpp"}
|
|
||||||
cfg.Name = "parakeet-live"
|
|
||||||
return cfg
|
|
||||||
}
|
|
||||||
|
|
||||||
It("is disabled (nil) when tracing is off, and nil receivers are no-ops", func() {
|
|
||||||
appConfig.EnableTracing = false
|
|
||||||
ts := newLiveTraceState(modelCfg(), appConfig, "en")
|
|
||||||
Expect(ts).To(BeNil())
|
|
||||||
|
|
||||||
// The session calls these unconditionally; nil must be safe.
|
|
||||||
ts.addPCM([]float32{0.5})
|
|
||||||
ts.observe(LiveTranscriptionEvent{Eou: true})
|
|
||||||
ts.record(nil)
|
|
||||||
Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("records one transcription trace with text, eou event counts and audio snippet at Close", func() {
|
|
||||||
ts := newLiveTraceState(modelCfg(), appConfig, "en")
|
|
||||||
Expect(ts).NotTo(BeNil())
|
|
||||||
|
|
||||||
// One second of a loud-ish constant tone so the snippet has signal.
|
|
||||||
pcm := make([]float32, liveSampleRate)
|
|
||||||
for i := range pcm {
|
|
||||||
pcm[i] = 0.25
|
|
||||||
}
|
|
||||||
ts.addPCM(pcm)
|
|
||||||
ts.observe(LiveTranscriptionEvent{Delta: "hello "})
|
|
||||||
ts.observe(LiveTranscriptionEvent{Delta: "world", Eou: true})
|
|
||||||
ts.observe(LiveTranscriptionEvent{Final: &schema.TranscriptionResult{Text: "hello world", Eou: true}})
|
|
||||||
|
|
||||||
ts.record(nil)
|
|
||||||
|
|
||||||
Eventually(trace.GetBackendTraces).Should(HaveLen(1))
|
|
||||||
got := trace.GetBackendTraces()[0]
|
|
||||||
Expect(got.Type).To(Equal(trace.BackendTraceTranscription))
|
|
||||||
Expect(got.ModelName).To(Equal("parakeet-live"))
|
|
||||||
Expect(got.Backend).To(Equal("parakeet-cpp"))
|
|
||||||
Expect(got.Summary).To(ContainSubstring("hello world"))
|
|
||||||
Expect(got.Data["source"]).To(Equal("live_stream"))
|
|
||||||
Expect(got.Data["result_text"]).To(Equal("hello world"))
|
|
||||||
// The live FinalResult no longer carries a terminal eou flag; the
|
|
||||||
// per-feed eou_events count is what the trace records instead.
|
|
||||||
Expect(got.Data).NotTo(HaveKey("eou"))
|
|
||||||
Expect(got.Data["eou_events"]).To(Equal(1))
|
|
||||||
Expect(got.Data["delta_events"]).To(Equal(2))
|
|
||||||
Expect(got.Data["audio_duration_s"]).To(BeNumerically("~", 1.0, 0.01))
|
|
||||||
Expect(got.Data["audio_wav_base64"]).NotTo(BeEmpty())
|
|
||||||
Expect(got.Error).To(BeEmpty())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("caps the stored snippet but keeps counting the full fed duration", func() {
|
|
||||||
ts := newLiveTraceState(modelCfg(), appConfig, "")
|
|
||||||
|
|
||||||
// Feed past the snippet cap in two chunks (cap + one extra second).
|
|
||||||
ts.addPCM(make([]float32, trace.MaxSnippetSeconds*liveSampleRate))
|
|
||||||
ts.addPCM(make([]float32, liveSampleRate))
|
|
||||||
|
|
||||||
Expect(len(ts.pcm)).To(Equal(trace.MaxSnippetSeconds * liveSampleRate * 2))
|
|
||||||
Expect(ts.fedSamples).To(Equal((trace.MaxSnippetSeconds + 1) * liveSampleRate))
|
|
||||||
|
|
||||||
ts.record(nil)
|
|
||||||
Eventually(trace.GetBackendTraces).Should(HaveLen(1))
|
|
||||||
got := trace.GetBackendTraces()[0]
|
|
||||||
Expect(got.Data["audio_duration_s"]).To(BeNumerically("~", float64(trace.MaxSnippetSeconds+1), 0.01))
|
|
||||||
Expect(got.Data["audio_snippet_s"]).To(BeNumerically("~", float64(trace.MaxSnippetSeconds), 0.01))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("clamps out-of-range float samples instead of wrapping", func() {
|
|
||||||
ts := newLiveTraceState(modelCfg(), appConfig, "")
|
|
||||||
ts.addPCM([]float32{2.0, -2.0})
|
|
||||||
Expect(ts.pcm).To(Equal([]byte{0xff, 0x7f, 0x00, 0x80})) // 32767, -32768
|
|
||||||
})
|
|
||||||
|
|
||||||
It("stamps the close error on the trace", func() {
|
|
||||||
ts := newLiveTraceState(modelCfg(), appConfig, "")
|
|
||||||
ts.record(errors.New("stream torn down"))
|
|
||||||
|
|
||||||
Eventually(trace.GetBackendTraces).Should(HaveLen(1))
|
|
||||||
Expect(trace.GetBackendTraces()[0].Error).To(Equal("stream torn down"))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
@@ -567,38 +567,6 @@ func DefaultRegistry() map[string]FieldMetaOverride {
|
|||||||
Advanced: true,
|
Advanced: true,
|
||||||
Order: 83,
|
Order: 83,
|
||||||
},
|
},
|
||||||
"pipeline.turn_detection.type": {
|
|
||||||
Section: "pipeline",
|
|
||||||
Label: "Turn Detection",
|
|
||||||
Description: "Default turn-detection mode for realtime sessions on this pipeline. server_vad commits after a fixed silence window; semantic_vad lets the transcription model's end-of-utterance token drive a dynamic window (fast commit after the token, long eagerness fallback without it). semantic_vad requires a streaming-EOU transcription model (e.g. parakeet-cpp-realtime_eou_120m-v1) and degrades to silence-only otherwise. Clients can override per session via session.update.",
|
|
||||||
Component: "select",
|
|
||||||
Options: []FieldOption{
|
|
||||||
{Value: "", Label: "Default (server_vad)"},
|
|
||||||
{Value: "server_vad", Label: "server_vad (silence-based)"},
|
|
||||||
{Value: "semantic_vad", Label: "semantic_vad (end-of-utterance token)"},
|
|
||||||
},
|
|
||||||
Order: 87,
|
|
||||||
},
|
|
||||||
"pipeline.turn_detection.eagerness": {
|
|
||||||
Section: "pipeline",
|
|
||||||
Label: "Eagerness",
|
|
||||||
Description: "semantic_vad fallback silence window used when no end-of-utterance token was seen: low waits 8s, medium/auto 4s, high 2s.",
|
|
||||||
Component: "select",
|
|
||||||
Options: []FieldOption{
|
|
||||||
{Value: "", Label: "Default (auto)"},
|
|
||||||
{Value: "low", Label: "low (8s)"},
|
|
||||||
{Value: "medium", Label: "medium (4s)"},
|
|
||||||
{Value: "high", Label: "high (2s)"},
|
|
||||||
},
|
|
||||||
Order: 88,
|
|
||||||
},
|
|
||||||
"pipeline.turn_detection.retranscribe": {
|
|
||||||
Section: "pipeline",
|
|
||||||
Label: "Retranscribe on Commit",
|
|
||||||
Description: "Cross-check every semantic_vad commit with an offline decode of the buffered turn: commit only proceeds when the batch decode also ends in the end-of-utterance token, and its transcript is used. Logs a streamed-vs-batch comparison — useful to gauge streaming/batch alignment — at the cost of one extra decode per turn.",
|
|
||||||
Component: "toggle",
|
|
||||||
Order: 89,
|
|
||||||
},
|
|
||||||
|
|
||||||
// --- Functions ---
|
// --- Functions ---
|
||||||
"function.grammar.parallel_calls": {
|
"function.grammar.parallel_calls": {
|
||||||
|
|||||||
@@ -650,12 +650,6 @@ type Pipeline struct {
|
|||||||
// VoiceRecognition gates the pipeline behind speaker verification. Nil
|
// VoiceRecognition gates the pipeline behind speaker verification. Nil
|
||||||
// (block absent) means no gate, preserving existing behavior.
|
// (block absent) means no gate, preserving existing behavior.
|
||||||
VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
|
VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
|
||||||
|
|
||||||
// TurnDetection sets the server-side default turn-detection mode for
|
|
||||||
// realtime sessions on this pipeline, so clients need no session.update
|
|
||||||
// to benefit. A client session.update still overrides type and eagerness
|
|
||||||
// per session; retranscribe is server-side only. Unset keeps server_vad.
|
|
||||||
TurnDetection PipelineTurnDetection `yaml:"turn_detection,omitempty" json:"turn_detection,omitempty"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// PipelineCompaction configures summarize-then-drop for a realtime pipeline.
|
// PipelineCompaction configures summarize-then-drop for a realtime pipeline.
|
||||||
@@ -940,38 +934,6 @@ func (v PipelineVoiceRecognition) Validate(registryAvailable bool) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// @Description PipelineTurnDetection sets realtime turn-detection defaults.
type PipelineTurnDetection struct {
	// Type selects the default turn_detection mode for sessions on this
	// pipeline: "server_vad" (silence-based) or "semantic_vad" (the
	// transcription model's end-of-utterance token drives a dynamic silence
	// window; needs a streaming-EOU transcription model such as
	// parakeet_realtime_eou_120m-v1, degrades to silence-only otherwise).
	// Pipeline.TurnDetectionSemantic compares this value against
	// "semantic_vad" ignoring case and surrounding whitespace; any other
	// value, including empty, is treated as non-semantic.
	Type string `yaml:"type,omitempty" json:"type,omitempty"`
	// Eagerness is the semantic_vad fallback when no end-of-utterance token
	// was seen: low waits 8s of silence, medium/auto 4s, high 2s.
	Eagerness string `yaml:"eagerness,omitempty" json:"eagerness,omitempty"`
	// Retranscribe (semantic_vad only) cross-checks every EOU-triggered
	// commit with an offline decode of the buffered turn: the commit only
	// proceeds when the batch decode also ends in the end-of-utterance token,
	// and its transcript is the one used. The streamed and batch transcripts
	// are compared in the logs — a diagnostic for streaming/batch alignment
	// at the cost of one extra decode per turn. A nil pointer (field absent)
	// and an explicit false both mean off.
	Retranscribe *bool `yaml:"retranscribe,omitempty" json:"retranscribe,omitempty"`
}
|
|
||||||
|
|
||||||
// TurnDetectionSemantic reports whether this pipeline defaults sessions to
|
|
||||||
// semantic (EOU-driven) turn detection.
|
|
||||||
func (p Pipeline) TurnDetectionSemantic() bool {
|
|
||||||
return strings.EqualFold(strings.TrimSpace(p.TurnDetection.Type), "semantic_vad")
|
|
||||||
}
|
|
||||||
|
|
||||||
// TurnDetectionRetranscribe reports whether semantic_vad commits should be
|
|
||||||
// cross-checked (and transcribed) by an offline decode of the buffered turn.
|
|
||||||
func (p Pipeline) TurnDetectionRetranscribe() bool {
|
|
||||||
return p.TurnDetection.Retranscribe != nil && *p.TurnDetection.Retranscribe
|
|
||||||
}
|
|
||||||
|
|
||||||
// @Description File configuration for model downloads
|
// @Description File configuration for model downloads
|
||||||
type File struct {
|
type File struct {
|
||||||
Filename string `yaml:"filename,omitempty" json:"filename,omitempty"`
|
Filename string `yaml:"filename,omitempty" json:"filename,omitempty"`
|
||||||
|
|||||||
@@ -1,61 +0,0 @@
|
|||||||
package config
|
|
||||||
|
|
||||||
import (
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
"gopkg.in/yaml.v3"
|
|
||||||
)
|
|
||||||
|
|
||||||
// pipeline.turn_detection sets the server-side default turn-detection mode
|
|
||||||
// for realtime sessions. Unset keeps server_vad, so existing configs are
|
|
||||||
// unaffected; retranscribe is opt-in.
|
|
||||||
var _ = Describe("Pipeline turn_detection config", func() {
|
|
||||||
It("defaults to non-semantic with retranscribe off when unset", func() {
|
|
||||||
var p Pipeline
|
|
||||||
Expect(p.TurnDetectionSemantic()).To(BeFalse())
|
|
||||||
Expect(p.TurnDetectionRetranscribe()).To(BeFalse())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("parses the nested turn_detection block from YAML", func() {
|
|
||||||
var c ModelConfig
|
|
||||||
err := yaml.Unmarshal([]byte(`
|
|
||||||
name: gpt-realtime
|
|
||||||
pipeline:
|
|
||||||
transcription: parakeet-cpp-realtime_eou_120m-v1
|
|
||||||
turn_detection:
|
|
||||||
type: semantic_vad
|
|
||||||
eagerness: high
|
|
||||||
retranscribe: true
|
|
||||||
`), &c)
|
|
||||||
Expect(err).ToNot(HaveOccurred())
|
|
||||||
Expect(c.Pipeline.TurnDetectionSemantic()).To(BeTrue())
|
|
||||||
Expect(c.Pipeline.TurnDetection.Eagerness).To(Equal("high"))
|
|
||||||
Expect(c.Pipeline.TurnDetectionRetranscribe()).To(BeTrue())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("treats server_vad and unknown types as non-semantic", func() {
|
|
||||||
var p Pipeline
|
|
||||||
p.TurnDetection.Type = "server_vad"
|
|
||||||
Expect(p.TurnDetectionSemantic()).To(BeFalse())
|
|
||||||
p.TurnDetection.Type = "something_else"
|
|
||||||
Expect(p.TurnDetectionSemantic()).To(BeFalse())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("matches semantic_vad case-insensitively with surrounding space", func() {
|
|
||||||
var p Pipeline
|
|
||||||
p.TurnDetection.Type = " Semantic_VAD "
|
|
||||||
Expect(p.TurnDetectionSemantic()).To(BeTrue())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("treats an explicit retranscribe false as off", func() {
|
|
||||||
var c ModelConfig
|
|
||||||
err := yaml.Unmarshal([]byte(`
|
|
||||||
pipeline:
|
|
||||||
turn_detection:
|
|
||||||
type: semantic_vad
|
|
||||||
retranscribe: false
|
|
||||||
`), &c)
|
|
||||||
Expect(err).ToNot(HaveOccurred())
|
|
||||||
Expect(c.Pipeline.TurnDetectionRetranscribe()).To(BeFalse())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
@@ -22,13 +22,11 @@ var _ = Describe("DiscoverModelConfig", func() {
|
|||||||
modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
|
modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
|
||||||
|
|
||||||
Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
|
Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
|
||||||
// No name preference + repo-root URI: the name follows the selected
|
Expect(modelConfig.Name).To(Equal("LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
// GGUF file, not the repo (issue #10587).
|
|
||||||
Expect(modelConfig.Name).To(Equal("localai-functioncall-qwen2.5-7b-v0.5-q4_k_m"), fmt.Sprintf("Model config: %+v", modelConfig))
|
|
||||||
Expect(modelConfig.Description).To(Equal("Imported from https://huggingface.co/mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Description).To(Equal("Imported from https://huggingface.co/mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: llama-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: llama-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(len(modelConfig.Files)).To(Equal(1), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(len(modelConfig.Files)).To(Equal(1), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[0].URI).To(Equal("https://huggingface.co/mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/resolve/main/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[0].URI).To(Equal("https://huggingface.co/mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/resolve/main/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[0].SHA256).To(Equal("4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[0].SHA256).To(Equal("4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
})
|
})
|
||||||
@@ -40,17 +38,16 @@ var _ = Describe("DiscoverModelConfig", func() {
|
|||||||
modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
|
modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
|
||||||
|
|
||||||
Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
|
Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
|
||||||
// No name preference: name follows the selected model GGUF (issue #10587).
|
Expect(modelConfig.Name).To(Equal("Qwen3-VL-2B-Instruct-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Name).To(Equal("Qwen3VL-2B-Instruct-Q4_K_M"), fmt.Sprintf("Model config: %+v", modelConfig))
|
|
||||||
Expect(modelConfig.Description).To(Equal("Imported from https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Description).To(Equal("Imported from https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: llama-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: llama-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/Qwen3VL-2B-Instruct-Q4_K_M/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/Qwen3-VL-2B-Instruct-GGUF/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("model: llama-cpp/models/Qwen3VL-2B-Instruct-Q4_K_M/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.ConfigFile).To(ContainSubstring("model: llama-cpp/models/Qwen3-VL-2B-Instruct-GGUF/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(len(modelConfig.Files)).To(Equal(2), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(len(modelConfig.Files)).To(Equal(2), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/Qwen3VL-2B-Instruct-Q4_K_M/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/Qwen3-VL-2B-Instruct-GGUF/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[0].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[0].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/Qwen3VL-2B-Instruct-Q4_K_M.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[0].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[0].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[1].Filename).To(Equal("llama-cpp/mmproj/Qwen3VL-2B-Instruct-Q4_K_M/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[1].Filename).To(Equal("llama-cpp/mmproj/Qwen3-VL-2B-Instruct-GGUF/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[1].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[1].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/mmproj-Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[1].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[1].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
})
|
})
|
||||||
@@ -62,17 +59,16 @@ var _ = Describe("DiscoverModelConfig", func() {
|
|||||||
modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
|
modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
|
||||||
|
|
||||||
Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
|
Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
|
||||||
// No name preference: name follows the selected Q8_0 model GGUF (issue #10587).
|
Expect(modelConfig.Name).To(Equal("Qwen3-VL-2B-Instruct-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Name).To(Equal("Qwen3VL-2B-Instruct-Q8_0"), fmt.Sprintf("Model config: %+v", modelConfig))
|
|
||||||
Expect(modelConfig.Description).To(Equal("Imported from https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Description).To(Equal("Imported from https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: llama-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: llama-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/Qwen3VL-2B-Instruct-Q8_0/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/Qwen3-VL-2B-Instruct-GGUF/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("model: llama-cpp/models/Qwen3VL-2B-Instruct-Q8_0/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.ConfigFile).To(ContainSubstring("model: llama-cpp/models/Qwen3-VL-2B-Instruct-GGUF/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(len(modelConfig.Files)).To(Equal(2), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(len(modelConfig.Files)).To(Equal(2), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/Qwen3VL-2B-Instruct-Q8_0/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/Qwen3-VL-2B-Instruct-GGUF/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[0].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[0].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/Qwen3VL-2B-Instruct-Q8_0.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[0].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[0].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[1].Filename).To(Equal("llama-cpp/mmproj/Qwen3VL-2B-Instruct-Q8_0/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[1].Filename).To(Equal("llama-cpp/mmproj/Qwen3-VL-2B-Instruct-GGUF/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[1].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[1].URI).To(Equal("https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-GGUF/resolve/main/mmproj-Qwen3VL-2B-Instruct-F16.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
Expect(modelConfig.Files[1].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
|
Expect(modelConfig.Files[1].SHA256).ToNot(BeEmpty(), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -98,13 +98,8 @@ func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// nameProvided tracks whether the user supplied an explicit model name.
|
name, ok := preferencesMap["name"].(string)
|
||||||
// When they didn't, the URI base is only a fallback: for a HuggingFace
|
if !ok {
|
||||||
// repo-root URI (no file component) it would be the repo name, so the HF
|
|
||||||
// branch below re-derives the name from the selected GGUF file instead
|
|
||||||
// (issue #10587).
|
|
||||||
name, nameProvided := preferencesMap["name"].(string)
|
|
||||||
if !nameProvided {
|
|
||||||
name = filepath.Base(details.URI)
|
name = filepath.Base(details.URI)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -232,23 +227,10 @@ func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error)
|
|||||||
mmprojGroups := hfapi.GroupShards(mmprojFiles)
|
mmprojGroups := hfapi.GroupShards(mmprojFiles)
|
||||||
ggufGroups := hfapi.GroupShards(ggufFiles)
|
ggufGroups := hfapi.GroupShards(ggufFiles)
|
||||||
|
|
||||||
modelGroup := pickPreferredGroup(ggufGroups, quants)
|
|
||||||
|
|
||||||
// A repo-root URI has no file component, so the URI-base fallback
|
|
||||||
// above produced the repo name. When the user left the name blank,
|
|
||||||
// derive it from the GGUF file actually selected from the listing so
|
|
||||||
// the gallery entry and `model:` directory reflect the model, not the
|
|
||||||
// repository (issue #10587). An explicit name preference always wins.
|
|
||||||
if !nameProvided && modelGroup != nil {
|
|
||||||
name = modelNameFromShardGroup(*modelGroup)
|
|
||||||
modelConfig.Name = name
|
|
||||||
cfg.Name = name
|
|
||||||
}
|
|
||||||
|
|
||||||
// Emit the model group first so cfg.Files[0] is the model — callers
|
// Emit the model group first so cfg.Files[0] is the model — callers
|
||||||
// and tests rely on the model file preceding any mmproj companion.
|
// and tests rely on the model file preceding any mmproj companion.
|
||||||
if modelGroup != nil {
|
if group := pickPreferredGroup(ggufGroups, quants); group != nil {
|
||||||
appendShardGroup(&cfg, *modelGroup, filepath.Join("llama-cpp", "models", name))
|
appendShardGroup(&cfg, *group, filepath.Join("llama-cpp", "models", name))
|
||||||
}
|
}
|
||||||
if group := pickPreferredGroup(mmprojGroups, mmprojQuantsList); group != nil {
|
if group := pickPreferredGroup(mmprojGroups, mmprojQuantsList); group != nil {
|
||||||
appendShardGroup(&cfg, *group, filepath.Join("llama-cpp", "mmproj", name))
|
appendShardGroup(&cfg, *group, filepath.Join("llama-cpp", "mmproj", name))
|
||||||
@@ -299,20 +281,6 @@ func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error)
|
|||||||
return cfg, nil
|
return cfg, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// modelNameFromShardGroup derives a human-facing model name from the picked
|
|
||||||
// GGUF group: the logical base filename with its .gguf extension stripped.
|
|
||||||
// ShardGroup.Base is the common prefix for sharded sets (without the
|
|
||||||
// -NNNNN-of-MMMMM suffix) and the sole basename for single-file models, so
|
|
||||||
// this yields a clean name like "model-Q4_K_M" rather than an individual
|
|
||||||
// shard filename or the repo-root URI base.
|
|
||||||
func modelNameFromShardGroup(group hfapi.ShardGroup) string {
|
|
||||||
base := group.Base
|
|
||||||
if ext := filepath.Ext(base); strings.EqualFold(ext, ".gguf") {
|
|
||||||
base = strings.TrimSuffix(base, ext)
|
|
||||||
}
|
|
||||||
return base
|
|
||||||
}
|
|
||||||
|
|
||||||
// pickPreferredGroup walks the preference list in priority order and returns
|
// pickPreferredGroup walks the preference list in priority order and returns
|
||||||
// the first group whose base filename contains any preference. When nothing
|
// the first group whose base filename contains any preference. When nothing
|
||||||
// matches, the last group wins — this preserves the historical "if the user
|
// matches, the last group wins — this preserves the historical "if the user
|
||||||
|
|||||||
@@ -372,62 +372,6 @@ var _ = Describe("LlamaCPPImporter", func() {
|
|||||||
Expect(err).ToNot(HaveOccurred())
|
Expect(err).ToNot(HaveOccurred())
|
||||||
Expect(modelConfig.Files).To(BeEmpty())
|
Expect(modelConfig.Files).To(BeEmpty())
|
||||||
})
|
})
|
||||||
|
|
||||||
It("derives the model name from the selected GGUF when no name is given", func() {
|
|
||||||
// Regression for #10587: a repo-root URI has no file component, so
|
|
||||||
// the URI base ("example-GGUF") is just the repo name. With the
|
|
||||||
// name field left blank, the emitted name and model directory must
|
|
||||||
// follow the GGUF file actually selected, not the repository.
|
|
||||||
details := withHF(`{"quantizations":"Q4_K_M"}`,
|
|
||||||
hfFile("Meta-Llama-3-8B-Instruct.Q4_K_M.gguf", "aaa"),
|
|
||||||
hfFile("Meta-Llama-3-8B-Instruct.Q3_K_M.gguf", "bbb"),
|
|
||||||
)
|
|
||||||
|
|
||||||
modelConfig, err := importer.Import(details)
|
|
||||||
|
|
||||||
Expect(err).ToNot(HaveOccurred())
|
|
||||||
Expect(modelConfig.Name).To(Equal("Meta-Llama-3-8B-Instruct.Q4_K_M"))
|
|
||||||
Expect(modelConfig.Files).To(HaveLen(1), fmt.Sprintf("%+v", modelConfig))
|
|
||||||
Expect(modelConfig.Files[0].Filename).To(Equal(
|
|
||||||
"llama-cpp/models/Meta-Llama-3-8B-Instruct.Q4_K_M/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"))
|
|
||||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("name: Meta-Llama-3-8B-Instruct.Q4_K_M"))
|
|
||||||
Expect(modelConfig.ConfigFile).To(ContainSubstring(
|
|
||||||
"model: llama-cpp/models/Meta-Llama-3-8B-Instruct.Q4_K_M/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("derives a clean name from the shard base for split GGUFs when no name is given", func() {
|
|
||||||
// The selected primary file is shard 1; using its raw basename
|
|
||||||
// would leak the -00001-of-00002 suffix into the name. The shard
|
|
||||||
// base must be used so the name is the logical model.
|
|
||||||
details := withHF(``,
|
|
||||||
hfFile("Qwen3-30B-A3B-Q4_K_M-00001-of-00002.gguf", "p1"),
|
|
||||||
hfFile("Qwen3-30B-A3B-Q4_K_M-00002-of-00002.gguf", "p2"),
|
|
||||||
)
|
|
||||||
|
|
||||||
modelConfig, err := importer.Import(details)
|
|
||||||
|
|
||||||
Expect(err).ToNot(HaveOccurred())
|
|
||||||
Expect(modelConfig.Name).To(Equal("Qwen3-30B-A3B-Q4_K_M"))
|
|
||||||
Expect(modelConfig.Files).To(HaveLen(2), fmt.Sprintf("%+v", modelConfig))
|
|
||||||
Expect(modelConfig.Files[0].Filename).To(Equal(
|
|
||||||
"llama-cpp/models/Qwen3-30B-A3B-Q4_K_M/Qwen3-30B-A3B-Q4_K_M-00001-of-00002.gguf"))
|
|
||||||
Expect(modelConfig.ConfigFile).To(ContainSubstring(
|
|
||||||
"model: llama-cpp/models/Qwen3-30B-A3B-Q4_K_M/Qwen3-30B-A3B-Q4_K_M-00001-of-00002.gguf"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("keeps an explicit name over the selected GGUF filename", func() {
|
|
||||||
// Precedence guard: when the user supplies a name it always wins,
|
|
||||||
// even though a GGUF file was selected from the listing.
|
|
||||||
details := withHF(`{"name":"my-custom-name","quantizations":"Q4_K_M"}`,
|
|
||||||
hfFile("model-Q4_K_M.gguf", "aaa"),
|
|
||||||
)
|
|
||||||
|
|
||||||
modelConfig, err := importer.Import(details)
|
|
||||||
|
|
||||||
Expect(err).ToNot(HaveOccurred())
|
|
||||||
Expect(modelConfig.Name).To(Equal("my-custom-name"))
|
|
||||||
Expect(modelConfig.Files[0].Filename).To(Equal("llama-cpp/models/my-custom-name/model-Q4_K_M.gguf"))
|
|
||||||
})
|
|
||||||
})
|
})
|
||||||
|
|
||||||
Context("quant token boundary matching", func() {
|
Context("quant token boundary matching", func() {
|
||||||
|
|||||||
@@ -618,10 +618,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
|||||||
finishReason = FinishReasonToolCalls
|
finishReason = FinishReasonToolCalls
|
||||||
} else if toolsCalled {
|
} else if toolsCalled {
|
||||||
finishReason = FinishReasonFunctionCall
|
finishReason = FinishReasonFunctionCall
|
||||||
} else if reachedTokenBudget(finalUsage.Completion, config.Maxtokens) {
|
|
||||||
// Generation stopped because it hit the max_tokens ceiling
|
|
||||||
// rather than a natural stop — report "length" (issue #9716).
|
|
||||||
finishReason = FinishReasonLength
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Final delta chunk: empty delta with finish_reason set. Per
|
// Final delta chunk: empty delta with finish_reason set. Per
|
||||||
@@ -988,18 +984,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If generation hit the max_tokens ceiling, report "length"
|
|
||||||
// instead of a natural "stop" (issue #9716). Mirrors the
|
|
||||||
// streaming path; tool/function finish reasons are untouched.
|
|
||||||
if reachedTokenBudget(tokenUsage.Completion, config.Maxtokens) {
|
|
||||||
for i := range result {
|
|
||||||
if result[i].FinishReason != nil && *result[i].FinishReason == FinishReasonStop {
|
|
||||||
lengthReason := FinishReasonLength
|
|
||||||
result[i].FinishReason = &lengthReason
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// No MCP tools to execute (or no MCP tools configured), return response
|
// No MCP tools to execute (or no MCP tools configured), return response
|
||||||
usage := schema.OpenAIUsage{
|
usage := schema.OpenAIUsage{
|
||||||
PromptTokens: tokenUsage.Prompt,
|
PromptTokens: tokenUsage.Prompt,
|
||||||
|
|||||||
@@ -1,149 +0,0 @@
|
|||||||
// Package compactcoord is the explicit state machine for the realtime API's
|
|
||||||
// conversation-compaction concern (machine "M4" in
|
|
||||||
// docs/design/realtime-state-machines.md).
|
|
||||||
//
|
|
||||||
// In the legacy code this machine is an implicit single-flight guard: a
|
|
||||||
// per-conversation `compacting atomic.Bool` that maybeCompact CAS-flips to start
|
|
||||||
// a background summarize+evict and a deferred Store(false) clears. The intent —
|
|
||||||
// at most one compaction running per conversation at a time, so two goroutines
|
|
||||||
// never summarize and evict the same overflow concurrently (Part 4, invariant
|
|
||||||
// #9) — is correct but implicit in a bare atomic.
|
|
||||||
//
|
|
||||||
// This package makes it explicit:
|
|
||||||
// - a sealed sum type for State (Idle | Running | Terminated) — "two compactions running" is
|
|
||||||
// unrepresentable,
|
|
||||||
// - a total, pure transition function Next(state, event) -> (state, effects),
|
|
||||||
// - a single-writer Coordinator that serializes every transition.
|
|
||||||
//
|
|
||||||
// Unlike respcoord (M3), a Trigger while Running is NOT a supersede: compaction
|
|
||||||
// is idempotent work on the same overflow, so a concurrent trigger is simply
|
|
||||||
// dropped (matching the legacy CAS-fails-so-skip), not queued or restarted.
|
|
||||||
package compactcoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
|
|
||||||
)
|
|
||||||
|
|
||||||
// State is the sealed sum type of compaction states. Exhaustively:
|
|
||||||
// Idle | Running | Terminated.
|
|
||||||
type State interface {
|
|
||||||
isState()
|
|
||||||
String() string
|
|
||||||
}
|
|
||||||
|
|
||||||
// Idle: no compaction is running.
|
|
||||||
type Idle struct{}
|
|
||||||
|
|
||||||
// Running: exactly one compaction is in flight.
|
|
||||||
type Running struct{}
|
|
||||||
|
|
||||||
// Terminated: the conversation/session is torn down. Absorbing — no compaction
|
|
||||||
// can start from here, so the M1 (connection) parent's teardown can cancel +
|
|
||||||
// join the in-flight compaction and guarantee none outlives the session (see
|
|
||||||
// formal-verification/session_lifecycle.fizz). This closes the legacy gap where
|
|
||||||
// the fire-and-forget compaction goroutine could outlive the session.
|
|
||||||
type Terminated struct{}
|
|
||||||
|
|
||||||
func (Idle) isState() {}
|
|
||||||
func (Running) isState() {}
|
|
||||||
func (Terminated) isState() {}
|
|
||||||
|
|
||||||
func (Idle) String() string { return "Idle" }
|
|
||||||
func (Running) String() string { return "Running" }
|
|
||||||
func (Terminated) String() string { return "Terminated" }
|
|
||||||
|
|
||||||
// Event is the sealed sum type of inputs. Exhaustively:
|
|
||||||
// Trigger | Finished | Shutdown.
|
|
||||||
type Event interface {
|
|
||||||
isEvent()
|
|
||||||
String() string
|
|
||||||
}
|
|
||||||
|
|
||||||
// Trigger requests a compaction (the live buffer grew past the trigger). It
|
|
||||||
// starts one only when Idle; while Running it is a no-op (single-flight).
|
|
||||||
type Trigger struct{}
|
|
||||||
|
|
||||||
// Finished reports that the running compaction goroutine finished (success, error, or
|
|
||||||
// timeout — it always reports Finished so the flag can never stick).
|
|
||||||
type Finished struct{}
|
|
||||||
|
|
||||||
// Shutdown terminates the coordinator at teardown: the in-flight compaction is
|
|
||||||
// cancelled + joined by the sink, and no compaction can start afterwards.
|
|
||||||
type Shutdown struct{}
|
|
||||||
|
|
||||||
func (Trigger) isEvent() {}
|
|
||||||
func (Finished) isEvent() {}
|
|
||||||
func (Shutdown) isEvent() {}
|
|
||||||
|
|
||||||
func (Trigger) String() string { return "Trigger" }
|
|
||||||
func (Finished) String() string { return "Finished" }
|
|
||||||
func (Shutdown) String() string { return "Shutdown" }
|
|
||||||
|
|
||||||
// Effect is a side effect returned by Next as data. Exhaustively: StartCompaction.
|
|
||||||
type Effect interface {
|
|
||||||
isEffect()
|
|
||||||
String() string
|
|
||||||
}
|
|
||||||
|
|
||||||
// StartCompaction: spawn the background summarize+evict goroutine.
|
|
||||||
type StartCompaction struct{}
|
|
||||||
|
|
||||||
func (StartCompaction) isEffect() {}
|
|
||||||
|
|
||||||
func (StartCompaction) String() string { return "StartCompaction" }
|
|
||||||
|
|
||||||
// Next is the total, pure transition function. For every (state, event) it
|
|
||||||
// returns the next state and the ordered effects. It returns a non-nil error
|
|
||||||
// only for an unknown State/Event implementation. Every in-domain pair is
|
|
||||||
// defined; there are no forbidden transitions, only no-ops.
|
|
||||||
//
|
|
||||||
// Single-flight crux: StartCompaction is emitted only on Idle+Trigger, and a
|
|
||||||
// Trigger while Running is a no-op — so at most one compaction ever runs.
|
|
||||||
func Next(s State, e Event) (State, []Effect, error) {
|
|
||||||
switch s.(type) {
|
|
||||||
case Idle:
|
|
||||||
switch e.(type) {
|
|
||||||
case Trigger:
|
|
||||||
return Running{}, []Effect{StartCompaction{}}, nil
|
|
||||||
case Finished:
|
|
||||||
// No compaction to finish: stale/idempotent no-op.
|
|
||||||
return Idle{}, nil, nil
|
|
||||||
case Shutdown:
|
|
||||||
return Terminated{}, nil, nil
|
|
||||||
}
|
|
||||||
case Running:
|
|
||||||
switch e.(type) {
|
|
||||||
case Trigger:
|
|
||||||
// Already compacting: drop (single-flight).
|
|
||||||
return Running{}, nil, nil
|
|
||||||
case Finished:
|
|
||||||
return Idle{}, nil, nil
|
|
||||||
case Shutdown:
|
|
||||||
// Teardown while compacting: the sink cancels + joins the goroutine,
|
|
||||||
// so its later Finished is absorbed by the Terminated case below.
|
|
||||||
return Terminated{}, nil, nil
|
|
||||||
}
|
|
||||||
case Terminated:
|
|
||||||
// Absorbing: a Trigger after teardown is rejected (no StartCompaction), so
|
|
||||||
// no compaction outlives the session.
|
|
||||||
switch e.(type) {
|
|
||||||
case Trigger, Finished, Shutdown:
|
|
||||||
return Terminated{}, nil, nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return s, nil, fmt.Errorf("compactcoord: unhandled transition %s <- %s", s, e)
|
|
||||||
}
|
|
||||||
|
|
||||||
// EffectSink performs the effects produced by a transition. See coordinator.Sink:
|
|
||||||
// StartCompaction spawns a goroutine, so Perform does not block under the lock.
|
|
||||||
type EffectSink = coordinator.Sink[Effect]
|
|
||||||
|
|
||||||
// Coordinator serializes the compaction transitions. See coordinator.Coordinator.
|
|
||||||
type Coordinator = coordinator.Coordinator[State, Event, Effect]
|
|
||||||
|
|
||||||
// New returns an idle Coordinator that performs effects via sink.
|
|
||||||
func New(sink EffectSink) *Coordinator {
|
|
||||||
return coordinator.New[State, Event, Effect](Idle{}, Next, sink)
|
|
||||||
}
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
package compactcoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"testing"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestCompactcoord(t *testing.T) {
|
|
||||||
RegisterFailHandler(Fail)
|
|
||||||
RunSpecs(t, "compactcoord (realtime M4) Suite")
|
|
||||||
}
|
|
||||||
@@ -1,202 +0,0 @@
|
|||||||
package compactcoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"math/rand/v2"
|
|
||||||
"sync"
|
|
||||||
"sync/atomic"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
// recordingSink captures the ordered stream of effects. Perform is called under
|
|
||||||
// the coordinator lock; the mutex here guards reads from the spec goroutine.
|
|
||||||
type recordingSink struct {
|
|
||||||
mu sync.Mutex
|
|
||||||
log []Effect
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *recordingSink) Perform(e Effect) {
|
|
||||||
s.mu.Lock()
|
|
||||||
s.log = append(s.log, e)
|
|
||||||
s.mu.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *recordingSink) count() int {
|
|
||||||
s.mu.Lock()
|
|
||||||
defer s.mu.Unlock()
|
|
||||||
return len(s.log)
|
|
||||||
}
|
|
||||||
|
|
||||||
type unknownEvent struct{}
|
|
||||||
|
|
||||||
func (unknownEvent) isEvent() {}
|
|
||||||
func (unknownEvent) String() string { return "unknownEvent" }
|
|
||||||
|
|
||||||
type unknownState struct{}
|
|
||||||
|
|
||||||
func (unknownState) isState() {}
|
|
||||||
func (unknownState) String() string { return "unknownState" }
|
|
||||||
|
|
||||||
var _ = Describe("compactcoord.Next", func() {
|
|
||||||
DescribeTable("transitions",
|
|
||||||
func(state State, event Event, wantState State, wantEff []Effect) {
|
|
||||||
gotState, gotEff, err := Next(state, event)
|
|
||||||
Expect(err).NotTo(HaveOccurred())
|
|
||||||
Expect(gotState).To(Equal(wantState))
|
|
||||||
Expect(gotEff).To(Equal(wantEff))
|
|
||||||
},
|
|
||||||
Entry("idle+trigger -> running: start",
|
|
||||||
Idle{}, Trigger{}, Running{}, []Effect{StartCompaction{}}),
|
|
||||||
Entry("idle+finished -> idle, no-op (stale)",
|
|
||||||
Idle{}, Finished{}, Idle{}, []Effect(nil)),
|
|
||||||
Entry("running+trigger -> running, no-op (single-flight)",
|
|
||||||
Running{}, Trigger{}, Running{}, []Effect(nil)),
|
|
||||||
Entry("running+finished -> idle",
|
|
||||||
Running{}, Finished{}, Idle{}, []Effect(nil)),
|
|
||||||
Entry("idle+shutdown -> terminated",
|
|
||||||
Idle{}, Shutdown{}, Terminated{}, []Effect(nil)),
|
|
||||||
Entry("running+shutdown -> terminated",
|
|
||||||
Running{}, Shutdown{}, Terminated{}, []Effect(nil)),
|
|
||||||
Entry("terminated+trigger -> terminated, REJECTED",
|
|
||||||
Terminated{}, Trigger{}, Terminated{}, []Effect(nil)),
|
|
||||||
Entry("terminated+finished -> terminated, no-op (stale)",
|
|
||||||
Terminated{}, Finished{}, Terminated{}, []Effect(nil)),
|
|
||||||
Entry("terminated+shutdown -> terminated, idempotent",
|
|
||||||
Terminated{}, Shutdown{}, Terminated{}, []Effect(nil)),
|
|
||||||
)
|
|
||||||
|
|
||||||
It("is total over the defined (state, event) pairs", func() {
|
|
||||||
for _, s := range []State{Idle{}, Running{}, Terminated{}} {
|
|
||||||
for _, e := range []Event{Trigger{}, Finished{}, Shutdown{}} {
|
|
||||||
_, _, err := Next(s, e)
|
|
||||||
Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
It("errors on an unknown event type", func() {
|
|
||||||
_, _, err := Next(Idle{}, unknownEvent{})
|
|
||||||
Expect(err).To(HaveOccurred())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("errors on an unknown state type", func() {
|
|
||||||
_, _, err := Next(unknownState{}, Trigger{})
|
|
||||||
Expect(err).To(HaveOccurred())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
var _ = Describe("compactcoord.Coordinator", func() {
|
|
||||||
// A StartCompaction is only ever produced while Idle (verified by checking the
|
|
||||||
// effect count grows exactly when the model transitions Idle->Running), so at
|
|
||||||
// most one compaction is ever in flight.
|
|
||||||
It("starts at most one compaction at a time over random sequences", func() {
|
|
||||||
seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
|
|
||||||
for _, seed := range seeds {
|
|
||||||
r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New(sink)
|
|
||||||
running := false
|
|
||||||
starts := 0
|
|
||||||
|
|
||||||
for range 5000 {
|
|
||||||
if r.IntN(2) == 0 {
|
|
||||||
before := sink.count()
|
|
||||||
Expect(c.Apply(Trigger{})).To(Succeed())
|
|
||||||
if sink.count() > before {
|
|
||||||
// A StartCompaction was produced: must have been Idle.
|
|
||||||
Expect(running).To(BeFalse(), "seed=%d: started while already running", seed)
|
|
||||||
running = true
|
|
||||||
starts++
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
Expect(c.Apply(Finished{})).To(Succeed())
|
|
||||||
running = false
|
|
||||||
}
|
|
||||||
if running {
|
|
||||||
Expect(c.State()).To(Equal(State(Running{})), "seed=%d", seed)
|
|
||||||
} else {
|
|
||||||
Expect(c.State()).To(Equal(State(Idle{})), "seed=%d", seed)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Expect(starts).To(BeNumerically(">", 0), "seed=%d: walk should have started at least one", seed)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
// Faithful concurrent test: StartCompaction spawns "work" that bumps an active
|
|
||||||
// counter, runs, and reports Finished back to the coordinator (exactly how the
|
|
||||||
// real sink behaves). Single-flight must hold even under many concurrent
|
|
||||||
// Triggers: the active counter never exceeds 1. Run under -race.
|
|
||||||
It("never runs two compactions concurrently", func() {
|
|
||||||
var active, maxActive int32
|
|
||||||
var c *Coordinator
|
|
||||||
var work sync.WaitGroup
|
|
||||||
sink := &spawnSink{onStart: func() {
|
|
||||||
work.Add(1)
|
|
||||||
go func() {
|
|
||||||
defer work.Done()
|
|
||||||
n := atomic.AddInt32(&active, 1)
|
|
||||||
for {
|
|
||||||
m := atomic.LoadInt32(&maxActive)
|
|
||||||
if n <= m || atomic.CompareAndSwapInt32(&maxActive, m, n) {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
atomic.AddInt32(&active, -1)
|
|
||||||
_ = c.Apply(Finished{})
|
|
||||||
}()
|
|
||||||
}}
|
|
||||||
c = New(sink)
|
|
||||||
|
|
||||||
var wg sync.WaitGroup
|
|
||||||
for g := 0; g < 8; g++ {
|
|
||||||
wg.Add(1)
|
|
||||||
go func() {
|
|
||||||
defer wg.Done()
|
|
||||||
for range 1000 {
|
|
||||||
_ = c.Apply(Trigger{})
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
}
|
|
||||||
wg.Wait()
|
|
||||||
work.Wait() // let any in-flight compaction report Finished
|
|
||||||
|
|
||||||
Expect(atomic.LoadInt32(&maxActive)).To(BeNumerically("<=", 1))
|
|
||||||
Expect(c.State()).To(Equal(State(Idle{})))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("terminates on shutdown and rejects later triggers", func() {
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New(sink)
|
|
||||||
Expect(c.Apply(Trigger{})).To(Succeed()) // Idle -> Running (StartCompaction)
|
|
||||||
Expect(c.Apply(Shutdown{})).To(Succeed())
|
|
||||||
Expect(c.State()).To(Equal(State(Terminated{})))
|
|
||||||
|
|
||||||
before := sink.count()
|
|
||||||
Expect(c.Apply(Trigger{})).To(Succeed()) // rejected
|
|
||||||
Expect(sink.count()).To(Equal(before), "no StartCompaction after shutdown")
|
|
||||||
Expect(c.Apply(Finished{})).To(Succeed()) // stale, absorbed
|
|
||||||
Expect(c.State()).To(Equal(State(Terminated{})))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
// spawnSink invokes onStart for each StartCompaction (called under the coord lock;
|
|
||||||
// onStart must be non-blocking — it spawns the work goroutine).
|
|
||||||
type spawnSink struct{ onStart func() }
|
|
||||||
|
|
||||||
func (s *spawnSink) Perform(e Effect) {
|
|
||||||
if _, ok := e.(StartCompaction); ok {
|
|
||||||
s.onStart()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var _ = DescribeTable("compactcoord stringers",
|
|
||||||
func(got, want string) { Expect(got).To(Equal(want)) },
|
|
||||||
Entry(nil, Idle{}.String(), "Idle"),
|
|
||||||
Entry(nil, Running{}.String(), "Running"),
|
|
||||||
Entry(nil, Terminated{}.String(), "Terminated"),
|
|
||||||
Entry(nil, Trigger{}.String(), "Trigger"),
|
|
||||||
Entry(nil, Finished{}.String(), "Finished"),
|
|
||||||
Entry(nil, Shutdown{}.String(), "Shutdown"),
|
|
||||||
Entry(nil, StartCompaction{}.String(), "StartCompaction"),
|
|
||||||
)
|
|
||||||
@@ -1,164 +0,0 @@
|
|||||||
// Package conncoord is the explicit state machine for the realtime API's
|
|
||||||
// connection lifecycle (machine "M1" in docs/design/realtime-state-machines.md).
|
|
||||||
//
|
|
||||||
// In the legacy code this machine is implicit and fragile. The session handler
|
|
||||||
// keeps a `vadServerStarted` bool plus a `done` channel that is REASSIGNED to a
|
|
||||||
// fresh channel every time turn detection is toggled on (session.update) and
|
|
||||||
// closed both at toggle-off and at teardown (Part 2, failure mode 6). It is
|
|
||||||
// correct today only because one goroutine owns it; "one variable name meaning
|
|
||||||
// different channels over time, closed from two sites guarded by a bool" is a
|
|
||||||
// structural hazard, not an explicit lifecycle. Teardown likewise depends on the
|
|
||||||
// bool to avoid closing an already-closed channel.
|
|
||||||
//
|
|
||||||
// This package makes the lifecycle explicit:
|
|
||||||
// - a sealed sum type for State (Live{VADRunning} | Torn) — illegal states
|
|
||||||
// such as "running after teardown" are unrepresentable,
|
|
||||||
// - a total, pure transition function Next(state, event) -> (state, effects),
|
|
||||||
// - a single-writer Coordinator that serializes every transition.
|
|
||||||
//
|
|
||||||
// The guarantees the spec checks:
|
|
||||||
// - the VAD goroutine's done channel is closed exactly once per start (StopVAD
|
|
||||||
// is emitted only while running, so never a double close / close of nil),
|
|
||||||
// - teardown runs exactly once (Close from Live; any later Close is a no-op),
|
|
||||||
// - nothing is started after teardown (no resurrection / no send-after-close).
|
|
||||||
//
|
|
||||||
// Like turncoord (M2), the connection machine is driven by the single session
|
|
||||||
// goroutine; the Coordinator's lock keeps State() race-free and guards against a
|
|
||||||
// future second writer. The effects are performed by a sink that owns the actual
|
|
||||||
// channels/goroutines (see realtime_conncoord.go).
|
|
||||||
package conncoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
|
|
||||||
)
|
|
||||||
|
|
||||||
// State is the closed set of connection-lifecycle states. The unexported
// isState marker restricts implementations to the types declared in this
// package, which are exactly Live and Torn.
type State interface {
	isState()
	fmt.Stringer
}

type (
	// Live is the active-session state. VADRunning records whether the
	// turn-detection (handleVAD) goroutine is currently running; it is the
	// single source of truth that stands in for the legacy vadServerStarted
	// bool, so the per-run done channel is closed exactly once.
	Live struct {
		VADRunning bool
	}

	// Torn is the terminal state reached after teardown. No transition out of
	// Torn ever produces an effect.
	Torn struct{}
)

// Marker methods sealing the State sum type.
func (Live) isState() {}
func (Torn) isState() {}

// String renders the state together with its VAD flag, e.g. "Live(vad=true)".
func (s Live) String() string {
	return fmt.Sprintf("Live(vad=%t)", s.VADRunning)
}

// String renders the terminal state as "Torn".
func (Torn) String() string {
	return "Torn"
}
|
|
||||||
|
|
||||||
// Event is the closed set of inputs accepted by the connection machine. The
// unexported isEvent marker restricts implementations to this package; the
// complete set is SetVAD and Close.
type Event interface {
	isEvent()
	fmt.Stringer
}

type (
	// SetVAD asks for the turn-detection goroutine to be running (Active) or
	// stopped. It is raised whenever session.update changes whether turn
	// detection is active, and it is idempotent: requesting the state the
	// machine is already in is a no-op.
	SetVAD struct {
		Active bool
	}

	// Close asks for teardown (the transport read loop ended, or the session
	// is closing). It is idempotent: only the first Close from Live tears
	// down.
	Close struct{}
)

// Marker methods sealing the Event sum type.
func (SetVAD) isEvent() {}
func (Close) isEvent()  {}

// String renders the event with its requested flag, e.g. "SetVAD(true)".
func (e SetVAD) String() string {
	return fmt.Sprintf("SetVAD(%t)", e.Active)
}

// String renders the event as "Close".
func (Close) String() string {
	return "Close"
}
|
|
||||||
|
|
||||||
// Effect is a side effect that Next returns as plain data for the caller to
// carry out. The unexported isEffect marker restricts implementations to this
// package; the complete set is StartVAD, StopVAD and Teardown.
type Effect interface {
	isEffect()
	String() string
}

type (
	// StartVAD: create a fresh done channel and spawn the handleVAD goroutine
	// on it.
	StartVAD struct{}

	// StopVAD: close the running VAD goroutine's done channel, signalling it
	// to exit.
	StopVAD struct{}

	// Teardown: the once-only teardown — stop the remaining input goroutines
	// (opus decode, sound window), join them, cancel in-flight responses, and
	// remove the session from the registry. Emitted exactly once.
	Teardown struct{}
)

// Marker methods sealing the Effect sum type.
func (StartVAD) isEffect() {}
func (StopVAD) isEffect()  {}
func (Teardown) isEffect() {}

// String returns the effect's name.
func (StartVAD) String() string {
	return "StartVAD"
}

// String returns the effect's name.
func (StopVAD) String() string {
	return "StopVAD"
}

// String returns the effect's name.
func (Teardown) String() string {
	return "Teardown"
}
|
|
||||||
|
|
||||||
// Next is the total, pure transition function. For every (state, event) it
|
|
||||||
// returns the next state and the ordered effects to perform. It returns a
|
|
||||||
// non-nil error only for an unknown State/Event implementation. Every in-domain
|
|
||||||
// pair is defined; there are no forbidden transitions, only no-ops.
|
|
||||||
//
|
|
||||||
// The crux: Close moves to Torn, which absorbs every later event with no
|
|
||||||
// effects. So teardown's channel closes happen exactly once even if Close is
|
|
||||||
// raised again (e.g. an error path and the normal return both reaching it), and
|
|
||||||
// no StartVAD can resurrect a torn session.
|
|
||||||
func Next(s State, e Event) (State, []Effect, error) {
|
|
||||||
switch st := s.(type) {
|
|
||||||
case Live:
|
|
||||||
switch ev := e.(type) {
|
|
||||||
case SetVAD:
|
|
||||||
switch {
|
|
||||||
case ev.Active && !st.VADRunning:
|
|
||||||
return Live{VADRunning: true}, []Effect{StartVAD{}}, nil
|
|
||||||
case !ev.Active && st.VADRunning:
|
|
||||||
return Live{VADRunning: false}, []Effect{StopVAD{}}, nil
|
|
||||||
default:
|
|
||||||
// Already in the requested state: idempotent no-op.
|
|
||||||
return Live{VADRunning: st.VADRunning}, nil, nil
|
|
||||||
}
|
|
||||||
case Close:
|
|
||||||
if st.VADRunning {
|
|
||||||
return Torn{}, []Effect{StopVAD{}, Teardown{}}, nil
|
|
||||||
}
|
|
||||||
return Torn{}, []Effect{Teardown{}}, nil
|
|
||||||
}
|
|
||||||
case Torn:
|
|
||||||
switch e.(type) {
|
|
||||||
case SetVAD:
|
|
||||||
// No resurrection: a toggle after teardown is ignored.
|
|
||||||
return Torn{}, nil, nil
|
|
||||||
case Close:
|
|
||||||
// Idempotent: teardown already ran.
|
|
||||||
return Torn{}, nil, nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return s, nil, fmt.Errorf("conncoord: unhandled transition %s <- %s", s, e)
|
|
||||||
}
|
|
||||||
|
|
||||||
// EffectSink performs the effects produced by a transition. See coordinator.Sink:
|
|
||||||
// Perform runs under the coordinator lock. The Teardown effect does join
|
|
||||||
// goroutines (which can block) — acceptable here because the connection
|
|
||||||
// coordinator is single-writer and torn down exactly once at the end of the
|
|
||||||
// session goroutine, so no other Apply is contending the lock.
|
|
||||||
type EffectSink = coordinator.Sink[Effect]
|
|
||||||
|
|
||||||
// Coordinator serializes the connection-lifecycle transitions.
|
|
||||||
// See coordinator.Coordinator.
|
|
||||||
type Coordinator = coordinator.Coordinator[State, Event, Effect]
|
|
||||||
|
|
||||||
// New returns a Coordinator in Live{VADRunning:false} that performs effects via
|
|
||||||
// sink.
|
|
||||||
func New(sink EffectSink) *Coordinator {
|
|
||||||
return coordinator.New[State, Event, Effect](Live{VADRunning: false}, Next, sink)
|
|
||||||
}
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
package conncoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"testing"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestConncoord(t *testing.T) {
|
|
||||||
RegisterFailHandler(Fail)
|
|
||||||
RunSpecs(t, "conncoord (realtime M1) Suite")
|
|
||||||
}
|
|
||||||
@@ -1,212 +0,0 @@
|
|||||||
package conncoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"math/rand/v2"
|
|
||||||
"sync"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
// recordingSink captures the ordered stream of effects so the invariants can be
|
|
||||||
// checked independently of the transition function. Perform is called by
|
|
||||||
// Coordinator.Apply under the coordinator lock; the mutex here only guards reads
|
|
||||||
// from the spec goroutine.
|
|
||||||
type recordingSink struct {
|
|
||||||
mu sync.Mutex
|
|
||||||
log []Effect
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *recordingSink) Perform(e Effect) {
|
|
||||||
s.mu.Lock()
|
|
||||||
s.log = append(s.log, e)
|
|
||||||
s.mu.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *recordingSink) snapshot() []Effect {
|
|
||||||
s.mu.Lock()
|
|
||||||
defer s.mu.Unlock()
|
|
||||||
out := make([]Effect, len(s.log))
|
|
||||||
copy(out, s.log)
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
// checkLog replays the effect log and asserts the lifecycle safety properties
|
|
||||||
// from docs/design/realtime-state-machines.md, Part 4 (invariants #8, #10 and
|
|
||||||
// failure mode 6):
|
|
||||||
//
|
|
||||||
// (1) the VAD done channel is closed exactly once per start -- StartVAD only
|
|
||||||
// while stopped, StopVAD only while running (no double close / close-of-nil);
|
|
||||||
// (2) teardown runs at most once;
|
|
||||||
// (3) no resurrection -- no StartVAD after Teardown.
|
|
||||||
func checkLog(log []Effect) {
|
|
||||||
running := false
|
|
||||||
torn := false
|
|
||||||
teardowns := 0
|
|
||||||
for i, eff := range log {
|
|
||||||
switch eff.(type) {
|
|
||||||
case StartVAD:
|
|
||||||
Expect(torn).To(BeFalse(), "invariant (3): StartVAD after teardown (effect #%d)\nlog=%v", i, log)
|
|
||||||
Expect(running).To(BeFalse(), "invariant (1): StartVAD while already running (effect #%d)\nlog=%v", i, log)
|
|
||||||
running = true
|
|
||||||
case StopVAD:
|
|
||||||
Expect(running).To(BeTrue(), "invariant (1): StopVAD while not running (effect #%d)\nlog=%v", i, log)
|
|
||||||
running = false
|
|
||||||
case Teardown:
|
|
||||||
Expect(torn).To(BeFalse(), "invariant (2): Teardown twice (effect #%d)\nlog=%v", i, log)
|
|
||||||
torn = true
|
|
||||||
teardowns++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Expect(teardowns).To(BeNumerically("<=", 1), "invariant (2): teardown ran %d times\nlog=%v", teardowns, log)
|
|
||||||
}
|
|
||||||
|
|
||||||
// unknownEvent and unknownState are out-of-domain implementations of the
// sealed Event and State interfaces. They exist only so the specs can reach
// Next's error path for an unhandled transition.
type (
	unknownEvent struct{}
	unknownState struct{}
)

func (unknownEvent) isEvent() {}

func (unknownEvent) String() string {
	return "unknownEvent"
}

func (unknownState) isState() {}

func (unknownState) String() string {
	return "unknownState"
}
|
|
||||||
|
|
||||||
var _ = Describe("conncoord.Next", func() {
|
|
||||||
DescribeTable("transitions",
|
|
||||||
func(state State, event Event, wantState State, wantEff []Effect) {
|
|
||||||
gotState, gotEff, err := Next(state, event)
|
|
||||||
Expect(err).NotTo(HaveOccurred())
|
|
||||||
Expect(gotState).To(Equal(wantState))
|
|
||||||
Expect(gotEff).To(Equal(wantEff))
|
|
||||||
},
|
|
||||||
Entry("stopped+setvad(on) -> running: start",
|
|
||||||
Live{VADRunning: false}, SetVAD{Active: true},
|
|
||||||
Live{VADRunning: true}, []Effect{StartVAD{}}),
|
|
||||||
Entry("running+setvad(on) -> running, no-op",
|
|
||||||
Live{VADRunning: true}, SetVAD{Active: true},
|
|
||||||
Live{VADRunning: true}, []Effect(nil)),
|
|
||||||
Entry("stopped+setvad(off) -> stopped, no-op",
|
|
||||||
Live{VADRunning: false}, SetVAD{Active: false},
|
|
||||||
Live{VADRunning: false}, []Effect(nil)),
|
|
||||||
Entry("running+setvad(off) -> stopped: stop",
|
|
||||||
Live{VADRunning: true}, SetVAD{Active: false},
|
|
||||||
Live{VADRunning: false}, []Effect{StopVAD{}}),
|
|
||||||
Entry("stopped+close -> torn: teardown",
|
|
||||||
Live{VADRunning: false}, Close{},
|
|
||||||
Torn{}, []Effect{Teardown{}}),
|
|
||||||
Entry("running+close -> torn: stop + teardown",
|
|
||||||
Live{VADRunning: true}, Close{},
|
|
||||||
Torn{}, []Effect{StopVAD{}, Teardown{}}),
|
|
||||||
Entry("torn+setvad(on) -> torn, no-op (no resurrection)",
|
|
||||||
Torn{}, SetVAD{Active: true},
|
|
||||||
Torn{}, []Effect(nil)),
|
|
||||||
Entry("torn+close -> torn, no-op (idempotent)",
|
|
||||||
Torn{}, Close{},
|
|
||||||
Torn{}, []Effect(nil)),
|
|
||||||
)
|
|
||||||
|
|
||||||
It("is total over the defined (state, event) pairs", func() {
|
|
||||||
states := []State{Live{VADRunning: false}, Live{VADRunning: true}, Torn{}}
|
|
||||||
events := []Event{SetVAD{Active: true}, SetVAD{Active: false}, Close{}}
|
|
||||||
for _, s := range states {
|
|
||||||
for _, e := range events {
|
|
||||||
_, _, err := Next(s, e)
|
|
||||||
Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
It("errors on an unknown event type", func() {
|
|
||||||
_, _, err := Next(Live{}, unknownEvent{})
|
|
||||||
Expect(err).To(HaveOccurred())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("errors on an unknown state type", func() {
|
|
||||||
_, _, err := Next(unknownState{}, Close{})
|
|
||||||
Expect(err).To(HaveOccurred())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
var _ = Describe("conncoord.Coordinator", func() {
|
|
||||||
It("upholds the lifecycle invariants over random event sequences", func() {
|
|
||||||
seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
|
|
||||||
for _, seed := range seeds {
|
|
||||||
r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New(sink)
|
|
||||||
running := false
|
|
||||||
torn := false
|
|
||||||
|
|
||||||
for range 5000 {
|
|
||||||
switch r.IntN(3) {
|
|
||||||
case 0:
|
|
||||||
Expect(c.Apply(SetVAD{Active: true})).To(Succeed())
|
|
||||||
if !torn {
|
|
||||||
running = true
|
|
||||||
}
|
|
||||||
case 1:
|
|
||||||
Expect(c.Apply(SetVAD{Active: false})).To(Succeed())
|
|
||||||
if !torn {
|
|
||||||
running = false
|
|
||||||
}
|
|
||||||
case 2:
|
|
||||||
Expect(c.Apply(Close{})).To(Succeed())
|
|
||||||
torn = true
|
|
||||||
running = false
|
|
||||||
}
|
|
||||||
if torn {
|
|
||||||
Expect(c.State()).To(Equal(State(Torn{})), "seed=%d", seed)
|
|
||||||
} else {
|
|
||||||
Expect(c.State()).To(Equal(State(Live{VADRunning: running})), "seed=%d", seed)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
checkLog(sink.snapshot())
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
It("tears down at most once under concurrent SetVAD/Close from two goroutines", func() {
|
|
||||||
const perGoroutine = 2000
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New(sink)
|
|
||||||
|
|
||||||
var wg sync.WaitGroup
|
|
||||||
drive := func(active bool) {
|
|
||||||
defer wg.Done()
|
|
||||||
for i := range perGoroutine {
|
|
||||||
switch i % 3 {
|
|
||||||
case 0:
|
|
||||||
_ = c.Apply(SetVAD{Active: active})
|
|
||||||
case 1:
|
|
||||||
_ = c.Apply(SetVAD{Active: !active})
|
|
||||||
case 2:
|
|
||||||
if i > perGoroutine/2 {
|
|
||||||
_ = c.Apply(Close{})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
wg.Add(2)
|
|
||||||
go drive(true)
|
|
||||||
go drive(false)
|
|
||||||
wg.Wait()
|
|
||||||
_ = c.Apply(Close{})
|
|
||||||
|
|
||||||
checkLog(sink.snapshot())
|
|
||||||
Expect(c.State()).To(Equal(State(Torn{})))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
var _ = DescribeTable("conncoord stringers",
|
|
||||||
func(got, want string) { Expect(got).To(Equal(want)) },
|
|
||||||
Entry(nil, Live{VADRunning: true}.String(), "Live(vad=true)"),
|
|
||||||
Entry(nil, Live{VADRunning: false}.String(), "Live(vad=false)"),
|
|
||||||
Entry(nil, Torn{}.String(), "Torn"),
|
|
||||||
|
|
||||||
Entry(nil, SetVAD{Active: true}.String(), "SetVAD(true)"),
|
|
||||||
Entry(nil, Close{}.String(), "Close"),
|
|
||||||
|
|
||||||
Entry(nil, StartVAD{}.String(), "StartVAD"),
|
|
||||||
Entry(nil, StopVAD{}.String(), "StopVAD"),
|
|
||||||
Entry(nil, Teardown{}.String(), "Teardown"),
|
|
||||||
)
|
|
||||||
@@ -5,7 +5,4 @@ const (
|
|||||||
FinishReasonStop = "stop"
|
FinishReasonStop = "stop"
|
||||||
FinishReasonToolCalls = "tool_calls"
|
FinishReasonToolCalls = "tool_calls"
|
||||||
FinishReasonFunctionCall = "function_call"
|
FinishReasonFunctionCall = "function_call"
|
||||||
// FinishReasonLength is reported when generation stopped because it
|
|
||||||
// reached the max_tokens budget rather than a natural stop (issue #9716).
|
|
||||||
FinishReasonLength = "length"
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,82 +0,0 @@
|
|||||||
// Package coordinator is the shared single-writer state-machine runtime for the
|
|
||||||
// realtime API's explicit coordinators (machines M1–M5 in
|
|
||||||
// docs/design/realtime-state-machines.md).
|
|
||||||
//
|
|
||||||
// Each machine package (respcoord, turncoord, conncoord, compactcoord, ttscoord)
|
|
||||||
// defines its OWN sealed sum types for State/Event/Effect and a total, pure
|
|
||||||
// transition function Next(state, event) -> (state, []effect, error). The
|
|
||||||
// plumbing around that — a single-writer Coordinator that serializes every
|
|
||||||
// transition behind one lock and performs the returned effects in order — is
|
|
||||||
// identical across all five, so it lives here once instead of being copied.
|
|
||||||
//
|
|
||||||
// A machine package wires itself up with three lines:
|
|
||||||
//
|
|
||||||
// type EffectSink = coordinator.Sink[Effect]
|
|
||||||
// type Coordinator = coordinator.Coordinator[State, Event, Effect]
|
|
||||||
// func New(sink EffectSink) *Coordinator { return coordinator.New[State, Event, Effect](Idle{}, Next, sink) }
|
|
||||||
//
|
|
||||||
// The aliases keep each package's public API (Coordinator, New, EffectSink,
|
|
||||||
// Apply, State) unchanged. The single-writer serialization — the load-bearing
|
|
||||||
// concurrency guarantee the FizzBee specs check — is therefore implemented and
|
|
||||||
// reasoned about in exactly one place.
|
|
||||||
package coordinator
|
|
||||||
|
|
||||||
import "sync"
|
|
||||||
|
|
||||||
// TransitionFunc is the shape of a machine's total, pure transition. From the
// current state and an event it produces the next state, the ordered effects
// to perform, and a non-nil error ONLY for an unhandled (programmer-error)
// state/event pair. It must not perform I/O or block: side effects are
// returned as data (F) for the Coordinator to pass to the Sink.
type TransitionFunc[S, E, F any] func(state S, event E) (S, []F, error)

// Sink carries out the effects produced by a transition. Implementations MUST
// be non-blocking, because Perform is invoked while the Coordinator holds its
// lock: spawn a goroutine, call a cancel func, or do a non-blocking channel
// send. Implementations MUST NOT call back into the same Coordinator's Apply.
type Sink[F any] interface {
	Perform(F)
}

// Coordinator is the single-writer wrapper around a pure transition function.
// mu serializes every Apply, so several goroutines may drive the machine
// without racing, and each transition's effects are performed in order while
// the lock is still held (before any subsequent Apply can observe the new
// state).
type Coordinator[S, E, F any] struct {
	mu    sync.Mutex
	state S
	next  TransitionFunc[S, E, F]
	sink  Sink[F]
}

// New builds a Coordinator that starts in initial, transitions via next, and
// performs effects via sink.
func New[S, E, F any](initial S, next TransitionFunc[S, E, F], sink Sink[F]) *Coordinator[S, E, F] {
	c := new(Coordinator[S, E, F])
	c.state = initial
	c.next = next
	c.sink = sink
	return c
}

// Apply runs a single transition under the lock and then performs its effects
// in the order the transition function returned them. When the transition
// function reports an error (an unhandled state/event), the state is left
// untouched, no effect is performed, and the error is returned to the caller —
// never silently swallowed.
func (c *Coordinator[S, E, F]) Apply(e E) error {
	c.mu.Lock()
	defer c.mu.Unlock()

	following, pending, err := c.next(c.state, e)
	if err != nil {
		return err
	}

	// Commit the new state first, then run the effects while still locked.
	c.state = following
	for i := range pending {
		c.sink.Perform(pending[i])
	}
	return nil
}

// State returns the current state by value, read under the lock so it is safe
// to call concurrently with Apply.
func (c *Coordinator[S, E, F]) State() S {
	c.mu.Lock()
	current := c.state
	c.mu.Unlock()
	return current
}
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
package coordinator
|
|
||||||
|
|
||||||
import (
|
|
||||||
"testing"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestCoordinator(t *testing.T) {
|
|
||||||
RegisterFailHandler(Fail)
|
|
||||||
RunSpecs(t, "coordinator (shared runtime) Suite")
|
|
||||||
}
|
|
||||||
@@ -1,124 +0,0 @@
|
|||||||
package coordinator
|
|
||||||
|
|
||||||
import (
|
|
||||||
"errors"
|
|
||||||
"fmt"
|
|
||||||
"sync"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
// The specs drive the generic runtime directly through a deliberately tiny toy
// machine (the five real machines exercise it via their aliases, but the
// coverage gate measures this package on its own). The machine has two states,
// off and on, linked by toggle; burst emits three ordered effects without
// changing state; boom is the unhandled/error path.
type (
	tstate  int
	tevent  int
	teffect string
)

const (
	off tstate = iota
	on
)

const (
	toggle tevent = iota
	burst
	boom
)

// tnext is the toy machine's transition function:
//   - toggle flips off <-> on and emits the name of the state entered;
//   - burst keeps the state and emits "a", "b", "c";
//   - boom keeps the state and returns an error;
//   - any other event value keeps the state and returns an "unknown event"
//     error.
func tnext(s tstate, e tevent) (tstate, []teffect, error) {
	if e == toggle {
		if s != off {
			return off, []teffect{"off"}, nil
		}
		return on, []teffect{"on"}, nil
	}
	if e == burst {
		return s, []teffect{"a", "b", "c"}, nil
	}
	if e == boom {
		return s, nil, errors.New("boom: unhandled")
	}
	return s, nil, fmt.Errorf("unknown event %d", int(e))
}
|
|
||||||
|
|
||||||
type recordingSink struct {
|
|
||||||
mu sync.Mutex
|
|
||||||
log []teffect
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *recordingSink) Perform(e teffect) {
|
|
||||||
s.mu.Lock()
|
|
||||||
s.log = append(s.log, e)
|
|
||||||
s.mu.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *recordingSink) snapshot() []teffect {
|
|
||||||
s.mu.Lock()
|
|
||||||
defer s.mu.Unlock()
|
|
||||||
out := make([]teffect, len(s.log))
|
|
||||||
copy(out, s.log)
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
var _ = Describe("coordinator.Coordinator", func() {
|
|
||||||
It("starts in the initial state", func() {
|
|
||||||
c := New[tstate, tevent, teffect](off, tnext, &recordingSink{})
|
|
||||||
Expect(c.State()).To(Equal(off))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("advances state and performs the transition's effects", func() {
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New[tstate, tevent, teffect](off, tnext, sink)
|
|
||||||
|
|
||||||
Expect(c.Apply(toggle)).To(Succeed())
|
|
||||||
Expect(c.State()).To(Equal(on))
|
|
||||||
Expect(c.Apply(toggle)).To(Succeed())
|
|
||||||
Expect(c.State()).To(Equal(off))
|
|
||||||
|
|
||||||
Expect(sink.snapshot()).To(Equal([]teffect{"on", "off"}))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("performs multiple effects in order", func() {
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New[tstate, tevent, teffect](off, tnext, sink)
|
|
||||||
Expect(c.Apply(burst)).To(Succeed())
|
|
||||||
Expect(sink.snapshot()).To(Equal([]teffect{"a", "b", "c"}))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("returns the transition error and leaves state unchanged", func() {
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New[tstate, tevent, teffect](on, tnext, sink)
|
|
||||||
err := c.Apply(boom)
|
|
||||||
Expect(err).To(HaveOccurred())
|
|
||||||
Expect(c.State()).To(Equal(on), "state unchanged on error")
|
|
||||||
Expect(sink.snapshot()).To(BeEmpty(), "no effects performed on error")
|
|
||||||
})
|
|
||||||
|
|
||||||
It("serializes concurrent Apply from many goroutines (run with -race)", func() {
|
|
||||||
const goroutines = 8
|
|
||||||
const each = 1000
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New[tstate, tevent, teffect](off, tnext, sink)
|
|
||||||
|
|
||||||
var wg sync.WaitGroup
|
|
||||||
wg.Add(goroutines)
|
|
||||||
for range goroutines {
|
|
||||||
go func() {
|
|
||||||
defer wg.Done()
|
|
||||||
for range each {
|
|
||||||
_ = c.Apply(toggle)
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
}
|
|
||||||
wg.Wait()
|
|
||||||
|
|
||||||
// goroutines*each toggles from off; an even total returns to off. The
|
|
||||||
// point is race-freedom + a consistent final state, not the value itself.
|
|
||||||
Expect(c.State()).To(Equal(off))
|
|
||||||
Expect(sink.snapshot()).To(HaveLen(goroutines * each))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
@@ -13,14 +13,6 @@ import (
|
|||||||
"github.com/mudler/xlog"
|
"github.com/mudler/xlog"
|
||||||
)
|
)
|
||||||
|
|
||||||
// reachedTokenBudget reports whether a generation stopped because it hit the
// configured max_tokens ceiling, i.e. completion is at or above *maxTokens.
// A nil maxTokens, or a value <= 0, means "no limit" and always yields false.
// It is used to suppress regeneration retries (which would only hit the same
// ceiling again) and to report finish_reason "length" instead of "stop"
// (issue #9716).
func reachedTokenBudget(completion int, maxTokens *int) bool {
	if maxTokens == nil {
		return false
	}
	limit := *maxTokens
	if limit <= 0 {
		return false
	}
	return completion >= limit
}
|
|
||||||
|
|
||||||
func ComputeChoices(
|
func ComputeChoices(
|
||||||
req *schema.OpenAIRequest,
|
req *schema.OpenAIRequest,
|
||||||
predInput string,
|
predInput string,
|
||||||
@@ -121,21 +113,11 @@ func ComputeChoices(
|
|||||||
}
|
}
|
||||||
prediction = p
|
prediction = p
|
||||||
|
|
||||||
// budgetExhausted is true when the model stopped because it reached
|
|
||||||
// the configured max_tokens ceiling. None of the retry paths below
|
|
||||||
// should fire in that case: regenerating would just hit the same
|
|
||||||
// ceiling again and multiply token consumption (issue #9716). A
|
|
||||||
// thinking model that spends its whole budget on the reasoning block
|
|
||||||
// produces an empty content / reasoning-only response, which would
|
|
||||||
// otherwise look like a failed generation worth retrying. This is a
|
|
||||||
// "length" finish, not an empty one.
|
|
||||||
budgetExhausted := reachedTokenBudget(prediction.Usage.Completion, config.Maxtokens)
|
|
||||||
|
|
||||||
// Built-in: retry on truly empty response (no tokens at all).
|
// Built-in: retry on truly empty response (no tokens at all).
|
||||||
// However, when the C++ autoparser is active, it clears the raw
|
// However, when the C++ autoparser is active, it clears the raw
|
||||||
// message and delivers content via ChatDeltas instead. Do NOT
|
// message and delivers content via ChatDeltas instead. Do NOT
|
||||||
// retry if ChatDeltas contain tool calls or content.
|
// retry if ChatDeltas contain tool calls or content.
|
||||||
if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries && !budgetExhausted {
|
if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries {
|
||||||
hasChatDeltaData := false
|
hasChatDeltaData := false
|
||||||
for _, d := range prediction.ChatDeltas {
|
for _, d := range prediction.ChatDeltas {
|
||||||
if d.Content != "" || len(d.ToolCalls) > 0 {
|
if d.Content != "" || len(d.ToolCalls) > 0 {
|
||||||
@@ -177,7 +159,7 @@ func ComputeChoices(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if shouldRetryFn != nil && !skipCallerRetry && !budgetExhausted && shouldRetryFn(attempt) && attempt < maxRetries {
|
if shouldRetryFn != nil && !skipCallerRetry && shouldRetryFn(attempt) && attempt < maxRetries {
|
||||||
// Caller has already reset its state inside shouldRetry
|
// Caller has already reset its state inside shouldRetry
|
||||||
result = result[:0]
|
result = result[:0]
|
||||||
allChatDeltas = nil
|
allChatDeltas = nil
|
||||||
|
|||||||
@@ -393,73 +393,6 @@ var _ = Describe("ComputeChoices", func() {
|
|||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
Context("reachedTokenBudget", func() {
|
|
||||||
ptr := func(i int) *int { return &i }
|
|
||||||
It("is false when no limit is configured", func() {
|
|
||||||
Expect(reachedTokenBudget(1000, nil)).To(BeFalse())
|
|
||||||
Expect(reachedTokenBudget(1000, ptr(0))).To(BeFalse())
|
|
||||||
Expect(reachedTokenBudget(1000, ptr(-1))).To(BeFalse())
|
|
||||||
})
|
|
||||||
It("is false when generation stopped below the limit", func() {
|
|
||||||
Expect(reachedTokenBudget(99, ptr(100))).To(BeFalse())
|
|
||||||
})
|
|
||||||
It("is true when generation reached or exceeded the limit", func() {
|
|
||||||
Expect(reachedTokenBudget(100, ptr(100))).To(BeTrue())
|
|
||||||
Expect(reachedTokenBudget(101, ptr(100))).To(BeTrue())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
Context("max_tokens budget exhausted on reasoning (issue #9716)", func() {
|
|
||||||
// Reproduces the streaming retry loop: when a thinking model spends its
|
|
||||||
// entire max_tokens budget on the reasoning block, the C++ autoparser
|
|
||||||
// clears the raw Response and delivers reasoning-only ChatDeltas (no
|
|
||||||
// content, no tool calls). The built-in empty-response retry then fires
|
|
||||||
// and regenerates from scratch up to maxRetries times, each re-consuming
|
|
||||||
// the whole budget — instead of terminating with finish_reason "length".
|
|
||||||
It("should NOT retry when the token budget was exhausted", func() {
|
|
||||||
maxTokens := 100
|
|
||||||
cfg.Maxtokens = &maxTokens
|
|
||||||
|
|
||||||
calls := 0
|
|
||||||
backend.ModelInferenceFunc = func(
|
|
||||||
ctx context.Context, s string, messages schema.Messages,
|
|
||||||
images, videos, audios []string,
|
|
||||||
loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader,
|
|
||||||
o *config.ApplicationConfig,
|
|
||||||
tokenCallback func(string, backend.TokenUsage) bool,
|
|
||||||
tools, toolChoice string,
|
|
||||||
logprobs, topLogprobs *int,
|
|
||||||
logitBias map[string]float64,
|
|
||||||
metadata map[string]string,
|
|
||||||
) (func() (backend.LLMResponse, error), error) {
|
|
||||||
predFunc := func() (backend.LLMResponse, error) {
|
|
||||||
calls++
|
|
||||||
// Autoparser cleared Response; only reasoning was produced,
|
|
||||||
// and the completion count reached the max_tokens budget.
|
|
||||||
return backend.LLMResponse{
|
|
||||||
Response: "",
|
|
||||||
ChatDeltas: []*pb.ChatDelta{{ReasoningContent: "thinking..."}},
|
|
||||||
Usage: backend.TokenUsage{Prompt: 5, Completion: maxTokens},
|
|
||||||
}, nil
|
|
||||||
}
|
|
||||||
return predFunc, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
_, usage, _, err := ComputeChoices(
|
|
||||||
makeReq(), "test", cfg, nil, appCfg, nil,
|
|
||||||
func(s string, c *[]schema.Choice) {
|
|
||||||
*c = append(*c, schema.Choice{Text: s})
|
|
||||||
},
|
|
||||||
nil,
|
|
||||||
)
|
|
||||||
Expect(err).ToNot(HaveOccurred())
|
|
||||||
// The model hit its token ceiling; regenerating would just hit it
|
|
||||||
// again and multiply token consumption. Exactly one call expected.
|
|
||||||
Expect(calls).To(Equal(1), "budget-exhausted generation must not be retried")
|
|
||||||
Expect(usage.Completion).To(Equal(maxTokens))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
Context("with streaming token callback", func() {
|
Context("with streaming token callback", func() {
|
||||||
It("should call tokenCallback for streaming responses", func() {
|
It("should call tokenCallback for streaming responses", func() {
|
||||||
var streamedTokens []string
|
var streamedTokens []string
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"strconv"
|
"strconv"
|
||||||
"sync"
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"net/http"
|
"net/http"
|
||||||
@@ -25,8 +26,6 @@ import (
|
|||||||
"github.com/mudler/LocalAI/core/config"
|
"github.com/mudler/LocalAI/core/config"
|
||||||
"github.com/mudler/LocalAI/core/http/auth"
|
"github.com/mudler/LocalAI/core/http/auth"
|
||||||
mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
|
mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord"
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/turncoord"
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
|
"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
|
||||||
"github.com/mudler/LocalAI/core/schema"
|
"github.com/mudler/LocalAI/core/schema"
|
||||||
"github.com/mudler/LocalAI/core/templates"
|
"github.com/mudler/LocalAI/core/templates"
|
||||||
@@ -169,12 +168,44 @@ type Session struct {
|
|||||||
gateMu sync.Mutex
|
gateMu sync.Mutex
|
||||||
voiceVerified bool
|
voiceVerified bool
|
||||||
|
|
||||||
// respSink is the explicit response-coordination state machine (respcoord,
|
// Response cancellation: protects activeResponseCancel/activeResponseDone
|
||||||
// machine M3). It replaces the legacy startResponse/cancelActiveResponse
|
responseMu sync.Mutex
|
||||||
// pair and its dual-writer activeResponse* fields: every start/cancel/finish
|
activeResponseCancel context.CancelFunc
|
||||||
// decision is serialized through respcoord.Coordinator, guaranteeing at most
|
activeResponseDone chan struct{}
|
||||||
// one live response. See realtime_respcoord.go.
|
}
|
||||||
respSink *responseSink
|
|
||||||
|
// cancelActiveResponse cancels any in-flight response and waits for its
|
||||||
|
// goroutine to exit. This ensures we never have overlapping responses and
|
||||||
|
// that interrupted responses are fully cleaned up before starting a new one.
|
||||||
|
func (s *Session) cancelActiveResponse() {
|
||||||
|
s.responseMu.Lock()
|
||||||
|
cancel := s.activeResponseCancel
|
||||||
|
done := s.activeResponseDone
|
||||||
|
s.responseMu.Unlock()
|
||||||
|
|
||||||
|
if cancel != nil {
|
||||||
|
cancel()
|
||||||
|
}
|
||||||
|
if done != nil {
|
||||||
|
<-done
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// startResponse cancels any active response and returns a new context for
|
||||||
|
// the replacement response. The caller MUST close the returned done channel
|
||||||
|
// when the response goroutine exits.
|
||||||
|
func (s *Session) startResponse(parent context.Context) (context.Context, chan struct{}) {
|
||||||
|
s.cancelActiveResponse()
|
||||||
|
|
||||||
|
ctx, cancel := context.WithCancel(parent)
|
||||||
|
done := make(chan struct{})
|
||||||
|
|
||||||
|
s.responseMu.Lock()
|
||||||
|
s.activeResponseCancel = cancel
|
||||||
|
s.activeResponseDone = done
|
||||||
|
s.responseMu.Unlock()
|
||||||
|
|
||||||
|
return ctx, done
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Session) FromClient(session *types.SessionUnion) {
|
func (s *Session) FromClient(session *types.SessionUnion) {
|
||||||
@@ -227,10 +258,8 @@ type Conversation struct {
|
|||||||
// is kept out of Items (so trimRealtimeItems never drops it) and rendered
|
// is kept out of Items (so trimRealtimeItems never drops it) and rendered
|
||||||
// as a system message right after the session instructions.
|
// as a system message right after the session instructions.
|
||||||
Memory string
|
Memory string
|
||||||
// compaction is the explicit single-flight compaction coordinator (M4): at
|
// compacting ensures at most one background compaction runs per conversation.
|
||||||
// most one background summarize+evict runs per conversation at a time. It
|
compacting atomic.Bool
|
||||||
// replaces the legacy `compacting atomic.Bool`. See realtime_compactcoord.go.
|
|
||||||
compaction *compactionSink
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Conversation) ToServer() types.Conversation {
|
func (c *Conversation) ToServer() types.Conversation {
|
||||||
@@ -259,12 +288,6 @@ type Model interface {
|
|||||||
// sound-event tags. topK caps the number of returned tags (0 = backend
|
// sound-event tags. topK caps the number of returned tags (0 = backend
|
||||||
// default), threshold drops tags below the given score (0 = keep all).
|
// default), threshold drops tags below the given score (0 = keep all).
|
||||||
SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error)
|
SoundDetection(ctx context.Context, audio string, topK int, threshold float32) (*schema.SoundClassificationResult, error)
|
||||||
// TranscribeLive opens a live (bidirectional) transcription session on the
|
|
||||||
// pipeline's transcription backend, used by semantic_vad turn detection;
|
|
||||||
// onEvent fires from a background goroutine for every delta/EOU/final
|
|
||||||
// event. Backends without live support fail with an error satisfying
|
|
||||||
// grpcerrors.IsLiveTranscriptionUnsupported.
|
|
||||||
TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error)
|
|
||||||
PredictConfig() *config.ModelConfig
|
PredictConfig() *config.ModelConfig
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -490,10 +513,14 @@ func runRealtimeSession(application *application.Application, t Transport, model
|
|||||||
// input_audio_buffer.commit. There is no transcription stage in that case.
|
// input_audio_buffer.commit. There is no transcription stage in that case.
|
||||||
soundOnly := cfg.Pipeline.SoundDetection != "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.LLM == ""
|
soundOnly := cfg.Pipeline.SoundDetection != "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.LLM == ""
|
||||||
|
|
||||||
// defaultTurnDetection seeds server_vad by default, or semantic_vad when the
|
turnDetection := &types.TurnDetectionUnion{
|
||||||
// pipeline opts in (turn_detection.type: semantic_vad); clients can still
|
ServerVad: &types.ServerVad{
|
||||||
// override per session via session.update.
|
Threshold: 0.5,
|
||||||
turnDetection := defaultTurnDetection(cfg)
|
PrefixPaddingMs: 300,
|
||||||
|
SilenceDurationMs: 500,
|
||||||
|
CreateResponse: true,
|
||||||
|
},
|
||||||
|
}
|
||||||
inputAudioTranscription := &types.AudioTranscription{Model: sttModel}
|
inputAudioTranscription := &types.AudioTranscription{Model: sttModel}
|
||||||
if soundOnly {
|
if soundOnly {
|
||||||
turnDetection = nil // turn_detection none: no VAD
|
turnDetection = nil // turn_detection none: no VAD
|
||||||
@@ -534,27 +561,12 @@ func runRealtimeSession(application *application.Application, t Transport, model
|
|||||||
}
|
}
|
||||||
session.CompactionEnabled, session.CompactionTrigger, session.MaxSummaryTokens, session.SummaryModel = resolveCompaction(cfg, session.MaxHistoryItems)
|
session.CompactionEnabled, session.CompactionTrigger, session.MaxSummaryTokens, session.SummaryModel = resolveCompaction(cfg, session.MaxHistoryItems)
|
||||||
|
|
||||||
// Single-writer response coordinator (machine M3). All response starts and
|
|
||||||
// cancels go through this, so the read-loop and VAD goroutine can never race
|
|
||||||
// into two overlapping responses (see realtime_respcoord.go).
|
|
||||||
session.respSink = newResponseSink()
|
|
||||||
|
|
||||||
// Create a default conversation
|
// Create a default conversation
|
||||||
conversationID := generateConversationID()
|
conversationID := generateConversationID()
|
||||||
conversation := &Conversation{
|
conversation := &Conversation{
|
||||||
ID: conversationID,
|
ID: conversationID,
|
||||||
Items: []*types.MessageItemUnion{},
|
Items: []*types.MessageItemUnion{},
|
||||||
}
|
}
|
||||||
// The compaction coordinator's work closure resolves the summarizer (lazily
|
|
||||||
// loading a configured summary_model) and runs the summarize+evict off the
|
|
||||||
// response path — only when a compaction actually starts.
|
|
||||||
conversation.compaction = newCompactionSink(func(ctx context.Context) {
|
|
||||||
model := session.summarizerModel()
|
|
||||||
if model == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
session.compact(ctx, conversation, model)
|
|
||||||
})
|
|
||||||
session.Conversations[conversationID] = conversation
|
session.Conversations[conversationID] = conversation
|
||||||
session.DefaultConversationID = conversationID
|
session.DefaultConversationID = conversationID
|
||||||
|
|
||||||
@@ -636,22 +648,34 @@ func runRealtimeSession(application *application.Application, t Transport, model
|
|||||||
})
|
})
|
||||||
|
|
||||||
var (
|
var (
|
||||||
msg []byte
|
msg []byte
|
||||||
wg sync.WaitGroup
|
wg sync.WaitGroup
|
||||||
|
done = make(chan struct{})
|
||||||
)
|
)
|
||||||
|
|
||||||
// M1 connection lifecycle. The VAD goroutine's run/stop (and its done channel)
|
vadServerStarted := false
|
||||||
// and the once-only teardown are owned by this coordinator, so the channel is
|
toggleVAD := func() {
|
||||||
// closed exactly once and never resurrected after teardown (Part 2, failure
|
if session.TurnDetection != nil && session.TurnDetection.ServerVad != nil && !vadServerStarted {
|
||||||
// mode 6; invariants #8, #10). See realtime_conncoord.go and conncoord/.
|
xlog.Debug("Starting VAD goroutine...")
|
||||||
conn := newConnSink(session, sessionID, t, &wg)
|
done = make(chan struct{})
|
||||||
toggleVAD := func() { conn.setVAD(turnDetectionActive(session.TurnDetection)) }
|
wg.Go(func() {
|
||||||
|
conversation := session.Conversations[session.DefaultConversationID]
|
||||||
|
handleVAD(session, conversation, t, done)
|
||||||
|
})
|
||||||
|
vadServerStarted = true
|
||||||
|
} else if (session.TurnDetection == nil || session.TurnDetection.ServerVad == nil) && vadServerStarted {
|
||||||
|
xlog.Debug("Stopping VAD goroutine...")
|
||||||
|
close(done)
|
||||||
|
vadServerStarted = false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// For WebRTC sessions, start the Opus decode loop before VAD so that
|
// For WebRTC sessions, start the Opus decode loop before VAD so that
|
||||||
// decoded PCM is already flowing when VAD's first tick fires.
|
// decoded PCM is already flowing when VAD's first tick fires.
|
||||||
|
var decodeDone chan struct{}
|
||||||
if wt, ok := t.(*WebRTCTransport); ok {
|
if wt, ok := t.(*WebRTCTransport); ok {
|
||||||
conn.decodeDone = make(chan struct{})
|
decodeDone = make(chan struct{})
|
||||||
go decodeOpusLoop(session, wt.opusBackend, conn.decodeDone)
|
go decodeOpusLoop(session, wt.opusBackend, decodeDone)
|
||||||
}
|
}
|
||||||
|
|
||||||
toggleVAD()
|
toggleVAD()
|
||||||
@@ -660,9 +684,9 @@ func runRealtimeSession(application *application.Application, t Transport, model
|
|||||||
// with window/hop configured, the server classifies the last window of
|
// with window/hop configured, the server classifies the last window of
|
||||||
// streamed audio on a timer, so the client only has to stream (no commits).
|
// streamed audio on a timer, so the client only has to stream (no commits).
|
||||||
// This runs independent of VAD (sound events are not speech).
|
// This runs independent of VAD (sound events are not speech).
|
||||||
|
var soundWindowDone chan struct{}
|
||||||
if soundOnly && session.SoundDetectionWindowMs > 0 && session.SoundDetectionHopMs > 0 {
|
if soundOnly && session.SoundDetectionWindowMs > 0 && session.SoundDetectionHopMs > 0 {
|
||||||
conn.soundWindowDone = make(chan struct{})
|
soundWindowDone = make(chan struct{})
|
||||||
soundWindowDone := conn.soundWindowDone
|
|
||||||
wg.Go(func() {
|
wg.Go(func() {
|
||||||
handleSoundWindow(session, t, soundWindowDone)
|
handleSoundWindow(session, t, soundWindowDone)
|
||||||
})
|
})
|
||||||
@@ -787,11 +811,11 @@ func runRealtimeSession(application *application.Application, t Transport, model
|
|||||||
xlog.Debug("recv", "message", string(msg))
|
xlog.Debug("recv", "message", string(msg))
|
||||||
|
|
||||||
sessionLock.Lock()
|
sessionLock.Lock()
|
||||||
autoTurnDetection := turnDetectionActive(session.TurnDetection)
|
isServerVAD := session.TurnDetection != nil && session.TurnDetection.ServerVad != nil
|
||||||
sessionLock.Unlock()
|
sessionLock.Unlock()
|
||||||
|
|
||||||
// TODO: At the least need to check locking and timer state in the VAD Go routine before allowing this
|
// TODO: At the least need to check locking and timer state in the VAD Go routine before allowing this
|
||||||
if autoTurnDetection {
|
if isServerVAD {
|
||||||
sendNotImplemented(t, "input_audio_buffer.commit in conjunction with VAD")
|
sendNotImplemented(t, "input_audio_buffer.commit in conjunction with VAD")
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -807,9 +831,11 @@ func runRealtimeSession(application *application.Application, t Transport, model
|
|||||||
ItemID: generateItemID(),
|
ItemID: generateItemID(),
|
||||||
})
|
})
|
||||||
|
|
||||||
session.respSink.issue(context.Background(), respcoord.SourceClient, func(ctx context.Context) {
|
respCtx, respDone := session.startResponse(context.Background())
|
||||||
commitUtterance(ctx, allAudio, session, conversation, t)
|
go func() {
|
||||||
})
|
defer close(respDone)
|
||||||
|
commitUtterance(respCtx, allAudio, session, conversation, t)
|
||||||
|
}()
|
||||||
|
|
||||||
case types.InputAudioBufferClearEvent:
|
case types.InputAudioBufferClearEvent:
|
||||||
xlog.Debug("recv", "message", string(msg))
|
xlog.Debug("recv", "message", string(msg))
|
||||||
@@ -942,14 +968,15 @@ func runRealtimeSession(application *application.Application, t Transport, model
|
|||||||
conversation.Lock.Unlock()
|
conversation.Lock.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
resp := e.Response
|
respCtx, respDone := session.startResponse(context.Background())
|
||||||
session.respSink.issue(context.Background(), respcoord.SourceClient, func(ctx context.Context) {
|
go func() {
|
||||||
triggerResponse(ctx, session, conversation, t, &resp)
|
defer close(respDone)
|
||||||
})
|
triggerResponse(respCtx, session, conversation, t, &e.Response)
|
||||||
|
}()
|
||||||
|
|
||||||
case types.ResponseCancelEvent:
|
case types.ResponseCancelEvent:
|
||||||
xlog.Debug("recv", "message", string(msg))
|
xlog.Debug("recv", "message", string(msg))
|
||||||
session.respSink.cancel(respcoord.SourceClient)
|
session.cancelActiveResponse()
|
||||||
|
|
||||||
default:
|
default:
|
||||||
xlog.Error("unknown message type")
|
xlog.Error("unknown message type")
|
||||||
@@ -957,11 +984,28 @@ func runRealtimeSession(application *application.Application, t Transport, model
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tear down through the connection coordinator (once). It stops any running
|
// Cancel any in-flight response before tearing down
|
||||||
// VAD goroutine, then the opus-decode and sound-window goroutines, joins them,
|
session.cancelActiveResponse()
|
||||||
// cancels the in-flight response and drains all response goroutines, and
|
|
||||||
// finally removes the session — all in dependency order, exactly once.
|
// Stop the Opus decode goroutine (if running)
|
||||||
conn.close()
|
if decodeDone != nil {
|
||||||
|
close(decodeDone)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Signal any running VAD goroutine to exit.
|
||||||
|
if vadServerStarted {
|
||||||
|
close(done)
|
||||||
|
}
|
||||||
|
// Stop the server-side sound-detection windowing goroutine (if running).
|
||||||
|
if soundWindowDone != nil {
|
||||||
|
close(soundWindowDone)
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
// Remove the session from the sessions map
|
||||||
|
sessionLock.Lock()
|
||||||
|
delete(sessions, sessionID)
|
||||||
|
sessionLock.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
// sendEvent sends a server event via the transport, logging any errors.
|
// sendEvent sends a server event via the transport, logging any errors.
|
||||||
@@ -1241,38 +1285,8 @@ func decodeOpusLoop(session *Session, opusBackend grpc.Backend, done chan struct
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// noSpeechHoldbackSec is how much of the tail of an inspected, segment-free
|
|
||||||
// buffer survives the periodic no-speech clear. It must cover the VAD's
|
|
||||||
// onset-detection latency: a word can already be underway in the newest part
|
|
||||||
// of the window without silero having crossed its threshold yet, and clearing
|
|
||||||
// it cuts the start of the utterance the next tick will detect.
|
|
||||||
const noSpeechHoldbackSec = 0.5
|
|
||||||
|
|
||||||
// dropInspectedPrefix removes the head of the audio buffer that a VAD tick
|
|
||||||
// inspected (the first inspected bytes), keeping the newest holdbackBytes of
|
|
||||||
// that window plus everything appended while the tick ran — audio the VAD
|
|
||||||
// never saw. When something is dropped the result is a fresh copy, never a
|
|
||||||
// sub-slice, so later appends can't scribble on memory shared with the old
|
|
||||||
// backing array; when nothing is dropped buf is returned unchanged.
|
|
||||||
func dropInspectedPrefix(buf []byte, inspected, holdbackBytes int) []byte {
|
|
||||||
cut := inspected - holdbackBytes
|
|
||||||
if cut <= 0 {
|
|
||||||
return buf
|
|
||||||
}
|
|
||||||
if cut > len(buf) {
|
|
||||||
cut = len(buf)
|
|
||||||
}
|
|
||||||
return append([]byte(nil), buf[cut:]...)
|
|
||||||
}
|
|
||||||
|
|
||||||
// handleVAD is a goroutine that listens for audio data from the client,
|
// handleVAD is a goroutine that listens for audio data from the client,
|
||||||
// runs VAD on the audio data, and commits utterances to the conversation.
|
// runs VAD on the audio data, and commits utterances to the conversation
|
||||||
//
|
|
||||||
// With turn_detection.type == "semantic_vad" (sv != nil below) the silero
|
|
||||||
// loop is augmented by a live transcription stream: the buffer's new audio
|
|
||||||
// is fed to the transcription model every tick and its end-of-utterance
|
|
||||||
// token switches the commit threshold between a short post-EOU window and
|
|
||||||
// the long eagerness fallback. The server_vad path is untouched.
|
|
||||||
func handleVAD(session *Session, conv *Conversation, t Transport, done chan struct{}) {
|
func handleVAD(session *Session, conv *Conversation, t Transport, done chan struct{}) {
|
||||||
vadContext, cancel := context.WithCancel(context.Background())
|
vadContext, cancel := context.WithCancel(context.Background())
|
||||||
go func() {
|
go func() {
|
||||||
@@ -1285,22 +1299,9 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
|
|||||||
silenceThreshold = float64(session.TurnDetection.ServerVad.SilenceDurationMs) / 1000
|
silenceThreshold = float64(session.TurnDetection.ServerVad.SilenceDurationMs) / 1000
|
||||||
}
|
}
|
||||||
|
|
||||||
lts := newLiveTurnState(session, t)
|
speechStarted := false
|
||||||
startTime := time.Now()
|
startTime := time.Now()
|
||||||
|
|
||||||
// M2 turn-detection state machine. "Speech started" and "a turn's live ASR
|
|
||||||
// stream is open" are ONE coordinator state (Idle/Speaking), so they cannot
|
|
||||||
// desync the way the legacy speechStarted bool and lts.open() could (Part 2,
|
|
||||||
// failure mode 4). See realtime_turncoord.go and turncoord/.
|
|
||||||
sink := newTurnSink(session, conv, t, lts, vadContext, startTime)
|
|
||||||
// Teardown: end any open turn through the coordinator (DiscardTurn closes the
|
|
||||||
// live stream; no-op if already idle). Replaces the bare lts.discardTurn().
|
|
||||||
defer func() {
|
|
||||||
if err := sink.coord.Apply(turncoord.Abort{Reason: turncoord.AbortTeardown}); err != nil {
|
|
||||||
xlog.Error("turncoord: abort(teardown) failed", "error", err)
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
ticker := time.NewTicker(300 * time.Millisecond)
|
ticker := time.NewTicker(300 * time.Millisecond)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
@@ -1309,30 +1310,6 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
|
|||||||
case <-done:
|
case <-done:
|
||||||
return
|
return
|
||||||
case <-ticker.C:
|
case <-ticker.C:
|
||||||
// Semantic mode is re-read each tick: session.update can switch
|
|
||||||
// turn-detection modes (and the retranscribe gate) mid-session.
|
|
||||||
sessionLock.Lock()
|
|
||||||
var sv *types.RealtimeSessionSemanticVad
|
|
||||||
if session.TurnDetection != nil {
|
|
||||||
sv = session.TurnDetection.SemanticVad
|
|
||||||
}
|
|
||||||
retranscribe := sv != nil && session.ModelConfig != nil &&
|
|
||||||
session.ModelConfig.Pipeline.TurnDetectionRetranscribe()
|
|
||||||
sessionLock.Unlock()
|
|
||||||
|
|
||||||
// The turn coordinator's data-heavy effects (OpenTurn/CommitTurn)
|
|
||||||
// need this tick's mode; set it before any Apply below.
|
|
||||||
sink.sv = sv
|
|
||||||
|
|
||||||
// session.update switched semantic -> server mid-turn: drop the
|
|
||||||
// orphaned live stream. This is NOT a turn abort — the turn continues
|
|
||||||
// under server_vad (a config change must not cut off a mid-utterance
|
|
||||||
// speaker), so the coordinator stays Speaking; only the orphaned live
|
|
||||||
// stream is closed.
|
|
||||||
if sv == nil && lts.open() {
|
|
||||||
lts.discardTurn()
|
|
||||||
}
|
|
||||||
|
|
||||||
session.AudioBufferLock.Lock()
|
session.AudioBufferLock.Lock()
|
||||||
allAudio := make([]byte, len(session.InputAudioBuffer))
|
allAudio := make([]byte, len(session.InputAudioBuffer))
|
||||||
copy(allAudio, session.InputAudioBuffer)
|
copy(allAudio, session.InputAudioBuffer)
|
||||||
@@ -1346,13 +1323,6 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
|
|||||||
// Resample from InputSampleRate to 16kHz
|
// Resample from InputSampleRate to 16kHz
|
||||||
aints = sound.ResampleInt16(aints, session.InputSampleRate, localSampleRate)
|
aints = sound.ResampleInt16(aints, session.InputSampleRate, localSampleRate)
|
||||||
|
|
||||||
audioLength := float64(len(aints)) / localSampleRate
|
|
||||||
|
|
||||||
if sv != nil && lts.open() {
|
|
||||||
lts.feedNewAudio(aints)
|
|
||||||
lts.drainEvents(audioLength)
|
|
||||||
}
|
|
||||||
|
|
||||||
segments, err := runVAD(vadContext, session, aints)
|
segments, err := runVAD(vadContext, session, aints)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if err.Error() == "unexpected speech end" {
|
if err.Error() == "unexpected speech end" {
|
||||||
@@ -1364,52 +1334,31 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// NOTE: the no-speech clear and the min-buffer gate above stay on
|
audioLength := float64(len(aints)) / localSampleRate
|
||||||
// the short silenceThreshold even in semantic mode — the eagerness
|
|
||||||
// fallback applies only to the end-of-speech commit decision, or a
|
// TODO: When resetting the buffer we should retain a small postfix
|
||||||
// low eagerness would delay speech_started/barge-in by seconds.
|
|
||||||
if len(segments) == 0 && audioLength > silenceThreshold {
|
if len(segments) == 0 && audioLength > silenceThreshold {
|
||||||
// "No segments" is not "no speech": silero (threshold 0.5)
|
|
||||||
// crosses up to a few hundred ms into a soft word onset, so
|
|
||||||
// the newest audio in the inspected window may be the start
|
|
||||||
// of a word the next tick will recognize — and more audio
|
|
||||||
// arrived while this tick ran. Keep both; drop only the
|
|
||||||
// older, confirmed-silent head, or utterance onsets get cut.
|
|
||||||
holdback := int(noSpeechHoldbackSec*float64(session.InputSampleRate)) * 2
|
|
||||||
session.AudioBufferLock.Lock()
|
session.AudioBufferLock.Lock()
|
||||||
session.InputAudioBuffer = dropInspectedPrefix(session.InputAudioBuffer, len(allAudio), holdback)
|
session.InputAudioBuffer = nil
|
||||||
session.AudioBufferLock.Unlock()
|
session.AudioBufferLock.Unlock()
|
||||||
|
|
||||||
// No-speech clear: end any open turn (Speaking -> Idle, discarding
|
|
||||||
// the partial). Returning to Idle is the fix for failure mode 4 —
|
|
||||||
// the legacy discardTurn left speechStarted true, suppressing the
|
|
||||||
// next onset. Idle while not speaking is a no-op.
|
|
||||||
if err := sink.coord.Apply(turncoord.Abort{Reason: turncoord.AbortNoSpeech}); err != nil {
|
|
||||||
xlog.Error("turncoord: abort(no_speech) failed", "error", err)
|
|
||||||
}
|
|
||||||
continue
|
continue
|
||||||
} else if len(segments) == 0 {
|
} else if len(segments) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Speech detected this tick: open the turn (Idle -> Speaking) through
|
if !speechStarted {
|
||||||
// the coordinator. On that transition it opens the turn's live ASR
|
// Barge-in: cancel any in-flight response so we stop
|
||||||
// stream + feeds the buffered prefix (OpenTurn), cancels any in-flight
|
// sending audio and don't keep the interrupted reply in history.
|
||||||
// response (BargeIn, non-blocking — the VAD tick is never stalled), and
|
session.cancelActiveResponse()
|
||||||
// emits speech_started. While already Speaking it is a no-op, so "turn
|
|
||||||
// open" and "speech started" can never disagree. The turn id is minted
|
|
||||||
// here and carried by the coordinator through to the committed event.
|
|
||||||
sink.onsetAudio = aints
|
|
||||||
if err := sink.coord.Apply(turncoord.Onset{Turn: turncoord.TurnID(generateItemID())}); err != nil {
|
|
||||||
xlog.Error("turncoord: onset failed", "error", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if sv != nil {
|
sendEvent(t, types.InputAudioBufferSpeechStartedEvent{
|
||||||
// Drain again: events produced by THIS tick's feed have
|
ServerEventBase: types.ServerEventBase{
|
||||||
// usually arrived by the time runVAD returns, and leaving
|
EventID: "event_TODO",
|
||||||
// them for the next tick adds 300ms to every EOU-triggered
|
},
|
||||||
// commit.
|
AudioStartMs: time.Since(startTime).Milliseconds(),
|
||||||
lts.drainEvents(audioLength)
|
})
|
||||||
|
speechStarted = true
|
||||||
}
|
}
|
||||||
|
|
||||||
// Segment still in progress when audio ended
|
// Segment still in progress when audio ended
|
||||||
@@ -1418,90 +1367,41 @@ func handleVAD(session *Session, conv *Conversation, t Transport, done chan stru
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
threshold := silenceThreshold
|
if float32(audioLength)-segEndTime > float32(silenceThreshold) {
|
||||||
eouPending := false
|
|
||||||
if sv != nil {
|
|
||||||
eouPending = lts.eouPending(segments)
|
|
||||||
threshold = lts.thresholdSec(eouPending, sv)
|
|
||||||
}
|
|
||||||
|
|
||||||
if float32(audioLength)-segEndTime > float32(threshold) {
|
|
||||||
if sv != nil {
|
|
||||||
trigger, eouLag := lts.commitTrigger(eouPending, float64(segEndTime))
|
|
||||||
xlog.Info("semantic_vad: committing turn",
|
|
||||||
"trigger", trigger,
|
|
||||||
"speech_end_s", segEndTime,
|
|
||||||
"eou_lag_s", eouLag,
|
|
||||||
"silence_s", audioLength-float64(segEndTime),
|
|
||||||
"audio_s", audioLength)
|
|
||||||
}
|
|
||||||
// Retranscribe gate (semantic mode, EOU-triggered commits
|
|
||||||
// only): cross-check the streamed EOU with an offline decode
|
|
||||||
// of the buffered turn before committing. Runs synchronously
|
|
||||||
// on the tick — the engine would serialize a concurrent feed
|
|
||||||
// against it anyway. Timeout-triggered commits skip the gate.
|
|
||||||
var gated *schema.TranscriptionResult
|
|
||||||
if retranscribe && eouPending {
|
|
||||||
batch, gerr := transcribeUtterance(vadContext, sound.Int16toBytesLE(aints), session)
|
|
||||||
switch {
|
|
||||||
case gerr != nil:
|
|
||||||
xlog.Warn("semantic_vad: retranscribe gate failed; committing via the file path", "error", gerr)
|
|
||||||
case !batch.Eou:
|
|
||||||
xlog.Info("semantic_vad: batch decode did not confirm the streamed EOU; continuing to listen",
|
|
||||||
"streamed", lts.previewText(), "batch", batch.Text)
|
|
||||||
// The batch decode rejected the streamed EOU as a false
|
|
||||||
// positive: consume the recorded EOU so the next tick
|
|
||||||
// falls back to the eagerness window instead of
|
|
||||||
// re-triggering on the same token.
|
|
||||||
lts.eouAtSec = 0
|
|
||||||
continue
|
|
||||||
default:
|
|
||||||
xlog.Info("semantic_vad: batch decode confirmed the streamed EOU",
|
|
||||||
"streamed", lts.previewText(), "batch", batch.Text)
|
|
||||||
gated = batch
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
xlog.Debug("Detected end of speech segment")
|
xlog.Debug("Detected end of speech segment")
|
||||||
session.AudioBufferLock.Lock()
|
session.AudioBufferLock.Lock()
|
||||||
// Keep audio appended while this tick ran — it belongs to
|
session.InputAudioBuffer = nil
|
||||||
// the next turn (in any mode: nil-ing it dropped the onset
|
|
||||||
// of an utterance started right after a commit).
|
|
||||||
session.InputAudioBuffer = dropInspectedPrefix(session.InputAudioBuffer, len(allAudio), 0)
|
|
||||||
session.AudioBufferLock.Unlock()
|
session.AudioBufferLock.Unlock()
|
||||||
|
|
||||||
// Commit the turn through the coordinator: it emits speech_stopped
|
sendEvent(t, types.InputAudioBufferSpeechStoppedEvent{
|
||||||
// (EmitSpeechStopped) then the committed event, finalizes the live
|
ServerEventBase: types.ServerEventBase{
|
||||||
// stream, and issues the response (CommitTurn). The committed item
|
EventID: "event_TODO",
|
||||||
// id is the coordinator's turn id (== the id the live captions
|
},
|
||||||
// streamed under), so the client replaces the partial text.
|
AudioEndMs: time.Since(startTime).Milliseconds(),
|
||||||
sink.commitAudio = sound.Int16toBytesLE(aints)
|
})
|
||||||
sink.commitAudioLength = audioLength
|
speechStarted = false
|
||||||
sink.commitRetranscribe = retranscribe
|
|
||||||
sink.commitGated = gated
|
sendEvent(t, types.InputAudioBufferCommittedEvent{
|
||||||
// TODO: Remove prefix silence that is over TurnDetectionParams.PrefixPaddingMs
|
ServerEventBase: types.ServerEventBase{
|
||||||
if err := sink.coord.Apply(turncoord.Silence{}); err != nil {
|
EventID: "event_TODO",
|
||||||
xlog.Error("turncoord: commit failed", "error", err)
|
},
|
||||||
}
|
ItemID: generateItemID(),
|
||||||
|
PreviousItemID: "TODO",
|
||||||
|
})
|
||||||
|
|
||||||
|
abytes := sound.Int16toBytesLE(aints)
|
||||||
|
// TODO: Remove prefix silence that is is over TurnDetectionParams.PrefixPaddingMs
|
||||||
|
respCtx, respDone := session.startResponse(vadContext)
|
||||||
|
go func() {
|
||||||
|
defer close(respDone)
|
||||||
|
commitUtterance(respCtx, abytes, session, conv, t)
|
||||||
|
}()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Conversation, t Transport) {
|
func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Conversation, t Transport) {
|
||||||
commitUtteranceWithTranscript(ctx, utt, nil, nil, "", session, conv, t)
|
|
||||||
}
|
|
||||||
|
|
||||||
// commitUtteranceWithTranscript commits one user turn. live carries the
|
|
||||||
// transcript semantic_vad's live stream already produced (its caption deltas
|
|
||||||
// were streamed to the client during the turn, so only the completed event
|
|
||||||
// is emitted here); gated carries the retranscribe gate's batch decode (the
|
|
||||||
// authoritative transcript in that mode). With neither — server_vad, manual
|
|
||||||
// commits, semantic degrade, or a live stream that heard nothing — the audio
|
|
||||||
// is written to a temp WAV and transcribed via the file path as before.
|
|
||||||
// itemID is the turn's conversation item id ("" mints a fresh one); it must
|
|
||||||
// match the id any live deltas were sent under.
|
|
||||||
func commitUtteranceWithTranscript(ctx context.Context, utt []byte, live *liveUtterance, gated *schema.TranscriptionResult, itemID string, session *Session, conv *Conversation, t Transport) {
|
|
||||||
if len(utt) == 0 {
|
if len(utt) == 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -1566,37 +1466,14 @@ func commitUtteranceWithTranscript(ctx context.Context, utt []byte, live *liveUt
|
|||||||
}
|
}
|
||||||
|
|
||||||
// TODO: If we have a real any-to-any model then transcription is optional
|
// TODO: If we have a real any-to-any model then transcription is optional
|
||||||
|
|
||||||
// The turn's live captions (semantic_vad) already streamed under this
|
|
||||||
// itemID; the completed event below reuses it so the client replaces the
|
|
||||||
// partial text. server_vad / manual commits arrive with no itemID, so mint
|
|
||||||
// one here.
|
|
||||||
if itemID == "" {
|
|
||||||
itemID = generateItemID()
|
|
||||||
}
|
|
||||||
|
|
||||||
var transcript string
|
var transcript string
|
||||||
switch {
|
switch {
|
||||||
case gated != nil:
|
|
||||||
// semantic_vad retranscribe gate: the batch decode is authoritative.
|
|
||||||
transcript = gated.Text
|
|
||||||
if err := emitPrecomputedTranscription(t, itemID, nil, transcript); err != nil {
|
|
||||||
sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
case live != nil && live.Text != "":
|
|
||||||
// The caption deltas already streamed during the turn under this
|
|
||||||
// itemID; the completed event replaces the partial text client-side.
|
|
||||||
transcript = live.Text
|
|
||||||
if err := emitPrecomputedTranscription(t, itemID, nil, transcript); err != nil {
|
|
||||||
sendError(t, "transcription_failed", err.Error(), "", "event_TODO")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
case session.InputAudioTranscription != nil:
|
case session.InputAudioTranscription != nil:
|
||||||
// emitTranscription streams transcript deltas when
|
// emitTranscription streams transcript deltas when
|
||||||
// pipeline.streaming.transcription is set, otherwise emits a single
|
// pipeline.streaming.transcription is set, otherwise emits a single
|
||||||
// completed event; either way it returns the final transcript text.
|
// completed event; either way it returns the final transcript text.
|
||||||
transcript, err = emitTranscription(ctx, t, session, itemID, f.Name())
|
var err error
|
||||||
|
transcript, err = emitTranscription(ctx, t, session, generateItemID(), f.Name())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// Drain the gate goroutine before returning so its in-flight read of
|
// Drain the gate goroutine before returning so its in-flight read of
|
||||||
// the temp WAV finishes before the deferred os.Remove fires.
|
// the temp WAV finishes before the deferred os.Remove fires.
|
||||||
@@ -1765,56 +1642,6 @@ func writeWindowWAV(pcm []byte, sampleRate int) (string, error) {
|
|||||||
return f.Name(), nil
|
return f.Name(), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// writeUtteranceWAV persists raw 16 kHz mono PCM to a temp WAV for the
|
|
||||||
// file-based transcription paths. The caller must invoke cleanup.
|
|
||||||
func writeUtteranceWAV(utt []byte) (string, func(), error) {
|
|
||||||
f, err := os.CreateTemp("", "realtime-audio-chunk-*.wav")
|
|
||||||
if err != nil {
|
|
||||||
return "", nil, err
|
|
||||||
}
|
|
||||||
cleanup := func() {
|
|
||||||
_ = f.Close()
|
|
||||||
_ = os.Remove(f.Name())
|
|
||||||
}
|
|
||||||
xlog.Debug("Writing to file", "file", f.Name())
|
|
||||||
|
|
||||||
hdr := laudio.NewWAVHeader(uint32(len(utt)))
|
|
||||||
if err := hdr.Write(f); err != nil {
|
|
||||||
cleanup()
|
|
||||||
return "", nil, err
|
|
||||||
}
|
|
||||||
if _, err := f.Write(utt); err != nil {
|
|
||||||
cleanup()
|
|
||||||
return "", nil, err
|
|
||||||
}
|
|
||||||
_ = f.Sync()
|
|
||||||
return f.Name(), cleanup, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// transcribeUtterance runs one offline (unary) decode of the buffered turn —
|
|
||||||
// the semantic_vad retranscribe gate. The result's Eou flag reports whether
|
|
||||||
// the batch decode also ended on the end-of-utterance token.
|
|
||||||
func transcribeUtterance(ctx context.Context, utt []byte, session *Session) (*schema.TranscriptionResult, error) {
|
|
||||||
path, cleanup, err := writeUtteranceWAV(utt)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
defer cleanup()
|
|
||||||
|
|
||||||
language, prompt := "", ""
|
|
||||||
if cfg := session.InputAudioTranscription; cfg != nil {
|
|
||||||
language, prompt = cfg.Language, cfg.Prompt
|
|
||||||
}
|
|
||||||
tr, err := session.ModelInterface.Transcribe(ctx, path, language, false, false, prompt)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
if tr == nil {
|
|
||||||
return nil, fmt.Errorf("transcribe result is nil")
|
|
||||||
}
|
|
||||||
return tr, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADSegment, error) {
|
func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADSegment, error) {
|
||||||
soundIntBuffer := &audio.IntBuffer{
|
soundIntBuffer := &audio.IntBuffer{
|
||||||
Format: &audio.Format{SampleRate: localSampleRate, NumChannels: 1},
|
Format: &audio.Format{SampleRate: localSampleRate, NumChannels: 1},
|
||||||
@@ -1894,100 +1721,14 @@ func generateResponse(ctx context.Context, session *Session, utt []byte, transcr
|
|||||||
// without another response cycle.
|
// without another response cycle.
|
||||||
const maxAssistantToolTurns = 10
|
const maxAssistantToolTurns = 10
|
||||||
|
|
||||||
// responseOutcome is how a response ended, decided by the response body and
|
|
||||||
// read once by triggerResponse to emit the single terminal event.
|
|
||||||
type responseOutcome int
|
|
||||||
|
|
||||||
const (
|
|
||||||
outcomeCompleted responseOutcome = iota
|
|
||||||
outcomeCancelled
|
|
||||||
outcomeFailed // an error event was already sent; emit no terminal (legacy behavior)
|
|
||||||
)
|
|
||||||
|
|
||||||
// liveResponse accumulates the wire-visible result of ONE response.create across
|
|
||||||
// the whole agentic tool-turn recursion: a single id, the output items as they
|
|
||||||
// complete, the summed token usage, and the final outcome. triggerResponse owns
|
|
||||||
// it; triggerResponseAtTurn / streamLLMResponse / emitToolCallItems fill it in.
|
|
||||||
// This is what makes "exactly one response.done per response.create, with Output
|
|
||||||
// and Usage populated" true — the body no longer emits per-turn terminals.
|
|
||||||
type liveResponse struct {
|
|
||||||
id string
|
|
||||||
output []types.MessageItemUnion
|
|
||||||
usage backend.TokenUsage
|
|
||||||
outcome responseOutcome
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *liveResponse) addItem(it types.MessageItemUnion) { r.output = append(r.output, it) }
|
|
||||||
|
|
||||||
func (r *liveResponse) addUsage(u backend.TokenUsage) {
|
|
||||||
r.usage.Prompt += u.Prompt
|
|
||||||
r.usage.Completion += u.Completion
|
|
||||||
}
|
|
||||||
|
|
||||||
// responseUsage maps the backend's token counts onto the OpenAI Realtime
|
|
||||||
// response.usage shape. Returns nil when there is nothing to report so the
|
|
||||||
// field is omitted rather than sent as zeros.
|
|
||||||
func responseUsage(u backend.TokenUsage) *types.TokenUsage {
|
|
||||||
if u.Prompt == 0 && u.Completion == 0 {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
return &types.TokenUsage{
|
|
||||||
InputTokens: u.Prompt,
|
|
||||||
OutputTokens: u.Completion,
|
|
||||||
TotalTokens: u.Prompt + u.Completion,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func triggerResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams) {
|
func triggerResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams) {
|
||||||
// One response.created and one response.done per response.create — even when
|
triggerResponseAtTurn(ctx, session, conv, t, overrides, 0)
|
||||||
// the server-side tool loop runs several inference turns. The per-turn
|
|
||||||
// terminals the legacy code emitted (one response.done per turn, with empty
|
|
||||||
// Output/Usage) are gone; tool turns are now internal to this single response.
|
|
||||||
r := &liveResponse{id: generateUniqueID()}
|
|
||||||
sendEvent(t, types.ResponseCreatedEvent{
|
|
||||||
ServerEventBase: types.ServerEventBase{},
|
|
||||||
Response: types.Response{
|
|
||||||
ID: r.id,
|
|
||||||
Object: "realtime.response",
|
|
||||||
Status: types.ResponseStatusInProgress,
|
|
||||||
},
|
|
||||||
})
|
|
||||||
|
|
||||||
triggerResponseAtTurn(ctx, session, conv, t, overrides, 0, r)
|
|
||||||
|
|
||||||
switch r.outcome {
|
|
||||||
case outcomeCancelled:
|
|
||||||
sendEvent(t, types.ResponseDoneEvent{
|
|
||||||
ServerEventBase: types.ServerEventBase{},
|
|
||||||
Response: types.Response{
|
|
||||||
ID: r.id,
|
|
||||||
Object: "realtime.response",
|
|
||||||
Status: types.ResponseStatusCancelled,
|
|
||||||
Output: r.output,
|
|
||||||
},
|
|
||||||
})
|
|
||||||
case outcomeFailed:
|
|
||||||
// A specific error event was already sent; emit no terminal (matches the
|
|
||||||
// legacy behavior where failed responses had no response.done).
|
|
||||||
default:
|
|
||||||
sendEvent(t, types.ResponseDoneEvent{
|
|
||||||
ServerEventBase: types.ServerEventBase{},
|
|
||||||
Response: types.Response{
|
|
||||||
ID: r.id,
|
|
||||||
Object: "realtime.response",
|
|
||||||
Status: types.ResponseStatusCompleted,
|
|
||||||
Output: r.output,
|
|
||||||
Usage: responseUsage(r.usage),
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fold aged-out turns into the rolling memory off the critical path; the
|
// Fold aged-out turns into the rolling memory off the critical path; the
|
||||||
// next turn reaps the smaller buffer.
|
// next turn reaps the smaller buffer.
|
||||||
session.maybeCompact(conv)
|
session.maybeCompact(conv)
|
||||||
}
|
}
|
||||||
|
|
||||||
func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams, toolTurn int, r *liveResponse) {
|
func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams, toolTurn int) {
|
||||||
config := session.ModelInterface.PredictConfig()
|
config := session.ModelInterface.PredictConfig()
|
||||||
|
|
||||||
// Default values
|
// Default values
|
||||||
@@ -2150,9 +1891,15 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
|
|||||||
images = append(images, m.StringImages...)
|
images = append(images, m.StringImages...)
|
||||||
}
|
}
|
||||||
|
|
||||||
// response.created/done are emitted once per response.create by triggerResponse;
|
responseID := generateUniqueID()
|
||||||
// every turn (including agentic recursion) shares this id.
|
sendEvent(t, types.ResponseCreatedEvent{
|
||||||
responseID := r.id
|
ServerEventBase: types.ServerEventBase{},
|
||||||
|
Response: types.Response{
|
||||||
|
ID: responseID,
|
||||||
|
Object: "realtime.response",
|
||||||
|
Status: types.ResponseStatusInProgress,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
// Streamed LLM path: when the pipeline opts into LLM streaming, stream the
|
// Streamed LLM path: when the pipeline opts into LLM streaming, stream the
|
||||||
// transcript to the client as it is generated and synthesize the buffered
|
// transcript to the client as it is generated and synthesize the buffered
|
||||||
@@ -2168,7 +1915,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
|
|||||||
respMods = overrides.OutputModalities
|
respMods = overrides.OutputModalities
|
||||||
}
|
}
|
||||||
if canStream && modalitiesContainAudio(resolveOutputModalities(session.OutputModalities, respMods)) {
|
if canStream && modalitiesContainAudio(resolveOutputModalities(session.OutputModalities, respMods)) {
|
||||||
if streamLLMResponse(ctx, session, conv, t, r, conversationHistory, images, config, tools, toolChoice, toolTurn) {
|
if streamLLMResponse(ctx, session, conv, t, responseID, conversationHistory, images, config, tools, toolChoice, toolTurn) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2177,22 +1924,26 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
|
|||||||
predFunc, err := session.ModelInterface.Predict(ctx, conversationHistory, images, nil, nil, nil, tools, toolChoice, nil, nil, nil)
|
predFunc, err := session.ModelInterface.Predict(ctx, conversationHistory, images, nil, nil, nil, tools, toolChoice, nil, nil, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", "") // item.Assistant.ID is unknown here
|
sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", "") // item.Assistant.ID is unknown here
|
||||||
r.outcome = outcomeFailed
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
pred, err := predFunc()
|
pred, err := predFunc()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
sendError(t, "prediction_failed", fmt.Sprintf("backend error: %v", err), "", "")
|
sendError(t, "prediction_failed", fmt.Sprintf("backend error: %v", err), "", "")
|
||||||
r.outcome = outcomeFailed
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
r.addUsage(pred.Usage)
|
|
||||||
|
|
||||||
// Check for cancellation after LLM inference (barge-in may have fired)
|
// Check for cancellation after LLM inference (barge-in may have fired)
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
xlog.Debug("Response cancelled after LLM inference (barge-in)")
|
xlog.Debug("Response cancelled after LLM inference (barge-in)")
|
||||||
r.outcome = outcomeCancelled
|
sendEvent(t, types.ResponseDoneEvent{
|
||||||
|
ServerEventBase: types.ServerEventBase{},
|
||||||
|
Response: types.Response{
|
||||||
|
ID: responseID,
|
||||||
|
Object: "realtime.response",
|
||||||
|
Status: types.ResponseStatusCancelled,
|
||||||
|
},
|
||||||
|
})
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2352,12 +2103,18 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
|
|||||||
conv.Lock.Unlock()
|
conv.Lock.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
// sendCancelledResponse records the cancelled outcome (triggerResponse
|
// sendCancelledResponse emits the cancelled status and cleans up the
|
||||||
// emits the single terminal) and cleans up the partial assistant item so
|
// assistant item so the interrupted reply is not in chat history.
|
||||||
// the interrupted reply is not in chat history.
|
|
||||||
sendCancelledResponse := func() {
|
sendCancelledResponse := func() {
|
||||||
removeItemFromConv(item.Assistant.ID)
|
removeItemFromConv(item.Assistant.ID)
|
||||||
r.outcome = outcomeCancelled
|
sendEvent(t, types.ResponseDoneEvent{
|
||||||
|
ServerEventBase: types.ServerEventBase{},
|
||||||
|
Response: types.Response{
|
||||||
|
ID: responseID,
|
||||||
|
Object: "realtime.response",
|
||||||
|
Status: types.ResponseStatusCancelled,
|
||||||
|
},
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
var audioString string
|
var audioString string
|
||||||
@@ -2406,7 +2163,6 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
|
|||||||
}
|
}
|
||||||
xlog.Error("TTS failed", "error", err)
|
xlog.Error("TTS failed", "error", err)
|
||||||
sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID)
|
sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID)
|
||||||
r.outcome = outcomeFailed
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if !isWebRTC {
|
if !isWebRTC {
|
||||||
@@ -2464,13 +2220,12 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
|
|||||||
OutputIndex: 0,
|
OutputIndex: 0,
|
||||||
Item: item,
|
Item: item,
|
||||||
})
|
})
|
||||||
r.addItem(item)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Emit the parsed tool calls and (for server-side assistant tools) the
|
// Emit the parsed tool calls, the terminal response.done, and (for
|
||||||
// follow-up turn. Shared with the streamed path so both finalize tool calls
|
// server-side assistant tools) the follow-up response. Shared with the
|
||||||
// identically. The single terminal is emitted by triggerResponse.
|
// streamed path so both finalize tool calls identically.
|
||||||
emitToolCallItems(ctx, session, conv, t, r, finalToolCalls, finalSpeech != "", toolTurn)
|
emitToolCallItems(ctx, session, conv, t, responseID, finalToolCalls, finalSpeech != "", toolTurn)
|
||||||
}
|
}
|
||||||
|
|
||||||
// emitToolCallItems emits the realtime function_call items for the parsed tool
|
// emitToolCallItems emits the realtime function_call items for the parsed tool
|
||||||
@@ -2484,8 +2239,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
|
|||||||
// - All other tools follow the standard OpenAI flow: emit
|
// - All other tools follow the standard OpenAI flow: emit
|
||||||
// function_call_arguments.done and wait for the client to send
|
// function_call_arguments.done and wait for the client to send
|
||||||
// conversation.item.create back.
|
// conversation.item.create back.
|
||||||
func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation, t Transport, r *liveResponse, toolCalls []functions.FuncCallResults, hasContent bool, toolTurn int) {
|
func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation, t Transport, responseID string, toolCalls []functions.FuncCallResults, hasContent bool, toolTurn int) {
|
||||||
responseID := r.id
|
|
||||||
xlog.Debug("About to handle tool calls", "finalToolCallsCount", len(toolCalls))
|
xlog.Debug("About to handle tool calls", "finalToolCallsCount", len(toolCalls))
|
||||||
executedAssistantTool := false
|
executedAssistantTool := false
|
||||||
for i, tc := range toolCalls {
|
for i, tc := range toolCalls {
|
||||||
@@ -2548,7 +2302,6 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation
|
|||||||
OutputIndex: outputIndex,
|
OutputIndex: outputIndex,
|
||||||
Item: fcItem,
|
Item: fcItem,
|
||||||
})
|
})
|
||||||
r.addItem(fcItem)
|
|
||||||
sendEvent(t, types.ResponseOutputItemAddedEvent{
|
sendEvent(t, types.ResponseOutputItemAddedEvent{
|
||||||
ServerEventBase: types.ServerEventBase{},
|
ServerEventBase: types.ServerEventBase{},
|
||||||
ResponseID: responseID,
|
ResponseID: responseID,
|
||||||
@@ -2561,7 +2314,6 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation
|
|||||||
OutputIndex: outputIndex,
|
OutputIndex: outputIndex,
|
||||||
Item: foItem,
|
Item: foItem,
|
||||||
})
|
})
|
||||||
r.addItem(foItem)
|
|
||||||
executedAssistantTool = true
|
executedAssistantTool = true
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -2591,25 +2343,28 @@ func emitToolCallItems(ctx context.Context, session *Session, conv *Conversation
|
|||||||
OutputIndex: outputIndex,
|
OutputIndex: outputIndex,
|
||||||
Item: fcItem,
|
Item: fcItem,
|
||||||
})
|
})
|
||||||
r.addItem(fcItem)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// No terminal here: triggerResponse emits the single response.done once the
|
sendEvent(t, types.ResponseDoneEvent{
|
||||||
// whole turn (including the agentic recursion below) completes.
|
ServerEventBase: types.ServerEventBase{},
|
||||||
|
Response: types.Response{
|
||||||
|
ID: responseID,
|
||||||
|
Object: "realtime.response",
|
||||||
|
Status: types.ResponseStatusCompleted,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
// If we executed any assistant tools inproc, run another response cycle
|
// If we executed any assistant tools inproc, run another response cycle
|
||||||
// so the model can speak the result. Mirrors the chat-side agentic loop
|
// so the model can speak the result. Mirrors the chat-side agentic loop
|
||||||
// but driven server-side rather than by client round-trip. Bounded so a
|
// but driven server-side rather than by client round-trip. Bounded so a
|
||||||
// degenerate "model keeps calling tools" doesn't blow the stack. The
|
// degenerate "model keeps calling tools" doesn't blow the stack.
|
||||||
// follow-up turn shares the same liveResponse, so its output accumulates
|
|
||||||
// into the one response.done.
|
|
||||||
if executedAssistantTool {
|
if executedAssistantTool {
|
||||||
if toolTurn+1 >= maxAssistantToolTurns {
|
if toolTurn+1 >= maxAssistantToolTurns {
|
||||||
xlog.Warn("realtime: assistant tool-turn limit reached, stopping the agentic loop",
|
xlog.Warn("realtime: assistant tool-turn limit reached, stopping the agentic loop",
|
||||||
"limit", maxAssistantToolTurns, "model", session.Model)
|
"limit", maxAssistantToolTurns, "model", session.Model)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
triggerResponseAtTurn(ctx, session, conv, t, nil, toolTurn+1, r)
|
triggerResponseAtTurn(ctx, session, conv, t, nil, toolTurn+1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,79 +0,0 @@
|
|||||||
package openai
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"sync"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/compactcoord"
|
|
||||||
"github.com/mudler/xlog"
|
|
||||||
)
|
|
||||||
|
|
||||||
// compactionSink wires the explicit compaction state machine
|
|
||||||
// (compactcoord.Coordinator — machine "M4" in docs/design/realtime-state-machines.md)
|
|
||||||
// into a conversation.
|
|
||||||
//
|
|
||||||
// It replaces the legacy `compacting atomic.Bool` single-flight guard: the
|
|
||||||
// coordinator owns whether a compaction is running, so a Trigger while one is
|
|
||||||
// already in flight is dropped (single-flight) and the background goroutine
|
|
||||||
// always reports Finished — the flag can never stick (invariant #9).
|
|
||||||
//
|
|
||||||
// run is the summarize+evict work for this conversation (captured at
|
|
||||||
// construction); StartCompaction spawns it and reports Finished when it returns.
|
|
||||||
// It takes a context derived from the sink's session-scoped ctx, so shutdown()
|
|
||||||
// can cancel an in-flight compaction.
|
|
||||||
type compactionSink struct {
|
|
||||||
coord *compactcoord.Coordinator
|
|
||||||
run func(ctx context.Context)
|
|
||||||
ctx context.Context
|
|
||||||
cancel context.CancelFunc
|
|
||||||
wg sync.WaitGroup
|
|
||||||
}
|
|
||||||
|
|
||||||
func newCompactionSink(run func(ctx context.Context)) *compactionSink {
|
|
||||||
s := &compactionSink{run: run}
|
|
||||||
s.ctx, s.cancel = context.WithCancel(context.Background())
|
|
||||||
s.coord = compactcoord.New(s)
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
|
|
||||||
// trigger asks the coordinator to start a compaction; a no-op while one is
|
|
||||||
// already running or after shutdown. Non-blocking.
|
|
||||||
func (s *compactionSink) trigger() {
|
|
||||||
if err := s.coord.Apply(compactcoord.Trigger{}); err != nil {
|
|
||||||
xlog.Error("compactcoord: trigger failed", "error", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// shutdown is called by the connection (M1) parent's teardown: cancel any
|
|
||||||
// in-flight compaction, join it, then move the coordinator to Terminated so no
|
|
||||||
// compaction can start afterwards. This closes the legacy gap where the
|
|
||||||
// fire-and-forget compaction goroutine could outlive the session. Cancelling the
|
|
||||||
// context first makes the in-flight summarizer Predict return promptly, so the
|
|
||||||
// join is bounded.
|
|
||||||
func (s *compactionSink) shutdown() {
|
|
||||||
s.cancel()
|
|
||||||
s.wg.Wait()
|
|
||||||
if err := s.coord.Apply(compactcoord.Shutdown{}); err != nil {
|
|
||||||
xlog.Error("compactcoord: shutdown apply failed", "error", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Perform executes one effect. Called under the coordinator lock; StartCompaction
|
|
||||||
// only spawns a goroutine, so it does not block.
|
|
||||||
func (s *compactionSink) Perform(e compactcoord.Effect) {
|
|
||||||
switch e.(type) {
|
|
||||||
case compactcoord.StartCompaction:
|
|
||||||
s.wg.Add(1)
|
|
||||||
go func() {
|
|
||||||
defer s.wg.Done()
|
|
||||||
defer func() {
|
|
||||||
if err := s.coord.Apply(compactcoord.Finished{}); err != nil {
|
|
||||||
xlog.Error("compactcoord: finished apply failed", "error", err)
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
if s.run != nil {
|
|
||||||
s.run(s.ctx)
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -222,7 +222,7 @@ func prefixMatches(items, snapshot []*types.MessageItemUnion) bool {
|
|||||||
// conv.Lock across the summarizer call: snapshot under lock, summarize unlocked,
|
// conv.Lock across the summarizer call: snapshot under lock, summarize unlocked,
|
||||||
// commit under lock (re-validating the head is unchanged). On any error it
|
// commit under lock (re-validating the head is unchanged). On any error it
|
||||||
// leaves the conversation untouched — items are never dropped without a summary.
|
// leaves the conversation untouched — items are never dropped without a summary.
|
||||||
func (s *Session) compact(ctx context.Context, conv *Conversation, model Model) {
|
func (s *Session) compact(conv *Conversation, model Model) {
|
||||||
if model == nil {
|
if model == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -241,10 +241,9 @@ func (s *Session) compact(ctx context.Context, conv *Conversation, model Model)
|
|||||||
prior := conv.Memory
|
prior := conv.Memory
|
||||||
conv.Lock.Unlock()
|
conv.Lock.Unlock()
|
||||||
|
|
||||||
// Summarize (unlocked). The timeout is derived from the caller's ctx so the
|
// Summarize (unlocked).
|
||||||
// connection teardown can cancel an in-flight summary (bounding the join).
|
|
||||||
msgs := buildSummaryMessages(prior, renderItemsTranscript(overflow), s.MaxSummaryTokens)
|
msgs := buildSummaryMessages(prior, renderItemsTranscript(overflow), s.MaxSummaryTokens)
|
||||||
ctx, cancel := context.WithTimeout(ctx, compactionTimeout)
|
ctx, cancel := context.WithTimeout(context.Background(), compactionTimeout)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
predFunc, err := model.Predict(ctx, msgs, nil, nil, nil, nil, nil, nil, nil, nil, nil)
|
predFunc, err := model.Predict(ctx, msgs, nil, nil, nil, nil, nil, nil, nil, nil, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -299,13 +298,9 @@ func (s *Session) summarizerModel() Model {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// maybeCompact schedules a background compaction when the live buffer has grown
|
// maybeCompact schedules a background compaction when the live buffer has grown
|
||||||
// past the trigger and none is already running. Returns immediately. The
|
// past the trigger and none is already running. Returns immediately.
|
||||||
// single-flight guarantee (at most one compaction per conversation) is owned by
|
|
||||||
// the compaction coordinator (M4); see realtime_compactcoord.go. The actual
|
|
||||||
// summarize+evict work (and the lazy summary_model load) is the conversation's
|
|
||||||
// compaction-sink run closure, so it stays off the response path.
|
|
||||||
func (s *Session) maybeCompact(conv *Conversation) {
|
func (s *Session) maybeCompact(conv *Conversation) {
|
||||||
if !s.CompactionEnabled || conv.compaction == nil {
|
if !s.CompactionEnabled {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
conv.Lock.Lock()
|
conv.Lock.Lock()
|
||||||
@@ -314,5 +309,18 @@ func (s *Session) maybeCompact(conv *Conversation) {
|
|||||||
if !over {
|
if !over {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
conv.compaction.trigger()
|
if !conv.compacting.CompareAndSwap(false, true) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
go func() {
|
||||||
|
defer conv.compacting.Store(false)
|
||||||
|
// Resolve (and, for a configured summary_model, lazily load) the
|
||||||
|
// summarizer only when a compaction actually runs, off the response
|
||||||
|
// path — so the model load never blocks a user turn.
|
||||||
|
model := s.summarizerModel()
|
||||||
|
if model == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.compact(conv, model)
|
||||||
|
}()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
package openai
|
package openai
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
|
||||||
"errors"
|
"errors"
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
. "github.com/onsi/ginkgo/v2"
|
||||||
@@ -199,7 +198,7 @@ var _ = Describe("compact", func() {
|
|||||||
s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
|
s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
|
||||||
m := &fakeModel{predictResp: backend.LLMResponse{Response: "ROLLED UP"}}
|
m := &fakeModel{predictResp: backend.LLMResponse{Response: "ROLLED UP"}}
|
||||||
|
|
||||||
s.compact(context.Background(), conv, m)
|
s.compact(conv, m)
|
||||||
|
|
||||||
Expect(conv.Memory).To(Equal("ROLLED UP"))
|
Expect(conv.Memory).To(Equal("ROLLED UP"))
|
||||||
Expect(len(conv.Items)).To(Equal(4))
|
Expect(len(conv.Items)).To(Equal(4))
|
||||||
@@ -214,7 +213,7 @@ var _ = Describe("compact", func() {
|
|||||||
s := &Session{CompactionEnabled: true, CompactionTrigger: 2, MaxHistoryItems: 1, MaxSummaryTokens: 512}
|
s := &Session{CompactionEnabled: true, CompactionTrigger: 2, MaxHistoryItems: 1, MaxSummaryTokens: 512}
|
||||||
m := &fakeModel{predictErr: errors.New("boom")}
|
m := &fakeModel{predictErr: errors.New("boom")}
|
||||||
|
|
||||||
s.compact(context.Background(), conv, m)
|
s.compact(conv, m)
|
||||||
|
|
||||||
Expect(conv.Memory).To(Equal(""))
|
Expect(conv.Memory).To(Equal(""))
|
||||||
Expect(len(conv.Items)).To(Equal(3))
|
Expect(len(conv.Items)).To(Equal(3))
|
||||||
@@ -228,7 +227,7 @@ var _ = Describe("compact", func() {
|
|||||||
s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
|
s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4, MaxSummaryTokens: 512}
|
||||||
m := &fakeModel{predictResp: backend.LLMResponse{Response: "<think>planning the summary</think>CLEAN SUMMARY"}}
|
m := &fakeModel{predictResp: backend.LLMResponse{Response: "<think>planning the summary</think>CLEAN SUMMARY"}}
|
||||||
|
|
||||||
s.compact(context.Background(), conv, m)
|
s.compact(conv, m)
|
||||||
|
|
||||||
Expect(conv.Memory).To(Equal("CLEAN SUMMARY"))
|
Expect(conv.Memory).To(Equal("CLEAN SUMMARY"))
|
||||||
Expect(conv.Memory).ToNot(ContainSubstring("planning"))
|
Expect(conv.Memory).ToNot(ContainSubstring("planning"))
|
||||||
@@ -237,7 +236,7 @@ var _ = Describe("compact", func() {
|
|||||||
It("does nothing when items are at or below the trigger", func() {
|
It("does nothing when items are at or below the trigger", func() {
|
||||||
conv := &Conversation{Items: []*types.MessageItemUnion{user("1", "a")}}
|
conv := &Conversation{Items: []*types.MessageItemUnion{user("1", "a")}}
|
||||||
s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4}
|
s := &Session{CompactionEnabled: true, CompactionTrigger: 7, MaxHistoryItems: 4}
|
||||||
s.compact(context.Background(), conv, &fakeModel{predictResp: backend.LLMResponse{Response: "x"}})
|
s.compact(conv, &fakeModel{predictResp: backend.LLMResponse{Response: "x"}})
|
||||||
Expect(conv.Memory).To(Equal(""))
|
Expect(conv.Memory).To(Equal(""))
|
||||||
Expect(len(conv.Items)).To(Equal(1))
|
Expect(len(conv.Items)).To(Equal(1))
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -1,122 +0,0 @@
|
|||||||
package openai
|
|
||||||
|
|
||||||
import (
|
|
||||||
"sync"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/conncoord"
|
|
||||||
"github.com/mudler/xlog"
|
|
||||||
)
|
|
||||||
|
|
||||||
// connSink wires the explicit connection-lifecycle state machine
|
|
||||||
// (conncoord.Coordinator — machine "M1" in docs/design/realtime-state-machines.md)
|
|
||||||
// into the realtime session handler.
|
|
||||||
//
|
|
||||||
// It replaces the legacy vadServerStarted bool + the `done` channel that was
|
|
||||||
// reassigned on every turn-detection toggle and closed from two sites (Part 2,
|
|
||||||
// failure mode 6). The coordinator owns whether the VAD goroutine is running, so
|
|
||||||
// the per-run done channel is created and closed in lockstep with that one state
|
|
||||||
// — closed exactly once, never resurrected after teardown.
|
|
||||||
//
|
|
||||||
// The connection machine is driven by the single session goroutine (the handler
|
|
||||||
// loop and its teardown), so this sink and its coordinator are loop-local; the
|
|
||||||
// Coordinator's lock only keeps State() race-free.
|
|
||||||
//
|
|
||||||
// Effects:
|
|
||||||
// - StartVAD: create a fresh done channel and spawn handleVAD on it (joined via wg).
|
|
||||||
// - StopVAD: close that done channel.
|
|
||||||
// - Teardown: stop the remaining input goroutines (opus decode, sound window),
|
|
||||||
// join everything, cancel in-flight responses, and remove the session — once.
|
|
||||||
type connSink struct {
|
|
||||||
session *Session
|
|
||||||
sessionID string
|
|
||||||
transport Transport
|
|
||||||
wg *sync.WaitGroup
|
|
||||||
|
|
||||||
coord *conncoord.Coordinator
|
|
||||||
|
|
||||||
// vadDone is the current VAD run's stop signal — recreated on each StartVAD,
|
|
||||||
// closed by StopVAD / Teardown. Owned solely by Perform (single goroutine).
|
|
||||||
vadDone chan struct{}
|
|
||||||
|
|
||||||
// One-shot stop signals for the other input goroutines, registered by the
|
|
||||||
// handler when it starts them; closed once by Teardown.
|
|
||||||
decodeDone chan struct{}
|
|
||||||
soundWindowDone chan struct{}
|
|
||||||
}
|
|
||||||
|
|
||||||
func newConnSink(session *Session, sessionID string, t Transport, wg *sync.WaitGroup) *connSink {
|
|
||||||
s := &connSink{
|
|
||||||
session: session,
|
|
||||||
sessionID: sessionID,
|
|
||||||
transport: t,
|
|
||||||
wg: wg,
|
|
||||||
}
|
|
||||||
s.coord = conncoord.New(s)
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
|
|
||||||
// setVAD requests the turn-detection goroutine match active. Idempotent.
|
|
||||||
func (s *connSink) setVAD(active bool) {
|
|
||||||
if err := s.coord.Apply(conncoord.SetVAD{Active: active}); err != nil {
|
|
||||||
xlog.Error("conncoord: setVAD failed", "error", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// close tears the session down (once). Safe to call from multiple exit paths.
|
|
||||||
func (s *connSink) close() {
|
|
||||||
if err := s.coord.Apply(conncoord.Close{}); err != nil {
|
|
||||||
xlog.Error("conncoord: close failed", "error", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Perform executes one effect. Called by Coordinator.Apply under the coordinator
|
|
||||||
// lock; the connection coordinator is single-writer and torn down exactly once at
|
|
||||||
// the end of the session goroutine, so the blocking joins in Teardown never
|
|
||||||
// contend the lock.
|
|
||||||
func (s *connSink) Perform(e conncoord.Effect) {
|
|
||||||
switch e.(type) {
|
|
||||||
case conncoord.StartVAD:
|
|
||||||
xlog.Debug("Starting VAD goroutine...")
|
|
||||||
s.vadDone = make(chan struct{})
|
|
||||||
done := s.vadDone
|
|
||||||
s.wg.Go(func() {
|
|
||||||
conversation := s.session.Conversations[s.session.DefaultConversationID]
|
|
||||||
handleVAD(s.session, conversation, s.transport, done)
|
|
||||||
})
|
|
||||||
case conncoord.StopVAD:
|
|
||||||
xlog.Debug("Stopping VAD goroutine...")
|
|
||||||
close(s.vadDone)
|
|
||||||
s.vadDone = nil
|
|
||||||
case conncoord.Teardown:
|
|
||||||
// Tear down in dependency order, driving every child machine to its
|
|
||||||
// terminal state so none outlives the session (the hierarchy invariant in
|
|
||||||
// formal-verification/session_lifecycle.fizz: conn Torn => children terminal).
|
|
||||||
//
|
|
||||||
// 1. Stop the remaining input goroutines and join them (this joins the VAD
|
|
||||||
// goroutine, M2, via the StopVAD above + wg).
|
|
||||||
if s.decodeDone != nil {
|
|
||||||
close(s.decodeDone)
|
|
||||||
}
|
|
||||||
if s.soundWindowDone != nil {
|
|
||||||
close(s.soundWindowDone)
|
|
||||||
}
|
|
||||||
s.wg.Wait()
|
|
||||||
|
|
||||||
// 2. Terminate the response coordinator (M3): cancel the in-flight response
|
|
||||||
// and join all response goroutines (which also closes their TTS
|
|
||||||
// pipelines, M5). After this no response can start.
|
|
||||||
s.session.respSink.shutdown()
|
|
||||||
|
|
||||||
// 3. Terminate every conversation's compaction coordinator (M4): cancel +
|
|
||||||
// join any in-flight summarize+evict so it cannot outlive the session.
|
|
||||||
for _, conv := range s.session.Conversations {
|
|
||||||
if conv.compaction != nil {
|
|
||||||
conv.compaction.shutdown()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
sessionLock.Lock()
|
|
||||||
delete(sessions, s.sessionID)
|
|
||||||
sessionLock.Unlock()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -74,16 +74,6 @@ type fakeModel struct {
|
|||||||
|
|
||||||
transcribeDeltas []string
|
transcribeDeltas []string
|
||||||
transcribeFinal *schema.TranscriptionResult
|
transcribeFinal *schema.TranscriptionResult
|
||||||
transcribeErr error
|
|
||||||
|
|
||||||
// TranscribeLive scripting: liveErr makes the open fail (degrade path);
|
|
||||||
// liveEvents are delivered to onEvent synchronously at open;
|
|
||||||
// liveCloseEvents are delivered during Close (the finalize flush).
|
|
||||||
liveErr error
|
|
||||||
liveEvents []backend.LiveTranscriptionEvent
|
|
||||||
liveCloseEvents []backend.LiveTranscriptionEvent
|
|
||||||
liveOpened int
|
|
||||||
liveSession *fakeLiveSession
|
|
||||||
|
|
||||||
// soundDetectionResult/soundDetectionErr drive the SoundDetection double so
|
// soundDetectionResult/soundDetectionErr drive the SoundDetection double so
|
||||||
// the sound-event path can be exercised deterministically.
|
// the sound-event path can be exercised deterministically.
|
||||||
@@ -107,7 +97,7 @@ func (m *fakeModel) VAD(context.Context, *schema.VADRequest) (*schema.VADRespons
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *fakeModel) Transcribe(context.Context, string, string, bool, bool, string) (*schema.TranscriptionResult, error) {
|
func (m *fakeModel) Transcribe(context.Context, string, string, bool, bool, string) (*schema.TranscriptionResult, error) {
|
||||||
return m.transcribeFinal, m.transcribeErr
|
return m.transcribeFinal, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *fakeModel) SoundDetection(context.Context, string, int, float32) (*schema.SoundClassificationResult, error) {
|
func (m *fakeModel) SoundDetection(context.Context, string, int, float32) (*schema.SoundClassificationResult, error) {
|
||||||
@@ -160,43 +150,4 @@ func (m *fakeModel) TranscribeStream(_ context.Context, _, _ string, _, _ bool,
|
|||||||
return m.transcribeFinal, nil
|
return m.transcribeFinal, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *fakeModel) TranscribeLive(_ context.Context, _ string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) {
|
|
||||||
if m.liveErr != nil {
|
|
||||||
return nil, m.liveErr
|
|
||||||
}
|
|
||||||
m.liveOpened++
|
|
||||||
for _, ev := range m.liveEvents {
|
|
||||||
onEvent(ev)
|
|
||||||
}
|
|
||||||
m.liveSession = &fakeLiveSession{onEvent: onEvent, closeEvents: m.liveCloseEvents}
|
|
||||||
return m.liveSession, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *fakeModel) PredictConfig() *config.ModelConfig { return m.cfg }
|
func (m *fakeModel) PredictConfig() *config.ModelConfig { return m.cfg }
|
||||||
|
|
||||||
// fakeLiveSession records what semantic_vad fed and closed; closeEvents are
|
|
||||||
// replayed through onEvent during Close, mimicking the backend's finalize
|
|
||||||
// flush (trailing delta + Final) landing before Close returns.
|
|
||||||
type fakeLiveSession struct {
|
|
||||||
onEvent func(backend.LiveTranscriptionEvent)
|
|
||||||
closeEvents []backend.LiveTranscriptionEvent
|
|
||||||
fed [][]float32
|
|
||||||
feedErr error
|
|
||||||
closed int
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *fakeLiveSession) Feed(pcm []float32) error {
|
|
||||||
if s.feedErr != nil {
|
|
||||||
return s.feedErr
|
|
||||||
}
|
|
||||||
s.fed = append(s.fed, append([]float32(nil), pcm...))
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *fakeLiveSession) Close() error {
|
|
||||||
s.closed++
|
|
||||||
for _, ev := range s.closeEvents {
|
|
||||||
s.onEvent(ev)
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -102,10 +102,6 @@ func (m *transcriptOnlyModel) TranscribeStream(ctx context.Context, audio, langu
|
|||||||
return transcribeStream(ctx, m.modelLoader, *m.TranscriptionConfig, m.appConfig, audio, language, translate, diarize, prompt, onDelta)
|
return transcribeStream(ctx, m.modelLoader, *m.TranscriptionConfig, m.appConfig, audio, language, translate, diarize, prompt, onDelta)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *transcriptOnlyModel) TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) {
|
|
||||||
return backend.ModelTranscriptionLive(ctx, language, m.modelLoader, *m.TranscriptionConfig, m.appConfig, onEvent)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *transcriptOnlyModel) PredictConfig() *config.ModelConfig {
|
func (m *transcriptOnlyModel) PredictConfig() *config.ModelConfig {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -352,10 +348,6 @@ func (m *wrappedModel) TranscribeStream(ctx context.Context, audio, language str
|
|||||||
return transcribeStream(ctx, m.modelLoader, *m.TranscriptionConfig, m.appConfig, audio, language, translate, diarize, prompt, onDelta)
|
return transcribeStream(ctx, m.modelLoader, *m.TranscriptionConfig, m.appConfig, audio, language, translate, diarize, prompt, onDelta)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *wrappedModel) TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) {
|
|
||||||
return backend.ModelTranscriptionLive(ctx, language, m.modelLoader, *m.TranscriptionConfig, m.appConfig, onEvent)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *wrappedModel) PredictConfig() *config.ModelConfig {
|
func (m *wrappedModel) PredictConfig() *config.ModelConfig {
|
||||||
return m.LLMConfig
|
return m.LLMConfig
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,143 +0,0 @@
|
|||||||
package openai
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"sync"
|
|
||||||
"sync/atomic"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord"
|
|
||||||
"github.com/mudler/xlog"
|
|
||||||
)
|
|
||||||
|
|
||||||
// responseSink wires the explicit response-coordination state machine
|
|
||||||
// (respcoord.Coordinator — machine "M3" in docs/design/realtime-state-machines.md)
|
|
||||||
// into a realtime session.
|
|
||||||
//
|
|
||||||
// It replaces the legacy startResponse/cancelActiveResponse pair, whose
|
|
||||||
// activeResponse* fields were written from two goroutines (the client read-loop
|
|
||||||
// and the VAD goroutine) with the <-done wait performed outside the lock — the
|
|
||||||
// dual-writer race documented in Part 2 (failure mode 2). The coordinator
|
|
||||||
// serializes every start/cancel/finish decision behind one lock and guarantees
|
|
||||||
// at most one live response, so the two callers can no longer interleave into
|
|
||||||
// two overlapping responses.
|
|
||||||
//
|
|
||||||
// Each response runs as a goroutine spawned here. The effects map as:
|
|
||||||
// - StartResponse: spawn the registered body with a fresh cancelable context.
|
|
||||||
// - CancelResponse: cancel that context (cooperative — the body stops at its
|
|
||||||
// next ctx checkpoint and emits its own response.done{cancelled}).
|
|
||||||
// - EmitTerminal: currently a no-op. response.done is still emitted by the
|
|
||||||
// response body itself; making this the single authoritative terminal (one
|
|
||||||
// response.done per response.create, with Output+Usage populated) is the
|
|
||||||
// next step and does not change the coordination guarantees here.
|
|
||||||
type responseSink struct {
|
|
||||||
mu sync.Mutex
|
|
||||||
coord *respcoord.Coordinator
|
|
||||||
cancels map[respcoord.ResponseID]context.CancelFunc
|
|
||||||
bodies map[respcoord.ResponseID]responseBody
|
|
||||||
seq atomic.Uint64
|
|
||||||
wg sync.WaitGroup
|
|
||||||
}
|
|
||||||
|
|
||||||
// responseBody is a response registered by issue and not yet started: the
// parent context to derive its cancelable context from, and the function to run.
type responseBody struct {
	parent context.Context           // nil is treated as context.Background by Perform
	run    func(ctx context.Context) // nil means nothing to run
}
|
|
||||||
|
|
||||||
func newResponseSink() *responseSink {
|
|
||||||
s := &responseSink{
|
|
||||||
cancels: map[respcoord.ResponseID]context.CancelFunc{},
|
|
||||||
bodies: map[respcoord.ResponseID]responseBody{},
|
|
||||||
}
|
|
||||||
s.coord = respcoord.New(s)
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
|
|
||||||
// issue registers a response body and asks the coordinator to start it. Any
|
|
||||||
// in-flight response is superseded (cancelled, with its own terminal) first,
|
|
||||||
// atomically inside the coordinator — no caller-side locking, no dual-writer
|
|
||||||
// race. Non-blocking: the superseded response drains concurrently and its later
|
|
||||||
// Finished is ignored as stale.
|
|
||||||
func (s *responseSink) issue(parent context.Context, source respcoord.Source, run func(ctx context.Context)) {
|
|
||||||
id := respcoord.ResponseID(s.seq.Add(1))
|
|
||||||
s.mu.Lock()
|
|
||||||
s.bodies[id] = responseBody{parent: parent, run: run}
|
|
||||||
s.mu.Unlock()
|
|
||||||
if err := s.coord.Apply(respcoord.Start{ID: id, Source: source}); err != nil {
|
|
||||||
xlog.Error("respcoord: start failed", "error", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// cancel cancels the in-flight response, if any. Non-blocking (barge-in must not
|
|
||||||
// stall the VAD tick).
|
|
||||||
func (s *responseSink) cancel(source respcoord.Source) {
|
|
||||||
if err := s.coord.Apply(respcoord.Cancel{Source: source}); err != nil {
|
|
||||||
xlog.Error("respcoord: cancel failed", "error", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// wait blocks until every response goroutine (the active one plus any draining
|
|
||||||
// superseded ones) has exited. Used at teardown so the session is never deleted
|
|
||||||
// out from under a running response.
|
|
||||||
func (s *responseSink) wait() {
|
|
||||||
s.wg.Wait()
|
|
||||||
}
|
|
||||||
|
|
||||||
// shutdown terminates the coordinator (cancelling any in-flight response) and
|
|
||||||
// then joins all response goroutines. After this the coordinator is in its
|
|
||||||
// absorbing Terminated state, so no further response can be issued — the
|
|
||||||
// connection (M1) parent's teardown uses this to guarantee no response outlives
|
|
||||||
// the session (see formal-verification/session_lifecycle.fizz).
|
|
||||||
func (s *responseSink) shutdown() {
|
|
||||||
if err := s.coord.Apply(respcoord.Shutdown{}); err != nil {
|
|
||||||
xlog.Error("respcoord: shutdown failed", "error", err)
|
|
||||||
}
|
|
||||||
s.wait()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Perform executes one effect. It is called by Coordinator.Apply while the
|
|
||||||
// coordinator lock is held, so it must not block. It briefly takes s.mu but
|
|
||||||
// never acquires the coordinator lock while holding s.mu; the spawned
|
|
||||||
// goroutine's Finished apply takes the coordinator lock only AFTER releasing
|
|
||||||
// s.mu, so there is no lock cycle.
|
|
||||||
func (s *responseSink) Perform(e respcoord.Effect) {
|
|
||||||
switch eff := e.(type) {
|
|
||||||
case respcoord.StartResponse:
|
|
||||||
s.mu.Lock()
|
|
||||||
body := s.bodies[eff.ID]
|
|
||||||
delete(s.bodies, eff.ID)
|
|
||||||
parent := body.parent
|
|
||||||
if parent == nil {
|
|
||||||
parent = context.Background()
|
|
||||||
}
|
|
||||||
ctx, cancel := context.WithCancel(parent)
|
|
||||||
s.cancels[eff.ID] = cancel
|
|
||||||
s.mu.Unlock()
|
|
||||||
|
|
||||||
s.wg.Go(func() {
|
|
||||||
defer func() {
|
|
||||||
s.mu.Lock()
|
|
||||||
delete(s.cancels, eff.ID)
|
|
||||||
s.mu.Unlock()
|
|
||||||
// Report completion. If this response was superseded/cancelled
|
|
||||||
// the id is stale and the coordinator ignores it (so the
|
|
||||||
// terminal is never emitted twice).
|
|
||||||
if err := s.coord.Apply(respcoord.Finished{ID: eff.ID}); err != nil {
|
|
||||||
xlog.Error("respcoord: finished apply failed", "error", err)
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
if body.run != nil {
|
|
||||||
body.run(ctx)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
case respcoord.CancelResponse:
|
|
||||||
s.mu.Lock()
|
|
||||||
cancel := s.cancels[eff.ID]
|
|
||||||
s.mu.Unlock()
|
|
||||||
if cancel != nil {
|
|
||||||
cancel()
|
|
||||||
}
|
|
||||||
case respcoord.EmitTerminal:
|
|
||||||
// No-op for now: the response body still emits its own response.done.
|
|
||||||
// Wiring the authoritative single terminal here is the next step.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,350 +0,0 @@
|
|||||||
package openai
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/backend"
|
|
||||||
"github.com/mudler/LocalAI/core/config"
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
|
|
||||||
"github.com/mudler/LocalAI/core/schema"
|
|
||||||
"github.com/mudler/xlog"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Semantic (EOU-driven) turn detection.
|
|
||||||
//
|
|
||||||
// With turn_detection.type == "semantic_vad", the transcription model is fed
|
|
||||||
// the microphone audio live while the user speaks and its end-of-utterance
|
|
||||||
// token turns the silence window dynamic: an immediate commit once the
|
|
||||||
// token fires (the model judged the user finished and expects a reply), the
|
|
||||||
// much longer eagerness fallback when it does not (mid-thought pause). The
|
|
||||||
// silero VAD stays in charge of speech_started/barge-in and the actual
|
|
||||||
// silence measurement, so a spurious EOU mid-speech cannot cut the user off
|
|
||||||
// — the commit still requires real silence.
|
|
||||||
|
|
||||||
const (
	// semanticEouSilenceSec is the additional silence, in seconds, required
	// before committing once an end-of-utterance token has been seen. It is
	// zero because the token already trails the audio by the encoder chunk
	// schedule plus a VAD tick (~0.3-0.9s), and the commit check only runs
	// after silero has closed the speech segment, which itself needs real
	// silence. Any extra window would only delay the response.
	semanticEouSilenceSec = 0.0

	// liveEventsBuffer is the capacity of the channel that hands events from
	// the backend's recv callback to the VAD tick. Events arrive at a few per
	// second and the ticker drains every 300ms, so a full channel means the
	// loop is wedged; dropping with a warning is preferred over blocking the
	// backend's recv goroutine.
	liveEventsBuffer = 64
)
|
|
||||||
|
|
||||||
// eagernessMaxSilenceSec returns the fallback silence window, in seconds, used
// when no end-of-utterance token was seen. The eagerness value is matched after
// trimming surrounding whitespace and lower-casing: "low" waits 8s, "high"
// waits 2s, and anything else ("medium", "auto", empty, unknown) waits 4s.
// These are the 8s/4s/2s max timeouts OpenAI documents for semantic_vad.
func eagernessMaxSilenceSec(eagerness string) float64 {
	level := strings.ToLower(strings.TrimSpace(eagerness))
	if level == "low" {
		return 8
	}
	if level == "high" {
		return 2
	}
	// "medium", "auto", "" and any unrecognised value.
	return 4
}
|
|
||||||
|
|
||||||
// liveUtterance holds the transcript of one committed turn as produced by the
// live stream. The turn's deltas were already forwarded to the client as they
// arrived (keyed by the turn's item id), so only the final text is carried.
type liveUtterance struct {
	Text string
}
|
|
||||||
|
|
||||||
// liveTurnState is handleVAD's per-session live-ASR companion for
|
|
||||||
// semantic_vad. One live stream is opened per user turn (begun when the VAD
|
|
||||||
// first reports speech, finalized at commit) — the underlying decode session
|
|
||||||
// grows with fed audio, so per-turn streams keep it bounded. All fields are
|
|
||||||
// owned by the handleVAD goroutine; the backend's recv callback only writes
|
|
||||||
// into the buffered events channel.
|
|
||||||
type liveTurnState struct {
|
|
||||||
session *Session
|
|
||||||
transport Transport // live caption deltas are sent here as they drain
|
|
||||||
events chan backend.LiveTranscriptionEvent
|
|
||||||
|
|
||||||
live backend.LiveTranscriptionSession // nil between turns
|
|
||||||
unavailable bool // sticky: backend can't do live ASR, degrade for the session
|
|
||||||
|
|
||||||
fed16k int // 16k samples of the current buffer already fed
|
|
||||||
// eouAtSec is the audio time of the most recent EOU this turn (0 = none).
|
|
||||||
// It is a recorded fact: set when an EOU drains and never toggled off
|
|
||||||
// mid-turn. Whether it still governs the trailing silence is derived
|
|
||||||
// purely by eouPending() from this plus the live VAD segments.
|
|
||||||
eouAtSec float64
|
|
||||||
parts []string // deltas accumulated for the current turn
|
|
||||||
finalText string // authoritative full-turn text from the Final event
|
|
||||||
itemID string // the turn's conversation item id, allocated at openTurn
|
|
||||||
deltasSent bool // at least one caption delta reached the client this turn
|
|
||||||
}
|
|
||||||
|
|
||||||
func newLiveTurnState(session *Session, transport Transport) *liveTurnState {
|
|
||||||
return &liveTurnState{
|
|
||||||
session: session,
|
|
||||||
transport: transport,
|
|
||||||
events: make(chan backend.LiveTranscriptionEvent, liveEventsBuffer),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// open reports whether a live stream is currently open for the turn.
func (l *liveTurnState) open() bool {
	return l.live != nil
}
|
|
||||||
|
|
||||||
// openTurn starts the turn's live stream under the caller-supplied item id. A
|
|
||||||
// failure (most commonly the backend's typed "live transcription unsupported"
|
|
||||||
// signal) degrades the whole session to silence-only detection — warned once,
|
|
||||||
// then sticky.
|
|
||||||
//
|
|
||||||
// The item id is supplied by the turn coordinator (turncoord) rather than minted
|
|
||||||
// here: it is allocated when the turn STARTS so caption deltas can stream to the
|
|
||||||
// client while the user is still speaking, and the committed event and final
|
|
||||||
// transcript reuse it (replacing the partial text). The coordinator carries the
|
|
||||||
// same id on its CommitTurn/DiscardTurn effects, so the committed event always
|
|
||||||
// matches the captions.
|
|
||||||
func (l *liveTurnState) openTurn(ctx context.Context, itemID string) bool {
|
|
||||||
if l.live != nil {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
if l.unavailable {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
language := ""
|
|
||||||
if l.session.InputAudioTranscription != nil {
|
|
||||||
language = l.session.InputAudioTranscription.Language
|
|
||||||
}
|
|
||||||
live, err := l.session.ModelInterface.TranscribeLive(ctx, language, func(ev backend.LiveTranscriptionEvent) {
|
|
||||||
select {
|
|
||||||
case l.events <- ev:
|
|
||||||
default:
|
|
||||||
xlog.Warn("semantic_vad: live transcription event dropped (event channel full)")
|
|
||||||
}
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
l.unavailable = true
|
|
||||||
xlog.Warn("semantic_vad: live transcription unavailable; degrading to silence-only turn detection",
|
|
||||||
"error", err)
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
l.resetTurn()
|
|
||||||
l.live = live
|
|
||||||
l.itemID = itemID
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
// feedNewAudio pushes the not-yet-fed tail of the resampled buffer to the
|
|
||||||
// live stream. The final sample is held back: ResampleInt16 is prefix-stable
|
|
||||||
// except for its last output sample, so excluding it keeps successive
|
|
||||||
// whole-buffer resamples bit-identical over the fed range.
|
|
||||||
func (l *liveTurnState) feedNewAudio(aints16k []int16) {
|
|
||||||
if l.live == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
end := len(aints16k) - 1
|
|
||||||
if end <= l.fed16k {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if err := l.live.Feed(int16sToFloat32(aints16k[l.fed16k:end])); err != nil {
|
|
||||||
xlog.Warn("semantic_vad: live feed failed; degrading to silence-only turn detection", "error", err)
|
|
||||||
l.discardTurn()
|
|
||||||
l.unavailable = true
|
|
||||||
return
|
|
||||||
}
|
|
||||||
l.fed16k = end
|
|
||||||
}
|
|
||||||
|
|
||||||
// drainEvents folds everything the live stream produced since the last tick
|
|
||||||
// into the turn state. audioSec (the current buffer length in seconds) marks
|
|
||||||
// WHEN an EOU was observed, so later VAD segments can distinguish speech
|
|
||||||
// that resumed after it.
|
|
||||||
func (l *liveTurnState) drainEvents(audioSec float64) {
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case ev := <-l.events:
|
|
||||||
if ev.Delta != "" {
|
|
||||||
l.parts = append(l.parts, ev.Delta)
|
|
||||||
// Live captions: forward the delta immediately under the
|
|
||||||
// turn's item id — the browser shows text while the user
|
|
||||||
// is still speaking; the completed event at commit
|
|
||||||
// replaces it with the authoritative transcript.
|
|
||||||
if l.transport != nil && l.itemID != "" {
|
|
||||||
sendEvent(l.transport, types.ConversationItemInputAudioTranscriptionDeltaEvent{
|
|
||||||
ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
|
|
||||||
ItemID: l.itemID,
|
|
||||||
ContentIndex: 0,
|
|
||||||
Delta: ev.Delta,
|
|
||||||
})
|
|
||||||
l.deltasSent = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ev.Eou {
|
|
||||||
// Record the position; do not flip a flag. Whether this EOU
|
|
||||||
// still applies to the trailing silence is decided later by
|
|
||||||
// eouPending(), purely from this and the live VAD segments.
|
|
||||||
l.eouAtSec = audioSec
|
|
||||||
xlog.Debug("semantic_vad: EOU token observed", "audio_s", audioSec)
|
|
||||||
}
|
|
||||||
if ev.Eob {
|
|
||||||
// A backchannel ended ("uh-huh") — the user is still
|
|
||||||
// listening, not yielding the turn. Deliberately NOT a
|
|
||||||
// commit trigger.
|
|
||||||
xlog.Debug("semantic_vad: EOB (backchannel) observed", "audio_s", audioSec)
|
|
||||||
}
|
|
||||||
if ev.Final != nil && strings.TrimSpace(ev.Final.Text) != "" {
|
|
||||||
l.finalText = ev.Final.Text
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// eouPending reports whether the recorded EOU still applies to the current
|
|
||||||
// trailing silence. It is a pure function of the recorded EOU position and the
|
|
||||||
// VAD's live view — there is no stored boolean that can fall out of sync.
|
|
||||||
//
|
|
||||||
// An EOU stops applying only once the user has STARTED a new utterance after
|
|
||||||
// it (a segment whose start is past the EOU): that is genuine resumed speech,
|
|
||||||
// so the earlier yield no longer holds. An in-progress segment whose speech
|
|
||||||
// began BEFORE the EOU is NOT resumed speech — it is just silero still padding
|
|
||||||
// before it closes the segment, which is the normal state at the instant the
|
|
||||||
// (predictive) EOU fires. Treating that as resumed speech was the bug that
|
|
||||||
// cleared the flag on the very tick the token arrived, dropping almost every
|
|
||||||
// EOU to the eagerness timeout.
|
|
||||||
func (l *liveTurnState) eouPending(segments []schema.VADSegment) bool {
|
|
||||||
if l.eouAtSec == 0 || len(segments) == 0 {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
last := segments[len(segments)-1]
|
|
||||||
return float64(last.Start) <= l.eouAtSec
|
|
||||||
}
|
|
||||||
|
|
||||||
// thresholdSec is the dynamic commit threshold: zero once the model said
|
|
||||||
// the utterance is over (any VAD-confirmed silence commits), the eagerness
|
|
||||||
// fallback otherwise.
|
|
||||||
func (l *liveTurnState) thresholdSec(eouPending bool, sv *types.RealtimeSessionSemanticVad) float64 {
|
|
||||||
if eouPending {
|
|
||||||
return semanticEouSilenceSec
|
|
||||||
}
|
|
||||||
return eagernessMaxSilenceSec(sv.Eagerness)
|
|
||||||
}
|
|
||||||
|
|
||||||
// commitTrigger describes how a commit decision was reached, for the per-turn
|
|
||||||
// timing log: "eou" with the token's lag behind the VAD's speech end, or
|
|
||||||
// "timeout" when the eagerness fallback elapsed without one. The lag is the
|
|
||||||
// number the user needs to tell a slow EOU emission apart from loop overhead.
|
|
||||||
func (l *liveTurnState) commitTrigger(eouPending bool, speechEndSec float64) (trigger string, eouLagSec float64) {
|
|
||||||
if !eouPending {
|
|
||||||
return "timeout", 0
|
|
||||||
}
|
|
||||||
return "eou", l.eouAtSec - speechEndSec
|
|
||||||
}
|
|
||||||
|
|
||||||
// finishTurn finalizes the live stream (flushing the decode tail — the last
|
|
||||||
// ~2 encoder frames of text only appear here), folds the terminal events in,
|
|
||||||
// and returns the turn's transcript. Returns nil when the stream never
|
|
||||||
// produced text (the VAD triggered on something the model heard nothing in).
|
|
||||||
func (l *liveTurnState) finishTurn(audioSec float64) *liveUtterance {
|
|
||||||
if l.live == nil {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
if err := l.live.Close(); err != nil {
|
|
||||||
xlog.Warn("semantic_vad: live transcription finalize failed", "error", err)
|
|
||||||
}
|
|
||||||
l.live = nil
|
|
||||||
l.drainEvents(audioSec)
|
|
||||||
|
|
||||||
text := strings.TrimSpace(l.finalText)
|
|
||||||
if text == "" {
|
|
||||||
text = l.previewText()
|
|
||||||
}
|
|
||||||
ut := &liveUtterance{Text: text}
|
|
||||||
l.resetTurn()
|
|
||||||
if ut.Text == "" {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
return ut
|
|
||||||
}
|
|
||||||
|
|
||||||
// discardTurn drops the current turn (no-speech buffer clear, feed failure,
|
|
||||||
// session teardown): the stream is closed and its transcript thrown away.
|
|
||||||
// Any caption deltas already shown for it are retracted via the failed
|
|
||||||
// event, so the client doesn't keep a stuck partial entry.
|
|
||||||
func (l *liveTurnState) discardTurn() {
|
|
||||||
if l.live != nil {
|
|
||||||
_ = l.live.Close()
|
|
||||||
l.live = nil
|
|
||||||
}
|
|
||||||
l.drainEvents(0)
|
|
||||||
if l.deltasSent && l.transport != nil && l.itemID != "" {
|
|
||||||
sendEvent(l.transport, types.ConversationItemInputAudioTranscriptionFailedEvent{
|
|
||||||
ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
|
|
||||||
ItemID: l.itemID,
|
|
||||||
ContentIndex: 0,
|
|
||||||
Error: types.Error{
|
|
||||||
Type: "transcription_discarded",
|
|
||||||
Message: "turn discarded before commit",
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
l.resetTurn()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (l *liveTurnState) resetTurn() {
|
|
||||||
l.fed16k = 0
|
|
||||||
l.eouAtSec = 0
|
|
||||||
l.parts = nil
|
|
||||||
l.finalText = ""
|
|
||||||
l.itemID = ""
|
|
||||||
l.deltasSent = false
|
|
||||||
}
|
|
||||||
|
|
||||||
// previewText is the turn's transcript so far (for the retranscribe
|
|
||||||
// comparison log and as the fallback when no Final event arrived).
|
|
||||||
func (l *liveTurnState) previewText() string {
|
|
||||||
return strings.TrimSpace(strings.Join(l.parts, ""))
|
|
||||||
}
|
|
||||||
|
|
||||||
// int16sToFloat32 converts 16-bit PCM samples to the float form fed to the live
// stream by dividing each sample by 32768, giving values in [-1, 1). This is
// the same scaling runVAD's go-audio conversion applies. The result always has
// the same length as samples and is never nil.
func int16sToFloat32(samples []int16) []float32 {
	const fullScale = 32768.0
	converted := make([]float32, len(samples))
	for i := range samples {
		converted[i] = float32(samples[i]) / fullScale
	}
	return converted
}
|
|
||||||
|
|
||||||
// turnDetectionActive reports whether the session has any automatic turn
|
|
||||||
// detection (server or semantic VAD) that should run the handleVAD loop.
|
|
||||||
func turnDetectionActive(td *types.TurnDetectionUnion) bool {
|
|
||||||
return td != nil && (td.ServerVad != nil || td.SemanticVad != nil)
|
|
||||||
}
|
|
||||||
|
|
||||||
// defaultTurnDetection seeds a new session's turn detection from the
|
|
||||||
// pipeline's server-side default: semantic_vad pipelines start sessions in
|
|
||||||
// semantic mode (clients can still override via session.update); everything
|
|
||||||
// else keeps the historical server_vad defaults.
|
|
||||||
func defaultTurnDetection(cfg *config.ModelConfig) *types.TurnDetectionUnion {
|
|
||||||
if cfg != nil && cfg.Pipeline.TurnDetectionSemantic() {
|
|
||||||
return &types.TurnDetectionUnion{
|
|
||||||
SemanticVad: &types.RealtimeSessionSemanticVad{
|
|
||||||
CreateResponse: true,
|
|
||||||
Eagerness: cfg.Pipeline.TurnDetection.Eagerness,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return &types.TurnDetectionUnion{
|
|
||||||
ServerVad: &types.ServerVad{
|
|
||||||
Threshold: 0.5,
|
|
||||||
PrefixPaddingMs: 300,
|
|
||||||
SilenceDurationMs: 500,
|
|
||||||
CreateResponse: true,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,414 +0,0 @@
|
|||||||
package openai
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"errors"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/backend"
|
|
||||||
"github.com/mudler/LocalAI/core/config"
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
|
|
||||||
"github.com/mudler/LocalAI/core/schema"
|
|
||||||
)
|
|
||||||
|
|
||||||
var _ = Describe("eagernessMaxSilenceSec", func() {
|
|
||||||
DescribeTable("maps eagerness to the no-EOU fallback window",
|
|
||||||
func(eagerness string, want float64) {
|
|
||||||
Expect(eagernessMaxSilenceSec(eagerness)).To(Equal(want))
|
|
||||||
},
|
|
||||||
Entry("low", "low", 8.0),
|
|
||||||
Entry("medium", "medium", 4.0),
|
|
||||||
Entry("high", "high", 2.0),
|
|
||||||
Entry("auto equals medium", "auto", 4.0),
|
|
||||||
Entry("empty equals medium", "", 4.0),
|
|
||||||
Entry("case and space insensitive", " High ", 2.0),
|
|
||||||
Entry("unknown equals medium", "frantic", 4.0),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
|
|
||||||
var _ = Describe("turnDetectionActive", func() {
|
|
||||||
It("is active for server and semantic VAD, inactive otherwise", func() {
|
|
||||||
Expect(turnDetectionActive(nil)).To(BeFalse())
|
|
||||||
Expect(turnDetectionActive(&types.TurnDetectionUnion{})).To(BeFalse())
|
|
||||||
Expect(turnDetectionActive(&types.TurnDetectionUnion{ServerVad: &types.ServerVad{}})).To(BeTrue())
|
|
||||||
Expect(turnDetectionActive(&types.TurnDetectionUnion{SemanticVad: &types.RealtimeSessionSemanticVad{}})).To(BeTrue())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
var _ = Describe("defaultTurnDetection", func() {
|
|
||||||
It("keeps the historical server_vad defaults for non-semantic pipelines", func() {
|
|
||||||
td := defaultTurnDetection(&config.ModelConfig{})
|
|
||||||
Expect(td.ServerVad).NotTo(BeNil())
|
|
||||||
Expect(td.SemanticVad).To(BeNil())
|
|
||||||
Expect(td.ServerVad.SilenceDurationMs).To(Equal(int64(500)))
|
|
||||||
Expect(td.ServerVad.CreateResponse).To(BeTrue())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("seeds semantic_vad with the pipeline's eagerness", func() {
|
|
||||||
cfg := &config.ModelConfig{}
|
|
||||||
cfg.Pipeline.TurnDetection.Type = "semantic_vad"
|
|
||||||
cfg.Pipeline.TurnDetection.Eagerness = "high"
|
|
||||||
td := defaultTurnDetection(cfg)
|
|
||||||
Expect(td.SemanticVad).NotTo(BeNil())
|
|
||||||
Expect(td.ServerVad).To(BeNil())
|
|
||||||
Expect(td.SemanticVad.Eagerness).To(Equal("high"))
|
|
||||||
Expect(td.SemanticVad.CreateResponse).To(BeTrue())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("treats a nil config as server_vad", func() {
|
|
||||||
Expect(defaultTurnDetection(nil).ServerVad).NotTo(BeNil())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
var _ = Describe("int16sToFloat32", func() {
|
|
||||||
It("scales like the VAD conversion", func() {
|
|
||||||
out := int16sToFloat32([]int16{0, 16384, -32768})
|
|
||||||
Expect(out).To(HaveLen(3))
|
|
||||||
Expect(out[0]).To(BeNumerically("~", 0.0, 1e-6))
|
|
||||||
Expect(out[1]).To(BeNumerically("~", 0.5, 1e-6))
|
|
||||||
Expect(out[2]).To(BeNumerically("~", -1.0, 1e-6))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
var _ = Describe("liveTurnState", func() {
|
|
||||||
var (
|
|
||||||
m *fakeModel
|
|
||||||
lts *liveTurnState
|
|
||||||
ftr *fakeTransport
|
|
||||||
)
|
|
||||||
|
|
||||||
newSemanticSession := func(m *fakeModel) *Session {
|
|
||||||
return &Session{
|
|
||||||
InputAudioTranscription: &types.AudioTranscription{},
|
|
||||||
ModelInterface: m,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
BeforeEach(func() {
|
|
||||||
m = &fakeModel{}
|
|
||||||
ftr = &fakeTransport{}
|
|
||||||
lts = newLiveTurnState(newSemanticSession(m), ftr)
|
|
||||||
})
|
|
||||||
|
|
||||||
Describe("openTurn", func() {
|
|
||||||
It("opens once per turn and reports open()", func() {
|
|
||||||
Expect(lts.open()).To(BeFalse())
|
|
||||||
Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
|
|
||||||
Expect(lts.open()).To(BeTrue())
|
|
||||||
Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue(), "idempotent while open")
|
|
||||||
Expect(m.liveOpened).To(Equal(1))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("degrades stickily when the backend cannot do live transcription", func() {
|
|
||||||
m.liveErr = errors.New("rpc error: code = Unimplemented desc = live transcription unsupported")
|
|
||||||
Expect(lts.openTurn(context.Background(), "item1")).To(BeFalse())
|
|
||||||
Expect(lts.unavailable).To(BeTrue())
|
|
||||||
|
|
||||||
// Later turns never retry: the failure is per-session sticky.
|
|
||||||
m.liveErr = nil
|
|
||||||
Expect(lts.openTurn(context.Background(), "item1")).To(BeFalse())
|
|
||||||
Expect(m.liveOpened).To(Equal(0))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
Describe("feedNewAudio", func() {
|
|
||||||
It("feeds only the unfed tail and holds back the final resampled sample", func() {
|
|
||||||
Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
|
|
||||||
|
|
||||||
lts.feedNewAudio([]int16{1, 2, 3, 4})
|
|
||||||
Expect(m.liveSession.fed).To(HaveLen(1))
|
|
||||||
Expect(m.liveSession.fed[0]).To(HaveLen(3), "last sample held back")
|
|
||||||
|
|
||||||
// Same buffer grown by two samples: only the delta is fed.
|
|
||||||
lts.feedNewAudio([]int16{1, 2, 3, 4, 5, 6})
|
|
||||||
Expect(m.liveSession.fed).To(HaveLen(2))
|
|
||||||
Expect(m.liveSession.fed[1]).To(HaveLen(2))
|
|
||||||
|
|
||||||
// No growth past the holdback: nothing fed.
|
|
||||||
lts.feedNewAudio([]int16{1, 2, 3, 4, 5, 6})
|
|
||||||
Expect(m.liveSession.fed).To(HaveLen(2))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("degrades and closes the turn when a feed fails", func() {
|
|
||||||
Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
|
|
||||||
m.liveSession.feedErr = errors.New("backend gone")
|
|
||||||
sess := m.liveSession
|
|
||||||
|
|
||||||
lts.feedNewAudio([]int16{1, 2, 3, 4})
|
|
||||||
|
|
||||||
Expect(lts.open()).To(BeFalse())
|
|
||||||
Expect(lts.unavailable).To(BeTrue())
|
|
||||||
Expect(sess.closed).To(Equal(1))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
Describe("event handling and the dynamic threshold", func() {
|
|
||||||
sv := &types.RealtimeSessionSemanticVad{Eagerness: "high"}
|
|
||||||
|
|
||||||
It("uses the eagerness fallback until an EOU is recorded, then commits without an extra window", func() {
|
|
||||||
Expect(lts.thresholdSec(false, sv)).To(Equal(2.0))
|
|
||||||
Expect(lts.thresholdSec(true, sv)).To(Equal(semanticEouSilenceSec))
|
|
||||||
|
|
||||||
Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
|
|
||||||
lts.session.ModelInterface.(*fakeModel).liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hello ", Eou: false})
|
|
||||||
lts.session.ModelInterface.(*fakeModel).liveSession.onEvent(backend.LiveTranscriptionEvent{Eou: true})
|
|
||||||
lts.drainEvents(3.3)
|
|
||||||
|
|
||||||
Expect(lts.eouAtSec).To(BeNumerically("~", 3.3, 1e-9))
|
|
||||||
Expect(lts.previewText()).To(Equal("hello"))
|
|
||||||
})
|
|
||||||
|
|
||||||
// The bug this replaces: the (predictive) EOU routinely arrives while
|
|
||||||
// silero is still padding the speech segment open. eouPending must NOT
|
|
||||||
// read that as resumed speech.
|
|
||||||
It("keeps the EOU pending while silero is still closing the same segment", func() {
|
|
||||||
lts.eouAtSec = 3.3
|
|
||||||
Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 0}})).To(BeTrue(), "segment began before the EOU and is merely unclosed")
|
|
||||||
Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}})).To(BeTrue(), "and still pending once it closes")
|
|
||||||
})
|
|
||||||
|
|
||||||
It("drops the EOU only when a new utterance starts after it (resumed speech)", func() {
|
|
||||||
lts.eouAtSec = 3.3
|
|
||||||
Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}, {Start: 4.0, End: 0}})).To(BeFalse())
|
|
||||||
Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}, {Start: 4.0, End: 5.0}})).To(BeFalse())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("has no pending EOU before one is recorded", func() {
|
|
||||||
Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 3.0}})).To(BeFalse())
|
|
||||||
Expect(lts.eouPending(nil)).To(BeFalse())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("does not arm the commit threshold on an EOB backchannel", func() {
|
|
||||||
Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
|
|
||||||
lts.session.ModelInterface.(*fakeModel).liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "uh-huh", Eob: true})
|
|
||||||
lts.drainEvents(2.0)
|
|
||||||
|
|
||||||
Expect(lts.eouAtSec).To(BeZero(), "a backchannel is not the user yielding the turn")
|
|
||||||
Expect(lts.eouPending([]schema.VADSegment{{Start: 0, End: 1.8}})).To(BeFalse(), "still on the eagerness fallback")
|
|
||||||
Expect(lts.previewText()).To(Equal("uh-huh"), "the backchannel text still lands in the transcript")
|
|
||||||
})
|
|
||||||
|
|
||||||
It("reports the commit trigger and the EOU token's lag behind speech end", func() {
|
|
||||||
trigger, lag := lts.commitTrigger(false, 3.2)
|
|
||||||
Expect(trigger).To(Equal("timeout"))
|
|
||||||
Expect(lag).To(BeZero())
|
|
||||||
|
|
||||||
lts.eouAtSec = 3.5
|
|
||||||
trigger, lag = lts.commitTrigger(true, 3.2)
|
|
||||||
Expect(trigger).To(Equal("eou"))
|
|
||||||
Expect(lag).To(BeNumerically("~", 0.3, 1e-9))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
Describe("finishTurn", func() {
|
|
||||||
It("finalizes the stream, prefers the Final text, and resets for the next turn", func() {
|
|
||||||
m.liveCloseEvents = []backend.LiveTranscriptionEvent{
|
|
||||||
{Delta: " world"},
|
|
||||||
{Final: &schema.TranscriptionResult{Text: "hello world", Eou: true}},
|
|
||||||
}
|
|
||||||
Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
|
|
||||||
sess := m.liveSession
|
|
||||||
sess.onEvent(backend.LiveTranscriptionEvent{Delta: "hello", Eou: true})
|
|
||||||
lts.drainEvents(2.0)
|
|
||||||
|
|
||||||
ut := lts.finishTurn(2.5)
|
|
||||||
|
|
||||||
Expect(sess.closed).To(Equal(1))
|
|
||||||
Expect(ut).NotTo(BeNil())
|
|
||||||
Expect(ut.Text).To(Equal("hello world"), "Final event text wins over joined deltas")
|
|
||||||
Expect(lts.open()).To(BeFalse())
|
|
||||||
Expect(lts.eouAtSec).To(BeZero())
|
|
||||||
Expect(lts.parts).To(BeEmpty())
|
|
||||||
Expect(lts.fed16k).To(BeZero())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("returns nil when the stream heard nothing", func() {
|
|
||||||
Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
|
|
||||||
Expect(lts.finishTurn(1.0)).To(BeNil())
|
|
||||||
Expect(m.liveSession.closed).To(Equal(1))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("is a no-op without an open stream", func() {
|
|
||||||
Expect(lts.finishTurn(1.0)).To(BeNil())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
Describe("discardTurn", func() {
|
|
||||||
It("closes the stream, drops the transcript and retracts streamed captions", func() {
|
|
||||||
Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
|
|
||||||
sess := m.liveSession
|
|
||||||
sess.onEvent(backend.LiveTranscriptionEvent{Delta: "noise"})
|
|
||||||
lts.drainEvents(1.0)
|
|
||||||
|
|
||||||
lts.discardTurn()
|
|
||||||
|
|
||||||
Expect(sess.closed).To(Equal(1))
|
|
||||||
Expect(lts.open()).To(BeFalse())
|
|
||||||
Expect(lts.parts).To(BeEmpty())
|
|
||||||
Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(1),
|
|
||||||
"the client saw caption deltas for this turn — it must be told to drop them")
|
|
||||||
})
|
|
||||||
|
|
||||||
It("sends no failed event when no captions ever reached the client", func() {
|
|
||||||
Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
|
|
||||||
lts.discardTurn()
|
|
||||||
Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(0))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
Describe("live captions", func() {
|
|
||||||
It("streams each delta to the client under the turn's item id as it drains", func() {
|
|
||||||
Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
|
|
||||||
turnID := lts.itemID
|
|
||||||
Expect(turnID).NotTo(BeEmpty(), "the item id exists from turn open so captions can reference it")
|
|
||||||
|
|
||||||
m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hel"})
|
|
||||||
m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "lo"})
|
|
||||||
lts.drainEvents(1.0)
|
|
||||||
|
|
||||||
var got []types.ConversationItemInputAudioTranscriptionDeltaEvent
|
|
||||||
for _, e := range ftr.events {
|
|
||||||
if d, ok := e.(types.ConversationItemInputAudioTranscriptionDeltaEvent); ok {
|
|
||||||
got = append(got, d)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Expect(got).To(HaveLen(2))
|
|
||||||
Expect(got[0].Delta).To(Equal("hel"))
|
|
||||||
Expect(got[1].Delta).To(Equal("lo"))
|
|
||||||
Expect(got[0].ItemID).To(Equal(turnID))
|
|
||||||
Expect(got[1].ItemID).To(Equal(turnID))
|
|
||||||
Expect(lts.deltasSent).To(BeTrue())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("finishTurn does not retract captions — the commit's completed event supersedes them", func() {
|
|
||||||
Expect(lts.openTurn(context.Background(), "item1")).To(BeTrue())
|
|
||||||
m.liveSession.onEvent(backend.LiveTranscriptionEvent{Delta: "hello"})
|
|
||||||
lts.drainEvents(1.0)
|
|
||||||
|
|
||||||
Expect(lts.finishTurn(1.5)).NotTo(BeNil())
|
|
||||||
Expect(ftr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionFailed)).To(Equal(0))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
// commitUtteranceWithTranscript routes the three transcript sources: the
|
|
||||||
// retranscribe gate's batch decode, the live stream's accumulated text, and
|
|
||||||
// the historical file path.
|
|
||||||
var _ = Describe("commitUtteranceWithTranscript", func() {
|
|
||||||
newTranscriptionOnlySession := func(m *fakeModel, streamTranscription bool) *Session {
|
|
||||||
cfg := &config.ModelConfig{}
|
|
||||||
if streamTranscription {
|
|
||||||
on := true
|
|
||||||
cfg.Pipeline.Streaming.Transcription = &on
|
|
||||||
}
|
|
||||||
return &Session{
|
|
||||||
TranscriptionOnly: true, // stop after the transcript: no LLM/TTS in these specs
|
|
||||||
InputAudioTranscription: &types.AudioTranscription{},
|
|
||||||
ModelConfig: cfg,
|
|
||||||
ModelInterface: m,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
It("uses the gate's batch transcript and never re-runs the backend", func() {
|
|
||||||
m := &fakeModel{transcribeErr: errors.New("must not be called")}
|
|
||||||
session := newTranscriptionOnlySession(m, true)
|
|
||||||
tr := &fakeTransport{}
|
|
||||||
|
|
||||||
commitUtteranceWithTranscript(context.Background(), []byte{1, 2}, nil,
|
|
||||||
&schema.TranscriptionResult{Text: "batch text", Eou: true}, "item_turn", session, &Conversation{}, tr)
|
|
||||||
|
|
||||||
Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
|
|
||||||
Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("emits only the completed event for a live transcript — captions already streamed during the turn", func() {
|
|
||||||
m := &fakeModel{transcribeErr: errors.New("must not be called")}
|
|
||||||
session := newTranscriptionOnlySession(m, true)
|
|
||||||
tr := &fakeTransport{}
|
|
||||||
|
|
||||||
commitUtteranceWithTranscript(context.Background(), []byte{1, 2},
|
|
||||||
&liveUtterance{Text: "hello"}, nil, "item_turn", session, &Conversation{}, tr)
|
|
||||||
|
|
||||||
Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
|
|
||||||
Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
|
|
||||||
|
|
||||||
var completed types.ConversationItemInputAudioTranscriptionCompletedEvent
|
|
||||||
for _, e := range tr.events {
|
|
||||||
if c, ok := e.(types.ConversationItemInputAudioTranscriptionCompletedEvent); ok {
|
|
||||||
completed = c
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Expect(completed.ItemID).To(Equal("item_turn"),
|
|
||||||
"completed must reuse the caption deltas' item id so the client replaces, not duplicates")
|
|
||||||
Expect(completed.Transcript).To(Equal("hello"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("falls back to the file path when the live stream heard nothing", func() {
|
|
||||||
m := &fakeModel{transcribeFinal: &schema.TranscriptionResult{Text: "from file"}}
|
|
||||||
session := newTranscriptionOnlySession(m, false)
|
|
||||||
tr := &fakeTransport{}
|
|
||||||
|
|
||||||
commitUtteranceWithTranscript(context.Background(), []byte{1, 2},
|
|
||||||
&liveUtterance{}, nil, "", session, &Conversation{}, tr)
|
|
||||||
|
|
||||||
Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
// transcribeUtterance is the retranscribe gate's offline decode of the
|
|
||||||
// buffered turn.
|
|
||||||
var _ = Describe("transcribeUtterance", func() {
|
|
||||||
It("returns the batch decode with its Eou flag", func() {
|
|
||||||
m := &fakeModel{transcribeFinal: &schema.TranscriptionResult{Text: "confirmed", Eou: true}}
|
|
||||||
session := &Session{
|
|
||||||
InputAudioTranscription: &types.AudioTranscription{},
|
|
||||||
ModelInterface: m,
|
|
||||||
}
|
|
||||||
|
|
||||||
tr, err := transcribeUtterance(context.Background(), []byte{0, 0, 1, 1}, session)
|
|
||||||
Expect(err).ToNot(HaveOccurred())
|
|
||||||
Expect(tr.Text).To(Equal("confirmed"))
|
|
||||||
Expect(tr.Eou).To(BeTrue())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("propagates backend errors", func() {
|
|
||||||
m := &fakeModel{transcribeErr: errors.New("engine fell over")}
|
|
||||||
session := &Session{
|
|
||||||
InputAudioTranscription: &types.AudioTranscription{},
|
|
||||||
ModelInterface: m,
|
|
||||||
}
|
|
||||||
|
|
||||||
_, err := transcribeUtterance(context.Background(), []byte{0, 0}, session)
|
|
||||||
Expect(err).To(MatchError(ContainSubstring("engine fell over")))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
// emitPrecomputedTranscription replays an already-produced transcript as the
|
|
||||||
// standard delta/completed event sequence.
|
|
||||||
var _ = Describe("emitPrecomputedTranscription", func() {
|
|
||||||
It("emits deltas then completed, sharing the item id", func() {
|
|
||||||
tr := &fakeTransport{}
|
|
||||||
Expect(emitPrecomputedTranscription(tr, "item42", []string{"a", "", "b"}, "ab")).To(Succeed())
|
|
||||||
|
|
||||||
Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(2), "empty deltas skipped")
|
|
||||||
Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
|
|
||||||
for _, e := range tr.events {
|
|
||||||
switch ev := e.(type) {
|
|
||||||
case types.ConversationItemInputAudioTranscriptionDeltaEvent:
|
|
||||||
Expect(ev.ItemID).To(Equal("item42"))
|
|
||||||
case types.ConversationItemInputAudioTranscriptionCompletedEvent:
|
|
||||||
Expect(ev.ItemID).To(Equal("item42"))
|
|
||||||
Expect(ev.Transcript).To(Equal("ab"))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
It("emits only the completed event with no deltas", func() {
|
|
||||||
tr := &fakeTransport{}
|
|
||||||
Expect(emitPrecomputedTranscription(tr, "item1", nil, "hi")).To(Succeed())
|
|
||||||
Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionDelta)).To(Equal(0))
|
|
||||||
Expect(tr.countEvents(types.ServerEventTypeConversationItemInputAudioTranscriptionCompleted)).To(Equal(1))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
@@ -86,8 +86,7 @@ func (s *transcriptStreamer) content() string {
|
|||||||
// tool calls. It returns true when it has fully handled the response so the
|
// tool calls. It returns true when it has fully handled the response so the
|
||||||
// caller can return; callers must only invoke it for an audio modality, and with
|
// caller can return; callers must only invoke it for an audio modality, and with
|
||||||
// tools only when the model uses its tokenizer template (see triggerResponseAtTurn).
|
// tools only when the model uses its tokenizer template (see triggerResponseAtTurn).
|
||||||
func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, r *liveResponse, history schema.Messages, images []string, llmCfg *config.ModelConfig, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, toolTurn int) bool {
|
func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, responseID string, history schema.Messages, images []string, llmCfg *config.ModelConfig, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, toolTurn int) bool {
|
||||||
responseID := r.id
|
|
||||||
itemID := generateItemID()
|
itemID := generateItemID()
|
||||||
item := types.MessageItemUnion{
|
item := types.MessageItemUnion{
|
||||||
Assistant: &types.MessageItemAssistant{
|
Assistant: &types.MessageItemAssistant{
|
||||||
@@ -122,8 +121,6 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// cancel rolls back the partial item and records the cancelled outcome; the
|
|
||||||
// single terminal is emitted by triggerResponse.
|
|
||||||
cancel := func() {
|
cancel := func() {
|
||||||
if announced {
|
if announced {
|
||||||
conv.Lock.Lock()
|
conv.Lock.Lock()
|
||||||
@@ -135,7 +132,10 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
|
|||||||
}
|
}
|
||||||
conv.Lock.Unlock()
|
conv.Lock.Unlock()
|
||||||
}
|
}
|
||||||
r.outcome = outcomeCancelled
|
sendEvent(t, types.ResponseDoneEvent{
|
||||||
|
ServerEventBase: types.ServerEventBase{},
|
||||||
|
Response: types.Response{ID: responseID, Object: "realtime.response", Status: types.ResponseStatusCancelled},
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
var template string
|
var template string
|
||||||
@@ -161,30 +161,24 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
|
|||||||
streamer.announce = announce
|
streamer.announce = announce
|
||||||
|
|
||||||
// Clause chunking (opt-in): synthesize each clause as soon as it completes
|
// Clause chunking (opt-in): synthesize each clause as soon as it completes
|
||||||
// instead of buffering the whole reply. Synthesis runs on a worker goroutine
|
// instead of buffering the whole reply. streamedAudio accumulates the PCM
|
||||||
// (ttsPipeline) rather than inline in the token callback: emitSpeech blocks
|
// across clauses for the conversation item record; ttsErr captures the first
|
||||||
// until the whole clause is synthesized (and, for WebRTC, played back at
|
// synthesis failure so the token callback can stop the prediction. emitSpeech
|
||||||
// real time), and the callback runs on the goroutine that drains the LLM
|
// runs synchronously here — the LLM keeps generating into the gRPC stream
|
||||||
// gRPC stream — so speaking inline stalls generation and freezes the
|
// while a clause is synthesized, so audio still starts mid-generation.
|
||||||
// assistant transcript at every clause boundary. The worker lets generation
|
|
||||||
// and the transcript stream keep flowing while audio is produced behind them.
|
|
||||||
var chunker *clauseChunker
|
var chunker *clauseChunker
|
||||||
var ttsPipe *ttsPipeline
|
|
||||||
if session.ModelConfig != nil && session.ModelConfig.Pipeline.ChunkClauses() {
|
if session.ModelConfig != nil && session.ModelConfig.Pipeline.ChunkClauses() {
|
||||||
chunker = newClauseChunker(defaultClauseMinRunes, defaultClauseMaxRunes)
|
chunker = newClauseChunker(defaultClauseMinRunes, defaultClauseMaxRunes)
|
||||||
ttsPipe = newTTSPipeline(func(clause string) ([]byte, error) {
|
|
||||||
return emitSpeech(ctx, t, session, responseID, itemID, clause)
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
var streamedAudio []byte
|
var streamedAudio []byte
|
||||||
var ttsErr error
|
var ttsErr error
|
||||||
|
speakClause := func(clause string) error {
|
||||||
// Backstop: always join the TTS worker, even on an unexpected early return.
|
a, err := emitSpeech(ctx, t, session, responseID, itemID, clause)
|
||||||
// wait() is idempotent, so the explicit drain below (which captures the
|
if err != nil {
|
||||||
// streamed audio and first error) stays authoritative; this only guarantees
|
return err
|
||||||
// the goroutine can never leak if a new return path is added.
|
}
|
||||||
if ttsPipe != nil {
|
streamedAudio = append(streamedAudio, a...)
|
||||||
defer func() { _, _ = ttsPipe.wait() }()
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// fail reports a mid-stream failure. A cancelled context means the client
|
// fail reports a mid-stream failure. A cancelled context means the client
|
||||||
@@ -194,7 +188,6 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
|
|||||||
cancel()
|
cancel()
|
||||||
} else {
|
} else {
|
||||||
sendError(t, code, fmt.Sprintf("%s: %v", msg, err), "", itemID)
|
sendError(t, code, fmt.Sprintf("%s: %v", msg, err), "", itemID)
|
||||||
r.outcome = outcomeFailed
|
|
||||||
}
|
}
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
@@ -214,12 +207,8 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
|
|||||||
delta := streamer.onToken(text)
|
delta := streamer.onToken(text)
|
||||||
if chunker != nil && delta != "" {
|
if chunker != nil && delta != "" {
|
||||||
for _, clause := range chunker.push(delta) {
|
for _, clause := range chunker.push(delta) {
|
||||||
// Hand the clause to the worker and keep going — never block the
|
if ttsErr = speakClause(clause); ttsErr != nil {
|
||||||
// recv loop on synthesis. A false return means a prior clause
|
return false // stop the prediction; reported after predFunc returns
|
||||||
// already failed; stop the prediction (the error is collected
|
|
||||||
// from the pipeline after predFunc returns).
|
|
||||||
if !ttsPipe.enqueue(clause) {
|
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -228,27 +217,10 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
|
|||||||
|
|
||||||
predFunc, err := session.ModelInterface.Predict(ctx, history, images, nil, nil, cb, tools, toolChoice, nil, nil, nil)
|
predFunc, err := session.ModelInterface.Predict(ctx, history, images, nil, nil, cb, tools, toolChoice, nil, nil, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// The deferred wait() joins the (idle) worker.
|
|
||||||
sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", itemID)
|
sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", itemID)
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
pred, err := predFunc()
|
pred, err := predFunc()
|
||||||
|
|
||||||
// Drain the TTS worker. On a clean finish, enqueue the trailing clause(s) the
|
|
||||||
// chunker was still holding; on an error or barge-in, stop synthesizing.
|
|
||||||
// wait() runs on every path so the worker goroutine never leaks, and it
|
|
||||||
// returns the audio streamed so far plus the first synthesis failure.
|
|
||||||
if ttsPipe != nil {
|
|
||||||
if err == nil && ctx.Err() == nil {
|
|
||||||
for _, clause := range chunker.flush() {
|
|
||||||
if !ttsPipe.enqueue(clause) {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
streamedAudio, ttsErr = ttsPipe.wait()
|
|
||||||
}
|
|
||||||
|
|
||||||
// A clause synthesis failed mid-stream (the callback stopped the prediction);
|
// A clause synthesis failed mid-stream (the callback stopped the prediction);
|
||||||
// report it as a TTS error rather than a prediction error.
|
// report it as a TTS error rather than a prediction error.
|
||||||
if ttsErr != nil {
|
if ttsErr != nil {
|
||||||
@@ -261,7 +233,6 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
|
|||||||
cancel()
|
cancel()
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
r.addUsage(pred.Usage)
|
|
||||||
|
|
||||||
content := streamer.content()
|
content := streamer.content()
|
||||||
toolCalls := functions.ToolCallsFromChatDeltas(pred.ChatDeltas)
|
toolCalls := functions.ToolCallsFromChatDeltas(pred.ChatDeltas)
|
||||||
@@ -273,19 +244,24 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
|
|||||||
announce()
|
announce()
|
||||||
}
|
}
|
||||||
|
|
||||||
// With clause chunking the clauses were synthesized on the worker as the
|
// Synthesize the audio. With clause chunking the completed clauses were
|
||||||
// reply streamed (including the trailing flush drained above), so the
|
// already spoken inside the token callback; flush the trailing clause(s)
|
||||||
// audio is already accumulated. Otherwise buffer the whole message and
|
// the segmenter was still holding. Otherwise buffer the whole message and
|
||||||
// synthesize it once now — emitSpeech streams the audio chunks when the
|
// synthesize it once. emitSpeech streams the audio chunks when the TTS
|
||||||
// TTS backend supports TTSStream, otherwise it sends a single unary delta.
|
// backend supports TTSStream, otherwise it sends a single unary delta.
|
||||||
var audio []byte
|
var audio []byte
|
||||||
if chunker != nil {
|
if chunker != nil {
|
||||||
|
for _, clause := range chunker.flush() {
|
||||||
|
if ttsErr = speakClause(clause); ttsErr != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
audio = streamedAudio
|
audio = streamedAudio
|
||||||
} else {
|
} else {
|
||||||
audio, ttsErr = emitSpeech(ctx, t, session, responseID, itemID, content)
|
audio, ttsErr = emitSpeech(ctx, t, session, responseID, itemID, content)
|
||||||
if ttsErr != nil {
|
}
|
||||||
return fail("tts_error", "TTS generation failed", ttsErr)
|
if ttsErr != nil {
|
||||||
}
|
return fail("tts_error", "TTS generation failed", ttsErr)
|
||||||
}
|
}
|
||||||
|
|
||||||
_, isWebRTC := t.(*WebRTCTransport)
|
_, isWebRTC := t.(*WebRTCTransport)
|
||||||
@@ -330,12 +306,10 @@ func streamLLMResponse(ctx context.Context, session *Session, conv *Conversation
|
|||||||
OutputIndex: 0,
|
OutputIndex: 0,
|
||||||
Item: item,
|
Item: item,
|
||||||
})
|
})
|
||||||
r.addItem(item)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Emit any tool calls and (for server-side assistant tools) the follow-up
|
// Emit any tool calls, the terminal response.done, and (for server-side
|
||||||
// turn — shared with the buffered path. The single terminal is emitted by
|
// assistant tools) the follow-up turn — shared with the buffered path.
|
||||||
// triggerResponse.
|
emitToolCallItems(ctx, session, conv, t, responseID, toolCalls, content != "", toolTurn)
|
||||||
emitToolCallItems(ctx, session, conv, t, r, toolCalls, content != "", toolTurn)
|
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -102,8 +102,7 @@ var _ = Describe("streamLLMResponse", func() {
|
|||||||
t := &fakeTransport{}
|
t := &fakeTransport{}
|
||||||
llmCfg := &config.ModelConfig{}
|
llmCfg := &config.ModelConfig{}
|
||||||
|
|
||||||
r := &liveResponse{id: "resp1"}
|
handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
|
||||||
handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)
|
|
||||||
|
|
||||||
Expect(handled).To(BeTrue())
|
Expect(handled).To(BeTrue())
|
||||||
// One live transcript delta per streamed token.
|
// One live transcript delta per streamed token.
|
||||||
@@ -133,8 +132,7 @@ var _ = Describe("streamLLMResponse", func() {
|
|||||||
t := &fakeTransport{}
|
t := &fakeTransport{}
|
||||||
llmCfg := &config.ModelConfig{}
|
llmCfg := &config.ModelConfig{}
|
||||||
|
|
||||||
r := &liveResponse{id: "resp1"}
|
handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
|
||||||
handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)
|
|
||||||
|
|
||||||
Expect(handled).To(BeTrue())
|
Expect(handled).To(BeTrue())
|
||||||
// Two clauses ("Hello world." mid-stream, "How are you?" on flush) → two
|
// Two clauses ("Hello world." mid-stream, "How are you?" on flush) → two
|
||||||
@@ -142,10 +140,8 @@ var _ = Describe("streamLLMResponse", func() {
|
|||||||
Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioDelta)).To(Equal(2))
|
Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioDelta)).To(Equal(2))
|
||||||
// The full transcript still streams verbatim.
|
// The full transcript still streams verbatim.
|
||||||
Expect(t.transcriptDeltaText()).To(Equal("Hello world. How are you?"))
|
Expect(t.transcriptDeltaText()).To(Equal("Hello world. How are you?"))
|
||||||
// The terminal response.done is emitted by triggerResponse, not by
|
// Exactly one terminal response.done.
|
||||||
// streamLLMResponse — so at this layer there are none.
|
Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
|
||||||
Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
|
|
||||||
Expect(r.outcome).To(Equal(outcomeCompleted))
|
|
||||||
})
|
})
|
||||||
|
|
||||||
It("streams content deltas and emits tool-call items (autoparser tool turn)", func() {
|
It("streams content deltas and emits tool-call items (autoparser tool turn)", func() {
|
||||||
@@ -173,18 +169,15 @@ var _ = Describe("streamLLMResponse", func() {
|
|||||||
llmCfg := &config.ModelConfig{}
|
llmCfg := &config.ModelConfig{}
|
||||||
llmCfg.TemplateConfig.UseTokenizerTemplate = true
|
llmCfg.TemplateConfig.UseTokenizerTemplate = true
|
||||||
|
|
||||||
r := &liveResponse{id: "resp1"}
|
handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
|
||||||
handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)
|
|
||||||
|
|
||||||
Expect(handled).To(BeTrue())
|
Expect(handled).To(BeTrue())
|
||||||
// The spoken content was streamed live.
|
// The spoken content was streamed live.
|
||||||
Expect(t.transcriptDeltaText()).To(Equal("Let me check."))
|
Expect(t.transcriptDeltaText()).To(Equal("Let me check."))
|
||||||
// The tool call is emitted as a function_call item.
|
// The tool call is emitted as a function_call item.
|
||||||
Expect(t.countEvents(types.ServerEventTypeResponseFunctionCallArgumentsDone)).To(Equal(1))
|
Expect(t.countEvents(types.ServerEventTypeResponseFunctionCallArgumentsDone)).To(Equal(1))
|
||||||
// The terminal response.done is emitted by triggerResponse, not by
|
// Exactly one terminal response.done.
|
||||||
// streamLLMResponse — so at this layer there are none.
|
Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
|
||||||
Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
|
|
||||||
Expect(r.outcome).To(Equal(outcomeCompleted))
|
|
||||||
})
|
})
|
||||||
|
|
||||||
It("emits only tool-call items for a content-less tool turn (no empty assistant item)", func() {
|
It("emits only tool-call items for a content-less tool turn (no empty assistant item)", func() {
|
||||||
@@ -207,8 +200,7 @@ var _ = Describe("streamLLMResponse", func() {
|
|||||||
llmCfg := &config.ModelConfig{}
|
llmCfg := &config.ModelConfig{}
|
||||||
llmCfg.TemplateConfig.UseTokenizerTemplate = true
|
llmCfg.TemplateConfig.UseTokenizerTemplate = true
|
||||||
|
|
||||||
r := &liveResponse{id: "resp1"}
|
handled := streamLLMResponse(context.Background(), session, conv, t, "resp1", nil, nil, llmCfg, nil, nil, 0)
|
||||||
handled := streamLLMResponse(context.Background(), session, conv, t, r, nil, nil, llmCfg, nil, nil, 0)
|
|
||||||
|
|
||||||
Expect(handled).To(BeTrue())
|
Expect(handled).To(BeTrue())
|
||||||
// No content → no transcript deltas and no spurious assistant content item.
|
// No content → no transcript deltas and no spurious assistant content item.
|
||||||
@@ -216,51 +208,6 @@ var _ = Describe("streamLLMResponse", func() {
|
|||||||
Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioTranscriptDelta)).To(Equal(0))
|
Expect(t.countEvents(types.ServerEventTypeResponseOutputAudioTranscriptDelta)).To(Equal(0))
|
||||||
// The tool call is still emitted.
|
// The tool call is still emitted.
|
||||||
Expect(t.countEvents(types.ServerEventTypeResponseFunctionCallArgumentsDone)).To(Equal(1))
|
Expect(t.countEvents(types.ServerEventTypeResponseFunctionCallArgumentsDone)).To(Equal(1))
|
||||||
Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(0))
|
|
||||||
Expect(r.outcome).To(Equal(outcomeCompleted))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
var _ = Describe("triggerResponse", func() {
|
|
||||||
It("emits exactly one response.created and one response.done with output and usage", func() {
|
|
||||||
m := &fakeModel{
|
|
||||||
cfg: &config.ModelConfig{},
|
|
||||||
predictResp: backend.LLMResponse{
|
|
||||||
Response: "Hi there.",
|
|
||||||
Usage: backend.TokenUsage{Prompt: 5, Completion: 3},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
session := &Session{
|
|
||||||
OutputSampleRate: 24000,
|
|
||||||
ModelInterface: m,
|
|
||||||
ModelConfig: &config.ModelConfig{},
|
|
||||||
// Text-only so the buffered path skips TTS and the assertion focuses
|
|
||||||
// on the terminal's Output + Usage.
|
|
||||||
OutputModalities: []types.Modality{types.ModalityText},
|
|
||||||
}
|
|
||||||
conv := &Conversation{}
|
|
||||||
t := &fakeTransport{}
|
|
||||||
|
|
||||||
triggerResponse(context.Background(), session, conv, t, nil)
|
|
||||||
|
|
||||||
// Exactly one of each lifecycle event for the whole response.create.
|
|
||||||
Expect(t.countEvents(types.ServerEventTypeResponseCreated)).To(Equal(1))
|
|
||||||
Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
|
Expect(t.countEvents(types.ServerEventTypeResponseDone)).To(Equal(1))
|
||||||
|
|
||||||
// The single terminal carries the produced output item and the usage —
|
|
||||||
// both empty in the legacy code.
|
|
||||||
var done *types.ResponseDoneEvent
|
|
||||||
for i := range t.events {
|
|
||||||
if d, ok := t.events[i].(types.ResponseDoneEvent); ok {
|
|
||||||
done = &d
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Expect(done).NotTo(BeNil())
|
|
||||||
Expect(done.Response.Status).To(Equal(types.ResponseStatusCompleted))
|
|
||||||
Expect(done.Response.Output).To(HaveLen(1))
|
|
||||||
Expect(done.Response.Usage).NotTo(BeNil())
|
|
||||||
Expect(done.Response.Usage.InputTokens).To(Equal(5))
|
|
||||||
Expect(done.Response.Usage.OutputTokens).To(Equal(3))
|
|
||||||
Expect(done.Response.Usage.TotalTokens).To(Equal(8))
|
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -7,33 +7,6 @@ import (
|
|||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
|
"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
|
||||||
)
|
)
|
||||||
|
|
||||||
// emitPrecomputedTranscription emits the transcription events for a turn
|
|
||||||
// whose transcript already exists (semantic_vad's live stream, or the
|
|
||||||
// retranscribe gate's batch decode): optional delta replays followed by the
|
|
||||||
// completed event — the same contract emitTranscription produces, sharing
|
|
||||||
// one itemID — without running the backend again.
|
|
||||||
func emitPrecomputedTranscription(t Transport, itemID string, deltas []string, transcript string) error {
|
|
||||||
for _, d := range deltas {
|
|
||||||
if d == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if err := t.SendEvent(types.ConversationItemInputAudioTranscriptionDeltaEvent{
|
|
||||||
ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
|
|
||||||
ItemID: itemID,
|
|
||||||
ContentIndex: 0,
|
|
||||||
Delta: d,
|
|
||||||
}); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return t.SendEvent(types.ConversationItemInputAudioTranscriptionCompletedEvent{
|
|
||||||
ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
|
|
||||||
ItemID: itemID,
|
|
||||||
ContentIndex: 0,
|
|
||||||
Transcript: transcript,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// emitTranscription transcribes a committed utterance and emits the transcription
|
// emitTranscription transcribes a committed utterance and emits the transcription
|
||||||
// events for it, returning the final transcript text. With
|
// events for it, returning the final transcript text. With
|
||||||
// pipeline.streaming.transcription enabled it streams each transcript fragment as
|
// pipeline.streaming.transcription enabled it streams each transcript fragment as
|
||||||
|
|||||||
@@ -1,153 +0,0 @@
|
|||||||
package openai
|
|
||||||
|
|
||||||
import (
|
|
||||||
"sync"
|
|
||||||
"sync/atomic"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/ttscoord"
|
|
||||||
)
|
|
||||||
|
|
||||||
// ttsPipeline decouples speech synthesis from LLM token generation.
|
|
||||||
//
|
|
||||||
// The LLM token callback runs on the same goroutine that drains the model's
|
|
||||||
// gRPC stream, so anything it does serially — including a blocking TTS call —
|
|
||||||
// stops the stream from being read and stalls generation (and, since the same
|
|
||||||
// goroutine also sends the assistant transcript, freezes the transcript the
|
|
||||||
// client sees). ttsPipeline lets the callback hand each completed clause to a
|
|
||||||
// single worker goroutine that synthesizes them in order, concurrently with
|
|
||||||
// continued generation. One worker preserves clause — and therefore audio —
|
|
||||||
// ordering.
|
|
||||||
//
|
|
||||||
// The clause queue is intentionally unbounded: clauses are short strings and a
|
|
||||||
// reply has a bounded number of them, while the expensive product (audio) is
|
|
||||||
// paced by the TTS backend regardless. So enqueue never blocks the callback,
|
|
||||||
// and the transcript streams to the client at generation speed while audio is
|
|
||||||
// produced behind it.
|
|
||||||
type ttsPipeline struct {
|
|
||||||
speak func(clause string) ([]byte, error)
|
|
||||||
|
|
||||||
mu sync.Mutex
|
|
||||||
queue []string
|
|
||||||
wake chan struct{} // buffered(1) wakeup signal for the worker
|
|
||||||
|
|
||||||
// coord owns the open->closing->closed lifecycle (machine M5). It replaces the
|
|
||||||
// legacy `closed bool`: the producer raises Close (wait()), the worker raises
|
|
||||||
// WorkerExited. See ttscoord/ and realtime-state-machines.md.
|
|
||||||
coord *ttscoord.Coordinator
|
|
||||||
|
|
||||||
done chan struct{}
|
|
||||||
failed atomic.Bool
|
|
||||||
|
|
||||||
// audio and firstErr are owned by the worker goroutine and only safe to
|
|
||||||
// read after wait() has returned (it joins on the worker via done).
|
|
||||||
audio []byte
|
|
||||||
firstErr error
|
|
||||||
}
|
|
||||||
|
|
||||||
// newTTSPipeline starts the worker. speak performs the actual synthesis and
|
|
||||||
// returns the PCM accumulated for the conversation-item record (empty for
|
|
||||||
// transports that stream audio out-of-band, e.g. WebRTC).
|
|
||||||
func newTTSPipeline(speak func(clause string) ([]byte, error)) *ttsPipeline {
|
|
||||||
p := &ttsPipeline{
|
|
||||||
speak: speak,
|
|
||||||
wake: make(chan struct{}, 1),
|
|
||||||
done: make(chan struct{}),
|
|
||||||
}
|
|
||||||
p.coord = ttscoord.New(p)
|
|
||||||
go p.run()
|
|
||||||
return p
|
|
||||||
}
|
|
||||||
|
|
||||||
// closing reports whether wait() has been called (lifecycle past Open). Read
|
|
||||||
// under p.mu in the worker so the queue-empty check and the close check are
|
|
||||||
// consistent.
|
|
||||||
func (p *ttsPipeline) closing() bool {
|
|
||||||
_, open := p.coord.State().(ttscoord.Open)
|
|
||||||
return !open
|
|
||||||
}
|
|
||||||
|
|
||||||
// Perform executes a coordinator effect. Wake nudges the worker (non-blocking).
|
|
||||||
func (p *ttsPipeline) Perform(e ttscoord.Effect) {
|
|
||||||
if _, ok := e.(ttscoord.Wake); ok {
|
|
||||||
p.signal()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *ttsPipeline) run() {
|
|
||||||
defer close(p.done)
|
|
||||||
for {
|
|
||||||
p.mu.Lock()
|
|
||||||
for len(p.queue) == 0 && !p.closing() {
|
|
||||||
p.mu.Unlock()
|
|
||||||
<-p.wake
|
|
||||||
p.mu.Lock()
|
|
||||||
}
|
|
||||||
if len(p.queue) == 0 && p.closing() {
|
|
||||||
p.mu.Unlock()
|
|
||||||
// Drained and closed: advance the lifecycle to Closed, then exit
|
|
||||||
// (the deferred close(p.done) joins the producer's wait()).
|
|
||||||
_ = p.coord.Apply(ttscoord.WorkerExited{})
|
|
||||||
return
|
|
||||||
}
|
|
||||||
clause := p.queue[0]
|
|
||||||
p.queue = p.queue[1:]
|
|
||||||
p.mu.Unlock()
|
|
||||||
|
|
||||||
// Once a clause has failed, keep draining the queue without speaking so
|
|
||||||
// the producer's wait() returns promptly and the first error is kept.
|
|
||||||
if p.failed.Load() {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
a, err := p.speak(clause)
|
|
||||||
if err != nil {
|
|
||||||
p.firstErr = err
|
|
||||||
p.failed.Store(true)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
p.audio = append(p.audio, a...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// enqueue offers a clause for synthesis. It never blocks; it returns false once
|
|
||||||
// synthesis has failed, signalling the caller to stop the prediction.
|
|
||||||
func (p *ttsPipeline) enqueue(clause string) bool {
|
|
||||||
if p.failed.Load() {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
p.mu.Lock()
|
|
||||||
// Reject once closing/closed: the worker may have already drained and exited,
|
|
||||||
// so a clause queued now would be silently dropped. The lifecycle (Open) and
|
|
||||||
// the append are checked under the same lock, so the worker cannot exit between
|
|
||||||
// the gate and the enqueue (it takes p.mu to observe the empty queue).
|
|
||||||
if p.closing() {
|
|
||||||
p.mu.Unlock()
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
p.queue = append(p.queue, clause)
|
|
||||||
p.mu.Unlock()
|
|
||||||
p.signal()
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
// signal wakes the worker without blocking; the buffered channel coalesces
|
|
||||||
// signals, which is safe because the worker drains the whole queue per wake.
|
|
||||||
func (p *ttsPipeline) signal() {
|
|
||||||
select {
|
|
||||||
case p.wake <- struct{}{}:
|
|
||||||
default:
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// wait closes the queue and blocks until the worker has spoken every enqueued
|
|
||||||
// clause, then returns the accumulated audio and the first synthesis error. It
|
|
||||||
// is idempotent: calling it again returns the same result without blocking, so
|
|
||||||
// callers can drain it explicitly to read the audio and still defer a wait() as
|
|
||||||
// a leak-proof backstop. No clause may be enqueued after the first wait().
|
|
||||||
func (p *ttsPipeline) wait() ([]byte, error) {
|
|
||||||
// Close the lifecycle (Open->Closing) and wake the worker. Idempotent: a
|
|
||||||
// second Close is absorbed (no second wake), and <-p.done returns immediately
|
|
||||||
// once the worker has exited.
|
|
||||||
_ = p.coord.Apply(ttscoord.Close{})
|
|
||||||
<-p.done
|
|
||||||
return p.audio, p.firstErr
|
|
||||||
}
|
|
||||||
@@ -1,114 +0,0 @@
|
|||||||
package openai
|
|
||||||
|
|
||||||
import (
|
|
||||||
"errors"
|
|
||||||
"sync"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
var _ = Describe("ttsPipeline", func() {
|
|
||||||
It("synthesizes clauses in order and accumulates their audio", func() {
|
|
||||||
p := newTTSPipeline(func(clause string) ([]byte, error) {
|
|
||||||
return []byte(clause), nil
|
|
||||||
})
|
|
||||||
Expect(p.enqueue("a")).To(BeTrue())
|
|
||||||
Expect(p.enqueue("b")).To(BeTrue())
|
|
||||||
Expect(p.enqueue("c")).To(BeTrue())
|
|
||||||
|
|
||||||
audio, err := p.wait()
|
|
||||||
Expect(err).NotTo(HaveOccurred())
|
|
||||||
Expect(string(audio)).To(Equal("abc"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("never blocks the producer even when synthesis is slow", func() {
|
|
||||||
var started sync.WaitGroup
|
|
||||||
started.Add(1)
|
|
||||||
release := make(chan struct{})
|
|
||||||
first := true
|
|
||||||
p := newTTSPipeline(func(clause string) ([]byte, error) {
|
|
||||||
if first {
|
|
||||||
first = false
|
|
||||||
started.Done()
|
|
||||||
<-release // hold the worker on the first clause
|
|
||||||
}
|
|
||||||
return []byte(clause), nil
|
|
||||||
})
|
|
||||||
|
|
||||||
Expect(p.enqueue("1")).To(BeTrue())
|
|
||||||
started.Wait() // worker is now blocked synthesizing the first clause
|
|
||||||
|
|
||||||
// Enqueuing many more clauses must return immediately, not block on the
|
|
||||||
// stalled worker — this is what keeps the LLM recv loop flowing.
|
|
||||||
done := make(chan struct{})
|
|
||||||
go func() {
|
|
||||||
defer close(done)
|
|
||||||
for _, c := range []string{"2", "3", "4", "5"} {
|
|
||||||
p.enqueue(c)
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
Eventually(done, time.Second).Should(BeClosed())
|
|
||||||
|
|
||||||
close(release)
|
|
||||||
audio, err := p.wait()
|
|
||||||
Expect(err).NotTo(HaveOccurred())
|
|
||||||
Expect(string(audio)).To(Equal("12345"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("keeps the first error, stops speaking, and signals the producer to stop", func() {
|
|
||||||
boom := errors.New("backend gone")
|
|
||||||
var spoken []string
|
|
||||||
var mu sync.Mutex
|
|
||||||
p := newTTSPipeline(func(clause string) ([]byte, error) {
|
|
||||||
mu.Lock()
|
|
||||||
spoken = append(spoken, clause)
|
|
||||||
mu.Unlock()
|
|
||||||
if clause == "b" {
|
|
||||||
return nil, boom
|
|
||||||
}
|
|
||||||
return []byte(clause), nil
|
|
||||||
})
|
|
||||||
|
|
||||||
Expect(p.enqueue("a")).To(BeTrue())
|
|
||||||
Expect(p.enqueue("b")).To(BeTrue())
|
|
||||||
|
|
||||||
// Once the failure is observed, enqueue reports it so the caller stops
|
|
||||||
// the prediction; any further clauses are dropped, not spoken.
|
|
||||||
Eventually(func() bool { return !p.enqueue("c") }, time.Second).Should(BeTrue())
|
|
||||||
|
|
||||||
_, err := p.wait()
|
|
||||||
Expect(err).To(MatchError(boom))
|
|
||||||
|
|
||||||
mu.Lock()
|
|
||||||
defer mu.Unlock()
|
|
||||||
Expect(spoken).NotTo(ContainElement("c"), "clauses after the failure are not synthesized")
|
|
||||||
})
|
|
||||||
|
|
||||||
It("is idempotent: a second wait returns the same result without blocking", func() {
|
|
||||||
p := newTTSPipeline(func(clause string) ([]byte, error) {
|
|
||||||
return []byte(clause), nil
|
|
||||||
})
|
|
||||||
Expect(p.enqueue("x")).To(BeTrue())
|
|
||||||
|
|
||||||
audio1, err1 := p.wait()
|
|
||||||
// A deferred backstop wait() in the caller runs after the explicit one;
|
|
||||||
// it must not block or change the result.
|
|
||||||
audio2, err2 := p.wait()
|
|
||||||
|
|
||||||
Expect(err1).NotTo(HaveOccurred())
|
|
||||||
Expect(err2).NotTo(HaveOccurred())
|
|
||||||
Expect(string(audio1)).To(Equal("x"))
|
|
||||||
Expect(string(audio2)).To(Equal("x"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("returns cleanly when no clause was ever enqueued", func() {
|
|
||||||
p := newTTSPipeline(func(clause string) ([]byte, error) {
|
|
||||||
return []byte(clause), nil
|
|
||||||
})
|
|
||||||
audio, err := p.wait()
|
|
||||||
Expect(err).NotTo(HaveOccurred())
|
|
||||||
Expect(audio).To(BeEmpty())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
@@ -1,127 +0,0 @@
|
|||||||
package openai
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/respcoord"
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/turncoord"
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
|
|
||||||
"github.com/mudler/LocalAI/core/schema"
|
|
||||||
)
|
|
||||||
|
|
||||||
// turnSink wires the explicit turn-detection state machine (turncoord.Coordinator
|
|
||||||
// — machine "M2" in docs/design/realtime-state-machines.md) into handleVAD.
|
|
||||||
//
|
|
||||||
// In the legacy code the turn lifecycle was split across two variables that could
|
|
||||||
// disagree: handleVAD's goroutine-local speechStarted bool and the semantic_vad
|
|
||||||
// liveTurnState's "is the live stream open" flag (lts.open()). A discardTurn (the
|
|
||||||
// no-speech clear, or teardown) closed the live stream but left speechStarted
|
|
||||||
// true, so the next speech onset was suppressed by `if !speechStarted` — no
|
|
||||||
// speech_started, no barge-in, no commit (Part 2, failure mode 4). Here "speech
|
|
||||||
// started" and "a turn is open" are ONE coordinator state, so they cannot desync.
|
|
||||||
//
|
|
||||||
// Unlike responseSink (M3), which is a genuine dual-writer race, the turn machine
|
|
||||||
// is owned by the single handleVAD goroutine; this sink and its coordinator are
|
|
||||||
// loop-local. The coordinator's lock only matters for the teardown-time Abort and
|
|
||||||
// for keeping State() readable — there is no second writer.
|
|
||||||
//
|
|
||||||
// The effects map onto the existing turn I/O:
|
|
||||||
// - OpenTurn: open the live ASR stream (semantic_vad) + feed the onset
|
|
||||||
// audio. A failed open degrades the turn to silence-only — the turn still
|
|
||||||
// proceeds (server_vad-like), matching the legacy behaviour.
|
|
||||||
// - BargeIn: cancel any in-flight response (non-blocking).
|
|
||||||
// - EmitSpeechStarted: input_audio_buffer.speech_started.
|
|
||||||
// - EmitSpeechStopped: input_audio_buffer.speech_stopped.
|
|
||||||
// - CommitTurn: committed event + finalize the live stream + issue the
|
|
||||||
// response (via responseSink/respcoord).
|
|
||||||
// - DiscardTurn: close the live stream and retract any captions.
|
|
||||||
//
|
|
||||||
// The data-heavy effects (OpenTurn, CommitTurn) need the current tick's audio and
|
|
||||||
// transcription context. Because Apply performs effects synchronously on the same
|
|
||||||
// (handleVAD) goroutine, the loop sets the relevant scratch fields immediately
|
|
||||||
// before each Apply; there is no cross-goroutine sharing.
|
|
||||||
type turnSink struct {
|
|
||||||
session *Session
|
|
||||||
conv *Conversation
|
|
||||||
transport Transport
|
|
||||||
lts *liveTurnState
|
|
||||||
vadContext context.Context
|
|
||||||
startTime time.Time
|
|
||||||
|
|
||||||
coord *turncoord.Coordinator
|
|
||||||
|
|
||||||
// per-tick context, set by handleVAD before each Apply (single goroutine).
|
|
||||||
sv *types.RealtimeSessionSemanticVad // nil = server_vad
|
|
||||||
onsetAudio []int16 // OpenTurn feeds this
|
|
||||||
commitAudio []byte // CommitTurn issues this
|
|
||||||
commitAudioLength float64 // for finishTurn (flush tail)
|
|
||||||
commitRetranscribe bool // gated batch is authoritative
|
|
||||||
commitGated *schema.TranscriptionResult // retranscribe batch decode
|
|
||||||
}
|
|
||||||
|
|
||||||
func newTurnSink(session *Session, conv *Conversation, t Transport, lts *liveTurnState, vadContext context.Context, startTime time.Time) *turnSink {
|
|
||||||
s := &turnSink{
|
|
||||||
session: session,
|
|
||||||
conv: conv,
|
|
||||||
transport: t,
|
|
||||||
lts: lts,
|
|
||||||
vadContext: vadContext,
|
|
||||||
startTime: startTime,
|
|
||||||
}
|
|
||||||
s.coord = turncoord.New(s)
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
|
|
||||||
// Perform executes one effect. It is called by Coordinator.Apply while the
|
|
||||||
// coordinator lock is held. The turn coordinator is single-writer (handleVAD), so
|
|
||||||
// the synchronous network writes / lts operations here are the same ones the
|
|
||||||
// legacy loop did inline on this goroutine; they never contend the lock.
|
|
||||||
func (s *turnSink) Perform(e turncoord.Effect) {
|
|
||||||
switch eff := e.(type) {
|
|
||||||
case turncoord.OpenTurn:
|
|
||||||
if s.sv != nil && s.lts.openTurn(s.vadContext, string(eff.Turn)) {
|
|
||||||
s.lts.feedNewAudio(s.onsetAudio)
|
|
||||||
}
|
|
||||||
case turncoord.BargeIn:
|
|
||||||
s.session.respSink.cancel(respcoord.SourceVAD)
|
|
||||||
case turncoord.EmitSpeechStarted:
|
|
||||||
sendEvent(s.transport, types.InputAudioBufferSpeechStartedEvent{
|
|
||||||
ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
|
|
||||||
AudioStartMs: time.Since(s.startTime).Milliseconds(),
|
|
||||||
})
|
|
||||||
case turncoord.EmitSpeechStopped:
|
|
||||||
sendEvent(s.transport, types.InputAudioBufferSpeechStoppedEvent{
|
|
||||||
ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
|
|
||||||
AudioEndMs: time.Since(s.startTime).Milliseconds(),
|
|
||||||
})
|
|
||||||
case turncoord.CommitTurn:
|
|
||||||
// The committed item id is the coordinator's turn id (== the live caption
|
|
||||||
// id), so the client's completed event replaces the partial text.
|
|
||||||
itemID := string(eff.Turn)
|
|
||||||
sendEvent(s.transport, types.InputAudioBufferCommittedEvent{
|
|
||||||
ServerEventBase: types.ServerEventBase{EventID: "event_TODO"},
|
|
||||||
ItemID: itemID,
|
|
||||||
PreviousItemID: "TODO",
|
|
||||||
})
|
|
||||||
// Finalize the turn's live stream (flushes the decode tail). In
|
|
||||||
// retranscribe mode the batch decode is authoritative, so the streamed
|
|
||||||
// transcript is dropped.
|
|
||||||
var live *liveUtterance
|
|
||||||
if s.sv != nil {
|
|
||||||
ut := s.lts.finishTurn(s.commitAudioLength)
|
|
||||||
if !s.commitRetranscribe {
|
|
||||||
live = ut
|
|
||||||
}
|
|
||||||
}
|
|
||||||
audio := s.commitAudio
|
|
||||||
gated := s.commitGated
|
|
||||||
conv := s.conv
|
|
||||||
s.session.respSink.issue(s.vadContext, respcoord.SourceVAD, func(ctx context.Context) {
|
|
||||||
commitUtteranceWithTranscript(ctx, audio, live, gated, itemID, s.session, conv, s.transport)
|
|
||||||
})
|
|
||||||
case turncoord.DiscardTurn:
|
|
||||||
// No-op if the stream was never open (server_vad / already idle).
|
|
||||||
s.lts.discardTurn()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,54 +0,0 @@
|
|||||||
package openai
|
|
||||||
|
|
||||||
import (
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
// dropInspectedPrefix is what stands between the VAD loop's buffer clears and
|
|
||||||
// cutting the first word off an utterance: the no-speech clear must keep the
|
|
||||||
// holdback tail (silero hasn't crossed its onset threshold yet) and both
|
|
||||||
// clears must keep audio appended while the tick ran (the VAD never saw it).
|
|
||||||
var _ = Describe("dropInspectedPrefix", func() {
|
|
||||||
It("keeps the holdback tail of the inspected window and everything appended mid-tick", func() {
|
|
||||||
inspected := []byte{1, 2, 3, 4, 5, 6}
|
|
||||||
appended := []byte{7, 8}
|
|
||||||
buf := append(append([]byte(nil), inspected...), appended...)
|
|
||||||
|
|
||||||
out := dropInspectedPrefix(buf, len(inspected), 2)
|
|
||||||
|
|
||||||
Expect(out).To(Equal([]byte{5, 6, 7, 8}), "older confirmed-silent head dropped, possible onset + fresh audio kept")
|
|
||||||
})
|
|
||||||
|
|
||||||
It("returns the buffer unchanged when the inspected window fits in the holdback", func() {
|
|
||||||
buf := []byte{1, 2, 3}
|
|
||||||
|
|
||||||
Expect(dropInspectedPrefix(buf, len(buf), 4)).To(Equal(buf))
|
|
||||||
Expect(dropInspectedPrefix(buf, len(buf), len(buf))).To(Equal(buf))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("drops the whole inspected window with zero holdback, keeping only mid-tick appends", func() {
|
|
||||||
// The commit-time clear: the inspected audio was committed, audio
|
|
||||||
// appended while the tick ran belongs to the next turn.
|
|
||||||
buf := []byte{1, 2, 3, 4}
|
|
||||||
|
|
||||||
Expect(dropInspectedPrefix(buf, 4, 0)).To(BeEmpty())
|
|
||||||
Expect(dropInspectedPrefix(append(buf, 9), 4, 0)).To(Equal([]byte{9}))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("clamps when told more was inspected than the buffer holds", func() {
|
|
||||||
buf := []byte{1, 2}
|
|
||||||
|
|
||||||
Expect(dropInspectedPrefix(buf, 10, 0)).To(BeEmpty())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("returns a copy, not a sub-slice, when bytes are dropped", func() {
|
|
||||||
buf := []byte{1, 2, 3, 4}
|
|
||||||
|
|
||||||
out := dropInspectedPrefix(buf, 4, 2)
|
|
||||||
|
|
||||||
Expect(out).To(Equal([]byte{3, 4}))
|
|
||||||
buf[2] = 99
|
|
||||||
Expect(out).To(Equal([]byte{3, 4}), "mutating the old backing array must not leak into the published buffer")
|
|
||||||
})
|
|
||||||
})
|
|
||||||
@@ -1,267 +0,0 @@
|
|||||||
// Package respcoord is the explicit state machine for the realtime API's
|
|
||||||
// response-coordination concern (machine "M3" in
|
|
||||||
// docs/design/realtime-state-machines.md).
|
|
||||||
//
|
|
||||||
// In the legacy code this machine is implicit: a response is "active" iff
|
|
||||||
// Session.activeResponseDone is a non-nil, unclosed channel, and the lifecycle
|
|
||||||
// is driven from TWO goroutines (the client read-loop and the VAD goroutine)
|
|
||||||
// that both call startResponse/cancelActiveResponse. responseMu guards only the
|
|
||||||
// field swap, while the <-done wait happens outside the lock, so two concurrent
|
|
||||||
// starts can briefly leave two live response goroutines both appending to the
|
|
||||||
// conversation. See docs/design/realtime-state-machines.md, Part 2 (failure
|
|
||||||
// mode 2) and the ResponseLifecycle spec under formal-verification/.
|
|
||||||
//
|
|
||||||
// This package replaces that with:
|
|
||||||
// - a sealed sum type for State (illegal states are unrepresentable),
|
|
||||||
// - a total, pure transition function Next(state, event) -> (state, effects),
|
|
||||||
// - a single-writer Coordinator that serializes every transition.
|
|
||||||
//
|
|
||||||
// The design guarantees the invariants the specs check:
|
|
||||||
// - at most one live response at any instant,
|
|
||||||
// - exactly one terminal (response.done) per started response,
|
|
||||||
// - no response is started after its terminal (no resurrection).
|
|
||||||
package respcoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
|
|
||||||
)
|
|
||||||
|
|
||||||
// ResponseID identifies a single response attempt. The caller mints a fresh,
|
|
||||||
// monotonically increasing id for every Start; ids are never reused. The
|
|
||||||
// monotonic id is what lets the machine ignore "stale" Finished events from a
|
|
||||||
// response that was already superseded or cancelled.
|
|
||||||
type ResponseID uint64
|
|
||||||
|
|
||||||
// Source identifies the goroutine that raised an event. It exists purely for
// observability/logging: no transition branches on it, because both sources
// carry equal authority. Carrying it on the event type keeps the dual-writer
// reality visible instead of hidden.
type Source int

const (
	// SourceClient marks events raised by the client read-loop, i.e. a
	// response.create or a manual input_audio_buffer.commit.
	SourceClient Source = iota
	// SourceVAD marks events raised by the turn-detection goroutine, i.e. an
	// end-of-speech commit or a barge-in cancel.
	SourceVAD
)

// String returns the lower-case name of a declared source, or "Source(n)" for
// any value outside the declared constants.
func (s Source) String() string {
	if s == SourceClient {
		return "client"
	}
	if s == SourceVAD {
		return "vad"
	}
	return fmt.Sprintf("Source(%d)", int(s))
}
|
|
||||||
|
|
||||||
// Status is the terminal outcome reported on response.done.
type Status int

const (
	// StatusCompleted marks a response that ran to its own end.
	StatusCompleted Status = iota
	// StatusCancelled marks a response cut short by a barge-in, an explicit
	// response.cancel, or by being superseded by a newer response.
	StatusCancelled
)

// String returns the lower-case name of a declared status, or "Status(n)" for
// any value outside the declared constants.
func (s Status) String() string {
	if s == StatusCompleted {
		return "completed"
	}
	if s == StatusCancelled {
		return "cancelled"
	}
	return fmt.Sprintf("Status(%d)", int(s))
}
|
|
||||||
|
|
||||||
// State is the sealed sum type of coordinator states. The only implementations
|
|
||||||
// are the unexported-method-bearing structs in this file, so callers outside
|
|
||||||
// the package cannot fabricate an out-of-band state. Exhaustively:
|
|
||||||
// Idle | Active | Terminated.
|
|
||||||
type State interface {
|
|
||||||
isState()
|
|
||||||
String() string
|
|
||||||
}
|
|
||||||
|
|
||||||
// Idle: no response is in flight.
|
|
||||||
type Idle struct{}
|
|
||||||
|
|
||||||
// Active: exactly one response (ID) is in flight. The struct holds a single id,
|
|
||||||
// so "two active responses" is not representable.
|
|
||||||
type Active struct{ ID ResponseID }
|
|
||||||
|
|
||||||
// Terminated: the session is torn down. Absorbing — no response can start from
|
|
||||||
// here, so the M1 (connection) parent's teardown can guarantee no response
|
|
||||||
// outlives the session (see formal-verification/session_lifecycle.fizz).
|
|
||||||
type Terminated struct{}
|
|
||||||
|
|
||||||
func (Idle) isState() {}
|
|
||||||
func (Active) isState() {}
|
|
||||||
func (Terminated) isState() {}
|
|
||||||
|
|
||||||
func (Idle) String() string { return "Idle" }
|
|
||||||
func (a Active) String() string { return fmt.Sprintf("Active(%d)", a.ID) }
|
|
||||||
func (Terminated) String() string { return "Terminated" }
|
|
||||||
|
|
||||||
// Event is the sealed sum type of inputs. Exhaustively:
|
|
||||||
// Start | Finished | Cancel | Shutdown.
|
|
||||||
type Event interface {
|
|
||||||
isEvent()
|
|
||||||
String() string
|
|
||||||
}
|
|
||||||
|
|
||||||
// Start requests a new response. ID must be a fresh, never-before-used id.
|
|
||||||
type Start struct {
|
|
||||||
ID ResponseID
|
|
||||||
Source Source
|
|
||||||
}
|
|
||||||
|
|
||||||
// Finished reports that the response goroutine for ID reached its own terminal.
|
|
||||||
// If ID is not the currently-active response it is "stale" (the response was
|
|
||||||
// already superseded/cancelled) and is ignored.
|
|
||||||
type Finished struct{ ID ResponseID }
|
|
||||||
|
|
||||||
// Cancel requests cancellation of the in-flight response (barge-in or explicit
|
|
||||||
// response.cancel). It is a no-op when idle.
|
|
||||||
type Cancel struct{ Source Source }
|
|
||||||
|
|
||||||
// Shutdown terminates the coordinator at session teardown: it cancels any
|
|
||||||
// in-flight response and moves to the absorbing Terminated state, after which no
|
|
||||||
// response can start. Raised by the connection (M1) parent's teardown.
|
|
||||||
type Shutdown struct{}
|
|
||||||
|
|
||||||
func (Start) isEvent() {}
|
|
||||||
func (Finished) isEvent() {}
|
|
||||||
func (Cancel) isEvent() {}
|
|
||||||
func (Shutdown) isEvent() {}
|
|
||||||
|
|
||||||
func (e Start) String() string { return fmt.Sprintf("Start(%d,%s)", e.ID, e.Source) }
|
|
||||||
func (e Finished) String() string { return fmt.Sprintf("Finished(%d)", e.ID) }
|
|
||||||
func (e Cancel) String() string { return fmt.Sprintf("Cancel(%s)", e.Source) }
|
|
||||||
func (Shutdown) String() string { return "Shutdown" }
|
|
||||||
|
|
||||||
// Effect is a side effect returned by Next as data for the caller to perform.
|
|
||||||
// Returning effects as data (rather than firing callbacks inside the
|
|
||||||
// transition) keeps Next pure and exhaustively testable, and lets the
|
|
||||||
// Coordinator decide how/when to perform them. Exhaustively:
|
|
||||||
// CancelResponse | StartResponse | EmitTerminal.
|
|
||||||
type Effect interface {
|
|
||||||
isEffect()
|
|
||||||
String() string
|
|
||||||
}
|
|
||||||
|
|
||||||
// CancelResponse: cancel the context of the running response ID.
|
|
||||||
type CancelResponse struct{ ID ResponseID }
|
|
||||||
|
|
||||||
// StartResponse: spawn the response goroutine for ID.
|
|
||||||
type StartResponse struct{ ID ResponseID }
|
|
||||||
|
|
||||||
// EmitTerminal: send response.done for ID with Status.
|
|
||||||
type EmitTerminal struct {
|
|
||||||
ID ResponseID
|
|
||||||
Status Status
|
|
||||||
}
|
|
||||||
|
|
||||||
func (CancelResponse) isEffect() {}
|
|
||||||
func (StartResponse) isEffect() {}
|
|
||||||
func (EmitTerminal) isEffect() {}
|
|
||||||
|
|
||||||
func (e CancelResponse) String() string { return fmt.Sprintf("CancelResponse(%d)", e.ID) }
|
|
||||||
func (e StartResponse) String() string { return fmt.Sprintf("StartResponse(%d)", e.ID) }
|
|
||||||
func (e EmitTerminal) String() string {
|
|
||||||
return fmt.Sprintf("EmitTerminal(%d,%s)", e.ID, e.Status)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Next is the total, pure transition function. For every (state, event) it
|
|
||||||
// returns the next state and the ordered effects to perform. It returns a
|
|
||||||
// non-nil error only for an unknown State/Event implementation (a programmer
|
|
||||||
// error / future type added without updating this function) — callers must
|
|
||||||
// surface that, never silently ignore it. Every in-domain (state, event) pair
|
|
||||||
// is defined; there are no "forbidden" transitions, only no-ops for stale or
|
|
||||||
// idle inputs.
|
|
||||||
//
|
|
||||||
// The supersede rule (Active + Start) is the crux of the fix: starting a new
|
|
||||||
// response while one is active emits the old response's cancelled terminal and
|
|
||||||
// cancels it BEFORE the replacement starts, all within one serialized
|
|
||||||
// transition. The old goroutine's later Finished is therefore stale and
|
|
||||||
// ignored — so each id gets exactly one terminal and there is never more than
|
|
||||||
// one live response.
|
|
||||||
func Next(s State, e Event) (State, []Effect, error) {
|
|
||||||
switch st := s.(type) {
|
|
||||||
case Idle:
|
|
||||||
switch ev := e.(type) {
|
|
||||||
case Start:
|
|
||||||
return Active{ID: ev.ID}, []Effect{StartResponse{ID: ev.ID}}, nil
|
|
||||||
case Cancel:
|
|
||||||
// Nothing in flight: idempotent no-op.
|
|
||||||
return Idle{}, nil, nil
|
|
||||||
case Finished:
|
|
||||||
// Stale terminal from an already-superseded/cancelled response.
|
|
||||||
return Idle{}, nil, nil
|
|
||||||
case Shutdown:
|
|
||||||
// Teardown with nothing in flight: go terminal.
|
|
||||||
return Terminated{}, nil, nil
|
|
||||||
}
|
|
||||||
case Active:
|
|
||||||
switch ev := e.(type) {
|
|
||||||
case Start:
|
|
||||||
return Active{ID: ev.ID}, []Effect{
|
|
||||||
CancelResponse{ID: st.ID},
|
|
||||||
EmitTerminal{ID: st.ID, Status: StatusCancelled},
|
|
||||||
StartResponse{ID: ev.ID},
|
|
||||||
}, nil
|
|
||||||
case Finished:
|
|
||||||
if ev.ID == st.ID {
|
|
||||||
return Idle{}, []Effect{EmitTerminal{ID: st.ID, Status: StatusCompleted}}, nil
|
|
||||||
}
|
|
||||||
// Stale finish from a superseded response — already terminal-ed.
|
|
||||||
return Active{ID: st.ID}, nil, nil
|
|
||||||
case Cancel:
|
|
||||||
return Idle{}, []Effect{
|
|
||||||
CancelResponse{ID: st.ID},
|
|
||||||
EmitTerminal{ID: st.ID, Status: StatusCancelled},
|
|
||||||
}, nil
|
|
||||||
case Shutdown:
|
|
||||||
// Teardown while a response is live: cancel it (with its terminal) and
|
|
||||||
// go terminal so nothing can start afterwards.
|
|
||||||
return Terminated{}, []Effect{
|
|
||||||
CancelResponse{ID: st.ID},
|
|
||||||
EmitTerminal{ID: st.ID, Status: StatusCancelled},
|
|
||||||
}, nil
|
|
||||||
}
|
|
||||||
case Terminated:
|
|
||||||
// Absorbing: every event is a no-op. A Start after teardown is rejected
|
|
||||||
// (no StartResponse), so no response can outlive the session.
|
|
||||||
switch e.(type) {
|
|
||||||
case Start, Finished, Cancel, Shutdown:
|
|
||||||
return Terminated{}, nil, nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return s, nil, fmt.Errorf("respcoord: unhandled transition %s <- %s", s, e)
|
|
||||||
}
|
|
||||||
|
|
||||||
// EffectSink performs the effects produced by a transition. It is an alias for
// coordinator.Sink instantiated with this package's Effect type. See
// coordinator.Sink for the non-blocking contract: Perform runs under the
// coordinator lock, so it must not block and must not re-enter Apply (the
// spawned response goroutine's Finished apply happens only after the sink
// returns).
type EffectSink = coordinator.Sink[Effect]
|
|
||||||
|
|
||||||
// Coordinator serializes every Start/Finished/Cancel/Shutdown transition behind
// one lock, so the two driving goroutines (read-loop and VAD) can call Apply
// concurrently without the legacy dual-writer race. Effects are performed in
// order under the lock — preserving the (cancel old, emit old terminal, start
// new) supersede ordering. It is an alias for coordinator.Coordinator
// instantiated with this package's State, Event and Effect types; see
// coordinator.Coordinator.
type Coordinator = coordinator.Coordinator[State, Event, Effect]
|
|
||||||
|
|
||||||
// New returns an idle Coordinator that performs effects via sink. The
// coordinator starts in Idle{} and uses Next as its transition function.
func New(sink EffectSink) *Coordinator {
	return coordinator.New[State, Event, Effect](Idle{}, Next, sink)
}
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
package respcoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"testing"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
// TestRespcoord is the go test entry point for this package's Ginkgo suite: it
// registers Fail as the Gomega fail handler and then runs the specs under the
// suite name below.
func TestRespcoord(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "respcoord (realtime M3) Suite")
}
|
|
||||||
@@ -1,370 +0,0 @@
|
|||||||
package respcoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"math/rand/v2"
|
|
||||||
"sync"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
// recordingSink captures the ordered stream of effects so the invariants can be
|
|
||||||
// checked independently of the transition function's internals. Perform is
|
|
||||||
// called by Coordinator.Apply under the coordinator lock, so it is already
|
|
||||||
// serialized; the mutex here only guards reads from the spec goroutine.
|
|
||||||
type recordingSink struct {
|
|
||||||
mu sync.Mutex
|
|
||||||
log []Effect
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *recordingSink) Perform(e Effect) {
|
|
||||||
s.mu.Lock()
|
|
||||||
s.log = append(s.log, e)
|
|
||||||
s.mu.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *recordingSink) snapshot() []Effect {
|
|
||||||
s.mu.Lock()
|
|
||||||
defer s.mu.Unlock()
|
|
||||||
out := make([]Effect, len(s.log))
|
|
||||||
copy(out, s.log)
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
// checkInvariants replays the effect log and asserts the three core safety
|
|
||||||
// properties from docs/design/realtime-state-machines.md, Part 4:
|
|
||||||
//
|
|
||||||
// (1) at most one live response at any instant
|
|
||||||
// -- after every effect, the number of started-but-not-terminated ids <= 1;
|
|
||||||
// (2) exactly one terminal per started response
|
|
||||||
// -- each id is started at most once and terminated at most once;
|
|
||||||
// (3) no resurrection
|
|
||||||
// -- an id is never started after it has been terminated.
|
|
||||||
func checkInvariants(log []Effect) {
|
|
||||||
started := map[ResponseID]int{}
|
|
||||||
terminated := map[ResponseID]int{}
|
|
||||||
live := map[ResponseID]bool{}
|
|
||||||
|
|
||||||
for i, eff := range log {
|
|
||||||
switch e := eff.(type) {
|
|
||||||
case StartResponse:
|
|
||||||
Expect(terminated[e.ID]).To(Equal(0), "invariant (3): StartResponse(%d) after it was terminated (effect #%d)\nlog=%v", e.ID, i, log)
|
|
||||||
started[e.ID]++
|
|
||||||
Expect(started[e.ID]).To(Equal(1), "invariant (2): id %d started %d times (effect #%d)\nlog=%v", e.ID, started[e.ID], i, log)
|
|
||||||
live[e.ID] = true
|
|
||||||
case EmitTerminal:
|
|
||||||
terminated[e.ID]++
|
|
||||||
Expect(terminated[e.ID]).To(Equal(1), "invariant (2): id %d terminated %d times (effect #%d)\nlog=%v", e.ID, terminated[e.ID], i, log)
|
|
||||||
delete(live, e.ID)
|
|
||||||
case CancelResponse:
|
|
||||||
// no count assertion; cancellation is paired with a terminal
|
|
||||||
}
|
|
||||||
Expect(len(live)).To(BeNumerically("<=", 1), "invariant (1): %d live responses after effect #%d (%s)\nlog=%v", len(live), i, eff, log)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// unknownEvent is an Event implementation Next does not know about, to exercise
// the defensive error path. It can satisfy the sealed Event interface only
// because this test file lives inside the package.
type unknownEvent struct{}

func (unknownEvent) isEvent() {}

// String returns "unknownEvent".
func (unknownEvent) String() string { return "unknownEvent" }
|
|
||||||
|
|
||||||
// Specs for the pure transition function Next: a table pinning the listed
// (state, event) cells, a totality sweep, and the unknown-event error path.
var _ = Describe("respcoord.Next", func() {
	// DescribeTable exhaustively pins every (state, event) cell of the pure
	// transition function, including the stale / idle no-op cells. This is the
	// practical stand-in for "no transition leads to an inconsistent state": if a
	// cell changes, this table must change with it.
	DescribeTable("transitions",
		// Each entry supplies: starting state, event, expected next state, and
		// the expected effects. No-op cells expect a nil slice, not an empty one.
		func(state State, event Event, wantState State, wantEff []Effect) {
			gotState, gotEff, err := Next(state, event)
			Expect(err).NotTo(HaveOccurred())
			Expect(gotState).To(Equal(wantState))
			Expect(gotEff).To(Equal(wantEff))
		},
		Entry("idle+start -> active, spawns response",
			Idle{}, Start{ID: 1, Source: SourceClient},
			Active{ID: 1}, []Effect{StartResponse{ID: 1}}),
		Entry("idle+cancel -> idle, no-op",
			Idle{}, Cancel{Source: SourceVAD},
			Idle{}, []Effect(nil)),
		Entry("idle+finished(stale) -> idle, no-op",
			Idle{}, Finished{ID: 7},
			Idle{}, []Effect(nil)),
		Entry("active+start -> supersede: cancel+terminal(old)+start(new)",
			Active{ID: 1}, Start{ID: 2, Source: SourceVAD},
			Active{ID: 2},
			[]Effect{
				CancelResponse{ID: 1},
				EmitTerminal{ID: 1, Status: StatusCancelled},
				StartResponse{ID: 2},
			}),
		Entry("active+finished(current) -> idle, completed terminal",
			Active{ID: 3}, Finished{ID: 3},
			Idle{}, []Effect{EmitTerminal{ID: 3, Status: StatusCompleted}}),
		Entry("active+finished(stale) -> stay active, no-op",
			Active{ID: 3}, Finished{ID: 2},
			Active{ID: 3}, []Effect(nil)),
		Entry("active+cancel -> idle, cancel+cancelled terminal",
			Active{ID: 5}, Cancel{Source: SourceClient},
			Idle{},
			[]Effect{
				CancelResponse{ID: 5},
				EmitTerminal{ID: 5, Status: StatusCancelled},
			}),
		Entry("idle+shutdown -> terminated, no-op",
			Idle{}, Shutdown{},
			Terminated{}, []Effect(nil)),
		Entry("active+shutdown -> terminated: cancel+cancelled terminal",
			Active{ID: 6}, Shutdown{},
			Terminated{},
			[]Effect{
				CancelResponse{ID: 6},
				EmitTerminal{ID: 6, Status: StatusCancelled},
			}),
		Entry("terminated+start -> terminated, REJECTED (no resurrection)",
			Terminated{}, Start{ID: 9, Source: SourceClient},
			Terminated{}, []Effect(nil)),
		Entry("terminated+finished -> terminated, no-op (stale)",
			Terminated{}, Finished{ID: 9},
			Terminated{}, []Effect(nil)),
		Entry("terminated+cancel -> terminated, no-op",
			Terminated{}, Cancel{Source: SourceVAD},
			Terminated{}, []Effect(nil)),
		Entry("terminated+shutdown -> terminated, idempotent",
			Terminated{}, Shutdown{},
			Terminated{}, []Effect(nil)),
	)

	// Cross product of every state with every event kind (Finished appears
	// twice: once matching Active{ID: 1}, once stale); only the absence of an
	// error is asserted here.
	It("is total: every defined (state, event) pair is handled without error", func() {
		states := []State{Idle{}, Active{ID: 1}, Terminated{}}
		events := []Event{
			Start{ID: 2, Source: SourceClient},
			Finished{ID: 1},
			Finished{ID: 99},
			Cancel{Source: SourceVAD},
			Shutdown{},
		}
		for _, s := range states {
			for _, e := range events {
				_, _, err := Next(s, e)
				Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
			}
		}
	})

	// An Event implementation outside the known set must produce an error.
	It("errors on an unknown event type", func() {
		_, _, err := Next(Active{ID: 1}, unknownEvent{})
		Expect(err).To(HaveOccurred())
	})
})
|
|
||||||
|
|
||||||
var _ = Describe("respcoord.Coordinator", func() {
|
|
||||||
// This replaces the previous rapid stateful test: a seeded random walk over
|
|
||||||
// the event space, asserting the invariants hold after every step. Seeds are
|
|
||||||
// fixed so any failure reproduces deterministically.
|
|
||||||
It("upholds the safety invariants over random event sequences", func() {
|
|
||||||
seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
|
|
||||||
for _, seed := range seeds {
|
|
||||||
r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New(sink)
|
|
||||||
var nextID uint64
|
|
||||||
|
|
||||||
for range 3000 {
|
|
||||||
switch r.IntN(4) {
|
|
||||||
case 0: // start from client
|
|
||||||
nextID++
|
|
||||||
Expect(c.Apply(Start{ID: ResponseID(nextID), Source: SourceClient})).To(Succeed())
|
|
||||||
case 1: // start from VAD
|
|
||||||
nextID++
|
|
||||||
Expect(c.Apply(Start{ID: ResponseID(nextID), Source: SourceVAD})).To(Succeed())
|
|
||||||
case 2: // possibly-stale finish from any plausible id (incl. future)
|
|
||||||
id := r.Uint64N(nextID + 3)
|
|
||||||
Expect(c.Apply(Finished{ID: ResponseID(id)})).To(Succeed())
|
|
||||||
case 3: // explicit cancel
|
|
||||||
Expect(c.Apply(Cancel{Source: SourceClient})).To(Succeed())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// One full-log replay per seed: it iterates the whole sequence, so
|
|
||||||
// it catches a violation at any step without the O(n^2) cost of
|
|
||||||
// re-replaying after every Apply.
|
|
||||||
checkInvariants(sink.snapshot())
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
// Hammer Apply from two goroutines -- the read-loop and the VAD goroutine,
|
|
||||||
// the exact dual-writer scenario that races in the legacy code -- and assert
|
|
||||||
// the invariants still hold. Run under -race to also catch any data race in
|
|
||||||
// the coordinator itself.
|
|
||||||
It("upholds the invariants under concurrent dual-writer Apply", func() {
|
|
||||||
const perGoroutine = 2000
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New(sink)
|
|
||||||
|
|
||||||
var idCounter uint64
|
|
||||||
var idMu sync.Mutex
|
|
||||||
nextID := func() ResponseID {
|
|
||||||
idMu.Lock()
|
|
||||||
defer idMu.Unlock()
|
|
||||||
idCounter++
|
|
||||||
return ResponseID(idCounter)
|
|
||||||
}
|
|
||||||
|
|
||||||
var wg sync.WaitGroup
|
|
||||||
drive := func(src Source) {
|
|
||||||
defer wg.Done()
|
|
||||||
for i := range perGoroutine {
|
|
||||||
switch i % 3 {
|
|
||||||
case 0:
|
|
||||||
_ = c.Apply(Start{ID: nextID(), Source: src})
|
|
||||||
case 1:
|
|
||||||
if a, ok := c.State().(Active); ok {
|
|
||||||
_ = c.Apply(Finished{ID: a.ID})
|
|
||||||
}
|
|
||||||
case 2:
|
|
||||||
_ = c.Apply(Cancel{Source: src})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
wg.Add(2)
|
|
||||||
go drive(SourceClient)
|
|
||||||
go drive(SourceVAD)
|
|
||||||
wg.Wait()
|
|
||||||
|
|
||||||
checkInvariants(sink.snapshot())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("rejects the dual-writer interleaving the legacy mechanism allowed", func() {
|
|
||||||
// Equivalent sequence to the legacy double-start race: start id1, then two
|
|
||||||
// superseding starts (id2, id3) such as the read-loop and VAD would each
|
|
||||||
// issue. Each Start is serialized by the coordinator, so each supersede
|
|
||||||
// cancels+terminates the previous -- never two live at once.
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New(sink)
|
|
||||||
|
|
||||||
Expect(c.Apply(Start{ID: 1, Source: SourceClient})).To(Succeed())
|
|
||||||
Expect(c.Apply(Start{ID: 2, Source: SourceVAD})).To(Succeed())
|
|
||||||
Expect(c.Apply(Start{ID: 3, Source: SourceClient})).To(Succeed())
|
|
||||||
|
|
||||||
checkInvariants(sink.snapshot())
|
|
||||||
|
|
||||||
got, ok := c.State().(Active)
|
|
||||||
Expect(ok).To(BeTrue(), "state = %s, want Active(3)", c.State())
|
|
||||||
Expect(got.ID).To(Equal(ResponseID(3)))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("terminates on shutdown and rejects any later response (no resurrection)", func() {
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New(sink)
|
|
||||||
|
|
||||||
Expect(c.Apply(Start{ID: 1, Source: SourceClient})).To(Succeed())
|
|
||||||
Expect(c.Apply(Shutdown{})).To(Succeed()) // cancels id 1 + goes terminal
|
|
||||||
Expect(c.State()).To(Equal(State(Terminated{})))
|
|
||||||
|
|
||||||
// A late response.create after teardown is structurally rejected.
|
|
||||||
Expect(c.Apply(Start{ID: 2, Source: SourceClient})).To(Succeed())
|
|
||||||
Expect(c.State()).To(Equal(State(Terminated{})))
|
|
||||||
// And a stale Finished from the cancelled response is absorbed.
|
|
||||||
Expect(c.Apply(Finished{ID: 1})).To(Succeed())
|
|
||||||
|
|
||||||
checkInvariants(sink.snapshot())
|
|
||||||
starts := 0
|
|
||||||
for _, e := range sink.snapshot() {
|
|
||||||
if _, ok := e.(StartResponse); ok {
|
|
||||||
starts++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Expect(starts).To(Equal(1), "only id 1 ever started; the post-shutdown Start was rejected")
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
// legacyCoord models the LEGACY startResponse/cancelActiveResponse mechanism, in
|
|
||||||
// which the snapshot ("lock" read), the cancel-and-wait, and the spawn are NOT
|
|
||||||
// atomic with respect to each other across the two driving goroutines. It exists
|
|
||||||
// only to demonstrate the dual-writer race (Part 2, failure mode 2) that
|
|
||||||
// respcoord.Coordinator eliminates. It is not used in production.
|
|
||||||
//
|
|
||||||
// Mapping to the legacy code:
|
|
||||||
// - startStep1 = snapshot Session.activeResponse* under responseMu
|
|
||||||
// - startStep2 = cancelActiveResponse: cancel() then <-done (outside the lock);
|
|
||||||
// a second waiter on an already-closed done returns immediately and does NOT
|
|
||||||
// decrement again (modeled by the snap==registered guard)
|
|
||||||
// - startStep3 = store the new cancel/done pair and spawn the goroutine
|
|
||||||
type legacyCoord struct {
|
|
||||||
live int // # of live response goroutines (the bug: can exceed 1)
|
|
||||||
registered uint64 // id of the currently-registered response (0 = none)
|
|
||||||
nextID uint64
|
|
||||||
}
|
|
||||||
|
|
||||||
func (l *legacyCoord) startStep1() uint64 { return l.registered } // snapshot
|
|
||||||
|
|
||||||
func (l *legacyCoord) startStep2(snap uint64) { // cancel-and-wait
|
|
||||||
if snap != 0 && snap == l.registered {
|
|
||||||
l.live--
|
|
||||||
l.registered = 0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (l *legacyCoord) startStep3() { // spawn + register
|
|
||||||
l.nextID++
|
|
||||||
l.live++
|
|
||||||
l.registered = l.nextID
|
|
||||||
}
|
|
||||||
|
|
||||||
// Pins the String() output of every Source, Status, State, Event and Effect
// value, including the "Type(n)" fallback for out-of-range Source/Status. The
// String() calls are evaluated when the entries are built, so each entry
// simply compares two strings.
var _ = DescribeTable("respcoord stringers",
	func(got, want string) { Expect(got).To(Equal(want)) },
	Entry(nil, SourceClient.String(), "client"),
	Entry(nil, SourceVAD.String(), "vad"),
	Entry(nil, Source(99).String(), "Source(99)"),

	Entry(nil, StatusCompleted.String(), "completed"),
	Entry(nil, StatusCancelled.String(), "cancelled"),
	Entry(nil, Status(99).String(), "Status(99)"),

	Entry(nil, Idle{}.String(), "Idle"),
	Entry(nil, Active{ID: 7}.String(), "Active(7)"),
	Entry(nil, Terminated{}.String(), "Terminated"),

	Entry(nil, Start{ID: 1, Source: SourceVAD}.String(), "Start(1,vad)"),
	Entry(nil, Finished{ID: 2}.String(), "Finished(2)"),
	Entry(nil, Cancel{Source: SourceClient}.String(), "Cancel(client)"),
	Entry(nil, Shutdown{}.String(), "Shutdown"),

	Entry(nil, CancelResponse{ID: 3}.String(), "CancelResponse(3)"),
	Entry(nil, StartResponse{ID: 4}.String(), "StartResponse(4)"),
	Entry(nil, EmitTerminal{ID: 5, Status: StatusCompleted}.String(), "EmitTerminal(5,completed)"),
)
|
|
||||||
|
|
||||||
var _ = Describe("legacy dual-writer characterization", func() {
|
|
||||||
// Pins the exact interleaving in which the read-loop and the VAD goroutine
|
|
||||||
// both start a response and the machine ends up with TWO live responses. This
|
|
||||||
// is a characterization test for the bug: if a future change to the legacy
|
|
||||||
// model accidentally fixes it, this spec flips and we delete the legacy model.
|
|
||||||
// The production path uses respcoord.Coordinator, proven safe above.
|
|
||||||
It("can reach two live responses (the bug respcoord eliminates)", func() {
|
|
||||||
l := &legacyCoord{}
|
|
||||||
|
|
||||||
// First response established normally.
|
|
||||||
s := l.startStep1()
|
|
||||||
l.startStep2(s)
|
|
||||||
l.startStep3() // live=1, registered=1
|
|
||||||
Expect(l.live).To(Equal(1), "setup")
|
|
||||||
|
|
||||||
// The race: both goroutines snapshot the SAME active response (id 1)...
|
|
||||||
snapVAD := l.startStep1() // 1
|
|
||||||
snapClient := l.startStep1() // 1
|
|
||||||
|
|
||||||
// ...both "cancel-and-wait" it. The first decrements; the second finds it
|
|
||||||
// already gone and does nothing.
|
|
||||||
l.startStep2(snapVAD) // live=0, registered=0
|
|
||||||
l.startStep2(snapClient) // no-op (already 0)
|
|
||||||
|
|
||||||
// ...then both spawn their replacement.
|
|
||||||
l.startStep3() // live=1
|
|
||||||
l.startStep3() // live=2 <-- two live responses
|
|
||||||
|
|
||||||
Expect(l.live).To(Equal(2), "expected the legacy race to reach 2 live responses")
|
|
||||||
})
|
|
||||||
})
|
|
||||||
@@ -1,150 +0,0 @@
|
|||||||
// Package ttscoord is the explicit state machine for the realtime API's
|
|
||||||
// TTS-pipeline lifecycle (machine "M5" in docs/design/realtime-state-machines.md).
|
|
||||||
//
|
|
||||||
// The realtime TTS pipeline (realtime_tts_pipeline.go) decouples synthesis from
|
|
||||||
// LLM token generation: the token callback enqueues clauses, a single worker
|
|
||||||
// goroutine synthesizes them in order, and wait() closes the queue and joins the
|
|
||||||
// worker. In the legacy code the lifecycle is an implicit `closed bool` (guarded
|
|
||||||
// by the pipeline mutex) plus a `done` channel closed once by the worker. Two
|
|
||||||
// gaps: enqueue does NOT check `closed`, so a clause offered after wait() is
|
|
||||||
// silently appended to a worker that may have already exited (dropped); and the
|
|
||||||
// open/closed lifecycle is inferred from a bool rather than stored.
|
|
||||||
//
|
|
||||||
// This package makes the lifecycle explicit:
|
|
||||||
// - a sealed sum type for State (Open | Closing | Closed) — monotonic; illegal
|
|
||||||
// reversals are unrepresentable,
|
|
||||||
// - a total, pure transition function Next(state, event) -> (state, effects),
|
|
||||||
// - a single-writer Coordinator that serializes every transition.
|
|
||||||
//
|
|
||||||
// It is a genuine two-writer machine: the producer goroutine raises Close (from
|
|
||||||
// wait()), and the worker goroutine raises WorkerExited when it has drained the
|
|
||||||
// queue and seen the close — so serializing the transition matters. The poison
|
|
||||||
// `failed` latch stays a lock-free atomic.Bool in the pipeline (it is read per
|
|
||||||
// clause on the worker's hot path and is orthogonal to open/closed); this machine
|
|
||||||
// owns only the open->closing->closed lifecycle.
|
|
||||||
//
|
|
||||||
// Guarantees the spec checks:
|
|
||||||
// - Close wakes the worker to exit exactly once (idempotent wait(); invariant
|
|
||||||
// #10),
|
|
||||||
// - the lifecycle is monotonic and Closed is terminal — so a clause is never
|
|
||||||
// accepted after close (enqueue is gated on Open) and the worker is joined
|
|
||||||
// exactly once (no leak; invariant #8).
|
|
||||||
package ttscoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
|
|
||||||
)
|
|
||||||
|
|
||||||
// State is the sealed sum type of TTS-pipeline lifecycle states; its marker
// method is unexported, so only this package can implement it. Exhaustively:
//
//	Open | Closing | Closed.
type State interface {
	isState()
	String() string
}

// Open: the worker is running and accepting clauses.
type Open struct{}

func (Open) isState() {}

// String returns "Open".
func (Open) String() string { return "Open" }

// Closing: wait() has been called; the worker is draining the remaining queue and
// will exit. No new clause is accepted.
type Closing struct{}

func (Closing) isState() {}

// String returns "Closing".
func (Closing) String() string { return "Closing" }

// Closed: the worker has exited (its done channel is closed). Terminal.
type Closed struct{}

func (Closed) isState() {}

// String returns "Closed".
func (Closed) String() string { return "Closed" }
|
|
||||||
|
|
||||||
// Event is the sealed sum type of inputs to the lifecycle machine.
// Exhaustively:
//
//	Close | WorkerExited.
type Event interface {
	isEvent()
	String() string
}

// Close is raised by the producer goroutine (wait()): close the queue and ask
// the worker to finish. Idempotent.
type Close struct{}

func (Close) isEvent() {}

// String returns "Close".
func (Close) String() string { return "Close" }

// WorkerExited is raised by the worker goroutine when it has drained the queue
// and observed the close, just before it closes its done channel.
type WorkerExited struct{}

func (WorkerExited) isEvent() {}

// String returns "WorkerExited".
func (WorkerExited) String() string { return "WorkerExited" }
|
|
||||||
|
|
||||||
// Effect is a side effect that Next returns as data for the caller to perform.
// Exhaustively:
//
//	Wake.
type Effect interface {
	isEffect()
	String() string
}

// Wake: signal the worker (via the buffered wake channel) so it re-checks the
// lifecycle and exits. Emitted once, on the Open->Closing transition.
type Wake struct{}

func (Wake) isEffect() {}

// String returns "Wake".
func (Wake) String() string { return "Wake" }
|
|
||||||
|
|
||||||
// Next is the total, pure transition function. For every (state, event) it
|
|
||||||
// returns the next state and the ordered effects. It returns a non-nil error
|
|
||||||
// only for an unknown State/Event implementation. Every in-domain pair is
|
|
||||||
// defined; there are no forbidden transitions, only no-ops.
|
|
||||||
//
|
|
||||||
// The lifecycle is monotonic Open -> Closing -> Closed. Close wakes the worker
|
|
||||||
// only on the first Open->Closing transition (idempotent wait()); a later Close
|
|
||||||
// is absorbed. WorkerExited only advances Closing -> Closed.
|
|
||||||
func Next(s State, e Event) (State, []Effect, error) {
|
|
||||||
switch s.(type) {
|
|
||||||
case Open:
|
|
||||||
switch e.(type) {
|
|
||||||
case Close:
|
|
||||||
return Closing{}, []Effect{Wake{}}, nil
|
|
||||||
case WorkerExited:
|
|
||||||
// Worker exited while still Open (e.g. never any clause and an early
|
|
||||||
// close race) -- treat as fully closed; defensive, keeps Next total.
|
|
||||||
return Closed{}, nil, nil
|
|
||||||
}
|
|
||||||
case Closing:
|
|
||||||
switch e.(type) {
|
|
||||||
case Close:
|
|
||||||
// Idempotent wait(): already closing, no second wake.
|
|
||||||
return Closing{}, nil, nil
|
|
||||||
case WorkerExited:
|
|
||||||
return Closed{}, nil, nil
|
|
||||||
}
|
|
||||||
case Closed:
|
|
||||||
switch e.(type) {
|
|
||||||
case Close:
|
|
||||||
return Closed{}, nil, nil
|
|
||||||
case WorkerExited:
|
|
||||||
return Closed{}, nil, nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return s, nil, fmt.Errorf("ttscoord: unhandled transition %s <- %s", s, e)
|
|
||||||
}
|
|
||||||
|
|
||||||
// EffectSink performs the effects produced by a transition. See coordinator.Sink:
|
|
||||||
// Wake does a non-blocking send on a buffered channel, so Perform does not block
|
|
||||||
// under the lock.
|
|
||||||
type EffectSink = coordinator.Sink[Effect]
|
|
||||||
|
|
||||||
// Coordinator serializes the TTS-pipeline transitions. The producer (Close) and
|
|
||||||
// worker (WorkerExited) goroutines both call Apply, so the lock serializes the
|
|
||||||
// two writers. See coordinator.Coordinator.
|
|
||||||
type Coordinator = coordinator.Coordinator[State, Event, Effect]
|
|
||||||
|
|
||||||
// New returns an Open Coordinator that performs effects via sink.
|
|
||||||
func New(sink EffectSink) *Coordinator {
|
|
||||||
return coordinator.New[State, Event, Effect](Open{}, Next, sink)
|
|
||||||
}
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
package ttscoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"testing"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestTtscoord(t *testing.T) {
|
|
||||||
RegisterFailHandler(Fail)
|
|
||||||
RunSpecs(t, "ttscoord (realtime M5) Suite")
|
|
||||||
}
|
|
||||||
@@ -1,165 +0,0 @@
|
|||||||
package ttscoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"math/rand/v2"
|
|
||||||
"sync"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
// recordingSink captures the ordered stream of effects.
|
|
||||||
type recordingSink struct {
|
|
||||||
mu sync.Mutex
|
|
||||||
log []Effect
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *recordingSink) Perform(e Effect) {
|
|
||||||
s.mu.Lock()
|
|
||||||
s.log = append(s.log, e)
|
|
||||||
s.mu.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *recordingSink) wakes() int {
|
|
||||||
s.mu.Lock()
|
|
||||||
defer s.mu.Unlock()
|
|
||||||
n := 0
|
|
||||||
for _, e := range s.log {
|
|
||||||
if _, ok := e.(Wake); ok {
|
|
||||||
n++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return n
|
|
||||||
}
|
|
||||||
|
|
||||||
// unknownEvent is an Event implementation that Next has no case for; it
// exercises Next's error path.
type unknownEvent struct{}

func (unknownEvent) isEvent()       {}
func (unknownEvent) String() string { return "unknownEvent" }

// unknownState is a State implementation that Next has no case for; it
// exercises Next's error path.
type unknownState struct{}

func (unknownState) isState()       {}
func (unknownState) String() string { return "unknownState" }
|
|
||||||
|
|
||||||
var _ = Describe("ttscoord.Next", func() {
|
|
||||||
DescribeTable("transitions",
|
|
||||||
func(state State, event Event, wantState State, wantEff []Effect) {
|
|
||||||
gotState, gotEff, err := Next(state, event)
|
|
||||||
Expect(err).NotTo(HaveOccurred())
|
|
||||||
Expect(gotState).To(Equal(wantState))
|
|
||||||
Expect(gotEff).To(Equal(wantEff))
|
|
||||||
},
|
|
||||||
Entry("open+close -> closing: wake",
|
|
||||||
Open{}, Close{}, Closing{}, []Effect{Wake{}}),
|
|
||||||
Entry("open+workerexited -> closed (defensive)",
|
|
||||||
Open{}, WorkerExited{}, Closed{}, []Effect(nil)),
|
|
||||||
Entry("closing+close -> closing, no-op (idempotent wait)",
|
|
||||||
Closing{}, Close{}, Closing{}, []Effect(nil)),
|
|
||||||
Entry("closing+workerexited -> closed",
|
|
||||||
Closing{}, WorkerExited{}, Closed{}, []Effect(nil)),
|
|
||||||
Entry("closed+close -> closed, no-op",
|
|
||||||
Closed{}, Close{}, Closed{}, []Effect(nil)),
|
|
||||||
Entry("closed+workerexited -> closed, no-op",
|
|
||||||
Closed{}, WorkerExited{}, Closed{}, []Effect(nil)),
|
|
||||||
)
|
|
||||||
|
|
||||||
It("is total over the defined (state, event) pairs", func() {
|
|
||||||
for _, s := range []State{Open{}, Closing{}, Closed{}} {
|
|
||||||
for _, e := range []Event{Close{}, WorkerExited{}} {
|
|
||||||
_, _, err := Next(s, e)
|
|
||||||
Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
It("errors on an unknown event type", func() {
|
|
||||||
_, _, err := Next(Open{}, unknownEvent{})
|
|
||||||
Expect(err).To(HaveOccurred())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("errors on an unknown state type", func() {
|
|
||||||
_, _, err := Next(unknownState{}, Close{})
|
|
||||||
Expect(err).To(HaveOccurred())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
// phaseOf maps a state to a monotonic rank for the "never goes backwards" check.
|
|
||||||
func phaseOf(s State) int {
|
|
||||||
switch s.(type) {
|
|
||||||
case Open:
|
|
||||||
return 0
|
|
||||||
case Closing:
|
|
||||||
return 1
|
|
||||||
case Closed:
|
|
||||||
return 2
|
|
||||||
default:
|
|
||||||
return -1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var _ = Describe("ttscoord.Coordinator", func() {
|
|
||||||
It("keeps the lifecycle monotonic and wakes at most once over random sequences", func() {
|
|
||||||
seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
|
|
||||||
for _, seed := range seeds {
|
|
||||||
r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New(sink)
|
|
||||||
prev := 0
|
|
||||||
|
|
||||||
for range 5000 {
|
|
||||||
if r.IntN(2) == 0 {
|
|
||||||
Expect(c.Apply(Close{})).To(Succeed())
|
|
||||||
} else {
|
|
||||||
Expect(c.Apply(WorkerExited{})).To(Succeed())
|
|
||||||
}
|
|
||||||
cur := phaseOf(c.State())
|
|
||||||
Expect(cur).To(BeNumerically(">=", prev), "seed=%d: lifecycle went backwards", seed)
|
|
||||||
prev = cur
|
|
||||||
}
|
|
||||||
Expect(sink.wakes()).To(BeNumerically("<=", 1), "seed=%d: woke more than once", seed)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
// Two-writer test: a producer raises Close while the "worker" raises
|
|
||||||
// WorkerExited, the real concurrency. The lifecycle must stay monotonic and
|
|
||||||
// Wake must fire at most once. Run under -race.
|
|
||||||
It("is two-writer safe (producer Close vs worker WorkerExited)", func() {
|
|
||||||
const iterations = 200
|
|
||||||
for range iterations {
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New(sink)
|
|
||||||
var wg sync.WaitGroup
|
|
||||||
wg.Add(2)
|
|
||||||
go func() { defer wg.Done(); _ = c.Apply(Close{}) }()
|
|
||||||
go func() { defer wg.Done(); _ = c.Apply(WorkerExited{}) }()
|
|
||||||
wg.Wait()
|
|
||||||
// After both, drive to terminal and assert idempotence.
|
|
||||||
_ = c.Apply(Close{})
|
|
||||||
_ = c.Apply(WorkerExited{})
|
|
||||||
Expect(c.State()).To(Equal(State(Closed{})))
|
|
||||||
Expect(sink.wakes()).To(BeNumerically("<=", 1))
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
It("only Open accepts (a gate query never panics across states)", func() {
|
|
||||||
// Mirrors the pipeline's enqueue gate: accepted iff Open.
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New(sink)
|
|
||||||
_, open := c.State().(Open)
|
|
||||||
Expect(open).To(BeTrue())
|
|
||||||
Expect(c.Apply(Close{})).To(Succeed())
|
|
||||||
_, open = c.State().(Open)
|
|
||||||
Expect(open).To(BeFalse())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
var _ = DescribeTable("ttscoord stringers",
|
|
||||||
func(got, want string) { Expect(got).To(Equal(want)) },
|
|
||||||
Entry(nil, Open{}.String(), "Open"),
|
|
||||||
Entry(nil, Closing{}.String(), "Closing"),
|
|
||||||
Entry(nil, Closed{}.String(), "Closed"),
|
|
||||||
Entry(nil, Close{}.String(), "Close"),
|
|
||||||
Entry(nil, WorkerExited{}.String(), "WorkerExited"),
|
|
||||||
Entry(nil, Wake{}.String(), "Wake"),
|
|
||||||
)
|
|
||||||
@@ -1,255 +0,0 @@
|
|||||||
// Package turncoord is the explicit state machine for the realtime API's
|
|
||||||
// turn-detection concern (machine "M2" in
|
|
||||||
// docs/design/realtime-state-machines.md).
|
|
||||||
//
|
|
||||||
// In the legacy code this machine is implicit and, worse, split across TWO
|
|
||||||
// variables that can disagree: handleVAD's goroutine-local speechStarted bool
|
|
||||||
// and the semantic_vad liveTurnState's "is the live stream open" flag
|
|
||||||
// (lts.open()). They are set and cleared at separate points, so a discardTurn
|
|
||||||
// (no-speech clear, a semantic->server mode switch mid-turn, or teardown)
|
|
||||||
// closes the live stream but leaves speechStarted true. The two then disagree,
|
|
||||||
// and the next speech onset is suppressed because `if !speechStarted` is false
|
|
||||||
// — the user's next utterance silently produces no speech_started, no barge-in,
|
|
||||||
// and no commit. See docs/design/realtime-state-machines.md, Part 2 (failure
|
|
||||||
// mode 4) and the turn_lifecycle spec under formal-verification/.
|
|
||||||
//
|
|
||||||
// This package replaces that with:
|
|
||||||
// - a sealed sum type for State (illegal states are unrepresentable),
|
|
||||||
// - a total, pure transition function Next(state, event) -> (state, effects),
|
|
||||||
// - a single-writer Coordinator that serializes every transition.
|
|
||||||
//
|
|
||||||
// "Speech detected" and "a turn is open" become ONE state (Speaking), so they
|
|
||||||
// can no longer fall out of sync: every path that ends a turn returns to Idle
|
|
||||||
// and necessarily clears both. The design guarantees the invariants the specs
|
|
||||||
// check:
|
|
||||||
// - speechStarted ⟺ a turn is open (Part 4, invariant #4) — structural here,
|
|
||||||
// - a barge-in cancel precedes the next turn's commit (you must pass through
|
|
||||||
// Speaking, which barges in on entry, before a Silence can commit),
|
|
||||||
// - every opened turn is finished (commit) or discarded (abort) exactly once.
|
|
||||||
//
|
|
||||||
// Unlike M3 (respcoord), which is a genuine dual-writer race, M2's turn
|
|
||||||
// lifecycle is driven by the single handleVAD goroutine: the value here is
|
|
||||||
// making the speechStarted/turn-open desync unrepresentable, not serializing
|
|
||||||
// concurrent writers. The Coordinator still serializes transitions so that
|
|
||||||
// State() is race-free and a teardown-time Abort from another goroutine (or a
|
|
||||||
// future second writer) stays safe.
|
|
||||||
//
|
|
||||||
// Mode note: in server_vad mode there is no live ASR stream, so OpenTurn /
|
|
||||||
// DiscardTurn have nothing to open or close — the sink performs them as no-ops
|
|
||||||
// and "turn open" is satisfied vacuously. The state coupling (Speaking ⟺ turn
|
|
||||||
// open) still holds; it is only semantic_vad that had two real variables to
|
|
||||||
// desync.
|
|
||||||
package turncoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/http/endpoints/openai/coordinator"
|
|
||||||
)
|
|
||||||
|
|
||||||
// TurnID identifies one user turn. The caller mints it when speech begins (it
// is the conversation item id the live caption deltas stream under, reused by
// the committed event so the client replaces the partial text). Carrying it in
// the state makes "commit/discard refer to the turn that was opened" explicit.
type TurnID string
|
|
||||||
|
|
||||||
// AbortReason records why a turn was dropped without committing. Like
// respcoord.Source it is observability only — every reason aborts the same way;
// keeping it in the event makes the distinct legacy discardTurn sites explicit
// rather than collapsed into one anonymous code path.
type AbortReason int

const (
	// AbortNoSpeech: the no-speech clear — the VAD found no segments and the
	// buffer is past the holdback, so the inspected audio was not speech.
	AbortNoSpeech AbortReason = iota
	// AbortTeardown: the session is closing.
	AbortTeardown
)

// NOTE: a semantic->server turn-detection switch mid-turn is deliberately NOT an
// Abort: it only drops the orphaned live ASR stream and lets the turn continue
// under server_vad (so a config change can't cut off a mid-utterance speaker).
// That orphan cleanup stays inline in handleVAD; only the two reasons above end
// a turn (return to Idle).

// String returns the reason's short name ("no_speech", "teardown"); a value
// outside the declared constants renders as "AbortReason(<n>)".
func (r AbortReason) String() string {
	switch r {
	case AbortNoSpeech:
		return "no_speech"
	case AbortTeardown:
		return "teardown"
	default:
		return fmt.Sprintf("AbortReason(%d)", int(r))
	}
}
|
|
||||||
|
|
||||||
// State is the sealed sum type of turn-detection states. The only
|
|
||||||
// implementations are the marker-method structs in this file, so callers
|
|
||||||
// outside the package cannot fabricate an out-of-band state. Exhaustively:
|
|
||||||
// Idle | Speaking.
|
|
||||||
type State interface {
|
|
||||||
isState()
|
|
||||||
String() string
|
|
||||||
}
|
|
||||||
|
|
||||||
// Idle: no turn is open and no speech is in progress (legacy: speechStarted ==
|
|
||||||
// false AND the live stream is closed — here a single state, so they cannot
|
|
||||||
// disagree).
|
|
||||||
type Idle struct{}
|
|
||||||
|
|
||||||
// Speaking: a turn is open and speech is in progress (legacy: speechStarted ==
|
|
||||||
// true AND, in semantic mode, the live stream open). Turn is the open turn's id.
|
|
||||||
type Speaking struct{ Turn TurnID }
|
|
||||||
|
|
||||||
func (Idle) isState() {}
|
|
||||||
func (Speaking) isState() {}
|
|
||||||
|
|
||||||
func (Idle) String() string { return "Idle" }
|
|
||||||
func (s Speaking) String() string { return fmt.Sprintf("Speaking(%s)", s.Turn) }
|
|
||||||
|
|
||||||
// Event is the sealed sum type of inputs. Exhaustively: Onset | Silence | Abort.
|
|
||||||
type Event interface {
|
|
||||||
isEvent()
|
|
||||||
String() string
|
|
||||||
}
|
|
||||||
|
|
||||||
// Onset reports that the VAD found speech this tick. Turn is the id to open the
|
|
||||||
// turn under (allocated by the caller so caption deltas can stream immediately).
|
|
||||||
// While already Speaking it is a no-op: re-detection of ongoing speech does not
|
|
||||||
// reopen a turn (legacy `if !speechStarted`).
|
|
||||||
type Onset struct{ Turn TurnID }
|
|
||||||
|
|
||||||
// Silence reports VAD-confirmed silence past the dynamic commit threshold (the
|
|
||||||
// end-of-speech commit trigger). The threshold itself — semantic_vad's EOU vs
|
|
||||||
// eagerness fallback — is computed by the caller before raising this event; the
|
|
||||||
// machine only sequences the commit. It is a no-op while Idle (nothing to
|
|
||||||
// commit).
|
|
||||||
type Silence struct{}
|
|
||||||
|
|
||||||
// Abort drops the open turn without committing (no-speech clear, mode switch,
|
|
||||||
// teardown). It is a no-op while Idle (nothing open).
|
|
||||||
type Abort struct{ Reason AbortReason }
|
|
||||||
|
|
||||||
func (Onset) isEvent() {}
|
|
||||||
func (Silence) isEvent() {}
|
|
||||||
func (Abort) isEvent() {}
|
|
||||||
|
|
||||||
func (e Onset) String() string { return fmt.Sprintf("Onset(%s)", e.Turn) }
|
|
||||||
func (Silence) String() string { return "Silence" }
|
|
||||||
func (e Abort) String() string { return fmt.Sprintf("Abort(%s)", e.Reason) }
|
|
||||||
|
|
||||||
// Effect is a side effect returned by Next as data for the caller to perform.
|
|
||||||
// Returning effects as data (rather than firing callbacks inside the
|
|
||||||
// transition) keeps Next pure and exhaustively testable. Exhaustively:
|
|
||||||
// BargeIn | OpenTurn | EmitSpeechStarted | EmitSpeechStopped | CommitTurn |
|
|
||||||
// DiscardTurn.
|
|
||||||
type Effect interface {
|
|
||||||
isEffect()
|
|
||||||
String() string
|
|
||||||
}
|
|
||||||
|
|
||||||
// BargeIn: cancel any in-flight response (the M2->M3 edge). Emitted on the
|
|
||||||
// Idle->Speaking onset, before the new turn can ever commit — so a barge-in
|
|
||||||
// always precedes the next commit.
|
|
||||||
type BargeIn struct{}
|
|
||||||
|
|
||||||
// OpenTurn: open the live ASR stream for Turn (semantic_vad). No-op in
|
|
||||||
// server_vad mode.
|
|
||||||
type OpenTurn struct{ Turn TurnID }
|
|
||||||
|
|
||||||
// EmitSpeechStarted: send input_audio_buffer.speech_started.
|
|
||||||
type EmitSpeechStarted struct{}
|
|
||||||
|
|
||||||
// EmitSpeechStopped: send input_audio_buffer.speech_stopped.
|
|
||||||
type EmitSpeechStopped struct{}
|
|
||||||
|
|
||||||
// CommitTurn: finalize the turn's live stream, emit input_audio_buffer.committed
|
|
||||||
// for Turn, and issue the response (via respcoord). The completion of one turn.
|
|
||||||
type CommitTurn struct{ Turn TurnID }
|
|
||||||
|
|
||||||
// DiscardTurn: close the turn's live stream and retract any caption deltas
|
|
||||||
// already shown for Turn (the failed transcription event). No commit, no
|
|
||||||
// response.
|
|
||||||
type DiscardTurn struct{ Turn TurnID }
|
|
||||||
|
|
||||||
func (BargeIn) isEffect() {}
|
|
||||||
func (OpenTurn) isEffect() {}
|
|
||||||
func (EmitSpeechStarted) isEffect() {}
|
|
||||||
func (EmitSpeechStopped) isEffect() {}
|
|
||||||
func (CommitTurn) isEffect() {}
|
|
||||||
func (DiscardTurn) isEffect() {}
|
|
||||||
|
|
||||||
func (BargeIn) String() string { return "BargeIn" }
|
|
||||||
func (e OpenTurn) String() string { return fmt.Sprintf("OpenTurn(%s)", e.Turn) }
|
|
||||||
func (EmitSpeechStarted) String() string { return "EmitSpeechStarted" }
|
|
||||||
func (EmitSpeechStopped) String() string { return "EmitSpeechStopped" }
|
|
||||||
func (e CommitTurn) String() string { return fmt.Sprintf("CommitTurn(%s)", e.Turn) }
|
|
||||||
func (e DiscardTurn) String() string { return fmt.Sprintf("DiscardTurn(%s)", e.Turn) }
|
|
||||||
|
|
||||||
// Next is the total, pure transition function. For every (state, event) it
|
|
||||||
// returns the next state and the ordered effects to perform. It returns a
|
|
||||||
// non-nil error only for an unknown State/Event implementation (a programmer
|
|
||||||
// error / future type added without updating this function) — callers must
|
|
||||||
// surface that, never silently ignore it. Every in-domain (state, event) pair
|
|
||||||
// is defined; there are no "forbidden" transitions, only no-ops for events that
|
|
||||||
// don't apply to the current state.
|
|
||||||
//
|
|
||||||
// The crux of the fix is that both turn-ending transitions (Silence commit and
|
|
||||||
// Abort) go to Idle, which carries no turn data: there is no way to clear "turn
|
|
||||||
// open" while leaving "speech started" set, because they are the same state.
|
|
||||||
// The legacy desync (discardTurn closed the live stream but left speechStarted
|
|
||||||
// true) is therefore unrepresentable.
|
|
||||||
//
|
|
||||||
// Effect ordering on onset mirrors the live handleVAD: OpenTurn (start the live
|
|
||||||
// stream), then BargeIn (cancel the prior response), then EmitSpeechStarted.
|
|
||||||
func Next(s State, e Event) (State, []Effect, error) {
|
|
||||||
switch st := s.(type) {
|
|
||||||
case Idle:
|
|
||||||
switch ev := e.(type) {
|
|
||||||
case Onset:
|
|
||||||
return Speaking{Turn: ev.Turn}, []Effect{
|
|
||||||
OpenTurn{Turn: ev.Turn},
|
|
||||||
BargeIn{},
|
|
||||||
EmitSpeechStarted{},
|
|
||||||
}, nil
|
|
||||||
case Silence:
|
|
||||||
// Nothing in flight to commit: idempotent no-op.
|
|
||||||
return Idle{}, nil, nil
|
|
||||||
case Abort:
|
|
||||||
// No open turn: idempotent no-op (discardTurn on a closed stream).
|
|
||||||
return Idle{}, nil, nil
|
|
||||||
}
|
|
||||||
case Speaking:
|
|
||||||
switch e.(type) {
|
|
||||||
case Onset:
|
|
||||||
// Speech already in progress: re-detection does not reopen a turn
|
|
||||||
// or re-emit speech_started (legacy `if !speechStarted`). The turn
|
|
||||||
// id stays the one allocated at onset.
|
|
||||||
return Speaking{Turn: st.Turn}, nil, nil
|
|
||||||
case Silence:
|
|
||||||
return Idle{}, []Effect{
|
|
||||||
EmitSpeechStopped{},
|
|
||||||
CommitTurn{Turn: st.Turn},
|
|
||||||
}, nil
|
|
||||||
case Abort:
|
|
||||||
return Idle{}, []Effect{DiscardTurn{Turn: st.Turn}}, nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return s, nil, fmt.Errorf("turncoord: unhandled transition %s <- %s", s, e)
|
|
||||||
}
|
|
||||||
|
|
||||||
// EffectSink performs the effects produced by a transition. See coordinator.Sink
|
|
||||||
// for the non-blocking contract: Perform runs under the coordinator lock, so it
|
|
||||||
// must not block and must not re-enter Apply.
|
|
||||||
type EffectSink = coordinator.Sink[Effect]
|
|
||||||
|
|
||||||
// Coordinator serializes turn transitions. In practice the handleVAD goroutine is
|
|
||||||
// the only writer, but serializing keeps State() race-free and a teardown-time
|
|
||||||
// Abort from another goroutine safe. See coordinator.Coordinator.
|
|
||||||
type Coordinator = coordinator.Coordinator[State, Event, Effect]
|
|
||||||
|
|
||||||
// New returns an idle Coordinator that performs effects via sink.
|
|
||||||
func New(sink EffectSink) *Coordinator {
|
|
||||||
return coordinator.New[State, Event, Effect](Idle{}, Next, sink)
|
|
||||||
}
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
package turncoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"testing"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestTurncoord(t *testing.T) {
|
|
||||||
RegisterFailHandler(Fail)
|
|
||||||
RunSpecs(t, "turncoord (realtime M2) Suite")
|
|
||||||
}
|
|
||||||
@@ -1,242 +0,0 @@
|
|||||||
package turncoord
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"math/rand/v2"
|
|
||||||
"sync"
|
|
||||||
|
|
||||||
. "github.com/onsi/ginkgo/v2"
|
|
||||||
. "github.com/onsi/gomega"
|
|
||||||
)
|
|
||||||
|
|
||||||
// recordingSink captures the ordered stream of effects so the invariants can be
|
|
||||||
// checked independently of the transition function's internals. Perform is
|
|
||||||
// called by Coordinator.Apply under the coordinator lock, so it is already
|
|
||||||
// serialized; the mutex here only guards reads from the spec goroutine.
|
|
||||||
type recordingSink struct {
|
|
||||||
mu sync.Mutex
|
|
||||||
log []Effect
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *recordingSink) Perform(e Effect) {
|
|
||||||
s.mu.Lock()
|
|
||||||
s.log = append(s.log, e)
|
|
||||||
s.mu.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *recordingSink) snapshot() []Effect {
|
|
||||||
s.mu.Lock()
|
|
||||||
defer s.mu.Unlock()
|
|
||||||
out := make([]Effect, len(s.log))
|
|
||||||
copy(out, s.log)
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
// checkLog replays the effect log and asserts the turn-lifecycle safety
|
|
||||||
// properties from docs/design/realtime-state-machines.md, Part 4 (invariant #4
|
|
||||||
// and the discardTurn/speechStarted desync, failure mode 4):
|
|
||||||
//
|
|
||||||
// (1) at most one turn open at any instant -- OpenTurn never fires while a
|
|
||||||
// turn is already open;
|
|
||||||
// (2) every turn id is opened at most once;
|
|
||||||
// (3) no orphan close -- CommitTurn/DiscardTurn only fire on an open turn.
|
|
||||||
//
|
|
||||||
// The wire pairing of speech_started/speech_stopped is intentionally NOT
|
|
||||||
// reconstructed here: like the legacy no-speech clear, an Abort discards the
|
|
||||||
// turn without a speech_stopped (the failed-transcription event is its closure
|
|
||||||
// signal). The guarantee this package adds is the *state* coupling (Speaking
|
|
||||||
// <=> a turn is open), checked inline in the property spec below.
|
|
||||||
func checkLog(log []Effect) {
|
|
||||||
open := false
|
|
||||||
opens := map[TurnID]int{}
|
|
||||||
for i, eff := range log {
|
|
||||||
switch e := eff.(type) {
|
|
||||||
case OpenTurn:
|
|
||||||
Expect(open).To(BeFalse(), "invariant (1): OpenTurn(%s) while a turn is already open (effect #%d)\nlog=%v", e.Turn, i, log)
|
|
||||||
open = true
|
|
||||||
opens[e.Turn]++
|
|
||||||
Expect(opens[e.Turn]).To(Equal(1), "invariant (2): turn %s opened %d times (effect #%d)\nlog=%v", e.Turn, opens[e.Turn], i, log)
|
|
||||||
case CommitTurn:
|
|
||||||
Expect(open).To(BeTrue(), "invariant (3): CommitTurn(%s) with no open turn (effect #%d)\nlog=%v", e.Turn, i, log)
|
|
||||||
open = false
|
|
||||||
case DiscardTurn:
|
|
||||||
Expect(open).To(BeTrue(), "invariant (3): DiscardTurn(%s) with no open turn (effect #%d)\nlog=%v", e.Turn, i, log)
|
|
||||||
open = false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// unknownEvent / unknownState exercise the defensive error path for a type that
// Next does not know about (a future variant added without updating Next).
type unknownEvent struct{}

func (unknownEvent) isEvent()       {}
func (unknownEvent) String() string { return "unknownEvent" }

type unknownState struct{}

func (unknownState) isState()       {}
func (unknownState) String() string { return "unknownState" }
|
|
||||||
|
|
||||||
var _ = Describe("turncoord.Next", func() {
|
|
||||||
// DescribeTable exhaustively pins every (state, event) cell of the pure
|
|
||||||
// transition function, including the idle no-op cells. This is the practical
|
|
||||||
// stand-in for "no transition leads to an inconsistent state": if a cell
|
|
||||||
// changes, this table must change with it.
|
|
||||||
DescribeTable("transitions",
|
|
||||||
func(state State, event Event, wantState State, wantEff []Effect) {
|
|
||||||
gotState, gotEff, err := Next(state, event)
|
|
||||||
Expect(err).NotTo(HaveOccurred())
|
|
||||||
Expect(gotState).To(Equal(wantState))
|
|
||||||
Expect(gotEff).To(Equal(wantEff))
|
|
||||||
},
|
|
||||||
Entry("idle+onset -> speaking: open, barge-in, speech_started",
|
|
||||||
Idle{}, Onset{Turn: "t1"},
|
|
||||||
Speaking{Turn: "t1"},
|
|
||||||
[]Effect{OpenTurn{Turn: "t1"}, BargeIn{}, EmitSpeechStarted{}}),
|
|
||||||
Entry("idle+silence -> idle, no-op (nothing to commit)",
|
|
||||||
Idle{}, Silence{},
|
|
||||||
Idle{}, []Effect(nil)),
|
|
||||||
Entry("idle+abort -> idle, no-op (nothing open)",
|
|
||||||
Idle{}, Abort{Reason: AbortNoSpeech},
|
|
||||||
Idle{}, []Effect(nil)),
|
|
||||||
Entry("speaking+onset -> stay speaking, no-op (already speaking)",
|
|
||||||
Speaking{Turn: "t1"}, Onset{Turn: "t2"}, // a fresh id is ignored mid-turn
|
|
||||||
Speaking{Turn: "t1"}, []Effect(nil)),
|
|
||||||
Entry("speaking+silence -> idle: speech_stopped + commit",
|
|
||||||
Speaking{Turn: "t1"}, Silence{},
|
|
||||||
Idle{}, []Effect{EmitSpeechStopped{}, CommitTurn{Turn: "t1"}}),
|
|
||||||
Entry("speaking+abort(no_speech) -> idle: discard",
|
|
||||||
Speaking{Turn: "t1"}, Abort{Reason: AbortNoSpeech},
|
|
||||||
Idle{}, []Effect{DiscardTurn{Turn: "t1"}}),
|
|
||||||
Entry("speaking+abort(teardown) -> idle: discard",
|
|
||||||
Speaking{Turn: "t9"}, Abort{Reason: AbortTeardown},
|
|
||||||
Idle{}, []Effect{DiscardTurn{Turn: "t9"}}),
|
|
||||||
)
|
|
||||||
|
|
||||||
It("is total: every defined (state, event) pair is handled without error", func() {
|
|
||||||
states := []State{Idle{}, Speaking{Turn: "t1"}}
|
|
||||||
events := []Event{
|
|
||||||
Onset{Turn: "t2"},
|
|
||||||
Silence{},
|
|
||||||
Abort{Reason: AbortNoSpeech},
|
|
||||||
Abort{Reason: AbortTeardown},
|
|
||||||
}
|
|
||||||
for _, s := range states {
|
|
||||||
for _, e := range events {
|
|
||||||
_, _, err := Next(s, e)
|
|
||||||
Expect(err).NotTo(HaveOccurred(), "Next(%s, %s)", s, e)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
It("errors on an unknown event type", func() {
|
|
||||||
_, _, err := Next(Speaking{Turn: "t1"}, unknownEvent{})
|
|
||||||
Expect(err).To(HaveOccurred())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("errors on an unknown state type", func() {
|
|
||||||
_, _, err := Next(unknownState{}, Onset{Turn: "t1"})
|
|
||||||
Expect(err).To(HaveOccurred())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
var _ = Describe("turncoord.Coordinator", func() {
|
|
||||||
// This replaces the previous rapid stateful test: a seeded random walk over
|
|
||||||
// the event space, asserting after every step both the log invariants and
|
|
||||||
// the core state coupling -- the machine is in Speaking IFF a turn is
|
|
||||||
// currently open. That coupling is the whole point of M2: in the legacy code
|
|
||||||
// speechStarted and the live-stream-open flag were separate variables a
|
|
||||||
// discard could desync; here they are one state and cannot. Seeds are fixed
|
|
||||||
// so any failure reproduces deterministically (the failing seed/step is in
|
|
||||||
// the assertion message).
|
|
||||||
It("keeps state coupled to turn-open over random event sequences", func() {
|
|
||||||
seeds := []uint64{1, 2, 3, 42, 1337, 0xC0FFEE}
|
|
||||||
for _, seed := range seeds {
|
|
||||||
r := rand.New(rand.NewPCG(seed, 0xA5A5A5A5))
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New(sink)
|
|
||||||
var nextTurn uint64
|
|
||||||
open := false // independent model of "is a turn open"
|
|
||||||
|
|
||||||
for step := range 5000 {
|
|
||||||
switch r.IntN(3) {
|
|
||||||
case 0:
|
|
||||||
nextTurn++
|
|
||||||
Expect(c.Apply(Onset{Turn: TurnID(fmt.Sprintf("t%d", nextTurn))})).To(Succeed())
|
|
||||||
open = true // onset opens a turn (or is a no-op if already open)
|
|
||||||
case 1:
|
|
||||||
Expect(c.Apply(Silence{})).To(Succeed())
|
|
||||||
open = false // commit (or no-op if already idle)
|
|
||||||
case 2:
|
|
||||||
Expect(c.Apply(Abort{Reason: AbortReason(r.IntN(2))})).To(Succeed())
|
|
||||||
open = false // discard (or no-op if already idle)
|
|
||||||
}
|
|
||||||
_, speaking := c.State().(Speaking)
|
|
||||||
Expect(speaking).To(Equal(open), "coupling: seed=%d step=%d state=%s", seed, step, c.State())
|
|
||||||
}
|
|
||||||
checkLog(sink.snapshot())
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
// M2 is single-writer in practice (handleVAD), but teardown can Abort from
|
|
||||||
// another goroutine, so the Coordinator must be race-safe. Run under -race;
|
|
||||||
// the log invariants must hold regardless of interleaving.
|
|
||||||
It("is race-safe under concurrent Apply from two goroutines", func() {
|
|
||||||
const perGoroutine = 2000
|
|
||||||
sink := &recordingSink{}
|
|
||||||
c := New(sink)
|
|
||||||
|
|
||||||
var idCounter uint64
|
|
||||||
var idMu sync.Mutex
|
|
||||||
nextTurn := func() TurnID {
|
|
||||||
idMu.Lock()
|
|
||||||
defer idMu.Unlock()
|
|
||||||
idCounter++
|
|
||||||
return TurnID(fmt.Sprintf("t%d", idCounter))
|
|
||||||
}
|
|
||||||
|
|
||||||
var wg sync.WaitGroup
|
|
||||||
drive := func(reason AbortReason) {
|
|
||||||
defer wg.Done()
|
|
||||||
for i := range perGoroutine {
|
|
||||||
switch i % 3 {
|
|
||||||
case 0:
|
|
||||||
_ = c.Apply(Onset{Turn: nextTurn()})
|
|
||||||
case 1:
|
|
||||||
_ = c.Apply(Silence{})
|
|
||||||
case 2:
|
|
||||||
_ = c.Apply(Abort{Reason: reason})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
wg.Add(2)
|
|
||||||
go drive(AbortNoSpeech)
|
|
||||||
go drive(AbortTeardown)
|
|
||||||
wg.Wait()
|
|
||||||
|
|
||||||
checkLog(sink.snapshot())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
var _ = DescribeTable("turncoord stringers",
|
|
||||||
func(got, want string) { Expect(got).To(Equal(want)) },
|
|
||||||
Entry(nil, AbortNoSpeech.String(), "no_speech"),
|
|
||||||
Entry(nil, AbortTeardown.String(), "teardown"),
|
|
||||||
Entry(nil, AbortReason(99).String(), "AbortReason(99)"),
|
|
||||||
|
|
||||||
Entry(nil, Idle{}.String(), "Idle"),
|
|
||||||
Entry(nil, Speaking{Turn: "t7"}.String(), "Speaking(t7)"),
|
|
||||||
|
|
||||||
Entry(nil, Onset{Turn: "t1"}.String(), "Onset(t1)"),
|
|
||||||
Entry(nil, Silence{}.String(), "Silence"),
|
|
||||||
Entry(nil, Abort{Reason: AbortTeardown}.String(), "Abort(teardown)"),
|
|
||||||
|
|
||||||
Entry(nil, BargeIn{}.String(), "BargeIn"),
|
|
||||||
Entry(nil, OpenTurn{Turn: "t2"}.String(), "OpenTurn(t2)"),
|
|
||||||
Entry(nil, EmitSpeechStarted{}.String(), "EmitSpeechStarted"),
|
|
||||||
Entry(nil, EmitSpeechStopped{}.String(), "EmitSpeechStopped"),
|
|
||||||
Entry(nil, CommitTurn{Turn: "t3"}.String(), "CommitTurn(t3)"),
|
|
||||||
Entry(nil, DiscardTurn{Turn: "t4"}.String(), "DiscardTurn(t4)"),
|
|
||||||
)
|
|
||||||
@@ -1,87 +0,0 @@
|
|||||||
import { test, expect } from './coverage-fixtures.js'
|
|
||||||
|
|
||||||
// Audio snippets on the Traces page must play through a blob: object URL —
|
|
||||||
// the CSP's connect-src allows blob: but not data:, and the waveform peaks
|
|
||||||
// renderer fetch()es the player src — and must degrade to a readable note
|
|
||||||
// (not a broken player) when the stored payload is the "<truncated: N bytes>"
|
|
||||||
// marker an older server stamped into oversized fields.
|
|
||||||
|
|
||||||
// Minimal valid 16 kHz mono 16-bit PCM WAV (0.1s 440 Hz sine), base64-encoded.
//
// `samples` is the number of 16-bit mono samples and `rate` the sample rate in
// Hz; the defaults give 1600 / 16000 = 0.1 s of audio. The result is a 44-byte
// RIFF/WAVE header followed by `samples * 2` bytes of little-endian PCM.
function wavBase64(samples = 1600, rate = 16000) {
  const dataSize = samples * 2 // 2 bytes per 16-bit mono sample
  const buf = Buffer.alloc(44 + dataSize)

  // RIFF container header.
  buf.write('RIFF', 0)
  buf.writeUInt32LE(36 + dataSize, 4) // bytes that follow this field
  buf.write('WAVE', 8)

  // "fmt " chunk describing the sample format.
  buf.write('fmt ', 12)
  buf.writeUInt32LE(16, 16) // fmt chunk size
  buf.writeUInt16LE(1, 20) // PCM
  buf.writeUInt16LE(1, 22) // mono
  buf.writeUInt32LE(rate, 24) // sample rate
  buf.writeUInt32LE(rate * 2, 28) // byte rate = rate * block align
  buf.writeUInt16LE(2, 32) // block align (bytes per frame)
  buf.writeUInt16LE(16, 34) // bits per sample

  // "data" chunk: a 440 Hz sine at amplitude 8000.
  buf.write('data', 36)
  buf.writeUInt32LE(dataSize, 40)
  for (let i = 0; i < samples; i++) {
    buf.writeInt16LE(Math.round(8000 * Math.sin((2 * Math.PI * 440 * i) / rate)), 44 + i * 2)
  }
  return buf.toString('base64')
}
|
|
||||||
|
|
||||||
// Builds a backend-trace fixture of type 'transcription' for the mocked
// /api/backend-traces response. `audioWavBase64` is stored verbatim in
// data.audio_wav_base64, so callers can pass either a real base64 WAV or a
// non-base64 marker string. All other fields are fixed placeholder values.
function transcriptionTrace(audioWavBase64) {
  return {
    type: 'transcription',
    // Date.now() is milliseconds; scaled by 1e6 (presumably to nanoseconds — confirm against the server's trace schema).
    timestamp: Date.now() * 1_000_000,
    model_name: 'parakeet-test',
    summary: 'transcribed utterance',
    duration: 500_000_000,
    error: null,
    data: {
      audio_wav_base64: audioWavBase64,
      audio_duration_s: 0.1,
      audio_snippet_s: 0.1,
      audio_sample_rate: 16000,
      audio_samples: 1600,
      audio_rms_dbfs: -12.0,
      audio_peak_dbfs: -6.0,
      audio_dc_offset: 0,
    },
  }
}
|
|
||||||
|
|
||||||
// Opens the Traces page with mocked trace endpoints and expands the first
// backend-trace row for the 'parakeet-test' model.
//
// `page` is the Playwright page; `traces` is the array served as the JSON
// body of /api/backend-traces (the /api/traces endpoint is served an empty
// list). Resolves once the row has been clicked.
async function openBackendTraceRow(page, traces) {
  // Return the fulfill() promise from each handler so a failure surfaces
  // through Playwright instead of becoming an unhandled rejection.
  await page.route('**/api/traces', (route) =>
    route.fulfill({ contentType: 'application/json', body: JSON.stringify([]) })
  )
  await page.route('**/api/backend-traces', (route) =>
    route.fulfill({ contentType: 'application/json', body: JSON.stringify(traces) })
  )
  await page.goto('/app/traces')
  // Wait for the page to render before switching tabs.
  await expect(page.locator('text=Tracing is')).toBeVisible({ timeout: 10_000 })
  await page.locator('button', { hasText: 'Backend Traces' }).click()
  await page.locator('td', { hasText: 'parakeet-test' }).first().click()
}
|
|
||||||
|
|
||||||
test.describe('Traces - Audio Snippets', () => {
  test('plays a clip through a blob: URL, not a CSP-blocked data: URL', async ({ page }) => {
    await openBackendTraceRow(page, [transcriptionTrace(wavBase64())])

    // The expanded row carries the snippet metrics and a player whose source
    // is an object URL (connect-src allows blob:, so the peaks fetch works).
    await expect(page.locator('text=Audio Snippet')).toBeVisible()
    const audio = page.locator('audio')
    await expect(audio).toHaveCount(1)
    // Web-first assertion: retries until the src attribute is a blob: URL
    // rather than sampling the attribute once.
    await expect(audio).toHaveAttribute('src', /^blob:/)
    await expect(page.getByTestId('audio-snippet-unavailable')).toHaveCount(0)
  })

  test('shows a readable note instead of a broken player for truncated payloads', async ({ page }) => {
    // The payload is the truncation marker, not base64, so no player should render.
    await openBackendTraceRow(page, [transcriptionTrace('<truncated: 281660 bytes>')])

    await expect(page.locator('text=Audio Snippet')).toBeVisible()
    await expect(page.getByTestId('audio-snippet-unavailable')).toBeVisible()
    await expect(page.locator('audio')).toHaveCount(0)
  })
})
|
|
||||||
@@ -19,31 +19,24 @@ const STATUS_STYLES = {
|
|||||||
error: { icon: 'fa-solid fa-circle', color: 'var(--color-error)', bg: 'var(--color-error-light)' },
|
error: { icon: 'fa-solid fa-circle', color: 'var(--color-error)', bg: 'var(--color-error-light)' },
|
||||||
}
|
}
|
||||||
|
|
||||||
// upsertEntry merges a streamed transcript fragment into the entry identified
|
// upsertAssistant merges a streamed transcript fragment into the assistant entry
|
||||||
// by the server's item_id, or appends a new entry (with the given role) if
|
// identified by the server's item_id, or appends a new entry if none exists yet.
|
||||||
// none exists yet. Keying by item_id (not a mutable index tracked across
|
// Keying by item_id (not a mutable index tracked across handler/updater
|
||||||
// handler/updater boundaries) makes streamed deltas idempotent and
|
// boundaries) makes streamed deltas idempotent and order-independent, so React's
|
||||||
// order-independent, so React's batching of non-React data-channel events
|
// batching of non-React data-channel events cannot produce a duplicate bubble.
|
||||||
// cannot produce a duplicate bubble. mode 'append' adds to the running text;
|
// mode 'append' adds to the running text; 'replace' sets the final transcript.
|
||||||
// 'replace' sets the final transcript — the server sends a completed event
|
function upsertAssistant(prev, itemId, text, mode) {
|
||||||
// whose authoritative text supersedes any live captions (e.g. the
|
// Only assistant entries carry an id, and the streaming entry is almost
|
||||||
// semantic_vad retranscribe gate's batch decode).
|
// always the newest — search from the tail so per-delta cost stays constant.
|
||||||
function upsertEntry(prev, itemId, role, text, mode) {
|
|
||||||
// The streaming entry is almost always the newest — search from the tail
|
|
||||||
// so per-delta cost stays constant.
|
|
||||||
const i = prev.findLastIndex(e => e.id === itemId)
|
const i = prev.findLastIndex(e => e.id === itemId)
|
||||||
if (i === -1) {
|
if (i === -1) {
|
||||||
return [...prev, { role, id: itemId, text }]
|
return [...prev, { role: 'assistant', id: itemId, text }]
|
||||||
}
|
}
|
||||||
const next = [...prev]
|
const next = [...prev]
|
||||||
next[i] = { ...next[i], text: mode === 'append' ? next[i].text + text : text }
|
next[i] = { ...next[i], text: mode === 'append' ? next[i].text + text : text }
|
||||||
return next
|
return next
|
||||||
}
|
}
|
||||||
|
|
||||||
function upsertAssistant(prev, itemId, text, mode) {
|
|
||||||
return upsertEntry(prev, itemId, 'assistant', text, mode)
|
|
||||||
}
|
|
||||||
|
|
||||||
export default function Talk() {
|
export default function Talk() {
|
||||||
const { addToast } = useOutletContext()
|
const { addToast } = useOutletContext()
|
||||||
const navigate = useNavigate()
|
const navigate = useNavigate()
|
||||||
@@ -259,33 +252,12 @@ export default function Talk() {
|
|||||||
case 'input_audio_buffer.speech_stopped':
|
case 'input_audio_buffer.speech_stopped':
|
||||||
updateStatus('thinking', 'Processing...')
|
updateStatus('thinking', 'Processing...')
|
||||||
break
|
break
|
||||||
case 'conversation.item.input_audio_transcription.delta':
|
|
||||||
// Live captions: semantic_vad streams the user's words while they
|
|
||||||
// are still speaking, keyed by the item id the commit will reuse.
|
|
||||||
if (event.delta && event.item_id) {
|
|
||||||
setTranscript(prev => upsertEntry(prev, event.item_id, 'user', event.delta, 'append'))
|
|
||||||
}
|
|
||||||
break
|
|
||||||
case 'conversation.item.input_audio_transcription.completed':
|
case 'conversation.item.input_audio_transcription.completed':
|
||||||
if (event.transcript) {
|
if (event.transcript) {
|
||||||
if (event.item_id) {
|
setTranscript(prev => [...prev, { role: 'user', text: event.transcript }])
|
||||||
// Replaces any live captions with the authoritative transcript
|
|
||||||
// (which may differ, e.g. the retranscribe gate's batch decode);
|
|
||||||
// creates the entry when there were none (server_vad).
|
|
||||||
setTranscript(prev => upsertEntry(prev, event.item_id, 'user', event.transcript, 'replace'))
|
|
||||||
} else {
|
|
||||||
setTranscript(prev => [...prev, { role: 'user', text: event.transcript }])
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
updateStatus('thinking', 'Generating response...')
|
updateStatus('thinking', 'Generating response...')
|
||||||
break
|
break
|
||||||
case 'conversation.item.input_audio_transcription.failed':
|
|
||||||
// The turn was discarded after captions were shown (e.g. the buffer
|
|
||||||
// was cleared as silence) — retract the partial entry.
|
|
||||||
if (event.item_id) {
|
|
||||||
setTranscript(prev => prev.filter(e => e.id !== event.item_id))
|
|
||||||
}
|
|
||||||
break
|
|
||||||
case 'response.output_audio_transcript.delta':
|
case 'response.output_audio_transcript.delta':
|
||||||
if (event.delta) {
|
if (event.delta) {
|
||||||
inProgressIdRef.current = event.item_id
|
inProgressIdRef.current = event.item_id
|
||||||
@@ -740,7 +712,7 @@ export default function Talk() {
|
|||||||
)}
|
)}
|
||||||
{selectedModelInfo && !selectedModelInfo.self_contained && (
|
{selectedModelInfo && !selectedModelInfo.self_contained && (
|
||||||
<div style={{
|
<div style={{
|
||||||
display: 'flex', flexDirection: 'column', gap: 'var(--spacing-xs)',
|
display: 'grid', gridTemplateColumns: 'repeat(4, minmax(0, 1fr))', gap: 'var(--spacing-xs)',
|
||||||
marginBottom: 'var(--spacing-xs)', fontSize: '0.75rem',
|
marginBottom: 'var(--spacing-xs)', fontSize: '0.75rem',
|
||||||
}}>
|
}}>
|
||||||
{[
|
{[
|
||||||
@@ -752,12 +724,9 @@ export default function Talk() {
|
|||||||
<div key={item.label} style={{
|
<div key={item.label} style={{
|
||||||
background: 'var(--color-bg-secondary)', borderRadius: 'var(--radius-sm)',
|
background: 'var(--color-bg-secondary)', borderRadius: 'var(--radius-sm)',
|
||||||
padding: 'var(--spacing-xs)', border: '1px solid var(--color-border)',
|
padding: 'var(--spacing-xs)', border: '1px solid var(--color-border)',
|
||||||
display: 'flex', alignItems: 'baseline', gap: 'var(--spacing-sm)',
|
|
||||||
}}>
|
}}>
|
||||||
<div style={{ color: 'var(--color-text-secondary)', whiteSpace: 'nowrap' }}>{item.label}</div>
|
<div style={{ color: 'var(--color-text-secondary)', marginBottom: 2 }}>{item.label}</div>
|
||||||
{/* full width for the value; wrap rather than overflow when the
|
<div style={{ fontFamily: 'var(--font-mono)', overflow: 'hidden', textOverflow: 'ellipsis', whiteSpace: 'nowrap' }}>{item.value}</div>
|
||||||
model name is long (minWidth:0 lets the flex item shrink) */}
|
|
||||||
<div style={{ fontFamily: 'var(--font-mono)', minWidth: 0, marginLeft: 'auto', textAlign: 'right', overflowWrap: 'anywhere' }}>{item.value || '—'}</div>
|
|
||||||
</div>
|
</div>
|
||||||
))}
|
))}
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -86,40 +86,8 @@ function typeBadgeStyle(type) {
|
|||||||
return { background: c.bg, color: c.color, padding: '2px 8px', borderRadius: 'var(--radius-sm)', fontSize: '0.75rem', fontWeight: 500 }
|
return { background: c.bg, color: c.color, padding: '2px 8px', borderRadius: 'var(--radius-sm)', fontSize: '0.75rem', fontWeight: 500 }
|
||||||
}
|
}
|
||||||
|
|
||||||
// useWavObjectURL — decode a base64 WAV payload into a blob: object URL for
// the waveform player. A data: URL would render in <audio> (media-src allows
// data:) but the peaks renderer fetch()es the src and the CSP's connect-src
// only allows blob:, so playback broke with a CSP violation. Decoding to a
// Blob also tolerates payloads that aren't valid base64 — e.g. the
// "<truncated: N bytes>" marker older servers stamped into oversized fields —
// by yielding null instead of a broken player.
//
// Returns the current object URL, or null when `b64` is empty or cannot be
// decoded. The URL is revoked when `b64` changes or the component unmounts.
function useWavObjectURL(b64) {
  const [url, setUrl] = useState(null)
  useEffect(() => {
    if (!b64) {
      setUrl(null)
      return undefined
    }
    let objectUrl = null
    try {
      // atob throws on input that is not valid base64; handled below.
      const bin = atob(b64)
      const bytes = new Uint8Array(bin.length)
      for (let i = 0; i < bin.length; i++) bytes[i] = bin.charCodeAt(i)
      objectUrl = URL.createObjectURL(new Blob([bytes], { type: 'audio/wav' }))
      setUrl(objectUrl)
    } catch {
      setUrl(null)
    }
    // Cleanup releases the URL created by this run of the effect, if any.
    return () => {
      if (objectUrl) URL.revokeObjectURL(objectUrl)
    }
  }, [b64])
  return url
}
|
|
||||||
|
|
||||||
// Audio player + metrics for transcription traces
|
// Audio player + metrics for transcription traces
|
||||||
function AudioSnippet({ data }) {
|
function AudioSnippet({ data }) {
|
||||||
const audioUrl = useWavObjectURL(data?.audio_wav_base64)
|
|
||||||
if (!data?.audio_wav_base64) return null
|
if (!data?.audio_wav_base64) return null
|
||||||
const metrics = [
|
const metrics = [
|
||||||
{ label: 'Duration', value: data.audio_duration_s + 's' },
|
{ label: 'Duration', value: data.audio_duration_s + 's' },
|
||||||
@@ -136,11 +104,7 @@ function AudioSnippet({ data }) {
|
|||||||
<i className="fas fa-headphones" style={{ color: 'var(--color-primary)' }} /> Audio Snippet
|
<i className="fas fa-headphones" style={{ color: 'var(--color-primary)' }} /> Audio Snippet
|
||||||
</h4>
|
</h4>
|
||||||
<div style={{ background: 'var(--color-bg-primary)', border: '1px solid var(--color-border)', borderRadius: 'var(--radius-md)', padding: 'var(--spacing-sm)' }}>
|
<div style={{ background: 'var(--color-bg-primary)', border: '1px solid var(--color-border)', borderRadius: 'var(--radius-md)', padding: 'var(--spacing-sm)' }}>
|
||||||
{audioUrl
|
<WaveformPlayer src={`data:audio/wav;base64,${data.audio_wav_base64}`} height={64} />
|
||||||
? <WaveformPlayer src={audioUrl} height={64} />
|
|
||||||
: <div data-testid="audio-snippet-unavailable" style={{ fontSize: '0.75rem', color: 'var(--color-text-secondary)', padding: 'var(--spacing-xs)' }}>
|
|
||||||
<i className="fas fa-triangle-exclamation" /> Audio clip not playable — it was truncated when recorded (raise Max Body Bytes in the tracing settings).
|
|
||||||
</div>}
|
|
||||||
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fill, minmax(120px, 1fr))', gap: 'var(--spacing-xs)', fontSize: '0.75rem', marginTop: 'var(--spacing-sm)' }}>
|
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fill, minmax(120px, 1fr))', gap: 'var(--spacing-xs)', fontSize: '0.75rem', marginTop: 'var(--spacing-sm)' }}>
|
||||||
{metrics.map(m => (
|
{metrics.map(m => (
|
||||||
<div key={m.label} style={{ background: 'var(--color-bg-secondary)', borderRadius: 'var(--radius-sm)', padding: 'var(--spacing-xs)' }}>
|
<div key={m.label} style={{ background: 'var(--color-bg-secondary)', borderRadius: 'var(--radius-sm)', padding: 'var(--spacing-xs)' }}>
|
||||||
|
|||||||
@@ -24,11 +24,6 @@ type TranscriptionResult struct {
|
|||||||
Text string `json:"text"`
|
Text string `json:"text"`
|
||||||
Language string `json:"language,omitempty"`
|
Language string `json:"language,omitempty"`
|
||||||
Duration float64 `json:"duration,omitempty"`
|
Duration float64 `json:"duration,omitempty"`
|
||||||
// Eou reports that the decode ended on the model's end-of-utterance
|
|
||||||
// special token (emitted by streaming-EOU models such as
|
|
||||||
// parakeet_realtime_eou_120m-v1; always false elsewhere). The marker
|
|
||||||
// itself never appears in Text.
|
|
||||||
Eou bool `json:"eou,omitempty"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type TranscriptionSegmentSeconds struct {
|
type TranscriptionSegmentSeconds struct {
|
||||||
|
|||||||
@@ -130,6 +130,20 @@ func WithLockCtx(ctx context.Context, db *gorm.DB, key int64, fn func() error) e
|
|||||||
}
|
}
|
||||||
defer conn.Close()
|
defer conn.Close()
|
||||||
|
|
||||||
|
// Neutralize any deployment-wide lock_timeout on this dedicated connection.
|
||||||
|
// Operators commonly set a short global lock_timeout (on the role or
|
||||||
|
// database) to bound ordinary row-lock waits. Applied to the blocking
|
||||||
|
// pg_advisory_lock below, it aborts the wait with SQLSTATE 55P03 and turns
|
||||||
|
// LocalAI's intentional cross-replica "wait your turn, then re-check"
|
||||||
|
// coordination into a hard error for the caller (e.g. a chat request that
|
||||||
|
// just wanted to reuse a model another replica is loading). Let the Go
|
||||||
|
// context be the single source of truth for how long we wait instead.
|
||||||
|
if _, err := conn.ExecContext(ctx, "SET lock_timeout = 0"); err != nil {
|
||||||
|
return fmt.Errorf("advisorylock: disabling lock_timeout: %w", err)
|
||||||
|
}
|
||||||
|
// Restore the session default before this pooled connection is reused.
|
||||||
|
defer func() { _, _ = conn.ExecContext(context.Background(), "RESET lock_timeout") }()
|
||||||
|
|
||||||
if _, err := conn.ExecContext(ctx, "SELECT pg_advisory_lock($1)", key); err != nil {
|
if _, err := conn.ExecContext(ctx, "SELECT pg_advisory_lock($1)", key); err != nil {
|
||||||
return fmt.Errorf("advisorylock: acquiring lock %d: %w", key, err)
|
return fmt.Errorf("advisorylock: acquiring lock %d: %w", key, err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -158,6 +158,53 @@ var _ = Describe("AdvisoryLock", func() {
|
|||||||
Expect(err).To(HaveOccurred())
|
Expect(err).To(HaveOccurred())
|
||||||
})
|
})
|
||||||
|
|
||||||
|
It("waits out a short server-side lock_timeout instead of failing with 55P03", func() {
	const lockKey int64 = 703

	// Reproduce the production deployment that triggered this: a short
	// global lock_timeout set on the database. Without the fix, a waiter
	// blocked on pg_advisory_lock() is aborted by the server after this
	// window and surfaces SQLSTATE 55P03 ("canceling statement due to
	// lock timeout") to the caller instead of waiting for its turn.
	Expect(db.Exec("ALTER DATABASE testdb SET lock_timeout = '300ms'").Error).ToNot(HaveOccurred())
	// ALTER DATABASE ... SET persists beyond this spec; undo it so later
	// specs that open new connections do not inherit the 300ms default.
	DeferCleanup(func() {
		Expect(db.Exec("ALTER DATABASE testdb RESET lock_timeout").Error).ToNot(HaveOccurred())
	})
	sqlDB, err := db.DB()
	Expect(err).ToNot(HaveOccurred())
	// Drop pooled connections so subsequent ones reconnect and inherit
	// the new database-level lock_timeout default.
	// NOTE(review): the idle-connection limit is left at 0 afterwards; the
	// pool's previous setting is not visible here, so it is not restored.
	sqlDB.SetMaxIdleConns(0)

	holding := make(chan struct{})
	released := make(chan struct{})
	go func() {
		defer GinkgoRecover()
		herr := WithLockCtx(context.Background(), db, lockKey, func() error {
			close(holding)
			// Hold well past the 300ms server lock_timeout.
			time.Sleep(1 * time.Second)
			return nil
		})
		Expect(herr).ToNot(HaveOccurred())
		close(released)
	}()

	<-holding // ensure the holder owns the lock before we contend

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	executed := false
	start := time.Now()
	werr := WithLockCtx(ctx, db, lockKey, func() error {
		executed = true
		return nil
	})
	Expect(werr).ToNot(HaveOccurred(),
		"waiter should wait out the in-progress hold, not fail with lock_timeout (55P03)")
	Expect(executed).To(BeTrue())
	// The holder sleeps for 1s, so a waiter that really blocked cannot
	// have returned in under the 400ms asserted here.
	Expect(time.Since(start)).To(BeNumerically(">=", 400*time.Millisecond),
		"waiter should have actually waited for the holder to release")
	<-released
})
|
||||||
|
|
||||||
It("serializes concurrent WithLockCtx on same key", func() {
|
It("serializes concurrent WithLockCtx on same key", func() {
|
||||||
const lockKey int64 = 702
|
const lockKey int64 = 702
|
||||||
|
|
||||||
|
|||||||
@@ -241,9 +241,6 @@ func (c *fakeBackendClient) AudioTransformStream(_ context.Context, _ ...ggrpc.C
|
|||||||
func (c *fakeBackendClient) AudioToAudioStream(_ context.Context, _ ...ggrpc.CallOption) (grpc.AudioToAudioStreamClient, error) {
|
func (c *fakeBackendClient) AudioToAudioStream(_ context.Context, _ ...ggrpc.CallOption) (grpc.AudioToAudioStreamClient, error) {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
func (c *fakeBackendClient) AudioTranscriptionLive(_ context.Context, _ ...ggrpc.CallOption) (grpc.AudioTranscriptionLiveClient, error) {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
func (c *fakeBackendClient) Forward(_ context.Context, _ ...ggrpc.CallOption) (grpc.ForwardClient, error) {
|
func (c *fakeBackendClient) Forward(_ context.Context, _ ...ggrpc.CallOption) (grpc.ForwardClient, error) {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -195,10 +195,6 @@ func (f *fakeGRPCBackend) AudioToAudioStream(_ context.Context, _ ...ggrpc.CallO
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f *fakeGRPCBackend) AudioTranscriptionLive(_ context.Context, _ ...ggrpc.CallOption) (grpc.AudioTranscriptionLiveClient, error) {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (f *fakeGRPCBackend) Forward(_ context.Context, _ ...ggrpc.CallOption) (grpc.ForwardClient, error) {
|
func (f *fakeGRPCBackend) Forward(_ context.Context, _ ...ggrpc.CallOption) (grpc.ForwardClient, error) {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -68,6 +68,13 @@ type SmartRouterOptions struct {
|
|||||||
// the absolute model paths untouched so the worker loads them directly from
|
// the absolute model paths untouched so the worker loads them directly from
|
||||||
// the shared volume (#10556). See config.DistributedConfig.SharedModels.
|
// the shared volume (#10556). See config.DistributedConfig.SharedModels.
|
||||||
SharedModels bool
|
SharedModels bool
|
||||||
|
// ModelLoadCeiling is the hard upper bound on how long a single cold-load
|
||||||
|
// attempt (node selection -> backend install -> file staging -> LoadModel)
|
||||||
|
// may run while holding the per-model advisory lock. It backstops every
|
||||||
|
// sub-step's own timeout so a wedged worker can never pin the lock - and
|
||||||
|
// every other replica's request for that model - indefinitely. Zero selects
|
||||||
|
// defaultModelLoadCeiling.
|
||||||
|
ModelLoadCeiling time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
// SmartRouter routes inference requests to the best available backend node.
|
// SmartRouter routes inference requests to the best available backend node.
|
||||||
@@ -101,8 +108,18 @@ type SmartRouter struct {
|
|||||||
// sharedModels skips file staging when all nodes mount the same models
|
// sharedModels skips file staging when all nodes mount the same models
|
||||||
// directory at the same path (see SmartRouterOptions.SharedModels).
|
// directory at the same path (see SmartRouterOptions.SharedModels).
|
||||||
sharedModels bool
|
sharedModels bool
|
||||||
|
// modelLoadCeiling bounds how long a cold load may hold the per-model
|
||||||
|
// advisory lock (see SmartRouterOptions.ModelLoadCeiling).
|
||||||
|
modelLoadCeiling time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// defaultModelLoadCeiling is the fallback hold ceiling for a cold model load.
|
||||||
|
// It must comfortably exceed the slowest legitimate load - a multi-GB backend
|
||||||
|
// install (DefaultBackendInstallTimeout, 15m) plus staging and the remote
|
||||||
|
// LoadModel (5m) - so it never cuts a real load short; it only ever fires when
|
||||||
|
// a step is genuinely wedged (e.g. a worker that died mid-install).
|
||||||
|
const defaultModelLoadCeiling = 25 * time.Minute
|
||||||
|
|
||||||
// probeCacheTTL is how long a successful gRPC HealthCheck on a backend is
|
// probeCacheTTL is how long a successful gRPC HealthCheck on a backend is
|
||||||
// trusted before the next request re-probes. Matches healthCheckTTL in
|
// trusted before the next request re-probes. Matches healthCheckTTL in
|
||||||
// pkg/model/model.go so the single-process and distributed paths share a
|
// pkg/model/model.go so the single-process and distributed paths share a
|
||||||
@@ -117,6 +134,10 @@ func NewSmartRouter(registry ModelRouter, opts SmartRouterOptions) *SmartRouter
|
|||||||
if factory == nil {
|
if factory == nil {
|
||||||
factory = &tokenClientFactory{token: opts.AuthToken}
|
factory = &tokenClientFactory{token: opts.AuthToken}
|
||||||
}
|
}
|
||||||
|
ceiling := opts.ModelLoadCeiling
|
||||||
|
if ceiling <= 0 {
|
||||||
|
ceiling = defaultModelLoadCeiling
|
||||||
|
}
|
||||||
return &SmartRouter{
|
return &SmartRouter{
|
||||||
registry: registry,
|
registry: registry,
|
||||||
unloader: opts.Unloader,
|
unloader: opts.Unloader,
|
||||||
@@ -131,6 +152,7 @@ func NewSmartRouter(registry ModelRouter, opts SmartRouterOptions) *SmartRouter
|
|||||||
prefixConfig: opts.PrefixConfig,
|
prefixConfig: opts.PrefixConfig,
|
||||||
pressure: opts.Pressure,
|
pressure: opts.Pressure,
|
||||||
sharedModels: opts.SharedModels,
|
sharedModels: opts.SharedModels,
|
||||||
|
modelLoadCeiling: ceiling,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -383,11 +405,19 @@ func (r *SmartRouter) Route(ctx context.Context, modelID, modelName, backendType
|
|||||||
// the request context. If staging were bound to it, the multi-GB upload
|
// the request context. If staging were bound to it, the multi-GB upload
|
||||||
// aborts with "context canceled" mid-transfer and large models can never
|
// aborts with "context canceled" mid-transfer and large models can never
|
||||||
// finish staging (the model-load outage). WithoutCancel keeps the request's
|
// finish staging (the model-load outage). WithoutCancel keeps the request's
|
||||||
// values (prefix chain, etc.) but drops its cancellation/deadline. Each
|
// values (prefix chain, etc.) but drops its cancellation/deadline.
|
||||||
// long step still has its own bound (the file stager's resume budget,
|
//
|
||||||
// LoadModel's 5m timeout), and the per-model advisory lock below de-dupes
|
// Detaching from the caller is necessary, but it must not be unbounded: the
|
||||||
// concurrent loaders across replicas.
|
// load runs while holding the per-model advisory lock, and a worker that
|
||||||
loadCtx := context.WithoutCancel(ctx)
|
// dies mid-install (its backend.install never replies) would otherwise pin
|
||||||
|
// that lock (and every other replica's request for the same model) until
|
||||||
|
// the NATS install deadline alone expires. Re-impose a single hard ceiling
|
||||||
|
// over the whole sequence so the lock is always released in bounded time,
|
||||||
|
// even if a sub-step wedges. Each long step still has its own (tighter)
|
||||||
|
// bound; this only backstops them. The per-model advisory lock below
|
||||||
|
// de-dupes concurrent loaders across replicas.
|
||||||
|
loadCtx, cancelLoad := context.WithTimeout(context.WithoutCancel(ctx), r.modelLoadCeiling)
|
||||||
|
defer cancelLoad()
|
||||||
loadModel := func(ctx context.Context) (*RouteResult, error) {
|
loadModel := func(ctx context.Context) (*RouteResult, error) {
|
||||||
// Re-check after acquiring lock — another request may have loaded it
|
// Re-check after acquiring lock — another request may have loaded it
|
||||||
node, nm, err := r.registry.FindAndLockNodeWithModel(ctx, trackingKey, candidateNodeIDs, pref)
|
node, nm, err := r.registry.FindAndLockNodeWithModel(ctx, trackingKey, candidateNodeIDs, pref)
|
||||||
@@ -916,7 +946,14 @@ func (r *SmartRouter) installBackendOnNode(ctx context.Context, node *BackendNod
|
|||||||
}
|
}
|
||||||
|
|
||||||
key := fmt.Sprintf("%s|%s|%s|%d", node.ID, backendType, modelID, replicaIndex)
|
key := fmt.Sprintf("%s|%s|%s|%d", node.ID, backendType, modelID, replicaIndex)
|
||||||
v, err, _ := r.installFlight.Do(key, func() (any, error) {
|
// DoChan rather than Do so this wait honors ctx cancellation. InstallBackend
|
||||||
|
// blocks for its full NATS deadline (15m by default) when a worker accepts
|
||||||
|
// the request but never replies (e.g. it died mid-install). Without ctx
|
||||||
|
// awareness the caller (holding the per-model advisory lock) would sit there
|
||||||
|
// the whole time; here a cancelled ctx (typically the model-load ceiling)
|
||||||
|
// frees the caller promptly. The shared install keeps running in the
|
||||||
|
// background and still coalesces other callers via singleflight.
|
||||||
|
resCh := r.installFlight.DoChan(key, func() (any, error) {
|
||||||
reply, err := r.unloader.InstallBackend(node.ID, backendType, modelID, r.galleriesJSON, "", "", "", replicaIndex, "", nil)
|
reply, err := r.unloader.InstallBackend(node.ID, backendType, modelID, r.galleriesJSON, "", "", "", replicaIndex, "", nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
@@ -931,10 +968,15 @@ func (r *SmartRouter) installBackendOnNode(ctx context.Context, node *BackendNod
|
|||||||
}
|
}
|
||||||
return addr, nil
|
return addr, nil
|
||||||
})
|
})
|
||||||
if err != nil {
|
select {
|
||||||
return "", err
|
case <-ctx.Done():
|
||||||
|
return "", ctx.Err()
|
||||||
|
case res := <-resCh:
|
||||||
|
if res.Err != nil {
|
||||||
|
return "", res.Err
|
||||||
|
}
|
||||||
|
return res.Val.(string), nil
|
||||||
}
|
}
|
||||||
return v.(string), nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *SmartRouter) buildClientForAddr(node *BackendNode, addr string, parallel bool) grpc.Backend {
|
func (r *SmartRouter) buildClientForAddr(node *BackendNode, addr string, parallel bool) grpc.Backend {
|
||||||
|
|||||||
@@ -493,6 +493,44 @@ var _ = Describe("SmartRouter", func() {
|
|||||||
Expect(result.Node.ID).To(Equal("n3"))
|
Expect(result.Node.ID).To(Equal("n3"))
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
Context("worker wedges mid-install (dead node holding the lock)", func() {
|
||||||
|
It("aborts the load at the ModelLoadCeiling instead of blocking forever", func() {
|
||||||
|
// Simulate the production incident: the chosen worker accepts the
|
||||||
|
// backend.install but never replies (it died), so InstallBackend
|
||||||
|
// would otherwise block for its full NATS deadline (15m by
|
||||||
|
// default) while pinning the per-model advisory lock. Route must
|
||||||
|
// give up at the ceiling so the lock is released promptly.
|
||||||
|
reg.findAndLockErr = errors.New("not found")
|
||||||
|
reg.findIdleNode = &BackendNode{ID: "n4", Name: "dead-node", Address: "10.0.0.4:50051"}
|
||||||
|
|
||||||
|
block := make(chan struct{})
|
||||||
|
defer close(block) // let the background install goroutine drain at test end
|
||||||
|
unloader.installHook = func() { <-block }
|
||||||
|
|
||||||
|
router := NewSmartRouter(reg, SmartRouterOptions{
|
||||||
|
Unloader: unloader,
|
||||||
|
ClientFactory: factory,
|
||||||
|
ModelLoadCeiling: 200 * time.Millisecond,
|
||||||
|
})
|
||||||
|
|
||||||
|
done := make(chan error, 1)
|
||||||
|
start := time.Now()
|
||||||
|
go func() {
|
||||||
|
defer GinkgoRecover()
|
||||||
|
_, err := router.Route(context.Background(), "wedged-model",
|
||||||
|
"models/wedged.gguf", "llama-cpp",
|
||||||
|
&pb.ModelOptions{Model: "models/wedged.gguf"}, false)
|
||||||
|
done <- err
|
||||||
|
}()
|
||||||
|
|
||||||
|
var routeErr error
|
||||||
|
Eventually(done, 5*time.Second).Should(Receive(&routeErr),
|
||||||
|
"Route must not block on a wedged install past the ceiling")
|
||||||
|
Expect(routeErr).To(HaveOccurred())
|
||||||
|
Expect(time.Since(start)).To(BeNumerically("<", 5*time.Second))
|
||||||
|
})
|
||||||
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
Describe("scheduleNewModel (mock-based, via Route)", func() {
|
Describe("scheduleNewModel (mock-based, via Route)", func() {
|
||||||
|
|||||||
@@ -75,8 +75,8 @@ var (
|
|||||||
// trace) or any TTS run (~1.3 MiB of audio_wav_base64 per trace) blows the
|
// trace) or any TTS run (~1.3 MiB of audio_wav_base64 per trace) blows the
|
||||||
// payload past tens of MiB and locks the Traces page in a loading state.
|
// payload past tens of MiB and locks the Traces page in a loading state.
|
||||||
//
|
//
|
||||||
// 0 disables the cap. Guarded by backendMu; refreshed on EVERY
|
// 0 disables the cap. Set on the first InitBackendTracingIfEnabled call only,
|
||||||
// InitBackendTracingIfEnabled call — see below.
|
// matching the sync.Once-guarded maxItems semantics.
|
||||||
var backendMaxBodyBytes int
|
var backendMaxBodyBytes int
|
||||||
|
|
||||||
func InitBackendTracingIfEnabled(maxItems, maxBodyBytes int) {
|
func InitBackendTracingIfEnabled(maxItems, maxBodyBytes int) {
|
||||||
@@ -86,6 +86,7 @@ func InitBackendTracingIfEnabled(maxItems, maxBodyBytes int) {
|
|||||||
}
|
}
|
||||||
backendMu.Lock()
|
backendMu.Lock()
|
||||||
backendTraceBuffer = circularbuffer.New[*BackendTrace](maxItems)
|
backendTraceBuffer = circularbuffer.New[*BackendTrace](maxItems)
|
||||||
|
backendMaxBodyBytes = maxBodyBytes
|
||||||
backendMu.Unlock()
|
backendMu.Unlock()
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
@@ -98,26 +99,11 @@ func InitBackendTracingIfEnabled(maxItems, maxBodyBytes int) {
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
})
|
})
|
||||||
|
|
||||||
// The body cap tracks the LATEST call, not the first: tracing_max_body_bytes
|
|
||||||
// is runtime-mutable via the settings API (ApplyRuntimeSettings), and every
|
|
||||||
// recording path calls this right before RecordBackendTrace with the current
|
|
||||||
// appConfig value. Freezing the cap on first init meant a raised setting let
|
|
||||||
// producers (e.g. trace.AudioSnippet, which reads the live value) embed
|
|
||||||
// payloads that this recorder then stomped with the "<truncated: N bytes>"
|
|
||||||
// marker — corrupting audio_wav_base64 into an unplayable string. maxItems
|
|
||||||
// keeps first-call semantics: resizing the ring buffer would drop entries.
|
|
||||||
backendMu.Lock()
|
|
||||||
backendMaxBodyBytes = maxBodyBytes
|
|
||||||
backendMu.Unlock()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func RecordBackendTrace(t BackendTrace) {
|
func RecordBackendTrace(t BackendTrace) {
|
||||||
backendMu.Lock()
|
if t.Data != nil && backendMaxBodyBytes > 0 {
|
||||||
maxBody := backendMaxBodyBytes
|
t.Data = capDataStrings(t.Data, backendMaxBodyBytes)
|
||||||
backendMu.Unlock()
|
|
||||||
if t.Data != nil && maxBody > 0 {
|
|
||||||
t.Data = capDataStrings(t.Data, maxBody)
|
|
||||||
}
|
}
|
||||||
select {
|
select {
|
||||||
case backendLogChan <- &t:
|
case backendLogChan <- &t:
|
||||||
|
|||||||
@@ -28,9 +28,8 @@ const (
|
|||||||
|
|
||||||
var _ = Describe("RecordBackendTrace Data capping", func() {
|
var _ = Describe("RecordBackendTrace Data capping", func() {
|
||||||
BeforeEach(func() {
|
BeforeEach(func() {
|
||||||
// The ring buffer is allocated once (sync.Once) but the body cap
|
// Init is sync.Once so the first test wins; subsequent tests just
|
||||||
// follows the latest call, so each spec re-establishes smallCap here
|
// clear the buffer. The cap value below has to match the first call.
|
||||||
// regardless of what a previous spec set.
|
|
||||||
trace.InitBackendTracingIfEnabled(64, smallCap)
|
trace.InitBackendTracingIfEnabled(64, smallCap)
|
||||||
trace.ClearBackendTraces()
|
trace.ClearBackendTraces()
|
||||||
})
|
})
|
||||||
@@ -132,30 +131,6 @@ var _ = Describe("RecordBackendTrace Data capping", func() {
|
|||||||
got := trace.GetBackendTraces()[0]
|
got := trace.GetBackendTraces()[0]
|
||||||
Expect(got.Data["messages"]).To(Equal(preTruncated))
|
Expect(got.Data["messages"]).To(Equal(preTruncated))
|
||||||
})
|
})
|
||||||
|
|
||||||
It("applies a runtime-raised cap without a restart", func() {
|
|
||||||
// tracing_max_body_bytes is runtime-mutable via the settings API.
|
|
||||||
// Producers like AudioSnippet read the live value, so the recorder
|
|
||||||
// must too — under the old first-call-wins behaviour a raised cap
|
|
||||||
// kept truncating audio_wav_base64 payloads the producer had already
|
|
||||||
// let through, corrupting them into "<truncated: N bytes>" markers.
|
|
||||||
oversizedForOldCap := strings.Repeat("w", smallCap*4)
|
|
||||||
|
|
||||||
trace.InitBackendTracingIfEnabled(64, smallCap*8) // simulate the settings raise
|
|
||||||
trace.RecordBackendTrace(trace.BackendTrace{
|
|
||||||
Timestamp: time.Now(),
|
|
||||||
Type: trace.BackendTraceTranscription,
|
|
||||||
ModelName: "m",
|
|
||||||
Data: map[string]any{
|
|
||||||
"audio_wav_base64": oversizedForOldCap,
|
|
||||||
},
|
|
||||||
})
|
|
||||||
|
|
||||||
Eventually(trace.GetBackendTraces).Should(HaveLen(1))
|
|
||||||
got := trace.GetBackendTraces()[0]
|
|
||||||
Expect(got.Data["audio_wav_base64"]).To(Equal(oversizedForOldCap),
|
|
||||||
"a payload under the raised cap must survive intact")
|
|
||||||
})
|
|
||||||
})
|
})
|
||||||
|
|
||||||
var _ = Describe("TruncateToBytes", func() {
|
var _ = Describe("TruncateToBytes", func() {
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
48.5
|
45.0
|
||||||
|
|||||||
@@ -56,41 +56,6 @@ pipeline:
|
|||||||
|
|
||||||
All streaming flags are off by default, so existing pipelines are unaffected.
|
All streaming flags are off by default, so existing pipelines are unaffected.
|
||||||
|
|
||||||
### Turn detection
|
|
||||||
|
|
||||||
Turn detection decides when the user has finished speaking and the pipeline should respond. Two modes are supported, matching the OpenAI session schema:
|
|
||||||
|
|
||||||
- **`server_vad`** (default): silence-based. The VAD model watches the audio and the turn commits after `silence_duration_ms` (default 500 ms) of silence. Simple and model-agnostic, but a fixed silence window must trade interrupting mid-sentence pauses against sluggish responses.
|
|
||||||
- **`semantic_vad`**: model-driven. The transcription model itself signals end-of-utterance and the silence window becomes dynamic: short right after the model emits its end-of-utterance token, much longer when it does not — so pausing to think no longer gets cut off, while finished sentences get a fast response.
|
|
||||||
|
|
||||||
`semantic_vad` requires a transcription model that emits an end-of-utterance token over a cache-aware streaming decode — currently `parakeet-cpp-realtime_eou_120m-v1` (the model is trained to distinguish "paused, expecting a reply" from "paused mid-thought"). The realtime pipeline feeds it the microphone audio live while the user speaks. With any other transcription backend the session degrades gracefully to silence-only detection using the eagerness timeout below (a warning is logged once). The model also emits a distinct end-of-backchannel token (`<EOB>`) for short acknowledgments like "uh-huh": those are transcribed but never treated as the user yielding the turn.
|
|
||||||
|
|
||||||
Sessions can opt in via `session.update` (`turn_detection: {"type": "semantic_vad", "eagerness": "medium"}`), or the pipeline can set a server-side default so clients need no changes:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
name: gpt-realtime
|
|
||||||
pipeline:
|
|
||||||
vad: silero-vad-ggml
|
|
||||||
transcription: parakeet-cpp-realtime_eou_120m-v1
|
|
||||||
llm: qwen3-4b
|
|
||||||
tts: tts-1
|
|
||||||
turn_detection:
|
|
||||||
type: semantic_vad # default for sessions on this model (server_vad if unset)
|
|
||||||
eagerness: medium # low | medium | high | auto (auto == medium)
|
|
||||||
retranscribe: false # see below
|
|
||||||
```
|
|
||||||
|
|
||||||
A client `session.update` still overrides `type` and `eagerness` per session.
|
|
||||||
|
|
||||||
**Eagerness** sets the fallback silence window used when no end-of-utterance token was seen (the model missed it, or the user genuinely trails off): `low` waits 8 s, `medium`/`auto` 4 s, `high` 2 s — the same max-timeout semantics OpenAI documents. After the token is seen, the turn commits on the next VAD tick (~300 ms).
|
|
||||||
|
|
||||||
**Live captions**: while the user speaks, `semantic_vad` streams `conversation.item.input_audio_transcription.delta` events under the item id the commit will later reuse, so clients can render the words as they are recognized. The `completed` event at commit carries the authoritative transcript and replaces the partial text (with `retranscribe: true` it may differ from the captions); a turn discarded before commit emits `conversation.item.input_audio_transcription.failed` so clients can retract its captions.
|
|
||||||
|
|
||||||
**`retranscribe`** (server-side only, semantic_vad only) cross-checks the streaming decode against a batch decode at commit time:
|
|
||||||
|
|
||||||
- `false` (default): the transcript accumulated from the live stream is used as-is — the model runs once per utterance and the LLM starts immediately at commit.
|
|
||||||
- `true`: the committed audio is re-transcribed offline. If the batch decode also ends with the end-of-utterance token the turn proceeds (using the batch transcript); if it does **not**, the commit is cancelled and the session keeps listening — treating the streaming token as a false positive. Both transcripts are compared and logged, which makes this mode a useful diagnostic for how well the streaming and batch decodes align, at the cost of one extra decode per turn.
|
|
||||||
|
|
||||||
### Disabling thinking
|
### Disabling thinking
|
||||||
|
|
||||||
For reasoning models, you can force the pipeline LLM's thinking off without editing the LLM model config:
|
For reasoning models, you can force the pipeline LLM's thinking off without editing the LLM model config:
|
||||||
|
|||||||
@@ -1,603 +0,0 @@
|
|||||||
# Realtime API state machines — map & re-architecture research
|
|
||||||
|
|
||||||
Status: research / design (compaction phase). No code changes implied yet.
|
|
||||||
|
|
||||||
The realtime API (`core/http/endpoints/openai/realtime*.go`) grew feature-by-feature
|
|
||||||
(server_vad → semantic_vad/EOU, streaming pipeline, tool turns, compaction, voice
|
|
||||||
gate, sound detection, WebRTC). The result is several **implicit** state machines
|
|
||||||
whose states and transitions are scattered across goroutine-local variables, shared
|
|
||||||
`Session`/`Conversation` fields under five different mutexes, raw channels, and
|
|
||||||
`context` cancellation. State is *inferred* from variable combinations rather than
|
|
||||||
*stored*; several illegal/inconsistent states are reachable.
|
|
||||||
|
|
||||||
This document (1) inventories the implicit machines, (2) catalogues the cross-cutting
|
|
||||||
failure modes, (3) researches how to re-implement them explicitly and verifiably, and
|
|
||||||
(4) lists the invariants a correct implementation must guarantee.
|
|
||||||
|
|
||||||
All line numbers are against the current `feat/realtime-semantic-vad-eou` branch and
|
|
||||||
will drift; treat them as anchors.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Part 1 — Inventory of the implicit state machines
|
|
||||||
|
|
||||||
There is **no `state`/`status` field anywhere** in `Session` or `Conversation`. Every
|
|
||||||
machine below is reconstructed from variable combinations.
|
|
||||||
|
|
||||||
### M1. Connection / transport lifecycle
|
|
||||||
|
|
||||||
Two transports implement one `Transport` interface; their lifecycles differ sharply.
|
|
||||||
|
|
||||||
- **WebSocket** (`realtime_transport_ws.go`): essentially stateless — a `*websocket.Conn`
|
|
||||||
plus a write `sync.Mutex`. No send queue, no send goroutine, no closed flag. "Closed"
|
|
||||||
= `ReadEvent` returns an error.
|
|
||||||
- **WebRTC** (`realtime_transport_webrtc.go`): an explicit-ish machine built from raw
|
|
||||||
channels — `dcReady` (closed by `dcDone sync.OnceFunc`), `closed` (closed by
|
|
||||||
`closeDone sync.OnceFunc` from *either* `OnConnectionStateChange` or `Close()`),
|
|
||||||
`flushed`, `sessionCh` (cap 1), `inEvents`/`outEvents` (cap 256), plus a `sendLoop`
|
|
||||||
goroutine and RTP counters under `rtpMu`.
|
|
||||||
|
|
||||||
Conceptual states (`connecting → data-channel-open → session-created → active →
|
|
||||||
closing → closed`) are **not stored**; the only persisted membership state is the
|
|
||||||
`sessions[sessionID]` map entry (created at `realtime.go:631`, deleted at `:1009`). `session-created`
|
|
||||||
and `session-updated` are *events*, not states.
|
|
||||||
|
|
||||||
Teardown order (`realtime.go:989-1010`): `cancelActiveResponse` → `close(decodeDone)`
|
|
||||||
→ `close(done)` (if VAD running) → `close(soundWindowDone)` → `wg.Wait()` →
|
|
||||||
`delete(sessions,…)`. Then, WebRTC only, `defer transport.Close()` → `closeDone()` →
|
|
||||||
`<-flushed` → `pc.Close()`.
|
|
||||||
|
|
||||||
### M2. Audio-input / turn-detection (server_vad + semantic_vad + EOU)
|
|
||||||
|
|
||||||
One `handleVAD` goroutine (`realtime.go:1322`) on a 300 ms ticker. Mode is
|
|
||||||
**re-evaluated every tick** under `sessionLock` (`:1350-1357`) so it can flip mid-turn.
|
|
||||||
|
|
||||||
- **server_vad** states are encoded by the goroutine-local `speechStarted bool`
|
|
||||||
(`:1337`) plus silence *measured* (not timed) as `audioLength - segEndTime >
|
|
||||||
silenceThreshold` recomputed each tick (`:1461`). States: idle → inspecting →
|
|
||||||
speech-detected → awaiting-commit → committing → transcribing/responding.
|
|
||||||
"Holdback" is a byte count (`noSpeechHoldbackSec*rate*2`), not a timer.
|
|
||||||
- **semantic_vad** adds the `liveTurnState` struct (`realtime_semantic_vad.go`):
|
|
||||||
`live` (nil = closed), `unavailable` (sticky degrade → behaves as server_vad),
|
|
||||||
`eouAtSec`, `parts`, `itemID` (allocated at turn open so captions can stream),
|
|
||||||
`deltasSent`. Extra states: closed, open/streaming-ASR, EOU-pending, EOU-fallback
|
|
||||||
(dynamic silence threshold 0 s when EOU pending, else eagerness 8/4/2 s),
|
|
||||||
retranscribe-gate, EOU-rejected, finished, discarded.
|
|
||||||
The one cross-goroutine edge: the backend recv callback pushes onto `events`
|
|
||||||
(buffered 64, **non-blocking — drops on overflow**, `:116-117`); `drainEvents`
|
|
||||||
reads it on the tick.
|
|
||||||
- **Voice gate** (`realtime_voicegate.go`) runs *inside* the commit goroutine:
|
|
||||||
resolving → authorized/rejected, with a sticky `voiceVerified` (under `gateMu`) for
|
|
||||||
`when:first`.
|
|
||||||
|
|
||||||
### M3. Response lifecycle (+ synchronous tool-turn recursion)
|
|
||||||
|
|
||||||
A response is "active" iff `Session.activeResponseDone` is non-nil and unclosed
|
|
||||||
(`responseMu`, `:172`). One goroutine owns it; its lifetime == that channel's. State
|
|
||||||
is observable only through the `response.*` event stream and `ItemStatus*` on the
|
|
||||||
assistant item. Logical states: idle → starting → generating-text →
|
|
||||||
generating-audio → tool-call-pending → tool-executing → awaiting-next-tool-turn →
|
|
||||||
cancelling → done(completed|cancelled) | failed.
|
|
||||||
|
|
||||||
- Cancellation is **cooperative at discrete checkpoints** (`ctx.Err()` at
|
|
||||||
`:2172,2364,2394`, `realtime_stream.go:193,202,241,259`).
|
|
||||||
- The tool loop is **synchronous recursion on the same goroutine**, bounded by
|
|
||||||
`maxAssistantToolTurns = 10`; each level mints a fresh `responseID` and emits a full
|
|
||||||
`response.created … response.done{Completed}` cycle — so one user turn can emit
|
|
||||||
*several* `response.done{Completed}` events under different IDs.
|
|
||||||
- Terminal events are **not exactly-once**: failed paths `return` with no
|
|
||||||
`response.done`; cancelled paths emit `done{Cancelled}`; the completed terminal is
|
|
||||||
unconditional at the tail of `emitToolCallItems`.
|
|
||||||
|
|
||||||
### M4. Conversation / compaction
|
|
||||||
|
|
||||||
`Conversation`: `Items` + `Memory` (rolling summary) under `Lock`; `compacting
|
|
||||||
atomic.Bool`. States: normal ↔ compacting. Compaction (`realtime_compaction.go`)
|
|
||||||
snapshots overflow under `Lock`, summarizes **unlocked**, re-locks and commits guarded
|
|
||||||
by an optimistic head-`prefixMatches` check. It is launched **only by turn-0
|
|
||||||
`triggerResponse`** (`:1963`), off the response path — so a long agentic turn
|
|
||||||
(recursion calls `triggerResponseAtTurn` directly) can append many tool items and
|
|
||||||
**never compact** until the next user turn (compaction starvation).
|
|
||||||
|
|
||||||
### M5. Streaming sub-machines (transcription, chunker, TTS)
|
|
||||||
|
|
||||||
Backend LLM/TTS/transcription streams are **synchronous callback recv loops on the
|
|
||||||
caller's goroutine** — no internal goroutines/channels. The only true concurrent FSM is:
|
|
||||||
|
|
||||||
- **TTS pipeline** (`realtime_tts_pipeline.go`): one worker goroutine, an **unbounded**
|
|
||||||
mutex-guarded `queue`, a coalesced `wake` chan (cap 1), a `closed` flag, a `done`
|
|
||||||
chan closed once by the worker's `defer`, a lock-free `failed atomic.Bool`, and
|
|
||||||
worker-owned `audio`/`firstErr` that are safe to read only after `wait()` joins via
|
|
||||||
`done`. Idempotent `wait()`; deferred `wait()` backstop guarantees no worker leak.
|
|
||||||
- **Chunker** (`realtime_chunker.go`): a pure single-buffer FSM (buffering ↔ emitting,
|
|
||||||
`flush` = hard boundary). **No concurrency guard** — correctness depends entirely on
|
|
||||||
`push`/`flush` being called from one goroutine (the LLM recv loop). On cancel the
|
|
||||||
flush is skipped, so the buffered partial clause is intentionally dropped.
|
|
||||||
- **Transcription** (`realtime_transcription.go`): stateless straight-line function;
|
|
||||||
"streaming" is just repeated synchronous callbacks.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Part 2 — Cross-cutting failure modes (why it's a mess)
|
|
||||||
|
|
||||||
1. **Shared mutable `Session` config with inconsistent locking (the core problem).**
|
|
||||||
`updateSession`/`updateTransSession` mutate `Voice`, `Instructions`, `Tools`,
|
|
||||||
`OutputModalities`, `ModelConfig`, **`ModelInterface`**, sample rates, and the
|
|
||||||
shared `InputAudioTranscription` pointer under `sessionLock`. But in-flight
|
|
||||||
response/speech/transcription goroutines read those same fields **without any
|
|
||||||
lock** (`realtime_speech.go:72-79`, `realtime_stream.go:228`, semantic_vad
|
|
||||||
`:110`). Reloading `ModelInterface` mid-response is a data race against a running
|
|
||||||
Predict/TTS/Transcribe, and the swapped-out model is dropped without Close.
|
|
||||||
`sessionLock` actually guards the *global `sessions` map*; it only mutually excludes
|
|
||||||
the handful of other sites that happen to also take it (handleVAD tick, the commit
|
|
||||||
branch). Response goroutines never take it.
|
|
||||||
|
|
||||||
2. **Two writers of the active-response pair.** `startResponse`/`cancelActiveResponse`
|
|
||||||
are called from both the main read loop (`:836,973,981,990`) **and** the VAD
|
|
||||||
goroutine (barge-in `:1429`, end-of-speech `:1543`). `responseMu` guards only the
|
|
||||||
field swap; the `<-done` wait is outside the lock. A read-loop `ResponseCreate`
|
|
||||||
racing a VAD `speech_stopped` can have both read the same prior pair, both
|
|
||||||
overwrite, and briefly leave **two live response goroutines** both appending to
|
|
||||||
`conv.Items`. The "never overlapping" guarantee holds only under the unstated
|
|
||||||
assumption that responses are driven from a single goroutine — which is false.
|
|
||||||
|
|
||||||
3. **State is inferred, not stored.** Whether a response is active, whether a turn is
|
|
||||||
open, whether audio is being buffered — all are derived from combinations of
|
|
||||||
booleans, nil-checks, channel state, and `context` error. No single source of truth;
|
|
||||||
no place to assert an invariant.
|
|
||||||
|
|
||||||
4. **Reachable inconsistent states.** e.g. after a semantic-VAD `discardTurn`,
|
|
||||||
`speechStarted` stays true while `lts` is closed, so they disagree and the next
|
|
||||||
onset suppresses `SpeechStarted`. Mid-stream cancel leaves the client having seen
|
|
||||||
`output_item.added`/`content_part.added` with no matching `…done`. `events`-channel
|
|
||||||
overflow silently drops an EOU, degrading EOU-pending to the 2–8 s fallback.
|
|
||||||
|
|
||||||
5. **Lifecycle/ownership gaps.** `decodeOpusLoop` is a bare `go` (not in `wg`) and can
|
|
||||||
run after `delete(sessions,…)`. `handleIncomingAudioTrack` (pion `OnTrack`
|
|
||||||
goroutine) has **no shutdown signal** — it appends to `OpusFrames` until `ReadRTP`
|
|
||||||
errors, unjoined by `wg`. WebRTC `outEvents` enqueued before the DC opens are lost
|
|
||||||
on early failure.
|
|
||||||
|
|
||||||
6. **The `done`-channel/`vadServerStarted` toggle dance.** A single `done` local
|
|
||||||
(`:655`) is reassigned to a fresh channel on each VAD start (`:662`) and closed at
|
|
||||||
toggle-off (`:670`) and teardown (`:999`). Safe today only because one goroutine
|
|
||||||
owns it — one variable name meaning different channels over time is a structural
|
|
||||||
fragility, not an explicit lifecycle.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Part 3 — Research: explicit, verifiable re-implementation
|
|
||||||
|
|
||||||
The goal the user stated: **transitions cannot lead to an inconsistent state, and we
|
|
||||||
can verify that.** Four layered techniques, from architecture down to runtime.
|
|
||||||
|
|
||||||
### 3.1 Architecture: single-writer session actor (share by communicating)
|
|
||||||
|
|
||||||
The root cause of (1) and (2) is *shared mutable state across goroutines*. The most
|
|
||||||
effective, idiomatic-Go fix is to give each session **one owning goroutine** that holds
|
|
||||||
all session state with **no locks**, and have every other goroutine communicate with it
|
|
||||||
over channels:
|
|
||||||
|
|
||||||
```
|
|
||||||
┌────────── inbound events ──────────┐
|
|
||||||
transport ─┤ client events (ReadEvent) │
|
|
||||||
VAD ─┤ vad: speech_started/stopped, EOU ├─► session actor ──► outbound
|
|
||||||
model I/O ─┤ llm/tts/asr results, errors │ (owns ALL state, events
|
|
||||||
timers ─┤ ticks, deadlines │ single goroutine)
|
|
||||||
└────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
- All state mutation happens in one place; `sessionLock`, `responseMu`, `gateMu`,
|
|
||||||
`AudioBufferLock`, `OpusFramesLock`, `Conversation.Lock` collapse into "the actor owns
|
|
||||||
it." Worker goroutines (Predict/TTS/ASR, opus decode, RTP read) become **stateless
|
|
||||||
effects** that take an immutable snapshot in and send results back as events.
|
|
||||||
- `ModelInterface` reload becomes an event the actor sequences relative to responses
|
|
||||||
(e.g. drain/cancel the active response first), eliminating the mid-call swap race.
|
|
||||||
- Cancellation stays `context`-based but the actor is the only thing that starts/stops
|
|
||||||
responses, killing the dual-writer race (2).
|
|
||||||
|
|
||||||
This is the actor / CSP model. It does not by itself prove correctness — that's what
|
|
||||||
3.2–3.4 add — but it makes the state *centralized and explicit*, which is the
|
|
||||||
precondition for verification.
|
|
||||||
|
|
||||||
### 3.2 Make illegal states unrepresentable (type-level)
|
|
||||||
|
|
||||||
Inside the actor, model each machine as an explicit state with a **pure transition
|
|
||||||
function** `next(state, event) (state, []effect, error)`:
|
|
||||||
|
|
||||||
- Represent states as a Go **sealed sum type** (interface with an unexported marker
|
|
||||||
method, one struct per state carrying only that state's data) so e.g. `EOU-pending`
|
|
||||||
data cannot be accessed while `Closed`. This is the Go equivalent of an ADT and is the
|
|
||||||
single biggest lever for "inconsistent state unrepresentable."
|
|
||||||
- The transition function is **total and pure** (no I/O, no goroutines): it returns the
|
|
||||||
next state plus a list of *effects* (send event, start Predict, arm timer) that the
|
|
||||||
actor executes. Pure transition functions are trivially unit-testable and
|
|
||||||
property-testable.
|
|
||||||
- An unexpected `(state, event)` pair returns an explicit error / stays put and logs —
|
|
||||||
never a silent half-transition.
|
|
||||||
|
|
||||||
The five machines are **hierarchical** (a statechart): Connection(M1) ⊃ Turn(M2) and
|
|
||||||
Response(M3) ⊃ Tool-turn; Conversation(M4) and the TTS sub-machine(M5) are largely
|
|
||||||
orthogonal regions. Model them as nested states rather than one flat enum.
|
|
||||||
|
|
||||||
Library options (all guard *logic*, none give concurrency safety — that's 3.1's job):
|
|
||||||
- `qmuntal/stateless` — declarative, hierarchical, guard/entry/exit actions; closest fit.
|
|
||||||
- `looplab/fsm` — simpler, flat, event-callback based.
|
|
||||||
- Hand-rolled transition tables — most control, no dep; recommended here given the
|
|
||||||
hierarchy and the desire to keep transitions auditable. `go.mod` currently pulls no
|
|
||||||
FSM lib.
|
|
||||||
|
|
||||||
### 3.3 Design-time formal verification (prove the protocol)
|
|
||||||
|
|
||||||
Before/while coding, model the *protocol* (not the Go) in a model checker to prove the
|
|
||||||
hard concurrency properties exhaustively:
|
|
||||||
|
|
||||||
- **FizzBee** (the adopted tool) to specify the actor's event/state space and check: no
|
|
||||||
two concurrent active responses; barge-in + ResponseCancel + speech_stopped
|
|
||||||
interleavings never deadlock or drop a turn; every `response.created` is eventually
|
|
||||||
followed by exactly one terminal; teardown joins all goroutines. The
|
|
||||||
cancel/startResponse/barge-in interplay (failure mode 2) is exactly the kind of
|
|
||||||
liveness/safety property model checkers exist for.
|
|
||||||
- Keep the spec small and focused on the M2↔M3 boundary (turn detection ↔ response),
|
|
||||||
which is where the real races live.
|
|
||||||
|
|
||||||
### 3.4 Implementation-time & runtime verification
|
|
||||||
|
|
||||||
- **Exhaustive table-driven transition tests**: since transitions are a pure function,
|
|
||||||
enumerate `(state × event)` and assert the result for every cell, including the
|
|
||||||
illegal cells (assert they error / no-op). This is the practical stand-in for a proof
|
|
||||||
that "no transition leads to inconsistent state."
|
|
||||||
- **Property-based testing**: feed random event sequences into the actor and assert
|
|
||||||
global invariants hold after every step (Part 4). This catches reachable-bad-state
|
|
||||||
bugs the example tests miss. (Implemented as Ginkgo/Gomega seeded random-walk specs
|
|
||||||
— see Part 6.2 for why not `rapid`.)
|
|
||||||
- **Race detector under load**: run the property tests with `-race`; with 3.1 there
|
|
||||||
should be *zero* shared mutable state, so `-race` cleanliness becomes a meaningful
|
|
||||||
signal rather than noise.
|
|
||||||
- **Runtime invariant assertions + structured transition logging**: log every
|
|
||||||
`state --event--> state` with the session ID; assert invariants in dev builds.
|
|
||||||
Replace today's silent degradations (dropped EOU, suppressed SpeechStarted) with
|
|
||||||
explicit, observable transitions.
|
|
||||||
|
|
||||||
### 3.5 Recommended path for LocalAI
|
|
||||||
|
|
||||||
1. Specify the M2↔M3 protocol in FizzBee; nail the cancel/barge-in invariants.
|
|
||||||
2. Introduce a per-session actor (3.1) that owns existing state behind the current
|
|
||||||
`Transport` interface — incremental, keeps the event types.
|
|
||||||
3. Replace each implicit machine with an explicit sealed-state transition function
|
|
||||||
(3.2), one at a time: Response first (highest-risk dual-writer), then Turn/VAD, then
|
|
||||||
Connection, then leave TTS/Chunker/Compaction (already mostly self-contained) for
|
|
||||||
last.
|
|
||||||
4. Land the table-driven + property-based test suites alongside each machine; gate on
|
|
||||||
`-race`.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Part 4 — Invariants a correct implementation must guarantee
|
|
||||||
|
|
||||||
These are the "cannot reach inconsistent state" properties to encode as assertions,
|
|
||||||
property-test oracles, and FizzBee invariants:
|
|
||||||
|
|
||||||
1. **At most one active response per session** at any instant (no overlapping response
|
|
||||||
goroutines; no two appenders to `conv.Items` from response logic).
|
|
||||||
2. **Exactly one terminal per `response.created`**: every emitted `response.created` is
|
|
||||||
followed by exactly one of `response.done{completed|cancelled}` or a defined failure
|
|
||||||
terminal — never zero, never two. (Decide whether agentic tool turns are one
|
|
||||||
response or many; make it explicit either way.)
|
|
||||||
3. **No `response.*` content events after that response's terminal.** No
|
|
||||||
`output_item.added`/`content_part.added` without a matching `…done` (even on cancel).
|
|
||||||
4. **Turn/response coupling**: `speechStarted` ⟺ a live turn is open; barge-in cancels
|
|
||||||
the active response *before* a new turn's commit starts.
|
|
||||||
5. **No config field is read by a worker while being mutated** (reload is sequenced
|
|
||||||
against in-flight work; a response uses an immutable snapshot of model/voice/tools).
|
|
||||||
6. **Audio buffer monotonic & consistent**: commit/clear/append/VAD-drop never lose or
|
|
||||||
double-consume bytes; `clear` resets *all* turn state (including `lts`).
|
|
||||||
7. **No dropped control events**: an EOU/Final is never silently lost (no overflow-drop
|
|
||||||
on a bounded channel that changes turn outcome).
|
|
||||||
8. **Clean teardown**: every spawned goroutine (incl. `decodeOpusLoop`,
|
|
||||||
`handleIncomingAudioTrack`) is signalled and joined before the session is deleted; no
|
|
||||||
sends after transport close.
|
|
||||||
9. **Compaction safety & liveness**: compaction never races a reader into a torn
|
|
||||||
`Items`; and it actually runs when the trigger is exceeded, including inside long
|
|
||||||
agentic turns.
|
|
||||||
10. **Idempotent close**: every channel/resource closed exactly once on every path.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Implementation status
|
|
||||||
|
|
||||||
- **M3 (response coordination) — first vertical slice landed.** Explicit machine in
|
|
||||||
`core/http/endpoints/openai/respcoord/` (sealed `State`/`Event`/`Effect` sum types, a
|
|
||||||
total pure `Next`, a single-writer `Coordinator`); transition-table + Ginkgo/Gomega
|
|
||||||
seeded-property + concurrent conformance tests (green under `-race`); a deterministic
|
|
||||||
characterization test pinning the legacy dual-writer race. Authoritative spec:
|
|
||||||
`formal-verification/response_lifecycle.fizz`. Gate:
|
|
||||||
`scripts/realtime-conformance.sh` (Go layer always; FizzBee when pinned) wired as
|
|
||||||
`make test-realtime-conformance` and `.github/workflows/realtime-conformance.yml`. See
|
|
||||||
`formal-verification/README.md`.
|
|
||||||
- **Gate is fail-closed and pinned (done).** `fizzbee.sha256` pins all four platforms;
|
|
||||||
the gate hard-fails without FizzBee; CI installs+caches the verified binary with no skip;
|
|
||||||
pre-commit runs the gate on `respcoord/**` or `formal-verification/**` changes.
|
|
||||||
- **M3 wired into the live session (done).** `realtime_respcoord.go` adds `responseSink`
|
|
||||||
(the `respcoord.Coordinator` + a goroutine-spawning effect sink) to `Session`. The legacy
|
|
||||||
`startResponse`/`cancelActiveResponse` and the dual-writer `activeResponse*`/`responseMu`
|
|
||||||
fields are gone; all six call sites (manual commit, `response.create`, VAD speech-stopped,
|
|
||||||
`response.cancel`, barge-in, teardown) route through it. Barge-in/cancel are now
|
|
||||||
non-blocking (removes the legacy ~300 ms VAD stall); teardown stops input goroutines, then
|
|
||||||
cancels + `wait()`s all response goroutines before deleting the session. `EmitTerminal` is
|
|
||||||
a no-op for now (the response body still emits its own `response.done`) — coordination is
|
|
||||||
fixed without changing wire behavior. Verified: builds, `go vet` clean, all 300 openai
|
|
||||||
specs pass under `-race`, and `make test-realtime` (the mock-backend realtime e2e suite,
|
|
||||||
12 specs over WS + WebRTC) passes.
|
|
||||||
- **Single authoritative terminal + populated Output/Usage (done).** One
|
|
||||||
`response.created` and one `response.done` per `response.create`, even across the
|
|
||||||
server-side agentic tool loop (which is now internal turns of one response, not one
|
|
||||||
terminal each). A `liveResponse` accumulator threads through
|
|
||||||
`triggerResponse`→`triggerResponseAtTurn`→`emitToolCallItems`/`streamLLMResponse`,
|
|
||||||
collecting output items as they complete and summing token usage; `triggerResponse`
|
|
||||||
emits the one terminal (completed/cancelled; failed still emits none, matching legacy)
|
|
||||||
with `Output` + `Usage` filled in (both were always empty before). Verified: 301 openai
|
|
||||||
specs under `-race` (incl. a new `triggerResponse` terminal test) + `make test-realtime`.
|
|
||||||
Design note: emission is hoisted to `triggerResponse` (the body owns it) rather than the
|
|
||||||
coordinator's `EmitTerminal` effect — at cancel/supersede time the coordinator doesn't
|
|
||||||
yet have the body's partial Output, so the body, which does, is the natural emitter. The
|
|
||||||
coordinator still guarantees one body run per `response.create`, so "exactly one terminal"
|
|
||||||
holds transitively; `EmitTerminal` remains the spec's logical marker (no-op in the sink).
|
|
||||||
- **M2 (turn detection) — model + spec landed AND wired into the live session.**
|
|
||||||
Explicit machine in `core/http/endpoints/openai/turncoord/` (sealed `State` =
|
|
||||||
`Idle | Speaking{Turn}`, `Event` = `Onset | Silence | Abort{Reason}`, `Effect` =
|
|
||||||
`BargeIn | OpenTurn | EmitSpeechStarted | EmitSpeechStopped | CommitTurn |
|
|
||||||
DiscardTurn`, a total pure `Next`, a single-writer `Coordinator`);
|
|
||||||
transition-table + Ginkgo/Gomega seeded-property + concurrent conformance tests
|
|
||||||
(green under `-race`). The fix it encodes: "speech detected" and "a turn is open"
|
|
||||||
— the two legacy variables (`speechStarted` and `lts.open()`) that a `discardTurn`
|
|
||||||
could desync (failure mode 4) — become ONE state, so the next-onset suppression
|
|
||||||
bug is unrepresentable. Authoritative spec:
|
|
||||||
`formal-verification/turn_lifecycle.fizz`, with an `always assertion Coupled`
|
|
||||||
(speech ⟺ turn-open), verified non-vacuous (deleting `self.speech = 0` in `Abort`
|
|
||||||
makes the checker report `Coupled` violated). The gate
|
|
||||||
(`scripts/realtime-conformance.sh`, pre-commit, CI) covers `turncoord` and the
|
|
||||||
spec. **Wired (done):** `realtime_turncoord.go` adds `turnSink` (the
|
|
||||||
`turncoord.Coordinator` + a loop-local effect sink) to `handleVAD`. The legacy
|
|
||||||
`speechStarted` bool is gone; onset/no-speech-clear/commit/teardown route through
|
|
||||||
`coord.Apply(Onset|Abort{NoSpeech}|Silence|Abort{Teardown})`. The turn id is
|
|
||||||
minted at onset and carried by the coordinator to the committed event (so it
|
|
||||||
matches the live captions); `liveTurnState.openTurn` now takes that id instead of
|
|
||||||
minting its own. A semantic→server mode switch mid-turn is deliberately NOT an
|
|
||||||
abort (it only drops the orphaned live stream and lets the turn continue under
|
|
||||||
server_vad), so it stays inline. Verified: builds, `go vet`/`gofmt`/golangci-lint
|
|
||||||
clean, all openai specs under `-race`, and `make test-realtime` (12 e2e specs over
|
|
||||||
WS + WebRTC) pass.
|
|
||||||
- **M1 (connection lifecycle) — model + spec landed AND wired.** Explicit machine
|
|
||||||
in `core/http/endpoints/openai/conncoord/` (sealed `State` = `Live{VADRunning} |
|
|
||||||
Torn`, `Event` = `SetVAD | Close`, `Effect` = `StartVAD | StopVAD | Teardown`, a
|
|
||||||
total pure `Next`, a single-writer `Coordinator`); transition-table +
|
|
||||||
Ginkgo/Gomega seeded-property + concurrent conformance tests (green under
|
|
||||||
`-race`). It replaces the legacy `vadServerStarted` bool + the `done` channel
|
|
||||||
reassigned on every turn-detection toggle and closed from two sites (failure
|
|
||||||
mode 6): the coordinator owns whether the VAD goroutine runs, so its done channel
|
|
||||||
is closed exactly once and never resurrected after teardown; `Close` moves to
|
|
||||||
`Torn`, which absorbs every later event so teardown runs exactly once even from
|
|
||||||
multiple exit paths (invariants #8, #10). Spec:
|
|
||||||
`formal-verification/conn_lifecycle.fizz` (`always assertion TeardownOnce` +
|
|
||||||
`NoRunAfterTorn`), verified non-vacuous (deleting `self.torn = 1` in `Close`
|
|
||||||
fails `TeardownOnce`). **Wired (done):** `realtime_conncoord.go` adds `connSink`;
|
|
||||||
the handler's setup/`toggleVAD`/teardown now route through
|
|
||||||
`conn.setVAD(...)`/`conn.close()`; the `done`/`vadServerStarted` locals and the
|
|
||||||
manual ordered-teardown block are gone (the Teardown effect performs that
|
|
||||||
sequence). Verified: builds, vet/gofmt/golangci-lint clean, openai specs under
|
|
||||||
`-race`, `make test-realtime` (12 e2e WS+WebRTC), full conformance gate green
|
|
||||||
(3 Go packages + 3 fizz specs PASSED).
|
|
||||||
- **M4 (conversation compaction) — model + spec landed AND wired.** Explicit
|
|
||||||
machine in `core/http/endpoints/openai/compactcoord/` (sealed `State` =
|
|
||||||
`Idle | Running`, `Event` = `Trigger | Finished`, `Effect` = `StartCompaction`,
|
|
||||||
a total pure `Next`, a single-writer `Coordinator`); transition-table +
|
|
||||||
Ginkgo/Gomega seeded-property + concurrent (effect-spawns-work-reports-Finished)
|
|
||||||
conformance tests (green under `-race`). It makes the legacy `compacting
|
|
||||||
atomic.Bool` single-flight guard explicit: a `Trigger` while `Running` is dropped
|
|
||||||
(not superseded — compaction is idempotent work on the same overflow), so at most
|
|
||||||
one summarize+evict runs per conversation (invariant #9). Spec:
|
|
||||||
`formal-verification/compaction.fizz` (`always assertion SingleFlight`), verified
|
|
||||||
non-vacuous (deleting the `if self.active == 0` guard fails `SingleFlight`).
|
|
||||||
**Wired (done):** `realtime_compactcoord.go` adds `compactionSink`; the
|
|
||||||
`Conversation.compacting atomic.Bool` is replaced by `Conversation.compaction
|
|
||||||
*compactionSink` (built at conversation creation with the summarize+evict run
|
|
||||||
closure); `maybeCompact` now calls `conv.compaction.trigger()`. The summarizer
|
|
||||||
resolution + `compact()` stay in the sink's spawned goroutine (off the response
|
|
||||||
path); `compact()` itself (snapshot/summarize-unlocked/optimistic-commit) is
|
|
||||||
unchanged. Verified: builds, vet/gofmt/golangci-lint clean, openai specs under
|
|
||||||
`-race`, `make test-realtime` (12 e2e), full conformance gate green (4 Go
|
|
||||||
packages + 4 fizz specs PASSED).
|
|
||||||
- **M5 (TTS pipeline lifecycle) — model + spec landed AND wired.** Explicit
|
|
||||||
machine in `core/http/endpoints/openai/ttscoord/` (sealed `State` =
|
|
||||||
`Open | Closing | Closed`, `Event` = `Close | WorkerExited`, `Effect` = `Wake`, a
|
|
||||||
total pure `Next`, a single-writer `Coordinator`); transition-table +
|
|
||||||
Ginkgo/Gomega seeded-property + two-writer conformance tests (green under
|
|
||||||
`-race`). It is a genuine two-writer machine (producer `Close` from `wait()` vs
|
|
||||||
worker `WorkerExited`); it makes the legacy `closed bool` lifecycle explicit and
|
|
||||||
monotonic, fixes the latent enqueue-after-close silent drop (enqueue is now gated
|
|
||||||
on `Open`), and guarantees idempotent `wait()` (one wake / one worker join). The
|
|
||||||
poison `failed` latch stays a lock-free `atomic.Bool` (orthogonal, read per
|
|
||||||
clause on the worker's hot path). Spec: `formal-verification/tts_pipeline.fizz`
|
|
||||||
(`always assertion WakeOnce` + `Monotonic`), verified non-vacuous (deleting the
|
|
||||||
`if self.phase == 0` guard in `Close` fails `WakeOnce`). **Wired (done):**
|
|
||||||
`realtime_tts_pipeline.go`'s `ttsPipeline` embeds the coordinator (and is its
|
|
||||||
effect sink — `Wake` → `signal()`); `closed bool` is gone; the worker checks
|
|
||||||
`closing()` and raises `WorkerExited` on drain, `enqueue` rejects once not
|
|
||||||
`Open`, `wait()` raises `Close`. The wake/done channel mechanics are unchanged.
|
|
||||||
Verified: builds, vet/gofmt/golangci-lint clean, openai specs under `-race`,
|
|
||||||
`make test-realtime` (12 e2e), full conformance gate green (5 Go packages + 5
|
|
||||||
fizz specs PASSED).
|
|
||||||
- **All five mapped machines (M1–M5) are now explicit, wired, and verified.** The
|
|
||||||
realtime-conformance gate model-checks all `.fizz` specs and runs all five Go
|
|
||||||
conformance suites under `-race`, fail-closed.
|
|
||||||
- **The machines form a hierarchy, and that relationship is now modeled and
|
|
||||||
enforced.** M1 (connection) is the parent region; when it tears down, every child
|
|
||||||
must be terminal. Previously this was only an imperative side effect of
|
|
||||||
`conncoord`'s teardown ordering, with a real gap (M4 compaction was
|
|
||||||
fire-and-forget and could outlive the torn session). Now:
|
|
||||||
- `formal-verification/session_lifecycle.fizz` is a **composition spec** that
|
|
||||||
models conn + its direct children (vad/M2, resp/M3, compaction/M4) as one
|
|
||||||
statechart and asserts `ChildrenDieWithParent` (conn torn ⟹ all children
|
|
||||||
terminal) plus "no child starts after teardown". Its non-vacuity reproduces the
|
|
||||||
exact M4 gap (drop the compaction-terminate line → assertion fails).
|
|
||||||
- `respcoord` (M3) and `compactcoord` (M4) gained an absorbing **`Terminated`**
|
|
||||||
state + a `Shutdown` event, so a response/compaction cannot start after
|
|
||||||
teardown (structural "no resurrection").
|
|
||||||
- `conncoord`'s `Teardown` effect now explicitly drives the children terminal:
|
|
||||||
stop+join the VAD goroutine (M2), `respSink.shutdown()` (M3 → Terminated, joins
|
|
||||||
response goroutines and their M5 pipelines), and `compaction.shutdown()` for
|
|
||||||
every conversation (M4: cancel the in-flight summary via a session-scoped
|
|
||||||
context, then join — **closing the gap**). `compact` now takes a `context` so
|
|
||||||
teardown can bound the join. M2's terminal is realized by the goroutine join and
|
|
||||||
M5's by its existing `Closed`; the persistent coordinators (M3/M4) carry the
|
|
||||||
explicit `Terminated` state.
|
|
||||||
|
|
||||||
## Part 5 — Library vs hand-rolled (Go ecosystem, verified 2026-06)
|
|
||||||
|
|
||||||
Researched against live GitHub/pkg.go.dev data. **Verdict: hand-roll a typed transition
|
|
||||||
table over sealed sum-type states for the per-connection machines.** No Go library gives
|
|
||||||
the two properties we most want — *compile-time-illegal states* and a *pure
|
|
||||||
`next(state,event)->(state,[]effect,error)`*; every library models states as
|
|
||||||
`string`/`int`/`any` and fires side-effecting callbacks mid-transition. And since the
|
|
||||||
actor (Part 3.1) drives everything from one goroutine, the libraries' main value-add —
|
|
||||||
internal locking — is dead weight.
|
|
||||||
|
|
||||||
Library landscape:
|
|
||||||
|
|
||||||
| Option | Stars / status | Hierarchy | Typed states | Illegal-transition | Viz | Fit |
|
|
||||||
|---|---|---|---|---|---|---|
|
|
||||||
| **hand-rolled table + sealed sum types** | — | DIY (parent field / nested switch) | **yes** (sealed iface) | explicit `default:` | ~30 LOC Mermaid emitter | **best** |
|
|
||||||
| **qmuntal/stateless** (port of .NET Stateless) | 1.36k, v1.8.0 2026-02, maintained | yes (substates, guards, entry/exit, internal/ignored) | `any` | `error` + `OnUnhandledTrigger` + `PermittedTriggers` | DOT | best library fallback if hierarchy grows |
|
|
||||||
| **looplab/fsm** | 3.4k, v1.0.3 2025-05, maintained | flat | strings | typed errors | **DOT+Mermaid** | only for flat machines wanting free diagrams |
|
|
||||||
| cocoonspace/fsm | 89, dormant 2021 | flat | int | `bool` no-op | — | lock-free but dead; DIY beats it |
|
|
||||||
| true Harel statecharts (gstate, statechartx) | ≤10, <1yr, single-author | parallel+history | varies | varies | varies | only if we truly need parallel regions; unproven |
|
|
||||||
| Temporal / Cadence | large, maintained | n/a | n/a | n/a | n/a | **overkill** — external cluster+DB, durable replay, wrong latency class |
|
|
||||||
|
|
||||||
Decision: hand-roll; keep **qmuntal/stateless** as the fallback if one machine grows deep
|
|
||||||
hierarchy/guards faster than we want to hand-maintain (its `error`-on-illegal-trigger and
|
|
||||||
`PermittedTriggers()` are the most useful library features for our "reject illegal
|
|
||||||
transitions" requirement, at the cost of `any`-typed states). Add a tiny Mermaid emitter
|
|
||||||
over the hand-rolled table so we keep the visualization the libraries advertise.
|
|
||||||
|
|
||||||
## Part 6 — Formal design tied to code, and making it authoritative
|
|
||||||
|
|
||||||
The user requirement: the formal design is **authoritative** — a coding agent should be
|
|
||||||
unable to silently change implementation behavior without it being caught against the
|
|
||||||
spec; the default path is "update the spec and re-verify," not "edit the code and ignore
|
|
||||||
the spec." This is a *conformance + enforcement* problem, in three layers.
|
|
||||||
|
|
||||||
### 6.1 The source of truth & design-time check
|
|
||||||
|
|
||||||
Write the concurrency-critical core — the **M2↔M3 boundary** (turn detection ↔ response:
|
|
||||||
barge-in, ResponseCancel, speech_stopped, the dual-writer race) — as a **FizzBee** spec
|
|
||||||
and **model-check it in CI**. Keep the spec small and focused on M2↔M3; that is where the
|
|
||||||
real safety/liveness properties (Part 4 invariants 1–4) live. (FizzBee is the adopted
|
|
||||||
model checker — see Part 6.4.)
|
|
||||||
|
|
||||||
### 6.2 The conformance bridge (code ↔ spec)
|
|
||||||
|
|
||||||
The honest finding: design-time model checking is well-supported; the *Go conformance
|
|
||||||
bridge is thin everywhere* and needs per-spec glue. Two layers, adopted together:
|
|
||||||
|
|
||||||
1. **FizzBee MBT** — the authoritative layer. The `.fizz` spec is model-checked, and
|
|
||||||
`fizz mbt-scaffold --lang go` generates Go interfaces + a `go test` harness; you
|
|
||||||
implement adapters mapping model actions→code and `StateGetter`→state. Conformance
|
|
||||||
runs as plain `go test` — the cleanest CI fit. Risk: pre-1.0, essentially one
|
|
||||||
maintainer (pin a version + sha256, vendor examples).
|
|
||||||
2. **Ginkgo/Gomega seeded property tests** — the Go-native floor. A small Go model
|
|
||||||
(the test's `open`/`registered` shadow) is the oracle; a fixed-seed random walk
|
|
||||||
drives random event sequences against the `Coordinator`, asserting the Part-4
|
|
||||||
invariants after each step / per seed. It checks the *implementation* against a Go
|
|
||||||
oracle — it complements, but does not replace, the FizzBee check of the *design*.
|
|
||||||
(We originally specced `pgregory.net/rapid` here for its `(*T).Repeat` driver and
|
|
||||||
automatic shrinking, but LocalAI mandates Ginkgo/Gomega for all tests — its
|
|
||||||
`forbidigo` lint forbids stdlib `testing` assertions — and `rapid.Check` needs a
|
|
||||||
concrete `*testing.T`/`*rapid.T` that cannot run inside a Ginkgo `It`. Rather than
|
|
||||||
weaken the lint gate with an exclusion, the property layer is hand-rolled seeded
|
|
||||||
walks: fixed seeds make every failure reproducible, at the cost of `rapid`'s
|
|
||||||
automatic shrinking. `rapid` is consequently not a direct dependency.)
|
|
||||||
|
|
||||||
These compose: model-check the design (6.1) for "the design is right"; conformance-test
|
|
||||||
the code (6.2) for "the code matches the design." Add `go test -race` (with `-cpu=1,2,4`,
|
|
||||||
repeated runs) over the stateful tests for interleaving-bug discovery, and Go native
|
|
||||||
fuzzing over the *same* harness for coverage-guided sequence exploration + a committable
|
|
||||||
regression corpus. (`testing/quick` is frozen — do not use.)
|
|
||||||
|
|
||||||
There is no viable single-source-of-truth codegen (one spec compiled into both the runtime
|
|
||||||
Go and the model) for retrofitting existing Go — the candidates are research-grade and
|
|
||||||
greenfield-only. Our practical substitute is the CI gate below plus a single Go transition
|
|
||||||
table that emits both the diagram and the test action set.
|
|
||||||
|
|
||||||
### 6.3 Enforcement — making the design un-ignorable for agents
|
|
||||||
|
|
||||||
Structural enforcement, leveraging this repo's existing non-bypassable gate culture
|
|
||||||
(pre-commit + monotonic ratchets; `--no-verify` is forbidden, baselines never lowered):
|
|
||||||
|
|
||||||
1. **Add a `realtime-conformance` gate** to the pre-commit/CI pipeline that runs (a) the
|
|
||||||
model check (6.1) and (b) the conformance bridge (6.2). A behavior change that does not
|
|
||||||
conform turns the gate **red**; the only green paths are *make the code conform* or
|
|
||||||
*update the spec* — and updating the spec re-triggers the model check, so an illegal
|
|
||||||
design is rejected too. This is the actual mechanism that makes "update the design and
|
|
||||||
verify" the default rather than optional.
|
|
||||||
2. **Treat the spec as a ratchet artifact** like coverage: the gate must not be weakened,
|
|
||||||
the spec not deleted, the build tag not silently disabled.
|
|
||||||
3. **Write an `.agents/realtime-state-machines.md` guide** (indexed from `CLAUDE.md`)
|
|
||||||
stating the spec is the source of truth: change the spec first, re-run the gate, then
|
|
||||||
implement. The doc is secondary; the gate is what enforces it.
|
|
||||||
|
|
||||||
### 6.4 Decided stack
|
|
||||||
|
|
||||||
- **Implementation:** hand-rolled sealed-state transition functions + single-writer actor
|
|
||||||
(Parts 3.1–3.2).
|
|
||||||
- **Design-time + conformance:** **FizzBee** (decided). `.fizz` spec is model-checked, and
|
|
||||||
`fizz`'s Go MBT generator (`mbt/generator/templates/go` → interfaces/adapters/test;
|
|
||||||
driven via a gRPC plugin in `mbt/lib/go`) produces a `go test` conformance harness
|
|
||||||
whose adapters map model actions → our actor and `StateGetter` → our state. Go is a
|
|
||||||
first-class MBT target (Go + Rust are the only two). Verified 2026-06: Apache-2.0,
|
|
||||||
v0.5.2, prebuilt linux/macos×x86/arm binaries, ships Claude Code skills
|
|
||||||
(`/fizz-spec|check|debug|mbt`) for the spec-authoring loop.
|
|
||||||
- **Go-native layer:** **Ginkgo/Gomega seeded property tests** run alongside — they
|
|
||||||
check the *implementation*, complementing (not substituting for) the FizzBee check
|
|
||||||
of the *design*. Skipping FizzBee is NOT "degrading to the Go layer": the design
|
|
||||||
authority would be gone. The gate is therefore **fail-closed** (see Enforcement).
|
|
||||||
(Originally specced as `rapid`; switched to Ginkgo/Gomega to satisfy LocalAI's
|
|
||||||
Ginkgo-only `forbidigo` lint without weakening that gate — see Part 6.2.)
|
|
||||||
- **Enforcement:** the `realtime-conformance` pre-commit/CI gate + `.agents/` guide
|
|
||||||
(Part 6.3).
|
|
||||||
|
|
||||||
FizzBee risk mitigations (decided):
|
|
||||||
- The gate is **fail-closed**: a missing FizzBee is a hard failure, never a silent skip.
|
|
||||||
The only bypass is the explicit, loud `REALTIME_CONFORMANCE_SKIP_FIZZBEE=1` (local
|
|
||||||
only; CI never sets it; pre-commit runs the gate on any `respcoord/**`, `turncoord/**`, `conncoord/**`, `compactcoord/**`, `ttscoord/**`, or
|
|
||||||
`formal-verification/**` change so a pure `.fizz` edit still re-verifies).
|
|
||||||
- CI **pins the FizzBee release binary by version + sha256** (`formal-verification/fizzbee.sha256`,
|
|
||||||
all four platforms, digests from the GitHub release; installer verifies before extract,
|
|
||||||
CI caches it). Not go-gettable: `pkg/modelchecker` imports the Bazel-internal `fizz/proto`
|
|
||||||
with no committed `.pb.go`, so a plain `go get` won't build — hence the pinned binary.
|
|
||||||
- Keep the `.fizz` model **portable** (no exotic features) so it stays re-expressible in
|
|
||||||
another model checker if FizzBee is ever abandoned — lock-in is at the tooling layer
|
|
||||||
only, not the design.
|
|
||||||
|
|
||||||
## Open questions (decide before implementing)
|
|
||||||
|
|
||||||
- **Scope of the actor refactor**: full single-writer per session, or incrementally
|
|
||||||
migrate one machine at a time behind the existing locks? (Suggest: M3 response
|
|
||||||
coordination first — it has the load-bearing dual-writer bug.)
|
|
||||||
|
|
||||||
Resolved: **FSM library vs hand-rolled** → hand-rolled sealed-state tables,
|
|
||||||
qmuntal/stateless fallback (Part 5). **Conformance bridge** → FizzBee (model-check + Go
|
|
||||||
MBT) with a Ginkgo/Gomega seeded-property Go-native floor as hedge (Part 6.4). **Single-source-of-truth codegen**
|
|
||||||
(PGo/MPCal) → not viable (research-grade, greenfield-only); substitute is the CI
|
|
||||||
conformance gate (Part 6.3).
|
|
||||||
|
|
||||||
**Agentic turn semantics** → invariant #2 is **one `response.done` per `response.create`**
|
|
||||||
(OpenAI-faithful); the server-side `AssistantExecutor` tool loop becomes internal
|
|
||||||
sub-states of a single response rather than emitting one terminal per turn. Verified safe
|
|
||||||
in-tree: the current `response.done` carries only `{id, object, status}` (`Output`/`Usage`
|
|
||||||
never populated), the React UI (`Talk.jsx:330`) reads only `status`, every unit test
|
|
||||||
already asserts `ResponseDone == 1` for tool turns, no test expects multiplicity, and the
|
|
||||||
server-side recursion is untested. Collapsing also fixes a latent "Listening…" flicker
|
|
||||||
mid-agentic-loop. The client-driven tool loop (fresh `response.create` per round-trip)
|
|
||||||
legitimately keeps one terminal each — unaffected. Follow-up: actually populate `Output` +
|
|
||||||
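`Usage` in the single terminal (always empty at the time of this decision; since done — see "Single authoritative terminal + populated Output/Usage" under Implementation status).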
|
|
||||||
@@ -1,142 +0,0 @@
|
|||||||
# Formal verification — realtime state machines
|
|
||||||
|
|
||||||
Formal designs (FizzBee specs) for the realtime API state machines and the harness
|
|
||||||
that keeps the Go implementation provably in step with them. Background and
|
|
||||||
rationale: [../docs/design/realtime-state-machines.md](../docs/design/realtime-state-machines.md) (Part 6).
|
|
||||||
|
|
||||||
The design is **authoritative**: behaviour changes go through the spec first, then
|
|
||||||
the implementation is checked against it. The `realtime-conformance` gate makes
|
|
||||||
that the path of least resistance — you cannot land a non-conforming change green.
|
|
||||||
|
|
||||||
## What's here
|
|
||||||
|
|
||||||
| File | Role |
|
|
||||||
|------|------|
|
|
||||||
| `response_lifecycle.fizz` | **Authoritative** FizzBee model of machine M3 (response coordination). Model-checked + drives the Go MBT conformance harness. |
|
|
||||||
| `turn_lifecycle.fizz` | **Authoritative** FizzBee model of machine M2 (turn detection): the speechStarted / turn-open coupling. |
|
|
||||||
| `conn_lifecycle.fizz` | **Authoritative** FizzBee model of machine M1 (connection lifecycle): VAD toggle + once-only teardown. |
|
|
||||||
| `compaction.fizz` | **Authoritative** FizzBee model of machine M4 (conversation compaction): single-flight. |
|
|
||||||
| `tts_pipeline.fizz` | **Authoritative** FizzBee model of machine M5 (TTS pipeline): open->closing->closed, idempotent close. |
|
|
||||||
| `session_lifecycle.fizz` | **Composition** spec: the M1–M5 hierarchy — conn (M1) is the parent; when it is torn down, every child (vad/M2, resp/M3, compaction/M4) is terminal. Models the relationship the per-machine specs can't express. |
|
|
||||||
| `fizzbee.sha256` | Pinned checksum(s) of the FizzBee release the gate uses (created on first `install-fizzbee.sh` run). |
|
|
||||||
|
|
||||||
The implementations under test live in
|
|
||||||
[`core/http/endpoints/openai/respcoord`](../core/http/endpoints/openai/respcoord) (M3),
|
|
||||||
[`core/http/endpoints/openai/turncoord`](../core/http/endpoints/openai/turncoord) (M2),
|
|
||||||
[`core/http/endpoints/openai/conncoord`](../core/http/endpoints/openai/conncoord) (M1),
|
|
||||||
[`core/http/endpoints/openai/compactcoord`](../core/http/endpoints/openai/compactcoord) (M4),
|
|
||||||
and [`core/http/endpoints/openai/ttscoord`](../core/http/endpoints/openai/ttscoord) (M5).
|
|
||||||
|
|
||||||
## Running the gate
|
|
||||||
|
|
||||||
```sh
|
|
||||||
make test-realtime-conformance
|
|
||||||
# or directly:
|
|
||||||
./scripts/realtime-conformance.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
Two layers, **both required — the gate is fail-closed**:
|
|
||||||
|
|
||||||
1. **Go-native conformance** — the `respcoord` + `turncoord` + `conncoord` + `compactcoord` + `ttscoord` transition-table
|
|
||||||
tests + Ginkgo/Gomega seeded property (random-walk) tests under `-race`
|
|
||||||
(checks the implementation), plus the shared `coordinator` runtime they all
|
|
||||||
build on. Also run as part of `make test` (they're ordinary Go packages with a
|
|
||||||
Ginkgo suite each). The five machines reduce to their sealed State/Event/Effect
|
|
||||||
types + a pure `Next`; the single-writer Coordinator/Sink plumbing lives once in
|
|
||||||
`core/http/endpoints/openai/coordinator` (a generic `Coordinator[S,E,F]`).
|
|
||||||
2. **FizzBee model check** — model-checks the authoritative `.fizz` specs (checks
|
|
||||||
the design). **A missing FizzBee is a hard failure, not a skip** — otherwise
|
|
||||||
the design verification silently disappears whenever the tool is inconvenient,
|
|
||||||
which is the whole thing we're trying to prevent.
|
|
||||||
|
|
||||||
FizzBee is pinned and checksum-verified (`fizzbee.sha256`), so "couldn't install"
|
|
||||||
is not a reason to skip — run `make install-fizzbee`. The **only** way to skip is
|
|
||||||
the explicit, loud `REALTIME_CONFORMANCE_SKIP_FIZZBEE=1` opt-out, intended for
|
|
||||||
local work on unrelated code. CI never sets it, and `pre-commit` runs the full
|
|
||||||
gate whenever `respcoord/**`, `turncoord/**`, `conncoord/**`, `compactcoord/**`, `ttscoord/**`, or `formal-verification/**` is
|
|
||||||
staged (so a pure `.fizz` edit still re-verifies).
|
|
||||||
|
|
||||||
## Installing FizzBee (pinned)
|
|
||||||
|
|
||||||
FizzBee is pre-1.0 and single-maintainer, so we pin a version + sha256 and use the
|
|
||||||
prebuilt release tarball (its primary build is Bazel — it is **not** go-gettable:
|
|
||||||
the `pkg/modelchecker` library imports the Bazel-internal `fizz/proto` with no
|
|
||||||
committed `.pb.go`, so a plain `go get` won't build it).
|
|
||||||
|
|
||||||
```sh
|
|
||||||
make install-fizzbee # = scripts/install-fizzbee.sh (default v0.5.2)
|
|
||||||
```
|
|
||||||
|
|
||||||
The four platform assets are pinned by sha256 in `fizzbee.sha256` (digests taken
|
|
||||||
from the GitHub release); the installer verifies before extracting. Heads-up: the
|
|
||||||
Linux bundles are large (~290–350 MB, because `parser_bin` embeds a full runtime),
|
|
||||||
macOS ~36 MB. CI caches `.tools/fizzbee` keyed on the pin so it downloads once.
|
|
||||||
|
|
||||||
This unpacks a **self-contained** directory under `.tools/fizzbee/` (gitignored):
|
|
||||||
|
|
||||||
```
|
|
||||||
.tools/fizzbee/
|
|
||||||
fizz -> stable symlink the gate auto-detects
|
|
||||||
fizzbee-v0.5.2-linux_x86/
|
|
||||||
fizz # CLI wrapper (entrypoint)
|
|
||||||
parser/parser_bin # the .fizz frontend, BUNDLED (no system Python needed)
|
|
||||||
fizzbee # Go model-checker binary
|
|
||||||
fizz.env # resolves the above paths relative to `fizz`
|
|
||||||
mbt_gen.zip # MBT generator (this one DOES need system python)
|
|
||||||
```
|
|
||||||
|
|
||||||
Keep the directory intact — `fizz.env` resolves its siblings relative to the
|
|
||||||
`fizz` wrapper. The gate auto-detects `.tools/fizzbee/fizz`; override with
|
|
||||||
`FIZZBEE_BIN` only if you installed elsewhere (point it at the `fizz` wrapper,
|
|
||||||
not the raw `fizzbee` binary).
|
|
||||||
|
|
||||||
The first `install-fizzbee.sh` run prints the computed sha256; record it in
|
|
||||||
`fizzbee.sha256` as `<sha256> <asset>` and commit so later runs verify the pin.
|
|
||||||
|
|
||||||
> CLI facts (validate against the pinned version — FizzBee is pre-1.0): the CLI
|
|
||||||
> is `fizz [flags] <spec.fizz>` (default = exhaustive BFS); there is **no `run`
|
|
||||||
> subcommand**. The checker can print `FAILED`/`DEADLOCK` while still exiting 0,
|
|
||||||
> so the gate scans output for those markers in addition to the exit code.
|
|
||||||
> Model-checking needs only the bundled `parser_bin` (no Python); only
|
|
||||||
> `mbt-scaffold` shells out to system `python`.
|
|
||||||
|
|
||||||
## Reproducing the bug the spec catches
|
|
||||||
|
|
||||||
Each spec models the **correct** design, so it passes; each documents how to
|
|
||||||
reproduce the legacy bug it guards against:
|
|
||||||
|
|
||||||
- `response_lifecycle.fizz` (M3): change `atomic func start()` to
|
|
||||||
`serial func start()` — the checker reports `AtMostOneLive` violated (the
|
|
||||||
dual-writer race). Pinned deterministically in Go by the respcoord
|
|
||||||
"legacy dual-writer characterization" spec.
|
|
||||||
- `turn_lifecycle.fizz` (M2): in `Abort`, delete `self.speech = 0` (clear only
|
|
||||||
the turn, as the legacy `discardTurn` did) — the checker reports `Coupled`
|
|
||||||
violated (the speechStarted/turn-open desync that suppressed the next onset).
|
|
||||||
- `conn_lifecycle.fizz` (M1): in `Close`, delete `self.torn = 1` — the checker
|
|
||||||
reports `TeardownOnce` violated (the legacy double-teardown / double-close
|
|
||||||
hazard when a session reaches teardown from more than one exit path).
|
|
||||||
- `compaction.fizz` (M4): in `Trigger`, delete the `self.active == 0` condition from the guard —
|
|
||||||
the checker reports `SingleFlight` violated (two goroutines compacting the same
|
|
||||||
overflow concurrently, the race the `compacting` CAS prevents).
|
|
||||||
- `tts_pipeline.fizz` (M5): in `Close`, delete the `if self.phase == 0` guard —
|
|
||||||
the checker reports `WakeOnce` violated (a non-idempotent wait() that wakes /
|
|
||||||
joins the worker more than once).
|
|
||||||
- `session_lifecycle.fizz` (hierarchy): in `Teardown`, delete `self.compaction = 2`
|
|
||||||
— the checker reports `ChildrenDieWithParent` violated. This is the real M4 gap:
|
|
||||||
a fire-and-forget compaction outliving the torn session. The fix is `conncoord`'s
|
|
||||||
teardown cancelling + joining each conversation's compaction (and respcoord/
|
|
||||||
compactcoord gained an absorbing `Terminated` state so no child can start after
|
|
||||||
teardown).
|
|
||||||
|
|
||||||
## Adding another machine
|
|
||||||
|
|
||||||
All five mapped machines (M1–M5) have landed. To add a new sealed-state machine:
|
|
||||||
|
|
||||||
1. Add `<machine>.fizz` here (with an `always assertion`; verify non-vacuity by
|
|
||||||
breaking one guard and confirming the checker fails).
|
|
||||||
2. Implement it as a sealed-state package under `core/http/endpoints/openai/`.
|
|
||||||
3. Add transition-table + Ginkgo/Gomega seeded property conformance tests
|
|
||||||
(one `*_suite_test.go` bootstrap per package; LocalAI mandates Ginkgo/Gomega).
|
|
||||||
4. The gate picks up new `*.fizz` specs automatically; add the new Go package to
|
|
||||||
the `-race` test list in `scripts/realtime-conformance.sh` (and the path
|
|
||||||
filters in `.githooks/pre-commit` + `.github/workflows/realtime-conformance.yml`).
|
|
||||||
@@ -1,57 +0,0 @@
|
|||||||
---
|
|
||||||
# Authoritative formal design for realtime machine M4: conversation compaction.
|
|
||||||
#
|
|
||||||
# Companion to:
|
|
||||||
# - docs/design/realtime-state-machines.md (the map + invariants)
|
|
||||||
# - core/http/endpoints/openai/compactcoord (the Go implementation)
|
|
||||||
#
|
|
||||||
# The Go MBT adapter maps each action below onto compactcoord.Coordinator.Apply
|
|
||||||
# and the StateGetter onto compactcoord.Coordinator.State, so this spec is the
|
|
||||||
# source of truth the implementation is checked against.
|
|
||||||
#
|
|
||||||
# The property: at most one background compaction runs per conversation at a time,
|
|
||||||
# so two goroutines never summarize+evict the same overflow concurrently (Part 4,
|
|
||||||
# invariant #9). The legacy guard is a `compacting atomic.Bool` CAS; here `active`
|
|
||||||
# is the number of in-flight compactions, started only from Idle.
|
|
||||||
#
|
|
||||||
# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
|
|
||||||
# pinned in formal-verification/README.md before trusting the gate.
|
|
||||||
deadlock_detection: false
|
|
||||||
---
|
|
||||||
|
|
||||||
role Compactor:
|
|
||||||
action Init:
|
|
||||||
self.active = 0 # compactions in flight -- MUST stay in {0,1}
|
|
||||||
self.torn = 0 # session torn down (Terminated) -- absorbing
|
|
||||||
|
|
||||||
# maybeCompact wants to start a compaction. THE FIX: it starts one only when
|
|
||||||
# none is running (single-flight) and not after teardown. To reproduce the
|
|
||||||
# legacy race where two goroutines could both compact the same overflow,
|
|
||||||
# delete the `self.active == 0` guard (always increment): the checker then
|
|
||||||
# reports SingleFlight violated.
|
|
||||||
atomic action Trigger:
|
|
||||||
if self.active == 0 and self.torn == 0:
|
|
||||||
self.active += 1 # StartCompaction
|
|
||||||
|
|
||||||
# The background compaction goroutine finished (success, error, or timeout).
|
|
||||||
atomic action Finished:
|
|
||||||
if self.active > 0:
|
|
||||||
self.active -= 1
|
|
||||||
|
|
||||||
# Teardown: the connection (M1) parent cancels + joins the in-flight
|
|
||||||
# compaction, then terminates the coordinator so none can start afterwards.
|
|
||||||
atomic action Shutdown:
|
|
||||||
self.active = 0 # cancelled + joined
|
|
||||||
self.torn = 1
|
|
||||||
|
|
||||||
action Init:
|
|
||||||
c = Compactor()
|
|
||||||
|
|
||||||
# SAFETY: at most one compaction is ever in flight (Part 4, invariant #9).
|
|
||||||
always assertion SingleFlight:
|
|
||||||
return c.active >= 0 and c.active <= 1
|
|
||||||
|
|
||||||
# SAFETY: no compaction is in flight once torn (it was cancelled + joined at
|
|
||||||
# teardown), so none outlives the session.
|
|
||||||
always assertion NoneAfterTeardown:
|
|
||||||
return c.torn == 0 or c.active == 0
|
|
||||||
@@ -1,60 +0,0 @@
|
|||||||
---
|
|
||||||
# Authoritative formal design for realtime machine M1: connection lifecycle.
|
|
||||||
#
|
|
||||||
# Companion to:
|
|
||||||
# - docs/design/realtime-state-machines.md (the map + invariants)
|
|
||||||
# - core/http/endpoints/openai/conncoord (the Go implementation)
|
|
||||||
#
|
|
||||||
# The Go MBT adapter maps each action below onto conncoord.Coordinator.Apply and
|
|
||||||
# the StateGetter onto conncoord.Coordinator.State, so this spec is the source of
|
|
||||||
# truth the implementation is checked against.
|
|
||||||
#
|
|
||||||
# The legacy hazard (Part 2, failure mode 6 / invariants #8, #10): a single `done`
|
|
||||||
# channel reassigned on every VAD toggle and closed from two sites (toggle-off and
|
|
||||||
# teardown) guarded only by a vadServerStarted bool. Modeled here as `running`
|
|
||||||
# (the VAD goroutine's done channel is live) and `torn` (teardown happened).
|
|
||||||
#
|
|
||||||
# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
|
|
||||||
# pinned in formal-verification/README.md before trusting the gate.
|
|
||||||
deadlock_detection: false
|
|
||||||
---
|
|
||||||
|
|
||||||
role Conn:
|
|
||||||
action Init:
|
|
||||||
self.running = 0 # VAD goroutine running (its done channel is live)
|
|
||||||
self.torn = 0 # teardown has happened
|
|
||||||
self.teardowns = 0 # how many times teardown ran -- MUST stay <= 1
|
|
||||||
|
|
||||||
# session.update toggled turn detection on. No-op after teardown (the legacy
|
|
||||||
# reassign-and-spawn must never resurrect a torn session).
|
|
||||||
atomic action VadOn:
|
|
||||||
if self.torn == 0:
|
|
||||||
self.running = 1
|
|
||||||
|
|
||||||
# session.update toggled turn detection off (close the running done channel).
|
|
||||||
atomic action VadOff:
|
|
||||||
if self.torn == 0:
|
|
||||||
self.running = 0
|
|
||||||
|
|
||||||
# Transport read loop ended / session closing. THE FIX: setting torn absorbs
|
|
||||||
# every later Close, so teardown's channel closes happen exactly once. To
|
|
||||||
# reproduce the legacy double-teardown hazard, delete `self.torn = 1` below:
|
|
||||||
# the checker then reports TeardownOnce violated (Close runs teardown again).
|
|
||||||
atomic action Close:
|
|
||||||
if self.torn == 0:
|
|
||||||
self.running = 0 # StopVAD if it was running (close-once)
|
|
||||||
self.teardowns += 1 # Teardown
|
|
||||||
self.torn = 1
|
|
||||||
|
|
||||||
action Init:
|
|
||||||
c = Conn()
|
|
||||||
|
|
||||||
# SAFETY: teardown runs at most once -- the done/decode/sound channels are closed
|
|
||||||
# exactly once, never double-closed (Part 4, invariant #10).
|
|
||||||
always assertion TeardownOnce:
|
|
||||||
return c.teardowns <= 1
|
|
||||||
|
|
||||||
# SAFETY: the VAD goroutine is never (re)started after teardown -- no
|
|
||||||
# send-after-close / no goroutine outliving the session (Part 4, invariant #8).
|
|
||||||
always assertion NoRunAfterTorn:
|
|
||||||
return not (c.torn == 1 and c.running == 1)
|
|
||||||
@@ -1,4 +0,0 @@
|
|||||||
00011bbfe9bf4c7bcb03a5bf1f5b7fe7390111ad6f0611c6be71e8692504da4e fizzbee-v0.5.2-linux_arm.tar.gz
|
|
||||||
f494b7b2afcc7ce24575ed91a389b46bbbbe5976f9e4b5cd717327012f5e0395 fizzbee-v0.5.2-linux_x86.tar.gz
|
|
||||||
aab223e0bac8f0c052cf774dc25872f72c138da30f4079b914bb9c8921910904 fizzbee-v0.5.2-macos_arm.tar.gz
|
|
||||||
6293bd7ab90c79b8607dc9fb2f09407fde0e11ac6596e884bef7f660178597fa fizzbee-v0.5.2-macos_x86.tar.gz
|
|
||||||
@@ -1,83 +0,0 @@
|
|||||||
---
|
|
||||||
# Authoritative formal design for realtime machine M3: response coordination.
|
|
||||||
#
|
|
||||||
# Companion to:
|
|
||||||
# - docs/design/realtime-state-machines.md (the map + invariants)
|
|
||||||
# - core/http/endpoints/openai/respcoord (the Go implementation)
|
|
||||||
#
|
|
||||||
# The Go MBT adapter maps each action below onto respcoord.Coordinator.Apply
|
|
||||||
# and the StateGetter onto respcoord.Coordinator.State, so this spec is the
|
|
||||||
# source of truth the implementation is checked against.
|
|
||||||
#
|
|
||||||
# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
|
|
||||||
# pinned in formal-verification/README.md before trusting the gate.
|
|
||||||
deadlock_detection: false
|
|
||||||
---
|
|
||||||
|
|
||||||
# Bound the number of responses so the state space is finite.
|
|
||||||
MAX_RESPONSES = 4
|
|
||||||
|
|
||||||
role Session:
|
|
||||||
action Init:
|
|
||||||
self.live = 0 # number of live responses -- MUST stay in {0,1}
|
|
||||||
self.registered = 0 # id of the active response (0 = none)
|
|
||||||
self.next_id = 0
|
|
||||||
self.torn = 0 # session torn down (Terminated) -- absorbing
|
|
||||||
|
|
||||||
# startResponse as ONE indivisible transition -- this is the single-writer
|
|
||||||
# actor guarantee. Superseding an active response emits its cancelled
|
|
||||||
# terminal (live -= 1) BEFORE spawning the replacement (live += 1), so the
|
|
||||||
# net live count never exceeds 1.
|
|
||||||
#
|
|
||||||
# To reproduce the LEGACY dual-writer race from Part 2 of the design doc,
|
|
||||||
# change `atomic func` to `serial func`: the checker then interleaves two
|
|
||||||
# callers between the cancel and the spawn and reports AtMostOneLive
|
|
||||||
# violated -- exactly the bug TestLegacyMechanismCanDoubleStart pins in Go.
|
|
||||||
atomic func start():
|
|
||||||
if self.registered != 0:
|
|
||||||
self.live -= 1 # cancel + cancelled-terminal for the old
|
|
||||||
self.registered = 0
|
|
||||||
self.next_id += 1
|
|
||||||
self.live += 1 # spawn + register the replacement
|
|
||||||
self.registered = self.next_id
|
|
||||||
|
|
||||||
# client read-loop path: response.create / manual input_audio_buffer.commit.
|
|
||||||
# Rejected once torn (no response starts after teardown).
|
|
||||||
atomic action StartFromClient:
|
|
||||||
require self.next_id < MAX_RESPONSES
|
|
||||||
require self.torn == 0
|
|
||||||
self.start()
|
|
||||||
|
|
||||||
# VAD goroutine path: end-of-speech commit / barge-in. Rejected once torn.
|
|
||||||
atomic action StartFromVad:
|
|
||||||
require self.next_id < MAX_RESPONSES
|
|
||||||
require self.torn == 0
|
|
||||||
self.start()
|
|
||||||
|
|
||||||
# a response reaches its own terminal (response.done completed)
|
|
||||||
atomic action FinishCurrent:
|
|
||||||
if self.registered != 0:
|
|
||||||
self.live -= 1
|
|
||||||
self.registered = 0
|
|
||||||
|
|
||||||
# explicit response.cancel with nothing newer queued
|
|
||||||
atomic action CancelReq:
|
|
||||||
if self.registered != 0:
|
|
||||||
self.live -= 1
|
|
||||||
self.registered = 0
|
|
||||||
|
|
||||||
# session teardown (M1 parent): cancel any in-flight response and go to the
|
|
||||||
# absorbing Terminated state, after which no response can start. This is what
|
|
||||||
# lets the connection's teardown guarantee no response outlives the session.
|
|
||||||
atomic action Shutdown:
|
|
||||||
if self.registered != 0:
|
|
||||||
self.live -= 1
|
|
||||||
self.registered = 0
|
|
||||||
self.torn = 1
|
|
||||||
|
|
||||||
action Init:
|
|
||||||
s = Session()
|
|
||||||
|
|
||||||
# SAFETY: at most one live response at any instant (Part 4, invariant #1).
|
|
||||||
always assertion AtMostOneLive:
|
|
||||||
return s.live >= 0 and s.live <= 1
|
|
||||||
@@ -1,84 +0,0 @@
|
|||||||
---
|
|
||||||
# Authoritative formal design for the realtime session lifecycle HIERARCHY:
|
|
||||||
# how the per-machine coordinators (M1-M5) relate as one statechart.
|
|
||||||
#
|
|
||||||
# The five machines (respcoord/turncoord/conncoord/compactcoord/ttscoord) are
|
|
||||||
# implemented as separate single-writer coordinators, but they are not
|
|
||||||
# independent: M1 (connection) is the PARENT region, and its children must
|
|
||||||
# terminate when it does. This spec models that relationship — the property no
|
|
||||||
# single per-machine spec can express — without merging the Go code.
|
|
||||||
#
|
|
||||||
# Companion to:
|
|
||||||
# - docs/design/realtime-state-machines.md (the map + invariants #8/#10)
|
|
||||||
# - the per-machine specs (response_lifecycle / turn_lifecycle / conn_lifecycle
|
|
||||||
# / compaction / tts_pipeline) which check each machine in isolation.
|
|
||||||
#
|
|
||||||
# Regions modeled here are M1's DIRECT children — the ones the connection
|
|
||||||
# goroutine owns and tears down:
|
|
||||||
# conn M1: 0 live, 1 torn
|
|
||||||
# vad M2: 0 stopped, 1 running, 2 terminated (handleVAD goroutine joined)
|
|
||||||
# resp M3: 0 idle, 1 active, 2 terminated (respcoord Terminated)
|
|
||||||
# compaction M4: 0 idle, 1 running, 2 terminated (compactcoord Terminated)
|
|
||||||
# M5 (TTS) is nested UNDER a response (each response owns its TTS pipeline), so
|
|
||||||
# "resp terminated => tts closed" is an M3-internal relationship, not a direct
|
|
||||||
# child of conn; it is covered by tts_pipeline.fizz + the response path.
|
|
||||||
#
|
|
||||||
# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
|
|
||||||
# pinned in formal-verification/README.md before trusting the gate.
|
|
||||||
deadlock_detection: false
|
|
||||||
---
|
|
||||||
|
|
||||||
role Session:
|
|
||||||
action Init:
|
|
||||||
self.conn = 0
|
|
||||||
self.vad = 0
|
|
||||||
self.resp = 0
|
|
||||||
self.compaction = 0
|
|
||||||
|
|
||||||
# Children may only START work while the connection is live: no goroutine is
|
|
||||||
# spawned after teardown (no resurrection / no send-after-close).
|
|
||||||
atomic action VadStart:
|
|
||||||
if self.conn == 0 and self.vad == 0:
|
|
||||||
self.vad = 1
|
|
||||||
atomic action VadStop:
|
|
||||||
if self.conn == 0 and self.vad == 1:
|
|
||||||
self.vad = 0
|
|
||||||
atomic action RespStart:
|
|
||||||
if self.conn == 0 and self.resp != 2:
|
|
||||||
self.resp = 1
|
|
||||||
atomic action RespFinish:
|
|
||||||
if self.resp == 1:
|
|
||||||
self.resp = 0
|
|
||||||
atomic action CompTrigger:
|
|
||||||
if self.conn == 0 and self.compaction == 0:
|
|
||||||
self.compaction = 1
|
|
||||||
atomic action CompFinish:
|
|
||||||
if self.compaction == 1:
|
|
||||||
self.compaction = 0
|
|
||||||
|
|
||||||
# Parent teardown drives EVERY child to its terminal state in one step: the
|
|
||||||
# connection goroutine stops + joins the VAD goroutine (vad->2), shuts down
|
|
||||||
# the response coordinator (resp->2), and cancels + joins the in-flight
|
|
||||||
# compaction (compaction->2). THE RELATIONSHIP: a torn parent implies all
|
|
||||||
# children terminal.
|
|
||||||
#
|
|
||||||
# To reproduce the real M4 gap (compaction left fire-and-forget, able to
|
|
||||||
# outlive the session), delete `self.compaction = 2` below: the checker then
|
|
||||||
# reports ChildrenDieWithParent violated (conn torn while compaction still
|
|
||||||
# running). Likewise dropping vad/resp reproduces a leaked VAD/response.
|
|
||||||
atomic action Teardown:
|
|
||||||
if self.conn == 0:
|
|
||||||
self.conn = 1
|
|
||||||
self.vad = 2
|
|
||||||
self.resp = 2
|
|
||||||
self.compaction = 2
|
|
||||||
|
|
||||||
action Init:
|
|
||||||
s = Session()
|
|
||||||
|
|
||||||
# SAFETY (the hierarchy invariant): once the connection is torn, every child is
|
|
||||||
# terminal — no VAD goroutine, response, or compaction outlives the session
|
|
||||||
# (Part 4, invariants #8/#10). The start guards above additionally make "no child
|
|
||||||
# starts after teardown" unreachable.
|
|
||||||
always assertion ChildrenDieWithParent:
|
|
||||||
return s.conn == 0 or (s.vad == 2 and s.resp == 2 and s.compaction == 2)
|
|
||||||
@@ -1,53 +0,0 @@
|
|||||||
---
|
|
||||||
# Authoritative formal design for realtime machine M5: TTS pipeline lifecycle.
|
|
||||||
#
|
|
||||||
# Companion to:
|
|
||||||
# - docs/design/realtime-state-machines.md (the map + invariants)
|
|
||||||
# - core/http/endpoints/openai/ttscoord (the Go implementation)
|
|
||||||
#
|
|
||||||
# The Go MBT adapter maps each action below onto ttscoord.Coordinator.Apply and
|
|
||||||
# the StateGetter onto ttscoord.Coordinator.State, so this spec is the source of
|
|
||||||
# truth the implementation is checked against.
|
|
||||||
#
|
|
||||||
# The TTS pipeline's open->closing->closed lifecycle (the legacy `closed` bool +
|
|
||||||
# `done` channel). Two writers: the producer raises Close (wait()), the worker
|
|
||||||
# raises WorkerExited. `phase` is 0=open, 1=closing, 2=closed; `wakes` counts how
|
|
||||||
# many times Close woke the worker to exit.
|
|
||||||
#
|
|
||||||
# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
|
|
||||||
# pinned in formal-verification/README.md before trusting the gate.
|
|
||||||
deadlock_detection: false
|
|
||||||
---
|
|
||||||
|
|
||||||
role Pipeline:
|
|
||||||
action Init:
|
|
||||||
self.phase = 0 # 0 open, 1 closing, 2 closed -- monotonic
|
|
||||||
self.wakes = 0 # Close->Closing transitions (worker wakeups to exit)
|
|
||||||
|
|
||||||
# wait() called (producer). THE FIX: it advances to closing and wakes the
|
|
||||||
# worker only from open, so wait() is idempotent. To reproduce the legacy
|
|
||||||
# double-wake hazard, drop the `if self.phase == 0` guard (always wake): the
|
|
||||||
# checker then reports WakeOnce violated.
|
|
||||||
atomic action Close:
|
|
||||||
if self.phase == 0:
|
|
||||||
self.phase = 1
|
|
||||||
self.wakes += 1
|
|
||||||
|
|
||||||
# The worker drained the queue and observed the close (worker goroutine).
|
|
||||||
atomic action WorkerExited:
|
|
||||||
if self.phase == 1:
|
|
||||||
self.phase = 2
|
|
||||||
|
|
||||||
action Init:
|
|
||||||
p = Pipeline()
|
|
||||||
|
|
||||||
# SAFETY: the worker is woken-to-exit at most once -- the done channel is joined
|
|
||||||
# exactly once, wait() is idempotent (Part 4, invariant #10).
|
|
||||||
always assertion WakeOnce:
|
|
||||||
return p.wakes <= 1
|
|
||||||
|
|
||||||
# SAFETY: the lifecycle is bounded and monotonic open -> closing -> closed; a
|
|
||||||
# clause is never accepted after close (enqueue is gated on phase 0 in Go) and
|
|
||||||
# the worker is joined exactly once (Part 4, invariant #8).
|
|
||||||
always assertion Monotonic:
|
|
||||||
return p.phase >= 0 and p.phase <= 2
|
|
||||||
@@ -1,79 +0,0 @@
|
|||||||
---
|
|
||||||
# Authoritative formal design for realtime machine M2: turn detection.
|
|
||||||
#
|
|
||||||
# Companion to:
|
|
||||||
# - docs/design/realtime-state-machines.md (the map + invariants)
|
|
||||||
# - core/http/endpoints/openai/turncoord (the Go implementation)
|
|
||||||
#
|
|
||||||
# The Go MBT adapter maps each action below onto turncoord.Coordinator.Apply
|
|
||||||
# and the StateGetter onto turncoord.Coordinator.State, so this spec is the
|
|
||||||
# source of truth the implementation is checked against.
|
|
||||||
#
|
|
||||||
# The property this machine must guarantee is the COUPLING of two facts the
|
|
||||||
# legacy code tracked in two separate variables that could disagree:
|
|
||||||
# - speech -- handleVAD's speechStarted bool
|
|
||||||
# - turn -- the semantic_vad live-stream-open flag (lts.open())
|
|
||||||
# A discardTurn (no-speech clear / mode switch / teardown) closed the live
|
|
||||||
# stream (turn -> 0) but left speechStarted set (speech stays 1). They then
|
|
||||||
# disagreed, and the next onset was suppressed by `if !speechStarted` -- no
|
|
||||||
# speech_started, no barge-in, no commit. See Part 2, failure mode 4.
|
|
||||||
#
|
|
||||||
# Here speech and turn are only ever driven TOGETHER, modelling the single
|
|
||||||
# turncoord State (Idle <-> Speaking) where both facts are one value.
|
|
||||||
#
|
|
||||||
# NOTE: FizzBee is pre-1.0. Validate the exact syntax/CLI against the version
|
|
||||||
# pinned in formal-verification/README.md before trusting the gate.
|
|
||||||
deadlock_detection: false
|
|
||||||
---
|
|
||||||
|
|
||||||
# Bound the number of turns so the state space is finite.
|
|
||||||
MAX_TURNS = 4
|
|
||||||
|
|
||||||
role Detector:
|
|
||||||
action Init:
|
|
||||||
self.speech = 0 # speechStarted (0/1)
|
|
||||||
self.turn = 0 # live-stream / turn open (0/1)
|
|
||||||
self.turns = 0 # how many turns have been opened (bound)
|
|
||||||
|
|
||||||
# Onset: VAD reports speech while idle -> open a turn. ONE indivisible
|
|
||||||
# transition sets BOTH facts, so they cannot be left disagreeing. Re-onset
|
|
||||||
# while already speaking is a no-op (legacy `if !speechStarted`).
|
|
||||||
atomic action Onset:
|
|
||||||
require self.turns < MAX_TURNS
|
|
||||||
if self.speech == 0:
|
|
||||||
self.turns += 1
|
|
||||||
self.speech = 1
|
|
||||||
self.turn = 1
|
|
||||||
|
|
||||||
# Silence: VAD-confirmed end-of-speech past the dynamic threshold -> commit.
|
|
||||||
# Both facts clear together (EmitSpeechStopped + CommitTurn return to Idle).
|
|
||||||
atomic action Silence:
|
|
||||||
if self.speech == 1:
|
|
||||||
self.speech = 0
|
|
||||||
self.turn = 0
|
|
||||||
|
|
||||||
# Abort: no-speech clear / teardown -> discard. BOTH facts clear together.
|
|
||||||
# (A semantic->server mode switch only drops the orphaned live stream and
|
|
||||||
# lets the turn continue, so it is NOT an Abort -- see turncoord.go.)
|
|
||||||
# THE FIX: both clear together. Clearing only `self.turn` (deleting `self.speech = 0`)
|
|
||||||
# reproduces the legacy discardTurn bug --
|
|
||||||
# the checker then reports Coupled violated, exactly the desync that
|
|
||||||
# suppressed the next onset.
|
|
||||||
atomic action Abort:
|
|
||||||
if self.turn == 1:
|
|
||||||
self.turn = 0
|
|
||||||
self.speech = 0
|
|
||||||
|
|
||||||
action Init:
|
|
||||||
d = Detector()
|
|
||||||
|
|
||||||
# SAFETY: speechStarted and turn-open never disagree -- they are one state, so
|
|
||||||
# the legacy desync that suppressed the next onset is unrepresentable
|
|
||||||
# (Part 4, invariant #4; failure mode 4).
|
|
||||||
always assertion Coupled:
|
|
||||||
return d.speech == d.turn
|
|
||||||
|
|
||||||
# SAFETY: at most one turn open at any instant -- `turn` is a 0/1 fact, never
|
|
||||||
# incremented twice without a clear between (onset is a no-op while speaking).
|
|
||||||
always assertion AtMostOneTurnOpen:
|
|
||||||
return d.turn >= 0 and d.turn <= 1
|
|
||||||
@@ -252,25 +252,8 @@ func (g FunctionsConfig) GrammarOptions() []func(o *grammars.GrammarOption) {
|
|||||||
return opts
|
return opts
|
||||||
}
|
}
|
||||||
|
|
||||||
// truncForLog returns a head-truncated view of s for debug logging (callers log the length separately).
|
|
||||||
//
|
|
||||||
// CleanupLLMResult / ParseFunctionCall are invoked once per streaming chunk
|
|
||||||
// with the *full accumulated* LLM result so far (see
|
|
||||||
// core/http/endpoints/openai/chat_stream_workers.go). Logging the full
|
|
||||||
// argument on every call gives O(N^2) total log volume across a single
|
|
||||||
// generation, which under LOG_LEVEL=debug has been observed to fill disks
|
|
||||||
// and stall the host during long streaming sessions. Logging only the
|
|
||||||
// length plus a fixed-size head bounds per-call output to a constant.
|
|
||||||
func truncForLog(s string) string {
|
|
||||||
const maxHead = 200
|
|
||||||
if len(s) <= maxHead {
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
return s[:maxHead] + "...(truncated)"
|
|
||||||
}
|
|
||||||
|
|
||||||
func CleanupLLMResult(llmresult string, functionConfig FunctionsConfig) string {
|
func CleanupLLMResult(llmresult string, functionConfig FunctionsConfig) string {
|
||||||
xlog.Debug("LLM result", "len", len(llmresult), "head", truncForLog(llmresult))
|
xlog.Debug("LLM result", "result", llmresult)
|
||||||
|
|
||||||
for _, item := range functionConfig.ReplaceLLMResult {
|
for _, item := range functionConfig.ReplaceLLMResult {
|
||||||
k, v := item.Key, item.Value
|
k, v := item.Key, item.Value
|
||||||
@@ -278,7 +261,7 @@ func CleanupLLMResult(llmresult string, functionConfig FunctionsConfig) string {
|
|||||||
re := regexp.MustCompile(k)
|
re := regexp.MustCompile(k)
|
||||||
llmresult = re.ReplaceAllString(llmresult, v)
|
llmresult = re.ReplaceAllString(llmresult, v)
|
||||||
}
|
}
|
||||||
xlog.Debug("LLM result(processed)", "len", len(llmresult), "head", truncForLog(llmresult))
|
xlog.Debug("LLM result(processed)", "result", llmresult)
|
||||||
|
|
||||||
return llmresult
|
return llmresult
|
||||||
}
|
}
|
||||||
@@ -930,7 +913,7 @@ func parseParameterValue(paramValue string, format *XMLToolCallFormat) any {
|
|||||||
|
|
||||||
func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncCallResults {
|
func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncCallResults {
|
||||||
|
|
||||||
xlog.Debug("LLM result", "len", len(llmresult), "head", truncForLog(llmresult))
|
xlog.Debug("LLM result", "result", llmresult)
|
||||||
|
|
||||||
for _, item := range functionConfig.ReplaceFunctionResults {
|
for _, item := range functionConfig.ReplaceFunctionResults {
|
||||||
k, v := item.Key, item.Value
|
k, v := item.Key, item.Value
|
||||||
@@ -938,7 +921,7 @@ func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncC
|
|||||||
re := regexp.MustCompile(k)
|
re := regexp.MustCompile(k)
|
||||||
llmresult = re.ReplaceAllString(llmresult, v)
|
llmresult = re.ReplaceAllString(llmresult, v)
|
||||||
}
|
}
|
||||||
xlog.Debug("LLM result(function cleanup)", "len", len(llmresult), "head", truncForLog(llmresult))
|
xlog.Debug("LLM result(function cleanup)", "result", llmresult)
|
||||||
|
|
||||||
functionNameKey := defaultFunctionNameKey
|
functionNameKey := defaultFunctionNameKey
|
||||||
functionArgumentsKey := defaultFunctionArgumentsKey
|
functionArgumentsKey := defaultFunctionArgumentsKey
|
||||||
|
|||||||
@@ -119,7 +119,6 @@ type ControlBackend interface {
|
|||||||
// NOT tracked as a single in-flight unit.
|
// NOT tracked as a single in-flight unit.
|
||||||
AudioTransformStream(ctx context.Context, opts ...grpc.CallOption) (AudioTransformStreamClient, error)
|
AudioTransformStream(ctx context.Context, opts ...grpc.CallOption) (AudioTransformStreamClient, error)
|
||||||
AudioToAudioStream(ctx context.Context, opts ...grpc.CallOption) (AudioToAudioStreamClient, error)
|
AudioToAudioStream(ctx context.Context, opts ...grpc.CallOption) (AudioToAudioStreamClient, error)
|
||||||
AudioTranscriptionLive(ctx context.Context, opts ...grpc.CallOption) (AudioTranscriptionLiveClient, error)
|
|
||||||
|
|
||||||
// Forward proxies a raw HTTP request to an upstream provider for
|
// Forward proxies a raw HTTP request to an upstream provider for
|
||||||
// passthrough-mode cloud-proxy backends. Caller streams a single
|
// passthrough-mode cloud-proxy backends. Caller streams a single
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
|
|
||||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||||
gopsutil "github.com/shirou/gopsutil/v3/process"
|
gopsutil "github.com/shirou/gopsutil/v3/process"
|
||||||
)
|
)
|
||||||
@@ -167,11 +166,6 @@ func (llm *Base) AudioTransformStream(in <-chan *pb.AudioTransformFrameRequest,
|
|||||||
return fmt.Errorf("unimplemented")
|
return fmt.Errorf("unimplemented")
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm *Base) AudioTranscriptionLive(in <-chan *pb.TranscriptLiveRequest, out chan<- *pb.TranscriptLiveResponse) error {
|
|
||||||
close(out)
|
|
||||||
return grpcerrors.LiveTranscriptionUnsupported("base", "not implemented by this backend")
|
|
||||||
}
|
|
||||||
|
|
||||||
func (llm *Base) AudioToAudioStream(in <-chan *pb.AudioToAudioRequest, out chan<- *pb.AudioToAudioResponse) error {
|
func (llm *Base) AudioToAudioStream(in <-chan *pb.AudioToAudioRequest, out chan<- *pb.AudioToAudioResponse) error {
|
||||||
close(out)
|
close(out)
|
||||||
return fmt.Errorf("unimplemented")
|
return fmt.Errorf("unimplemented")
|
||||||
|
|||||||
@@ -900,22 +900,19 @@ type AudioTransformStreamClient interface {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// audioTransformStreamClient is the concrete wrapper. It also owns the
|
// audioTransformStreamClient is the concrete wrapper. It also owns the
|
||||||
// underlying gRPC connection, released once the receive side terminates —
|
// underlying gRPC connection so it can be closed when the caller is done.
|
||||||
// NOT at CloseSend, because the server still streams responses (the tail of
|
|
||||||
// the transform) after the client closes its send side. Same lifecycle as
|
|
||||||
// forwardClient.
|
|
||||||
type audioTransformStreamClient struct {
|
type audioTransformStreamClient struct {
|
||||||
pb.Backend_AudioTransformStreamClient
|
pb.Backend_AudioTransformStreamClient
|
||||||
closeOnce sync.Once
|
conn *grpc.ClientConn
|
||||||
closer func()
|
closer func()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *audioTransformStreamClient) Recv() (*pb.AudioTransformFrameResponse, error) {
|
func (s *audioTransformStreamClient) CloseSend() error {
|
||||||
resp, err := s.Backend_AudioTransformStreamClient.Recv()
|
err := s.Backend_AudioTransformStreamClient.CloseSend()
|
||||||
if err != nil && s.closer != nil {
|
if s.closer != nil {
|
||||||
s.closeOnce.Do(s.closer)
|
s.closer()
|
||||||
}
|
}
|
||||||
return resp, err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Client) AudioTransformStream(ctx context.Context, opts ...grpc.CallOption) (AudioTransformStreamClient, error) {
|
func (c *Client) AudioTransformStream(ctx context.Context, opts ...grpc.CallOption) (AudioTransformStreamClient, error) {
|
||||||
@@ -947,85 +944,7 @@ func (c *Client) AudioTransformStream(ctx context.Context, opts ...grpc.CallOpti
|
|||||||
}
|
}
|
||||||
return &audioTransformStreamClient{
|
return &audioTransformStreamClient{
|
||||||
Backend_AudioTransformStreamClient: stream,
|
Backend_AudioTransformStreamClient: stream,
|
||||||
closer: func() {
|
conn: conn,
|
||||||
_ = conn.Close()
|
|
||||||
cleanup()
|
|
||||||
},
|
|
||||||
}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// AudioTranscriptionLiveClient is the duplex interface returned by
|
|
||||||
// (*Client).AudioTranscriptionLive. Wraps the generated bidi client without
|
|
||||||
// leaking the proto package across the public boundary.
|
|
||||||
type AudioTranscriptionLiveClient interface {
|
|
||||||
Send(*pb.TranscriptLiveRequest) error
|
|
||||||
Recv() (*pb.TranscriptLiveResponse, error)
|
|
||||||
CloseSend() error
|
|
||||||
Context() context.Context
|
|
||||||
}
|
|
||||||
|
|
||||||
type audioTranscriptionLiveClient struct {
|
|
||||||
pb.Backend_AudioTranscriptionLiveClient
|
|
||||||
closeOnce sync.Once
|
|
||||||
closer func()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Recv releases the connection once the stream reaches a terminal state
|
|
||||||
// (io.EOF after the server finishes, or any error). The conn MUST survive
|
|
||||||
// CloseSend: the live protocol is close-send -> backend flushes the decode
|
|
||||||
// tail -> terminal FinalResult arrives. Closing the conn inside CloseSend
|
|
||||||
// killed that pending Recv with "grpc: the client connection is closing",
|
|
||||||
// losing the final transcript (and its tail words) on every turn.
|
|
||||||
func (s *audioTranscriptionLiveClient) Recv() (*pb.TranscriptLiveResponse, error) {
|
|
||||||
resp, err := s.Backend_AudioTranscriptionLiveClient.Recv()
|
|
||||||
if err != nil {
|
|
||||||
s.release()
|
|
||||||
}
|
|
||||||
return resp, err
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *audioTranscriptionLiveClient) release() {
|
|
||||||
s.closeOnce.Do(func() {
|
|
||||||
if s.closer != nil {
|
|
||||||
s.closer()
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// AudioTranscriptionLive opens the bidirectional live ASR stream. Note the
|
|
||||||
// same caveat as AudioToAudioStream: the watchdog busy-mark (and, on
|
|
||||||
// non-parallel backends, opMutex) is held for the stream's lifetime, which
|
|
||||||
// for a realtime session can be minutes — enable parallel requests on
|
|
||||||
// backends meant to serve live sessions alongside unary work.
|
|
||||||
func (c *Client) AudioTranscriptionLive(ctx context.Context, opts ...grpc.CallOption) (AudioTranscriptionLiveClient, error) {
|
|
||||||
if !c.parallel {
|
|
||||||
c.opMutex.Lock()
|
|
||||||
}
|
|
||||||
c.setBusy(true)
|
|
||||||
c.wdMark()
|
|
||||||
|
|
||||||
cleanup := func() {
|
|
||||||
c.wdUnMark()
|
|
||||||
c.setBusy(false)
|
|
||||||
if !c.parallel {
|
|
||||||
c.opMutex.Unlock()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
conn, err := c.dial()
|
|
||||||
if err != nil {
|
|
||||||
cleanup()
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
client := pb.NewBackendClient(conn)
|
|
||||||
stream, err := client.AudioTranscriptionLive(ctx, opts...)
|
|
||||||
if err != nil {
|
|
||||||
_ = conn.Close()
|
|
||||||
cleanup()
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
return &audioTranscriptionLiveClient{
|
|
||||||
Backend_AudioTranscriptionLiveClient: stream,
|
|
||||||
closer: func() {
|
closer: func() {
|
||||||
_ = conn.Close()
|
_ = conn.Close()
|
||||||
cleanup()
|
cleanup()
|
||||||
@@ -1043,22 +962,18 @@ type AudioToAudioStreamClient interface {
|
|||||||
Context() context.Context
|
Context() context.Context
|
||||||
}
|
}
|
||||||
|
|
||||||
// audioToAudioStreamClient owns its gRPC connection, released once the
|
|
||||||
// receive side terminates — NOT at CloseSend, because the server still
|
|
||||||
// streams the response tail after the client closes its send side. Same
|
|
||||||
// lifecycle as forwardClient.
|
|
||||||
type audioToAudioStreamClient struct {
|
type audioToAudioStreamClient struct {
|
||||||
pb.Backend_AudioToAudioStreamClient
|
pb.Backend_AudioToAudioStreamClient
|
||||||
closeOnce sync.Once
|
conn *grpc.ClientConn
|
||||||
closer func()
|
closer func()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *audioToAudioStreamClient) Recv() (*pb.AudioToAudioResponse, error) {
|
func (s *audioToAudioStreamClient) CloseSend() error {
|
||||||
resp, err := s.Backend_AudioToAudioStreamClient.Recv()
|
err := s.Backend_AudioToAudioStreamClient.CloseSend()
|
||||||
if err != nil && s.closer != nil {
|
if s.closer != nil {
|
||||||
s.closeOnce.Do(s.closer)
|
s.closer()
|
||||||
}
|
}
|
||||||
return resp, err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Client) AudioToAudioStream(ctx context.Context, opts ...grpc.CallOption) (AudioToAudioStreamClient, error) {
|
func (c *Client) AudioToAudioStream(ctx context.Context, opts ...grpc.CallOption) (AudioToAudioStreamClient, error) {
|
||||||
@@ -1090,6 +1005,7 @@ func (c *Client) AudioToAudioStream(ctx context.Context, opts ...grpc.CallOption
|
|||||||
}
|
}
|
||||||
return &audioToAudioStreamClient{
|
return &audioToAudioStreamClient{
|
||||||
Backend_AudioToAudioStreamClient: stream,
|
Backend_AudioToAudioStreamClient: stream,
|
||||||
|
conn: conn,
|
||||||
closer: func() {
|
closer: func() {
|
||||||
_ = conn.Close()
|
_ = conn.Close()
|
||||||
cleanup()
|
cleanup()
|
||||||
|
|||||||
@@ -198,34 +198,6 @@ func (e *embedBackend) AudioTransformStream(ctx context.Context, opts ...grpc.Ca
|
|||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *embedBackend) AudioTranscriptionLive(ctx context.Context, opts ...grpc.CallOption) (AudioTranscriptionLiveClient, error) {
|
|
||||||
reqs := make(chan *pb.TranscriptLiveRequest, 4)
|
|
||||||
resps := make(chan *pb.TranscriptLiveResponse, 4)
|
|
||||||
srvDone := make(chan error, 1)
|
|
||||||
|
|
||||||
server := &embedBackendAudioTranscriptionLiveStream{
|
|
||||||
ctx: ctx,
|
|
||||||
reqs: reqs,
|
|
||||||
resps: resps,
|
|
||||||
}
|
|
||||||
|
|
||||||
go func() {
|
|
||||||
err := e.s.AudioTranscriptionLive(server)
|
|
||||||
// Stash the terminal error BEFORE closing resps: a caller blocked in
|
|
||||||
// Recv wakes on the close and must find the error (the ready-ack
|
|
||||||
// contract surfaces Unimplemented through that first Recv).
|
|
||||||
srvDone <- err
|
|
||||||
close(resps)
|
|
||||||
}()
|
|
||||||
|
|
||||||
return &embedBackendAudioTranscriptionLiveStreamClient{
|
|
||||||
ctx: ctx,
|
|
||||||
reqs: reqs,
|
|
||||||
resps: resps,
|
|
||||||
srvDone: srvDone,
|
|
||||||
}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (e *embedBackend) Forward(ctx context.Context, opts ...grpc.CallOption) (ForwardClient, error) {
|
func (e *embedBackend) Forward(ctx context.Context, opts ...grpc.CallOption) (ForwardClient, error) {
|
||||||
reqs := make(chan *pb.ForwardRequest, 8)
|
reqs := make(chan *pb.ForwardRequest, 8)
|
||||||
resps := make(chan *pb.ForwardReply, 8)
|
resps := make(chan *pb.ForwardReply, 8)
|
||||||
@@ -329,8 +301,6 @@ var _ pb.Backend_AudioTransformStreamServer = new(embedBackendAudioTransformStre
|
|||||||
var _ AudioTransformStreamClient = new(embedBackendAudioTransformStreamClient)
|
var _ AudioTransformStreamClient = new(embedBackendAudioTransformStreamClient)
|
||||||
var _ pb.Backend_AudioToAudioStreamServer = new(embedBackendAudioToAudioStream)
|
var _ pb.Backend_AudioToAudioStreamServer = new(embedBackendAudioToAudioStream)
|
||||||
var _ AudioToAudioStreamClient = new(embedBackendAudioToAudioStreamClient)
|
var _ AudioToAudioStreamClient = new(embedBackendAudioToAudioStreamClient)
|
||||||
var _ pb.Backend_AudioTranscriptionLiveServer = new(embedBackendAudioTranscriptionLiveStream)
|
|
||||||
var _ AudioTranscriptionLiveClient = new(embedBackendAudioTranscriptionLiveStreamClient)
|
|
||||||
|
|
||||||
// embedBackendAudioTransformStream is the server side of an in-process bidi
|
// embedBackendAudioTransformStream is the server side of an in-process bidi
|
||||||
// stream. The hosted server reads requests from `reqs` (closed by client when
|
// stream. The hosted server reads requests from `reqs` (closed by client when
|
||||||
@@ -427,102 +397,6 @@ func (e *embedBackendAudioTransformStreamClient) CloseSend() error {
|
|||||||
|
|
||||||
func (e *embedBackendAudioTransformStreamClient) Context() context.Context { return e.ctx }
|
func (e *embedBackendAudioTransformStreamClient) Context() context.Context { return e.ctx }
|
||||||
|
|
||||||
// embedBackendAudioTranscriptionLiveStream is the in-process server-side
|
|
||||||
// handle for the bidirectional live ASR RPC. Mirrors
|
|
||||||
// embedBackendAudioTransformStream — the hosted server reads requests from
|
|
||||||
// `reqs` (closed by client when done sending) and writes responses to `resps`.
|
|
||||||
type embedBackendAudioTranscriptionLiveStream struct {
|
|
||||||
ctx context.Context
|
|
||||||
reqs <-chan *pb.TranscriptLiveRequest
|
|
||||||
resps chan<- *pb.TranscriptLiveResponse
|
|
||||||
}
|
|
||||||
|
|
||||||
func (e *embedBackendAudioTranscriptionLiveStream) Send(resp *pb.TranscriptLiveResponse) error {
|
|
||||||
select {
|
|
||||||
case e.resps <- resp:
|
|
||||||
return nil
|
|
||||||
case <-e.ctx.Done():
|
|
||||||
return e.ctx.Err()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (e *embedBackendAudioTranscriptionLiveStream) Recv() (*pb.TranscriptLiveRequest, error) {
|
|
||||||
select {
|
|
||||||
case req, ok := <-e.reqs:
|
|
||||||
if !ok {
|
|
||||||
return nil, io.EOF
|
|
||||||
}
|
|
||||||
return req, nil
|
|
||||||
case <-e.ctx.Done():
|
|
||||||
return nil, e.ctx.Err()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (e *embedBackendAudioTranscriptionLiveStream) SetHeader(md metadata.MD) error { return nil }
|
|
||||||
func (e *embedBackendAudioTranscriptionLiveStream) SendHeader(md metadata.MD) error { return nil }
|
|
||||||
func (e *embedBackendAudioTranscriptionLiveStream) SetTrailer(md metadata.MD) {}
|
|
||||||
func (e *embedBackendAudioTranscriptionLiveStream) Context() context.Context { return e.ctx }
|
|
||||||
func (e *embedBackendAudioTranscriptionLiveStream) SendMsg(m any) error {
|
|
||||||
if x, ok := m.(*pb.TranscriptLiveResponse); ok {
|
|
||||||
return e.Send(x)
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
func (e *embedBackendAudioTranscriptionLiveStream) RecvMsg(m any) error {
|
|
||||||
// gRPC bidi streaming uses Recv() directly; RecvMsg is unused on this path.
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// embedBackendAudioTranscriptionLiveStreamClient is the caller-facing side.
|
|
||||||
// It mirrors the server-side stream over the same channels.
|
|
||||||
type embedBackendAudioTranscriptionLiveStreamClient struct {
|
|
||||||
ctx context.Context
|
|
||||||
reqs chan<- *pb.TranscriptLiveRequest
|
|
||||||
resps <-chan *pb.TranscriptLiveResponse
|
|
||||||
srvDone <-chan error
|
|
||||||
closeOnce bool
|
|
||||||
}
|
|
||||||
|
|
||||||
func (e *embedBackendAudioTranscriptionLiveStreamClient) Send(req *pb.TranscriptLiveRequest) error {
|
|
||||||
select {
|
|
||||||
case e.reqs <- req:
|
|
||||||
return nil
|
|
||||||
case <-e.ctx.Done():
|
|
||||||
return e.ctx.Err()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (e *embedBackendAudioTranscriptionLiveStreamClient) Recv() (*pb.TranscriptLiveResponse, error) {
|
|
||||||
select {
|
|
||||||
case resp, ok := <-e.resps:
|
|
||||||
if !ok {
|
|
||||||
// Server-side finished. Surface its terminal error if any.
|
|
||||||
select {
|
|
||||||
case err := <-e.srvDone:
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
}
|
|
||||||
return nil, io.EOF
|
|
||||||
}
|
|
||||||
return resp, nil
|
|
||||||
case <-e.ctx.Done():
|
|
||||||
return nil, e.ctx.Err()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (e *embedBackendAudioTranscriptionLiveStreamClient) CloseSend() error {
|
|
||||||
if e.closeOnce {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
e.closeOnce = true
|
|
||||||
close(e.reqs)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (e *embedBackendAudioTranscriptionLiveStreamClient) Context() context.Context { return e.ctx }
|
|
||||||
|
|
||||||
// embedBackendAudioToAudioStream is the in-process server-side handle for
|
// embedBackendAudioToAudioStream is the in-process server-side handle for
|
||||||
// the bidirectional any-to-any audio RPC. Mirrors embedBackendAudioTransform
|
// the bidirectional any-to-any audio RPC. Mirrors embedBackendAudioTransform
|
||||||
// Stream — the hosted server reads requests from `reqs` (closed by client
|
// Stream — the hosted server reads requests from `reqs` (closed by client
|
||||||
|
|||||||
@@ -33,41 +33,3 @@ func IsModelNotLoaded(err error) bool {
|
|||||||
}
|
}
|
||||||
return strings.Contains(strings.ToLower(err.Error()), "model not loaded")
|
return strings.Contains(strings.ToLower(err.Error()), "model not loaded")
|
||||||
}
|
}
|
||||||
|
|
||||||
// LiveTranscriptionUnsupported returns the canonical error a backend returns
|
|
||||||
// when it (or the loaded model) cannot serve the bidirectional
|
|
||||||
// AudioTranscriptionLive RPC. It carries codes.Unimplemented deliberately:
|
|
||||||
// that is also what gRPC itself returns for backends whose stubs predate the
|
|
||||||
// RPC, so callers get one uniform "degrade to non-live transcription" signal.
|
|
||||||
// (codes.FailedPrecondition is not used here — IsModelNotLoaded claims it.)
|
|
||||||
func LiveTranscriptionUnsupported(backend, reason string) error {
|
|
||||||
return status.Errorf(codes.Unimplemented, "%s: live transcription unsupported: %s", backend, reason)
|
|
||||||
}
|
|
||||||
|
|
||||||
// IsLiveTranscriptionUnsupported reports whether err signals that live
|
|
||||||
// transcription is not available for this backend/model. It prefers the typed
|
|
||||||
// gRPC status code (Unimplemented) and falls back to the message for paths
|
|
||||||
// that lose the status (e.g. errors wrapped across non-gRPC boundaries).
|
|
||||||
func IsLiveTranscriptionUnsupported(err error) bool {
|
|
||||||
if err == nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
if status.Code(err) == codes.Unimplemented {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
return strings.Contains(strings.ToLower(err.Error()), "unimplemented")
|
|
||||||
}
|
|
||||||
|
|
||||||
// StreamTranscriptionUnsupported returns the canonical error a backend returns
|
|
||||||
// when it (or the loaded model) cannot serve the server-streaming
|
|
||||||
// AudioTranscriptionStream RPC. It carries codes.Unimplemented like the live
|
|
||||||
// signal, but its intent is the opposite: it is meant to be SURFACED to the
|
|
||||||
// caller, not silently degraded. A backend must not decode the audio offline
|
|
||||||
// and emit it as a single "delta" + final to fake a stream — a client that
|
|
||||||
// asked for streaming has to learn the model cannot stream (qualitatively
|
|
||||||
// identical output would otherwise hide a missing, possibly required,
|
|
||||||
// capability). Callers wanting a plain transcript use the unary
|
|
||||||
// AudioTranscription / non-streaming endpoint instead.
|
|
||||||
func StreamTranscriptionUnsupported(backend, reason string) error {
|
|
||||||
return status.Errorf(codes.Unimplemented, "%s: streaming transcription unsupported: %s", backend, reason)
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -34,30 +34,4 @@ var _ = Describe("grpcerrors", func() {
|
|||||||
It("ModelNotLoaded carries FailedPrecondition", func() {
|
It("ModelNotLoaded carries FailedPrecondition", func() {
|
||||||
Expect(status.Code(grpcerrors.ModelNotLoaded("whisper"))).To(Equal(codes.FailedPrecondition))
|
Expect(status.Code(grpcerrors.ModelNotLoaded("whisper"))).To(Equal(codes.FailedPrecondition))
|
||||||
})
|
})
|
||||||
|
|
||||||
DescribeTable("IsLiveTranscriptionUnsupported",
|
|
||||||
func(err error, want bool) {
|
|
||||||
Expect(grpcerrors.IsLiveTranscriptionUnsupported(err)).To(Equal(want))
|
|
||||||
},
|
|
||||||
Entry("nil", nil, false),
|
|
||||||
Entry("typed via constructor", grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp", "not a streaming model"), true),
|
|
||||||
Entry("typed code only", status.Error(codes.Unimplemented, "anything"), true),
|
|
||||||
Entry("stale stub message (Unknown code)", errors.New("rpc error: method AudioTranscriptionLive unimplemented"), true),
|
|
||||||
Entry("unrelated error", errors.New("context deadline exceeded"), false),
|
|
||||||
Entry("model not loaded is NOT unsupported", grpcerrors.ModelNotLoaded("parakeet-cpp"), false),
|
|
||||||
)
|
|
||||||
|
|
||||||
It("LiveTranscriptionUnsupported carries Unimplemented, not FailedPrecondition", func() {
|
|
||||||
err := grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp", "reason")
|
|
||||||
Expect(status.Code(err)).To(Equal(codes.Unimplemented))
|
|
||||||
// FailedPrecondition is claimed by IsModelNotLoaded — the two
|
|
||||||
// signals must never alias.
|
|
||||||
Expect(grpcerrors.IsModelNotLoaded(err)).To(BeFalse())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("StreamTranscriptionUnsupported carries Unimplemented and is not ModelNotLoaded", func() {
|
|
||||||
err := grpcerrors.StreamTranscriptionUnsupported("parakeet-cpp", "not a streaming model")
|
|
||||||
Expect(status.Code(err)).To(Equal(codes.Unimplemented))
|
|
||||||
Expect(grpcerrors.IsModelNotLoaded(err)).To(BeFalse())
|
|
||||||
})
|
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -27,7 +27,6 @@ type AIModel interface {
|
|||||||
VoiceEmbed(*pb.VoiceEmbedRequest) (pb.VoiceEmbedResponse, error)
|
VoiceEmbed(*pb.VoiceEmbedRequest) (pb.VoiceEmbedResponse, error)
|
||||||
AudioTranscription(context.Context, *pb.TranscriptRequest) (pb.TranscriptResult, error)
|
AudioTranscription(context.Context, *pb.TranscriptRequest) (pb.TranscriptResult, error)
|
||||||
AudioTranscriptionStream(context.Context, *pb.TranscriptRequest, chan *pb.TranscriptStreamResponse) error
|
AudioTranscriptionStream(context.Context, *pb.TranscriptRequest, chan *pb.TranscriptStreamResponse) error
|
||||||
AudioTranscriptionLive(in <-chan *pb.TranscriptLiveRequest, out chan<- *pb.TranscriptLiveResponse) error
|
|
||||||
TTS(*pb.TTSRequest) error
|
TTS(*pb.TTSRequest) error
|
||||||
TTSStream(*pb.TTSRequest, chan []byte) error
|
TTSStream(*pb.TTSRequest, chan []byte) error
|
||||||
SoundGeneration(*pb.SoundGenerationRequest) error
|
SoundGeneration(*pb.SoundGenerationRequest) error
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user