feat(realtime): pipeline streaming + disable_thinking config

Add a nested pipeline.streaming.{llm,tts,transcription} block plus pipeline.disable_thinking, with StreamLLM/StreamTTS/StreamTranscription/ThinkingDisabled helpers. The flags are pointer bools so that an unset value keeps the unary path; existing configs are unaffected. Wiring into the realtime handler follows.

Assisted-by: Claude:claude-opus-4-8 go vet
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-06 07:46:15 -04:00 · 2026-06-04 16:03:03 +00:00
parent e837921c2c
commit 16d7704a69
2 changed files with 87 additions and 0 deletions
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -499,6 +499,16 @@ type Pipeline struct {
 	// the pipeline's LLM without editing the LLM model config. Overrides the LLM's
 	// own reasoning_effort. Unset leaves the LLM model config in charge.
 	ReasoningEffort string `yaml:"reasoning_effort,omitempty" json:"reasoning_effort,omitempty"`
+
+	// Streaming opts each pipeline stage into incremental delivery (LLM tokens,
+	// TTS audio chunks, transcription text). Unset stages keep the blocking
+	// unary path, so existing configs are unaffected.
+	Streaming PipelineStreaming `yaml:"streaming,omitempty" json:"streaming,omitempty"`
+
+	// DisableThinking suppresses reasoning/thinking for the pipeline LLM (maps
+	// to enable_thinking=false backend metadata) without editing the underlying
+	// LLM model config. Unset leaves the LLM model config in charge.
+	DisableThinking *bool `yaml:"disable_thinking,omitempty" json:"disable_thinking,omitempty"`
 }

 // ApplyReasoningEffort resolves the effective reasoning effort — a per-request
@@ -530,6 +540,29 @@ func (c *ModelConfig) ApplyReasoningEffort(requestEffort string) {
 	}
 }

+// @Description PipelineStreaming toggles incremental delivery per realtime stage; each flag is optional and an omitted flag keeps that stage on the blocking unary path.
+type PipelineStreaming struct {
+	LLM           *bool `yaml:"llm,omitempty" json:"llm,omitempty"`
+	TTS           *bool `yaml:"tts,omitempty" json:"tts,omitempty"`
+	Transcription *bool `yaml:"transcription,omitempty" json:"transcription,omitempty"`
+}
+
+// StreamLLM reports whether streaming.llm is explicitly true; an unset (nil) or false flag yields false.
+func (p Pipeline) StreamLLM() bool { return p.Streaming.LLM != nil && *p.Streaming.LLM }
+
+// StreamTTS reports whether streaming.tts is explicitly true; an unset (nil) or false flag yields false.
+func (p Pipeline) StreamTTS() bool { return p.Streaming.TTS != nil && *p.Streaming.TTS }
+
+// StreamTranscription reports whether streaming.transcription is explicitly true; unset (nil) or false yields false.
+func (p Pipeline) StreamTranscription() bool {
+	return p.Streaming.Transcription != nil && *p.Streaming.Transcription
+}
+
+// ThinkingDisabled reports whether disable_thinking is explicitly true; unset (nil) and explicit false both yield false.
+func (p Pipeline) ThinkingDisabled() bool {
+	return p.DisableThinking != nil && *p.DisableThinking
+}
+
 // @Description File configuration for model downloads
 type File struct {
 	Filename string         `yaml:"filename,omitempty" json:"filename,omitempty"`
--- a/core/config/pipeline_streaming_test.go
+++ b/core/config/pipeline_streaming_test.go
@@ -0,0 +1,54 @@
+package config
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"gopkg.in/yaml.v3"
+)
+
+// Specs for the Pipeline streaming/thinking helpers. Each realtime stage (LLM
+// tokens, TTS audio, transcription text) and disable_thinking is opt-in, so
+// the zero value, a fully enabled YAML block, and an explicit false are each
+// checked; unset must read as false so existing configs keep the unary path.
+var _ = Describe("Pipeline streaming config", func() {
+	It("defaults every streaming + thinking helper to false when unset", func() {
+		var p Pipeline // zero value: every *bool flag is nil
+		Expect(p.StreamLLM()).To(BeFalse())
+		Expect(p.StreamTTS()).To(BeFalse())
+		Expect(p.StreamTranscription()).To(BeFalse())
+		Expect(p.ThinkingDisabled()).To(BeFalse())
+	})
+
+	It("parses the nested streaming block and disable_thinking from YAML", func() {
+		var c ModelConfig
+		err := yaml.Unmarshal([]byte(`
+name: gpt-realtime
+pipeline:
+  llm: my-llm
+  tts: my-tts
+  transcription: my-stt
+  streaming:
+    llm: true
+    tts: true
+    transcription: true
+  disable_thinking: true
+`), &c)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(c.Pipeline.StreamLLM()).To(BeTrue())
+		Expect(c.Pipeline.StreamTTS()).To(BeTrue())
+		Expect(c.Pipeline.StreamTranscription()).To(BeTrue())
+		Expect(c.Pipeline.ThinkingDisabled()).To(BeTrue())
+	})
+
+	It("treats an explicit false in the streaming block as disabled", func() {
+		var c ModelConfig
+		err := yaml.Unmarshal([]byte(`
+name: gpt-realtime
+pipeline:
+  streaming:
+    tts: false
+`), &c)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(c.Pipeline.StreamTTS()).To(BeFalse()) // flag is set but false, so streaming stays off
+	})
+})