From 7d2a762b538e342b372110015e61bd5490f5bd5b Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sun, 14 Jun 2026 18:13:09 +0200
Subject: [PATCH] feat(realtime): configurable pipeline.max_history_items
 (#10331)

Composed realtime pipelines (VAD+STT+LLM+TTS) defaulted to unlimited history,
so a long-running session grew with every turn and fed the whole conversation
to the LLM until its context window filled. Add an optional
pipeline.max_history_items that caps how many trailing items are sent per turn;
an explicit value (including 0 = unlimited) wins over the per-model-type
default. Self-contained any-to-any models keep their 6-item default.

Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 core/config/model_config.go                   |  7 ++++++
 core/http/endpoints/openai/realtime.go        | 13 ++++++++++-
 .../endpoints/openai/realtime_gate_test.go    | 23 +++++++++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/core/config/model_config.go b/core/config/model_config.go
index 195739654..755280cc3 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -510,6 +510,13 @@ type Pipeline struct {
 	// LLM model config. Unset leaves the LLM model config in charge.
 	DisableThinking *bool `yaml:"disable_thinking,omitempty" json:"disable_thinking,omitempty"`
 
+	// MaxHistoryItems caps how many trailing conversation items are fed to the
+	// LLM each realtime turn (0 = unlimited, rely on the LLM's context window).
+	// Unset (nil) uses the per-model-type default. Set it on a composed pipeline
+	// (VAD+STT+LLM+TTS) so a long-running session doesn't grow until the LLM's
+	// context fills.
+	MaxHistoryItems *int `yaml:"max_history_items,omitempty" json:"max_history_items,omitempty"`
+
 	// VoiceRecognition gates the pipeline behind speaker verification. Nil
 	// (block absent) means no gate, preserving existing behavior.
 	VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index f626a895c..343ef4c07 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -340,6 +340,17 @@ func defaultMaxHistoryItems(cfg *config.ModelConfig) int {
 	return 0
 }
 
+// resolveMaxHistoryItems picks the history cap for a realtime session. An
+// explicit pipeline.max_history_items is returned as-is (0 included, with no
+// range validation), so operators can bound a composed VAD+STT+LLM+TTS
+// pipeline; a nil cfg or unset field yields defaultMaxHistoryItems(cfg).
+func resolveMaxHistoryItems(cfg *config.ModelConfig) int {
+	if cfg == nil || cfg.Pipeline.MaxHistoryItems == nil {
+		return defaultMaxHistoryItems(cfg)
+	}
+	return *cfg.Pipeline.MaxHistoryItems
+}
+
 // trimRealtimeItems returns the tail of items capped at maxItems (0 = no cap).
 // Walks backwards keeping function_call + function_call_output pairs together
 // so we never feed the LLM an orphaned tool result that references a call it
@@ -492,7 +503,7 @@ func runRealtimeSession(application *application.Application, t Transport, model
 		Conversations:    make(map[string]*Conversation),
 		InputSampleRate:  defaultRemoteSampleRate,
 		OutputSampleRate: defaultRemoteSampleRate,
-		MaxHistoryItems:  defaultMaxHistoryItems(cfg),
+		MaxHistoryItems:  resolveMaxHistoryItems(cfg),
 	}
 
 	// Create a default conversation
diff --git a/core/http/endpoints/openai/realtime_gate_test.go b/core/http/endpoints/openai/realtime_gate_test.go
index e49eb71eb..0b86e7f1f 100644
--- a/core/http/endpoints/openai/realtime_gate_test.go
+++ b/core/http/endpoints/openai/realtime_gate_test.go
@@ -107,6 +107,29 @@ var _ = Describe("defaultMaxHistoryItems", func() {
 	})
 })
 
+var _ = Describe("resolveMaxHistoryItems", func() {
+	// intPtr builds the *int that the optional MaxHistoryItems field expects.
+	intPtr := func(n int) *int { return &n }
+
+	It("uses an explicit pipeline.max_history_items", func() {
+		explicit := config.Pipeline{LLM: "llama", MaxHistoryItems: intPtr(10)}
+		Expect(resolveMaxHistoryItems(&config.ModelConfig{Pipeline: explicit})).To(Equal(10))
+	})
+	It("honors an explicit 0 (unlimited) over the type default", func() {
+		unlimited := config.Pipeline{MaxHistoryItems: intPtr(0)}
+		usecases := withUsecases(config.FLAG_REALTIME_AUDIO)
+		got := resolveMaxHistoryItems(&config.ModelConfig{KnownUsecases: usecases, Pipeline: unlimited})
+		Expect(got).To(Equal(0))
+	})
+	It("falls back to the type default when unset", func() {
+		usecases := withUsecases(config.FLAG_REALTIME_AUDIO)
+		Expect(resolveMaxHistoryItems(&config.ModelConfig{KnownUsecases: usecases})).To(Equal(6))
+	})
+	It("tolerates nil", func() {
+		Expect(resolveMaxHistoryItems(nil)).To(Equal(0))
+	})
+})
+
 var _ = Describe("trimRealtimeItems", func() {
 	user := func(id string) *types.MessageItemUnion {
 		return &types.MessageItemUnion{User: &types.MessageItemUser{ID: id}}