chore: refactor endpoints to use the same inferencing path, add automatic retry mechanism in case of errors (#9029)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-03-16 21:31:02 +01:00
committed by GitHub
parent 3d9ccd1ddc
commit ee96e5e08d
9 changed files with 1263 additions and 662 deletions

104
pkg/reasoning/extractor.go Normal file
View File

@@ -0,0 +1,104 @@
package reasoning
import "strings"
// ReasoningExtractor tracks streaming reasoning extraction state, computing
// incremental deltas so callers don't need to duplicate the ~30-line
// accumulated-content / last-emitted tracking logic.
//
// Usage:
//
//	extractor := NewReasoningExtractor(thinkingStartToken, cfg)
//	// In your streaming token callback:
//	reasoningDelta, contentDelta := extractor.ProcessToken(token)
//	// After streaming completes:
//	finalReasoning := extractor.Reasoning()
//	finalContent := extractor.CleanedContent()
type ReasoningExtractor struct {
	thinkingStartToken string // opening tag that marks the start of reasoning (e.g. "<think>")
	config             Config // extraction options forwarded to ExtractReasoningWithConfig
	accumulated        string // raw concatenation of every token seen since the last Reset
	lastReasoning      string // reasoning extracted so far; baseline for the next reasoning delta
	lastCleaned        string // cleaned content extracted so far; baseline for the next content delta
	suppressReasoning  bool   // when true, ProcessToken always returns an empty reasoning delta
}
// NewReasoningExtractor returns an extractor that recognizes the given
// thinking start token and applies cfg when splitting reasoning from content.
func NewReasoningExtractor(thinkingStartToken string, cfg Config) *ReasoningExtractor {
	e := new(ReasoningExtractor)
	e.thinkingStartToken = thinkingStartToken
	e.config = cfg
	return e
}
// ProcessToken appends token to the accumulated stream, re-extracts reasoning
// and cleaned content from the whole accumulation, and returns only the
// not-yet-emitted portions (deltas) of each.
func (e *ReasoningExtractor) ProcessToken(token string) (reasoningDelta, contentDelta string) {
	e.accumulated += token
	reasoning, cleaned := ExtractReasoningWithConfig(e.accumulated, e.thinkingStartToken, e.config)

	// Reasoning delta: emit just the appended suffix when the new value
	// extends the old one; on any other non-empty change, re-emit in full.
	if reasoning != e.lastReasoning {
		switch {
		case strings.HasPrefix(reasoning, e.lastReasoning) && len(reasoning) > len(e.lastReasoning):
			reasoningDelta = reasoning[len(e.lastReasoning):]
		case reasoning != "":
			reasoningDelta = reasoning
		}
		e.lastReasoning = reasoning
	}

	// Content delta: same append-or-replace strategy as above.
	switch {
	case strings.HasPrefix(cleaned, e.lastCleaned) && len(cleaned) > len(e.lastCleaned):
		contentDelta = cleaned[len(e.lastCleaned):]
		e.lastCleaned = cleaned
	case cleaned != e.lastCleaned:
		contentDelta = cleaned
		e.lastCleaned = cleaned
	}

	// Suppression (set via ResetAndSuppressReasoning) hides the reasoning
	// delta from the caller while internal tracking above stays intact.
	if e.suppressReasoning {
		reasoningDelta = ""
	}
	return reasoningDelta, contentDelta
}
// Reasoning returns the reasoning text extracted so far (the cumulative value
// as of the most recent ProcessToken call); empty after Reset.
func (e *ReasoningExtractor) Reasoning() string {
	return e.lastReasoning
}
// CleanedContent returns the content extracted so far with reasoning stripped
// (the cumulative value as of the most recent ProcessToken call); empty after Reset.
func (e *ReasoningExtractor) CleanedContent() string {
	return e.lastCleaned
}
// Accumulated returns the raw concatenation of every token passed to
// ProcessToken since the last Reset, including any reasoning tags.
func (e *ReasoningExtractor) Accumulated() string {
	return e.accumulated
}
// Reset clears the extractor state for reuse.
//
// NOTE(review): unlike the accumulation fields, suppressReasoning is NOT
// cleared here — once ResetAndSuppressReasoning has been called, plain Reset
// leaves suppression active and there is no way to re-enable reasoning
// deltas. Confirm this asymmetry is intended by the retry flow.
func (e *ReasoningExtractor) Reset() {
	e.accumulated = ""
	e.lastReasoning = ""
	e.lastCleaned = ""
}
// ResetAndSuppressReasoning clears state and suppresses future reasoning deltas.
// ProcessToken() still extracts reasoning internally (CleanedContent works),
// but returns empty reasoningDelta — reasoning is not surfaced to the caller.
// This is used on retry after streaming: reasoning from the first attempt was
// already sent to the client; re-streaming it would cause duplicates.
func (e *ReasoningExtractor) ResetAndSuppressReasoning() {
	e.Reset()
	e.suppressReasoning = true
}
// Suppressed reports whether reasoning-delta suppression is active, i.e.
// whether ResetAndSuppressReasoning has been called on this extractor.
func (e *ReasoningExtractor) Suppressed() bool {
	return e.suppressReasoning
}

View File

@@ -0,0 +1,198 @@
package reasoning_test
import (
. "github.com/mudler/LocalAI/pkg/reasoning"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for ReasoningExtractor: streaming delta emission, suppression on
// retry, reset semantics, and edge cases (split tags, unclosed tags, empty
// tokens, disabled/strip-only reasoning modes).
var _ = Describe("ReasoningExtractor", func() {
	Context("basic streaming with <think> tags", func() {
		It("should extract reasoning and content deltas incrementally", func() {
			ext := NewReasoningExtractor("<think>", Config{})
			// Simulate tokens arriving one at a time
			tokens := []string{"<think>", "I need", " to think", "</think>", "Hello", " world"}
			var allReasoningDeltas, allContentDeltas string
			for _, tok := range tokens {
				rDelta, cDelta := ext.ProcessToken(tok)
				allReasoningDeltas += rDelta
				allContentDeltas += cDelta
			}
			// Concatenated deltas must equal the final totals — no token
			// may be dropped or emitted twice across the stream.
			Expect(ext.Reasoning()).To(Equal("I need to think"))
			Expect(ext.CleanedContent()).To(Equal("Hello world"))
			Expect(allReasoningDeltas).To(Equal("I need to think"))
			Expect(allContentDeltas).To(Equal("Hello world"))
		})
	})
	Context("no reasoning tags", func() {
		It("should pass all content through as content deltas", func() {
			ext := NewReasoningExtractor("", Config{})
			rDelta1, cDelta1 := ext.ProcessToken("Hello")
			rDelta2, cDelta2 := ext.ProcessToken(" world")
			Expect(rDelta1).To(BeEmpty())
			Expect(cDelta1).To(Equal("Hello"))
			Expect(rDelta2).To(BeEmpty())
			Expect(cDelta2).To(Equal(" world"))
			Expect(ext.Reasoning()).To(BeEmpty())
			Expect(ext.CleanedContent()).To(Equal("Hello world"))
		})
	})
	Context("unclosed thinking tags", func() {
		It("should treat content after unclosed tag as reasoning", func() {
			ext := NewReasoningExtractor("<think>", Config{})
			ext.ProcessToken("<think>")
			ext.ProcessToken("still thinking")
			// No closing tag - reasoning is extracted from unclosed tag
			Expect(ext.Reasoning()).To(Equal("still thinking"))
			Expect(ext.CleanedContent()).To(BeEmpty())
		})
	})
	Context("empty tokens", func() {
		It("should handle empty tokens gracefully", func() {
			ext := NewReasoningExtractor("", Config{})
			rDelta, cDelta := ext.ProcessToken("")
			Expect(rDelta).To(BeEmpty())
			Expect(cDelta).To(BeEmpty())
			rDelta, cDelta = ext.ProcessToken("Hello")
			Expect(rDelta).To(BeEmpty())
			Expect(cDelta).To(Equal("Hello"))
		})
	})
	Context("Reset", func() {
		It("should clear all state", func() {
			ext := NewReasoningExtractor("<think>", Config{})
			ext.ProcessToken("<think>reason</think>content")
			Expect(ext.Reasoning()).ToNot(BeEmpty())
			Expect(ext.CleanedContent()).ToNot(BeEmpty())
			ext.Reset()
			Expect(ext.Reasoning()).To(BeEmpty())
			Expect(ext.CleanedContent()).To(BeEmpty())
			Expect(ext.Accumulated()).To(BeEmpty())
		})
	})
	Context("disabled reasoning", func() {
		It("should pass all content through when reasoning is disabled", func() {
			disabled := true
			ext := NewReasoningExtractor("<think>", Config{DisableReasoning: &disabled})
			// With reasoning disabled, tags are left in the content untouched.
			rDelta, cDelta := ext.ProcessToken("<think>reason</think>content")
			Expect(rDelta).To(BeEmpty())
			Expect(cDelta).To(Equal("<think>reason</think>content"))
			Expect(ext.Reasoning()).To(BeEmpty())
		})
	})
	Context("split tags across tokens", func() {
		It("should handle tags split across multiple tokens", func() {
			ext := NewReasoningExtractor("<think>", Config{})
			// Tag arrives in pieces
			ext.ProcessToken("<thi")
			ext.ProcessToken("nk>reasoning here</thi")
			ext.ProcessToken("nk>final answer")
			Expect(ext.Reasoning()).To(Equal("reasoning here"))
			Expect(ext.CleanedContent()).To(Equal("final answer"))
		})
	})
	Context("ResetAndSuppressReasoning", func() {
		It("should suppress reasoning deltas but still extract reasoning internally", func() {
			ext := NewReasoningExtractor("<think>", Config{})
			// First pass: reasoning is emitted normally
			rDelta1, cDelta1 := ext.ProcessToken("<think>first reasoning</think>first content")
			Expect(rDelta1).To(Equal("first reasoning"))
			Expect(cDelta1).To(Equal("first content"))
			Expect(ext.Suppressed()).To(BeFalse())
			// Simulate retry: suppress reasoning
			ext.ResetAndSuppressReasoning()
			Expect(ext.Suppressed()).To(BeTrue())
			Expect(ext.Reasoning()).To(BeEmpty())
			Expect(ext.CleanedContent()).To(BeEmpty())
			Expect(ext.Accumulated()).To(BeEmpty())
			// Second pass: reasoning deltas suppressed, content still works
			rDelta2, cDelta2 := ext.ProcessToken("<think>retry reasoning</think>retry content")
			Expect(rDelta2).To(BeEmpty(), "reasoning delta should be suppressed after ResetAndSuppressReasoning")
			Expect(cDelta2).To(Equal("retry content"))
			// Internal state still tracks reasoning (for CleanedContent to work)
			Expect(ext.Reasoning()).To(Equal("retry reasoning"))
			Expect(ext.CleanedContent()).To(Equal("retry content"))
		})
		It("should suppress reasoning across multiple streaming tokens", func() {
			ext := NewReasoningExtractor("<think>", Config{})
			ext.ResetAndSuppressReasoning()
			tokens := []string{"<think>", "suppressed", " thought", "</think>", "visible", " answer"}
			var allReasoningDeltas, allContentDeltas string
			for _, tok := range tokens {
				rDelta, cDelta := ext.ProcessToken(tok)
				allReasoningDeltas += rDelta
				allContentDeltas += cDelta
			}
			Expect(allReasoningDeltas).To(BeEmpty(), "no reasoning deltas should be emitted when suppressed")
			Expect(allContentDeltas).To(Equal("visible answer"))
			Expect(ext.Reasoning()).To(Equal("suppressed thought"))
			Expect(ext.CleanedContent()).To(Equal("visible answer"))
		})
	})
	Context("Accumulated", func() {
		It("should return all raw tokens concatenated", func() {
			ext := NewReasoningExtractor("<think>", Config{})
			ext.ProcessToken("<think>reason</think>")
			ext.ProcessToken("content")
			Expect(ext.Accumulated()).To(Equal("<think>reason</think>content"))
		})
	})
	Context("with thinking start token prefill", func() {
		It("should prepend thinking token when prefill is not disabled", func() {
			ext := NewReasoningExtractor("<think>", Config{})
			// Content without explicit <think> tag - extractor should prepend it
			ext.ProcessToken("I am thinking")
			ext.ProcessToken("</think>")
			ext.ProcessToken("Answer here")
			Expect(ext.Reasoning()).To(Equal("I am thinking"))
			Expect(ext.CleanedContent()).To(Equal("Answer here"))
		})
	})
	Context("strip reasoning only", func() {
		It("should strip reasoning from content but not return it", func() {
			strip := true
			ext := NewReasoningExtractor("<think>", Config{StripReasoningOnly: &strip})
			// Reasoning is removed from content but Reasoning() stays empty.
			ext.ProcessToken("<think>secret reasoning</think>visible content")
			Expect(ext.Reasoning()).To(BeEmpty())
			Expect(ext.CleanedContent()).To(Equal("visible content"))
		})
	})
})