feat(openresponses): Support reasoning blocks (#8133)

* feat(openresponses): support reasoning blocks

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Allow disabling reasoning, refactor common logic

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add option to only strip reasoning

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add configurations for custom reasoning tokens

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Ettore Di Giacinto
2026-01-21 00:11:45 +01:00
committed by GitHub
parent 34e054f607
commit c491c6ca90
7 changed files with 1289 additions and 155 deletions


@@ -47,7 +47,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
} else {
template = s
}
-thinkingStartToken := reason.DetectThinkingStartToken(template)
+thinkingStartToken := reason.DetectThinkingStartToken(template, &config.ReasoningConfig)
// Track accumulated content for reasoning extraction
accumulatedContent := ""
@@ -56,12 +56,8 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
_, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
accumulatedContent += s
-content := accumulatedContent
-// Prepend thinking token if needed, then extract reasoning
-if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
-content = reason.PrependThinkingTokenIfNeeded(content, thinkingStartToken)
-}
-currentReasoning, cleanedContent := reason.ExtractReasoning(content)
+currentReasoning, cleanedContent := reason.ExtractReasoningWithConfig(accumulatedContent, thinkingStartToken, config.ReasoningConfig)
// Calculate new reasoning delta (what we haven't emitted yet)
var reasoningDelta *string
@@ -140,7 +136,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
} else {
template = prompt
}
-thinkingStartToken := reason.DetectThinkingStartToken(template)
+thinkingStartToken := reason.DetectThinkingStartToken(template, &config.ReasoningConfig)
result := ""
lastEmittedCount := 0
@@ -254,12 +250,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
return err
}
-// Prepend thinking token if needed, then extract reasoning before processing tool calls
-resultWithToken := result
-if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
-resultWithToken = reason.PrependThinkingTokenIfNeeded(result, thinkingStartToken)
-}
-reasoning, cleanedResult := reason.ExtractReasoning(resultWithToken)
-result = cleanedResult
+reasoning, result := reason.ExtractReasoningWithConfig(result, thinkingStartToken, config.ReasoningConfig)
textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig)
result = functions.CleanupLLMResult(result, config.FunctionsConfig)
@@ -652,18 +643,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
} else {
template = predInput
}
-thinkingStartToken := reason.DetectThinkingStartToken(template)
+thinkingStartToken := reason.DetectThinkingStartToken(template, &config.ReasoningConfig)
xlog.Debug("Thinking start token", "thinkingStartToken", thinkingStartToken, "template", template)
tokenCallback := func(s string, c *[]schema.Choice) {
-// Prepend thinking token if needed, then extract reasoning from the response
-sWithToken := s
-if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
-sWithToken = reason.PrependThinkingTokenIfNeeded(s, thinkingStartToken)
-}
-reasoning, cleanedS := reason.ExtractReasoning(sWithToken)
-s = cleanedS
+reasoning, s := reason.ExtractReasoningWithConfig(s, thinkingStartToken, config.ReasoningConfig)
if !shouldUseFn {
// no function is called, just reply and use stop as finish reason


@@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"net"
"strings"
"time"
"github.com/google/uuid"
@@ -18,6 +19,7 @@ import (
"github.com/mudler/LocalAI/core/templates"
"github.com/mudler/LocalAI/pkg/functions"
"github.com/mudler/LocalAI/pkg/model"
reason "github.com/mudler/LocalAI/pkg/reasoning"
"github.com/mudler/LocalAI/pkg/utils"
"github.com/mudler/cogito"
"github.com/mudler/xlog"
@@ -1330,13 +1332,37 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
result := backend.Finetune(*cfg, predInput, prediction.Response)
xlog.Debug("Open Responses - Raw model result", "result", result, "shouldUseFn", shouldUseFn)
// Detect if thinking token is already in prompt or template
var template string
if cfg.TemplateConfig.UseTokenizerTemplate {
template = cfg.GetModelTemplate()
} else {
template = predInput
}
thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig)
// Extract reasoning from result before cleaning
reasoningContent, cleanedResult := reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
// Parse tool calls if using functions
var outputItems []schema.ORItemField
var toolCalls []schema.ToolCall
// Add reasoning item if reasoning was found (reasoning comes first per spec)
if reasoningContent != "" {
reasoningItem := schema.ORItemField{
Type: "reasoning",
ID: fmt.Sprintf("reasoning_%s", uuid.New().String()),
Status: "completed",
Content: []schema.ORContentPart{makeOutputTextPart(reasoningContent)},
}
outputItems = append(outputItems, reasoningItem)
xlog.Debug("Open Responses - Extracted reasoning", "reasoning_length", len(reasoningContent))
}
if shouldUseFn {
-// Clean up the result first (handle reasoning tags, etc.)
-cleanedResult := functions.CleanupLLMResult(result, cfg.FunctionsConfig)
+// Clean up the result (already extracted reasoning above)
+cleanedResult = functions.CleanupLLMResult(cleanedResult, cfg.FunctionsConfig)
xlog.Debug("Open Responses - Cleaned result", "cleanedResult", cleanedResult)
funcCallResults := functions.ParseFunctionCall(cleanedResult, cfg.FunctionsConfig)
@@ -1398,28 +1424,46 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
})
}
-// If we have no output items but the model did produce output, include the raw result as a message
+// If we have no output items but the model did produce output, include the cleaned result as a message
// This handles cases where the function call parsing failed but we still have model output
-if len(outputItems) == 0 && result != "" {
-xlog.Debug("Open Responses - No parsed output, falling back to raw result")
+// Note: reasoning item may already be added above
+hasMessageItem := false
+for _, item := range outputItems {
+if item.Type == "message" {
+hasMessageItem = true
+break
+}
+}
+if !hasMessageItem && cleanedResult != "" {
+xlog.Debug("Open Responses - No parsed output, falling back to cleaned result")
outputItems = append(outputItems, schema.ORItemField{
Type: "message",
ID: fmt.Sprintf("msg_%s", uuid.New().String()),
Status: "completed",
Role: "assistant",
-Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(result, prediction.Logprobs)},
+Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(cleanedResult, prediction.Logprobs)},
})
}
} else {
// Simple text response (include logprobs if available)
-outputItems = []schema.ORItemField{
-{
-Type: "message",
-ID: fmt.Sprintf("msg_%s", uuid.New().String()),
-Status: "completed",
-Role: "assistant",
-Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(result, prediction.Logprobs)},
-},
+// Note: reasoning item may already be added above
+messageItem := schema.ORItemField{
+Type: "message",
+ID: fmt.Sprintf("msg_%s", uuid.New().String()),
+Status: "completed",
+Role: "assistant",
+Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(cleanedResult, prediction.Logprobs)},
+}
+outputItems = append(outputItems, messageItem)
}
// Calculate reasoning tokens (approximate: character count / 4)
reasoningTokens := 0
if reasoningContent != "" {
// Simple estimation: ~4 characters per token
reasoningTokens = len(reasoningContent) / 4
if reasoningTokens == 0 && len(reasoningContent) > 0 {
reasoningTokens = 1
}
}
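The usage accounting above relies on a character-count heuristic rather than a tokenizer call. Pulled out on its own (hypothetical helper name, same arithmetic as the diff):

```go
// estimateReasoningTokens approximates the token count of extracted reasoning
// for usage reporting: roughly four characters per token, with any non-empty
// reasoning rounded up to at least one token.
func estimateReasoningTokens(reasoning string) int {
	if reasoning == "" {
		return 0
	}
	if tokens := len(reasoning) / 4; tokens > 0 {
		return tokens
	}
	return 1 // short but non-empty reasoning still counts as one token
}
```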
@@ -1429,6 +1473,9 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
InputTokens: prediction.Usage.Prompt,
OutputTokens: prediction.Usage.Completion,
TotalTokens: prediction.Usage.Prompt + prediction.Usage.Completion,
OutputTokensDetails: &schema.OROutputTokensDetails{
ReasoningTokens: reasoningTokens,
},
}, shouldStore)
// Store response for future reference (if enabled)
@@ -1484,6 +1531,15 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
}
}
// Detect if thinking token is already in prompt or template
var template string
if cfg.TemplateConfig.UseTokenizerTemplate {
template = cfg.GetModelTemplate()
} else {
template = predInput
}
thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig)
// Track state for streaming
var currentMessageID string
var currentContentIndex int
@@ -1492,6 +1548,14 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
outputIndex := 0
inToolCallMode := false
// Track reasoning state for streaming
var currentReasoningID string
var currentReasoningContentIndex int
var accumulatedContent string
var lastEmittedReasoning string
var lastEmittedCleanedContent string
var reasoningTokens int
// Collect all output items for storage
var collectedOutputItems []schema.ORItemField
@@ -1646,52 +1710,133 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
return true
}
-// If no tool calls detected yet, emit text delta
+// If no tool calls detected yet, handle reasoning and text
if !inToolCallMode {
-if currentMessageID == "" {
-// Emit output_item.added for message
-currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
-messageItem := &schema.ORItemField{
-Type: "message",
-ID: currentMessageID,
-Status: "in_progress",
-Role: "assistant",
-Content: []schema.ORContentPart{},
-}
-sendSSEEvent(c, &schema.ORStreamEvent{
-Type: "response.output_item.added",
-SequenceNumber: sequenceNumber,
-OutputIndex: &outputIndex,
-Item: messageItem,
-})
-sequenceNumber++
+accumulatedContent += token
+currentReasoning, cleanedContent := reason.ExtractReasoningWithConfig(accumulatedContent, thinkingStartToken, cfg.ReasoningConfig)
-// Emit content_part.added
-currentContentIndex = 0
-emptyPart := makeOutputTextPart("")
// Handle reasoning item
if currentReasoning != "" {
// Check if we need to create reasoning item
if currentReasoningID == "" {
outputIndex++
currentReasoningID = fmt.Sprintf("reasoning_%s", uuid.New().String())
reasoningItem := &schema.ORItemField{
Type: "reasoning",
ID: currentReasoningID,
Status: "in_progress",
}
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.output_item.added",
SequenceNumber: sequenceNumber,
OutputIndex: &outputIndex,
Item: reasoningItem,
})
sequenceNumber++
// Emit content_part.added for reasoning
currentReasoningContentIndex = 0
emptyPart := makeOutputTextPart("")
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.content_part.added",
SequenceNumber: sequenceNumber,
ItemID: currentReasoningID,
OutputIndex: &outputIndex,
ContentIndex: &currentReasoningContentIndex,
Part: &emptyPart,
})
sequenceNumber++
}
// Calculate reasoning delta
var reasoningDelta string
if len(currentReasoning) > len(lastEmittedReasoning) && strings.HasPrefix(currentReasoning, lastEmittedReasoning) {
reasoningDelta = currentReasoning[len(lastEmittedReasoning):]
lastEmittedReasoning = currentReasoning
} else if currentReasoning != lastEmittedReasoning {
reasoningDelta = currentReasoning
lastEmittedReasoning = currentReasoning
}
// Emit reasoning delta if there's new content
if reasoningDelta != "" {
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.output_text.delta",
SequenceNumber: sequenceNumber,
ItemID: currentReasoningID,
OutputIndex: &outputIndex,
ContentIndex: &currentReasoningContentIndex,
Delta: strPtr(reasoningDelta),
Logprobs: emptyLogprobs(),
})
sequenceNumber++
c.Response().Flush()
}
}
// Handle message content (cleaned content without reasoning tags)
var deltaContent string
if len(cleanedContent) > len(lastEmittedCleanedContent) && strings.HasPrefix(cleanedContent, lastEmittedCleanedContent) {
deltaContent = cleanedContent[len(lastEmittedCleanedContent):]
lastEmittedCleanedContent = cleanedContent
} else if cleanedContent != lastEmittedCleanedContent {
if lastEmittedCleanedContent == "" {
deltaContent = cleanedContent
lastEmittedCleanedContent = cleanedContent
} else {
deltaContent = cleanedContent
lastEmittedCleanedContent = cleanedContent
}
}
// Only emit message content if there's actual content (not just reasoning)
if deltaContent != "" {
if currentMessageID == "" {
// Emit output_item.added for message
outputIndex++
currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
messageItem := &schema.ORItemField{
Type: "message",
ID: currentMessageID,
Status: "in_progress",
Role: "assistant",
Content: []schema.ORContentPart{},
}
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.output_item.added",
SequenceNumber: sequenceNumber,
OutputIndex: &outputIndex,
Item: messageItem,
})
sequenceNumber++
// Emit content_part.added
currentContentIndex = 0
emptyPart := makeOutputTextPart("")
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.content_part.added",
SequenceNumber: sequenceNumber,
ItemID: currentMessageID,
OutputIndex: &outputIndex,
ContentIndex: &currentContentIndex,
Part: &emptyPart,
})
sequenceNumber++
}
// Emit text delta
sendSSEEvent(c, &schema.ORStreamEvent{
-Type: "response.content_part.added",
+Type: "response.output_text.delta",
SequenceNumber: sequenceNumber,
ItemID: currentMessageID,
OutputIndex: &outputIndex,
ContentIndex: &currentContentIndex,
-Part: &emptyPart,
+Delta: strPtr(deltaContent),
+Logprobs: emptyLogprobs(),
})
sequenceNumber++
c.Response().Flush()
}
-// Emit text delta
-sendSSEEvent(c, &schema.ORStreamEvent{
-Type: "response.output_text.delta",
-SequenceNumber: sequenceNumber,
-ItemID: currentMessageID,
-OutputIndex: &outputIndex,
-ContentIndex: &currentContentIndex,
-Delta: strPtr(token),
-Logprobs: emptyLogprobs(),
-})
-sequenceNumber++
-c.Response().Flush()
}
return true
}
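Both streaming callbacks compute deltas the same way: re-extract over the whole accumulated buffer on every token, then emit only the suffix when the new snapshot extends what was already sent, or re-send the whole snapshot when extraction rewrote earlier text (which can happen once a closing tag finally arrives). Distilled into a helper (hypothetical name; assumes `strings` is imported):

```go
// computeDelta decides what to stream given the last emitted snapshot and the
// freshly re-extracted one. It returns the delta to send and the new snapshot
// to remember; an empty delta means nothing changed this token.
func computeDelta(lastEmitted, current string) (delta, remembered string) {
	if len(current) > len(lastEmitted) && strings.HasPrefix(current, lastEmitted) {
		return current[len(lastEmitted):], current // plain append: send suffix only
	}
	if current != lastEmitted {
		return current, current // snapshot was rewritten: resend everything
	}
	return "", lastEmitted // no change
}
```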
@@ -1754,7 +1899,62 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
}
result := backend.Finetune(*cfg, predInput, prediction.Response)
-cleanedResult := functions.CleanupLLMResult(result, cfg.FunctionsConfig)
// Extract reasoning from final result
finalReasoning, finalCleanedResult := reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
// Close reasoning item if it exists and wasn't closed yet
if currentReasoningID != "" && finalReasoning != "" {
// Emit output_text.done for reasoning
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.output_text.done",
SequenceNumber: sequenceNumber,
ItemID: currentReasoningID,
OutputIndex: &outputIndex,
ContentIndex: &currentReasoningContentIndex,
Text: strPtr(finalReasoning),
Logprobs: emptyLogprobs(),
})
sequenceNumber++
// Emit content_part.done for reasoning
reasoningPart := makeOutputTextPart(finalReasoning)
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.content_part.done",
SequenceNumber: sequenceNumber,
ItemID: currentReasoningID,
OutputIndex: &outputIndex,
ContentIndex: &currentReasoningContentIndex,
Part: &reasoningPart,
})
sequenceNumber++
// Emit output_item.done for reasoning
reasoningItem := &schema.ORItemField{
Type: "reasoning",
ID: currentReasoningID,
Status: "completed",
Content: []schema.ORContentPart{reasoningPart},
}
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.output_item.done",
SequenceNumber: sequenceNumber,
OutputIndex: &outputIndex,
Item: reasoningItem,
})
sequenceNumber++
// Collect reasoning item for storage
collectedOutputItems = append(collectedOutputItems, *reasoningItem)
// Calculate reasoning tokens
reasoningTokens = len(finalReasoning) / 4
if reasoningTokens == 0 && len(finalReasoning) > 0 {
reasoningTokens = 1
}
}
+cleanedResult := functions.CleanupLLMResult(finalCleanedResult, cfg.FunctionsConfig)
xlog.Debug("Open Responses Stream - Cleaned result", "cleanedResult", cleanedResult)
parsedToolCalls := functions.ParseFunctionCall(cleanedResult, cfg.FunctionsConfig)
@@ -1789,10 +1989,10 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
// Convert prediction logprobs for streaming events
streamEventLogprobs := convertLogprobsForStreaming(prediction.Logprobs)
-// If we have no output but the model did produce something, use the raw result
-if textContent == "" && len(toolCalls) == 0 && result != "" {
-xlog.Debug("Open Responses Stream - No parsed output, using raw result")
-textContent = result
+// If we have no output but the model did produce something, use the cleaned result (without reasoning tags)
+if textContent == "" && len(toolCalls) == 0 && finalCleanedResult != "" {
+xlog.Debug("Open Responses Stream - No parsed output, using cleaned result")
+textContent = finalCleanedResult
}
// Close message if we have text content
@@ -1875,8 +2075,18 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
collectedOutputItems = append(collectedOutputItems, *functionCallItem)
}
-// Build final response with all items (include logprobs)
+// Build final response with all items (include reasoning first, then messages, then tool calls)
var allOutputItems []schema.ORItemField
// Add reasoning item if it exists
if currentReasoningID != "" && finalReasoning != "" {
allOutputItems = append(allOutputItems, schema.ORItemField{
Type: "reasoning",
ID: currentReasoningID,
Status: "completed",
Content: []schema.ORContentPart{makeOutputTextPart(finalReasoning)},
})
}
// Add message item
if currentMessageID != "" && textContent != "" {
allOutputItems = append(allOutputItems, schema.ORItemField{
Type: "message",
@@ -1886,6 +2096,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(textContent, prediction.Logprobs)},
})
}
// Add tool call items
for _, tc := range toolCalls {
toolCallID := fmt.Sprintf("fc_%s", uuid.New().String())
allOutputItems = append(allOutputItems, schema.ORItemField{
@@ -1904,6 +2115,9 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
InputTokens: prediction.Usage.Prompt,
OutputTokens: prediction.Usage.Completion,
TotalTokens: prediction.Usage.Prompt + prediction.Usage.Completion,
OutputTokensDetails: &schema.OROutputTokensDetails{
ReasoningTokens: reasoningTokens,
},
}, shouldStore)
sendSSEEvent(c, &schema.ORStreamEvent{
@@ -1956,22 +2170,102 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
})
sequenceNumber++
-// Stream text deltas
+// Stream text deltas with reasoning extraction
tokenCallback := func(token string, tokenUsage backend.TokenUsage) bool {
-accumulatedText += token
+accumulatedContent += token
+// Prepend thinking token if needed, then extract reasoning
+currentReasoning, cleanedContent := reason.ExtractReasoningWithConfig(accumulatedContent, thinkingStartToken, cfg.ReasoningConfig)
-// Emit text delta
-sendSSEEvent(c, &schema.ORStreamEvent{
-Type: "response.output_text.delta",
-SequenceNumber: sequenceNumber,
-ItemID: currentMessageID,
-OutputIndex: &outputIndex,
-ContentIndex: &currentContentIndex,
-Delta: strPtr(token),
-Logprobs: emptyLogprobs(),
-})
-sequenceNumber++
-c.Response().Flush()
// Handle reasoning item
if currentReasoning != "" {
// Check if we need to create reasoning item
if currentReasoningID == "" {
outputIndex++
currentReasoningID = fmt.Sprintf("reasoning_%s", uuid.New().String())
reasoningItem := &schema.ORItemField{
Type: "reasoning",
ID: currentReasoningID,
Status: "in_progress",
}
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.output_item.added",
SequenceNumber: sequenceNumber,
OutputIndex: &outputIndex,
Item: reasoningItem,
})
sequenceNumber++
// Emit content_part.added for reasoning
currentReasoningContentIndex = 0
emptyPart := makeOutputTextPart("")
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.content_part.added",
SequenceNumber: sequenceNumber,
ItemID: currentReasoningID,
OutputIndex: &outputIndex,
ContentIndex: &currentReasoningContentIndex,
Part: &emptyPart,
})
sequenceNumber++
}
// Calculate reasoning delta
var reasoningDelta string
if len(currentReasoning) > len(lastEmittedReasoning) && strings.HasPrefix(currentReasoning, lastEmittedReasoning) {
reasoningDelta = currentReasoning[len(lastEmittedReasoning):]
lastEmittedReasoning = currentReasoning
} else if currentReasoning != lastEmittedReasoning {
reasoningDelta = currentReasoning
lastEmittedReasoning = currentReasoning
}
// Emit reasoning delta if there's new content
if reasoningDelta != "" {
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.output_text.delta",
SequenceNumber: sequenceNumber,
ItemID: currentReasoningID,
OutputIndex: &outputIndex,
ContentIndex: &currentReasoningContentIndex,
Delta: strPtr(reasoningDelta),
Logprobs: emptyLogprobs(),
})
sequenceNumber++
c.Response().Flush()
}
}
// Handle message content (cleaned content without reasoning tags)
var deltaContent string
if len(cleanedContent) > len(lastEmittedCleanedContent) && strings.HasPrefix(cleanedContent, lastEmittedCleanedContent) {
deltaContent = cleanedContent[len(lastEmittedCleanedContent):]
lastEmittedCleanedContent = cleanedContent
} else if cleanedContent != lastEmittedCleanedContent {
if lastEmittedCleanedContent == "" {
deltaContent = cleanedContent
lastEmittedCleanedContent = cleanedContent
} else {
deltaContent = cleanedContent
lastEmittedCleanedContent = cleanedContent
}
}
// Only emit message content if there's actual content (not just reasoning)
if deltaContent != "" {
// Emit text delta
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.output_text.delta",
SequenceNumber: sequenceNumber,
ItemID: currentMessageID,
OutputIndex: &outputIndex,
ContentIndex: &currentContentIndex,
Delta: strPtr(deltaContent),
Logprobs: emptyLogprobs(),
})
sequenceNumber++
c.Response().Flush()
}
return true
}
@@ -2034,6 +2328,62 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
result := backend.Finetune(*cfg, predInput, prediction.Response)
// Extract reasoning from final result for non-tool-call path
finalReasoning, finalCleanedResult := reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
// Close reasoning item if it exists and wasn't closed yet
if currentReasoningID != "" && finalReasoning != "" {
// Emit output_text.done for reasoning
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.output_text.done",
SequenceNumber: sequenceNumber,
ItemID: currentReasoningID,
OutputIndex: &outputIndex,
ContentIndex: &currentReasoningContentIndex,
Text: strPtr(finalReasoning),
Logprobs: emptyLogprobs(),
})
sequenceNumber++
// Emit content_part.done for reasoning
reasoningPart := makeOutputTextPart(finalReasoning)
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.content_part.done",
SequenceNumber: sequenceNumber,
ItemID: currentReasoningID,
OutputIndex: &outputIndex,
ContentIndex: &currentReasoningContentIndex,
Part: &reasoningPart,
})
sequenceNumber++
// Emit output_item.done for reasoning
reasoningItem := &schema.ORItemField{
Type: "reasoning",
ID: currentReasoningID,
Status: "completed",
Content: []schema.ORContentPart{reasoningPart},
}
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.output_item.done",
SequenceNumber: sequenceNumber,
OutputIndex: &outputIndex,
Item: reasoningItem,
})
sequenceNumber++
// Collect reasoning item for storage
collectedOutputItems = append(collectedOutputItems, *reasoningItem)
// Calculate reasoning tokens
reasoningTokens = len(finalReasoning) / 4
if reasoningTokens == 0 && len(finalReasoning) > 0 {
reasoningTokens = 1
}
}
result = finalCleanedResult
// Convert prediction logprobs for streaming events
mcpStreamLogprobs := convertLogprobsForStreaming(prediction.Logprobs)
@@ -2075,17 +2425,35 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
// Emit response.completed
now := time.Now().Unix()
-// Collect final output items (use collected items if available, otherwise use messageItem)
+// Collect final output items (reasoning first, then message)
var finalOutputItems []schema.ORItemField
// Add reasoning item if it exists
if currentReasoningID != "" && finalReasoning != "" {
finalOutputItems = append(finalOutputItems, schema.ORItemField{
Type: "reasoning",
ID: currentReasoningID,
Status: "completed",
Content: []schema.ORContentPart{makeOutputTextPart(finalReasoning)},
})
}
// Add message item
if len(collectedOutputItems) > 0 {
-finalOutputItems = collectedOutputItems
+// Use collected items (may include reasoning already)
+for _, item := range collectedOutputItems {
+if item.Type == "message" {
+finalOutputItems = append(finalOutputItems, item)
+}
+}
} else {
-finalOutputItems = []schema.ORItemField{*messageItem}
+finalOutputItems = append(finalOutputItems, *messageItem)
}
responseCompleted := buildORResponse(responseID, createdAt, &now, "completed", input, finalOutputItems, &schema.ORUsage{
InputTokens: prediction.Usage.Prompt,
OutputTokens: prediction.Usage.Completion,
TotalTokens: prediction.Usage.Prompt + prediction.Usage.Completion,
OutputTokensDetails: &schema.OROutputTokensDetails{
ReasoningTokens: reasoningTokens,
},
}, shouldStore)
sendSSEEvent(c, &schema.ORStreamEvent{
Type: "response.completed",


@@ -93,7 +93,12 @@ type ORItemParam struct {
// Function call output fields
Output interface{} `json:"output,omitempty"` // string or []ORContentPart
// Reasoning fields (for type == "reasoning")
Summary []ORContentPart `json:"summary,omitempty"` // Array of summary parts
EncryptedContent *string `json:"encrypted_content,omitempty"` // Provider-specific encrypted content
// Note: For item_reference type, use the ID field above to reference the item
// Note: For reasoning type, Content field (from message fields) contains the raw reasoning content
}
// ORContentPart represents a content block (discriminated union by type)


@@ -397,6 +397,83 @@ Agent/autonomous agent configuration:
| `agent.enable_mcp_prompts` | bool | Enable MCP prompts |
| `agent.enable_plan_re_evaluator` | bool | Enable plan re-evaluation |
## Reasoning Configuration
Configure how reasoning tags are extracted and processed from model output. Reasoning tags are used by models like DeepSeek, Command-R, and others to include internal reasoning steps in their responses.
| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `reasoning.disable` | bool | `false` | When `true`, disables reasoning extraction entirely. The original content is returned without any processing. |
| `reasoning.disable_reasoning_tag_prefill` | bool | `false` | When `true`, disables automatic prepending of thinking start tokens. Use this when your model already includes reasoning tags in its output format. |
| `reasoning.strip_reasoning_only` | bool | `false` | When `true`, extracts and removes reasoning tags from content but discards the reasoning text. Useful when you want to clean reasoning tags from output without storing the reasoning content. |
| `reasoning.thinking_start_tokens` | array | `[]` | List of custom thinking start tokens to detect in prompts. Custom tokens are checked before default tokens. |
| `reasoning.tag_pairs` | array | `[]` | List of custom tag pairs for reasoning extraction. Each entry has `start` and `end` fields. Custom pairs are checked before default pairs. |
### Reasoning Tag Formats
The reasoning extraction supports multiple tag formats used by different models:
- `<thinking>...</thinking>` - General thinking tag
- `<think>...</think>` - DeepSeek, Granite, ExaOne, GLM models
- `<|START_THINKING|>...<|END_THINKING|>` - Command-R models
- `<|inner_prefix|>...<|inner_suffix|>` - Apertus models
- `<seed:think>...</seed:think>` - Seed models
- `<|think|>...<|end|><|begin|>assistant<|content|>` - Solar Open models
- `[THINK]...[/THINK]` - Magistral models
### Examples
**Disable reasoning extraction:**
```yaml
reasoning:
  disable: true
```
**Extract reasoning but don't prepend tags:**
```yaml
reasoning:
  disable_reasoning_tag_prefill: true
```
**Strip reasoning tags without storing reasoning content:**
```yaml
reasoning:
  strip_reasoning_only: true
```
**Complete example with reasoning configuration:**
```yaml
name: deepseek-model
backend: llama-cpp
parameters:
  model: deepseek.gguf
reasoning:
  disable: false
  disable_reasoning_tag_prefill: false
  strip_reasoning_only: false
```
**Example with custom tokens and tag pairs:**
```yaml
name: custom-reasoning-model
backend: llama-cpp
parameters:
  model: custom.gguf
reasoning:
  thinking_start_tokens:
    - "<custom:think>"
    - "<my:reasoning>"
  tag_pairs:
    - start: "<custom:think>"
      end: "</custom:think>"
    - start: "<my:reasoning>"
      end: "</my:reasoning>"
```
**Note:** Custom tokens and tag pairs are checked before the default ones, giving them priority. This allows you to override default behavior or add support for new reasoning tag formats.
## Pipeline Configuration
Define pipelines for audio-to-audio processing:


@@ -1,5 +1,15 @@
package reasoning
-type Config struct {
-DisableReasoningTagPrefill *bool `yaml:"disable_reasoning_tag_prefill,omitempty" json:"disable_reasoning_tag_prefill,omitempty"`
+// TagPair represents a start/end tag pair for reasoning extraction
+type TagPair struct {
+Start string `yaml:"start" json:"start"`
+End string `yaml:"end" json:"end"`
+}
+type Config struct {
+DisableReasoningTagPrefill *bool `yaml:"disable_reasoning_tag_prefill,omitempty" json:"disable_reasoning_tag_prefill,omitempty"`
+DisableReasoning *bool `yaml:"disable,omitempty" json:"disable,omitempty"`
+StripReasoningOnly *bool `yaml:"strip_reasoning_only,omitempty" json:"strip_reasoning_only,omitempty"`
+ThinkingStartTokens []string `yaml:"thinking_start_tokens,omitempty" json:"thinking_start_tokens,omitempty"`
+TagPairs []TagPair `yaml:"tag_pairs,omitempty" json:"tag_pairs,omitempty"`
+}
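Every boolean here is a `*bool` so that an absent YAML key (nil, meaning "use the default") can be told apart from an explicit `false`. A construction sketch; `boolPtr` is a hypothetical helper, since Go cannot take the address of a literal:

```go
func boolPtr(b bool) *bool { return &b }

// Programmatic equivalent of:
//   reasoning:
//     strip_reasoning_only: true
//     tag_pairs:
//       - start: "<custom:think>"
//         end: "</custom:think>"
cfg := Config{
	StripReasoningOnly: boolPtr(true),
	TagPairs: []TagPair{
		{Start: "<custom:think>", End: "</custom:think>"},
	},
}
_ = cfg
```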


@@ -17,12 +17,12 @@ import (
// - <think> (DeepSeek, Granite, ExaOne models)
// - <|think|> (Solar Open models)
// - <thinking> (General thinking tag)
// - <think> (GLM models)
// - [THINK] (Magistral models)
-func DetectThinkingStartToken(prompt string) string {
+// Custom tokens from config are checked first, then default tokens.
+func DetectThinkingStartToken(prompt string, config *Config) string {
// Common thinking start tokens (in order of specificity - longer first)
// Based on llama.cpp's chat-parser.cpp implementations
-thinkingStartTokens := []string{
+defaultTokens := []string{
"<|START_THINKING|>", // Command-R models
"<|inner_prefix|>", // Apertus models
"<seed:think>", // Seed models
@@ -32,6 +32,13 @@ func DetectThinkingStartToken(prompt string) string {
"[THINK]", // Magistral models
}
// Merge custom tokens with default tokens (custom tokens first for priority)
var thinkingStartTokens []string
if config != nil && len(config.ThinkingStartTokens) > 0 {
thinkingStartTokens = append(thinkingStartTokens, config.ThinkingStartTokens...)
}
thinkingStartTokens = append(thinkingStartTokens, defaultTokens...)
// Check if prompt ends with any of these tokens (allowing for trailing whitespace/newlines)
trimmedPrompt := strings.TrimRight(prompt, " \t\n\r")
for _, token := range thinkingStartTokens {
@@ -58,6 +65,28 @@ func DetectThinkingStartToken(prompt string) string {
return ""
}
// ExtractReasoningWithConfig extracts reasoning from content with the given config.
// If reasoning is disabled, it returns the original content.
// If thinking start token prefill is enabled, it prepends the thinking start token to the content.
// It returns the extracted reasoning and the cleaned content.
func ExtractReasoningWithConfig(content, thinkingStartToken string, config Config) (reasoning string, cleanedContent string) {
cleanedContent = content
// If reasoning is not disabled, prepend the thinking start token if needed and extract reasoning
if config.DisableReasoning == nil || !*config.DisableReasoning {
// If thinking start token prefill is not disabled, prepend the thinking start token
if config.DisableReasoningTagPrefill == nil || !*config.DisableReasoningTagPrefill {
cleanedContent = PrependThinkingTokenIfNeeded(cleanedContent, thinkingStartToken)
}
// Extract reasoning from the cleaned content
reasoning, cleanedContent = ExtractReasoning(cleanedContent, &config)
if config.StripReasoningOnly != nil && *config.StripReasoningOnly {
reasoning = ""
}
}
return reasoning, cleanedContent
}
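A usage sketch of the `StripReasoningOnly` gate in this helper (illustrative values, with expected results as comments; assumes prefill is a no-op when the detected start token is empty):

```go
strip := true
cfg := Config{StripReasoningOnly: &strip}

reasoning, cleaned := ExtractReasoningWithConfig(
	"<think>check the units first</think>It is 42 km.", "", cfg)
// reasoning == ""              (extracted, then discarded by the gate)
// cleaned   == "It is 42 km."  (tags are stripped from the output either way)
```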
// PrependThinkingTokenIfNeeded prepends the thinking start token to content if it was
// detected in the prompt. This allows the standard extraction logic to work correctly
// for models where the thinking token is already in the prompt.
@@ -97,7 +126,8 @@ func PrependThinkingTokenIfNeeded(content string, startToken string) string {
// both the extracted reasoning and the cleaned content (with tags removed).
// It handles <thinking>...</thinking> and <think>...</think> tags.
// Multiple reasoning blocks are concatenated with newlines.
-func ExtractReasoning(content string) (reasoning string, cleanedContent string) {
+// Custom tag pairs from config are checked first, then default tag pairs.
+func ExtractReasoning(content string, config *Config) (reasoning string, cleanedContent string) {
if content == "" {
return "", content
}
@@ -106,8 +136,8 @@ func ExtractReasoning(content string) (reasoning string, cleanedContent string)
var cleanedParts []string
remaining := content
-// Define tag pairs to look for (matching llama.cpp's chat-parser.cpp)
-tagPairs := []struct {
+// Define default tag pairs to look for (matching llama.cpp's chat-parser.cpp)
+defaultTagPairs := []struct {
start string
end string
}{
@@ -120,6 +150,26 @@ func ExtractReasoning(content string) (reasoning string, cleanedContent string)
{"[THINK]", "[/THINK]"}, // Magistral models
}
// Merge custom tag pairs with default tag pairs (custom pairs first for priority)
var tagPairs []struct {
start string
end string
}
if config != nil && len(config.TagPairs) > 0 {
for _, pair := range config.TagPairs {
if pair.Start != "" && pair.End != "" {
tagPairs = append(tagPairs, struct {
start string
end string
}{pair.Start, pair.End})
}
}
}
// Add default tag pairs
for _, pair := range defaultTagPairs {
tagPairs = append(tagPairs, pair)
}
// Track the last position we've processed
lastPos := 0
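The body of the scan is truncated above; conceptually it walks the content from `lastPos`, keeping text outside the tags and collecting each pair's interior, with multiple blocks joined by newlines. A compact re-implementation of that idea for a single tag pair (not the committed code, which also has to cope with an unterminated block mid-stream; assumes `strings` is imported):

```go
// extractAll removes every start...end block from content, returning the
// collected reasoning (blocks joined with newlines) and the cleaned remainder.
func extractAll(content, start, end string) (reasoning, cleaned string) {
	var thoughts, kept []string
	for {
		i := strings.Index(content, start)
		if i < 0 {
			break
		}
		j := strings.Index(content[i+len(start):], end)
		if j < 0 {
			break // unterminated block: keep the tail untouched here
		}
		kept = append(kept, content[:i])
		thoughts = append(thoughts, content[i+len(start):i+len(start)+j])
		content = content[i+len(start)+j+len(end):]
	}
	kept = append(kept, content)
	return strings.Join(thoughts, "\n"), strings.Join(kept, "")
}
```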


File diff suppressed because it is too large.