mirror of
https://github.com/mudler/LocalAI.git
synced 2026-01-21 21:01:23 -05:00
feat(openresponses): Support reasoning blocks (#8133)
* feat(openresponses): support reasoning blocks Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * allow to disable reasoning, refactor common logic Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add option to only strip reasoning Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add configurations for custom reasoning tokens Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
committed by
GitHub
parent
34e054f607
commit
c491c6ca90
@@ -47,7 +47,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
} else {
|
||||
template = s
|
||||
}
|
||||
thinkingStartToken := reason.DetectThinkingStartToken(template)
|
||||
thinkingStartToken := reason.DetectThinkingStartToken(template, &config.ReasoningConfig)
|
||||
|
||||
// Track accumulated content for reasoning extraction
|
||||
accumulatedContent := ""
|
||||
@@ -56,12 +56,8 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
|
||||
_, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
|
||||
accumulatedContent += s
|
||||
content := accumulatedContent
|
||||
// Prepend thinking token if needed, then extract reasoning
|
||||
if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
|
||||
content = reason.PrependThinkingTokenIfNeeded(content, thinkingStartToken)
|
||||
}
|
||||
currentReasoning, cleanedContent := reason.ExtractReasoning(content)
|
||||
|
||||
currentReasoning, cleanedContent := reason.ExtractReasoningWithConfig(accumulatedContent, thinkingStartToken, config.ReasoningConfig)
|
||||
|
||||
// Calculate new reasoning delta (what we haven't emitted yet)
|
||||
var reasoningDelta *string
|
||||
@@ -140,7 +136,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
} else {
|
||||
template = prompt
|
||||
}
|
||||
thinkingStartToken := reason.DetectThinkingStartToken(template)
|
||||
thinkingStartToken := reason.DetectThinkingStartToken(template, &config.ReasoningConfig)
|
||||
|
||||
result := ""
|
||||
lastEmittedCount := 0
|
||||
@@ -254,12 +250,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
return err
|
||||
}
|
||||
// Prepend thinking token if needed, then extract reasoning before processing tool calls
|
||||
resultWithToken := result
|
||||
if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
|
||||
resultWithToken = reason.PrependThinkingTokenIfNeeded(result, thinkingStartToken)
|
||||
}
|
||||
reasoning, cleanedResult := reason.ExtractReasoning(resultWithToken)
|
||||
result = cleanedResult
|
||||
reasoning, result := reason.ExtractReasoningWithConfig(result, thinkingStartToken, config.ReasoningConfig)
|
||||
|
||||
textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig)
|
||||
result = functions.CleanupLLMResult(result, config.FunctionsConfig)
|
||||
@@ -652,18 +643,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
} else {
|
||||
template = predInput
|
||||
}
|
||||
thinkingStartToken := reason.DetectThinkingStartToken(template)
|
||||
thinkingStartToken := reason.DetectThinkingStartToken(template, &config.ReasoningConfig)
|
||||
|
||||
xlog.Debug("Thinking start token", "thinkingStartToken", thinkingStartToken, "template", template)
|
||||
|
||||
tokenCallback := func(s string, c *[]schema.Choice) {
|
||||
// Prepend thinking token if needed, then extract reasoning from the response
|
||||
sWithToken := s
|
||||
if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
|
||||
sWithToken = reason.PrependThinkingTokenIfNeeded(s, thinkingStartToken)
|
||||
}
|
||||
reasoning, cleanedS := reason.ExtractReasoning(sWithToken)
|
||||
s = cleanedS
|
||||
reasoning, s := reason.ExtractReasoningWithConfig(s, thinkingStartToken, config.ReasoningConfig)
|
||||
|
||||
if !shouldUseFn {
|
||||
// no function is called, just reply and use stop as finish reason
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
@@ -18,6 +19,7 @@ import (
|
||||
"github.com/mudler/LocalAI/core/templates"
|
||||
"github.com/mudler/LocalAI/pkg/functions"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
reason "github.com/mudler/LocalAI/pkg/reasoning"
|
||||
"github.com/mudler/LocalAI/pkg/utils"
|
||||
"github.com/mudler/cogito"
|
||||
"github.com/mudler/xlog"
|
||||
@@ -1330,13 +1332,37 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
|
||||
result := backend.Finetune(*cfg, predInput, prediction.Response)
|
||||
xlog.Debug("Open Responses - Raw model result", "result", result, "shouldUseFn", shouldUseFn)
|
||||
|
||||
// Detect if thinking token is already in prompt or template
|
||||
var template string
|
||||
if cfg.TemplateConfig.UseTokenizerTemplate {
|
||||
template = cfg.GetModelTemplate()
|
||||
} else {
|
||||
template = predInput
|
||||
}
|
||||
thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig)
|
||||
|
||||
// Extract reasoning from result before cleaning
|
||||
reasoningContent, cleanedResult := reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
|
||||
|
||||
// Parse tool calls if using functions
|
||||
var outputItems []schema.ORItemField
|
||||
var toolCalls []schema.ToolCall
|
||||
|
||||
// Add reasoning item if reasoning was found (reasoning comes first per spec)
|
||||
if reasoningContent != "" {
|
||||
reasoningItem := schema.ORItemField{
|
||||
Type: "reasoning",
|
||||
ID: fmt.Sprintf("reasoning_%s", uuid.New().String()),
|
||||
Status: "completed",
|
||||
Content: []schema.ORContentPart{makeOutputTextPart(reasoningContent)},
|
||||
}
|
||||
outputItems = append(outputItems, reasoningItem)
|
||||
xlog.Debug("Open Responses - Extracted reasoning", "reasoning_length", len(reasoningContent))
|
||||
}
|
||||
|
||||
if shouldUseFn {
|
||||
// Clean up the result first (handle reasoning tags, etc.)
|
||||
cleanedResult := functions.CleanupLLMResult(result, cfg.FunctionsConfig)
|
||||
// Clean up the result (already extracted reasoning above)
|
||||
cleanedResult = functions.CleanupLLMResult(cleanedResult, cfg.FunctionsConfig)
|
||||
xlog.Debug("Open Responses - Cleaned result", "cleanedResult", cleanedResult)
|
||||
|
||||
funcCallResults := functions.ParseFunctionCall(cleanedResult, cfg.FunctionsConfig)
|
||||
@@ -1398,28 +1424,46 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
|
||||
})
|
||||
}
|
||||
|
||||
// If we have no output items but the model did produce output, include the raw result as a message
|
||||
// If we have no output items but the model did produce output, include the cleaned result as a message
|
||||
// This handles cases where the function call parsing failed but we still have model output
|
||||
if len(outputItems) == 0 && result != "" {
|
||||
xlog.Debug("Open Responses - No parsed output, falling back to raw result")
|
||||
// Note: reasoning item may already be added above
|
||||
hasMessageItem := false
|
||||
for _, item := range outputItems {
|
||||
if item.Type == "message" {
|
||||
hasMessageItem = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasMessageItem && cleanedResult != "" {
|
||||
xlog.Debug("Open Responses - No parsed output, falling back to cleaned result")
|
||||
outputItems = append(outputItems, schema.ORItemField{
|
||||
Type: "message",
|
||||
ID: fmt.Sprintf("msg_%s", uuid.New().String()),
|
||||
Status: "completed",
|
||||
Role: "assistant",
|
||||
Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(result, prediction.Logprobs)},
|
||||
Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(cleanedResult, prediction.Logprobs)},
|
||||
})
|
||||
}
|
||||
} else {
|
||||
// Simple text response (include logprobs if available)
|
||||
outputItems = []schema.ORItemField{
|
||||
{
|
||||
Type: "message",
|
||||
ID: fmt.Sprintf("msg_%s", uuid.New().String()),
|
||||
Status: "completed",
|
||||
Role: "assistant",
|
||||
Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(result, prediction.Logprobs)},
|
||||
},
|
||||
// Note: reasoning item may already be added above
|
||||
messageItem := schema.ORItemField{
|
||||
Type: "message",
|
||||
ID: fmt.Sprintf("msg_%s", uuid.New().String()),
|
||||
Status: "completed",
|
||||
Role: "assistant",
|
||||
Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(cleanedResult, prediction.Logprobs)},
|
||||
}
|
||||
outputItems = append(outputItems, messageItem)
|
||||
}
|
||||
|
||||
// Calculate reasoning tokens (approximate: character count / 4)
|
||||
reasoningTokens := 0
|
||||
if reasoningContent != "" {
|
||||
// Simple estimation: ~4 characters per token
|
||||
reasoningTokens = len(reasoningContent) / 4
|
||||
if reasoningTokens == 0 && len(reasoningContent) > 0 {
|
||||
reasoningTokens = 1
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1429,6 +1473,9 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
|
||||
InputTokens: prediction.Usage.Prompt,
|
||||
OutputTokens: prediction.Usage.Completion,
|
||||
TotalTokens: prediction.Usage.Prompt + prediction.Usage.Completion,
|
||||
OutputTokensDetails: &schema.OROutputTokensDetails{
|
||||
ReasoningTokens: reasoningTokens,
|
||||
},
|
||||
}, shouldStore)
|
||||
|
||||
// Store response for future reference (if enabled)
|
||||
@@ -1484,6 +1531,15 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
}
|
||||
}
|
||||
|
||||
// Detect if thinking token is already in prompt or template
|
||||
var template string
|
||||
if cfg.TemplateConfig.UseTokenizerTemplate {
|
||||
template = cfg.GetModelTemplate()
|
||||
} else {
|
||||
template = predInput
|
||||
}
|
||||
thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig)
|
||||
|
||||
// Track state for streaming
|
||||
var currentMessageID string
|
||||
var currentContentIndex int
|
||||
@@ -1492,6 +1548,14 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
outputIndex := 0
|
||||
inToolCallMode := false
|
||||
|
||||
// Track reasoning state for streaming
|
||||
var currentReasoningID string
|
||||
var currentReasoningContentIndex int
|
||||
var accumulatedContent string
|
||||
var lastEmittedReasoning string
|
||||
var lastEmittedCleanedContent string
|
||||
var reasoningTokens int
|
||||
|
||||
// Collect all output items for storage
|
||||
var collectedOutputItems []schema.ORItemField
|
||||
|
||||
@@ -1646,52 +1710,133 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
return true
|
||||
}
|
||||
|
||||
// If no tool calls detected yet, emit text delta
|
||||
// If no tool calls detected yet, handle reasoning and text
|
||||
if !inToolCallMode {
|
||||
if currentMessageID == "" {
|
||||
// Emit output_item.added for message
|
||||
currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
|
||||
messageItem := &schema.ORItemField{
|
||||
Type: "message",
|
||||
ID: currentMessageID,
|
||||
Status: "in_progress",
|
||||
Role: "assistant",
|
||||
Content: []schema.ORContentPart{},
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: messageItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
accumulatedContent += token
|
||||
currentReasoning, cleanedContent := reason.ExtractReasoningWithConfig(accumulatedContent, thinkingStartToken, cfg.ReasoningConfig)
|
||||
|
||||
// Emit content_part.added
|
||||
currentContentIndex = 0
|
||||
emptyPart := makeOutputTextPart("")
|
||||
// Handle reasoning item
|
||||
if currentReasoning != "" {
|
||||
// Check if we need to create reasoning item
|
||||
if currentReasoningID == "" {
|
||||
outputIndex++
|
||||
currentReasoningID = fmt.Sprintf("reasoning_%s", uuid.New().String())
|
||||
reasoningItem := &schema.ORItemField{
|
||||
Type: "reasoning",
|
||||
ID: currentReasoningID,
|
||||
Status: "in_progress",
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: reasoningItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit content_part.added for reasoning
|
||||
currentReasoningContentIndex = 0
|
||||
emptyPart := makeOutputTextPart("")
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: &currentReasoningContentIndex,
|
||||
Part: &emptyPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
}
|
||||
|
||||
// Calculate reasoning delta
|
||||
var reasoningDelta string
|
||||
if len(currentReasoning) > len(lastEmittedReasoning) && strings.HasPrefix(currentReasoning, lastEmittedReasoning) {
|
||||
reasoningDelta = currentReasoning[len(lastEmittedReasoning):]
|
||||
lastEmittedReasoning = currentReasoning
|
||||
} else if currentReasoning != lastEmittedReasoning {
|
||||
reasoningDelta = currentReasoning
|
||||
lastEmittedReasoning = currentReasoning
|
||||
}
|
||||
|
||||
// Emit reasoning delta if there's new content
|
||||
if reasoningDelta != "" {
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.delta",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: &currentReasoningContentIndex,
|
||||
Delta: strPtr(reasoningDelta),
|
||||
Logprobs: emptyLogprobs(),
|
||||
})
|
||||
sequenceNumber++
|
||||
c.Response().Flush()
|
||||
}
|
||||
}
|
||||
|
||||
// Handle message content (cleaned content without reasoning tags)
|
||||
var deltaContent string
|
||||
if len(cleanedContent) > len(lastEmittedCleanedContent) && strings.HasPrefix(cleanedContent, lastEmittedCleanedContent) {
|
||||
deltaContent = cleanedContent[len(lastEmittedCleanedContent):]
|
||||
lastEmittedCleanedContent = cleanedContent
|
||||
} else if cleanedContent != lastEmittedCleanedContent {
|
||||
if lastEmittedCleanedContent == "" {
|
||||
deltaContent = cleanedContent
|
||||
lastEmittedCleanedContent = cleanedContent
|
||||
} else {
|
||||
deltaContent = cleanedContent
|
||||
lastEmittedCleanedContent = cleanedContent
|
||||
}
|
||||
}
|
||||
|
||||
// Only emit message content if there's actual content (not just reasoning)
|
||||
if deltaContent != "" {
|
||||
if currentMessageID == "" {
|
||||
// Emit output_item.added for message
|
||||
outputIndex++
|
||||
currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
|
||||
messageItem := &schema.ORItemField{
|
||||
Type: "message",
|
||||
ID: currentMessageID,
|
||||
Status: "in_progress",
|
||||
Role: "assistant",
|
||||
Content: []schema.ORContentPart{},
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: messageItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit content_part.added
|
||||
currentContentIndex = 0
|
||||
emptyPart := makeOutputTextPart("")
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: &currentContentIndex,
|
||||
Part: &emptyPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
}
|
||||
|
||||
// Emit text delta
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.added",
|
||||
Type: "response.output_text.delta",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: &currentContentIndex,
|
||||
Part: &emptyPart,
|
||||
Delta: strPtr(deltaContent),
|
||||
Logprobs: emptyLogprobs(),
|
||||
})
|
||||
sequenceNumber++
|
||||
c.Response().Flush()
|
||||
}
|
||||
|
||||
// Emit text delta
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.delta",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: &currentContentIndex,
|
||||
Delta: strPtr(token),
|
||||
Logprobs: emptyLogprobs(),
|
||||
})
|
||||
sequenceNumber++
|
||||
c.Response().Flush()
|
||||
}
|
||||
return true
|
||||
}
|
||||
@@ -1754,7 +1899,62 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
}
|
||||
|
||||
result := backend.Finetune(*cfg, predInput, prediction.Response)
|
||||
cleanedResult := functions.CleanupLLMResult(result, cfg.FunctionsConfig)
|
||||
|
||||
// Extract reasoning from final result
|
||||
finalReasoning, finalCleanedResult := reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
|
||||
|
||||
// Close reasoning item if it exists and wasn't closed yet
|
||||
if currentReasoningID != "" && finalReasoning != "" {
|
||||
// Emit output_text.done for reasoning
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.done",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: &currentReasoningContentIndex,
|
||||
Text: strPtr(finalReasoning),
|
||||
Logprobs: emptyLogprobs(),
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit content_part.done for reasoning
|
||||
reasoningPart := makeOutputTextPart(finalReasoning)
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.done",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: &currentReasoningContentIndex,
|
||||
Part: &reasoningPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit output_item.done for reasoning
|
||||
reasoningItem := &schema.ORItemField{
|
||||
Type: "reasoning",
|
||||
ID: currentReasoningID,
|
||||
Status: "completed",
|
||||
Content: []schema.ORContentPart{reasoningPart},
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.done",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: reasoningItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Collect reasoning item for storage
|
||||
collectedOutputItems = append(collectedOutputItems, *reasoningItem)
|
||||
|
||||
// Calculate reasoning tokens
|
||||
reasoningTokens = len(finalReasoning) / 4
|
||||
if reasoningTokens == 0 && len(finalReasoning) > 0 {
|
||||
reasoningTokens = 1
|
||||
}
|
||||
}
|
||||
|
||||
cleanedResult := functions.CleanupLLMResult(finalCleanedResult, cfg.FunctionsConfig)
|
||||
xlog.Debug("Open Responses Stream - Cleaned result", "cleanedResult", cleanedResult)
|
||||
|
||||
parsedToolCalls := functions.ParseFunctionCall(cleanedResult, cfg.FunctionsConfig)
|
||||
@@ -1789,10 +1989,10 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
// Convert prediction logprobs for streaming events
|
||||
streamEventLogprobs := convertLogprobsForStreaming(prediction.Logprobs)
|
||||
|
||||
// If we have no output but the model did produce something, use the raw result
|
||||
if textContent == "" && len(toolCalls) == 0 && result != "" {
|
||||
xlog.Debug("Open Responses Stream - No parsed output, using raw result")
|
||||
textContent = result
|
||||
// If we have no output but the model did produce something, use the cleaned result (without reasoning tags)
|
||||
if textContent == "" && len(toolCalls) == 0 && finalCleanedResult != "" {
|
||||
xlog.Debug("Open Responses Stream - No parsed output, using cleaned result")
|
||||
textContent = finalCleanedResult
|
||||
}
|
||||
|
||||
// Close message if we have text content
|
||||
@@ -1875,8 +2075,18 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
collectedOutputItems = append(collectedOutputItems, *functionCallItem)
|
||||
}
|
||||
|
||||
// Build final response with all items (include logprobs)
|
||||
// Build final response with all items (include reasoning first, then messages, then tool calls)
|
||||
var allOutputItems []schema.ORItemField
|
||||
// Add reasoning item if it exists
|
||||
if currentReasoningID != "" && finalReasoning != "" {
|
||||
allOutputItems = append(allOutputItems, schema.ORItemField{
|
||||
Type: "reasoning",
|
||||
ID: currentReasoningID,
|
||||
Status: "completed",
|
||||
Content: []schema.ORContentPart{makeOutputTextPart(finalReasoning)},
|
||||
})
|
||||
}
|
||||
// Add message item
|
||||
if currentMessageID != "" && textContent != "" {
|
||||
allOutputItems = append(allOutputItems, schema.ORItemField{
|
||||
Type: "message",
|
||||
@@ -1886,6 +2096,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(textContent, prediction.Logprobs)},
|
||||
})
|
||||
}
|
||||
// Add tool call items
|
||||
for _, tc := range toolCalls {
|
||||
toolCallID := fmt.Sprintf("fc_%s", uuid.New().String())
|
||||
allOutputItems = append(allOutputItems, schema.ORItemField{
|
||||
@@ -1904,6 +2115,9 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
InputTokens: prediction.Usage.Prompt,
|
||||
OutputTokens: prediction.Usage.Completion,
|
||||
TotalTokens: prediction.Usage.Prompt + prediction.Usage.Completion,
|
||||
OutputTokensDetails: &schema.OROutputTokensDetails{
|
||||
ReasoningTokens: reasoningTokens,
|
||||
},
|
||||
}, shouldStore)
|
||||
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
@@ -1956,22 +2170,102 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Stream text deltas
|
||||
// Stream text deltas with reasoning extraction
|
||||
tokenCallback := func(token string, tokenUsage backend.TokenUsage) bool {
|
||||
accumulatedText += token
|
||||
accumulatedContent += token
|
||||
// Prepend thinking token if needed, then extract reasoning
|
||||
currentReasoning, cleanedContent := reason.ExtractReasoningWithConfig(accumulatedContent, thinkingStartToken, cfg.ReasoningConfig)
|
||||
|
||||
// Emit text delta
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.delta",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: &currentContentIndex,
|
||||
Delta: strPtr(token),
|
||||
Logprobs: emptyLogprobs(),
|
||||
})
|
||||
sequenceNumber++
|
||||
c.Response().Flush()
|
||||
// Handle reasoning item
|
||||
if currentReasoning != "" {
|
||||
// Check if we need to create reasoning item
|
||||
if currentReasoningID == "" {
|
||||
outputIndex++
|
||||
currentReasoningID = fmt.Sprintf("reasoning_%s", uuid.New().String())
|
||||
reasoningItem := &schema.ORItemField{
|
||||
Type: "reasoning",
|
||||
ID: currentReasoningID,
|
||||
Status: "in_progress",
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: reasoningItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit content_part.added for reasoning
|
||||
currentReasoningContentIndex = 0
|
||||
emptyPart := makeOutputTextPart("")
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: &currentReasoningContentIndex,
|
||||
Part: &emptyPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
}
|
||||
|
||||
// Calculate reasoning delta
|
||||
var reasoningDelta string
|
||||
if len(currentReasoning) > len(lastEmittedReasoning) && strings.HasPrefix(currentReasoning, lastEmittedReasoning) {
|
||||
reasoningDelta = currentReasoning[len(lastEmittedReasoning):]
|
||||
lastEmittedReasoning = currentReasoning
|
||||
} else if currentReasoning != lastEmittedReasoning {
|
||||
reasoningDelta = currentReasoning
|
||||
lastEmittedReasoning = currentReasoning
|
||||
}
|
||||
|
||||
// Emit reasoning delta if there's new content
|
||||
if reasoningDelta != "" {
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.delta",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: &currentReasoningContentIndex,
|
||||
Delta: strPtr(reasoningDelta),
|
||||
Logprobs: emptyLogprobs(),
|
||||
})
|
||||
sequenceNumber++
|
||||
c.Response().Flush()
|
||||
}
|
||||
}
|
||||
|
||||
// Handle message content (cleaned content without reasoning tags)
|
||||
var deltaContent string
|
||||
if len(cleanedContent) > len(lastEmittedCleanedContent) && strings.HasPrefix(cleanedContent, lastEmittedCleanedContent) {
|
||||
deltaContent = cleanedContent[len(lastEmittedCleanedContent):]
|
||||
lastEmittedCleanedContent = cleanedContent
|
||||
} else if cleanedContent != lastEmittedCleanedContent {
|
||||
if lastEmittedCleanedContent == "" {
|
||||
deltaContent = cleanedContent
|
||||
lastEmittedCleanedContent = cleanedContent
|
||||
} else {
|
||||
deltaContent = cleanedContent
|
||||
lastEmittedCleanedContent = cleanedContent
|
||||
}
|
||||
}
|
||||
|
||||
// Only emit message content if there's actual content (not just reasoning)
|
||||
if deltaContent != "" {
|
||||
// Emit text delta
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.delta",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: &currentContentIndex,
|
||||
Delta: strPtr(deltaContent),
|
||||
Logprobs: emptyLogprobs(),
|
||||
})
|
||||
sequenceNumber++
|
||||
c.Response().Flush()
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -2034,6 +2328,62 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
|
||||
result := backend.Finetune(*cfg, predInput, prediction.Response)
|
||||
|
||||
// Extract reasoning from final result for non-tool-call path
|
||||
finalReasoning, finalCleanedResult := reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
|
||||
|
||||
// Close reasoning item if it exists and wasn't closed yet
|
||||
if currentReasoningID != "" && finalReasoning != "" {
|
||||
// Emit output_text.done for reasoning
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.done",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: &currentReasoningContentIndex,
|
||||
Text: strPtr(finalReasoning),
|
||||
Logprobs: emptyLogprobs(),
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit content_part.done for reasoning
|
||||
reasoningPart := makeOutputTextPart(finalReasoning)
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.done",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: &currentReasoningContentIndex,
|
||||
Part: &reasoningPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit output_item.done for reasoning
|
||||
reasoningItem := &schema.ORItemField{
|
||||
Type: "reasoning",
|
||||
ID: currentReasoningID,
|
||||
Status: "completed",
|
||||
Content: []schema.ORContentPart{reasoningPart},
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.done",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: reasoningItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Collect reasoning item for storage
|
||||
collectedOutputItems = append(collectedOutputItems, *reasoningItem)
|
||||
|
||||
// Calculate reasoning tokens
|
||||
reasoningTokens = len(finalReasoning) / 4
|
||||
if reasoningTokens == 0 && len(finalReasoning) > 0 {
|
||||
reasoningTokens = 1
|
||||
}
|
||||
}
|
||||
|
||||
result = finalCleanedResult
|
||||
|
||||
// Convert prediction logprobs for streaming events
|
||||
mcpStreamLogprobs := convertLogprobsForStreaming(prediction.Logprobs)
|
||||
|
||||
@@ -2075,17 +2425,35 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
// Emit response.completed
|
||||
now := time.Now().Unix()
|
||||
|
||||
// Collect final output items (use collected items if available, otherwise use messageItem)
|
||||
// Collect final output items (reasoning first, then message)
|
||||
var finalOutputItems []schema.ORItemField
|
||||
// Add reasoning item if it exists
|
||||
if currentReasoningID != "" && finalReasoning != "" {
|
||||
finalOutputItems = append(finalOutputItems, schema.ORItemField{
|
||||
Type: "reasoning",
|
||||
ID: currentReasoningID,
|
||||
Status: "completed",
|
||||
Content: []schema.ORContentPart{makeOutputTextPart(finalReasoning)},
|
||||
})
|
||||
}
|
||||
// Add message item
|
||||
if len(collectedOutputItems) > 0 {
|
||||
finalOutputItems = collectedOutputItems
|
||||
// Use collected items (may include reasoning already)
|
||||
for _, item := range collectedOutputItems {
|
||||
if item.Type == "message" {
|
||||
finalOutputItems = append(finalOutputItems, item)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
finalOutputItems = []schema.ORItemField{*messageItem}
|
||||
finalOutputItems = append(finalOutputItems, *messageItem)
|
||||
}
|
||||
responseCompleted := buildORResponse(responseID, createdAt, &now, "completed", input, finalOutputItems, &schema.ORUsage{
|
||||
InputTokens: prediction.Usage.Prompt,
|
||||
OutputTokens: prediction.Usage.Completion,
|
||||
TotalTokens: prediction.Usage.Prompt + prediction.Usage.Completion,
|
||||
OutputTokensDetails: &schema.OROutputTokensDetails{
|
||||
ReasoningTokens: reasoningTokens,
|
||||
},
|
||||
}, shouldStore)
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.completed",
|
||||
|
||||
@@ -93,7 +93,12 @@ type ORItemParam struct {
|
||||
// Function call output fields
|
||||
Output interface{} `json:"output,omitempty"` // string or []ORContentPart
|
||||
|
||||
// Reasoning fields (for type == "reasoning")
|
||||
Summary []ORContentPart `json:"summary,omitempty"` // Array of summary parts
|
||||
EncryptedContent *string `json:"encrypted_content,omitempty"` // Provider-specific encrypted content
|
||||
|
||||
// Note: For item_reference type, use the ID field above to reference the item
|
||||
// Note: For reasoning type, Content field (from message fields) contains the raw reasoning content
|
||||
}
|
||||
|
||||
// ORContentPart represents a content block (discriminated union by type)
|
||||
|
||||
@@ -397,6 +397,83 @@ Agent/autonomous agent configuration:
|
||||
| `agent.enable_mcp_prompts` | bool | Enable MCP prompts |
|
||||
| `agent.enable_plan_re_evaluator` | bool | Enable plan re-evaluation |
|
||||
|
||||
## Reasoning Configuration
|
||||
|
||||
Configure how reasoning tags are extracted and processed from model output. Reasoning tags are used by models like DeepSeek, Command-R, and others to include internal reasoning steps in their responses.
|
||||
|
||||
| Field | Type | Default | Description |
|
||||
|-------|------|---------|-------------|
|
||||
| `reasoning.disable` | bool | `false` | When `true`, disables reasoning extraction entirely. The original content is returned without any processing. |
|
||||
| `reasoning.disable_reasoning_tag_prefill` | bool | `false` | When `true`, disables automatic prepending of thinking start tokens. Use this when your model already includes reasoning tags in its output format. |
|
||||
| `reasoning.strip_reasoning_only` | bool | `false` | When `true`, extracts and removes reasoning tags from content but discards the reasoning text. Useful when you want to clean reasoning tags from output without storing the reasoning content. |
|
||||
| `reasoning.thinking_start_tokens` | array | `[]` | List of custom thinking start tokens to detect in prompts. Custom tokens are checked before default tokens. |
|
||||
| `reasoning.tag_pairs` | array | `[]` | List of custom tag pairs for reasoning extraction. Each entry has `start` and `end` fields. Custom pairs are checked before default pairs. |
|
||||
|
||||
### Reasoning Tag Formats
|
||||
|
||||
The reasoning extraction supports multiple tag formats used by different models:
|
||||
|
||||
- `<thinking>...</thinking>` - General thinking tag
|
||||
- `<think>...</think>` - DeepSeek, Granite, ExaOne, GLM models
|
||||
- `<|START_THINKING|>...<|END_THINKING|>` - Command-R models
|
||||
- `<|inner_prefix|>...<|inner_suffix|>` - Apertus models
|
||||
- `<seed:think>...</seed:think>` - Seed models
|
||||
- `<|think|>...<|end|><|begin|>assistant<|content|>` - Solar Open models
|
||||
- `[THINK]...[/THINK]` - Magistral models
|
||||
|
||||
### Examples
|
||||
|
||||
**Disable reasoning extraction:**
|
||||
```yaml
|
||||
reasoning:
|
||||
disable: true
|
||||
```
|
||||
|
||||
**Extract reasoning but don't prepend tags:**
|
||||
```yaml
|
||||
reasoning:
|
||||
disable_reasoning_tag_prefill: true
|
||||
```
|
||||
|
||||
**Strip reasoning tags without storing reasoning content:**
|
||||
```yaml
|
||||
reasoning:
|
||||
strip_reasoning_only: true
|
||||
```
|
||||
|
||||
**Complete example with reasoning configuration:**
|
||||
```yaml
|
||||
name: deepseek-model
|
||||
backend: llama-cpp
|
||||
parameters:
|
||||
model: deepseek.gguf
|
||||
|
||||
reasoning:
|
||||
disable: false
|
||||
disable_reasoning_tag_prefill: false
|
||||
strip_reasoning_only: false
|
||||
```
|
||||
|
||||
**Example with custom tokens and tag pairs:**
|
||||
```yaml
|
||||
name: custom-reasoning-model
|
||||
backend: llama-cpp
|
||||
parameters:
|
||||
model: custom.gguf
|
||||
|
||||
reasoning:
|
||||
thinking_start_tokens:
|
||||
- "<custom:think>"
|
||||
- "<my:reasoning>"
|
||||
tag_pairs:
|
||||
- start: "<custom:think>"
|
||||
end: "</custom:think>"
|
||||
- start: "<my:reasoning>"
|
||||
end: "</my:reasoning>"
|
||||
```
|
||||
|
||||
**Note:** Custom tokens and tag pairs are checked before the default ones, giving them priority. This allows you to override default behavior or add support for new reasoning tag formats.
|
||||
|
||||
## Pipeline Configuration
|
||||
|
||||
Define pipelines for audio-to-audio processing:
|
||||
|
||||
@@ -1,5 +1,15 @@
|
||||
package reasoning
|
||||
|
||||
type Config struct {
|
||||
DisableReasoningTagPrefill *bool `yaml:"disable_reasoning_tag_prefill,omitempty" json:"disable_reasoning_tag_prefill,omitempty"`
|
||||
// TagPair is one start/end delimiter pair used to locate a reasoning block in
// model output, for example Start "<think>" with End "</think>".
type TagPair struct {
	// Start is the literal marker that opens a reasoning block.
	Start string `yaml:"start" json:"start"`
	// End is the literal marker that closes a reasoning block.
	End string `yaml:"end" json:"end"`
}
|
||||
|
||||
type Config struct {
|
||||
DisableReasoningTagPrefill *bool `yaml:"disable_reasoning_tag_prefill,omitempty" json:"disable_reasoning_tag_prefill,omitempty"`
|
||||
DisableReasoning *bool `yaml:"disable,omitempty" json:"disable,omitempty"`
|
||||
StripReasoningOnly *bool `yaml:"strip_reasoning_only,omitempty" json:"strip_reasoning_only,omitempty"`
|
||||
ThinkingStartTokens []string `yaml:"thinking_start_tokens,omitempty" json:"thinking_start_tokens,omitempty"`
|
||||
TagPairs []TagPair `yaml:"tag_pairs,omitempty" json:"tag_pairs,omitempty"`
|
||||
}
|
||||
|
||||
@@ -17,12 +17,12 @@ import (
|
||||
// - <think> (DeepSeek, Granite, ExaOne models)
|
||||
// - <|think|> (Solar Open models)
|
||||
// - <thinking> (General thinking tag)
|
||||
// - <think> (GLM models)
|
||||
// - [THINK] (Magistral models)
|
||||
func DetectThinkingStartToken(prompt string) string {
|
||||
// Custom tokens from config are checked first, then default tokens.
|
||||
func DetectThinkingStartToken(prompt string, config *Config) string {
|
||||
// Common thinking start tokens (in order of specificity - longer first)
|
||||
// Based on llama.cpp's chat-parser.cpp implementations
|
||||
thinkingStartTokens := []string{
|
||||
defaultTokens := []string{
|
||||
"<|START_THINKING|>", // Command-R models
|
||||
"<|inner_prefix|>", // Apertus models
|
||||
"<seed:think>", // Seed models
|
||||
@@ -32,6 +32,13 @@ func DetectThinkingStartToken(prompt string) string {
|
||||
"[THINK]", // Magistral models
|
||||
}
|
||||
|
||||
// Merge custom tokens with default tokens (custom tokens first for priority)
|
||||
var thinkingStartTokens []string
|
||||
if config != nil && len(config.ThinkingStartTokens) > 0 {
|
||||
thinkingStartTokens = append(thinkingStartTokens, config.ThinkingStartTokens...)
|
||||
}
|
||||
thinkingStartTokens = append(thinkingStartTokens, defaultTokens...)
|
||||
|
||||
// Check if prompt ends with any of these tokens (allowing for trailing whitespace/newlines)
|
||||
trimmedPrompt := strings.TrimRight(prompt, " \t\n\r")
|
||||
for _, token := range thinkingStartTokens {
|
||||
@@ -58,6 +65,28 @@ func DetectThinkingStartToken(prompt string) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
// ExtractReasoningWithConfig extracts reasoning from content with the given config.
|
||||
// If reasoning is disabled, it returns the original content.
|
||||
// If thinking start token prefill is enabled, it prepends the thinking start token to the content.
|
||||
// It returns the extracted reasoning and the cleaned content.
|
||||
func ExtractReasoningWithConfig(content, thinkingStartToken string, config Config) (reasoning string, cleanedContent string) {
|
||||
cleanedContent = content
|
||||
// If reasoning is not disabled, prepend the thinking start token if needed and extract reasoning
|
||||
if config.DisableReasoning == nil || !*config.DisableReasoning {
|
||||
// If thinking start token prefill is not disabled, prepend the thinking start token
|
||||
if config.DisableReasoningTagPrefill == nil || !*config.DisableReasoningTagPrefill {
|
||||
cleanedContent = PrependThinkingTokenIfNeeded(cleanedContent, thinkingStartToken)
|
||||
}
|
||||
// Extract reasoning from the cleaned content
|
||||
reasoning, cleanedContent = ExtractReasoning(cleanedContent, &config)
|
||||
if config.StripReasoningOnly != nil && *config.StripReasoningOnly {
|
||||
reasoning = ""
|
||||
}
|
||||
}
|
||||
|
||||
return reasoning, cleanedContent
|
||||
}
|
||||
|
||||
// PrependThinkingTokenIfNeeded prepends the thinking start token to content if it was
|
||||
// detected in the prompt. This allows the standard extraction logic to work correctly
|
||||
// for models where the thinking token is already in the prompt.
|
||||
@@ -97,7 +126,8 @@ func PrependThinkingTokenIfNeeded(content string, startToken string) string {
|
||||
// both the extracted reasoning and the cleaned content (with tags removed).
|
||||
// It handles <thinking>...</thinking> and <think>...</think> tags.
|
||||
// Multiple reasoning blocks are concatenated with newlines.
|
||||
func ExtractReasoning(content string) (reasoning string, cleanedContent string) {
|
||||
// Custom tag pairs from config are checked first, then default tag pairs.
|
||||
func ExtractReasoning(content string, config *Config) (reasoning string, cleanedContent string) {
|
||||
if content == "" {
|
||||
return "", content
|
||||
}
|
||||
@@ -106,8 +136,8 @@ func ExtractReasoning(content string) (reasoning string, cleanedContent string)
|
||||
var cleanedParts []string
|
||||
remaining := content
|
||||
|
||||
// Define tag pairs to look for (matching llama.cpp's chat-parser.cpp)
|
||||
tagPairs := []struct {
|
||||
// Define default tag pairs to look for (matching llama.cpp's chat-parser.cpp)
|
||||
defaultTagPairs := []struct {
|
||||
start string
|
||||
end string
|
||||
}{
|
||||
@@ -120,6 +150,26 @@ func ExtractReasoning(content string) (reasoning string, cleanedContent string)
|
||||
{"[THINK]", "[/THINK]"}, // Magistral models
|
||||
}
|
||||
|
||||
// Merge custom tag pairs with default tag pairs (custom pairs first for priority)
|
||||
var tagPairs []struct {
|
||||
start string
|
||||
end string
|
||||
}
|
||||
if config != nil && len(config.TagPairs) > 0 {
|
||||
for _, pair := range config.TagPairs {
|
||||
if pair.Start != "" && pair.End != "" {
|
||||
tagPairs = append(tagPairs, struct {
|
||||
start string
|
||||
end string
|
||||
}{pair.Start, pair.End})
|
||||
}
|
||||
}
|
||||
}
|
||||
// Add default tag pairs
|
||||
for _, pair := range defaultTagPairs {
|
||||
tagPairs = append(tagPairs, pair)
|
||||
}
|
||||
|
||||
// Track the last position we've processed
|
||||
lastPos := 0
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user