feat(openresponses): Support reasoning blocks (#8133)

* feat(openresponses): support reasoning blocks Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * allow to disable reasoning, refactor common logic Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add option to only strip reasoning Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add configurations for custom reasoning tokens Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-03-08 00:48:00 -05:00 · 2026-01-21 00:11:45 +01:00
parent 34e054f607
commit c491c6ca90
7 changed files with 1289 additions and 155 deletions
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -47,7 +47,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		} else {
 			template = s
 		}
-		thinkingStartToken := reason.DetectThinkingStartToken(template)
+		thinkingStartToken := reason.DetectThinkingStartToken(template, &config.ReasoningConfig)

 		// Track accumulated content for reasoning extraction
 		accumulatedContent := ""
@@ -56,12 +56,8 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator

 		_, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
 			accumulatedContent += s
-			content := accumulatedContent
-			// Prepend thinking token if needed, then extract reasoning
-			if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
-				content = reason.PrependThinkingTokenIfNeeded(content, thinkingStartToken)
-			}
-			currentReasoning, cleanedContent := reason.ExtractReasoning(content)
+
+			currentReasoning, cleanedContent := reason.ExtractReasoningWithConfig(accumulatedContent, thinkingStartToken, config.ReasoningConfig)

 			// Calculate new reasoning delta (what we haven't emitted yet)
 			var reasoningDelta *string
@@ -140,7 +136,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		} else {
 			template = prompt
 		}
-		thinkingStartToken := reason.DetectThinkingStartToken(template)
+		thinkingStartToken := reason.DetectThinkingStartToken(template, &config.ReasoningConfig)

 		result := ""
 		lastEmittedCount := 0
@@ -254,12 +250,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 			return err
 		}
 		// Prepend thinking token if needed, then extract reasoning before processing tool calls
-		resultWithToken := result
-		if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
-			resultWithToken = reason.PrependThinkingTokenIfNeeded(result, thinkingStartToken)
-		}
-		reasoning, cleanedResult := reason.ExtractReasoning(resultWithToken)
-		result = cleanedResult
+		reasoning, result := reason.ExtractReasoningWithConfig(result, thinkingStartToken, config.ReasoningConfig)

 		textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig)
 		result = functions.CleanupLLMResult(result, config.FunctionsConfig)
@@ -652,18 +643,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 			} else {
 				template = predInput
 			}
-			thinkingStartToken := reason.DetectThinkingStartToken(template)
+			thinkingStartToken := reason.DetectThinkingStartToken(template, &config.ReasoningConfig)

 			xlog.Debug("Thinking start token", "thinkingStartToken", thinkingStartToken, "template", template)

 			tokenCallback := func(s string, c *[]schema.Choice) {
 				// Prepend thinking token if needed, then extract reasoning from the response
-				sWithToken := s
-				if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
-					sWithToken = reason.PrependThinkingTokenIfNeeded(s, thinkingStartToken)
-				}
-				reasoning, cleanedS := reason.ExtractReasoning(sWithToken)
-				s = cleanedS
+				reasoning, s := reason.ExtractReasoningWithConfig(s, thinkingStartToken, config.ReasoningConfig)

 				if !shouldUseFn {
 					// no function is called, just reply and use stop as finish reason
--- a/core/http/endpoints/openresponses/responses.go
+++ b/core/http/endpoints/openresponses/responses.go
@@ -6,6 +6,7 @@ import (
 	"errors"
 	"fmt"
 	"net"
+	"strings"
 	"time"

 	"github.com/google/uuid"
@@ -18,6 +19,7 @@ import (
 	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
+	reason "github.com/mudler/LocalAI/pkg/reasoning"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/mudler/cogito"
 	"github.com/mudler/xlog"
@@ -1330,13 +1332,37 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
 	result := backend.Finetune(*cfg, predInput, prediction.Response)
 	xlog.Debug("Open Responses - Raw model result", "result", result, "shouldUseFn", shouldUseFn)

+	// Detect if thinking token is already in prompt or template
+	var template string
+	if cfg.TemplateConfig.UseTokenizerTemplate {
+		template = cfg.GetModelTemplate()
+	} else {
+		template = predInput
+	}
+	thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig)
+
+	// Extract reasoning from result before cleaning
+	reasoningContent, cleanedResult := reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
+
 	// Parse tool calls if using functions
 	var outputItems []schema.ORItemField
 	var toolCalls []schema.ToolCall

+	// Add reasoning item if reasoning was found (reasoning comes first per spec)
+	if reasoningContent != "" {
+		reasoningItem := schema.ORItemField{
+			Type:    "reasoning",
+			ID:      fmt.Sprintf("reasoning_%s", uuid.New().String()),
+			Status:  "completed",
+			Content: []schema.ORContentPart{makeOutputTextPart(reasoningContent)},
+		}
+		outputItems = append(outputItems, reasoningItem)
+		xlog.Debug("Open Responses - Extracted reasoning", "reasoning_length", len(reasoningContent))
+	}
+
 	if shouldUseFn {
-		// Clean up the result first (handle reasoning tags, etc.)
-		cleanedResult := functions.CleanupLLMResult(result, cfg.FunctionsConfig)
+		// Clean up the result (already extracted reasoning above)
+		cleanedResult = functions.CleanupLLMResult(cleanedResult, cfg.FunctionsConfig)
 		xlog.Debug("Open Responses - Cleaned result", "cleanedResult", cleanedResult)

 		funcCallResults := functions.ParseFunctionCall(cleanedResult, cfg.FunctionsConfig)
@@ -1398,28 +1424,46 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
 			})
 		}

-		// If we have no output items but the model did produce output, include the raw result as a message
+		// If we have no output items but the model did produce output, include the cleaned result as a message
 		// This handles cases where the function call parsing failed but we still have model output
-		if len(outputItems) == 0 && result != "" {
-			xlog.Debug("Open Responses - No parsed output, falling back to raw result")
+		// Note: reasoning item may already be added above
+		hasMessageItem := false
+		for _, item := range outputItems {
+			if item.Type == "message" {
+				hasMessageItem = true
+				break
+			}
+		}
+		if !hasMessageItem && cleanedResult != "" {
+			xlog.Debug("Open Responses - No parsed output, falling back to cleaned result")
 			outputItems = append(outputItems, schema.ORItemField{
 				Type:    "message",
 				ID:      fmt.Sprintf("msg_%s", uuid.New().String()),
 				Status:  "completed",
 				Role:    "assistant",
-				Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(result, prediction.Logprobs)},
+				Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(cleanedResult, prediction.Logprobs)},
 			})
 		}
 	} else {
 		// Simple text response (include logprobs if available)
-		outputItems = []schema.ORItemField{
-			{
-				Type:    "message",
-				ID:      fmt.Sprintf("msg_%s", uuid.New().String()),
-				Status:  "completed",
-				Role:    "assistant",
-				Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(result, prediction.Logprobs)},
-			},
+		// Note: reasoning item may already be added above
+		messageItem := schema.ORItemField{
+			Type:    "message",
+			ID:      fmt.Sprintf("msg_%s", uuid.New().String()),
+			Status:  "completed",
+			Role:    "assistant",
+			Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(cleanedResult, prediction.Logprobs)},
+		}
+		outputItems = append(outputItems, messageItem)
+	}
+
+	// Calculate reasoning tokens (approximate: character count / 4)
+	reasoningTokens := 0
+	if reasoningContent != "" {
+		// Simple estimation: ~4 characters per token
+		reasoningTokens = len(reasoningContent) / 4
+		if reasoningTokens == 0 && len(reasoningContent) > 0 {
+			reasoningTokens = 1
 		}
 	}

@@ -1429,6 +1473,9 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
 		InputTokens:  prediction.Usage.Prompt,
 		OutputTokens: prediction.Usage.Completion,
 		TotalTokens:  prediction.Usage.Prompt + prediction.Usage.Completion,
+		OutputTokensDetails: &schema.OROutputTokensDetails{
+			ReasoningTokens: reasoningTokens,
+		},
 	}, shouldStore)

 	// Store response for future reference (if enabled)
@@ -1484,6 +1531,15 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 		}
 	}

+	// Detect if thinking token is already in prompt or template
+	var template string
+	if cfg.TemplateConfig.UseTokenizerTemplate {
+		template = cfg.GetModelTemplate()
+	} else {
+		template = predInput
+	}
+	thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig)
+
 	// Track state for streaming
 	var currentMessageID string
 	var currentContentIndex int
@@ -1492,6 +1548,14 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 	outputIndex := 0
 	inToolCallMode := false

+	// Track reasoning state for streaming
+	var currentReasoningID string
+	var currentReasoningContentIndex int
+	var accumulatedContent string
+	var lastEmittedReasoning string
+	var lastEmittedCleanedContent string
+	var reasoningTokens int
+
 	// Collect all output items for storage
 	var collectedOutputItems []schema.ORItemField

@@ -1646,52 +1710,133 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 				return true
 			}

-			// If no tool calls detected yet, emit text delta
+			// If no tool calls detected yet, handle reasoning and text
 			if !inToolCallMode {
-				if currentMessageID == "" {
-					// Emit output_item.added for message
-					currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
-					messageItem := &schema.ORItemField{
-						Type:    "message",
-						ID:      currentMessageID,
-						Status:  "in_progress",
-						Role:    "assistant",
-						Content: []schema.ORContentPart{},
-					}
-					sendSSEEvent(c, &schema.ORStreamEvent{
-						Type:           "response.output_item.added",
-						SequenceNumber: sequenceNumber,
-						OutputIndex:    &outputIndex,
-						Item:           messageItem,
-					})
-					sequenceNumber++
+				accumulatedContent += token
+				currentReasoning, cleanedContent := reason.ExtractReasoningWithConfig(accumulatedContent, thinkingStartToken, cfg.ReasoningConfig)

-					// Emit content_part.added
-					currentContentIndex = 0
-					emptyPart := makeOutputTextPart("")
+				// Handle reasoning item
+				if currentReasoning != "" {
+					// Check if we need to create reasoning item
+					if currentReasoningID == "" {
+						outputIndex++
+						currentReasoningID = fmt.Sprintf("reasoning_%s", uuid.New().String())
+						reasoningItem := &schema.ORItemField{
+							Type:   "reasoning",
+							ID:     currentReasoningID,
+							Status: "in_progress",
+						}
+						sendSSEEvent(c, &schema.ORStreamEvent{
+							Type:           "response.output_item.added",
+							SequenceNumber: sequenceNumber,
+							OutputIndex:    &outputIndex,
+							Item:           reasoningItem,
+						})
+						sequenceNumber++
+
+						// Emit content_part.added for reasoning
+						currentReasoningContentIndex = 0
+						emptyPart := makeOutputTextPart("")
+						sendSSEEvent(c, &schema.ORStreamEvent{
+							Type:           "response.content_part.added",
+							SequenceNumber: sequenceNumber,
+							ItemID:         currentReasoningID,
+							OutputIndex:    &outputIndex,
+							ContentIndex:   &currentReasoningContentIndex,
+							Part:           &emptyPart,
+						})
+						sequenceNumber++
+					}
+
+					// Calculate reasoning delta
+					var reasoningDelta string
+					if len(currentReasoning) > len(lastEmittedReasoning) && strings.HasPrefix(currentReasoning, lastEmittedReasoning) {
+						reasoningDelta = currentReasoning[len(lastEmittedReasoning):]
+						lastEmittedReasoning = currentReasoning
+					} else if currentReasoning != lastEmittedReasoning {
+						reasoningDelta = currentReasoning
+						lastEmittedReasoning = currentReasoning
+					}
+
+					// Emit reasoning delta if there's new content
+					if reasoningDelta != "" {
+						sendSSEEvent(c, &schema.ORStreamEvent{
+							Type:           "response.output_text.delta",
+							SequenceNumber: sequenceNumber,
+							ItemID:         currentReasoningID,
+							OutputIndex:    &outputIndex,
+							ContentIndex:   &currentReasoningContentIndex,
+							Delta:          strPtr(reasoningDelta),
+							Logprobs:       emptyLogprobs(),
+						})
+						sequenceNumber++
+						c.Response().Flush()
+					}
+				}
+
+				// Handle message content (cleaned content without reasoning tags)
+				var deltaContent string
+				if len(cleanedContent) > len(lastEmittedCleanedContent) && strings.HasPrefix(cleanedContent, lastEmittedCleanedContent) {
+					deltaContent = cleanedContent[len(lastEmittedCleanedContent):]
+					lastEmittedCleanedContent = cleanedContent
+				} else if cleanedContent != lastEmittedCleanedContent {
+					if lastEmittedCleanedContent == "" {
+						deltaContent = cleanedContent
+						lastEmittedCleanedContent = cleanedContent
+					} else {
+						deltaContent = cleanedContent
+						lastEmittedCleanedContent = cleanedContent
+					}
+				}
+
+				// Only emit message content if there's actual content (not just reasoning)
+				if deltaContent != "" {
+					if currentMessageID == "" {
+						// Emit output_item.added for message
+						outputIndex++
+						currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
+						messageItem := &schema.ORItemField{
+							Type:    "message",
+							ID:      currentMessageID,
+							Status:  "in_progress",
+							Role:    "assistant",
+							Content: []schema.ORContentPart{},
+						}
+						sendSSEEvent(c, &schema.ORStreamEvent{
+							Type:           "response.output_item.added",
+							SequenceNumber: sequenceNumber,
+							OutputIndex:    &outputIndex,
+							Item:           messageItem,
+						})
+						sequenceNumber++
+
+						// Emit content_part.added
+						currentContentIndex = 0
+						emptyPart := makeOutputTextPart("")
+						sendSSEEvent(c, &schema.ORStreamEvent{
+							Type:           "response.content_part.added",
+							SequenceNumber: sequenceNumber,
+							ItemID:         currentMessageID,
+							OutputIndex:    &outputIndex,
+							ContentIndex:   &currentContentIndex,
+							Part:           &emptyPart,
+						})
+						sequenceNumber++
+					}
+
+					// Emit text delta
 					sendSSEEvent(c, &schema.ORStreamEvent{
-						Type:           "response.content_part.added",
+						Type:           "response.output_text.delta",
 						SequenceNumber: sequenceNumber,
 						ItemID:         currentMessageID,
 						OutputIndex:    &outputIndex,
 						ContentIndex:   &currentContentIndex,
-						Part:           &emptyPart,
+						Delta:          strPtr(deltaContent),
+						Logprobs:       emptyLogprobs(),
 					})
 					sequenceNumber++
+					c.Response().Flush()
 				}
-
-				// Emit text delta
-				sendSSEEvent(c, &schema.ORStreamEvent{
-					Type:           "response.output_text.delta",
-					SequenceNumber: sequenceNumber,
-					ItemID:         currentMessageID,
-					OutputIndex:    &outputIndex,
-					ContentIndex:   &currentContentIndex,
-					Delta:          strPtr(token),
-					Logprobs:       emptyLogprobs(),
-				})
-				sequenceNumber++
-				c.Response().Flush()
 			}
 			return true
 		}
@@ -1754,7 +1899,62 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 		}

 		result := backend.Finetune(*cfg, predInput, prediction.Response)
-		cleanedResult := functions.CleanupLLMResult(result, cfg.FunctionsConfig)
+
+		// Extract reasoning from final result
+		finalReasoning, finalCleanedResult := reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
+
+		// Close reasoning item if it exists and wasn't closed yet
+		if currentReasoningID != "" && finalReasoning != "" {
+			// Emit output_text.done for reasoning
+			sendSSEEvent(c, &schema.ORStreamEvent{
+				Type:           "response.output_text.done",
+				SequenceNumber: sequenceNumber,
+				ItemID:         currentReasoningID,
+				OutputIndex:    &outputIndex,
+				ContentIndex:   &currentReasoningContentIndex,
+				Text:           strPtr(finalReasoning),
+				Logprobs:       emptyLogprobs(),
+			})
+			sequenceNumber++
+
+			// Emit content_part.done for reasoning
+			reasoningPart := makeOutputTextPart(finalReasoning)
+			sendSSEEvent(c, &schema.ORStreamEvent{
+				Type:           "response.content_part.done",
+				SequenceNumber: sequenceNumber,
+				ItemID:         currentReasoningID,
+				OutputIndex:    &outputIndex,
+				ContentIndex:   &currentReasoningContentIndex,
+				Part:           &reasoningPart,
+			})
+			sequenceNumber++
+
+			// Emit output_item.done for reasoning
+			reasoningItem := &schema.ORItemField{
+				Type:    "reasoning",
+				ID:      currentReasoningID,
+				Status:  "completed",
+				Content: []schema.ORContentPart{reasoningPart},
+			}
+			sendSSEEvent(c, &schema.ORStreamEvent{
+				Type:           "response.output_item.done",
+				SequenceNumber: sequenceNumber,
+				OutputIndex:    &outputIndex,
+				Item:           reasoningItem,
+			})
+			sequenceNumber++
+
+			// Collect reasoning item for storage
+			collectedOutputItems = append(collectedOutputItems, *reasoningItem)
+
+			// Calculate reasoning tokens
+			reasoningTokens = len(finalReasoning) / 4
+			if reasoningTokens == 0 && len(finalReasoning) > 0 {
+				reasoningTokens = 1
+			}
+		}
+
+		cleanedResult := functions.CleanupLLMResult(finalCleanedResult, cfg.FunctionsConfig)
 		xlog.Debug("Open Responses Stream - Cleaned result", "cleanedResult", cleanedResult)

 		parsedToolCalls := functions.ParseFunctionCall(cleanedResult, cfg.FunctionsConfig)
@@ -1789,10 +1989,10 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 		// Convert prediction logprobs for streaming events
 		streamEventLogprobs := convertLogprobsForStreaming(prediction.Logprobs)

-		// If we have no output but the model did produce something, use the raw result
-		if textContent == "" && len(toolCalls) == 0 && result != "" {
-			xlog.Debug("Open Responses Stream - No parsed output, using raw result")
-			textContent = result
+		// If we have no output but the model did produce something, use the cleaned result (without reasoning tags)
+		if textContent == "" && len(toolCalls) == 0 && finalCleanedResult != "" {
+			xlog.Debug("Open Responses Stream - No parsed output, using cleaned result")
+			textContent = finalCleanedResult
 		}

 		// Close message if we have text content
@@ -1875,8 +2075,18 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 			collectedOutputItems = append(collectedOutputItems, *functionCallItem)
 		}

-		// Build final response with all items (include logprobs)
+		// Build final response with all items (include reasoning first, then messages, then tool calls)
 		var allOutputItems []schema.ORItemField
+		// Add reasoning item if it exists
+		if currentReasoningID != "" && finalReasoning != "" {
+			allOutputItems = append(allOutputItems, schema.ORItemField{
+				Type:    "reasoning",
+				ID:      currentReasoningID,
+				Status:  "completed",
+				Content: []schema.ORContentPart{makeOutputTextPart(finalReasoning)},
+			})
+		}
+		// Add message item
 		if currentMessageID != "" && textContent != "" {
 			allOutputItems = append(allOutputItems, schema.ORItemField{
 				Type:    "message",
@@ -1886,6 +2096,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 				Content: []schema.ORContentPart{makeOutputTextPartWithLogprobs(textContent, prediction.Logprobs)},
 			})
 		}
+		// Add tool call items
 		for _, tc := range toolCalls {
 			toolCallID := fmt.Sprintf("fc_%s", uuid.New().String())
 			allOutputItems = append(allOutputItems, schema.ORItemField{
@@ -1904,6 +2115,9 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 			InputTokens:  prediction.Usage.Prompt,
 			OutputTokens: prediction.Usage.Completion,
 			TotalTokens:  prediction.Usage.Prompt + prediction.Usage.Completion,
+			OutputTokensDetails: &schema.OROutputTokensDetails{
+				ReasoningTokens: reasoningTokens,
+			},
 		}, shouldStore)

 		sendSSEEvent(c, &schema.ORStreamEvent{
@@ -1956,22 +2170,102 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 	})
 	sequenceNumber++

-	// Stream text deltas
+	// Stream text deltas with reasoning extraction
 	tokenCallback := func(token string, tokenUsage backend.TokenUsage) bool {
 		accumulatedText += token
+		accumulatedContent += token
+		// Prepend thinking token if needed, then extract reasoning
+		currentReasoning, cleanedContent := reason.ExtractReasoningWithConfig(accumulatedContent, thinkingStartToken, cfg.ReasoningConfig)

-		// Emit text delta
-		sendSSEEvent(c, &schema.ORStreamEvent{
-			Type:           "response.output_text.delta",
-			SequenceNumber: sequenceNumber,
-			ItemID:         currentMessageID,
-			OutputIndex:    &outputIndex,
-			ContentIndex:   &currentContentIndex,
-			Delta:          strPtr(token),
-			Logprobs:       emptyLogprobs(),
-		})
-		sequenceNumber++
-		c.Response().Flush()
+		// Handle reasoning item
+		if currentReasoning != "" {
+			// Check if we need to create reasoning item
+			if currentReasoningID == "" {
+				outputIndex++
+				currentReasoningID = fmt.Sprintf("reasoning_%s", uuid.New().String())
+				reasoningItem := &schema.ORItemField{
+					Type:   "reasoning",
+					ID:     currentReasoningID,
+					Status: "in_progress",
+				}
+				sendSSEEvent(c, &schema.ORStreamEvent{
+					Type:           "response.output_item.added",
+					SequenceNumber: sequenceNumber,
+					OutputIndex:    &outputIndex,
+					Item:           reasoningItem,
+				})
+				sequenceNumber++
+
+				// Emit content_part.added for reasoning
+				currentReasoningContentIndex = 0
+				emptyPart := makeOutputTextPart("")
+				sendSSEEvent(c, &schema.ORStreamEvent{
+					Type:           "response.content_part.added",
+					SequenceNumber: sequenceNumber,
+					ItemID:         currentReasoningID,
+					OutputIndex:    &outputIndex,
+					ContentIndex:   &currentReasoningContentIndex,
+					Part:           &emptyPart,
+				})
+				sequenceNumber++
+			}
+
+			// Calculate reasoning delta
+			var reasoningDelta string
+			if len(currentReasoning) > len(lastEmittedReasoning) && strings.HasPrefix(currentReasoning, lastEmittedReasoning) {
+				reasoningDelta = currentReasoning[len(lastEmittedReasoning):]
+				lastEmittedReasoning = currentReasoning
+			} else if currentReasoning != lastEmittedReasoning {
+				reasoningDelta = currentReasoning
+				lastEmittedReasoning = currentReasoning
+			}
+
+			// Emit reasoning delta if there's new content
+			if reasoningDelta != "" {
+				sendSSEEvent(c, &schema.ORStreamEvent{
+					Type:           "response.output_text.delta",
+					SequenceNumber: sequenceNumber,
+					ItemID:         currentReasoningID,
+					OutputIndex:    &outputIndex,
+					ContentIndex:   &currentReasoningContentIndex,
+					Delta:          strPtr(reasoningDelta),
+					Logprobs:       emptyLogprobs(),
+				})
+				sequenceNumber++
+				c.Response().Flush()
+			}
+		}
+
+		// Handle message content (cleaned content without reasoning tags)
+		var deltaContent string
+		if len(cleanedContent) > len(lastEmittedCleanedContent) && strings.HasPrefix(cleanedContent, lastEmittedCleanedContent) {
+			deltaContent = cleanedContent[len(lastEmittedCleanedContent):]
+			lastEmittedCleanedContent = cleanedContent
+		} else if cleanedContent != lastEmittedCleanedContent {
+			if lastEmittedCleanedContent == "" {
+				deltaContent = cleanedContent
+				lastEmittedCleanedContent = cleanedContent
+			} else {
+				deltaContent = cleanedContent
+				lastEmittedCleanedContent = cleanedContent
+			}
+		}
+
+		// Only emit message content if there's actual content (not just reasoning)
+		if deltaContent != "" {
+			// Emit text delta
+			sendSSEEvent(c, &schema.ORStreamEvent{
+				Type:           "response.output_text.delta",
+				SequenceNumber: sequenceNumber,
+				ItemID:         currentMessageID,
+				OutputIndex:    &outputIndex,
+				ContentIndex:   &currentContentIndex,
+				Delta:          strPtr(deltaContent),
+				Logprobs:       emptyLogprobs(),
+			})
+			sequenceNumber++
+			c.Response().Flush()
+		}
 		return true
 	}

@@ -2034,6 +2328,62 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6

 	result := backend.Finetune(*cfg, predInput, prediction.Response)

+	// Extract reasoning from final result for non-tool-call path
+	finalReasoning, finalCleanedResult := reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
+
+	// Close reasoning item if it exists and wasn't closed yet
+	if currentReasoningID != "" && finalReasoning != "" {
+		// Emit output_text.done for reasoning
+		sendSSEEvent(c, &schema.ORStreamEvent{
+			Type:           "response.output_text.done",
+			SequenceNumber: sequenceNumber,
+			ItemID:         currentReasoningID,
+			OutputIndex:    &outputIndex,
+			ContentIndex:   &currentReasoningContentIndex,
+			Text:           strPtr(finalReasoning),
+			Logprobs:       emptyLogprobs(),
+		})
+		sequenceNumber++
+
+		// Emit content_part.done for reasoning
+		reasoningPart := makeOutputTextPart(finalReasoning)
+		sendSSEEvent(c, &schema.ORStreamEvent{
+			Type:           "response.content_part.done",
+			SequenceNumber: sequenceNumber,
+			ItemID:         currentReasoningID,
+			OutputIndex:    &outputIndex,
+			ContentIndex:   &currentReasoningContentIndex,
+			Part:           &reasoningPart,
+		})
+		sequenceNumber++
+
+		// Emit output_item.done for reasoning
+		reasoningItem := &schema.ORItemField{
+			Type:    "reasoning",
+			ID:      currentReasoningID,
+			Status:  "completed",
+			Content: []schema.ORContentPart{reasoningPart},
+		}
+		sendSSEEvent(c, &schema.ORStreamEvent{
+			Type:           "response.output_item.done",
+			SequenceNumber: sequenceNumber,
+			OutputIndex:    &outputIndex,
+			Item:           reasoningItem,
+		})
+		sequenceNumber++
+
+		// Collect reasoning item for storage
+		collectedOutputItems = append(collectedOutputItems, *reasoningItem)
+
+		// Calculate reasoning tokens
+		reasoningTokens = len(finalReasoning) / 4
+		if reasoningTokens == 0 && len(finalReasoning) > 0 {
+			reasoningTokens = 1
+		}
+	}
+
+	result = finalCleanedResult
+
 	// Convert prediction logprobs for streaming events
 	mcpStreamLogprobs := convertLogprobsForStreaming(prediction.Logprobs)

@@ -2075,17 +2425,35 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 	// Emit response.completed
 	now := time.Now().Unix()

-	// Collect final output items (use collected items if available, otherwise use messageItem)
+	// Collect final output items (reasoning first, then message)
 	var finalOutputItems []schema.ORItemField
+	// Add reasoning item if it exists
+	if currentReasoningID != "" && finalReasoning != "" {
+		finalOutputItems = append(finalOutputItems, schema.ORItemField{
+			Type:    "reasoning",
+			ID:      currentReasoningID,
+			Status:  "completed",
+			Content: []schema.ORContentPart{makeOutputTextPart(finalReasoning)},
+		})
+	}
+	// Add message item
 	if len(collectedOutputItems) > 0 {
-		finalOutputItems = collectedOutputItems
+		// Use collected items (may include reasoning already)
+		for _, item := range collectedOutputItems {
+			if item.Type == "message" {
+				finalOutputItems = append(finalOutputItems, item)
+			}
+		}
 	} else {
-		finalOutputItems = []schema.ORItemField{*messageItem}
+		finalOutputItems = append(finalOutputItems, *messageItem)
 	}
 	responseCompleted := buildORResponse(responseID, createdAt, &now, "completed", input, finalOutputItems, &schema.ORUsage{
 		InputTokens:  prediction.Usage.Prompt,
 		OutputTokens: prediction.Usage.Completion,
 		TotalTokens:  prediction.Usage.Prompt + prediction.Usage.Completion,
+		OutputTokensDetails: &schema.OROutputTokensDetails{
+			ReasoningTokens: reasoningTokens,
+		},
 	}, shouldStore)
 	sendSSEEvent(c, &schema.ORStreamEvent{
 		Type:           "response.completed",
--- a/core/schema/openresponses.go
+++ b/core/schema/openresponses.go
@@ -93,7 +93,12 @@ type ORItemParam struct {
 	// Function call output fields
 	Output interface{} `json:"output,omitempty"` // string or []ORContentPart

+	// Reasoning fields (for type == "reasoning")
+	Summary         []ORContentPart `json:"summary,omitempty"`          // Array of summary parts
+	EncryptedContent *string        `json:"encrypted_content,omitempty"` // Provider-specific encrypted content
+
 	// Note: For item_reference type, use the ID field above to reference the item
+	// Note: For reasoning type, Content field (from message fields) contains the raw reasoning content
 }

 // ORContentPart represents a content block (discriminated union by type)
--- a/docs/content/advanced/model-configuration.md
+++ b/docs/content/advanced/model-configuration.md
@@ -397,6 +397,83 @@ Agent/autonomous agent configuration:
 | `agent.enable_mcp_prompts` | bool | Enable MCP prompts |
 | `agent.enable_plan_re_evaluator` | bool | Enable plan re-evaluation |

+## Reasoning Configuration
+
+Configure how reasoning tags are extracted and processed from model output. Reasoning tags are used by models like DeepSeek, Command-R, and others to include internal reasoning steps in their responses.
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `reasoning.disable` | bool | `false` | When `true`, disables reasoning extraction entirely. The original content is returned without any processing. |
+| `reasoning.disable_reasoning_tag_prefill` | bool | `false` | When `true`, disables automatic prepending of thinking start tokens. Use this when your model already includes reasoning tags in its output format. |
+| `reasoning.strip_reasoning_only` | bool | `false` | When `true`, extracts and removes reasoning tags from content but discards the reasoning text. Useful when you want to clean reasoning tags from output without storing the reasoning content. |
+| `reasoning.thinking_start_tokens` | array | `[]` | List of custom thinking start tokens to detect in prompts. Custom tokens are checked before default tokens. |
+| `reasoning.tag_pairs` | array | `[]` | List of custom tag pairs for reasoning extraction. Each entry has `start` and `end` fields. Custom pairs are checked before default pairs. |
+
+### Reasoning Tag Formats
+
+The reasoning extraction supports multiple tag formats used by different models:
+
+- `<thinking>...</thinking>` - General thinking tag
+- `<think>...</think>` - DeepSeek, Granite, ExaOne, GLM models
+- `<|START_THINKING|>...<|END_THINKING|>` - Command-R models
+- `<|inner_prefix|>...<|inner_suffix|>` - Apertus models
+- `<seed:think>...</seed:think>` - Seed models
+- `<|think|>...<|end|><|begin|>assistant<|content|>` - Solar Open models
+- `[THINK]...[/THINK]` - Magistral models
+
+### Examples
+
+**Disable reasoning extraction:**
+```yaml
+reasoning:
+  disable: true
+```
+
+**Extract reasoning but don't prepend tags:**
+```yaml
+reasoning:
+  disable_reasoning_tag_prefill: true
+```
+
+**Strip reasoning tags without storing reasoning content:**
+```yaml
+reasoning:
+  strip_reasoning_only: true
+```
+
+**Complete example with reasoning configuration:**
+```yaml
+name: deepseek-model
+backend: llama-cpp
+parameters:
+  model: deepseek.gguf
+
+reasoning:
+  disable: false
+  disable_reasoning_tag_prefill: false
+  strip_reasoning_only: false
+```
+
+**Example with custom tokens and tag pairs:**
+```yaml
+name: custom-reasoning-model
+backend: llama-cpp
+parameters:
+  model: custom.gguf
+
+reasoning:
+  thinking_start_tokens:
+    - "<custom:think>"
+    - "<my:reasoning>"
+  tag_pairs:
+    - start: "<custom:think>"
+      end: "</custom:think>"
+    - start: "<my:reasoning>"
+      end: "</my:reasoning>"
+```
+
+**Note:** Custom tokens and tag pairs are checked before the default ones, giving them priority. This allows you to override default behavior or add support for new reasoning tag formats.
+
 ## Pipeline Configuration

 Define pipelines for audio-to-audio processing:
--- a/pkg/reasoning/config.go
+++ b/pkg/reasoning/config.go
@@ -1,5 +1,15 @@
 package reasoning

-type Config struct {
-	DisableReasoningTagPrefill *bool `yaml:"disable_reasoning_tag_prefill,omitempty" json:"disable_reasoning_tag_prefill,omitempty"`
+// TagPair is a custom start/end tag pair for reasoning extraction; ExtractReasoning skips pairs whose Start or End is empty.
+type TagPair struct {
+	Start string `yaml:"start" json:"start"` // opening tag of a reasoning block
+	End   string `yaml:"end" json:"end"`     // closing tag of a reasoning block
+}
+
+type Config struct { // Config holds the per-model reasoning extraction settings read by this package.
+	DisableReasoningTagPrefill *bool     `yaml:"disable_reasoning_tag_prefill,omitempty" json:"disable_reasoning_tag_prefill,omitempty"` // true: do not prepend the detected thinking start token; nil means false
+	DisableReasoning           *bool     `yaml:"disable,omitempty" json:"disable,omitempty"`                                             // true: skip reasoning extraction and return the content untouched; nil means false
+	StripReasoningOnly         *bool     `yaml:"strip_reasoning_only,omitempty" json:"strip_reasoning_only,omitempty"`                   // true: still extract, but return an empty reasoning string; nil means false
+	ThinkingStartTokens        []string  `yaml:"thinking_start_tokens,omitempty" json:"thinking_start_tokens,omitempty"`                 // custom start tokens, checked before the built-in ones
+	TagPairs                   []TagPair `yaml:"tag_pairs,omitempty" json:"tag_pairs,omitempty"`                                         // custom tag pairs, checked before the built-in ones
 }
--- a/pkg/reasoning/reasoning.go
+++ b/pkg/reasoning/reasoning.go
@@ -17,12 +17,12 @@ import (
 // - <think>    (DeepSeek, Granite, ExaOne models)
 // - <|think|>               (Solar Open models)
 // - <thinking>              (General thinking tag)
-// - <think>                 (GLM models)
 // - [THINK]                 (Magistral models)
-func DetectThinkingStartToken(prompt string) string {
+// Custom tokens from config are checked first, then default tokens.
+func DetectThinkingStartToken(prompt string, config *Config) string {
 	// Common thinking start tokens (in order of specificity - longer first)
 	// Based on llama.cpp's chat-parser.cpp implementations
-	thinkingStartTokens := []string{
+	defaultTokens := []string{
 		"<|START_THINKING|>", // Command-R models
 		"<|inner_prefix|>",   // Apertus models
 		"<seed:think>",       // Seed models
@@ -32,6 +32,13 @@ func DetectThinkingStartToken(prompt string) string {
 		"[THINK]",            // Magistral models
 	}

+	// Merge custom tokens with default tokens (custom tokens first for priority)
+	var thinkingStartTokens []string
+	if config != nil && len(config.ThinkingStartTokens) > 0 {
+		thinkingStartTokens = append(thinkingStartTokens, config.ThinkingStartTokens...)
+	}
+	thinkingStartTokens = append(thinkingStartTokens, defaultTokens...)
+
 	// Check if prompt ends with any of these tokens (allowing for trailing whitespace/newlines)
 	trimmedPrompt := strings.TrimRight(prompt, " \t\n\r")
 	for _, token := range thinkingStartTokens {
@@ -58,6 +65,28 @@ func DetectThinkingStartToken(prompt string) string {
 	return ""
 }

+// ExtractReasoningWithConfig splits content into extracted reasoning and cleaned content according to config.
+// If config.DisableReasoning is set and true, nothing is processed and it returns ("", content).
+// Otherwise, unless config.DisableReasoningTagPrefill is set and true, thinkingStartToken is first passed to
+// PrependThinkingTokenIfNeeded; ExtractReasoning then runs, and the reasoning is blanked if config.StripReasoningOnly is true.
+func ExtractReasoningWithConfig(content, thinkingStartToken string, config Config) (reasoning string, cleanedContent string) {
+	cleanedContent = content
+	// A nil DisableReasoning pointer counts as "not disabled", so extraction runs by default
+	if config.DisableReasoning == nil || !*config.DisableReasoning {
+		// A nil DisableReasoningTagPrefill pointer counts as "prefill enabled", so the token is prepended by default
+		if config.DisableReasoningTagPrefill == nil || !*config.DisableReasoningTagPrefill {
+			cleanedContent = PrependThinkingTokenIfNeeded(cleanedContent, thinkingStartToken)
+		}
+		// Extract with the custom tag pairs from config; in strip-only mode keep the cleaned content but drop the reasoning text
+		reasoning, cleanedContent = ExtractReasoning(cleanedContent, &config)
+		if config.StripReasoningOnly != nil && *config.StripReasoningOnly {
+			reasoning = ""
+		}
+	}
+
+	return reasoning, cleanedContent
+}
+
 // PrependThinkingTokenIfNeeded prepends the thinking start token to content if it was
 // detected in the prompt. This allows the standard extraction logic to work correctly
 // for models where the thinking token is already in the prompt.
@@ -97,7 +126,8 @@ func PrependThinkingTokenIfNeeded(content string, startToken string) string {
 // both the extracted reasoning and the cleaned content (with tags removed).
 // It handles <thinking>...</thinking> and <think>...</think> tags.
 // Multiple reasoning blocks are concatenated with newlines.
-func ExtractReasoning(content string) (reasoning string, cleanedContent string) {
+// Custom tag pairs from config are checked first, then default tag pairs.
+func ExtractReasoning(content string, config *Config) (reasoning string, cleanedContent string) {
 	if content == "" {
 		return "", content
 	}
@@ -106,8 +136,8 @@ func ExtractReasoning(content string) (reasoning string, cleanedContent string)
 	var cleanedParts []string
 	remaining := content

-	// Define tag pairs to look for (matching llama.cpp's chat-parser.cpp)
-	tagPairs := []struct {
+	// Define default tag pairs to look for (matching llama.cpp's chat-parser.cpp)
+	defaultTagPairs := []struct {
 		start string
 		end   string
 	}{
@@ -120,6 +150,26 @@ func ExtractReasoning(content string) (reasoning string, cleanedContent string)
 		{"[THINK]", "[/THINK]"},                               // Magistral models
 	}

+	// Merge custom tag pairs with default tag pairs (custom pairs first for priority)
+	var tagPairs []struct {
+		start string
+		end   string
+	}
+	if config != nil && len(config.TagPairs) > 0 {
+		for _, pair := range config.TagPairs {
+			if pair.Start != "" && pair.End != "" {
+				tagPairs = append(tagPairs, struct {
+					start string
+					end   string
+				}{pair.Start, pair.End})
+			}
+		}
+	}
+	// Add default tag pairs
+	for _, pair := range defaultTagPairs {
+		tagPairs = append(tagPairs, pair)
+	}
+
 	// Track the last position we've processed
 	lastPos := 0

--- a/pkg/reasoning/reasoning_test.go
+++ b/pkg/reasoning/reasoning_test.go