package openai import ( "encoding/json" "fmt" "strings" "time" "github.com/google/uuid" "github.com/labstack/echo/v4" "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp" "github.com/mudler/LocalAI/core/http/middleware" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/pkg/functions" reason "github.com/mudler/LocalAI/pkg/reasoning" "github.com/mudler/LocalAI/core/templates" pb "github.com/mudler/LocalAI/pkg/grpc/proto" "github.com/mudler/LocalAI/pkg/model" "github.com/mudler/xlog" ) // mergeToolCallDeltas merges streaming tool call deltas into complete tool calls. // In SSE streaming, a single tool call arrives as multiple chunks sharing the same Index: // the first chunk carries the ID, Type, and Name; subsequent chunks append to Arguments. func mergeToolCallDeltas(existing []schema.ToolCall, deltas []schema.ToolCall) []schema.ToolCall { byIndex := make(map[int]int, len(existing)) // tool call Index -> position in slice for i, tc := range existing { byIndex[tc.Index] = i } for _, d := range deltas { pos, found := byIndex[d.Index] if !found { byIndex[d.Index] = len(existing) existing = append(existing, d) continue } // Merge into existing entry tc := &existing[pos] if d.ID != "" { tc.ID = d.ID } if d.Type != "" { tc.Type = d.Type } if d.FunctionCall.Name != "" { tc.FunctionCall.Name = d.FunctionCall.Name } tc.FunctionCall.Arguments += d.FunctionCall.Arguments } return existing } // ChatEndpoint is the OpenAI Completion API endpoint https://platform.openai.com/docs/api-reference/chat/create // @Summary Generate a chat completions for a given prompt and model. 
// @Param request body schema.OpenAIRequest true "query params"
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /v1/chat/completions [post]
func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, startupOptions *config.ApplicationConfig) echo.HandlerFunc {
	// NOTE(review): these variables are captured by the returned handler and by
	// the process/processTools closures below, so they are shared across every
	// request served by this handler instance. Concurrent requests would race on
	// them (each request overwrites id/created/textContentToReturn) — confirm
	// whether requests to this endpoint are serialized upstream.
	var id, textContentToReturn string
	var created int

	// process streams a plain (no tool-calling) completion: it emits an initial
	// role chunk, then one chunk per token with reasoning/content split by the
	// reasoning extractor, and closes the responses channel when done.
	process := func(s string, req *schema.OpenAIRequest, config *config.ModelConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool) error {
		initialMessage := schema.OpenAIResponse{
			ID:      id,
			Created: created,
			Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
			Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant"}, Index: 0, FinishReason: nil}},
			Object:  "chat.completion.chunk",
		}
		responses <- initialMessage

		// Detect if thinking token is already in prompt or template
		// When UseTokenizerTemplate is enabled, predInput is empty, so we check the template
		var template string
		if config.TemplateConfig.UseTokenizerTemplate {
			template = config.GetModelTemplate()
		} else {
			template = s
		}
		thinkingStartToken := reason.DetectThinkingStartToken(template, &config.ReasoningConfig)
		extractor := reason.NewReasoningExtractor(thinkingStartToken, config.ReasoningConfig)

		_, _, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
			// Split the raw token into reasoning vs. visible content.
			reasoningDelta, contentDelta := extractor.ProcessToken(s)
			usage := schema.OpenAIUsage{
				PromptTokens:     tokenUsage.Prompt,
				CompletionTokens: tokenUsage.Completion,
				TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
			}
			if extraUsage {
				usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
				usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
			}
			delta := &schema.Message{}
			if contentDelta != "" {
				delta.Content = &contentDelta
			}
			if reasoningDelta != "" {
				delta.Reasoning = &reasoningDelta
			}
			resp := schema.OpenAIResponse{
				ID:      id,
				Created: created,
				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
				Choices: []schema.Choice{{Delta: delta, Index: 0, FinishReason: nil}},
				Object:  "chat.completion.chunk",
				Usage:   usage,
			}
			responses <- resp
			return true
		})
		// Closing the channel signals the reader loop that streaming is complete.
		close(responses)
		return err
	}

	// processTools streams a completion with tool-calling enabled: it streams
	// reasoning/content deltas while incrementally parsing the accumulated text
	// for XML/JSON tool calls, then (after generation finishes) resolves the
	// final tool calls either from backend chat deltas or by Go-side parsing,
	// and emits the tool-call (or fallback answer) chunks.
	processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.ModelConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool) error {
		// Detect if thinking token is already in prompt or template
		var template string
		if config.TemplateConfig.UseTokenizerTemplate {
			template = config.GetModelTemplate()
		} else {
			template = prompt
		}
		thinkingStartToken := reason.DetectThinkingStartToken(template, &config.ReasoningConfig)
		extractor := reason.NewReasoningExtractor(thinkingStartToken, config.ReasoningConfig)

		result := ""            // raw text accumulated across all tokens
		lastEmittedCount := 0   // number of tool calls already emitted to the stream
		sentInitialRole := false

		_, tokenUsage, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
			result += s
			reasoningDelta, contentDelta := extractor.ProcessToken(s)

			// Emit reasoning deltas in their own SSE chunks before any tool-call chunks
			// (OpenAI spec: reasoning and tool_calls never share a delta)
			if reasoningDelta != "" {
				responses <- schema.OpenAIResponse{
					ID:      id,
					Created: created,
					Model:   req.Model,
					Choices: []schema.Choice{{
						Delta: &schema.Message{Reasoning: &reasoningDelta},
						Index: 0,
					}},
					Object: "chat.completion.chunk",
				}
			}

			// Stream content deltas (cleaned of reasoning tags) while no tool calls
			// have been detected. Once the incremental parser finds tool calls,
			// content stops — per OpenAI spec, content and tool_calls don't mix.
			if lastEmittedCount == 0 && contentDelta != "" {
				if !sentInitialRole {
					responses <- schema.OpenAIResponse{
						ID:      id,
						Created: created,
						Model:   req.Model,
						Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant"}, Index: 0}},
						Object:  "chat.completion.chunk",
					}
					sentInitialRole = true
				}
				responses <- schema.OpenAIResponse{
					ID:      id,
					Created: created,
					Model:   req.Model,
					Choices: []schema.Choice{{
						Delta: &schema.Message{Content: &contentDelta},
						Index: 0,
					}},
					Object: "chat.completion.chunk",
				}
			}

			// Try incremental XML parsing for streaming support using iterative parser
			// This allows emitting partial tool calls as they're being generated
			cleanedResult := functions.CleanupLLMResult(result, config.FunctionsConfig)

			// Determine XML format from config
			var xmlFormat *functions.XMLToolCallFormat
			if config.FunctionsConfig.XMLFormat != nil {
				xmlFormat = config.FunctionsConfig.XMLFormat
			} else if config.FunctionsConfig.XMLFormatPreset != "" {
				xmlFormat = functions.GetXMLFormatPreset(config.FunctionsConfig.XMLFormatPreset)
			}

			// Use iterative parser for streaming (partial parsing enabled)
			// Try XML parsing first
			partialResults, parseErr := functions.ParseXMLIterative(cleanedResult, xmlFormat, true)
			if parseErr == nil && len(partialResults) > 0 {
				// Emit new XML tool calls that weren't emitted before
				if len(partialResults) > lastEmittedCount {
					for i := lastEmittedCount; i < len(partialResults); i++ {
						toolCall := partialResults[i]
						initialMessage := schema.OpenAIResponse{
							ID:      id,
							Created: created,
							Model:   req.Model,
							Choices: []schema.Choice{{
								Delta: &schema.Message{
									Role: "assistant",
									ToolCalls: []schema.ToolCall{
										{
											Index: i,
											ID:    id,
											Type:  "function",
											FunctionCall: schema.FunctionCall{
												Name: toolCall.Name,
											},
										},
									},
								},
								Index:        0,
								FinishReason: nil,
							}},
							Object: "chat.completion.chunk",
						}
						// NOTE(review): non-blocking send on an unbuffered channel — if
						// the reader goroutine is busy (e.g. flushing the previous
						// chunk), this tool-call chunk is silently dropped. The other
						// sends in this callback block instead; confirm the drop here
						// is intentional.
						select {
						case responses <- initialMessage:
						default:
						}
					}
					lastEmittedCount = len(partialResults)
				}
			} else {
				// Try JSON tool call parsing for streaming
				// Check if the result looks like JSON tool calls
				jsonResults, jsonErr := functions.ParseJSONIterative(cleanedResult, true)
				if jsonErr == nil && len(jsonResults) > 0 {
					// Check if these are tool calls (have "name" and optionally "arguments")
					for _, jsonObj := range jsonResults {
						if name, ok := jsonObj["name"].(string); ok && name != "" {
							// This looks like a tool call
							args := "{}"
							if argsVal, ok := jsonObj["arguments"]; ok {
								if argsStr, ok := argsVal.(string); ok {
									args = argsStr
								} else {
									argsBytes, _ := json.Marshal(argsVal)
									args = string(argsBytes)
								}
							}
							// Emit tool call
							initialMessage := schema.OpenAIResponse{
								ID:      id,
								Created: created,
								Model:   req.Model,
								Choices: []schema.Choice{{
									Delta: &schema.Message{
										Role: "assistant",
										ToolCalls: []schema.ToolCall{
											{
												Index: lastEmittedCount,
												ID:    id,
												Type:  "function",
												FunctionCall: schema.FunctionCall{
													Name:      name,
													Arguments: args,
												},
											},
										},
									},
									Index:        0,
									FinishReason: nil,
								}},
								Object: "chat.completion.chunk",
							}
							// NOTE(review): same non-blocking send as the XML path above —
							// the chunk is dropped if the reader isn't ready.
							select {
							case responses <- initialMessage:
							default:
							}
							lastEmittedCount++
						}
					}
				}
			}
			return true
		}, func(attempt int) bool {
			// After streaming completes: check if we got actionable content
			cleaned := extractor.CleanedContent()
			// Check for tool calls from chat deltas (will be re-checked after ComputeChoices,
			// but we need to know here whether to retry)
			hasToolCalls := lastEmittedCount > 0
			if cleaned == "" && !hasToolCalls {
				xlog.Warn("Streaming: backend produced only reasoning, retrying", "reasoning_len", len(extractor.Reasoning()), "attempt", attempt+1)
				// Reset accumulated state so the retry starts from scratch.
				extractor.ResetAndSuppressReasoning()
				result = ""
				lastEmittedCount = 0
				sentInitialRole = false
				return true
			}
			return false
		},
		)
		if err != nil {
			return err
		}

		// Try using pre-parsed tool calls from C++ autoparser (chat deltas)
		var functionResults []functions.FuncCallResults
		var reasoning string
		if deltaToolCalls := functions.ToolCallsFromChatDeltas(chatDeltas); len(deltaToolCalls) > 0 {
			xlog.Debug("[ChatDeltas] Using pre-parsed tool calls from C++ autoparser", "count", len(deltaToolCalls))
			functionResults = deltaToolCalls
			// Use content/reasoning from deltas too
			textContentToReturn = functions.ContentFromChatDeltas(chatDeltas)
			reasoning = functions.ReasoningFromChatDeltas(chatDeltas)
		} else {
			// Fallback: parse tool calls from raw text (no chat deltas from backend)
			xlog.Debug("[ChatDeltas] no pre-parsed tool calls, falling back to Go-side text parsing")
			reasoning = extractor.Reasoning()
			cleanedResult := extractor.CleanedContent()
			textContentToReturn = functions.ParseTextContent(cleanedResult, config.FunctionsConfig)
			cleanedResult = functions.CleanupLLMResult(cleanedResult, config.FunctionsConfig)
			functionResults = functions.ParseFunctionCall(cleanedResult, config.FunctionsConfig)
		}
		xlog.Debug("[ChatDeltas] final tool call decision", "tool_calls", len(functionResults), "text_content", textContentToReturn)

		// "No action" means either no tool calls at all, or the first call is the
		// configured no-action function — in both cases we answer with plain text.
		noActionToRun := len(functionResults) > 0 && functionResults[0].Name == noAction || len(functionResults) == 0

		switch {
		case noActionToRun:
			usage := schema.OpenAIUsage{
				PromptTokens:     tokenUsage.Prompt,
				CompletionTokens: tokenUsage.Completion,
				TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
			}
			if extraUsage {
				usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
				usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
			}
			if sentInitialRole {
				// Content was already streamed during the callback — just emit usage.
				delta := &schema.Message{}
				if reasoning != "" && extractor.Reasoning() == "" {
					delta.Reasoning = &reasoning
				}
				responses <- schema.OpenAIResponse{
					ID:      id,
					Created: created,
					Model:   req.Model,
					Choices: []schema.Choice{{Delta: delta, Index: 0}},
					Object:  "chat.completion.chunk",
					Usage:   usage,
				}
			} else {
				// Content was NOT streamed — send everything at once (fallback).
				responses <- schema.OpenAIResponse{
					ID:      id,
					Created: created,
					Model:   req.Model,
					Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant"}, Index: 0}},
					Object:  "chat.completion.chunk",
				}
				result, err := handleQuestion(config, functionResults, extractor.CleanedContent(), prompt)
				if err != nil {
					xlog.Error("error handling question", "error", err)
					return err
				}
				delta := &schema.Message{Content: &result}
				if reasoning != "" {
					delta.Reasoning = &reasoning
				}
				responses <- schema.OpenAIResponse{
					ID:      id,
					Created: created,
					Model:   req.Model,
					Choices: []schema.Choice{{Delta: delta, Index: 0}},
					Object:  "chat.completion.chunk",
					Usage:   usage,
				}
			}
		default:
			// Emit each resolved tool call as two chunks: first the ID/Type/Name,
			// then the arguments (mirrors OpenAI's streaming delta shape).
			for i, ss := range functionResults {
				name, args := ss.Name, ss.Arguments
				toolCallID := ss.ID
				if toolCallID == "" {
					toolCallID = id
				}
				initialMessage := schema.OpenAIResponse{
					ID:      id,
					Created: created,
					Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
					Choices: []schema.Choice{{
						Delta: &schema.Message{
							Role: "assistant",
							ToolCalls: []schema.ToolCall{
								{
									Index: i,
									ID:    toolCallID,
									Type:  "function",
									FunctionCall: schema.FunctionCall{
										Name: name,
									},
								},
							},
						},
						Index:        0,
						FinishReason: nil,
					}},
					Object: "chat.completion.chunk",
				}
				responses <- initialMessage

				responses <- schema.OpenAIResponse{
					ID:      id,
					Created: created,
					Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
					Choices: []schema.Choice{{
						Delta: &schema.Message{
							Role:    "assistant",
							Content: &textContentToReturn,
							ToolCalls: []schema.ToolCall{
								{
									Index: i,
									ID:    toolCallID,
									Type:  "function",
									FunctionCall: schema.FunctionCall{
										Arguments: args,
									},
								},
							},
						},
						Index:        0,
						FinishReason: nil,
					}},
					Object: "chat.completion.chunk",
				}
			}
		}
		close(responses)
		return err
	}

	// The actual request handler: validates input, performs MCP prompt/resource/
	// tool injection, builds grammars, then dispatches to the streaming or
	// non-streaming execution path.
	return func(c echo.Context) error {
		textContentToReturn = ""
		id = uuid.New().String()
		created = int(time.Now().Unix())

		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
		if !ok || input.Model == "" {
			return echo.ErrBadRequest
		}
		// Opt-in header for timing fields in the usage block.
		extraUsage := c.Request().Header.Get("Extra-Usage") != ""

		config, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
		if !ok || config == nil {
			return echo.ErrBadRequest
		}
		xlog.Debug("Chat endpoint configuration read", "config", config)

		funcs := input.Functions
		shouldUseFn := len(input.Functions) > 0 && config.ShouldUseFunctions()
		strictMode := false

		// MCP tool injection: when mcp_servers is set in metadata and model has MCP config
		var mcpToolInfos []mcpTools.MCPToolInfo
		mcpServers := mcpTools.MCPServersFromMetadata(input.Metadata)
		// MCP prompt and resource injection (extracted before tool injection)
		mcpPromptName, mcpPromptArgs := mcpTools.MCPPromptFromMetadata(input.Metadata)
		mcpResourceURIs := mcpTools.MCPResourcesFromMetadata(input.Metadata)
		if (len(mcpServers) > 0 || mcpPromptName != "" || len(mcpResourceURIs) > 0) && (config.MCP.Servers != "" || config.MCP.Stdio != "") {
			remote, stdio, mcpErr := config.MCP.MCPConfigFromYAML()
			if mcpErr == nil {
				namedSessions, sessErr := mcpTools.NamedSessionsFromMCPConfig(config.Name, remote, stdio, mcpServers)
				if sessErr == nil && len(namedSessions) > 0 {
					// Prompt injection: prepend prompt messages to the conversation
					if mcpPromptName != "" {
						prompts, discErr := mcpTools.DiscoverMCPPrompts(c.Request().Context(), namedSessions)
						if discErr == nil {
							promptMsgs, getErr := mcpTools.GetMCPPrompt(c.Request().Context(), prompts, mcpPromptName, mcpPromptArgs)
							if getErr == nil {
								var injected []schema.Message
								for _, pm := range promptMsgs {
									injected = append(injected, schema.Message{
										Role:    string(pm.Role),
										Content: mcpTools.PromptMessageToText(pm),
									})
								}
								input.Messages = append(injected, input.Messages...)
								xlog.Debug("MCP prompt injected", "prompt", mcpPromptName, "messages", len(injected))
							} else {
								xlog.Error("Failed to get MCP prompt", "error", getErr)
							}
						} else {
							xlog.Error("Failed to discover MCP prompts", "error", discErr)
						}
					}

					// Resource injection: append resource content to the last user message
					if len(mcpResourceURIs) > 0 {
						resources, discErr := mcpTools.DiscoverMCPResources(c.Request().Context(), namedSessions)
						if discErr == nil {
							var resourceTexts []string
							for _, uri := range mcpResourceURIs {
								content, readErr := mcpTools.ReadMCPResource(c.Request().Context(), resources, uri)
								if readErr != nil {
									xlog.Error("Failed to read MCP resource", "error", readErr, "uri", uri)
									continue
								}
								// Find resource name
								name := uri
								for _, r := range resources {
									if r.URI == uri {
										name = r.Name
										break
									}
								}
								resourceTexts = append(resourceTexts, fmt.Sprintf("--- MCP Resource: %s ---\n%s", name, content))
							}
							if len(resourceTexts) > 0 && len(input.Messages) > 0 {
								lastIdx := len(input.Messages) - 1
								suffix := "\n\n" + strings.Join(resourceTexts, "\n\n")
								// Content may be a plain string or another type;
								// fall back to %v formatting otherwise.
								switch ct := input.Messages[lastIdx].Content.(type) {
								case string:
									input.Messages[lastIdx].Content = ct + suffix
								default:
									input.Messages[lastIdx].Content = fmt.Sprintf("%v%s", ct, suffix)
								}
								xlog.Debug("MCP resources injected", "count", len(resourceTexts))
							}
						} else {
							xlog.Error("Failed to discover MCP resources", "error", discErr)
						}
					}

					// Tool injection
					if len(mcpServers) > 0 {
						discovered, discErr := mcpTools.DiscoverMCPTools(c.Request().Context(), namedSessions)
						if discErr == nil {
							mcpToolInfos = discovered
							for _, ti := range mcpToolInfos {
								funcs = append(funcs, ti.Function)
								input.Tools = append(input.Tools, functions.Tool{Type: "function", Function: ti.Function})
							}
							// Re-evaluate now that MCP tools were added.
							shouldUseFn = len(funcs) > 0 && config.ShouldUseFunctions()
							xlog.Debug("MCP tools injected", "count", len(mcpToolInfos), "total_funcs", len(funcs))
						} else {
							xlog.Error("Failed to discover MCP tools", "error", discErr)
						}
					}
				}
			} else {
				xlog.Error("Failed to parse MCP config", "error", mcpErr)
			}
		}

		xlog.Debug("Tool call routing decision",
			"shouldUseFn", shouldUseFn,
			"len(input.Functions)", len(input.Functions),
			"len(input.Tools)", len(input.Tools),
			"config.ShouldUseFunctions()", config.ShouldUseFunctions(),
			"config.FunctionToCall()", config.FunctionToCall(),
		)

		for _, f := range input.Functions {
			if f.Strict {
				strictMode = true
				break
			}
		}

		// Allow the user to set custom actions via config file
		// to be "embedded" in each model
		noActionName := "answer"
		noActionDescription := "use this action to answer without performing any action"

		if config.FunctionsConfig.NoActionFunctionName != "" {
			noActionName = config.FunctionsConfig.NoActionFunctionName
		}
		if config.FunctionsConfig.NoActionDescriptionName != "" {
			noActionDescription = config.FunctionsConfig.NoActionDescriptionName
		}

		// If we are using a response format, we need to generate a grammar for it
		if config.ResponseFormatMap != nil {
			d := schema.ChatCompletionResponseFormat{}
			dat, err := json.Marshal(config.ResponseFormatMap)
			if err != nil {
				return err
			}
			err = json.Unmarshal(dat, &d)
			if err != nil {
				return err
			}
			switch d.Type {
			case "json_object":
				input.Grammar = functions.JSONBNF
			case "json_schema":
				d := schema.JsonSchemaRequest{}
				dat, err := json.Marshal(config.ResponseFormatMap)
				if err != nil {
					return err
				}
				err = json.Unmarshal(dat, &d)
				if err != nil {
					return err
				}
				fs := &functions.JSONFunctionStructure{
					AnyOf: []functions.Item{d.JsonSchema.Schema},
				}
				g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...)
				if err == nil {
					input.Grammar = g
				} else {
					xlog.Error("Failed generating grammar", "error", err)
				}
			}
		}

		config.Grammar = input.Grammar

		if shouldUseFn {
			xlog.Debug("Response needs to process functions")
		}

		switch {
		// Generates grammar with internal's LocalAI engine
		case (!config.FunctionsConfig.GrammarConfig.NoGrammar || strictMode) && shouldUseFn:
			noActionGrammar := functions.Function{
				Name:        noActionName,
				Description: noActionDescription,
				Parameters: map[string]interface{}{
					"properties": map[string]interface{}{
						"message": map[string]interface{}{
							"type":        "string",
							"description": "The message to reply the user with",
						}},
				},
			}
			// Append the no action function
			if !config.FunctionsConfig.DisableNoAction && !strictMode {
				funcs = append(funcs, noActionGrammar)
			}

			// Force picking one of the functions by the request
			if config.FunctionToCall() != "" {
				funcs = funcs.Select(config.FunctionToCall())
			}

			// Update input grammar or json_schema based on use_llama_grammar option
			jsStruct := funcs.ToJSONStructure(config.FunctionsConfig.FunctionNameKey, config.FunctionsConfig.FunctionNameKey)
			g, err := jsStruct.Grammar(config.FunctionsConfig.GrammarOptions()...)
			if err == nil {
				config.Grammar = g
			} else {
				xlog.Error("Failed generating grammar", "error", err)
			}
		case input.JSONFunctionGrammarObject != nil:
			g, err := input.JSONFunctionGrammarObject.Grammar(config.FunctionsConfig.GrammarOptions()...)
			if err == nil {
				config.Grammar = g
			} else {
				xlog.Error("Failed generating grammar", "error", err)
			}
		default:
			// Force picking one of the functions by the request
			if config.FunctionToCall() != "" {
				funcs = funcs.Select(config.FunctionToCall())
			}
		}

		// process functions if we have any defined or if we have a function call string
		// functions are not supported in stream mode (yet?)
		toStream := input.Stream

		xlog.Debug("Parameters", "config", config)

		var predInput string

		// If we are using the tokenizer template, we don't need to process the messages
		// unless we are processing functions
		if !config.TemplateConfig.UseTokenizerTemplate {
			predInput = evaluator.TemplateMessages(*input, input.Messages, config, funcs, shouldUseFn)
			xlog.Debug("Prompt (after templating)", "prompt", predInput)
			if config.Grammar != "" {
				xlog.Debug("Grammar", "grammar", config.Grammar)
			}
		}

		switch {
		// Streaming (SSE) path.
		case toStream:
			xlog.Debug("Stream request received")
			c.Response().Header().Set("Content-Type", "text/event-stream")
			c.Response().Header().Set("Cache-Control", "no-cache")
			c.Response().Header().Set("Connection", "keep-alive")
			c.Response().Header().Set("X-Correlation-ID", id)

			mcpStreamMaxIterations := 10
			if config.Agent.MaxIterations > 0 {
				mcpStreamMaxIterations = config.Agent.MaxIterations
			}
			hasMCPToolsStream := len(mcpToolInfos) > 0

			for mcpStreamIter := 0; mcpStreamIter <= mcpStreamMaxIterations; mcpStreamIter++ {
				// Re-template on MCP iterations
				if mcpStreamIter > 0 && !config.TemplateConfig.UseTokenizerTemplate {
					predInput = evaluator.TemplateMessages(*input, input.Messages, config, funcs, shouldUseFn)
					xlog.Debug("MCP stream re-templating", "iteration", mcpStreamIter)
				}

				responses := make(chan schema.OpenAIResponse)
				ended := make(chan error, 1)

				// Producer goroutine: runs inference and pushes chunks on responses;
				// its final error (or nil) arrives on ended.
				go func() {
					if !shouldUseFn {
						ended <- process(predInput, input, config, ml, responses, extraUsage)
					} else {
						ended <- processTools(noActionName, predInput, input, config, ml, responses, extraUsage)
					}
				}()

				usage := &schema.OpenAIUsage{}
				toolsCalled := false
				var collectedToolCalls []schema.ToolCall
				var collectedContent string
			LOOP:
				for {
					select {
					case <-input.Context.Done():
						// Context was cancelled (client disconnected or request cancelled)
						xlog.Debug("Request context cancelled, stopping stream")
						input.Cancel()
						break LOOP
					case ev := <-responses:
						if len(ev.Choices) == 0 {
							xlog.Debug("No choices in the response, skipping")
							continue
						}
						usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it
						if len(ev.Choices[0].Delta.ToolCalls) > 0 {
							toolsCalled = true
							// Collect and merge tool call deltas for MCP execution
							if hasMCPToolsStream {
								collectedToolCalls = mergeToolCallDeltas(collectedToolCalls, ev.Choices[0].Delta.ToolCalls)
							}
						}
						// Collect content for MCP conversation history
						if hasMCPToolsStream && ev.Choices[0].Delta != nil && ev.Choices[0].Delta.Content != nil {
							if s, ok := ev.Choices[0].Delta.Content.(string); ok {
								collectedContent += s
							} else if sp, ok := ev.Choices[0].Delta.Content.(*string); ok && sp != nil {
								collectedContent += *sp
							}
						}
						respData, err := json.Marshal(ev)
						if err != nil {
							xlog.Debug("Failed to marshal response", "error", err)
							input.Cancel()
							continue
						}
						xlog.Debug("Sending chunk", "chunk", string(respData))
						_, err = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", string(respData))
						if err != nil {
							xlog.Debug("Sending chunk failed", "error", err)
							input.Cancel()
							return err
						}
						c.Response().Flush()
					case err := <-ended:
						if err == nil {
							// Producer finished cleanly; fall through to the stop chunk.
							break LOOP
						}
						xlog.Error("Stream ended with error", "error", err)
						stopReason := FinishReasonStop
						resp := &schema.OpenAIResponse{
							ID:      id,
							Created: created,
							Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
							Choices: []schema.Choice{
								{
									FinishReason: &stopReason,
									Index:        0,
									Delta:        &schema.Message{Content: "Internal error: " + err.Error()},
								}},
							Object: "chat.completion.chunk",
							Usage:  *usage,
						}
						respData, marshalErr := json.Marshal(resp)
						if marshalErr != nil {
							xlog.Error("Failed to marshal error response", "error", marshalErr)
							// Send a simple error message as fallback
							fmt.Fprintf(c.Response().Writer, "data: {\"error\":\"Internal error\"}\n\n")
						} else {
							fmt.Fprintf(c.Response().Writer, "data: %s\n\n", respData)
						}
						fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n")
						c.Response().Flush()
						return nil
					}
				}

				// MCP streaming tool execution: if we collected MCP tool calls, execute and loop
				if hasMCPToolsStream && toolsCalled && len(collectedToolCalls) > 0 {
					var hasMCPCalls bool
					for _, tc := range collectedToolCalls {
						if mcpTools.IsMCPTool(mcpToolInfos, tc.FunctionCall.Name) {
							hasMCPCalls = true
							break
						}
					}
					if hasMCPCalls {
						// Append assistant message with tool_calls
						assistantMsg := schema.Message{
							Role:      "assistant",
							Content:   collectedContent,
							ToolCalls: collectedToolCalls,
						}
						input.Messages = append(input.Messages, assistantMsg)

						// Execute MCP tool calls and stream results as tool_result events
						for _, tc := range collectedToolCalls {
							if !mcpTools.IsMCPTool(mcpToolInfos, tc.FunctionCall.Name) {
								continue
							}
							xlog.Debug("Executing MCP tool (stream)", "tool", tc.FunctionCall.Name, "iteration", mcpStreamIter)
							toolResult, toolErr := mcpTools.ExecuteMCPToolCall(
								c.Request().Context(), mcpToolInfos, tc.FunctionCall.Name, tc.FunctionCall.Arguments,
							)
							if toolErr != nil {
								xlog.Error("MCP tool execution failed", "tool", tc.FunctionCall.Name, "error", toolErr)
								// Feed the error back to the model as the tool result.
								toolResult = fmt.Sprintf("Error: %v", toolErr)
							}
							input.Messages = append(input.Messages, schema.Message{
								Role:          "tool",
								Content:       toolResult,
								StringContent: toolResult,
								ToolCallID:    tc.ID,
								Name:          tc.FunctionCall.Name,
							})
							// Stream tool result event to client
							mcpEvent := map[string]any{
								"type":   "mcp_tool_result",
								"name":   tc.FunctionCall.Name,
								"result": toolResult,
							}
							if mcpEventData, err := json.Marshal(mcpEvent); err == nil {
								fmt.Fprintf(c.Response().Writer, "data: %s\n\n", mcpEventData)
								c.Response().Flush()
							}
						}
						xlog.Debug("MCP streaming tools executed, re-running inference", "iteration", mcpStreamIter)
						continue // next MCP stream iteration
					}
				}

				// No MCP tools to execute, send final stop message
				finishReason := FinishReasonStop
				if toolsCalled && len(input.Tools) > 0 {
					finishReason = FinishReasonToolCalls
				} else if toolsCalled {
					finishReason = FinishReasonFunctionCall
				}

				resp := &schema.OpenAIResponse{
					ID:      id,
					Created: created,
					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
					Choices: []schema.Choice{
						{
							FinishReason: &finishReason,
							Index:        0,
							Delta:        &schema.Message{},
						}},
					Object: "chat.completion.chunk",
					Usage:  *usage,
				}
				respData, _ := json.Marshal(resp)

				fmt.Fprintf(c.Response().Writer, "data: %s\n\n", respData)
				fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n")
				c.Response().Flush()

				xlog.Debug("Stream ended")
				return nil
			} // end MCP stream iteration loop

			// Safety fallback
			fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n")
			c.Response().Flush()
			return nil

		// no streaming mode
		default:
			mcpMaxIterations := 10
			if config.Agent.MaxIterations > 0 {
				mcpMaxIterations = config.Agent.MaxIterations
			}
			hasMCPTools := len(mcpToolInfos) > 0

			for mcpIteration := 0; mcpIteration <= mcpMaxIterations; mcpIteration++ {
				// Re-template on each MCP iteration since messages may have changed
				if mcpIteration > 0 && !config.TemplateConfig.UseTokenizerTemplate {
					predInput = evaluator.TemplateMessages(*input, input.Messages, config, funcs, shouldUseFn)
					xlog.Debug("MCP re-templating", "iteration", mcpIteration, "prompt_len", len(predInput))
				}

				// Detect if thinking token is already in prompt or template
				var template string
				if config.TemplateConfig.UseTokenizerTemplate {
					template = config.GetModelTemplate() // TODO: this should be the parsed jinja template. But for now this is the best we can do.
				} else {
					template = predInput
				}
				thinkingStartToken := reason.DetectThinkingStartToken(template, &config.ReasoningConfig)
				xlog.Debug("Thinking start token", "thinkingStartToken", thinkingStartToken, "template", template)

				// When shouldUseFn, the callback just stores the raw text — tool parsing
				// is deferred to after ComputeChoices so we can check chat deltas first
				// and avoid redundant Go-side parsing.
				var cbRawResult, cbReasoning string
				tokenCallback := func(s string, c *[]schema.Choice) {
					reasoning, s := reason.ExtractReasoningWithConfig(s, thinkingStartToken, config.ReasoningConfig)
					if !shouldUseFn {
						stopReason := FinishReasonStop
						message := &schema.Message{Role: "assistant", Content: &s}
						if reasoning != "" {
							message.Reasoning = &reasoning
						}
						*c = append(*c, schema.Choice{FinishReason: &stopReason, Index: 0, Message: message})
						return
					}
					// Store raw text for deferred tool parsing
					cbRawResult = s
					cbReasoning = reasoning
				}

				var result []schema.Choice
				var tokenUsage backend.TokenUsage
				var err error
				var chatDeltas []*pb.ChatDelta

				result, tokenUsage, chatDeltas, err = ComputeChoices(
					input, predInput, config, cl, startupOptions, ml,
					tokenCallback,
					nil,
					func(attempt int) bool {
						if !shouldUseFn {
							return false
						}
						// Retry when backend produced only reasoning and no content/tool calls.
						// Full tool parsing is deferred until after ComputeChoices returns
						// (when chat deltas are available), but we can detect the empty case here.
						if cbRawResult == "" && textContentToReturn == "" {
							xlog.Warn("Backend produced reasoning without actionable content, retrying", "reasoning_len", len(cbReasoning), "attempt", attempt+1)
							cbRawResult = ""
							cbReasoning = ""
							textContentToReturn = ""
							return true
						}
						return false
					},
				)
				if err != nil {
					return err
				}

				// Tool parsing is deferred here (only when shouldUseFn) so chat deltas are available
				if shouldUseFn {
					var funcResults []functions.FuncCallResults
					// Try pre-parsed tool calls from C++ autoparser first
					if deltaToolCalls := functions.ToolCallsFromChatDeltas(chatDeltas); len(deltaToolCalls) > 0 {
						xlog.Debug("[ChatDeltas] non-SSE: using C++ autoparser tool calls, skipping Go-side parsing", "count", len(deltaToolCalls))
						funcResults = deltaToolCalls
						textContentToReturn = functions.ContentFromChatDeltas(chatDeltas)
						cbReasoning = functions.ReasoningFromChatDeltas(chatDeltas)
					} else {
						// Fallback: parse tool calls from raw text
						xlog.Debug("[ChatDeltas] non-SSE: no chat deltas, falling back to Go-side text parsing")
						textContentToReturn = functions.ParseTextContent(cbRawResult, config.FunctionsConfig)
						cbRawResult = functions.CleanupLLMResult(cbRawResult, config.FunctionsConfig)
						funcResults = functions.ParseFunctionCall(cbRawResult, config.FunctionsConfig)
					}

					// Same "no action" rule as the streaming path: no calls, or the
					// first call is the configured no-action function.
					noActionsToRun := len(funcResults) > 0 && funcResults[0].Name == noActionName || len(funcResults) == 0

					switch {
					case noActionsToRun:
						qResult, qErr := handleQuestion(config, funcResults, cbRawResult, predInput)
						if qErr != nil {
							xlog.Error("error handling question", "error", qErr)
						}
						stopReason := FinishReasonStop
						message := &schema.Message{Role: "assistant", Content: &qResult}
						if cbReasoning != "" {
							message.Reasoning = &cbReasoning
						}
						result = append(result, schema.Choice{
							FinishReason: &stopReason,
							Message:      message,
						})
					default:
						toolCallsReason := FinishReasonToolCalls
						toolChoice := schema.Choice{
							FinishReason: &toolCallsReason,
							Message: &schema.Message{
								Role: "assistant",
							},
						}
						if cbReasoning != "" {
							toolChoice.Message.Reasoning = &cbReasoning
						}
						for _, ss := range funcResults {
							name, args := ss.Name, ss.Arguments
							toolCallID := ss.ID
							if toolCallID == "" {
								toolCallID = id
							}
							if len(input.Tools) > 0 {
								// Modern tools format: accumulate all calls into one choice.
								toolChoice.Message.Content = textContentToReturn
								toolChoice.Message.ToolCalls = append(toolChoice.Message.ToolCalls,
									schema.ToolCall{
										ID:   toolCallID,
										Type: "function",
										FunctionCall: schema.FunctionCall{
											Name:      name,
											Arguments: args,
										},
									},
								)
							} else {
								// Deprecated function_call format
								functionCallReason := FinishReasonFunctionCall
								message := &schema.Message{
									Role:    "assistant",
									Content: &textContentToReturn,
									FunctionCall: map[string]interface{}{
										"name":      name,
										"arguments": args,
									},
								}
								if cbReasoning != "" {
									message.Reasoning = &cbReasoning
								}
								result = append(result, schema.Choice{
									FinishReason: &functionCallReason,
									Message:      message,
								})
							}
						}
						if len(input.Tools) > 0 {
							result = append(result, toolChoice)
						}
					}
				}

				// MCP server-side tool execution loop:
				// If we have MCP tools and the model returned tool_calls, execute MCP tools
				// and re-run inference with the results appended to the conversation.
				if hasMCPTools && len(result) > 0 {
					var mcpCallsExecuted bool
					for _, choice := range result {
						if choice.Message == nil || len(choice.Message.ToolCalls) == 0 {
							continue
						}
						// Check if any tool calls are MCP tools
						var hasMCPCalls bool
						for _, tc := range choice.Message.ToolCalls {
							if mcpTools.IsMCPTool(mcpToolInfos, tc.FunctionCall.Name) {
								hasMCPCalls = true
								break
							}
						}
						if !hasMCPCalls {
							continue
						}
						// Append assistant message with tool_calls to conversation
						assistantContent := ""
						if choice.Message.Content != nil {
							if s, ok := choice.Message.Content.(string); ok {
								assistantContent = s
							} else if sp, ok := choice.Message.Content.(*string); ok && sp != nil {
								assistantContent = *sp
							}
						}
						assistantMsg := schema.Message{
							Role:      "assistant",
							Content:   assistantContent,
							ToolCalls: choice.Message.ToolCalls,
						}
						input.Messages = append(input.Messages, assistantMsg)

						// Execute each MCP tool call and append results
						for _, tc := range choice.Message.ToolCalls {
							if !mcpTools.IsMCPTool(mcpToolInfos, tc.FunctionCall.Name) {
								continue
							}
							xlog.Debug("Executing MCP tool", "tool", tc.FunctionCall.Name, "arguments", tc.FunctionCall.Arguments, "iteration", mcpIteration)
							toolResult, toolErr := mcpTools.ExecuteMCPToolCall(
								c.Request().Context(), mcpToolInfos, tc.FunctionCall.Name, tc.FunctionCall.Arguments,
							)
							if toolErr != nil {
								xlog.Error("MCP tool execution failed", "tool", tc.FunctionCall.Name, "error", toolErr)
								// Feed the error back to the model as the tool result.
								toolResult = fmt.Sprintf("Error: %v", toolErr)
							}
							input.Messages = append(input.Messages, schema.Message{
								Role:          "tool",
								Content:       toolResult,
								StringContent: toolResult,
								ToolCallID:    tc.ID,
								Name:          tc.FunctionCall.Name,
							})
							mcpCallsExecuted = true
						}
					}
					if mcpCallsExecuted {
						xlog.Debug("MCP tools executed, re-running inference", "iteration", mcpIteration, "messages", len(input.Messages))
						continue // next MCP iteration
					}
				}

				// No MCP tools to execute (or no MCP tools configured), return response
				usage := schema.OpenAIUsage{
					PromptTokens:     tokenUsage.Prompt,
					CompletionTokens: tokenUsage.Completion,
					TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
				}
				if extraUsage {
					usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
					usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
				}
				resp := &schema.OpenAIResponse{
					ID:      id,
					Created: created,
					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
					Choices: result,
					Object:  "chat.completion",
					Usage:   usage,
				}
				respData, _ := json.Marshal(resp)
				xlog.Debug("Response", "response", string(respData))

				// Return the prediction in the response body
				return c.JSON(200, resp)
			} // end MCP iteration loop

			// Should not reach here, but safety fallback
			return fmt.Errorf("MCP iteration limit reached")
		}
	}
}

// handleQuestion produces a plain-text reply when no actionable tool call was
// returned. If there are no function results but raw text exists, the text is
// returned as-is; otherwise, if the first function result's arguments contain
// a JSON "message" field, that message is finetuned and returned; failing
// both, an empty reply is returned with no error.
func handleQuestion(config *config.ModelConfig, funcResults []functions.FuncCallResults, result, prompt string) (string, error) {
	if len(funcResults) == 0 && result != "" {
		xlog.Debug("nothing function results but we had a message from the LLM")
		return result, nil
	}

	xlog.Debug("nothing to do, computing a reply")
	arg := ""
	if len(funcResults) > 0 {
		arg = funcResults[0].Arguments
	}
	// If there is a message that the LLM already sends as part of the JSON reply, use it
	arguments := map[string]interface{}{}
	// Best-effort parse: a malformed JSON payload is logged and treated as empty.
	if err := json.Unmarshal([]byte(arg), &arguments); err != nil {
		xlog.Debug("handleQuestion: function result did not contain a valid JSON object")
	}
	m, exists := arguments["message"]
	if exists {
		switch message := m.(type) {
		case string:
			if message != "" {
				xlog.Debug("Reply received from LLM", "message", message)
				message = backend.Finetune(*config, prompt, message)
				xlog.Debug("Reply received from LLM(finetuned)", "message", message)
				return message, nil
			}
		}
	}

	xlog.Debug("No action received from LLM, without a message, computing a reply")
	return "", nil
}