mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-30 03:25:42 -04:00
* chore: ignore local .worktrees directory Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(openai): stream usage non-zero when tools are enabled The streaming chat-completions worker for tool-bearing requests (processTools in core/http/endpoints/openai/chat.go) never forwarded the cumulative TokenUsage from ComputeChoices to the chunks it placed on the responses channel. The outer streaming loop's running usage tracker therefore stayed at the zero value, and the include_usage trailer reported {prompt_tokens:0, completion_tokens:0, total_tokens:0} whenever the request carried a `tools` array. Without tools, the alternative `process` path stamps Usage on every chunk, so that path was unaffected. Forward the final TokenUsage via a usage-only sentinel chunk (empty Choices, populated Usage) emitted right before close(responses). The outer loop's per-chunk Usage capture moves above the empty-Choices skip so the sentinel updates the tracker without ever reaching the wire, keeping the existing OpenAI spec contract (intermediate chunks carry no `usage` field, and the deferred-final-chunk helpers remain Usage-free per the regression test for issue #8546). Adds streamUsageFromTokenUsage, usageSentinelChunk, and applyChunkToUsage helpers with focused Ginkgo coverage plus a flow-level test that mirrors the outer-loop sequence. Fixes #9927 Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:opus-4-7 [Claude Code] * refactor(openai): return final TokenUsage from stream workers Replace the usage-only sentinel SSE chunk introduced in the previous commit with a plain return value. The streaming workers process and processTools (now extracted as package-level processStream and processStreamWithTools) return (backend.TokenUsage, error); the outer ChatEndpoint loop reads the cumulative counts off the existing `ended` channel (now carrying streamWorkerResult{usage, err}) and builds the include_usage trailer from a normal Go value after the LOOP exits. 
This drops the empty-Choices "skip but capture Usage" rule from the outer loop and removes the usageSentinelChunk / applyChunkToUsage helpers entirely. The SSE responses channel is back to a single purpose: wire chunks only. processStream and processStreamWithTools move into chat_stream_workers.go so they can be exercised directly from tests. The chat_stream_usage_test.go suite now drives the workers with a mocked backend.ModelInferenceFunc and asserts on the returned TokenUsage. The regression coverage for issue #9927 is therefore behavioral: reverting the fix (discarding ComputeChoices' usage return) makes the assertions fail with concrete count mismatches. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:opus-4-7 [Claude Code] --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
984 lines
35 KiB
Go
984 lines
35 KiB
Go
package openai
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"net/http"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"github.com/labstack/echo/v4"
|
|
"github.com/mudler/LocalAI/core/backend"
|
|
"github.com/mudler/LocalAI/core/config"
|
|
mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
|
|
"github.com/mudler/LocalAI/core/http/middleware"
|
|
"github.com/mudler/LocalAI/core/schema"
|
|
"github.com/mudler/LocalAI/pkg/functions"
|
|
reason "github.com/mudler/LocalAI/pkg/reasoning"
|
|
|
|
"github.com/mudler/LocalAI/core/templates"
|
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
|
"github.com/mudler/LocalAI/pkg/model"
|
|
|
|
"github.com/mudler/xlog"
|
|
)
|
|
|
|
// hasSystemMessage reports whether the message slice already contains a
|
|
// system-role message — used to avoid clobbering a caller-supplied system
|
|
// prompt when the LocalAI Assistant modality is on.
|
|
func hasSystemMessage(messages []schema.Message) bool {
|
|
for _, m := range messages {
|
|
if m.Role == "system" {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// mergeToolCallDeltas merges streaming tool call deltas into complete tool calls.
|
|
// In SSE streaming, a single tool call arrives as multiple chunks sharing the same Index:
|
|
// the first chunk carries the ID, Type, and Name; subsequent chunks append to Arguments.
|
|
func mergeToolCallDeltas(existing []schema.ToolCall, deltas []schema.ToolCall) []schema.ToolCall {
|
|
byIndex := make(map[int]int, len(existing)) // tool call Index -> position in slice
|
|
for i, tc := range existing {
|
|
byIndex[tc.Index] = i
|
|
}
|
|
for _, d := range deltas {
|
|
pos, found := byIndex[d.Index]
|
|
if !found {
|
|
byIndex[d.Index] = len(existing)
|
|
existing = append(existing, d)
|
|
continue
|
|
}
|
|
// Merge into existing entry
|
|
tc := &existing[pos]
|
|
if d.ID != "" {
|
|
tc.ID = d.ID
|
|
}
|
|
if d.Type != "" {
|
|
tc.Type = d.Type
|
|
}
|
|
if d.FunctionCall.Name != "" {
|
|
tc.FunctionCall.Name = d.FunctionCall.Name
|
|
}
|
|
tc.FunctionCall.Arguments += d.FunctionCall.Arguments
|
|
}
|
|
return existing
|
|
}
|
|
|
|
// ChatEndpoint is the OpenAI Completion API endpoint https://platform.openai.com/docs/api-reference/chat/create
|
|
// @Summary Generate a chat completions for a given prompt and model.
|
|
// @Tags inference
|
|
// @Param request body schema.OpenAIRequest true "query params"
|
|
// @Success 200 {object} schema.OpenAIResponse "Response"
|
|
// @Router /v1/chat/completions [post]
|
|
func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, startupOptions *config.ApplicationConfig, natsClient mcpTools.MCPNATSClient, assistantHolder *mcpTools.LocalAIAssistantHolder) echo.HandlerFunc {
|
|
return func(c echo.Context) error {
|
|
var textContentToReturn string
|
|
id := uuid.New().String()
|
|
created := int(time.Now().Unix())
|
|
|
|
input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
|
|
if !ok || input.Model == "" {
|
|
return echo.ErrBadRequest
|
|
}
|
|
|
|
extraUsage := c.Request().Header.Get("Extra-Usage") != ""
|
|
|
|
config, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
|
|
if !ok || config == nil {
|
|
return echo.ErrBadRequest
|
|
}
|
|
|
|
xlog.Debug("Chat endpoint configuration read", "config", config)
|
|
|
|
funcs := input.Functions
|
|
shouldUseFn := len(input.Functions) > 0 && config.ShouldUseFunctions()
|
|
strictMode := false
|
|
|
|
// MCP tool injection: when mcp_servers is set in metadata and model has MCP config
|
|
var mcpExecutor mcpTools.ToolExecutor
|
|
mcpServers := mcpTools.MCPServersFromMetadata(input.Metadata)
|
|
|
|
// LocalAI Assistant modality: an admin opted into the in-process MCP
|
|
// admin tool surface. Runs *before* the regular MCP block — when both
|
|
// are set, the assistant tools win (the admin cannot mix them with
|
|
// per-model MCP servers in the same chat session by design).
|
|
assistantMode := mcpTools.LocalAIAssistantFromMetadata(input.Metadata)
|
|
if assistantMode {
|
|
if err := requireAssistantAccess(c, startupOptions.Auth.Enabled); err != nil {
|
|
return err
|
|
}
|
|
// Read the disable flag live: an admin can flip it via /api/settings
|
|
// and the next request must see the change without a restart.
|
|
if startupOptions.DisableLocalAIAssistant {
|
|
return echo.NewHTTPError(http.StatusServiceUnavailable, "LocalAI Assistant is disabled on this server")
|
|
}
|
|
if assistantHolder == nil || !assistantHolder.HasTools() {
|
|
return echo.NewHTTPError(http.StatusServiceUnavailable, "LocalAI Assistant is not available on this server")
|
|
}
|
|
mcpExecutor = assistantHolder.Executor()
|
|
mcpFuncs, discErr := mcpExecutor.DiscoverTools(c.Request().Context())
|
|
if discErr != nil {
|
|
xlog.Error("Failed to discover LocalAI Assistant tools", "error", discErr)
|
|
return echo.NewHTTPError(http.StatusInternalServerError, "discover assistant tools: "+discErr.Error())
|
|
}
|
|
for _, fn := range mcpFuncs {
|
|
funcs = append(funcs, fn)
|
|
input.Tools = append(input.Tools, functions.Tool{Type: "function", Function: fn})
|
|
}
|
|
shouldUseFn = len(funcs) > 0 && config.ShouldUseFunctions()
|
|
|
|
// Prepend the embedded system prompt unless the caller supplied
|
|
// their own system message. Why: the prompt is what teaches the
|
|
// model the safety rules and recipes. If a caller already has a
|
|
// system message they're responsible for keeping the assistant
|
|
// safe, so we leave it alone.
|
|
if !hasSystemMessage(input.Messages) {
|
|
input.Messages = append([]schema.Message{{Role: "system", StringContent: assistantHolder.SystemPrompt()}}, input.Messages...)
|
|
}
|
|
|
|
xlog.Debug("LocalAI Assistant tools injected", "count", len(mcpFuncs))
|
|
}
|
|
|
|
// MCP prompt and resource injection (extracted before tool injection)
|
|
mcpPromptName, mcpPromptArgs := mcpTools.MCPPromptFromMetadata(input.Metadata)
|
|
mcpResourceURIs := mcpTools.MCPResourcesFromMetadata(input.Metadata)
|
|
|
|
if (len(mcpServers) > 0 || mcpPromptName != "" || len(mcpResourceURIs) > 0) && (config.MCP.Servers != "" || config.MCP.Stdio != "") {
|
|
remote, stdio, mcpErr := config.MCP.MCPConfigFromYAML()
|
|
if mcpErr == nil {
|
|
mcpExecutor = mcpTools.NewToolExecutor(c.Request().Context(), natsClient, config.Name, remote, stdio, mcpServers)
|
|
|
|
// Prompt and resource injection (pre-processing step — resolves locally regardless of distributed mode)
|
|
namedSessions, sessErr := mcpTools.NamedSessionsFromMCPConfig(config.Name, remote, stdio, mcpServers)
|
|
if sessErr == nil && len(namedSessions) > 0 {
|
|
mcpCtx, _ := mcpTools.InjectMCPContext(c.Request().Context(), namedSessions, mcpPromptName, mcpPromptArgs, mcpResourceURIs)
|
|
if mcpCtx != nil {
|
|
input.Messages = append(mcpCtx.PromptMessages, input.Messages...)
|
|
mcpTools.AppendResourceSuffix(input.Messages, mcpCtx.ResourceSuffix)
|
|
}
|
|
}
|
|
|
|
// Tool injection via executor
|
|
if mcpExecutor.HasTools() {
|
|
mcpFuncs, discErr := mcpExecutor.DiscoverTools(c.Request().Context())
|
|
if discErr == nil {
|
|
for _, fn := range mcpFuncs {
|
|
funcs = append(funcs, fn)
|
|
input.Tools = append(input.Tools, functions.Tool{Type: "function", Function: fn})
|
|
}
|
|
shouldUseFn = len(funcs) > 0 && config.ShouldUseFunctions()
|
|
xlog.Debug("MCP tools injected", "count", len(mcpFuncs), "total_funcs", len(funcs))
|
|
} else {
|
|
xlog.Error("Failed to discover MCP tools", "error", discErr)
|
|
}
|
|
}
|
|
} else {
|
|
xlog.Error("Failed to parse MCP config", "error", mcpErr)
|
|
}
|
|
}
|
|
|
|
xlog.Debug("Tool call routing decision",
|
|
"shouldUseFn", shouldUseFn,
|
|
"len(input.Functions)", len(input.Functions),
|
|
"len(input.Tools)", len(input.Tools),
|
|
"config.ShouldUseFunctions()", config.ShouldUseFunctions(),
|
|
"config.FunctionToCall()", config.FunctionToCall(),
|
|
)
|
|
|
|
for _, f := range input.Functions {
|
|
if f.Strict {
|
|
strictMode = true
|
|
break
|
|
}
|
|
}
|
|
|
|
// Allow the user to set custom actions via config file
|
|
// to be "embedded" in each model
|
|
noActionName := "answer"
|
|
noActionDescription := "use this action to answer without performing any action"
|
|
|
|
if config.FunctionsConfig.NoActionFunctionName != "" {
|
|
noActionName = config.FunctionsConfig.NoActionFunctionName
|
|
}
|
|
if config.FunctionsConfig.NoActionDescriptionName != "" {
|
|
noActionDescription = config.FunctionsConfig.NoActionDescriptionName
|
|
}
|
|
|
|
// If we are using a response format, we need to generate a grammar for it
|
|
if config.ResponseFormatMap != nil {
|
|
d := schema.ChatCompletionResponseFormat{}
|
|
dat, err := json.Marshal(config.ResponseFormatMap)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = json.Unmarshal(dat, &d)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
switch d.Type {
|
|
case "json_object":
|
|
input.Grammar = functions.JSONBNF
|
|
case "json_schema":
|
|
d := schema.JsonSchemaRequest{}
|
|
dat, err := json.Marshal(config.ResponseFormatMap)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = json.Unmarshal(dat, &d)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
fs := &functions.JSONFunctionStructure{
|
|
AnyOf: []functions.Item{d.JsonSchema.Schema},
|
|
}
|
|
g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...)
|
|
if err == nil {
|
|
input.Grammar = g
|
|
} else {
|
|
xlog.Error("Failed generating grammar", "error", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
config.Grammar = input.Grammar
|
|
|
|
if shouldUseFn {
|
|
xlog.Debug("Response needs to process functions")
|
|
}
|
|
|
|
switch {
|
|
// Generates grammar with internal's LocalAI engine
|
|
case (!config.FunctionsConfig.GrammarConfig.NoGrammar || strictMode) && shouldUseFn:
|
|
noActionGrammar := functions.Function{
|
|
Name: noActionName,
|
|
Description: noActionDescription,
|
|
Parameters: map[string]any{
|
|
"properties": map[string]any{
|
|
"message": map[string]any{
|
|
"type": "string",
|
|
"description": "The message to reply the user with",
|
|
}},
|
|
},
|
|
}
|
|
|
|
// Append the no action function
|
|
if !config.FunctionsConfig.DisableNoAction && !strictMode {
|
|
funcs = append(funcs, noActionGrammar)
|
|
}
|
|
|
|
// Force picking one of the functions by the request
|
|
if config.FunctionToCall() != "" {
|
|
funcs = funcs.Select(config.FunctionToCall())
|
|
}
|
|
|
|
// Update input grammar or json_schema based on use_llama_grammar option
|
|
jsStruct := funcs.ToJSONStructure(config.FunctionsConfig.FunctionNameKey, config.FunctionsConfig.FunctionNameKey)
|
|
g, err := jsStruct.Grammar(config.FunctionsConfig.GrammarOptions()...)
|
|
if err == nil {
|
|
config.Grammar = g
|
|
} else {
|
|
xlog.Error("Failed generating grammar", "error", err)
|
|
}
|
|
case input.JSONFunctionGrammarObject != nil:
|
|
g, err := input.JSONFunctionGrammarObject.Grammar(config.FunctionsConfig.GrammarOptions()...)
|
|
if err == nil {
|
|
config.Grammar = g
|
|
} else {
|
|
xlog.Error("Failed generating grammar", "error", err)
|
|
}
|
|
|
|
default:
|
|
// Force picking one of the functions by the request
|
|
if config.FunctionToCall() != "" {
|
|
funcs = funcs.Select(config.FunctionToCall())
|
|
}
|
|
}
|
|
|
|
// process functions if we have any defined or if we have a function call string
|
|
|
|
// functions are not supported in stream mode (yet?)
|
|
toStream := input.Stream
|
|
|
|
xlog.Debug("Parameters", "config", config)
|
|
|
|
var predInput string
|
|
|
|
// If we are using the tokenizer template, we don't need to process the messages
|
|
// unless we are processing functions
|
|
if !config.TemplateConfig.UseTokenizerTemplate {
|
|
predInput = evaluator.TemplateMessages(*input, input.Messages, config, funcs, shouldUseFn)
|
|
|
|
xlog.Debug("Prompt (after templating)", "prompt", predInput)
|
|
if config.Grammar != "" {
|
|
xlog.Debug("Grammar", "grammar", config.Grammar)
|
|
}
|
|
}
|
|
|
|
switch {
|
|
case toStream:
|
|
|
|
xlog.Debug("Stream request received")
|
|
c.Response().Header().Set("Content-Type", "text/event-stream")
|
|
c.Response().Header().Set("Cache-Control", "no-cache")
|
|
c.Response().Header().Set("Connection", "keep-alive")
|
|
c.Response().Header().Set("X-Correlation-ID", id)
|
|
|
|
mcpStreamMaxIterations := 10
|
|
if config.Agent.MaxIterations > 0 {
|
|
mcpStreamMaxIterations = config.Agent.MaxIterations
|
|
}
|
|
hasMCPToolsStream := mcpExecutor != nil && mcpExecutor.HasTools()
|
|
|
|
for mcpStreamIter := 0; mcpStreamIter <= mcpStreamMaxIterations; mcpStreamIter++ {
|
|
// Re-template on MCP iterations
|
|
if mcpStreamIter > 0 && !config.TemplateConfig.UseTokenizerTemplate {
|
|
predInput = evaluator.TemplateMessages(*input, input.Messages, config, funcs, shouldUseFn)
|
|
xlog.Debug("MCP stream re-templating", "iteration", mcpStreamIter)
|
|
}
|
|
|
|
responses := make(chan schema.OpenAIResponse)
|
|
ended := make(chan streamWorkerResult, 1)
|
|
|
|
go func() {
|
|
if !shouldUseFn {
|
|
u, err := processStream(predInput, input, config, cl, startupOptions, ml, responses, id, created)
|
|
ended <- streamWorkerResult{usage: u, err: err}
|
|
} else {
|
|
u, err := processStreamWithTools(noActionName, predInput, input, config, cl, startupOptions, ml, responses, id, created, &textContentToReturn)
|
|
ended <- streamWorkerResult{usage: u, err: err}
|
|
}
|
|
}()
|
|
|
|
var finalUsage backend.TokenUsage
|
|
toolsCalled := false
|
|
var collectedToolCalls []schema.ToolCall
|
|
var collectedContent string
|
|
|
|
LOOP:
|
|
for {
|
|
select {
|
|
case <-input.Context.Done():
|
|
// Context was cancelled (client disconnected or request cancelled)
|
|
xlog.Debug("Request context cancelled, stopping stream")
|
|
input.Cancel()
|
|
break LOOP
|
|
case ev := <-responses:
|
|
if len(ev.Choices) == 0 {
|
|
xlog.Debug("No choices in the response, skipping")
|
|
continue
|
|
}
|
|
if len(ev.Choices[0].Delta.ToolCalls) > 0 {
|
|
toolsCalled = true
|
|
// Collect and merge tool call deltas for MCP execution
|
|
if hasMCPToolsStream {
|
|
collectedToolCalls = mergeToolCallDeltas(collectedToolCalls, ev.Choices[0].Delta.ToolCalls)
|
|
}
|
|
}
|
|
// Collect content for MCP conversation history and automatic tool parsing fallback
|
|
if (hasMCPToolsStream || config.FunctionsConfig.AutomaticToolParsingFallback) && ev.Choices[0].Delta != nil && ev.Choices[0].Delta.Content != nil {
|
|
if s, ok := ev.Choices[0].Delta.Content.(string); ok {
|
|
collectedContent += s
|
|
} else if sp, ok := ev.Choices[0].Delta.Content.(*string); ok && sp != nil {
|
|
collectedContent += *sp
|
|
}
|
|
}
|
|
respData, err := json.Marshal(ev)
|
|
if err != nil {
|
|
xlog.Debug("Failed to marshal response", "error", err)
|
|
input.Cancel()
|
|
continue
|
|
}
|
|
xlog.Debug("Sending chunk", "chunk", string(respData))
|
|
_, err = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", string(respData))
|
|
if err != nil {
|
|
xlog.Debug("Sending chunk failed", "error", err)
|
|
input.Cancel()
|
|
return err
|
|
}
|
|
c.Response().Flush()
|
|
case res := <-ended:
|
|
if res.err == nil {
|
|
finalUsage = res.usage
|
|
break LOOP
|
|
}
|
|
xlog.Error("Stream ended with error", "error", res.err)
|
|
|
|
errorResp := schema.ErrorResponse{
|
|
Error: &schema.APIError{
|
|
Message: res.err.Error(),
|
|
Type: "server_error",
|
|
Code: "server_error",
|
|
},
|
|
}
|
|
respData, marshalErr := json.Marshal(errorResp)
|
|
if marshalErr != nil {
|
|
xlog.Error("Failed to marshal error response", "error", marshalErr)
|
|
fmt.Fprintf(c.Response().Writer, "data: {\"error\":{\"message\":\"Internal error\",\"type\":\"server_error\"}}\n\n")
|
|
} else {
|
|
fmt.Fprintf(c.Response().Writer, "data: %s\n\n", respData)
|
|
}
|
|
fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n")
|
|
c.Response().Flush()
|
|
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// Drain responses channel to unblock the background goroutine if it's
|
|
// still trying to send (e.g., after client disconnect). The goroutine
|
|
// calls close(responses) when done, which terminates the drain.
|
|
if input.Context.Err() != nil {
|
|
go func() {
|
|
for range responses {
|
|
}
|
|
}()
|
|
<-ended
|
|
}
|
|
|
|
// MCP streaming tool execution: if we collected MCP tool calls, execute and loop
|
|
if hasMCPToolsStream && toolsCalled && len(collectedToolCalls) > 0 {
|
|
var hasMCPCalls bool
|
|
for _, tc := range collectedToolCalls {
|
|
if mcpExecutor != nil && mcpExecutor.IsTool(tc.FunctionCall.Name) {
|
|
hasMCPCalls = true
|
|
break
|
|
}
|
|
}
|
|
if hasMCPCalls {
|
|
// Append assistant message with tool_calls
|
|
assistantMsg := schema.Message{
|
|
Role: "assistant",
|
|
Content: collectedContent,
|
|
ToolCalls: collectedToolCalls,
|
|
}
|
|
input.Messages = append(input.Messages, assistantMsg)
|
|
|
|
// Execute MCP tool calls and stream results as tool_result events
|
|
for _, tc := range collectedToolCalls {
|
|
if mcpExecutor == nil || !mcpExecutor.IsTool(tc.FunctionCall.Name) {
|
|
continue
|
|
}
|
|
xlog.Debug("Executing MCP tool (stream)", "tool", tc.FunctionCall.Name, "iteration", mcpStreamIter)
|
|
toolResult, toolErr := mcpExecutor.ExecuteTool(c.Request().Context(), tc.FunctionCall.Name, tc.FunctionCall.Arguments)
|
|
if toolErr != nil {
|
|
xlog.Error("MCP tool execution failed", "tool", tc.FunctionCall.Name, "error", toolErr)
|
|
toolResult = fmt.Sprintf("Error: %v", toolErr)
|
|
}
|
|
input.Messages = append(input.Messages, schema.Message{
|
|
Role: "tool",
|
|
Content: toolResult,
|
|
StringContent: toolResult,
|
|
ToolCallID: tc.ID,
|
|
Name: tc.FunctionCall.Name,
|
|
})
|
|
|
|
// Stream tool result event to client
|
|
mcpEvent := map[string]any{
|
|
"type": "mcp_tool_result",
|
|
"name": tc.FunctionCall.Name,
|
|
"result": toolResult,
|
|
}
|
|
if mcpEventData, err := json.Marshal(mcpEvent); err == nil {
|
|
fmt.Fprintf(c.Response().Writer, "data: %s\n\n", mcpEventData)
|
|
c.Response().Flush()
|
|
}
|
|
}
|
|
|
|
xlog.Debug("MCP streaming tools executed, re-running inference", "iteration", mcpStreamIter)
|
|
continue // next MCP stream iteration
|
|
}
|
|
}
|
|
|
|
// Automatic tool parsing fallback for streaming: when no tools were
|
|
// requested but the model emitted tool call markup, parse and emit them.
|
|
if !shouldUseFn && config.FunctionsConfig.AutomaticToolParsingFallback && collectedContent != "" && !toolsCalled {
|
|
parsed := functions.ParseFunctionCall(collectedContent, config.FunctionsConfig)
|
|
for i, fc := range parsed {
|
|
toolCallID := fc.ID
|
|
if toolCallID == "" {
|
|
toolCallID = id
|
|
}
|
|
toolCallMsg := schema.OpenAIResponse{
|
|
ID: id,
|
|
Created: created,
|
|
Model: input.Model,
|
|
Choices: []schema.Choice{{
|
|
Delta: &schema.Message{
|
|
Role: "assistant",
|
|
ToolCalls: []schema.ToolCall{{
|
|
Index: i,
|
|
ID: toolCallID,
|
|
Type: "function",
|
|
FunctionCall: schema.FunctionCall{
|
|
Name: fc.Name,
|
|
Arguments: fc.Arguments,
|
|
},
|
|
}},
|
|
},
|
|
Index: 0,
|
|
}},
|
|
Object: "chat.completion.chunk",
|
|
}
|
|
respData, _ := json.Marshal(toolCallMsg)
|
|
fmt.Fprintf(c.Response().Writer, "data: %s\n\n", respData)
|
|
c.Response().Flush()
|
|
toolsCalled = true
|
|
}
|
|
}
|
|
|
|
// No MCP tools to execute, send final stop message
|
|
finishReason := FinishReasonStop
|
|
if toolsCalled && len(input.Tools) > 0 {
|
|
finishReason = FinishReasonToolCalls
|
|
} else if toolsCalled {
|
|
finishReason = FinishReasonFunctionCall
|
|
}
|
|
|
|
// Final delta chunk: empty delta with finish_reason set. Per
|
|
// OpenAI streaming spec this chunk does NOT carry usage —
|
|
// the optional trailer (below) does, gated on include_usage.
|
|
resp := &schema.OpenAIResponse{
|
|
ID: id,
|
|
Created: created,
|
|
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
|
Choices: []schema.Choice{
|
|
{
|
|
FinishReason: &finishReason,
|
|
Index: 0,
|
|
Delta: &schema.Message{},
|
|
}},
|
|
Object: "chat.completion.chunk",
|
|
}
|
|
respData, _ := json.Marshal(resp)
|
|
fmt.Fprintf(c.Response().Writer, "data: %s\n\n", respData)
|
|
|
|
// Trailing usage chunk per OpenAI spec: emit only when the
|
|
// caller opted in via stream_options.include_usage. Shape:
|
|
// {"choices":[],"usage":{...},"object":"chat.completion.chunk",...}
|
|
//
|
|
// finalUsage is the authoritative TokenUsage returned by the
|
|
// worker function (processStream / processStreamWithTools) via the `ended`
|
|
// channel. The worker reads it from ComputeChoices' return
|
|
// value, which is the cumulative count produced by the backend
|
|
// over the whole prediction. Issue #9927 was caused by the
|
|
// tools-path worker not surfacing this value at all.
|
|
if input.StreamOptions != nil && input.StreamOptions.IncludeUsage {
|
|
trailerUsage := streamUsageFromTokenUsage(finalUsage, extraUsage)
|
|
trailer := streamUsageTrailerJSON(id, input.Model, created, trailerUsage)
|
|
_, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", trailer)
|
|
}
|
|
|
|
fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n")
|
|
c.Response().Flush()
|
|
xlog.Debug("Stream ended")
|
|
return nil
|
|
} // end MCP stream iteration loop
|
|
|
|
// Safety fallback
|
|
fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n")
|
|
c.Response().Flush()
|
|
return nil
|
|
|
|
// no streaming mode
|
|
default:
|
|
mcpMaxIterations := 10
|
|
if config.Agent.MaxIterations > 0 {
|
|
mcpMaxIterations = config.Agent.MaxIterations
|
|
}
|
|
hasMCPTools := mcpExecutor != nil && mcpExecutor.HasTools()
|
|
|
|
for mcpIteration := 0; mcpIteration <= mcpMaxIterations; mcpIteration++ {
|
|
// Re-template on each MCP iteration since messages may have changed
|
|
if mcpIteration > 0 && !config.TemplateConfig.UseTokenizerTemplate {
|
|
predInput = evaluator.TemplateMessages(*input, input.Messages, config, funcs, shouldUseFn)
|
|
xlog.Debug("MCP re-templating", "iteration", mcpIteration, "prompt_len", len(predInput))
|
|
}
|
|
|
|
// Detect if thinking token is already in prompt or template
|
|
var template string
|
|
if config.TemplateConfig.UseTokenizerTemplate {
|
|
template = config.GetModelTemplate() // TODO: this should be the parsed jinja template. But for now this is the best we can do.
|
|
} else {
|
|
template = predInput
|
|
}
|
|
thinkingStartToken := reason.DetectThinkingStartToken(template, &config.ReasoningConfig)
|
|
|
|
xlog.Debug("Thinking start token", "thinkingStartToken", thinkingStartToken, "template", template)
|
|
|
|
// When shouldUseFn, the callback just stores the raw text — tool parsing
|
|
// is deferred to after ComputeChoices so we can check chat deltas first
|
|
// and avoid redundant Go-side parsing.
|
|
var cbRawResult, cbReasoning string
|
|
|
|
tokenCallback := func(s string, c *[]schema.Choice) {
|
|
reasoning, s := reason.ExtractReasoningWithConfig(s, thinkingStartToken, config.ReasoningConfig)
|
|
|
|
if !shouldUseFn {
|
|
stopReason := FinishReasonStop
|
|
message := &schema.Message{Role: "assistant", Content: &s}
|
|
if reasoning != "" {
|
|
message.Reasoning = &reasoning
|
|
}
|
|
*c = append(*c, schema.Choice{FinishReason: &stopReason, Index: 0, Message: message})
|
|
return
|
|
}
|
|
|
|
// Store raw text for deferred tool parsing
|
|
cbRawResult = s
|
|
cbReasoning = reasoning
|
|
}
|
|
|
|
var result []schema.Choice
|
|
var tokenUsage backend.TokenUsage
|
|
var err error
|
|
|
|
var chatDeltas []*pb.ChatDelta
|
|
result, tokenUsage, chatDeltas, err = ComputeChoices(
|
|
input,
|
|
predInput,
|
|
config,
|
|
cl,
|
|
startupOptions,
|
|
ml,
|
|
tokenCallback,
|
|
nil,
|
|
func(attempt int) bool {
|
|
if !shouldUseFn {
|
|
return false
|
|
}
|
|
// Retry when backend produced only reasoning and no content/tool calls.
|
|
// Full tool parsing is deferred until after ComputeChoices returns
|
|
// (when chat deltas are available), but we can detect the empty case here.
|
|
if cbRawResult == "" && textContentToReturn == "" {
|
|
xlog.Warn("Backend produced reasoning without actionable content, retrying",
|
|
"reasoning_len", len(cbReasoning), "attempt", attempt+1)
|
|
cbRawResult = ""
|
|
cbReasoning = ""
|
|
textContentToReturn = ""
|
|
return true
|
|
}
|
|
return false
|
|
},
|
|
)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// For non-tool requests: prefer C++ autoparser chat deltas over
|
|
// Go-side tag extraction (which can mangle output when thinkingStartToken
|
|
// differs from the model's actual reasoning tags, e.g. Gemma 4).
|
|
if !shouldUseFn && len(chatDeltas) > 0 {
|
|
deltaContent := functions.ContentFromChatDeltas(chatDeltas)
|
|
deltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas)
|
|
if deltaContent != "" || deltaReasoning != "" {
|
|
xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
|
|
"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
|
|
stopReason := FinishReasonStop
|
|
message := &schema.Message{Role: "assistant", Content: &deltaContent}
|
|
if deltaReasoning != "" {
|
|
message.Reasoning = &deltaReasoning
|
|
}
|
|
newChoice := schema.Choice{FinishReason: &stopReason, Index: 0, Message: message}
|
|
// Preserve logprobs from the original result
|
|
if len(result) > 0 && result[0].Logprobs != nil {
|
|
newChoice.Logprobs = result[0].Logprobs
|
|
}
|
|
result = []schema.Choice{newChoice}
|
|
}
|
|
}
|
|
|
|
// Tool parsing is deferred here (only when shouldUseFn) so chat deltas are available
|
|
if shouldUseFn {
|
|
var funcResults []functions.FuncCallResults
|
|
|
|
// Try pre-parsed tool calls from C++ autoparser first
|
|
if deltaToolCalls := functions.ToolCallsFromChatDeltas(chatDeltas); len(deltaToolCalls) > 0 {
|
|
xlog.Debug("[ChatDeltas] non-SSE: using C++ autoparser tool calls, skipping Go-side parsing", "count", len(deltaToolCalls))
|
|
funcResults = deltaToolCalls
|
|
textContentToReturn = functions.ContentFromChatDeltas(chatDeltas)
|
|
cbReasoning = functions.ReasoningFromChatDeltas(chatDeltas)
|
|
} else if deltaContent := functions.ContentFromChatDeltas(chatDeltas); len(chatDeltas) > 0 && deltaContent != "" {
|
|
// ChatDeltas have content but no tool calls — model answered without using tools.
|
|
// This happens with thinking models (e.g. Gemma 4) where the Go-side reasoning
|
|
// extraction misclassifies clean content as reasoning, leaving cbRawResult empty.
|
|
xlog.Debug("[ChatDeltas] non-SSE: using C++ autoparser content (no tool calls)", "content_len", len(deltaContent))
|
|
textContentToReturn = deltaContent
|
|
cbReasoning = functions.ReasoningFromChatDeltas(chatDeltas)
|
|
} else {
|
|
// Fallback: parse tool calls from raw text
|
|
xlog.Debug("[ChatDeltas] non-SSE: no chat deltas, falling back to Go-side text parsing")
|
|
textContentToReturn = functions.ParseTextContent(cbRawResult, config.FunctionsConfig)
|
|
cbRawResult = functions.CleanupLLMResult(cbRawResult, config.FunctionsConfig)
|
|
funcResults = functions.ParseFunctionCall(cbRawResult, config.FunctionsConfig)
|
|
}
|
|
|
|
// Content-based tool call fallback: if no tool calls were found,
|
|
// try parsing the raw result — ParseFunctionCall handles detection internally.
|
|
if len(funcResults) == 0 {
|
|
contentFuncResults := functions.ParseFunctionCall(cbRawResult, config.FunctionsConfig)
|
|
if len(contentFuncResults) > 0 {
|
|
funcResults = contentFuncResults
|
|
textContentToReturn = functions.StripToolCallMarkup(cbRawResult)
|
|
}
|
|
}
|
|
|
|
noActionsToRun := len(funcResults) > 0 && funcResults[0].Name == noActionName || len(funcResults) == 0
|
|
|
|
switch {
|
|
case noActionsToRun:
|
|
// Use textContentToReturn if available (e.g. from ChatDeltas),
|
|
// otherwise fall back to cbRawResult for legacy Go-side parsing.
|
|
questionInput := cbRawResult
|
|
if textContentToReturn != "" {
|
|
questionInput = textContentToReturn
|
|
}
|
|
qResult, qErr := handleQuestion(config, funcResults, questionInput, predInput)
|
|
if qErr != nil {
|
|
xlog.Error("error handling question", "error", qErr)
|
|
}
|
|
|
|
stopReason := FinishReasonStop
|
|
message := &schema.Message{Role: "assistant", Content: &qResult}
|
|
if cbReasoning != "" {
|
|
message.Reasoning = &cbReasoning
|
|
}
|
|
result = append(result, schema.Choice{
|
|
FinishReason: &stopReason,
|
|
Message: message,
|
|
})
|
|
default:
|
|
toolCallsReason := FinishReasonToolCalls
|
|
toolChoice := schema.Choice{
|
|
FinishReason: &toolCallsReason,
|
|
Message: &schema.Message{
|
|
Role: "assistant",
|
|
},
|
|
}
|
|
if cbReasoning != "" {
|
|
toolChoice.Message.Reasoning = &cbReasoning
|
|
}
|
|
|
|
for _, ss := range funcResults {
|
|
name, args := ss.Name, ss.Arguments
|
|
toolCallID := ss.ID
|
|
if toolCallID == "" {
|
|
toolCallID = id
|
|
}
|
|
if len(input.Tools) > 0 {
|
|
toolChoice.Message.Content = textContentToReturn
|
|
toolChoice.Message.ToolCalls = append(toolChoice.Message.ToolCalls,
|
|
schema.ToolCall{
|
|
ID: toolCallID,
|
|
Type: "function",
|
|
FunctionCall: schema.FunctionCall{
|
|
Name: name,
|
|
Arguments: args,
|
|
},
|
|
},
|
|
)
|
|
} else {
|
|
// Deprecated function_call format
|
|
functionCallReason := FinishReasonFunctionCall
|
|
message := &schema.Message{
|
|
Role: "assistant",
|
|
Content: &textContentToReturn,
|
|
FunctionCall: map[string]any{
|
|
"name": name,
|
|
"arguments": args,
|
|
},
|
|
}
|
|
if cbReasoning != "" {
|
|
message.Reasoning = &cbReasoning
|
|
}
|
|
result = append(result, schema.Choice{
|
|
FinishReason: &functionCallReason,
|
|
Message: message,
|
|
})
|
|
}
|
|
}
|
|
|
|
if len(input.Tools) > 0 {
|
|
result = append(result, toolChoice)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Automatic tool parsing fallback: when no tools/functions were in the
|
|
// request but the model emitted tool call markup, parse and surface them.
|
|
if !shouldUseFn && config.FunctionsConfig.AutomaticToolParsingFallback && len(result) > 0 {
|
|
for i, choice := range result {
|
|
if choice.Message == nil || choice.Message.Content == nil {
|
|
continue
|
|
}
|
|
contentStr, ok := choice.Message.Content.(string)
|
|
if !ok || contentStr == "" {
|
|
continue
|
|
}
|
|
parsed := functions.ParseFunctionCall(contentStr, config.FunctionsConfig)
|
|
if len(parsed) == 0 {
|
|
continue
|
|
}
|
|
stripped := functions.StripToolCallMarkup(contentStr)
|
|
toolCallsReason := FinishReasonToolCalls
|
|
result[i].FinishReason = &toolCallsReason
|
|
if stripped != "" {
|
|
result[i].Message.Content = &stripped
|
|
} else {
|
|
result[i].Message.Content = nil
|
|
}
|
|
for _, fc := range parsed {
|
|
toolCallID := fc.ID
|
|
if toolCallID == "" {
|
|
toolCallID = id
|
|
}
|
|
result[i].Message.ToolCalls = append(result[i].Message.ToolCalls,
|
|
schema.ToolCall{
|
|
ID: toolCallID,
|
|
Type: "function",
|
|
FunctionCall: schema.FunctionCall{
|
|
Name: fc.Name,
|
|
Arguments: fc.Arguments,
|
|
},
|
|
},
|
|
)
|
|
}
|
|
}
|
|
}
|
|
|
|
// MCP server-side tool execution loop:
|
|
// If we have MCP tools and the model returned tool_calls, execute MCP tools
|
|
// and re-run inference with the results appended to the conversation.
|
|
if hasMCPTools && len(result) > 0 {
|
|
var mcpCallsExecuted bool
|
|
for _, choice := range result {
|
|
if choice.Message == nil || len(choice.Message.ToolCalls) == 0 {
|
|
continue
|
|
}
|
|
// Check if any tool calls are MCP tools
|
|
var hasMCPCalls bool
|
|
for _, tc := range choice.Message.ToolCalls {
|
|
if mcpExecutor != nil && mcpExecutor.IsTool(tc.FunctionCall.Name) {
|
|
hasMCPCalls = true
|
|
break
|
|
}
|
|
}
|
|
if !hasMCPCalls {
|
|
continue
|
|
}
|
|
|
|
// Append assistant message with tool_calls to conversation
|
|
assistantContent := ""
|
|
if choice.Message.Content != nil {
|
|
if s, ok := choice.Message.Content.(string); ok {
|
|
assistantContent = s
|
|
} else if sp, ok := choice.Message.Content.(*string); ok && sp != nil {
|
|
assistantContent = *sp
|
|
}
|
|
}
|
|
assistantMsg := schema.Message{
|
|
Role: "assistant",
|
|
Content: assistantContent,
|
|
ToolCalls: choice.Message.ToolCalls,
|
|
}
|
|
input.Messages = append(input.Messages, assistantMsg)
|
|
|
|
// Execute each MCP tool call and append results
|
|
for _, tc := range choice.Message.ToolCalls {
|
|
if mcpExecutor == nil || !mcpExecutor.IsTool(tc.FunctionCall.Name) {
|
|
continue
|
|
}
|
|
xlog.Debug("Executing MCP tool", "tool", tc.FunctionCall.Name, "arguments", tc.FunctionCall.Arguments, "iteration", mcpIteration)
|
|
toolResult, toolErr := mcpExecutor.ExecuteTool(c.Request().Context(), tc.FunctionCall.Name, tc.FunctionCall.Arguments)
|
|
if toolErr != nil {
|
|
xlog.Error("MCP tool execution failed", "tool", tc.FunctionCall.Name, "error", toolErr)
|
|
toolResult = fmt.Sprintf("Error: %v", toolErr)
|
|
}
|
|
input.Messages = append(input.Messages, schema.Message{
|
|
Role: "tool",
|
|
Content: toolResult,
|
|
StringContent: toolResult,
|
|
ToolCallID: tc.ID,
|
|
Name: tc.FunctionCall.Name,
|
|
})
|
|
mcpCallsExecuted = true
|
|
}
|
|
}
|
|
|
|
if mcpCallsExecuted {
|
|
xlog.Debug("MCP tools executed, re-running inference", "iteration", mcpIteration, "messages", len(input.Messages))
|
|
continue // next MCP iteration
|
|
}
|
|
}
|
|
|
|
// No MCP tools to execute (or no MCP tools configured), return response
|
|
usage := schema.OpenAIUsage{
|
|
PromptTokens: tokenUsage.Prompt,
|
|
CompletionTokens: tokenUsage.Completion,
|
|
TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
|
|
}
|
|
if extraUsage {
|
|
usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
|
|
usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
|
|
}
|
|
|
|
resp := &schema.OpenAIResponse{
|
|
ID: id,
|
|
Created: created,
|
|
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
|
Choices: result,
|
|
Object: "chat.completion",
|
|
Usage: &usage,
|
|
}
|
|
respData, _ := json.Marshal(resp)
|
|
xlog.Debug("Response", "response", string(respData))
|
|
|
|
// Return the prediction in the response body
|
|
return c.JSON(200, resp)
|
|
} // end MCP iteration loop
|
|
|
|
// Should not reach here, but safety fallback
|
|
return fmt.Errorf("MCP iteration limit reached")
|
|
}
|
|
}
|
|
}
|
|
|
|
func handleQuestion(config *config.ModelConfig, funcResults []functions.FuncCallResults, result, prompt string) (string, error) {
|
|
|
|
if len(funcResults) == 0 && result != "" {
|
|
xlog.Debug("nothing function results but we had a message from the LLM")
|
|
|
|
return result, nil
|
|
}
|
|
|
|
xlog.Debug("nothing to do, computing a reply")
|
|
arg := ""
|
|
if len(funcResults) > 0 {
|
|
arg = funcResults[0].Arguments
|
|
}
|
|
// If there is a message that the LLM already sends as part of the JSON reply, use it
|
|
arguments := map[string]any{}
|
|
if err := json.Unmarshal([]byte(arg), &arguments); err != nil {
|
|
xlog.Debug("handleQuestion: function result did not contain a valid JSON object")
|
|
}
|
|
m, exists := arguments["message"]
|
|
if exists {
|
|
switch message := m.(type) {
|
|
case string:
|
|
if message != "" {
|
|
xlog.Debug("Reply received from LLM", "message", message)
|
|
message = backend.Finetune(*config, prompt, message)
|
|
xlog.Debug("Reply received from LLM(finetuned)", "message", message)
|
|
|
|
return message, nil
|
|
}
|
|
}
|
|
}
|
|
|
|
xlog.Debug("No action received from LLM, without a message, computing a reply")
|
|
|
|
return "", nil
|
|
}
|