Files
LocalAI/core/http/endpoints/anthropic/messages.go
Richard Palethorpe 085fc53bbc fix(router): production-ready request router + auto-size batch for embedding/rerank (#10104)
* fix(router): score classifier production-readiness

Conversation trimming runs through the classifier model's chat template
and trims by exact token count, sized to the model's n_batch which is
now scaled to context so long probes can't crash the backend. Missing
chat_message templates are a hard error at router build time. Router-
facing factories (Embedder/Scorer/Reranker/TokenCounter) re-resolve
ModelConfig per call so a model installed post-startup doesn't bind a
stub Backend="" config and silently fall into the loader's auto-
iterate path.

New 'vector_store' backend trace recorded inside localVectorStore on
every Search/Insert — including the backend-load-failure path that
previously vanished into an xlog.Warn — with outcome tagging
(hit/miss/empty_store/backend_load_error/find_error/insert_error/ok).
Companion cleanup drops misleading similarity:0 and input_tokens_count:0
from non-hit and text-mode traces.

Gallery local-store-development aliases to 'local-store' so the master
image satisfies pkg/model.LocalStoreBackend lookups from the embedding
cache.

Misc: llama-cpp TokenizeString reads the correct 'prompt' JSON key
(the original bug); ModelTokenize nil-guard; non-fatal mitm proxy
startup; PII 'route_local' renamed to 'allow' with docs/UI in sync;
model-editor footer no longer eats the edit area on small screens;
several config-editor template/dropdown/section fixes.

Tests: e2e router specs (casual/code-hint + long-conversation trim),
vector_store trace specs, lazy-factory specs, gallery dev-alias
resolution, Playwright trace badge + scroll regression.

Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>

* feat(backend): auto-size batch to context for embedding and rerank models

Embedding and rerank models pool over the whole input in a single physical batch (n_ubatch). With batch left at the 512 default, the backend rejects longer inputs with "input is too large to process", silently capping a large-context embedder (e.g. 8k/32k) at 512 tokens. Size n_batch to the context for these single-pass usecases, mirroring the existing FLAG_SCORE behaviour; an explicit batch: still wins.

Extracts EffectiveContextSize/EffectiveBatchSize from grpcModelOpts so the effective decode window has one home for other callers to reuse.

Adds an e2e-aio regression test that embeds a >512-token input. The AIO embedding model is switched to nomic-embed-text-v1.5 (2048 context) because the previous granite model was capped at 512 tokens and could not exercise the larger batch.

Assisted-by: claude-code:claude-opus-4-8 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(gallery): raise arch-router scoring output cap via parallel:64

Scoring decodes the whole prompt+candidate in a single llama_decode and
reads one logit row per candidate token. The vendored llama.cpp server
caps causal output rows at n_parallel, so the default of 1 aborts with
GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max) on multi-token route
labels. Set options: [parallel:64] on both arch-router quant entries to
lift the cap; kv_unified (the grpc-server default) keeps the full context
per sequence, so this does not split the KV cache.

Assisted-by: claude-code:claude-opus-4-8 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-06-12 16:21:15 +02:00

990 lines
33 KiB
Go

package anthropic
import (
"encoding/json"
"fmt"
"sync"
"time"
"github.com/google/uuid"
"github.com/labstack/echo/v4"
"github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/http/auth"
mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
openaiEndpoint "github.com/mudler/LocalAI/core/http/endpoints/openai"
"github.com/mudler/LocalAI/core/http/middleware"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/core/services/cloudproxy"
"github.com/mudler/LocalAI/core/services/routing/pii"
"github.com/mudler/LocalAI/core/templates"
"github.com/mudler/LocalAI/pkg/functions"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/xlog"
)
// MessagesEndpoint is the Anthropic Messages API endpoint
// https://docs.anthropic.com/claude/reference/messages_post
//
// The returned handler reads the parsed *schema.AnthropicRequest and the
// resolved *config.ModelConfig from the echo context, validates them, and
// then either forwards the request to the cloud-proxy backend (when the model
// is configured as a passthrough) or converts it to the internal OpenAI-style
// request, optionally injects MCP prompts/resources/tools, templates the
// prompt and dispatches to the streaming or non-streaming handler.
//
// Validation failures are answered with an Anthropic-style 400 error body.
//
// @Summary Generate a message response for the given messages and model.
// @Tags inference
// @Param request body schema.AnthropicRequest true "query params"
// @Success 200 {object} schema.AnthropicResponse "Response"
// @Router /v1/messages [post]
func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig, natsClient mcpTools.MCPNATSClient, piiRedactor *pii.Redactor, piiEvents pii.EventStore) echo.HandlerFunc {
	return func(c echo.Context) error {
		id := uuid.New().String()

		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.AnthropicRequest)
		// A typed nil pointer satisfies the type assertion, so it must be
		// rejected explicitly before input.Model is dereferenced (this
		// mirrors the cfg == nil check below).
		if !ok || input == nil || input.Model == "" {
			return sendAnthropicError(c, 400, "invalid_request_error", "model is required")
		}

		cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
		if !ok || cfg == nil {
			return sendAnthropicError(c, 400, "invalid_request_error", "model configuration not found")
		}

		if input.MaxTokens <= 0 {
			return sendAnthropicError(c, 400, "invalid_request_error", "max_tokens is required and must be greater than 0")
		}

		xlog.Debug("Anthropic Messages endpoint configuration read", "config", cfg)

		// Cloud-proxy bail. Same shape as the OpenAI chat endpoint —
		// forwards via the cloud-proxy gRPC backend.
		if cfg.IsCloudProxyBackendPassthrough() {
			return forwardCloudProxyAnthropicViaBackend(c, cfg, input, piiRedactor, piiEvents, ml, appConfig)
		}

		// Convert Anthropic messages to OpenAI format for internal processing
		openAIMessages := convertAnthropicToOpenAIMessages(input)

		// Convert Anthropic tools to internal Functions format
		funcs, shouldUseFn := convertAnthropicTools(input, cfg)

		// MCP injection: prompts, resources, and tools. Only attempted when
		// the request metadata asks for MCP and the model config declares
		// MCP servers (remote or stdio).
		var mcpExecutor mcpTools.ToolExecutor
		mcpServers := mcpTools.MCPServersFromMetadata(input.Metadata)
		mcpPromptName, mcpPromptArgs := mcpTools.MCPPromptFromMetadata(input.Metadata)
		mcpResourceURIs := mcpTools.MCPResourcesFromMetadata(input.Metadata)

		if (len(mcpServers) > 0 || mcpPromptName != "" || len(mcpResourceURIs) > 0) && (cfg.MCP.Servers != "" || cfg.MCP.Stdio != "") {
			remote, stdio, mcpErr := cfg.MCP.MCPConfigFromYAML()
			if mcpErr == nil {
				mcpExecutor = mcpTools.NewToolExecutor(c.Request().Context(), natsClient, cfg.Name, remote, stdio, mcpServers)

				// Prompt and resource injection (pre-processing step — resolves locally regardless of distributed mode)
				namedSessions, sessErr := mcpTools.NamedSessionsFromMCPConfig(cfg.Name, remote, stdio, mcpServers)
				if sessErr == nil && len(namedSessions) > 0 {
					// Best effort: the second return value is deliberately
					// ignored and a nil context simply skips the injection.
					mcpCtx, _ := mcpTools.InjectMCPContext(c.Request().Context(), namedSessions, mcpPromptName, mcpPromptArgs, mcpResourceURIs)
					if mcpCtx != nil {
						openAIMessages = append(mcpCtx.PromptMessages, openAIMessages...)
						mcpTools.AppendResourceSuffix(openAIMessages, mcpCtx.ResourceSuffix)
					}
				}

				// Tool injection via executor
				if mcpExecutor.HasTools() {
					mcpFuncs, discErr := mcpExecutor.DiscoverTools(c.Request().Context())
					if discErr == nil {
						for _, fn := range mcpFuncs {
							funcs = append(funcs, fn)
						}
						shouldUseFn = len(funcs) > 0 && cfg.ShouldUseFunctions()
						xlog.Debug("Anthropic MCP tools injected", "count", len(mcpFuncs), "total_funcs", len(funcs))
					} else {
						xlog.Error("Failed to discover MCP tools", "error", discErr)
					}
				}
			} else {
				xlog.Error("Failed to parse MCP config", "error", mcpErr)
			}
		}

		// Create an OpenAI-compatible request for internal processing
		openAIReq := &schema.OpenAIRequest{
			PredictionOptions: schema.PredictionOptions{
				BasicModelRequest: schema.BasicModelRequest{Model: input.Model},
				Temperature:       input.Temperature,
				TopK:              input.TopK,
				TopP:              input.TopP,
				Maxtokens:         &input.MaxTokens,
			},
			Messages: openAIMessages,
			Stream:   input.Stream,
			Context:  input.Context,
			Cancel:   input.Cancel,
		}

		// Set stop sequences
		if len(input.StopSequences) > 0 {
			openAIReq.Stop = input.StopSequences
		}

		// Merge config settings: request-level sampling parameters override
		// the values carried by the model config.
		if input.Temperature != nil {
			cfg.Temperature = input.Temperature
		}
		if input.TopK != nil {
			cfg.TopK = input.TopK
		}
		if input.TopP != nil {
			cfg.TopP = input.TopP
		}
		cfg.Maxtokens = &input.MaxTokens
		if len(input.StopSequences) > 0 {
			cfg.StopWords = append(cfg.StopWords, input.StopSequences...)
		}

		// Template the prompt with tools if available
		predInput := evaluator.TemplateMessages(*openAIReq, openAIReq.Messages, cfg, funcs, shouldUseFn)
		xlog.Debug("Anthropic Messages - Prompt (after templating)", "prompt", predInput)

		if input.Stream {
			return handleAnthropicStream(c, id, input, cfg, ml, cl, appConfig, predInput, openAIReq, funcs, shouldUseFn, mcpExecutor, evaluator, piiRedactor, piiEvents)
		}

		return handleAnthropicNonStream(c, id, input, cfg, ml, cl, appConfig, predInput, openAIReq, funcs, shouldUseFn, mcpExecutor, evaluator)
	}
}
// handleAnthropicNonStream serves a non-streaming Messages request.
//
// It runs inference and, when the model calls tools provided by the MCP
// executor, executes those tools server-side, appends the assistant and tool
// messages to openAIReq.Messages and re-runs inference. Iteration 0 is the
// initial inference; up to mcpMaxIterations (cfg.Agent.MaxIterations, default
// 10) further passes may follow. Once a pass produces no MCP tool calls the
// Anthropic response is built and written as JSON with status 200.
//
// Inference failures and an exhausted iteration budget are reported to the
// client as Anthropic-style 500 errors. The returned error is whatever the
// echo JSON writer returns.
func handleAnthropicNonStream(c echo.Context, id string, input *schema.AnthropicRequest, cfg *config.ModelConfig, ml *model.ModelLoader, cl *config.ModelConfigLoader, appConfig *config.ApplicationConfig, predInput string, openAIReq *schema.OpenAIRequest, funcs functions.Functions, shouldUseFn bool, mcpExecutor mcpTools.ToolExecutor, evaluator *templates.Evaluator) error {
	mcpMaxIterations := 10
	if cfg.Agent.MaxIterations > 0 {
		mcpMaxIterations = cfg.Agent.MaxIterations
	}
	hasMCPTools := mcpExecutor != nil && mcpExecutor.HasTools()

	for mcpIteration := 0; mcpIteration <= mcpMaxIterations; mcpIteration++ {
		// Re-template on each MCP iteration since messages may have changed
		if mcpIteration > 0 {
			predInput = evaluator.TemplateMessages(*openAIReq, openAIReq.Messages, cfg, funcs, shouldUseFn)
			xlog.Debug("Anthropic MCP re-templating", "iteration", mcpIteration, "prompt_len", len(predInput))
		}

		// Populate openAIReq fields for ComputeChoices
		openAIReq.Tools = convertFuncsToOpenAITools(funcs)
		openAIReq.ToolsChoice = input.ToolChoice
		openAIReq.Metadata = input.Metadata

		// The callback keeps only the last string it is handed.
		var result string
		cb := func(s string, c *[]schema.Choice) {
			result = s
		}

		_, tokenUsage, chatDeltas, err := openaiEndpoint.ComputeChoices(openAIReq, predInput, cfg, cl, appConfig, ml, cb, nil)
		if err != nil {
			xlog.Error("Anthropic model inference failed", "error", err)
			return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err))
		}

		// Try pre-parsed tool calls from C++ autoparser first, fall back to text parsing
		var toolCalls []functions.FuncCallResults
		if deltaToolCalls := functions.ToolCallsFromChatDeltas(chatDeltas); len(deltaToolCalls) > 0 {
			xlog.Debug("[ChatDeltas] Anthropic: using pre-parsed tool calls", "count", len(deltaToolCalls))
			toolCalls = deltaToolCalls
		} else {
			xlog.Debug("[ChatDeltas] Anthropic: no pre-parsed tool calls, falling back to Go-side text parsing")
			toolCalls = functions.ParseFunctionCall(result, cfg.FunctionsConfig)
		}

		// MCP server-side tool execution: if any tool calls are MCP tools, execute and loop
		if hasMCPTools && shouldUseFn && len(toolCalls) > 0 {
			var hasMCPCalls bool
			for _, tc := range toolCalls {
				if mcpExecutor != nil && mcpExecutor.IsTool(tc.Name) {
					hasMCPCalls = true
					break
				}
			}

			if hasMCPCalls {
				// Append assistant message with tool_calls to conversation
				assistantMsg := schema.Message{
					Role:    "assistant",
					Content: result,
				}
				for i, tc := range toolCalls {
					toolCallID := tc.ID
					if toolCallID == "" {
						toolCallID = fmt.Sprintf("toolu_%s_%d", id, i)
					}
					assistantMsg.ToolCalls = append(assistantMsg.ToolCalls, schema.ToolCall{
						Index: i,
						ID:    toolCallID,
						Type:  "function",
						FunctionCall: schema.FunctionCall{
							Name:      tc.Name,
							Arguments: tc.Arguments,
						},
					})
				}
				openAIReq.Messages = append(openAIReq.Messages, assistantMsg)

				// Execute each MCP tool call and append results. Calls to
				// tools the executor does not own are skipped here.
				for _, tc := range assistantMsg.ToolCalls {
					if mcpExecutor == nil || !mcpExecutor.IsTool(tc.FunctionCall.Name) {
						continue
					}
					xlog.Debug("Executing MCP tool (Anthropic)", "tool", tc.FunctionCall.Name, "iteration", mcpIteration)
					toolResult, toolErr := mcpExecutor.ExecuteTool(
						c.Request().Context(), tc.FunctionCall.Name, tc.FunctionCall.Arguments,
					)
					if toolErr != nil {
						// Surface the failure to the model as the tool output
						// instead of aborting the whole request.
						xlog.Error("MCP tool execution failed", "tool", tc.FunctionCall.Name, "error", toolErr)
						toolResult = fmt.Sprintf("Error: %v", toolErr)
					}
					openAIReq.Messages = append(openAIReq.Messages, schema.Message{
						Role:          "tool",
						Content:       toolResult,
						StringContent: toolResult,
						ToolCallID:    tc.ID,
						Name:          tc.FunctionCall.Name,
					})
				}

				xlog.Debug("Anthropic MCP tools executed, re-running inference", "iteration", mcpIteration)
				continue // next MCP iteration
			}
		}

		// No MCP tools to execute, build and return response
		var contentBlocks []schema.AnthropicContentBlock
		var stopReason string

		if shouldUseFn && len(toolCalls) > 0 {
			stopReason = "tool_use"
			for i, tc := range toolCalls {
				var inputArgs map[string]any
				if err := json.Unmarshal([]byte(tc.Arguments), &inputArgs); err != nil {
					xlog.Warn("Failed to parse tool call arguments as JSON", "error", err, "args", tc.Arguments)
					inputArgs = map[string]any{"raw": tc.Arguments}
				}
				// Keep the ID supplied with the parsed tool call when there
				// is one, and only synthesise an ID otherwise — the same rule
				// the MCP and automatic-fallback branches apply.
				toolCallID := tc.ID
				if toolCallID == "" {
					toolCallID = fmt.Sprintf("toolu_%s_%d", id, i)
				}
				contentBlocks = append(contentBlocks, schema.AnthropicContentBlock{
					Type:  "tool_use",
					ID:    toolCallID,
					Name:  tc.Name,
					Input: inputArgs,
				})
			}
			// Any plain text the model produced goes in front of the tool_use blocks.
			textContent := functions.ParseTextContent(result, cfg.FunctionsConfig)
			if textContent != "" {
				contentBlocks = append([]schema.AnthropicContentBlock{{Type: "text", Text: textContent}}, contentBlocks...)
			}
		} else if !shouldUseFn && cfg.FunctionsConfig.AutomaticToolParsingFallback && result != "" {
			// Automatic tool parsing fallback: no tools in request but model emitted tool call markup
			parsed := functions.ParseFunctionCall(result, cfg.FunctionsConfig)
			if len(parsed) > 0 {
				stopReason = "tool_use"
				stripped := functions.StripToolCallMarkup(result)
				if stripped != "" {
					contentBlocks = append(contentBlocks, schema.AnthropicContentBlock{Type: "text", Text: stripped})
				}
				for i, fc := range parsed {
					var inputArgs map[string]any
					if err := json.Unmarshal([]byte(fc.Arguments), &inputArgs); err != nil {
						inputArgs = map[string]any{"raw": fc.Arguments}
					}
					toolCallID := fc.ID
					if toolCallID == "" {
						toolCallID = fmt.Sprintf("toolu_%s_%d", id, i)
					}
					contentBlocks = append(contentBlocks, schema.AnthropicContentBlock{
						Type:  "tool_use",
						ID:    toolCallID,
						Name:  fc.Name,
						Input: inputArgs,
					})
				}
			} else {
				stopReason = "end_turn"
				contentBlocks = []schema.AnthropicContentBlock{{Type: "text", Text: result}}
			}
		} else {
			stopReason = "end_turn"
			contentBlocks = []schema.AnthropicContentBlock{
				{Type: "text", Text: result},
			}
		}

		resp := &schema.AnthropicResponse{
			ID:         fmt.Sprintf("msg_%s", id),
			Type:       "message",
			Role:       "assistant",
			Model:      input.Model,
			StopReason: &stopReason,
			Content:    contentBlocks,
			Usage: schema.AnthropicUsage{
				InputTokens:  tokenUsage.Prompt,
				OutputTokens: tokenUsage.Completion,
			},
		}

		if respData, err := json.Marshal(resp); err == nil {
			xlog.Debug("Anthropic Response", "response", string(respData))
		}

		middleware.StampUsage(c, input.Model, tokenUsage.Prompt, tokenUsage.Completion)
		return c.JSON(200, resp)
	} // end MCP iteration loop

	// Every pass ended in an MCP tool round; give up rather than loop forever.
	return sendAnthropicError(c, 500, "api_error", "MCP iteration limit reached")
}
// handleAnthropicStream serves a streaming Messages request as Server-Sent
// Events.
//
// It sets the SSE response headers, emits message_start, and then runs up to
// mcpMaxIterations+1 inference passes (cfg.Agent.MaxIterations, default 10).
// Each pass opens a text content block at index 0, streams text_delta events
// from the token callback (through the optional per-stream PII filter) and
// emits tool_use blocks when tool calls are detected. If any collected tool
// call belongs to the MCP executor, the tools are executed server-side, their
// results are appended to openAIReq.Messages and the next pass starts.
// Otherwise the pass ends the stream with message_delta and message_stop.
//
// The function always returns nil: inference failures are reported to the
// client as an SSE "error" event rather than as a handler error.
func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicRequest, cfg *config.ModelConfig, ml *model.ModelLoader, cl *config.ModelConfigLoader, appConfig *config.ApplicationConfig, predInput string, openAIReq *schema.OpenAIRequest, funcs functions.Functions, shouldUseFn bool, mcpExecutor mcpTools.ToolExecutor, evaluator *templates.Evaluator, piiRedactor *pii.Redactor, piiEvents pii.EventStore) error {
c.Response().Header().Set("Content-Type", "text/event-stream")
c.Response().Header().Set("Cache-Control", "no-cache")
c.Response().Header().Set("Connection", "keep-alive")
// Per-stream PII filter — same gating as the OpenAI chat path. The
// filter is wire-format-agnostic; we feed it the text portion of
// each text_delta and emit only what's safe to send. The filter
// holds back a tail of size MaxPatternLength-1 so a pattern split
// across chunk boundaries still gets masked. When PII is disabled
// for this model the filter is nil and emits flow unchanged.
var streamPIIFilter *pii.StreamFilter
if piiRedactor != nil && cfg.PIIIsEnabled() {
correlationID := c.Request().Header.Get("x-request-id")
userID := ""
if u := auth.GetUser(c); u != nil {
userID = u.ID
}
// Only overrides whose action is one of the known pii.Action values
// are kept; anything else is dropped silently.
var overrides map[string]pii.Action
if raw := cfg.PIIPatternOverrides(); len(raw) > 0 {
overrides = make(map[string]pii.Action, len(raw))
for ovid, action := range raw {
switch pii.Action(action) {
case pii.ActionMask, pii.ActionBlock, pii.ActionAllow:
overrides[ovid] = pii.Action(action)
}
}
}
streamPIIFilter = pii.NewStreamFilter(piiRedactor, overrides, piiEvents, correlationID, userID)
}
// Send message_start event
messageStart := schema.AnthropicStreamEvent{
Type: "message_start",
Message: &schema.AnthropicStreamMessage{
ID: fmt.Sprintf("msg_%s", id),
Type: "message",
Role: "assistant",
Content: []schema.AnthropicContentBlock{},
Model: input.Model,
Usage: schema.AnthropicUsage{InputTokens: 0, OutputTokens: 0},
},
}
sendAnthropicSSE(c, messageStart)
mcpMaxIterations := 10
if cfg.Agent.MaxIterations > 0 {
mcpMaxIterations = cfg.Agent.MaxIterations
}
hasMCPTools := mcpExecutor != nil && mcpExecutor.HasTools()
// Iteration 0 is the initial inference; the inclusive bound allows up to
// mcpMaxIterations further passes after MCP tool rounds.
for mcpIteration := 0; mcpIteration <= mcpMaxIterations; mcpIteration++ {
// Re-template on MCP iterations
if mcpIteration > 0 {
predInput = evaluator.TemplateMessages(*openAIReq, openAIReq.Messages, cfg, funcs, shouldUseFn)
xlog.Debug("Anthropic MCP stream re-templating", "iteration", mcpIteration)
}
// Track accumulated content for tool call detection
// NOTE(review): this state is reset on every pass, so block indices
// restart at 0 after an MCP tool round — confirm clients tolerate a
// repeated content_block_start for index 0 within one message.
accumulatedContent := ""
currentBlockIndex := 0
inToolCall := false
toolCallsEmitted := 0
// Send initial content_block_start event
contentBlockStart := schema.AnthropicStreamEvent{
Type: "content_block_start",
Index: intPtr(currentBlockIndex),
ContentBlock: &schema.AnthropicContentBlock{Type: "text", Text: ""},
}
sendAnthropicSSE(c, contentBlockStart)
// Collect tool calls for MCP execution
var collectedToolCalls []functions.FuncCallResults
// SSE keepalive: send comment pings every 3s until the first token arrives.
// This prevents clients (e.g. Claude Code) from timing out while the model loads or processes the prompt.
// firstTokenReceived is closed to ask the goroutine to stop;
// keepaliveDone is closed by the goroutine once it has exited.
firstTokenReceived := make(chan struct{})
keepaliveDone := make(chan struct{})
go func() {
defer close(keepaliveDone)
ticker := time.NewTicker(3 * time.Second)
defer ticker.Stop()
for {
select {
case <-firstTokenReceived:
return
case <-c.Request().Context().Done():
return
case <-ticker.C:
fmt.Fprintf(c.Response().Writer, "event: ping\ndata: {\"type\": \"ping\"}\n\n")
c.Response().Flush()
}
}
}()
// The Once guarantees firstTokenReceived is closed exactly once, whether
// by the first token or by the post-inference cleanup further down.
firstTokenOnce := sync.Once{}
tokenCallback := func(token string, usage backend.TokenUsage) bool {
firstTokenOnce.Do(func() {
close(firstTokenReceived)
<-keepaliveDone // wait for keepalive goroutine to exit before writing
})
accumulatedContent += token
if shouldUseFn {
// The whole accumulated output is cleaned and re-parsed on every
// token; only tool calls beyond toolCallsEmitted are new.
cleanedResult := functions.CleanupLLMResult(accumulatedContent, cfg.FunctionsConfig)
toolCalls := functions.ParseFunctionCall(cleanedResult, cfg.FunctionsConfig)
if len(toolCalls) > toolCallsEmitted {
// First tool call of this pass: flush the PII filter and close the
// text block before any tool_use block is opened.
if !inToolCall && currentBlockIndex == 0 {
drainStreamPIIToText(c, streamPIIFilter, intPtr(currentBlockIndex))
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_stop",
Index: intPtr(currentBlockIndex),
})
currentBlockIndex++
inToolCall = true
}
// Each new tool call is emitted as a complete start/delta/stop triple,
// with the full arguments in a single input_json_delta.
for i := toolCallsEmitted; i < len(toolCalls); i++ {
tc := toolCalls[i]
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_start",
Index: intPtr(currentBlockIndex),
ContentBlock: &schema.AnthropicContentBlock{
Type: "tool_use",
ID: fmt.Sprintf("toolu_%s_%d", id, i),
Name: tc.Name,
},
})
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_delta",
Index: intPtr(currentBlockIndex),
Delta: &schema.AnthropicStreamDelta{
Type: "input_json_delta",
PartialJSON: tc.Arguments,
},
})
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_stop",
Index: intPtr(currentBlockIndex),
})
currentBlockIndex++
}
collectedToolCalls = toolCalls
toolCallsEmitted = len(toolCalls)
return true
}
}
// Plain text is forwarded only while no tool call has been seen in this
// pass; with a PII filter, only what the filter releases is sent.
if !inToolCall && token != "" {
out := token
if streamPIIFilter != nil {
out = streamPIIFilter.Push(token)
}
if out != "" {
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_delta",
Index: intPtr(0),
Delta: &schema.AnthropicStreamDelta{
Type: "text_delta",
Text: out,
},
})
}
}
return true
}
// Populate openAIReq fields for ComputeChoices
openAIReq.Tools = convertFuncsToOpenAITools(funcs)
openAIReq.ToolsChoice = input.ToolChoice
openAIReq.Metadata = input.Metadata
_, tokenUsage, chatDeltas, err := openaiEndpoint.ComputeChoices(openAIReq, predInput, cfg, cl, appConfig, ml, func(s string, c *[]schema.Choice) {}, tokenCallback)
// Stop the keepalive goroutine now that inference is done
firstTokenOnce.Do(func() { close(firstTokenReceived) })
<-keepaliveDone
if err != nil {
xlog.Error("Anthropic stream model inference failed", "error", err)
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "error",
Error: &schema.AnthropicError{
Type: "api_error",
Message: fmt.Sprintf("model inference failed: %v", err),
},
})
return nil
}
// Check chat deltas from C++ autoparser — when active, the raw
// message is cleared and content/tool calls arrive via ChatDeltas.
if len(chatDeltas) > 0 {
deltaContent := functions.ContentFromChatDeltas(chatDeltas)
deltaToolCalls := functions.ToolCallsFromChatDeltas(chatDeltas)
// Emit text content from ChatDeltas only when the tokenCallback
// didn't already stream it (autoparser clears raw text, so
// accumulatedContent will be empty in that case).
if deltaContent != "" && !inToolCall && accumulatedContent == "" {
out := deltaContent
if streamPIIFilter != nil {
out = streamPIIFilter.Push(deltaContent)
}
if out != "" {
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_delta",
Index: intPtr(0),
Delta: &schema.AnthropicStreamDelta{
Type: "text_delta",
Text: out,
},
})
}
}
// Emit tool_use blocks from ChatDeltas
// (skipped when the token callback already collected tool calls).
if len(deltaToolCalls) > 0 && len(collectedToolCalls) == 0 {
collectedToolCalls = deltaToolCalls
if !inToolCall && currentBlockIndex == 0 {
drainStreamPIIToText(c, streamPIIFilter, intPtr(currentBlockIndex))
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_stop",
Index: intPtr(currentBlockIndex),
})
currentBlockIndex++
inToolCall = true
}
for i, tc := range deltaToolCalls {
toolCallID := tc.ID
if toolCallID == "" {
toolCallID = fmt.Sprintf("toolu_%s_%d", id, i)
}
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_start",
Index: intPtr(currentBlockIndex),
ContentBlock: &schema.AnthropicContentBlock{
Type: "tool_use",
ID: toolCallID,
Name: tc.Name,
},
})
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_delta",
Index: intPtr(currentBlockIndex),
Delta: &schema.AnthropicStreamDelta{
Type: "input_json_delta",
PartialJSON: tc.Arguments,
},
})
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_stop",
Index: intPtr(currentBlockIndex),
})
currentBlockIndex++
toolCallsEmitted++
}
}
}
// MCP streaming tool execution: if we collected MCP tool calls, execute and loop
if hasMCPTools && len(collectedToolCalls) > 0 {
var hasMCPCalls bool
for _, tc := range collectedToolCalls {
if mcpExecutor != nil && mcpExecutor.IsTool(tc.Name) {
hasMCPCalls = true
break
}
}
if hasMCPCalls {
// Append assistant message with tool_calls
assistantMsg := schema.Message{
Role: "assistant",
Content: accumulatedContent,
}
for i, tc := range collectedToolCalls {
toolCallID := tc.ID
if toolCallID == "" {
toolCallID = fmt.Sprintf("toolu_%s_%d", id, i)
}
assistantMsg.ToolCalls = append(assistantMsg.ToolCalls, schema.ToolCall{
Index: i,
ID: toolCallID,
Type: "function",
FunctionCall: schema.FunctionCall{
Name: tc.Name,
Arguments: tc.Arguments,
},
})
}
openAIReq.Messages = append(openAIReq.Messages, assistantMsg)
// Execute MCP tool calls
// (calls to tools the executor does not own are skipped).
for _, tc := range assistantMsg.ToolCalls {
if mcpExecutor == nil || !mcpExecutor.IsTool(tc.FunctionCall.Name) {
continue
}
xlog.Debug("Executing MCP tool (Anthropic stream)", "tool", tc.FunctionCall.Name, "iteration", mcpIteration)
toolResult, toolErr := mcpExecutor.ExecuteTool(
c.Request().Context(), tc.FunctionCall.Name, tc.FunctionCall.Arguments,
)
if toolErr != nil {
// The failure text becomes the tool output fed back to the model.
xlog.Error("MCP tool execution failed", "tool", tc.FunctionCall.Name, "error", toolErr)
toolResult = fmt.Sprintf("Error: %v", toolErr)
}
openAIReq.Messages = append(openAIReq.Messages, schema.Message{
Role: "tool",
Content: toolResult,
StringContent: toolResult,
ToolCallID: tc.ID,
Name: tc.FunctionCall.Name,
})
}
xlog.Debug("Anthropic MCP streaming tools executed, re-running inference", "iteration", mcpIteration)
continue // next MCP iteration
}
}
// Automatic tool parsing fallback for streaming: when no tools were requested
// but the model emitted tool call markup, parse and emit as tool_use blocks.
if !shouldUseFn && cfg.FunctionsConfig.AutomaticToolParsingFallback && accumulatedContent != "" && toolCallsEmitted == 0 {
parsed := functions.ParseFunctionCall(accumulatedContent, cfg.FunctionsConfig)
if len(parsed) > 0 {
// Close the text content block (after flushing any
// residual the streaming PII filter held back).
drainStreamPIIToText(c, streamPIIFilter, intPtr(currentBlockIndex))
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_stop",
Index: intPtr(currentBlockIndex),
})
currentBlockIndex++
inToolCall = true
for i, fc := range parsed {
toolCallID := fc.ID
if toolCallID == "" {
toolCallID = fmt.Sprintf("toolu_%s_%d", id, i)
}
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_start",
Index: intPtr(currentBlockIndex),
ContentBlock: &schema.AnthropicContentBlock{
Type: "tool_use",
ID: toolCallID,
Name: fc.Name,
},
})
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_delta",
Index: intPtr(currentBlockIndex),
Delta: &schema.AnthropicStreamDelta{
Type: "input_json_delta",
PartialJSON: fc.Arguments,
},
})
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_stop",
Index: intPtr(currentBlockIndex),
})
currentBlockIndex++
toolCallsEmitted++
}
}
}
// No MCP tools to execute, close stream. drainStreamPIIToText
// flushes any residual the streaming PII filter held back as
// part of its trailing pattern-window before we close the
// text content block.
if !inToolCall {
drainStreamPIIToText(c, streamPIIFilter, intPtr(0))
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "content_block_stop",
Index: intPtr(0),
})
}
stopReason := "end_turn"
if toolCallsEmitted > 0 {
stopReason = "tool_use"
}
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "message_delta",
Delta: &schema.AnthropicStreamDelta{
StopReason: &stopReason,
},
Usage: &schema.AnthropicUsage{
OutputTokens: tokenUsage.Completion,
},
})
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "message_stop",
})
middleware.StampUsage(c, input.Model, tokenUsage.Prompt, tokenUsage.Completion)
return nil
} // end MCP iteration loop
// Safety fallback
// Reached only when every pass ended in an MCP tool round (continue);
// the stream is terminated without a message_delta.
sendAnthropicSSE(c, schema.AnthropicStreamEvent{
Type: "message_stop",
})
return nil
}
// convertFuncsToOpenAITools wraps each internal function definition in a
// functions.Tool of type "function", preserving order. The result is always a
// non-nil slice holding exactly one entry per input function.
func convertFuncsToOpenAITools(funcs functions.Functions) []functions.Tool {
	wrapped := make([]functions.Tool, 0, len(funcs))
	for idx := range funcs {
		wrapped = append(wrapped, functions.Tool{Type: "function", Function: funcs[idx]})
	}
	return wrapped
}
// intPtr returns a pointer to a fresh copy of i, so every call yields a
// distinct address that the caller may freely retain.
func intPtr(i int) *int {
	boxed := new(int)
	*boxed = i
	return boxed
}
// drainStreamPIIToText empties the streaming PII filter's held-back tail
// (the trailing pattern-window it keeps so split patterns can still be
// masked) and, when that tail is non-empty, sends it as a final text_delta
// for the content block identified by index. Callers invoke it right before
// closing that block. Drain is idempotent — a second call on the same filter
// yields "" — and a nil filter makes this a no-op.
func drainStreamPIIToText(c echo.Context, sf *pii.StreamFilter, index *int) {
	if sf != nil {
		if tail := sf.Drain(); tail != "" {
			delta := &schema.AnthropicStreamDelta{
				Type: "text_delta",
				Text: tail,
			}
			sendAnthropicSSE(c, schema.AnthropicStreamEvent{
				Type:  "content_block_delta",
				Index: index,
				Delta: delta,
			})
		}
	}
}
// sendAnthropicSSE serialises event as JSON and writes it to the response as
// one Server-Sent Event ("event: <type>" followed by "data: <json>"), then
// flushes so the client receives it immediately. If the event cannot be
// marshalled the failure is logged and nothing is written.
func sendAnthropicSSE(c echo.Context, event schema.AnthropicStreamEvent) {
	payload, marshalErr := json.Marshal(event)
	if marshalErr != nil {
		xlog.Error("Failed to marshal SSE event", "error", marshalErr)
		return
	}

	resp := c.Response()
	fmt.Fprintf(resp.Writer, "event: %s\ndata: %s\n\n", event.Type, payload)
	resp.Flush()
}
// sendAnthropicError writes an Anthropic-style error envelope
// ({"type":"error","error":{...}}) as JSON with the given HTTP status code
// and returns whatever error the echo JSON writer reports.
func sendAnthropicError(c echo.Context, statusCode int, errorType, message string) error {
	detail := schema.AnthropicError{
		Type:    errorType,
		Message: message,
	}
	return c.JSON(statusCode, schema.AnthropicErrorResponse{
		Type:  "error",
		Error: detail,
	})
}
// convertAnthropicToOpenAIMessages translates an Anthropic Messages request
// into the internal OpenAI-style message list.
//
// A non-empty system prompt becomes a leading "system" message. Each request
// message keeps its role; its content is either a plain string or an array of
// content blocks, handled as follows:
//
//   - "text" blocks are concatenated into the message text.
//   - "image" blocks with a base64 source become data URIs in StringImages.
//   - "tool_use" blocks become ToolCalls, with the input re-serialised as JSON.
//   - "tool_result" blocks in user messages are folded into the message text
//     as "[Tool Result for <id>]: ...", prefixed with "Error: " when the block
//     is flagged is_error.
//
// Blocks of any other type, and blocks that are not JSON objects, are ignored.
func convertAnthropicToOpenAIMessages(input *schema.AnthropicRequest) []schema.Message {
	var messages []schema.Message

	// Add system message if present
	if input.System != "" {
		sysStr := string(input.System)
		messages = append(messages, schema.Message{
			Role:          "system",
			StringContent: sysStr,
			Content:       sysStr,
		})
	}

	// Convert Anthropic messages to OpenAI format
	for _, msg := range input.Messages {
		openAIMsg := schema.Message{
			Role: msg.Role,
		}

		// Handle content (can be string or array of content blocks)
		switch content := msg.Content.(type) {
		case string:
			openAIMsg.StringContent = content
			openAIMsg.Content = content
		case []any:
			// Handle array of content blocks
			var textContent string
			var stringImages []string
			var toolCalls []schema.ToolCall
			toolCallIndex := 0

			for _, block := range content {
				blockMap, ok := block.(map[string]any)
				if !ok {
					continue
				}
				blockType, _ := blockMap["type"].(string)
				switch blockType {
				case "text":
					if text, ok := blockMap["text"].(string); ok {
						textContent += text
					}
				case "image":
					// Only base64 sources are supported; anything else is dropped.
					if source, ok := blockMap["source"].(map[string]any); ok {
						if sourceType, ok := source["type"].(string); ok && sourceType == "base64" {
							if data, ok := source["data"].(string); ok {
								mediaType, _ := source["media_type"].(string)
								// Format as data URI
								dataURI := fmt.Sprintf("data:%s;base64,%s", mediaType, data)
								stringImages = append(stringImages, dataURI)
							}
						}
					}
				case "tool_use":
					// Convert tool_use to ToolCall format
					toolID, _ := blockMap["id"].(string)
					toolName, _ := blockMap["name"].(string)
					toolInput := blockMap["input"]

					// Serialize input to JSON string
					inputJSON, err := json.Marshal(toolInput)
					if err != nil {
						xlog.Warn("Failed to marshal tool input", "error", err)
						inputJSON = []byte("{}")
					}

					toolCalls = append(toolCalls, schema.ToolCall{
						Index: toolCallIndex,
						ID:    toolID,
						Type:  "function",
						FunctionCall: schema.FunctionCall{
							Name:      toolName,
							Arguments: string(inputJSON),
						},
					})
					toolCallIndex++
				case "tool_result":
					// Tool results are not turned into separate "tool" role
					// messages; they are folded into this message's text.
					toolUseID, _ := blockMap["tool_use_id"].(string)

					// encoding/json decodes JSON booleans held in an
					// interface value as plain bool, never *bool, so bool is
					// the form that actually occurs; *bool is still accepted
					// for callers that build the block map by hand.
					isError := false
					switch flag := blockMap["is_error"].(type) {
					case bool:
						isError = flag
					case *bool:
						if flag != nil {
							isError = *flag
						}
					}

					var resultText string
					if resultContent, ok := blockMap["content"]; ok {
						switch rc := resultContent.(type) {
						case string:
							resultText = rc
						case []any:
							// Array of content blocks: keep only the text parts.
							for _, cb := range rc {
								if cbMap, ok := cb.(map[string]any); ok {
									if cbMap["type"] == "text" {
										if text, ok := cbMap["text"].(string); ok {
											resultText += text
										}
									}
								}
							}
						}
					}

					// Tool results are only meaningful in user messages.
					if msg.Role == "user" {
						prefix := ""
						if isError {
							prefix = "Error: "
						}
						textContent += fmt.Sprintf("\n[Tool Result for %s]: %s%s", toolUseID, prefix, resultText)
					}
				}
			}

			openAIMsg.StringContent = textContent
			openAIMsg.Content = textContent
			openAIMsg.StringImages = stringImages

			// Add tool calls if present
			if len(toolCalls) > 0 {
				openAIMsg.ToolCalls = toolCalls
			}
		}

		messages = append(messages, openAIMsg)
	}

	return messages
}
// convertAnthropicTools converts Anthropic tools to internal Functions format
// and applies the request's tool_choice to cfg.
//
// It returns the converted functions and whether function calling should be
// used for this request (there is at least one function and
// cfg.ShouldUseFunctions() agrees). With no tools in the request it returns
// (nil, false).
//
// tool_choice is accepted both as a bare string and as the object form used
// by the Anthropic API ({"type": "..."}):
//
//   - "auto": default, the model decides.
//   - "any": the model must call one of the tools.
//   - "none": tools are disabled; (nil, false) is returned.
//   - {"type": "tool", "name": "..."}: the named tool is forced.
func convertAnthropicTools(input *schema.AnthropicRequest, cfg *config.ModelConfig) (functions.Functions, bool) {
	if len(input.Tools) == 0 {
		return nil, false
	}

	var funcs functions.Functions
	for _, tool := range input.Tools {
		f := functions.Function{
			Name:        tool.Name,
			Description: tool.Description,
			Parameters:  tool.InputSchema,
		}
		funcs = append(funcs, f)
	}

	// Handle tool_choice
	if input.ToolChoice != nil {
		// Normalise both accepted shapes to a choice kind plus an optional
		// tool name, so the string and object forms are treated alike.
		var choiceKind, toolName string
		var hasToolName bool
		switch tc := input.ToolChoice.(type) {
		case string:
			// "auto", "any", or "none". A bare "tool" carries no name and is
			// therefore ignored, exactly as before.
			if tc != "tool" {
				choiceKind = tc
			}
		case map[string]any:
			// {"type": "auto" | "any" | "none"} or
			// {"type": "tool", "name": "tool_name"}
			choiceKind, _ = tc["type"].(string)
			toolName, hasToolName = tc["name"].(string)
		}

		switch choiceKind {
		case "any":
			// Force the model to use one of the tools
			cfg.SetFunctionCallString("required")
		case "none":
			// Don't use tools
			return nil, false
		case "tool":
			// Force specific tool
			if hasToolName {
				cfg.SetFunctionCallNameString(toolName)
			}
		}
		// "auto" (and anything unrecognised) is the default - let model decide
	}

	return funcs, len(funcs) > 0 && cfg.ShouldUseFunctions()
}
// forwardCloudProxyAnthropicViaBackend re-serialises the Anthropic request
// and hands it to the cloud-proxy gRPC backend, together with a streaming PII
// filter built from the request context. The filter is constructed here
// because the auth/correlation context only exists in the echo handler; model
// swap and upstream auth headers are applied inside the backend. A request
// that cannot be marshalled is answered with a 400 invalid_request_error.
func forwardCloudProxyAnthropicViaBackend(c echo.Context, cfg *config.ModelConfig, input *schema.AnthropicRequest, piiRedactor *pii.Redactor, piiEvents pii.EventStore, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error {
	payload, marshalErr := json.Marshal(input)
	if marshalErr != nil {
		reason := "cloudproxy: marshal request: " + marshalErr.Error()
		return sendAnthropicError(c, 400, "invalid_request_error", reason)
	}

	requestID := c.Request().Header.Get("x-request-id")
	filter := cloudproxy.BuildStreamFilter(c, cfg, input.Stream, piiRedactor, piiEvents, requestID)

	return cloudproxy.ForwardViaBackend(c, cfg, payload, filter, ml, appConfig)
}