Files
LocalAI/backend/go/cloud-proxy/provider_anthropic.go
Richard Palethorpe 6a80e23733 feat(middleware): Model routing, PII filtering, Cloud model proxies (#9802)
Add a routing middleware stack and a cloud-proxy backend.

* cloud-proxy: a Go gRPC backend that forwards OpenAI- and
  Anthropic-shaped chat requests to upstream providers, with an
  optional translate mode (OpenAI request -> Anthropic /v1/messages
  -> OpenAI response) and full tool-calling support.

* routing: admission control, content-aware model routing
  (embedding cache + classifier + rerank + Arch-Router score),
  PII detection/redaction (regex + NER) with streaming filter and
  OpenAI/Anthropic adapters, and a per-user/per-key billing recorder
  backed by GORM or in-memory storage.

* middleware: UsageMiddleware records usage via the billing recorder,
  plus admission, route-model, usage-stamp and trace middlewares.

* observability: BackendTrace ring buffer stores full request bodies
  (capped), MITM proxy emits structured trace events, and router
  classifier decisions surface at /api/router/decide.

* gallery: Arch-Router-1.5B (Q4_K_M and Q8_0).

* UI: cloud-proxy model-editor fields, classifier system-prompt and
  score-normalization config, and a Traces page rendering request
  bodies.

Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash]

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-05-25 09:28:27 +02:00

509 lines
17 KiB
Go

package main
import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/xlog"
)
// Anthropic Messages API wire-format types. Narrowed to what translate
// mode preserves through the Reply proto: text + tool_use blocks +
// usage tokens. Image blocks, prompt caching, metadata, and stop
// sequence metadata are not modelled — passthrough mode covers those.
//
// Notable differences from OpenAI:
//   - max_tokens is REQUIRED. Anthropic 400s without it.
//   - Roles are user/assistant only — system messages move to a
//     top-level `system` string field.
//   - Streaming SSE uses event: lines alongside data: lines. The reader
//     in predictAnthropicStreamRich skips the event: lines and
//     dispatches on the `type` field inside each data: payload, e.g.
//     content_block_start (carries tool_use init: id + name),
//     content_block_delta (text_delta with text; input_json_delta with
//     partial_json for tool arguments), message_delta (usage) and
//     message_stop (terminates the stream).
//
// anthropicRequest is the JSON body marshalled by buildAnthropicRequest.
// Temperature and TopP are pointers so that an unset value is omitted
// from the body instead of being sent as 0. Messages has no omitempty:
// a nil slice is encoded as `"messages":null`.
type anthropicRequest struct {
Model string `json:"model"`
MaxTokens int32 `json:"max_tokens"`
System string `json:"system,omitempty"`
Messages []anthropicMessage `json:"messages"`
Stream bool `json:"stream,omitempty"`
Temperature *float64 `json:"temperature,omitempty"`
TopP *float64 `json:"top_p,omitempty"`
StopSequences []string `json:"stop_sequences,omitempty"`
Tools []anthropicTool `json:"tools,omitempty"`
ToolChoice *anthropicToolChoice `json:"tool_choice,omitempty"`
}
// anthropicMessage is one conversation turn in the request's messages
// array. buildAnthropicRequest only ever emits the roles "user" and
// "assistant".
//
// Content is `any` because Anthropic accepts a bare string OR a
// list of content blocks. Use the string form for plain user/
// assistant turns; switch to []anthropicContentBlock when the
// turn needs tool_use (assistant) or tool_result (user) blocks.
// appendToolResult type-asserts on the block-slice form to decide
// whether a tool_result can be merged into the preceding user turn.
type anthropicMessage struct {
Role string `json:"role"`
Content any `json:"content"`
}
// anthropicTool is one entry of the request's `tools` array.
// convertOpenAITools fills it from an OpenAI function tool: Name and
// Description are copied, and InputSchema carries the tool's JSON Schema
// as raw bytes so it is forwarded without being re-encoded. InputSchema
// has no omitempty, so it is always present in the marshalled tool.
type anthropicTool struct {
Name string `json:"name"`
Description string `json:"description,omitempty"`
InputSchema json.RawMessage `json:"input_schema"`
}
// anthropicToolChoice mirrors the four shapes Anthropic accepts:
// {"type":"auto"} | {"type":"any"} | {"type":"tool","name":"X"} |
// {"type":"none"} (newer models). OpenAI's "auto"/"none"/
// "required"/{"function":{"name":"X"}} all map here.
//
// Name is only populated by convertOpenAIToolChoice for Type "tool" and
// is omitted from the JSON otherwise. The "none" shape is used as an
// in-process marker: buildAnthropicRequest strips tools and tool_choice
// when it sees it rather than sending it upstream.
type anthropicToolChoice struct {
Type string `json:"type"`
Name string `json:"name,omitempty"`
}
// anthropicContentBlock is the union shape used both for response
// blocks (text/tool_use we read off the wire) and outbound request
// blocks (tool_use/tool_result we emit in the conversation history).
// Anthropic encodes tool calls inline rather than as a separate field,
// so we walk Content[] looking for type=="tool_use" on responses and
// produce equivalent blocks when serialising prior-turn tool calls.
//
// Type discriminates which of the remaining fields are meaningful; all
// of them are omitempty so each block marshals with only the fields
// that were set.
type anthropicContentBlock struct {
Type string `json:"type"`
// Text is the payload of a "text" block.
Text string `json:"text,omitempty"`
// ID, Name and Input describe a "tool_use" block: the call id, the
// tool name and the call arguments as raw JSON.
ID string `json:"id,omitempty"`
Name string `json:"name,omitempty"`
Input json.RawMessage `json:"input,omitempty"`
// Tool-result block fields. tool_result uses `content` (not
// `text`) and pairs with `tool_use_id`; modelling them as
// distinct fields avoids ambiguity at marshal time.
ToolUseID string `json:"tool_use_id,omitempty"`
ResultContent string `json:"content,omitempty"`
}
// anthropicResponse is the message envelope of a non-streaming reply.
// predictAnthropicRich decodes the response body into it and reads only
// Content and Usage; the remaining fields are decoded but not consulted
// there. The same shape is reused for the `message` field of stream
// events. Usage is a pointer so an absent usage object stays nil and can
// be told apart from zero counts.
type anthropicResponse struct {
ID string `json:"id"`
Type string `json:"type"`
Role string `json:"role"`
Content []anthropicContentBlock `json:"content"`
Model string `json:"model"`
Usage *anthropicUsage `json:"usage,omitempty"`
}
// anthropicUsage carries the token counters of an Anthropic `usage`
// object. predictAnthropicRich copies InputTokens to Reply.PromptTokens
// and OutputTokens to Reply.Tokens.
type anthropicUsage struct {
InputTokens int `json:"input_tokens"`
OutputTokens int `json:"output_tokens"`
}
// anthropicStreamEvent is the union shape used for every event type we
// process. Type discriminates; only the matching fields are populated.
// content_block_start carries ContentBlock (with id/name for tool_use);
// content_block_delta carries Delta (text or partial_json).
//
// Index is the position of the content block the event refers to; the
// stream reader passes it on as the tool-call index. Usage is the
// top-level usage object, read on message_delta events. Message maps the
// `message` field (presumably the envelope sent with message_start —
// confirm against the Anthropic streaming reference).
type anthropicStreamEvent struct {
Type string `json:"type"`
Index int `json:"index,omitempty"`
ContentBlock *anthropicContentBlock `json:"content_block,omitempty"`
Delta *anthropicStreamDelta `json:"delta,omitempty"`
Message *anthropicResponse `json:"message,omitempty"`
Usage *anthropicUsage `json:"usage,omitempty"`
}
// anthropicStreamDelta is the `delta` object of a content_block_delta
// event. Type selects which payload field is meaningful: "text_delta"
// carries Text, "input_json_delta" carries PartialJSON (a fragment of a
// tool call's JSON arguments).
type anthropicStreamDelta struct {
Type string `json:"type,omitempty"`
Text string `json:"text,omitempty"`
PartialJSON string `json:"partial_json,omitempty"`
}
const (
	// anthropicDefaultMaxTokens is the max_tokens value sent when the
	// caller did not provide one. Anthropic requires the field, so the
	// request would 400 without it; the default is generous but bounded.
	anthropicDefaultMaxTokens int32 = 4096

	// anthropicToolChoiceNone is the tool_choice type that disables
	// tool calling.
	anthropicToolChoiceNone = "none"
)
// Shared fallback payloads for inputs that arrive without the JSON
// object Anthropic insists on. Substituting these keeps one sloppy
// client field from turning the whole request into a 400.
var (
	// emptyObjectSchema replaces a tool definition's missing
	// input_schema: an object schema with no properties.
	emptyObjectSchema = json.RawMessage(`{"type":"object","properties":{}}`)

	// emptyJSONObject replaces a tool_use block's missing input.
	emptyJSONObject = json.RawMessage(`{}`)
)
// buildAnthropicRequest translates OpenAI-shaped predict options into
// the JSON body of an Anthropic Messages call.
//
// Mapping rules:
//   - max_tokens falls back to anthropicDefaultMaxTokens when the caller
//     sent none (or a non-positive value).
//   - At most one of temperature / top_p is forwarded; temperature wins.
//   - system and developer messages are joined with blank lines into the
//     top-level system field; user and assistant turns are copied;
//     tool/function turns become tool_result blocks on a user message.
//     Messages with any other role are dropped.
//   - When no chat message was produced, a non-empty opts.Prompt is sent
//     as a single user turn.
//   - tools and tool_choice are only ever sent together.
//
// stream sets the body's stream flag. The return value is the marshalled
// body, or the json.Marshal error.
func buildAnthropicRequest(opts *pb.PredictOptions, cfg *proxyConfig, stream bool) ([]byte, error) {
	req := anthropicRequest{
		Model:         modelName(cfg, opts),
		MaxTokens:     opts.GetTokens(),
		Stream:        stream,
		StopSequences: opts.GetStopPrompts(),
	}
	if req.MaxTokens <= 0 {
		req.MaxTokens = anthropicDefaultMaxTokens
	}

	// Newer Anthropic models 400 when both temperature and top_p are
	// set ("`temperature` and `top_p` cannot both be specified for
	// this model. Please use only one.") even though their docs only
	// "recommend" picking one. The OpenAI-compatible chat UI almost
	// always sends both with default values, so prefer temperature
	// and drop top_p when both are present.
	if t := opts.GetTemperature(); t != 0 {
		v := float64(t)
		req.Temperature = &v
	} else if p := opts.GetTopP(); p != 0 {
		v := float64(p)
		req.TopP = &v
	}

	req.Tools = convertOpenAITools(opts.GetTools())
	req.ToolChoice = convertOpenAIToolChoice(opts.GetToolChoice())
	// Anthropic rejects tool_choice without tools and older models
	// don't accept {"type":"none"} — collapse both situations to a
	// no-tools request. The tools list can be empty even though the
	// client sent one (unparseable JSON, or every entry lacked a name),
	// so check the converted result rather than the raw option.
	noTools := len(req.Tools) == 0
	choiceNone := req.ToolChoice != nil && req.ToolChoice.Type == anthropicToolChoiceNone
	if noTools || choiceNone {
		req.Tools, req.ToolChoice = nil, nil
	}

	var systemParts []string
	for _, m := range opts.GetMessages() {
		switch m.GetRole() {
		case "system", "developer":
			// "developer" is OpenAI's newer name for system-level
			// instructions; Anthropic has no such role, so both end
			// up in the top-level system field.
			if c := m.GetContent(); c != "" {
				systemParts = append(systemParts, c)
			}
		case "user":
			req.Messages = append(req.Messages, anthropicMessage{
				Role:    "user",
				Content: m.GetContent(),
			})
		case "assistant":
			// Plain text stays a bare string; a turn that made tool
			// calls is sent as content blocks instead.
			var content any = m.GetContent()
			if blocks := assistantBlocks(m); blocks != nil {
				content = blocks
			}
			req.Messages = append(req.Messages, anthropicMessage{
				Role:    "assistant",
				Content: content,
			})
		case "tool", "function":
			req.Messages = appendToolResult(req.Messages, anthropicContentBlock{
				Type:          "tool_result",
				ToolUseID:     m.GetToolCallId(),
				ResultContent: m.GetContent(),
			})
		}
	}
	req.System = strings.Join(systemParts, "\n\n")

	// Completion-style callers send a bare prompt and no messages.
	if len(req.Messages) == 0 && opts.GetPrompt() != "" {
		req.Messages = []anthropicMessage{{Role: "user", Content: opts.GetPrompt()}}
	}
	return json.Marshal(req)
}
// appendToolResult adds a tool_result block to the conversation and
// returns the updated slice. If the last message is a user turn whose
// content is already a block list, the block is appended to that turn;
// otherwise a new user message holding just this block is added.
// Anthropic concatenates consecutive same-role messages on its end,
// but explicit merging keeps the body smaller and the conversation
// strictly alternating — which some upstream filters require.
func appendToolResult(msgs []anthropicMessage, block anthropicContentBlock) []anthropicMessage {
	if len(msgs) > 0 {
		last := &msgs[len(msgs)-1]
		// A user turn with plain string content fails the assertion
		// and is left untouched.
		if blocks, isBlockList := last.Content.([]anthropicContentBlock); isBlockList && last.Role == "user" {
			last.Content = append(blocks, block)
			return msgs
		}
	}
	fresh := anthropicMessage{
		Role:    "user",
		Content: []anthropicContentBlock{block},
	}
	return append(msgs, fresh)
}
// convertOpenAITools turns the OpenAI `tools` array (as a JSON string)
// into Anthropic tool definitions.
//
// An empty string yields nil. Unparseable JSON is logged and yields nil
// as well, so a malformed tools field degrades to a no-tools request.
// Entries without a function name are skipped. A tool whose parameters
// are missing or JSON null gets emptyObjectSchema, because Anthropic
// requires input_schema to be a JSON object.
func convertOpenAITools(toolsJSON string) []anthropicTool {
	if toolsJSON == "" {
		return nil
	}
	var raw []openAITool
	if err := json.Unmarshal([]byte(toolsJSON), &raw); err != nil {
		xlog.Warn("cloud-proxy: anthropic translate: unparseable tools JSON, dropping", "error", err)
		return nil
	}
	tools := make([]anthropicTool, 0, len(raw))
	for _, t := range raw {
		if t.Function.Name == "" {
			continue
		}
		schema := t.Function.Parameters
		// json.RawMessage captures an explicit `"parameters": null` as
		// the four bytes `null`, so a length check alone would forward
		// `"input_schema": null` upstream.
		if len(schema) == 0 || string(schema) == "null" {
			schema = emptyObjectSchema
		}
		tools = append(tools, anthropicTool{
			Name:        t.Function.Name,
			Description: t.Function.Description,
			InputSchema: schema,
		})
	}
	return tools
}
// convertOpenAIToolChoice maps an OpenAI tool_choice (as a JSON string)
// onto Anthropic's tool_choice, returning nil when nothing should be
// sent.
//
// String values: "auto" -> auto, "none" -> none, "required" -> any; any
// other string yields nil. Object values name a specific tool, either in
// the spec form ({type:function, function:{name:X}}) or in the flat
// legacy form ({type:function, name:X}) some clients send; the nested
// name wins when both are present. Unparseable input and unknown object
// shapes are warned and dropped rather than silently treated as auto.
func convertOpenAIToolChoice(toolChoiceJSON string) *anthropicToolChoice {
	if toolChoiceJSON == "" {
		return nil
	}
	raw := []byte(toolChoiceJSON)

	// Try the keyword form first.
	var keyword string
	if json.Unmarshal(raw, &keyword) == nil {
		var kind string
		switch keyword {
		case "auto":
			kind = "auto"
		case "none":
			kind = anthropicToolChoiceNone
		case "required":
			kind = "any"
		default:
			return nil
		}
		return &anthropicToolChoice{Type: kind}
	}

	// Otherwise expect an object naming one tool.
	var shape struct {
		Type     string `json:"type"`
		Name     string `json:"name"`
		Function struct {
			Name string `json:"name"`
		} `json:"function"`
	}
	if err := json.Unmarshal(raw, &shape); err != nil {
		xlog.Warn("cloud-proxy: anthropic translate: unparseable tool_choice, dropping", "error", err)
		return nil
	}

	toolName := shape.Function.Name
	if toolName == "" {
		toolName = shape.Name
	}
	if toolName == "" {
		xlog.Warn("cloud-proxy: anthropic translate: unrecognised tool_choice shape, dropping", "shape", toolChoiceJSON)
		return nil
	}
	return &anthropicToolChoice{Type: "tool", Name: toolName}
}
// openAITool mirrors pkg/functions.Tool but keeps Parameters as
// json.RawMessage so the input_schema passes through verbatim — no
// re-marshal cost, no fidelity loss on exotic schemas.
//
// It is the decode target for each element of the OpenAI `tools` array
// in convertOpenAITools, which reads only the Function fields; Type is
// decoded but not consulted there.
type openAITool struct {
Type string `json:"type"`
Function struct {
Name string `json:"name"`
Description string `json:"description"`
Parameters json.RawMessage `json:"parameters"`
} `json:"function"`
}
// assistantBlocks renders an assistant turn that made tool calls as
// Anthropic content blocks: an optional leading text block followed by
// one tool_use block per call.
//
// It returns nil when the message has no tool calls, or when its
// tool-calls JSON cannot be parsed or is an empty list; the caller then
// sends the turn as plain text.
func assistantBlocks(m *pb.Message) []anthropicContentBlock {
	toolCallsJSON := m.GetToolCalls()
	if toolCallsJSON == "" {
		return nil
	}
	var toolCalls []openAIToolCall
	if err := json.Unmarshal([]byte(toolCallsJSON), &toolCalls); err != nil || len(toolCalls) == 0 {
		return nil
	}

	blocks := make([]anthropicContentBlock, 0, len(toolCalls)+1)
	if text := m.GetContent(); text != "" {
		blocks = append(blocks, anthropicContentBlock{Type: "text", Text: text})
	}
	for _, tc := range toolCalls {
		// OpenAI's arguments are a JSON-encoded string and Anthropic
		// wants the decoded object, so the bytes are forwarded as raw
		// JSON. json.Marshal validates RawMessage, so arguments that
		// are not valid JSON would make the whole request fail to
		// marshal, and valid JSON that is not an object would be
		// rejected upstream. Fall back to {} in both cases, as for
		// empty arguments.
		args := json.RawMessage(tc.Function.Arguments)
		if trimmed := bytes.TrimSpace(args); len(trimmed) == 0 {
			args = emptyJSONObject
		} else if trimmed[0] != '{' || !json.Valid(trimmed) {
			xlog.Warn("cloud-proxy: anthropic translate: tool call arguments are not a JSON object, sending {}", "tool", tc.Function.Name)
			args = emptyJSONObject
		}
		blocks = append(blocks, anthropicContentBlock{
			Type:  "tool_use",
			ID:    tc.ID,
			Name:  tc.Function.Name,
			Input: args,
		})
	}
	return blocks
}
// doAnthropicRequest is the Anthropic counterpart of doOpenAIRequest:
// it POSTs body as JSON to cfg.upstreamURL using the proxy's HTTP client
// and hands back the raw response. The caller owns the response body and
// must close it.
//
// Authentication is delegated to applyAuthHeader (which, for the
// anthropic provider, sets x-api-key and anthropic-version), so those
// headers are not duplicated here. It is skipped when no API key is
// configured.
func (c *CloudProxy) doAnthropicRequest(ctx context.Context, cfg *proxyConfig, body []byte) (*http.Response, error) {
	httpReq, buildErr := http.NewRequestWithContext(ctx, http.MethodPost, cfg.upstreamURL, bytes.NewReader(body))
	if buildErr != nil {
		return nil, fmt.Errorf("cloud-proxy: build request: %w", buildErr)
	}

	headers := httpReq.Header
	headers.Set("Content-Type", "application/json")
	headers.Set("Accept", "*/*")
	if key := cfg.apiKey; key != "" {
		applyAuthHeader(httpReq, cfg.provider, key)
	}

	upstream, doErr := c.client.Do(httpReq)
	if doErr != nil {
		return nil, fmt.Errorf("cloud-proxy: upstream request: %w", doErr)
	}
	return upstream, nil
}
// predictAnthropicRich performs a non-streaming Messages call and folds
// the response into one Reply:
//   - Message is the concatenation of every text block;
//   - each tool_use block becomes a ToolCallDelta, numbered from 0 in
//     order of appearance, with its input passed on as a JSON string;
//   - PromptTokens / Tokens come from the response's usage, when present.
//
// An upstream status of 400 or above is returned as an error that embeds
// up to 1 MiB of the response body.
func (c *CloudProxy) predictAnthropicRich(ctx context.Context, cfg *proxyConfig, opts *pb.PredictOptions) (*pb.Reply, error) {
	payload, err := buildAnthropicRequest(opts, cfg, false)
	if err != nil {
		return nil, fmt.Errorf("cloud-proxy: marshal request: %w", err)
	}

	resp, err := c.doAnthropicRequest(ctx, cfg, payload)
	if err != nil {
		return nil, err
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode >= 400 {
		// Best effort: a failed read still reports the status code.
		errBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
		return nil, fmt.Errorf("cloud-proxy: upstream %d: %s", resp.StatusCode, string(errBody))
	}

	var msg anthropicResponse
	if err := json.NewDecoder(resp.Body).Decode(&msg); err != nil {
		return nil, fmt.Errorf("cloud-proxy: decode response: %w", err)
	}

	var (
		text  strings.Builder
		calls []*pb.ToolCallDelta
	)
	for i := range msg.Content {
		block := &msg.Content[i]
		switch block.Type {
		case "text":
			text.WriteString(block.Text)
		case "tool_use":
			// Input is a structured JSON object; it is handed on as a
			// string to fit the OpenAI-shaped arguments field that
			// downstream consumers expect. len(calls) is the index of
			// the call about to be appended.
			calls = append(calls, newToolCallDelta(len(calls), block.ID, block.Name, string(block.Input)))
		}
	}

	reply := &pb.Reply{Message: []byte(text.String())}
	if usage := msg.Usage; usage != nil {
		reply.PromptTokens = int32(usage.InputTokens)
		reply.Tokens = int32(usage.OutputTokens)
	}
	if len(calls) > 0 {
		reply.ChatDeltas = []*pb.ChatDelta{{ToolCalls: calls}}
	}
	return reply, nil
}
// predictAnthropicStreamRich streams Reply chunks from Anthropic's SSE
// response onto results. Only data: lines are parsed; the event type is
// taken from the `type` field of each JSON payload, and payloads that
// fail to parse are logged at debug level and skipped.
//
// Events acted on:
//   - message_start: remembers the prompt token count from its usage.
//   - content_block_start: for tool_use blocks, emits the call's id and
//     name so consumers can allocate a slot.
//   - content_block_delta: text_delta becomes a text chunk,
//     input_json_delta becomes a tool-argument fragment.
//   - message_delta: emits a usage-only Reply (prompt + output tokens).
//   - error: aborts with an error that carries the raw event payload.
//   - message_stop: ends the stream successfully.
//
// The block index from the wire feeds straight into ToolCallDelta.Index
// so downstream consumers can reassemble multiple parallel tool calls.
// NOTE(review): predictAnthropicRich numbers tool calls densely from 0
// instead; confirm consumers only need the index to be stable within one
// stream.
//
// It returns ctx.Err() when sendReply returns false (presumably because
// ctx was cancelled — confirm against sendReply), the scanner error if
// reading the body fails, and nil when the stream ends normally.
func (c *CloudProxy) predictAnthropicStreamRich(ctx context.Context, cfg *proxyConfig, opts *pb.PredictOptions, results chan<- *pb.Reply) error {
	body, err := buildAnthropicRequest(opts, cfg, true)
	if err != nil {
		return fmt.Errorf("cloud-proxy: marshal request: %w", err)
	}
	resp, err := c.doAnthropicRequest(ctx, cfg, body)
	if err != nil {
		return err
	}
	defer func() { _ = resp.Body.Close() }()
	if resp.StatusCode >= 400 {
		// Best effort: a failed read still reports the status code.
		errBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
		return fmt.Errorf("cloud-proxy: upstream %d: %s", resp.StatusCode, string(errBody))
	}

	// Allow single SSE lines of up to 1 MiB.
	scanner := bufio.NewScanner(resp.Body)
	scanner.Buffer(make([]byte, 0, 64*1024), 1<<20)

	// Prompt tokens are announced at the start of the stream but the
	// usage Reply is only sent at the end, so carry the count across.
	var promptTokens int32

	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "data:") {
			continue
		}
		payload := strings.TrimSpace(strings.TrimPrefix(line, "data:"))
		if payload == "" {
			continue
		}
		var ev anthropicStreamEvent
		if err := json.Unmarshal([]byte(payload), &ev); err != nil {
			xlog.Debug("cloud-proxy: skip malformed SSE chunk", "error", err)
			continue
		}
		switch ev.Type {
		case "message_start":
			if ev.Message != nil && ev.Message.Usage != nil {
				promptTokens = int32(ev.Message.Usage.InputTokens)
			}
		case "content_block_start":
			// tool_use blocks announce id + name here; arguments arrive
			// in subsequent input_json_delta events. Emit a Reply with
			// just the tool_call init fields so consumers can allocate
			// a slot at this index.
			if ev.ContentBlock != nil && ev.ContentBlock.Type == "tool_use" {
				if !sendReply(ctx, results, &pb.Reply{
					ChatDeltas: []*pb.ChatDelta{{ToolCalls: []*pb.ToolCallDelta{
						newToolCallDelta(ev.Index, ev.ContentBlock.ID, ev.ContentBlock.Name, ""),
					}}},
				}) {
					return ctx.Err()
				}
			}
		case "content_block_delta":
			if ev.Delta == nil {
				continue
			}
			switch ev.Delta.Type {
			case "text_delta":
				if ev.Delta.Text == "" {
					continue
				}
				if !sendReply(ctx, results, &pb.Reply{
					Message:    []byte(ev.Delta.Text),
					ChatDeltas: []*pb.ChatDelta{{Content: ev.Delta.Text}},
				}) {
					return ctx.Err()
				}
			case "input_json_delta":
				if ev.Delta.PartialJSON == "" {
					continue
				}
				if !sendReply(ctx, results, &pb.Reply{
					ChatDeltas: []*pb.ChatDelta{{ToolCalls: []*pb.ToolCallDelta{
						newToolCallDelta(ev.Index, "", "", ev.Delta.PartialJSON),
					}}},
				}) {
					return ctx.Err()
				}
			}
		case "message_delta":
			// Anthropic sends final usage in message_delta.usage. Emit
			// a usage-only Reply so the consumer can record totals,
			// including the prompt tokens seen at message_start (or
			// repeated here, when the event carries them).
			if ev.Usage != nil {
				if ev.Usage.InputTokens > 0 {
					promptTokens = int32(ev.Usage.InputTokens)
				}
				if !sendReply(ctx, results, &pb.Reply{
					PromptTokens: promptTokens,
					Tokens:       int32(ev.Usage.OutputTokens),
				}) {
					return ctx.Err()
				}
			}
		case "error":
			// Upstream can fail after the 200 response has started
			// (e.g. overload). Surface it instead of letting the
			// truncated stream look like a successful completion.
			return fmt.Errorf("cloud-proxy: upstream stream error: %s", payload)
		case "message_stop":
			return nil
		}
	}
	return scanner.Err()
}