Files
LocalAI/backend/go/cloud-proxy/provider_openai.go
Richard Palethorpe 6a80e23733 feat(middleware): Model routing, PII filtering, Cloud model proxies (#9802)
Add a routing middleware stack and a cloud-proxy backend.

* cloud-proxy: a Go gRPC backend that forwards OpenAI- and
  Anthropic-shaped chat requests to upstream providers, with an
  optional translate mode (OpenAI request -> Anthropic /v1/messages
  -> OpenAI response) and full tool-calling support.

* routing: admission control, content-aware model routing
  (embedding cache + classifier + rerank + Arch-Router score),
  PII detection/redaction (regex + NER) with streaming filter and
  OpenAI/Anthropic adapters, and a per-user/per-key billing recorder
  backed by GORM or in-memory storage.

* middleware: UsageMiddleware records usage via the billing recorder,
  plus admission, route-model, usage-stamp and trace middlewares.

* observability: BackendTrace ring buffer stores full request bodies
  (capped), MITM proxy emits structured trace events, and router
  classifier decisions surface at /api/router/decide.

* gallery: Arch-Router-1.5B (Q4_K_M and Q8_0).

* UI: cloud-proxy model-editor fields, classifier system-prompt and
  score-normalization config, and a Traces page rendering request
  bodies.

Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash]

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-05-25 09:28:27 +02:00

321 lines
11 KiB
Go

package main
import (
"bufio"
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strings"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/xlog"
)
// OpenAI Chat Completions wire-format types, narrowed to the fields
// translate mode has to carry through the Reply proto: content, role,
// tool_calls (typed so they can be mapped to pb.ToolCallDelta), and the
// sampling parameters copied from PredictOptions.
//
// Provider-specific extensions (logit_bias, function calling beyond
// tool_calls, etc.) are deliberately not modelled; passthrough mode
// serves callers that need full upstream fidelity.

// openAIRequest is the JSON body of a chat-completions request.
//
// Model and Messages are always emitted. Every other field is dropped
// from the payload when empty: the sampling parameters are pointers so
// that nil means "not set", and Tools/ToolChoice hold pre-encoded JSON
// that is embedded as-is.
type openAIRequest struct {
	Model            string          `json:"model"`
	Messages         []openAIMessage `json:"messages"`
	Stream           bool            `json:"stream,omitempty"`
	Temperature      *float64        `json:"temperature,omitempty"`
	TopP             *float64        `json:"top_p,omitempty"`
	MaxTokens        *int32          `json:"max_tokens,omitempty"`
	Stop             []string        `json:"stop,omitempty"`
	FrequencyPenalty *float64        `json:"frequency_penalty,omitempty"`
	PresencePenalty  *float64        `json:"presence_penalty,omitempty"`
	Tools            json.RawMessage `json:"tools,omitempty"`
	ToolChoice       json.RawMessage `json:"tool_choice,omitempty"`
}
// openAIMessage is a single chat message, used both for entries of the
// request's messages array and for the message of a non-streaming
// response choice.
//
// Role is always emitted; the remaining fields are omitted from the
// JSON when empty. Content is modelled as a plain string only.
// ToolCallID and ToolCalls carry the tool_call_id / tool_calls members
// of the wire format.
type openAIMessage struct {
	Role       string           `json:"role"`
	Content    string           `json:"content,omitempty"`
	Name       string           `json:"name,omitempty"`
	ToolCallID string           `json:"tool_call_id,omitempty"`
	ToolCalls  []openAIToolCall `json:"tool_calls,omitempty"`
}
// openAIToolCall models one tool call in either of the two shapes the
// API uses: the non-streaming response shape (complete id, function
// name and arguments) and the streaming-delta shape (sparse fields plus
// an index). The proto's ToolCallDelta absorbs both: the name is set on
// first appearance and, when streaming, arguments arrive incrementally.
//
// Encoding notes: Index is omitted when it is 0 because of omitempty,
// while Function is always emitted — encoding/json's omitempty has no
// effect on struct-typed fields.
type openAIToolCall struct {
	Index    int                `json:"index,omitempty"`
	ID       string             `json:"id,omitempty"`
	Type     string             `json:"type,omitempty"`
	Function openAIFunctionCall `json:"function,omitempty"`
}
// openAIFunctionCall is the "function" member of a tool call: the
// function name and its arguments kept as a string exactly as they
// appear on the wire. Both members are omitted from the JSON when
// empty.
type openAIFunctionCall struct {
	Name      string `json:"name,omitempty"`
	Arguments string `json:"arguments,omitempty"`
}
// openAIChoice is one entry of a non-streaming response's "choices"
// array: its position, the complete assistant message, and the
// finish_reason string reported for it.
type openAIChoice struct {
	Index        int           `json:"index"`
	Message      openAIMessage `json:"message"`
	FinishReason string        `json:"finish_reason"`
}
// openAIResponse is the subset of a non-streaming chat-completions
// response that is decoded: the response id, the choices, and the
// token usage. Usage is a pointer so an absent "usage" member can be
// told apart from one that is present.
type openAIResponse struct {
	ID      string         `json:"id"`
	Choices []openAIChoice `json:"choices"`
	Usage   *openAIUsage   `json:"usage,omitempty"`
}
// openAIStreamChoice is one entry of a streaming chunk's "choices"
// array. Unlike openAIChoice it carries an incremental Delta rather
// than a complete message: a fragment of content, optionally the role,
// and any tool-call fragments.
type openAIStreamChoice struct {
	Index int `json:"index"`
	Delta struct {
		Content   string           `json:"content,omitempty"`
		Role      string           `json:"role,omitempty"`
		ToolCalls []openAIToolCall `json:"tool_calls,omitempty"`
	} `json:"delta"`
	FinishReason string `json:"finish_reason,omitempty"`
}
// openAIStreamChunk is the JSON payload of a single streaming "data:"
// event: the per-choice deltas and, when the upstream includes it, the
// token usage (nil when the chunk has no "usage" member).
type openAIStreamChunk struct {
	Choices []openAIStreamChoice `json:"choices"`
	Usage   *openAIUsage         `json:"usage,omitempty"`
}
// openAIUsage mirrors the "usage" object of a chat-completions
// response: prompt, completion and total token counts. All three
// members are always emitted when the struct is encoded.
type openAIUsage struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}
// buildOpenAIRequest translates pb.PredictOptions into the JSON body of
// an OpenAI Chat Completions request.
//
// The chat history in opts.Messages is used when present. Otherwise a
// non-empty opts.Prompt is wrapped as a single user message so that
// plain /completions-style calls still work in translate mode.
//
// stream sets the request's "stream" flag. The returned error is
// whatever json.Marshal reports for the assembled request.
func buildOpenAIRequest(opts *pb.PredictOptions, cfg *proxyConfig, stream bool) ([]byte, error) {
	var req openAIRequest
	req.Model = modelName(cfg, opts)
	req.Stream = stream
	req.Stop = opts.GetStopPrompts()
	req.Tools = parseRawJSON(opts.GetTools())
	req.ToolChoice = parseRawJSON(opts.GetToolChoice())

	// Sampling parameters: a zero value is treated as "not set" and
	// left out of the payload.
	if temperature := float64(opts.GetTemperature()); temperature != 0 {
		req.Temperature = &temperature
	}
	if topP := float64(opts.GetTopP()); topP != 0 {
		req.TopP = &topP
	}
	if limit := opts.GetTokens(); limit > 0 {
		req.MaxTokens = &limit
	}
	if frequency := float64(opts.GetFrequencyPenalty()); frequency != 0 {
		req.FrequencyPenalty = &frequency
	}
	if presence := float64(opts.GetPresencePenalty()); presence != 0 {
		req.PresencePenalty = &presence
	}

	history := opts.GetMessages()
	if len(history) > 0 {
		// Only allocate when there is something to copy, so a request
		// without messages keeps a nil slice.
		req.Messages = make([]openAIMessage, 0, len(history))
	}
	for _, in := range history {
		out := openAIMessage{
			Role:       in.GetRole(),
			Content:    in.GetContent(),
			Name:       in.GetName(),
			ToolCallID: in.GetToolCallId(),
		}
		// tool_calls from an earlier assistant turn arrive as a JSON
		// string and are forwarded as-is. Decoding is best-effort: the
		// error is deliberately ignored and whatever was decoded is
		// kept, rather than failing the whole request.
		if encoded := in.GetToolCalls(); encoded != "" {
			_ = json.Unmarshal([]byte(encoded), &out.ToolCalls)
		}
		req.Messages = append(req.Messages, out)
	}

	// Plain Prompt request (no Messages array): LocalAI templating may
	// have produced a flat prompt, so rewrap it as a single user
	// message that the upstream chat endpoint accepts.
	if len(req.Messages) == 0 {
		if prompt := opts.GetPrompt(); prompt != "" {
			req.Messages = []openAIMessage{{Role: "user", Content: prompt}}
		}
	}

	return json.Marshal(req)
}
// modelName returns the model identifier to send upstream. A non-empty
// upstream_model from the proxy config takes precedence (operator
// override); otherwise the local model name stored in the config is
// used. Operators set upstream_model to map a LocalAI alias (e.g.
// "claude-strict") to the upstream's canonical name (e.g.
// "claude-3-5-sonnet-20241022").
//
// The PredictOptions argument is currently unused.
func modelName(cfg *proxyConfig, _ *pb.PredictOptions) string {
	if override := cfg.upstreamModel; override != "" {
		return override
	}
	return cfg.localModel
}
// parseRawJSON validates s as JSON and returns it as a json.RawMessage
// that can be embedded verbatim in the upstream request body. An empty
// or syntactically invalid string yields nil, which makes the enclosing
// omitempty field disappear from the payload.
func parseRawJSON(s string) json.RawMessage {
	var raw json.RawMessage
	if s == "" || json.Unmarshal([]byte(s), &raw) != nil {
		return nil
	}
	return raw
}
// doOpenAIRequest POSTs body as JSON to cfg.upstreamURL using the
// proxy's HTTP client and hands back the raw response.
//
// The auth header is applied only when an API key is configured. No
// status-code inspection happens here: on a nil error the caller owns
// the response, must close its body, and must check the status itself.
// Errors from building or sending the request are wrapped with a
// "cloud-proxy:" prefix.
func (c *CloudProxy) doOpenAIRequest(ctx context.Context, cfg *proxyConfig, body []byte) (*http.Response, error) {
	payload := bytes.NewReader(body)
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, cfg.upstreamURL, payload)
	if err != nil {
		return nil, fmt.Errorf("cloud-proxy: build request: %w", err)
	}

	headers := httpReq.Header
	headers.Set("Content-Type", "application/json")
	headers.Set("Accept", "*/*")
	if key := cfg.apiKey; key != "" {
		applyAuthHeader(httpReq, cfg.provider, key)
	}

	upstream, err := c.client.Do(httpReq)
	if err != nil {
		return nil, fmt.Errorf("cloud-proxy: upstream request: %w", err)
	}
	return upstream, nil
}
// predictOpenAIRich is the non-streaming translate path. It sends the
// translated request upstream and converts the first choice of the
// response into a *pb.Reply carrying the assistant content, any tool
// calls, and the token usage when the upstream reports it.
//
// An error is returned when the request cannot be built or sent, when
// the upstream answers with a status >= 400 (up to 1 MiB of the error
// body is included in the message), when the response body cannot be
// decoded, or when the response contains no choices.
func (c *CloudProxy) predictOpenAIRich(ctx context.Context, cfg *proxyConfig, opts *pb.PredictOptions) (*pb.Reply, error) {
	body, err := buildOpenAIRequest(opts, cfg, false)
	if err != nil {
		return nil, fmt.Errorf("cloud-proxy: marshal request: %w", err)
	}
	resp, err := c.doOpenAIRequest(ctx, cfg, body)
	if err != nil {
		return nil, err
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode >= 400 {
		// Best-effort read of the error body for diagnostics; a read
		// failure only shortens the message, so it is ignored.
		errBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
		return nil, fmt.Errorf("cloud-proxy: upstream %d: %s", resp.StatusCode, string(errBody))
	}

	var parsed openAIResponse
	if err := json.NewDecoder(resp.Body).Decode(&parsed); err != nil {
		return nil, fmt.Errorf("cloud-proxy: decode response: %w", err)
	}
	if len(parsed.Choices) == 0 {
		return nil, errors.New("cloud-proxy: upstream returned no choices")
	}

	// Only the first choice is translated.
	choice := parsed.Choices[0]
	reply := &pb.Reply{
		Message: []byte(choice.Message.Content),
	}
	if parsed.Usage != nil {
		reply.PromptTokens = int32(parsed.Usage.PromptTokens)
		reply.Tokens = int32(parsed.Usage.CompletionTokens)
	}

	if len(choice.Message.ToolCalls) > 0 {
		// Non-streaming: a single ChatDelta carries the full tool-call
		// set, so downstream consumers don't need to assemble
		// streaming deltas.
		//
		// The index comes from the position in the array rather than
		// tc.Index: the "index" member belongs to the streaming-delta
		// shape and is normally absent from non-streaming responses,
		// where it would decode as 0 for every entry and make parallel
		// tool calls indistinguishable by index.
		delta := &pb.ChatDelta{}
		for i, tc := range choice.Message.ToolCalls {
			delta.ToolCalls = append(delta.ToolCalls,
				newToolCallDelta(i, tc.ID, tc.Function.Name, tc.Function.Arguments))
		}
		reply.ChatDeltas = []*pb.ChatDelta{delta}
	}
	return reply, nil
}
// predictOpenAIStreamRich is the streaming translate path. It reads the
// upstream SSE body line by line and sends one *pb.Reply per choice
// delta on results: a content delta sets Message and
// ChatDeltas[0].Content, tool-call deltas are appended to
// ChatDeltas[0].ToolCalls. Whenever a chunk carries a usage object, a
// usage-only Reply (PromptTokens/Tokens) is sent after that chunk's
// deltas, whether or not the chunk also contains choices.
//
// The stream ends successfully on "[DONE]" or when the body is
// exhausted. An error is returned when the request cannot be built or
// sent, when the upstream answers with a status >= 400, when reading
// the body fails (including a line longer than the 1 MiB scanner
// limit), or — as ctx.Err() — when sendReply reports that a Reply could
// not be delivered.
func (c *CloudProxy) predictOpenAIStreamRich(ctx context.Context, cfg *proxyConfig, opts *pb.PredictOptions, results chan<- *pb.Reply) error {
	body, err := buildOpenAIRequest(opts, cfg, true)
	if err != nil {
		return fmt.Errorf("cloud-proxy: marshal request: %w", err)
	}
	resp, err := c.doOpenAIRequest(ctx, cfg, body)
	if err != nil {
		return err
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode >= 400 {
		// Best-effort read of the error body for diagnostics; a read
		// failure only shortens the message, so it is ignored.
		errBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
		return fmt.Errorf("cloud-proxy: upstream %d: %s", resp.StatusCode, string(errBody))
	}

	scanner := bufio.NewScanner(resp.Body)
	// Allow single SSE lines of up to 1 MiB; the default token limit is
	// too small for large tool-call argument fragments.
	scanner.Buffer(make([]byte, 0, 64*1024), 1<<20)
	for scanner.Scan() {
		line := scanner.Text()
		// Only "data:" lines carry payloads; comments, event names and
		// the blank separators between events are skipped.
		if !strings.HasPrefix(line, "data:") {
			continue
		}
		payload := strings.TrimSpace(strings.TrimPrefix(line, "data:"))
		if payload == "" {
			// An empty data line is not an end-of-stream marker; skip
			// it instead of silently dropping the rest of the stream.
			continue
		}
		if payload == "[DONE]" {
			return nil
		}

		var chunk openAIStreamChunk
		if err := json.Unmarshal([]byte(payload), &chunk); err != nil {
			xlog.Debug("cloud-proxy: skip malformed SSE chunk", "error", err)
			continue
		}

		for _, ch := range chunk.Choices {
			reply := &pb.Reply{}
			if ch.Delta.Content != "" {
				reply.Message = []byte(ch.Delta.Content)
				reply.ChatDeltas = []*pb.ChatDelta{{Content: ch.Delta.Content}}
			}
			if len(ch.Delta.ToolCalls) > 0 {
				if len(reply.ChatDeltas) == 0 {
					reply.ChatDeltas = []*pb.ChatDelta{{}}
				}
				for _, tc := range ch.Delta.ToolCalls {
					reply.ChatDeltas[0].ToolCalls = append(reply.ChatDeltas[0].ToolCalls,
						newToolCallDelta(tc.Index, tc.ID, tc.Function.Name, tc.Function.Arguments))
				}
			}
			// Deltas with neither content nor tool calls (role-only or
			// finish_reason-only) produce no Reply.
			if reply.Message == nil && len(reply.ChatDeltas) == 0 {
				continue
			}
			if !sendReply(ctx, results, reply) {
				return ctx.Err()
			}
		}

		// Usage may arrive in a dedicated frame with no choices or ride
		// along with the final choice frame, depending on the upstream.
		// Emit it as a usage-only Reply in both cases so the consumer
		// sees the totals.
		if chunk.Usage != nil {
			if !sendReply(ctx, results, &pb.Reply{
				PromptTokens: int32(chunk.Usage.PromptTokens),
				Tokens:       int32(chunk.Usage.CompletionTokens),
			}) {
				return ctx.Err()
			}
		}
	}
	if err := scanner.Err(); err != nil {
		return fmt.Errorf("cloud-proxy: read upstream stream: %w", err)
	}
	return nil
}