mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-03 13:56:46 -04:00
Add a routing middleware stack and a cloud-proxy backend. * cloud-proxy: a Go gRPC backend that forwards OpenAI- and Anthropic-shaped chat requests to upstream providers, with an optional translate mode (OpenAI request -> Anthropic /v1/messages -> OpenAI response) and full tool-calling support. * routing: admission control, content-aware model routing (embedding cache + classifier + rerank + Arch-Router score), PII detection/redaction (regex + NER) with streaming filter and OpenAI/Anthropic adapters, and a per-user/per-key billing recorder backed by GORM or in-memory storage. * middleware: UsageMiddleware records usage via the billing recorder, plus admission, route-model, usage-stamp and trace middlewares. * observability: BackendTrace ring buffer stores full request bodies (capped), MITM proxy emits structured trace events, and router classifier decisions surface at /api/router/decide. * gallery: Arch-Router-1.5B (Q4_K_M and Q8_0). * UI: cloud-proxy model-editor fields, classifier system-prompt and score-normalization config, and a Traces page rendering request bodies. Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash] Signed-off-by: Richard Palethorpe <io@richiejp.com>
82 lines
2.6 KiB
Go
82 lines
2.6 KiB
Go
package middleware
|
|
|
|
import (
|
|
"context"
|
|
"crypto/rand"
|
|
"encoding/hex"
|
|
"fmt"
|
|
"net/http"
|
|
"strconv"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/labstack/echo/v4"
|
|
"github.com/mudler/LocalAI/core/config"
|
|
"github.com/mudler/LocalAI/core/services/routing/admission"
|
|
"github.com/mudler/LocalAI/core/services/routing/pii"
|
|
)
|
|
|
|
// AdmissionControl runs after RouteModel so the limit applies to the
|
|
// SERVED model — a router fanout that lands on a saturated downstream
|
|
// model gets rejected even though the requested router-model has slack.
|
|
//
|
|
// On reject: HTTP 503, Retry-After header, error JSON. An audit row
|
|
// goes into the shared event store under KindAdmission so admins see
|
|
// rejection rates alongside PII and proxy events.
|
|
//
|
|
// Models without limits.max_concurrent (the common case) hit a fast
|
|
// no-op path — Acquire returns immediately for max <= 0.
|
|
func AdmissionControl(limiter *admission.Limiter, events pii.EventStore) echo.MiddlewareFunc {
|
|
return func(next echo.HandlerFunc) echo.HandlerFunc {
|
|
return func(c echo.Context) error {
|
|
cfg, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
|
|
if !ok || cfg == nil {
|
|
return next(c)
|
|
}
|
|
max := cfg.Limits.MaxConcurrent
|
|
release, ok := limiter.Acquire(cfg.Name, max)
|
|
if !ok {
|
|
retryAfter := admission.RetryAfter(cfg.Limits.RetryAfterSeconds)
|
|
recordAdmissionRejection(events, cfg.Name, retryAfter)
|
|
c.Response().Header().Set("Retry-After", strconv.Itoa(int(retryAfter.Seconds())))
|
|
return c.JSON(http.StatusServiceUnavailable, map[string]any{
|
|
"error": map[string]any{
|
|
"type": "admission_rejected",
|
|
"message": fmt.Sprintf("model %q is at capacity (max_concurrent=%d); retry after %s", cfg.Name, max, retryAfter),
|
|
},
|
|
})
|
|
}
|
|
defer release()
|
|
return next(c)
|
|
}
|
|
}
|
|
}
|
|
|
|
// admissionEventSeq scopes IDs across the process so rapid
|
|
// rejections under load get unique row IDs without coordinating
|
|
// with the rest of the event-store ID schemes.
|
|
var admissionEventSeq atomic.Uint64
|
|
|
|
func recordAdmissionRejection(events pii.EventStore, modelName string, retryAfter time.Duration) {
|
|
if events == nil {
|
|
return
|
|
}
|
|
statusCode := http.StatusServiceUnavailable
|
|
durMS := retryAfter.Milliseconds()
|
|
id := fmt.Sprintf("adm_%d_%s", admissionEventSeq.Add(1), randHex(4))
|
|
_ = events.Record(context.Background(), pii.PIIEvent{
|
|
ID: id,
|
|
Kind: pii.KindAdmission,
|
|
Host: modelName,
|
|
StatusCode: statusCode,
|
|
DurationMS: durMS,
|
|
CreatedAt: time.Now().UTC(),
|
|
})
|
|
}
|
|
|
|
func randHex(n int) string {
|
|
b := make([]byte, n)
|
|
_, _ = rand.Read(b)
|
|
return hex.EncodeToString(b)
|
|
}
|