Files
LocalAI/core/backend/score.go
Richard Palethorpe 6a80e23733 feat(middleware): Model routing, PII filtering, Cloud model proxies (#9802)
Add a routing middleware stack and a cloud-proxy backend.

* cloud-proxy: a Go gRPC backend that forwards OpenAI- and
  Anthropic-shaped chat requests to upstream providers, with an
  optional translate mode (OpenAI request -> Anthropic /v1/messages
  -> OpenAI response) and full tool-calling support.

* routing: admission control, content-aware model routing
  (embedding cache + classifier + rerank + Arch-Router score),
  PII detection/redaction (regex + NER) with streaming filter and
  OpenAI/Anthropic adapters, and a per-user/per-key billing recorder
  backed by GORM or in-memory storage.

* middleware: UsageMiddleware records usage via the billing recorder,
  plus admission, route-model, usage-stamp and trace middlewares.

* observability: BackendTrace ring buffer stores full request bodies
  (capped), MITM proxy emits structured trace events, and router
  classifier decisions surface at /api/router/decide.

* gallery: Arch-Router-1.5B (Q4_K_M and Q8_0).

* UI: cloud-proxy model-editor fields, classifier system-prompt and
  score-normalization config, and a Traces page rendering request
  bodies.

Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash]

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-05-25 09:28:27 +02:00

160 lines
5.6 KiB
Go

package backend
import (
"context"
"fmt"
"time"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/trace"
"github.com/mudler/LocalAI/pkg/grpc"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
model "github.com/mudler/LocalAI/pkg/model"
)
// ScoreOptions controls a single Score request. Both flags are forwarded
// unchanged to the backend in the pb.ScoreRequest built by ModelScore.
type ScoreOptions struct {
// IncludeTokenLogprobs returns per-token log-probability detail for
// each candidate. Off by default — the joint LogProb is enough for
// ranking; callers that need calibration / entropy over the token
// stream opt in. When false, CandidateScore.Tokens is left nil even
// if the backend response happens to carry token entries.
IncludeTokenLogprobs bool
// LengthNormalize divides the joint log-prob by the candidate's
// token count. Useful when comparing candidates of different
// lengths — without it, longer candidates score lower by default.
// NOTE(review): the division itself happens in the backend, not in
// this package — confirm exact semantics against the backend.
LengthNormalize bool
}
// CandidateScore is the per-candidate result. Mirrors pb.CandidateScore
// but avoids leaking the proto type to consumers. Values are copied
// field-for-field from the backend response by scoreResponseToCandidates.
type CandidateScore struct {
// LogProb is the candidate's log-probability as reported by the backend.
LogProb float64
// LengthNormalizedLogProb is the backend-reported length-normalized
// variant of LogProb.
LengthNormalizedLogProb float64
// NumTokens is the backend-reported token count, widened to int.
NumTokens int
// Tokens holds per-token detail. It is nil unless token log-probs were
// requested and the backend returned at least one token entry.
Tokens []TokenLogProb
}
// TokenLogProb is one entry of CandidateScore.Tokens: a token string and
// the log-probability the backend reported for it.
type TokenLogProb struct {
// Token is the token string as returned by the backend.
Token string
// LogProb is the log-probability reported for Token.
LogProb float64
}
// Scorer evaluates a model's joint log-probability of each candidate
// continuation given a shared prompt. Implemented by NewScorer over a
// model-loaded backend; the router's score classifier consumes this
// for multi-label policy selection.
type Scorer interface {
// Score scores every entry of candidates against prompt and returns the
// per-candidate results, or an error if scoring could not be performed.
// NOTE(review): results are presumably index-aligned with candidates;
// that ordering is decided by the backend — confirm before relying on it.
Score(ctx context.Context, prompt string, candidates []string) ([]CandidateScore, error)
}
// NewScorer packages (loader, modelConfig, appConfig) into a Scorer.
// Nothing is loaded here: the backend is resolved through ModelScore
// each time Score is called, so load failures surface from Score rather
// than from construction. The returned Scorer is always non-nil; callers
// that need to detect "model not loadable" up front should look up the
// config first.
func NewScorer(loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) Scorer {
	scorer := &modelScorer{
		loader:      loader,
		modelConfig: modelConfig,
		appConfig:   appConfig,
	}
	return scorer
}
// modelScorer is the Scorer implementation returned by NewScorer. It only
// stores the arguments ModelScore needs and hands them over on each Score
// call.
type modelScorer struct {
// loader is used by ModelScore to obtain the backend.
loader *model.ModelLoader
// modelConfig identifies and configures the model to score with.
modelConfig config.ModelConfig
// appConfig carries application-wide settings (e.g. tracing flags).
appConfig *config.ApplicationConfig
}
// Score implements Scorer. It delegates to ModelScore with length
// normalization enabled and token-level detail disabled, then runs the
// returned closure with ctx. Any error from resolving the backend or from
// the scoring call itself is returned unchanged.
func (m *modelScorer) Score(ctx context.Context, prompt string, candidates []string) ([]CandidateScore, error) {
	opts := ScoreOptions{LengthNormalize: true}

	run, loadErr := ModelScore(prompt, candidates, opts, m.loader, m.modelConfig, m.appConfig)
	if loadErr != nil {
		return nil, loadErr
	}

	return run(ctx)
}
// ModelScore validates the request, loads the backend for modelConfig and
// returns a closure that scores `candidates` against `prompt`. The closure
// is bound to the loaded model so callers can keep it around for repeat
// scoring within the same request without re-resolving the backend.
//
// Errors returned directly (no closure is produced):
//   - candidates is empty (checked before any model is loaded);
//   - the loader fails to load the model (the failure is also reported via
//     recordModelLoadFailure);
//   - the loaded model does not satisfy grpc.Backend.
//
// The closure forwards prompt, candidates and opts to the backend's Score
// RPC and converts the response to []CandidateScore. When
// appConfig.EnableTracing is set, every invocation — successful or not —
// is recorded as a BackendTraceScore entry. On RPC failure the closure
// returns (nil, err).
func ModelScore(prompt string, candidates []string, opts ScoreOptions, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (func(ctx context.Context) ([]CandidateScore, error), error) {
	// Reject an empty candidate list up front: loading a model can be
	// expensive, and there is no point paying for it (or recording a load
	// failure) on behalf of a request that can never be scored.
	if len(candidates) == 0 {
		return nil, fmt.Errorf("Score: candidates must be non-empty")
	}

	modelOpts := ModelOptions(modelConfig, appConfig)
	inferenceModel, err := loader.Load(modelOpts...)
	if err != nil {
		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
		return nil, err
	}

	b, ok := inferenceModel.(grpc.Backend)
	if !ok {
		return nil, fmt.Errorf("scoring not supported by backend %q", modelConfig.Backend)
	}

	return func(ctx context.Context) ([]CandidateScore, error) {
		// Surface score calls in the Traces UI alongside the LLM calls
		// they typically gate (router classifier, eval scoring). Without
		// this, a router-classified request shows only the downstream LLM
		// trace with no record of the classification that picked it.
		var startTime time.Time
		if appConfig.EnableTracing {
			trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
			startTime = time.Now()
		}

		resp, err := b.Score(ctx, &pb.ScoreRequest{
			Prompt:               prompt,
			Candidates:           candidates,
			IncludeTokenLogprobs: opts.IncludeTokenLogprobs,
			LengthNormalize:      opts.LengthNormalize,
		})

		// Converted before the error check so the trace below can carry
		// whatever the backend returned; a nil resp converts to nil.
		results := scoreResponseToCandidates(resp, opts.IncludeTokenLogprobs)

		if appConfig.EnableTracing {
			errStr := ""
			if err != nil {
				errStr = err.Error()
			}
			trace.RecordBackendTrace(trace.BackendTrace{
				Timestamp: startTime,
				Duration:  time.Since(startTime),
				Type:      trace.BackendTraceScore,
				ModelName: modelConfig.Name,
				Backend:   modelConfig.Backend,
				Summary:   trace.TruncateString(prompt, 200),
				Error:     errStr,
				Data: map[string]any{
					// Copy candidates so the trace buffer doesn't pin a
					// caller-owned slice for the lifetime of the ring.
					"candidates": append([]string(nil), candidates...),
					"results":    results,
				},
			})
		}

		if err != nil {
			return nil, err
		}
		return results, nil
	}, nil
}
// scoreResponseToCandidates translates a backend pb.ScoreResponse into the
// package-level []CandidateScore handed to callers. It lives outside
// ModelScore's closure so that closure stays small and the mapping can be
// unit-tested without a running backend.
//
// A nil resp yields a nil slice; otherwise the result has exactly one
// element per response candidate, in response order. Per-token detail is
// copied only when includeTokens is true and the candidate carries at
// least one token entry; Tokens is nil in every other case.
func scoreResponseToCandidates(resp *pb.ScoreResponse, includeTokens bool) []CandidateScore {
	if resp == nil {
		return nil
	}

	converted := make([]CandidateScore, len(resp.Candidates))
	for idx := range resp.Candidates {
		src := resp.Candidates[idx]
		dst := &converted[idx]

		dst.LogProb = src.LogProb
		dst.LengthNormalizedLogProb = src.LengthNormalizedLogProb
		dst.NumTokens = int(src.NumTokens)

		// Skip the token copy unless it was asked for and there is
		// something to copy.
		if !includeTokens || len(src.Tokens) == 0 {
			continue
		}

		dst.Tokens = make([]TokenLogProb, len(src.Tokens))
		for k := range src.Tokens {
			tok := src.Tokens[k]
			dst.Tokens[k] = TokenLogProb{Token: tok.Token, LogProb: tok.LogProb}
		}
	}
	return converted
}