mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-30 11:36:31 -04:00
Add a routing middleware stack and a cloud-proxy backend. * cloud-proxy: a Go gRPC backend that forwards OpenAI- and Anthropic-shaped chat requests to upstream providers, with an optional translate mode (OpenAI request -> Anthropic /v1/messages -> OpenAI response) and full tool-calling support. * routing: admission control, content-aware model routing (embedding cache + classifier + rerank + Arch-Router score), PII detection/redaction (regex + NER) with streaming filter and OpenAI/Anthropic adapters, and a per-user/per-key billing recorder backed by GORM or in-memory storage. * middleware: UsageMiddleware records usage via the billing recorder, plus admission, route-model, usage-stamp and trace middlewares. * observability: BackendTrace ring buffer stores full request bodies (capped), MITM proxy emits structured trace events, and router classifier decisions surface at /api/router/decide. * gallery: Arch-Router-1.5B (Q4_K_M and Q8_0). * UI: cloud-proxy model-editor fields, classifier system-prompt and score-normalization config, and a Traces page rendering request bodies. Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash] Signed-off-by: Richard Palethorpe <io@richiejp.com>
160 lines
5.6 KiB
Go
160 lines
5.6 KiB
Go
package backend
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/mudler/LocalAI/core/config"
|
|
"github.com/mudler/LocalAI/core/trace"
|
|
"github.com/mudler/LocalAI/pkg/grpc"
|
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
|
model "github.com/mudler/LocalAI/pkg/model"
|
|
)
|
|
|
|
// ScoreOptions tunes one Score request. ModelScore forwards both flags
// verbatim to the backend in the pb.ScoreRequest it builds.
type ScoreOptions struct {
	// IncludeTokenLogprobs requests per-token log-probability detail for
	// every candidate. It is off in the zero value: the joint LogProb is
	// sufficient for ranking, so only callers that need calibration or
	// entropy over the token stream should enable it.
	IncludeTokenLogprobs bool

	// LengthNormalize requests that the joint log-prob be divided by the
	// candidate's token count. This matters when candidates differ in
	// length, because otherwise longer candidates score lower by default.
	LengthNormalize bool
}
|
|
|
|
// CandidateScore is the per-candidate result. Mirrors pb.CandidateScore
|
|
// but avoids leaking the proto type to consumers.
|
|
type CandidateScore struct {
|
|
LogProb float64
|
|
LengthNormalizedLogProb float64
|
|
NumTokens int
|
|
Tokens []TokenLogProb
|
|
}
|
|
|
|
// TokenLogProb pairs one token of a scored candidate with the
// log-probability reported for it.
type TokenLogProb struct {
	// Token is the token text as returned by the backend.
	Token string

	// LogProb is the log-probability reported for Token.
	LogProb float64
}
|
|
|
|
// Scorer evaluates a model's joint log-probability of each candidate
|
|
// continuation given a shared prompt. Implemented by NewScorer over a
|
|
// model-loaded backend; the router's score classifier consumes this
|
|
// for multi-label policy selection.
|
|
type Scorer interface {
|
|
Score(ctx context.Context, prompt string, candidates []string) ([]CandidateScore, error)
|
|
}
|
|
|
|
// NewScorer binds (loader, modelConfig, appConfig) into a Scorer. The
|
|
// underlying backend is resolved lazily on the first Score call.
|
|
// Returns nil only as a contract violation — callers that need to
|
|
// detect "model not loadable" should look up the config first.
|
|
func NewScorer(loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) Scorer {
|
|
return &modelScorer{loader: loader, modelConfig: modelConfig, appConfig: appConfig}
|
|
}
|
|
|
|
type modelScorer struct {
|
|
loader *model.ModelLoader
|
|
modelConfig config.ModelConfig
|
|
appConfig *config.ApplicationConfig
|
|
}
|
|
|
|
func (m *modelScorer) Score(ctx context.Context, prompt string, candidates []string) ([]CandidateScore, error) {
|
|
fn, err := ModelScore(prompt, candidates, ScoreOptions{LengthNormalize: true}, m.loader, m.modelConfig, m.appConfig)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return fn(ctx)
|
|
}
|
|
|
|
// ModelScore loads the backend for modelConfig and returns a closure
|
|
// that scores `candidates` against `prompt`. The closure is bound to
|
|
// the loaded model so callers can keep it around for repeat scoring
|
|
// within the same request without re-resolving the backend.
|
|
func ModelScore(prompt string, candidates []string, opts ScoreOptions, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (func(ctx context.Context) ([]CandidateScore, error), error) {
|
|
modelOpts := ModelOptions(modelConfig, appConfig)
|
|
inferenceModel, err := loader.Load(modelOpts...)
|
|
if err != nil {
|
|
recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
|
|
return nil, err
|
|
}
|
|
b, ok := inferenceModel.(grpc.Backend)
|
|
if !ok {
|
|
return nil, fmt.Errorf("scoring not supported by backend %q", modelConfig.Backend)
|
|
}
|
|
if len(candidates) == 0 {
|
|
return nil, fmt.Errorf("Score: candidates must be non-empty")
|
|
}
|
|
return func(ctx context.Context) ([]CandidateScore, error) {
|
|
// Surface score calls in the Traces UI alongside the LLM calls
|
|
// they typically gate (router classifier, eval scoring). Without
|
|
// this, a router-classified request shows only the downstream LLM
|
|
// trace with no record of the classification that picked it.
|
|
var startTime time.Time
|
|
if appConfig.EnableTracing {
|
|
trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
|
|
startTime = time.Now()
|
|
}
|
|
resp, err := b.Score(ctx, &pb.ScoreRequest{
|
|
Prompt: prompt,
|
|
Candidates: candidates,
|
|
IncludeTokenLogprobs: opts.IncludeTokenLogprobs,
|
|
LengthNormalize: opts.LengthNormalize,
|
|
})
|
|
results := scoreResponseToCandidates(resp, opts.IncludeTokenLogprobs)
|
|
if appConfig.EnableTracing {
|
|
errStr := ""
|
|
if err != nil {
|
|
errStr = err.Error()
|
|
}
|
|
trace.RecordBackendTrace(trace.BackendTrace{
|
|
Timestamp: startTime,
|
|
Duration: time.Since(startTime),
|
|
Type: trace.BackendTraceScore,
|
|
ModelName: modelConfig.Name,
|
|
Backend: modelConfig.Backend,
|
|
Summary: trace.TruncateString(prompt, 200),
|
|
Error: errStr,
|
|
Data: map[string]any{
|
|
// Copy candidates so the trace buffer doesn't pin a
|
|
// caller-owned slice for the lifetime of the ring.
|
|
"candidates": append([]string(nil), candidates...),
|
|
"results": results,
|
|
},
|
|
})
|
|
}
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return results, nil
|
|
}, nil
|
|
}
|
|
|
|
// scoreResponseToCandidates converts the wire-format pb response into
|
|
// the value type consumed by callers. Extracted to keep ModelScore's
|
|
// closure trivial and so the conversion can be unit-tested without a
|
|
// real backend.
|
|
func scoreResponseToCandidates(resp *pb.ScoreResponse, includeTokens bool) []CandidateScore {
|
|
if resp == nil {
|
|
return nil
|
|
}
|
|
out := make([]CandidateScore, len(resp.Candidates))
|
|
for i, c := range resp.Candidates {
|
|
cs := CandidateScore{
|
|
LogProb: c.LogProb,
|
|
LengthNormalizedLogProb: c.LengthNormalizedLogProb,
|
|
NumTokens: int(c.NumTokens),
|
|
}
|
|
if includeTokens && len(c.Tokens) > 0 {
|
|
cs.Tokens = make([]TokenLogProb, len(c.Tokens))
|
|
for j, t := range c.Tokens {
|
|
cs.Tokens[j] = TokenLogProb{Token: t.Token, LogProb: t.LogProb}
|
|
}
|
|
}
|
|
out[i] = cs
|
|
}
|
|
return out
|
|
}
|