Files
LocalAI/core/schema/localai.go
Richard Palethorpe 6a80e23733 feat(middleware): Model routing, PII filtering, Cloud model proxies (#9802)
Add a routing middleware stack and a cloud-proxy backend.

* cloud-proxy: a Go gRPC backend that forwards OpenAI- and
  Anthropic-shaped chat requests to upstream providers, with an
  optional translate mode (OpenAI request -> Anthropic /v1/messages
  -> OpenAI response) and full tool-calling support.

* routing: admission control, content-aware model routing
  (embedding cache + classifier + rerank + Arch-Router score),
  PII detection/redaction (regex + NER) with streaming filter and
  OpenAI/Anthropic adapters, and a per-user/per-key billing recorder
  backed by GORM or in-memory storage.

* middleware: UsageMiddleware records usage via the billing recorder,
  plus admission, route-model, usage-stamp and trace middlewares.

* observability: BackendTrace ring buffer stores full request bodies
  (capped), MITM proxy emits structured trace events, and router
  classifier decisions surface at /api/router/decide.

* gallery: Arch-Router-1.5B (Q4_K_M and Q8_0).

* UI: cloud-proxy model-editor fields, classifier system-prompt and
  score-normalization config, and a Traces page rendering request
  bodies.

Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash]

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-05-25 09:28:27 +02:00

528 lines
20 KiB
Go

package schema
import (
"encoding/json"
"time"
gopsutil "github.com/shirou/gopsutil/v3/process"
)
// BackendMonitorRequest carries only the fields of the embedded
// BasicModelRequest (declared outside this view); it adds none of its own.
type BackendMonitorRequest struct {
BasicModelRequest
}
// TokenMetricsRequest carries only the fields of the embedded
// BasicModelRequest (declared outside this view); it adds none of its own.
type TokenMetricsRequest struct {
BasicModelRequest
}
// BackendMonitorResponse reports memory and CPU figures. None of the fields
// carry struct tags, so encoders fall back to their default field naming.
type BackendMonitorResponse struct {
// MemoryInfo is a gopsutil process memory snapshot. It is a pointer and may
// therefore be nil.
MemoryInfo *gopsutil.MemoryInfoStat
MemoryPercent float32
CPUPercent float64
}
// GalleryResponse carries an identifier and a status URL plus optional size
// estimates. The four Estimated* fields are tagged omitempty, so they are left
// out of the JSON when zero or empty.
type GalleryResponse struct {
// ID is serialized under the JSON key "uuid", not "id".
ID string `json:"uuid"`
// StatusURL is serialized under the JSON key "status".
StatusURL string `json:"status"`
EstimatedVRAMBytes uint64 `json:"estimated_vram_bytes,omitempty"`
EstimatedVRAMDisplay string `json:"estimated_vram_display,omitempty"`
EstimatedSizeBytes uint64 `json:"estimated_size_bytes,omitempty"`
EstimatedSizeDisplay string `json:"estimated_size_display,omitempty"`
}
// VideoRequest is the request body describing a video to generate. It embeds
// BasicModelRequest (declared outside this view). Only Seconds, Size and
// InputReference are tagged omitempty; every other field is always emitted
// when the struct is marshalled. Note that Seconds is a string, not a number.
type VideoRequest struct {
BasicModelRequest
Prompt string `json:"prompt" yaml:"prompt"` // text description of the video to generate
NegativePrompt string `json:"negative_prompt" yaml:"negative_prompt"` // things to avoid in the output
StartImage string `json:"start_image" yaml:"start_image"` // URL or base64 of the first frame
EndImage string `json:"end_image" yaml:"end_image"` // URL or base64 of the last frame
Width int32 `json:"width" yaml:"width"` // output width in pixels
Height int32 `json:"height" yaml:"height"` // output height in pixels
NumFrames int32 `json:"num_frames" yaml:"num_frames"` // total number of frames to generate
FPS int32 `json:"fps" yaml:"fps"` // frames per second
Seconds string `json:"seconds,omitempty" yaml:"seconds,omitempty"` // duration in seconds (alternative to num_frames)
Size string `json:"size,omitempty" yaml:"size,omitempty"` // WxH shorthand (e.g. "512x512")
InputReference string `json:"input_reference,omitempty" yaml:"input_reference,omitempty"` // reference image or video URL
Seed int32 `json:"seed" yaml:"seed"` // random seed for reproducibility
CFGScale float32 `json:"cfg_scale" yaml:"cfg_scale"` // classifier-free guidance scale
Step int32 `json:"step" yaml:"step"` // number of diffusion steps
ResponseFormat string `json:"response_format" yaml:"response_format"` // output format (url or b64_json)
}
// TTSRequest is the TTS request body. It embeds BasicModelRequest (declared
// outside this view). Input, Voice and Backend are always emitted; the
// remaining fields are optional and tagged omitempty. Format is serialized
// under the key "response_format".
// @Description TTS request body
type TTSRequest struct {
BasicModelRequest
Input string `json:"input" yaml:"input"` // text input
Voice string `json:"voice" yaml:"voice"` // voice audio file or speaker id
Backend string `json:"backend" yaml:"backend"` // backend engine override
Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
Format string `json:"response_format,omitempty" yaml:"response_format,omitempty"` // (optional) output format
Stream bool `json:"stream,omitempty" yaml:"stream,omitempty"` // (optional) enable streaming TTS
SampleRate int `json:"sample_rate,omitempty" yaml:"sample_rate,omitempty"` // (optional) desired output sample rate
}
// VADRequest is the VAD request body: raw audio samples plus the embedded
// BasicModelRequest (declared outside this view).
// NOTE(review): the expected sample rate and channel layout of Audio are not
// stated in this file — confirm against the backend.
// @Description VAD request body
type VADRequest struct {
BasicModelRequest
Audio []float32 `json:"audio" yaml:"audio"` // raw audio samples as float32 PCM
}
// VADSegment is one segment delimited by Start and End.
// NOTE(review): the unit of Start/End (seconds vs. sample index) is not stated
// in this file — confirm against the producer.
type VADSegment struct {
Start float32 `json:"start" yaml:"start"`
End float32 `json:"end" yaml:"end"`
}
// VADResponse wraps the list of VADSegment values returned for a VAD request.
// Segments has no omitempty, so a nil slice marshals as JSON null rather than
// being dropped.
type VADResponse struct {
Segments []VADSegment `json:"segments" yaml:"segments"`
}
// StoreCommon holds the field shared by the Stores* request types in this
// file (StoresSet, StoresDelete, StoresGet, StoresFind), which embed it.
type StoreCommon struct {
// Backend is omitted from the output when empty.
Backend string `json:"backend,omitempty" yaml:"backend,omitempty"`
}
// StoresSet is the request body for writing entries to a store: a list of
// float32 vector keys and a list of string values.
// NOTE(review): presumably Keys[i] is paired with Values[i] and both slices
// must have equal length — confirm against the handler.
type StoresSet struct {
// Store names the target store; omitted from the output when empty.
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
StoreCommon
}
// StoresDelete is the request body for removing entries from a store,
// identified by their float32 vector keys.
type StoresDelete struct {
	// Store names the target store; omitted from the output when empty.
	Store string `json:"store,omitempty" yaml:"store,omitempty"`
	// Keys lists the vector keys to delete. The explicit yaml tag matches the
	// Keys field of the other Stores* types; it spells out the lowercase key
	// "keys" instead of relying on the encoder's default naming.
	Keys [][]float32 `json:"keys" yaml:"keys"`
	StoreCommon
}
// StoresGet is the request body for looking up entries in a store by their
// float32 vector keys.
type StoresGet struct {
// Store names the target store; omitted from the output when empty.
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Keys [][]float32 `json:"keys" yaml:"keys"`
StoreCommon
}
// StoresGetResponse returns vector keys together with string values.
// NOTE(review): presumably Keys[i] corresponds to Values[i] — confirm against
// the handler that fills this type.
type StoresGetResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}
// StoresFind is the request body for a search in a store using a single
// float32 vector Key and a Topk count.
// NOTE(review): Topk presumably caps the number of results; how zero or
// negative values are handled is not visible here.
type StoresFind struct {
// Store names the target store; omitted from the output when empty.
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Key []float32 `json:"key" yaml:"key"`
Topk int `json:"topk" yaml:"topk"`
StoreCommon
}
// StoresFindResponse returns vector keys, string values and a similarity
// score per result.
// NOTE(review): the three slices are presumably index-aligned; the similarity
// metric and its range are not stated in this file.
type StoresFindResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
Similarities []float32 `json:"similarities" yaml:"similarities"`
}
// NodeData describes a single node: its name and identifiers, its tunnel
// address, and the time it was last seen. The fields carry no struct tags, so
// encoders use their default field naming.
type NodeData struct {
	Name          string
	ID            string
	TunnelAddress string
	ServiceID     string
	LastSeen      time.Time
}

// IsOnline reports whether the node counts as online, i.e. whether less than
// 40 seconds have elapsed since LastSeen. A zero LastSeen yields false.
func (d NodeData) IsOnline() bool {
	// A node is treated as online while its last sighting is younger than this.
	const onlineWindow = 40 * time.Second
	return time.Since(d.LastSeen) < onlineWindow
}
// P2PNodesResponse groups NodeData entries into three lists, serialized as
// "llama_cpp_nodes", "federated_nodes" and "mlx_nodes". None of the lists is
// tagged omitempty, so a nil slice marshals as JSON null.
type P2PNodesResponse struct {
LlamaCPPNodes []NodeData `json:"llama_cpp_nodes" yaml:"llama_cpp_nodes"`
FederatedNodes []NodeData `json:"federated_nodes" yaml:"federated_nodes"`
MLXNodes []NodeData `json:"mlx_nodes" yaml:"mlx_nodes"`
}
// SysInfoModel identifies one model by ID. It is the element type of
// SystemInformationResponse.Models.
type SysInfoModel struct {
ID string `json:"id"`
}
// SystemInformationResponse lists the available backends and the currently
// loaded models. Models is serialized under the JSON key "loaded_models".
type SystemInformationResponse struct {
Backends []string `json:"backends"` // available backend engines
Models []SysInfoModel `json:"loaded_models"` // currently loaded models
}
// DetectionRequest is the request body for analyzing an image. It embeds
// BasicModelRequest (declared outside this view). Image is always emitted;
// Prompt, Points, Boxes and Threshold are optional and tagged omitempty.
// Points and Boxes are flat float32 slices whose grouping (triples / quads) is
// described on each field.
type DetectionRequest struct {
BasicModelRequest
Image string `json:"image"` // URL or base64-encoded image to analyze
Prompt string `json:"prompt,omitempty"` // Text prompt (for SAM 3 PCS mode)
Points []float32 `json:"points,omitempty"` // Point coordinates as [x,y,label,...] triples (label: 1=pos, 0=neg)
Boxes []float32 `json:"boxes,omitempty"` // Box coordinates as [x1,y1,x2,y2,...] quads
Threshold float32 `json:"threshold,omitempty"` // Detection confidence threshold
}
// DetectionResponse wraps the list of Detection results. Detections has no
// omitempty, so a nil slice marshals as JSON null.
type DetectionResponse struct {
Detections []Detection `json:"detections"`
}
// Detection is a single detected object: a box given by X, Y, Width and
// Height, a class name, and optionally a confidence and a segmentation mask.
// NOTE(review): whether X/Y denote the top-left corner or the centre, and
// whether values are pixels or normalized, is not stated in this file.
type Detection struct {
X float32 `json:"x"`
Y float32 `json:"y"`
Width float32 `json:"width"`
Height float32 `json:"height"`
ClassName string `json:"class_name"`
// Confidence is tagged omitempty, so an exact 0 is omitted from the JSON.
Confidence float32 `json:"confidence,omitempty"`
Mask string `json:"mask,omitempty"` // base64-encoded PNG segmentation mask
}
// ─── Face recognition ──────────────────────────────────────────────

// FacialArea describes a bounding box for a detected face, given by X, Y and
// the extents W and H.
// NOTE(review): the coordinate origin and unit (pixels vs. normalized) are not
// stated in this file.
type FacialArea struct {
X float32 `json:"x"`
Y float32 `json:"y"`
W float32 `json:"w"`
H float32 `json:"h"`
}
// FaceVerifyRequest compares two images to decide whether they depict
// the same person. Img1 and Img2 accept URL, base64, or data-URI.
// It embeds BasicModelRequest (declared outside this view).
type FaceVerifyRequest struct {
BasicModelRequest
Img1 string `json:"img1"`
Img2 string `json:"img2"`
// Threshold is optional (omitempty). NOTE(review): the default applied when
// it is zero or absent is not visible here.
Threshold float32 `json:"threshold,omitempty"`
// AntiSpoofing, serialized as "anti_spoofing", requests the liveness check
// whose results FaceVerifyResponse reports in its *IsReal / *AntispoofScore
// fields.
AntiSpoofing bool `json:"anti_spoofing,omitempty"`
}
// FaceVerifyResponse is the result of a face verification: the verdict, the
// measured distance and the threshold alongside it, a confidence, the model
// name, and the facial area located in each image.
// NOTE(review): the distance metric and the range of Confidence are not stated
// in this file.
type FaceVerifyResponse struct {
Verified bool `json:"verified"`
Distance float32 `json:"distance"`
Threshold float32 `json:"threshold"`
Confidence float32 `json:"confidence"`
Model string `json:"model"`
Img1Area FacialArea `json:"img1_area"`
Img2Area FacialArea `json:"img2_area"`
// ProcessingTimeMs is tagged omitempty, so an exact 0 is omitted.
ProcessingTimeMs float32 `json:"processing_time_ms,omitempty"`
// Liveness fields are only populated when the request set
// anti_spoofing=true. Pointers keep them fully absent from the
// JSON response otherwise, so callers can tell "not checked"
// apart from "checked and fake" (which would collapse to zero
// values with plain bool+omitempty).
Img1IsReal *bool `json:"img1_is_real,omitempty"`
Img1AntispoofScore *float32 `json:"img1_antispoof_score,omitempty"`
Img2IsReal *bool `json:"img2_is_real,omitempty"`
Img2AntispoofScore *float32 `json:"img2_antispoof_score,omitempty"`
}
// FaceAnalyzeRequest asks the backend for demographic attributes on
// every face detected in Img. It embeds BasicModelRequest (declared outside
// this view).
// NOTE(review): which actions run when Actions is empty or omitted is not
// visible here.
type FaceAnalyzeRequest struct {
BasicModelRequest
Img string `json:"img"`
Actions []string `json:"actions,omitempty"` // subset of {"age","gender","emotion","race"}
// AntiSpoofing is serialized as "anti_spoofing" and omitted when false.
AntiSpoofing bool `json:"anti_spoofing,omitempty"`
}
// FaceAnalyzeResponse wraps one FaceAnalysis entry per reported face. Faces
// has no omitempty, so a nil slice marshals as JSON null.
type FaceAnalyzeResponse struct {
Faces []FaceAnalysis `json:"faces"`
}
// FaceAnalysis holds the attributes reported for one face: its region, the
// detection confidence, and optional age, gender, emotion and race results.
// Each attribute other than Age comes as a Dominant* string plus a
// map[string]float32; every attribute field is tagged omitempty.
// NOTE(review): the map keys (label names) and value scale (probability vs.
// percentage) are not stated in this file.
type FaceAnalysis struct {
Region FacialArea `json:"region"`
FaceConfidence float32 `json:"face_confidence"`
Age float32 `json:"age,omitempty"`
DominantGender string `json:"dominant_gender,omitempty"`
Gender map[string]float32 `json:"gender,omitempty"`
DominantEmotion string `json:"dominant_emotion,omitempty"`
Emotion map[string]float32 `json:"emotion,omitempty"`
DominantRace string `json:"dominant_race,omitempty"`
Race map[string]float32 `json:"race,omitempty"`
// Liveness fields — see FaceVerifyResponse for why these are pointers.
IsReal *bool `json:"is_real,omitempty"`
AntispoofScore *float32 `json:"antispoof_score,omitempty"`
}
// FaceEmbedRequest extracts a face embedding from an image. Distinct
// from /v1/embeddings (which is OpenAI-compatible and text-only); this
// endpoint accepts URL / base64 / data-URI image inputs.
// It embeds BasicModelRequest (declared outside this view).
type FaceEmbedRequest struct {
BasicModelRequest
// Img is the image input in one of the forms listed above.
Img string `json:"img"`
}
// FaceEmbedResponse returns an embedding vector, a Dim value and optionally
// the model name.
// NOTE(review): Dim presumably equals len(Embedding) — confirm against the
// handler that fills it.
type FaceEmbedResponse struct {
Embedding []float32 `json:"embedding"`
Dim int `json:"dim"`
// Model is omitted from the JSON when empty.
Model string `json:"model,omitempty"`
}
// FaceRegisterRequest enrolls a face into the 1:N recognition store.
// It embeds BasicModelRequest (declared outside this view). Img and Name are
// always emitted; Labels and Store are optional (omitempty).
type FaceRegisterRequest struct {
BasicModelRequest
Img string `json:"img"`
Name string `json:"name"`
// Labels is a free-form string-to-string map attached to the entry.
Labels map[string]string `json:"labels,omitempty"`
Store string `json:"store,omitempty"` // vector store model; empty = local-store default
}
// FaceRegisterResponse reports the ID, name and registration time of an
// enrolled face. RegisteredAt is a time.Time, which encoding/json formats as
// an RFC 3339 string.
type FaceRegisterResponse struct {
ID string `json:"id"`
Name string `json:"name"`
RegisteredAt time.Time `json:"registered_at"`
}
// FaceIdentifyRequest runs 1:N recognition: embed the probe and
// return the top-K nearest registered faces.
// It embeds BasicModelRequest (declared outside this view). TopK, Threshold
// and Store are optional (omitempty).
// NOTE(review): the defaults used when TopK or Threshold are zero are not
// visible here.
type FaceIdentifyRequest struct {
BasicModelRequest
Img string `json:"img"`
TopK int `json:"top_k,omitempty"`
Threshold float32 `json:"threshold,omitempty"` // optional cutoff on distance
Store string `json:"store,omitempty"`
}
// FaceIdentifyResponse wraps the list of FaceIdentifyMatch results. Matches
// has no omitempty, so a nil slice marshals as JSON null.
type FaceIdentifyResponse struct {
Matches []FaceIdentifyMatch `json:"matches"`
}
// FaceIdentifyMatch is one candidate returned by face identification: the
// registered entry's ID, name and optional labels, plus the distance and
// confidence computed for it and the resulting match verdict.
type FaceIdentifyMatch struct {
ID string `json:"id"`
Name string `json:"name"`
Labels map[string]string `json:"labels,omitempty"`
Distance float32 `json:"distance"`
Confidence float32 `json:"confidence"`
Match bool `json:"match"` // true when distance <= threshold
}
// FaceForgetRequest removes a previously-registered face by ID.
// It embeds BasicModelRequest (declared outside this view).
type FaceForgetRequest struct {
BasicModelRequest
ID string `json:"id"`
// Store is optional and omitted from the JSON when empty.
Store string `json:"store,omitempty"`
}
// ─── Voice (speaker) recognition ───────────────────────────────────

// VoiceVerifyRequest compares two audio clips and reports whether they
// were spoken by the same speaker. Audio1/Audio2 accept URL, base64,
// or data-URI (the HTTP layer materialises the bytes to a temp file
// before calling the gRPC backend).
// It embeds BasicModelRequest (declared outside this view).
type VoiceVerifyRequest struct {
BasicModelRequest
Audio1 string `json:"audio1"`
Audio2 string `json:"audio2"`
// Threshold is optional (omitempty). NOTE(review): the default applied when
// it is zero or absent is not visible here.
Threshold float32 `json:"threshold,omitempty"`
// NOTE(review): VoiceVerifyResponse in this file declares no liveness
// fields, so how the outcome of AntiSpoofing is reported is not visible
// here — confirm against the handler.
AntiSpoofing bool `json:"anti_spoofing,omitempty"`
}
// VoiceVerifyResponse is the result of a speaker verification: the verdict,
// the measured distance and the threshold alongside it, a confidence and the
// model name.
// NOTE(review): the distance metric and the range of Confidence are not stated
// in this file.
type VoiceVerifyResponse struct {
Verified bool `json:"verified"`
Distance float32 `json:"distance"`
Threshold float32 `json:"threshold"`
Confidence float32 `json:"confidence"`
Model string `json:"model"`
// ProcessingTimeMs is tagged omitempty, so an exact 0 is omitted.
ProcessingTimeMs float32 `json:"processing_time_ms,omitempty"`
}
// VoiceAnalyzeRequest asks the backend for demographic attributes
// (age, gender, emotion) inferred from the audio clip.
// It embeds BasicModelRequest (declared outside this view).
// NOTE(review): which actions run when Actions is empty or omitted is not
// visible here.
type VoiceAnalyzeRequest struct {
BasicModelRequest
Audio string `json:"audio"`
Actions []string `json:"actions,omitempty"` // subset of {"age","gender","emotion"}
}
// VoiceAnalyzeResponse wraps the list of VoiceAnalysis entries, one per
// segment. Segments has no omitempty, so a nil slice marshals as JSON null.
type VoiceAnalyzeResponse struct {
Segments []VoiceAnalysis `json:"segments"`
}
// VoiceAnalysis holds the attributes reported for one audio segment bounded
// by Start and End: optional age, gender and emotion results. Gender and
// Emotion each come as a Dominant* string plus a map[string]float32; all
// attribute fields are tagged omitempty.
// NOTE(review): the unit of Start/End and the scale of the map values are not
// stated in this file.
type VoiceAnalysis struct {
Start float32 `json:"start"`
End float32 `json:"end"`
Age float32 `json:"age,omitempty"`
DominantGender string `json:"dominant_gender,omitempty"`
Gender map[string]float32 `json:"gender,omitempty"`
DominantEmotion string `json:"dominant_emotion,omitempty"`
Emotion map[string]float32 `json:"emotion,omitempty"`
}
// VoiceEmbedRequest extracts a speaker embedding from an audio clip.
// Distinct from /v1/embeddings (OpenAI-compatible, text-only) — this
// endpoint accepts URL / base64 / data-URI audio inputs.
// It embeds BasicModelRequest (declared outside this view).
type VoiceEmbedRequest struct {
BasicModelRequest
// Audio is the audio input in one of the forms listed above.
Audio string `json:"audio"`
}
// VoiceEmbedResponse returns an embedding vector, a Dim value and optionally
// the model name.
// NOTE(review): Dim presumably equals len(Embedding) — confirm against the
// handler that fills it.
type VoiceEmbedResponse struct {
Embedding []float32 `json:"embedding"`
Dim int `json:"dim"`
// Model is omitted from the JSON when empty.
Model string `json:"model,omitempty"`
}
// VoiceRegisterRequest enrolls a speaker into the 1:N identification store.
// It embeds BasicModelRequest (declared outside this view). Audio and Name are
// always emitted; Labels and Store are optional (omitempty).
type VoiceRegisterRequest struct {
BasicModelRequest
Audio string `json:"audio"`
Name string `json:"name"`
// Labels is a free-form string-to-string map attached to the entry.
Labels map[string]string `json:"labels,omitempty"`
Store string `json:"store,omitempty"`
}
// VoiceRegisterResponse reports the ID, name and registration time of an
// enrolled speaker. RegisteredAt is a time.Time, which encoding/json formats
// as an RFC 3339 string.
type VoiceRegisterResponse struct {
ID string `json:"id"`
Name string `json:"name"`
RegisteredAt time.Time `json:"registered_at"`
}
// VoiceIdentifyRequest runs 1:N recognition: embed the probe and
// return the top-K nearest registered speakers.
// It embeds BasicModelRequest (declared outside this view). TopK, Threshold
// and Store are optional (omitempty).
// NOTE(review): the defaults used when TopK or Threshold are zero are not
// visible here.
type VoiceIdentifyRequest struct {
BasicModelRequest
Audio string `json:"audio"`
TopK int `json:"top_k,omitempty"`
Threshold float32 `json:"threshold,omitempty"`
Store string `json:"store,omitempty"`
}
// VoiceIdentifyResponse wraps the list of VoiceIdentifyMatch results. Matches
// has no omitempty, so a nil slice marshals as JSON null.
type VoiceIdentifyResponse struct {
Matches []VoiceIdentifyMatch `json:"matches"`
}
// VoiceIdentifyMatch is one candidate returned by speaker identification: the
// registered entry's ID, name and optional labels, plus the distance and
// confidence computed for it and the resulting match verdict.
type VoiceIdentifyMatch struct {
ID string `json:"id"`
Name string `json:"name"`
Labels map[string]string `json:"labels,omitempty"`
Distance float32 `json:"distance"`
Confidence float32 `json:"confidence"`
// NOTE(review): presumably true when distance <= threshold, as documented
// on FaceIdentifyMatch.Match — confirm against the handler.
Match bool `json:"match"`
}
// VoiceForgetRequest removes a previously-registered speaker by ID.
// It embeds BasicModelRequest (declared outside this view).
type VoiceForgetRequest struct {
BasicModelRequest
ID string `json:"id"`
// Store is optional and omitted from the JSON when empty.
Store string `json:"store,omitempty"`
}
// ImportModelRequest is the request body for importing a model from a URI.
type ImportModelRequest struct {
URI string `json:"uri"`
// Preferences is kept as raw, undecoded JSON (json.RawMessage) so its
// interpretation is deferred to the consumer; it is omitted when empty.
Preferences json.RawMessage `json:"preferences,omitempty"`
}
// KnownBackend describes a backend that the importer knows about.
// Used by GET /backends/known to populate the import form dropdown.
// Name, Modality, AutoDetect and Installed are always emitted; Description is
// omitted when empty.
type KnownBackend struct {
Name string `json:"name"`
Modality string `json:"modality"`
// AutoDetect is serialized as "auto_detect".
AutoDetect bool `json:"auto_detect"`
Description string `json:"description,omitempty"`
// Installed is true when the backend is currently present on disk — i.e. it
// appears in gallery.ListSystemBackends(systemState). Importer-registered or
// curated pref-only backends default to false unless they also show up on
// disk. The import form uses this to warn users that submitting an import
// may trigger an automatic backend download.
Installed bool `json:"installed"`
}
// SettingsResponse is the response type for settings API operations.
// Success is always emitted; Error and Message are omitted when empty.
type SettingsResponse struct {
Success bool `json:"success"`
Error string `json:"error,omitempty"`
Message string `json:"message,omitempty"`
}
// RouterDecideRequest is the input for POST /api/router/decide — the
// programmatic decision-oracle endpoint. Given the name of a router
// model (a ModelConfig that carries a `router:` block) and a prompt,
// the endpoint returns the classifier's label set plus the candidate
// model the in-band RouteModel middleware would have chosen. The
// endpoint does NOT rewrite any request, forward to a backend, or
// record a row in the decision store — it is a pure decision oracle
// for external routers that want LocalAI's classifier opinion without
// committing LocalAI to handle the request.
//
// The struct carries no validation tags; the "Required" notes on the fields
// below are therefore enforced, if at all, outside this type.
type RouterDecideRequest struct {
// Router is the name of the router model (a ModelConfig with a
// `router:` block). Required.
Router string `json:"router"`
// Input is the user-visible prompt text to classify. Required.
// Schema-shape extraction (chat-message concatenation, etc.) is
// the caller's responsibility — matches the Probe contract used
// by the in-band middleware.
Input string `json:"input"`
}
// RouterDecideResponse carries the classifier's decision plus the
// resolved candidate. Mirrors router.Decision with the addition of
// Candidate/Fallback so the caller learns which downstream model
// would have served the request without re-implementing the
// label-set → candidate match locally.
//
// Candidate, Fallback, Cached and CacheSimilarity are tagged omitempty and so
// disappear from the JSON when empty/false/zero; the remaining fields are
// always emitted.
type RouterDecideResponse struct {
// Router echoes the requested router model.
Router string `json:"router"`
// Classifier is the classifier name that produced the decision
// (e.g. "score").
Classifier string `json:"classifier"`
// Labels is the set of active policy labels. It has no omitempty, so a
// nil slice marshals as JSON null rather than [].
Labels []string `json:"labels"`
// Candidate is the model that would be routed to. Empty when no
// candidate covers Labels AND no fallback is configured.
Candidate string `json:"candidate,omitempty"`
// Fallback is true when Candidate is the router's configured
// fallback because no candidate covered Labels. Lets callers
// distinguish "matched" from "fell back" without comparing names.
Fallback bool `json:"fallback,omitempty"`
// Score is the top label's softmax probability (the
// classifier-side confidence signal).
Score float64 `json:"score"`
// LatencyMs is the classifier's wall-clock cost.
LatencyMs int64 `json:"latency_ms"`
// Cached is true when the decision came from the L2 embedding
// cache rather than a fresh classifier run.
Cached bool `json:"cached,omitempty"`
// CacheSimilarity carries the cosine similarity of the cache hit
// (0 when not cached).
CacheSimilarity float64 `json:"cache_similarity,omitempty"`
}
// PIIDecideRequest is the input for POST /api/pii/decide — the
// programmatic PII-decision oracle. External routers call it before
// dispatching a request to learn whether the content carries PII and
// what action the configured pattern set would take. The endpoint
// inspects the text and returns findings + a suggested action; it
// does NOT mutate the input, record an audit event, or rewrite any
// downstream request. The caller composes the decision with its own
// policy (mask, block, route to local-only backends, allow).
//
// The struct carries no validation tags; the "Required" note on Text is
// therefore enforced, if at all, outside this type.
type PIIDecideRequest struct {
// Text is the user-visible content to inspect. Required.
Text string `json:"text"`
}
// PIIDecideResponse carries the redactor's findings.
// SuggestedAction is derived from the action ordering used by the
// internal redactor (block > route_local > mask > allow) so callers
// don't need to replicate that logic.
// All three fields are always emitted (none is tagged omitempty).
type PIIDecideResponse struct {
// Findings is one entry per matched span — pattern id, byte
// range, and audit-safe hash prefix (never the matched value).
// A nil slice marshals as JSON null rather than [].
Findings []PIIFinding `json:"findings"`
// SuggestedAction is the strongest action across all findings:
// "block", "route_local", "mask", or "allow" (no findings).
SuggestedAction string `json:"suggested_action"`
// RedactedPreview is the input with mask-action spans replaced
// by their placeholders. Identical to Text when no findings or
// when the strongest action is block/route_local (which don't
// rewrite content).
RedactedPreview string `json:"redacted_preview"`
}
// PIIFinding mirrors pii.Span on the wire. Pattern is the pattern id
// that matched (e.g. "email"). HashPrefix is the first 8 chars of
// sha256(matched value) — lets admins correlate recurring leaks
// without recovering the value itself.
type PIIFinding struct {
// Start and End delimit the matched span; the PIIDecideResponse.Findings
// comment describes this as a byte range.
// NOTE(review): whether End is inclusive or exclusive is not stated in
// this file — confirm against pii.Span.
Start int `json:"start"`
End int `json:"end"`
Pattern string `json:"pattern"`
HashPrefix string `json:"hash_prefix"`
}