mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-02 21:32:23 -04:00
Add a routing middleware stack and a cloud-proxy backend. * cloud-proxy: a Go gRPC backend that forwards OpenAI- and Anthropic-shaped chat requests to upstream providers, with an optional translate mode (OpenAI request -> Anthropic /v1/messages -> OpenAI response) and full tool-calling support. * routing: admission control, content-aware model routing (embedding cache + classifier + rerank + Arch-Router score), PII detection/redaction (regex + NER) with streaming filter and OpenAI/Anthropic adapters, and a per-user/per-key billing recorder backed by GORM or in-memory storage. * middleware: UsageMiddleware records usage via the billing recorder, plus admission, route-model, usage-stamp and trace middlewares. * observability: BackendTrace ring buffer stores full request bodies (capped), MITM proxy emits structured trace events, and router classifier decisions surface at /api/router/decide. * gallery: Arch-Router-1.5B (Q4_K_M and Q8_0). * UI: cloud-proxy model-editor fields, classifier system-prompt and score-normalization config, and a Traces page rendering request bodies. Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash] Signed-off-by: Richard Palethorpe <io@richiejp.com>
189 lines
5.7 KiB
Go
189 lines
5.7 KiB
Go
package pii
|
|
|
|
import (
|
|
"fmt"
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
// regexpMatcher is the one operation this package needs from a compiled
// pattern: locating matches inside a string. It exists as an interface
// so tests can plug in a deterministic fake rather than going through
// the regexp package. Its single method has the same shape as
// (*regexp.Regexp).FindAllStringIndex; goRegexp is the regexp-backed
// implementation that Compile attaches to patterns.
//
// NOTE(review): earlier documentation named regexpMatcherFromPattern as
// the production constructor; no such identifier is defined in this
// file — confirm whether it lives elsewhere in the package.
type regexpMatcher interface {
	// FindAllStringIndex returns one index slice per match found in
	// text, considering at most limit matches.
	FindAllStringIndex(text string, limit int) [][]int
}
|
|
|
|
// goRegexp adapts a compiled *regexp.Regexp so it can be used wherever
// a FindAllStringIndex method is required.
type goRegexp struct {
	r *regexp.Regexp
}

// FindAllStringIndex forwards to the wrapped regexp without altering
// the arguments or the result. The wrapped regexp must be non-nil.
func (m goRegexp) FindAllStringIndex(text string, limit int) [][]int {
	locs := m.r.FindAllStringIndex(text, limit)
	return locs
}
|
|
|
|
// DefaultPatterns returns the built-in regex set. Each entry includes
|
|
// a conservative MaxMatchLength so the streaming filter can size its
|
|
// tail buffer without re-parsing the regex at runtime.
|
|
//
|
|
// Caveats by design:
|
|
// - The phone pattern matches international and US formats but does
|
|
// not validate area codes. False positives on numbers that look
|
|
// phone-like (e.g., timestamps in some formats) are accepted in
|
|
// return for reliable coverage.
|
|
// - The credit card pattern requires the Luhn check (verifyLuhn) to
|
|
// reduce false positives — random 16-digit strings won't match.
|
|
// - The API-key pattern targets common provider prefixes (sk-, pk-,
|
|
// xoxb-, ghp_, github_pat_) rather than guessing entropy. Adding
|
|
// new providers should append a new Pattern, not extend an
|
|
// existing alternation, so the admin UI can show one row per
|
|
// provider with its own toggle.
|
|
func DefaultPatterns() []Pattern {
|
|
return []Pattern{
|
|
{
|
|
ID: "email",
|
|
Description: "Email address",
|
|
Action: ActionMask,
|
|
MaxMatchLength: 254, // RFC 5321 max
|
|
},
|
|
{
|
|
ID: "phone",
|
|
Description: "Phone number (international or US format)",
|
|
Action: ActionMask,
|
|
MaxMatchLength: 24,
|
|
},
|
|
{
|
|
ID: "ssn",
|
|
Description: "US Social Security Number (NNN-NN-NNNN)",
|
|
Action: ActionMask,
|
|
MaxMatchLength: 11,
|
|
},
|
|
{
|
|
ID: "credit_card",
|
|
Description: "Credit card number (Luhn-verified)",
|
|
Action: ActionMask,
|
|
MaxMatchLength: 19,
|
|
},
|
|
{
|
|
ID: "ipv4",
|
|
Description: "IPv4 address",
|
|
Action: ActionMask,
|
|
MaxMatchLength: 15,
|
|
},
|
|
{
|
|
ID: "api_key_prefix",
|
|
Description: "Common API key prefixes (sk-, pk-, xoxb-, ghp_, github_pat_)",
|
|
Action: ActionBlock, // tighter default — leaked credentials are higher harm
|
|
MaxMatchLength: 200,
|
|
},
|
|
}
|
|
}
|
|
|
|
// patternRegexps holds the compiled regex for each built-in Pattern.ID.
// The regexes are kept out of the Pattern struct so DefaultPatterns can
// stay data-only; Compile looks patterns up here by ID. The map is
// built once at package initialisation from the table below, and
// regexp.MustCompile panics at start-up if an expression is invalid.
var patternRegexps = func() map[string]*regexp.Regexp {
	sources := []struct {
		id   string
		expr string
	}{
		// Pragmatic email — deliberately not a full RFC 5322
		// implementation. Catches the common shape; the encoder NER
		// tier (future) is expected to catch edge cases.
		{id: "email", expr: `(?i)[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}`},
		// US: (123) 456-7890, 123-456-7890, 123.456.7890, 1234567890.
		// International: +<country>-<area>-<rest> with separators.
		{id: "phone", expr: `(?:\+?\d{1,3}[\s\-.]?)?(?:\(\d{3}\)|\d{3})[\s\-.]?\d{3}[\s\-.]?\d{4}`},
		{id: "ssn", expr: `\b\d{3}-\d{2}-\d{4}\b`},
		// Runs of 13-19 digits, each optionally followed by a space or
		// hyphen. Candidates still need the Luhn check that
		// VerifyMatch applies for "credit_card".
		{id: "credit_card", expr: `\b(?:\d[ \-]?){13,19}\b`},
		// Octets are 1-3 digits here; VerifyMatch enforces 0..255.
		{id: "ipv4", expr: `\b(?:\d{1,3}\.){3}\d{1,3}\b`},
		// Common provider prefixes; each alternative is a separate
		// well-known marker rather than a permissive entropy match.
		{id: "api_key_prefix", expr: `(?:sk-[A-Za-z0-9]{20,}|pk-[A-Za-z0-9]{20,}|xoxb-[A-Za-z0-9\-]{20,}|ghp_[A-Za-z0-9]{20,}|github_pat_[A-Za-z0-9_]{20,})`},
	}

	compiled := make(map[string]*regexp.Regexp, len(sources))
	for _, src := range sources {
		compiled[src.id] = regexp.MustCompile(src.expr)
	}
	return compiled
}()
|
|
|
|
// Compile attaches matchers to each pattern. Patterns whose ID is not
|
|
// in patternRegexps are returned as a typed error so an admin who
|
|
// adds a custom pattern via config gets a clear "no regex registered"
|
|
// message instead of silent skip.
|
|
func Compile(patterns []Pattern) ([]Pattern, error) {
|
|
out := make([]Pattern, len(patterns))
|
|
for i, p := range patterns {
|
|
r, ok := patternRegexps[p.ID]
|
|
if !ok {
|
|
return nil, fmt.Errorf("pii: no regex registered for pattern id %q", p.ID)
|
|
}
|
|
p.regex = goRegexp{r: r}
|
|
out[i] = p
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// VerifyMatch applies pattern-specific post-checks (e.g. Luhn for
|
|
// credit_card). Returns the original match or "" to discard it.
|
|
func VerifyMatch(patternID, candidate string) string {
|
|
switch patternID {
|
|
case "credit_card":
|
|
digits := stripNonDigits(candidate)
|
|
if len(digits) < 13 || len(digits) > 19 {
|
|
return ""
|
|
}
|
|
if !verifyLuhn(digits) {
|
|
return ""
|
|
}
|
|
case "ipv4":
|
|
// Each octet must be 0..255. The regex allows 0..999 since
|
|
// regex isn't great at numeric ranges; we tighten here.
|
|
for oct := range strings.SplitSeq(candidate, ".") {
|
|
n := 0
|
|
for _, c := range oct {
|
|
if c < '0' || c > '9' {
|
|
return ""
|
|
}
|
|
n = n*10 + int(c-'0')
|
|
}
|
|
if n > 255 {
|
|
return ""
|
|
}
|
|
}
|
|
}
|
|
return candidate
|
|
}
|
|
|
|
// stripNonDigits returns s with everything except the ASCII digits
// '0'..'9' removed, preserving order. Scanning bytes is equivalent to
// scanning runes here because ASCII digits never occur inside a
// multi-byte UTF-8 sequence.
func stripNonDigits(s string) string {
	var kept strings.Builder
	kept.Grow(len(s)) // the result can never be longer than the input
	for i := 0; i < len(s); i++ {
		ch := s[i]
		if ch < '0' || ch > '9' {
			continue
		}
		kept.WriteByte(ch)
	}
	return kept.String()
}
|
|
|
|
// verifyLuhn implements the Luhn checksum used by credit-card numbers.
// digits must consist solely of ASCII '0'..'9'. It returns true iff the
// string is non-empty, contains only digits, and the checksum is a
// multiple of 10.
//
// The empty string and any string containing a non-digit byte are
// rejected explicitly: an empty input would otherwise sum to 0 and
// "pass", and a non-digit byte would be folded into the sum as an
// arbitrary value.
func verifyLuhn(digits string) bool {
	if digits == "" {
		return false
	}
	sum := 0
	double := false
	// Walk right to left, doubling every second digit; a doubled value
	// above 9 contributes the sum of its two decimal digits (d - 9).
	for i := len(digits) - 1; i >= 0; i-- {
		c := digits[i]
		if c < '0' || c > '9' {
			return false
		}
		d := int(c - '0')
		if double {
			d *= 2
			if d > 9 {
				d -= 9
			}
		}
		sum += d
		double = !double
	}
	return sum%10 == 0
}
|
|
|
|
// MaxPatternLength returns the longest MaxMatchLength across the input
|
|
// patterns. Used by the streaming filter to size its tail buffer.
|
|
func MaxPatternLength(patterns []Pattern) int {
|
|
max := 0
|
|
for _, p := range patterns {
|
|
if p.MaxMatchLength > max {
|
|
max = p.MaxMatchLength
|
|
}
|
|
}
|
|
return max
|
|
}
|