Files
LocalAI/core/services/routing/pii/patterns.go
Richard Palethorpe 6a80e23733 feat(middleware): Model routing, PII filtering, Cloud model proxies (#9802)
Add a routing middleware stack and a cloud-proxy backend.

* cloud-proxy: a Go gRPC backend that forwards OpenAI- and
  Anthropic-shaped chat requests to upstream providers, with an
  optional translate mode (OpenAI request -> Anthropic /v1/messages
  -> OpenAI response) and full tool-calling support.

* routing: admission control, content-aware model routing
  (embedding cache + classifier + rerank + Arch-Router score),
  PII detection/redaction (regex + NER) with streaming filter and
  OpenAI/Anthropic adapters, and a per-user/per-key billing recorder
  backed by GORM or in-memory storage.

* middleware: UsageMiddleware records usage via the billing recorder,
  plus admission, route-model, usage-stamp and trace middlewares.

* observability: BackendTrace ring buffer stores full request bodies
  (capped), MITM proxy emits structured trace events, and router
  classifier decisions surface at /api/router/decide.

* gallery: Arch-Router-1.5B (Q4_K_M and Q8_0).

* UI: cloud-proxy model-editor fields, classifier system-prompt and
  score-normalization config, and a Traces page rendering request
  bodies.

Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash]

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-05-25 09:28:27 +02:00

189 lines
5.7 KiB
Go

package pii
import (
"fmt"
"regexp"
"strings"
)
// regexpMatcher is the minimal matching surface the PII patterns rely
// on. Hiding the regexp package behind this single method lets tests
// plug in a deterministic fake; in this file the production
// implementation is goRegexp, which Compile attaches to every Pattern.
type regexpMatcher interface {
	// FindAllStringIndex is expected to follow the contract of
	// (*regexp.Regexp).FindAllStringIndex: it reports the index pairs
	// of successive matches in text, at most limit of them (all of
	// them when limit is negative).
	FindAllStringIndex(text string, limit int) [][]int
}
// goRegexp adapts a compiled *regexp.Regexp to the regexpMatcher
// interface.
type goRegexp struct{ r *regexp.Regexp }

// FindAllStringIndex forwards to the wrapped expression and hands its
// result back unchanged.
func (m goRegexp) FindAllStringIndex(text string, limit int) [][]int {
	spans := m.r.FindAllStringIndex(text, limit)
	return spans
}
// DefaultPatterns returns the built-in pattern set as plain data: an
// ID, a human-readable description, a default action and a
// MaxMatchLength. The ID keys the regex lookup in patternRegexps (see
// Compile). MaxMatchLength is meant as an upper bound, in bytes, on a
// single match so the streaming filter can size its tail buffer
// without re-parsing the regex at runtime.
//
// Caveats by design:
//   - The phone pattern matches international and US formats but does
//     not validate area codes. False positives on numbers that look
//     phone-like (e.g., timestamps in some formats) are accepted in
//     return for reliable coverage.
//   - The credit card pattern requires the Luhn check (verifyLuhn) to
//     reduce false positives — random 16-digit strings won't match.
//   - The API-key pattern targets common provider prefixes (sk-, pk-,
//     xoxb-, ghp_, github_pat_) rather than guessing entropy. Adding
//     new providers should append a new Pattern, not extend an
//     existing alternation, so the admin UI can show one row per
//     provider with its own toggle.
func DefaultPatterns() []Pattern {
	return []Pattern{
		{
			ID:             "email",
			Description:    "Email address",
			Action:         ActionMask,
			MaxMatchLength: 254, // RFC 5321 max
		},
		{
			ID:             "phone",
			Description:    "Phone number (international or US format)",
			Action:         ActionMask,
			MaxMatchLength: 24,
		},
		{
			ID:             "ssn",
			Description:    "US Social Security Number (NNN-NN-NNNN)",
			Action:         ActionMask,
			MaxMatchLength: 11,
		},
		{
			// The credit_card regex accepts an optional space or hyphen
			// after each of up to 19 digits, so one match can span
			// 2*19 = 38 bytes. A bound of 19 only covers the usual
			// 16-digit, three-separator layout and would under-size the
			// tail buffer for longer or more heavily separated numbers.
			ID:             "credit_card",
			Description:    "Credit card number (Luhn-verified)",
			Action:         ActionMask,
			MaxMatchLength: 38,
		},
		{
			ID:             "ipv4",
			Description:    "IPv4 address",
			Action:         ActionMask,
			MaxMatchLength: 15,
		},
		{
			// The api_key_prefix regex has no upper repetition limit, so
			// 200 is a practical cap rather than a hard bound.
			ID:             "api_key_prefix",
			Description:    "Common API key prefixes (sk-, pk-, xoxb-, ghp_, github_pat_)",
			Action:         ActionBlock, // tighter default — leaked credentials are higher harm
			MaxMatchLength: 200,
		},
	}
}
// patternRegexps holds the compiled expression for every built-in
// Pattern.ID. It lives apart from the Pattern struct so that
// DefaultPatterns stays data-only and Compile is the single place
// where a matcher gets attached. Every expression is compiled once,
// when the package is initialised.
var patternRegexps = func() map[string]*regexp.Regexp {
	sources := []struct{ id, expr string }{
		// Pragmatic email shape, not a full RFC 5322 implementation;
		// unusual addresses are left to the (future) encoder NER tier.
		{"email", `(?i)[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}`},

		// US: (123) 456-7890, 123-456-7890, 123.456.7890, 1234567890.
		// International: an optional +<country> prefix, with optional
		// whitespace, hyphen or dot separators between groups.
		{"phone", `(?:\+?\d{1,3}[\s\-.]?)?(?:\(\d{3}\)|\d{3})[\s\-.]?\d{3}[\s\-.]?\d{4}`},

		// NNN-NN-NNNN on word boundaries.
		{"ssn", `\b\d{3}-\d{2}-\d{4}\b`},

		// Runs of 13-19 digits, each optionally followed by a space or
		// hyphen. VerifyMatch discards candidates that fail Luhn.
		{"credit_card", `\b(?:\d[ \-]?){13,19}\b`},

		// Dotted quad with 1-3 digits per octet; VerifyMatch enforces
		// the 0..255 range.
		{"ipv4", `\b(?:\d{1,3}\.){3}\d{1,3}\b`},

		// Well-known provider prefixes, each followed by at least 20
		// key characters, rather than a permissive entropy match.
		{"api_key_prefix", `(?:sk-[A-Za-z0-9]{20,}|pk-[A-Za-z0-9]{20,}|xoxb-[A-Za-z0-9\-]{20,}|ghp_[A-Za-z0-9]{20,}|github_pat_[A-Za-z0-9_]{20,})`},
	}

	compiled := make(map[string]*regexp.Regexp, len(sources))
	for _, src := range sources {
		compiled[src.id] = regexp.MustCompile(src.expr)
	}
	return compiled
}()
// Compile returns a copy of patterns in which every entry has the
// matcher for its ID attached, taken from patternRegexps. The input
// slice is not modified.
//
// If a pattern's ID has no registered regex, Compile stops there and
// returns a nil slice together with an error naming the ID, so an
// admin who adds a custom pattern via config gets a clear "no regex
// registered" message instead of a silent skip.
func Compile(patterns []Pattern) ([]Pattern, error) {
	compiled := make([]Pattern, 0, len(patterns))
	for _, pat := range patterns {
		re, found := patternRegexps[pat.ID]
		if !found {
			return nil, fmt.Errorf("pii: no regex registered for pattern id %q", pat.ID)
		}
		// pat is the loop's own copy, so the caller's element stays
		// untouched.
		pat.regex = goRegexp{r: re}
		compiled = append(compiled, pat)
	}
	return compiled, nil
}
// VerifyMatch applies the pattern-specific post-check for patternID to
// a regex candidate. It returns candidate unchanged when the check
// passes (or when the pattern has no post-check) and "" when the
// candidate must be discarded.
//
// Checks:
//   - "credit_card": after dropping non-digits, the candidate must
//     hold 13 to 19 digits and pass the Luhn checksum.
//   - "ipv4": the candidate may contain only digits and dots, and
//     every dot-separated octet must be in 0..255.
func VerifyMatch(patternID, candidate string) string {
	switch patternID {
	case "credit_card":
		digits := stripNonDigits(candidate)
		if len(digits) < 13 || len(digits) > 19 {
			return ""
		}
		if !verifyLuhn(digits) {
			return ""
		}
	case "ipv4":
		// The regex allows 0..999 per octet since regex isn't great at
		// numeric ranges; we tighten here. The bound is checked after
		// every digit rather than once per octet, so an overlong digit
		// run is rejected before the accumulator can overflow and wrap
		// back into range.
		octet := 0
		for _, c := range candidate {
			if c == '.' {
				octet = 0
				continue
			}
			if c < '0' || c > '9' {
				return ""
			}
			octet = octet*10 + int(c-'0')
			if octet > 255 {
				return ""
			}
		}
	}
	return candidate
}
// stripNonDigits returns s with every rune other than the ASCII digits
// '0'..'9' removed, keeping the remaining digits in their original
// order.
func stripNonDigits(s string) string {
	return strings.Map(func(r rune) rune {
		if r < '0' || r > '9' {
			return -1 // a negative result tells strings.Map to drop the rune
		}
		return r
	}, s)
}
// verifyLuhn implements the Luhn checksum used by credit-card numbers.
// It reports true iff digits is a non-empty string of ASCII digits
// whose checksum is a multiple of 10. Empty input and input containing
// any non-digit byte are rejected: an empty string would otherwise
// pass trivially with a sum of 0, and a non-digit byte would be folded
// into the sum as a meaningless value.
func verifyLuhn(digits string) bool {
	if digits == "" {
		return false
	}
	sum := 0
	// Walking right to left, every second digit (starting with the one
	// next to the check digit) is doubled.
	double := false
	for i := len(digits) - 1; i >= 0; i-- {
		c := digits[i]
		if c < '0' || c > '9' {
			return false
		}
		d := int(c - '0')
		if double {
			d *= 2
			if d > 9 {
				// Same as adding the two decimal digits of the product.
				d -= 9
			}
		}
		sum += d
		double = !double
	}
	return sum%10 == 0
}
// MaxPatternLength reports the largest MaxMatchLength found in
// patterns, or 0 when patterns is empty or no entry has a positive
// value. The streaming filter uses the result to size its tail buffer.
func MaxPatternLength(patterns []Pattern) int {
	longest := 0
	for i := range patterns {
		if n := patterns[i].MaxMatchLength; n > longest {
			longest = n
		}
	}
	return longest
}