mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-26 01:16:58 -04:00
* fix(pii): post-merge review fixes + live NER e2e for the privacy-filter tier Follow-up to the NER tier engine (#10360), already on master. This carries only the incremental review fixes and tests that postdate that merge — the feature itself is not re-introduced. Review fixes: - openai_completion.go: remove the dead `elem >= 0` conjunct in applyAnyText (the `elem < 0` guard above already returns). - application.go: collapse ResolvePIIPolicy's inline re-implementation of PIIIsEnabled to a single cfg.PIIIsEnabled() call (sole source of the "explicit pii.enabled wins, else cloud-proxy default" rule) and return true past the !enabled guard where it is provable. - pattern.go: hoist the triple `appConfig != nil && EnableTracing` check in patternDetector.Detect into one local. - grammar.go: MaxQuantifier was 4096, but Go's regexp/syntax rejects repeat bounds above 1000 at Parse time, so walk()'s {n,m} guard could never fire — dead code shadowed by the parser. Lower it to 512 so a bound in (512,1000] is rejected here with an actionable error; >1000 still fails closed via Parse. Specs pin the relationship so the guard can't silently revert. - PatternListEditor.jsx: clamp a directly-typed negative min_len to >=0 and force the DOM value back when clamping (min={0} only constrained the spinner, so a negative reached saved config and silently disabled the length filter). Tests: - piipattern_test.go: MaxQuantifier guard specs (must stay live, not dead). - model-config.spec.js: assert the min_len clamp, and that entity_actions collapses a duplicate group to a single row (map semantics; regression guard against emitting an array that drops a row on save). - tests/e2e-backends: token_classify capability driving the TokenClassify gRPC RPC against the backend image, asserting byte-correct, UTF-8 rune-aligned spans (entity.Text == text[start:end]) at threshold 0. Verified on CPU via `make test-extra-backend-privacy-filter` (3/3 specs). 
- Makefile: test-extra-backend-privacy-filter wrapper. - tests/e2e: e2e_pii_ner_test.go drives /api/pii/analyze + /api/pii/redact (mask + block) through the full HTTP -> detector -> redactor path; gated on PII_NER_MODEL_GGUF so the default suite is unaffected. - .github/workflows/tests-pii-ner-e2e.yml: path-filtered / nightly CI job running the container harness on CPU. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * feat(gallery): add privacy-filter-nemotron (f16 + q8) GGUF conversions of OpenMed/privacy-filter-nemotron — a fine-grained English PII token-classifier (55 categories / 221 BIOES classes), fine-tuned from openai/privacy-filter on NVIDIA's Nemotron-PII dataset. Sibling to the existing privacy-filter-multilingual entry, trading language breadth for category depth. - privacy-filter-nemotron: F16 reference artifact (~2.8 GB). - privacy-filter-nemotron-q8: Q8_0 quant (~1.64 GB) for RAM-constrained / edge use; description notes the size/speed tradeoff and to validate on your own data (a single dropped span is a PII leak). Both run on the privacy-filter backend with known_usecases [token_classify] and a default mask policy (min_score 0.5); operators add per-category entity_actions as needed. sha256s taken from the HF repo's LFS object ids. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Richard Palethorpe <io@richiejp.com>
187 lines
7.1 KiB
Go
187 lines
7.1 KiB
Go
package e2e_test
|
|
|
|
import (
	"bytes"
	"context"
	"encoding/json"
	"io"
	"net/http"
	"time"
	"unicode/utf8"

	"github.com/mudler/LocalAI/core/backend"
	"github.com/mudler/LocalAI/core/schema"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)
|
|
|
|
// Live PII NER tier e2e. These specs run the real privacy-filter GGUF on CPU
|
|
// through the full TokenClassify path — the gap the hermetic suite cannot
|
|
// cover (it only exercises the in-process pattern tier). They Skip unless
|
|
// PII_NER_MODEL_GGUF is wired in BeforeSuite, so the default PR suite is
|
|
// unaffected; the dedicated CI job sets it.
|
|
//
|
|
// The crown-jewel invariant is byte-offset correctness: entity Start/End are
|
|
// half-open BYTE offsets into the original UTF-8 text, and the model's emitted
|
|
// text for a span must equal the corresponding byte slice. We assert that two
|
|
// ways — directly against ModelTokenClassify (raw, Threshold 0, no redactor
|
|
// merge) and against the /api/pii/analyze HTTP contract (post-merge,
|
|
// post-MinScore). The multibyte case proves offsets are bytes, not runes.
|
|
var _ = Describe("PII NER tier (live privacy-filter GGUF)", func() {
|
|
const (
|
|
// Reliable, unambiguous PII the multilingual NER model detects.
|
|
emailText = "Please contact John Doe at john.doe@example.com about invoice 4421."
|
|
// Multibyte chars BEFORE the email push its byte offset past its rune
|
|
// offset, so a rune/byte confusion in the engine or the Go bridge would
|
|
// surface as a mismatched slice here but not in the ASCII case above.
|
|
multibyteText = "Müller paid at café in Zürich; reach john.doe@example.com tomorrow."
|
|
)
|
|
|
|
BeforeEach(func() {
|
|
if piiNERModel == "" {
|
|
Skip("live PII NER model not wired (set PII_NER_MODEL_GGUF + REALTIME_BACKENDS_PATH; see tests-pii-ner-e2e.yml)")
|
|
}
|
|
})
|
|
|
|
Context("raw TokenClassify (byte-offset contract)", func() {
|
|
It("returns byte-correct, rune-aligned spans for an ASCII email", func() {
|
|
ents := tokenClassify(emailText)
|
|
Expect(ents).NotTo(BeEmpty(), "model must detect at least one entity in an obvious-PII sentence")
|
|
for _, e := range ents {
|
|
assertByteCorrectSpan(emailText, e.Start, e.End, e.Text)
|
|
}
|
|
Expect(spanCoversSubstring(emailText, ents, "john.doe@example.com")).To(BeTrue(),
|
|
"some detected span must cover the email address")
|
|
})
|
|
|
|
It("keeps byte offsets correct when multibyte runes precede the PII", func() {
|
|
ents := tokenClassify(multibyteText)
|
|
Expect(ents).NotTo(BeEmpty())
|
|
for _, e := range ents {
|
|
// This is the assertion that fails if offsets were computed in
|
|
// runes rather than bytes: the slice would be shifted left.
|
|
assertByteCorrectSpan(multibyteText, e.Start, e.End, e.Text)
|
|
}
|
|
Expect(spanCoversSubstring(multibyteText, ents, "john.doe@example.com")).To(BeTrue())
|
|
})
|
|
})
|
|
|
|
Context("HTTP /api/pii/analyze", func() {
|
|
It("reports ner-source entities with byte-correct offsets", func() {
|
|
status, resp := analyze(schema.PIIAnalyzeRequest{
|
|
Text: emailText,
|
|
Detectors: []string{piiNERModel},
|
|
})
|
|
Expect(status).To(Equal(http.StatusOK))
|
|
Expect(resp.Entities).NotTo(BeEmpty())
|
|
for _, e := range resp.Entities {
|
|
Expect(e.Source).To(Equal("ner"), "privacy-filter detections must be tagged source=ner")
|
|
Expect(e.Action).To(Equal("mask"), "default_action mask must propagate to each entity")
|
|
assertByteCorrectSpan(emailText, e.Start, e.End, emailText[e.Start:e.End])
|
|
Expect(e.Score).To(BeNumerically(">=", 0.5), "below-MinScore spans are dropped before the response")
|
|
}
|
|
})
|
|
})
|
|
|
|
Context("HTTP /api/pii/redact", func() {
|
|
It("masks detected PII out of the returned text", func() {
|
|
status, body := redact(schema.PIIAnalyzeRequest{
|
|
Text: emailText,
|
|
Detectors: []string{piiNERModel},
|
|
})
|
|
Expect(status).To(Equal(http.StatusOK))
|
|
var resp schema.PIIRedactResponse
|
|
Expect(json.Unmarshal(body, &resp)).To(Succeed())
|
|
Expect(resp.Masked).To(BeTrue())
|
|
Expect(resp.RedactedText).NotTo(Equal(emailText))
|
|
Expect(resp.RedactedText).NotTo(ContainSubstring("john.doe@example.com"),
|
|
"the masked email must not survive in the redacted body")
|
|
})
|
|
|
|
It("rejects the request with pii_blocked when an entity action is block", func() {
|
|
status, body := redact(schema.PIIAnalyzeRequest{
|
|
Text: emailText,
|
|
Detectors: []string{piiNERBlockModel},
|
|
})
|
|
Expect(status).To(Equal(http.StatusBadRequest))
|
|
Expect(string(body)).To(ContainSubstring("pii_blocked"))
|
|
Expect(string(body)).NotTo(ContainSubstring("john.doe@example.com"),
|
|
"a blocked response must never echo the raw secret")
|
|
})
|
|
})
|
|
})
|
|
|
|
// tokenClassify drives core/backend.ModelTokenClassify against the live model
|
|
// with the loader/config the running server uses — the same path the NER
|
|
// detector takes, but at Threshold 0 so we see the raw, unmerged spans.
|
|
func tokenClassify(text string) []backend.TokenEntity {
|
|
GinkgoHelper()
|
|
cfg, ok := localAIApp.ModelConfigLoader().GetModelConfig(piiNERModel)
|
|
Expect(ok).To(BeTrue(), "model config %q must be loaded", piiNERModel)
|
|
fn, err := backend.ModelTokenClassify(text, backend.TokenClassifyOptions{},
|
|
localAIApp.ModelLoader(), cfg, localAIApp.ApplicationConfig())
|
|
Expect(err).NotTo(HaveOccurred())
|
|
ents, err := fn(context.TODO())
|
|
Expect(err).NotTo(HaveOccurred())
|
|
return ents
|
|
}
|
|
|
|
// assertByteCorrectSpan is the shared byte-offset invariant: a half-open byte
|
|
// range within text, aligned to UTF-8 rune boundaries, whose slice equals the
|
|
// entity's own reported text.
|
|
func assertByteCorrectSpan(text string, start, end int, got string) {
|
|
GinkgoHelper()
|
|
Expect(start).To(BeNumerically(">=", 0))
|
|
Expect(end).To(BeNumerically(">", start))
|
|
Expect(end).To(BeNumerically("<=", len(text)))
|
|
Expect(utf8.RuneStart(text[start])).To(BeTrue(), "start %d is mid-rune in %q", start, text)
|
|
if end < len(text) {
|
|
Expect(utf8.RuneStart(text[end])).To(BeTrue(), "end %d is mid-rune in %q", end, text)
|
|
}
|
|
slice := text[start:end]
|
|
Expect(utf8.ValidString(slice)).To(BeTrue(), "span %q is not valid UTF-8", slice)
|
|
Expect(slice).To(Equal(got), "entity text must equal text[start:end]")
|
|
}
|
|
|
|
func spanCoversSubstring(text string, ents []backend.TokenEntity, sub string) bool {
|
|
lo := bytes.Index([]byte(text), []byte(sub))
|
|
if lo < 0 {
|
|
return false
|
|
}
|
|
hi := lo + len(sub)
|
|
for _, e := range ents {
|
|
// any overlap with [lo,hi)
|
|
if e.Start < hi && e.End > lo {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func analyze(req schema.PIIAnalyzeRequest) (int, schema.PIIAnalyzeResponse) {
|
|
GinkgoHelper()
|
|
status, body := postJSON("/api/pii/analyze", req)
|
|
var resp schema.PIIAnalyzeResponse
|
|
if status == http.StatusOK {
|
|
Expect(json.Unmarshal(body, &resp)).To(Succeed())
|
|
}
|
|
return status, resp
|
|
}
|
|
|
|
func redact(req schema.PIIAnalyzeRequest) (int, []byte) {
|
|
GinkgoHelper()
|
|
return postJSON("/api/pii/redact", req)
|
|
}
|
|
|
|
func postJSON(path string, payload any) (int, []byte) {
|
|
GinkgoHelper()
|
|
data, err := json.Marshal(payload)
|
|
Expect(err).NotTo(HaveOccurred())
|
|
httpResp, err := http.Post(anthropicBaseURL+path, "application/json", bytes.NewReader(data))
|
|
Expect(err).NotTo(HaveOccurred())
|
|
defer func() { _ = httpResp.Body.Close() }()
|
|
body, err := io.ReadAll(httpResp.Body)
|
|
Expect(err).NotTo(HaveOccurred())
|
|
return httpResp.StatusCode, body
|
|
}
|