feat(pii): NER tier engine — privacy-filter.cpp backend + NER-centric PII filter (#10360)

Squashed feat/pii-ner-tier-engine rebased onto master (was 45 commits; see backup/pii-ner-tier-engine-prerebase). Net change: - privacy-filter.cpp: standalone GGML engine for the openai-privacy-filter PII/NER token classifier, wired as a LocalAI gRPC backend (CPU/CUDA/Vulkan). TokenClassify moves off the patched llama.cpp path onto this backend. - PII filter reworked to be NER-centric (encoder/NER detection tier scanning whole conversations as one document), with a recreated bounded restricted-regex secret-matching pattern detector tier alongside it (per-model pii_detection.builtins / .patterns + core/services/routing/piipattern). - Detection labelled by source (ner vs pattern); backend trace / confidence / debug observability; analyze/redact exposed as a synchronous API. - Instance-wide default detector policy + per-usecase default-on; request filtering extended to completions, embeddings, edits & Ollama. - React UI: NER-centric PII editor, detector-models table, pattern/builtins editor, middleware default-policy UI. - Gallery: privacy-filter-multilingual token-classify model + NER install filter; token_classify known_usecase; batch sized to context for NER models. privacy-filter backend registered in the backend gallery (cpu/vulkan/cuda-13 meta + image entries with a capabilities map) matching its CI matrix jobs, and an /import-model auto-detect importer (PrivacyFilterImporter, narrow privacy-filter GGUF detection) replacing the prior pref-only registration. Reconciled against master's independent evolution: - Dropped master's PIIPatternOverrides feature (global-pattern runtime overrides + /api/pii/patterns API + runtime_settings.json persistence). The per-model NER + pattern-detector design supersedes it; it was built on the global redactor pattern set this branch replaced.
- Reverted the llama.cpp Score carry-patch (0006-server-task-type-score): removed the patch and restored master's grpc-server.cpp Score RPC (direct llama_decode, slot-loop bypass) and LLAMA_VERSION pin, plus master's model_config validation forbidding score + chat/completion/embeddings on llama-cpp. token_classify is unaffected (it runs on the privacy-filter backend, not llama-cpp). Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-06-19 06:09:07 -04:00 · 2026-06-18 11:45:22 +01:00
parent c133ca39dc
commit 3fa7b2955c
134 changed files with 6671 additions and 4223 deletions
--- a/core/http/endpoints/anthropic/messages.go
+++ b/core/http/endpoints/anthropic/messages.go
@@ -10,13 +10,11 @@ import (
 	"github.com/labstack/echo/v4"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/auth"
 	mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
 	openaiEndpoint "github.com/mudler/LocalAI/core/http/endpoints/openai"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services/cloudproxy"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
@@ -30,7 +28,7 @@ import (
 // @Param request body schema.AnthropicRequest true "query params"
 // @Success 200 {object} schema.AnthropicResponse "Response"
 // @Router /v1/messages [post]
-func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig, natsClient mcpTools.MCPNATSClient, piiRedactor *pii.Redactor, piiEvents pii.EventStore) echo.HandlerFunc {
+func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig, natsClient mcpTools.MCPNATSClient) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		id := uuid.New().String()

@@ -53,7 +51,7 @@ func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evalu
 		// Cloud-proxy bail. Same shape as the OpenAI chat endpoint —
 		// forwards via the cloud-proxy gRPC backend.
 		if cfg.IsCloudProxyBackendPassthrough() {
-			return forwardCloudProxyAnthropicViaBackend(c, cfg, input, piiRedactor, piiEvents, ml, appConfig)
+			return forwardCloudProxyAnthropicViaBackend(c, cfg, input, ml, appConfig)
 		}

 		// Convert Anthropic messages to OpenAI format for internal processing
@@ -141,7 +139,7 @@ func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evalu
 		xlog.Debug("Anthropic Messages - Prompt (after templating)", "prompt", predInput)

 		if input.Stream {
-			return handleAnthropicStream(c, id, input, cfg, ml, cl, appConfig, predInput, openAIReq, funcs, shouldUseFn, mcpExecutor, evaluator, piiRedactor, piiEvents)
+			return handleAnthropicStream(c, id, input, cfg, ml, cl, appConfig, predInput, openAIReq, funcs, shouldUseFn, mcpExecutor, evaluator)
 		}

 		return handleAnthropicNonStream(c, id, input, cfg, ml, cl, appConfig, predInput, openAIReq, funcs, shouldUseFn, mcpExecutor, evaluator)
@@ -330,36 +328,13 @@ func handleAnthropicNonStream(c echo.Context, id string, input *schema.Anthropic
 	return sendAnthropicError(c, 500, "api_error", "MCP iteration limit reached")
 }

-func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicRequest, cfg *config.ModelConfig, ml *model.ModelLoader, cl *config.ModelConfigLoader, appConfig *config.ApplicationConfig, predInput string, openAIReq *schema.OpenAIRequest, funcs functions.Functions, shouldUseFn bool, mcpExecutor mcpTools.ToolExecutor, evaluator *templates.Evaluator, piiRedactor *pii.Redactor, piiEvents pii.EventStore) error {
+func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicRequest, cfg *config.ModelConfig, ml *model.ModelLoader, cl *config.ModelConfigLoader, appConfig *config.ApplicationConfig, predInput string, openAIReq *schema.OpenAIRequest, funcs functions.Functions, shouldUseFn bool, mcpExecutor mcpTools.ToolExecutor, evaluator *templates.Evaluator) error {
 	c.Response().Header().Set("Content-Type", "text/event-stream")
 	c.Response().Header().Set("Cache-Control", "no-cache")
 	c.Response().Header().Set("Connection", "keep-alive")

-	// Per-stream PII filter — same gating as the OpenAI chat path. The
-	// filter is wire-format-agnostic; we feed it the text portion of
-	// each text_delta and emit only what's safe to send. The filter
-	// holds back a tail of size MaxPatternLength-1 so a pattern split
-	// across chunk boundaries still gets masked. When PII is disabled
-	// for this model the filter is nil and emits flow unchanged.
-	var streamPIIFilter *pii.StreamFilter
-	if piiRedactor != nil && cfg.PIIIsEnabled() {
-		correlationID := c.Request().Header.Get("x-request-id")
-		userID := ""
-		if u := auth.GetUser(c); u != nil {
-			userID = u.ID
-		}
-		var overrides map[string]pii.Action
-		if raw := cfg.PIIPatternOverrides(); len(raw) > 0 {
-			overrides = make(map[string]pii.Action, len(raw))
-			for ovid, action := range raw {
-				switch pii.Action(action) {
-				case pii.ActionMask, pii.ActionBlock, pii.ActionAllow:
-					overrides[ovid] = pii.Action(action)
-				}
-			}
-		}
-		streamPIIFilter = pii.NewStreamFilter(piiRedactor, overrides, piiEvents, correlationID, userID)
-	}
+	// Response/output PII redaction is out of scope for now — redaction
+	// runs request-side only (the NER middleware).

 	// Send message_start event
 	messageStart := schema.AnthropicStreamEvent{
@@ -440,7 +415,6 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq

 				if len(toolCalls) > toolCallsEmitted {
 					if !inToolCall && currentBlockIndex == 0 {
-						drainStreamPIIToText(c, streamPIIFilter, intPtr(currentBlockIndex))
 						sendAnthropicSSE(c, schema.AnthropicStreamEvent{
 							Type:  "content_block_stop",
 							Index: intPtr(currentBlockIndex),
@@ -481,20 +455,14 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 			}

 			if !inToolCall && token != "" {
-				out := token
-				if streamPIIFilter != nil {
-					out = streamPIIFilter.Push(token)
-				}
-				if out != "" {
-					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
-						Type:  "content_block_delta",
-						Index: intPtr(0),
-						Delta: &schema.AnthropicStreamDelta{
-							Type: "text_delta",
-							Text: out,
-						},
-					})
-				}
+				sendAnthropicSSE(c, schema.AnthropicStreamEvent{
+					Type:  "content_block_delta",
+					Index: intPtr(0),
+					Delta: &schema.AnthropicStreamDelta{
+						Type: "text_delta",
+						Text: token,
+					},
+				})
 			}
 			return true
 		}
@@ -532,20 +500,14 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 			// didn't already stream it (autoparser clears raw text, so
 			// accumulatedContent will be empty in that case).
 			if deltaContent != "" && !inToolCall && accumulatedContent == "" {
-				out := deltaContent
-				if streamPIIFilter != nil {
-					out = streamPIIFilter.Push(deltaContent)
-				}
-				if out != "" {
-					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
-						Type:  "content_block_delta",
-						Index: intPtr(0),
-						Delta: &schema.AnthropicStreamDelta{
-							Type: "text_delta",
-							Text: out,
-						},
-					})
-				}
+				sendAnthropicSSE(c, schema.AnthropicStreamEvent{
+					Type:  "content_block_delta",
+					Index: intPtr(0),
+					Delta: &schema.AnthropicStreamDelta{
+						Type: "text_delta",
+						Text: deltaContent,
+					},
+				})
 			}

 			// Emit tool_use blocks from ChatDeltas
@@ -553,7 +515,6 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 				collectedToolCalls = deltaToolCalls

 				if !inToolCall && currentBlockIndex == 0 {
-					drainStreamPIIToText(c, streamPIIFilter, intPtr(currentBlockIndex))
 					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
 						Type:  "content_block_stop",
 						Index: intPtr(currentBlockIndex),
@@ -657,9 +618,7 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 		if !shouldUseFn && cfg.FunctionsConfig.AutomaticToolParsingFallback && accumulatedContent != "" && toolCallsEmitted == 0 {
 			parsed := functions.ParseFunctionCall(accumulatedContent, cfg.FunctionsConfig)
 			if len(parsed) > 0 {
-				// Close the text content block (after flushing any
-				// residual the streaming PII filter held back).
-				drainStreamPIIToText(c, streamPIIFilter, intPtr(currentBlockIndex))
+				// Close the text content block.
 				sendAnthropicSSE(c, schema.AnthropicStreamEvent{
 					Type:  "content_block_stop",
 					Index: intPtr(currentBlockIndex),
@@ -699,12 +658,8 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 			}
 		}

-		// No MCP tools to execute, close stream. drainStreamPIIToText
-		// flushes any residual the streaming PII filter held back as
-		// part of its trailing pattern-window before we close the
-		// text content block.
+		// No MCP tools to execute, close the text content block.
 		if !inToolCall {
-			drainStreamPIIToText(c, streamPIIFilter, intPtr(0))
 			sendAnthropicSSE(c, schema.AnthropicStreamEvent{
 				Type:  "content_block_stop",
 				Index: intPtr(0),
@@ -752,30 +707,6 @@ func convertFuncsToOpenAITools(funcs functions.Functions) []functions.Tool {

 func intPtr(i int) *int { return &i }

-// drainStreamPIIToText flushes any residual the streaming PII filter
-// has been holding back as part of its trailing pattern-window, and
-// emits it as one final text_delta into the named block before the
-// caller closes that block. Drain is idempotent: calling it twice on
-// the same filter returns "" the second time. Safe to call with a nil
-// filter (no-op).
-func drainStreamPIIToText(c echo.Context, sf *pii.StreamFilter, index *int) {
-	if sf == nil {
-		return
-	}
-	residual := sf.Drain()
-	if residual == "" {
-		return
-	}
-	sendAnthropicSSE(c, schema.AnthropicStreamEvent{
-		Type:  "content_block_delta",
-		Index: index,
-		Delta: &schema.AnthropicStreamDelta{
-			Type: "text_delta",
-			Text: residual,
-		},
-	})
-}
-
 func sendAnthropicSSE(c echo.Context, event schema.AnthropicStreamEvent) {
 	data, err := json.Marshal(event)
 	if err != nil {
@@ -973,17 +904,14 @@ func convertAnthropicTools(input *schema.AnthropicRequest, cfg *config.ModelConf
 }

 // forwardCloudProxyAnthropicViaBackend marshals the Anthropic request,
-// constructs the streaming PII filter (when applicable), and hands the
-// body off to the cloud-proxy gRPC backend. Model swap + upstream auth
-// headers are applied inside the backend; the filter is built here
-// because the auth/correlation context only exists in the echo handler.
-func forwardCloudProxyAnthropicViaBackend(c echo.Context, cfg *config.ModelConfig, input *schema.AnthropicRequest, piiRedactor *pii.Redactor, piiEvents pii.EventStore, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error {
+// and hands the body off to the cloud-proxy gRPC backend. Model swap +
+// upstream auth headers are applied inside the backend. Request-side PII
+// redaction already ran in the middleware; the response is forwarded
+// unmodified.
+func forwardCloudProxyAnthropicViaBackend(c echo.Context, cfg *config.ModelConfig, input *schema.AnthropicRequest, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error {
 	body, err := json.Marshal(input)
 	if err != nil {
 		return sendAnthropicError(c, 400, "invalid_request_error", "cloudproxy: marshal request: "+err.Error())
 	}
-
-	correlationID := c.Request().Header.Get("x-request-id")
-	streamFilter := cloudproxy.BuildStreamFilter(c, cfg, input.Stream, piiRedactor, piiEvents, correlationID)
-	return cloudproxy.ForwardViaBackend(c, cfg, body, streamFilter, ml, appConfig)
+	return cloudproxy.ForwardViaBackend(c, cfg, body, ml, appConfig)
 }
--- a/core/http/endpoints/anthropic/messages_pii_test.go
+++ b/core/http/endpoints/anthropic/messages_pii_test.go
@@ -1,114 +0,0 @@
-package anthropic
-
-import (
-	"net/http"
-	"net/http/httptest"
-	"strings"
-
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// drainStreamPIIToText is called from four sites in messages.go and is
-// the load-bearing primitive for "the streaming filter has buffered
-// some bytes that the request just ended on; flush them as a final
-// text_delta event before closing the content block". A regression
-// here would silently truncate the last few bytes of an assistant
-// response on every PII-enabled stream — invisible without coverage.
-
-// newTestFilter compiles the default patterns and returns a filter
-// that holds back its trailing pattern-window; pushing a short string
-// (shorter than holdLen) keeps the bytes inside Drain.
-func newTestFilter() *pii.StreamFilter {
-	patterns, err := pii.Compile(pii.DefaultPatterns())
-	ExpectWithOffset(1, err).NotTo(HaveOccurred())
-	red := pii.NewRedactor(patterns)
-	return pii.NewStreamFilter(red, nil, nil, "", "")
-}
-
-// newTestContext builds a recording echo context — the recorder
-// captures the SSE bytes drainStreamPIIToText writes.
-func newTestContext() (echo.Context, *httptest.ResponseRecorder) {
-	req := httptest.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader("{}"))
-	rec := httptest.NewRecorder()
-	return echo.New().NewContext(req, rec), rec
-}
-
-var _ = Describe("drainStreamPIIToText", func() {
-	It("is a no-op when the filter is nil", func() {
-		c, rec := newTestContext()
-		drainStreamPIIToText(c, nil, intPtr(0))
-		Expect(rec.Body.Len()).To(Equal(0), "nil filter wrote %d bytes: %q", rec.Body.Len(), rec.Body.String())
-	})
-
-	It("emits nothing when the drain is empty", func() {
-		// A filter with nothing buffered should not emit a phantom event;
-		// otherwise every non-PII response would close with an empty
-		// text_delta that pollutes downstream parsers.
-		sf := newTestFilter()
-		c, rec := newTestContext()
-		drainStreamPIIToText(c, sf, intPtr(0))
-		Expect(rec.Body.Len()).To(Equal(0), "empty drain wrote %d bytes: %q", rec.Body.Len(), rec.Body.String())
-	})
-
-	It("flushes residual buffered bytes as a text_delta event", func() {
-		sf := newTestFilter()
-		// Push less than holdLen so all bytes are retained until Drain.
-		// "tail" is short enough that no pattern is plausible.
-		out := sf.Push("tail")
-		Expect(out).To(Equal(""), "Push of short text emitted %q; want all bytes held", out)
-
-		c, rec := newTestContext()
-		drainStreamPIIToText(c, sf, intPtr(2))
-
-		body := rec.Body.String()
-		// Wire format: "event: content_block_delta\ndata: {…}\n\n"
-		Expect(body).To(ContainSubstring("event: content_block_delta"))
-		Expect(body).To(ContainSubstring(`"type":"content_block_delta"`))
-		Expect(body).To(ContainSubstring(`"index":2`))
-		Expect(body).To(ContainSubstring(`"text":"tail"`))
-		Expect(body).To(ContainSubstring(`"type":"text_delta"`))
-		Expect(strings.HasSuffix(body, "\n\n")).To(BeTrue(), "SSE event missing trailing blank line: %q", body)
-	})
-
-	It("is idempotent across consecutive drains", func() {
-		// Two consecutive Drains: the filter returns "" the second time,
-		// so the second drainStreamPIIToText must emit nothing. The
-		// production path in messages.go has at least four call sites
-		// that may overlap (currentBlockIndex==0 emergency path + the
-		// unconditional drain near the end of the stream); without
-		// idempotence we'd duplicate the residual on the wire.
-		sf := newTestFilter()
-		sf.Push("tail")
-
-		c1, rec1 := newTestContext()
-		drainStreamPIIToText(c1, sf, intPtr(0))
-		first := rec1.Body.Len()
-		Expect(first).NotTo(Equal(0), "first drain emitted nothing")
-
-		c2, rec2 := newTestContext()
-		drainStreamPIIToText(c2, sf, intPtr(0))
-		Expect(rec2.Body.Len()).To(Equal(0), "second drain wrote %d bytes; want idempotent no-op: %q", rec2.Body.Len(), rec2.Body.String())
-	})
-
-	It("masks redacted residual instead of leaking it", func() {
-		// The held tail must travel through the redactor on Drain. If
-		// the bytes happen to form a complete pattern at end-of-stream,
-		// the residual emit must contain the mask placeholder, not the
-		// raw value.
-		sf := newTestFilter()
-		// "alice@example.com" is 17 bytes. holdLen for default patterns
-		// is well above 17, so this stays buffered until Drain, which
-		// then redacts it.
-		out := sf.Push("alice@example.com")
-		Expect(out).To(Equal(""), "Push emitted bytes early: %q", out)
-
-		c, rec := newTestContext()
-		drainStreamPIIToText(c, sf, intPtr(0))
-		body := rec.Body.String()
-		Expect(body).NotTo(ContainSubstring("alice@example.com"), "raw email leaked in residual emit: %q", body)
-		Expect(body).To(ContainSubstring("[REDACTED:email]"), "residual emit missing mask placeholder: %q", body)
-	})
-})
--- a/core/http/endpoints/localai/api_instructions.go
+++ b/core/http/endpoints/localai/api_instructions.go
@@ -100,15 +100,15 @@ var instructionDefs = []instructionDef{
 	},
 	{
 		Name:        "pii-filtering",
-		Description: "Inspect and tune the regex PII filter applied to chat requests",
+		Description: "Inspect the NER-based PII filter applied to chat requests",
 		Tags:        []string{"pii"},
-		Intro:       "GET /api/pii/patterns lists the active pattern set with each one's action (mask, block, allow). GET /api/pii/events returns recent redaction events filtered by correlation_id / user_id / pattern_id (admin or local-user only). POST /api/pii/test dry-runs the redactor against an admin-supplied string. POST /api/pii/decide is the programmatic decision oracle for external routers: send `{text}`, receive `{findings, suggested_action, redacted_preview}` without LocalAI mutating, recording, or acting on the call — caller composes the action with its own policy. Default patterns: email, phone, SSN, credit card (Luhn), IPv4, common API key prefixes (sk-, pk-, ghp_, github_pat_). PII is per-model: by default it is OFF for non-proxy backends and ON for backends starting with proxy-* (cloud passthroughs). Opt in with `pii: { enabled: true }` in a model's YAML; use `pii: { patterns: [{id, action}] }` to upgrade or downgrade individual actions for that model. Override global default actions via --pii-config pii.yaml; --disable-pii turns the filter off entirely.",
+		Intro:       "PII redaction is NER-based and request-side. A consuming model opts in with `pii: { enabled: true, detectors: [<model>] }` where each detector is a token-classification (token_classify) model. The detection policy lives on the detector model itself in a `pii_detection:` block: `{ min_score, default_action (mask|block|allow), entity_actions: { GROUP: action } }`. Multiple detectors union their hits; overlapping spans resolve to the strongest action (block > mask > allow). PII defaults OFF for non-proxy backends and ON for proxy-* (cloud passthroughs). Besides the inline path, two synchronous service endpoints expose the same engine without an inference request: POST /api/pii/analyze returns the detected entity spans (entity_type, source ner|pattern, start/end, score, action) without mutating the text, and POST /api/pii/redact applies the policy — returning redacted_text, or 400 (type pii_blocked) with the offending entities when a block action fires. Both take `{ text, detectors:[<model>...] }` (or `model` to inherit a consuming model's detectors), require the pii_filter feature (any authenticated user), and record audit events with an `origin` of pii_analyze / pii_redact. GET /api/pii/events returns recent redaction events filtered by correlation_id / user_id / pattern_id / origin (middleware|proxy|pii_analyze|pii_redact); events carry `<source>:<GROUP>` ids — e.g. `ner:EMAIL` for the neural detector, `pattern:ANTHROPIC_KEY` for the regex pattern tier — and an 8-char hash prefix, never the matched value (admin or local-user only). The legacy regex pattern tier and its endpoints (/api/pii/patterns, /test, /decide) were removed.",
 	},
 	{
 		Name:        "middleware-admin",
 		Description: "Inspect and configure the routing-module middleware (PII filter and routing)",
 		Tags:        []string{"middleware", "pii", "router"},
-		Intro:       "GET /api/middleware/status is the single round-trip the /app/middleware admin page reads to render the current state: active PII patterns and their actions, every model's resolved enabled/override state, recent event count, and the active routing models with their classifier configurations. Admin-only (the synthetic local user is admin in no-auth mode). PUT /api/pii/patterns/:id changes a pattern's action in-process — TRANSIENT, lost on restart. To persist, edit --pii-config YAML. GET /api/router/decisions returns the routing decision log filtered by correlation_id / user_id / router_model. The same surface is exposed as MCP tools (`get_middleware_status`, `set_pii_pattern_action`, `get_router_decisions`) for agent-driven configuration.",
+		Intro:       "GET /api/middleware/status is the single round-trip the /app/middleware admin page reads to render the current state: every model's resolved PII enabled state and the NER detector models it references, recent event count, and the active routing models with their classifier configurations. Admin-only (the synthetic local user is admin in no-auth mode). PII detection policy is edited on each detector model's `pii_detection:` block via the model-config tools/UI — there is no global pattern set to mutate. GET /api/router/decisions returns the routing decision log filtered by correlation_id / user_id / router_model. The same surface is exposed as MCP tools (`get_middleware_status`, `get_pii_events`, `get_router_decisions`) for agent-driven inspection.",
 	},
 	{
 		Name:        "intelligent-routing",
--- a/core/http/endpoints/localai/backend.go
+++ b/core/http/endpoints/localai/backend.go
@@ -25,6 +25,10 @@ var knownPrefOnlyBackends = []schema.KnownBackend{
 	// Text LLM
 	// ds4: antirez/ds4 - single-model DeepSeek V4 Flash engine; auto-detected via DS4Importer
 	{Name: "ds4", Modality: "text", AutoDetect: false, Description: "antirez/ds4 DeepSeek V4 Flash engine (auto-detected; pref-only fallback)"},
+	// privacy-filter is now auto-detected via PrivacyFilterImporter (see
+	// core/gallery/importers/privacy-filter.go); the importer registry entry
+	// supersedes any pref-only line here, which the /backends/known merge would
+	// dedupe away.
 	{Name: "sglang", Modality: "text", AutoDetect: false, Description: "SGLang runtime (preference-only)"},
 	{Name: "tinygrad", Modality: "text", AutoDetect: false, Description: "tinygrad runtime (preference-only)"},
 	{Name: "trl", Modality: "text", AutoDetect: false, Description: "Transformers Reinforcement Learning (preference-only)"},
--- a/core/http/endpoints/localai/backend_test.go
+++ b/core/http/endpoints/localai/backend_test.go
@@ -88,7 +88,20 @@ var _ = Describe("Backend Endpoints", func() {
 			}
 			Expect(names).To(ContainElements(
 				"llama-cpp", "mlx", "vllm", "transformers", "diffusers",
+				"privacy-filter",
 			))
+
+			// privacy-filter is auto-detected via PrivacyFilterImporter, so it
+			// surfaces from the importer registry (AutoDetect=true) rather than
+			// the curated pref-only slice.
+			byName := map[string]schema.KnownBackend{}
+			for _, b := range payload {
+				byName[b.Name] = b
+			}
+			pf, ok := byName["privacy-filter"]
+			Expect(ok).To(BeTrue(), "privacy-filter must be present")
+			Expect(pf.AutoDetect).To(BeTrue(), "privacy-filter is auto-detected via its importer")
+			Expect(pf.Modality).To(Equal("text"))
 		})

 		It("includes drop-in llama-cpp replacements with AutoDetect=false", func() {
--- a/core/http/endpoints/localai/config_meta.go
+++ b/core/http/endpoints/localai/config_meta.go
@@ -126,6 +126,8 @@ func AutocompleteEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, a
 				filterFn = config.BuildUsecaseFilterFn(config.FLAG_TRANSCRIPT)
 			case "score": // router classifier usecase (FLAG_SCORE); not in UsecaseInfoMap
 				filterFn = config.BuildUsecaseFilterFn(config.FLAG_SCORE)
+			case config.UsecaseTokenClassify: // PII NER detector usecase (FLAG_TOKEN_CLASSIFY)
+				filterFn = config.BuildUsecaseFilterFn(config.FLAG_TOKEN_CLASSIFY)
 			default:
 				filterFn = config.NoFilterFn
 			}
--- a/core/http/endpoints/localai/mcp.go
+++ b/core/http/endpoints/localai/mcp.go
@@ -65,7 +65,7 @@ func MCPEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 	// the per-model PII config and is kept for backward compatibility.
 	// The request-side middleware on the main chat route handles
 	// filtering for the standard /v1/chat/completions path.
-	chatHandler := openai.ChatEndpoint(cl, ml, evaluator, appConfig, natsClient, nil, nil, nil)
+	chatHandler := openai.ChatEndpoint(cl, ml, evaluator, appConfig, natsClient, nil)

 	return func(c echo.Context) error {
 		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
--- a/core/http/endpoints/localai/pii.go
+++ b/core/http/endpoints/localai/pii.go
@@ -0,0 +1,248 @@
+package localai
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/application"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/auth"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+)
+
+// ErrNoDetectors is returned by RunPIIScan when neither an explicit detector
+// list nor a model's effective PII policy resolve to anything to scan with —
+// including a model that has PII disabled, or one that is enabled but names
+// no detectors while no instance-wide default is set. The handler maps it to
+// 400: the truthful answer is "the middleware would scan nothing", and
+// surfacing that loudly beats implying a clean scan happened.
+var ErrNoDetectors = errors.New("no PII detectors specified")
+
+// ErrUnknownDetector is returned when a named detector model cannot be
+// resolved. Wrapped (errors.Is) so the handler can map it to 400 — a bad
+// detector name is a client error, distinct from a detector that resolved but
+// failed at scan time (mapped to 502, fail-closed).
+var ErrUnknownDetector = errors.New("unknown PII detector")
+
+// RunPIIScan is the echo-free core shared by /api/pii/analyze and
+// /api/pii/redact: it picks the detectors to use, resolves each one, and runs
+// the shared NER/pattern redaction pipeline over text. Keeping it free of HTTP
+// types lets the selection and scan logic be unit-tested with a fake resolver.
+//
+// Detector selection mirrors the inline chat middleware (middleware.go). An
+// explicit names list always wins. Otherwise, when model names a known config,
+// policy (Application.ResolvePIIPolicy) supplies the model's effective
+// detectors — its own pii.detectors, else the instance-wide
+// PIIDefaultDetectors, and none when PII is disabled on the model. A nil
+// policy (unit tests) reads the model's raw pii.detectors instead.
+//
+// It returns ErrNoDetectors when nothing is selected, and fails closed with a
+// wrapped ErrUnknownDetector rather than scanning with fewer detectors than asked.
+func RunPIIScan(ctx context.Context, resolver pii.NERDetectorResolver, cl *config.ModelConfigLoader, policy pii.PolicyResolver, names []string, model, text string) (pii.Result, error) {
+	// Fall back to the consuming model's policy only when no explicit list was given.
+	if len(names) == 0 && cl != nil && model != "" {
+		if modelCfg, found := cl.GetModelConfig(model); found {
+			if policy == nil {
+				names = modelCfg.PIIDetectors()
+			} else if on, effective := policy(&modelCfg); on {
+				names = effective
+			}
+		}
+	}
+	if len(names) == 0 {
+		return pii.Result{}, ErrNoDetectors
+	}
+
+	// Resolve every name up front so a single bad name aborts before any scan.
+	resolved := make([]pii.NERConfig, 0, len(names))
+	for _, detector := range names {
+		nerCfg, known := resolver(detector)
+		if !known {
+			return pii.Result{}, fmt.Errorf("%w: %q", ErrUnknownDetector, detector)
+		}
+		resolved = append(resolved, nerCfg)
+	}
+	return pii.RedactNER(ctx, text, resolved)
+}
+
+// piiEntities converts redaction spans into their API representation. A span's
+// Pattern is the synthetic "<source>:<GROUP>" id (e.g. "ner:EMAIL"), which is
+// split back into the source tier and the entity type. The hash prefix is
+// copied only when revealHash is set; the raw matched value is never included.
+func piiEntities(spans []pii.Span, revealHash bool) []schema.PIIEntity {
+	entities := make([]schema.PIIEntity, len(spans))
+	for i := range spans {
+		span := &spans[i]
+		tier, entityType := splitPatternID(span.Pattern)
+		entities[i] = schema.PIIEntity{
+			EntityType: entityType,
+			Source:     tier,
+			Start:      span.Start,
+			End:        span.End,
+			Score:      span.Score,
+			Action:     string(span.Action),
+		}
+		if revealHash {
+			entities[i].HashPrefix = span.HashPrefix
+		}
+	}
+	return entities
+}
+
+// splitPatternID splits a "<source>:<GROUP>" id at its first colon, so
+// "ner:EMAIL" yields ("ner", "EMAIL"). An id without a colon is returned as
+// ("", id): the whole value is treated as the group so it is never lost.
+func splitPatternID(patternID string) (source, group string) {
+	if before, after, found := strings.Cut(patternID, ":"); found {
+		return before, after
+	}
+	return "", patternID
+}
+
+// recordPIIEvents writes one audit event per span, stamped with origin so
+// /api/pii/events can be filtered to the calling API — the same per-span
+// recording the chat middleware does. A nil store makes this a no-op.
+func recordPIIEvents(store pii.EventStore, spans []pii.Span, origin pii.Origin, correlationID, userID string) {
+	if store == nil {
+		return
+	}
+	for _, span := range spans {
+		event := pii.PIIEvent{
+			ID:            pii.NewEventID(),
+			Kind:          pii.KindPII,
+			Origin:        origin,
+			CorrelationID: correlationID,
+			UserID:        userID,
+			Direction:     pii.DirectionIn,
+			PatternID:     span.Pattern,
+			ByteOffset:    span.Start,
+			Length:        span.End - span.Start,
+			HashPrefix:    span.HashPrefix,
+			Action:        span.Action,
+			Score:         span.Score,
+			CreatedAt:     time.Now().UTC(),
+		}
+		_ = store.Record(context.Background(), event) // best-effort: the store layer logs failures
+	}
+}
+
+// piiScanError turns a RunPIIScan failure into a JSON error response.
+// ErrNoDetectors and ErrUnknownDetector are selection/naming mistakes by the
+// client and map to 400 (type "invalid_request"). Anything else means a
+// detector resolved but failed at scan time: that is reported fail-closed as
+// 502 (type "pii_detector_error"), so the text is never returned unredacted.
+func piiScanError(c echo.Context, err error) error {
+	status, errType := http.StatusBadGateway, "pii_detector_error"
+	if errors.Is(err, ErrNoDetectors) || errors.Is(err, ErrUnknownDetector) {
+		status, errType = http.StatusBadRequest, "invalid_request"
+	}
+	return c.JSON(status, map[string]any{
+		"error": map[string]string{"message": err.Error(), "type": errType},
+	})
+}
+
+// piiViewer returns the user a PII request is attributed to: the authenticated
+// user when there is one, otherwise the application's fallback user (the
+// synthetic local admin in single-user mode). Handlers treat nil as unauthenticated.
+func piiViewer(c echo.Context, app *application.Application) *auth.User {
+	viewer := auth.GetUser(c)
+	if viewer == nil {
+		viewer = app.FallbackUser()
+	}
+	return viewer
+}
+
+// PIIAnalyzeEndpoint scans text and reports the detected PII entities without
+// mutating it. Detection is not enforcement: a block-class match still yields
+// 200, with Blocked set when the redact endpoint would reject the same text.
+// @Summary Detect PII entities in a string (no mutation).
+// @Description Runs the configured PII detectors (NER and/or pattern tiers) over the supplied text and returns the matched entity spans with the policy action that would fire. Detection only — the text is not modified and no block is enforced. Select detectors explicitly via `detectors`, or pass a consuming `model` to use its effective policy: the model's own `pii.detectors`, else the instance-wide `pii_default_detectors`. A model with PII disabled, or enabled with nothing to scan with, is a 400. The raw matched value is never returned; admins may set `reveal:true` for the audit hash prefix.
+// @Tags pii
+// @Param request body schema.PIIAnalyzeRequest true "text + detector selection"
+// @Success 200 {object} schema.PIIAnalyzeResponse "Detected entities"
+// @Router /api/pii/analyze [post]
+func PIIAnalyzeEndpoint(app *application.Application) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		var payload schema.PIIAnalyzeRequest
+		if bindErr := c.Bind(&payload); bindErr != nil {
+			return c.JSON(http.StatusBadRequest, map[string]any{
+				"error": map[string]string{"message": "invalid request body", "type": "invalid_request"},
+			})
+		}
+		caller := piiViewer(c, app)
+		if caller == nil {
+			return c.JSON(http.StatusUnauthorized, map[string]string{"error": "not authenticated"})
+		}
+
+		scanID := pii.NewEventID()
+		scan, err := RunPIIScan(c.Request().Context(), app.PIINERResolver(), app.ModelConfigLoader(), app.PIIPolicyResolver(), payload.Detectors, payload.Model, payload.Text)
+		if err != nil {
+			return piiScanError(c, err)
+		}
+		recordPIIEvents(app.PIIEvents(), scan.Spans, pii.OriginAnalyzeAPI, scanID, caller.ID)
+		// The audit hash prefix is shown only to an admin who asked for it.
+		showHash := caller.Role == auth.RoleAdmin && payload.Reveal
+		return c.JSON(http.StatusOK, schema.PIIAnalyzeResponse{
+			Entities:      piiEntities(scan.Spans, showHash),
+			Blocked:       scan.Blocked,
+			CorrelationID: scanID,
+		})
+	}
+}
+
+// PIIRedactEndpoint scans text and applies the configured mask/block/allow
+// policy. Without a block it answers 200 with the redacted text; when a block
+// action fires it answers 400 (type "pii_blocked") with the offending entities
+// and never a redacted body, mirroring the inline middleware's block contract.
+// @Summary Redact PII in a string by applying the configured policy.
+// @Description Runs the configured PII detectors over the text and applies each detector model's policy: masked spans are replaced with `[REDACTED:<id>]`, allow spans pass through, and a single block action causes a 400 (type `pii_blocked`) carrying the offending entities — the text is never returned in that case. Select detectors via `detectors`, or a consuming `model`'s effective policy (its own `pii.detectors`, else the instance-wide `pii_default_detectors`; PII must be enabled on the model). Records audit events (origin `pii_redact`) visible at /api/pii/events.
+// @Tags pii
+// @Param request body schema.PIIAnalyzeRequest true "text + detector selection"
+// @Success 200 {object} schema.PIIRedactResponse "Redacted text + entities"
+// @Router /api/pii/redact [post]
+func PIIRedactEndpoint(app *application.Application) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		var payload schema.PIIAnalyzeRequest
+		if bindErr := c.Bind(&payload); bindErr != nil {
+			return c.JSON(http.StatusBadRequest, map[string]any{
+				"error": map[string]string{"message": "invalid request body", "type": "invalid_request"},
+			})
+		}
+		caller := piiViewer(c, app)
+		if caller == nil {
+			return c.JSON(http.StatusUnauthorized, map[string]string{"error": "not authenticated"})
+		}
+
+		scanID := pii.NewEventID()
+		scan, err := RunPIIScan(c.Request().Context(), app.PIINERResolver(), app.ModelConfigLoader(), app.PIIPolicyResolver(), payload.Detectors, payload.Model, payload.Text)
+		if err != nil {
+			return piiScanError(c, err)
+		}
+		recordPIIEvents(app.PIIEvents(), scan.Spans, pii.OriginRedactAPI, scanID, caller.ID)
+
+		// The audit hash prefix is shown only to an admin who asked for it.
+		showHash := caller.Role == auth.RoleAdmin && payload.Reveal
+		found := piiEntities(scan.Spans, showHash)
+		if !scan.Blocked {
+			return c.JSON(http.StatusOK, schema.PIIRedactResponse{
+				RedactedText:  scan.Redacted,
+				Entities:      found,
+				Blocked:       false,
+				Masked:        scan.Masked,
+				CorrelationID: scanID,
+			})
+		}
+		// Fail closed: a block action returns no redacted text, only the
+		// reason and the offending entities — identical to the middleware.
+		return c.JSON(http.StatusBadRequest, map[string]any{
+			"error":          map[string]string{"message": "text blocked by content policy (sensitive data detected)", "type": "pii_blocked"},
+			"entities":       found,
+			"correlation_id": scanID,
+		})
+	}
+}
--- a/core/http/endpoints/localai/pii_decide.go
+++ b/core/http/endpoints/localai/pii_decide.go
@@ -1,79 +0,0 @@
-package localai
-
-import (
-	"net/http"
-
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
-)
-
-// PIIDecideEndpoint exposes the PII redactor as a decision oracle:
-// scan the supplied text and return findings + the strongest action
-// the configured pattern set would take, without rewriting the
-// caller's request or recording an audit event.
-//
-// External routers (e.g. the localai-org/platform router) call this
-// before dispatching to learn whether to mask the prompt in place,
-// block the request, or pass it through. LocalAI's in-band PII
-// middleware is the alternative path for direct-to-LocalAI clients —
-// same Redactor, different framing.
-//
-// Takes the *pii.Redactor directly rather than the whole
-// *application.Application so the handler stays unit-testable with a
-// freshly-constructed redactor (mirrors the pattern in
-// router_decide.go). The route-registration site is responsible for
-// stubbing this endpoint when --disable-pii is set so callers get a
-// 503 signalling "admin opted out" rather than a misleading allow.
-//
-// @Summary  Scan text for PII and return findings + suggested action (decision oracle)
-// @Tags     pii
-// @Accept   json
-// @Produce  json
-// @Param    request body schema.PIIDecideRequest true "decide params"
-// @Success  200 {object} schema.PIIDecideResponse
-// @Failure  400 {object} map[string]string
-// @Router   /api/pii/decide [post]
-func PIIDecideEndpoint(redactor *pii.Redactor) echo.HandlerFunc {
-	return func(c echo.Context) error {
-		var req schema.PIIDecideRequest
-		if err := c.Bind(&req); err != nil {
-			return echo.NewHTTPError(http.StatusBadRequest, "invalid request body: "+err.Error())
-		}
-		if req.Text == "" {
-			return echo.NewHTTPError(http.StatusBadRequest, "text is required")
-		}
-
-		res := redactor.Redact(req.Text)
-		findings := make([]schema.PIIFinding, len(res.Spans))
-		for i, s := range res.Spans {
-			findings[i] = schema.PIIFinding{
-				Start:      s.Start,
-				End:        s.End,
-				Pattern:    s.Pattern,
-				HashPrefix: s.HashPrefix,
-			}
-		}
-		return c.JSON(http.StatusOK, schema.PIIDecideResponse{
-			Findings:        findings,
-			SuggestedAction: suggestedAction(res),
-			RedactedPreview: res.Redacted,
-		})
-	}
-}
-
-// suggestedAction collapses the Redactor's Result flags onto a single
-// wire-format action using the in-band ordering (block > mask >
-// allow). "allow" covers both "nothing matched" and "matched but every
-// span resolved to the allow action" — in both cases the caller may
-// dispatch unchanged, with the Findings list reporting what was seen.
-func suggestedAction(res pii.Result) string {
-	switch {
-	case res.Blocked:
-		return string(pii.ActionBlock)
-	case res.Masked:
-		return string(pii.ActionMask)
-	default:
-		return string(pii.ActionAllow)
-	}
-}
--- a/core/http/endpoints/localai/pii_decide_test.go
+++ b/core/http/endpoints/localai/pii_decide_test.go
@@ -1,108 +0,0 @@
-package localai_test
-
-import (
-	"encoding/json"
-	"net/http"
-	"net/http/httptest"
-	"strings"
-
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/http/endpoints/localai"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// PIIDecideEndpoint exposes the redactor as a decision oracle. These
-// specs pin the validation surface and the suggested_action mapping
-// across the three actions (allow/mask/block). The redactor itself is
-// covered in core/services/routing/pii/redactor_test.go.
-
-var _ = Describe("PIIDecideEndpoint", func() {
-	var redactor *pii.Redactor
-
-	BeforeEach(func() {
-		patterns, err := pii.Compile(pii.DefaultPatterns())
-		Expect(err).NotTo(HaveOccurred())
-		redactor = pii.NewRedactor(patterns)
-	})
-
-	It("rejects requests with no text field", func() {
-		rec, _ := invokePIIDecide(redactor, `{}`)
-		Expect(rec.Code).To(Equal(http.StatusBadRequest))
-		Expect(rec.Body.String()).To(ContainSubstring("text is required"))
-	})
-
-	It("rejects malformed JSON", func() {
-		rec, _ := invokePIIDecide(redactor, `not json`)
-		Expect(rec.Code).To(Equal(http.StatusBadRequest))
-	})
-
-	It("returns allow for clean text", func() {
-		rec, body := invokePIIDecide(redactor, `{"text":"hello world"}`)
-		Expect(rec.Code).To(Equal(http.StatusOK))
-		Expect(body.SuggestedAction).To(Equal("allow"))
-		Expect(body.Findings).To(BeEmpty())
-		Expect(body.RedactedPreview).To(Equal("hello world"))
-	})
-
-	It("returns mask for text containing email (default action)", func() {
-		rec, body := invokePIIDecide(redactor, `{"text":"reach me at alice@example.com please"}`)
-		Expect(rec.Code).To(Equal(http.StatusOK))
-		Expect(body.SuggestedAction).To(Equal("mask"))
-		Expect(body.Findings).To(HaveLen(1))
-		Expect(body.Findings[0].Pattern).To(Equal("email"))
-		Expect(body.Findings[0].HashPrefix).NotTo(BeEmpty())
-		Expect(body.RedactedPreview).To(ContainSubstring("[REDACTED:email]"))
-		Expect(body.RedactedPreview).NotTo(ContainSubstring("alice@example.com"))
-	})
-
-	It("returns block when an api_key_prefix is present (block beats mask)", func() {
-		// api_key_prefix defaults to ActionBlock per DefaultPatterns.
-		// Mix in an email so we also confirm the block-action wins
-		// over the mask-action via actionRank.
-		rec, body := invokePIIDecide(redactor, `{"text":"my key is sk-1234567890abcdefghij and email alice@example.com"}`)
-		Expect(rec.Code).To(Equal(http.StatusOK))
-		Expect(body.SuggestedAction).To(Equal("block"))
-		Expect(len(body.Findings)).To(BeNumerically(">=", 1))
-	})
-
-	It("returns allow when a matched pattern's action is allow", func() {
-		// Downgrade the email pattern to allow for this test —
-		// exercises the allow branch of suggestedAction: a match is
-		// found, but the strongest action is allow so the suggestion
-		// is "allow" and the text is left intact.
-		Expect(redactor.SetAction("email", pii.ActionAllow)).To(Succeed())
-		rec, body := invokePIIDecide(redactor, `{"text":"contact alice@example.com"}`)
-		Expect(rec.Code).To(Equal(http.StatusOK))
-		Expect(body.SuggestedAction).To(Equal("allow"))
-		Expect(body.Findings).To(HaveLen(1), "allow still reports the finding")
-		// allow leaves the original text intact.
-		Expect(body.RedactedPreview).To(ContainSubstring("alice@example.com"))
-	})
-
-	It("never leaks the matched value via HashPrefix", func() {
-		rec, body := invokePIIDecide(redactor, `{"text":"alice@example.com"}`)
-		Expect(rec.Code).To(Equal(http.StatusOK))
-		Expect(body.Findings).To(HaveLen(1))
-		// HashPrefix is 8 hex chars of sha256 — definitely not the
-		// matched value, but stable so admins can correlate leaks.
-		Expect(body.Findings[0].HashPrefix).To(HaveLen(8))
-		Expect(body.Findings[0].HashPrefix).NotTo(ContainSubstring("alice"))
-	})
-})
-
-func invokePIIDecide(redactor *pii.Redactor, body string) (*httptest.ResponseRecorder, schema.PIIDecideResponse) {
-	e := echo.New()
-	e.POST("/api/pii/decide", localai.PIIDecideEndpoint(redactor))
-	req := httptest.NewRequest(http.MethodPost, "/api/pii/decide", strings.NewReader(body))
-	req.Header.Set("Content-Type", "application/json")
-	rec := httptest.NewRecorder()
-	e.ServeHTTP(rec, req)
-	var parsed schema.PIIDecideResponse
-	if rec.Code == http.StatusOK {
-		Expect(json.Unmarshal(rec.Body.Bytes(), &parsed)).To(Succeed())
-	}
-	return rec, parsed
-}
--- a/core/http/endpoints/localai/pii_test.go
+++ b/core/http/endpoints/localai/pii_test.go
@@ -0,0 +1,258 @@
+package localai_test
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/application"
+	"github.com/mudler/LocalAI/core/config"
+	. "github.com/mudler/LocalAI/core/http/endpoints/localai"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	"github.com/mudler/LocalAI/pkg/system"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// stubDetector is an NER detector test double that returns canned results.
+type stubDetector struct {
+	ents []pii.NEREntity // entities returned by every Detect call
+	err  error           // error returned by every Detect call
+}
+
+func (s stubDetector) Detect(_ context.Context, _ string) ([]pii.NEREntity, error) {
+	return s.ents, s.err
+}
+
+var _ = Describe("RunPIIScan (resolver + scan core)", func() {
+	ctx := context.Background()
+	// resolver knows only "det": a stub reporting one EMAIL entity (0-5, score 0.9) with a mask action.
+	resolver := func(name string) (pii.NERConfig, bool) {
+		if name != "det" {
+			return pii.NERConfig{}, false
+		}
+		return pii.NERConfig{
+			Detector:      stubDetector{ents: []pii.NEREntity{{Group: "EMAIL", Start: 0, End: 5, Score: 0.9}}},
+			EntityActions: map[string]pii.Action{"EMAIL": pii.ActionMask},
+			Source:        pii.SourceNER,
+		}, true
+	}
+
+	It("resolves named detectors and returns their spans", func() {
+		res, err := RunPIIScan(ctx, resolver, nil, nil, []string{"det"}, "", "jane@acme.io")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(res.Spans).To(HaveLen(1))
+		Expect(res.Spans[0].Pattern).To(Equal("ner:EMAIL"))
+		Expect(res.Masked).To(BeTrue())
+	})
+
+	It("fails closed with ErrUnknownDetector for an unresolvable name", func() {
+		_, err := RunPIIScan(ctx, resolver, nil, nil, []string{"nope"}, "", "x")
+		Expect(errors.Is(err, ErrUnknownDetector)).To(BeTrue())
+	})
+
+	It("returns ErrNoDetectors when nothing is selected", func() {
+		_, err := RunPIIScan(ctx, resolver, nil, nil, nil, "", "x")
+		Expect(errors.Is(err, ErrNoDetectors)).To(BeTrue())
+	})
+})
+
+var _ = Describe("PII analyze/redact endpoints", func() {
+	var (
+		app    *application.Application
+		e      *echo.Echo
+		tmp    string
+		cancel context.CancelFunc
+	)
+
+	BeforeEach(func() {
+		var err error
+		tmp, err = os.MkdirTemp("", "pii-api-test-*")
+		Expect(err).ToNot(HaveOccurred())
+
+		var ctx context.Context
+		ctx, cancel = context.WithCancel(context.Background())
+
+		modelsDir := filepath.Join(tmp, "models")
+		Expect(os.MkdirAll(modelsDir, 0o755)).To(Succeed())
+
+		st, err := system.GetSystemState(
+			system.WithModelPath(modelsDir),
+			system.WithBackendPath(filepath.Join(tmp, "backends")),
+		)
+		Expect(err).ToNot(HaveOccurred())
+
+		app, err = application.New(config.WithContext(ctx), config.WithSystemState(st))
+		Expect(err).ToNot(HaveOccurred())
+
+		// A pattern detector with two deterministic patterns: one blocks, one
+		// masks. No backend is loaded — the pattern tier runs in-process.
+		detYAML := `name: secret-filter
+backend: pattern
+pii_detection:
+  default_action: mask
+  patterns:
+    - name: SECRET
+      match: "sk-test-[A-Za-z0-9]+"
+      action: block
+    - name: TOKEN
+      match: "tok-[A-Za-z0-9]+"
+      action: mask
+`
+		// A consuming model that opts into the detector, for the model-fallback path.
+		consumerYAML := `name: chatmodel
+pii:
+  enabled: true
+  detectors: [secret-filter]
+`
+		// PII-enabled but names no detectors: scanned only when the
+		// instance-wide default detectors are set, else a 400.
+		defaultsYAML := `name: defaultsmodel
+pii:
+  enabled: true
+`
+		// Lists detectors but never enables PII — the middleware ignores it,
+		// so the model path must too.
+		disabledYAML := `name: disabledmodel
+pii:
+  detectors: [secret-filter]
+`
+		detPath := filepath.Join(modelsDir, "secret-filter.yaml")
+		consumerPath := filepath.Join(modelsDir, "chatmodel.yaml")
+		defaultsPath := filepath.Join(modelsDir, "defaultsmodel.yaml")
+		disabledPath := filepath.Join(modelsDir, "disabledmodel.yaml")
+		Expect(os.WriteFile(detPath, []byte(detYAML), 0o644)).To(Succeed())
+		Expect(os.WriteFile(consumerPath, []byte(consumerYAML), 0o644)).To(Succeed())
+		Expect(os.WriteFile(defaultsPath, []byte(defaultsYAML), 0o644)).To(Succeed())
+		Expect(os.WriteFile(disabledPath, []byte(disabledYAML), 0o644)).To(Succeed())
+		Expect(app.ModelConfigLoader().ReadModelConfig(detPath)).To(Succeed())
+		Expect(app.ModelConfigLoader().ReadModelConfig(consumerPath)).To(Succeed())
+		Expect(app.ModelConfigLoader().ReadModelConfig(defaultsPath)).To(Succeed())
+		Expect(app.ModelConfigLoader().ReadModelConfig(disabledPath)).To(Succeed())
+
+		e = echo.New()
+		e.POST("/api/pii/analyze", PIIAnalyzeEndpoint(app))
+		e.POST("/api/pii/redact", PIIRedactEndpoint(app))
+	})
+
+	AfterEach(func() {
+		cancel()
+		Expect(os.RemoveAll(tmp)).To(Succeed())
+	})
+
+	post := func(path, body string) *httptest.ResponseRecorder {
+		req := httptest.NewRequest(http.MethodPost, path, bytes.NewBufferString(body))
+		req.Header.Set("Content-Type", "application/json")
+		rec := httptest.NewRecorder()
+		e.ServeHTTP(rec, req)
+		return rec
+	}
+
+	It("analyze reports a block-class entity without mutating text (200)", func() {
+		rec := post("/api/pii/analyze", `{"text":"my key sk-test-abc123 ok","detectors":["secret-filter"]}`)
+		Expect(rec.Code).To(Equal(http.StatusOK))
+
+		var resp struct {
+			Entities []struct {
+				EntityType string `json:"entity_type"`
+				Source     string `json:"source"`
+				Action     string `json:"action"`
+			} `json:"entities"`
+			Blocked bool `json:"blocked"`
+		}
+		Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
+		Expect(resp.Blocked).To(BeTrue())
+		Expect(resp.Entities).To(HaveLen(1))
+		Expect(resp.Entities[0].EntityType).To(Equal("SECRET"))
+		Expect(resp.Entities[0].Source).To(Equal("pattern"))
+		Expect(resp.Entities[0].Action).To(Equal("block"))
+	})
+
+	It("redact masks a mask-class match and returns redacted text (200)", func() {
+		rec := post("/api/pii/redact", `{"text":"here is tok-xyz789 done","detectors":["secret-filter"]}`)
+		Expect(rec.Code).To(Equal(http.StatusOK))
+
+		var resp struct {
+			RedactedText string `json:"redacted_text"`
+			Masked       bool   `json:"masked"`
+			Blocked      bool   `json:"blocked"`
+		}
+		Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
+		Expect(resp.Masked).To(BeTrue())
+		Expect(resp.Blocked).To(BeFalse())
+		Expect(resp.RedactedText).To(ContainSubstring("[REDACTED:pattern:TOKEN]"))
+		Expect(resp.RedactedText).ToNot(ContainSubstring("tok-xyz789"))
+	})
+
+	It("redact returns 400 pii_blocked for a block-class match", func() {
+		rec := post("/api/pii/redact", `{"text":"key sk-test-abc123","detectors":["secret-filter"]}`)
+		Expect(rec.Code).To(Equal(http.StatusBadRequest))
+		Expect(rec.Body.String()).To(ContainSubstring("pii_blocked"))
+		// The raw secret must never appear in the block response.
+		Expect(rec.Body.String()).ToNot(ContainSubstring("sk-test-abc123"))
+	})
+
+	It("400s when no detector is selected", func() {
+		rec := post("/api/pii/redact", `{"text":"sk-test-abc123"}`)
+		Expect(rec.Code).To(Equal(http.StatusBadRequest))
+		Expect(rec.Body.String()).To(ContainSubstring("invalid_request"))
+	})
+
+	It("resolves detectors from a consuming model via the model field", func() {
+		rec := post("/api/pii/analyze", `{"text":"tok-aaa111","model":"chatmodel"}`)
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		var resp struct {
+			Entities []struct {
+				EntityType string `json:"entity_type"`
+			} `json:"entities"`
+		}
+		Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
+		Expect(resp.Entities).To(HaveLen(1))
+		Expect(resp.Entities[0].EntityType).To(Equal("TOKEN"))
+	})
+
+	It("400s for a PII-enabled model with no detectors and no instance default", func() {
+		rec := post("/api/pii/analyze", `{"text":"tok-aaa111","model":"defaultsmodel"}`)
+		Expect(rec.Code).To(Equal(http.StatusBadRequest))
+		Expect(rec.Body.String()).To(ContainSubstring("invalid_request"))
+	})
+
+	It("falls back to the instance-wide default detectors for an enabled model", func() {
+		defaults := []string{"secret-filter"}
+		app.ApplicationConfig().ApplyRuntimeSettings(&config.RuntimeSettings{PIIDefaultDetectors: &defaults})
+
+		rec := post("/api/pii/analyze", `{"text":"tok-aaa111","model":"defaultsmodel"}`)
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		var resp struct {
+			Entities []struct {
+				EntityType string `json:"entity_type"`
+			} `json:"entities"`
+		}
+		Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
+		Expect(resp.Entities).To(HaveLen(1))
+		Expect(resp.Entities[0].EntityType).To(Equal("TOKEN"))
+	})
+
+	It("400s for a model that lists detectors but has PII disabled, like the middleware", func() {
+		rec := post("/api/pii/analyze", `{"text":"tok-aaa111","model":"disabledmodel"}`)
+		Expect(rec.Code).To(Equal(http.StatusBadRequest))
+		Expect(rec.Body.String()).To(ContainSubstring("invalid_request"))
+	})
+
+	It("records redact-API events with origin pii_redact", func() {
+		rec := post("/api/pii/redact", `{"text":"here is tok-xyz789 done","detectors":["secret-filter"]}`)
+		Expect(rec.Code).To(Equal(http.StatusOK), "the redact call must succeed before its audit trail is checked")
+		events, err := app.PIIEvents().List(context.Background(), pii.ListQuery{Origin: pii.OriginRedactAPI})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(events).ToNot(BeEmpty())
+		Expect(events[0].PatternID).To(Equal("pattern:TOKEN"))
+		// Regression: the handler originally omitted CreatedAt, leaving the zero time.
+		Expect(events[0].CreatedAt.IsZero()).To(BeFalse())
+	})
+})
--- a/core/http/endpoints/mcp/localai_assistant_test.go
+++ b/core/http/endpoints/mcp/localai_assistant_test.go
@@ -22,25 +22,31 @@ type stubClient struct{}
 func (stubClient) GallerySearch(_ context.Context, _ localaitools.GallerySearchQuery) ([]gallery.Metadata, error) {
 	return []gallery.Metadata{{Name: "stub", Gallery: config.Gallery{Name: "stub-gallery"}}}, nil
 }
+
 func (stubClient) ListInstalledModels(_ context.Context, _ localaitools.Capability) ([]localaitools.InstalledModel, error) {
 	return []localaitools.InstalledModel{{Name: "stub"}}, nil
 }
+
 func (stubClient) ListGalleries(_ context.Context) ([]config.Gallery, error) {
 	return []config.Gallery{{Name: "stub-gallery", URL: "http://example"}}, nil
 }
+
 func (stubClient) GetJobStatus(_ context.Context, _ string) (*localaitools.JobStatus, error) {
 	return &localaitools.JobStatus{ID: "stub", Processed: true}, nil
 }
+
 func (stubClient) GetModelConfig(_ context.Context, _ string) (*localaitools.ModelConfigView, error) {
 	return &localaitools.ModelConfigView{Name: "stub"}, nil
 }
+
 func (stubClient) InstallModel(_ context.Context, _ localaitools.InstallModelRequest) (string, error) {
 	return "stub-job", nil
 }
+
 func (stubClient) ImportModelURI(_ context.Context, _ localaitools.ImportModelURIRequest) (*localaitools.ImportModelURIResponse, error) {
 	return &localaitools.ImportModelURIResponse{JobID: "stub-import"}, nil
 }
-func (stubClient) DeleteModel(_ context.Context, _ string) error  { return nil }
+func (stubClient) DeleteModel(_ context.Context, _ string) error { return nil }
 func (stubClient) EditModelConfig(_ context.Context, _ string, _ map[string]any) error {
 	return nil
 }
@@ -48,57 +54,61 @@ func (stubClient) ReloadModels(_ context.Context) error { return nil }
 func (stubClient) ListBackends(_ context.Context) ([]localaitools.Backend, error) {
 	return []localaitools.Backend{{Name: "stub-backend", Installed: true}}, nil
 }
+
 func (stubClient) ListKnownBackends(_ context.Context) ([]schema.KnownBackend, error) {
 	return []schema.KnownBackend{}, nil
 }
+
 func (stubClient) InstallBackend(_ context.Context, _ localaitools.InstallBackendRequest) (string, error) {
 	return "stub-backend-job", nil
 }
+
 func (stubClient) UpgradeBackend(_ context.Context, _ string) (string, error) {
 	return "stub-upgrade-job", nil
 }
+
 func (stubClient) SystemInfo(_ context.Context) (*localaitools.SystemInfo, error) {
 	return &localaitools.SystemInfo{Version: "stub"}, nil
 }
+
 func (stubClient) ListNodes(_ context.Context) ([]localaitools.Node, error) {
 	return []localaitools.Node{}, nil
 }
+
 func (stubClient) VRAMEstimate(_ context.Context, _ localaitools.VRAMEstimateRequest) (*vram.EstimateResult, error) {
 	return &vram.EstimateResult{SizeDisplay: "stub"}, nil
 }
-func (stubClient) ToggleModelState(_ context.Context, _ string, _ modeladmin.Action) error  { return nil }
-func (stubClient) ToggleModelPinned(_ context.Context, _ string, _ modeladmin.Action) error { return nil }
+func (stubClient) ToggleModelState(_ context.Context, _ string, _ modeladmin.Action) error {
+	return nil
+}
+func (stubClient) ToggleModelPinned(_ context.Context, _ string, _ modeladmin.Action) error {
+	return nil
+}
 func (stubClient) GetBranding(_ context.Context) (*localaitools.Branding, error) {
 	return &localaitools.Branding{InstanceName: "LocalAI"}, nil
 }
+
 func (stubClient) SetBranding(_ context.Context, _ localaitools.SetBrandingRequest) (*localaitools.Branding, error) {
 	return &localaitools.Branding{InstanceName: "LocalAI"}, nil
 }
+
 func (stubClient) GetUsageStats(_ context.Context, _ localaitools.UsageStatsQuery) (*localaitools.UsageStats, error) {
 	return &localaitools.UsageStats{Viewer: localaitools.UsageViewer{ID: "stub", Name: "stub"}, Period: "month"}, nil
 }
-func (stubClient) ListPIIPatterns(_ context.Context) ([]localaitools.PIIPattern, error) {
-	return nil, nil
-}
+
 func (stubClient) GetPIIEvents(_ context.Context, _ localaitools.PIIEventsQuery) ([]localaitools.PIIEvent, error) {
 	return nil, nil
 }
-func (stubClient) TestPIIRedaction(_ context.Context, req localaitools.PIIRedactTestRequest) (*localaitools.PIIRedactTestResult, error) {
-	return &localaitools.PIIRedactTestResult{Redacted: req.Text}, nil
-}
-func (stubClient) SetPIIPatternAction(_ context.Context, _ localaitools.PIIPatternActionUpdate) error {
-	return nil
-}
-func (stubClient) PersistPIIPatterns(_ context.Context) error { return nil }
+
 func (stubClient) GetMiddlewareStatus(_ context.Context) (*localaitools.MiddlewareStatus, error) {
 	return &localaitools.MiddlewareStatus{
 		PII: localaitools.MiddlewarePIIStatus{
 			EnabledGlobally: true,
-			Patterns:        []localaitools.PIIPattern{},
 			Models:          []localaitools.MiddlewarePIIModel{},
 		},
 	}, nil
 }
+
 func (stubClient) GetRouterDecisions(_ context.Context, _ localaitools.RouterDecisionsQuery) ([]localaitools.RouterDecision, error) {
 	return []localaitools.RouterDecision{}, nil
 }
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -14,7 +14,6 @@ import (
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services/cloudproxy"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/LocalAI/pkg/functions"
 	reason "github.com/mudler/LocalAI/pkg/reasoning"

@@ -130,7 +129,7 @@ func applyAutoparserOverride(
 // @Param request body schema.OpenAIRequest true "query params"
 // @Success 200 {object} schema.OpenAIResponse "Response"
 // @Router /v1/chat/completions [post]
-func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, startupOptions *config.ApplicationConfig, natsClient mcpTools.MCPNATSClient, assistantHolder *mcpTools.LocalAIAssistantHolder, piiRedactor *pii.Redactor, piiEvents pii.EventStore) echo.HandlerFunc {
+func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, startupOptions *config.ApplicationConfig, natsClient mcpTools.MCPNATSClient, assistantHolder *mcpTools.LocalAIAssistantHolder) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		var textContentToReturn string
 		id := uuid.New().String()
@@ -152,11 +151,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator

 		// Cloud-proxy bail. Bypasses the local pipeline (templating,
 		// MCP injection, gRPC backend) and forwards via the cloud-
-		// proxy backend, which does the outbound HTTP. The streaming
-		// PII filter still runs because its input is per-token text
-		// extracted from the wire envelope, not the envelope itself.
+		// proxy backend, which makes the outbound HTTP call. Request-side
+		// PII redaction has already run in the middleware; the response
+		// is forwarded unmodified (no response-side PII filter).
 		if config.IsCloudProxyBackendPassthrough() {
-			return forwardCloudProxyOpenAIViaBackend(c, config, input, piiRedactor, piiEvents, ml, startupOptions)
+			return forwardCloudProxyOpenAIViaBackend(c, config, input, ml, startupOptions)
 		}

 		funcs := input.Functions
@@ -327,7 +326,8 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 						"message": map[string]any{
 							"type":        "string",
 							"description": "The message to reply the user with",
-						}},
+						},
+					},
 				},
 			}

@@ -393,14 +393,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 			c.Response().Header().Set("Connection", "keep-alive")
 			c.Response().Header().Set("X-Correlation-ID", id)

-			// Per-stream PII filter: when the resolved model has PII
-			// enabled, wrap the response content so values spanning
-			// chunk boundaries still get masked. Shared with the
-			// cloud-proxy bail below via cloudproxy.BuildStreamFilter
-			// so both paths apply the same per-model gate and override
-			// rules.
-			streamPIIFilter := cloudproxy.BuildStreamFilter(c, config, true, piiRedactor, piiEvents, id)
-
 			mcpStreamMaxIterations := 10
 			if config.Agent.MaxIterations > 0 {
 				mcpStreamMaxIterations = config.Agent.MaxIterations
@@ -476,30 +468,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 						if (hasMCPToolsStream || config.FunctionsConfig.AutomaticToolParsingFallback) && haveContent {
 							collectedContent += rawContent
 						}
-						// Stream-side PII filter: feed the content delta
-						// through the buffered-emit filter. The filter
-						// holds back a tail to handle pattern boundaries
-						// across chunks, so a Push may legitimately
-						// return "" — drop the chunk in that case rather
-						// than emitting an empty Delta to the wire.
-						if streamPIIFilter != nil && haveContent {
-							filtered := streamPIIFilter.Push(rawContent)
-							if filtered == "" {
-								// Fully buffered — skip this chunk's
-								// content. Still emit non-content chunks
-								// (role, tool_calls). When this delta is
-								// content-only and we buffer it, drop the
-								// whole event to avoid a vestigial
-								// {"delta":{}} on the wire.
-								if ev.Choices[0].Delta.Role == "" && len(ev.Choices[0].Delta.ToolCalls) == 0 && ev.Choices[0].Delta.Reasoning == nil {
-									continue
-								}
-								// Mixed delta — strip content, keep the rest.
-								ev.Choices[0].Delta.Content = nil
-							} else {
-								ev.Choices[0].Delta.Content = filtered
-							}
-						}
 						respData, err := json.Marshal(ev)
 						if err != nil {
 							xlog.Debug("Failed to marshal response", "error", err)
@@ -644,31 +612,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					}
 				}

-				// Drain the per-stream PII filter before the stop chunk
-				// so any text held back by the buffered-emit invariant
-				// reaches the client as a regular content delta. We
-				// emit it as a chunk WITHOUT a finish_reason so the
-				// next "stop" chunk still terminates the stream.
-				if streamPIIFilter != nil {
-					residual := streamPIIFilter.Drain()
-					if residual != "" {
-						drainResp := &schema.OpenAIResponse{
-							ID:      id,
-							Created: created,
-							Model:   input.Model,
-							Choices: []schema.Choice{{
-								Delta: &schema.Message{Content: residual},
-								Index: 0,
-							}},
-							Object: "chat.completion.chunk",
-						}
-						if drainBytes, err := json.Marshal(drainResp); err == nil {
-							_, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", drainBytes)
-							c.Response().Flush()
-						}
-					}
-				}
-
 				// No MCP tools to execute, send final stop message
 				finishReason := FinishReasonStop
 				if toolsCalled && len(input.Tools) > 0 {
@@ -689,7 +632,8 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 							FinishReason: &finishReason,
 							Index:        0,
 							Delta:        &schema.Message{},
-						}},
+						},
+					},
 					Object: "chat.completion.chunk",
 				}
 				respData, _ := json.Marshal(resp)
@@ -1075,7 +1019,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 }

 func handleQuestion(config *config.ModelConfig, funcResults []functions.FuncCallResults, result, prompt string) (string, error) {
-
 	if len(funcResults) == 0 && result != "" {
 		xlog.Debug("nothing function results but we had a message from the LLM")

@@ -1111,19 +1054,16 @@ func handleQuestion(config *config.ModelConfig, funcResults []functions.FuncCall
 	return "", nil
 }

-// forwardCloudProxyOpenAIViaBackend marshals the OpenAI request,
-// constructs the streaming PII filter (when this model has PII
-// enabled), and hands off to the cloud-proxy gRPC backend which does
-// the outbound HTTP. The chat endpoint owns the body+filter
-// construction because it's the only place the request lands as a
-// parsed *schema.OpenAIRequest.
-func forwardCloudProxyOpenAIViaBackend(c echo.Context, cfg *config.ModelConfig, input *schema.OpenAIRequest, piiRedactor *pii.Redactor, piiEvents pii.EventStore, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error {
+// forwardCloudProxyOpenAIViaBackend marshals the OpenAI request and
+// hands it off to the cloud-proxy gRPC backend, which makes the
+// outbound HTTP call. The chat endpoint owns the body construction
+// because it's the only place the request lands as a parsed
+// *schema.OpenAIRequest. Request-side PII redaction has already run in
+// the middleware; the response is forwarded unmodified.
+func forwardCloudProxyOpenAIViaBackend(c echo.Context, cfg *config.ModelConfig, input *schema.OpenAIRequest, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error {
 	body, err := json.Marshal(input)
 	if err != nil {
 		return echo.NewHTTPError(http.StatusBadRequest, "cloudproxy: marshal request: "+err.Error())
 	}
-
-	correlationID := c.Response().Header().Get("X-Correlation-ID")
-	streamFilter := cloudproxy.BuildStreamFilter(c, cfg, input.Stream, piiRedactor, piiEvents, correlationID)
-	return cloudproxy.ForwardViaBackend(c, cfg, body, streamFilter, ml, appConfig)
+	return cloudproxy.ForwardViaBackend(c, cfg, body, ml, appConfig)
 }
--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -9,12 +9,10 @@ import (
 	"github.com/labstack/echo/v4"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/auth"
 	"github.com/mudler/LocalAI/core/http/middleware"

 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
@@ -27,7 +25,7 @@ import (
 // @Param request body schema.OpenAIRequest true "query params"
 // @Success 200 {object} schema.OpenAIResponse "Response"
 // @Router /v1/completions [post]
-func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig, piiRedactor *pii.Redactor, piiEvents pii.EventStore) echo.HandlerFunc {
+func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) echo.HandlerFunc {
 	process := func(id string, s string, req *schema.OpenAIRequest, config *config.ModelConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool) error {
 		tokenCallback := func(s string, tokenUsage backend.TokenUsage) bool {
 			created := int(time.Now().Unix())
@@ -70,7 +68,6 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 	}

 	return func(c echo.Context) error {
-
 		created := int(time.Now().Unix())

 		// Handle Correlation
@@ -113,31 +110,8 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 				return errors.New("cannot handle more than 1 `PromptStrings` when Streaming")
 			}

-			// Per-stream PII filter — same gating as chat. /v1/completions
-			// has no chat-message structure, so request-side PII isn't
-			// wired here, but the response-side filter still catches PII
-			// trained into the model. Filter is nil when this model has
-			// PII disabled.
-			var streamPIIFilter *pii.StreamFilter
-			if piiRedactor != nil && config.PIIIsEnabled() {
-				correlationID := id
-				userID := ""
-				if u := auth.GetUser(c); u != nil {
-					userID = u.ID
-				}
-				var overrides map[string]pii.Action
-				if raw := config.PIIPatternOverrides(); len(raw) > 0 {
-					overrides = make(map[string]pii.Action, len(raw))
-					for ovid, action := range raw {
-						switch pii.Action(action) {
-						case pii.ActionMask, pii.ActionBlock, pii.ActionAllow:
-							overrides[ovid] = pii.Action(action)
-						}
-					}
-				}
-				streamPIIFilter = pii.NewStreamFilter(piiRedactor, overrides, piiEvents, correlationID, userID)
-			}
-
+			// Response/output PII redaction is out of scope for now —
+			// redaction runs request-side in the PII middleware only.
 			predInput := config.PromptStrings[0]

 			templatedInput, err := evaluator.EvaluateTemplateForPrompt(templates.CompletionPromptTemplate, *config, templates.PromptTemplateData{
@@ -179,19 +153,6 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 					// OpenAI streaming spec: intermediate chunks must NOT
 					// carry a `usage` field. Strip the tracking copy now.
 					ev.Usage = nil
-					// Run the per-chunk text through the streaming PII
-					// filter. The filter holds back a tail to handle
-					// pattern boundaries, so a Push may legitimately
-					// return "" — drop the chunk's text rather than
-					// emitting a 0-token delta. Choice.Text is the only
-					// content surface in /v1/completions chunks.
-					if streamPIIFilter != nil && ev.Choices[0].Text != "" {
-						filtered := streamPIIFilter.Push(ev.Choices[0].Text)
-						if filtered == "" {
-							continue
-						}
-						ev.Choices[0].Text = filtered
-					}
 					respData, err := json.Marshal(ev)
 					if err != nil {
 						xlog.Debug("Failed to marshal response", "error", err)
@@ -237,25 +198,6 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 				}
 			}

-			// Flush any residual the streaming PII filter held back as
-			// part of its trailing pattern-window. Emit it as one final
-			// text-bearing chunk before the synthetic stop chunk so the
-			// completion body remains a contiguous text stream.
-			if streamPIIFilter != nil {
-				if residual := streamPIIFilter.Drain(); residual != "" {
-					residualResp := schema.OpenAIResponse{
-						ID:      id,
-						Created: created,
-						Model:   input.Model,
-						Choices: []schema.Choice{{Index: 0, Text: residual}},
-						Object:  "text_completion",
-					}
-					if data, err := json.Marshal(residualResp); err == nil {
-						_, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", string(data))
-					}
-				}
-			}
-
 			stopReason := FinishReasonStop
 			resp := &schema.OpenAIResponse{
 				ID:      id,