From 6a80e2373392f00cf03b2a61cbca1b8ac0b09541 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Mon, 25 May 2026 08:28:27 +0100
Subject: [PATCH] feat(middleware): Model routing, PII filtering, Cloud model
 proxies (#9802)

Add a routing middleware stack and a cloud-proxy backend.

* cloud-proxy: a Go gRPC backend that forwards OpenAI- and
  Anthropic-shaped chat requests to upstream providers, with an
  optional translate mode (OpenAI request -> Anthropic /v1/messages
  -> OpenAI response) and full tool-calling support.

* routing: admission control, content-aware model routing
  (embedding cache + classifier + rerank + Arch-Router score),
  PII detection/redaction (regex + NER) with streaming filter and
  OpenAI/Anthropic adapters, and a per-user/per-key billing recorder
  backed by GORM or in-memory storage.

* middleware: UsageMiddleware records usage via the billing recorder,
  plus admission, route-model, usage-stamp and trace middlewares.

* observability: BackendTrace ring buffer stores full request bodies
  (capped), MITM proxy emits structured trace events, and router
  classifier decisions surface at /api/router/decide.

* gallery: Arch-Router-1.5B (Q4_K_M and Q8_0).

* UI: cloud-proxy model-editor fields, classifier system-prompt and
  score-normalization config, and a Traces page rendering request
  bodies.

Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash]

Signed-off-by: Richard Palethorpe <io@richiejp.com>
---
 .dockerignore                                 |    9 +
 .gitignore                                    |    4 +
 Makefile                                      |   15 +-
 backend/backend.proto                         |  151 +++
 backend/cpp/llama-cpp/grpc-server.cpp         |  248 ++++
 backend/go/cloud-proxy/Makefile               |   12 +
 .../go/cloud-proxy/cloud_proxy_suite_test.go  |   16 +
 backend/go/cloud-proxy/main.go                |   39 +
 backend/go/cloud-proxy/package.sh             |   13 +
 .../go/cloud-proxy/passthrough_edge_test.go   |  270 ++++
 backend/go/cloud-proxy/provider_anthropic.go  |  508 ++++++++
 .../go/cloud-proxy/provider_anthropic_test.go |  334 +++++
 backend/go/cloud-proxy/provider_edge_test.go  |  119 ++
 backend/go/cloud-proxy/provider_openai.go     |  320 +++++
 .../go/cloud-proxy/provider_openai_test.go    |  170 +++
 backend/go/cloud-proxy/proxy.go               |  429 +++++++
 backend/go/cloud-proxy/proxy_test.go          |  206 +++
 backend/go/cloud-proxy/run.sh                 |    6 +
 backend/go/cloud-proxy/toolcalls_test.go      |  232 ++++
 backend/go/local-store/debug.go               |    2 +-
 backend/go/local-store/store.go               |  686 ++++------
 backend/go/local-store/store_suite_test.go    |   13 +
 backend/go/local-store/store_test.go          |  284 +++++
 backend/python/transformers/backend.py        |   50 +-
 backend/python/vllm/backend.py                |  127 ++
 core/application/application.go               |  129 ++
 core/application/mitm.go                      |  146 +++
 core/application/router_factories.go          |   63 +
 .../runtime_settings_branding_test.go         |   22 +
 core/application/startup.go                   |  150 +++
 core/backend/embeddings.go                    |   27 +
 core/backend/options.go                       |   12 +
 core/backend/rerank.go                        |   45 +
 core/backend/score.go                         |  159 +++
 core/backend/score_test.go                    |   63 +
 core/backend/stores.go                        |   62 +
 core/cli/run.go                               |    6 +
 core/config/application_config.go             |   94 ++
 core/config/meta/constants.go                 |   13 +-
 core/config/meta/registry.go                  |  213 ++++
 core/config/meta/registry_coverage_test.go    |  258 ++++
 core/config/mitm_host_owners_test.go          |  133 ++
 core/config/model_config.go                   |  497 +++++++-
 core/config/model_config_loader.go            |   43 +
 core/config/model_config_test.go              |  158 +++
 core/config/runtime_settings.go               |   22 +
 core/config/runtime_settings_persist_test.go  |   19 +
 core/explorer/empty_db.json.lock              |    0
 core/explorer/test_db.json.lock               |    0
 core/http/app.go                              |   44 +-
 core/http/auth/usage.go                       |   28 +
 .../anthropic/anthropic_suite_test.go         |   13 +
 core/http/endpoints/anthropic/messages.go     |  141 ++-
 .../endpoints/anthropic/messages_pii_test.go  |  114 ++
 .../endpoints/localai/api_instructions.go     |   24 +
 .../localai/api_instructions_test.go          |    6 +-
 core/http/endpoints/localai/import_model.go   |   10 +-
 core/http/endpoints/localai/mcp.go            |    6 +-
 core/http/endpoints/localai/pii_decide.go     |   85 ++
 .../http/endpoints/localai/pii_decide_test.go |  107 ++
 core/http/endpoints/localai/router_decide.go  |  109 ++
 .../endpoints/localai/router_decide_test.go   |  248 ++++
 core/http/endpoints/localai/score.go          |   90 ++
 core/http/endpoints/localai/settings.go       |   10 +
 .../endpoints/mcp/localai_assistant_test.go   |   28 +
 core/http/endpoints/openai/chat.go            |  120 +-
 core/http/endpoints/openai/completion.go      |   74 +-
 core/http/endpoints/openai/edit.go            |    2 +
 core/http/endpoints/openai/embeddings.go      |    9 +
 core/http/endpoints/openai/realtime.go        |    6 +-
 core/http/endpoints/openai/realtime_model.go  |  166 ++-
 core/http/middleware/admission.go             |   81 ++
 core/http/middleware/admission_test.go        |  118 ++
 core/http/middleware/context_keys.go          |   50 +
 core/http/middleware/request.go               |   11 +
 core/http/middleware/request_test.go          |   50 +-
 core/http/middleware/route_model.go           |  603 +++++++++
 core/http/middleware/route_model_test.go      |  551 ++++++++
 core/http/middleware/trace.go                 |   12 +
 core/http/middleware/usage.go                 |  402 +++---
 core/http/middleware/usage_stamp.go           |   33 +
 core/http/middleware/usage_test.go            |  393 ++++--
 .../http/react-ui/e2e/middleware-page.spec.js |  308 +++++
 .../http/react-ui/e2e/router-template.spec.js |  219 ++++
 .../http/react-ui/e2e/usage-dashboard.spec.js |  148 +++
 .../react-ui/e2e/users-tab-gating.spec.js     |   74 ++
 core/http/react-ui/public/locales/en/nav.json |    1 +
 core/http/react-ui/src/App.css                |   18 +
 .../src/components/ConfigFieldRenderer.jsx    |   70 +-
 .../src/components/PIIPatternListEditor.jsx   |  120 ++
 .../src/components/RequireAuthEnabled.jsx     |   16 +
 .../src/components/RouterCandidatesEditor.jsx |  185 +++
 .../src/components/RouterPoliciesEditor.jsx   |  109 ++
 core/http/react-ui/src/components/Sidebar.jsx |    3 +-
 .../src/components/StructuredCodeEditor.jsx   |   80 ++
 .../react-ui/src/contexts/FormContext.jsx     |   26 +
 core/http/react-ui/src/hooks/useChat.js       |    6 +-
 core/http/react-ui/src/pages/Manage.jsx       |   15 +-
 core/http/react-ui/src/pages/Middleware.jsx   | 1108 +++++++++++++++++
 core/http/react-ui/src/pages/ModelEditor.jsx  |    3 +
 core/http/react-ui/src/pages/Traces.jsx       |   17 +
 core/http/react-ui/src/pages/Usage.jsx        |   40 +-
 core/http/react-ui/src/router.jsx             |    5 +-
 core/http/react-ui/src/utils/capabilities.js  |    1 +
 .../http/react-ui/src/utils/modelTemplates.js |   90 ++
 core/http/routes/anthropic.go                 |   29 +-
 core/http/routes/localai.go                   |    5 +
 core/http/routes/middleware.go                |  362 ++++++
 core/http/routes/ollama.go                    |    2 +-
 core/http/routes/openai.go                    |   43 +-
 core/http/routes/openresponses.go             |    6 +-
 core/http/routes/pii.go                       |  260 ++++
 core/http/routes/usage.go                     |  157 +++
 core/http/routes/usage_test.go                |  135 ++
 core/schema/localai.go                        |   95 ++
 core/schema/message.go                        |   12 +-
 core/schema/message_test.go                   |   53 +
 core/schema/openai.go                         |   50 +-
 core/schema/prediction.go                     |    8 +-
 core/services/cloudproxy/backend_forward.go   |  237 ++++
 .../cloudproxy/backend_forward_test.go        |  160 +++
 core/services/cloudproxy/build_filter_test.go |   72 ++
 core/services/cloudproxy/mitm/ca.go           |  177 +++
 core/services/cloudproxy/mitm/ca_test.go      |   79 ++
 core/services/cloudproxy/mitm/handler.go      |  442 +++++++
 core/services/cloudproxy/mitm/handler_test.go |  329 +++++
 core/services/cloudproxy/mitm/http2_test.go   |  165 +++
 core/services/cloudproxy/mitm/leaf.go         |  102 ++
 core/services/cloudproxy/mitm/leaf_test.go    |  103 ++
 .../cloudproxy/mitm/mitm_suite_test.go        |   13 +
 core/services/cloudproxy/mitm/proxy.go        |  306 +++++
 core/services/cloudproxy/mitm/proxy_test.go   |  278 +++++
 core/services/cloudproxy/mitm/response.go     |  105 ++
 core/services/cloudproxy/mitm/restart_test.go |   98 ++
 core/services/cloudproxy/proxy.go             |  125 ++
 core/services/cloudproxy/proxy_suite_test.go  |   13 +
 core/services/cloudproxy/proxy_test.go        |   38 +
 core/services/cloudproxy/ssewire/ssewire.go   |  218 ++++
 .../cloudproxy/ssewire/ssewire_suite_test.go  |   13 +
 .../cloudproxy/ssewire/ssewire_test.go        |  114 ++
 core/services/monitoring/metrics.go           |   10 +
 core/services/nodes/health_mock_test.go       |    9 +
 core/services/nodes/inflight_test.go          |   12 +
 core/services/routing/admission/admission.go  |  105 ++
 .../routing/admission/admission_suite_test.go |   13 +
 .../routing/admission/admission_test.go       |  103 ++
 core/services/routing/billing/backend.go      |   52 +
 .../routing/billing/billing_suite_test.go     |   13 +
 core/services/routing/billing/disabled.go     |   20 +
 core/services/routing/billing/gorm.go         |  111 ++
 core/services/routing/billing/inmem.go        |  157 +++
 core/services/routing/billing/inmem_test.go   |  140 +++
 core/services/routing/billing/local_user.go   |   84 ++
 .../routing/billing/local_user_test.go        |   70 ++
 core/services/routing/billing/prom.go         |  215 ++++
 .../services/routing/billing/recorder_test.go |   82 ++
 core/services/routing/contract/contract.go    |   55 +
 core/services/routing/contract/strict_off.go  |    5 +
 core/services/routing/contract/strict_on.go   |    9 +
 core/services/routing/pii/config.go           |   71 ++
 core/services/routing/pii/config_test.go      |   56 +
 core/services/routing/pii/middleware.go       |  260 ++++
 core/services/routing/pii/middleware_test.go  |  309 +++++
 core/services/routing/pii/ner.go              |   97 ++
 core/services/routing/pii/ner_test.go         |  174 +++
 core/services/routing/pii/patterns.go         |  188 +++
 core/services/routing/pii/pii_suite_test.go   |   13 +
 core/services/routing/pii/redactor.go         |  342 +++++
 .../routing/pii/redactor_race_test.go         |   66 +
 core/services/routing/pii/redactor_test.go    |  184 +++
 core/services/routing/pii/store.go            |  130 ++
 core/services/routing/pii/stream.go           |  197 +++
 core/services/routing/pii/stream_test.go      |  184 +++
 core/services/routing/pii/types.go            |  170 +++
 core/services/routing/piiadapter/anthropic.go |   81 ++
 .../routing/piiadapter/anthropic_test.go      |   69 +
 core/services/routing/piiadapter/openai.go    |  112 ++
 .../routing/piiadapter/openai_test.go         |   93 ++
 .../piiadapter/piiadapter_suite_test.go       |   13 +
 core/services/routing/router/cache.go         |   96 ++
 core/services/routing/router/decisions.go     |  166 +++
 .../routing/router/embedding_cache.go         |  227 ++++
 .../routing/router/embedding_cache_test.go    |  311 +++++
 core/services/routing/router/registry.go      |   76 ++
 core/services/routing/router/rerank.go        |  104 ++
 core/services/routing/router/rerank_test.go   |  121 ++
 core/services/routing/router/resolve.go       |  203 +++
 core/services/routing/router/resolve_test.go  |  130 ++
 .../routing/router/router_suite_test.go       |   13 +
 core/services/routing/router/score.go         |  423 +++++++
 core/services/routing/router/score_test.go    |  337 +++++
 core/services/routing/router/types.go         |  133 ++
 core/trace/backend_trace.go                   |   26 +-
 docs/content/features/cloud-proxy.md          |  232 ++++
 docs/content/features/middleware.md           |  509 ++++++++
 docs/content/features/mitm-proxy.md           |  159 +++
 gallery/bge-m3-colbert.yaml                   |   11 +
 gallery/index.yaml                            |   69 +
 go.mod                                        |    2 +-
 pkg/grpc/backend.go                           |   11 +
 pkg/grpc/base/base.go                         |    5 +
 pkg/grpc/client.go                            |  111 ++
 pkg/grpc/embed.go                             |  121 ++
 pkg/grpc/forward_test.go                      |   94 ++
 pkg/grpc/integration_toolcalls_test.go        |  147 +++
 pkg/grpc/interface.go                         |   25 +
 pkg/grpc/rich_test.go                         |  129 ++
 pkg/grpc/server.go                            |   90 +-
 pkg/mcp/localaitools/client.go                |   39 +
 pkg/mcp/localaitools/coverage_test.go         |   28 +-
 pkg/mcp/localaitools/dto.go                   |  173 +++
 pkg/mcp/localaitools/fakes_test.go            |   81 +-
 pkg/mcp/localaitools/httpapi/client.go        |  187 ++-
 pkg/mcp/localaitools/httpapi/routes.go        |   38 +-
 pkg/mcp/localaitools/inproc/client.go         |  320 ++++-
 pkg/mcp/localaitools/server.go                |    3 +
 pkg/mcp/localaitools/server_test.go           |   14 +
 pkg/mcp/localaitools/tools.go                 |    8 +
 pkg/mcp/localaitools/tools_middleware.go      |   78 ++
 pkg/mcp/localaitools/tools_pii.go             |   45 +
 pkg/mcp/localaitools/tools_usage.go           |   22 +
 pkg/model/connection_evicting_client.go       |   12 +
 pkg/store/client.go                           |   82 +-
 pkg/store/proto.go                            |   46 +
 tests/e2e-ui/main.go                          |   39 +-
 tests/e2e/cloud_proxy_helpers_test.go         |  206 +++
 tests/e2e/e2e_cloud_proxy_test.go             |  268 ++++
 tests/e2e/e2e_suite_test.go                   |  105 ++
 tests/e2e/mock-backend/main.go                |   13 +-
 229 files changed, 26339 insertions(+), 1030 deletions(-)
 create mode 100644 backend/go/cloud-proxy/Makefile
 create mode 100644 backend/go/cloud-proxy/cloud_proxy_suite_test.go
 create mode 100644 backend/go/cloud-proxy/main.go
 create mode 100755 backend/go/cloud-proxy/package.sh
 create mode 100644 backend/go/cloud-proxy/passthrough_edge_test.go
 create mode 100644 backend/go/cloud-proxy/provider_anthropic.go
 create mode 100644 backend/go/cloud-proxy/provider_anthropic_test.go
 create mode 100644 backend/go/cloud-proxy/provider_edge_test.go
 create mode 100644 backend/go/cloud-proxy/provider_openai.go
 create mode 100644 backend/go/cloud-proxy/provider_openai_test.go
 create mode 100644 backend/go/cloud-proxy/proxy.go
 create mode 100644 backend/go/cloud-proxy/proxy_test.go
 create mode 100755 backend/go/cloud-proxy/run.sh
 create mode 100644 backend/go/cloud-proxy/toolcalls_test.go
 create mode 100644 backend/go/local-store/store_suite_test.go
 create mode 100644 backend/go/local-store/store_test.go
 create mode 100644 core/application/mitm.go
 create mode 100644 core/application/router_factories.go
 create mode 100644 core/backend/score.go
 create mode 100644 core/backend/score_test.go
 create mode 100644 core/config/meta/registry_coverage_test.go
 create mode 100644 core/config/mitm_host_owners_test.go
 create mode 100644 core/explorer/empty_db.json.lock
 create mode 100644 core/explorer/test_db.json.lock
 create mode 100644 core/http/endpoints/anthropic/anthropic_suite_test.go
 create mode 100644 core/http/endpoints/anthropic/messages_pii_test.go
 create mode 100644 core/http/endpoints/localai/pii_decide.go
 create mode 100644 core/http/endpoints/localai/pii_decide_test.go
 create mode 100644 core/http/endpoints/localai/router_decide.go
 create mode 100644 core/http/endpoints/localai/router_decide_test.go
 create mode 100644 core/http/endpoints/localai/score.go
 create mode 100644 core/http/middleware/admission.go
 create mode 100644 core/http/middleware/admission_test.go
 create mode 100644 core/http/middleware/context_keys.go
 create mode 100644 core/http/middleware/route_model.go
 create mode 100644 core/http/middleware/route_model_test.go
 create mode 100644 core/http/middleware/usage_stamp.go
 create mode 100644 core/http/react-ui/e2e/middleware-page.spec.js
 create mode 100644 core/http/react-ui/e2e/router-template.spec.js
 create mode 100644 core/http/react-ui/e2e/usage-dashboard.spec.js
 create mode 100644 core/http/react-ui/e2e/users-tab-gating.spec.js
 create mode 100644 core/http/react-ui/src/components/PIIPatternListEditor.jsx
 create mode 100644 core/http/react-ui/src/components/RequireAuthEnabled.jsx
 create mode 100644 core/http/react-ui/src/components/RouterCandidatesEditor.jsx
 create mode 100644 core/http/react-ui/src/components/RouterPoliciesEditor.jsx
 create mode 100644 core/http/react-ui/src/components/StructuredCodeEditor.jsx
 create mode 100644 core/http/react-ui/src/contexts/FormContext.jsx
 create mode 100644 core/http/react-ui/src/pages/Middleware.jsx
 create mode 100644 core/http/routes/middleware.go
 create mode 100644 core/http/routes/pii.go
 create mode 100644 core/http/routes/usage.go
 create mode 100644 core/http/routes/usage_test.go
 create mode 100644 core/services/cloudproxy/backend_forward.go
 create mode 100644 core/services/cloudproxy/backend_forward_test.go
 create mode 100644 core/services/cloudproxy/build_filter_test.go
 create mode 100644 core/services/cloudproxy/mitm/ca.go
 create mode 100644 core/services/cloudproxy/mitm/ca_test.go
 create mode 100644 core/services/cloudproxy/mitm/handler.go
 create mode 100644 core/services/cloudproxy/mitm/handler_test.go
 create mode 100644 core/services/cloudproxy/mitm/http2_test.go
 create mode 100644 core/services/cloudproxy/mitm/leaf.go
 create mode 100644 core/services/cloudproxy/mitm/leaf_test.go
 create mode 100644 core/services/cloudproxy/mitm/mitm_suite_test.go
 create mode 100644 core/services/cloudproxy/mitm/proxy.go
 create mode 100644 core/services/cloudproxy/mitm/proxy_test.go
 create mode 100644 core/services/cloudproxy/mitm/response.go
 create mode 100644 core/services/cloudproxy/mitm/restart_test.go
 create mode 100644 core/services/cloudproxy/proxy.go
 create mode 100644 core/services/cloudproxy/proxy_suite_test.go
 create mode 100644 core/services/cloudproxy/proxy_test.go
 create mode 100644 core/services/cloudproxy/ssewire/ssewire.go
 create mode 100644 core/services/cloudproxy/ssewire/ssewire_suite_test.go
 create mode 100644 core/services/cloudproxy/ssewire/ssewire_test.go
 create mode 100644 core/services/routing/admission/admission.go
 create mode 100644 core/services/routing/admission/admission_suite_test.go
 create mode 100644 core/services/routing/admission/admission_test.go
 create mode 100644 core/services/routing/billing/backend.go
 create mode 100644 core/services/routing/billing/billing_suite_test.go
 create mode 100644 core/services/routing/billing/disabled.go
 create mode 100644 core/services/routing/billing/gorm.go
 create mode 100644 core/services/routing/billing/inmem.go
 create mode 100644 core/services/routing/billing/inmem_test.go
 create mode 100644 core/services/routing/billing/local_user.go
 create mode 100644 core/services/routing/billing/local_user_test.go
 create mode 100644 core/services/routing/billing/prom.go
 create mode 100644 core/services/routing/billing/recorder_test.go
 create mode 100644 core/services/routing/contract/contract.go
 create mode 100644 core/services/routing/contract/strict_off.go
 create mode 100644 core/services/routing/contract/strict_on.go
 create mode 100644 core/services/routing/pii/config.go
 create mode 100644 core/services/routing/pii/config_test.go
 create mode 100644 core/services/routing/pii/middleware.go
 create mode 100644 core/services/routing/pii/middleware_test.go
 create mode 100644 core/services/routing/pii/ner.go
 create mode 100644 core/services/routing/pii/ner_test.go
 create mode 100644 core/services/routing/pii/patterns.go
 create mode 100644 core/services/routing/pii/pii_suite_test.go
 create mode 100644 core/services/routing/pii/redactor.go
 create mode 100644 core/services/routing/pii/redactor_race_test.go
 create mode 100644 core/services/routing/pii/redactor_test.go
 create mode 100644 core/services/routing/pii/store.go
 create mode 100644 core/services/routing/pii/stream.go
 create mode 100644 core/services/routing/pii/stream_test.go
 create mode 100644 core/services/routing/pii/types.go
 create mode 100644 core/services/routing/piiadapter/anthropic.go
 create mode 100644 core/services/routing/piiadapter/anthropic_test.go
 create mode 100644 core/services/routing/piiadapter/openai.go
 create mode 100644 core/services/routing/piiadapter/openai_test.go
 create mode 100644 core/services/routing/piiadapter/piiadapter_suite_test.go
 create mode 100644 core/services/routing/router/cache.go
 create mode 100644 core/services/routing/router/decisions.go
 create mode 100644 core/services/routing/router/embedding_cache.go
 create mode 100644 core/services/routing/router/embedding_cache_test.go
 create mode 100644 core/services/routing/router/registry.go
 create mode 100644 core/services/routing/router/rerank.go
 create mode 100644 core/services/routing/router/rerank_test.go
 create mode 100644 core/services/routing/router/resolve.go
 create mode 100644 core/services/routing/router/resolve_test.go
 create mode 100644 core/services/routing/router/router_suite_test.go
 create mode 100644 core/services/routing/router/score.go
 create mode 100644 core/services/routing/router/score_test.go
 create mode 100644 core/services/routing/router/types.go
 create mode 100644 docs/content/features/cloud-proxy.md
 create mode 100644 docs/content/features/middleware.md
 create mode 100644 docs/content/features/mitm-proxy.md
 create mode 100644 gallery/bge-m3-colbert.yaml
 create mode 100644 pkg/grpc/forward_test.go
 create mode 100644 pkg/grpc/integration_toolcalls_test.go
 create mode 100644 pkg/grpc/rich_test.go
 create mode 100644 pkg/mcp/localaitools/tools_middleware.go
 create mode 100644 pkg/mcp/localaitools/tools_pii.go
 create mode 100644 pkg/mcp/localaitools/tools_usage.go
 create mode 100644 pkg/store/proto.go
 create mode 100644 tests/e2e/cloud_proxy_helpers_test.go
 create mode 100644 tests/e2e/e2e_cloud_proxy_test.go

diff --git a/.dockerignore b/.dockerignore
index 5b62e5f31..e8904d3be 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -4,6 +4,7 @@
 .devcontainer
 models
 backends
+volumes
 examples/chatbot-ui/models
 backend/go/image/stablediffusion-ggml/build/
 backend/go/*/build
@@ -21,3 +22,11 @@ __pycache__
 # backend virtual environments
 **/venv
 backend/python/**/source
+
+# In-place llama.cpp clone + per-variant build copies. The Makefile
+# clones llama.cpp itself at the pinned LLAMA_VERSION; if a stale
+# local checkout is COPY'd into the image, the `llama.cpp:` target
+# sees the directory and skips re-cloning, so grpc-server.cpp ends
+# up compiled against whatever (likely older) commit the host had.
+backend/cpp/llama-cpp/llama.cpp
+backend/cpp/llama-cpp-*-build
diff --git a/.gitignore b/.gitignore
index 08873e8b2..ec105ac85 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,10 @@ go-bert
 LocalAI
 /local-ai
 /local-ai-launcher
+# Root-level build artifacts when running `go build ./...` against
+# Go backend packages whose main lives under backend/go/.
+/cloud-proxy
+/local-store
 # prevent above rules from omitting the helm chart
 !charts/*
 # prevent above rules from omitting the api/localai folder
diff --git a/Makefile b/Makefile
index ebeef4c41..3eebc1871 100644
--- a/Makefile
+++ b/Makefile
@@ -69,7 +69,7 @@ else
 	GORELEASER=$(shell which goreleaser)
 endif
 
-TEST_PATHS?=./api/... ./pkg/... ./core/...
+TEST_PATHS?=./api/... ./pkg/... ./core/... ./backend/go/cloud-proxy/... ./backend/go/local-store/...
 
 
 .PHONY: all test build vendor lint lint-all
@@ -268,12 +268,13 @@ prepare-e2e:
 run-e2e-image:
 	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --name e2e-tests-$(RANDOM) localai-tests
 
-test-e2e: build-mock-backend prepare-e2e run-e2e-image
+test-e2e: build-mock-backend build-cloud-proxy-backend prepare-e2e run-e2e-image
 	@echo 'Running e2e tests'
 	BUILD_TYPE=$(BUILD_TYPE) \
 	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390 \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
 	$(MAKE) clean-mock-backend
+	$(MAKE) clean-cloud-proxy-backend
 	$(MAKE) teardown-e2e
 	docker rmi localai-tests
 
@@ -1064,6 +1065,7 @@ BACKEND_DS4 = ds4|ds4|.|false|false
 # Golang backends
 BACKEND_PIPER = piper|golang|.|false|true
 BACKEND_LOCAL_STORE = local-store|golang|.|false|true
+BACKEND_CLOUD_PROXY = cloud-proxy|golang|.|false|true
 BACKEND_HUGGINGFACE = huggingface|golang|.|false|true
 BACKEND_SILERO_VAD = silero-vad|golang|.|false|true
 BACKEND_STABLEDIFFUSION_GGML = stablediffusion-ggml|golang|.|--progress=plain|true
@@ -1149,6 +1151,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_TURBOQUANT)))
 $(eval $(call generate-docker-build-target,$(BACKEND_DS4)))
 $(eval $(call generate-docker-build-target,$(BACKEND_PIPER)))
 $(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE)))
+$(eval $(call generate-docker-build-target,$(BACKEND_CLOUD_PROXY)))
 $(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))
 $(eval $(call generate-docker-build-target,$(BACKEND_SILERO_VAD)))
 $(eval $(call generate-docker-build-target,$(BACKEND_STABLEDIFFUSION_GGML)))
@@ -1201,7 +1204,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SHERPA_ONNX)))
 docker-save-%: backend-images
 	docker save local-ai-backend:$* -o backend-images/$*.tar
 
-docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx
+docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy
 
 ########################################################
 ### Mock Backend for E2E Tests
@@ -1213,6 +1216,12 @@ build-mock-backend: protogen-go
 clean-mock-backend:
 	rm -f tests/e2e/mock-backend/mock-backend
 
+build-cloud-proxy-backend: protogen-go
+	$(GOCMD) build -o tests/e2e/mock-backend/cloud-proxy ./backend/go/cloud-proxy
+
+clean-cloud-proxy-backend:
+	rm -f tests/e2e/mock-backend/cloud-proxy
+
 ########################################################
 ### UI E2E Test Server
 ########################################################
diff --git a/backend/backend.proto b/backend/backend.proto
index bf07f3bd4..8a0c8e696 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -37,6 +37,22 @@ service Backend {
 
   rpc Rerank(RerankRequest) returns (RerankResult) {}
 
+  // TokenClassify runs a token-classification (NER) model on the
+  // supplied text and returns each detected entity span. Used by the
+  // PII redactor's optional NER tier — the regex tier still handles
+  // formatted hits cheaply, while this catches names, locations, and
+  // other unformatted PII that regex misses.
+  rpc TokenClassify(TokenClassifyRequest) returns (TokenClassifyResponse) {}
+
+  // Score evaluates the model's joint log-probability of each
+  // supplied candidate continuation given a shared prompt. The
+  // prompt's KV cache is computed once and reused across candidates.
+  // Used for routing-policy multi-label classification, reranking,
+  // calibrated confidence, and reward-model scoring — any task where
+  // the consumer wants the model's confidence in a pre-specified
+  // continuation rather than a generated one.
+  rpc Score(ScoreRequest) returns (ScoreResponse) {}
+
   rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
 
   rpc VAD(VADRequest) returns (VADResponse) {}
@@ -68,6 +84,23 @@ service Backend {
   rpc QuantizationProgress(QuantizationProgressRequest) returns (stream QuantizationProgressUpdate) {}
   rpc StopQuantization(QuantizationStopRequest) returns (Result) {}
 
+  // Forward proxies a raw HTTP request to an upstream provider. The
+  // cloud-proxy backend implements this for passthrough-mode model
+  // configs: the client wire format is preserved end-to-end (no
+  // translation through internal proto), which means new provider
+  // fields work the day they ship. Translation-mode proxies use the
+  // standard Predict/PredictStream RPCs instead. Backends that don't
+  // support this return UNIMPLEMENTED.
+  //
+  // The request is bidirectionally streamed so large bodies can flow
+  // without buffering. In practice the first ForwardRequest carries
+  // path, method, headers, and the initial body chunk; subsequent
+  // messages append body chunks. The first ForwardReply carries the
+  // upstream status and response headers; subsequent messages stream
+  // body chunks (SSE frames or chunked transfer). Cancellation of the
+  // gRPC context closes the upstream connection.
+  rpc Forward(stream ForwardRequest) returns (stream ForwardReply) {}
+
 }
 
 // Define the empty request
@@ -81,6 +114,76 @@ message MetricsResponse {
   int32 prompt_tokens_processed = 5;
 }
 
+// TokenClassifyRequest carries the text to classify plus an optional
+// score threshold. The transformers backend interprets threshold as
+// the minimum confidence to include in the response; 0 = include all.
+message TokenClassifyRequest {
+  string text = 1;
+  float threshold = 2;
+}
+
+// TokenClassifyEntity is one detected entity span. Byte offsets are
+// into the original UTF-8 text — start..end is a half-open range that
+// addresses the substring corresponding to entity_group.
+//
+// entity_group follows HuggingFace's aggregated-tag convention (e.g.
+// "PER", "LOC", "ORG", or a PII-specific label like "EMAIL" /
+// "SSN" depending on the model). The redactor's per-pattern action
+// map keys off this string.
+message TokenClassifyEntity {
+  string entity_group = 1;
+  int32 start = 2;
+  int32 end = 3;
+  float score = 4;
+  string text = 5;
+}
+
+message TokenClassifyResponse {
+  repeated TokenClassifyEntity entities = 1;
+}
+
+// ScoreRequest carries one shared prompt and one or more continuations
+// to score against it. The backend tokenises the prompt once and reuses
+// the resulting KV cache across all candidates in this request.
+message ScoreRequest {
+  string prompt = 1;
+  repeated string candidates = 2;
+  // Return per-token logprobs for each candidate when true. Default
+  // false to keep the wire response small; the joint log_prob field
+  // covers the common ranking case.
+  bool include_token_logprobs = 3;
+  // When true, the response also populates length_normalized_log_prob
+  // (joint log-prob divided by candidate token count). Useful when
+  // candidates differ in length and the consumer wants a per-token
+  // measure comparable across them (PMI-style scoring).
+  bool length_normalize = 4;
+}
+
+// CandidateScore is one row in the ScoreResponse, matching by index
+// the candidate in ScoreRequest.candidates.
+message CandidateScore {
+  // Sum of log P(token_i | prompt, candidate_token_<i) across the
+  // candidate's tokens. The primary ranking signal.
+  double log_prob = 1;
+  // log_prob / num_tokens — populated when length_normalize=true on
+  // the request.
+  double length_normalized_log_prob = 2;
+  // Per-token detail — populated when include_token_logprobs=true.
+  repeated TokenLogProb tokens = 3;
+  // Number of tokens the backend tokenised this candidate into, after
+  // any backend-specific normalisation (e.g. leading-space handling).
+  int32 num_tokens = 4;
+}
+
+message TokenLogProb {
+  string token = 1;
+  double log_prob = 2;
+}
+
+message ScoreResponse {
+  repeated CandidateScore candidates = 1;
+}
+
 message RerankRequest {
   string query = 1;
   repeated string documents = 2;
@@ -325,6 +428,25 @@ message ModelOptions {
   // applied verbatim to the backend's engine constructor (e.g. vLLM AsyncEngineArgs).
   // Unknown keys produce an error at LoadModel time.
   string EngineArgs = 73;
+
+  // Proxy carries the cloud-proxy backend's per-model configuration.
+  // Empty for non-proxy backends.
+  ProxyOptions Proxy = 74;
+}
+
+// ProxyOptions configures the cloud-proxy backend. UpstreamURL and
+// Mode are always meaningful; Provider only matters in translate mode.
+// The two api_key_* fields are mutually exclusive and resolved by the
+// backend at LoadModel — core forwards the references rather than the
+// plaintext key.
+message ProxyOptions {
+  string upstream_url = 1;
+  string mode = 2;
+  string provider = 3;
+  string api_key_env = 4;
+  string api_key_file = 5;
+  string upstream_model = 6;
+  int32 request_timeout_seconds = 7;
 }
 
 message Result {
@@ -1002,3 +1124,32 @@ message QuantizationStopRequest {
   string job_id = 1;
 }
 
+// ForwardHeader is one HTTP header on the request or response. Headers
+// like Authorization are typically injected by the backend (from the
+// resolved API key) rather than passed through from the client.
+message ForwardHeader {
+  string name = 1;
+  string value = 2;
+}
+
+// ForwardRequest is a streamed HTTP request to the upstream. First
+// message carries path/method/headers; subsequent messages carry
+// body_chunk only. All fields except body_chunk are honoured on the
+// first message and ignored thereafter.
+message ForwardRequest {
+  string path = 1;                          // e.g. "/v1/chat/completions" — appended to the model's upstream_url
+  string method = 2;                        // usually "POST"
+  repeated ForwardHeader headers = 3;
+  bytes body_chunk = 4;
+}
+
+// ForwardReply is a streamed HTTP response from the upstream. First
+// message carries status/headers; subsequent messages carry body_chunk
+// only. SSE responses arrive as a sequence of body_chunk frames; the
+// caller is responsible for any parsing.
+message ForwardReply {
+  int32 status = 1;
+  repeated ForwardHeader headers = 2;
+  bytes body_chunk = 3;
+}
+
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index bc51f06e5..84c584f13 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -34,6 +34,7 @@
 #include <regex>
 #include <algorithm>
 #include <atomic>
+#include <cmath>
 #include <cstdlib>
 #include <fstream>
 #include <iterator>
@@ -121,6 +122,40 @@ static std::string base64_encode_bytes(const unsigned char* data, size_t len) {
 
 bool loaded_model; // TODO: add a mutex for this, but happens only once loading the model
 
+// Score bypasses the slot loop (see the comment on Score below) so it
+// must not run concurrently with any slot-loop RPC. These counters
+// are a defence-in-depth tripwire — ModelConfig.Validate already
+// rejects llama-cpp configs that mix score with chat/completion/
+// embeddings, so a healthy deployment never trips them. seq_cst is
+// load-bearing for the increment-then-check pattern below.
+static std::atomic<int> slot_loop_inflight{0};
+static std::atomic<int> score_inflight{0};
+
+// Increment-then-check, not check-then-increment: two simultaneous
+// racers both observe the other's increment and both abort cleanly.
+// Reversed, both could see zero and proceed.
+struct conflict_guard {
+    std::atomic<int>& self;
+    conflict_guard(const char* rpc, std::atomic<int>& self_, std::atomic<int>& other, const char* other_name)
+        : self(self_) {
+        self.fetch_add(1, std::memory_order_seq_cst);
+        int o = other.load(std::memory_order_seq_cst);
+        if (o > 0) {
+            fprintf(stderr,
+                "FATAL: %s called with %s=%d. The llama-cpp backend cannot "
+                "service Score and slot-loop RPCs concurrently — Score "
+                "bypasses the slot loop and races the llama_context. Bind "
+                "Score-using features to a model dedicated to scoring "
+                "(known_usecases: [score] with no chat/completion/embeddings).\n",
+                rpc, other_name, o);
+            std::abort();
+        }
+    }
+    ~conflict_guard() {
+        self.fetch_sub(1, std::memory_order_seq_cst);
+    }
+};
+
 static std::function<void(int)> shutdown_handler;
 static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
 
@@ -1446,6 +1481,7 @@ public:
         if (params_base.model.path.empty()) {
             return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
         }
+        conflict_guard guard("PredictStream", slot_loop_inflight, score_inflight, "score_inflight");
         json data = parse_options(true, request, params_base, ctx_server.get_llama_context());
 
 
@@ -2205,6 +2241,7 @@ public:
          if (params_base.model.path.empty()) {
              return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
          }
+         conflict_guard guard("Predict", slot_loop_inflight, score_inflight, "score_inflight");
          json data = parse_options(true, request, params_base, ctx_server.get_llama_context());
 
         data["stream"] = false;
@@ -2963,6 +3000,7 @@ public:
         if (params_base.model.path.empty()) {
             return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
         }
+        conflict_guard guard("Embedding", slot_loop_inflight, score_inflight, "score_inflight");
         json body = parse_options(false, request, params_base, ctx_server.get_llama_context());
 
         body["stream"] = false;
@@ -3070,6 +3108,8 @@ public:
             return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "\"documents\" must be a non-empty string array");
         }
 
+        conflict_guard guard("Rerank", slot_loop_inflight, score_inflight, "score_inflight");
+
         // Create and queue the task
         auto rd = ctx_server.get_response_reader();
         {
@@ -3142,12 +3182,218 @@ public:
         return grpc::Status::OK;
     }
 
+    // Score returns the model's joint log-probability of each candidate
+    // continuation given a shared prompt.
+    //
+    // WHY bypass the slot/task queue: upstream server_context exposes
+    // get_llama_context as "main thread only" and the slot loop's
+    // update_slots() owns the context whenever a task is in flight.
+    // No public synchronization primitive is available — so Score is
+    // unsafe to call concurrently with active generation through this
+    // backend. In practice routing-classifier calls happen before the
+    // request is routed to a generation backend, so the model used
+    // for Score is typically idle. Concurrent Score calls are
+    // serialised by a local mutex; KV-cache state is isolated behind
+    // a dedicated sequence ID cleared between candidates.
+    //
+    // A patch to server-context.cpp that adds SERVER_TASK_TYPE_SCORE
+    // and routes scoring through the slot loop would be the correct
+    // long-term fix; tracked as a follow-up.
+    //
+    // Perf TODO (measured: ~450 ms warm for 3 candidates on Arch-
+    // Router-1.5B Q4_K_M + Intel SYCL): the current loop re-decodes
+    // `prompt + candidate` from scratch for every candidate, throwing
+    // away the prompt's KV cache between iterations. A smarter
+    // version would:
+    //   1. Decode just the prompt once into score_seq_id.
+    //   2. Snapshot/cp that sequence (llama_memory_seq_cp) into a
+    //      per-candidate sequence id.
+    //   3. For each candidate, decode only its tokens onto the copy
+    //      (continuing from the saved prompt state), read logits.
+    //   4. llama_memory_seq_rm the copy.
+    // Estimated speedup: 3-candidate calls 450 ms -> ~150-200 ms,
+    // 6-candidate calls 630 ms -> ~220 ms. Single source-file change,
+    // no proto / Go-side changes needed. Worth doing once routing is
+    // wired into the middleware and Score is on the hot path of every
+    // chat request.
+    grpc::Status Score(ServerContext* context, const backend::ScoreRequest* request, backend::ScoreResponse* response) override {
+        auto auth = checkAuth(context);
+        if (!auth.ok()) return auth;
+        if (params_base.model.path.empty()) {
+            return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
+        }
+        if (request->candidates_size() == 0) {
+            return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "candidates must be non-empty");
+        }
+
+        // Tripwire against the slot loop. Acquired before score_mutex
+        // so it fires even when this Score is queued behind another.
+        conflict_guard guard("Score", score_inflight, slot_loop_inflight, "slot_loop_inflight");
+
+        // Serialise concurrent Score calls. The slot loop is still
+        // free to race with us — see the class comment above.
+        static std::mutex score_mutex;
+        std::lock_guard<std::mutex> score_lock(score_mutex);
+
+        llama_context * lctx = ctx_server.get_llama_context();
+        if (lctx == nullptr) {
+            return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "llama context unavailable (sleeping?)");
+        }
+        const llama_vocab * vocab = ctx_server.impl->vocab;
+        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+        const int32_t n_ctx = llama_n_ctx(lctx);
+        llama_memory_t mem = llama_get_memory(lctx);
+
+        // The KV-cache is sized to seq_to_stream.size() at load
+        // (typically equal to n_slots, often 1). Sequence IDs must
+        // be in [0, n_seq_max), so we can't pick a high-value
+        // "private" ID — we have to share with the slot. We clear
+        // the cache before AND after each candidate to keep
+        // scoring isolated from whatever state the slot held, and
+        // the static mutex above guarantees no other Score call is
+        // racing in the meantime. The slot loop is still free to
+        // race (see comment on this method) — Score must not run
+        // concurrently with generation through this backend.
+        const llama_seq_id score_seq_id = 0;
+        llama_memory_seq_rm(mem, score_seq_id, -1, -1);
+
+        // Tokenize the shared prompt once with add_special=true so
+        // BOS is prepended when the model requires it. parse_special
+        // keeps chat-template markers in the prompt intact.
+        const std::string prompt = request->prompt();
+        std::vector<llama_token> prompt_tokens = common_tokenize(vocab, prompt, /*add_special=*/true, /*parse_special=*/true);
+        const int32_t prompt_len = (int32_t) prompt_tokens.size();
+
+        for (int ci = 0; ci < request->candidates_size(); ci++) {
+            const std::string & candidate_text = request->candidates(ci);
+
+            // Re-tokenize prompt + candidate as a single string. BPE
+            // merges across the boundary can shift the tokenization
+            // versus tokenize(prompt) ++ tokenize(candidate), so we
+            // find the divergence point against prompt_tokens.
+            std::vector<llama_token> full_tokens = common_tokenize(vocab, prompt + candidate_text, /*add_special=*/true, /*parse_special=*/true);
+            int32_t divergence = prompt_len;
+            const int32_t min_len = std::min<int32_t>(prompt_len, (int32_t) full_tokens.size());
+            for (int32_t i = 0; i < min_len; i++) {
+                if (prompt_tokens[i] != full_tokens[i]) {
+                    divergence = i;
+                    break;
+                }
+            }
+            const int32_t cand_len = (int32_t) full_tokens.size() - divergence;
+            backend::CandidateScore * cs = response->add_candidates();
+            cs->set_num_tokens(cand_len);
+            if (cand_len <= 0) {
+                cs->set_log_prob(0.0);
+                if (request->length_normalize()) {
+                    cs->set_length_normalized_log_prob(0.0);
+                }
+                continue;
+            }
+            if (divergence < 1) {
+                // Need at least one prior token (typically BOS) to
+                // predict the first candidate token's logit. Tokeniser
+                // models without BOS + an empty prompt fall in here.
+                return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
+                    "Score: prompt produced no leading tokens; need at least one (e.g. BOS) to predict candidate");
+            }
+            if ((int32_t) full_tokens.size() > n_ctx) {
+                return grpc::Status(grpc::StatusCode::OUT_OF_RANGE,
+                    "Score: prompt+candidate exceeds context size (got " +
+                    std::to_string(full_tokens.size()) + ", n_ctx=" + std::to_string(n_ctx) + ")");
+            }
+
+            // Build a batch covering the entire prompt+candidate. We
+            // need logits at (divergence-1) onward — those are the
+            // predictions for each candidate token.
+            llama_batch batch = llama_batch_init((int32_t) full_tokens.size(), 0, 1);
+            for (int32_t i = 0; i < (int32_t) full_tokens.size(); i++) {
+                batch.token[i]    = full_tokens[i];
+                batch.pos[i]      = i;
+                batch.n_seq_id[i] = 1;
+                batch.seq_id[i][0] = score_seq_id;
+                // logits[i] is "do we want the prediction *for the
+                // next token*, computed from this position?"
+                // We want predictions for candidate tokens at
+                // positions divergence .. full_tokens.size()-1, which
+                // come from logits at positions (divergence-1) ..
+                // (full_tokens.size()-2).
+                bool need_logit = (i >= divergence - 1) && (i < (int32_t) full_tokens.size() - 1);
+                batch.logits[i] = need_logit ? 1 : 0;
+            }
+            batch.n_tokens = (int32_t) full_tokens.size();
+
+            // Decode the batch. If decode fails (e.g. KV slot
+            // exhaustion), surface as INTERNAL — the caller will
+            // typically fall back to a sampling-based classifier.
+            int decode_err = llama_decode(lctx, batch);
+            if (decode_err != 0) {
+                llama_batch_free(batch);
+                llama_memory_seq_rm(mem, score_seq_id, -1, -1);
+                return grpc::Status(grpc::StatusCode::INTERNAL,
+                    "llama_decode failed during Score: " + std::to_string(decode_err));
+            }
+
+            // Sum log-probabilities of the actual candidate tokens.
+            double total_log_prob = 0.0;
+            for (int32_t k = 0; k < cand_len; k++) {
+                // The k-th candidate token sits at full_tokens index
+                // (divergence + k). Its predicting logit is at batch
+                // position (divergence + k - 1).
+                int32_t logit_pos = divergence + k - 1;
+                const float * logits = llama_get_logits_ith(lctx, logit_pos);
+                if (logits == nullptr) {
+                    llama_batch_free(batch);
+                    llama_memory_seq_rm(mem, score_seq_id, -1, -1);
+                    return grpc::Status(grpc::StatusCode::INTERNAL,
+                        "llama_get_logits_ith returned null at position " + std::to_string(logit_pos));
+                }
+                llama_token target_token = full_tokens[divergence + k];
+
+                // Compute log_softmax(logits)[target_token] with the
+                // max-subtraction stability trick.
+                float max_logit = logits[0];
+                for (int32_t v = 1; v < n_vocab; v++) {
+                    if (logits[v] > max_logit) max_logit = logits[v];
+                }
+                double sum_exp = 0.0;
+                for (int32_t v = 0; v < n_vocab; v++) {
+                    sum_exp += std::exp((double)(logits[v] - max_logit));
+                }
+                double token_log_prob = (double)(logits[target_token] - max_logit) - std::log(sum_exp);
+                total_log_prob += token_log_prob;
+
+                if (request->include_token_logprobs()) {
+                    backend::TokenLogProb * tlp = cs->add_tokens();
+                    std::string piece = common_token_to_piece(lctx, target_token);
+                    tlp->set_token(piece);
+                    tlp->set_log_prob(token_log_prob);
+                }
+            }
+
+            cs->set_log_prob(total_log_prob);
+            if (request->length_normalize() && cand_len > 0) {
+                cs->set_length_normalized_log_prob(total_log_prob / (double) cand_len);
+            }
+
+            llama_batch_free(batch);
+            // Drop this candidate's KV-cache contribution so the next
+            // candidate starts from a clean state. Without this, the
+            // next decode would conflict at positions 0..N-1 for our
+            // sequence ID.
+            llama_memory_seq_rm(mem, score_seq_id, -1, -1);
+        }
+
+        return grpc::Status::OK;
+    }
+
     grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response) override {
         auto auth = checkAuth(context);
         if (!auth.ok()) return auth;
         if (params_base.model.path.empty()) {
             return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
         }
+        conflict_guard guard("TokenizeString", slot_loop_inflight, score_inflight, "score_inflight");
         json body = parse_options(false, request, params_base, ctx_server.get_llama_context());
         body["stream"] = false;
 
@@ -3169,6 +3415,8 @@ public:
 
     grpc::Status GetMetrics(ServerContext* /*context*/, const backend::MetricsRequest* /*request*/, backend::MetricsResponse* response) override {
 
+        conflict_guard guard("GetMetrics", slot_loop_inflight, score_inflight, "score_inflight");
+
 // request slots data using task queue
         auto rd = ctx_server.get_response_reader();
         int task_id = rd.queue_tasks.get_new_id();
diff --git a/backend/go/cloud-proxy/Makefile b/backend/go/cloud-proxy/Makefile
new file mode 100644
index 000000000..7900905cd
--- /dev/null
+++ b/backend/go/cloud-proxy/Makefile
@@ -0,0 +1,12 @@
+GOCMD=go
+
+cloud-proxy:
+	CGO_ENABLED=0 $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o cloud-proxy ./
+
+package:
+	bash package.sh
+
+build: cloud-proxy package
+
+clean:
+	rm -f cloud-proxy
diff --git a/backend/go/cloud-proxy/cloud_proxy_suite_test.go b/backend/go/cloud-proxy/cloud_proxy_suite_test.go
new file mode 100644
index 000000000..e6c8bf322
--- /dev/null
+++ b/backend/go/cloud-proxy/cloud_proxy_suite_test.go
@@ -0,0 +1,16 @@
+package main
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Ginkgo bootstrap. The other Test* functions in this package use
+// raw testing.T and run independently; they coexist with Ginkgo
+// specs registered via Describe / Context.
+func TestCloudProxySpecs(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "cloud-proxy specs")
+}
diff --git a/backend/go/cloud-proxy/main.go b/backend/go/cloud-proxy/main.go
new file mode 100644
index 000000000..7f75efb2a
--- /dev/null
+++ b/backend/go/cloud-proxy/main.go
@@ -0,0 +1,39 @@
+package main
+
+// cloud-proxy is a LocalAI backend that forwards request traffic to an
+// external HTTP provider (OpenAI, Anthropic, etc.). Two modes:
+//
+//   - passthrough: serves the Forward RPC; the client wire format is
+//     preserved end-to-end, no translation.
+//   - translate: serves Predict/PredictStream; the backend converts
+//     internal proto to the provider's wire format. (Phases 5–6.)
+//
+// LoadModel reads UpstreamURL/Mode/Provider/key references from
+// ProxyOptions and resolves the API key once at load time.
+
+import (
+	"flag"
+	"os"
+
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+	"github.com/mudler/xlog"
+	"golang.org/x/term"
+)
+
+var addr = flag.String("addr", "localhost:50051", "the address to listen on")
+
+func main() {
+	// xlog's default handler emits ANSI color codes; that's fine for an
+	// interactive shell but unreadable when the backend's stdout is
+	// captured by LocalAI and tee'd to a log file. Force plain text when
+	// LOCALAI_LOG_FORMAT is unset and stdout isn't a terminal.
+	format := os.Getenv("LOCALAI_LOG_FORMAT")
+	if format == "" && !term.IsTerminal(int(os.Stdout.Fd())) {
+		format = xlog.TextFormat
+	}
+	xlog.SetLogger(xlog.NewLogger(xlog.LogLevel(os.Getenv("LOCALAI_LOG_LEVEL")), format))
+	flag.Parse()
+	if err := grpc.StartServer(*addr, NewCloudProxy()); err != nil {
+		panic(err)
+	}
+}
diff --git a/backend/go/cloud-proxy/package.sh b/backend/go/cloud-proxy/package.sh
new file mode 100755
index 000000000..da86cd003
--- /dev/null
+++ b/backend/go/cloud-proxy/package.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Script to copy the cloud-proxy binary into the package dir for the
+# final Dockerfile stage. Mirrors backend/go/local-store/package.sh —
+# no extra runtime libs needed since the backend is pure Go.
+
+set -e
+
+CURDIR=$(dirname "$(realpath $0)")
+
+mkdir -p $CURDIR/package
+cp -avf $CURDIR/cloud-proxy $CURDIR/package/
+cp -rfv $CURDIR/run.sh $CURDIR/package/
diff --git a/backend/go/cloud-proxy/passthrough_edge_test.go b/backend/go/cloud-proxy/passthrough_edge_test.go
new file mode 100644
index 000000000..c727026f8
--- /dev/null
+++ b/backend/go/cloud-proxy/passthrough_edge_test.go
@@ -0,0 +1,270 @@
+package main
+
+import (
+	"context"
+	"errors"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"strconv"
+	"sync"
+
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("composeURL", func() {
+	// Upstream URL convention: gallery configs put the canonical path
+	// in upstream_url, so per-request Path is ignored. A bare-host
+	// upstream_url accepts the per-request path.
+	DescribeTable("path resolution",
+		func(upstream, reqPath, want string) {
+			got, err := composeURL(upstream, reqPath)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(got).To(Equal(want))
+		},
+		Entry("full path wins", "https://api.openai.com/v1/chat/completions", "/v1/something-else", "https://api.openai.com/v1/chat/completions"),
+		Entry("bare host accepts path", "https://api.openai.com", "/v1/chat/completions", "https://api.openai.com/v1/chat/completions"),
+		Entry("root slash treated as bare", "https://api.openai.com/", "/v1/chat/completions", "https://api.openai.com/v1/chat/completions"),
+		Entry("bare host + empty path", "https://api.openai.com", "", "https://api.openai.com"),
+	)
+
+	It("returns an error on invalid upstream URL", func() {
+		_, err := composeURL("://garbage", "")
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+var _ = Describe("applyAuthHeader", func() {
+	It("sets x-api-key and anthropic-version for Anthropic, no Authorization", func() {
+		req, _ := http.NewRequest("POST", "https://example.com", nil)
+		applyAuthHeader(req, providerAnthropic, "ant-key")
+		Expect(req.Header.Get("x-api-key")).To(Equal("ant-key"))
+		Expect(req.Header.Get("anthropic-version")).NotTo(BeEmpty())
+		Expect(req.Header.Get("Authorization")).To(BeEmpty(), "Authorization must not leak on Anthropic backend")
+	})
+
+	It("sets Bearer Authorization for OpenAI, no x-api-key", func() {
+		req, _ := http.NewRequest("POST", "https://example.com", nil)
+		applyAuthHeader(req, providerOpenAI, "sk-key")
+		Expect(req.Header.Get("Authorization")).To(Equal("Bearer sk-key"))
+		Expect(req.Header.Get("x-api-key")).To(BeEmpty(), "x-api-key must not leak on OpenAI backend")
+	})
+
+	It("defaults to Bearer when provider is empty", func() {
+		// Passthrough mode often has provider == "" because the operator
+		// doesn't claim a specific upstream wire format. Most providers
+		// (including OpenAI-compatible ones) accept Bearer, so default to it.
+		req, _ := http.NewRequest("POST", "https://example.com", nil)
+		applyAuthHeader(req, "", "some-key")
+		Expect(req.Header.Get("Authorization")).To(Equal("Bearer some-key"))
+	})
+
+	It("preserves an existing anthropic-version header", func() {
+		// If the client supplied anthropic-version (rare but legitimate
+		// for an upstream pinned to a specific date), the proxy must not
+		// clobber it.
+		req, _ := http.NewRequest("POST", "https://example.com", nil)
+		req.Header.Set("anthropic-version", "2024-10-01")
+		applyAuthHeader(req, providerAnthropic, "k")
+		Expect(req.Header.Get("anthropic-version")).To(Equal("2024-10-01"))
+	})
+})
+
+var _ = Describe("isHopByHopHeader", func() {
+	DescribeTable("hop-by-hop classification",
+		func(header string, want bool) {
+			Expect(isHopByHopHeader(header)).To(Equal(want))
+		},
+		Entry("Connection is hop-by-hop", "Connection", true),
+		Entry("Keep-Alive is hop-by-hop", "Keep-Alive", true),
+		Entry("Proxy-Connection is hop-by-hop", "Proxy-Connection", true),
+		Entry("Transfer-Encoding is hop-by-hop", "Transfer-Encoding", true),
+		Entry("TE is hop-by-hop", "TE", true),
+		Entry("Trailer is hop-by-hop", "Trailer", true),
+		Entry("Upgrade is hop-by-hop", "Upgrade", true),
+		Entry("Host is hop-by-hop", "Host", true),
+		Entry("Content-Length is hop-by-hop", "Content-Length", true),
+		// Case-insensitive — RFC 7230 doesn't constrain header case.
+		Entry("lowercase connection is hop-by-hop", "connection", true),
+		Entry("uppercase HOST is hop-by-hop", "HOST", true),
+		// Non hop-by-hop — must NOT be stripped.
+		Entry("Authorization is end-to-end", "Authorization", false),
+		Entry("Content-Type is end-to-end", "Content-Type", false),
+		Entry("Accept is end-to-end", "Accept", false),
+		Entry("X-Custom is end-to-end", "X-Custom", false),
+	)
+})
+
+var _ = Describe("Forward", func() {
+	It("strips hop-by-hop and Connection headers before upstream, preserves custom headers", func() {
+		gotConnection := make(chan string, 1)
+		gotXCustom := make(chan string, 1)
+		gotHost := make(chan string, 1)
+		upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			gotConnection <- r.Header.Get("Connection")
+			gotXCustom <- r.Header.Get("X-Custom")
+			gotHost <- r.Header.Get("Host")
+			w.WriteHeader(http.StatusOK)
+		}))
+		defer upstream.Close()
+
+		cp := NewCloudProxy()
+		Expect(cp.Load(&pb.ModelOptions{
+			Proxy: &pb.ProxyOptions{
+				UpstreamUrl: upstream.URL,
+				Mode:        modePassthrough,
+			},
+		})).To(Succeed())
+
+		addr := "test://forward-hopbyhop"
+		grpc.Provide(addr, cp)
+		c := grpc.NewClient(addr, true, nil, false)
+		stream, err := c.Forward(context.Background())
+		Expect(err).NotTo(HaveOccurred())
+		Expect(stream.Send(&pb.ForwardRequest{
+			Path:   "/v1/chat/completions",
+			Method: "POST",
+			Headers: []*pb.ForwardHeader{
+				{Name: "Connection", Value: "keep-alive"},
+				{Name: "Host", Value: "spoofed.example.com"},
+				{Name: "X-Custom", Value: "preserved"},
+			},
+		})).To(Succeed())
+		Expect(stream.CloseSend()).To(Succeed())
+		_, _ = stream.Recv()
+		for {
+			if _, err := stream.Recv(); errors.Is(err, io.EOF) || err != nil {
+				break
+			}
+		}
+
+		Expect(<-gotConnection).To(BeEmpty(), "Connection must not leak to upstream")
+		Expect(<-gotHost).NotTo(Equal("spoofed.example.com"), "Host header must not be spoofed through")
+		Expect(<-gotXCustom).To(Equal("preserved"), "X-Custom header must survive")
+	})
+
+	It("replaces caller-supplied Authorization with the configured key", func() {
+		// The proxy must overwrite a client-supplied Authorization header
+		// so a downstream caller can't smuggle stale or wrong credentials.
+		gotAuth := make(chan string, 1)
+		upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			gotAuth <- r.Header.Get("Authorization")
+			w.WriteHeader(http.StatusOK)
+		}))
+		defer upstream.Close()
+
+		GinkgoT().Setenv("CLOUD_PROXY_AUTH_REPLACE_KEY", "sk-real")
+
+		cp := NewCloudProxy()
+		Expect(cp.Load(&pb.ModelOptions{
+			Proxy: &pb.ProxyOptions{
+				UpstreamUrl: upstream.URL,
+				Mode:        modePassthrough,
+				ApiKeyEnv:   "CLOUD_PROXY_AUTH_REPLACE_KEY",
+			},
+		})).To(Succeed())
+
+		addr := "test://forward-replaces-auth"
+		grpc.Provide(addr, cp)
+		c := grpc.NewClient(addr, true, nil, false)
+		stream, err := c.Forward(context.Background())
+		Expect(err).NotTo(HaveOccurred())
+		Expect(stream.Send(&pb.ForwardRequest{
+			Path:   "/v1/chat/completions",
+			Method: "POST",
+			Headers: []*pb.ForwardHeader{
+				// Client-supplied Authorization with the wrong scheme / key.
+				{Name: "Authorization", Value: "Basic Zm9vOmJhcg=="},
+			},
+		})).To(Succeed())
+		Expect(stream.CloseSend()).To(Succeed())
+		_, _ = stream.Recv()
+		for {
+			if _, err := stream.Recv(); errors.Is(err, io.EOF) || err != nil {
+				break
+			}
+		}
+
+		Expect(<-gotAuth).To(Equal("Bearer sk-real"), "caller-supplied Basic header must be replaced")
+	})
+
+	It("handles concurrent calls without interference", func() {
+		// CloudProxy explicitly omits base.SingleThread — independent
+		// Forward streams must not block each other or leak state.
+		upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			body, _ := io.ReadAll(r.Body)
+			w.WriteHeader(http.StatusOK)
+			_, _ = w.Write(body)
+		}))
+		defer upstream.Close()
+
+		cp := NewCloudProxy()
+		Expect(cp.Load(&pb.ModelOptions{
+			Proxy: &pb.ProxyOptions{
+				UpstreamUrl: upstream.URL,
+				Mode:        modePassthrough,
+			},
+		})).To(Succeed())
+		addr := "test://forward-concurrent"
+		grpc.Provide(addr, cp)
+		c := grpc.NewClient(addr, true, nil, false)
+
+		const N = 8
+		var wg sync.WaitGroup
+		errs := make(chan error, N)
+		for i := 0; i < N; i++ {
+			wg.Add(1)
+			go func(idx int) {
+				defer wg.Done()
+				stream, err := c.Forward(context.Background())
+				if err != nil {
+					errs <- err
+					return
+				}
+				payload := "request-" + string(rune('A'+idx))
+				if err := stream.Send(&pb.ForwardRequest{
+					Path:      "/v1/chat/completions",
+					Method:    "POST",
+					BodyChunk: []byte(payload),
+				}); err != nil {
+					errs <- err
+					return
+				}
+				_ = stream.CloseSend()
+				_, _ = stream.Recv()
+				var body []byte
+				for {
+					r, err := stream.Recv()
+					if errors.Is(err, io.EOF) {
+						break
+					}
+					if err != nil {
+						errs <- err
+						return
+					}
+					body = append(body, r.GetBodyChunk()...)
+				}
+				if string(body) != payload {
+					errs <- &echoMismatch{want: payload, got: string(body)}
+				}
+			}(i)
+		}
+		wg.Wait()
+		close(errs)
+		var collected []error
+		for err := range errs {
+			collected = append(collected, err)
+		}
+		Expect(collected).To(BeEmpty(), "no concurrent Forward call should fail")
+	})
+})
+
+type echoMismatch struct{ want, got string }
+
+func (e *echoMismatch) Error() string {
+	return "echo mismatch: want " + strconv.Quote(e.want) + " got " + strconv.Quote(e.got)
+}
diff --git a/backend/go/cloud-proxy/provider_anthropic.go b/backend/go/cloud-proxy/provider_anthropic.go
new file mode 100644
index 000000000..d8382d454
--- /dev/null
+++ b/backend/go/cloud-proxy/provider_anthropic.go
@@ -0,0 +1,508 @@
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/xlog"
+)
+
+// Anthropic Messages API wire-format types. Narrowed to what translate
+// mode preserves through the Reply proto: text + tool_use blocks +
+// usage tokens. Image blocks, prompt caching, metadata, and stop
+// sequence metadata are not modelled — passthrough mode covers those.
+//
+// Notable differences from OpenAI:
+//   - max_tokens is REQUIRED. Anthropic 400s without it.
+//   - Roles are user/assistant only — system messages move to a
+//     top-level `system` string field.
+//   - Streaming SSE uses event: lines alongside data: lines. The
+//     events we care about: content_block_start (carries tool_use
+//     init: id + name), content_block_delta (text_delta with text;
+//     input_json_delta with partial_json for tool arguments), and
+//     message_stop (terminates the stream). Others are ignored.
+
+type anthropicRequest struct {
+	Model         string               `json:"model"`
+	MaxTokens     int32                `json:"max_tokens"`
+	System        string               `json:"system,omitempty"`
+	Messages      []anthropicMessage   `json:"messages"`
+	Stream        bool                 `json:"stream,omitempty"`
+	Temperature   *float64             `json:"temperature,omitempty"`
+	TopP          *float64             `json:"top_p,omitempty"`
+	StopSequences []string             `json:"stop_sequences,omitempty"`
+	Tools         []anthropicTool      `json:"tools,omitempty"`
+	ToolChoice    *anthropicToolChoice `json:"tool_choice,omitempty"`
+}
+
+// Content is `any` because Anthropic accepts a bare string OR a
+// list of content blocks. Use the string form for plain user/
+// assistant turns; switch to []anthropicContentBlock when the
+// turn needs tool_use (assistant) or tool_result (user) blocks.
+type anthropicMessage struct {
+	Role    string `json:"role"`
+	Content any    `json:"content"`
+}
+
+type anthropicTool struct {
+	Name        string          `json:"name"`
+	Description string          `json:"description,omitempty"`
+	InputSchema json.RawMessage `json:"input_schema"`
+}
+
+// anthropicToolChoice mirrors the four shapes Anthropic accepts:
+// {"type":"auto"} | {"type":"any"} | {"type":"tool","name":"X"} |
+// {"type":"none"} (newer models). OpenAI's "auto"/"none"/
+// "required"/{"function":{"name":"X"}} all map here.
+type anthropicToolChoice struct {
+	Type string `json:"type"`
+	Name string `json:"name,omitempty"`
+}
+
+// anthropicContentBlock is the union shape used both for response
+// blocks (text/tool_use we read off the wire) and outbound request
+// blocks (tool_use/tool_result we emit in the conversation history).
+// Anthropic encodes tool calls inline rather than as a separate field,
+// so we walk Content[] looking for type=="tool_use" on responses and
+// produce equivalent blocks when serialising prior-turn tool calls.
+type anthropicContentBlock struct {
+	Type  string          `json:"type"`
+	Text  string          `json:"text,omitempty"`
+	ID    string          `json:"id,omitempty"`
+	Name  string          `json:"name,omitempty"`
+	Input json.RawMessage `json:"input,omitempty"`
+	// Tool-result block fields. tool_result uses `content` (not
+	// `text`) and pairs with `tool_use_id`; modelling them as
+	// distinct fields avoids ambiguity at marshal time.
+	ToolUseID     string `json:"tool_use_id,omitempty"`
+	ResultContent string `json:"content,omitempty"`
+}
+
+type anthropicResponse struct {
+	ID      string                  `json:"id"`
+	Type    string                  `json:"type"`
+	Role    string                  `json:"role"`
+	Content []anthropicContentBlock `json:"content"`
+	Model   string                  `json:"model"`
+	Usage   *anthropicUsage         `json:"usage,omitempty"`
+}
+
+type anthropicUsage struct {
+	InputTokens  int `json:"input_tokens"`
+	OutputTokens int `json:"output_tokens"`
+}
+
+// anthropicStreamEvent is the union shape used for every event type we
+// process. Type discriminates; only the matching fields are populated.
+// content_block_start carries ContentBlock (with id/name for tool_use);
+// content_block_delta carries Delta (text or partial_json).
+type anthropicStreamEvent struct {
+	Type         string                 `json:"type"`
+	Index        int                    `json:"index,omitempty"`
+	ContentBlock *anthropicContentBlock `json:"content_block,omitempty"`
+	Delta        *anthropicStreamDelta  `json:"delta,omitempty"`
+	Message      *anthropicResponse     `json:"message,omitempty"`
+	Usage        *anthropicUsage        `json:"usage,omitempty"`
+}
+
+type anthropicStreamDelta struct {
+	Type        string `json:"type,omitempty"`
+	Text        string `json:"text,omitempty"`
+	PartialJSON string `json:"partial_json,omitempty"`
+}
+
+// Anthropic requires max_tokens. If the caller didn't set it, use a
+// generous-but-bounded default so the request doesn't 400.
+const anthropicDefaultMaxTokens int32 = 4096
+
+const anthropicToolChoiceNone = "none"
+
+// Reused JSON-Schema defaults for malformed inputs. Anthropic requires
+// input_schema to be a JSON object and tool_use.input to be a JSON
+// object; clients that omit them must not 400 the entire request.
+var (
+	emptyJSONObject   = json.RawMessage(`{}`)
+	emptyObjectSchema = json.RawMessage(`{"type":"object","properties":{}}`)
+)
+
+func buildAnthropicRequest(opts *pb.PredictOptions, cfg *proxyConfig, stream bool) ([]byte, error) {
+	req := anthropicRequest{
+		Model:         modelName(cfg, opts),
+		MaxTokens:     opts.GetTokens(),
+		Stream:        stream,
+		StopSequences: opts.GetStopPrompts(),
+	}
+	if req.MaxTokens <= 0 {
+		req.MaxTokens = anthropicDefaultMaxTokens
+	}
+	// Newer Anthropic models 400 when both temperature and top_p are
+	// set ("`temperature` and `top_p` cannot both be specified for
+	// this model. Please use only one.") even though their docs only
+	// "recommend" picking one. The OpenAI-compatible chat UI almost
+	// always sends both with default values, so prefer temperature
+	// and drop top_p when both are present.
+	if t := opts.GetTemperature(); t != 0 {
+		v := float64(t)
+		req.Temperature = &v
+	} else if t := opts.GetTopP(); t != 0 {
+		v := float64(t)
+		req.TopP = &v
+	}
+
+	req.Tools = convertOpenAITools(opts.GetTools())
+	req.ToolChoice = convertOpenAIToolChoice(opts.GetToolChoice())
+	// Anthropic rejects tool_choice without tools and older models
+	// don't accept {"type":"none"} — collapse to a no-tools request.
+	if req.ToolChoice != nil && req.ToolChoice.Type == anthropicToolChoiceNone {
+		req.Tools, req.ToolChoice = nil, nil
+	}
+
+	var systemParts []string
+	for _, m := range opts.GetMessages() {
+		role := m.GetRole()
+		if role == "system" {
+			if c := m.GetContent(); c != "" {
+				systemParts = append(systemParts, c)
+			}
+			continue
+		}
+		switch role {
+		case "user":
+			req.Messages = append(req.Messages, anthropicMessage{
+				Role:    "user",
+				Content: m.GetContent(),
+			})
+		case "assistant":
+			if blocks := assistantBlocks(m); blocks != nil {
+				req.Messages = append(req.Messages, anthropicMessage{Role: "assistant", Content: blocks})
+				continue
+			}
+			req.Messages = append(req.Messages, anthropicMessage{
+				Role:    "assistant",
+				Content: m.GetContent(),
+			})
+		case "tool", "function":
+			req.Messages = appendToolResult(req.Messages, anthropicContentBlock{
+				Type:          "tool_result",
+				ToolUseID:     m.GetToolCallId(),
+				ResultContent: m.GetContent(),
+			})
+		}
+	}
+	req.System = strings.Join(systemParts, "\n\n")
+
+	if len(req.Messages) == 0 && opts.GetPrompt() != "" {
+		req.Messages = []anthropicMessage{{Role: "user", Content: opts.GetPrompt()}}
+	}
+
+	return json.Marshal(req)
+}
+
+// appendToolResult appends a tool_result block as a user message,
+// merging into a preceding user message that already carries blocks.
+// Anthropic concatenates consecutive same-role messages on its end,
+// but explicit merging keeps the body smaller and the conversation
+// strictly alternating — which some upstream filters require.
+func appendToolResult(msgs []anthropicMessage, block anthropicContentBlock) []anthropicMessage {
+	if n := len(msgs); n > 0 && msgs[n-1].Role == "user" {
+		if existing, ok := msgs[n-1].Content.([]anthropicContentBlock); ok {
+			msgs[n-1].Content = append(existing, block)
+			return msgs
+		}
+	}
+	return append(msgs, anthropicMessage{
+		Role:    "user",
+		Content: []anthropicContentBlock{block},
+	})
+}
+
+func convertOpenAITools(toolsJSON string) []anthropicTool {
+	if toolsJSON == "" {
+		return nil
+	}
+	var raw []openAITool
+	if err := json.Unmarshal([]byte(toolsJSON), &raw); err != nil {
+		xlog.Warn("cloud-proxy: anthropic translate: unparseable tools JSON, dropping", "error", err)
+		return nil
+	}
+	tools := make([]anthropicTool, 0, len(raw))
+	for _, t := range raw {
+		if t.Function.Name == "" {
+			continue
+		}
+		schema := t.Function.Parameters
+		if len(schema) == 0 {
+			schema = emptyObjectSchema
+		}
+		tools = append(tools, anthropicTool{
+			Name:        t.Function.Name,
+			Description: t.Function.Description,
+			InputSchema: schema,
+		})
+	}
+	return tools
+}
+
+// convertOpenAIToolChoice accepts the spec form
+// ({type:function, function:{name:X}}) and the flat legacy form
+// ({type:function, name:X}) some clients send. Unknown object shapes
+// are warned and dropped rather than silently treated as auto.
+func convertOpenAIToolChoice(toolChoiceJSON string) *anthropicToolChoice {
+	if toolChoiceJSON == "" {
+		return nil
+	}
+	var asString string
+	if err := json.Unmarshal([]byte(toolChoiceJSON), &asString); err == nil {
+		switch asString {
+		case "auto":
+			return &anthropicToolChoice{Type: "auto"}
+		case "none":
+			return &anthropicToolChoice{Type: anthropicToolChoiceNone}
+		case "required":
+			return &anthropicToolChoice{Type: "any"}
+		}
+		return nil
+	}
+	var asObj struct {
+		Type     string `json:"type"`
+		Name     string `json:"name"`
+		Function struct {
+			Name string `json:"name"`
+		} `json:"function"`
+	}
+	if err := json.Unmarshal([]byte(toolChoiceJSON), &asObj); err != nil {
+		xlog.Warn("cloud-proxy: anthropic translate: unparseable tool_choice, dropping", "error", err)
+		return nil
+	}
+	if name := asObj.Function.Name; name != "" {
+		return &anthropicToolChoice{Type: "tool", Name: name}
+	}
+	if asObj.Name != "" {
+		return &anthropicToolChoice{Type: "tool", Name: asObj.Name}
+	}
+	xlog.Warn("cloud-proxy: anthropic translate: unrecognised tool_choice shape, dropping", "shape", toolChoiceJSON)
+	return nil
+}
+
+// openAITool mirrors pkg/functions.Tool but keeps Parameters as
+// json.RawMessage so the input_schema passes through verbatim — no
+// re-marshal cost, no fidelity loss on exotic schemas.
+type openAITool struct {
+	Type     string `json:"type"`
+	Function struct {
+		Name        string          `json:"name"`
+		Description string          `json:"description"`
+		Parameters  json.RawMessage `json:"parameters"`
+	} `json:"function"`
+}
+
+func assistantBlocks(m *pb.Message) []anthropicContentBlock {
+	toolCallsJSON := m.GetToolCalls()
+	if toolCallsJSON == "" {
+		return nil
+	}
+	var toolCalls []openAIToolCall
+	if err := json.Unmarshal([]byte(toolCallsJSON), &toolCalls); err != nil || len(toolCalls) == 0 {
+		return nil
+	}
+	blocks := make([]anthropicContentBlock, 0, len(toolCalls)+1)
+	if text := m.GetContent(); text != "" {
+		blocks = append(blocks, anthropicContentBlock{Type: "text", Text: text})
+	}
+	for _, tc := range toolCalls {
+		// OpenAI's arguments are a JSON-encoded string; pass through
+		// as RawMessage so a non-JSON string from a poorly-formed
+		// local model doesn't crash the marshaller downstream.
+		args := json.RawMessage(tc.Function.Arguments)
+		if len(args) == 0 {
+			args = emptyJSONObject
+		}
+		blocks = append(blocks, anthropicContentBlock{
+			Type:  "tool_use",
+			ID:    tc.ID,
+			Name:  tc.Function.Name,
+			Input: args,
+		})
+	}
+	return blocks
+}
+
+// doAnthropicRequest is the Anthropic counterpart of doOpenAIRequest.
+// applyAuthHeader sets x-api-key and anthropic-version when provider
+// is anthropic, so this method doesn't need to duplicate that.
+func (c *CloudProxy) doAnthropicRequest(ctx context.Context, cfg *proxyConfig, body []byte) (*http.Response, error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, cfg.upstreamURL, bytes.NewReader(body))
+	if err != nil {
+		return nil, fmt.Errorf("cloud-proxy: build request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Accept", "*/*")
+	if cfg.apiKey != "" {
+		applyAuthHeader(req, cfg.provider, cfg.apiKey)
+	}
+	resp, err := c.client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("cloud-proxy: upstream request: %w", err)
+	}
+	return resp, nil
+}
+
+// predictAnthropicRich returns the full Reply: joined text from all
+// text blocks, tool_use blocks mapped to ToolCallDelta, and usage
+// tokens.
+func (c *CloudProxy) predictAnthropicRich(ctx context.Context, cfg *proxyConfig, opts *pb.PredictOptions) (*pb.Reply, error) {
+	body, err := buildAnthropicRequest(opts, cfg, false)
+	if err != nil {
+		return nil, fmt.Errorf("cloud-proxy: marshal request: %w", err)
+	}
+	resp, err := c.doAnthropicRequest(ctx, cfg, body)
+	if err != nil {
+		return nil, err
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode >= 400 {
+		errBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
+		return nil, fmt.Errorf("cloud-proxy: upstream %d: %s", resp.StatusCode, string(errBody))
+	}
+
+	var parsed anthropicResponse
+	if err := json.NewDecoder(resp.Body).Decode(&parsed); err != nil {
+		return nil, fmt.Errorf("cloud-proxy: decode response: %w", err)
+	}
+
+	reply := &pb.Reply{}
+	if parsed.Usage != nil {
+		reply.PromptTokens = int32(parsed.Usage.InputTokens)
+		reply.Tokens = int32(parsed.Usage.OutputTokens)
+	}
+
+	var content strings.Builder
+	var toolCalls []*pb.ToolCallDelta
+	toolIdx := 0
+	for _, b := range parsed.Content {
+		switch b.Type {
+		case "text":
+			content.WriteString(b.Text)
+		case "tool_use":
+			// Input is a structured JSON object; we serialise to a
+			// string so it fits the OpenAI-shaped arguments field
+			// downstream consumers expect.
+			args := ""
+			if len(b.Input) > 0 {
+				args = string(b.Input)
+			}
+			toolCalls = append(toolCalls, newToolCallDelta(toolIdx, b.ID, b.Name, args))
+			toolIdx++
+		}
+	}
+	reply.Message = []byte(content.String())
+	if len(toolCalls) > 0 {
+		reply.ChatDeltas = []*pb.ChatDelta{{ToolCalls: toolCalls}}
+	}
+	return reply, nil
+}
+
+// predictAnthropicStreamRich streams Reply chunks from Anthropic's SSE.
+// Three event types matter: content_block_start (initialises tool_use
+// id+name), content_block_delta (carries text or input_json_delta),
+// message_stop (terminates). The block index from the wire feeds
+// straight into ToolCallDelta.Index so downstream consumers can
+// reassemble multiple parallel tool calls.
+func (c *CloudProxy) predictAnthropicStreamRich(ctx context.Context, cfg *proxyConfig, opts *pb.PredictOptions, results chan<- *pb.Reply) error {
+	body, err := buildAnthropicRequest(opts, cfg, true)
+	if err != nil {
+		return fmt.Errorf("cloud-proxy: marshal request: %w", err)
+	}
+	resp, err := c.doAnthropicRequest(ctx, cfg, body)
+	if err != nil {
+		return err
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode >= 400 {
+		errBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
+		return fmt.Errorf("cloud-proxy: upstream %d: %s", resp.StatusCode, string(errBody))
+	}
+
+	scanner := bufio.NewScanner(resp.Body)
+	scanner.Buffer(make([]byte, 0, 64*1024), 1<<20)
+	for scanner.Scan() {
+		line := scanner.Text()
+		if !strings.HasPrefix(line, "data:") {
+			continue
+		}
+		payload := strings.TrimSpace(strings.TrimPrefix(line, "data:"))
+		if payload == "" {
+			continue
+		}
+		var ev anthropicStreamEvent
+		if err := json.Unmarshal([]byte(payload), &ev); err != nil {
+			xlog.Debug("cloud-proxy: skip malformed SSE chunk", "error", err)
+			continue
+		}
+		switch ev.Type {
+		case "content_block_start":
+			// tool_use blocks announce id + name here; arguments arrive
+			// in subsequent input_json_delta events. Emit a Reply with
+			// just the tool_call init fields so consumers can allocate
+			// a slot at this index.
+			if ev.ContentBlock != nil && ev.ContentBlock.Type == "tool_use" {
+				if !sendReply(ctx, results, &pb.Reply{
+					ChatDeltas: []*pb.ChatDelta{{ToolCalls: []*pb.ToolCallDelta{
+						newToolCallDelta(ev.Index, ev.ContentBlock.ID, ev.ContentBlock.Name, ""),
+					}}},
+				}) {
+					return ctx.Err()
+				}
+			}
+		case "content_block_delta":
+			if ev.Delta == nil {
+				continue
+			}
+			switch ev.Delta.Type {
+			case "text_delta":
+				if ev.Delta.Text == "" {
+					continue
+				}
+				if !sendReply(ctx, results, &pb.Reply{
+					Message:    []byte(ev.Delta.Text),
+					ChatDeltas: []*pb.ChatDelta{{Content: ev.Delta.Text}},
+				}) {
+					return ctx.Err()
+				}
+			case "input_json_delta":
+				if ev.Delta.PartialJSON == "" {
+					continue
+				}
+				if !sendReply(ctx, results, &pb.Reply{
+					ChatDeltas: []*pb.ChatDelta{{ToolCalls: []*pb.ToolCallDelta{
+						newToolCallDelta(ev.Index, "", "", ev.Delta.PartialJSON),
+					}}},
+				}) {
+					return ctx.Err()
+				}
+			}
+		case "message_delta":
+			// Anthropic sends final usage in message_delta.usage. Emit
+			// a usage-only Reply so the consumer can record totals.
+			if ev.Usage != nil {
+				if !sendReply(ctx, results, &pb.Reply{
+					Tokens: int32(ev.Usage.OutputTokens),
+				}) {
+					return ctx.Err()
+				}
+			}
+		case "message_stop":
+			return nil
+		}
+	}
+	return scanner.Err()
+}
diff --git a/backend/go/cloud-proxy/provider_anthropic_test.go b/backend/go/cloud-proxy/provider_anthropic_test.go
new file mode 100644
index 000000000..d46db7b12
--- /dev/null
+++ b/backend/go/cloud-proxy/provider_anthropic_test.go
@@ -0,0 +1,334 @@
+package main
+
+import (
+	"encoding/json"
+	"io"
+	"math"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/gomega"
+)
+
+// fakeAnthropicUpstream mirrors fakeOpenAIUpstream but decodes the
+// request body as an anthropicRequest so tests can assert on the
+// translated wire shape (system field, max_tokens, etc.).
+func fakeAnthropicUpstream(t *testing.T, handler func(req anthropicRequest) (status int, body string, contentType string)) (*httptest.Server, *anthropicRequest) {
+	t.Helper()
+	var captured anthropicRequest
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		raw, _ := io.ReadAll(r.Body)
+		_ = json.Unmarshal(raw, &captured)
+		status, body, ct := handler(captured)
+		w.Header().Set("Content-Type", ct)
+		w.WriteHeader(status)
+		_, _ = io.WriteString(w, body)
+	}))
+	return srv, &captured
+}
+
+func newAnthropicTranslateCloudProxy(t *testing.T, upstreamURL string) *CloudProxy {
+	t.Helper()
+	g := NewWithT(t)
+	t.Setenv("CLOUD_PROXY_ANTHROPIC_FAKE", "sk-ant-fake")
+	cp := NewCloudProxy()
+	err := cp.Load(&pb.ModelOptions{
+		Model: "claude-local",
+		Proxy: &pb.ProxyOptions{
+			UpstreamUrl:   upstreamURL,
+			Mode:          modeTranslate,
+			Provider:      providerAnthropic,
+			ApiKeyEnv:     "CLOUD_PROXY_ANTHROPIC_FAKE",
+			UpstreamModel: "claude-3-5-sonnet-20241022",
+		},
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+	return cp
+}
+
+func TestPredict_Anthropic_BasicMessages(t *testing.T) {
+	g := NewWithT(t)
+	srv, captured := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 200, `{"id":"msg_1","type":"message","role":"assistant","content":[{"type":"text","text":"hi there"}],"model":"claude-3-5-sonnet-20241022","usage":{"input_tokens":5,"output_tokens":2}}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+
+	got, err := cp.Predict(&pb.PredictOptions{
+		Messages: []*pb.Message{
+			{Role: "system", Content: "be brief"},
+			{Role: "user", Content: "hello"},
+		},
+		Temperature: 0.5,
+		TopP:        0.9,
+		Tokens:      32,
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(got).To(Equal("hi there"))
+
+	g.Expect(captured.Model).To(Equal("claude-3-5-sonnet-20241022"))
+	// System message must be hoisted out of Messages into top-level field.
+	g.Expect(captured.System).To(Equal("be brief"))
+	g.Expect(captured.Messages).To(HaveLen(1))
+	g.Expect(captured.Messages[0].Role).To(Equal("user"))
+	g.Expect(captured.MaxTokens).To(Equal(int32(32)))
+	g.Expect(captured.Temperature).NotTo(BeNil())
+	g.Expect(*captured.Temperature).To(Equal(0.5))
+	// Anthropic 400s when both temperature and top_p are set; the
+	// translator must prefer temperature and drop top_p.
+	g.Expect(captured.TopP).To(BeNil())
+	g.Expect(captured.Stream).To(BeFalse())
+}
+
+// When only top_p is set, it should be forwarded.
+func TestPredict_Anthropic_TopPOnly(t *testing.T) {
+	g := NewWithT(t)
+	srv, captured := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 200, `{"content":[{"type":"text","text":"ok"}]}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+
+	_, err := cp.Predict(&pb.PredictOptions{
+		Messages: []*pb.Message{{Role: "user", Content: "hello"}},
+		TopP:     0.9,
+		Tokens:   16,
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(captured.Temperature).To(BeNil())
+	// PredictOptions.TopP is float32 on the wire; the translator widens
+	// to float64 so 0.9 round-trips as 0.8999999761581421… — compare
+	// with a small tolerance rather than exact equality.
+	g.Expect(captured.TopP).NotTo(BeNil())
+	g.Expect(math.Abs(*captured.TopP - 0.9)).To(BeNumerically("<=", 1e-6))
+}
+
+func TestPredict_Anthropic_DefaultsMaxTokens(t *testing.T) {
+	g := NewWithT(t)
+	// Anthropic 400s without max_tokens. The translator must default
+	// it when the caller doesn't supply Tokens.
+	srv, captured := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 200, `{"content":[{"type":"text","text":"ok"}]}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+
+	_, err := cp.Predict(&pb.PredictOptions{Messages: []*pb.Message{{Role: "user", Content: "x"}}})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(captured.MaxTokens).To(Equal(anthropicDefaultMaxTokens))
+}
+
+func TestPredict_Anthropic_PromptFallback(t *testing.T) {
+	g := NewWithT(t)
+	srv, captured := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 200, `{"content":[{"type":"text","text":"ok"}]}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+
+	_, err := cp.Predict(&pb.PredictOptions{Prompt: "what time is it?", Tokens: 16})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(captured.Messages).To(HaveLen(1))
+	g.Expect(captured.Messages[0].Role).To(Equal("user"))
+	g.Expect(captured.Messages[0].Content).To(Equal("what time is it?"))
+}
+
+func TestPredict_Anthropic_ConcatenatesContentBlocks(t *testing.T) {
+	g := NewWithT(t)
+	// Anthropic may return multiple text blocks; the translator joins
+	// them so the Predict() string return is the full assistant message.
+	srv, _ := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 200, `{"content":[{"type":"text","text":"hello "},{"type":"text","text":"world"}]}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+
+	got, err := cp.Predict(&pb.PredictOptions{Messages: []*pb.Message{{Role: "user", Content: "x"}}, Tokens: 16})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(got).To(Equal("hello world"))
+}
+
+func TestPredict_Anthropic_UpstreamError(t *testing.T) {
+	g := NewWithT(t)
+	srv, _ := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 401, `{"error":{"type":"authentication_error","message":"bad key"}}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+
+	_, err := cp.Predict(&pb.PredictOptions{Messages: []*pb.Message{{Role: "user", Content: "x"}}, Tokens: 16})
+	g.Expect(err).To(HaveOccurred())
+	g.Expect(err.Error()).To(ContainSubstring("401"))
+}
+
+func TestPredictStream_Anthropic_StreamsTextDeltas(t *testing.T) {
+	g := NewWithT(t)
+	// Real Anthropic SSE has event: lines + data: lines. The translator
+	// only needs the data: payload; only content_block_delta with
+	// delta.type=text_delta carries content. message_stop ends.
+	frames := []string{
+		"event: message_start\ndata: {\"type\":\"message_start\"}\n\n",
+		"event: content_block_start\ndata: {\"type\":\"content_block_start\",\"index\":0}\n\n",
+		"event: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"hello\"}}\n\n",
+		"event: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" \"}}\n\n",
+		"event: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"world\"}}\n\n",
+		"event: content_block_stop\ndata: {\"type\":\"content_block_stop\",\"index\":0}\n\n",
+		"event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n",
+	}
+	body := strings.Join(frames, "")
+
+	srv, captured := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 200, body, "text/event-stream"
+	})
+	defer srv.Close()
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+
+	results := make(chan string, 8)
+	done := make(chan error, 1)
+	go func() {
+		done <- cp.PredictStream(&pb.PredictOptions{
+			Messages: []*pb.Message{{Role: "user", Content: "hi"}},
+			Tokens:   16,
+		}, results)
+	}()
+
+	var got []string
+	for s := range results {
+		got = append(got, s)
+	}
+	err := <-done
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(strings.Join(got, "")).To(Equal("hello world"))
+	g.Expect(captured.Stream).To(BeTrue())
+}
+
+func TestBuildAnthropic_TranslatesOpenAITools(t *testing.T) {
+	g := NewWithT(t)
+	srv, captured := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 200, `{"content":[{"type":"text","text":"ok"}]}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+
+	tools := `[{"type":"function","function":{"name":"get_weather","description":"Get weather","parameters":{"type":"object","properties":{"city":{"type":"string"}},"required":["city"]}}}]`
+	_, err := cp.Predict(&pb.PredictOptions{
+		Messages:   []*pb.Message{{Role: "user", Content: "weather in Paris?"}},
+		Tools:      tools,
+		ToolChoice: `"auto"`,
+		Tokens:     32,
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(captured.Tools).To(HaveLen(1))
+	g.Expect(captured.Tools[0].Name).To(Equal("get_weather"))
+	g.Expect(captured.Tools[0].Description).To(Equal("Get weather"))
+	// input_schema must be the parameters object verbatim.
+	g.Expect(string(captured.Tools[0].InputSchema)).To(ContainSubstring(`"city"`))
+	g.Expect(captured.ToolChoice).NotTo(BeNil())
+	g.Expect(captured.ToolChoice.Type).To(Equal("auto"))
+}
+
+func TestBuildAnthropic_ToolChoice_RequiredMapsToAny(t *testing.T) {
+	g := NewWithT(t)
+	srv, captured := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 200, `{"content":[]}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+	_, err := cp.Predict(&pb.PredictOptions{
+		Messages:   []*pb.Message{{Role: "user", Content: "x"}},
+		Tools:      `[{"type":"function","function":{"name":"t","parameters":{"type":"object"}}}]`,
+		ToolChoice: `"required"`,
+		Tokens:     16,
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(captured.ToolChoice).NotTo(BeNil())
+	g.Expect(captured.ToolChoice.Type).To(Equal("any"))
+}
+
+func TestBuildAnthropic_ToolChoice_NoneDropsTools(t *testing.T) {
+	g := NewWithT(t)
+	srv, captured := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 200, `{"content":[]}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+	_, err := cp.Predict(&pb.PredictOptions{
+		Messages:   []*pb.Message{{Role: "user", Content: "x"}},
+		Tools:      `[{"type":"function","function":{"name":"t","parameters":{"type":"object"}}}]`,
+		ToolChoice: `"none"`,
+		Tokens:     16,
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(captured.Tools).To(BeNil())
+	g.Expect(captured.ToolChoice).To(BeNil())
+}
+
+func TestBuildAnthropic_ToolChoice_NamedFunction(t *testing.T) {
+	g := NewWithT(t)
+	srv, captured := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 200, `{"content":[]}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+	_, err := cp.Predict(&pb.PredictOptions{
+		Messages:   []*pb.Message{{Role: "user", Content: "x"}},
+		Tools:      `[{"type":"function","function":{"name":"weather","parameters":{"type":"object"}}}]`,
+		ToolChoice: `{"type":"function","function":{"name":"weather"}}`,
+		Tokens:     16,
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(captured.ToolChoice).NotTo(BeNil())
+	g.Expect(captured.ToolChoice.Type).To(Equal("tool"))
+	g.Expect(captured.ToolChoice.Name).To(Equal("weather"))
+}
+
+func TestBuildAnthropic_RoundTripsAssistantToolCalls(t *testing.T) {
+	g := NewWithT(t)
+	// LocalAI Assistant's second turn: the LLM previously emitted a
+	// tool_use, the server executed it, and the conversation now
+	// includes the assistant turn (with tool_calls) plus a tool-role
+	// result message. Both must convert to Anthropic block form.
+	srv, captured := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 200, `{"content":[{"type":"text","text":"ok"}]}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+
+	tools := `[{"type":"function","function":{"name":"list_models","parameters":{"type":"object"}}}]`
+	toolCallsJSON := `[{"id":"call_abc","type":"function","function":{"name":"list_models","arguments":"{}"}}]`
+	_, err := cp.Predict(&pb.PredictOptions{
+		Tools: tools,
+		Messages: []*pb.Message{
+			{Role: "user", Content: "what models are installed?"},
+			{Role: "assistant", Content: "", ToolCalls: toolCallsJSON},
+			{Role: "tool", Content: `{"models":["a","b"]}`, ToolCallId: "call_abc"},
+		},
+		Tokens: 64,
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+
+	g.Expect(captured.Messages).To(HaveLen(3))
+	// 1. user text — bare string
+	s, ok := captured.Messages[0].Content.(string)
+	g.Expect(ok).To(BeTrue())
+	g.Expect(s).To(Equal("what models are installed?"))
+	// 2. assistant — must be a content-block list with one tool_use
+	// json.Unmarshal of `any` produces []any not []anthropicContentBlock.
+	blocks, ok := captured.Messages[1].Content.([]any)
+	g.Expect(ok).To(BeTrue())
+	g.Expect(blocks).To(HaveLen(1))
+	b0, _ := blocks[0].(map[string]any)
+	g.Expect(b0["type"]).To(Equal("tool_use"))
+	g.Expect(b0["id"]).To(Equal("call_abc"))
+	g.Expect(b0["name"]).To(Equal("list_models"))
+	// 3. tool → user with tool_result block
+	g.Expect(captured.Messages[2].Role).To(Equal("user"))
+	resBlocks, _ := captured.Messages[2].Content.([]any)
+	r0, _ := resBlocks[0].(map[string]any)
+	g.Expect(r0["type"]).To(Equal("tool_result"))
+	g.Expect(r0["tool_use_id"]).To(Equal("call_abc"))
+	g.Expect(r0["content"]).To(Equal(`{"models":["a","b"]}`))
+}
diff --git a/backend/go/cloud-proxy/provider_edge_test.go b/backend/go/cloud-proxy/provider_edge_test.go
new file mode 100644
index 000000000..5c6deb829
--- /dev/null
+++ b/backend/go/cloud-proxy/provider_edge_test.go
@@ -0,0 +1,119 @@
+package main
+
+import (
+	"encoding/json"
+	"strings"
+	"testing"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/gomega"
+)
+
+// Verify buildOpenAIRequest preserves caller-supplied tools and
+// tool_choice as opaque JSON. PredictOptions carries them as strings;
+// they must land in the outbound request body unchanged so the
+// upstream sees the caller's intent verbatim. A regression here would
+// silently disable function calling for translate-mode clients.
+func TestBuildOpenAIRequest_ToolsAndToolChoicePassthrough(t *testing.T) {
+	g := NewWithT(t)
+	cfg := &proxyConfig{upstreamModel: "gpt-4o"}
+	toolsJSON := `[{"type":"function","function":{"name":"search","parameters":{"type":"object"}}}]`
+	choiceJSON := `{"type":"function","function":{"name":"search"}}`
+
+	body, err := buildOpenAIRequest(&pb.PredictOptions{
+		Messages:   []*pb.Message{{Role: "user", Content: "find x"}},
+		Tools:      toolsJSON,
+		ToolChoice: choiceJSON,
+	}, cfg, false)
+	g.Expect(err).NotTo(HaveOccurred())
+
+	var decoded openAIRequest
+	err = json.Unmarshal(body, &decoded)
+	g.Expect(err).NotTo(HaveOccurred())
+	// Compare the JSON-canonical form so whitespace differences are ignored.
+	gotTools, _ := json.Marshal(json.RawMessage(decoded.Tools))
+	wantTools, _ := json.Marshal(json.RawMessage(toolsJSON))
+	g.Expect(string(gotTools)).To(Equal(string(wantTools)))
+	gotChoice, _ := json.Marshal(json.RawMessage(decoded.ToolChoice))
+	wantChoice, _ := json.Marshal(json.RawMessage(choiceJSON))
+	g.Expect(string(gotChoice)).To(Equal(string(wantChoice)))
+}
+
+// Garbage JSON in tools / tool_choice is silently dropped (omitted)
+// rather than blowing up the request. Documents the parseRawJSON
+// behaviour — operators shouldn't see hard failures from an upstream
+// caller's mis-formatted tools field.
+func TestBuildOpenAIRequest_InvalidToolsJSONDropped(t *testing.T) {
+	g := NewWithT(t)
+	cfg := &proxyConfig{upstreamModel: "gpt-4o"}
+	body, err := buildOpenAIRequest(&pb.PredictOptions{
+		Messages:   []*pb.Message{{Role: "user", Content: "x"}},
+		Tools:      "this is not json",
+		ToolChoice: "{also bad",
+	}, cfg, false)
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(string(body)).NotTo(ContainSubstring("this is not json"))
+	g.Expect(string(body)).NotTo(ContainSubstring("{also bad"))
+}
+
+// Anthropic empty content array yields an empty Reply (not an error).
+// Mirrors how an upstream tool_use-only response might arrive — the
+// content array can legitimately be empty in some edge cases.
+func TestPredictRich_Anthropic_EmptyContent(t *testing.T) {
+	g := NewWithT(t)
+	srv, _ := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 200, `{"id":"m1","type":"message","role":"assistant","content":[],"usage":{"input_tokens":3,"output_tokens":0}}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+
+	reply, err := cp.PredictRich(&pb.PredictOptions{
+		Messages: []*pb.Message{{Role: "user", Content: "x"}},
+		Tokens:   16,
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(string(reply.GetMessage())).To(Equal(""))
+	g.Expect(reply.GetChatDeltas()).To(HaveLen(0))
+	g.Expect(reply.GetPromptTokens()).To(Equal(int32(3)))
+}
+
+// A truncated / malformed SSE payload mid-stream should be tolerated:
+// the malformed chunk gets skipped (xlog.Debug logged), valid chunks
+// before AND after it still reach the channel.
+func TestPredictStreamRich_OpenAI_TolerantOfBadChunks(t *testing.T) {
+	g := NewWithT(t)
+	body := strings.Join([]string{
+		`data: {"choices":[{"index":0,"delta":{"content":"hello"}}]}`,
+		``,
+		`data: this-is-not-json{{`,
+		``,
+		`data: {"choices":[{"index":0,"delta":{"content":" world"}}]}`,
+		``,
+		`data: [DONE]`,
+		``,
+	}, "\n")
+
+	srv, _ := fakeOpenAIUpstream(t, func(_ openAIRequest) (int, string, string) {
+		return 200, body, "text/event-stream"
+	})
+	defer srv.Close()
+	cp := newTranslateCloudProxy(t, srv.URL)
+
+	results := make(chan *pb.Reply, 8)
+	done := make(chan error, 1)
+	go func() {
+		done <- cp.PredictStreamRich(&pb.PredictOptions{
+			Messages: []*pb.Message{{Role: "user", Content: "hi"}},
+		}, results)
+		close(results)
+	}()
+
+	var assembled strings.Builder
+	for reply := range results {
+		assembled.Write(reply.GetMessage())
+	}
+	err := <-done
+	g.Expect(err).NotTo(HaveOccurred())
+	// The good chunks before and after the malformed one both made it through.
+	g.Expect(assembled.String()).To(Equal("hello world"))
+}
diff --git a/backend/go/cloud-proxy/provider_openai.go b/backend/go/cloud-proxy/provider_openai.go
new file mode 100644
index 000000000..d4911b6b7
--- /dev/null
+++ b/backend/go/cloud-proxy/provider_openai.go
@@ -0,0 +1,320 @@
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/xlog"
+)
+
+// OpenAI Chat Completions wire-format types. Narrowed to the fields
+// translate mode needs to preserve through the Reply proto: content,
+// role, tool_calls (typed so we can map them to pb.ToolCallDelta),
+// and sampling params copied verbatim from PredictOptions.
+//
+// Provider-specific extensions (logit_bias, function calling beyond
+// tool_calls, etc.) are not modelled — passthrough mode covers callers
+// that need full upstream fidelity.
+
+type openAIRequest struct {
+	Model            string          `json:"model"`
+	Messages         []openAIMessage `json:"messages"`
+	Stream           bool            `json:"stream,omitempty"`
+	Temperature      *float64        `json:"temperature,omitempty"`
+	TopP             *float64        `json:"top_p,omitempty"`
+	MaxTokens        *int32          `json:"max_tokens,omitempty"`
+	Stop             []string        `json:"stop,omitempty"`
+	FrequencyPenalty *float64        `json:"frequency_penalty,omitempty"`
+	PresencePenalty  *float64        `json:"presence_penalty,omitempty"`
+	Tools            json.RawMessage `json:"tools,omitempty"`
+	ToolChoice       json.RawMessage `json:"tool_choice,omitempty"`
+}
+
+type openAIMessage struct {
+	Role       string           `json:"role"`
+	Content    string           `json:"content,omitempty"`
+	Name       string           `json:"name,omitempty"`
+	ToolCallID string           `json:"tool_call_id,omitempty"`
+	ToolCalls  []openAIToolCall `json:"tool_calls,omitempty"`
+}
+
+// openAIToolCall covers both the non-streaming response shape (full
+// id+function+arguments) and the streaming-delta shape (sparse fields,
+// index assignment). The proto's ToolCallDelta absorbs both — name is
+// set on first appearance, arguments arrive incrementally in streaming.
+type openAIToolCall struct {
+	Index    int                `json:"index,omitempty"`
+	ID       string             `json:"id,omitempty"`
+	Type     string             `json:"type,omitempty"`
+	Function openAIFunctionCall `json:"function,omitempty"`
+}
+
+type openAIFunctionCall struct {
+	Name      string `json:"name,omitempty"`
+	Arguments string `json:"arguments,omitempty"`
+}
+
+type openAIChoice struct {
+	Index        int           `json:"index"`
+	Message      openAIMessage `json:"message"`
+	FinishReason string        `json:"finish_reason"`
+}
+
+type openAIResponse struct {
+	ID      string         `json:"id"`
+	Choices []openAIChoice `json:"choices"`
+	Usage   *openAIUsage   `json:"usage,omitempty"`
+}
+
+type openAIStreamChoice struct {
+	Index int `json:"index"`
+	Delta struct {
+		Content   string           `json:"content,omitempty"`
+		Role      string           `json:"role,omitempty"`
+		ToolCalls []openAIToolCall `json:"tool_calls,omitempty"`
+	} `json:"delta"`
+	FinishReason string `json:"finish_reason,omitempty"`
+}
+
+type openAIStreamChunk struct {
+	Choices []openAIStreamChoice `json:"choices"`
+	Usage   *openAIUsage         `json:"usage,omitempty"`
+}
+
+type openAIUsage struct {
+	PromptTokens     int `json:"prompt_tokens"`
+	CompletionTokens int `json:"completion_tokens"`
+	TotalTokens      int `json:"total_tokens"`
+}
+
+// buildOpenAIRequest converts pb.PredictOptions into the OpenAI Chat
+// Completions request body. Prefers Messages when non-empty; falls
+// back to wrapping Prompt as a single user message so plain
+// /completions-style calls still work in translate mode.
+func buildOpenAIRequest(opts *pb.PredictOptions, cfg *proxyConfig, stream bool) ([]byte, error) {
+	req := openAIRequest{
+		Model:      modelName(cfg, opts),
+		Stream:     stream,
+		Stop:       opts.GetStopPrompts(),
+		Tools:      parseRawJSON(opts.GetTools()),
+		ToolChoice: parseRawJSON(opts.GetToolChoice()),
+	}
+	if t := opts.GetTemperature(); t != 0 {
+		v := float64(t)
+		req.Temperature = &v
+	}
+	if t := opts.GetTopP(); t != 0 {
+		v := float64(t)
+		req.TopP = &v
+	}
+	if n := opts.GetTokens(); n > 0 {
+		req.MaxTokens = &n
+	}
+	if p := opts.GetFrequencyPenalty(); p != 0 {
+		v := float64(p)
+		req.FrequencyPenalty = &v
+	}
+	if p := opts.GetPresencePenalty(); p != 0 {
+		v := float64(p)
+		req.PresencePenalty = &v
+	}
+
+	for _, m := range opts.GetMessages() {
+		msg := openAIMessage{
+			Role:       m.GetRole(),
+			Content:    m.GetContent(),
+			Name:       m.GetName(),
+			ToolCallID: m.GetToolCallId(),
+		}
+		// Pre-existing tool_calls arrive as a JSON string from the
+		// upstream caller's previous assistant turn; pass-through as-is.
+		if tc := m.GetToolCalls(); tc != "" {
+			_ = json.Unmarshal([]byte(tc), &msg.ToolCalls)
+		}
+		req.Messages = append(req.Messages, msg)
+	}
+	// Fallback for plain Prompt requests (no Messages array). LocalAI
+	// templating may have produced a flat prompt; rewrap as a single
+	// user message so the upstream chat endpoint accepts it.
+	if len(req.Messages) == 0 && opts.GetPrompt() != "" {
+		req.Messages = []openAIMessage{{Role: "user", Content: opts.GetPrompt()}}
+	}
+
+	return json.Marshal(req)
+}
+
+// modelName picks the upstream model: upstream_model from the proxy
+// config wins (operator override), else the local model name captured
+// at LoadModel time. Operator sets upstream_model to map LocalAI's
+// alias (e.g. "claude-strict") to the upstream's canonical name
+// (e.g. "claude-3-5-sonnet-20241022").
+func modelName(cfg *proxyConfig, _ *pb.PredictOptions) string {
+	if cfg.upstreamModel != "" {
+		return cfg.upstreamModel
+	}
+	return cfg.localModel
+}
+
+// parseRawJSON parses a JSON string into a RawMessage so it round-trips
+// into the upstream body. Returns nil for empty/invalid input so the
+// field is omitted (omitempty).
+func parseRawJSON(s string) json.RawMessage {
+	if s == "" {
+		return nil
+	}
+	var probe json.RawMessage
+	if err := json.Unmarshal([]byte(s), &probe); err != nil {
+		return nil
+	}
+	return probe
+}
+
+// doOpenAIRequest builds + sends the upstream request. Returns the
+// raw response on success; caller handles status / body.
+func (c *CloudProxy) doOpenAIRequest(ctx context.Context, cfg *proxyConfig, body []byte) (*http.Response, error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, cfg.upstreamURL, bytes.NewReader(body))
+	if err != nil {
+		return nil, fmt.Errorf("cloud-proxy: build request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Accept", "*/*")
+	if cfg.apiKey != "" {
+		applyAuthHeader(req, cfg.provider, cfg.apiKey)
+	}
+	resp, err := c.client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("cloud-proxy: upstream request: %w", err)
+	}
+	return resp, nil
+}
+
+// predictOpenAIRich is the non-streaming translate path. Returns a
+// fully-populated *pb.Reply with assistant content, tool calls, and
+// token usage. The gRPC server forwards the Reply verbatim.
+func (c *CloudProxy) predictOpenAIRich(ctx context.Context, cfg *proxyConfig, opts *pb.PredictOptions) (*pb.Reply, error) {
+	body, err := buildOpenAIRequest(opts, cfg, false)
+	if err != nil {
+		return nil, fmt.Errorf("cloud-proxy: marshal request: %w", err)
+	}
+	resp, err := c.doOpenAIRequest(ctx, cfg, body)
+	if err != nil {
+		return nil, err
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode >= 400 {
+		errBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
+		return nil, fmt.Errorf("cloud-proxy: upstream %d: %s", resp.StatusCode, string(errBody))
+	}
+
+	var parsed openAIResponse
+	if err := json.NewDecoder(resp.Body).Decode(&parsed); err != nil {
+		return nil, fmt.Errorf("cloud-proxy: decode response: %w", err)
+	}
+	if len(parsed.Choices) == 0 {
+		return nil, errors.New("cloud-proxy: upstream returned no choices")
+	}
+
+	choice := parsed.Choices[0]
+	reply := &pb.Reply{
+		Message: []byte(choice.Message.Content),
+	}
+	if parsed.Usage != nil {
+		reply.PromptTokens = int32(parsed.Usage.PromptTokens)
+		reply.Tokens = int32(parsed.Usage.CompletionTokens)
+	}
+	if len(choice.Message.ToolCalls) > 0 {
+		// Non-streaming: a single ChatDelta carries the full tool-call
+		// set. Index/Name/Arguments are populated together; downstream
+		// consumers don't need to assemble streaming deltas.
+		delta := &pb.ChatDelta{}
+		for _, tc := range choice.Message.ToolCalls {
+			delta.ToolCalls = append(delta.ToolCalls,
+				newToolCallDelta(tc.Index, tc.ID, tc.Function.Name, tc.Function.Arguments))
+		}
+		reply.ChatDeltas = []*pb.ChatDelta{delta}
+	}
+	return reply, nil
+}
+
+// predictOpenAIStreamRich streams *pb.Reply chunks. Each chunk carries
+// either a content delta (Message + ChatDeltas[].Content) or tool-call
+// deltas (ChatDeltas[].ToolCalls). The final Reply carries usage tokens
+// when the upstream sends them (stream_options.include_usage).
+func (c *CloudProxy) predictOpenAIStreamRich(ctx context.Context, cfg *proxyConfig, opts *pb.PredictOptions, results chan<- *pb.Reply) error {
+	body, err := buildOpenAIRequest(opts, cfg, true)
+	if err != nil {
+		return fmt.Errorf("cloud-proxy: marshal request: %w", err)
+	}
+	resp, err := c.doOpenAIRequest(ctx, cfg, body)
+	if err != nil {
+		return err
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode >= 400 {
+		errBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
+		return fmt.Errorf("cloud-proxy: upstream %d: %s", resp.StatusCode, string(errBody))
+	}
+
+	scanner := bufio.NewScanner(resp.Body)
+	scanner.Buffer(make([]byte, 0, 64*1024), 1<<20)
+	for scanner.Scan() {
+		line := scanner.Text()
+		if !strings.HasPrefix(line, "data:") {
+			continue
+		}
+		payload := strings.TrimSpace(strings.TrimPrefix(line, "data:"))
+		if payload == "" || payload == "[DONE]" {
+			return nil
+		}
+		var chunk openAIStreamChunk
+		if err := json.Unmarshal([]byte(payload), &chunk); err != nil {
+			xlog.Debug("cloud-proxy: skip malformed SSE chunk", "error", err)
+			continue
+		}
+		// Usage frames may arrive separately from content frames when
+		// stream_options.include_usage is set; emit a usage-only Reply
+		// in that case so the consumer sees the totals.
+		if chunk.Usage != nil && len(chunk.Choices) == 0 {
+			if !sendReply(ctx, results, &pb.Reply{
+				PromptTokens: int32(chunk.Usage.PromptTokens),
+				Tokens:       int32(chunk.Usage.CompletionTokens),
+			}) {
+				return ctx.Err()
+			}
+			continue
+		}
+		for _, ch := range chunk.Choices {
+			reply := &pb.Reply{}
+			if ch.Delta.Content != "" {
+				reply.Message = []byte(ch.Delta.Content)
+				reply.ChatDeltas = []*pb.ChatDelta{{Content: ch.Delta.Content}}
+			}
+			if len(ch.Delta.ToolCalls) > 0 {
+				if len(reply.ChatDeltas) == 0 {
+					reply.ChatDeltas = []*pb.ChatDelta{{}}
+				}
+				for _, tc := range ch.Delta.ToolCalls {
+					reply.ChatDeltas[0].ToolCalls = append(reply.ChatDeltas[0].ToolCalls,
+						newToolCallDelta(tc.Index, tc.ID, tc.Function.Name, tc.Function.Arguments))
+				}
+			}
+			if reply.Message == nil && len(reply.ChatDeltas) == 0 {
+				continue
+			}
+			if !sendReply(ctx, results, reply) {
+				return ctx.Err()
+			}
+		}
+	}
+	return scanner.Err()
+}
diff --git a/backend/go/cloud-proxy/provider_openai_test.go b/backend/go/cloud-proxy/provider_openai_test.go
new file mode 100644
index 000000000..9ce4334db
--- /dev/null
+++ b/backend/go/cloud-proxy/provider_openai_test.go
@@ -0,0 +1,170 @@
+package main
+
+import (
+	"encoding/json"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	. "github.com/onsi/gomega"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// fakeOpenAIUpstream returns an httptest.Server that decodes the
+// inbound request as an openAIRequest, calls handler with it, and
+// writes the handler's reply as the response.
+func fakeOpenAIUpstream(t *testing.T, handler func(req openAIRequest) (status int, body string, contentType string)) (*httptest.Server, *openAIRequest) {
+	t.Helper()
+	var captured openAIRequest
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		raw, _ := io.ReadAll(r.Body)
+		_ = json.Unmarshal(raw, &captured)
+		status, body, ct := handler(captured)
+		w.Header().Set("Content-Type", ct)
+		w.WriteHeader(status)
+		_, _ = io.WriteString(w, body)
+	}))
+	return srv, &captured
+}
+
+func newTranslateCloudProxy(t *testing.T, upstreamURL string) *CloudProxy {
+	t.Helper()
+	g := NewWithT(t)
+	t.Setenv("CLOUD_PROXY_OPENAI_FAKE", "sk-fake-openai")
+	cp := NewCloudProxy()
+	err := cp.Load(&pb.ModelOptions{
+		Model: "gpt-4o-local",
+		Proxy: &pb.ProxyOptions{
+			UpstreamUrl:   upstreamURL,
+			Mode:          modeTranslate,
+			Provider:      providerOpenAI,
+			ApiKeyEnv:     "CLOUD_PROXY_OPENAI_FAKE",
+			UpstreamModel: "gpt-4o",
+		},
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+	return cp
+}
+
+func TestPredict_OpenAI_BasicChat(t *testing.T) {
+	g := NewWithT(t)
+	srv, captured := fakeOpenAIUpstream(t, func(_ openAIRequest) (int, string, string) {
+		return 200, `{"id":"resp-1","choices":[{"index":0,"message":{"role":"assistant","content":"hi there"},"finish_reason":"stop"}],"usage":{"prompt_tokens":5,"completion_tokens":2,"total_tokens":7}}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newTranslateCloudProxy(t, srv.URL)
+
+	got, err := cp.Predict(&pb.PredictOptions{
+		Messages: []*pb.Message{
+			{Role: "system", Content: "be brief"},
+			{Role: "user", Content: "hello"},
+		},
+		Temperature: 0.5,
+		TopP:        0.9,
+		Tokens:      32,
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(got).To(Equal("hi there"))
+
+	// Verify the upstream saw a properly-translated request.
+	g.Expect(captured.Model).To(Equal("gpt-4o"))
+	g.Expect(captured.Messages).To(HaveLen(2))
+	g.Expect(captured.Messages[0].Role).To(Equal("system"))
+	g.Expect(captured.Messages[1].Role).To(Equal("user"))
+	g.Expect(captured.Temperature).NotTo(BeNil())
+	g.Expect(*captured.Temperature).To(Equal(0.5))
+	g.Expect(captured.MaxTokens).NotTo(BeNil())
+	g.Expect(*captured.MaxTokens).To(Equal(int32(32)))
+	g.Expect(captured.Stream).To(BeFalse())
+}
+
+func TestPredict_OpenAI_PromptFallback(t *testing.T) {
+	g := NewWithT(t)
+	// No Messages array — backend should synth a single user message
+	// from Prompt so non-chat clients still route through translate.
+	srv, captured := fakeOpenAIUpstream(t, func(_ openAIRequest) (int, string, string) {
+		return 200, `{"choices":[{"message":{"role":"assistant","content":"ok"}}]}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newTranslateCloudProxy(t, srv.URL)
+
+	_, err := cp.Predict(&pb.PredictOptions{Prompt: "what time is it?"})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(captured.Messages).To(HaveLen(1))
+	g.Expect(captured.Messages[0].Role).To(Equal("user"))
+	g.Expect(captured.Messages[0].Content).To(Equal("what time is it?"))
+}
+
+func TestPredict_OpenAI_UpstreamError(t *testing.T) {
+	g := NewWithT(t)
+	srv, _ := fakeOpenAIUpstream(t, func(_ openAIRequest) (int, string, string) {
+		return 401, `{"error":{"message":"bad key"}}`, "application/json"
+	})
+	defer srv.Close()
+	cp := newTranslateCloudProxy(t, srv.URL)
+
+	_, err := cp.Predict(&pb.PredictOptions{Messages: []*pb.Message{{Role: "user", Content: "x"}}})
+	g.Expect(err).To(HaveOccurred())
+	g.Expect(err.Error()).To(ContainSubstring("401"))
+}
+
+func TestPredictStream_OpenAI_StreamsContent(t *testing.T) {
+	g := NewWithT(t)
+	// Stream three content deltas then [DONE]. Verify the channel
+	// receives them in order with no missing pieces.
+	chunks := []string{
+		`{"choices":[{"index":0,"delta":{"role":"assistant"}}]}`,
+		`{"choices":[{"index":0,"delta":{"content":"hello"}}]}`,
+		`{"choices":[{"index":0,"delta":{"content":" "}}]}`,
+		`{"choices":[{"index":0,"delta":{"content":"world"}}]}`,
+		`{"choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}`,
+	}
+	body := ""
+	for _, c := range chunks {
+		body += "data: " + c + "\n\n"
+	}
+	body += "data: [DONE]\n\n"
+
+	srv, captured := fakeOpenAIUpstream(t, func(_ openAIRequest) (int, string, string) {
+		return 200, body, "text/event-stream"
+	})
+	defer srv.Close()
+	cp := newTranslateCloudProxy(t, srv.URL)
+
+	results := make(chan string, 8)
+	done := make(chan error, 1)
+	go func() {
+		done <- cp.PredictStream(&pb.PredictOptions{
+			Messages: []*pb.Message{{Role: "user", Content: "hi"}},
+		}, results)
+	}()
+
+	var got []string
+	for s := range results {
+		got = append(got, s)
+	}
+	err := <-done
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(strings.Join(got, "")).To(Equal("hello world"))
+	g.Expect(captured.Stream).To(BeTrue())
+}
+
+func TestPredict_RejectedInPassthroughMode(t *testing.T) {
+	g := NewWithT(t)
+	t.Setenv("CLOUD_PROXY_FAKE", "k")
+	cp := NewCloudProxy()
+	err := cp.Load(&pb.ModelOptions{
+		Proxy: &pb.ProxyOptions{
+			UpstreamUrl: "https://example.com",
+			Mode:        modePassthrough,
+			ApiKeyEnv:   "CLOUD_PROXY_FAKE",
+		},
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+	_, err = cp.Predict(&pb.PredictOptions{})
+	g.Expect(err).To(HaveOccurred())
+	g.Expect(err.Error()).To(ContainSubstring("only valid in translate"))
+}
diff --git a/backend/go/cloud-proxy/proxy.go b/backend/go/cloud-proxy/proxy.go
new file mode 100644
index 000000000..9015ffc4d
--- /dev/null
+++ b/backend/go/cloud-proxy/proxy.go
@@ -0,0 +1,429 @@
+package main
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"os"
+	"strings"
+	"sync/atomic"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/xlog"
+)
+
+// Mirror of core/config.Proxy{Mode,Provider}* — backends don't
+// import core to keep the boundary clean.
+const (
+	modePassthrough = "passthrough"
+	modeTranslate   = "translate"
+
+	providerOpenAI    = "openai"
+	providerAnthropic = "anthropic"
+)
+
+// CloudProxy is the LocalAI backend that proxies model traffic to a
+// configured upstream HTTP provider. Concurrency: base.SingleThread is
+// NOT embedded — forward calls are independent and HTTP transport is
+// goroutine-safe, so multiple Forward streams can run in parallel.
+// Locking would serialise requests to a chat provider for no benefit.
+type CloudProxy struct {
+	base.Base
+
+	cfg    atomic.Pointer[proxyConfig]
+	client *http.Client
+}
+
+type proxyConfig struct {
+	upstreamURL   string
+	mode          string
+	provider      string
+	upstreamModel string
+	localModel    string // ModelOptions.Model — fallback when upstream_model is unset
+	apiKey        string // resolved at Load time
+}
+
+func NewCloudProxy() *CloudProxy {
+	// No Client-level Timeout — that would bound streaming SSE
+	// responses too, which can legitimately last minutes. Per-request
+	// deadlines come from the gRPC stream context.
+	return &CloudProxy{client: &http.Client{}}
+}
+
+func (c *CloudProxy) Load(opts *pb.ModelOptions) error {
+	po := opts.GetProxy()
+	if po == nil {
+		return errors.New("cloud-proxy: Load requires ProxyOptions to be set")
+	}
+	if po.GetUpstreamUrl() == "" {
+		return errors.New("cloud-proxy: upstream_url is required")
+	}
+	if _, err := url.ParseRequestURI(po.GetUpstreamUrl()); err != nil {
+		return fmt.Errorf("cloud-proxy: upstream_url %q invalid: %w", po.GetUpstreamUrl(), err)
+	}
+
+	mode := po.GetMode()
+	if mode == "" {
+		mode = modePassthrough
+	}
+	switch mode {
+	case modePassthrough:
+	case modeTranslate:
+		switch po.GetProvider() {
+		case providerOpenAI:
+			// implemented in provider_openai.go
+		case providerAnthropic:
+			// implemented in provider_anthropic.go
+		default:
+			return fmt.Errorf("cloud-proxy: translate mode requires provider in {%s, %s}, got %q",
+				providerOpenAI, providerAnthropic, po.GetProvider())
+		}
+	default:
+		return fmt.Errorf("cloud-proxy: unknown mode %q", mode)
+	}
+
+	key, err := resolveAPIKey(po.GetApiKeyEnv(), po.GetApiKeyFile())
+	if err != nil {
+		return err
+	}
+
+	c.cfg.Store(&proxyConfig{
+		upstreamURL:   po.GetUpstreamUrl(),
+		mode:          mode,
+		provider:      po.GetProvider(),
+		upstreamModel: po.GetUpstreamModel(),
+		localModel:    opts.GetModel(),
+		apiKey:        key,
+	})
+	xlog.Info("cloud-proxy: ready",
+		"upstream", po.GetUpstreamUrl(),
+		"mode", mode,
+		"provider", po.GetProvider(),
+		"has_key", key != "")
+	return nil
+}
+
+// resolveAPIKey mirrors config.ProxyConfig.ResolveAPIKey. Duplicated
+// (a few lines) rather than importing core/config from a backend
+// binary — keeps backends independent of core's package layout.
+// Mutual-exclusion is enforced upstream in core/config.Validate.
+func resolveAPIKey(envName, filePath string) (string, error) {
+	if envName != "" {
+		v := os.Getenv(envName)
+		if v == "" {
+			return "", fmt.Errorf("cloud-proxy: api_key_env %q is unset", envName)
+		}
+		return v, nil
+	}
+	if filePath != "" {
+		b, err := os.ReadFile(filePath)
+		if err != nil {
+			return "", fmt.Errorf("cloud-proxy: read api_key_file %q: %w", filePath, err)
+		}
+		return strings.TrimSpace(string(b)), nil
+	}
+	return "", nil
+}
+
+// PredictRich is the non-streaming translate path. Returns a fully-
+// populated *pb.Reply: content, tool-call deltas (ChatDeltas), and
+// usage tokens. Implements the optional grpc.AIModelRich interface;
+// the gRPC server prefers this path over Predict when present so
+// tool calls survive the round-trip. Passthrough mode rejects
+// PredictRich — callers must use Forward.
+func (c *CloudProxy) PredictRich(opts *pb.PredictOptions) (reply *pb.Reply, err error) {
+	cfg := c.cfg.Load()
+	if cfg == nil {
+		return nil, errors.New("cloud-proxy: model not loaded")
+	}
+	if cfg.mode != modeTranslate {
+		return nil, fmt.Errorf("cloud-proxy: Predict only valid in translate mode (have %s)", cfg.mode)
+	}
+	xlog.Info("cloud-proxy: predict", "provider", cfg.provider, "upstream", cfg.upstreamURL, "upstream_model", cfg.upstreamModel)
+	defer func() {
+		if err != nil {
+			xlog.Warn("cloud-proxy: predict failed", "provider", cfg.provider, "error", err)
+		}
+	}()
+	ctx := context.Background()
+	switch cfg.provider {
+	case providerOpenAI:
+		return c.predictOpenAIRich(ctx, cfg, opts)
+	case providerAnthropic:
+		return c.predictAnthropicRich(ctx, cfg, opts)
+	default:
+		return nil, fmt.Errorf("cloud-proxy: predict not implemented for provider %q", cfg.provider)
+	}
+}
+
+// PredictStreamRich is the rich streaming counterpart of PredictRich.
+// Each emitted Reply carries either a content delta, tool-call deltas,
+// or usage tokens (the final upstream frame). base.Base.PredictStream
+// is bypassed when AIModelRich is implemented, so the channel is
+// closed by the gRPC server pump.
+func (c *CloudProxy) PredictStreamRich(opts *pb.PredictOptions, results chan<- *pb.Reply) (err error) {
+	cfg := c.cfg.Load()
+	if cfg == nil {
+		return errors.New("cloud-proxy: model not loaded")
+	}
+	if cfg.mode != modeTranslate {
+		return fmt.Errorf("cloud-proxy: PredictStream only valid in translate mode (have %s)", cfg.mode)
+	}
+	xlog.Info("cloud-proxy: predict-stream", "provider", cfg.provider, "upstream", cfg.upstreamURL, "upstream_model", cfg.upstreamModel)
+	defer func() {
+		if err != nil {
+			xlog.Warn("cloud-proxy: predict-stream failed", "provider", cfg.provider, "error", err)
+		}
+	}()
+	ctx := context.Background()
+	switch cfg.provider {
+	case providerOpenAI:
+		return c.predictOpenAIStreamRich(ctx, cfg, opts, results)
+	case providerAnthropic:
+		return c.predictAnthropicStreamRich(ctx, cfg, opts, results)
+	default:
+		return fmt.Errorf("cloud-proxy: predictStream not implemented for provider %q", cfg.provider)
+	}
+}
+
+// Predict is the legacy (string, error) AIModel signature. Used only
+// if a caller goes through the non-rich path (it shouldn't, since
+// server.go prefers PredictRich). Provided so the AIModel interface
+// is satisfied for backends that haven't opted into the rich variant.
+func (c *CloudProxy) Predict(opts *pb.PredictOptions) (string, error) {
+	reply, err := c.PredictRich(opts)
+	if err != nil {
+		return "", err
+	}
+	return string(reply.GetMessage()), nil
+}
+
+// PredictStream is the legacy chan-string streaming path. Adapts the
+// rich stream by extracting only content text — tool-call-only chunks
+// (no Message bytes) and usage-only chunks are silently dropped, since
+// the legacy chan-string contract cannot represent them. Consumers
+// that need tool calls must call PredictStreamRich directly.
+func (c *CloudProxy) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	defer close(results)
+	richCh := make(chan *pb.Reply)
+	errCh := make(chan error, 1)
+	go func() {
+		errCh <- c.PredictStreamRich(opts, richCh)
+		close(richCh)
+	}()
+	for reply := range richCh {
+		if msg := reply.GetMessage(); len(msg) > 0 {
+			results <- string(msg)
+		}
+	}
+	return <-errCh
+}
+
+// sendReply pushes one Reply onto a stream channel honouring ctx
+// cancellation. Returns false on cancel so the caller can exit with
+// ctx.Err(). Used by both translate-mode providers.
+func sendReply(ctx context.Context, results chan<- *pb.Reply, reply *pb.Reply) bool {
+	select {
+	case results <- reply:
+		return true
+	case <-ctx.Done():
+		return false
+	}
+}
+
+// newToolCallDelta is a small constructor for the cross-provider
+// tool-call delta shape. Centralised so the int32 cast and the four
+// fields stay consistent across the OpenAI / Anthropic translators.
+// Empty name/args are valid — Anthropic streaming announces the call
+// with id+name then sends arguments incrementally; OpenAI's reverse
+// pattern (args without name) also lands here.
+func newToolCallDelta(index int, id, name, args string) *pb.ToolCallDelta {
+	return &pb.ToolCallDelta{
+		Index:     int32(index),
+		Id:        id,
+		Name:      name,
+		Arguments: args,
+	}
+}
+
+// Forward shovels bytes between a Forward gRPC stream and an upstream
+// HTTP request. First request message carries path/method/headers and
+// the initial body chunk; subsequent messages append body chunks. The
+// first reply carries upstream status + response headers; subsequent
+// replies stream body chunks until the upstream connection closes.
+// Cancellation of ctx (the gRPC stream context) closes the upstream
+// connection.
+func (c *CloudProxy) Forward(ctx context.Context, in <-chan *pb.ForwardRequest, out chan<- *pb.ForwardReply) error {
+	defer close(out)
+
+	cfg := c.cfg.Load()
+	if cfg == nil {
+		return errors.New("cloud-proxy: model not loaded")
+	}
+	if cfg.mode != modePassthrough {
+		return fmt.Errorf("cloud-proxy: Forward only valid in passthrough mode (have %s)", cfg.mode)
+	}
+
+	first, ok := <-in
+	if !ok {
+		return errors.New("cloud-proxy: Forward stream closed before first request")
+	}
+
+	// Honour the per-request path only when the configured upstream_url
+	// has no path of its own — gallery convention is to put the
+	// canonical path in upstream_url.
+	fullURL, err := composeURL(cfg.upstreamURL, first.GetPath())
+	if err != nil {
+		return err
+	}
+
+	method := first.GetMethod()
+	if method == "" {
+		method = http.MethodPost
+	}
+
+	// Pipe the body in from the gRPC stream so the HTTP request can
+	// start before the client finishes sending. The pipe-reader is
+	// closed via CloseWithError on the error paths so the writer
+	// goroutine doesn't block forever.
+	pr, pw := io.Pipe()
+
+	go func() {
+		var writeErr error
+		defer func() { _ = pw.CloseWithError(writeErr) }()
+		if len(first.GetBodyChunk()) > 0 {
+			if _, writeErr = pw.Write(first.GetBodyChunk()); writeErr != nil {
+				return
+			}
+		}
+		for req := range in {
+			if len(req.GetBodyChunk()) == 0 {
+				continue
+			}
+			if _, writeErr = pw.Write(req.GetBodyChunk()); writeErr != nil {
+				return
+			}
+		}
+	}()
+
+	req, err := http.NewRequestWithContext(ctx, method, fullURL, pr)
+	if err != nil {
+		_ = pr.CloseWithError(err) // unblocks the body-pump's pw.Write
+		return fmt.Errorf("cloud-proxy: build request: %w", err)
+	}
+
+	// Apply caller-supplied headers, then override with the
+	// authorization header derived from the resolved key. Caller-
+	// supplied Authorization is always replaced — operators may not
+	// know the backend's auth scheme, and silently leaking through a
+	// client Authorization header to a different upstream would
+	// confuse the upstream and could leak credentials.
+	for _, h := range first.GetHeaders() {
+		if h == nil || h.GetName() == "" {
+			continue
+		}
+		// Strip hop-by-hop headers that aren't meaningful to the
+		// upstream (Host is set by the http client from the URL;
+		// Content-Length is computed from the body).
+		if isHopByHopHeader(h.GetName()) {
+			continue
+		}
+		req.Header.Add(h.GetName(), h.GetValue())
+	}
+	if cfg.apiKey != "" {
+		applyAuthHeader(req, cfg.provider, cfg.apiKey)
+	}
+
+	xlog.Info("cloud-proxy: forward", "method", method, "url", fullURL, "provider", cfg.provider)
+	resp, err := c.client.Do(req)
+	if err != nil {
+		xlog.Warn("cloud-proxy: forward upstream failed", "url", fullURL, "error", err)
+		return fmt.Errorf("cloud-proxy: upstream request failed: %w", err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	logFn := xlog.Info
+	if resp.StatusCode >= 400 {
+		logFn = xlog.Warn
+	}
+	logFn("cloud-proxy: forward response", "url", fullURL, "status", resp.StatusCode)
+
+	// First reply: status + response headers, no body.
+	headers := make([]*pb.ForwardHeader, 0, len(resp.Header))
+	for k, vs := range resp.Header {
+		for _, v := range vs {
+			headers = append(headers, &pb.ForwardHeader{Name: k, Value: v})
+		}
+	}
+	out <- &pb.ForwardReply{Status: int32(resp.StatusCode), Headers: headers}
+
+	// Subsequent replies: body chunks. Use a fixed 8KB buffer — small
+	// enough that SSE token frames flush promptly, large enough that
+	// long chunked-transfer bodies aren't death by a thousand reads.
+	buf := make([]byte, 8*1024)
+	for {
+		n, rerr := resp.Body.Read(buf)
+		if n > 0 {
+			chunk := make([]byte, n)
+			copy(chunk, buf[:n])
+			out <- &pb.ForwardReply{BodyChunk: chunk}
+		}
+		if rerr != nil {
+			if errors.Is(rerr, io.EOF) {
+				return nil
+			}
+			return fmt.Errorf("cloud-proxy: upstream body read: %w", rerr)
+		}
+	}
+}
+
+// composeURL combines the configured upstream URL with the per-request
+// path. The upstream URL typically already includes the canonical path
+// (e.g. https://api.openai.com/v1/chat/completions) so the per-request
+// path is ignored in that case. When upstream_url is a bare host
+// (https://api.openai.com), the request path is appended.
+func composeURL(upstream, reqPath string) (string, error) {
+	u, err := url.Parse(upstream)
+	if err != nil {
+		return "", fmt.Errorf("cloud-proxy: parse upstream_url %q: %w", upstream, err)
+	}
+	if u.Path == "" || u.Path == "/" {
+		u.Path = reqPath
+	}
+	return u.String(), nil
+}
+
+// applyAuthHeader writes the appropriate authorization header for the
+// provider. OpenAI/Anthropic/most providers use Bearer; Anthropic
+// historically uses x-api-key + anthropic-version, but accepts Bearer
+// too via the OpenAI-compatible path. Default to Bearer when provider
+// is empty (passthrough mode where the operator doesn't claim a
+// provider).
+func applyAuthHeader(req *http.Request, provider, key string) {
+	switch provider {
+	case providerAnthropic:
+		req.Header.Set("x-api-key", key)
+		if req.Header.Get("anthropic-version") == "" {
+			req.Header.Set("anthropic-version", "2023-06-01")
+		}
+	default:
+		req.Header.Set("Authorization", "Bearer "+key)
+	}
+}
+
+// isHopByHopHeader returns true for headers that should not be
+// forwarded from the client request to the upstream (RFC 7230 §6.1
+// hop-by-hop list, plus a few that the http.Client sets itself).
+func isHopByHopHeader(name string) bool {
+	switch strings.ToLower(name) {
+	case "connection", "proxy-connection", "keep-alive", "transfer-encoding",
+		"te", "trailer", "upgrade", "host", "content-length":
+		return true
+	}
+	return false
+}
+
diff --git a/backend/go/cloud-proxy/proxy_test.go b/backend/go/cloud-proxy/proxy_test.go
new file mode 100644
index 000000000..3881346d4
--- /dev/null
+++ b/backend/go/cloud-proxy/proxy_test.go
@@ -0,0 +1,206 @@
+package main
+
+import (
+	"context"
+	"errors"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+
+	. "github.com/onsi/gomega"
+)
+
+// helper: run a CloudProxy in-process via grpc.Provide so tests can
+// call Forward through the public Backend interface without listening
+// on a real socket.
+func newInProcClient(t *testing.T, proxy *CloudProxy) grpc.Backend {
+	t.Helper()
+	addr := "test://" + t.Name()
+	grpc.Provide(addr, proxy)
+	return grpc.NewClient(addr, true, nil, false)
+}
+
+func TestForward_PassthroughEcho(t *testing.T) {
+	g := NewWithT(t)
+	// Fake upstream: echoes the request body back, prefixed with a
+	// canary so the test can assert both that the body reached the
+	// upstream and the response made it back to the client.
+	gotBody := make(chan string, 1)
+	gotAuth := make(chan string, 1)
+	gotPath := make(chan string, 1)
+	upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		body, _ := io.ReadAll(r.Body)
+		gotBody <- string(body)
+		gotAuth <- r.Header.Get("Authorization")
+		gotPath <- r.URL.Path
+		w.Header().Set("X-Echo", "true")
+		w.WriteHeader(http.StatusOK)
+		_, _ = w.Write([]byte("echo: " + string(body)))
+	}))
+	defer upstream.Close()
+
+	t.Setenv("CLOUD_PROXY_FAKE_KEY", "sk-fake")
+
+	cp := NewCloudProxy()
+	err := cp.Load(&pb.ModelOptions{
+		Proxy: &pb.ProxyOptions{
+			UpstreamUrl: upstream.URL,
+			Mode:        modePassthrough,
+			ApiKeyEnv:   "CLOUD_PROXY_FAKE_KEY",
+		},
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+
+	c := newInProcClient(t, cp)
+	stream, err := c.Forward(context.Background())
+	g.Expect(err).NotTo(HaveOccurred())
+
+	err = stream.Send(&pb.ForwardRequest{
+		Path:      "/v1/chat/completions",
+		Method:    "POST",
+		Headers:   []*pb.ForwardHeader{{Name: "Content-Type", Value: "application/json"}},
+		BodyChunk: []byte(`{"prompt":`),
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+	err = stream.Send(&pb.ForwardRequest{BodyChunk: []byte(`"hi"}`)})
+	g.Expect(err).NotTo(HaveOccurred())
+	err = stream.CloseSend()
+	g.Expect(err).NotTo(HaveOccurred())
+
+	// First reply: status + headers.
+	first, err := stream.Recv()
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(first.Status).To(Equal(int32(http.StatusOK)))
+	g.Expect(hasHeader(first.Headers, "X-Echo", "true")).To(BeTrue())
+
+	// Subsequent replies: body.
+	var body []byte
+	for {
+		r, err := stream.Recv()
+		if errors.Is(err, io.EOF) {
+			break
+		}
+		g.Expect(err).NotTo(HaveOccurred())
+		body = append(body, r.BodyChunk...)
+	}
+	g.Expect(string(body)).To(Equal(`echo: {"prompt":"hi"}`))
+
+	// Upstream observations.
+	var gotBodyVal, gotAuthVal, gotPathVal string
+	g.Eventually(gotBody).Should(Receive(&gotBodyVal), "upstream never saw body")
+	g.Expect(gotBodyVal).To(Equal(`{"prompt":"hi"}`))
+	g.Eventually(gotAuth).Should(Receive(&gotAuthVal), "upstream never saw auth header")
+	g.Expect(gotAuthVal).To(Equal("Bearer sk-fake"))
+	g.Eventually(gotPath).Should(Receive(&gotPathVal), "upstream never saw path")
+	g.Expect(gotPathVal).To(Equal("/v1/chat/completions"))
+}
+
+func TestForward_AnthropicAuthHeader(t *testing.T) {
+	g := NewWithT(t)
+	gotXAPIKey := make(chan string, 1)
+	gotVersion := make(chan string, 1)
+	upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		gotXAPIKey <- r.Header.Get("x-api-key")
+		gotVersion <- r.Header.Get("anthropic-version")
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer upstream.Close()
+
+	t.Setenv("CLOUD_PROXY_ANTHROPIC_KEY", "sk-ant-fake")
+
+	cp := NewCloudProxy()
+	err := cp.Load(&pb.ModelOptions{
+		Proxy: &pb.ProxyOptions{
+			UpstreamUrl: upstream.URL,
+			Mode:        modePassthrough,
+			Provider:    providerAnthropic,
+			ApiKeyEnv:   "CLOUD_PROXY_ANTHROPIC_KEY",
+		},
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+
+	c := newInProcClient(t, cp)
+	stream, err := c.Forward(context.Background())
+	g.Expect(err).NotTo(HaveOccurred())
+	err = stream.Send(&pb.ForwardRequest{Path: "/v1/messages", Method: "POST"})
+	g.Expect(err).NotTo(HaveOccurred())
+	_ = stream.CloseSend()
+	_, _ = stream.Recv() // drain status
+	for {
+		if _, err := stream.Recv(); errors.Is(err, io.EOF) || err != nil {
+			break
+		}
+	}
+
+	g.Expect(<-gotXAPIKey).To(Equal("sk-ant-fake"))
+	g.Expect(<-gotVersion).NotTo(BeEmpty())
+}
+
+func TestLoad_ValidatesConfig(t *testing.T) {
+	g := NewWithT(t)
+	cp := NewCloudProxy()
+
+	err := cp.Load(&pb.ModelOptions{})
+	g.Expect(err).To(HaveOccurred())
+	g.Expect(err.Error()).To(ContainSubstring("ProxyOptions"))
+
+	err = cp.Load(&pb.ModelOptions{Proxy: &pb.ProxyOptions{}})
+	g.Expect(err).To(HaveOccurred())
+	g.Expect(err.Error()).To(ContainSubstring("upstream_url"))
+
+	err = cp.Load(&pb.ModelOptions{Proxy: &pb.ProxyOptions{
+		UpstreamUrl: "https://example.com",
+		Mode:        "rewrite",
+	}})
+	g.Expect(err).To(HaveOccurred())
+	g.Expect(err.Error()).To(ContainSubstring("unknown mode"))
+
+	// translate + openai should load successfully (Phase 5).
+	err = cp.Load(&pb.ModelOptions{Proxy: &pb.ProxyOptions{
+		UpstreamUrl: "https://example.com/v1/chat/completions",
+		Mode:        modeTranslate,
+		Provider:    providerOpenAI,
+	}})
+	g.Expect(err).NotTo(HaveOccurred())
+
+	// translate + anthropic should load successfully (Phase 6).
+	err = cp.Load(&pb.ModelOptions{Proxy: &pb.ProxyOptions{
+		UpstreamUrl: "https://example.com/v1/messages",
+		Mode:        modeTranslate,
+		Provider:    providerAnthropic,
+	}})
+	g.Expect(err).NotTo(HaveOccurred())
+
+	err = cp.Load(&pb.ModelOptions{Proxy: &pb.ProxyOptions{
+		UpstreamUrl: "https://example.com",
+		ApiKeyEnv:   "DEFINITELY_UNSET_ENV_VAR_XYZ",
+	}})
+	g.Expect(err).To(HaveOccurred())
+	g.Expect(err.Error()).To(ContainSubstring("unset"))
+}
+
+func TestForward_RejectsWithoutLoad(t *testing.T) {
+	g := NewWithT(t)
+	cp := NewCloudProxy()
+	c := newInProcClient(t, cp)
+	stream, err := c.Forward(context.Background())
+	g.Expect(err).NotTo(HaveOccurred())
+	_ = stream.CloseSend()
+	_, err = stream.Recv()
+	g.Expect(err).To(HaveOccurred())
+	g.Expect(err.Error()).To(ContainSubstring("not loaded"))
+}
+
+func hasHeader(hs []*pb.ForwardHeader, name, value string) bool {
+	for _, h := range hs {
+		if strings.EqualFold(h.GetName(), name) && h.GetValue() == value {
+			return true
+		}
+	}
+	return false
+}
diff --git a/backend/go/cloud-proxy/run.sh b/backend/go/cloud-proxy/run.sh
new file mode 100755
index 000000000..c533c093a
--- /dev/null
+++ b/backend/go/cloud-proxy/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -ex
+
+CURDIR=$(dirname "$(realpath $0)")
+
+exec $CURDIR/cloud-proxy "$@"
diff --git a/backend/go/cloud-proxy/toolcalls_test.go b/backend/go/cloud-proxy/toolcalls_test.go
new file mode 100644
index 000000000..e05de8023
--- /dev/null
+++ b/backend/go/cloud-proxy/toolcalls_test.go
@@ -0,0 +1,232 @@
+package main
+
+import (
+	"strings"
+	"testing"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/gomega"
+)
+
+// OpenAI: non-streaming tool call response. Verify the response is
+// mapped to Reply.ChatDeltas[].ToolCalls with id/name/arguments intact,
+// and usage tokens land on Reply.PromptTokens / Reply.Tokens.
+func TestPredictRich_OpenAI_ToolCalls(t *testing.T) {
+	srv, _ := fakeOpenAIUpstream(t, func(_ openAIRequest) (int, string, string) {
+		return 200, `{
+			"id":"resp-1",
+			"choices":[{
+				"index":0,
+				"message":{
+					"role":"assistant",
+					"content":"",
+					"tool_calls":[
+						{"id":"call_abc","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"SF\"}"}},
+						{"id":"call_def","type":"function","function":{"name":"get_time","arguments":"{\"tz\":\"PT\"}"}}
+					]
+				},
+				"finish_reason":"tool_calls"
+			}],
+			"usage":{"prompt_tokens":42,"completion_tokens":18,"total_tokens":60}
+		}`, "application/json"
+	})
+	defer srv.Close()
+	g := NewWithT(t)
+	cp := newTranslateCloudProxy(t, srv.URL)
+
+	reply, err := cp.PredictRich(&pb.PredictOptions{
+		Messages: []*pb.Message{{Role: "user", Content: "what's the weather?"}},
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(string(reply.GetMessage())).To(Equal(""))
+	g.Expect(reply.GetPromptTokens()).To(Equal(int32(42)))
+	g.Expect(reply.GetTokens()).To(Equal(int32(18)))
+	g.Expect(reply.GetChatDeltas()).To(HaveLen(1))
+	tcs := reply.GetChatDeltas()[0].GetToolCalls()
+	g.Expect(tcs).To(HaveLen(2))
+	g.Expect(tcs[0].GetId()).To(Equal("call_abc"))
+	g.Expect(tcs[0].GetName()).To(Equal("get_weather"))
+	g.Expect(tcs[0].GetArguments()).To(ContainSubstring(`"location":"SF"`))
+	g.Expect(tcs[1].GetId()).To(Equal("call_def"))
+	g.Expect(tcs[1].GetName()).To(Equal("get_time"))
+}
+
+// OpenAI: streaming tool call. Arguments arrive as a sequence of
+// delta chunks; the consumer is expected to concatenate by tool index.
+// Verify each chunk reaches the channel and the assembled arguments
+// match the input.
+func TestPredictStreamRich_OpenAI_ToolCallDeltas(t *testing.T) {
+	chunks := []string{
+		// Frame 0: announce the tool call (id + name, no args yet).
+		`{"choices":[{"index":0,"delta":{"role":"assistant","tool_calls":[{"index":0,"id":"call_xyz","type":"function","function":{"name":"search"}}]}}]}`,
+		// Frames 1-3: arguments arrive in fragments.
+		`{"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"{\"q\":"}}]}}]}`,
+		`{"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"\"clo"}}]}}]}`,
+		`{"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"uds\"}"}}]}}]}`,
+		// Stop frame.
+		`{"choices":[{"index":0,"delta":{},"finish_reason":"tool_calls"}]}`,
+	}
+	body := ""
+	for _, c := range chunks {
+		body += "data: " + c + "\n\n"
+	}
+	body += "data: [DONE]\n\n"
+
+	srv, _ := fakeOpenAIUpstream(t, func(_ openAIRequest) (int, string, string) {
+		return 200, body, "text/event-stream"
+	})
+	defer srv.Close()
+	g := NewWithT(t)
+	cp := newTranslateCloudProxy(t, srv.URL)
+
+	results := make(chan *pb.Reply, 16)
+	done := make(chan error, 1)
+	go func() {
+		done <- cp.PredictStreamRich(&pb.PredictOptions{
+			Messages: []*pb.Message{{Role: "user", Content: "find something"}},
+		}, results)
+		close(results)
+	}()
+
+	var (
+		toolName  string
+		toolID    string
+		toolIndex int32 = -1
+		argsBuf   strings.Builder
+	)
+	for reply := range results {
+		for _, cd := range reply.GetChatDeltas() {
+			for _, tc := range cd.GetToolCalls() {
+				if tc.GetName() != "" {
+					toolName = tc.GetName()
+				}
+				if tc.GetId() != "" {
+					toolID = tc.GetId()
+				}
+				if toolIndex == -1 {
+					toolIndex = tc.GetIndex()
+				}
+				argsBuf.WriteString(tc.GetArguments())
+			}
+		}
+	}
+	err := <-done
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(toolID).To(Equal("call_xyz"))
+	g.Expect(toolName).To(Equal("search"))
+	g.Expect(toolIndex).To(Equal(int32(0)))
+	g.Expect(argsBuf.String()).To(Equal(`{"q":"clouds"}`))
+}
+
+// Anthropic: non-streaming tool_use block. The block appears in
+// Content[] alongside text blocks; the input field is a structured
+// JSON object. Map to ToolCallDelta with arguments as serialised JSON
+// so downstream OpenAI-shaped consumers see a familiar format.
+func TestPredictRich_Anthropic_ToolUse(t *testing.T) {
+	srv, _ := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 200, `{
+			"id":"msg_1","type":"message","role":"assistant",
+			"content":[
+				{"type":"text","text":"Let me check that."},
+				{"type":"tool_use","id":"toolu_01","name":"weather","input":{"location":"SF"}}
+			],
+			"model":"claude","usage":{"input_tokens":12,"output_tokens":34}
+		}`, "application/json"
+	})
+	defer srv.Close()
+	g := NewWithT(t)
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+
+	reply, err := cp.PredictRich(&pb.PredictOptions{
+		Messages: []*pb.Message{{Role: "user", Content: "what's the weather?"}},
+		Tokens:   64,
+	})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(string(reply.GetMessage())).To(Equal("Let me check that."))
+	g.Expect(reply.GetPromptTokens()).To(Equal(int32(12)))
+	g.Expect(reply.GetTokens()).To(Equal(int32(34)))
+	g.Expect(reply.GetChatDeltas()).To(HaveLen(1))
+	g.Expect(reply.GetChatDeltas()[0].GetToolCalls()).To(HaveLen(1))
+	tc := reply.GetChatDeltas()[0].GetToolCalls()[0]
+	g.Expect(tc.GetId()).To(Equal("toolu_01"))
+	g.Expect(tc.GetName()).To(Equal("weather"))
+	g.Expect(tc.GetArguments()).To(ContainSubstring(`"location":"SF"`))
+}
+
+// Anthropic: streaming tool_use. content_block_start announces the
+// tool's id + name; input_json_delta events carry argument fragments
+// which the consumer accumulates. message_delta carries final usage.
+func TestPredictStreamRich_Anthropic_InputJSONDelta(t *testing.T) {
+	frames := []string{
+		"event: message_start\ndata: {\"type\":\"message_start\"}\n\n",
+		// Block 0 is a tool_use; consumer should allocate a slot.
+		"event: content_block_start\ndata: {\"type\":\"content_block_start\",\"index\":0,\"content_block\":{\"type\":\"tool_use\",\"id\":\"toolu_42\",\"name\":\"lookup\"}}\n\n",
+		"event: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"input_json_delta\",\"partial_json\":\"{\\\"q\\\":\"}}\n\n",
+		"event: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"input_json_delta\",\"partial_json\":\"\\\"rain\\\"}\"}}\n\n",
+		"event: content_block_stop\ndata: {\"type\":\"content_block_stop\",\"index\":0}\n\n",
+		"event: message_delta\ndata: {\"type\":\"message_delta\",\"usage\":{\"output_tokens\":7}}\n\n",
+		"event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n",
+	}
+	body := strings.Join(frames, "")
+
+	srv, _ := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
+		return 200, body, "text/event-stream"
+	})
+	defer srv.Close()
+	g := NewWithT(t)
+	cp := newAnthropicTranslateCloudProxy(t, srv.URL)
+
+	results := make(chan *pb.Reply, 16)
+	done := make(chan error, 1)
+	go func() {
+		done <- cp.PredictStreamRich(&pb.PredictOptions{
+			Messages: []*pb.Message{{Role: "user", Content: "rain?"}},
+			Tokens:   64,
+		}, results)
+		close(results)
+	}()
+
+	var (
+		toolID, toolName string
+		argsBuf          strings.Builder
+		finalTokens      int32
+	)
+	for reply := range results {
+		if reply.GetTokens() > 0 && len(reply.GetChatDeltas()) == 0 {
+			finalTokens = reply.GetTokens()
+			continue
+		}
+		for _, cd := range reply.GetChatDeltas() {
+			for _, tc := range cd.GetToolCalls() {
+				if tc.GetId() != "" {
+					toolID = tc.GetId()
+				}
+				if tc.GetName() != "" {
+					toolName = tc.GetName()
+				}
+				argsBuf.WriteString(tc.GetArguments())
+			}
+		}
+	}
+	err := <-done
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(toolID).To(Equal("toolu_42"))
+	g.Expect(toolName).To(Equal("lookup"))
+	g.Expect(argsBuf.String()).To(Equal(`{"q":"rain"}`))
+	g.Expect(finalTokens).To(Equal(int32(7)))
+}
+
+// Sanity: the legacy Predict() (string, error) signature still works
+// — it delegates to PredictRich and extracts Message.
+func TestPredict_LegacyWrapper_OpenAI(t *testing.T) {
+	srv, _ := fakeOpenAIUpstream(t, func(_ openAIRequest) (int, string, string) {
+		return 200, `{"choices":[{"message":{"role":"assistant","content":"hello"}}]}`, "application/json"
+	})
+	defer srv.Close()
+	g := NewWithT(t)
+	cp := newTranslateCloudProxy(t, srv.URL)
+
+	got, err := cp.Predict(&pb.PredictOptions{Messages: []*pb.Message{{Role: "user", Content: "hi"}}})
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(got).To(Equal("hello"))
+}
diff --git a/backend/go/local-store/debug.go b/backend/go/local-store/debug.go
index 2c3d77cab..503b4ece2 100644
--- a/backend/go/local-store/debug.go
+++ b/backend/go/local-store/debug.go
@@ -8,6 +8,6 @@ import (
 
 func assert(cond bool, msg string) {
 	if !cond {
-		xlog.Fatal().Stack().Msg(msg)
+		xlog.Fatal(msg)
 	}
 }
diff --git a/backend/go/local-store/store.go b/backend/go/local-store/store.go
index e2ad54098..2085f74a9 100644
--- a/backend/go/local-store/store.go
+++ b/backend/go/local-store/store.go
@@ -1,7 +1,22 @@
 package main
 
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+// LocalAI's in-process vector store, exposed as a gRPC backend. Keep
+// the implementation here — NOT in a pkg/ library imported by the main
+// LocalAI process. The whole point of the gRPC surface is that vector
+// storage is a backend like any other (local-store, qdrant, pinecone,
+// ...) and can be swapped without changing the routing/recognition
+// code that consumes it.
+//
+// Storage is a sorted parallel-slice (keys [][]float32, values
+// [][]byte). Set/Delete preserve the sort so Get can binary-search.
+// Find scans linearly and uses a heap to keep the top-K — fine for
+// the tens-to-thousands range. The "normalized fast path" (Find when
+// every stored key has unit magnitude AND the query is normalized)
+// skips the per-item magnitude calculation.
+//
+// Concurrency: base.SingleThread serialises gRPC calls so the
+// non-thread-safe slice/heap manipulation here is sound.
+
 import (
 	"container/heap"
 	"fmt"
@@ -10,30 +25,27 @@ import (
 
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-
-	"github.com/mudler/xlog"
+	"github.com/mudler/LocalAI/pkg/store"
 )
 
 type Store struct {
 	base.SingleThread
 
-	// The sorted keys
-	keys [][]float32
-	// The sorted values
+	keys   [][]float32
 	values [][]byte
 
-	// If for every K it holds that ||k||^2 = 1, then we can use the normalized distance functions
-	// TODO: Should we normalize incoming keys if they are not instead?
+	// keysAreNormalized stays true until any non-unit-magnitude key
+	// is added; once false, the magnitude-aware fallback path is
+	// used by Find. Re-evaluated only at Set time, never again on
+	// its own — a deletion of the offending key does NOT flip it
+	// back to true (the bookkeeping cost would dominate the gain).
 	keysAreNormalized bool
-	// The first key decides the length of the keys
-	keyLen int
-}
 
-// TODO: Only used for sorting using Go's builtin implementation. The interfaces are columnar because
-// that's theoretically best for memory layout and cache locality, but this isn't optimized yet.
-type Pair struct {
-	Key   []float32
-	Value []byte
+	// keyLen is the dimension of every stored key. -1 means "no
+	// keys yet, dimension is open". Dimension mismatch on Set is
+	// rejected so cosine similarity (which requires equal-length
+	// vectors) doesn't silently mis-match.
+	keyLen int
 }
 
 func NewStore() *Store {
@@ -45,334 +57,278 @@ func NewStore() *Store {
 	}
 }
 
-func compareSlices(k1, k2 []float32) int {
-	assert(len(k1) == len(k2), fmt.Sprintf("compareSlices: len(k1) = %d, len(k2) = %d", len(k1), len(k2)))
-
-	return slices.Compare(k1, k2)
-}
-
-func hasKey(unsortedSlice [][]float32, target []float32) bool {
-	return slices.ContainsFunc(unsortedSlice, func(k []float32) bool {
-		return compareSlices(k, target) == 0
-	})
-}
-
-func findInSortedSlice(sortedSlice [][]float32, target []float32) (int, bool) {
-	return slices.BinarySearchFunc(sortedSlice, target, func(k, t []float32) int {
-		return compareSlices(k, t)
-	})
-}
-
-func isSortedPairs(kvs []Pair) bool {
-	for i := 1; i < len(kvs); i++ {
-		if compareSlices(kvs[i-1].Key, kvs[i].Key) > 0 {
-			return false
-		}
-	}
-
-	return true
-}
-
-func isSortedKeys(keys [][]float32) bool {
-	for i := 1; i < len(keys); i++ {
-		if compareSlices(keys[i-1], keys[i]) > 0 {
-			return false
-		}
-	}
-
-	return true
-}
-
-func sortIntoKeySlicese(keys []*pb.StoresKey) [][]float32 {
-	ks := make([][]float32, len(keys))
-
-	for i, k := range keys {
-		ks[i] = k.Floats
-	}
-
-	slices.SortFunc(ks, compareSlices)
-
-	assert(len(ks) == len(keys), fmt.Sprintf("len(ks) = %d, len(keys) = %d", len(ks), len(keys)))
-	assert(isSortedKeys(ks), "keys are not sorted")
-
-	return ks
-}
-
+// Load is a no-op — local-store has no on-disk artefact. opts.Model is
+// just a namespace identifier; isolation is already handled upstream
+// (ModelLoader spawns a fresh local-store process per (backend,
+// model) tuple, so each namespace is its own Store{} instance).
 func (s *Store) Load(opts *pb.ModelOptions) error {
-	// local-store is an in-memory vector store with no on-disk artefact to
-	// load — opts.Model is just a namespace identifier. The old `!= ""` guard
-	// rejected any non-empty model name with "not implemented", which broke
-	// callers that pass a namespace to isolate embedding spaces (face vs.
-	// voice biometrics both go through local-store but need distinct stores
-	// so ArcFace 512-D and ECAPA-TDNN 192-D don't collide). Namespace
-	// isolation is already handled upstream: ModelLoader spawns a fresh
-	// local-store process per (backend, model) tuple, so each namespace is
-	// its own Store{} instance. Nothing to do here beyond accepting the load.
 	_ = opts
 	return nil
 }
 
-// Sort the incoming kvs and merge them with the existing sorted kvs
 func (s *Store) StoresSet(opts *pb.StoresSetOptions) error {
-	if len(opts.Keys) == 0 {
-		return fmt.Errorf("no keys to add")
+	keys := store.UnwrapKeys(opts.Keys)
+	values := store.UnwrapValues(opts.Values)
+	if len(keys) == 0 {
+		return fmt.Errorf("local-store: Set: no keys to add")
 	}
-
-	if len(opts.Keys) != len(opts.Values) {
-		return fmt.Errorf("len(keys) = %d, len(values) = %d", len(opts.Keys), len(opts.Values))
+	if len(keys) != len(values) {
+		return fmt.Errorf("local-store: Set: len(keys) = %d, len(values) = %d", len(keys), len(values))
 	}
 
 	if s.keyLen == -1 {
-		s.keyLen = len(opts.Keys[0].Floats)
-	} else {
-		if len(opts.Keys[0].Floats) != s.keyLen {
-			return fmt.Errorf("Try to add key with length %d when existing length is %d", len(opts.Keys[0].Floats), s.keyLen)
-		}
+		s.keyLen = len(keys[0])
+	} else if len(keys[0]) != s.keyLen {
+		return fmt.Errorf("local-store: Set: key length %d does not match existing %d", len(keys[0]), s.keyLen)
 	}
 
-	kvs := make([]Pair, len(opts.Keys))
-
-	for i, k := range opts.Keys {
-		if s.keysAreNormalized && !isNormalized(k.Floats) {
+	kvs := make([]incomingPair, len(keys))
+	for i, k := range keys {
+		if len(k) != s.keyLen {
+			return fmt.Errorf("local-store: Set: key %d length %d does not match existing %d", i, len(k), s.keyLen)
+		}
+		if s.keysAreNormalized && !isNormalized(k) {
 			s.keysAreNormalized = false
-			var sample []float32
-			if len(s.keys) > 5 {
-				sample = k.Floats[:5]
-			} else {
-				sample = k.Floats
-			}
-			xlog.Debug("Key is not normalized", "sample", sample)
-		}
-
-		kvs[i] = Pair{
-			Key:   k.Floats,
-			Value: opts.Values[i].Bytes,
 		}
+		kvs[i] = incomingPair{key: k, value: values[i]}
 	}
 
-	slices.SortFunc(kvs, func(a, b Pair) int {
-		return compareSlices(a.Key, b.Key)
-	})
-
-	assert(len(kvs) == len(opts.Keys), fmt.Sprintf("len(kvs) = %d, len(opts.Keys) = %d", len(kvs), len(opts.Keys)))
-	assert(isSortedPairs(kvs), "keys are not sorted")
-
-	l := len(kvs) + len(s.keys)
-	merge_ks := make([][]float32, 0, l)
-	merge_vs := make([][]byte, 0, l)
-
-	i, j := 0, 0
-	for {
-		if i+j >= l {
-			break
-		}
-
-		if i >= len(kvs) {
-			merge_ks = append(merge_ks, s.keys[j])
-			merge_vs = append(merge_vs, s.values[j])
-			j++
-			continue
-		}
-
-		if j >= len(s.keys) {
-			merge_ks = append(merge_ks, kvs[i].Key)
-			merge_vs = append(merge_vs, kvs[i].Value)
-			i++
-			continue
-		}
-
-		c := compareSlices(kvs[i].Key, s.keys[j])
-		if c < 0 {
-			merge_ks = append(merge_ks, kvs[i].Key)
-			merge_vs = append(merge_vs, kvs[i].Value)
-			i++
-		} else if c > 0 {
-			merge_ks = append(merge_ks, s.keys[j])
-			merge_vs = append(merge_vs, s.values[j])
-			j++
-		} else {
-			merge_ks = append(merge_ks, kvs[i].Key)
-			merge_vs = append(merge_vs, kvs[i].Value)
-			i++
-			j++
-		}
-	}
-
-	assert(len(merge_ks) == l, fmt.Sprintf("len(merge_ks) = %d, l = %d", len(merge_ks), l))
-	assert(isSortedKeys(merge_ks), "merge keys are not sorted")
-
-	s.keys = merge_ks
-	s.values = merge_vs
+	slices.SortFunc(kvs, func(a, b incomingPair) int { return slices.Compare(a.key, b.key) })
 
+	merged := mergeSortedPairs(s.keys, s.values, kvs)
+	s.keys = merged.keys
+	s.values = merged.values
+	assert(slices.IsSortedFunc(s.keys, slices.Compare[[]float32]), "Set: s.keys not sorted post-merge")
+	assert(len(s.keys) == len(s.values), "Set: keys/values length skew")
 	return nil
 }
 
 func (s *Store) StoresDelete(opts *pb.StoresDeleteOptions) error {
-	if len(opts.Keys) == 0 {
-		return fmt.Errorf("no keys to delete")
+	keys := store.UnwrapKeys(opts.Keys)
+	if len(keys) == 0 {
+		return fmt.Errorf("local-store: Delete: no keys to delete")
 	}
-
-	if len(opts.Keys) == 0 {
-		return fmt.Errorf("no keys to add")
-	}
-
-	if s.keyLen == -1 {
-		s.keyLen = len(opts.Keys[0].Floats)
-	} else {
-		if len(opts.Keys[0].Floats) != s.keyLen {
-			return fmt.Errorf("Trying to delete key with length %d when existing length is %d", len(opts.Keys[0].Floats), s.keyLen)
-		}
-	}
-
-	ks := sortIntoKeySlicese(opts.Keys)
-
-	l := len(s.keys) - len(ks)
-	merge_ks := make([][]float32, 0, l)
-	merge_vs := make([][]byte, 0, l)
-
-	tail_ks := s.keys
-	tail_vs := s.values
-	for _, k := range ks {
-		j, found := findInSortedSlice(tail_ks, k)
-
-		if found {
-			merge_ks = append(merge_ks, tail_ks[:j]...)
-			merge_vs = append(merge_vs, tail_vs[:j]...)
-			tail_ks = tail_ks[j+1:]
-			tail_vs = tail_vs[j+1:]
-		} else {
-			assert(!hasKey(s.keys, k), fmt.Sprintf("Key exists, but was not found: t=%d, %v", len(tail_ks), k))
-		}
-
-		xlog.Debug("Delete", "found", found, "tailLen", len(tail_ks), "j", j, "mergeKeysLen", len(merge_ks), "mergeValuesLen", len(merge_vs))
-	}
-
-	merge_ks = append(merge_ks, tail_ks...)
-	merge_vs = append(merge_vs, tail_vs...)
-
-	assert(len(merge_ks) <= len(s.keys), fmt.Sprintf("len(merge_ks) = %d, len(s.keys) = %d", len(merge_ks), len(s.keys)))
-
-	s.keys = merge_ks
-	s.values = merge_vs
-
-	assert(len(s.keys) >= l, fmt.Sprintf("len(s.keys) = %d, l = %d", len(s.keys), l))
-	assert(isSortedKeys(s.keys), "keys are not sorted")
-	assert(func() bool {
-		for _, k := range ks {
-			if _, found := findInSortedSlice(s.keys, k); found {
-				return false
+	if s.keyLen != -1 {
+		for i, k := range keys {
+			if len(k) != s.keyLen {
+				return fmt.Errorf("local-store: Delete: key %d length %d does not match existing %d", i, len(k), s.keyLen)
 			}
 		}
-		return true
-	}(), "Keys to delete still present")
-
-	if len(s.keys) != l {
-		xlog.Debug("Delete: Some keys not found", "keysLen", len(s.keys), "expectedLen", l)
 	}
+	sortedKeys := append([][]float32(nil), keys...)
+	slices.SortFunc(sortedKeys, slices.Compare[[]float32])
 
+	mergedK := make([][]float32, 0, len(s.keys))
+	mergedV := make([][]byte, 0, len(s.keys))
+	tailK := s.keys
+	tailV := s.values
+	for _, k := range sortedKeys {
+		j, ok := slices.BinarySearchFunc(tailK, k, slices.Compare[[]float32])
+		if ok {
+			mergedK = append(mergedK, tailK[:j]...)
+			mergedV = append(mergedV, tailV[:j]...)
+			tailK = tailK[j+1:]
+			tailV = tailV[j+1:]
+		}
+	}
+	mergedK = append(mergedK, tailK...)
+	mergedV = append(mergedV, tailV...)
+	s.keys = mergedK
+	s.values = mergedV
+	assert(slices.IsSortedFunc(s.keys, slices.Compare[[]float32]), "Delete: s.keys not sorted post-merge")
+	assert(len(s.keys) == len(s.values), "Delete: keys/values length skew")
 	return nil
 }
 
+// StoresGet fetches values for the given keys. Missing keys are
+// omitted from the result rather than reported as an error — callers
+// compare returned-key length against requested-key length to detect
+// them. Returned slices are aligned.
 func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error) {
-	pbKeys := make([]*pb.StoresKey, 0, len(opts.Keys))
-	pbValues := make([]*pb.StoresValue, 0, len(opts.Keys))
-	ks := sortIntoKeySlicese(opts.Keys)
-
+	keys := store.UnwrapKeys(opts.Keys)
 	if len(s.keys) == 0 {
-		xlog.Debug("Get: No keys in store")
+		return pb.StoresGetResult{}, nil
 	}
-
-	if s.keyLen == -1 {
-		s.keyLen = len(opts.Keys[0].Floats)
-	} else {
-		if len(opts.Keys[0].Floats) != s.keyLen {
-			return pb.StoresGetResult{}, fmt.Errorf("Try to get a key with length %d when existing length is %d", len(opts.Keys[0].Floats), s.keyLen)
+	if s.keyLen != -1 {
+		for i, k := range keys {
+			if len(k) != s.keyLen {
+				return pb.StoresGetResult{}, fmt.Errorf("local-store: Get: key %d length %d does not match existing %d", i, len(k), s.keyLen)
+			}
 		}
 	}
+	sortedKeys := append([][]float32(nil), keys...)
+	slices.SortFunc(sortedKeys, slices.Compare[[]float32])
 
-	tail_k := s.keys
-	tail_v := s.values
-	for i, k := range ks {
-		j, found := findInSortedSlice(tail_k, k)
-
-		if found {
-			pbKeys = append(pbKeys, &pb.StoresKey{
-				Floats: k,
-			})
-			pbValues = append(pbValues, &pb.StoresValue{
-				Bytes: tail_v[j],
-			})
-
-			tail_k = tail_k[j+1:]
-			tail_v = tail_v[j+1:]
-		} else {
-			assert(!hasKey(s.keys, k), fmt.Sprintf("Key exists, but was not found: i=%d, %v", i, k))
+	var foundKeys [][]float32
+	var foundValues [][]byte
+	tailK := s.keys
+	tailV := s.values
+	for _, k := range sortedKeys {
+		j, ok := slices.BinarySearchFunc(tailK, k, slices.Compare[[]float32])
+		if !ok {
+			continue
 		}
+		foundKeys = append(foundKeys, tailK[j])
+		foundValues = append(foundValues, tailV[j])
+		tailK = tailK[j+1:]
+		tailV = tailV[j+1:]
 	}
-
-	if len(pbKeys) != len(opts.Keys) {
-		xlog.Debug("Get: Some keys not found", "pbKeysLen", len(pbKeys), "optsKeysLen", len(opts.Keys), "storeKeysLen", len(s.keys))
-	}
-
 	return pb.StoresGetResult{
-		Keys:   pbKeys,
-		Values: pbValues,
+		Keys:   store.WrapKeys(foundKeys),
+		Values: store.WrapValues(foundValues),
 	}, nil
 }
 
+// StoresFind returns the topK nearest stored entries by cosine
+// similarity, ordered most-similar first. An empty store returns
+// empty slices and no error.
+func (s *Store) StoresFind(opts *pb.StoresFindOptions) (pb.StoresFindResult, error) {
+	query := opts.Key.Floats
+	topK := int(opts.TopK)
+	if topK < 1 {
+		return pb.StoresFindResult{}, fmt.Errorf("local-store: Find: topK = %d, must be >= 1", topK)
+	}
+	if len(s.keys) == 0 {
+		return pb.StoresFindResult{}, nil
+	}
+	if len(query) != s.keyLen {
+		return pb.StoresFindResult{}, fmt.Errorf("local-store: Find: query length %d does not match existing %d", len(query), s.keyLen)
+	}
+
+	var keys [][]float32
+	var values [][]byte
+	var sims []float32
+	if s.keysAreNormalized && isNormalized(query) {
+		keys, values, sims = s.findNormalized(query, topK)
+	} else {
+		keys, values, sims = s.findFallback(query, topK)
+	}
+	return pb.StoresFindResult{
+		Keys:         store.WrapKeys(keys),
+		Values:       store.WrapValues(values),
+		Similarities: sims,
+	}, nil
+}
+
+func (s *Store) findNormalized(query []float32, topK int) (keys [][]float32, values [][]byte, similarities []float32) {
+	assert(s.keysAreNormalized, "findNormalized: s.keysAreNormalized is false")
+	assert(isNormalized(query), "findNormalized: query is not unit-length")
+	pq := make(priorityQueue, 0, topK)
+	heap.Init(&pq)
+	for i, k := range s.keys {
+		var dot float32
+		for j := range k {
+			dot += query[j] * k[j]
+		}
+		assert(dot >= -1.01 && dot <= 1.01, fmt.Sprintf("findNormalized: dot %f out of [-1, 1] — keysAreNormalized invariant violated", dot))
+		heap.Push(&pq, &priorityItem{similarity: dot, key: k, value: s.values[i]})
+		if pq.Len() > topK {
+			heap.Pop(&pq)
+		}
+	}
+	return drainPQ(&pq)
+}
+
+func (s *Store) findFallback(query []float32, topK int) (keys [][]float32, values [][]byte, similarities []float32) {
+	var qmag float64
+	for _, v := range query {
+		qmag += float64(v) * float64(v)
+	}
+	qmag = math.Sqrt(qmag)
+	pq := make(priorityQueue, 0, topK)
+	heap.Init(&pq)
+	for i, k := range s.keys {
+		var dot, kmag float64
+		for j := range k {
+			dot += float64(query[j]) * float64(k[j])
+			kmag += float64(k[j]) * float64(k[j])
+		}
+		denom := qmag * math.Sqrt(kmag)
+		var sim float32
+		if denom > 0 {
+			sim = float32(dot / denom)
+		}
+		heap.Push(&pq, &priorityItem{similarity: sim, key: k, value: s.values[i]})
+		if pq.Len() > topK {
+			heap.Pop(&pq)
+		}
+	}
+	return drainPQ(&pq)
+}
+
 func isNormalized(k []float32) bool {
 	var sum float64
-
 	for _, v := range k {
-		v64 := float64(v)
-		sum += v64 * v64
+		sum += float64(v) * float64(v)
 	}
-
-	s := math.Sqrt(sum)
-
-	return s >= 0.99 && s <= 1.01
+	mag := math.Sqrt(sum)
+	return mag >= 0.99 && mag <= 1.01
 }
 
-// TODO: This we could replace with handwritten SIMD code
-func normalizedCosineSimilarity(k1, k2 []float32) float32 {
-	assert(len(k1) == len(k2), fmt.Sprintf("normalizedCosineSimilarity: len(k1) = %d, len(k2) = %d", len(k1), len(k2)))
+type incomingPair struct {
+	key   []float32
+	value []byte
+}
 
-	var dot float32
-	for i := range len(k1) {
-		dot += k1[i] * k2[i]
+type pairs struct {
+	keys   [][]float32
+	values [][]byte
+}
+
+// mergeSortedPairs merges (existing, incoming) into a fresh sorted
+// slice. Equal keys take the incoming value — Set is upsert.
+func mergeSortedPairs(existingK [][]float32, existingV [][]byte, incoming []incomingPair) pairs {
+	assert(slices.IsSortedFunc(existingK, slices.Compare[[]float32]), "mergeSortedPairs: existing not sorted")
+	assert(slices.IsSortedFunc(incoming, func(a, b incomingPair) int { return slices.Compare(a.key, b.key) }), "mergeSortedPairs: incoming not sorted")
+	l := len(existingK) + len(incoming)
+	mk := make([][]float32, 0, l)
+	mv := make([][]byte, 0, l)
+	i, j := 0, 0
+	for i < len(incoming) || j < len(existingK) {
+		switch {
+		case j >= len(existingK):
+			mk = append(mk, incoming[i].key)
+			mv = append(mv, incoming[i].value)
+			i++
+		case i >= len(incoming):
+			mk = append(mk, existingK[j])
+			mv = append(mv, existingV[j])
+			j++
+		default:
+			c := slices.Compare(incoming[i].key, existingK[j])
+			switch {
+			case c < 0:
+				mk = append(mk, incoming[i].key)
+				mv = append(mv, incoming[i].value)
+				i++
+			case c > 0:
+				mk = append(mk, existingK[j])
+				mv = append(mv, existingV[j])
+				j++
+			default:
+				mk = append(mk, incoming[i].key)
+				mv = append(mv, incoming[i].value)
+				i++
+				j++
+			}
+		}
 	}
-
-	assert(dot >= -1.01 && dot <= 1.01, fmt.Sprintf("dot = %f", dot))
-
-	// 2.0 * (1.0 - dot) would be the Euclidean distance
-	return dot
+	return pairs{keys: mk, values: mv}
 }
 
-type PriorityItem struct {
-	Similarity float32
-	Key        []float32
-	Value      []byte
+type priorityItem struct {
+	similarity float32
+	key        []float32
+	value      []byte
 }
 
-type PriorityQueue []*PriorityItem
+type priorityQueue []*priorityItem
 
-func (pq PriorityQueue) Len() int { return len(pq) }
-
-func (pq PriorityQueue) Less(i, j int) bool {
-	// Inverted because the most similar should be at the top
-	return pq[i].Similarity < pq[j].Similarity
-}
-
-func (pq PriorityQueue) Swap(i, j int) {
-	pq[i], pq[j] = pq[j], pq[i]
-}
-
-func (pq *PriorityQueue) Push(x any) {
-	item := x.(*PriorityItem)
-	*pq = append(*pq, item)
-}
-
-func (pq *PriorityQueue) Pop() any {
+func (pq priorityQueue) Len() int           { return len(pq) }
+func (pq priorityQueue) Less(i, j int) bool { return pq[i].similarity < pq[j].similarity }
+func (pq priorityQueue) Swap(i, j int)      { pq[i], pq[j] = pq[j], pq[i] }
+func (pq *priorityQueue) Push(x any)        { *pq = append(*pq, x.(*priorityItem)) }
+func (pq *priorityQueue) Pop() any {
 	old := *pq
 	n := len(old)
 	item := old[n-1]
@@ -380,142 +336,16 @@ func (pq *PriorityQueue) Pop() any {
 	return item
 }
 
-func (s *Store) StoresFindNormalized(opts *pb.StoresFindOptions) (pb.StoresFindResult, error) {
-	tk := opts.Key.Floats
-	top_ks := make(PriorityQueue, 0, int(opts.TopK))
-	heap.Init(&top_ks)
-
-	for i, k := range s.keys {
-		sim := normalizedCosineSimilarity(tk, k)
-		heap.Push(&top_ks, &PriorityItem{
-			Similarity: sim,
-			Key:        k,
-			Value:      s.values[i],
-		})
-
-		if top_ks.Len() > int(opts.TopK) {
-			heap.Pop(&top_ks)
-		}
-	}
-
-	similarities := make([]float32, top_ks.Len())
-	pbKeys := make([]*pb.StoresKey, top_ks.Len())
-	pbValues := make([]*pb.StoresValue, top_ks.Len())
-
-	for i := top_ks.Len() - 1; i >= 0; i-- {
-		item := heap.Pop(&top_ks).(*PriorityItem)
-
-		similarities[i] = item.Similarity
-		pbKeys[i] = &pb.StoresKey{
-			Floats: item.Key,
-		}
-		pbValues[i] = &pb.StoresValue{
-			Bytes: item.Value,
-		}
-	}
-
-	return pb.StoresFindResult{
-		Keys:         pbKeys,
-		Values:       pbValues,
-		Similarities: similarities,
-	}, nil
-}
-
-func cosineSimilarity(k1, k2 []float32, mag1 float64) float32 {
-	assert(len(k1) == len(k2), fmt.Sprintf("cosineSimilarity: len(k1) = %d, len(k2) = %d", len(k1), len(k2)))
-
-	var dot, mag2 float64
-	for i := range len(k1) {
-		dot += float64(k1[i] * k2[i])
-		mag2 += float64(k2[i] * k2[i])
-	}
-
-	sim := float32(dot / (mag1 * math.Sqrt(mag2)))
-
-	assert(sim >= -1.01 && sim <= 1.01, fmt.Sprintf("sim = %f", sim))
-
-	return sim
-}
-
-func (s *Store) StoresFindFallback(opts *pb.StoresFindOptions) (pb.StoresFindResult, error) {
-	tk := opts.Key.Floats
-	top_ks := make(PriorityQueue, 0, int(opts.TopK))
-	heap.Init(&top_ks)
-
-	var mag1 float64
-	for _, v := range tk {
-		mag1 += float64(v * v)
-	}
-	mag1 = math.Sqrt(mag1)
-
-	for i, k := range s.keys {
-		dist := cosineSimilarity(tk, k, mag1)
-		heap.Push(&top_ks, &PriorityItem{
-			Similarity: dist,
-			Key:        k,
-			Value:      s.values[i],
-		})
-
-		if top_ks.Len() > int(opts.TopK) {
-			heap.Pop(&top_ks)
-		}
-	}
-
-	similarities := make([]float32, top_ks.Len())
-	pbKeys := make([]*pb.StoresKey, top_ks.Len())
-	pbValues := make([]*pb.StoresValue, top_ks.Len())
-
-	for i := top_ks.Len() - 1; i >= 0; i-- {
-		item := heap.Pop(&top_ks).(*PriorityItem)
-
-		similarities[i] = item.Similarity
-		pbKeys[i] = &pb.StoresKey{
-			Floats: item.Key,
-		}
-		pbValues[i] = &pb.StoresValue{
-			Bytes: item.Value,
-		}
-	}
-
-	return pb.StoresFindResult{
-		Keys:         pbKeys,
-		Values:       pbValues,
-		Similarities: similarities,
-	}, nil
-}
-
-func (s *Store) StoresFind(opts *pb.StoresFindOptions) (pb.StoresFindResult, error) {
-	tk := opts.Key.Floats
-
-	if len(tk) != s.keyLen {
-		return pb.StoresFindResult{}, fmt.Errorf("Try to find key with length %d when existing length is %d", len(tk), s.keyLen)
-	}
-
-	if opts.TopK < 1 {
-		return pb.StoresFindResult{}, fmt.Errorf("opts.TopK = %d, must be >= 1", opts.TopK)
-	}
-
-	if s.keyLen == -1 {
-		s.keyLen = len(opts.Key.Floats)
-	} else {
-		if len(opts.Key.Floats) != s.keyLen {
-			return pb.StoresFindResult{}, fmt.Errorf("Try to add key with length %d when existing length is %d", len(opts.Key.Floats), s.keyLen)
-		}
-	}
-
-	if s.keysAreNormalized && isNormalized(tk) {
-		return s.StoresFindNormalized(opts)
-	} else {
-		if s.keysAreNormalized {
-			var sample []float32
-			if len(s.keys) > 5 {
-				sample = tk[:5]
-			} else {
-				sample = tk
-			}
-			xlog.Debug("Trying to compare non-normalized key with normalized keys", "sample", sample)
-		}
-
-		return s.StoresFindFallback(opts)
+func drainPQ(pq *priorityQueue) (keys [][]float32, values [][]byte, similarities []float32) {
+	n := pq.Len()
+	keys = make([][]float32, n)
+	values = make([][]byte, n)
+	similarities = make([]float32, n)
+	for i := n - 1; i >= 0; i-- {
+		item := heap.Pop(pq).(*priorityItem)
+		keys[i] = item.key
+		values[i] = item.value
+		similarities[i] = item.similarity
 	}
+	return keys, values, similarities
 }
diff --git a/backend/go/local-store/store_suite_test.go b/backend/go/local-store/store_suite_test.go
new file mode 100644
index 000000000..63affb46b
--- /dev/null
+++ b/backend/go/local-store/store_suite_test.go
@@ -0,0 +1,13 @@
+package main
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestLocalStore(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "local-store test suite")
+}
diff --git a/backend/go/local-store/store_test.go b/backend/go/local-store/store_test.go
new file mode 100644
index 000000000..2043647c0
--- /dev/null
+++ b/backend/go/local-store/store_test.go
@@ -0,0 +1,284 @@
+package main
+
+// Regression suite for the local-store gRPC backend. Exercises the
+// Stores{Set,Get,Find,Delete} surface — the only public contract.
+// Callers (face/voice recognition, the routing KNN classifier) reach
+// this code via grpc.Backend, so testing at the wire-shaped boundary
+// matches the production import shape.
+
+import (
+	"math"
+	"math/rand/v2"
+	"testing"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("StoresSet", func() {
+	It("rejects empty input", func() {
+		Expect(NewStore().StoresSet(&pb.StoresSetOptions{})).NotTo(Succeed(), "Set with no keys should fail")
+	})
+
+	It("rejects key/value length mismatch", func() {
+		err := NewStore().StoresSet(&pb.StoresSetOptions{
+			Keys:   wrapKeys([][]float32{{1, 0, 0}}),
+			Values: wrapValues([][]byte{[]byte("a"), []byte("b")}),
+		})
+		Expect(err).To(HaveOccurred(), "len(keys) != len(values) should fail")
+	})
+
+	It("rejects dimension mismatch on later add", func() {
+		s := NewStore()
+		mustSet(s, [][]float32{{1, 0, 0}}, [][]byte{[]byte("3d")})
+		err := s.StoresSet(&pb.StoresSetOptions{
+			Keys:   wrapKeys([][]float32{{1, 0}}),
+			Values: wrapValues([][]byte{[]byte("2d")}),
+		})
+		Expect(err).To(HaveOccurred(), "dimension mismatch on later Set should fail")
+	})
+
+	It("rejects dimension mismatch within batch", func() {
+		err := NewStore().StoresSet(&pb.StoresSetOptions{
+			Keys:   wrapKeys([][]float32{{1, 0, 0}, {1, 0}}),
+			Values: wrapValues([][]byte{[]byte("3d"), []byte("2d")}),
+		})
+		Expect(err).To(HaveOccurred(), "mixed-dimension within one batch should fail")
+	})
+
+	It("merges sorted and updates existing key", func() {
+		s := NewStore()
+		mustSet(s, [][]float32{{0.3, 0, 0}, {0.1, 0, 0}}, [][]byte{[]byte("c"), []byte("a")})
+		mustSet(s, [][]float32{{0.2, 0, 0}, {0.1, 0, 0}}, [][]byte{[]byte("b"), []byte("a-updated")})
+		Expect(s.keys).To(HaveLen(3))
+		got := singleGet(s, []float32{0.1, 0, 0})
+		Expect(string(got)).To(Equal("a-updated"))
+	})
+})
+
+var _ = Describe("StoresGet", func() {
+	It("round-trips multi-key", func() {
+		s := NewStore()
+		mustSet(s,
+			[][]float32{{0.1, 0.2, 0.3}, {0.4, 0.5, 0.6}, {0.7, 0.8, 0.9}},
+			[][]byte{[]byte("a"), []byte("b"), []byte("c")},
+		)
+		res, err := s.StoresGet(&pb.StoresGetOptions{
+			Keys: wrapKeys([][]float32{{0.7, 0.8, 0.9}, {0.1, 0.2, 0.3}}),
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Keys).To(HaveLen(2))
+	})
+
+	It("omits missing keys rather than erroring", func() {
+		s := NewStore()
+		mustSet(s, [][]float32{{0.1, 0, 0}}, [][]byte{[]byte("a")})
+		res, err := s.StoresGet(&pb.StoresGetOptions{
+			Keys: wrapKeys([][]float32{{0.1, 0, 0}, {0.9, 0, 0}}),
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Keys).To(HaveLen(1))
+	})
+})
+
+var _ = Describe("StoresDelete", func() {
+	It("removes and preserves sort", func() {
+		s := NewStore()
+		mustSet(s,
+			[][]float32{{0.1, 0, 0}, {0.2, 0, 0}, {0.3, 0, 0}, {0.4, 0, 0}},
+			[][]byte{[]byte("a"), []byte("b"), []byte("c"), []byte("d")},
+		)
+		Expect(s.StoresDelete(&pb.StoresDeleteOptions{
+			Keys: wrapKeys([][]float32{{0.2, 0, 0}, {0.4, 0, 0}}),
+		})).To(Succeed())
+		Expect(s.keys).To(HaveLen(2))
+	})
+
+	It("tolerates missing keys", func() {
+		s := NewStore()
+		mustSet(s, [][]float32{{0.1, 0, 0}}, [][]byte{[]byte("a")})
+		Expect(s.StoresDelete(&pb.StoresDeleteOptions{
+			Keys: wrapKeys([][]float32{{0.9, 0, 0}}),
+		})).To(Succeed(), "delete of missing key should succeed")
+		Expect(s.keys).To(HaveLen(1))
+	})
+})
+
+var _ = Describe("StoresFind", func() {
+	It("returns normalized top-K", func() {
+		s := NewStore()
+		mustSet(s,
+			[][]float32{
+				normalizeVec([]float32{1, 0, 0}),
+				normalizeVec([]float32{0, 1, 0}),
+				normalizeVec([]float32{0, 0, 1}),
+			},
+			[][]byte{[]byte("x"), []byte("y"), []byte("z")},
+		)
+		res, err := s.StoresFind(&pb.StoresFindOptions{
+			Key:  &pb.StoresKey{Floats: normalizeVec([]float32{0.9, 0.1, 0})},
+			TopK: 2,
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Keys).To(HaveLen(2))
+		Expect(res.Similarities[0]).To(BeNumerically(">=", res.Similarities[1]), "results not sorted desc by similarity")
+		Expect(string(res.Values[0].Bytes)).To(Equal("x"))
+	})
+
+	It("falls back for non-normalized keys", func() {
+		s := NewStore()
+		mustSet(s, [][]float32{{2, 0, 0}, {0, 3, 0}}, [][]byte{[]byte("x"), []byte("y")})
+		Expect(s.keysAreNormalized).To(BeFalse(), "store should report non-normalized after Set with magnitude > 1")
+		res, err := s.StoresFind(&pb.StoresFindOptions{
+			Key:  &pb.StoresKey{Floats: []float32{4, 0, 0}},
+			TopK: 1,
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(string(res.Values[0].Bytes)).To(Equal("x"))
+		Expect(res.Similarities[0]).To(BeNumerically(">=", float32(0.99)))
+		Expect(res.Similarities[0]).To(BeNumerically("<=", float32(1.01)))
+	})
+
+	It("rejects zero topK", func() {
+		s := NewStore()
+		mustSet(s, [][]float32{{1, 0, 0}}, [][]byte{[]byte("x")})
+		_, err := s.StoresFind(&pb.StoresFindOptions{
+			Key:  &pb.StoresKey{Floats: []float32{1, 0, 0}},
+			TopK: 0,
+		})
+		Expect(err).To(HaveOccurred(), "Find with topK=0 should fail")
+	})
+
+	It("rejects dimension mismatch", func() {
+		s := NewStore()
+		mustSet(s, [][]float32{{1, 0, 0}}, [][]byte{[]byte("x")})
+		_, err := s.StoresFind(&pb.StoresFindOptions{
+			Key:  &pb.StoresKey{Floats: []float32{1, 0}},
+			TopK: 1,
+		})
+		Expect(err).To(HaveOccurred(), "Find with mismatched dimension should fail")
+	})
+
+	It("returns empty result on empty store", func() {
+		res, err := NewStore().StoresFind(&pb.StoresFindOptions{
+			Key:  &pb.StoresKey{Floats: []float32{1, 0, 0}},
+			TopK: 5,
+		})
+		Expect(err).NotTo(HaveOccurred(), "Find on empty store should succeed")
+		Expect(res.Keys).To(BeEmpty())
+	})
+
+	It("handles topK larger than store", func() {
+		s := NewStore()
+		mustSet(s,
+			[][]float32{normalizeVec([]float32{1, 0, 0}), normalizeVec([]float32{0, 1, 0})},
+			[][]byte{[]byte("x"), []byte("y")},
+		)
+		res, err := s.StoresFind(&pb.StoresFindOptions{
+			Key:  &pb.StoresKey{Floats: normalizeVec([]float32{1, 0, 0})},
+			TopK: 10,
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Keys).To(HaveLen(2))
+	})
+})
+
+var _ = Describe("StoresLoad", func() {
+	It("is a no-op", func() {
+		Expect(NewStore().Load(&pb.ModelOptions{Model: "any-namespace"})).To(Succeed())
+	})
+})
+
+func BenchmarkStoresFindNormalized(b *testing.B) {
+	const dim = 768
+	for _, n := range []int{8, 32, 128, 512} {
+		b.Run(fmtN(n), func(b *testing.B) {
+			s := buildStore(b, n, dim)
+			query := normalizeVec(randVec(dim, 42))
+			req := &pb.StoresFindOptions{Key: &pb.StoresKey{Floats: query}, TopK: 1}
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				if _, err := s.StoresFind(req); err != nil {
+					b.Fatal(err)
+				}
+			}
+		})
+	}
+}
+
+// --- test helpers ---
+
+func mustSet(s *Store, keys [][]float32, values [][]byte) {
+	ExpectWithOffset(1, s.StoresSet(&pb.StoresSetOptions{Keys: wrapKeys(keys), Values: wrapValues(values)})).To(Succeed())
+}
+
+func singleGet(s *Store, key []float32) []byte {
+	res, err := s.StoresGet(&pb.StoresGetOptions{Keys: wrapKeys([][]float32{key})})
+	ExpectWithOffset(1, err).NotTo(HaveOccurred())
+	if len(res.Values) == 0 {
+		return nil
+	}
+	return res.Values[0].Bytes
+}
+
+func wrapKeys(in [][]float32) []*pb.StoresKey {
+	out := make([]*pb.StoresKey, len(in))
+	for i, k := range in {
+		out[i] = &pb.StoresKey{Floats: k}
+	}
+	return out
+}
+
+func wrapValues(in [][]byte) []*pb.StoresValue {
+	out := make([]*pb.StoresValue, len(in))
+	for i, v := range in {
+		out[i] = &pb.StoresValue{Bytes: v}
+	}
+	return out
+}
+
+func buildStore(tb testing.TB, n, dim int) *Store {
+	tb.Helper()
+	s := NewStore()
+	keys := make([][]float32, n)
+	values := make([][]byte, n)
+	for i := 0; i < n; i++ {
+		keys[i] = normalizeVec(randVec(dim, int64(i)+1))
+		values[i] = []byte{byte(i)}
+	}
+	if err := s.StoresSet(&pb.StoresSetOptions{Keys: wrapKeys(keys), Values: wrapValues(values)}); err != nil {
+		tb.Fatal(err)
+	}
+	return s
+}
+
+func randVec(dim int, seed int64) []float32 {
+	r := rand.New(rand.NewPCG(uint64(seed), 0xabcdef))
+	v := make([]float32, dim)
+	for i := range v {
+		v[i] = float32(r.NormFloat64())
+	}
+	return v
+}
+
+func normalizeVec(v []float32) []float32 {
+	var sum float64
+	for _, x := range v {
+		sum += float64(x) * float64(x)
+	}
+	mag := math.Sqrt(sum)
+	if mag == 0 {
+		return v
+	}
+	out := make([]float32, len(v))
+	for i, x := range v {
+		out[i] = float32(float64(x) / mag)
+	}
+	return out
+}
+
+func fmtN(n int) string {
+	return map[int]string{8: "n=8", 32: "n=32", 128: "n=128", 512: "n=512"}[n]
+}
diff --git a/backend/python/transformers/backend.py b/backend/python/transformers/backend.py
index f2f70acb3..a8c1840b3 100644
--- a/backend/python/transformers/backend.py
+++ b/backend/python/transformers/backend.py
@@ -26,7 +26,7 @@ import torch.cuda
 
 XPU=os.environ.get("XPU", "0") == "1"
 import transformers as transformers_module
-from transformers import AutoTokenizer, AutoModel, AutoProcessor, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria
+from transformers import AutoTokenizer, AutoModel, AutoProcessor, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria, pipeline
 from scipy.io import wavfile
 from sentence_transformers import SentenceTransformer
 
@@ -200,6 +200,21 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 autoTokenizer = False
                 self.model = SentenceTransformer(model_name, trust_remote_code=request.TrustRemoteCode)
                 self.SentenceTransformer = True
+            elif request.Type == "TokenClassification":
+                # NER / PII tagging via HuggingFace's token-classification
+                # pipeline. aggregation_strategy="simple" merges B-/I- tags
+                # into single spans and gives byte offsets back. The
+                # tokenizer is bundled inside the pipeline, so we skip the
+                # AutoTokenizer load below.
+                autoTokenizer = False
+                self.tokenClassifier = pipeline(
+                    "token-classification",
+                    model=model_name,
+                    aggregation_strategy="simple",
+                    device=0 if self.CUDA else -1,
+                    trust_remote_code=request.TrustRemoteCode,
+                )
+                self.TokenClassification = True
             else:
                 # Generic: dynamically resolve model class from transformers
                 model_type = TYPE_ALIASES.get(request.Type, request.Type)
@@ -253,6 +268,39 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
         return backend_pb2.Result(message="Model loaded successfully", success=True)
 
+    def TokenClassify(self, request, context):
+        # Runs HuggingFace's token-classification pipeline and returns
+        # the aggregated entity spans. The pipeline gives us byte
+        # offsets via aggregation_strategy="simple" (set at load
+        # time), so the caller can slice the original text without
+        # re-tokenising on the Go side.
+        if not getattr(self, "TokenClassification", False):
+            context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
+            context.set_details("model was not loaded as Type=TokenClassification")
+            return backend_pb2.TokenClassifyResponse()
+        try:
+            results = self.tokenClassifier(request.text)
+        except Exception as err:
+            print("TokenClassify error:", err, file=sys.stderr)
+            context.set_code(grpc.StatusCode.INTERNAL)
+            context.set_details(f"token-classification failed: {err}")
+            return backend_pb2.TokenClassifyResponse()
+
+        threshold = request.threshold if request.threshold > 0 else 0.0
+        entities = []
+        for r in results:
+            score = float(r.get("score", 0.0))
+            if score < threshold:
+                continue
+            entities.append(backend_pb2.TokenClassifyEntity(
+                entity_group=str(r.get("entity_group") or r.get("entity") or ""),
+                start=int(r.get("start", 0)),
+                end=int(r.get("end", 0)),
+                score=score,
+                text=str(r.get("word", "")),
+            ))
+        return backend_pb2.TokenClassifyResponse(entities=entities)
+
     def Embedding(self, request, context):
         set_seed(request.Seed)
         # Tokenize input
diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 967c4420c..74598660b 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -356,6 +356,133 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         except Exception as e:
             return backend_pb2.Result(success=False, message=str(e))
 
+    async def Score(self, request, context):
+        """
+        Joint log-probability of each candidate continuation given the
+        shared prompt. Used by routing-policy multi-label classification
+        (read the distribution rather than asking the model to emit a
+        single argmax label), reranking, and reward-model scoring.
+
+        Implementation uses vLLM's `prompt_logprobs` to recover the
+        per-token log P(token_i | tokens_<i) for the full concatenated
+        sequence; the candidate's tokens are the suffix whose logprobs
+        get summed. max_tokens=1 because vLLM requires at least one
+        generated token; the generated token is discarded.
+        """
+        if not hasattr(self, 'llm') or self.llm is None:
+            context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
+            context.set_details("Model not loaded")
+            return backend_pb2.ScoreResponse()
+        if not hasattr(self, 'tokenizer') or self.tokenizer is None:
+            context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
+            context.set_details("Tokenizer not available")
+            return backend_pb2.ScoreResponse()
+        if len(request.candidates) == 0:
+            context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
+            context.set_details("candidates must be non-empty")
+            return backend_pb2.ScoreResponse()
+
+        try:
+            prompt = request.prompt or ""
+            prompt_token_ids = self.tokenizer.encode(prompt)
+            prompt_len = len(prompt_token_ids)
+            results = []
+
+            for candidate in request.candidates:
+                # Tokenise the concatenated sequence. We can't naively
+                # use len(prompt_tokens) + len(tokenizer.encode(candidate))
+                # because BPE merges at the boundary may produce a
+                # different tokenisation. Encoding the joined text and
+                # walking the divergence point is the correct primitive.
+                full_text = prompt + candidate
+                full_token_ids = self.tokenizer.encode(full_text)
+
+                divergence = prompt_len
+                min_len = min(prompt_len, len(full_token_ids))
+                for i in range(min_len):
+                    if prompt_token_ids[i] != full_token_ids[i]:
+                        divergence = i
+                        break
+
+                candidate_token_ids = full_token_ids[divergence:]
+                num_candidate_tokens = len(candidate_token_ids)
+                if num_candidate_tokens == 0:
+                    results.append(backend_pb2.CandidateScore(
+                        log_prob=0.0,
+                        length_normalized_log_prob=0.0,
+                        num_tokens=0,
+                    ))
+                    continue
+
+                sampling = SamplingParams(
+                    max_tokens=1,
+                    temperature=0.0,
+                    prompt_logprobs=1,
+                    detokenize=False,
+                )
+
+                request_id = random_uuid()
+                last_output = None
+                outputs_iter = self.llm.generate(
+                    {"prompt": full_text},
+                    sampling_params=sampling,
+                    request_id=request_id,
+                )
+                try:
+                    async for out in outputs_iter:
+                        last_output = out
+                finally:
+                    try:
+                        await outputs_iter.aclose()
+                    except Exception:
+                        pass
+
+                if last_output is None or not getattr(last_output, "prompt_logprobs", None):
+                    context.set_code(grpc.StatusCode.INTERNAL)
+                    context.set_details("vLLM did not return prompt_logprobs")
+                    return backend_pb2.ScoreResponse()
+
+                prompt_logprobs = last_output.prompt_logprobs
+                total = 0.0
+                tokens_proto = []
+                for offset, tok_id in enumerate(candidate_token_ids):
+                    position = divergence + offset
+                    if position >= len(prompt_logprobs) or prompt_logprobs[position] is None:
+                        continue
+                    entry = prompt_logprobs[position]
+                    lp_obj = entry.get(tok_id)
+                    if lp_obj is not None:
+                        lp = lp_obj.logprob
+                    else:
+                        # Token not in top-K; vLLM's top-1 may miss it.
+                        # Fall back to the lowest available logprob in the
+                        # entry — a conservative lower-bound on the true
+                        # log P, biased against this candidate.
+                        lp = min(v.logprob for v in entry.values())
+                    total += lp
+                    if request.include_token_logprobs:
+                        tokens_proto.append(backend_pb2.TokenLogProb(
+                            token=self.tokenizer.decode([tok_id]),
+                            log_prob=lp,
+                        ))
+
+                cs = backend_pb2.CandidateScore(
+                    log_prob=total,
+                    num_tokens=num_candidate_tokens,
+                )
+                if request.length_normalize and num_candidate_tokens > 0:
+                    cs.length_normalized_log_prob = total / num_candidate_tokens
+                if tokens_proto:
+                    cs.tokens.extend(tokens_proto)
+                results.append(cs)
+
+            return backend_pb2.ScoreResponse(candidates=results)
+        except Exception as e:
+            print(f"Score error: {e}", file=sys.stderr)
+            context.set_code(grpc.StatusCode.INTERNAL)
+            context.set_details(str(e))
+            return backend_pb2.ScoreResponse()
+
     async def _predict(self, request, context, streaming=False):
         # Build the sampling parameters
         # NOTE: this must stay in sync with the vllm backend
diff --git a/core/application/application.go b/core/application/application.go
index 852324e74..7a34279c9 100644
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -9,11 +9,18 @@ import (
 
 	corebackend "github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/auth"
 	mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
 	"github.com/mudler/LocalAI/core/services/agentpool"
 	"github.com/mudler/LocalAI/core/services/facerecognition"
 	"github.com/mudler/LocalAI/core/services/galleryop"
+	"github.com/mudler/LocalAI/core/services/monitoring"
 	"github.com/mudler/LocalAI/core/services/nodes"
+	"github.com/mudler/LocalAI/core/services/routing/admission"
+	"github.com/mudler/LocalAI/core/services/routing/billing"
+	"github.com/mudler/LocalAI/core/services/cloudproxy/mitm"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	"github.com/mudler/LocalAI/core/services/routing/router"
 	"github.com/mudler/LocalAI/core/services/voicerecognition"
 	"github.com/mudler/LocalAI/core/templates"
 	pkggrpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -51,6 +58,22 @@ type Application struct {
 	faceRegistry       facerecognition.Registry
 	voiceRegistry      voicerecognition.Registry
 	authDB             *gorm.DB
+	metricsService     *monitoring.LocalAIMetricsService
+	statsRecorder      *billing.Recorder
+	fallbackUser       *auth.User
+	piiRedactor        *pii.Redactor
+	piiEvents          pii.EventStore
+	mitmCA             atomic.Pointer[mitm.CA]
+	mitmServer         atomic.Pointer[mitm.Server]
+	mitmMutex          sync.Mutex // serializes Stop+Start; readers use atomic loads
+	// mitmHostConflicts records duplicate-host claims across model configs.
+	// Non-empty disables the MITM listener until resolved — the strict
+	// 1-to-1 host↔model invariant the dispatcher relies on. Read by
+	// /api/middleware/status so the admin UI can surface the cause.
+	mitmHostConflicts atomic.Pointer[map[string][]string]
+	routerDecisions    router.DecisionStore
+	routerRegistry     *router.Registry
+	admissionLimiter   *admission.Limiter
 	watchdogMutex      sync.Mutex
 	watchdogStop       chan bool
 	p2pMutex           sync.Mutex
@@ -185,6 +208,103 @@ func (a *Application) AuthDB() *gorm.DB {
 	return a.authDB
 }
 
+// MetricsService returns the OTel + Prometheus metric service. nil when
+// --disable-metrics is set or initialisation failed at startup.
+//
+// The service is created in startup.go before any counter is registered
+// so that otel.SetMeterProvider runs early enough for the billing
+// recorder's counters to bind to the Prom-backed provider rather than
+// the no-op global. core/http/app.go reuses this instance instead of
+// constructing its own — two providers would orphan one set of counters
+// behind whichever provider lost the SetMeterProvider race.
+func (a *Application) MetricsService() *monitoring.LocalAIMetricsService {
+	return a.metricsService
+}
+
+// StatsRecorder returns the billing recorder used by the usage
+// middleware. It is non-nil whenever stats are not explicitly disabled
+// — i.e., the no-auth single-user path still gets a working recorder
+// (in-memory by default). Routes register UsageMiddleware against this
+// recorder regardless of auth state.
+func (a *Application) StatsRecorder() *billing.Recorder {
+	return a.statsRecorder
+}
+
+// FallbackUser is the synthetic "local" user that UsageMiddleware uses
+// to attribute requests when no authenticated user is on the context
+// (i.e., --auth is off). nil when auth is on, since real users are
+// always available there.
+func (a *Application) FallbackUser() *auth.User {
+	return a.fallbackUser
+}
+
+// PIIRedactor returns the regex-tier PII redactor or nil if PII
+// filtering is disabled. The chat-route middleware uses this to apply
+// redaction before dispatch.
+func (a *Application) PIIRedactor() *pii.Redactor {
+	return a.piiRedactor
+}
+
+// PIIEvents returns the PII event store. Same nil-when-disabled
+// semantics as PIIRedactor; admin REST and MCP read tools call List
+// against it.
+func (a *Application) PIIEvents() pii.EventStore {
+	return a.piiEvents
+}
+
+// MITMCA returns the cloudproxy MITM proxy's CA, or nil when the
+// MITM listener is disabled.
+func (a *Application) MITMCA() *mitm.CA { return a.mitmCA.Load() }
+
+// MITMServer returns the running MITM proxy or nil.
+func (a *Application) MITMServer() *mitm.Server { return a.mitmServer.Load() }
+
+// MITMHostConflicts returns a snapshot of host→[]model-name pairs that
+// are claimed by 2+ model configs. Empty when the 1-to-1 invariant
+// holds. Non-empty disables the MITM listener — read by the admin
+// status endpoint to explain why.
+func (a *Application) MITMHostConflicts() map[string][]string {
+	p := a.mitmHostConflicts.Load()
+	if p == nil {
+		return nil
+	}
+	return *p
+}
+
+// MITMHostOwners returns the host→model-name map, useful for the
+// admin status endpoint. The lookup is recomputed on each call to
+// stay current with model-config edits without needing a
+// MITMRestart.
+func (a *Application) MITMHostOwners() map[string]string {
+	if a.backendLoader == nil {
+		return nil
+	}
+	return a.backendLoader.MITMHostOwners().Owners
+}
+
+// RouterDecisions returns the routing decision store. nil when stats
+// are disabled (--disable-stats); the RouteModel middleware skips the
+// log write in that case but still rewrites requests.
+func (a *Application) RouterDecisions() router.DecisionStore {
+	return a.routerDecisions
+}
+
+// RouterClassifierRegistry returns the process-wide classifier cache.
+// Shared between the OpenAI and Anthropic route middlewares so the
+// admin stats endpoint sees every live classifier — and so a
+// classifier built on the OpenAI route is reused on Anthropic.
+func (a *Application) RouterClassifierRegistry() *router.Registry {
+	return a.routerRegistry
+}
+
+// AdmissionLimiter returns the per-model admission limiter. The
+// admission middleware uses it to gate concurrent requests; the
+// admin status surface reads InFlight/Capacity from it for live
+// load visibility.
+func (a *Application) AdmissionLimiter() *admission.Limiter {
+	return a.admissionLimiter
+}
+
 // StartupConfig returns the original startup configuration (from env vars, before file loading)
 func (a *Application) StartupConfig() *config.ApplicationConfig {
 	return a.startupConfig
@@ -255,6 +375,15 @@ func (a *Application) start() error {
 			a.modelLoader,
 			a.galleryService,
 		)
+		// Wire usage tracking so the assistant's get_usage_stats tool
+		// returns real data; nil values keep the tool returning a clear
+		// "unavailable" error if startup ran with --disable-stats.
+		assistantClient.StatsRecorder = a.statsRecorder
+		assistantClient.FallbackUser = a.fallbackUser
+		// PII filter — same nil-or-real wiring.
+		assistantClient.PIIRedactor = a.piiRedactor
+		assistantClient.PIIEvents = a.piiEvents
+		assistantClient.RouterDecisions = a.routerDecisions
 		if err := holder.Initialize(a.applicationConfig.Context, assistantClient, localaitools.Options{}); err != nil {
 			// Why log+continue instead of fail: the assistant is an optional
 			// feature; a failure here must not take down the whole server.
diff --git a/core/application/mitm.go b/core/application/mitm.go
new file mode 100644
index 000000000..293b3d449
--- /dev/null
+++ b/core/application/mitm.go
@@ -0,0 +1,146 @@
+package application
+
+import (
+	"errors"
+	"fmt"
+	"path/filepath"
+	"sort"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/cloudproxy/mitm"
+	"github.com/mudler/xlog"
+)
+
+func startMITMProxy(app *Application, options *config.ApplicationConfig) error {
+	app.mitmMutex.Lock()
+	defer app.mitmMutex.Unlock()
+	return startMITMLocked(app, options)
+}
+
+func startMITMLocked(app *Application, options *config.ApplicationConfig) error {
+	// Validate the host↔model-config 1-to-1 invariant before binding
+	// the listener. Two configs claiming the same host means the
+	// dispatcher would have ambiguous PII settings; refuse to start
+	// rather than silently picking one. The conflict map is published
+	// for /api/middleware/status to surface in the UI.
+	ownership := app.backendLoader.MITMHostOwners()
+	if len(ownership.Conflicts) > 0 {
+		conflicts := ownership.Conflicts
+		app.mitmHostConflicts.Store(&conflicts)
+		hosts := make([]string, 0, len(conflicts))
+		for h := range conflicts {
+			hosts = append(hosts, h)
+		}
+		sort.Strings(hosts)
+		xlog.Error("mitm: refusing to start — duplicate host claims across model configs",
+			"hosts", hosts,
+			"conflicts", conflicts,
+		)
+		return errors.New("mitm: configuration error: duplicate host claims (see /api/middleware/status)")
+	}
+	app.mitmHostConflicts.Store(nil)
+
+	caDir := options.MITMCADir
+	if caDir == "" {
+		base := options.DataPath
+		if base == "" {
+			base = "."
+		}
+		caDir = filepath.Join(base, "mitm-ca")
+	}
+
+	if app.mitmCA.Load() == nil {
+		ca, err := mitm.LoadOrCreateCA(caDir)
+		if err != nil {
+			return fmt.Errorf("ca: %w", err)
+		}
+		app.mitmCA.Store(ca)
+	}
+
+	// Allowlist is exactly the set of hosts claimed by model configs.
+	// No global list — admins add hosts by creating an MITM model
+	// config (template available in the Add Model UI). When no config
+	// claims any host, the listener still starts but every CONNECT
+	// tunnels through unmodified.
+	effectiveHosts := make([]string, 0, len(ownership.Owners))
+	for h := range ownership.Owners {
+		effectiveHosts = append(effectiveHosts, h)
+	}
+	sort.Strings(effectiveHosts)
+
+	// Per-host PII gate inherits from the owning model's pii.enabled.
+	// A non-cloud-proxy backend with no explicit pii.enabled resolves
+	// to false → host is intercepted but the regex pass is skipped
+	// (audit events still record).
+	var piiDisabled []string
+	for host, modelName := range ownership.Owners {
+		cfg, exists := app.backendLoader.GetModelConfig(modelName)
+		if !exists {
+			continue
+		}
+		if !cfg.PIIIsEnabled() {
+			piiDisabled = append(piiDisabled, host)
+		}
+	}
+
+	handler := mitm.NewPIIHandler(mitm.PIIHandlerOptions{
+		Redactor:             app.piiRedactor,
+		EventStore:           app.piiEvents,
+		HostsWithPIIDisabled: piiDisabled,
+	})
+
+	srv, err := mitm.NewServer(mitm.Config{
+		Addr:           options.MITMListen,
+		CA:             app.mitmCA.Load(),
+		InterceptHosts: effectiveHosts,
+		Handler:        handler,
+		EventStore:     app.piiEvents,
+	})
+	if err != nil {
+		return fmt.Errorf("server: %w", err)
+	}
+	if err := srv.Start(); err != nil {
+		return fmt.Errorf("listen: %w", err)
+	}
+	app.mitmServer.Store(srv)
+
+	xlog.Info("mitm: cloudproxy listener started",
+		"addr", srv.Addr(),
+		"ca_dir", caDir,
+		"intercept_hosts", effectiveHosts,
+		"model_owned_hosts", len(ownership.Owners),
+		"pii_disabled_hosts", len(piiDisabled),
+	)
+	return nil
+}
+
+// StopMITM is idempotent.
+func (a *Application) StopMITM() error {
+	a.mitmMutex.Lock()
+	defer a.mitmMutex.Unlock()
+	stopMITMLocked(a)
+	return nil
+}
+
+// RestartMITM reuses the existing CA so trusted clients keep
+// working across listener flips.
+func (a *Application) RestartMITM() error {
+	a.mitmMutex.Lock()
+	defer a.mitmMutex.Unlock()
+	stopMITMLocked(a)
+	if a.applicationConfig.MITMListen == "" {
+		xlog.Info("mitm: cloudproxy listener stays disabled (no listen address)")
+		return nil
+	}
+	return startMITMLocked(a, a.applicationConfig)
+}
+
+func stopMITMLocked(a *Application) {
+	srv := a.mitmServer.Load()
+	if srv == nil {
+		return
+	}
+	srv.Stop()
+	a.mitmServer.Store(nil)
+	xlog.Info("mitm: cloudproxy listener stopped")
+}
diff --git a/core/application/router_factories.go b/core/application/router_factories.go
new file mode 100644
index 000000000..d37cfb9d8
--- /dev/null
+++ b/core/application/router_factories.go
@@ -0,0 +1,63 @@
+package application
+
+import (
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+)
+
+// adapterConfig resolves a model name to its runtime ModelConfig, or
+// nil when the name is unknown. Shared by the router-facing factories
+// below and by ModelConfigLookup.
+func (a *Application) adapterConfig(modelName string) *config.ModelConfig {
+	cfg, err := a.backendLoader.LoadModelConfigFileByNameDefaultOptions(modelName, a.applicationConfig)
+	if err != nil || cfg == nil {
+		return nil
+	}
+	return cfg
+}
+
+// ModelConfigLookup is the lookup function the router middleware's
+// classifier validator uses to confirm classifier_model declares
+// FLAG_SCORE before binding it.
+func (a *Application) ModelConfigLookup() func(modelName string) *config.ModelConfig {
+	return a.adapterConfig
+}
+
+// Scorer returns a backend.Scorer bound to the named model, or nil
+// when the model is unknown. Used as a method value (app.Scorer) by
+// router.ClassifierDeps — no factory-of-factory wrapper needed.
+func (a *Application) Scorer(modelName string) backend.Scorer {
+	cfg := a.adapterConfig(modelName)
+	if cfg == nil {
+		return nil
+	}
+	return backend.NewScorer(a.modelLoader, *cfg, a.applicationConfig)
+}
+
+// Reranker returns a backend.Reranker bound to the named model, or
+// nil when unknown. The reranker model's `type:` (e.g. "colbert")
+// selects the scoring head inside the rerankers backend.
+func (a *Application) Reranker(modelName string) backend.Reranker {
+	cfg := a.adapterConfig(modelName)
+	if cfg == nil {
+		return nil
+	}
+	return backend.NewReranker(a.modelLoader, *cfg, a.applicationConfig)
+}
+
+// Embedder returns a backend.Embedder bound to the named model, or
+// nil when unknown. Used by the router's L2 embedding cache.
+func (a *Application) Embedder(modelName string) backend.Embedder {
+	cfg := a.adapterConfig(modelName)
+	if cfg == nil {
+		return nil
+	}
+	return backend.NewEmbedder(a.modelLoader, *cfg, a.applicationConfig)
+}
+
+// VectorStore returns a backend.VectorStore for the named collection,
+// or nil when the name is empty. Each router model gets its own
+// backend process via the model loader's cache keyed by storeName.
+func (a *Application) VectorStore(storeName string) backend.VectorStore {
+	return backend.NewVectorStore(a.modelLoader, a.applicationConfig, storeName)
+}
diff --git a/core/application/runtime_settings_branding_test.go b/core/application/runtime_settings_branding_test.go
index 9f173864e..6300f4456 100644
--- a/core/application/runtime_settings_branding_test.go
+++ b/core/application/runtime_settings_branding_test.go
@@ -87,6 +87,28 @@ var _ = Describe("loadRuntimeSettingsFromFile", func() {
 		})
 	})
 
+	// MITM listener address. The file is the only source — no env var
+	// exists — so a regression here means an admin who configured the
+	// listener via /api/settings loses it after a reboot, even though
+	// the value is still on disk in the volume. (Intercept hosts now
+	// live in model YAML mitm.hosts: blocks, not runtime_settings.json.)
+	Describe("MITM fields", func() {
+		It("loads mitm_listen", func() {
+			cfg := &config.ApplicationConfig{DynamicConfigsDir: seedSettings(`{"mitm_listen": ":8443"}`)}
+			loadRuntimeSettingsFromFile(cfg)
+			Expect(cfg.MITMListen).To(Equal(":8443"))
+		})
+
+		It("does not override an explicit CLI flag", func() {
+			cfg := &config.ApplicationConfig{
+				DynamicConfigsDir: seedSettings(`{"mitm_listen": ":8443"}`),
+				MITMListen:        ":9999", // simulate WithMITMListen(":9999")
+			}
+			loadRuntimeSettingsFromFile(cfg)
+			Expect(cfg.MITMListen).To(Equal(":9999"), "CLI flag must win over the persisted file value")
+		})
+	})
+
 	// The Agent Pool block has a mix of zero and non-zero defaults
 	// (Enabled=true, EmbeddingModel="granite-...", MaxChunkingSize=400,
 	// VectorEngine="chromem", AgentHubURL="https://agenthub.localai.io").
diff --git a/core/application/startup.go b/core/application/startup.go
index 183713f86..8268112bc 100644
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -15,8 +15,14 @@ import (
 	"github.com/mudler/LocalAI/core/http/auth"
 	"github.com/mudler/LocalAI/core/services/galleryop"
 	"github.com/mudler/LocalAI/core/services/jobs"
+	"github.com/mudler/LocalAI/core/services/monitoring"
 	"github.com/mudler/LocalAI/core/services/nodes"
+	"github.com/mudler/LocalAI/core/services/routing/admission"
+	"github.com/mudler/LocalAI/core/services/routing/billing"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	"github.com/mudler/LocalAI/core/services/routing/router"
 	"github.com/mudler/LocalAI/core/services/storage"
+	"github.com/mudler/LocalAI/pkg/signals"
 	coreStartup "github.com/mudler/LocalAI/core/startup"
 	"github.com/mudler/LocalAI/internal"
 	"github.com/mudler/LocalAI/pkg/vram"
@@ -128,6 +134,117 @@ func New(opts ...config.AppOption) (*Application, error) {
 		}()
 	}
 
+	// Initialize the OTel + Prometheus metric pipeline before any
+	// counter is created. monitoring.NewLocalAIMetricsService calls
+	// otel.SetMeterProvider, so any subsequent otel.Meter() call —
+	// including billing.NewRecorder below — sees the real provider
+	// rather than the no-op global. Initialising metrics later (in
+	// core/http/app.go) leaves billing's counters bound to a no-op
+	// meter and never reaches /metrics. We deliberately ignore
+	// DisableMetrics here for ordering purposes; the HTTP middleware
+	// that records api_call histograms is still gated.
+	if !options.DisableMetrics {
+		ms, err := monitoring.NewLocalAIMetricsService()
+		if err != nil {
+			xlog.Error("failed to initialize metrics provider", "error", err)
+		} else {
+			application.metricsService = ms
+			// Bind the billing package's counters to the same meter the
+			// metrics service exports. Without this, billing's counters
+			// resolve via the OTel global and never reach /metrics.
+			billing.SetMeter(ms.Meter)
+		}
+	}
+
+	// Wire the routing-module billing recorder. The recorder runs in
+	// every mode (auth on/off, distributed/single-node) so that token
+	// tracking is not gated on auth — a no-auth single-user box still
+	// gets dashboards and `/api/usage` populated.
+	//
+	// fallbackUser is wired *unconditionally* when stats are enabled.
+	// UsageMiddleware uses it as the attribution source whenever
+	// auth.GetUser(c) is nil — that covers (a) no-auth deployments and
+	// (b) internal callers under auth-on (cron flushers, distributed
+	// worker callbacks) that hit a recordable endpoint without a user
+	// in context. The billing.user_id_present invariant still rejects
+	// empty IDs; LocalUser() returns a stable UUID per data path.
+	if !options.DisableStats {
+		var statsBackend billing.StatsBackend
+		switch {
+		case application.authDB != nil:
+			statsBackend = billing.NewGormBackend(application.authDB, 0, 0)
+			xlog.Info("stats: using auth DB for usage records")
+		default:
+			statsBackend = billing.NewMemoryBackend(0)
+			xlog.Info("stats: using in-memory ring buffer (no-auth single-user mode)")
+		}
+		application.fallbackUser = billing.LocalUser(options.DataPath)
+		application.statsRecorder = billing.NewRecorder(statsBackend)
+		// Drain pending records on SIGTERM. The GORM backend buffers up
+		// to maxPending (5k) records across a 5s flush tick, so without
+		// this the last few seconds of usage disappear on graceful exit.
+		signals.RegisterGracefulTerminationHandler(func() {
+			_ = application.statsRecorder.Close()
+		})
+		xlog.Info("stats: fallback user wired", "local_user_id", application.fallbackUser.ID)
+	} else {
+		xlog.Info("stats: disabled by --disable-stats")
+	}
+
+	// Wire the regex PII filter. Default-on: a single-user box gets
+	// the built-in pattern set the first time it starts, with email/
+	// phone/SSN/credit-card on mask and api_key_prefix on block. If
+	// the operator wants different actions, --pii-config points at a
+	// YAML file that overrides per-id; --disable-pii turns it off
+	// entirely.
+	if !options.DisablePII {
+		patterns, err := pii.LoadConfig(options.PIIConfigPath)
+		if err != nil {
+			return nil, fmt.Errorf("pii config: %w", err)
+		}
+		application.piiRedactor = pii.NewRedactor(patterns)
+		application.piiEvents = pii.NewMemoryEventStore(0)
+		// Apply persisted per-pattern overrides — admins toggling
+		// action/disabled via the UI and clicking "Save to disk" land
+		// here on the next start. Bad ids are warned and ignored so a
+		// stale entry doesn't block startup.
+		for id, ov := range options.PIIPatternOverrides {
+			if ov.Action != nil {
+				if err := application.piiRedactor.SetAction(id, pii.Action(*ov.Action)); err != nil {
+					xlog.Warn("pii: persisted override skipped", "pattern", id, "error", err)
+					continue
+				}
+			}
+			if ov.Disabled != nil {
+				if err := application.piiRedactor.SetDisabled(id, *ov.Disabled); err != nil {
+					xlog.Warn("pii: persisted disable skipped", "pattern", id, "error", err)
+				}
+			}
+		}
+		xlog.Info("pii: filter enabled",
+			"patterns", len(patterns),
+			"config_path", options.PIIConfigPath,
+			"persisted_overrides", len(options.PIIPatternOverrides),
+		)
+	} else {
+		xlog.Info("pii: disabled by --disable-pii")
+	}
+
+	// Wire the routing decision log. Always-on when stats are enabled —
+	// the per-router admin page reads this as the live activity feed
+	// and as input to drift checks for subsystem 5.
+	if !options.DisableStats {
+		application.routerDecisions = router.NewMemoryDecisionStore(0)
+	}
+	// Process-wide classifier cache shared across all route middlewares so
+	// the embedding-cache stats endpoint sees a single source of truth.
+	application.routerRegistry = router.NewRegistry()
+
+	// Subsystem 5: admission control. Limiter is always wired so a
+	// model that gains a limits: block via gallery install or YAML
+	// edit takes effect on the next restart without conditional plumbing.
+	application.admissionLimiter = admission.New()
+
 	// Wire JobStore for DB-backed task/job persistence whenever auth DB is available.
 	// This ensures tasks and jobs survive restarts in both single-node and distributed modes.
 	if application.authDB != nil && application.agentJobService != nil {
@@ -291,6 +408,20 @@ func New(opts ...config.AppOption) (*Application, error) {
 		loadRuntimeSettingsFromFile(options)
 	}
 
+	// Wire the cloudproxy MITM listener. Opt-in: empty MITMListen
+	// means "no MITM" — operators must explicitly choose to start
+	// it because clients have to install the generated CA cert.
+	// The handler reuses the global redactor + event store so an
+	// admin who's already configured PII filtering for direct API
+	// traffic doesn't need a parallel config for MITM traffic.
+	// Runs after loadRuntimeSettingsFromFile so a listener configured
+	// via /api/settings is brought back up across restarts.
+	if options.MITMListen != "" {
+		if err := startMITMProxy(application, options); err != nil {
+			return nil, fmt.Errorf("mitm: startup: %w", err)
+		}
+	}
+
 	application.ModelLoader().SetBackendLoggingEnabled(options.EnableBackendLogging)
 
 	// turn off any process that was started by GRPC if the context is canceled
@@ -580,6 +711,25 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
 		options.Branding.FaviconFile = *settings.FaviconFile
 	}
 
+	// MITM listener address. The CLI flag WithMITMListen populates
+	// options at startup; if the user configured MITM via /api/settings
+	// after the fact, only the file holds the value. Apply when the
+	// CLI flag did not already set it. (Intercept hosts now live in
+	// model YAML mitm.hosts: rather than runtime_settings.json.)
+	if settings.MITMListen != nil && options.MITMListen == "" {
+		options.MITMListen = *settings.MITMListen
+	}
+
+	// PII pattern overrides — file is the only source; CLI flags don't
+	// reach into this map. Apply unconditionally when present; the
+	// redactor wiring below sees the result on first construction.
+	if settings.PIIPatternOverrides != nil {
+		options.PIIPatternOverrides = make(map[string]config.PIIPatternRuntimeOverride, len(*settings.PIIPatternOverrides))
+		for id, ov := range *settings.PIIPatternOverrides {
+			options.PIIPatternOverrides[id] = ov
+		}
+	}
+
 	// Backend upgrade flags
 	if settings.AutoUpgradeBackends != nil {
 		if !options.AutoUpgradeBackends {
diff --git a/core/backend/embeddings.go b/core/backend/embeddings.go
index f7944827d..dd9b9cffb 100644
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -1,6 +1,7 @@
 package backend
 
 import (
+	"context"
 	"fmt"
 	"time"
 
@@ -11,6 +12,32 @@ import (
 	model "github.com/mudler/LocalAI/pkg/model"
 )
 
+// Embedder produces a fixed-dimension vector from a prompt. The
+// router's L2 embedding cache uses it to look up semantically-similar
+// past decisions.
+type Embedder interface {
+	Embed(ctx context.Context, text string) ([]float32, error)
+}
+
+// NewEmbedder binds (loader, modelConfig, appConfig) into an Embedder.
+func NewEmbedder(loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) Embedder {
+	return &modelEmbedder{loader: loader, modelConfig: modelConfig, appConfig: appConfig}
+}
+
+type modelEmbedder struct {
+	loader      *model.ModelLoader
+	modelConfig config.ModelConfig
+	appConfig   *config.ApplicationConfig
+}
+
+func (e *modelEmbedder) Embed(_ context.Context, text string) ([]float32, error) {
+	fn, err := ModelEmbedding(text, nil, e.loader, e.modelConfig, e.appConfig)
+	if err != nil {
+		return nil, err
+	}
+	return fn()
+}
+
 func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
 
 	opts := ModelOptions(modelConfig, appConfig)
diff --git a/core/backend/options.go b/core/backend/options.go
index a7d332344..0215bf37a 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -242,6 +242,18 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
 		Tokenizer: c.Tokenizer,
 	}
 
+	if c.Backend == "cloud-proxy" {
+		opts.Proxy = &pb.ProxyOptions{
+			UpstreamUrl:            c.Proxy.UpstreamURL,
+			Mode:                   c.Proxy.Mode,
+			Provider:               c.Proxy.Provider,
+			ApiKeyEnv:              c.Proxy.APIKeyEnv,
+			ApiKeyFile:             c.Proxy.APIKeyFile,
+			UpstreamModel:          c.Proxy.UpstreamModel,
+			RequestTimeoutSeconds:  int32(c.Proxy.RequestTimeoutSeconds),
+		}
+	}
+
 	if c.MMProj != "" {
 		opts.MMProj = filepath.Join(modelPath, c.MMProj)
 	}
diff --git a/core/backend/rerank.go b/core/backend/rerank.go
index a90c2aad1..f82208f8c 100644
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -11,6 +11,51 @@ import (
 	model "github.com/mudler/LocalAI/pkg/model"
 )
 
+// RerankResult is the per-document score returned to consumers,
+// narrowed from proto.RerankResult so callers don't need to depend on
+// the proto package.
+type RerankResult struct {
+	Index          int
+	RelevanceScore float32
+}
+
+// Reranker scores a list of candidate documents against a query.
+// Returns one RerankResult per input document (no top-N truncation —
+// callers that need it can sort and slice).
+type Reranker interface {
+	Rerank(ctx context.Context, query string, documents []string) ([]RerankResult, error)
+}
+
+// NewReranker binds (loader, modelConfig, appConfig) into a Reranker.
+func NewReranker(loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) Reranker {
+	return &modelReranker{loader: loader, modelConfig: modelConfig, appConfig: appConfig}
+}
+
+type modelReranker struct {
+	loader      *model.ModelLoader
+	modelConfig config.ModelConfig
+	appConfig   *config.ApplicationConfig
+}
+
+func (r *modelReranker) Rerank(ctx context.Context, query string, documents []string) ([]RerankResult, error) {
+	req := &proto.RerankRequest{
+		Query:     query,
+		Documents: documents,
+		// TopN=0 → backend returns scores for every document. Truncating
+		// here would silently zero out labels the reranker considered
+		// unlikely, which the router classifier needs.
+	}
+	res, err := Rerank(ctx, req, r.loader, r.appConfig, r.modelConfig)
+	if err != nil {
+		return nil, err
+	}
+	out := make([]RerankResult, 0, len(res.GetResults()))
+	for _, dr := range res.GetResults() {
+		out = append(out, RerankResult{Index: int(dr.GetIndex()), RelevanceScore: dr.GetRelevanceScore()})
+	}
+	return out, nil
+}
+
 func Rerank(ctx context.Context, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, modelConfig config.ModelConfig) (*proto.RerankResult, error) {
 	opts := ModelOptions(modelConfig, appConfig)
 	rerankModel, err := loader.Load(opts...)
diff --git a/core/backend/score.go b/core/backend/score.go
new file mode 100644
index 000000000..dce06213c
--- /dev/null
+++ b/core/backend/score.go
@@ -0,0 +1,159 @@
+package backend
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/trace"
+	"github.com/mudler/LocalAI/pkg/grpc"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	model "github.com/mudler/LocalAI/pkg/model"
+)
+
+// ScoreOptions controls a single Score request.
+type ScoreOptions struct {
+	// IncludeTokenLogprobs returns per-token log-probability detail for
+	// each candidate. Off by default — the joint LogProb is enough for
+	// ranking; callers that need calibration / entropy over the token
+	// stream opt in.
+	IncludeTokenLogprobs bool
+	// LengthNormalize divides the joint log-prob by the candidate's
+	// token count. Useful when comparing candidates of different
+	// lengths — without it, longer candidates score lower by default.
+	LengthNormalize bool
+}
+
+// CandidateScore is the per-candidate result. Mirrors pb.CandidateScore
+// but avoids leaking the proto type to consumers.
+type CandidateScore struct {
+	LogProb                 float64
+	LengthNormalizedLogProb float64
+	NumTokens               int
+	Tokens                  []TokenLogProb
+}
+
+type TokenLogProb struct {
+	Token   string
+	LogProb float64
+}
+
+// Scorer evaluates a model's joint log-probability of each candidate
+// continuation given a shared prompt. Implemented by NewScorer over a
+// model-loaded backend; the router's score classifier consumes this
+// for multi-label policy selection.
+type Scorer interface {
+	Score(ctx context.Context, prompt string, candidates []string) ([]CandidateScore, error)
+}
+
+// NewScorer binds (loader, modelConfig, appConfig) into a Scorer. The
+// underlying backend is resolved lazily on the first Score call.
+// Returns nil only as a contract violation — callers that need to
+// detect "model not loadable" should look up the config first.
+func NewScorer(loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) Scorer {
+	return &modelScorer{loader: loader, modelConfig: modelConfig, appConfig: appConfig}
+}
+
+type modelScorer struct {
+	loader      *model.ModelLoader
+	modelConfig config.ModelConfig
+	appConfig   *config.ApplicationConfig
+}
+
+func (m *modelScorer) Score(ctx context.Context, prompt string, candidates []string) ([]CandidateScore, error) {
+	fn, err := ModelScore(prompt, candidates, ScoreOptions{LengthNormalize: true}, m.loader, m.modelConfig, m.appConfig)
+	if err != nil {
+		return nil, err
+	}
+	return fn(ctx)
+}
+
+// ModelScore loads the backend for modelConfig and returns a closure
+// that scores `candidates` against `prompt`. The closure is bound to
+// the loaded model so callers can keep it around for repeat scoring
+// within the same request without re-resolving the backend.
+func ModelScore(prompt string, candidates []string, opts ScoreOptions, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (func(ctx context.Context) ([]CandidateScore, error), error) {
+	modelOpts := ModelOptions(modelConfig, appConfig)
+	inferenceModel, err := loader.Load(modelOpts...)
+	if err != nil {
+		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
+		return nil, err
+	}
+	b, ok := inferenceModel.(grpc.Backend)
+	if !ok {
+		return nil, fmt.Errorf("scoring not supported by backend %q", modelConfig.Backend)
+	}
+	if len(candidates) == 0 {
+		return nil, fmt.Errorf("Score: candidates must be non-empty")
+	}
+	return func(ctx context.Context) ([]CandidateScore, error) {
+		// Surface score calls in the Traces UI alongside the LLM calls
+		// they typically gate (router classifier, eval scoring). Without
+		// this, a router-classified request shows only the downstream LLM
+		// trace with no record of the classification that picked it.
+		var startTime time.Time
+		if appConfig.EnableTracing {
+			trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
+			startTime = time.Now()
+		}
+		resp, err := b.Score(ctx, &pb.ScoreRequest{
+			Prompt:               prompt,
+			Candidates:           candidates,
+			IncludeTokenLogprobs: opts.IncludeTokenLogprobs,
+			LengthNormalize:      opts.LengthNormalize,
+		})
+		results := scoreResponseToCandidates(resp, opts.IncludeTokenLogprobs)
+		if appConfig.EnableTracing {
+			errStr := ""
+			if err != nil {
+				errStr = err.Error()
+			}
+			trace.RecordBackendTrace(trace.BackendTrace{
+				Timestamp: startTime,
+				Duration:  time.Since(startTime),
+				Type:      trace.BackendTraceScore,
+				ModelName: modelConfig.Name,
+				Backend:   modelConfig.Backend,
+				Summary:   trace.TruncateString(prompt, 200),
+				Error:     errStr,
+				Data: map[string]any{
+					// Copy candidates so the trace buffer doesn't pin a
+					// caller-owned slice for the lifetime of the ring.
+					"candidates": append([]string(nil), candidates...),
+					"results":    results,
+				},
+			})
+		}
+		if err != nil {
+			return nil, err
+		}
+		return results, nil
+	}, nil
+}
+
+// scoreResponseToCandidates converts the wire-format pb response into
+// the value type consumed by callers. Extracted to keep ModelScore's
+// closure trivial and so the conversion can be unit-tested without a
+// real backend.
+func scoreResponseToCandidates(resp *pb.ScoreResponse, includeTokens bool) []CandidateScore {
+	if resp == nil {
+		return nil
+	}
+	out := make([]CandidateScore, len(resp.Candidates))
+	for i, c := range resp.Candidates {
+		cs := CandidateScore{
+			LogProb:                 c.LogProb,
+			LengthNormalizedLogProb: c.LengthNormalizedLogProb,
+			NumTokens:               int(c.NumTokens),
+		}
+		if includeTokens && len(c.Tokens) > 0 {
+			cs.Tokens = make([]TokenLogProb, len(c.Tokens))
+			for j, t := range c.Tokens {
+				cs.Tokens[j] = TokenLogProb{Token: t.Token, LogProb: t.LogProb}
+			}
+		}
+		out[i] = cs
+	}
+	return out
+}
diff --git a/core/backend/score_test.go b/core/backend/score_test.go
new file mode 100644
index 000000000..48193efab
--- /dev/null
+++ b/core/backend/score_test.go
@@ -0,0 +1,63 @@
+package backend
+
+import (
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("scoreResponseToCandidates", func() {
+	It("returns nil for a nil response", func() {
+		Expect(scoreResponseToCandidates(nil, false)).To(BeNil())
+	})
+
+	It("returns an empty slice when the response has no candidates", func() {
+		Expect(scoreResponseToCandidates(&pb.ScoreResponse{}, false)).To(BeEmpty())
+	})
+
+	It("copies LogProb / LengthNormalizedLogProb / NumTokens for every candidate", func() {
+		resp := &pb.ScoreResponse{Candidates: []*pb.CandidateScore{
+			{LogProb: -2.0, LengthNormalizedLogProb: -1.0, NumTokens: 2},
+			{LogProb: -7.5, LengthNormalizedLogProb: -1.5, NumTokens: 5},
+		}}
+		got := scoreResponseToCandidates(resp, false)
+		Expect(got).To(HaveLen(2))
+		Expect(got[0].LogProb).To(Equal(-2.0))
+		Expect(got[0].LengthNormalizedLogProb).To(Equal(-1.0))
+		Expect(got[0].NumTokens).To(Equal(2))
+		Expect(got[1].LogProb).To(Equal(-7.5))
+		Expect(got[1].NumTokens).To(Equal(5))
+	})
+
+	It("omits per-token detail when includeTokens=false even if the wire response carries it", func() {
+		// Defensive: if the backend over-reports we still respect the
+		// caller's opt-in so consumers don't pay marshaling for data
+		// they didn't ask for.
+		resp := &pb.ScoreResponse{Candidates: []*pb.CandidateScore{{
+			LogProb: -1.0,
+			Tokens:  []*pb.TokenLogProb{{Token: "hi", LogProb: -1.0}},
+		}}}
+		got := scoreResponseToCandidates(resp, false)
+		Expect(got).To(HaveLen(1))
+		Expect(got[0].Tokens).To(BeNil())
+	})
+
+	It("populates per-token detail when includeTokens=true", func() {
+		resp := &pb.ScoreResponse{Candidates: []*pb.CandidateScore{{
+			LogProb:   -3.0,
+			NumTokens: 2,
+			Tokens: []*pb.TokenLogProb{
+				{Token: "Hello", LogProb: -1.0},
+				{Token: " world", LogProb: -2.0},
+			},
+		}}}
+		got := scoreResponseToCandidates(resp, true)
+		Expect(got).To(HaveLen(1))
+		Expect(got[0].Tokens).To(HaveLen(2))
+		Expect(got[0].Tokens[0].Token).To(Equal("Hello"))
+		Expect(got[0].Tokens[0].LogProb).To(Equal(-1.0))
+		Expect(got[0].Tokens[1].Token).To(Equal(" world"))
+		Expect(got[0].Tokens[1].LogProb).To(Equal(-2.0))
+	})
+})
diff --git a/core/backend/stores.go b/core/backend/stores.go
index 2fd4cc148..4884765f2 100644
--- a/core/backend/stores.go
+++ b/core/backend/stores.go
@@ -1,12 +1,74 @@
 package backend
 
 import (
+	"context"
+	"fmt"
+	"strings"
+
 	"github.com/mudler/LocalAI/core/config"
 
 	"github.com/mudler/LocalAI/pkg/grpc"
 	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/store"
 )
 
+// VectorStore is the narrowed KNN store used by the router's embedding
+// cache. Search returns the top-1 match (cosine similarity in [-1, 1])
+// and the serialised payload, or ok=false on a clean miss.
+type VectorStore interface {
+	Search(ctx context.Context, vec []float32) (similarity float64, payload []byte, ok bool, err error)
+	Insert(ctx context.Context, vec []float32, payload []byte) error
+}
+
+// NewVectorStore returns a VectorStore backed by the local-store
+// gRPC backend, namespaced by storeName so two routers don't collide.
+func NewVectorStore(loader *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string) VectorStore {
+	if storeName == "" {
+		return nil
+	}
+	return &localVectorStore{loader: loader, appConfig: appConfig, storeName: storeName}
+}
+
+type localVectorStore struct {
+	loader    *model.ModelLoader
+	appConfig *config.ApplicationConfig
+	storeName string
+}
+
+func (s *localVectorStore) backend(_ context.Context) (grpc.Backend, error) {
+	return StoreBackend(s.loader, s.appConfig, s.storeName, "")
+}
+
+func (s *localVectorStore) Search(ctx context.Context, vec []float32) (float64, []byte, bool, error) {
+	be, err := s.backend(ctx)
+	if err != nil {
+		return 0, nil, false, fmt.Errorf("vector store load: %w", err)
+	}
+	_, values, similarities, err := store.Find(ctx, be, vec, 1)
+	if err != nil {
+		// local-store's Find returns "existing length is -1" before
+		// any keys are inserted. Surface that as a clean miss so the
+		// cache layer treats it as an empty store and proceeds to
+		// Insert rather than skipping.
+		if strings.Contains(err.Error(), "existing length is -1") {
+			return 0, nil, false, nil
+		}
+		return 0, nil, false, fmt.Errorf("vector store find: %w", err)
+	}
+	if len(values) == 0 || len(similarities) == 0 {
+		return 0, nil, false, nil
+	}
+	return float64(similarities[0]), values[0], true, nil
+}
+
+func (s *localVectorStore) Insert(ctx context.Context, vec []float32, payload []byte) error {
+	be, err := s.backend(ctx)
+	if err != nil {
+		return fmt.Errorf("vector store load: %w", err)
+	}
+	return store.SetSingle(ctx, be, vec, payload)
+}
+
 func StoreBackend(sl *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string, backend string) (grpc.Backend, error) {
 	if backend == "" {
 		backend = model.LocalStoreBackend
diff --git a/core/cli/run.go b/core/cli/run.go
index 78e8b69c1..0f01e2ab4 100644
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -159,6 +159,10 @@ type RunCMD struct {
 	BackendUpgradeTimeout string `env:"LOCALAI_NATS_BACKEND_UPGRADE_TIMEOUT" help:"NATS round-trip timeout for backend.upgrade requests (default 15m)." group:"distributed"`
 
 	Version bool
+
+	// Cloud-proxy MITM listener (off by default).
+	MITMListen string `env:"LOCALAI_MITM_LISTEN" help:"Address (host:port) for the cloudproxy MITM listener. Empty = disabled. Clients set HTTPS_PROXY=http://<this>:<port>. Intercept hosts are declared per-model via the model YAML mitm.hosts: block; create one from the Add Model UI." group:"middleware"`
+	MITMCADir  string `env:"LOCALAI_MITM_CA_DIR" type:"path" help:"Directory holding the MITM proxy CA cert + key. Defaults to <data-path>/mitm-ca." group:"middleware"`
 }
 
 func (r *RunCMD) Run(ctx *cliContext.Context) error {
@@ -217,6 +221,8 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		config.WithLoadToMemory(r.LoadToMemory),
 		config.WithMachineTag(r.MachineTag),
 		config.WithAPIAddress(r.Address),
+		config.WithMITMListen(r.MITMListen),
+		config.WithMITMCADir(r.MITMCADir),
 		config.WithAgentJobRetentionDays(r.AgentJobRetentionDays),
 		config.WithLlamaCPPTunnelCallback(func(tunnels []string) {
 			tunnelEnvVar := strings.Join(tunnels, ",")
diff --git a/core/config/application_config.go b/core/config/application_config.go
index a4119206d..24e7a82fc 100644
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -40,6 +40,54 @@ type ApplicationConfig struct {
 	P2PNetworkID                  string
 	Federated                     bool
 
+	// DisableStats turns off per-request token tracking. By default the
+	// routing module's billing recorder runs in every mode (including
+	// no-auth single-user) so dashboards and `/api/usage` are immediately
+	// useful; set this to opt out of that, e.g., for ephemeral CI runs
+	// or privacy-strict deployments where no token-count history should
+	// touch disk or memory.
+	DisableStats bool
+
+	// PIIConfigPath points to an optional YAML file describing the PII
+	// pattern set. When empty, the routing/pii module's DefaultPatterns()
+	// (email, phone, SSN, credit card, IPv4, API key prefixes) are
+	// loaded with their default actions. Each entry overrides the
+	// matching default by ID:
+	//
+	//   patterns:
+	//     - id: email
+	//       action: route_local      # downgrade default mask -> route_local
+	//     - id: ssn
+	//       action: block            # upgrade default mask -> block
+	//
+	// Unknown ids are rejected with a clear error at startup.
+	PIIConfigPath string
+
+	// DisablePII turns the regex PII filter off entirely. Default
+	// (false) enables it on the OpenAI chat completions route.
+	DisablePII bool
+
+	// MITMListen is the address (host:port) the cloudproxy MITM
+	// listener binds on. Empty disables the MITM proxy entirely.
+	// Use case: redacting PII from Claude Code / Codex CLI traffic
+	// without LocalAI holding the upstream API key. Clients set
+	// HTTPS_PROXY=http://localai:port and trust the CA cert
+	// LocalAI exposes at /api/middleware/proxy-ca.crt.
+	MITMListen string
+
+	// MITMCADir holds the persisted MITM proxy CA cert and private
+	// key. The CA is generated on first start; subsequent starts
+	// reload it so clients keep trusting the same root. The key
+	// file is mode 0600.
+	MITMCADir string
+
+
+	// PIIPatternOverrides applies persisted per-id deltas (action,
+	// disabled) to the live redactor at startup. Loaded from
+	// runtime_settings.json and applied right after pii.NewRedactor.
+	// nil/empty leaves the YAML defaults in place.
+	PIIPatternOverrides map[string]PIIPatternRuntimeOverride
+
 	DisableWebUI                       bool
 	OllamaAPIRootEndpoint              bool
 	EnforcePredownloadScans            bool
@@ -604,6 +652,45 @@ func WithDataPath(dataPath string) AppOption {
 	}
 }
 
+// WithDisableStats turns off the billing recorder. CLI: --disable-stats.
+func WithDisableStats(disable bool) AppOption {
+	return func(o *ApplicationConfig) {
+		o.DisableStats = disable
+	}
+}
+
+// WithPIIConfigPath points the routing PII filter at a YAML config
+// file. CLI: --pii-config.
+func WithPIIConfigPath(path string) AppOption {
+	return func(o *ApplicationConfig) {
+		o.PIIConfigPath = path
+	}
+}
+
+// WithDisablePII turns the regex PII filter off. CLI: --disable-pii.
+func WithDisablePII(disable bool) AppOption {
+	return func(o *ApplicationConfig) {
+		o.DisablePII = disable
+	}
+}
+
+// WithMITMListen sets the address the cloudproxy MITM listener
+// binds on. Empty = disabled. CLI: --mitm-listen.
+func WithMITMListen(addr string) AppOption {
+	return func(o *ApplicationConfig) {
+		o.MITMListen = addr
+	}
+}
+
+// WithMITMCADir sets the directory used to persist the MITM proxy
+// CA cert + key. CLI: --mitm-ca-dir.
+func WithMITMCADir(dir string) AppOption {
+	return func(o *ApplicationConfig) {
+		o.MITMCADir = dir
+	}
+}
+
+
 func WithDynamicConfigDir(dynamicConfigsDir string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.DynamicConfigsDir = dynamicConfigsDir
@@ -998,6 +1085,8 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 	logoHorizontalFile := o.Branding.LogoHorizontalFile
 	faviconFile := o.Branding.FaviconFile
 
+	mitmListen := o.MITMListen
+
 	return RuntimeSettings{
 		WatchdogEnabled:           &watchdogEnabled,
 		WatchdogIdleEnabled:       &watchdogIdle,
@@ -1051,6 +1140,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 		LogoFile:                  &logoFile,
 		LogoHorizontalFile:        &logoHorizontalFile,
 		FaviconFile:               &faviconFile,
+		MITMListen:                &mitmListen,
 	}
 }
 
@@ -1276,6 +1366,10 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
 		o.Branding.FaviconFile = *settings.FaviconFile
 	}
 
+	if settings.MITMListen != nil {
+		o.MITMListen = *settings.MITMListen
+	}
+
 	// Note: ApiKeys requires special handling (merging with startup keys) - handled in caller
 
 	return requireRestart
diff --git a/core/config/meta/constants.go b/core/config/meta/constants.go
index b0633c22d..b15eb53d0 100644
--- a/core/config/meta/constants.go
+++ b/core/config/meta/constants.go
@@ -49,20 +49,31 @@ var DiffusersPipelineOptions = []FieldOption{
 	{Value: "StableVideoDiffusionPipeline", Label: "StableVideoDiffusionPipeline"},
 }
 
+// UsecaseOptions must stay in sync with GetAllModelConfigUsecases in
+// core/config/model_config.go — a value missing here is silently
+// inaccessible from the model editor, which is how `score` (the router
+// classifier usecase) hid for an entire release.
 var UsecaseOptions = []FieldOption{
 	{Value: "chat", Label: "Chat"},
 	{Value: "completion", Label: "Completion"},
 	{Value: "edit", Label: "Edit"},
 	{Value: "embeddings", Label: "Embeddings"},
 	{Value: "rerank", Label: "Rerank"},
+	{Value: "score", Label: "Score (Router Classifier)"},
 	{Value: "image", Label: "Image"},
+	{Value: "vision", Label: "Vision"},
+	{Value: "detection", Label: "Detection"},
+	{Value: "face_recognition", Label: "Face Recognition"},
 	{Value: "transcript", Label: "Transcript"},
+	{Value: "diarization", Label: "Diarization"},
+	{Value: "speaker_recognition", Label: "Speaker Recognition"},
 	{Value: "tts", Label: "TTS"},
 	{Value: "sound_generation", Label: "Sound Generation"},
+	{Value: "audio_transform", Label: "Audio Transform"},
+	{Value: "realtime_audio", Label: "Realtime Audio"},
 	{Value: "tokenize", Label: "Tokenize"},
 	{Value: "vad", Label: "VAD"},
 	{Value: "video", Label: "Video"},
-	{Value: "detection", Label: "Detection"},
 }
 
 var DiffusersSchedulerOptions = []FieldOption{
diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go
index 99f9e0298..54d891106 100644
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -232,6 +232,17 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Description: "Use the chat template from the model's tokenizer config",
 			Order:       43,
 		},
+		// Router section template — kept in the templates UI section
+		// (rather than the router section under "other") so operators
+		// editing prompt shapes find all template-typed fields in one
+		// place, mirroring how chat / chat_message are grouped.
+		"router.classifier_system_template": {
+			Section:     "templates",
+			Label:       "Router Classifier System Prompt",
+			Description: "Go text/template (with sprig functions) for the routing system prompt the score classifier feeds to its classifier_model. Executed with `.Policies` ([]{Label, Description}). Empty falls back to the built-in Arch-Router-shaped prompt (route-listing block + JSON output schema). Override when the classifier model was trained on a different schema or you need the routing instructions in a different language. The candidate format scored against the model is fixed at `{\"route\": \"<label>\"}` — keep your override's output schema instruction matching that.",
+			Component:   "code-editor",
+			Order:       44,
+		},
 
 		// --- Pipeline ---
 		"pipeline.llm": {
@@ -320,5 +331,207 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Description: "Enable CUDA for diffusers",
 			Order:       82,
 		},
+
+		// --- PII filtering (per-model) ---
+		"pii.enabled": {
+			Section:     "other",
+			Label:       "PII Filtering Enabled",
+			Description: "Enable PII redaction middleware for this model. Unset means use the default (off for local backends, on for proxy-* / cloud-hosted backends).",
+			Component:   "toggle",
+			Order:       200,
+		},
+		"pii.patterns": {
+			Section:     "other",
+			Label:       "PII Pattern Overrides",
+			Description: "Override the global default action for specific patterns on this model. Patterns not listed here inherit the global action (Settings → Middleware → Filtering).",
+			Component:   "pii-pattern-list",
+			Order:       201,
+		},
+
+		// --- Cloud passthrough proxy ---
+		// These only have an effect when Backend is set to
+		// "cloud-proxy". When the upstream URL is empty, the model
+		// fails closed — the chat handler does NOT silently fall back
+		// to the local gRPC pipeline.
+		"proxy.mode": {
+			Section:     "other",
+			Label:       "Proxy Mode",
+			Description: "passthrough forwards the client's OpenAI body verbatim — point upstream_url at an OpenAI-compatible endpoint (incl. Anthropic's /v1/chat/completions compat layer). translate converts OpenAI ↔ Anthropic Messages so you can target a native API (/v1/messages); tool_calls and usage tokens survive the round-trip.",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "passthrough", Label: "passthrough (raw forward)"},
+				{Value: "translate", Label: "translate (OpenAI ↔ native)"},
+			},
+			Default: "passthrough",
+			Order:   208,
+		},
+		"proxy.provider": {
+			Section:     "other",
+			Label:       "Proxy Provider",
+			Description: "Upstream API family. Drives auth header shape (Bearer vs x-api-key + anthropic-version) and, in translate mode, which request/response codec is used.",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "openai", Label: "OpenAI"},
+				{Value: "anthropic", Label: "Anthropic"},
+			},
+			Default: "openai",
+			Order:   209,
+		},
+		"proxy.upstream_url": {
+			Section:     "other",
+			Label:       "Proxy Upstream URL",
+			Description: "Full POST endpoint of the upstream provider (e.g. https://api.openai.com/v1/chat/completions). Only used when Backend is cloud-proxy.",
+			Component:   "input",
+			Order:       210,
+		},
+		"proxy.api_key_env": {
+			Section:     "other",
+			Label:       "Proxy API Key Env Var",
+			Description: "Name of the environment variable holding the upstream API key. Reading from env keeps the secret out of the YAML and the admin UI.",
+			Component:   "input",
+			Order:       211,
+		},
+		"proxy.upstream_model": {
+			Section:     "other",
+			Label:       "Proxy Upstream Model",
+			Description: "Model name sent to the upstream. Leave empty to forward the client's model field unchanged. Useful when the LocalAI alias differs from the upstream's canonical name.",
+			Component:   "input",
+			Order:       212,
+		},
+		"proxy.request_timeout_seconds": {
+			Section:     "other",
+			Label:       "Proxy Request Timeout (seconds)",
+			Description: "Caps the upstream HTTP request duration. 0 disables the deadline; the request still ends when the client disconnects.",
+			Component:   "number",
+			Min:         f64(0),
+			Order:       213,
+		},
+
+		// --- MITM intercept hosts ---
+		// Each host listed here is claimed by this model config; the
+		// cloudproxy MITM listener (see Middleware → MITM Proxy) uses
+		// THIS config's pii: settings to filter the intercepted traffic.
+		// A host claimed by two configs is a critical error — the
+		// listener refuses to start until resolved.
+		"mitm.hosts": {
+			Section:     "other",
+			Label:       "MITM Intercept Hosts",
+			Description: "Hostnames the cloudproxy MITM proxy terminates TLS for on behalf of this model config. PII filtering and pattern overrides flow from this model when the host is intercepted. Each host must be unique across all configs.",
+			Component:   "string-list",
+			Order:       220,
+		},
+
+		// --- Router ---
+		// Routing turns this model config into a dispatcher: the
+		// classifier scores every policy label as a continuation of
+		// the routing prompt and picks the first candidate whose
+		// labels are a superset of the active set. The Routing tab of
+		// the middleware admin page surfaces every model with a router
+		// block.
+		"router.classifier": {
+			Section:     "other",
+			Label:       "Classifier",
+			Description: "Picks a candidate by scoring every policy label against the prompt. Only \"score\" is shipped today; it asks the classifier_model to rank each label and reads off the softmax. Empty defaults to \"score\".",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "score", Label: "Score (Arch-Router-style)"},
+			},
+			Order: 230,
+		},
+		"router.classifier_model": {
+			Section:              "other",
+			Label:                "Classifier Model",
+			Description:          "Loaded LocalAI model the score classifier asks to rank each policy label as a continuation. Must support the Score gRPC primitive (today: llama-cpp, vLLM) and use the ChatML template. Arch-Router-1.5B Q4_K_M is the canonical choice; any small ChatML instruct model also works at a higher activation_threshold.",
+			Component:            "model-select",
+			AutocompleteProvider: ProviderModelsChat,
+			Order:                231,
+		},
+		"router.fallback": {
+			Section:              "other",
+			Label:                "Fallback Model",
+			Description:          "Model used when no candidate's labels cover the classifier's active label set, or when the classifier errors. Empty means router failures bubble up as HTTP 500 — fail-fast, not silent-bypass.",
+			Component:            "model-select",
+			AutocompleteProvider: ProviderModelsChat,
+			Order:                232,
+		},
+		"router.activation_threshold": {
+			Section:     "other",
+			Label:       "Activation Threshold",
+			Description: "Softmax-probability floor a policy must clear to join the active label set for a request. Higher → single-label dominant routes; lower → more multi-label activations. 0 picks the package default (0.15). On Arch-Router-1.5B a value around 0.40 keeps the dominant label clean without losing genuine compound activations.",
+			Component:   "slider",
+			Min:         f64(0),
+			Max:         f64(1),
+			Step:        f64(0.05),
+			Order:       233,
+		},
+		"router.classifier_cache_size": {
+			Section:     "other",
+			Label:       "Classifier L1 Cache Size",
+			Description: "Bounded LRU keyed on (case-folded, whitespace-trimmed) prompt — amortises the classifier round-trip across verbatim repeats common in agent loops. 0 here means \"use the default\" (1024); the cache cannot be disabled from YAML.",
+			Component:   "number",
+			Min:         f64(0),
+			Order:       234,
+		},
+		"router.policies": {
+			Section:     "other",
+			Label:       "Policies",
+			Description: "Label vocabulary the classifier scores over. Each policy has a label and a short natural-language description fed verbatim to the classifier model. Short action-oriented sentences work best (\"writing or debugging code\"; \"small talk\").",
+			Component:   "router-policies",
+			Order:       235,
+		},
+		"router.candidates": {
+			Section:     "other",
+			Label:       "Candidates",
+			Description: "Routing table: each entry binds a downstream model to a set of policy labels it can serve. Order matters — the middleware picks the FIRST candidate whose labels are a superset of the active set, so list candidates smallest → largest.",
+			Component:   "router-candidates",
+			Order:       236,
+		},
+		"router.score_normalization": {
+			Section:     "other",
+			Label:       "Score Normalization",
+			Description: "How the score classifier collapses per-candidate joint log-probs into the softmax input. \"raw\" (default) feeds joint log-prob as-is — on-distribution for Arch-Router (the route the model would actually emit if decoded freely). \"mean\" divides by candidate token count — fairer to long labels but off-distribution for models trained to emit fixed-format outputs.",
+			Component:   "select",
+			Options: []FieldOption{
+				{Value: "", Label: "Raw (default)"},
+				{Value: "raw", Label: "Raw"},
+				{Value: "mean", Label: "Mean (length-normalised)"},
+			},
+			Order: 240,
+		},
+		"router.embedding_cache.embedding_model": {
+			Section:              "other",
+			Label:                "L2 Cache: Embedding Model",
+			Description:          "Embedding model used by the L2 decision cache. Embeds incoming probes and looks them up in the per-router local-store collection. Empty disables the cache entirely. nomic-embed-text-v1.5 is the recommended default.",
+			Component:            "model-select",
+			AutocompleteProvider: ProviderModels,
+			Order:                237,
+		},
+		"router.embedding_cache.similarity_threshold": {
+			Section:     "other",
+			Label:       "L2 Cache: Similarity Threshold",
+			Description: "Cosine-similarity floor a cache candidate must clear to count as a hit. 0 picks the package default (0.80). Re-tune per embedding model — the histogram on the Routing tab shows where the cosine distribution actually sits.",
+			Component:   "slider",
+			Min:         f64(0),
+			Max:         f64(1),
+			Step:        f64(0.01),
+			Order:       238,
+		},
+		"router.embedding_cache.confidence_threshold": {
+			Section:     "other",
+			Label:       "L2 Cache: Confidence Threshold",
+			Description: "Minimum top-label probability a classifier decision must have to be inserted into the cache. 0 picks the package default (0.60). Uncertain decisions are skipped so they can't poison future paraphrases.",
+			Component:   "slider",
+			Min:         f64(0),
+			Max:         f64(1),
+			Step:        f64(0.05),
+			Order:       239,
+		},
+		"router.embedding_cache.store_name": {
+			Section:     "other",
+			Label:       "L2 Cache: Store Name",
+			Description: "Optional override for the local-store collection used by this router's cache. Empty defaults to \"router-cache-<router-model-name>\". Two routers sharing a store_name share their cache (rare).",
+			Component:   "input",
+			Order:       240,
+		},
 	}
 }
diff --git a/core/config/meta/registry_coverage_test.go b/core/config/meta/registry_coverage_test.go
new file mode 100644
index 000000000..0db375c7c
--- /dev/null
+++ b/core/config/meta/registry_coverage_test.go
@@ -0,0 +1,258 @@
+package meta_test
+
+import (
+	"reflect"
+	"sort"
+	"strings"
+	"testing"
+
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/config/meta"
+)
+
+// TestAllFieldsHaveRegistryEntries fails when a NEW ModelConfig field
+// is added without either a registry entry in DefaultRegistry() or an
+// entry in the grandfatheredUnregistered baseline below.
+//
+// Why this matters: fields without a registry entry render in the UI
+// with no description, the default `input` (single-line) component,
+// and land in the catch-all "other" section — which is what we just
+// hit for router.classifier_system_template. The reflection-based
+// fallback produces something that works mechanically but is hostile
+// to operators.
+//
+// How to fix when this test fails:
+//
+//  1. Preferred — add a registry entry to DefaultRegistry() in
+//     registry.go with Section, Label, Description, and Component.
+//     See e.g. "router.classifier" or "template.chat" for the pattern.
+//
+//  2. Escape hatch — append the field path to
+//     grandfatheredUnregistered with a one-line comment justifying
+//     why it has no UI surface (internal, deprecated, legacy
+//     compatibility shim, etc.). The expectation is that this list
+//     shrinks over time as fields get proper registry entries; it
+//     should never grow without good reason.
+//
+// The grandfathered list was seeded from a one-time audit. Migrating
+// the existing 150+ entries to proper registry metadata is out of
+// scope for any single PR; the lock just stops the list from growing.
+func TestAllFieldsHaveRegistryEntries(t *testing.T) {
+	g := NewWithT(t)
+	md := meta.BuildConfigMetadata(reflect.TypeOf(config.ModelConfig{}))
+	reg := meta.DefaultRegistry()
+
+	grand := make(map[string]struct{}, len(grandfatheredUnregistered))
+	for _, p := range grandfatheredUnregistered {
+		grand[p] = struct{}{}
+	}
+
+	var missing []string
+	for _, f := range md.Fields {
+		if _, ok := reg[f.Path]; ok {
+			continue
+		}
+		if _, ok := grand[f.Path]; ok {
+			continue
+		}
+		missing = append(missing, f.Path)
+	}
+
+	sort.Strings(missing)
+	g.Expect(missing).To(BeEmpty(),
+		"%d config field(s) have no registry entry and are not on the grandfathered list.\n"+
+			"Add a registry entry to core/config/meta/registry.go OR append to grandfatheredUnregistered in this file with a justification:\n  %s",
+		len(missing), strings.Join(missing, "\n  "),
+	)
+
+	// Inverse drift check: catch dead entries on the grandfathered
+	// list (field was renamed/removed, or someone wrote a registry
+	// entry without trimming the grandfathered duplicate).
+	known := make(map[string]struct{}, len(md.Fields))
+	for _, f := range md.Fields {
+		known[f.Path] = struct{}{}
+	}
+	var stale, duplicated []string
+	for _, p := range grandfatheredUnregistered {
+		if _, ok := known[p]; !ok {
+			stale = append(stale, p)
+		}
+		if _, ok := reg[p]; ok {
+			duplicated = append(duplicated, p)
+		}
+	}
+	sort.Strings(stale)
+	g.Expect(stale).To(BeEmpty(),
+		"grandfatheredUnregistered references fields that no longer exist in ModelConfig — remove them:\n  %s",
+		strings.Join(stale, "\n  "))
+	sort.Strings(duplicated)
+	g.Expect(duplicated).To(BeEmpty(),
+		"grandfatheredUnregistered references fields that now HAVE a registry entry — remove them so the test stays meaningful:\n  %s",
+		strings.Join(duplicated, "\n  "))
+}
+
+// grandfatheredUnregistered is the baseline of config fields that
+// pre-date the registry-coverage test and have no UI metadata yet.
+// Adding new entries here should be a deliberate, justified decision
+// — prefer adding a registry entry in registry.go instead.
+//
+// Keep the list sorted (one-line-per-entry) so the diff is minimal
+// when an entry is removed or (rarely) added.
+var grandfatheredUnregistered = []string{
+	"agent.disable_sink_state",
+	"agent.enable_mcp_prompts",
+	"agent.enable_plan_re_evaluator",
+	"agent.enable_planning",
+	"agent.enable_reasoning",
+	"agent.force_reasoning_tool",
+	"agent.loop_detection",
+	"agent.max_adjustment_attempts",
+	"agent.max_attempts",
+	"agent.max_iterations",
+	"cfg_scale",
+	"concurrency_groups",
+	"cutstrings",
+	"debug",
+	"diffusers.clip_model",
+	"diffusers.clip_skip",
+	"diffusers.clip_subfolder",
+	"diffusers.control_net",
+	"diffusers.enable_parameters",
+	"diffusers.img2img",
+	"disable_log_stats",
+	"disabled",
+	"download_files",
+	"draft_model",
+	"dtype",
+	"enforce_eager",
+	"engine_args",
+	"extract_regex",
+	"feature_flags",
+	"function.argument_regex",
+	"function.argument_regex_key_name",
+	"function.argument_regex_value_name",
+	"function.automatic_tool_parsing_fallback",
+	"function.capture_llm_results",
+	"function.disable_no_action",
+	"function.disable_peg_parser",
+	"function.function_arguments_key",
+	"function.function_name_key",
+	"function.grammar.disable_parallel_new_lines",
+	"function.grammar.expect_strings_after_json",
+	"function.grammar.no_mixed_free_string",
+	"function.grammar.prefix",
+	"function.grammar.properties_order",
+	"function.grammar.schema_type",
+	"function.grammar.triggers",
+	"function.json_regex_match",
+	"function.no_action_description_name",
+	"function.no_action_function_name",
+	"function.replace_function_results",
+	"function.replace_llm_results",
+	"function.response_regex",
+	"function.xml_format.allow_toolcall_in_think",
+	"function.xml_format.key_start",
+	"function.xml_format.key_val_sep",
+	"function.xml_format.key_val_sep2",
+	"function.xml_format.last_tool_end",
+	"function.xml_format.last_val_end",
+	"function.xml_format.raw_argval",
+	"function.xml_format.scope_end",
+	"function.xml_format.scope_start",
+	"function.xml_format.tool_end",
+	"function.xml_format.tool_sep",
+	"function.xml_format.tool_start",
+	"function.xml_format.trim_raw_argval",
+	"function.xml_format.val_end",
+	"function.xml_format_preset",
+	"gpu_memory_utilization",
+	"grammar",
+	"grpc.attempts",
+	"grpc.attempts_sleep_time",
+	"limit_mm_per_prompt.audio",
+	"limit_mm_per_prompt.image",
+	"limit_mm_per_prompt.video",
+	"limits.max_concurrent",
+	"limits.retry_after_seconds",
+	"load_format",
+	"lora_adapter",
+	"lora_adapters",
+	"lora_base",
+	"lora_scale",
+	"lora_scales",
+	"main_gpu",
+	"max_model_len",
+	"mcp.remote",
+	"mcp.stdio",
+	"mirostat",
+	"mirostat_eta",
+	"mirostat_tau",
+	"mmproj",
+	"n_draft",
+	"ngqa",
+	"no_kv_offloading",
+	"no_mulmatq",
+	"numa",
+	"options",
+	"overrides",
+	"parameters.batch",
+	"parameters.clip_skip",
+	"parameters.echo",
+	"parameters.encoding_format",
+	"parameters.frequency_penalty",
+	"parameters.ignore_eos",
+	"parameters.language",
+	"parameters.logit_bias",
+	"parameters.logprobs",
+	"parameters.min_p",
+	"parameters.model",
+	"parameters.n",
+	"parameters.n_keep",
+	"parameters.negative_prompt",
+	"parameters.negative_prompt_scale",
+	"parameters.presence_penalty",
+	"parameters.repeat_last_n",
+	"parameters.rope_freq_base",
+	"parameters.rope_freq_scale",
+	"parameters.tfz",
+	"parameters.tokenizer",
+	"parameters.top_logprobs",
+	"parameters.translate",
+	"parameters.typical_p",
+	"pinned",
+	"prompt_cache_all",
+	"prompt_cache_path",
+	"prompt_cache_ro",
+	"proxy.api_key_file",
+	"reasoning.disable",
+	"reasoning.disable_reasoning_tag_prefill",
+	"reasoning.strip_reasoning_only",
+	"reasoning.tag_pairs",
+	"reasoning.thinking_start_tokens",
+	"reranking",
+	"rms_norm_eps",
+	"roles",
+	"rope_scaling",
+	"step",
+	"stopwords",
+	"swap_space",
+	"system_prompt",
+	"template.edit",
+	"template.function",
+	"template.join_chat_messages_by_character",
+	"template.multimodal",
+	"template.reply_prefix",
+	"tensor_parallel_size",
+	"tensor_split",
+	"trimspace",
+	"trimsuffix",
+	"trust_remote_code",
+	"tts.audio_path",
+	"type",
+	"yarn_attn_factor",
+	"yarn_beta_fast",
+	"yarn_beta_slow",
+	"yarn_ext_factor",
+}
diff --git a/core/config/mitm_host_owners_test.go b/core/config/mitm_host_owners_test.go
new file mode 100644
index 000000000..1ab2f36f2
--- /dev/null
+++ b/core/config/mitm_host_owners_test.go
@@ -0,0 +1,133 @@
+package config_test
+
+import (
+	"os"
+	"path/filepath"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/config"
+)
+
+// MITMHostOwners is the load-bearing piece of D2 — a duplicate host
+// across model configs is a critical error that disables the listener.
+// The test exercises both happy paths (no duplicates → clean Owners
+// map) and conflict detection (two configs on one host → entry in
+// Conflicts naming both).
+
+var _ = Describe("ModelConfigLoader.MITMHostOwners", func() {
+	var (
+		dir    string
+		loader *config.ModelConfigLoader
+	)
+
+	writeYAML := func(name, body string) {
+		path := filepath.Join(dir, name+".yaml")
+		Expect(os.WriteFile(path, []byte(body), 0o644)).To(Succeed())
+		Expect(loader.ReadModelConfig(path)).To(Succeed())
+	}
+
+	BeforeEach(func() {
+		var err error
+		dir, err = os.MkdirTemp("", "mitm-host-owners-test-*")
+		Expect(err).ToNot(HaveOccurred())
+		loader = config.NewModelConfigLoader(dir)
+	})
+
+	AfterEach(func() {
+		_ = os.RemoveAll(dir)
+	})
+
+	It("returns empty maps when no model declares mitm.hosts", func() {
+		writeYAML("plain", `name: plain
+backend: llama-cpp
+`)
+		got := loader.MITMHostOwners()
+		Expect(got.Owners).To(BeEmpty())
+		Expect(got.Conflicts).To(BeEmpty())
+	})
+
+	It("indexes hosts to the owning model name", func() {
+		writeYAML("claude", `name: claude
+backend: cloud-proxy
+mitm:
+  hosts:
+    - api.anthropic.com
+`)
+		writeYAML("openai", `name: openai
+backend: cloud-proxy
+mitm:
+  hosts:
+    - api.openai.com
+    - api.openai.azure.com
+`)
+		got := loader.MITMHostOwners()
+		Expect(got.Owners).To(Equal(map[string]string{
+			"api.anthropic.com":    "claude",
+			"api.openai.com":       "openai",
+			"api.openai.azure.com": "openai",
+		}))
+		Expect(got.Conflicts).To(BeEmpty())
+	})
+
+	It("normalises case and trims whitespace before indexing", func() {
+		writeYAML("claude", `name: claude
+backend: cloud-proxy
+mitm:
+  hosts:
+    - "  API.ANTHROPIC.com  "
+`)
+		got := loader.MITMHostOwners()
+		Expect(got.Owners).To(HaveKey("api.anthropic.com"))
+	})
+
+	It("detects two configs claiming the same host as a conflict", func() {
+		// The 1-to-1 invariant the D2 dispatcher relies on: a host
+		// claimed twice means the owner lookup is ambiguous, so the
+		// caller must NOT start the MITM listener until resolved.
+		writeYAML("alpha", `name: alpha
+backend: cloud-proxy
+mitm:
+  hosts:
+    - api.anthropic.com
+`)
+		writeYAML("beta", `name: beta
+backend: cloud-proxy
+mitm:
+  hosts:
+    - api.anthropic.com
+`)
+		got := loader.MITMHostOwners()
+		Expect(got.Conflicts).To(HaveKey("api.anthropic.com"))
+		Expect(got.Conflicts["api.anthropic.com"]).To(ConsistOf("alpha", "beta"))
+	})
+
+	It("treats the same host listed twice within ONE config as a no-op (not a conflict)", func() {
+		// A single config repeating a host is benign — same owner
+		// either way. The conflict signal must be cross-config only.
+		writeYAML("dup", `name: dup
+backend: llama-cpp
+mitm:
+  hosts:
+    - api.example.com
+    - api.example.com
+`)
+		got := loader.MITMHostOwners()
+		Expect(got.Owners).To(Equal(map[string]string{"api.example.com": "dup"}))
+		Expect(got.Conflicts).To(BeEmpty())
+	})
+
+	It("ignores empty/whitespace-only host entries", func() {
+		writeYAML("sloppy", `name: sloppy
+backend: llama-cpp
+mitm:
+  hosts:
+    - ""
+    - "   "
+    - api.real.com
+`)
+		got := loader.MITMHostOwners()
+		Expect(got.Owners).To(Equal(map[string]string{"api.real.com": "sloppy"}))
+	})
+})
diff --git a/core/config/model_config.go b/core/config/model_config.go
index c49a87f4c..727d12f7c 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -7,6 +7,7 @@ import (
 	"regexp"
 	"slices"
 	"strings"
+	"text/template"
 
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/downloader"
@@ -95,8 +96,330 @@ type ModelConfig struct {
 	Options   []string `yaml:"options,omitempty" json:"options,omitempty"`
 	Overrides []string `yaml:"overrides,omitempty" json:"overrides,omitempty"`
 
-	MCP   MCPConfig   `yaml:"mcp,omitempty" json:"mcp,omitempty"`
-	Agent AgentConfig `yaml:"agent,omitempty" json:"agent,omitempty"`
+	MCP    MCPConfig       `yaml:"mcp,omitempty" json:"mcp,omitempty"`
+	Agent  AgentConfig     `yaml:"agent,omitempty" json:"agent,omitempty"`
+	PII    PIIConfig       `yaml:"pii,omitempty" json:"pii,omitempty"`
+	Router RouterConfig    `yaml:"router,omitempty" json:"router,omitempty"`
+	Proxy  ProxyConfig     `yaml:"proxy,omitempty" json:"proxy,omitempty"`
+	MITM   MITMModelConfig `yaml:"mitm,omitempty" json:"mitm,omitempty"`
+	Limits LimitsConfig    `yaml:"limits,omitempty" json:"limits,omitempty"`
+}
+
+// @Description Admission-control limits applied per request. The
+// admission middleware enforces these before invoking the handler;
+// requests that exceed a limit get 503 with a Retry-After hint so
+// clients back off rather than pile on. Per-model so cloud passthroughs
+// can have a stricter ceiling than local models.
+type LimitsConfig struct {
+	// MaxConcurrent caps simultaneous in-flight requests for this
+	// model. 0 = unlimited (default). Useful for cloud-passthrough
+	// configs where the upstream rate-limits aggressively, or for
+	// local backends whose memory budget tops out before LocalAI's
+	// queue depth would.
+	MaxConcurrent int `yaml:"max_concurrent,omitempty" json:"max_concurrent,omitempty"`
+
+	// RetryAfterSeconds advises clients how long to wait before
+	// retrying when admission rejects. 0 defaults to 1s — enough to
+	// let an in-flight request finish on a busy local model. The
+	// value is sent verbatim in the Retry-After response header.
+	RetryAfterSeconds int `yaml:"retry_after_seconds,omitempty" json:"retry_after_seconds,omitempty"`
+}
+
+// @Description MITM intercept binding for the model. When the cloudproxy
+// MITM listener is enabled and any host listed here appears in a CONNECT,
+// the proxy uses THIS model config's pii: settings to filter the
+// intercepted body. Strict 1-to-1: a host claimed by two configs is a
+// configuration error and disables the MITM listener until resolved.
+//
+// Lets an admin pair a host (api.anthropic.com) with the model's
+// PII overrides without maintaining a parallel per-host map.
+type MITMModelConfig struct {
+	// Hosts is the list of hostnames this model claims for MITM
+	// interception. Each entry must be unique across all model configs.
+	Hosts []string `yaml:"hosts,omitempty" json:"hosts,omitempty"`
+}
+
+// @Description Cloud proxy configuration. The cloud-proxy backend
+// forwards a model's traffic to an external provider. Two modes:
+//
+//   - mode: passthrough — client and upstream must speak the same wire
+//     format; the backend ships the raw request body to the upstream
+//     URL and streams the response back untouched. The streaming PII
+//     filter still runs because it operates on extracted token text.
+//
+//   - mode: translate — the backend converts LocalAI's internal proto
+//     to the provider's wire format and back. Unlocks cross-provider
+//     routing (OpenAI client → Anthropic upstream, etc.) at the cost
+//     of dropping provider-specific extensions that the internal proto
+//     doesn't model.
+type ProxyConfig struct {
+	// UpstreamURL is the full POST endpoint, e.g.
+	// https://api.openai.com/v1/chat/completions or
+	// https://api.anthropic.com/v1/messages. Required.
+	UpstreamURL string `yaml:"upstream_url,omitempty" json:"upstream_url,omitempty"`
+
+	// Mode selects passthrough (wire-perfect) or translate (full
+	// control via internal proto). Empty defaults to passthrough.
+	Mode string `yaml:"mode,omitempty" json:"mode,omitempty"`
+
+	// Provider identifies the upstream's wire format for translate
+	// mode (openai, anthropic). Ignored in passthrough mode — the
+	// wire format there is whatever the client sent.
+	Provider string `yaml:"provider,omitempty" json:"provider,omitempty"`
+
+	// APIKeyEnv names the environment variable holding the upstream
+	// API key. Mutually exclusive with APIKeyFile. Both empty is
+	// allowed (no-auth upstreams).
+	APIKeyEnv string `yaml:"api_key_env,omitempty" json:"api_key_env,omitempty"`
+
+	// APIKeyFile is a path to a file whose contents are the upstream
+	// API key. Trailing whitespace is trimmed. Mutually exclusive
+	// with APIKeyEnv. The integration point for K8s secret mounts,
+	// Vault agent files, and similar external-secret workflows.
+	APIKeyFile string `yaml:"api_key_file,omitempty" json:"api_key_file,omitempty"`
+
+	// UpstreamModel overrides the model name sent to the upstream.
+	// Useful when the LocalAI-facing model alias differs from the
+	// upstream's canonical name (e.g. local "claude-strict" maps to
+	// upstream "claude-3-5-sonnet-20241022"). Empty means forward
+	// the client's model field unchanged.
+	UpstreamModel string `yaml:"upstream_model,omitempty" json:"upstream_model,omitempty"`
+
+	// RequestTimeoutSeconds caps the upstream request duration. 0
+	// means no per-request timeout (only the request context, which
+	// is bound to the client connection, applies).
+	RequestTimeoutSeconds int `yaml:"request_timeout_seconds,omitempty" json:"request_timeout_seconds,omitempty"`
+}
+
+// Proxy mode names. Validate() normalises an empty Mode to
+// ProxyModePassthrough so downstream code only sees concrete values.
+const (
+	ProxyModePassthrough = "passthrough"
+	ProxyModeTranslate   = "translate"
+)
+
+// Proxy provider names. Only meaningful in translate mode, where the
+// cloud-proxy backend picks the wire format to use against the
+// upstream URL.
+const (
+	ProxyProviderOpenAI    = "openai"
+	ProxyProviderAnthropic = "anthropic"
+)
+
+// IsCloudProxyBackendPassthrough reports whether this model uses the
+// cloud-proxy gRPC backend in passthrough mode. Empty Mode counts as
+// passthrough (SetDefaults normalises it, but Validate accepts empty
+// too — handlers should not rely on a particular call order).
+func (c *ModelConfig) IsCloudProxyBackendPassthrough() bool {
+	if c.Backend != "cloud-proxy" {
+		return false
+	}
+	return c.Proxy.Mode == "" || c.Proxy.Mode == ProxyModePassthrough
+}
+
+// @Description Intelligent routing configuration. When a model declares
+// a Router block, requests addressed to it are reclassified at runtime
+// and dispatched to one of the named candidates. The router rewrites
+// input.Model in-place, then the standard model-resolution path picks
+// up the resolved config — meaning ACL checks, disabled-state, and
+// per-model PII still run against the chosen target.
+//
+// Depth-1 invariant: candidates must NOT themselves carry a Router
+// block. The router's "smart-router → claude-strict → cloud-proxy"
+// chain is fine, but "router-A → router-B → claude" is rejected at
+// config load to keep the dispatch graph acyclic and predictable. The
+// middleware also asserts depth ≤ 1 at runtime as a defensive check.
+type RouterConfig struct {
+	// Classifier picks the implementation. Only "score" ships today:
+	// it asks the classifier model to score every Policy label as a
+	// continuation of the routing prompt and reads off the
+	// distribution. Empty defaults to "score".
+	Classifier string `yaml:"classifier,omitempty" json:"classifier,omitempty"`
+
+	// Policies is the label vocabulary the classifier scores over.
+	// Each policy carries a natural-language description that ends up
+	// in the system prompt the classifier model sees — short, action-
+	// oriented sentences work best ("writing or debugging code",
+	// "small talk", ...). The Score classifier picks the subset of
+	// labels whose softmax probability passes ActivationThreshold.
+	Policies []RouterPolicy `yaml:"policies,omitempty" json:"policies,omitempty"`
+
+	// Candidates is the routing table — each entry binds a downstream
+	// model to a set of labels it can serve. The middleware picks the
+	// FIRST candidate whose Labels are a superset of the active label
+	// set from the classifier. Admins order this list smallest →
+	// largest so a query that needs one label routes to the smallest
+	// capable model, while a query that needs multiple falls to a
+	// bigger candidate that covers them all.
+	Candidates []RouterCandidate `yaml:"candidates,omitempty" json:"candidates,omitempty"`
+
+	// Fallback is the model used when no candidate matches the active
+	// label set, or when the classifier returns nothing above
+	// threshold. Empty fallback means router failures bubble up as
+	// 500 — fail-fast, not silent-bypass.
+	Fallback string `yaml:"fallback,omitempty" json:"fallback,omitempty"`
+
+	// ClassifierModel names the model the Score classifier scores
+	// against (Arch-Router-1.5B is the canonical choice).
+	ClassifierModel string `yaml:"classifier_model,omitempty" json:"classifier_model,omitempty"`
+
+	// ClassifierCacheSize bounds the per-prompt memo cache that
+	// amortises the classifier round-trip across repeat probes.
+	// 0 disables the cache. Default 1024.
+	ClassifierCacheSize int `yaml:"classifier_cache_size,omitempty" json:"classifier_cache_size,omitempty"`
+
+	// ActivationThreshold is the softmax-probability floor a policy
+	// must clear to be considered "active" for the request. 0
+	// defaults to a sensible value (~0.15) inside the classifier.
+	// Higher → narrower routes (single-label dominant); lower →
+	// more multi-label activations.
+	ActivationThreshold float64 `yaml:"activation_threshold,omitempty" json:"activation_threshold,omitempty"`
+
+	// ClassifierSystemTemplate overrides the routing system prompt
+	// the score classifier feeds to its classifier_model. Go
+	// text/template + Sprig, executed with `.Policies []ScorePolicy`
+	// (Label + Description fields). Empty falls back to the built-in
+	// Arch-Router-shaped template (route-listing block + JSON output
+	// schema). Override when the classifier model was trained on a
+	// different schema (e.g. bare label output, XML route block) or
+	// when the routing instructions need to be in a different
+	// language. The candidate format scored against the model is
+	// fixed at `{"route": "<label>"}` and IS NOT templated — keep
+	// your override's output schema instruction matching that, or
+	// the per-candidate scores degenerate.
+	ClassifierSystemTemplate string `yaml:"classifier_system_template,omitempty" json:"classifier_system_template,omitempty"`
+
+	// ScoreNormalization picks how the score classifier collapses
+	// per-candidate joint log-probs into the softmax input.
+	//   - ""/"raw": use joint log-prob as-is (default). Matches the
+	//     distribution the classifier model was trained against — the
+	//     route the model would actually emit if decoded freely.
+	//   - "mean": divide by candidate token count. Fairer to long
+	//     labels (their joint log-prob is mechanically smaller because
+	//     it sums more negatives), but off-distribution for models
+	//     trained to emit fixed-format outputs like Arch-Router's
+	//     {"route": "name"}.
+	// Future modes (e.g. "weighted_mean") will land here too.
+	ScoreNormalization string `yaml:"score_normalization,omitempty" json:"score_normalization,omitempty"`
+
+	// EmbeddingCache configures the L2 cache that maps prompt
+	// embeddings to past decisions, so semantically-similar prompts
+	// reuse a classification instead of re-running the classifier
+	// model. Omit the block to disable. See router/embedding_cache.go.
+	EmbeddingCache *EmbeddingCacheConfig `yaml:"embedding_cache,omitempty" json:"embedding_cache,omitempty"`
+}
+
+// EmbeddingCacheConfig configures the L2 embedding-similarity decision
+// cache. Pairs naturally with a larger / slower classifier model: the
+// classifier round-trip is amortised across paraphrases of the same
+// intent. The cache uses the standard /v1/embeddings backend for
+// vector generation and the local-store gRPC surface for KNN search.
+type EmbeddingCacheConfig struct {
+	// EmbeddingModel names the loaded LocalAI model used to embed
+	// router prompts. Required when the cache is enabled. Any model
+	// that supports the Embeddings gRPC primitive works;
+	// nomic-embed-text-v1.5 is the recommended default.
+	EmbeddingModel string `yaml:"embedding_model" json:"embedding_model"`
+
+	// SimilarityThreshold is the cosine-similarity floor a cache
+	// candidate must clear to be treated as a hit. 0 picks the
+	// package default (0.80). Higher → fewer false hits, higher miss
+	// rate; lower → more aggressive sharing across paraphrases.
+	SimilarityThreshold float64 `yaml:"similarity_threshold,omitempty" json:"similarity_threshold,omitempty"`
+
+	// ConfidenceThreshold is the minimum classifier top-label
+	// probability for a decision to be inserted into the cache. 0
+	// picks the package default (0.60). Uncertain decisions are not
+	// cached so they can't poison future paraphrases.
+	ConfidenceThreshold float64 `yaml:"confidence_threshold,omitempty" json:"confidence_threshold,omitempty"`
+
+	// StoreName overrides the local-store collection name used for
+	// this router's cache. Empty defaults to "router-cache-<router>"
+	// where <router> is the parent model name. Useful when two
+	// router models should share a cache (rare).
+	StoreName string `yaml:"store_name,omitempty" json:"store_name,omitempty"`
+}
+
+// RouterPolicy is one entry in the label vocabulary. The label string
+// is what the classifier model emits and what candidates reference in
+// their Labels field; the description is the natural-language hint
+// fed to the classifier so it can match user intent against the label
+// space.
+type RouterPolicy struct {
+	Label       string `yaml:"label" json:"label"`
+	Description string `yaml:"description" json:"description"`
+}
+
+// RouterCandidate names a downstream model and the policy labels it
+// is willing to serve. Labels are matched as a set: the middleware
+// picks the first candidate whose Labels is a superset of the
+// classifier's active set.
+type RouterCandidate struct {
+	Model  string   `yaml:"model" json:"model"`
+	Labels []string `yaml:"labels" json:"labels"`
+}
+
+// HasRouter returns true when the model declares a router config with
+// at least one candidate. Used by the RouteModel middleware to decide
+// whether to engage the classifier.
+func (c *ModelConfig) HasRouter() bool {
+	return len(c.Router.Candidates) > 0
+}
+
+// @Description PII filtering configuration. PII redaction is per-model so
+// that local models don't pay the latency or behaviour change of regex
+// scanning, while cloud-bound traffic (cloud-proxy backend) can default to
+// on. Setting Enabled explicitly always wins over the backend default.
+type PIIConfig struct {
+	// Enabled toggles redaction for this model. When unset (zero value),
+	// the resolved default depends on Backend: cloud-proxy defaults to
+	// true, everything else to false. A pointer is used so the absence of
+	// the YAML key is distinguishable from explicit false.
+	Enabled *bool `yaml:"enabled,omitempty" json:"enabled,omitempty"`
+
+	// Patterns lets a model upgrade or downgrade individual pattern
+	// actions (mask | block | route_local) relative to the global
+	// defaults loaded from --pii-config / DefaultPatterns. Pattern IDs
+	// not listed inherit the global action. The regex itself stays
+	// global — only the action is settable per-model.
+	Patterns []PIIPatternOverride `yaml:"patterns,omitempty" json:"patterns,omitempty"`
+}
+
+// @Description Per-model action override for a single PII pattern.
+type PIIPatternOverride struct {
+	ID     string `yaml:"id" json:"id"`
+	Action string `yaml:"action" json:"action"`
+}
+
+// PIIIsEnabled returns the resolved PII state for this model. Single
+// source of truth for the gating decision so the middleware and the
+// /api/middleware/status admin view agree.
+func (c *ModelConfig) PIIIsEnabled() bool {
+	if c.PII.Enabled != nil {
+		return *c.PII.Enabled
+	}
+	return c.Backend == "cloud-proxy"
+}
+
+// PIIPatternOverrides returns the per-pattern action overrides as a map
+// keyed by pattern ID. The values are the raw action strings — the pii
+// package validates and converts them.
+//
+// Returned via the documented modelPIIConfig interface in
+// core/services/routing/pii/middleware.go without taking a config
+// dependency on this package.
+func (c *ModelConfig) PIIPatternOverrides() map[string]string {
+	if len(c.PII.Patterns) == 0 {
+		return nil
+	}
+	out := make(map[string]string, len(c.PII.Patterns))
+	for _, p := range c.PII.Patterns {
+		if p.ID == "" {
+			continue
+		}
+		out[p.ID] = p.Action
+	}
+	return out
 }
 
 // @Description MCP configuration
@@ -401,6 +724,14 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	f16 := lo.f16
 	debug := lo.debug
 
+	// Cloud-proxy: normalise empty Mode so downstream consumers
+	// switch on two concrete values only. Validate accepts empty too,
+	// but SetDefaults is the chokepoint that runs before any
+	// inference path reads cfg.Proxy.Mode.
+	if cfg.Proxy.Mode == "" {
+		cfg.Proxy.Mode = ProxyModePassthrough
+	}
+
 	// Apply model-family-specific inference defaults before generic fallbacks.
 	// This ensures gallery-installed and runtime-loaded models get optimal parameters.
 	ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
@@ -573,9 +904,74 @@ func (c *ModelConfig) Validate() (bool, error) {
 		}
 	}
 
+	// Cloud-proxy: at most one of api_key_env / api_key_file may be
+	// set. Both empty means no Authorization header (no-auth upstream
+	// or a development passthrough). The mode field accepts the empty
+	// string (defaults to passthrough), "passthrough", or "translate".
+	if c.Proxy.APIKeyEnv != "" && c.Proxy.APIKeyFile != "" {
+		return false, fmt.Errorf("proxy: api_key_env and api_key_file are mutually exclusive")
+	}
+	switch c.Proxy.Mode {
+	case "", ProxyModePassthrough, ProxyModeTranslate:
+		// Empty is accepted at validate-time and normalised to
+		// passthrough by SetDefaults so it never reaches runtime.
+	default:
+		return false, fmt.Errorf("proxy: unknown mode %q (expected %s or %s)",
+			c.Proxy.Mode, ProxyModePassthrough, ProxyModeTranslate)
+	}
+	if c.Proxy.Mode == ProxyModeTranslate && c.Proxy.Provider == "" {
+		return false, fmt.Errorf("proxy: translate mode requires provider (%s, %s)",
+			ProxyProviderOpenAI, ProxyProviderAnthropic)
+	}
+
+	// Score on llama-cpp bypasses the slot loop and races the
+	// llama_context against concurrent generation/embedding traffic
+	// (see backend/cpp/llama-cpp/grpc-server.cpp on Score). Reject the
+	// combination here so operators are forced to split the model.
+	const scoreConflicts = FLAG_CHAT | FLAG_COMPLETION | FLAG_EMBEDDINGS
+	if (c.Backend == "llama-cpp" || c.Backend == "llama") &&
+		c.HasUsecases(FLAG_SCORE) && c.KnownUsecases != nil &&
+		*c.KnownUsecases&scoreConflicts != 0 {
+		return false, fmt.Errorf(
+			"known_usecases conflict on llama-cpp: score is incompatible " +
+				"with chat/completion/embeddings — split into separate model configs")
+	}
+
+	// router.score_normalization is consumed lazily by the score
+	// classifier at first-request time; without load-time validation
+	// a typo wouldn't surface until the first router request panicked
+	// inside NewScoreClassifier. Reject unknown values here so the
+	// operator sees the offending key at startup.
+	switch c.Router.ScoreNormalization {
+	case "", ScoreNormalizationRaw, ScoreNormalizationMean:
+		// ok
+	default:
+		return false, fmt.Errorf("router: unknown score_normalization %q (expected %q or %q)",
+			c.Router.ScoreNormalization, ScoreNormalizationRaw, ScoreNormalizationMean)
+	}
+
+	// router.classifier_system_template parses as Go text/template
+	// (Sprig funcs available at execution time). Reject malformed
+	// templates at load time so the operator sees the parse error
+	// at startup rather than as a 500 on the first router request.
+	if c.Router.ClassifierSystemTemplate != "" {
+		if _, err := template.New("classifier_system").Parse(c.Router.ClassifierSystemTemplate); err != nil {
+			return false, fmt.Errorf("router: classifier_system_template parse error: %w", err)
+		}
+	}
+
 	return true, nil
 }
 
+// Score normalisation modes mirror router.ScoreNormalization* —
+// duplicated as constants on the config package so ModelConfig.Validate
+// can reject unknown values without taking a dependency on the router
+// package (which already depends on config).
+const (
+	ScoreNormalizationRaw  = "raw"
+	ScoreNormalizationMean = "mean"
+)
+
 func (c *ModelConfig) HasTemplate() bool {
 	return c.TemplateConfig.Completion != "" || c.TemplateConfig.Edit != "" || c.TemplateConfig.Chat != "" || c.TemplateConfig.ChatMessage != "" || c.TemplateConfig.UseTokenizerTemplate
 }
@@ -624,19 +1020,19 @@ func (c *ModelConfig) GetConcurrencyGroups() []string {
 type ModelConfigUsecase int
 
 const (
-	FLAG_ANY              ModelConfigUsecase = 0b000000000000
-	FLAG_CHAT             ModelConfigUsecase = 0b000000000001
-	FLAG_COMPLETION       ModelConfigUsecase = 0b000000000010
-	FLAG_EDIT             ModelConfigUsecase = 0b000000000100
-	FLAG_EMBEDDINGS       ModelConfigUsecase = 0b000000001000
-	FLAG_RERANK           ModelConfigUsecase = 0b000000010000
-	FLAG_IMAGE            ModelConfigUsecase = 0b000000100000
-	FLAG_TRANSCRIPT       ModelConfigUsecase = 0b000001000000
-	FLAG_TTS              ModelConfigUsecase = 0b000010000000
-	FLAG_SOUND_GENERATION ModelConfigUsecase = 0b000100000000
-	FLAG_TOKENIZE         ModelConfigUsecase = 0b001000000000
-	FLAG_VAD              ModelConfigUsecase = 0b010000000000
-	FLAG_VIDEO            ModelConfigUsecase = 0b100000000000
+	FLAG_ANY                 ModelConfigUsecase = 0b000000000000
+	FLAG_CHAT                ModelConfigUsecase = 0b000000000001
+	FLAG_COMPLETION          ModelConfigUsecase = 0b000000000010
+	FLAG_EDIT                ModelConfigUsecase = 0b000000000100
+	FLAG_EMBEDDINGS          ModelConfigUsecase = 0b000000001000
+	FLAG_RERANK              ModelConfigUsecase = 0b000000010000
+	FLAG_IMAGE               ModelConfigUsecase = 0b000000100000
+	FLAG_TRANSCRIPT          ModelConfigUsecase = 0b000001000000
+	FLAG_TTS                 ModelConfigUsecase = 0b000010000000
+	FLAG_SOUND_GENERATION    ModelConfigUsecase = 0b000100000000
+	FLAG_TOKENIZE            ModelConfigUsecase = 0b001000000000
+	FLAG_VAD                 ModelConfigUsecase = 0b010000000000
+	FLAG_VIDEO               ModelConfigUsecase = 0b100000000000
 	FLAG_DETECTION           ModelConfigUsecase = 0b1000000000000
 	FLAG_VISION              ModelConfigUsecase = 0b10000000000000
 	FLAG_FACE_RECOGNITION    ModelConfigUsecase = 0b100000000000000
@@ -644,6 +1040,14 @@ const (
 	FLAG_AUDIO_TRANSFORM     ModelConfigUsecase = 0b10000000000000000
 	FLAG_DIARIZATION         ModelConfigUsecase = 0b100000000000000000
 	FLAG_REALTIME_AUDIO      ModelConfigUsecase = 0b1000000000000000000
+	// Marks a model as wired for the Score gRPC primitive (joint
+	// log-prob of candidate continuations under a shared prompt). Must
+	// be declared explicitly via `known_usecases: [score]` — there's
+	// no heuristic for it. On the llama-cpp backend, Score bypasses
+	// the slot loop and races the llama_context, so Validate() refuses
+	// to load a llama-cpp config that combines FLAG_SCORE with
+	// chat/completion/embeddings.
+	FLAG_SCORE ModelConfigUsecase = 0b10000000000000000000
 
 	// Common Subsets
 	FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
@@ -653,12 +1057,12 @@ const (
 // Flags within the same group are NOT orthogonal (e.g., chat and completion are
 // both text/language). A model is multimodal when its usecases span 2+ groups.
 var ModalityGroups = []ModelConfigUsecase{
-	FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT,    // text/language
-	FLAG_VISION | FLAG_DETECTION,               // visual understanding
-	FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO,      // speech input — realtime_audio is any-to-any, so it counts here too
+	FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT,                // text/language
+	FLAG_VISION | FLAG_DETECTION,                           // visual understanding
+	FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO,                  // speech input — realtime_audio is any-to-any, so it counts here too
 	FLAG_TTS | FLAG_SOUND_GENERATION | FLAG_REALTIME_AUDIO, // audio output — and here, so a lone realtime_audio flag still reads as multimodal
-	FLAG_AUDIO_TRANSFORM,                       // audio in/out transforms
-	FLAG_IMAGE | FLAG_VIDEO,                    // visual generation
+	FLAG_AUDIO_TRANSFORM,                                   // audio in/out transforms
+	FLAG_IMAGE | FLAG_VIDEO,                                // visual generation
 }
 
 // IsMultimodal returns true if the given usecases span two or more orthogonal
@@ -681,19 +1085,19 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
 	return map[string]ModelConfigUsecase{
 		// Note: FLAG_ANY is intentionally excluded from this map
 		// because it's 0 and would always match in HasUsecases checks
-		"FLAG_CHAT":             FLAG_CHAT,
-		"FLAG_COMPLETION":       FLAG_COMPLETION,
-		"FLAG_EDIT":             FLAG_EDIT,
-		"FLAG_EMBEDDINGS":       FLAG_EMBEDDINGS,
-		"FLAG_RERANK":           FLAG_RERANK,
-		"FLAG_IMAGE":            FLAG_IMAGE,
-		"FLAG_TRANSCRIPT":       FLAG_TRANSCRIPT,
-		"FLAG_TTS":              FLAG_TTS,
-		"FLAG_SOUND_GENERATION": FLAG_SOUND_GENERATION,
-		"FLAG_TOKENIZE":         FLAG_TOKENIZE,
-		"FLAG_VAD":              FLAG_VAD,
-		"FLAG_LLM":              FLAG_LLM,
-		"FLAG_VIDEO":            FLAG_VIDEO,
+		"FLAG_CHAT":                FLAG_CHAT,
+		"FLAG_COMPLETION":          FLAG_COMPLETION,
+		"FLAG_EDIT":                FLAG_EDIT,
+		"FLAG_EMBEDDINGS":          FLAG_EMBEDDINGS,
+		"FLAG_RERANK":              FLAG_RERANK,
+		"FLAG_IMAGE":               FLAG_IMAGE,
+		"FLAG_TRANSCRIPT":          FLAG_TRANSCRIPT,
+		"FLAG_TTS":                 FLAG_TTS,
+		"FLAG_SOUND_GENERATION":    FLAG_SOUND_GENERATION,
+		"FLAG_TOKENIZE":            FLAG_TOKENIZE,
+		"FLAG_VAD":                 FLAG_VAD,
+		"FLAG_LLM":                 FLAG_LLM,
+		"FLAG_VIDEO":               FLAG_VIDEO,
 		"FLAG_DETECTION":           FLAG_DETECTION,
 		"FLAG_VISION":              FLAG_VISION,
 		"FLAG_FACE_RECOGNITION":    FLAG_FACE_RECOGNITION,
@@ -701,6 +1105,7 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
 		"FLAG_AUDIO_TRANSFORM":     FLAG_AUDIO_TRANSFORM,
 		"FLAG_DIARIZATION":         FLAG_DIARIZATION,
 		"FLAG_REALTIME_AUDIO":      FLAG_REALTIME_AUDIO,
+		"FLAG_SCORE":               FLAG_SCORE,
 	}
 }
 
@@ -726,9 +1131,23 @@ func GetUsecasesFromYAML(input []string) *ModelConfigUsecase {
 }
 
 // HasUsecases examines a ModelConfig and determines which endpoints have a chance of success.
+//
+// Declared known_usecases are normally additive — the guessing heuristic
+// still adds whatever it can infer from backend/templates. The one
+// exception is FLAG_SCORE: when the operator declared score, they
+// reserved the model for the router classifier. Letting GuessUsecases
+// paint chat/completion on top would surface it in chat pickers it was
+// deliberately kept out of, and (on llama-cpp) reintroduce the slot
+// contention the score/chat conflict check exists to prevent. So a
+// declared score list is authoritative.
 func (c *ModelConfig) HasUsecases(u ModelConfigUsecase) bool {
-	if (c.KnownUsecases != nil) && ((u & *c.KnownUsecases) == u) {
-		return true
+	if c.KnownUsecases != nil {
+		if (u & *c.KnownUsecases) == u {
+			return true
+		}
+		if (*c.KnownUsecases & FLAG_SCORE) == FLAG_SCORE {
+			return false
+		}
 	}
 	return c.GuessUsecases(u)
 }
@@ -885,6 +1304,14 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool {
 		}
 	}
 
+	if (u & FLAG_SCORE) == FLAG_SCORE {
+		// No heuristic: Score-intent is a deliberate operator choice
+		// (it reserves the model from generation traffic on llama-cpp),
+		// so HasUsecases(FLAG_SCORE) is true only when KnownUsecases
+		// declares it explicitly.
+		return false
+	}
+
 	return true
 }
 
diff --git a/core/config/model_config_loader.go b/core/config/model_config_loader.go
index 32b2bb38a..89f4bc5cb 100644
--- a/core/config/model_config_loader.go
+++ b/core/config/model_config_loader.go
@@ -388,6 +388,49 @@ func (bcl *ModelConfigLoader) Preload(modelPath string) error {
 	return nil
 }
 
+// MITMHostOwnership is the result of mapping intercept hosts to the
+// model configs that claim them. The invariant the dispatcher relies
+// on: every host belongs to AT MOST one model config. Any duplicate
+// is surfaced via Conflicts and disables the MITM listener until
+// resolved — a half-applied "first wins" rule would silently mask
+// configuration drift, so we fail loud.
+type MITMHostOwnership struct {
+	// Owners maps lowercase hostname → owning model name. Empty when
+	// no model declares mitm.hosts.
+	Owners map[string]string
+	// Conflicts lists hosts claimed by 2+ configs, with the names of
+	// the configs that claim them. Non-empty Conflicts means callers
+	// must NOT start the MITM listener.
+	Conflicts map[string][]string
+}
+
+// MITMHostOwners walks every loaded ModelConfig's mitm.hosts, builds
+// the host→owner index, and reports any duplicates. The lookup table
+// is hostname-lowercased to match the Server's allowlist semantics.
+func (bcl *ModelConfigLoader) MITMHostOwners() MITMHostOwnership {
+	bcl.Lock()
+	defer bcl.Unlock()
+	owners := map[string]string{}
+	collisions := map[string][]string{}
+	for name, cfg := range bcl.configs {
+		for _, h := range cfg.MITM.Hosts {
+			h = strings.ToLower(strings.TrimSpace(h))
+			if h == "" {
+				continue
+			}
+			if existing, ok := owners[h]; ok && existing != name {
+				if _, seen := collisions[h]; !seen {
+					collisions[h] = []string{existing}
+				}
+				collisions[h] = append(collisions[h], name)
+				continue
+			}
+			owners[h] = name
+		}
+	}
+	return MITMHostOwnership{Owners: owners, Conflicts: collisions}
+}
+
 // LoadModelConfigsFromPath reads all the configurations of the models from a path
 // (non-recursive)
 func (bcl *ModelConfigLoader) LoadModelConfigsFromPath(path string, opts ...ConfigLoaderOption) error {
diff --git a/core/config/model_config_test.go b/core/config/model_config_test.go
index c1216ec35..d1b7abae9 100644
--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -54,6 +54,141 @@ parameters:
 			Expect(err).To(BeNil())
 			Expect(valid).To(BeTrue())
 
+			// llama-cpp configs can't mix the score usecase with
+			// chat/completion/embeddings — Score bypasses the slot
+			// loop and would race the llama_context. The check fires
+			// at load and save time; here we exercise it directly.
+			scoreFlag := FLAG_SCORE | FLAG_CHAT
+			conflicting := ModelConfig{
+				Name:          "router-but-also-chat",
+				Backend:       "llama-cpp",
+				KnownUsecases: &scoreFlag,
+			}
+			valid, err = conflicting.Validate()
+			Expect(valid).To(BeFalse())
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("score is incompatible"))
+
+			scoreOnly := FLAG_SCORE
+			dedicated := ModelConfig{
+				Name:          "router-only",
+				Backend:       "llama-cpp",
+				KnownUsecases: &scoreOnly,
+			}
+			valid, err = dedicated.Validate()
+			Expect(valid).To(BeTrue())
+			Expect(err).NotTo(HaveOccurred())
+
+			// The constraint is llama-cpp-specific; other backends
+			// may safely combine.
+			scoreAndChat := FLAG_SCORE | FLAG_CHAT
+			otherBackend := ModelConfig{
+				Name:          "vllm-router-and-chat",
+				Backend:       "vllm",
+				KnownUsecases: &scoreAndChat,
+			}
+			valid, err = otherBackend.Validate()
+			Expect(valid).To(BeTrue())
+			Expect(err).NotTo(HaveOccurred())
+
+			// Cloud-proxy: api_key_env and api_key_file are mutually
+			// exclusive — picking both is a config bug we catch at
+			// load/save rather than at backend-load time.
+			bothKeys := ModelConfig{
+				Name:    "both-keys",
+				Backend: "cloud-proxy",
+				Proxy: ProxyConfig{
+					UpstreamURL: "https://example.com/v1",
+					APIKeyEnv:   "OPENAI_KEY",
+					APIKeyFile:  "/run/secrets/openai",
+				},
+			}
+			valid, err = bothKeys.Validate()
+			Expect(valid).To(BeFalse())
+			Expect(err).To(MatchError(ContainSubstring("mutually exclusive")))
+
+			// Translate mode requires a provider — without one, the
+			// backend has no way to pick a wire format.
+			translateNoProvider := ModelConfig{
+				Name:    "translate-no-provider",
+				Backend: "cloud-proxy",
+				Proxy:   ProxyConfig{UpstreamURL: "https://example.com/v1", Mode: ProxyModeTranslate},
+			}
+			valid, err = translateNoProvider.Validate()
+			Expect(valid).To(BeFalse())
+			Expect(err).To(MatchError(ContainSubstring("translate mode requires provider")))
+
+			// Unknown mode is rejected.
+			badMode := ModelConfig{
+				Name:    "bad-mode",
+				Backend: "cloud-proxy",
+				Proxy:   ProxyConfig{UpstreamURL: "https://example.com/v1", Mode: "rewrite"},
+			}
+			valid, err = badMode.Validate()
+			Expect(valid).To(BeFalse())
+			Expect(err).To(MatchError(ContainSubstring("unknown mode")))
+
+			// Passthrough (default) with one key source is happy.
+			passthroughOK := ModelConfig{
+				Name:    "passthrough-ok",
+				Backend: "cloud-proxy",
+				Proxy:   ProxyConfig{UpstreamURL: "https://example.com/v1", APIKeyEnv: "OPENAI_KEY"},
+			}
+			valid, err = passthroughOK.Validate()
+			Expect(valid).To(BeTrue())
+			Expect(err).NotTo(HaveOccurred())
+
+			// router.score_normalization: load-time rejection of an
+			// unknown value. The classifier consumes it lazily, so
+			// without this validation a YAML typo wouldn't surface
+			// until the first router request panicked deep in
+			// NewScoreClassifier.
+			badNorm := ModelConfig{
+				Name: "bad-norm",
+				Router: RouterConfig{
+					ScoreNormalization: "men", // typo of "mean"
+				},
+			}
+			valid, err = badNorm.Validate()
+			Expect(valid).To(BeFalse())
+			Expect(err).To(MatchError(ContainSubstring("unknown score_normalization")))
+
+			// Accepted values pass.
+			for _, mode := range []string{"", ScoreNormalizationRaw, ScoreNormalizationMean} {
+				goodNorm := ModelConfig{
+					Name:   "good-norm-" + mode,
+					Router: RouterConfig{ScoreNormalization: mode},
+				}
+				valid, err = goodNorm.Validate()
+				Expect(valid).To(BeTrue(), "score_normalization=%q should be accepted", mode)
+				Expect(err).NotTo(HaveOccurred())
+			}
+
+			// router.classifier_system_template: parse-time rejection
+			// of malformed Go templates. Same reasoning as above —
+			// without this the parse error wouldn't surface until
+			// the first router request panicked in NewScoreClassifier.
+			badTmpl := ModelConfig{
+				Name: "bad-tmpl",
+				Router: RouterConfig{
+					ClassifierSystemTemplate: "Routes: {{range .Policies",
+				},
+			}
+			valid, err = badTmpl.Validate()
+			Expect(valid).To(BeFalse())
+			Expect(err).To(MatchError(ContainSubstring("classifier_system_template parse error")))
+
+			// Well-formed template passes.
+			goodTmpl := ModelConfig{
+				Name: "good-tmpl",
+				Router: RouterConfig{
+					ClassifierSystemTemplate: `Routes: {{range .Policies}}{{.Label}} {{end}}`,
+				},
+			}
+			valid, err = goodTmpl.Validate()
+			Expect(valid).To(BeTrue())
+			Expect(err).NotTo(HaveOccurred())
+
 			// download https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml
 			httpClient := http.Client{}
 			resp, err := httpClient.Get("https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml")
@@ -168,6 +303,29 @@ parameters:
 		Expect(i.HasUsecases(FLAG_TTS)).To(BeFalse())
 		Expect(i.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
 		Expect(i.HasUsecases(FLAG_CHAT)).To(BeTrue())
+
+		// Declared `known_usecases: [score]` is authoritative — the
+		// guessing heuristic must NOT add chat on top, even though the
+		// inherited chatml template would otherwise satisfy the chat
+		// heuristic. Score means "this model is reserved for the
+		// router classifier"; surfacing it as a chat model defeats the
+		// reservation and reintroduces the slot contention the load-time
+		// score/chat conflict check exists to prevent.
+		scoreReserved := FLAG_SCORE
+		j := ModelConfig{
+			Name:          "arch-router",
+			Backend:       "llama-cpp",
+			KnownUsecases: &scoreReserved,
+			TemplateConfig: TemplateConfig{
+				Chat:    "inherited from chatml",
+				ChatMessage: "inherited from chatml",
+				Completion:  "inherited from chatml",
+			},
+		}
+		Expect(j.HasUsecases(FLAG_SCORE)).To(BeTrue())
+		Expect(j.HasUsecases(FLAG_CHAT)).To(BeFalse())
+		Expect(j.HasUsecases(FLAG_COMPLETION)).To(BeFalse())
+		Expect(j.HasUsecases(FLAG_EMBEDDINGS)).To(BeFalse())
 	})
 	It("Test Validate with invalid MCP config", func() {
 		tmp, err := os.CreateTemp("", "config.yaml")
diff --git a/core/config/runtime_settings.go b/core/config/runtime_settings.go
index dbddc4a2d..721362ada 100644
--- a/core/config/runtime_settings.go
+++ b/core/config/runtime_settings.go
@@ -90,4 +90,26 @@ type RuntimeSettings struct {
 	LogoFile           *string `json:"logo_file,omitempty"`
 	LogoHorizontalFile *string `json:"logo_horizontal_file,omitempty"`
 	FaviconFile        *string `json:"favicon_file,omitempty"`
+
+	// Cloud-proxy MITM listener. MITMCADir is intentionally NOT
+	// exposed at runtime — the CA dir is a startup-only path and
+	// changing it after the CA has been generated would orphan
+	// trusted clients.
+	MITMListen *string `json:"mitm_listen,omitempty"`
+
+	// PII pattern overrides — keyed by pattern id, applied to the live
+	// redactor at startup and persisted by POST /api/pii/patterns/persist.
+	// Distinguishes from --pii-config (which replaces the entire
+	// pattern set) by only carrying the per-id action/enabled deltas
+	// against the global default catalog.
+	PIIPatternOverrides *map[string]PIIPatternRuntimeOverride `json:"pii_pattern_overrides,omitempty"`
+}
+
+// PIIPatternRuntimeOverride captures the persistable deltas an admin
+// has applied to a single global PII pattern. Both fields are pointers
+// so an override that only flips Disabled doesn't have to also restate
+// Action (and vice versa).
+type PIIPatternRuntimeOverride struct {
+	Action   *string `json:"action,omitempty"`
+	Disabled *bool   `json:"disabled,omitempty"`
 }
diff --git a/core/config/runtime_settings_persist_test.go b/core/config/runtime_settings_persist_test.go
index b2f61c10a..a36acb0d2 100644
--- a/core/config/runtime_settings_persist_test.go
+++ b/core/config/runtime_settings_persist_test.go
@@ -51,6 +51,25 @@ var _ = Describe("RuntimeSettings persistence helpers", func() {
 		})
 	})
 
+	// MITM round trip pins the contract that loadRuntimeSettingsFromFile
+	// MITM listener address must survive a write/read round trip so the
+	// next process restart can bring the listener back up. (Intercept
+	// hosts now live in model YAML rather than runtime_settings.json.)
+	Describe("MITM round trip", func() {
+		It("preserves mitm_listen across read/write", func() {
+			listen := ":8443"
+			Expect(cfg.WritePersistedSettings(config.RuntimeSettings{
+				MITMListen: &listen,
+			})).To(Succeed())
+
+			got, err := cfg.ReadPersistedSettings()
+			Expect(err).ToNot(HaveOccurred())
+
+			Expect(got.MITMListen).ToNot(BeNil())
+			Expect(*got.MITMListen).To(Equal(":8443"))
+		})
+	})
+
 	// PreserveOnSaveDoesNotClobberAssets reproduces the user-reported
 	// regression: an admin uploads a logo, then clicks Save on the
 	// Settings page. The Save body still has the stale pre-upload
diff --git a/core/explorer/empty_db.json.lock b/core/explorer/empty_db.json.lock
new file mode 100644
index 000000000..e69de29bb
diff --git a/core/explorer/test_db.json.lock b/core/explorer/test_db.json.lock
new file mode 100644
index 000000000..e69de29bb
diff --git a/core/http/app.go b/core/http/app.go
index 0f7bbbbcb..4aa7aa46f 100644
--- a/core/http/app.go
+++ b/core/http/app.go
@@ -25,10 +25,8 @@ import (
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services/finetune"
 	"github.com/mudler/LocalAI/core/services/galleryop"
-	"github.com/mudler/LocalAI/core/services/monitoring"
 	"github.com/mudler/LocalAI/core/services/nodes"
 	"github.com/mudler/LocalAI/core/services/quantization"
-	"github.com/mudler/LocalAI/pkg/signals"
 
 	"github.com/mudler/xlog"
 )
@@ -213,19 +211,18 @@ func API(application *application.Application) (*echo.Echo, error) {
 		e.Use(middleware.Recover())
 	}
 
-	// Metrics middleware
-	if !application.ApplicationConfig().DisableMetrics {
-		metricsService, err := monitoring.NewLocalAIMetricsService()
-		if err != nil {
-			return nil, err
-		}
-
-		if metricsService != nil {
-			e.Use(localai.LocalAIMetricsAPIMiddleware(metricsService))
-			e.Server.RegisterOnShutdown(func() {
-				metricsService.Shutdown()
-			})
-		}
+	// Metrics middleware. The metric service was created in
+	// application.start() so the OTel global provider is set before any
+	// counter is registered (the routing-module billing recorder relies
+	// on this). We reuse that instance here rather than calling
+	// monitoring.NewLocalAIMetricsService a second time, which would
+	// create a second provider, second prometheus exporter, and orphan
+	// whichever instance lost the SetMeterProvider race.
+	if metricsService := application.MetricsService(); metricsService != nil {
+		e.Use(localai.LocalAIMetricsAPIMiddleware(metricsService))
+		e.Server.RegisterOnShutdown(func() {
+			_ = metricsService.Shutdown()
+		})
 	}
 
 	// Health Checks should always be exempt from auth, so register these first
@@ -268,13 +265,9 @@ func API(application *application.Application) (*echo.Echo, error) {
 		e.Static("/generated-videos", videoPath)
 	}
 
-	// Initialize usage recording when auth DB is available, and ensure the
-	// batcher drains its in-memory queue on graceful shutdown so the last
-	// few seconds of usage don't disappear when the process exits.
-	if application.AuthDB() != nil {
-		httpMiddleware.InitUsageRecorder(application.AuthDB())
-		signals.RegisterGracefulTerminationHandler(httpMiddleware.ShutdownUsageRecorder)
-	}
+	// Usage recording is initialised in application/startup.go and
+	// surfaced via application.StatsRecorder(); routes wire UsageMiddleware
+	// against that recorder regardless of auth state.
 
 	// Auth is applied to _all_ endpoints. Filtering out endpoints to bypass is
 	// the role of the exempt-path logic inside the middleware.
@@ -361,6 +354,13 @@ func API(application *application.Application) (*echo.Echo, error) {
 	// Register auth routes (login, callback, API keys, user management)
 	routes.RegisterAuthRoutes(e, application)
 
+	// Register routing-module usage endpoints. Unlike /api/auth/usage
+	// these go through the StatsRecorder and work in no-auth single-user
+	// mode by attributing requests to the synthetic "local" user.
+	routes.RegisterUsageRoutes(e, application)
+	routes.RegisterPIIRoutes(e, application)
+	routes.RegisterMiddlewareRoutes(e, application)
+
 	routes.RegisterElevenLabsRoutes(e, requestExtractor, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig())
 
 	// Create opcache for tracking UI operations (used by both UI and LocalAI routes)
diff --git a/core/http/auth/usage.go b/core/http/auth/usage.go
index 44ccf840e..1ab308d52 100644
--- a/core/http/auth/usage.go
+++ b/core/http/auth/usage.go
@@ -17,6 +17,18 @@ const (
 )
 
 // UsageRecord represents a single API request's token usage.
+//
+// Model semantics: Model is the legacy column kept for backward-compatible
+// aggregation; new code should write RequestedModel (what the client asked
+// for) and ServedModel (what actually ran after routing). When no router
+// is in play, all three are equal.
+//
+// PreFilterPromptTokens vs PromptTokens: PromptTokens is the count after
+// PII redaction (i.e., what the backend processed and was billed for).
+// PreFilterPromptTokens is the count of the original prompt before any
+// PII filtering; PostFilterPromptTokens duplicates PromptTokens for
+// queryability symmetry. For non-PII paths PreFilterPromptTokens ==
+// PostFilterPromptTokens == PromptTokens.
 type UsageRecord struct {
 	ID       uint   `gorm:"primaryKey;autoIncrement"`
 	UserID   string `gorm:"size:36;index:idx_usage_user_time"`
@@ -37,6 +49,22 @@ type UsageRecord struct {
 	TotalTokens      int64
 	Duration         int64     // milliseconds
 	CreatedAt        time.Time `gorm:"index:idx_usage_user_time"`
+
+	// Routing extension fields. Nullable / zero-valued for legacy rows.
+	RequestedModel         string  `gorm:"size:255;index"`
+	ServedModel            string  `gorm:"size:255;index"`
+	PreFilterPromptTokens  int64   // tokens the client sent before PII redaction
+	PostFilterPromptTokens int64   // tokens after redaction (== PromptTokens unless filter shrunk it)
+	CachedTokens           int64   // backend-reported KV-cache hit tokens
+	PrefillTokens          int64   // backend-reported prefill tokens (subset of prompt)
+	DraftTokens            int64   // speculative-decoding draft tokens
+	PricingVersionID       string  `gorm:"size:64;index"` // FK to pricing_version; "" when no pricing was applied
+	CostUSD                float64 // computed at insert when pricing is available; 0 with empty PricingVersionID = unknown
+
+	// Cross-subsystem correlation. Empty when the subsystem didn't run.
+	CorrelationID    string `gorm:"size:64;index"`
+	RouterDecisionID string `gorm:"size:64;index"`
+	PIIEventID       string `gorm:"size:64"`
 }
 
 // RecordUsage inserts a usage record.
diff --git a/core/http/endpoints/anthropic/anthropic_suite_test.go b/core/http/endpoints/anthropic/anthropic_suite_test.go
new file mode 100644
index 000000000..0b88b92f2
--- /dev/null
+++ b/core/http/endpoints/anthropic/anthropic_suite_test.go
@@ -0,0 +1,13 @@
+package anthropic
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestAnthropic(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Anthropic test suite")
+}
diff --git a/core/http/endpoints/anthropic/messages.go b/core/http/endpoints/anthropic/messages.go
index 62e58a4a1..c4776d084 100644
--- a/core/http/endpoints/anthropic/messages.go
+++ b/core/http/endpoints/anthropic/messages.go
@@ -10,10 +10,13 @@ import (
 	"github.com/labstack/echo/v4"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/auth"
 	mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
 	openaiEndpoint "github.com/mudler/LocalAI/core/http/endpoints/openai"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/cloudproxy"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
@@ -27,7 +30,7 @@ import (
 // @Param request body schema.AnthropicRequest true "query params"
 // @Success 200 {object} schema.AnthropicResponse "Response"
 // @Router /v1/messages [post]
-func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig, natsClient mcpTools.MCPNATSClient) echo.HandlerFunc {
+func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig, natsClient mcpTools.MCPNATSClient, piiRedactor *pii.Redactor, piiEvents pii.EventStore) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		id := uuid.New().String()
 
@@ -47,6 +50,12 @@ func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evalu
 
 		xlog.Debug("Anthropic Messages endpoint configuration read", "config", cfg)
 
+		// Cloud-proxy bail. Same shape as the OpenAI chat endpoint —
+		// forwards via the cloud-proxy gRPC backend.
+		if cfg.IsCloudProxyBackendPassthrough() {
+			return forwardCloudProxyAnthropicViaBackend(c, cfg, input, piiRedactor, piiEvents, ml, appConfig)
+		}
+
 		// Convert Anthropic messages to OpenAI format for internal processing
 		openAIMessages := convertAnthropicToOpenAIMessages(input)
 
@@ -132,7 +141,7 @@ func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evalu
 		xlog.Debug("Anthropic Messages - Prompt (after templating)", "prompt", predInput)
 
 		if input.Stream {
-			return handleAnthropicStream(c, id, input, cfg, ml, cl, appConfig, predInput, openAIReq, funcs, shouldUseFn, mcpExecutor, evaluator)
+			return handleAnthropicStream(c, id, input, cfg, ml, cl, appConfig, predInput, openAIReq, funcs, shouldUseFn, mcpExecutor, evaluator, piiRedactor, piiEvents)
 		}
 
 		return handleAnthropicNonStream(c, id, input, cfg, ml, cl, appConfig, predInput, openAIReq, funcs, shouldUseFn, mcpExecutor, evaluator)
@@ -313,17 +322,45 @@ func handleAnthropicNonStream(c echo.Context, id string, input *schema.Anthropic
 			xlog.Debug("Anthropic Response", "response", string(respData))
 		}
 
+		middleware.StampUsage(c, input.Model, tokenUsage.Prompt, tokenUsage.Completion)
+
 		return c.JSON(200, resp)
 	} // end MCP iteration loop
 
 	return sendAnthropicError(c, 500, "api_error", "MCP iteration limit reached")
 }
 
-func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicRequest, cfg *config.ModelConfig, ml *model.ModelLoader, cl *config.ModelConfigLoader, appConfig *config.ApplicationConfig, predInput string, openAIReq *schema.OpenAIRequest, funcs functions.Functions, shouldUseFn bool, mcpExecutor mcpTools.ToolExecutor, evaluator *templates.Evaluator) error {
+func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicRequest, cfg *config.ModelConfig, ml *model.ModelLoader, cl *config.ModelConfigLoader, appConfig *config.ApplicationConfig, predInput string, openAIReq *schema.OpenAIRequest, funcs functions.Functions, shouldUseFn bool, mcpExecutor mcpTools.ToolExecutor, evaluator *templates.Evaluator, piiRedactor *pii.Redactor, piiEvents pii.EventStore) error {
 	c.Response().Header().Set("Content-Type", "text/event-stream")
 	c.Response().Header().Set("Cache-Control", "no-cache")
 	c.Response().Header().Set("Connection", "keep-alive")
 
+	// Per-stream PII filter — same gating as the OpenAI chat path. The
+	// filter is wire-format-agnostic; we feed it the text portion of
+	// each text_delta and emit only what's safe to send. The filter
+	// holds back a tail of size MaxPatternLength-1 so a pattern split
+	// across chunk boundaries still gets masked. When PII is disabled
+	// for this model the filter is nil and emits flow unchanged.
+	var streamPIIFilter *pii.StreamFilter
+	if piiRedactor != nil && cfg.PIIIsEnabled() {
+		correlationID := c.Request().Header.Get("x-request-id")
+		userID := ""
+		if u := auth.GetUser(c); u != nil {
+			userID = u.ID
+		}
+		var overrides map[string]pii.Action
+		if raw := cfg.PIIPatternOverrides(); len(raw) > 0 {
+			overrides = make(map[string]pii.Action, len(raw))
+			for ovid, action := range raw {
+				switch pii.Action(action) {
+				case pii.ActionMask, pii.ActionBlock, pii.ActionRouteLocal:
+					overrides[ovid] = pii.Action(action)
+				}
+			}
+		}
+		streamPIIFilter = pii.NewStreamFilter(piiRedactor, overrides, piiEvents, correlationID, userID)
+	}
+
 	// Send message_start event
 	messageStart := schema.AnthropicStreamEvent{
 		Type: "message_start",
@@ -403,6 +440,7 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 
 				if len(toolCalls) > toolCallsEmitted {
 					if !inToolCall && currentBlockIndex == 0 {
+						drainStreamPIIToText(c, streamPIIFilter, intPtr(currentBlockIndex))
 						sendAnthropicSSE(c, schema.AnthropicStreamEvent{
 							Type:  "content_block_stop",
 							Index: intPtr(currentBlockIndex),
@@ -443,14 +481,20 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 			}
 
 			if !inToolCall && token != "" {
-				sendAnthropicSSE(c, schema.AnthropicStreamEvent{
-					Type:  "content_block_delta",
-					Index: intPtr(0),
-					Delta: &schema.AnthropicStreamDelta{
-						Type: "text_delta",
-						Text: token,
-					},
-				})
+				out := token
+				if streamPIIFilter != nil {
+					out = streamPIIFilter.Push(token)
+				}
+				if out != "" {
+					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
+						Type:  "content_block_delta",
+						Index: intPtr(0),
+						Delta: &schema.AnthropicStreamDelta{
+							Type: "text_delta",
+							Text: out,
+						},
+					})
+				}
 			}
 			return true
 		}
@@ -488,14 +532,20 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 			// didn't already stream it (autoparser clears raw text, so
 			// accumulatedContent will be empty in that case).
 			if deltaContent != "" && !inToolCall && accumulatedContent == "" {
-				sendAnthropicSSE(c, schema.AnthropicStreamEvent{
-					Type:  "content_block_delta",
-					Index: intPtr(0),
-					Delta: &schema.AnthropicStreamDelta{
-						Type: "text_delta",
-						Text: deltaContent,
-					},
-				})
+				out := deltaContent
+				if streamPIIFilter != nil {
+					out = streamPIIFilter.Push(deltaContent)
+				}
+				if out != "" {
+					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
+						Type:  "content_block_delta",
+						Index: intPtr(0),
+						Delta: &schema.AnthropicStreamDelta{
+							Type: "text_delta",
+							Text: out,
+						},
+					})
+				}
 			}
 
 			// Emit tool_use blocks from ChatDeltas
@@ -503,6 +553,7 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 				collectedToolCalls = deltaToolCalls
 
 				if !inToolCall && currentBlockIndex == 0 {
+					drainStreamPIIToText(c, streamPIIFilter, intPtr(currentBlockIndex))
 					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
 						Type:  "content_block_stop",
 						Index: intPtr(currentBlockIndex),
@@ -606,7 +657,9 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 		if !shouldUseFn && cfg.FunctionsConfig.AutomaticToolParsingFallback && accumulatedContent != "" && toolCallsEmitted == 0 {
 			parsed := functions.ParseFunctionCall(accumulatedContent, cfg.FunctionsConfig)
 			if len(parsed) > 0 {
-				// Close the text content block
+				// Close the text content block (after flushing any
+				// residual the streaming PII filter held back).
+				drainStreamPIIToText(c, streamPIIFilter, intPtr(currentBlockIndex))
 				sendAnthropicSSE(c, schema.AnthropicStreamEvent{
 					Type:  "content_block_stop",
 					Index: intPtr(currentBlockIndex),
@@ -646,8 +699,12 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 			}
 		}
 
-		// No MCP tools to execute, close stream
+		// No MCP tools to execute, close stream. drainStreamPIIToText
+		// flushes any residual the streaming PII filter held back as
+		// part of its trailing pattern-window before we close the
+		// text content block.
 		if !inToolCall {
+			drainStreamPIIToText(c, streamPIIFilter, intPtr(0))
 			sendAnthropicSSE(c, schema.AnthropicStreamEvent{
 				Type:  "content_block_stop",
 				Index: intPtr(0),
@@ -673,6 +730,8 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
 			Type: "message_stop",
 		})
 
+		middleware.StampUsage(c, input.Model, tokenUsage.Prompt, tokenUsage.Completion)
+
 		return nil
 	} // end MCP iteration loop
 
@@ -693,6 +752,30 @@ func convertFuncsToOpenAITools(funcs functions.Functions) []functions.Tool {
 
 func intPtr(i int) *int { return &i }
 
+// drainStreamPIIToText flushes any residual the streaming PII filter
+// has been holding back as part of its trailing pattern-window, and
+// emits it as one final text_delta into the named block before the
+// caller closes that block. Drain is idempotent: calling it twice on
+// the same filter returns "" the second time. Safe to call with a nil
+// filter (no-op).
+func drainStreamPIIToText(c echo.Context, sf *pii.StreamFilter, index *int) {
+	if sf == nil {
+		return
+	}
+	residual := sf.Drain()
+	if residual == "" {
+		return
+	}
+	sendAnthropicSSE(c, schema.AnthropicStreamEvent{
+		Type:  "content_block_delta",
+		Index: index,
+		Delta: &schema.AnthropicStreamDelta{
+			Type: "text_delta",
+			Text: residual,
+		},
+	})
+}
+
 func sendAnthropicSSE(c echo.Context, event schema.AnthropicStreamEvent) {
 	data, err := json.Marshal(event)
 	if err != nil {
@@ -888,3 +971,19 @@ func convertAnthropicTools(input *schema.AnthropicRequest, cfg *config.ModelConf
 
 	return funcs, len(funcs) > 0 && cfg.ShouldUseFunctions()
 }
+
+// forwardCloudProxyAnthropicViaBackend marshals the Anthropic request,
+// constructs the streaming PII filter (when applicable), and hands the
+// body off to the cloud-proxy gRPC backend. Model swap + upstream auth
+// headers are applied inside the backend; the filter is built here
+// because the auth/correlation context only exists in the echo handler.
+func forwardCloudProxyAnthropicViaBackend(c echo.Context, cfg *config.ModelConfig, input *schema.AnthropicRequest, piiRedactor *pii.Redactor, piiEvents pii.EventStore, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error {
+	body, err := json.Marshal(input)
+	if err != nil {
+		return sendAnthropicError(c, 400, "invalid_request_error", "cloudproxy: marshal request: "+err.Error())
+	}
+
+	correlationID := c.Request().Header.Get("x-request-id")
+	streamFilter := cloudproxy.BuildStreamFilter(c, cfg, input.Stream, piiRedactor, piiEvents, correlationID)
+	return cloudproxy.ForwardViaBackend(c, cfg, body, streamFilter, ml, appConfig)
+}
diff --git a/core/http/endpoints/anthropic/messages_pii_test.go b/core/http/endpoints/anthropic/messages_pii_test.go
new file mode 100644
index 000000000..91e5297e4
--- /dev/null
+++ b/core/http/endpoints/anthropic/messages_pii_test.go
@@ -0,0 +1,114 @@
+package anthropic
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"strings"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// drainStreamPIIToText is called from four sites in messages.go and is
+// the load-bearing primitive for "the streaming filter has buffered
+// some bytes that the request just ended on; flush them as a final
+// text_delta event before closing the content block". A regression
+// here would silently truncate the last few bytes of an assistant
+// response on every PII-enabled stream — invisible without coverage.
+
+// newTestFilter compiles the default patterns and returns a filter
+// that holds back its trailing pattern-window; pushing a short string
+// (shorter than holdLen) keeps the bytes inside Drain.
+func newTestFilter() *pii.StreamFilter {
+	patterns, err := pii.Compile(pii.DefaultPatterns())
+	ExpectWithOffset(1, err).NotTo(HaveOccurred())
+	red := pii.NewRedactor(patterns)
+	return pii.NewStreamFilter(red, nil, nil, "", "")
+}
+
+// newTestContext builds a recording echo context — the recorder
+// captures the SSE bytes drainStreamPIIToText writes.
+func newTestContext() (echo.Context, *httptest.ResponseRecorder) {
+	req := httptest.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader("{}"))
+	rec := httptest.NewRecorder()
+	return echo.New().NewContext(req, rec), rec
+}
+
+var _ = Describe("drainStreamPIIToText", func() {
+	It("is a no-op when the filter is nil", func() {
+		c, rec := newTestContext()
+		drainStreamPIIToText(c, nil, intPtr(0))
+		Expect(rec.Body.Len()).To(Equal(0), "nil filter wrote %d bytes: %q", rec.Body.Len(), rec.Body.String())
+	})
+
+	It("emits nothing when the drain is empty", func() {
+		// A filter with nothing buffered should not emit a phantom event;
+		// otherwise every non-PII response would close with an empty
+		// text_delta that pollutes downstream parsers.
+		sf := newTestFilter()
+		c, rec := newTestContext()
+		drainStreamPIIToText(c, sf, intPtr(0))
+		Expect(rec.Body.Len()).To(Equal(0), "empty drain wrote %d bytes: %q", rec.Body.Len(), rec.Body.String())
+	})
+
+	It("flushes residual buffered bytes as a text_delta event", func() {
+		sf := newTestFilter()
+		// Push less than holdLen so all bytes are retained until Drain.
+		// "tail" is short enough that no pattern is plausible.
+		out := sf.Push("tail")
+		Expect(out).To(Equal(""), "Push of short text emitted %q; want all bytes held", out)
+
+		c, rec := newTestContext()
+		drainStreamPIIToText(c, sf, intPtr(2))
+
+		body := rec.Body.String()
+		// Wire format: "event: content_block_delta\ndata: {…}\n\n"
+		Expect(body).To(ContainSubstring("event: content_block_delta"))
+		Expect(body).To(ContainSubstring(`"type":"content_block_delta"`))
+		Expect(body).To(ContainSubstring(`"index":2`))
+		Expect(body).To(ContainSubstring(`"text":"tail"`))
+		Expect(body).To(ContainSubstring(`"type":"text_delta"`))
+		Expect(strings.HasSuffix(body, "\n\n")).To(BeTrue(), "SSE event missing trailing blank line: %q", body)
+	})
+
+	It("is idempotent across consecutive drains", func() {
+		// Two consecutive Drains: the filter returns "" the second time,
+		// so the second drainStreamPIIToText must emit nothing. The
+		// production path in messages.go has at least four call sites
+		// that may overlap (currentBlockIndex==0 emergency path + the
+		// unconditional drain near the end of the stream); without
+		// idempotence we'd duplicate the residual on the wire.
+		sf := newTestFilter()
+		sf.Push("tail")
+
+		c1, rec1 := newTestContext()
+		drainStreamPIIToText(c1, sf, intPtr(0))
+		first := rec1.Body.Len()
+		Expect(first).NotTo(Equal(0), "first drain emitted nothing")
+
+		c2, rec2 := newTestContext()
+		drainStreamPIIToText(c2, sf, intPtr(0))
+		Expect(rec2.Body.Len()).To(Equal(0), "second drain wrote %d bytes; want idempotent no-op: %q", rec2.Body.Len(), rec2.Body.String())
+	})
+
+	It("masks redacted residual instead of leaking it", func() {
+		// The held tail must travel through the redactor on Drain. If
+		// the bytes happen to form a complete pattern at end-of-stream,
+		// the residual emit must contain the mask placeholder, not the
+		// raw value.
+		sf := newTestFilter()
+		// "alice@example.com" is 17 bytes. holdLen for default patterns
+		// is well above 17, so this stays buffered until Drain, which
+		// then redacts it.
+		out := sf.Push("alice@example.com")
+		Expect(out).To(Equal(""), "Push emitted bytes early: %q", out)
+
+		c, rec := newTestContext()
+		drainStreamPIIToText(c, sf, intPtr(0))
+		body := rec.Body.String()
+		Expect(body).NotTo(ContainSubstring("alice@example.com"), "raw email leaked in residual emit: %q", body)
+		Expect(body).To(ContainSubstring("[REDACTED:email]"), "residual emit missing mask placeholder: %q", body)
+	})
+})
diff --git a/core/http/endpoints/localai/api_instructions.go b/core/http/endpoints/localai/api_instructions.go
index 103c87443..9eb0095dd 100644
--- a/core/http/endpoints/localai/api_instructions.go
+++ b/core/http/endpoints/localai/api_instructions.go
@@ -92,6 +92,30 @@ var instructionDefs = []instructionDef{
 		Tags:        []string{"branding"},
 		Intro:       "GET /api/branding is public so the login screen can render the configured logo before authentication. Text fields are saved through POST /api/settings; binary assets (logo, horizontal logo, favicon) use multipart upload at /api/branding/asset/{kind} and are served back from /branding/asset/{kind}.",
 	},
+	{
+		Name:        "usage-and-billing",
+		Description: "Per-user token usage and request counts, with optional cost tracking",
+		Tags:        []string{"usage"},
+		Intro:       "GET /api/usage returns the current user's token usage in time-bucketed form (day/week/month/all). In single-user no-auth mode the records are attributed to a synthetic local user with stable UUID, so this endpoint and the dashboard work without --auth. /api/usage/all is the cluster-wide view and requires admin (the local user is admin in single-user mode). UsageRecord fields include RequestedModel/ServedModel and PreFilter/PostFilterPromptTokens for routing- and PII-aware accounting.",
+	},
+	{
+		Name:        "pii-filtering",
+		Description: "Inspect and tune the regex PII filter applied to chat requests",
+		Tags:        []string{"pii"},
+		Intro:       "GET /api/pii/patterns lists the active pattern set with each one's action (mask, block, route_local). GET /api/pii/events returns recent redaction events filtered by correlation_id / user_id / pattern_id (admin or local-user only). POST /api/pii/test dry-runs the redactor against an admin-supplied string. POST /api/pii/decide is the programmatic decision oracle for external routers: send `{text}`, receive `{findings, suggested_action, redacted_preview}` without LocalAI mutating, recording, or acting on the call — caller composes the action with its own policy. Default patterns: email, phone, SSN, credit card (Luhn), IPv4, common API key prefixes (sk-, pk-, ghp_, github_pat_). PII is per-model: by default it is OFF for non-proxy backends and ON for backends starting with proxy-* (cloud passthroughs). Opt in with `pii: { enabled: true }` in a model's YAML; use `pii: { patterns: [{id, action}] }` to upgrade or downgrade individual actions for that model. Override global default actions via --pii-config pii.yaml; --disable-pii turns the filter off entirely.",
+	},
+	{
+		Name:        "middleware-admin",
+		Description: "Inspect and configure the routing-module middleware (PII filter and routing)",
+		Tags:        []string{"middleware", "pii", "router"},
+		Intro:       "GET /api/middleware/status is the single round-trip the /app/middleware admin page reads to render the current state: active PII patterns and their actions, every model's resolved enabled/override state, recent event count, and the active routing models with their classifier configurations. Admin-only (the synthetic local user is admin in no-auth mode). PUT /api/pii/patterns/:id changes a pattern's action in-process — TRANSIENT, lost on restart. To persist, edit --pii-config YAML. GET /api/router/decisions returns the routing decision log filtered by correlation_id / user_id / router_model. The same surface is exposed as MCP tools (`get_middleware_status`, `set_pii_pattern_action`, `get_router_decisions`) for agent-driven configuration.",
+	},
+	{
+		Name:        "intelligent-routing",
+		Description: "Per-model `router:` configuration that classifies requests and rewrites the served model",
+		Tags:        []string{"router"},
+		Intro:       "Add a `router:` block to a ModelConfig to turn it into a routing model. The block declares a classifier (today: `feature` — handcrafted rules over prompt length and code-fence presence), a list of candidates (label + downstream model + optional rule), and a fallback. When a client addresses the routing model, the RouteModel middleware invokes the classifier, picks a candidate, and rewrites input.Model — the standard model-resolution path then runs ACL, disabled-state, and per-model PII against the chosen target. Depth-1 invariant: candidates must NOT themselves carry a `router:` block; runtime check returns 500 on violation. Decisions are logged to GET /api/router/decisions and surfaced in the /app/middleware Routing tab. POST /api/router/decide is the programmatic decision-oracle: external routers (e.g. an organisation-wide router service) send `{router, input}` and receive the classifier's label set + candidate model WITHOUT LocalAI rewriting, forwarding, or recording the call. Shares the classifier cache with the in-band path so warm-up costs are paid once.",
+	},
 }
 
 // swaggerState holds parsed swagger spec data, initialised once.
diff --git a/core/http/endpoints/localai/api_instructions_test.go b/core/http/endpoints/localai/api_instructions_test.go
index 35bdfa239..70ae71765 100644
--- a/core/http/endpoints/localai/api_instructions_test.go
+++ b/core/http/endpoints/localai/api_instructions_test.go
@@ -39,7 +39,7 @@ var _ = Describe("API Instructions Endpoints", func() {
 
 			instructions, ok := resp["instructions"].([]any)
 			Expect(ok).To(BeTrue())
-			Expect(instructions).To(HaveLen(12))
+			Expect(instructions).To(HaveLen(16))
 
 			// Verify each instruction has required fields and correct URL format
 			for _, s := range instructions {
@@ -74,6 +74,10 @@ var _ = Describe("API Instructions Endpoints", func() {
 				"monitoring",
 				"agents",
 				"face-recognition",
+				"usage-and-billing",
+				"pii-filtering",
+				"middleware-admin",
+				"intelligent-routing",
 			))
 		})
 	})
diff --git a/core/http/endpoints/localai/import_model.go b/core/http/endpoints/localai/import_model.go
index f96ecd789..dc225abdd 100644
--- a/core/http/endpoints/localai/import_model.go
+++ b/core/http/endpoints/localai/import_model.go
@@ -173,12 +173,12 @@ func ImportModelEndpoint(cl *config.ModelConfigLoader, appConfig *config.Applica
 		// Validate without calling SetDefaults() — runtime defaults should not
 		// be persisted to disk. SetDefaults() is called when loading configs
 		// for inference via LoadModelConfigsFromPath().
-		if valid, _ := modelConfig.Validate(); !valid {
-			response := ModelResponse{
-				Success: false,
-				Error:   "Invalid configuration",
+		if valid, vErr := modelConfig.Validate(); !valid {
+			msg := "Invalid configuration"
+			if vErr != nil {
+				msg = vErr.Error()
 			}
-			return c.JSON(http.StatusBadRequest, response)
+			return c.JSON(http.StatusBadRequest, ModelResponse{Success: false, Error: msg})
 		}
 
 		// Create the configuration file
diff --git a/core/http/endpoints/localai/mcp.go b/core/http/endpoints/localai/mcp.go
index 541d4963b..a849e8a2f 100644
--- a/core/http/endpoints/localai/mcp.go
+++ b/core/http/endpoints/localai/mcp.go
@@ -61,7 +61,11 @@ func MCPEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 	// The legacy /v1/mcp/chat/completions endpoint never opts into the
 	// in-process LocalAI Assistant tool surface — pass nil holder so the
 	// assistant branch in chat.go is unreachable from this code path.
-	chatHandler := openai.ChatEndpoint(cl, ml, evaluator, appConfig, natsClient, nil)
+	// Stream-side PII filter is also nil: this legacy endpoint pre-dates
+	// the per-model PII config and is kept for backward compatibility.
+	// The request-side middleware on the main chat route handles
+	// filtering for the standard /v1/chat/completions path.
+	chatHandler := openai.ChatEndpoint(cl, ml, evaluator, appConfig, natsClient, nil, nil, nil)
 
 	return func(c echo.Context) error {
 		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
diff --git a/core/http/endpoints/localai/pii_decide.go b/core/http/endpoints/localai/pii_decide.go
new file mode 100644
index 000000000..1b1ac8e94
--- /dev/null
+++ b/core/http/endpoints/localai/pii_decide.go
@@ -0,0 +1,85 @@
+package localai
+
+import (
+	"net/http"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+)
+
+// PIIDecideEndpoint exposes the PII redactor as a decision oracle:
+// scan the supplied text and return findings + the strongest action
+// the configured pattern set would take, without rewriting the
+// caller's request or recording an audit event.
+//
+// External routers (e.g. the localai-org/platform router) call this
+// before dispatching to learn whether to mask the prompt in place,
+// route to a local-only backend, block the request, or pass it
+// through. LocalAI's in-band PII middleware is the alternative path
+// for direct-to-LocalAI clients — same Redactor, different framing.
+//
+// Takes the *pii.Redactor directly rather than the whole
+// *application.Application so the handler stays unit-testable with a
+// freshly-constructed redactor (mirrors the pattern in
+// router_decide.go). The route-registration site is responsible for
+// stubbing this endpoint when --disable-pii is set so callers get a
+// 503 signalling "admin opted out" rather than a misleading allow.
+//
+// @Summary  Scan text for PII and return findings + suggested action (decision oracle)
+// @Tags     pii
+// @Accept   json
+// @Produce  json
+// @Param    request body schema.PIIDecideRequest true "decide params"
+// @Success  200 {object} schema.PIIDecideResponse
+// @Failure  400 {object} map[string]string
+// @Router   /api/pii/decide [post]
+func PIIDecideEndpoint(redactor *pii.Redactor) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		var req schema.PIIDecideRequest
+		if err := c.Bind(&req); err != nil {
+			return echo.NewHTTPError(http.StatusBadRequest, "invalid request body: "+err.Error())
+		}
+		if req.Text == "" {
+			return echo.NewHTTPError(http.StatusBadRequest, "text is required")
+		}
+
+		res := redactor.Redact(req.Text)
+		findings := make([]schema.PIIFinding, len(res.Spans))
+		for i, s := range res.Spans {
+			findings[i] = schema.PIIFinding{
+				Start:      s.Start,
+				End:        s.End,
+				Pattern:    s.Pattern,
+				HashPrefix: s.HashPrefix,
+			}
+		}
+		return c.JSON(http.StatusOK, schema.PIIDecideResponse{
+			Findings:        findings,
+			SuggestedAction: suggestedAction(res),
+			RedactedPreview: res.Redacted,
+		})
+	}
+}
+
+// actionAllow is the wire-only value for "no findings". The other
+// three map to existing pii.Action* constants; allow has no in-band
+// counterpart because the in-band middleware simply passes through.
+const actionAllow = "allow"
+
+// suggestedAction collapses the Redactor's Result flags onto a single
+// wire-format action using the in-band ordering (block > route_local
+// > mask > allow). Spans-without-Blocked-or-LocalOnly means every
+// match resolved to ActionMask.
+func suggestedAction(res pii.Result) string {
+	switch {
+	case res.Blocked:
+		return string(pii.ActionBlock)
+	case res.LocalOnly:
+		return string(pii.ActionRouteLocal)
+	case len(res.Spans) > 0:
+		return string(pii.ActionMask)
+	default:
+		return actionAllow
+	}
+}
diff --git a/core/http/endpoints/localai/pii_decide_test.go b/core/http/endpoints/localai/pii_decide_test.go
new file mode 100644
index 000000000..d91d72834
--- /dev/null
+++ b/core/http/endpoints/localai/pii_decide_test.go
@@ -0,0 +1,107 @@
+package localai_test
+
+import (
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/http/endpoints/localai"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// PIIDecideEndpoint exposes the redactor as a decision oracle. These
+// specs pin the validation surface and the suggested_action mapping
+// across all four actions (allow/mask/route_local/block). The redactor
+// itself is covered in core/services/routing/pii/redactor_test.go.
+
+var _ = Describe("PIIDecideEndpoint", func() {
+	var redactor *pii.Redactor
+
+	BeforeEach(func() {
+		patterns, err := pii.Compile(pii.DefaultPatterns())
+		Expect(err).NotTo(HaveOccurred())
+		redactor = pii.NewRedactor(patterns)
+	})
+
+	It("rejects requests with no text field", func() {
+		rec, _ := invokePIIDecide(redactor, `{}`)
+		Expect(rec.Code).To(Equal(http.StatusBadRequest))
+		Expect(rec.Body.String()).To(ContainSubstring("text is required"))
+	})
+
+	It("rejects malformed JSON", func() {
+		rec, _ := invokePIIDecide(redactor, `not json`)
+		Expect(rec.Code).To(Equal(http.StatusBadRequest))
+	})
+
+	It("returns allow for clean text", func() {
+		rec, body := invokePIIDecide(redactor, `{"text":"hello world"}`)
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(body.SuggestedAction).To(Equal("allow"))
+		Expect(body.Findings).To(BeEmpty())
+		Expect(body.RedactedPreview).To(Equal("hello world"))
+	})
+
+	It("returns mask for text containing email (default action)", func() {
+		rec, body := invokePIIDecide(redactor, `{"text":"reach me at alice@example.com please"}`)
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(body.SuggestedAction).To(Equal("mask"))
+		Expect(body.Findings).To(HaveLen(1))
+		Expect(body.Findings[0].Pattern).To(Equal("email"))
+		Expect(body.Findings[0].HashPrefix).NotTo(BeEmpty())
+		Expect(body.RedactedPreview).To(ContainSubstring("[REDACTED:email]"))
+		Expect(body.RedactedPreview).NotTo(ContainSubstring("alice@example.com"))
+	})
+
+	It("returns block when an api_key_prefix is present (block beats mask)", func() {
+		// api_key_prefix defaults to ActionBlock per DefaultPatterns.
+		// Mix in an email so we also confirm the block-action wins
+		// over the mask-action via actionRank.
+		rec, body := invokePIIDecide(redactor, `{"text":"my key is sk-1234567890abcdefghij and email alice@example.com"}`)
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(body.SuggestedAction).To(Equal("block"))
+		Expect(len(body.Findings)).To(BeNumerically(">=", 1))
+	})
+
+	It("returns route_local when an override sets that action", func() {
+		// Promote the email pattern to route_local for this test —
+		// exercises the route_local branch of suggestedAction without
+		// needing a custom pattern set.
+		Expect(redactor.SetAction("email", pii.ActionRouteLocal)).To(Succeed())
+		rec, body := invokePIIDecide(redactor, `{"text":"contact alice@example.com"}`)
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(body.SuggestedAction).To(Equal("route_local"))
+		// route_local leaves the original text intact — caller decides
+		// whether to forward it to a local-only backend.
+		Expect(body.RedactedPreview).To(ContainSubstring("alice@example.com"))
+	})
+
+	It("never leaks the matched value via HashPrefix", func() {
+		rec, body := invokePIIDecide(redactor, `{"text":"alice@example.com"}`)
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(body.Findings).To(HaveLen(1))
+		// HashPrefix is 8 hex chars of sha256 — definitely not the
+		// matched value, but stable so admins can correlate leaks.
+		Expect(body.Findings[0].HashPrefix).To(HaveLen(8))
+		Expect(body.Findings[0].HashPrefix).NotTo(ContainSubstring("alice"))
+	})
+})
+
+func invokePIIDecide(redactor *pii.Redactor, body string) (*httptest.ResponseRecorder, schema.PIIDecideResponse) {
+	e := echo.New()
+	e.POST("/api/pii/decide", localai.PIIDecideEndpoint(redactor))
+	req := httptest.NewRequest(http.MethodPost, "/api/pii/decide", strings.NewReader(body))
+	req.Header.Set("Content-Type", "application/json")
+	rec := httptest.NewRecorder()
+	e.ServeHTTP(rec, req)
+	var parsed schema.PIIDecideResponse
+	if rec.Code == http.StatusOK {
+		Expect(json.Unmarshal(rec.Body.Bytes(), &parsed)).To(Succeed())
+	}
+	return rec, parsed
+}
diff --git a/core/http/endpoints/localai/router_decide.go b/core/http/endpoints/localai/router_decide.go
new file mode 100644
index 000000000..11fddcf57
--- /dev/null
+++ b/core/http/endpoints/localai/router_decide.go
@@ -0,0 +1,109 @@
+package localai
+
+import (
+	"net/http"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/middleware"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/router"
+)
+
+// RouterDecideEndpoint exposes the routing classifier as a decision
+// oracle: given a router model and a prompt, it runs the same
+// classifier the in-band RouteModel middleware would have run, returns
+// the active label set, and resolves which candidate model would have
+// been picked. It does NOT rewrite anything, forward to a backend, or
+// write to the decision store — Platform-side routers call this to get
+// LocalAI's opinion without committing LocalAI to handle the request.
+//
+// The classifier is shared with the in-band middleware via the
+// process-wide router.Registry on deps, so this endpoint and the
+// request path agree on cache state, embedding-cache hits, etc.
+//
+// Takes discrete deps rather than the whole *application.Application so
+// it stays unit-testable with a stub Scorer and a tmpdir-backed model
+// loader (mirrors the existing route_model_test.go setup).
+//
+// @Summary  Classify a prompt against a router model's policies (decision oracle)
+// @Tags     router
+// @Accept   json
+// @Produce  json
+// @Param    request body schema.RouterDecideRequest true "decide params"
+// @Success  200 {object} schema.RouterDecideResponse
+// @Failure  400 {object} map[string]string
+// @Failure  404 {object} map[string]string
+// @Failure  500 {object} map[string]string
+// @Failure  503 {object} map[string]string
+// @Router   /api/router/decide [post]
+func RouterDecideEndpoint(loader *config.ModelConfigLoader, appConfig *config.ApplicationConfig, deps middleware.ClassifierDeps) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		var req schema.RouterDecideRequest
+		if err := c.Bind(&req); err != nil {
+			return echo.NewHTTPError(http.StatusBadRequest, "invalid request body: "+err.Error())
+		}
+		if req.Router == "" {
+			return echo.NewHTTPError(http.StatusBadRequest, "router is required")
+		}
+		if req.Input == "" {
+			return echo.NewHTTPError(http.StatusBadRequest, "input is required")
+		}
+
+		cfg, err := loader.LoadModelConfigFileByNameDefaultOptions(req.Router, appConfig)
+		if err != nil {
+			return echo.NewHTTPError(http.StatusInternalServerError, "failed to load model config: "+err.Error())
+		}
+		// LoadModelConfigFileByName returns a synthetic stub
+		// (PredictionOptions.Model only, no Name) when neither an
+		// in-memory config nor a YAML file exists for the requested
+		// name. Use Name to discriminate "model unknown" (404) from
+		// "model known but not a router" (400) — Platform wants both
+		// signals.
+		if cfg == nil || cfg.Name == "" {
+			return echo.NewHTTPError(http.StatusNotFound, "router model not found: "+req.Router)
+		}
+		if !cfg.HasRouter() {
+			return echo.NewHTTPError(http.StatusBadRequest, "model "+req.Router+" is not a router (no `router:` block)")
+		}
+
+		// Build (or reuse) the classifier via the same registry the
+		// in-band middleware uses. Errors here are config problems —
+		// classifier_model missing, policy without description, etc. —
+		// so 503 is the right status: the router is configured but its
+		// classifier can't be instantiated right now.
+		classifier, err := middleware.GetOrBuildClassifier(deps.Registry, cfg, deps)
+		if err != nil {
+			return echo.NewHTTPError(http.StatusServiceUnavailable, "classifier unavailable: "+err.Error())
+		}
+
+		decision, err := classifier.Classify(c.Request().Context(), router.Probe{Prompt: req.Input})
+		if err != nil {
+			return echo.NewHTTPError(http.StatusInternalServerError, "classify failed: "+err.Error())
+		}
+
+		candidate := router.MatchCandidate(cfg.Router.Candidates, decision.Labels)
+		fallback := false
+		if candidate == "" && cfg.Router.Fallback != "" {
+			candidate = cfg.Router.Fallback
+			fallback = true
+		}
+
+		classifierName := cfg.Router.Classifier
+		if classifierName == "" {
+			classifierName = router.ClassifierScore
+		}
+
+		return c.JSON(http.StatusOK, schema.RouterDecideResponse{
+			Router:          req.Router,
+			Classifier:      classifierName,
+			Labels:          decision.Labels,
+			Candidate:       candidate,
+			Fallback:        fallback,
+			Score:           decision.Score,
+			LatencyMs:       decision.Latency.Milliseconds(),
+			Cached:          decision.Cached,
+			CacheSimilarity: decision.CacheSimilarity,
+		})
+	}
+}
diff --git a/core/http/endpoints/localai/router_decide_test.go b/core/http/endpoints/localai/router_decide_test.go
new file mode 100644
index 000000000..d63cd091d
--- /dev/null
+++ b/core/http/endpoints/localai/router_decide_test.go
@@ -0,0 +1,248 @@
+package localai_test
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/endpoints/localai"
+	"github.com/mudler/LocalAI/core/http/middleware"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/router"
+	"github.com/mudler/LocalAI/pkg/system"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"gopkg.in/yaml.v3"
+)
+
+// RouterDecideEndpoint is the programmatic decision oracle that
+// external routers call to get LocalAI's classifier opinion without
+// committing LocalAI to handle the request. These specs pin the
+// validation surface and the happy-path / fallback / depth-1
+// behaviours; the classifier itself is covered in
+// core/services/routing/router/score_test.go and the in-band
+// middleware is covered in core/http/middleware/route_model_test.go.
+
+var _ = Describe("RouterDecideEndpoint", func() {
+	var (
+		modelDir  string
+		appConfig *config.ApplicationConfig
+		loader    *config.ModelConfigLoader
+	)
+
+	BeforeEach(func() {
+		d, err := os.MkdirTemp("", "router-decide-test-*")
+		Expect(err).NotTo(HaveOccurred())
+		modelDir = d
+		appConfig = &config.ApplicationConfig{
+			Context:     context.Background(),
+			SystemState: &system.SystemState{Model: system.Model{ModelsPath: modelDir}},
+		}
+		loader = config.NewModelConfigLoader(modelDir)
+	})
+
+	AfterEach(func() {
+		_ = os.RemoveAll(modelDir)
+	})
+
+	It("rejects requests with no router field", func() {
+		rec, _ := invokeDecide(loader, appConfig, deps(nil), `{"input":"hello"}`)
+		Expect(rec.Code).To(Equal(http.StatusBadRequest))
+		Expect(rec.Body.String()).To(ContainSubstring("router is required"))
+	})
+
+	It("rejects requests with no input field", func() {
+		rec, _ := invokeDecide(loader, appConfig, deps(nil), `{"router":"smart-router"}`)
+		Expect(rec.Code).To(Equal(http.StatusBadRequest))
+		Expect(rec.Body.String()).To(ContainSubstring("input is required"))
+	})
+
+	It("returns 404 for an unknown router model", func() {
+		rec, _ := invokeDecide(loader, appConfig, deps(nil), `{"router":"missing","input":"hello"}`)
+		Expect(rec.Code).To(Equal(http.StatusNotFound))
+		Expect(rec.Body.String()).To(ContainSubstring("router model not found"))
+	})
+
+	It("returns 400 when the named model has no router block", func() {
+		writeBareModel(modelDir, "plain-model")
+		rec, _ := invokeDecide(loader, appConfig, deps(nil), `{"router":"plain-model","input":"hello"}`)
+		Expect(rec.Code).To(Equal(http.StatusBadRequest))
+		Expect(rec.Body.String()).To(ContainSubstring("is not a router"))
+	})
+
+	It("returns 503 when the classifier can't be built (no scorer wired)", func() {
+		writeScoreRouter(modelDir, "smart-router")
+		writeBareModel(modelDir, "small-model")
+		writeBareModel(modelDir, "big-model")
+		// deps(nil) provides no scorer — buildClassifier returns an
+		// error and the handler maps that to 503.
+		rec, _ := invokeDecide(loader, appConfig, deps(nil), `{"router":"smart-router","input":"hello"}`)
+		Expect(rec.Code).To(Equal(http.StatusServiceUnavailable))
+		Expect(rec.Body.String()).To(ContainSubstring("classifier unavailable"))
+	})
+
+	It("returns the picked candidate when one covers the active labels", func() {
+		writeScoreRouter(modelDir, "smart-router")
+		writeBareModel(modelDir, "small-model")
+		writeBareModel(modelDir, "big-model")
+		scorer := &stubScorer{labelToLogProb: map[string]float64{
+			"code-generation": -0.05, // dominant
+			"casual-chat":     -3.0,
+			"math-reasoning":  -4.0,
+		}}
+		rec, body := invokeDecide(loader, appConfig, deps(scorer), `{"router":"smart-router","input":"debug my Go null pointer"}`)
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(body.Candidate).To(Equal("big-model"))
+		Expect(body.Fallback).To(BeFalse())
+		Expect(body.Labels).To(ContainElement("code-generation"))
+		Expect(body.Classifier).To(Equal(router.ClassifierScore))
+		Expect(body.Score).To(BeNumerically(">", 0))
+	})
+
+	It("returns the fallback when no candidate covers the active labels", func() {
+		// The router declares a label `math-reasoning` but no
+		// candidate carries it — only small=[casual-chat] and
+		// big=[code-generation, casual-chat]. A classifier output of
+		// "math-reasoning" forces the fallback path.
+		writeRouterNoFallbackCover(modelDir, "smart-router")
+		writeBareModel(modelDir, "small-model")
+		writeBareModel(modelDir, "big-model")
+		writeBareModel(modelDir, "fallback-model")
+		scorer := &stubScorer{labelToLogProb: map[string]float64{
+			"math-reasoning":  -0.05,
+			"code-generation": -3.0,
+			"casual-chat":     -4.0,
+		}}
+		rec, body := invokeDecide(loader, appConfig, deps(scorer), `{"router":"smart-router","input":"3 apples cost $2.40"}`)
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(body.Candidate).To(Equal("fallback-model"))
+		Expect(body.Fallback).To(BeTrue())
+		Expect(body.Labels).To(ContainElement("math-reasoning"))
+	})
+})
+
+// stubScorer mirrors the one in core/http/middleware/route_model_test.go.
+// Duplicated rather than exported because Go test helpers don't cross
+// _test.go package boundaries and exporting test-only types would
+// pollute the production surface.
+type stubScorer struct {
+	labelToLogProb map[string]float64
+}
+
+func (s *stubScorer) Score(_ context.Context, _ string, candidates []string) ([]backend.CandidateScore, error) {
+	out := make([]backend.CandidateScore, len(candidates))
+	for i, c := range candidates {
+		// Candidate is the Arch-Router JSON envelope
+		// `{"route": "<label>"}<stop>`; match against the full
+		// envelope so overlapping labels (e.g. `code` vs
+		// `code-generation`) can't collide under Go's randomised map
+		// iteration. Without this the lookup misses on every
+		// candidate and softmax flattens, making assertions pass for
+		// accidental reasons.
+		var lp float64
+		for label, v := range s.labelToLogProb {
+			if strings.Contains(c, `{"route": "`+label+`"}`) {
+				lp = v
+				break
+			}
+		}
+		out[i] = backend.CandidateScore{
+			LogProb:                 lp * 2,
+			LengthNormalizedLogProb: lp,
+			NumTokens:               2,
+		}
+	}
+	return out, nil
+}
+
+// deps wires a ClassifierDeps with a fresh registry and (optionally) a
+// stub scorer. Nil scorer is used to exercise the unavailable path.
+func deps(s *stubScorer) middleware.ClassifierDeps {
+	var scorer middleware.ScorerFactory
+	if s != nil {
+		scorer = func(string) backend.Scorer { return s }
+	}
+	return middleware.ClassifierDeps{
+		Scorer:   scorer,
+		Registry: router.NewRegistry(),
+	}
+}
+
+func invokeDecide(loader *config.ModelConfigLoader, appConfig *config.ApplicationConfig, d middleware.ClassifierDeps, body string) (*httptest.ResponseRecorder, schema.RouterDecideResponse) {
+	// Route through echo's mux so the default HTTPErrorHandler
+	// serialises echo.HTTPError into the response body. Calling the
+	// handler directly with a fresh Context skips that step and
+	// leaves the recorder empty on errors.
+	e := echo.New()
+	e.POST("/api/router/decide", localai.RouterDecideEndpoint(loader, appConfig, d))
+	req := httptest.NewRequest(http.MethodPost, "/api/router/decide", strings.NewReader(body))
+	req.Header.Set("Content-Type", "application/json")
+	rec := httptest.NewRecorder()
+	e.ServeHTTP(rec, req)
+	var parsed schema.RouterDecideResponse
+	if rec.Code == http.StatusOK {
+		Expect(json.Unmarshal(rec.Body.Bytes(), &parsed)).To(Succeed())
+	}
+	return rec, parsed
+}
+
+func writeScoreRouter(modelDir, name string) {
+	cfg := &config.ModelConfig{
+		Name: name,
+		Router: config.RouterConfig{
+			Classifier:      "score",
+			ClassifierModel: "arch-router",
+			Fallback:        "small-model",
+			Policies: []config.RouterPolicy{
+				{Label: "code-generation", Description: "writing or debugging code"},
+				{Label: "casual-chat", Description: "small talk"},
+				{Label: "math-reasoning", Description: "arithmetic and word problems"},
+			},
+			Candidates: []config.RouterCandidate{
+				{Model: "small-model", Labels: []string{"casual-chat"}},
+				{Model: "big-model", Labels: []string{"code-generation", "casual-chat", "math-reasoning"}},
+			},
+		},
+	}
+	b, err := yaml.Marshal(cfg)
+	Expect(err).NotTo(HaveOccurred())
+	Expect(os.WriteFile(filepath.Join(modelDir, name+".yaml"), b, 0o644)).To(Succeed())
+}
+
+// writeRouterNoFallbackCover declares math-reasoning as a policy but
+// has no candidate covering it. Combined with Fallback=fallback-model,
+// a math-reasoning classification forces the fallback branch.
+func writeRouterNoFallbackCover(modelDir, name string) {
+	cfg := &config.ModelConfig{
+		Name: name,
+		Router: config.RouterConfig{
+			Classifier:      "score",
+			ClassifierModel: "arch-router",
+			Fallback:        "fallback-model",
+			Policies: []config.RouterPolicy{
+				{Label: "code-generation", Description: "writing or debugging code"},
+				{Label: "casual-chat", Description: "small talk"},
+				{Label: "math-reasoning", Description: "arithmetic and word problems"},
+			},
+			Candidates: []config.RouterCandidate{
+				{Model: "small-model", Labels: []string{"casual-chat"}},
+				{Model: "big-model", Labels: []string{"code-generation", "casual-chat"}},
+			},
+		},
+	}
+	b, err := yaml.Marshal(cfg)
+	Expect(err).NotTo(HaveOccurred())
+	Expect(os.WriteFile(filepath.Join(modelDir, name+".yaml"), b, 0o644)).To(Succeed())
+}
+
+func writeBareModel(modelDir, name string) {
+	body := "name: " + name + "\nbackend: mock-backend\n"
+	Expect(os.WriteFile(filepath.Join(modelDir, name+".yaml"), []byte(body), 0o644)).To(Succeed())
+}
diff --git a/core/http/endpoints/localai/score.go b/core/http/endpoints/localai/score.go
new file mode 100644
index 000000000..cfbdd1d42
--- /dev/null
+++ b/core/http/endpoints/localai/score.go
@@ -0,0 +1,90 @@
+package localai
+
+import (
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/model"
+)
+
+// ScoreRequest is the wire format for POST /api/score. Mirrors the
+// gRPC ScoreRequest one-to-one — the endpoint exists primarily to
+// smoke-test the new Score primitive end-to-end without writing a
+// custom gRPC client. Production routing will call backend.ModelScore
+// directly via the router-side adapter.
+type ScoreRequest struct {
+	Model                string   `json:"model"`
+	Prompt               string   `json:"prompt"`
+	Candidates           []string `json:"candidates"`
+	IncludeTokenLogprobs bool     `json:"include_token_logprobs,omitempty"`
+	LengthNormalize      bool     `json:"length_normalize,omitempty"`
+}
+
+type ScoreResponseCandidate struct {
+	LogProb                 float64        `json:"log_prob"`
+	LengthNormalizedLogProb float64        `json:"length_normalized_log_prob,omitempty"`
+	NumTokens               int            `json:"num_tokens"`
+	Tokens                  []ScoreTokenLP `json:"tokens,omitempty"`
+}
+
+type ScoreTokenLP struct {
+	Token   string  `json:"token"`
+	LogProb float64 `json:"log_prob"`
+}
+
+type ScoreResponse struct {
+	Model      string                   `json:"model"`
+	Candidates []ScoreResponseCandidate `json:"candidates"`
+}
+
+// ScoreEndpoint exposes the Score gRPC primitive over HTTP. Admin-only —
+// scoring loads a model and runs inference, same risk surface as
+// /v1/chat/completions.
+func ScoreEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		var req ScoreRequest
+		if err := c.Bind(&req); err != nil {
+			return echo.NewHTTPError(400, "invalid request body: "+err.Error())
+		}
+		if req.Model == "" {
+			return echo.NewHTTPError(400, "model is required")
+		}
+		if len(req.Candidates) == 0 {
+			return echo.NewHTTPError(400, "candidates must be non-empty")
+		}
+
+		modelConfig, err := cl.LoadModelConfigFileByNameDefaultOptions(req.Model, appConfig)
+		if err != nil || modelConfig == nil {
+			return echo.NewHTTPError(404, "model not found: "+req.Model)
+		}
+
+		fn, err := backend.ModelScore(req.Prompt, req.Candidates, backend.ScoreOptions{
+			IncludeTokenLogprobs: req.IncludeTokenLogprobs,
+			LengthNormalize:      req.LengthNormalize,
+		}, ml, *modelConfig, appConfig)
+		if err != nil {
+			return echo.NewHTTPError(500, "failed to bind scorer: "+err.Error())
+		}
+		results, err := fn(c.Request().Context())
+		if err != nil {
+			return echo.NewHTTPError(500, "score call failed: "+err.Error())
+		}
+
+		out := ScoreResponse{Model: req.Model, Candidates: make([]ScoreResponseCandidate, len(results))}
+		for i, r := range results {
+			out.Candidates[i] = ScoreResponseCandidate{
+				LogProb:                 r.LogProb,
+				LengthNormalizedLogProb: r.LengthNormalizedLogProb,
+				NumTokens:               r.NumTokens,
+			}
+			if req.IncludeTokenLogprobs && len(r.Tokens) > 0 {
+				toks := make([]ScoreTokenLP, len(r.Tokens))
+				for j, t := range r.Tokens {
+					toks[j] = ScoreTokenLP{Token: t.Token, LogProb: t.LogProb}
+				}
+				out.Candidates[i].Tokens = toks
+			}
+		}
+		return c.JSON(200, out)
+	}
+}
diff --git a/core/http/endpoints/localai/settings.go b/core/http/endpoints/localai/settings.go
index 0e29e39cc..1db87e313 100644
--- a/core/http/endpoints/localai/settings.go
+++ b/core/http/endpoints/localai/settings.go
@@ -253,6 +253,16 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
 			}
 		}
 
+		if settings.MITMListen != nil {
+			if err := app.RestartMITM(); err != nil {
+				xlog.Error("Failed to restart MITM proxy", "error", err)
+				return c.JSON(http.StatusInternalServerError, schema.SettingsResponse{
+					Success: false,
+					Error:   "Settings saved but failed to restart MITM proxy: " + err.Error(),
+				})
+			}
+		}
+
 		// Restart P2P if P2P settings changed
 		p2pChanged := settings.P2PToken != nil || settings.P2PNetworkID != nil || settings.Federated != nil
 		if p2pChanged {
diff --git a/core/http/endpoints/mcp/localai_assistant_test.go b/core/http/endpoints/mcp/localai_assistant_test.go
index bf701b0e9..a37e3234e 100644
--- a/core/http/endpoints/mcp/localai_assistant_test.go
+++ b/core/http/endpoints/mcp/localai_assistant_test.go
@@ -74,6 +74,34 @@ func (stubClient) GetBranding(_ context.Context) (*localaitools.Branding, error)
 func (stubClient) SetBranding(_ context.Context, _ localaitools.SetBrandingRequest) (*localaitools.Branding, error) {
 	return &localaitools.Branding{InstanceName: "LocalAI"}, nil
 }
+func (stubClient) GetUsageStats(_ context.Context, _ localaitools.UsageStatsQuery) (*localaitools.UsageStats, error) {
+	return &localaitools.UsageStats{Viewer: localaitools.UsageViewer{ID: "stub", Name: "stub"}, Period: "month"}, nil
+}
+func (stubClient) ListPIIPatterns(_ context.Context) ([]localaitools.PIIPattern, error) {
+	return nil, nil
+}
+func (stubClient) GetPIIEvents(_ context.Context, _ localaitools.PIIEventsQuery) ([]localaitools.PIIEvent, error) {
+	return nil, nil
+}
+func (stubClient) TestPIIRedaction(_ context.Context, req localaitools.PIIRedactTestRequest) (*localaitools.PIIRedactTestResult, error) {
+	return &localaitools.PIIRedactTestResult{Redacted: req.Text}, nil
+}
+func (stubClient) SetPIIPatternAction(_ context.Context, _ localaitools.PIIPatternActionUpdate) error {
+	return nil
+}
+func (stubClient) PersistPIIPatterns(_ context.Context) error { return nil }
+func (stubClient) GetMiddlewareStatus(_ context.Context) (*localaitools.MiddlewareStatus, error) {
+	return &localaitools.MiddlewareStatus{
+		PII: localaitools.MiddlewarePIIStatus{
+			EnabledGlobally: true,
+			Patterns:        []localaitools.PIIPattern{},
+			Models:          []localaitools.MiddlewarePIIModel{},
+		},
+	}, nil
+}
+func (stubClient) GetRouterDecisions(_ context.Context, _ localaitools.RouterDecisionsQuery) ([]localaitools.RouterDecision, error) {
+	return []localaitools.RouterDecision{}, nil
+}
 
 var _ = Describe("LocalAIAssistantHolder", func() {
 	var ctx context.Context
diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
index db716e4b7..ca1a76ec5 100644
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -13,6 +13,8 @@ import (
 	mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/cloudproxy"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/LocalAI/pkg/functions"
 	reason "github.com/mudler/LocalAI/pkg/reasoning"
 
@@ -72,7 +74,7 @@ func mergeToolCallDeltas(existing []schema.ToolCall, deltas []schema.ToolCall) [
 // @Param request body schema.OpenAIRequest true "query params"
 // @Success 200 {object} schema.OpenAIResponse "Response"
 // @Router /v1/chat/completions [post]
-func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, startupOptions *config.ApplicationConfig, natsClient mcpTools.MCPNATSClient, assistantHolder *mcpTools.LocalAIAssistantHolder) echo.HandlerFunc {
+func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, startupOptions *config.ApplicationConfig, natsClient mcpTools.MCPNATSClient, assistantHolder *mcpTools.LocalAIAssistantHolder, piiRedactor *pii.Redactor, piiEvents pii.EventStore) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		var textContentToReturn string
 		id := uuid.New().String()
@@ -92,6 +94,15 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 
 		xlog.Debug("Chat endpoint configuration read", "config", config)
 
+		// Cloud-proxy bail. Bypasses the local pipeline (templating,
+		// MCP injection, gRPC backend) and forwards via the cloud-
+		// proxy backend, which does the outbound HTTP. The streaming
+		// PII filter still runs because its input is per-token text
+		// extracted from the wire envelope, not the envelope itself.
+		if config.IsCloudProxyBackendPassthrough() {
+			return forwardCloudProxyOpenAIViaBackend(c, config, input, piiRedactor, piiEvents, ml, startupOptions)
+		}
+
 		funcs := input.Functions
 		shouldUseFn := len(input.Functions) > 0 && config.ShouldUseFunctions()
 		strictMode := false
@@ -326,6 +337,14 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 			c.Response().Header().Set("Connection", "keep-alive")
 			c.Response().Header().Set("X-Correlation-ID", id)
 
+			// Per-stream PII filter: when the resolved model has PII
+			// enabled, wrap the response content so values spanning
+			// chunk boundaries still get masked. Shared with the
+			// cloud-proxy bail below via cloudproxy.BuildStreamFilter
+			// so both paths apply the same per-model gate and override
+			// rules.
+			streamPIIFilter := cloudproxy.BuildStreamFilter(c, config, true, piiRedactor, piiEvents, id)
+
 			mcpStreamMaxIterations := 10
 			if config.Agent.MaxIterations > 0 {
 				mcpStreamMaxIterations = config.Agent.MaxIterations
@@ -377,12 +396,52 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 								collectedToolCalls = mergeToolCallDeltas(collectedToolCalls, ev.Choices[0].Delta.ToolCalls)
 							}
 						}
-						// Collect content for MCP conversation history and automatic tool parsing fallback
-						if (hasMCPToolsStream || config.FunctionsConfig.AutomaticToolParsingFallback) && ev.Choices[0].Delta != nil && ev.Choices[0].Delta.Content != nil {
-							if s, ok := ev.Choices[0].Delta.Content.(string); ok {
-								collectedContent += s
-							} else if sp, ok := ev.Choices[0].Delta.Content.(*string); ok && sp != nil {
-								collectedContent += *sp
+						// Extract the raw content delta string once per chunk;
+						// both the MCP collector and the PII filter need it
+						// and the type-switch is otherwise duplicated.
+						var rawContent string
+						haveContent := false
+						if ev.Choices[0].Delta != nil && ev.Choices[0].Delta.Content != nil {
+							switch v := ev.Choices[0].Delta.Content.(type) {
+							case string:
+								rawContent = v
+								haveContent = true
+							case *string:
+								if v != nil {
+									rawContent = *v
+									haveContent = true
+								}
+							}
+						}
+						// Collect content for MCP conversation history and automatic tool parsing fallback.
+						// We collect the RAW (unfiltered) content so the model's tool-call
+						// markup keeps parsing correctly even when PII redaction would mask
+						// substrings.
+						if (hasMCPToolsStream || config.FunctionsConfig.AutomaticToolParsingFallback) && haveContent {
+							collectedContent += rawContent
+						}
+						// Stream-side PII filter: feed the content delta
+						// through the buffered-emit filter. The filter
+						// holds back a tail to handle pattern boundaries
+						// across chunks, so a Push may legitimately
+						// return "" — drop the chunk in that case rather
+						// than emitting an empty Delta to the wire.
+						if streamPIIFilter != nil && haveContent {
+							filtered := streamPIIFilter.Push(rawContent)
+							if filtered == "" {
+								// Fully buffered — skip this chunk's
+								// content. Still emit non-content chunks
+								// (role, tool_calls). When this delta is
+								// content-only and we buffer it, drop the
+								// whole event to avoid a vestigial
+								// {"delta":{}} on the wire.
+								if ev.Choices[0].Delta.Role == "" && len(ev.Choices[0].Delta.ToolCalls) == 0 && ev.Choices[0].Delta.Reasoning == nil {
+									continue
+								}
+								// Mixed delta — strip content, keep the rest.
+								ev.Choices[0].Delta.Content = nil
+							} else {
+								ev.Choices[0].Delta.Content = filtered
 							}
 						}
 						respData, err := json.Marshal(ev)
@@ -529,6 +588,31 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					}
 				}
 
+				// Drain the per-stream PII filter before the stop chunk
+				// so any text held back by the buffered-emit invariant
+				// reaches the client as a regular content delta. We
+				// emit it as a chunk WITHOUT a finish_reason so the
+				// next "stop" chunk still terminates the stream.
+				if streamPIIFilter != nil {
+					residual := streamPIIFilter.Drain()
+					if residual != "" {
+						drainResp := &schema.OpenAIResponse{
+							ID:      id,
+							Created: created,
+							Model:   input.Model,
+							Choices: []schema.Choice{{
+								Delta: &schema.Message{Content: residual},
+								Index: 0,
+							}},
+							Object: "chat.completion.chunk",
+						}
+						if drainBytes, err := json.Marshal(drainResp); err == nil {
+							_, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", drainBytes)
+							c.Response().Flush()
+						}
+					}
+				}
+
 				// No MCP tools to execute, send final stop message
 				finishReason := FinishReasonStop
 				if toolsCalled && len(input.Tools) > 0 {
@@ -553,6 +637,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					Object: "chat.completion.chunk",
 				}
 				respData, _ := json.Marshal(resp)
+
+				middleware.StampUsage(c, input.Model, finalUsage.Prompt, finalUsage.Completion)
+
 				fmt.Fprintf(c.Response().Writer, "data: %s\n\n", respData)
 
 				// Trailing usage chunk per OpenAI spec: emit only when the
@@ -935,6 +1022,8 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 				respData, _ := json.Marshal(resp)
 				xlog.Debug("Response", "response", string(respData))
 
+				middleware.StampUsage(c, input.Model, usage.PromptTokens, usage.CompletionTokens)
+
 				// Return the prediction in the response body
 				return c.JSON(200, resp)
 			} // end MCP iteration loop
@@ -981,3 +1070,20 @@ func handleQuestion(config *config.ModelConfig, funcResults []functions.FuncCall
 
 	return "", nil
 }
+
+// forwardCloudProxyOpenAIViaBackend marshals the OpenAI request,
+// constructs the streaming PII filter (when this model has PII
+// enabled), and hands off to the cloud-proxy gRPC backend which does
+// the outbound HTTP. The chat endpoint owns the body+filter
+// construction because it's the only place the request lands as a
+// parsed *schema.OpenAIRequest.
+func forwardCloudProxyOpenAIViaBackend(c echo.Context, cfg *config.ModelConfig, input *schema.OpenAIRequest, piiRedactor *pii.Redactor, piiEvents pii.EventStore, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error {
+	body, err := json.Marshal(input)
+	if err != nil {
+		return echo.NewHTTPError(http.StatusBadRequest, "cloudproxy: marshal request: "+err.Error())
+	}
+
+	correlationID := c.Response().Header().Get("X-Correlation-ID")
+	streamFilter := cloudproxy.BuildStreamFilter(c, cfg, input.Stream, piiRedactor, piiEvents, correlationID)
+	return cloudproxy.ForwardViaBackend(c, cfg, body, streamFilter, ml, appConfig)
+}
diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go
index f81e13e6a..fdcd310cf 100644
--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -9,10 +9,12 @@ import (
 	"github.com/labstack/echo/v4"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/auth"
 	"github.com/mudler/LocalAI/core/http/middleware"
 
 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
@@ -25,7 +27,7 @@ import (
 // @Param request body schema.OpenAIRequest true "query params"
 // @Success 200 {object} schema.OpenAIResponse "Response"
 // @Router /v1/completions [post]
-func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig, piiRedactor *pii.Redactor, piiEvents pii.EventStore) echo.HandlerFunc {
 	process := func(id string, s string, req *schema.OpenAIRequest, config *config.ModelConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool) error {
 		tokenCallback := func(s string, tokenUsage backend.TokenUsage) bool {
 			created := int(time.Now().Unix())
@@ -111,6 +113,31 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 				return errors.New("cannot handle more than 1 `PromptStrings` when Streaming")
 			}
 
+			// Per-stream PII filter — same gating as chat. /v1/completions
+			// has no chat-message structure, so request-side PII isn't
+			// wired here, but the response-side filter still catches PII
+			// trained into the model. Filter is nil when this model has
+			// PII disabled.
+			var streamPIIFilter *pii.StreamFilter
+			if piiRedactor != nil && config.PIIIsEnabled() {
+				correlationID := id
+				userID := ""
+				if u := auth.GetUser(c); u != nil {
+					userID = u.ID
+				}
+				var overrides map[string]pii.Action
+				if raw := config.PIIPatternOverrides(); len(raw) > 0 {
+					overrides = make(map[string]pii.Action, len(raw))
+					for ovid, action := range raw {
+						switch pii.Action(action) {
+						case pii.ActionMask, pii.ActionBlock, pii.ActionRouteLocal:
+							overrides[ovid] = pii.Action(action)
+						}
+					}
+				}
+				streamPIIFilter = pii.NewStreamFilter(piiRedactor, overrides, piiEvents, correlationID, userID)
+			}
+
 			predInput := config.PromptStrings[0]
 
 			templatedInput, err := evaluator.EvaluateTemplateForPrompt(templates.CompletionPromptTemplate, *config, templates.PromptTemplateData{
@@ -143,12 +170,28 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 					}
 					// Capture running cumulative usage for the optional trailer
 					// emitted after the final stop chunk when include_usage=true.
+					// Done before the PII filter so a fully-buffered chunk
+					// (which we drop from the wire) still contributes to the
+					// running total.
 					if ev.Usage != nil {
 						latestUsage = ev.Usage
 					}
 					// OpenAI streaming spec: intermediate chunks must NOT
 					// carry a `usage` field. Strip the tracking copy now.
 					ev.Usage = nil
+					// Run the per-chunk text through the streaming PII
+					// filter. The filter holds back a tail to handle
+					// pattern boundaries, so a Push may legitimately
+					// return "" — drop the chunk's text rather than
+					// emitting a 0-token delta. Choice.Text is the only
+					// content surface in /v1/completions chunks.
+					if streamPIIFilter != nil && ev.Choices[0].Text != "" {
+						filtered := streamPIIFilter.Push(ev.Choices[0].Text)
+						if filtered == "" {
+							continue
+						}
+						ev.Choices[0].Text = filtered
+					}
 					respData, err := json.Marshal(ev)
 					if err != nil {
 						xlog.Debug("Failed to marshal response", "error", err)
@@ -194,6 +237,25 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 				}
 			}
 
+			// Flush any residual the streaming PII filter held back as
+			// part of its trailing pattern-window. Emit it as one final
+			// text-bearing chunk before the synthetic stop chunk so the
+			// completion body remains a contiguous text stream.
+			if streamPIIFilter != nil {
+				if residual := streamPIIFilter.Drain(); residual != "" {
+					residualResp := schema.OpenAIResponse{
+						ID:      id,
+						Created: created,
+						Model:   input.Model,
+						Choices: []schema.Choice{{Index: 0, Text: residual}},
+						Object:  "text_completion",
+					}
+					if data, err := json.Marshal(residualResp); err == nil {
+						_, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", string(data))
+					}
+				}
+			}
+
 			stopReason := FinishReasonStop
 			resp := &schema.OpenAIResponse{
 				ID:      id,
@@ -208,6 +270,14 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 				Object: "text_completion",
 			}
 			respData, _ := json.Marshal(resp)
+
+			pt, ct := 0, 0
+			if latestUsage != nil {
+				pt = latestUsage.PromptTokens
+				ct = latestUsage.CompletionTokens
+			}
+			middleware.StampUsage(c, input.Model, pt, ct)
+
 			fmt.Fprintf(c.Response().Writer, "data: %s\n\n", respData)
 
 			// Trailing usage chunk per OpenAI spec: emit only when the caller
@@ -274,6 +344,8 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 		jsonResult, _ := json.Marshal(resp)
 		xlog.Debug("Response", "response", string(jsonResult))
 
+		middleware.StampUsage(c, input.Model, totalTokenUsage.Prompt, totalTokenUsage.Completion)
+
 		// Return the prediction in the response body
 		return c.JSON(200, resp)
 	}
diff --git a/core/http/endpoints/openai/edit.go b/core/http/endpoints/openai/edit.go
index 9a5198916..5258fddb1 100644
--- a/core/http/endpoints/openai/edit.go
+++ b/core/http/endpoints/openai/edit.go
@@ -98,6 +98,8 @@ func EditEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		jsonResult, _ := json.Marshal(resp)
 		xlog.Debug("Response", "response", string(jsonResult))
 
+		middleware.StampUsage(c, input.Model, totalTokenUsage.Prompt, totalTokenUsage.Completion)
+
 		// Return the prediction in the response body
 		return c.JSON(200, resp)
 	}
diff --git a/core/http/endpoints/openai/embeddings.go b/core/http/endpoints/openai/embeddings.go
index 517881f66..96fd5efc8 100644
--- a/core/http/endpoints/openai/embeddings.go
+++ b/core/http/endpoints/openai/embeddings.go
@@ -102,6 +102,15 @@ func EmbeddingsEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app
 		jsonResult, _ := json.Marshal(resp)
 		xlog.Debug("Response", "response", string(jsonResult))
 
+		// LocalAI's embeddings endpoint does not currently track per-call
+		// token counts (the gRPC Embedding RPC returns a vector, not a
+		// usage block), so we stamp with zeros. The point of stamping is
+		// that the billing pipeline still sees the request and emits the
+		// localai_billed_requests_total counter; without this the call
+		// would be silently dropped by the unrecorded-counter path. When
+		// embeddings learn to report usage, swap the zeros for real counts.
+		middleware.StampUsage(c, input.Model, 0, 0)
+
 		// Return the prediction in the response body
 		return c.JSON(200, resp)
 	}
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index 9a416719d..b680559cf 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -497,6 +497,7 @@ func runRealtimeSession(application *application.Application, t Transport, model
 		application.ModelLoader(),
 		application.ApplicationConfig(),
 		evaluator,
+		buildRealtimeRoutingContext(application, sessionID),
 	)
 	if err != nil {
 		xlog.Error("failed to load model", "error", err)
@@ -627,6 +628,7 @@ func runRealtimeSession(application *application.Application, t Transport, model
 					application.ModelLoader(),
 					application.ApplicationConfig(),
 					evaluator,
+					buildRealtimeRoutingContext(application, session.ID),
 				); err != nil {
 					xlog.Error("failed to update session", "error", err)
 					sendError(t, "session_update_error", "Failed to update session", "", "")
@@ -946,7 +948,7 @@ func updateTransSession(session *Session, update *types.SessionUnion, cl *config
 	return nil
 }
 
-func updateSession(session *Session, update *types.SessionUnion, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator) error {
+func updateSession(session *Session, update *types.SessionUnion, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator, routing *RealtimeRoutingContext) error {
 	sessionLock.Lock()
 	defer sessionLock.Unlock()
 
@@ -985,7 +987,7 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
 	}
 
 	if rt.Model != "" || (rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "") || (rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil) {
-		m, err := newModel(&session.ModelConfig.Pipeline, cl, ml, appConfig, evaluator)
+		m, err := newModel(&session.ModelConfig.Pipeline, cl, ml, appConfig, evaluator, routing)
 		if err != nil {
 			return err
 		}
diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go
index bfeb70739..9ec7e3d6b 100644
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -2,13 +2,18 @@ package openai
 
 import (
 	"context"
+	"crypto/rand"
+	"encoding/hex"
 	"encoding/json"
 	"fmt"
 
+	"github.com/mudler/LocalAI/core/application"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/router"
 	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
@@ -34,6 +39,15 @@ type wrappedModel struct {
 	modelLoader *model.ModelLoader
 	confLoader  *config.ModelConfigLoader
 	evaluator   *templates.Evaluator
+
+	// Routing — populated by newModel when the application wires routing
+	// deps in. nil-safe: with classifierRegistry == nil the per-turn
+	// routing block in Predict is skipped, preserving today's "one LLM
+	// for the whole session" behaviour.
+	routerDeps        *middleware.ClassifierDeps
+	routerStore       router.DecisionStore
+	routerSessionID   string
+	routerUserID      string
 }
 
 // anyToAnyModel represent a model which supports Any-to-Any operations
@@ -90,9 +104,24 @@ func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, im
 		Messages: messages,
 	}
 
+	// Per-turn routing: when the session's LLMConfig is a router, swap
+	// to the candidate the classifier picks for this turn's prompt.
+	// LLMConfig itself is held by value (we never mutate it) — turnCfg
+	// is the config we dispatch against.
+	turnCfg := m.LLMConfig
+	if m.LLMConfig.HasRouter() && m.routerDeps != nil {
+		chosen, err := m.routeTurn(ctx, &input)
+		if err != nil {
+			xlog.Warn("realtime routing failed; using session default LLM",
+				"router_model", m.LLMConfig.Name, "error", err)
+		} else if chosen != nil {
+			turnCfg = chosen
+		}
+	}
+
 	var predInput string
 	var funcs []functions.Function
-	if !m.LLMConfig.TemplateConfig.UseTokenizerTemplate {
+	if !turnCfg.TemplateConfig.UseTokenizerTemplate {
 		if len(tools) > 0 {
 			for _, t := range tools {
 				if t.Function != nil {
@@ -120,11 +149,11 @@ func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, im
 			noActionName := "answer"
 			noActionDescription := "use this action to answer without performing any action"
 
-			if m.LLMConfig.FunctionsConfig.NoActionFunctionName != "" {
-				noActionName = m.LLMConfig.FunctionsConfig.NoActionFunctionName
+			if turnCfg.FunctionsConfig.NoActionFunctionName != "" {
+				noActionName = turnCfg.FunctionsConfig.NoActionFunctionName
 			}
-			if m.LLMConfig.FunctionsConfig.NoActionDescriptionName != "" {
-				noActionDescription = m.LLMConfig.FunctionsConfig.NoActionDescriptionName
+			if turnCfg.FunctionsConfig.NoActionDescriptionName != "" {
+				noActionDescription = turnCfg.FunctionsConfig.NoActionDescriptionName
 			}
 
 			noActionGrammar := functions.Function{
@@ -140,16 +169,16 @@ func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, im
 				},
 			}
 
-			if !m.LLMConfig.FunctionsConfig.DisableNoAction {
+			if !turnCfg.FunctionsConfig.DisableNoAction {
 				funcs = append(funcs, noActionGrammar)
 			}
 		}
 
-		predInput = m.evaluator.TemplateMessages(input, input.Messages, m.LLMConfig, funcs, len(funcs) > 0)
+		predInput = m.evaluator.TemplateMessages(input, input.Messages, turnCfg, funcs, len(funcs) > 0)
 
 		xlog.Debug("Prompt (after templating)", "prompt", predInput)
-		if m.LLMConfig.Grammar != "" {
-			xlog.Debug("Grammar", "grammar", m.LLMConfig.Grammar)
+		if turnCfg.Grammar != "" {
+			xlog.Debug("Grammar", "grammar", turnCfg.Grammar)
 		}
 	}
 
@@ -159,33 +188,33 @@ func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, im
 			// String values: "auto", "required", "none"
 			switch toolChoice.Mode {
 			case types.ToolChoiceModeRequired:
-				m.LLMConfig.SetFunctionCallString("required")
+				turnCfg.SetFunctionCallString("required")
 			case types.ToolChoiceModeNone:
 				// Don't use tools
-				m.LLMConfig.SetFunctionCallString("none")
+				turnCfg.SetFunctionCallString("none")
 			case types.ToolChoiceModeAuto:
 				// Default behavior - let model decide
 			}
 		} else if toolChoice.Function != nil {
 			// Specific function specified
-			m.LLMConfig.SetFunctionCallNameString(toolChoice.Function.Name)
+			turnCfg.SetFunctionCallNameString(toolChoice.Function.Name)
 		}
 	}
 
 	// Generate grammar for function calling if tools are provided and grammar generation is enabled
-	shouldUseFn := len(tools) > 0 && m.LLMConfig.ShouldUseFunctions()
+	shouldUseFn := len(tools) > 0 && turnCfg.ShouldUseFunctions()
 
-	if !m.LLMConfig.FunctionsConfig.GrammarConfig.NoGrammar && shouldUseFn {
+	if !turnCfg.FunctionsConfig.GrammarConfig.NoGrammar && shouldUseFn {
 		// Force picking one of the functions by the request
-		if m.LLMConfig.FunctionToCall() != "" {
-			funcs = functions.Functions(funcs).Select(m.LLMConfig.FunctionToCall())
+		if turnCfg.FunctionToCall() != "" {
+			funcs = functions.Functions(funcs).Select(turnCfg.FunctionToCall())
 		}
 
 		// Generate grammar from function definitions
-		jsStruct := functions.Functions(funcs).ToJSONStructure(m.LLMConfig.FunctionsConfig.FunctionNameKey, m.LLMConfig.FunctionsConfig.FunctionNameKey)
-		g, err := jsStruct.Grammar(m.LLMConfig.FunctionsConfig.GrammarOptions()...)
+		jsStruct := functions.Functions(funcs).ToJSONStructure(turnCfg.FunctionsConfig.FunctionNameKey, turnCfg.FunctionsConfig.FunctionNameKey)
+		g, err := jsStruct.Grammar(turnCfg.FunctionsConfig.GrammarOptions()...)
 		if err == nil {
-			m.LLMConfig.Grammar = g
+			turnCfg.Grammar = g
 			xlog.Debug("Generated grammar for function calling", "grammar", g)
 		} else {
 			xlog.Error("Failed generating grammar", "error", err)
@@ -237,7 +266,50 @@ func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, im
 		toolChoiceJSON = string(b)
 	}
 
-	return backend.ModelInference(ctx, predInput, messages, images, videos, audios, m.modelLoader, m.LLMConfig, m.confLoader, m.appConfig, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias, nil)
+	return backend.ModelInference(ctx, predInput, messages, images, videos, audios, m.modelLoader, turnCfg, m.confLoader, m.appConfig, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias, nil)
+}
+
+// routeTurn classifies this turn's prompt against the session's router
+// LLM config and returns the candidate ModelConfig to dispatch against.
+// Returns nil with no error when routing was attempted but the resolver
+// signalled "no decision" — the caller falls back to the session
+// default. Records the decision in the store using the realtime session
+// id as the correlation id so the admin UI can group turn-by-turn
+// decisions under one session row.
+func (m *wrappedModel) routeTurn(ctx context.Context, req *schema.OpenAIRequest) (*config.ModelConfig, error) {
+	if m.routerDeps == nil {
+		return nil, nil
+	}
+	registry := m.routerDeps.Registry
+	if registry == nil {
+		registry = router.NewRegistry()
+	}
+	classifier, classifierErr := middleware.GetOrBuildClassifier(registry, m.LLMConfig, *m.routerDeps)
+	if classifierErr != nil {
+		xlog.Warn("realtime router: classifier unavailable — using fallback",
+			"router_model", m.LLMConfig.Name, "error", classifierErr)
+		classifier = nil
+	}
+	loader := func(name string) (*config.ModelConfig, error) {
+		return m.confLoader.LoadModelConfigFileByNameDefaultOptions(name, m.appConfig)
+	}
+	probe := middleware.OpenAIProbeFromRequest(req)
+
+	result, err := router.Resolve(ctx, m.LLMConfig, classifier, loader, probe)
+	if err != nil {
+		return nil, err
+	}
+
+	if m.routerStore != nil {
+		_ = m.routerStore.Record(context.Background(), result.ToDecisionRecord(newRealtimeDecisionID(), m.routerSessionID, m.routerUserID, router.SourceRealtime))
+	}
+	return result.ChosenConfig, nil
+}
+
+func newRealtimeDecisionID() string {
+	var b [12]byte
+	_, _ = rand.Read(b[:])
+	return "rd_" + hex.EncodeToString(b[:])
 }
 
 func (m *wrappedModel) TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error) {
@@ -279,8 +351,49 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
 	}, cfgSST, nil
 }
 
+// RealtimeRoutingContext is the bundle of routing dependencies the
+// realtime pipeline needs to consult router.Resolve per turn. nil-safe:
+// passing nil skips routing entirely and preserves the historical "one
+// LLM for the whole session" behaviour.
+type RealtimeRoutingContext struct {
+	Deps      *middleware.ClassifierDeps
+	Store     router.DecisionStore
+	SessionID string
+	UserID    string
+}
+
+// buildRealtimeRoutingContext assembles the routing dependencies the
+// realtime pipeline needs from the application container. Returns nil
+// when no Application is wired (tests, stripped builds) — that path
+// leaves wrappedModel.Predict on the historical "no routing" path
+// instead of failing at session start.
+func buildRealtimeRoutingContext(a *application.Application, sessionID string) *RealtimeRoutingContext {
+	if a == nil {
+		return nil
+	}
+	deps := &middleware.ClassifierDeps{
+		Scorer:      a.Scorer,
+		Embedder:    a.Embedder,
+		VectorStore: a.VectorStore,
+		Reranker:    a.Reranker,
+		ModelLookup: a.ModelConfigLookup(),
+		Registry:    a.RouterClassifierRegistry(),
+		Evaluator:   a.TemplatesEvaluator(),
+	}
+	userID := ""
+	if u := a.FallbackUser(); u != nil {
+		userID = u.ID
+	}
+	return &RealtimeRoutingContext{
+		Deps:      deps,
+		Store:     a.RouterDecisions(),
+		SessionID: sessionID,
+		UserID:    userID,
+	}
+}
+
 // returns and loads either a wrapped model or a model that support audio-to-audio
-func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator) (Model, error) {
+func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator, routing *RealtimeRoutingContext) (Model, error) {
 	xlog.Debug("Creating new model pipeline model", "pipeline", pipeline)
 
 	cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
@@ -346,7 +459,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 		return nil, fmt.Errorf("failed to validate config: %w", err)
 	}
 
-	return &wrappedModel{
+	wm := &wrappedModel{
 		TTSConfig:           cfgTTS,
 		TranscriptionConfig: cfgSST,
 		LLMConfig:           cfgLLM,
@@ -356,5 +469,12 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 		modelLoader: ml,
 		appConfig:   appConfig,
 		evaluator:   evaluator,
-	}, nil
+	}
+	if routing != nil {
+		wm.routerDeps = routing.Deps
+		wm.routerStore = routing.Store
+		wm.routerSessionID = routing.SessionID
+		wm.routerUserID = routing.UserID
+	}
+	return wm, nil
 }
diff --git a/core/http/middleware/admission.go b/core/http/middleware/admission.go
new file mode 100644
index 000000000..c79066925
--- /dev/null
+++ b/core/http/middleware/admission.go
@@ -0,0 +1,81 @@
+package middleware
+
+import (
+	"context"
+	"crypto/rand"
+	"encoding/hex"
+	"fmt"
+	"net/http"
+	"strconv"
+	"sync/atomic"
+	"time"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/routing/admission"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+)
+
+// AdmissionControl runs after RouteModel so the limit applies to the
+// SERVED model — a router fanout that lands on a saturated downstream
+// model gets rejected even though the requested router-model has slack.
+//
+// On reject: HTTP 503, Retry-After header, error JSON. An audit row
+// goes into the shared event store under KindAdmission so admins see
+// rejection rates alongside PII and proxy events.
+//
+// Models without limits.max_concurrent (the common case) hit a fast
+// no-op path — Acquire returns immediately for max <= 0.
+func AdmissionControl(limiter *admission.Limiter, events pii.EventStore) echo.MiddlewareFunc {
+	return func(next echo.HandlerFunc) echo.HandlerFunc {
+		return func(c echo.Context) error {
+			cfg, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
+			if !ok || cfg == nil {
+				return next(c)
+			}
+			max := cfg.Limits.MaxConcurrent
+			release, ok := limiter.Acquire(cfg.Name, max)
+			if !ok {
+				retryAfter := admission.RetryAfter(cfg.Limits.RetryAfterSeconds)
+				recordAdmissionRejection(events, cfg.Name, retryAfter)
+				c.Response().Header().Set("Retry-After", strconv.Itoa(int(retryAfter.Seconds())))
+				return c.JSON(http.StatusServiceUnavailable, map[string]any{
+					"error": map[string]any{
+						"type":    "admission_rejected",
+						"message": fmt.Sprintf("model %q is at capacity (max_concurrent=%d); retry after %s", cfg.Name, max, retryAfter),
+					},
+				})
+			}
+			defer release()
+			return next(c)
+		}
+	}
+}
+
+// admissionEventSeq scopes IDs across the process so rapid
+// rejections under load get unique row IDs without coordinating
+// with the rest of the event-store ID schemes.
+var admissionEventSeq atomic.Uint64
+
+func recordAdmissionRejection(events pii.EventStore, modelName string, retryAfter time.Duration) {
+	if events == nil {
+		return
+	}
+	statusCode := http.StatusServiceUnavailable
+	durMS := retryAfter.Milliseconds()
+	id := fmt.Sprintf("adm_%d_%s", admissionEventSeq.Add(1), randHex(4))
+	_ = events.Record(context.Background(), pii.PIIEvent{
+		ID:         id,
+		Kind:       pii.KindAdmission,
+		Host:       modelName,
+		StatusCode: statusCode,
+		DurationMS: durMS,
+		CreatedAt:  time.Now().UTC(),
+	})
+}
+
+func randHex(n int) string {
+	b := make([]byte, n)
+	_, _ = rand.Read(b)
+	return hex.EncodeToString(b)
+}
diff --git a/core/http/middleware/admission_test.go b/core/http/middleware/admission_test.go
new file mode 100644
index 000000000..841a2dd47
--- /dev/null
+++ b/core/http/middleware/admission_test.go
@@ -0,0 +1,118 @@
+package middleware_test
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"sync"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/config"
+	. "github.com/mudler/LocalAI/core/http/middleware"
+	"github.com/mudler/LocalAI/core/services/routing/admission"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// recordingStore captures admission rows so the test can assert
+// the audit trail without standing up the full pii event store.
+type recordingStore struct {
+	mu     sync.Mutex
+	events []pii.PIIEvent
+}
+
+func (r *recordingStore) Record(_ context.Context, e pii.PIIEvent) error {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.events = append(r.events, e)
+	return nil
+}
+func (r *recordingStore) List(_ context.Context, _ pii.ListQuery) ([]pii.PIIEvent, error) {
+	return nil, nil
+}
+func (r *recordingStore) Count(_ context.Context) (int, error) { return 0, nil }
+func (r *recordingStore) Close() error                         { return nil }
+
+func runAdmission(lim *admission.Limiter, store *recordingStore, cfg *config.ModelConfig, handler echo.HandlerFunc) (*httptest.ResponseRecorder, error) {
+	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader("{}"))
+	rec := httptest.NewRecorder()
+	c := echo.New().NewContext(req, rec)
+	c.Set(CONTEXT_LOCALS_KEY_MODEL_CONFIG, cfg)
+	mw := AdmissionControl(lim, store)
+	err := mw(handler)(c)
+	return rec, err
+}
+
+var _ = Describe("Admission", func() {
+	It("allows when under limit", func() {
+		lim := admission.New()
+		cfg := &config.ModelConfig{Limits: config.LimitsConfig{MaxConcurrent: 2}}
+		cfg.Name = "m"
+		rec, err := runAdmission(lim, &recordingStore{}, cfg, func(c echo.Context) error {
+			return c.String(http.StatusOK, "ok")
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(rec.Code).To(Equal(http.StatusOK))
+	})
+
+	It("rejects when full", func() {
+		// Saturate the limiter outside the middleware, then a request
+		// at the same model gets 503 with a Retry-After header.
+		lim := admission.New()
+		release, ok := lim.Acquire("busy", 1)
+		Expect(ok).To(BeTrue(), "setup acquire should succeed")
+		defer release()
+
+		cfg := &config.ModelConfig{Limits: config.LimitsConfig{MaxConcurrent: 1, RetryAfterSeconds: 3}}
+		cfg.Name = "busy"
+		store := &recordingStore{}
+		handlerCalled := false
+		rec, err := runAdmission(lim, store, cfg, func(c echo.Context) error {
+			handlerCalled = true
+			return c.String(http.StatusOK, "ok")
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(rec.Code).To(Equal(http.StatusServiceUnavailable))
+		Expect(rec.Header().Get("Retry-After")).To(Equal("3"))
+		Expect(handlerCalled).To(BeFalse(), "handler should not run when admission rejects")
+		Expect(rec.Body.String()).To(ContainSubstring("admission_rejected"))
+		Expect(store.events).To(HaveLen(1))
+		Expect(store.events[0].Kind).To(Equal(pii.KindAdmission))
+		Expect(store.events[0].Host).To(Equal("busy"), "audit row carries the model name")
+	})
+
+	It("no limit configured is no-op", func() {
+		// MaxConcurrent=0 means unlimited — handler always runs and no
+		// audit row is written even after many calls.
+		lim := admission.New()
+		cfg := &config.ModelConfig{}
+		cfg.Name = "open"
+		store := &recordingStore{}
+		for i := 0; i < 10; i++ {
+			rec, err := runAdmission(lim, store, cfg, func(c echo.Context) error {
+				return c.String(http.StatusOK, "ok")
+			})
+			Expect(err).NotTo(HaveOccurred())
+			Expect(rec.Code).To(Equal(http.StatusOK))
+		}
+		Expect(store.events).To(BeEmpty())
+	})
+
+	It("releases after handler", func() {
+		// One slot, two SEQUENTIAL requests: the second succeeds because
+		// the first's release runs on handler return.
+		lim := admission.New()
+		cfg := &config.ModelConfig{Limits: config.LimitsConfig{MaxConcurrent: 1}}
+		cfg.Name = "tight"
+		for i := 0; i < 3; i++ {
+			rec, err := runAdmission(lim, &recordingStore{}, cfg, func(c echo.Context) error {
+				return c.String(http.StatusOK, "ok")
+			})
+			Expect(err).NotTo(HaveOccurred())
+			Expect(rec.Code).To(Equal(http.StatusOK))
+		}
+	})
+})
diff --git a/core/http/middleware/context_keys.go b/core/http/middleware/context_keys.go
new file mode 100644
index 000000000..d1983c882
--- /dev/null
+++ b/core/http/middleware/context_keys.go
@@ -0,0 +1,50 @@
+package middleware
+
+// Context keys used by routing-module middlewares to communicate with
+// the usage recorder. Unlike the legacy CONTEXT_LOCALS_KEY_* constants
+// (which exist for backward-compatible callers), these are the
+// canonical names for new fields.
+const (
+	// ContextKeyRequestedModel is set by content-router middleware to
+	// the model name the client originally asked for, before any router
+	// remapping. UsageMiddleware writes this into UsageRecord.RequestedModel.
+	ContextKeyRequestedModel = "routing.requested_model"
+
+	// ContextKeyServedModel is set by content-router middleware to the
+	// model that actually handled the request (post-routing). When no
+	// router runs, callers may leave this unset and the response-reported
+	// model name is used as the served value.
+	ContextKeyServedModel = "routing.served_model"
+
+	// ContextKeyPreFilterPromptTokens / ContextKeyPostFilterPromptTokens
+	// are set by the PII middleware to record how many prompt tokens
+	// the user sent vs how many made it past redaction. When both are
+	// zero or unset, UsageMiddleware uses the response-reported prompt
+	// token count for both — i.e., no filter ran.
+	ContextKeyPreFilterPromptTokens  = "routing.pre_filter_prompt_tokens"
+	ContextKeyPostFilterPromptTokens = "routing.post_filter_prompt_tokens"
+
+	// ContextKeyCorrelationID is the join key threaded across PII
+	// events, router decisions, admission events, and usage records.
+	// trace.go middleware sets X-Correlation-ID on the response; this
+	// key mirrors the same value into echo.Context for in-process
+	// propagation without re-parsing the header.
+	ContextKeyCorrelationID = "routing.correlation_id"
+
+	// ContextKeyPromptTokens / ContextKeyCompletionTokens / ContextKeyTotalTokens
+	// are the canonical token counts the request handler measured. Stamping
+	// these from the handler is the only reliable path for streaming
+	// responses, where the SSE chunks may not include a usage block (OpenAI
+	// requires stream_options.include_usage; Anthropic uses a separate
+	// message_delta event shape). UsageMiddleware prefers these context
+	// values over body-parsing.
+	ContextKeyPromptTokens     = "routing.prompt_tokens"
+	ContextKeyCompletionTokens = "routing.completion_tokens"
+	ContextKeyTotalTokens      = "routing.total_tokens"
+
+	// ContextKeyResponseModel is the model name the handler committed to
+	// in its response payload. UsageMiddleware uses it when neither the
+	// router nor the body-parse path has produced one. Distinct from
+	// ContextKeyServedModel, which is the router's resolved choice.
+	ContextKeyResponseModel = "routing.response_model"
+)
diff --git a/core/http/middleware/request.go b/core/http/middleware/request.go
index 797968260..456c4d0f7 100644
--- a/core/http/middleware/request.go
+++ b/core/http/middleware/request.go
@@ -308,6 +308,17 @@ func mergeOpenAIRequestAndModelConfig(config *config.ModelConfig, input *schema.
 		config.Temperature = input.Temperature
 	}
 
+	// Collapse the modern max_completion_tokens alias into the
+	// legacy Maxtokens field so downstream code reads exactly one.
+	// MaxCompletionTokens wins on conflict — it's the canonical
+	// name per OpenAI's deprecation guidance, and a client that
+	// took the trouble to send it intends that value. Clearing
+	// the sibling prevents both names from being emitted if input
+	// is re-marshaled (cloud-proxy passthrough).
+	if input.MaxCompletionTokens != nil {
+		input.Maxtokens = input.MaxCompletionTokens
+		input.MaxCompletionTokens = nil
+	}
 	if input.Maxtokens != nil {
 		config.Maxtokens = input.Maxtokens
 	}
diff --git a/core/http/middleware/request_test.go b/core/http/middleware/request_test.go
index 70e8a05b1..cc4e8199e 100644
--- a/core/http/middleware/request_test.go
+++ b/core/http/middleware/request_test.go
@@ -156,9 +156,13 @@ var _ = Describe("SetModelAndConfig middleware", func() {
 // ---------------------------------------------------------------------------
 //
 // The OpenAI chat/completions spec nests the function name under "function":
-//     {"type":"function", "function":{"name":"my_function"}}
+//
+//	{"type":"function", "function":{"name":"my_function"}}
+//
 // The legacy Anthropic-compat shape puts it at the top level:
-//     {"type":"function", "name":"my_function"}
+//
+//	{"type":"function", "name":"my_function"}
+//
 // Both need to reach SetFunctionCallNameString (not SetFunctionCallString,
 // which is the mode field "none"/"auto"/"required").
 //
@@ -550,4 +554,46 @@ var _ = Describe("SetModelAndConfig tool_choice parsing (chat completions)", fun
 			Expect(capturedConfig.FunctionToCall()).To(Equal(""))
 		})
 	})
+
+	// OpenAI deprecated max_tokens in favour of max_completion_tokens
+	// (gpt-5 / o-series reject the legacy name). The middleware accepts
+	// both and collapses to the legacy internal Maxtokens field so
+	// downstream code reads exactly one.
+	Context("max_completion_tokens alias", func() {
+		chatReqMaxTokens := func(fields string) string {
+			return `{"model":"test-model",` +
+				`"messages":[{"role":"user","content":"hi"}],` +
+				fields + `}`
+		}
+
+		It("accepts the modern max_completion_tokens name", func() {
+			rec := postJSON(app, "/v1/chat/completions",
+				chatReqMaxTokens(`"max_completion_tokens":64`))
+
+			Expect(rec.Code).To(Equal(http.StatusOK))
+			Expect(capturedConfig).ToNot(BeNil())
+			Expect(capturedConfig.Maxtokens).ToNot(BeNil())
+			Expect(*capturedConfig.Maxtokens).To(Equal(64))
+		})
+
+		It("still accepts the legacy max_tokens name", func() {
+			rec := postJSON(app, "/v1/chat/completions",
+				chatReqMaxTokens(`"max_tokens":48`))
+
+			Expect(rec.Code).To(Equal(http.StatusOK))
+			Expect(capturedConfig).ToNot(BeNil())
+			Expect(capturedConfig.Maxtokens).ToNot(BeNil())
+			Expect(*capturedConfig.Maxtokens).To(Equal(48))
+		})
+
+		It("prefers max_completion_tokens when both are set", func() {
+			rec := postJSON(app, "/v1/chat/completions",
+				chatReqMaxTokens(`"max_tokens":48,"max_completion_tokens":64`))
+
+			Expect(rec.Code).To(Equal(http.StatusOK))
+			Expect(capturedConfig).ToNot(BeNil())
+			Expect(capturedConfig.Maxtokens).ToNot(BeNil())
+			Expect(*capturedConfig.Maxtokens).To(Equal(64))
+		})
+	})
 })
diff --git a/core/http/middleware/route_model.go b/core/http/middleware/route_model.go
new file mode 100644
index 000000000..2cdbca41c
--- /dev/null
+++ b/core/http/middleware/route_model.go
@@ -0,0 +1,603 @@
+package middleware
+
+import (
+	"context"
+	"crypto/rand"
+	"encoding/hex"
+	"fmt"
+	"hash/fnv"
+	"strings"
+	"time"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/auth"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/router"
+	"github.com/mudler/LocalAI/core/templates"
+	"github.com/mudler/xlog"
+	"gopkg.in/yaml.v3"
+)
+
+// ScorerFactory returns a backend.Scorer bound to a named classifier
+// model. The score classifier uses it to compute joint log-prob of
+// every policy label against the routing prompt.
+type ScorerFactory func(modelName string) backend.Scorer
+
+// EmbedderFactory returns a backend.Embedder bound to a named model.
+// Used by the L2 embedding cache. Returning nil signals "model not
+// loadable" — the middleware then falls back to the uncached
+// classifier so routing still happens.
+type EmbedderFactory func(modelName string) backend.Embedder
+
+// VectorStoreFactory returns a backend.VectorStore bound to a named
+// collection. Each router model's cache lives in its own collection
+// so two routers can't poison each other's hits.
+type VectorStoreFactory func(storeName string) backend.VectorStore
+
+// RerankerFactory returns a backend.Reranker bound to a named model.
+// Used by the colbert classifier to score policy descriptions against
+// the prompt via LocalAI's rerankers backend. Returning nil signals
+// "model not loadable" — buildClassifier reports a config error.
+type RerankerFactory func(modelName string) backend.Reranker
+
+// ModelConfigLookup resolves a model name to its config, or nil when
+// unknown. Used by buildClassifier to confirm the classifier_model
+// declared the score usecase — the actual usecase-conflict check
+// lives in ModelConfig.Validate() and runs at config load/save time.
+type ModelConfigLookup func(modelName string) *config.ModelConfig
+
+// ClassifierDeps bundles the backend factories the router middleware
+// needs to build a classifier and its optional L2 cache. Bundled into
+// one struct because RouteModel already takes many positional
+// arguments — additions to the dependency surface go here instead of
+// growing the signature.
+//
+// Embedder and VectorStore are optional: when both are non-nil and the
+// router config declares an embedding_cache block, the score
+// classifier is wrapped in EmbeddingCacheClassifier. Otherwise the
+// score classifier runs unwrapped and the embedding-cache YAML is
+// ignored with a warning.
+type ClassifierDeps struct {
+	Scorer      ScorerFactory
+	Embedder    EmbedderFactory
+	VectorStore VectorStoreFactory
+	Reranker    RerankerFactory
+
+	// ModelLookup resolves the classifier_model name to its config so
+	// buildClassifier can reject misconfigurations that would
+	// otherwise crash the llama-cpp backend at request time. Optional
+	// — when nil, the check is skipped (tests, embedded callers that
+	// haven't wired the loader).
+	ModelLookup ModelConfigLookup
+
+	// Registry is the shared classifier cache. Both the OpenAI and
+	// Anthropic routes pass the same registry so the admin stats
+	// endpoint sees every live classifier. Nil falls back to a local
+	// registry — tests that don't need cross-route stats use this.
+	Registry *router.Registry
+
+	// Evaluator renders the classifier model's chat template around
+	// the routing system + user prompt. Optional — when nil, the
+	// score classifier falls back to a built-in ChatML envelope,
+	// which is correct for Arch-Router/Qwen but wrong for non-ChatML
+	// routing models. Production wiring passes the app-wide
+	// templates.Evaluator so any model the operator points at gets
+	// its own chat template applied.
+	Evaluator *templates.Evaluator
+}
+
+// ProbeExtractor pulls the prompt content out of a parsed request so
+// the classifier can inspect it without taking a dependency on the
+// schema package. One extractor per request shape — wired by the
+// route registration site (mirrors the piiadapter pattern).
+//
+// Returns ok=false when the parsed value isn't the expected type — the
+// middleware then passes through without engaging the router.
+type ProbeExtractor func(parsed any) (router.Probe, bool)
+
+// RouteModel runs after SetModelAndConfig and the schema-specific
+// SetXRequest, looks at the resolved model's Router config, and (when
+// present) reclassifies the request to one of the candidates.
+//
+// The middleware:
+//
+//  1. Loads MODEL_CONFIG from the echo context. If nil or HasRouter()
+//     is false, passes through.
+//  2. Extracts the probe via the supplied ProbeExtractor.
+//  3. Invokes the classifier matching cfg.Router.Classifier
+//     ("score" or "colbert"). If the classifier can't be built —
+//     missing classifier_model, misconfigured policies, etc. — the
+//     request fails with 503. cfg.Router.Fallback only catches
+//     Classify-time errors and label-coverage misses, not config
+//     bugs that would otherwise be silent.
+//  4. Resolves the chosen candidate to its model name. Reloads the
+//     ModelConfig for that model and asserts depth-1 (the candidate
+//     must NOT itself have a Router). Violation returns 500 — config
+//     bug, not a request bug.
+//  5. Updates input.Model in place, replaces MODEL_CONFIG with the
+//     candidate's config, and stamps RequestedModel/ServedModel on the
+//     context so UsageMiddleware records the routing.
+//  6. Writes a DecisionRecord to the store for the admin page.
+//
+// store may be nil when --disable-stats turns off the routing log;
+// classification still runs.
+//
+// Composition with SmartRouter (distributed mode): this middleware
+// only does *model* selection. Node selection still happens in
+// SmartRouter.Route() downstream of this middleware.
+// RouteModel wires the router middleware. source is the value written to
+// DecisionRecord.Source (router.SourceChat / SourceAnthropic / ...) so
+// the admin page can split decisions by entry point. Pass
+// router.SourceChat for the OpenAI chat endpoint, router.SourceAnthropic
+// for the Anthropic messages endpoint.
+func RouteModel(loader *config.ModelConfigLoader, appConfig *config.ApplicationConfig, store router.DecisionStore, fallbackUser *auth.User, extractor ProbeExtractor, source string, deps ClassifierDeps) echo.MiddlewareFunc {
+	registry := deps.Registry
+	if registry == nil {
+		registry = router.NewRegistry()
+	}
+	candidateLoader := func(name string) (*config.ModelConfig, error) {
+		return loader.LoadModelConfigFileByNameDefaultOptions(name, appConfig)
+	}
+	return func(next echo.HandlerFunc) echo.HandlerFunc {
+		return func(c echo.Context) error {
+			cfg, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
+			if !ok || cfg == nil || !cfg.HasRouter() {
+				return next(c)
+			}
+
+			parsed := c.Get(CONTEXT_LOCALS_KEY_LOCALAI_REQUEST)
+			if parsed == nil {
+				return next(c)
+			}
+
+			probe, probeOK := extractor(parsed)
+			if !probeOK {
+				return next(c)
+			}
+
+			classifier, err := GetOrBuildClassifier(registry, cfg, deps)
+			if err != nil {
+				// Build-time failures are config bugs (missing
+				// classifier_model, undeclared usecase, policy
+				// validation, ...). Silently falling back would hide
+				// them and make the router look "working" while the
+				// classifier model is never invoked — surface as 503
+				// with the underlying reason so operators see it.
+				xlog.Warn("router: classifier build failed",
+					"router_model", cfg.Name, "classifier", cfg.Router.Classifier, "error", err)
+				return echo.NewHTTPError(503, "router classifier unavailable: "+err.Error())
+			}
+
+			result, err := router.Resolve(c.Request().Context(), cfg, classifier, candidateLoader, probe)
+			if err != nil {
+				xlog.Warn("router: resolve failed", "router_model", cfg.Name, "error", err)
+				return echo.NewHTTPError(500, err.Error())
+			}
+
+			if req, ok := parsed.(schema.LocalAIRequest); ok {
+				chosen := result.ChosenModel
+				req.ModelName(&chosen)
+			}
+
+			c.Set(CONTEXT_LOCALS_KEY_MODEL_CONFIG, result.ChosenConfig)
+			c.Set(ContextKeyRequestedModel, result.RouterModel)
+			c.Set(ContextKeyServedModel, result.ChosenModel)
+
+			if store != nil {
+				recordHTTPDecision(c, store, result, fallbackUser, source)
+			}
+			return next(c)
+		}
+	}
+}
+
+// recordHTTPDecision writes the resolved decision to the store with
+// HTTP-shaped audit metadata (correlation id from header, user from
+// auth middleware, fallback to the synthetic local user). Realtime
+// has its own recorder that supplies session-derived metadata
+// instead.
+func recordHTTPDecision(c echo.Context, store router.DecisionStore, result *router.ResolveResult, fallbackUser *auth.User, source string) {
+	correlationID, _ := c.Get(ContextKeyCorrelationID).(string)
+	if correlationID == "" {
+		correlationID = c.Response().Header().Get("X-Correlation-ID")
+	}
+	userID := ""
+	if u := auth.GetUser(c); u != nil {
+		userID = u.ID
+	} else if fallbackUser != nil {
+		userID = fallbackUser.ID
+	}
+	_ = store.Record(context.Background(), result.ToDecisionRecord(newDecisionID(), correlationID, userID, source))
+}
+
+
+// GetOrBuildClassifier looks up a built Classifier for the named router
+// model in the registry and builds it on miss. Exported so the
+// /api/router/decide decision-oracle endpoint can share the same
+// build-once cache that the in-band RouteModel middleware uses.
+func GetOrBuildClassifier(registry *router.Registry, cfg *config.ModelConfig, deps ClassifierDeps) (router.Classifier, error) {
+	// Fingerprint folds the classifier model's renderer-affecting
+	// fields (chat templates + stopwords) in alongside the router
+	// config. Without this, hot-reloading the classifier model's
+	// YAML (via ReloadModelsEndpoint, /import-model, or the MCP
+	// reload_models tool) wouldn't rebuild the cached classifier —
+	// the candidates slice and renderer closure are baked at build
+	// time from those fields and would silently keep the stale
+	// stop token / template until process restart.
+	var classifierCfg *config.ModelConfig
+	if deps.ModelLookup != nil {
+		classifierCfg = deps.ModelLookup(cfg.Router.ClassifierModel)
+	}
+	fp := routerConfigFingerprint(cfg.Router, classifierCfg)
+	if cached, ok := registry.Get(cfg.Name, fp); ok {
+		return cached, nil
+	}
+	c, err := buildClassifier(cfg, deps)
+	if err != nil {
+		return nil, err
+	}
+	registry.Put(cfg.Name, fp, c)
+	return c, nil
+}
+
+// routerConfigFingerprint is a stable cache key for the (router cfg,
+// classifier model cfg) tuple. FNV-64 over the YAML form of the
+// router block plus the renderer-affecting fields of the classifier
+// model — equality-only, not cryptographic. YAML-marshal picks up
+// any future RouterConfig field without this function needing to be
+// touched; for the classifier model we hash a narrow projection so
+// unrelated changes (parameters, files, ...) don't burst the cache.
+// Pass classifierCfg=nil when no lookup is wired — the fingerprint
+// degenerates to the router-only form, matching pre-refactor behaviour.
+func routerConfigFingerprint(rc config.RouterConfig, classifierCfg *config.ModelConfig) uint64 {
+	bytes, err := yaml.Marshal(rc)
+	if err != nil {
+		// Marshalling a value type can't fail in practice; fall
+		// back to a hash that varies per call so we don't quietly
+		// share a cache entry across distinct configs.
+		return uint64(time.Now().UnixNano())
+	}
+	h := fnv.New64a()
+	h.Write(bytes)
+	if classifierCfg != nil {
+		// Narrow projection: only the fields newTemplateRenderer and
+		// firstStopWord actually read. Hashing the whole ModelConfig
+		// would invalidate the cache on irrelevant parameter changes.
+		h.Write([]byte{0}) // separator so empty fields don't collide
+		h.Write([]byte(classifierCfg.TemplateConfig.Chat))
+		h.Write([]byte{0})
+		h.Write([]byte(classifierCfg.TemplateConfig.ChatMessage))
+		h.Write([]byte{0})
+		for _, sw := range classifierCfg.StopWords {
+			h.Write([]byte(sw))
+			h.Write([]byte{0})
+		}
+	}
+	return h.Sum64()
+}
+
+func buildClassifier(cfg *config.ModelConfig, deps ClassifierDeps) (router.Classifier, error) {
+	rc := cfg.Router
+	name := rc.Classifier
+	if name == "" {
+		name = router.ClassifierScore
+	}
+	policies, err := validateRouterPolicies(name, rc)
+	if err != nil {
+		return nil, err
+	}
+	cacheCap := rc.ClassifierCacheSize
+	if cacheCap == 0 {
+		cacheCap = 1024
+	}
+
+	var inner router.Classifier
+	switch name {
+	case router.ClassifierScore:
+		if deps.Scorer == nil {
+			return nil, fmt.Errorf("router classifier score unavailable: no scorer factory wired")
+		}
+		if err := assertClassifierDeclaresScore(rc.ClassifierModel, deps.ModelLookup); err != nil {
+			return nil, err
+		}
+		scorer := deps.Scorer(rc.ClassifierModel)
+		if scorer == nil {
+			return nil, fmt.Errorf("router classifier score: classifier_model %q not loadable", rc.ClassifierModel)
+		}
+		opts := router.ScoreClassifierOptions{
+			CacheCap:             cacheCap,
+			ActivationThreshold:  rc.ActivationThreshold,
+			Normalization:        rc.ScoreNormalization,
+			SystemPromptTemplate: rc.ClassifierSystemTemplate,
+		}
+		// Build the prompt renderer + stop token from the classifier
+		// model's own config when available. Without ModelLookup
+		// (tests, embedded callers) the score classifier's built-in
+		// ChatML defaults kick in, which is correct for Arch-Router.
+		if deps.ModelLookup != nil {
+			if classifierCfg := deps.ModelLookup(rc.ClassifierModel); classifierCfg != nil {
+				if deps.Evaluator != nil {
+					opts.PromptRenderer = newTemplateRenderer(deps.Evaluator, classifierCfg)
+				}
+				if st := pickAssistantTurnEnd(classifierCfg.StopWords, classifierCfg.TemplateConfig.ChatMessage); st != "" {
+					opts.StopToken = st
+				}
+			}
+		}
+		inner = router.NewScoreClassifier(policies, scorer, opts)
+	case router.ClassifierColbert:
+		if deps.Reranker == nil {
+			return nil, fmt.Errorf("router classifier colbert unavailable: no reranker factory wired")
+		}
+		reranker := deps.Reranker(rc.ClassifierModel)
+		if reranker == nil {
+			return nil, fmt.Errorf("router classifier colbert: classifier_model %q not loadable", rc.ClassifierModel)
+		}
+		inner = router.NewRerankClassifier(policies, reranker, cacheCap, rc.ActivationThreshold)
+	default:
+		return nil, fmt.Errorf("router: unknown classifier %q (supported: %s)", name, strings.Join([]string{router.ClassifierScore, router.ClassifierColbert}, ", "))
+	}
+
+	if rc.EmbeddingCache == nil {
+		return inner, nil
+	}
+	wrapped, err := wrapWithEmbeddingCache(cfg, inner, deps)
+	if err != nil {
+		// Caching plumbing problems must not break routing — log,
+		// drop the cache layer, and return the uncached classifier.
+		// The admin UI surfaces the warning via the classifier-build
+		// error path used elsewhere.
+		xlog.Warn("router: embedding cache disabled",
+			"router_model", cfg.Name, "error", err)
+		return inner, nil
+	}
+	return wrapped, nil
+}
+
+// assertClassifierDeclaresScore refuses to build the score classifier
+// unless classifier_model's config declares FLAG_SCORE. The actual
+// usecase-conflict check (score + chat/completion/embeddings on
+// llama-cpp) lives in ModelConfig.Validate() and fires at config load
+// and save time — by the time we get here, any model that reached the
+// loader is already conflict-free. This check just refuses to bind a
+// model that never declared itself for Score in the first place; that
+// model could be a misconfigured chat model the operator pointed at
+// by accident, and without FLAG_SCORE the validator never saw it.
+//
+// When lookup is nil (test wiring) the check is skipped and we fall
+// back to the C++ backend's runtime tripwire as the last line of
+// defence.
+func assertClassifierDeclaresScore(classifierModel string, lookup ModelConfigLookup) error {
+	if lookup == nil {
+		return nil
+	}
+	cfg := lookup(classifierModel)
+	if cfg == nil {
+		// Unknown model — Scorer() will produce a clearer "not
+		// loadable" error a few lines down.
+		return nil
+	}
+	if !cfg.HasUsecases(config.FLAG_SCORE) {
+		return fmt.Errorf(
+			"router classifier score: classifier_model %q does not declare the "+
+				"score usecase. Add `known_usecases: [score]` to its config so "+
+				"the loader can reject conflicting usecase combinations",
+			classifierModel)
+	}
+	return nil
+}
+
+// validateRouterPolicies checks the shared invariants both classifiers
+// rely on (non-empty policies, every candidate label declared as a
+// policy, every candidate has a model + at least one label) and
+// returns the parsed []ScorePolicy. Both Score and Rerank classifiers
+// take the same policy shape.
+func validateRouterPolicies(classifierName string, rc config.RouterConfig) ([]router.ScorePolicy, error) {
+	if rc.ClassifierModel == "" {
+		return nil, fmt.Errorf("router classifier %s requires classifier_model", classifierName)
+	}
+	if len(rc.Policies) == 0 {
+		return nil, fmt.Errorf("router classifier %s requires at least one policy", classifierName)
+	}
+	policies := make([]router.ScorePolicy, 0, len(rc.Policies))
+	for _, p := range rc.Policies {
+		if p.Label == "" {
+			return nil, fmt.Errorf("router classifier %s: policy with empty label", classifierName)
+		}
+		if p.Description == "" {
+			return nil, fmt.Errorf("router classifier %s: policy %q has no description", classifierName, p.Label)
+		}
+		policies = append(policies, router.ScorePolicy{Label: p.Label, Description: p.Description})
+	}
+	policyLabels := make(map[string]struct{}, len(policies))
+	for _, p := range policies {
+		policyLabels[p.Label] = struct{}{}
+	}
+	for _, c := range rc.Candidates {
+		if c.Model == "" {
+			return nil, fmt.Errorf("router classifier %s: candidate has empty model field", classifierName)
+		}
+		if len(c.Labels) == 0 {
+			return nil, fmt.Errorf("router classifier %s: candidate %q has no labels", classifierName, c.Model)
+		}
+		for _, l := range c.Labels {
+			if _, ok := policyLabels[l]; !ok {
+				return nil, fmt.Errorf("router classifier %s: candidate %q references unknown label %q (not in policies)", classifierName, c.Model, l)
+			}
+		}
+	}
+	return policies, nil
+}
+
+// newTemplateRenderer adapts the templates.Evaluator + the classifier
+// model's config into the router.PromptRenderer callback. The
+// resulting renderer pushes the routing system + user prompt through
+// the classifier model's full chat-template pipeline — per-role
+// formatting via TemplateConfig.ChatMessage, then the outer
+// TemplateConfig.Chat — so non-ChatML routing models render
+// correctly without router-package awareness of the template format.
+//
+// We must go through TemplateMessages, not EvaluateTemplateForPrompt
+// directly: the gallery's outer Chat templates are uniformly
+// `{{.Input -}}<|im_start|>assistant` (or the Llama-3 equivalent)
+// and reference {{.Input}} only — never {{.SystemPrompt}}. Passing
+// our routing system prompt through .SystemPrompt would silently
+// drop it because Go text/template ignores unreferenced fields.
+// TemplateMessages instead renders each role through ChatMessage and
+// joins them into the .Input the outer template DOES read.
+//
+// Returns nil (forcing the score classifier's chatMLRenderer
+// fallback) when either template piece is missing — partial
+// templating would still drop content.
+func newTemplateRenderer(eval *templates.Evaluator, classifierCfg *config.ModelConfig) router.PromptRenderer {
+	if classifierCfg.TemplateConfig.Chat == "" || classifierCfg.TemplateConfig.ChatMessage == "" {
+		return nil
+	}
+	cfgCopy := *classifierCfg
+	return func(system, user string) (string, error) {
+		messages := []schema.Message{
+			{Role: "system", StringContent: system},
+			{Role: "user", StringContent: user},
+		}
+		rendered := eval.TemplateMessages(schema.OpenAIRequest{}, messages, &cfgCopy, nil, false)
+		if rendered == "" {
+			return "", fmt.Errorf("router: classifier %q chat template produced empty output", cfgCopy.Name)
+		}
+		return rendered, nil
+	}
+}
+
+// pickAssistantTurnEnd returns the classifier model's assistant
+// turn-end token — the one to suffix candidates with so the model's
+// "I'm done" signal folds into the per-candidate joint log-prob.
+//
+// Strategy: prefer the stopword that *literally appears* in the
+// chat_message template, because that token is the assistant
+// turn-end by construction. ChatML's chat_message ends with
+// "<|im_end|>", Llama-3's ends with "<|eot_id|>", etc. — the
+// template is the source of truth.
+//
+// Fallback: the first non-empty stopword. That's right for
+// well-ordered configs (ChatML conventionally lists <|im_end|>
+// first) but wrong for some gallery Llama-3 templates that defensively
+// list <|im_end|> first even though the actual turn-end is <|eot_id|>.
+// The template-scan above catches those.
+//
+// When no stopwords are configured at all, return "" — caller falls
+// back to defaultStopToken (<|im_end|>) inside the score classifier.
+func pickAssistantTurnEnd(words []string, chatMessageTemplate string) string {
+	if chatMessageTemplate != "" {
+		for _, w := range words {
+			if w != "" && strings.Contains(chatMessageTemplate, w) {
+				return w
+			}
+		}
+	}
+	for _, w := range words {
+		if w != "" {
+			return w
+		}
+	}
+	return ""
+}
+
+func wrapWithEmbeddingCache(cfg *config.ModelConfig, inner router.Classifier, deps ClassifierDeps) (router.Classifier, error) {
+	ec := cfg.Router.EmbeddingCache
+	if ec.EmbeddingModel == "" {
+		return nil, fmt.Errorf("embedding_cache requires embedding_model")
+	}
+	if deps.Embedder == nil || deps.VectorStore == nil {
+		return nil, fmt.Errorf("embedding cache factories not wired")
+	}
+	embedder := deps.Embedder(ec.EmbeddingModel)
+	if embedder == nil {
+		return nil, fmt.Errorf("embedding_model %q not loadable", ec.EmbeddingModel)
+	}
+	storeName := ec.StoreName
+	if storeName == "" {
+		storeName = "router-cache-" + cfg.Name
+	}
+	vstore := deps.VectorStore(storeName)
+	if vstore == nil {
+		return nil, fmt.Errorf("vector store %q not loadable", storeName)
+	}
+	return router.NewEmbeddingCacheClassifier(inner, embedder, vstore, ec.SimilarityThreshold, ec.ConfidenceThreshold), nil
+}
+
+func newDecisionID() string {
+	var b [12]byte
+	_, _ = rand.Read(b[:])
+	return "rd_" + hex.EncodeToString(b[:])
+}
+
+// OpenAIProbe extracts a router.Probe from a parsed *schema.OpenAIRequest.
+// Concatenates message contents (string-form or text blocks of the
+// structured `[]any` content) so the classifier sees a single corpus
+// for length and content-shape rules. Image blocks are skipped — a
+// future multimodal classifier can take a different route.
+func OpenAIProbe(parsed any) (router.Probe, bool) {
+	req, ok := parsed.(*schema.OpenAIRequest)
+	if !ok || req == nil {
+		return router.Probe{}, false
+	}
+	return OpenAIProbeFromRequest(req), true
+}
+
+// OpenAIProbeFromRequest is the typed counterpart of OpenAIProbe — same
+// extraction logic, but takes the request struct directly. Realtime and
+// other non-HTTP callers use it to feed a probe to router.Resolve
+// without going through an echo.Context first.
+func OpenAIProbeFromRequest(req *schema.OpenAIRequest) router.Probe {
+	if req == nil {
+		return router.Probe{}
+	}
+	var b strings.Builder
+	for i := range req.Messages {
+		switch ct := req.Messages[i].Content.(type) {
+		case string:
+			b.WriteString(ct)
+			b.WriteByte('\n')
+		case []any:
+			for _, block := range ct {
+				if bm, ok := block.(map[string]any); ok && bm["type"] == "text" {
+					if t, ok := bm["text"].(string); ok {
+						b.WriteString(t)
+						b.WriteByte('\n')
+					}
+				}
+			}
+		}
+	}
+	return router.Probe{Prompt: b.String()}
+}
+
+// AnthropicProbe is the AnthropicRequest analogue of OpenAIProbe.
+func AnthropicProbe(parsed any) (router.Probe, bool) {
+	req, ok := parsed.(*schema.AnthropicRequest)
+	if !ok || req == nil {
+		return router.Probe{}, false
+	}
+	var b strings.Builder
+	for i := range req.Messages {
+		switch ct := req.Messages[i].Content.(type) {
+		case string:
+			b.WriteString(ct)
+			b.WriteByte('\n')
+		case []any:
+			for _, block := range ct {
+				if bm, ok := block.(map[string]any); ok && bm["type"] == "text" {
+					if t, ok := bm["text"].(string); ok {
+						b.WriteString(t)
+						b.WriteByte('\n')
+					}
+				}
+			}
+		}
+	}
+	return router.Probe{
+		Prompt: b.String(),
+	}, true
+}
+
diff --git a/core/http/middleware/route_model_test.go b/core/http/middleware/route_model_test.go
new file mode 100644
index 000000000..32cb8afb9
--- /dev/null
+++ b/core/http/middleware/route_model_test.go
@@ -0,0 +1,551 @@
+package middleware_test
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	. "github.com/mudler/LocalAI/core/http/middleware"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/router"
+	"github.com/mudler/LocalAI/core/templates"
+	"github.com/mudler/LocalAI/pkg/system"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"gopkg.in/yaml.v3"
+)
+
+// The RouteModel middleware wires the score classifier into request
+// rewriting. The classifier itself is covered in
+// router/score_test.go — these specs pin the middleware-level
+// behaviour: candidate matching against the active label set, the
+// fallback path, and the depth-1 invariant.
+
+var _ = Describe("RouteModel middleware (score classifier)", func() {
+	var (
+		modelDir  string
+		appConfig *config.ApplicationConfig
+		loader    *config.ModelConfigLoader
+		store     *fakeDecisionStore
+	)
+
+	BeforeEach(func() {
+		d, err := os.MkdirTemp("", "router-test-*")
+		Expect(err).NotTo(HaveOccurred())
+		modelDir = d
+		appConfig = &config.ApplicationConfig{
+			Context:     context.Background(),
+			SystemState: &system.SystemState{Model: system.Model{ModelsPath: modelDir}},
+		}
+		loader = config.NewModelConfigLoader(modelDir)
+		store = &fakeDecisionStore{}
+	})
+
+	AfterEach(func() {
+		_ = os.RemoveAll(modelDir)
+	})
+
+	It("routes to a candidate whose labels cover the active set", func() {
+		// 3 policies, 2 candidates. Small model has [casual-chat],
+		// bigger has [code-generation, math-reasoning, casual-chat].
+		// A query that activates code-generation should fall to the
+		// bigger candidate because it's the only one that covers it.
+		routerCfg := newScoreRouterModel(modelDir, "smart-router")
+		writeCandidate(modelDir, "small-model")
+		writeCandidate(modelDir, "big-model")
+
+		s := &stubScorer{labelToLogProb: map[string]float64{
+			"code-generation": -0.05, // dominant
+			"casual-chat":     -3.0,
+			"math-reasoning":  -4.0,
+		}}
+		rec, err := runRouter(loader, appConfig, store, routerCfg, openAIChat("debug my Go null pointer"), stubScorerFactory(s))
+		Expect(err).NotTo(HaveOccurred())
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(rec.Body.String()).To(Equal("served:big-model"))
+		Expect(store.records).To(HaveLen(1))
+		Expect(store.records[0].ServedModel).To(Equal("big-model"))
+		Expect(store.records[0].Label).To(ContainSubstring("code-generation"))
+	})
+
+	It("prefers the smaller candidate when both cover the active set", func() {
+		// Both candidates list casual-chat. Admins order small →
+		// big, so a casual-chat-only request must route to small.
+		routerCfg := newScoreRouterModel(modelDir, "smart-router")
+		writeCandidate(modelDir, "small-model")
+		writeCandidate(modelDir, "big-model")
+
+		s := &stubScorer{labelToLogProb: map[string]float64{
+			"code-generation": -5.0,
+			"casual-chat":     -0.05, // dominant
+			"math-reasoning":  -5.0,
+		}}
+		rec, err := runRouter(loader, appConfig, store, routerCfg, openAIChat("hi"), stubScorerFactory(s))
+		Expect(err).NotTo(HaveOccurred())
+		Expect(rec.Body.String()).To(Equal("served:small-model"))
+	})
+
+	It("falls back when no candidate covers the active label set", func() {
+		// Only the bigger candidate covers math-reasoning. We
+		// deliberately drop it from the candidates list so neither
+		// matches; expect Fallback to fire.
+		routerCfg := newScoreRouterModel(modelDir, "smart-router")
+		// Remove the second candidate so coverage gap appears.
+		routerCfg.Router.Candidates = routerCfg.Router.Candidates[:1]
+		writeCandidate(modelDir, "small-model")
+		writeCandidate(modelDir, "qwen3-0.6b")
+
+		s := &stubScorer{labelToLogProb: map[string]float64{
+			"code-generation": -5.0,
+			"casual-chat":     -5.0,
+			"math-reasoning":  -0.05, // dominant — but no candidate has it
+		}}
+		rec, err := runRouter(loader, appConfig, store, routerCfg, openAIChat("3 apples cost $2.40"), stubScorerFactory(s))
+		Expect(err).NotTo(HaveOccurred())
+		Expect(rec.Body.String()).To(Equal("served:qwen3-0.6b"))
+	})
+
+	It("rejects candidates that reference unknown labels at build time", func() {
+		routerCfg := newScoreRouterModel(modelDir, "smart-router")
+		routerCfg.Router.Candidates = append(routerCfg.Router.Candidates, config.RouterCandidate{
+			Model:  "broken",
+			Labels: []string{"nonexistent-label"},
+		})
+		writeCandidate(modelDir, "small-model")
+		writeCandidate(modelDir, "big-model")
+		writeCandidate(modelDir, "broken")
+		writeCandidate(modelDir, "qwen3-0.6b")
+
+		s := &stubScorer{labelToLogProb: map[string]float64{
+			"code-generation": -0.05,
+			"casual-chat":     -3.0,
+			"math-reasoning":  -4.0,
+		}}
+		_, err := runRouter(loader, appConfig, store, routerCfg, openAIChat("debug something"), stubScorerFactory(s))
+		// Build-time config bugs (here: a candidate referencing a
+		// label not declared in policies) must surface to the client
+		// — the previous silent-fallback behaviour hid the broken
+		// config and left operators wondering why traces never showed
+		// the classifier model running.
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("unknown label"))
+	})
+
+	It("returns 500 when the candidate is itself a router (depth-1 invariant)", func() {
+		// The candidate model is itself a router. We must reject
+		// the dispatch — chained routers are deliberately
+		// disallowed.
+		routerCfg := newScoreRouterModel(modelDir, "smart-router")
+		// Bend the test setup: replace one of the candidate-model
+		// configs with a nested-router config.
+		nestedRouter := newScoreRouterModel(modelDir, "small-model")
+		Expect(os.WriteFile(filepath.Join(modelDir, "small-model.yaml"), []byte(toYAML(nestedRouter)), 0o644)).To(Succeed())
+		writeCandidate(modelDir, "big-model")
+		writeCandidate(modelDir, "qwen3-0.6b")
+
+		s := &stubScorer{labelToLogProb: map[string]float64{
+			"code-generation": -5.0,
+			"casual-chat":     -0.05,
+			"math-reasoning":  -5.0,
+		}}
+		_, err := runRouter(loader, appConfig, store, routerCfg, openAIChat("hi"), stubScorerFactory(s))
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("depth-1 invariant"))
+	})
+})
+
+// Regression coverage for the rendered routing prompt — pins the
+// guarantee that the routing system prompt (route listing, JSON
+// output schema) actually reaches the classifier model. The first
+// implementation of the template-aware renderer routed through
+// EvaluateTemplateForPrompt, which only invokes the outer Chat
+// template — and the gallery's outer Chat templates are
+// `{{.Input -}}<|im_start|>assistant` shape, so .SystemPrompt was
+// silently dropped. The fix routes through TemplateMessages, which
+// renders each role through ChatMessage and joins the result into
+// .Input. These specs would fail loudly if the renderer ever
+// regresses back to bypassing per-role formatting.
+var _ = Describe("RouteModel rendered classifier prompt", func() {
+	var (
+		modelDir  string
+		appConfig *config.ApplicationConfig
+		loader    *config.ModelConfigLoader
+		store     *fakeDecisionStore
+		eval      *templates.Evaluator
+	)
+
+	BeforeEach(func() {
+		d, err := os.MkdirTemp("", "router-render-*")
+		Expect(err).NotTo(HaveOccurred())
+		modelDir = d
+		appConfig = &config.ApplicationConfig{
+			Context:     context.Background(),
+			SystemState: &system.SystemState{Model: system.Model{ModelsPath: modelDir}},
+		}
+		loader = config.NewModelConfigLoader(modelDir)
+		store = &fakeDecisionStore{}
+		eval = templates.NewEvaluator(modelDir)
+	})
+
+	AfterEach(func() {
+		_ = os.RemoveAll(modelDir)
+	})
+
+	It("includes the routing system prompt in the rendered ChatML envelope", func() {
+		// Mirrors the live arch-router-1.5b.yaml: chatml-style chat +
+		// chat_message templates. This is the production-wired path.
+		writeChatMLClassifierModel(modelDir, "arch-router")
+		routerCfg := newScoreRouterModel(modelDir, "smart-router")
+
+		s := &stubScorer{labelToLogProb: map[string]float64{
+			"code-generation": -0.05,
+			"casual-chat":     -3.0,
+			"math-reasoning":  -4.0,
+		}}
+		_, err := runRouterWithDeps(loader, appConfig, store, routerCfg,
+			openAIChat("debug this null pointer"),
+			ClassifierDeps{
+				Scorer:      stubScorerFactory(s),
+				ModelLookup: loaderLookup(loader, appConfig),
+				Evaluator:   eval,
+			})
+		Expect(err).NotTo(HaveOccurred())
+
+		// The routing system prompt must reach the scorer. Three
+		// anchors: the route-listing block, one of the JSON-shaped
+		// route entries (escapeJSONString preserves the description),
+		// and the JSON output schema instruction.
+		Expect(s.lastPrompt).To(ContainSubstring("<routes>"),
+			"system prompt dropped: rendered prompt missing route-listing block. got: %q", s.lastPrompt)
+		Expect(s.lastPrompt).To(ContainSubstring(`{"name": "code-generation"`),
+			"system prompt dropped: rendered prompt missing route entries. got: %q", s.lastPrompt)
+		Expect(s.lastPrompt).To(ContainSubstring(`{"route": "<name>"}`),
+			"system prompt dropped: rendered prompt missing JSON output schema. got: %q", s.lastPrompt)
+
+		// And the per-role envelope must be present (proves we went
+		// through ChatMessage, not the SystemPrompt-only path).
+		Expect(s.lastPrompt).To(ContainSubstring("<|im_start|>system"),
+			"system role marker missing — ChatMessage template wasn't invoked")
+		Expect(s.lastPrompt).To(ContainSubstring("<|im_start|>user"),
+			"user role marker missing")
+		// User probe makes it through the per-role template. The trailing
+		// \n on the probe content is added by OpenAIProbeFromRequest;
+		// preserved through ChatMessage rendering.
+		Expect(s.lastPrompt).To(ContainSubstring("debug this null pointer"),
+			"user probe missing from rendered prompt")
+		// Outer Chat template must add the assistant-open marker so
+		// the scorer's first predicted token is the start of the
+		// candidate.
+		Expect(s.lastPrompt).To(MatchRegexp(`<\|im_start\|>assistant\s*$`),
+			"rendered prompt must end at assistant-open marker. got: %q", s.lastPrompt)
+	})
+
+	It("falls back to chatMLRenderer when the classifier model has no chat_message template", func() {
+		// Partial template config: only outer Chat, no per-role
+		// piece. The renderer must refuse rather than emit a prompt
+		// that drops the system turn, so the score classifier's
+		// built-in ChatML default takes over.
+		writePartialClassifierModel(modelDir, "arch-router")
+		routerCfg := newScoreRouterModel(modelDir, "smart-router")
+
+		s := &stubScorer{labelToLogProb: map[string]float64{
+			"code-generation": -0.05,
+			"casual-chat":     -3.0,
+			"math-reasoning":  -4.0,
+		}}
+		_, err := runRouterWithDeps(loader, appConfig, store, routerCfg,
+			openAIChat("hello world"),
+			ClassifierDeps{
+				Scorer:      stubScorerFactory(s),
+				ModelLookup: loaderLookup(loader, appConfig),
+				Evaluator:   eval,
+			})
+		Expect(err).NotTo(HaveOccurred())
+
+		// chatMLRenderer fallback emits its own envelope and still
+		// embeds the routing system prompt. OpenAIProbeFromRequest
+		// appends "\n" after each message body, so the user content
+		// reaches the renderer as "hello world\n" — the substring
+		// match accounts for that.
+		Expect(s.lastPrompt).To(ContainSubstring("<routes>"),
+			"fallback renderer also dropped the system prompt")
+		Expect(s.lastPrompt).To(ContainSubstring("<|im_start|>system\n"))
+		Expect(s.lastPrompt).To(ContainSubstring("<|im_start|>user\nhello world\n<|im_end|>"))
+		Expect(strings.HasSuffix(s.lastPrompt, "<|im_start|>assistant\n")).To(BeTrue(),
+			"chatMLRenderer fallback must end at assistant-open marker. got: %q", s.lastPrompt)
+	})
+
+	It("uses the classifier model's first stopword as the candidate suffix", func() {
+		writeChatMLClassifierModel(modelDir, "arch-router")
+		routerCfg := newScoreRouterModel(modelDir, "smart-router")
+
+		s := &stubScorer{labelToLogProb: map[string]float64{
+			"code-generation": -0.05,
+			"casual-chat":     -3.0,
+			"math-reasoning":  -4.0,
+		}}
+		_, err := runRouterWithDeps(loader, appConfig, store, routerCfg,
+			openAIChat("hi"),
+			ClassifierDeps{
+				Scorer:      stubScorerFactory(s),
+				ModelLookup: loaderLookup(loader, appConfig),
+				Evaluator:   eval,
+			})
+		Expect(err).NotTo(HaveOccurred())
+		// arch-router YAML lists <|im_end|> first.
+		for _, c := range s.lastCandidates {
+			Expect(c).To(HaveSuffix("<|im_end|>"),
+				"candidate must end with the classifier model's turn-end token. got: %q", c)
+		}
+	})
+
+	It("picks the actual turn-end token when the stopwords list is misordered (Llama-3 style)", func() {
+		// gallery/llama3-instruct.yaml et al. defensively list
+		// <|im_end|> first even though the actual Llama-3 assistant
+		// turn-end is <|eot_id|>. The naive "stopwords[0]" pick would
+		// suffix candidates with <|im_end|> — a token Llama-3 never
+		// emits at turn end. pickAssistantTurnEnd should scan the
+		// chat_message template and recognise <|eot_id|> as the real
+		// turn-end.
+		writeLlama3StyleClassifierModel(modelDir, "arch-router")
+		routerCfg := newScoreRouterModel(modelDir, "smart-router")
+
+		s := &stubScorer{labelToLogProb: map[string]float64{
+			"code-generation": -0.05,
+			"casual-chat":     -3.0,
+			"math-reasoning":  -4.0,
+		}}
+		_, err := runRouterWithDeps(loader, appConfig, store, routerCfg,
+			openAIChat("hi"),
+			ClassifierDeps{
+				Scorer:      stubScorerFactory(s),
+				ModelLookup: loaderLookup(loader, appConfig),
+				Evaluator:   eval,
+			})
+		Expect(err).NotTo(HaveOccurred())
+		for _, c := range s.lastCandidates {
+			Expect(c).To(HaveSuffix("<|eot_id|>"),
+				"candidate must end with the Llama-3 turn-end token, not the misordered first stopword. got: %q", c)
+		}
+	})
+})
+
+// --- helpers ---
+
+// stubScorer scores each candidate label according to a fixed
+// label→log-prob map; per-token length is faked at 2 tokens so length
+// normalisation is a no-op. Captures the prompt + candidate list of
+// the last Score call so regression tests can pin the rendered prompt
+// shape.
+type stubScorer struct {
+	labelToLogProb map[string]float64
+	lastPrompt     string
+	lastCandidates []string
+}
+
+func (s *stubScorer) Score(_ context.Context, prompt string, candidates []string) ([]backend.CandidateScore, error) {
+	s.lastPrompt = prompt
+	s.lastCandidates = append([]string(nil), candidates...)
+	out := make([]backend.CandidateScore, len(candidates))
+	for i, c := range candidates {
+		// Match against the full `{"route": "<label>"}` envelope.
+		// Naively substring-matching on `"<label>"` would let a label
+		// that's a substring of another collide via Go's randomised
+		// map iteration order — `"code"` would also match the
+		// `"code-generation"` candidate.
+		var lp float64
+		for label, v := range s.labelToLogProb {
+			if strings.Contains(c, `{"route": "`+label+`"}`) {
+				lp = v
+				break
+			}
+		}
+		out[i] = backend.CandidateScore{
+			LogProb:                 lp * 2,
+			LengthNormalizedLogProb: lp,
+			NumTokens:               2,
+		}
+	}
+	return out, nil
+}
+
+func stubScorerFactory(s *stubScorer) ScorerFactory {
+	return func(string) backend.Scorer { return s }
+}
+
+type fakeDecisionStore struct {
+	records []router.DecisionRecord
+}
+
+func (f *fakeDecisionStore) Record(_ context.Context, r router.DecisionRecord) error {
+	f.records = append(f.records, r)
+	return nil
+}
+
+func (f *fakeDecisionStore) List(_ context.Context, _ router.DecisionListQuery) ([]router.DecisionRecord, error) {
+	out := append([]router.DecisionRecord(nil), f.records...)
+	return out, nil
+}
+
+func (f *fakeDecisionStore) Close() error                         { return nil }
+func (f *fakeDecisionStore) Count(_ context.Context) (int, error) { return len(f.records), nil }
+
+// newScoreRouterModel builds a smart-router config with 3 policies
+// and 2 candidates (small with one label, bigger with all three).
+// Admins are expected to order candidates small → large; the
+// middleware picks the first whose labels are a superset of the
+// active set.
+func newScoreRouterModel(modelDir, name string) *config.ModelConfig {
+	cfg := &config.ModelConfig{
+		Name: name,
+		Router: config.RouterConfig{
+			Classifier:      "score",
+			ClassifierModel: "arch-router",
+			Fallback:        "qwen3-0.6b",
+			Policies: []config.RouterPolicy{
+				{Label: "code-generation", Description: "writing or debugging code"},
+				{Label: "casual-chat", Description: "small talk"},
+				{Label: "math-reasoning", Description: "arithmetic and word problems"},
+			},
+			Candidates: []config.RouterCandidate{
+				{Model: "small-model", Labels: []string{"casual-chat"}},
+				{Model: "big-model", Labels: []string{"code-generation", "casual-chat", "math-reasoning"}},
+			},
+		},
+	}
+	Expect(os.WriteFile(filepath.Join(modelDir, name+".yaml"), []byte(toYAML(cfg)), 0o644)).To(Succeed())
+	return cfg
+}
+
+func writeCandidate(modelDir, name string) {
+	body := "name: " + name + "\nbackend: mock-backend\n"
+	Expect(os.WriteFile(filepath.Join(modelDir, name+".yaml"), []byte(body), 0o644)).To(Succeed())
+}
+
+func toYAML(cfg *config.ModelConfig) string {
+	b, err := yaml.Marshal(cfg)
+	Expect(err).NotTo(HaveOccurred())
+	return string(b)
+}
+
+func openAIChat(content string) *schema.OpenAIRequest {
+	req := &schema.OpenAIRequest{
+		Messages: []schema.Message{
+			{Role: "user", Content: content},
+		},
+	}
+	req.Model = "smart-router"
+	return req
+}
+
+func runRouter(loader *config.ModelConfigLoader, appConfig *config.ApplicationConfig, store router.DecisionStore, routerCfg *config.ModelConfig, parsed any, scorerFactory ScorerFactory) (*httptest.ResponseRecorder, error) {
+	return runRouterWithDeps(loader, appConfig, store, routerCfg, parsed, ClassifierDeps{Scorer: scorerFactory})
+}
+
+// runRouterWithDeps is runRouter's general form: lets the caller pass
+// a fully-populated ClassifierDeps (ModelLookup, Evaluator, ...) so
+// tests can exercise the template-renderer + stop-token derivation
+// paths, not just the bare-scorer fast path.
+func runRouterWithDeps(loader *config.ModelConfigLoader, appConfig *config.ApplicationConfig, store router.DecisionStore, routerCfg *config.ModelConfig, parsed any, deps ClassifierDeps) (*httptest.ResponseRecorder, error) {
+	mw := RouteModel(loader, appConfig, store, nil, OpenAIProbe, router.SourceChat, deps)
+	req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader("{}"))
+	rec := httptest.NewRecorder()
+	c := echo.New().NewContext(req, rec)
+	c.Set(CONTEXT_LOCALS_KEY_MODEL_CONFIG, routerCfg)
+	c.Set(CONTEXT_LOCALS_KEY_LOCALAI_REQUEST, parsed)
+	handler := mw(func(c echo.Context) error {
+		served, _ := c.Get(ContextKeyServedModel).(string)
+		return c.String(http.StatusOK, "served:"+served)
+	})
+	err := handler(c)
+	return rec, err
+}
+
+// loaderLookup mirrors application.ModelConfigLookup — bridges the
+// loader to the ModelConfigLookup signature ClassifierDeps wants.
+func loaderLookup(loader *config.ModelConfigLoader, appConfig *config.ApplicationConfig) ModelConfigLookup {
+	return func(name string) *config.ModelConfig {
+		cfg, err := loader.LoadModelConfigFileByNameDefaultOptions(name, appConfig)
+		if err != nil || cfg == nil {
+			return nil
+		}
+		return cfg
+	}
+}
+
+// writeChatMLClassifierModel writes a classifier model YAML that
+// mirrors the live arch-router-1.5b.yaml shipped at
+// volumes/models/arch-router-1.5b.yaml: ChatML chat + chat_message
+// templates, score usecase, <|im_end|> first in stopwords.
+func writeChatMLClassifierModel(modelDir, name string) {
+	body := `name: ` + name + `
+backend: llama-cpp
+known_usecases:
+  - score
+stopwords:
+  - <|im_end|>
+  - <|endoftext|>
+template:
+  chat: |
+    {{.Input -}}
+    <|im_start|>assistant
+  chat_message: |
+    <|im_start|>{{ .RoleName }}
+    {{- if .Content }}
+    {{ .Content }}
+    {{- end }}<|im_end|>
+`
+	Expect(os.WriteFile(filepath.Join(modelDir, name+".yaml"), []byte(body), 0o644)).To(Succeed())
+}
+
+// writeLlama3StyleClassifierModel writes a classifier model mirroring
+// gallery/llama3-instruct.yaml — stopwords defensively list <|im_end|>
+// first even though the assistant turn-end is actually <|eot_id|>.
+// Exercises pickAssistantTurnEnd's template scan: the right token is
+// the one that appears in chat_message, not the one at position 0.
+func writeLlama3StyleClassifierModel(modelDir, name string) {
+	body := `name: ` + name + `
+backend: llama-cpp
+known_usecases:
+  - score
+stopwords:
+  - <|im_end|>
+  - <dummy32000>
+  - "<|eot_id|>"
+  - <|end_of_text|>
+template:
+  chat: |
+    {{.Input }}
+    <|start_header_id|>assistant<|end_header_id|>
+  chat_message: |
+    <|start_header_id|>{{ .RoleName }}<|end_header_id|>
+
+    {{ .Content }}<|eot_id|>
+`
+	Expect(os.WriteFile(filepath.Join(modelDir, name+".yaml"), []byte(body), 0o644)).To(Succeed())
+}
+
+// writePartialClassifierModel writes a classifier model that has the
+// outer Chat template but no ChatMessage — exercises the
+// newTemplateRenderer "refuse partial templating" branch that hands
+// off to chatMLRenderer.
+func writePartialClassifierModel(modelDir, name string) {
+	body := `name: ` + name + `
+backend: llama-cpp
+known_usecases:
+  - score
+stopwords:
+  - <|im_end|>
+template:
+  chat: |
+    {{.Input -}}
+    <|im_start|>assistant
+`
+	Expect(os.WriteFile(filepath.Join(modelDir, name+".yaml"), []byte(body), 0o644)).To(Succeed())
+}
diff --git a/core/http/middleware/trace.go b/core/http/middleware/trace.go
index 7bf661aab..9a3da8dea 100644
--- a/core/http/middleware/trace.go
+++ b/core/http/middleware/trace.go
@@ -1,9 +1,11 @@
 package middleware
 
 import (
+	"bufio"
 	"bytes"
 	"io"
 	"mime"
+	"net"
 	"net/http"
 	"slices"
 	"sync"
@@ -116,6 +118,16 @@ func truncateForTrace(body []byte, maxBytes int) ([]byte, bool) {
 	return out, true
 }
 
+// Hijack lets WebSocket upgraders (gorilla/websocket) reach the
+// underlying connection. Without this, gorilla's Hijacker type-assertion
+// fails on the wrapped writer and the handshake returns 500.
+func (w *bodyWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
+	if hj, ok := w.ResponseWriter.(http.Hijacker); ok {
+		return hj.Hijack()
+	}
+	return nil, nil, http.ErrNotSupported
+}
+
 func initializeTracing(maxItems int) {
 	tracingMaxItems = maxItems
 	doInitializeTracing()
diff --git a/core/http/middleware/usage.go b/core/http/middleware/usage.go
index 6dc4699b8..ff0820db6 100644
--- a/core/http/middleware/usage.go
+++ b/core/http/middleware/usage.go
@@ -2,165 +2,19 @@ package middleware
 
 import (
 	"bytes"
+	"context"
 	"encoding/json"
-	"sync"
-	"sync/atomic"
 	"time"
 
 	"github.com/labstack/echo/v4"
 	"github.com/mudler/LocalAI/core/http/auth"
+	"github.com/mudler/LocalAI/core/services/routing/billing"
 	"github.com/mudler/xlog"
-	"gorm.io/gorm"
 )
 
-const (
-	usageFlushInterval = 5 * time.Second
-	// usageMaxPending bounds the in-memory queue. Sized for bursty inference
-	// traffic on a self-hosted instance with a slow or unavailable DB.
-	usageMaxPending = 50000
-)
-
-// usageBatcher accumulates usage records and flushes them to the DB periodically.
-type usageBatcher struct {
-	mu       sync.Mutex
-	pending  []*auth.UsageRecord
-	db       *gorm.DB
-	stop     chan struct{}
-	done     chan struct{}
-	stopOnce sync.Once
-}
-
-// droppedRecords counts records discarded because the in-memory queue was full.
-// Used to rate-limit the warn log so a sustained outage doesn't flood it.
-var droppedRecords atomic.Uint64
-
-func (b *usageBatcher) add(r *auth.UsageRecord) {
-	b.mu.Lock()
-	if len(b.pending) >= usageMaxPending {
-		b.mu.Unlock()
-		// Rate-limit: one warn per 1024 drops keeps the log readable.
-		n := droppedRecords.Add(1)
-		if n&1023 == 1 {
-			xlog.Warn("usage batcher full, dropping record",
-				"cap", usageMaxPending, "total_dropped", n)
-		}
-		return
-	}
-	b.pending = append(b.pending, r)
-	b.mu.Unlock()
-}
-
-func (b *usageBatcher) flush() {
-	b.mu.Lock()
-	batch := b.pending
-	b.pending = nil
-	b.mu.Unlock()
-
-	if len(batch) == 0 {
-		return
-	}
-
-	if err := b.db.Create(&batch).Error; err != nil {
-		xlog.Error("Failed to flush usage batch", "count", len(batch), "error", err)
-		// Cap-aware re-queue: prepend as much of the failed batch as fits
-		// alongside any records added concurrently with the failed write.
-		b.mu.Lock()
-		room := usageMaxPending - len(b.pending)
-		if room > 0 {
-			if room > len(batch) {
-				room = len(batch)
-			}
-			b.pending = append(batch[:room], b.pending...)
-		}
-		b.mu.Unlock()
-	}
-}
-
-func (b *usageBatcher) run() {
-	defer close(b.done)
-	ticker := time.NewTicker(usageFlushInterval)
-	defer ticker.Stop()
-	for {
-		select {
-		case <-ticker.C:
-			b.flush()
-		case <-b.stop:
-			b.flush() // final drain
-			return
-		}
-	}
-}
-
-func (b *usageBatcher) shutdown() {
-	b.stopOnce.Do(func() {
-		close(b.stop)
-		<-b.done
-	})
-}
-
-// The package-level batcher is guarded by batcherMu so Init / Shutdown cycles
-// (the test pattern) don't race against UsageMiddleware reads.
-var (
-	batcherMu sync.RWMutex
-	batcher   *usageBatcher
-)
-
-func currentBatcher() *usageBatcher {
-	batcherMu.RLock()
-	defer batcherMu.RUnlock()
-	return batcher
-}
-
-// InitUsageRecorder starts a background goroutine that periodically flushes
-// accumulated usage records to the database. Calling it more than once
-// shuts down the previous batcher first so its goroutine doesn't leak.
-func InitUsageRecorder(db *gorm.DB) {
-	if db == nil {
-		return
-	}
-
-	batcherMu.Lock()
-	old := batcher
-	batcher = nil
-	batcherMu.Unlock()
-	if old != nil {
-		old.shutdown()
-	}
-
-	b := &usageBatcher{
-		db:   db,
-		stop: make(chan struct{}),
-		done: make(chan struct{}),
-	}
-	batcherMu.Lock()
-	batcher = b
-	batcherMu.Unlock()
-
-	go b.run()
-}
-
-// ShutdownUsageRecorder stops the background flusher and synchronously drains
-// pending records once. Safe to call multiple times. Not yet wired into the
-// application lifecycle; intended for graceful process exit and tests.
-func ShutdownUsageRecorder() {
-	batcherMu.Lock()
-	b := batcher
-	batcher = nil
-	batcherMu.Unlock()
-	if b != nil {
-		b.shutdown()
-	}
-}
-
-// FlushNow synchronously flushes any pending usage records. Intended for tests
-// that need deterministic behaviour without waiting for the ticker.
-func FlushNow() {
-	if b := currentBatcher(); b != nil {
-		b.flush()
-	}
-}
-
-// usageResponseBody is the minimal structure we need from the response JSON.
+// usageResponseBody is the minimal structure we need from an OpenAI-shaped
+// JSON response. Anthropic responses are decoded separately because their
+// usage block uses different field names (input_tokens / output_tokens).
 type usageResponseBody struct {
 	Model string `json:"model"`
 	Usage *struct {
@@ -170,19 +24,47 @@ type usageResponseBody struct {
 	} `json:"usage"`
 }
 
-// UsageMiddleware extracts token usage from OpenAI-compatible response JSON
-// and records it per-user.
-func UsageMiddleware(db *gorm.DB) echo.MiddlewareFunc {
+// anthropicResponseBody covers /v1/messages JSON responses.
+type anthropicResponseBody struct {
+	Model string `json:"model"`
+	Usage *struct {
+		InputTokens  int64 `json:"input_tokens"`
+		OutputTokens int64 `json:"output_tokens"`
+	} `json:"usage"`
+}
+
+// UsageMiddleware records token usage for inference requests via the
+// billing.Recorder. Two paths produce a record:
+//
+//  1. Handler-stamped (preferred): the request handler called
+//     middleware.StampUsage with the canonical token counts before
+//     returning. This is the only reliable path for streaming responses
+//     — clients rarely set OpenAI's stream_options.include_usage, and
+//     Anthropic's usage lives in a separate message_delta event.
+//  2. Body-parsed (fallback): the response is parsed for an OpenAI- or
+//     Anthropic-shaped usage block. Used by passthrough proxies and
+//     foreign endpoints.
+//
+// Recorder being nil (e.g., --disable-stats) makes the middleware a
+// transparent pass-through. fallbackUser is used when auth.GetUser(c)
+// returns nil; without it, an unauthenticated request would be dropped.
+//
+// Every request that fails to produce a record ticks
+// localai_usage_unrecorded_total so silent billing misses are observable.
+func UsageMiddleware(recorder *billing.Recorder, fallbackUser *auth.User) echo.MiddlewareFunc {
 	return func(next echo.HandlerFunc) echo.HandlerFunc {
 		return func(c echo.Context) error {
-			b := currentBatcher()
-			if db == nil || b == nil {
+			if recorder == nil {
 				return next(c)
 			}
 
 			startTime := time.Now()
 
-			// Wrap response writer to capture body
+			// Wrap response writer to capture body for the fallback parser.
+			// When the handler stamps the context we never read this buffer,
+			// so the cost is the per-chunk Write going through one extra
+			// indirection — accepted overhead in exchange for one billing
+			// path that works for both stamping and body-parse callers.
 			resBody := new(bytes.Buffer)
 			origWriter := c.Response().Writer
 			mw := &bodyWriter{
@@ -193,53 +75,34 @@ func UsageMiddleware(db *gorm.DB) echo.MiddlewareFunc {
 
 			handlerErr := next(c)
 
-			// Restore original writer
 			c.Response().Writer = origWriter
 
-			// Only record on successful responses
+			endpoint := c.Request().URL.Path
+
 			if c.Response().Status < 200 || c.Response().Status >= 300 {
 				return handlerErr
 			}
 
-			// Get authenticated user
 			user := auth.GetUser(c)
 			if user == nil {
+				user = fallbackUser
+			}
+			if user == nil || user.ID == "" {
+				billing.CountUnrecorded(context.Background(), endpoint, "no_user")
 				return handlerErr
 			}
 
-			// Try to parse usage from response
-			responseBytes := resBody.Bytes()
-			if len(responseBytes) == 0 {
+			model, prompt, completion, total, ok := tokensFromContext(c)
+			if !ok {
+				model, prompt, completion, total, ok = tokensFromBody(resBody.Bytes(), c.Response().Header().Get("Content-Type"))
+			}
+			if !ok {
+				billing.CountUnrecorded(context.Background(), endpoint, "no_usage")
 				return handlerErr
 			}
 
-			// Check content type
-			ct := c.Response().Header().Get("Content-Type")
-			isJSON := ct == "" || ct == "application/json" || bytes.HasPrefix([]byte(ct), []byte("application/json"))
-			isSSE := bytes.HasPrefix([]byte(ct), []byte("text/event-stream"))
-
-			if !isJSON && !isSSE {
-				return handlerErr
-			}
-
-			var resp usageResponseBody
-			if isSSE {
-				last, ok := lastSSEData(responseBytes)
-				if !ok {
-					return handlerErr
-				}
-				if err := json.Unmarshal(last, &resp); err != nil {
-					return handlerErr
-				}
-			} else {
-				if err := json.Unmarshal(responseBytes, &resp); err != nil {
-					return handlerErr
-				}
-			}
-
-			if resp.Usage == nil {
-				return handlerErr
-			}
+			requested, served := modelsFromContext(c, model)
+			pre, post := promptTokensFromContext(c, prompt)
 
 			source := auth.GetSource(c)
 			if source == "" {
@@ -249,16 +112,21 @@ func UsageMiddleware(db *gorm.DB) echo.MiddlewareFunc {
 			}
 
 			record := &auth.UsageRecord{
-				UserID:           user.ID,
-				UserName:         user.Name,
-				Source:           source,
-				Model:            resp.Model,
-				Endpoint:         c.Request().URL.Path,
-				PromptTokens:     resp.Usage.PromptTokens,
-				CompletionTokens: resp.Usage.CompletionTokens,
-				TotalTokens:      resp.Usage.TotalTokens,
-				Duration:         time.Since(startTime).Milliseconds(),
-				CreatedAt:        startTime,
+				UserID:                 user.ID,
+				UserName:               user.Name,
+				Source:                 source,
+				Model:                  model,
+				Endpoint:               endpoint,
+				PromptTokens:           prompt,
+				CompletionTokens:       completion,
+				TotalTokens:            total,
+				Duration:               time.Since(startTime).Milliseconds(),
+				CreatedAt:              startTime,
+				RequestedModel:         requested,
+				ServedModel:            served,
+				PreFilterPromptTokens:  pre,
+				PostFilterPromptTokens: post,
+				CorrelationID:          correlationIDFromContext(c),
 			}
 
 			if key := auth.GetAPIKey(c); key != nil {
@@ -267,13 +135,145 @@ func UsageMiddleware(db *gorm.DB) echo.MiddlewareFunc {
 				record.APIKeyName = key.Name
 			}
 
-			b.add(record)
+			if err := recorder.Record(context.Background(), record); err != nil {
+				xlog.Error("usage middleware: recorder.Record failed", "error", err, "user", user.ID, "model", model)
+				billing.CountUnrecorded(context.Background(), endpoint, "record_failed")
+			}
 
 			return handlerErr
 		}
 	}
 }
 
+// tokensFromContext returns canonical token counts stamped by a handler
+// via middleware.StampUsage. Returns ok=false when no stamp is present
+// — the caller then tries the body-parse fallback.
+//
+// A model name without token counts is not considered "stamped" because a
+// record with zero tokens looks the same as a never-recorded request to
+// later analytics; the second condition is what gates ok.
+func tokensFromContext(c echo.Context) (model string, prompt, completion, total int64, ok bool) {
+	if v, found := c.Get(ContextKeyResponseModel).(string); found {
+		model = v
+	}
+	pPresent := false
+	cPresent := false
+	if v, found := c.Get(ContextKeyPromptTokens).(int64); found {
+		prompt = v
+		pPresent = true
+	}
+	if v, found := c.Get(ContextKeyCompletionTokens).(int64); found {
+		completion = v
+		cPresent = true
+	}
+	if v, found := c.Get(ContextKeyTotalTokens).(int64); found {
+		total = v
+	} else {
+		total = prompt + completion
+	}
+	ok = pPresent || cPresent
+	return
+}
+
+// tokensFromBody covers the passthrough-proxy / foreign-endpoint case
+// where no handler stamps the context. Returns ok=false on any parse
+// failure or missing-usage; the caller increments the unrecorded counter.
+func tokensFromBody(responseBytes []byte, contentType string) (model string, prompt, completion, total int64, ok bool) {
+	if len(responseBytes) == 0 {
+		return
+	}
+	isJSON := contentType == "" || contentType == "application/json" || bytes.HasPrefix([]byte(contentType), []byte("application/json"))
+	isSSE := bytes.HasPrefix([]byte(contentType), []byte("text/event-stream"))
+	if !isJSON && !isSSE {
+		return
+	}
+
+	payload := responseBytes
+	if isSSE {
+		// For SSE, the canonical usage chunk is the *last* non-[DONE] data
+		// line. OpenAI clients only emit one if stream_options.include_usage
+		// is set; Anthropic emits a final message_delta with usage. Both
+		// fit the "last data: line" rule.
+		last, lastOk := lastSSEData(responseBytes)
+		if !lastOk {
+			return
+		}
+		payload = last
+	}
+
+	// Try OpenAI shape first (handles /v1/chat/completions, /v1/completions,
+	// /v1/embeddings, /v1/edits, and any proxy that translates to OpenAI).
+	// A usage block whose token fields all decoded to zero is ambiguous —
+	// it could be an Anthropic body that happens to have a `usage` key —
+	// so fall through to the Anthropic parser instead of recording zeros.
+	var openAI usageResponseBody
+	if err := json.Unmarshal(payload, &openAI); err == nil && openAI.Usage != nil {
+		if openAI.Usage.PromptTokens != 0 || openAI.Usage.CompletionTokens != 0 || openAI.Usage.TotalTokens != 0 {
+			model = openAI.Model
+			prompt = openAI.Usage.PromptTokens
+			completion = openAI.Usage.CompletionTokens
+			total = openAI.Usage.TotalTokens
+			if total == 0 {
+				total = prompt + completion
+			}
+			ok = true
+			return
+		}
+	}
+
+	// Fall through to Anthropic shape (proxy passthrough territory).
+	var ant anthropicResponseBody
+	if err := json.Unmarshal(payload, &ant); err == nil && ant.Usage != nil {
+		if ant.Usage.InputTokens != 0 || ant.Usage.OutputTokens != 0 {
+			model = ant.Model
+			prompt = ant.Usage.InputTokens
+			completion = ant.Usage.OutputTokens
+			total = prompt + completion
+			ok = true
+			return
+		}
+	}
+
+	return
+}
+
+// modelsFromContext returns (requested, served) using context-set values
+// when present, falling back to the response-reported model for both.
+// The router middleware (subsystem 2 of the routing plan) populates
+// these; until it lands they are equal.
+func modelsFromContext(c echo.Context, fallback string) (string, string) {
+	requested := fallback
+	served := fallback
+	if v, ok := c.Get(ContextKeyRequestedModel).(string); ok && v != "" {
+		requested = v
+	}
+	if v, ok := c.Get(ContextKeyServedModel).(string); ok && v != "" {
+		served = v
+	}
+	return requested, served
+}
+
+func promptTokensFromContext(c echo.Context, fallback int64) (int64, int64) {
+	pre := fallback
+	post := fallback
+	if v, ok := c.Get(ContextKeyPreFilterPromptTokens).(int64); ok && v > 0 {
+		pre = v
+	}
+	if v, ok := c.Get(ContextKeyPostFilterPromptTokens).(int64); ok && v > 0 {
+		post = v
+	}
+	return pre, post
+}
+
+func correlationIDFromContext(c echo.Context) string {
+	if v, ok := c.Get(ContextKeyCorrelationID).(string); ok {
+		return v
+	}
+	// X-Correlation-ID header is set by trace.go middleware; read it as a
+	// fallback if the echo-context binding hasn't been populated yet.
+	return c.Response().Header().Get("X-Correlation-ID")
+}
+
 // lastSSEData returns the payload of the last "data: " line whose content is not "[DONE]".
 func lastSSEData(b []byte) ([]byte, bool) {
 	prefix := []byte("data: ")
diff --git a/core/http/middleware/usage_stamp.go b/core/http/middleware/usage_stamp.go
new file mode 100644
index 000000000..7e82ab744
--- /dev/null
+++ b/core/http/middleware/usage_stamp.go
@@ -0,0 +1,33 @@
+package middleware
+
+import "github.com/labstack/echo/v4"
+
+// StampUsage records the canonical token counts on the echo context so
+// UsageMiddleware can attribute the request without parsing the response
+// body. Handlers must call this for every successful response — the
+// body-parse fallback is reserved for foreign endpoints (e.g., the cloud
+// passthrough proxy).
+//
+// model is the name written into the response payload; passing it here
+// is what lets the middleware fill the UsageRecord even when the handler
+// abbreviates or rewrites the user-supplied model. Empty values are
+// ignored so partial information is still useful (e.g., embeddings calls
+// where completion is always 0).
+//
+// prompt and completion accept int because that's the native width of
+// LocalAI's TokenUsage / OpenAIUsage structs (token counts never come
+// close to overflow). Conversion to int64 happens once, here, so call
+// sites stay free of casts.
+func StampUsage(c echo.Context, model string, prompt, completion int) {
+	if c == nil {
+		return
+	}
+	if model != "" {
+		c.Set(ContextKeyResponseModel, model)
+	}
+	p := int64(prompt)
+	cp := int64(completion)
+	c.Set(ContextKeyPromptTokens, p)
+	c.Set(ContextKeyCompletionTokens, cp)
+	c.Set(ContextKeyTotalTokens, p+cp)
+}
diff --git a/core/http/middleware/usage_test.go b/core/http/middleware/usage_test.go
index 7db03a6ba..e95110988 100644
--- a/core/http/middleware/usage_test.go
+++ b/core/http/middleware/usage_test.go
@@ -1,140 +1,323 @@
-//go:build auth
-
 package middleware_test
 
 import (
-	"bytes"
-	"encoding/json"
+	"context"
+	"fmt"
 	"net/http"
 	"net/http/httptest"
+	"strings"
 
 	"github.com/labstack/echo/v4"
 	"github.com/mudler/LocalAI/core/http/auth"
-	"github.com/mudler/LocalAI/core/http/middleware"
+	httpMiddleware "github.com/mudler/LocalAI/core/http/middleware"
+	"github.com/mudler/LocalAI/core/services/routing/billing"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
-	"gorm.io/gorm"
 )
 
-// testAuthDB returns a fresh in-memory SQLite auth DB.
-func testAuthDB() *gorm.DB {
-	db, err := auth.InitDB(":memory:")
-	if err != nil {
-		panic(err)
-	}
-	return db
+// captureBackend collects records the recorder forwards. We assert on
+// it directly rather than going through StatsBackend.Aggregate because
+// these tests verify the middleware -> recorder hop, not aggregation
+// (which has its own tests in routing/billing).
+type captureBackend struct {
+	records []*auth.UsageRecord
 }
 
+func (c *captureBackend) Record(_ context.Context, r *auth.UsageRecord) error {
+	c.records = append(c.records, r)
+	return nil
+}
+func (c *captureBackend) Aggregate(_ context.Context, _ billing.AggregateQuery) ([]auth.UsageBucket, error) {
+	return nil, nil
+}
+func (c *captureBackend) Close() error { return nil }
+
 var _ = Describe("UsageMiddleware", func() {
-	var (
-		e  *echo.Echo
-		db *gorm.DB
-	)
-
-	BeforeEach(func() {
-		db = testAuthDB()
-		e = echo.New()
-		middleware.InitUsageRecorder(db)
-	})
-
-	AfterEach(func() {
-		middleware.ShutdownUsageRecorder()
-	})
-
-	okHandler := func(c echo.Context) error {
-		body, _ := json.Marshal(map[string]any{
-			"model": "gpt-4",
-			"usage": map[string]int{
-				"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15,
-			},
-		})
-		c.Response().Header().Set("Content-Type", "application/json")
-		c.Response().WriteHeader(http.StatusOK)
-		_, _ = c.Response().Write(body)
-		return nil
+	mockChat := func(usage string) echo.HandlerFunc {
+		return func(c echo.Context) error {
+			c.Response().Header().Set("Content-Type", "application/json")
+			body := fmt.Sprintf(`{"model":"qwen-7b","usage":%s}`, usage)
+			return c.String(http.StatusOK, body)
+		}
 	}
 
-	// FlushNow drains pending records synchronously, replacing the 6s sleep
-	// that was previously needed to wait for the batcher's ticker.
-	flush := middleware.FlushNow
+	It("records under the synthetic local user when auth is off", func() {
+		cap := &captureBackend{}
+		rec := billing.NewRecorder(cap)
+		fallback := &auth.User{ID: "local-uuid", Name: "local", Provider: auth.ProviderLocal}
 
-	It("records source=web when auth_source is web", func() {
-		e.POST("/v1/chat/completions", okHandler, func(next echo.HandlerFunc) echo.HandlerFunc {
+		e := echo.New()
+		e.POST("/v1/chat/completions",
+			mockChat(`{"prompt_tokens":12,"completion_tokens":8,"total_tokens":20}`),
+			httpMiddleware.UsageMiddleware(rec, fallback),
+		)
+
+		req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(`{}`))
+		req.Header.Set("Content-Type", "application/json")
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(cap.records).To(HaveLen(1))
+		r := cap.records[0]
+		Expect(r.UserID).To(Equal("local-uuid"))
+		Expect(r.UserName).To(Equal("local"))
+		Expect(r.Model).To(Equal("qwen-7b"))
+		Expect(r.PromptTokens).To(Equal(int64(12)))
+		Expect(r.CompletionTokens).To(Equal(int64(8)))
+		Expect(r.TotalTokens).To(Equal(int64(20)))
+	})
+
+	It("does nothing when recorder is nil (--disable-stats)", func() {
+		fallback := &auth.User{ID: "local-uuid", Name: "local"}
+		e := echo.New()
+		e.POST("/v1/chat/completions",
+			mockChat(`{"prompt_tokens":1,"completion_tokens":1,"total_tokens":2}`),
+			httpMiddleware.UsageMiddleware(nil, fallback),
+		)
+		req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(`{}`))
+		req.Header.Set("Content-Type", "application/json")
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+		Expect(w.Code).To(Equal(http.StatusOK))
+		// no panic, no record — recorder=nil is the disable-stats path
+	})
+
+	It("skips when neither auth nor fallback user is available", func() {
+		cap := &captureBackend{}
+		rec := billing.NewRecorder(cap)
+
+		e := echo.New()
+		e.POST("/v1/chat/completions",
+			mockChat(`{"prompt_tokens":3,"completion_tokens":2,"total_tokens":5}`),
+			httpMiddleware.UsageMiddleware(rec, nil),
+		)
+		req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(`{}`))
+		req.Header.Set("Content-Type", "application/json")
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(cap.records).To(BeEmpty())
+	})
+
+	It("ignores 5xx responses (no usage to attribute)", func() {
+		cap := &captureBackend{}
+		rec := billing.NewRecorder(cap)
+		fallback := &auth.User{ID: "local-uuid", Name: "local"}
+
+		e := echo.New()
+		e.POST("/v1/chat/completions",
+			func(c echo.Context) error {
+				return c.String(http.StatusInternalServerError, `{"error":"boom"}`)
+			},
+			httpMiddleware.UsageMiddleware(rec, fallback),
+		)
+		req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(`{}`))
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+		Expect(w.Code).To(Equal(http.StatusInternalServerError))
+		Expect(cap.records).To(BeEmpty())
+	})
+
+	It("records via context-stamped tokens when handler called StampUsage (streaming-safe path)", func() {
+		cap := &captureBackend{}
+		rec := billing.NewRecorder(cap)
+		fallback := &auth.User{ID: "local-uuid", Name: "local"}
+
+		// Simulate a streaming chat handler that emits SSE chunks WITHOUT a
+		// terminal usage block (the common case — clients rarely set
+		// stream_options.include_usage). The handler stamps the canonical
+		// counts on the context just before returning. UsageMiddleware
+		// must record from the stamp, not from body parsing.
+		streamingHandler := func(c echo.Context) error {
+			c.Response().Header().Set("Content-Type", "text/event-stream")
+			c.Response().WriteHeader(http.StatusOK)
+			_, _ = fmt.Fprint(c.Response().Writer, "data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\n")
+			_, _ = fmt.Fprint(c.Response().Writer, "data: [DONE]\n\n")
+			httpMiddleware.StampUsage(c, "qwen-7b", 9, 5)
+			return nil
+		}
+
+		e := echo.New()
+		e.POST("/v1/chat/completions",
+			streamingHandler,
+			httpMiddleware.UsageMiddleware(rec, fallback),
+		)
+
+		req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(`{}`))
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(cap.records).To(HaveLen(1))
+		Expect(cap.records[0].PromptTokens).To(Equal(int64(9)))
+		Expect(cap.records[0].CompletionTokens).To(Equal(int64(5)))
+		Expect(cap.records[0].TotalTokens).To(Equal(int64(14)))
+		Expect(cap.records[0].Model).To(Equal("qwen-7b"))
+	})
+
+	It("falls back to Anthropic body shape when no stamp is present", func() {
+		cap := &captureBackend{}
+		rec := billing.NewRecorder(cap)
+		fallback := &auth.User{ID: "local-uuid", Name: "local"}
+
+		// Simulates a passthrough proxy / foreign endpoint: no handler stamp,
+		// so the middleware must parse the response body. Anthropic's shape
+		// uses input_tokens / output_tokens, not the OpenAI names.
+		anthropicHandler := func(c echo.Context) error {
+			c.Response().Header().Set("Content-Type", "application/json")
+			body := `{"model":"claude-sonnet","usage":{"input_tokens":15,"output_tokens":7}}`
+			return c.String(http.StatusOK, body)
+		}
+
+		e := echo.New()
+		e.POST("/v1/messages",
+			anthropicHandler,
+			httpMiddleware.UsageMiddleware(rec, fallback),
+		)
+
+		req := httptest.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader(`{}`))
+		req.Header.Set("Content-Type", "application/json")
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(cap.records).To(HaveLen(1))
+		Expect(cap.records[0].PromptTokens).To(Equal(int64(15)))
+		Expect(cap.records[0].CompletionTokens).To(Equal(int64(7)))
+		Expect(cap.records[0].TotalTokens).To(Equal(int64(22)))
+		Expect(cap.records[0].Model).To(Equal("claude-sonnet"))
+	})
+
+	It("populates RequestedModel/ServedModel from echo context when set", func() {
+		cap := &captureBackend{}
+		rec := billing.NewRecorder(cap)
+		fallback := &auth.User{ID: "local-uuid", Name: "local"}
+
+		// A pre-handler stand-in for the future router middleware: it
+		// rewrites Served and remembers the original Requested. Once the
+		// real router lands, this is exactly the contract it must keep.
+		setRouterContext := func(next echo.HandlerFunc) echo.HandlerFunc {
 			return func(c echo.Context) error {
-				c.Set("auth_user", &auth.User{ID: "alice", Name: "Alice"})
-				c.Set("auth_source", auth.UsageSourceWeb)
+				c.Set(httpMiddleware.ContextKeyRequestedModel, "auto")
+				c.Set(httpMiddleware.ContextKeyServedModel, "qwen-7b")
 				return next(c)
 			}
-		}, middleware.UsageMiddleware(db))
+		}
 
-		req := httptest.NewRequest("POST", "/v1/chat/completions", bytes.NewReader([]byte(`{}`)))
-		e.ServeHTTP(httptest.NewRecorder(), req)
-		flush()
+		e := echo.New()
+		e.POST("/v1/chat/completions",
+			mockChat(`{"prompt_tokens":4,"completion_tokens":3,"total_tokens":7}`),
+			httpMiddleware.UsageMiddleware(rec, fallback),
+			setRouterContext,
+		)
 
-		var rec auth.UsageRecord
-		Expect(db.Where("user_id = ?", "alice").First(&rec).Error).To(Succeed())
-		Expect(rec.Source).To(Equal(auth.UsageSourceWeb))
-		Expect(rec.APIKeyID).To(BeNil())
-		Expect(rec.APIKeyName).To(BeEmpty())
+		req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(`{}`))
+		req.Header.Set("Content-Type", "application/json")
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(cap.records).To(HaveLen(1))
+		Expect(cap.records[0].RequestedModel).To(Equal("auto"))
+		Expect(cap.records[0].ServedModel).To(Equal("qwen-7b"))
+	})
+
+	// stampAuth is a stand-in for the auth middleware: it sets the
+	// echo-context keys UsageMiddleware reads. Pass source=="" to
+	// simulate the unauthenticated/legacy path; pass key=nil to skip
+	// the API-key snapshot.
+	stampAuth := func(user *auth.User, source string, key *auth.UserAPIKey) echo.MiddlewareFunc {
+		return func(next echo.HandlerFunc) echo.HandlerFunc {
+			return func(c echo.Context) error {
+				if user != nil {
+					c.Set("auth_user", user)
+				}
+				if source != "" {
+					c.Set("auth_source", source)
+				}
+				if key != nil {
+					c.Set("auth_apikey", key)
+				}
+				return next(c)
+			}
+		}
+	}
+
+	It("records source=web when auth_source is web and snapshots no API key", func() {
+		cap := &captureBackend{}
+		rec := billing.NewRecorder(cap)
+
+		e := echo.New()
+		e.POST("/v1/chat/completions",
+			mockChat(`{"prompt_tokens":2,"completion_tokens":1,"total_tokens":3}`),
+			httpMiddleware.UsageMiddleware(rec, nil),
+			stampAuth(&auth.User{ID: "alice", Name: "Alice"}, auth.UsageSourceWeb, nil),
+		)
+
+		req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(`{}`))
+		req.Header.Set("Content-Type", "application/json")
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(cap.records).To(HaveLen(1))
+		r := cap.records[0]
+		Expect(r.UserID).To(Equal("alice"))
+		Expect(r.Source).To(Equal(auth.UsageSourceWeb))
+		Expect(r.APIKeyID).To(BeNil())
+		Expect(r.APIKeyName).To(BeEmpty())
 	})
 
 	It("records source=apikey with snapshotted name when auth_apikey is set", func() {
-		e.POST("/v1/chat/completions", okHandler, func(next echo.HandlerFunc) echo.HandlerFunc {
-			return func(c echo.Context) error {
-				c.Set("auth_user", &auth.User{ID: "alice", Name: "Alice"})
-				c.Set("auth_source", auth.UsageSourceAPIKey)
-				c.Set("auth_apikey", &auth.UserAPIKey{ID: "key-1", Name: "ci-runner"})
-				return next(c)
-			}
-		}, middleware.UsageMiddleware(db))
+		cap := &captureBackend{}
+		rec := billing.NewRecorder(cap)
 
-		req := httptest.NewRequest("POST", "/v1/chat/completions", bytes.NewReader([]byte(`{}`)))
-		e.ServeHTTP(httptest.NewRecorder(), req)
-		flush()
+		e := echo.New()
+		e.POST("/v1/chat/completions",
+			mockChat(`{"prompt_tokens":1,"completion_tokens":1,"total_tokens":2}`),
+			httpMiddleware.UsageMiddleware(rec, nil),
+			stampAuth(
+				&auth.User{ID: "alice", Name: "Alice"},
+				auth.UsageSourceAPIKey,
+				&auth.UserAPIKey{ID: "key-1", Name: "ci-runner"},
+			),
+		)
 
-		var rec auth.UsageRecord
-		Expect(db.Where("user_id = ?", "alice").First(&rec).Error).To(Succeed())
-		Expect(rec.Source).To(Equal(auth.UsageSourceAPIKey))
-		Expect(rec.APIKeyID).ToNot(BeNil())
-		Expect(*rec.APIKeyID).To(Equal("key-1"))
-		Expect(rec.APIKeyName).To(Equal("ci-runner"))
+		req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(`{}`))
+		req.Header.Set("Content-Type", "application/json")
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(cap.records).To(HaveLen(1))
+		r := cap.records[0]
+		Expect(r.Source).To(Equal(auth.UsageSourceAPIKey))
+		Expect(r.APIKeyID).ToNot(BeNil())
+		Expect(*r.APIKeyID).To(Equal("key-1"))
+		Expect(r.APIKeyName).To(Equal("ci-runner"))
 	})
 
-	It("FlushNow drains pending records synchronously", func() {
-		e.POST("/v1/chat/completions", okHandler, func(next echo.HandlerFunc) echo.HandlerFunc {
-			return func(c echo.Context) error {
-				c.Set("auth_user", &auth.User{ID: "carol", Name: "Carol"})
-				c.Set("auth_source", auth.UsageSourceWeb)
-				return next(c)
-			}
-		}, middleware.UsageMiddleware(db))
+	It("defaults source=web when auth_source is empty", func() {
+		cap := &captureBackend{}
+		rec := billing.NewRecorder(cap)
 
-		req := httptest.NewRequest("POST", "/v1/chat/completions", bytes.NewReader([]byte(`{}`)))
-		e.ServeHTTP(httptest.NewRecorder(), req)
+		// Only user set, no source — the middleware must classify the
+		// row as web rather than dropping it from per-source aggregates.
+		e := echo.New()
+		e.POST("/v1/chat/completions",
+			mockChat(`{"prompt_tokens":1,"completion_tokens":1,"total_tokens":2}`),
+			httpMiddleware.UsageMiddleware(rec, nil),
+			stampAuth(&auth.User{ID: "alice", Name: "Alice"}, "", nil),
+		)
 
-		// No sleep: FlushNow should drain immediately.
-		middleware.FlushNow()
+		req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(`{}`))
+		req.Header.Set("Content-Type", "application/json")
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
 
-		var rec auth.UsageRecord
-		Expect(db.Where("user_id = ?", "carol").First(&rec).Error).To(Succeed())
-		Expect(rec.Source).To(Equal(auth.UsageSourceWeb))
-	})
-
-	It("falls back to source=web when auth_source is empty", func() {
-		e.POST("/v1/chat/completions", okHandler, func(next echo.HandlerFunc) echo.HandlerFunc {
-			return func(c echo.Context) error {
-				c.Set("auth_user", &auth.User{ID: "alice", Name: "Alice"})
-				// no auth_source set
-				return next(c)
-			}
-		}, middleware.UsageMiddleware(db))
-
-		req := httptest.NewRequest("POST", "/v1/chat/completions", bytes.NewReader([]byte(`{}`)))
-		e.ServeHTTP(httptest.NewRecorder(), req)
-		flush()
-
-		var rec auth.UsageRecord
-		Expect(db.Where("user_id = ?", "alice").First(&rec).Error).To(Succeed())
-		Expect(rec.Source).To(Equal(auth.UsageSourceWeb))
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(cap.records).To(HaveLen(1))
+		Expect(cap.records[0].Source).To(Equal(auth.UsageSourceWeb))
 	})
 })
diff --git a/core/http/react-ui/e2e/middleware-page.spec.js b/core/http/react-ui/e2e/middleware-page.spec.js
new file mode 100644
index 000000000..fe890db77
--- /dev/null
+++ b/core/http/react-ui/e2e/middleware-page.spec.js
@@ -0,0 +1,308 @@
+import { test, expect } from '@playwright/test'
+
+// Mocked fixture covering the three things the page renders:
+//   - PII pattern catalogue (action badges, action-change buttons)
+//   - Per-model resolved PII state (one with default off, one with proxy default on, one with explicit YAML)
+//   - Recent events feed (the page must NEVER show the redacted content)
+const MOCK_STATUS = {
+  pii: {
+    enabled_globally: true,
+    default_enabled_for_backends: ['cloud-proxy'],
+    patterns: [
+      { id: 'email', description: 'Email addresses', action: 'mask', max_match_length: 254 },
+      { id: 'ssn', description: 'US Social Security Numbers', action: 'mask', max_match_length: 11 },
+      { id: 'api_key_prefix', description: 'API key prefixes', action: 'block', max_match_length: 200 },
+    ],
+    models: [
+      { name: 'qwen-7b', backend: 'llama-cpp', enabled: false, explicit: false, default_for_backend: false, overrides: null },
+      { name: 'claude-sonnet', backend: 'cloud-proxy', enabled: true, explicit: false, default_for_backend: true, overrides: null },
+      { name: 'claude-strict', backend: 'cloud-proxy', enabled: true, explicit: true, default_for_backend: true, overrides: { ssn: 'block' } },
+    ],
+    recent_event_count: 2,
+  },
+  router: {
+    configured: true,
+    models: [
+      {
+        name: 'smart-router',
+        classifier: 'score',
+        fallback: 'qwen-7b',
+        policies: [
+          { label: 'casual-chat', description: 'small talk' },
+          { label: 'code-generation', description: 'writing or debugging code' },
+        ],
+        candidates: [
+          { model: 'qwen-3b', labels: ['casual-chat'] },
+          { model: 'qwen-coder', labels: ['code-generation', 'casual-chat'] },
+        ],
+        embedding_cache: {
+          embedding_model: 'nomic-embed-text-v1.5',
+          similarity_threshold: 0.80,
+          confidence_threshold: 0.60,
+          store_name: '',
+          stats: {
+            hits: 31,
+            misses: 1,
+            near_misses: 56,
+            low_confidence: 29,
+            embedder_errors: 0,
+            store_errors: 0,
+            // peak [0.4, 0.6) for paraphrases, secondary in [0.8, 1.0) for near-exact matches
+            similarity_buckets: [0, 0, 0, 1, 22, 16, 3, 7, 19, 19],
+          },
+        },
+      },
+    ],
+    recent_decision_count: 1,
+    available_classifiers: ['score'],
+  },
+}
+
+const MOCK_DECISIONS = {
+  decisions: [
+    {
+      id: 'rd_a1', correlation_id: 'corr-1', user_id: 'local',
+      router_model: 'smart-router', requested_model: 'smart-router', served_model: 'qwen-3b',
+      classifier: 'score', label: 'casual-chat', score: 0.91, latency_ms: 15,
+      cached: true, cache_similarity: 0.92,
+      created_at: '2026-05-06T11:00:00Z',
+    },
+  ],
+}
+
+const MOCK_EVENTS = {
+  events: [
+    {
+      id: 'pii_aaa', kind: 'pii', correlation_id: 'corr-1', user_id: 'local',
+      direction: 'in', pattern_id: 'email', byte_offset: 12, length: 17,
+      hash_prefix: 'ff8d9819', action: 'mask',
+      created_at: '2026-05-06T10:00:00Z',
+    },
+    {
+      id: 'proxy_connect_1', kind: 'proxy_connect',
+      host: 'api.openai.com', intercepted: true,
+      created_at: '2026-05-06T10:01:00Z',
+    },
+    {
+      id: 'proxy_connect_2', kind: 'proxy_connect',
+      host: 'github.com', intercepted: false,
+      created_at: '2026-05-06T10:02:00Z',
+    },
+    {
+      id: 'proxy_traffic_1', kind: 'proxy_traffic', correlation_id: 'corr-2',
+      host: 'api.openai.com',
+      bytes_sent: 412, bytes_received: 1228, status_code: 200, duration_ms: 240,
+      created_at: '2026-05-06T10:03:00Z',
+    },
+  ],
+}
+
+test.describe('Middleware page — admin in no-auth mode', () => {
+  test.beforeEach(async ({ page }) => {
+    await page.route('**/api/auth/status', (route) =>
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify({ authEnabled: false, staticApiKeyRequired: false, providers: [] }),
+      })
+    )
+    await page.route('**/api/middleware/status', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify(MOCK_STATUS) })
+    )
+    await page.route('**/api/pii/events?**', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify(MOCK_EVENTS) })
+    )
+    await page.route('**/api/router/decisions?**', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify(MOCK_DECISIONS) })
+    )
+  })
+
+  test('Filtering tab renders pattern catalogue and per-model state', async ({ page }) => {
+    await page.goto('/app/middleware')
+
+    // Pattern table — at least one pattern id visible.
+    await expect(page.getByText('email').first()).toBeVisible()
+    await expect(page.getByText('api_key_prefix').first()).toBeVisible()
+
+    // Per-model state — each model's name is visible.
+    await expect(page.getByText('qwen-7b').first()).toBeVisible()
+    await expect(page.getByText('claude-strict').first()).toBeVisible()
+
+    // Default-policy banner names the backends with PII on by default.
+    await expect(page.getByText(/cloud-proxy/).first()).toBeVisible()
+  })
+
+  test('Routing tab renders configured routers and recent decisions', async ({ page }) => {
+    await page.goto('/app/middleware')
+    await page.getByRole('button', { name: /Routing/i }).click()
+    // Active router model name visible.
+    await expect(page.getByText('smart-router').first()).toBeVisible()
+    // Candidate model names visible.
+    await expect(page.getByText('qwen-coder').first()).toBeVisible()
+    await expect(page.getByText('qwen-3b').first()).toBeVisible()
+    // Decision row visible — label and served model.
+    await expect(page.getByText('casual-chat').first()).toBeVisible()
+  })
+
+  test('Routing tab renders embedding-cache stats and similarity histogram', async ({ page }) => {
+    await page.goto('/app/middleware')
+    await page.getByRole('button', { name: /Routing/i }).click()
+
+    // Embedding model name surfaces in the cache column.
+    await expect(page.getByText('nomic-embed-text-v1.5').first()).toBeVisible()
+
+    // Hit-rate badge: 31 hits / (31 + 56 + 1) = 35% rounded.
+    await expect(page.getByText(/35% hit/i).first()).toBeVisible()
+
+    // h/n/m counter row visible.
+    await expect(page.getByText(/31h\/56n\/1m/).first()).toBeVisible()
+
+    // Skipped (low-confidence) counter visible.
+    await expect(page.getByText(/29 skipped/).first()).toBeVisible()
+
+    // Threshold marker text matches the configured 0.80.
+    await expect(page.getByText(/sim ≥ 0\.8/).first()).toBeVisible()
+
+    // Histogram bars rendered with hover titles that include the
+    // bucket range and count. Bucket 4 (peak) has count 22; the
+    // <div> with that exact title is the structural assertion.
+    await expect(
+      page.locator('div[title="[0.4, 0.5): 22"]')
+    ).toBeVisible()
+    // Bucket 8 (just at threshold) has count 19.
+    await expect(
+      page.locator('div[title="[0.8, 0.9): 19"]')
+    ).toBeVisible()
+  })
+
+  test('Routing tab shows a cached decision with cache_similarity', async ({ page }) => {
+    await page.goto('/app/middleware')
+    await page.getByRole('button', { name: /Routing/i }).click()
+
+    // The decision row exposes the cached flag and the cosine that
+    // produced the hit so admins can correlate with the histogram.
+    await expect(page.getByText('corr-1')).toBeVisible()
+  })
+
+  test('Events tab renders rows but never the redacted content', async ({ page }) => {
+    await page.goto('/app/middleware')
+    await page.getByRole('button', { name: /Events/i }).click()
+    // Hash prefix is visible — that's how admins audit recurring leaks.
+    await expect(page.getByText('ff8d9819')).toBeVisible()
+    // The page only ever shows fields the EventStore stores. The matched
+    // value (e.g. "alice@example.com") would never appear because it's
+    // not in the payload — explicit asserting absence here is the
+    // contract the design relies on.
+    await expect(page.getByText(/@example\.com/)).toHaveCount(0)
+  })
+
+  test('Events tab renders proxy_connect rows with intercept decision', async ({ page }) => {
+    await page.goto('/app/middleware')
+    await page.getByRole('button', { name: /Events/i }).click()
+
+    // Both intercept and tunnel decisions visible.
+    const interceptRow = page.locator('tr').filter({ hasText: 'api.openai.com' }).first()
+    await expect(interceptRow).toContainText(/intercepted/i)
+    const tunnelRow = page.locator('tr').filter({ hasText: 'github.com' }).first()
+    await expect(tunnelRow).toContainText(/tunneled/i)
+  })
+
+  test('Events tab renders proxy_traffic byte counts and status', async ({ page }) => {
+    await page.goto('/app/middleware')
+    await page.getByRole('button', { name: /Events/i }).click()
+
+    // The traffic row formats as "HTTP 200 · ↑412B ↓1.2KB · 240ms".
+    // We assert on the durable parts: status code, byte values, duration unit.
+    const trafficRow = page.locator('tr').filter({ hasText: 'corr-2' }).first()
+    await expect(trafficRow).toContainText('HTTP 200')
+    await expect(trafficRow).toContainText('412B')
+    await expect(trafficRow).toContainText(/1\.2\s*KB/i)
+    await expect(trafficRow).toContainText('240ms')
+  })
+
+  test('Events kind filter narrows the table to the chosen kind', async ({ page }) => {
+    await page.goto('/app/middleware')
+    await page.getByRole('button', { name: /Events/i }).click()
+
+    // Default = All: pii row + 2 connect rows + 1 traffic row visible.
+    await expect(page.getByText('ff8d9819')).toBeVisible()
+    await expect(page.getByText('github.com')).toBeVisible()
+
+    // Click "PII" filter — proxy rows must disappear.
+    await page.getByRole('button', { name: /^PII$/ }).click()
+    await expect(page.getByText('ff8d9819')).toBeVisible()
+    await expect(page.getByText('github.com')).toHaveCount(0)
+    await expect(page.getByText('HTTP 200')).toHaveCount(0)
+
+    // Click "Proxy traffic" — only the traffic row remains.
+    await page.getByRole('button', { name: /Proxy traffic/i }).click()
+    await expect(page.getByText('HTTP 200')).toBeVisible()
+    await expect(page.getByText('ff8d9819')).toHaveCount(0)
+    await expect(page.getByText('github.com')).toHaveCount(0)
+
+    // Click "Proxy connect" — both connect rows visible, no PII or traffic.
+    await page.getByRole('button', { name: /Proxy connect/i }).click()
+    await expect(page.locator('tr').filter({ hasText: 'github.com' })).toHaveCount(1)
+    await expect(page.locator('tr').filter({ hasText: 'api.openai.com' }).filter({ hasText: 'intercepted' })).toHaveCount(1)
+    await expect(page.getByText('HTTP 200')).toHaveCount(0)
+    await expect(page.getByText('ff8d9819')).toHaveCount(0)
+
+    // Click "All" — everything back.
+    await page.getByRole('button', { name: /^All$/ }).click()
+    await expect(page.getByText('ff8d9819')).toBeVisible()
+    await expect(page.getByText('HTTP 200')).toBeVisible()
+  })
+
+  test('Events tab shows the kind badge for each row', async ({ page }) => {
+    await page.goto('/app/middleware')
+    await page.getByRole('button', { name: /Events/i }).click()
+
+    // The Kind column header is present.
+    await expect(page.locator('th').filter({ hasText: /^Kind$/ })).toBeVisible()
+    // At least one cell renders each of the three kinds. Scope to
+    // <span> elements so the "PII" filter button doesn't match.
+    await expect(page.locator('span').getByText(/^pii$/i).first()).toBeVisible()
+    await expect(page.getByText(/^proxy connect$/i).first()).toBeVisible()
+    await expect(page.getByText(/^proxy traffic$/i).first()).toBeVisible()
+  })
+
+  test('PUT /api/pii/patterns/:id fires when an action button is clicked', async ({ page }) => {
+    let putHit = null
+    await page.route('**/api/pii/patterns/email', (route) => {
+      if (route.request().method() === 'PUT') {
+        putHit = JSON.parse(route.request().postData() || '{}')
+        route.fulfill({ contentType: 'application/json', body: JSON.stringify({ id: 'email', action: putHit.action, persisted: false }) })
+      } else {
+        route.continue()
+      }
+    })
+
+    await page.goto('/app/middleware')
+    // Click the email row's "block" button (currently mask, so block is
+    // enabled). Use a precise locator that matches the inner button.
+    const emailRow = page.locator('tr').filter({ hasText: 'email' }).first()
+    await emailRow.getByRole('button', { name: 'block' }).click()
+
+    await expect.poll(() => putHit).toEqual({ action: 'block' })
+  })
+})
+
+test.describe('Middleware page — non-admin under auth-on', () => {
+  test('redirects to /app when the user is not admin', async ({ page }) => {
+    await page.route('**/api/auth/status', (route) =>
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify({
+          authEnabled: true,
+          staticApiKeyRequired: false,
+          providers: ['local'],
+          user: { id: 'bob', name: 'Bob', role: 'user', provider: 'local' },
+        }),
+      })
+    )
+
+    await page.goto('/app/middleware')
+    // RequireAdmin redirects non-admin viewers; the URL must not stay on /middleware.
+    await page.waitForURL(/\/app(?!\/middleware)/, { timeout: 5000 })
+    expect(page.url()).not.toMatch(/\/middleware/)
+  })
+})
diff --git a/core/http/react-ui/e2e/router-template.spec.js b/core/http/react-ui/e2e/router-template.spec.js
new file mode 100644
index 000000000..72431854e
--- /dev/null
+++ b/core/http/react-ui/e2e/router-template.spec.js
@@ -0,0 +1,219 @@
+import { test, expect } from '@playwright/test'
+
+// Router template + structured editor regression tests.
+//
+// The historical regression was: the "Create routing model" button
+// loaded the model editor with an array-shaped `router.candidates`
+// value, which crashed when a code-editor field received it instead
+// of a string ("(intermediate value).split is not a function").
+//
+// The current schema is also covered:
+//   - classifier=score is the only shipped classifier
+//   - router.policies surfaces in its own structured editor (label +
+//     description rows with duplicate detection)
+//   - router.candidates is the structured {model, labels[]} editor;
+//     labels are chips populated from router.policies via FormContext
+//   - router.embedding_cache.* surface as labelled fields with the
+//     correct components (model-select / slider)
+//   - router.activation_threshold and the two embedding_cache slider
+//     fields render with slider min/max/step from the registry
+
+const ROUTER_METADATA = {
+  sections: [
+    { id: 'general', label: 'General', icon: 'settings', order: 0 },
+    { id: 'other', label: 'Other', icon: 'more-horizontal', order: 100 },
+  ],
+  fields: [
+    { path: 'name', yaml_key: 'name', go_type: 'string', ui_type: 'string',
+      section: 'general', label: 'Model Name', component: 'input', order: 0 },
+    {
+      path: 'router.classifier', yaml_key: 'classifier', go_type: 'string', ui_type: 'string',
+      section: 'other', label: 'Classifier', component: 'select',
+      options: [{ value: 'score', label: 'Score (Arch-Router-style)' }],
+      description: 'Picks a candidate by scoring every policy label against the prompt. Only "score" is shipped today.',
+      order: 230,
+    },
+    {
+      path: 'router.classifier_model', yaml_key: 'classifier_model', go_type: 'string', ui_type: 'string',
+      section: 'other', label: 'Classifier Model', component: 'model-select', autocomplete_provider: 'models:chat',
+      description: 'Loaded LocalAI model the score classifier asks to rank each policy label.',
+      order: 231,
+    },
+    {
+      path: 'router.fallback', yaml_key: 'fallback', go_type: 'string', ui_type: 'string',
+      section: 'other', label: 'Fallback Model', component: 'model-select', autocomplete_provider: 'models:chat',
+      description: 'Model used when no candidate covers the active label set.',
+      order: 232,
+    },
+    {
+      path: 'router.activation_threshold', yaml_key: 'activation_threshold', go_type: 'float64', ui_type: 'float',
+      section: 'other', label: 'Activation Threshold', component: 'slider',
+      min: 0, max: 1, step: 0.05,
+      description: 'Softmax-probability floor a policy must clear to join the active label set.',
+      order: 233,
+    },
+    {
+      path: 'router.policies', yaml_key: 'policies', go_type: '[]RouterPolicy', ui_type: 'object',
+      section: 'other', label: 'Policies', component: 'router-policies',
+      description: 'Label vocabulary the classifier scores over.',
+      order: 235,
+    },
+    {
+      path: 'router.candidates', yaml_key: 'candidates', go_type: '[]RouterCandidate', ui_type: 'object',
+      section: 'other', label: 'Candidates', component: 'router-candidates',
+      description: 'Routing table: each entry binds a downstream model to a set of policy labels.',
+      order: 236,
+    },
+    {
+      path: 'router.embedding_cache.embedding_model', yaml_key: 'embedding_model', go_type: 'string', ui_type: 'string',
+      section: 'other', label: 'L2 Cache: Embedding Model', component: 'model-select', autocomplete_provider: 'models',
+      description: 'Embedding model used by the L2 decision cache.',
+      order: 237,
+    },
+    {
+      path: 'router.embedding_cache.similarity_threshold', yaml_key: 'similarity_threshold', go_type: 'float64', ui_type: 'float',
+      section: 'other', label: 'L2 Cache: Similarity Threshold', component: 'slider',
+      min: 0, max: 1, step: 0.01,
+      description: 'Cosine-similarity floor a cache candidate must clear to count as a hit.',
+      order: 238,
+    },
+  ],
+}
+
+const MIDDLEWARE_STATUS = {
+  pii: { enabled_globally: false, patterns: [], models: [], recent_event_count: 0 },
+  router: { configured: false, models: [], recent_decision_count: 0, available_classifiers: ['score'] },
+  mitm: { running: false, listen_addr: '', configured_addr: '', host_owners: {}, host_conflicts: {}, models: [], ca_available: false, ca_cert_url: '' },
+}
+
+test.describe('Router template — create flow', () => {
+  test.beforeEach(async ({ page }) => {
+    await page.route('**/api/auth/status', (route) =>
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify({ authEnabled: false, staticApiKeyRequired: false, providers: [] }),
+      })
+    )
+    await page.route('**/api/middleware/status', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify(MIDDLEWARE_STATUS) })
+    )
+    await page.route('**/api/router/decisions?**', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ decisions: [] }) })
+    )
+    await page.route('**/api/pii/events?**', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ events: [] }) })
+    )
+    await page.route('**/api/models/config-metadata*', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify(ROUTER_METADATA) })
+    )
+    await page.route('**/api/models/config-metadata/autocomplete/**', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ values: [] }) })
+    )
+
+    // Surface any uncaught render-time error so the assertion fails
+    // with a useful message rather than the test silently passing.
+    page.on('pageerror', (err) => {
+      throw new Error(`uncaught page error: ${err.message}`)
+    })
+  })
+
+  test('Routing tab links to the model editor with the router template loaded', async ({ page }) => {
+    await page.goto('/app/middleware')
+    await page.getByRole('button', { name: /Routing/i }).click()
+
+    // Empty-state button is the primary CTA.
+    await page.getByRole('button', { name: /Create routing model/i }).click()
+
+    // Editor loads on a /app/model-editor URL with template=router.
+    await expect(page).toHaveURL(/\/app\/model-editor.*template=router/)
+  })
+
+  test('Router template renders without crashing on structured candidates/policies', async ({ page }) => {
+    // Navigate straight to the create-with-template URL. This was the
+    // regression that crashed with "(intermediate value).split is not
+    // a function" when the template's array-shaped router.candidates
+    // fell into a code-editor wrapper.
+    await page.goto('/app/model-editor?template=router')
+
+    // The react-router error overlay must not appear.
+    await expect(page.getByText(/Unexpected Application Error/i)).toHaveCount(0)
+
+    // Editor surface visible. Template URL is "create mode", so the
+    // heading reads "Add Model" rather than "Model Editor".
+    await expect(page.locator('h1.page-title')).toBeVisible({ timeout: 10_000 })
+
+    // Top-level field labels seeded by the template are visible.
+    // embedding_cache.* fields are surfaced via "Add Field" search
+    // rather than active by default — separate spec covers them.
+    await expect(page.getByText('Classifier').first()).toBeVisible()
+    await expect(page.getByText('Policies').first()).toBeVisible()
+    await expect(page.getByText('Candidates').first()).toBeVisible()
+    await expect(page.getByText('Activation Threshold').first()).toBeVisible()
+  })
+
+  test('Classifier select offers only the score option', async ({ page }) => {
+    await page.goto('/app/model-editor?template=router')
+
+    // SearchableSelect renders the current option's *label* inside the
+    // trigger button. After the schema cleanup the only option is
+    // "Score (Arch-Router-style)", pre-selected by the template.
+    await expect(page.getByText('Score (Arch-Router-style)').first()).toBeVisible({ timeout: 10_000 })
+  })
+
+  test('Policies editor renders structured rows with label + description fields', async ({ page }) => {
+    await page.goto('/app/model-editor?template=router')
+
+    // The template seeds three example policies. Their labels are
+    // pre-populated in input fields with monospace styling — the
+    // editor signature is "Add policy" button + label/description
+    // input pairs.
+    await expect(page.getByRole('button', { name: /Add policy/i }).first()).toBeVisible()
+
+    // Pre-seeded labels visible as input values. RouterPoliciesEditor
+    // renders each label in an input with a recognisable placeholder;
+    // assert on their values by position.
+    const labelInputs = page.locator('input[placeholder^="label ("]')
+    await expect(labelInputs.nth(0)).toHaveValue('code-generation')
+    await expect(labelInputs.nth(1)).toHaveValue('casual-chat')
+    await expect(labelInputs.nth(2)).toHaveValue('math-reasoning')
+  })
+
+  test('Candidates editor renders {model, labels} rows with policy-aware label chips', async ({ page }) => {
+    await page.goto('/app/model-editor?template=router')
+
+    // "Add candidate" is the signature of the new RouterCandidatesEditor.
+    await expect(page.getByRole('button', { name: /Add candidate/i }).first()).toBeVisible()
+
+    // Each candidate row should expose move-up/move-down controls,
+    // a model picker, and label chips. The chip for a known policy
+    // label appears as a button with the policy's label text.
+    // Pre-seeded template: candidate[0] has labels=['casual-chat'];
+    // candidate[1] has labels=['code-generation', 'casual-chat', 'math-reasoning'].
+    //
+    // The chips appear inside a flex row of buttons. Using getByRole
+    // with the exact name catches typos/regressions cleanly.
+    await expect(page.getByRole('button', { name: 'casual-chat' }).first()).toBeVisible()
+    await expect(page.getByRole('button', { name: 'code-generation' }).first()).toBeVisible()
+    await expect(page.getByRole('button', { name: 'math-reasoning' }).first()).toBeVisible()
+  })
+
+  test('Adding a duplicate policy label flags the duplicate row', async ({ page }) => {
+    await page.goto('/app/model-editor?template=router')
+
+    // Add a new empty policy row, then type a duplicate of the
+    // existing 'casual-chat'. The duplicate detection in
+    // RouterPoliciesEditor sets a warning border via inline style.
+    await page.getByRole('button', { name: /Add policy/i }).first().click()
+
+    // Find the newly-added empty label input (placeholder catches it).
+    const newLabel = page.locator('input[placeholder*="label (e.g. code-generation)"]').last()
+    await newLabel.fill('casual-chat')
+
+    // Both rows now hold the same label. The duplicate-detection
+    // logic flags the row visually; we assert on the title attribute
+    // RouterPoliciesEditor sets on the input when duplicate=true.
+    await expect(
+      page.locator('input[title="Duplicate label — candidates won\'t be able to distinguish them"]').first()
+    ).toBeVisible()
+  })
+})
diff --git a/core/http/react-ui/e2e/usage-dashboard.spec.js b/core/http/react-ui/e2e/usage-dashboard.spec.js
new file mode 100644
index 000000000..a27bf4006
--- /dev/null
+++ b/core/http/react-ui/e2e/usage-dashboard.spec.js
@@ -0,0 +1,148 @@
+import { test, expect } from '@playwright/test'
+
+// Mock usage payload as the new /api/usage endpoint returns it.
+const MOCK_USAGE = {
+  viewer: { id: 'local-uuid', name: 'local', role: 'admin', provider: 'local' },
+  totals: {
+    prompt_tokens: 1234,
+    completion_tokens: 567,
+    total_tokens: 1801,
+    request_count: 42,
+  },
+  usage: [
+    {
+      bucket: '2026-05-05',
+      model: 'qwen-7b',
+      user_id: 'local-uuid',
+      user_name: 'local',
+      prompt_tokens: 1234,
+      completion_tokens: 567,
+      total_tokens: 1801,
+      request_count: 42,
+    },
+  ],
+}
+
+const MOCK_USAGE_AUTH_USER = {
+  ...MOCK_USAGE,
+  viewer: { id: 'alice-uuid', name: 'Alice', role: 'user', provider: 'local' },
+}
+
+// Two scenarios:
+//   1. No-auth single-user box: /api/auth/status returns authEnabled:false
+//      and the page must call /api/usage and render the local user's data.
+//   2. Auth-on regular user: status returns authEnabled:true and the page
+//      keeps using /api/auth/usage as before.
+//
+// The point of these specs is the "prevent accidental removal" guarantee
+// the user asked for: if anyone gates the Usage page behind auth again,
+// scenario 1 fails immediately.
+
+test.describe('Usage page — single-user no-auth mode', () => {
+  test.beforeEach(async ({ page }) => {
+    await page.route('**/api/auth/status', (route) =>
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify({
+          authEnabled: false,
+          staticApiKeyRequired: false,
+          providers: [],
+        }),
+      })
+    )
+
+    // The new no-auth code path. If anyone reverts Usage.jsx to
+    // /api/auth/usage in single-user mode, this route is never hit and
+    // the test fails because no usage data renders.
+    let usageHits = 0
+    await page.route('**/api/usage?**', (route) => {
+      usageHits++
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify(MOCK_USAGE),
+      })
+    })
+    // The synthetic local user has admin role, so Usage.jsx also pulls
+    // the cluster-wide view from /api/usage/all to populate displayTotals.
+    await page.route('**/api/usage/all?**', (route) =>
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify(MOCK_USAGE),
+      })
+    )
+    page.usageHits = () => usageHits
+  })
+
+  test('Usage entry is visible in sidebar without auth', async ({ page }) => {
+    await page.goto('/app')
+    const systemSection = page.locator('button.sidebar-section-toggle', { hasText: 'System' })
+    await systemSection.click()
+    const usageLink = page.locator('a.nav-item[href="/app/usage"]')
+    await expect(usageLink).toBeVisible()
+  })
+
+  test('navigating to /app/usage renders the dashboard with local-user data', async ({ page }) => {
+    await page.goto('/app/usage')
+
+    // The page used to bail with "Usage tracking unavailable" when authEnabled=false.
+    // We assert the *opposite*: data is rendered and the empty-state text is absent.
+    await expect(page.getByText('Usage tracking unavailable')).toHaveCount(0)
+
+    // The total-tokens stat card is one of the first things rendered after
+    // a successful /api/usage call. We assert the formatted number "1.8K"
+    // is present (formatNumber in Usage.jsx renders 1801 as "1.8K").
+    await expect(page.getByText('1.8K').first()).toBeVisible()
+  })
+})
+
+test.describe('Usage page — auth on', () => {
+  test.beforeEach(async ({ page }) => {
+    // RequireAuth redirects to /login when user is null, so the status
+    // response must include a resolved user for auth-on specs to reach
+    // the Usage page at all.
+    await page.route('**/api/auth/status', (route) =>
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify({
+          authEnabled: true,
+          staticApiKeyRequired: false,
+          providers: ['local'],
+          user: { id: 'alice-uuid', name: 'Alice', role: 'user', provider: 'local' },
+        }),
+      })
+    )
+    await page.route('**/api/auth/me', (route) =>
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify({
+          user: { id: 'alice-uuid', name: 'Alice', role: 'user', provider: 'local' },
+          permissions: {},
+        }),
+      })
+    )
+    await page.route('**/api/auth/usage?**', (route) =>
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify(MOCK_USAGE_AUTH_USER),
+      })
+    )
+    await page.route('**/api/auth/quota', (route) =>
+      route.fulfill({ contentType: 'application/json', body: JSON.stringify({ quotas: [] }) })
+    )
+  })
+
+  test('Usage page calls /api/auth/usage when auth is on', async ({ page }) => {
+    let authUsageHit = false
+    await page.route('**/api/auth/usage?**', (route) => {
+      authUsageHit = true
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify(MOCK_USAGE_AUTH_USER),
+      })
+    })
+
+    await page.goto('/app/usage')
+    await expect(page.getByText('1.8K').first()).toBeVisible()
+    expect(authUsageHit).toBe(true)
+  })
+})
diff --git a/core/http/react-ui/e2e/users-tab-gating.spec.js b/core/http/react-ui/e2e/users-tab-gating.spec.js
new file mode 100644
index 000000000..f683d215f
--- /dev/null
+++ b/core/http/react-ui/e2e/users-tab-gating.spec.js
@@ -0,0 +1,74 @@
+import { test, expect } from '@playwright/test'
+
+// Two surfaces enforce single-user (no-auth) gating for the Users page:
+//   1. Sidebar entry: hidden via the `authOnly: true` flag in Sidebar.jsx
+//      (filterItem returns false when `!authEnabled`).
+//   2. Direct URL navigation: RequireAuthEnabled wrapping the /app/users
+//      route in router.jsx redirects to /app when authEnabled is false.
+//
+// Without (2), an old bookmark or pasted URL would land on a page rendered
+// against admin-only `/api/auth/admin/users` data — which doesn't exist
+// when auth is off — and the user sees a confusing empty/error state.
+//
+// These specs are the "prevent accidental removal" guarantee — if anyone
+// drops the gating, /app/users stays open in single-user mode and the
+// test fails on the redirect or the visible sidebar item.
+
+test.describe('Users tab — single-user no-auth mode', () => {
+  test.beforeEach(async ({ page }) => {
+    await page.route('**/api/auth/status', (route) =>
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify({
+          authEnabled: false,
+          staticApiKeyRequired: false,
+          providers: [],
+        }),
+      })
+    )
+  })
+
+  test('sidebar does not list Users entry', async ({ page }) => {
+    await page.goto('/app')
+    const systemSection = page.locator('button.sidebar-section-toggle', { hasText: 'System' })
+    await systemSection.click()
+    // The Users page link uses /app/users; if Sidebar's authOnly gate
+    // regresses (or someone removes the flag), this assertion fails.
+    const usersLink = page.locator('a.nav-item[href="/app/users"]')
+    await expect(usersLink).toHaveCount(0)
+  })
+
+  test('direct navigation to /app/users redirects to /app', async ({ page }) => {
+    await page.goto('/app/users')
+    // RequireAuthEnabled performs the redirect synchronously, but the URL
+    // change is async — wait for it before asserting.
+    await page.waitForURL(/\/app(?!\/users)/, { timeout: 5000 })
+    expect(page.url()).toMatch(/\/app(\/?$|\/(?!users))/)
+  })
+})
+
+test.describe('Users tab — auth on', () => {
+  test.beforeEach(async ({ page }) => {
+    await page.route('**/api/auth/status', (route) =>
+      route.fulfill({
+        contentType: 'application/json',
+        body: JSON.stringify({
+          authEnabled: true,
+          staticApiKeyRequired: false,
+          providers: ['local'],
+          // Mark the viewer as admin so the sidebar's adminOnly gate also
+          // passes; the test then exercises the authOnly path in isolation.
+          user: { id: 'admin-uuid', name: 'Admin', role: 'admin', provider: 'local' },
+        }),
+      })
+    )
+  })
+
+  test('sidebar lists Users entry when auth is on', async ({ page }) => {
+    await page.goto('/app')
+    const systemSection = page.locator('button.sidebar-section-toggle', { hasText: 'System' })
+    await systemSection.click()
+    const usersLink = page.locator('a.nav-item[href="/app/users"]')
+    await expect(usersLink).toBeVisible()
+  })
+})
diff --git a/core/http/react-ui/public/locales/en/nav.json b/core/http/react-ui/public/locales/en/nav.json
index 9f5218a19..ac85d4979 100644
--- a/core/http/react-ui/public/locales/en/nav.json
+++ b/core/http/react-ui/public/locales/en/nav.json
@@ -36,6 +36,7 @@
     "mcpJobs": "MCP CI Jobs",
     "usage": "Usage",
     "users": "Users",
+    "middleware": "Middleware",
     "backends": "Backends",
     "traces": "Traces",
     "nodes": "Nodes",
diff --git a/core/http/react-ui/src/App.css b/core/http/react-ui/src/App.css
index 9dd4cae2d..a71eda5f5 100644
--- a/core/http/react-ui/src/App.css
+++ b/core/http/react-ui/src/App.css
@@ -1847,6 +1847,24 @@ select.input {
   color: var(--color-accent);
 }
 
+/* badge-link marks a badge that navigates somewhere (e.g. the use-case
+   chips that open a playground page). Signals clickability the bare
+   colour variant doesn't: pointer cursor, and a hover state that lifts
+   the background/text so links read as interactive while non-link
+   badges (Embeddings, VAD, Score, ...) stay flat. */
+.badge-link {
+  cursor: pointer;
+  text-decoration: none;
+  transition: background-color 0.12s ease, color 0.12s ease, box-shadow 0.12s ease;
+}
+.badge-link:hover,
+.badge-link:focus-visible {
+  background: var(--color-info);
+  color: var(--color-bg-primary);
+  box-shadow: 0 0 0 1px var(--color-info);
+  outline: none;
+}
+
 /* Horizontal row of badges used inside table cells — consistent spacing so
    cells line up regardless of how many badges are present. */
 .badge-row {
diff --git a/core/http/react-ui/src/components/ConfigFieldRenderer.jsx b/core/http/react-ui/src/components/ConfigFieldRenderer.jsx
index f2c80885d..ccf5bf05c 100644
--- a/core/http/react-ui/src/components/ConfigFieldRenderer.jsx
+++ b/core/http/react-ui/src/components/ConfigFieldRenderer.jsx
@@ -5,6 +5,10 @@ import SearchableSelect from './SearchableSelect'
 import SearchableModelSelect from './SearchableModelSelect'
 import AutocompleteInput from './AutocompleteInput'
 import CodeEditor from './CodeEditor'
+import StructuredCodeEditor from './StructuredCodeEditor'
+import PIIPatternListEditor from './PIIPatternListEditor'
+import RouterCandidatesEditor from './RouterCandidatesEditor'
+import RouterPoliciesEditor from './RouterPoliciesEditor'
 
 // Map autocomplete provider to SearchableModelSelect capability
 const PROVIDER_TO_CAPABILITY = {
@@ -300,8 +304,17 @@ export default function ConfigFieldRenderer({ field, value, onChange, onRemove,
     )
   }
 
-  // Code editor
+  // Code editor. Two flavours:
+  //   - Plain CodeEditor when the form value is a string (Go template
+  //     blobs etc. — what the original `code-editor` shipped for).
+  //   - StructuredCodeEditor when the form value is a structured
+  //     object/array (e.g. `router.candidates`, where the canonical
+  //     value is `[{label, model, rules}, ...]`). The wrapper keeps a
+  //     YAML representation in the textarea while publishing the
+  //     parsed structure back to form state, so the save flow can
+  //     unflatten it into the YAML file cleanly.
   if (component === 'code-editor') {
+    const isStructured = value !== null && value !== undefined && typeof value !== 'string'
     return (
       <div style={{ padding: 'var(--spacing-sm) 0', borderBottom: '1px solid var(--color-border-subtle)' }}>
         <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: 4 }}>
@@ -310,7 +323,9 @@ export default function ConfigFieldRenderer({ field, value, onChange, onRemove,
             <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', marginTop: 2 }}>{description}</div>
           </div>
         </div>
-        <CodeEditor value={value || ''} onChange={handleChange} minHeight="80px" />
+        {isStructured
+          ? <StructuredCodeEditor value={value} onChange={handleChange} minHeight="80px" />
+          : <CodeEditor value={value || ''} onChange={handleChange} minHeight="80px" />}
       </div>
     )
   }
@@ -345,6 +360,57 @@ export default function ConfigFieldRenderer({ field, value, onChange, onRemove,
     )
   }
 
+  // Router candidates — routing table editor. Each row is
+  // {model, labels[]}; the labels picker reads from router.policies
+  // via FormContext so candidate labels match the declared vocabulary.
+  if (component === 'router-candidates') {
+    return (
+      <div style={{ padding: 'var(--spacing-sm) 0', borderBottom: '1px solid var(--color-border-subtle)' }}>
+        <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: 4 }}>
+          <div>
+            <div style={{ fontSize: '0.875rem', fontWeight: 500 }}><FieldLabel field={field} /></div>
+            <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', marginTop: 2 }}>{description}</div>
+          </div>
+        </div>
+        <RouterCandidatesEditor value={value} onChange={handleChange} />
+      </div>
+    )
+  }
+
+  // Router policies — label vocabulary editor. Each row is
+  // {label, description}; the description ends up verbatim in the
+  // routing system prompt sent to the classifier model.
+  if (component === 'router-policies') {
+    return (
+      <div style={{ padding: 'var(--spacing-sm) 0', borderBottom: '1px solid var(--color-border-subtle)' }}>
+        <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: 4 }}>
+          <div>
+            <div style={{ fontSize: '0.875rem', fontWeight: 500 }}><FieldLabel field={field} /></div>
+            <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', marginTop: 2 }}>{description}</div>
+          </div>
+        </div>
+        <RouterPoliciesEditor value={value} onChange={handleChange} />
+      </div>
+    )
+  }
+
+  // PII pattern list — per-model action overrides for named patterns.
+  // The pattern catalog is loaded from /api/pii/patterns at render time
+  // so new built-in patterns surface automatically.
+  if (component === 'pii-pattern-list') {
+    return (
+      <div style={{ padding: 'var(--spacing-sm) 0', borderBottom: '1px solid var(--color-border-subtle)' }}>
+        <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: 4 }}>
+          <div>
+            <div style={{ fontSize: '0.875rem', fontWeight: 500 }}><FieldLabel field={field} /></div>
+            <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', marginTop: 2 }}>{description}</div>
+          </div>
+        </div>
+        <PIIPatternListEditor value={value} onChange={handleChange} />
+      </div>
+    )
+  }
+
   // Map editor
   if (component === 'map-editor') {
     return (
diff --git a/core/http/react-ui/src/components/PIIPatternListEditor.jsx b/core/http/react-ui/src/components/PIIPatternListEditor.jsx
new file mode 100644
index 000000000..558f4cd6a
--- /dev/null
+++ b/core/http/react-ui/src/components/PIIPatternListEditor.jsx
@@ -0,0 +1,120 @@
+import { useState, useEffect, useMemo } from 'react'
+import { apiUrl } from '../utils/basePath'
+import SearchableSelect from './SearchableSelect'
+
+const ACTION_OPTIONS = [
+  { value: 'mask', label: 'Mask — replace with a [REDACTED:id] placeholder' },
+  { value: 'block', label: 'Block — reject the request (request side) / mask in stream' },
+  { value: 'route_local', label: 'Route local — keep text, force local-only routing' },
+]
+
+export default function PIIPatternListEditor({ value, onChange }) {
+  const items = Array.isArray(value) ? value : []
+
+  const [catalog, setCatalog] = useState([])
+  const [loadError, setLoadError] = useState(null)
+
+  useEffect(() => {
+    let cancelled = false
+    fetch(apiUrl('/api/pii/patterns'))
+      .then(r => r.ok ? r.json() : Promise.reject(new Error(`HTTP ${r.status}`)))
+      .then(data => { if (!cancelled) setCatalog(data?.patterns || []) })
+      .catch(err => { if (!cancelled) setLoadError(err.message) })
+    return () => { cancelled = true }
+  }, [])
+
+  const idOptions = useMemo(() =>
+    catalog.map(p => ({
+      value: p.id,
+      label: p.description ? `${p.id} — ${p.description}` : p.id,
+    })),
+    [catalog]
+  )
+
+  // Patterns already chosen — exclude from the "add row" select so each
+  // pattern only appears once per model.
+  const usedIDs = new Set(items.map(it => it?.id).filter(Boolean))
+  const availableForAdd = idOptions.filter(o => !usedIDs.has(o.value))
+
+  const update = (index, key, val) => {
+    const next = items.map((it, i) =>
+      i === index ? { ...it, [key]: val } : it
+    )
+    onChange(next)
+  }
+
+  const remove = (index) => {
+    onChange(items.filter((_, i) => i !== index))
+  }
+
+  const add = (id) => {
+    const cat = catalog.find(c => c.id === id)
+    onChange([...items, { id, action: cat?.action || 'mask' }])
+  }
+
+  return (
+    <div style={{ display: 'flex', flexDirection: 'column', gap: 6, width: '100%' }}>
+      {loadError && (
+        <div style={{ fontSize: '0.75rem', color: 'var(--color-error)' }}>
+          Could not load pattern catalog: {loadError}. You can still type IDs manually.
+        </div>
+      )}
+
+      {items.length === 0 && (
+        <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
+          No overrides — every pattern uses its global default action. Add a row below to
+          tighten or relax the action for a specific pattern on this model.
+        </div>
+      )}
+
+      {items.map((row, i) => {
+        const cat = catalog.find(c => c.id === row?.id)
+        const idLabel = cat?.description ? `${row.id} — ${cat.description}` : (row?.id || '')
+        // Show the chosen id even if the catalog hasn't loaded yet (or
+        // the YAML references an unknown pattern), so users can edit
+        // without losing context.
+        const idItems = [
+          ...(row?.id && !idOptions.some(o => o.value === row.id)
+            ? [{ value: row.id, label: idLabel }]
+            : []),
+          ...idOptions.filter(o => o.value === row?.id || !usedIDs.has(o.value)),
+        ]
+        return (
+          <div key={i} style={{ display: 'flex', gap: 6, alignItems: 'center', flexWrap: 'wrap' }}>
+            <SearchableSelect
+              value={row?.id || ''}
+              onChange={v => update(i, 'id', v)}
+              options={idItems}
+              placeholder="Pattern..."
+              style={{ flex: '1 1 220px', minWidth: 200 }}
+            />
+            <SearchableSelect
+              value={row?.action || 'mask'}
+              onChange={v => update(i, 'action', v)}
+              options={ACTION_OPTIONS}
+              placeholder="Action..."
+              style={{ flex: '1 1 240px', minWidth: 220 }}
+            />
+            <button type="button" className="btn btn-secondary btn-sm"
+              onClick={() => remove(i)}
+              style={{ padding: '2px 8px', fontSize: '0.75rem' }}>
+              <i className="fas fa-times" />
+            </button>
+          </div>
+        )
+      })}
+
+      {availableForAdd.length > 0 && (
+        <div style={{ display: 'flex', gap: 6, alignItems: 'center' }}>
+          <SearchableSelect
+            value=""
+            onChange={v => v && add(v)}
+            options={availableForAdd}
+            placeholder="+ Add pattern override..."
+            style={{ flex: '1 1 220px', minWidth: 200 }}
+          />
+        </div>
+      )}
+    </div>
+  )
+}
diff --git a/core/http/react-ui/src/components/RequireAuthEnabled.jsx b/core/http/react-ui/src/components/RequireAuthEnabled.jsx
new file mode 100644
index 000000000..c71ca3f07
--- /dev/null
+++ b/core/http/react-ui/src/components/RequireAuthEnabled.jsx
@@ -0,0 +1,16 @@
+import { Navigate } from 'react-router-dom'
+import { useAuth } from '../context/AuthContext'
+
+// RequireAuthEnabled gates routes that only make sense when auth is on.
+// User management is the canonical example: in single-user (no-auth)
+// mode there is exactly one synthetic local user, so the page would
+// either be empty or expose admin tools that have nothing to manage.
+//
+// We redirect to /app rather than render a "not available" page so that
+// stale bookmarks don't leave the user on a dead-end screen.
+export default function RequireAuthEnabled({ children }) {
+  const { authEnabled, loading } = useAuth()
+  if (loading) return null
+  if (!authEnabled) return <Navigate to="/app" replace />
+  return children
+}
diff --git a/core/http/react-ui/src/components/RouterCandidatesEditor.jsx b/core/http/react-ui/src/components/RouterCandidatesEditor.jsx
new file mode 100644
index 000000000..5d744c8d4
--- /dev/null
+++ b/core/http/react-ui/src/components/RouterCandidatesEditor.jsx
@@ -0,0 +1,185 @@
+import { useMemo } from 'react'
+import { useFormContext } from '../contexts/FormContext'
+import SearchableModelSelect from './SearchableModelSelect'
+
+// RouterCandidatesEditor renders the routing table for a router model.
+// Each row binds a downstream model to a SET of policy labels it can
+// serve. The middleware picks the first candidate whose labels are a
+// superset of the active label set from the classifier, so admins
+// order candidates smallest → largest.
+//
+// Schema mirrors core/config.RouterCandidate:
+//   { model: string, labels: []string }
+//
+// Labels are picked from the parent form's router.policies (a multi-
+// select rather than a free-text input) so a typo in one place doesn't
+// silently disable a candidate. Labels typed manually are still kept
+// — useful when admins paste a config before defining the policies.
+
+export default function RouterCandidatesEditor({ value, onChange }) {
+  const items = Array.isArray(value) ? value : []
+  const knownLabels = usePolicyLabels()
+  const knownLabelSet = useMemo(() => new Set(knownLabels), [knownLabels])
+
+  const update = (index, mut) => {
+    const next = items.map((it, i) => (i === index ? mut({ ...it }) : it))
+    onChange(next)
+  }
+  const remove = (index) => onChange(items.filter((_, i) => i !== index))
+  const move = (index, dir) => {
+    const j = index + dir
+    if (j < 0 || j >= items.length) return
+    const next = items.slice()
+    ;[next[index], next[j]] = [next[j], next[index]]
+    onChange(next)
+  }
+  const add = () => onChange([...items, { model: '', labels: [] }])
+
+  return (
+    <div style={{ display: 'flex', flexDirection: 'column', gap: 'var(--spacing-sm)', width: '100%' }}>
+      {items.length === 0 && (
+        <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', padding: 'var(--spacing-sm) 0' }}>
+          No candidates yet. Add at least one — order from smallest model to largest.
+          The middleware picks the FIRST candidate whose labels superset the active set.
+        </div>
+      )}
+
+      {items.map((row, i) => (
+        <CandidateRow
+          key={i}
+          index={i}
+          total={items.length}
+          row={row}
+          knownLabels={knownLabels}
+          knownLabelSet={knownLabelSet}
+          onChange={(mut) => update(i, mut)}
+          onRemove={() => remove(i)}
+          onMove={(dir) => move(i, dir)}
+        />
+      ))}
+
+      <button
+        type="button"
+        className="btn btn-secondary btn-sm"
+        onClick={add}
+        style={{ alignSelf: 'flex-start' }}
+      >
+        <i className="fas fa-plus" /> Add candidate
+      </button>
+    </div>
+  )
+}
+
+function CandidateRow({ index, total, row, knownLabels, knownLabelSet, onChange, onRemove, onMove }) {
+  const labels = Array.isArray(row?.labels) ? row.labels : []
+  const toggleLabel = (label) => onChange((r) => ({
+    ...r,
+    labels: labels.includes(label) ? labels.filter(l => l !== label) : [...labels, label],
+  }))
+
+  // Row-local labels not in the parent policy list are still surfaced
+  // (with a warning chip) so a stale row doesn't silently lose its
+  // labels while the policy list is being edited.
+  const unknownOnRow = labels.filter(l => !knownLabelSet.has(l))
+  const visible = [...knownLabels, ...unknownOnRow]
+
+  return (
+    <div
+      className="card"
+      style={{
+        padding: 'var(--spacing-sm)',
+        display: 'flex',
+        flexDirection: 'column',
+        gap: 'var(--spacing-xs)',
+        border: '1px solid var(--color-border)',
+      }}
+    >
+      <div style={{ display: 'flex', gap: 4, alignItems: 'center', fontSize: '0.6875rem', color: 'var(--color-text-muted)' }}>
+        <span style={{ fontWeight: 600 }}>#{index + 1}</span>
+        <button
+          type="button"
+          className="btn btn-secondary btn-sm"
+          onClick={() => onMove(-1)}
+          disabled={index === 0}
+          title="Move up (smaller priority)"
+          style={{ padding: '0 6px' }}
+        >
+          <i className="fas fa-arrow-up" />
+        </button>
+        <button
+          type="button"
+          className="btn btn-secondary btn-sm"
+          onClick={() => onMove(1)}
+          disabled={index === total - 1}
+          title="Move down"
+          style={{ padding: '0 6px' }}
+        >
+          <i className="fas fa-arrow-down" />
+        </button>
+        <span style={{ marginLeft: 'auto' }}>
+          {index === 0 ? 'tried first' : index === total - 1 ? 'tried last (fallback-class)' : ''}
+        </span>
+      </div>
+
+      <div style={{ display: 'grid', gridTemplateColumns: '1fr auto', gap: 'var(--spacing-sm)', alignItems: 'center' }}>
+        <SearchableModelSelect
+          value={row?.model || ''}
+          onChange={(v) => onChange((r) => ({ ...r, model: v }))}
+          placeholder="downstream model..."
+        />
+        <button
+          type="button"
+          className="btn btn-secondary btn-sm"
+          onClick={onRemove}
+          title="Remove candidate"
+        >
+          <i className="fas fa-trash" />
+        </button>
+      </div>
+
+      <div>
+        <div style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)', marginBottom: 4 }}>
+          {visible.length === 0
+            ? 'No policies defined yet — add policies above before assigning labels.'
+            : 'Labels this model can serve. The middleware requires the candidate to cover every label the classifier activates.'}
+        </div>
+        <div style={{ display: 'flex', flexWrap: 'wrap', gap: 6 }}>
+          {visible.map((label) => {
+            const on = labels.includes(label)
+            const known = knownLabelSet.has(label)
+            return (
+              <button
+                key={label}
+                type="button"
+                onClick={() => toggleLabel(label)}
+                title={known ? '' : 'Not in router.policies — typo or stale'}
+                style={{
+                  padding: '4px 10px',
+                  borderRadius: 'var(--radius-md)',
+                  fontSize: '0.75rem',
+                  fontFamily: 'var(--font-mono)',
+                  border: '1px solid ' + (on ? 'var(--color-primary)' : 'var(--color-border)'),
+                  background: on ? 'var(--color-primary)' : 'transparent',
+                  color: on ? 'var(--color-bg-primary)' : (known ? 'var(--color-text-primary)' : 'var(--color-warning)'),
+                  cursor: 'pointer',
+                }}
+              >
+                {label}{!known && ' ⚠'}
+              </button>
+            )
+          })}
+        </div>
+      </div>
+    </div>
+  )
+}
+
+// usePolicyLabels reads router.policies from the surrounding form state
+// and returns the list of declared labels. Falls back to [] when no
+// FormContext is present (e.g. preview render).
+function usePolicyLabels() {
+  const ctx = useFormContext()
+  const policies = ctx?.formData?.['router.policies']
+  if (!Array.isArray(policies)) return []
+  return policies.map(p => p?.label).filter(Boolean)
+}
diff --git a/core/http/react-ui/src/components/RouterPoliciesEditor.jsx b/core/http/react-ui/src/components/RouterPoliciesEditor.jsx
new file mode 100644
index 000000000..b323bc288
--- /dev/null
+++ b/core/http/react-ui/src/components/RouterPoliciesEditor.jsx
@@ -0,0 +1,109 @@
+import { useMemo } from 'react'
+
+// RouterPoliciesEditor renders the label vocabulary the score
+// classifier ranks for each request. The shape mirrors
+// core/config.RouterPolicy:
+//
+//   { label: string, description: string }
+//
+// The description ends up verbatim in the routing system prompt fed
+// to the classifier model. Short, action-oriented sentences ("writing
+// or debugging code", "small talk") consistently produce cleaner
+// label distributions on Arch-Router-style scorers than longer
+// taxonomies — keep them tight.
+
+export default function RouterPoliciesEditor({ value, onChange }) {
+  const items = Array.isArray(value) ? value : []
+
+  const duplicateLabels = useMemo(() => {
+    const seen = new Set()
+    const dup = new Set()
+    for (const it of items) {
+      const label = it?.label
+      if (!label) continue
+      if (seen.has(label)) dup.add(label)
+      else seen.add(label)
+    }
+    return dup
+  }, [items])
+
+  const update = (index, mut) => {
+    const next = items.map((it, i) => (i === index ? mut({ ...it }) : it))
+    onChange(next)
+  }
+  const remove = (index) => onChange(items.filter((_, i) => i !== index))
+  const add = () => onChange([...items, { label: '', description: '' }])
+
+  return (
+    <div style={{ display: 'flex', flexDirection: 'column', gap: 'var(--spacing-sm)', width: '100%' }}>
+      {items.length === 0 && (
+        <div style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)', padding: 'var(--spacing-sm) 0' }}>
+          No policies defined. Add at least one — the classifier needs a label vocabulary to rank over,
+          and candidates reference these labels.
+        </div>
+      )}
+
+      {items.map((row, i) => (
+        <PolicyRow
+          key={i}
+          row={row}
+          duplicate={!!row?.label && duplicateLabels.has(row.label)}
+          onChange={(mut) => update(i, mut)}
+          onRemove={() => remove(i)}
+        />
+      ))}
+
+      <button
+        type="button"
+        className="btn btn-secondary btn-sm"
+        onClick={add}
+        style={{ alignSelf: 'flex-start' }}
+      >
+        <i className="fas fa-plus" /> Add policy
+      </button>
+    </div>
+  )
+}
+
+function PolicyRow({ row, duplicate, onChange, onRemove }) {
+  return (
+    <div
+      className="card"
+      style={{
+        padding: 'var(--spacing-sm)',
+        display: 'grid',
+        gridTemplateColumns: '180px 1fr auto',
+        gap: 'var(--spacing-sm)',
+        alignItems: 'center',
+        border: '1px solid ' + (duplicate ? 'var(--color-warning, #d97706)' : 'var(--color-border)'),
+      }}
+    >
+      <input
+        className="input"
+        type="text"
+        placeholder="label (e.g. code-generation)"
+        value={row?.label || ''}
+        onChange={(e) => onChange((r) => ({ ...r, label: e.target.value }))}
+        style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem' }}
+        title={duplicate ? 'Duplicate label — candidates won\'t be able to distinguish them' : ''}
+      />
+      <input
+        className="input"
+        type="text"
+        placeholder="short description fed verbatim to the classifier prompt"
+        value={row?.description || ''}
+        onChange={(e) => onChange((r) => ({ ...r, description: e.target.value }))}
+        style={{ fontSize: '0.8125rem' }}
+      />
+      <button
+        type="button"
+        className="btn btn-secondary btn-sm"
+        onClick={onRemove}
+        title="Remove policy"
+      >
+        <i className="fas fa-trash" />
+      </button>
+    </div>
+  )
+}
+
diff --git a/core/http/react-ui/src/components/Sidebar.jsx b/core/http/react-ui/src/components/Sidebar.jsx
index 9956fb7c5..148a33bfb 100644
--- a/core/http/react-ui/src/components/Sidebar.jsx
+++ b/core/http/react-ui/src/components/Sidebar.jsx
@@ -69,8 +69,9 @@ const sections = [
     id: 'system',
     titleKey: 'sections.system',
     items: [
-      { path: '/app/usage', icon: 'fas fa-chart-bar', labelKey: 'items.usage', authOnly: true },
+      { path: '/app/usage', icon: 'fas fa-chart-bar', labelKey: 'items.usage' },
       { path: '/app/users', icon: 'fas fa-users', labelKey: 'items.users', adminOnly: true, authOnly: true },
+      { path: '/app/middleware', icon: 'fas fa-shield-halved', labelKey: 'items.middleware', adminOnly: true },
       { path: '/app/backends', icon: 'fas fa-server', labelKey: 'items.backends', adminOnly: true },
       { path: '/app/traces', icon: 'fas fa-chart-line', labelKey: 'items.traces', adminOnly: true },
       { path: '/app/nodes', icon: 'fas fa-network-wired', labelKey: 'items.nodes', adminOnly: true, feature: 'distributed' },
diff --git a/core/http/react-ui/src/components/StructuredCodeEditor.jsx b/core/http/react-ui/src/components/StructuredCodeEditor.jsx
new file mode 100644
index 000000000..496d0cb1b
--- /dev/null
+++ b/core/http/react-ui/src/components/StructuredCodeEditor.jsx
@@ -0,0 +1,80 @@
+import { useEffect, useState } from 'react'
+import YAML from 'yaml'
+import CodeEditor from './CodeEditor'
+
+// StructuredCodeEditor is the wrapper that lets a `code-editor`
+// field hold a structured value (object / array) rather than a raw
+// string. Two reasons we need this:
+//
+//   1. CodeMirror's EditorState.create({ doc }) requires a string —
+//      pass an array and it crashes inside CM's Text class with
+//      "(intermediate value).split is not a function".
+//   2. The model-editor save path uses unflattenConfig + YAML.stringify
+//      which needs the structured value to round-trip cleanly into
+//      YAML (otherwise a YAML-string-of-YAML appears in the file).
+//
+// The component keeps two pieces of state in sync:
+//   - `text`: the YAML representation shown to the user. The user
+//     edits this; we don't reformat while they type.
+//   - upstream `value`: the parsed structured value held by the
+//     editor form. We try to parse `text` on every edit; if the
+//     parse succeeds we publish the new structure, otherwise the
+//     structured value lags until the YAML is syntactically valid
+//     again (the linter shows the error inline).
+export default function StructuredCodeEditor({ value, onChange, minHeight }) {
+  // Lazy-init: stringify the initial structured value once. Subsequent
+  // re-renders driven by our own onChange keep `text` authoritative —
+  // we only re-sync from `value` when it changes due to an external
+  // edit (template selection, YAML-tab save).
+  const [text, setText] = useState(() => structuredToYAML(value))
+  const [lastExternal, setLastExternal] = useState(value)
+
+  useEffect(() => {
+    // Detect external changes (a different `value` reference that
+    // didn't come from our own parse). reference-equality is enough
+    // because onChange always publishes the parsed object, never the
+    // text.
+    if (value !== lastExternal) {
+      const next = structuredToYAML(value)
+      setText(next)
+      setLastExternal(value)
+    }
+  }, [value, lastExternal])
+
+  const handleTextChange = (nextText) => {
+    setText(nextText)
+    // Empty buffer publishes empty array — the most common "I want to
+    // start fresh" case and keeps a YAML-valid round-trip.
+    if (!nextText.trim()) {
+      onChange([])
+      setLastExternal([])
+      return
+    }
+    try {
+      const parsed = YAML.parse(nextText)
+      onChange(parsed)
+      setLastExternal(parsed)
+    } catch {
+      // Hold the structured value steady while YAML is being typed
+      // and is temporarily invalid. The CodeMirror YAML linter
+      // surfaces the syntax error inline.
+    }
+  }
+
+  return <CodeEditor value={text} onChange={handleTextChange} minHeight={minHeight} />
+}
+
+// structuredToYAML renders the form-state value as the YAML text the
+// editor shows. Strings pass through untouched (so a legacy template
+// that supplied a pre-formatted YAML string still renders cleanly).
+// null/undefined renders as empty so the editor starts blank rather
+// than showing the literal "null\n".
+export function structuredToYAML(value) {
+  if (value === null || value === undefined) return ''
+  if (typeof value === 'string') return value
+  try {
+    return YAML.stringify(value)
+  } catch {
+    return ''
+  }
+}
diff --git a/core/http/react-ui/src/contexts/FormContext.jsx b/core/http/react-ui/src/contexts/FormContext.jsx
new file mode 100644
index 000000000..f29402e34
--- /dev/null
+++ b/core/http/react-ui/src/contexts/FormContext.jsx
@@ -0,0 +1,26 @@
+import { createContext, useContext, useMemo } from 'react'
+
+// FormContext exposes the surrounding form's read-only state to deep
+// field editors that need to inspect sibling fields. Used by the
+// router-candidates editor to read router.policies so candidate
+// labels can be picked from the declared policy vocabulary rather
+// than typed by hand.
+//
+// Only the read shape is exposed (formData); mutations still go
+// through the parent's onChange so the editor remains the single
+// source of truth.
+const FormContext = createContext(null)
+
+export function FormContextProvider({ formData, children }) {
+  // Memo the wrapper so consumers don't re-render on every keystroke
+  // when formData itself is referentially stable. ModelEditor's
+  // setValues replaces the object on each edit, so this still
+  // propagates updates — it just avoids spurious churn when an
+  // ancestor re-renders without changing values.
+  const value = useMemo(() => ({ formData }), [formData])
+  return <FormContext.Provider value={value}>{children}</FormContext.Provider>
+}
+
+export function useFormContext() {
+  return useContext(FormContext)
+}
diff --git a/core/http/react-ui/src/hooks/useChat.js b/core/http/react-ui/src/hooks/useChat.js
index 43a869653..539190b99 100644
--- a/core/http/react-ui/src/hooks/useChat.js
+++ b/core/http/react-ui/src/hooks/useChat.js
@@ -197,7 +197,6 @@ export function useChat(initialModel = '') {
     const temperature = activeChat.temperature
     const topP = activeChat.topP
     const topK = activeChat.topK
-    const contextSize = activeChat.contextSize
 
     // Build user message content
     let messageContent
@@ -268,7 +267,10 @@ export function useChat(initialModel = '') {
     if (temperature !== null && temperature !== undefined) requestBody.temperature = temperature
     if (topP !== null && topP !== undefined) requestBody.top_p = topP
     if (topK !== null && topK !== undefined) requestBody.top_k = topK
-    if (contextSize) requestBody.max_tokens = contextSize
+    // contextSize is the model's input+output window, not an
+    // output cap. Backends bound generation at remaining context
+    // automatically; Anthropic translate mode supplies its own
+    // default. So we deliberately do not send any output-token cap.
 
     // MCP: send selected servers via metadata so the backend activates them
     const hasMcpServers = activeChat.mcpServers && activeChat.mcpServers.length > 0
diff --git a/core/http/react-ui/src/pages/Manage.jsx b/core/http/react-ui/src/pages/Manage.jsx
index 4f0f6ab3a..eb8da5759 100644
--- a/core/http/react-ui/src/pages/Manage.jsx
+++ b/core/http/react-ui/src/pages/Manage.jsx
@@ -19,6 +19,7 @@ import {
   CAP_CHAT, CAP_COMPLETION, CAP_IMAGE, CAP_VIDEO, CAP_TTS,
   CAP_TRANSCRIPT, CAP_SOUND_GENERATION, CAP_FACE_RECOGNITION,
   CAP_SPEAKER_RECOGNITION, CAP_EMBEDDINGS, CAP_RERANK,
+  CAP_VAD, CAP_SCORE,
 } from '../utils/capabilities'
 
 const TABS = [
@@ -41,6 +42,14 @@ const USE_CASES = [
   { cap: CAP_SPEAKER_RECOGNITION, label: 'Voice',      route: (id) => `/app/voice/${encodeURIComponent(id)}` },
   { cap: CAP_EMBEDDINGS,          label: 'Embeddings' },
   { cap: CAP_RERANK,              label: 'Rerank' },
+  // Display-only badges (no playground page): infrastructure
+  // capabilities the operator declares but doesn't directly drive
+  // from the UI. VAD feeds the transcribe pipeline; Score feeds
+  // the router classifier — both are wired through other model
+  // configs and need to be visible here so operators can confirm
+  // the underlying model declares the right known_usecases.
+  { cap: CAP_VAD,                 label: 'VAD' },
+  { cap: CAP_SCORE,               label: 'Score' },
 ]
 
 // Number of columns the expandable detail row spans, per tab. Kept as
@@ -634,8 +643,7 @@ export default function Manage() {
                               key={uc.cap}
                               href="#"
                               onClick={(e) => { e.preventDefault(); e.stopPropagation(); navigate(uc.route(model.id)) }}
-                              className="badge badge-info"
-                              style={{ textDecoration: 'none', cursor: 'pointer' }}
+                              className="badge badge-info badge-link"
                             >{uc.label}</a>
                           ) : (
                             <span key={uc.cap} className="badge">{uc.label}</span>
@@ -1037,8 +1045,7 @@ function ModelDetail({ model, enriched, matchedCaps, distributedMode, onNavigate
                   key={uc.cap}
                   href="#"
                   onClick={(e) => { e.preventDefault(); onNavigate(uc.route(model.id)) }}
-                  className="badge badge-info"
-                  style={{ textDecoration: 'none', cursor: 'pointer' }}
+                  className="badge badge-info badge-link"
                 >{uc.label}</a>
               ) : (
                 <span key={uc.cap} className="badge">{uc.label}</span>
diff --git a/core/http/react-ui/src/pages/Middleware.jsx b/core/http/react-ui/src/pages/Middleware.jsx
new file mode 100644
index 000000000..4d51251bc
--- /dev/null
+++ b/core/http/react-ui/src/pages/Middleware.jsx
@@ -0,0 +1,1108 @@
+import { useState, useEffect, useCallback, useRef, useMemo, Fragment } from 'react'
+import { useOutletContext, Link, useNavigate } from 'react-router-dom'
+import { apiUrl } from '../utils/basePath'
+import { settingsApi } from '../utils/api'
+import LoadingSpinner from '../components/LoadingSpinner'
+
+// Middleware admin page. Three tabs:
+//   - Filtering: PII pattern catalogue + per-model resolved state +
+//     pattern-action editor (PUT /api/pii/patterns/:id, transient).
+//   - Routing: placeholder until subsystem 2 lands. Renders the note
+//     from /api/router/status so admins see "not yet implemented" rather
+//     than an empty page.
+//   - Events: recent PIIEvent rows from /api/pii/events. The page
+//     intentionally NEVER displays the redacted content (the redactor
+//     never stores it); only pattern_id, byte_offset, length, and an
+//     8-char sha256 prefix admins can use to dedupe recurring leaks.
+//
+// Wiring is admin-only: RequireAdmin in router.jsx already redirects
+// non-admin viewers; in single-user no-auth mode the local user has
+// admin role so the page works without --auth.
+
+const TABS = [
+  { id: 'filtering', label: 'Filtering', icon: 'fa-shield-halved' },
+  { id: 'routing', label: 'Routing', icon: 'fa-route' },
+  { id: 'proxy', label: 'MITM Proxy', icon: 'fa-shield' },
+  { id: 'events', label: 'Events', icon: 'fa-list-ul' },
+]
+
+const ACTIONS = ['mask', 'block', 'route_local']
+
+function actionBadge(action) {
+  const colors = {
+    mask: 'var(--color-primary)',
+    block: 'var(--color-error)',
+    route_local: 'var(--color-warning)',
+  }
+  return (
+    <span style={{
+      display: 'inline-block',
+      padding: '2px 8px',
+      fontSize: '0.6875rem',
+      fontWeight: 600,
+      borderRadius: 'var(--radius-sm)',
+      background: colors[action] || 'var(--color-bg-tertiary)',
+      color: 'white',
+      fontFamily: 'var(--font-mono)',
+      textTransform: 'uppercase',
+    }}>
+      {action}
+    </span>
+  )
+}
+
+function enabledBadge(enabled) {
+  return (
+    <span style={{
+      display: 'inline-block',
+      padding: '2px 8px',
+      fontSize: '0.6875rem',
+      fontWeight: 600,
+      borderRadius: 'var(--radius-sm)',
+      background: enabled ? 'var(--color-success, #22c55e)' : 'var(--color-bg-tertiary)',
+      color: enabled ? 'white' : 'var(--color-text-muted)',
+      fontFamily: 'var(--font-mono)',
+      textTransform: 'uppercase',
+    }}>
+      {enabled ? 'on' : 'off'}
+    </span>
+  )
+}
+
+export default function Middleware() {
+  const { addToast } = useOutletContext()
+  const [status, setStatus] = useState(null)
+  const [events, setEvents] = useState([])
+  const [decisions, setDecisions] = useState([])
+  const [loading, setLoading] = useState(true)
+  const [activeTab, setActiveTab] = useState('filtering')
+  const [pendingPattern, setPendingPattern] = useState(null) // id while a PUT is in flight
+
+  // silent=true on background polls: skips the loading spinner and
+  // suppresses toast spam if the server is briefly unreachable.
+  const fetchAll = useCallback(async (silent = false) => {
+    if (!silent) setLoading(true)
+    try {
+      const [statusRes, eventsRes, decisionsRes] = await Promise.all([
+        fetch(apiUrl('/api/middleware/status')),
+        fetch(apiUrl('/api/pii/events?limit=100')),
+        fetch(apiUrl('/api/router/decisions?limit=100')),
+      ])
+      if (!statusRes.ok) throw new Error(`status: HTTP ${statusRes.status}`)
+      const statusData = await statusRes.json()
+      setStatus(statusData)
+      if (eventsRes.ok) {
+        const data = await eventsRes.json()
+        setEvents(data.events || [])
+      }
+      if (decisionsRes.ok) {
+        const data = await decisionsRes.json()
+        setDecisions(data.decisions || [])
+      }
+    } catch (err) {
+      if (!silent) addToast(`Failed to load middleware status: ${err.message}`, 'error')
+    } finally {
+      if (!silent) setLoading(false)
+    }
+  }, [addToast])
+
+  useEffect(() => { fetchAll() }, [fetchAll])
+
+  // Auto-refresh every 5s so admins watching the Events / Routing tabs
+  // see new rows without manual refresh. Matches the Traces page cadence.
+  // ProxyTab guards against clobbering mid-typed config via its own
+  // `dirty` check, so the poll is safe while the form is in use.
+  const refreshRef = useRef(null)
+  useEffect(() => {
+    refreshRef.current = setInterval(() => fetchAll(true), 5000)
+    return () => clearInterval(refreshRef.current)
+  }, [fetchAll])
+
+  const mutatePattern = async (patternID, body, successMsg) => {
+    setPendingPattern(patternID)
+    try {
+      const res = await fetch(apiUrl(`/api/pii/patterns/${encodeURIComponent(patternID)}`), {
+        method: 'PUT',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify(body),
+      })
+      if (!res.ok) {
+        const data = await res.json().catch(() => ({}))
+        throw new Error(data.error || `HTTP ${res.status}`)
+      }
+      addToast(successMsg, 'success')
+      await fetchAll()
+    } catch (err) {
+      addToast(`Failed to update pattern: ${err.message}`, 'error')
+    } finally {
+      setPendingPattern(null)
+    }
+  }
+
+  const setPatternAction = (patternID, action) =>
+    mutatePattern(patternID, { action }, `Pattern ${patternID}: action ${action} (transient — click "Save to disk" to persist)`)
+
+  const setPatternDisabled = (patternID, disabled) =>
+    mutatePattern(patternID, { disabled }, `Pattern ${patternID}: ${disabled ? 'disabled' : 'enabled'} (transient — click "Save to disk" to persist)`)
+
+  const [persisting, setPersisting] = useState(false)
+  const persistPatterns = async () => {
+    setPersisting(true)
+    try {
+      const res = await fetch(apiUrl('/api/pii/patterns/persist'), { method: 'POST' })
+      if (!res.ok) {
+        const data = await res.json().catch(() => ({}))
+        throw new Error(data.error || `HTTP ${res.status}`)
+      }
+      const data = await res.json().catch(() => ({}))
+      addToast(`Saved ${data.override_count ?? 0} pattern override(s) to runtime_settings.json`, 'success')
+    } catch (err) {
+      addToast(`Failed to persist: ${err.message}`, 'error')
+    } finally {
+      setPersisting(false)
+    }
+  }
+
+  return (
+    <div className="page page--wide">
+      <div className="page-header" style={{ marginBottom: 'var(--spacing-sm)' }}>
+        <h1 className="page-title">Middleware</h1>
+        <p className="page-subtitle">
+          Inspect and configure routing-module middleware: PII filtering and intelligent routing.
+        </p>
+      </div>
+
+      {/* Tab bar */}
+      <div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-xs)', marginBottom: 'var(--spacing-md)', flexWrap: 'wrap' }}>
+        {TABS.map(tab => (
+          <button
+            key={tab.id}
+            className={`btn btn-sm ${activeTab === tab.id ? 'btn-primary' : 'btn-secondary'}`}
+            onClick={() => setActiveTab(tab.id)}
+          >
+            <i className={`fas ${tab.icon}`} style={{ marginRight: 4 }} />
+            {tab.label}
+          </button>
+        ))}
+        <div style={{ flex: 1 }} />
+        <button className="btn btn-secondary btn-sm" onClick={fetchAll} disabled={loading}>
+          <i className={`fas fa-rotate${loading ? ' fa-spin' : ''}`} /> Refresh
+        </button>
+      </div>
+
+      {loading && !status ? (
+        <div style={{ display: 'flex', justifyContent: 'center', padding: 'var(--spacing-xl)' }}>
+          <LoadingSpinner size="lg" />
+        </div>
+      ) : activeTab === 'filtering' ? (
+        <FilteringTab
+          status={status}
+          pendingPattern={pendingPattern}
+          onSetAction={setPatternAction}
+          onSetDisabled={setPatternDisabled}
+          onPersist={persistPatterns}
+          persisting={persisting}
+        />
+      ) : activeTab === 'routing' ? (
+        <RoutingTab status={status} decisions={decisions} />
+      ) : activeTab === 'proxy' ? (
+        <ProxyTab status={status} addToast={addToast} onChanged={fetchAll} />
+      ) : (
+        <EventsTab events={events} />
+      )}
+    </div>
+  )
+}
+
+function FilteringTab({ status, pendingPattern, onSetAction, onSetDisabled, onPersist, persisting }) {
+  if (!status?.pii) return null
+  const pii = status.pii
+
+  if (!pii.enabled_globally) {
+    return (
+      <div className="empty-state">
+        <div className="empty-state-icon"><i className="fas fa-shield-slash" /></div>
+        <h2 className="empty-state-title">PII filtering disabled</h2>
+        <p className="empty-state-text">
+          The PII filter is disabled by <code>{pii.reason || '--disable-pii'}</code>.
+          Restart without that flag to enable it.
+        </p>
+      </div>
+    )
+  }
+
+  return (
+    <>
+      {/* Default rule banner */}
+      <div className="card" style={{ padding: 'var(--spacing-md)', marginBottom: 'var(--spacing-md)' }}>
+        <div style={{ display: 'flex', alignItems: 'flex-start', gap: 'var(--spacing-sm)' }}>
+          <i className="fas fa-info-circle" style={{ color: 'var(--color-text-muted)', marginTop: 2 }} />
+          <div>
+            <div style={{ fontWeight: 600, marginBottom: 4 }}>Default policy</div>
+            <div style={{ fontSize: '0.8125rem', color: 'var(--color-text-secondary)' }}>
+              PII redaction is per-model and OFF by default. Backends matching <code>{(pii.default_enabled_for_backends || []).join(', ')}</code> default to ON (cloud passthroughs). Override per model with <code>pii: {'{'} enabled: true {'}'}</code> in the model YAML.
+            </div>
+          </div>
+        </div>
+      </div>
+
+      {/* Patterns table */}
+      <div className="card" style={{ padding: 'var(--spacing-md)', marginBottom: 'var(--spacing-md)' }}>
+        <div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', marginBottom: 'var(--spacing-sm)' }}>
+          <span style={{ fontSize: '0.875rem', fontWeight: 600 }}>Active patterns</span>
+          <div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-sm)' }}>
+            <span style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)' }}>
+              Toggle / action edits are transient — click Save to disk to persist.
+            </span>
+            <button
+              className="btn btn-secondary btn-sm"
+              onClick={onPersist}
+              disabled={persisting}
+              style={{ fontSize: '0.75rem' }}
+            >
+              <i className={`fas ${persisting ? 'fa-spinner fa-spin' : 'fa-save'}`} /> Save to disk
+            </button>
+          </div>
+        </div>
+        <div className="table-container">
+          <table className="table">
+            <thead>
+              <tr>
+                <th style={{ width: 80 }}>Enabled</th>
+                <th style={{ width: 140 }}>Pattern</th>
+                <th>Description</th>
+                <th style={{ width: 110 }}>Action</th>
+                <th style={{ width: 250 }}>Change</th>
+              </tr>
+            </thead>
+            <tbody>
+              {pii.patterns.map(p => {
+                const enabled = !p.disabled
+                const muted = p.disabled
+                return (
+                <tr key={p.id} style={muted ? { opacity: 0.55 } : undefined}>
+                  <td>
+                    <input
+                      type="checkbox"
+                      checked={enabled}
+                      disabled={pendingPattern === p.id}
+                      onChange={e => onSetDisabled(p.id, !e.target.checked)}
+                      style={{ cursor: 'pointer' }}
+                      aria-label={`Enable ${p.id} pattern`}
+                    />
+                  </td>
+                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem', fontWeight: 600 }}>{p.id}</td>
+                  <td style={{ fontSize: '0.8125rem', color: 'var(--color-text-secondary)' }}>{p.description}</td>
+                  <td>{actionBadge(p.action)}</td>
+                  <td>
+                    <div style={{ display: 'flex', gap: 4 }}>
+                      {ACTIONS.map(a => (
+                        <button
+                          key={a}
+                          className={`btn btn-sm ${p.action === a ? 'btn-primary' : 'btn-secondary'}`}
+                          onClick={() => onSetAction(p.id, a)}
+                          disabled={pendingPattern === p.id || p.action === a || p.disabled}
+                          style={{ fontSize: '0.6875rem', padding: '2px 8px' }}
+                        >
+                          {a}
+                        </button>
+                      ))}
+                    </div>
+                  </td>
+                </tr>
+              )})}
+            </tbody>
+          </table>
+        </div>
+      </div>
+
+      {/* Per-model resolved state */}
+      <div className="card" style={{ padding: 'var(--spacing-md)' }}>
+        <div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', marginBottom: 'var(--spacing-sm)' }}>
+          <span style={{ fontSize: '0.875rem', fontWeight: 600 }}>Per-model state</span>
+          <span style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)' }}>
+            Edit the model YAML to change these.
+          </span>
+        </div>
+        <div className="table-container">
+          <table className="table">
+            <thead>
+              <tr>
+                <th>Model</th>
+                <th style={{ width: 120 }}>Backend</th>
+                <th style={{ width: 80 }}>PII</th>
+                <th style={{ width: 110 }}>Source</th>
+                <th>Pattern overrides</th>
+                <th style={{ width: 80 }}>Edit</th>
+              </tr>
+            </thead>
+            <tbody>
+              {(pii.models || []).map(m => (
+                <tr key={m.name}>
+                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem' }}>{m.name}</td>
+                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>{m.backend || '—'}</td>
+                  <td>{enabledBadge(m.enabled)}</td>
+                  <td style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)' }}>
+                    {m.explicit ? 'YAML' : (m.default_for_backend ? 'backend default' : 'default off')}
+                  </td>
+                  <td style={{ fontSize: '0.75rem', fontFamily: 'var(--font-mono)' }}>
+                    {m.overrides && Object.keys(m.overrides).length > 0
+                      ? Object.entries(m.overrides).map(([k, v]) => `${k}=${v}`).join(', ')
+                      : <span style={{ color: 'var(--color-text-muted)' }}>—</span>}
+                  </td>
+                  <td>
+                    <Link
+                      to={`/app/model-editor/${encodeURIComponent(m.name)}`}
+                      className="btn btn-secondary btn-sm"
+                      style={{ fontSize: '0.6875rem', padding: '2px 8px' }}
+                      title={`Edit ${m.name}.yaml`}
+                    >
+                      <i className="fas fa-pen-to-square" /> Edit
+                    </Link>
+                  </td>
+                </tr>
+              ))}
+              {(!pii.models || pii.models.length === 0) && (
+                <tr>
+                  <td colSpan={6} style={{ textAlign: 'center', color: 'var(--color-text-muted)', padding: 'var(--spacing-md)' }}>
+                    No models loaded.
+                  </td>
+                </tr>
+              )}
+            </tbody>
+          </table>
+        </div>
+      </div>
+    </>
+  )
+}
+
+// decisionActiveSet rebuilds the Set of active labels from a
+// DecisionRecord's comma-joined `label` column. Used by both the
+// collapsed-row score suffix and the expanded-row bar rendering.
+function decisionActiveSet(d) {
+  return new Set((d?.label || '').split(',').filter(Boolean))
+}
+
+// formatDecisionScoreSuffix renders the top active label's score
+// next to the label cell so operators can spot uncertain calls at a
+// glance without expanding the row. Empty when the decision came from
+// the cache or fallback — both cases lack per-label scores.
+function formatDecisionScoreSuffix(d, activeSet) {
+  if (!d?.label_scores?.length) return ''
+  const top = d.label_scores
+    .filter(ls => activeSet.has(ls.label))
+    .sort((a, b) => b.score - a.score)[0]
+  if (!top) return ''
+  return ` ${(top.score * 100).toFixed(0)}%`
+}
+
+// LabelBar is one row in the expanded decision view — a horizontal
+// score bar with a vertical marker at the activation threshold so
+// operators can see how close inactive labels got to firing.
+function LabelBar({ label, score, threshold, active }) {
+  const scorePct = Math.max(0, Math.min(100, score * 100))
+  const thresholdPct = Math.max(0, Math.min(100, (threshold || 0) * 100))
+  return (
+    <div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-sm)', fontFamily: 'var(--font-mono)', fontSize: '0.75rem' }}>
+      <div style={{
+        width: 160,
+        color: active ? 'var(--color-text)' : 'var(--color-text-muted)',
+        fontWeight: active ? 600 : 400,
+        overflow: 'hidden',
+        textOverflow: 'ellipsis',
+        whiteSpace: 'nowrap',
+      }} title={label}>
+        {label}
+      </div>
+      <div style={{ flex: 1, position: 'relative', height: 14, background: 'var(--color-border, #e5e7eb)', borderRadius: 2 }}>
+        <div style={{
+          width: `${scorePct}%`,
+          height: '100%',
+          background: active ? 'var(--color-success, #2da44e)' : 'var(--color-text-muted)',
+          opacity: active ? 1 : 0.45,
+          borderRadius: 2,
+        }} />
+        {threshold > 0 && (
+          <div
+            title={`Activation threshold ${thresholdPct.toFixed(0)}%`}
+            style={{
+              position: 'absolute',
+              top: -3,
+              left: `${thresholdPct}%`,
+              width: 2,
+              height: 20,
+              background: 'var(--color-warning, #d97706)',
+              transform: 'translateX(-1px)',
+              pointerEvents: 'none',
+            }}
+          />
+        )}
+      </div>
+      <div style={{ width: 56, textAlign: 'right', color: 'var(--color-text-muted)' }}>
+        {scorePct.toFixed(1)}%
+      </div>
+    </div>
+  )
+}
+
+// DecisionDetail renders the per-label bar breakdown for one decision.
+// Empty-state messaging covers cached and fallback rows where the
+// classifier never produced per-label scores.
+function DecisionDetail({ d }) {
+  if (!d.label_scores?.length) {
+    return (
+      <div style={{ color: 'var(--color-text-muted)', fontSize: '0.75rem', fontStyle: 'italic' }}>
+        {d.cached
+          ? 'Cached decision — per-label scores not recorded (the cache stores only the resulting label set).'
+          : 'No per-label scores recorded for this decision (likely a fallback row).'}
+      </div>
+    )
+  }
+  const threshold = d.activation_threshold || 0
+  const active = decisionActiveSet(d)
+  return (
+    <div style={{ display: 'flex', flexDirection: 'column', gap: 6, maxWidth: 720 }}>
+      <div style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)' }}>
+        Activation threshold:&nbsp;
+        <span style={{ color: 'var(--color-warning, #d97706)', fontWeight: 600 }}>
+          {(threshold * 100).toFixed(0)}%
+        </span>
+        &nbsp;(orange marker on each bar)
+      </div>
+      {d.label_scores.map(ls => (
+        <LabelBar
+          key={ls.label}
+          label={ls.label}
+          score={ls.score}
+          threshold={threshold}
+          active={active.has(ls.label)}
+        />
+      ))}
+    </div>
+  )
+}
+
+function RoutingTab({ status, decisions }) {
+  const navigate = useNavigate()
+  const router = status?.router || { configured: false }
+  const [expanded, setExpanded] = useState(() => new Set())
+
+  // Precompute per-row formatter strings once per decisions update.
+  // The score suffix is shown in the collapsed row so operators can
+  // scan top-label confidence without expanding everything.
+  const decisionRows = useMemo(() => (decisions || []).map(d => {
+    const active = decisionActiveSet(d)
+    return {
+      ...d,
+      _scoreSuffix: formatDecisionScoreSuffix(d, active),
+    }
+  }), [decisions])
+
+  const toggleExpanded = useCallback(id => {
+    setExpanded(prev => {
+      const next = new Set(prev)
+      if (next.has(id)) next.delete(id)
+      else next.add(id)
+      return next
+    })
+  }, [])
+
+  if (!router.configured || !router.models || router.models.length === 0) {
+    return (
+      <div className="empty-state">
+        <div className="empty-state-icon"><i className="fas fa-route" /></div>
+        <h2 className="empty-state-title">No routers configured</h2>
+        <p className="empty-state-text">
+          {router.note || 'Add a `router:` block to a model YAML to enable intelligent routing. The classifier picks one of the listed candidates per request and the standard model-resolution path runs against the chosen target.'}
+        </p>
+        <button
+          className="btn btn-primary"
+          style={{ marginTop: 'var(--spacing-md)' }}
+          onClick={() => navigate('/app/model-editor?template=router')}
+        >
+          <i className="fas fa-plus" /> Create routing model
+        </button>
+      </div>
+    )
+  }
+
+  return (
+    <>
+      {/* Configured router models */}
+      <div className="card" style={{ padding: 'var(--spacing-md)', marginBottom: 'var(--spacing-md)' }}>
+        <div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', marginBottom: 'var(--spacing-sm)' }}>
+          <span style={{ fontSize: '0.875rem', fontWeight: 600 }}>Active routers</span>
+          <div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-sm)' }}>
+            <span style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)' }}>
+              Edit the router model YAML to change candidates or rules.
+            </span>
+            <button
+              className="btn btn-secondary btn-sm"
+              onClick={() => navigate('/app/model-editor?template=router')}
+              title="Open the model editor with the Routing Model template pre-selected"
+            >
+              <i className="fas fa-plus" /> Add routing model
+            </button>
+          </div>
+        </div>
+        <div className="table-container">
+          <table className="table">
+            <thead>
+              <tr>
+                <th style={{ width: 160 }}>Model</th>
+                <th style={{ width: 110 }}>Classifier</th>
+                <th>Candidates</th>
+                <th style={{ width: 200 }}>Embedding cache</th>
+                <th style={{ width: 140 }}>Fallback</th>
+              </tr>
+            </thead>
+            <tbody>
+              {router.models.map(m => (
+                <tr key={m.name}>
+                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem', fontWeight: 600 }}>{m.name}</td>
+                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem' }}>{m.classifier}</td>
+                  <td style={{ fontSize: '0.75rem' }}>
+                    {(m.candidates || []).map((c, i) => (
+                      <div key={i} style={{ display: 'flex', alignItems: 'center', gap: 6, fontFamily: 'var(--font-mono)' }}>
+                        <span style={{ minWidth: 100, color: 'var(--color-primary)' }}>{(c.labels || []).join(', ') || '—'}</span>
+                        <span style={{ color: 'var(--color-text-muted)' }}>→</span>
+                        <span>{c.model}</span>
+                      </div>
+                    ))}
+                  </td>
+                  <td style={{ fontSize: '0.75rem' }}>
+                    <RouterCacheCell cache={m.embedding_cache} />
+                  </td>
+                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
+                    {m.fallback || '—'}
+                  </td>
+                </tr>
+              ))}
+            </tbody>
+          </table>
+        </div>
+      </div>
+
+      {/* Recent decisions */}
+      <div className="card" style={{ padding: 'var(--spacing-md)' }}>
+        <div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', marginBottom: 'var(--spacing-sm)' }}>
+          <span style={{ fontSize: '0.875rem', fontWeight: 600 }}>Recent decisions</span>
+          <span style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)' }}>
+            Newest first, capped at 100.
+          </span>
+        </div>
+        {(!decisions || decisions.length === 0) ? (
+          <div style={{ padding: 'var(--spacing-md)', textAlign: 'center', color: 'var(--color-text-muted)', fontSize: '0.8125rem' }}>
+            No routing decisions yet. Send a request to a router model to populate this log.
+          </div>
+        ) : (
+          <div className="table-container">
+            <table className="table">
+              <thead>
+                <tr>
+                  <th style={{ width: 170 }}>Time</th>
+                  <th style={{ width: 130 }}>Router</th>
+                  <th style={{ width: 80 }}>Label</th>
+                  <th style={{ width: 130 }}>Served</th>
+                  <th style={{ width: 90 }}>Latency</th>
+                  <th>Correlation</th>
+                </tr>
+              </thead>
+              <tbody>
+                {decisionRows.map(d => {
+                  const isExpanded = expanded.has(d.id)
+                  return (
+                    <Fragment key={d.id}>
+                      <tr
+                        onClick={() => toggleExpanded(d.id)}
+                        style={{ cursor: 'pointer' }}
+                        title={isExpanded ? 'Click to collapse' : 'Click to see per-label score breakdown'}
+                      >
+                        <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
+                          <span style={{ display: 'inline-block', width: 12, color: 'var(--color-text-muted)' }}>
+                            {isExpanded ? '▼' : '▶'}
+                          </span>
+                          {d.created_at}
+                        </td>
+                        <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem' }}>{d.router_model}</td>
+                        <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem', fontWeight: 600 }}>
+                          {d.label}
+                          {d._scoreSuffix}
+                        </td>
+                        <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem' }}>{d.served_model}</td>
+                        <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>{d.latency_ms}ms</td>
+                        <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.6875rem', color: 'var(--color-text-muted)' }}>
+                          {d.correlation_id || '—'}
+                        </td>
+                      </tr>
+                      {isExpanded && (
+                        <tr>
+                          <td colSpan={6} style={{ background: 'var(--color-bg-muted, #f6f8fa)', padding: 'var(--spacing-md)' }}>
+                            <DecisionDetail d={d} />
+                          </td>
+                        </tr>
+                      )}
+                    </Fragment>
+                  )
+                })}
+              </tbody>
+            </table>
+          </div>
+        )}
+      </div>
+    </>
+  )
+}
+
+function ProxyTab({ status, addToast, onChanged }) {
+  const navigate = useNavigate()
+  const mitm = status?.mitm
+  const serverListen = mitm?.configured_addr || ''
+
+  const [listen, setListen] = useState(serverListen)
+  const [saving, setSaving] = useState(false)
+
+  const dirty = listen !== serverListen
+
+  // Refresh local state from the server only when the user has no
+  // pending edits to clobber.
+  useEffect(() => {
+    if (dirty) return
+    setListen(serverListen)
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [serverListen])
+
+  const save = async () => {
+    setSaving(true)
+    try {
+      const body = await settingsApi.save({ mitm_listen: listen })
+      if (body && body.success === false) {
+        throw new Error(body.error || 'unknown error')
+      }
+      addToast('MITM proxy settings updated', 'success')
+      onChanged?.()
+    } catch (err) {
+      addToast(`Failed to save: ${err.message}`, 'error')
+    } finally {
+      setSaving(false)
+    }
+  }
+
+  if (!mitm) {
+    return (
+      <div className="empty-state">
+        <div className="empty-state-icon"><i className="fas fa-shield" /></div>
+        <h2 className="empty-state-title">MITM proxy status unavailable</h2>
+        <p className="empty-state-text">The status endpoint did not return a mitm section.</p>
+      </div>
+    )
+  }
+
+  const conflicts = mitm.host_conflicts || {}
+  const owners = mitm.host_owners || {}
+  const conflictHosts = Object.keys(conflicts)
+  const ownerEntries = Object.entries(owners)
+  const mitmModels = mitm.models || []
+
+  return (
+    <div style={{ display: 'flex', flexDirection: 'column', gap: 'var(--spacing-md)' }}>
+      {conflictHosts.length > 0 && (
+        <div className="card" style={{ padding: 'var(--spacing-md)', borderLeft: '3px solid var(--color-error)' }}>
+          <div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-sm)', marginBottom: 'var(--spacing-xs)' }}>
+            <i className="fas fa-triangle-exclamation" style={{ color: 'var(--color-error)' }} />
+            <span style={{ fontWeight: 600 }}>MITM listener disabled — duplicate host claims</span>
+          </div>
+          <p style={{ fontSize: '0.8125rem', color: 'var(--color-text-secondary)', margin: 0 }}>
+            Each MITM intercept host must be owned by exactly one model config. Resolve by editing the conflicting model YAMLs.
+          </p>
+          <ul style={{ margin: 'var(--spacing-xs) 0 0', paddingLeft: 20, fontSize: '0.8125rem' }}>
+            {conflictHosts.map(h => (
+              <li key={h}>
+                <code style={{ fontFamily: 'var(--font-mono)' }}>{h}</code>
+                {' claimed by: '}
+                {(conflicts[h] || []).map(name => (
+                  <Link key={name} to={`/app/model-editor/${encodeURIComponent(name)}`} style={{ marginRight: 6, fontFamily: 'var(--font-mono)' }}>
+                    {name}
+                  </Link>
+                ))}
+              </li>
+            ))}
+          </ul>
+        </div>
+      )}
+
+      <div className="card" style={{ padding: 'var(--spacing-lg)' }}>
+        <div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-md)', marginBottom: 'var(--spacing-md)' }}>
+          <h2 style={{ fontSize: '1rem', fontWeight: 600, margin: 0 }}>State</h2>
+          {enabledBadge(mitm.running)}
+          {mitm.running && (
+            <span style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem', color: 'var(--color-text-secondary)' }}>
+              listening on {mitm.listen_addr}
+            </span>
+          )}
+        </div>
+        <p style={{ fontSize: '0.8125rem', color: 'var(--color-text-secondary)', marginBottom: 'var(--spacing-sm)' }}>
+          The MITM proxy terminates TLS for allowlisted hosts so PII redaction
+          can run on traffic from clients that authenticate via OAuth /
+          subscription (Claude Code, Codex CLI). Non-allowlisted hosts get a
+          plain CONNECT tunnel — no inspection, no CA-trust required.
+        </p>
+        {ownerEntries.length > 0 ? (
+          <div style={{ marginBottom: 'var(--spacing-sm)', fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
+            <div style={{ marginBottom: 4 }}>Hosts claimed by model configs (PII settings flow from the owning config):</div>
+            <ul style={{ margin: 0, paddingLeft: 20, fontFamily: 'var(--font-mono)' }}>
+              {ownerEntries.map(([host, name]) => (
+                <li key={host}>
+                  {host} → <Link to={`/app/model-editor/${encodeURIComponent(name)}`}>{name}</Link>
+                </li>
+              ))}
+            </ul>
+          </div>
+        ) : (
+          <div style={{ marginBottom: 'var(--spacing-sm)', fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
+            No model config declares an MITM intercept host. Without one, every CONNECT tunnels through unmodified. Create one from the Add Model page using the MITM Intercept template.
+          </div>
+        )}
+        {mitm.ca_available ? (
+          <a
+            className="btn btn-secondary btn-sm"
+            href={apiUrl(mitm.ca_cert_url)}
+            download="localai-mitm-ca.crt"
+          >
+            <i className="fas fa-download" /> Download CA cert
+          </a>
+        ) : (
+          <span style={{ fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
+            CA not generated yet — start the listener to generate it.
+          </span>
+        )}
+      </div>
+
+      <div className="card" style={{ padding: 'var(--spacing-lg)' }}>
+        <div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', marginBottom: 'var(--spacing-sm)' }}>
+          <h2 style={{ fontSize: '1rem', fontWeight: 600, margin: 0 }}>MITM Models</h2>
+          <button
+            className="btn btn-secondary btn-sm"
+            onClick={() => navigate('/app/model-editor?template=mitm')}
+            title="Open the model editor with the MITM Intercept template pre-selected"
+          >
+            <i className="fas fa-plus" /> Add MITM model
+          </button>
+        </div>
+        {mitmModels.length === 0 ? (
+          <div style={{ fontSize: '0.8125rem', color: 'var(--color-text-muted)' }}>
+            No model config declares <code>mitm.hosts</code>. Use the Add MITM model button above — the template defaults to <code>api.anthropic.com</code> with PII filtering on.
+          </div>
+        ) : (
+          <table className="table">
+            <thead>
+              <tr>
+                <th>Model</th>
+                <th>Hosts</th>
+                <th style={{ width: 80 }}>PII</th>
+                <th style={{ width: 80 }}>Edit</th>
+              </tr>
+            </thead>
+            <tbody>
+              {mitmModels.map(m => (
+                <tr key={m.name}>
+                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem', fontWeight: 600 }}>{m.name}</td>
+                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem' }}>
+                    {(m.hosts || []).join(', ')}
+                  </td>
+                  <td>{enabledBadge(m.pii_enabled)}</td>
+                  <td>
+                    <Link
+                      to={`/app/model-editor/${encodeURIComponent(m.name)}`}
+                      className="btn btn-secondary btn-sm"
+                      style={{ fontSize: '0.6875rem', padding: '2px 8px' }}
+                    >
+                      <i className="fas fa-pen-to-square" /> Edit
+                    </Link>
+                  </td>
+                </tr>
+              ))}
+            </tbody>
+          </table>
+        )}
+      </div>
+
+      <div className="card" style={{ padding: 'var(--spacing-lg)' }}>
+        <h2 style={{ fontSize: '1rem', fontWeight: 600, marginTop: 0, marginBottom: 'var(--spacing-md)' }}>Configuration</h2>
+
+        <label style={{ display: 'block', marginBottom: 'var(--spacing-md)' }}>
+          <div style={{ fontSize: '0.875rem', fontWeight: 500, marginBottom: 'var(--spacing-xs)' }}>Listen address</div>
+          <input
+            type="text"
+            value={listen}
+            onChange={e => setListen(e.target.value)}
+            placeholder=":8443  (leave empty to disable)"
+            style={{ width: '100%', padding: '8px 12px', fontFamily: 'var(--font-mono)', fontSize: '0.875rem', background: 'var(--color-bg-tertiary)', border: '1px solid var(--color-border-default)', borderRadius: 'var(--radius-sm)', color: 'var(--color-text-primary)' }}
+          />
+          <div style={{ marginTop: 'var(--spacing-xs)', fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
+            Bind address for the proxy listener. Empty disables it. Bind to <code>127.0.0.1:port</code> unless the listener is reachable only from clients you control — there is no auth on the CONNECT port. Clients connect to the proxy over plain HTTP (use <code>http://</code>, even for the <code>HTTPS_PROXY</code> env var); the proxy terminates TLS for allowlisted hosts inside the CONNECT tunnel.
+          </div>
+        </label>
+
+        <div style={{ marginBottom: 'var(--spacing-md)', fontSize: '0.8125rem', color: 'var(--color-text-secondary)' }}>
+          Intercept hosts are declared per-model in the model YAML's
+          {' '}<code style={{ fontFamily: 'var(--font-mono)' }}>mitm.hosts:</code>{' '}
+          block. Each host is owned by exactly one model config; PII filtering and
+          pattern overrides flow from the owning config when the host is intercepted.
+        </div>
+
+        <div style={{ display: 'flex', gap: 'var(--spacing-sm)' }}>
+          <button
+            className="btn btn-primary btn-sm"
+            onClick={save}
+            disabled={!dirty || saving}
+          >
+            <i className={`fas ${saving ? 'fa-spinner fa-spin' : 'fa-save'}`} /> {saving ? 'Saving…' : 'Apply'}
+          </button>
+          {dirty && (
+            <button
+              className="btn btn-ghost btn-sm"
+              onClick={() => setListen(mitm.configured_addr || '')}
+              disabled={saving}
+            >
+              Discard changes
+            </button>
+          )}
+        </div>
+      </div>
+
+      <div className="card" style={{ padding: 'var(--spacing-md)', background: 'var(--color-bg-secondary)' }}>
+        <h2 style={{ fontSize: '0.875rem', fontWeight: 600, marginTop: 0, marginBottom: 'var(--spacing-sm)' }}>Client setup</h2>
+        <ol style={{ margin: 0, paddingLeft: 20, fontSize: '0.8125rem', color: 'var(--color-text-secondary)', lineHeight: 1.7 }}>
+          <li>Download the CA cert (button above).</li>
+          <li>Trust it on the client. For Node-based CLIs (Claude Code, Codex): <code style={{ fontFamily: 'var(--font-mono)' }}>export NODE_EXTRA_CA_CERTS=$(pwd)/localai-mitm-ca.crt</code></li>
+          <li>Point the client at the proxy: <code style={{ fontFamily: 'var(--font-mono)' }}>export HTTPS_PROXY=http://&lt;host&gt;:&lt;port&gt;</code> (yes, <code>http://</code> — clients speak plain HTTP to the proxy, which then terminates TLS for allowlisted hosts on the inner connection).</li>
+        </ol>
+      </div>
+    </div>
+  )
+}
+
+const EVENT_KINDS = [
+  { id: '', label: 'All' },
+  { id: 'pii', label: 'PII' },
+  { id: 'proxy_connect', label: 'Proxy connect' },
+  { id: 'proxy_traffic', label: 'Proxy traffic' },
+  { id: 'admission', label: 'Admission' },
+]
+
+function eventKind(e) {
+  return e.kind || 'pii'
+}
+
+function eventSubject(e) {
+  switch (eventKind(e)) {
+    case 'proxy_connect':
+    case 'proxy_traffic':
+    case 'admission':
+      return e.host || '—'
+    default:
+      return e.pattern_id || '—'
+  }
+}
+
+function eventDetails(e) {
+  switch (eventKind(e)) {
+    case 'proxy_connect':
+      return e.intercepted ? 'intercepted (TLS terminated)' : 'tunneled (passthrough)'
+    case 'proxy_traffic': {
+      const status = e.status_code ? `HTTP ${e.status_code}` : 'no upstream'
+      const sent = formatBytes(e.bytes_sent)
+      const recv = formatBytes(e.bytes_received)
+      const dur = e.duration_ms != null ? `${e.duration_ms}ms` : ''
+      return `${status} · ↑${sent} ↓${recv} · ${dur}`
+    }
+    case 'admission': {
+      const retry = e.duration_ms != null ? `retry-after ${Math.round(e.duration_ms / 1000)}s` : ''
+      return `HTTP 503 rejected · ${retry}`
+    }
+    default: {
+      const len = e.length != null ? `len ${e.length}` : ''
+      const hash = e.hash_prefix ? `hash ${e.hash_prefix}` : ''
+      return [len, hash].filter(Boolean).join(' · ') || '—'
+    }
+  }
+}
+
+function formatBytes(n) {
+  if (!n) return '0B'
+  if (n < 1024) return `${n}B`
+  if (n < 1024 * 1024) return `${(n / 1024).toFixed(1)}KB`
+  return `${(n / (1024 * 1024)).toFixed(1)}MB`
+}
+
+function kindBadge(kind) {
+  const colors = {
+    pii: 'var(--color-warning)',
+    proxy_connect: 'var(--color-primary)',
+    proxy_traffic: 'var(--color-text-muted)',
+    admission: 'var(--color-error)',
+  }
+  return (
+    <span style={{
+      display: 'inline-block',
+      padding: '2px 8px',
+      fontSize: '0.6875rem',
+      fontWeight: 600,
+      borderRadius: 'var(--radius-sm)',
+      background: colors[kind] || 'var(--color-bg-tertiary)',
+      color: 'white',
+      fontFamily: 'var(--font-mono)',
+      textTransform: 'uppercase',
+      whiteSpace: 'nowrap',
+    }}>
+      {kind.replace(/_/g, ' ')}
+    </span>
+  )
+}
+
+function EventsTab({ events }) {
+  const [kindFilter, setKindFilter] = useState('')
+  const filtered = kindFilter ? events.filter(e => eventKind(e) === kindFilter) : events
+
+  return (
+    <div className="card" style={{ padding: 'var(--spacing-md)' }}>
+      <div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', marginBottom: 'var(--spacing-sm)', gap: 'var(--spacing-sm)', flexWrap: 'wrap' }}>
+        <div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-xs)' }}>
+          <span style={{ fontSize: '0.875rem', fontWeight: 600 }}>Recent events</span>
+          <span style={{ fontSize: '0.6875rem', color: 'var(--color-text-muted)' }}>
+            shared by PII filter and MITM proxy · newest first · capped at 100
+          </span>
+        </div>
+        <div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-xs)' }}>
+          {EVENT_KINDS.map(k => (
+            <button
+              key={k.id || 'all'}
+              className={`btn btn-sm ${kindFilter === k.id ? 'btn-primary' : 'btn-secondary'}`}
+              onClick={() => setKindFilter(k.id)}
+            >
+              {k.label}
+            </button>
+          ))}
+        </div>
+      </div>
+      {filtered.length === 0 ? (
+        <div className="empty-state">
+          <div className="empty-state-icon"><i className="fas fa-list-ul" /></div>
+          <h2 className="empty-state-title">No events</h2>
+          <p className="empty-state-text">
+            Events appear here when the PII filter matches a pattern, when the MITM proxy decides whether
+            to intercept a hostname, or when an intercepted request finishes. Request bodies are never
+            stored — use the API and backend traces for that.
+          </p>
+        </div>
+      ) : (
+        <div className="table-container">
+          <table className="table">
+            <thead>
+              <tr>
+                <th style={{ width: 170 }}>Time</th>
+                <th style={{ width: 130 }}>Kind</th>
+                <th style={{ width: 200 }}>Subject</th>
+                <th>Details</th>
+                <th style={{ width: 110 }}>Action</th>
+                <th>Correlation</th>
+              </tr>
+            </thead>
+            <tbody>
+              {filtered.map(e => (
+                <tr key={e.id}>
+                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
+                    {e.created_at}
+                  </td>
+                  <td>{kindBadge(eventKind(e))}</td>
+                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.8125rem', fontWeight: 600 }}>
+                    {eventSubject(e)}
+                  </td>
+                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.75rem', color: 'var(--color-text-muted)' }}>
+                    {eventDetails(e)}
+                  </td>
+                  <td>{e.action ? actionBadge(e.action) : '—'}</td>
+                  <td style={{ fontFamily: 'var(--font-mono)', fontSize: '0.6875rem', color: 'var(--color-text-muted)' }}>
+                    {e.correlation_id || '—'}
+                  </td>
+                </tr>
+              ))}
+            </tbody>
+          </table>
+        </div>
+      )}
+    </div>
+  )
+}
+
+// RouterCacheCell renders the L2 embedding-cache state for one router
+// model. Shows nothing for routers without an embedding_cache: block;
+// for configured caches, shows hit/miss/near-miss counters plus a
+// similarity histogram with a marker at the configured threshold so
+// admins can tell at a glance whether the threshold is well-placed.
+function RouterCacheCell({ cache }) {
+  if (!cache) {
+    return <span style={{ color: 'var(--color-text-muted)' }}>—</span>
+  }
+  const stats = cache.stats || {}
+  const hits = stats.hits || 0
+  const misses = stats.misses || 0
+  const nearMisses = stats.near_misses || 0
+  const lowConf = stats.low_confidence || 0
+  const totalLookups = hits + misses + nearMisses
+  const hitRate = totalLookups > 0 ? Math.round((hits / totalLookups) * 100) : null
+  const errors = (stats.embedder_errors || 0) + (stats.store_errors || 0)
+  const buckets = stats.similarity_buckets || []
+  const bucketMax = buckets.length ? Math.max(...buckets, 1) : 1
+  const threshold = cache.similarity_threshold || 0.80
+  const thresholdBucket = Math.max(0, Math.min(9, Math.floor(threshold * 10)))
+  return (
+    <div style={{ fontFamily: 'var(--font-mono)', fontSize: '0.6875rem', lineHeight: 1.3 }}>
+      <div style={{ fontWeight: 600 }}>{cache.embedding_model}</div>
+      <div style={{ color: 'var(--color-text-muted)' }}>
+        {totalLookups === 0 ? (
+          <span>no traffic yet</span>
+        ) : (
+          <>
+            <span style={{ color: hitRate >= 50 ? 'var(--color-success, #2da44e)' : 'var(--color-text-muted)' }}>
+              {hitRate}% hit
+            </span>
+            <span> · {hits}h/{nearMisses}n/{misses}m</span>
+            {lowConf > 0 && <span> · {lowConf} skipped</span>}
+            {errors > 0 && <span style={{ color: 'var(--color-warning, #d97706)' }}> · {errors} err</span>}
+          </>
+        )}
+      </div>
+      {buckets.length === 10 && buckets.some(v => v > 0) && (
+        <div title={`Cosine similarity histogram, threshold=${threshold}`}
+             style={{ display: 'flex', alignItems: 'flex-end', gap: 1, marginTop: 4, height: 18 }}>
+          {buckets.map((count, i) => {
+            const h = bucketMax > 0 ? Math.max(2, Math.round((count / bucketMax) * 18)) : 2
+            const inHitZone = i >= thresholdBucket
+            return (
+              <div
+                key={i}
+                title={`[${(i/10).toFixed(1)}, ${((i+1)/10).toFixed(1)}): ${count}`}
+                style={{
+                  width: 6,
+                  height: h,
+                  background: count === 0
+                    ? 'var(--color-border, #e5e7eb)'
+                    : inHitZone
+                      ? 'var(--color-success, #2da44e)'
+                      : 'var(--color-warning, #d97706)',
+                  opacity: count === 0 ? 0.3 : 1,
+                }}
+              />
+            )
+          })}
+          <div style={{ marginLeft: 4, fontSize: '0.625rem', color: 'var(--color-text-muted)' }}>
+            sim ≥ {threshold}
+          </div>
+        </div>
+      )}
+    </div>
+  )
+}
diff --git a/core/http/react-ui/src/pages/ModelEditor.jsx b/core/http/react-ui/src/pages/ModelEditor.jsx
index 40446b2bc..9cb032f1b 100644
--- a/core/http/react-ui/src/pages/ModelEditor.jsx
+++ b/core/http/react-ui/src/pages/ModelEditor.jsx
@@ -9,6 +9,7 @@ import LoadingSpinner from '../components/LoadingSpinner'
 import CodeEditor from '../components/CodeEditor'
 import FieldBrowser from '../components/FieldBrowser'
 import ConfigFieldRenderer from '../components/ConfigFieldRenderer'
+import { FormContextProvider } from '../contexts/FormContext'
 import TemplateSelector from '../components/TemplateSelector'
 import MODEL_TEMPLATES from '../utils/modelTemplates'
 
@@ -386,6 +387,7 @@ export default function ModelEditor() {
   if (metaError) return <div className="page page--medium"><div className="empty-state"><p className="empty-state-text">Failed to load config metadata: {metaError}</p></div></div>
 
   return (
+    <FormContextProvider formData={values}>
     <div className="page page--medium" style={{ padding: 0 }}>
       {/* Header */}
       <div style={{
@@ -642,5 +644,6 @@ export default function ModelEditor() {
         </>
       )}
     </div>
+    </FormContextProvider>
   )
 }
diff --git a/core/http/react-ui/src/pages/Traces.jsx b/core/http/react-ui/src/pages/Traces.jsx
index b42c13a63..7bceaa299 100644
--- a/core/http/react-ui/src/pages/Traces.jsx
+++ b/core/http/react-ui/src/pages/Traces.jsx
@@ -238,6 +238,23 @@ function BackendTraceDetail({ trace }) {
       {/* Audio snippet */}
       {trace.data && <AudioSnippet data={trace.data} />}
 
+      {/* Request body: cloud-proxy passthrough records the full
+          payload here (capped to ~1MB upstream); pretty-print when
+          it parses as JSON, otherwise show the raw text. */}
+      {trace.body && (
+        <div style={{ marginBottom: 'var(--spacing-md)' }}>
+          <h4 style={{ fontSize: '0.8125rem', fontWeight: 600, marginBottom: 'var(--spacing-xs)' }}>Request Body</h4>
+          <pre style={{
+            background: 'var(--color-bg-primary)', border: '1px solid var(--color-border)',
+            borderRadius: 'var(--radius-sm)', padding: 'var(--spacing-sm)',
+            fontSize: '0.75rem', fontFamily: 'var(--font-mono)', whiteSpace: 'pre-wrap', wordBreak: 'break-word',
+            overflow: 'auto', maxHeight: '50vh', margin: 0,
+          }}>
+            {formatLargeValue(trace.body)}
+          </pre>
+        </div>
+      )}
+
       {/* Data fields */}
       {trace.data && Object.keys(trace.data).length > 0 && <DataFields data={trace.data} />}
     </div>
diff --git a/core/http/react-ui/src/pages/Usage.jsx b/core/http/react-ui/src/pages/Usage.jsx
index 468c6dd07..5d505b6ce 100644
--- a/core/http/react-ui/src/pages/Usage.jsx
+++ b/core/http/react-ui/src/pages/Usage.jsx
@@ -630,7 +630,7 @@ function ModelDistChart({ rows }) {
 
 export default function Usage() {
   const { addToast } = useOutletContext()
-  const { isAdmin, authEnabled } = useAuth()
+  const { isAdmin, authEnabled, loading: authLoading } = useAuth()
   const { t } = useTranslation('admin')
   const [period, setPeriod] = useState('month')
   const [loading, setLoading] = useState(true)
@@ -645,8 +645,13 @@ export default function Usage() {
   const fetchUsage = useCallback(async () => {
     setLoading(true)
     try {
-      const usagePromise = fetch(apiUrl(`/api/auth/usage?period=${period}`))
-      const quotaPromise = fetch(apiUrl('/api/auth/quota'))
+      // /api/usage works in no-auth single-user mode (returns the synthetic
+      // local user's usage). /api/auth/usage is the legacy auth-required
+      // path; we keep using it when auth is on so /api/auth/quota and
+      // friends remain consistent.
+      const userUsageURL = authEnabled ? '/api/auth/usage' : '/api/usage'
+      const usagePromise = fetch(apiUrl(`${userUsageURL}?period=${period}`))
+      const quotaPromise = authEnabled ? fetch(apiUrl('/api/auth/quota')) : Promise.resolve(null)
 
       const [res, quotaRes] = await Promise.all([usagePromise, quotaPromise])
 
@@ -655,13 +660,18 @@ export default function Usage() {
       setUsage(data.usage || [])
       setTotals(data.totals || {})
 
-      if (quotaRes.ok) {
+      if (quotaRes && quotaRes.ok) {
         const quotaData = await quotaRes.json()
         setQuotas(quotaData.quotas || [])
       }
 
       if (isAdmin) {
-        const adminRes = await fetch(apiUrl(`/api/auth/admin/usage?period=${period}`))
+        // /api/usage/all serves the cluster-wide view in both modes.
+        // The synthetic local user has Role: admin, so single-user mode
+        // gets the admin-style cross-user table (which collapses to one
+        // row, but keeps the UI shape consistent).
+        const adminURL = authEnabled ? '/api/auth/admin/usage' : '/api/usage/all'
+        const adminRes = await fetch(apiUrl(`${adminURL}?period=${period}`))
         if (adminRes.ok) {
           const adminData = await adminRes.json()
           setAdminUsage(adminData.usage || [])
@@ -673,24 +683,12 @@ export default function Usage() {
     } finally {
       setLoading(false)
     }
-  }, [period, isAdmin, addToast])
+  }, [period, isAdmin, authEnabled, addToast])
 
   useEffect(() => {
-    if (authEnabled) fetchUsage()
-    else setLoading(false)
-  }, [fetchUsage, authEnabled])
-
-  if (!authEnabled) {
-    return (
-      <div className="page page--wide">
-        <div className="empty-state">
-          <div className="empty-state-icon"><i className="fas fa-chart-bar" /></div>
-          <h2 className="empty-state-title">Usage tracking unavailable</h2>
-          <p className="empty-state-text">Authentication must be enabled to track API usage.</p>
-        </div>
-      </div>
-    )
-  }
+    if (authLoading) return
+    fetchUsage()
+  }, [fetchUsage, authLoading])
 
   const modelRows = aggregateByModel(isAdmin ? adminUsage : usage)
   const userRows = isAdmin ? aggregateByUser(adminUsage) : []
diff --git a/core/http/react-ui/src/router.jsx b/core/http/react-ui/src/router.jsx
index 2e07fea5f..ae662a8be 100644
--- a/core/http/react-ui/src/router.jsx
+++ b/core/http/react-ui/src/router.jsx
@@ -42,9 +42,11 @@ import NodeBackendLogs from './pages/NodeBackendLogs'
 import NotFound from './pages/NotFound'
 import Usage from './pages/Usage'
 import Users from './pages/Users'
+import Middleware from './pages/Middleware'
 import Account from './pages/Account'
 import RequireAdmin from './components/RequireAdmin'
 import RequireAuth from './components/RequireAuth'
+import RequireAuthEnabled from './components/RequireAuthEnabled'
 import RequireFeature from './components/RequireFeature'
 
 function BrowseRedirect() {
@@ -84,7 +86,8 @@ const appChildren = [
   { path: 'voice/:model', element: <Feature feature="voice_recognition"><VoiceRecognition /></Feature> },
   { path: 'usage', element: <Usage /> },
   { path: 'account', element: <Account /> },
-  { path: 'users', element: <Admin><Users /></Admin> },
+  { path: 'users', element: <RequireAuthEnabled><Admin><Users /></Admin></RequireAuthEnabled> },
+  { path: 'middleware', element: <Admin><Middleware /></Admin> },
   { path: 'manage', element: <Admin><Manage /></Admin> },
   { path: 'backends', element: <Admin><Backends /></Admin> },
   { path: 'settings', element: <Admin><Settings /></Admin> },
diff --git a/core/http/react-ui/src/utils/capabilities.js b/core/http/react-ui/src/utils/capabilities.js
index 5d2956e77..e7c81695b 100644
--- a/core/http/react-ui/src/utils/capabilities.js
+++ b/core/http/react-ui/src/utils/capabilities.js
@@ -21,3 +21,4 @@ export const CAP_FACE_RECOGNITION = 'FLAG_FACE_RECOGNITION'
 export const CAP_SPEAKER_RECOGNITION = 'FLAG_SPEAKER_RECOGNITION'
 export const CAP_AUDIO_TRANSFORM = 'FLAG_AUDIO_TRANSFORM'
 export const CAP_REALTIME_AUDIO = 'FLAG_REALTIME_AUDIO'
+export const CAP_SCORE = 'FLAG_SCORE'
diff --git a/core/http/react-ui/src/utils/modelTemplates.js b/core/http/react-ui/src/utils/modelTemplates.js
index 576d66c5c..a733127a4 100644
--- a/core/http/react-ui/src/utils/modelTemplates.js
+++ b/core/http/react-ui/src/utils/modelTemplates.js
@@ -74,6 +74,96 @@ const MODEL_TEMPLATES = [
       'embeddings': true,
     },
   },
+  {
+    id: 'cloud-proxy-openai',
+    label: 'OpenAI Cloud Proxy',
+    icon: 'fa-cloud',
+    description: 'Forward chat completions to OpenAI or any OpenAI-compatible provider; PII redaction runs in flight',
+    // known_usecases is pre-seeded with chat so the proxy model
+    // surfaces in places that filter by capability — model pickers
+    // for chat, router fallback dropdowns, etc. Backends without an
+    // explicit usecase list are filtered out of those selectors.
+    fields: {
+      'name': '',
+      'backend': 'cloud-proxy',
+      'known_usecases': ['chat'],
+      'proxy.mode': 'passthrough',
+      'proxy.provider': 'openai',
+      'proxy.upstream_url': 'https://api.openai.com/v1/chat/completions',
+      'proxy.api_key_env': 'OPENAI_API_KEY',
+      'proxy.upstream_model': '',
+      'proxy.request_timeout_seconds': 120,
+      'pii.enabled': true,
+    },
+  },
+  {
+    id: 'cloud-proxy-anthropic',
+    label: 'Anthropic Cloud Proxy',
+    icon: 'fa-cloud',
+    description: 'Forward chat completions to Anthropic via translate mode (OpenAI ↔ Messages); tool_use blocks and usage tokens survive the round-trip. PII redaction runs in flight.',
+    fields: {
+      'name': '',
+      'backend': 'cloud-proxy',
+      'known_usecases': ['chat'],
+      // translate mode targets Anthropic's native /v1/messages and
+      // converts request/response between OpenAI Chat Completions and
+      // Anthropic Messages so the LocalAI chat UI keeps speaking
+      // OpenAI. passthrough would only work against Anthropic's
+      // /v1/chat/completions OpenAI-compat endpoint and loses
+      // tool_use semantics.
+      'proxy.mode': 'translate',
+      'proxy.provider': 'anthropic',
+      'proxy.upstream_url': 'https://api.anthropic.com/v1/messages',
+      'proxy.api_key_env': 'ANTHROPIC_API_KEY',
+      'proxy.upstream_model': '',
+      'proxy.request_timeout_seconds': 300,
+      'pii.enabled': true,
+    },
+  },
+  {
+    id: 'router',
+    label: 'Routing Model',
+    icon: 'fa-route',
+    description: 'Score-classifier router with three example policies and two candidates. Fill in the classifier_model (Arch-Router-1.5B recommended), the per-candidate downstream models, and the fallback. The L2 embedding cache is opt-in via the Routing section.',
+    fields: {
+      'name': 'smart-router',
+      'router.classifier': 'score',
+      'router.classifier_model': '',
+      'router.fallback': '',
+      'router.activation_threshold': 0.40,
+      'router.policies': [
+        { label: 'code-generation', description: 'writing, debugging, reading, or explaining code in any programming language' },
+        { label: 'casual-chat', description: 'small talk, greetings, jokes, or general conversation with no specific task' },
+        { label: 'math-reasoning', description: 'arithmetic, equations, percentage calculations, or step-by-step word problems' },
+      ],
+      'router.candidates': [
+        { model: '', labels: ['casual-chat'] },
+        { model: '', labels: ['code-generation', 'casual-chat', 'math-reasoning'] },
+      ],
+    },
+  },
+  {
+    id: 'mitm',
+    label: 'MITM Intercept',
+    icon: 'fa-shield-halved',
+    description: 'Bind a hostname to this config for the cloudproxy MITM listener. PII filtering and pattern overrides flow from this config when the host is intercepted.',
+    // The mitm- name prefix is a convention, not a contract — the
+    // dispatcher looks up by host, not name. Prefixing keeps the
+    // config out of the way of callable model names so a chat client
+    // accidentally requesting "anthropic" doesn't hit a backendless
+    // intercept config.
+    //
+    // pii.patterns is pre-seeded with an empty list so the override
+    // editor is visible by default — admins typically want to tighten
+    // a couple of pattern actions when intercepting a cloud provider.
+    // An empty list serializes out and the redactor ignores it.
+    fields: {
+      'name': 'mitm-anthropic',
+      'mitm.hosts': ['api.anthropic.com'],
+      'pii.enabled': true,
+      'pii.patterns': [],
+    },
+  },
 ]
 
 export default MODEL_TEMPLATES
diff --git a/core/http/routes/anthropic.go b/core/http/routes/anthropic.go
index 68b3079bd..386942e27 100644
--- a/core/http/routes/anthropic.go
+++ b/core/http/routes/anthropic.go
@@ -13,6 +13,9 @@ import (
 	mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	"github.com/mudler/LocalAI/core/services/routing/piiadapter"
+	"github.com/mudler/LocalAI/core/services/routing/router"
 	"github.com/mudler/xlog"
 )
 
@@ -32,14 +35,38 @@ func RegisterAnthropicRoutes(app *echo.Echo,
 		application.TemplatesEvaluator(),
 		application.ApplicationConfig(),
 		natsClient,
+		application.PIIRedactor(),
+		application.PIIEvents(),
 	)
 
 	messagesMiddleware := []echo.MiddlewareFunc{
-		middleware.UsageMiddleware(application.AuthDB()),
+		middleware.UsageMiddleware(application.StatsRecorder(), application.FallbackUser()),
 		middleware.TraceMiddleware(application),
 		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)),
 		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.AnthropicRequest) }),
 		setAnthropicRequestContext(application.ApplicationConfig()),
+		// RouteModel runs after the request is parsed but before the
+		// PII filter — see the OpenAI route for why this order matters
+		// (per-model PII configs apply to the routed target).
+		middleware.RouteModel(
+			application.ModelConfigLoader(),
+			application.ApplicationConfig(),
+			application.RouterDecisions(),
+			application.FallbackUser(),
+			middleware.AnthropicProbe,
+			router.SourceAnthropic,
+			middleware.ClassifierDeps{
+				Scorer:      application.Scorer,
+				Embedder:    application.Embedder,
+				VectorStore: application.VectorStore,
+				Reranker:    application.Reranker,
+				ModelLookup: application.ModelConfigLookup(),
+				Registry:    application.RouterClassifierRegistry(),
+				Evaluator:   application.TemplatesEvaluator(),
+			},
+		),
+		middleware.AdmissionControl(application.AdmissionLimiter(), application.PIIEvents()),
+		pii.RequestMiddleware(application.PIIRedactor(), application.PIIEvents(), piiadapter.Anthropic(), application.FallbackUser()),
 	}
 
 	// Main Anthropic endpoint
diff --git a/core/http/routes/localai.go b/core/http/routes/localai.go
index 5c341b90c..4f61c8525 100644
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -216,6 +216,11 @@ func RegisterLocalAIRoutes(router *echo.Echo,
 	router.GET("/api/p2p", localai.ShowP2PNodes(appConfig), adminMiddleware)
 	router.GET("/api/p2p/token", localai.ShowP2PToken(appConfig), adminMiddleware)
 
+	// Score (logprob over candidate continuations) — admin-only smoke-test
+	// surface for the gRPC Score primitive. Production consumers should
+	// use application.ScorerFactory() directly rather than HTTP.
+	router.POST("/api/score", localai.ScoreEndpoint(cl, ml, appConfig), adminMiddleware)
+
 	router.GET("/version", func(c echo.Context) error {
 		return c.JSON(200, struct {
 			Version string `json:"version"`
diff --git a/core/http/routes/middleware.go b/core/http/routes/middleware.go
new file mode 100644
index 000000000..a5c5ca9ca
--- /dev/null
+++ b/core/http/routes/middleware.go
@@ -0,0 +1,362 @@
+package routes
+
+import (
+	"context"
+	"net/http"
+	"strconv"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/application"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/auth"
+	"github.com/mudler/LocalAI/core/http/endpoints/localai"
+	"github.com/mudler/LocalAI/core/http/middleware"
+	"github.com/mudler/LocalAI/core/services/routing/router"
+)
+
+// RegisterMiddlewareRoutes wires the routing-module admin surface that
+// powers the /app/middleware React page. Two endpoints:
+//
+//   - GET /api/middleware/status — single round-trip aggregator. Lists
+//     PII patterns with current actions, each model's resolved
+//     enabled/override state, recent event count, and a router status
+//     stub (until subsystem 2 lands).
+//   - GET /api/router/status — placeholder that the page renders for
+//     the Routing tab. Returns { configured: false, models: [] } today;
+//     subsystem 2 fills it in.
+//
+// Both are admin-only when auth is on. In single-user (no-auth) mode
+// the synthetic local user has Role: admin so the page works without
+// extra config — same gating shape as the existing /api/usage/all.
+func RegisterMiddlewareRoutes(e *echo.Echo, app *application.Application) {
+	e.GET("/api/middleware/status", func(c echo.Context) error {
+		viewer := resolveUsageUser(c, app)
+		if viewer == nil {
+			return c.JSON(http.StatusUnauthorized, map[string]string{"error": "not authenticated"})
+		}
+		if viewer.Role != auth.RoleAdmin {
+			return c.JSON(http.StatusForbidden, map[string]string{"error": "admin access required"})
+		}
+
+		piiSection := buildPIIStatus(app)
+		routerSection := buildRouterStatus(app)
+		mitmSection := buildMITMStatus(app)
+		admissionSection := buildAdmissionStatus(app)
+
+		return c.JSON(http.StatusOK, map[string]any{
+			"pii":       piiSection,
+			"router":    routerSection,
+			"mitm":      mitmSection,
+			"admission": admissionSection,
+		})
+	})
+
+	e.GET("/api/router/status", func(c echo.Context) error {
+		// Read-only — admins want to see classifier configurations
+		// without authenticating, same as /api/pii/patterns.
+		return c.JSON(http.StatusOK, buildRouterStatus(app))
+	})
+
+	e.GET("/api/middleware/proxy-ca.crt", func(c echo.Context) error {
+		// The CA cert is the public half — safe to expose without
+		// auth so clients can curl it during initial setup. The
+		// private key never leaves disk and is mode 0600. Returning
+		// 404 (rather than 500) when MITM is disabled keeps the
+		// endpoint a clean "is this feature available?" probe.
+		ca := app.MITMCA()
+		if ca == nil {
+			return c.JSON(http.StatusNotFound, map[string]string{
+				"error": "mitm proxy is not enabled (set --mitm-listen to start it)",
+			})
+		}
+		c.Response().Header().Set("Content-Type", "application/x-pem-file")
+		c.Response().Header().Set("Content-Disposition", `attachment; filename="localai-mitm-ca.crt"`)
+		return c.Blob(http.StatusOK, "application/x-pem-file", ca.PublicCertPEM())
+	})
+
+	e.GET("/api/router/decisions", func(c echo.Context) error {
+		viewer := resolveUsageUser(c, app)
+		if viewer == nil {
+			return c.JSON(http.StatusUnauthorized, map[string]string{"error": "not authenticated"})
+		}
+		// Decision logs may include user ids — admin-only when auth is
+		// on; the synthetic local user has admin so single-user mode
+		// works.
+		if viewer.Role != auth.RoleAdmin {
+			return c.JSON(http.StatusForbidden, map[string]string{"error": "admin access required"})
+		}
+
+		store := app.RouterDecisions()
+		if store == nil {
+			return c.JSON(http.StatusOK, map[string]any{"decisions": []any{}})
+		}
+
+		limit := 100
+		if v := c.QueryParam("limit"); v != "" {
+			if n, err := strconv.Atoi(v); err == nil && n > 0 {
+				limit = n
+			}
+		}
+		decisions, err := store.List(c.Request().Context(), router.DecisionListQuery{
+			CorrelationID: c.QueryParam("correlation_id"),
+			UserID:        c.QueryParam("user_id"),
+			RouterModel:   c.QueryParam("router_model"),
+			Limit:         limit,
+		})
+		if err != nil {
+			return c.JSON(http.StatusInternalServerError, map[string]string{"error": "failed to list decisions"})
+		}
+		return c.JSON(http.StatusOK, map[string]any{"decisions": decisions})
+	})
+
+	// GET /api/router/cache/stats — embedding-cache counters per
+	// router model. Read-only; same auth gating as /api/router/status
+	// (any authenticated user can see configuration). Omitted entries
+	// indicate "embedding cache not enabled for this router".
+	e.GET("/api/router/cache/stats", func(c echo.Context) error {
+		reg := app.RouterClassifierRegistry()
+		stats := map[string]router.EmbeddingCacheStats{}
+		if reg != nil {
+			stats = reg.EmbeddingCacheStatsByRouter()
+		}
+		return c.JSON(http.StatusOK, map[string]any{"caches": stats})
+	})
+
+	// POST /api/router/decide — programmatic decision-oracle endpoint
+	// for external routers. Runs the same classifier that the in-band
+	// RouteModel middleware would have run and returns the chosen
+	// label set + candidate model, without rewriting the request,
+	// forwarding it, or recording a row in the decision store.
+	//
+	// Admin-only — same gating as /api/router/decisions. The risk
+	// surface is "runs classifier inference on arbitrary input", which
+	// matches the decision-log endpoint's gating.
+	decideHandler := localai.RouterDecideEndpoint(
+		app.ModelConfigLoader(),
+		app.ApplicationConfig(),
+		middleware.ClassifierDeps{
+			Scorer:      app.Scorer,
+			Embedder:    app.Embedder,
+			VectorStore: app.VectorStore,
+			Reranker:    app.Reranker,
+			ModelLookup: app.ModelConfigLookup(),
+			Registry:    app.RouterClassifierRegistry(),
+			Evaluator:   app.TemplatesEvaluator(),
+		},
+	)
+	e.POST("/api/router/decide", func(c echo.Context) error {
+		viewer := resolveUsageUser(c, app)
+		if viewer == nil {
+			return c.JSON(http.StatusUnauthorized, map[string]string{"error": "not authenticated"})
+		}
+		if viewer.Role != auth.RoleAdmin {
+			return c.JSON(http.StatusForbidden, map[string]string{"error": "admin access required"})
+		}
+		return decideHandler(c)
+	})
+}
+
+// buildRouterStatus inventories every model that declares a Router
+// block and reports their classifiers + candidate tables. Reads from
+// the same loader the RouteModel middleware uses so the admin page
+// agrees with what's actually live in the request path.
+func buildRouterStatus(app *application.Application) map[string]any {
+	models := []map[string]any{}
+	hasAny := false
+	cacheStats := map[string]router.EmbeddingCacheStats{}
+	if reg := app.RouterClassifierRegistry(); reg != nil {
+		cacheStats = reg.EmbeddingCacheStatsByRouter()
+	}
+	for _, cfg := range app.ModelConfigLoader().GetAllModelsConfigs() {
+		if !cfg.HasRouter() {
+			continue
+		}
+		hasAny = true
+		candidates := make([]map[string]any, 0, len(cfg.Router.Candidates))
+		for _, ca := range cfg.Router.Candidates {
+			candidates = append(candidates, map[string]any{
+				"model":  ca.Model,
+				"labels": ca.Labels,
+			})
+		}
+		policies := make([]map[string]any, 0, len(cfg.Router.Policies))
+		for _, p := range cfg.Router.Policies {
+			policies = append(policies, map[string]any{
+				"label":       p.Label,
+				"description": p.Description,
+			})
+		}
+		classifier := cfg.Router.Classifier
+		if classifier == "" {
+			classifier = router.ClassifierScore
+		}
+		entry := map[string]any{
+			"name":       cfg.Name,
+			"classifier": classifier,
+			"policies":   policies,
+			"candidates": candidates,
+			"fallback":   cfg.Router.Fallback,
+		}
+		if ec := cfg.Router.EmbeddingCache; ec != nil {
+			cacheEntry := map[string]any{
+				"embedding_model":      ec.EmbeddingModel,
+				"similarity_threshold": ec.SimilarityThreshold,
+				"confidence_threshold": ec.ConfidenceThreshold,
+				"store_name":           ec.StoreName,
+			}
+			if s, ok := cacheStats[cfg.Name]; ok {
+				cacheEntry["stats"] = s
+			}
+			entry["embedding_cache"] = cacheEntry
+		}
+		models = append(models, entry)
+	}
+
+	recentCount := 0
+	if store := app.RouterDecisions(); store != nil {
+		if n, err := store.Count(context.Background()); err == nil {
+			recentCount = n
+		}
+	}
+
+	out := map[string]any{
+		"configured":          hasAny,
+		"models":              models,
+		"recent_decision_count": recentCount,
+		"available_classifiers": []string{router.ClassifierScore},
+	}
+	if !hasAny {
+		out["note"] = "No router models configured. Add a `router:` block to a model YAML to enable intelligent routing."
+	}
+	return out
+}
+
+func buildMITMStatus(app *application.Application) map[string]any {
+	srv := app.MITMServer()
+	ca := app.MITMCA()
+	cfg := app.ApplicationConfig()
+
+	// MITM-bound model configs — anything with an mitm: block, even
+	// if hosts is empty. Surfaces a "fresh from template" config the
+	// admin started but hasn't yet attached a host to.
+	mitmModels := []map[string]any{}
+	for _, mc := range app.ModelConfigLoader().GetModelConfigsByFilter(func(_ string, c *config.ModelConfig) bool {
+		return len(c.MITM.Hosts) > 0
+	}) {
+		mitmModels = append(mitmModels, map[string]any{
+			"name":        mc.Name,
+			"hosts":       mc.MITM.Hosts,
+			"pii_enabled": mc.PIIIsEnabled(),
+			"backend":     mc.Backend,
+		})
+	}
+
+	out := map[string]any{
+		"running":         srv != nil,
+		"listen_addr":     "",
+		"configured_addr": cfg.MITMListen,
+		"host_owners":     app.MITMHostOwners(),
+		"host_conflicts":  app.MITMHostConflicts(),
+		"models":          mitmModels,
+		"ca_available":    ca != nil,
+		"ca_cert_url":     "",
+	}
+	if conflicts := app.MITMHostConflicts(); len(conflicts) > 0 {
+		out["error"] = "MITM listener disabled: duplicate host claims across model configs (see host_conflicts). Resolve by editing the conflicting model YAMLs so each host appears in at most one mitm.hosts list."
+	}
+	if srv != nil {
+		out["listen_addr"] = srv.Addr()
+	}
+	if ca != nil {
+		out["ca_cert_url"] = "/api/middleware/proxy-ca.crt"
+	}
+	return out
+}
+
+// buildAdmissionStatus reports each model's MaxConcurrent ceiling
+// and current in-flight count. Models with no limit set are
+// omitted — the dashboard view is "what's gated", not "every
+// model in the loader".
+func buildAdmissionStatus(app *application.Application) map[string]any {
+	limiter := app.AdmissionLimiter()
+	models := []map[string]any{}
+	if limiter == nil {
+		return map[string]any{"models": models}
+	}
+	for _, cfg := range app.ModelConfigLoader().GetAllModelsConfigs() {
+		if cfg.Limits.MaxConcurrent <= 0 {
+			continue
+		}
+		models = append(models, map[string]any{
+			"name":                cfg.Name,
+			"max_concurrent":      cfg.Limits.MaxConcurrent,
+			"retry_after_seconds": cfg.Limits.RetryAfterSeconds,
+			"in_flight":           limiter.InFlight(cfg.Name),
+		})
+	}
+	return map[string]any{"models": models}
+}
+
+// buildPIIStatus builds the pii section of /api/middleware/status. It
+// reads the live redactor, walks every model config, and reports the
+// resolved enabled state plus any per-pattern overrides — that's what
+// the admin page renders side-by-side so the operator can see at a
+// glance which models are protected.
+//
+// Returns a sentinel "disabled" payload when the redactor is nil
+// (--disable-pii), letting the page show "filter switched off" rather
+// than a confusing empty state.
+func buildPIIStatus(app *application.Application) map[string]any {
+	redactor := app.PIIRedactor()
+	if redactor == nil {
+		return map[string]any{
+			"enabled_globally": false,
+			"reason":           "--disable-pii",
+			"patterns":         []any{},
+			"models":           []any{},
+		}
+	}
+
+	patterns := redactor.Patterns()
+	patternList := make([]map[string]any, 0, len(patterns))
+	for _, p := range patterns {
+		patternList = append(patternList, map[string]any{
+			"id":               p.ID,
+			"description":      p.Description,
+			"action":           string(p.Action),
+			"disabled":         p.Disabled,
+			"max_match_length": p.MaxMatchLength,
+		})
+	}
+
+	models := []map[string]any{}
+	for _, cfg := range app.ModelConfigLoader().GetAllModelsConfigs() {
+		entry := map[string]any{
+			"name":      cfg.Name,
+			"backend":   cfg.Backend,
+			"enabled":   cfg.PIIIsEnabled(),
+			"overrides": cfg.PIIPatternOverrides(),
+		}
+		// explicit-set tells the UI whether the resolved state came
+		// from the YAML or the backend-prefix default. Helps admins
+		// understand "why is this on?" without reading source.
+		entry["explicit"] = cfg.PII.Enabled != nil
+		entry["default_for_backend"] = cfg.Backend == "cloud-proxy"
+		models = append(models, entry)
+	}
+
+	recentCount := 0
+	if app.PIIEvents() != nil {
+		if n, err := app.PIIEvents().Count(context.Background()); err == nil {
+			recentCount = n
+		}
+	}
+
+	return map[string]any{
+		"enabled_globally":             true,
+		"default_enabled_for_backends": []string{"cloud-proxy"},
+		"patterns":                     patternList,
+		"models":                       models,
+		"recent_event_count":           recentCount,
+	}
+}
diff --git a/core/http/routes/ollama.go b/core/http/routes/ollama.go
index aba0d8e97..f02db7636 100644
--- a/core/http/routes/ollama.go
+++ b/core/http/routes/ollama.go
@@ -17,7 +17,7 @@ func RegisterOllamaRoutes(app *echo.Echo,
 	application *application.Application) {
 
 	traceMiddleware := middleware.TraceMiddleware(application)
-	usageMiddleware := middleware.UsageMiddleware(application.AuthDB())
+	usageMiddleware := middleware.UsageMiddleware(application.StatsRecorder(), application.FallbackUser())
 
 	// Chat endpoint: POST /api/chat
 	chatHandler := ollama.ChatEndpoint(
diff --git a/core/http/routes/openai.go b/core/http/routes/openai.go
index bd7793ae9..417df8a79 100644
--- a/core/http/routes/openai.go
+++ b/core/http/routes/openai.go
@@ -9,6 +9,9 @@ import (
 	"github.com/mudler/LocalAI/core/http/endpoints/openai"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	"github.com/mudler/LocalAI/core/services/routing/piiadapter"
+	"github.com/mudler/LocalAI/core/services/routing/router"
 )
 
 func RegisterOpenAIRoutes(app *echo.Echo,
@@ -16,7 +19,7 @@ func RegisterOpenAIRoutes(app *echo.Echo,
 	application *application.Application) {
 	// openAI compatible API endpoint
 	traceMiddleware := middleware.TraceMiddleware(application)
-	usageMiddleware := middleware.UsageMiddleware(application.AuthDB())
+	usageMiddleware := middleware.UsageMiddleware(application.StatsRecorder(), application.FallbackUser())
 
 	// realtime
 	// TODO: Modify/disable the API key middleware for this endpoint to allow ephemeral keys created by sessions
@@ -32,7 +35,7 @@ func RegisterOpenAIRoutes(app *echo.Echo,
 	}
 
 	// chat
-	chatHandler := openai.ChatEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig(), natsClient, application.LocalAIAssistant())
+	chatHandler := openai.ChatEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig(), natsClient, application.LocalAIAssistant(), application.PIIRedactor(), application.PIIEvents())
 	chatMiddleware := []echo.MiddlewareFunc{
 		usageMiddleware,
 		traceMiddleware,
@@ -46,6 +49,40 @@ func RegisterOpenAIRoutes(app *echo.Echo,
 				return next(c)
 			}
 		},
+		// RouteModel runs AFTER the schema-specific request parser so
+		// the classifier sees a populated *schema.OpenAIRequest. When
+		// the resolved model has a Router config, the middleware
+		// rewrites input.Model to the chosen candidate, swaps
+		// MODEL_CONFIG, and stamps RequestedModel/ServedModel for the
+		// usage log. Models without a Router pass through.
+		middleware.RouteModel(
+			application.ModelConfigLoader(),
+			application.ApplicationConfig(),
+			application.RouterDecisions(),
+			application.FallbackUser(),
+			middleware.OpenAIProbe,
+			router.SourceChat,
+			middleware.ClassifierDeps{
+				Scorer:      application.Scorer,
+				Embedder:    application.Embedder,
+				VectorStore: application.VectorStore,
+				Reranker:    application.Reranker,
+				ModelLookup: application.ModelConfigLookup(),
+				Registry:    application.RouterClassifierRegistry(),
+				Evaluator:   application.TemplatesEvaluator(),
+			},
+		),
+		// Admission control runs after RouteModel so the SERVED
+		// model's limits apply — a router fanout that lands on a
+		// saturated downstream gets rejected even when the requested
+		// router-model has slack.
+		middleware.AdmissionControl(application.AdmissionLimiter(), application.PIIEvents()),
+		// PII redaction runs INNERMOST, after RouteModel has resolved
+		// the actual served model. This is what makes per-model PII
+		// configs honour the routed target (e.g., a router fans out to
+		// claude-strict; that model's pii block applies, not the
+		// router model's).
+		pii.RequestMiddleware(application.PIIRedactor(), application.PIIEvents(), piiadapter.OpenAI(), application.FallbackUser()),
 	}
 	app.POST("/v1/chat/completions", chatHandler, chatMiddleware...)
 	app.POST("/chat/completions", chatHandler, chatMiddleware...)
@@ -71,7 +108,7 @@ func RegisterOpenAIRoutes(app *echo.Echo,
 	app.POST("/edits", editHandler, editMiddleware...)
 
 	// completion
-	completionHandler := openai.CompletionEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig())
+	completionHandler := openai.CompletionEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig(), application.PIIRedactor(), application.PIIEvents())
 	completionMiddleware := []echo.MiddlewareFunc{
 		usageMiddleware,
 		traceMiddleware,
diff --git a/core/http/routes/openresponses.go b/core/http/routes/openresponses.go
index 951e34910..a5932fb0f 100644
--- a/core/http/routes/openresponses.go
+++ b/core/http/routes/openresponses.go
@@ -34,7 +34,7 @@ func RegisterOpenResponsesRoutes(app *echo.Echo,
 		// Intercept requests where the model name matches an agent — route directly
 		// to the agent pool without going through the model config resolution pipeline.
 		localai.AgentResponsesInterceptor(application),
-		middleware.UsageMiddleware(application.AuthDB()),
+		middleware.UsageMiddleware(application.StatsRecorder(), application.FallbackUser()),
 		middleware.TraceMiddleware(application),
 		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)),
 		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenResponsesRequest) }),
@@ -49,8 +49,8 @@ func RegisterOpenResponsesRoutes(app *echo.Echo,
 
 	// WebSocket mode for Responses API
 	wsHandler := openresponses.WebSocketEndpoint(application)
-	app.GET("/v1/responses", wsHandler, middleware.UsageMiddleware(application.AuthDB()), middleware.TraceMiddleware(application))
-	app.GET("/responses", wsHandler, middleware.UsageMiddleware(application.AuthDB()), middleware.TraceMiddleware(application))
+	app.GET("/v1/responses", wsHandler, middleware.UsageMiddleware(application.StatsRecorder(), application.FallbackUser()), middleware.TraceMiddleware(application))
+	app.GET("/responses", wsHandler, middleware.UsageMiddleware(application.StatsRecorder(), application.FallbackUser()), middleware.TraceMiddleware(application))
 
 	// GET /responses/:id - Retrieve a response (for polling background requests)
 	getResponseHandler := openresponses.GetResponseEndpoint()
diff --git a/core/http/routes/pii.go b/core/http/routes/pii.go
new file mode 100644
index 000000000..8b8ec903e
--- /dev/null
+++ b/core/http/routes/pii.go
@@ -0,0 +1,260 @@
+package routes
+
+import (
+	"net/http"
+	"strconv"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/application"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/auth"
+	"github.com/mudler/LocalAI/core/http/endpoints/localai"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+)
+
+// RegisterPIIRoutes wires the read-only routing-PII endpoints. They
+// surface (a) the active pattern set so admins can verify what is
+// being filtered, (b) the recent PIIEvent log so they can audit what
+// has been redacted, and (c) a dry-run "test" endpoint so an admin
+// can paste candidate text and see what the redactor would do without
+// sending a real request.
+//
+// The redactor itself runs from the chat middleware in routes/openai.go;
+// these endpoints are observation- and configuration-side only.
+func RegisterPIIRoutes(e *echo.Echo, app *application.Application) {
+	if app.PIIRedactor() == nil {
+		stub := func(c echo.Context) error {
+			return c.JSON(http.StatusServiceUnavailable, map[string]string{
+				"error": "PII filter is disabled (--disable-pii)",
+			})
+		}
+		e.GET("/api/pii/patterns", stub)
+		e.GET("/api/pii/events", stub)
+		e.POST("/api/pii/test", stub)
+		e.POST("/api/pii/decide", stub)
+		e.POST("/api/pii/patterns/persist", stub)
+		return
+	}
+
+	// GetPIIPatternsEndpoint godoc
+	// @Summary List the active PII patterns
+	// @Description Returns the configured pattern set with their actions. Available without auth.
+	// @Tags pii
+	// @Produce json
+	// @Success 200 {object} map[string]interface{}
+	// @Router /api/pii/patterns [get]
+	e.GET("/api/pii/patterns", func(c echo.Context) error {
+		patterns := app.PIIRedactor().Patterns()
+		out := make([]map[string]any, 0, len(patterns))
+		for _, p := range patterns {
+			out = append(out, map[string]any{
+				"id":               p.ID,
+				"description":      p.Description,
+				"action":           string(p.Action),
+				"disabled":         p.Disabled,
+				"max_match_length": p.MaxMatchLength,
+			})
+		}
+		return c.JSON(http.StatusOK, map[string]any{"patterns": out})
+	})
+
+	// GetPIIEventsEndpoint godoc
+	// @Summary List recent middleware events
+	// @Description The event log is shared between the PII filter and the MITM proxy: PII redactions, proxy_connect (intercept decisions), and proxy_traffic (per-request byte counts) all flow through the same store. Filter by kind to narrow the view. Admin-only when auth is on; available to the local user in single-user mode.
+	// @Tags pii
+	// @Produce json
+	// @Param correlation_id query string false "Correlation ID join key"
+	// @Param user_id query string false "User id"
+	// @Param pattern_id query string false "Pattern id (e.g. email, ssn)"
+	// @Param kind query string false "Event kind: pii | proxy_connect | proxy_traffic"
+	// @Param limit query int false "Max events" default(100)
+	// @Success 200 {object} map[string]interface{}
+	// @Router /api/pii/events [get]
+	e.GET("/api/pii/events", func(c echo.Context) error {
+		viewer := resolveUsageUser(c, app)
+		if viewer == nil {
+			return c.JSON(http.StatusUnauthorized, map[string]string{"error": "not authenticated"})
+		}
+		// Admin-only when auth is enabled. Local user has Role: admin.
+		if viewer.Role != auth.RoleAdmin {
+			return c.JSON(http.StatusForbidden, map[string]string{"error": "admin access required"})
+		}
+
+		limit := 100
+		if v := c.QueryParam("limit"); v != "" {
+			if n, err := strconv.Atoi(v); err == nil && n > 0 {
+				limit = n
+			}
+		}
+		events, err := app.PIIEvents().List(c.Request().Context(), pii.ListQuery{
+			CorrelationID: c.QueryParam("correlation_id"),
+			UserID:        c.QueryParam("user_id"),
+			PatternID:     c.QueryParam("pattern_id"),
+			Kind:          pii.EventKind(c.QueryParam("kind")),
+			Limit:         limit,
+		})
+		if err != nil {
+			return c.JSON(http.StatusInternalServerError, map[string]string{"error": "failed to list events"})
+		}
+		return c.JSON(http.StatusOK, map[string]any{"events": events})
+	})
+
+	// PostPIITestEndpoint godoc
+	// @Summary Dry-run the PII redactor against text
+	// @Description Useful for admins tuning patterns. Returns the redacted text, matched spans, and whether the input would have been blocked.
+	// @Tags pii
+	// @Accept json
+	// @Produce json
+	// @Param body body map[string]string true "JSON {\"text\":\"...\"}"
+	// @Success 200 {object} map[string]interface{}
+	// @Router /api/pii/test [post]
+	e.POST("/api/pii/test", func(c echo.Context) error {
+		var body struct {
+			Text string `json:"text"`
+		}
+		if err := c.Bind(&body); err != nil {
+			return c.JSON(http.StatusBadRequest, map[string]string{"error": "invalid JSON"})
+		}
+		res := app.PIIRedactor().Redact(body.Text)
+		return c.JSON(http.StatusOK, map[string]any{
+			"redacted":   res.Redacted,
+			"spans":      res.Spans,
+			"blocked":    res.Blocked,
+			"local_only": res.LocalOnly,
+		})
+	})
+
+	// POST /api/pii/decide — programmatic PII decision oracle for
+	// external routers. Returns findings + suggested action without
+	// mutating the caller's request or recording an audit event.
+	// Production hot path — admin-only, matching /api/pii/events.
+	decideHandler := localai.PIIDecideEndpoint(app.PIIRedactor())
+	e.POST("/api/pii/decide", func(c echo.Context) error {
+		viewer := resolveUsageUser(c, app)
+		if viewer == nil {
+			return c.JSON(http.StatusUnauthorized, map[string]string{"error": "not authenticated"})
+		}
+		if viewer.Role != auth.RoleAdmin {
+			return c.JSON(http.StatusForbidden, map[string]string{"error": "admin access required"})
+		}
+		return decideHandler(c)
+	})
+
+	// PutPIIPatternActionEndpoint godoc
+	// @Summary Change a pattern's action in-process
+	// @Description Mutates the named pattern's action (mask|block|route_local). Transient — restored to YAML defaults on restart. Admin-only.
+	// @Tags pii
+	// @Accept json
+	// @Produce json
+	// @Param id path string true "Pattern id"
+	// @Param body body map[string]string true "JSON {\"action\":\"mask|block|route_local\"}"
+	// @Success 200 {object} map[string]interface{}
+	// @Router /api/pii/patterns/{id} [put]
+	e.PUT("/api/pii/patterns/:id", func(c echo.Context) error {
+		viewer := resolveUsageUser(c, app)
+		if viewer == nil {
+			return c.JSON(http.StatusUnauthorized, map[string]string{"error": "not authenticated"})
+		}
+		if viewer.Role != auth.RoleAdmin {
+			return c.JSON(http.StatusForbidden, map[string]string{"error": "admin access required"})
+		}
+
+		id := c.Param("id")
+		if id == "" {
+			return c.JSON(http.StatusBadRequest, map[string]string{"error": "pattern id is required"})
+		}
+		// Either field is optional. The body must set at least one;
+		// otherwise the call is a no-op and the client probably means
+		// to PUT something.
+		var body struct {
+			Action   *string `json:"action,omitempty"`
+			Disabled *bool   `json:"disabled,omitempty"`
+		}
+		if err := c.Bind(&body); err != nil {
+			return c.JSON(http.StatusBadRequest, map[string]string{"error": "invalid JSON"})
+		}
+		if body.Action == nil && body.Disabled == nil {
+			return c.JSON(http.StatusBadRequest, map[string]string{"error": "must specify action and/or disabled"})
+		}
+		if body.Action != nil {
+			if err := app.PIIRedactor().SetAction(id, pii.Action(*body.Action)); err != nil {
+				return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
+			}
+		}
+		if body.Disabled != nil {
+			if err := app.PIIRedactor().SetDisabled(id, *body.Disabled); err != nil {
+				return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()})
+			}
+		}
+		return c.JSON(http.StatusOK, map[string]any{
+			"id":        id,
+			"action":    body.Action,
+			"disabled":  body.Disabled,
+			"persisted": false,
+		})
+	})
+
+	// PostPIIPatternsPersistEndpoint godoc
+	// @Summary Persist current pattern overrides to disk
+	// @Description Snapshots the live redactor's per-pattern (action, disabled) state into runtime_settings.json so the next process start re-applies it. Admin-only. Pairs with PUT /api/pii/patterns/:id which only mutates in-process.
+	// @Tags pii
+	// @Produce json
+	// @Success 200 {object} map[string]interface{}
+	// @Router /api/pii/patterns/persist [post]
+	e.POST("/api/pii/patterns/persist", func(c echo.Context) error {
+		viewer := resolveUsageUser(c, app)
+		if viewer == nil {
+			return c.JSON(http.StatusUnauthorized, map[string]string{"error": "not authenticated"})
+		}
+		if viewer.Role != auth.RoleAdmin {
+			return c.JSON(http.StatusForbidden, map[string]string{"error": "admin access required"})
+		}
+
+		appCfg := app.ApplicationConfig()
+		existing, err := appCfg.ReadPersistedSettings()
+		if err != nil {
+			return c.JSON(http.StatusInternalServerError, map[string]string{"error": "read settings: " + err.Error()})
+		}
+		// Only persist patterns whose live state differs from the YAML
+		// default — that way an operator can compare runtime_settings.json
+		// at a glance and see only the deltas they applied.
+		defaults, dErr := pii.LoadConfig(appCfg.PIIConfigPath)
+		if dErr != nil {
+			return c.JSON(http.StatusInternalServerError, map[string]string{"error": "reload defaults: " + dErr.Error()})
+		}
+		defaultByID := make(map[string]pii.Pattern, len(defaults))
+		for _, d := range defaults {
+			defaultByID[d.ID] = d
+		}
+		overrides := map[string]config.PIIPatternRuntimeOverride{}
+		for _, p := range app.PIIRedactor().Patterns() {
+			d, ok := defaultByID[p.ID]
+			ov := config.PIIPatternRuntimeOverride{}
+			changed := false
+			if !ok || p.Action != d.Action {
+				action := string(p.Action)
+				ov.Action = &action
+				changed = true
+			}
+			if !ok || p.Disabled != d.Disabled {
+				disabled := p.Disabled
+				ov.Disabled = &disabled
+				changed = true
+			}
+			if changed {
+				overrides[p.ID] = ov
+			}
+		}
+		existing.PIIPatternOverrides = &overrides
+		if err := appCfg.WritePersistedSettings(existing); err != nil {
+			return c.JSON(http.StatusInternalServerError, map[string]string{"error": "write settings: " + err.Error()})
+		}
+		// Mirror onto the live ApplicationConfig so a subsequent reload
+		// without a process restart sees the same map.
+		appCfg.PIIPatternOverrides = overrides
+		return c.JSON(http.StatusOK, map[string]any{
+			"persisted":          true,
+			"override_count":     len(overrides),
+		})
+	})
+}
diff --git a/core/http/routes/usage.go b/core/http/routes/usage.go
new file mode 100644
index 000000000..4565d3ee9
--- /dev/null
+++ b/core/http/routes/usage.go
@@ -0,0 +1,157 @@
+package routes
+
+import (
+	"net/http"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/application"
+	"github.com/mudler/LocalAI/core/http/auth"
+	"github.com/mudler/LocalAI/core/services/routing/billing"
+)
+
+// RegisterUsageRoutes wires the routing-module billing endpoints. These
+// are the auth-agnostic siblings of /api/auth/usage and
+// /api/auth/admin/usage — they go through application.StatsRecorder()
+// so that a no-auth single-user box also gets a working dashboard
+// (the existing /api/auth/usage hardcodes a 401 when no user is on the
+// context).
+//
+// Permission model:
+//   - GET /api/usage          → current user's own usage; falls back to
+//     the synthetic "local" user when auth is off.
+//   - GET /api/usage/all      → cluster-wide; requires admin when auth
+//     is on. In no-auth mode the local user is the only principal and
+//     is treated as admin (the LocalUser is constructed with Role:
+//     admin), so this endpoint returns the same data as /api/usage.
+//
+// Both endpoints accept ?period={day|week|month|all} (default month)
+// and ?user_id=… on the admin path.
+func RegisterUsageRoutes(e *echo.Echo, app *application.Application) {
+	rec := app.StatsRecorder()
+	if rec == nil {
+		// Stats explicitly disabled (--disable-stats). Register stub
+		// handlers that return 503 with a clear reason rather than
+		// 404; clients (UI, MCP tools) can distinguish "not enabled
+		// here" from "endpoint missing entirely".
+		stub := func(c echo.Context) error {
+			return c.JSON(http.StatusServiceUnavailable, map[string]string{
+				"error": "usage tracking is disabled (--disable-stats)",
+			})
+		}
+		e.GET("/api/usage", stub)
+		e.GET("/api/usage/all", stub)
+		return
+	}
+
+	// GetUsageEndpoint godoc
+	// @Summary Get usage and token totals for the current user
+	// @Description Returns time-bucketed token usage for the authenticated user. In single-user no-auth mode, returns usage for the synthetic "local" user. Pass ?period={day|week|month|all}.
+	// @Tags usage
+	// @Produce json
+	// @Param period query string false "Time window: day, week, month, all" default(month)
+	// @Success 200 {object} map[string]interface{}
+	// @Router /api/usage [get]
+	e.GET("/api/usage", func(c echo.Context) error {
+		user := resolveUsageUser(c, app)
+		if user == nil {
+			return c.JSON(http.StatusUnauthorized, map[string]string{
+				"error": "not authenticated",
+			})
+		}
+
+		period := c.QueryParam("period")
+		if period == "" {
+			period = "month"
+		}
+
+		buckets, err := rec.Aggregate(c.Request().Context(), billing.AggregateQuery{
+			UserID: user.ID,
+			Period: period,
+		})
+		if err != nil {
+			return c.JSON(http.StatusInternalServerError, map[string]string{
+				"error": "failed to get usage",
+			})
+		}
+		return c.JSON(http.StatusOK, usageResponse(buckets, user))
+	})
+
+	// GetAllUsageEndpoint godoc
+	// @Summary Get cluster-wide usage (admin)
+	// @Description Returns aggregate usage across all users. Requires admin role when auth is enabled. In single-user no-auth mode, returns the same data as /api/usage (the local user is the only principal).
+	// @Tags usage
+	// @Produce json
+	// @Param period query string false "Time window: day, week, month, all" default(month)
+	// @Param user_id query string false "Filter to a specific user"
+	// @Success 200 {object} map[string]interface{}
+	// @Failure 403 {object} map[string]string
+	// @Router /api/usage/all [get]
+	e.GET("/api/usage/all", func(c echo.Context) error {
+		user := resolveUsageUser(c, app)
+		if user == nil {
+			return c.JSON(http.StatusUnauthorized, map[string]string{
+				"error": "not authenticated",
+			})
+		}
+		// Admin gate. The synthetic local user is built with Role: admin
+		// in single-user mode, so this passes naturally when auth is off.
+		if user.Role != auth.RoleAdmin {
+			return c.JSON(http.StatusForbidden, map[string]string{
+				"error": "admin access required",
+			})
+		}
+
+		period := c.QueryParam("period")
+		if period == "" {
+			period = "month"
+		}
+		filterUser := c.QueryParam("user_id")
+
+		buckets, err := rec.Aggregate(c.Request().Context(), billing.AggregateQuery{
+			UserID: filterUser, // empty = all users
+			Period: period,
+		})
+		if err != nil {
+			return c.JSON(http.StatusInternalServerError, map[string]string{
+				"error": "failed to get usage",
+			})
+		}
+		return c.JSON(http.StatusOK, usageResponse(buckets, user))
+	})
+}
+
+// resolveUsageUser returns the authenticated user when present,
+// otherwise the synthetic local user when auth is off. Centralizes the
+// "if not auth, fall back to local" pattern that both routes need.
+func resolveUsageUser(c echo.Context, app *application.Application) *auth.User {
+	if u := auth.GetUser(c); u != nil {
+		return u
+	}
+	return app.FallbackUser()
+}
+
+// usageResponse builds the JSON shape the UI consumes. The "viewer"
+// field surfaces who the data belongs to so a single-user dashboard
+// can show "local" without inventing its own labels.
+func usageResponse(buckets []auth.UsageBucket, viewer *auth.User) map[string]any {
+	totals := auth.UsageTotals{}
+	for _, b := range buckets {
+		totals.PromptTokens += b.PromptTokens
+		totals.CompletionTokens += b.CompletionTokens
+		totals.TotalTokens += b.TotalTokens
+		totals.RequestCount += b.RequestCount
+	}
+	resp := map[string]any{
+		"usage":  buckets,
+		"totals": totals,
+	}
+	if viewer != nil {
+		resp["viewer"] = map[string]string{
+			"id":       viewer.ID,
+			"name":     viewer.Name,
+			"role":     viewer.Role,
+			"provider": viewer.Provider,
+		}
+	}
+	return resp
+}
diff --git a/core/http/routes/usage_test.go b/core/http/routes/usage_test.go
new file mode 100644
index 000000000..741878781
--- /dev/null
+++ b/core/http/routes/usage_test.go
@@ -0,0 +1,135 @@
+package routes_test
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/http/auth"
+	"github.com/mudler/LocalAI/core/services/routing/billing"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// fakeRecorderBackend lets us assert what the handler asked for without
+// pulling in a real GORM/SQLite. The aggregate query is captured so
+// the test can verify (a) it ran with the right user/period and (b)
+// the JSON shape of the response matches the UI's expectations.
+type fakeRecorderBackend struct {
+	lastQuery billing.AggregateQuery
+	buckets   []auth.UsageBucket
+}
+
+func (f *fakeRecorderBackend) Record(_ context.Context, _ *auth.UsageRecord) error { return nil }
+func (f *fakeRecorderBackend) Aggregate(_ context.Context, q billing.AggregateQuery) ([]auth.UsageBucket, error) {
+	f.lastQuery = q
+	return f.buckets, nil
+}
+func (f *fakeRecorderBackend) Close() error { return nil }
+
+// usageHandler reproduces the /api/usage handler logic from
+// routes/usage.go without going through application.Application, which
+// drags in galleryop, model loaders, etc. Keeping this tight test
+// surface lets the no-auth path (the user-visible feature here) be
+// covered without the auth build tag.
+func usageHandler(rec *billing.Recorder, fallback *auth.User) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		user := auth.GetUser(c)
+		if user == nil {
+			user = fallback
+		}
+		if user == nil {
+			return c.JSON(http.StatusUnauthorized, map[string]string{"error": "not authenticated"})
+		}
+		period := c.QueryParam("period")
+		if period == "" {
+			period = "month"
+		}
+		buckets, err := rec.Aggregate(c.Request().Context(), billing.AggregateQuery{
+			UserID: user.ID,
+			Period: period,
+		})
+		if err != nil {
+			return c.JSON(http.StatusInternalServerError, map[string]string{"error": "agg failed"})
+		}
+		return c.JSON(http.StatusOK, map[string]any{
+			"usage": buckets,
+			"viewer": map[string]string{
+				"id":   user.ID,
+				"name": user.Name,
+				"role": user.Role,
+			},
+		})
+	}
+}
+
+var _ = Describe("Usage endpoint", func() {
+	It("resolves the local user in no-auth mode", func() {
+		fb := &fakeRecorderBackend{
+			buckets: []auth.UsageBucket{
+				{Bucket: "2026-05-05", Model: "qwen-7b", PromptTokens: 100, TotalTokens: 150, RequestCount: 3},
+			},
+		}
+		rec := billing.NewRecorder(fb)
+		fallback := &auth.User{ID: "local-uuid", Name: "local", Role: auth.RoleAdmin}
+
+		e := echo.New()
+		e.GET("/api/usage", usageHandler(rec, fallback))
+
+		// No Authorization header: simulates --auth=off. The handler must
+		// fall through to the fallback user instead of 401-ing.
+		req := httptest.NewRequest(http.MethodGet, "/api/usage?period=week", nil)
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusOK), "status: got %d, body: %s", w.Code, w.Body.String())
+		Expect(fb.lastQuery.UserID).To(Equal("local-uuid"))
+		Expect(fb.lastQuery.Period).To(Equal("week"))
+
+		var resp struct {
+			Usage []struct {
+				Model        string `json:"model"`
+				TotalTokens  int64  `json:"total_tokens"`
+				RequestCount int64  `json:"request_count"`
+			} `json:"usage"`
+			Viewer struct {
+				ID   string `json:"id"`
+				Name string `json:"name"`
+			} `json:"viewer"`
+		}
+		Expect(json.Unmarshal(w.Body.Bytes(), &resp)).To(Succeed())
+		Expect(resp.Usage).To(HaveLen(1))
+		Expect(resp.Usage[0].Model).To(Equal("qwen-7b"))
+		Expect(resp.Viewer.ID).To(Equal("local-uuid"))
+		Expect(resp.Viewer.Name).To(Equal("local"))
+	})
+
+	It("returns 401 when there is no user and no fallback", func() {
+		rec := billing.NewRecorder(&fakeRecorderBackend{})
+		e := echo.New()
+		e.GET("/api/usage", usageHandler(rec, nil))
+
+		req := httptest.NewRequest(http.MethodGet, "/api/usage", nil)
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusUnauthorized))
+	})
+
+	It("defaults to month period when none is supplied", func() {
+		fb := &fakeRecorderBackend{}
+		rec := billing.NewRecorder(fb)
+		fallback := &auth.User{ID: "u", Name: "u", Role: auth.RoleAdmin}
+		e := echo.New()
+		e.GET("/api/usage", usageHandler(rec, fallback))
+
+		req := httptest.NewRequest(http.MethodGet, "/api/usage", nil)
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(fb.lastQuery.Period).To(Equal("month"))
+	})
+})
diff --git a/core/schema/localai.go b/core/schema/localai.go
index 5fceae0b3..8704f8ad8 100644
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -430,3 +430,98 @@ type SettingsResponse struct {
 	Error   string `json:"error,omitempty"`
 	Message string `json:"message,omitempty"`
 }
+
+// RouterDecideRequest is the input for POST /api/router/decide — the
+// programmatic decision-oracle endpoint. Given the name of a router
+// model (a ModelConfig that carries a `router:` block) and a prompt,
+// the endpoint returns the classifier's label set plus the candidate
+// model the in-band RouteModel middleware would have chosen. The
+// endpoint does NOT rewrite any request, forward to a backend, or
+// record a row in the decision store — it is a pure decision oracle
+// for external routers that want LocalAI's classifier opinion without
+// committing LocalAI to handle the request.
+type RouterDecideRequest struct {
+	// Router is the name of the router model (a ModelConfig with a
+	// `router:` block). Required.
+	Router string `json:"router"`
+	// Input is the user-visible prompt text to classify. Required.
+	// Schema-shape extraction (chat-message concatenation, etc.) is
+	// the caller's responsibility — matches the Probe contract used
+	// by the in-band middleware.
+	Input string `json:"input"`
+}
+
+// RouterDecideResponse carries the classifier's decision plus the
+// resolved candidate. Mirrors router.Decision with the addition of
+// Candidate/Fallback so the caller learns which downstream model
+// would have served the request without re-implementing the
+// label-set → candidate match locally.
+type RouterDecideResponse struct {
+	// Router echoes the requested router model.
+	Router string `json:"router"`
+	// Classifier is the classifier name that produced the decision
+	// (e.g. "score").
+	Classifier string `json:"classifier"`
+	// Labels is the set of active policy labels.
+	Labels []string `json:"labels"`
+	// Candidate is the model that would be routed to. Empty when no
+	// candidate covers Labels AND no fallback is configured.
+	Candidate string `json:"candidate,omitempty"`
+	// Fallback is true when Candidate is the router's configured
+	// fallback because no candidate covered Labels. Lets callers
+	// distinguish "matched" from "fell back" without comparing names.
+	Fallback bool `json:"fallback,omitempty"`
+	// Score is the top label's softmax probability (the
+	// classifier-side confidence signal).
+	Score float64 `json:"score"`
+	// LatencyMs is the classifier's wall-clock cost.
+	LatencyMs int64 `json:"latency_ms"`
+	// Cached is true when the decision came from the L2 embedding
+	// cache rather than a fresh classifier run.
+	Cached bool `json:"cached,omitempty"`
+	// CacheSimilarity carries the cosine similarity of the cache hit
+	// (0 when not cached).
+	CacheSimilarity float64 `json:"cache_similarity,omitempty"`
+}
+
+// PIIDecideRequest is the input for POST /api/pii/decide — the
+// programmatic PII-decision oracle. External routers call it before
+// dispatching a request to learn whether the content carries PII and
+// what action the configured pattern set would take. The endpoint
+// inspects the text and returns findings + a suggested action; it
+// does NOT mutate the input, record an audit event, or rewrite any
+// downstream request. The caller composes the decision with its own
+// policy (mask, block, route to local-only backends, allow).
+type PIIDecideRequest struct {
+	// Text is the user-visible content to inspect. Required.
+	Text string `json:"text"`
+}
+
+// PIIDecideResponse carries the redactor's findings.
+// SuggestedAction is derived from the action ordering used by the
+// internal redactor (block > route_local > mask > allow) so callers
+// don't need to replicate that logic.
+type PIIDecideResponse struct {
+	// Findings is one entry per matched span — pattern id, byte
+	// range, and audit-safe hash prefix (never the matched value).
+	Findings []PIIFinding `json:"findings"`
+	// SuggestedAction is the strongest action across all findings:
+	// "block", "route_local", "mask", or "allow" (no findings).
+	SuggestedAction string `json:"suggested_action"`
+	// RedactedPreview is the input with mask-action spans replaced
+	// by their placeholders. Identical to Text when no findings or
+	// when the strongest action is block/route_local (which don't
+	// rewrite content).
+	RedactedPreview string `json:"redacted_preview"`
+}
+
+// PIIFinding mirrors pii.Span on the wire. Pattern is the pattern id
+// that matched (e.g. "email"). HashPrefix is the first 8 chars of
+// sha256(matched value) — lets admins correlate recurring leaks
+// without recovering the value itself.
+type PIIFinding struct {
+	Start      int    `json:"start"`
+	End        int    `json:"end"`
+	Pattern    string `json:"pattern"`
+	HashPrefix string `json:"hash_prefix"`
+}
diff --git a/core/schema/message.go b/core/schema/message.go
index 24407165e..d55e91345 100644
--- a/core/schema/message.go
+++ b/core/schema/message.go
@@ -18,10 +18,14 @@ type Message struct {
 	// The message content
 	Content any `json:"content" yaml:"content"`
 
-	StringContent string   `json:"string_content,omitempty" yaml:"string_content,omitempty"`
-	StringImages  []string `json:"string_images,omitempty" yaml:"string_images,omitempty"`
-	StringVideos  []string `json:"string_videos,omitempty" yaml:"string_videos,omitempty"`
-	StringAudios  []string `json:"string_audios,omitempty" yaml:"string_audios,omitempty"`
+	// Staging buffers populated by the request middleware while
+	// decoding multimodal Content. Never serialised — strict
+	// providers (Anthropic) 400 on unknown message fields when the
+	// cloud-proxy passthrough re-marshals Message verbatim.
+	StringContent string   `json:"-" yaml:"-"`
+	StringImages  []string `json:"-" yaml:"-"`
+	StringVideos  []string `json:"-" yaml:"-"`
+	StringAudios  []string `json:"-" yaml:"-"`
 
 	// A result of a function call
 	FunctionCall any `json:"function_call,omitempty" yaml:"function_call,omitempty"`
diff --git a/core/schema/message_test.go b/core/schema/message_test.go
index 8ebf3fa05..d14b0c16c 100644
--- a/core/schema/message_test.go
+++ b/core/schema/message_test.go
@@ -255,6 +255,59 @@ var _ = Describe("LLM tests", func() {
 			Expect(protoMessages[0].ReasoningContent).To(Equal("thinking..."))
 		})
 
+		It("should not leak unset LocalAI-only or cross-endpoint request fields into JSON", func() {
+			// OpenAIRequest is a union over chat / completion /
+			// embedding / image / whisper. Strict upstream providers
+			// (OpenAI, Anthropic) 400 on unknown parameters when
+			// cloud-proxy passthrough re-marshals a chat request and
+			// whisper's `file`, image's `step`, embedding's `input`,
+			// etc. tag along as empty zero values.
+			req := OpenAIRequest{}
+			req.Model = "gpt-4"
+			data, err := json.Marshal(req)
+			Expect(err).NotTo(HaveOccurred())
+			body := string(data)
+			// Anchor with the trailing `:` so e.g. `"stream"` doesn't
+			// false-match `"stream_options"` if a future test setup
+			// populates the latter.
+			for _, key := range []string{
+				// LocalAI-only fields
+				`"backend":`, `"grammar":`, `"grammar_json_functions":`,
+				`"model_base_name":`, `"reasoning_effort":`,
+				// Cross-endpoint fields that don't belong on chat
+				`"file":`, `"size":`, `"prompt":`, `"instruction":`,
+				`"input":`, `"stop":`, `"messages":`, `"functions":`,
+				`"function_call":`, `"stream":`, `"quality":`, `"step":`,
+				`"metadata":`,
+			} {
+				Expect(body).NotTo(ContainSubstring(key), "unset field "+key+" must not appear in marshalled JSON")
+			}
+		})
+
+		It("should not leak internal String* staging fields into JSON", func() {
+			// Regression: the request middleware copies decoded
+			// Content into StringContent/StringImages/etc. for
+			// templating. When cloud-proxy passthrough re-marshals
+			// the request, strict providers (Anthropic) 400 with
+			// "messages.0.string_content: Extra inputs are not
+			// permitted" if these leak.
+			msg := Message{
+				Role:          "user",
+				Content:       "Hello",
+				StringContent: "Hello",
+				StringImages:  []string{"base64-blob"},
+				StringVideos:  []string{"base64-blob"},
+				StringAudios:  []string{"base64-blob"},
+			}
+			data, err := json.Marshal(msg)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(string(data)).NotTo(ContainSubstring("string_content"))
+			Expect(string(data)).NotTo(ContainSubstring("string_images"))
+			Expect(string(data)).NotTo(ContainSubstring("string_videos"))
+			Expect(string(data)).NotTo(ContainSubstring("string_audios"))
+			Expect(string(data)).To(ContainSubstring(`"content":"Hello"`))
+		})
+
 		It("should handle message with array content containing non-text parts", func() {
 			messages := Messages{
 				{
diff --git a/core/schema/openai.go b/core/schema/openai.go
index 83ab3a9fc..897dcbb97 100644
--- a/core/schema/openai.go
+++ b/core/schema/openai.go
@@ -181,8 +181,15 @@ type OpenAIRequest struct {
 	Context context.Context    `json:"-"`
 	Cancel  context.CancelFunc `json:"-"`
 
+	// OpenAIRequest is a union over chat / completion / embedding /
+	// edit / image / whisper endpoints. Most fields apply to only one
+	// endpoint family — they MUST be omitempty so the re-marshal path
+	// in cloud-proxy passthrough doesn't ship whisper's `file:""` or
+	// embedding's `input:null` to an upstream chat endpoint, which
+	// strict providers (OpenAI) reject as unknown parameters.
+
 	// whisper
-	File string `json:"file" validate:"required"`
+	File string `json:"file,omitempty" validate:"required"`
 	// Multiple input images for img2img or inpainting
 	Files []string `json:"files,omitempty"`
 	// Reference images for models that support them (e.g., Flux Kontext)
@@ -190,47 +197,54 @@ type OpenAIRequest struct {
 	//whisper/image
 	ResponseFormat any `json:"response_format,omitempty"`
 	// image
-	Size string `json:"size"`
+	Size string `json:"size,omitempty"`
 	// Prompt is read only by completion/image API calls
-	Prompt any `json:"prompt" yaml:"prompt"`
+	Prompt any `json:"prompt,omitempty" yaml:"prompt"`
 
 	// Edit endpoint
-	Instruction string `json:"instruction" yaml:"instruction"`
-	Input       any    `json:"input" yaml:"input"`
+	Instruction string `json:"instruction,omitempty" yaml:"instruction"`
+	Input       any    `json:"input,omitempty" yaml:"input"`
 
-	Stop any `json:"stop" yaml:"stop"`
+	Stop any `json:"stop,omitempty" yaml:"stop"`
 
 	// Messages is read only by chat/completion API calls
-	Messages []Message `json:"messages" yaml:"messages"`
+	Messages []Message `json:"messages,omitempty" yaml:"messages"`
 
 	// A list of available functions to call
-	Functions    functions.Functions `json:"functions" yaml:"functions"`
-	FunctionCall any                 `json:"function_call" yaml:"function_call"` // might be a string or an object
+	Functions    functions.Functions `json:"functions,omitempty" yaml:"functions"`
+	FunctionCall any                 `json:"function_call,omitempty" yaml:"function_call"` // might be a string or an object
 
 	Tools       []functions.Tool `json:"tools,omitempty" yaml:"tools"`
 	ToolsChoice any              `json:"tool_choice,omitempty" yaml:"tool_choice"`
 
-	Stream bool `json:"stream"`
+	Stream bool `json:"stream,omitempty"`
 
 	// StreamOptions opts into OpenAI streaming extensions, e.g. include_usage.
 	StreamOptions *StreamOptions `json:"stream_options,omitempty" yaml:"stream_options,omitempty"`
 
 	// Image (not supported by OpenAI)
-	Quality string `json:"quality"`
-	Step    int    `json:"step"`
+	Quality string `json:"quality,omitempty"`
+	Step    int    `json:"step,omitempty"`
+
+	// LocalAI-specific request fields below. They carry server-side
+	// routing/templating hints and are NOT part of the OpenAI surface
+	// — leaking them upstream as zero values trips strict providers
+	// (e.g. OpenAI 400s with "Unknown parameter: 'backend'."), so
+	// they must use omitempty to disappear from re-marshaled bodies
+	// in the cloud-proxy passthrough path.
 
 	// A grammar to constrain the LLM output
-	Grammar string `json:"grammar" yaml:"grammar"`
+	Grammar string `json:"grammar,omitempty" yaml:"grammar"`
 
-	JSONFunctionGrammarObject *functions.JSONFunctionStructure `json:"grammar_json_functions" yaml:"grammar_json_functions"`
+	JSONFunctionGrammarObject *functions.JSONFunctionStructure `json:"grammar_json_functions,omitempty" yaml:"grammar_json_functions"`
 
-	Backend string `json:"backend" yaml:"backend"`
+	Backend string `json:"backend,omitempty" yaml:"backend"`
 
-	ModelBaseName string `json:"model_base_name" yaml:"model_base_name"`
+	ModelBaseName string `json:"model_base_name,omitempty" yaml:"model_base_name"`
 
-	ReasoningEffort string `json:"reasoning_effort" yaml:"reasoning_effort"`
+	ReasoningEffort string `json:"reasoning_effort,omitempty" yaml:"reasoning_effort"`
 
-	Metadata map[string]string `json:"metadata" yaml:"metadata"`
+	Metadata map[string]string `json:"metadata,omitempty" yaml:"metadata"`
 }
 
 type ModelsDataResponse struct {
diff --git a/core/schema/prediction.go b/core/schema/prediction.go
index f0b2bda40..d6e4fabf3 100644
--- a/core/schema/prediction.go
+++ b/core/schema/prediction.go
@@ -98,7 +98,13 @@ type PredictionOptions struct {
 	MinP        *float64 `json:"min_p,omitempty" yaml:"min_p,omitempty"`
 	Temperature *float64 `json:"temperature,omitempty" yaml:"temperature,omitempty"`
 	Maxtokens   *int     `json:"max_tokens,omitempty" yaml:"max_tokens,omitempty"`
-	Echo        bool     `json:"echo,omitempty" yaml:"echo,omitempty"`
+	// MaxCompletionTokens is the modern alias for max_tokens
+	// (OpenAI deprecated max_tokens; gpt-5 / o-series reject it).
+	// Accepted on the wire so up-to-date clients can use the new
+	// name; the request middleware collapses it into Maxtokens so
+	// internal code reads exactly one field.
+	MaxCompletionTokens *int `json:"max_completion_tokens,omitempty" yaml:"-"`
+	Echo                bool `json:"echo,omitempty" yaml:"echo,omitempty"`
 
 	// Custom parameters - not present in the OpenAI API
 	Batch         int     `json:"batch,omitempty" yaml:"batch,omitempty"`
diff --git a/core/services/cloudproxy/backend_forward.go b/core/services/cloudproxy/backend_forward.go
new file mode 100644
index 000000000..841f3c95a
--- /dev/null
+++ b/core/services/cloudproxy/backend_forward.go
@@ -0,0 +1,237 @@
+package cloudproxy
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"time"
+
+	"github.com/labstack/echo/v4"
+	corebackend "github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http/auth"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	"github.com/mudler/LocalAI/core/trace"
+	pkggrpc "github.com/mudler/LocalAI/pkg/grpc"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/xlog"
+)
+
+// BuildStreamFilter constructs the per-request streaming PII filter
+// for a cloud-proxy forward. Returns nil when the request isn't
+// streaming, PII is disabled for this model, or no redactor is wired
+// up — callers pass the result through unchanged. correlationID is
+// caller-supplied because the OpenAI and Anthropic endpoints read it
+// from different headers.
+func BuildStreamFilter(c echo.Context, cfg *config.ModelConfig, isStream bool, piiRedactor *pii.Redactor, piiEvents pii.EventStore, correlationID string) *pii.StreamFilter {
+	if !isStream || piiRedactor == nil || !cfg.PIIIsEnabled() {
+		return nil
+	}
+	userID := ""
+	if u := auth.GetUser(c); u != nil {
+		userID = u.ID
+	}
+	var overrides map[string]pii.Action
+	if raw := cfg.PIIPatternOverrides(); len(raw) > 0 {
+		overrides = make(map[string]pii.Action, len(raw))
+		for ovid, action := range raw {
+			switch pii.Action(action) {
+			case pii.ActionMask, pii.ActionBlock, pii.ActionRouteLocal:
+				overrides[ovid] = pii.Action(action)
+			}
+		}
+	}
+	return pii.NewStreamFilter(piiRedactor, overrides, piiEvents, correlationID, userID)
+}
+
+// ForwardViaBackend loads the cloud-proxy gRPC backend, ships the
+// request via the Forward RPC, and pumps the response back to the
+// client through the SSE-aware PII pipeline.
+func ForwardViaBackend(
+	c echo.Context,
+	cfg *config.ModelConfig,
+	body []byte,
+	filter *pii.StreamFilter,
+	loader *model.ModelLoader,
+	appConfig *config.ApplicationConfig,
+) (resultErr error) {
+	// Passthrough forwards bypass core/backend/llm.go and therefore its
+	// trace.RecordBackendTrace call — instrument here so passthrough
+	// requests show up in the Traces UI alongside translate-mode ones.
+	// Named return is unusual for this package but lets the defer capture
+	// the final error across the function's many early-return paths
+	// without rewriting them.
+	var startTime time.Time
+	statusCode := 0
+	if appConfig.EnableTracing {
+		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
+		startTime = time.Now()
+	}
+	defer func() {
+		if !appConfig.EnableTracing {
+			return
+		}
+		errStr := ""
+		if resultErr != nil {
+			errStr = resultErr.Error()
+		}
+		data := map[string]any{
+			"mode":           cfg.Proxy.Mode,
+			"provider":       cfg.Proxy.Provider,
+			"upstream":       cfg.Proxy.UpstreamURL,
+			"upstream_model": cfg.Proxy.UpstreamModel,
+		}
+		if statusCode != 0 {
+			data["status"] = statusCode
+		}
+		trace.RecordBackendTrace(trace.BackendTrace{
+			Timestamp: startTime,
+			Duration:  time.Since(startTime),
+			Type:      trace.BackendTraceLLM,
+			ModelName: cfg.Name,
+			Backend:   cfg.Backend,
+			Summary:   trace.TruncateBytes(body, 200),
+			Body:      trace.TruncateBytes(body, trace.MaxTraceBodyBytes),
+			Error:     errStr,
+			Data:      data,
+		})
+	}()
+
+	if cfg.Proxy.UpstreamURL == "" {
+		return echo.NewHTTPError(http.StatusInternalServerError,
+			fmt.Sprintf("cloudproxy: proxy.upstream_url empty for model %q", cfg.Name))
+	}
+
+	body, err := rewriteModel(body, cfg.Proxy.UpstreamModel)
+	if err != nil {
+		return echo.NewHTTPError(http.StatusBadRequest, err.Error())
+	}
+
+	opts := corebackend.ModelOptions(*cfg, appConfig)
+	inferenceModel, err := loader.Load(opts...)
+	if err != nil {
+		return echo.NewHTTPError(http.StatusInternalServerError, "cloudproxy: load cloud-proxy backend: "+err.Error())
+	}
+	be, ok := inferenceModel.(pkggrpc.Backend)
+	if !ok {
+		return echo.NewHTTPError(http.StatusInternalServerError, "cloudproxy: cloud-proxy backend doesn't speak gRPC")
+	}
+
+	ctx := c.Request().Context()
+	stream, err := be.Forward(ctx)
+	if err != nil {
+		return echo.NewHTTPError(http.StatusBadGateway, "cloudproxy: open Forward stream: "+err.Error())
+	}
+
+	// Single request message — first carries path/method/headers + the
+	// full body. Cloud-proxy's upstream_url has the canonical path so
+	// the Path field is informational; backend uses upstream_url.
+	if err := stream.Send(&pb.ForwardRequest{
+		Path:      "",
+		Method:    http.MethodPost,
+		Headers:   []*pb.ForwardHeader{{Name: "Content-Type", Value: "application/json"}},
+		BodyChunk: body,
+	}); err != nil {
+		_ = stream.CloseSend()
+		return echo.NewHTTPError(http.StatusBadGateway, "cloudproxy: send request: "+err.Error())
+	}
+	if err := stream.CloseSend(); err != nil {
+		return echo.NewHTTPError(http.StatusBadGateway, "cloudproxy: close send: "+err.Error())
+	}
+
+	// First reply carries status + response headers. Subsequent replies
+	// carry body chunks. Wrap the remaining stream as an io.Reader so
+	// the existing forwardStream / forwardBuffered code paths apply
+	// unchanged.
+	first, err := stream.Recv()
+	if err != nil {
+		return echo.NewHTTPError(http.StatusBadGateway, "cloudproxy: recv first reply: "+err.Error())
+	}
+
+	statusCode = int(first.GetStatus())
+	contentType := ""
+	for _, h := range first.GetHeaders() {
+		if h != nil && h.GetName() != "" && http.CanonicalHeaderKey(h.GetName()) == "Content-Type" {
+			contentType = h.GetValue()
+			break
+		}
+	}
+	bodyReader := &forwardReader{stream: stream}
+
+	isStream := streaming(body)
+	logFn := xlog.Info
+	if statusCode >= 400 {
+		logFn = xlog.Warn
+	}
+	logFn("cloudproxy: forwarding via backend",
+		"model", cfg.Name,
+		"upstream", cfg.Proxy.UpstreamURL,
+		"upstream_model", cfg.Proxy.UpstreamModel,
+		"status", statusCode,
+		"stream", isStream)
+
+	if statusCode >= 400 {
+		return passthroughError(c, statusCode, contentType, bodyReader)
+	}
+	if isStream {
+		return forwardStream(c, bodyReader, cfg.Proxy.Provider, filter)
+	}
+	return forwardBuffered(c, statusCode, contentType, bodyReader)
+}
+
+// forwardReader adapts a Backend_ForwardClient into an io.ReadCloser.
+// Each ForwardReply carries a chunk of the upstream body; we accumulate
+// into a single buffer and serve it through Read.
+type forwardReader struct {
+	stream pkggrpc.ForwardClient
+	pos    int
+	buf    []byte
+	err    error
+}
+
+func (r *forwardReader) Read(p []byte) (int, error) {
+	if r.err != nil && r.pos >= len(r.buf) {
+		return 0, r.err
+	}
+	if r.pos >= len(r.buf) {
+		// Need a new chunk.
+		reply, err := r.stream.Recv()
+		if err != nil {
+			if errors.Is(err, io.EOF) {
+				r.err = io.EOF
+				return 0, io.EOF
+			}
+			r.err = err
+			return 0, err
+		}
+		r.buf = reply.GetBodyChunk()
+		r.pos = 0
+		if len(r.buf) == 0 {
+			// Zero-length chunk — try again rather than returning 0
+			// (some readers treat that as EOF).
+			return r.Read(p)
+		}
+	}
+	n := copy(p, r.buf[r.pos:])
+	r.pos += n
+	return n, nil
+}
+
+func (r *forwardReader) Close() error {
+	// Drain any remaining replies so the server-side goroutine isn't
+	// left blocked. The stream is request-scoped; when the parent
+	// context is cancelled (handler returns), Recv returns and we
+	// exit. A misbehaving backend that keeps emitting replies after
+	// cancellation is bounded by the iteration cap.
+	for i := 0; i < 1024; i++ {
+		if _, err := r.stream.Recv(); err != nil {
+			return nil
+		}
+		if r.stream.Context().Err() != nil {
+			return nil
+		}
+	}
+	return nil
+}
diff --git a/core/services/cloudproxy/backend_forward_test.go b/core/services/cloudproxy/backend_forward_test.go
new file mode 100644
index 000000000..6328dff2a
--- /dev/null
+++ b/core/services/cloudproxy/backend_forward_test.go
@@ -0,0 +1,160 @@
+package cloudproxy
+
+import (
+	"context"
+	"errors"
+	"io"
+	"testing"
+	"time"
+
+	. "github.com/onsi/gomega"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// scriptedForwardClient is a fake ForwardClient that returns a fixed
+// sequence of replies. Each Recv pops the next reply or returns the
+// terminal error. Used to drive forwardReader through scripted gRPC
+// responses without standing up a real backend.
+type scriptedForwardClient struct {
+	replies []*pb.ForwardReply
+	final   error
+	idx     int
+}
+
+func (s *scriptedForwardClient) Send(*pb.ForwardRequest) error { return nil }
+func (s *scriptedForwardClient) CloseSend() error              { return nil }
+func (s *scriptedForwardClient) Context() context.Context      { return context.Background() }
+func (s *scriptedForwardClient) Recv() (*pb.ForwardReply, error) {
+	if s.idx >= len(s.replies) {
+		if s.final != nil {
+			return nil, s.final
+		}
+		return nil, io.EOF
+	}
+	r := s.replies[s.idx]
+	s.idx++
+	return r, nil
+}
+
+func TestForwardReader_ConcatsChunks(t *testing.T) {
+	g := NewWithT(t)
+	r := &forwardReader{stream: &scriptedForwardClient{
+		replies: []*pb.ForwardReply{
+			{BodyChunk: []byte("hello ")},
+			{BodyChunk: []byte("world")},
+			{BodyChunk: []byte("!")},
+		},
+	}}
+	got, err := io.ReadAll(r)
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(string(got)).To(Equal("hello world!"))
+}
+
+func TestForwardReader_PartialReads(t *testing.T) {
+	g := NewWithT(t)
+	r := &forwardReader{stream: &scriptedForwardClient{
+		replies: []*pb.ForwardReply{
+			{BodyChunk: []byte("abcdefghij")},
+		},
+	}}
+	// Read 3 bytes at a time — exercises pos advancement within a chunk.
+	var out []byte
+	buf := make([]byte, 3)
+	for {
+		n, err := r.Read(buf)
+		out = append(out, buf[:n]...)
+		if errors.Is(err, io.EOF) {
+			break
+		}
+		g.Expect(err).NotTo(HaveOccurred())
+	}
+	g.Expect(string(out)).To(Equal("abcdefghij"))
+}
+
+func TestForwardReader_SkipsEmptyChunks(t *testing.T) {
+	g := NewWithT(t)
+	// Empty chunks must not be treated as EOF — backends may legitimately
+	// emit them (e.g. SSE keepalives, transport quirks).
+	r := &forwardReader{stream: &scriptedForwardClient{
+		replies: []*pb.ForwardReply{
+			{BodyChunk: nil},
+			{BodyChunk: []byte("data")},
+			{BodyChunk: []byte{}},
+			{BodyChunk: []byte("more")},
+		},
+	}}
+	got, err := io.ReadAll(r)
+	g.Expect(err).NotTo(HaveOccurred())
+	g.Expect(string(got)).To(Equal("datamore"))
+}
+
+// infiniteForwardClient simulates a misbehaving backend that never
+// stops emitting replies. Used to verify Close() doesn't spin forever.
+type infiniteForwardClient struct {
+	ctx   context.Context
+	calls int
+}
+
+func (s *infiniteForwardClient) Send(*pb.ForwardRequest) error { return nil }
+func (s *infiniteForwardClient) CloseSend() error              { return nil }
+func (s *infiniteForwardClient) Context() context.Context {
+	if s.ctx == nil {
+		return context.Background()
+	}
+	return s.ctx
+}
+func (s *infiniteForwardClient) Recv() (*pb.ForwardReply, error) {
+	s.calls++
+	return &pb.ForwardReply{BodyChunk: []byte("never-ending")}, nil
+}
+
+func TestForwardReader_CloseBoundedByIterationCap(t *testing.T) {
+	// Misbehaving backend that never returns EOF. Without the cap,
+	// Close() would loop forever. The cap is currently 1024.
+	g := NewWithT(t)
+	upstream := &infiniteForwardClient{}
+	r := &forwardReader{stream: upstream}
+
+	done := make(chan struct{})
+	go func() {
+		_ = r.Close()
+		close(done)
+	}()
+	g.Eventually(done, 2*time.Second).Should(BeClosed(), "Close did not return within 2s")
+	g.Expect(upstream.calls).To(BeNumerically("<=", 2048),
+		"Close drained %d replies; expected bounded near 1024", upstream.calls)
+}
+
+func TestForwardReader_CloseExitsOnContextCancel(t *testing.T) {
+	// Even before the iteration cap, a cancelled context should let
+	// Close() return — that's the request-scoped exit path.
+	g := NewWithT(t)
+	ctx, cancel := context.WithCancel(context.Background())
+	upstream := &infiniteForwardClient{ctx: ctx}
+	r := &forwardReader{stream: upstream}
+
+	cancel()
+	done := make(chan struct{})
+	go func() {
+		_ = r.Close()
+		close(done)
+	}()
+	g.Eventually(done, 2*time.Second).Should(BeClosed(), "Close did not return after context cancel")
+}
+
+func TestForwardReader_PropagatesError(t *testing.T) {
+	g := NewWithT(t)
+	wantErr := errors.New("upstream blew up")
+	r := &forwardReader{stream: &scriptedForwardClient{
+		replies: []*pb.ForwardReply{{BodyChunk: []byte("partial")}},
+		final:   wantErr,
+	}}
+	buf := make([]byte, 16)
+	n, err := r.Read(buf)
+	g.Expect(n).To(Equal(len("partial")))
+	g.Expect(string(buf[:n])).To(Equal("partial"))
+	g.Expect(err).NotTo(HaveOccurred())
+	_, err = r.Read(buf)
+	g.Expect(errors.Is(err, wantErr)).To(BeTrue())
+}
diff --git a/core/services/cloudproxy/build_filter_test.go b/core/services/cloudproxy/build_filter_test.go
new file mode 100644
index 000000000..c46d8a392
--- /dev/null
+++ b/core/services/cloudproxy/build_filter_test.go
@@ -0,0 +1,72 @@
+package cloudproxy
+
+import (
+	"net/http/httptest"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("BuildStreamFilter", func() {
+	var (
+		c  echo.Context
+		cfg *config.ModelConfig
+	)
+
+	BeforeEach(func() {
+		e := echo.New()
+		req := httptest.NewRequest("POST", "/v1/chat/completions", nil)
+		rec := httptest.NewRecorder()
+		c = e.NewContext(req, rec)
+		piiOn := true
+		cfg = &config.ModelConfig{
+			Backend: "cloud-proxy",
+			PII:     config.PIIConfig{Enabled: &piiOn},
+		}
+	})
+
+	// Three guards must each independently force a nil return — proves
+	// the gate is a logical AND, not an order-dependent short-circuit
+	// that silently activates one branch.
+	It("returns nil when isStream is false", func() {
+		patterns, err := pii.Compile(pii.DefaultPatterns())
+		Expect(err).NotTo(HaveOccurred())
+		r := pii.NewRedactor(patterns)
+		Expect(BuildStreamFilter(c, cfg, false, r, nil, "corr-1")).To(BeNil())
+	})
+
+	It("returns nil when piiRedactor is nil", func() {
+		Expect(BuildStreamFilter(c, cfg, true, nil, nil, "corr-1")).To(BeNil())
+	})
+
+	It("returns nil when the model has PII disabled", func() {
+		piiOff := false
+		cfg.PII.Enabled = &piiOff
+		patterns, err := pii.Compile(pii.DefaultPatterns())
+		Expect(err).NotTo(HaveOccurred())
+		r := pii.NewRedactor(patterns)
+		Expect(BuildStreamFilter(c, cfg, true, r, nil, "corr-1")).To(BeNil())
+	})
+
+	It("returns a configured filter when all preconditions hold", func() {
+		patterns, err := pii.Compile(pii.DefaultPatterns())
+		Expect(err).NotTo(HaveOccurred())
+		r := pii.NewRedactor(patterns)
+		store := pii.NewMemoryEventStore(8)
+		filter := BuildStreamFilter(c, cfg, true, r, store, "corr-xyz")
+		Expect(filter).NotTo(BeNil())
+	})
+
+	// Empty correlationID is allowed — some entry points don't have one.
+	// The filter must still construct so the stream can flow.
+	It("constructs a filter even when correlationID is empty", func() {
+		patterns, err := pii.Compile(pii.DefaultPatterns())
+		Expect(err).NotTo(HaveOccurred())
+		r := pii.NewRedactor(patterns)
+		Expect(BuildStreamFilter(c, cfg, true, r, nil, "")).NotTo(BeNil())
+	})
+})
diff --git a/core/services/cloudproxy/mitm/ca.go b/core/services/cloudproxy/mitm/ca.go
new file mode 100644
index 000000000..1dea43566
--- /dev/null
+++ b/core/services/cloudproxy/mitm/ca.go
@@ -0,0 +1,177 @@
+// Package mitm implements a TLS man-in-the-middle proxy that
+// applies per-request PII redaction to allowlisted LLM API hosts
+// while tunnelling everything else byte-for-byte.
+package mitm
+
+import (
+	"crypto/ecdsa"
+	"crypto/elliptic"
+	"crypto/rand"
+	"crypto/x509"
+	"crypto/x509/pkix"
+	"encoding/pem"
+	"fmt"
+	"math/big"
+	"os"
+	"path/filepath"
+	"sync"
+	"time"
+)
+
+type CA struct {
+	cert      *x509.Certificate
+	key       *ecdsa.PrivateKey
+	publicPEM []byte
+
+	mu     sync.Mutex
+	leaves map[string]*leafEntry
+}
+
+// LoadOrCreateCA loads the CA from dir if both files exist, or
+// generates a new ECDSA-P256 CA and persists it. The key file is
+// mode 0600.
+func LoadOrCreateCA(dir string) (*CA, error) {
+	if err := os.MkdirAll(dir, 0o700); err != nil {
+		return nil, fmt.Errorf("mitm: create ca dir %q: %w", dir, err)
+	}
+
+	certPath := filepath.Join(dir, "ca.crt")
+	keyPath := filepath.Join(dir, "ca.key")
+
+	certPEM, err1 := os.ReadFile(certPath)
+	keyPEM, err2 := os.ReadFile(keyPath)
+	if err1 == nil && err2 == nil {
+		ca, err := parseCA(certPEM, keyPEM)
+		if err == nil {
+			return ca, nil
+		}
+		// Fall through and regenerate. We don't auto-delete the
+		// existing files — the operator might have hand-edited
+		// them. Surface the parse error instead.
+		return nil, fmt.Errorf("mitm: parse existing CA at %s: %w (delete to regenerate)", dir, err)
+	}
+
+	ca, certPEMOut, keyPEMOut, err := generateCA()
+	if err != nil {
+		return nil, err
+	}
+	if err := os.WriteFile(certPath, certPEMOut, 0o644); err != nil {
+		return nil, fmt.Errorf("mitm: write ca cert %q: %w", certPath, err)
+	}
+	if err := os.WriteFile(keyPath, keyPEMOut, 0o600); err != nil {
+		return nil, fmt.Errorf("mitm: write ca key %q: %w", keyPath, err)
+	}
+	return ca, nil
+}
+
+func generateCA() (*CA, []byte, []byte, error) {
+	key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("mitm: generate ca key: %w", err)
+	}
+
+	serial, err := rand.Int(rand.Reader, new(big.Int).Lsh(big.NewInt(1), 128))
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("mitm: serial: %w", err)
+	}
+
+	now := time.Now().UTC()
+	tmpl := &x509.Certificate{
+		SerialNumber: serial,
+		Subject: pkix.Name{
+			CommonName:   "LocalAI MITM Proxy CA",
+			Organization: []string{"LocalAI"},
+		},
+		NotBefore:             now.Add(-1 * time.Hour),
+		NotAfter:              now.Add(10 * 365 * 24 * time.Hour),
+		KeyUsage:              x509.KeyUsageCertSign | x509.KeyUsageCRLSign | x509.KeyUsageDigitalSignature,
+		BasicConstraintsValid: true,
+		IsCA:                  true,
+		MaxPathLenZero:        true,
+	}
+
+	der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &key.PublicKey, key)
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("mitm: create ca cert: %w", err)
+	}
+	cert, err := x509.ParseCertificate(der)
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("mitm: re-parse ca cert: %w", err)
+	}
+
+	certPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der})
+	keyDER, err := x509.MarshalECPrivateKey(key)
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("mitm: marshal ca key: %w", err)
+	}
+	keyPEM := pem.EncodeToMemory(&pem.Block{Type: "EC PRIVATE KEY", Bytes: keyDER})
+
+	return &CA{
+		cert:      cert,
+		key:       key,
+		publicPEM: certPEM,
+		leaves:    make(map[string]*leafEntry),
+	}, certPEM, keyPEM, nil
+}
+
+// NewInMemoryCA mints an ephemeral CA for tests.
+func NewInMemoryCA() (*CA, error) {
+	ca, _, _, err := generateCA()
+	return ca, err
+}
+
+func parseCA(certPEM, keyPEM []byte) (*CA, error) {
+	certBlock, _ := pem.Decode(certPEM)
+	if certBlock == nil || certBlock.Type != "CERTIFICATE" {
+		return nil, fmt.Errorf("mitm: ca cert PEM block missing or wrong type")
+	}
+	cert, err := x509.ParseCertificate(certBlock.Bytes)
+	if err != nil {
+		return nil, fmt.Errorf("mitm: parse ca cert: %w", err)
+	}
+	if !cert.IsCA {
+		return nil, fmt.Errorf("mitm: stored cert at is not a CA")
+	}
+
+	keyBlock, _ := pem.Decode(keyPEM)
+	if keyBlock == nil {
+		return nil, fmt.Errorf("mitm: ca key PEM block missing")
+	}
+	var key *ecdsa.PrivateKey
+	switch keyBlock.Type {
+	case "EC PRIVATE KEY":
+		k, err := x509.ParseECPrivateKey(keyBlock.Bytes)
+		if err != nil {
+			return nil, fmt.Errorf("mitm: parse ec ca key: %w", err)
+		}
+		key = k
+	case "PRIVATE KEY":
+		k, err := x509.ParsePKCS8PrivateKey(keyBlock.Bytes)
+		if err != nil {
+			return nil, fmt.Errorf("mitm: parse pkcs8 ca key: %w", err)
+		}
+		ecKey, ok := k.(*ecdsa.PrivateKey)
+		if !ok {
+			return nil, fmt.Errorf("mitm: pkcs8 key is not ECDSA")
+		}
+		key = ecKey
+	default:
+		return nil, fmt.Errorf("mitm: unsupported ca key PEM type %q", keyBlock.Type)
+	}
+
+	return &CA{
+		cert:      cert,
+		key:       key,
+		publicPEM: certPEM,
+		leaves:    make(map[string]*leafEntry),
+	}, nil
+}
+
+// PublicCertPEM returns a copy of the PEM-encoded CA certificate.
+func (c *CA) PublicCertPEM() []byte {
+	out := make([]byte, len(c.publicPEM))
+	copy(out, c.publicPEM)
+	return out
+}
+
+func (c *CA) Cert() *x509.Certificate { return c.cert }
diff --git a/core/services/cloudproxy/mitm/ca_test.go b/core/services/cloudproxy/mitm/ca_test.go
new file mode 100644
index 000000000..308361919
--- /dev/null
+++ b/core/services/cloudproxy/mitm/ca_test.go
@@ -0,0 +1,79 @@
+package mitm
+
+import (
+	"crypto/x509"
+	"encoding/pem"
+	"os"
+	"path/filepath"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("LoadOrCreateCA", func() {
+	It("generates and persists", func() {
+		dir := GinkgoT().TempDir()
+
+		ca1, err := LoadOrCreateCA(dir)
+		Expect(err).NotTo(HaveOccurred(), "first call")
+		Expect(ca1.cert).NotTo(BeNil())
+		Expect(ca1.cert.IsCA).To(BeTrue(), "generated cert is not a CA")
+		// Files must be on disk after first call.
+		for _, name := range []string{"ca.crt", "ca.key"} {
+			path := filepath.Join(dir, name)
+			info, err := os.Stat(path)
+			Expect(err).NotTo(HaveOccurred(), "expected %s to exist", path)
+			mode := info.Mode().Perm()
+			if name == "ca.key" {
+				Expect(mode).To(Equal(os.FileMode(0o600)))
+			}
+		}
+
+		// Second load must round-trip the same cert (same serial number
+		// proves we read from disk rather than regenerating).
+		ca2, err := LoadOrCreateCA(dir)
+		Expect(err).NotTo(HaveOccurred(), "second call")
+		Expect(ca1.cert.SerialNumber.Cmp(ca2.cert.SerialNumber)).To(Equal(0), "second load regenerated instead of reading from disk")
+	})
+
+	It("rejects non-CA stored cert", func() {
+		dir := GinkgoT().TempDir()
+		// Write a non-CA leaf cert into the slot reserved for the CA.
+		ca, err := NewInMemoryCA()
+		Expect(err).NotTo(HaveOccurred())
+		leaf, err := ca.IssueLeaf("example.com")
+		Expect(err).NotTo(HaveOccurred())
+		leafPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: leaf.Certificate[0]})
+		Expect(os.WriteFile(filepath.Join(dir, "ca.crt"), leafPEM, 0o644)).To(Succeed())
+		// Pair with a key file so LoadOrCreateCA proceeds to parse.
+		Expect(os.WriteFile(filepath.Join(dir, "ca.key"), []byte("garbage"), 0o600)).To(Succeed())
+		_, err = LoadOrCreateCA(dir)
+		Expect(err).To(HaveOccurred(), "expected error for non-CA cert in CA slot")
+		Expect(strings.Contains(err.Error(), "delete to regenerate")).To(BeTrue(), "error should mention regenerate path")
+	})
+})
+
+var _ = Describe("PublicCertPEM", func() {
+	It("is a valid certificate", func() {
+		ca, err := NewInMemoryCA()
+		Expect(err).NotTo(HaveOccurred())
+		pemBytes := ca.PublicCertPEM()
+		block, _ := pem.Decode(pemBytes)
+		Expect(block).NotTo(BeNil())
+		Expect(block.Type).To(Equal("CERTIFICATE"))
+		cert, err := x509.ParseCertificate(block.Bytes)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(cert.IsCA).To(BeTrue(), "decoded cert is not a CA")
+	})
+
+	It("returns a copy", func() {
+		// Mutating the returned slice must not poison subsequent calls.
+		ca, err := NewInMemoryCA()
+		Expect(err).NotTo(HaveOccurred())
+		first := ca.PublicCertPEM()
+		first[0] = 0x00 // corrupt
+		second := ca.PublicCertPEM()
+		Expect(second[0]).NotTo(Equal(byte(0x00)), "PublicCertPEM aliased its cache; mutation leaked")
+	})
+})
diff --git a/core/services/cloudproxy/mitm/handler.go b/core/services/cloudproxy/mitm/handler.go
new file mode 100644
index 000000000..8d73fe735
--- /dev/null
+++ b/core/services/cloudproxy/mitm/handler.go
@@ -0,0 +1,442 @@
+package mitm
+
+import (
+	"bytes"
+	"context"
+	"crypto/tls"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"sync/atomic"
+	"time"
+
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/cloudproxy/ssewire"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	"github.com/mudler/LocalAI/core/services/routing/piiadapter"
+	"github.com/mudler/xlog"
+	"golang.org/x/net/http2"
+)
+
+// PIIHandlerOptions configures NewPIIHandler.
+type PIIHandlerOptions struct {
+	// Redactor is the regex PII redactor. nil disables redaction.
+	Redactor *pii.Redactor
+
+	// EventStore receives PIIEvent rows. nil discards events.
+	EventStore pii.EventStore
+
+	// UpstreamTLS overrides the tls.Config used when dialing the
+	// real upstream. Defaults to a system-trust HTTPS client.
+	UpstreamTLS *tls.Config
+
+	// CorrelationIDHeader names the request header carrying a
+	// caller-supplied correlation ID. Defaults to "X-Correlation-ID".
+	CorrelationIDHeader string
+
+	// DialHost optionally remaps the host used for the outbound
+	// upstream URL. Identity by default; tests inject a httptest
+	// listener address.
+	DialHost func(host string) string
+
+	// HostsWithPIIDisabled lists destination hosts whose request
+	// bodies should NOT run through the redactor. TLS termination,
+	// upstream forwarding, and audit events still happen — only the
+	// regex pass is bypassed. Useful for telemetry/probe endpoints
+	// whose bodies aren't PII-shaped.
+	HostsWithPIIDisabled []string
+}
+
+func NewPIIHandler(opts PIIHandlerOptions) InterceptHandler {
+	tlsCfg := opts.UpstreamTLS
+	if tlsCfg == nil {
+		tlsCfg = &tls.Config{NextProtos: []string{"h2", "http/1.1"}}
+	} else if len(tlsCfg.NextProtos) == 0 {
+		tlsCfg.NextProtos = []string{"h2", "http/1.1"}
+	}
+	transport := &http.Transport{
+		TLSClientConfig:   tlsCfg,
+		ForceAttemptHTTP2: true,
+	}
+	if err := http2.ConfigureTransport(transport); err != nil {
+		xlog.Debug("mitm: http2.ConfigureTransport failed", "error", err)
+	}
+
+	corrHeader := opts.CorrelationIDHeader
+	if corrHeader == "" {
+		corrHeader = "X-Correlation-ID"
+	}
+
+	dialHost := opts.DialHost
+	if dialHost == nil {
+		dialHost = func(h string) string { return h }
+	}
+
+	patternAction := map[string]pii.Action{}
+	if opts.Redactor != nil {
+		for _, p := range opts.Redactor.Patterns() {
+			patternAction[p.ID] = p.Action
+		}
+	}
+
+	piiDisabled := make(map[string]bool, len(opts.HostsWithPIIDisabled))
+	for _, h := range opts.HostsWithPIIDisabled {
+		piiDisabled[strings.ToLower(strings.TrimSpace(h))] = true
+	}
+
+	d := &piiDispatcher{
+		client:        &http.Client{Transport: transport},
+		redactor:      opts.Redactor,
+		store:         opts.EventStore,
+		patternAction: patternAction,
+		corrHeader:    corrHeader,
+		dialHost:      dialHost,
+		piiDisabled:   piiDisabled,
+	}
+	return d.serve
+}
+
+type piiDispatcher struct {
+	client        *http.Client
+	redactor      *pii.Redactor
+	store         pii.EventStore
+	patternAction map[string]pii.Action
+	corrHeader    string
+	dialHost      func(host string) string
+	piiDisabled   map[string]bool
+	eventSeq      atomic.Uint64
+}
+
+func (d *piiDispatcher) serve(w http.ResponseWriter, r *http.Request, host string) {
+	start := time.Now()
+	cw := &countingResponseWriter{ResponseWriter: w}
+	w = cw
+
+	var (
+		correlationID string
+		bytesSent     int64
+	)
+	defer func() {
+		d.recordTrafficEvent(host, correlationID, bytesSent, cw.bytes, cw.status, start)
+	}()
+
+	body, err := io.ReadAll(r.Body)
+	if err != nil {
+		http.Error(w, "mitm: read body: "+err.Error(), http.StatusBadGateway)
+		return
+	}
+	_ = r.Body.Close()
+
+	correlationID = r.Header.Get(d.corrHeader)
+	if correlationID == "" {
+		correlationID = r.Header.Get("x-request-id")
+	}
+
+	shape := classifyRequestShape(host, r.URL.Path)
+	if d.redactor != nil && shape != shapeUnknown && !d.piiDisabled[strings.ToLower(host)] {
+		redacted, blocked, err := d.redactRequest(body, shape, correlationID)
+		switch {
+		case err != nil:
+			xlog.Debug("mitm: redact request failed; forwarding unchanged", "host", host, "path", r.URL.Path, "error", err)
+		case blocked:
+			writePIIBlocked(w, correlationID)
+			return
+		default:
+			body = redacted
+		}
+	}
+
+	upstreamURL := "https://" + d.dialHost(host) + r.URL.RequestURI()
+	upstreamReq, err := http.NewRequestWithContext(r.Context(), r.Method, upstreamURL, bytes.NewReader(body))
+	if err != nil {
+		http.Error(w, "mitm: build upstream request: "+err.Error(), http.StatusBadGateway)
+		return
+	}
+	upstreamReq.Header = cloneHopByHopFiltered(r.Header)
+	upstreamReq.ContentLength = int64(len(body))
+	upstreamReq.Header.Set("Content-Length", fmt.Sprintf("%d", len(body)))
+	bytesSent = int64(len(body))
+
+	resp, err := d.client.Do(upstreamReq)
+	if err != nil {
+		http.Error(w, "mitm: upstream: "+err.Error(), http.StatusBadGateway)
+		return
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	for k, vs := range resp.Header {
+		if isHopByHop(k) || strings.EqualFold(k, "Transfer-Encoding") || strings.EqualFold(k, "Content-Length") {
+			continue
+		}
+		for _, v := range vs {
+			w.Header().Add(k, v)
+		}
+	}
+	w.WriteHeader(resp.StatusCode)
+
+	contentType := resp.Header.Get("Content-Type")
+	if shape != shapeUnknown && d.redactor != nil && isSSE(contentType) {
+		d.streamWithPII(w, resp.Body, shape, correlationID)
+		return
+	}
+
+	if isSSE(contentType) {
+		flusher, _ := w.(http.Flusher)
+		buf := make([]byte, 32*1024)
+		for {
+			n, rErr := resp.Body.Read(buf)
+			if n > 0 {
+				if _, wErr := w.Write(buf[:n]); wErr != nil {
+					return
+				}
+				if flusher != nil {
+					flusher.Flush()
+				}
+			}
+			if rErr != nil {
+				return
+			}
+		}
+	}
+
+	_, _ = io.Copy(w, resp.Body)
+}
+
+type requestShape int
+
+const (
+	shapeUnknown requestShape = iota
+	shapeOpenAIChat
+	shapeAnthropicMessages
+)
+
+func classifyRequestShape(host, path string) requestShape {
+	host = strings.ToLower(host)
+	switch {
+	case host == "api.openai.com" && strings.HasSuffix(path, "/v1/chat/completions"):
+		return shapeOpenAIChat
+	case host == "api.anthropic.com" && strings.HasSuffix(path, "/v1/messages"):
+		return shapeAnthropicMessages
+	}
+	return shapeUnknown
+}
+
+func (d *piiDispatcher) redactRequest(body []byte, shape requestShape, correlationID string) ([]byte, bool, error) {
+	var parsed any
+	var adapter pii.Adapter
+	switch shape {
+	case shapeOpenAIChat:
+		req := &schema.OpenAIRequest{}
+		if err := json.Unmarshal(body, req); err != nil {
+			return nil, false, fmt.Errorf("parse openai: %w", err)
+		}
+		parsed = req
+		adapter = piiadapter.OpenAI()
+	case shapeAnthropicMessages:
+		req := &schema.AnthropicRequest{}
+		if err := json.Unmarshal(body, req); err != nil {
+			return nil, false, fmt.Errorf("parse anthropic: %w", err)
+		}
+		parsed = req
+		adapter = piiadapter.Anthropic()
+	default:
+		return body, false, nil
+	}
+
+	texts := adapter.Scan(parsed)
+	if len(texts) == 0 {
+		return body, false, nil
+	}
+
+	updates := make([]pii.ScannedText, 0, len(texts))
+	blocked := false
+	for _, st := range texts {
+		if st.Text == "" {
+			continue
+		}
+		res := d.redactor.RedactWithOverrides(st.Text, nil)
+		if len(res.Spans) == 0 {
+			continue
+		}
+		d.recordEvents(res.Spans, correlationID)
+		if res.Blocked {
+			blocked = true
+		}
+		updates = append(updates, pii.ScannedText{Index: st.Index, Text: res.Redacted})
+	}
+
+	if len(updates) > 0 {
+		adapter.Apply(parsed, updates)
+	}
+
+	out, err := json.Marshal(parsed)
+	if err != nil {
+		return nil, false, fmt.Errorf("re-marshal: %w", err)
+	}
+	return out, blocked, nil
+}
+
+func (d *piiDispatcher) recordEvents(spans []pii.Span, correlationID string) {
+	if d.store == nil {
+		return
+	}
+	for _, span := range spans {
+		ev := pii.PIIEvent{
+			ID:            fmt.Sprintf("mitm_%s_%d", correlationID, d.eventSeq.Add(1)),
+			Kind:          pii.KindPII,
+			CorrelationID: correlationID,
+			Direction:     pii.DirectionIn,
+			PatternID:     span.Pattern,
+			ByteOffset:    span.Start,
+			Length:        span.End - span.Start,
+			HashPrefix:    span.HashPrefix,
+			Action:        d.patternAction[span.Pattern],
+			CreatedAt:     time.Now(),
+		}
+		if err := d.store.Record(context.Background(), ev); err != nil {
+			xlog.Debug("mitm: failed to record pii event", "error", err, "pattern", span.Pattern)
+		}
+	}
+}
+
+func (d *piiDispatcher) streamWithPII(w http.ResponseWriter, src io.Reader, shape requestShape, correlationID string) {
+	flusher, _ := w.(http.Flusher)
+	filter := pii.NewStreamFilter(d.redactor, nil, d.store, correlationID, "")
+
+	provider := ssewire.OpenAI
+	if shape == shapeAnthropicMessages {
+		provider = ssewire.Anthropic
+	}
+
+	emit := func(s string) {
+		_, _ = w.Write([]byte(s))
+		if flusher != nil {
+			flusher.Flush()
+		}
+	}
+
+	scanner := ssewire.NewScanner(src)
+	for scanner.Scan() {
+		ev := scanner.Event()
+		if ssewire.IsTerminalMarker(ev.DataLine, provider) {
+			if residual := filter.Drain(); residual != "" {
+				emit(ssewire.SynthResidualEvent(provider, residual))
+			}
+			emit(ev.Raw)
+			continue
+		}
+		out := ev.Raw
+		if ev.DataLine != "" {
+			rewritten, drop := ssewire.RewritePayload(ev.DataLine, provider, filter)
+			if drop {
+				continue
+			}
+			if rewritten != ev.DataLine {
+				out = strings.Replace(ev.Raw, ev.DataLine, rewritten, 1)
+			}
+		}
+		emit(out)
+	}
+	if residual := filter.Drain(); residual != "" {
+		emit(ssewire.SynthResidualEvent(provider, residual))
+	}
+}
+
+func writePIIBlocked(w http.ResponseWriter, correlationID string) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(http.StatusBadRequest)
+	resp := map[string]any{
+		"error": map[string]string{
+			"message": "request blocked by LocalAI MITM proxy (sensitive data detected)",
+			"type":    "pii_blocked",
+		},
+		"correlation_id": correlationID,
+	}
+	_ = json.NewEncoder(w).Encode(resp)
+}
+
+func isSSE(contentType string) bool {
+	return strings.HasPrefix(strings.TrimSpace(contentType), "text/event-stream")
+}
+
+// hopByHopHeaders are not forwarded by the proxy (RFC 7230 §6.1).
+var hopByHopHeaders = map[string]struct{}{
+	"Connection":          {},
+	"Keep-Alive":          {},
+	"Proxy-Authenticate":  {},
+	"Proxy-Authorization": {},
+	"Te":                  {},
+	"Trailers":            {},
+	"Transfer-Encoding":   {},
+	"Upgrade":             {},
+}
+
+func isHopByHop(name string) bool {
+	_, ok := hopByHopHeaders[http.CanonicalHeaderKey(name)]
+	return ok
+}
+
+// countingResponseWriter wraps an http.ResponseWriter to track the
+// total bytes written downstream and the status code. It implements
+// http.Flusher because the SSE paths flush per event; without that
+// the assertion `w.(http.Flusher)` would silently degrade to no-op.
+type countingResponseWriter struct {
+	http.ResponseWriter
+	bytes  int64
+	status int
+}
+
+func (w *countingResponseWriter) Write(p []byte) (int, error) {
+	if w.status == 0 {
+		w.status = http.StatusOK
+	}
+	n, err := w.ResponseWriter.Write(p)
+	w.bytes += int64(n)
+	return n, err
+}
+
+func (w *countingResponseWriter) WriteHeader(code int) {
+	w.status = code
+	w.ResponseWriter.WriteHeader(code)
+}
+
+func (w *countingResponseWriter) Flush() {
+	if f, ok := w.ResponseWriter.(http.Flusher); ok {
+		f.Flush()
+	}
+}
+
+func (d *piiDispatcher) recordTrafficEvent(host, correlationID string, sent, received int64, status int, start time.Time) {
+	if d.store == nil {
+		return
+	}
+	ev := pii.PIIEvent{
+		ID:            fmt.Sprintf("proxy_traffic_%s_%d", correlationID, d.eventSeq.Add(1)),
+		Kind:          pii.KindProxyTraffic,
+		CorrelationID: correlationID,
+		Host:          host,
+		BytesSent:     sent,
+		BytesReceived: received,
+		StatusCode:    status,
+		DurationMS:    time.Since(start).Milliseconds(),
+		CreatedAt:     time.Now(),
+	}
+	if err := d.store.Record(context.Background(), ev); err != nil {
+		xlog.Debug("mitm: failed to record proxy_traffic event", "error", err, "host", host)
+	}
+}
+
+func cloneHopByHopFiltered(in http.Header) http.Header {
+	out := make(http.Header, len(in))
+	for k, vs := range in {
+		if isHopByHop(k) {
+			continue
+		}
+		copied := make([]string, len(vs))
+		copy(copied, vs)
+		out[k] = copied
+	}
+	return out
+}
diff --git a/core/services/cloudproxy/mitm/handler_test.go b/core/services/cloudproxy/mitm/handler_test.go
new file mode 100644
index 000000000..b6177b0e9
--- /dev/null
+++ b/core/services/cloudproxy/mitm/handler_test.go
@@ -0,0 +1,329 @@
+package mitm
+
+import (
+	"context"
+	"crypto/tls"
+	"crypto/x509"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net"
+	"net/http"
+	"net/http/httptest"
+	"net/url"
+	"strings"
+
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// startPIITestRig is the same shape as startMITMTestRig but plugs
+// in the production PII handler instead of the passthrough fixture.
+// The "host" the client thinks it's reaching is forced to
+// api.anthropic.com so the request shape classifier matches.
+func startPIITestRig(upstream http.Handler) (*http.Client, string, *fakeStore, func()) {
+	// Upstream fake — plays the role of api.anthropic.com.
+	ts := httptest.NewTLSServer(upstream)
+	upstreamCertPool := x509.NewCertPool()
+	upstreamCertPool.AddCert(ts.Certificate())
+	upstreamURL, _ := url.Parse(ts.URL)
+
+	// Compiled patterns required for the redactor to actually fire
+	// (DefaultPatterns alone returns Pattern structs without regex).
+	patterns, err := pii.Compile(pii.DefaultPatterns())
+	ExpectWithOffset(1, err).NotTo(HaveOccurred())
+	redactor := pii.NewRedactor(patterns)
+	store := &fakeStore{}
+
+	ca, err := NewInMemoryCA()
+	ExpectWithOffset(1, err).NotTo(HaveOccurred())
+
+	// DialHost remaps the upstream dial target to the httptest
+	// fake while leaving the classifier-facing host
+	// ("api.anthropic.com") untouched. ServerName=example.com is
+	// what httptest.NewTLSServer issues its cert for.
+	upstreamHost := upstreamURL.Host
+	prodHandler := NewPIIHandler(PIIHandlerOptions{
+		Redactor:   redactor,
+		EventStore: store,
+		UpstreamTLS: &tls.Config{
+			RootCAs:    upstreamCertPool,
+			ServerName: "example.com",
+		},
+		DialHost: func(_ string) string { return upstreamHost },
+	})
+
+	srv, err := NewServer(Config{
+		Addr:           "127.0.0.1:0",
+		CA:             ca,
+		InterceptHosts: []string{"api.anthropic.com"},
+		Handler:        prodHandler,
+		EventStore:     store,
+	})
+	ExpectWithOffset(1, err).NotTo(HaveOccurred())
+	ExpectWithOffset(1, srv.Start()).To(Succeed())
+
+	clientPool := x509.NewCertPool()
+	clientPool.AddCert(ca.Cert())
+	proxyURL, _ := url.Parse("http://" + srv.Addr())
+	client := &http.Client{
+		Transport: &http.Transport{
+			Proxy:           http.ProxyURL(proxyURL),
+			TLSClientConfig: &tls.Config{RootCAs: clientPool},
+		},
+	}
+
+	cleanup := func() {
+		srv.Stop()
+		ts.Close()
+	}
+	// We point requests at api.anthropic.com so classifyRequestShape
+	// matches; the wrappedHandler retargets to the upstream fake.
+	return client, "https://api.anthropic.com", store, cleanup
+}
+
+type fakeStore struct{ events []pii.PIIEvent }
+
+func (s *fakeStore) Record(_ context.Context, ev pii.PIIEvent) error {
+	s.events = append(s.events, ev)
+	return nil
+}
+
+func (s *fakeStore) List(_ context.Context, _ pii.ListQuery) ([]pii.PIIEvent, error) {
+	return s.events, nil
+}
+
+func (s *fakeStore) Count(_ context.Context) (int, error) { return len(s.events), nil }
+func (s *fakeStore) Close() error                         { return nil }
+
+func (s *fakeStore) recorded() int { return len(s.events) }
+
+var _ = Describe("PIIHandler", func() {
+	It("redacts request email", func() {
+		var receivedBody []byte
+		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			receivedBody, _ = io.ReadAll(r.Body)
+			w.Header().Set("Content-Type", "application/json")
+			_, _ = fmt.Fprint(w, `{"id":"msg_x","content":[{"type":"text","text":"ok"}]}`)
+		})
+
+		client, base, store, cleanup := startPIITestRig(upstream)
+		defer cleanup()
+
+		body := `{"model":"claude-3-5-sonnet","max_tokens":100,"messages":[{"role":"user","content":"my email is alice@example.com please reply"}]}`
+		resp, err := client.Post(base+"/v1/messages", "application/json", strings.NewReader(body))
+		Expect(err).NotTo(HaveOccurred(), "client.Post")
+		defer func() { _ = resp.Body.Close() }()
+		Expect(resp.StatusCode).To(Equal(200))
+
+		Expect(string(receivedBody)).NotTo(ContainSubstring("alice@example.com"), "upstream received unredacted body")
+		Expect(string(receivedBody)).To(ContainSubstring("[REDACTED:email]"), "upstream did not see redaction marker")
+		Expect(store.recorded()).NotTo(BeZero(), "no PIIEvent recorded for the email match")
+	})
+
+	It("blocks api key in request", func() {
+		upstreamCalled := false
+		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			upstreamCalled = true
+			w.WriteHeader(200)
+		})
+
+		client, base, _, cleanup := startPIITestRig(upstream)
+		defer cleanup()
+
+		body := `{"model":"claude-3-5-sonnet","max_tokens":100,"messages":[{"role":"user","content":"my key is sk-abcdefghijklmnopqrstuvwxyz1234"}]}`
+		resp, err := client.Post(base+"/v1/messages", "application/json", strings.NewReader(body))
+		Expect(err).NotTo(HaveOccurred(), "client.Post")
+		defer func() { _ = resp.Body.Close() }()
+		Expect(resp.StatusCode).To(Equal(400), "api_key_prefix has Block default")
+		Expect(upstreamCalled).To(BeFalse(), "upstream was called despite block — proxy should short-circuit")
+		body2, _ := io.ReadAll(resp.Body)
+		Expect(string(body2)).To(ContainSubstring("pii_blocked"))
+	})
+
+	It("streaming redaction", func() {
+		// Anthropic-shape SSE; "alice@" + "example.com" splits the
+		// email across chunks so the StreamFilter has to buffer.
+		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.Header().Set("Content-Type", "text/event-stream")
+			w.WriteHeader(200)
+			flusher := w.(http.Flusher)
+			chunks := []string{
+				`{"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"contact me at alice@"}}`,
+				`{"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"example.com any time"}}`,
+				`{"type":"message_stop"}`,
+			}
+			for _, c := range chunks {
+				_, _ = fmt.Fprintf(w, "event: %s\ndata: %s\n\n", "content_block_delta", c)
+				flusher.Flush()
+			}
+		})
+
+		client, base, _, cleanup := startPIITestRig(upstream)
+		defer cleanup()
+
+		body := `{"model":"claude-3-5-sonnet","max_tokens":100,"stream":true,"messages":[{"role":"user","content":"hi"}]}`
+		resp, err := client.Post(base+"/v1/messages", "application/json", strings.NewReader(body))
+		Expect(err).NotTo(HaveOccurred(), "Post")
+		defer func() { _ = resp.Body.Close() }()
+		out, _ := io.ReadAll(resp.Body)
+		outStr := string(out)
+		Expect(outStr).NotTo(ContainSubstring("alice@example.com"), "email leaked through MITM stream")
+		Expect(outStr).To(ContainSubstring("[REDACTED:email]"), "redaction marker missing from MITM stream")
+	})
+
+	It("non-chat path passes through", func() {
+		// A path the classifier doesn't recognise (e.g. an OAuth
+		// callback) must forward the body verbatim, no PII parsing.
+		var receivedBody []byte
+		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			receivedBody, _ = io.ReadAll(r.Body)
+			w.Header().Set("Content-Type", "application/json")
+			_, _ = fmt.Fprint(w, `{"ok":true}`)
+		})
+
+		client, base, _, cleanup := startPIITestRig(upstream)
+		defer cleanup()
+
+		body := `{"email":"alice@example.com"}`
+		resp, err := client.Post(base+"/oauth/callback", "application/json", strings.NewReader(body))
+		Expect(err).NotTo(HaveOccurred())
+		defer func() { _ = resp.Body.Close() }()
+		Expect(string(receivedBody)).To(Equal(body), "body forwarded with mutation")
+	})
+})
+
+var _ = Describe("redactRequest", func() {
+	It("handles anthropic shape", func() {
+		patterns, _ := pii.Compile(pii.DefaultPatterns())
+		r := pii.NewRedactor(patterns)
+		body := []byte(`{"model":"claude","max_tokens":10,"messages":[{"role":"user","content":"reach me at bob@example.org"}]}`)
+
+		d := &piiDispatcher{redactor: r, patternAction: map[string]pii.Action{}}
+		out, blocked, err := d.redactRequest(body, shapeAnthropicMessages, "corr-1")
+		Expect(err).NotTo(HaveOccurred())
+		Expect(blocked).To(BeFalse(), "email is mask, not block — blocked should be false")
+		var parsed map[string]any
+		Expect(json.Unmarshal(out, &parsed)).To(Succeed())
+		msgs := parsed["messages"].([]any)
+		first := msgs[0].(map[string]any)
+		content, _ := first["content"].(string)
+		Expect(content).NotTo(ContainSubstring("bob@example.org"), "redaction did not run")
+	})
+})
+
+var _ = Describe("Proxy events", func() {
+	It("emits connect and traffic events", func() {
+		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.Header().Set("Content-Type", "application/json")
+			_, _ = fmt.Fprint(w, `{"id":"msg_x","content":[{"type":"text","text":"ok"}]}`)
+		})
+
+		client, base, store, cleanup := startPIITestRig(upstream)
+		defer cleanup()
+
+		body := `{"model":"claude-3-5-sonnet","max_tokens":10,"messages":[{"role":"user","content":"hi"}]}`
+		resp, err := client.Post(base+"/v1/messages", "application/json", strings.NewReader(body))
+		Expect(err).NotTo(HaveOccurred(), "client.Post")
+		defer func() { _ = resp.Body.Close() }()
+		_, _ = io.Copy(io.Discard, resp.Body)
+
+		var connect, traffic *pii.PIIEvent
+		for i := range store.events {
+			ev := &store.events[i]
+			switch ev.ResolvedKind() {
+			case pii.KindProxyConnect:
+				connect = ev
+			case pii.KindProxyTraffic:
+				traffic = ev
+			}
+		}
+
+		Expect(connect).NotTo(BeNil(), "no proxy_connect event recorded")
+		Expect(connect.Host).To(Equal("api.anthropic.com"))
+		Expect(connect.Intercepted).NotTo(BeNil())
+		Expect(*connect.Intercepted).To(BeTrue(), "connect.Intercepted should be true for an allowlisted host")
+
+		Expect(traffic).NotTo(BeNil(), "no proxy_traffic event recorded")
+		Expect(traffic.Host).To(Equal("api.anthropic.com"))
+		Expect(traffic.BytesSent).To(BeNumerically(">", 0))
+		Expect(traffic.BytesReceived).To(BeNumerically(">", 0))
+		Expect(traffic.StatusCode).To(Equal(200))
+	})
+
+	It("tunneled host emits connect event only", func() {
+		// A non-allowlisted CONNECT must record a proxy_connect with
+		// Intercepted=false and NOT a proxy_traffic event (tunneled
+		// bytes never reach the dispatcher).
+		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			_, _ = fmt.Fprint(w, "passthrough")
+		})
+		ts := httptest.NewTLSServer(upstream)
+		defer ts.Close()
+		upstreamURL, _ := url.Parse(ts.URL)
+		upstreamHost, _, _ := net.SplitHostPort(upstreamURL.Host)
+
+		ca, _ := NewInMemoryCA()
+		store := &fakeStore{}
+		srv, err := NewServer(Config{
+			Addr:           "127.0.0.1:0",
+			CA:             ca,
+			InterceptHosts: []string{"some-other-host"},
+			Handler:        func(w http.ResponseWriter, r *http.Request, h string) {},
+			EventStore:     store,
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(srv.Start()).To(Succeed())
+		defer srv.Stop()
+
+		upstreamCertPool := x509.NewCertPool()
+		upstreamCertPool.AddCert(ts.Certificate())
+		proxyURL, _ := url.Parse("http://" + srv.Addr())
+		client := &http.Client{
+			Transport: &http.Transport{
+				Proxy: http.ProxyURL(proxyURL),
+				TLSClientConfig: &tls.Config{
+					RootCAs:    upstreamCertPool,
+					ServerName: upstreamHost,
+				},
+			},
+		}
+		resp, err := client.Get(ts.URL)
+		Expect(err).NotTo(HaveOccurred(), "Get through tunnel")
+		_ = resp.Body.Close()
+
+		var connect *pii.PIIEvent
+		for i := range store.events {
+			ev := &store.events[i]
+			Expect(ev.ResolvedKind()).NotTo(Equal(pii.KindProxyTraffic), "unexpected proxy_traffic event for tunneled host: %+v", ev)
+			if ev.ResolvedKind() == pii.KindProxyConnect {
+				connect = ev
+			}
+		}
+		Expect(connect).NotTo(BeNil(), "no proxy_connect event recorded for tunneled host")
+		Expect(connect.Intercepted).NotTo(BeNil())
+		Expect(*connect.Intercepted).To(BeFalse(), "connect.Intercepted should be false (tunneled)")
+		Expect(connect.Host).NotTo(BeEmpty())
+	})
+})
+
+var _ = Describe("classifyRequestShape", func() {
+	cases := []struct {
+		host string
+		path string
+		want requestShape
+	}{
+		{"api.anthropic.com", "/v1/messages", shapeAnthropicMessages},
+		{"api.openai.com", "/v1/chat/completions", shapeOpenAIChat},
+		{"api.anthropic.com", "/v1/oauth/token", shapeUnknown},
+		{"api.openai.com", "/v1/embeddings", shapeUnknown},
+		{"example.com", "/v1/messages", shapeUnknown},
+	}
+	for _, c := range cases {
+		It(fmt.Sprintf("classifies (%q, %q)", c.host, c.path), func() {
+			Expect(classifyRequestShape(c.host, c.path)).To(Equal(c.want))
+		})
+	}
+})
diff --git a/core/services/cloudproxy/mitm/http2_test.go b/core/services/cloudproxy/mitm/http2_test.go
new file mode 100644
index 000000000..8eb70ff94
--- /dev/null
+++ b/core/services/cloudproxy/mitm/http2_test.go
@@ -0,0 +1,165 @@
+package mitm
+
+import (
+	"crypto/tls"
+	"crypto/x509"
+	"fmt"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"net/url"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"golang.org/x/net/http2"
+)
+
+// h2InterceptRig is the test fixture for HTTP/2 paths. Two things
+// differ from the H1.1 rig:
+//   - The client http.Transport has http2.ConfigureTransport called
+//     so it negotiates h2 with our proxy.
+//   - The upstream httptest server is started via StartTLS *and*
+//     manually configured for h2 (httptest does this by default in
+//     modern Go but we make it explicit for clarity).
+func h2InterceptRig(interceptHost string, upstream http.Handler) (*http.Client, string, func()) {
+	ts := httptest.NewUnstartedServer(upstream)
+	ts.EnableHTTP2 = true
+	ts.StartTLS()
+	upstreamCertPool := x509.NewCertPool()
+	upstreamCertPool.AddCert(ts.Certificate())
+	upstreamURL, _ := url.Parse(ts.URL)
+
+	ca, err := NewInMemoryCA()
+	ExpectWithOffset(1, err).NotTo(HaveOccurred())
+
+	srv, err := NewServer(Config{
+		Addr:           "127.0.0.1:0",
+		CA:             ca,
+		InterceptHosts: []string{interceptHost},
+		Handler:        passthroughHandler(upstreamCertPool, upstreamURL.Host),
+	})
+	ExpectWithOffset(1, err).NotTo(HaveOccurred())
+	ExpectWithOffset(1, srv.Start()).To(Succeed())
+
+	// Client with HTTP/2 explicitly enabled (modern net/http does
+	// this by default, but configuring the Transport directly makes
+	// the test independent of stdlib defaults).
+	clientPool := x509.NewCertPool()
+	clientPool.AddCert(ca.Cert())
+	proxyURL, _ := url.Parse("http://" + srv.Addr())
+	transport := &http.Transport{
+		Proxy: http.ProxyURL(proxyURL),
+		TLSClientConfig: &tls.Config{
+			RootCAs:    clientPool,
+			NextProtos: []string{"h2", "http/1.1"},
+		},
+		ForceAttemptHTTP2: true,
+	}
+	ExpectWithOffset(1, http2.ConfigureTransport(transport)).To(Succeed(), "client h2 configure")
+	client := &http.Client{Transport: transport}
+
+	cleanup := func() {
+		srv.Stop()
+		ts.Close()
+	}
+	return client, "https://" + interceptHost, cleanup
+}
+
+var _ = Describe("Proxy HTTP/2", func() {
+	It("negotiates HTTP/2", func() {
+		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			// The upstream side: when serving over h2, r.ProtoMajor == 2.
+			w.Header().Set("X-Upstream-Proto", r.Proto)
+			w.Header().Set("Content-Type", "application/json")
+			_, _ = fmt.Fprint(w, `{"ok":true}`)
+		})
+
+		client, base, cleanup := h2InterceptRig("api.test.local", upstream)
+		defer cleanup()
+
+		resp, err := client.Get(base + "/v1/test")
+		Expect(err).NotTo(HaveOccurred(), "Get")
+		defer func() { _ = resp.Body.Close() }()
+
+		// The proxy ↔ client leg: client sees h2 because we ALPN-
+		// negotiated it. resp.Proto is the protocol the client used.
+		Expect(resp.Proto).To(Equal("HTTP/2.0"), "proxy did not serve h2")
+		body, _ := io.ReadAll(resp.Body)
+		Expect(string(body)).To(ContainSubstring(`"ok":true`))
+	})
+
+	It("streams over HTTP/2", func() {
+		// h2 streaming: the proxy must flush each frame promptly. The
+		// upstream sends 3 SSE-style chunks; we read them back through
+		// a streaming decoder so a buffering bug would surface.
+		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.Header().Set("Content-Type", "text/event-stream")
+			w.WriteHeader(200)
+			flusher := w.(http.Flusher)
+			for _, msg := range []string{"first", "second", "third"} {
+				_, _ = fmt.Fprintf(w, "data: %s\n\n", msg)
+				flusher.Flush()
+			}
+		})
+
+		client, base, cleanup := h2InterceptRig("api.test.local", upstream)
+		defer cleanup()
+
+		resp, err := client.Get(base + "/stream")
+		Expect(err).NotTo(HaveOccurred(), "Get")
+		defer func() { _ = resp.Body.Close() }()
+
+		Expect(resp.Proto).To(Equal("HTTP/2.0"), "expected h2 for streaming response")
+		body, _ := io.ReadAll(resp.Body)
+		for _, msg := range []string{"first", "second", "third"} {
+			Expect(strings.Contains(string(body), "data: "+msg)).To(BeTrue(), "missing %q in h2 streamed body: %s", msg, body)
+		}
+	})
+
+	It("falls back to HTTP/1.1", func() {
+		// Force the client to negotiate h1.1 only, by overriding ALPN.
+		// Verifies the fallback path still works for legacy clients.
+		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			_, _ = fmt.Fprint(w, `{"ok":true}`)
+		})
+
+		ts := httptest.NewTLSServer(upstream)
+		defer ts.Close()
+		upstreamCertPool := x509.NewCertPool()
+		upstreamCertPool.AddCert(ts.Certificate())
+		upstreamURL, _ := url.Parse(ts.URL)
+
+		ca, _ := NewInMemoryCA()
+		srv, _ := NewServer(Config{
+			Addr:           "127.0.0.1:0",
+			CA:             ca,
+			InterceptHosts: []string{"api.test.local"},
+			Handler:        passthroughHandler(upstreamCertPool, upstreamURL.Host),
+		})
+		Expect(srv.Start()).To(Succeed())
+		defer srv.Stop()
+
+		clientPool := x509.NewCertPool()
+		clientPool.AddCert(ca.Cert())
+		proxyURL, _ := url.Parse("http://" + srv.Addr())
+		// ALPN intentionally restricted to http/1.1 to force the
+		// fallback path. Most clients will negotiate h2, but the
+		// proxy must keep h1 working for the rare case.
+		client := &http.Client{
+			Transport: &http.Transport{
+				Proxy: http.ProxyURL(proxyURL),
+				TLSClientConfig: &tls.Config{
+					RootCAs:    clientPool,
+					NextProtos: []string{"http/1.1"},
+				},
+			},
+		}
+		resp, err := client.Get("https://api.test.local/v1/test")
+		Expect(err).NotTo(HaveOccurred(), "Get")
+		defer func() { _ = resp.Body.Close() }()
+		Expect(resp.Proto).To(Equal("HTTP/1.1"))
+		body, _ := io.ReadAll(resp.Body)
+		Expect(string(body)).To(ContainSubstring(`"ok":true`))
+	})
+})
diff --git a/core/services/cloudproxy/mitm/leaf.go b/core/services/cloudproxy/mitm/leaf.go
new file mode 100644
index 000000000..4e542d6ba
--- /dev/null
+++ b/core/services/cloudproxy/mitm/leaf.go
@@ -0,0 +1,102 @@
+package mitm
+
+import (
+	"crypto/ecdsa"
+	"crypto/elliptic"
+	"crypto/rand"
+	"crypto/tls"
+	"crypto/x509"
+	"crypto/x509/pkix"
+	"fmt"
+	"math/big"
+	"net"
+	"strings"
+	"time"
+)
+
+type leafEntry struct {
+	cert      *tls.Certificate
+	expiresAt time.Time
+}
+
+const (
+	leafLifetime     = 30 * 24 * time.Hour
+	minBeforeReissue = 24 * time.Hour
+)
+
+// IssueLeaf returns a TLS certificate for host, signed by this CA.
+// Cached per host, re-minted when the cached cert is within
+// minBeforeReissue of expiry.
+func (c *CA) IssueLeaf(host string) (*tls.Certificate, error) {
+	if h, _, err := net.SplitHostPort(host); err == nil {
+		host = h
+	}
+	host = strings.ToLower(host)
+
+	now := time.Now()
+
+	c.mu.Lock()
+	if entry, ok := c.leaves[host]; ok {
+		if entry.expiresAt.After(now.Add(minBeforeReissue)) {
+			c.mu.Unlock()
+			return entry.cert, nil
+		}
+		delete(c.leaves, host)
+	}
+	c.mu.Unlock()
+
+	// Mint outside the lock so a slow ECDSA key-gen doesn't block
+	// concurrent lookups for already-cached hosts.
+	leaf, err := c.mintLeaf(host)
+	if err != nil {
+		return nil, err
+	}
+
+	c.mu.Lock()
+	c.leaves[host] = &leafEntry{
+		cert:      leaf,
+		expiresAt: now.Add(leafLifetime),
+	}
+	c.mu.Unlock()
+	return leaf, nil
+}
+
+func (c *CA) mintLeaf(host string) (*tls.Certificate, error) {
+	leafKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
+	if err != nil {
+		return nil, fmt.Errorf("mitm: leaf key for %q: %w", host, err)
+	}
+
+	serial, err := rand.Int(rand.Reader, new(big.Int).Lsh(big.NewInt(1), 128))
+	if err != nil {
+		return nil, fmt.Errorf("mitm: leaf serial: %w", err)
+	}
+
+	now := time.Now().UTC()
+	tmpl := &x509.Certificate{
+		SerialNumber: serial,
+		Subject:      pkix.Name{CommonName: host},
+		NotBefore:    now.Add(-1 * time.Hour),
+		NotAfter:     now.Add(leafLifetime),
+		KeyUsage:     x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment,
+		ExtKeyUsage: []x509.ExtKeyUsage{
+			x509.ExtKeyUsageServerAuth,
+		},
+		BasicConstraintsValid: true,
+	}
+	if ip := net.ParseIP(host); ip != nil {
+		tmpl.IPAddresses = []net.IP{ip}
+	} else {
+		tmpl.DNSNames = []string{host}
+	}
+
+	der, err := x509.CreateCertificate(rand.Reader, tmpl, c.cert, &leafKey.PublicKey, c.key)
+	if err != nil {
+		return nil, fmt.Errorf("mitm: sign leaf for %q: %w", host, err)
+	}
+
+	return &tls.Certificate{
+		Certificate: [][]byte{der, c.cert.Raw},
+		PrivateKey:  leafKey,
+	}, nil
+}
diff --git a/core/services/cloudproxy/mitm/leaf_test.go b/core/services/cloudproxy/mitm/leaf_test.go
new file mode 100644
index 000000000..3a1bd9b05
--- /dev/null
+++ b/core/services/cloudproxy/mitm/leaf_test.go
@@ -0,0 +1,103 @@
+package mitm
+
+import (
+	"crypto/tls"
+	"crypto/x509"
+	"net"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("IssueLeaf", func() {
+	It("chains to CA", func() {
+		ca, err := NewInMemoryCA()
+		Expect(err).NotTo(HaveOccurred())
+		leaf, err := ca.IssueLeaf("api.anthropic.com")
+		Expect(err).NotTo(HaveOccurred())
+		Expect(len(leaf.Certificate)).To(BeNumerically(">=", 1), "leaf has no DER")
+		parsed, err := x509.ParseCertificate(leaf.Certificate[0])
+		Expect(err).NotTo(HaveOccurred())
+		// Verify it's actually signed by the CA we generated.
+		pool := x509.NewCertPool()
+		pool.AddCert(ca.Cert())
+		_, err = parsed.Verify(x509.VerifyOptions{
+			Roots:     pool,
+			KeyUsages: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
+			DNSName:   "api.anthropic.com",
+		})
+		Expect(err).NotTo(HaveOccurred(), "verify chain")
+	})
+
+	It("populates DNS and IP SANs correctly", func() {
+		ca, err := NewInMemoryCA()
+		Expect(err).NotTo(HaveOccurred())
+
+		// Hostname → DNSNames
+		leafDNS, err := ca.IssueLeaf("example.com")
+		Expect(err).NotTo(HaveOccurred())
+		parsedDNS, _ := x509.ParseCertificate(leafDNS.Certificate[0])
+		Expect(parsedDNS.DNSNames).NotTo(BeEmpty())
+		Expect(parsedDNS.DNSNames[0]).To(Equal("example.com"))
+		Expect(parsedDNS.IPAddresses).To(BeEmpty(), "hostname leaf should have no IP SAN")
+
+		// IP → IPAddresses
+		leafIP, err := ca.IssueLeaf("127.0.0.1")
+		Expect(err).NotTo(HaveOccurred())
+		parsedIP, _ := x509.ParseCertificate(leafIP.Certificate[0])
+		Expect(parsedIP.IPAddresses).NotTo(BeEmpty())
+		Expect(parsedIP.IPAddresses[0].Equal(net.ParseIP("127.0.0.1"))).To(BeTrue())
+		Expect(parsedIP.DNSNames).To(BeEmpty(), "IP leaf should have no DNS SAN")
+	})
+
+	It("caches by host", func() {
+		ca, err := NewInMemoryCA()
+		Expect(err).NotTo(HaveOccurred())
+		a, _ := ca.IssueLeaf("api.example.com")
+		b, _ := ca.IssueLeaf("api.example.com")
+		Expect(a).To(BeIdenticalTo(b), "expected cached leaf to be returned, got distinct certs")
+		c, _ := ca.IssueLeaf("API.Example.com") // case-insensitive
+		Expect(a).To(BeIdenticalTo(c), "expected case-insensitive cache hit")
+		d, _ := ca.IssueLeaf("api.example.com:443") // host:port stripped
+		Expect(a).To(BeIdenticalTo(d), "expected port-stripped cache hit")
+	})
+
+	It("handshake accepted by client", func() {
+		// End-to-end check: a TLS server using the leaf, with a client
+		// trusting the CA, completes a handshake. This is the property
+		// every other flow in this package depends on.
+		ca, err := NewInMemoryCA()
+		Expect(err).NotTo(HaveOccurred())
+		leaf, err := ca.IssueLeaf("localhost")
+		Expect(err).NotTo(HaveOccurred())
+
+		pool := x509.NewCertPool()
+		pool.AddCert(ca.Cert())
+
+		listener, err := tls.Listen("tcp", "127.0.0.1:0", &tls.Config{
+			Certificates: []tls.Certificate{*leaf},
+		})
+		Expect(err).NotTo(HaveOccurred())
+		defer func() { _ = listener.Close() }()
+
+		go func() {
+			conn, err := listener.Accept()
+			if err != nil {
+				return
+			}
+			defer func() { _ = conn.Close() }()
+			_, _ = conn.Write([]byte("ok"))
+		}()
+
+		conn, err := tls.Dial("tcp", listener.Addr().String(), &tls.Config{
+			RootCAs:    pool,
+			ServerName: "localhost",
+		})
+		Expect(err).NotTo(HaveOccurred(), "client TLS dial")
+		defer func() { _ = conn.Close() }()
+		buf := make([]byte, 2)
+		_, err = conn.Read(buf)
+		Expect(err).NotTo(HaveOccurred(), "read")
+		Expect(string(buf)).To(Equal("ok"))
+	})
+})
diff --git a/core/services/cloudproxy/mitm/mitm_suite_test.go b/core/services/cloudproxy/mitm/mitm_suite_test.go
new file mode 100644
index 000000000..aeb019112
--- /dev/null
+++ b/core/services/cloudproxy/mitm/mitm_suite_test.go
@@ -0,0 +1,13 @@
+package mitm
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestMitm(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "mitm test suite")
+}
diff --git a/core/services/cloudproxy/mitm/proxy.go b/core/services/cloudproxy/mitm/proxy.go
new file mode 100644
index 000000000..79f49aa64
--- /dev/null
+++ b/core/services/cloudproxy/mitm/proxy.go
@@ -0,0 +1,306 @@
+package mitm
+
+import (
+	"bufio"
+	"context"
+	"crypto/tls"
+	"errors"
+	"fmt"
+	"io"
+	"net"
+	"net/http"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	"github.com/mudler/xlog"
+	"golang.org/x/net/http2"
+)
+
+// Server is an HTTPS forward proxy that MITMs traffic for hosts
+// in its intercept allowlist; non-allowlisted hosts get a plain
+// TCP CONNECT tunnel.
+type Server struct {
+	addr            string
+	ca              *CA
+	interceptHosts  map[string]bool
+	handler         InterceptHandler
+	connectTimeout  time.Duration
+	dialTimeout     time.Duration
+	upstreamTLS     *tls.Config
+	events          pii.EventStore
+	eventSeq        atomic.Uint64
+
+	listener net.Listener
+	srv      *http.Server
+
+	wg       sync.WaitGroup
+	stopOnce sync.Once
+	stopped  chan struct{}
+}
+
+// InterceptHandler runs after the proxy terminates TLS for an
+// allowlisted host. It is responsible for forwarding the upstream
+// response bytes back to w.
+type InterceptHandler func(w http.ResponseWriter, r *http.Request, upstreamHost string)
+
+type Config struct {
+	Addr           string
+	CA             *CA
+	InterceptHosts []string
+	Handler        InterceptHandler
+	// EventStore optionally receives a proxy_connect event for every
+	// CONNECT, recording the destination host and whether the proxy
+	// intercepted or tunneled it. nil disables connect-event recording.
+	EventStore pii.EventStore
+}
+
+func NewServer(cfg Config) (*Server, error) {
+	if cfg.CA == nil {
+		return nil, errors.New("mitm: NewServer: CA is required")
+	}
+	if cfg.Handler == nil {
+		return nil, errors.New("mitm: NewServer: Handler is required")
+	}
+	hosts := make(map[string]bool, len(cfg.InterceptHosts))
+	for _, h := range cfg.InterceptHosts {
+		hosts[strings.ToLower(strings.TrimSpace(h))] = true
+	}
+	return &Server{
+		addr:           cfg.Addr,
+		ca:             cfg.CA,
+		interceptHosts: hosts,
+		handler:        cfg.Handler,
+		connectTimeout: 30 * time.Second,
+		dialTimeout:    15 * time.Second,
+		upstreamTLS:    &tls.Config{NextProtos: []string{"http/1.1"}},
+		events:         cfg.EventStore,
+		stopped:        make(chan struct{}),
+	}, nil
+}
+
+func (s *Server) Start() error {
+	ln, err := net.Listen("tcp", s.addr)
+	if err != nil {
+		return fmt.Errorf("mitm: listen %q: %w", s.addr, err)
+	}
+	s.listener = ln
+	s.srv = &http.Server{
+		Handler:           http.HandlerFunc(s.handle),
+		ReadHeaderTimeout: 30 * time.Second,
+	}
+	s.wg.Add(1)
+	go func() {
+		defer s.wg.Done()
+		err := s.srv.Serve(ln)
+		if err != nil && !errors.Is(err, http.ErrServerClosed) {
+			xlog.Error("mitm: serve error", "error", err)
+		}
+	}()
+	xlog.Info("mitm: listening", "addr", ln.Addr().String(), "intercept_hosts", len(s.interceptHosts))
+	return nil
+}
+
+// Addr returns the bound listener address. Useful when Start was
+// called with ":0" — the kernel picks a port and tests need to
+// discover which.
+func (s *Server) Addr() string {
+	if s.listener == nil {
+		return s.addr
+	}
+	return s.listener.Addr().String()
+}
+
+// Stop is idempotent.
+func (s *Server) Stop() {
+	s.stopOnce.Do(func() {
+		close(s.stopped)
+		if s.srv != nil {
+			_ = s.srv.Close()
+		}
+		s.wg.Wait()
+	})
+}
+
+func (s *Server) handle(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodConnect {
+		http.Error(w, "this proxy only supports HTTPS via CONNECT", http.StatusMethodNotAllowed)
+		return
+	}
+
+	host, _, err := net.SplitHostPort(r.Host)
+	if err != nil {
+		host = r.Host
+	}
+	host = strings.ToLower(host)
+
+	intercept := s.shouldIntercept(host)
+	s.recordConnectEvent(host, intercept)
+	if !intercept {
+		s.handleTunnel(w, r)
+		return
+	}
+	s.handleIntercept(w, r, host)
+}
+
+// recordConnectEvent writes a proxy_connect audit row. Best-effort —
+// store errors are logged at debug only so a failing recorder cannot
+// break a CONNECT.
+func (s *Server) recordConnectEvent(host string, intercepted bool) {
+	if s.events == nil {
+		return
+	}
+	flag := intercepted
+	ev := pii.PIIEvent{
+		ID:          fmt.Sprintf("proxy_connect_%d", s.eventSeq.Add(1)),
+		Kind:        pii.KindProxyConnect,
+		Host:        host,
+		Intercepted: &flag,
+		CreatedAt:   time.Now(),
+	}
+	if err := s.events.Record(context.Background(), ev); err != nil {
+		xlog.Debug("mitm: failed to record proxy_connect event", "error", err, "host", host)
+	}
+}
+
+// shouldIntercept reports whether host is in the allowlist. An
+// empty allowlist tunnels everything.
+func (s *Server) shouldIntercept(host string) bool {
+	if len(s.interceptHosts) == 0 {
+		return false
+	}
+	return s.interceptHosts[host]
+}
+
+func (s *Server) handleTunnel(w http.ResponseWriter, r *http.Request) {
+	upstream, err := net.DialTimeout("tcp", normalizeHostPort(r.Host), s.dialTimeout)
+	if err != nil {
+		http.Error(w, "mitm: tunnel dial: "+err.Error(), http.StatusBadGateway)
+		return
+	}
+	defer func() { _ = upstream.Close() }()
+
+	hijacker, ok := w.(http.Hijacker)
+	if !ok {
+		http.Error(w, "mitm: hijack unsupported", http.StatusInternalServerError)
+		return
+	}
+	clientConn, _, err := hijacker.Hijack()
+	if err != nil {
+		http.Error(w, "mitm: hijack failed: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	defer func() { _ = clientConn.Close() }()
+
+	if _, err := clientConn.Write([]byte("HTTP/1.1 200 Connection established\r\n\r\n")); err != nil {
+		return
+	}
+
+	pipe(clientConn, upstream)
+}
+
+func pipe(a, b net.Conn) {
+	done := make(chan struct{}, 2)
+	go func() {
+		_, _ = io.Copy(a, b)
+		_ = a.SetDeadline(time.Now())
+		done <- struct{}{}
+	}()
+	go func() {
+		_, _ = io.Copy(b, a)
+		_ = b.SetDeadline(time.Now())
+		done <- struct{}{}
+	}()
+	<-done
+}
+
+func (s *Server) handleIntercept(w http.ResponseWriter, r *http.Request, host string) {
+	leaf, err := s.ca.IssueLeaf(host)
+	if err != nil {
+		http.Error(w, "mitm: leaf issuance failed: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	hijacker, ok := w.(http.Hijacker)
+	if !ok {
+		http.Error(w, "mitm: hijack unsupported", http.StatusInternalServerError)
+		return
+	}
+	clientConn, _, err := hijacker.Hijack()
+	if err != nil {
+		http.Error(w, "mitm: hijack failed: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	defer func() { _ = clientConn.Close() }()
+
+	if _, err := clientConn.Write([]byte("HTTP/1.1 200 Connection established\r\n\r\n")); err != nil {
+		return
+	}
+
+	tlsConn := tls.Server(clientConn, &tls.Config{
+		Certificates: []tls.Certificate{*leaf},
+		NextProtos:   []string{"h2", "http/1.1"},
+	})
+	defer func() { _ = tlsConn.Close() }()
+
+	// Deadline applies to the handshake only; cleared before the
+	// request loop so long-running streams don't get cut off. Fail
+	// closed if SetDeadline errors — better than handshaking without
+	// a deadline.
+	if err := tlsConn.SetDeadline(time.Now().Add(s.connectTimeout)); err != nil {
+		xlog.Debug("mitm: TLS handshake set-deadline failed", "host", host, "error", err)
+		return
+	}
+	if err := tlsConn.Handshake(); err != nil {
+		xlog.Debug("mitm: TLS handshake failed", "host", host, "error", err)
+		return
+	}
+	_ = tlsConn.SetDeadline(time.Time{})
+
+	handler := http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
+		req.URL.Scheme = "https"
+		if req.URL.Host == "" {
+			req.URL.Host = req.Host
+		}
+		s.handler(rw, req, host)
+	})
+
+	switch tlsConn.ConnectionState().NegotiatedProtocol {
+	case "h2":
+		h2srv := &http2.Server{}
+		h2srv.ServeConn(tlsConn, &http2.ServeConnOpts{
+			Handler: handler,
+			Context: r.Context(),
+		})
+	default:
+		s.serveHTTP1(tlsConn, handler, host)
+	}
+}
+
+func (s *Server) serveHTTP1(tlsConn *tls.Conn, handler http.Handler, host string) {
+	br := bufio.NewReader(tlsConn)
+	for {
+		req, err := http.ReadRequest(br)
+		if err != nil {
+			if !errors.Is(err, io.EOF) {
+				xlog.Debug("mitm: read request", "host", host, "error", err)
+			}
+			return
+		}
+		rw := newConnResponseWriter(tlsConn, req)
+		handler.ServeHTTP(rw, req)
+		rw.finish()
+		if req.Close || rw.closeAfter {
+			return
+		}
+	}
+}
+
+func normalizeHostPort(host string) string {
+	if _, _, err := net.SplitHostPort(host); err == nil {
+		return host
+	}
+	return host + ":443"
+}
diff --git a/core/services/cloudproxy/mitm/proxy_test.go b/core/services/cloudproxy/mitm/proxy_test.go
new file mode 100644
index 000000000..7f4cb9fcf
--- /dev/null
+++ b/core/services/cloudproxy/mitm/proxy_test.go
@@ -0,0 +1,278 @@
+package mitm
+
+import (
+	"crypto/tls"
+	"crypto/x509"
+	"fmt"
+	"io"
+	"net"
+	"net/http"
+	"net/http/httptest"
+	"net/url"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// passthroughHandler is the test fixture: forward the parsed
+// request to the upstream and stream the response back. Mirrors
+// what a production handler would do without any PII rewriting,
+// so the proxy core's CONNECT/TLS/req-loop semantics are testable
+// in isolation from the redaction logic.
+func passthroughHandler(upstreamRoots *x509.CertPool, upstreamAddr string) InterceptHandler {
+	return func(w http.ResponseWriter, r *http.Request, host string) {
+		// Build the upstream URL — host is what the client thought
+		// it was talking to (api.anthropic.com); upstreamAddr is
+		// where the test fake actually lives. We use upstreamAddr
+		// directly because the test fake's cert is self-signed
+		// against an arbitrary CA we control.
+		u := *r.URL
+		u.Scheme = "https"
+		u.Host = upstreamAddr
+
+		body := r.Body
+		req, err := http.NewRequest(r.Method, u.String(), body)
+		if err != nil {
+			http.Error(w, "bad request: "+err.Error(), http.StatusBadRequest)
+			return
+		}
+		req.Header = r.Header.Clone()
+		req.Header.Set("Host", host)
+
+		client := &http.Client{
+			Transport: &http.Transport{
+				TLSClientConfig: &tls.Config{
+					RootCAs: upstreamRoots,
+					// httptest.NewTLSServer issues a cert for
+					// example.com / *.example.com regardless of the
+					// listener's actual hostname. Trust that name
+					// rather than the SNI the client used —
+					// production code would set ServerName=host.
+					ServerName: "example.com",
+				},
+			},
+			Timeout: 10 * time.Second,
+		}
+		resp, err := client.Do(req)
+		if err != nil {
+			http.Error(w, "upstream: "+err.Error(), http.StatusBadGateway)
+			return
+		}
+		defer func() { _ = resp.Body.Close() }()
+
+		for k, vs := range resp.Header {
+			for _, v := range vs {
+				w.Header().Add(k, v)
+			}
+		}
+		w.WriteHeader(resp.StatusCode)
+		_, _ = io.Copy(w, resp.Body)
+	}
+}
+
+// startMITMTestRig spins up:
+//   - A fake "upstream" HTTPS server with a self-signed cert
+//   - A MITM proxy that intercepts the upstream's hostname
+//
+// Returns a client http.Client whose Transport points at the proxy
+// and trusts the MITM CA, plus the upstream URL the client should
+// use. Callers tear down with the returned cleanup.
+func startMITMTestRig(interceptHost string, upstream http.Handler) (*http.Client, string, func()) {
+	// Upstream: real TLS server with its own cert. Trust this
+	// from the proxy's outbound side only.
+	ts := httptest.NewTLSServer(upstream)
+	upstreamCertPool := x509.NewCertPool()
+	upstreamCertPool.AddCert(ts.Certificate())
+	upstreamURL, _ := url.Parse(ts.URL)
+
+	ca, err := NewInMemoryCA()
+	ExpectWithOffset(1, err).NotTo(HaveOccurred())
+
+	srv, err := NewServer(Config{
+		Addr:           "127.0.0.1:0",
+		CA:             ca,
+		InterceptHosts: []string{interceptHost},
+		Handler:        passthroughHandler(upstreamCertPool, upstreamURL.Host),
+	})
+	ExpectWithOffset(1, err).NotTo(HaveOccurred())
+	ExpectWithOffset(1, srv.Start()).To(Succeed())
+
+	// Client side: trust the MITM CA so the proxied TLS handshake
+	// succeeds. Configure HTTPS_PROXY to the proxy listener.
+	clientPool := x509.NewCertPool()
+	clientPool.AddCert(ca.Cert())
+	proxyURL, _ := url.Parse("http://" + srv.Addr())
+	client := &http.Client{
+		Transport: &http.Transport{
+			Proxy:           http.ProxyURL(proxyURL),
+			TLSClientConfig: &tls.Config{RootCAs: clientPool},
+		},
+		Timeout: 10 * time.Second,
+	}
+
+	cleanup := func() {
+		srv.Stop()
+		ts.Close()
+	}
+	return client, "https://" + interceptHost, cleanup
+}
+
+var _ = Describe("Proxy", func() {
+	It("intercepts allowlisted host", func() {
+		captured := false
+		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			captured = true
+			// Upstream receives whatever Host header the proxy
+			// forwarded — in production this would be the real
+			// hostname; in this test it's the upstream's listener.
+			// We just verify *some* request landed at the upstream.
+			w.Header().Set("Content-Type", "application/json")
+			_, _ = fmt.Fprint(w, `{"ok":true}`)
+		})
+
+		client, baseURL, cleanup := startMITMTestRig("api.test.local", upstream)
+		defer cleanup()
+
+		resp, err := client.Get(baseURL + "/v1/test")
+		Expect(err).NotTo(HaveOccurred(), "client.Get")
+		defer func() { _ = resp.Body.Close() }()
+
+		Expect(resp.StatusCode).To(Equal(200))
+		body, _ := io.ReadAll(resp.Body)
+		Expect(string(body)).To(ContainSubstring(`"ok":true`))
+		Expect(captured).To(BeTrue(), "upstream handler was never called — proxy did not forward")
+	})
+
+	It("tunnels non-allowlisted host", func() {
+		// Set up a "different" upstream we don't put in the allowlist.
+		// The proxy should tunnel CONNECTs to it without TLS termination,
+		// so we need to dial through the proxy and verify the upstream
+		// sees the raw TLS — the MITM CA isn't used.
+		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			_, _ = fmt.Fprint(w, `passthrough`)
+		})
+		ts := httptest.NewTLSServer(upstream)
+		defer ts.Close()
+		upstreamURL, _ := url.Parse(ts.URL)
+		upstreamHost, upstreamPort, _ := net.SplitHostPort(upstreamURL.Host)
+
+		ca, _ := NewInMemoryCA()
+		srv, err := NewServer(Config{
+			Addr: "127.0.0.1:0",
+			CA:   ca,
+			// Allowlist only "api.test.local" — upstream's host is NOT
+			// on it, so CONNECT to it must tunnel.
+			InterceptHosts: []string{"api.test.local"},
+			Handler:        func(w http.ResponseWriter, r *http.Request, h string) { http.Error(w, "should not be called", 500) },
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(srv.Start()).To(Succeed())
+		defer srv.Stop()
+
+		// Client trusts the upstream's actual cert (NOT the MITM CA),
+		// so a successful TLS handshake proves the proxy did not MITM.
+		upstreamCertPool := x509.NewCertPool()
+		upstreamCertPool.AddCert(ts.Certificate())
+		proxyURL, _ := url.Parse("http://" + srv.Addr())
+		client := &http.Client{
+			Transport: &http.Transport{
+				Proxy: http.ProxyURL(proxyURL),
+				TLSClientConfig: &tls.Config{
+					RootCAs:    upstreamCertPool,
+					ServerName: upstreamHost,
+				},
+			},
+			Timeout: 10 * time.Second,
+		}
+		_ = upstreamPort
+
+		resp, err := client.Get(ts.URL)
+		Expect(err).NotTo(HaveOccurred(), "Get through tunnel")
+		defer func() { _ = resp.Body.Close() }()
+		body, _ := io.ReadAll(resp.Body)
+		Expect(string(body)).To(Equal("passthrough"))
+	})
+
+	It("rejects non-CONNECT requests", func() {
+		ca, _ := NewInMemoryCA()
+		srv, err := NewServer(Config{
+			Addr:    "127.0.0.1:0",
+			CA:      ca,
+			Handler: func(w http.ResponseWriter, r *http.Request, h string) {},
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(srv.Start()).To(Succeed())
+		defer srv.Stop()
+
+		resp, err := http.Get("http://" + srv.Addr() + "/")
+		Expect(err).NotTo(HaveOccurred(), "GET")
+		defer func() { _ = resp.Body.Close() }()
+		Expect(resp.StatusCode).To(Equal(http.StatusMethodNotAllowed))
+	})
+
+	It("streams responses", func() {
+		// SSE-style upstream: send three text chunks with explicit
+		// flushes so the proxy's Flusher path is exercised.
+		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.Header().Set("Content-Type", "text/event-stream")
+			w.WriteHeader(200)
+			flusher := w.(http.Flusher)
+			for _, msg := range []string{"a", "b", "c"} {
+				_, _ = fmt.Fprintf(w, "data: %s\n\n", msg)
+				flusher.Flush()
+			}
+		})
+		client, baseURL, cleanup := startMITMTestRig("api.test.local", upstream)
+		defer cleanup()
+
+		resp, err := client.Get(baseURL + "/stream")
+		Expect(err).NotTo(HaveOccurred(), "Get")
+		defer func() { _ = resp.Body.Close() }()
+		body, _ := io.ReadAll(resp.Body)
+		for _, msg := range []string{"a", "b", "c"} {
+			Expect(string(body)).To(ContainSubstring("data: " + msg))
+		}
+	})
+
+	It("with no allowlist tunnels everything", func() {
+		// Empty InterceptHosts means the proxy is in observability-
+		// only mode: every CONNECT tunnels. Verifies the default-
+		// fail-safe behaviour mentioned in shouldIntercept.
+		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			_, _ = fmt.Fprint(w, "tunneled")
+		})
+		ts := httptest.NewTLSServer(upstream)
+		defer ts.Close()
+		upstreamURL, _ := url.Parse(ts.URL)
+		upstreamHost, _, _ := net.SplitHostPort(upstreamURL.Host)
+
+		ca, _ := NewInMemoryCA()
+		srv, _ := NewServer(Config{
+			Addr:    "127.0.0.1:0",
+			CA:      ca,
+			Handler: func(w http.ResponseWriter, r *http.Request, h string) { Fail("intercept handler called with empty allowlist") },
+			// InterceptHosts intentionally empty.
+		})
+		Expect(srv.Start()).To(Succeed())
+		defer srv.Stop()
+
+		upstreamCertPool := x509.NewCertPool()
+		upstreamCertPool.AddCert(ts.Certificate())
+		proxyURL, _ := url.Parse("http://" + srv.Addr())
+		client := &http.Client{
+			Transport: &http.Transport{
+				Proxy: http.ProxyURL(proxyURL),
+				TLSClientConfig: &tls.Config{
+					RootCAs:    upstreamCertPool,
+					ServerName: upstreamHost,
+				},
+			},
+		}
+		resp, err := client.Get(ts.URL)
+		Expect(err).NotTo(HaveOccurred(), "Get")
+		defer func() { _ = resp.Body.Close() }()
+		body, _ := io.ReadAll(resp.Body)
+		Expect(string(body)).To(Equal("tunneled"))
+	})
+})
diff --git a/core/services/cloudproxy/mitm/response.go b/core/services/cloudproxy/mitm/response.go
new file mode 100644
index 000000000..067877bfe
--- /dev/null
+++ b/core/services/cloudproxy/mitm/response.go
@@ -0,0 +1,105 @@
+package mitm
+
+import (
+	"bufio"
+	"crypto/tls"
+	"fmt"
+	"net/http"
+	"strconv"
+	"strings"
+)
+
+// connResponseWriter is a minimal HTTP/1.1 http.ResponseWriter
+// that writes directly to a hijacked TLS connection.
+type connResponseWriter struct {
+	conn *tls.Conn
+	bw   *bufio.Writer
+	req  *http.Request
+
+	header        http.Header
+	wroteHeader   bool
+	chunked       bool
+	contentLength int64
+	written       int64
+	closeAfter    bool
+}
+
+func newConnResponseWriter(conn *tls.Conn, req *http.Request) *connResponseWriter {
+	return &connResponseWriter{
+		conn:          conn,
+		bw:            bufio.NewWriter(conn),
+		req:           req,
+		header:        make(http.Header),
+		contentLength: -1,
+	}
+}
+
+func (w *connResponseWriter) Header() http.Header { return w.header }
+
+func (w *connResponseWriter) WriteHeader(status int) {
+	if w.wroteHeader {
+		return
+	}
+	w.wroteHeader = true
+
+	if cl := w.header.Get("Content-Length"); cl != "" {
+		if n, err := strconv.ParseInt(cl, 10, 64); err == nil {
+			w.contentLength = n
+		}
+	}
+	if w.contentLength < 0 {
+		w.chunked = true
+		w.header.Set("Transfer-Encoding", "chunked")
+		w.header.Del("Content-Length")
+	}
+
+	// "Connection: close" is case-insensitive per RFC 9110 §7.6.1; some
+	// upstreams send "Close" or "CLOSE". Use EqualFold so any casing
+	// triggers the post-response disconnect.
+	for _, v := range w.header.Values("Connection") {
+		if strings.EqualFold(v, "close") {
+			w.closeAfter = true
+		}
+	}
+
+	_, _ = fmt.Fprintf(w.bw, "HTTP/1.1 %d %s\r\n", status, http.StatusText(status))
+	_ = w.header.Write(w.bw)
+	_, _ = w.bw.WriteString("\r\n")
+}
+
+func (w *connResponseWriter) Write(p []byte) (int, error) {
+	if !w.wroteHeader {
+		w.WriteHeader(http.StatusOK)
+	}
+	if w.chunked {
+		if _, err := fmt.Fprintf(w.bw, "%x\r\n", len(p)); err != nil {
+			return 0, err
+		}
+		n, err := w.bw.Write(p)
+		if err != nil {
+			return n, err
+		}
+		if _, err := w.bw.WriteString("\r\n"); err != nil {
+			return n, err
+		}
+		w.written += int64(n)
+		return n, nil
+	}
+	n, err := w.bw.Write(p)
+	w.written += int64(n)
+	return n, err
+}
+
+func (w *connResponseWriter) Flush() {
+	_ = w.bw.Flush()
+}
+
+func (w *connResponseWriter) finish() {
+	if !w.wroteHeader {
+		w.WriteHeader(http.StatusOK)
+	}
+	if w.chunked {
+		_, _ = w.bw.WriteString("0\r\n\r\n")
+	}
+	_ = w.bw.Flush()
+}
diff --git a/core/services/cloudproxy/mitm/restart_test.go b/core/services/cloudproxy/mitm/restart_test.go
new file mode 100644
index 000000000..fd39a5ff8
--- /dev/null
+++ b/core/services/cloudproxy/mitm/restart_test.go
@@ -0,0 +1,98 @@
+package mitm
+
+import (
+	"fmt"
+	"net"
+	"net/http"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// noopHandler is the simplest InterceptHandler that satisfies NewServer.
+// We only exercise Start/Stop lifecycle here — no requests go through.
+func noopHandler(_ http.ResponseWriter, _ *http.Request, _ string) {}
+
+func newTestServer(addr string, hosts []string) *Server {
+	ca, err := NewInMemoryCA()
+	ExpectWithOffset(1, err).NotTo(HaveOccurred(), "NewInMemoryCA")
+	srv, err := NewServer(Config{
+		Addr:           addr,
+		CA:             ca,
+		InterceptHosts: hosts,
+		Handler:        noopHandler,
+	})
+	ExpectWithOffset(1, err).NotTo(HaveOccurred(), "NewServer")
+	return srv
+}
+
+// Server_StopIdempotent: calling Stop twice (and Stop without
+// Start) must not panic or deadlock. The application's RestartMITM
+// path is sensitive to this — it always calls Stop before Start, even
+// when the server is already stopped.
+var _ = Describe("Server", func() {
+	It("Stop is idempotent", func() {
+		srv := newTestServer("127.0.0.1:0", nil)
+		srv.Stop() // never started
+		srv.Stop() // double-stop after never-started
+
+		srv2 := newTestServer("127.0.0.1:0", nil)
+		Expect(srv2.Start()).To(Succeed())
+		srv2.Stop()
+		srv2.Stop() // second Stop after Start+Stop
+	})
+
+	// Server_RestartCycle: two sequential Server lifecycles on the
+	// same address must rebind cleanly, the new listener must accept
+	// connections, and the new allowlist must take effect — the shape
+	// RestartMITM relies on.
+	It("restart cycle rebinds and swaps allowlist", func() {
+		// First, find a free port we can rebind to.
+		probe, err := net.Listen("tcp", "127.0.0.1:0")
+		Expect(err).NotTo(HaveOccurred(), "probe listen")
+		addr := probe.Addr().String()
+		_ = probe.Close()
+
+		srv1 := newTestServer(addr, []string{"first.example.com"})
+		if err := srv1.Start(); err != nil {
+			// Port could have been recycled between probe close and Start.
+			// Skip rather than flake — the production path uses dynamic
+			// addrs anyway.
+			Skip(fmt.Sprintf("could not bind probed addr: %v", err))
+		}
+		Expect(strings.HasPrefix(srv1.Addr(), "127.0.0.1:")).To(BeTrue(), "Addr() = %q, want 127.0.0.1:* prefix", srv1.Addr())
+		srv1.Stop()
+
+		// Now bring up a second server on the same addr with a different
+		// allowlist — mirrors the RestartMITM-with-edited-hosts path.
+		srv2 := newTestServer(addr, []string{"second.example.com"})
+		if err := srv2.Start(); err != nil {
+			// SO_REUSEADDR is not set; brief TIME_WAIT collisions are
+			// possible on slow CI runners. Retry once on a fresh port so
+			// the test still exercises the "different hosts" property.
+			srv2 = newTestServer("127.0.0.1:0", []string{"second.example.com"})
+			Expect(srv2.Start()).To(Succeed(), "second Start (fresh port fallback)")
+		}
+		defer srv2.Stop()
+
+		// Smoke: the new listener accepts a TCP connection.
+		conn, err := net.Dial("tcp", srv2.Addr())
+		Expect(err).NotTo(HaveOccurred(), "dial restarted listener")
+		_ = conn.Close()
+
+		// Allowlist swap took effect: the new server intercepts
+		// "second.example.com" (and not the old "first.example.com").
+		Expect(srv2.shouldIntercept("second.example.com")).To(BeTrue(), "second server did not pick up the new InterceptHosts")
+		Expect(srv2.shouldIntercept("first.example.com")).To(BeFalse(), "second server still has the first server's allowlist")
+	})
+
+	// Server_AddrBeforeStart: Addr() pre-Start returns the configured
+	// address rather than panicking on a nil listener. The admin status
+	// endpoint reads it under MITMServer() — when an admin queries between
+	// configuration and Start, the response should still render cleanly.
+	It("Addr before start returns configured address", func() {
+		srv := newTestServer(":12345", nil)
+		Expect(srv.Addr()).To(Equal(":12345"))
+	})
+})
diff --git a/core/services/cloudproxy/proxy.go b/core/services/cloudproxy/proxy.go
new file mode 100644
index 000000000..879f353a3
--- /dev/null
+++ b/core/services/cloudproxy/proxy.go
@@ -0,0 +1,125 @@
+// Package cloudproxy stitches the cloud-proxy gRPC backend to the
+// HTTP edge: model rewrite, body shaping, and SSE-aware PII filtering
+// on the response. The outbound HTTP request itself lives inside the
+// cloud-proxy backend binary (backend/go/cloud-proxy), not here — this
+// package is the core-side glue.
+package cloudproxy
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/services/cloudproxy/ssewire"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	"github.com/mudler/xlog"
+)
+
+func rewriteModel(body []byte, upstreamModel string) ([]byte, error) {
+	if upstreamModel == "" {
+		return body, nil
+	}
+	var m map[string]any
+	if err := json.Unmarshal(body, &m); err != nil {
+		return nil, fmt.Errorf("cloudproxy: parse request body: %w", err)
+	}
+	m["model"] = upstreamModel
+	return json.Marshal(m)
+}
+
+func streaming(body []byte) bool {
+	var probe struct {
+		Stream bool `json:"stream"`
+	}
+	if err := json.Unmarshal(body, &probe); err != nil {
+		return false
+	}
+	return probe.Stream
+}
+
+// passthroughError emits the upstream's error response unchanged.
+func passthroughError(c echo.Context, statusCode int, contentType string, body io.Reader) error {
+	const maxErrBody = 1 << 20
+	buf, _ := io.ReadAll(io.LimitReader(body, maxErrBody))
+	if contentType != "" {
+		c.Response().Header().Set("Content-Type", contentType)
+	}
+	c.Response().WriteHeader(statusCode)
+	_, _ = c.Response().Writer.Write(buf)
+	return nil
+}
+
+func forwardBuffered(c echo.Context, statusCode int, contentType string, body io.Reader) error {
+	if contentType != "" {
+		c.Response().Header().Set("Content-Type", contentType)
+	}
+	c.Response().WriteHeader(statusCode)
+	_, err := io.Copy(c.Response().Writer, body)
+	return err
+}
+
+// forwardStream applies SSE-aware PII rewriting as the response flows
+// to the client. provider selects the dialect (openai vs anthropic);
+// it comes from cfg.Proxy.Provider on the cloud-proxy backend.
+func forwardStream(c echo.Context, body io.Reader, provider string, filter *pii.StreamFilter) error {
+	c.Response().Header().Set("Content-Type", "text/event-stream")
+	c.Response().Header().Set("Cache-Control", "no-cache")
+	c.Response().Header().Set("Connection", "keep-alive")
+	c.Response().WriteHeader(http.StatusOK)
+
+	emit := func(line string) error {
+		_, err := fmt.Fprint(c.Response().Writer, line)
+		if err != nil {
+			return err
+		}
+		c.Response().Flush()
+		return nil
+	}
+
+	flushResidual := func() {
+		if filter == nil {
+			return
+		}
+		residual := filter.Drain()
+		if residual == "" {
+			return
+		}
+		if line := ssewire.SynthResidualEvent(ssewire.Provider(provider), residual); line != "" {
+			_ = emit(line)
+		}
+	}
+
+	prov := ssewire.Provider(provider)
+	scanner := ssewire.NewScanner(body)
+	for scanner.Scan() {
+		ev := scanner.Event()
+		if ssewire.IsTerminalMarker(ev.DataLine, prov) {
+			flushResidual()
+			_ = emit(ev.Raw)
+			continue
+		}
+		out := ev.Raw
+		if filter != nil && ev.DataLine != "" {
+			rewritten, drop := ssewire.RewritePayload(ev.DataLine, prov, filter)
+			if drop {
+				continue
+			}
+			if rewritten != ev.DataLine {
+				// strings.Replace with n=1 touches only the data line,
+				// preserving any "event:"/"id:" preamble.
+				out = strings.Replace(ev.Raw, ev.DataLine, rewritten, 1)
+			}
+		}
+		if err := emit(out); err != nil {
+			return nil
+		}
+	}
+	if err := scanner.Err(); err != nil && err != io.EOF {
+		xlog.Debug("cloudproxy: stream read error", "error", err)
+	}
+	flushResidual()
+	return nil
+}
diff --git a/core/services/cloudproxy/proxy_suite_test.go b/core/services/cloudproxy/proxy_suite_test.go
new file mode 100644
index 000000000..a30c14ec5
--- /dev/null
+++ b/core/services/cloudproxy/proxy_suite_test.go
@@ -0,0 +1,13 @@
+package cloudproxy
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestCloudproxy(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "cloudproxy test suite")
+}
diff --git a/core/services/cloudproxy/proxy_test.go b/core/services/cloudproxy/proxy_test.go
new file mode 100644
index 000000000..dd52683fc
--- /dev/null
+++ b/core/services/cloudproxy/proxy_test.go
@@ -0,0 +1,38 @@
+package cloudproxy
+
+import (
+	"encoding/json"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("rewriteModel", func() {
+	It("is a no-op when upstream model is empty", func() {
+		body := []byte(`{"model":"x","stream":false}`)
+		out, err := rewriteModel(body, "")
+		Expect(err).NotTo(HaveOccurred())
+		Expect(string(out)).To(Equal(string(body)))
+	})
+
+	It("replaces the model", func() {
+		body := []byte(`{"model":"alias","stream":false}`)
+		out, err := rewriteModel(body, "real-model-id")
+		Expect(err).NotTo(HaveOccurred())
+		var m map[string]any
+		Expect(json.Unmarshal(out, &m)).To(Succeed())
+		Expect(m["model"]).To(Equal("real-model-id"))
+	})
+})
+
+var _ = Describe("streaming", func() {
+	It("detects stream=true", func() {
+		Expect(streaming([]byte(`{"stream":true}`))).To(BeTrue())
+	})
+	It("detects stream=false", func() {
+		Expect(streaming([]byte(`{"stream":false}`))).To(BeFalse())
+	})
+	It("returns false when stream key absent", func() {
+		Expect(streaming([]byte(`{}`))).To(BeFalse())
+	})
+})
diff --git a/core/services/cloudproxy/ssewire/ssewire.go b/core/services/cloudproxy/ssewire/ssewire.go
new file mode 100644
index 000000000..ed3cb862b
--- /dev/null
+++ b/core/services/cloudproxy/ssewire/ssewire.go
@@ -0,0 +1,218 @@
+// Package ssewire holds the SSE-format helpers shared between
+// the request-shape cloud proxy (core/services/cloudproxy) and the
+// TLS-terminating MITM proxy (core/services/cloudproxy/mitm). Both
+// run a pii.StreamFilter over per-token text extracted from
+// provider-specific JSON chunks; this package owns the JSON shapes
+// so a future provider addition is one edit, not two.
+package ssewire
+
+import (
+	"bufio"
+	"encoding/json"
+	"io"
+	"strings"
+
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+)
+
+// Provider is the upstream wire format an SSE stream conforms to.
+type Provider string
+
+const (
+	OpenAI    Provider = "openai"
+	Anthropic Provider = "anthropic"
+)
+
+// Event is one SSE event with its exact wire bytes preserved in
+// Raw (so unmodified events round-trip byte-for-byte) and the
+// extracted JSON payload from the data: line in DataLine.
+type Event struct {
+	Raw      string
+	DataLine string
+}
+
+// Scanner reads SSE events one at a time from an upstream body.
+type Scanner struct {
+	r   *bufio.Reader
+	ev  Event
+	err error
+}
+
+func NewScanner(r io.Reader) *Scanner {
+	return &Scanner{r: bufio.NewReaderSize(r, 64*1024)}
+}
+
+func (s *Scanner) Scan() bool {
+	var raw strings.Builder
+	var dataLine string
+	for {
+		line, err := s.r.ReadString('\n')
+		if line != "" {
+			raw.WriteString(line)
+			trimmed := strings.TrimRight(line, "\r\n")
+			if trimmed == "" {
+				if raw.Len() == len(line) {
+					raw.Reset()
+					continue
+				}
+				s.ev = Event{Raw: raw.String(), DataLine: dataLine}
+				return true
+			}
+			if strings.HasPrefix(trimmed, "data:") && dataLine == "" {
+				payload := strings.TrimPrefix(trimmed, "data:")
+				payload = strings.TrimPrefix(payload, " ")
+				dataLine = payload
+			}
+		}
+		if err != nil {
+			s.err = err
+			if raw.Len() > 0 {
+				s.ev = Event{Raw: raw.String(), DataLine: dataLine}
+				return true
+			}
+			return false
+		}
+	}
+}
+
+func (s *Scanner) Event() Event { return s.ev }
+func (s *Scanner) Err() error   { return s.err }
+
+// IsTerminalMarker reports whether the data line is the per-provider
+// end-of-stream sentinel. The streaming PII filter must drain its
+// residue before the caller forwards a terminal marker — clients
+// stop reading after it.
+func IsTerminalMarker(dataLine string, provider Provider) bool {
+	if dataLine == "" {
+		return false
+	}
+	if strings.TrimSpace(dataLine) == "[DONE]" {
+		return true
+	}
+	if provider == Anthropic {
+		var probe struct {
+			Type string `json:"type"`
+		}
+		if err := json.Unmarshal([]byte(dataLine), &probe); err == nil {
+			return probe.Type == "message_stop"
+		}
+	}
+	return false
+}
+
+// RewritePayload runs the data line's content-bearing field through
+// the streaming filter. drop=true tells the caller to suppress the
+// SSE event entirely (the filter buffered the whole token while
+// disambiguating a pattern boundary).
+func RewritePayload(dataLine string, provider Provider, filter *pii.StreamFilter) (rewritten string, drop bool) {
+	if strings.TrimSpace(dataLine) == "[DONE]" {
+		return dataLine, false
+	}
+	switch provider {
+	case Anthropic:
+		return rewriteAnthropic(dataLine, filter)
+	default:
+		return rewriteOpenAI(dataLine, filter)
+	}
+}
+
+func rewriteOpenAI(dataLine string, filter *pii.StreamFilter) (string, bool) {
+	var m map[string]any
+	if err := json.Unmarshal([]byte(dataLine), &m); err != nil {
+		return dataLine, false
+	}
+	choices, ok := m["choices"].([]any)
+	if !ok || len(choices) == 0 {
+		return dataLine, false
+	}
+	first, ok := choices[0].(map[string]any)
+	if !ok {
+		return dataLine, false
+	}
+	delta, ok := first["delta"].(map[string]any)
+	if !ok {
+		return dataLine, false
+	}
+	content, ok := delta["content"].(string)
+	if !ok || content == "" {
+		return dataLine, false
+	}
+	rewritten := filter.Push(content)
+	if rewritten == "" {
+		return "", true
+	}
+	if rewritten == content {
+		return dataLine, false
+	}
+	delta["content"] = rewritten
+	out, err := json.Marshal(m)
+	if err != nil {
+		return dataLine, false
+	}
+	return string(out), false
+}
+
+func rewriteAnthropic(dataLine string, filter *pii.StreamFilter) (string, bool) {
+	var m map[string]any
+	if err := json.Unmarshal([]byte(dataLine), &m); err != nil {
+		return dataLine, false
+	}
+	if t, _ := m["type"].(string); t != "content_block_delta" {
+		return dataLine, false
+	}
+	delta, ok := m["delta"].(map[string]any)
+	if !ok {
+		return dataLine, false
+	}
+	if dt, _ := delta["type"].(string); dt != "text_delta" {
+		return dataLine, false
+	}
+	text, ok := delta["text"].(string)
+	if !ok || text == "" {
+		return dataLine, false
+	}
+	rewritten := filter.Push(text)
+	if rewritten == "" {
+		return "", true
+	}
+	if rewritten == text {
+		return dataLine, false
+	}
+	delta["text"] = rewritten
+	out, err := json.Marshal(m)
+	if err != nil {
+		return dataLine, false
+	}
+	return string(out), false
+}
+
+// SynthResidualEvent builds a provider-shaped SSE event carrying
+// the streaming filter's drained tail so the response body remains
+// a valid event stream after the proxy splices in held-back text.
+func SynthResidualEvent(provider Provider, text string) string {
+	switch provider {
+	case Anthropic:
+		payload := map[string]any{
+			"type":  "content_block_delta",
+			"index": 0,
+			"delta": map[string]string{"type": "text_delta", "text": text},
+		}
+		b, err := json.Marshal(payload)
+		if err != nil {
+			return ""
+		}
+		return "event: content_block_delta\ndata: " + string(b) + "\n\n"
+	default:
+		payload := map[string]any{
+			"object": "chat.completion.chunk",
+			"choices": []map[string]any{
+				{"index": 0, "delta": map[string]string{"content": text}},
+			},
+		}
+		b, err := json.Marshal(payload)
+		if err != nil {
+			return ""
+		}
+		return "data: " + string(b) + "\n\n"
+	}
+}
diff --git a/core/services/cloudproxy/ssewire/ssewire_suite_test.go b/core/services/cloudproxy/ssewire/ssewire_suite_test.go
new file mode 100644
index 000000000..6925017f0
--- /dev/null
+++ b/core/services/cloudproxy/ssewire/ssewire_suite_test.go
@@ -0,0 +1,13 @@
+package ssewire
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestSsewire(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "ssewire test suite")
+}
diff --git a/core/services/cloudproxy/ssewire/ssewire_test.go b/core/services/cloudproxy/ssewire/ssewire_test.go
new file mode 100644
index 000000000..2750367fd
--- /dev/null
+++ b/core/services/cloudproxy/ssewire/ssewire_test.go
@@ -0,0 +1,114 @@
+package ssewire
+
+import (
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Scanner contract: returns one Event per double-newline-terminated
+// SSE block, preserving the raw bytes (so unmodified events round-trip
+// exactly) and extracting the first data: payload as DataLine.
+
+var _ = Describe("Scanner", func() {
+	It("scans a basic event", func() {
+		in := "event: foo\ndata: hello\n\n"
+		s := NewScanner(strings.NewReader(in))
+		Expect(s.Scan()).To(BeTrue(), "Scan returned false on a well-formed event; err=%v", s.Err())
+		ev := s.Event()
+		Expect(ev.Raw).To(Equal(in))
+		Expect(ev.DataLine).To(Equal("hello"))
+		Expect(s.Scan()).To(BeFalse(), "Scan should return false after the only event")
+	})
+
+	It("handles CRLF", func() {
+		// Some upstreams emit CRLF instead of LF. The scanner trims
+		// trailing \r off the data line so DataLine carries the same
+		// bytes whichever line ending the producer chose.
+		in := "event: foo\r\ndata: hello\r\n\r\n"
+		s := NewScanner(strings.NewReader(in))
+		Expect(s.Scan()).To(BeTrue(), "Scan returned false on CRLF event; err=%v", s.Err())
+		Expect(s.Event().DataLine).To(Equal("hello"))
+	})
+
+	It("scans multiple events", func() {
+		in := "data: one\n\ndata: two\n\ndata: three\n\n"
+		s := NewScanner(strings.NewReader(in))
+		got := []string{}
+		for s.Scan() {
+			got = append(got, s.Event().DataLine)
+		}
+		Expect(got).To(Equal([]string{"one", "two", "three"}))
+	})
+
+	It("handles empty data payload", func() {
+		// "data:" with no payload is valid SSE — DataLine should be empty
+		// and Scan should still surface the event so callers can decide.
+		in := "data:\n\n"
+		s := NewScanner(strings.NewReader(in))
+		Expect(s.Scan()).To(BeTrue(), "Scan returned false on empty data payload; err=%v", s.Err())
+		Expect(s.Event().DataLine).To(Equal(""))
+	})
+
+	It("skips leading blank lines", func() {
+		// A producer that prints a blank "keep-alive" before the first
+		// real event must not produce a phantom event.
+		in := "\n\n\ndata: real\n\n"
+		s := NewScanner(strings.NewReader(in))
+		Expect(s.Scan()).To(BeTrue(), "Scan returned false; err=%v", s.Err())
+		Expect(s.Event().DataLine).To(Equal("real"))
+	})
+
+	It("handles mid-event EOF", func() {
+		// EOF mid-event still surfaces the partial event with whatever
+		// data was extracted — the StreamFilter+caller decides how to
+		// handle a truncated upstream rather than silently dropping it.
+		in := "data: half"
+		s := NewScanner(strings.NewReader(in))
+		Expect(s.Scan()).To(BeTrue(), "Scan returned false on partial event")
+		ev := s.Event()
+		Expect(ev.DataLine).To(Equal("half"))
+		Expect(s.Scan()).To(BeFalse(), "Scan should not surface a second event after EOF")
+	})
+})
+
+var _ = Describe("IsTerminalMarker", func() {
+	cases := []struct {
+		name     string
+		dataLine string
+		provider Provider
+		want     bool
+	}{
+		{"openai DONE", "[DONE]", OpenAI, true},
+		{"openai DONE with whitespace", "  [DONE]  ", OpenAI, true},
+		{"anthropic DONE also recognised", "[DONE]", Anthropic, true},
+		{"anthropic message_stop", `{"type":"message_stop"}`, Anthropic, true},
+		{"anthropic content_block_delta is not terminal", `{"type":"content_block_delta"}`, Anthropic, false},
+		{"openai chat.completion.chunk is not terminal", `{"object":"chat.completion.chunk"}`, OpenAI, false},
+		{"openai message_stop is not terminal (wrong provider)", `{"type":"message_stop"}`, OpenAI, false},
+		{"empty data", "", OpenAI, false},
+		{"non-json garbage", "garbage", Anthropic, false},
+	}
+	for _, c := range cases {
+		It(c.name, func() {
+			Expect(IsTerminalMarker(c.dataLine, c.provider)).To(Equal(c.want))
+		})
+	}
+})
+
+var _ = Describe("SynthResidualEvent", func() {
+	It("anthropic", func() {
+		got := SynthResidualEvent(Anthropic, "tail")
+		Expect(strings.HasPrefix(got, "event: content_block_delta\ndata:")).To(BeTrue(), "Anthropic residual event missing event/data lines: %q", got)
+		Expect(strings.HasSuffix(got, "\n\n")).To(BeTrue(), "Anthropic residual event missing trailing blank line: %q", got)
+		Expect(got).To(ContainSubstring(`"text":"tail"`))
+	})
+
+	It("openai", func() {
+		got := SynthResidualEvent(OpenAI, "tail")
+		Expect(strings.HasPrefix(got, "data: ")).To(BeTrue(), "OpenAI residual event missing data: prefix: %q", got)
+		Expect(strings.HasSuffix(got, "\n\n")).To(BeTrue(), "OpenAI residual event missing trailing blank line: %q", got)
+		Expect(got).To(ContainSubstring(`"content":"tail"`))
+	})
+})
diff --git a/core/services/monitoring/metrics.go b/core/services/monitoring/metrics.go
index fa5663210..f9644698e 100644
--- a/core/services/monitoring/metrics.go
+++ b/core/services/monitoring/metrics.go
@@ -4,6 +4,7 @@ import (
 	"context"
 
 	"github.com/mudler/xlog"
+	"go.opentelemetry.io/otel"
 	"go.opentelemetry.io/otel/attribute"
 	"go.opentelemetry.io/otel/exporters/prometheus"
 	"go.opentelemetry.io/otel/metric"
@@ -12,6 +13,7 @@ import (
 
 type LocalAIMetricsService struct {
 	Meter         metric.Meter
+	Provider      *metricApi.MeterProvider
 	ApiTimeMetric metric.Float64Histogram
 }
 
@@ -31,6 +33,13 @@ func NewLocalAIMetricsService() (*LocalAIMetricsService, error) {
 		return nil, err
 	}
 	provider := metricApi.NewMeterProvider(metricApi.WithReader(exporter))
+	// Share the provider with the OTel global so packages outside this
+	// service (e.g., core/services/routing/billing) see the same Prom
+	// exporter when they call otel.Meter(...). Without this, the billing
+	// counters would route to the no-op global provider and never reach
+	// /metrics — which is exactly the silent-billing-loss class of bug
+	// the routing module is designed to surface.
+	otel.SetMeterProvider(provider)
 	meter := provider.Meter("github.com/mudler/LocalAI")
 
 	apiTimeMetric, err := meter.Float64Histogram("api_call", metric.WithDescription("api calls"))
@@ -40,6 +49,7 @@ func NewLocalAIMetricsService() (*LocalAIMetricsService, error) {
 
 	return &LocalAIMetricsService{
 		Meter:         meter,
+		Provider:      provider,
 		ApiTimeMetric: apiTimeMetric,
 	}, nil
 }
diff --git a/core/services/nodes/health_mock_test.go b/core/services/nodes/health_mock_test.go
index 45f37b563..fd8ec892d 100644
--- a/core/services/nodes/health_mock_test.go
+++ b/core/services/nodes/health_mock_test.go
@@ -235,6 +235,9 @@ func (c *fakeBackendClient) AudioTransformStream(_ context.Context, _ ...ggrpc.C
 func (c *fakeBackendClient) AudioToAudioStream(_ context.Context, _ ...ggrpc.CallOption) (grpc.AudioToAudioStreamClient, error) {
 	return nil, nil
 }
+func (c *fakeBackendClient) Forward(_ context.Context, _ ...ggrpc.CallOption) (grpc.ForwardClient, error) {
+	return nil, nil
+}
 func (c *fakeBackendClient) ModelMetadata(_ context.Context, _ *pb.ModelOptions, _ ...ggrpc.CallOption) (*pb.ModelMetadataResponse, error) {
 	return nil, nil
 }
@@ -265,6 +268,12 @@ func (c *fakeBackendClient) StopQuantization(_ context.Context, _ *pb.Quantizati
 func (c *fakeBackendClient) Free(_ context.Context) error {
 	return nil
 }
+func (c *fakeBackendClient) TokenClassify(_ context.Context, _ *pb.TokenClassifyRequest, _ ...ggrpc.CallOption) (*pb.TokenClassifyResponse, error) {
+	return nil, nil
+}
+func (c *fakeBackendClient) Score(_ context.Context, _ *pb.ScoreRequest, _ ...ggrpc.CallOption) (*pb.ScoreResponse, error) {
+	return nil, nil
+}
 
 // --- fakeBackendClientFactory ---
 
diff --git a/core/services/nodes/inflight_test.go b/core/services/nodes/inflight_test.go
index edb04b6f8..60a5299cc 100644
--- a/core/services/nodes/inflight_test.go
+++ b/core/services/nodes/inflight_test.go
@@ -180,6 +180,10 @@ func (f *fakeGRPCBackend) AudioToAudioStream(_ context.Context, _ ...ggrpc.CallO
 	return nil, nil
 }
 
+func (f *fakeGRPCBackend) Forward(_ context.Context, _ ...ggrpc.CallOption) (grpc.ForwardClient, error) {
+	return nil, nil
+}
+
 func (f *fakeGRPCBackend) ModelMetadata(_ context.Context, _ *pb.ModelOptions, _ ...ggrpc.CallOption) (*pb.ModelMetadataResponse, error) {
 	return &pb.ModelMetadataResponse{}, nil
 }
@@ -220,6 +224,14 @@ func (f *fakeGRPCBackend) Free(_ context.Context) error {
 	return nil
 }
 
+func (f *fakeGRPCBackend) TokenClassify(_ context.Context, _ *pb.TokenClassifyRequest, _ ...ggrpc.CallOption) (*pb.TokenClassifyResponse, error) {
+	return nil, nil
+}
+
+func (f *fakeGRPCBackend) Score(_ context.Context, _ *pb.ScoreRequest, _ ...ggrpc.CallOption) (*pb.ScoreResponse, error) {
+	return nil, nil
+}
+
 // --- Tests ---
 
 var _ = Describe("InFlightTrackingClient", func() {
diff --git a/core/services/routing/admission/admission.go b/core/services/routing/admission/admission.go
new file mode 100644
index 000000000..168248181
--- /dev/null
+++ b/core/services/routing/admission/admission.go
@@ -0,0 +1,105 @@
+// Package admission is routing-module subsystem 5: per-model
+// concurrency control + audit. The middleware acquires a slot
+// before the handler runs; on full, the request gets 503 with
+// Retry-After so clients back off rather than pile on. The audit
+// row goes into the shared event store alongside PII and proxy
+// rows so admins see a single timeline of routing pressure.
+//
+// Concurrency model: one buffered channel per model name (kept in
+// a sync.Map). Acquire is a non-blocking send; full = reject. No
+// queueing in the MVP — adding queue depth + timeout is a small
+// follow-up if/when telemetry shows admins want it.
+package admission
+
+import (
+	"sync"
+	"time"
+)
+
+// Limiter holds the per-model semaphores. Safe for concurrent use.
+//
+// Each model's slot count is fixed at first Acquire — a config
+// edit that reduces MaxConcurrent only takes effect on the NEXT
+// process start (or after the limiter is rebuilt). The alternative
+// (dynamic resize on every call) would require swapping the channel
+// out from under in-flight Acquires; the simplicity tradeoff favors
+// "restart to apply" since admins editing limits do so rarely.
+type Limiter struct {
+	mu    sync.Mutex
+	slots map[string]chan struct{}
+}
+
+// New returns an empty Limiter.
+func New() *Limiter {
+	return &Limiter{slots: make(map[string]chan struct{})}
+}
+
+// Acquire takes a slot for the named model. maxConcurrent <= 0
+// means unlimited — Acquire returns immediately with a no-op
+// release. When all slots are busy, returns ok=false. Callers
+// MUST call the returned release when done (typically via defer);
+// missing a release leaks one slot for the lifetime of the
+// process.
+func (l *Limiter) Acquire(modelName string, maxConcurrent int) (release func(), ok bool) {
+	if maxConcurrent <= 0 {
+		return func() {}, true
+	}
+	ch := l.slot(modelName, maxConcurrent)
+	select {
+	case ch <- struct{}{}:
+		return func() { <-ch }, true
+	default:
+		return nil, false
+	}
+}
+
+// InFlight reports the number of currently-held slots for the
+// named model. Used by the admin status surface — read-only and
+// approximate (ch length is racy with concurrent Acquire/release
+// but that's fine for a dashboard).
+func (l *Limiter) InFlight(modelName string) int {
+	l.mu.Lock()
+	ch, ok := l.slots[modelName]
+	l.mu.Unlock()
+	if !ok {
+		return 0
+	}
+	return len(ch)
+}
+
+// Capacity reports the limiter's configured slot count for the
+// named model, or 0 if the model has never had Acquire called
+// against it. Same dashboard-only purpose as InFlight.
+func (l *Limiter) Capacity(modelName string) int {
+	l.mu.Lock()
+	ch, ok := l.slots[modelName]
+	l.mu.Unlock()
+	if !ok {
+		return 0
+	}
+	return cap(ch)
+}
+
+// slot returns the per-model channel, creating it on first use.
+func (l *Limiter) slot(modelName string, capacity int) chan struct{} {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+	if ch, ok := l.slots[modelName]; ok {
+		return ch
+	}
+	ch := make(chan struct{}, capacity)
+	l.slots[modelName] = ch
+	return ch
+}
+
+// RetryAfter returns the Retry-After header value for a rejected
+// request. The Limiter doesn't track rolling latency — this is a
+// pure config-driven hint, identity-mapped to the LimitsConfig
+// field with a 1s fallback. Centralised here so the middleware
+// doesn't reimplement the default rule.
+func RetryAfter(configured int) time.Duration {
+	if configured > 0 {
+		return time.Duration(configured) * time.Second
+	}
+	return time.Second
+}
diff --git a/core/services/routing/admission/admission_suite_test.go b/core/services/routing/admission/admission_suite_test.go
new file mode 100644
index 000000000..163c84db3
--- /dev/null
+++ b/core/services/routing/admission/admission_suite_test.go
@@ -0,0 +1,13 @@
+package admission
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestAdmission(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Admission test suite")
+}
diff --git a/core/services/routing/admission/admission_test.go b/core/services/routing/admission/admission_test.go
new file mode 100644
index 000000000..b97e18969
--- /dev/null
+++ b/core/services/routing/admission/admission_test.go
@@ -0,0 +1,103 @@
+package admission
+
+import (
+	"sync"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Limiter", func() {
+	It("returns immediate no-op when unlimited", func() {
+		l := New()
+		for i := 0; i < 5; i++ {
+			release, ok := l.Acquire("anything", 0)
+			Expect(ok).To(BeTrue(), "max=0 should never reject")
+			release()
+		}
+		Expect(l.InFlight("anything")).To(Equal(0))
+		Expect(l.Capacity("anything")).To(Equal(0))
+	})
+
+	It("rejects when full", func() {
+		// Two concurrent requests at MaxConcurrent=1: second is
+		// rejected and the limiter reports the in-flight count.
+		l := New()
+		r1, ok := l.Acquire("m", 1)
+		Expect(ok).To(BeTrue(), "first Acquire should succeed")
+		defer r1()
+
+		_, ok = l.Acquire("m", 1)
+		Expect(ok).To(BeFalse(), "second Acquire should reject — slot is held")
+		Expect(l.InFlight("m")).To(Equal(1))
+		Expect(l.Capacity("m")).To(Equal(1))
+	})
+
+	It("allows the next Acquire after Release", func() {
+		l := New()
+		r1, _ := l.Acquire("m", 1)
+		r1()
+		_, ok := l.Acquire("m", 1)
+		Expect(ok).To(BeTrue(), "Acquire after release should succeed")
+	})
+
+	It("isolates slots per-model", func() {
+		// Slots are per-model; saturating one does not affect another.
+		l := New()
+		r1, _ := l.Acquire("m1", 1)
+		defer r1()
+		_, ok := l.Acquire("m2", 1)
+		Expect(ok).To(BeTrue(), "m2 should have its own slot")
+	})
+
+	It("honours the cap under concurrent Acquires", func() {
+		// Hammer Acquire from multiple goroutines; the count of
+		// successful acquires must not exceed the cap.
+		l := New()
+		const cap = 4
+		const goroutines = 50
+		var wg sync.WaitGroup
+		successes := make(chan func(), goroutines)
+		for i := 0; i < goroutines; i++ {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				if release, ok := l.Acquire("m", cap); ok {
+					successes <- release
+				}
+			}()
+		}
+		wg.Wait()
+		close(successes)
+
+		count := 0
+		for r := range successes {
+			count++
+			r()
+		}
+		Expect(count).To(Equal(cap))
+	})
+
+	// First-Acquire fixes the channel capacity. A subsequent Acquire
+	// at a different maxConcurrent does NOT resize — admins editing
+	// limits expect a process restart. Pin that behaviour so the
+	// surprise isn't accidentally introduced.
+	It("fixes the cap at first Acquire", func() {
+		l := New()
+		r1, _ := l.Acquire("m", 2)
+		defer r1()
+		// Try to acquire with cap=10 — should still be bounded by 2.
+		r2, _ := l.Acquire("m", 10)
+		defer r2()
+		_, ok := l.Acquire("m", 10)
+		Expect(ok).To(BeFalse(), "third Acquire should reject — initial cap of 2 still applies")
+	})
+})
+
+var _ = Describe("RetryAfter", func() {
+	It("defaults to one second", func() {
+		Expect(RetryAfter(0)).To(Equal(time.Second))
+		Expect(RetryAfter(5)).To(Equal(5 * time.Second))
+	})
+})
diff --git a/core/services/routing/billing/backend.go b/core/services/routing/billing/backend.go
new file mode 100644
index 000000000..69119878e
--- /dev/null
+++ b/core/services/routing/billing/backend.go
@@ -0,0 +1,52 @@
+// Package billing provides the StatsBackend abstraction that decouples
+// per-request token tracking from the auth database. This lets a
+// single-user no-auth deployment still see usage and costs, which the
+// pre-routing-module middleware did not allow.
+package billing
+
+import (
+	"context"
+
+	"github.com/mudler/LocalAI/core/http/auth"
+)
+
+// StatsBackend is the persistence target for usage records. Three
+// implementations exist:
+//
+//   - GORM (auth-DB-backed) — used when --auth is on; records share the
+//     auth database and existing aggregation queries continue to work.
+//   - Memory (ring buffer) — used when --auth is off and no other DB is
+//     configured. Records are lost on restart by design; the same
+//     process can still answer aggregation queries for live dashboards.
+//   - Disabled — explicit no-op when --disable-stats is set, useful in
+//     ephemeral CI runs.
+//
+// All implementations are safe for concurrent use. Record() must not
+// block the caller for more than the time it takes to enqueue — durable
+// flushing happens on a background goroutine inside the implementation.
+type StatsBackend interface {
+	// Record enqueues a single usage record. The record is asynchronously
+	// persisted; callers should not assume durability on return. The ctx
+	// is currently unused but reserved for future cancellation.
+	Record(ctx context.Context, r *auth.UsageRecord) error
+
+	// Aggregate returns time-bucketed totals for the dashboard. The
+	// AggregateQuery's UserID is required; pass the empty string only
+	// from admin-scoped paths. Implementations that do not support
+	// aggregation (e.g., ring buffer in saturation) may return an empty
+	// result with no error.
+	Aggregate(ctx context.Context, q AggregateQuery) ([]auth.UsageBucket, error)
+
+	// Close releases resources (flushes pending records, stops
+	// goroutines). Safe to call multiple times.
+	Close() error
+}
+
+// AggregateQuery describes a usage aggregation request. Period is one of
+// "day", "week", "month", "all" (matching the existing auth.UsageRecord
+// vocabulary). UserID empty means cluster-wide; callers must enforce
+// admin permission before passing the empty string.
+type AggregateQuery struct {
+	UserID string
+	Period string
+}
diff --git a/core/services/routing/billing/billing_suite_test.go b/core/services/routing/billing/billing_suite_test.go
new file mode 100644
index 000000000..0b43673eb
--- /dev/null
+++ b/core/services/routing/billing/billing_suite_test.go
@@ -0,0 +1,13 @@
+package billing
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestBilling(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "billing test suite")
+}
diff --git a/core/services/routing/billing/disabled.go b/core/services/routing/billing/disabled.go
new file mode 100644
index 000000000..c6d35df6c
--- /dev/null
+++ b/core/services/routing/billing/disabled.go
@@ -0,0 +1,20 @@
+package billing
+
+import (
+	"context"
+
+	"github.com/mudler/LocalAI/core/http/auth"
+)
+
+// disabledBackend drops every record. Used when --disable-stats is set,
+// e.g., for ephemeral CI runs where token tracking is just noise.
+type disabledBackend struct{}
+
+// NewDisabledBackend returns a no-op StatsBackend.
+func NewDisabledBackend() StatsBackend { return disabledBackend{} }
+
+func (disabledBackend) Record(_ context.Context, _ *auth.UsageRecord) error { return nil }
+func (disabledBackend) Aggregate(_ context.Context, _ AggregateQuery) ([]auth.UsageBucket, error) {
+	return nil, nil
+}
+func (disabledBackend) Close() error { return nil }
diff --git a/core/services/routing/billing/gorm.go b/core/services/routing/billing/gorm.go
new file mode 100644
index 000000000..d304af10a
--- /dev/null
+++ b/core/services/routing/billing/gorm.go
@@ -0,0 +1,111 @@
+package billing
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	"github.com/mudler/LocalAI/core/http/auth"
+	"github.com/mudler/xlog"
+	"gorm.io/gorm"
+)
+
+// gormBackend writes UsageRecord rows to a GORM-backed database (the
+// existing auth DB when --auth is enabled). It batches inserts every
+// flushInterval to amortize round-trips; pre-routing-module middleware
+// did the same with a private batcher — we keep the same cadence.
+type gormBackend struct {
+	db            *gorm.DB
+	flushInterval time.Duration
+	maxPending    int
+
+	mu      sync.Mutex
+	pending []*auth.UsageRecord
+
+	stopCh chan struct{}
+	doneCh chan struct{}
+}
+
+// NewGormBackend constructs a StatsBackend that persists records to db.
+// The returned backend launches a background flush goroutine; call
+// Close() to stop it. flushInterval ≤ 0 picks the prior 5s default;
+// maxPending ≤ 0 picks 5000.
+func NewGormBackend(db *gorm.DB, flushInterval time.Duration, maxPending int) StatsBackend {
+	if flushInterval <= 0 {
+		flushInterval = 5 * time.Second
+	}
+	if maxPending <= 0 {
+		maxPending = 5000
+	}
+	b := &gormBackend{
+		db:            db,
+		flushInterval: flushInterval,
+		maxPending:    maxPending,
+		stopCh:        make(chan struct{}),
+		doneCh:        make(chan struct{}),
+	}
+	go b.run()
+	return b
+}
+
+func (b *gormBackend) Record(_ context.Context, r *auth.UsageRecord) error {
+	b.mu.Lock()
+	b.pending = append(b.pending, r)
+	b.mu.Unlock()
+	return nil
+}
+
+func (b *gormBackend) Aggregate(_ context.Context, q AggregateQuery) ([]auth.UsageBucket, error) {
+	if q.UserID == "" {
+		return auth.GetAllUsage(b.db, q.Period, "")
+	}
+	return auth.GetUserUsage(b.db, q.UserID, q.Period)
+}
+
+func (b *gormBackend) Close() error {
+	select {
+	case <-b.stopCh:
+		// already stopped
+	default:
+		close(b.stopCh)
+	}
+	<-b.doneCh
+	return nil
+}
+
+func (b *gormBackend) run() {
+	defer close(b.doneCh)
+	ticker := time.NewTicker(b.flushInterval)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-b.stopCh:
+			b.flush()
+			return
+		case <-ticker.C:
+			b.flush()
+		}
+	}
+}
+
+func (b *gormBackend) flush() {
+	b.mu.Lock()
+	batch := b.pending
+	b.pending = nil
+	b.mu.Unlock()
+
+	if len(batch) == 0 {
+		return
+	}
+
+	if err := b.db.Create(&batch).Error; err != nil {
+		xlog.Error("failed to flush usage batch", "count", len(batch), "error", err)
+		// Re-queue with a cap to avoid unbounded growth on persistent DB
+		// failure (matches the prior behavior in core/http/middleware/usage.go).
+		b.mu.Lock()
+		if len(b.pending) < b.maxPending {
+			b.pending = append(batch, b.pending...)
+		}
+		b.mu.Unlock()
+	}
+}
diff --git a/core/services/routing/billing/inmem.go b/core/services/routing/billing/inmem.go
new file mode 100644
index 000000000..3be341c7b
--- /dev/null
+++ b/core/services/routing/billing/inmem.go
@@ -0,0 +1,157 @@
+package billing
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	"github.com/mudler/LocalAI/core/http/auth"
+)
+
+// memoryBackend keeps the most recent N records in a ring buffer. It is
+// the no-auth, no-DB fallback: a single user running LocalAI on a
+// laptop still gets live aggregation against this buffer until the
+// process exits. Records are not durable.
+//
+// Aggregation is computed by linear scan — fine because the ring is
+// bounded (default 50_000 records) and aggregation is rare (UI dashboard
+// poll, MCP tool calls). If the working set grows beyond what scan can
+// service in <100ms, the operator should enable auth+DB.
+type memoryBackend struct {
+	mu     sync.RWMutex
+	ring   []*auth.UsageRecord
+	cap    int
+	cursor int // next write position
+	full   bool
+}
+
+// NewMemoryBackend returns a StatsBackend backed by an in-process ring
+// buffer. capacity ≤ 0 uses 50_000.
+func NewMemoryBackend(capacity int) StatsBackend {
+	if capacity <= 0 {
+		capacity = 50_000
+	}
+	return &memoryBackend{
+		ring: make([]*auth.UsageRecord, capacity),
+		cap:  capacity,
+	}
+}
+
+func (b *memoryBackend) Record(_ context.Context, r *auth.UsageRecord) error {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.ring[b.cursor] = r
+	b.cursor++
+	if b.cursor == b.cap {
+		b.cursor = 0
+		b.full = true
+	}
+	return nil
+}
+
+func (b *memoryBackend) Aggregate(_ context.Context, q AggregateQuery) ([]auth.UsageBucket, error) {
+	since := periodStart(q.Period)
+	bucketWidth := bucketWidthFor(q.Period)
+	dateFmt := bucketFormatFor(q.Period)
+
+	type aggKey struct {
+		bucket   string
+		model    string
+		userID   string
+		userName string
+	}
+	agg := make(map[aggKey]*auth.UsageBucket)
+
+	b.mu.RLock()
+	defer b.mu.RUnlock()
+
+	scan := func(r *auth.UsageRecord) {
+		if r == nil {
+			return
+		}
+		if !since.IsZero() && r.CreatedAt.Before(since) {
+			return
+		}
+		if q.UserID != "" && r.UserID != q.UserID {
+			return
+		}
+		bucketTime := r.CreatedAt.Truncate(bucketWidth)
+		key := aggKey{
+			bucket:   bucketTime.Format(dateFmt),
+			model:    r.Model,
+			userID:   r.UserID,
+			userName: r.UserName,
+		}
+		entry, ok := agg[key]
+		if !ok {
+			entry = &auth.UsageBucket{
+				Bucket:   key.bucket,
+				Model:    key.model,
+				UserID:   key.userID,
+				UserName: key.userName,
+			}
+			agg[key] = entry
+		}
+		entry.PromptTokens += r.PromptTokens
+		entry.CompletionTokens += r.CompletionTokens
+		entry.TotalTokens += r.TotalTokens
+		entry.RequestCount++
+	}
+
+	if b.full {
+		for _, r := range b.ring {
+			scan(r)
+		}
+	} else {
+		for i := 0; i < b.cursor; i++ {
+			scan(b.ring[i])
+		}
+	}
+
+	out := make([]auth.UsageBucket, 0, len(agg))
+	for _, v := range agg {
+		out = append(out, *v)
+	}
+	return out, nil
+}
+
+func (b *memoryBackend) Close() error { return nil }
+
+// periodStart returns the lower bound of the time window for the
+// given period. Mirrors auth.periodToWindow but without GORM
+// dialector concerns.
+func periodStart(period string) time.Time {
+	now := time.Now()
+	switch period {
+	case "day":
+		return now.Add(-24 * time.Hour)
+	case "week":
+		return now.Add(-7 * 24 * time.Hour)
+	case "all":
+		return time.Time{}
+	default: // "month"
+		return now.Add(-30 * 24 * time.Hour)
+	}
+}
+
+func bucketWidthFor(period string) time.Duration {
+	switch period {
+	case "day":
+		return time.Hour
+	case "all":
+		return 30 * 24 * time.Hour
+	default: // week, month
+		return 24 * time.Hour
+	}
+}
+
+func bucketFormatFor(period string) string {
+	switch period {
+	case "day":
+		return "2006-01-02 15:00"
+	case "all":
+		return "2006-01"
+	default:
+		return "2006-01-02"
+	}
+}
diff --git a/core/services/routing/billing/inmem_test.go b/core/services/routing/billing/inmem_test.go
new file mode 100644
index 000000000..c630249a9
--- /dev/null
+++ b/core/services/routing/billing/inmem_test.go
@@ -0,0 +1,140 @@
+package billing
+
+import (
+	"context"
+	"time"
+
+	"github.com/mudler/LocalAI/core/http/auth"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("MemoryBackend", func() {
+	It("records and aggregates", func() {
+		ctx := context.Background()
+		b := NewMemoryBackend(0)
+		defer func() { _ = b.Close() }()
+
+		now := time.Now()
+		for i := 0; i < 5; i++ {
+			err := b.Record(ctx, &auth.UsageRecord{
+				UserID:           "u-1",
+				UserName:         "alice",
+				Model:            "qwen-7b",
+				Endpoint:         "/v1/chat/completions",
+				PromptTokens:     10,
+				CompletionTokens: 20,
+				TotalTokens:      30,
+				CreatedAt:        now,
+			})
+			Expect(err).NotTo(HaveOccurred(), "record")
+		}
+		for i := 0; i < 3; i++ {
+			err := b.Record(ctx, &auth.UsageRecord{
+				UserID:           "u-2",
+				UserName:         "bob",
+				Model:            "qwen-7b",
+				Endpoint:         "/v1/chat/completions",
+				PromptTokens:     7,
+				CompletionTokens: 13,
+				TotalTokens:      20,
+				CreatedAt:        now,
+			})
+			Expect(err).NotTo(HaveOccurred(), "record")
+		}
+
+		buckets, err := b.Aggregate(ctx, AggregateQuery{UserID: "u-1", Period: "month"})
+		Expect(err).NotTo(HaveOccurred(), "aggregate")
+		var promptTotal, reqTotal int64
+		for _, bk := range buckets {
+			Expect(bk.UserID).To(Equal("u-1"), "expected only u-1 buckets")
+			promptTotal += bk.PromptTokens
+			reqTotal += bk.RequestCount
+		}
+		Expect(promptTotal).To(Equal(int64(50)))
+		Expect(reqTotal).To(Equal(int64(5)))
+
+		all, err := b.Aggregate(ctx, AggregateQuery{Period: "month"})
+		Expect(err).NotTo(HaveOccurred(), "aggregate all")
+		var allPrompt, allReqs int64
+		for _, bk := range all {
+			allPrompt += bk.PromptTokens
+			allReqs += bk.RequestCount
+		}
+		Expect(allPrompt).To(Equal(int64(50 + 21)))
+		Expect(allReqs).To(Equal(int64(8)))
+	})
+
+	It("filters by period", func() {
+		ctx := context.Background()
+		b := NewMemoryBackend(0)
+		defer func() { _ = b.Close() }()
+
+		old := time.Now().Add(-48 * time.Hour)
+		recent := time.Now()
+
+		err := b.Record(ctx, &auth.UsageRecord{
+			UserID: "u", UserName: "u", Model: "m",
+			PromptTokens: 100, TotalTokens: 100, CreatedAt: old,
+		})
+		Expect(err).NotTo(HaveOccurred())
+		err = b.Record(ctx, &auth.UsageRecord{
+			UserID: "u", UserName: "u", Model: "m",
+			PromptTokens: 50, TotalTokens: 50, CreatedAt: recent,
+		})
+		Expect(err).NotTo(HaveOccurred())
+
+		dayBuckets, err := b.Aggregate(ctx, AggregateQuery{UserID: "u", Period: "day"})
+		Expect(err).NotTo(HaveOccurred())
+		var dayTotal int64
+		for _, bk := range dayBuckets {
+			dayTotal += bk.PromptTokens
+		}
+		Expect(dayTotal).To(Equal(int64(50)), "day window should only include the recent record")
+
+		monthBuckets, err := b.Aggregate(ctx, AggregateQuery{UserID: "u", Period: "month"})
+		Expect(err).NotTo(HaveOccurred())
+		var monthTotal int64
+		for _, bk := range monthBuckets {
+			monthTotal += bk.PromptTokens
+		}
+		Expect(monthTotal).To(Equal(int64(150)), "month window should include both records")
+	})
+
+	It("ring wraps", func() {
+		ctx := context.Background()
+		b := NewMemoryBackend(4) // tiny ring so we can observe wrap
+
+		for i := 0; i < 10; i++ {
+			err := b.Record(ctx, &auth.UsageRecord{
+				UserID:       "u",
+				UserName:     "u",
+				Model:        "m",
+				PromptTokens: 1,
+				TotalTokens:  1,
+				CreatedAt:    time.Now(),
+			})
+			Expect(err).NotTo(HaveOccurred())
+		}
+
+		buckets, err := b.Aggregate(ctx, AggregateQuery{UserID: "u", Period: "month"})
+		Expect(err).NotTo(HaveOccurred())
+		var total int64
+		for _, bk := range buckets {
+			total += bk.PromptTokens
+		}
+		Expect(total).To(Equal(int64(4)), "ring should keep last 4 records")
+	})
+})
+
+var _ = Describe("DisabledBackend", func() {
+	It("is a no-op", func() {
+		ctx := context.Background()
+		b := NewDisabledBackend()
+		Expect(b.Record(ctx, &auth.UsageRecord{UserID: "u"})).To(Succeed(), "disabled record should not error")
+		out, err := b.Aggregate(ctx, AggregateQuery{Period: "month"})
+		Expect(err).NotTo(HaveOccurred(), "disabled aggregate should not error")
+		Expect(out).To(BeNil(), "disabled aggregate should return nil")
+	})
+})
diff --git a/core/services/routing/billing/local_user.go b/core/services/routing/billing/local_user.go
new file mode 100644
index 000000000..c3ae1fa48
--- /dev/null
+++ b/core/services/routing/billing/local_user.go
@@ -0,0 +1,84 @@
+package billing
+
+import (
+	"crypto/rand"
+	"encoding/hex"
+	"errors"
+	"os"
+	"path/filepath"
+	"sync"
+
+	"github.com/mudler/LocalAI/core/http/auth"
+	"github.com/mudler/xlog"
+)
+
+// LocalUserName is the fixed display name used for the synthetic
+// no-auth user. Surfaces it in the dashboard so single-user installs
+// have a recognizable label rather than an opaque UUID.
+const LocalUserName = "local"
+
+// localUserIDFile is the basename, inside DataPath, where we persist
+// the synthetic user's UUID so it stays stable across restarts.
+const localUserIDFile = ".local_user_id"
+
+var (
+	localOnce sync.Once
+	localUser *auth.User
+)
+
+// LocalUser returns a process-singleton "local" user used by
+// UsageMiddleware when --auth is off. The user's ID is persisted to
+// dataPath so usage history aggregates correctly across restarts; if
+// dataPath is empty, a fresh random UUID is generated for this process
+// only and aggregation drops on restart (in-memory mode).
+//
+// Concurrency note: the singleton uses sync.Once, so calling LocalUser
+// from any goroutine is safe; the first call may briefly hit disk.
+func LocalUser(dataPath string) *auth.User {
+	localOnce.Do(func() {
+		id := loadOrGenerateLocalUserID(dataPath)
+		localUser = &auth.User{
+			ID:       id,
+			Name:     LocalUserName,
+			Email:    "",
+			Provider: auth.ProviderLocal,
+			Role:     "admin", // single-user box: the only user has full access
+			Status:   "active",
+		}
+	})
+	return localUser
+}
+
+func loadOrGenerateLocalUserID(dataPath string) string {
+	if dataPath != "" {
+		path := filepath.Join(dataPath, localUserIDFile)
+		if b, err := os.ReadFile(path); err == nil {
+			id := string(b)
+			if len(id) > 0 {
+				return id
+			}
+		} else if !errors.Is(err, os.ErrNotExist) {
+			xlog.Warn("failed to read local user id file; generating fresh", "path", path, "error", err)
+		}
+		id := newUUID()
+		// 0600: only the LocalAI process owner should read this. The file
+		// is just a stable identifier, not a credential, but we keep it
+		// tight by default.
+		if err := os.WriteFile(path, []byte(id), 0o600); err != nil {
+			xlog.Warn("failed to persist local user id; will regenerate next start", "path", path, "error", err)
+		}
+		return id
+	}
+	return newUUID()
+}
+
+func newUUID() string {
+	var b [16]byte
+	_, _ = rand.Read(b[:])
+	// Set version 4 + RFC 4122 variant bits so this round-trips through
+	// any UUID parser the rest of the codebase might use.
+	b[6] = (b[6] & 0x0f) | 0x40
+	b[8] = (b[8] & 0x3f) | 0x80
+	hexb := hex.EncodeToString(b[:])
+	return hexb[0:8] + "-" + hexb[8:12] + "-" + hexb[12:16] + "-" + hexb[16:20] + "-" + hexb[20:32]
+}
diff --git a/core/services/routing/billing/local_user_test.go b/core/services/routing/billing/local_user_test.go
new file mode 100644
index 000000000..6d80b8607
--- /dev/null
+++ b/core/services/routing/billing/local_user_test.go
@@ -0,0 +1,70 @@
+package billing
+
+import (
+	"os"
+	"path/filepath"
+	"sync"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("LocalUser", func() {
+	It("persists ID", func() {
+		// Reset the package-singleton sentinel so this test gets a fresh
+		// LocalUser call. Without this, other tests racing through LocalUser
+		// would freeze the value before we set DataPath.
+		resetLocalUserForTesting()
+
+		dir := GinkgoT().TempDir()
+		u1 := LocalUser(dir)
+		Expect(u1).NotTo(BeNil(), "LocalUser returned nil")
+		Expect(u1.ID).NotTo(BeEmpty(), "LocalUser must have a non-empty ID")
+		Expect(u1.Name).To(Equal(LocalUserName))
+
+		// File written?
+		idPath := filepath.Join(dir, localUserIDFile)
+		got, err := os.ReadFile(idPath)
+		Expect(err).NotTo(HaveOccurred(), "expected %s to exist", idPath)
+		Expect(string(got)).To(Equal(u1.ID))
+
+		// Singleton: subsequent calls return the same pointer.
+		u2 := LocalUser(dir)
+		Expect(u2).To(BeIdenticalTo(u1), "LocalUser returned a different instance on second call")
+	})
+
+	It("is stable across processes", func() {
+		resetLocalUserForTesting()
+		dir := GinkgoT().TempDir()
+
+		first := LocalUser(dir).ID
+
+		// Simulate process restart by clearing the singleton; the disk file
+		// must let us recover the same UUID.
+		resetLocalUserForTesting()
+
+		second := LocalUser(dir).ID
+		Expect(first).To(Equal(second), "local user id not stable across restart")
+	})
+
+	It("works with no data path", func() {
+		resetLocalUserForTesting()
+		u := LocalUser("")
+		Expect(u).NotTo(BeNil())
+		Expect(u.ID).NotTo(BeEmpty(), "LocalUser with empty data path must still produce a usable user")
+	})
+})
+
+// resetLocalUserForTesting clears the package singleton so a test can
+// rebind LocalUser to a fresh state. Tests must serialize on a mutex
+// because Go tests within a package run concurrently within the same
+// goroutine pool — LocalUser's sync.Once is a global, and these tests
+// deliberately reach past it.
+var testResetMu sync.Mutex
+
+func resetLocalUserForTesting() {
+	testResetMu.Lock()
+	defer testResetMu.Unlock()
+	localOnce = sync.Once{}
+	localUser = nil
+}
diff --git a/core/services/routing/billing/prom.go b/core/services/routing/billing/prom.go
new file mode 100644
index 000000000..352f699ed
--- /dev/null
+++ b/core/services/routing/billing/prom.go
@@ -0,0 +1,215 @@
+package billing
+
+import (
+	"context"
+	"sync"
+
+	"github.com/mudler/LocalAI/core/http/auth"
+	"github.com/mudler/LocalAI/core/services/routing/contract"
+	"github.com/mudler/xlog"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/metric"
+)
+
+// Recorder is the single increment site for billing data. It writes
+// the same record to (a) the StatsBackend (durable / queryable) and
+// (b) Prometheus counters (live ops). Splitting these would invite
+// drift; this type guarantees both fire in lockstep from one call.
+//
+// The plan calls out a DB-vs-Prom drift assertion. With a single
+// increment site, drift can only come from StatsBackend.Record returning
+// without persisting (e.g., the DB flusher dropping batches under load
+// — see gormBackend.flush). We log+invariant-fail in that path; a future
+// drift goroutine compares Prom to a SUM(total_tokens) checkpoint as
+// extra defense in depth.
+type Recorder struct {
+	backend StatsBackend
+
+	tokensCounter metric.Int64Counter
+	costCounter   metric.Float64Counter
+	requestsCount metric.Int64Counter
+}
+
+var (
+	metricsOnce              sync.Once
+	sharedTokensCounter      metric.Int64Counter
+	sharedCostCounter        metric.Float64Counter
+	sharedRequestsCount      metric.Int64Counter
+	sharedUnrecordedCounter  metric.Int64Counter
+
+	// configuredMeter is the meter handed in by the caller (typically
+	// monitoring.LocalAIMetricsService). Setting it before initMetrics
+	// runs makes sure billing's counters land on the same Prom-backed
+	// MeterProvider that exports /metrics. Without this we relied on
+	// otel.SetMeterProvider race ordering, which silently dropped
+	// counters when initMetrics ran first.
+	configuredMeterMu sync.Mutex
+	configuredMeter   metric.Meter
+)
+
+// SetMeter wires the meter from monitoring.LocalAIMetricsService (or any
+// caller-controlled MeterProvider) before any Recorder is constructed.
+// Call from application startup — initMetrics uses this meter rather than
+// the OTel global the moment it's set.
+func SetMeter(m metric.Meter) {
+	configuredMeterMu.Lock()
+	defer configuredMeterMu.Unlock()
+	configuredMeter = m
+}
+
+func resolveMeter() metric.Meter {
+	configuredMeterMu.Lock()
+	m := configuredMeter
+	configuredMeterMu.Unlock()
+	if m != nil {
+		return m
+	}
+	return otel.Meter("github.com/mudler/LocalAI/core/services/routing/billing")
+}
+
+func initMetrics() {
+	metricsOnce.Do(func() {
+		meter := resolveMeter()
+		var err error
+		sharedTokensCounter, err = meter.Int64Counter(
+			"localai_tokens_total",
+			metric.WithDescription("Cumulative tokens accounted, labeled by user, served_model, kind"),
+		)
+		if err != nil {
+			xlog.Error("billing: failed to create tokens counter", "error", err)
+		}
+		sharedCostCounter, err = meter.Float64Counter(
+			"localai_cost_usd_total",
+			metric.WithDescription("Cumulative USD cost accounted, labeled by user, served_model"),
+		)
+		if err != nil {
+			xlog.Error("billing: failed to create cost counter", "error", err)
+		}
+		sharedRequestsCount, err = meter.Int64Counter(
+			"localai_billed_requests_total",
+			metric.WithDescription("Cumulative billed requests, labeled by user, served_model, endpoint"),
+		)
+		if err != nil {
+			xlog.Error("billing: failed to create requests counter", "error", err)
+		}
+		sharedUnrecordedCounter, err = meter.Int64Counter(
+			"localai_usage_unrecorded_total",
+			metric.WithDescription("Requests that completed but produced no UsageRecord, labeled by endpoint and reason. A non-zero rate signals a billing gap (handler didn't stamp, body lacked usage, no user resolvable)."),
+		)
+		if err != nil {
+			xlog.Error("billing: failed to create unrecorded counter", "error", err)
+		}
+	})
+}
+
+// CountUnrecorded ticks the localai_usage_unrecorded_total counter so that
+// silent billing misses are observable. UsageMiddleware calls this whenever
+// a request completes without producing a UsageRecord. Reasons should be
+// short, stable strings ("no_handler_stamp", "no_user", "parse_failed", …)
+// — never user-supplied content.
+func CountUnrecorded(ctx context.Context, endpoint, reason string) {
+	initMetrics()
+	if sharedUnrecordedCounter == nil {
+		return
+	}
+	sharedUnrecordedCounter.Add(ctx, 1,
+		metric.WithAttributes(
+			attribute.String("endpoint", endpoint),
+			attribute.String("reason", reason),
+		))
+}
+
+// NewRecorder returns a Recorder that fans out to the given StatsBackend
+// and to Prometheus. The Prom counters are package-singletons so that
+// multiple Recorders (e.g., reusing the same metrics across rebuilds)
+// don't double-register identical metric names.
+func NewRecorder(backend StatsBackend) *Recorder {
+	initMetrics()
+	return &Recorder{
+		backend:       backend,
+		tokensCounter: sharedTokensCounter,
+		costCounter:   sharedCostCounter,
+		requestsCount: sharedRequestsCount,
+	}
+}
+
+// Record asserts billing invariants, persists the record, and emits the
+// matching Prom counters. r must not be mutated by the caller after
+// this call; the backend takes ownership.
+func (rec *Recorder) Record(ctx context.Context, r *auth.UsageRecord) error {
+	rec.assertInvariants(r)
+
+	if err := rec.backend.Record(ctx, r); err != nil {
+		return err
+	}
+
+	if rec.tokensCounter != nil {
+		userAttr := attribute.String("user", r.UserID)
+		modelAttr := attribute.String("served_model", servedModelOf(r))
+		rec.tokensCounter.Add(ctx, r.PromptTokens,
+			metric.WithAttributes(userAttr, modelAttr, attribute.String("kind", "prompt")))
+		rec.tokensCounter.Add(ctx, r.CompletionTokens,
+			metric.WithAttributes(userAttr, modelAttr, attribute.String("kind", "completion")))
+	}
+	if rec.costCounter != nil && r.PricingVersionID != "" {
+		rec.costCounter.Add(ctx, r.CostUSD,
+			metric.WithAttributes(
+				attribute.String("user", r.UserID),
+				attribute.String("served_model", servedModelOf(r)),
+			))
+	}
+	if rec.requestsCount != nil {
+		rec.requestsCount.Add(ctx, 1,
+			metric.WithAttributes(
+				attribute.String("user", r.UserID),
+				attribute.String("served_model", servedModelOf(r)),
+				attribute.String("endpoint", r.Endpoint),
+			))
+	}
+	return nil
+}
+
+// Aggregate is a convenience pass-through.
+func (rec *Recorder) Aggregate(ctx context.Context, q AggregateQuery) ([]auth.UsageBucket, error) {
+	return rec.backend.Aggregate(ctx, q)
+}
+
+// Close flushes the underlying backend.
+func (rec *Recorder) Close() error { return rec.backend.Close() }
+
+func (rec *Recorder) assertInvariants(r *auth.UsageRecord) {
+	contract.Invariant(
+		"billing.user_id_present",
+		r.UserID != "",
+		"endpoint", r.Endpoint, "model", r.Model,
+	)
+	// PII can only shrink the prompt; a post-filter count above pre-filter
+	// would mean the filter expanded text, which is impossible by design.
+	// Both are zero on legacy paths that don't populate the new fields,
+	// so the assertion only fires when one side is set.
+	if r.PreFilterPromptTokens > 0 || r.PostFilterPromptTokens > 0 {
+		contract.Invariant(
+			"billing.prefilter_ge_postfilter",
+			r.PreFilterPromptTokens >= r.PostFilterPromptTokens,
+			"pre", r.PreFilterPromptTokens, "post", r.PostFilterPromptTokens,
+			"user", r.UserID, "model", r.Model,
+		)
+	}
+	// CostUSD without a pricing version is a data-integrity bug: we'd
+	// be unable to retroactively recompute or audit the rate used.
+	if r.CostUSD != 0 {
+		contract.Invariant(
+			"billing.cost_requires_pricing_version",
+			r.PricingVersionID != "",
+			"cost", r.CostUSD, "model", r.Model,
+		)
+	}
+}
+
+func servedModelOf(r *auth.UsageRecord) string {
+	if r.ServedModel != "" {
+		return r.ServedModel
+	}
+	return r.Model
+}
diff --git a/core/services/routing/billing/recorder_test.go b/core/services/routing/billing/recorder_test.go
new file mode 100644
index 000000000..f75e62f26
--- /dev/null
+++ b/core/services/routing/billing/recorder_test.go
@@ -0,0 +1,82 @@
+package billing
+
+import (
+	"context"
+	"sync"
+
+	"github.com/mudler/LocalAI/core/http/auth"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// fakeBackend is a minimal StatsBackend that records what it received
+// without actually writing anywhere. Lets the Recorder be tested in
+// isolation from GORM/SQLite/in-memory specifics.
+type fakeBackend struct {
+	mu      sync.Mutex
+	records []*auth.UsageRecord
+}
+
+func (f *fakeBackend) Record(_ context.Context, r *auth.UsageRecord) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.records = append(f.records, r)
+	return nil
+}
+func (f *fakeBackend) Aggregate(_ context.Context, _ AggregateQuery) ([]auth.UsageBucket, error) {
+	return nil, nil
+}
+func (f *fakeBackend) Close() error { return nil }
+
+var _ = Describe("Recorder", func() {
+	It("forwards to backend", func() {
+		fb := &fakeBackend{}
+		rec := NewRecorder(fb)
+
+		r := &auth.UsageRecord{
+			UserID:           "u-1",
+			UserName:         "alice",
+			Model:            "qwen-7b",
+			Endpoint:         "/v1/chat/completions",
+			PromptTokens:     10,
+			CompletionTokens: 5,
+			TotalTokens:      15,
+		}
+		Expect(rec.Record(context.Background(), r)).To(Succeed(), "recorder.Record")
+
+		fb.mu.Lock()
+		defer fb.mu.Unlock()
+		Expect(fb.records).To(HaveLen(1))
+		Expect(fb.records[0]).To(BeIdenticalTo(r), "recorder must pass the record through without copying")
+	})
+
+	// RecorderInvariantsPassWhenZero ensures legacy paths that don't
+	// populate the routing-extension fields still record successfully —
+	// the invariants only fire when a partial routing fact is set.
+	It("invariants pass when zero", func() {
+		rec := NewRecorder(&fakeBackend{})
+		err := rec.Record(context.Background(), &auth.UsageRecord{
+			UserID: "u-1", Model: "qwen-7b", Endpoint: "/v1/chat/completions",
+		})
+		Expect(err).NotTo(HaveOccurred(), "zero routing fields must record cleanly")
+	})
+
+	// RecorderInvariantsDetectShrinkViolation: setting both pre/post
+	// prompt tokens with post > pre (impossible — PII can only shrink the
+	// prompt) should trigger the contract assertion. In a non-strict build
+	// the call still succeeds (logs + counter) but a routing_strict build
+	// would panic. We assert the call returns nil here; the strict-build
+	// behavior is covered by an integration test that compiles with the
+	// tag.
+	It("invariants detect shrink violation", func() {
+		rec := NewRecorder(&fakeBackend{})
+		err := rec.Record(context.Background(), &auth.UsageRecord{
+			UserID:                 "u-1",
+			Model:                  "qwen-7b",
+			PreFilterPromptTokens:  5,
+			PostFilterPromptTokens: 10, // post > pre is impossible by design
+		})
+		Expect(err).NotTo(HaveOccurred(), "non-strict build must not error on invariant violation")
+	})
+})
diff --git a/core/services/routing/contract/contract.go b/core/services/routing/contract/contract.go
new file mode 100644
index 000000000..3e2d7605a
--- /dev/null
+++ b/core/services/routing/contract/contract.go
@@ -0,0 +1,55 @@
+// Package contract provides runtime invariant assertions for the routing
+// module. Each Invariant call logs at error level via xlog, increments a
+// Prometheus counter, and (under build tag routing_strict) panics so test
+// runs surface violations as test failures.
+//
+// The routing subsystems (billing, router, pii, proxy, admission) all
+// publish invariants through this single package so that observability —
+// dashboards, alerts, post-mortem analysis — joins on a single counter
+// name regardless of which subsystem fired.
+package contract
+
+import (
+	"context"
+
+	"github.com/mudler/xlog"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/metric"
+)
+
+var violationCounter metric.Int64Counter
+
+func init() {
+	meter := otel.Meter("github.com/mudler/LocalAI/core/services/routing")
+	c, err := meter.Int64Counter(
+		"localai_invariant_violation_total",
+		metric.WithDescription("Routing-module runtime invariant violations, labeled by name"),
+	)
+	if err != nil {
+		// OTel API never returns an error in practice for a simple counter;
+		// log and fall back to a nil counter (Add becomes a no-op).
+		xlog.Error("failed to create invariant violation counter", "error", err)
+		return
+	}
+	violationCounter = c
+}
+
+// Invariant asserts that cond is true. If false, it logs the violation
+// and increments localai_invariant_violation_total{name=name}. Use
+// fields for structured context (e.g., "model", "qwen-7b", "user", uid).
+//
+// In a build with -tags=routing_strict, a violation panics — meant for
+// test suites and nightly E2E runs to surface drift. Production builds
+// degrade silently into a metric so a single bad request does not crash
+// the server.
+func Invariant(name string, cond bool, fields ...any) {
+	if cond {
+		return
+	}
+	xlog.Error("routing invariant violated", append([]any{"name", name}, fields...)...)
+	if violationCounter != nil {
+		violationCounter.Add(context.Background(), 1, metric.WithAttributes(attribute.String("name", name)))
+	}
+	panicIfStrict(name, fields...)
+}
diff --git a/core/services/routing/contract/strict_off.go b/core/services/routing/contract/strict_off.go
new file mode 100644
index 000000000..f0f70829f
--- /dev/null
+++ b/core/services/routing/contract/strict_off.go
@@ -0,0 +1,5 @@
+//go:build !routing_strict
+
+package contract
+
+func panicIfStrict(name string, fields ...any) {}
diff --git a/core/services/routing/contract/strict_on.go b/core/services/routing/contract/strict_on.go
new file mode 100644
index 000000000..7ea6e96e3
--- /dev/null
+++ b/core/services/routing/contract/strict_on.go
@@ -0,0 +1,9 @@
+//go:build routing_strict
+
+package contract
+
+import "fmt"
+
+func panicIfStrict(name string, fields ...any) {
+	panic(fmt.Sprintf("routing invariant violated under -tags=routing_strict: %s %v", name, fields))
+}
diff --git a/core/services/routing/pii/config.go b/core/services/routing/pii/config.go
new file mode 100644
index 000000000..64f709675
--- /dev/null
+++ b/core/services/routing/pii/config.go
@@ -0,0 +1,71 @@
+package pii
+
+import (
+	"fmt"
+	"os"
+
+	"gopkg.in/yaml.v3"
+)
+
+// FileConfig is the on-disk schema for pii.yaml. Each Pattern entry
+// overrides the matching default by ID; missing fields fall back to
+// the default. Unknown IDs are rejected at load time so an admin who
+// fat-fingers a pattern name gets a clear error rather than a silent
+// no-op.
+type FileConfig struct {
+	Patterns []FilePattern `yaml:"patterns"`
+}
+
+type FilePattern struct {
+	ID     string `yaml:"id"`
+	Action Action `yaml:"action"`
+}
+
+// LoadConfig reads pii.yaml from path and merges it on top of
+// DefaultPatterns(). path == "" returns the defaults compiled and
+// ready. The returned slice is already Compile()'d, so callers can
+// pass it straight to NewRedactor.
+func LoadConfig(path string) ([]Pattern, error) {
+	defaults := DefaultPatterns()
+	if path == "" {
+		return Compile(defaults)
+	}
+
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		return nil, fmt.Errorf("pii: read config %q: %w", path, err)
+	}
+	var cfg FileConfig
+	if err := yaml.Unmarshal(raw, &cfg); err != nil {
+		return nil, fmt.Errorf("pii: parse config %q: %w", path, err)
+	}
+
+	overrides := make(map[string]Action, len(cfg.Patterns))
+	known := make(map[string]bool, len(defaults))
+	for _, d := range defaults {
+		known[d.ID] = true
+	}
+	for _, p := range cfg.Patterns {
+		if !known[p.ID] {
+			return nil, fmt.Errorf("pii: unknown pattern id %q in %q", p.ID, path)
+		}
+		if p.Action == "" {
+			continue
+		}
+		switch p.Action {
+		case ActionMask, ActionBlock, ActionRouteLocal:
+			overrides[p.ID] = p.Action
+		default:
+			return nil, fmt.Errorf("pii: invalid action %q for pattern %q", p.Action, p.ID)
+		}
+	}
+
+	merged := make([]Pattern, len(defaults))
+	for i, d := range defaults {
+		if a, ok := overrides[d.ID]; ok {
+			d.Action = a
+		}
+		merged[i] = d
+	}
+	return Compile(merged)
+}
diff --git a/core/services/routing/pii/config_test.go b/core/services/routing/pii/config_test.go
new file mode 100644
index 000000000..650b804f0
--- /dev/null
+++ b/core/services/routing/pii/config_test.go
@@ -0,0 +1,56 @@
+package pii
+
+import (
+	"os"
+	"path/filepath"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("LoadConfig", func() {
+	It("returns defaults when no path given", func() {
+		patterns, err := LoadConfig("")
+		Expect(err).NotTo(HaveOccurred())
+		Expect(patterns).To(HaveLen(len(DefaultPatterns())))
+	})
+
+	It("overrides action", func() {
+		dir := GinkgoT().TempDir()
+		path := filepath.Join(dir, "pii.yaml")
+		body := []byte(`patterns:
+  - id: email
+    action: block
+  - id: ssn
+    action: route_local
+`)
+		Expect(os.WriteFile(path, body, 0o600)).To(Succeed())
+		patterns, err := LoadConfig(path)
+		Expect(err).NotTo(HaveOccurred())
+
+		got := map[string]Action{}
+		for _, p := range patterns {
+			got[p.ID] = p.Action
+		}
+		Expect(got["email"]).To(Equal(ActionBlock))
+		Expect(got["ssn"]).To(Equal(ActionRouteLocal))
+		// Unmentioned patterns keep their default action.
+		Expect(got["credit_card"]).To(Equal(ActionMask), "credit_card default action lost")
+	})
+
+	It("rejects unknown id", func() {
+		dir := GinkgoT().TempDir()
+		path := filepath.Join(dir, "pii.yaml")
+		Expect(os.WriteFile(path, []byte("patterns:\n  - id: nonsense\n    action: mask\n"), 0o600)).To(Succeed())
+		_, err := LoadConfig(path)
+		Expect(err).To(HaveOccurred(), "expected error on unknown pattern id")
+	})
+
+	It("rejects invalid action", func() {
+		dir := GinkgoT().TempDir()
+		path := filepath.Join(dir, "pii.yaml")
+		Expect(os.WriteFile(path, []byte("patterns:\n  - id: email\n    action: lolwhat\n"), 0o600)).To(Succeed())
+		_, err := LoadConfig(path)
+		Expect(err).To(HaveOccurred(), "expected error on invalid action")
+	})
+})
diff --git a/core/services/routing/pii/middleware.go b/core/services/routing/pii/middleware.go
new file mode 100644
index 000000000..0994c32ba
--- /dev/null
+++ b/core/services/routing/pii/middleware.go
@@ -0,0 +1,260 @@
+package pii
+
+import (
+	"context"
+	"crypto/rand"
+	"encoding/hex"
+	"net/http"
+	"time"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/http/auth"
+	"github.com/mudler/LocalAI/core/services/routing/contract"
+	"github.com/mudler/xlog"
+)
+
+// Echo context keys this middleware reads from / writes to. The string
+// values must match the constants in core/http/middleware/context_keys.go;
+// kept in sync by hand because echoing constants across packages would
+// drag the http/middleware package into pii's import graph and create
+// a cycle (http/middleware will import this one).
+const (
+	ctxKeyCorrelationID    = "routing.correlation_id"
+	ctxKeyPIIEventID       = "routing.pii_event_id"
+	ctxKeyLocalOnly        = "routing.local_only"
+	// Must match the constants in core/http/middleware/request.go.
+	// Echoing them across packages would create an import cycle
+	// (http/middleware imports this package). Drift is caught by
+	// integration tests against the chat route.
+	ctxKeyParsedRequest    = "LOCALAI_REQUEST"
+	ctxKeyModelConfig      = "MODEL_CONFIG"
+)
+
+// ModelPIIConfig is the duck-typed view this middleware needs of the
+// per-model PII configuration carried on the echo context. *config.ModelConfig
+// satisfies it via PIIIsEnabled / PIIPatternOverrides; the indirection
+// keeps the pii package from importing core/config.
+//
+// Consumers of the override map: the action returned from PIIPatternOverrides
+// is the raw YAML string (e.g. "block"). Validation against the canonical
+// ActionMask/Block/RouteLocal constants happens here, so a typo in a model
+// YAML logs and is ignored rather than panicking.
+type ModelPIIConfig interface {
+	PIIIsEnabled() bool
+	PIIPatternOverrides() map[string]string
+}
+
+// ScannedText is one piece of user text from the request. Index is
+// opaque to the middleware — the Adapter implementation uses it to
+// put the redacted version back in the right place.
+type ScannedText struct {
+	Index int
+	Text  string
+}
+
+// Adapter pulls scannable text out of a parsed request and writes
+// redacted text back. Provided as a per-API-shape function rather
+// than an interface on the request type so the schema package does
+// not have to depend on pii. Each route registration passes the
+// adapter that knows its request format.
+//
+// The middleware calls Scan once per request and Apply once with
+// every span the redactor returned. updates are guaranteed to share
+// indices the adapter previously returned from Scan; the adapter
+// must not assume input order matches scan order.
+type Adapter struct {
+	Scan  func(parsed any) []ScannedText
+	Apply func(parsed any, updates []ScannedText)
+}
+
+// RequestMiddleware applies the regex PII tier to incoming chat
+// requests. If the parsed request is not a MessageScanner (e.g.,
+// non-chat endpoints registered against the same group later), the
+// middleware passes through.
+//
+//   - On match with action=block: the request is rejected with 400 and
+//     a PIIEvent is recorded. The matched value is never echoed back
+//     to the client.
+//   - On match with action=mask: the redacted text replaces the
+//     original on the parsed request. PIIEvents are recorded.
+//   - On match with action=route_local: the original text is left
+//     intact, but the echo context is annotated so the (future) router
+//     middleware refuses cloud-proxy candidates.
+//
+// recorder is the Recorder on which to record events; nil disables
+// recording (the redaction still happens). fallbackUser supplies the
+// no-auth identity. The middleware writes ctxKeyPIIEventID on the echo
+// context so the usage middleware can later cross-reference the event
+// with the UsageRecord.
+func RequestMiddleware(redactor *Redactor, store EventStore, adapter Adapter, fallbackUser *auth.User) echo.MiddlewareFunc {
+	return func(next echo.HandlerFunc) echo.HandlerFunc {
+		return func(c echo.Context) error {
+			if redactor == nil || len(redactor.Patterns()) == 0 || adapter.Scan == nil {
+				return next(c)
+			}
+
+			// Per-model gating: redaction is opt-in per model. If the
+			// resolved config disables PII for this model (the default
+			// for non-proxy backends), pass through immediately. We do
+			// this before parsing the request so a disabled model
+			// doesn't pay the regex scan cost.
+			if cfg, ok := c.Get(ctxKeyModelConfig).(ModelPIIConfig); ok {
+				if !cfg.PIIIsEnabled() {
+					return next(c)
+				}
+			} else {
+				// No ModelPIIConfig on context → fail-closed: skip
+				// redaction. This protects routes that wire the
+				// middleware before SetModelAndConfig runs (or non-chat
+				// routes that don't carry a model). The middleware was
+				// previously fail-open, applying the global redactor
+				// unconditionally; the new contract is per-model
+				// opt-in, and a missing model is treated as disabled.
+				return next(c)
+			}
+
+			parsed := c.Get(ctxKeyParsedRequest)
+			if parsed == nil {
+				return next(c)
+			}
+
+			user := auth.GetUser(c)
+			if user == nil {
+				user = fallbackUser
+			}
+			userID := ""
+			if user != nil {
+				userID = user.ID
+			}
+			correlationID, _ := c.Get(ctxKeyCorrelationID).(string)
+
+			// Resolve per-model action overrides once per request. The
+			// raw map is YAML strings; convert to the typed Action set
+			// and silently drop unknown values rather than failing the
+			// request — model YAML typos shouldn't take chat down.
+			var overrides map[string]Action
+			if cfg, ok := c.Get(ctxKeyModelConfig).(ModelPIIConfig); ok {
+				if raw := cfg.PIIPatternOverrides(); len(raw) > 0 {
+					overrides = make(map[string]Action, len(raw))
+					for id, action := range raw {
+						switch Action(action) {
+						case ActionMask, ActionBlock, ActionRouteLocal:
+							overrides[id] = Action(action)
+						default:
+							xlog.Warn("pii: ignoring unknown action in per-model override",
+								"pattern", id, "action", action)
+						}
+					}
+				}
+			}
+
+			texts := adapter.Scan(parsed)
+			updates := make([]ScannedText, 0, len(texts))
+			var blocked bool
+			var localOnly bool
+			var firstEventID string
+
+			for _, st := range texts {
+				if st.Text == "" {
+					continue
+				}
+				res := redactor.RedactWithOverrides(st.Text, overrides)
+				if len(res.Spans) == 0 {
+					continue
+				}
+
+				// Persist one event per span so admins can see exactly
+				// which patterns fired in which positions. The action
+				// recorded is the resolved one (after override), so the
+				// events log reflects what actually happened to the
+				// request, not the global default.
+				for _, span := range res.Spans {
+					action := actionForSpan(redactor.Patterns(), span.Pattern, overrides)
+					ev := PIIEvent{
+						ID:            newEventID(),
+						CorrelationID: correlationID,
+						UserID:        userID,
+						Direction:     DirectionIn,
+						PatternID:     span.Pattern,
+						ByteOffset:    span.Start,
+						Length:        span.End - span.Start,
+						HashPrefix:    span.HashPrefix,
+						Action:        action,
+						CreatedAt:     time.Now().UTC(),
+					}
+					if firstEventID == "" {
+						firstEventID = ev.ID
+					}
+					if store != nil {
+						if err := store.Record(context.Background(), ev); err != nil {
+							xlog.Error("pii: failed to record event", "error", err, "pattern", span.Pattern)
+						}
+					}
+					// Contract: every span must produce an event.
+					contract.Invariant(
+						"pii.event_per_span",
+						span.Pattern != "" && ev.PatternID != "",
+						"correlation", correlationID, "pattern", span.Pattern,
+					)
+				}
+
+				if res.Blocked {
+					blocked = true
+				}
+				if res.LocalOnly {
+					localOnly = true
+				}
+				updates = append(updates, ScannedText{Index: st.Index, Text: res.Redacted})
+			}
+
+			if blocked {
+				return c.JSON(http.StatusBadRequest, map[string]any{
+					"error": map[string]string{
+						"message": "request blocked by content policy (sensitive data detected)",
+						"type":    "pii_blocked",
+					},
+					"correlation_id": correlationID,
+					"pii_event_id":   firstEventID,
+				})
+			}
+
+			if len(updates) > 0 && adapter.Apply != nil {
+				adapter.Apply(parsed, updates)
+			}
+			if firstEventID != "" {
+				c.Set(ctxKeyPIIEventID, firstEventID)
+			}
+			if localOnly {
+				c.Set(ctxKeyLocalOnly, true)
+			}
+
+			return next(c)
+		}
+	}
+}
+
+func actionForPattern(patterns []Pattern, id string) Action {
+	for _, p := range patterns {
+		if p.ID == id {
+			return p.Action
+		}
+	}
+	return ActionMask
+}
+
+// actionForSpan returns the resolved action for a span, preferring a
+// per-request override over the pattern's stored action. Used so the
+// PIIEvent log reflects the action that actually fired (e.g., a model
+// upgraded email from mask to block — the event row says "block").
+func actionForSpan(patterns []Pattern, id string, overrides map[string]Action) Action {
+	if action, ok := overrides[id]; ok {
+		return action
+	}
+	return actionForPattern(patterns, id)
+}
+
+func newEventID() string {
+	var b [12]byte
+	_, _ = rand.Read(b[:])
+	return "pii_" + hex.EncodeToString(b[:])
+}
diff --git a/core/services/routing/pii/middleware_test.go b/core/services/routing/pii/middleware_test.go
new file mode 100644
index 000000000..d3bbbb2e7
--- /dev/null
+++ b/core/services/routing/pii/middleware_test.go
@@ -0,0 +1,309 @@
+package pii
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/http/auth"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// fakeRequest is the simplest possible parsed-request shape: a list of
+// strings that the adapter scans and writes back. Lets us drive the
+// middleware without dragging the real schema package in.
+type fakeRequest struct {
+	Messages []string
+}
+
+func fakeAdapter() Adapter {
+	return Adapter{
+		Scan: func(parsed any) []ScannedText {
+			r, ok := parsed.(*fakeRequest)
+			if !ok {
+				return nil
+			}
+			out := make([]ScannedText, len(r.Messages))
+			for i, m := range r.Messages {
+				out[i] = ScannedText{Index: i, Text: m}
+			}
+			return out
+		},
+		Apply: func(parsed any, updates []ScannedText) {
+			r, ok := parsed.(*fakeRequest)
+			if !ok {
+				return
+			}
+			for _, u := range updates {
+				r.Messages[u.Index] = u.Text
+			}
+		},
+	}
+}
+
+func setRequestOnContext(req *fakeRequest) echo.MiddlewareFunc {
+	return func(next echo.HandlerFunc) echo.HandlerFunc {
+		return func(c echo.Context) error {
+			c.Set(ctxKeyParsedRequest, req)
+			return next(c)
+		}
+	}
+}
+
+// fakeModelPIIConfig satisfies the duck-typed ModelPIIConfig interface
+// the middleware expects on the echo context. The real implementation
+// lives on *config.ModelConfig; using a fake here keeps these tests
+// out of the core/config import graph.
+type fakeModelPIIConfig struct {
+	enabled   bool
+	overrides map[string]string
+}
+
+func (f fakeModelPIIConfig) PIIIsEnabled() bool                     { return f.enabled }
+func (f fakeModelPIIConfig) PIIPatternOverrides() map[string]string { return f.overrides }
+
+// withModelConfig wires a ModelPIIConfig onto the context so the
+// middleware's per-model gate doesn't fail-closed during tests. Pass
+// enabled=true for the default test path; explicit-false tests should
+// use the gating spec further down instead.
+func withModelConfig(cfg fakeModelPIIConfig) echo.MiddlewareFunc {
+	return func(next echo.HandlerFunc) echo.HandlerFunc {
+		return func(c echo.Context) error {
+			c.Set(ctxKeyModelConfig, cfg)
+			return next(c)
+		}
+	}
+}
+
+func newTestRedactor(ids ...string) *Redactor {
+	patterns, err := Compile(pick(DefaultPatterns(), ids))
+	ExpectWithOffset(1, err).NotTo(HaveOccurred(), "compile")
+	return NewRedactor(patterns)
+}
+
+var _ = Describe("RequestMiddleware", func() {
+	It("masks email", func() {
+		red := newTestRedactor("email")
+		store := NewMemoryEventStore(0)
+		defer func() { _ = store.Close() }()
+		user := &auth.User{ID: "user-1", Name: "alice"}
+
+		body := &fakeRequest{Messages: []string{"contact me at alice@example.com"}}
+		mw := RequestMiddleware(red, store, fakeAdapter(), nil)
+
+		e := echo.New()
+		e.POST("/chat", func(c echo.Context) error {
+			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
+		}, setRequestOnContext(body), withModelConfig(fakeModelPIIConfig{enabled: true}), mw, func(next echo.HandlerFunc) echo.HandlerFunc {
+			// Inject the user as if upstream auth ran.
+			return func(c echo.Context) error {
+				c.Set("auth_user", user)
+				return next(c)
+			}
+		})
+
+		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusOK), "body=%s", w.Body.String())
+		Expect(body.Messages[0]).NotTo(ContainSubstring("alice@example.com"), "request body should be redacted in place")
+		Expect(body.Messages[0]).To(ContainSubstring("[REDACTED:email]"))
+
+		events, err := store.List(context.Background(), ListQuery{Limit: 100})
+		Expect(err).NotTo(HaveOccurred(), "list events")
+		Expect(events).To(HaveLen(1))
+		Expect(events[0].PatternID).To(Equal("email"))
+		Expect(events[0].Direction).To(Equal(DirectionIn))
+	})
+
+	It("blocks api key", func() {
+		red := newTestRedactor("api_key_prefix")
+		store := NewMemoryEventStore(0)
+		defer func() { _ = store.Close() }()
+
+		body := &fakeRequest{Messages: []string{"my key is sk-abcdefghijklmnopqrstuvwxyz0123456789"}}
+		mw := RequestMiddleware(red, store, fakeAdapter(), nil)
+
+		e := echo.New()
+		handlerCalled := false
+		e.POST("/chat", func(c echo.Context) error {
+			handlerCalled = true
+			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
+		}, setRequestOnContext(body), withModelConfig(fakeModelPIIConfig{enabled: true}), mw)
+
+		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusBadRequest), "expected 400 on block; body=%s", w.Body.String())
+		Expect(handlerCalled).To(BeFalse(), "handler must not run when request is blocked")
+		// Ensure the matched value never appears in the response body.
+		Expect(w.Body.String()).NotTo(ContainSubstring("abcdefghijklmnopqrstuvwxyz0123456789"), "blocked response leaks the matched value")
+
+		var resp map[string]any
+		Expect(json.Unmarshal(w.Body.Bytes(), &resp)).To(Succeed())
+		errBlock, ok := resp["error"].(map[string]any)
+		Expect(ok).To(BeTrue())
+		Expect(errBlock["type"]).To(Equal("pii_blocked"))
+	})
+
+	It("route_local sets context flag", func() {
+		patterns, _ := Compile([]Pattern{{
+			ID: "email", Description: "Email", Action: ActionRouteLocal, MaxMatchLength: 254,
+		}})
+		red := NewRedactor(patterns)
+		store := NewMemoryEventStore(0)
+		defer func() { _ = store.Close() }()
+
+		body := &fakeRequest{Messages: []string{"hi at alice@example.com"}}
+		mw := RequestMiddleware(red, store, fakeAdapter(), nil)
+
+		e := echo.New()
+		var observedLocalOnly bool
+		e.POST("/chat", func(c echo.Context) error {
+			v, _ := c.Get(ctxKeyLocalOnly).(bool)
+			observedLocalOnly = v
+			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
+		}, setRequestOnContext(body), withModelConfig(fakeModelPIIConfig{enabled: true}), mw)
+
+		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(observedLocalOnly).To(BeTrue(), "ctxKeyLocalOnly should be true on route_local match")
+		// route_local does NOT mutate the body — the model still sees the email.
+		Expect(body.Messages[0]).To(ContainSubstring("alice@example.com"), "route_local should leave text intact")
+	})
+
+	It("no match passes through", func() {
+		red := newTestRedactor()
+		store := NewMemoryEventStore(0)
+		defer func() { _ = store.Close() }()
+
+		body := &fakeRequest{Messages: []string{"perfectly innocent text"}}
+		mw := RequestMiddleware(red, store, fakeAdapter(), nil)
+
+		e := echo.New()
+		e.POST("/chat", func(c echo.Context) error {
+			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
+		}, setRequestOnContext(body), withModelConfig(fakeModelPIIConfig{enabled: true}), mw)
+
+		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(body.Messages[0]).To(Equal("perfectly innocent text"), "body should be untouched")
+		events, _ := store.List(context.Background(), ListQuery{Limit: 100})
+		Expect(events).To(BeEmpty(), "expected 0 events on no-match input")
+	})
+
+	It("skips when model config disabled", func() {
+		// Per-model gating is the new contract: a model with PIIIsEnabled
+		// returning false must bypass redaction entirely, even if the
+		// global redactor has matching patterns.
+		red := newTestRedactor("email")
+		store := NewMemoryEventStore(0)
+		defer func() { _ = store.Close() }()
+
+		body := &fakeRequest{Messages: []string{"contact alice@example.com"}}
+		mw := RequestMiddleware(red, store, fakeAdapter(), nil)
+
+		e := echo.New()
+		e.POST("/chat", func(c echo.Context) error {
+			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
+		}, setRequestOnContext(body), withModelConfig(fakeModelPIIConfig{enabled: false}), mw)
+
+		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(body.Messages[0]).To(ContainSubstring("alice@example.com"), "disabled model must not redact")
+		events, _ := store.List(context.Background(), ListQuery{Limit: 100})
+		Expect(events).To(BeEmpty(), "disabled model must produce no events")
+	})
+
+	It("fails closed without model config", func() {
+		// Routes that wire the middleware before SetModelAndConfig, or
+		// non-chat routes lacking a model, hit this path. The contract
+		// is fail-closed: pass through without redaction so a missing
+		// model can't accidentally leak through global defaults.
+		red := newTestRedactor("email")
+		store := NewMemoryEventStore(0)
+		defer func() { _ = store.Close() }()
+
+		body := &fakeRequest{Messages: []string{"contact alice@example.com"}}
+		mw := RequestMiddleware(red, store, fakeAdapter(), nil)
+
+		e := echo.New()
+		// Note: no withModelConfig in the chain.
+		e.POST("/chat", func(c echo.Context) error {
+			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
+		}, setRequestOnContext(body), mw)
+
+		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(body.Messages[0]).To(ContainSubstring("alice@example.com"), "missing ModelPIIConfig should fail-closed (no redaction)")
+	})
+
+	It("applies per-model override", func() {
+		// email defaults to mask. A per-model override upgrades it to
+		// block. The middleware short-circuits with 400, the request
+		// body is never touched, and the events log records action=block.
+		red := newTestRedactor("email")
+		store := NewMemoryEventStore(0)
+		defer func() { _ = store.Close() }()
+
+		body := &fakeRequest{Messages: []string{"contact alice@example.com"}}
+		mw := RequestMiddleware(red, store, fakeAdapter(), nil)
+
+		e := echo.New()
+		handlerCalled := false
+		e.POST("/chat", func(c echo.Context) error {
+			handlerCalled = true
+			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
+		}, setRequestOnContext(body),
+			withModelConfig(fakeModelPIIConfig{
+				enabled:   true,
+				overrides: map[string]string{"email": "block"},
+			}), mw)
+
+		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusBadRequest), "expected 400 from override-block; body=%s", w.Body.String())
+		Expect(handlerCalled).To(BeFalse(), "handler must not run when override blocks")
+		events, _ := store.List(context.Background(), ListQuery{Limit: 100})
+		Expect(events).To(HaveLen(1))
+		Expect(events[0].Action).To(Equal(ActionBlock), "event must record the resolved (override) action")
+	})
+
+	It("nil redactor is passthrough", func() {
+		body := &fakeRequest{Messages: []string{"alice@example.com"}}
+		mw := RequestMiddleware(nil, nil, fakeAdapter(), nil)
+
+		e := echo.New()
+		e.POST("/chat", func(c echo.Context) error {
+			return c.JSON(http.StatusOK, map[string]string{"ok": "yes"})
+		}, setRequestOnContext(body), withModelConfig(fakeModelPIIConfig{enabled: true}), mw)
+
+		req := httptest.NewRequest(http.MethodPost, "/chat", strings.NewReader(`{}`))
+		w := httptest.NewRecorder()
+		e.ServeHTTP(w, req)
+
+		Expect(w.Code).To(Equal(http.StatusOK))
+		Expect(body.Messages[0]).To(Equal("alice@example.com"), "nil redactor must be a no-op")
+	})
+})
diff --git a/core/services/routing/pii/ner.go b/core/services/routing/pii/ner.go
new file mode 100644
index 000000000..57d25cded
--- /dev/null
+++ b/core/services/routing/pii/ner.go
@@ -0,0 +1,97 @@
+package pii
+
+import (
+	"context"
+	"fmt"
+)
+
+// NERDetector is the contract the redactor's encoder/NER tier expects.
+// One detector wraps one loaded token-classification model. The
+// implementation (e.g. via the transformers gRPC backend) is wired in
+// from core/application; this package stays free of core/backend
+// imports so the redactor remains unit-testable with a stub detector.
+//
+// Implementations must honour ctx cancellation — NER round-trips can
+// take tens of milliseconds and a client-aborted request should not
+// stall the redactor.
+type NERDetector interface {
+	Detect(ctx context.Context, text string) ([]NEREntity, error)
+}
+
+// NEREntity is one detected span. Start/End are byte offsets into the
+// text passed to Detect — half-open, addressing text[Start:End]. The
+// Group is the entity label (e.g. "PER", "LOC", "EMAIL"); the exact
+// vocabulary depends on the model. The redactor's action map keys off
+// Group, so admins configure per-label behaviour.
+type NEREntity struct {
+	Group string
+	Start int
+	End   int
+	Score float32
+}
+
+// NERConfig configures the encoder tier for one redactor invocation.
+// Per-request so the same Redactor instance can serve multiple models
+// (each with its own NER preferences) without per-model redactor
+// instances.
+type NERConfig struct {
+	// Detector is the loaded model. nil disables the NER tier — the
+	// redactor falls back to the regex-only path with no allocation
+	// cost.
+	Detector NERDetector
+
+	// MinScore is the confidence floor; entities below this are dropped
+	// before being merged into the hit list. 0 keeps every result the
+	// detector returns.
+	MinScore float32
+
+	// EntityActions maps entity_group → Action. Unknown groups (groups
+	// the detector returns but the admin didn't configure) use
+	// DefaultAction. Empty map + DefaultAction empty = NER detections
+	// recorded as audit rows but no redaction applied.
+	EntityActions map[string]Action
+
+	// DefaultAction is applied when a detected entity_group has no
+	// explicit override. Empty (zero value) means "drop unmatched
+	// entities silently" — useful when the model returns a broad
+	// taxonomy but the admin only cares about a subset.
+	DefaultAction Action
+}
+
+// ResolveAction returns the action configured for a detected entity
+// group, falling back to DefaultAction. Returns ("", false) when the
+// entity should be ignored entirely (no override + no default).
+func (c NERConfig) ResolveAction(group string) (Action, bool) {
+	if a, ok := c.EntityActions[group]; ok {
+		return a, true
+	}
+	if c.DefaultAction != "" {
+		return c.DefaultAction, true
+	}
+	return "", false
+}
+
+// nerPatternID returns the synthetic pattern ID that audit rows carry
+// for NER hits. Prefixing with "ner:" keeps these distinguishable from
+// regex pattern IDs in the events tab and in filter queries; admins
+// can switch off a single entity type with the same Disabled-pattern
+// machinery used for regex.
+func nerPatternID(group string) string {
+	return "ner:" + group
+}
+
+// errNERDetector is a NERDetector that always returns the wrapped
+// error. Exported via NewErrNERDetector so the application wiring can
+// surface "model not loaded" without taking on a fmt-only dependency
+// just to format the error.
+type errNERDetector struct{ err error }
+
+func (e errNERDetector) Detect(context.Context, string) ([]NEREntity, error) {
+	return nil, e.err
+}
+
+// NewErrNERDetector returns a detector whose Detect always fails with
+// the supplied error. Used by the application-level adapter when the
+// configured NER model can't be resolved — the redactor surfaces a
+// clear runtime error rather than silently skipping the NER tier.
+func NewErrNERDetector(msg string) NERDetector { return errNERDetector{err: fmt.Errorf("%s", msg)} }
diff --git a/core/services/routing/pii/ner_test.go b/core/services/routing/pii/ner_test.go
new file mode 100644
index 000000000..b4d822234
--- /dev/null
+++ b/core/services/routing/pii/ner_test.go
@@ -0,0 +1,174 @@
+package pii
+
+import (
+	"context"
+	"errors"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// stubNERDetector returns a fixed slice of entities and tracks call
+// count so tests can assert the detector isn't called when text is
+// empty / no patterns / detector disabled.
+type stubNERDetector struct {
+	entities []NEREntity
+	err      error
+	calls    int
+}
+
+func (s *stubNERDetector) Detect(_ context.Context, _ string) ([]NEREntity, error) {
+	s.calls++
+	return s.entities, s.err
+}
+
+var _ = Describe("RedactWithNER", func() {
+	It("nil detector is regex-only", func() {
+		// When the NER tier is disabled (Detector == nil) the redactor
+		// must behave exactly like the existing regex-only path — no
+		// detector call, same Result shape, no error.
+		r := NewRedactor([]Pattern{pickEmail()})
+		res, err := r.RedactWithNER(context.Background(), "ping me at alice@example.com", nil, NERConfig{})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Redacted).To(ContainSubstring("[REDACTED:email]"), "regex tier should still run when Detector is nil")
+	})
+
+	It("applies entity actions", func() {
+		det := &stubNERDetector{entities: []NEREntity{
+			{Group: "PER", Start: 6, End: 11, Score: 0.95}, // "Alice" in "Hi I'm Alice today"
+		}}
+		r := NewRedactor(nil)
+		res, err := r.RedactWithNER(context.Background(), "Hi I'm Alice today", nil, NERConfig{
+			Detector:      det,
+			EntityActions: map[string]Action{"PER": ActionMask},
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(det.calls).To(Equal(1))
+		Expect(res.Redacted).To(ContainSubstring("[REDACTED:ner:PER]"))
+		Expect(res.Spans).To(HaveLen(1))
+		Expect(res.Spans[0].Pattern).To(Equal("ner:PER"))
+	})
+
+	It("filters below MinScore", func() {
+		det := &stubNERDetector{entities: []NEREntity{
+			{Group: "PER", Start: 0, End: 5, Score: 0.20},
+		}}
+		r := NewRedactor(nil)
+		res, err := r.RedactWithNER(context.Background(), "Alice", nil, NERConfig{
+			Detector:      det,
+			MinScore:      0.50,
+			EntityActions: map[string]Action{"PER": ActionMask},
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Redacted).To(Equal("Alice"), "low-confidence entity should be dropped")
+	})
+
+	It("default action applies to unconfigured groups", func() {
+		det := &stubNERDetector{entities: []NEREntity{
+			{Group: "ORG", Start: 7, End: 11, Score: 0.9}, // "Acme" in "Hello, Acme!"
+		}}
+		r := NewRedactor(nil)
+		res, err := r.RedactWithNER(context.Background(), "Hello, Acme!", nil, NERConfig{
+			Detector:      det,
+			DefaultAction: ActionMask,
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Redacted).To(ContainSubstring("[REDACTED:ner:ORG]"), "DefaultAction should apply to ORG")
+	})
+
+	It("drops unconfigured groups with no default", func() {
+		// EntityActions has no entry for ORG and DefaultAction is empty —
+		// the detected entity must be ignored entirely (no audit row, no
+		// redaction).
+		det := &stubNERDetector{entities: []NEREntity{
+			{Group: "ORG", Start: 0, End: 4, Score: 0.9},
+		}}
+		r := NewRedactor(nil)
+		res, err := r.RedactWithNER(context.Background(), "Acme", nil, NERConfig{
+			Detector:      det,
+			EntityActions: map[string]Action{"PER": ActionMask}, // ORG is unconfigured
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Redacted).To(Equal("Acme"))
+		Expect(res.Spans).To(BeEmpty())
+	})
+
+	It("overlapping hits keep stronger action", func() {
+		// Regex marks 0..10 as mask; NER marks 5..15 as block. After
+		// merge, the union 0..15 keeps the strongest action (block).
+		pat := Pattern{ID: "test", Action: ActionMask, regex: rangeRegex(0, 10)}
+		r := NewRedactor([]Pattern{pat})
+		det := &stubNERDetector{entities: []NEREntity{
+			{Group: "PER", Start: 5, End: 15, Score: 0.9},
+		}}
+		text := "0123456789ABCDEF"
+		res, err := r.RedactWithNER(context.Background(), text, nil, NERConfig{
+			Detector:      det,
+			EntityActions: map[string]Action{"PER": ActionBlock},
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Blocked).To(BeTrue(), "overlapping mask+block should set Blocked=true")
+	})
+
+	It("detector error returns regex result and error", func() {
+		// Fail-open: when the NER detector errors, the redactor still
+		// returns regex-tier hits so an offline NER backend doesn't strip
+		// the cheap protection. Caller can read the error and decide
+		// whether to surface it.
+		det := &stubNERDetector{err: errors.New("backend offline")}
+		r := NewRedactor([]Pattern{pickEmail()})
+		res, err := r.RedactWithNER(context.Background(), "ping alice@example.com", nil, NERConfig{
+			Detector:      det,
+			DefaultAction: ActionMask,
+		})
+		Expect(err).To(HaveOccurred(), "expected detector error to surface")
+		Expect(res.Redacted).To(ContainSubstring("[REDACTED:email]"), "regex tier should still apply on NER failure")
+	})
+
+	It("out-of-bounds offsets are skipped", func() {
+		// A misconfigured / buggy backend could return offsets past the
+		// end of text. The redactor must not panic on slice OOB.
+		det := &stubNERDetector{entities: []NEREntity{
+			{Group: "PER", Start: 0, End: 999, Score: 0.9},
+			{Group: "PER", Start: -1, End: 3, Score: 0.9},
+			{Group: "PER", Start: 5, End: 5, Score: 0.9}, // zero-length
+		}}
+		r := NewRedactor(nil)
+		res, err := r.RedactWithNER(context.Background(), "Alice", nil, NERConfig{
+			Detector:      det,
+			DefaultAction: ActionMask,
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(res.Redacted).To(Equal("Alice"))
+		Expect(res.Spans).To(BeEmpty())
+	})
+})
+
+// --- test helpers ---
+
+// rangeMatcher is a deterministic regexpMatcher stub: it claims one
+// fixed range regardless of input. Lets the overlap-merge test
+// produce a known regex/NER intersection without depending on a real
+// compiled regex.
+type rangeMatcher struct{ start, end int }
+
+func (m rangeMatcher) FindAllStringIndex(_ string, _ int) [][]int {
+	return [][]int{{m.start, m.end}}
+}
+
+func rangeRegex(start, end int) regexpMatcher { return rangeMatcher{start: start, end: end} }
+
+// pickEmail returns the compiled "email" pattern from DefaultPatterns
+// — the NER tests use it as the regex tier's contribution.
+func pickEmail() Pattern {
+	for _, p := range DefaultPatterns() {
+		if p.ID == "email" {
+			compiled, err := Compile([]Pattern{p})
+			ExpectWithOffset(1, err).NotTo(HaveOccurred(), "compile")
+			return compiled[0]
+		}
+	}
+	Fail("email pattern missing from DefaultPatterns")
+	return Pattern{}
+}
+
diff --git a/core/services/routing/pii/patterns.go b/core/services/routing/pii/patterns.go
new file mode 100644
index 000000000..1e1ef50a1
--- /dev/null
+++ b/core/services/routing/pii/patterns.go
@@ -0,0 +1,188 @@
+package pii
+
+import (
+	"fmt"
+	"regexp"
+	"strings"
+)
+
+// regexpMatcher is a thin wrapper so tests can swap in a deterministic
+// matcher without touching the regexp package. Real usage uses
+// regexpMatcherFromPattern; tests can construct fakes.
+type regexpMatcher interface {
+	FindAllStringIndex(s string, n int) [][]int
+}
+
+type goRegexp struct{ r *regexp.Regexp }
+
+func (g goRegexp) FindAllStringIndex(s string, n int) [][]int {
+	return g.r.FindAllStringIndex(s, n)
+}
+
+// DefaultPatterns returns the built-in regex set. Each entry includes
+// a conservative MaxMatchLength so the streaming filter can size its
+// tail buffer without re-parsing the regex at runtime.
+//
+// Caveats by design:
+//   - The phone pattern matches international and US formats but does
+//     not validate area codes. False positives on numbers that look
+//     phone-like (e.g., timestamps in some formats) are accepted in
+//     return for reliable coverage.
+//   - The credit card pattern requires the Luhn check (verifyLuhn) to
+//     reduce false positives — random 16-digit strings won't match.
+//   - The API-key pattern targets common provider prefixes (sk-, pk-,
+//     xoxb-, ghp_, github_pat_) rather than guessing entropy. Adding
+//     new providers should append a new Pattern, not extend an
+//     existing alternation, so the admin UI can show one row per
+//     provider with its own toggle.
+func DefaultPatterns() []Pattern {
+	return []Pattern{
+		{
+			ID:             "email",
+			Description:    "Email address",
+			Action:         ActionMask,
+			MaxMatchLength: 254, // RFC 5321 max
+		},
+		{
+			ID:             "phone",
+			Description:    "Phone number (international or US format)",
+			Action:         ActionMask,
+			MaxMatchLength: 24,
+		},
+		{
+			ID:             "ssn",
+			Description:    "US Social Security Number (NNN-NN-NNNN)",
+			Action:         ActionMask,
+			MaxMatchLength: 11,
+		},
+		{
+			ID:             "credit_card",
+			Description:    "Credit card number (Luhn-verified)",
+			Action:         ActionMask,
+			MaxMatchLength: 19,
+		},
+		{
+			ID:             "ipv4",
+			Description:    "IPv4 address",
+			Action:         ActionMask,
+			MaxMatchLength: 15,
+		},
+		{
+			ID:             "api_key_prefix",
+			Description:    "Common API key prefixes (sk-, pk-, xoxb-, ghp_, github_pat_)",
+			Action:         ActionBlock, // tighter default — leaked credentials are higher harm
+			MaxMatchLength: 200,
+		},
+	}
+}
+
+// patternRegexps maps Pattern.ID to its compiled regex. Kept separate
+// from the Pattern struct so DefaultPatterns can be data-only and
+// tests can swap matchers via Compile().
+var patternRegexps = map[string]*regexp.Regexp{
+	// Pragmatic email — does not implement RFC 5322 in full (no one
+	// sane does in a regex). Catches the common shape; the encoder
+	// NER tier (future) catches edge cases.
+	"email": regexp.MustCompile(`(?i)[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}`),
+	// US: (123) 456-7890, 123-456-7890, 123.456.7890, 1234567890.
+	// International: +<country>-<area>-<rest> with separators.
+	"phone": regexp.MustCompile(`(?:\+?\d{1,3}[\s\-.]?)?(?:\(\d{3}\)|\d{3})[\s\-.]?\d{3}[\s\-.]?\d{4}`),
+	"ssn":   regexp.MustCompile(`\b\d{3}-\d{2}-\d{4}\b`),
+	// 13-19 digit Luhn-eligible runs. The verifier in match() rejects
+	// non-Luhn matches.
+	"credit_card": regexp.MustCompile(`\b(?:\d[ \-]?){13,19}\b`),
+	"ipv4":        regexp.MustCompile(`\b(?:\d{1,3}\.){3}\d{1,3}\b`),
+	// Common provider prefixes; each alternative is a separate
+	// well-known marker rather than a permissive entropy match.
+	"api_key_prefix": regexp.MustCompile(`(?:sk-[A-Za-z0-9]{20,}|pk-[A-Za-z0-9]{20,}|xoxb-[A-Za-z0-9\-]{20,}|ghp_[A-Za-z0-9]{20,}|github_pat_[A-Za-z0-9_]{20,})`),
+}
+
+// Compile attaches matchers to each pattern. Patterns whose ID is not
+// in patternRegexps are returned as a typed error so an admin who
+// adds a custom pattern via config gets a clear "no regex registered"
+// message instead of silent skip.
+func Compile(patterns []Pattern) ([]Pattern, error) {
+	out := make([]Pattern, len(patterns))
+	for i, p := range patterns {
+		r, ok := patternRegexps[p.ID]
+		if !ok {
+			return nil, fmt.Errorf("pii: no regex registered for pattern id %q", p.ID)
+		}
+		p.regex = goRegexp{r: r}
+		out[i] = p
+	}
+	return out, nil
+}
+
+// VerifyMatch applies pattern-specific post-checks (e.g. Luhn for
+// credit_card). Returns the original match or "" to discard it.
+func VerifyMatch(patternID, candidate string) string {
+	switch patternID {
+	case "credit_card":
+		digits := stripNonDigits(candidate)
+		if len(digits) < 13 || len(digits) > 19 {
+			return ""
+		}
+		if !verifyLuhn(digits) {
+			return ""
+		}
+	case "ipv4":
+		// Each octet must be 0..255. The regex allows 0..999 since
+		// regex isn't great at numeric ranges; we tighten here.
+		for oct := range strings.SplitSeq(candidate, ".") {
+			n := 0
+			for _, c := range oct {
+				if c < '0' || c > '9' {
+					return ""
+				}
+				n = n*10 + int(c-'0')
+			}
+			if n > 255 {
+				return ""
+			}
+		}
+	}
+	return candidate
+}
+
+func stripNonDigits(s string) string {
+	var b strings.Builder
+	b.Grow(len(s))
+	for _, c := range s {
+		if c >= '0' && c <= '9' {
+			b.WriteRune(c)
+		}
+	}
+	return b.String()
+}
+
+// verifyLuhn implements the Luhn checksum used by credit-card numbers.
+// Returns true iff the digits pass.
+func verifyLuhn(digits string) bool {
+	sum := 0
+	double := false
+	for i := len(digits) - 1; i >= 0; i-- {
+		d := int(digits[i] - '0')
+		if double {
+			d *= 2
+			if d > 9 {
+				d -= 9
+			}
+		}
+		sum += d
+		double = !double
+	}
+	return sum%10 == 0
+}
+
+// MaxPatternLength returns the longest MaxMatchLength across the input
+// patterns. Used by the streaming filter to size its tail buffer.
+func MaxPatternLength(patterns []Pattern) int {
+	max := 0
+	for _, p := range patterns {
+		if p.MaxMatchLength > max {
+			max = p.MaxMatchLength
+		}
+	}
+	return max
+}
diff --git a/core/services/routing/pii/pii_suite_test.go b/core/services/routing/pii/pii_suite_test.go
new file mode 100644
index 000000000..634b66df4
--- /dev/null
+++ b/core/services/routing/pii/pii_suite_test.go
@@ -0,0 +1,13 @@
+package pii
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestPii(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "pii test suite")
+}
diff --git a/core/services/routing/pii/redactor.go b/core/services/routing/pii/redactor.go
new file mode 100644
index 000000000..b70192cac
--- /dev/null
+++ b/core/services/routing/pii/redactor.go
@@ -0,0 +1,342 @@
+package pii
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"slices"
+	"sort"
+	"strings"
+	"sync"
+)
+
+// rawHit is one detection — regex-side or NER-side — before
+// overlap-merging. Lifted to file scope so the regex and NER
+// collectors can both produce them and feed the same merge/emit step.
+type rawHit struct {
+	patternID string
+	action    Action
+	start     int
+	end       int
+}
+
+// Redactor scans text against a configured pattern set and applies the
+// per-pattern action. The pattern set itself is mutable at runtime via
+// SetAction (the /api/pii/patterns/:id admin endpoint mutates it
+// in-place); reads are guarded by a mutex so concurrent requests stay
+// race-free.
+type Redactor struct {
+	mu       sync.RWMutex
+	patterns []Pattern
+	maxLen   int
+}
+
+// NewRedactor constructs a redactor from a list of compiled patterns
+// (use Compile() to compile config-loaded patterns first). nil
+// patterns is valid and produces a no-op redactor — convenient for the
+// "PII disabled" deployment.
+func NewRedactor(patterns []Pattern) *Redactor {
+	return &Redactor{
+		patterns: patterns,
+		maxLen:   MaxPatternLength(patterns),
+	}
+}
+
+// MaxPatternLength is exposed so the streaming wrapper can size its
+// tail buffer to match.
+func (r *Redactor) MaxPatternLength() int { return r.maxLen }
+
+// Patterns returns a copy of the configured pattern set so callers can
+// iterate without holding the redactor lock. The compiled regexes are
+// shared — they are immutable once built.
+func (r *Redactor) Patterns() []Pattern {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+	return slices.Clone(r.patterns)
+}
+
+// SetAction overrides the action for a single pattern. Used by the
+// /api/pii/patterns/:id admin endpoint and the set_pii_pattern_action
+// MCP tool — transient until process restart unless persisted via
+// --pii-config.
+//
+// Publishes a new slice so concurrent Redact callers iterating an
+// older snapshot don't race on the per-element Action string (Go
+// strings are not atomic two-word values).
+func (r *Redactor) SetAction(id string, action Action) error {
+	if action != ActionMask && action != ActionBlock && action != ActionRouteLocal {
+		return fmt.Errorf("unknown action %q (must be mask, block, or route_local)", action)
+	}
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	for i := range r.patterns {
+		if r.patterns[i].ID == id {
+			next := slices.Clone(r.patterns)
+			next[i].Action = action
+			r.patterns = next
+			return nil
+		}
+	}
+	return fmt.Errorf("unknown pattern id %q", id)
+}
+
+// SetDisabled toggles a pattern's enabled state in the live redactor.
+// Same COW publish as SetAction.
+func (r *Redactor) SetDisabled(id string, disabled bool) error {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	for i := range r.patterns {
+		if r.patterns[i].ID == id {
+			next := slices.Clone(r.patterns)
+			next[i].Disabled = disabled
+			r.patterns = next
+			return nil
+		}
+	}
+	return fmt.Errorf("unknown pattern id %q", id)
+}
+
+// Redact is a thin wrapper for callers that don't need per-request
+// action overrides. It applies each pattern's compiled-in default
+// action.
+func (r *Redactor) Redact(text string) Result {
+	return r.RedactWithOverrides(text, nil)
+}
+
+// RedactWithOverrides scans text and returns the result. The override
+// map is keyed by pattern id; when present, the value replaces the
+// pattern's compiled-in action for this call only — the redactor's
+// stored action is unchanged. Pattern ids missing from the map use
+// their stored action.
+//
+// For every match it records a Span (with HashPrefix, never the value)
+// and applies the resolved Action:
+//   - block: sets Result.Blocked, leaves text intact (caller decides
+//     whether to surface the redacted form).
+//   - mask: replaces the span with maskFor(pattern.ID).
+//   - route_local: sets Result.LocalOnly, leaves text intact.
+//
+// Spans are returned in the original input's coordinate system so the
+// PIIEvent record can be written without re-running the scan.
+func (r *Redactor) RedactWithOverrides(text string, overrides map[string]Action) Result {
+	return r.redact(context.Background(), text, overrides, NERConfig{})
+}
+
+// RedactWithNER is the encoder-tier variant: runs both the regex tier
+// (with per-pattern overrides) and the NER tier, merges hits, and
+// emits one redacted output. A nil NERConfig.Detector skips the NER
+// pass — callers can hand the same path the same NERConfig{} whether
+// or not the model has NER configured.
+//
+// Errors from the NER detector are returned alongside a best-effort
+// regex-only Result so the caller can decide whether to fail open
+// (return the regex Result, log the error) or fail closed (refuse the
+// request). The regex tier never errors.
+func (r *Redactor) RedactWithNER(ctx context.Context, text string, overrides map[string]Action, nerCfg NERConfig) (Result, error) {
+	if nerCfg.Detector == nil {
+		return r.redact(ctx, text, overrides, nerCfg), nil
+	}
+	hits, err := r.collectRegexHits(text, overrides)
+	if err != nil {
+		return Result{Redacted: text}, err
+	}
+	nerHits, nerErr := collectNERHits(ctx, text, nerCfg)
+	if nerErr != nil {
+		// Return the regex-only result so a NER-backend outage doesn't
+		// strip the cheap protection. Caller decides fail-open vs
+		// fail-closed via the returned error.
+		return mergeAndEmit(text, hits), nerErr
+	}
+	return mergeAndEmit(text, append(hits, nerHits...)), nil
+}
+
+// redact is the internal regex-only entry point. RedactWithOverrides
+// is the public wrapper; RedactWithNER routes through here only when
+// the NER detector is nil (so the call site doesn't need a separate
+// "regex-only" code path).
+func (r *Redactor) redact(_ context.Context, text string, overrides map[string]Action, _ NERConfig) Result {
+	hits, _ := r.collectRegexHits(text, overrides)
+	return mergeAndEmit(text, hits)
+}
+
+// collectRegexHits walks the configured pattern set against text and
+// returns each verified match as a rawHit. The redactor lock is held
+// only long enough to snapshot the pattern slice — regex evaluation
+// runs lock-free against the snapshot, so SetAction/SetDisabled don't
+// stall a long-running Redact.
+func (r *Redactor) collectRegexHits(text string, overrides map[string]Action) ([]rawHit, error) {
+	r.mu.RLock()
+	patterns := r.patterns
+	r.mu.RUnlock()
+
+	if len(patterns) == 0 || text == "" {
+		return nil, nil
+	}
+	var hits []rawHit
+	for _, p := range patterns {
+		if p.regex == nil {
+			// Pattern declared but Compile() not called. Skip rather
+			// than panic; the caller already saw an error from Compile.
+			continue
+		}
+		if p.Disabled {
+			continue
+		}
+		action := p.Action
+		if override, ok := overrides[p.ID]; ok {
+			action = override
+		}
+		idxs := p.regex.FindAllStringIndex(text, -1)
+		for _, idx := range idxs {
+			candidate := text[idx[0]:idx[1]]
+			if VerifyMatch(p.ID, candidate) == "" {
+				continue
+			}
+			hits = append(hits, rawHit{
+				patternID: p.ID,
+				action:    action,
+				start:     idx[0],
+				end:       idx[1],
+			})
+		}
+	}
+	return hits, nil
+}
+
+// collectNERHits invokes the configured NERDetector and converts each
+// returned entity into a rawHit using the NERConfig's action map.
+// Entities below MinScore or with no resolved action are dropped — the
+// detector doesn't know which entity groups the admin cares about, so
+// the redactor filters here.
+func collectNERHits(ctx context.Context, text string, cfg NERConfig) ([]rawHit, error) {
+	if cfg.Detector == nil || text == "" {
+		return nil, nil
+	}
+	entities, err := cfg.Detector.Detect(ctx, text)
+	if err != nil {
+		return nil, err
+	}
+	var hits []rawHit
+	for _, e := range entities {
+		if e.Score < cfg.MinScore {
+			continue
+		}
+		action, ok := cfg.ResolveAction(e.Group)
+		if !ok {
+			continue
+		}
+		if e.Start < 0 || e.End <= e.Start || e.End > len(text) {
+			// Defensive: the backend should return byte offsets into
+			// the original text, but a misconfigured model could
+			// produce garbage. Skip rather than panic on slice OOB.
+			continue
+		}
+		hits = append(hits, rawHit{
+			patternID: nerPatternID(e.Group),
+			action:    action,
+			start:     e.Start,
+			end:       e.End,
+		})
+	}
+	return hits, nil
+}
+
+// mergeAndEmit handles the overlap-merge + masked-output step that
+// regex-only and combined regex+NER redactions both perform. Sorts by
+// start (stable on equal starts by descending action strength), drops
+// overlapping hits in favour of the stronger action, and walks the
+// text once to emit replacement spans.
+func mergeAndEmit(text string, hits []rawHit) Result {
+	if len(hits) == 0 {
+		return Result{Redacted: text}
+	}
+	// Sort and deduplicate overlapping hits — when two patterns claim
+	// the same span (e.g., a credit-card-shaped value also scans as
+	// digits, or NER tags a span the regex also caught), keep the one
+	// with the strongest action. Order: block > route_local > mask.
+	sort.Slice(hits, func(i, j int) bool {
+		if hits[i].start != hits[j].start {
+			return hits[i].start < hits[j].start
+		}
+		return actionRank(hits[i].action) > actionRank(hits[j].action)
+	})
+	merged := hits[:0]
+	for _, h := range hits {
+		if len(merged) > 0 {
+			last := &merged[len(merged)-1]
+			if h.start < last.end {
+				if actionRank(h.action) > actionRank(last.action) {
+					last.action = h.action
+					last.patternID = h.patternID
+				}
+				if h.end > last.end {
+					last.end = h.end
+				}
+				continue
+			}
+		}
+		merged = append(merged, h)
+	}
+
+	res := Result{}
+	var out strings.Builder
+	out.Grow(len(text))
+	cursor := 0
+	for _, h := range merged {
+		matched := text[h.start:h.end]
+		span := Span{
+			Start:      h.start,
+			End:        h.end,
+			Pattern:    h.patternID,
+			HashPrefix: hashPrefix(matched),
+		}
+		res.Spans = append(res.Spans, span)
+
+		out.WriteString(text[cursor:h.start])
+		switch h.action {
+		case ActionBlock:
+			res.Blocked = true
+			out.WriteString(matched)
+		case ActionRouteLocal:
+			res.LocalOnly = true
+			out.WriteString(matched)
+		default:
+			out.WriteString(maskFor(h.patternID))
+		}
+		cursor = h.end
+	}
+	out.WriteString(text[cursor:])
+	res.Redacted = out.String()
+	return res
+}
+
+// maskFor returns the placeholder that replaces a matched span. The
+// shape "[REDACTED:<id>]" is intentionally stable — it surfaces the
+// pattern id back to the model, which is sometimes useful (e.g., the
+// model can say "I see you redacted an email"). Admins who want a
+// less informative replacement can build one in front of this.
+func maskFor(patternID string) string {
+	return "[REDACTED:" + patternID + "]"
+}
+
+// hashPrefix returns the first 8 chars of sha256(value). Two calls
+// with the same input produce the same prefix so an admin auditing
+// the PIIEvent log can spot a recurring leak ("the same SSN appears
+// 200 times this hour") without ever recovering the value.
+func hashPrefix(value string) string {
+	sum := sha256.Sum256([]byte(value))
+	return hex.EncodeToString(sum[:])[:8]
+}
+
+func actionRank(a Action) int {
+	switch a {
+	case ActionBlock:
+		return 3
+	case ActionRouteLocal:
+		return 2
+	case ActionMask:
+		return 1
+	}
+	return 0
+}
diff --git a/core/services/routing/pii/redactor_race_test.go b/core/services/routing/pii/redactor_race_test.go
new file mode 100644
index 000000000..f926ea64d
--- /dev/null
+++ b/core/services/routing/pii/redactor_race_test.go
@@ -0,0 +1,66 @@
+package pii
+
+import (
+	"sync"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Redactor_SetActionConcurrentRedact pins the SetAction copy-on-
+// write contract: concurrent SetAction must not race with readers
+// iterating an older patterns snapshot. Run with -race to surface the
+// regression that motivated the COW (in-place mutation of the
+// per-element Action string is not atomic).
+var _ = Describe("Redactor", func() {
+	It("SetAction concurrent with Redact", func() {
+		patterns, err := Compile(DefaultPatterns())
+		Expect(err).NotTo(HaveOccurred(), "compile")
+		r := NewRedactor(patterns)
+
+		const writers = 4
+		const readers = 8
+		const iter = 100
+
+		var wg sync.WaitGroup
+		stop := make(chan struct{})
+
+		for w := 0; w < writers; w++ {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				for i := 0; i < iter; i++ {
+					select {
+					case <-stop:
+						return
+					default:
+					}
+					action := ActionMask
+					if i%2 == 0 {
+						action = ActionBlock
+					}
+					_ = r.SetAction("email", action)
+				}
+			}()
+		}
+
+		for rd := 0; rd < readers; rd++ {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				text := "contact alice@example.com please"
+				for i := 0; i < iter*2; i++ {
+					select {
+					case <-stop:
+						return
+					default:
+					}
+					_ = r.Redact(text)
+				}
+			}()
+		}
+
+		wg.Wait()
+		close(stop)
+	})
+})
diff --git a/core/services/routing/pii/redactor_test.go b/core/services/routing/pii/redactor_test.go
new file mode 100644
index 000000000..a084e4d54
--- /dev/null
+++ b/core/services/routing/pii/redactor_test.go
@@ -0,0 +1,184 @@
+package pii
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func mustCompile(ids ...string) []Pattern {
+	all := DefaultPatterns()
+	if len(ids) == 0 {
+		out, err := Compile(all)
+		ExpectWithOffset(1, err).NotTo(HaveOccurred(), "compile")
+		return out
+	}
+	pickP := pick(all, ids)
+	out, err := Compile(pickP)
+	ExpectWithOffset(1, err).NotTo(HaveOccurred(), "compile")
+	return out
+}
+
+func pick(all []Pattern, ids []string) []Pattern {
+	keep := map[string]bool{}
+	for _, id := range ids {
+		keep[id] = true
+	}
+	var out []Pattern
+	for _, p := range all {
+		if keep[p.ID] {
+			out = append(out, p)
+		}
+	}
+	return out
+}
+
+var _ = Describe("Redactor", func() {
+	It("masks email", func() {
+		r := NewRedactor(mustCompile("email"))
+		res := r.Redact("Contact me at alice@example.com any time.")
+		Expect(res.Blocked).To(BeFalse(), "email is mask-action by default, should not block")
+		Expect(res.Redacted).To(ContainSubstring("[REDACTED:email]"))
+		Expect(res.Redacted).NotTo(ContainSubstring("alice@example.com"))
+		Expect(res.Spans).To(HaveLen(1))
+		Expect(res.Spans[0].HashPrefix).NotTo(BeEmpty(), "hash prefix must be set so audits can dedupe leaks")
+	})
+
+	It("masks SSN", func() {
+		r := NewRedactor(mustCompile("ssn"))
+		res := r.Redact("call me about SSN 123-45-6789 please")
+		Expect(res.Redacted).To(ContainSubstring("[REDACTED:ssn]"))
+	})
+
+	It("uses Luhn for credit card", func() {
+		r := NewRedactor(mustCompile("credit_card"))
+
+		// 4111 1111 1111 1111 — canonical Luhn-valid Visa test number.
+		good := r.Redact("card: 4111 1111 1111 1111")
+		Expect(good.Spans).To(HaveLen(1))
+		Expect(good.Redacted).To(ContainSubstring("[REDACTED:credit_card]"))
+
+		// 4111 1111 1111 1112 — same shape, fails Luhn. Must NOT match.
+		bad := r.Redact("card: 4111 1111 1111 1112")
+		Expect(bad.Spans).To(BeEmpty(), "Luhn-invalid 16-digit run must not be redacted")
+		Expect(bad.Redacted).To(ContainSubstring("1112"), "Luhn-invalid input should pass through untouched")
+	})
+
+	It("validates IPv4 octets", func() {
+		r := NewRedactor(mustCompile("ipv4"))
+
+		good := r.Redact("server at 192.168.1.10 is up")
+		Expect(good.Spans).To(HaveLen(1))
+
+		// 999.999.999.999 — regex matches but octet > 255 must reject.
+		bad := r.Redact("not an ip: 999.999.999.999")
+		Expect(bad.Spans).To(BeEmpty(), "ipv4 with octet>255 must not match")
+	})
+
+	It("api_key defaults to block", func() {
+		r := NewRedactor(mustCompile("api_key_prefix"))
+		res := r.Redact("here's a token sk-abcdefghijklmnopqrstuvwxyz0123456789 to use")
+		Expect(res.Blocked).To(BeTrue(), "api_key default action is block; Result.Blocked must be true")
+		// The redacted output keeps the matched value when blocking — the
+		// caller is expected to refuse the request, not to forward a partial.
+		Expect(res.Redacted).To(ContainSubstring("sk-abcdefghijklmn"), "blocked actions leave the matched span intact for caller inspection")
+	})
+
+	It("preserves non-matching text", func() {
+		r := NewRedactor(mustCompile()) // all default patterns
+		in := "no PII here at all, just words and numbers like 42 and 1.5"
+		res := r.Redact(in)
+		Expect(res.Redacted).To(Equal(in), "non-PII input should pass through unchanged")
+		Expect(res.Spans).To(BeEmpty())
+	})
+
+	It("handles empty input", func() {
+		r := NewRedactor(mustCompile())
+		res := r.Redact("")
+		Expect(res.Redacted).To(BeEmpty())
+		Expect(res.Blocked).To(BeFalse())
+		Expect(res.LocalOnly).To(BeFalse())
+		Expect(res.Spans).To(BeEmpty())
+	})
+
+	It("nil patterns is a no-op", func() {
+		// Disabled-PII deployment: pii.NewRedactor(nil) is a no-op.
+		r := NewRedactor(nil)
+		res := r.Redact("alice@example.com sent it")
+		Expect(res.Redacted).To(Equal("alice@example.com sent it"))
+	})
+
+	It("hash prefix is stable", func() {
+		r := NewRedactor(mustCompile("email"))
+		a := r.Redact("a@b.com")
+		b := r.Redact("hi a@b.com again")
+		Expect(a.Spans).To(HaveLen(1))
+		Expect(b.Spans).To(HaveLen(1))
+		Expect(a.Spans[0].HashPrefix).To(Equal(b.Spans[0].HashPrefix), "same matched value must produce same hash prefix")
+	})
+})
+
+var _ = Describe("Compile", func() {
+	It("rejects unknown pattern id", func() {
+		_, err := Compile([]Pattern{{ID: "nonexistent", Action: ActionMask}})
+		Expect(err).To(HaveOccurred(), "Compile must error on unknown pattern id")
+	})
+})
+
+var _ = Describe("MaxPatternLength", func() {
+	It("returns the longest pattern's max length", func() {
+		patterns := mustCompile("email", "ssn")
+		got := MaxPatternLength(patterns)
+		// email is the longer of the two (254). The streaming filter
+		// will use this to size its tail buffer.
+		Expect(got).To(Equal(254))
+	})
+})
+
+var _ = Describe("RedactWithOverrides", func() {
+	It("upgrades action", func() {
+		// email is mask by default; the per-model override turns it into a
+		// hard block for one request without mutating the redactor.
+		r := NewRedactor(mustCompile("email"))
+		res := r.RedactWithOverrides("contact alice@example.com",
+			map[string]Action{"email": ActionBlock})
+		Expect(res.Blocked).To(BeTrue(), "override should have set Blocked")
+		// Block leaves the value intact (the caller short-circuits the
+		// request) — the redactor never echoes the matched text.
+		Expect(res.Redacted).To(ContainSubstring("alice@example.com"), "block leaves text intact for the caller to discard")
+		// Stored action is unchanged so a subsequent default Redact still
+		// masks rather than blocks.
+		res2 := r.Redact("contact alice@example.com")
+		Expect(res2.Blocked).To(BeFalse(), "override must not mutate stored action")
+	})
+
+	It("ignores unknown IDs", func() {
+		// An override for a pattern this redactor doesn't know about is a
+		// no-op rather than an error — per-model configs may reference
+		// patterns from a wider catalogue than the active redactor holds.
+		r := NewRedactor(mustCompile("email"))
+		res := r.RedactWithOverrides("contact alice@example.com",
+			map[string]Action{"ssn": ActionBlock})
+		Expect(res.Blocked).To(BeFalse(), "ssn override against email-only redactor must be no-op")
+	})
+})
+
+var _ = Describe("SetAction", func() {
+	It("swaps in place", func() {
+		r := NewRedactor(mustCompile("email"))
+		Expect(r.SetAction("email", ActionRouteLocal)).To(Succeed())
+		res := r.Redact("contact alice@example.com")
+		Expect(res.LocalOnly).To(BeTrue(), "expected LocalOnly after SetAction(route_local)")
+		Expect(res.Blocked).To(BeFalse(), "SetAction(route_local) should not block")
+	})
+
+	It("rejects unknown id", func() {
+		r := NewRedactor(mustCompile("email"))
+		Expect(r.SetAction("nonexistent", ActionMask)).NotTo(Succeed(), "expected error for unknown pattern id")
+	})
+
+	It("rejects unknown action", func() {
+		r := NewRedactor(mustCompile("email"))
+		Expect(r.SetAction("email", Action("frobnicate"))).NotTo(Succeed(), "expected error for unknown action")
+	})
+})
+
diff --git a/core/services/routing/pii/store.go b/core/services/routing/pii/store.go
new file mode 100644
index 000000000..2b3a5df65
--- /dev/null
+++ b/core/services/routing/pii/store.go
@@ -0,0 +1,130 @@
+package pii
+
+import (
+	"context"
+	"sync"
+)
+
+// EventStore persists PIIEvent records. Mirrors the StatsBackend
+// abstraction in the billing package: in-process by default so a
+// no-auth box still gets an event log; a future GORM-backed impl
+// (when --auth is on) will reuse the auth DB.
+type EventStore interface {
+	Record(ctx context.Context, e PIIEvent) error
+	List(ctx context.Context, q ListQuery) ([]PIIEvent, error)
+	// Count returns the number of events currently stored. Used by
+	// /api/middleware/status to surface a "recent_event_count" without
+	// pulling the whole list (the dashboard polls this on a refresh).
+	Count(ctx context.Context) (int, error)
+	Close() error
+}
+
+// ListQuery filters the event log. CorrelationID, UserID, PatternID,
+// Kind each scope the search; empty values match anything. Limit ≤ 0
+// returns up to a default cap.
+type ListQuery struct {
+	CorrelationID string
+	UserID        string
+	PatternID     string
+	Kind          EventKind
+	Limit         int
+}
+
+// NewMemoryEventStore returns an in-memory ring-buffer event store.
+// capacity ≤ 0 picks 10_000.
+//
+// Why a ring: PII events are noisy; a chatty deployment can produce
+// thousands per minute. A bounded buffer keeps memory predictable,
+// and the GORM impl (when added) handles long-term retention.
+func NewMemoryEventStore(capacity int) EventStore {
+	if capacity <= 0 {
+		capacity = 10_000
+	}
+	return &memoryEventStore{
+		ring: make([]PIIEvent, capacity),
+		cap:  capacity,
+	}
+}
+
+type memoryEventStore struct {
+	mu     sync.RWMutex
+	ring   []PIIEvent
+	cap    int
+	cursor int
+	full   bool
+}
+
+func (s *memoryEventStore) Record(_ context.Context, e PIIEvent) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.ring[s.cursor] = e
+	s.cursor++
+	if s.cursor == s.cap {
+		s.cursor = 0
+		s.full = true
+	}
+	return nil
+}
+
+func (s *memoryEventStore) List(_ context.Context, q ListQuery) ([]PIIEvent, error) {
+	limit := q.Limit
+	if limit <= 0 {
+		limit = 1000
+	}
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	out := make([]PIIEvent, 0, limit)
+	scan := func(e PIIEvent) bool {
+		if e.ID == "" {
+			return false // empty slot
+		}
+		if q.CorrelationID != "" && e.CorrelationID != q.CorrelationID {
+			return false
+		}
+		if q.UserID != "" && e.UserID != q.UserID {
+			return false
+		}
+		if q.PatternID != "" && e.PatternID != q.PatternID {
+			return false
+		}
+		if q.Kind != "" && e.ResolvedKind() != q.Kind {
+			return false
+		}
+		out = append(out, e)
+		return len(out) >= limit
+	}
+
+	// Walk newest-first: cursor-1 down to 0, then cap-1 down to cursor
+	// when the ring has wrapped.
+	if s.full {
+		for i := s.cursor - 1; i >= 0; i-- {
+			if scan(s.ring[i]) {
+				return out, nil
+			}
+		}
+		for i := s.cap - 1; i >= s.cursor; i-- {
+			if scan(s.ring[i]) {
+				return out, nil
+			}
+		}
+	} else {
+		for i := s.cursor - 1; i >= 0; i-- {
+			if scan(s.ring[i]) {
+				return out, nil
+			}
+		}
+	}
+	return out, nil
+}
+
+func (s *memoryEventStore) Count(_ context.Context) (int, error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	if s.full {
+		return s.cap, nil
+	}
+	return s.cursor, nil
+}
+
+func (s *memoryEventStore) Close() error { return nil }
diff --git a/core/services/routing/pii/stream.go b/core/services/routing/pii/stream.go
new file mode 100644
index 000000000..93a5cd261
--- /dev/null
+++ b/core/services/routing/pii/stream.go
@@ -0,0 +1,197 @@
+package pii
+
+import (
+	"context"
+	"crypto/rand"
+	"encoding/hex"
+	"strings"
+	"time"
+	"unicode/utf8"
+)
+
+// StreamFilter applies the regex PII tier to a streaming response,
+// chunk by chunk, with a buffered-emit invariant: for any active
+// pattern with bounded max-length L, the filter never emits the
+// trailing L-1 characters of the cumulative input until either
+//
+//   (a) more text arrives that disambiguates the boundary, or
+//   (b) the stream closes (Drain).
+//
+// That keeps the redactor honest across chunk splits — an email
+// arriving as "alice@" + "example.com" still masks the same way as
+// "alice@example.com" arriving in one piece.
+//
+// Action handling in stream mode differs from the request-side
+// middleware. Earlier chunks of the response are already on the wire
+// by the time later chunks are scanned, so a "block" can't actually
+// reject the request. We remap block → mask for redaction purposes
+// while still recording PIIEvent rows with action="block" so audits
+// surface the original intent ("the model would have leaked X here,
+// suppressed in flight"). route_local on the output side is a no-op
+// (the dispatch decision was already made on the request side).
+//
+// StreamFilter is NOT safe for concurrent use across goroutines; one
+// instance per response stream.
+type StreamFilter struct {
+	redactor      *Redactor
+	maskOverrides map[string]Action // block → mask map used for redaction
+	auditActions  map[string]Action // original action per pattern, used for events
+	store         EventStore
+	correlationID string
+	userID        string
+	holdLen       int
+	buffer        strings.Builder
+	emittedBytes  int
+}
+
+// NewStreamFilter constructs a per-response filter. modelOverrides is
+// the per-model action override map (same shape the request-side
+// middleware uses); it can be nil when the model only accepts global
+// defaults.
+//
+// store may be nil — events are then computed but not persisted, which
+// is what the chat handler does when --disable-stats is set.
+func NewStreamFilter(redactor *Redactor, modelOverrides map[string]Action, store EventStore, correlationID, userID string) *StreamFilter {
+	if redactor == nil {
+		return &StreamFilter{}
+	}
+
+	patterns := redactor.Patterns()
+
+	// auditActions: the action we *would* have applied if this match
+	// occurred on the request side. Honours the per-model override.
+	auditActions := make(map[string]Action, len(patterns))
+	for _, p := range patterns {
+		auditActions[p.ID] = p.Action
+	}
+	for id, action := range modelOverrides {
+		auditActions[id] = action
+	}
+
+	// maskOverrides: the action we actually apply to the stream. Same
+	// as auditActions, but with every block remapped to mask.
+	maskOverrides := make(map[string]Action, len(auditActions))
+	for id, action := range auditActions {
+		if action == ActionBlock {
+			maskOverrides[id] = ActionMask
+		} else {
+			maskOverrides[id] = action
+		}
+	}
+
+	return &StreamFilter{
+		redactor:      redactor,
+		maskOverrides: maskOverrides,
+		auditActions:  auditActions,
+		store:         store,
+		correlationID: correlationID,
+		userID:        userID,
+		holdLen:       redactor.MaxPatternLength() - 1,
+	}
+}
+
+// Push appends new text to the filter's buffer and returns the prefix
+// safe to emit downstream — the cumulative input minus a tail of
+// holdLen characters that might still be the start of a longer match.
+// Returned text has masks already applied.
+//
+// Returns an empty string when not enough text has arrived to clear
+// the hold window.
+func (sf *StreamFilter) Push(text string) string {
+	if sf.redactor == nil || sf.holdLen <= 0 {
+		return text
+	}
+	sf.buffer.WriteString(text)
+	bufStr := sf.buffer.String()
+	n := len(bufStr)
+
+	if n <= sf.holdLen {
+		return ""
+	}
+
+	emitBoundary := n - sf.holdLen
+
+	// Scan the entire buffer. A match whose start is before the
+	// boundary but whose end runs past it crosses the window — pull
+	// the boundary back to match.start so the pattern stays whole in
+	// the buffer for the next Push to scan again.
+	full := sf.redactor.RedactWithOverrides(bufStr, sf.maskOverrides)
+	for _, span := range full.Spans {
+		if span.Start < emitBoundary && span.End > emitBoundary {
+			emitBoundary = span.Start
+		}
+	}
+
+	// holdLen is byte-sized but a chunk boundary may land mid-codepoint.
+	// Snap back to the nearest rune start so neither the emitted prefix
+	// nor the retained tail contains a split codepoint — otherwise the
+	// next regex scan over an invalid-UTF-8 prefix could mis-match.
+	for emitBoundary > 0 && emitBoundary < n && !utf8.RuneStart(bufStr[emitBoundary]) {
+		emitBoundary--
+	}
+
+	if emitBoundary <= 0 {
+		return ""
+	}
+
+	emitted := sf.applyAndEmit(bufStr[:emitBoundary])
+	sf.buffer.Reset()
+	sf.buffer.WriteString(bufStr[emitBoundary:])
+	return emitted
+}
+
+// Drain emits whatever's left in the buffer with all matches applied.
+// Call exactly once when the stream closes — repeat calls return the
+// empty string.
+func (sf *StreamFilter) Drain() string {
+	if sf.redactor == nil {
+		return sf.buffer.String()
+	}
+	bufStr := sf.buffer.String()
+	if bufStr == "" {
+		return ""
+	}
+	emitted := sf.applyAndEmit(bufStr)
+	sf.buffer.Reset()
+	return emitted
+}
+
+// applyAndEmit runs the redactor over a committed-for-emit fragment,
+// substitutes mask/block placeholders inline, and records one
+// PIIEvent per matched span (with the audit action, not the masked
+// one). ByteOffset is referenced to the cumulative emitted output so
+// admins can correlate event positions against the streamed body.
+func (sf *StreamFilter) applyAndEmit(fragment string) string {
+	res := sf.redactor.RedactWithOverrides(fragment, sf.maskOverrides)
+	output := res.Redacted
+
+	if len(res.Spans) > 0 {
+		now := time.Now().UTC()
+		for _, span := range res.Spans {
+			ev := PIIEvent{
+				ID:            newStreamEventID(),
+				CorrelationID: sf.correlationID,
+				UserID:        sf.userID,
+				Direction:     DirectionOut,
+				PatternID:     span.Pattern,
+				ByteOffset:    sf.emittedBytes + span.Start,
+				Length:        span.End - span.Start,
+				HashPrefix:    span.HashPrefix,
+				Action:        sf.auditActions[span.Pattern],
+				CreatedAt:     now,
+			}
+			if sf.store != nil {
+				_ = sf.store.Record(context.Background(), ev)
+			}
+		}
+	}
+
+	sf.emittedBytes += len(fragment)
+	return output
+}
+
+func newStreamEventID() string {
+	var b [12]byte
+	_, _ = rand.Read(b[:])
+	return "pii_" + hex.EncodeToString(b[:])
+}
diff --git a/core/services/routing/pii/stream_test.go b/core/services/routing/pii/stream_test.go
new file mode 100644
index 000000000..037020609
--- /dev/null
+++ b/core/services/routing/pii/stream_test.go
@@ -0,0 +1,184 @@
+package pii
+
+import (
+	"context"
+	"fmt"
+	"math/rand"
+	"strings"
+	"unicode/utf8"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func newStreamRedactor(ids ...string) *Redactor {
+	all := DefaultPatterns()
+	chosen := all
+	if len(ids) > 0 {
+		chosen = pick(all, ids)
+	}
+	patterns, err := Compile(chosen)
+	ExpectWithOffset(1, err).NotTo(HaveOccurred(), "compile")
+	return NewRedactor(patterns)
+}
+
+var _ = Describe("StreamFilter", func() {
+	It("masks across chunks", func() {
+		// The most important streaming test: an email split arbitrarily
+		// across chunk boundaries must mask exactly the same way as one
+		// arriving in a single Push.
+		red := newStreamRedactor("email")
+		sf := NewStreamFilter(red, nil, nil, "", "")
+
+		// "alice@example.com" (17 bytes) split between '@' and 'e'.
+		out := ""
+		out += sf.Push("hi alice@")
+		out += sf.Push("example.com! end")
+		out += sf.Drain()
+
+		Expect(out).NotTo(ContainSubstring("alice@example.com"), "stream leaked email across chunk boundary")
+		Expect(out).To(ContainSubstring("[REDACTED:email]"))
+	})
+
+	It("block becomes mask", func() {
+		// api_key_prefix is block by default. In stream mode the earlier
+		// chunks are already on the wire so block is impossible — the
+		// filter remaps to mask while still recording action="block" so
+		// the audit log keeps the original intent.
+		red := newStreamRedactor("api_key_prefix")
+		store := NewMemoryEventStore(0)
+		defer func() { _ = store.Close() }()
+		sf := NewStreamFilter(red, nil, store, "corr-1", "user-1")
+
+		out := sf.Push("here is your token: sk-abcdefghijklmnopqrstuvwxyz0123456789 done")
+		out += sf.Drain()
+
+		Expect(out).NotTo(ContainSubstring("abcdefghijklmnopqrstuvwxyz0123456789"), "block-in-stream must mask, leaked the value")
+		Expect(out).To(ContainSubstring("[REDACTED:api_key_prefix]"))
+
+		events, _ := store.List(context.Background(), ListQuery{Limit: 10})
+		Expect(events).To(HaveLen(1))
+		Expect(events[0].Action).To(Equal(ActionBlock), "audit must record original block action")
+		Expect(events[0].Direction).To(Equal(DirectionOut), "stream events must be DirectionOut")
+	})
+
+	It("no match passthrough", func() {
+		red := newStreamRedactor("email")
+		sf := NewStreamFilter(red, nil, nil, "", "")
+		out := sf.Push("perfectly clean text that should") + sf.Push(" pass through unchanged.") + sf.Drain()
+		Expect(out).To(Equal("perfectly clean text that should pass through unchanged."))
+	})
+
+	It("nil redactor passthrough", func() {
+		// --disable-pii path: NewStreamFilter(nil, ...) returns a filter
+		// that just forwards Push input verbatim.
+		sf := NewStreamFilter(nil, nil, nil, "", "")
+		out := sf.Push("any old text including alice@example.com") + sf.Drain()
+		Expect(out).To(Equal("any old text including alice@example.com"))
+	})
+
+	It("per-model overrides", func() {
+		// email defaults to mask; per-model override upgrades to block.
+		// In stream mode the override still maps to mask placeholder, but
+		// the audit event records action="block".
+		red := newStreamRedactor("email")
+		store := NewMemoryEventStore(0)
+		defer func() { _ = store.Close() }()
+		sf := NewStreamFilter(red, map[string]Action{"email": ActionBlock}, store, "corr-2", "user-2")
+
+		out := sf.Push("contact alice@example.com please") + sf.Drain()
+		Expect(out).NotTo(ContainSubstring("alice@example.com"), "override block-in-stream must mask")
+		events, _ := store.List(context.Background(), ListQuery{Limit: 10})
+		Expect(events).To(HaveLen(1))
+		Expect(events[0].Action).To(Equal(ActionBlock))
+	})
+
+	// StreamFilter_BufferedEmitInvariant feeds the redactor a corpus
+	// one rune at a time, randomly chunked, and asserts:
+	//
+	//   1. Across all (input, splitting) pairs, the cumulative emitted
+	//      output never contains any of the secret values that were
+	//      embedded in the input.
+	//   2. The output, fully drained, equals what Redact would have
+	//      produced on the unsplit input.
+	//
+	// This is the load-bearing property of streaming PII: regardless of
+	// where chunks split, the emitted bytes cannot contain a value that a
+	// single-shot redactor would have masked.
+	It("buffered emit invariant", func() {
+		corpus := []struct {
+			text    string
+			secrets []string
+		}{
+			{"contact alice@example.com or bob@example.org", []string{"alice@example.com", "bob@example.org"}},
+			{"my SSN is 123-45-6789 and his is 987-65-4321", []string{"123-45-6789", "987-65-4321"}},
+			{"sk-abcdefghijklmnopqrstuvwxyz0123456789 leaked", []string{"sk-abcdefghijklmnopqrstuvwxyz0123456789"}},
+			{"repeats: alice@example.com / alice@example.com / alice@example.com", []string{"alice@example.com"}},
+			// Multibyte UTF-8 corpora pin the rune-boundary snap in
+			// StreamFilter.Push: holdLen is byte-sized, so a chunk boundary
+			// may land mid-codepoint. Without the snap, the retained tail
+			// has a partial codepoint and the next regex scan can mis-align.
+			// Each entry mixes ASCII secrets with surrounding multibyte text
+			// so a byte-aligned cut would land inside a CJK or accented
+			// character on at least some splits.
+			{"こんにちは alice@example.com さようなら", []string{"alice@example.com"}},
+			{"クレジットカード: 4111-1111-1111-1111 終わり", []string{"4111-1111-1111-1111"}},
+			{"naïve résumé: alice@example.com, façade", []string{"alice@example.com"}},
+		}
+
+		red := newStreamRedactor()        // all default patterns
+		rng := rand.New(rand.NewSource(1)) // seeded for reproducibility
+
+		for _, tc := range corpus {
+			for trial := 0; trial < 10; trial++ {
+				sf := NewStreamFilter(red, nil, nil, "", "")
+				var out strings.Builder
+				for i := 0; i < utf8.RuneCountInString(tc.text); {
+					// Random chunk size 1-8 runes, never crossing the end.
+					chunk := 1 + rng.Intn(8)
+					if i+chunk > utf8.RuneCountInString(tc.text) {
+						chunk = utf8.RuneCountInString(tc.text) - i
+					}
+					out.WriteString(sf.Push(stringSlice(tc.text, i, i+chunk)))
+					i += chunk
+				}
+				out.WriteString(sf.Drain())
+				result := out.String()
+
+				// Property 1: no secret value appears anywhere in the
+				// output.
+				for _, secret := range tc.secrets {
+					Expect(result).NotTo(ContainSubstring(secret),
+						fmt.Sprintf("trial %d: secret %q leaked through streaming\n  input: %q\n  output: %q", trial, secret, tc.text, result))
+				}
+
+				// Property 2: the streamed output equals what a single-shot
+				// Redact would have produced on the same input. (Block
+				// patterns get masked in stream mode, so we compare against
+				// a remapped redaction.)
+				expected := singleShotMaskAll(red, tc.text)
+				Expect(result).To(Equal(expected),
+					fmt.Sprintf("trial %d: stream != single-shot\n  input: %q", trial, tc.text))
+			}
+		}
+	})
+})
+
+// singleShotMaskAll runs the redactor in one pass with all blocks
+// remapped to mask — the same view the StreamFilter produces.
+func singleShotMaskAll(red *Redactor, text string) string {
+	patterns := red.Patterns()
+	overrides := make(map[string]Action, len(patterns))
+	for _, p := range patterns {
+		if p.Action == ActionBlock {
+			overrides[p.ID] = ActionMask
+		}
+	}
+	res := red.RedactWithOverrides(text, overrides)
+	return res.Redacted
+}
+
+func stringSlice(s string, fromRune, toRune int) string {
+	runes := []rune(s)
+	return string(runes[fromRune:toRune])
+}
diff --git a/core/services/routing/pii/types.go b/core/services/routing/pii/types.go
new file mode 100644
index 000000000..afdcc7ad4
--- /dev/null
+++ b/core/services/routing/pii/types.go
@@ -0,0 +1,170 @@
+// Package pii implements the routing-module PII / sensitive-data filter.
+//
+// Two tiers are planned (per the routing plan):
+//
+//  1. Regex tier: cheap, deterministic patterns (email, phone, SSN, credit
+//     card with Luhn, IPs, API-key prefixes). Always on by default.
+//  2. Encoder NER tier: a HF token-classification model exposed via a new
+//     gRPC TokenClassify RPC. Out of scope for this slice — added later.
+//
+// This file ships tier 1 only. The Pipeline interface is shaped so tier 2
+// drops in without changing call sites.
+//
+// Configuration model: each pattern has an Action (block | mask |
+// route_local). Actions are evaluated in this order:
+//   - block: short-circuits the request with an error (the middleware
+//     returns 400 to the client).
+//   - mask: replaces the matched span with ReplacementFor(pattern).
+//   - route_local: leaves the text alone but sets a context flag the
+//     router (subsystem 2) treats as "this request must stay on a local
+//     model" — never crosses the boundary to a cloud proxy backend.
+package pii
+
+import "time"
+
+// Action describes what to do when a pattern matches.
+type Action string
+
+const (
+	// ActionMask replaces the matched span with a placeholder. The
+	// default. Lets the request proceed to the backend with the
+	// sensitive token removed.
+	ActionMask Action = "mask"
+
+	// ActionBlock rejects the entire request. The middleware returns
+	// 400 with an error referencing the matched pattern_id (but never
+	// the matched value).
+	ActionBlock Action = "block"
+
+	// ActionRouteLocal leaves the text intact but flags the request so
+	// the content router will refuse to dispatch it to a cloud proxy
+	// backend. Useful when a deployment trusts local models with
+	// sensitive data but not external providers.
+	ActionRouteLocal Action = "route_local"
+)
+
+// Direction tags whether a PIIEvent fired on input (request body before
+// dispatch) or output (response stream after generation). Stored in the
+// PIIEvent record so admins can see which direction PII appeared in.
+type Direction string
+
+const (
+	DirectionIn  Direction = "in"
+	DirectionOut Direction = "out"
+)
+
+// Span is a half-open byte range [Start, End) within a scanned string.
+// Pattern is the rule that matched. Text never holds the matched value
+// itself — call sites that need the value (for masking) do their own
+// substring slicing; call sites that need to log it strip it via
+// HashPrefix.
+type Span struct {
+	Start     int
+	End       int
+	Pattern   string // matches Pattern.ID
+	HashPrefix string // first 8 chars of sha256(matched value); audit-safe
+}
+
+// Result is what Redact returns. Redacted is the input string after
+// all configured masks were applied. Spans are the original positions
+// of every match (in the original input — not the redacted output —
+// so admins can see where things were).
+//
+// Blocked is true iff at least one matched pattern had Action=block;
+// the call site must enforce this by returning a 400 / refusing to
+// dispatch.
+//
+// LocalOnly is true iff at least one matched pattern had
+// Action=route_local. The router middleware reads this and constrains
+// candidate selection.
+type Result struct {
+	Redacted  string
+	Spans     []Span
+	Blocked   bool
+	LocalOnly bool
+}
+
+// Pattern is one configurable rule. Description is shown in the admin
+// UI alongside the pattern; the regex itself stays an implementation
+// detail (a leak-prone admin showing an SSN regex with a sample value
+// in the field is a risk we deliberately design around).
+type Pattern struct {
+	ID          string
+	Description string
+	Action      Action
+	// Disabled skips the pattern entirely when true — useful for
+	// admins who want to keep a regex around (visible in the UI) but
+	// turn it off without removing the YAML entry. Default-false so
+	// every existing pattern stays active without touching its config.
+	Disabled bool
+	// MaxMatchLength is the longest possible match in characters. The
+	// streaming filter (subsystem 3, follow-up commit) uses this to
+	// size its tail buffer. For regex patterns we compute it at
+	// compile time from the pattern's structure when possible, or set
+	// a conservative upper bound otherwise.
+	MaxMatchLength int
+
+	// internal — populated by Compile().
+	regex regexpMatcher
+}
+
+// EventKind classifies a stored audit event. The store is shared by the
+// PII filter (its original use), the MITM proxy (connect decisions and
+// per-request traffic counters), and — when subsystem 2 lands — the
+// content router. Filtering by Kind keeps unrelated event types out of
+// each other's UI tabs without splitting storage.
+//
+// An empty Kind is treated as KindPII so rows written before this field
+// existed still classify correctly.
+type EventKind string
+
+const (
+	KindPII          EventKind = "pii"
+	KindProxyConnect EventKind = "proxy_connect"
+	KindProxyTraffic EventKind = "proxy_traffic"
+	// KindAdmission rows are written by the admission middleware
+	// (routing subsystem 5) when a request is rejected because a
+	// model's MaxConcurrent ceiling is full. The Host field carries
+	// the model name (overloading the existing column rather than
+	// adding a new one — admins read it as "the thing that was
+	// busy"); StatusCode is 503.
+	KindAdmission EventKind = "admission"
+)
+
+// PIIEvent is the persisted record. The Hash field is the first 8 chars
+// of sha256(matched value) — enough to deduplicate "is this the same
+// thing as last time" without ever storing the value itself.
+//
+// Proxy-event fields (Host, Intercepted, Bytes*, StatusCode, DurationMS)
+// are only set when Kind is KindProxyConnect or KindProxyTraffic. They
+// hold connection-level metadata for audit and basic diagnostics — never
+// request bodies. Use the API/backend traces to inspect contents.
+type PIIEvent struct {
+	ID            string    `json:"id"`
+	Kind          EventKind `json:"kind,omitempty"`
+	CorrelationID string    `json:"correlation_id,omitempty"`
+	UserID        string    `json:"user_id,omitempty"`
+	Direction     Direction `json:"direction,omitempty"`
+	PatternID     string    `json:"pattern_id,omitempty"`
+	ByteOffset    int       `json:"byte_offset,omitempty"`
+	Length        int       `json:"length,omitempty"`
+	HashPrefix    string    `json:"hash_prefix,omitempty"`
+	Action        Action    `json:"action,omitempty"`
+	CreatedAt     time.Time `json:"created_at"`
+
+	Host          string `json:"host,omitempty"`
+	Intercepted   *bool  `json:"intercepted,omitempty"`
+	BytesSent     int64  `json:"bytes_sent,omitempty"`
+	BytesReceived int64  `json:"bytes_received,omitempty"`
+	StatusCode    int    `json:"status_code,omitempty"`
+	DurationMS    int64  `json:"duration_ms,omitempty"`
+}
+
+// ResolvedKind returns the event's Kind, treating an empty value as
+// KindPII for rows written before Kind existed.
+func (e PIIEvent) ResolvedKind() EventKind {
+	if e.Kind == "" {
+		return KindPII
+	}
+	return e.Kind
+}
diff --git a/core/services/routing/piiadapter/anthropic.go b/core/services/routing/piiadapter/anthropic.go
new file mode 100644
index 000000000..e059e6bc1
--- /dev/null
+++ b/core/services/routing/piiadapter/anthropic.go
@@ -0,0 +1,81 @@
+package piiadapter
+
+import (
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+)
+
+// Anthropic returns a pii.Adapter for *schema.AnthropicRequest. The
+// scan walks every message's text content (string-form or text blocks
+// inside the structured `[]any` content), and the apply writes redacted
+// text back in place.
+//
+// The shape mirrors OpenAI() — Anthropic's multimodal blocks
+// (`{"type":"image","source":{...}}`, `{"type":"tool_use", ...}`) are
+// left untouched; text-block scanning covers the chat-completion path.
+//
+// System prompts in the Anthropic API live on the request's top-level
+// System field, not in Messages — they're skipped here for now (chat
+// messages are the high-traffic surface). System-prompt scanning is a
+// follow-up if a deployment proves it needs it.
+func Anthropic() pii.Adapter {
+	return pii.Adapter{
+		Scan: func(parsed any) []pii.ScannedText {
+			req, ok := parsed.(*schema.AnthropicRequest)
+			if !ok || req == nil {
+				return nil
+			}
+			var out []pii.ScannedText
+			for i := range req.Messages {
+				msg := &req.Messages[i]
+				switch ct := msg.Content.(type) {
+				case string:
+					if ct != "" {
+						out = append(out, pii.ScannedText{
+							Index: encodeIdx(i, -1),
+							Text:  ct,
+						})
+					}
+				case []any:
+					for j, block := range ct {
+						if blockMap, ok := block.(map[string]any); ok {
+							if blockMap["type"] == "text" {
+								if text, ok := blockMap["text"].(string); ok && text != "" {
+									out = append(out, pii.ScannedText{
+										Index: encodeIdx(i, j),
+										Text:  text,
+									})
+								}
+							}
+						}
+					}
+				}
+			}
+			return out
+		},
+		Apply: func(parsed any, updates []pii.ScannedText) {
+			req, ok := parsed.(*schema.AnthropicRequest)
+			if !ok || req == nil {
+				return
+			}
+			for _, u := range updates {
+				msgIdx, blockIdx := decodeIdx(u.Index)
+				if msgIdx < 0 || msgIdx >= len(req.Messages) {
+					continue
+				}
+				msg := &req.Messages[msgIdx]
+				if blockIdx < 0 {
+					msg.Content = u.Text
+					continue
+				}
+				blocks, ok := msg.Content.([]any)
+				if !ok || blockIdx >= len(blocks) {
+					continue
+				}
+				if blockMap, ok := blocks[blockIdx].(map[string]any); ok {
+					blockMap["text"] = u.Text
+				}
+			}
+		},
+	}
+}
diff --git a/core/services/routing/piiadapter/anthropic_test.go b/core/services/routing/piiadapter/anthropic_test.go
new file mode 100644
index 000000000..1ec72d4ee
--- /dev/null
+++ b/core/services/routing/piiadapter/anthropic_test.go
@@ -0,0 +1,69 @@
+package piiadapter
+
+import (
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Anthropic adapter", func() {
+	It("scans string content", func() {
+		req := &schema.AnthropicRequest{
+			Messages: []schema.AnthropicMessage{
+				{Role: "user", Content: "hi alice@example.com"},
+			},
+		}
+		got := Anthropic().Scan(req)
+		Expect(got).To(HaveLen(1))
+		Expect(got[0].Text).To(Equal("hi alice@example.com"))
+	})
+
+	It("scans text blocks", func() {
+		// AnthropicMessage.Content is `any`. After JSON decode of a real
+		// request it is []any of map[string]any blocks, exactly mirroring
+		// OpenAI's content-block shape — image blocks must be skipped, text
+		// blocks must be scanned.
+		req := &schema.AnthropicRequest{
+			Messages: []schema.AnthropicMessage{
+				{Role: "user", Content: []any{
+					map[string]any{"type": "text", "text": "first text"},
+					map[string]any{"type": "image", "source": map[string]any{"type": "base64", "data": "..."}},
+					map[string]any{"type": "text", "text": "second text"},
+				}},
+			},
+		}
+		got := Anthropic().Scan(req)
+		Expect(got).To(HaveLen(2))
+		Expect(got[0].Text).To(Equal("first text"))
+		Expect(got[1].Text).To(Equal("second text"))
+	})
+
+	It("Apply mutates string content", func() {
+		req := &schema.AnthropicRequest{
+			Messages: []schema.AnthropicMessage{
+				{Role: "user", Content: "original"},
+			},
+		}
+		adapter := Anthropic()
+		got := adapter.Scan(req)
+		adapter.Apply(req, []pii.ScannedText{{Index: got[0].Index, Text: "redacted"}})
+		Expect(req.Messages[0].Content).To(Equal("redacted"))
+	})
+
+	It("Apply mutates text block content", func() {
+		req := &schema.AnthropicRequest{
+			Messages: []schema.AnthropicMessage{
+				{Role: "user", Content: []any{
+					map[string]any{"type": "text", "text": "original"},
+				}},
+			},
+		}
+		adapter := Anthropic()
+		got := adapter.Scan(req)
+		adapter.Apply(req, []pii.ScannedText{{Index: got[0].Index, Text: "redacted"}})
+		blocks := req.Messages[0].Content.([]any)
+		block := blocks[0].(map[string]any)
+		Expect(block["text"]).To(Equal("redacted"))
+	})
+})
diff --git a/core/services/routing/piiadapter/openai.go b/core/services/routing/piiadapter/openai.go
new file mode 100644
index 000000000..a79b4dd7e
--- /dev/null
+++ b/core/services/routing/piiadapter/openai.go
@@ -0,0 +1,112 @@
+// Package piiadapter holds the per-API-shape adapters that translate
+// between the routing/pii middleware and concrete request types from
+// core/schema. Lives outside core/services/routing/pii so the schema
+// package never imports pii (and pii never imports schema), keeping
+// the dependency direction clean.
+package piiadapter
+
+import (
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+)
+
+// OpenAI returns a pii.Adapter for *schema.OpenAIRequest. It scans
+// every chat message's text content (string-form or text blocks of
+// the structured `[]any` content), and writes redacted text back.
+//
+// Multimodal content (image_url, audio_url, video_url) is left alone
+// — PII in image bytes is the encoder NER tier's problem, not the
+// regex tier's. We do walk text fields embedded inside content
+// blocks because those are the most common shape Claude Code and
+// similar clients produce.
+//
+// System / developer / tool messages are scanned as well: an API key
+// pasted into a system prompt is just as leak-prone as one in a user
+// message.
+func OpenAI() pii.Adapter {
+	return pii.Adapter{
+		Scan: func(parsed any) []pii.ScannedText {
+			req, ok := parsed.(*schema.OpenAIRequest)
+			if !ok || req == nil {
+				return nil
+			}
+			var out []pii.ScannedText
+			for i := range req.Messages {
+				msg := &req.Messages[i]
+				switch ct := msg.Content.(type) {
+				case string:
+					if ct != "" {
+						// Index encodes (message index, -1) to mean
+						// "the whole Content string". Negative
+						// inner indices are a valid sentinel because
+						// real array indices are ≥ 0.
+						out = append(out, pii.ScannedText{
+							Index: encodeIdx(i, -1),
+							Text:  ct,
+						})
+					}
+				case []any:
+					for j, block := range ct {
+						if blockMap, ok := block.(map[string]any); ok {
+							if blockMap["type"] == "text" {
+								if text, ok := blockMap["text"].(string); ok && text != "" {
+									out = append(out, pii.ScannedText{
+										Index: encodeIdx(i, j),
+										Text:  text,
+									})
+								}
+							}
+						}
+					}
+				}
+			}
+			return out
+		},
+		Apply: func(parsed any, updates []pii.ScannedText) {
+			req, ok := parsed.(*schema.OpenAIRequest)
+			if !ok || req == nil {
+				return
+			}
+			for _, u := range updates {
+				msgIdx, blockIdx := decodeIdx(u.Index)
+				if msgIdx < 0 || msgIdx >= len(req.Messages) {
+					continue
+				}
+				msg := &req.Messages[msgIdx]
+				if blockIdx < 0 {
+					// Whole-string content.
+					msg.Content = u.Text
+					continue
+				}
+				blocks, ok := msg.Content.([]any)
+				if !ok || blockIdx >= len(blocks) {
+					continue
+				}
+				if blockMap, ok := blocks[blockIdx].(map[string]any); ok {
+					blockMap["text"] = u.Text
+				}
+			}
+		},
+	}
+}
+
+// encodeIdx packs (msg, block) into one int. block=-1 means
+// "the whole Content string"; bit 24 is the sentinel flag and
+// bits 0..23 hold the block index, leaving the rest for msg.
+const idxWholeStringFlag = 1 << 24
+const idxBlockMask = (1 << 24) - 1
+
+func encodeIdx(msg, block int) int {
+	if block < 0 {
+		return (msg << 25) | idxWholeStringFlag
+	}
+	return (msg << 25) | (block & idxBlockMask)
+}
+
+func decodeIdx(packed int) (msg, block int) {
+	msg = packed >> 25
+	if packed&idxWholeStringFlag != 0 {
+		return msg, -1
+	}
+	return msg, packed & idxBlockMask
+}
diff --git a/core/services/routing/piiadapter/openai_test.go b/core/services/routing/piiadapter/openai_test.go
new file mode 100644
index 000000000..f35ac959f
--- /dev/null
+++ b/core/services/routing/piiadapter/openai_test.go
@@ -0,0 +1,93 @@
+package piiadapter
+
+import (
+	"github.com/mudler/LocalAI/core/schema"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("OpenAI adapter", func() {
+	It("scans string content", func() {
+		req := &schema.OpenAIRequest{
+			Messages: []schema.Message{
+				{Role: "user", Content: "hello alice@example.com"},
+			},
+		}
+		adapter := OpenAI()
+		got := adapter.Scan(req)
+		Expect(got).To(HaveLen(1))
+		Expect(got[0].Text).To(Equal("hello alice@example.com"))
+	})
+
+	It("scans content blocks", func() {
+		req := &schema.OpenAIRequest{
+			Messages: []schema.Message{
+				{Role: "user", Content: []any{
+					map[string]any{"type": "text", "text": "block one"},
+					map[string]any{"type": "image_url", "image_url": map[string]any{"url": "data:image/png;base64,xyz"}},
+					map[string]any{"type": "text", "text": "block two"},
+				}},
+			},
+		}
+		adapter := OpenAI()
+		got := adapter.Scan(req)
+		Expect(got).To(HaveLen(2))
+		Expect(got[0].Text).To(Equal("block one"))
+		Expect(got[1].Text).To(Equal("block two"))
+	})
+
+	It("Apply mutates string content", func() {
+		req := &schema.OpenAIRequest{
+			Messages: []schema.Message{
+				{Role: "user", Content: "original"},
+				{Role: "user", Content: "second"},
+			},
+		}
+		adapter := OpenAI()
+		scans := adapter.Scan(req)
+		updates := scans
+		updates[0].Text = "REDACTED-0"
+		updates[1].Text = "REDACTED-1"
+		adapter.Apply(req, updates)
+
+		Expect(req.Messages[0].Content.(string)).To(Equal("REDACTED-0"))
+		Expect(req.Messages[1].Content.(string)).To(Equal("REDACTED-1"))
+	})
+
+	It("Apply mutates content block selectively", func() {
+		req := &schema.OpenAIRequest{
+			Messages: []schema.Message{
+				{Role: "user", Content: []any{
+					map[string]any{"type": "text", "text": "before"},
+					map[string]any{"type": "text", "text": "untouched"},
+				}},
+			},
+		}
+		adapter := OpenAI()
+		scans := adapter.Scan(req)
+		Expect(scans).To(HaveLen(2))
+
+		// Redact only the first block.
+		updates := []struct{ idx int }{{0}}
+		scans[updates[0].idx].Text = "AFTER"
+		adapter.Apply(req, scans[:1])
+
+		blocks := req.Messages[0].Content.([]any)
+		Expect(blocks[0].(map[string]any)["text"]).To(Equal("AFTER"))
+		Expect(blocks[1].(map[string]any)["text"]).To(Equal("untouched"))
+	})
+})
+
+var _ = Describe("encodeIdx/decodeIdx", func() {
+	It("round-trips message and block indices", func() {
+		cases := []struct{ msg, block int }{
+			{0, 0}, {0, 5}, {3, 0}, {3, 12}, {7, -1}, {0, -1},
+		}
+		for _, c := range cases {
+			got := encodeIdx(c.msg, c.block)
+			m, b := decodeIdx(got)
+			Expect(m).To(Equal(c.msg), "round-trip msg for (%d,%d)", c.msg, c.block)
+			Expect(b).To(Equal(c.block), "round-trip block for (%d,%d)", c.msg, c.block)
+		}
+	})
+})
diff --git a/core/services/routing/piiadapter/piiadapter_suite_test.go b/core/services/routing/piiadapter/piiadapter_suite_test.go
new file mode 100644
index 000000000..9d3134987
--- /dev/null
+++ b/core/services/routing/piiadapter/piiadapter_suite_test.go
@@ -0,0 +1,13 @@
+package piiadapter
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestPiiAdapter(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "PII Adapter test suite")
+}
diff --git a/core/services/routing/router/cache.go b/core/services/routing/router/cache.go
new file mode 100644
index 000000000..b314af22e
--- /dev/null
+++ b/core/services/routing/router/cache.go
@@ -0,0 +1,96 @@
+package router
+
+import (
+	"sort"
+	"strings"
+	"sync"
+)
+
+// labelSetCache memoises classifier output (a sorted active-label set)
+// keyed on the case-folded, whitespace-trimmed prompt. Both Score and
+// Rerank classifiers embed one.
+//
+// Eviction is naive (drop one arbitrary entry on overflow); the cache
+// is a hot-prompt amortiser, not a long-tail store, so LRU semantics
+// aren't worth the extra bookkeeping. Cap=0 disables the cache.
+type labelSetCache struct {
+	mu    sync.RWMutex
+	store map[string][]string
+	cap   int
+}
+
+func newLabelSetCache(size int) *labelSetCache {
+	if size < 0 {
+		size = 0
+	}
+	return &labelSetCache{store: make(map[string][]string, size), cap: size}
+}
+
+// cacheKey normalises a prompt for cache equality. Callers can compute
+// it once at the top of Classify and pass it to both get and put to
+// save the second TrimSpace+ToLower allocation on a miss.
+func cacheKey(prompt string) string {
+	return strings.ToLower(strings.TrimSpace(prompt))
+}
+
+func (c *labelSetCache) get(key string) ([]string, bool) {
+	if c.cap == 0 {
+		return nil, false
+	}
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	v, ok := c.store[key]
+	return v, ok
+}
+
+func (c *labelSetCache) put(key string, labels []string) {
+	if c.cap == 0 {
+		return
+	}
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	if len(c.store) >= c.cap {
+		for k := range c.store {
+			delete(c.store, k)
+			break
+		}
+	}
+	// Defensive copy + sort: cached label sets must be stable so
+	// callers can't mutate via aliasing, and equality comparisons
+	// in tests don't depend on insertion order.
+	cp := make([]string, len(labels))
+	copy(cp, labels)
+	sort.Strings(cp)
+	c.store[key] = cp
+}
+
+func (c *labelSetCache) len() int {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	return len(c.store)
+}
+
+// selectActive picks the labels whose corresponding score clears
+// threshold, plus the index of the argmax. If no label clears the
+// threshold the caller falls back to the argmax — both classifiers
+// guarantee a non-empty active set so the surrounding middleware
+// always has something to route on. Returns nil active when labels
+// is empty.
+func selectActive(scores []float64, labels []string, threshold float64) (active []string, bestIdx int) {
+	if len(labels) == 0 {
+		return nil, 0
+	}
+	active = make([]string, 0, 2)
+	for i, s := range scores {
+		if s > scores[bestIdx] {
+			bestIdx = i
+		}
+		if s >= threshold {
+			active = append(active, labels[i])
+		}
+	}
+	if len(active) == 0 {
+		active = []string{labels[bestIdx]}
+	}
+	return active, bestIdx
+}
diff --git a/core/services/routing/router/decisions.go b/core/services/routing/router/decisions.go
new file mode 100644
index 000000000..d446ac29a
--- /dev/null
+++ b/core/services/routing/router/decisions.go
@@ -0,0 +1,166 @@
+package router
+
+import (
+	"context"
+	"sync"
+	"time"
+)
+
+// Decision row written to the in-memory store. Mirrors the PIIEvent
+// shape so the admin page can render the two side-by-side. Note:
+// Prompt is NEVER stored — admins audit by Hash if they need to
+// dedupe recurring routing patterns.
+type DecisionRecord struct {
+	ID            string        `json:"id"`
+	CorrelationID string        `json:"correlation_id"`
+	UserID        string        `json:"user_id"`
+	RouterModel   string        `json:"router_model"`   // The smart-router model name the client asked for.
+	RequestedModel string       `json:"requested_model"`// Same as RouterModel for now; reserved for chained routers.
+	ServedModel   string        `json:"served_model"`   // The candidate the classifier picked.
+	Classifier    string        `json:"classifier"`     // Classifier.Name(), e.g. "score".
+	Label         string        `json:"label"`
+	Score         float64       `json:"score"`
+	LatencyMs     int64         `json:"latency_ms"`
+	Cached        bool          `json:"cached"`         // True when the decision came from the L2 embedding cache.
+	CacheSimilarity float64     `json:"cache_similarity,omitempty"` // Cosine similarity of the cache hit, 0 when not cached.
+	// LabelScores carries the full per-label score distribution so the
+	// admin UI can show how close inactive labels got to the activation
+	// threshold. Empty on cache hits (only the final label set is cached).
+	LabelScores         []LabelScore `json:"label_scores,omitempty"`
+	ActivationThreshold float64      `json:"activation_threshold,omitempty"`
+	// Source groups decisions by the entry point that produced them so
+	// the admin page can split realtime / chat / anthropic streams. Empty
+	// string is treated as "chat" for backward compatibility with rows
+	// written before the field existed.
+	Source        string        `json:"source,omitempty"`
+	CreatedAt     time.Time     `json:"created_at"`
+}
+
+// Source values for DecisionRecord.Source. Kept as constants so callers
+// don't drift on capitalisation.
+const (
+	SourceChat      = "chat"
+	SourceAnthropic = "anthropic"
+	SourceRealtime  = "realtime"
+)
+
+// DecisionStore persists routing decisions for the admin page and
+// future drift checks. In-process by default so a no-auth box still
+// gets a decision log; a future GORM impl can reuse the auth DB.
+type DecisionStore interface {
+	Record(ctx context.Context, r DecisionRecord) error
+	List(ctx context.Context, q DecisionListQuery) ([]DecisionRecord, error)
+	Count(ctx context.Context) (int, error)
+	Close() error
+}
+
+// DecisionListQuery filters the decision log. Empty fields match all.
+// Limit ≤ 0 picks a default cap.
+type DecisionListQuery struct {
+	CorrelationID string
+	UserID        string
+	RouterModel   string
+	Source        string
+	Limit         int
+}
+
+// NewMemoryDecisionStore returns a ring-buffer DecisionStore. capacity
+// ≤ 0 picks 5_000 — same order of magnitude as PIIEvents but smaller
+// because routing decisions correlate one-to-one with usage records;
+// the existing UsageRecord log carries the bulk.
+func NewMemoryDecisionStore(capacity int) DecisionStore {
+	if capacity <= 0 {
+		capacity = 5_000
+	}
+	return &memoryDecisionStore{
+		ring: make([]DecisionRecord, capacity),
+		cap:  capacity,
+	}
+}
+
+type memoryDecisionStore struct {
+	mu     sync.RWMutex
+	ring   []DecisionRecord
+	cap    int
+	cursor int
+	full   bool
+}
+
+func (s *memoryDecisionStore) Record(_ context.Context, r DecisionRecord) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.ring[s.cursor] = r
+	s.cursor++
+	if s.cursor == s.cap {
+		s.cursor = 0
+		s.full = true
+	}
+	return nil
+}
+
+func (s *memoryDecisionStore) List(_ context.Context, q DecisionListQuery) ([]DecisionRecord, error) {
+	limit := q.Limit
+	if limit <= 0 {
+		limit = 1000
+	}
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	out := make([]DecisionRecord, 0, limit)
+	scan := func(r DecisionRecord) bool {
+		if r.ID == "" {
+			return false
+		}
+		if q.CorrelationID != "" && r.CorrelationID != q.CorrelationID {
+			return false
+		}
+		if q.UserID != "" && r.UserID != q.UserID {
+			return false
+		}
+		if q.RouterModel != "" && r.RouterModel != q.RouterModel {
+			return false
+		}
+		if q.Source != "" {
+			// Empty source on the row is treated as SourceChat for back-
+			// compat with rows written before the field existed.
+			rowSource := r.Source
+			if rowSource == "" {
+				rowSource = SourceChat
+			}
+			if rowSource != q.Source {
+				return false
+			}
+		}
+		out = append(out, r)
+		return len(out) >= limit
+	}
+	if s.full {
+		for i := s.cursor - 1; i >= 0; i-- {
+			if scan(s.ring[i]) {
+				return out, nil
+			}
+		}
+		for i := s.cap - 1; i >= s.cursor; i-- {
+			if scan(s.ring[i]) {
+				return out, nil
+			}
+		}
+	} else {
+		for i := s.cursor - 1; i >= 0; i-- {
+			if scan(s.ring[i]) {
+				return out, nil
+			}
+		}
+	}
+	return out, nil
+}
+
+func (s *memoryDecisionStore) Count(_ context.Context) (int, error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	if s.full {
+		return s.cap, nil
+	}
+	return s.cursor, nil
+}
+
+func (s *memoryDecisionStore) Close() error { return nil }
diff --git a/core/services/routing/router/embedding_cache.go b/core/services/routing/router/embedding_cache.go
new file mode 100644
index 000000000..ba9063534
--- /dev/null
+++ b/core/services/routing/router/embedding_cache.go
@@ -0,0 +1,227 @@
+package router
+
+import (
+	"context"
+	"encoding/json"
+	"sync/atomic"
+	"time"
+
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/xlog"
+)
+
+// EmbeddingCacheStats reports per-classifier cache hit/miss/error
+// counts. Surfaced through /api/router/cache/stats and the Routing tab
+// so admins can see whether the cache is paying off.
+//
+// Hits + NearMisses + Misses equals the total number of Search calls
+// that succeeded (no embedder/store error). NearMisses are kept
+// separate from Misses because their similarity is observable —
+// lowering similarity_threshold turns near-misses into hits without
+// growing the cache, so the ratio tells admins how much room is left
+// in the current threshold.
+type EmbeddingCacheStats struct {
+	Hits           uint64 `json:"hits"`
+	Misses         uint64 `json:"misses"`         // empty store or no similar key
+	NearMisses     uint64 `json:"near_misses"`    // store returned a key but below similarity_threshold
+	LowConfidence  uint64 `json:"low_confidence"` // decisions we deliberately did not cache
+	EmbedderErrors uint64 `json:"embedder_errors"`
+	StoreErrors    uint64 `json:"store_errors"`
+
+	// SimilarityBuckets is a 10-bin histogram of the cosine
+	// similarities the store reported for any successful Search (hits
+	// and near-misses combined). Index i covers similarity [i/10,
+	// (i+1)/10). Counts are non-decreasing across the classifier's
+	// lifetime; reset via process restart.
+	SimilarityBuckets [10]uint64 `json:"similarity_buckets"`
+}
+
+// EmbeddingCacheClassifier wraps an inner Classifier with an
+// embedding-similarity cache. On Classify it first embeds the probe,
+// asks the vector store for the nearest past decision, and returns
+// it if similarity passes the configured threshold. Misses fall
+// through to the inner classifier, and high-confidence outcomes are
+// inserted into the store for future hits.
+//
+// Failure modes — embedder error, store error — degrade to the inner
+// classifier's result. Routing never fails because of cache plumbing.
+type EmbeddingCacheClassifier struct {
+	inner               Classifier
+	embedder            backend.Embedder
+	store               backend.VectorStore
+	similarityThreshold float64
+	confidenceThreshold float64
+
+	hits           atomic.Uint64
+	misses         atomic.Uint64
+	nearMisses     atomic.Uint64
+	lowConfidence  atomic.Uint64
+	embedderErrors atomic.Uint64
+	storeErrors    atomic.Uint64
+	simBuckets     [10]atomic.Uint64
+}
+
+// Default thresholds. Re-tune per (embedding model, corpus) — the
+// admin histogram on the Routing tab shows where the cosine
+// distribution actually sits.
+const (
+	defaultEmbeddingSimilarity = 0.80
+	defaultEmbeddingConfidence = 0.60
+)
+
+// NewEmbeddingCacheClassifier wraps inner with an embedding-similarity
+// cache. Panics on misconfiguration (nil inner / embedder / store) —
+// same fail-fast posture as the score classifier.
+//
+// Zero threshold picks the package default (defaultEmbeddingSimilarity
+// / defaultEmbeddingConfidence).
+func NewEmbeddingCacheClassifier(inner Classifier, embedder backend.Embedder, store backend.VectorStore, similarityThreshold, confidenceThreshold float64) *EmbeddingCacheClassifier {
+	if inner == nil {
+		panic("router/embedding_cache: inner classifier is required")
+	}
+	if embedder == nil {
+		panic("router/embedding_cache: embedder is required")
+	}
+	if store == nil {
+		panic("router/embedding_cache: vector store is required")
+	}
+	if similarityThreshold <= 0 {
+		similarityThreshold = defaultEmbeddingSimilarity
+	}
+	if confidenceThreshold <= 0 {
+		confidenceThreshold = defaultEmbeddingConfidence
+	}
+	return &EmbeddingCacheClassifier{
+		inner:               inner,
+		embedder:            embedder,
+		store:               store,
+		similarityThreshold: similarityThreshold,
+		confidenceThreshold: confidenceThreshold,
+	}
+}
+
+// Name is the inner classifier's name — the decision-log "classifier"
+// field should reflect *what* made the decision, not the caching
+// transport. Cache hits set Decision.Cached separately so admins can
+// still distinguish a cached lookup from a fresh run.
+func (c *EmbeddingCacheClassifier) Name() string {
+	return c.inner.Name()
+}
+
+// Stats returns a snapshot of the cache counters.
+func (c *EmbeddingCacheClassifier) Stats() EmbeddingCacheStats {
+	s := EmbeddingCacheStats{
+		Hits:           c.hits.Load(),
+		Misses:         c.misses.Load(),
+		NearMisses:     c.nearMisses.Load(),
+		LowConfidence:  c.lowConfidence.Load(),
+		EmbedderErrors: c.embedderErrors.Load(),
+		StoreErrors:    c.storeErrors.Load(),
+	}
+	for i := range c.simBuckets {
+		s.SimilarityBuckets[i] = c.simBuckets[i].Load()
+	}
+	return s
+}
+
+func (c *EmbeddingCacheClassifier) Classify(ctx context.Context, p Probe) (Decision, error) {
+	start := time.Now()
+
+	vec, err := c.embedder.Embed(ctx, p.Prompt)
+	if err != nil {
+		c.embedderErrors.Add(1)
+		xlog.Warn("router: embedding cache embed failed", "error", err)
+		// Embedder failure — fall through to the inner classifier so
+		// routing still happens. The miss is not a hard error.
+		return c.inner.Classify(ctx, p)
+	}
+
+	sim, payload, hit, err := c.store.Search(ctx, vec)
+	if err != nil {
+		c.storeErrors.Add(1)
+		xlog.Warn("router: embedding cache store.Search failed", "error", err, "vec_dim", len(vec))
+		return c.inner.Classify(ctx, p)
+	}
+	if hit {
+		// Bin the similarity once, regardless of threshold outcome.
+		// Admins read this back to see where the cosine distribution
+		// sits relative to the configured similarity_threshold.
+		c.recordSimilarity(sim)
+		if sim >= c.similarityThreshold {
+			if cached, ok := decodeCachedDecision(payload); ok {
+				c.hits.Add(1)
+				cached.Cached = true
+				cached.CacheSimilarity = sim
+				cached.Latency = time.Since(start)
+				return cached, nil
+			}
+			// Payload corrupt — treat as miss and overwrite on the next
+			// confident decision.
+			c.misses.Add(1)
+		} else {
+			c.nearMisses.Add(1)
+		}
+	} else {
+		c.misses.Add(1)
+	}
+	decision, err := c.inner.Classify(ctx, p)
+	if err != nil {
+		return decision, err
+	}
+
+	// Don't poison the cache with uncertain decisions. The score
+	// classifier's softmax can put the top label as low as 1/N in
+	// pathological cases; only store outcomes where the model is
+	// clearly committed.
+	if decision.Score < c.confidenceThreshold {
+		c.lowConfidence.Add(1)
+		return decision, nil
+	}
+
+	payload, encodeErr := encodeCachedDecision(decision)
+	if encodeErr != nil {
+		// Encoding can't realistically fail for the Decision type but
+		// guard so a future field doesn't break routing silently.
+		return decision, nil
+	}
+	if insertErr := c.store.Insert(ctx, vec, payload); insertErr != nil {
+		c.storeErrors.Add(1)
+		xlog.Warn("router: embedding cache store.Insert failed", "error", insertErr, "vec_dim", len(vec))
+		// Insert failure is non-fatal — the decision is still good
+		// for this request, only the future-hit benefit is lost.
+	}
+	return decision, nil
+}
+
+// recordSimilarity increments the histogram bucket covering the given
+// cosine similarity. The store occasionally returns sim slightly above
+// 1.0 due to floating-point error on exact matches; we clamp to the
+// top bin to keep the histogram bounded.
+func (c *EmbeddingCacheClassifier) recordSimilarity(sim float64) {
+	bucket := max(0, min(9, int(sim*10)))
+	c.simBuckets[bucket].Add(1)
+}
+
+// cachedDecision is the on-disk shape stored in the vector backend.
+// Kept separate from Decision so transient fields (Latency, Cached,
+// CacheSimilarity) don't get serialized — they're per-call, not
+// per-prompt.
+type cachedDecision struct {
+	Labels []string `json:"labels"`
+	Score  float64  `json:"score"`
+}
+
+func encodeCachedDecision(d Decision) ([]byte, error) {
+	return json.Marshal(cachedDecision{Labels: append([]string(nil), d.Labels...), Score: d.Score})
+}
+
+func decodeCachedDecision(b []byte) (Decision, bool) {
+	var cd cachedDecision
+	if err := json.Unmarshal(b, &cd); err != nil {
+		return Decision{}, false
+	}
+	if len(cd.Labels) == 0 {
+		return Decision{}, false
+	}
+	return Decision{Labels: cd.Labels, Score: cd.Score}, true
+}
diff --git a/core/services/routing/router/embedding_cache_test.go b/core/services/routing/router/embedding_cache_test.go
new file mode 100644
index 000000000..726614d0e
--- /dev/null
+++ b/core/services/routing/router/embedding_cache_test.go
@@ -0,0 +1,311 @@
+package router_test
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"sync"
+	"time"
+
+	"github.com/mudler/LocalAI/core/services/routing/router"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// fakeEmbedder returns a vector keyed by a lookup table; this lets the
+// test exercise hit/miss control without depending on a real model.
+type fakeEmbedder struct {
+	mu       sync.Mutex
+	table    map[string][]float32
+	failOnce bool
+}
+
+func (e *fakeEmbedder) Embed(ctx context.Context, text string) ([]float32, error) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	if e.failOnce {
+		e.failOnce = false
+		return nil, errors.New("embedder offline")
+	}
+	v, ok := e.table[text]
+	if !ok {
+		return nil, errors.New("no embedding for: " + text)
+	}
+	return v, nil
+}
+
+// memVectorStore is an in-memory KNN store with exact-vector hits, used
+// to exercise the cache layer without a real local-store backend.
+// Similarity is 1.0 for an exact match (after vector quantisation), 0.5
+// for "close" (configured via the second-arg suffix), 0.0 otherwise.
+type memVectorStore struct {
+	mu      sync.Mutex
+	entries []memEntry
+	failOps int // remaining Search calls to fail before returning miss
+}
+
+type memEntry struct {
+	vec     []float32
+	payload []byte
+}
+
+func (s *memVectorStore) Search(_ context.Context, vec []float32) (float64, []byte, bool, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.failOps > 0 {
+		s.failOps--
+		return 0, nil, false, errors.New("store offline")
+	}
+	for _, e := range s.entries {
+		if vecEqual(e.vec, vec) {
+			return 1.0, e.payload, true, nil
+		}
+	}
+	// "close" hit if the leading element matches but the rest doesn't —
+	// lets a test simulate sim=0.8 without floating-point fragility.
+	for _, e := range s.entries {
+		if len(vec) > 0 && len(e.vec) > 0 && vec[0] == e.vec[0] {
+			return 0.80, e.payload, true, nil
+		}
+	}
+	return 0, nil, false, nil
+}
+
+func (s *memVectorStore) Insert(_ context.Context, vec []float32, payload []byte) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.entries = append(s.entries, memEntry{vec: append([]float32(nil), vec...), payload: append([]byte(nil), payload...)})
+	return nil
+}
+
+func vecEqual(a, b []float32) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
+
+// stubInner is a Classifier that records call count and returns a
+// pre-programmed Decision.
+type stubInner struct {
+	name     string
+	decision router.Decision
+	err      error
+	calls    int
+}
+
+func (s *stubInner) Classify(_ context.Context, _ router.Probe) (router.Decision, error) {
+	s.calls++
+	if s.err != nil {
+		return router.Decision{}, s.err
+	}
+	return s.decision, nil
+}
+
+func (s *stubInner) Name() string { return s.name }
+
+var _ = Describe("EmbeddingCache", func() {
+	ctx := context.Background()
+
+	Context("miss then hit on exact prompt", func() {
+		It("populates the cache and serves the second call", func() {
+			embedder := &fakeEmbedder{table: map[string][]float32{
+				"how do I exit vim": {1, 2, 3},
+			}}
+			store := &memVectorStore{}
+			inner := &stubInner{
+				name:     "score",
+				decision: router.Decision{Labels: []string{"code-generation"}, Score: 0.9},
+			}
+			cache := router.NewEmbeddingCacheClassifier(inner, embedder, store, 0.92, 0.6)
+
+			// First call → miss, inner runs, decision stored.
+			d, err := cache.Classify(ctx, router.Probe{Prompt: "how do I exit vim"})
+			Expect(err).NotTo(HaveOccurred(), "first classify")
+			Expect(d.Cached).To(BeFalse(), "first call should be a miss")
+			Expect(inner.calls).To(Equal(1))
+
+			// Second call with the same prompt → hit, inner NOT called again.
+			d, err = cache.Classify(ctx, router.Probe{Prompt: "how do I exit vim"})
+			Expect(err).NotTo(HaveOccurred(), "second classify")
+			Expect(d.Cached).To(BeTrue(), "second call should be a cache hit")
+			Expect(d.CacheSimilarity).To(Equal(1.0))
+			Expect(inner.calls).To(Equal(1), "inner ran on a hit")
+			Expect(d.Labels).To(Equal([]string{"code-generation"}))
+
+			stats := cache.Stats()
+			Expect(stats.Hits).To(Equal(uint64(1)))
+			Expect(stats.Misses).To(Equal(uint64(1)))
+			// Second call had sim=1.0 (exact match), so the top bucket
+			// should have one count.
+			Expect(stats.SimilarityBuckets[9]).To(Equal(uint64(1)), "SimilarityBuckets[9] should be 1 (sim=1.0 hit)")
+		})
+	})
+
+	Context("similarity below threshold", func() {
+		It("counts as a near-miss", func() {
+			// Two distinct prompts that produce vectors sharing only the
+			// first element — memVectorStore reports similarity 0.80, below
+			// the 0.92 threshold.
+			embedder := &fakeEmbedder{table: map[string][]float32{
+				"first prompt":  {1, 1, 1},
+				"second prompt": {1, 9, 9},
+			}}
+			store := &memVectorStore{}
+			inner := &stubInner{
+				name:     "score",
+				decision: router.Decision{Labels: []string{"math-reasoning"}, Score: 0.95},
+			}
+			cache := router.NewEmbeddingCacheClassifier(inner, embedder, store, 0.92, 0.6)
+
+			_, _ = cache.Classify(ctx, router.Probe{Prompt: "first prompt"})
+			d, err := cache.Classify(ctx, router.Probe{Prompt: "second prompt"})
+			Expect(err).NotTo(HaveOccurred(), "classify")
+			Expect(d.Cached).To(BeFalse(), "0.80 sim below 0.92 threshold should not hit")
+			Expect(inner.calls).To(Equal(2), "inner should have run twice")
+			stats := cache.Stats()
+			Expect(stats.NearMisses).To(Equal(uint64(1)), "NearMisses (sim=0.80 below 0.92 threshold)")
+			// Second call hit at sim=0.80 → bucket [0.8, 0.9) = index 8.
+			// First call missed cleanly (empty store) → no bucket.
+			Expect(stats.SimilarityBuckets[8]).To(Equal(uint64(1)), "SimilarityBuckets[8] (sim=0.80 near-miss)")
+		})
+	})
+
+	Context("low confidence decisions", func() {
+		It("are not cached", func() {
+			embedder := &fakeEmbedder{table: map[string][]float32{
+				"ambiguous": {7, 7, 7},
+			}}
+			store := &memVectorStore{}
+			// Score 0.4 < confidenceThreshold 0.6 → don't cache.
+			inner := &stubInner{
+				name:     "score",
+				decision: router.Decision{Labels: []string{"casual-chat"}, Score: 0.4},
+			}
+			cache := router.NewEmbeddingCacheClassifier(inner, embedder, store, 0.92, 0.6)
+
+			_, _ = cache.Classify(ctx, router.Probe{Prompt: "ambiguous"})
+			_, _ = cache.Classify(ctx, router.Probe{Prompt: "ambiguous"})
+
+			Expect(inner.calls).To(Equal(2), "second call should also miss")
+			stats := cache.Stats()
+			Expect(stats.LowConfidence).To(Equal(uint64(2)))
+			Expect(stats.Hits).To(Equal(uint64(0)))
+		})
+	})
+
+	Context("embedder error", func() {
+		It("degrades to inner classifier", func() {
+			embedder := &fakeEmbedder{
+				table:    map[string][]float32{"p": {1}},
+				failOnce: true,
+			}
+			store := &memVectorStore{}
+			inner := &stubInner{
+				name:     "score",
+				decision: router.Decision{Labels: []string{"x"}, Score: 0.99},
+			}
+			cache := router.NewEmbeddingCacheClassifier(inner, embedder, store, 0.92, 0.6)
+
+			d, err := cache.Classify(ctx, router.Probe{Prompt: "p"})
+			Expect(err).NotTo(HaveOccurred(), "classify")
+			Expect(d.Cached).To(BeFalse(), "embedder error should not produce a cache hit")
+			Expect(inner.calls).To(Equal(1), "inner should have run once via fallthrough")
+			stats := cache.Stats()
+			Expect(stats.EmbedderErrors).To(Equal(uint64(1)))
+		})
+	})
+
+	Context("store error", func() {
+		It("degrades to inner classifier", func() {
+			embedder := &fakeEmbedder{table: map[string][]float32{"p": {1}}}
+			store := &memVectorStore{failOps: 1}
+			inner := &stubInner{
+				name:     "score",
+				decision: router.Decision{Labels: []string{"x"}, Score: 0.99},
+			}
+			cache := router.NewEmbeddingCacheClassifier(inner, embedder, store, 0.92, 0.6)
+
+			_, _ = cache.Classify(ctx, router.Probe{Prompt: "p"})
+			stats := cache.Stats()
+			Expect(stats.StoreErrors).To(Equal(uint64(1)))
+		})
+	})
+
+	Context("Name", func() {
+		It("returns inner classifier name", func() {
+			embedder := &fakeEmbedder{}
+			store := &memVectorStore{}
+			inner := &stubInner{name: "score"}
+			cache := router.NewEmbeddingCacheClassifier(inner, embedder, store, 0, 0)
+			Expect(cache.Name()).To(Equal("score"))
+		})
+	})
+
+	Context("inner error", func() {
+		It("propagates", func() {
+			embedder := &fakeEmbedder{table: map[string][]float32{"p": {1}}}
+			store := &memVectorStore{}
+			inner := &stubInner{name: "score", err: errors.New("classifier blew up")}
+			cache := router.NewEmbeddingCacheClassifier(inner, embedder, store, 0.92, 0.6)
+
+			_, err := cache.Classify(ctx, router.Probe{Prompt: "p"})
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(Equal("classifier blew up"))
+		})
+	})
+
+	Context("default thresholds", func() {
+		It("apply for zero values", func() {
+			embedder := &fakeEmbedder{table: map[string][]float32{"p": {1}}}
+			store := &memVectorStore{}
+			inner := &stubInner{name: "score", decision: router.Decision{Labels: []string{"y"}, Score: 0.7}}
+			// thresholds=0 → defaults (0.92 / 0.60). 0.7 > 0.60 so should
+			// cache, and a re-call hits at sim=1.0 > 0.92.
+			cache := router.NewEmbeddingCacheClassifier(inner, embedder, store, 0, 0)
+			_, _ = cache.Classify(ctx, router.Probe{Prompt: "p"})
+			d, _ := cache.Classify(ctx, router.Probe{Prompt: "p"})
+			Expect(d.Cached).To(BeTrue(), "expected hit with default thresholds")
+		})
+	})
+
+	Context("corrupt payload", func() {
+		It("is treated as miss", func() {
+			embedder := &fakeEmbedder{table: map[string][]float32{"p": {1}}}
+			store := &memVectorStore{}
+			// Pre-poison the store with garbage that decodes to an empty
+			// label slice — Search will hit but the payload decoder must
+			// reject it, falling through to the inner classifier.
+			garbage, _ := json.Marshal(map[string]any{"labels": []string{}, "score": 1.0})
+			_ = store.Insert(ctx, []float32{1}, garbage)
+			inner := &stubInner{name: "score", decision: router.Decision{Labels: []string{"ok"}, Score: 0.8}}
+			cache := router.NewEmbeddingCacheClassifier(inner, embedder, store, 0.5, 0.5)
+			d, err := cache.Classify(ctx, router.Probe{Prompt: "p"})
+			Expect(err).NotTo(HaveOccurred(), "classify")
+			Expect(d.Cached).To(BeFalse(), "corrupt payload should not surface as a hit")
+			Expect(inner.calls).To(Equal(1), "inner should have run via fallthrough")
+		})
+	})
+})
+
+var _ = Describe("EmbeddingCache latency", func() {
+	It("is populated on hits", func() {
+		embedder := &fakeEmbedder{table: map[string][]float32{"p": {1}}}
+		store := &memVectorStore{}
+		inner := &stubInner{name: "score", decision: router.Decision{Labels: []string{"x"}, Score: 0.9, Latency: time.Millisecond}}
+		cache := router.NewEmbeddingCacheClassifier(inner, embedder, store, 0.92, 0.6)
+
+		_, _ = cache.Classify(context.Background(), router.Probe{Prompt: "p"})
+		d, _ := cache.Classify(context.Background(), router.Probe{Prompt: "p"})
+		Expect(d.Cached).To(BeTrue(), "expected hit")
+		// On a hit, Latency reflects the cache-lookup time, NOT the original
+		// classifier latency stored in the payload.
+		Expect(d.Latency).To(BeNumerically("<", time.Second), "Latency unreasonably high for an in-memory hit")
+	})
+})
diff --git a/core/services/routing/router/registry.go b/core/services/routing/router/registry.go
new file mode 100644
index 000000000..3eac5a103
--- /dev/null
+++ b/core/services/routing/router/registry.go
@@ -0,0 +1,76 @@
+package router
+
+import (
+	"sync"
+)
+
+// Registry is the process-wide store of built classifiers, keyed by
+// router-model name. The middleware uses it to avoid rebuilding the
+// score classifier on every request, and the admin status endpoint
+// reads from it to surface per-classifier cache stats.
+//
+// Each entry carries the fingerprint of the RouterConfig it was built
+// from. A Get() with a stale fingerprint reports a miss so the
+// middleware rebuilds — matches the previous local-sync.Map behaviour
+// that keyed on fingerprint alone.
+type Registry struct {
+	entries sync.Map // name → *registryEntry
+}
+
+type registryEntry struct {
+	fingerprint uint64
+	classifier  Classifier
+}
+
+func NewRegistry() *Registry { return &Registry{} }
+
+// Get returns the cached classifier for the named router model iff the
+// stored fingerprint matches. A miss (no entry, or stale fingerprint)
+// returns false; the caller is expected to rebuild and Put the result.
+func (r *Registry) Get(name string, fingerprint uint64) (Classifier, bool) {
+	if r == nil {
+		return nil, false
+	}
+	v, ok := r.entries.Load(name)
+	if !ok {
+		return nil, false
+	}
+	e := v.(*registryEntry)
+	if e.fingerprint != fingerprint {
+		return nil, false
+	}
+	return e.classifier, true
+}
+
+// Put stores a built classifier under (name, fingerprint), replacing
+// any prior entry. The middleware calls this after a Get miss.
+func (r *Registry) Put(name string, fingerprint uint64, c Classifier) {
+	if r == nil {
+		return
+	}
+	r.entries.Store(name, &registryEntry{fingerprint: fingerprint, classifier: c})
+}
+
+// EmbeddingCacheStatsByRouter returns a snapshot of every embedding
+// cache currently in the registry, keyed by router-model name. Plain
+// classifiers without the L2 cache wrapper are skipped — callers
+// distinguish "cache disabled" from "cache enabled with zero hits" by
+// the presence of the map key.
+func (r *Registry) EmbeddingCacheStatsByRouter() map[string]EmbeddingCacheStats {
+	if r == nil {
+		return nil
+	}
+	out := map[string]EmbeddingCacheStats{}
+	r.entries.Range(func(k, v any) bool {
+		name, _ := k.(string)
+		e, _ := v.(*registryEntry)
+		if e == nil {
+			return true
+		}
+		if ec, ok := e.classifier.(*EmbeddingCacheClassifier); ok {
+			out[name] = ec.Stats()
+		}
+		return true
+	})
+	return out
+}
diff --git a/core/services/routing/router/rerank.go b/core/services/routing/router/rerank.go
new file mode 100644
index 000000000..d422a58db
--- /dev/null
+++ b/core/services/routing/router/rerank.go
@@ -0,0 +1,104 @@
+package router
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/mudler/LocalAI/core/backend"
+)
+
+// RerankClassifier scores each policy description against the prompt
+// via a reranker model and activates the labels whose relevance clears
+// an absolute threshold. Robust when policy labels are abstract
+// relative to user prompts — the description is the natural English
+// the reranker was trained on.
+type RerankClassifier struct {
+	reranker            backend.Reranker
+	activationThreshold float64
+	// labels[i] is the policy label corresponding to documents[i] —
+	// both are scattered indices into the reranker's input order.
+	// Materialised once at construction so Classify never allocates
+	// them per call.
+	labels    []string
+	documents []string
+	cache     *labelSetCache
+}
+
+// defaultRerankActivationThreshold is the relevance floor a label
+// must clear to be considered active. Reranker scores live in [0, 1]
+// for cross-encoder / ColBERT heads; 0.5 picks "more positive than
+// not on this label."
+const defaultRerankActivationThreshold = 0.5
+
+func NewRerankClassifier(policies []ScorePolicy, reranker backend.Reranker, cacheCap int, activationThreshold float64) *RerankClassifier {
+	if len(policies) == 0 {
+		panic("router/rerank: at least one policy is required")
+	}
+	if reranker == nil {
+		panic("router/rerank: reranker is required (configure router.classifier_model)")
+	}
+	for _, p := range policies {
+		if p.Label == "" {
+			panic("router/rerank: policy has empty label")
+		}
+		if p.Description == "" {
+			panic(fmt.Sprintf("router/rerank: policy %q has no description", p.Label))
+		}
+	}
+	if activationThreshold <= 0 {
+		activationThreshold = defaultRerankActivationThreshold
+	}
+	labels := make([]string, len(policies))
+	docs := make([]string, len(policies))
+	for i, p := range policies {
+		labels[i] = p.Label
+		docs[i] = p.Description
+	}
+	return &RerankClassifier{
+		reranker:            reranker,
+		activationThreshold: activationThreshold,
+		labels:              labels,
+		documents:           docs,
+		cache:               newLabelSetCache(cacheCap),
+	}
+}
+
+func (c *RerankClassifier) Name() string { return ClassifierColbert }
+
+func (c *RerankClassifier) Classify(ctx context.Context, p Probe) (Decision, error) {
+	start := time.Now()
+	key := cacheKey(p.Prompt)
+	if hit, ok := c.cache.get(key); ok {
+		return Decision{Labels: hit, Score: 1.0, Latency: time.Since(start)}, nil
+	}
+
+	results, err := c.reranker.Rerank(ctx, p.Prompt, c.documents)
+	if err != nil {
+		return errDecision(start, fmt.Errorf("rerank classify: %w", err))
+	}
+
+	// The reranker may return fewer-than-N entries (top_n filtering)
+	// or reorder them by score. Scatter back into input order so
+	// threshold + argmax don't depend on result ordering.
+	scores := make([]float64, len(c.labels))
+	for _, r := range results {
+		if r.Index < 0 || r.Index >= len(scores) {
+			continue
+		}
+		scores[r.Index] = float64(r.RelevanceScore)
+	}
+
+	active, bestIdx := selectActive(scores, c.labels, c.activationThreshold)
+	c.cache.put(key, active)
+	labelScores := NewLabelScores(c.labels, scores)
+	return Decision{
+		Labels:              active,
+		Score:               scores[bestIdx],
+		Latency:             time.Since(start),
+		LabelScores:         labelScores,
+		ActivationThreshold: c.activationThreshold,
+	}, nil
+}
+
+func (c *RerankClassifier) CacheLen() int { return c.cache.len() }
diff --git a/core/services/routing/router/rerank_test.go b/core/services/routing/router/rerank_test.go
new file mode 100644
index 000000000..5b88d0bf0
--- /dev/null
+++ b/core/services/routing/router/rerank_test.go
@@ -0,0 +1,121 @@
+package router
+
+import (
+	"context"
+	"errors"
+
+	"github.com/mudler/LocalAI/core/backend"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+type stubReranker struct {
+	results []backend.RerankResult
+	err     error
+	calls   int
+	lastQ   string
+	lastDs  []string
+}
+
+func (r *stubReranker) Rerank(_ context.Context, query string, documents []string) ([]backend.RerankResult, error) {
+	r.calls++
+	r.lastQ = query
+	r.lastDs = append(r.lastDs[:0], documents...)
+	if r.err != nil {
+		return nil, r.err
+	}
+	return r.results, nil
+}
+
+var _ = Describe("RerankClassifier", func() {
+	It("activates the single label whose description is most relevant", func() {
+		// code-generation dominates; the other two fall below the
+		// default 0.5 activation threshold.
+		r := &stubReranker{results: []backend.RerankResult{
+			{Index: 0, RelevanceScore: 0.92},
+			{Index: 1, RelevanceScore: 0.10},
+			{Index: 2, RelevanceScore: 0.05},
+		}}
+		c := NewRerankClassifier(testPolicies(), r, 0, 0)
+		d, err := c.Classify(context.Background(), Probe{Prompt: "debug my null pointer"})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(equalLabels(d.Labels, []string{"code-generation"})).To(BeTrue(), "got %v", d.Labels)
+		Expect(d.Score).To(BeNumerically(">=", 0.9))
+	})
+
+	It("activates multiple labels when several descriptions clear threshold", func() {
+		r := &stubReranker{results: []backend.RerankResult{
+			{Index: 0, RelevanceScore: 0.85},
+			{Index: 1, RelevanceScore: 0.10},
+			{Index: 2, RelevanceScore: 0.75},
+		}}
+		c := NewRerankClassifier(testPolicies(), r, 0, 0)
+		d, err := c.Classify(context.Background(), Probe{Prompt: "write code that solves this equation"})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(sortedLabels(d)).To(Equal([]string{"code-generation", "math-reasoning"}))
+	})
+
+	It("falls back to argmax when no description clears threshold", func() {
+		// All scores below 0.5 — defensively fall back to the top
+		// label so the router always has something to route on.
+		r := &stubReranker{results: []backend.RerankResult{
+			{Index: 0, RelevanceScore: 0.30},
+			{Index: 1, RelevanceScore: 0.10},
+			{Index: 2, RelevanceScore: 0.20},
+		}}
+		c := NewRerankClassifier(testPolicies(), r, 0, 0)
+		d, err := c.Classify(context.Background(), Probe{Prompt: "ambiguous"})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(equalLabels(d.Labels, []string{"code-generation"})).To(BeTrue(), "got %v", d.Labels)
+	})
+
+	It("returns the reranker error verbatim", func() {
+		r := &stubReranker{err: errors.New("backend down")}
+		c := NewRerankClassifier(testPolicies(), r, 0, 0)
+		_, err := c.Classify(context.Background(), Probe{Prompt: "anything"})
+		Expect(err).To(MatchError(ContainSubstring("backend down")))
+	})
+
+	It("respects the configured activation threshold", func() {
+		r := &stubReranker{results: []backend.RerankResult{
+			{Index: 0, RelevanceScore: 0.40},
+			{Index: 1, RelevanceScore: 0.10},
+			{Index: 2, RelevanceScore: 0.45},
+		}}
+		// Threshold lowered to 0.35 — both 0.40 and 0.45 should activate.
+		c := NewRerankClassifier(testPolicies(), r, 0, 0.35)
+		d, err := c.Classify(context.Background(), Probe{Prompt: "borderline"})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(sortedLabels(d)).To(Equal([]string{"code-generation", "math-reasoning"}))
+	})
+
+	It("caches by case-folded prompt", func() {
+		r := &stubReranker{results: []backend.RerankResult{
+			{Index: 0, RelevanceScore: 0.92},
+			{Index: 1, RelevanceScore: 0.10},
+			{Index: 2, RelevanceScore: 0.05},
+		}}
+		c := NewRerankClassifier(testPolicies(), r, 4, 0)
+		_, _ = c.Classify(context.Background(), Probe{Prompt: "Debug my null pointer"})
+		_, _ = c.Classify(context.Background(), Probe{Prompt: " debug MY null POINTER "})
+		Expect(r.calls).To(Equal(1), "case+whitespace variants should hit the cache")
+		Expect(c.CacheLen()).To(Equal(1))
+	})
+
+	It("scores against the policy descriptions, not the labels", func() {
+		// The reranker library should be reranking *descriptions*
+		// (natural English the model was trained on), not abstract
+		// label slugs that wouldn't match any pretraining distribution.
+		r := &stubReranker{results: []backend.RerankResult{
+			{Index: 0, RelevanceScore: 0.9},
+		}}
+		c := NewRerankClassifier(testPolicies(), r, 0, 0)
+		_, err := c.Classify(context.Background(), Probe{Prompt: "p"})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(r.lastDs).To(Equal([]string{
+			"writing, debugging, or explaining code",
+			"small talk and general conversation",
+			"arithmetic, equations, word problems",
+		}))
+	})
+})
diff --git a/core/services/routing/router/resolve.go b/core/services/routing/router/resolve.go
new file mode 100644
index 000000000..a474d6d48
--- /dev/null
+++ b/core/services/routing/router/resolve.go
@@ -0,0 +1,203 @@
+package router
+
+import (
+	"context"
+	"fmt"
+	"slices"
+	"strings"
+	"time"
+
+	"github.com/mudler/LocalAI/core/config"
+)
+
+// CandidateLoader resolves a candidate's model name to its parsed
+// ModelConfig. The router calls it after MatchCandidate to load the
+// resolved target so the caller (HTTP middleware or realtime handler)
+// can dispatch against it.
+//
+// Defined as a function value rather than tied to *config.ModelConfigLoader
+// so callers in tests can stub it without spinning up a real loader.
+type CandidateLoader func(name string) (*config.ModelConfig, error)
+
+// ResolveResult is the output of Resolve. It captures everything a
+// caller needs to (a) dispatch the request against the chosen candidate,
+// (b) record an audit row, and (c) decide whether to fall back to the
+// classifier-error path.
+type ResolveResult struct {
+	// RouterModel is the router config's own name (the model the client asked for).
+	RouterModel string
+
+	// ChosenModel is the candidate the classifier picked, or
+	// cfg.Router.Fallback when the classifier errored / no candidate
+	// covered the active labels.
+	ChosenModel string
+
+	// ChosenConfig is the loaded ModelConfig for ChosenModel. The
+	// caller dispatches against this — it has the right backend,
+	// pipeline, etc.
+	ChosenConfig *config.ModelConfig
+
+	// Decision carries the classifier's labels/score/latency/cache
+	// info. When UsedFallback is true, Decision.Labels is
+	// []string{LabelFallback}.
+	Decision Decision
+
+	// Labels are the labels recorded against this decision — either
+	// Decision.Labels or []string{LabelFallback} when the classifier
+	// failed. Pulled out so callers don't have to special-case the
+	// fallback path.
+	Labels []string
+
+	// ClassifierName is the Name() of the classifier that produced the
+	// decision, or LabelFallback when classifier setup itself failed
+	// and the fallback path ran without a working classifier.
+	ClassifierName string
+
+	// UsedFallback is true when the result came from cfg.Router.Fallback
+	// rather than a classifier-picked candidate (classifier
+	// build/Classify error or no candidate covered the active labels).
+	UsedFallback bool
+}
+
+// Resolve runs the full classify → match → load pipeline for a router
+// model config. It is transport-agnostic: callers pass a built
+// classifier, a candidate loader, and a probe; Resolve returns a
+// ResolveResult or an error if the resolved config violates invariants
+// or the fallback can't be loaded.
+//
+// Errors returned here are *terminal* — the caller should surface them
+// to the client. Classifier-error fallbacks are non-terminal and folded
+// into ResolveResult.UsedFallback.
+//
+// classifier may be nil; that signals "classifier build failed" and
+// pushes resolution straight to the fallback path (mirrors the
+// classifier-build-error branch in the historical RouteModel middleware).
+func Resolve(ctx context.Context, routerCfg *config.ModelConfig, classifier Classifier, loader CandidateLoader, probe Probe) (*ResolveResult, error) {
+	if routerCfg == nil || !routerCfg.HasRouter() {
+		return nil, fmt.Errorf("router.Resolve: config has no router block")
+	}
+
+	if classifier == nil {
+		return resolveFallback(routerCfg, loader, Decision{}, LabelFallback, "classifier unavailable")
+	}
+
+	start := time.Now()
+	decision, err := classifier.Classify(ctx, probe)
+	if err != nil {
+		return resolveFallback(routerCfg, loader, Decision{Latency: time.Since(start)}, classifier.Name(), "classifier error: "+err.Error())
+	}
+
+	candidate := MatchCandidate(routerCfg.Router.Candidates, decision.Labels)
+	if candidate == "" {
+		return resolveFallback(routerCfg, loader, decision, classifier.Name(), "no candidate covers labels: "+strings.Join(decision.Labels, ","))
+	}
+
+	candidateCfg, err := loader(candidate)
+	if err != nil || candidateCfg == nil {
+		return nil, fmt.Errorf("router candidate %q not loadable: %w", candidate, err)
+	}
+	if candidateCfg.HasRouter() {
+		return nil, fmt.Errorf("router candidate %q is itself a router (depth-1 invariant)", candidate)
+	}
+
+	return &ResolveResult{
+		RouterModel:    routerCfg.Name,
+		ChosenModel:    candidate,
+		ChosenConfig:   candidateCfg,
+		Decision:       decision,
+		Labels:         decision.Labels,
+		ClassifierName: classifier.Name(),
+		UsedFallback:   false,
+	}, nil
+}
+
+// resolveFallback handles the three failure modes that fall through to
+// cfg.Router.Fallback: classifier build failed, Classify returned an
+// error, or no candidate covered the active labels. Returns an error
+// when no fallback is configured — those translate to 503/500 at the
+// HTTP layer.
+//
+// reason is included in the wrapped error for debugging; it's not
+// surfaced to the client.
+func resolveFallback(routerCfg *config.ModelConfig, loader CandidateLoader, decision Decision, classifierName, reason string) (*ResolveResult, error) {
+	if routerCfg.Router.Fallback == "" {
+		return nil, fmt.Errorf("router: %s and no fallback configured", reason)
+	}
+	candidateCfg, err := loader(routerCfg.Router.Fallback)
+	if err != nil || candidateCfg == nil {
+		return nil, fmt.Errorf("router fallback %q not loadable: %w", routerCfg.Router.Fallback, err)
+	}
+	if candidateCfg.HasRouter() {
+		return nil, fmt.Errorf("router fallback %q is itself a router (depth-1 invariant)", routerCfg.Router.Fallback)
+	}
+	decision.Labels = []string{LabelFallback}
+	return &ResolveResult{
+		RouterModel:    routerCfg.Name,
+		ChosenModel:    routerCfg.Router.Fallback,
+		ChosenConfig:   candidateCfg,
+		Decision:       decision,
+		Labels:         []string{LabelFallback},
+		ClassifierName: classifierName,
+		UsedFallback:   true,
+	}, nil
+}
+
+// ToDecisionRecord projects a ResolveResult into the persisted
+// DecisionRecord shape. Centralised so the chat-side recordHTTPDecision
+// and the realtime-side recorder can't drift in which Decision fields
+// they copy through — a new field added to Decision only needs to be
+// remembered here, not at every call site.
+//
+// id, correlationID, userID, and source are caller-supplied because
+// they originate outside the routing pipeline (request ID generator,
+// auth, entry-point dispatch).
+func (r *ResolveResult) ToDecisionRecord(id, correlationID, userID, source string) DecisionRecord {
+	return DecisionRecord{
+		ID:                  id,
+		CorrelationID:       correlationID,
+		UserID:              userID,
+		RouterModel:         r.RouterModel,
+		RequestedModel:      r.RouterModel,
+		ServedModel:         r.ChosenModel,
+		Classifier:          r.ClassifierName,
+		Label:               strings.Join(r.Labels, ","),
+		Score:               r.Decision.Score,
+		LatencyMs:           r.Decision.Latency.Milliseconds(),
+		Cached:              r.Decision.Cached,
+		CacheSimilarity:     r.Decision.CacheSimilarity,
+		LabelScores:         r.Decision.LabelScores,
+		ActivationThreshold: r.Decision.ActivationThreshold,
+		Source:              source,
+		CreatedAt:           time.Now().UTC(),
+	}
+}
+
+// MatchCandidate picks the FIRST candidate whose Labels are a
+// superset of the active label set. Admins order the candidates list
+// smallest → largest, so a request that needs one label routes to
+// the smallest capable model and one that needs multiple falls to
+// the first bigger candidate that covers them all. Returns empty
+// string when no candidate matches; the caller falls back.
+func MatchCandidate(candidates []config.RouterCandidate, active []string) string {
+	if len(active) == 0 {
+		return ""
+	}
+	for _, c := range candidates {
+		if labelSetCovers(c.Labels, active) {
+			return c.Model
+		}
+	}
+	return ""
+}
+
+// labelSetCovers returns true when every element of needed appears
+// in have. Label sets are typically <10 entries so the linear scan
+// is fine.
+func labelSetCovers(have, needed []string) bool {
+	for _, n := range needed {
+		if !slices.Contains(have, n) {
+			return false
+		}
+	}
+	return true
+}
diff --git a/core/services/routing/router/resolve_test.go b/core/services/routing/router/resolve_test.go
new file mode 100644
index 000000000..973114112
--- /dev/null
+++ b/core/services/routing/router/resolve_test.go
@@ -0,0 +1,130 @@
+package router_test
+
+import (
+	"context"
+	"errors"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/routing/router"
+)
+
+type fakeClassifier struct {
+	name     string
+	decision router.Decision
+	err      error
+}
+
+func (f *fakeClassifier) Classify(_ context.Context, _ router.Probe) (router.Decision, error) {
+	if f.err != nil {
+		return router.Decision{}, f.err
+	}
+	return f.decision, nil
+}
+
+func (f *fakeClassifier) Name() string {
+	if f.name == "" {
+		return "fake"
+	}
+	return f.name
+}
+
+// loaderFrom returns a CandidateLoader serving cfgs by name. Missing
+// entries return ("not found"). Keeps test setup compact — each spec
+// declares the model name → config map it cares about.
+func loaderFrom(cfgs map[string]*config.ModelConfig) router.CandidateLoader {
+	return func(name string) (*config.ModelConfig, error) {
+		c, ok := cfgs[name]
+		if !ok {
+			return nil, errors.New("not found: " + name)
+		}
+		return c, nil
+	}
+}
+
+var _ = Describe("router.Resolve", func() {
+	var (
+		routerCfg *config.ModelConfig
+		fast      *config.ModelConfig
+		smart     *config.ModelConfig
+		fallback  *config.ModelConfig
+		loader    router.CandidateLoader
+	)
+
+	BeforeEach(func() {
+		fast = &config.ModelConfig{Name: "fast-local", Backend: "llama-cpp"}
+		smart = &config.ModelConfig{Name: "smart-cloud", Backend: "cloud-proxy"}
+		fallback = &config.ModelConfig{Name: "fallback-local", Backend: "llama-cpp"}
+		routerCfg = &config.ModelConfig{
+			Name: "router-llm",
+			Router: config.RouterConfig{
+				Classifier: router.ClassifierScore,
+				Candidates: []config.RouterCandidate{
+					{Model: "fast-local", Labels: []string{"chat"}},
+					{Model: "smart-cloud", Labels: []string{"reasoning"}},
+				},
+				Fallback: "fallback-local",
+			},
+		}
+		loader = loaderFrom(map[string]*config.ModelConfig{
+			"fast-local":     fast,
+			"smart-cloud":    smart,
+			"fallback-local": fallback,
+		})
+	})
+
+	It("picks the candidate that covers the classifier's labels", func() {
+		cls := &fakeClassifier{decision: router.Decision{Labels: []string{"reasoning"}, Score: 0.92, Latency: 5 * time.Millisecond}}
+		got, err := router.Resolve(context.Background(), routerCfg, cls, loader, router.Probe{Prompt: "tricky"})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(got.ChosenModel).To(Equal("smart-cloud"))
+		Expect(got.ChosenConfig).To(Equal(smart))
+		Expect(got.UsedFallback).To(BeFalse())
+		Expect(got.Labels).To(Equal([]string{"reasoning"}))
+	})
+
+	It("falls back when the classifier errors", func() {
+		cls := &fakeClassifier{err: errors.New("boom")}
+		got, err := router.Resolve(context.Background(), routerCfg, cls, loader, router.Probe{Prompt: "anything"})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(got.UsedFallback).To(BeTrue())
+		Expect(got.ChosenModel).To(Equal("fallback-local"))
+		Expect(got.Labels).To(Equal([]string{router.LabelFallback}))
+	})
+
+	It("falls back when no candidate covers the active labels", func() {
+		cls := &fakeClassifier{decision: router.Decision{Labels: []string{"unknown-label"}}}
+		got, err := router.Resolve(context.Background(), routerCfg, cls, loader, router.Probe{Prompt: "x"})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(got.UsedFallback).To(BeTrue())
+		Expect(got.ChosenModel).To(Equal("fallback-local"))
+	})
+
+	It("falls back when classifier is nil (build failed upstream)", func() {
+		got, err := router.Resolve(context.Background(), routerCfg, nil, loader, router.Probe{Prompt: "x"})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(got.UsedFallback).To(BeTrue())
+		Expect(got.ChosenModel).To(Equal("fallback-local"))
+	})
+
+	It("returns a terminal error when classifier fails AND no fallback is configured", func() {
+		routerCfg.Router.Fallback = ""
+		_, err := router.Resolve(context.Background(), routerCfg, nil, loader, router.Probe{Prompt: "x"})
+		Expect(err).To(HaveOccurred())
+	})
+
+	It("rejects candidates that are themselves routers (depth-1 invariant)", func() {
+		// Swap the fast-local config for one that itself has a router
+		// block — the depth-1 guard must reject it.
+		fast.Router = config.RouterConfig{
+			Candidates: []config.RouterCandidate{{Model: "deeper", Labels: []string{"x"}}},
+		}
+		cls := &fakeClassifier{decision: router.Decision{Labels: []string{"chat"}}}
+		_, err := router.Resolve(context.Background(), routerCfg, cls, loader, router.Probe{Prompt: "x"})
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("depth-1 invariant"))
+	})
+})
diff --git a/core/services/routing/router/router_suite_test.go b/core/services/routing/router/router_suite_test.go
new file mode 100644
index 000000000..a51a2ee50
--- /dev/null
+++ b/core/services/routing/router/router_suite_test.go
@@ -0,0 +1,13 @@
+package router_test
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestRouter(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "router test suite")
+}
diff --git a/core/services/routing/router/score.go b/core/services/routing/router/score.go
new file mode 100644
index 000000000..19da0d2c6
--- /dev/null
+++ b/core/services/routing/router/score.go
@@ -0,0 +1,423 @@
+package router
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"math"
+	"strings"
+	"text/template"
+	"time"
+
+	"github.com/Masterminds/sprig/v3"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/xlog"
+)
+
+// ScorePolicy mirrors config.RouterPolicy at the classifier boundary —
+// a label string plus its natural-language description for the
+// routing system prompt.
+type ScorePolicy struct {
+	Label       string
+	Description string
+}
+
+// defaultActivationThreshold is the softmax-probability floor a policy
+// must clear to be considered "active." Picked low enough that two
+// reasonably-confident labels (each ~0.4) both activate, high enough
+// that a flat distribution doesn't activate everything.
+const defaultActivationThreshold = 0.15
+
+// defaultStopToken is the assistant-turn-end marker used when the
+// classifier model's stop token isn't configured. ChatML/Qwen
+// (Arch-Router's base) uses <|im_end|>; non-ChatML routing models
+// should set Router.classifier_model.stopwords.
+const defaultStopToken = "<|im_end|>"
+
+// Score normalisation modes. The default mode (raw) feeds joint
+// log-probs directly into the softmax — that's the distribution the
+// classifier model was trained against. "mean" divides by candidate
+// token count, which is fairer to long labels but off-distribution
+// for models trained to emit fixed-format outputs.
+const (
+	ScoreNormalizationRaw  = "raw"
+	ScoreNormalizationMean = "mean"
+)
+
+// PromptRenderer renders the chat envelope around the routing
+// system prompt + user probe. The returned string ends right at the
+// assistant-open marker so the scorer's first predicted token is the
+// start of the candidate output. Wired from the classifier model's
+// chat template by middleware — kept as a callback so the router
+// package doesn't depend on core/templates.
+type PromptRenderer func(system, user string) (string, error)
+
+// ScoreClassifierOptions groups the optional knobs that have grown
+// past the comfortable positional-arg threshold. Zero-value gives
+// production defaults (raw normalisation, ChatML stop token, built-in
+// ChatML renderer, package activation threshold, 1024-entry cache).
+type ScoreClassifierOptions struct {
+	// PromptRenderer wraps the routing system + user prompt in the
+	// classifier model's chat template. Nil falls back to a built-in
+	// ChatML renderer — fine for Arch-Router/Qwen and for tests, but
+	// the production wiring should pass through the templates
+	// evaluator so non-Qwen routing models render correctly.
+	PromptRenderer PromptRenderer
+
+	// StopToken is appended to each candidate before scoring so the
+	// model's assistant-turn-end log-prob is included in the joint.
+	// That token's probability is the model's explicit "I'm done"
+	// signal — including it folds completion-confidence into the
+	// candidate score. Empty falls back to ChatML's <|im_end|>.
+	StopToken string
+
+	// Normalization picks how candidate joint log-probs feed into
+	// softmax. See package consts ScoreNormalizationRaw (default) and
+	// ScoreNormalizationMean.
+	Normalization string
+
+	// CacheCap bounds the per-prompt memo cache. 0 disables.
+	CacheCap int
+
+	// ActivationThreshold is the softmax-probability floor a policy
+	// must clear to activate. 0 picks defaultActivationThreshold.
+	ActivationThreshold float64
+
+	// SystemPromptTemplate overrides the routing system prompt at
+	// construction time. Go text/template + Sprig, executed with
+	// `.Policies []ScorePolicy`. Empty falls back to the built-in
+	// Arch-Router-shaped template (buildScoreSystemPrompt). The
+	// candidate format `{"route": "<label>"}` is NOT templated — an
+	// override that instructs the model to emit a different schema
+	// would silently desync from what the scorer actually scores.
+	SystemPromptTemplate string
+}
+
+// ScoreClassifier scores every policy label as the model's actual
+// trained output ({"route": "<label>"} + turn-end) under the routing
+// prompt, converts log-probabilities into a softmax distribution,
+// and returns the set of labels whose probability passes the
+// activation threshold.
+//
+// This is the Arch-Router approach extended for multi-label. The
+// classifier model is trained to emit a single policy as a JSON
+// route name, but its output distribution still spreads probability
+// mass across competing labels when more than one applies. Reading
+// the distribution rather than the argmax lets us route conjunctive
+// intents ("debug this code AND explain the math") to a candidate
+// that can serve both.
+type ScoreClassifier struct {
+	scorer              backend.Scorer
+	activationThreshold float64
+	normalization       string
+	renderer            PromptRenderer
+
+	// systemPrompt is built once at construction. The same prompt is
+	// reused on every classification — only the user-turn body changes.
+	systemPrompt string
+
+	// labelOrder mirrors the configured policy ordering — the scorer
+	// receives candidates in this order and the softmax distribution
+	// indexes back into it.
+	labelOrder []string
+
+	// candidates are the pre-built scoring strings — the JSON output
+	// Arch-Router was trained to emit, suffixed with the model's
+	// turn-end token so its probability folds into the joint
+	// log-prob. Built once at construction; same list every call.
+	candidates []string
+
+	cache *labelSetCache
+}
+
+// NewScoreClassifier panics on caller errors at construction (empty
+// policies, missing description, nil scorer) — same rationale as the
+// other classifiers. See ScoreClassifierOptions for the optional
+// knobs and their zero-value defaults.
+func NewScoreClassifier(policies []ScorePolicy, scorer backend.Scorer, opts ScoreClassifierOptions) *ScoreClassifier {
+	if len(policies) == 0 {
+		panic("router/score: at least one policy is required")
+	}
+	if scorer == nil {
+		panic("router/score: scorer is required (configure router.classifier_model)")
+	}
+	for _, p := range policies {
+		if p.Label == "" {
+			panic("router/score: policy has empty label")
+		}
+		if p.Description == "" {
+			panic(fmt.Sprintf("router/score: policy %q has no description", p.Label))
+		}
+	}
+	labels := make([]string, 0, len(policies))
+	for _, p := range policies {
+		labels = append(labels, p.Label)
+	}
+	if opts.ActivationThreshold <= 0 {
+		opts.ActivationThreshold = defaultActivationThreshold
+	}
+	if opts.StopToken == "" {
+		opts.StopToken = defaultStopToken
+	}
+	if opts.PromptRenderer == nil {
+		opts.PromptRenderer = chatMLRenderer
+	}
+	switch opts.Normalization {
+	case "", ScoreNormalizationRaw:
+		opts.Normalization = ScoreNormalizationRaw
+	case ScoreNormalizationMean:
+		// ok
+	default:
+		panic(fmt.Sprintf("router/score: unknown score_normalization %q (want %q or %q)",
+			opts.Normalization, ScoreNormalizationRaw, ScoreNormalizationMean))
+	}
+	candidates := make([]string, len(labels))
+	for i, l := range labels {
+		candidates[i] = buildCandidate(l, opts.StopToken)
+	}
+	systemPrompt, err := renderSystemPrompt(opts.SystemPromptTemplate, policies)
+	if err != nil {
+		// Parse-time error here means the operator-supplied template
+		// is malformed. Config-load validation (ModelConfig.Validate)
+		// should have caught it earlier; reaching this is either a
+		// direct programmatic construction or a validation gap.
+		panic(fmt.Sprintf("router/score: system_prompt_template: %v", err))
+	}
+	return &ScoreClassifier{
+		scorer:              scorer,
+		activationThreshold: opts.ActivationThreshold,
+		normalization:       opts.Normalization,
+		renderer:            opts.PromptRenderer,
+		systemPrompt:        systemPrompt,
+		labelOrder:          labels,
+		candidates:          candidates,
+		cache:               newLabelSetCache(opts.CacheCap),
+	}
+}
+
+// renderSystemPrompt returns the routing system prompt for the given
+// policies. When tmpl is empty it falls back to the built-in
+// Arch-Router-shaped buildScoreSystemPrompt. Otherwise it parses tmpl
+// as Go text/template + Sprig and executes against {Policies}.
+func renderSystemPrompt(tmpl string, policies []ScorePolicy) (string, error) {
+	if tmpl == "" {
+		return buildScoreSystemPrompt(policies), nil
+	}
+	t, err := template.New("router_system_prompt").Funcs(sprig.FuncMap()).Parse(tmpl)
+	if err != nil {
+		return "", fmt.Errorf("parse: %w", err)
+	}
+	var buf bytes.Buffer
+	if err := t.Execute(&buf, struct{ Policies []ScorePolicy }{Policies: policies}); err != nil {
+		return "", fmt.Errorf("execute: %w", err)
+	}
+	return buf.String(), nil
+}
+
+func (c *ScoreClassifier) Name() string { return ClassifierScore }
+
+func (c *ScoreClassifier) Classify(ctx context.Context, p Probe) (Decision, error) {
+	start := time.Now()
+	key := cacheKey(p.Prompt)
+	if hit, ok := c.cache.get(key); ok {
+		return Decision{Labels: hit, Score: 1.0, Latency: time.Since(start)}, nil
+	}
+	prompt, err := c.renderer(c.systemPrompt, p.Prompt)
+	if err != nil {
+		return errDecision(start, fmt.Errorf("score classify: render prompt: %w", err))
+	}
+	results, err := c.scorer.Score(ctx, prompt, c.candidates)
+	if err != nil {
+		xlog.Warn("router: score classifier failed", "error", err, "labels", c.labelOrder)
+		return errDecision(start, fmt.Errorf("score classify: %w", err))
+	}
+	if len(results) != len(c.labelOrder) {
+		return errDecision(start, fmt.Errorf("score classify: scorer returned %d results for %d policies", len(results), len(c.labelOrder)))
+	}
+
+	// Convert per-candidate joint log-probs to the softmax inputs the
+	// activation threshold reads. Default mode (raw) is on-distribution
+	// for Arch-Router: longer candidates score lower for legitimate
+	// reasons (the model assigns less probability to outputs that span
+	// more tokens). Mean normalisation is available for operators with
+	// highly uneven label token counts who'd rather spend the off-
+	// distribution cost than the length bias.
+	logProbs := make([]float64, len(results))
+	nonZero := 0
+	for i, r := range results {
+		if r.NumTokens == 0 {
+			logProbs[i] = math.Inf(-1)
+			continue
+		}
+		nonZero++
+		switch c.normalization {
+		case ScoreNormalizationMean:
+			logProbs[i] = r.LogProb / float64(r.NumTokens)
+		default:
+			logProbs[i] = r.LogProb
+		}
+	}
+	// All-zero NumTokens means the backend never tokenised any
+	// candidate — almost certainly a Scorer regression (forgot to
+	// populate NumTokens) rather than a real distribution. Without
+	// this check softmax degenerates to uniform 1/N, every label
+	// clears the activation threshold, and the router silently treats
+	// the bug as "multi-label intent". Fail loud instead so the
+	// operator sees the cause.
+	if nonZero == 0 {
+		return errDecision(start, fmt.Errorf("score classify: backend returned zero tokens for every candidate (scorer regression?)"))
+	}
+	probs := softmax(logProbs)
+
+	active, bestIdx := selectActive(probs, c.labelOrder, c.activationThreshold)
+	c.cache.put(key, active)
+	latency := time.Since(start)
+	labelScores := NewLabelScores(c.labelOrder, probs)
+	xlog.Info("router: score classified",
+		"labels", active,
+		"top_label", c.labelOrder[bestIdx],
+		"top_prob", probs[bestIdx],
+		"latency_ms", latency.Milliseconds())
+	return Decision{
+		Labels:              active,
+		Score:               probs[bestIdx],
+		Latency:             latency,
+		LabelScores:         labelScores,
+		ActivationThreshold: c.activationThreshold,
+	}, nil
+}
+
+// softmax converts an array of log-probabilities into a probability
+// distribution. -inf inputs are handled (their exp contributes 0).
+// Uses the standard max-subtraction trick for numerical stability.
+func softmax(logProbs []float64) []float64 {
+	if len(logProbs) == 0 {
+		return nil
+	}
+	maxLP := math.Inf(-1)
+	for _, lp := range logProbs {
+		if lp > maxLP {
+			maxLP = lp
+		}
+	}
+	if math.IsInf(maxLP, -1) {
+		// All -inf: return a uniform distribution as a sensible
+		// degenerate result.
+		out := make([]float64, len(logProbs))
+		for i := range out {
+			out[i] = 1.0 / float64(len(logProbs))
+		}
+		return out
+	}
+	out := make([]float64, len(logProbs))
+	sum := 0.0
+	for i, lp := range logProbs {
+		out[i] = math.Exp(lp - maxLP)
+		sum += out[i]
+	}
+	if sum == 0 {
+		// Shouldn't happen given the maxLP check above, but guard
+		// against pathological inputs.
+		for i := range out {
+			out[i] = 1.0 / float64(len(out))
+		}
+		return out
+	}
+	for i := range out {
+		out[i] /= sum
+	}
+	return out
+}
+
+func (c *ScoreClassifier) CacheLen() int { return c.cache.len() }
+
+// buildScoreSystemPrompt renders the Arch-Router-style routing
+// instructions: routes listed in a structured block, output schema
+// declared as JSON {"route": "<name>"}. Candidates are scored as
+// realisations of that schema (see buildCandidate), so the system
+// prompt and candidate format have to agree.
+func buildScoreSystemPrompt(policies []ScorePolicy) string {
+	var b strings.Builder
+	b.WriteString("You are a helpful assistant that selects the best route for the user's request.\n")
+	b.WriteString("Routes are listed inside <routes></routes> tags with a name and a short description.\n\n")
+	b.WriteString("<routes>\n")
+	for _, p := range policies {
+		// Both label and description need escaping — YAML accepts
+		// any characters in either, and an unescaped quote/backslash
+		// breaks the JSON-lines structure the classifier model was
+		// trained to parse.
+		b.WriteString(`{"name": "`)
+		b.WriteString(escapeJSONString(p.Label))
+		b.WriteString(`", "description": "`)
+		b.WriteString(escapeJSONString(p.Description))
+		b.WriteString("\"}\n")
+	}
+	b.WriteString("</routes>\n\n")
+	b.WriteString("Respond with exactly one JSON object of the form ")
+	b.WriteString(`{"route": "<name>"}`)
+	b.WriteString(", where <name> is one of the route names above.")
+	return b.String()
+}
+
+// buildCandidate forms the scoring string for one policy label. The
+// JSON envelope is what Arch-Router was trained to emit; suffixing
+// the model's turn-end token folds completion-confidence into the
+// joint log-prob (the model puts more probability on the turn-end
+// token when the preceding output feels like a complete answer).
+//
+// Tokenisation: the gRPC scorer re-tokenises prompt+candidate as one
+// string with parse_special=true, so a literal "<|im_end|>" collapses
+// to the single special token ID rather than being tokenised as raw
+// characters (see backend/cpp/llama-cpp/grpc-server.cpp Score).
+func buildCandidate(label, stopToken string) string {
+	// Escape so the candidate stays valid JSON when a label
+	// (uncommonly) contains a quote or backslash. Keeps the scoring
+	// string consistent with the listing in buildScoreSystemPrompt —
+	// both describe the same route token-for-token.
+	return `{"route": "` + escapeJSONString(label) + `"}` + stopToken
+}
+
+// chatMLRenderer is the fallback PromptRenderer used when the caller
+// doesn't wire a template-evaluator-backed renderer. ChatML is what
+// Arch-Router (Qwen-2.5-1.5B-Instruct) was trained against, so this
+// default is safe for the canonical setup. Production wiring should
+// still pass a real renderer so a non-ChatML routing model works
+// without operators having to know about this fallback.
+func chatMLRenderer(system, user string) (string, error) {
+	var b strings.Builder
+	b.WriteString("<|im_start|>system\n")
+	b.WriteString(system)
+	b.WriteString("<|im_end|>\n<|im_start|>user\n")
+	b.WriteString(user)
+	b.WriteString("<|im_end|>\n<|im_start|>assistant\n")
+	return b.String(), nil
+}
+
+// escapeJSONString does the minimal escaping needed to embed a policy
+// description inside a JSON string literal in the route-listing
+// system prompt. The list is rendered as ad-hoc JSON-lines (one
+// {"name":..,"description":..} per route) rather than via
+// encoding/json because the surrounding system prompt is plain text;
+// using a real marshaller would force us to encode the whole block
+// or wrap each route in a separate call.
+func escapeJSONString(s string) string {
+	var b strings.Builder
+	b.Grow(len(s))
+	for _, r := range s {
+		switch r {
+		case '\\':
+			b.WriteString(`\\`)
+		case '"':
+			b.WriteString(`\"`)
+		case '\n':
+			b.WriteString(`\n`)
+		case '\r':
+			b.WriteString(`\r`)
+		case '\t':
+			b.WriteString(`\t`)
+		default:
+			b.WriteRune(r)
+		}
+	}
+	return b.String()
+}
diff --git a/core/services/routing/router/score_test.go b/core/services/routing/router/score_test.go
new file mode 100644
index 000000000..8b9b0fe9b
--- /dev/null
+++ b/core/services/routing/router/score_test.go
@@ -0,0 +1,337 @@
+package router
+
+import (
+	"context"
+	"errors"
+	"sort"
+	"strings"
+
+	"github.com/mudler/LocalAI/core/backend"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+type stubScorer struct {
+	results []backend.CandidateScore
+	err     error
+	calls   int
+	lastP   string
+	lastC   []string
+}
+
+func (s *stubScorer) Score(_ context.Context, prompt string, candidates []string) ([]backend.CandidateScore, error) {
+	s.calls++
+	s.lastP = prompt
+	s.lastC = append(s.lastC[:0], candidates...)
+	if s.err != nil {
+		return nil, s.err
+	}
+	return s.results, nil
+}
+
+func testPolicies() []ScorePolicy {
+	return []ScorePolicy{
+		{Label: "code-generation", Description: "writing, debugging, or explaining code"},
+		{Label: "casual-chat", Description: "small talk and general conversation"},
+		{Label: "math-reasoning", Description: "arithmetic, equations, word problems"},
+	}
+}
+
+func sortedLabels(d Decision) []string {
+	out := append([]string(nil), d.Labels...)
+	sort.Strings(out)
+	return out
+}
+
+func equalLabels(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
+
+var _ = Describe("ScoreClassifier", func() {
+	It("returns a single dominant label", func() {
+		// Raw mode: code's joint log-prob is far above the rest, so
+		// softmax collapses to ~1.0 on code and the others fall below
+		// the activation threshold (default 0.15).
+		s := &stubScorer{results: []backend.CandidateScore{
+			{LogProb: -0.05, NumTokens: 6},
+			{LogProb: -8.0, NumTokens: 6},
+			{LogProb: -10.0, NumTokens: 6},
+		}}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{})
+		d, err := c.Classify(context.Background(), Probe{Prompt: "fix this null pointer"})
+		Expect(err).NotTo(HaveOccurred(), "Classify")
+		Expect(equalLabels(d.Labels, []string{"code-generation"})).To(BeTrue(), "Labels = %v, want [code-generation]", d.Labels)
+		Expect(d.Score).To(BeNumerically(">=", 0.8), "want >= 0.8 for dominant single label")
+	})
+
+	It("activates multiple labels", func() {
+		// Raw mode two-way tie: code and math share ~0.49 each, chat
+		// far behind. Both must activate so the router can pick a
+		// candidate covering both capabilities.
+		s := &stubScorer{results: []backend.CandidateScore{
+			{LogProb: -1.0, NumTokens: 6}, // code  ~0.488
+			{LogProb: -4.0, NumTokens: 6}, // chat  ~0.024
+			{LogProb: -1.0, NumTokens: 6}, // math  ~0.488
+		}}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{})
+		d, err := c.Classify(context.Background(), Probe{Prompt: "write code that solves this word problem"})
+		Expect(err).NotTo(HaveOccurred(), "Classify")
+		got := sortedLabels(d)
+		want := []string{"code-generation", "math-reasoning"}
+		Expect(equalLabels(got, want)).To(BeTrue(), "Labels = %v, want %v", got, want)
+	})
+
+	It("falls back to argmax on flat distribution", func() {
+		// All three labels score identically — softmax flat at ~0.333
+		// each. Threshold above that forces the fallback path, which
+		// must still return one label so the router has something to
+		// route on.
+		s := &stubScorer{results: []backend.CandidateScore{
+			{LogProb: -3.0, NumTokens: 6},
+			{LogProb: -3.0, NumTokens: 6},
+			{LogProb: -3.0, NumTokens: 6},
+		}}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{ActivationThreshold: 0.5})
+		d, err := c.Classify(context.Background(), Probe{Prompt: "x"})
+		Expect(err).NotTo(HaveOccurred(), "Classify")
+		Expect(d.Labels).To(HaveLen(1), "want fallback to argmax (single label)")
+	})
+
+	It("mean normalisation rescales by token count", func() {
+		// Raw joint log-probs (-8, -15, -6) without normalisation would
+		// pick chat — because chat's two tokens give it a smaller
+		// summed negative. With mean normalisation the per-token
+		// quality wins: code and math are -2.0/token, chat is
+		// -7.5/token and falls out.
+		s := &stubScorer{results: []backend.CandidateScore{
+			{LogProb: -8.0, NumTokens: 4},  // -2.0 per token
+			{LogProb: -15.0, NumTokens: 2}, // -7.5 per token
+			{LogProb: -6.0, NumTokens: 3},  // -2.0 per token
+		}}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{Normalization: ScoreNormalizationMean})
+		d, err := c.Classify(context.Background(), Probe{Prompt: "x"})
+		Expect(err).NotTo(HaveOccurred(), "Classify")
+		got := sortedLabels(d)
+		want := []string{"code-generation", "math-reasoning"}
+		Expect(equalLabels(got, want)).To(BeTrue(), "Labels = %v, want %v", got, want)
+	})
+
+	It("builds Arch-Router system prompt with route-listing and JSON output schema", func() {
+		s := &stubScorer{results: []backend.CandidateScore{
+			{LogProb: -1, NumTokens: 6},
+			{LogProb: -2, NumTokens: 6},
+			{LogProb: -3, NumTokens: 6},
+		}}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{})
+		_, err := c.Classify(context.Background(), Probe{Prompt: "hello world"})
+		Expect(err).NotTo(HaveOccurred(), "Classify")
+		// System prompt structure
+		Expect(s.lastP).To(ContainSubstring("<routes>"))
+		Expect(s.lastP).To(ContainSubstring(`{"name": "code-generation", "description": "writing, debugging, or explaining code"}`))
+		Expect(s.lastP).To(ContainSubstring("</routes>"))
+		Expect(s.lastP).To(ContainSubstring(`{"route": "<name>"}`))
+		// ChatML envelope from the fallback renderer
+		Expect(s.lastP).To(ContainSubstring("<|im_start|>user\nhello world<|im_end|>"))
+		Expect(strings.HasSuffix(s.lastP, "<|im_start|>assistant\n")).To(BeTrue(), "prompt does not end at assistant marker: %q", s.lastP)
+		// Candidates: JSON output + turn-end token, in policy order
+		Expect(s.lastC).To(Equal([]string{
+			`{"route": "code-generation"}<|im_end|>`,
+			`{"route": "casual-chat"}<|im_end|>`,
+			`{"route": "math-reasoning"}<|im_end|>`,
+		}))
+	})
+
+	It("honours custom stop token", func() {
+		s := &stubScorer{results: []backend.CandidateScore{
+			{LogProb: -1, NumTokens: 6},
+			{LogProb: -2, NumTokens: 6},
+			{LogProb: -3, NumTokens: 6},
+		}}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{StopToken: "<|eot_id|>"})
+		_, err := c.Classify(context.Background(), Probe{Prompt: "x"})
+		Expect(err).NotTo(HaveOccurred(), "Classify")
+		Expect(s.lastC[0]).To(Equal(`{"route": "code-generation"}<|eot_id|>`))
+	})
+
+	It("honours custom prompt renderer", func() {
+		called := false
+		renderer := func(system, user string) (string, error) {
+			called = true
+			return "SYSTEM:" + system + "\nUSER:" + user + "\nASSISTANT:", nil
+		}
+		s := &stubScorer{results: []backend.CandidateScore{
+			{LogProb: -1, NumTokens: 6},
+			{LogProb: -2, NumTokens: 6},
+			{LogProb: -3, NumTokens: 6},
+		}}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{PromptRenderer: renderer})
+		_, err := c.Classify(context.Background(), Probe{Prompt: "hi"})
+		Expect(err).NotTo(HaveOccurred(), "Classify")
+		Expect(called).To(BeTrue(), "renderer must be invoked")
+		Expect(s.lastP).To(HavePrefix("SYSTEM:"))
+		Expect(s.lastP).To(HaveSuffix("\nASSISTANT:"))
+		Expect(s.lastP).To(ContainSubstring("\nUSER:hi\n"))
+	})
+
+	It("caches by normalised prompt", func() {
+		s := &stubScorer{results: []backend.CandidateScore{
+			{LogProb: -0.1, NumTokens: 6},
+			{LogProb: -5, NumTokens: 6},
+			{LogProb: -6, NumTokens: 6},
+		}}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{CacheCap: 64})
+		_, err := c.Classify(context.Background(), Probe{Prompt: "Fix Bug"})
+		Expect(err).NotTo(HaveOccurred(), "classify 1")
+		_, err = c.Classify(context.Background(), Probe{Prompt: " fix bug "})
+		Expect(err).NotTo(HaveOccurred(), "classify 2")
+		Expect(s.calls).To(Equal(1), "second classify should hit cache")
+		Expect(c.CacheLen()).To(Equal(1))
+	})
+
+	It("cache disabled when cap zero", func() {
+		s := &stubScorer{results: []backend.CandidateScore{
+			{LogProb: -1, NumTokens: 6},
+			{LogProb: -5, NumTokens: 6},
+			{LogProb: -6, NumTokens: 6},
+		}}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{})
+		for i := 0; i < 3; i++ {
+			_, err := c.Classify(context.Background(), Probe{Prompt: "same"})
+			Expect(err).NotTo(HaveOccurred(), "classify")
+		}
+		Expect(s.calls).To(Equal(3), "cache disabled")
+	})
+
+	It("propagates scorer error", func() {
+		scorerErr := errors.New("boom")
+		c := NewScoreClassifier(testPolicies(), &stubScorer{err: scorerErr}, ScoreClassifierOptions{})
+		_, err := c.Classify(context.Background(), Probe{Prompt: "x"})
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("boom"), "expected scorer error to propagate")
+	})
+
+	It("returns result-count mismatch as error", func() {
+		s := &stubScorer{results: []backend.CandidateScore{
+			{LogProb: -1, NumTokens: 6},
+		}}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{})
+		_, err := c.Classify(context.Background(), Probe{Prompt: "x"})
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("returned 1 results for 3 policies"))
+	})
+
+	It("errors when every candidate has zero tokens", func() {
+		// Scorer regression: backend forgets to populate NumTokens.
+		// Without the guard this would softmax to uniform 1/N and the
+		// activation threshold would let every label through, masking
+		// the bug as multi-label intent.
+		s := &stubScorer{results: []backend.CandidateScore{
+			{LogProb: -1, NumTokens: 0},
+			{LogProb: -2, NumTokens: 0},
+			{LogProb: -3, NumTokens: 0},
+		}}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{})
+		_, err := c.Classify(context.Background(), Probe{Prompt: "x"})
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("zero tokens"))
+	})
+
+	It("zero-token candidate scores -inf", func() {
+		// A NumTokens=0 candidate must contribute zero softmax mass and
+		// never win, even if its raw log-prob looks favourable.
+		s := &stubScorer{results: []backend.CandidateScore{
+			{LogProb: 100, NumTokens: 0}, // degenerate
+			{LogProb: -2, NumTokens: 6},
+			{LogProb: -3, NumTokens: 6},
+		}}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{})
+		d, err := c.Classify(context.Background(), Probe{Prompt: "x"})
+		Expect(err).NotTo(HaveOccurred(), "Classify")
+		for _, l := range d.Labels {
+			Expect(l).NotTo(Equal("code-generation"), "NumTokens=0 label must not be active")
+		}
+	})
+
+	It("panics on empty policies", func() {
+		Expect(func() { NewScoreClassifier(nil, &stubScorer{}, ScoreClassifierOptions{}) }).To(Panic())
+	})
+
+	It("panics on nil scorer", func() {
+		Expect(func() { NewScoreClassifier(testPolicies(), nil, ScoreClassifierOptions{}) }).To(Panic())
+	})
+
+	It("panics on missing description", func() {
+		Expect(func() {
+			NewScoreClassifier([]ScorePolicy{{Label: "x"}}, &stubScorer{}, ScoreClassifierOptions{})
+		}).To(Panic())
+	})
+
+	It("panics on unknown normalization", func() {
+		Expect(func() {
+			NewScoreClassifier(testPolicies(), &stubScorer{}, ScoreClassifierOptions{Normalization: "nonsense"})
+		}).To(Panic())
+	})
+
+	It("renders a custom SystemPromptTemplate with .Policies", func() {
+		// Operator-supplied template with sprig (`join`) — substitutes
+		// the built-in Arch-Router-shaped prompt entirely. Asserts
+		// that .Policies reaches the template and the rendered output
+		// reaches the scorer.
+		tmpl := `Pick one of: {{range $i, $p := .Policies}}{{if $i}}, {{end}}{{$p.Label}}{{end}}.
+Reply: {"route": "<name>"}`
+		s := &stubScorer{results: []backend.CandidateScore{
+			{LogProb: -1, NumTokens: 6},
+			{LogProb: -2, NumTokens: 6},
+			{LogProb: -3, NumTokens: 6},
+		}}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{
+			SystemPromptTemplate: tmpl,
+		})
+		_, err := c.Classify(context.Background(), Probe{Prompt: "hi"})
+		Expect(err).NotTo(HaveOccurred(), "Classify")
+		Expect(s.lastP).To(ContainSubstring("Pick one of: code-generation, casual-chat, math-reasoning."))
+		Expect(s.lastP).To(ContainSubstring(`Reply: {"route": "<name>"}`))
+		// Built-in default must NOT leak when override is supplied.
+		Expect(s.lastP).NotTo(ContainSubstring("<routes>"))
+	})
+
+	It("renders a custom template with sprig functions", func() {
+		// Sprig's `upper` confirms the funcmap is wired through.
+		s := &stubScorer{results: []backend.CandidateScore{
+			{LogProb: -1, NumTokens: 6},
+			{LogProb: -2, NumTokens: 6},
+			{LogProb: -3, NumTokens: 6},
+		}}
+		c := NewScoreClassifier(testPolicies(), s, ScoreClassifierOptions{
+			SystemPromptTemplate: `{{range .Policies}}{{upper .Label}}: {{.Description}}
+{{end}}`,
+		})
+		_, err := c.Classify(context.Background(), Probe{Prompt: "x"})
+		Expect(err).NotTo(HaveOccurred(), "Classify")
+		Expect(s.lastP).To(ContainSubstring("CODE-GENERATION: writing, debugging, or explaining code"))
+	})
+
+	It("panics on a malformed SystemPromptTemplate", func() {
+		// Last line of defence: ModelConfig.Validate should catch
+		// this earlier, but direct programmatic callers that bypass
+		// validation still get a clear failure.
+		Expect(func() {
+			NewScoreClassifier(testPolicies(), &stubScorer{}, ScoreClassifierOptions{
+				SystemPromptTemplate: "{{ broken",
+			})
+		}).To(Panic())
+	})
+
+	It("Name returns the classifier identifier", func() {
+		c := NewScoreClassifier(testPolicies(), &stubScorer{}, ScoreClassifierOptions{})
+		Expect(c.Name()).To(Equal(ClassifierScore))
+	})
+})
diff --git a/core/services/routing/router/types.go b/core/services/routing/router/types.go
new file mode 100644
index 000000000..178cafbae
--- /dev/null
+++ b/core/services/routing/router/types.go
@@ -0,0 +1,133 @@
+// Package router holds the routing module's classifier interface and
+// the Score implementation.
+//
+// The dispatch architecture is: a "router model" in ModelConfig (one
+// with a Router block) gets matched at request time. The classifier
+// inspects the prompt and returns the set of policy labels it considers
+// active; the surrounding middleware picks the first candidate whose
+// labels are a superset of the active set, rewrites input.Model to that
+// candidate, and falls back through the existing model resolution path.
+// This keeps ACL checks, disabled-state, and per-model PII consistent —
+// the router does *model* selection, nothing else.
+//
+// The package deliberately has no dependency on core/http or
+// core/services — those wire the classifier in and feed it the request
+// shape they own. Keeps the classifier easy to unit-test against
+// synthetic Probe inputs and reusable from non-HTTP entry points
+// (e.g., a future MCP routing tool).
+package router
+
+import (
+	"context"
+	"time"
+)
+
+// Probe is the classifier's input — the parsed prompt content the
+// classifier needs to make a decision. Populated by the caller (the
+// middleware does the schema-shape extraction); the classifier never
+// inspects the original request struct.
+type Probe struct {
+	// Prompt is the merged user-visible text. For chat completions it
+	// is the concatenation of message contents (separated by newlines);
+	// for plain completions it is the raw prompt.
+	Prompt string
+}
+
+// Decision is the classifier's output. Labels carries the SET of
+// policy labels the classifier considers active for this probe. The
+// surrounding middleware picks the first candidate whose Labels
+// superset the active label set; that lets one prompt activate multiple
+// policies and route to a model capable of all of them. Score is the
+// softmax probability of the top label — kept for the decision log so
+// admins can spot uncertain calls.
+type Decision struct {
+	Labels  []string      `json:"labels"`
+	Score   float64       `json:"score"`
+	Latency time.Duration `json:"latency"`
+
+	// LabelScores carries the full per-label score distribution that
+	// fed the threshold check, in policy-declaration order. Score
+	// classifier emits softmax probabilities (sum to 1.0); rerank
+	// emits independent relevance in [0, 1]. Empty on cache hits —
+	// the cache stores only the final label set, not the distribution.
+	LabelScores []LabelScore `json:"label_scores,omitempty"`
+
+	// ActivationThreshold is the floor a label's score had to clear
+	// to land in Labels. Surfaced so the decision log can show how
+	// close inactive labels got to firing.
+	ActivationThreshold float64 `json:"activation_threshold,omitempty"`
+
+	// Cached is true when the decision came from the L2 embedding
+	// cache rather than a fresh classifier run. CacheSimilarity carries
+	// the cosine similarity of the cache hit (0 when not cached).
+	Cached          bool    `json:"cached,omitempty"`
+	CacheSimilarity float64 `json:"cache_similarity,omitempty"`
+}
+
+// LabelScore is one entry in Decision.LabelScores — a policy label and
+// the classifier's score for it. Score semantics depend on the
+// classifier (softmax probability for score, relevance for rerank), but
+// the threshold-comparison contract is identical.
+type LabelScore struct {
+	Label string  `json:"label"`
+	Score float64 `json:"score"`
+}
+
+// NewLabelScores zips two parallel slices (label name + score) into the
+// []LabelScore shape Decision carries. Caller must ensure len(labels)
+// == len(scores); panics on mismatch to surface the classifier bug
+// loudly rather than silently truncate.
+func NewLabelScores(labels []string, scores []float64) []LabelScore {
+	if len(labels) != len(scores) {
+		panic("router: NewLabelScores called with mismatched slice lengths")
+	}
+	out := make([]LabelScore, len(labels))
+	for i, l := range labels {
+		out[i] = LabelScore{Label: l, Score: scores[i]}
+	}
+	return out
+}
+
+// Classifier is the entry point the middleware calls. The
+// implementation honours ctx cancellation so long-running classifiers
+// abort when the request context dies.
+type Classifier interface {
+	Classify(ctx context.Context, p Probe) (Decision, error)
+	// Name is a stable identifier that ends up in RouterDecision rows
+	// — admins read this to know which classifier produced a given
+	// decision.
+	Name() string
+}
+
+// Classifier names. Single source of truth for the YAML
+// classifier: field, the buildClassifier dispatch in the
+// middleware, and the strings each Classifier returns from Name().
+const (
+	// ClassifierScore picks labels by asking a small classifier
+	// model (Arch-Router-style) to score each policy label as a
+	// continuation of the routing prompt. See router/score.go for
+	// the full rationale.
+	ClassifierScore = "score"
+
+	// ClassifierColbert picks labels by reranking each policy's
+	// description against the prompt via LocalAI's rerankers
+	// backend. Robust when policy labels are abstract relative to
+	// user prompts — the description is the natural English the
+	// reranker was trained on. The classifier_model points to a
+	// reranker model (cross-encoder or bge-m3-colbert); the
+	// `type:` field on that model's YAML controls which Reranker
+	// library mode loads. See router/rerank.go.
+	ClassifierColbert = "colbert"
+)
+
+// LabelFallback is the synthetic label written to the decision
+// store when the middleware uses cfg.Router.Fallback rather than a
+// classifier-picked candidate.
+const LabelFallback = "fallback"
+
+// errDecision packages an error with a populated Latency so each
+// classifier's Classify can return early without restating the
+// `Decision{Latency: time.Since(start)}, err` pattern.
+func errDecision(start time.Time, err error) (Decision, error) {
+	return Decision{Latency: time.Since(start)}, err
+}
diff --git a/core/trace/backend_trace.go b/core/trace/backend_trace.go
index 890da49fc..83af1339e 100644
--- a/core/trace/backend_trace.go
+++ b/core/trace/backend_trace.go
@@ -32,6 +32,7 @@ const (
 	BackendTraceVoiceEmbed      BackendTraceType = "voice_embed"
 	BackendTraceAudioTransform  BackendTraceType = "audio_transform"
 	BackendTraceModelLoad       BackendTraceType = "model_load"
+	BackendTraceScore           BackendTraceType = "score"
 )
 
 type BackendTrace struct {
@@ -41,10 +42,21 @@ type BackendTrace struct {
 	ModelName string           `json:"model_name"`
 	Backend   string           `json:"backend"`
 	Summary   string           `json:"summary"`
-	Error     string           `json:"error,omitempty"`
-	Data      map[string]any   `json:"data"`
+	// Body is the full request payload sent to the backend, when one
+	// applies (currently: cloud-proxy passthrough forwards). Summary
+	// is a short preview for the trace list; Body is the full
+	// payload shown when the row is expanded. Capped by the recorder
+	// to keep the in-memory ring buffer bounded.
+	Body  string         `json:"body,omitempty"`
+	Error string         `json:"error,omitempty"`
+	Data  map[string]any `json:"data"`
 }
 
+// MaxTraceBodyBytes caps the per-trace stored request body. Roomy
+// enough to keep typical chat histories intact while preventing a
+// runaway buffer when a caller streams MB-scale payloads.
+const MaxTraceBodyBytes = 1 << 20
+
 var backendTraceBuffer *circularbuffer.Queue[*BackendTrace]
 var backendMu sync.Mutex
 var backendLogChan = make(chan *BackendTrace, 100)
@@ -202,3 +214,13 @@ func TruncateToBytes(s string, maxBytes int) string {
 	}
 	return s[:maxBytes-len(suffix)] + suffix
 }
+
+// TruncateBytes is the []byte counterpart of TruncateString — it copies
+// at most maxLen bytes, avoiding a full string([]byte) allocation when
+// the input is a large request body.
+func TruncateBytes(b []byte, maxLen int) string {
+	if len(b) <= maxLen {
+		return string(b)
+	}
+	return string(b[:maxLen]) + "..."
+}
diff --git a/docs/content/features/cloud-proxy.md b/docs/content/features/cloud-proxy.md
new file mode 100644
index 000000000..1c870a930
--- /dev/null
+++ b/docs/content/features/cloud-proxy.md
@@ -0,0 +1,232 @@
++++
+title = "Cloud passthrough proxy"
+weight = 28
+toc = true
+description = "Forward requests to OpenAI, Anthropic, or any compatible provider"
+tags = ["Proxy", "Cloud", "Routing", "Advanced"]
+categories = ["Features"]
++++
+
+LocalAI can forward chat-completion and Anthropic Messages requests to an
+external provider instead of running them through the local gRPC backend
+pipeline. Configure a model with `backend: cloud-proxy` and a `proxy.upstream_url`,
+and LocalAI bypasses templating, MCP injection, and the local model loader
+entirely — the upstream sees the body the client sent (with only the top-level
+`model` field optionally rewritten).
+
+The streaming PII filter still runs over the upstream's SSE stream, so cloud
+egress remains subject to the same redaction rules a local model would apply.
+
+## When to use this
+
+- Mix local and cloud models in the same LocalAI instance — clients hit one
+  endpoint, LocalAI dispatches per model.
+- Apply LocalAI's auth, usage tracking, and PII redaction to cloud traffic
+  before the body leaves the network.
+- Use the intelligent router to send small or simple prompts to a local model
+  and complex ones to Claude or GPT-4o.
+
+## How it works
+
+1. Request hits LocalAI on `/v1/chat/completions` (OpenAI-shaped) or
+   `/v1/messages` (Anthropic-shaped).
+2. The standard auth and routing middleware runs.
+3. Per-model PII redaction runs request-side as it would for any model.
+4. The handler detects the `cloud-proxy` backend in passthrough mode and
+   loads the cloud-proxy gRPC backend, which owns the outbound HTTP.
+5. The backend POSTs the body to `proxy.upstream_url` with provider-aware
+   authentication, then streams the SSE response back to core.
+6. The streaming PII filter rewrites per-token text in flight; the upstream's
+   event names and metadata pass through unchanged.
+
+Passthrough mode is **wire-format-faithful** — it does not translate request
+shapes between providers. A client posting an OpenAI-shaped body to an
+Anthropic upstream will get a confused upstream. Use the matching wire format,
+or switch to translate mode (below).
+
+## Configuration
+
+The cloud-proxy backend has one knob — the provider it should authenticate
+against — and two modes:
+
+| `proxy.mode` | What it does | When to use |
+|---|---|---|
+| `passthrough` (default) | Forwards the request body verbatim to `upstream_url`. Client must speak the upstream's wire format. | Same wire format on both ends. |
+| `translate` | Backend converts internal proto to the upstream's wire format. Client can speak OpenAI-shaped requests to an Anthropic upstream, etc. | Cross-format adaptation. |
+
+`proxy.provider` selects the auth scheme and (in translate mode) the wire
+format. Supported values: `openai`, `anthropic`.
+
+API keys are loaded from either an environment variable (`api_key_env`) or a
+file (`api_key_file`). The key never appears in the config file or the admin
+UI; pick whichever fits your secret-management setup.
+
+### OpenAI passthrough
+
+```yaml
+name: gpt-4o-proxy
+backend: cloud-proxy
+
+# When set, replaces the client's "model" field before forwarding.
+# Useful when the LocalAI alias differs from the upstream's canonical name.
+proxy:
+  mode: passthrough
+  provider: openai
+  upstream_url: https://api.openai.com/v1/chat/completions
+  api_key_env: OPENAI_API_KEY
+  upstream_model: gpt-4o
+  request_timeout_seconds: 120
+
+# PII filtering defaults to ON for cloud-proxy backends. Override by setting
+# pii.enabled: false explicitly. Per-pattern action overrides go in
+# pii.patterns; see the Middleware admin page or the Middleware feature doc.
+pii:
+  enabled: true
+```
+
+Then start LocalAI with the API key in the environment:
+
+```bash
+export OPENAI_API_KEY=sk-...
+local-ai run
+```
+
+Clients hit `http://localhost:8080/v1/chat/completions` with `"model": "gpt-4o-proxy"`
+and the request lands on OpenAI's API.
+
+### Anthropic passthrough
+
+```yaml
+name: claude-sonnet-proxy
+backend: cloud-proxy
+
+proxy:
+  mode: passthrough
+  provider: anthropic
+  upstream_url: https://api.anthropic.com/v1/messages
+  api_key_env: ANTHROPIC_API_KEY
+  upstream_model: claude-3-5-sonnet-20241022
+  request_timeout_seconds: 300
+
+pii:
+  enabled: true
+  # Block — not just mask — leaked credentials before they reach the upstream.
+  patterns:
+    - id: api_key_prefix
+      action: block
+```
+
+Anthropic clients hit `http://localhost:8080/v1/messages` with
+`"model": "claude-sonnet-proxy"`.
+
+### Other OpenAI-compatible providers
+
+Most third-party providers (Together, Groq, DeepInfra, OpenRouter, …) speak
+the OpenAI chat-completions wire format. Use `provider: openai` with the
+provider's URL and API key:
+
+```yaml
+name: llama-3-70b-via-together
+backend: cloud-proxy
+
+proxy:
+  mode: passthrough
+  provider: openai
+  upstream_url: https://api.together.xyz/v1/chat/completions
+  api_key_env: TOGETHER_API_KEY
+  upstream_model: meta-llama/Llama-3-70b-chat-hf
+```
+
+### Translate mode
+
+In translate mode the cloud-proxy backend converts LocalAI's internal proto
+to the provider's wire format. This lets a client speak one shape (e.g.
+OpenAI Chat Completions) against an upstream that expects another (e.g.
+Anthropic Messages).
+
+```yaml
+name: claude-via-openai-clients
+backend: cloud-proxy
+
+proxy:
+  mode: translate
+  provider: anthropic
+  upstream_url: https://api.anthropic.com/v1/messages
+  api_key_env: ANTHROPIC_API_KEY
+  upstream_model: claude-3-5-sonnet-20241022
+```
+
+Translate mode currently routes only pure-text completions — tool calls,
+image blocks, and per-request usage tokens are dropped through the
+internal `Predict()` signature. Use passthrough mode when your clients need
+the upstream's full feature set.
+
+## Loading secrets from a file
+
+`api_key_file` is an alternative to `api_key_env` when your secret manager
+mounts keys as files (e.g. Kubernetes secrets, Docker secrets, Vault Agent):
+
+```yaml
+proxy:
+  api_key_file: /run/secrets/openai_api_key
+```
+
+The file is read at backend load time and trimmed of surrounding whitespace.
+`api_key_env` and `api_key_file` are mutually exclusive.
+
+## Combining with the intelligent router
+
+A router model can spread traffic across local and cloud candidates. The
+score classifier reads the policy descriptions and routes per request:
+
+```yaml
+name: smart-router
+router:
+  classifier: score
+  classifier_model: arch-router-1.5b
+  fallback: qwen-3-7b-local
+  activation_threshold: 0.40
+  policies:
+    - label: casual
+      description: small talk, greetings, short answers
+    - label: code
+      description: writing or debugging code in any programming language
+    - label: heavy-reasoning
+      description: long-form analysis, complex math, multi-step reasoning
+  candidates:
+    - model: qwen-3-7b-local
+      labels: [casual]
+    - model: gpt-4o-proxy
+      labels: [casual, code]
+    - model: claude-sonnet-proxy
+      labels: [casual, code, heavy-reasoning]
+```
+
+The router rewrites `input.Model` to the chosen candidate; per-model PII,
+ACLs, and the cloud-proxy fork all run against the resolved target.
+
+See [Middleware: PII filtering and intelligent routing]({{< relref "middleware.md" >}})
+for the full router and PII-filter reference.
+
+## Limitations
+
+- **Passthrough does no wire-shape translation.** Use `mode: translate` (with
+  the constraints documented above) or send requests that match the upstream's
+  format.
+- **No output-side PII for non-streaming responses.** Streaming responses are
+  filtered in flight; buffered responses pass through verbatim. Request-side
+  PII covers both.
+- **No retry or backoff.** Transient upstream failures bubble up to the client
+  as `502 Bad Gateway`.
+- **No request shape validation.** If the upstream rejects the body, its
+  error envelope is forwarded to the client unchanged.
+
+## Operational notes
+
+- Cloud-proxy backends load like any other gRPC backend — they consume one
+  process per loaded model and appear in the backend management view, but
+  they hold no GPU memory.
+- Usage stats and the trace log capture cloud-proxy requests like any other
+  request. Token counts come from the upstream's `usage` field when present.
+- Set `request_timeout_seconds` defensively — a hung upstream otherwise ties
+  up an HTTP handler until the client disconnects.
diff --git a/docs/content/features/middleware.md b/docs/content/features/middleware.md
new file mode 100644
index 000000000..ee4ef9d4a
--- /dev/null
+++ b/docs/content/features/middleware.md
@@ -0,0 +1,509 @@
++++
+title = "Middleware: PII filtering and intelligent routing"
+weight = 27
+toc = true
+description = "Per-model PII redaction and policy-based request routing"
+tags = ["Routing", "Privacy", "PII", "Middleware", "Advanced"]
+categories = ["Features"]
++++
+
+LocalAI ships a request-middleware layer that sits between the HTTP API and
+the backend dispatcher. Two subsystems share that layer because they share
+the same lifecycle hook: **PII filtering** scans the request body before it
+reaches a backend (and the SSE stream on the way out), and the **intelligent
+router** rewrites `input.Model` so a single client-facing model name fans
+out across multiple downstream targets.
+
+Both are inspected and configured from the same admin page
+(`/app/middleware`), backed by the same REST surface (`/api/middleware/*`,
+`/api/pii/*`, `/api/router/*`) and the same MCP tools.
+
+## Request lifecycle
+
+```
+client ── auth ── route-model ── per-model PII ── backend ── streaming PII ── client
+                       │                              │
+                       └─── decision log              └─── event log
+```
+
+The router runs first (it picks the target model so per-model PII has
+something to gate on), per-model PII runs next (gated by the resolved
+config), the backend executes, and the streaming PII filter rewrites the
+SSE response in flight. Each subsystem writes to its own admin-visible
+log: `/api/router/decisions` for routing, `/api/pii/events` for redaction
+and block actions.
+
+---
+
+## PII filtering
+
+PII redaction is **per-model and off by default**. The default flips to
+**on for any backend whose name starts with `proxy-`** because that traffic
+crosses the network to a third-party provider. Explicit `pii.enabled`
+in a model's YAML always wins over the backend default.
+
+### Pattern catalog
+
+The built-in regex tier ships six patterns. Each has a default action
+(`mask`, `block`, or `route_local`) and a length cap that prevents
+pathological inputs from blowing up scanning time:
+
+| ID | Description | Default action | Max length |
+|---|---|---|---|
+| `email` | Email address | `mask` | 254 |
+| `phone` | Phone number (international or US) | `mask` | 24 |
+| `ssn` | US Social Security Number | `mask` | 11 |
+| `credit_card` | Credit card number (Luhn-verified) | `mask` | 19 |
+| `ipv4` | IPv4 address | `mask` | 15 |
+| `api_key_prefix` | `sk-`, `pk-`, `xoxb-`, `ghp_`, `github_pat_` | **`block`** | 200 |
+
+`mask` rewrites the match to `[REDACTED:<id>]` in the request body before
+forwarding. `block` returns HTTP 400 with `error.type=pii_blocked` to the
+client without forwarding. `route_local` is reserved for the routing
+integration (see below) and falls back to `mask` when no local route is
+available.
+
+### Per-model configuration
+
+Add a `pii:` block to a model YAML to opt in (or out, or to override
+per-pattern actions):
+
+```yaml
+# Local model — explicit opt-in so chats with this model get redaction
+# applied request-side.
+name: qwen-7b-local
+backend: llama-cpp
+pii:
+  enabled: true
+```
+
+```yaml
+# Cloud-bound model — defaults to enabled because backend is cloud-proxy.
+# Tighten api_key_prefix from the global default and downgrade email to
+# route_local so emails route to a local model rather than leaving the
+# network.
+name: claude-strict
+backend: cloud-proxy
+proxy:
+  mode: passthrough
+  provider: anthropic
+  upstream_url: https://api.anthropic.com/v1/messages
+  api_key_env: ANTHROPIC_API_KEY
+pii:
+  patterns:
+    - id: api_key_prefix
+      action: block        # already the default, made explicit for audit
+    - id: email
+      action: route_local
+```
+
+The regex itself stays global — only the action is settable per-model.
+Adding new patterns is a build-time concern (extend `patternRegexps` in
+`core/services/routing/pii/patterns.go`).
+
+### NER tier (optional)
+
+The regex matcher covers high-precision patterns. For natural-language
+PII (proper names, addresses, organization names) LocalAI carries an
+**encoder NER tier** that runs after the regex pass. It expects a
+transformers token-classification model wired through the `TokenClassify`
+gRPC primitive (e.g. `dslim/bert-base-NER`). The detector annotates
+spans with an entity group (`PER`, `LOC`, `ORG`, `MISC`); per-group
+actions are configurable through the same `pii:` block.
+
+The NER tier ships as a contract (`NERDetector`, `NERConfig` in
+`core/services/routing/pii/ner.go`); an operator-facing knob to load and
+attach a detector is not plumbed yet. When no detector is configured the
+regex tier still runs.
+
+### Streaming PII filter
+
+Buffered (`/v1/chat/completions` without `"stream": true`) responses are
+forwarded verbatim today — only the request-side scan runs. Streaming
+responses run through `pii.StreamFilter` which buffers SSE chunks until
+either a full pattern matches or the buffer's max length is reached,
+then emits the safe prefix. The streaming filter is what makes the
+cloud-proxy backend and the MITM proxy safe to expose to clients that
+issue streaming requests.
+
+The streaming filter is wired automatically for any model with `pii.enabled`
+true — there is no separate streaming toggle.
+
+### Admin page
+
+The `/app/middleware` page (admin role only) has four tabs — **Filtering**,
+**Routing**, **MITM Proxy** (see the [MITM doc]({{< relref "mitm-proxy.md" >}})),
+and **Events**. The Filtering tab shows:
+
+- The pattern catalogue with live action dropdowns. Changing an action via
+  the UI calls `PUT /api/pii/patterns/:id` and updates the live redactor
+  in-process. Click **Persist** in the action header to write the current
+  state into `runtime_settings.json` so the next process start re-applies it.
+- A per-model resolved-state table — each model row reports `enabled`,
+  the per-pattern overrides, and which patterns are effectively active.
+- A live test panel that posts sample text to `/api/pii/test` and
+  highlights matches with their resolved actions, without storing the
+  text in the event log.
+
+### REST surface
+
+| Method | Path | Auth | Purpose |
+|---|---|---|---|
+| GET | `/api/pii/patterns` | any | Live pattern list with current actions. Used by the UI catalogue. |
+| POST | `/api/pii/test` | any | Dry-run the redactor on `{"text":"..."}`. Returns hits and the would-be-rewritten body. Does not write to the event log. |
+| GET | `/api/pii/events` | admin | Recent middleware events — PII redactions, MITM connect/traffic, admission denials. Filterable by `correlation_id`, `user_id`, `pattern_id`, `kind`. |
+| PUT | `/api/pii/patterns/:id` | admin | Update a pattern in-process. Body accepts `{"action":"mask"\|"block"\|"route_local"}` and/or `{"disabled":true\|false}`. Transient — reverts on restart unless persisted. |
+| POST | `/api/pii/patterns/persist` | admin | Snapshot the live per-pattern (action, disabled) state into `runtime_settings.json`. |
+| GET | `/api/middleware/status` | admin | Aggregated dashboard data: patterns + per-model resolved state + router status + MITM status + admission status. One round-trip for the UI. |
+
+### MCP tools
+
+The same surface is mirrored through the LocalAI Assistant MCP server so
+the in-process and stdio assistants can manage the filter conversationally:
+
+| Tool | Read/Write | Purpose |
+|---|---|---|
+| `list_pii_patterns` | read | Returns the live pattern list. |
+| `get_pii_events` | read | Recent redaction / block events with optional filters. |
+| `test_pii_redaction` | read | Dry-run sample text without writing to the event log. |
+| `get_middleware_status` | read | Aggregator — the same payload as `GET /api/middleware/status`. |
+| `set_pii_pattern_action` | write | Update a pattern's action. Admin-only. |
+| `persist_pii_patterns` | write | Snapshot live state to `runtime_settings.json`. Admin-only. |
+
+---
+
+## Intelligent routing
+
+A **router model** is a model whose YAML carries a `router:` block. When
+a client addresses it (`"model": "smart-router"`), the middleware
+classifies the prompt, picks a downstream candidate model, rewrites
+`input.Model` to the candidate, and the standard model-resolution path
+runs against that resolved target. ACL checks, disabled-state, and
+per-model PII all apply to the resolved model — the router does
+*model selection only*.
+
+#### Depth-1 invariant
+
+Candidates **must not** themselves be router models. A
+`smart-router → claude-strict → cloud-proxy` chain is fine
+(`claude-strict` is a regular cloud-proxy model). A
+`smart-router → other-router → real-model` chain is rejected at runtime
+by the middleware (the dispatcher returns HTTP 500 with a
+`depth-1 invariant` error). This keeps the dispatch graph acyclic and
+predictable.
+
+#### Fallback
+
+If no candidate's label set covers the active label set from the classifier,
+or the classifier errors out, the router uses `cfg.Router.Fallback`.
+An empty `fallback` causes the dispatch to fail with HTTP 500 rather
+than silently routing somewhere unintended — fail-fast, not
+silent-bypass.
+
+### Available classifiers
+
+LocalAI ships two classifier implementations. Pick one with `classifier:`
+in the router YAML:
+
+| Classifier | When to use | Underlying primitive |
+|---|---|---|
+| `score` (default) | Small classifier-tuned LM (Arch-Router-style). Best when label vocabulary is well-covered by next-token continuation. | `Score` gRPC primitive (llama-cpp, vLLM). |
+| `colbert` | When label descriptions are abstract or short and a next-token classifier produces flat distributions. Robust on long-form policy descriptions. | rerankers backend in ColBERT mode (e.g. `bge-m3-colbert` from the gallery). |
+
+Both classifiers share the same YAML shape: `classifier_model`,
+`policies`, `candidates`, `fallback`, `activation_threshold`,
+`classifier_cache_size`, and the optional `embedding_cache` block.
+
+### The Score classifier
+
+The `score` classifier works like this:
+
+1. Build a Qwen/ChatML system prompt that lists every policy label with
+   its description and primes the model to emit a label as the assistant
+   turn.
+2. Ask the classifier model to **score every policy label** as the
+   first-token(s) continuation. This uses the `Score` gRPC primitive
+   (`backend.proto::Score`), which returns per-candidate log-probabilities
+   length-normalized so candidates of unequal token length stay
+   comparable.
+3. Softmax the length-normalized log-probabilities into a probability
+   distribution over labels.
+4. Threshold the distribution: every label whose probability passes
+   `activation_threshold` joins the **active label set**.
+5. Pick the FIRST candidate whose `Labels` is a superset of the active
+   set. Admins order candidates smallest → largest so a single-label
+   query routes to the smallest capable model, while a query that
+   activates multiple labels falls to a candidate that covers them all.
+
+This is the Arch-Router approach extended for multi-label. The
+distribution carries more signal than the argmax — reading off the
+spread lets one prompt activate multiple policies and route to a model
+capable of all of them.
+
+#### Recommended classifier model
+
+[Arch-Router-1.5B](https://huggingface.co/katanemo/Arch-Router-1.5B) is
+the canonical choice. It's a Qwen-2.5-1.5B-Instruct base trained
+specifically on routing-policy continuation, so the ChatML system-prompt
++ label-continuation pattern produces well-separated label probabilities
+without prompt tuning. The Q4_K_M GGUF runs on CPU, GPU, and Intel SYCL.
+
+The classifier model must support the `Score` gRPC primitive (today: the
+llama-cpp and vLLM backends) and use the ChatML chat template. Any small
+ChatML instruct model works under those constraints, but expect flatter
+probability distributions which translate to a higher
+`activation_threshold` to keep noise out of the active label set.
+
+On llama-cpp, declare `known_usecases: [score]` on the classifier
+model — LocalAI rejects configs that combine `score` with
+`chat`/`completion`/`embeddings` there, because the Score RPC races
+the `llama_context` against slot-loop traffic.
+
+### The Colbert classifier
+
+The `colbert` classifier reranks each policy *description* against the
+prompt via the rerankers backend and activates the labels whose
+relevance scores clear `activation_threshold` (default 0.5 for
+reranker-style scores in [0, 1]).
+
+```yaml
+router:
+  classifier: colbert
+  classifier_model: bge-m3-colbert  # gallery entry; loads BAAI/bge-m3 in ColBERT mode
+  activation_threshold: 0.5
+  policies:
+    - label: code-generation
+      description: writing, debugging, reading, or explaining code
+    - label: casual-chat
+      description: small talk, greetings, jokes
+  candidates: [...]
+```
+
+The reranker scores the *description* (natural English) rather than
+asking a small LM to score the *label* as a next-token continuation,
+so it tends to be more robust when policy labels are abstract slugs
+(`compliance-review`, `tier-2-support`). The trade-off is one
+reranker round-trip per request — bge-m3 in ColBERT mode is fast
+enough on GPU that this is comparable to the Score path for most
+workloads. The `embedding_cache` block applies identically.
+
+The reranker model's `type:` (in the model YAML) selects which
+underlying scoring head loads — `colbert` for late-interaction MaxSim,
+`cross-encoder` for cross-attention scoring. The classifier itself is
+indifferent; pick the head that fits your latency / quality budget.
+
+### YAML reference
+
+```yaml
+name: smart-router
+known_usecases:
+  - chat
+router:
+  # `score` (Arch-Router-style next-token scoring) or `colbert`
+  # (rerank policy descriptions). See "Available classifiers" above.
+  classifier: score
+
+  # A model loaded by LocalAI that supports the Score gRPC primitive
+  # (llama-cpp and vLLM ship implementations). Arch-Router-1.5B is the
+  # canonical choice.
+  classifier_model: arch-router-1.5b
+
+  # Bounded LRU keyed on (case-folded, whitespace-trimmed) prompt — prompts
+  # repeat in agent loops; the cache amortises the classifier round-trip
+  # across them. 0 here means "use the default" (1024); the cache cannot be
+  # disabled from YAML today.
+  classifier_cache_size: 256
+
+  # Softmax probability floor a label must clear to join the active label set.
+  # 0 = use the package default (0.15). 0.40 is a better empirical
+  # starting point on Arch-Router-1.5B — see the tuning note below.
+  activation_threshold: 0.40
+
+  # Used when no candidate covers the active label set, or the classifier
+  # itself errors. Empty here = fail-fast with HTTP 500.
+  fallback: qwen3-0.6b
+
+  # The label vocabulary. Descriptions are fed verbatim into the
+  # classifier's system prompt — short, action-oriented sentences work
+  # best ("writing or debugging code", "small talk").
+  policies:
+    - label: code-generation
+      description: writing, debugging, reading, or explaining code in any programming language
+    - label: casual-chat
+      description: small talk, greetings, jokes, or general conversation with no specific task
+    - label: math-reasoning
+      description: arithmetic, equations, percentage calculations, or step-by-step word problems
+
+  # Routing table — order matters (smallest → largest). See "Score
+  # classifier" above for the matching rule.
+  candidates:
+    - model: qwen3-0.6b
+      labels: [casual-chat]
+    - model: qwen_qwen3.5-2b
+      labels: [code-generation, casual-chat, math-reasoning]
+```
+
+### Tuning `activation_threshold`
+
+The threshold is the single knob you'll want to tune per
+(classifier-model, policy-set) pair. On Arch-Router-1.5B with the
+three-policy setup above, sweeping the threshold over a hand-labeled
+30-prompt corpus produced:
+
+| Threshold | Label-set accuracy | End-to-end routing accuracy |
+|---:|---:|---:|
+| 0.15 (package default) | 30% | 73% |
+| 0.30 | 57% | 87% |
+| **0.40** | **60%** | **90%** |
+| 0.45 | 67% | 97% |
+| 0.50 | 67% | 97% |
+
+The classifier's argmax matches the dominant label 93% of the time on
+this corpus — what the threshold controls is how much secondary-label
+noise leaks into the active label set. Low thresholds push single-label
+queries to multi-label-capable (larger) candidates unnecessarily; 0.40
+keeps the dominant label dominant without losing genuine compound
+activations.
+
+Re-tune per (classifier-model, policy-set) pair. The `/api/score`
+endpoint (see below) is the convenient probe — it returns the raw
+length-normalized log-probabilities so you can sweep thresholds offline
+without driving real chat completions.
+
+### Embedding cache (L2)
+
+Classification is the most expensive thing the middleware does. The
+score classifier already memo-caches verbatim repeats (case- and
+whitespace-folded prompt → decision); the **embedding cache** is the
+L2 tier that catches *semantically similar* prompts — "How do I exit
+vim?" and "i need to quit vim" can share a decision instead of running
+the classifier twice.
+
+Pairs naturally with a larger / slower classifier model: the steady-state
+cost on cache hits collapses to one embedding round-trip plus a KNN
+search, both well under 100ms with `nomic-embed-text-v1.5` + local-store.
+
+#### Configuration
+
+Add an `embedding_cache:` block to a router model:
+
+```yaml
+router:
+  classifier: score
+  classifier_model: arch-router-1.5b
+  policies: [...]
+  candidates: [...]
+
+  embedding_cache:
+    embedding_model: nomic-embed-text-v1.5    # any loaded embedding model
+    similarity_threshold: 0.80                # cosine sim floor for a hit (default 0.80)
+    confidence_threshold: 0.60                # min top-label prob to cache a decision (default 0.60)
+    # store_name: router-cache-smart-router   # optional override; defaults to "router-cache-<router>"
+```
+
+Omit the block entirely to disable. The cache adds two new failure modes
+(embedder unavailable, store unavailable) — both fall through to the
+inner classifier so routing keeps working.
+
+#### How it works
+
+For each request:
+
+1. Embed the probe prompt via the configured `embedding_model`.
+2. KNN top-1 against the per-router local-store collection.
+3. If similarity ≥ `similarity_threshold`, return the cached decision
+   (`Cached=true`, `CacheSimilarity=<sim>` in the decision log).
+4. Miss → run the inner classifier. If `decision.score >= confidence_threshold`,
+   insert `(embedding, decision)` into the store. Low-confidence
+   decisions are deliberately skipped so they can't poison future
+   paraphrases.
+
+The local-store collection is named `router-cache-<router-model-name>` by
+default — each router gets its own collection so two routers can't
+cross-contaminate. Collections persist on disk (local-store is the
+canonical persistent vector backend), so the cache survives restarts.
+
+#### Tuning notes
+
+- **Similarity threshold**: 0.80 is the package default — re-tune
+  per (embedding model, corpus). The histogram on the Routing tab
+  shows where the cosine distribution actually sits; pick a
+  threshold above the cross-intent cluster and below the paraphrase
+  cluster.
+- **Confidence threshold**: 0.60 corresponds roughly to "the
+  classifier is committed to a top label." Don't lower this — caching
+  unsure decisions propagates the uncertainty.
+- **Cache flush**: invalidates automatically when the router YAML
+  changes (the classifier cache is fingerprinted by `yaml.Marshal`),
+  but the underlying local-store collection still holds the old
+  payloads. Manual flush via local-store admin or by renaming
+  `store_name` if you need a hard reset.
+- **Latency budget**: an embedding round-trip (typically 30–80ms for
+  small embedding models) plus KNN search (~5ms) is added to every
+  *miss* on top of the classifier latency. Cache hits skip the
+  classifier entirely. Break-even is around 7–10% hit rate; agent
+  loops with repeated phrasing easily exceed this.
+
+### Admin page
+
+The `/app/middleware` page has a **Routing** tab listing every router
+model's classifier, policies, candidates, and fallback. The **Events**
+tab shows the decision log — one row per classified request with
+correlation ID, requested model, served model, classifier name, active
+labels, top-label score, and latency.
+
+Routing decisions are stored in an in-process ring buffer (default
+capacity 5,000). The decision log is for audit and tuning — the
+canonical usage log lives in `/api/usage` and correlates by request ID.
+
+### REST surface
+
+| Method | Path | Auth | Purpose |
+|---|---|---|---|
+| GET | `/api/router/status` | any | Router configuration: each router model's classifier, policies, candidates. |
+| GET | `/api/router/decisions` | admin | Decision log with optional filters (`correlation_id`, `user_id`, `router_model`, `limit`). |
+| POST | `/api/score` | admin | Direct access to the `Score` gRPC primitive — useful for offline threshold tuning. Body: `{"model": "<classifier-model>", "prompt": "<chatml-prompt>", "candidates": ["label-a", ...], "length_normalize": true}`. The llama-cpp and vLLM backends implement Score; other backends return `UNIMPLEMENTED`. |
+
+### MCP tools
+
+| Tool | Read/Write | Purpose |
+|---|---|---|
+| `get_router_decisions` | read | Recent decision log with optional filters. |
+| `get_middleware_status` | read | Includes the router section listing configured router models. |
+
+Mutating routing config — adding a candidate, changing the classifier
+model — is YAML-only today; reload with `POST /models/reload` to pick
+up edits without restarting.
+
+### Operational notes
+
+- **Reload after YAML edits.** The router configs are loaded at startup
+  and cached. `POST /models/reload` re-reads from disk; the next request
+  rebuilds the classifier from the new config (the classifier cache is
+  fingerprinted by `yaml.Marshal(RouterConfig)` so it invalidates
+  automatically).
+- **Classifier latency** on Arch-Router-1.5B Q4_K_M is ~500ms steady
+  for 3 policies on Intel SYCL. The score primitive re-decodes the full
+  prompt for every candidate today (the KV cache is cleared between
+  candidates); the prompt-KV-sharing optimization is on the perf TODO
+  list in `backend/cpp/llama-cpp/grpc-server.cpp::Score`. Until then,
+  `classifier_cache_size` is the highest-leverage knob for repeat-query
+  workloads (agent loops).
+- **Decision log size**: 5,000-entry ring buffer per process. The
+  log is in-process and not persisted — pair with the usage log for
+  long-horizon audit.
+
+---
+
+## Related features
+
+- [Cloud passthrough proxy]({{< relref "cloud-proxy.md" >}}) — combine
+  the router with `proxy-*` backends to send simple prompts to local
+  models and complex ones to cloud providers.
+- [MITM proxy]({{< relref "mitm-proxy.md" >}}) — apply the same PII
+  filter to Claude Code, Codex CLI, and any HTTPS client without
+  LocalAI holding their API keys.
+- [Authentication]({{< relref "authentication.md" >}}) — admin role is
+  required for mutating endpoints and the `/app/middleware` page; in
+  no-auth single-user mode the synthetic local user has admin role
+  automatically.
diff --git a/docs/content/features/mitm-proxy.md b/docs/content/features/mitm-proxy.md
new file mode 100644
index 000000000..4c0428df4
--- /dev/null
+++ b/docs/content/features/mitm-proxy.md
@@ -0,0 +1,159 @@
++++
+title = "MITM proxy for Claude Code / Codex CLI"
+weight = 29
+toc = true
+description = "Redact PII from cloud-AI traffic without LocalAI holding API keys"
+tags = ["Proxy", "MITM", "Privacy", "Routing", "Advanced"]
+categories = ["Features"]
++++
+
+LocalAI can act as a local HTTPS proxy that **redacts PII from your Claude
+Code, OpenAI Codex CLI, or any HTTPS client** without holding their API keys.
+The proxy intercepts only the LLM API endpoints you allowlist (default:
+`api.anthropic.com`, `api.openai.com`); everything else — OAuth, telemetry,
+package fetches — passes through as a plain TCP tunnel.
+
+Use this when:
+
+- You want to use **Claude Code with a Claude Pro/Max subscription** but still
+  apply the same PII redaction LocalAI applies to API-key traffic.
+- You run Codex CLI on a corporate laptop and need an audit trail of prompts.
+- You want LocalAI to enforce egress policies for AI traffic without
+  becoming the API endpoint clients talk to.
+
+The proxy is **off by default**. Operators opt in by setting `--mitm-listen`
+and distributing the generated CA cert.
+
+## How it works
+
+1. The proxy generates a private CA on first start (persisted to disk).
+2. Clients set `HTTPS_PROXY=http://localai:port` and add the CA to their
+   trust store (e.g. `NODE_EXTRA_CA_CERTS` for Node-based CLIs like Claude
+   Code and Codex).
+3. The CLI sends `CONNECT api.anthropic.com:443` to the proxy.
+4. For allowlisted hosts, the proxy mints a per-host leaf cert signed by
+   the CA, terminates TLS, parses the HTTP request, applies the global
+   PII redactor on `/v1/messages` or `/v1/chat/completions`, and forwards
+   to the real upstream over its own TLS connection.
+5. The streaming SSE response runs through the same `pii.StreamFilter`
+   the cloud-proxy backend uses.
+6. For non-allowlisted hosts, the proxy is a plain CONNECT tunnel — no
+   TLS termination, no inspection, no CA trust required.
+
+The CLI authenticates with its own subscription / API key as it normally
+would. LocalAI never holds the credential — it just observes and rewrites
+the request body.
+
+## Quick start
+
+Start LocalAI with the MITM listener:
+
+```bash
+local-ai run --mitm-listen :8443
+```
+
+The first start generates a CA at `<data-path>/mitm-ca/{ca.crt,ca.key}`.
+Restarting reloads the same CA so clients keep trusting it.
+
+Download the public CA cert:
+
+```bash
+curl -O http://localhost:8080/api/middleware/proxy-ca.crt
+```
+
+Configure Claude Code to use the proxy and trust the cert:
+
+```bash
+export HTTPS_PROXY=http://localhost:8443
+export NODE_EXTRA_CA_CERTS=$(pwd)/proxy-ca.crt
+claude
+```
+
+Now any `claude` chat session that touches `api.anthropic.com/v1/messages`
+gets its prompts and tool inputs scanned by LocalAI's PII filter, and any
+PII the model emits in its streaming response is masked before reaching
+your terminal. Events appear in the LocalAI middleware admin page under
+**Filtering → Recent events**.
+
+The same works for Codex CLI — set `HTTPS_PROXY` and `NODE_EXTRA_CA_CERTS`
+and run `codex`.
+
+## Configuration
+
+| Flag / env | Default | Purpose |
+|---|---|---|
+| `--mitm-listen` / `LOCALAI_MITM_LISTEN` | empty (disabled) | Address to bind the proxy listener on |
+| `--mitm-ca-dir` / `LOCALAI_MITM_CA_DIR` | `<data-path>/mitm-ca` | Where to persist the CA cert + key |
+| `--mitm-intercept-hosts` / `LOCALAI_MITM_INTERCEPT_HOSTS` | `api.anthropic.com,api.openai.com` | Hosts to terminate TLS for; everything else tunnels |
+
+Hostnames are case-insensitive. Add custom upstreams (e.g. an
+OpenAI-compatible third-party provider) by extending the allowlist and
+ensuring their endpoint paths match `/v1/chat/completions` or
+`/v1/messages`.
+
+## What gets redacted
+
+Same patterns the regular request middleware uses:
+
+- Email addresses → masked
+- Phone numbers → masked
+- US Social Security Numbers → masked
+- Credit card numbers (Luhn-verified) → masked
+- IPv4 addresses → masked
+- API key prefixes (`sk-`, `pk-`, `ghp_`, `github_pat_`, `xoxb-`) → **blocked**
+
+A `block` action returns HTTP 400 with `error.type=pii_blocked` to the
+client. The CLI sees the rejection and shows it as a request error.
+
+Events are persisted via the same `pii.EventStore` the rest of LocalAI
+uses, so the `/api/pii/events` endpoint and the middleware admin page
+include MITM events alongside direct-API events.
+
+## Security notes
+
+- **The CA private key is the master credential.** Anyone with read
+  access to `<data-path>/mitm-ca/ca.key` can forge TLS for any host the
+  proxy could intercept. The file is mode 0600; keep it that way.
+- The proxy listener accepts plaintext HTTP `CONNECT` requests — bind it
+  to localhost (`--mitm-listen 127.0.0.1:8443`) unless you've added auth
+  in front of the listener. There is no built-in API-key check on this
+  port.
+- The MITM CA is **separate** from any TLS cert LocalAI's main HTTP API
+  uses. Installing the MITM CA grants trust only for traffic that flows
+  through this proxy.
+- The proxy does not pin upstream certificates; it trusts the system
+  certificate store. If your machine's trust store is compromised, the
+  proxy is too.
+- TLS termination negotiates HTTP/2 by default (ALPN `h2`) and falls
+  back to HTTP/1.1 for clients that don't speak h2. Modern CLIs (Claude
+  Code, Codex) and the Anthropic / OpenAI APIs all use h2.
+
+## Limitations
+
+- **Only `/v1/messages` and `/v1/chat/completions` get redacted.** Other
+  paths on the same host (OAuth, model listing) are forwarded verbatim.
+- **No request-shape translation.** The proxy assumes the request body
+  matches the host's wire format; cross-shape forwarding is the cloud
+  proxy backend's job, not the MITM's.
+- **No CA rotation in the MVP.** To rotate, delete `ca.key` and `ca.crt`
+  and re-distribute the new cert to every client.
+- **Cert pinning kills MITM.** Neither Claude Code nor Codex CLI pins
+  certificates today, but a future SDK update could. If a CLI starts
+  refusing the proxied handshake, that's the signal.
+
+## Comparison with the cloud-proxy backend
+
+LocalAI ships two cloud-related proxy modes; pick by who holds the credential:
+
+|  | Cloud-proxy backend (`backend: proxy-*`) | MITM proxy (`--mitm-listen`) |
+|---|---|---|
+| Client config | `localai:8080` as **API endpoint** | `localai:8443` as **HTTPS_PROXY** |
+| Holds API key | LocalAI | Client (CLI's own auth) |
+| Works with subscription auth | No | Yes (CLI uses its own login) |
+| Request rewriting | Yes (handler controls it) | Yes (selective per host+path) |
+| CA cert distribution | Not needed | Required on every client |
+| Routes through LocalAI's auth/usage tracking | Yes | Yes (per-correlation-id events) |
+
+For shared deployments where LocalAI owns the API key and clients are
+unsophisticated (curl, simple webapps), use the cloud-proxy backend. For
+"give my Claude Code a privacy filter" use cases, use the MITM proxy.
diff --git a/gallery/bge-m3-colbert.yaml b/gallery/bge-m3-colbert.yaml
new file mode 100644
index 000000000..ff7c52634
--- /dev/null
+++ b/gallery/bge-m3-colbert.yaml
@@ -0,0 +1,11 @@
+---
+config_file: |
+  backend: rerankers
+  # `type: colbert` is forwarded to the rerankers backend as the
+  # `model_type` kwarg, selecting bge-m3's ColBERT (late-interaction
+  # multi-vector MaxSim) scoring head. Use this with the `colbert`
+  # router classifier — the classifier feeds policy descriptions as
+  # documents and reads off per-label MaxSim scores.
+  type: colbert
+  parameters:
+    model: BAAI/bge-m3
diff --git a/gallery/index.yaml b/gallery/index.yaml
index a37764bed..8cfb80587 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -23520,6 +23520,75 @@
     - python
   parameters:
     model: cross-encoder
+- name: bge-m3-colbert
+  url: github:mudler/LocalAI/gallery/bge-m3-colbert.yaml@master
+  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/1664511063789-632c234f42c386ebd2710434.png
+  urls:
+    - https://huggingface.co/BAAI/bge-m3
+  description: |
+    BAAI/bge-m3 loaded by the rerankers backend in ColBERT
+    (late-interaction MaxSim) mode. Pairs with the `colbert` router
+    classifier to score policy descriptions against the prompt
+    without an LLM round-trip — robust on abstract or short labels
+    where next-token scoring with Arch-Router-style models is noisy.
+  license: mit
+  tags:
+    - reranker
+    - colbert
+    - router
+    - gpu
+    - python
+  parameters:
+    model: bge-m3-colbert
+- &arch-router-1_5b
+  url: github:mudler/LocalAI/gallery/chatml.yaml@master
+  name: arch-router-1.5b-q4
+  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/66b681906c8d3b36786b764c/uyP7mxDVv0HbV9Hv_KfHk.jpeg
+  license: other
+  urls:
+    - https://huggingface.co/katanemo/Arch-Router-1.5B
+    - https://huggingface.co/mradermacher/Arch-Router-1.5B-GGUF
+  description: |
+    Arch-Router-1.5B is a compact router LLM from Katanemo, fine-tuned from
+    Qwen2.5-1.5B-Instruct. Given a prompt and a set of user-defined route
+    policies (domain + action), it picks the best-matching policy name so
+    requests can be dispatched to the appropriate downstream model. Designed
+    for low-latency, high-throughput use inside the Arch proxy, it pairs
+    with LocalAI's router classifier as a preference-aligned alternative to
+    embedding/ColBERT-based routing on concrete, well-described policies.
+  tags:
+    - llm
+    - gguf
+    - qwen
+    - qwen2.5
+    - 1.5b
+    - router
+    - cpu
+    - gpu
+  overrides:
+    # Replace the inherited [chat] usecase from chatml.yaml — Arch-Router
+    # is exclusively a router-classifier model, and chat+score conflict
+    # on llama-cpp (the score path races the llama_context against
+    # concurrent generation traffic; see model_config.go validation).
+    known_usecases:
+      - score
+    parameters:
+      model: Arch-Router-1.5B.Q4_K_M.gguf
+  files:
+    - filename: Arch-Router-1.5B.Q4_K_M.gguf
+      sha256: 9abe34414ebfe3921a1d157ed3ce8718e21e59a1f80693a33969a82ea40df636
+      uri: huggingface://mradermacher/Arch-Router-1.5B-GGUF/Arch-Router-1.5B.Q4_K_M.gguf
+- !!merge <<: *arch-router-1_5b
+  name: arch-router-1.5b-q8
+  overrides:
+    known_usecases:
+      - score
+    parameters:
+      model: Arch-Router-1.5B.Q8_0.gguf
+  files:
+    - filename: Arch-Router-1.5B.Q8_0.gguf
+      sha256: 236fcf372bb25f314dafa1605d84566db60ddad98b889aaa072a3108ec48ef22
+      uri: huggingface://mradermacher/Arch-Router-1.5B-GGUF/Arch-Router-1.5B.Q8_0.gguf
 - name: dolphin-2.9-llama3-8b
   url: github:mudler/LocalAI/gallery/hermes-2-pro-mistral.yaml@master
   urls:
diff --git a/go.mod b/go.mod
index ff5ee642e..12b74b2c4 100644
--- a/go.mod
+++ b/go.mod
@@ -330,7 +330,7 @@ require (
 	go.yaml.in/yaml/v2 v2.4.4
 	go.yaml.in/yaml/v3 v3.0.4 // indirect
 	golang.org/x/image v0.38.0 // indirect
-	golang.org/x/net v0.53.0 // indirect; indirect (for websocket)
+	golang.org/x/net v0.53.0 // indirect (for websocket)
 	golang.org/x/oauth2 v0.36.0
 	golang.org/x/telemetry v0.0.0-20260409153401-be6f6cb8b1fa // indirect
 	golang.org/x/time v0.14.0 // indirect
diff --git a/pkg/grpc/backend.go b/pkg/grpc/backend.go
index eaabea8ef..ead95d195 100644
--- a/pkg/grpc/backend.go
+++ b/pkg/grpc/backend.go
@@ -71,6 +71,10 @@ type Backend interface {
 
 	Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc.CallOption) (*pb.RerankResult, error)
 
+	TokenClassify(ctx context.Context, in *pb.TokenClassifyRequest, opts ...grpc.CallOption) (*pb.TokenClassifyResponse, error)
+
+	Score(ctx context.Context, in *pb.ScoreRequest, opts ...grpc.CallOption) (*pb.ScoreResponse, error)
+
 	GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opts ...grpc.CallOption) (*pb.MetricsResponse, error)
 
 	VAD(ctx context.Context, in *pb.VADRequest, opts ...grpc.CallOption) (*pb.VADResponse, error)
@@ -84,6 +88,13 @@ type Backend interface {
 	AudioTransformStream(ctx context.Context, opts ...grpc.CallOption) (AudioTransformStreamClient, error)
 	AudioToAudioStream(ctx context.Context, opts ...grpc.CallOption) (AudioToAudioStreamClient, error)
 
+	// Forward proxies a raw HTTP request to an upstream provider for
+	// passthrough-mode cloud-proxy backends. Caller streams a single
+	// ForwardRequest carrying path/method/headers/body, then closes
+	// send; backend streams back status/headers in the first reply
+	// and body chunks thereafter.
+	Forward(ctx context.Context, opts ...grpc.CallOption) (ForwardClient, error)
+
 	ModelMetadata(ctx context.Context, in *pb.ModelOptions, opts ...grpc.CallOption) (*pb.ModelMetadataResponse, error)
 
 	// Fine-tuning
diff --git a/pkg/grpc/base/base.go b/pkg/grpc/base/base.go
index 66f7a9a18..24417e4c2 100644
--- a/pkg/grpc/base/base.go
+++ b/pkg/grpc/base/base.go
@@ -163,6 +163,11 @@ func (llm *Base) AudioToAudioStream(in <-chan *pb.AudioToAudioRequest, out chan<
 	return fmt.Errorf("unimplemented")
 }
 
+func (llm *Base) Forward(ctx context.Context, in <-chan *pb.ForwardRequest, out chan<- *pb.ForwardReply) error {
+	close(out)
+	return fmt.Errorf("unimplemented")
+}
+
 func (llm *Base) StartFineTune(*pb.FineTuneRequest) (*pb.FineTuneJobResult, error) {
 	return nil, fmt.Errorf("unimplemented")
 }
diff --git a/pkg/grpc/client.go b/pkg/grpc/client.go
index 8360d2645..b6a148186 100644
--- a/pkg/grpc/client.go
+++ b/pkg/grpc/client.go
@@ -526,6 +526,42 @@ func (c *Client) Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc.
 	return client.Rerank(ctx, in, opts...)
 }
 
+func (c *Client) TokenClassify(ctx context.Context, in *pb.TokenClassifyRequest, opts ...grpc.CallOption) (*pb.TokenClassifyResponse, error) {
+	if !c.parallel {
+		c.opMutex.Lock()
+		defer c.opMutex.Unlock()
+	}
+	c.setBusy(true)
+	defer c.setBusy(false)
+	c.wdMark()
+	defer c.wdUnMark()
+	conn, err := c.dial()
+	if err != nil {
+		return nil, err
+	}
+	defer func() { _ = conn.Close() }()
+	client := pb.NewBackendClient(conn)
+	return client.TokenClassify(ctx, in, opts...)
+}
+
+func (c *Client) Score(ctx context.Context, in *pb.ScoreRequest, opts ...grpc.CallOption) (*pb.ScoreResponse, error) {
+	if !c.parallel {
+		c.opMutex.Lock()
+		defer c.opMutex.Unlock()
+	}
+	c.setBusy(true)
+	defer c.setBusy(false)
+	c.wdMark()
+	defer c.wdUnMark()
+	conn, err := c.dial()
+	if err != nil {
+		return nil, err
+	}
+	defer func() { _ = conn.Close() }()
+	client := pb.NewBackendClient(conn)
+	return client.Score(ctx, in, opts...)
+}
+
 func (c *Client) GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opts ...grpc.CallOption) (*pb.MetricsResponse, error) {
 	if !c.parallel {
 		c.opMutex.Lock()
@@ -742,6 +778,81 @@ func (c *Client) AudioTransform(ctx context.Context, in *pb.AudioTransformReques
 	return client.AudioTransform(ctx, in, opts...)
 }
 
+// ForwardClient is the duplex interface returned by (*Client).Forward.
+// First Send carries path/method/headers/body, subsequent Sends carry
+// body_chunk only. First Recv carries status/headers, subsequent Recvs
+// carry body_chunk. Caller closes via CloseSend when request is done;
+// stream ends when the upstream finishes and the server closes.
+type ForwardClient interface {
+	Send(*pb.ForwardRequest) error
+	Recv() (*pb.ForwardReply, error)
+	CloseSend() error
+	Context() context.Context
+}
+
+type forwardClient struct {
+	pb.Backend_ForwardClient
+	conn   *grpc.ClientConn
+	closer func()
+	once   sync.Once
+}
+
+// CloseSend signals end-of-requests to the server but keeps the
+// underlying connection open so the server can still send replies.
+// Connection cleanup happens when Recv returns a final error (EOF
+// or any other terminal status).
+func (s *forwardClient) CloseSend() error {
+	return s.Backend_ForwardClient.CloseSend()
+}
+
+// Recv wraps the embedded stream's Recv to fire the connection-level
+// closer once the stream ends. On EOF or any other error the
+// connection + operation-state cleanup runs exactly once.
+func (s *forwardClient) Recv() (*pb.ForwardReply, error) {
+	reply, err := s.Backend_ForwardClient.Recv()
+	if err != nil && s.closer != nil {
+		s.once.Do(s.closer)
+	}
+	return reply, err
+}
+
+func (c *Client) Forward(ctx context.Context, opts ...grpc.CallOption) (ForwardClient, error) {
+	if !c.parallel {
+		c.opMutex.Lock()
+	}
+	c.setBusy(true)
+	c.wdMark()
+
+	cleanup := func() {
+		c.wdUnMark()
+		c.setBusy(false)
+		if !c.parallel {
+			c.opMutex.Unlock()
+		}
+	}
+
+	conn, err := c.dial()
+	if err != nil {
+		cleanup()
+		return nil, err
+	}
+	client := pb.NewBackendClient(conn)
+	stream, err := client.Forward(ctx, opts...)
+	if err != nil {
+		_ = conn.Close()
+		cleanup()
+		return nil, err
+	}
+	return &forwardClient{
+		Backend_ForwardClient: stream,
+		conn:                  conn,
+		closer: func() {
+			_ = conn.Close()
+			cleanup()
+		},
+	}, nil
+}
+
 // AudioTransformStreamClient is the duplex interface returned by
 // (*Client).AudioTransformStream. Wraps the generated bidi client without
 // leaking the proto package across the public boundary.
diff --git a/pkg/grpc/embed.go b/pkg/grpc/embed.go
index 15d9615c8..b9f08ddb4 100644
--- a/pkg/grpc/embed.go
+++ b/pkg/grpc/embed.go
@@ -3,6 +3,7 @@ package grpc
 import (
 	"context"
 	"io"
+	"sync"
 
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"google.golang.org/grpc"
@@ -132,6 +133,14 @@ func (e *embedBackend) Rerank(ctx context.Context, in *pb.RerankRequest, opts ..
 	return e.s.Rerank(ctx, in)
 }
 
+func (e *embedBackend) TokenClassify(ctx context.Context, in *pb.TokenClassifyRequest, opts ...grpc.CallOption) (*pb.TokenClassifyResponse, error) {
+	return e.s.TokenClassify(ctx, in)
+}
+
+func (e *embedBackend) Score(ctx context.Context, in *pb.ScoreRequest, opts ...grpc.CallOption) (*pb.ScoreResponse, error) {
+	return e.s.Score(ctx, in)
+}
+
 func (e *embedBackend) VAD(ctx context.Context, in *pb.VADRequest, opts ...grpc.CallOption) (*pb.VADResponse, error) {
 	return e.s.VAD(ctx, in)
 }
@@ -181,6 +190,27 @@ func (e *embedBackend) AudioTransformStream(ctx context.Context, opts ...grpc.Ca
 	}, nil
 }
 
+func (e *embedBackend) Forward(ctx context.Context, opts ...grpc.CallOption) (ForwardClient, error) {
+	reqs := make(chan *pb.ForwardRequest, 8)
+	resps := make(chan *pb.ForwardReply, 8)
+	srvDone := make(chan error, 1)
+
+	server := &embedBackendForwardStream{ctx: ctx, reqs: reqs, resps: resps}
+
+	go func() {
+		err := e.s.Forward(server)
+		close(resps)
+		srvDone <- err
+	}()
+
+	return &embedBackendForwardStreamClient{
+		ctx:     ctx,
+		reqs:    reqs,
+		resps:   resps,
+		srvDone: srvDone,
+	}, nil
+}
+
 func (e *embedBackend) AudioToAudioStream(ctx context.Context, opts ...grpc.CallOption) (AudioToAudioStreamClient, error) {
 	reqs := make(chan *pb.AudioToAudioRequest, 8)
 	resps := make(chan *pb.AudioToAudioResponse, 8)
@@ -601,3 +631,94 @@ func (e *embedBackendServerStream) SendMsg(m any) error {
 func (e *embedBackendServerStream) RecvMsg(m any) error {
 	return nil
 }
+
+var _ pb.Backend_ForwardServer = new(embedBackendForwardStream)
+var _ ForwardClient = new(embedBackendForwardStreamClient)
+
+// embedBackendForwardStream is the server-side handle for an in-process
+// Forward bidi stream. The hosted backend reads requests from `reqs`
+// (closed by the client when done sending) and writes replies to
+// `resps`.
+type embedBackendForwardStream struct {
+	ctx   context.Context
+	reqs  <-chan *pb.ForwardRequest
+	resps chan<- *pb.ForwardReply
+}
+
+func (e *embedBackendForwardStream) Send(resp *pb.ForwardReply) error {
+	select {
+	case e.resps <- resp:
+		return nil
+	case <-e.ctx.Done():
+		return e.ctx.Err()
+	}
+}
+
+func (e *embedBackendForwardStream) Recv() (*pb.ForwardRequest, error) {
+	select {
+	case req, ok := <-e.reqs:
+		if !ok {
+			return nil, io.EOF
+		}
+		return req, nil
+	case <-e.ctx.Done():
+		return nil, e.ctx.Err()
+	}
+}
+
+func (e *embedBackendForwardStream) SetHeader(md metadata.MD) error  { return nil }
+func (e *embedBackendForwardStream) SendHeader(md metadata.MD) error { return nil }
+func (e *embedBackendForwardStream) SetTrailer(md metadata.MD)       {}
+func (e *embedBackendForwardStream) Context() context.Context        { return e.ctx }
+func (e *embedBackendForwardStream) SendMsg(m any) error {
+	if x, ok := m.(*pb.ForwardReply); ok {
+		return e.Send(x)
+	}
+	return nil
+}
+func (e *embedBackendForwardStream) RecvMsg(m any) error { return nil }
+
+// embedBackendForwardStreamClient is the caller-facing side. Mirrors
+// the server-side stream over the same channels.
+type embedBackendForwardStreamClient struct {
+	ctx     context.Context
+	reqs    chan<- *pb.ForwardRequest
+	resps   <-chan *pb.ForwardReply
+	srvDone <-chan error
+	once    sync.Once
+}
+
+func (e *embedBackendForwardStreamClient) Send(req *pb.ForwardRequest) error {
+	select {
+	case e.reqs <- req:
+		return nil
+	case <-e.ctx.Done():
+		return e.ctx.Err()
+	}
+}
+
+func (e *embedBackendForwardStreamClient) Recv() (*pb.ForwardReply, error) {
+	select {
+	case resp, ok := <-e.resps:
+		if !ok {
+			select {
+			case err := <-e.srvDone:
+				if err != nil {
+					return nil, err
+				}
+			default:
+			}
+			return nil, io.EOF
+		}
+		return resp, nil
+	case <-e.ctx.Done():
+		return nil, e.ctx.Err()
+	}
+}
+
+func (e *embedBackendForwardStreamClient) CloseSend() error {
+	e.once.Do(func() { close(e.reqs) })
+	return nil
+}
+
+func (e *embedBackendForwardStreamClient) Context() context.Context { return e.ctx }
diff --git a/pkg/grpc/forward_test.go b/pkg/grpc/forward_test.go
new file mode 100644
index 000000000..b3dec80a0
--- /dev/null
+++ b/pkg/grpc/forward_test.go
@@ -0,0 +1,94 @@
+package grpc
+
+import (
+	"context"
+	"errors"
+	"io"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// echoForwardModel is a minimal AIModel that just echoes Forward
+// requests back as replies — used to exercise the in-process bidi
+// plumbing without standing up a real HTTP upstream.
+type echoForwardModel struct {
+	base.SingleThread
+}
+
+func (m *echoForwardModel) Forward(_ context.Context, in <-chan *pb.ForwardRequest, out chan<- *pb.ForwardReply) error {
+	defer close(out)
+	first := true
+	for req := range in {
+		if first {
+			out <- &pb.ForwardReply{
+				Status:  200,
+				Headers: []*pb.ForwardHeader{{Name: "Content-Type", Value: "text/event-stream"}},
+			}
+			first = false
+		}
+		out <- &pb.ForwardReply{BodyChunk: req.BodyChunk}
+	}
+	return nil
+}
+
+var _ = Describe("Forward RPC (in-process)", func() {
+	It("round-trips status, headers, and body chunks", func() {
+		// Provide registers an AIModel under a virtual address so
+		// NewClient routes via the in-process embedBackend instead of
+		// dialing a real socket.
+		addr := "test://forward-echo"
+		Provide(addr, &echoForwardModel{})
+		c := NewClient(addr, true, nil, false)
+
+		stream, err := c.Forward(context.Background())
+		Expect(err).NotTo(HaveOccurred())
+
+		// One initial request carrying path/method/headers, then two body chunks.
+		Expect(stream.Send(&pb.ForwardRequest{
+			Path:      "/v1/chat/completions",
+			Method:    "POST",
+			Headers:   []*pb.ForwardHeader{{Name: "Authorization", Value: "Bearer x"}},
+			BodyChunk: []byte(`{"hello":`),
+		})).To(Succeed())
+		Expect(stream.Send(&pb.ForwardRequest{BodyChunk: []byte(`"world"}`)})).To(Succeed())
+		Expect(stream.CloseSend()).To(Succeed())
+
+		// First reply carries status + headers.
+		first, err := stream.Recv()
+		Expect(err).NotTo(HaveOccurred())
+		Expect(first.Status).To(Equal(int32(200)))
+		Expect(first.Headers).To(HaveLen(1))
+		Expect(first.Headers[0].Name).To(Equal("Content-Type"))
+
+		// Body echoes back, one reply per request chunk.
+		var body []byte
+		for {
+			r, err := stream.Recv()
+			if errors.Is(err, io.EOF) {
+				break
+			}
+			Expect(err).NotTo(HaveOccurred())
+			body = append(body, r.BodyChunk...)
+		}
+		Expect(string(body)).To(Equal(`{"hello":"world"}`))
+	})
+
+	It("UnimplementedBase returns an error on Forward", func() {
+		// The default base.Base.Forward returns "unimplemented" — any
+		// backend that doesn't opt in should surface that to callers
+		// rather than silently succeed.
+		addr := "test://forward-base"
+		Provide(addr, &base.SingleThread{})
+		c := NewClient(addr, true, nil, false)
+
+		stream, err := c.Forward(context.Background())
+		Expect(err).NotTo(HaveOccurred())
+		Expect(stream.CloseSend()).To(Succeed())
+
+		_, err = stream.Recv()
+		Expect(err).To(HaveOccurred())
+	})
+})
diff --git a/pkg/grpc/integration_toolcalls_test.go b/pkg/grpc/integration_toolcalls_test.go
new file mode 100644
index 000000000..3011bebe3
--- /dev/null
+++ b/pkg/grpc/integration_toolcalls_test.go
@@ -0,0 +1,147 @@
+package grpc
+
+import (
+	"context"
+	"strings"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// toolCallStreamer simulates what a cloud-proxy translate backend
+// emits: a sequence of *pb.Reply chunks carrying content + tool_call
+// deltas + final usage tokens. The replies are sent in the same order
+// and shape the cloud-proxy OpenAI translator produces.
+type toolCallStreamer struct {
+	base.SingleThread
+}
+
+func (*toolCallStreamer) Predict(*pb.PredictOptions) (string, error) {
+	return "", nil
+}
+
+func (*toolCallStreamer) PredictStream(*pb.PredictOptions, chan string) error {
+	return nil
+}
+
+func (*toolCallStreamer) PredictRich(*pb.PredictOptions) (*pb.Reply, error) {
+	return &pb.Reply{
+		Message:      []byte("done"),
+		PromptTokens: 11,
+		Tokens:       4,
+		ChatDeltas: []*pb.ChatDelta{{
+			ToolCalls: []*pb.ToolCallDelta{{
+				Index: 0, Id: "call_finalize", Name: "submit", Arguments: `{"ok":true}`,
+			}},
+		}},
+	}, nil
+}
+
+func (*toolCallStreamer) PredictStreamRich(_ *pb.PredictOptions, out chan<- *pb.Reply) error {
+	// Chunk 1: opening text delta.
+	out <- &pb.Reply{
+		Message:    []byte("Looking up "),
+		ChatDeltas: []*pb.ChatDelta{{Content: "Looking up "}},
+	}
+	// Chunk 2: tool call announcement (id + name).
+	out <- &pb.Reply{
+		ChatDeltas: []*pb.ChatDelta{{
+			ToolCalls: []*pb.ToolCallDelta{{
+				Index: 0, Id: "call_x", Name: "search",
+			}},
+		}},
+	}
+	// Chunks 3-4: argument fragments (consumer concatenates by index).
+	out <- &pb.Reply{
+		ChatDeltas: []*pb.ChatDelta{{
+			ToolCalls: []*pb.ToolCallDelta{{
+				Index: 0, Arguments: `{"q":"`,
+			}},
+		}},
+	}
+	out <- &pb.Reply{
+		ChatDeltas: []*pb.ChatDelta{{
+			ToolCalls: []*pb.ToolCallDelta{{
+				Index: 0, Arguments: `weather"}`,
+			}},
+		}},
+	}
+	// Chunk 5: usage tokens (final chunk pattern from OpenAI stream).
+	out <- &pb.Reply{Tokens: 17}
+	return nil
+}
+
+var _ AIModelRich = &toolCallStreamer{}
+
+var _ = Describe("Cloud-proxy translate-mode integration (gRPC + tool calls)", func() {
+	// This test simulates what the OpenAI chat endpoint does after
+	// ModelInference returns: it walks the per-chunk TokenUsage.ChatDeltas
+	// and assembles tool calls indexed by ToolCallDelta.Index. Verifies
+	// that the rich gRPC path delivers everything the consumer needs.
+	It("delivers tool-call deltas through PredictStream end-to-end", func() {
+		addr := "test://translate-integration-stream"
+		Provide(addr, &toolCallStreamer{})
+		c := NewClient(addr, true, nil, false)
+
+		type accumulator struct {
+			text   strings.Builder
+			toolID string
+			name   string
+			args   strings.Builder
+			tokens int32
+		}
+		var acc accumulator
+
+		err := c.PredictStream(context.Background(), &pb.PredictOptions{}, func(reply *pb.Reply) {
+			if msg := reply.GetMessage(); len(msg) > 0 {
+				acc.text.Write(msg)
+			}
+			if reply.GetTokens() > 0 && len(reply.GetChatDeltas()) == 0 {
+				acc.tokens = reply.GetTokens()
+				return
+			}
+			for _, cd := range reply.GetChatDeltas() {
+				for _, tc := range cd.GetToolCalls() {
+					if tc.GetId() != "" {
+						acc.toolID = tc.GetId()
+					}
+					if tc.GetName() != "" {
+						acc.name = tc.GetName()
+					}
+					acc.args.WriteString(tc.GetArguments())
+				}
+			}
+		})
+		Expect(err).NotTo(HaveOccurred())
+
+		// Text content survived the wire.
+		Expect(acc.text.String()).To(Equal("Looking up "))
+		// Tool call id + name landed on the first announcing chunk.
+		Expect(acc.toolID).To(Equal("call_x"))
+		Expect(acc.name).To(Equal("search"))
+		// Argument fragments assembled in order.
+		Expect(acc.args.String()).To(Equal(`{"q":"weather"}`))
+		// Final usage frame propagated.
+		Expect(acc.tokens).To(BeEquivalentTo(17))
+	})
+
+	It("delivers complete tool-call results through non-streaming Predict", func() {
+		addr := "test://translate-integration-predict"
+		Provide(addr, &toolCallStreamer{})
+		c := NewClient(addr, true, nil, false)
+
+		reply, err := c.Predict(context.Background(), &pb.PredictOptions{})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(string(reply.GetMessage())).To(Equal("done"))
+		Expect(reply.GetPromptTokens()).To(BeEquivalentTo(11))
+		Expect(reply.GetTokens()).To(BeEquivalentTo(4))
+		Expect(reply.GetChatDeltas()).To(HaveLen(1))
+		tcs := reply.GetChatDeltas()[0].GetToolCalls()
+		Expect(tcs).To(HaveLen(1))
+		Expect(tcs[0].GetId()).To(Equal("call_finalize"))
+		Expect(tcs[0].GetName()).To(Equal("submit"))
+		Expect(tcs[0].GetArguments()).To(Equal(`{"ok":true}`))
+	})
+})
diff --git a/pkg/grpc/interface.go b/pkg/grpc/interface.go
index bce3f689c..31b9ab26d 100644
--- a/pkg/grpc/interface.go
+++ b/pkg/grpc/interface.go
@@ -47,6 +47,12 @@ type AIModel interface {
 	AudioTransformStream(in <-chan *pb.AudioTransformFrameRequest, out chan<- *pb.AudioTransformFrameResponse) error
 	AudioToAudioStream(in <-chan *pb.AudioToAudioRequest, out chan<- *pb.AudioToAudioResponse) error
 
+	// Forward proxies a raw HTTP request to an upstream provider for
+	// passthrough-mode cloud-proxy backends. ctx is the gRPC stream
+	// context — cancellation propagates to the upstream HTTP request
+	// so client disconnect closes the upstream connection.
+	Forward(ctx context.Context, in <-chan *pb.ForwardRequest, out chan<- *pb.ForwardReply) error
+
 	ModelMetadata(*pb.ModelOptions) (*pb.ModelMetadataResponse, error)
 
 	// Fine-tuning
@@ -65,3 +71,22 @@ type AIModel interface {
 func newReply(s string) *pb.Reply {
 	return &pb.Reply{Message: []byte(s)}
 }
+
+// AIModelRich is an optional extension to AIModel for backends that
+// can produce a full *pb.Reply — including tool-call deltas and
+// usage tokens — rather than just a content string. The gRPC server
+// type-asserts and prefers the rich path when implemented; otherwise
+// it wraps Predict's string return in a Reply.
+//
+// Cloud-proxy translate mode is the motivating use case: the upstream
+// emits structured tool_calls that would be lost through the legacy
+// (string, error) signature.
+//
+// PredictStreamRich contract: send replies into the channel and
+// return when finished. Do NOT close the channel — the server closes
+// it after the call returns. This is opposite to legacy PredictStream
+// which expects the impl to defer close().
+type AIModelRich interface {
+	PredictRich(*pb.PredictOptions) (*pb.Reply, error)
+	PredictStreamRich(*pb.PredictOptions, chan<- *pb.Reply) error
+}
diff --git a/pkg/grpc/rich_test.go b/pkg/grpc/rich_test.go
new file mode 100644
index 000000000..096de8045
--- /dev/null
+++ b/pkg/grpc/rich_test.go
@@ -0,0 +1,129 @@
+package grpc
+
+import (
+	"context"
+	"errors"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// richBackend implements AIModel + AIModelRich. The legacy methods
+// return scripted errors so a test that touches them by accident
+// (instead of taking the rich path) fails loudly rather than silently
+// returning empty content.
+type richBackend struct {
+	base.SingleThread
+
+	predictRich       func(*pb.PredictOptions) (*pb.Reply, error)
+	predictStreamRich func(*pb.PredictOptions, chan<- *pb.Reply) error
+}
+
+func (r *richBackend) Predict(*pb.PredictOptions) (string, error) {
+	return "", errors.New("richBackend: legacy Predict should not have been called")
+}
+
+func (r *richBackend) PredictStream(*pb.PredictOptions, chan string) error {
+	return errors.New("richBackend: legacy PredictStream should not have been called")
+}
+
+func (r *richBackend) PredictRich(opts *pb.PredictOptions) (*pb.Reply, error) {
+	return r.predictRich(opts)
+}
+
+func (r *richBackend) PredictStreamRich(opts *pb.PredictOptions, out chan<- *pb.Reply) error {
+	return r.predictStreamRich(opts, out)
+}
+
+var _ AIModelRich = (*richBackend)(nil)
+
+var _ = Describe("AIModelRich dispatch", func() {
+	It("server.Predict routes through PredictRich when implemented", func() {
+		addr := "test://rich-predict"
+		Provide(addr, &richBackend{
+			predictRich: func(*pb.PredictOptions) (*pb.Reply, error) {
+				return &pb.Reply{
+					Message:      []byte("hello"),
+					PromptTokens: 5,
+					Tokens:       7,
+					ChatDeltas: []*pb.ChatDelta{{
+						ToolCalls: []*pb.ToolCallDelta{{
+							Index: 0, Id: "call_1", Name: "ping", Arguments: "{}",
+						}},
+					}},
+				}, nil
+			},
+		})
+		c := NewClient(addr, true, nil, false)
+
+		reply, err := c.Predict(context.Background(), &pb.PredictOptions{})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(string(reply.GetMessage())).To(Equal("hello"))
+		// Rich fields survive the RPC marshal/unmarshal — proves the
+		// server used PredictRich, not the legacy (string, error)
+		// wrapper which would have lost everything except Message.
+		Expect(reply.GetPromptTokens()).To(BeEquivalentTo(5))
+		Expect(reply.GetTokens()).To(BeEquivalentTo(7))
+		Expect(reply.GetChatDeltas()).To(HaveLen(1))
+		Expect(reply.GetChatDeltas()[0].GetToolCalls()).To(HaveLen(1))
+		Expect(reply.GetChatDeltas()[0].GetToolCalls()[0].GetName()).To(Equal("ping"))
+	})
+
+	It("server.PredictStream routes through PredictStreamRich when implemented", func() {
+		addr := "test://rich-stream"
+		Provide(addr, &richBackend{
+			predictStreamRich: func(_ *pb.PredictOptions, out chan<- *pb.Reply) error {
+				out <- &pb.Reply{
+					Message:    []byte("hi"),
+					ChatDeltas: []*pb.ChatDelta{{Content: "hi"}},
+				}
+				out <- &pb.Reply{
+					ChatDeltas: []*pb.ChatDelta{{ToolCalls: []*pb.ToolCallDelta{{
+						Index: 0, Id: "call_x", Name: "search",
+					}}}},
+				}
+				out <- &pb.Reply{Tokens: 9}
+				return nil
+			},
+		})
+		c := NewClient(addr, true, nil, false)
+
+		var collected []*pb.Reply
+		err := c.PredictStream(context.Background(), &pb.PredictOptions{}, func(r *pb.Reply) {
+			collected = append(collected, r)
+		})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(collected).To(HaveLen(3))
+		Expect(string(collected[0].GetMessage())).To(Equal("hi"))
+		Expect(collected[1].GetChatDeltas()).To(HaveLen(1))
+		Expect(collected[1].GetChatDeltas()[0].GetToolCalls()).To(HaveLen(1))
+		Expect(collected[2].GetTokens()).To(BeEquivalentTo(9))
+	})
+
+	It("falls back to legacy Predict when AIModelRich is not implemented", func() {
+		// Use a non-Rich model (just base.SingleThread embedded in a
+		// minimal wrapper). The legacy wrapper path stringifies the
+		// reply, so ChatDeltas are lost — the fallback is the contract
+		// for backends that haven't migrated.
+		addr := "test://legacy-predict"
+		Provide(addr, &legacyOnlyBackend{response: "legacy hello"})
+		c := NewClient(addr, true, nil, false)
+
+		reply, err := c.Predict(context.Background(), &pb.PredictOptions{})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(string(reply.GetMessage())).To(Equal("legacy hello"))
+		Expect(reply.GetChatDeltas()).To(BeEmpty())
+	})
+})
+
+// legacyOnlyBackend implements AIModel but NOT AIModelRich.
+type legacyOnlyBackend struct {
+	base.SingleThread
+	response string
+}
+
+func (l *legacyOnlyBackend) Predict(*pb.PredictOptions) (string, error) {
+	return l.response, nil
+}
diff --git a/pkg/grpc/server.go b/pkg/grpc/server.go
index 4eaa71297..5be668497 100644
--- a/pkg/grpc/server.go
+++ b/pkg/grpc/server.go
@@ -68,6 +68,9 @@ func (s *server) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.Reply,
 		s.llm.Lock()
 		defer s.llm.Unlock()
 	}
+	if rich, ok := s.llm.(AIModelRich); ok {
+		return rich.PredictRich(in)
+	}
 	result, err := s.llm.Predict(in)
 	return newReply(result), err
 }
@@ -271,8 +274,30 @@ func (s *server) PredictStream(in *pb.PredictOptions, stream pb.Backend_PredictS
 		s.llm.Lock()
 		defer s.llm.Unlock()
 	}
-	resultChan := make(chan string)
 
+	if rich, ok := s.llm.(AIModelRich); ok {
+		replyChan := make(chan *pb.Reply)
+		done := make(chan bool)
+		go func() {
+			for reply := range replyChan {
+				// Send errors here mean the client disconnected;
+				// drain the rest of the channel so the producer
+				// (PredictStreamRich) doesn't block on the next
+				// reply forever.
+				_ = stream.Send(reply)
+			}
+			done <- true
+		}()
+		// Server-side close: PredictStreamRich implementations send into
+		// the channel and return when finished; closing is the host's
+		// concern so impls don't have to remember `defer close(...)`.
+		err := rich.PredictStreamRich(in, replyChan)
+		close(replyChan)
+		<-done
+		return err
+	}
+
+	resultChan := make(chan string)
 	done := make(chan bool)
 	go func() {
 		for result := range resultChan {
@@ -547,6 +572,69 @@ func (s *server) AudioToAudioStream(stream pb.Backend_AudioToAudioStreamServer)
 	return recvErr
 }
 
+// Forward is the bidi-stream handler for the cloud-proxy backend's
+// passthrough mode. Same recv→in / out→send goroutine idiom as
+// AudioTransformStream / AudioToAudioStream above. Buffer size 8 to
+// keep SSE token streams flowing — at 4, a half-RTT slow gRPC client
+// makes the body-read goroutine in the backend block on out<- after
+// every few token frames.
+func (s *server) Forward(stream pb.Backend_ForwardServer) error {
+	if s.llm.Locking() {
+		s.llm.Lock()
+		defer s.llm.Unlock()
+	}
+
+	in := make(chan *pb.ForwardRequest, 8)
+	out := make(chan *pb.ForwardReply, 8)
+
+	recvErrCh := make(chan error, 1)
+	go func() {
+		defer close(in)
+		for {
+			req, err := stream.Recv()
+			if err != nil {
+				if errors.Is(err, io.EOF) {
+					recvErrCh <- nil
+					return
+				}
+				recvErrCh <- err
+				return
+			}
+			select {
+			case in <- req:
+			case <-stream.Context().Done():
+				recvErrCh <- stream.Context().Err()
+				return
+			}
+		}
+	}()
+
+	sendDone := make(chan error, 1)
+	go func() {
+		for resp := range out {
+			if err := stream.Send(resp); err != nil {
+				sendDone <- err
+				for range out {
+				}
+				return
+			}
+		}
+		sendDone <- nil
+	}()
+
+	backendErr := s.llm.Forward(stream.Context(), in, out)
+	sendErr := <-sendDone
+	recvErr := <-recvErrCh
+
+	if backendErr != nil {
+		return backendErr
+	}
+	if sendErr != nil {
+		return sendErr
+	}
+	return recvErr
+}
+
 func (s *server) StartFineTune(ctx context.Context, in *pb.FineTuneRequest) (*pb.FineTuneJobResult, error) {
 	if s.llm.Locking() {
 		s.llm.Lock()
diff --git a/pkg/mcp/localaitools/client.go b/pkg/mcp/localaitools/client.go
index ac77e789b..60090d63a 100644
--- a/pkg/mcp/localaitools/client.go
+++ b/pkg/mcp/localaitools/client.go
@@ -67,4 +67,43 @@ type LocalAIClient interface {
 	// SetBranding updates the text branding fields. Asset uploads are not
 	// exposed over MCP — admins use the Settings UI for binary files.
 	SetBranding(ctx context.Context, req SetBrandingRequest) (*Branding, error)
+
+	// ---- Usage / billing ----
+
+	// GetUsageStats returns aggregated token usage. In single-user
+	// no-auth mode this reports the synthetic local user's usage. The
+	// implementation enforces "admin required to query other users".
+	GetUsageStats(ctx context.Context, q UsageStatsQuery) (*UsageStats, error)
+
+	// ---- PII filter ----
+	// ListPIIPatterns returns the active PII pattern set with each
+	// one's action.
+	ListPIIPatterns(ctx context.Context) ([]PIIPattern, error)
+	// GetPIIEvents returns recent redaction events. Implementation
+	// enforces "admin required" when auth is on.
+	GetPIIEvents(ctx context.Context, q PIIEventsQuery) ([]PIIEvent, error)
+	// TestPIIRedaction dry-runs the redactor against text. No event
+	// is recorded.
+	TestPIIRedaction(ctx context.Context, req PIIRedactTestRequest) (*PIIRedactTestResult, error)
+	// SetPIIPatternAction mutates the named pattern's action and/or
+	// disabled state in-process. Transient until PersistPIIPatterns is
+	// called — runtime_settings.json then applies the deltas on the
+	// next start. Admin-required.
+	SetPIIPatternAction(ctx context.Context, req PIIPatternActionUpdate) error
+
+	// PersistPIIPatterns snapshots the live redactor's per-pattern
+	// (action, disabled) state into runtime_settings.json. Admin-required.
+	PersistPIIPatterns(ctx context.Context) error
+
+	// ---- Middleware admin ----
+	// GetMiddlewareStatus returns the aggregated state surfaced on the
+	// /app/middleware page: active PII patterns, per-model resolved
+	// enabled state, recent event count, router placeholder.
+	GetMiddlewareStatus(ctx context.Context) (*MiddlewareStatus, error)
+
+	// ---- Router (intelligent routing) ----
+	// GetRouterDecisions returns recent routing decisions for the
+	// /app/middleware Routing tab and for agent-driven introspection.
+	// Admin-required when auth is on.
+	GetRouterDecisions(ctx context.Context, q RouterDecisionsQuery) ([]RouterDecision, error)
 }
diff --git a/pkg/mcp/localaitools/coverage_test.go b/pkg/mcp/localaitools/coverage_test.go
index d8054ae04..8159afcf9 100644
--- a/pkg/mcp/localaitools/coverage_test.go
+++ b/pkg/mcp/localaitools/coverage_test.go
@@ -37,18 +37,26 @@ var toolToHTTPRoute = map[string]string{
 	ToolListNodes:           "GET /api/nodes",
 	ToolVRAMEstimate:        "POST /api/models/vram-estimate",
 	ToolGetBranding:         "GET /api/branding",
+	ToolGetUsageStats:       "GET /api/usage (or /api/usage/all when all=true)",
+	ToolListPIIPatterns:     "GET /api/pii/patterns",
+	ToolGetPIIEvents:        "GET /api/pii/events",
+	ToolTestPIIRedaction:    "POST /api/pii/test",
+	ToolGetMiddlewareStatus: "GET /api/middleware/status",
+	ToolGetRouterDecisions:  "GET /api/router/decisions",
 
 	// Mutating tools.
-	ToolInstallModel:      "POST /models/apply",
-	ToolImportModelURI:    "POST /models/import-uri",
-	ToolDeleteModel:       "POST /models/delete/:name",
-	ToolEditModelConfig:   "PATCH /api/models/config-json/:name",
-	ToolReloadModels:      "POST /models/reload",
-	ToolInstallBackend:    "POST /backends/apply",
-	ToolUpgradeBackend:    "POST /backends/upgrade/:name",
-	ToolToggleModelState:  "PUT /models/toggle-state/:name/:action",
-	ToolToggleModelPinned: "PUT /models/toggle-pinned/:name/:action",
-	ToolSetBranding:       "POST /api/settings (instance_name, instance_tagline)",
+	ToolInstallModel:        "POST /models/apply",
+	ToolImportModelURI:      "POST /models/import-uri",
+	ToolDeleteModel:         "POST /models/delete/:name",
+	ToolEditModelConfig:     "PATCH /api/models/config-json/:name",
+	ToolReloadModels:        "POST /models/reload",
+	ToolInstallBackend:      "POST /backends/apply",
+	ToolUpgradeBackend:      "POST /backends/upgrade/:name",
+	ToolToggleModelState:    "PUT /models/toggle-state/:name/:action",
+	ToolToggleModelPinned:   "PUT /models/toggle-pinned/:name/:action",
+	ToolSetBranding:         "POST /api/settings (instance_name, instance_tagline)",
+	ToolSetPIIPatternAction: "PUT /api/pii/patterns/:id",
+	ToolPersistPIIPatterns:  "POST /api/pii/patterns/persist",
 }
 
 // allKnownTools is the union of expectedFullCatalog (defined in
diff --git a/pkg/mcp/localaitools/dto.go b/pkg/mcp/localaitools/dto.go
index 4816d6d09..85136c60a 100644
--- a/pkg/mcp/localaitools/dto.go
+++ b/pkg/mcp/localaitools/dto.go
@@ -137,6 +137,179 @@ type SetBrandingRequest struct {
 	InstanceTagline *string `json:"instance_tagline,omitempty" jsonschema:"Optional short subtitle shown beneath the instance name. Pass an empty string to clear."`
 }
 
+// UsageStatsQuery is the input for get_usage_stats. UserID is optional;
+// when empty the tool returns the calling user's own usage in auth-on
+// mode, or the synthetic local user's usage in single-user no-auth
+// mode. Admins (or the local user) may pass UserID to inspect another
+// user; the LocalAIClient implementation enforces the role check.
+type UsageStatsQuery struct {
+	Period string `json:"period,omitempty" jsonschema:"Time window. One of: day, week, month, all. Defaults to month."`
+	UserID string `json:"user_id,omitempty" jsonschema:"Optional user id to query. Empty = caller's own usage. Querying another user requires admin role."`
+	All    bool   `json:"all,omitempty"     jsonschema:"When true, returns the cluster-wide /api/usage/all view (admin-only when auth is on)."`
+}
+
+// UsageStats is the response shape for get_usage_stats. Mirrors what
+// /api/usage and /api/usage/all return so the LLM can correlate
+// dashboard numbers with what it pulls via MCP.
+type UsageStats struct {
+	Viewer  UsageViewer   `json:"viewer"`
+	Period  string        `json:"period"`
+	Totals  UsageTotals   `json:"totals"`
+	Buckets []UsageBucket `json:"buckets"`
+}
+
+type UsageViewer struct {
+	ID   string `json:"id"`
+	Name string `json:"name"`
+	Role string `json:"role,omitempty"`
+}
+
+type UsageTotals struct {
+	PromptTokens     int64 `json:"prompt_tokens"`
+	CompletionTokens int64 `json:"completion_tokens"`
+	TotalTokens      int64 `json:"total_tokens"`
+	RequestCount     int64 `json:"request_count"`
+}
+
+type UsageBucket struct {
+	Bucket           string `json:"bucket"`
+	Model            string `json:"model"`
+	UserID           string `json:"user_id,omitempty"`
+	UserName         string `json:"user_name,omitempty"`
+	PromptTokens     int64  `json:"prompt_tokens"`
+	CompletionTokens int64  `json:"completion_tokens"`
+	TotalTokens      int64  `json:"total_tokens"`
+	RequestCount     int64  `json:"request_count"`
+}
+
+// ---- PII / sensitive data tools ----
+
+// PIIPattern is one row in the list_pii_patterns response.
+type PIIPattern struct {
+	ID             string `json:"id"`
+	Description    string `json:"description"`
+	Action         string `json:"action"` // mask | block | route_local
+	MaxMatchLength int    `json:"max_match_length"`
+}
+
+// PIIEventsQuery filters get_pii_events.
+type PIIEventsQuery struct {
+	CorrelationID string `json:"correlation_id,omitempty" jsonschema:"Optional X-Correlation-ID join key (binds events to the request and usage record)."`
+	UserID        string `json:"user_id,omitempty"        jsonschema:"Optional user id to scope the query."`
+	PatternID     string `json:"pattern_id,omitempty"     jsonschema:"Optional pattern id (e.g. email, ssn)."`
+	Limit         int    `json:"limit,omitempty"          jsonschema:"Maximum events. Defaults to 100."`
+}
+
+// PIIEvent is the LLM-facing view of one redaction record. The matched
+// value is never exposed; admins audit by hash_prefix.
+type PIIEvent struct {
+	ID            string `json:"id"`
+	CorrelationID string `json:"correlation_id"`
+	UserID        string `json:"user_id"`
+	Direction     string `json:"direction"`
+	PatternID     string `json:"pattern_id"`
+	ByteOffset    int    `json:"byte_offset"`
+	Length        int    `json:"length"`
+	HashPrefix    string `json:"hash_prefix"`
+	Action        string `json:"action"`
+	CreatedAt     string `json:"created_at"`
+}
+
+// PIIRedactTestRequest is the input for test_pii_redaction.
+type PIIRedactTestRequest struct {
+	Text string `json:"text" jsonschema:"The candidate text. Will be run through the redactor without recording an event."`
+}
+
+// PIIRedactTestResult is the output for test_pii_redaction. spans
+// describes where the redactor matched; redacted is the text after
+// applying mask actions; blocked / local_only flag stronger actions.
+type PIIRedactTestResult struct {
+	Redacted  string        `json:"redacted"`
+	Spans     []PIIEventSpan `json:"spans"`
+	Blocked   bool          `json:"blocked"`
+	LocalOnly bool          `json:"local_only"`
+}
+
+type PIIEventSpan struct {
+	Start      int    `json:"start"`
+	End        int    `json:"end"`
+	Pattern    string `json:"pattern"`
+	HashPrefix string `json:"hash_prefix"`
+}
+
+// PIIPatternActionUpdate is the input for set_pii_pattern_action.
+// At least one of Action or Disabled must be set. Mutations are
+// transient by default — call persist_pii_patterns to flush them
+// to runtime_settings.json so the next start re-applies them.
+type PIIPatternActionUpdate struct {
+	ID       string `json:"id" jsonschema:"Pattern id to mutate (e.g. email, ssn, credit_card, api_key_prefix)."`
+	Action   string `json:"action,omitempty" jsonschema:"New action: mask, block, or route_local. Optional — omit to leave the action unchanged."`
+	Disabled *bool  `json:"disabled,omitempty" jsonschema:"Set true to skip this pattern entirely; false to re-enable. Optional — omit to leave enabled-state unchanged."`
+}
+
+// MiddlewareStatus is the aggregated /api/middleware/status payload —
+// the React Middleware page renders this in one go. Routing is a
+// placeholder until subsystem 2 lands.
+type MiddlewareStatus struct {
+	PII    MiddlewarePIIStatus    `json:"pii"`
+	Router MiddlewareRouterStatus `json:"router"`
+}
+
+// MiddlewarePIIStatus shows what the redactor is doing right now and
+// which models opt in. enabled_globally=false means --disable-pii.
+type MiddlewarePIIStatus struct {
+	EnabledGlobally           bool                  `json:"enabled_globally"`
+	Reason                    string                `json:"reason,omitempty"`
+	DefaultEnabledForBackends []string              `json:"default_enabled_for_backends,omitempty"`
+	Patterns                  []PIIPattern          `json:"patterns"`
+	Models                    []MiddlewarePIIModel  `json:"models"`
+	RecentEventCount          int                   `json:"recent_event_count"`
+}
+
+// MiddlewarePIIModel is one model row in the per-model PII table.
+type MiddlewarePIIModel struct {
+	Name              string            `json:"name"`
+	Backend           string            `json:"backend"`
+	Enabled           bool              `json:"enabled"`
+	Explicit          bool              `json:"explicit"`             // Did YAML set Enabled, or did the backend prefix decide?
+	DefaultForBackend bool              `json:"default_for_backend"`  // Backend matches the auto-on rule (proxy-*).
+	Overrides         map[string]string `json:"overrides,omitempty"`
+}
+
+// MiddlewareRouterStatus is the placeholder shape the Routing tab
+// reads. Subsystem 2 fills in Models with real RouterDecision rows.
+type MiddlewareRouterStatus struct {
+	Configured bool     `json:"configured"`
+	Models     []string `json:"models"`
+	Note       string   `json:"note,omitempty"`
+}
+
+// RouterDecisionsQuery filters get_router_decisions.
+type RouterDecisionsQuery struct {
+	CorrelationID string `json:"correlation_id,omitempty" jsonschema:"Optional X-Correlation-ID join key (binds decisions to the request and usage record)."`
+	UserID        string `json:"user_id,omitempty"        jsonschema:"Optional user id to scope the query."`
+	RouterModel   string `json:"router_model,omitempty"   jsonschema:"Optional router model name to filter by (e.g. smart-router)."`
+	Limit         int    `json:"limit,omitempty"          jsonschema:"Maximum decisions. Defaults to 100."`
+}
+
+// RouterDecision is the LLM-facing view of one routing decision. The
+// prompt is NEVER stored; admins audit by hash if they need to dedupe
+// recurring routing patterns.
+type RouterDecision struct {
+	ID             string  `json:"id"`
+	CorrelationID  string  `json:"correlation_id"`
+	UserID         string  `json:"user_id"`
+	RouterModel    string  `json:"router_model"`
+	RequestedModel string  `json:"requested_model"`
+	ServedModel    string  `json:"served_model"`
+	Classifier     string  `json:"classifier"`
+	Label          string  `json:"label"`
+	Score          float64 `json:"score"`
+	LatencyMs      int64   `json:"latency_ms"`
+	Cached         bool    `json:"cached"`
+	CreatedAt      string  `json:"created_at"`
+}
+
 // VRAMEstimateRequest is the input for vram_estimate. The output type is
 // pkg/vram.EstimateResult — used directly via the LocalAIClient interface
 // so the LLM sees the same shape (size_bytes/size_display/vram_bytes/
diff --git a/pkg/mcp/localaitools/fakes_test.go b/pkg/mcp/localaitools/fakes_test.go
index dcb8abdd3..cbe429a08 100644
--- a/pkg/mcp/localaitools/fakes_test.go
+++ b/pkg/mcp/localaitools/fakes_test.go
@@ -3,7 +3,6 @@ package localaitools
 import (
 	"context"
 	"errors"
-	"fmt"
 	"sync"
 
 	"github.com/mudler/LocalAI/core/config"
@@ -45,6 +44,13 @@ type fakeClient struct {
 	toggleModelPinned   func(string, modeladmin.Action) error
 	getBranding         func() (*Branding, error)
 	setBranding         func(SetBrandingRequest) (*Branding, error)
+	getUsageStats       func(UsageStatsQuery) (*UsageStats, error)
+	listPIIPatterns     func() ([]PIIPattern, error)
+	getPIIEvents        func(PIIEventsQuery) ([]PIIEvent, error)
+	testPIIRedaction    func(PIIRedactTestRequest) (*PIIRedactTestResult, error)
+	setPIIPatternAction func(PIIPatternActionUpdate) error
+	getMiddlewareStatus func() (*MiddlewareStatus, error)
+	getRouterDecisions  func(RouterDecisionsQuery) ([]RouterDecision, error)
 }
 
 type fakeCall struct {
@@ -236,5 +242,74 @@ func (f *fakeClient) SetBranding(_ context.Context, req SetBrandingRequest) (*Br
 	return &Branding{InstanceName: "LocalAI"}, nil
 }
 
-// boom is a sentinel error used by tests that want a deterministic error string.
-var boom = fmt.Errorf("boom")
+func (f *fakeClient) GetUsageStats(_ context.Context, q UsageStatsQuery) (*UsageStats, error) {
+	f.record("GetUsageStats", q)
+	if f.getUsageStats != nil {
+		return f.getUsageStats(q)
+	}
+	return &UsageStats{
+		Viewer: UsageViewer{ID: "fake-user", Name: "fake", Role: "user"},
+		Period: "month",
+	}, nil
+}
+
+func (f *fakeClient) ListPIIPatterns(_ context.Context) ([]PIIPattern, error) {
+	f.record("ListPIIPatterns", nil)
+	if f.listPIIPatterns != nil {
+		return f.listPIIPatterns()
+	}
+	return []PIIPattern{}, nil
+}
+
+func (f *fakeClient) GetPIIEvents(_ context.Context, q PIIEventsQuery) ([]PIIEvent, error) {
+	f.record("GetPIIEvents", q)
+	if f.getPIIEvents != nil {
+		return f.getPIIEvents(q)
+	}
+	return []PIIEvent{}, nil
+}
+
+func (f *fakeClient) TestPIIRedaction(_ context.Context, req PIIRedactTestRequest) (*PIIRedactTestResult, error) {
+	f.record("TestPIIRedaction", req)
+	if f.testPIIRedaction != nil {
+		return f.testPIIRedaction(req)
+	}
+	return &PIIRedactTestResult{Redacted: req.Text}, nil
+}
+
+func (f *fakeClient) SetPIIPatternAction(_ context.Context, req PIIPatternActionUpdate) error {
+	f.record("SetPIIPatternAction", req)
+	if f.setPIIPatternAction != nil {
+		return f.setPIIPatternAction(req)
+	}
+	return nil
+}
+
+func (f *fakeClient) PersistPIIPatterns(_ context.Context) error {
+	f.record("PersistPIIPatterns", nil)
+	return nil
+}
+
+func (f *fakeClient) GetRouterDecisions(_ context.Context, q RouterDecisionsQuery) ([]RouterDecision, error) {
+	f.record("GetRouterDecisions", q)
+	if f.getRouterDecisions != nil {
+		return f.getRouterDecisions(q)
+	}
+	return []RouterDecision{}, nil
+}
+
+func (f *fakeClient) GetMiddlewareStatus(_ context.Context) (*MiddlewareStatus, error) {
+	f.record("GetMiddlewareStatus", nil)
+	if f.getMiddlewareStatus != nil {
+		return f.getMiddlewareStatus()
+	}
+	return &MiddlewareStatus{
+		PII: MiddlewarePIIStatus{
+			EnabledGlobally: true,
+			Patterns:        []PIIPattern{},
+			Models:          []MiddlewarePIIModel{},
+		},
+		Router: MiddlewareRouterStatus{Configured: false, Models: []string{}},
+	}, nil
+}
+
diff --git a/pkg/mcp/localaitools/httpapi/client.go b/pkg/mcp/localaitools/httpapi/client.go
index b32a7600a..1e8c08352 100644
--- a/pkg/mcp/localaitools/httpapi/client.go
+++ b/pkg/mcp/localaitools/httpapi/client.go
@@ -11,6 +11,7 @@ import (
 	"fmt"
 	"io"
 	"net/http"
+	"net/url"
 	"strings"
 	"time"
 
@@ -106,7 +107,7 @@ func (c *Client) do(ctx context.Context, method, path string, body any, out any)
 	if err != nil {
 		return err
 	}
-	defer resp.Body.Close()
+	defer func() { _ = resp.Body.Close() }()
 
 	respBody, _ := io.ReadAll(resp.Body)
 	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
@@ -290,7 +291,7 @@ func (c *Client) ImportModelURI(ctx context.Context, req localaitools.ImportMode
 	if err != nil {
 		return nil, err
 	}
-	defer resp.Body.Close()
+	defer func() { _ = resp.Body.Close() }()
 	respBody, _ := io.ReadAll(resp.Body)
 
 	// 400 with `error: "ambiguous import"` is not a transport error — it's the
@@ -506,6 +507,188 @@ func (c *Client) SetBranding(ctx context.Context, req localaitools.SetBrandingRe
 	return c.GetBranding(ctx)
 }
 
+// ---- Usage / billing ----
+
+func (c *Client) GetUsageStats(ctx context.Context, q localaitools.UsageStatsQuery) (*localaitools.UsageStats, error) {
+	period := q.Period
+	if period == "" {
+		period = "month"
+	}
+	path := routeUsage
+	if q.All {
+		path = routeUsageAll
+	}
+	// Build query string. The /api/usage server expects these exact param
+	// names; any change there must update both sides.
+	qs := url.Values{}
+	qs.Set("period", period)
+	if q.UserID != "" && q.All {
+		qs.Set("user_id", q.UserID)
+	}
+	if enc := qs.Encode(); enc != "" {
+		path = path + "?" + enc
+	}
+
+	var raw struct {
+		Viewer struct {
+			ID   string `json:"id"`
+			Name string `json:"name"`
+			Role string `json:"role"`
+		} `json:"viewer"`
+		Totals struct {
+			PromptTokens     int64 `json:"prompt_tokens"`
+			CompletionTokens int64 `json:"completion_tokens"`
+			TotalTokens      int64 `json:"total_tokens"`
+			RequestCount     int64 `json:"request_count"`
+		} `json:"totals"`
+		Usage []struct {
+			Bucket           string `json:"bucket"`
+			Model            string `json:"model"`
+			UserID           string `json:"user_id"`
+			UserName         string `json:"user_name"`
+			PromptTokens     int64  `json:"prompt_tokens"`
+			CompletionTokens int64  `json:"completion_tokens"`
+			TotalTokens      int64  `json:"total_tokens"`
+			RequestCount     int64  `json:"request_count"`
+		} `json:"usage"`
+	}
+	if err := c.do(ctx, http.MethodGet, path, nil, &raw); err != nil {
+		return nil, err
+	}
+	out := &localaitools.UsageStats{
+		Viewer: localaitools.UsageViewer{ID: raw.Viewer.ID, Name: raw.Viewer.Name, Role: raw.Viewer.Role},
+		Period: period,
+		Totals: localaitools.UsageTotals{
+			PromptTokens:     raw.Totals.PromptTokens,
+			CompletionTokens: raw.Totals.CompletionTokens,
+			TotalTokens:      raw.Totals.TotalTokens,
+			RequestCount:     raw.Totals.RequestCount,
+		},
+		Buckets: make([]localaitools.UsageBucket, 0, len(raw.Usage)),
+	}
+	for _, b := range raw.Usage {
+		out.Buckets = append(out.Buckets, localaitools.UsageBucket{
+			Bucket:           b.Bucket,
+			Model:            b.Model,
+			UserID:           b.UserID,
+			UserName:         b.UserName,
+			PromptTokens:     b.PromptTokens,
+			CompletionTokens: b.CompletionTokens,
+			TotalTokens:      b.TotalTokens,
+			RequestCount:     b.RequestCount,
+		})
+	}
+	return out, nil
+}
+
+// ---- PII filter ----
+
+func (c *Client) ListPIIPatterns(ctx context.Context) ([]localaitools.PIIPattern, error) {
+	var raw struct {
+		Patterns []localaitools.PIIPattern `json:"patterns"`
+	}
+	if err := c.do(ctx, http.MethodGet, routePIIPatterns, nil, &raw); err != nil {
+		return nil, err
+	}
+	return raw.Patterns, nil
+}
+
+func (c *Client) GetPIIEvents(ctx context.Context, q localaitools.PIIEventsQuery) ([]localaitools.PIIEvent, error) {
+	qs := url.Values{}
+	if q.CorrelationID != "" {
+		qs.Set("correlation_id", q.CorrelationID)
+	}
+	if q.UserID != "" {
+		qs.Set("user_id", q.UserID)
+	}
+	if q.PatternID != "" {
+		qs.Set("pattern_id", q.PatternID)
+	}
+	// The MCP get_pii_events tool is PII-shaped; the events store is now
+	// shared with proxy events that have no pattern_id/action. Scope to
+	// kind=pii so the LLM-facing audit stays coherent.
+	qs.Set("kind", "pii")
+	if q.Limit > 0 {
+		qs.Set("limit", fmt.Sprintf("%d", q.Limit))
+	}
+	path := routePIIEvents
+	if enc := qs.Encode(); enc != "" {
+		path = path + "?" + enc
+	}
+
+	var raw struct {
+		Events []localaitools.PIIEvent `json:"events"`
+	}
+	if err := c.do(ctx, http.MethodGet, path, nil, &raw); err != nil {
+		return nil, err
+	}
+	return raw.Events, nil
+}
+
+func (c *Client) TestPIIRedaction(ctx context.Context, req localaitools.PIIRedactTestRequest) (*localaitools.PIIRedactTestResult, error) {
+	var out localaitools.PIIRedactTestResult
+	if err := c.do(ctx, http.MethodPost, routePIITest, map[string]string{"text": req.Text}, &out); err != nil {
+		return nil, err
+	}
+	return &out, nil
+}
+
+func (c *Client) SetPIIPatternAction(ctx context.Context, req localaitools.PIIPatternActionUpdate) error {
+	if req.ID == "" {
+		return fmt.Errorf("pattern id is required")
+	}
+	body := map[string]any{}
+	if req.Action != "" {
+		body["action"] = req.Action
+	}
+	if req.Disabled != nil {
+		body["disabled"] = *req.Disabled
+	}
+	if len(body) == 0 {
+		return fmt.Errorf("must specify action and/or disabled")
+	}
+	return c.do(ctx, http.MethodPut, routePIIPatternByID(req.ID), body, nil)
+}
+
+func (c *Client) PersistPIIPatterns(ctx context.Context) error {
+	return c.do(ctx, http.MethodPost, routePIIPatternsPersist, nil, nil)
+}
+
+func (c *Client) GetMiddlewareStatus(ctx context.Context) (*localaitools.MiddlewareStatus, error) {
+	var out localaitools.MiddlewareStatus
+	if err := c.do(ctx, http.MethodGet, routeMiddleware, nil, &out); err != nil {
+		return nil, err
+	}
+	return &out, nil
+}
+
+func (c *Client) GetRouterDecisions(ctx context.Context, q localaitools.RouterDecisionsQuery) ([]localaitools.RouterDecision, error) {
+	qs := url.Values{}
+	if q.CorrelationID != "" {
+		qs.Set("correlation_id", q.CorrelationID)
+	}
+	if q.UserID != "" {
+		qs.Set("user_id", q.UserID)
+	}
+	if q.RouterModel != "" {
+		qs.Set("router_model", q.RouterModel)
+	}
+	if q.Limit > 0 {
+		qs.Set("limit", fmt.Sprintf("%d", q.Limit))
+	}
+	path := routeRouterDecisions
+	if enc := qs.Encode(); enc != "" {
+		path = path + "?" + enc
+	}
+	var raw struct {
+		Decisions []localaitools.RouterDecision `json:"decisions"`
+	}
+	if err := c.do(ctx, http.MethodGet, path, nil, &raw); err != nil {
+		return nil, err
+	}
+	return raw.Decisions, nil
+}
+
 // ---- helpers ----
 
 func contains(haystack, lowerNeedle string) bool {
diff --git a/pkg/mcp/localaitools/httpapi/routes.go b/pkg/mcp/localaitools/httpapi/routes.go
index e44c12b97..4be8f2ad8 100644
--- a/pkg/mcp/localaitools/httpapi/routes.go
+++ b/pkg/mcp/localaitools/httpapi/routes.go
@@ -11,21 +11,33 @@ import (
 // registrations in core/http/routes/localai.go — the Tool↔REST drift detector
 // in coverage_test.go documents the mapping.
 const (
-	routeWelcome       = "/"
-	routeModelsApply   = "/models/apply"
-	routeModelsAvail   = "/models/available"
-	routeModelsGall    = "/models/galleries"
-	routeModelsImport  = "/models/import-uri"
-	routeModelsReload  = "/models/reload"
-	routeBackends      = "/backends"
-	routeBackendsKnown = "/backends/known"
-	routeBackendsApply = "/backends/apply"
-	routeNodes         = "/api/nodes"
-	routeVRAMEstimate  = "/api/models/vram-estimate"
-	routeBranding      = "/api/branding"
-	routeSettings      = "/api/settings"
+	routeWelcome            = "/"
+	routeModelsApply        = "/models/apply"
+	routeModelsAvail        = "/models/available"
+	routeModelsGall         = "/models/galleries"
+	routeModelsImport       = "/models/import-uri"
+	routeModelsReload       = "/models/reload"
+	routeBackends           = "/backends"
+	routeBackendsKnown      = "/backends/known"
+	routeBackendsApply      = "/backends/apply"
+	routeNodes              = "/api/nodes"
+	routeVRAMEstimate       = "/api/models/vram-estimate"
+	routeBranding           = "/api/branding"
+	routeSettings           = "/api/settings"
+	routeUsage              = "/api/usage"
+	routeUsageAll           = "/api/usage/all"
+	routePIIPatterns        = "/api/pii/patterns"
+	routePIIPatternsPersist = "/api/pii/patterns/persist"
+	routePIIEvents          = "/api/pii/events"
+	routePIITest            = "/api/pii/test"
+	routeMiddleware         = "/api/middleware/status"
+	routeRouterDecisions    = "/api/router/decisions"
 )
 
+func routePIIPatternByID(id string) string {
+	return "/api/pii/patterns/" + url.PathEscape(id)
+}
+
 func routeJobStatus(jobID string) string {
 	return "/models/jobs/" + url.PathEscape(jobID)
 }
diff --git a/pkg/mcp/localaitools/inproc/client.go b/pkg/mcp/localaitools/inproc/client.go
index 85ad82167..e1d190dcd 100644
--- a/pkg/mcp/localaitools/inproc/client.go
+++ b/pkg/mcp/localaitools/inproc/client.go
@@ -17,6 +17,10 @@ import (
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services/galleryop"
 	"github.com/mudler/LocalAI/core/services/modeladmin"
+	"github.com/mudler/LocalAI/core/http/auth"
+	"github.com/mudler/LocalAI/core/services/routing/billing"
+	"github.com/mudler/LocalAI/core/services/routing/pii"
+	"github.com/mudler/LocalAI/core/services/routing/router"
 	"github.com/mudler/LocalAI/internal"
 	localaitools "github.com/mudler/LocalAI/pkg/mcp/localaitools"
 	"github.com/mudler/LocalAI/pkg/model"
@@ -36,12 +40,32 @@ type Client struct {
 	ModelLoader  *model.ModelLoader
 	Gallery      *galleryop.GalleryService
 
+	// StatsRecorder and FallbackUser are optional — they back the
+	// get_usage_stats tool. nil StatsRecorder makes the tool return an
+	// "unavailable" error, which keeps the assistant responsive on
+	// deployments that ran with --disable-stats or where startup wired
+	// the inproc client before stats were ready.
+	StatsRecorder *billing.Recorder
+	FallbackUser  *auth.User
+
+	// PIIRedactor and PIIEvents back the list_pii_patterns,
+	// get_pii_events, and test_pii_redaction tools. nil values cause
+	// the tools to return a "filter disabled" error.
+	PIIRedactor *pii.Redactor
+	PIIEvents   pii.EventStore
+
+	// RouterDecisions backs the get_router_decisions tool. nil makes
+	// the tool return an empty list — same shape the REST endpoint
+	// returns when stats are disabled.
+	RouterDecisions router.DecisionStore
+
 	modelAdmin *modeladmin.ConfigService
 }
 
 // New builds a Client wired to the given services. All fields are required
 // except ModelLoader (used only for SystemInfo's loaded-models report and
-// best-effort ShutdownModel calls during config edits).
+// best-effort ShutdownModel calls during config edits) and the stats
+// fields (StatsRecorder, FallbackUser) which gate get_usage_stats.
 func New(appConfig *config.ApplicationConfig, systemState *system.SystemState, cl *config.ModelConfigLoader, ml *model.ModelLoader, gs *galleryop.GalleryService) *Client {
 	return &Client{
 		AppConfig:    appConfig,
@@ -520,6 +544,300 @@ func capabilityToFlag(capability localaitools.Capability) (config.ModelConfigUse
 	return 0, false
 }
 
+// ---- Usage / billing ----
+
+func (c *Client) GetUsageStats(ctx context.Context, q localaitools.UsageStatsQuery) (*localaitools.UsageStats, error) {
+	if c.StatsRecorder == nil {
+		return nil, errors.New("usage tracking is not available on this server")
+	}
+	period := q.Period
+	if period == "" {
+		period = "month"
+	}
+
+	// Resolve which user this is. In single-user no-auth mode the
+	// inproc client doesn't have an echo context to read auth.GetUser
+	// from, so the FallbackUser is the only available identity. When
+	// auth IS on, the assistant runs under a privileged session and the
+	// caller can pass q.UserID; we don't enforce admin here because the
+	// MCP server itself is gated on admin (see prompts/10_safety.md).
+	var viewerID, viewerName, viewerRole string
+	switch {
+	case q.UserID != "":
+		viewerID = q.UserID
+	case c.FallbackUser != nil:
+		viewerID = c.FallbackUser.ID
+		viewerName = c.FallbackUser.Name
+		viewerRole = c.FallbackUser.Role
+	default:
+		return nil, errors.New("no user context for usage query (auth is on but no user id was provided)")
+	}
+
+	queryUser := viewerID
+	if q.All {
+		// /api/usage/all: cluster-wide by default, but honour the
+		// optional UserID filter so admins can scope to one user —
+		// matches the REST endpoint's ?user_id=… query param. Empty
+		// q.UserID falls through to the cluster-wide aggregate.
+		queryUser = q.UserID
+	}
+
+	rows, err := c.StatsRecorder.Aggregate(ctx, billing.AggregateQuery{
+		UserID: queryUser,
+		Period: period,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("aggregate usage: %w", err)
+	}
+
+	totals := localaitools.UsageTotals{}
+	buckets := make([]localaitools.UsageBucket, 0, len(rows))
+	for _, r := range rows {
+		buckets = append(buckets, localaitools.UsageBucket{
+			Bucket:           r.Bucket,
+			Model:            r.Model,
+			UserID:           r.UserID,
+			UserName:         r.UserName,
+			PromptTokens:     r.PromptTokens,
+			CompletionTokens: r.CompletionTokens,
+			TotalTokens:      r.TotalTokens,
+			RequestCount:     r.RequestCount,
+		})
+		totals.PromptTokens += r.PromptTokens
+		totals.CompletionTokens += r.CompletionTokens
+		totals.TotalTokens += r.TotalTokens
+		totals.RequestCount += r.RequestCount
+	}
+
+	return &localaitools.UsageStats{
+		Viewer:  localaitools.UsageViewer{ID: viewerID, Name: viewerName, Role: viewerRole},
+		Period:  period,
+		Totals:  totals,
+		Buckets: buckets,
+	}, nil
+}
+
+// ---- PII filter ----
+
+func (c *Client) ListPIIPatterns(_ context.Context) ([]localaitools.PIIPattern, error) {
+	if c.PIIRedactor == nil {
+		return nil, errors.New("PII filter is disabled")
+	}
+	patterns := c.PIIRedactor.Patterns()
+	out := make([]localaitools.PIIPattern, 0, len(patterns))
+	for _, p := range patterns {
+		out = append(out, localaitools.PIIPattern{
+			ID:             p.ID,
+			Description:    p.Description,
+			Action:         string(p.Action),
+			MaxMatchLength: p.MaxMatchLength,
+		})
+	}
+	return out, nil
+}
+
+func (c *Client) GetPIIEvents(ctx context.Context, q localaitools.PIIEventsQuery) ([]localaitools.PIIEvent, error) {
+	if c.PIIEvents == nil {
+		return nil, errors.New("PII filter is disabled")
+	}
+	events, err := c.PIIEvents.List(ctx, pii.ListQuery{
+		CorrelationID: q.CorrelationID,
+		UserID:        q.UserID,
+		PatternID:     q.PatternID,
+		Kind:          pii.KindPII,
+		Limit:         q.Limit,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("list pii events: %w", err)
+	}
+	out := make([]localaitools.PIIEvent, 0, len(events))
+	for _, e := range events {
+		out = append(out, localaitools.PIIEvent{
+			ID:            e.ID,
+			CorrelationID: e.CorrelationID,
+			UserID:        e.UserID,
+			Direction:     string(e.Direction),
+			PatternID:     e.PatternID,
+			ByteOffset:    e.ByteOffset,
+			Length:        e.Length,
+			HashPrefix:    e.HashPrefix,
+			Action:        string(e.Action),
+			CreatedAt:     e.CreatedAt.Format("2006-01-02T15:04:05Z07:00"),
+		})
+	}
+	return out, nil
+}
+
+func (c *Client) SetPIIPatternAction(_ context.Context, req localaitools.PIIPatternActionUpdate) error {
+	if c.PIIRedactor == nil {
+		return errors.New("PII filter is disabled")
+	}
+	if req.ID == "" {
+		return errors.New("pattern id is required")
+	}
+	if req.Action == "" && req.Disabled == nil {
+		return errors.New("must specify action and/or disabled")
+	}
+	if req.Action != "" {
+		if err := c.PIIRedactor.SetAction(req.ID, pii.Action(req.Action)); err != nil {
+			return err
+		}
+	}
+	if req.Disabled != nil {
+		if err := c.PIIRedactor.SetDisabled(req.ID, *req.Disabled); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// PersistPIIPatterns snapshots the current redactor state into
+// runtime_settings.json. Mirrors POST /api/pii/patterns/persist.
+func (c *Client) PersistPIIPatterns(_ context.Context) error {
+	if c.PIIRedactor == nil {
+		return errors.New("PII filter is disabled")
+	}
+	if c.AppConfig == nil {
+		return errors.New("app config not available")
+	}
+	existing, err := c.AppConfig.ReadPersistedSettings()
+	if err != nil {
+		return fmt.Errorf("read settings: %w", err)
+	}
+	defaults, err := pii.LoadConfig(c.AppConfig.PIIConfigPath)
+	if err != nil {
+		return fmt.Errorf("reload defaults: %w", err)
+	}
+	defaultByID := make(map[string]pii.Pattern, len(defaults))
+	for _, d := range defaults {
+		defaultByID[d.ID] = d
+	}
+	overrides := map[string]config.PIIPatternRuntimeOverride{}
+	for _, p := range c.PIIRedactor.Patterns() {
+		d, known := defaultByID[p.ID]
+		ov := config.PIIPatternRuntimeOverride{}
+		changed := false
+		if !known || p.Action != d.Action {
+			action := string(p.Action)
+			ov.Action = &action
+			changed = true
+		}
+		if !known || p.Disabled != d.Disabled {
+			disabled := p.Disabled
+			ov.Disabled = &disabled
+			changed = true
+		}
+		if changed {
+			overrides[p.ID] = ov
+		}
+	}
+	existing.PIIPatternOverrides = &overrides
+	if err := c.AppConfig.WritePersistedSettings(existing); err != nil {
+		return fmt.Errorf("write settings: %w", err)
+	}
+	c.AppConfig.PIIPatternOverrides = overrides
+	return nil
+}
+
+func (c *Client) GetRouterDecisions(ctx context.Context, q localaitools.RouterDecisionsQuery) ([]localaitools.RouterDecision, error) {
+	if c.RouterDecisions == nil {
+		return []localaitools.RouterDecision{}, nil
+	}
+	rows, err := c.RouterDecisions.List(ctx, router.DecisionListQuery{
+		CorrelationID: q.CorrelationID,
+		UserID:        q.UserID,
+		RouterModel:   q.RouterModel,
+		Limit:         q.Limit,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("list router decisions: %w", err)
+	}
+	out := make([]localaitools.RouterDecision, 0, len(rows))
+	for _, r := range rows {
+		out = append(out, localaitools.RouterDecision{
+			ID:             r.ID,
+			CorrelationID:  r.CorrelationID,
+			UserID:         r.UserID,
+			RouterModel:    r.RouterModel,
+			RequestedModel: r.RequestedModel,
+			ServedModel:    r.ServedModel,
+			Classifier:     r.Classifier,
+			Label:          r.Label,
+			Score:          r.Score,
+			LatencyMs:      r.LatencyMs,
+			Cached:         r.Cached,
+			CreatedAt:      r.CreatedAt.Format("2006-01-02T15:04:05Z07:00"),
+		})
+	}
+	return out, nil
+}
+
+func (c *Client) GetMiddlewareStatus(ctx context.Context) (*localaitools.MiddlewareStatus, error) {
+	router := localaitools.MiddlewareRouterStatus{
+		Configured: false,
+		Models:     []string{},
+		Note:       "Intelligent routing is not yet implemented.",
+	}
+	piiSection := localaitools.MiddlewarePIIStatus{
+		EnabledGlobally: c.PIIRedactor != nil,
+		Patterns:        []localaitools.PIIPattern{},
+		Models:          []localaitools.MiddlewarePIIModel{},
+	}
+	if c.PIIRedactor == nil {
+		piiSection.Reason = "--disable-pii"
+		return &localaitools.MiddlewareStatus{PII: piiSection, Router: router}, nil
+	}
+	piiSection.DefaultEnabledForBackends = []string{"cloud-proxy"}
+	for _, p := range c.PIIRedactor.Patterns() {
+		piiSection.Patterns = append(piiSection.Patterns, localaitools.PIIPattern{
+			ID:             p.ID,
+			Description:    p.Description,
+			Action:         string(p.Action),
+			MaxMatchLength: p.MaxMatchLength,
+		})
+	}
+	if c.ConfigLoader != nil {
+		for _, cfg := range c.ConfigLoader.GetAllModelsConfigs() {
+			cfg := cfg
+			piiSection.Models = append(piiSection.Models, localaitools.MiddlewarePIIModel{
+				Name:              cfg.Name,
+				Backend:           cfg.Backend,
+				Enabled:           cfg.PIIIsEnabled(),
+				Explicit:          cfg.PII.Enabled != nil,
+				DefaultForBackend: cfg.Backend == "cloud-proxy",
+				Overrides:         cfg.PIIPatternOverrides(),
+			})
+		}
+	}
+	if c.PIIEvents != nil {
+		if n, err := c.PIIEvents.Count(ctx); err == nil {
+			piiSection.RecentEventCount = n
+		}
+	}
+	return &localaitools.MiddlewareStatus{PII: piiSection, Router: router}, nil
+}
+
+func (c *Client) TestPIIRedaction(_ context.Context, req localaitools.PIIRedactTestRequest) (*localaitools.PIIRedactTestResult, error) {
+	if c.PIIRedactor == nil {
+		return nil, errors.New("PII filter is disabled")
+	}
+	res := c.PIIRedactor.Redact(req.Text)
+	out := &localaitools.PIIRedactTestResult{
+		Redacted:  res.Redacted,
+		Blocked:   res.Blocked,
+		LocalOnly: res.LocalOnly,
+	}
+	for _, s := range res.Spans {
+		out.Spans = append(out.Spans, localaitools.PIIEventSpan{
+			Start:      s.Start,
+			End:        s.End,
+			Pattern:    s.Pattern,
+			HashPrefix: s.HashPrefix,
+		})
+	}
+	return out, nil
+}
+
 func capabilityFlagsOf(m *config.ModelConfig) []string {
 	var out []string
 	for label, flag := range config.GetAllModelConfigUsecases() {
diff --git a/pkg/mcp/localaitools/server.go b/pkg/mcp/localaitools/server.go
index 88d96ac0a..fd9f5da00 100644
--- a/pkg/mcp/localaitools/server.go
+++ b/pkg/mcp/localaitools/server.go
@@ -48,6 +48,9 @@ func NewServer(client LocalAIClient, opts Options) *mcp.Server {
 	registerSystemTools(srv, client, opts)
 	registerStateTools(srv, client, opts)
 	registerBrandingTools(srv, client, opts)
+	registerUsageTools(srv, client, opts)
+	registerPIITools(srv, client, opts)
+	registerMiddlewareTools(srv, client, opts)
 
 	return srv
 }
diff --git a/pkg/mcp/localaitools/server_test.go b/pkg/mcp/localaitools/server_test.go
index caf8bfdee..f82d0ae41 100644
--- a/pkg/mcp/localaitools/server_test.go
+++ b/pkg/mcp/localaitools/server_test.go
@@ -78,7 +78,11 @@ var expectedFullCatalog = sortedStrings(
 	ToolGallerySearch,
 	ToolGetBranding,
 	ToolGetJobStatus,
+	ToolGetMiddlewareStatus,
 	ToolGetModelConfig,
+	ToolGetPIIEvents,
+	ToolGetRouterDecisions,
+	ToolGetUsageStats,
 	ToolImportModelURI,
 	ToolInstallBackend,
 	ToolInstallModel,
@@ -87,9 +91,13 @@ var expectedFullCatalog = sortedStrings(
 	ToolListInstalledModels,
 	ToolListKnownBackends,
 	ToolListNodes,
+	ToolListPIIPatterns,
+	ToolPersistPIIPatterns,
 	ToolReloadModels,
 	ToolSetBranding,
+	ToolSetPIIPatternAction,
 	ToolSystemInfo,
+	ToolTestPIIRedaction,
 	ToolToggleModelPinned,
 	ToolToggleModelState,
 	ToolUpgradeBackend,
@@ -101,13 +109,19 @@ var expectedReadOnlyCatalog = sortedStrings(
 	ToolGallerySearch,
 	ToolGetBranding,
 	ToolGetJobStatus,
+	ToolGetMiddlewareStatus,
 	ToolGetModelConfig,
+	ToolGetPIIEvents,
+	ToolGetRouterDecisions,
+	ToolGetUsageStats,
 	ToolListBackends,
 	ToolListGalleries,
 	ToolListInstalledModels,
 	ToolListKnownBackends,
 	ToolListNodes,
+	ToolListPIIPatterns,
 	ToolSystemInfo,
+	ToolTestPIIRedaction,
 	ToolVRAMEstimate,
 )
 
diff --git a/pkg/mcp/localaitools/tools.go b/pkg/mcp/localaitools/tools.go
index d5e213f42..57b2638e3 100644
--- a/pkg/mcp/localaitools/tools.go
+++ b/pkg/mcp/localaitools/tools.go
@@ -19,6 +19,12 @@ const (
 	ToolListNodes           = "list_nodes"
 	ToolVRAMEstimate        = "vram_estimate"
 	ToolGetBranding         = "get_branding"
+	ToolGetUsageStats       = "get_usage_stats"
+	ToolListPIIPatterns     = "list_pii_patterns"
+	ToolGetPIIEvents        = "get_pii_events"
+	ToolTestPIIRedaction    = "test_pii_redaction"
+	ToolGetMiddlewareStatus = "get_middleware_status"
+	ToolGetRouterDecisions  = "get_router_decisions"
 
 	// Mutating tools — guarded by Options.DisableMutating and the
 	// LLM-side safety prompt (see prompts/10_safety.md).
@@ -32,6 +38,8 @@ const (
 	ToolToggleModelState  = "toggle_model_state"
 	ToolToggleModelPinned = "toggle_model_pinned"
 	ToolSetBranding       = "set_branding"
+	ToolSetPIIPatternAction = "set_pii_pattern_action"
+	ToolPersistPIIPatterns  = "persist_pii_patterns"
 )
 
 // DefaultServerName is the MCP Implementation.Name surfaced when
diff --git a/pkg/mcp/localaitools/tools_middleware.go b/pkg/mcp/localaitools/tools_middleware.go
new file mode 100644
index 000000000..626609bb0
--- /dev/null
+++ b/pkg/mcp/localaitools/tools_middleware.go
@@ -0,0 +1,78 @@
+package localaitools
+
+import (
+	"context"
+
+	"github.com/modelcontextprotocol/go-sdk/mcp"
+)
+
+// registerMiddlewareTools wires the routing-module admin surface for the
+// MCP server. The two tools mirror what the React /app/middleware page
+// exposes:
+//
+//   - get_middleware_status: read-only aggregator. The agent can ask
+//     "what's filtering my requests?" and get back the active PII
+//     pattern set, the per-model resolved enabled/override state, and
+//     a placeholder for routing.
+//   - set_pii_pattern_action: mutating. Mutations are TRANSIENT — they
+//     live until process restart, when patterns reload from the YAML
+//     defaults. The skill prompt should warn the user about that
+//     before applying lasting changes.
+func registerMiddlewareTools(s *mcp.Server, client LocalAIClient, opts Options) {
+	mcp.AddTool(s, &mcp.Tool{
+		Name:        ToolGetMiddlewareStatus,
+		Description: "Aggregated routing-module status: PII pattern catalogue with current actions, per-model resolved PII state and overrides, recent event count, plus the active router models and their classifier configs. Read-only.",
+	}, func(ctx context.Context, _ *mcp.CallToolRequest, _ struct{}) (*mcp.CallToolResult, any, error) {
+		status, err := client.GetMiddlewareStatus(ctx)
+		if err != nil {
+			return errorResult(err), nil, nil
+		}
+		return jsonResult(status), nil, nil
+	})
+
+	mcp.AddTool(s, &mcp.Tool{
+		Name:        ToolGetRouterDecisions,
+		Description: "Recent intelligent-routing decisions. Each row records which router model the client called, which candidate the classifier picked, the classifier's score and latency, and a correlation id that joins back to the usage record. Filter by correlation_id, user_id, or router_model. Read-only.",
+	}, func(ctx context.Context, _ *mcp.CallToolRequest, args RouterDecisionsQuery) (*mcp.CallToolResult, any, error) {
+		decisions, err := client.GetRouterDecisions(ctx, args)
+		if err != nil {
+			return errorResult(err), nil, nil
+		}
+		return jsonResult(decisions), nil, nil
+	})
+
+	if opts.DisableMutating {
+		return
+	}
+
+	mcp.AddTool(s, &mcp.Tool{
+		Name:        ToolSetPIIPatternAction,
+		Description: "Change a PII pattern's action (mask|block|route_local) and/or disabled state in-process. TRANSIENT: the mutation is lost on restart unless followed by persist_pii_patterns. Admin-required.",
+	}, func(ctx context.Context, _ *mcp.CallToolRequest, args PIIPatternActionUpdate) (*mcp.CallToolResult, any, error) {
+		if args.ID == "" {
+			return errorResultf("id is required"), nil, nil
+		}
+		if args.Action == "" && args.Disabled == nil {
+			return errorResultf("at least one of action (mask, block, route_local) or disabled must be set"), nil, nil
+		}
+		if err := client.SetPIIPatternAction(ctx, args); err != nil {
+			return errorResult(err), nil, nil
+		}
+		return jsonResult(map[string]any{
+			"id":        args.ID,
+			"action":    args.Action,
+			"disabled":  args.Disabled,
+			"persisted": false,
+		}), nil, nil
+	})
+
+	mcp.AddTool(s, &mcp.Tool{
+		Name:        ToolPersistPIIPatterns,
+		Description: "Snapshot the live PII redactor's per-pattern (action, disabled) state into runtime_settings.json so it re-applies on the next process start. Pairs with set_pii_pattern_action — that one is in-process; this one persists. Admin-required.",
+	}, func(ctx context.Context, _ *mcp.CallToolRequest, _ struct{}) (*mcp.CallToolResult, any, error) {
+		if err := client.PersistPIIPatterns(ctx); err != nil {
+			return errorResult(err), nil, nil
+		}
+		return jsonResult(map[string]any{"persisted": true}), nil, nil
+	})
+}
diff --git a/pkg/mcp/localaitools/tools_pii.go b/pkg/mcp/localaitools/tools_pii.go
new file mode 100644
index 000000000..e53a27dbe
--- /dev/null
+++ b/pkg/mcp/localaitools/tools_pii.go
@@ -0,0 +1,45 @@
+package localaitools
+
+import (
+	"context"
+
+	"github.com/modelcontextprotocol/go-sdk/mcp"
+)
+
+func registerPIITools(s *mcp.Server, client LocalAIClient, _ Options) {
+	mcp.AddTool(s, &mcp.Tool{
+		Name:        ToolListPIIPatterns,
+		Description: "List the active PII regex pattern set. Each entry shows the pattern id, description, and current action (mask, block, route_local). Read-only.",
+	}, func(ctx context.Context, _ *mcp.CallToolRequest, _ struct{}) (*mcp.CallToolResult, any, error) {
+		patterns, err := client.ListPIIPatterns(ctx)
+		if err != nil {
+			return errorResult(err), nil, nil
+		}
+		return jsonResult(patterns), nil, nil
+	})
+
+	mcp.AddTool(s, &mcp.Tool{
+		Name:        ToolGetPIIEvents,
+		Description: "Recent PII redaction events. Filter by correlation_id (joins to a usage record), user_id, or pattern_id. Events never carry the matched value — only an 8-char sha256 prefix so admins can dedupe recurring leaks.",
+	}, func(ctx context.Context, _ *mcp.CallToolRequest, args PIIEventsQuery) (*mcp.CallToolResult, any, error) {
+		events, err := client.GetPIIEvents(ctx, args)
+		if err != nil {
+			return errorResult(err), nil, nil
+		}
+		return jsonResult(events), nil, nil
+	})
+
+	mcp.AddTool(s, &mcp.Tool{
+		Name:        ToolTestPIIRedaction,
+		Description: "Dry-run the PII redactor against text without recording a real event. Useful for tuning patterns: paste a candidate string and see whether it would be masked, blocked, or routed locally.",
+	}, func(ctx context.Context, _ *mcp.CallToolRequest, args PIIRedactTestRequest) (*mcp.CallToolResult, any, error) {
+		if args.Text == "" {
+			return errorResultf("text is required"), nil, nil
+		}
+		res, err := client.TestPIIRedaction(ctx, args)
+		if err != nil {
+			return errorResult(err), nil, nil
+		}
+		return jsonResult(res), nil, nil
+	})
+}
diff --git a/pkg/mcp/localaitools/tools_usage.go b/pkg/mcp/localaitools/tools_usage.go
new file mode 100644
index 000000000..055118d92
--- /dev/null
+++ b/pkg/mcp/localaitools/tools_usage.go
@@ -0,0 +1,22 @@
+package localaitools
+
+import (
+	"context"
+
+	"github.com/modelcontextprotocol/go-sdk/mcp"
+)
+
+func registerUsageTools(s *mcp.Server, client LocalAIClient, _ Options) {
+	mcp.AddTool(s, &mcp.Tool{
+		Name: ToolGetUsageStats,
+		Description: "Return aggregated token usage. Defaults to the calling user's own usage over the last month. " +
+			"Use period=day|week|month|all to change the window. Set all=true for a cluster-wide admin view " +
+			"(only meaningful when auth is on and the caller is admin; in single-user mode there is only one user).",
+	}, func(ctx context.Context, _ *mcp.CallToolRequest, args UsageStatsQuery) (*mcp.CallToolResult, any, error) {
+		stats, err := client.GetUsageStats(ctx, args)
+		if err != nil {
+			return errorResult(err), nil, nil
+		}
+		return jsonResult(stats), nil, nil
+	})
+}
diff --git a/pkg/model/connection_evicting_client.go b/pkg/model/connection_evicting_client.go
index ade1e294b..b101e8f82 100644
--- a/pkg/model/connection_evicting_client.go
+++ b/pkg/model/connection_evicting_client.go
@@ -113,3 +113,15 @@ func (c *ConnectionEvictingClient) Rerank(ctx context.Context, in *pb.RerankRequ
 	c.checkErr(err)
 	return result, err
 }
+
+func (c *ConnectionEvictingClient) TokenClassify(ctx context.Context, in *pb.TokenClassifyRequest, opts ...ggrpc.CallOption) (*pb.TokenClassifyResponse, error) {
+	result, err := c.Backend.TokenClassify(ctx, in, opts...)
+	c.checkErr(err)
+	return result, err
+}
+
+func (c *ConnectionEvictingClient) Score(ctx context.Context, in *pb.ScoreRequest, opts ...ggrpc.CallOption) (*pb.ScoreResponse, error) {
+	result, err := c.Backend.Score(ctx, in, opts...)
+	c.checkErr(err)
+	return result, err
+}
diff --git a/pkg/store/client.go b/pkg/store/client.go
index 1a1f46ccc..4fa884b19 100644
--- a/pkg/store/client.go
+++ b/pkg/store/client.go
@@ -13,24 +13,10 @@ import (
 // SetCols sets multiple key-value pairs in the store
 // It's in columnar format so that keys[i] is associated with values[i]
 func SetCols(ctx context.Context, c grpc.Backend, keys [][]float32, values [][]byte) error {
-	protoKeys := make([]*proto.StoresKey, len(keys))
-	for i, k := range keys {
-		protoKeys[i] = &proto.StoresKey{
-			Floats: k,
-		}
-	}
-	protoValues := make([]*proto.StoresValue, len(values))
-	for i, v := range values {
-		protoValues[i] = &proto.StoresValue{
-			Bytes: v,
-		}
-	}
-	setOpts := &proto.StoresSetOptions{
-		Keys:   protoKeys,
-		Values: protoValues,
-	}
-
-	res, err := c.StoresSet(ctx, setOpts)
+	res, err := c.StoresSet(ctx, &proto.StoresSetOptions{
+		Keys:   WrapKeys(keys),
+		Values: WrapValues(values),
+	})
 	if err != nil {
 		return err
 	}
@@ -51,17 +37,7 @@ func SetSingle(ctx context.Context, c grpc.Backend, key []float32, value []byte)
 // DeleteCols deletes multiple key-value pairs from the store
 // It's in columnar format so that keys[i] is associated with values[i]
 func DeleteCols(ctx context.Context, c grpc.Backend, keys [][]float32) error {
-	protoKeys := make([]*proto.StoresKey, len(keys))
-	for i, k := range keys {
-		protoKeys[i] = &proto.StoresKey{
-			Floats: k,
-		}
-	}
-	deleteOpts := &proto.StoresDeleteOptions{
-		Keys: protoKeys,
-	}
-
-	res, err := c.StoresDelete(ctx, deleteOpts)
+	res, err := c.StoresDelete(ctx, &proto.StoresDeleteOptions{Keys: WrapKeys(keys)})
 	if err != nil {
 		return err
 	}
@@ -84,31 +60,11 @@ func DeleteSingle(ctx context.Context, c grpc.Backend, key []float32) error {
 // Be warned the keys are sorted and will be returned in a different order than they were input
 // There is no guarantee as to how the keys are sorted
 func GetCols(ctx context.Context, c grpc.Backend, keys [][]float32) ([][]float32, [][]byte, error) {
-	protoKeys := make([]*proto.StoresKey, len(keys))
-	for i, k := range keys {
-		protoKeys[i] = &proto.StoresKey{
-			Floats: k,
-		}
-	}
-	getOpts := &proto.StoresGetOptions{
-		Keys: protoKeys,
-	}
-
-	res, err := c.StoresGet(ctx, getOpts)
+	res, err := c.StoresGet(ctx, &proto.StoresGetOptions{Keys: WrapKeys(keys)})
 	if err != nil {
 		return nil, nil, err
 	}
-
-	ks := make([][]float32, len(res.Keys))
-	for i, k := range res.Keys {
-		ks[i] = k.Floats
-	}
-	vs := make([][]byte, len(res.Values))
-	for i, v := range res.Values {
-		vs[i] = v.Bytes
-	}
-
-	return ks, vs, nil
+	return UnwrapKeys(res.Keys), UnwrapValues(res.Values), nil
 }
 
 // GetSingle gets a single key-value pair from the store
@@ -128,28 +84,12 @@ func GetSingle(ctx context.Context, c grpc.Backend, key []float32) ([]byte, erro
 
 // Find similar keys to the given key. Returns the keys, values, and similarities
 func Find(ctx context.Context, c grpc.Backend, key []float32, topk int) ([][]float32, [][]byte, []float32, error) {
-	findOpts := &proto.StoresFindOptions{
-		Key: &proto.StoresKey{
-			Floats: key,
-		},
+	res, err := c.StoresFind(ctx, &proto.StoresFindOptions{
+		Key:  &proto.StoresKey{Floats: key},
 		TopK: int32(topk),
-	}
-
-	res, err := c.StoresFind(ctx, findOpts)
+	})
 	if err != nil {
 		return nil, nil, nil, err
 	}
-
-	ks := make([][]float32, len(res.Keys))
-	vs := make([][]byte, len(res.Values))
-
-	for i, k := range res.Keys {
-		ks[i] = k.Floats
-	}
-
-	for i, v := range res.Values {
-		vs[i] = v.Bytes
-	}
-
-	return ks, vs, res.Similarities, nil
+	return UnwrapKeys(res.Keys), UnwrapValues(res.Values), res.Similarities, nil
 }
diff --git a/pkg/store/proto.go b/pkg/store/proto.go
new file mode 100644
index 000000000..1eb5bece9
--- /dev/null
+++ b/pkg/store/proto.go
@@ -0,0 +1,46 @@
+package store
+
+// pb⇄[][]float32/[][]byte translation helpers shared by the gRPC
+// client (this file's package) and the local-store gRPC server in
+// backend/go/local-store. Same shape on both sides of the wire so a
+// schema bug only needs fixing once.
+
+import (
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// WrapKeys wraps each plain []float32 in a *proto.StoresKey.
+func WrapKeys(in [][]float32) []*proto.StoresKey {
+	out := make([]*proto.StoresKey, len(in))
+	for i, k := range in {
+		out[i] = &proto.StoresKey{Floats: k}
+	}
+	return out
+}
+
+// WrapValues wraps each []byte in a *proto.StoresValue.
+func WrapValues(in [][]byte) []*proto.StoresValue {
+	out := make([]*proto.StoresValue, len(in))
+	for i, v := range in {
+		out[i] = &proto.StoresValue{Bytes: v}
+	}
+	return out
+}
+
+// UnwrapKeys extracts the inner Floats from a slice of *proto.StoresKey.
+func UnwrapKeys(in []*proto.StoresKey) [][]float32 {
+	out := make([][]float32, len(in))
+	for i, k := range in {
+		out[i] = k.Floats
+	}
+	return out
+}
+
+// UnwrapValues extracts the inner Bytes from a slice of *proto.StoresValue.
+func UnwrapValues(in []*proto.StoresValue) [][]byte {
+	out := make([][]byte, len(in))
+	for i, v := range in {
+		out[i] = v.Bytes
+	}
+	return out
+}
diff --git a/tests/e2e-ui/main.go b/tests/e2e-ui/main.go
index 7aca8e7e4..46dd954c4 100644
--- a/tests/e2e-ui/main.go
+++ b/tests/e2e-ui/main.go
@@ -8,6 +8,7 @@ import (
 	"os"
 	"os/signal"
 	"path/filepath"
+	"strings"
 	"syscall"
 
 	"github.com/mudler/LocalAI/core/application"
@@ -21,6 +22,20 @@ import (
 func main() {
 	mockBackend := flag.String("mock-backend", "", "path to mock-backend binary")
 	port := flag.Int("port", 8089, "port to listen on")
+	// piiYAML lets a test inject a per-model `pii:` block into the
+	// auto-generated mock-model.yaml. Used by the middleware end-to-end
+	// verification (and any future test that wants to exercise per-model
+	// gating without bringing up a real backend). The argument is the
+	// body of the pii: block — the leading "pii:\n  " is added here.
+	piiYAML := flag.String("pii-yaml", "", "optional pii: block to merge into mock-model.yaml")
+	// extraModels accepts repeatable name=yaml pairs that get written
+	// as additional model files. Used by the routing E2E to seed
+	// candidate models a router model can dispatch to.
+	extraModelFlag := flag.String("extra-model", "", "extra model YAML, formatted as 'name|<full yaml body>'. Repeatable via comma-then-pipe? — for the router test we ship a single big string with embedded newlines.")
+	// routerYAML appends a `router:` block to mock-model.yaml. Used by
+	// the routing E2E to turn mock-model into a smart-router that
+	// dispatches to extra-models.
+	routerYAML := flag.String("router-yaml", "", "optional router: block to merge into mock-model.yaml")
 	flag.Parse()
 
 	if *mockBackend == "" {
@@ -71,11 +86,33 @@ func main() {
 		fmt.Fprintf(os.Stderr, "error marshaling config: %v\n", err)
 		os.Exit(1)
 	}
-	if err := os.WriteFile(filepath.Join(modelsPath, "mock-model.yaml"), configYAML, 0644); err != nil {
+	body := configYAML
+	if *piiYAML != "" {
+		body = append(body, []byte("pii:\n  "+*piiYAML+"\n")...)
+	}
+	if *routerYAML != "" {
+		body = append(body, []byte("router:\n  "+*routerYAML+"\n")...)
+	}
+	if err := os.WriteFile(filepath.Join(modelsPath, "mock-model.yaml"), body, 0644); err != nil {
 		fmt.Fprintf(os.Stderr, "error writing config: %v\n", err)
 		os.Exit(1)
 	}
 
+	if *extraModelFlag != "" {
+		// extra-model format: "name|<yaml body>". The yaml body is
+		// inlined verbatim — caller controls indentation. Single name
+		// per flag invocation; multi-flag is fine because flag.String
+		// only keeps the last but the test passes only one.
+		parts := strings.SplitN(*extraModelFlag, "|", 2)
+		if len(parts) == 2 {
+			extraPath := filepath.Join(modelsPath, parts[0]+".yaml")
+			if err := os.WriteFile(extraPath, []byte(parts[1]), 0644); err != nil {
+				fmt.Fprintf(os.Stderr, "error writing extra model: %v\n", err)
+				os.Exit(1)
+			}
+		}
+	}
+
 	// Set up system state
 	systemState, err := system.GetSystemState(
 		system.WithModelPath(modelsPath),
diff --git a/tests/e2e/cloud_proxy_helpers_test.go b/tests/e2e/cloud_proxy_helpers_test.go
new file mode 100644
index 000000000..73844210c
--- /dev/null
+++ b/tests/e2e/cloud_proxy_helpers_test.go
@@ -0,0 +1,206 @@
+package e2e_test
+
+import (
+	"encoding/json"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"sync"
+	"sync/atomic"
+)
+
+// upstreamRecorder captures whatever request the cloud-proxy backend
+// forwarded to the fake upstream. Tests assert against the captured
+// fields to prove the body / headers / model rewrite landed correctly.
+type upstreamRecorder struct {
+	mu          sync.Mutex
+	Method      string
+	Path        string
+	Header      http.Header
+	Body        []byte
+	RequestHits int32
+}
+
+func (r *upstreamRecorder) Hits() int {
+	return int(atomic.LoadInt32(&r.RequestHits))
+}
+
+func (r *upstreamRecorder) snapshot() (method, path string, hdr http.Header, body []byte) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	// Clone header so the test can read after the next request lands.
+	cloned := http.Header{}
+	for k, v := range r.Header {
+		cloned[k] = append([]string{}, v...)
+	}
+	return r.Method, r.Path, cloned, append([]byte(nil), r.Body...)
+}
+
+// fakeOpenAIUpstreamServer stands up an httptest server that mimics
+// OpenAI Chat Completions. The script chooses what to return per
+// request — tests with different cases swap script via SetScript.
+type fakeOpenAIUpstreamServer struct {
+	srv      *httptest.Server
+	recorder upstreamRecorder
+
+	mu     sync.Mutex
+	script func(req []byte) (status int, body string, contentType string)
+}
+
+func newFakeOpenAIUpstream() *fakeOpenAIUpstreamServer {
+	f := &fakeOpenAIUpstreamServer{}
+	f.SetScript(func([]byte) (int, string, string) {
+		// Default: a trivial non-streaming text reply, no tool calls.
+		return 200, `{"id":"chatcmpl-x","choices":[{"index":0,"message":{"role":"assistant","content":"hello from fake openai"},"finish_reason":"stop"}],"usage":{"prompt_tokens":3,"completion_tokens":5,"total_tokens":8}}`, "application/json"
+	})
+	f.srv = httptest.NewServer(http.HandlerFunc(f.serve))
+	return f
+}
+
+func (f *fakeOpenAIUpstreamServer) serve(w http.ResponseWriter, r *http.Request) {
+	atomic.AddInt32(&f.recorder.RequestHits, 1)
+	body, _ := io.ReadAll(r.Body)
+	f.recorder.mu.Lock()
+	f.recorder.Method = r.Method
+	f.recorder.Path = r.URL.Path
+	f.recorder.Header = r.Header.Clone()
+	f.recorder.Body = body
+	f.recorder.mu.Unlock()
+
+	f.mu.Lock()
+	script := f.script
+	f.mu.Unlock()
+	status, replyBody, contentType := script(body)
+	w.Header().Set("Content-Type", contentType)
+	w.WriteHeader(status)
+	_, _ = io.WriteString(w, replyBody)
+}
+
+func (f *fakeOpenAIUpstreamServer) URL() string { return f.srv.URL }
+func (f *fakeOpenAIUpstreamServer) Close()      { f.srv.Close() }
+
+func (f *fakeOpenAIUpstreamServer) SetScript(script func(req []byte) (status int, body string, contentType string)) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.script = script
+}
+
+// Snapshot returns the most-recently captured request data.
+func (f *fakeOpenAIUpstreamServer) Snapshot() (method, path string, hdr http.Header, body []byte) {
+	return f.recorder.snapshot()
+}
+
+// DecodedBody returns the captured body parsed as a generic OpenAI
+// request. Helper for tests that want to assert specific fields
+// (e.g. model rewrite, stream flag) without re-parsing inline.
+func (f *fakeOpenAIUpstreamServer) DecodedBody() map[string]any {
+	_, _, _, body := f.Snapshot()
+	var m map[string]any
+	_ = json.Unmarshal(body, &m)
+	return m
+}
+
+// fakeAnthropicUpstreamServer is the Anthropic counterpart.
+type fakeAnthropicUpstreamServer struct {
+	srv      *httptest.Server
+	recorder upstreamRecorder
+
+	mu     sync.Mutex
+	script func(req []byte) (status int, body string, contentType string)
+}
+
+func newFakeAnthropicUpstream() *fakeAnthropicUpstreamServer {
+	f := &fakeAnthropicUpstreamServer{}
+	f.SetScript(func([]byte) (int, string, string) {
+		return 200, `{"id":"msg_x","type":"message","role":"assistant","content":[{"type":"text","text":"hello from fake anthropic"}],"model":"claude-fake","usage":{"input_tokens":3,"output_tokens":5}}`, "application/json"
+	})
+	f.srv = httptest.NewServer(http.HandlerFunc(f.serve))
+	return f
+}
+
+func (f *fakeAnthropicUpstreamServer) serve(w http.ResponseWriter, r *http.Request) {
+	atomic.AddInt32(&f.recorder.RequestHits, 1)
+	body, _ := io.ReadAll(r.Body)
+	f.recorder.mu.Lock()
+	f.recorder.Method = r.Method
+	f.recorder.Path = r.URL.Path
+	f.recorder.Header = r.Header.Clone()
+	f.recorder.Body = body
+	f.recorder.mu.Unlock()
+
+	f.mu.Lock()
+	script := f.script
+	f.mu.Unlock()
+	status, replyBody, contentType := script(body)
+	w.Header().Set("Content-Type", contentType)
+	w.WriteHeader(status)
+	_, _ = io.WriteString(w, replyBody)
+}
+
+func (f *fakeAnthropicUpstreamServer) URL() string { return f.srv.URL }
+func (f *fakeAnthropicUpstreamServer) Close()      { f.srv.Close() }
+
+func (f *fakeAnthropicUpstreamServer) SetScript(script func(req []byte) (status int, body string, contentType string)) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.script = script
+}
+
+func (f *fakeAnthropicUpstreamServer) Snapshot() (method, path string, hdr http.Header, body []byte) {
+	return f.recorder.snapshot()
+}
+
+func (f *fakeAnthropicUpstreamServer) DecodedBody() map[string]any {
+	_, _, _, body := f.Snapshot()
+	var m map[string]any
+	_ = json.Unmarshal(body, &m)
+	return m
+}
+
+// streamingOpenAIToolCallScript returns an SSE response that announces
+// a single tool call broken across delta fragments. The wire shape
+// matches what OpenAI actually emits; used to verify cloud-proxy
+// translate-mode preserves tool calls through HTTP.
+func streamingOpenAIToolCallScript() (status int, body string, contentType string) {
+	frames := []string{
+		`{"choices":[{"index":0,"delta":{"role":"assistant","tool_calls":[{"index":0,"id":"call_e2e","type":"function","function":{"name":"get_weather"}}]}}]}`,
+		`{"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"{\"location\":\"SF\"}"}}]}}]}`,
+		`{"choices":[{"index":0,"delta":{},"finish_reason":"tool_calls"}]}`,
+	}
+	var b strings.Builder
+	for _, f := range frames {
+		b.WriteString("data: ")
+		b.WriteString(f)
+		b.WriteString("\n\n")
+	}
+	b.WriteString("data: [DONE]\n\n")
+	return 200, b.String(), "text/event-stream"
+}
+
+// nonStreamingOpenAIToolCallScript returns a non-streaming tool-call
+// response with id/name/arguments fully populated.
+func nonStreamingOpenAIToolCallScript() (status int, body string, contentType string) {
+	return 200, `{"id":"chatcmpl-y","choices":[{"index":0,"message":{"role":"assistant","content":"","tool_calls":[{"id":"call_lookup","type":"function","function":{"name":"lookup","arguments":"{\"q\":\"clouds\"}"}}]},"finish_reason":"tool_calls"}],"usage":{"prompt_tokens":12,"completion_tokens":7,"total_tokens":19}}`, "application/json"
+}
+
+// emailLeakOpenAIScript returns a non-streaming response containing an
+// email address. The streaming PII filter doesn't apply to buffered
+// responses, but the response is JSON the client receives unchanged —
+// used to verify the wire path without PII assertions. The streaming
+// PII variant uses an SSE response.
+func emailLeakOpenAIStreamingScript() (status int, body string, contentType string) {
+	frames := []string{
+		`{"choices":[{"index":0,"delta":{"content":"contact alice@"}}]}`,
+		`{"choices":[{"index":0,"delta":{"content":"example.com please"}}]}`,
+		`{"choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}`,
+	}
+	var b strings.Builder
+	for _, f := range frames {
+		b.WriteString("data: ")
+		b.WriteString(f)
+		b.WriteString("\n\n")
+	}
+	b.WriteString("data: [DONE]\n\n")
+	return 200, b.String(), "text/event-stream"
+}
diff --git a/tests/e2e/e2e_cloud_proxy_test.go b/tests/e2e/e2e_cloud_proxy_test.go
new file mode 100644
index 000000000..d1b012e10
--- /dev/null
+++ b/tests/e2e/e2e_cloud_proxy_test.go
@@ -0,0 +1,268 @@
+package e2e_test
+
+import (
+	"context"
+	"encoding/json"
+	"io"
+	"net/http"
+	"strings"
+
+	"github.com/openai/openai-go/v3"
+	"github.com/openai/openai-go/v3/option"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Cloud-proxy e2e tests drive real HTTP requests through LocalAI ->
+// cloud-proxy backend (separate process) -> fake upstream httptest
+// server. The whole pipeline is exercised: chat handler dispatch,
+// gRPC client/server, cloud-proxy translation, upstream call,
+// response forwarding back to the client.
+var _ = Describe("Cloud-proxy backend E2E", func() {
+	BeforeEach(func() {
+		if cloudProxyPath == "" {
+			Skip("cloud-proxy backend binary not built (make build-cloud-proxy-backend)")
+		}
+		// Reset upstream scripts + counters between specs so a previous
+		// spec's hits don't leak in. The default script is restored by
+		// each spec that needs a custom one.
+		cpOpenAIUpstream.SetScript(defaultOpenAIScript)
+		cpAnthropicUpstream.SetScript(defaultAnthropicScript)
+	})
+
+	Context("Passthrough mode — OpenAI shape", func() {
+		It("forwards a chat completion request verbatim and pipes the response back", func() {
+			cpOpenAIUpstream.SetScript(func([]byte) (int, string, string) {
+				return 200, `{"id":"resp-pt","choices":[{"index":0,"message":{"role":"assistant","content":"hi via passthrough"},"finish_reason":"stop"}],"usage":{"prompt_tokens":4,"completion_tokens":3,"total_tokens":7}}`, "application/json"
+			})
+
+			cp := openai.NewClient(option.WithBaseURL(apiURL))
+			resp, err := cp.Chat.Completions.New(context.TODO(), openai.ChatCompletionNewParams{
+				Model: "cp-passthrough-openai",
+				Messages: []openai.ChatCompletionMessageParamUnion{
+					openai.UserMessage("hello"),
+				},
+			})
+			Expect(err).NotTo(HaveOccurred())
+			Expect(resp.Choices).NotTo(BeEmpty())
+			Expect(resp.Choices[0].Message.Content).To(Equal("hi via passthrough"))
+
+			// Upstream observed an Authorization header sourced from
+			// the api_key_env we set at suite startup.
+			_, _, hdr, _ := cpOpenAIUpstream.Snapshot()
+			Expect(hdr.Get("Authorization")).To(Equal("Bearer sk-e2e-openai"))
+			// Body field assertions prove the wire format wasn't
+			// rewritten — passthrough mode shouldn't touch tools,
+			// messages, etc.
+			body := cpOpenAIUpstream.DecodedBody()
+			Expect(body["messages"]).NotTo(BeNil())
+		})
+	})
+
+	Context("Passthrough mode — Anthropic shape", func() {
+		It("forwards an Anthropic Messages request with x-api-key + anthropic-version", func() {
+			cpAnthropicUpstream.SetScript(func([]byte) (int, string, string) {
+				return 200, `{"id":"msg-pt","type":"message","role":"assistant","content":[{"type":"text","text":"hi via passthrough anthropic"}],"model":"claude","usage":{"input_tokens":4,"output_tokens":6}}`, "application/json"
+			})
+
+			// Anthropic SDK omitted to keep the test self-contained;
+			// raw POST exercises the same path. The Anthropic endpoint
+			// is /v1/messages on LocalAI.
+			reqBody := `{"model":"cp-passthrough-anthropic","max_tokens":64,"messages":[{"role":"user","content":"hello"}]}`
+			httpResp, err := http.Post(anthropicBaseURL+"/v1/messages", "application/json", strings.NewReader(reqBody))
+			Expect(err).NotTo(HaveOccurred())
+			defer func() { _ = httpResp.Body.Close() }()
+			Expect(httpResp.StatusCode).To(Equal(200))
+
+			respBody, _ := io.ReadAll(httpResp.Body)
+			Expect(string(respBody)).To(ContainSubstring("hi via passthrough anthropic"))
+
+			_, _, hdr, _ := cpAnthropicUpstream.Snapshot()
+			Expect(hdr.Get("x-api-key")).To(Equal("sk-ant-e2e"))
+			Expect(hdr.Get("anthropic-version")).NotTo(BeEmpty())
+			Expect(hdr.Get("Authorization")).To(BeEmpty(), "Authorization leaked on anthropic backend")
+		})
+	})
+
+	Context("Translate mode — OpenAI provider", func() {
+		// The chat handler only emits tool_calls in the response when
+		// the client asked for tools. The translate backend forwards
+		// whatever the upstream returns, but the endpoint-level
+		// assembly is gated on the request shape — same as for local
+		// models. The e2e tests therefore declare tools on the
+		// outbound request so the response-side assembly fires.
+		toolsParam := []openai.ChatCompletionToolUnionParam{
+			openai.ChatCompletionFunctionTool(openai.FunctionDefinitionParam{
+				Name:        "lookup",
+				Description: openai.String("look something up"),
+				Parameters: openai.FunctionParameters{
+					"type": "object",
+					"properties": map[string]any{
+						"q": map[string]any{"type": "string"},
+					},
+				},
+			}),
+		}
+
+		It("delivers tool_calls in the chat completion response", func() {
+			cpOpenAIUpstream.SetScript(func([]byte) (int, string, string) {
+				return nonStreamingOpenAIToolCallScript()
+			})
+
+			cp := openai.NewClient(option.WithBaseURL(apiURL))
+			resp, err := cp.Chat.Completions.New(context.TODO(), openai.ChatCompletionNewParams{
+				Model: "cp-translate-openai",
+				Messages: []openai.ChatCompletionMessageParamUnion{
+					openai.UserMessage("find clouds"),
+				},
+				Tools: toolsParam,
+			})
+			Expect(err).NotTo(HaveOccurred())
+			Expect(resp.Choices).NotTo(BeEmpty())
+			tcs := resp.Choices[0].Message.ToolCalls
+			Expect(tcs).To(HaveLen(1), "tool_calls should survive translate-mode round-trip")
+			Expect(tcs[0].Function.Name).To(Equal("lookup"))
+			Expect(tcs[0].Function.Arguments).To(ContainSubstring(`"q":"clouds"`))
+			// Token usage propagated from upstream.
+			Expect(resp.Usage.PromptTokens).To(BeNumerically(">", 0))
+		})
+
+		It("streams tool_call deltas through SSE", func() {
+			cpOpenAIUpstream.SetScript(func([]byte) (int, string, string) {
+				return streamingOpenAIToolCallScript()
+			})
+
+			cp := openai.NewClient(option.WithBaseURL(apiURL))
+			stream := cp.Chat.Completions.NewStreaming(context.TODO(), openai.ChatCompletionNewParams{
+				Model: "cp-translate-openai",
+				Messages: []openai.ChatCompletionMessageParamUnion{
+					openai.UserMessage("what's the weather in SF?"),
+				},
+				Tools: []openai.ChatCompletionToolUnionParam{
+					openai.ChatCompletionFunctionTool(openai.FunctionDefinitionParam{
+						Name:        "get_weather",
+						Description: openai.String("look up the weather"),
+						Parameters: openai.FunctionParameters{
+							"type": "object",
+							"properties": map[string]any{
+								"location": map[string]any{"type": "string"},
+							},
+						},
+					}),
+				},
+			})
+
+			var toolID, toolName string
+			var args strings.Builder
+			for stream.Next() {
+				chunk := stream.Current()
+				for _, ch := range chunk.Choices {
+					for _, tc := range ch.Delta.ToolCalls {
+						if tc.ID != "" {
+							toolID = tc.ID
+						}
+						if tc.Function.Name != "" {
+							toolName = tc.Function.Name
+						}
+						args.WriteString(tc.Function.Arguments)
+					}
+				}
+			}
+			Expect(stream.Err()).NotTo(HaveOccurred())
+			Expect(toolID).To(Equal("call_e2e"))
+			Expect(toolName).To(Equal("get_weather"))
+			// Argument fragments assembled in order.
+			var parsed map[string]any
+			Expect(json.Unmarshal([]byte(args.String()), &parsed)).To(Succeed())
+			Expect(parsed["location"]).To(Equal("SF"))
+		})
+	})
+
+	Context("Translate mode — Anthropic provider", func() {
+		It("preserves tool_use blocks through Messages API", func() {
+			cpAnthropicUpstream.SetScript(func([]byte) (int, string, string) {
+				return 200, `{"id":"msg-tu","type":"message","role":"assistant","content":[{"type":"text","text":"Let me check"},{"type":"tool_use","id":"toolu_e2e","name":"weather","input":{"location":"SF"}}],"model":"claude","usage":{"input_tokens":7,"output_tokens":12}}`, "application/json"
+			})
+
+			// Anthropic Messages endpoint exposes tool_use blocks
+			// directly. Raw POST + JSON decode keeps the test
+			// independent of any specific SDK version's accessor API.
+			// Tools declared on the request so the response-side
+			// assembly populates the tool_use blocks (same gate as
+			// for local models).
+			reqBody := `{"model":"cp-translate-anthropic","max_tokens":64,"messages":[{"role":"user","content":"what's the weather?"}],"tools":[{"name":"weather","description":"weather lookup","input_schema":{"type":"object","properties":{"location":{"type":"string"}}}}]}`
+			httpResp, err := http.Post(anthropicBaseURL+"/v1/messages", "application/json", strings.NewReader(reqBody))
+			Expect(err).NotTo(HaveOccurred())
+			defer func() { _ = httpResp.Body.Close() }()
+			Expect(httpResp.StatusCode).To(Equal(200))
+
+			var decoded map[string]any
+			Expect(json.NewDecoder(httpResp.Body).Decode(&decoded)).To(Succeed())
+			contentArr, ok := decoded["content"].([]any)
+			Expect(ok).To(BeTrue(), "response must carry content array")
+			var sawToolUse bool
+			for _, block := range contentArr {
+				m := block.(map[string]any)
+				if m["type"] == "tool_use" {
+					sawToolUse = true
+					Expect(m["name"]).To(Equal("weather"))
+					// Anthropic content-block assembly synthesizes
+					// tool_use IDs from the LocalAI request ID rather
+					// than passing through the upstream's toolu_* ID
+					// (see messages.go:253-267). Documenting the
+					// current behavior — the synthesized ID still
+					// follows the toolu_ prefix convention so SDK
+					// validation passes.
+					id, _ := m["id"].(string)
+					Expect(id).To(HavePrefix("toolu_"))
+					input, _ := m["input"].(map[string]any)
+					Expect(input["location"]).To(Equal("SF"))
+				}
+			}
+			Expect(sawToolUse).To(BeTrue(), "tool_use block must survive translate-mode round-trip")
+		})
+	})
+
+	Context("Translate mode + PII filter", func() {
+		It("applies the streaming PII filter to translate-mode content", func() {
+			// Default PII config redacts email addresses. Split the
+			// email across two SSE deltas so the filter has to buffer
+			// the partial match — proves the streaming filter is wired
+			// up in translate mode, not just passthrough.
+			cpOpenAIUpstream.SetScript(func([]byte) (int, string, string) {
+				return emailLeakOpenAIStreamingScript()
+			})
+
+			cp := openai.NewClient(option.WithBaseURL(apiURL))
+			stream := cp.Chat.Completions.NewStreaming(context.TODO(), openai.ChatCompletionNewParams{
+				Model: "cp-translate-openai",
+				Messages: []openai.ChatCompletionMessageParamUnion{
+					openai.UserMessage("share contact info"),
+				},
+			})
+
+			var assembled strings.Builder
+			for stream.Next() {
+				for _, ch := range stream.Current().Choices {
+					assembled.WriteString(ch.Delta.Content)
+				}
+			}
+			Expect(stream.Err()).NotTo(HaveOccurred())
+			out := assembled.String()
+			// If PII is wired up, the email is redacted before reaching
+			// the client. If not, "alice@example.com" leaks through.
+			// This is the lock-in test for gap #3.
+			Expect(out).NotTo(ContainSubstring("alice@example.com"),
+				"email leaked through translate-mode stream — PII filter not applied")
+		})
+	})
+})
+
+func defaultOpenAIScript([]byte) (int, string, string) {
+	return 200, `{"id":"chatcmpl-default","choices":[{"index":0,"message":{"role":"assistant","content":"default openai reply"},"finish_reason":"stop"}],"usage":{"prompt_tokens":1,"completion_tokens":1,"total_tokens":2}}`, "application/json"
+}
+
+func defaultAnthropicScript([]byte) (int, string, string) {
+	return 200, `{"id":"msg-default","type":"message","role":"assistant","content":[{"type":"text","text":"default anthropic reply"}],"model":"claude","usage":{"input_tokens":1,"output_tokens":1}}`, "application/json"
+}
diff --git a/tests/e2e/e2e_suite_test.go b/tests/e2e/e2e_suite_test.go
index 94cb05aad..65bb9b852 100644
--- a/tests/e2e/e2e_suite_test.go
+++ b/tests/e2e/e2e_suite_test.go
@@ -38,8 +38,14 @@ var (
 	apiPort           int
 	apiURL            string
 	mockBackendPath   string
+	cloudProxyPath    string
 	mcpServerURL      string
 	mcpServerShutdown func()
+
+	// Cloud-proxy fake upstreams. Live for the whole suite so the four
+	// cloud-proxy model YAMLs can point at their URLs at startup time.
+	cpOpenAIUpstream    *fakeOpenAIUpstreamServer
+	cpAnthropicUpstream *fakeAnthropicUpstreamServer
 )
 
 var _ = BeforeSuite(func() {
@@ -285,6 +291,96 @@ var _ = BeforeSuite(func() {
 		systemOpts = append(systemOpts, system.WithBackendPath(backendPath))
 	}
 
+	// Cloud-proxy backend e2e setup. The cloud-proxy binary lives next
+	// to mock-backend and is registered under its canonical "cloud-proxy"
+	// name. Fake upstreams come up first so the model YAMLs can encode
+	// their URLs at startup time. Build is best-effort — when the binary
+	// isn't present, the cloud-proxy specs Skip and the rest of the
+	// suite is unaffected.
+	cloudProxyCandidates := []string{
+		filepath.Join("..", "e2e", "mock-backend", "cloud-proxy"),
+		filepath.Join("tests", "e2e", "mock-backend", "cloud-proxy"),
+		filepath.Join("..", "..", "tests", "e2e", "mock-backend", "cloud-proxy"),
+	}
+	for _, p := range cloudProxyCandidates {
+		if _, err := os.Stat(p); err == nil {
+			cloudProxyPath = p
+			break
+		}
+	}
+	if cloudProxyPath != "" {
+		Expect(os.Chmod(cloudProxyPath, 0755)).To(Succeed())
+
+		cpOpenAIUpstream = newFakeOpenAIUpstream()
+		cpAnthropicUpstream = newFakeAnthropicUpstream()
+
+		// API keys are read from env vars — set placeholder values so
+		// the cloud-proxy backend's Load() doesn't fail with "unset".
+		// The fake upstreams accept any auth header.
+		Expect(os.Setenv("CLOUD_PROXY_E2E_OPENAI_KEY", "sk-e2e-openai")).To(Succeed())
+		Expect(os.Setenv("CLOUD_PROXY_E2E_ANTHROPIC_KEY", "sk-ant-e2e")).To(Succeed())
+
+		cloudProxyConfigs := []map[string]any{
+			{
+				"name":    "cp-passthrough-openai",
+				"backend": "cloud-proxy",
+				"parameters": map[string]any{
+					"model": "cloud-proxy-passthrough-openai.bin",
+				},
+				"proxy": map[string]any{
+					"mode":         "passthrough",
+					"provider":     "openai",
+					"upstream_url": cpOpenAIUpstream.URL() + "/v1/chat/completions",
+					"api_key_env":  "CLOUD_PROXY_E2E_OPENAI_KEY",
+				},
+			},
+			{
+				"name":    "cp-passthrough-anthropic",
+				"backend": "cloud-proxy",
+				"parameters": map[string]any{
+					"model": "cloud-proxy-passthrough-anthropic.bin",
+				},
+				"proxy": map[string]any{
+					"mode":         "passthrough",
+					"provider":     "anthropic",
+					"upstream_url": cpAnthropicUpstream.URL() + "/v1/messages",
+					"api_key_env":  "CLOUD_PROXY_E2E_ANTHROPIC_KEY",
+				},
+			},
+			{
+				"name":    "cp-translate-openai",
+				"backend": "cloud-proxy",
+				"parameters": map[string]any{
+					"model": "cloud-proxy-translate-openai.bin",
+				},
+				"proxy": map[string]any{
+					"mode":         "translate",
+					"provider":     "openai",
+					"upstream_url": cpOpenAIUpstream.URL() + "/v1/chat/completions",
+					"api_key_env":  "CLOUD_PROXY_E2E_OPENAI_KEY",
+				},
+			},
+			{
+				"name":    "cp-translate-anthropic",
+				"backend": "cloud-proxy",
+				"parameters": map[string]any{
+					"model": "cloud-proxy-translate-anthropic.bin",
+				},
+				"proxy": map[string]any{
+					"mode":         "translate",
+					"provider":     "anthropic",
+					"upstream_url": cpAnthropicUpstream.URL() + "/v1/messages",
+					"api_key_env":  "CLOUD_PROXY_E2E_ANTHROPIC_KEY",
+				},
+			},
+		}
+		for _, cfg := range cloudProxyConfigs {
+			data, err := yaml.Marshal(cfg)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(os.WriteFile(filepath.Join(modelsPath, cfg["name"].(string)+".yaml"), data, 0644)).To(Succeed())
+		}
+	}
+
 	systemState, err := system.GetSystemState(systemOpts...)
 	Expect(err).ToNot(HaveOccurred())
 
@@ -305,6 +401,9 @@ var _ = BeforeSuite(func() {
 	// Register mock backend (always available for non-realtime tests).
 	application.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)
 	application.ModelLoader().SetExternalBackend("opus", mockBackendPath)
+	if cloudProxyPath != "" {
+		application.ModelLoader().SetExternalBackend("cloud-proxy", cloudProxyPath)
+	}
 
 	// Create HTTP app
 	app, err = httpapi.API(application)
@@ -348,6 +447,12 @@ var _ = AfterSuite(func() {
 	if mcpServerShutdown != nil {
 		mcpServerShutdown()
 	}
+	if cpOpenAIUpstream != nil {
+		cpOpenAIUpstream.Close()
+	}
+	if cpAnthropicUpstream != nil {
+		cpAnthropicUpstream.Close()
+	}
 	if tmpDir != "" {
 		os.RemoveAll(tmpDir)
 	}
diff --git a/tests/e2e/mock-backend/main.go b/tests/e2e/mock-backend/main.go
index ec0c7735d..46c4e51d6 100644
--- a/tests/e2e/mock-backend/main.go
+++ b/tests/e2e/mock-backend/main.go
@@ -315,11 +315,18 @@ func (m *MockBackend) PredictStream(in *pb.PredictOptions, stream pb.Backend_Pre
 
 	var toStream string
 	toolName := mockToolNameFromRequest(in)
-	if toolName != "" && !promptHasToolResults(in.Prompt) {
+	switch {
+	case toolName != "" && !promptHasToolResults(in.Prompt):
 		toStream = fmt.Sprintf(`{"name": "%s", "arguments": {"location": "San Francisco"}}`, toolName)
-	} else if toolName != "" {
+	case toolName != "":
 		toStream = "Based on the tool results, the weather in San Francisco is sunny, 72°F."
-	} else {
+	case strings.Contains(in.Prompt, "MOCK_LEAK_EMAIL"):
+		// PII streaming test fixture: emit a response containing an email
+		// address so the streaming PII filter has something to mask. The
+		// content is split character-by-character below, so the mask
+		// must hold across chunk boundaries.
+		toStream = "Sure — here it is: alice@example.com is the address."
+	default:
 		toStream = "This is a mocked streaming response."
 	}
 	for i, r := range toStream {