feat(middleware): Model routing, PII filtering, Cloud model proxies (#9802)

Add a routing middleware stack and a cloud-proxy backend. * cloud-proxy: a Go gRPC backend that forwards OpenAI- and Anthropic-shaped chat requests to upstream providers, with an optional translate mode (OpenAI request -> Anthropic /v1/messages -> OpenAI response) and full tool-calling support. * routing: admission control, content-aware model routing (embedding cache + classifier + rerank + Arch-Router score), PII detection/redaction (regex + NER) with streaming filter and OpenAI/Anthropic adapters, and a per-user/per-key billing recorder backed by GORM or in-memory storage. * middleware: UsageMiddleware records usage via the billing recorder, plus admission, route-model, usage-stamp and trace middlewares. * observability: BackendTrace ring buffer stores full request bodies (capped), MITM proxy emits structured trace events, and router classifier decisions surface at /api/router/decide. * gallery: Arch-Router-1.5B (Q4_K_M and Q8_0). * UI: cloud-proxy model-editor fields, classifier system-prompt and score-normalization config, and a Traces page rendering request bodies. Assisted-by: claude-code:claude-opus-4-7 [Read] [Edit] [Bash] Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-07-29 17:38:21 -04:00 · 2026-05-25 08:28:27 +01:00
parent 1dcd1ae915
commit 6a80e23733
229 changed files with 26339 additions and 1030 deletions
--- a/backend/python/transformers/backend.py
+++ b/backend/python/transformers/backend.py
@@ -26,7 +26,7 @@ import torch.cuda

 XPU=os.environ.get("XPU", "0") == "1"
 import transformers as transformers_module
-from transformers import AutoTokenizer, AutoModel, AutoProcessor, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria
+from transformers import AutoTokenizer, AutoModel, AutoProcessor, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria, pipeline
 from scipy.io import wavfile
 from sentence_transformers import SentenceTransformer

@@ -200,6 +200,21 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                autoTokenizer = False
                self.model = SentenceTransformer(model_name, trust_remote_code=request.TrustRemoteCode)
                self.SentenceTransformer = True
+            elif request.Type == "TokenClassification":
+                # NER / PII tagging via HuggingFace's token-classification
+                # pipeline. aggregation_strategy="simple" merges B-/I- tags
+                # into single spans and gives byte offsets back. The
+                # tokenizer is bundled inside the pipeline, so we skip the
+                # AutoTokenizer load below.
+                autoTokenizer = False
+                self.tokenClassifier = pipeline(
+                    "token-classification",
+                    model=model_name,
+                    aggregation_strategy="simple",
+                    device=0 if self.CUDA else -1,
+                    trust_remote_code=request.TrustRemoteCode,
+                )
+                self.TokenClassification = True
            else:
                # Generic: dynamically resolve model class from transformers
                model_type = TYPE_ALIASES.get(request.Type, request.Type)
@@ -253,6 +268,39 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)

+    def TokenClassify(self, request, context):
+        # Runs HuggingFace's token-classification pipeline and returns
+        # the aggregated entity spans. The pipeline gives us byte
+        # offsets via aggregation_strategy="simple" (set at load
+        # time), so the caller can slice the original text without
+        # re-tokenising on the Go side.
+        if not getattr(self, "TokenClassification", False):
+            context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
+            context.set_details("model was not loaded as Type=TokenClassification")
+            return backend_pb2.TokenClassifyResponse()
+        try:
+            results = self.tokenClassifier(request.text)
+        except Exception as err:
+            print("TokenClassify error:", err, file=sys.stderr)
+            context.set_code(grpc.StatusCode.INTERNAL)
+            context.set_details(f"token-classification failed: {err}")
+            return backend_pb2.TokenClassifyResponse()
+
+        threshold = request.threshold if request.threshold > 0 else 0.0
+        entities = []
+        for r in results:
+            score = float(r.get("score", 0.0))
+            if score < threshold:
+                continue
+            entities.append(backend_pb2.TokenClassifyEntity(
+                entity_group=str(r.get("entity_group") or r.get("entity") or ""),
+                start=int(r.get("start", 0)),
+                end=int(r.get("end", 0)),
+                score=score,
+                text=str(r.get("word", "")),
+            ))
+        return backend_pb2.TokenClassifyResponse(entities=entities)
+
    def Embedding(self, request, context):
        set_seed(request.Seed)
        # Tokenize input