feat(pii): NER tier engine — privacy-filter.cpp backend + NER-centric PII filter (#10360)

Squashed feat/pii-ner-tier-engine rebased onto master (was 45 commits; see
backup/pii-ner-tier-engine-prerebase). Net change:

- privacy-filter.cpp: standalone GGML engine for the openai-privacy-filter
  PII/NER token classifier, wired as a LocalAI gRPC backend (CPU/CUDA/Vulkan).
  TokenClassify moves off the patched llama.cpp path onto this backend.
- PII filter reworked to be NER-centric (encoder/NER detection tier scanning
  whole conversations as one document), with a recreated bounded restricted-
  regex secret-matching pattern detector tier alongside it (per-model
  pii_detection.builtins / .patterns + core/services/routing/piipattern).
- Detection labelled by source (ner vs pattern); backend trace / confidence /
  debug observability; analyze/redact exposed as a synchronous API.
- Instance-wide default detector policy + per-usecase default-on; request
  filtering extended to completions, embeddings, edits & Ollama.
- React UI: NER-centric PII editor, detector-models table, pattern/builtins
  editor, middleware default-policy UI.
- Gallery: privacy-filter-multilingual token-classify model + NER install
  filter; token_classify known_usecase; batch sized to context for NER models.
  privacy-filter backend registered in the backend gallery (cpu/vulkan/cuda-13
  meta + image entries with a capabilities map) matching its CI matrix jobs,
  and an /import-model auto-detect importer (PrivacyFilterImporter, narrow
  privacy-filter GGUF detection) replacing the prior pref-only registration.

Reconciled against master's independent evolution:

- Dropped master's PIIPatternOverrides feature (global-pattern runtime
  overrides + /api/pii/patterns API + runtime_settings.json persistence). The
  per-model NER + pattern-detector design supersedes it; it was built on the
  global redactor pattern set this branch replaced.
- Reverted the llama.cpp Score carry-patch (0006-server-task-type-score):
  removed the patch and restored master's grpc-server.cpp Score RPC (direct
  llama_decode, slot-loop bypass) and LLAMA_VERSION pin, plus master's
  model_config validation forbidding score + chat/completion/embeddings on
  llama-cpp. token_classify is unaffected (it runs on the privacy-filter
  backend, not llama-cpp).

Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Richard Palethorpe <io@richiejp.com>
This commit is contained in:
Richard Palethorpe
2026-06-18 11:45:22 +01:00
committed by GitHub
parent c133ca39dc
commit 3fa7b2955c
134 changed files with 6671 additions and 4223 deletions

9
backend/cpp/privacy-filter/.gitignore vendored Normal file
View File

@@ -0,0 +1,9 @@
/privacy-filter.cpp
build/
package/
grpc-server
*.o
backend.pb.cc
backend.pb.h
backend.grpc.pb.cc
backend.grpc.pb.h

View File

@@ -0,0 +1,69 @@
cmake_minimum_required(VERSION 3.21)
project(privacy-filter-grpc-server LANGUAGES CXX C)

# Require C++17 for every target configured below; fail the configure step
# rather than silently falling back to an older standard.
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 17)

# Name of the executable target produced by this project.
set(TARGET grpc-server)

# Location of the privacy-filter.cpp engine source tree. Defaults to a
# directory next to this file, which the Makefile populates (clone of a pinned
# commit, or a symlink to PRIVACY_FILTER_SRC). Being a cache entry, it can be
# overridden with -DPRIVACY_FILTER_DIR=... on the cmake command line.
set(PRIVACY_FILTER_DIR
    "${CMAKE_CURRENT_SOURCE_DIR}/privacy-filter.cpp"
    CACHE PATH "Path to the privacy-filter.cpp engine source tree")
find_package(Threads REQUIRED)

# Protobuf: prefer the package's own CMake config; if that is not installed,
# fall back to a regular (module-mode) lookup, which is mandatory.
find_package(Protobuf CONFIG QUIET)
if(NOT Protobuf_FOUND)
  find_package(Protobuf REQUIRED)
endif()

# gRPC: same idea, but there is no find module to fall back on, so the imported
# targets the rest of this file links against are synthesised by hand.
find_package(gRPC CONFIG QUIET)
if(NOT gRPC_FOUND)
  # Ubuntu's apt-installed grpc++ does not ship a CMake config - fall back.
  find_library(GRPCPP_LIB grpc++ REQUIRED)
  find_library(GRPCPP_REFLECTION_LIB grpc++_reflection REQUIRED)

  add_library(gRPC::grpc++ INTERFACE IMPORTED)
  set_property(TARGET gRPC::grpc++
               PROPERTY INTERFACE_LINK_LIBRARIES "${GRPCPP_LIB}")

  add_library(gRPC::grpc++_reflection INTERFACE IMPORTED)
  set_property(TARGET gRPC::grpc++_reflection
               PROPERTY INTERFACE_LINK_LIBRARIES "${GRPCPP_REFLECTION_LIB}")
endif()
# Code generators used at build time to turn backend.proto into C++ sources.
find_program(_PROTOC NAMES protoc REQUIRED)
find_program(_GRPC_CPP_PLUGIN NAMES grpc_cpp_plugin REQUIRED)

# backend.proto lives two directories above this CMakeLists.txt; HW_PROTO_PATH
# is its containing directory, used as protoc's import root.
get_filename_component(HW_PROTO "${CMAKE_CURRENT_SOURCE_DIR}/../../backend.proto" ABSOLUTE)
get_filename_component(HW_PROTO_PATH "${HW_PROTO}" PATH)

# Generated files are written to the build tree, never the source tree.
set(HW_PROTO_SRCS "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.cc")
set(HW_PROTO_HDRS "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.h")
set(HW_GRPC_SRCS "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.cc")
set(HW_GRPC_HDRS "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.h")

# Regenerate the protobuf + gRPC stubs whenever backend.proto changes.
# VERBATIM makes argument escaping identical across generators; for that to be
# correct the --plugin option is passed as ONE quoted argument (no embedded
# quotes that would otherwise reach protoc literally).
add_custom_command(
  OUTPUT "${HW_PROTO_SRCS}" "${HW_PROTO_HDRS}" "${HW_GRPC_SRCS}" "${HW_GRPC_HDRS}"
  COMMAND "${_PROTOC}"
          --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
          --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
          -I "${HW_PROTO_PATH}"
          "--plugin=protoc-gen-grpc=${_GRPC_CPP_PLUGIN}"
          "${HW_PROTO}"
  DEPENDS "${HW_PROTO}"
  VERBATIM)

# Static library holding the generated stubs. Consumers get the build
# directory on their include path so they can #include "backend.pb.h".
add_library(hw_grpc_proto STATIC
  ${HW_GRPC_SRCS} ${HW_GRPC_HDRS}
  ${HW_PROTO_SRCS} ${HW_PROTO_HDRS})
target_include_directories(hw_grpc_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
# The generated sources include gRPC and protobuf headers, so the library must
# carry those usage requirements itself (include dirs / definitions / link
# deps) instead of relying on the headers being on the default search path.
target_link_libraries(hw_grpc_proto PUBLIC
  gRPC::grpc++
  protobuf::libprotobuf)
# Pull in the engine tree, restricted to its `pf` static library (+ ggml): the
# engine's tool and test switches are forced OFF in the shared cache before the
# subdirectory is processed. PF_VULKAN is not touched here - it is honored when
# passed on the cmake command line, because it lands in the same cache the
# engine reads.
set(PF_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(PF_BUILD_TOOLS OFF CACHE BOOL "" FORCE)
add_subdirectory(${PRIVACY_FILTER_DIR} ${CMAKE_CURRENT_BINARY_DIR}/privacy-filter.cpp)

# The backend binary itself: a single translation unit linked against the
# engine, the generated proto stubs, and the gRPC / protobuf / threads runtimes.
add_executable(${TARGET} grpc-server.cpp)
target_link_libraries(${TARGET}
  PRIVATE
    pf
    hw_grpc_proto
    gRPC::grpc++
    gRPC::grpc++_reflection
    protobuf::libprotobuf
    Threads::Threads)

View File

@@ -0,0 +1,77 @@
# privacy-filter backend Makefile.
#
# Wraps the standalone privacy-filter.cpp GGML engine (the openai-privacy-filter
# PII/NER token classifier) as a LocalAI gRPC backend. The engine source is
# fetched at the pin below — .github/workflows/bump_deps.yaml finds and updates
# PRIVACY_FILTER_VERSION, matching the llama-cpp / ds4 convention.
#
# Local development: point at a working checkout instead of cloning, e.g.
# make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
# Engine pin (commit + repository) and the optional local-checkout override.
PRIVACY_FILTER_VERSION?=646342f7a59c6b7d195185eac60bad762e572f1d
PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
PRIVACY_FILTER_SRC?=

# Directory containing this Makefile (absolute), and the cmake build directory.
CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
BUILD_DIR := build

# Knobs callers may override: build flavour, native tuning, parallelism, and
# the base cmake arguments. JOBS falls back to 4 when nproc is unavailable.
BUILD_TYPE ?=
NATIVE ?= false
JOBS ?= $(shell nproc 2>/dev/null || echo 4)
CMAKE_ARGS ?= -DCMAKE_BUILD_TYPE=Release

# Map BUILD_TYPE onto engine options. An empty/unknown BUILD_TYPE adds nothing,
# i.e. the CPU build. 'cublas' is LocalAI's name for the CUDA build (matches
# llama-cpp / ds4) and turns on PF_CUDA; 'vulkan' turns on PF_VULKAN.
ifeq "$(BUILD_TYPE)" "cublas"
CMAKE_ARGS += -DPF_CUDA=ON
endif
ifeq "$(BUILD_TYPE)" "vulkan"
CMAKE_ARGS += -DPF_VULKAN=ON
endif

# Portable binaries for distribution: GGML_NATIVE is switched off unless the
# caller explicitly passes NATIVE=true.
ifneq "$(NATIVE)" "true"
CMAKE_ARGS += -DGGML_NATIVE=OFF
endif
.PHONY: grpc-server package clean purge test all

all: grpc-server

# Provide the engine sources at ./privacy-filter.cpp. With PRIVACY_FILTER_SRC
# set we symlink a local checkout (instant, no network); otherwise we clone the
# pinned commit and its ggml submodule. The directory/symlink is the target, so
# make only does this once — run 'make purge && make' to refetch after a bump.
#
# Because the directory's mere existence satisfies this target, a fetch that
# fails part-way (network error, bad pin, submodule failure) must not leave the
# directory behind: it is removed on failure so the next 'make' retries the
# fetch instead of building from an empty or partial tree. The git steps run in
# a subshell so the cleanup executes from this Makefile's working directory.
privacy-filter.cpp:
ifneq ($(PRIVACY_FILTER_SRC),)
	ln -sfn $(abspath $(PRIVACY_FILTER_SRC)) privacy-filter.cpp
else
	mkdir -p privacy-filter.cpp
	( cd privacy-filter.cpp && \
	git init -q && \
	git remote add origin $(PRIVACY_FILTER_REPO) && \
	git fetch --depth 1 origin $(PRIVACY_FILTER_VERSION) && \
	git checkout FETCH_HEAD && \
	git submodule update --init --recursive --depth 1 ) || \
	{ rm -rf privacy-filter.cpp; exit 1; }
endif
# Configure and build the backend out-of-tree in $(BUILD_DIR), then copy the
# resulting binary next to this Makefile. Requires the engine sources.
grpc-server: privacy-filter.cpp
	@echo "Building privacy-filter grpc-server ($(BUILD_TYPE)) with $(CMAKE_ARGS)"
	mkdir -p $(BUILD_DIR)
	cd $(BUILD_DIR) && cmake $(CMAKE_ARGS) $(CURRENT_MAKEFILE_DIR)
	cd $(BUILD_DIR) && cmake --build . --config Release -j $(JOBS)
	cp $(BUILD_DIR)/grpc-server grpc-server

# Assemble the distributable package/ directory from the built binary.
package: grpc-server
	bash package.sh

# No tests are run from here; this target only prints where coverage lives.
test:
	@echo "privacy-filter backend: parity/regression coverage lives in the engine repo"

# Remove build products but keep the fetched engine sources.
clean:
	rm -rf $(BUILD_DIR) grpc-server package

# Also remove the engine sources. 'privacy-filter.cpp' may be a symlink
# (PRIVACY_FILTER_SRC) — rm without a trailing slash removes the link, never
# the linked-to checkout.
purge: clean
	rm -rf privacy-filter.cpp

View File

@@ -0,0 +1,210 @@
// privacy-filter LocalAI gRPC backend.
//
// Thin shim over privacy-filter.cpp's flat C API (include/pf.h): a standalone
// GGML engine for the openai-privacy-filter token-classification model family
// (PII NER). It replaces the llama.cpp-patched TokenClassify path for this one
// model family — same GGUF files, no llama.cpp carry-patches.
//
// Only the RPCs the PII tier needs are implemented: LoadModel, TokenClassify,
// plus Health / Status / Free. Everything else inherits the generated base
// class default (UNIMPLEMENTED).
#include "backend.pb.h"
#include "backend.grpc.pb.h"
#include "pf.h"
#include <grpcpp/grpcpp.h>
#include <grpcpp/server.h>
#include <grpcpp/server_builder.h>
#include <grpcpp/ext/proto_server_reflection_plugin.h>
#include <atomic>
#include <chrono>
#include <csignal>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
using grpc::Server;
using grpc::ServerBuilder;
using grpc::ServerContext;
// NOTE: do NOT alias grpc::Status as Status — the Status RPC method below would
// shadow the type and break the other method signatures. Use GStatus instead.
using GStatus = ::grpc::Status;
using grpc::StatusCode;
namespace {
// The engine is single-model-per-process: LocalAI spawns one backend process
// per loaded model. g_mu guards (re)load against in-flight classification.
// Every RPC that reads or replaces g_ctx (Status, LoadModel, TokenClassify,
// Free) holds this mutex for its whole body.
std::mutex g_mu;
// The currently loaded engine context, or nullptr when no model is loaded.
// Only accessed with g_mu held.
pf_ctx * g_ctx = nullptr;
// The running gRPC server, published by RunServer so signal_handler can ask it
// to shut down. nullptr until the server has started.
std::atomic<Server *> g_server{nullptr};
// Pick the device string handed to the engine ("cpu" / "gpu" / "cuda" /
// "vulkan", optionally with a ":N" suffix).
//
// Resolution order:
//   1. the first entry in ModelOptions.Options that starts with "device:" -
//      everything after the prefix is returned verbatim;
//   2. otherwise "gpu" when NGPULayers is positive (a coarse "use the GPU"
//      signal; the engine then picks whichever GPU backend this binary was
//      compiled with, so one config works on both CUDA and Vulkan builds);
//   3. otherwise "cpu".
// Use "device:cuda" / "device:vulkan" to pin a backend explicitly.
std::string resolve_device(const backend::ModelOptions * opts) {
    const std::string device_prefix = "device:";
    for (const auto & option : opts->options()) {
        const bool has_prefix =
            option.compare(0, device_prefix.size(), device_prefix) == 0;
        if (has_prefix) {
            return option.substr(device_prefix.size());
        }
    }
    return opts->ngpulayers() > 0 ? "gpu" : "cpu";
}
// gRPC service wrapping the privacy-filter engine. Implements Health, Status,
// LoadModel, TokenClassify and Free; every other RPC keeps the generated base
// class behaviour. All methods that touch g_ctx hold g_mu for their whole
// body, so load / classify / free are serialized against each other.
class PrivacyFilterBackend final : public backend::Backend::Service {
public:
    // Liveness probe: always answers "OK", independent of model state.
    GStatus Health(ServerContext *, const backend::HealthMessage *,
                   backend::Reply * reply) override {
        reply->set_message("OK");
        return GStatus::OK;
    }

    // Reports READY when a model context is installed, UNINITIALIZED otherwise.
    GStatus Status(ServerContext *, const backend::HealthMessage *,
                   backend::StatusResponse * response) override {
        std::lock_guard<std::mutex> lock(g_mu);
        response->set_state(g_ctx ? backend::StatusResponse::READY
                                  : backend::StatusResponse::UNINITIALIZED);
        return GStatus::OK;
    }

    // Loads (or reloads) the model named by the request.
    //
    // Failures are reported in-band: the RPC itself returns OK and `result`
    // carries success=false plus a message. Any previously loaded model is
    // released BEFORE the new load is attempted, so a failed reload leaves the
    // backend with no model.
    GStatus LoadModel(ServerContext *, const backend::ModelOptions * request,
                      backend::Result * result) override {
        std::lock_guard<std::mutex> lock(g_mu);

        // ModelFile is the absolute path LocalAI resolves; Model is the bare
        // name. Prefer the former, fall back to the latter.
        const std::string path =
            !request->modelfile().empty() ? request->modelfile() : request->model();
        if (path.empty()) {
            result->set_success(false);
            result->set_message("no model path supplied");
            return GStatus::OK;
        }

        const std::string device = resolve_device(request);

        if (g_ctx) { pf_free(g_ctx); g_ctx = nullptr; }

        pf_ctx * ctx = pf_load(path.c_str(), device.c_str(), request->threads());
        const char * err = pf_last_error(ctx);
        // A non-null error string means the load failed. A null context with
        // no error string is treated as a failure too, so that success is
        // never reported without a usable context being installed.
        // NOTE(review): whether pf_load can return null is not visible from
        // this file - confirm against pf.h.
        if (err || !ctx) {
            result->set_success(false);
            result->set_message(std::string("privacy-filter load failed: ") +
                                (err ? err : "unknown"));
            if (ctx) { pf_free(ctx); }
            return GStatus::OK;
        }

        // ContextSize, when set, becomes the per-forward window. The engine
        // ignores values that are too small to window (<= 2*halo) and just
        // runs a single forward, so passing it through is always safe.
        if (request->contextsize() > 0) {
            pf_set_window(ctx, request->contextsize());
        }

        g_ctx = ctx;
        result->set_success(true);
        result->set_message("privacy-filter loaded (" + device + ")");
        return GStatus::OK;
    }

    // Runs the classifier over request->text() and appends one response entity
    // per span the engine reports.
    //
    // Returns FAILED_PRECONDITION when no model is loaded and INTERNAL (with
    // the engine's error text, or "unknown") when classification fails. Empty
    // input yields an empty, successful response.
    GStatus TokenClassify(ServerContext *, const backend::TokenClassifyRequest * request,
                          backend::TokenClassifyResponse * response) override {
        std::lock_guard<std::mutex> lock(g_mu);
        if (!g_ctx) {
            return GStatus(StatusCode::FAILED_PRECONDITION, "Model not loaded");
        }

        const std::string & text = request->text();
        if (text.empty()) {
            return GStatus::OK; // no text -> no entities
        }

        pf_entity * ents = nullptr;
        size_t n = 0;
        if (pf_classify(g_ctx, text.data(), text.size(), request->threshold(), &ents, &n) != 0) {
            const char * err = pf_last_error(g_ctx);
            return GStatus(StatusCode::INTERNAL,
                           std::string("TokenClassify failed: ") + (err ? err : "unknown"));
        }

        // Byte offsets are into the original UTF-8 text; the engine already
        // applied the threshold and whitespace-trimmed span edges.
        const size_t text_len = text.size();
        for (size_t i = 0; i < n; i++) {
            backend::TokenClassifyEntity * ent = response->add_entities();
            ent->set_entity_group(ents[i].label ? ents[i].label : "");
            ent->set_start(ents[i].start);
            ent->set_end(ents[i].end);
            ent->set_score(ents[i].score);

            // Clamp the span to [0, text_len] before slicing. substr() throws
            // std::out_of_range when the start lies past the end of the
            // string, and an end < start would wrap to a huge length; either
            // would otherwise turn one bad engine offset into an exception
            // (skipping pf_entities_free below) or a bogus slice. A negative
            // start casts to a huge size_t and is clamped to text_len, giving
            // an empty slice. Well-formed spans are unaffected.
            size_t span_begin = (size_t) ents[i].start;
            size_t span_end   = (size_t) ents[i].end;
            if (span_begin > text_len) { span_begin = text_len; }
            if (span_end > text_len)   { span_end = text_len; }
            if (span_end < span_begin) { span_end = span_begin; }
            ent->set_text(text.substr(span_begin, span_end - span_begin));
        }
        pf_entities_free(ents, n);
        return GStatus::OK;
    }

    // Releases the loaded model, if any. Always reports success.
    GStatus Free(ServerContext *, const backend::HealthMessage *,
                 backend::Result * result) override {
        std::lock_guard<std::mutex> lock(g_mu);
        if (g_ctx) { pf_free(g_ctx); g_ctx = nullptr; }
        result->set_success(true);
        return GStatus::OK;
    }
};
// Builds the gRPC server on `addr` (insecure credentials, 64 MiB send/receive
// message limits, default health-check service and reflection enabled),
// publishes it through g_server so signal_handler can request shutdown, and
// blocks until the server stops. Exits the process with status 1 if the
// server cannot be started on `addr`.
void RunServer(const std::string & addr) {
    PrivacyFilterBackend service;

    grpc::EnableDefaultHealthCheckService(true);
    grpc::reflection::InitProtoReflectionServerBuilderPlugin();

    ServerBuilder builder;
    builder.AddListeningPort(addr, grpc::InsecureServerCredentials());
    builder.RegisterService(&service);
    builder.SetMaxReceiveMessageSize(64 * 1024 * 1024);
    builder.SetMaxSendMessageSize(64 * 1024 * 1024);

    std::unique_ptr<Server> server(builder.BuildAndStart());
    if (!server) {
        std::cerr << "privacy-filter grpc-server: failed to bind " << addr << "\n";
        std::exit(1);
    }

    g_server = server.get();
    std::cerr << "privacy-filter grpc-server listening on " << addr << "\n";
    server->Wait();

    // Unpublish the server before `server` goes out of scope and destroys it;
    // otherwise a signal delivered after this point would make signal_handler
    // call Shutdown() on a dangling pointer.
    g_server = nullptr;
}
void signal_handler(int) {
if (auto * srv = g_server.load()) {
srv->Shutdown(std::chrono::system_clock::now() + std::chrono::seconds(3));
}
}
} // namespace
// Entry point. Parses the listen address from the command line, installs the
// SIGINT / SIGTERM handlers and runs the server until it shuts down.
//
// Recognised arguments:
//   --addr=HOST:PORT   or   --addr HOST:PORT   listen address
//                                              (default 127.0.0.1:50051)
//   --help, -h                                 print usage and exit 0
// Anything else - including a trailing "--addr" with no value - is ignored.
int main(int argc, char * argv[]) {
    const std::string addr_eq = "--addr=";
    std::string listen_addr = "127.0.0.1:50051";

    int idx = 1;
    while (idx < argc) {
        const std::string arg = argv[idx];
        if (arg.compare(0, addr_eq.size(), addr_eq) == 0) {
            listen_addr = arg.substr(addr_eq.size());
        } else if (arg == "--addr" && idx + 1 < argc) {
            // Two-token form: consume the following argument as the value.
            listen_addr = argv[idx + 1];
            ++idx;
        } else if (arg == "--help" || arg == "-h") {
            std::cout << "Usage: grpc-server --addr=HOST:PORT\n";
            return 0;
        }
        ++idx;
    }

    std::signal(SIGINT, signal_handler);
    std::signal(SIGTERM, signal_handler);

    RunServer(listen_addr);
    return 0;
}

View File

@@ -0,0 +1,39 @@
#!/bin/bash
# Assemble package/ for the from-scratch backend image: the grpc-server binary,
# run.sh, the dynamic loader, and every shared library the binary needs.
#
# Runs with `set -e`: any failing step aborts packaging.
set -e

# Directory containing this script, and the repository root three levels up.
CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."

mkdir -p "$CURDIR/package/lib"

cp -avf "$CURDIR/grpc-server" "$CURDIR/package/"
cp -rfv "$CURDIR/run.sh" "$CURDIR/package/"

# The dynamic loader, renamed to lib/ld.so so run.sh can invoke it explicitly
# (makes the image independent of the host's glibc layout). The architecture is
# inferred from which loader path exists on the build host.
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
else
    echo "package.sh: unknown architecture" >&2; exit 1
fi

# Bundle the binary's transitive shared deps (libstdc++, libgomp, and the apt
# grpc++/protobuf/absl stack) by walking ldd — robust to whichever of those are
# linked shared vs static. The loader line (no "=>") is skipped; ld.so above
# already covers it.
#
# The existence check is an explicit `if` rather than `[ -f ] && cp`: with the
# short-circuit form, a final entry that is not a regular file would leave the
# loop (the last command of this pipeline) with a non-zero status, which
# `set -e` would treat as a packaging failure even though every copy succeeded.
ldd "$CURDIR/grpc-server" | awk '$2 == "=>" && $3 ~ /^\// { print $3 }' | sort -u | \
    while read -r so; do
        if [ -f "$so" ]; then
            cp -arfLv "$so" "$CURDIR/package/lib/"
        fi
    done

# Vulkan loader / GPU libs when building the GPU variant. The helper script is
# optional: when present it is sourced with the target lib directory and its
# package_gpu_libs function is invoked.
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
    package_gpu_libs
fi

echo "privacy-filter package contents:"
ls -lah "$CURDIR/package/" "$CURDIR/package/lib/"

View File

@@ -0,0 +1,9 @@
#!/bin/bash
# Entry point for the privacy-filter backend image / BACKEND_BINARY mode.
#
# Puts the bundled lib/ directory first on the library search path, then starts
# grpc-server with all arguments forwarded. When a bundled dynamic loader
# (lib/ld.so) is present the binary is started through it; otherwise the
# binary is executed directly.
set -e

CURDIR=$(dirname "$(realpath "$0")")

# Append the inherited LD_LIBRARY_PATH only when it is non-empty. A plain
# "$CURDIR/lib:$LD_LIBRARY_PATH" would end in a bare ':' when the variable is
# unset, and the dynamic loader interprets an empty path entry as the current
# working directory — libraries would then be picked up from wherever the
# backend happens to be launched.
export LD_LIBRARY_PATH="$CURDIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

if [ -f "$CURDIR/lib/ld.so" ]; then
    exec "$CURDIR/lib/ld.so" "$CURDIR/grpc-server" "$@"
fi
exec "$CURDIR/grpc-server" "$@"