diff --git a/.docker/ik-llama-cpp-compile.sh b/.docker/ik-llama-cpp-compile.sh
new file mode 100755
index 000000000..3da869007
--- /dev/null
+++ b/.docker/ik-llama-cpp-compile.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# Shared compile logic for backend/Dockerfile.ik-llama-cpp.
+# Sourced (via bind mount) from both builder-fromsource and builder-prebuilt stages.
+#
+# Inputs (env, all optional):
+#   CMAKE_ARGS         extra CMake flags; the ccache launcher flags are appended
+#   CUDA_DOCKER_ARCH   CUDA architecture list; ';' separators are escaped as '\;'
+#   TARGETARCH         "arm64" selects the fallback (non-AVX2) build
+#   BUILD_TYPE         "hipblas" selects the fallback (non-AVX2) build
+
+set -euxo pipefail
+
+export CCACHE_DIR=/root/.ccache
+# ccache housekeeping is best-effort: a failure here must not abort the build.
+ccache --max-size=5G || true
+ccache -z || true
+
+export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
+
+if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
+  # Turn every ';' into '\;' before embedding the list in CMAKE_ARGS.
+  CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
+  export CMAKE_ARGS="${CMAKE_ARGS} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
+  echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
+  # Drop existing per-variant build directories before building with the new flags.
+  rm -rf /LocalAI/backend/cpp/ik-llama-cpp-*-build
+fi
+
+cd /LocalAI/backend/cpp/ik-llama-cpp
+
+# Default both variables to empty: under `set -u` a bare ${TARGETARCH} or
+# ${BUILD_TYPE} aborts the script when the Docker stage did not define it.
+if [[ "${TARGETARCH:-}" == "arm64" || "${BUILD_TYPE:-}" == "hipblas" ]]; then
+  # ARM64 / ROCm: build without x86 SIMD
+  make ik-llama-cpp-fallback
+else
+  # ik_llama.cpp's IQK kernels require at least AVX2
+  make ik-llama-cpp-avx2
+fi
+
+ccache -s || true
diff --git a/.docker/install-base-deps.sh b/.docker/install-base-deps.sh
new file mode 100755
index 000000000..5b0908fa8
--- /dev/null
+++ b/.docker/install-base-deps.sh
@@ -0,0 +1,249 @@
+#!/usr/bin/env bash
+# Single source of truth for builder-base contents.
+#
+# Used by:
+#   - backend/Dockerfile.base-grpc-builder        (CI prebuilt-base source of truth)
+#   - backend/Dockerfile.llama-cpp                (builder-fromsource stage)
+#   - backend/Dockerfile.ik-llama-cpp             (builder-fromsource stage)
+#   - backend/Dockerfile.turboquant               (builder-fromsource stage)
+#
+# All four files invoke this script via
+#   RUN --mount=type=bind,source=.docker/install-base-deps.sh,target=/usr/local/sbin/install-base-deps \
+#       --mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
+#       bash /usr/local/sbin/install-base-deps
+#
+# so the prebuilt CI base image and the from-source local-dev path are
+# bit-equivalent by construction.
+#
+# Inputs (env, populated from Dockerfile ARG/ENV):
+#   BUILD_TYPE                ("cublas"|"l4t"|"hipblas"|"vulkan"|"sycl"|"clblas"|"")
+#   CUDA_MAJOR_VERSION        ("12" | "13" | "")
+#   CUDA_MINOR_VERSION        ("8" | "0" | "")
+#   TARGETARCH                ("amd64" | "arm64")
+#   UBUNTU_VERSION            ("2204" | "2404")
+#   SKIP_DRIVERS              ("false" | "true")
+#   CMAKE_FROM_SOURCE         ("false" | "true")
+#   CMAKE_VERSION             ("3.31.10")
+#   GRPC_VERSION              ("v1.65.0")
+#   GRPC_MAKEFLAGS            ("-j4 -Otarget")
+#   APT_MIRROR / APT_PORTS_MIRROR  (optional; consumed by /usr/local/sbin/apt-mirror)
+#   AMDGPU_TARGETS            (optional; only relevant for hipblas downstream)
+#
+# IMPORTANT: install logic is copied verbatim from the prior in-Dockerfile
+# RUN blocks. Do not paraphrase apt invocations / version pins / sed line
+# numbers / deb URLs — the bit-equivalence guarantee depends on it.
+
+# pipefail matches the sibling *-compile.sh scripts and makes the rocBLAS
+# "WARNING" fallback in section 6 reachable: without it the pipeline's status
+# is always that of the trailing `sort -u`.
+set -euxo pipefail
+
+# --- 0. apt mirror rewrite (no-op when APT_MIRROR / APT_PORTS_MIRROR unset) ---
+# The helper is run through `sh`, so it only needs to be readable; testing the
+# execute bit would silently skip the rewrite for a non-executable bind mount.
+if [ -r /usr/local/sbin/apt-mirror ]; then
+    APT_MIRROR="${APT_MIRROR:-}" APT_PORTS_MIRROR="${APT_PORTS_MIRROR:-}" \
+        sh /usr/local/sbin/apt-mirror
+fi
+
+export DEBIAN_FRONTEND=noninteractive
+export MAKEFLAGS="${GRPC_MAKEFLAGS:-}"
+
+# --- 1. Base apt build deps ---
+apt-get update
+apt-get install -y --no-install-recommends \
+    build-essential \
+    ccache git \
+    ca-certificates \
+    make \
+    pkg-config libcurl4-openssl-dev \
+    curl unzip \
+    libssl-dev wget
+apt-get clean
+rm -rf /var/lib/apt/lists/*
+
+# --- 2. Vulkan SDK (BUILD_TYPE=vulkan) ---
+# NB: this block intentionally installs `cmake` via apt as part of the
+# Vulkan tooling — must run before the dedicated CMake step below.
+if [ "${BUILD_TYPE:-}" = "vulkan" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; then
+    apt-get update
+    apt-get install -y  --no-install-recommends \
+        software-properties-common pciutils wget gpg-agent
+    apt-get install -y libglm-dev cmake libxcb-dri3-0 libxcb-present0 libpciaccess0 \
+        libpng-dev libxcb-keysyms1-dev libxcb-dri3-dev libx11-dev g++ gcc \
+        libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
+        git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
+        ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
+        clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
+    if [ "amd64" = "${TARGETARCH:-}" ]; then
+        wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz"
+        tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz
+        rm vulkansdk-linux-x86_64-1.4.335.0.tar.xz
+        mkdir -p /opt/vulkan-sdk
+        mv 1.4.335.0 /opt/vulkan-sdk/
+        ( cd /opt/vulkan-sdk/1.4.335.0 && \
+          ./vulkansdk --no-deps --maxjobs \
+              vulkan-loader \
+              vulkan-validationlayers \
+              vulkan-extensionlayer \
+              vulkan-tools \
+              shaderc )
+        cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/bin/* /usr/bin/
+        cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/lib/* /usr/lib/x86_64-linux-gnu/
+        cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/include/* /usr/include/
+        cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/share/* /usr/share/
+        rm -rf /opt/vulkan-sdk
+    fi
+    if [ "arm64" = "${TARGETARCH:-}" ]; then
+        mkdir vulkan
+        ( cd vulkan && \
+          curl -L -o vulkan-sdk.tar.xz https://github.com/mudler/vulkan-sdk-arm/releases/download/1.4.335.0/vulkansdk-ubuntu-24.04-arm-1.4.335.0.tar.xz && \
+          tar -xvf vulkan-sdk.tar.xz && \
+          rm vulkan-sdk.tar.xz && \
+          cd 1.4.335.0 && \
+          cp -rfv aarch64/bin/* /usr/bin/ && \
+          cp -rfv aarch64/lib/* /usr/lib/aarch64-linux-gnu/ && \
+          cp -rfv aarch64/include/* /usr/include/ && \
+          cp -rfv aarch64/share/* /usr/share/ )
+        rm -rf vulkan
+    fi
+    ldconfig
+    apt-get clean
+    rm -rf /var/lib/apt/lists/*
+fi
+
+# --- 3. CUDA toolkit (BUILD_TYPE=cublas|l4t) ---
+if { [ "${BUILD_TYPE:-}" = "cublas" ] || [ "${BUILD_TYPE:-}" = "l4t" ]; } && [ "${SKIP_DRIVERS:-false}" = "false" ]; then
+    apt-get update
+    apt-get install -y  --no-install-recommends \
+        software-properties-common pciutils
+    if [ "amd64" = "${TARGETARCH:-}" ]; then
+        curl -O "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb"
+    fi
+    if [ "arm64" = "${TARGETARCH:-}" ]; then
+        if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
+            curl -O "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb"
+        else
+            curl -O "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb"
+        fi
+    fi
+    dpkg -i cuda-keyring_1.1-1_all.deb
+    rm -f cuda-keyring_1.1-1_all.deb
+    apt-get update
+    apt-get install -y --no-install-recommends \
+        "cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" \
+        "libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" \
+        "libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" \
+        "libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" \
+        "libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" \
+        "libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}"
+    if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "${TARGETARCH:-}" ]; then
+        apt-get install -y --no-install-recommends \
+            "libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" \
+            "libcudnn9-cuda-${CUDA_MAJOR_VERSION}" \
+            "cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" \
+            "libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}"
+    fi
+    apt-get clean
+    rm -rf /var/lib/apt/lists/*
+fi
+
+# --- 4. cuDSS / NVPL on arm64 + cublas (legacy JetPack / Tegra) ---
+# https://github.com/NVIDIA/Isaac-GR00T/issues/343
+if [ "${BUILD_TYPE:-}" = "cublas" ] && [ "${TARGETARCH:-}" = "arm64" ]; then
+    wget "https://developer.download.nvidia.com/compute/cudss/0.6.0/local_installers/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb"
+    dpkg -i "cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb"
+    cp /var/cudss-local-tegra-repo-ubuntu"${UBUNTU_VERSION}"-0.6.0/cudss-*-keyring.gpg /usr/share/keyrings/
+    apt-get update
+    apt-get -y install cudss "cudss-cuda-${CUDA_MAJOR_VERSION}"
+    wget "https://developer.download.nvidia.com/compute/nvpl/25.5/local_installers/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb"
+    dpkg -i "nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb"
+    cp /var/nvpl-local-repo-ubuntu"${UBUNTU_VERSION}"-25.5/nvpl-*-keyring.gpg /usr/share/keyrings/
+    apt-get update
+    apt-get install -y nvpl
+fi
+
+# --- 5. clBLAS (BUILD_TYPE=clblas) ---
+# Present in variant Dockerfiles' from-source path but not in master's
+# Dockerfile.base-grpc-builder. No CI matrix entry currently uses this,
+# but keep parity so a future BUILD_TYPE=clblas build doesn't drift.
+if [ "${BUILD_TYPE:-}" = "clblas" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; then
+    apt-get update
+    apt-get install -y --no-install-recommends \
+        libclblast-dev
+    apt-get clean
+    rm -rf /var/lib/apt/lists/*
+fi
+
+# --- 6. ROCm / HIP build deps (BUILD_TYPE=hipblas) ---
+if [ "${BUILD_TYPE:-}" = "hipblas" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; then
+    apt-get update
+    apt-get install -y --no-install-recommends \
+        hipblas-dev \
+        hipblaslt-dev \
+        rocblas-dev
+    apt-get clean
+    rm -rf /var/lib/apt/lists/*
+    # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install,
+    # which results in local-ai and others not being able to locate the libraries.
+    # We run ldconfig ourselves to work around this packaging deficiency.
+    ldconfig
+    # Log which GPU architectures have rocBLAS kernel support
+    echo "rocBLAS library data architectures:"
+    (ls /opt/rocm*/lib/rocblas/library/Kernels* 2>/dev/null || ls /opt/rocm*/lib64/rocblas/library/Kernels* 2>/dev/null) | grep -oP 'gfx[0-9a-z+-]+' | sort -u || \
+        echo "WARNING: No rocBLAS kernel data found"
+fi
+
+echo "TARGETARCH: ${TARGETARCH:-}"
+
+# --- 7. protoc (always) ---
+# The version in 22.04 is too old. We will create one as part of installing
+# the GRPC build below but that will also bring in a newer version of absl
+# which stablediffusion cannot compile with. This version of protoc is only
+# here so that we can generate the grpc code for the stablediffusion build.
+if [ "amd64" = "${TARGETARCH:-}" ]; then
+    curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip
+    unzip -j -d /usr/local/bin protoc.zip bin/protoc
+    rm protoc.zip
+fi
+if [ "arm64" = "${TARGETARCH:-}" ]; then
+    curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip
+    unzip -j -d /usr/local/bin protoc.zip bin/protoc
+    rm protoc.zip
+fi
+
+# --- 8. CMake (apt or compiled from source) ---
+# The version in 22.04 is too old. Vulkan path above already pulled cmake
+# via apt; the from-source branch here will install over it which is fine.
+if [ "${CMAKE_FROM_SOURCE:-false}" = "true" ]; then
+    curl -L -s "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz" -o cmake.tar.gz
+    tar xvf cmake.tar.gz
+    ( cd "cmake-${CMAKE_VERSION}" && ./configure && make && make install )
+else
+    apt-get update
+    apt-get install -y \
+        cmake
+    apt-get clean
+    rm -rf /var/lib/apt/lists/*
+fi
+
+# --- 9. gRPC compile + install at /opt/grpc ---
+# We install GRPC to a different prefix here so that we can copy in only
+# the build artifacts later — saves several hundred MB on the final docker
+# image size vs copying in the entire GRPC source tree and running
+# `make install` in the target container.
+#
+# The TESTONLY abseil sed patch and /opt/grpc prefix are load-bearing —
+# downstream Dockerfiles `COPY` /opt/grpc to /usr/local (or rely on the
+# prebuilt base having it at /opt/grpc).
+mkdir -p /build
+cd /build
+git clone --recurse-submodules --jobs 4 -b "${GRPC_VERSION}" --depth 1 --shallow-submodules https://github.com/grpc/grpc
+mkdir -p /build/grpc/cmake/build
+cd /build/grpc/cmake/build
+sed -i "216i\\  TESTONLY" "../../third_party/abseil-cpp/absl/container/CMakeLists.txt"
+cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../..
+make
+make install
+cd /
+rm -rf /build
diff --git a/.docker/llama-cpp-compile.sh b/.docker/llama-cpp-compile.sh
new file mode 100755
index 000000000..bbc9aa21f
--- /dev/null
+++ b/.docker/llama-cpp-compile.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# Shared compile logic for backend/Dockerfile.llama-cpp.
+# Sourced (via bind mount) from both builder-fromsource and builder-prebuilt stages.
+#
+# Inputs (env, all optional):
+#   CMAKE_ARGS         extra CMake flags; the ccache launcher flags are appended
+#   CUDA_DOCKER_ARCH   CUDA architecture list; ';' separators are escaped as '\;'
+#   TARGETARCH         "arm64" skips the avx/avx2/avx512 variants
+#   BUILD_TYPE         "hipblas" skips the avx/avx2/avx512 variants
+
+set -euxo pipefail
+
+export CCACHE_DIR=/root/.ccache
+# ccache housekeeping is best-effort: a failure here must not abort the build.
+ccache --max-size=5G || true
+ccache -z || true
+
+export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
+
+if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
+  # Turn every ';' into '\;' before embedding the list in CMAKE_ARGS.
+  CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
+  export CMAKE_ARGS="${CMAKE_ARGS} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
+  echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
+  # Drop existing per-variant build directories before building with the new flags.
+  rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
+fi
+
+cd /LocalAI/backend/cpp/llama-cpp
+
+# The avx/avx2/avx512 variants are skipped for arm64 and hipblas builds.
+# Both variables default to empty: under `set -u` a bare ${TARGETARCH} or
+# ${BUILD_TYPE} aborts the script when the Docker stage did not define it.
+if [[ "${TARGETARCH:-}" != "arm64" && "${BUILD_TYPE:-}" != "hipblas" ]]; then
+  make llama-cpp-avx
+  make llama-cpp-avx2
+  make llama-cpp-avx512
+fi
+
+# Built for every target, after any SIMD variants, in this order.
+make llama-cpp-fallback
+make llama-cpp-grpc
+make llama-cpp-rpc-server
+
+ccache -s || true
diff --git a/.docker/turboquant-compile.sh b/.docker/turboquant-compile.sh
new file mode 100755
index 000000000..7468bc1a7
--- /dev/null
+++ b/.docker/turboquant-compile.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# Shared compile logic for backend/Dockerfile.turboquant.
+# Sourced (via bind mount) from both builder-fromsource and builder-prebuilt stages.
+#
+# Inputs (env, all optional):
+#   CMAKE_ARGS         extra CMake flags; the ccache launcher flags are appended
+#   CUDA_DOCKER_ARCH   CUDA architecture list; ';' separators are escaped as '\;'
+#   TARGETARCH         "arm64" skips the avx/avx2/avx512 variants
+#   BUILD_TYPE         "hipblas" skips the avx/avx2/avx512 variants
+
+set -euxo pipefail
+
+export CCACHE_DIR=/root/.ccache
+# ccache housekeeping is best-effort: a failure here must not abort the build.
+ccache --max-size=5G || true
+ccache -z || true
+
+export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
+
+if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
+  # Turn every ';' into '\;' before embedding the list in CMAKE_ARGS.
+  CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
+  export CMAKE_ARGS="${CMAKE_ARGS} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
+  echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
+  # Drop existing per-variant build directories before building with the new flags.
+  rm -rf /LocalAI/backend/cpp/turboquant-*-build
+fi
+
+cd /LocalAI/backend/cpp/turboquant
+
+# The avx/avx2/avx512 variants are skipped for arm64 and hipblas builds.
+# Both variables default to empty: under `set -u` a bare ${TARGETARCH} or
+# ${BUILD_TYPE} aborts the script when the Docker stage did not define it.
+if [[ "${TARGETARCH:-}" != "arm64" && "${BUILD_TYPE:-}" != "hipblas" ]]; then
+  make turboquant-avx
+  make turboquant-avx2
+  make turboquant-avx512
+fi
+
+# Built for every target, after any SIMD variants, in this order.
+make turboquant-fallback
+make turboquant-grpc
+make turboquant-rpc-server
+
+ccache -s || true
diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index 7a4388e95..f957cea03 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -388,6 +388,7 @@ include:
     platforms: 'linux/amd64'
     tag-latest: 'auto'
     tag-suffix: '-gpu-nvidia-cuda-12-llama-cpp'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-cuda-12-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
@@ -401,6 +402,7 @@ include:
     platforms: 'linux/amd64'
     tag-latest: 'auto'
     tag-suffix: '-gpu-nvidia-cuda-12-turboquant'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-cuda-12-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
@@ -896,6 +898,7 @@ include:
     platforms: 'linux/amd64'
     tag-latest: 'auto'
     tag-suffix: '-gpu-nvidia-cuda-13-llama-cpp'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-cuda-13-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
@@ -909,6 +912,7 @@ include:
     platforms: 'linux/amd64'
     tag-latest: 'auto'
     tag-suffix: '-gpu-nvidia-cuda-13-turboquant'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-cuda-13-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
@@ -923,6 +927,7 @@ include:
     skip-drivers: 'false'
     tag-latest: 'auto'
     tag-suffix: '-nvidia-l4t-cuda-13-arm64-llama-cpp'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-cuda-13-arm64'
     base-image: "ubuntu:24.04"
     runs-on: 'ubuntu-24.04-arm'
     ubuntu-version: '2404'
@@ -936,6 +941,7 @@ include:
     skip-drivers: 'false'
     tag-latest: 'auto'
     tag-suffix: '-nvidia-l4t-cuda-13-arm64-turboquant'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-cuda-13-arm64'
     base-image: "ubuntu:24.04"
     runs-on: 'ubuntu-24.04-arm'
     ubuntu-version: '2404'
@@ -1560,6 +1566,7 @@ include:
     platforms: 'linux/amd64'
     tag-latest: 'auto'
     tag-suffix: '-gpu-rocm-hipblas-llama-cpp'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-rocm-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "rocm/dev-ubuntu-24.04:7.2.1"
     skip-drivers: 'false'
@@ -1573,6 +1580,7 @@ include:
     platforms: 'linux/amd64'
     tag-latest: 'auto'
     tag-suffix: '-gpu-rocm-hipblas-turboquant'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-rocm-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "rocm/dev-ubuntu-24.04:7.2.1"
     skip-drivers: 'false'
@@ -1809,6 +1817,7 @@ include:
     platforms: 'linux/amd64'
     tag-latest: 'auto'
     tag-suffix: '-gpu-intel-sycl-f32-llama-cpp'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-intel-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "intel/oneapi-basekit:2025.3.2-0-devel-ubuntu24.04"
     skip-drivers: 'false'
@@ -1822,6 +1831,7 @@ include:
     platforms: 'linux/amd64'
     tag-latest: 'auto'
     tag-suffix: '-gpu-intel-sycl-f32-turboquant'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-intel-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
     skip-drivers: 'false'
@@ -1835,6 +1845,7 @@ include:
     platforms: 'linux/amd64'
     tag-latest: 'auto'
     tag-suffix: '-gpu-intel-sycl-f16-llama-cpp'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-intel-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
     skip-drivers: 'false'
@@ -1848,6 +1859,7 @@ include:
     platforms: 'linux/amd64'
     tag-latest: 'auto'
     tag-suffix: '-gpu-intel-sycl-f16-turboquant'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-intel-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
     skip-drivers: 'false'
@@ -2256,6 +2268,7 @@ include:
     platform-tag: 'amd64'
     tag-latest: 'auto'
     tag-suffix: '-cpu-llama-cpp'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
@@ -2270,6 +2283,7 @@ include:
     platform-tag: 'arm64'
     tag-latest: 'auto'
     tag-suffix: '-cpu-llama-cpp'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-arm64'
     runs-on: 'ubuntu-24.04-arm'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
@@ -2284,6 +2298,7 @@ include:
     platform-tag: 'amd64'
     tag-latest: 'auto'
     tag-suffix: '-cpu-turboquant'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
@@ -2298,6 +2313,7 @@ include:
     platform-tag: 'arm64'
     tag-latest: 'auto'
     tag-suffix: '-cpu-turboquant'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-arm64'
     runs-on: 'ubuntu-24.04-arm'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
@@ -2311,6 +2327,7 @@ include:
     platforms: 'linux/amd64'
     tag-latest: 'auto'
     tag-suffix: '-cpu-ik-llama-cpp'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
@@ -2325,6 +2342,7 @@ include:
     skip-drivers: 'false'
     tag-latest: 'auto'
     tag-suffix: '-nvidia-l4t-arm64-llama-cpp'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-l4t-cuda-12-arm64'
     base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
     runs-on: 'ubuntu-24.04-arm'
     backend: "llama-cpp"
@@ -2338,6 +2356,7 @@ include:
     skip-drivers: 'false'
     tag-latest: 'auto'
     tag-suffix: '-nvidia-l4t-arm64-turboquant'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-l4t-cuda-12-arm64'
     base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
     runs-on: 'ubuntu-24.04-arm'
     backend: "turboquant"
@@ -2351,6 +2370,7 @@ include:
     platform-tag: 'amd64'
     tag-latest: 'auto'
     tag-suffix: '-gpu-vulkan-llama-cpp'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-vulkan-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
@@ -2365,6 +2385,7 @@ include:
     platform-tag: 'arm64'
     tag-latest: 'auto'
     tag-suffix: '-gpu-vulkan-llama-cpp'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-vulkan-arm64'
     runs-on: 'ubuntu-24.04-arm'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
@@ -2379,6 +2400,7 @@ include:
     platform-tag: 'amd64'
     tag-latest: 'auto'
     tag-suffix: '-gpu-vulkan-turboquant'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-vulkan-amd64'
     runs-on: 'ubuntu-latest'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
@@ -2394,6 +2416,7 @@ include:
     platform-tag: 'arm64'
     tag-latest: 'auto'
     tag-suffix: '-gpu-vulkan-turboquant'
+    builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-vulkan-arm64'
     runs-on: 'ubuntu-24.04-arm'
     base-image: "ubuntu:24.04"
     skip-drivers: 'false'
diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml
index 8eb0f31e9..6e6c4ab33 100644
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -74,6 +74,7 @@ jobs:
       platforms: ${{ matrix.platforms }}
       platform-tag: ${{ matrix.platform-tag || '' }}
       runs-on: ${{ matrix.runs-on }}
+      builder-base-image: ${{ matrix.builder-base-image || '' }}
       base-image: ${{ matrix.base-image }}
       backend: ${{ matrix.backend }}
       dockerfile: ${{ matrix.dockerfile }}
diff --git a/.github/workflows/backend_build.yml b/.github/workflows/backend_build.yml
index 97b650075..524615baa 100644
--- a/.github/workflows/backend_build.yml
+++ b/.github/workflows/backend_build.yml
@@ -74,6 +74,15 @@ on:
         required: false
         default: ''
         type: string
+      builder-base-image:
+        description: |
+          Pre-built builder base image (e.g. quay.io/go-skynet/ci-cache:base-grpc-cuda-13-amd64).
+          When set, the variant Dockerfile uses its `builder-prebuilt` stage which FROMs this
+          image directly instead of running its own gRPC stage + apt installs. Empty for
+          backends whose Dockerfile doesn't support a prebuilt base.
+        required: false
+        default: ''
+        type: string
     secrets:
       dockerUsername:
         required: false
@@ -190,6 +199,8 @@ jobs:
             APT_MIRROR=${{ steps.apt_mirror.outputs.effective-mirror }}
             APT_PORTS_MIRROR=${{ steps.apt_mirror.outputs.effective-ports-mirror }}
             DEPS_REFRESH=${{ steps.deps_refresh.outputs.key }}
+            BUILDER_BASE_IMAGE=${{ inputs.builder-base-image }}
+            BUILDER_TARGET=${{ inputs.builder-base-image != '' && 'builder-prebuilt' || 'builder-fromsource' }}
           context: ${{ inputs.context }}
           file: ${{ inputs.dockerfile }}
           cache-from: type=registry,ref=quay.io/go-skynet/ci-cache:cache${{ inputs.tag-suffix }}-${{ inputs.platform-tag }}
@@ -243,6 +254,8 @@ jobs:
             APT_MIRROR=${{ steps.apt_mirror.outputs.effective-mirror }}
             APT_PORTS_MIRROR=${{ steps.apt_mirror.outputs.effective-ports-mirror }}
             DEPS_REFRESH=${{ steps.deps_refresh.outputs.key }}
+            BUILDER_BASE_IMAGE=${{ inputs.builder-base-image }}
+            BUILDER_TARGET=${{ inputs.builder-base-image != '' && 'builder-prebuilt' || 'builder-fromsource' }}
           context: ${{ inputs.context }}
           file: ${{ inputs.dockerfile }}
           cache-from: type=registry,ref=quay.io/go-skynet/ci-cache:cache${{ inputs.tag-suffix }}-${{ inputs.platform-tag }}
diff --git a/.github/workflows/backend_pr.yml b/.github/workflows/backend_pr.yml
index 863610c87..b01e5dddc 100644
--- a/.github/workflows/backend_pr.yml
+++ b/.github/workflows/backend_pr.yml
@@ -50,6 +50,7 @@ jobs:
       platforms: ${{ matrix.platforms }}
       platform-tag: ${{ matrix.platform-tag || '' }}
       runs-on: ${{ matrix.runs-on }}
+      builder-base-image: ${{ matrix.builder-base-image || '' }}
       base-image: ${{ matrix.base-image }}
       backend: ${{ matrix.backend }}
       dockerfile: ${{ matrix.dockerfile }}
diff --git a/.github/workflows/base-images.yml b/.github/workflows/base-images.yml
index a8baece3c..7d6f2b238 100644
--- a/.github/workflows/base-images.yml
+++ b/.github/workflows/base-images.yml
@@ -104,6 +104,23 @@ jobs:
             cuda-major-version: ''
             cuda-minor-version: ''
             ubuntu-version: '2404'
+          # Legacy JetPack r36.4.0 base for older Jetson devices (CUDA 12).
+          # Distinct from base-grpc-cuda-13-arm64 (Ubuntu 24.04 + CUDA 13 sbsa)
+          # which targets newer Jetsons. Some matrix entries
+          # (-nvidia-l4t-arm64-llama-cpp / -turboquant) still build against
+          # the JetPack image, so we need a matching base.
+          - tag: 'base-grpc-l4t-cuda-12-arm64'
+            runs-on: 'ubuntu-24.04-arm'
+            base-image: 'nvcr.io/nvidia/l4t-jetpack:r36.4.0'
+            build-type: 'l4t'
+            cuda-major-version: '12'
+            cuda-minor-version: '0'
+            ubuntu-version: '2204'
+            # JetPack r36.4.0 already ships CUDA preinstalled at /usr/local/cuda;
+            # apt-installing cuda-nvcc-12-0 from the public repos fails because
+            # those packages aren't published for the JetPack apt feed. Match
+            # the original l4t matrix entry which set skip-drivers: 'true'.
+            skip-drivers: 'true'
     steps:
       - uses: actions/checkout@v6
         with:
@@ -131,6 +148,7 @@ jobs:
             CUDA_MAJOR_VERSION=${{ matrix.cuda-major-version }}
             CUDA_MINOR_VERSION=${{ matrix.cuda-minor-version }}
             UBUNTU_VERSION=${{ matrix.ubuntu-version }}
+            SKIP_DRIVERS=${{ matrix.skip-drivers || 'false' }}
           cache-from: type=registry,ref=quay.io/go-skynet/ci-cache:cache-${{ matrix.tag }}
           cache-to: type=registry,ref=quay.io/go-skynet/ci-cache:cache-${{ matrix.tag }},mode=max,ignore-error=true
           provenance: false
diff --git a/backend/Dockerfile.base-grpc-builder b/backend/Dockerfile.base-grpc-builder
index ab1b6eadf..8843a0a72 100644
--- a/backend/Dockerfile.base-grpc-builder
+++ b/backend/Dockerfile.base-grpc-builder
@@ -25,6 +25,7 @@
 #   base-grpc-cuda-12-amd64         ubuntu:24.04 + CUDA 12.8
 #   base-grpc-cuda-13-amd64         ubuntu:22.04 + CUDA 13.0
 #   base-grpc-cuda-13-arm64         ubuntu:24.04 + CUDA 13.0 (sbsa)
+#   base-grpc-l4t-cuda-12-arm64     ubuntu:22.04 + CUDA 12.x (legacy JetPack)
 #   base-grpc-rocm-amd64            rocm/dev-ubuntu-24.04:7.2.1 + hipblas
 #   base-grpc-vulkan-amd64          ubuntu:24.04 + Vulkan SDK 1.4.335
 #   base-grpc-vulkan-arm64          ubuntu:24.04 + Vulkan SDK ARM 1.4.335
@@ -38,10 +39,11 @@
 # downstream builds will add it to CMAKE_PREFIX_PATH (or copy to
 # /usr/local) the same way Dockerfile.llama-cpp does today.
 #
-# Install logic is copied verbatim from backend/Dockerfile.llama-cpp on
-# master so the resulting image is bit-identical to what the variant
-# Dockerfile produces today. Do not paraphrase apt invocations — PR 2
-# depends on bit-equivalence.
+# Install logic lives in .docker/install-base-deps.sh, which is also
+# bind-mounted by the variant Dockerfiles' builder-fromsource stage.
+# This guarantees bit-equivalence between the prebuilt CI base and the
+# from-source local-dev path — both invoke the same script with the
+# same env inputs.
 
 ARG BASE_IMAGE=ubuntu:24.04
 
@@ -64,12 +66,21 @@ ARG APT_MIRROR=""
 ARG APT_PORTS_MIRROR=""
 ARG AMDGPU_TARGETS=""
 
-ENV BUILD_TYPE=${BUILD_TYPE}
-ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
-ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
-ENV AMDGPU_TARGETS=${AMDGPU_TARGETS}
-ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
-ENV DEBIAN_FRONTEND=noninteractive
+ENV BUILD_TYPE=${BUILD_TYPE} \
+    CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} \
+    CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION} \
+    CMAKE_FROM_SOURCE=${CMAKE_FROM_SOURCE} \
+    CMAKE_VERSION=${CMAKE_VERSION} \
+    GRPC_VERSION=${GRPC_VERSION} \
+    GRPC_MAKEFLAGS=${GRPC_MAKEFLAGS} \
+    SKIP_DRIVERS=${SKIP_DRIVERS} \
+    TARGETARCH=${TARGETARCH} \
+    UBUNTU_VERSION=${UBUNTU_VERSION} \
+    APT_MIRROR=${APT_MIRROR} \
+    APT_PORTS_MIRROR=${APT_PORTS_MIRROR} \
+    AMDGPU_TARGETS=${AMDGPU_TARGETS} \
+    MAKEFLAGS=${GRPC_MAKEFLAGS} \
+    DEBIAN_FRONTEND=noninteractive
 
 # CUDA on PATH (no-op when CUDA isn't installed)
 ENV PATH=/usr/local/cuda/bin:${PATH}
@@ -78,191 +89,10 @@ ENV PATH=/opt/rocm/bin:${PATH}
 
 WORKDIR /build
 
-# Base apt build deps. Mirrors backend/Dockerfile.llama-cpp lines 85-97
-# (the `builder` stage's apt block) — superset of the gRPC stage's deps
-# so the same image can compile gRPC and downstream backends.
-RUN --mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
-    APT_MIRROR="${APT_MIRROR}" APT_PORTS_MIRROR="${APT_PORTS_MIRROR}" sh /usr/local/sbin/apt-mirror && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends \
-        build-essential \
-        ccache git \
-        ca-certificates \
-        make \
-        pkg-config libcurl4-openssl-dev \
-        curl unzip \
-        libssl-dev wget && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-# Vulkan SDK install. Mirrors backend/Dockerfile.llama-cpp lines 107-154.
-RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
-        apt-get update && \
-        apt-get install -y  --no-install-recommends \
-            software-properties-common pciutils wget gpg-agent && \
-        apt-get install -y libglm-dev cmake libxcb-dri3-0 libxcb-present0 libpciaccess0 \
-            libpng-dev libxcb-keysyms1-dev libxcb-dri3-dev libx11-dev g++ gcc \
-            libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
-            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
-            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
-            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
-        if [ "amd64" = "$TARGETARCH" ]; then
-            wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
-            tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
-            rm vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
-            mkdir -p /opt/vulkan-sdk && \
-            mv 1.4.335.0 /opt/vulkan-sdk/ && \
-            cd /opt/vulkan-sdk/1.4.335.0 && \
-            ./vulkansdk --no-deps --maxjobs \
-                vulkan-loader \
-                vulkan-validationlayers \
-                vulkan-extensionlayer \
-                vulkan-tools \
-                shaderc && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/bin/* /usr/bin/ && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/lib/* /usr/lib/x86_64-linux-gnu/ && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/include/* /usr/include/ && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/share/* /usr/share/ && \
-            rm -rf /opt/vulkan-sdk
-        fi
-        if [ "arm64" = "$TARGETARCH" ]; then
-            mkdir vulkan && cd vulkan && \
-            curl -L -o vulkan-sdk.tar.xz https://github.com/mudler/vulkan-sdk-arm/releases/download/1.4.335.0/vulkansdk-ubuntu-24.04-arm-1.4.335.0.tar.xz && \
-            tar -xvf vulkan-sdk.tar.xz && \
-            rm vulkan-sdk.tar.xz && \
-            cd 1.4.335.0 && \
-            cp -rfv aarch64/bin/* /usr/bin/ && \
-            cp -rfv aarch64/lib/* /usr/lib/aarch64-linux-gnu/ && \
-            cp -rfv aarch64/include/* /usr/include/ && \
-            cp -rfv aarch64/share/* /usr/share/ && \
-            cd ../.. && \
-            rm -rf vulkan
-        fi
-        ldconfig && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-# CuBLAS (CUDA toolkit) install. Mirrors backend/Dockerfile.llama-cpp
-# lines 157-189.
-RUN <<EOT bash
-    if ( [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "l4t" ] ) && [ "${SKIP_DRIVERS}" = "false" ]; then
-        apt-get update && \
-        apt-get install -y  --no-install-recommends \
-            software-properties-common pciutils
-        if [ "amd64" = "$TARGETARCH" ]; then
-            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
-        fi
-        if [ "arm64" = "$TARGETARCH" ]; then
-            if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
-            else
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
-            fi
-        fi
-        dpkg -i cuda-keyring_1.1-1_all.deb && \
-        rm -f cuda-keyring_1.1-1_all.deb && \
-        apt-get update && \
-        apt-get install -y --no-install-recommends \
-            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
-            apt-get install -y --no-install-recommends \
-            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        fi
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-# cuDSS / NVPL on arm64 + cublas. Mirrors backend/Dockerfile.llama-cpp
-# lines 193-204. https://github.com/NVIDIA/Isaac-GR00T/issues/343
-RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
-        wget https://developer.download.nvidia.com/compute/cudss/0.6.0/local_installers/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb && \
-        dpkg -i cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb && \
-        cp /var/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0/cudss-*-keyring.gpg /usr/share/keyrings/ && \
-        apt-get update && apt-get -y install cudss cudss-cuda-${CUDA_MAJOR_VERSION} && \
-        wget https://developer.download.nvidia.com/compute/nvpl/25.5/local_installers/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb && \
-        dpkg -i nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb && \
-        cp /var/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5/nvpl-*-keyring.gpg /usr/share/keyrings/ && \
-        apt-get update && apt-get install -y nvpl
-    fi
-EOT
-
-# ROCm / HIP build deps. Mirrors backend/Dockerfile.llama-cpp lines 215-230.
-RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
-        apt-get update && \
-        apt-get install -y --no-install-recommends \
-            hipblas-dev \
-            hipblaslt-dev \
-            rocblas-dev && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/* && \
-        # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
-        # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
-        ldconfig && \
-        # Log which GPU architectures have rocBLAS kernel support
-        echo "rocBLAS library data architectures:" && \
-        (ls /opt/rocm*/lib/rocblas/library/Kernels* 2>/dev/null || ls /opt/rocm*/lib64/rocblas/library/Kernels* 2>/dev/null) | grep -oP 'gfx[0-9a-z+-]+' | sort -u || \
-        echo "WARNING: No rocBLAS kernel data found" \
-    ; fi
-
-RUN echo "TARGETARCH: $TARGETARCH"
-
-# protoc download. Mirrors backend/Dockerfile.llama-cpp lines 237-248.
-# We need protoc installed, and the version in 22.04 is too old. We will create one as part of installing the GRPC build below
-# but that will also bring in a newer version of absl which stablediffusion cannot compile with. This version of protoc is only
-# here so that we can generate the grpc code for the stablediffusion build.
-RUN <<EOT bash
-    if [ "amd64" = "$TARGETARCH" ]; then
-        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
-        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
-        rm protoc.zip
-    fi
-    if [ "arm64" = "$TARGETARCH" ]; then
-        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
-        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
-        rm protoc.zip
-    fi
-EOT
-
-# CMake install. Mirrors backend/Dockerfile.llama-cpp lines 250-261
-# (the `builder` stage's CMake block). The version in 22.04 is too old.
-RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
-        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-    else
-        apt-get update && \
-        apt-get install -y \
-            cmake && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-# gRPC compile + install at /opt/grpc. Mirrors backend/Dockerfile.llama-cpp
-# lines 50-57 (the `grpc` stage's clone+build+install block). Using the
-# same prefix and the same TESTONLY abseil patch so consumer Dockerfiles
-# in PR 2 can copy /opt/grpc -> /usr/local exactly like
-# `COPY --from=grpc /opt/grpc /usr/local` does today.
-#
-# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
-# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
-# and running make install in the target container
-RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-    mkdir -p /build/grpc/cmake/build && \
-    cd /build/grpc/cmake/build && \
-    sed -i "216i\  TESTONLY" "../../third_party/abseil-cpp/absl/container/CMakeLists.txt" && \
-    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
-    make && \
-    make install && \
-    rm -rf /build
+# Single RUN that delegates to .docker/install-base-deps.sh — the same
+# script the variant Dockerfiles' builder-fromsource stage runs.
+RUN --mount=type=bind,source=.docker/install-base-deps.sh,target=/usr/local/sbin/install-base-deps \
+    --mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
+    bash /usr/local/sbin/install-base-deps
 
 WORKDIR /
diff --git a/backend/Dockerfile.ik-llama-cpp b/backend/Dockerfile.ik-llama-cpp
index 364de3447..9694441b0 100644
--- a/backend/Dockerfile.ik-llama-cpp
+++ b/backend/Dockerfile.ik-llama-cpp
@@ -1,261 +1,85 @@
 ARG BASE_IMAGE=ubuntu:24.04
-ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
+# BUILDER_BASE_IMAGE defaults to BASE_IMAGE so the Dockerfile parses even
+# when no prebuilt base is supplied. The builder-prebuilt stage is only
+# built when BUILDER_TARGET=builder-prebuilt, so a fallback image that lacks
+# the prebuilt deps is harmless — BuildKit prunes the unreferenced builder.
+ARG BUILDER_BASE_IMAGE=${BASE_IMAGE}
+# BUILDER_TARGET selects which builder stage the final scratch image copies
+# package output from. Declared at global scope (before any FROM) so it's
+# usable in `FROM ${BUILDER_TARGET}` below. Default keeps local
+# `make backends/ik-llama-cpp` on the from-source path.
+ARG BUILDER_TARGET=builder-fromsource
 ARG APT_MIRROR=""
 ARG APT_PORTS_MIRROR=""
 
 
-# The grpc target does one thing, it builds and installs GRPC.  This is in it's own layer so that it can be effectively cached by CI.
-# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
-FROM ${GRPC_BASE_IMAGE} AS grpc
-
-# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
-ARG GRPC_MAKEFLAGS="-j4 -Otarget"
-ARG GRPC_VERSION=v1.65.0
+# ============================================================================
+# Stage: builder-fromsource — self-contained build path.
+# Runs .docker/install-base-deps.sh (apt deps + cmake + protoc + gRPC +
+# conditional CUDA/ROCm/Vulkan), copies /opt/grpc to /usr/local, then
+# compiles the variant. Used when BUILDER_TARGET=builder-fromsource (the
+# default; local `make backends/ik-llama-cpp`).
+#
+# The install script is the same one that backend/Dockerfile.base-grpc-builder
+# runs, so the result is bit-equivalent to the prebuilt-base path
+# (builder-prebuilt below).
+# ============================================================================
+FROM ${BASE_IMAGE} AS builder-fromsource
+ARG BUILD_TYPE
+ARG CUDA_MAJOR_VERSION
+ARG CUDA_MINOR_VERSION
 ARG CMAKE_FROM_SOURCE=false
 # CUDA Toolkit 13.x compatibility: CMake 3.31.9+ fixes toolchain detection/arch table issues
 ARG CMAKE_VERSION=3.31.10
-ARG APT_MIRROR
-ARG APT_PORTS_MIRROR
-
-ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
-
-WORKDIR /build
-
-RUN --mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
-    APT_MIRROR="${APT_MIRROR}" APT_PORTS_MIRROR="${APT_PORTS_MIRROR}" sh /usr/local/sbin/apt-mirror && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends \
-        ca-certificates \
-        build-essential curl libssl-dev \
-        git wget && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install CMake (the version in 22.04 is too old)
-RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
-        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-    else
-        apt-get update && \
-        apt-get install -y \
-            cmake && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
-# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
-# and running make install in the target container
-RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-    mkdir -p /build/grpc/cmake/build && \
-    cd /build/grpc/cmake/build && \
-    sed -i "216i\  TESTONLY" "../../third_party/abseil-cpp/absl/container/CMakeLists.txt" && \
-    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
-    make && \
-    make install && \
-    rm -rf /build
-
-FROM ${BASE_IMAGE} AS builder
-ARG CMAKE_FROM_SOURCE=false
-ARG CMAKE_VERSION=3.31.10
-# We can target specific CUDA ARCHITECTURES like --build-arg CUDA_DOCKER_ARCH='75;86;89;120'
-ARG CUDA_DOCKER_ARCH
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-ARG CMAKE_ARGS
-ENV CMAKE_ARGS=${CMAKE_ARGS}
-ARG BACKEND=rerankers
-ARG BUILD_TYPE
-ENV BUILD_TYPE=${BUILD_TYPE}
-ARG CUDA_MAJOR_VERSION
-ARG CUDA_MINOR_VERSION
+ARG GRPC_VERSION=v1.65.0
+ARG GRPC_MAKEFLAGS="-j4 -Otarget"
 ARG SKIP_DRIVERS=false
-ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
-ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
-ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETARCH
 ARG TARGETVARIANT
 ARG GO_VERSION=1.25.4
 ARG UBUNTU_VERSION=2404
 ARG APT_MIRROR
 ARG APT_PORTS_MIRROR
+ARG AMDGPU_TARGETS=""
+ARG BACKEND=rerankers
+# CUDA target archs, e.g. --build-arg CUDA_DOCKER_ARCH='75;86;89;120'
+ARG CUDA_DOCKER_ARCH
+ARG CMAKE_ARGS
 
-RUN --mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
-    APT_MIRROR="${APT_MIRROR}" APT_PORTS_MIRROR="${APT_PORTS_MIRROR}" sh /usr/local/sbin/apt-mirror && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends \
-        build-essential \
-        ccache git \
-        ca-certificates \
-        make \
-        pkg-config libcurl4-openssl-dev \
-        curl unzip \
-        libssl-dev wget && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+ENV BUILD_TYPE=${BUILD_TYPE} \
+    CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} \
+    CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION} \
+    CMAKE_FROM_SOURCE=${CMAKE_FROM_SOURCE} \
+    CMAKE_VERSION=${CMAKE_VERSION} \
+    GRPC_VERSION=${GRPC_VERSION} \
+    GRPC_MAKEFLAGS=${GRPC_MAKEFLAGS} \
+    SKIP_DRIVERS=${SKIP_DRIVERS} \
+    TARGETARCH=${TARGETARCH} \
+    UBUNTU_VERSION=${UBUNTU_VERSION} \
+    APT_MIRROR=${APT_MIRROR} \
+    APT_PORTS_MIRROR=${APT_PORTS_MIRROR} \
+    AMDGPU_TARGETS=${AMDGPU_TARGETS} \
+    CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} \
+    CMAKE_ARGS=${CMAKE_ARGS} \
+    DEBIAN_FRONTEND=noninteractive
 
-# Cuda
+# CUDA on PATH (no-op when CUDA isn't installed)
 ENV PATH=/usr/local/cuda/bin:${PATH}
-
-# HipBLAS requirements
+# HipBLAS / ROCm on PATH (no-op when ROCm isn't installed)
 ENV PATH=/opt/rocm/bin:${PATH}
 
+WORKDIR /build
 
-# Vulkan requirements
-RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
-        apt-get update && \
-        apt-get install -y  --no-install-recommends \
-            software-properties-common pciutils wget gpg-agent && \
-        apt-get install -y libglm-dev cmake libxcb-dri3-0 libxcb-present0 libpciaccess0 \
-            libpng-dev libxcb-keysyms1-dev libxcb-dri3-dev libx11-dev g++ gcc \
-            libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
-            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
-            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
-            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
-        if [ "amd64" = "$TARGETARCH" ]; then
-            wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
-            tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
-            rm vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
-            mkdir -p /opt/vulkan-sdk && \
-            mv 1.4.335.0 /opt/vulkan-sdk/ && \
-            cd /opt/vulkan-sdk/1.4.335.0 && \
-            ./vulkansdk --no-deps --maxjobs \
-                vulkan-loader \
-                vulkan-validationlayers \
-                vulkan-extensionlayer \
-                vulkan-tools \
-                shaderc && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/bin/* /usr/bin/ && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/lib/* /usr/lib/x86_64-linux-gnu/ && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/include/* /usr/include/ && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/share/* /usr/share/ && \
-            rm -rf /opt/vulkan-sdk
-        fi
-        if [ "arm64" = "$TARGETARCH" ]; then
-            mkdir vulkan && cd vulkan && \
-            curl -L -o vulkan-sdk.tar.xz https://github.com/mudler/vulkan-sdk-arm/releases/download/1.4.335.0/vulkansdk-ubuntu-24.04-arm-1.4.335.0.tar.xz && \
-            tar -xvf vulkan-sdk.tar.xz && \
-            rm vulkan-sdk.tar.xz && \
-            cd 1.4.335.0 && \
-            cp -rfv aarch64/bin/* /usr/bin/ && \
-            cp -rfv aarch64/lib/* /usr/lib/aarch64-linux-gnu/ && \
-            cp -rfv aarch64/include/* /usr/include/ && \
-            cp -rfv aarch64/share/* /usr/share/ && \
-            cd ../.. && \
-            rm -rf vulkan
-        fi
-        ldconfig && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-# CuBLAS requirements
-RUN <<EOT bash
-    if ( [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "l4t" ] ) && [ "${SKIP_DRIVERS}" = "false" ]; then
-        apt-get update && \
-        apt-get install -y  --no-install-recommends \
-            software-properties-common pciutils
-        if [ "amd64" = "$TARGETARCH" ]; then
-            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
-        fi
-        if [ "arm64" = "$TARGETARCH" ]; then
-            if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
-            else
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
-            fi
-        fi
-        dpkg -i cuda-keyring_1.1-1_all.deb && \
-        rm -f cuda-keyring_1.1-1_all.deb && \
-        apt-get update && \
-        apt-get install -y --no-install-recommends \
-            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
-            apt-get install -y --no-install-recommends \
-            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        fi
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-
-# https://github.com/NVIDIA/Isaac-GR00T/issues/343
-RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
-        wget https://developer.download.nvidia.com/compute/cudss/0.6.0/local_installers/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb && \
-        dpkg -i cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb && \
-        cp /var/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0/cudss-*-keyring.gpg /usr/share/keyrings/ && \
-        apt-get update && apt-get -y install cudss cudss-cuda-${CUDA_MAJOR_VERSION} && \
-        wget https://developer.download.nvidia.com/compute/nvpl/25.5/local_installers/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb && \
-        dpkg -i nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb && \
-        cp /var/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5/nvpl-*-keyring.gpg /usr/share/keyrings/ && \
-        apt-get update && apt-get install -y nvpl
-    fi
-EOT
-
-# If we are building with clblas support, we need the libraries for the builds
-RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
-        apt-get update && \
-        apt-get install -y --no-install-recommends \
-            libclblast-dev && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/* \
-    ; fi
-
-RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
-        apt-get update && \
-        apt-get install -y --no-install-recommends \
-            hipblas-dev \
-            hipblaslt-dev \
-            rocblas-dev && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/* && \
-        # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
-        # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
-        ldconfig \
-    ; fi
-
-RUN echo "TARGETARCH: $TARGETARCH"
-
-# We need protoc installed, and the version in 22.04 is too old.  We will create one as part installing the GRPC build below
-# but that will also being in a newer version of absl which stablediffusion cannot compile with.  This version of protoc is only
-# here so that we can generate the grpc code for the stablediffusion build
-RUN <<EOT bash
-    if [ "amd64" = "$TARGETARCH" ]; then
-        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
-        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
-        rm protoc.zip
-    fi
-    if [ "arm64" = "$TARGETARCH" ]; then
-        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
-        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
-        rm protoc.zip
-    fi
-EOT
-
-# Install CMake (the version in 22.04 is too old)
-RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
-        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-    else
-        apt-get update && \
-        apt-get install -y \
-            cmake && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-COPY --from=grpc /opt/grpc /usr/local
+# Install everything via the shared script — the same one that
+# backend/Dockerfile.base-grpc-builder runs, so the prebuilt CI base and
+# this from-source path are bit-equivalent.
+RUN --mount=type=bind,source=.docker/install-base-deps.sh,target=/usr/local/sbin/install-base-deps \
+    --mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
+    bash /usr/local/sbin/install-base-deps
 
+# Mirror builder-prebuilt: copy gRPC from /opt/grpc to /usr/local so
+# CMake's find_package finds it at the canonical prefix the Makefile expects.
+RUN cp -a /opt/grpc/. /usr/local/
 
 COPY . /LocalAI
 
@@ -263,40 +87,63 @@ COPY . /LocalAI
 # for the rationale. Distinct mount id so ik-llama-cpp's cache doesn't
 # overlap with llama-cpp's — ik_llama.cpp is a different fork with
 # different source.
-RUN --mount=type=cache,target=/root/.ccache,id=ik-llama-cpp-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked <<'EOT' bash
-set -euxo pipefail
-
-export CCACHE_DIR=/root/.ccache
-ccache --max-size=5G || true
-ccache -z || true
-
-export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
-
-if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
-  CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
-  export CMAKE_ARGS="${CMAKE_ARGS} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
-  echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
-  rm -rf /LocalAI/backend/cpp/ik-llama-cpp-*-build
-fi
-
-cd /LocalAI/backend/cpp/ik-llama-cpp
-
-if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
-  # ARM64 / ROCm: build without x86 SIMD
-  make ik-llama-cpp-fallback
-else
-  # ik_llama.cpp's IQK kernels require at least AVX2
-  make ik-llama-cpp-avx2
-fi
-
-ccache -s || true
-EOT
+#
+# The compile body is shared with builder-prebuilt via .docker/ik-llama-cpp-compile.sh.
+RUN --mount=type=bind,source=.docker/ik-llama-cpp-compile.sh,target=/usr/local/sbin/compile.sh \
+    --mount=type=cache,target=/root/.ccache,id=ik-llama-cpp-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked \
+    bash /usr/local/sbin/compile.sh
 
 
 # Copy libraries using a script to handle architecture differences
 RUN make -BC /LocalAI/backend/cpp/ik-llama-cpp package
 
 
+# ============================================================================
+# Stage: builder-prebuilt — uses the pre-built base from
+# quay.io/go-skynet/ci-cache:base-grpc-* (built by .github/workflows/base-images.yml).
+# That image already has gRPC at /opt/grpc + apt deps + CUDA/ROCm/Vulkan
+# pre-installed, so we just copy gRPC to /usr/local and compile. Used when
+# BUILDER_TARGET=builder-prebuilt (CI when the matrix entry sets
+# builder-base-image).
+# ============================================================================
+FROM ${BUILDER_BASE_IMAGE} AS builder-prebuilt
+
+ARG BUILD_TYPE
+ENV BUILD_TYPE=${BUILD_TYPE}
+ARG CUDA_DOCKER_ARCH
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+ARG CMAKE_ARGS
+ENV CMAKE_ARGS=${CMAKE_ARGS}
+ARG TARGETARCH
+ARG TARGETVARIANT
+
+# The base-grpc-* image installs gRPC to /opt/grpc but doesn't copy it to
+# /usr/local. Mirror what the from-source path does so the compile step
+# can find gRPC at the canonical prefix the Makefile expects.
+RUN cp -a /opt/grpc/. /usr/local/
+
+COPY . /LocalAI
+
+RUN --mount=type=bind,source=.docker/ik-llama-cpp-compile.sh,target=/usr/local/sbin/compile.sh \
+    --mount=type=cache,target=/root/.ccache,id=ik-llama-cpp-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked \
+    bash /usr/local/sbin/compile.sh
+
+RUN make -BC /LocalAI/backend/cpp/ik-llama-cpp package
+
+
+# ============================================================================
+# Final stage — copies package output from one of the two builders.
+# BUILDER_TARGET selects which one. BuildKit prunes the unreferenced builder.
+#
+# BuildKit doesn't support variable expansion in `COPY --from=` directly,
+# so we resolve the ARG by aliasing the chosen builder to a fixed stage
+# name via `FROM ${BUILDER_TARGET} AS builder` and then COPY --from=builder.
+# BUILDER_TARGET itself is declared as a global ARG at the top of this
+# file (before the first FROM), which is what makes it usable in the FROM
+# line below; no per-stage `ARG` re-declaration is needed for that.
+# ============================================================================
+FROM ${BUILDER_TARGET} AS builder
+
 FROM scratch
 
 
diff --git a/backend/Dockerfile.llama-cpp b/backend/Dockerfile.llama-cpp
index 4e6bd3b18..8e725ef62 100644
--- a/backend/Dockerfile.llama-cpp
+++ b/backend/Dockerfile.llama-cpp
@@ -1,267 +1,84 @@
 ARG BASE_IMAGE=ubuntu:24.04
-ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
+# BUILDER_BASE_IMAGE defaults to BASE_IMAGE so the Dockerfile parses even
+# when no prebuilt base is supplied. The builder-prebuilt stage is only
+# built when BUILDER_TARGET=builder-prebuilt, so this fallback value is
+# harmless otherwise — BuildKit prunes the unreferenced builder.
+ARG BUILDER_BASE_IMAGE=${BASE_IMAGE}
+# BUILDER_TARGET selects which builder stage the final scratch image copies
+# package output from. Declared at global scope (before any FROM) so it's
+# usable in `FROM ${BUILDER_TARGET}` below. Default keeps local
+# `make backends/llama-cpp` on the from-source path.
+ARG BUILDER_TARGET=builder-fromsource
 ARG APT_MIRROR=""
 ARG APT_PORTS_MIRROR=""
 
 
-# The grpc target does one thing, it builds and installs GRPC.  This is in it's own layer so that it can be effectively cached by CI.
-# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
-FROM ${GRPC_BASE_IMAGE} AS grpc
-
-# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
-ARG GRPC_MAKEFLAGS="-j4 -Otarget"
-ARG GRPC_VERSION=v1.65.0
+# ============================================================================
+# Stage: builder-fromsource — self-contained build path.
+# Runs .docker/install-base-deps.sh (apt deps + cmake + protoc + gRPC +
+# conditional CUDA/ROCm/Vulkan), copies /opt/grpc to /usr/local, then
+# compiles the variant. Used when BUILDER_TARGET=builder-fromsource (the
+# default; local `make backends/llama-cpp`).
+#
+# The install script is the same one that backend/Dockerfile.base-grpc-builder
+# runs, so this stage is provisioned by the same steps as the prebuilt-base path
+# (builder-prebuilt below).
+# ============================================================================
+FROM ${BASE_IMAGE} AS builder-fromsource
+ARG BUILD_TYPE
+ARG CUDA_MAJOR_VERSION
+ARG CUDA_MINOR_VERSION
 ARG CMAKE_FROM_SOURCE=false
 # CUDA Toolkit 13.x compatibility: CMake 3.31.9+ fixes toolchain detection/arch table issues
 ARG CMAKE_VERSION=3.31.10
-ARG APT_MIRROR
-ARG APT_PORTS_MIRROR
-
-ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
-
-WORKDIR /build
-
-RUN --mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
-    APT_MIRROR="${APT_MIRROR}" APT_PORTS_MIRROR="${APT_PORTS_MIRROR}" sh /usr/local/sbin/apt-mirror && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends \
-        ca-certificates \
-        build-essential curl libssl-dev \
-        git wget && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install CMake (the version in 22.04 is too old)
-RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
-        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-    else
-        apt-get update && \
-        apt-get install -y \
-            cmake && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
-# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
-# and running make install in the target container
-RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-    mkdir -p /build/grpc/cmake/build && \
-    cd /build/grpc/cmake/build && \
-    sed -i "216i\  TESTONLY" "../../third_party/abseil-cpp/absl/container/CMakeLists.txt" && \
-    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
-    make && \
-    make install && \
-    rm -rf /build
-
-FROM ${BASE_IMAGE} AS builder
-ARG CMAKE_FROM_SOURCE=false
-ARG CMAKE_VERSION=3.31.10
-# We can target specific CUDA ARCHITECTURES like --build-arg CUDA_DOCKER_ARCH='75;86;89;120'
-ARG CUDA_DOCKER_ARCH
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-ARG CMAKE_ARGS
-ENV CMAKE_ARGS=${CMAKE_ARGS}
-ARG AMDGPU_TARGETS
-ENV AMDGPU_TARGETS=${AMDGPU_TARGETS}
-ARG BACKEND=rerankers
-ARG BUILD_TYPE
-ENV BUILD_TYPE=${BUILD_TYPE}
-ARG CUDA_MAJOR_VERSION
-ARG CUDA_MINOR_VERSION
+ARG GRPC_VERSION=v1.65.0
+ARG GRPC_MAKEFLAGS="-j4 -Otarget"
 ARG SKIP_DRIVERS=false
-ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
-ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
-ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETARCH
 ARG TARGETVARIANT
 ARG GO_VERSION=1.25.4
 ARG UBUNTU_VERSION=2404
 ARG APT_MIRROR
 ARG APT_PORTS_MIRROR
+ARG AMDGPU_TARGETS
+# CUDA target archs, e.g. --build-arg CUDA_DOCKER_ARCH='75;86;89;120'
+ARG CUDA_DOCKER_ARCH
+ARG CMAKE_ARGS
 
-RUN --mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
-    APT_MIRROR="${APT_MIRROR}" APT_PORTS_MIRROR="${APT_PORTS_MIRROR}" sh /usr/local/sbin/apt-mirror && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends \
-        build-essential \
-        ccache git \
-        ca-certificates \
-        make \
-        pkg-config libcurl4-openssl-dev \
-        curl unzip \
-        libssl-dev wget && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+ENV BUILD_TYPE=${BUILD_TYPE} \
+    CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} \
+    CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION} \
+    CMAKE_FROM_SOURCE=${CMAKE_FROM_SOURCE} \
+    CMAKE_VERSION=${CMAKE_VERSION} \
+    GRPC_VERSION=${GRPC_VERSION} \
+    GRPC_MAKEFLAGS=${GRPC_MAKEFLAGS} \
+    SKIP_DRIVERS=${SKIP_DRIVERS} \
+    TARGETARCH=${TARGETARCH} \
+    UBUNTU_VERSION=${UBUNTU_VERSION} \
+    APT_MIRROR=${APT_MIRROR} \
+    APT_PORTS_MIRROR=${APT_PORTS_MIRROR} \
+    AMDGPU_TARGETS=${AMDGPU_TARGETS} \
+    CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} \
+    CMAKE_ARGS=${CMAKE_ARGS} \
+    DEBIAN_FRONTEND=noninteractive
 
-# Cuda
+# CUDA on PATH (no-op when CUDA isn't installed)
 ENV PATH=/usr/local/cuda/bin:${PATH}
-
-# HipBLAS requirements
+# HipBLAS / ROCm on PATH (no-op when ROCm isn't installed)
 ENV PATH=/opt/rocm/bin:${PATH}
 
+WORKDIR /build
 
-# Vulkan requirements
-RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
-        apt-get update && \
-        apt-get install -y  --no-install-recommends \
-            software-properties-common pciutils wget gpg-agent && \
-        apt-get install -y libglm-dev cmake libxcb-dri3-0 libxcb-present0 libpciaccess0 \
-            libpng-dev libxcb-keysyms1-dev libxcb-dri3-dev libx11-dev g++ gcc \
-            libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
-            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
-            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
-            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
-        if [ "amd64" = "$TARGETARCH" ]; then
-            wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
-            tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
-            rm vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
-            mkdir -p /opt/vulkan-sdk && \
-            mv 1.4.335.0 /opt/vulkan-sdk/ && \
-            cd /opt/vulkan-sdk/1.4.335.0 && \
-            ./vulkansdk --no-deps --maxjobs \
-                vulkan-loader \
-                vulkan-validationlayers \
-                vulkan-extensionlayer \
-                vulkan-tools \
-                shaderc && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/bin/* /usr/bin/ && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/lib/* /usr/lib/x86_64-linux-gnu/ && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/include/* /usr/include/ && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/share/* /usr/share/ && \
-            rm -rf /opt/vulkan-sdk
-        fi
-        if [ "arm64" = "$TARGETARCH" ]; then
-            mkdir vulkan && cd vulkan && \
-            curl -L -o vulkan-sdk.tar.xz https://github.com/mudler/vulkan-sdk-arm/releases/download/1.4.335.0/vulkansdk-ubuntu-24.04-arm-1.4.335.0.tar.xz && \
-            tar -xvf vulkan-sdk.tar.xz && \
-            rm vulkan-sdk.tar.xz && \
-            cd 1.4.335.0 && \
-            cp -rfv aarch64/bin/* /usr/bin/ && \
-            cp -rfv aarch64/lib/* /usr/lib/aarch64-linux-gnu/ && \
-            cp -rfv aarch64/include/* /usr/include/ && \
-            cp -rfv aarch64/share/* /usr/share/ && \
-            cd ../.. && \
-            rm -rf vulkan
-        fi
-        ldconfig && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-# CuBLAS requirements
-RUN <<EOT bash
-    if ( [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "l4t" ] ) && [ "${SKIP_DRIVERS}" = "false" ]; then
-        apt-get update && \
-        apt-get install -y  --no-install-recommends \
-            software-properties-common pciutils
-        if [ "amd64" = "$TARGETARCH" ]; then
-            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
-        fi
-        if [ "arm64" = "$TARGETARCH" ]; then
-            if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
-            else
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
-            fi
-        fi
-        dpkg -i cuda-keyring_1.1-1_all.deb && \
-        rm -f cuda-keyring_1.1-1_all.deb && \
-        apt-get update && \
-        apt-get install -y --no-install-recommends \
-            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
-            apt-get install -y --no-install-recommends \
-            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        fi
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-
-# https://github.com/NVIDIA/Isaac-GR00T/issues/343
-RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
-        wget https://developer.download.nvidia.com/compute/cudss/0.6.0/local_installers/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb && \
-        dpkg -i cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb && \
-        cp /var/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0/cudss-*-keyring.gpg /usr/share/keyrings/ && \
-        apt-get update && apt-get -y install cudss cudss-cuda-${CUDA_MAJOR_VERSION} && \
-        wget https://developer.download.nvidia.com/compute/nvpl/25.5/local_installers/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb && \
-        dpkg -i nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb && \
-        cp /var/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5/nvpl-*-keyring.gpg /usr/share/keyrings/ && \
-        apt-get update && apt-get install -y nvpl
-    fi
-EOT
-
-# If we are building with clblas support, we need the libraries for the builds
-RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
-        apt-get update && \
-        apt-get install -y --no-install-recommends \
-            libclblast-dev && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/* \
-    ; fi
-
-RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
-        apt-get update && \
-        apt-get install -y --no-install-recommends \
-            hipblas-dev \
-            hipblaslt-dev \
-            rocblas-dev && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/* && \
-        # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
-        # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
-        ldconfig && \
-        # Log which GPU architectures have rocBLAS kernel support
-        echo "rocBLAS library data architectures:" && \
-        (ls /opt/rocm*/lib/rocblas/library/Kernels* 2>/dev/null || ls /opt/rocm*/lib64/rocblas/library/Kernels* 2>/dev/null) | grep -oP 'gfx[0-9a-z+-]+' | sort -u || \
-        echo "WARNING: No rocBLAS kernel data found" \
-    ; fi
-
-RUN echo "TARGETARCH: $TARGETARCH"
-
-# We need protoc installed, and the version in 22.04 is too old.  We will create one as part installing the GRPC build below
-# but that will also being in a newer version of absl which stablediffusion cannot compile with.  This version of protoc is only
-# here so that we can generate the grpc code for the stablediffusion build
-RUN <<EOT bash
-    if [ "amd64" = "$TARGETARCH" ]; then
-        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
-        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
-        rm protoc.zip
-    fi
-    if [ "arm64" = "$TARGETARCH" ]; then
-        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
-        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
-        rm protoc.zip
-    fi
-EOT
-
-# Install CMake (the version in 22.04 is too old)
-RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
-        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-    else
-        apt-get update && \
-        apt-get install -y \
-            cmake && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-COPY --from=grpc /opt/grpc /usr/local
+# Install everything via the shared script — the same one that
+# backend/Dockerfile.base-grpc-builder runs, so the prebuilt CI base and
+# this from-source path are provisioned by identical steps.
+RUN --mount=type=bind,source=.docker/install-base-deps.sh,target=/usr/local/sbin/install-base-deps \
+    --mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
+    bash /usr/local/sbin/install-base-deps
 
+# Mirror builder-prebuilt: copy gRPC from /opt/grpc to /usr/local so
+# CMake's find_package finds it at the canonical prefix the Makefile expects.
+RUN cp -a /opt/grpc/. /usr/local/
 
 COPY . /LocalAI
 
@@ -273,45 +90,66 @@ COPY . /LocalAI
 # CMAKE_*_COMPILER_LAUNCHER threads ccache through CMake to wrap gcc/g++/nvcc.
 # sharing=locked serializes concurrent writes if multiple matrix variants
 # share the same cache mount id.
-RUN --mount=type=cache,target=/root/.ccache,id=llama-cpp-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked <<'EOT' bash
-set -euxo pipefail
-
-export CCACHE_DIR=/root/.ccache
-ccache --max-size=5G || true
-ccache -z || true
-
-export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
-
-if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
-  CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
-  export CMAKE_ARGS="${CMAKE_ARGS} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
-  echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
-  rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
-fi
-
-if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
-  cd /LocalAI/backend/cpp/llama-cpp
-  make llama-cpp-fallback
-  make llama-cpp-grpc
-  make llama-cpp-rpc-server
-else
-  cd /LocalAI/backend/cpp/llama-cpp
-  make llama-cpp-avx
-  make llama-cpp-avx2
-  make llama-cpp-avx512
-  make llama-cpp-fallback
-  make llama-cpp-grpc
-  make llama-cpp-rpc-server
-fi
-
-ccache -s || true
-EOT
+#
+# The compile body is shared with builder-prebuilt via .docker/llama-cpp-compile.sh.
+RUN --mount=type=bind,source=.docker/llama-cpp-compile.sh,target=/usr/local/sbin/compile.sh \
+    --mount=type=cache,target=/root/.ccache,id=llama-cpp-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked \
+    bash /usr/local/sbin/compile.sh
 
 
 # Copy libraries using a script to handle architecture differences
 RUN make -BC /LocalAI/backend/cpp/llama-cpp package
 
 
+# ============================================================================
+# Stage: builder-prebuilt — uses the pre-built base from
+# quay.io/go-skynet/ci-cache:base-grpc-* (built by .github/workflows/base-images.yml).
+# That image already has gRPC at /opt/grpc + apt deps + CUDA/ROCm/Vulkan
+# pre-installed, so we just copy gRPC to /usr/local and compile. Used when
+# BUILDER_TARGET=builder-prebuilt (CI when the matrix entry sets
+# builder-base-image).
+# ============================================================================
+FROM ${BUILDER_BASE_IMAGE} AS builder-prebuilt
+
+ARG BUILD_TYPE
+ENV BUILD_TYPE=${BUILD_TYPE}
+ARG CUDA_DOCKER_ARCH
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+ARG CMAKE_ARGS
+ENV CMAKE_ARGS=${CMAKE_ARGS}
+ARG AMDGPU_TARGETS
+ENV AMDGPU_TARGETS=${AMDGPU_TARGETS}
+ARG TARGETARCH
+ARG TARGETVARIANT
+
+# The base-grpc-* image installs gRPC to /opt/grpc but doesn't copy it to
+# /usr/local. The builder-fromsource stage above performs the same copy
+# after its install step; mirror it here so the compile step can find
+# gRPC at the canonical prefix the Makefile expects.
+RUN cp -a /opt/grpc/. /usr/local/
+
+COPY . /LocalAI
+
+RUN --mount=type=bind,source=.docker/llama-cpp-compile.sh,target=/usr/local/sbin/compile.sh \
+    --mount=type=cache,target=/root/.ccache,id=llama-cpp-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked \
+    bash /usr/local/sbin/compile.sh
+
+RUN make -BC /LocalAI/backend/cpp/llama-cpp package
+
+
+# ============================================================================
+# Final stage — copies package output from one of the two builders.
+# BUILDER_TARGET selects which one. BuildKit prunes the unreferenced builder.
+#
+# BuildKit doesn't support variable expansion in `COPY --from=` directly,
+# so we resolve the ARG by aliasing the chosen builder to a fixed stage
+# name via `FROM ${BUILDER_TARGET} AS builder` and then COPY --from=builder.
+# BUILDER_TARGET itself is declared as a global ARG at the top of this
+# file, which is what makes it usable in a FROM line; no per-stage
+# `ARG BUILDER_TARGET` re-declaration is needed for that.
+# ============================================================================
+FROM ${BUILDER_TARGET} AS builder
+
 FROM scratch
 
 
diff --git a/backend/Dockerfile.turboquant b/backend/Dockerfile.turboquant
index db8648417..ffdccf416 100644
--- a/backend/Dockerfile.turboquant
+++ b/backend/Dockerfile.turboquant
@@ -1,265 +1,85 @@
 ARG BASE_IMAGE=ubuntu:24.04
-ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
+# BUILDER_BASE_IMAGE defaults to BASE_IMAGE so the Dockerfile parses even
+# when no prebuilt base is supplied. The builder-prebuilt stage is only
+# built when BUILDER_TARGET=builder-prebuilt, so this fallback value is
+# harmless otherwise — BuildKit prunes the unreferenced builder.
+ARG BUILDER_BASE_IMAGE=${BASE_IMAGE}
+# BUILDER_TARGET selects which builder stage the final scratch image copies
+# package output from. Declared at global scope (before any FROM) so it's
+# usable in `FROM ${BUILDER_TARGET}` below. Default keeps local
+# `make backends/turboquant` on the from-source path.
+ARG BUILDER_TARGET=builder-fromsource
 ARG APT_MIRROR=""
 ARG APT_PORTS_MIRROR=""
 
 
-# The grpc target does one thing, it builds and installs GRPC.  This is in it's own layer so that it can be effectively cached by CI.
-# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
-FROM ${GRPC_BASE_IMAGE} AS grpc
-
-# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
-ARG GRPC_MAKEFLAGS="-j4 -Otarget"
-ARG GRPC_VERSION=v1.65.0
+# ============================================================================
+# Stage: builder-fromsource — self-contained build path.
+# Runs .docker/install-base-deps.sh (apt deps + cmake + protoc + gRPC +
+# conditional CUDA/ROCm/Vulkan), copies /opt/grpc to /usr/local, then
+# compiles the variant. Used when BUILDER_TARGET=builder-fromsource (the
+# default; local `make backends/turboquant`).
+#
+# The install script is the same one that backend/Dockerfile.base-grpc-builder
+# runs, so this stage is provisioned by the same steps as the prebuilt-base path
+# (builder-prebuilt below).
+# ============================================================================
+FROM ${BASE_IMAGE} AS builder-fromsource
+ARG BUILD_TYPE
+ARG CUDA_MAJOR_VERSION
+ARG CUDA_MINOR_VERSION
 ARG CMAKE_FROM_SOURCE=false
 # CUDA Toolkit 13.x compatibility: CMake 3.31.9+ fixes toolchain detection/arch table issues
 ARG CMAKE_VERSION=3.31.10
-ARG APT_MIRROR
-ARG APT_PORTS_MIRROR
-
-ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
-
-WORKDIR /build
-
-RUN --mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
-    APT_MIRROR="${APT_MIRROR}" APT_PORTS_MIRROR="${APT_PORTS_MIRROR}" sh /usr/local/sbin/apt-mirror && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends \
-        ca-certificates \
-        build-essential curl libssl-dev \
-        git wget && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install CMake (the version in 22.04 is too old)
-RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
-        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-    else
-        apt-get update && \
-        apt-get install -y \
-            cmake && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
-# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
-# and running make install in the target container
-RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-    mkdir -p /build/grpc/cmake/build && \
-    cd /build/grpc/cmake/build && \
-    sed -i "216i\  TESTONLY" "../../third_party/abseil-cpp/absl/container/CMakeLists.txt" && \
-    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
-    make && \
-    make install && \
-    rm -rf /build
-
-FROM ${BASE_IMAGE} AS builder
-ARG CMAKE_FROM_SOURCE=false
-ARG CMAKE_VERSION=3.31.10
-# We can target specific CUDA ARCHITECTURES like --build-arg CUDA_DOCKER_ARCH='75;86;89;120'
-ARG CUDA_DOCKER_ARCH
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-ARG CMAKE_ARGS
-ENV CMAKE_ARGS=${CMAKE_ARGS}
-ARG BACKEND=rerankers
-ARG BUILD_TYPE
-ENV BUILD_TYPE=${BUILD_TYPE}
-ARG CUDA_MAJOR_VERSION
-ARG CUDA_MINOR_VERSION
+ARG GRPC_VERSION=v1.65.0
+ARG GRPC_MAKEFLAGS="-j4 -Otarget"
 ARG SKIP_DRIVERS=false
-ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
-ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
-ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETARCH
 ARG TARGETVARIANT
 ARG GO_VERSION=1.25.4
 ARG UBUNTU_VERSION=2404
 ARG APT_MIRROR
 ARG APT_PORTS_MIRROR
+ARG AMDGPU_TARGETS=""
+ARG BACKEND=rerankers
+# CUDA target archs, e.g. --build-arg CUDA_DOCKER_ARCH='75;86;89;120'
+ARG CUDA_DOCKER_ARCH
+ARG CMAKE_ARGS
 
-RUN --mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
-    APT_MIRROR="${APT_MIRROR}" APT_PORTS_MIRROR="${APT_PORTS_MIRROR}" sh /usr/local/sbin/apt-mirror && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends \
-        build-essential \
-        ccache git \
-        ca-certificates \
-        make \
-        pkg-config libcurl4-openssl-dev \
-        curl unzip \
-        libssl-dev wget && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+ENV BUILD_TYPE=${BUILD_TYPE} \
+    CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} \
+    CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION} \
+    CMAKE_FROM_SOURCE=${CMAKE_FROM_SOURCE} \
+    CMAKE_VERSION=${CMAKE_VERSION} \
+    GRPC_VERSION=${GRPC_VERSION} \
+    GRPC_MAKEFLAGS=${GRPC_MAKEFLAGS} \
+    SKIP_DRIVERS=${SKIP_DRIVERS} \
+    TARGETARCH=${TARGETARCH} \
+    UBUNTU_VERSION=${UBUNTU_VERSION} \
+    APT_MIRROR=${APT_MIRROR} \
+    APT_PORTS_MIRROR=${APT_PORTS_MIRROR} \
+    AMDGPU_TARGETS=${AMDGPU_TARGETS} \
+    CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} \
+    CMAKE_ARGS=${CMAKE_ARGS} \
+    DEBIAN_FRONTEND=noninteractive
 
-# Cuda
+# CUDA on PATH (no-op when CUDA isn't installed)
 ENV PATH=/usr/local/cuda/bin:${PATH}
-
-# HipBLAS requirements
+# HipBLAS / ROCm on PATH (no-op when ROCm isn't installed)
 ENV PATH=/opt/rocm/bin:${PATH}
 
+WORKDIR /build
 
-# Vulkan requirements
-RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
-        apt-get update && \
-        apt-get install -y  --no-install-recommends \
-            software-properties-common pciutils wget gpg-agent && \
-        apt-get install -y libglm-dev cmake libxcb-dri3-0 libxcb-present0 libpciaccess0 \
-            libpng-dev libxcb-keysyms1-dev libxcb-dri3-dev libx11-dev g++ gcc \
-            libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
-            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
-            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
-            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
-        if [ "amd64" = "$TARGETARCH" ]; then
-            wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
-            tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
-            rm vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
-            mkdir -p /opt/vulkan-sdk && \
-            mv 1.4.335.0 /opt/vulkan-sdk/ && \
-            cd /opt/vulkan-sdk/1.4.335.0 && \
-            ./vulkansdk --no-deps --maxjobs \
-                vulkan-loader \
-                vulkan-validationlayers \
-                vulkan-extensionlayer \
-                vulkan-tools \
-                shaderc && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/bin/* /usr/bin/ && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/lib/* /usr/lib/x86_64-linux-gnu/ && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/include/* /usr/include/ && \
-            cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/share/* /usr/share/ && \
-            rm -rf /opt/vulkan-sdk
-        fi
-        if [ "arm64" = "$TARGETARCH" ]; then
-            mkdir vulkan && cd vulkan && \
-            curl -L -o vulkan-sdk.tar.xz https://github.com/mudler/vulkan-sdk-arm/releases/download/1.4.335.0/vulkansdk-ubuntu-24.04-arm-1.4.335.0.tar.xz && \
-            tar -xvf vulkan-sdk.tar.xz && \
-            rm vulkan-sdk.tar.xz && \
-            cd 1.4.335.0 && \
-            cp -rfv aarch64/bin/* /usr/bin/ && \
-            cp -rfv aarch64/lib/* /usr/lib/aarch64-linux-gnu/ && \
-            cp -rfv aarch64/include/* /usr/include/ && \
-            cp -rfv aarch64/share/* /usr/share/ && \
-            cd ../.. && \
-            rm -rf vulkan
-        fi
-        ldconfig && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-# CuBLAS requirements
-RUN <<EOT bash
-    if ( [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "l4t" ] ) && [ "${SKIP_DRIVERS}" = "false" ]; then
-        apt-get update && \
-        apt-get install -y  --no-install-recommends \
-            software-properties-common pciutils
-        if [ "amd64" = "$TARGETARCH" ]; then
-            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
-        fi
-        if [ "arm64" = "$TARGETARCH" ]; then
-            if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
-            else
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
-            fi
-        fi
-        dpkg -i cuda-keyring_1.1-1_all.deb && \
-        rm -f cuda-keyring_1.1-1_all.deb && \
-        apt-get update && \
-        apt-get install -y --no-install-recommends \
-            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
-            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
-            apt-get install -y --no-install-recommends \
-            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        fi
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-
-# https://github.com/NVIDIA/Isaac-GR00T/issues/343
-RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
-        wget https://developer.download.nvidia.com/compute/cudss/0.6.0/local_installers/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb && \
-        dpkg -i cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb && \
-        cp /var/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0/cudss-*-keyring.gpg /usr/share/keyrings/ && \
-        apt-get update && apt-get -y install cudss cudss-cuda-${CUDA_MAJOR_VERSION} && \
-        wget https://developer.download.nvidia.com/compute/nvpl/25.5/local_installers/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb && \
-        dpkg -i nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb && \
-        cp /var/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5/nvpl-*-keyring.gpg /usr/share/keyrings/ && \
-        apt-get update && apt-get install -y nvpl
-    fi
-EOT
-
-# If we are building with clblas support, we need the libraries for the builds
-RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
-        apt-get update && \
-        apt-get install -y --no-install-recommends \
-            libclblast-dev && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/* \
-    ; fi
-
-RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
-        apt-get update && \
-        apt-get install -y --no-install-recommends \
-            hipblas-dev \
-            hipblaslt-dev \
-            rocblas-dev && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/* && \
-        # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
-        # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
-        ldconfig && \
-        # Log which GPU architectures have rocBLAS kernel support
-        echo "rocBLAS library data architectures:" && \
-        (ls /opt/rocm*/lib/rocblas/library/Kernels* 2>/dev/null || ls /opt/rocm*/lib64/rocblas/library/Kernels* 2>/dev/null) | grep -oP 'gfx[0-9a-z+-]+' | sort -u || \
-        echo "WARNING: No rocBLAS kernel data found" \
-    ; fi
-
-RUN echo "TARGETARCH: $TARGETARCH"
-
-# We need protoc installed, and the version in 22.04 is too old.  We will create one as part installing the GRPC build below
-# but that will also being in a newer version of absl which stablediffusion cannot compile with.  This version of protoc is only
-# here so that we can generate the grpc code for the stablediffusion build
-RUN <<EOT bash
-    if [ "amd64" = "$TARGETARCH" ]; then
-        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
-        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
-        rm protoc.zip
-    fi
-    if [ "arm64" = "$TARGETARCH" ]; then
-        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
-        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
-        rm protoc.zip
-    fi
-EOT
-
-# Install CMake (the version in 22.04 is too old)
-RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
-        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-    else
-        apt-get update && \
-        apt-get install -y \
-            cmake && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-COPY --from=grpc /opt/grpc /usr/local
+# Install everything via the shared script — the same one that
+# backend/Dockerfile.base-grpc-builder runs, so the prebuilt CI base and
+# this from-source path are provisioned by identical steps.
+RUN --mount=type=bind,source=.docker/install-base-deps.sh,target=/usr/local/sbin/install-base-deps \
+    --mount=type=bind,source=.docker/apt-mirror.sh,target=/usr/local/sbin/apt-mirror \
+    bash /usr/local/sbin/install-base-deps
 
+# Mirror builder-prebuilt: copy gRPC from /opt/grpc to /usr/local so
+# CMake's find_package finds it at the canonical prefix the Makefile expects.
+RUN cp -a /opt/grpc/. /usr/local/
 
 COPY . /LocalAI
 
@@ -270,45 +90,63 @@ COPY . /LocalAI
 # id with llama-cpp could give cross-fork hits — but for now keep them
 # separate so a regression in one doesn't poison the other. Revisit
 # sharing after measuring the actual hit rate.
-RUN --mount=type=cache,target=/root/.ccache,id=turboquant-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked <<'EOT' bash
-set -euxo pipefail
-
-export CCACHE_DIR=/root/.ccache
-ccache --max-size=5G || true
-ccache -z || true
-
-export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache"
-
-if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
-  CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
-  export CMAKE_ARGS="${CMAKE_ARGS} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
-  echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
-  rm -rf /LocalAI/backend/cpp/turboquant-*-build
-fi
-
-cd /LocalAI/backend/cpp/turboquant
-
-if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
-  make turboquant-fallback
-  make turboquant-grpc
-  make turboquant-rpc-server
-else
-  make turboquant-avx
-  make turboquant-avx2
-  make turboquant-avx512
-  make turboquant-fallback
-  make turboquant-grpc
-  make turboquant-rpc-server
-fi
-
-ccache -s || true
-EOT
+#
+# The compile body is shared with builder-prebuilt via .docker/turboquant-compile.sh.
+RUN --mount=type=bind,source=.docker/turboquant-compile.sh,target=/usr/local/sbin/compile.sh \
+    --mount=type=cache,target=/root/.ccache,id=turboquant-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked \
+    bash /usr/local/sbin/compile.sh
 
 
 # Copy libraries using a script to handle architecture differences
 RUN make -BC /LocalAI/backend/cpp/turboquant package
 
 
+# ============================================================================
+# Stage: builder-prebuilt — uses the pre-built base from
+# quay.io/go-skynet/ci-cache:base-grpc-* (built by .github/workflows/base-images.yml).
+# That image already has gRPC at /opt/grpc + apt deps + CUDA/ROCm/Vulkan
+# pre-installed, so we just copy gRPC to /usr/local and compile. Used when
+# BUILDER_TARGET=builder-prebuilt (in CI, when the matrix entry sets
+# builder-base-image).
+# ============================================================================
+FROM ${BUILDER_BASE_IMAGE} AS builder-prebuilt
+
+ARG BUILD_TYPE
+ENV BUILD_TYPE=${BUILD_TYPE}
+ARG CUDA_DOCKER_ARCH
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+ARG CMAKE_ARGS
+ENV CMAKE_ARGS=${CMAKE_ARGS}
+ARG TARGETARCH
+ARG TARGETVARIANT
+
+# The base-grpc-* image installs gRPC to /opt/grpc but doesn't copy it to
+# /usr/local. Mirror what the from-source path does so the compile step
+# can find gRPC at the canonical prefix the Makefile expects.
+RUN cp -a /opt/grpc/. /usr/local/
+
+COPY . /LocalAI
+
+RUN --mount=type=bind,source=.docker/turboquant-compile.sh,target=/usr/local/sbin/compile.sh \
+    --mount=type=cache,target=/root/.ccache,id=turboquant-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked \
+    bash /usr/local/sbin/compile.sh
+
+RUN make -BC /LocalAI/backend/cpp/turboquant package
+
+
+# ============================================================================
+# Final stage — copies package output from one of the two builders.
+# BUILDER_TARGET selects which one. BuildKit prunes the unreferenced builder.
+#
+# BuildKit doesn't support variable expansion in `COPY --from=` directly,
+# so we resolve the ARG by aliasing the chosen builder to a fixed stage
+# name via `FROM ${BUILDER_TARGET} AS builder` and then COPY --from=builder.
+# BUILDER_TARGET is declared as a global ARG at the top of this file,
+# i.e. before the first FROM, which is what makes it usable in the FROM
+# line below; it does not need to be re-declared in any stage for that.
+# ============================================================================
+FROM ${BUILDER_TARGET} AS builder
+
 FROM scratch