feat(quantization): add quantization backend (#9096)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-07-01 11:56:57 -04:00 · 2026-03-22 00:56:34 +01:00
parent 4b183b7bb6
commit f7e8d9e791
37 changed files with 2574 additions and 47 deletions
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -131,6 +131,19 @@ jobs:
            dockerfile: "./backend/Dockerfile.python"
            context: "./"
            ubuntu-version: '2404'
+          - build-type: ''  # left empty, as are both CUDA versions below
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64,linux/arm64'  # one entry, two target platforms
+            tag-latest: 'auto'
+            tag-suffix: '-cpu-llama-cpp-quantization'  # suffix labels this as the CPU variant
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'true'  # quoted: a string, not a YAML boolean
+            backend: "llama-cpp-quantization"
+            dockerfile: "./backend/Dockerfile.python"  # same Dockerfile as the entry above
+            context: "./"
+            ubuntu-version: '2404'  # quoted: a string, not the integer 2404
          - build-type: ''
            cuda-major-version: ""
            cuda-minor-version: ""
@@ -2412,6 +2425,9 @@ jobs:
            tag-suffix: "-metal-darwin-arm64-local-store"
            build-type: "metal"
            lang: "go"
+          - backend: "llama-cpp-quantization"  # no `lang` key, unlike the entry above
+            tag-suffix: "-metal-darwin-arm64-llama-cpp-quantization"
+            build-type: "mps"  # NOTE(review): entry above uses "metal"; confirm "mps" is intended
    with:
      backend: ${{ matrix.backend }}
      build-type: ${{ matrix.build-type }}
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -383,6 +383,33 @@ jobs:
        run: |
          make --jobs=5 --output-sync=target -C backend/python/voxcpm
          make --jobs=5 --output-sync=target -C backend/python/voxcpm test
+  tests-llama-cpp-quantization:  # builds llama-quantize, then installs and tests the backend
+    runs-on: ubuntu-latest
+    timeout-minutes: 30  # job is cancelled if it runs longer than 30 minutes
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          submodules: true  # also check out git submodules
+      - name: Dependencies  # apt build tools, uv installer, grpcio-tools 1.64.1
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential cmake curl git python3-pip
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          pip install --user --no-cache-dir grpcio-tools==1.64.1
+      - name: Build llama-quantize from llama.cpp  # NOTE(review): clone names no ref, so it is unpinned
+        run: |
+          git clone --depth 1 https://github.com/ggml-org/llama.cpp.git /tmp/llama.cpp
+          cmake -B /tmp/llama.cpp/build -S /tmp/llama.cpp -DGGML_NATIVE=OFF
+          cmake --build /tmp/llama.cpp/build --target llama-quantize -j$(nproc)
+          sudo cp /tmp/llama.cpp/build/bin/llama-quantize /usr/local/bin/
+      - name: Install backend  # runs the default make target in the backend directory
+        run: |
+          make --jobs=5 --output-sync=target -C backend/python/llama-cpp-quantization
+      - name: Test llama-cpp-quantization  # runs the `test` target in the same directory
+        run: |
+          make --jobs=5 --output-sync=target -C backend/python/llama-cpp-quantization test
  tests-acestep-cpp:
    runs-on: ubuntu-latest
    steps: