feat(quantization): add quantization backend (#9096)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-07-01 20:07:18 -04:00 · 2026-03-22 00:56:34 +01:00
parent 4b183b7bb6
commit f7e8d9e791
37 changed files with 2574 additions and 47 deletions
--- a/backend/python/llama-cpp-quantization/test.py
+++ b/backend/python/llama-cpp-quantization/test.py
@@ -0,0 +1,99 @@
+"""
+Test script for the llama-cpp-quantization gRPC backend.
+
+Downloads a small model (functiongemma-270m-it), converts it to GGUF,
+and quantizes it to q4_k_m.
+"""
+import os
+import shutil
+import subprocess
+import tempfile
+import time
+import unittest
+
+import grpc
+import backend_pb2
+import backend_pb2_grpc
+
+
+# Address passed to the backend subprocess via --addr and dialed by the test client.
+SERVER_ADDR = "localhost:50051"
+# Small model for CI testing (~540MB)
+TEST_MODEL = "unsloth/functiongemma-270m-it"
+
+
+class TestQuantizationBackend(unittest.TestCase):
+    """Tests for the llama-cpp-quantization gRPC service."""
+
+    @classmethod
+    def setUpClass(cls):
+        cls.service = subprocess.Popen(
+            ["python3", "backend.py", "--addr", SERVER_ADDR]
+        )
+        time.sleep(5)
+        cls.output_dir = tempfile.mkdtemp(prefix="quantize-test-")
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.service.kill()
+        cls.service.wait()
+        # Clean up output directory
+        if os.path.isdir(cls.output_dir):
+            shutil.rmtree(cls.output_dir, ignore_errors=True)
+
+    def _channel(self):
+        return grpc.insecure_channel(SERVER_ADDR)
+
+    def test_01_health(self):
+        """Test that the server starts and responds to health checks."""
+        with self._channel() as channel:
+            stub = backend_pb2_grpc.BackendStub(channel)
+            response = stub.Health(backend_pb2.HealthMessage())
+            self.assertEqual(response.message, b"OK")
+
+    def test_02_quantize_small_model(self):
+        """Download, convert, and quantize functiongemma-270m-it to q4_k_m."""
+        with self._channel() as channel:
+            stub = backend_pb2_grpc.BackendStub(channel)
+
+            job_id = "test-quantize-001"
+
+            # Start quantization
+            result = stub.StartQuantization(
+                backend_pb2.QuantizationRequest(
+                    model=TEST_MODEL,
+                    quantization_type="q4_k_m",
+                    output_dir=self.output_dir,
+                    job_id=job_id,
+                )
+            )
+            self.assertTrue(result.success, f"StartQuantization failed: {result.message}")
+            self.assertEqual(result.job_id, job_id)
+
+            # Stream progress until completion
+            final_status = None
+            output_file = None
+            for update in stub.QuantizationProgress(
+                backend_pb2.QuantizationProgressRequest(job_id=job_id)
+            ):
+                print(f"  [{update.status}] {update.progress_percent:.1f}% - {update.message}")
+                final_status = update.status
+                if update.output_file:
+                    output_file = update.output_file
+
+            self.assertEqual(final_status, "completed", f"Expected completed, got {final_status}")
+            self.assertIsNotNone(output_file, "No output_file in progress updates")
+            self.assertTrue(os.path.isfile(output_file), f"Output file not found: {output_file}")
+
+            # Verify the output is a valid GGUF file (starts with "GGUF" magic)
+            with open(output_file, "rb") as f:
+                magic = f.read(4)
+            self.assertEqual(magic, b"GGUF", f"Output file does not have GGUF magic: {magic!r}")
+
+            # Verify reasonable file size (q4_k_m of 270M model should be ~150-400MB)
+            size_mb = os.path.getsize(output_file) / (1024 * 1024)
+            print(f"  Output file size: {size_mb:.1f} MB")
+            self.assertGreater(size_mb, 10, "Output file suspiciously small")
+
+
+# Run the test suite when this file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()