mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-01 05:36:49 -04:00
feat(quantization): add quantization backend (#9096)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
committed by
GitHub
parent
4b183b7bb6
commit
f7e8d9e791
99
backend/python/llama-cpp-quantization/test.py
Normal file
99
backend/python/llama-cpp-quantization/test.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""
|
||||
Test script for the llama-cpp-quantization gRPC backend.
|
||||
|
||||
Downloads a small model (functiongemma-270m-it), converts it to GGUF,
|
||||
and quantizes it to q4_k_m.
|
||||
"""
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import unittest
|
||||
|
||||
import grpc
|
||||
import backend_pb2
|
||||
import backend_pb2_grpc
|
||||
|
||||
|
||||
# Address shared by both sides of the test: it is passed to the backend
# subprocess via --addr and used by the client channel to connect.
SERVER_ADDR = "localhost:50051"
# Small model for CI testing (~540MB); sent as the `model` field of the
# quantization request.
TEST_MODEL = "unsloth/functiongemma-270m-it"
|
||||
|
||||
|
||||
class TestQuantizationBackend(unittest.TestCase):
    """Tests for the llama-cpp-quantization gRPC service.

    The backend is launched once per test class as a subprocess listening on
    ``SERVER_ADDR``. Test methods carry a numeric prefix because unittest
    orders them by name, so the cheap health check runs before the
    long-running quantization job.
    """

    # Upper bound (seconds) on how long setUpClass waits for the backend to
    # start accepting connections before failing the whole class.
    STARTUP_TIMEOUT_SECONDS = 30

    @classmethod
    def setUpClass(cls):
        """Start the backend subprocess and block until it is reachable.

        Raises:
            grpc.FutureTimeoutError: if the server does not become ready
                within ``STARTUP_TIMEOUT_SECONDS``.
        """
        cls.service = None
        cls.output_dir = tempfile.mkdtemp(prefix="quantize-test-")
        try:
            cls.service = subprocess.Popen(
                ["python3", "backend.py", "--addr", SERVER_ADDR]
            )
            # Wait for actual connectivity instead of sleeping a fixed time:
            # returns as soon as the server is up, and fails loudly if it
            # never comes up.
            with grpc.insecure_channel(SERVER_ADDR) as channel:
                grpc.channel_ready_future(channel).result(
                    timeout=cls.STARTUP_TIMEOUT_SECONDS
                )
        except Exception:
            # unittest skips tearDownClass when setUpClass raises, so clean
            # up here to avoid leaking the subprocess and the temp directory.
            cls.tearDownClass()
            raise

    @classmethod
    def tearDownClass(cls):
        """Stop the backend subprocess and remove the scratch directory."""
        if cls.service is not None:
            cls.service.kill()
            # Reap the process so it does not linger as a zombie.
            cls.service.wait()
        # Clean up output directory
        if os.path.isdir(cls.output_dir):
            shutil.rmtree(cls.output_dir, ignore_errors=True)

    def _channel(self):
        """Return a new insecure gRPC channel to the backend under test."""
        return grpc.insecure_channel(SERVER_ADDR)

    def test_01_health(self):
        """Test that the server starts and responds to health checks."""
        with self._channel() as channel:
            stub = backend_pb2_grpc.BackendStub(channel)
            response = stub.Health(backend_pb2.HealthMessage())
            self.assertEqual(response.message, b"OK")

    def test_02_quantize_small_model(self):
        """Download, convert, and quantize functiongemma-270m-it to q4_k_m."""
        with self._channel() as channel:
            stub = backend_pb2_grpc.BackendStub(channel)

            job_id = "test-quantize-001"

            # Start quantization
            result = stub.StartQuantization(
                backend_pb2.QuantizationRequest(
                    model=TEST_MODEL,
                    quantization_type="q4_k_m",
                    output_dir=self.output_dir,
                    job_id=job_id,
                )
            )
            self.assertTrue(result.success, f"StartQuantization failed: {result.message}")
            self.assertEqual(result.job_id, job_id)

            # Stream progress until completion. Only the last status seen is
            # checked; the output path is taken from whichever update
            # carries a non-empty output_file.
            final_status = None
            output_file = None
            for update in stub.QuantizationProgress(
                backend_pb2.QuantizationProgressRequest(job_id=job_id)
            ):
                print(f" [{update.status}] {update.progress_percent:.1f}% - {update.message}")
                final_status = update.status
                if update.output_file:
                    output_file = update.output_file

            self.assertEqual(final_status, "completed", f"Expected completed, got {final_status}")
            self.assertIsNotNone(output_file, "No output_file in progress updates")
            self.assertTrue(os.path.isfile(output_file), f"Output file not found: {output_file}")

            # Verify the output is a valid GGUF file (starts with "GGUF" magic)
            with open(output_file, "rb") as f:
                magic = f.read(4)
            self.assertEqual(magic, b"GGUF", f"Output file does not have GGUF magic: {magic!r}")

            # Verify reasonable file size (q4_k_m of 270M model should be ~150-400MB).
            # Only a loose lower bound is asserted, to catch truncated output.
            size_mb = os.path.getsize(output_file) / (1024 * 1024)
            print(f" Output file size: {size_mb:.1f} MB")
            self.assertGreater(size_mb, 10, "Output file suspiciously small")
|
||||
|
||||
|
||||
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user