chore(exllama): drop backend now almost deprecated (#8186)

exllama2 development has stalled and only old architectures are supported. exllamav3 is still in development; in the meantime, remove exllama2 from the gallery. Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-30 01:02:37 -05:00 · 2026-01-24 08:57:37 +01:00
parent 17783fa7d9
commit 05904c77f5
18 changed files with 2 additions and 345 deletions
--- a/backend/python/README.md
+++ b/backend/python/README.md
@@ -16,7 +16,6 @@ The Python backends use a unified build system based on `libbackend.sh` that pro
 - **transformers** - Hugging Face Transformers framework (PyTorch-based)
 - **vllm** - High-performance LLM inference engine
 - **mlx** - Apple Silicon optimized ML framework
- **exllama2** - ExLlama2 quantized models

 ### Audio & Speech
 - **bark** - Text-to-speech synthesis
--- a/backend/python/exllama2/.gitignore
+++ b/backend/python/exllama2/.gitignore
@@ -1 +0,0 @@
-source
--- a/backend/python/exllama2/Makefile
+++ b/backend/python/exllama2/Makefile
@@ -1,17 +0,0 @@
-.PHONY: exllama2
-exllama2:
-	bash install.sh
-
-.PHONY: run
-run: exllama2
-	@echo "Running exllama2..."
-	bash run.sh
-	@echo "exllama2 run."
-
-.PHONY: protogen-clean
-protogen-clean:
-	$(RM) backend_pb2_grpc.py backend_pb2.py
-
-.PHONY: clean
-clean: protogen-clean
-	$(RM) -r venv source __pycache__
--- a/backend/python/exllama2/backend.py
+++ b/backend/python/exllama2/backend.py
@@ -1,143 +0,0 @@
-#!/usr/bin/env python3
-import grpc
-from concurrent import futures
-import time
-import backend_pb2
-import backend_pb2_grpc
-import argparse
-import signal
-import sys
-import os
-import glob
-
-from pathlib import Path
-import torch
-import torch.nn.functional as F
-from torch import version as torch_version
-
-
-from exllamav2.generator import (
-    ExLlamaV2BaseGenerator,
-    ExLlamaV2Sampler
-)
-
-
-from exllamav2 import (
-    ExLlamaV2,
-    ExLlamaV2Config,
-    ExLlamaV2Cache,
-    ExLlamaV2Cache_8bit,
-    ExLlamaV2Tokenizer,
-    model_init,
-)
-
-
-_ONE_DAY_IN_SECONDS = 60 * 60 * 24
-
-# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
-MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
-
-# Implement the BackendServicer class with the service methods
-class BackendServicer(backend_pb2_grpc.BackendServicer):
-    def Health(self, request, context):
-        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
-
-    def LoadModel(self, request, context):
-        try:
-            model_directory = request.ModelFile
-
-            config = ExLlamaV2Config()
-            config.model_dir = model_directory
-            config.prepare()
-
-            model = ExLlamaV2(config)
-
-            cache = ExLlamaV2Cache(model, lazy=True)
-            model.load_autosplit(cache)
-
-            tokenizer = ExLlamaV2Tokenizer(config)
-
-            # Initialize generator
-
-            generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
-
-            self.generator = generator
-
-            generator.warmup()
-            self.model = model
-            self.tokenizer = tokenizer
-            self.cache = cache
-        except Exception as err:
-            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-        return backend_pb2.Result(message="Model loaded successfully", success=True)
-
-    def Predict(self, request, context):
-
-        penalty = 1.15
-        if request.Penalty != 0.0:
-            penalty = request.Penalty
-
-        settings = ExLlamaV2Sampler.Settings()
-        settings.temperature = request.Temperature
-        settings.top_k = request.TopK
-        settings.top_p = request.TopP
-        settings.token_repetition_penalty = penalty
-        settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
-        tokens = 512
-
-        if request.Tokens != 0:
-            tokens = request.Tokens
-        output = self.generator.generate_simple(
-            request.Prompt, settings, tokens)
-
-        # Remove prompt from response if present
-        if request.Prompt in output:
-            output = output.replace(request.Prompt, "")
-
-        return backend_pb2.Result(message=bytes(output, encoding='utf-8'))
-
-    def PredictStream(self, request, context):
-        # Implement PredictStream RPC
-        # for reply in some_data_generator():
-        #    yield reply
-        # Not implemented yet
-        return self.Predict(request, context)
-
-
-def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
-        options=[
-            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
-            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
-        ])
-    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
-    server.add_insecure_port(address)
-    server.start()
-    print("Server started. Listening on: " + address, file=sys.stderr)
-
-    # Define the signal handler function
-    def signal_handler(sig, frame):
-        print("Received termination signal. Shutting down...")
-        server.stop(0)
-        sys.exit(0)
-
-    # Set the signal handlers for SIGINT and SIGTERM
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    try:
-        while True:
-            time.sleep(_ONE_DAY_IN_SECONDS)
-    except KeyboardInterrupt:
-        server.stop(0)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run the gRPC server.")
-    parser.add_argument(
-        "--addr", default="localhost:50051", help="The address to bind the server to."
-    )
-    args = parser.parse_args()
-
-    serve(args.addr)
--- a/backend/python/exllama2/install.sh
+++ b/backend/python/exllama2/install.sh
@@ -1,21 +0,0 @@
-#!/bin/bash
-set -e
-
-LIMIT_TARGETS="cublas"
-EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"
-EXLLAMA2_VERSION=c0ddebaaaf8ffd1b3529c2bb654e650bce2f790f
-
-backend_dir=$(dirname $0)
-if [ -d $backend_dir/common ]; then
-    source $backend_dir/common/libbackend.sh
-else
-    source $backend_dir/../common/libbackend.sh
-fi
-
-installRequirements
-
-git clone https://github.com/turboderp/exllamav2 $MY_DIR/source
-pushd ${MY_DIR}/source && git checkout -b build ${EXLLAMA2_VERSION} && popd
-
-# This installs exllamav2 in JIT mode so it will compile the appropriate torch extension at runtime
-EXLLAMA_NOCOMPILE= uv pip install ${EXTRA_PIP_INSTALL_FLAGS} ${MY_DIR}/source/
--- a/backend/python/exllama2/requirements-cpu.txt
+++ b/backend/python/exllama2/requirements-cpu.txt
@@ -1,3 +0,0 @@
-transformers
-accelerate
-torch==2.4.1
--- a/backend/python/exllama2/requirements-cublas12.txt
+++ b/backend/python/exllama2/requirements-cublas12.txt
@@ -1,3 +0,0 @@
-torch==2.4.1
-transformers
-accelerate
--- a/backend/python/exllama2/requirements-install.txt
+++ b/backend/python/exllama2/requirements-install.txt
@@ -1,4 +0,0 @@
-# This is here to trigger the install script to add --no-build-isolation to the uv pip install commands
-# exllama2 does not specify it's build requirements per PEP517, so we need to provide some things ourselves
-wheel
-setuptools
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,5 +0,0 @@
-grpcio==1.76.0
-protobuf
-certifi
-wheel
-setuptools
--- a/backend/python/exllama2/run.sh
+++ b/backend/python/exllama2/run.sh
@@ -1,11 +0,0 @@
-#!/bin/bash
-LIMIT_TARGETS="cublas"
-
-backend_dir=$(dirname $0)
-if [ -d $backend_dir/common ]; then
-    source $backend_dir/common/libbackend.sh
-else
-    source $backend_dir/../common/libbackend.sh
-fi
-
-startBackend $@
--- a/backend/python/exllama2/test.sh
+++ b/backend/python/exllama2/test.sh
@@ -1,11 +0,0 @@
-#!/bin/bash
-set -e
-
-backend_dir=$(dirname $0)
-if [ -d $backend_dir/common ]; then
-    source $backend_dir/common/libbackend.sh
-else
-    source $backend_dir/../common/libbackend.sh
-fi
-
-runUnittests