chore(exllama): drop now-deprecated backend (#8186)

exllamav2 development has stalled and only older model architectures are
supported. exllamav3 is still in development, so in the meantime remove
exllama2 from the gallery.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Ettore Di Giacinto committed on 2026-01-24 08:57:37 +01:00 (committed via GitHub)
parent 17783fa7d9
commit 05904c77f5
18 changed files with 2 additions and 345 deletions

View File

@@ -313,19 +313,6 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "9"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-exllama2'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "exllama2"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "9"
@@ -1301,46 +1288,6 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2204'
# exllama2
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cpu-exllama2'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "exllama2"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'intel'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-intel-exllama2'
runs-on: 'ubuntu-latest'
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
skip-drivers: 'false'
backend: "exllama2"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
skip-drivers: 'true'
tag-latest: 'auto'
tag-suffix: '-gpu-hipblas-exllama2'
base-image: "rocm/dev-ubuntu-24.04:6.4.4"
runs-on: 'ubuntu-latest'
backend: "exllama2"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'l4t'
cuda-major-version: "12"
cuda-minor-version: "0"
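
Each removed matrix entry above corresponds to one published container variant of the exllama2 backend (CUDA 12, CPU, Intel oneAPI, and ROCm/hipblas). As a rough illustration of how such an entry drives an image build — the `build_image` helper and the exact `--build-arg` names below are assumptions, not the workflow's actual steps:

```python
# Illustrative sketch only: how a matrix entry like the removed exllama2
# blocks could map onto a docker build invocation. build_image() and the
# --build-arg names are assumptions, not the workflow's real implementation.
import subprocess

def build_image(entry: dict) -> None:
    tag = f"local-ai-backends:latest{entry['tag-suffix']}"
    subprocess.run([
        "docker", "build",
        "--platform", entry["platforms"],
        "--build-arg", f"BUILD_TYPE={entry['build-type']}",
        "--build-arg", f"BASE_IMAGE={entry['base-image']}",
        "-f", entry["dockerfile"],
        "-t", tag,
        entry["context"],
    ], check=True)

build_image({
    "build-type": "cublas",
    "platforms": "linux/amd64",
    "tag-suffix": "-gpu-nvidia-cuda-12-exllama2",
    "base-image": "ubuntu:24.04",
    "dockerfile": "./backend/Dockerfile.python",
    "context": "./",
})
```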

View File

@@ -450,7 +450,6 @@ BACKEND_TRANSFORMERS = transformers|python|.|false|true
BACKEND_FASTER_WHISPER = faster-whisper|python|.|false|true
BACKEND_COQUI = coqui|python|.|false|true
BACKEND_BARK = bark|python|.|false|true
BACKEND_EXLLAMA2 = exllama2|python|.|false|true
BACKEND_RFDETR = rfdetr|python|.|false|true
BACKEND_KITTEN_TTS = kitten-tts|python|.|false|true
BACKEND_NEUTTS = neutts|python|.|false|true
@@ -497,7 +496,6 @@ $(eval $(call generate-docker-build-target,$(BACKEND_TRANSFORMERS)))
$(eval $(call generate-docker-build-target,$(BACKEND_FASTER_WHISPER)))
$(eval $(call generate-docker-build-target,$(BACKEND_COQUI)))
$(eval $(call generate-docker-build-target,$(BACKEND_BARK)))
$(eval $(call generate-docker-build-target,$(BACKEND_EXLLAMA2)))
$(eval $(call generate-docker-build-target,$(BACKEND_RFDETR)))
$(eval $(call generate-docker-build-target,$(BACKEND_KITTEN_TTS)))
$(eval $(call generate-docker-build-target,$(BACKEND_NEUTTS)))
@@ -514,7 +512,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_QWEN_TTS)))
docker-save-%: backend-images
docker save local-ai-backend:$* -o backend-images/$*.tar
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-vibevoice docker-build-exllama2 docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts
########################################################
### END Backends
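
For reference, each `BACKEND_*` variable above packs a backend's build parameters into a pipe-separated string that `generate-docker-build-target` expands into a `docker-build-<name>` target. A hedged sketch of that format — only the name and language fields are evident from this hunk, so the remaining fields are left uninterpreted:

```python
# Sketch of the pipe-separated BACKEND_* spec format. The first two fields
# (name, language) are evident from the Makefile; the semantics of the
# remaining three are assumptions and are left uninterpreted here.
def parse_backend_spec(spec: str) -> dict:
    name, language, context, flag_a, flag_b = spec.split("|")
    return {
        "target": f"docker-build-{name}",  # the generated make target
        "language": language,              # e.g. python -> Dockerfile.python
        "context": context,
        "flags": (flag_a, flag_b),         # meaning not shown in this hunk
    }

print(parse_backend_spec("exllama2|python|.|false|true"))
# {'target': 'docker-build-exllama2', 'language': 'python', 'context': '.',
#  'flags': ('false', 'true')}
```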

View File

@@ -278,7 +278,6 @@ LocalAI supports a comprehensive range of AI backends with multiple acceleration
| **llama.cpp** | LLM inference in C/C++ | CUDA 12/13, ROCm, Intel SYCL, Vulkan, Metal, CPU |
| **vLLM** | Fast LLM inference with PagedAttention | CUDA 12/13, ROCm, Intel |
| **transformers** | HuggingFace transformers framework | CUDA 12/13, ROCm, Intel, CPU |
| **exllama2** | GPTQ inference library | CUDA 12/13 |
| **MLX** | Apple Silicon LLM inference | Metal (M1/M2/M3+) |
| **MLX-VLM** | Apple Silicon Vision-Language Models | Metal (M1/M2/M3+) |
@@ -321,7 +320,7 @@ LocalAI supports a comprehensive range of AI backends with multiple acceleration
| **NVIDIA CUDA 12** | All CUDA-compatible backends | Nvidia hardware |
| **NVIDIA CUDA 13** | All CUDA-compatible backends | Nvidia hardware |
| **AMD ROCm** | llama.cpp, whisper, vllm, transformers, diffusers, rerankers, coqui, kokoro, bark, neutts, vibevoice, pocket-tts, qwen-tts | AMD Graphics |
| **Intel oneAPI** | llama.cpp, whisper, stablediffusion, vllm, transformers, diffusers, rfdetr, rerankers, exllama2, coqui, kokoro, bark, vibevoice, pocket-tts, qwen-tts | Intel Arc, Intel iGPUs |
| **Intel oneAPI** | llama.cpp, whisper, stablediffusion, vllm, transformers, diffusers, rfdetr, rerankers, coqui, kokoro, bark, vibevoice, pocket-tts, qwen-tts | Intel Arc, Intel iGPUs |
| **Apple Metal** | llama.cpp, whisper, diffusers, MLX, MLX-VLM, bark-cpp | Apple M1/M2/M3+ |
| **Vulkan** | llama.cpp, whisper, stablediffusion | Cross-platform GPUs |
| **NVIDIA Jetson (CUDA 12)** | llama.cpp, whisper, stablediffusion, diffusers, rfdetr | ARM64 embedded AI (AGX Orin, etc.) |

View File

@@ -241,22 +241,6 @@
nvidia-cuda-12: "cuda12-diffusers"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-diffusers"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-diffusers"
- &exllama2
name: "exllama2"
urls:
- https://github.com/turboderp-org/exllamav2
tags:
- text-to-text
- LLM
- EXL2
license: MIT
description: |
ExLlamaV2 is an inference library for running local LLMs on modern consumer GPUs.
alias: "exllama2"
capabilities:
nvidia: "cuda12-exllama2"
intel: "intel-exllama2"
nvidia-cuda-12: "cuda12-exllama2"
- &faster-whisper
icon: https://avatars.githubusercontent.com/u/1520500?s=200&v=4
description: |
@@ -1251,22 +1235,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-diffusers"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-diffusers
## exllama2
- !!merge <<: *exllama2
name: "exllama2-development"
capabilities:
nvidia: "cuda12-exllama2-development"
intel: "intel-exllama2-development"
- !!merge <<: *exllama2
name: "cuda12-exllama2"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-exllama2"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-exllama2
- !!merge <<: *exllama2
name: "cuda12-exllama2-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-exllama2"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-exllama2
## kokoro
- !!merge <<: *kokoro
name: "kokoro-development"

View File

@@ -16,7 +16,6 @@ The Python backends use a unified build system based on `libbackend.sh` that pro
- **transformers** - Hugging Face Transformers framework (PyTorch-based)
- **vllm** - High-performance LLM inference engine
- **mlx** - Apple Silicon optimized ML framework
- **exllama2** - ExLlama2 quantized models
### Audio & Speech
- **bark** - Text-to-speech synthesis

View File

@@ -1 +0,0 @@
source

View File

@@ -1,17 +0,0 @@
.PHONY: exllama2
exllama2:
	bash install.sh

.PHONY: run
run: exllama2
	@echo "Running exllama2..."
	bash run.sh
	@echo "exllama2 run."

.PHONY: protogen-clean
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

.PHONY: clean
clean: protogen-clean
	$(RM) -r venv source __pycache__

View File

@@ -1,143 +0,0 @@
#!/usr/bin/env python3
import grpc
from concurrent import futures
import time
import backend_pb2
import backend_pb2_grpc
import argparse
import signal
import sys
import os
import glob
from pathlib import Path
import torch
import torch.nn.functional as F
from torch import version as torch_version
from exllamav2.generator import (
    ExLlamaV2BaseGenerator,
    ExLlamaV2Sampler
)
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Tokenizer,
    model_init,
)

_ONE_DAY_IN_SECONDS = 60 * 60 * 24

# If MAX_WORKERS is specified in the environment, use it; otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))

# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
    def Health(self, request, context):
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        try:
            model_directory = request.ModelFile

            config = ExLlamaV2Config()
            config.model_dir = model_directory
            config.prepare()

            model = ExLlamaV2(config)
            cache = ExLlamaV2Cache(model, lazy=True)
            model.load_autosplit(cache)
            tokenizer = ExLlamaV2Tokenizer(config)

            # Initialize generator
            generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)

            self.generator = generator
            generator.warmup()
            self.model = model
            self.tokenizer = tokenizer
            self.cache = cache
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def Predict(self, request, context):
        penalty = 1.15
        if request.Penalty != 0.0:
            penalty = request.Penalty

        settings = ExLlamaV2Sampler.Settings()
        settings.temperature = request.Temperature
        settings.top_k = request.TopK
        settings.top_p = request.TopP
        settings.token_repetition_penalty = penalty
        settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])

        tokens = 512
        if request.Tokens != 0:
            tokens = request.Tokens

        output = self.generator.generate_simple(
            request.Prompt, settings, tokens)

        # Remove prompt from response if present
        if request.Prompt in output:
            output = output.replace(request.Prompt, "")

        return backend_pb2.Result(message=bytes(output, encoding='utf-8'))

    def PredictStream(self, request, context):
        # Implement PredictStream RPC
        # for reply in some_data_generator():
        #     yield reply
        # Not implemented yet
        return self.Predict(request, context)


def serve(address):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
                         options=[
                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
                         ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    # Define the signal handler function
    def signal_handler(sig, frame):
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    # Set the signal handlers for SIGINT and SIGTERM
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()
    serve(args.addr)
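
The deleted `backend.py` implemented LocalAI's backend gRPC contract (`Health`, `LoadModel`, `Predict`). A hypothetical client for it might look as follows, assuming the generated `backend_pb2`/`backend_pb2_grpc` stubs; the request message names (`HealthMessage`, `ModelOptions`, `PredictOptions`) and field names mirror what the servicer reads, but are assumptions about the proto definition:

```python
# Hypothetical client for the servicer above; message names are assumptions.
import grpc
import backend_pb2
import backend_pb2_grpc

channel = grpc.insecure_channel("localhost:50051")
stub = backend_pb2_grpc.BackendStub(channel)

print(stub.Health(backend_pb2.HealthMessage()).message)  # b"OK"

result = stub.LoadModel(backend_pb2.ModelOptions(
    ModelFile="/models/WizardLM-7B-uncensored-GPTQ"))
if result.success:
    reply = stub.Predict(backend_pb2.PredictOptions(
        Prompt="How are you?",
        Temperature=0.1,
        TopK=40,
        TopP=0.95,
        Tokens=128,
    ))
    print(reply.message.decode("utf-8"))
```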

View File

@@ -1,21 +0,0 @@
#!/bin/bash
set -e

LIMIT_TARGETS="cublas"
EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"
EXLLAMA2_VERSION=c0ddebaaaf8ffd1b3529c2bb654e650bce2f790f

backend_dir=$(dirname $0)
if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

installRequirements

git clone https://github.com/turboderp/exllamav2 $MY_DIR/source
pushd ${MY_DIR}/source && git checkout -b build ${EXLLAMA2_VERSION} && popd

# This installs exllamav2 in JIT mode so it will compile the appropriate torch extension at runtime
EXLLAMA_NOCOMPILE= uv pip install ${EXTRA_PIP_INSTALL_FLAGS} ${MY_DIR}/source/
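
The final line installs exllamav2 with `EXLLAMA_NOCOMPILE=` set to an empty string, which still counts as "set" to the build, so the CUDA extension is compiled just-in-time by torch at first import rather than ahead of time. A sketch of the env-var gate a `setup.py` could use for this pattern — this is not exllamav2's actual setup code:

```python
# Sketch of the env-var gate the install relies on; not exllamav2's actual
# setup.py. An empty value still counts as "set", which is why the bare
# `EXLLAMA_NOCOMPILE=` prefix on the install command is enough.
import os
from setuptools import setup

if "EXLLAMA_NOCOMPILE" in os.environ:
    ext_modules = []  # skip AOT compilation; torch builds the extension JIT at import
else:
    from torch.utils.cpp_extension import CUDAExtension
    ext_modules = [CUDAExtension("exllamav2_ext", sources=["ext.cpp"])]  # hypothetical sources

setup(name="exllamav2", ext_modules=ext_modules)
```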

View File

@@ -1,3 +0,0 @@
transformers
accelerate
torch==2.4.1

View File

@@ -1,3 +0,0 @@
torch==2.4.1
transformers
accelerate

View File

@@ -1,4 +0,0 @@
# This is here to trigger the install script to add --no-build-isolation to the uv pip install commands
# exllama2 does not specify its build requirements per PEP 517, so we need to provide some things ourselves
wheel
setuptools

View File

@@ -1,5 +0,0 @@
grpcio==1.76.0
protobuf
certifi
wheel
setuptools

View File

@@ -1,11 +0,0 @@
#!/bin/bash
LIMIT_TARGETS="cublas"

backend_dir=$(dirname $0)
if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

startBackend $@

View File

@@ -1,11 +0,0 @@
#!/bin/bash
set -e

backend_dir=$(dirname $0)
if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

runUnittests

View File

@@ -159,12 +159,8 @@ The devices in the following list have been tested with `hipblas` images running
| bark | no | none |
| coqui | no | none |
| transformers | no | none |
| exllama | no | none |
| exllama2 | no | none |
| mamba | no | none |
| sentencetransformers | no | none |
| transformers-musicgen | no | none |
| vall-e-x | no | none |
| vllm | no | none |
**You can help by expanding this list.**

View File

@@ -539,36 +539,6 @@ options:
- [llama](https://github.com/ggerganov/llama.cpp)
### exllama/2
[Exllama](https://github.com/turboderp/exllama) is "a more memory-efficient rewrite of the HF transformers implementation of Llama for use with quantized weights". Both `exllama` and `exllama2` are supported.
#### Model setup
Download the model as a folder inside the `models` directory and create a YAML file specifying the `exllama` backend. For instance, with the `TheBloke/WizardLM-7B-uncensored-GPTQ` model:
```
$ git lfs install
$ cd models && git clone https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GPTQ
$ ls models/
.keep WizardLM-7B-uncensored-GPTQ/ exllama.yaml
$ cat models/exllama.yaml
name: exllama
parameters:
model: WizardLM-7B-uncensored-GPTQ
backend: exllama
```
Test with:
```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "exllama",
"messages": [{"role": "user", "content": "How are you?"}],
"temperature": 0.1
}'
```
### vLLM
[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference.

View File

@@ -21,7 +21,6 @@ LocalAI will attempt to automatically load models which are not explicitly confi
| [llama.cpp]({{%relref "features/text-generation#llama.cpp" %}}) | LLama, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes | GPT and Functions | yes | yes | CUDA 12/13, ROCm, Intel SYCL, Vulkan, Metal, CPU |
| [vLLM](https://github.com/vllm-project/vllm) | Various GPTs and quantization formats | yes | GPT | no | no | CUDA 12/13, ROCm, Intel |
| [transformers](https://github.com/huggingface/transformers) | Various GPTs and quantization formats | yes | GPT, embeddings, Audio generation | yes | yes* | CUDA 12/13, ROCm, Intel, CPU |
| [exllama2](https://github.com/turboderp-org/exllamav2) | GPTQ | yes | GPT only | no | no | CUDA 12/13 |
| [MLX](https://github.com/ml-explore/mlx-lm) | Various LLMs | yes | GPT | no | no | Metal (Apple Silicon) |
| [MLX-VLM](https://github.com/Blaizzy/mlx-vlm) | Vision-Language Models | yes | Multimodal GPT | no | no | Metal (Apple Silicon) |
| [langchain-huggingface](https://github.com/tmc/langchaingo) | Any text generators available on HuggingFace through API | yes | GPT | no | no | N/A |