From 05904c77f50dd8c968a7ca528873b1e83f644bb6 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 24 Jan 2026 08:57:37 +0100 Subject: [PATCH] chore(exllama): drop backend now almost deprecated (#8186) exllama2 development has stalled and only old architectures are supported. exllamav3 is still in development, meanwhile cleaning up exllama2 from the gallery. Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 53 ------- Makefile | 4 +- README.md | 3 +- backend/index.yaml | 32 ---- backend/python/README.md | 1 - backend/python/exllama2/.gitignore | 1 - backend/python/exllama2/Makefile | 17 --- backend/python/exllama2/backend.py | 143 ------------------ backend/python/exllama2/install.sh | 21 --- backend/python/exllama2/requirements-cpu.txt | 3 - .../python/exllama2/requirements-cublas12.txt | 3 - .../python/exllama2/requirements-install.txt | 4 - backend/python/exllama2/requirements.txt | 5 - backend/python/exllama2/run.sh | 11 -- backend/python/exllama2/test.sh | 11 -- docs/content/features/GPU-acceleration.md | 4 - docs/content/features/text-generation.md | 30 ---- docs/content/reference/compatibility-table.md | 1 - 18 files changed, 2 insertions(+), 345 deletions(-) delete mode 100644 backend/python/exllama2/.gitignore delete mode 100644 backend/python/exllama2/Makefile delete mode 100755 backend/python/exllama2/backend.py delete mode 100755 backend/python/exllama2/install.sh delete mode 100644 backend/python/exllama2/requirements-cpu.txt delete mode 100644 backend/python/exllama2/requirements-cublas12.txt delete mode 100644 backend/python/exllama2/requirements-install.txt delete mode 100644 backend/python/exllama2/requirements.txt delete mode 100755 backend/python/exllama2/run.sh delete mode 100755 backend/python/exllama2/test.sh diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 235b94a19..479e06e80 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -313,19 +313,6 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' - - build-type: 'cublas' - cuda-major-version: "12" - cuda-minor-version: "9" - platforms: 'linux/amd64' - tag-latest: 'auto' - tag-suffix: '-gpu-nvidia-cuda-12-exllama2' - runs-on: 'ubuntu-latest' - base-image: "ubuntu:24.04" - skip-drivers: 'false' - backend: "exllama2" - dockerfile: "./backend/Dockerfile.python" - context: "./" - ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "9" @@ -1301,46 +1288,6 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2204' - # exllama2 - - build-type: '' - cuda-major-version: "" - cuda-minor-version: "" - platforms: 'linux/amd64' - tag-latest: 'auto' - tag-suffix: '-cpu-exllama2' - runs-on: 'ubuntu-latest' - base-image: "ubuntu:24.04" - skip-drivers: 'false' - backend: "exllama2" - dockerfile: "./backend/Dockerfile.python" - context: "./" - ubuntu-version: '2404' - - build-type: 'intel' - cuda-major-version: "" - cuda-minor-version: "" - platforms: 'linux/amd64' - tag-latest: 'auto' - tag-suffix: '-gpu-intel-exllama2' - runs-on: 'ubuntu-latest' - base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" - skip-drivers: 'false' - backend: "exllama2" - dockerfile: "./backend/Dockerfile.python" - context: "./" - ubuntu-version: '2404' - - build-type: 'hipblas' - cuda-major-version: "" - cuda-minor-version: "" - platforms: 'linux/amd64' - skip-drivers: 'true' - tag-latest: 'auto' - tag-suffix: '-gpu-hipblas-exllama2' - base-image: "rocm/dev-ubuntu-24.04:6.4.4" - runs-on: 'ubuntu-latest' - backend: "exllama2" - dockerfile: "./backend/Dockerfile.python" - context: "./" - ubuntu-version: '2404' - build-type: 'l4t' cuda-major-version: "12" cuda-minor-version: "0" diff --git a/Makefile b/Makefile index 69f0e37a5..301adf72e 100644 --- a/Makefile +++ b/Makefile @@ -450,7 +450,6 @@ BACKEND_TRANSFORMERS = transformers|python|.|false|true BACKEND_FASTER_WHISPER = faster-whisper|python|.|false|true BACKEND_COQUI = coqui|python|.|false|true BACKEND_BARK = bark|python|.|false|true -BACKEND_EXLLAMA2 = exllama2|python|.|false|true BACKEND_RFDETR = rfdetr|python|.|false|true BACKEND_KITTEN_TTS = kitten-tts|python|.|false|true BACKEND_NEUTTS = neutts|python|.|false|true @@ -497,7 +496,6 @@ $(eval $(call generate-docker-build-target,$(BACKEND_TRANSFORMERS))) $(eval $(call generate-docker-build-target,$(BACKEND_FASTER_WHISPER))) $(eval $(call generate-docker-build-target,$(BACKEND_COQUI))) $(eval $(call generate-docker-build-target,$(BACKEND_BARK))) -$(eval $(call generate-docker-build-target,$(BACKEND_EXLLAMA2))) $(eval $(call generate-docker-build-target,$(BACKEND_RFDETR))) $(eval $(call generate-docker-build-target,$(BACKEND_KITTEN_TTS))) $(eval $(call generate-docker-build-target,$(BACKEND_NEUTTS))) @@ -514,7 +512,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_QWEN_TTS))) docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-vibevoice docker-build-exllama2 docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts +docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts ######################################################## ### END Backends diff --git a/README.md b/README.md index 14a23fda2..d05cdfc0c 100644 --- a/README.md +++ b/README.md @@ -278,7 +278,6 @@ LocalAI supports a comprehensive range of AI backends with multiple acceleration | **llama.cpp** | LLM inference in C/C++ | CUDA 12/13, ROCm, Intel SYCL, Vulkan, Metal, CPU | | **vLLM** | Fast LLM inference with PagedAttention | CUDA 12/13, ROCm, Intel | | **transformers** | HuggingFace transformers framework | CUDA 12/13, ROCm, Intel, CPU | -| **exllama2** | GPTQ inference library | CUDA 12/13 | | **MLX** | Apple Silicon LLM inference | Metal (M1/M2/M3+) | | **MLX-VLM** | Apple Silicon Vision-Language Models | Metal (M1/M2/M3+) | @@ -321,7 +320,7 @@ LocalAI supports a comprehensive range of AI backends with multiple acceleration | **NVIDIA CUDA 12** | All CUDA-compatible backends | Nvidia hardware | | **NVIDIA CUDA 13** | All CUDA-compatible backends | Nvidia hardware | | **AMD ROCm** | llama.cpp, whisper, vllm, transformers, diffusers, rerankers, coqui, kokoro, bark, neutts, vibevoice, pocket-tts, qwen-tts | AMD Graphics | -| **Intel oneAPI** | llama.cpp, whisper, stablediffusion, vllm, transformers, diffusers, rfdetr, rerankers, exllama2, coqui, kokoro, bark, vibevoice, pocket-tts, qwen-tts | Intel Arc, Intel iGPUs | +| **Intel oneAPI** | llama.cpp, whisper, stablediffusion, vllm, transformers, diffusers, rfdetr, rerankers, coqui, kokoro, bark, vibevoice, pocket-tts, qwen-tts | Intel Arc, Intel iGPUs | | **Apple Metal** | llama.cpp, whisper, diffusers, MLX, MLX-VLM, bark-cpp | Apple M1/M2/M3+ | | **Vulkan** | llama.cpp, whisper, stablediffusion | Cross-platform GPUs | | **NVIDIA Jetson (CUDA 12)** | llama.cpp, whisper, stablediffusion, diffusers, rfdetr | ARM64 embedded AI (AGX Orin, etc.) | diff --git a/backend/index.yaml b/backend/index.yaml index d03e8a8ac..3b9d3ee69 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -241,22 +241,6 @@ nvidia-cuda-12: "cuda12-diffusers" nvidia-l4t-cuda-12: "nvidia-l4t-arm64-diffusers" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-diffusers" -- &exllama2 - name: "exllama2" - urls: - - https://github.com/turboderp-org/exllamav2 - tags: - - text-to-text - - LLM - - EXL2 - license: MIT - description: | - ExLlamaV2 is an inference library for running local LLMs on modern consumer GPUs. - alias: "exllama2" - capabilities: - nvidia: "cuda12-exllama2" - intel: "intel-exllama2" - nvidia-cuda-12: "cuda12-exllama2" - &faster-whisper icon: https://avatars.githubusercontent.com/u/1520500?s=200&v=4 description: | @@ -1251,22 +1235,6 @@ uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-diffusers" mirrors: - localai/localai-backends:master-metal-darwin-arm64-diffusers - ## exllama2 -- !!merge <<: *exllama2 - name: "exllama2-development" - capabilities: - nvidia: "cuda12-exllama2-development" - intel: "intel-exllama2-development" -- !!merge <<: *exllama2 - name: "cuda12-exllama2" - uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-exllama2" - mirrors: - - localai/localai-backends:latest-gpu-nvidia-cuda-12-exllama2 -- !!merge <<: *exllama2 - name: "cuda12-exllama2-development" - uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-exllama2" - mirrors: - - localai/localai-backends:master-gpu-nvidia-cuda-12-exllama2 ## kokoro - !!merge <<: *kokoro name: "kokoro-development" diff --git a/backend/python/README.md b/backend/python/README.md index 9f894b77b..e140ab627 100644 --- a/backend/python/README.md +++ b/backend/python/README.md @@ -16,7 +16,6 @@ The Python backends use a unified build system based on `libbackend.sh` that pro - **transformers** - Hugging Face Transformers framework (PyTorch-based) - **vllm** - High-performance LLM inference engine - **mlx** - Apple Silicon optimized ML framework -- **exllama2** - ExLlama2 quantized models ### Audio & Speech - **bark** - Text-to-speech synthesis diff --git a/backend/python/exllama2/.gitignore b/backend/python/exllama2/.gitignore deleted file mode 100644 index 1d3a06547..000000000 --- a/backend/python/exllama2/.gitignore +++ /dev/null @@ -1 +0,0 @@ -source \ No newline at end of file diff --git a/backend/python/exllama2/Makefile b/backend/python/exllama2/Makefile deleted file mode 100644 index 15ba38d12..000000000 --- a/backend/python/exllama2/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -.PHONY: exllama2 -exllama2: - bash install.sh - -.PHONY: run -run: exllama2 - @echo "Running exllama2..." - bash run.sh - @echo "exllama2 run." - -.PHONY: protogen-clean -protogen-clean: - $(RM) backend_pb2_grpc.py backend_pb2.py - -.PHONY: clean -clean: protogen-clean - $(RM) -r venv source __pycache__ \ No newline at end of file diff --git a/backend/python/exllama2/backend.py b/backend/python/exllama2/backend.py deleted file mode 100755 index 7aacea360..000000000 --- a/backend/python/exllama2/backend.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env python3 -import grpc -from concurrent import futures -import time -import backend_pb2 -import backend_pb2_grpc -import argparse -import signal -import sys -import os -import glob - -from pathlib import Path -import torch -import torch.nn.functional as F -from torch import version as torch_version - - -from exllamav2.generator import ( - ExLlamaV2BaseGenerator, - ExLlamaV2Sampler -) - - -from exllamav2 import ( - ExLlamaV2, - ExLlamaV2Config, - ExLlamaV2Cache, - ExLlamaV2Cache_8bit, - ExLlamaV2Tokenizer, - model_init, -) - - -_ONE_DAY_IN_SECONDS = 60 * 60 * 24 - -# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 -MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) - -# Implement the BackendServicer class with the service methods -class BackendServicer(backend_pb2_grpc.BackendServicer): - def Health(self, request, context): - return backend_pb2.Reply(message=bytes("OK", 'utf-8')) - - def LoadModel(self, request, context): - try: - model_directory = request.ModelFile - - config = ExLlamaV2Config() - config.model_dir = model_directory - config.prepare() - - model = ExLlamaV2(config) - - cache = ExLlamaV2Cache(model, lazy=True) - model.load_autosplit(cache) - - tokenizer = ExLlamaV2Tokenizer(config) - - # Initialize generator - - generator = ExLlamaV2BaseGenerator(model, cache, tokenizer) - - self.generator = generator - - generator.warmup() - self.model = model - self.tokenizer = tokenizer - self.cache = cache - except Exception as err: - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") - return backend_pb2.Result(message="Model loaded successfully", success=True) - - def Predict(self, request, context): - - penalty = 1.15 - if request.Penalty != 0.0: - penalty = request.Penalty - - settings = ExLlamaV2Sampler.Settings() - settings.temperature = request.Temperature - settings.top_k = request.TopK - settings.top_p = request.TopP - settings.token_repetition_penalty = penalty - settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id]) - tokens = 512 - - if request.Tokens != 0: - tokens = request.Tokens - output = self.generator.generate_simple( - request.Prompt, settings, tokens) - - # Remove prompt from response if present - if request.Prompt in output: - output = output.replace(request.Prompt, "") - - return backend_pb2.Result(message=bytes(output, encoding='utf-8')) - - def PredictStream(self, request, context): - # Implement PredictStream RPC - # for reply in some_data_generator(): - # yield reply - # Not implemented yet - return self.Predict(request, context) - - -def serve(address): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), - options=[ - ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB - ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB - ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB - ]) - backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) - server.add_insecure_port(address) - server.start() - print("Server started. Listening on: " + address, file=sys.stderr) - - # Define the signal handler function - def signal_handler(sig, frame): - print("Received termination signal. Shutting down...") - server.stop(0) - sys.exit(0) - - # Set the signal handlers for SIGINT and SIGTERM - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - try: - while True: - time.sleep(_ONE_DAY_IN_SECONDS) - except KeyboardInterrupt: - server.stop(0) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run the gRPC server.") - parser.add_argument( - "--addr", default="localhost:50051", help="The address to bind the server to." - ) - args = parser.parse_args() - - serve(args.addr) diff --git a/backend/python/exllama2/install.sh b/backend/python/exllama2/install.sh deleted file mode 100755 index 6cbc28a17..000000000 --- a/backend/python/exllama2/install.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -set -e - -LIMIT_TARGETS="cublas" -EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation" -EXLLAMA2_VERSION=c0ddebaaaf8ffd1b3529c2bb654e650bce2f790f - -backend_dir=$(dirname $0) -if [ -d $backend_dir/common ]; then - source $backend_dir/common/libbackend.sh -else - source $backend_dir/../common/libbackend.sh -fi - -installRequirements - -git clone https://github.com/turboderp/exllamav2 $MY_DIR/source -pushd ${MY_DIR}/source && git checkout -b build ${EXLLAMA2_VERSION} && popd - -# This installs exllamav2 in JIT mode so it will compile the appropriate torch extension at runtime -EXLLAMA_NOCOMPILE= uv pip install ${EXTRA_PIP_INSTALL_FLAGS} ${MY_DIR}/source/ diff --git a/backend/python/exllama2/requirements-cpu.txt b/backend/python/exllama2/requirements-cpu.txt deleted file mode 100644 index 2021fc201..000000000 --- a/backend/python/exllama2/requirements-cpu.txt +++ /dev/null @@ -1,3 +0,0 @@ -transformers -accelerate -torch==2.4.1 \ No newline at end of file diff --git a/backend/python/exllama2/requirements-cublas12.txt b/backend/python/exllama2/requirements-cublas12.txt deleted file mode 100644 index 93e62c5ab..000000000 --- a/backend/python/exllama2/requirements-cublas12.txt +++ /dev/null @@ -1,3 +0,0 @@ -torch==2.4.1 -transformers -accelerate \ No newline at end of file diff --git a/backend/python/exllama2/requirements-install.txt b/backend/python/exllama2/requirements-install.txt deleted file mode 100644 index 322799ff6..000000000 --- a/backend/python/exllama2/requirements-install.txt +++ /dev/null @@ -1,4 +0,0 @@ -# This is here to trigger the install script to add --no-build-isolation to the uv pip install commands -# exllama2 does not specify it's build requirements per PEP517, so we need to provide some things ourselves -wheel -setuptools \ No newline at end of file diff --git a/backend/python/exllama2/requirements.txt b/backend/python/exllama2/requirements.txt deleted file mode 100644 index 3044ff0e2..000000000 --- a/backend/python/exllama2/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -grpcio==1.76.0 -protobuf -certifi -wheel -setuptools \ No newline at end of file diff --git a/backend/python/exllama2/run.sh b/backend/python/exllama2/run.sh deleted file mode 100755 index 91c79aade..000000000 --- a/backend/python/exllama2/run.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -LIMIT_TARGETS="cublas" - -backend_dir=$(dirname $0) -if [ -d $backend_dir/common ]; then - source $backend_dir/common/libbackend.sh -else - source $backend_dir/../common/libbackend.sh -fi - -startBackend $@ \ No newline at end of file diff --git a/backend/python/exllama2/test.sh b/backend/python/exllama2/test.sh deleted file mode 100755 index eb59f2aaf..000000000 --- a/backend/python/exllama2/test.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -set -e - -backend_dir=$(dirname $0) -if [ -d $backend_dir/common ]; then - source $backend_dir/common/libbackend.sh -else - source $backend_dir/../common/libbackend.sh -fi - -runUnittests diff --git a/docs/content/features/GPU-acceleration.md b/docs/content/features/GPU-acceleration.md index 2f10054d3..fe93be299 100644 --- a/docs/content/features/GPU-acceleration.md +++ b/docs/content/features/GPU-acceleration.md @@ -159,12 +159,8 @@ The devices in the following list have been tested with `hipblas` images running | bark | no | none | | coqui | no | none | | transformers | no | none | -| exllama | no | none | -| exllama2 | no | none | -| mamba | no | none | | sentencetransformers | no | none | | transformers-musicgen | no | none | -| vall-e-x | no | none | | vllm | no | none | **You can help by expanding this list.** diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md index 6b205b686..b83d01b2c 100644 --- a/docs/content/features/text-generation.md +++ b/docs/content/features/text-generation.md @@ -539,36 +539,6 @@ options: - [llama](https://github.com/ggerganov/llama.cpp) -### exllama/2 - -[Exllama](https://github.com/turboderp/exllama) is a "A more memory-efficient rewrite of the HF transformers implementation of Llama for use with quantized weights". Both `exllama` and `exllama2` are supported. - -#### Model setup - -Download the model as a folder inside the `model ` directory and create a YAML file specifying the `exllama` backend. For instance with the `TheBloke/WizardLM-7B-uncensored-GPTQ` model: - -``` -$ git lfs install -$ cd models && git clone https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GPTQ -$ ls models/ -.keep WizardLM-7B-uncensored-GPTQ/ exllama.yaml -$ cat models/exllama.yaml -name: exllama -parameters: - model: WizardLM-7B-uncensored-GPTQ -backend: exllama -``` - -Test with: - -```bash -curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ - "model": "exllama", - "messages": [{"role": "user", "content": "How are you?"}], - "temperature": 0.1 - }' -``` - ### vLLM [vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference. diff --git a/docs/content/reference/compatibility-table.md b/docs/content/reference/compatibility-table.md index 2461c95e1..c2a19ae52 100644 --- a/docs/content/reference/compatibility-table.md +++ b/docs/content/reference/compatibility-table.md @@ -21,7 +21,6 @@ LocalAI will attempt to automatically load models which are not explicitly confi | [llama.cpp]({{%relref "features/text-generation#llama.cpp" %}}) | LLama, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes | GPT and Functions | yes | yes | CUDA 12/13, ROCm, Intel SYCL, Vulkan, Metal, CPU | | [vLLM](https://github.com/vllm-project/vllm) | Various GPTs and quantization formats | yes | GPT | no | no | CUDA 12/13, ROCm, Intel | | [transformers](https://github.com/huggingface/transformers) | Various GPTs and quantization formats | yes | GPT, embeddings, Audio generation | yes | yes* | CUDA 12/13, ROCm, Intel, CPU | -| [exllama2](https://github.com/turboderp-org/exllamav2) | GPTQ | yes | GPT only | no | no | CUDA 12/13 | | [MLX](https://github.com/ml-explore/mlx-lm) | Various LLMs | yes | GPT | no | no | Metal (Apple Silicon) | | [MLX-VLM](https://github.com/Blaizzy/mlx-vlm) | Vision-Language Models | yes | Multimodal GPT | no | no | Metal (Apple Silicon) | | [langchain-huggingface](https://github.com/tmc/langchaingo) | Any text generators available on HuggingFace through API | yes | GPT | no | no | N/A |