chore(ci): wire external backend for tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
feat(swagger): update swagger (#8706)
2026-05-20 22:58:34 -04:00 · 2026-03-01 21:33:20 +00:00 · 2026-03-01 21:33:19 +01:00 · 2026-03-01 21:32:38 +01:00 · 2026-03-01 16:54:01 +01:00 · 2026-03-01 16:19:38 +01:00
54 changed files with 2239 additions and 210 deletions
--- a/.github/gallery-agent/agent.go
+++ b/.github/gallery-agent/agent.go
@@ -141,7 +141,7 @@ func getRealReadme(ctx context.Context, repository string) (string, error) {
 	result = result.AddMessage("user", "Describe the model in a clear and concise way that can be shared in a model gallery.")

 	// Get a response
-	newFragment, err := llm.Ask(ctx, result)
+	_, err = llm.Ask(ctx, result)
 	if err != nil {
 		return "", err
 	}
--- a/.github/gallery-agent/tools.go
+++ b/.github/gallery-agent/tools.go
@@ -13,16 +13,16 @@ type HFReadmeTool struct {
 	client *hfapi.Client
 }

-func (s *HFReadmeTool) Execute(args map[string]any) (string, error) {
+func (s *HFReadmeTool) Execute(args map[string]any) (string, any, error) {
 	q, ok := args["repository"].(string)
 	if !ok {
-		return "", fmt.Errorf("no query")
+		return "", nil, fmt.Errorf("no query")
 	}
 	readme, err := s.client.GetReadmeContent(q, "README.md")
 	if err != nil {
-		return "", err
+		return "", nil, err
 	}
-	return readme, nil
+	return readme, nil, nil
 }

 func (s *HFReadmeTool) Tool() openai.Tool {
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -210,6 +210,19 @@ jobs:
            dockerfile: "./backend/Dockerfile.python"
            context: "./"
            ubuntu-version: '2404'
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "8"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-12-faster-qwen3-tts'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "faster-qwen3-tts"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+            ubuntu-version: '2404'
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "8"
@@ -575,6 +588,19 @@ jobs:
            dockerfile: "./backend/Dockerfile.python"
            context: "./"
            ubuntu-version: '2404'
+          - build-type: 'cublas'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-13-faster-qwen3-tts'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            backend: "faster-qwen3-tts"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+            ubuntu-version: '2404'
          - build-type: 'cublas'
            cuda-major-version: "13"
            cuda-minor-version: "0"
@@ -705,6 +731,19 @@ jobs:
            backend: "qwen-tts"
            dockerfile: "./backend/Dockerfile.python"
            context: "./"
+          - build-type: 'l4t'
+            cuda-major-version: "13"
+            cuda-minor-version: "0"
+            platforms: 'linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-nvidia-l4t-cuda-13-arm64-faster-qwen3-tts'
+            runs-on: 'ubuntu-24.04-arm'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'false'
+            ubuntu-version: '2404'
+            backend: "faster-qwen3-tts"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
          - build-type: 'l4t'
            cuda-major-version: "13"
            cuda-minor-version: "0"
@@ -1306,6 +1345,19 @@ jobs:
            dockerfile: "./backend/Dockerfile.python"
            context: "./"
            ubuntu-version: '2204'
+          - build-type: 'l4t'
+            cuda-major-version: "12"
+            cuda-minor-version: "0"
+            platforms: 'linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-nvidia-l4t-faster-qwen3-tts'
+            runs-on: 'ubuntu-24.04-arm'
+            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+            skip-drivers: 'true'
+            backend: "faster-qwen3-tts"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+            ubuntu-version: '2204'
          - build-type: 'l4t'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -1905,7 +1957,7 @@ jobs:
          - build-type: ''
            cuda-major-version: ""
            cuda-minor-version: ""
-            platforms: 'linux/amd64'
+            platforms: 'linux/amd64,linux/arm64'
            tag-latest: 'auto'
            tag-suffix: '-cpu-voxcpm'
            runs-on: 'ubuntu-latest'
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -93,30 +93,15 @@ jobs:
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
-          sudo apt-get install -y libgmock-dev clang
-          # Install UV
-          curl -LsSf https://astral.sh/uv/install.sh | sh
-          sudo apt-get install -y ca-certificates cmake patch python3-pip unzip
-          sudo apt-get install -y libopencv-dev
-
-          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
-          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
-          rm protoc.zip
-
-          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-          sudo dpkg -i cuda-keyring_1.1-1_all.deb
-          sudo apt-get update
-          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
-          export CUDACXX=/usr/local/cuda/bin/nvcc
-          make -C backend/python/transformers
-
+          sudo apt-get install curl ffmpeg
+      - name: Build backends
+        run: |
+          make backends/transformers
+          mkdir -p external && mv backends/transformers external/transformers
          make backends/huggingface backends/llama-cpp backends/local-store backends/silero-vad backends/piper backends/whisper backends/stablediffusion-ggml
-        env:
-          CUDA_VERSION: 12-4
      - name: Test
        run: |
-          PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
+          TRANSFORMER_BACKEND=$(pwd)/external/transformers/run.sh PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3.23
--- a/10
+++ b/10
@@ -1,5 +1,5 @@
 # Disable parallel execution for backend builds
-.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/voxtral
+.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/voxtral

 GOCMD=go
 GOTEST=$(GOCMD) test
@@ -149,7 +149,7 @@ test: test-models/testmodel.ggml protogen-go
 	@echo 'Running tests'
 	export GO_TAGS="debug"
 	$(MAKE) prepare-test
-	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models BACKENDS_PATH=$(abspath ./)/backends \
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models BACKENDS_PATH=$(abspath ./)/backends \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
@@ -317,6 +317,7 @@ prepare-test-extra: protogen-python
 	$(MAKE) -C backend/python/moonshine
 	$(MAKE) -C backend/python/pocket-tts
 	$(MAKE) -C backend/python/qwen-tts
+	$(MAKE) -C backend/python/faster-qwen3-tts
 	$(MAKE) -C backend/python/qwen-asr
 	$(MAKE) -C backend/python/nemo
 	$(MAKE) -C backend/python/voxcpm
@@ -334,6 +335,7 @@ test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/moonshine test
 	$(MAKE) -C backend/python/pocket-tts test
 	$(MAKE) -C backend/python/qwen-tts test
+	$(MAKE) -C backend/python/faster-qwen3-tts test
 	$(MAKE) -C backend/python/qwen-asr test
 	$(MAKE) -C backend/python/nemo test
 	$(MAKE) -C backend/python/voxcpm test
@@ -473,6 +475,7 @@ BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true
 BACKEND_MOONSHINE = moonshine|python|.|false|true
 BACKEND_POCKET_TTS = pocket-tts|python|.|false|true
 BACKEND_QWEN_TTS = qwen-tts|python|.|false|true
+BACKEND_FASTER_QWEN3_TTS = faster-qwen3-tts|python|.|false|true
 BACKEND_QWEN_ASR = qwen-asr|python|.|false|true
 BACKEND_NEMO = nemo|python|.|false|true
 BACKEND_VOXCPM = voxcpm|python|.|false|true
@@ -525,6 +528,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE)))
 $(eval $(call generate-docker-build-target,$(BACKEND_MOONSHINE)))
 $(eval $(call generate-docker-build-target,$(BACKEND_POCKET_TTS)))
 $(eval $(call generate-docker-build-target,$(BACKEND_QWEN_TTS)))
+$(eval $(call generate-docker-build-target,$(BACKEND_FASTER_QWEN3_TTS)))
 $(eval $(call generate-docker-build-target,$(BACKEND_QWEN_ASR)))
 $(eval $(call generate-docker-build-target,$(BACKEND_NEMO)))
 $(eval $(call generate-docker-build-target,$(BACKEND_VOXCPM)))
@@ -535,7 +539,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_ACE_STEP)))
 docker-save-%: backend-images
 	docker save local-ai-backend:$* -o backend-images/$*.tar

-docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-voxtral
+docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-voxtral

 ########################################################
 ### Mock Backend for E2E Tests
--- a/README.md
+++ b/README.md
@@ -334,7 +334,7 @@ Other:
 - Langchain: https://python.langchain.com/docs/integrations/providers/localai/
 - Terminal utility https://github.com/djcopley/ShellOracle
 - Local Smart assistant https://github.com/mudler/LocalAGI
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
+- Home Assistant https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-llmvision / https://github.com/loryanstrant/HA-LocalAI-Monitor
 - Discord bot https://github.com/mudler/LocalAGI/tree/main/examples/discord
 - Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
 - Shell-Pilot(Interact with LLM using LocalAI models via pure shell scripts on your Linux or MacOS system) https://github.com/reid41/shell-pilot
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=723c71064da0908c19683f8c344715fbf6d986fd
+LLAMA_VERSION?=05728db18eea59de81ee3a7699739daaf015206b
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -362,7 +362,7 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
      params.mmproj.path = request->mmproj();
    }
    //  params.model_alias ??
-    params.model_alias =  request->modelfile();
+    params.model_alias.insert(request->modelfile());
    if (!request->cachetypekey().empty()) {
        params.cache_type_k = kv_cache_type_from_str(request->cachetypekey());
    }
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=21411d81ea736ed5d9cdea4df360d3c4b60a4adb
+WHISPER_CPP_VERSION?=9453b4b9be9b73adfc35051083f37cefa039acee
 SO_TARGET?=libgowhisper.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -528,6 +528,28 @@
    nvidia-l4t-cuda-12: "nvidia-l4t-qwen-tts"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-qwen-tts"
  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png
+- &faster-qwen3-tts
+  urls:
+    - https://github.com/andimarafioti/faster-qwen3-tts
+    - https://pypi.org/project/faster-qwen3-tts/
+  description: |
+    Real-time Qwen3-TTS inference using CUDA graph capture. Voice clone only; requires NVIDIA GPU with CUDA.
+  tags:
+    - text-to-speech
+    - TTS
+    - voice-clone
+  license: apache-2.0
+  name: "faster-qwen3-tts"
+  alias: "faster-qwen3-tts"
+  capabilities:
+    nvidia: "cuda12-faster-qwen3-tts"
+    default: "cuda12-faster-qwen3-tts"
+    nvidia-cuda-13: "cuda13-faster-qwen3-tts"
+    nvidia-cuda-12: "cuda12-faster-qwen3-tts"
+    nvidia-l4t: "nvidia-l4t-faster-qwen3-tts"
+    nvidia-l4t-cuda-12: "nvidia-l4t-faster-qwen3-tts"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-faster-qwen3-tts"
+  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png
 - &qwen-asr
  urls:
    - https://github.com/QwenLM/Qwen3-ASR
@@ -2279,6 +2301,57 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-qwen-tts"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-qwen-tts
+## faster-qwen3-tts
+- !!merge <<: *faster-qwen3-tts
+  name: "faster-qwen3-tts-development"
+  capabilities:
+    nvidia: "cuda12-faster-qwen3-tts-development"
+    default: "cuda12-faster-qwen3-tts-development"
+    nvidia-cuda-13: "cuda13-faster-qwen3-tts-development"
+    nvidia-cuda-12: "cuda12-faster-qwen3-tts-development"
+    nvidia-l4t: "nvidia-l4t-faster-qwen3-tts-development"
+    nvidia-l4t-cuda-12: "nvidia-l4t-faster-qwen3-tts-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-faster-qwen3-tts-development"
+- !!merge <<: *faster-qwen3-tts
+  name: "cuda12-faster-qwen3-tts"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-faster-qwen3-tts"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-12-faster-qwen3-tts
+- !!merge <<: *faster-qwen3-tts
+  name: "cuda12-faster-qwen3-tts-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-faster-qwen3-tts"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-12-faster-qwen3-tts
+- !!merge <<: *faster-qwen3-tts
+  name: "cuda13-faster-qwen3-tts"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-faster-qwen3-tts"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-13-faster-qwen3-tts
+- !!merge <<: *faster-qwen3-tts
+  name: "cuda13-faster-qwen3-tts-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-faster-qwen3-tts"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-13-faster-qwen3-tts
+- !!merge <<: *faster-qwen3-tts
+  name: "nvidia-l4t-faster-qwen3-tts"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-faster-qwen3-tts"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-faster-qwen3-tts
+- !!merge <<: *faster-qwen3-tts
+  name: "nvidia-l4t-faster-qwen3-tts-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-faster-qwen3-tts"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-faster-qwen3-tts
+- !!merge <<: *faster-qwen3-tts
+  name: "cuda13-nvidia-l4t-arm64-faster-qwen3-tts"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-faster-qwen3-tts"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-faster-qwen3-tts
+- !!merge <<: *faster-qwen3-tts
+  name: "cuda13-nvidia-l4t-arm64-faster-qwen3-tts-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-faster-qwen3-tts"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-faster-qwen3-tts
 ## qwen-asr
 - !!merge <<: *qwen-asr
  name: "qwen-asr-development"
--- a/backend/python/faster-qwen3-tts/Makefile
+++ b/backend/python/faster-qwen3-tts/Makefile
@@ -0,0 +1,23 @@
+.PHONY: faster-qwen3-tts
+faster-qwen3-tts:
+	bash install.sh
+
+.PHONY: run
+run: faster-qwen3-tts
+	@echo "Running faster-qwen3-tts..."
+	bash run.sh
+	@echo "faster-qwen3-tts run."
+
+.PHONY: test
+test: faster-qwen3-tts
+	@echo "Testing faster-qwen3-tts..."
+	bash test.sh
+	@echo "faster-qwen3-tts tested."
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: clean
+clean: protogen-clean
+	rm -rf venv __pycache__
--- a/backend/python/faster-qwen3-tts/backend.py
+++ b/backend/python/faster-qwen3-tts/backend.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+"""
+gRPC server of LocalAI for Faster Qwen3-TTS (CUDA graph capture, voice clone only).
+"""
+from concurrent import futures
+import time
+import argparse
+import signal
+import sys
+import os
+import traceback
+import backend_pb2
+import backend_pb2_grpc
+import torch
+import soundfile as sf
+
+import grpc
+
+
+def is_float(s):
+    try:
+        float(s)
+        return True
+    except ValueError:
+        return False
+
+
+def is_int(s):
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False
+
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    def Health(self, request, context):
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+    def LoadModel(self, request, context):
+        if not torch.cuda.is_available():
+            return backend_pb2.Result(
+                success=False,
+                message="faster-qwen3-tts requires NVIDIA GPU with CUDA"
+            )
+
+        self.options = {}
+        for opt in request.Options:
+            if ":" not in opt:
+                continue
+            key, value = opt.split(":", 1)
+            if is_float(value):
+                value = float(value)
+            elif is_int(value):
+                value = int(value)
+            elif value.lower() in ["true", "false"]:
+                value = value.lower() == "true"
+            self.options[key] = value
+
+        model_path = request.Model or "Qwen/Qwen3-TTS-12Hz-0.6B-Base"
+        self.audio_path = request.AudioPath if hasattr(request, 'AudioPath') and request.AudioPath else None
+        self.model_file = request.ModelFile if hasattr(request, 'ModelFile') and request.ModelFile else None
+        self.model_path = request.ModelPath if hasattr(request, 'ModelPath') and request.ModelPath else None
+
+        from faster_qwen3_tts import FasterQwen3TTS
+        print(f"Loading model from: {model_path}", file=sys.stderr)
+        try:
+            self.model = FasterQwen3TTS.from_pretrained(model_path)
+        except Exception as e:
+            print(f"[ERROR] Loading model: {type(e).__name__}: {e}", file=sys.stderr)
+            print(traceback.format_exc(), file=sys.stderr)
+            return backend_pb2.Result(success=False, message=str(e))
+
+        print(f"Model loaded successfully: {model_path}", file=sys.stderr)
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def _get_ref_audio_path(self, request):
+        if not self.audio_path:
+            return None
+        if os.path.isabs(self.audio_path):
+            return self.audio_path
+        if self.model_file:
+            model_file_base = os.path.dirname(self.model_file)
+            ref_path = os.path.join(model_file_base, self.audio_path)
+            if os.path.exists(ref_path):
+                return ref_path
+        if self.model_path:
+            ref_path = os.path.join(self.model_path, self.audio_path)
+            if os.path.exists(ref_path):
+                return ref_path
+        return self.audio_path
+
+    def TTS(self, request, context):
+        try:
+            if not request.dst:
+                return backend_pb2.Result(
+                    success=False,
+                    message="dst (output path) is required"
+                )
+            text = request.text.strip()
+            if not text:
+                return backend_pb2.Result(
+                    success=False,
+                    message="Text is empty"
+                )
+
+            language = request.language if hasattr(request, 'language') and request.language else None
+            if not language or language == "":
+                language = "English"
+
+            ref_audio = self._get_ref_audio_path(request)
+            if not ref_audio:
+                return backend_pb2.Result(
+                    success=False,
+                    message="AudioPath is required for voice clone (set in LoadModel)"
+                )
+            ref_text = self.options.get("ref_text")
+            if not ref_text and hasattr(request, 'ref_text') and request.ref_text:
+                ref_text = request.ref_text
+            if not ref_text:
+                return backend_pb2.Result(
+                    success=False,
+                    message="ref_text is required for voice clone (set via LoadModel Options, e.g. ref_text:Your reference transcript)"
+                )
+
+            chunk_size = self.options.get("chunk_size")
+            generation_kwargs = {}
+            if chunk_size is not None:
+                generation_kwargs["chunk_size"] = int(chunk_size)
+
+            audio_list, sr = self.model.generate_voice_clone(
+                text=text,
+                language=language,
+                ref_audio=ref_audio,
+                ref_text=ref_text,
+                **generation_kwargs
+            )
+
+            if audio_list is None or (isinstance(audio_list, list) and len(audio_list) == 0):
+                return backend_pb2.Result(
+                    success=False,
+                    message="No audio output generated"
+                )
+            audio_data = audio_list[0] if isinstance(audio_list, list) else audio_list
+            sf.write(request.dst, audio_data, sr)
+            print(f"Saved output to {request.dst}", file=sys.stderr)
+
+        except Exception as err:
+            print(f"Error in TTS: {err}", file=sys.stderr)
+            print(traceback.format_exc(), file=sys.stderr)
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
+        return backend_pb2.Result(success=True)
+
+
+def serve(address):
+    server = grpc.server(
+        futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+        options=[
+            ('grpc.max_message_length', 50 * 1024 * 1024),
+            ('grpc.max_send_message_length', 50 * 1024 * 1024),
+            ('grpc.max_receive_message_length', 50 * 1024 * 1024),
+        ]
+    )
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+
+    def signal_handler(sig, frame):
+        print("Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.")
+    args = parser.parse_args()
+    serve(args.addr)
--- a/backend/python/faster-qwen3-tts/install.sh
+++ b/backend/python/faster-qwen3-tts/install.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+set -e
+
+EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"
+
+backend_dir=$(dirname $0)
+if [ -d $backend_dir/common ]; then
+    source $backend_dir/common/libbackend.sh
+else
+    source $backend_dir/../common/libbackend.sh
+fi
+
+installRequirements
--- a/backend/python/faster-qwen3-tts/requirements-cublas12.txt
+++ b/backend/python/faster-qwen3-tts/requirements-cublas12.txt
@@ -0,0 +1,4 @@
+--extra-index-url https://download.pytorch.org/whl/cu121
+torch
+torchaudio
+faster-qwen3-tts
--- a/backend/python/faster-qwen3-tts/requirements-cublas13.txt
+++ b/backend/python/faster-qwen3-tts/requirements-cublas13.txt
@@ -0,0 +1,4 @@
+--extra-index-url https://download.pytorch.org/whl/cu130
+torch
+torchaudio
+faster-qwen3-tts
--- a/backend/python/faster-qwen3-tts/requirements-install.txt
+++ b/backend/python/faster-qwen3-tts/requirements-install.txt
@@ -0,0 +1 @@
+setuptools
--- a/backend/python/faster-qwen3-tts/requirements-l4t12.txt
+++ b/backend/python/faster-qwen3-tts/requirements-l4t12.txt
@@ -0,0 +1,4 @@
+--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
+torch
+torchaudio
+faster-qwen3-tts
--- a/backend/python/faster-qwen3-tts/requirements-l4t13.txt
+++ b/backend/python/faster-qwen3-tts/requirements-l4t13.txt
@@ -0,0 +1,4 @@
+--extra-index-url https://download.pytorch.org/whl/cu130
+torch
+torchaudio
+faster-qwen3-tts
--- a/backend/python/faster-qwen3-tts/requirements.txt
+++ b/backend/python/faster-qwen3-tts/requirements.txt
@@ -0,0 +1,8 @@
+grpcio==1.71.0
+protobuf
+certifi
+packaging==24.1
+soundfile
+setuptools
+six
+sox
--- a/backend/python/faster-qwen3-tts/run.sh
+++ b/backend/python/faster-qwen3-tts/run.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+backend_dir=$(dirname $0)
+if [ -d $backend_dir/common ]; then
+    source $backend_dir/common/libbackend.sh
+else
+    source $backend_dir/../common/libbackend.sh
+fi
+
+startBackend $@
--- a/backend/python/faster-qwen3-tts/test.py
+++ b/backend/python/faster-qwen3-tts/test.py
@@ -0,0 +1,104 @@
+"""
+Tests for the faster-qwen3-tts gRPC backend.
+"""
+import unittest
+import subprocess
+import time
+import os
+import sys
+import tempfile
+import backend_pb2
+import backend_pb2_grpc
+import grpc
+
+
+class TestBackendServicer(unittest.TestCase):
+    def setUp(self):
+        self.service = subprocess.Popen(
+            ["python3", "backend.py", "--addr", "localhost:50052"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            cwd=os.path.dirname(os.path.abspath(__file__)),
+        )
+        time.sleep(15)
+
+    def tearDown(self):
+        self.service.terminate()
+        try:
+            self.service.communicate(timeout=5)
+        except subprocess.TimeoutExpired:
+            self.service.kill()
+            self.service.communicate()
+
+    def test_health(self):
+        with grpc.insecure_channel("localhost:50052") as channel:
+            stub = backend_pb2_grpc.BackendStub(channel)
+            reply = stub.Health(backend_pb2.HealthMessage(), timeout=5.0)
+        self.assertEqual(reply.message, b"OK")
+
+    def test_load_model_requires_cuda(self):
+        with grpc.insecure_channel("localhost:50052") as channel:
+            stub = backend_pb2_grpc.BackendStub(channel)
+            response = stub.LoadModel(
+                backend_pb2.ModelOptions(
+                    Model="Qwen/Qwen3-TTS-12Hz-0.6B-Base",
+                    CUDA=True,
+                ),
+                timeout=10.0,
+            )
+        self.assertFalse(response.success)
+
+    @unittest.skipUnless(
+        __import__("torch").cuda.is_available(),
+        "faster-qwen3-tts TTS requires CUDA",
+    )
+    def test_tts(self):
+        import soundfile as sf
+        try:
+            with grpc.insecure_channel("localhost:50052") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                ref_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
+                ref_audio.close()
+                try:
+                    sr = 22050
+                    duration = 1.0
+                    samples = int(sr * duration)
+                    sf.write(ref_audio.name, [0.0] * samples, sr)
+
+                    response = stub.LoadModel(
+                        backend_pb2.ModelOptions(
+                            Model="Qwen/Qwen3-TTS-12Hz-0.6B-Base",
+                            AudioPath=ref_audio.name,
+                            Options=["ref_text:Hello world"],
+                        ),
+                        timeout=600.0,
+                    )
+                    self.assertTrue(response.success, response.message)
+
+                    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as out:
+                        output_path = out.name
+                    try:
+                        tts_response = stub.TTS(
+                            backend_pb2.TTSRequest(
+                                text="Test output.",
+                                dst=output_path,
+                                language="English",
+                            ),
+                            timeout=120.0,
+                        )
+                        self.assertTrue(tts_response.success, tts_response.message)
+                        self.assertTrue(os.path.exists(output_path))
+                        self.assertGreater(os.path.getsize(output_path), 0)
+                    finally:
+                        if os.path.exists(output_path):
+                            os.unlink(output_path)
+                finally:
+                    if os.path.exists(ref_audio.name):
+                        os.unlink(ref_audio.name)
+        except Exception as err:
+            self.fail(f"TTS test failed: {err}")
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/backend/python/faster-qwen3-tts/test.sh
+++ b/backend/python/faster-qwen3-tts/test.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Sources the shared libbackend.sh helpers and then invokes runUnittests.
+set -e
+
+# Quote every expansion so a checkout path containing spaces or glob characters still works.
+backend_dir=$(dirname "$0")
+# libbackend.sh is looked up in ./common first, then in ../common, relative to this script.
+if [ -d "$backend_dir/common" ]; then
+    source "$backend_dir/common/libbackend.sh"
+else
+    source "$backend_dir/../common/libbackend.sh"
+fi
+
+runUnittests
--- a/core/gallery/backends.go
+++ b/core/gallery/backends.go
@@ -25,6 +25,39 @@ const (
 	runFile      = "run.sh"
 )

+// Default values and environment variable names for the configurable fallback URI tags
+const (
+	// Default fallback tag values
+	defaultLatestTag = "latest"
+	defaultMasterTag = "master"
+	defaultDevSuffix = "development"
+
+	// Environment variable names
+	envLatestTag = "LOCALAI_BACKEND_IMAGES_RELEASE_TAG"
+	envMasterTag = "LOCALAI_BACKEND_IMAGES_BRANCH_TAG"
+	envDevSuffix = "LOCALAI_BACKEND_DEV_SUFFIX"
+)
+
+// getFallbackTagValues returns the release tag, branch tag and development suffix,
+// reading each one from its environment variable and substituting the built-in
+// default whenever that variable is unset or empty
+func getFallbackTagValues() (latestTag, masterTag, devSuffix string) {
+	// envOr yields the value of the named environment variable, or def when it is empty
+	envOr := func(name, def string) string {
+		if v := os.Getenv(name); v != "" {
+			return v
+		}
+		return def
+	}
+
+	// Resolve the three values in the order the results are declared
+	latestTag = envOr(envLatestTag, defaultLatestTag)
+	masterTag = envOr(envMasterTag, defaultMasterTag)
+	devSuffix = envOr(envDevSuffix, defaultDevSuffix)
+
+	return latestTag, masterTag, devSuffix
+}
+
 // backendCandidate represents an installed concrete backend option for a given alias
 type backendCandidate struct {
 	name    string
@@ -139,6 +172,9 @@ func InstallBackendFromGallery(ctx context.Context, galleries []config.Gallery,
 }

 func InstallBackend(ctx context.Context, systemState *system.SystemState, modelLoader *model.ModelLoader, config *GalleryBackend, downloadStatus func(string, string, string, float64)) error {
+	// Get configurable fallback tag values from environment variables
+	latestTag, masterTag, devSuffix := getFallbackTagValues()
+
 	// Create base path if it doesn't exist
 	err := os.MkdirAll(systemState.Backend.BackendsPath, 0750)
 	if err != nil {
@@ -166,6 +202,12 @@ func InstallBackend(ctx context.Context, systemState *system.SystemState, modelL
 	} else {
 		xlog.Debug("Downloading backend", "uri", config.URI, "backendPath", backendPath)
 		if err := uri.DownloadFileWithContext(ctx, backendPath, "", 1, 1, downloadStatus); err != nil {
+			// Clean up the partially downloaded backend directory on failure
+			xlog.Debug("Backend download failed, cleaning up", "backendPath", backendPath, "error", err)
+			if cleanupErr := os.RemoveAll(backendPath); cleanupErr != nil {
+				xlog.Warn("Failed to clean up backend directory", "backendPath", backendPath, "error", cleanupErr)
+			}
+
 			success := false
 			// Try to download from mirrors
 			for _, mirror := range config.Mirrors {
@@ -182,6 +224,26 @@ func InstallBackend(ctx context.Context, systemState *system.SystemState, modelL
 				}
 			}
 
+			// Last resort, attempted only when the primary URI and every mirror failed:
+			// replace latestTag + "-" with masterTag + "-" in the URI
+			fallbackURI := strings.Replace(string(config.URI), latestTag+"-", masterTag+"-", 1)
+			if !success && fallbackURI != string(config.URI) {
+				xlog.Debug("Trying fallback URI", "original", config.URI, "fallback", fallbackURI)
+				if err := downloader.URI(fallbackURI).DownloadFileWithContext(ctx, backendPath, "", 1, 1, downloadStatus); err == nil {
+					xlog.Debug("Downloaded backend using fallback URI", "uri", fallbackURI, "backendPath", backendPath)
+					success = true
+				} else if !strings.Contains(fallbackURI, "-"+devSuffix) {
+					// Try another fallback: append "-" + devSuffix to the whole image reference
+					// For example: master-gpu-nvidia-cuda-13-ace-step -> master-gpu-nvidia-cuda-13-ace-step-development
+					devFallbackURI := fallbackURI + "-" + devSuffix
+					xlog.Debug("Trying development fallback URI", "fallback", devFallbackURI)
+					if err := downloader.URI(devFallbackURI).DownloadFileWithContext(ctx, backendPath, "", 1, 1, downloadStatus); err == nil {
+						xlog.Debug("Downloaded backend using development fallback URI", "uri", devFallbackURI, "backendPath", backendPath)
+						success = true
+					}
+				}
+			}
+
 			if !success {
 				xlog.Error("Failed to download backend", "uri", config.URI, "backendPath", backendPath, "error", err)
 				return fmt.Errorf("failed to download backend %q: %v", config.URI, err)
--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@@ -916,7 +916,7 @@ parameters:

 			application, err := application.New(
 				append(commonOpts,
-					config.WithExternalBackend("transformers", os.Getenv("HUGGINGFACE_GRPC")),
+					config.WithExternalBackend("transformers", os.Getenv("TRANSFORMER_BACKEND")),
 					config.WithContext(c),
 					config.WithSystemState(systemState),
 				)...)
--- a/core/http/endpoints/anthropic/messages.go
+++ b/core/http/endpoints/anthropic/messages.go
@@ -125,13 +125,21 @@ func handleAnthropicNonStream(c echo.Context, id string, input *schema.Anthropic
 		return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err))
 	}

-	prediction, err := predFunc()
-	if err != nil {
-		xlog.Error("Anthropic prediction failed", "error", err)
-		return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("prediction failed: %v", err))
+	const maxEmptyRetries = 5
+	var prediction backend.LLMResponse
+	var result string
+	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
+		prediction, err = predFunc()
+		if err != nil {
+			xlog.Error("Anthropic prediction failed", "error", err)
+			return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("prediction failed: %v", err))
+		}
+		result = backend.Finetune(*cfg, predInput, prediction.Response)
+		if result != "" || !shouldUseFn {
+			break
+		}
+		xlog.Warn("Anthropic: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
 	}
-
-	result := backend.Finetune(*cfg, predInput, prediction.Response)
 	
 	// Check if the result contains tool calls
 	toolCalls := functions.ParseFunctionCall(result, cfg.FunctionsConfig)
--- a/core/http/endpoints/localai/import_model.go
+++ b/core/http/endpoints/localai/import_model.go
@@ -1,6 +1,7 @@
 package localai

 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -8,6 +9,7 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+	"time"

 	"github.com/google/uuid"
 	"github.com/labstack/echo/v4"
@@ -18,6 +20,7 @@ import (
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services"
 	"github.com/mudler/LocalAI/pkg/utils"
+	"github.com/mudler/LocalAI/pkg/vram"

 	"gopkg.in/yaml.v3"
 )
@@ -37,6 +40,31 @@ func ImportModelURIEndpoint(cl *config.ModelConfigLoader, appConfig *config.Appl
 			return fmt.Errorf("failed to discover model config: %w", err)
 		}

+		resp := schema.GalleryResponse{
+			StatusURL: fmt.Sprintf("%smodels/jobs/%s", httpUtils.BaseURL(c), ""),
+		}
+
+		if len(modelConfig.Files) > 0 {
+			files := make([]vram.FileInput, 0, len(modelConfig.Files))
+			for _, f := range modelConfig.Files {
+				files = append(files, vram.FileInput{URI: f.URI, Size: 0})
+			}
+			estCtx, cancel := context.WithTimeout(c.Request().Context(), 5*time.Second)
+			defer cancel()
+			opts := vram.EstimateOptions{ContextLength: 8192}
+			result, err := vram.Estimate(estCtx, files, opts, vram.DefaultCachedSizeResolver(), vram.DefaultCachedGGUFReader())
+			if err == nil {
+				if result.SizeBytes > 0 {
+					resp.EstimatedSizeBytes = result.SizeBytes
+					resp.EstimatedSizeDisplay = result.SizeDisplay
+				}
+				if result.VRAMBytes > 0 {
+					resp.EstimatedVRAMBytes = result.VRAMBytes
+					resp.EstimatedVRAMDisplay = result.VRAMDisplay
+				}
+			}
+		}
+
 		uuid, err := uuid.NewUUID()
 		if err != nil {
 			return err
@@ -63,10 +91,9 @@ func ImportModelURIEndpoint(cl *config.ModelConfigLoader, appConfig *config.Appl
 			BackendGalleries:   appConfig.BackendGalleries,
 		}

-		return c.JSON(200, schema.GalleryResponse{
-			ID:        uuid.String(),
-			StatusURL: fmt.Sprintf("%smodels/jobs/%s", httpUtils.BaseURL(c), uuid.String()),
-		})
+		resp.ID = uuid.String()
+		resp.StatusURL = fmt.Sprintf("%smodels/jobs/%s", httpUtils.BaseURL(c), uuid.String())
+		return c.JSON(200, resp)
 	}
 }

--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -270,7 +270,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 			}
 			responses <- initialMessage

-			result, err := handleQuestion(config, cl, req, ml, startupOptions, functionResults, result, prompt)
+			result, err := handleQuestion(config, functionResults, result, prompt)
 			if err != nil {
 				xlog.Error("error handling question", "error", err)
 				return err
@@ -388,6 +388,14 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		shouldUseFn := len(input.Functions) > 0 && config.ShouldUseFunctions()
 		strictMode := false

+		xlog.Debug("Tool call routing decision",
+			"shouldUseFn", shouldUseFn,
+			"len(input.Functions)", len(input.Functions),
+			"len(input.Tools)", len(input.Tools),
+			"config.ShouldUseFunctions()", config.ShouldUseFunctions(),
+			"config.FunctionToCall()", config.FunctionToCall(),
+		)
+
 		for _, f := range input.Functions {
 			if f.Strict {
 				strictMode = true
@@ -648,12 +656,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator

 			xlog.Debug("Thinking start token", "thinkingStartToken", thinkingStartToken, "template", template)

+			var emptyRetryNeeded bool
+
 			tokenCallback := func(s string, c *[]schema.Choice) {
 				// Prepend thinking token if needed, then extract reasoning from the response
 				reasoning, s := reason.ExtractReasoningWithConfig(s, thinkingStartToken, config.ReasoningConfig)

 				if !shouldUseFn {
-					// no function is called, just reply and use stop as finish reason
 					stopReason := FinishReasonStop
 					message := &schema.Message{Role: "assistant", Content: &s}
 					if reasoning != "" {
@@ -671,9 +680,15 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator

 				switch {
 				case noActionsToRun:
-					result, err := handleQuestion(config, cl, input, ml, startupOptions, results, s, predInput)
+					if s == "" && textContentToReturn == "" {
+						xlog.Warn("Backend returned empty content in tool-calling context, will retry")
+						emptyRetryNeeded = true
+						return
+					}
+					result, err := handleQuestion(config, results, s, predInput)
 					if err != nil {
 						xlog.Error("error handling question", "error", err)
+						emptyRetryNeeded = true
 						return
 					}

@@ -745,19 +760,42 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 			// Echo properly supports context cancellation via c.Request().Context()
 			// No workaround needed!

-			result, tokenUsage, err := ComputeChoices(
-				input,
-				predInput,
-				config,
-				cl,
-				startupOptions,
-				ml,
-				tokenCallback,
-				nil,
-			)
+			const maxEmptyRetries = 5
+			var result []schema.Choice
+			var tokenUsage backend.TokenUsage
+			var err error
+
+			for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
+				emptyRetryNeeded = false
+				result, tokenUsage, err = ComputeChoices(
+					input,
+					predInput,
+					config,
+					cl,
+					startupOptions,
+					ml,
+					tokenCallback,
+					nil,
+				)
+				if err != nil || !emptyRetryNeeded {
+					break
+				}
+				xlog.Warn("Retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
+			}
 			if err != nil {
 				return err
 			}
+
+			if emptyRetryNeeded {
+				xlog.Warn("All retries exhausted, backend still returning empty content")
+				stopReason := FinishReasonStop
+				empty := ""
+				result = append(result, schema.Choice{
+					FinishReason: &stopReason,
+					Index:        0,
+					Message:      &schema.Message{Role: "assistant", Content: &empty},
+				})
+			}
 			usage := schema.OpenAIUsage{
 				PromptTokens:     tokenUsage.Prompt,
 				CompletionTokens: tokenUsage.Completion,
@@ -785,7 +823,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 	}
 }

-func handleQuestion(config *config.ModelConfig, cl *config.ModelConfigLoader, input *schema.OpenAIRequest, ml *model.ModelLoader, o *config.ApplicationConfig, funcResults []functions.FuncCallResults, result, prompt string) (string, error) {
+func handleQuestion(config *config.ModelConfig, funcResults []functions.FuncCallResults, result, prompt string) (string, error) {

 	if len(funcResults) == 0 && result != "" {
 		xlog.Debug("nothing function results but we had a message from the LLM")
@@ -818,73 +856,6 @@ func handleQuestion(config *config.ModelConfig, cl *config.ModelConfigLoader, in
 	}

 	xlog.Debug("No action received from LLM, without a message, computing a reply")
-	// Otherwise ask the LLM to understand the JSON output and the context, and return a message
-	// Note: This costs (in term of CPU/GPU) another computation
-	config.Grammar = ""
-	images := []string{}
-	for _, m := range input.Messages {
-		images = append(images, m.StringImages...)
-	}
-	videos := []string{}
-	for _, m := range input.Messages {
-		videos = append(videos, m.StringVideos...)
-	}
-	audios := []string{}
-	for _, m := range input.Messages {
-		audios = append(audios, m.StringAudios...)
-	}

-	// Serialize tools and tool_choice to JSON strings
-	toolsJSON := ""
-	if len(input.Tools) > 0 {
-		toolsBytes, err := json.Marshal(input.Tools)
-		if err == nil {
-			toolsJSON = string(toolsBytes)
-		}
-	}
-	toolChoiceJSON := ""
-	if input.ToolsChoice != nil {
-		toolChoiceBytes, err := json.Marshal(input.ToolsChoice)
-		if err == nil {
-			toolChoiceJSON = string(toolChoiceBytes)
-		}
-	}
-
-	// Extract logprobs from request
-	// According to OpenAI API: logprobs is boolean, top_logprobs (0-20) controls how many top tokens per position
-	var logprobs *int
-	var topLogprobs *int
-	if input.Logprobs.IsEnabled() {
-		// If logprobs is enabled, use top_logprobs if provided, otherwise default to 1
-		if input.TopLogprobs != nil {
-			topLogprobs = input.TopLogprobs
-			// For backend compatibility, set logprobs to the top_logprobs value
-			logprobs = input.TopLogprobs
-		} else {
-			// Default to 1 if logprobs is true but top_logprobs not specified
-			val := 1
-			logprobs = &val
-			topLogprobs = &val
-		}
-	}
-
-	// Extract logit_bias from request
-	// According to OpenAI API: logit_bias is a map of token IDs (as strings) to bias values (-100 to 100)
-	var logitBias map[string]float64
-	if len(input.LogitBias) > 0 {
-		logitBias = input.LogitBias
-	}
-
-	predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, config, cl, o, nil, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias)
-	if err != nil {
-		xlog.Error("model inference failed", "error", err)
-		return "", err
-	}
-
-	prediction, err := predFunc()
-	if err != nil {
-		xlog.Error("prediction failed", "error", err)
-		return "", err
-	}
-	return backend.Finetune(*config, prompt, prediction.Response), nil
+	return "", fmt.Errorf("no action received from LLM, without a message, computing a reply")
 }
--- a/core/http/endpoints/openresponses/responses.go
+++ b/core/http/endpoints/openresponses/responses.go
@@ -800,13 +800,26 @@ func handleBackgroundNonStream(ctx context.Context, store *ResponseStore, respon
 	default:
 	}

-	prediction, err := predFunc()
-	if err != nil {
-		return nil, fmt.Errorf("prediction failed: %w", err)
+	const maxEmptyRetries = 5
+	var prediction backend.LLMResponse
+	var result string
+	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
+		prediction, err = predFunc()
+		if err != nil {
+			return nil, fmt.Errorf("prediction failed: %w", err)
+		}
+		result = backend.Finetune(*cfg, predInput, prediction.Response)
+		if result != "" || !shouldUseFn {
+			break
+		}
+		select {
+		case <-ctx.Done():
+			return nil, ctx.Err()
+		default:
+		}
+		xlog.Warn("Open Responses background: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
 	}

-	result := backend.Finetune(*cfg, predInput, prediction.Response)
-
 	// Parse tool calls if using functions (same logic as regular handler)
 	var outputItems []schema.ORItemField
 	var toolCalls []schema.ToolCall
@@ -1475,13 +1488,21 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
 		return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("model inference failed: %v", err), "")
 	}

-	prediction, err := predFunc()
-	if err != nil {
-		xlog.Error("Open Responses prediction failed", "error", err)
-		return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("prediction failed: %v", err), "")
+	const maxEmptyRetries = 5
+	var prediction backend.LLMResponse
+	var result string
+	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
+		prediction, err = predFunc()
+		if err != nil {
+			xlog.Error("Open Responses prediction failed", "error", err)
+			return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("prediction failed: %v", err), "")
+		}
+		result = backend.Finetune(*cfg, predInput, prediction.Response)
+		if result != "" || !shouldUseFn {
+			break
+		}
+		xlog.Warn("Open Responses: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
 	}
-
-	result := backend.Finetune(*cfg, predInput, prediction.Response)
 	xlog.Debug("Open Responses - Raw model result", "result", result, "shouldUseFn", shouldUseFn)

 	// Detect if thinking token is already in prompt or template
--- a/core/http/routes/ui_api.go
+++ b/core/http/routes/ui_api.go
@@ -1,14 +1,19 @@
 package routes

+import "os"
+
 import (
 	"context"
 	"fmt"
 	"math"
 	"net/http"
 	"net/url"
+	"path"
 	"sort"
 	"strconv"
 	"strings"
+	"sync"
+	"time"

 	"github.com/google/uuid"
 	"github.com/labstack/echo/v4"
@@ -20,6 +25,7 @@ import (
 	"github.com/mudler/LocalAI/core/p2p"
 	"github.com/mudler/LocalAI/core/services"
 	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/vram"
 	"github.com/mudler/LocalAI/pkg/xsysinfo"
 	"github.com/mudler/xlog"
 )
@@ -32,6 +38,25 @@ const (
 	ascSortOrder            = "asc"
 )

+// getDirectorySize sums the sizes of the non-directory entries directly inside dir; subdirectories are not descended into
+func getDirectorySize(dir string) (int64, error) {
+	var totalSize int64
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		return 0, err
+	}
+	for _, entry := range entries {
+		info, err := entry.Info()
+		if err != nil {
+			continue // best effort: skip an entry whose metadata cannot be read
+		}
+		if !info.IsDir() {
+			totalSize += info.Size()
+		}
+	}
+	return totalSize, nil
+}
+
 // RegisterUIAPIRoutes registers JSON API routes for the web UI
 func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, galleryService *services.GalleryService, opcache *services.OpCache, applicationInstance *application.Application) {

@@ -242,6 +267,22 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
 		modelsJSON := make([]map[string]interface{}, 0, len(models))
 		seenIDs := make(map[string]bool)

+		weightExts := map[string]bool{".gguf": true, ".safetensors": true, ".bin": true, ".pt": true}
+		hasWeightFiles := func(files []gallery.File) bool {
+			for _, f := range files {
+				ext := strings.ToLower(path.Ext(path.Base(f.URI)))
+				if weightExts[ext] {
+					return true
+				}
+			}
+			return false
+		}
+
+		const estimateTimeout = 3 * time.Second
+		const estimateConcurrency = 3
+		sem := make(chan struct{}, estimateConcurrency)
+		var wg sync.WaitGroup
+
 		for _, m := range models {
 			modelID := m.ID()

@@ -265,7 +306,7 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model

 			_, trustRemoteCodeExists := m.Overrides["trust_remote_code"]

-			modelsJSON = append(modelsJSON, map[string]interface{}{
+			obj := map[string]interface{}{
 				"id":              modelID,
 				"name":            m.Name,
 				"description":     m.Description,
@@ -280,9 +321,48 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
 				"isDeletion":      isDeletionOp,
 				"trustRemoteCode": trustRemoteCodeExists,
 				"additionalFiles": m.AdditionalFiles,
-			})
+			}
+
+			if hasWeightFiles(m.AdditionalFiles) {
+				files := make([]gallery.File, len(m.AdditionalFiles))
+				copy(files, m.AdditionalFiles)
+				wg.Add(1)
+				go func(files []gallery.File, out map[string]interface{}) {
+					defer wg.Done()
+					sem <- struct{}{}
+					defer func() { <-sem }()
+					inputs := make([]vram.FileInput, 0, len(files))
+					for _, f := range files {
+						ext := strings.ToLower(path.Ext(path.Base(f.URI)))
+						if weightExts[ext] {
+							inputs = append(inputs, vram.FileInput{URI: f.URI, Size: 0})
+						}
+					}
+					if len(inputs) == 0 {
+						return
+					}
+					ctx, cancel := context.WithTimeout(context.Background(), estimateTimeout)
+					defer cancel()
+					opts := vram.EstimateOptions{ContextLength: 8192}
+					result, err := vram.Estimate(ctx, inputs, opts, vram.DefaultCachedSizeResolver(), vram.DefaultCachedGGUFReader())
+					if err == nil {
+						if result.SizeBytes > 0 {
+							out["estimated_size_bytes"] = result.SizeBytes
+							out["estimated_size_display"] = result.SizeDisplay
+						}
+						if result.VRAMBytes > 0 {
+							out["estimated_vram_bytes"] = result.VRAMBytes
+							out["estimated_vram_display"] = result.VRAMDisplay
+						}
+					}
+				}(files, obj)
+			}
+
+			modelsJSON = append(modelsJSON, obj)
 		}

+		wg.Wait()
+
 		prevPage := pageNum - 1
 		nextPage := pageNum + 1
 		if prevPage < 1 {
@@ -297,6 +377,8 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
 		modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)
 		installedModelsCount := len(modelConfigs) + len(modelsWithoutConfig)

+		ramInfo, _ := xsysinfo.GetSystemRAMInfo()
+
 		return c.JSON(200, map[string]interface{}{
 			"models":           modelsJSON,
 			"repositories":     appConfig.Galleries,
@@ -305,6 +387,9 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
 			"taskTypes":        taskTypes,
 			"availableModels":  totalModels,
 			"installedModels":  installedModelsCount,
+			"ramTotal":         ramInfo.Total,
+			"ramUsed":          ramInfo.Used,
+			"ramUsagePercent":  ramInfo.UsagePercent,
 			"currentPage":      pageNum,
 			"totalPages":       totalPages,
 			"prevPage":         prevPage,
@@ -936,12 +1021,15 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
 			watchdogInterval = appConfig.WatchDogInterval.String()
 		}

+		storageSize, _ := getDirectorySize(appConfig.SystemState.Model.ModelsPath)
+
 		response := map[string]interface{}{
 			"type":                resourceInfo.Type, // "gpu" or "ram"
 			"available":           resourceInfo.Available,
 			"gpus":                resourceInfo.GPUs,
 			"ram":                 resourceInfo.RAM,
 			"aggregate":           resourceInfo.Aggregate,
+			"storage_size":        storageSize,
 			"reclaimer_enabled":   appConfig.MemoryReclaimerEnabled,
 			"reclaimer_threshold": appConfig.MemoryReclaimerThreshold,
 			"watchdog_interval":   watchdogInterval,
--- a/core/http/views/manage.html
+++ b/core/http/views/manage.html
@@ -141,6 +141,15 @@
                            </div>
                        </div>
                    </template>
+                    <!-- Models storage (disk usage) -->
+                    <template x-if="resourceData.storage_size != null">
+                        <div class="mt-3 pt-3 border-t border-[var(--color-primary-border)]/20">
+                            <div class="flex justify-between text-xs">
+                                <span class="text-[var(--color-text-secondary)]">Models storage</span>
+                                <span class="font-mono text-[var(--color-text-primary)]" x-text="formatBytes(resourceData.storage_size)"></span>
+                            </div>
+                        </div>
+                    </template>
                </div>
            </template>
        </div>
--- a/core/http/views/model-editor.html
+++ b/core/http/views/model-editor.html
@@ -59,6 +59,26 @@
        <!-- Alert Messages -->
        <div id="alertContainer" class="mb-6"></div>

+        <!-- Persistent estimate (stays visible so user can see size/VRAM even if alert is replaced) -->
+        <div x-show="!isAdvancedMode && !isEditMode && lastEstimate && ((lastEstimate.sizeDisplay && lastEstimate.sizeDisplay !== '0 B') || (lastEstimate.vramDisplay && lastEstimate.vramDisplay !== '0 B'))"
+             x-transition
+             class="mb-6 p-4 rounded-xl border border-[var(--color-primary)]/30 bg-[var(--color-primary-light)]/30">
+            <h3 class="text-sm font-semibold text-[var(--color-text-primary)] mb-2 flex items-center gap-2">
+                <i class="fas fa-memory text-[var(--color-primary)]"></i>
+                Estimated requirements
+            </h3>
+            <div class="flex flex-wrap gap-4 text-sm text-[var(--color-text-secondary)]">
+                <span x-show="lastEstimate && lastEstimate.sizeDisplay && lastEstimate.sizeDisplay !== '0 B'">
+                    <i class="fas fa-download mr-1.5 text-[var(--color-primary)]"></i>
+                    Download size: <span class="font-medium text-[var(--color-text-primary)]" x-text="lastEstimate?.sizeDisplay"></span>
+                </span>
+                <span x-show="lastEstimate && lastEstimate.vramDisplay && lastEstimate.vramDisplay !== '0 B'">
+                    <i class="fas fa-microchip mr-1.5 text-[var(--color-primary)]"></i>
+                    VRAM: <span class="font-medium text-[var(--color-text-primary)]" x-text="lastEstimate?.vramDisplay"></span>
+                </span>
+            </div>
+        </div>
+
        <!-- Simple Import Mode -->
        <div x-show="!isAdvancedMode && !isEditMode" 
             x-transition:enter="transition ease-out duration-200"
@@ -731,6 +751,7 @@ function importModel() {
        jobPollInterval: null,
        yamlEditor: null,
        modelEditor: null,
+        lastEstimate: null,
        
        init() {
            // If in edit mode, always show advanced mode
@@ -854,15 +875,36 @@ function importModel() {
                }
                
                const result = await response.json();
-                
+
+                const hasSize = result.estimated_size_display && result.estimated_size_display !== '0 B';
+                const hasVram = result.estimated_vram_display && result.estimated_vram_display !== '0 B';
+                if (hasSize || hasVram) {
+                    this.lastEstimate = {
+                        sizeDisplay: result.estimated_size_display || '',
+                        vramDisplay: result.estimated_vram_display || '',
+                        sizeBytes: result.estimated_size_bytes || 0,
+                        vramBytes: result.estimated_vram_bytes || 0
+                    };
+                } else {
+                    this.lastEstimate = null;
+                }
+
+                let successMsg = 'Import started! Tracking progress...';
+                if (hasSize || hasVram) {
+                    const parts = [];
+                    if (hasSize) parts.push('Size: ' + result.estimated_size_display);
+                    if (hasVram) parts.push('VRAM: ' + result.estimated_vram_display);
+                    successMsg += ' (' + parts.join(' · ') + ')';
+                }
+
                if (result.uuid) {
                    this.currentJobId = result.uuid;
-                    this.showAlert('success', 'Import started! Tracking progress...');
+                    this.showAlert('success', successMsg);
                    this.startJobPolling();
                } else if (result.ID) {
                    // Fallback for different response format
                    this.currentJobId = result.ID;
-                    this.showAlert('success', 'Import started! Tracking progress...');
+                    this.showAlert('success', successMsg);
                    this.startJobPolling();
                } else {
                    throw new Error('No job ID returned from server');
--- a/core/http/views/models.html
+++ b/core/http/views/models.html
@@ -177,7 +177,7 @@
        </div>

        <!-- Results Section -->
-        <div id="search-results" class="transition-all duration-300">
+        <div id="search-results" class="transition-all duration-300 relative">
            <div x-show="loading && models.length === 0" class="text-center py-12">
                <svg class="animate-spin h-12 w-12 text-[var(--color-primary)] mx-auto mb-4" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
                    <circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
@@ -191,6 +191,21 @@
                <p class="text-[var(--color-text-secondary)]">No models found matching your criteria</p>
            </div>

+            <!-- Loading overlay when switching pages (we have models but loading) -->
+            <div x-show="loading && models.length > 0"
+                 x-transition:enter="transition ease-out duration-150"
+                 x-transition:enter-start="opacity-0"
+                 x-transition:enter-end="opacity-100"
+                 class="absolute inset-0 z-10 flex items-center justify-center rounded-2xl bg-[var(--color-bg-secondary)]/80 backdrop-blur-sm">
+                <div class="flex flex-col items-center gap-3">
+                    <svg class="animate-spin h-12 w-12 text-[var(--color-primary)]" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
+                        <circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
+                        <path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
+                    </svg>
+                    <p class="text-sm text-[var(--color-text-secondary)]">Loading page...</p>
+                </div>
+            </div>
+
            <!-- Table View -->
            <div x-show="models.length > 0" class="bg-[var(--color-bg-secondary)] rounded-2xl border border-[var(--color-border-subtle)] overflow-hidden shadow-xl backdrop-blur-sm">
                <div class="overflow-x-auto">
@@ -209,26 +224,7 @@
                                    </div>
                                </th>
                                <th class="px-6 py-4 text-left text-xs font-semibold text-[var(--color-text-primary)] uppercase tracking-wider">Description</th>
-                                <th @click="setSort('repository')" 
-                                    :class="sortBy === 'repository' ? 'bg-[var(--color-primary-light)]' : ''"
-                                    class="px-6 py-4 text-left text-xs font-semibold text-[var(--color-text-primary)] uppercase tracking-wider cursor-pointer hover:bg-[var(--color-bg-primary)] transition-colors">
-                                    <div class="flex items-center gap-2">
-                                        <span>Repository</span>
-                                        <i :class="sortBy === 'repository' ? (sortOrder === 'asc' ? 'fas fa-sort-up' : 'fas fa-sort-down') : 'fas fa-sort'" 
-                                           :class="sortBy === 'repository' ? 'text-[var(--color-primary)]' : 'text-[var(--color-text-secondary)]'"
-                                           class="text-xs"></i>
-                                    </div>
-                                </th>
-                                <th @click="setSort('license')" 
-                                    :class="sortBy === 'license' ? 'bg-[var(--color-primary-light)]' : ''"
-                                    class="px-6 py-4 text-left text-xs font-semibold text-[var(--color-text-primary)] uppercase tracking-wider cursor-pointer hover:bg-[var(--color-bg-primary)] transition-colors">
-                                    <div class="flex items-center gap-2">
-                                        <span>License</span>
-                                        <i :class="sortBy === 'license' ? (sortOrder === 'asc' ? 'fas fa-sort-up' : 'fas fa-sort-down') : 'fas fa-sort'" 
-                                           :class="sortBy === 'license' ? 'text-[var(--color-primary)]' : 'text-[var(--color-text-secondary)]'"
-                                           class="text-xs"></i>
-                                    </div>
-                                </th>
+                                <th class="px-6 py-4 text-left text-xs font-semibold text-[var(--color-text-primary)] uppercase tracking-wider">Size / VRAM</th>
                                <th @click="setSort('status')" 
                                    :class="sortBy === 'status' ? 'bg-[var(--color-primary-light)]' : ''"
                                    class="px-6 py-4 text-left text-xs font-semibold text-[var(--color-text-primary)] uppercase tracking-wider cursor-pointer hover:bg-[var(--color-bg-primary)] transition-colors">
@@ -275,21 +271,26 @@
                                        <div class="text-sm text-[var(--color-text-secondary)] max-w-xs truncate" x-text="model.description" :title="model.description"></div>
                                    </td>

-                                    <!-- Repository -->
+                                    <!-- Size / VRAM -->
                                    <td class="px-6 py-4">
-                                        <span class="inline-flex items-center text-xs px-2 py-1 rounded bg-[var(--color-primary-light)] text-[var(--color-text-primary)] border border-[var(--color-primary-border)]">
-                                            <i class="fa-brands fa-git-alt mr-1"></i>
-                                            <span x-text="model.gallery"></span>
-                                        </span>
-                                    </td>
-
-                                    <!-- License -->
-                                    <td class="px-6 py-4">
-                                        <span x-show="model.license" class="inline-flex items-center text-xs px-2 py-1 rounded bg-[var(--color-accent-light)] text-[var(--color-text-primary)] border border-[var(--color-accent)]/30">
-                                            <i class="fas fa-book mr-1"></i>
-                                            <span x-text="model.license"></span>
-                                        </span>
-                                        <span x-show="!model.license" class="text-xs text-[var(--color-text-secondary)]">-</span>
+                                        <div class="flex flex-col gap-0.5">
+                                            <template x-if="(model.estimated_size_display && model.estimated_size_display !== '0 B') || (model.estimated_vram_display && model.estimated_vram_display !== '0 B')">
+                                                <div class="text-xs text-[var(--color-text-secondary)]">
+                                                    <span x-show="model.estimated_size_display && model.estimated_size_display !== '0 B'" x-text="'Size: ' + model.estimated_size_display"></span>
+                                                    <span x-show="(model.estimated_size_display && model.estimated_size_display !== '0 B') && (model.estimated_vram_display && model.estimated_vram_display !== '0 B')"> · </span>
+                                                    <span x-show="model.estimated_vram_display && model.estimated_vram_display !== '0 B'" x-text="'VRAM: ' + model.estimated_vram_display"></span>
+                                                </div>
+                                            </template>
+                                            <template x-if="model.estimated_vram_bytes && totalMemory > 0">
+                                                <span :title="(model.estimated_vram_bytes <= totalMemory * 0.95 ? 'Fits your GPU' : 'May not fit your GPU')"
+                                                      class="inline-flex items-center text-xs">
+                                                    <i class="fas fa-microchip mr-1"
+                                                       :class="model.estimated_vram_bytes <= totalMemory * 0.95 ? 'text-[var(--color-success)]' : 'text-[var(--color-error)]'"></i>
+                                                    <span x-text="model.estimated_vram_bytes <= totalMemory * 0.95 ? 'Fits' : 'May not fit'"></span>
+                                                </span>
+                                            </template>
+                                            <span x-show="(!model.estimated_size_display || model.estimated_size_display === '0 B') && (!model.estimated_vram_display || model.estimated_vram_display === '0 B')" class="text-xs text-[var(--color-text-muted)]">-</span>
+                                        </div>
                                    </td>

                                    <!-- Status -->
@@ -405,6 +406,36 @@
                                </div>
                            </div>
                            <div class="text-base leading-relaxed text-[var(--color-text-secondary)] break-words max-w-full markdown-content" x-html="renderMarkdown(selectedModel?.description)"></div>
+                            <template x-if="(selectedModel?.estimated_size_display && selectedModel.estimated_size_display !== '0 B') || (selectedModel?.estimated_vram_display && selectedModel.estimated_vram_display !== '0 B')">
+                                <div class="space-y-1">
+                                    <p x-show="selectedModel?.estimated_size_display && selectedModel.estimated_size_display !== '0 B'" class="text-sm text-[var(--color-text-secondary)]">
+                                        <i class="fas fa-download mr-2 text-[var(--color-primary)]"></i>
+                                        Estimated download size: <span x-text="selectedModel?.estimated_size_display" class="font-medium text-[var(--color-text-primary)]"></span>
+                                    </p>
+                                    <p x-show="selectedModel?.estimated_vram_display && selectedModel.estimated_vram_display !== '0 B'" class="text-sm text-[var(--color-text-secondary)]">
+                                        <i class="fas fa-memory mr-2 text-[var(--color-primary)]"></i>
+                                        Estimated VRAM: <span x-text="selectedModel?.estimated_vram_display" class="font-medium text-[var(--color-text-primary)]"></span>
+                                    </p>
+                                    <p x-show="selectedModel?.estimated_vram_bytes && totalMemory > 0" class="text-sm">
+                                        <i class="fas fa-microchip mr-2"
+                                           :class="selectedModel?.estimated_vram_bytes <= totalMemory * 0.95 ? 'text-[var(--color-success)]' : 'text-[var(--color-error)]'"></i>
+                                        <span x-text="selectedModel?.estimated_vram_bytes <= totalMemory * 0.95 ? 'Fits your GPU' : 'May not fit your GPU'"
+                                              :class="selectedModel?.estimated_vram_bytes <= totalMemory * 0.95 ? 'text-[var(--color-success)]' : 'text-[var(--color-error)]'"></span>
+                                    </p>
+                                </div>
+                            </template>
+                            <template x-if="selectedModel?.gallery || selectedModel?.license">
+                                <div class="space-y-1">
+                                    <p x-show="selectedModel?.gallery" class="text-sm text-[var(--color-text-secondary)]">
+                                        <i class="fa-brands fa-git-alt mr-2 text-[var(--color-primary)]"></i>
+                                        Repository: <span x-text="selectedModel?.gallery" class="font-medium text-[var(--color-text-primary)]"></span>
+                                    </p>
+                                    <p x-show="selectedModel?.license" class="text-sm text-[var(--color-text-secondary)]">
+                                        <i class="fas fa-book mr-2 text-[var(--color-primary)]"></i>
+                                        License: <span x-text="selectedModel?.license" class="font-medium text-[var(--color-text-primary)]"></span>
+                                    </p>
+                                </div>
+                            </template>
                            <hr>
                            <template x-if="selectedModel?.urls && selectedModel.urls.length > 0">
                                <div>
@@ -605,6 +636,10 @@ function modelsGallery() {
        totalPages: 1,
        availableModels: 0,
        installedModels: 0,
+        ramTotal: 0,
+        ramUsed: 0,
+        ramUsagePercent: 0,
+        totalMemory: 0,
        selectedModel: null,
        jobProgress: {},
        notifications: [],
@@ -613,10 +648,21 @@ function modelsGallery() {

        init() {
            this.fetchModels();
+            this.fetchResources();
            // Poll for job progress every 600ms
            setInterval(() => this.pollJobs(), 600);
        },

+        async fetchResources() {
+            try {
+                const response = await fetch('/api/resources');
+                if (response.ok) {
+                    const data = await response.json();
+                    this.totalMemory = data.aggregate?.total_memory || 0;
+                }
+            } catch (e) {}
+        },
+
        addNotification(message, type = 'error') {
            const id = Date.now();
            this.notifications.push({ id, message, type });
@@ -650,6 +696,9 @@ function modelsGallery() {
                this.totalPages = data.totalPages || 1;
                this.availableModels = data.availableModels || 0;
                this.installedModels = data.installedModels || 0;
+                this.ramTotal = data.ramTotal || 0;
+                this.ramUsed = data.ramUsed || 0;
+                this.ramUsagePercent = data.ramUsagePercent || 0;
            } catch (error) {
                console.error('Error fetching models:', error);
            } finally {
@@ -826,6 +875,14 @@ function modelsGallery() {
            this.selectedModel = model;
        },

+        formatBytes(bytes) {
+            if (bytes === 0) return "0 B";
+            const k = 1024;
+            const sizes = ["B", "KB", "MB", "GB", "TB"];
+            const i = Math.floor(Math.log(bytes) / Math.log(k));
+            return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + " " + sizes[i];
+        },
+
        closeModal() {
            this.selectedModel = null;
        }
--- a/core/http/views/video.html
+++ b/core/http/views/video.html
@@ -20,7 +20,7 @@
                        <div class="flex items-center justify-between gap-2">
                            <label class="text-xs font-medium text-[var(--color-text-secondary)] uppercase tracking-wide flex-shrink-0">Model</label>
                        </div>
-                        <select x-data="{ link : '' }" x-model="link" x-init="$watch('link', value => window.location = link)" 
+                        <select x-data="{ link : '{{if .Model}}video/{{.Model}}{{end}}' }" x-model="link" x-init="$watch('link', value => window.location = link)" 
                            id="model-select"
                            class="input w-full p-1.5 text-xs"
                        >	
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -24,6 +24,11 @@ type BackendMonitorResponse struct {
 type GalleryResponse struct {
 	ID        string `json:"uuid"`
 	StatusURL string `json:"status"`
+	// Optional download-size / VRAM estimates; zero values are omitted from the JSON output.
+	EstimatedVRAMBytes    uint64 `json:"estimated_vram_bytes,omitempty"`
+	EstimatedVRAMDisplay  string `json:"estimated_vram_display,omitempty"`
+	EstimatedSizeBytes    uint64 `json:"estimated_size_bytes,omitempty"`
+	EstimatedSizeDisplay  string `json:"estimated_size_display,omitempty"`
 }

 type VideoRequest struct {
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -27,6 +27,24 @@ services:
    # or an URL pointing to a YAML configuration file, for example:
    # - https://gist.githubusercontent.com/mudler/ad601a0488b497b69ec549150d9edd18/raw/a8a8869ef1bb7e3830bf5c0bae29a0cce991ff8d/phi-2.yaml
    - phi-2
+    # For NVIDIA GPU support with CDI (recommended for NVIDIA Container Toolkit 1.14+):
+    # Uncomment the following deploy section and use driver: nvidia.com/gpu
+    # deploy:
+    #   resources:
+    #     reservations:
+    #       devices:
+    #         - driver: nvidia.com/gpu
+    #           count: all
+    #           capabilities: [gpu]
+    #
+    # For legacy NVIDIA driver (for older NVIDIA Container Toolkit):
+    # deploy:
+    #   resources:
+    #     reservations:
+    #       devices:
+    #         - driver: nvidia
+    #           count: 1
+    #           capabilities: [gpu]

 volumes:
  models:
--- a/docs/content/advanced/reverse-proxy-tls.md
+++ b/docs/content/advanced/reverse-proxy-tls.md
@@ -0,0 +1,148 @@
+---
+title: TLS Reverse Proxy Configuration
+description: Configure LocalAI behind a TLS termination reverse proxy (HAProxy, Apache, Nginx)
+weight: 100
+---
+
+# TLS Reverse Proxy Configuration
+
+When running LocalAI behind a TLS termination reverse proxy, the Web UI may fail to load static assets (CSS, JS) correctly because the application doesn't automatically detect that it's being served over HTTPS. This guide explains how to properly configure your reverse proxy to work with LocalAI.
+
+## How It Works
+
+LocalAI uses the `X-Forwarded-Proto` HTTP header to determine the protocol used by clients. When this header is set to `https`, LocalAI will generate HTTPS URLs for static assets in the Web UI.
+
+## Required Headers
+
+Your reverse proxy must forward these headers to LocalAI:
+
+| Header | Purpose |
+|--------|---------|
+| `X-Forwarded-Proto` | Set to `https` when TLS is terminated at the proxy |
+| `X-Forwarded-Host` | The original host requested by the client |
+| `X-Forwarded-Prefix` | Any path prefix if LocalAI is served under a sub-path |
+
+## HAProxy Configuration
+
+```haproxy
+frontend https-in
+    bind *:443 ssl crt /path/to/cert.pem
+    mode http
+    
+    # Set the X-Forwarded-Proto header
+    http-request set-header X-Forwarded-Proto https
+    
+    # Pass the original host
+    http-request set-header X-Forwarded-Host %[hdr(host)]
+    
+    # If serving under a sub-path, set the prefix
+    # http-request set-header X-Forwarded-Prefix /localai
+    
+    default_backend localai
+
+backend localai
+    mode http
+    server localai1 127.0.0.1:8080 check
+```
+
+## Apache Configuration
+
+```apache
+<VirtualHost *:443>
+    ServerName your-domain.com
+    SSLEngine on
+    SSLCertificateFile /path/to/cert.pem
+    SSLCertificateKeyFile /path/to/key.pem
+    
+    # Enable proxy and headers modules
+    ProxyRequests Off
+    ProxyPreserveHost On
+    
+    <Proxy *>
+        Require all granted
+    </Proxy>
+    
+    # Set the X-Forwarded-Proto header
+    RequestHeader set X-Forwarded-Proto "https"
+    
+    # Set the X-Forwarded-Host header (optional, usually automatic)
+    RequestHeader set X-Forwarded-Host "%{HTTP_HOST}s"
+    
+    # If serving under a sub-path
+    # RequestHeader set X-Forwarded-Prefix "/localai"
+    
+    ProxyPass / http://127.0.0.1:8080/
+    ProxyPassReverse / http://127.0.0.1:8080/
+</VirtualHost>
+```
+
+## Nginx Configuration
+
+```nginx
+server {
+    listen 443 ssl;
+    server_name your-domain.com;
+    
+    ssl_certificate /path/to/cert.pem;
+    ssl_certificate_key /path/to/key.pem;
+    
+    location / {
+        # Set the X-Forwarded-Proto header
+        proxy_set_header X-Forwarded-Proto $scheme;
+        # Pass the original host
+        proxy_set_header X-Forwarded-Host $host;
+        # If serving under a sub-path
+        # proxy_set_header X-Forwarded-Prefix /localai;
+    
+        # Other proxy settings
+        proxy_pass http://127.0.0.1:8080;
+        proxy_http_version 1.1;
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection "upgrade";
+        proxy_set_header Host $host;
+        proxy_cache_bypass $http_upgrade;
+    }
+}
+```
+
+## Serving Under a Sub-Path
+
+If you serve LocalAI under a sub-path (e.g., `https://your-domain.com/localai`), you need to:
+
+1. Configure your reverse proxy to set the `X-Forwarded-Prefix` header
+
+Example with Nginx:
+
+```nginx
+proxy_set_header X-Forwarded-Prefix /localai;
+```
+
+## Testing Your Configuration
+
+1. Start LocalAI: `local-ai run`
+2. Configure your reverse proxy as shown above
+3. Access the Web UI through the proxy
+4. Check the browser's developer console for any mixed content warnings or failed asset loads
+5. Verify that the HTML source contains `https://` URLs for static assets
+
+## Troubleshooting
+
+### Static Assets Not Loading
+
+- Verify the `X-Forwarded-Proto` header is being forwarded
+- Check that the header value is exactly `https` (lowercase)
+- Inspect the network tab in your browser to see which requests are failing
+
+### Mixed Content Warnings
+
+- Ensure LocalAI is generating HTTPS URLs (check the BaseURL middleware is working)
+- Verify the `X-Forwarded-Proto` header is set before LocalAI processes the request
+
+### Redirect Loops
+
+- Check that your proxy is not adding duplicate headers
+- Verify `X-Forwarded-Proto` is not being set to both `http` and `https`
+
+## Security Note
+
+When using reverse proxies, ensure your proxy only accepts connections from trusted sources and properly validates SSL certificates. Never expose LocalAI directly to the internet without TLS termination.
--- a/docs/content/features/model-gallery.md
+++ b/docs/content/features/model-gallery.md
@@ -31,6 +31,15 @@ GPT and text generation models might have a license which is not permissive for

 Navigate the WebUI interface in the "Models" section from the navbar at the top. Here you can find a list of models that can be installed, and you can install them by clicking the "Install" button.

+## VRAM and download size estimates
+
+When browsing the gallery or importing a model by URI, LocalAI can show **estimated download size** and **estimated VRAM** for models.
+
+- **Where they appear**: In the model gallery table (Size / VRAM column), in the model detail modal, and after starting an import from URI (in the success message).
+- **How they are computed**: GGUF models use file size (HTTP HEAD or local stat) and optional GGUF metadata (HTTP Range) for KV cache and overhead; other formats use Hugging Face file sizes and optional config when available. If metadata is unavailable, a size-only heuristic is used.
+- **Hardware fit indicator**: When your system reports GPU or RAM capacity, the gallery shows whether the estimated VRAM fits (green) or may not fit (red). A model is considered to fit when its estimated VRAM is at most 95% of the reported total memory.
+- Estimates are best-effort and may be missing if the server does not support HEAD/Range or the request times out.
+
 ## Add other galleries

 You can add other galleries by:
--- a/docs/content/installation/containers.md
+++ b/docs/content/installation/containers.md
@@ -139,17 +139,16 @@ podman run -ti --name local-ai -p 8080:8080 --device gpu.intel.com/all localai/l

 For a more manageable setup, especially with persistent volumes, use Docker Compose or Podman Compose:

+### Using CDI (Container Device Interface) - Recommended for NVIDIA Container Toolkit 1.14+
+
+The CDI approach is recommended for newer versions of the NVIDIA Container Toolkit (1.14 and later). It provides better compatibility and is the future-proof method:
+
 ```yaml
 version: "3.9"
 services:
  api:
-    image: localai/localai:latest-aio-cpu
-    # For GPU support, use one of:
-    # image: localai/localai:latest-aio-gpu-nvidia-cuda-13
-    # image: localai/localai:latest-aio-gpu-nvidia-cuda-12
-    # image: localai/localai:latest-aio-gpu-nvidia-cuda-11
-    # image: localai/localai:latest-aio-gpu-hipblas
-    # image: localai/localai:latest-aio-gpu-intel
+    image: localai/localai:latest-aio-gpu-nvidia-cuda-12
+    # For CUDA 13, use: localai/localai:latest-aio-gpu-nvidia-cuda-13
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
      interval: 1m
@@ -161,14 +160,15 @@ services:
      - DEBUG=false
    volumes:
      - ./models:/models:cached
-    # For NVIDIA GPUs, uncomment:
-    # deploy:
-    #   resources:
-    #     reservations:
-    #       devices:
-    #         - driver: nvidia
-    #           count: 1
-    #           capabilities: [gpu]
+    # CDI driver configuration (recommended for NVIDIA Container Toolkit 1.14+)
+    # This uses the nvidia.com/gpu resource API
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia.com/gpu
+              count: all
+              capabilities: [gpu]
 ```

 Save this as `compose.yaml` and run:
@@ -179,6 +179,37 @@ docker compose up -d
 podman-compose up -d
 ```

+### Using Legacy NVIDIA Driver - For Older NVIDIA Container Toolkit
+
+If you are using an older version of the NVIDIA Container Toolkit (before 1.14), or need backward compatibility, use the legacy approach:
+
+```yaml
+version: "3.9"
+services:
+  api:
+    image: localai/localai:latest-aio-gpu-nvidia-cuda-12
+    # For CUDA 13, use: localai/localai:latest-aio-gpu-nvidia-cuda-13
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
+      interval: 1m
+      timeout: 20m
+      retries: 5
+    ports:
+      - 8080:8080
+    environment:
+      - DEBUG=false
+    volumes:
+      - ./models:/models:cached
+    # Legacy NVIDIA driver configuration (for older NVIDIA Container Toolkit)
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+```
+
 ## Persistent Storage

 To persist models and configurations, mount a volume:
@@ -244,6 +275,35 @@ After installation:
 - For NVIDIA: Install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)
 - For AMD: Ensure devices are accessible: `ls -la /dev/kfd /dev/dri`

+### NVIDIA Container fails to start with "Auto-detected mode as 'legacy'" error
+
+If you encounter this error:
+```
+Error response from daemon: failed to create task for container: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: error running prestart hook #0: exit status 1, stdout: , stderr: Auto-detected mode as 'legacy'
+nvidia-container-cli: requirement error: invalid expression
+```
+
+This indicates a Docker/NVIDIA Container Toolkit configuration issue. The container runtime's prestart hook fails before LocalAI starts. This is **not** a LocalAI code bug.
+
+**Solutions:**
+
+1. **Use CDI mode (recommended)**: Update your docker-compose.yaml to use the CDI driver configuration:
+   ```yaml
+   deploy:
+     resources:
+       reservations:
+         devices:
+           - driver: nvidia.com/gpu
+             count: all
+             capabilities: [gpu]
+   ```
+
+2. **Upgrade NVIDIA Container Toolkit**: Ensure you have version 1.14 or later, which has better CDI support.
+
+3. **Check NVIDIA Container Toolkit configuration**: Run `nvidia-container-cli info` on the host to verify that the toolkit can see your GPUs outside of containers.
+
+4. **Verify Docker GPU access**: Test with `docker run --rm --gpus all nvidia/cuda:12.0.0-base-ubuntu22.04 nvidia-smi`
+
 ### Models not downloading

 - Check internet connection
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,166 @@
 ---
+- name: "qwen3.5-397b-a17b"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF
+  description: |
+    AI model: qwen3.5-397b-a17b
+  overrides:
+    parameters:
+      model: llama-cpp/models/Qwen3.5-397B-A17B-Q4_K_M-00001-of-00006.gguf
+    name: Qwen3.5-397B-A17B-GGUF
+    backend: llama-cpp
+    template:
+      use_tokenizer_template: true
+    known_usecases:
+      - chat
+    function:
+      grammar:
+        disable: true
+    mmproj: llama-cpp/mmproj/mmproj-F32.gguf
+    description: Imported from https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF
+    options:
+      - use_jinja:true
+  files:
+    - filename: llama-cpp/models/Qwen3.5-397B-A17B-Q4_K_M-00001-of-00006.gguf
+      sha256: 1300b09fae0f87ee8dc10f2b17e0070eaf73a3561e8664a3fa307fcad50c55e3
+      uri: https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF/resolve/main/Q4_K_M/Qwen3.5-397B-A17B-Q4_K_M-00001-of-00006.gguf
+    - filename: llama-cpp/models/Qwen3.5-397B-A17B-Q4_K_M-00002-of-00006.gguf
+      sha256: 2bc58495b9108480cd9f3ceea0c323ddcb9fceffe354e56b71d48ef01c35ef60
+      uri: https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF/resolve/main/Q4_K_M/Qwen3.5-397B-A17B-Q4_K_M-00002-of-00006.gguf
+    - filename: llama-cpp/models/Qwen3.5-397B-A17B-Q4_K_M-00003-of-00006.gguf
+      sha256: 64954cb1376d1de1778ddad0c8231f4bbd15492627caf118a685ae475d3efa81
+      uri: https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF/resolve/main/Q4_K_M/Qwen3.5-397B-A17B-Q4_K_M-00003-of-00006.gguf
+    - filename: llama-cpp/models/Qwen3.5-397B-A17B-Q4_K_M-00004-of-00006.gguf
+      sha256: 554485298f616b0ff59e1ec2982167d55bece87f682827c68a32acd0fd03425f
+      uri: https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF/resolve/main/Q4_K_M/Qwen3.5-397B-A17B-Q4_K_M-00004-of-00006.gguf
+    - filename: llama-cpp/models/Qwen3.5-397B-A17B-Q4_K_M-00005-of-00006.gguf
+      sha256: 24d6f5668ea2c6eaddde5f08ea6325b495bc66be7217bb2de0a5c8b5eace1c51
+      uri: https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF/resolve/main/Q4_K_M/Qwen3.5-397B-A17B-Q4_K_M-00005-of-00006.gguf
+    - filename: llama-cpp/models/Qwen3.5-397B-A17B-Q4_K_M-00006-of-00006.gguf
+      sha256: e36715e951da55d9e48b40aab61ba7829a7bfad5c6a155eb79aa13fe8b39347f
+      uri: https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF/resolve/main/Q4_K_M/Qwen3.5-397B-A17B-Q4_K_M-00006-of-00006.gguf
+    - filename: llama-cpp/mmproj/mmproj-F32.gguf
+      sha256: e47df150363dd9d53b4ddf01e5477a6803f7fc2d2e0341064dcf39511ad5f110
+      uri: https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF/resolve/main/mmproj-F32.gguf
+- name: "qwen3.5-27b"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/unsloth/Qwen3.5-27B-GGUF
+  description: |
+    AI model: qwen3.5-27b
+  overrides:
+    parameters:
+      model: llama-cpp/models/Qwen3.5-27B-Q4_K_M.gguf
+    name: Qwen3.5-27B-GGUF
+    backend: llama-cpp
+    template:
+      use_tokenizer_template: true
+    known_usecases:
+      - chat
+    function:
+      grammar:
+        disable: true
+    mmproj: llama-cpp/mmproj/mmproj-F32.gguf
+    description: Imported from https://huggingface.co/unsloth/Qwen3.5-27B-GGUF
+    options:
+      - use_jinja:true
+  files:
+    - filename: llama-cpp/models/Qwen3.5-27B-Q4_K_M.gguf
+      sha256: 728960e4dda52d4f2af5bee09b2cbe86addfa93220fe9324bfac9dc727605c17
+      uri: https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/Qwen3.5-27B-Q4_K_M.gguf
+    - filename: llama-cpp/mmproj/mmproj-F32.gguf
+      sha256: c4efc54971085f29eecd433a8fba3edd2890584dfa2fc978933d1dd193f174dd
+      uri: https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/mmproj-F32.gguf
+- name: "qwen3.5-122b-a10b"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/unsloth/Qwen3.5-122B-A10B-GGUF
+  description: |
+    AI model: qwen3.5-122b-a10b
+  overrides:
+    parameters:
+      model: llama-cpp/models/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf
+    name: Qwen3.5-122B-A10B-GGUF
+    backend: llama-cpp
+    template:
+      use_tokenizer_template: true
+    known_usecases:
+      - chat
+    function:
+      grammar:
+        disable: true
+    mmproj: llama-cpp/mmproj/mmproj-F32.gguf
+    description: Imported from https://huggingface.co/unsloth/Qwen3.5-122B-A10B-GGUF
+    options:
+      - use_jinja:true
+  files:
+    - filename: llama-cpp/models/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf
+      sha256: 914ac4aea369a78a16db389cd11293bd7ed4d2fe7960cdc7bc5140b21e5d8074
+      uri: https://huggingface.co/unsloth/Qwen3.5-122B-A10B-GGUF/resolve/main/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf
+    - filename: llama-cpp/models/Qwen3.5-122B-A10B-Q4_K_M-00002-of-00003.gguf
+      sha256: 073b82aaccefa6b360d4220299e488dc8810ad76d286b282c44ec374534e41d4
+      uri: https://huggingface.co/unsloth/Qwen3.5-122B-A10B-GGUF/resolve/main/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00002-of-00003.gguf
+    - filename: llama-cpp/models/Qwen3.5-122B-A10B-Q4_K_M-00003-of-00003.gguf
+      sha256: 0c9eed4a95f8fac03cb57e3fb63a49dcf400f958d86a387b98f0e9b4fbb54fd6
+      uri: https://huggingface.co/unsloth/Qwen3.5-122B-A10B-GGUF/resolve/main/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00003-of-00003.gguf
+    - filename: llama-cpp/mmproj/mmproj-F32.gguf
+      sha256: ba889ce164a6cc7ffe34296851d0f2bbe139bd27deeb7fe3830d08bd776a28a6
+      uri: https://huggingface.co/unsloth/Qwen3.5-122B-A10B-GGUF/resolve/main/mmproj-F32.gguf
+- name: "qwen3.5-35b-a3b"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF
+  description: |
+    AI model: qwen3.5-35b-a3b
+  overrides:
+    parameters:
+      model: llama-cpp/models/Qwen3.5-35B-A3B-UD-Q4_K_M.gguf
+    name: Qwen3.5-35B-A3B-GGUF
+    backend: llama-cpp
+    template:
+      use_tokenizer_template: true
+    known_usecases:
+      - chat
+    function:
+      grammar:
+        disable: true
+    mmproj: llama-cpp/mmproj/mmproj-F32.gguf
+    description: Imported from https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF
+    options:
+      - use_jinja:true
+  files:
+    - filename: llama-cpp/models/Qwen3.5-35B-A3B-UD-Q4_K_M.gguf
+      sha256: 223138866b87b12e68ffb43a1d45afb572921e9cd4c594e6a736df94c5130466
+      uri: https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/Qwen3.5-35B-A3B-UD-Q4_K_M.gguf
+    - filename: llama-cpp/mmproj/mmproj-F32.gguf
+      sha256: 40169fdbd92afb86ef298c8f535353c7cc1307e3493db4359454246bcfc92131
+      uri: https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F32.gguf
+- name: "qwen_qwen3-next-80b-a3b-thinking"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/bartowski/Qwen_Qwen3-Next-80B-A3B-Thinking-GGUF
+  description: |
+    AI model: qwen_qwen3-next-80b-a3b-thinking
+  overrides:
+    parameters:
+      model: llama-cpp/models/Qwen_Qwen3-Next-80B-A3B-Thinking-Q4_K_M.gguf
+    name: Qwen_Qwen3-Next-80B-A3B-Thinking-GGUF
+    backend: llama-cpp
+    template:
+      use_tokenizer_template: true
+    known_usecases:
+      - chat
+    function:
+      grammar:
+        disable: true
+    description: Imported from https://huggingface.co/bartowski/Qwen_Qwen3-Next-80B-A3B-Thinking-GGUF
+    options:
+      - use_jinja:true
+  files:
+    - filename: llama-cpp/models/Qwen_Qwen3-Next-80B-A3B-Thinking-Q4_K_M.gguf
+      sha256: 83481c75cc6c0837ba9afa52b59b4cd3f85f55dd7aa6c60e27230ff329c81367
+      uri: https://huggingface.co/bartowski/Qwen_Qwen3-Next-80B-A3B-Thinking-GGUF/resolve/main/Qwen_Qwen3-Next-80B-A3B-Thinking-Q4_K_M.gguf
 - &nanbeige4
  name: "nanbeige4.1-3b-q8"
  url: "github:mudler/LocalAI/gallery/nanbeige4.1.yaml@master"
--- a/pkg/downloader/uri.go
+++ b/pkg/downloader/uri.go
@@ -275,6 +275,68 @@ func (uri URI) checkSeverSupportsRangeHeader() (bool, error) {
 	return resp.Header.Get("Accept-Ranges") == "bytes", nil
 }

+// ContentLength returns the size in bytes of the resource at the URI.
+//
+// For file:// URIs it stats the resolved local path. For HTTP/HTTPS URIs it
+// issues a HEAD request and returns the advertised Content-Length. When the
+// HEAD response carries no length but advertises "Accept-Ranges: bytes", it
+// issues a one-byte Range GET and takes the total from the Content-Range
+// header; if the server ignores the Range header and answers 200 with a known
+// Content-Length, that length is used instead. Any other URI scheme, an HTTP
+// status >= 400, or a missing/unparsable total yields an error. Both requests
+// are bound to ctx.
+func (uri URI) ContentLength(ctx context.Context) (int64, error) {
+	urlStr := uri.ResolveURL()
+	if strings.HasPrefix(string(uri), LocalPrefix) {
+		info, err := os.Stat(urlStr)
+		if err != nil {
+			return 0, err
+		}
+		return info.Size(), nil
+	}
+	if !uri.LooksLikeHTTPURL() {
+		return 0, fmt.Errorf("unsupported URI scheme for ContentLength: %s", string(uri))
+	}
+
+	// First attempt: a HEAD request, which is enough when the server reports a length.
+	req, err := http.NewRequestWithContext(ctx, http.MethodHead, urlStr, nil)
+	if err != nil {
+		return 0, err
+	}
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return 0, err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode >= 400 {
+		return 0, fmt.Errorf("HEAD %s: status %d", urlStr, resp.StatusCode)
+	}
+	if resp.ContentLength >= 0 {
+		return resp.ContentLength, nil
+	}
+	if resp.Header.Get("Accept-Ranges") != "bytes" {
+		return 0, fmt.Errorf("HEAD %s: no Content-Length and server does not support Range", urlStr)
+	}
+
+	// Second attempt: ask for the first byte only and read the total from Content-Range.
+	req2, err := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, nil)
+	if err != nil {
+		return 0, err
+	}
+	req2.Header.Set("Range", "bytes=0-0")
+	resp2, err := http.DefaultClient.Do(req2)
+	if err != nil {
+		return 0, err
+	}
+	defer resp2.Body.Close()
+	if resp2.StatusCode != http.StatusPartialContent && resp2.StatusCode != http.StatusOK {
+		return 0, fmt.Errorf("Range request %s: status %d", urlStr, resp2.StatusCode)
+	}
+	cr := resp2.Header.Get("Content-Range")
+	// Content-Range: bytes 0-0/12345
+	if cr == "" {
+		// A 200 without Content-Range means the Range header was ignored and the
+		// whole resource is being sent, so its Content-Length is the total size.
+		if resp2.StatusCode == http.StatusOK && resp2.ContentLength >= 0 {
+			return resp2.ContentLength, nil
+		}
+		return 0, fmt.Errorf("Range request %s: no Content-Range header", urlStr)
+	}
+	_, total, found := strings.Cut(cr, "/")
+	if !found || strings.Contains(total, "/") {
+		return 0, fmt.Errorf("invalid Content-Range: %s", cr)
+	}
+	// An unknown total ("*") fails the integer parse and is reported as invalid.
+	size, err := strconv.ParseInt(strings.TrimSpace(total), 10, 64)
+	if err != nil || size < 0 {
+		return 0, fmt.Errorf("invalid Content-Range total length: %s", total)
+	}
+	return size, nil
+}
+
 func (uri URI) DownloadFile(filePath, sha string, fileN, total int, downloadStatus func(string, string, string, float64)) error {
 	return uri.DownloadFileWithContext(context.Background(), filePath, sha, fileN, total, downloadStatus)
 }
--- a/pkg/downloader/uri_test.go
+++ b/pkg/downloader/uri_test.go
@@ -1,12 +1,15 @@
 package downloader_test

 import (
+	"context"
 	"crypto/rand"
 	"crypto/sha256"
+	"errors"
 	"fmt"
 	"net/http"
 	"net/http/httptest"
 	"os"
+	"path/filepath"
 	"regexp"
 	"strconv"

@@ -48,6 +51,86 @@ var _ = Describe("Gallery API tests", func() {
 	})
 })

+// Specs for URI.ContentLength covering local files and the HTTP HEAD/Range paths.
+var _ = Describe("ContentLength", func() {
+	Context("local file", func() {
+		It("returns file size for existing file", func() {
+			dir, err := os.MkdirTemp("", "contentlength-*")
+			Expect(err).ToNot(HaveOccurred())
+			defer os.RemoveAll(dir)
+			fpath := filepath.Join(dir, "model.gguf")
+			err = os.WriteFile(fpath, make([]byte, 1234), 0644)
+			Expect(err).ToNot(HaveOccurred())
+			uri := URI("file://" + fpath)
+			ctx := context.Background()
+			size, err := uri.ContentLength(ctx)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(size).To(Equal(int64(1234)))
+		})
+		It("returns error for missing file", func() {
+			uri := URI("file:///nonexistent/path/model.gguf")
+			ctx := context.Background()
+			_, err := uri.ContentLength(ctx)
+			Expect(err).To(HaveOccurred())
+		})
+	})
+	Context("HTTP", func() {
+		It("returns Content-Length when present", func() {
+			server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				// The handler runs on a server goroutine; without GinkgoRecover a
+				// failed assertion here would panic instead of failing the spec.
+				defer GinkgoRecover()
+				Expect(r.Method).To(Equal("HEAD"))
+				w.Header().Set("Content-Length", "1000")
+				w.WriteHeader(http.StatusOK)
+			}))
+			defer server.Close()
+			uri := URI(server.URL)
+			ctx := context.Background()
+			size, err := uri.ContentLength(ctx)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(size).To(Equal(int64(1000)))
+		})
+		It("returns error on 404", func() {
+			server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				w.WriteHeader(http.StatusNotFound)
+			}))
+			defer server.Close()
+			uri := URI(server.URL)
+			ctx := context.Background()
+			_, err := uri.ContentLength(ctx)
+			Expect(err).To(HaveOccurred())
+		})
+		It("uses Range when Content-Length missing and Accept-Ranges bytes", func() {
+			server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				// See above: assertions on the server goroutine need GinkgoRecover.
+				defer GinkgoRecover()
+				if r.Method == "HEAD" {
+					// HEAD advertises Range support but no length, forcing the fallback.
+					w.Header().Set("Accept-Ranges", "bytes")
+					w.WriteHeader(http.StatusOK)
+					return
+				}
+				Expect(r.Header.Get("Range")).To(Equal("bytes=0-0"))
+				w.Header().Set("Content-Range", "bytes 0-0/5000")
+				w.WriteHeader(http.StatusPartialContent)
+			}))
+			defer server.Close()
+			uri := URI(server.URL)
+			ctx := context.Background()
+			size, err := uri.ContentLength(ctx)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(size).To(Equal(int64(5000)))
+		})
+		It("respects context cancellation", func() {
+			server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				w.Header().Set("Content-Length", "1000")
+				w.WriteHeader(http.StatusOK)
+			}))
+			defer server.Close()
+			// Cancel before the call so the request fails immediately.
+			ctx, cancel := context.WithCancel(context.Background())
+			cancel()
+			uri := URI(server.URL)
+			_, err := uri.ContentLength(ctx)
+			Expect(err).To(HaveOccurred())
+			Expect(errors.Is(err, context.Canceled)).To(BeTrue())
+		})
+	})
+})
+
 type RangeHeaderError struct {
 	msg string
 }
--- a/pkg/functions/iterative_parser.go
+++ b/pkg/functions/iterative_parser.go
@@ -305,6 +305,16 @@ func AllSpace(s string) bool {
 	return strings.TrimSpace(s) == ""
 }

+// allSpaceOrEscapedNewlines reports whether s is blank once every literal
+// two-character sequence `\n` and then `\r` (a backslash followed by the
+// letter, as produced by escaped JSON or backtick strings) has been removed.
+// The XML tool-call prelude checks use it so that escaped newlines between
+// tags are tolerated exactly like real whitespace.
+func allSpaceOrEscapedNewlines(s string) bool {
+	// Strip the sequences one after the other; the order matters because
+	// removing `\n` can leave a new `\r` behind.
+	for _, escaped := range []string{"\\n", "\\r"} {
+		s = strings.ReplaceAll(s, escaped, "")
+	}
+	return len(strings.TrimSpace(s)) == 0
+}
+
 // TryConsumeJSON attempts to consume a JSON value from the current position
 // Returns the parsed JSON (can be object, array, or any JSON type), whether it's partial,
 // and the jsonDumpMarker (non-empty if JSON was healed)
@@ -721,7 +731,7 @@ func (p *ChatMsgParser) TryConsumeXMLToolCalls(format *XMLToolCallFormat) (bool,
 				// No more scopes found, break
 				break
 			}
-			if !AllSpace(tc.Prelude) {
+			if !allSpaceOrEscapedNewlines(tc.Prelude) {
 				// Non-whitespace before scope_start, stop parsing
 				p.MoveTo(tc.Groups[0].Begin - len(tc.Prelude))
 				break
@@ -743,7 +753,7 @@ func (p *ChatMsgParser) TryConsumeXMLToolCalls(format *XMLToolCallFormat) (bool,
 				break
 			}

-			if !AllSpace(tc.Prelude) {
+			if !allSpaceOrEscapedNewlines(tc.Prelude) {
 				// Non-whitespace before tool_start, stop parsing
 				p.MoveTo(tc.Groups[0].Begin - len(tc.Prelude))
 				break
@@ -845,7 +855,7 @@ func (p *ChatMsgParser) TryConsumeXMLToolCalls(format *XMLToolCallFormat) (bool,
 					break
 				}

-				if !AllSpace(keyStart.Prelude) {
+				if !allSpaceOrEscapedNewlines(keyStart.Prelude) {
 					// Non-whitespace before key_start, stop parsing parameters
 					p.MoveTo(keyStart.Groups[0].Begin - len(keyStart.Prelude))
 					break
@@ -1009,7 +1019,7 @@ func (p *ChatMsgParser) TryConsumeXMLToolCalls(format *XMLToolCallFormat) (bool,
 					// Rewind to json_end and check if val_end follows
 					p.MoveTo(jsonEnd)
 					valEndSize, valEnd := tryFindValEnd()
-					if valEnd != nil && AllSpace(valEnd.Prelude) && jsonHealingMarker == "" {
+					if valEnd != nil && allSpaceOrEscapedNewlines(valEnd.Prelude) && jsonHealingMarker == "" {
 						// val_end follows JSON
 						if len(valEnd.Groups) > 0 {
 							matchedSize := valEnd.Groups[0].End - valEnd.Groups[0].Begin
@@ -1105,7 +1115,7 @@ func (p *ChatMsgParser) TryConsumeXMLToolCalls(format *XMLToolCallFormat) (bool,
 				return false, &ChatMsgPartialException{Message: "incomplete tool_call"}
 			}

-			if !AllSpace(toolEnd.Prelude) {
+			if !allSpaceOrEscapedNewlines(toolEnd.Prelude) {
 				return returnError(errors.New("non-whitespace before tool_end"), recovery)
 			}

@@ -1147,7 +1157,7 @@ func (p *ChatMsgParser) TryConsumeXMLToolCalls(format *XMLToolCallFormat) (bool,
 					break
 				}
 				break
-			} else if !AllSpace(tc.Prelude) {
+			} else if !allSpaceOrEscapedNewlines(tc.Prelude) {
 				// Non-whitespace before scope_end - this might be another scope_start
 				// Check if it's actually another scope_start
 				if format.ScopeStart != "" {
--- a/pkg/functions/parse.go
+++ b/pkg/functions/parse.go
@@ -375,6 +375,20 @@ func getAllXMLFormats() []xmlFormatPreset {
 				TrimRawArgVal: true,
 			},
 		},
+		{
+			name: "qwen3.5",
+			format: &XMLToolCallFormat{
+				ScopeStart:    "<tool_call>",
+				ToolStart:     "<function=",
+				ToolSep:       ">",
+				KeyStart:      "<parameter=",
+				KeyValSep:     ">",
+				ValEnd:        "</parameter>",
+				ToolEnd:       "</function>",
+				ScopeEnd:      "</tool_call>",
+				TrimRawArgVal: true,
+			},
+		},
 		{
 			name: "glm-4.5",
 			format: &XMLToolCallFormat{
@@ -483,9 +497,70 @@ func ParseXML(s string, format *XMLToolCallFormat) ([]FuncCallResults, error) {
 	return parseXMLWithFormat(s, format)
 }

+// getScopeOrToolStart picks the marker that opens the tool-calls section of a
+// format: ScopeStart when it is non-empty, otherwise ToolStart. A nil format
+// yields the empty string. Used to mimic llama.cpp's "content until <tool_call>" order.
+func getScopeOrToolStart(format *XMLToolCallFormat) string {
+	switch {
+	case format == nil:
+		return ""
+	case format.ScopeStart != "":
+		return format.ScopeStart
+	default:
+		return format.ToolStart
+	}
+}
+
+// tryParseXMLFromScopeStart locates the first occurrence of the format's
+// scope/tool start marker in s and parses only the text from that marker
+// onwards as XML tool calls, so that reasoning or other content ahead of the
+// tool block cannot trip the "whitespace only before scope" check
+// (mimicking llama.cpp's PEG order).
+//
+// It returns the parsed tool calls and true when at least one call was
+// parsed. When isPartial is set and the parser stops with a
+// *ChatMsgPartialException, whatever calls were collected so far are
+// returned. In every other case the result is (nil, false).
+func tryParseXMLFromScopeStart(s string, format *XMLToolCallFormat, isPartial bool) ([]FuncCallResults, bool) {
+	if format == nil {
+		return nil, false
+	}
+	marker := getScopeOrToolStart(format)
+	if marker == "" {
+		return nil, false
+	}
+	start := strings.Index(s, marker)
+	if start < 0 {
+		return nil, false
+	}
+
+	parser := NewChatMsgParser(s[start:], isPartial)
+	consumed, err := parser.TryConsumeXMLToolCalls(format)
+	if err == nil {
+		if consumed && len(parser.ToolCalls()) > 0 {
+			return parser.ToolCalls(), true
+		}
+		return nil, false
+	}
+
+	// Only an incomplete message in partial mode may still hand back results.
+	if _, incomplete := err.(*ChatMsgPartialException); incomplete && isPartial {
+		return parser.ToolCalls(), len(parser.ToolCalls()) > 0
+	}
+	return nil, false
+}
+
 // ParseXMLIterative parses XML tool calls using the iterative parser
-// This provides better streaming and partial parsing support
+// This provides better streaming and partial parsing support.
+// When format is nil or when format is set, tries "find scope/tool start, split, parse suffix"
+// first (llama.cpp PEG order) so that content before the tool block does not cause parse failure.
 func ParseXMLIterative(s string, format *XMLToolCallFormat, isPartial bool) ([]FuncCallResults, error) {
+	// Try split-on-scope first so reasoning/content before tool block is skipped
+	if format != nil {
+		if results, ok := tryParseXMLFromScopeStart(s, format, isPartial); ok {
+			return results, nil
+		}
+	} else {
+		formats := getAllXMLFormats()
+		for _, fmtPreset := range formats {
+			if fmtPreset.format != nil {
+				if results, ok := tryParseXMLFromScopeStart(s, fmtPreset.format, isPartial); ok {
+					return results, nil
+				}
+			}
+		}
+	}
+
 	parser := NewChatMsgParser(s, isPartial)

 	// Auto-detect format if not provided
@@ -1621,16 +1696,54 @@ func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncC
 	// but we've already parsed it, so we shouldn't try XML parsing on the same content
 	skipXMLParsing := (len(functionConfig.JSONRegexMatch) > 0 || len(functionConfig.ResponseRegex) > 0) && len(results) > 0
 	if len(results) == 0 && !skipXMLParsing {
-		xmlResults, err := ParseXML(llmresult, xmlFormat)
-		if err == nil && len(xmlResults) > 0 {
-			xlog.Debug("Found XML tool calls", "count", len(xmlResults))
-			results = append(results, xmlResults...)
+		// Mimic llama.cpp PEG order: try "find scope/tool start, split, parse suffix" first so that
+		// reasoning or content before the tool block (e.g. <think>...</think>) does not cause parse failure.
+		if xmlFormat != nil {
+			if xmlResults, ok := tryParseXMLFromScopeStart(llmresult, xmlFormat, false); ok {
+				xlog.Debug("Found XML tool calls (split-on-scope)", "count", len(xmlResults))
+				results = append(results, xmlResults...)
+			}
+		} else {
+			formats := getAllXMLFormats()
+			for _, fmtPreset := range formats {
+				if fmtPreset.format != nil {
+					if xmlResults, ok := tryParseXMLFromScopeStart(llmresult, fmtPreset.format, false); ok {
+						xlog.Debug("Found XML tool calls (split-on-scope, auto-detect)", "format", fmtPreset.name, "count", len(xmlResults))
+						results = append(results, xmlResults...)
+						break
+					}
+				}
+			}
+		}
+		if len(results) == 0 {
+			xmlResults, err := ParseXML(llmresult, xmlFormat)
+			if err == nil && len(xmlResults) > 0 {
+				xlog.Debug("Found XML tool calls", "count", len(xmlResults))
+				results = append(results, xmlResults...)
+			}
 		}
 	} else if len(results) > 0 && !skipXMLParsing {
 		// Even if we found JSON results, check for XML tool calls in the response
-		// This handles mixed content scenarios (text + JSON + XML)
-		// But skip if JSONRegexMatch or ResponseRegex was used (they already extracted the content)
-		xmlResults, err := ParseXML(llmresult, xmlFormat)
+		// Try split-on-scope first (llama.cpp order), then full ParseXML
+		var xmlResults []FuncCallResults
+		var err error
+		if xmlFormat != nil {
+			xmlResults, _ = tryParseXMLFromScopeStart(llmresult, xmlFormat, false)
+		}
+		if len(xmlResults) == 0 && xmlFormat == nil {
+			formats := getAllXMLFormats()
+			for _, fmtPreset := range formats {
+				if fmtPreset.format != nil {
+					xmlResults, _ = tryParseXMLFromScopeStart(llmresult, fmtPreset.format, false)
+					if len(xmlResults) > 0 {
+						break
+					}
+				}
+			}
+		}
+		if len(xmlResults) == 0 {
+			xmlResults, err = ParseXML(llmresult, xmlFormat)
+		}
 		if err == nil && len(xmlResults) > 0 {
 			// Check if JSON is inside XML tags, if so, skip it
 			for _, result := range xmlResults {
--- a/pkg/functions/parse_test.go
+++ b/pkg/functions/parse_test.go
@@ -779,6 +779,42 @@ value
 			Expect(results[0].Name).To(Equal("glob"))
 			Expect(results[0].Arguments).To(Equal(`{"pattern":"**/package.json"}`))
 		})
+		// A <think> block precedes the tool call and no XML format preset is
+		// configured on cfg; the single bash call must still be extracted.
+		It("should parse tool calls when reasoning (<think>) precedes tool block (Qwen3.5-style)", func() {
+			input := `<think>
+I need to run a command.
+</think>
+<tool_call>
+<function=bash>
+<parameter=script>
+echo hello
+</parameter>
+</function>
+</tool_call>`
+			cfg := FunctionsConfig{}
+			results := ParseFunctionCall(input, cfg)
+			Expect(results).To(HaveLen(1))
+			Expect(results[0].Name).To(Equal("bash"))
+			Expect(results[0].Arguments).To(ContainSubstring("echo hello"))
+		})
+
+		// Same input as the auto-detect spec, but with the "qwen3.5" preset
+		// selected explicitly. The spec text is distinct so the two cases can be
+		// told apart in reports and targeted with --focus.
+		It("should parse tool calls when reasoning (<think>) precedes tool block (explicit qwen3.5 preset)", func() {
+			input := `<think>
+I need to run a command.
+</think>
+<tool_call>
+<function=bash>
+<parameter=script>
+echo hello
+</parameter>
+</function>
+</tool_call>`
+			cfg := FunctionsConfig{}
+			cfg.XMLFormatPreset = "qwen3.5"
+			results := ParseFunctionCall(input, cfg)
+			Expect(results).To(HaveLen(1))
+			Expect(results[0].Name).To(Equal("bash"))
+			Expect(results[0].Arguments).To(ContainSubstring("echo hello"))
+		})

 		It("should parse XML tool calls alongside JSON tool calls", func() {
 			input := `{"name": "add", "arguments": {"x": 5, "y": 3}}
@@ -1690,6 +1726,24 @@ value
 				// Arguments should contain partial flag
 				Expect(results[0].Arguments).To(ContainSubstring("key"))
 			})
+			// Prose precedes the tool block and the tags are separated by real
+			// newline characters; parsed in partial mode with no explicit format.
+			It("should return tool call when leading text precedes tool block (real newlines)", func() {
+				input := "The memory reclaimer functionality already exists! Let me examine the watchdog to understand how it works and what might need to be implemented for \"auto-fit\" vs unloading.\n\n<tool_call>\n<function=bash>\n<parameter=script>\ncd /root/worktrees/LocalAI/task_8562 && cat core/application/watchdog.go\n</parameter>\n</function>\n</tool_call>"
+				results, err := ParseXMLIterative(input, nil, true)
+				Expect(err).NotTo(HaveOccurred())
+				Expect(results).NotTo(BeNil())
+				Expect(results).To(HaveLen(1))
+				Expect(results[0].Name).To(Equal("bash"))
+				Expect(results[0].Arguments).To(ContainSubstring("task_8562"))
+			})
+			// The input is a raw string, so every \n below is the two-character
+			// sequence backslash + n rather than a newline; parsed in non-partial
+			// mode with no explicit format.
+			It("should return tool call when leading text precedes tool block (literal \\n between tags)", func() {
+				input := `The memory reclaimer functionality already exists! Let me examine the watchdog to understand how it works and what might need to be implemented for "auto-fit" vs unloading.\n\n<tool_call>\n<function=bash>\n<parameter=script>\ncd /root/worktrees/LocalAI/task_8562 && cat core/application/watchdog.go\n</parameter>\n</function>\n</tool_call>`
+				results, err := ParseXMLIterative(input, nil, false)
+				Expect(err).NotTo(HaveOccurred())
+				Expect(results).NotTo(BeNil())
+				Expect(results).To(HaveLen(1))
+				Expect(results[0].Name).To(Equal("bash"))
+				Expect(results[0].Arguments).To(ContainSubstring("task_8562"))
+			})
 		})

 		Describe("ParseJSONIterative", func() {
--- a/pkg/system/capabilities.go
+++ b/pkg/system/capabilities.go
@@ -50,9 +50,9 @@ var (
 )

 func init() {
-	_, err := os.Stat(filepath.Join("usr", "local", "cuda-13"))
+	// Record whether the CUDA 13 / CUDA 12 install directories exist. The leading
+	// path separator roots the joined path instead of leaving it relative to the
+	// current working directory.
+	_, err := os.Stat(filepath.Join(string(os.PathSeparator), "usr", "local", "cuda-13"))
 	cuda13DirExists = err == nil
-	_, err = os.Stat(filepath.Join("usr", "local", "cuda-12"))
+	_, err = os.Stat(filepath.Join(string(os.PathSeparator), "usr", "local", "cuda-12"))
 	cuda12DirExists = err == nil
 }

--- a/pkg/vram/cache.go
+++ b/pkg/vram/cache.go
@@ -0,0 +1,96 @@
+package vram
+
+import (
+	"context"
+	"sync"
+	"time"
+)
+
+// defaultEstimateCacheTTL is the entry lifetime used by the package-level default caches.
+const defaultEstimateCacheTTL = 15 * time.Minute
+
+// sizeCacheEntry is one cached ContentLength outcome (size and error) plus its expiry.
+type sizeCacheEntry struct {
+	size  int64
+	err   error
+	until time.Time // the entry is served only while time.Now() is before this instant
+}
+
+// cachedSizeResolver wraps a SizeResolver and remembers its results per URI
+// for ttl. The mutex guards the cache map.
+type cachedSizeResolver struct {
+	underlying SizeResolver
+	ttl        time.Duration
+	mu         sync.Mutex
+	cache      map[string]sizeCacheEntry
+}
+
+// ContentLength returns the cached result for uri while it is still fresh and
+// otherwise asks the underlying resolver, storing the outcome (size and error)
+// for ttl. A failure that coincides with ctx being cancelled or past its
+// deadline is returned but not stored, so one aborted request cannot poison
+// the cache for later callers.
+func (c *cachedSizeResolver) ContentLength(ctx context.Context, uri string) (int64, error) {
+	c.mu.Lock()
+	e, ok := c.cache[uri]
+	c.mu.Unlock()
+	if ok && time.Now().Before(e.until) {
+		return e.size, e.err
+	}
+	// The lock is not held during the lookup, so concurrent callers may each
+	// query the underlying resolver; the last writer wins.
+	size, err := c.underlying.ContentLength(ctx, uri)
+	if err != nil && ctx.Err() != nil {
+		return size, err
+	}
+	c.mu.Lock()
+	if c.cache == nil {
+		c.cache = make(map[string]sizeCacheEntry)
+	}
+	c.cache[uri] = sizeCacheEntry{size: size, err: err, until: time.Now().Add(c.ttl)}
+	c.mu.Unlock()
+	return size, err
+}
+
+// ggufCacheEntry is one cached ReadMetadata outcome (metadata and error) plus its expiry.
+type ggufCacheEntry struct {
+	meta  *GGUFMeta
+	err   error
+	until time.Time // the entry is served only while time.Now() is before this instant
+}
+
+// cachedGGUFReader wraps a GGUFMetadataReader and remembers its results per
+// URI for ttl. The mutex guards the cache map.
+type cachedGGUFReader struct {
+	underlying GGUFMetadataReader
+	ttl        time.Duration
+	mu         sync.Mutex
+	cache      map[string]ggufCacheEntry
+}
+
+// ReadMetadata returns the cached result for uri while it is still fresh and
+// otherwise asks the underlying reader, storing the outcome (metadata and
+// error) for ttl. A failure that coincides with ctx being cancelled or past
+// its deadline is returned but not stored, so one aborted request cannot
+// poison the cache for later callers.
+func (c *cachedGGUFReader) ReadMetadata(ctx context.Context, uri string) (*GGUFMeta, error) {
+	c.mu.Lock()
+	e, ok := c.cache[uri]
+	c.mu.Unlock()
+	if ok && time.Now().Before(e.until) {
+		return e.meta, e.err
+	}
+	// The lock is not held during the read, so concurrent callers may each
+	// query the underlying reader; the last writer wins.
+	meta, err := c.underlying.ReadMetadata(ctx, uri)
+	if err != nil && ctx.Err() != nil {
+		return meta, err
+	}
+	c.mu.Lock()
+	if c.cache == nil {
+		c.cache = make(map[string]ggufCacheEntry)
+	}
+	c.cache[uri] = ggufCacheEntry{meta: meta, err: err, until: time.Now().Add(c.ttl)}
+	c.mu.Unlock()
+	return meta, err
+}
+
+// CachedSizeResolver returns a SizeResolver that caches ContentLength results by URI for the given TTL.
+// Entries are keyed by the exact URI string; each returned resolver owns its own cache.
+func CachedSizeResolver(underlying SizeResolver, ttl time.Duration) SizeResolver {
+	return &cachedSizeResolver{underlying: underlying, ttl: ttl, cache: make(map[string]sizeCacheEntry)}
+}
+
+// CachedGGUFReader returns a GGUFMetadataReader that caches ReadMetadata results by URI for the given TTL.
+// Entries are keyed by the exact URI string; each returned reader owns its own cache.
+func CachedGGUFReader(underlying GGUFMetadataReader, ttl time.Duration) GGUFMetadataReader {
+	return &cachedGGUFReader{underlying: underlying, ttl: ttl, cache: make(map[string]ggufCacheEntry)}
+}
+
+// DefaultCachedSizeResolver returns a cached SizeResolver using the default implementation and default TTL (15 min).
+// A single shared cache is used so repeated HEAD requests for the same URI are avoided across requests.
+// Every call returns the same package-level instance.
+func DefaultCachedSizeResolver() SizeResolver {
+	return defaultCachedSizeResolver
+}
+
+// DefaultCachedGGUFReader returns a cached GGUFMetadataReader using the default implementation and default TTL (15 min).
+// A single shared cache is used so repeated GGUF metadata fetches for the same URI are avoided across requests.
+// Every call returns the same package-level instance.
+func DefaultCachedGGUFReader() GGUFMetadataReader {
+	return defaultCachedGGUFReader
+}
+
+// Package-level cached instances handed out by DefaultCachedSizeResolver and
+// DefaultCachedGGUFReader; both wrap the default implementations with defaultEstimateCacheTTL.
+var (
+	defaultCachedSizeResolver = CachedSizeResolver(defaultSizeResolver{}, defaultEstimateCacheTTL)
+	defaultCachedGGUFReader   = CachedGGUFReader(defaultGGUFReader{}, defaultEstimateCacheTTL)
+)
--- a/pkg/vram/estimate.go
+++ b/pkg/vram/estimate.go
@@ -0,0 +1,152 @@
+package vram
+
+import (
+	"context"
+	"fmt"
+	"path"
+	"strings"
+
+	"github.com/mudler/LocalAI/pkg/downloader"
+)
+
+// weightExts lists the lower-case file extensions that isWeightFile treats as model weight files.
+var weightExts = map[string]bool{
+	".gguf": true, ".safetensors": true, ".bin": true, ".pt": true,
+}
+
+// isWeightFile reports whether the last path element of nameOrURI carries one
+// of the extensions in weightExts. The comparison ignores case.
+func isWeightFile(nameOrURI string) bool {
+	base := path.Base(nameOrURI)
+	key := strings.ToLower(path.Ext(base))
+	return weightExts[key]
+}
+
+// isGGUF reports whether the last path element of nameOrURI has the ".gguf"
+// extension, ignoring case.
+func isGGUF(nameOrURI string) bool {
+	ext := path.Ext(path.Base(nameOrURI))
+	return strings.EqualFold(ext, ".gguf")
+}
+
+// Estimate sums the size of the weight files in files and derives a VRAM
+// estimate from it.
+//
+// Only URIs accepted by isWeightFile are counted. A file whose Size is not
+// positive is resolved through sizeResolver when one is supplied; files whose
+// lookup fails or whose size is negative are skipped. If any GGUF file
+// contributed a size and ggufReader yields metadata for the first GGUF URI,
+// the estimate is GGUF size + a KV term + an overhead term, with the GGUF
+// size term scaled by gpuLayers/nLayers when fewer layers than the model has
+// are offloaded. Otherwise sizeOnlyVRAM is applied. Metadata read errors are
+// deliberately ignored so the size-only estimate is still produced.
+//
+// A zero ContextLength defaults to 8192 and a non-positive KVQuantBits to 16.
+// The returned error is currently always nil.
+func Estimate(ctx context.Context, files []FileInput, opts EstimateOptions, sizeResolver SizeResolver, ggufReader GGUFMetadataReader) (EstimateResult, error) {
+	if opts.ContextLength == 0 {
+		opts.ContextLength = 8192
+	}
+	// A negative bit width would wrap around in the unsigned conversion below.
+	if opts.KVQuantBits <= 0 {
+		opts.KVQuantBits = 16
+	}
+
+	var sizeBytes uint64
+	var ggufSize uint64
+	var firstGGUFURI string
+	for i := range files {
+		f := &files[i]
+		if !isWeightFile(f.URI) {
+			continue
+		}
+		sz := f.Size
+		if sz <= 0 && sizeResolver != nil {
+			var err error
+			sz, err = sizeResolver.ContentLength(ctx, f.URI)
+			if err != nil {
+				continue
+			}
+		}
+		if sz < 0 {
+			// Converting a negative size to uint64 would wrap to a huge value.
+			continue
+		}
+		sizeBytes += uint64(sz)
+		if isGGUF(f.URI) {
+			ggufSize += uint64(sz)
+			if firstGGUFURI == "" {
+				firstGGUFURI = f.URI
+			}
+		}
+	}
+
+	sizeDisplay := FormatBytes(sizeBytes)
+
+	var vramBytes uint64
+	if ggufSize > 0 {
+		var meta *GGUFMeta
+		if ggufReader != nil && firstGGUFURI != "" {
+			// Best effort: on error meta stays nil and the size-only path is used.
+			meta, _ = ggufReader.ReadMetadata(ctx, firstGGUFURI)
+		}
+		if meta != nil && (meta.BlockCount > 0 || meta.EmbeddingLength > 0) {
+			// Fill in fallback values for any metadata field that is missing.
+			nLayers := meta.BlockCount
+			if nLayers == 0 {
+				nLayers = 32
+			}
+			dModel := meta.EmbeddingLength
+			if dModel == 0 {
+				dModel = 4096
+			}
+			headCountKV := meta.HeadCountKV
+			if headCountKV == 0 {
+				headCountKV = meta.HeadCount
+			}
+			if headCountKV == 0 {
+				headCountKV = 8
+			}
+			gpuLayers := opts.GPULayers
+			if gpuLayers <= 0 {
+				gpuLayers = int(nLayers)
+			}
+			ctxLen := opts.ContextLength
+			kvElemBytes := uint32(opts.KVQuantBits / 8)
+			if kvElemBytes == 0 {
+				// NOTE(review): widths below 8 bits fall back to 4 here — confirm this is intended.
+				kvElemBytes = 4
+			}
+			modelBytes := ggufSize
+			kvBytes := uint64(kvElemBytes) * uint64(dModel) * uint64(nLayers) * uint64(ctxLen)
+			if headCountKV > 0 && meta.HeadCount > 0 {
+				kvBytes = uint64(kvElemBytes) * uint64(dModel) * uint64(headCountKV) * uint64(ctxLen)
+			}
+			doubled := modelBytes * 2
+			overheadBytes := uint64(0.02*float64(doubled) + 0.15*1e9)
+			vramBytes = modelBytes + kvBytes + overheadBytes
+			if nLayers > 0 && gpuLayers < int(nLayers) {
+				// Partial offload: only the model term is scaled by the layer ratio.
+				layerRatio := float64(gpuLayers) / float64(nLayers)
+				vramBytes = uint64(layerRatio*float64(modelBytes)) + kvBytes + overheadBytes
+			}
+		} else {
+			vramBytes = sizeOnlyVRAM(ggufSize, opts.ContextLength)
+		}
+	} else if sizeBytes > 0 {
+		vramBytes = sizeOnlyVRAM(sizeBytes, opts.ContextLength)
+	}
+
+	return EstimateResult{
+		SizeBytes:   sizeBytes,
+		SizeDisplay: sizeDisplay,
+		VRAMBytes:   vramBytes,
+		VRAMDisplay: FormatBytes(vramBytes),
+	}, nil
+}
+
+func sizeOnlyVRAM(sizeOnDisk uint64, ctxLen uint32) uint64 {
+	k := uint64(1024)
+	vram := sizeOnDisk + k*uint64(ctxLen)*2
+	if vram < sizeOnDisk {
+		vram = sizeOnDisk
+	}
+	return vram
+}
+
+// FormatBytes renders n as a human-readable size using decimal (1000-based)
+// units: values below 1000 are printed as "<n> B", larger ones with one
+// decimal place and a K/M/G/T/P/E prefix, e.g. "2.5 GB".
+func FormatBytes(n uint64) string {
+	const unit = 1000
+	const prefixes = "KMGTPE"
+	if n < unit {
+		return fmt.Sprintf("%d B", n)
+	}
+	// Grow the divisor until the integer quotient drops below one unit.
+	divisor := uint64(unit)
+	idx := 0
+	for n/divisor >= unit {
+		divisor *= unit
+		idx++
+	}
+	return fmt.Sprintf("%.1f %cB", float64(n)/float64(divisor), prefixes[idx])
+}
+
+// defaultSizeResolver is the stateless SizeResolver backed by the downloader package.
+type defaultSizeResolver struct{}
+
+// ContentLength delegates to downloader.URI(uri).ContentLength.
+func (defaultSizeResolver) ContentLength(ctx context.Context, uri string) (int64, error) {
+	return downloader.URI(uri).ContentLength(ctx)
+}
+
+// DefaultSizeResolver returns the uncached default SizeResolver.
+func DefaultSizeResolver() SizeResolver {
+	return defaultSizeResolver{}
+}
+
+// DefaultGGUFReader returns the uncached default GGUFMetadataReader.
+func DefaultGGUFReader() GGUFMetadataReader {
+	return defaultGGUFReader{}
+}
--- a/pkg/vram/estimate_test.go
+++ b/pkg/vram/estimate_test.go
@@ -0,0 +1,137 @@
+package vram_test
+
+import (
+	"context"
+
+	. "github.com/mudler/LocalAI/pkg/vram"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// fakeSizeResolver is a test double mapping URIs to fixed sizes.
+type fakeSizeResolver map[string]int64
+
+// ContentLength returns the size registered for uri, or 0 with a nil error when the URI is unknown.
+func (f fakeSizeResolver) ContentLength(ctx context.Context, uri string) (int64, error) {
+	if n, ok := f[uri]; ok {
+		return int64(n), nil
+	}
+	return 0, nil
+}
+
+// fakeGGUFReader is a test double mapping URIs to canned GGUF metadata.
+type fakeGGUFReader map[string]*GGUFMeta
+
+// ReadMetadata returns the metadata registered for uri (nil when unknown) and never an error.
+func (f fakeGGUFReader) ReadMetadata(ctx context.Context, uri string) (*GGUFMeta, error) {
+	return f[uri], nil
+}
+
+// Specs for Estimate: size summation, resolver fallback and the metadata-based VRAM path.
+var _ = Describe("Estimate", func() {
+	ctx := context.Background()
+
+	Describe("empty or non-GGUF inputs", func() {
+		It("returns zero size and vram for nil files", func() {
+			opts := EstimateOptions{ContextLength: 8192}
+			res, err := Estimate(ctx, nil, opts, nil, nil)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res.SizeBytes).To(Equal(uint64(0)))
+			Expect(res.VRAMBytes).To(Equal(uint64(0)))
+			Expect(res.SizeDisplay).To(Equal("0 B"))
+		})
+
+		It("counts only .gguf files and ignores other extensions", func() {
+			// readme.txt does not have a weight-file extension, so only the .gguf size is summed.
+			files := []FileInput{
+				{URI: "http://a/model.gguf", Size: 1_000_000_000},
+				{URI: "http://a/readme.txt", Size: 100},
+			}
+			opts := EstimateOptions{ContextLength: 8192}
+			res, err := Estimate(ctx, files, opts, nil, nil)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res.SizeBytes).To(Equal(uint64(1_000_000_000)))
+		})
+
+		It("sums size for multiple non-GGUF weight files (e.g. safetensors)", func() {
+			files := []FileInput{
+				{URI: "http://hf.co/model/model.safetensors", Size: 2_000_000_000},
+				{URI: "http://hf.co/model/model2.safetensors", Size: 3_000_000_000},
+			}
+			opts := EstimateOptions{ContextLength: 8192}
+			res, err := Estimate(ctx, files, opts, nil, nil)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res.SizeBytes).To(Equal(uint64(5_000_000_000)))
+		})
+	})
+
+	Describe("GGUF size and resolver", func() {
+		It("uses size resolver when file size is not set", func() {
+			// Size is left at zero, so Estimate has to ask the resolver.
+			sizes := fakeSizeResolver{"http://example.com/model.gguf": 1_500_000_000}
+			opts := EstimateOptions{ContextLength: 8192}
+			files := []FileInput{{URI: "http://example.com/model.gguf"}}
+
+			res, err := Estimate(ctx, files, opts, sizes, nil)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res.SizeBytes).To(Equal(uint64(1_500_000_000)))
+			Expect(res.VRAMBytes).To(BeNumerically(">=", res.SizeBytes))
+			Expect(res.SizeDisplay).To(Equal("1.5 GB"))
+		})
+
+		It("uses size-only VRAM formula when metadata is missing and size is large", func() {
+			// No GGUF reader is supplied, so no metadata is available.
+			sizes := fakeSizeResolver{"http://a/model.gguf": 10_000_000_000}
+			opts := EstimateOptions{ContextLength: 8192}
+			files := []FileInput{{URI: "http://a/model.gguf"}}
+
+			res, err := Estimate(ctx, files, opts, sizes, nil)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res.VRAMBytes).To(BeNumerically(">", 10_000_000_000))
+		})
+
+		It("sums size for multiple GGUF shards", func() {
+			files := []FileInput{
+				{URI: "http://a/shard1.gguf", Size: 10_000_000_000},
+				{URI: "http://a/shard2.gguf", Size: 5_000_000_000},
+			}
+			opts := EstimateOptions{ContextLength: 8192}
+
+			res, err := Estimate(ctx, files, opts, nil, nil)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res.SizeBytes).To(Equal(uint64(15_000_000_000)))
+		})
+
+		It("formats size display correctly", func() {
+			files := []FileInput{{URI: "http://a/model.gguf", Size: 2_500_000_000}}
+			opts := EstimateOptions{ContextLength: 8192}
+
+			res, err := Estimate(ctx, files, opts, nil, nil)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res.SizeDisplay).To(Equal("2.5 GB"))
+		})
+	})
+
+	Describe("GGUF with metadata reader", func() {
+		It("uses metadata for VRAM when reader returns meta and partial offload", func() {
+			// GPULayers (20) is below BlockCount (32), exercising the partial-offload branch.
+			meta := &GGUFMeta{BlockCount: 32, EmbeddingLength: 4096}
+			reader := fakeGGUFReader{"http://a/model.gguf": meta}
+			opts := EstimateOptions{ContextLength: 8192, GPULayers: 20}
+			files := []FileInput{{URI: "http://a/model.gguf", Size: 8_000_000_000}}
+
+			res, err := Estimate(ctx, files, opts, nil, reader)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res.VRAMBytes).To(BeNumerically(">", 0))
+		})
+
+		It("uses metadata head counts for KV and yields vram > size", func() {
+			files := []FileInput{{URI: "http://a/model.gguf", Size: 15_000_000_000}}
+			meta := &GGUFMeta{BlockCount: 32, EmbeddingLength: 4096, HeadCount: 32, HeadCountKV: 8}
+			reader := fakeGGUFReader{"http://a/model.gguf": meta}
+			opts := EstimateOptions{ContextLength: 8192}
+
+			res, err := Estimate(ctx, files, opts, nil, reader)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res.SizeBytes).To(Equal(uint64(15_000_000_000)))
+			Expect(res.VRAMBytes).To(BeNumerically(">", res.SizeBytes))
+		})
+	})
+})
+
+// Spec pinning the decimal (1000-based) unit formatting of FormatBytes.
+var _ = Describe("FormatBytes", func() {
+	It("formats 2.5e9 as 2.5 GB", func() {
+		Expect(FormatBytes(2_500_000_000)).To(Equal("2.5 GB"))
+	})
+})
--- a/pkg/vram/gguf_reader.go
+++ b/pkg/vram/gguf_reader.go
@@ -0,0 +1,46 @@
+package vram
+
+import (
+	"context"
+	"strings"
+
+	"github.com/mudler/LocalAI/pkg/downloader"
+	gguf "github.com/gpustack/gguf-parser-go"
+)
+
+// defaultGGUFReader is the stateless GGUFMetadataReader backed by gguf-parser-go.
+type defaultGGUFReader struct{}
+
+// ReadMetadata parses the GGUF file behind uri and converts it to a GGUFMeta.
+// URIs starting with downloader.LocalPrefix are parsed from the resolved local
+// path, HTTP(S) URLs are parsed remotely using ctx, and any other URI yields
+// (nil, nil).
+func (defaultGGUFReader) ReadMetadata(ctx context.Context, uri string) (*GGUFMeta, error) {
+	source := downloader.URI(uri)
+	resolved := source.ResolveURL()
+
+	var (
+		parsed *gguf.GGUFFile
+		err    error
+	)
+	switch {
+	case strings.HasPrefix(uri, downloader.LocalPrefix):
+		parsed, err = gguf.ParseGGUFFile(resolved)
+	case source.LooksLikeHTTPURL():
+		parsed, err = gguf.ParseGGUFFileRemote(ctx, resolved)
+	default:
+		// Neither a local file nor an HTTP(S) URL: nothing to read.
+		return nil, nil
+	}
+	if err != nil {
+		return nil, err
+	}
+	return ggufFileToMeta(parsed), nil
+}
+
+// ggufFileToMeta copies the architecture fields used for VRAM estimation out
+// of a parsed GGUF file. When the file reports no KV head count, the
+// attention head count is used in its place.
+func ggufFileToMeta(f *gguf.GGUFFile) *GGUFMeta {
+	arch := f.Architecture()
+	heads := uint32(arch.AttentionHeadCount)
+	kvHeads := uint32(arch.AttentionHeadCountKV)
+	if kvHeads == 0 {
+		kvHeads = heads
+	}
+	return &GGUFMeta{
+		BlockCount:      uint32(arch.BlockCount),
+		EmbeddingLength: uint32(arch.EmbeddingLength),
+		HeadCount:       heads,
+		HeadCountKV:     kvHeads,
+	}
+}
--- a/pkg/vram/types.go
+++ b/pkg/vram/types.go
@@ -0,0 +1,42 @@
+package vram
+
+import "context"
+
+// FileInput represents a single model file for estimation (URI and optional pre-known size).
+type FileInput struct {
+	URI  string // location of the model file
+	Size int64  // size in bytes when already known; NOTE(review): how an unset value is treated is decided by the estimator — confirm there
+}
+
+// SizeResolver returns the content length in bytes for a given URI.
+type SizeResolver interface {
+	// ContentLength reports the size in bytes of the resource at uri, or an error.
+	ContentLength(ctx context.Context, uri string) (int64, error)
+}
+
+// GGUFMeta holds parsed GGUF metadata used for VRAM estimation.
+//
+// defaultGGUFReader fills these fields from gguf.GGUFFile.Architecture() and
+// stores HeadCount in HeadCountKV when the parsed file reports 0 KV heads.
+type GGUFMeta struct {
+	BlockCount      uint32 // block count reported by the architecture
+	EmbeddingLength uint32 // embedding length reported by the architecture
+	HeadCount       uint32 // attention head count
+	HeadCountKV     uint32 // attention KV head count
+}
+
+// GGUFMetadataReader reads GGUF metadata from a URI (e.g. via HTTP Range).
+type GGUFMetadataReader interface {
+	// ReadMetadata returns the metadata for uri. The result may be nil with a
+	// nil error: defaultGGUFReader does this for URIs it cannot handle.
+	ReadMetadata(ctx context.Context, uri string) (*GGUFMeta, error)
+}
+
+// EstimateOptions configures VRAM/size estimation.
+type EstimateOptions struct {
+	ContextLength uint32 // NOTE(review): presumably the context window in tokens — confirm in Estimate
+	GPULayers     int    // NOTE(review): presumably the number of layers placed on the GPU — confirm in Estimate
+	KVQuantBits   int    // NOTE(review): presumably bits per KV-cache element — confirm in Estimate
+}
+
+// EstimateResult holds estimated download size and VRAM with display strings.
+type EstimateResult struct {
+	SizeBytes   uint64 // estimated download size in bytes
+	SizeDisplay string // display string for SizeBytes
+	VRAMBytes   uint64 // estimated VRAM in bytes
+	VRAMDisplay string // display string for VRAMBytes
+}
--- a/pkg/vram/vram_suite_test.go
+++ b/pkg/vram/vram_suite_test.go
@@ -0,0 +1,13 @@
+package vram_test
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// TestVram is the `go test` entry point for this package's Ginkgo specs: it
+// registers Ginkgo's Fail as the Gomega fail handler and then runs the specs
+// under the suite description "Vram test suite".
+func TestVram(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Vram test suite")
+}
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -2183,6 +2183,18 @@ const docTemplate = `{
        "schema.GalleryResponse": {
            "type": "object",
            "properties": {
+                "estimated_size_bytes": {
+                    "type": "integer"
+                },
+                "estimated_size_display": {
+                    "type": "string"
+                },
+                "estimated_vram_bytes": {
+                    "type": "integer"
+                },
+                "estimated_vram_display": {
+                    "type": "string"
+                },
                "status": {
                    "type": "string"
                },
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -2176,6 +2176,18 @@
        "schema.GalleryResponse": {
            "type": "object",
            "properties": {
+                "estimated_size_bytes": {
+                    "type": "integer"
+                },
+                "estimated_size_display": {
+                    "type": "string"
+                },
+                "estimated_vram_bytes": {
+                    "type": "integer"
+                },
+                "estimated_vram_display": {
+                    "type": "string"
+                },
                "status": {
                    "type": "string"
                },
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -444,6 +444,14 @@ definitions:
    type: object
  schema.GalleryResponse:
    properties:
+      estimated_size_bytes:
+        type: integer
+      estimated_size_display:
+        type: string
+      estimated_vram_bytes:
+        type: integer
+      estimated_vram_display:
+        type: string
      status:
        type: string
      uuid:
Author	SHA1	Message	Date
Ettore Di Giacinto	2aaddbb3b8	chore(ci): wire external backend for tests Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-03-01 21:33:20 +00:00
LocalAI [bot]	0063e5d68f	feat(swagger): update swagger (#8706 ) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-03-01 21:33:19 +01:00
Ettore Di Giacinto	c7c4a20a9e	fix: retry when LLM returns empty messages (#8704 ) * debug Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * retry instead of re-computing a response Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-03-01 21:32:38 +01:00
LocalAI [bot]	94539f3992	chore(model gallery): 🤖 add 1 new models via gallery agent (#8698 ) chore(model gallery): 🤖 add new models via gallery agent Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-03-01 16:54:01 +01:00
LocalAI [bot]	525278658d	chore(model gallery): 🤖 add 1 new models via gallery agent (#8696 ) chore(model gallery): 🤖 add new models via gallery agent Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-03-01 16:19:38 +01:00
LocalAI [bot]	919f801e25	chore(model gallery): 🤖 add 1 new models via gallery agent (#8695 ) chore(model gallery): 🤖 add new models via gallery agent Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-03-01 15:58:02 +01:00
LocalAI [bot]	362eb261c5	chore(model gallery): 🤖 add 1 new models via gallery agent (#8694 ) chore(model gallery): 🤖 add new models via gallery agent Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-03-01 15:40:43 +01:00
LocalAI [bot]	d407f4ead5	chore(model gallery): 🤖 add 1 new models via gallery agent (#8693 ) chore(model gallery): 🤖 add new models via gallery agent Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-03-01 15:25:08 +01:00
Ettore Di Giacinto	1fc8ad854f	fix(toolcall): consider also literal \n between tags Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-03-01 11:20:46 +01:00
Loryan Strant	f49a8edd87	docs: Update Home Assistant links in README.md (#8688 ) Update Home Assistant links in README.md Signed-off-by: Loryan Strant <51473494+loryanstrant@users.noreply.github.com>	2026-03-01 08:28:58 +01:00
Ettore Di Giacinto	510b830d2b	fix: simplify CI steps, fix gallery agent (#8685 ) chore: simplify CI steps, fix gallery agent Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-03-01 01:00:30 +01:00
LocalAI [bot]	ddb36468ed	chore: ⬆️ Update ggml-org/llama.cpp to `05728db18eea59de81ee3a7699739daaf015206b` (#8683 ) ⬆️ Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-03-01 00:48:26 +01:00
Ettore Di Giacinto	983db7bedc	feat(ui): add model size estimation (#8684 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-02-28 23:03:47 +01:00
LocalAI [bot]	b260378694	docs: add TLS reverse proxy configuration guide (#8673 ) * docs: add TLS reverse proxy configuration guide Add documentation explaining how to use LocalAI behind a TLS termination reverse proxy (HAProxy, Apache, Nginx). The documentation covers: - How LocalAI detects HTTPS via X-Forwarded-Proto header - Required headers that must be forwarded - Configuration examples for HAProxy, Apache, and Nginx - Sub-path serving configuration - Testing and troubleshooting guide Fixes: Issue #7176 - Web UI broken behind TLS reverse proxy Signed-off-by: localai-bot <localai-bot@users.noreply.github.com> * docs: remove non-existent --base-url option from sub-path section --------- Signed-off-by: localai-bot <localai-bot@users.noreply.github.com> Co-authored-by: localai-bot <localai-bot@users.noreply.github.com>	2026-02-28 23:02:17 +01:00
LocalAI [bot]	b10443ab5a	feat(models): add model storage size display and RAM warning (#8675 ) Add model storage size display and RAM warning in Models tab - Backend (ui_api.go): - Added getDirectorySize() helper function to calculate total size of model files - Added storageSize, ramTotal, ramUsed, ramUsagePercent to /api/models endpoint response - Uses xsysinfo.GetSystemRAMInfo() for RAM information - Frontend (models.html): - Added storageSize, ramTotal, ramUsed, ramUsagePercent to Alpine.js data object - Added formatBytes() helper for human-readable byte formatting - Display storage size in hero header with blue indicator - Show warning banner when storage exceeds RAM (model too large for system) Addresses: https://github.com/mudler/LocalAI/issues/6251 Signed-off-by: localai-bot <localai-bot@users.noreply.github.com> Co-authored-by: localai-bot <localai-bot@users.noreply.github.com>	2026-02-28 22:05:01 +01:00
LocalAI [bot]	b647b6caf1	fix: properly sync model selection dropdown in video generation UI (#8680 ) fix(video): initialize model selection dropdown with current model value The Alpine.js link variable was starting empty, causing the dropdown selection to not reflect the currently selected model. This fix initializes the link variable with the current model value from the template (e.g., video/{{.Model}}), following the same pattern used in image.html. Signed-off-by: localai-bot <localai-bot@users.noreply.github.com> Co-authored-by: localai-bot <localai-bot@users.noreply.github.com>	2026-02-28 13:11:33 +01:00
LocalAI [bot]	c187b160e7	fix(gallery): clean up partially downloaded backend on installation failure (#8679 ) When a backend download fails (e.g., on Mac OS with port conflicts causing connection issues), the backend directory is left with partial files. This causes subsequent installation attempts to fail with 'run file not found' because the sanity check runs on an empty/partial directory. This fix cleans up the backend directory when the initial download fails before attempting fallback URIs or mirrors. This ensures a clean state for retry attempts. Fixes: #8016 Signed-off-by: localai-bot <localai-bot@users.noreply.github.com> Co-authored-by: localai-bot <localai-bot@users.noreply.github.com>	2026-02-28 13:10:53 +01:00
LocalAI [bot]	42e580bed0	fix: whisper breaking on cuda-13 (use absolute path for CUDA directory detection) (#8678 ) fix: use absolute path for CUDA directory detection The capability detection was using a relative path 'usr/local/cuda-13' which doesn't work when LocalAI is run from a different working directory. This caused whisper (and other backends) to fail on CUDA-13 containers because the system incorrectly detected 'nvidia' capability instead of 'nvidia-cuda-13', leading to wrong backend selection (cuda12-whisper instead of cuda13-whisper). Fixes: https://github.com/mudler/LocalAI/issues/8033 Co-authored-by: localai-bot <localai-bot@users.noreply.github.com>	2026-02-28 09:10:40 +01:00
LocalAI [bot]	5e13193d84	docs: add CDI driver config for NVIDIA GPU in containers (fix #8108 ) (#8677 ) This addresses issue #8108 where the legacy nvidia driver configuration causes container startup failures with newer NVIDIA Container Toolkit versions. Changes: - Update docker-compose example to show both CDI (recommended) and legacy nvidia driver options - Add troubleshooting section for 'Auto-detected mode as legacy' error - Document the fix for nvidia-container-cli 'invalid expression' errors The root cause is a Docker/NVIDIA Container Toolkit configuration issue, not a LocalAI code bug. The error occurs during the container runtime's prestart hook before LocalAI starts. Co-authored-by: localai-bot <localai-bot@users.noreply.github.com>	2026-02-28 08:42:53 +01:00
Ettore Di Giacinto	1c5dc83232	chore(deps): bump llama.cpp to 'ecbcb7ea9d3303097519723b264a8b5f1e977028' (#8672 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-02-28 00:33:56 +01:00
LocalAI [bot]	73b997686a	chore: ⬆️ Update ggml-org/whisper.cpp to `9453b4b9be9b73adfc35051083f37cefa039acee` (#8671 ) ⬆️ Update ggml-org/whisper.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-02-27 21:28:48 +00:00
Ettore Di Giacinto	00abf1be1f	fix(qwen3.5): add qwen3.5 preset and mimick llama.cpp's PEG (#8668 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-02-27 12:15:00 +01:00
LocalAI [bot]	959458f0db	fix(gallery): add fallback URI resolution for backend installation (#8663 ) * fix(gallery): add fallback URI resolution for backend installation When a backend installation fails (e.g., due to missing 'latest-' tag), try fallback URIs in order: 1. Replace 'latest-' with 'master-' in the URI 2. If that fails, append '-development' to the backend name This fixes the issue where backend index entries don't match the repository tags. For example, installing 'ace-step' tries to download 'latest-gpu-nvidia-cuda-13-ace-step' but only 'master-gpu-nvidia-cuda-13-ace-step' exists in the quay.io registry. Fixes: #8437 Signed-off-by: localai-bot <139863280+localai-bot@users.noreply.github.com> * chore(gallery): make fallback URI patterns configurable via env vars --------- Signed-off-by: localai-bot <139863280+localai-bot@users.noreply.github.com>	2026-02-27 10:56:33 +01:00
LocalAI [bot]	dfc6efb88d	feat(backends): add faster-qwen3-tts (#8664 ) * feat(backends): add faster-qwen3-tts Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix: this backend is CUDA only Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix: add requirements-install.txt with setuptools for build isolation The faster-qwen3-tts backend requires setuptools to build packages like sox that have setuptools as a build dependency. This ensures the build completes successfully in CI. Signed-off-by: LocalAI Bot <localai-bot@users.noreply.github.com> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Signed-off-by: LocalAI Bot <localai-bot@users.noreply.github.com> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>	2026-02-27 08:16:51 +01:00