Mirror of https://github.com/mudler/LocalAI.git (synced 2026-02-03 11:13:31 -05:00)
Compare commits (66 commits):

902e47f0b0 50bb78fd24 542f07ab2d 77c5acb9db 44bbf4d778 633c12f93d
6f24135f1d b72aa7b4fa e94e725479 e4ac7b14a3 ddb39c73f2 264b09fb1e
36dd45df51 e5599f87b8 e89b5cc0e3 10bf1084cc b08ae559b3 aa7cb7e18c
eadd3d4e46 2a18206033 39798d734e d0e99562af 6410c99bf2 55766d269b
ffa0ad1eac 623789a29e 2b9a3d32c9 f8b71dc5d0 1d3331b5cb 2c0b9c6349
3c6c976755 ebbcba342a 0de75519dc 37f5e4f5c1 ffa934b959 59311d8b1e
d9e25af7b5 e4f8b63b40 1364ae9be6 cfd6a9150d cd352d0c5f 8d47309695
5f6fc02a55 0b528458d8 caab380c5d 8a3a362504 07238eb743 e905e90dd7
08432d49e5 e51e2aacb9 9c3d85fc28 007ca647a7 59af928379 dbc2bb561b
c72c85dcac ef984901e6 9911ec84a3 1956681d4c 326f6e5ccb 302958efd6
3dc86b247d 5ec724af06 1f1e156bf0 df625e366a 9e6685ac9c 90c818aa71
.github/workflows/backend.yml (vendored, 17 changes)

@@ -111,6 +111,18 @@ jobs:
         backend: "diffusers"
         dockerfile: "./backend/Dockerfile.python"
         context: "./backend"
+      - build-type: ''
+        cuda-major-version: ""
+        cuda-minor-version: ""
+        platforms: 'linux/amd64'
+        tag-latest: 'auto'
+        tag-suffix: '-cpu-chatterbox'
+        runs-on: 'ubuntu-latest'
+        base-image: "ubuntu:22.04"
+        skip-drivers: 'true'
+        backend: "chatterbox"
+        dockerfile: "./backend/Dockerfile.python"
+        context: "./backend"
       # CUDA 11 additional backends
       - build-type: 'cublas'
         cuda-major-version: "11"
@@ -763,7 +775,7 @@ jobs:
         cuda-minor-version: ""
         platforms: 'linux/amd64'
         tag-latest: 'auto'
-        tag-suffix: '-gpu-hipblas-whisper'
+        tag-suffix: '-gpu-rocm-hipblas-whisper'
         base-image: "rocm/dev-ubuntu-22.04:6.4.3"
         runs-on: 'ubuntu-latest'
         skip-drivers: 'false'
@@ -968,6 +980,9 @@ jobs:
       - backend: "mlx"
         tag-suffix: "-metal-darwin-arm64-mlx"
         build-type: "mps"
+      - backend: "chatterbox"
+        tag-suffix: "-metal-darwin-arm64-chatterbox"
+        build-type: "mps"
      - backend: "mlx-vlm"
         tag-suffix: "-metal-darwin-arm64-mlx-vlm"
         build-type: "mps"
.github/workflows/labeler.yml (vendored, 2 changes)

@@ -9,4 +9,4 @@ jobs:
       pull-requests: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/labeler@v5
+      - uses: actions/labeler@v6
.github/workflows/localaibot_automerge.yml (vendored, 3 changes)

@@ -6,7 +6,8 @@ permissions:
   contents: write
   pull-requests: write
   packages: read
+  issues: write # for Homebrew/actions/post-comment
+  actions: write # to dispatch publish workflow
 jobs:
   dependabot:
     runs-on: ubuntu-latest
.github/workflows/release.yaml (vendored, 4 changes)

@@ -41,7 +41,7 @@ jobs:
       - name: Upload DMG to Release
         uses: softprops/action-gh-release@v2
         with:
-          files: ./dist/LocalAI-Launcher.dmg
+          files: ./dist/LocalAI.dmg
   launcher-build-linux:
     runs-on: ubuntu-latest
     steps:
@@ -61,4 +61,4 @@ jobs:
       - name: Upload Linux launcher artifacts
         uses: softprops/action-gh-release@v2
         with:
-          files: ./local-ai-launcher-linux.tar.xz
+          files: ./local-ai-launcher-linux.tar.xz
.github/workflows/stalebot.yml (vendored, 2 changes)

@@ -10,7 +10,7 @@ jobs:
   stale:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9
+      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v9
        with:
          stale-issue-message: 'This issue is stale because it has been open 90 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
          stale-pr-message: 'This PR is stale because it has been open 90 days with no activity. Remove stale label or comment or this will be closed in 10 days.'
Makefile (9 changes)

@@ -117,8 +117,8 @@ run: ## run local-ai
	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./

 test-models/testmodel.ggml:
-	mkdir test-models
-	mkdir test-dir
+	mkdir -p test-models
+	mkdir -p test-dir
	wget -q https://huggingface.co/mradermacher/gpt2-alpaca-gpt4-GGUF/resolve/main/gpt2-alpaca-gpt4.Q4_K_M.gguf -O test-models/testmodel.ggml
	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
@@ -369,6 +369,9 @@ backends/kitten-tts: docker-build-kitten-tts docker-save-kitten-tts build
 backends/kokoro: docker-build-kokoro docker-save-kokoro build
	./local-ai backends install "ocifile://$(abspath ./backend-images/kokoro.tar)"

+backends/chatterbox: docker-build-chatterbox docker-save-chatterbox build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/chatterbox.tar)"
+
 backends/llama-cpp-darwin: build
	bash ./scripts/build/llama-cpp-darwin.sh
	./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)"
@@ -493,7 +496,7 @@ docker-build-bark:
	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:bark -f backend/Dockerfile.python --build-arg BACKEND=bark .

 docker-build-chatterbox:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:chatterbox -f backend/Dockerfile.python --build-arg BACKEND=chatterbox .
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:chatterbox -f backend/Dockerfile.python --build-arg BACKEND=chatterbox ./backend

 docker-build-exllama2:
	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:exllama2 -f backend/Dockerfile.python --build-arg BACKEND=exllama2 .
README.md

@@ -43,7 +43,7 @@
 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
 >
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on
 [](https://t.me/localaiofficial_bot)

 [](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[](https://artifacthub.io/packages/search?repo=localai)
@@ -110,6 +110,12 @@ curl https://localai.io/install.sh | sh

 For more installation options, see [Installer Options](https://localai.io/docs/advanced/installer/).

+### macOS Download:
+
+<a href="https://github.com/mudler/LocalAI/releases/latest/download/LocalAI.dmg">
+  <img src="https://img.shields.io/badge/Download-macOS-blue?style=for-the-badge&logo=apple&logoColor=white" alt="Download LocalAI for macOS"/>
+</a>
+
 Or run with docker:

 ### CPU only image:
(model config, file name not shown in this view)

@@ -2,10 +2,10 @@ context_size: 4096
 f16: true
 backend: llama-cpp
 mmap: true
-mmproj: minicpm-v-2_6-mmproj-f16.gguf
+mmproj: minicpm-v-4_5-mmproj-f16.gguf
 name: gpt-4o
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
+  model: minicpm-v-4_5-Q4_K_M.gguf
 stopwords:
 - <|im_end|>
 - <dummy32000>
@@ -42,9 +42,9 @@ template:
   <|im_start|>assistant

 download_files:
-- filename: minicpm-v-2_6-Q4_K_M.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
-- filename: minicpm-v-2_6-mmproj-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+- filename: minicpm-v-4_5-Q4_K_M.gguf
+  sha256: c1c3c33100b15b4caf7319acce4e23c0eb0ce1cbd12f70e8d24f05aa67b7512f
+  uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/ggml-model-Q4_K_M.gguf
+- filename: minicpm-v-4_5-mmproj-f16.gguf
+  uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/mmproj-model-f16.gguf
+  sha256: 7a7225a32e8d453aaa3d22d8c579b5bf833c253f784cdb05c99c9a76fd616df8

(second model config, file name not shown in this view)

@@ -2,10 +2,10 @@ context_size: 4096
 backend: llama-cpp
 f16: true
 mmap: true
-mmproj: minicpm-v-2_6-mmproj-f16.gguf
+mmproj: minicpm-v-4_5-mmproj-f16.gguf
 name: gpt-4o
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
+  model: minicpm-v-4_5-Q4_K_M.gguf
 stopwords:
 - <|im_end|>
 - <dummy32000>
@@ -42,9 +42,9 @@ template:
   <|im_start|>assistant

 download_files:
-- filename: minicpm-v-2_6-Q4_K_M.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
-- filename: minicpm-v-2_6-mmproj-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+- filename: minicpm-v-4_5-Q4_K_M.gguf
+  sha256: c1c3c33100b15b4caf7319acce4e23c0eb0ce1cbd12f70e8d24f05aa67b7512f
+  uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/ggml-model-Q4_K_M.gguf
+- filename: minicpm-v-4_5-mmproj-f16.gguf
+  uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/mmproj-model-f16.gguf
+  sha256: 7a7225a32e8d453aaa3d22d8c579b5bf833c253f784cdb05c99c9a76fd616df8

(third model config, file name not shown in this view)

@@ -2,10 +2,10 @@ context_size: 4096
 backend: llama-cpp
 f16: true
 mmap: true
-mmproj: minicpm-v-2_6-mmproj-f16.gguf
+mmproj: minicpm-v-4_5-mmproj-f16.gguf
 name: gpt-4o
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
+  model: minicpm-v-4_5-Q4_K_M.gguf
 stopwords:
 - <|im_end|>
 - <dummy32000>
@@ -43,9 +43,9 @@ template:


 download_files:
-- filename: minicpm-v-2_6-Q4_K_M.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
-- filename: minicpm-v-2_6-mmproj-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+- filename: minicpm-v-4_5-Q4_K_M.gguf
+  sha256: c1c3c33100b15b4caf7319acce4e23c0eb0ce1cbd12f70e8d24f05aa67b7512f
+  uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/ggml-model-Q4_K_M.gguf
+- filename: minicpm-v-4_5-mmproj-f16.gguf
+  uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/mmproj-model-f16.gguf
+  sha256: 7a7225a32e8d453aaa3d22d8c579b5bf833c253f784cdb05c99c9a76fd616df8
(backend gRPC proto definitions)

@@ -276,6 +276,7 @@ message TranscriptRequest {
   string language = 3;
   uint32 threads = 4;
   bool translate = 5;
+  bool diarize = 6;
 }

 message TranscriptResult {
@@ -305,7 +306,7 @@ message GenerateImageRequest {
   // Diffusers
   string EnableParameters = 10;
   int32 CLIPSkip = 11;

   // Reference images for models that support them (e.g., Flux Kontext)
   repeated string ref_images = 12;
 }
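The new `diarize` flag travels from the CLI, through `ModelTranscription`, into this request message. A minimal sketch of a Go caller populating the updated message; the field names follow the proto above, while the generated package path and the audio path are assumptions:

package main

import (
	"fmt"

	pb "github.com/mudler/LocalAI/pkg/grpc/proto" // assumed generated-package path
)

func main() {
	// Build a transcription request with diarization enabled.
	req := &pb.TranscriptRequest{
		Dst:       "/tmp/audio.wav", // illustrative path
		Language:  "en",
		Threads:   4,
		Translate: false,
		Diarize:   true, // new field (bool diarize = 6)
	}
	fmt.Println(req.Diarize)
}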
(llama.cpp backend Makefile)

@@ -1,5 +1,5 @@
-LLAMA_VERSION?=3de008208b9b8a33f49f979097a99b4d59e6e521
+LLAMA_VERSION?=0320ac5264279d74f8ee91bafa6c90e9ab9bbb91
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
(llama.cpp backend gRPC server)

@@ -701,7 +701,7 @@ public:
     */

    // for the shape of input/content, see tokenize_input_prompts()
-    json prompt = body.at("prompt");
+    json prompt = body.at("embeddings");

    auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
@@ -712,6 +712,7 @@ public:
        }
    }

+    int embd_normalize = 2; // default to Euclidean/L2 norm
    // create and queue the task
    json responses = json::array();
    bool error = false;
@@ -725,9 +726,8 @@ public:
        task.index = i;
        task.prompt_tokens = std::move(tokenized_prompts[i]);

-        // OAI-compat
-        task.params.oaicompat = OAICOMPAT_TYPE_EMBEDDING;
+        task.params.oaicompat = OAICOMPAT_TYPE_NONE;
+        task.params.embd_normalize = embd_normalize;
        tasks.push_back(std::move(task));
    }
@@ -743,9 +743,8 @@ public:
            responses.push_back(res->to_json());
        }
    }, [&](const json & error_data) {
-        return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, error_data.value("content", ""));
+        error = true;
    }, [&]() {
        // NOTE: we should try to check when the writer is closed here
        return false;
    });
@@ -755,12 +754,36 @@ public:
        return grpc::Status(grpc::StatusCode::INTERNAL, "Error in receiving results");
    }

-    std::vector<float> embeddings = responses[0].value("embedding", std::vector<float>());
-    // loop the vector and set the embeddings results
-    for (int i = 0; i < embeddings.size(); i++) {
-        embeddingResult->add_embeddings(embeddings[i]);
-    }
+    std::cout << "[DEBUG] Responses size: " << responses.size() << std::endl;
+
+    // Process the responses and extract embeddings
+    for (const auto & response_elem : responses) {
+        // Check if the response has an "embedding" field
+        if (response_elem.contains("embedding")) {
+            json embedding_data = json_value(response_elem, "embedding", json::array());
+
+            if (embedding_data.is_array() && !embedding_data.empty()) {
+                for (const auto & embedding_vector : embedding_data) {
+                    if (embedding_vector.is_array()) {
+                        for (const auto & embedding_value : embedding_vector) {
+                            embeddingResult->add_embeddings(embedding_value.get<float>());
+                        }
+                    }
+                }
+            }
+        } else {
+            // Check if the response itself contains the embedding data directly
+            if (response_elem.is_array()) {
+                for (const auto & embedding_value : response_elem) {
+                    embeddingResult->add_embeddings(embedding_value.get<float>());
+                }
+            }
+        }
+    }

    return grpc::Status::OK;
 }
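The rewritten loop above concatenates every returned embedding vector into the single flat `embeddings` field of the gRPC reply. A consumer that sent several prompts therefore has to re-chunk the flat buffer; a standalone sketch of the idea, with an assumed fixed dimension (not LocalAI's actual client code):

package main

import "fmt"

// chunk splits a flat embedding buffer back into per-prompt vectors,
// assuming every vector has the same dimension dim.
func chunk(flat []float32, dim int) [][]float32 {
	var out [][]float32
	for i := 0; i+dim <= len(flat); i += dim {
		out = append(out, flat[i:i+dim])
	}
	return out
}

func main() {
	// Two 4-dimensional embeddings flattened the way the C++ loop emits them.
	flat := []float32{0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}
	for i, v := range chunk(flat, 4) {
		fmt.Printf("embedding %d: %v\n", i, v)
	}
}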
backend/go/stablediffusion-ggml/.gitignore (vendored, 2 changes)

@@ -1,4 +1,6 @@
 package/
 sources/
+.cache/
+build/
 libgosd.so
 stablediffusion-ggml
(stablediffusion-ggml backend Makefile)

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=4c6475f9176bf99271ccf5a2817b30a490b83db0
+STABLEDIFFUSION_GGML_VERSION?=0ebe6fe118f125665939b27c89f34ed38716bff8

 CMAKE_ARGS+=-DGGML_MAX_NAME=128
(stablediffusion-ggml backend, gosd.cpp)

@@ -4,17 +4,11 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
-#include <iostream>
-#include <random>
-#include <string>
-#include <vector>
 #include <filesystem>
 #include "gosd.h"

-// #include "preprocessing.hpp"
-#include "flux.hpp"
 #include "stable-diffusion.h"

 #define STB_IMAGE_IMPLEMENTATION
 #define STB_IMAGE_STATIC
 #include "stb_image.h"
@@ -29,7 +23,7 @@

 // Names of the sampler method, same order as enum sample_method in stable-diffusion.h
 const char* sample_method_str[] = {
-    "euler_a",
+    "default",
     "euler",
     "heun",
     "dpm2",
@@ -41,19 +35,27 @@ const char* sample_method_str[] = {
     "lcm",
     "ddim_trailing",
     "tcd",
+    "euler_a",
 };

+static_assert(std::size(sample_method_str) == SAMPLE_METHOD_COUNT, "sample method mismatch");
+
 // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
-const char* schedule_str[] = {
+const char* schedulers[] = {
     "default",
     "discrete",
     "karras",
     "exponential",
     "ays",
     "gits",
+    "smoothstep",
 };

+static_assert(std::size(schedulers) == SCHEDULE_COUNT, "schedulers mismatch");
+
 sd_ctx_t* sd_c;
+// Moved from the context (load time) to generation time params
+scheduler_t scheduler = scheduler_t::DEFAULT;

 sample_method_t sample_method;

@@ -105,7 +107,7 @@ int load_model(const char *model, char *model_path, char* options[], int threads
     const char *clip_g_path = "";
     const char *t5xxl_path = "";
     const char *vae_path = "";
-    const char *scheduler = "";
+    const char *scheduler_str = "";
     const char *sampler = "";
     char *lora_dir = model_path;
     bool lora_dir_allocated = false;
@@ -133,7 +135,7 @@ int load_model(const char *model, char *model_path, char* options[], int threads
             vae_path = optval;
         }
         if (!strcmp(optname, "scheduler")) {
-            scheduler = optval;
+            scheduler_str = optval;
         }
         if (!strcmp(optname, "sampler")) {
             sampler = optval;
@@ -166,26 +168,17 @@ int load_model(const char *model, char *model_path, char* options[], int threads
     }
     if (sample_method_found == -1) {
         fprintf(stderr, "Invalid sample method, default to EULER_A!\n");
-        sample_method_found = EULER_A;
+        sample_method_found = sample_method_t::SAMPLE_METHOD_DEFAULT;
     }
     sample_method = (sample_method_t)sample_method_found;

-    int schedule_found = -1;
     for (int d = 0; d < SCHEDULE_COUNT; d++) {
-        if (!strcmp(scheduler, schedule_str[d])) {
-            schedule_found = d;
-            fprintf (stderr, "Found scheduler: %s\n", scheduler);
+        if (!strcmp(scheduler_str, schedulers[d])) {
+            scheduler = (scheduler_t)d;
+            fprintf (stderr, "Found scheduler: %s\n", scheduler_str);
         }
     }

-    if (schedule_found == -1) {
-        fprintf (stderr, "Invalid scheduler! using DEFAULT\n");
-        schedule_found = DEFAULT;
-    }
-
-    schedule_t schedule = (schedule_t)schedule_found;
-
     fprintf (stderr, "Creating context\n");
     sd_ctx_params_t ctx_params;
     sd_ctx_params_init(&ctx_params);
@@ -199,13 +192,10 @@ int load_model(const char *model, char *model_path, char* options[], int threads
     ctx_params.control_net_path = "";
     ctx_params.lora_model_dir = lora_dir;
     ctx_params.embedding_dir = "";
-    ctx_params.stacked_id_embed_dir = "";
     ctx_params.vae_decode_only = false;
     ctx_params.vae_tiling = false;
     ctx_params.free_params_immediately = false;
     ctx_params.n_threads = threads;
     ctx_params.rng_type = STD_DEFAULT_RNG;
-    ctx_params.schedule = schedule;
     sd_ctx_t* sd_ctx = new_sd_ctx(&ctx_params);

     if (sd_ctx == NULL) {
@@ -228,7 +218,49 @@ int load_model(const char *model, char *model_path, char* options[], int threads
     return 0;
 }

-int gen_image(char *text, char *negativeText, int width, int height, int steps, int64_t seed, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count) {
+void sd_tiling_params_set_enabled(sd_tiling_params_t *params, bool enabled) {
+    params->enabled = enabled;
+}
+
+void sd_tiling_params_set_tile_sizes(sd_tiling_params_t *params, int tile_size_x, int tile_size_y) {
+    params->tile_size_x = tile_size_x;
+    params->tile_size_y = tile_size_y;
+}
+
+void sd_tiling_params_set_rel_sizes(sd_tiling_params_t *params, float rel_size_x, float rel_size_y) {
+    params->rel_size_x = rel_size_x;
+    params->rel_size_y = rel_size_y;
+}
+
+void sd_tiling_params_set_target_overlap(sd_tiling_params_t *params, float target_overlap) {
+    params->target_overlap = target_overlap;
+}
+
+sd_tiling_params_t* sd_img_gen_params_get_vae_tiling_params(sd_img_gen_params_t *params) {
+    return &params->vae_tiling_params;
+}
+
+sd_img_gen_params_t* sd_img_gen_params_new(void) {
+    sd_img_gen_params_t *params = (sd_img_gen_params_t *)std::malloc(sizeof(sd_img_gen_params_t));
+    sd_img_gen_params_init(params);
+    return params;
+}
+
+void sd_img_gen_params_set_prompts(sd_img_gen_params_t *params, const char *prompt, const char *negative_prompt) {
+    params->prompt = prompt;
+    params->negative_prompt = negative_prompt;
+}
+
+void sd_img_gen_params_set_dimensions(sd_img_gen_params_t *params, int width, int height) {
+    params->width = width;
+    params->height = height;
+}
+
+void sd_img_gen_params_set_seed(sd_img_gen_params_t *params, int64_t seed) {
+    params->seed = seed;
+}
+
+int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count) {

     sd_image_t* results;

@@ -236,20 +268,15 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,

     fprintf (stderr, "Generating image\n");

-    sd_img_gen_params_t p;
-    sd_img_gen_params_init(&p);
+    p->sample_params.guidance.txt_cfg = cfg_scale;
+    p->sample_params.guidance.slg.layers = skip_layers.data();
+    p->sample_params.guidance.slg.layer_count = skip_layers.size();
+    p->sample_params.sample_method = sample_method;
+    p->sample_params.sample_steps = steps;
+    p->sample_params.scheduler = scheduler;

-    p.prompt = text;
-    p.negative_prompt = negativeText;
-    p.guidance.txt_cfg = cfg_scale;
-    p.guidance.slg.layers = skip_layers.data();
-    p.guidance.slg.layer_count = skip_layers.size();
-    p.width = width;
-    p.height = height;
-    p.sample_method = sample_method;
-    p.sample_steps = steps;
-    p.seed = seed;
-    p.input_id_images_path = "";
+    int width = p->width;
+    int height = p->height;

     // Handle input image for img2img
     bool has_input_image = (src_image != NULL && strlen(src_image) > 0);
@@ -298,13 +325,13 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
             input_image_buffer = resized_image_buffer;
         }

-        p.init_image = {(uint32_t)width, (uint32_t)height, 3, input_image_buffer};
-        p.strength = strength;
+        p->init_image = {(uint32_t)width, (uint32_t)height, 3, input_image_buffer};
+        p->strength = strength;
         fprintf(stderr, "Using img2img with strength: %.2f\n", strength);
     } else {
         // No input image, use empty image for text-to-image
-        p.init_image = {(uint32_t)width, (uint32_t)height, 3, NULL};
-        p.strength = 0.0f;
+        p->init_image = {(uint32_t)width, (uint32_t)height, 3, NULL};
+        p->strength = 0.0f;
     }

     // Handle mask image for inpainting
@@ -344,12 +371,12 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
             mask_image_buffer = resized_mask_buffer;
         }

-        p.mask_image = {(uint32_t)width, (uint32_t)height, 1, mask_image_buffer};
+        p->mask_image = {(uint32_t)width, (uint32_t)height, 1, mask_image_buffer};
         fprintf(stderr, "Using inpainting with mask\n");
     } else {
         // No mask image, create default full mask
         default_mask_image_vec.resize(width * height, 255);
-        p.mask_image = {(uint32_t)width, (uint32_t)height, 1, default_mask_image_vec.data()};
+        p->mask_image = {(uint32_t)width, (uint32_t)height, 1, default_mask_image_vec.data()};
     }

     // Handle reference images
@@ -407,13 +434,15 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
     }

     if (!ref_images_vec.empty()) {
-        p.ref_images = ref_images_vec.data();
-        p.ref_images_count = ref_images_vec.size();
+        p->ref_images = ref_images_vec.data();
+        p->ref_images_count = ref_images_vec.size();
         fprintf(stderr, "Using %zu reference images\n", ref_images_vec.size());
     }
 }

-    results = generate_image(sd_c, &p);
+    results = generate_image(sd_c, p);
+
+    std::free(p);

     if (results == NULL) {
         fprintf (stderr, "NO results\n");
(stablediffusion-ggml backend, gosd.go)

@@ -22,7 +22,18 @@ type SDGGML struct {

 var (
	LoadModel func(model, model_apth string, options []uintptr, threads int32, diff int) int
-	GenImage func(text, negativeText string, width, height, steps int, seed int64, dst string, cfgScale float32, srcImage string, strength float32, maskImage string, refImages []string, refImagesCount int) int
+	GenImage func(params uintptr, steps int, dst string, cfgScale float32, srcImage string, strength float32, maskImage string, refImages []string, refImagesCount int) int
+
+	TilingParamsSetEnabled       func(params uintptr, enabled bool)
+	TilingParamsSetTileSizes     func(params uintptr, tileSizeX int, tileSizeY int)
+	TilingParamsSetRelSizes      func(params uintptr, relSizeX float32, relSizeY float32)
+	TilingParamsSetTargetOverlap func(params uintptr, targetOverlap float32)
+
+	ImgGenParamsNew                func() uintptr
+	ImgGenParamsSetPrompts         func(params uintptr, prompt string, negativePrompt string)
+	ImgGenParamsSetDimensions      func(params uintptr, width int, height int)
+	ImgGenParamsSetSeed            func(params uintptr, seed int64)
+	ImgGenParamsGetVaeTilingParams func(params uintptr) uintptr
 )

 // Copied from Purego internal/strings
@@ -120,7 +131,15 @@ func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
	// Default strength for img2img (0.75 is a good default)
	strength := float32(0.75)

-	ret := GenImage(t, negative, int(opts.Width), int(opts.Height), int(opts.Step), int64(opts.Seed), dst, sd.cfgScale, srcImage, strength, maskImage, refImages, refImagesCount)
+	// free'd by GenImage
+	p := ImgGenParamsNew()
+	ImgGenParamsSetPrompts(p, t, negative)
+	ImgGenParamsSetDimensions(p, int(opts.Width), int(opts.Height))
+	ImgGenParamsSetSeed(p, int64(opts.Seed))
+	vaep := ImgGenParamsGetVaeTilingParams(p)
+	TilingParamsSetEnabled(vaep, false)
+
+	ret := GenImage(p, int(opts.Step), dst, sd.cfgScale, srcImage, strength, maskImage, refImages, refImagesCount)
	if ret != 0 {
		return fmt.Errorf("inference failed")
	}
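The opaque-pointer builder keeps the purego FFI surface to scalar arguments instead of marshalling a large struct. A sketch of how a caller could opt into VAE tiling through the same wrappers; the tile sizes and overlap are illustrative values, and the committed code above deliberately leaves tiling disabled:

// Sketch: configure VAE tiling before generation, using the wrapper
// functions registered above. All numeric values are illustrative.
p := ImgGenParamsNew()                     // malloc'd C struct, freed by GenImage
ImgGenParamsSetPrompts(p, "a red fox", "") // prompt / negative prompt
ImgGenParamsSetDimensions(p, 1024, 1024)
ImgGenParamsSetSeed(p, 42)

vaep := ImgGenParamsGetVaeTilingParams(p)
TilingParamsSetEnabled(vaep, true)       // decode the VAE in tiles to cap memory
TilingParamsSetTileSizes(vaep, 512, 512) // pixels per tile (illustrative)
TilingParamsSetTargetOverlap(vaep, 0.5)  // fraction of tile overlap (illustrative)

ret := GenImage(p, 20, "out.png", 7.0, "", 0.75, "", nil, 0)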
(stablediffusion-ggml backend, gosd.h)

@@ -1,8 +1,23 @@
 #include <cstdint>
+#include "stable-diffusion.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

+void sd_tiling_params_set_enabled(sd_tiling_params_t *params, bool enabled);
+void sd_tiling_params_set_tile_sizes(sd_tiling_params_t *params, int tile_size_x, int tile_size_y);
+void sd_tiling_params_set_rel_sizes(sd_tiling_params_t *params, float rel_size_x, float rel_size_y);
+void sd_tiling_params_set_target_overlap(sd_tiling_params_t *params, float target_overlap);
+sd_tiling_params_t* sd_img_gen_params_get_vae_tiling_params(sd_img_gen_params_t *params);
+
+sd_img_gen_params_t* sd_img_gen_params_new(void);
+void sd_img_gen_params_set_prompts(sd_img_gen_params_t *params, const char *prompt, const char *negative_prompt);
+void sd_img_gen_params_set_dimensions(sd_img_gen_params_t *params, int width, int height);
+void sd_img_gen_params_set_seed(sd_img_gen_params_t *params, int64_t seed);
+
 int load_model(const char *model, char *model_path, char* options[], int threads, int diffusionModel);
-int gen_image(char *text, char *negativeText, int width, int height, int steps, int64_t seed, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count);
+int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count);
 #ifdef __cplusplus
 }
 #endif
(stablediffusion-ggml backend, main.go)

@@ -11,14 +11,35 @@ var (
	addr = flag.String("addr", "localhost:50051", "the address to connect to")
 )

+type LibFuncs struct {
+	FuncPtr any
+	Name    string
+}
+
 func main() {
	gosd, err := purego.Dlopen("./libgosd.so", purego.RTLD_NOW|purego.RTLD_GLOBAL)
	if err != nil {
		panic(err)
	}

-	purego.RegisterLibFunc(&LoadModel, gosd, "load_model")
-	purego.RegisterLibFunc(&GenImage, gosd, "gen_image")
+	libFuncs := []LibFuncs{
+		{&LoadModel, "load_model"},
+		{&GenImage, "gen_image"},
+		{&TilingParamsSetEnabled, "sd_tiling_params_set_enabled"},
+		{&TilingParamsSetTileSizes, "sd_tiling_params_set_tile_sizes"},
+		{&TilingParamsSetRelSizes, "sd_tiling_params_set_rel_sizes"},
+		{&TilingParamsSetTargetOverlap, "sd_tiling_params_set_target_overlap"},
+
+		{&ImgGenParamsNew, "sd_img_gen_params_new"},
+		{&ImgGenParamsSetPrompts, "sd_img_gen_params_set_prompts"},
+		{&ImgGenParamsSetDimensions, "sd_img_gen_params_set_dimensions"},
+		{&ImgGenParamsSetSeed, "sd_img_gen_params_set_seed"},
+		{&ImgGenParamsGetVaeTilingParams, "sd_img_gen_params_get_vae_tiling_params"},
+	}
+
+	for _, lf := range libFuncs {
+		purego.RegisterLibFunc(lf.FuncPtr, gosd, lf.Name)
+	}

	flag.Parse()
(whisper backend Makefile)

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=7745fcf32846006128f16de429cfe1677c963b30
+WHISPER_CPP_VERSION?=edea8a9c3cf0eb7676dcdb604991eb2f95c3d984

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
(whisper backend, C++ glue)

@@ -7,34 +7,35 @@ static struct whisper_vad_context *vctx;
 static struct whisper_context *ctx;
 static std::vector<float> flat_segs;

-static void ggml_log_cb(enum ggml_log_level level, const char* log, void* data) {
-    const char* level_str;
+static void ggml_log_cb(enum ggml_log_level level, const char *log,
+                        void *data) {
+  const char *level_str;

-    if (!log) {
-        return;
-    }
+  if (!log) {
+    return;
+  }

-    switch (level) {
-    case GGML_LOG_LEVEL_DEBUG:
-        level_str = "DEBUG";
-        break;
-    case GGML_LOG_LEVEL_INFO:
-        level_str = "INFO";
-        break;
-    case GGML_LOG_LEVEL_WARN:
-        level_str = "WARN";
-        break;
-    case GGML_LOG_LEVEL_ERROR:
-        level_str = "ERROR";
-        break;
-    default: /* Potential future-proofing */
-        level_str = "?????";
-        break;
-    }
+  switch (level) {
+  case GGML_LOG_LEVEL_DEBUG:
+    level_str = "DEBUG";
+    break;
+  case GGML_LOG_LEVEL_INFO:
+    level_str = "INFO";
+    break;
+  case GGML_LOG_LEVEL_WARN:
+    level_str = "WARN";
+    break;
+  case GGML_LOG_LEVEL_ERROR:
+    level_str = "ERROR";
+    break;
+  default: /* Potential future-proofing */
+    level_str = "?????";
+    break;
+  }

-    fprintf(stderr, "[%-5s] ", level_str);
-    fputs(log, stderr);
-    fflush(stderr);
+  fprintf(stderr, "[%-5s] ", level_str);
+  fputs(log, stderr);
+  fflush(stderr);
 }

 int load_model(const char *const model_path) {
@@ -105,8 +106,8 @@ int vad(float pcmf32[], size_t pcmf32_len, float **segs_out,
   return 0;
 }

-int transcribe(uint32_t threads, char *lang, bool translate, float pcmf32[],
-               size_t pcmf32_len, size_t *segs_out_len) {
+int transcribe(uint32_t threads, char *lang, bool translate, bool tdrz,
+               float pcmf32[], size_t pcmf32_len, size_t *segs_out_len) {
   whisper_full_params wparams =
       whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

@@ -120,6 +121,9 @@ int transcribe(uint32_t threads, char *lang, bool translate, float pcmf32[],
   wparams.translate = translate;
   wparams.debug_mode = true;
   wparams.print_progress = true;
+  wparams.tdrz_enable = tdrz;
+
+  fprintf(stderr, "info: Enable tdrz: %d\n", tdrz);

   if (whisper_full(ctx, wparams, pcmf32, pcmf32_len)) {
     fprintf(stderr, "error: transcription failed\n");
@@ -144,3 +148,7 @@ int n_tokens(int i) { return whisper_full_n_tokens(ctx, i); }
 int32_t get_token_id(int i, int j) {
   return whisper_full_get_token_id(ctx, i, j);
 }
+
+bool get_segment_speaker_turn_next(int i) {
+  return whisper_full_get_segment_speaker_turn_next(ctx, i);
+}
(whisper backend, Go bindings)

@@ -14,15 +14,16 @@ import (
 )

 var (
-	CppLoadModel func(modelPath string) int
-	CppLoadModelVAD func(modelPath string) int
-	CppVAD func(pcmf32 []float32, pcmf32Size uintptr, segsOut unsafe.Pointer, segsOutLen unsafe.Pointer) int
-	CppTranscribe func(threads uint32, lang string, translate bool, pcmf32 []float32, pcmf32Len uintptr, segsOutLen unsafe.Pointer) int
-	CppGetSegmentText func(i int) string
-	CppGetSegmentStart func(i int) int64
-	CppGetSegmentEnd func(i int) int64
-	CppNTokens func(i int) int
-	CppGetTokenID func(i int, j int) int
+	CppLoadModel                 func(modelPath string) int
+	CppLoadModelVAD              func(modelPath string) int
+	CppVAD                       func(pcmf32 []float32, pcmf32Size uintptr, segsOut unsafe.Pointer, segsOutLen unsafe.Pointer) int
+	CppTranscribe                func(threads uint32, lang string, translate bool, diarize bool, pcmf32 []float32, pcmf32Len uintptr, segsOutLen unsafe.Pointer) int
+	CppGetSegmentText            func(i int) string
+	CppGetSegmentStart           func(i int) int64
+	CppGetSegmentEnd             func(i int) int64
+	CppNTokens                   func(i int) int
+	CppGetTokenID                func(i int, j int) int
+	CppGetSegmentSpeakerTurnNext func(i int) bool
 )

 type Whisper struct {
@@ -122,7 +123,7 @@ func (w *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptR
	segsLen := uintptr(0xdeadbeef)
	segsLenPtr := unsafe.Pointer(&segsLen)

-	if ret := CppTranscribe(opts.Threads, opts.Language, opts.Translate, data, uintptr(len(data)), segsLenPtr); ret != 0 {
+	if ret := CppTranscribe(opts.Threads, opts.Language, opts.Translate, opts.Diarize, data, uintptr(len(data)), segsLenPtr); ret != 0 {
		return pb.TranscriptResult{}, fmt.Errorf("Failed Transcribe")
	}

@@ -134,6 +135,10 @@ func (w *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptR
		txt := strings.Clone(CppGetSegmentText(i))
		tokens := make([]int32, CppNTokens(i))

+		if opts.Diarize && CppGetSegmentSpeakerTurnNext(i) {
+			txt += " [SPEAKER_TURN]"
+		}
+
		for j := range tokens {
			tokens[j] = int32(CppGetTokenID(i, j))
		}
@@ -151,6 +156,6 @@ func (w *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptR

	return pb.TranscriptResult{
		Segments: segments,
-		Text: strings.TrimSpace(text),
+		Text:     strings.TrimSpace(text),
	}, nil
 }
(whisper backend, header)

@@ -6,11 +6,12 @@ int load_model(const char *const model_path);
 int load_model_vad(const char *const model_path);
 int vad(float pcmf32[], size_t pcmf32_size, float **segs_out,
         size_t *segs_out_len);
-int transcribe(uint32_t threads, char *lang, bool translate, float pcmf32[],
-               size_t pcmf32_len, size_t *segs_out_len);
+int transcribe(uint32_t threads, char *lang, bool translate, bool tdrz,
+               float pcmf32[], size_t pcmf32_len, size_t *segs_out_len);
 const char *get_segment_text(int i);
 int64_t get_segment_t0(int i);
 int64_t get_segment_t1(int i);
 int n_tokens(int i);
 int32_t get_token_id(int i, int j);
+bool get_segment_speaker_turn_next(int i);
 }
(whisper backend, main.go)

@@ -33,6 +33,7 @@ func main() {
		{&CppGetSegmentEnd, "get_segment_t1"},
		{&CppNTokens, "n_tokens"},
		{&CppGetTokenID, "get_token_id"},
+		{&CppGetSegmentSpeakerTurnNext, "get_segment_speaker_turn_next"},
	}

	for _, lf := range libFuncs {
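With diarization enabled, the whisper backend appends a literal " [SPEAKER_TURN]" marker to a segment whenever whisper.cpp's tinydiarize support reports a speaker change. A client can recover per-speaker chunks by splitting on that marker; a minimal standalone sketch (the marker string is the one hard-coded above, everything else is illustrative):

package main

import (
	"fmt"
	"strings"
)

// splitSpeakers cuts a diarized transcript into per-speaker chunks
// using the " [SPEAKER_TURN]" marker emitted by the whisper backend.
func splitSpeakers(transcript string) []string {
	parts := strings.Split(transcript, "[SPEAKER_TURN]")
	out := make([]string, 0, len(parts))
	for _, p := range parts {
		if s := strings.TrimSpace(p); s != "" {
			out = append(out, s)
		}
	}
	return out
}

func main() {
	t := "hello there [SPEAKER_TURN] hi, how are you? [SPEAKER_TURN] fine, thanks"
	for i, chunk := range splitSpeakers(t) {
		fmt.Printf("speaker turn %d: %s\n", i, chunk)
	}
}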
(backend gallery index)

@@ -350,6 +350,8 @@
     alias: "chatterbox"
     capabilities:
       nvidia: "cuda12-chatterbox"
+      metal: "metal-chatterbox"
+      default: "cpu-chatterbox"
 - &piper
   name: "piper"
   uri: "quay.io/go-skynet/local-ai-backends:latest-piper"
@@ -1223,6 +1225,28 @@
     name: "chatterbox-development"
     capabilities:
       nvidia: "cuda12-chatterbox-development"
+      metal: "metal-chatterbox-development"
+      default: "cpu-chatterbox-development"
+- !!merge <<: *chatterbox
+  name: "cpu-chatterbox"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-chatterbox"
+  mirrors:
+    - localai/localai-backends:latest-cpu-chatterbox
+- !!merge <<: *chatterbox
+  name: "cpu-chatterbox-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-chatterbox"
+  mirrors:
+    - localai/localai-backends:master-cpu-chatterbox
+- !!merge <<: *chatterbox
+  name: "metal-chatterbox"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-chatterbox"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-chatterbox
+- !!merge <<: *chatterbox
+  name: "metal-chatterbox-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-chatterbox"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-chatterbox
 - !!merge <<: *chatterbox
   name: "cuda12-chatterbox-development"
   uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-chatterbox"
@@ -1,5 +1,6 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
accelerate
|
||||
torch==2.6.0
|
||||
torchaudio==2.6.0
|
||||
transformers==4.46.3
|
||||
chatterbox-tts
|
||||
chatterbox-tts==0.1.2
|
||||
@@ -2,5 +2,5 @@
|
||||
torch==2.6.0+cu118
|
||||
torchaudio==2.6.0+cu118
|
||||
transformers==4.46.3
|
||||
chatterbox-tts
|
||||
chatterbox-tts==0.1.2
|
||||
accelerate
|
||||
@@ -1,5 +1,5 @@
|
||||
torch==2.6.0
|
||||
torchaudio==2.6.0
|
||||
transformers==4.46.3
|
||||
chatterbox-tts
|
||||
chatterbox-tts==0.1.2
|
||||
accelerate
|
||||
|
||||
@@ -2,5 +2,5 @@
|
||||
torch==2.6.0+rocm6.1
|
||||
torchaudio==2.6.0+rocm6.1
|
||||
transformers==4.46.3
|
||||
chatterbox-tts
|
||||
chatterbox-tts==0.1.2
|
||||
accelerate
|
||||
|
||||
@@ -3,9 +3,8 @@ intel-extension-for-pytorch==2.3.110+xpu
|
||||
torch==2.3.1+cxx11.abi
|
||||
torchaudio==2.3.1+cxx11.abi
|
||||
transformers==4.46.3
|
||||
chatterbox-tts
|
||||
chatterbox-tts==0.1.2
|
||||
accelerate
|
||||
oneccl_bind_pt==2.3.100+xpu
|
||||
optimum[openvino]
|
||||
setuptools
|
||||
accelerate
|
||||
setuptools
|
||||
(launcher, config)

@@ -31,6 +31,7 @@ type Config struct {
	StartOnBoot     bool              `json:"start_on_boot"`
	LogLevel        string            `json:"log_level"`
	EnvironmentVars map[string]string `json:"environment_vars"`
+	ShowWelcome     *bool             `json:"show_welcome"`
 }

 // Launcher represents the main launcher application
@@ -148,6 +149,13 @@ func (l *Launcher) Initialize() error {
		log.Printf("Initializing empty EnvironmentVars map")
	}

+	// Set default welcome window preference
+	if l.config.ShowWelcome == nil {
+		true := true
+		l.config.ShowWelcome = &true
+		log.Printf("Setting default ShowWelcome: true")
+	}
+
	// Create directories
	os.MkdirAll(l.config.ModelsPath, 0755)
	os.MkdirAll(l.config.BackendsPath, 0755)
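One detail worth flagging in the hunk above: `true` is not a Go keyword but a predeclared identifier, so `true := true` legally shadows it inside the if block and compiles as written. A neutral variable name avoids the double-take; a sketch of the conventional form, same behavior assumed:

// Conventional alternative to shadowing the predeclared identifier:
if l.config.ShowWelcome == nil {
	v := true
	l.config.ShowWelcome = &v
	log.Printf("Setting default ShowWelcome: true")
}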
(launcher, tests)

@@ -48,6 +48,14 @@ var _ = Describe("Launcher", func() {
			config := launcherInstance.GetConfig()
			Expect(config.ModelsPath).ToNot(BeEmpty())
			Expect(config.BackendsPath).ToNot(BeEmpty())
		})
+
+		It("should set default ShowWelcome to true", func() {
+			err := launcherInstance.Initialize()
+			Expect(err).ToNot(HaveOccurred())
+
+			config := launcherInstance.GetConfig()
+			Expect(config.ShowWelcome).To(BeTrue())
+			Expect(config.Address).To(Equal("127.0.0.1:8080"))
+			Expect(config.LogLevel).To(Equal("info"))
+		})
(launcher, systray)

@@ -177,6 +177,9 @@ func (sm *SystrayManager) recreateMenu() {
		fyne.NewMenuItem("Settings", func() {
			sm.showSettings()
		}),
+		fyne.NewMenuItem("Show Welcome Window", func() {
+			sm.showWelcomeWindow()
+		}),
		fyne.NewMenuItem("Open Data Folder", func() {
			sm.openDataFolder()
		}),
@@ -243,6 +246,13 @@ func (sm *SystrayManager) showSettings() {
	sm.window.RequestFocus()
 }

+// showWelcomeWindow shows the welcome window
+func (sm *SystrayManager) showWelcomeWindow() {
+	if sm.launcher.GetUI() != nil {
+		sm.launcher.GetUI().ShowWelcomeWindow()
+	}
+}
+
 // openDataFolder opens the data folder in file manager
 func (sm *SystrayManager) openDataFolder() {
	dataPath := sm.launcher.GetDataPath()
(launcher, UI)

@@ -675,3 +675,121 @@ func (ui *LauncherUI) UpdateRunningState(isRunning bool) {
		}
	})
 }
+
+// ShowWelcomeWindow displays the welcome window with helpful information
+func (ui *LauncherUI) ShowWelcomeWindow() {
+	if ui.launcher == nil || ui.launcher.window == nil {
+		log.Printf("Cannot show welcome window: launcher or window is nil")
+		return
+	}
+
+	fyne.DoAndWait(func() {
+		// Create welcome window
+		welcomeWindow := ui.launcher.app.NewWindow("Welcome to LocalAI Launcher")
+		welcomeWindow.Resize(fyne.NewSize(600, 500))
+		welcomeWindow.CenterOnScreen()
+		welcomeWindow.SetCloseIntercept(func() {
+			welcomeWindow.Close()
+		})
+
+		// Title
+		titleLabel := widget.NewLabel("Welcome to LocalAI Launcher!")
+		titleLabel.TextStyle = fyne.TextStyle{Bold: true}
+		titleLabel.Alignment = fyne.TextAlignCenter
+
+		// Welcome message
+		welcomeText := `LocalAI Launcher makes it easy to run LocalAI on your system.
+
+What you can do:
+• Start and stop LocalAI server
+• Configure models and backends paths
+• Set environment variables
+• Check for updates automatically
+• Access LocalAI WebUI when running
+
+Getting Started:
+1. Configure your models and backends paths
+2. Click "Start LocalAI" to begin
+3. Use "Open WebUI" to access the interface
+4. Check the system tray for quick access`
+
+		welcomeLabel := widget.NewLabel(welcomeText)
+		welcomeLabel.Wrapping = fyne.TextWrapWord
+
+		// Useful links section
+		linksTitle := widget.NewLabel("Useful Links:")
+		linksTitle.TextStyle = fyne.TextStyle{Bold: true}
+
+		// Create link buttons
+		docsButton := widget.NewButton("📚 Documentation", func() {
+			ui.openURL("https://localai.io/docs/")
+		})
+
+		githubButton := widget.NewButton("🐙 GitHub Repository", func() {
+			ui.openURL("https://github.com/mudler/LocalAI")
+		})
+
+		modelsButton := widget.NewButton("🤖 Model Gallery", func() {
+			ui.openURL("https://localai.io/models/")
+		})
+
+		communityButton := widget.NewButton("💬 Community", func() {
+			ui.openURL("https://discord.gg/XgwjKptP7Z")
+		})
+
+		// Checkbox to disable welcome window
+		dontShowAgainCheck := widget.NewCheck("Don't show this welcome window again", func(checked bool) {
+			if ui.launcher != nil {
+				config := ui.launcher.GetConfig()
+				v := !checked
+				config.ShowWelcome = &v
+				ui.launcher.SetConfig(config)
+			}
+		})
+
+		config := ui.launcher.GetConfig()
+		if config.ShowWelcome != nil {
+			dontShowAgainCheck.SetChecked(*config.ShowWelcome)
+		}
+
+		// Close button
+		closeButton := widget.NewButton("Get Started", func() {
+			welcomeWindow.Close()
+		})
+		closeButton.Importance = widget.HighImportance
+
+		// Layout
+		linksContainer := container.NewVBox(
+			linksTitle,
+			docsButton,
+			githubButton,
+			modelsButton,
+			communityButton,
+		)
+
+		content := container.NewVBox(
+			titleLabel,
+			widget.NewSeparator(),
+			welcomeLabel,
+			widget.NewSeparator(),
+			linksContainer,
+			widget.NewSeparator(),
+			dontShowAgainCheck,
+			widget.NewSeparator(),
+			closeButton,
+		)
+
+		welcomeWindow.SetContent(content)
+		welcomeWindow.Show()
+	})
+}
+
+// openURL opens a URL in the default browser
+func (ui *LauncherUI) openURL(urlString string) {
+	parsedURL, err := url.Parse(urlString)
+	if err != nil {
+		log.Printf("Failed to parse URL %s: %v", urlString, err)
+		return
+	}
+	fyne.CurrentApp().OpenURL(parsedURL)
+}
(launcher, main.go)

@@ -55,6 +55,12 @@ func main() {
		// Load configuration into UI
		launcher.GetUI().LoadConfiguration()
		launcher.GetUI().UpdateStatus("Ready")
+
+		// Show welcome window if configured to do so
+		config := launcher.GetConfig()
+		if *config.ShowWelcome {
+			launcher.GetUI().ShowWelcomeWindow()
+		}
	}
 }()
(LocalAI CLI, main.go)

@@ -2,9 +2,7 @@ package main

 import (
	"os"
-	"os/signal"
	"path/filepath"
-	"syscall"

	"github.com/alecthomas/kong"
	"github.com/joho/godotenv"
@@ -24,15 +22,7 @@ func main() {
	log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
	zerolog.SetGlobalLevel(zerolog.InfoLevel)

-	// Catch signals from the OS requesting us to exit
-	go func() {
-		c := make(chan os.Signal, 1) // we need to reserve to buffer size 1, so the notifier are not blocked
-		signal.Notify(c, os.Interrupt, syscall.SIGTERM)
-		<-c
-		os.Exit(1)
-	}()
-
-	// handle loading environment variabled from .env files
+	// handle loading environment variables from .env files
	envFiles := []string{".env", "localai.env"}
	homeDir, err := os.UserHomeDir()
	if err == nil {
(core/backend, transcription)

@@ -12,7 +12,7 @@ import (
	"github.com/mudler/LocalAI/pkg/model"
 )

-func ModelTranscription(audio, language string, translate bool, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {
+func ModelTranscription(audio, language string, translate bool, diarize bool, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {

	if modelConfig.Backend == "" {
		modelConfig.Backend = model.WhisperBackend
@@ -34,6 +34,7 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
		Dst:       audio,
		Language:  language,
		Translate: translate,
+		Diarize:   diarize,
		Threads:   uint32(*modelConfig.Threads),
	})
	if err != nil {
(core/cli, explorer.go)

@@ -5,6 +5,7 @@ import (
	"time"

	cliContext "github.com/mudler/LocalAI/core/cli/context"
+	"github.com/mudler/LocalAI/core/cli/signals"
	"github.com/mudler/LocalAI/core/explorer"
	"github.com/mudler/LocalAI/core/http"
 )
@@ -45,5 +46,7 @@ func (e *ExplorerCMD) Run(ctx *cliContext.Context) error {

	appHTTP := http.Explorer(db)

+	signals.Handler(nil)
+
	return appHTTP.Listen(e.Address)
 }
(core/cli, federated.go)

@@ -4,6 +4,7 @@ import (
	"context"

	cliContext "github.com/mudler/LocalAI/core/cli/context"
+	"github.com/mudler/LocalAI/core/cli/signals"
	"github.com/mudler/LocalAI/core/p2p"
 )

@@ -19,5 +20,7 @@ func (f *FederatedCLI) Run(ctx *cliContext.Context) error {

	fs := p2p.NewFederatedServer(f.Address, p2p.NetworkID(f.Peer2PeerNetworkID, p2p.FederatedID), f.Peer2PeerToken, !f.RandomWorker, f.TargetWorker)

+	signals.Handler(nil)
+
	return fs.Start(context.Background())
 }
(core/cli, run.go)

@@ -10,6 +10,7 @@ import (
	"github.com/mudler/LocalAI/core/application"
	cli_api "github.com/mudler/LocalAI/core/cli/api"
	cliContext "github.com/mudler/LocalAI/core/cli/context"
+	"github.com/mudler/LocalAI/core/cli/signals"
	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/core/http"
	"github.com/mudler/LocalAI/core/p2p"
@@ -224,5 +225,8 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
		return err
	}

+	// Catch signals from the OS requesting us to exit, and stop all backends
+	signals.Handler(app.ModelLoader())
+
	return appHTTP.Listen(r.Address)
 }
core/cli/signals/signals.go (new file, 25 lines)

+package signals
+
+import (
+	"os"
+	"os/signal"
+	"syscall"
+
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/rs/zerolog/log"
+)
+
+func Handler(m *model.ModelLoader) {
+	// Catch signals from the OS requesting us to exit, and stop all backends
+	go func(m *model.ModelLoader) {
+		c := make(chan os.Signal, 1) // we need to reserve to buffer size 1, so the notifier are not blocked
+		signal.Notify(c, os.Interrupt, syscall.SIGTERM, syscall.SIGINT)
+		<-c
+		if m != nil {
+			if err := m.StopAllGRPC(); err != nil {
+				log.Error().Err(err).Msg("error while stopping all grpc backends")
+			}
+		}
+		os.Exit(1)
+	}(m)
+}
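This shared handler centralizes what each CLI subcommand previously open-coded: pass a ModelLoader when backends must be stopped on exit, or nil when there is nothing to clean up. A minimal standalone sketch of the same pattern, with names that are illustrative and independent of LocalAI's types:

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
	"time"
)

// handler mirrors signals.Handler: an optional cleanup hook runs
// before the process exits on SIGINT/SIGTERM.
func handler(cleanup func() error) {
	go func() {
		c := make(chan os.Signal, 1) // buffered so the notifier never blocks
		signal.Notify(c, os.Interrupt, syscall.SIGTERM, syscall.SIGINT)
		<-c
		if cleanup != nil {
			if err := cleanup(); err != nil {
				fmt.Fprintln(os.Stderr, "cleanup failed:", err)
			}
		}
		os.Exit(1)
	}()
}

func main() {
	handler(func() error {
		fmt.Println("stopping backends...")
		return nil
	})
	time.Sleep(time.Hour) // stand-in for the server loop
}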
(core/cli, transcript.go)

@@ -20,6 +20,7 @@ type TranscriptCMD struct {
	Model      string `short:"m" required:"" help:"Model name to run the TTS"`
	Language   string `short:"l" help:"Language of the audio file"`
	Translate  bool   `short:"c" help:"Translate the transcription to english"`
+	Diarize    bool   `short:"d" help:"Mark speaker turns"`
	Threads    int    `short:"t" default:"1" help:"Number of threads used for parallel computation"`
	ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
 }
@@ -56,7 +57,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
		}
	}()

-	tr, err := backend.ModelTranscription(t.Filename, t.Language, t.Translate, ml, c, opts)
+	tr, err := backend.ModelTranscription(t.Filename, t.Language, t.Translate, t.Diarize, ml, c, opts)
	if err != nil {
		return err
	}
(core/cli/worker, shared flags)

@@ -2,6 +2,7 @@ package worker

 type WorkerFlags struct {
	BackendsPath       string `env:"LOCALAI_BACKENDS_PATH,BACKENDS_PATH" type:"path" default:"${basepath}/backends" help:"Path containing backends used for inferencing" group:"backends"`
+	BackendGalleries   string `env:"LOCALAI_BACKEND_GALLERIES,BACKEND_GALLERIES" help:"JSON list of backend galleries" group:"backends" default:"${backends}"`
	BackendsSystemPath string `env:"LOCALAI_BACKENDS_SYSTEM_PATH,BACKEND_SYSTEM_PATH" type:"path" default:"/usr/share/localai/backends" help:"Path containing system backends used for inferencing" group:"backends"`
	ExtraLLamaCPPArgs  string `name:"llama-cpp-args" env:"LOCALAI_EXTRA_LLAMA_CPP_ARGS,EXTRA_LLAMA_CPP_ARGS" help:"Extra arguments to pass to llama-cpp-rpc-server"`
 }
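The new flag carries the gallery list as JSON, which `findLLamaCPPBackend` below unmarshals into `[]config.Gallery`. A sketch of parsing such a value; the gallery name and URL are illustrative, and the struct is trimmed to the two fields the JSON appears to use:

package main

import (
	"encoding/json"
	"fmt"
)

// Gallery is a trimmed stand-in for config.Gallery; the real type
// lives in github.com/mudler/LocalAI/core/config.
type Gallery struct {
	Name string `json:"name"`
	URL  string `json:"url"`
}

func main() {
	// Illustrative value for LOCALAI_BACKEND_GALLERIES.
	raw := `[{"name":"localai","url":"https://example.com/backend/index.yaml"}]`

	var gals []Gallery
	if err := json.Unmarshal([]byte(raw), &gals); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", gals)
}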
@@ -1,6 +1,7 @@
|
||||
package worker
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -9,7 +10,10 @@ import (
|
||||
"syscall"
|
||||
|
||||
cliContext "github.com/mudler/LocalAI/core/cli/context"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/cli/signals"
|
||||
"github.com/mudler/LocalAI/core/gallery"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/system"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
@@ -20,9 +24,10 @@ type LLamaCPP struct {
|
||||
|
||||
const (
|
||||
llamaCPPRPCBinaryName = "llama-cpp-rpc-server"
|
||||
llamaCPPGalleryName = "llama-cpp"
|
||||
)
|
||||
|
||||
func findLLamaCPPBackend(systemState *system.SystemState) (string, error) {
|
||||
func findLLamaCPPBackend(galleries string, systemState *system.SystemState) (string, error) {
|
||||
backends, err := gallery.ListSystemBackends(systemState)
|
||||
if err != nil {
|
||||
log.Warn().Msgf("Failed listing system backends: %s", err)
|
||||
@@ -30,9 +35,19 @@ func findLLamaCPPBackend(systemState *system.SystemState) (string, error) {
|
||||
}
|
||||
log.Debug().Msgf("System backends: %v", backends)
|
||||
|
||||
backend, ok := backends.Get("llama-cpp")
|
||||
backend, ok := backends.Get(llamaCPPGalleryName)
|
||||
if !ok {
|
||||
return "", errors.New("llama-cpp backend not found, install it first")
|
||||
ml := model.NewModelLoader(systemState, true)
|
||||
var gals []config.Gallery
|
||||
if err := json.Unmarshal([]byte(galleries), &gals); err != nil {
|
||||
log.Error().Err(err).Msg("failed loading galleries")
|
||||
return "", err
|
||||
}
|
||||
err := gallery.InstallBackendFromGallery(gals, systemState, ml, llamaCPPGalleryName, nil, true)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("llama-cpp backend not found, failed to install it")
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
backendPath := filepath.Dir(backend.RunFile)
|
||||
|
||||
@@ -61,7 +76,7 @@ func (r *LLamaCPP) Run(ctx *cliContext.Context) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
grpcProcess, err := findLLamaCPPBackend(systemState)
|
||||
grpcProcess, err := findLLamaCPPBackend(r.BackendGalleries, systemState)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -69,6 +84,9 @@ func (r *LLamaCPP) Run(ctx *cliContext.Context) error {
|
||||
args := strings.Split(r.ExtraLLamaCPPArgs, " ")
|
||||
|
||||
args = append([]string{grpcProcess}, args...)
|
||||
|
||||
signals.Handler(nil)
|
||||
|
||||
return syscall.Exec(
|
||||
grpcProcess,
|
||||
args,
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
	"time"

	cliContext "github.com/mudler/LocalAI/core/cli/context"
	"github.com/mudler/LocalAI/core/cli/signals"
	"github.com/mudler/LocalAI/core/p2p"
	"github.com/mudler/LocalAI/pkg/system"
	"github.com/phayes/freeport"
@@ -69,7 +70,7 @@ func (r *P2P) Run(ctx *cliContext.Context) error {
	for {
		log.Info().Msgf("Starting llama-cpp-rpc-server on '%s:%d'", address, port)

		grpcProcess, err := findLLamaCPPBackend(systemState)
		grpcProcess, err := findLLamaCPPBackend(r.BackendGalleries, systemState)
		if err != nil {
			log.Error().Err(err).Msg("Failed to find llama-cpp-rpc-server")
			return
@@ -106,6 +107,8 @@ func (r *P2P) Run(ctx *cliContext.Context) error {
		}
	}

	signals.Handler(nil)

	for {
		time.Sleep(1 * time.Second)
	}

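The hunk above installs a signal handler and then parks the main goroutine in a sleep loop so the spawned llama-cpp-rpc-server keeps running. A rough sketch of that pattern, with the `signals.Handler` internals replaced by a plain `os/signal` handler (an assumption, since the helper's implementation is not shown here):

```go
package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
	"time"
)

func main() {
	// Stand-in for signals.Handler(nil): exit cleanly on SIGINT/SIGTERM.
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM)
	go func() {
		fmt.Println("received", <-ch, "- shutting down")
		os.Exit(0)
	}()

	// Park the main goroutine, mirroring the worker's keep-alive loop.
	for {
		time.Sleep(1 * time.Second)
	}
}
```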
@@ -1,3 +1,5 @@
// Package gallery provides installation and registration utilities for LocalAI backends,
// including meta-backend resolution based on system capabilities.
package gallery

import (
@@ -5,6 +7,7 @@ import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"time"

	"github.com/mudler/LocalAI/core/config"
@@ -20,6 +23,12 @@ const (
	runFile = "run.sh"
)

// backendCandidate represents an installed concrete backend option for a given alias
type backendCandidate struct {
	name    string
	runFile string
}

// readBackendMetadata reads the metadata JSON file for a backend
func readBackendMetadata(backendPath string) (*BackendMetadata, error) {
	metadataPath := filepath.Join(backendPath, metadataFile)
@@ -58,7 +67,7 @@ func writeBackendMetadata(backendPath string, metadata *BackendMetadata) error {
	return nil
}

// Installs a model from the gallery
// InstallBackendFromGallery installs a backend from the gallery.
func InstallBackendFromGallery(galleries []config.Gallery, systemState *system.SystemState, modelLoader *model.ModelLoader, name string, downloadStatus func(string, string, string, float64), force bool) error {
	if !force {
		// check if we already have the backend installed
@@ -282,23 +291,18 @@ func (b SystemBackends) GetAll() []SystemBackend {
}

func ListSystemBackends(systemState *system.SystemState) (SystemBackends, error) {
	potentialBackends, err := os.ReadDir(systemState.Backend.BackendsPath)
	if err != nil {
		return nil, err
	}

	// Gather backends from system and user paths, then resolve alias conflicts by capability.
	backends := make(SystemBackends)

	systemBackends, err := os.ReadDir(systemState.Backend.BackendsSystemPath)
	if err == nil {
		// system backends are special, they are provided by the system and not managed by LocalAI
	// System-provided backends
	if systemBackends, err := os.ReadDir(systemState.Backend.BackendsSystemPath); err == nil {
		for _, systemBackend := range systemBackends {
			if systemBackend.IsDir() {
				systemBackendRunFile := filepath.Join(systemState.Backend.BackendsSystemPath, systemBackend.Name(), runFile)
				if _, err := os.Stat(systemBackendRunFile); err == nil {
				run := filepath.Join(systemState.Backend.BackendsSystemPath, systemBackend.Name(), runFile)
				if _, err := os.Stat(run); err == nil {
					backends[systemBackend.Name()] = SystemBackend{
						Name:    systemBackend.Name(),
						RunFile: filepath.Join(systemState.Backend.BackendsSystemPath, systemBackend.Name(), runFile),
						RunFile: run,
						IsMeta:  false,
						IsSystem: true,
						Metadata: nil,
@@ -307,64 +311,104 @@ func ListSystemBackends(systemState *system.SystemState) (SystemBackends, error)
			}
		}
	} else {
		log.Warn().Err(err).Msg("Failed to read system backends, but that's ok, we will just use the backends managed by LocalAI")
		log.Warn().Err(err).Msg("Failed to read system backends, proceeding with user-managed backends")
	}

	for _, potentialBackend := range potentialBackends {
		if potentialBackend.IsDir() {
			potentialBackendRunFile := filepath.Join(systemState.Backend.BackendsPath, potentialBackend.Name(), runFile)
	// User-managed backends and alias collection
	entries, err := os.ReadDir(systemState.Backend.BackendsPath)
	if err != nil {
		return nil, err
	}

			var metadata *BackendMetadata
	aliasGroups := make(map[string][]backendCandidate)
	metaMap := make(map[string]*BackendMetadata)

			// If metadata file does not exist, we just use the directory name
			// and we do not fill the other metadata (such as potential backend Aliases)
			metadataFilePath := filepath.Join(systemState.Backend.BackendsPath, potentialBackend.Name(), metadataFile)
			if _, err := os.Stat(metadataFilePath); os.IsNotExist(err) {
				metadata = &BackendMetadata{
					Name: potentialBackend.Name(),
				}
	for _, e := range entries {
		if !e.IsDir() {
			continue
		}
		dir := e.Name()
		run := filepath.Join(systemState.Backend.BackendsPath, dir, runFile)

		var metadata *BackendMetadata
		metadataPath := filepath.Join(systemState.Backend.BackendsPath, dir, metadataFile)
		if _, err := os.Stat(metadataPath); os.IsNotExist(err) {
			metadata = &BackendMetadata{Name: dir}
		} else {
			m, rerr := readBackendMetadata(filepath.Join(systemState.Backend.BackendsPath, dir))
			if rerr != nil {
				return nil, rerr
			}
			if m == nil {
				metadata = &BackendMetadata{Name: dir}
			} else {
				// Check for alias in metadata
				metadata, err = readBackendMetadata(filepath.Join(systemState.Backend.BackendsPath, potentialBackend.Name()))
				if err != nil {
					return nil, err
				metadata = m
			}
		}

		metaMap[dir] = metadata

		// Concrete backend entry
		if _, err := os.Stat(run); err == nil {
			backends[dir] = SystemBackend{
				Name:     dir,
				RunFile:  run,
				IsMeta:   false,
				Metadata: metadata,
			}
		}

		// Alias candidates
		if metadata.Alias != "" {
			aliasGroups[metadata.Alias] = append(aliasGroups[metadata.Alias], backendCandidate{name: dir, runFile: run})
		}

		// Meta backends indirection
		if metadata.MetaBackendFor != "" {
			backends[metadata.Name] = SystemBackend{
				Name:     metadata.Name,
				RunFile:  filepath.Join(systemState.Backend.BackendsPath, metadata.MetaBackendFor, runFile),
				IsMeta:   true,
				Metadata: metadata,
			}
		}
	}

	// Resolve aliases using system capability preferences
	tokens := systemState.BackendPreferenceTokens()
	for alias, cands := range aliasGroups {
		chosen := backendCandidate{}
		// Try preference tokens
		for _, t := range tokens {
			for _, c := range cands {
				if strings.Contains(strings.ToLower(c.name), t) && c.runFile != "" {
					chosen = c
					break
				}
			}

			if !backends.Exists(potentialBackend.Name()) {
				// We don't want to override aliases if already set, and if we are meta backend
				if _, err := os.Stat(potentialBackendRunFile); err == nil {
					backends[potentialBackend.Name()] = SystemBackend{
						Name:     potentialBackend.Name(),
						RunFile:  potentialBackendRunFile,
						IsMeta:   false,
						Metadata: metadata,
					}
				}
			}

			if metadata == nil {
				continue
			}

			if metadata.Alias != "" {
				backends[metadata.Alias] = SystemBackend{
					Name:     metadata.Alias,
					RunFile:  potentialBackendRunFile,
					IsMeta:   false,
					Metadata: metadata,
				}
			}

			if metadata.MetaBackendFor != "" {
				backends[metadata.Name] = SystemBackend{
					Name:     metadata.Name,
					RunFile:  filepath.Join(systemState.Backend.BackendsPath, metadata.MetaBackendFor, runFile),
					IsMeta:   true,
					Metadata: metadata,
			if chosen.runFile != "" {
				break
			}
		}
		// Fallback: first runnable
		if chosen.runFile == "" {
			for _, c := range cands {
				if c.runFile != "" {
					chosen = c
					break
				}
			}
		}
		if chosen.runFile == "" {
			continue
		}
		md := metaMap[chosen.name]
		backends[alias] = SystemBackend{
			Name:     alias,
			RunFile:  chosen.runFile,
			IsMeta:   false,
			Metadata: md,
		}
	}

	return backends, nil

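The alias-resolution loop above boils down to a two-pass selection: the first candidate whose directory name contains a capability preference token wins, and otherwise the first runnable candidate is used. A compact, runnable sketch of that policy (the token list is illustrative; the real one comes from `systemState.BackendPreferenceTokens()`):

```go
package main

import (
	"fmt"
	"strings"
)

type candidate struct{ name, runFile string }

// pick mirrors the alias-resolution shown above: the first candidate whose
// name matches a preference token wins; otherwise fall back to the first
// runnable one.
func pick(tokens []string, cands []candidate) candidate {
	for _, t := range tokens {
		for _, c := range cands {
			if strings.Contains(strings.ToLower(c.name), t) && c.runFile != "" {
				return c
			}
		}
	}
	for _, c := range cands {
		if c.runFile != "" {
			return c
		}
	}
	return candidate{}
}

func main() {
	cands := []candidate{
		{"cpu-llama-cpp", "/backends/cpu-llama-cpp/run.sh"},
		{"cuda12-llama-cpp", "/backends/cuda12-llama-cpp/run.sh"},
	}
	// On an NVIDIA system the preference tokens would rank "cuda" first.
	fmt.Println(pick([]string{"cuda", "cpu"}, cands).name) // cuda12-llama-cpp
}
```

The test below exercises exactly this behavior against the real `ListSystemBackends`.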
@@ -18,6 +18,73 @@ const (
	testImage = "quay.io/mudler/tests:localai-backend-test"
)

var _ = Describe("Runtime capability-based backend selection", func() {
	var tempDir string

	BeforeEach(func() {
		var err error
		tempDir, err = os.MkdirTemp("", "gallery-caps-*")
		Expect(err).NotTo(HaveOccurred())
	})

	AfterEach(func() {
		os.RemoveAll(tempDir)
	})

	It("ListSystemBackends prefers optimal alias candidate", func() {
		// Arrange two installed backends sharing the same alias
		must := func(err error) { Expect(err).NotTo(HaveOccurred()) }

		cpuDir := filepath.Join(tempDir, "cpu-llama-cpp")
		must(os.MkdirAll(cpuDir, 0o750))
		cpuMeta := &BackendMetadata{Alias: "llama-cpp", Name: "cpu-llama-cpp"}
		b, _ := json.Marshal(cpuMeta)
		must(os.WriteFile(filepath.Join(cpuDir, "metadata.json"), b, 0o644))
		must(os.WriteFile(filepath.Join(cpuDir, "run.sh"), []byte(""), 0o755))

		cudaDir := filepath.Join(tempDir, "cuda12-llama-cpp")
		must(os.MkdirAll(cudaDir, 0o750))
		cudaMeta := &BackendMetadata{Alias: "llama-cpp", Name: "cuda12-llama-cpp"}
		b, _ = json.Marshal(cudaMeta)
		must(os.WriteFile(filepath.Join(cudaDir, "metadata.json"), b, 0o644))
		must(os.WriteFile(filepath.Join(cudaDir, "run.sh"), []byte(""), 0o755))

		// Default system: alias should point to CPU
		sysDefault, err := system.GetSystemState(
			system.WithBackendPath(tempDir),
		)
		must(err)
		sysDefault.GPUVendor = "" // force default selection
		backs, err := ListSystemBackends(sysDefault)
		must(err)
		aliasBack, ok := backs.Get("llama-cpp")
		Expect(ok).To(BeTrue())
		Expect(aliasBack.RunFile).To(Equal(filepath.Join(cpuDir, "run.sh")))
		// concrete entries remain
		_, ok = backs.Get("cpu-llama-cpp")
		Expect(ok).To(BeTrue())
		_, ok = backs.Get("cuda12-llama-cpp")
		Expect(ok).To(BeTrue())

		// NVIDIA system: alias should point to CUDA
		// Force capability to nvidia to make the test deterministic on platforms like darwin/arm64 (which default to metal)
		os.Setenv("LOCALAI_FORCE_META_BACKEND_CAPABILITY", "nvidia")
		defer os.Unsetenv("LOCALAI_FORCE_META_BACKEND_CAPABILITY")

		sysNvidia, err := system.GetSystemState(
			system.WithBackendPath(tempDir),
		)
		must(err)
		sysNvidia.GPUVendor = "nvidia"
		sysNvidia.VRAM = 8 * 1024 * 1024 * 1024
		backs, err = ListSystemBackends(sysNvidia)
		must(err)
		aliasBack, ok = backs.Get("llama-cpp")
		Expect(ok).To(BeTrue())
		Expect(aliasBack.RunFile).To(Equal(filepath.Join(cudaDir, "run.sh")))
	})
})

var _ = Describe("Gallery Backends", func() {
	var (
		tempDir string

@@ -836,27 +836,40 @@ var _ = Describe("API test", func() {
	if runtime.GOOS != "linux" {
		Skip("test supported only on linux")
	}
	embeddingModel := openai.AdaEmbeddingV2
	resp, err := client.CreateEmbeddings(
		context.Background(),
		openai.EmbeddingRequest{
			Model: openai.AdaEmbeddingV2,
			Model: embeddingModel,
			Input: []string{"sun", "cat"},
		},
	)
	Expect(err).ToNot(HaveOccurred(), err)
	Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 2048))
	Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 2048))
	Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 4096))
	Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 4096))

	sunEmbedding := resp.Data[0].Embedding
	resp2, err := client.CreateEmbeddings(
		context.Background(),
		openai.EmbeddingRequest{
			Model: openai.AdaEmbeddingV2,
			Model: embeddingModel,
			Input: []string{"sun"},
		},
	)
	Expect(err).ToNot(HaveOccurred())
	Expect(resp2.Data[0].Embedding).To(Equal(sunEmbedding))
	Expect(resp2.Data[0].Embedding).ToNot(Equal(resp.Data[1].Embedding))

	resp3, err := client.CreateEmbeddings(
		context.Background(),
		openai.EmbeddingRequest{
			Model: embeddingModel,
			Input: []string{"cat"},
		},
	)
	Expect(err).ToNot(HaveOccurred())
	Expect(resp3.Data[0].Embedding).To(Equal(resp.Data[1].Embedding))
	Expect(resp3.Data[0].Embedding).ToNot(Equal(sunEmbedding))
})

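The updated expectations above (4096-dimensional vectors) can be reproduced against a running instance with the same `go-openai` client the test uses. A sketch, assuming LocalAI listens on localhost:8080 and serves a model registered under the Ada alias (the API key is a placeholder; LocalAI ignores it unless authentication is configured):

```go
package main

import (
	"context"
	"fmt"

	openai "github.com/sashabaranov/go-openai"
)

func main() {
	cfg := openai.DefaultConfig("sk-ignored") // placeholder key
	cfg.BaseURL = "http://localhost:8080/v1"  // assumed local endpoint
	client := openai.NewClientWithConfig(cfg)

	resp, err := client.CreateEmbeddings(context.Background(), openai.EmbeddingRequest{
		Model: openai.AdaEmbeddingV2,
		Input: []string{"sun", "cat"},
	})
	if err != nil {
		panic(err)
	}
	for i, d := range resp.Data {
		fmt.Printf("input %d -> %d dimensions\n", i, len(d.Embedding))
	}
}
```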
Context("External gRPC calls", func() {
@@ -398,9 +398,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
	}

	finishReason := "stop"
	if toolsCalled {
	if toolsCalled && len(input.Tools) > 0 {
		finishReason = "tool_calls"
	} else if toolsCalled && len(input.Tools) == 0 {
	} else if toolsCalled {
		finishReason = "function_call"
	}

@@ -443,11 +443,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
	log.Debug().Msgf("Text content to return: %s", textContentToReturn)
	noActionsToRun := len(results) > 0 && results[0].Name == noActionName || len(results) == 0

	finishReason := "stop"
	if len(input.Tools) > 0 {
		finishReason = "tool_calls"
	}

	switch {
	case noActionsToRun:
		result, err := handleQuestion(config, cl, input, ml, startupOptions, results, s, predInput)
@@ -457,11 +452,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
		}

		*c = append(*c, schema.Choice{
			FinishReason: finishReason,
			FinishReason: "stop",
			Message:      &schema.Message{Role: "assistant", Content: &result}})
	default:
		toolChoice := schema.Choice{
			FinishReason: finishReason,
			FinishReason: "tool_calls",
			Message: &schema.Message{
				Role: "assistant",
			},

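The first hunk corrects the streaming finish reason so that the tools API reports `tool_calls` while the legacy functions API (a call with no `tools` in the request) reports `function_call`. The decision reduces to a small pure function, sketched here for clarity (the names are ours, not the endpoint's):

```go
package main

import "fmt"

// finishReason mirrors the corrected branch above: the tools API reports
// "tool_calls", the legacy functions API (no tools in the request) reports
// "function_call", and plain completions report "stop".
func finishReason(toolsCalled bool, numTools int) string {
	switch {
	case toolsCalled && numTools > 0:
		return "tool_calls"
	case toolsCalled:
		return "function_call"
	default:
		return "stop"
	}
}

func main() {
	fmt.Println(finishReason(true, 2))  // tool_calls
	fmt.Println(finishReason(true, 0))  // function_call
	fmt.Println(finishReason(false, 0)) // stop
}
```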
@@ -36,6 +36,8 @@ func TranscriptEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app
		return fiber.ErrBadRequest
	}

	diarize := c.FormValue("diarize", "false") != "false"

	// retrieve the file data from the request
	file, err := c.FormFile("file")
	if err != nil {
@@ -67,7 +69,7 @@ func TranscriptEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app

	log.Debug().Msgf("Audio file copied to: %+v", dst)

	tr, err := backend.ModelTranscription(dst, input.Language, input.Translate, ml, *config, appConfig)
	tr, err := backend.ModelTranscription(dst, input.Language, input.Translate, diarize, ml, *config, appConfig)
	if err != nil {
		return err
	}

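With the `diarize` form value now read by `TranscriptEndpoint`, a client opts into diarization by adding one extra multipart field: anything other than `"false"` enables it. A minimal sketch of such a request against a locally running instance (the URL, model name, and file path are illustrative; `whisper-small-en-tdrz` is the tinydiarize model from the gallery below):

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"mime/multipart"
	"net/http"
	"os"
)

func main() {
	body := &bytes.Buffer{}
	w := multipart.NewWriter(body)

	// A whisper-family model installed on the server (illustrative name).
	_ = w.WriteField("model", "whisper-small-en-tdrz")
	// The new flag: anything other than "false" enables diarization.
	_ = w.WriteField("diarize", "true")

	f, err := os.Open("audio.wav") // illustrative path
	if err != nil {
		panic(err)
	}
	defer f.Close()
	part, _ := w.CreateFormFile("file", "audio.wav")
	if _, err := io.Copy(part, f); err != nil {
		panic(err)
	}
	w.Close()

	resp, err := http.Post("http://localhost:8080/v1/audio/transcriptions", w.FormDataContentType(), body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```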
@@ -9,5 +9,5 @@ import (

func TestLocalAI(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "LocalAI test suite")
	RunSpecs(t, "LocalAI HTTP test suite")
}

@@ -139,7 +139,7 @@ Due to the nature of ROCm it is best to run all implementations in containers as

### Limitations

Ongoing verification testing of ROCm compatability with integrated backends.
Ongoing verification testing of ROCm compatibility with integrated backends.
Please note the following list of verified backends and devices.

LocalAI hipblas images are built against the following targets: gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -172,7 +172,7 @@ The devices in the following list have been tested with `hipblas` images running
### System Prep

1. Check your GPU LLVM target is compatible with the version of ROCm. This can be found in the [LLVM Docs](https://llvm.org/docs/AMDGPUUsage.html).
2. Check which ROCm version is compatible with your LLVM target and your chosen OS (pay special attention to supported kernel versions). See the following for compatability: ([ROCm 6.0.0](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.0.0/reference/system-requirements.html)) or ([ROCm 6.0.2](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html))
2. Check which ROCm version is compatible with your LLVM target and your chosen OS (pay special attention to supported kernel versions). See the following for compatibility: ([ROCm 6.0.0](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.0.0/reference/system-requirements.html)) or ([ROCm 6.0.2](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html))
3. Install your chosen version of the `dkms` and `rocm` packages (the native package manager is recommended for this on any OS, as version changes are easier to apply that way if updates are required). Take care to restart after installing `amdgpu-dkms` and before installing `rocm`; for details, see the installation documentation for your chosen OS ([6.0.2](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/native-install/index.html) or [6.0.0](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.0.0/how-to/native-install/index.html))
4. Deploy. Yes, it's that easy.

@@ -216,7 +216,7 @@ The rebuild process will take some time to complete when deploying these contain
#### Example (k8s) (Advanced Deployment/WIP)

For k8s deployments there is an additional step required before deployment: the deployment of the [ROCm/k8s-device-plugin](https://artifacthub.io/packages/helm/amd-gpu-helm/amd-gpu).
For any k8s environment the documentation provided by AMD from the ROCm project should be successful. It is recommended that if you use rke2 or OpenShift you deploy the SUSE or RedHat provided version of this resource to ensure compatability.
For any k8s environment the documentation provided by AMD from the ROCm project should be successful. It is recommended that if you use rke2 or OpenShift you deploy the SUSE or RedHat provided version of this resource to ensure compatibility.
After this has been completed the [helm chart from go-skynet](https://github.com/go-skynet/helm-charts) can be configured and deployed mostly un-edited.

The following are details of the changes that should be made to ensure proper function.
@@ -241,7 +241,7 @@ spec:
          value: '0'
        # This variable indicates the devices available to container (0:device1 1:device2 2:device3) etc.
        # For multiple devices (say device 1 and 3) the value would be equivalent to HIP_VISIBLE_DEVICES="0,2"
        # Please take note of this when an iGPU is present in host system as compatability is not assured.
        # Please take note of this when an iGPU is present in host system as compatibility is not assured.
...
      resources:
        limits:
@@ -250,7 +250,7 @@ spec:
          amd.com/gpu: '1'
```

This configuration has been tested on a 'custom' cluster managed by SUSE Rancher that was deployed on top of Ubuntu 22.04.4; certification of other configurations is ongoing and compatability is not guaranteed.
This configuration has been tested on a 'custom' cluster managed by SUSE Rancher that was deployed on top of Ubuntu 22.04.4; certification of other configurations is ongoing and compatibility is not guaranteed.

### Notes


@@ -34,5 +34,5 @@ Grammars and function tools can be used as well in conjunction with vision APIs:

All-in-One images have already shipped the llava model as `gpt-4-vision-preview`, so no setup is needed in this case.

To setup the LLaVa models, follow the full example in the [configuration examples](https://github.com/mudler/LocalAI/blob/master/examples/configurations/README.md#llava).
To setup the LLaVa models, follow the full example in the [configuration examples](https://github.com/mudler/LocalAI-examples/blob/main/configurations/llava/llava.yaml).


@@ -27,6 +27,12 @@ curl https://localai.io/install.sh | sh

See [Installer]({{% relref "docs/advanced/installer" %}}) for all the supported options.

### macOS Download

<a href="https://github.com/mudler/LocalAI/releases/latest/download/LocalAI.dmg">
<img src="https://img.shields.io/badge/Download-macOS-blue?style=for-the-badge&logo=apple&logoColor=white" alt="Download LocalAI for macOS"/>
</a>

### Run with docker

@@ -176,7 +182,7 @@ MODEL_NAME=gemma-3-12b-it docker compose up

# NVIDIA GPU setup with custom multimodal and image models
MODEL_NAME=gemma-3-12b-it \
MULTIMODAL_MODEL=minicpm-v-2_6 \
MULTIMODAL_MODEL=minicpm-v-4_5 \
IMAGE_MODEL=flux.1-dev-ggml \
docker compose -f docker-compose.nvidia.yaml up
```

@@ -56,6 +56,12 @@ The fastest way to get started is with our one-line installer:
curl https://localai.io/install.sh | sh
```

### macOS Download

<a href="https://github.com/mudler/LocalAI/releases/latest/download/LocalAI.dmg">
<img src="https://img.shields.io/badge/Download-macOS-blue?style=for-the-badge&logo=apple&logoColor=white" alt="Download LocalAI for macOS"/>
</a>

Or use Docker for a quick start:

```bash

@@ -5,7 +5,13 @@ title = "LocalAI binaries"
weight = 26
+++

LocalAI binaries are available for both Linux and MacOS platforms and can be executed directly from your command line. These binaries are continuously updated and hosted on [our GitHub Releases page](https://github.com/mudler/LocalAI/releases). This method also supports Windows users via the Windows Subsystem for Linux (WSL).

### macOS Download

<a href="https://github.com/mudler/LocalAI/releases/latest/download/LocalAI.dmg">
<img src="https://img.shields.io/badge/Download-macOS-blue?style=for-the-badge&logo=apple&logoColor=white" alt="Download LocalAI for macOS"/>
</a>

Use the following one-liner command in your terminal to download and run LocalAI on Linux or MacOS:

@@ -43,6 +43,7 @@ LocalAI will attempt to automatically load models which are not explicitly confi
| [chatterbox](https://github.com/resemble-ai/chatterbox) | Chatterbox TTS | no | Text-to-speech | no | no | CUDA 11/12, CPU |
| [kitten-tts](https://github.com/KittenML/KittenTTS) | Kitten TTS | no | Text-to-speech | no | no | CPU |
| [silero-vad](https://github.com/snakers4/silero-vad) with [Golang bindings](https://github.com/streamer45/silero-vad-go) | Silero VAD | no | Voice Activity Detection | no | no | CPU |
| [mlx-audio](https://github.com/Blaizzy/mlx-audio) | MLX | no | Text-to-speech | no | no | Metal (Apple Silicon) |
{{< /table >}}

## Image & Video Generation

@@ -1,3 +1,3 @@
{
"version": "v3.4.0"
"version": "v3.5.1"
}

@@ -1,4 +1,30 @@
|
||||
---
|
||||
- &ernie
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
name: "baidu_ernie-4.5-21b-a3b-thinking"
|
||||
license: apache-2.0
|
||||
tags:
|
||||
- gguf
|
||||
- GPU
|
||||
- CPU
|
||||
- text-to-text
|
||||
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/64f187a2cc1c03340ac30498/TYYUxK8xD1AxExFMWqbZD.png
|
||||
urls:
|
||||
- https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking
|
||||
- https://huggingface.co/bartowski/baidu_ERNIE-4.5-21B-A3B-Thinking-GGUF
|
||||
description: |
|
||||
Over the past three months, we have continued to scale the thinking capability of ERNIE-4.5-21B-A3B, improving both the quality and depth of reasoning, thereby advancing the competitiveness of ERNIE lightweight models in complex reasoning tasks. We are pleased to introduce ERNIE-4.5-21B-A3B-Thinking, featuring the following key enhancements:
|
||||
Significantly improved performance on reasoning tasks, including logical reasoning, mathematics, science, coding, text generation, and academic benchmarks that typically require human expertise.
|
||||
Efficient tool usage capabilities.
|
||||
Enhanced 128K long-context understanding capabilities.
|
||||
Note: This version has an increased thinking length. We strongly recommend its use in highly complex reasoning tasks. ERNIE-4.5-21B-A3B-Thinking is a text MoE post-trained model, with 21B total parameters and 3B activated parameters for each token.
|
||||
overrides:
|
||||
parameters:
|
||||
model: baidu_ERNIE-4.5-21B-A3B-Thinking-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: baidu_ERNIE-4.5-21B-A3B-Thinking-Q4_K_M.gguf
|
||||
sha256: f309f225c413324c585e74ce28c55e76dec25340156374551d39707fc2966840
|
||||
uri: huggingface://bartowski/baidu_ERNIE-4.5-21B-A3B-Thinking-GGUF/baidu_ERNIE-4.5-21B-A3B-Thinking-Q4_K_M.gguf
|
||||
- &mimo
|
||||
license: mit
|
||||
tags:
|
||||
@@ -2489,6 +2515,129 @@
|
||||
- filename: Qwen_Qwen3-4B-Thinking-2507-Q8_0.gguf
|
||||
sha256: 2c08db093bc57c2c77222d27ffe8d41cb0b5648e66ba84e5fb9ceab429f6735c
|
||||
uri: huggingface://bartowski/Qwen_Qwen3-4B-Thinking-2507-GGUF/Qwen_Qwen3-4B-Thinking-2507-Q8_0.gguf
|
||||
- !!merge <<: *qwen3
|
||||
name: "nousresearch_hermes-4-14b"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/6317aade83d8d2fd903192d9/7B7nMvHJiL72QzVBEPKOG.png
|
||||
urls:
|
||||
- https://huggingface.co/NousResearch/Hermes-4-14B
|
||||
- https://huggingface.co/bartowski/NousResearch_Hermes-4-14B-GGUF
|
||||
description: |
|
||||
Hermes 4 14B is a frontier, hybrid-mode reasoning model based on Qwen 3 14B by Nous Research that is aligned to you.
|
||||
|
||||
Read the Hermes 4 technical report here: Hermes 4 Technical Report
|
||||
|
||||
Chat with Hermes in Nous Chat: https://chat.nousresearch.com
|
||||
|
||||
Training highlights include a newly synthesized post-training corpus emphasizing verified reasoning traces, massive improvements in math, code, STEM, logic, creativity, and format-faithful outputs, while preserving general assistant quality and broadly neutral alignment.
|
||||
What’s new vs Hermes 3
|
||||
|
||||
Post-training corpus: Massively increased dataset size from 1M samples and 1.2B tokens to ~5M samples / ~60B tokens blended across reasoning and non-reasoning data.
|
||||
Hybrid reasoning mode with explicit <think>…</think> segments when the model decides to deliberate, and options to make your responses faster when you want.
|
||||
Reasoning that is top quality, expressive, improves math, code, STEM, logic, and even creative writing and subjective responses.
|
||||
Schema adherence & structured outputs: trained to produce valid JSON for given schemas and to repair malformed objects.
|
||||
Much easier to steer and align: extreme improvements on steerability, especially on reduced refusal rates.
|
||||
overrides:
|
||||
parameters:
|
||||
model: NousResearch_Hermes-4-14B-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: NousResearch_Hermes-4-14B-Q4_K_M.gguf
|
||||
sha256: 7ad9be1e446e3da0c149fdf55284c90be666d3e13c6e2581587853f4f9538073
|
||||
uri: huggingface://bartowski/NousResearch_Hermes-4-14B-GGUF/NousResearch_Hermes-4-14B-Q4_K_M.gguf
|
||||
- !!merge <<: *qwen3
|
||||
name: "minicpm-v-4_5"
|
||||
license: apache-2.0
|
||||
icon: https://avatars.githubusercontent.com/u/89920203
|
||||
urls:
|
||||
- https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf
|
||||
- https://huggingface.co/openbmb/MiniCPM-V-4_5
|
||||
description: |
|
||||
MiniCPM-V 4.5 is the latest and most capable model in the MiniCPM-V series. The model is built on Qwen3-8B and SigLIP2-400M with a total of 8B parameters.
|
||||
tags:
|
||||
- llm
|
||||
- multimodal
|
||||
- gguf
|
||||
- gpu
|
||||
- qwen3
|
||||
- cpu
|
||||
overrides:
|
||||
mmproj: minicpm-v-4_5-mmproj-f16.gguf
|
||||
parameters:
|
||||
model: minicpm-v-4_5-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: minicpm-v-4_5-Q4_K_M.gguf
|
||||
sha256: c1c3c33100b15b4caf7319acce4e23c0eb0ce1cbd12f70e8d24f05aa67b7512f
|
||||
uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/ggml-model-Q4_K_M.gguf
|
||||
- filename: minicpm-v-4_5-mmproj-f16.gguf
|
||||
uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/mmproj-model-f16.gguf
|
||||
sha256: 7a7225a32e8d453aaa3d22d8c579b5bf833c253f784cdb05c99c9a76fd616df8
|
||||
- !!merge <<: *qwen3
|
||||
name: "aquif-ai_aquif-3.5-8b-think"
|
||||
urls:
|
||||
- https://huggingface.co/aquif-ai/aquif-3.5-8B-Think
|
||||
- https://huggingface.co/bartowski/aquif-ai_aquif-3.5-8B-Think-GGUF
|
||||
description: |
|
||||
The aquif-3.5 series is the successor to aquif-3, featuring a simplified naming scheme, expanded Mixture of Experts (MoE) options, and across-the-board performance improvements. This release streamlines model selection while delivering enhanced capabilities across reasoning, multilingual support, and general intelligence tasks.
|
||||
An experimental small-scale Mixture of Experts model designed for multilingual applications with minimal computational overhead. Despite its compact active parameter count, it demonstrates competitive performance against larger dense models.
|
||||
overrides:
|
||||
parameters:
|
||||
model: aquif-ai_aquif-3.5-8B-Think-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: aquif-ai_aquif-3.5-8B-Think-Q4_K_M.gguf
|
||||
sha256: 9e49b9c840de23bb3eb181ba7a102706c120b3e3d006983c3f14ebae307ff02e
|
||||
uri: huggingface://bartowski/aquif-ai_aquif-3.5-8B-Think-GGUF/aquif-ai_aquif-3.5-8B-Think-Q4_K_M.gguf
|
||||
- !!merge <<: *qwen3
|
||||
name: "qwen3-stargate-sg1-uncensored-abliterated-8b-i1"
|
||||
icon: https://huggingface.co/DavidAU/Qwen3-Stargate-SG1-Uncensored-Abliterated-8B/resolve/main/sg1.jpg
|
||||
urls:
|
||||
- https://huggingface.co/DavidAU/Qwen3-Stargate-SG1-Uncensored-Abliterated-8B
|
||||
- https://huggingface.co/mradermacher/Qwen3-Stargate-SG1-Uncensored-Abliterated-8B-i1-GGUF
|
||||
description: |
|
||||
This repo contains the full precision source code, in "safe tensors" format to generate GGUFs, GPTQ, EXL2, AWQ, HQQ and other formats. The source code can also be used directly.
|
||||
|
||||
This model is specifically for SG1 (Stargate Series), science fiction, story generation (all genres) but also does coding and general tasks too.
|
||||
|
||||
This model can also be used for Role play.
|
||||
|
||||
This model will produce uncensored content (see notes below).
|
||||
|
||||
Fine tune (6 epochs, using Unsloth for Win 11) on an inhouse generated dataset to simulate / explore the Stargate SG1 Universe.
|
||||
|
||||
This version has the "canon" of all 10 seasons of SG1.
|
||||
|
||||
The model also contains, but was not trained on, content from Stargate Atlantis and Universe.
|
||||
|
||||
The fine-tune process adds knowledge to the model and alters all aspects of its operations.
|
||||
|
||||
Float32 (32 bit precision) was used to further increase the model's quality.
|
||||
|
||||
This model is based on "Goekdeniz-Guelmez/Josiefied-Qwen3-8B-abliterated-v1".
|
||||
|
||||
Example generations at the bottom of this page.
|
||||
|
||||
This is a Stargate (SG1) fine tune (1,331,953,664 of 9,522,689,024 (13.99% trained)), SIX epochs on this model.
|
||||
As this is an instruct model, it will also benefit from a detailed system prompt too.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Qwen3-Stargate-SG1-Uncensored-Abliterated-8B.i1-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Qwen3-Stargate-SG1-Uncensored-Abliterated-8B.i1-Q4_K_M.gguf
|
||||
sha256: 31ec697ccebbd7928c49714b8a0ec8be747be0f7c1ad71627967d2f8fe376990
|
||||
uri: huggingface://mradermacher/Qwen3-Stargate-SG1-Uncensored-Abliterated-8B-i1-GGUF/Qwen3-Stargate-SG1-Uncensored-Abliterated-8B.i1-Q4_K_M.gguf
|
||||
- !!merge <<: *qwen3
|
||||
url: "github:mudler/LocalAI/gallery/qwen3-deepresearch.yaml@master"
|
||||
name: "alibaba-nlp_tongyi-deepresearch-30b-a3b"
|
||||
urls:
|
||||
- https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B
|
||||
- https://huggingface.co/bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF
|
||||
description: |
|
||||
We present Tongyi DeepResearch, an agentic large language model featuring 30 billion total parameters, with only 3 billion activated per token. Developed by Tongyi Lab, the model is specifically designed for long-horizon, deep information-seeking tasks. Tongyi-DeepResearch demonstrates state-of-the-art performance across a range of agentic search benchmarks, including Humanity's Last Exam, BrowserComp, BrowserComp-ZH, WebWalkerQA, GAIA, xbench-DeepSearch and FRAMES.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf
|
||||
sha256: 1afefb3b369ea2de191f24fe8ea22cbbb7b412357902f27bd81d693dde35c2d9
|
||||
uri: huggingface://bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf
|
||||
- &gemma3
|
||||
url: "github:mudler/LocalAI/gallery/gemma.yaml@master"
|
||||
name: "gemma-3-27b-it"
|
||||
@@ -7430,6 +7579,40 @@
|
||||
- filename: Qwentile2.5-32B-Instruct-Q4_K_M.gguf
|
||||
sha256: e476d6e3c15c78fc3f986d7ae8fa35c16116843827f2e6243c05767cef2f3615
|
||||
uri: huggingface://bartowski/Qwentile2.5-32B-Instruct-GGUF/Qwentile2.5-32B-Instruct-Q4_K_M.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "websailor-32b"
|
||||
urls:
|
||||
- https://huggingface.co/Alibaba-NLP/WebSailor-32B
|
||||
- https://huggingface.co/mradermacher/WebSailor-32B-GGUF
|
||||
description: |
|
||||
WebSailor is a complete post-training methodology designed to teach LLM agents sophisticated reasoning for complex web navigation and information-seeking tasks. It addresses the challenge of extreme uncertainty in vast information landscapes, a capability where previous open-source models lagged behind proprietary systems.
|
||||
We classify information-seeking tasks into three difficulty levels, where Level 3 represents problems with both high uncertainty and a complex, non-linear path to a solution. To generate these challenging tasks, we introduce SailorFog-QA, a novel data synthesis pipeline that constructs intricate knowledge graphs and then applies information obfuscation. This process creates questions with high initial uncertainty that demand creative exploration and transcend simple, structured reasoning patterns.
|
||||
Our training process begins by generating expert trajectories and then reconstructing the reasoning to create concise, action-oriented supervision signals, avoiding the stylistic and verbosity issues of teacher models. The agent is first given a "cold start" using rejection sampling fine-tuning (RFT) on a small set of high-quality examples to establish a baseline capability. This is followed by an efficient agentic reinforcement learning stage using our Duplicating Sampling Policy Optimization (DUPO) algorithm, which refines the agent's exploratory strategies.
|
||||
WebSailor establishes a new state-of-the-art for open-source agents, achieving outstanding results on difficult benchmarks like BrowseComp-en and BrowseComp-zh. Notably, our smaller models like WebSailor-7B outperform agents built on much larger backbones, highlighting the efficacy of our training paradigm. Ultimately, WebSailor closes the performance gap to proprietary systems, achieving results on par with agents like Doubao-Search.
|
||||
overrides:
|
||||
parameters:
|
||||
model: WebSailor-32B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: WebSailor-32B.Q4_K_M.gguf
|
||||
sha256: 60cea732b8314cedf1807530857b4ebd9f6c41431b3223384eb7f94fbff7b5bc
|
||||
uri: huggingface://mradermacher/WebSailor-32B-GGUF/WebSailor-32B.Q4_K_M.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "websailor-7b"
|
||||
urls:
|
||||
- https://huggingface.co/Alibaba-NLP/WebSailor-7B
|
||||
- https://huggingface.co/mradermacher/WebSailor-7B-GGUF
|
||||
description: |
|
||||
WebSailor is a complete post-training methodology designed to teach LLM agents sophisticated reasoning for complex web navigation and information-seeking tasks. It addresses the challenge of extreme uncertainty in vast information landscapes, a capability where previous open-source models lagged behind proprietary systems.
|
||||
We classify information-seeking tasks into three difficulty levels, where Level 3 represents problems with both high uncertainty and a complex, non-linear path to a solution. To generate these challenging tasks, we introduce SailorFog-QA, a novel data synthesis pipeline that constructs intricate knowledge graphs and then applies information obfuscation. This process creates questions with high initial uncertainty that demand creative exploration and transcend simple, structured reasoning patterns.
|
||||
Our training process begins by generating expert trajectories and then reconstructing the reasoning to create concise, action-oriented supervision signals, avoiding the stylistic and verbosity issues of teacher models. The agent is first given a "cold start" using rejection sampling fine-tuning (RFT) on a small set of high-quality examples to establish a baseline capability. This is followed by an efficient agentic reinforcement learning stage using our Duplicating Sampling Policy Optimization (DUPO) algorithm, which refines the agent's exploratory strategies.
|
||||
WebSailor establishes a new state-of-the-art for open-source agents, achieving outstanding results on difficult benchmarks like BrowseComp-en and BrowseComp-zh. Notably, our smaller models like WebSailor-7B outperform agents built on much larger backbones, highlighting the efficacy of our training paradigm. Ultimately, WebSailor closes the performance gap to proprietary systems, achieving results on par with agents like Doubao-Search.
|
||||
overrides:
|
||||
parameters:
|
||||
model: WebSailor-7B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: WebSailor-7B.Q4_K_M.gguf
|
||||
sha256: 6ede884af5d82176606c3af19a5cc90da6fdf81a520f54284084f5e012217a56
|
||||
uri: huggingface://mradermacher/WebSailor-7B-GGUF/WebSailor-7B.Q4_K_M.gguf
|
||||
- &archfunct
|
||||
license: apache-2.0
|
||||
tags:
|
||||
@@ -9829,6 +10012,119 @@
|
||||
- filename: baichuan-inc_Baichuan-M2-32B-Q4_K_M.gguf
|
||||
sha256: 51907419518e6f79c28f75e4097518e54c2efecd85cb4c714334395fa2d591c2
|
||||
uri: huggingface://bartowski/baichuan-inc_Baichuan-M2-32B-GGUF/baichuan-inc_Baichuan-M2-32B-Q4_K_M.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "k2-think-i1"
|
||||
icon: https://huggingface.co/LLM360/K2-Think/resolve/main/banner.png
|
||||
urls:
|
||||
- https://huggingface.co/LLM360/K2-Think
|
||||
- https://huggingface.co/mradermacher/K2-Think-i1-GGUF
|
||||
description: |
|
||||
K2-Think is a 32 billion parameter open-weights general reasoning model with strong performance in competitive mathematical problem solving.
|
||||
overrides:
|
||||
parameters:
|
||||
model: K2-Think.i1-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: K2-Think.i1-Q4_K_M.gguf
|
||||
sha256: 510fad18b0cf58059437338c1b5b982996ef89456a8d88da52eb3d50fe78b9fd
|
||||
uri: huggingface://mradermacher/K2-Think-i1-GGUF/K2-Think.i1-Q4_K_M.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "holo1.5-72b"
|
||||
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/677d3f355f847864bb644112/OQyAJ33sssiTDIQEQ7oH_.png
|
||||
urls:
|
||||
- https://huggingface.co/Hcompany/Holo1.5-72B
|
||||
- https://huggingface.co/mradermacher/Holo1.5-72B-GGUF
|
||||
description: |
|
||||
Computer Use (CU) agents are AI systems that can interact with real applications—web, desktop, and mobile—on behalf of a user. They can navigate interfaces, manipulate elements, and answer questions about content, enabling powerful automation and productivity tools. CU agents are becoming increasingly important as they allow humans to delegate complex digital tasks safely and efficiently.
|
||||
The Holo1.5 series provides state-of-the-art foundational models for building such agents. Holo1.5 models excel at user interface (UI) localization and UI-based question answering (QA) across web, computer, and mobile environments, with strong performance on benchmarks including Screenspot-V2, Screenspot-Pro, GroundUI-Web, Showdown, and our newly introduced WebClick.
|
||||
overrides:
|
||||
mmproj: Holo1.5-72B.mmproj-Q8_0.gguf
|
||||
parameters:
|
||||
model: Holo1.5-72B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Holo1.5-72B.Q4_K_M.gguf
|
||||
sha256: 3404347c245fefa352a3dc16134b5870f594ab8bff11e50582205b5538201a23
|
||||
uri: huggingface://mradermacher/Holo1.5-72B-GGUF/Holo1.5-72B.Q4_K_M.gguf
|
||||
- filename: Holo1.5-72B.mmproj-Q8_0.gguf
|
||||
sha256: f172cffc96a00d4f885eecffbc798912d37105f4191ba16a9947a5776b0f8a02
|
||||
uri: huggingface://mradermacher/Holo1.5-72B-GGUF/Holo1.5-72B.mmproj-Q8_0.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "holo1.5-7b"
|
||||
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/677d3f355f847864bb644112/OQyAJ33sssiTDIQEQ7oH_.png
|
||||
urls:
|
||||
- https://huggingface.co/Hcompany/Holo1.5-7B
|
||||
- https://huggingface.co/mradermacher/Holo1.5-7B-GGUF
|
||||
description: |
|
||||
Computer Use (CU) agents are AI systems that can interact with real applications—web, desktop, and mobile—on behalf of a user. They can navigate interfaces, manipulate elements, and answer questions about content, enabling powerful automation and productivity tools. CU agents are becoming increasingly important as they allow humans to delegate complex digital tasks safely and efficiently.
|
||||
The Holo1.5 series provides state-of-the-art foundational models for building such agents. Holo1.5 models excel at user interface (UI) localization and UI-based question answering (QA) across web, computer, and mobile environments, with strong performance on benchmarks including Screenspot-V2, Screenspot-Pro, GroundUI-Web, Showdown, and our newly introduced WebClick.
|
||||
overrides:
|
||||
mmproj: Holo1.5-7B.mmproj-Q8_0.gguf
|
||||
parameters:
|
||||
model: Holo1.5-7B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Holo1.5-7B.Q4_K_M.gguf
|
||||
sha256: 37d1c060b73b783ffdab8d70fa47a6cff46cd34b1cf44b5bfbf4f20ff99eacdd
|
||||
uri: huggingface://mradermacher/Holo1.5-7B-GGUF/Holo1.5-7B.Q4_K_M.gguf
|
||||
- filename: Holo1.5-7B.mmproj-Q8_0.gguf
|
||||
sha256: a9bad2d3d9241251b8753d9be4ea737c03197077d96153c1365a62db709489f6
|
||||
uri: huggingface://mradermacher/Holo1.5-7B-GGUF/Holo1.5-7B.mmproj-Q8_0.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "holo1.5-3b"
|
||||
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/677d3f355f847864bb644112/OQyAJ33sssiTDIQEQ7oH_.png
|
||||
urls:
|
||||
- https://huggingface.co/Hcompany/Holo1.5-3B
|
||||
- https://huggingface.co/mradermacher/Holo1.5-3B-GGUF
|
||||
description: |
|
||||
Computer Use (CU) agents are AI systems that can interact with real applications—web, desktop, and mobile—on behalf of a user. They can navigate interfaces, manipulate elements, and answer questions about content, enabling powerful automation and productivity tools. CU agents are becoming increasingly important as they allow humans to delegate complex digital tasks safely and efficiently.
|
||||
The Holo1.5 series provides state-of-the-art foundational models for building such agents. Holo1.5 models excel at user interface (UI) localization and UI-based question answering (QA) across web, computer, and mobile environments, with strong performance on benchmarks including Screenspot-V2, Screenspot-Pro, GroundUI-Web, Showdown, and our newly introduced WebClick.
|
||||
overrides:
|
||||
mmproj: Holo1.5-3B.mmproj-Q8_0.gguf
|
||||
parameters:
|
||||
model: Holo1.5-3B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Holo1.5-3B.Q4_K_M.gguf
|
||||
sha256: 5efb1318d439fe1f71e38825a17203c48ced7de4a5d0796427c8c638e817622a
|
||||
uri: huggingface://mradermacher/Holo1.5-3B-GGUF/Holo1.5-3B.Q4_K_M.gguf
|
||||
- filename: Holo1.5-3B.mmproj-Q8_0.gguf
|
||||
sha256: fb5cc798b386a4b680c306f061457cb16cc627c7d9ed401d660b8b940463142b
|
||||
uri: huggingface://mradermacher/Holo1.5-3B-GGUF/Holo1.5-3B.mmproj-Q8_0.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "webwatcher-7b"
|
||||
icon: https://huggingface.co/Alibaba-NLP/WebWatcher-7B/resolve/main/assets/webwatcher_logo.png
|
||||
urls:
|
||||
- https://huggingface.co/Alibaba-NLP/WebWatcher-7B
|
||||
- https://huggingface.co/mradermacher/WebWatcher-7B-GGUF
|
||||
description: |
|
||||
WebWatcher is a multimodal agent for deep research that possesses enhanced visual-language reasoning capabilities. Our work presents a unified framework that combines complex vision-language reasoning with multi-tool interaction.
|
||||
overrides:
|
||||
mmproj: WebWatcher-7B.mmproj-Q8_0.gguf
|
||||
parameters:
|
||||
model: WebWatcher-7B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: WebWatcher-7B.Q4_K_M.gguf
|
||||
sha256: 300c76a51de59552f997ee7ee78ec519620931dea15c655111633b96de1a47f2
|
||||
uri: huggingface://mradermacher/WebWatcher-7B-GGUF/WebWatcher-7B.Q4_K_M.gguf
|
||||
- filename: WebWatcher-7B.mmproj-Q8_0.gguf
|
||||
sha256: 841dc1bcc4f69ca864518d2c9a9a37b1815169d9bd061b054e091061124e4e62
|
||||
uri: huggingface://mradermacher/WebWatcher-7B-GGUF/WebWatcher-7B.mmproj-Q8_0.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "webwatcher-32b"
|
||||
icon: https://huggingface.co/Alibaba-NLP/WebWatcher-32B/resolve/main/assets/webwatcher_logo.png
|
||||
urls:
|
||||
- https://huggingface.co/Alibaba-NLP/WebWatcher-32B
|
||||
- https://huggingface.co/mradermacher/WebWatcher-32B-GGUF
|
||||
description: |
|
||||
WebWatcher is a multimodal agent for deep research that possesses enhanced visual-language reasoning capabilities. Our work presents a unified framework that combines complex vision-language reasoning with multi-tool interaction.
|
||||
overrides:
|
||||
mmproj: WebWatcher-32B.mmproj-Q8_0.gguf
|
||||
parameters:
|
||||
model: WebWatcher-32B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: WebWatcher-32B.Q4_K_M.gguf
|
||||
sha256: 6cd51d97b9451759a4ce4ec0c2048b36ff99fd9f83bb32cd9f06af6c5438c69b
|
||||
uri: huggingface://mradermacher/WebWatcher-32B-GGUF/WebWatcher-32B.Q4_K_M.gguf
|
||||
- filename: WebWatcher-32B.mmproj-Q8_0.gguf
|
||||
sha256: e8815515f71a959465cc62e08e0ef45d7d8592215139b34efece848552cb2327
|
||||
uri: huggingface://mradermacher/WebWatcher-32B-GGUF/WebWatcher-32B.mmproj-Q8_0.gguf
|
||||
- &llama31
|
||||
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1
|
||||
icon: https://avatars.githubusercontent.com/u/153379578
|
||||
@@ -20051,145 +20347,154 @@
|
||||
name: "whisper-base-q5_1"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-base-q5_1.bin
|
||||
model: ggml-base-q5_1.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-base-q5_1.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-base-q5_1.bin"
|
||||
- filename: "ggml-base-q5_1.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-base-q5_1.bin"
|
||||
sha256: 422f1ae452ade6f30a004d7e5c6a43195e4433bc370bf23fac9cc591f01a8898
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-base"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-base.bin
|
||||
model: ggml-base.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-base.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-base.bin"
|
||||
- filename: "ggml-base.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-base.bin"
|
||||
sha256: 60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-base-en-q5_1"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-base.en-q5_1.bin
|
||||
model: ggml-base.en-q5_1.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-base.en-q5_1.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin"
|
||||
- filename: "ggml-base.en-q5_1.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-base.en-q5_1.bin"
|
||||
sha256: 4baf70dd0d7c4247ba2b81fafd9c01005ac77c2f9ef064e00dcf195d0e2fdd2f
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-base-en"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-base.en.bin
|
||||
model: ggml-base.en.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-base.en.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-base.en.bin"
|
||||
- filename: "ggml-base.en.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-base.en.bin"
|
||||
sha256: a03779c86df3323075f5e796cb2ce5029f00ec8869eee3fdfb897afe36c6d002
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-large-q5_0"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-large-q5_0.bin
|
||||
model: ggml-large-q5_0.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-large-q5_0.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-large-q5_0.bin"
|
||||
- filename: "ggml-large-q5_0.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-large-q5_0.bin"
|
||||
sha256: 3a214837221e4530dbc1fe8d734f302af393eb30bd0ed046042ebf4baf70f6f2
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-medium-q5_0"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-medium-q5_0.bin
|
||||
model: ggml-medium-q5_0.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-medium-q5_0.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-medium-q5_0.bin"
|
||||
- filename: "ggml-medium-q5_0.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-medium-q5_0.bin"
|
||||
sha256: 19fea4b380c3a618ec4723c3eef2eb785ffba0d0538cf43f8f235e7b3b34220f
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-small-q5_1"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-small-q5_1.bin
|
||||
model: ggml-small-q5_1.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-small-q5_1.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-small-q5_1.bin"
|
||||
- filename: "ggml-small-q5_1.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-small-q5_1.bin"
|
||||
sha256: ae85e4a935d7a567bd102fe55afc16bb595bdb618e11b2fc7591bc08120411bb
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-small"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-small.bin
|
||||
model: ggml-small.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-small.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-small.bin"
|
||||
- filename: "ggml-small.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-small.bin"
|
||||
sha256: 1be3a9b2063867b937e64e2ec7483364a79917e157fa98c5d94b5c1fffea987b
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-small-en-tdrz"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-small.en-tdrz.bin
|
||||
files:
|
||||
- filename: "ggml-small.bin"
|
||||
uri: "huggingface://akashmjn/tinydiarize-whisper.cpp/ggml-small.en-tdrz.bin"
|
||||
sha256: ceac3ec06d1d98ef71aec665283564631055fd6129b79d8e1be4f9cc33cc54b4
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-small-en-q5_1"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-small.en-q5_1.bin
|
||||
model: ggml-small.en-q5_1.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-small.en-q5_1.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin"
|
||||
- filename: "ggml-small.en-q5_1.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-small.en-q5_1.bin"
|
||||
sha256: bfdff4894dcb76bbf647d56263ea2a96645423f1669176f4844a1bf8e478ad30
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-small"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-small.en.bin
|
||||
model: ggml-small.en.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-small.en.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-small.en.bin"
|
||||
- filename: "ggml-small.en.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-small.en.bin"
|
||||
sha256: c6138d6d58ecc8322097e0f987c32f1be8bb0a18532a3f88f734d1bbf9c41e5d
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-small-q5_1"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-small-q5_1.bin
|
||||
model: ggml-small-q5_1.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-small-q5_1.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-small-q5_1.bin"
|
||||
- filename: "ggml-small-q5_1.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-small-q5_1.bin"
|
||||
sha256: ae85e4a935d7a567bd102fe55afc16bb595bdb618e11b2fc7591bc08120411bb
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-tiny"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-tiny.bin
|
||||
model: ggml-tiny.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-tiny.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-tiny.bin"
|
||||
- filename: "ggml-tiny.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-tiny.bin"
|
||||
sha256: be07e048e1e599ad46341c8d2a135645097a538221678b7acdd1b1919c6e1b21
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-tiny-q5_1"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-tiny-q5_1.bin
|
||||
model: ggml-tiny-q5_1.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-tiny-q5_1.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-tiny-q5_1.bin"
|
||||
- filename: "ggml-tiny-q5_1.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-tiny-q5_1.bin"
|
||||
sha256: 818710568da3ca15689e31a743197b520007872ff9576237bda97bd1b469c3d7
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-tiny-en-q5_1"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-tiny.en-q5_1.bin
|
||||
model: ggml-tiny.en-q5_1.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-tiny.en-q5_1.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin"
|
||||
- filename: "ggml-tiny.en-q5_1.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-tiny.en-q5_1.bin"
|
||||
sha256: c77c5766f1cef09b6b7d47f21b546cbddd4157886b3b5d6d4f709e91e66c7c2b
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-tiny-en"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-tiny.en.bin
|
||||
model: ggml-tiny.en.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-tiny.en.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-tiny.en.bin"
|
||||
- filename: "ggml-tiny.en.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-tiny.en.bin"
|
||||
sha256: 921e4cf8686fdd993dcd081a5da5b6c365bfde1162e72b08d75ac75289920b1f
|
||||
- !!merge <<: *whisper
|
||||
name: "whisper-tiny-en-q8_0"
|
||||
overrides:
|
||||
parameters:
|
||||
model: ggml-model-whisper-tiny.en-q8_0.bin
|
||||
model: ggml-tiny.en-q8_0.bin
|
||||
files:
|
||||
- filename: "ggml-model-whisper-tiny.en-q8_0.bin"
|
||||
uri: "https://ggml.ggerganov.com/ggml-model-whisper-tiny.en-q8_0.bin"
|
||||
- filename: "ggml-tiny.en-q8_0.bin"
|
||||
uri: "huggingface://ggerganov/whisper.cpp/ggml-tiny.en-q8_0.bin"
|
||||
sha256: 5bc2b3860aa151a4c6e7bb095e1fcce7cf12c7b020ca08dcec0c6d018bb7dd94
|
||||
## Bert embeddings (llama3.2 drop-in)
|
||||
- !!merge <<: *llama32
|
||||
|
||||
45
gallery/qwen3-deepresearch.yaml
Normal file
@@ -0,0 +1,45 @@
|
||||
---
name: "qwen3"

config_file: |
  mmap: true
  backend: "llama-cpp"
  template:
    chat_message: |
      <|im_start|>{{if eq .RoleName "tool" }}user{{else}}{{ .RoleName }}{{end}}
      {{ if eq .RoleName "tool" -}}
      <tool_response>
      {{ end -}}
      {{ if .Content -}}
      {{.Content }}
      {{ end -}}
      {{ if eq .RoleName "tool" -}}
      </tool_response>
      {{ end -}}
      {{ if .FunctionCall -}}
      <tool_call>
      {{toJson .FunctionCall}}
      </tool_call>
      {{ end -}}<|im_end|>
    function: |
      <|im_start|>system
      You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
      {{range .Functions}}
      {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
      {{end}}
      For each function call return a json object with function name and arguments
      <|im_end|>
      {{.Input -}}
      <|im_start|>assistant
    chat: |
      {{.Input -}}
      <|im_start|>assistant
    completion: |
      {{.Input}}
  context_size: 8192
  f16: true
  stopwords:
    - '<|im_end|>'
    - '<dummy32000>'
    - '</s>'
    - '<|endoftext|>'
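The chat_message template above controls how each conversation turn is serialized: tool responses are re-labelled as user turns and wrapped in <tool_response> tags, and function calls are emitted as JSON inside <tool_call> tags. A standalone sketch of how it renders, using Go's text/template with a plain JSON marshal standing in for the engine's toJson helper (the Message struct is a minimal assumption, not LocalAI's actual type):

package main

import (
    "encoding/json"
    "os"
    "text/template"
)

// Message carries just the fields the chat_message template references.
type Message struct {
    RoleName     string
    Content      string
    FunctionCall map[string]any
}

const chatMessage = `<|im_start|>{{if eq .RoleName "tool" }}user{{else}}{{ .RoleName }}{{end}}
{{ if eq .RoleName "tool" -}}
<tool_response>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
</tool_response>
{{ end -}}
{{ if .FunctionCall -}}
<tool_call>
{{toJson .FunctionCall}}
</tool_call>
{{ end -}}<|im_end|>`

func main() {
    tmpl := template.Must(template.New("chat").Funcs(template.FuncMap{
        "toJson": func(v any) string { // stand-in for the real helper
            b, _ := json.Marshal(v)
            return string(b)
        },
    }).Parse(chatMessage))

    // A tool response renders as a user turn wrapped in <tool_response> tags.
    _ = tmpl.Execute(os.Stdout, Message{RoleName: "tool", Content: `{"temp": 21}`})
}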
30 go.mod
@@ -41,26 +41,26 @@ require (
github.com/otiai10/copy v1.14.1
github.com/otiai10/openaigo v1.7.0
github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5
github.com/prometheus/client_golang v1.22.0
github.com/prometheus/client_golang v1.23.0
github.com/rs/zerolog v1.33.0
github.com/russross/blackfriday v1.6.0
github.com/sashabaranov/go-openai v1.26.2
github.com/schollz/progressbar/v3 v3.14.4
github.com/shirou/gopsutil/v3 v3.24.5
github.com/streamer45/silero-vad-go v0.2.1
github.com/stretchr/testify v1.10.0
github.com/swaggo/swag v1.16.3
github.com/stretchr/testify v1.11.1
github.com/swaggo/swag v1.16.6
github.com/testcontainers/testcontainers-go v0.35.0
github.com/tmc/langchaingo v0.1.13
github.com/valyala/fasthttp v1.55.0
go.opentelemetry.io/otel v1.35.0
go.opentelemetry.io/otel/exporters/prometheus v0.50.0
go.opentelemetry.io/otel/metric v1.35.0
go.opentelemetry.io/otel/sdk/metric v1.28.0
go.opentelemetry.io/otel v1.38.0
go.opentelemetry.io/otel/exporters/prometheus v0.60.0
go.opentelemetry.io/otel/metric v1.38.0
go.opentelemetry.io/otel/sdk/metric v1.38.0
google.golang.org/grpc v1.67.1
gopkg.in/yaml.v2 v2.4.0
gopkg.in/yaml.v3 v3.0.1
oras.land/oras-go/v2 v2.5.0
oras.land/oras-go/v2 v2.6.0
)

require (
@@ -90,6 +90,7 @@ require (
github.com/go-text/render v0.2.0 // indirect
github.com/go-text/typesetting v0.2.1 // indirect
github.com/godbus/dbus/v5 v5.1.0 // indirect
github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect
github.com/hack-pad/go-indexeddb v0.3.2 // indirect
github.com/hack-pad/safejs v0.1.0 // indirect
github.com/jeandeaual/go-locale v0.0.0-20250612000132-0ef82f21eade // indirect
@@ -129,6 +130,7 @@ require (
github.com/pion/transport/v3 v3.0.7 // indirect
github.com/pion/turn/v4 v4.0.2 // indirect
github.com/pion/webrtc/v4 v4.1.2 // indirect
github.com/prometheus/otlptranslator v0.0.2 // indirect
github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 // indirect
github.com/rymdport/portal v0.4.1 // indirect
github.com/savsgio/gotils v0.0.0-20240303185622-093b76447511 // indirect
@@ -144,7 +146,7 @@ require (
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/image v0.25.0 // indirect
golang.org/x/time v0.12.0 // indirect
google.golang.org/protobuf v1.36.7 // indirect
google.golang.org/protobuf v1.36.8 // indirect
)

require (
@@ -267,7 +269,7 @@ require (
github.com/multiformats/go-varint v0.0.7 // indirect
github.com/nwaples/rardecode v1.1.0 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/opencontainers/image-spec v1.1.0
github.com/opencontainers/image-spec v1.1.1
github.com/opentracing/opentracing-go v1.2.0 // indirect
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
@@ -279,8 +281,8 @@ require (
github.com/polydawn/refmt v0.89.0 // indirect
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.64.0 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/prometheus/common v0.65.0 // indirect
github.com/prometheus/procfs v0.17.0 // indirect
github.com/quic-go/qpack v0.5.1 // indirect
github.com/quic-go/quic-go v0.54.0 // indirect
github.com/quic-go/webtransport-go v0.9.0 // indirect
@@ -308,8 +310,8 @@ require (
github.com/yuin/goldmark-emoji v1.0.5 // indirect
github.com/yusufpapurcu/wmi v1.2.4 // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/otel/sdk v1.31.0 // indirect
go.opentelemetry.io/otel/trace v1.35.0 // indirect
go.opentelemetry.io/otel/sdk v1.38.0 // indirect
go.opentelemetry.io/otel/trace v1.38.0 // indirect
go.uber.org/dig v1.19.0 // indirect
go.uber.org/fx v1.24.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
60 go.sum
@@ -301,6 +301,8 @@ github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aN
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/gpustack/gguf-parser-go v0.17.0 h1:DkSziWLsiQM0pqqkr/zMcaBn94KY7iQTi4zmaHixDus=
github.com/gpustack/gguf-parser-go v0.17.0/go.mod h1:GvHh1Kvvq5ojCOsJ5UpwiJJmIjFw3Qk5cW7R+CZ3IJo=
github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc h1:GN2Lv3MGO7AS6PrRoT6yV5+wkrOpcszoIsO4+4ds248=
github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc/go.mod h1:+JKpmjMGhpgPL+rXZ5nsZieVzvarn86asRlBg4uNGnk=
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
github.com/grpc-ecosystem/grpc-gateway v1.5.0 h1:WcmKMm43DR7RdtlkEXQJyo5ws8iTp98CyhCCbOHMvNI=
github.com/grpc-ecosystem/grpc-gateway v1.5.0/go.mod h1:RSKVYQBd5MCa4OVpNdGskqpgL2+G+NZTnrVHpWWfpdw=
@@ -560,8 +562,8 @@ github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A=
github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM=
github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040=
github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M=
github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs=
github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc=
github.com/openzipkin/zipkin-go v0.1.1/go.mod h1:NtoC/o8u3JlF1lSlyPNswIbeQH9bJTmOf0Erfk+hxe8=
@@ -639,18 +641,20 @@ github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:Om
github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g=
github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U=
github.com/prometheus/client_golang v0.8.0/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q=
github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0=
github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc=
github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
github.com/prometheus/common v0.0.0-20180801064454-c7de2306084e/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
github.com/prometheus/common v0.64.0 h1:pdZeA+g617P7oGv1CzdTzyeShxAGrTBsolKNOLQPGO4=
github.com/prometheus/common v0.64.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8=
github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE=
github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8=
github.com/prometheus/otlptranslator v0.0.2 h1:+1CdeLVrRQ6Psmhnobldo0kTp96Rj80DRXRd5OSnMEQ=
github.com/prometheus/otlptranslator v0.0.2/go.mod h1:P8AwMgdD7XEr6QRUJ2QWLpiAZTgTE2UYgjlu3svompI=
github.com/prometheus/procfs v0.0.0-20180725123919-05ee40e3a273/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0=
github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw=
github.com/quic-go/qpack v0.5.1 h1:giqksBPnT/HDtZ6VhtFKgoLOWmlyo9Ei6u9PqzIMbhI=
github.com/quic-go/qpack v0.5.1/go.mod h1:+PC4XFrEskIVkcLzpEkbLqq1uCoxPhQuvK5rH1ZgaEg=
github.com/quic-go/quic-go v0.54.0 h1:6s1YB9QotYI6Ospeiguknbp2Znb/jZYjZLRXn9kMQBg=
@@ -754,12 +758,12 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/swaggo/files/v2 v2.0.0 h1:hmAt8Dkynw7Ssz46F6pn8ok6YmGZqHSVLZ+HQM7i0kw=
github.com/swaggo/files/v2 v2.0.0/go.mod h1:24kk2Y9NYEJ5lHuCra6iVwkMjIekMCaFq/0JQj66kyM=
github.com/swaggo/swag v1.16.3 h1:PnCYjPCah8FK4I26l2F/KQ4yz3sILcVUN3cTlBFA9Pg=
github.com/swaggo/swag v1.16.3/go.mod h1:DImHIuOFXKpMFAQjcC7FG4m3Dg4+QuUgUzJmKjI/gRk=
github.com/swaggo/swag v1.16.6 h1:qBNcx53ZaX+M5dxVyTrgQ0PJ/ACK+NzhwcbieTt+9yI=
github.com/swaggo/swag v1.16.6/go.mod h1:ngP2etMK5a0P3QBizic5MEwpRmluJZPHjXcMoj4Xesg=
github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07/go.mod h1:kDXzergiv9cbyO7IOYJZWg1U88JhDg3PB6klq9Hg2pA=
github.com/testcontainers/testcontainers-go v0.35.0 h1:uADsZpTKFAtp8SLK+hMwSaa+X+JiERHtd4sQAFmXeMo=
github.com/testcontainers/testcontainers-go v0.35.0/go.mod h1:oEVBj5zrfJTrgjwONs1SsRbnBtH9OKl+IGl3UMcr2B4=
@@ -820,22 +824,22 @@ go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJyS
go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.56.0 h1:UP6IpuHFkUgOQL9FFQFrZ+5LiwhhYRbi7VZSIx6Nj5s=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.56.0/go.mod h1:qxuZLtbq5QDtdeSHsS7bcf6EH6uO6jUAgk764zd3rhM=
go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ=
go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y=
go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8=
go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.31.0 h1:K0XaT3DwHAcV4nKLzcQvwAgSyisUghWoY20I7huthMk=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.31.0/go.mod h1:B5Ki776z/MBnVha1Nzwp5arlzBbE3+1jk+pGmaP5HME=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.31.0 h1:lUsI2TYsQw2r1IASwoROaCnjdj2cvC2+Jbxvk6nHnWU=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.31.0/go.mod h1:2HpZxxQurfGxJlJDblybejHB6RX6pmExPNe517hREw4=
go.opentelemetry.io/otel/exporters/prometheus v0.50.0 h1:2Ewsda6hejmbhGFyUvWZjUThC98Cf8Zy6g0zkIimOng=
go.opentelemetry.io/otel/exporters/prometheus v0.50.0/go.mod h1:pMm5PkUo5YwbLiuEf7t2xg4wbP0/eSJrMxIMxKosynY=
go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M=
go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE=
go.opentelemetry.io/otel/sdk v1.31.0 h1:xLY3abVHYZ5HSfOg3l2E5LUj2Cwva5Y7yGxnSW9H5Gk=
go.opentelemetry.io/otel/sdk v1.31.0/go.mod h1:TfRbMdhvxIIr/B2N2LQW2S5v9m3gOQ/08KsbbO5BPT0=
go.opentelemetry.io/otel/sdk/metric v1.28.0 h1:OkuaKgKrgAbYrrY0t92c+cC+2F6hsFNnCQArXCKlg08=
go.opentelemetry.io/otel/sdk/metric v1.28.0/go.mod h1:cWPjykihLAPvXKi4iZc1dpER3Jdq2Z0YLse3moQUCpg=
go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs=
go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc=
go.opentelemetry.io/otel/exporters/prometheus v0.60.0 h1:cGtQxGvZbnrWdC2GyjZi0PDKVSLWP/Jocix3QWfXtbo=
go.opentelemetry.io/otel/exporters/prometheus v0.60.0/go.mod h1:hkd1EekxNo69PTV4OWFGZcKQiIqg0RfuWExcPKFvepk=
go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA=
go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI=
go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E=
go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg=
go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM=
go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA=
go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE=
go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs=
go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0=
go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8=
go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ=
@@ -1066,8 +1070,8 @@ google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A=
google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
@@ -1099,8 +1103,8 @@ howett.net/plist v1.0.2-0.20250314012144-ee69052608d9 h1:eeH1AIcPvSc0Z25ThsYF+Xo
howett.net/plist v1.0.2-0.20250314012144-ee69052608d9/go.mod h1:fyFX5Hj5tP1Mpk8obqA9MZgXT416Q5711SDT7dQLTLk=
lukechampine.com/blake3 v1.4.1 h1:I3Smz7gso8w4/TunLKec6K2fn+kyKtDxr/xcQEN84Wg=
lukechampine.com/blake3 v1.4.1/go.mod h1:QFosUxmjB8mnrWFSNwKmvxHpfY72bmD2tQ0kBMM3kwo=
oras.land/oras-go/v2 v2.5.0 h1:o8Me9kLY74Vp5uw07QXPiitjsw7qNXi8Twd+19Zf02c=
oras.land/oras-go/v2 v2.5.0/go.mod h1:z4eisnLP530vwIOUOJeBIj0aGI0L1C3d53atvCBqZHg=
oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc=
oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o=
sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo=
sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8=
sourcegraph.com/sourcegraph/go-diff v0.5.0/go.mod h1:kuch7UrkMzY0X+p9CRK03kfuPQ2zzQcaEFbx8wA8rck=
@@ -23,10 +23,10 @@ var ErrUnsafeFilesFound = errors.New("unsafe files found")

func HuggingFaceScan(uri URI) (*HuggingFaceScanResult, error) {
    cleanParts := strings.Split(uri.ResolveURL(), "/")
    if len(cleanParts) <= 4 || cleanParts[2] != "huggingface.co" {
    if len(cleanParts) <= 4 || cleanParts[2] != "huggingface.co" && cleanParts[2] != HF_ENDPOINT {
        return nil, ErrNonHuggingFaceFile
    }
    results, err := http.Get(fmt.Sprintf("https://huggingface.co/api/models/%s/%s/scan", cleanParts[3], cleanParts[4]))
    results, err := http.Get(fmt.Sprintf("%s/api/models/%s/%s/scan", HF_ENDPOINT, cleanParts[3], cleanParts[4]))
    if err != nil {
        return nil, err
    }

@@ -37,6 +37,17 @@ const (

type URI string

// HF_ENDPOINT is the HuggingFace endpoint, can be overridden by setting the HF_ENDPOINT environment variable.
var HF_ENDPOINT string = loadConfig()

func loadConfig() string {
    HF_ENDPOINT := os.Getenv("HF_ENDPOINT")
    if HF_ENDPOINT == "" {
        HF_ENDPOINT = "https://huggingface.co"
    }
    return HF_ENDPOINT
}

func (uri URI) DownloadWithCallback(basePath string, f func(url string, i []byte) error) error {
    return uri.DownloadWithAuthorizationAndCallback(basePath, "", f)
}
@@ -213,7 +224,7 @@ func (s URI) ResolveURL() string {
        filepath = strings.Split(filepath, "@")[0]
    }

    return fmt.Sprintf("https://huggingface.co/%s/%s/resolve/%s/%s", owner, repo, branch, filepath)
    return fmt.Sprintf("%s/%s/%s/resolve/%s/%s", HF_ENDPOINT, owner, repo, branch, filepath)
}

    return string(s)
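The net effect of the HF_ENDPOINT change is that every resolved HuggingFace URL can be redirected to a mirror by setting one environment variable. A minimal sketch of the observable behavior (the mirror hostname and model coordinates are illustrative):

package main

import (
    "fmt"
    "os"
)

// resolve mirrors the ResolveURL change above: the scheme and host now come
// from HF_ENDPOINT instead of being hardcoded to https://huggingface.co.
func resolve(endpoint, owner, repo, branch, path string) string {
    return fmt.Sprintf("%s/%s/%s/resolve/%s/%s", endpoint, owner, repo, branch, path)
}

func main() {
    // Same fallback as loadConfig: the public hub when HF_ENDPOINT is unset.
    endpoint := os.Getenv("HF_ENDPOINT")
    if endpoint == "" {
        endpoint = "https://huggingface.co"
    }
    // With HF_ENDPOINT=https://hf-mirror.example.com (a hypothetical mirror),
    // the same model reference resolves against the mirror instead.
    fmt.Println(resolve(endpoint, "ggerganov", "whisper.cpp", "main", "ggml-tiny.bin"))
}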
@@ -1,3 +1,5 @@
// Package system provides system detection utilities, including GPU/vendor detection
// and capability classification used to select optimal backends at runtime.
package system

import (
@@ -116,3 +118,25 @@ func detectGPUVendor(gpus []*gpu.GraphicsCard) (string, error) {

    return "", nil
}

// BackendPreferenceTokens returns a list of substrings that represent the preferred
// backend implementation order for the current system capability. Callers can use
// these tokens to select the most appropriate concrete backend among multiple
// candidates sharing the same alias (e.g., "llama-cpp").
func (s *SystemState) BackendPreferenceTokens() []string {
    capStr := strings.ToLower(s.getSystemCapabilities())
    switch {
    case strings.HasPrefix(capStr, nvidia):
        return []string{"cuda", "vulkan", "cpu"}
    case strings.HasPrefix(capStr, amd):
        return []string{"rocm", "hip", "vulkan", "cpu"}
    case strings.HasPrefix(capStr, intel):
        return []string{"sycl", "intel", "cpu"}
    case strings.HasPrefix(capStr, metal):
        return []string{"metal", "cpu"}
    case strings.HasPrefix(capStr, darwinX86):
        return []string{"darwin-x86", "cpu"}
    default:
        return []string{"cpu"}
    }
}
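A plausible consumer of these preference tokens scans candidate backend names in token order and takes the first match; the candidate names below are illustrative, not actual gallery backend IDs:

package main

import (
    "fmt"
    "strings"
)

// pickBackend returns the first candidate whose name contains the
// highest-priority token, i.e. the selection scheme the
// BackendPreferenceTokens doc comment describes for backends sharing one alias.
func pickBackend(candidates, tokens []string) string {
    for _, tok := range tokens {
        for _, c := range candidates {
            if strings.Contains(c, tok) {
                return c
            }
        }
    }
    return ""
}

func main() {
    // Hypothetical concrete backends sharing the "llama-cpp" alias.
    candidates := []string{"llama-cpp-cuda12", "llama-cpp-vulkan", "llama-cpp-cpu"}
    // Token order as returned for an NVIDIA system.
    tokens := []string{"cuda", "vulkan", "cpu"}
    fmt.Println(pickBackend(candidates, tokens)) // prints llama-cpp-cuda12
}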
@@ -169,6 +169,30 @@ var _ = Describe("E2E test", func() {
            Expect(err).ToNot(HaveOccurred())
            Expect(len(resp.Data)).To(Equal(1), fmt.Sprint(resp))
            Expect(resp.Data[0].Embedding).ToNot(BeEmpty())

            resp2, err := client.CreateEmbeddings(context.TODO(),
                openai.EmbeddingRequestStrings{
                    Input: []string{"cat"},
                    Model: openai.AdaEmbeddingV2,
                },
            )
            Expect(err).ToNot(HaveOccurred())
            Expect(len(resp2.Data)).To(Equal(1), fmt.Sprint(resp))
            Expect(resp2.Data[0].Embedding).ToNot(BeEmpty())
            Expect(resp2.Data[0].Embedding).ToNot(Equal(resp.Data[0].Embedding))

            resp3, err := client.CreateEmbeddings(context.TODO(),
                openai.EmbeddingRequestStrings{
                    Input: []string{"doc", "cat"},
                    Model: openai.AdaEmbeddingV2,
                },
            )
            Expect(err).ToNot(HaveOccurred())
            Expect(len(resp3.Data)).To(Equal(2), fmt.Sprint(resp))
            Expect(resp3.Data[0].Embedding).ToNot(BeEmpty())
            Expect(resp3.Data[0].Embedding).To(Equal(resp.Data[0].Embedding))
            Expect(resp3.Data[1].Embedding).To(Equal(resp2.Data[0].Embedding))
            Expect(resp3.Data[0].Embedding).ToNot(Equal(resp3.Data[1].Embedding))
        })
    })
    Context("vision", func() {