Mirror of https://github.com/mudler/LocalAI.git (synced 2026-02-03 11:13:31 -05:00)

Compare commits: v2.22.0...fix/closed (1 commit)

| Author | SHA1 | Date |
|---|---|---|
| | 83110891fd | |
.github/ci/modelslist.go (vendored, 7 changes)

@@ -6,7 +6,6 @@ import (
 	"io/ioutil"
 	"os"
 
-	"github.com/microcosm-cc/bluemonday"
 	"gopkg.in/yaml.v3"
 )
 

@@ -280,12 +279,6 @@ func main() {
 		return
 	}
 
-	// Ensure that all arbitrary text content is sanitized before display
-	for i, m := range models {
-		models[i].Name = bluemonday.StrictPolicy().Sanitize(m.Name)
-		models[i].Description = bluemonday.StrictPolicy().Sanitize(m.Description)
-	}
-
 	// render the template
 	data := struct {
 		Models []*GalleryModel
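The sanitization pass dropped in the hunk above is easy to reproduce in isolation. A minimal, self-contained sketch of what it does, using the real bluemonday API (the GalleryModel struct here is a simplified stand-in for the generator's type):

```go
package main

import (
	"fmt"

	"github.com/microcosm-cc/bluemonday"
)

// Simplified stand-in for the generator's GalleryModel type.
type GalleryModel struct {
	Name        string
	Description string
}

func main() {
	models := []GalleryModel{
		{Name: "demo<script>alert(1)</script>", Description: "<b>desc</b>"},
	}
	// StrictPolicy strips all HTML, so model names and descriptions
	// render as inert text in the generated page.
	p := bluemonday.StrictPolicy()
	for i, m := range models {
		models[i].Name = p.Sanitize(m.Name)
		models[i].Description = p.Sanitize(m.Description)
	}
	fmt.Println(models[0].Name) // "demo": the script markup is stripped
}
```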
.github/workflows/deploy-explorer.yaml (vendored, 4 changes)

@@ -33,7 +33,7 @@ jobs:
         run: |
           CGO_ENABLED=0 make build-api
       - name: rm
-        uses: appleboy/ssh-action@v1.1.0
+        uses: appleboy/ssh-action@v1.0.3
         with:
           host: ${{ secrets.EXPLORER_SSH_HOST }}
           username: ${{ secrets.EXPLORER_SSH_USERNAME }}

@@ -53,7 +53,7 @@ jobs:
           rm: true
           target: ./local-ai
       - name: restarting
-        uses: appleboy/ssh-action@v1.1.0
+        uses: appleboy/ssh-action@v1.0.3
        with:
          host: ${{ secrets.EXPLORER_SSH_HOST }}
          username: ${{ secrets.EXPLORER_SSH_USERNAME }}
Dockerfile (36 changes)

@@ -9,8 +9,6 @@ FROM ${BASE_IMAGE} AS requirements-core
 USER root
 
 ARG GO_VERSION=1.22.6
-ARG CMAKE_VERSION=3.26.4
-ARG CMAKE_FROM_SOURCE=false
 ARG TARGETARCH
 ARG TARGETVARIANT
 

@@ -23,25 +21,13 @@ RUN apt-get update && \
         build-essential \
         ccache \
         ca-certificates \
-        curl libssl-dev \
+        cmake \
+        curl \
         git \
         unzip upx-ucl && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-# Install CMake (the version in 22.04 is too old)
-RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
-        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-    else
-        apt-get update && \
-        apt-get install -y \
-            cmake && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
 # Install Go
 RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
 ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin

@@ -202,8 +188,6 @@ FROM ${GRPC_BASE_IMAGE} AS grpc
 # This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
 ARG GRPC_MAKEFLAGS="-j4 -Otarget"
 ARG GRPC_VERSION=v1.65.0
-ARG CMAKE_FROM_SOURCE=false
-ARG CMAKE_VERSION=3.26.4
 
 ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
 

@@ -212,24 +196,12 @@ WORKDIR /build
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
         ca-certificates \
-        build-essential curl libssl-dev \
+        build-essential \
+        cmake \
         git && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-# Install CMake (the version in 22.04 is too old)
-RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
-        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-    else
-        apt-get update && \
-        apt-get install -y \
-            cmake && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
 # We install GRPC to a different prefix here so that we can copy in only the build artifacts later
 # saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
 # and running make install in the target container
Makefile (18 changes)

@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=96776405a17034dcfd53d3ddf5d142d34bdbb657
+CPPLLAMA_VERSION?=d5ed2b929d85bbd7dbeecb690880f07d9d7a6077
 
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp

@@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
 
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=fdbfb460ed546452a5d53611bba66d10d842e719
+WHISPER_CPP_VERSION?=ccc2547210e09e3a1785817383ab770389bb442b
 
 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp

@@ -470,13 +470,13 @@ run-e2e-image:
 
 run-e2e-aio: protogen-go
 	@echo 'Running e2e AIO tests'
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
 
 test-e2e:
 	@echo 'Running e2e tests'
 	BUILD_TYPE=$(BUILD_TYPE) \
 	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
 
 teardown-e2e:
 	rm -rf $(TEST_DIR) || true

@@ -484,24 +484,24 @@ teardown-e2e:
 
 test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
 
 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)
 
 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r $(TEST_PATHS)
 
 test-stablediffusion: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)
 
 test-stores: backend-assets/grpc/local-store
 	mkdir -p tests/integration/backend-assets/grpc
 	cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts $(TEST_FLAKES) -v -r tests/integration
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts 1 -v -r tests/integration
 
 test-container:
 	docker build --target requirements -t local-ai-test-container .
(C++ source; the file header row was lost in extraction. From the content, these hunks appear to be LocalAI's llama.cpp gRPC server backend.)

@@ -113,7 +113,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
     std::string ret;
     for (; begin != end; ++begin)
     {
-        ret += common_token_to_piece(ctx, *begin);
+        ret += llama_token_to_piece(ctx, *begin);
     }
     return ret;
 }

@@ -121,7 +121,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
+    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
     // if the size is 1 and first bit is 1, meaning it's a partial character
     // (size > 1 meaning it's already a known token)
     if (out.size() == 1 && (out[0] & 0x80) == 0x80)

@@ -203,8 +203,8 @@ struct llama_client_slot
     std::string stopping_word;
 
     // sampling
-    struct common_sampler_params sparams;
-    common_sampler *ctx_sampling = nullptr;
+    struct gpt_sampler_params sparams;
+    gpt_sampler *ctx_sampling = nullptr;
 
     int32_t ga_i = 0;   // group-attention state
     int32_t ga_n = 1;   // group-attention factor

@@ -257,7 +257,7 @@ struct llama_client_slot
         images.clear();
     }
 
-    bool has_budget(common_params &global_params) {
+    bool has_budget(gpt_params &global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1)
         {
             return true; // limitless

@@ -398,7 +398,7 @@ struct llama_server_context
 
     clip_ctx *clp_ctx = nullptr;
 
-    common_params params;
+    gpt_params params;
 
     llama_batch batch;
 

@@ -441,7 +441,7 @@ struct llama_server_context
         }
     }
 
-    bool load_model(const common_params &params_)
+    bool load_model(const gpt_params &params_)
     {
         params = params_;
         if (!params.mmproj.empty()) {

@@ -458,9 +458,9 @@ struct llama_server_context
             }
         }
 
-        common_init_result common_init = common_init_from_params(params);
-        model = common_init.model;
-        ctx = common_init.context;
+        llama_init_result llama_init = llama_init_from_gpt_params(params);
+        model = llama_init.model;
+        ctx = llama_init.context;
         if (model == nullptr)
         {
             LOG_ERR("unable to load model: %s", params.model.c_str());

@@ -578,12 +578,12 @@ struct llama_server_context
                 std::vector<llama_token> p;
                 if (first)
                 {
-                    p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+                    p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                     first = false;
                 }
                 else
                 {
-                    p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
+                    p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                 }
                 prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
             }

@@ -600,7 +600,7 @@ struct llama_server_context
         else
         {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
         }
 
         return prompt_tokens;

@@ -629,7 +629,7 @@ struct llama_server_context
 
     bool launch_slot_with_data(llama_client_slot* &slot, json data) {
         slot_params default_params;
-        common_sampler_params default_sparams;
+        gpt_sampler_params default_sparams;
 
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);

@@ -769,7 +769,7 @@ struct llama_server_context
                     }
                     else if (el[0].is_string())
                     {
-                        auto toks = common_tokenize(model, el[0].get<std::string>(), false);
+                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
                         for (auto tok : toks)
                         {
                             slot->sparams.logit_bias.push_back({tok, bias});

@@ -801,7 +801,7 @@ struct llama_server_context
                     sampler_names.emplace_back(name);
                 }
             }
-            slot->sparams.samplers = common_sampler_types_from_names(sampler_names, false);
+            slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
         }
         else
         {

@@ -885,9 +885,9 @@ struct llama_server_context
 
         if (slot->ctx_sampling != nullptr)
         {
-            common_sampler_free(slot->ctx_sampling);
+            gpt_sampler_free(slot->ctx_sampling);
         }
-        slot->ctx_sampling = common_sampler_init(model, slot->sparams);
+        slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
         //llama_set_rng_seed(ctx, slot->params.seed);
         slot->command = LOAD_PROMPT;

@@ -914,13 +914,13 @@ struct llama_server_context
         system_tokens.clear();
 
         if (!system_prompt.empty()) {
-            system_tokens = common_tokenize(ctx, system_prompt, add_bos_token);
+            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
 
-            common_batch_clear(batch);
+            llama_batch_clear(batch);
 
             for (int i = 0; i < (int)system_tokens.size(); ++i)
             {
-                common_batch_add(batch, system_tokens[i], i, { 0 }, false);
+                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
             }
 
             for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)

@@ -1009,7 +1009,7 @@ struct llama_server_context
 
    bool process_token(completion_token_output &result, llama_client_slot &slot) {
        // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = common_token_to_piece(ctx, result.tok);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok);
        slot.sampled = result.tok;
 
        // search stop word and delete it

@@ -1160,7 +1160,7 @@ struct llama_server_context
         samplers.reserve(slot.sparams.samplers.size());
         for (const auto & sampler : slot.sparams.samplers)
         {
-            samplers.emplace_back(common_sampler_type_to_str(sampler));
+            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
         }
 
         return json {

@@ -1216,7 +1216,7 @@ struct llama_server_context
         if (slot.sparams.n_probs > 0)
         {
             std::vector<completion_token_output> probs_output = {};
-            const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
+            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
             size_t probs_pos      = std::min(slot.sent_token_probs_index,                       slot.generated_token_probs.size());
             size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
             if (probs_pos < probs_stop_pos)

@@ -1268,7 +1268,7 @@ struct llama_server_context
         std::vector<completion_token_output> probs = {};
         if (!slot.params.stream && slot.stopped_word)
         {
-            const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
+            const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
             probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
         }
         else

@@ -1408,7 +1408,7 @@ struct llama_server_context
                 }
                 image_idx++;
 
-                common_batch_clear(batch);
+                llama_batch_clear(batch);
 
                 // append prefix of next image
                 const auto json_prompt = (image_idx >= (int) slot.images.size()) ?

@@ -1418,7 +1418,7 @@ struct llama_server_context
                 std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
                 for (int i = 0; i < (int) append_tokens.size(); ++i)
                 {
-                    common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
+                    llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
                     slot.n_past += 1;
                 }
             }

@@ -1550,7 +1550,7 @@ struct llama_server_context
             update_system_prompt();
         }
 
-        common_batch_clear(batch);
+        llama_batch_clear(batch);
 
         if (all_slots_are_idle)
         {

@@ -1628,7 +1628,7 @@ struct llama_server_context
 
             // TODO: we always have to take into account the "system_tokens"
            //       this is not great and needs to be improved somehow
-            common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
             slot.n_past += 1;
         }
 

@@ -1722,7 +1722,7 @@ struct llama_server_context
 
                     if (!slot.params.cache_prompt)
                     {
-                        common_sampler_reset(slot.ctx_sampling);
+                        gpt_sampler_reset(slot.ctx_sampling);
 
                         slot.n_past    = 0;
                         slot.n_past_se = 0;

@@ -1734,7 +1734,7 @@ struct llama_server_context
                         // push the prompt into the sampling context (do not apply grammar)
                         for (auto &token : prompt_tokens)
                         {
-                            common_sampler_accept(slot.ctx_sampling, token, false);
+                            gpt_sampler_accept(slot.ctx_sampling, token, false);
                         }
 
                         slot.n_past = common_part(slot.cache_tokens, prompt_tokens);

@@ -1826,7 +1826,7 @@ struct llama_server_context
                             ga_i += ga_w/ga_n;
                         }
                     }
-                    common_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
+                    llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
                     slot_npast++;
                 }
 

@@ -1943,9 +1943,9 @@ struct llama_server_context
             }
 
             completion_token_output result;
-            const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
+            const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
 
-            common_sampler_accept(slot.ctx_sampling, id, true);
+            gpt_sampler_accept(slot.ctx_sampling, id, true);
 
             slot.n_decoded += 1;
             if (slot.n_decoded == 1)

@@ -1956,7 +1956,7 @@ struct llama_server_context
             }
 
             result.tok = id;
-            const auto * cur_p = common_sampler_get_candidates(slot.ctx_sampling);
+            const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
 
             for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
                 result.probs.push_back({

@@ -2009,7 +2009,7 @@ static json format_partial_response(
 struct token_translator
 {
     llama_context * ctx;
-    std::string operator()(llama_token tok)                    const { return common_token_to_piece(ctx, tok); }
+    std::string operator()(llama_token tok)                    const { return llama_token_to_piece(ctx, tok); }
     std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
 };
 

@@ -2203,7 +2203,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 // }
 
 static void params_parse(const backend::ModelOptions* request,
-                                common_params & params) {
+                                gpt_params & params) {
    
     // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
 

@@ -2311,7 +2311,7 @@ public:
 
     grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
         // Implement LoadModel RPC
-        common_params params;
+        gpt_params params;
         params_parse(request, params);
 
         llama_backend_init();
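Taken together, the C++ hunks above are one mechanical API rename that tracks the pinned llama.cpp revision (see the CPPLLAMA_VERSION change in the Makefile). Every pair maps one-to-one; nothing else in the server logic changes:

| Removed (newer llama.cpp) | Added (older llama.cpp) |
|---|---|
| `common_params` | `gpt_params` |
| `common_sampler`, `common_sampler_params` | `gpt_sampler`, `gpt_sampler_params` |
| `common_sampler_init/free/reset/accept/sample` | `gpt_sampler_init/free/reset/accept/sample` |
| `common_sampler_get_candidates`, `common_sampler_type_to_str`, `common_sampler_types_from_names` | `gpt_sampler_get_candidates`, `gpt_sampler_type_to_str`, `gpt_sampler_types_from_names` |
| `common_tokenize`, `common_token_to_piece` | `llama_tokenize`, `llama_token_to_piece` |
| `common_batch_clear`, `common_batch_add` | `llama_batch_clear`, `llama_batch_add` |
| `common_init_result` / `common_init_from_params` | `llama_init_result` / `llama_init_from_gpt_params` |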
(The same change appears in two backend test files; their file headers were lost in extraction.)

@@ -19,7 +19,7 @@ class TestBackendServicer(unittest.TestCase):
         This method sets up the gRPC service by starting the server
         """
         self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
-        time.sleep(30)
+        time.sleep(10)
 
     def tearDown(self) -> None:
         """

@@ -19,7 +19,7 @@ class TestBackendServicer(unittest.TestCase):
         This method sets up the gRPC service by starting the server
         """
         self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
-        time.sleep(30)
+        time.sleep(10)
 
     def tearDown(self) -> None:
         """
(Python backend source; file header lost in extraction.)

@@ -72,12 +72,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         Returns:
             A Result object that contains the result of the LoadModel operation.
         """
-        model_name = request.Model
-
-        # Check to see if the Model exists in the filesystem already.
-        if os.path.exists(request.ModelFile):
-            model_name = request.ModelFile
 
         compute = torch.float16
         if request.F16Memory == True:
(Shell install script, apparently the vLLM backend's; file header lost in extraction.)

@@ -13,9 +13,7 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
 	EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
 
-# We don't embed this into the images as it is a large dependency and not always needed.
-# Besides, the speed inference are not actually usable in the current state for production use-cases.
-if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE}" == "xtrue" ]; then
+if [ "x${BUILD_TYPE}" == "x" ]; then
 	ensureVenv
 	# https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
 	if [ ! -d vllm ]; then
(Go test file for the core HTTP API; file header lost in extraction.)

@@ -12,7 +12,6 @@ import (
 	"os"
 	"path/filepath"
 	"runtime"
-	"strings"
 
 	"github.com/mudler/LocalAI/core/config"
 	. "github.com/mudler/LocalAI/core/http"

@@ -951,7 +950,7 @@ var _ = Describe("API test", func() {
 				openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Choices) > 0).To(BeTrue())
-			Expect(strings.ToLower(resp.Choices[0].Message.Content)).To(Or(ContainSubstring("sure"), ContainSubstring("five")))
+			Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("Sure"), ContainSubstring("five")))
 
 			stream, err := client.CreateChatCompletionStream(context.TODO(), openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
 			Expect(err).ToNot(HaveOccurred())

@@ -970,7 +969,7 @@ var _ = Describe("API test", func() {
 				tokens++
 			}
 			Expect(text).ToNot(BeEmpty())
-			Expect(strings.ToLower(text)).To(Or(ContainSubstring("sure"), ContainSubstring("five")))
+			Expect(text).To(Or(ContainSubstring("Sure"), ContainSubstring("five")))
 
 			Expect(tokens).ToNot(Or(Equal(1), Equal(0)))
 		})
(Go source, HTML element builders for the model gallery UI; file header lost in extraction.)

@@ -6,7 +6,6 @@ import (
 
 	"github.com/chasefleming/elem-go"
 	"github.com/chasefleming/elem-go/attrs"
-	"github.com/microcosm-cc/bluemonday"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/p2p"
 	"github.com/mudler/LocalAI/core/services"

@@ -42,7 +41,7 @@ func DoneProgress(galleryID, text string, showDelete bool) string {
 				"tabindex":  "-1",
 				"autofocus": "",
 			},
-			elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
+			elem.Text(text),
 		),
 		elem.If(showDelete, deleteButton(galleryID, modelName), reInstallButton(galleryID)),
 	).Render()

@@ -58,7 +57,7 @@ func ErrorProgress(err, galleryName string) string {
 				"tabindex":  "-1",
 				"autofocus": "",
 			},
-			elem.Text("Error "+bluemonday.StrictPolicy().Sanitize(err)),
+			elem.Text("Error "+err),
 		),
 		installButton(galleryName),
 	).Render()

@@ -171,7 +170,7 @@ func P2PNodeBoxes(nodes []p2p.NodeData) string {
 				attrs.Props{
 					"class": "text-gray-200 font-semibold ml-2 mr-1",
 				},
-				elem.Text(bluemonday.StrictPolicy().Sanitize(n.ID)),
+				elem.Text(n.ID),
 			),
 			elem.Text("Status: "),
 			elem.If(

@@ -228,7 +227,7 @@ func StartProgressBar(uid, progress, text string) string {
 				"tabindex":  "-1",
 				"autofocus": "",
 			},
-			elem.Text(bluemonday.StrictPolicy().Sanitize(text)), //Perhaps overly defensive
+			elem.Text(text),
 			elem.Div(attrs.Props{
 				"hx-get":     "/browse/job/progress/" + uid,
 				"hx-trigger": "every 600ms",

@@ -250,7 +249,9 @@ func cardSpan(text, icon string) elem.Node {
 			"class": icon + " pr-2",
 		}),
-		elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
+		elem.Text(text),
+
+		//elem.Text(text),
 	)
 }

@@ -284,9 +285,11 @@ func searchableElement(text, icon string) elem.Node {
 				elem.I(attrs.Props{
 					"class": icon + " pr-2",
 				}),
-				elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
+				elem.Text(text),
 			),
 		),
+
+		//elem.Text(text),
 	)
 }

@@ -300,7 +303,7 @@ func link(text, url string) elem.Node {
 		elem.I(attrs.Props{
 			"class": "fas fa-link pr-2",
 		}),
-		elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
+		elem.Text(text),
 	)
 }
 func installButton(galleryName string) elem.Node {

@@ -384,13 +387,13 @@ func ListModels(models []*gallery.GalleryModel, processTracker ProcessTracker, g
 			attrs.Props{
 				"class": "mb-2 text-xl font-bold leading-tight",
 			},
-			elem.Text(bluemonday.StrictPolicy().Sanitize(m.Name)),
+			elem.Text(m.Name),
 		),
 		elem.P(
 			attrs.Props{
 				"class": "mb-4 text-sm [&:not(:hover)]:truncate text-base",
 			},
-			elem.Text(bluemonday.StrictPolicy().Sanitize(m.Description)),
+			elem.Text(m.Description),
 		),
 	)
 }
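The removed lines in each hunk above follow one pattern: sanitize a user-influenced string at the moment it becomes an HTML node. A small self-contained sketch of that sanitize-at-render idea, using the real elem-go and bluemonday APIs (this cardSpan is a simplified stand-in for the helper in the diff, not the project's actual code):

```go
package main

import (
	"fmt"

	"github.com/chasefleming/elem-go"
	"github.com/chasefleming/elem-go/attrs"
	"github.com/microcosm-cc/bluemonday"
)

// cardSpan mirrors the diff's pattern: text that may contain
// user-supplied content is sanitized right where it is turned
// into markup, not earlier in the pipeline.
func cardSpan(text, icon string) *elem.Element {
	return elem.Span(
		attrs.Props{"class": "inline-block rounded-full px-3 py-1"},
		elem.I(attrs.Props{"class": icon + " pr-2"}),
		elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
	)
}

func main() {
	// The <img onerror=...> payload is stripped before rendering.
	fmt.Println(cardSpan(`model<img src=x onerror=alert(1)>`, "fas fa-brain").Render())
}
```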
(Go source, core HTTP welcome endpoint; file header lost in extraction.)

@@ -13,10 +13,15 @@ import (
 func WelcomeEndpoint(appConfig *config.ApplicationConfig,
 	cl *config.BackendConfigLoader, ml *model.ModelLoader, modelStatus func() (map[string]string, map[string]string)) func(*fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
+		models, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		backendConfigs := cl.GetAllBackendConfigs()
 
 		galleryConfigs := map[string]*gallery.Config{}
+		modelsWithBackendConfig := map[string]interface{}{}
 
 		for _, m := range backendConfigs {
+			modelsWithBackendConfig[m.Name] = nil
+
 			cfg, err := gallery.GetLocalModelConfiguration(ml.ModelPath, m.Name)
 			if err != nil {
 				continue

@@ -24,15 +29,13 @@ func WelcomeEndpoint(appConfig *config.ApplicationConfig,
 			galleryConfigs[m.Name] = cfg
 		}
 
-		modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)
-
 		// Get model statuses to display in the UI the operation in progress
 		processingModels, taskTypes := modelStatus()
 
 		summary := fiber.Map{
 			"Title":         "LocalAI API - " + internal.PrintableVersion(),
 			"Version":       internal.PrintableVersion(),
-			"Models":        modelsWithoutConfig,
+			"Models":        models,
 			"ModelsConfig":  backendConfigs,
 			"GalleryConfig": galleryConfigs,
 			"IsP2PEnabled":  p2p.IsP2PEnabled(),
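The added modelsWithBackendConfig lines use a common Go idiom: a map with nil values acting as a set, where only key presence matters. A tiny runnable sketch of the idiom:

```go
package main

import "fmt"

func main() {
	// A map with nil interface values works as a cheap set:
	// presence of the key is the only information stored.
	modelsWithBackendConfig := map[string]interface{}{}
	for _, name := range []string{"gpt-4", "phi-2"} {
		modelsWithBackendConfig[name] = nil
	}

	// Membership test: the second return value reports presence,
	// so a nil value is still a hit.
	_, configured := modelsWithBackendConfig["phi-2"]
	fmt.Println(configured) // true
}
```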
(Go source, OpenAI assistants endpoints; file header lost in extraction.)

@@ -10,7 +10,6 @@ import (
 	"time"
 
 	"github.com/gofiber/fiber/v2"
-	"github.com/microcosm-cc/bluemonday"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services"

@@ -84,7 +83,7 @@ func CreateAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoad
 
 		if !modelExists(cl, ml, request.Model) {
 			log.Warn().Msgf("Model: %s was not found in list of models.", request.Model)
-			return c.Status(fiber.StatusBadRequest).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Model %q not found", request.Model)))
+			return c.Status(fiber.StatusBadRequest).SendString("Model " + request.Model + " not found")
 		}
 
 		if request.Tools == nil {

@@ -148,7 +147,7 @@ func ListAssistantsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoade
 		// Convert string limit to integer
 		limit, err := strconv.Atoi(limitQuery)
 		if err != nil {
-			return c.Status(http.StatusBadRequest).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Invalid limit query value: %s", limitQuery)))
+			return c.Status(http.StatusBadRequest).SendString(fmt.Sprintf("Invalid limit query value: %s", limitQuery))
 		}
 
 		// Sort assistants

@@ -289,7 +288,7 @@ func GetAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader,
 			}
 		}
 
-		return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant with id: %s", assistantID)))
+		return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant with id: %s", assistantID))
 	}
 }

@@ -338,11 +337,11 @@ func CreateAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.Model
 				}
 			}
 
-				return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find file_id: %s", request.FileID)))
+				return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find file_id: %s", request.FileID))
 			}
 		}
 
-		return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find %q", assistantID)))
+		return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find %q", assistantID))
 	}
 }

@@ -443,7 +442,7 @@ func ModifyAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoad
 				return c.Status(fiber.StatusOK).JSON(newAssistant)
 			}
 		}
-		return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant with id: %s", assistantID)))
+		return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant with id: %s", assistantID))
 	}
 }

@@ -514,9 +513,9 @@ func GetAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoa
 				if assistantFile.ID == fileId {
 					return c.Status(fiber.StatusOK).JSON(assistantFile)
 				}
-				return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant file with file_id: %s", fileId)))
+				return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant file with file_id: %s", fileId))
 			}
 		}
-		return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant file with assistant_id: %s", assistantID)))
+		return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant file with assistant_id: %s", assistantID))
 	}
 }
(Go source, OpenAI files endpoints; file header lost in extraction.)

@@ -8,7 +8,6 @@ import (
 	"sync/atomic"
 	"time"
 
-	"github.com/microcosm-cc/bluemonday"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"

@@ -50,7 +49,7 @@ func UploadFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.Appli
 
 		err = c.SaveFile(file, savePath)
 		if err != nil {
-			return c.Status(fiber.StatusInternalServerError).SendString("Failed to save file: " + bluemonday.StrictPolicy().Sanitize(err.Error()))
+			return c.Status(fiber.StatusInternalServerError).SendString("Failed to save file: " + err.Error())
 		}
 
 		f := schema.File{

@@ -122,7 +121,7 @@ func GetFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.Applicat
 	return func(c *fiber.Ctx) error {
 		file, err := getFileFromRequest(c)
 		if err != nil {
-			return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
+			return c.Status(fiber.StatusInternalServerError).SendString(err.Error())
 		}
 
 		return c.JSON(file)

@@ -144,14 +143,14 @@ func DeleteFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.Appli
 	return func(c *fiber.Ctx) error {
 		file, err := getFileFromRequest(c)
 		if err != nil {
-			return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
+			return c.Status(fiber.StatusInternalServerError).SendString(err.Error())
 		}
 
 		err = os.Remove(filepath.Join(appConfig.UploadDir, file.Filename))
 		if err != nil {
 			// If the file doesn't exist then we should just continue to remove it
 			if !errors.Is(err, os.ErrNotExist) {
-				return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to delete file: %s, %v", file.Filename, err)))
+				return c.Status(fiber.StatusInternalServerError).SendString(fmt.Sprintf("Unable to delete file: %s, %v", file.Filename, err))
 			}
 		}

@@ -181,12 +180,12 @@ func GetFilesContentsEndpoint(cm *config.BackendConfigLoader, appConfig *config.
 	return func(c *fiber.Ctx) error {
 		file, err := getFileFromRequest(c)
 		if err != nil {
-			return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
+			return c.Status(fiber.StatusInternalServerError).SendString(err.Error())
 		}
 
 		fileContents, err := os.ReadFile(filepath.Join(appConfig.UploadDir, file.Filename))
 		if err != nil {
-			return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
+			return c.Status(fiber.StatusInternalServerError).SendString(err.Error())
 		}
 
 		return c.Send(fileContents)
(Go source, API-key auth middleware; file header lost in extraction.)

@@ -7,7 +7,6 @@ import (
 	"github.com/dave-gray101/v2keyauth"
 	"github.com/gofiber/fiber/v2"
 	"github.com/gofiber/fiber/v2/middleware/keyauth"
-	"github.com/microcosm-cc/bluemonday"
 	"github.com/mudler/LocalAI/core/config"
 )

@@ -39,7 +38,7 @@ func getApiKeyErrorHandler(applicationConfig *config.ApplicationConfig) fiber.Er
 		if applicationConfig.OpaqueErrors {
 			return ctx.SendStatus(403)
 		}
-		return ctx.Status(403).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
+		return ctx.Status(403).SendString(err.Error())
 	}
 	if applicationConfig.OpaqueErrors {
 		return ctx.SendStatus(500)
(Go source, UI routes; file header lost in extraction.)

@@ -6,7 +6,6 @@ import (
 	"sort"
 	"strings"
 
-	"github.com/microcosm-cc/bluemonday"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/http/elements"

@@ -172,7 +171,7 @@ func RegisterUIRoutes(app *fiber.App,
 			Search string `form:"search"`
 		}{}
 		if err := c.BodyParser(&form); err != nil {
-			return c.Status(fiber.StatusBadRequest).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
+			return c.Status(fiber.StatusBadRequest).SendString(err.Error())
 		}
 
 		models, _ := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.ModelPath)
(Go source, core/services model listing; file header lost in extraction.)

@@ -8,10 +8,10 @@ import (
 type LooseFilePolicy int
 
 const (
-	LOOSE_ONLY LooseFilePolicy = iota
-	SKIP_IF_CONFIGURED
+	SKIP_IF_CONFIGURED LooseFilePolicy = iota
 	SKIP_ALWAYS
 	ALWAYS_INCLUDE
+	LOOSE_ONLY
 )
 
 func ListModels(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter config.BackendConfigFilterFn, looseFilePolicy LooseFilePolicy) ([]string, error) {

@@ -21,13 +21,11 @@ func ListModels(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter c
 	dataModels := []string{}
 
 	// Start with known configurations
-	for _, c := range bcl.GetBackendConfigsByFilter(filter) {
-		// Is this better than looseFilePolicy <= SKIP_IF_CONFIGURED ? less performant but more readable?
-		if (looseFilePolicy == SKIP_IF_CONFIGURED) || (looseFilePolicy == LOOSE_ONLY) {
-			skipMap[c.Model] = nil
-		}
-		if looseFilePolicy != LOOSE_ONLY {
+	if looseFilePolicy != LOOSE_ONLY {
+		for _, c := range bcl.GetBackendConfigsByFilter(filter) {
+			if looseFilePolicy == SKIP_IF_CONFIGURED {
+				skipMap[c.Model] = nil
+			}
 			dataModels = append(dataModels, c.Name)
 		}
 	}
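For readers following the policy change, here is a self-contained sketch of the LooseFilePolicy semantics implied by the hunks above. The constants and the configured-model loop mirror the diff; the loose-file half, and the exact meanings of SKIP_ALWAYS and ALWAYS_INCLUDE, are assumptions inferred from the names rather than code shown here:

```go
package sketch

type LooseFilePolicy int

const (
	SKIP_IF_CONFIGURED LooseFilePolicy = iota
	SKIP_ALWAYS
	ALWAYS_INCLUDE
	LOOSE_ONLY
)

// backendConfig is a stand-in for the loader's config type.
type backendConfig struct{ Name, Model string }

// listModels walks configured models first, then loose model files.
func listModels(configured []backendConfig, looseFiles []string, policy LooseFilePolicy) []string {
	skipMap := map[string]interface{}{}
	dataModels := []string{}

	// LOOSE_ONLY skips configured entries entirely; SKIP_IF_CONFIGURED
	// records them so matching loose files are suppressed below.
	if policy != LOOSE_ONLY {
		for _, c := range configured {
			if policy == SKIP_IF_CONFIGURED {
				skipMap[c.Model] = nil
			}
			dataModels = append(dataModels, c.Name)
		}
	}

	// Assumed behavior: SKIP_ALWAYS drops all loose files,
	// ALWAYS_INCLUDE keeps them even when configured.
	for _, f := range looseFiles {
		if _, skip := skipMap[f]; skip || policy == SKIP_ALWAYS {
			continue
		}
		dataModels = append(dataModels, f)
	}
	return dataModels
}
```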
(Several example requirements files and a Kubernetes manifest follow; file headers lost in extraction.)

@@ -1,4 +1,4 @@
-llama_index==0.11.16
+llama_index==0.11.14
 requests==2.32.3
 weaviate_client==4.8.1
 transformers

@@ -1,2 +1,2 @@
-langchain==0.3.2
-openai==1.51.1
+langchain==0.3.1
+openai==1.50.2

@@ -5,7 +5,7 @@ metadata:
 spec:
   containers:
   - name: broken-pod
-    image: nginx:1.27.2
+    image: nginx:1.27.0
     livenessProbe:
       httpGet:
         path: /

@@ -1,4 +1,4 @@
 langchain==0.3.1
-openai==1.51.1
+openai==1.50.2
 chromadb==0.5.11
-llama-index==0.11.16
+llama-index==0.11.14

@@ -1,4 +1,4 @@
-aiohttp==3.10.9
+aiohttp==3.10.8
 aiosignal==1.3.1
 async-timeout==4.0.3
 attrs==24.2.0

@@ -6,19 +6,19 @@ certifi==2024.8.30
 charset-normalizer==3.3.2
 colorama==0.4.6
 dataclasses-json==0.6.7
-debugpy==1.8.6
+debugpy==1.8.2
 frozenlist==1.4.1
 greenlet==3.1.1
 idna==3.10
-langchain==0.3.2
+langchain==0.3.1
 langchain-community==0.3.1
 marshmallow==3.22.0
 marshmallow-enum==1.5.1
-multidict==6.1.0
+multidict==6.0.5
 mypy-extensions==1.0.0
 numexpr==2.10.1
 numpy==2.1.1
-openai==1.51.1
+openai==1.45.1
 openapi-schema-pydantic==1.2.4
 packaging>=23.2
 pydantic==2.9.2

@@ -1,2 +1,2 @@
-streamlit==1.39.0
+streamlit==1.38.0
 requests
(Gallery chatml template file, deleted in its entirety; file header lost in extraction.)

@@ -1,66 +0,0 @@
----
-name: "chatml"
-
-config_file: |
-  mmap: true
-  function:
-    disable_no_action: true
-    grammar:
-      mixed_mode: false
-      disable: true
-      parallel_calls: true
-      expect_strings_after_json: true
-    json_regex_match:
-      - "(?s)<tool_call>(.*?)</tool_call>"
-      - "(?s)<tool_call>(.*)"
-    capture_llm_results:
-      - (?s)<scratchpad>(.*?)</scratchpad>
-    replace_llm_results:
-      - key: (?s)<scratchpad>(.*?)</scratchpad>
-        value: ""
-  template:
-    chat_message: |
-      <|im_start|>{{ .RoleName }}
-      {{ if .FunctionCall -}}
-      Function call:
-      {{ else if eq .RoleName "tool" -}}
-      Function response:
-      {{ end -}}
-      {{ if .Content -}}
-      {{.Content }}
-      {{ end -}}
-      {{ if .FunctionCall -}}
-      {{toJson .FunctionCall}}
-      {{ end -}}<|im_end|>
-    function: |
-      <|im_start|>system
-      # Tools
-
-      You may call one or more functions to assist with the user query.
-
-      You are provided with function signatures within <tools></tools> XML tags:
-      <tools>
-      {{range .Functions}}
-      {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-      {{end}}
-      </tools>
-      For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
-      <tool_call>
-      {"name": <function-name>, "arguments": <args-json-object>}
-      </tool_call>
-      <|im_end|>
-      {{.Input -}}
-      <|im_start|>assistant
-    chat: |
-      {{.Input -}}
-      <|im_start|>assistant
-    completion: |
-      {{.Input}}
-  context_size: 4096
-  f16: true
-  stopwords:
-    - '<|im_end|>'
-    - '<dummy32000>'
-    - '</s>'
-    - "<|eot_id|>"
-    - "<|end_of_text|>"
@@ -1,21 +1,4 @@
|
||||
---
|
||||
- name: "moe-girl-1ba-7bt-i1"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/634262af8d8089ebaefd410e/kTXXSSSqpb21rfyOX7FUa.jpeg
|
||||
# chatml
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/allura-org/MoE-Girl-1BA-7BT
|
||||
- https://huggingface.co/mradermacher/MoE-Girl-1BA-7BT-i1-GGUF
|
||||
description: |
|
||||
A finetune of OLMoE by AllenAI designed for roleplaying (and maybe general usecases if you try hard enough).
|
||||
PLEASE do not expect godliness out of this, it's a model with 1 billion active parameters. Expect something more akin to Gemma 2 2B, not Llama 3 8B.
|
||||
overrides:
|
||||
parameters:
|
||||
model: MoE-Girl-1BA-7BT.i1-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: MoE-Girl-1BA-7BT.i1-Q4_K_M.gguf
|
||||
sha256: e6ef9c311c73573b243de6ff7538b386f430af30b2be0a96a5745c17137ad432
|
||||
uri: huggingface://mradermacher/MoE-Girl-1BA-7BT-i1-GGUF/MoE-Girl-1BA-7BT.i1-Q4_K_M.gguf
|
||||
- name: "salamandra-7b-instruct"
|
||||
icon: https://huggingface.co/BSC-LT/salamandra-7b-instruct/resolve/main/images/salamandra_header.png
|
||||
# Uses chatml
|
||||
@@ -100,88 +83,6 @@
|
||||
- filename: llama-3.2-1b-instruct-q8_0.gguf
|
||||
sha256: ba345c83bf5cc679c653b853c46517eea5a34f03ed2205449db77184d9ae62a9
|
||||
uri: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF/llama-3.2-1b-instruct-q8_0.gguf
|
||||
## Uncensored
|
||||
- !!merge <<: *llama32
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/66c9d7a26f2335ba288810a4/4YDg-rcEXCK0fdTS1fBzE.webp
|
||||
name: "versatillama-llama-3.2-3b-instruct-abliterated"
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/VersatiLlama-Llama-3.2-3B-Instruct-Abliterated-GGUF
|
||||
description: |
|
||||
Small but Smart Fine-Tuned on Vast dataset of Conversations. Able to Generate Human like text with high performance within its size. It is Very Versatile when compared for it's size and Parameters and offers capability almost as good as Llama 3.1 8B Instruct.
|
||||
overrides:
|
||||
parameters:
|
||||
model: VersatiLlama-Llama-3.2-3B-Instruct-Abliterated.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: VersatiLlama-Llama-3.2-3B-Instruct-Abliterated.Q4_K_M.gguf
|
||||
sha256: 15b9e4a987f50d7594d030815c7166a996e20db46fe1e20da03e96955020312c
|
||||
uri: huggingface://QuantFactory/VersatiLlama-Llama-3.2-3B-Instruct-Abliterated-GGUF/VersatiLlama-Llama-3.2-3B-Instruct-Abliterated.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama3.2-3b-enigma"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/64f267a8a4f79a118e0fcc89/it7MY5MyLCLpFQev5dUis.jpeg
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Llama3.2-3B-Enigma-GGUF
|
||||
description: |
|
||||
Enigma is a code-instruct model built on Llama 3.2 3b. It is a high quality code instruct model with the Llama 3.2 Instruct chat format. The model is finetuned on synthetic code-instruct data generated with Llama 3.1 405b and supplemented with generalist synthetic data. It uses the Llama 3.2 Instruct prompt format.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama3.2-3B-Enigma.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama3.2-3B-Enigma.Q4_K_M.gguf
|
||||
sha256: 4304e6ee1e348b228470700ec1e9423f5972333d376295195ce6cd5c70cae5e4
|
||||
uri: huggingface://QuantFactory/Llama3.2-3B-Enigma-GGUF/Llama3.2-3B-Enigma.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama3.2-3b-esper2"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/64f267a8a4f79a118e0fcc89/4I6oK8DG0so4VD8GroFsd.jpeg
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Llama3.2-3B-Esper2-GGUF
|
||||
description: |
|
||||
Esper 2 is a DevOps and cloud architecture code specialist built on Llama 3.2 3b. It is an AI assistant focused on AWS, Azure, GCP, Terraform, Dockerfiles, pipelines, shell scripts and more, with real world problem solving and high quality code instruct performance within the Llama 3.2 Instruct chat format. Finetuned on synthetic DevOps-instruct and code-instruct data generated with Llama 3.1 405b and supplemented with generalist chat data.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama3.2-3B-Esper2.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama3.2-3B-Esper2.Q4_K_M.gguf
|
||||
sha256: 11d2bd674aa22a71a59ec49ad29b695000d14bc275b0195b8d7089bfc7582fc7
|
||||
uri: huggingface://QuantFactory/Llama3.2-3B-Esper2-GGUF/Llama3.2-3B-Esper2.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama-3.2-3b-agent007"
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Llama-3.2-3B-Agent007-GGUF
|
||||
description: |
|
||||
The model is a quantized version of EpistemeAI/Llama-3.2-3B-Agent007, developed by EpistemeAI and fine-tuned from unsloth/llama-3.2-3b-instruct-bnb-4bit. It was trained 2x faster with Unsloth and Huggingface's TRL library. Fine tuned with Agent datasets.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama-3.2-3B-Agent007.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama-3.2-3B-Agent007.Q4_K_M.gguf
|
||||
sha256: 7a2543a69b116f2a059e2e445e5d362bb7df4a51b97e83d8785c1803dc9d687f
|
||||
uri: huggingface://QuantFactory/Llama-3.2-3B-Agent007-GGUF/Llama-3.2-3B-Agent007.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama-3.2-3b-agent007-coder"
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Llama-3.2-3B-Agent007-Coder-GGUF
|
||||
description: |
|
||||
The Llama-3.2-3B-Agent007-Coder-GGUF is a quantized version of the EpistemeAI/Llama-3.2-3B-Agent007-Coder model, which is a fine-tuned version of the unsloth/llama-3.2-3b-instruct-bnb-4bit model. It is created using llama.cpp and trained with additional datasets such as the Agent dataset, Code Alpaca 20K, and magpie ultra 0.1. This model is optimized for multilingual dialogue use cases and agentic retrieval and summarization tasks. The model is available for commercial and research use in multiple languages and is best used with the transformers library.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama-3.2-3B-Agent007-Coder.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama-3.2-3B-Agent007-Coder.Q4_K_M.gguf
|
||||
sha256: 49a4861c094d94ef5faa33f69b02cd132bb0167f1c3ca59059404f85f61e1d12
|
||||
uri: huggingface://QuantFactory/Llama-3.2-3B-Agent007-Coder-GGUF/Llama-3.2-3B-Agent007-Coder.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "fireball-meta-llama-3.2-8b-instruct-agent-003-128k-code-dpo"
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO-GGUF
|
||||
description: |
|
||||
The LLM model is a quantized version of EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO, which is an experimental and revolutionary fine-tune with DPO dataset to allow LLama 3.1 8B to be an agentic coder. It has some built-in agent features such as search, calculator, and ReAct. Other noticeable features include self-learning using unsloth, RAG applications, and memory. The context window of the model is 128K. It can be integrated into projects using popular libraries like Transformers and vLLM. The model is suitable for use with Langchain or LLamaIndex. The model is developed by EpistemeAI and licensed under the Apache 2.0 license.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.Q4_K_M.gguf
|
||||
sha256: 7f45fa79bc6c9847ef9fbad08c3bb5a0f2dbb56d2e2200a5d37b260a57274e55
|
||||
uri: huggingface://QuantFactory/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO-GGUF/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.Q4_K_M.gguf
|
||||
- &qwen25
|
||||
## Qwen2.5
|
||||
name: "qwen2.5-14b-instruct"
|
||||
@@ -418,113 +319,6 @@
|
||||
- filename: calme-2.2-qwen2.5-72b.i1-Q4_K_M.gguf
|
||||
sha256: 5fdfa599724d7c78502c477ced1d294e92781b91d3265bd0748fbf15a6fefde6
|
||||
uri: huggingface://mradermacher/calme-2.2-qwen2.5-72b-i1-GGUF/calme-2.2-qwen2.5-72b.i1-Q4_K_M.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "t.e-8.1-iq-imatrix-request"
|
||||
# chatml
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/K1aNPf32z-6tYZdcSQBzF.png
|
||||
urls:
|
||||
- https://huggingface.co/Cran-May/T.E-8.1
|
||||
- https://huggingface.co/Lewdiculous/T.E-8.1-GGUF-IQ-Imatrix-Request
|
||||
description: |
|
||||
Trained for roleplay uses.
|
||||
overrides:
|
||||
parameters:
|
||||
model: T.E-8.1-Q4_K_M-imat.gguf
|
||||
files:
|
||||
- filename: T.E-8.1-Q4_K_M-imat.gguf
|
||||
sha256: 1b7892b82c01ea4cbebe34cd00f9836cbbc369fc3247c1f44a92842201e7ec0b
|
||||
uri: huggingface://Lewdiculous/T.E-8.1-GGUF-IQ-Imatrix-Request/T.E-8.1-Q4_K_M-imat.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "rombos-llm-v2.5.1-qwen-3b"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/pNDtgE5FDkxxvbG4qiZ1A.jpeg
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Rombos-LLM-V2.5.1-Qwen-3b-GGUF
|
||||
description: |
|
||||
Rombos-LLM-V2.5.1-Qwen-3b is a little experiment that merges a high-quality LLM, arcee-ai/raspberry-3B, using the last step of the Continuous Finetuning method outlined in a Google document. The merge is done using the mergekit with the following parameters:
|
||||
|
||||
- Models: Qwen2.5-3B-Instruct, raspberry-3B
|
||||
- Merge method: ties
|
||||
- Base model: Qwen2.5-3B
|
||||
- Parameters: weight=1, density=1, normalize=true, int8_mask=true
|
||||
- Dtype: bfloat16
|
||||
|
||||
The model has been evaluated on various tasks and datasets, and the results are available on the Open LLM Leaderboard. The model has shown promising performance across different benchmarks.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Rombos-LLM-V2.5.1-Qwen-3b.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Rombos-LLM-V2.5.1-Qwen-3b.Q4_K_M.gguf
|
||||
sha256: 656c342a2921cac8912e0123fc295c3bb3d631a85c671c12a3843a957e46d30d
|
||||
uri: huggingface://QuantFactory/Rombos-LLM-V2.5.1-Qwen-3b-GGUF/Rombos-LLM-V2.5.1-Qwen-3b.Q4_K_M.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "qwen2.5-7b-ins-v3"
|
||||
urls:
|
||||
- https://huggingface.co/happzy2633/qwen2.5-7b-ins-v3
|
||||
- https://huggingface.co/bartowski/qwen2.5-7b-ins-v3-GGUF
|
||||
description: |
|
||||
Qwen 2.5 fine-tuned on CoT to match o1 performance. An attempt to build an Open o1 mathcing OpenAI o1 model
|
||||
Demo: https://huggingface.co/spaces/happzy2633/open-o1
|
||||
overrides:
|
||||
parameters:
|
||||
model: qwen2.5-7b-ins-v3-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: qwen2.5-7b-ins-v3-Q4_K_M.gguf
|
||||
sha256: 9c23734072714a4886c0386ae0ff07a5e940d67ad52278e2ed689fec44e1e0c8
|
||||
uri: huggingface://bartowski/qwen2.5-7b-ins-v3-GGUF/qwen2.5-7b-ins-v3-Q4_K_M.gguf
|
||||
- &archfunct
|
||||
license: apache-2.0
|
||||
tags:
|
||||
- llm
|
||||
- gguf
|
||||
- gpu
|
||||
- qwen
|
||||
- qwen2.5
|
||||
- cpu
|
||||
- function-calling
|
||||
name: "arch-function-1.5b"
|
||||
uri: "github:mudler/LocalAI/gallery/arch-function.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/katanemolabs/Arch-Function-1.5B
|
||||
- https://huggingface.co/mradermacher/Arch-Function-1.5B-GGUF
|
||||
description: |
|
||||
The Katanemo Arch-Function collection of large language models (LLMs) is a collection state-of-the-art (SOTA) LLMs specifically designed for function calling tasks. The models are designed to understand complex function signatures, identify required parameters, and produce accurate function call outputs based on natural language prompts. Achieving performance on par with GPT-4, these models set a new benchmark in the domain of function-oriented tasks, making them suitable for scenarios where automated API interaction and function execution is crucial.
|
||||
In summary, the Katanemo Arch-Function collection demonstrates:
|
||||
State-of-the-art performance in function calling
|
||||
Accurate parameter identification and suggestion, even in ambiguous or incomplete inputs
|
||||
High generalization across multiple function calling use cases, from API interactions to automated backend tasks.
|
||||
Optimized low-latency, high-throughput performance, making it suitable for real-time, production environments.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Arch-Function-1.5B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Arch-Function-1.5B.Q4_K_M.gguf
|
||||
sha256: 5ac54d2d50cca0ee0335ca2c9b688204c0829cd3a73de3ee3fda108281ad9691
|
||||
uri: huggingface://mradermacher/Arch-Function-1.5B-GGUF/Arch-Function-1.5B.Q4_K_M.gguf
|
||||
- !!merge <<: *archfunct
|
||||
name: "arch-function-7b"
|
||||
urls:
|
||||
- https://huggingface.co/katanemolabs/Arch-Function-7B
|
||||
- https://huggingface.co/mradermacher/Arch-Function-7B-GGUF
|
||||
overrides:
|
||||
parameters:
|
||||
model: Arch-Function-7B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Arch-Function-7B.Q4_K_M.gguf
|
||||
sha256: 6e38661321d79d02b8cf57c79d97c6c0e19adb9ffa66083cc440c24e257234b6
|
||||
uri: huggingface://mradermacher/Arch-Function-7B-GGUF/Arch-Function-7B.Q4_K_M.gguf
|
||||
- !!merge <<: *archfunct
|
||||
name: "arch-function-3b"
|
||||
urls:
|
||||
- https://huggingface.co/katanemolabs/Arch-Function-3B
|
||||
- https://huggingface.co/mradermacher/Arch-Function-3B-GGUF
|
||||
overrides:
|
||||
parameters:
|
||||
model: Arch-Function-3B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Arch-Function-3B.Q4_K_M.gguf
|
||||
sha256: 9945cb8d070498d163e5df90c1987f591d35e4fd2222a6c51bcfff848c4b573b
|
||||
uri: huggingface://mradermacher/Arch-Function-3B-GGUF/Arch-Function-3B.Q4_K_M.gguf
|
||||
- &smollm
|
||||
## SmolLM
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
@@ -1034,21 +828,6 @@
|
||||
- filename: NightyGurps-14b-v1.1-Q4_K_M.gguf
|
||||
sha256: d09d53259ad2c0298150fa8c2db98fe42f11731af89fdc80ad0e255a19adc4b0
|
||||
uri: huggingface://bartowski/NightyGurps-14b-v1.1-GGUF/NightyGurps-14b-v1.1-Q4_K_M.gguf
|
||||
- !!merge <<: *llama31
|
||||
name: "llama-3.1-swallow-70b-v0.1-i1"
|
||||
icon: https://huggingface.co/tokyotech-llm/Llama-3.1-Swallow-70B-v0.1/resolve/main/logo.png
|
||||
urls:
|
||||
- https://huggingface.co/tokyotech-llm/Llama-3.1-Swallow-70B-v0.1
|
||||
- https://huggingface.co/mradermacher/Llama-3.1-Swallow-70B-v0.1-i1-GGUF
|
||||
description: |
|
||||
Llama 3.1 Swallow is a series of large language models (8B, 70B) that were built by continual pre-training on the Meta Llama 3.1 models. Llama 3.1 Swallow enhanced the Japanese language capabilities of the original Llama 3.1 while retaining the English language capabilities. We use approximately 200 billion tokens that were sampled from a large Japanese web corpus (Swallow Corpus Version 2), Japanese and English Wikipedia articles, and mathematical and coding contents, etc (see the Training Datasets section) for continual pre-training. The instruction-tuned models (Instruct) were built by supervised fine-tuning (SFT) on the synthetic data specially built for Japanese. See the Swallow Model Index section to find other model variants.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama-3.1-Swallow-70B-v0.1.i1-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama-3.1-Swallow-70B-v0.1.i1-Q4_K_M.gguf
|
||||
sha256: 9eaa08a4872a26f56fe34b27a99f7bd0d22ee2b2d1c84cfcde2091b5f61af5fa
|
||||
uri: huggingface://mradermacher/Llama-3.1-Swallow-70B-v0.1-i1-GGUF/Llama-3.1-Swallow-70B-v0.1.i1-Q4_K_M.gguf
|
||||
## Uncensored models
|
||||
- !!merge <<: *llama31
|
||||
name: "humanish-roleplay-llama-3.1-8b-i1"
|
||||
@@ -1365,53 +1144,6 @@
      - filename: Llama-3.1-8B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
        sha256: 0a601c7341228d9160332965298d799369a1dc2b7080771fb8051bdeb556b30c
        uri: huggingface://bartowski/Llama-3.1-8B-ArliAI-RPMax-v1.1-GGUF/Llama-3.1-8B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
- !!merge <<: *llama31
  name: "violet_twilight-v0.2-iq-imatrix"
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
  icon: https://cdn-uploads.huggingface.co/production/uploads/64adfd277b5ff762771e4571/P962FQhRG4I8nbU_DJolY.png
  urls:
    - https://huggingface.co/Epiculous/Violet_Twilight-v0.2
    - https://huggingface.co/Lewdiculous/Violet_Twilight-v0.2-GGUF-IQ-Imatrix
  description: |
    Now for something a bit different: Violet_Twilight-v0.2! This model is a SLERP merge of Azure_Dusk-v0.2 and Crimson_Dawn-v0.2.
  overrides:
    parameters:
      model: Violet_Twilight-v0.2-Q4_K_M-imat.gguf
    files:
      - filename: Violet_Twilight-v0.2-Q4_K_M-imat.gguf
        sha256: 0793d196a00cd6fd4e67b8c585b27a94d397e33d427e4ad4aa9a16b7abc339cd
        uri: huggingface://Lewdiculous/Violet_Twilight-v0.2-GGUF-IQ-Imatrix/Violet_Twilight-v0.2-Q4_K_M-imat.gguf
- !!merge <<: *llama31
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
  name: "dans-personalityengine-v1.0.0-8b"
  urls:
    - https://huggingface.co/PocketDoc/Dans-PersonalityEngine-v1.0.0-8b
    - https://huggingface.co/bartowski/Dans-PersonalityEngine-v1.0.0-8b-GGUF
  description: |
    This model is intended to be multifarious in its capabilities: it should be quite capable at both co-writing and roleplay, and should find itself quite at home performing sentiment analysis or summarization as part of a pipeline. It has been trained on a wide array of one-shot instructions, multi-turn instructions, role-playing scenarios, text-adventure games, co-writing, and much more. The full dataset is publicly available and can be found in the datasets section of the model page.

    No harmfulness alignment has been done on this model; please take the appropriate precautions when using it in a production environment.
  overrides:
    parameters:
      model: Dans-PersonalityEngine-v1.0.0-8b-Q4_K_M.gguf
    files:
      - filename: Dans-PersonalityEngine-v1.0.0-8b-Q4_K_M.gguf
        sha256: 193b66434c9962e278bb171a21e652f0d3f299f04e86c95f9f75ec5aa8ff006e
        uri: huggingface://bartowski/Dans-PersonalityEngine-v1.0.0-8b-GGUF/Dans-PersonalityEngine-v1.0.0-8b-Q4_K_M.gguf
- !!merge <<: *llama31
  name: "nihappy-l3.1-8b-v0.09"
  urls:
    - https://huggingface.co/Arkana08/NIHAPPY-L3.1-8B-v0.09
    - https://huggingface.co/QuantFactory/NIHAPPY-L3.1-8B-v0.09-GGUF
  description: |
    A quantized version of Arkana08/NIHAPPY-L3.1-8B-v0.09, created using llama.cpp. It is a role-playing model that integrates the finest qualities of various pre-trained language models, with a focus on dynamic storytelling.
  overrides:
    parameters:
      model: NIHAPPY-L3.1-8B-v0.09.Q4_K_M.gguf
    files:
      - filename: NIHAPPY-L3.1-8B-v0.09.Q4_K_M.gguf
        sha256: 9bd46a06093448b143bd2775f0fb1b1b172c851fafdce31289e13b7dfc23a0d7
        uri: huggingface://QuantFactory/NIHAPPY-L3.1-8B-v0.09-GGUF/NIHAPPY-L3.1-8B-v0.09.Q4_K_M.gguf
- &deepseek
  ## Deepseek
  url: "github:mudler/LocalAI/gallery/deepseek.yaml@master"
@@ -2002,26 +1734,6 @@
      - filename: MagnusIntellectus-12B-v1.i1-Q4_K_M.gguf
        sha256: c97107983b4edc5b6f2a592d227ca2dd4196e2af3d3bc0fe6b7a8954a1fb5870
        uri: huggingface://mradermacher/MagnusIntellectus-12B-v1-i1-GGUF/MagnusIntellectus-12B-v1.i1-Q4_K_M.gguf
- !!merge <<: *mistral03
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
  name: "mn-backyardai-party-12b-v1-iq-arm-imatrix"
  icon: https://huggingface.co/Sao10K/MN-BackyardAI-Party-12B-v1/resolve/main/party1.png
  urls:
    - https://huggingface.co/Sao10K/MN-BackyardAI-Party-12B-v1
    - https://huggingface.co/Lewdiculous/MN-BackyardAI-Party-12B-v1-GGUF-IQ-ARM-Imatrix
  description: |
    This is a group-chat-based roleplaying model, based on 12B-Lyra-v4a2, a variant of Lyra-v4 that is currently private.

    It is trained on an entirely human-based dataset drawn from forum and internet group roleplaying styles. The only LLM augmentation was applied to the character sheets, to fit them to the system prompt and to fit multiple character sheets within context.

    This model is still capable of 1-on-1 roleplay, though I recommend using ChatML when doing that instead.
  overrides:
    parameters:
      model: MN-BackyardAI-Party-12B-v1-Q4_K_M-imat.gguf
    files:
      - filename: MN-BackyardAI-Party-12B-v1-Q4_K_M-imat.gguf
        sha256: cea68768dff58b553974b755bb40ef790ab8b86866d9b5c46bc2e6c3311b876a
        uri: huggingface://Lewdiculous/MN-BackyardAI-Party-12B-v1-GGUF-IQ-ARM-Imatrix/MN-BackyardAI-Party-12B-v1-Q4_K_M-imat.gguf
- &mudler
  ### START mudler's LocalAI specific-models
  url: "github:mudler/LocalAI/gallery/mudler.yaml@master"
@@ -2618,37 +2330,6 @@
      - filename: Gemma-2-2B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
        sha256: 89fe35345754d7e9de8d0c0d5bf35b2be9b12a09811b365b712b8b27112f7712
        uri: huggingface://bartowski/Gemma-2-2B-ArliAI-RPMax-v1.1-GGUF/Gemma-2-2B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
- !!merge <<: *gemma
  name: "gemma-2-9b-it-abliterated"
  urls:
    - https://huggingface.co/IlyaGusev/gemma-2-9b-it-abliterated
    - https://huggingface.co/bartowski/gemma-2-9b-it-abliterated-GGUF
  description: |
    Abliterated version of google/gemma-2-9b-it.

    The abliteration script (link) is based on code from the blog post and heavily uses TransformerLens. The only major difference from the code used for Llama is scaling the embedding layer back.

    Orthogonalization did not produce the same results as regular interventions, since there are RMSNorm layers before activations are merged into the residual stream. However, the final model still seems to be uncensored.
  overrides:
    parameters:
      model: gemma-2-9b-it-abliterated-Q4_K_M.gguf
    files:
      - filename: gemma-2-9b-it-abliterated-Q4_K_M.gguf
        sha256: 88d84ac9796732c10f6c58e0feb4db8e04c05d74bdb7047a5e37906a589896e1
        uri: huggingface://bartowski/gemma-2-9b-it-abliterated-GGUF/gemma-2-9b-it-abliterated-Q4_K_M.gguf
- !!merge <<: *gemma
  name: "gemma-2-ataraxy-v3i-9b"
  urls:
    - https://huggingface.co/QuantFactory/Gemma-2-Ataraxy-v3i-9B-GGUF
  description: |
    Gemma-2-Ataraxy-v3i-9B is an experimental model that replaces the SimPO model in the original recipe with a different SimPO model and a writing model trained on Gutenberg, using a higher density. It is a merge of pre-trained language models created with mergekit, using the della merge method with unsloth/gemma-2-9b-it as the base. The models included in the merge are nbeerbower/Gemma2-Gutenberg-Doppel-9B, ifable/gemma-2-Ifable-9B, and wzhouad/gemma-2-9b-it-WPO-HB. It has been quantized using llama.cpp.
  overrides:
    parameters:
      model: Gemma-2-Ataraxy-v3i-9B.Q4_K_M.gguf
    files:
      - filename: Gemma-2-Ataraxy-v3i-9B.Q4_K_M.gguf
        sha256: f14c5b9373d4058f0f812c6c34184addeb4aeeecb02a7bbcf9844d9afc8d0066
        uri: huggingface://QuantFactory/Gemma-2-Ataraxy-v3i-9B-GGUF/Gemma-2-Ataraxy-v3i-9B.Q4_K_M.gguf
- &llama3
  url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master"
  icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png
@@ -3,6 +3,7 @@ name: "moondream2"
config_file: |
  backend: llama-cpp
  context_size: 2046
  roles:
    user: "\nQuestion: "
@@ -41,7 +41,6 @@ func (llm *Base) Predict(opts *pb.PredictOptions) (string, error) {
}

func (llm *Base) PredictStream(opts *pb.PredictOptions, results chan string) error {
	close(results)
	return fmt.Errorf("unimplemented")
}
@@ -144,6 +144,8 @@ func (s *server) PredictStream(in *pb.PredictOptions, stream pb.Backend_PredictS
	}()

	err := s.llm.PredictStream(in, resultChan)
	// close the channel, so that if resultChan is not closed by the LLM (for example because it does not implement PredictStream), the client will not hang
	close(resultChan)
	<-done

	return err
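The two added lines above are the heart of this fix: the gRPC server, not the backend, now owns closing resultChan, so a backend that never closes the channel (for example, one that only inherits the base implementation) can no longer leave the client blocked forever. A minimal, runnable sketch of that ownership pattern, with illustrative names rather than the actual LocalAI types:

package main

import "fmt"

// predictStream plays the role of a backend: it produces tokens but,
// like a backend that does not implement streaming, never closes the
// results channel itself.
func predictStream(results chan string) error {
	results <- "token-1"
	results <- "token-2"
	return nil
}

func main() {
	results := make(chan string)
	done := make(chan struct{})

	// Consumer: drains tokens until the channel is closed.
	go func() {
		for token := range results {
			fmt.Println(token)
		}
		close(done)
	}()

	err := predictStream(results)
	// The caller closes the channel exactly once, so the consumer's
	// range loop always terminates, even when the producer forgot to.
	close(results)
	<-done

	if err != nil {
		fmt.Println("stream error:", err)
	}
}

Moving the close to the caller also avoids a double close: if both the base stub and the server closed the channel, the second close would panic, which is why the same commit drops the close from the base implementation.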
@@ -28,7 +28,7 @@ var Aliases map[string]string = map[string]string{
	"langchain-huggingface": LCHuggingFaceBackend,
}

var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"

const (
	LlamaGGML = "llama-ggml"
@@ -62,7 +62,7 @@ func backendPath(assetDir, backend string) string {

// backendsInAssetDir returns the list of backends in the asset directory
// that should be loaded
func backendsInAssetDir(assetDir string) (map[string][]string, error) {
func backendsInAssetDir(assetDir string) ([]string, error) {
	// Exclude backends from automatic loading
	excludeBackends := []string{LocalStoreBackend}
	entry, err := os.ReadDir(backendPath(assetDir, ""))
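backendsInAssetDir boils down to a filtered directory scan. A self-contained sketch of that shape, with an illustrative exclusion list and directory layout rather than the real LocalAI constants:

package main

import (
	"fmt"
	"os"
	"slices"
)

// listBackends mirrors the scan above: read the asset directory and
// drop entries on the exclusion list. Names here are illustrative.
func listBackends(dir string) ([]string, error) {
	excludeBackends := []string{"local-store"}
	entries, err := os.ReadDir(dir)
	if err != nil {
		return nil, err
	}
	var backends []string
	for _, e := range entries {
		if slices.Contains(excludeBackends, e.Name()) {
			continue
		}
		backends = append(backends, e.Name())
	}
	return backends, nil
}

func main() {
	backends, err := listBackends("backend-assets/grpc")
	fmt.Println(backends, err)
}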
@@ -86,7 +86,7 @@ ENTRY:

		// Skip the llama.cpp variants if we are autoDetecting
		// But we always load the fallback variant if it exists
		if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && AutoDetect {
		if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && autoDetect {
			continue
		}

@@ -94,7 +94,7 @@ ENTRY:
	}

	// if we are autoDetecting, we want to show the llama.cpp variants as a single backend
	if AutoDetect {
	if autoDetect {
		// if we find the llama.cpp variants, we show them as a single backend (llama-cpp), as later we are going to pick that up
		// when starting the service
		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false
@@ -136,10 +136,6 @@ ENTRY:
		}
	}

	return backends, nil
}

func orderBackends(backends map[string][]string) ([]string, error) {
	// order backends from the asset directory.
	// as we scan for backends, we want to keep a specific order in which backends are tried.
	// for example, llama.cpp should be tried first, and the huggingface backend should be kept last.
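The ordering comment above translates to a simple rank-then-sort. A sketch of one way to implement that policy; the rank table is illustrative, and the real orderBackends works on the map produced by backendsInAssetDir rather than a plain slice:

package main

import (
	"fmt"
	"sort"
)

// rank encodes the policy from the comment above: llama-cpp first,
// huggingface last, everything else in between. Illustrative only.
func rank(backend string) int {
	switch backend {
	case "llama-cpp":
		return 0
	case "huggingface":
		return 2
	default:
		return 1
	}
}

func orderBackends(backends []string) []string {
	sort.SliceStable(backends, func(i, j int) bool {
		ri, rj := rank(backends[i]), rank(backends[j])
		if ri != rj {
			return ri < rj
		}
		return backends[i] < backends[j]
	})
	return backends
}

func main() {
	fmt.Println(orderBackends([]string{"whisper", "huggingface", "llama-cpp", "bark"}))
	// Output: [llama-cpp bark whisper huggingface]
}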
@@ -185,9 +181,8 @@ func orderBackends(backends map[string][]string) ([]string, error) {
	return orderedBackends.Keys(), nil
}

// selectGRPCProcessByHostCapabilities selects the GRPC process to start based on system capabilities
// Note: this is now relevant only for llama.cpp
func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) string {
// selectGRPCProcess selects the GRPC process to start based on system capabilities
func selectGRPCProcess(backend, assetDir string, f16 bool) string {
	foundCUDA := false
	foundAMDGPU := false
	foundIntelGPU := false
@@ -204,7 +199,6 @@ func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) str
			return backendPath(assetDir, LLamaCPPGRPC)
		}

	// Check for GPU-binaries that are shipped with single binary releases
	gpus, err := xsysinfo.GPUs()
	if err == nil {
		for _, gpu := range gpus {
@@ -249,37 +243,32 @@ func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) str
				return grpcProcess
			}

	// No GPU found or no specific binaries found, try to load the CPU variant(s)

	// Select the Fallback by default
	selectedProcess := backendPath(assetDir, LLamaCPPFallback)

	// If we find any optimized binary, we use that instead
	if xsysinfo.HasCPUCaps(cpuid.AVX2) {
		p := backendPath(assetDir, LLamaCPPAVX2)
		if _, err := os.Stat(p); err == nil {
			log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
			selectedProcess = p
			grpcProcess = p
		}
	} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
		p := backendPath(assetDir, LLamaCPPAVX)
		if _, err := os.Stat(p); err == nil {
			log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
			selectedProcess = p
			grpcProcess = p
		}
	} else {
		p := backendPath(assetDir, LLamaCPPFallback)
		if _, err := os.Stat(p); err == nil {
			log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
			grpcProcess = p
		}
	}

	// Check that the selected binary actually exists
	if _, err := os.Stat(selectedProcess); err == nil {
		return selectedProcess
	}

	return ""
	return grpcProcess
}

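The variant selection above follows a fixed preference: AVX2 if the CPU supports it, then AVX, then the portable fallback, and a candidate only wins if its binary is actually present on disk. A simplified, runnable sketch of that cascade using github.com/klauspost/cpuid/v2 directly (the real code goes through LocalAI's xsysinfo wrapper, and the asset paths here are illustrative):

package main

import (
	"fmt"
	"os"
	"path/filepath"

	"github.com/klauspost/cpuid/v2"
)

// pickVariant walks the preference list and returns the first variant
// whose CPU requirement is met and whose binary exists on disk.
func pickVariant(assetDir string) string {
	candidates := []struct {
		name      string
		supported bool
	}{
		{"llama-cpp-avx2", cpuid.CPU.Supports(cpuid.AVX2)},
		{"llama-cpp-avx", cpuid.CPU.Supports(cpuid.AVX)},
		{"llama-cpp-fallback", true},
	}
	for _, c := range candidates {
		if !c.supported {
			continue
		}
		p := filepath.Join(assetDir, c.name)
		if _, err := os.Stat(p); err == nil {
			return p
		}
	}
	return ""
}

func main() {
	fmt.Println(pickVariant("backend-assets/grpc"))
}

Unlike the if/else chain above, this cascade also tries the AVX build when an AVX2-capable machine is missing the AVX2 binary; the original falls back to the default fallback binary in that case.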
// starts the grpcModelProcess for the backend, and returns a grpc client
// It also loads the model
func (ml *ModelLoader) grpcModel(backend string, autodetect bool, o *Options) func(string, string, string) (*Model, error) {
func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string, string) (*Model, error) {
	return func(modelID, modelName, modelFile string) (*Model, error) {

		log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelID, modelFile, backend, *o)
@@ -335,9 +324,9 @@ func (ml *ModelLoader) grpcModel(backend string, autodetect bool, o *Options) fu
			return nil, fmt.Errorf("referring to a backend not in asset dir: %s", err.Error())
		}

		if autodetect {
		if autoDetect {
			// autoDetect GRPC process to start based on system capabilities
			if selectedProcess := selectGRPCProcessByHostCapabilities(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
			if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
				grpcProcess = selectedProcess
			}
		}
@@ -418,11 +407,7 @@ func (ml *ModelLoader) grpcModel(backend string, autodetect bool, o *Options) fu
}

func (ml *ModelLoader) ListAvailableBackends(assetdir string) ([]string, error) {
	backends, err := backendsInAssetDir(assetdir)
	if err != nil {
		return nil, err
	}
	return orderBackends(backends)
	return backendsInAssetDir(assetdir)
}

func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err error) {
@@ -436,7 +421,13 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e
		log.Debug().Msgf("%s is an alias of %s", backend, realBackend)
	}

	ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
	if o.singleActiveBackend {
		log.Debug().Msgf("Stopping all backends except '%s'", o.modelID)
		err := ml.StopGRPC(allExcept(o.modelID))
		if err != nil {
			log.Error().Err(err).Str("keptModel", o.modelID).Msg("error while shutting down all backends except for the keptModel")
		}
	}

	var backendToConsume string

@@ -448,40 +439,14 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e
		backendToConsume = backend
	}

	model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, AutoDetect, o))
	model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, o))
	if err != nil {
		// XXX: This is too backend-specific (llama-cpp); remove this bit or generalize it further
		// We failed somehow starting the binary. For instance, it could be that we are missing
		// some libraries if running in binary-only mode.
		// In this case, we attempt to load the model with the fallback variant.

		// If not the llama-cpp backend, return the error immediately
		if backend != LLamaCPP {
			return nil, err
		}

		// Otherwise attempt with the fallback
		log.Error().Msgf("[%s] Failed loading model, trying with fallback '%s'", backend, LLamaCPPFallback)
		model, err = ml.LoadModel(o.modelID, o.model, ml.grpcModel(LLamaCPPFallback, false, o))
		if err != nil {
			return nil, err
		}
		return nil, err
	}

	return model.GRPC(o.parallelRequests, ml.wd), nil
}

func (ml *ModelLoader) stopActiveBackends(modelID string, singleActiveBackend bool) {
	// If we can have only one backend active, kill all the others (except external backends)
	if singleActiveBackend {
		log.Debug().Msgf("Stopping all backends except '%s'", modelID)
		err := ml.StopGRPC(allExcept(modelID))
		if err != nil {
			log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
		}
	}
}

func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
	o := NewOptions(opts...)

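stopActiveBackends relies on allExcept to turn the kept model ID into a predicate over running processes. A tiny sketch of that filter pattern; the real helper's signature in LocalAI may differ:

package main

import "fmt"

// allExcept returns a predicate that selects every model except the
// one to keep alive - the shape of the filter handed to StopGRPC.
func allExcept(keep string) func(name string) bool {
	return func(name string) bool { return name != keep }
}

func main() {
	stop := allExcept("phi-2")
	for _, m := range []string{"phi-2", "llava", "bert"} {
		if stop(m) {
			fmt.Println("stopping", m)
		}
	}
}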
@@ -493,12 +458,19 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
		return m.GRPC(o.parallelRequests, ml.wd), nil
	}

	ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
	// If we can have only one backend active, kill all the others (except external backends)
	if o.singleActiveBackend {
		log.Debug().Msgf("Stopping all backends except '%s'", o.modelID)
		err := ml.StopGRPC(allExcept(o.modelID))
		if err != nil {
			log.Error().Err(err).Str("keptModel", o.modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
		}
	}

	var err error

	// get backends embedded in the binary
	autoLoadBackends, err := ml.ListAvailableBackends(o.assetDir)
	autoLoadBackends, err := backendsInAssetDir(o.assetDir)
	if err != nil {
		return nil, err
	}
@@ -529,6 +501,39 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
			err = errors.Join(err, fmt.Errorf("backend %s returned no usable model", key))
			log.Info().Msgf("[%s] Fails: %s", key, "backend returned no usable model")
		}

		if autoDetect && key == LLamaCPP && err != nil {
			// try as hard as possible to run the llama.cpp variants
			backendToUse := ""
			if xsysinfo.HasCPUCaps(cpuid.AVX2) {
				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil {
					backendToUse = LLamaCPPAVX2
				}
			} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX)); err == nil {
					backendToUse = LLamaCPPAVX
				}
			} else {
				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPFallback)); err == nil {
					backendToUse = LLamaCPPFallback
				} else {
					// If we don't have a fallback variant either, skip this backend
					continue
				}
			}

			// Autodetection failed, try the fallback
			log.Info().Msgf("[%s] Autodetection failed, trying the fallback", key)
			options = append(options, WithBackendString(backendToUse))
			model, modelerr = ml.BackendLoader(options...)
			if modelerr == nil && model != nil {
				log.Info().Msgf("[%s] Loads OK", key)
				return model, nil
			} else {
				err = errors.Join(err, fmt.Errorf("[%s]: %w", key, modelerr))
				log.Info().Msgf("[%s] Fails: %s", key, modelerr.Error())
			}
		}
	}

	return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())
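The greedy loader's contract is: try each candidate backend in order, return the first that produces a usable model, and fold every failure into one error so the final message names all attempts. A compact, runnable sketch of that strategy (the load function here is a stand-in, not the real BackendLoader):

package main

import (
	"errors"
	"fmt"
)

// tryBackends returns the first backend whose loader succeeds and
// accumulates every failure with errors.Join, so the final error
// names each attempted backend.
func tryBackends(backends []string, load func(string) error) (string, error) {
	var errs error
	for _, b := range backends {
		if err := load(b); err != nil {
			errs = errors.Join(errs, fmt.Errorf("[%s]: %w", b, err))
			continue
		}
		return b, nil
	}
	return "", fmt.Errorf("could not load model - all backends returned error: %w", errs)
}

func main() {
	loaded, err := tryBackends([]string{"llama-cpp", "whisper"}, func(b string) error {
		if b != "whisper" {
			return errors.New("no usable model")
		}
		return nil
	})
	fmt.Println(loaded, err)
}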
@@ -157,11 +157,6 @@ func (ml *ModelLoader) ShutdownModel(modelName string) error {
		}
		time.Sleep(dur)
		retries++

		if retries > 10 && os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true" {
			log.Warn().Msgf("Model %s is still busy after %d retries. Forcing shutdown.", modelName, retries)
			break
		}
	}

	return ml.deleteProcess(modelName)
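The shutdown path above polls until the model is idle, sleeping between attempts, with LOCALAI_FORCE_BACKEND_SHUTDOWN acting as an escape hatch after ten retries. A minimal sketch of that guard; the busy-check and backoff duration are illustrative:

package main

import (
	"fmt"
	"os"
	"time"
)

// waitUntilIdle polls a busy-check with a sleep between attempts, and
// bails out after ten retries when the force-shutdown flag is set.
func waitUntilIdle(busy func() bool) {
	retries := 0
	for busy() {
		time.Sleep(100 * time.Millisecond) // illustrative backoff
		retries++
		if retries > 10 && os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true" {
			fmt.Println("model still busy, forcing shutdown")
			break
		}
	}
}

func main() {
	calls := 0
	waitUntilIdle(func() bool { calls++; return calls < 3 })
	fmt.Println("shutdown can proceed")
}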