feat: unify and propagate CMAKE_ARGS to GGML-based backends

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-24 08:38:02 -04:00 · 2024-12-11 22:02:58 +01:00
111 changed files with 516 additions and 4359 deletions
--- a/.env
+++ b/.env
@@ -82,15 +82,6 @@
 # Enable to allow p2p mode
 # LOCALAI_P2P=true
 # Enable to use federated mode
 # LOCALAI_FEDERATED=true
 # Enable to start federation server
 # FEDERATED_SERVER=true
 # Define to use federation token
 # TOKEN=""
 ### Watchdog settings
 ###
 # Enables watchdog to kill backends that are inactive for too much time
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -5,10 +5,6 @@ dependencies:
 - any:
  - changed-files:
    - any-glob-to-any-file: 'Makefile'
  - changed-files:
    - any-glob-to-any-file: '*.mod'
  - changed-files:
    - any-glob-to-any-file: '*.sum'
 kind/documentation:
 - any:
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -280,7 +280,6 @@ jobs:
      makeflags: ${{ matrix.makeflags }}
      latest-image: ${{ matrix.latest-image }}
      latest-image-aio: ${{ matrix.latest-image-aio }}
      skip-drivers: ${{ matrix.skip-drivers }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -302,7 +301,6 @@ jobs:
            latest-image: 'latest-cpu'
            latest-image-aio: 'latest-aio-cpu'
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -314,7 +312,6 @@ jobs:
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -326,7 +323,6 @@ jobs:
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -338,7 +334,6 @@ jobs:
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -349,7 +344,6 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'vulkan'
            platforms: 'linux/amd64'
@@ -360,45 +354,4 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
 #  parallel-builds:
 #    uses: ./.github/workflows/image_build.yml
 #    with:
 #      tag-latest: ${{ matrix.tag-latest }}
 #      tag-suffix: ${{ matrix.tag-suffix }}
 #      ffmpeg: ${{ matrix.ffmpeg }}
 #      image-type: ${{ matrix.image-type }}
 #      build-type: ${{ matrix.build-type }}
 #      cuda-major-version: ${{ matrix.cuda-major-version }}
 #      cuda-minor-version: ${{ matrix.cuda-minor-version }}
 #      platforms: ${{ matrix.platforms }}
 #      runs-on: ${{ matrix.runs-on }}
 #      aio: ${{ matrix.aio }}
 #      base-image: ${{ matrix.base-image }}
 #      grpc-base-image: ${{ matrix.grpc-base-image }}
 #      makeflags: ${{ matrix.makeflags }}
 #      latest-image: ${{ matrix.latest-image }}
 #      latest-image-aio: ${{ matrix.latest-image-aio }}
 #      skip-drivers: ${{ matrix.skip-drivers }}
 #    secrets:
 #      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
 #      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
 #      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
 #      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
 #    strategy:
 #      matrix:
 #        include:
 #          - build-type: 'cublas'
 #            cuda-major-version: "12"
 #            cuda-minor-version: "0"
 #            platforms: 'linux/arm64'
 #            tag-latest: 'false'
 #            tag-suffix: '-nvidia-l4t-arm64-core'
 #            latest-image: 'latest-nvidia-l4t-arm64-core'
 #            ffmpeg: 'true'
 #            image-type: 'core'
 #            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
 #            runs-on: 'self-hosted'
 #            makeflags: "--jobs=4 --output-sync=target"
 #            skip-drivers: 'true'
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -49,10 +49,6 @@ on:
        description: 'FFMPEG'
        default: ''
        type: string
      skip-drivers:
        description: 'Skip drivers by default'
        default: 'false'
        type: string
      image-type:
        description: 'Image type'
        default: ''
@@ -238,7 +234,6 @@ jobs:
            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
            GRPC_VERSION=v1.65.0
            MAKEFLAGS=${{ inputs.makeflags }}
            SKIP_DRIVERS=${{ inputs.skip-drivers }}
          context: .
          file: ./Dockerfile
          cache-from: type=gha
@@ -267,7 +262,6 @@ jobs:
            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
            GRPC_VERSION=v1.65.0
            MAKEFLAGS=${{ inputs.makeflags }}
            SKIP_DRIVERS=${{ inputs.skip-drivers }}
          context: .
          file: ./Dockerfile
          cache-from: type=gha
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.0
+        uses: securego/gosec@v2.21.4
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/9
+++ b/9
@@ -115,13 +115,12 @@ FROM requirements-${IMAGE_TYPE} AS requirements-drivers
 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=12
 ARG CUDA_MINOR_VERSION=0
 ARG SKIP_DRIVERS=false
 ENV BUILD_TYPE=${BUILD_TYPE}
 # Vulkan requirements
 RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+    if [ "${BUILD_TYPE}" = "vulkan" ]; then
        apt-get update && \
        apt-get install -y  --no-install-recommends \
            software-properties-common pciutils wget gpg-agent && \
@@ -137,7 +136,7 @@ EOT
 # CuBLAS requirements
 RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+    if [ "${BUILD_TYPE}" = "cublas" ]; then
        apt-get update && \
        apt-get install -y  --no-install-recommends \
            software-properties-common pciutils
@@ -163,7 +162,7 @@ RUN <<EOT bash
 EOT
 # If we are building with clblas support, we need the libraries for the builds
-RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
        apt-get update && \
        apt-get install -y --no-install-recommends \
            libclblast-dev && \
@@ -171,7 +170,7 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
        rm -rf /var/lib/apt/lists/* \
    ; fi
-RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
        apt-get update && \
        apt-get install -y --no-install-recommends \
            hipblas-dev \
--- a/63
+++ b/63
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=504af20ee4eae72080a56d59d744f6774f7901ce
+CPPLLAMA_VERSION?=dafae66cc242eb766797194d3c85c5e502625623
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -32,7 +32,7 @@ BARKCPP_VERSION?=v1.0.0
 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=dcf91f9e0f2cbf9da472ee2a556751ed4bab2d2a
+STABLEDIFFUSION_GGML_VERSION?=9578fdcc4632dc3de5565f28e2fb16b7c18f8d48
 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
@@ -40,6 +40,7 @@ ONNX_OS?=linux
 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
 export GGML_CMAKE_ARGS?=
 export CMAKE_ARGS?=
 export BACKEND_LIBS?=
@@ -88,9 +89,45 @@ ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
-# IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
+# IF native is false, we add -DGGML_NATIVE=OFF to GGML_CMAKE_ARGS
 ifeq ($(NATIVE),false)
-	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+	GGML_CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif
 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
 GGML_CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
 # If build type is cublas, then we set -DGGML_CUDA=ON to GGML_CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
 	GGML_CMAKE_ARGS+=-DGGML_CUDA=ON
 # If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # to GGML_CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
 	GGML_CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
 	GGML_CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
 	GGML_CMAKE_ARGS+=-DGGML_HIP=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
 else ifeq ($(OS),Darwin)
 	ifneq ($(BUILD_TYPE),metal)
 		GGML_CMAKE_ARGS+=-DGGML_METAL=OFF
 	else
 		GGML_CMAKE_ARGS+=-DGGML_METAL=ON
 		GGML_CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
 		TARGET+=--target ggml-metal
 	endif
 endif
 ifeq ($(BUILD_TYPE),sycl_f16)
 	GGML_CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 endif
 ifeq ($(BUILD_TYPE),sycl_f32)
 	GGML_CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 endif
 # Detect if we are running on arm64
@@ -117,7 +154,7 @@ ifeq ($(OS),Darwin)
 		BUILD_TYPE=metal
 	# disable metal if on Darwin and any other value is explicitly passed.
 	else ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DGGML_METAL=OFF
+		GGML_CMAKE_ARGS+=-DGGML_METAL=OFF
 		export GGML_NO_ACCELERATE=1
 		export GGML_NO_METAL=1
 	endif
@@ -142,7 +179,7 @@ ifeq ($(BUILD_TYPE),cublas)
 endif
 ifeq ($(BUILD_TYPE),vulkan)
-	CMAKE_ARGS+=-DGGML_VULKAN=1
+	GGML_CMAKE_ARGS+=-DGGML_VULKAN=1
 endif
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
@@ -164,7 +201,7 @@ ifeq ($(BUILD_TYPE),hipblas)
 	export GGML_HIP=1
 	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
 	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
-	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+	GGML_CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
 	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
 endif
@@ -235,6 +272,8 @@ ifeq ($(BUILD_API_ONLY),true)
 	GRPC_BACKENDS=
 endif
 export CMAKE_ARGS?=$(GGML_CMAKE_ARGS)
 .PHONY: all test build vendor get-sources prepare-sources prepare
 all: help
@@ -302,8 +341,14 @@ sources/stablediffusion-ggml.cpp:
 	git checkout $(STABLEDIFFUSION_GGML_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
-backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
+sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a: sources/stablediffusion-ggml.cpp
-	$(MAKE) -C backend/go/image/stablediffusion-ggml build/libstable-diffusion.a
+	cd sources/stablediffusion-ggml.cpp && \
 	mkdir -p build && \
 	cd build && \
 	cmake $(CMAKE_ARGS) .. && \
 	cmake --build . --config Release
 backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a
 	$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
 backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
--- a/README.md
+++ b/README.md
@@ -126,10 +126,10 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
 ## 🚀 [Features](https://localai.io/features/)
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
+- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
 - 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
 - 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation](https://localai.io/features/image-generation)
+- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
 - 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/) 
 - 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
 - ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
@@ -137,7 +137,6 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
 - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
 - 📈 [Reranker API](https://localai.io/features/reranker/)
 - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
 - 🔊 Voice activity detection (Silero-VAD support)
 - 🌍 Integrated WebUI!
 ## 💻 Usage
@@ -160,7 +159,6 @@ Model galleries
 Other:
 - Helm chart https://github.com/go-skynet/helm-charts
 - VSCode extension https://github.com/badgooooor/localai-vscode-plugin
 - Langchain: https://python.langchain.com/docs/integrations/providers/localai/
 - Terminal utility https://github.com/djcopley/ShellOracle
 - Local Smart assistant https://github.com/mudler/LocalAGI
 - Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -159,7 +159,6 @@ message Reply {
  bytes message = 1;
  int32 tokens = 2;
  int32 prompt_tokens = 3;
  bytes audio = 5;
 }
 message ModelOptions {
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -7,42 +7,6 @@ BUILD_TYPE?=
 ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 TARGET?=--target grpc-server
 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DGGML_CUDA=ON
 # If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DGGML_HIP=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
 else ifeq ($(OS),Darwin)
 	ifneq ($(BUILD_TYPE),metal)
 		CMAKE_ARGS+=-DGGML_METAL=OFF
 	else
 		CMAKE_ARGS+=-DGGML_METAL=ON
 		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
 		TARGET+=--target ggml-metal
 	endif
 endif
 ifeq ($(BUILD_TYPE),sycl_f16)
 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 endif
 ifeq ($(BUILD_TYPE),sycl_f32)
 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 endif
 llama.cpp:
 	mkdir -p llama.cpp
 	cd llama.cpp && \
@@ -70,7 +34,7 @@ clean: purge
 	rm -rf llama.cpp
 grpc-server: llama.cpp llama.cpp/examples/grpc-server
-	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
+	@echo "Building grpc-server for llama.cpp with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
 	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)"
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -428,7 +428,6 @@ struct llama_server_context
 {
    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
    const llama_vocab * vocab = nullptr;
    clip_ctx *clp_ctx = nullptr;
@@ -440,7 +439,6 @@ struct llama_server_context
    bool clean_kv_cache     = true;
    bool all_slots_are_idle = false;
    bool add_bos_token      = true;
    bool has_eos_token      = true;
    int32_t n_ctx;  // total context for all clients / slots
@@ -494,8 +492,8 @@ struct llama_server_context
        }
        common_init_result common_init = common_init_from_params(params);
-        model = common_init.model.release();
+        model = common_init.model;
-        ctx = common_init.context.release();
+        ctx = common_init.context;
        if (model == nullptr)
        {
            LOG_ERR("unable to load model: %s", params.model.c_str());
@@ -504,7 +502,7 @@ struct llama_server_context
        if (multimodal) {
            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
-            const int n_embd_llm  = llama_model_n_embd(model);
+            const int n_embd_llm  = llama_n_embd(model);
            if (n_embd_clip != n_embd_llm) {
                LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
                llama_free(ctx);
@@ -513,15 +511,23 @@ struct llama_server_context
            }
        }
        vocab = llama_model_get_vocab(model);
        n_ctx = llama_n_ctx(ctx);
-        add_bos_token = llama_vocab_get_add_bos(vocab);
+        add_bos_token = llama_add_bos_token(model);
        has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
        return true;
    }
    void validate_model_chat_template(server_params & sparams) {
        llama_chat_message chat[] = {{"user", "test"}};
        std::vector<char> buf(1);
        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
        if (res < 0) {
            LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
            sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
        }
    }
    llama_client_slot* get_active_slot() {
        for (llama_client_slot& slot : slots) {
            // Check if the slot is currently processing
@@ -675,6 +681,7 @@ struct llama_server_context
        slot->sparams.mirostat          = json_value(data, "mirostat",          default_sparams.mirostat);
        slot->sparams.mirostat_tau      = json_value(data, "mirostat_tau",      default_sparams.mirostat_tau);
        slot->sparams.mirostat_eta      = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
        slot->sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
        slot->params.n_keep             = json_value(data, "n_keep",            slot->params.n_keep);
        slot->sparams.seed               = json_value(data, "seed",              default_sparams.seed);
        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
@@ -719,8 +726,8 @@ struct llama_server_context
            slot->prompt = "";
        }
-        if (json_value(data, "ignore_eos", false) && has_eos_token) {
+        if (json_value(data, "ignore_eos", false)) {
-                slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
+                slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
        }
        /*
        slot->sparams.penalty_prompt_tokens.clear();
@@ -759,13 +766,13 @@ struct llama_server_context
            }
        }
      */
        slot->sparams.logit_bias.clear();
        const auto &logit_bias = data.find("logit_bias");
        if (logit_bias != data.end() && logit_bias->is_array())
        {
-            const llama_vocab * vocab = llama_model_get_vocab(model);
+            const int n_vocab = llama_n_vocab(model);
            const int n_vocab = llama_vocab_n_tokens(vocab);
            for (const auto &el : *logit_bias)
            {
                if (el.is_array() && el.size() == 2)
@@ -794,7 +801,7 @@ struct llama_server_context
                    }
                    else if (el[0].is_string())
                    {
-                        auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
+                        auto toks = common_tokenize(model, el[0].get<std::string>(), false);
                        for (auto tok : toks)
                        {
                            slot->sparams.logit_bias.push_back({tok, bias});
@@ -1124,7 +1131,7 @@ struct llama_server_context
            slot.has_next_token = false;
        }
-        if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
+        if (result.tok == llama_token_eos(model))
        {
            slot.stopped_eos = true;
            slot.has_next_token = false;
@@ -1206,12 +1213,13 @@ struct llama_server_context
            {"mirostat",          slot.sparams.mirostat},
            {"mirostat_tau",      slot.sparams.mirostat_tau},
            {"mirostat_eta",      slot.sparams.mirostat_eta},
            {"penalize_nl",       slot.sparams.penalize_nl},
            {"stop",              slot.params.antiprompt},
            {"n_predict",         slot.params.n_predict},
            {"n_keep",            params.n_keep},
            {"ignore_eos",        slot.sparams.ignore_eos},
            {"stream",            slot.params.stream},
-             //      {"logit_bias",        slot.sparams.logit_bias},
+      //      {"logit_bias",        slot.sparams.logit_bias},
            {"n_probs",           slot.sparams.n_probs},
            {"min_keep",          slot.sparams.min_keep},
            {"grammar",           slot.sparams.grammar},
@@ -1319,7 +1327,7 @@ struct llama_server_context
        res.error = false;
        res.stop = true;
-        const int n_embd = llama_model_n_embd(model);
+        const int n_embd = llama_n_embd(model);
        if (!params.embedding)
        {
            LOG_WARNING("embedding disabled", {
@@ -1418,7 +1426,7 @@ struct llama_server_context
                    n_eval = n_batch;
                }
-                const int n_embd = llama_model_n_embd(model);
+                const int n_embd = llama_n_embd(model);
                float * embd = img.image_embedding + i * n_embd;
                llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
                if (llama_decode(ctx, llava_batch.batch))
@@ -1699,11 +1707,11 @@ struct llama_server_context
                            suffix_tokens.erase(suffix_tokens.begin());
                        }
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_fim_pre(vocab));
+                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_bos(vocab)); // always add BOS
+                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
-                        prefix_tokens.insert(prefix_tokens.end(),   llama_vocab_fim_suf(vocab));
+                        prefix_tokens.insert(prefix_tokens.end(),   llama_token_suffix(model));
                        prefix_tokens.insert(prefix_tokens.end(),   suffix_tokens.begin(), suffix_tokens.end());
-                        prefix_tokens.push_back(llama_vocab_fim_mid(vocab));
+                        prefix_tokens.push_back(llama_token_middle(model));
                        prompt_tokens = prefix_tokens;
                    }
                    else
@@ -2104,6 +2112,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    //     slot->sparams.mirostat        = json_value(data, "mirostat",          default_sparams.mirostat);
    //     slot->sparams.mirostat_tau    = json_value(data, "mirostat_tau",      default_sparams.mirostat_tau);
    //     slot->sparams.mirostat_eta    = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
    //     slot->sparams.penalize_nl     = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
    //     slot->params.n_keep           = json_value(data, "n_keep",            slot->params.n_keep);
    //     slot->params.seed             = json_value(data, "seed",              default_params.seed);
    //     slot->sparams.grammar         = json_value(data, "grammar",           default_sparams.grammar);
@@ -2126,6 +2135,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    data["mirostat"] = predict->mirostat();
    data["mirostat_tau"] = predict->mirostattau();
    data["mirostat_eta"] = predict->mirostateta();
    data["penalize_nl"] = predict->penalizenl();
    data["n_keep"] = predict->nkeep();
    data["seed"] = predict->seed();
    data["grammar"] = predict->grammar();
@@ -2171,6 +2181,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 //     llama.params.sparams.mirostat = predict->mirostat();
 //     llama.params.sparams.mirostat_tau = predict->mirostattau();
 //     llama.params.sparams.mirostat_eta = predict->mirostateta();
 //     llama.params.sparams.penalize_nl = predict->penalizenl();
 //     llama.params.n_keep = predict->nkeep();
 //     llama.params.seed = predict->seed();
 //     llama.params.sparams.grammar = predict->grammar();
@@ -2217,35 +2228,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 //     }
 // }
 const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,
    GGML_TYPE_F16,
    GGML_TYPE_BF16,
    GGML_TYPE_Q8_0,
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
    GGML_TYPE_IQ4_NL,
    GGML_TYPE_Q5_0,
    GGML_TYPE_Q5_1,
 };
 static ggml_type kv_cache_type_from_str(const std::string & s) {
    for (const auto & type : kv_cache_types) {
        if (ggml_type_name(type) == s) {
            return type;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
 }
 static std::string get_all_kv_cache_types() {
    std::ostringstream msg;
    for (const auto & type : kv_cache_types) {
        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
    }
    return msg.str();
 }
 static void params_parse(const backend::ModelOptions* request,
                                common_params & params) {
@@ -2260,10 +2242,10 @@ static void params_parse(const backend::ModelOptions* request,
    //  params.model_alias ??
    params.model_alias =  request->modelfile();
    if (!request->cachetypekey().empty()) {
-        params.cache_type_k = kv_cache_type_from_str(request->cachetypekey());
+        params.cache_type_k = request->cachetypekey();
    }
    if (!request->cachetypevalue().empty()) {
-        params.cache_type_v = kv_cache_type_from_str(request->cachetypevalue());
+        params.cache_type_v = request->cachetypevalue();
    }
    params.n_ctx = request->contextsize();
    //params.memory_f16 = request->f16memory();
--- a/backend/cpp/llama/patches/01-llava.patch
+++ b/backend/cpp/llama/patches/01-llava.patch
@@ -1,13 +1,13 @@
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 3cd0d2fa..6c5e811a 100644
+index 342042ff..224db9b5 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
-@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
+@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
-                 struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+             struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-                 int* patches_data = (int*)malloc(ggml_nbytes(patches));
+             int* patches_data = (int*)malloc(ggml_nbytes(patches));
-                 for (int i = 0; i < num_patches; i++) {
+             for (int i = 0; i < num_patches; i++) {
-                    patches_data[i] = i + 1;
+-                patches_data[i] = i + 1;
-+                    patches_data[i] = i;
+                patches_data[i] = i;
-                 }
+             }
-                 ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
+             ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-                 free(patches_data);
+             free(patches_data);
--- a/backend/go/image/stablediffusion-ggml/Makefile
+++ b/backend/go/image/stablediffusion-ggml/Makefile
@@ -2,95 +2,20 @@ INCLUDE_PATH := $(abspath ./)
 LIBRARY_PATH := $(abspath ./)
 AR?=ar
-CMAKE_ARGS?=
+
 BUILD_TYPE?=
 ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 # keep standard at C11 and C++11
 CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DGGML_CUDA=ON
 # If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DGGML_HIP=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
 else ifeq ($(OS),Darwin)
 	ifneq ($(BUILD_TYPE),metal)
 		CMAKE_ARGS+=-DGGML_METAL=OFF
 	else
 		CMAKE_ARGS+=-DGGML_METAL=ON
 		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
 		TARGET+=--target ggml-metal
 	endif
 endif
 # ifeq ($(BUILD_TYPE),sycl_f16)
 # 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
 # endif
 # ifeq ($(BUILD_TYPE),sycl_f32)
 # 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
 # endif
 # warnings
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
 # Find all .a archives in ARCHIVE_DIR
 # (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
 GGML_ARCHIVE_DIR := build/ggml/src/
 ALL_ARCHIVES := $(shell find $(GGML_ARCHIVE_DIR) -type f -name '*.a')
 # Name of the single merged library
 COMBINED_LIB := libggmlall.a
 # Rule to merge all the .a files into one
 $(COMBINED_LIB): $(ALL_ARCHIVES)
 	@echo "Merging all .a into $(COMBINED_LIB)"
 	rm -f $@
 	mkdir -p merge-tmp
 	for a in $(ALL_ARCHIVES); do \
 		( cd merge-tmp && ar x ../$$a ); \
 	done
 	( cd merge-tmp && ar rcs ../$@ *.o )
 	# Ensure we have a proper index
 	ranlib $@
 	# Clean up
 	rm -rf merge-tmp
 build/libstable-diffusion.a:
 	@echo "Building SD with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
 	mkdir -p build && \
 	cd build && \
 	cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
 	cmake --build . --config Release"
 else
 	mkdir -p build && \
 	cd build && \
 	cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
 	cmake --build . --config Release
 endif
 	$(MAKE) $(COMBINED_LIB)
 gosd.o:
 	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
 libsd.a: gosd.o
-	cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
+	cp $(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a ./libsd.a
 	$(AR) rcs libsd.a gosd.o
 clean:
-	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
+	rm -f gosd.o libsd.a
--- a/backend/go/image/stablediffusion-ggml/gosd.go
+++ b/backend/go/image/stablediffusion-ggml/gosd.go
@@ -1,7 +1,7 @@
 package main
 // #cgo CXXFLAGS: -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/ggml/include
-// #cgo LDFLAGS: -L${SRCDIR}/ -lsd -lstdc++ -lm -lggmlall -lgomp
+// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/build/ggml/src/ggml-cpu -L${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/build/ggml/src -lsd -lstdc++ -lm -lggml -lggml-base -lggml-cpu -lgomp
 // #include <gosd.h>
 // #include <stdlib.h>
 import "C"
--- a/backend/go/vad/silero/vad.go
+++ b/backend/go/vad/silero/vad.go
@@ -21,8 +21,8 @@ func (vad *VAD) Load(opts *pb.ModelOptions) error {
 		SampleRate: 16000,
 		//WindowSize:           1024,
 		Threshold:            0.5,
-		MinSilenceDurationMs: 100,
+		MinSilenceDurationMs: 0,
-		SpeechPadMs:          30,
+		SpeechPadMs:          0,
 	})
 	if err != nil {
 		return fmt.Errorf("create silero detector: %w", err)
--- a/backend/python/autogptq/requirements-intel.txt
+++ b/backend/python/autogptq/requirements-intel.txt
@@ -1,6 +1,5 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.69.0
+grpcio==1.68.1
 protobuf
 certifi
 transformers
--- a/backend/python/bark/requirements-intel.txt
+++ b/backend/python/bark/requirements-intel.txt
@@ -1,9 +1,8 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
-torchaudio==2.3.1+cxx11.abi
+torchaudio
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.69.0
+grpcio==1.68.1
 protobuf
 certifi
--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -17,9 +17,6 @@
 # LIMIT_TARGETS="cublas12"
 # source $(dirname $0)/../common/libbackend.sh
 #
 PYTHON_VERSION="3.10"
 function init() {
    # Name of the backend (directory name)
    BACKEND_NAME=${PWD##*/}
@@ -91,7 +88,7 @@ function getBuildProfile() {
 # always result in an activated virtual environment
 function ensureVenv() {
    if [ ! -d "${EDIR}/venv" ]; then
-        uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
+        uv venv ${EDIR}/venv
        echo "virtualenv created"
    fi
--- a/backend/python/common/template/requirements-intel.txt
+++ b/backend/python/common/template/requirements-intel.txt
@@ -1,5 +1,4 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.69.0
+grpcio==1.68.1
 protobuf
 grpcio-tools
--- a/backend/python/coqui/requirements-intel.txt
+++ b/backend/python/coqui/requirements-intel.txt
@@ -1,10 +1,9 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
-torchaudio==2.3.1+cxx11.abi
+torchaudio
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
 coqui-tts
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.68.1
 protobuf
 certifi
 packaging==24.1
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -17,7 +17,7 @@ import backend_pb2_grpc
 import grpc
-from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
+from diffusers import StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
    EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
 from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
@@ -275,13 +275,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                    if request.LowVRAM:
                        self.pipe.enable_model_cpu_offload()
            elif request.PipelineType == "SanaPipeline":
                self.pipe = SanaPipeline.from_pretrained(
                    request.Model,
                    variant="bf16",
                    torch_dtype=torch.bfloat16)
                self.pipe.vae.to(torch.bfloat16)
                self.pipe.text_encoder.to(torch.bfloat16)
            if CLIPSKIP and request.CLIPSkip != 0:
                self.clip_skip = request.CLIPSkip
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -1,10 +1,9 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
-torchvision==0.18.1+cxx11.abi
+torchvision
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 diffusers
 opencv-python
 transformers
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.69.0
+grpcio==1.68.1
 pillow
 protobuf
 certifi
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.68.1
 protobuf
 certifi
 wheel
--- a/backend/python/mamba/requirements.txt
+++ b/backend/python/mamba/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.69.0
+grpcio==1.68.1
 protobuf
 certifi
--- a/backend/python/openvoice/requirements-cpu.txt
+++ b/backend/python/openvoice/requirements-cpu.txt
@@ -1,7 +1,3 @@
 torch==2.4.1
 git+https://github.com/myshell-ai/MeloTTS.git
 git+https://github.com/myshell-ai/OpenVoice.git
 whisper-timestamped
 pydub==0.25.1
 wavmark==0.0.3
 eng_to_ipa==0.0.2
--- a/backend/python/openvoice/requirements-cublas11.txt
+++ b/backend/python/openvoice/requirements-cublas11.txt
@@ -2,7 +2,3 @@
 torch==2.4.1+cu118
 git+https://github.com/myshell-ai/MeloTTS.git
 git+https://github.com/myshell-ai/OpenVoice.git
 whisper-timestamped
 pydub==0.25.1
 wavmark==0.0.3
 eng_to_ipa==0.0.2
--- a/backend/python/openvoice/requirements-cublas12.txt
+++ b/backend/python/openvoice/requirements-cublas12.txt
@@ -1,7 +1,3 @@
 torch==2.4.1
 git+https://github.com/myshell-ai/MeloTTS.git
 git+https://github.com/myshell-ai/OpenVoice.git
 whisper-timestamped
 pydub==0.25.1
 wavmark==0.0.3
 eng_to_ipa==0.0.2
--- a/backend/python/openvoice/requirements-hipblas.txt
+++ b/backend/python/openvoice/requirements-hipblas.txt
@@ -2,7 +2,3 @@
 torch==2.4.1+rocm6.0
 git+https://github.com/myshell-ai/MeloTTS.git
 git+https://github.com/myshell-ai/OpenVoice.git
 whisper-timestamped
 pydub==0.25.1
 wavmark==0.0.3
 eng_to_ipa==0.0.2
--- a/backend/python/openvoice/requirements-intel.txt
+++ b/backend/python/openvoice/requirements-intel.txt
@@ -1,15 +1,14 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
 torchaudio==2.3.1+cxx11.abi
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-grpcio==1.69.0
+grpcio==1.68.1
 protobuf
 librosa==0.9.1
 faster-whisper==0.9.0
 pydub==0.25.1
 wavmark==0.0.3
 numpy==1.22.0
 eng_to_ipa==0.0.2
 inflect==7.0.0
 unidecode==1.3.7
--- a/backend/python/openvoice/requirements.txt
+++ b/backend/python/openvoice/requirements.txt
@@ -1,17 +1,20 @@
-grpcio==1.69.0
+grpcio==1.68.1
 protobuf
 librosa
 faster-whisper
 pydub==0.25.1
 wavmark==0.0.3
 numpy==1.22.0
 eng_to_ipa==0.0.2
 inflect
 unidecode
 whisper-timestamped
 openai
 python-dotenv
 pypinyin
 cn2an==0.5.22
 numpy==1.22.0
 networkx==2.8.8
 jieba==0.42.1
-gradio==5.9.1
+gradio==3.48.0
 langid==1.1.6
 llvmlite==0.43.0
 setuptools
--- a/backend/python/parler-tts/requirements-intel.txt
+++ b/backend/python/parler-tts/requirements-intel.txt
@@ -1,8 +1,8 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
-torchaudio==2.3.1+cxx11.abi
+torchaudio
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
 setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/parler-tts/requirements.txt
+++ b/backend/python/parler-tts/requirements.txt
@@ -1,4 +1,3 @@
-grpcio==1.69.0
+grpcio==1.68.1
 certifi
 llvmlite==0.43.0
 setuptools
--- a/backend/python/rerankers/requirements-intel.txt
+++ b/backend/python/rerankers/requirements-intel.txt
@@ -1,9 +1,8 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
 transformers
 accelerate
-torch==2.3.1+cxx11.abi
+torch
 oneccl_bind_pt==2.3.100+xpu
 rerankers[transformers]
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/rerankers/requirements.txt
+++ b/backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.69.0
+grpcio==1.68.1
 protobuf
 certifi
--- a/backend/python/sentencetransformers/requirements-intel.txt
+++ b/backend/python/sentencetransformers/requirements-intel.txt
@@ -1,9 +1,8 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
 accelerate
 sentence-transformers==3.3.1
 transformers
--- a/backend/python/sentencetransformers/requirements.txt
+++ b/backend/python/sentencetransformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.68.1
 protobuf
 certifi
 datasets
--- a/backend/python/transformers-musicgen/requirements-intel.txt
+++ b/backend/python/transformers-musicgen/requirements-intel.txt
@@ -1,8 +1,7 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
 transformers
 oneccl_bind_pt==2.3.100+xpu
 accelerate
-torch==2.3.1+cxx11.abi
+torch
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/transformers-musicgen/requirements.txt
+++ b/backend/python/transformers-musicgen/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.68.1
 protobuf
 scipy==1.14.0
 certifi
--- a/backend/python/transformers/requirements-intel.txt
+++ b/backend/python/transformers/requirements-intel.txt
@@ -1,7 +1,6 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
 intel-extension-for-transformers
 bitsandbytes
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.68.1
 protobuf
 certifi
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vall-e-x/requirements-intel.txt
+++ b/backend/python/vall-e-x/requirements-intel.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
 accelerate
-torch==2.3.1+cxx11.abi
+torch
-torchaudio==2.3.1+cxx11.abi
+torchaudio
 optimum[openvino]
-oneccl_bind_pt==2.3.100+xpu
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vall-e-x/requirements.txt
+++ b/backend/python/vall-e-x/requirements.txt
@@ -1,4 +1,3 @@
-grpcio==1.69.0
+grpcio==1.68.1
 protobuf
 certifi
 setuptools
--- a/backend/python/vllm/requirements-intel.txt
+++ b/backend/python/vllm/requirements-intel.txt
@@ -1,9 +1,8 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
 accelerate
-torch==2.3.1+cxx11.abi
+torch
 transformers
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 bitsandbytes
 oneccl_bind_pt==2.3.100+xpu
--- a/backend/python/vllm/requirements.txt
+++ b/backend/python/vllm/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.68.1
 protobuf
 certifi
 setuptools
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -22,9 +22,8 @@ import (
 )
 type LLMResponse struct {
-	Response    string // should this be []byte?
+	Response string // should this be []byte?
-	Usage       TokenUsage
+	Usage    TokenUsage
 	AudioOutput string
 }
 type TokenUsage struct {
@@ -118,12 +117,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 			ss := ""
 			var partialRune []byte
-			err := inferenceModel.PredictStream(ctx, opts, func(reply *proto.Reply) {
+			err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
-				msg := reply.Message
+				partialRune = append(partialRune, chars...)
 				partialRune = append(partialRune, msg...)
 				tokenUsage.Prompt = int(reply.PromptTokens)
 				tokenUsage.Completion = int(reply.Tokens)
 				for len(partialRune) > 0 {
 					r, size := utf8.DecodeRune(partialRune)
@@ -137,10 +132,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 					partialRune = partialRune[size:]
 				}
 				if len(msg) == 0 {
 					tokenCallback("", tokenUsage)
 				}
 			})
 			return LLMResponse{
 				Response: ss,
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -38,7 +38,6 @@ type BackendConfig struct {
 	TemplateConfig      TemplateConfig         `yaml:"template"`
 	KnownUsecaseStrings []string               `yaml:"known_usecases"`
 	KnownUsecases       *BackendConfigUsecases `yaml:"-"`
 	Pipeline            Pipeline               `yaml:"pipeline"`
 	PromptStrings, InputStrings                []string               `yaml:"-"`
 	InputToken                                 [][]int                `yaml:"-"`
@@ -77,18 +76,6 @@ type BackendConfig struct {
 	Options []string `yaml:"options"`
 }
 // Pipeline defines other models to use for audio-to-audio
 type Pipeline struct {
 	TTS           string `yaml:"tts"`
 	LLM           string `yaml:"llm"`
 	Transcription string `yaml:"transcription"`
 	VAD           string `yaml:"vad"`
 }
 func (p Pipeline) IsNotConfigured() bool {
 	return p.LLM == "" || p.TTS == "" || p.Transcription == ""
 }
 type File struct {
 	Filename string         `yaml:"filename" json:"filename"`
 	SHA256   string         `yaml:"sha256" json:"sha256"`
--- a/core/http/app.go
+++ b/core/http/app.go
@@ -7,7 +7,6 @@ import (
 	"net/http"
 	"github.com/dave-gray101/v2keyauth"
 	"github.com/gofiber/websocket/v2"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/mudler/LocalAI/core/http/endpoints/localai"
@@ -88,17 +87,6 @@ func API(application *application.Application) (*fiber.App, error) {
 	router := fiber.New(fiberCfg)
 	router.Use(middleware.StripPathPrefix())
 	router.Use("/v1/realtime", func(c *fiber.Ctx) error {
 		if websocket.IsWebSocketUpgrade(c) {
 			// Returns true if the client requested upgrade to the WebSocket protocol
 			return c.Next()
 		}
 		return nil
 	})
 	router.Hooks().OnListen(func(listenData fiber.ListenData) error {
 		scheme := "http"
 		if listenData.TLS {
--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@@ -237,31 +237,6 @@ func postInvalidRequest(url string) (error, int) {
 	return nil, resp.StatusCode
 }
 func getRequest(url string, header http.Header) (error, int, []byte) {
 	req, err := http.NewRequest("GET", url, nil)
 	if err != nil {
 		return err, -1, nil
 	}
 	req.Header = header
 	client := &http.Client{}
 	resp, err := client.Do(req)
 	if err != nil {
 		return err, -1, nil
 	}
 	defer resp.Body.Close()
 	body, err := io.ReadAll(resp.Body)
 	if err != nil {
 		return err, -1, nil
 	}
 	return nil, resp.StatusCode, body
 }
 const bertEmbeddingsURL = `https://gist.githubusercontent.com/mudler/0a080b166b87640e8644b09c2aee6e3b/raw/f0e8c26bb72edc16d9fbafbfd6638072126ff225/bert-embeddings-gallery.yaml`
 //go:embed backend-assets/*
@@ -370,33 +345,6 @@ var _ = Describe("API test", func() {
 			})
 		})
 		Context("URL routing Tests", func() {
 			It("Should support reverse-proxy when unauthenticated", func() {
 				err, sc, body := getRequest("http://127.0.0.1:9090/myprefix/", http.Header{
 					"X-Forwarded-Proto":  {"https"},
 					"X-Forwarded-Host":   {"example.org"},
 					"X-Forwarded-Prefix": {"/myprefix/"},
 				})
 				Expect(err).To(BeNil(), "error")
 				Expect(sc).To(Equal(401), "status code")
 				Expect(string(body)).To(ContainSubstring(`<base href="https://example.org/myprefix/" />`), "body")
 			})
 			It("Should support reverse-proxy when authenticated", func() {
 				err, sc, body := getRequest("http://127.0.0.1:9090/myprefix/", http.Header{
 					"Authorization":      {bearerKey},
 					"X-Forwarded-Proto":  {"https"},
 					"X-Forwarded-Host":   {"example.org"},
 					"X-Forwarded-Prefix": {"/myprefix/"},
 				})
 				Expect(err).To(BeNil(), "error")
 				Expect(sc).To(Equal(200), "status code")
 				Expect(string(body)).To(ContainSubstring(`<base href="https://example.org/myprefix/" />`), "body")
 			})
 		})
 		Context("Applying models", func() {
 			It("applies models from a gallery", func() {
@@ -756,7 +704,7 @@ var _ = Describe("API test", func() {
 			Expect(err).ToNot(HaveOccurred(), fmt.Sprint(resp))
 			Expect(resp.StatusCode).To(Equal(200), fmt.Sprint(string(dat)))
-			Expect(resp.Header.Get("Content-Type")).To(Or(Equal("audio/x-wav"), Equal("audio/vnd.wave")))
+			Expect(resp.Header.Get("Content-Type")).To(Equal("audio/x-wav"))
 		})
 		It("installs and is capable to generate images", Label("stablediffusion"), func() {
 			if runtime.GOOS != "linux" {
--- a/core/http/ctx/fiber.go
+++ b/core/http/ctx/fiber.go
@@ -19,11 +19,9 @@ func ModelFromContext(ctx *fiber.Ctx, cl *config.BackendConfigLoader, loader *mo
 	if ctx.Params("model") != "" {
 		modelInput = ctx.Params("model")
 	}
 	if ctx.Query("model") != "" {
 		modelInput = ctx.Query("model")
 	}
 	// Set model from bearer token, if available
 	bearer := strings.TrimLeft(ctx.Get("authorization"), "Bear ") // Reduced duplicate characters of Bearer
 	bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
--- a/core/http/elements/buttons.go
+++ b/core/http/elements/buttons.go
@@ -16,7 +16,7 @@ func installButton(galleryName string) elem.Node {
 			"class":                 "float-right inline-block rounded bg-primary px-6 pb-2.5 mb-3 pt-2.5 text-xs font-medium uppercase leading-normal text-white shadow-primary-3 transition duration-150 ease-in-out hover:bg-primary-accent-300 hover:shadow-primary-2 focus:bg-primary-accent-300 focus:shadow-primary-2 focus:outline-none focus:ring-0 active:bg-primary-600 active:shadow-primary-2 dark:shadow-black/30 dark:hover:shadow-dark-strong dark:focus:shadow-dark-strong dark:active:shadow-dark-strong",
 			"hx-swap":               "outerHTML",
 			// post the Model ID as param
-			"hx-post": "browse/install/model/" + galleryName,
+			"hx-post": "/browse/install/model/" + galleryName,
 		},
 		elem.I(
 			attrs.Props{
@@ -36,7 +36,7 @@ func reInstallButton(galleryName string) elem.Node {
 			"hx-target":             "#action-div-" + dropBadChars(galleryName),
 			"hx-swap":               "outerHTML",
 			// post the Model ID as param
-			"hx-post": "browse/install/model/" + galleryName,
+			"hx-post": "/browse/install/model/" + galleryName,
 		},
 		elem.I(
 			attrs.Props{
@@ -80,7 +80,7 @@ func deleteButton(galleryID string) elem.Node {
 			"hx-target":             "#action-div-" + dropBadChars(galleryID),
 			"hx-swap":               "outerHTML",
 			// post the Model ID as param
-			"hx-post": "browse/delete/model/" + galleryID,
+			"hx-post": "/browse/delete/model/" + galleryID,
 		},
 		elem.I(
 			attrs.Props{
--- a/core/http/elements/gallery.go
+++ b/core/http/elements/gallery.go
@@ -47,7 +47,7 @@ func searchableElement(text, icon string) elem.Node {
 					//	"value":     text,
 					//"class":     "inline-block bg-gray-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2",
 					"href":      "#!",
-					"hx-post":   "browse/search/models",
+					"hx-post":   "/browse/search/models",
 					"hx-target": "#search-results",
 					// TODO: this doesn't work
 					//	"hx-vals":      `{ \"search\": \"` + text + `\" }`,
--- a/core/http/elements/progressbar.go
+++ b/core/http/elements/progressbar.go
@@ -64,7 +64,7 @@ func StartProgressBar(uid, progress, text string) string {
 	return elem.Div(
 		attrs.Props{
 			"hx-trigger": "done",
-			"hx-get":     "browse/job/" + uid,
+			"hx-get":     "/browse/job/" + uid,
 			"hx-swap":    "outerHTML",
 			"hx-target":  "this",
 		},
@@ -77,7 +77,7 @@ func StartProgressBar(uid, progress, text string) string {
 			},
 			elem.Text(bluemonday.StrictPolicy().Sanitize(text)), //Perhaps overly defensive
 			elem.Div(attrs.Props{
-				"hx-get":     "browse/job/progress/" + uid,
+				"hx-get":     "/browse/job/progress/" + uid,
 				"hx-trigger": "every 600ms",
 				"hx-target":  "this",
 				"hx-swap":    "innerHTML",
--- a/core/http/endpoints/explorer/dashboard.go
+++ b/core/http/endpoints/explorer/dashboard.go
@@ -6,7 +6,6 @@ import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/explorer"
 	"github.com/mudler/LocalAI/core/http/utils"
 	"github.com/mudler/LocalAI/internal"
 )
@@ -15,7 +14,6 @@ func Dashboard() func(*fiber.Ctx) error {
 		summary := fiber.Map{
 			"Title":   "LocalAI API - " + internal.PrintableVersion(),
 			"Version": internal.PrintableVersion(),
 			"BaseURL": utils.BaseURL(c),
 		}
 		if string(c.Context().Request.Header.ContentType()) == "application/json" || len(c.Accepts("html")) == 0 {
--- a/core/http/endpoints/localai/gallery.go
+++ b/core/http/endpoints/localai/gallery.go
@@ -9,7 +9,6 @@ import (
 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/http/utils"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services"
 	"github.com/rs/zerolog/log"
@@ -83,8 +82,7 @@ func (mgs *ModelGalleryEndpointService) ApplyModelGalleryEndpoint() func(c *fibe
 			Galleries:        mgs.galleries,
 			ConfigURL:        input.ConfigURL,
 		}
-
+		return c.JSON(schema.GalleryResponse{ID: uuid.String(), StatusURL: c.BaseURL() + "/models/jobs/" + uuid.String()})
 		return c.JSON(schema.GalleryResponse{ID: uuid.String(), StatusURL: fmt.Sprintf("%smodels/jobs/%s", utils.BaseURL(c), uuid.String())})
 	}
 }
@@ -107,7 +105,7 @@ func (mgs *ModelGalleryEndpointService) DeleteModelGalleryEndpoint() func(c *fib
 			return err
 		}
-		return c.JSON(schema.GalleryResponse{ID: uuid.String(), StatusURL: fmt.Sprintf("%smodels/jobs/%s", utils.BaseURL(c), uuid.String())})
+		return c.JSON(schema.GalleryResponse{ID: uuid.String(), StatusURL: c.BaseURL() + "/models/jobs/" + uuid.String()})
 	}
 }
--- a/core/http/endpoints/localai/welcome.go
+++ b/core/http/endpoints/localai/welcome.go
@@ -4,7 +4,6 @@ import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/http/utils"
 	"github.com/mudler/LocalAI/core/p2p"
 	"github.com/mudler/LocalAI/core/services"
 	"github.com/mudler/LocalAI/internal"
@@ -33,7 +32,6 @@ func WelcomeEndpoint(appConfig *config.ApplicationConfig,
 		summary := fiber.Map{
 			"Title":             "LocalAI API - " + internal.PrintableVersion(),
 			"Version":           internal.PrintableVersion(),
 			"BaseURL":           utils.BaseURL(c),
 			"Models":            modelsWithoutConfig,
 			"ModelsConfig":      backendConfigs,
 			"GalleryConfig":     galleryConfigs,
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -1,186 +0,0 @@
 package openai
 import (
 	"context"
 	"fmt"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	grpcClient "github.com/mudler/LocalAI/pkg/grpc"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	model "github.com/mudler/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
 	"google.golang.org/grpc"
 )
 var (
 	_ Model = new(wrappedModel)
 	_ Model = new(anyToAnyModel)
 )
 // wrappedModel represent a model which does not support Any-to-Any operations
 // This means that we will fake an Any-to-Any model by overriding some of the gRPC client methods
 // which are for Any-To-Any models, but instead we will call a pipeline (for e.g STT->LLM->TTS)
 type wrappedModel struct {
 	TTSConfig           *config.BackendConfig
 	TranscriptionConfig *config.BackendConfig
 	LLMConfig           *config.BackendConfig
 	TTSClient           grpcClient.Backend
 	TranscriptionClient grpcClient.Backend
 	LLMClient           grpcClient.Backend
 	VADConfig *config.BackendConfig
 	VADClient grpcClient.Backend
 }
 // anyToAnyModel represent a model which supports Any-to-Any operations
 // We have to wrap this out as well because we want to load two models one for VAD and one for the actual model.
 // In the future there could be models that accept continous audio input only so this design will be useful for that
 type anyToAnyModel struct {
 	LLMConfig *config.BackendConfig
 	LLMClient grpcClient.Backend
 	VADConfig *config.BackendConfig
 	VADClient grpcClient.Backend
 }
 func (m *wrappedModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) {
 	return m.VADClient.VAD(ctx, in)
 }
 func (m *anyToAnyModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) {
 	return m.VADClient.VAD(ctx, in)
 }
 func (m *wrappedModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) {
 	// TODO: Convert with pipeline (audio to text, text to llm, result to tts, and return it)
 	// sound.BufferAsWAV(audioData, "audio.wav")
 	return m.LLMClient.Predict(ctx, in)
 }
 func (m *wrappedModel) PredictStream(ctx context.Context, in *proto.PredictOptions, f func(reply *proto.Reply), opts ...grpc.CallOption) error {
 	// TODO: Convert with pipeline (audio to text, text to llm, result to tts, and return it)
 	return m.LLMClient.PredictStream(ctx, in, f)
 }
 func (m *anyToAnyModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) {
 	return m.LLMClient.Predict(ctx, in)
 }
 func (m *anyToAnyModel) PredictStream(ctx context.Context, in *proto.PredictOptions, f func(reply *proto.Reply), opts ...grpc.CallOption) error {
 	return m.LLMClient.PredictStream(ctx, in, f)
 }
 // returns and loads either a wrapped model or a model that support audio-to-audio
 func newModel(cfg *config.BackendConfig, cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, modelName string) (Model, error) {
 	// Prepare VAD model
 	cfgVAD, err := cl.LoadBackendConfigFileByName(cfg.Pipeline.VAD, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load backend config: %w", err)
 	}
 	if !cfgVAD.Validate() {
 		return nil, fmt.Errorf("failed to validate config: %w", err)
 	}
 	opts := backend.ModelOptions(*cfgVAD, appConfig)
 	VADClient, err := ml.Load(opts...)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load tts model: %w", err)
 	}
 	// If we don't have Wrapped model definitions, just return a standard model
 	if cfg.Pipeline.IsNotConfigured() {
 		// Otherwise we want to return a wrapped model, which is a "virtual" model that re-uses other models to perform operations
 		cfgAnyToAny, err := cl.LoadBackendConfigFileByName(cfg.Model, ml.ModelPath)
 		if err != nil {
 			return nil, fmt.Errorf("failed to load backend config: %w", err)
 		}
 		if !cfgAnyToAny.Validate() {
 			return nil, fmt.Errorf("failed to validate config: %w", err)
 		}
 		opts := backend.ModelOptions(*cfgAnyToAny, appConfig)
 		anyToAnyClient, err := ml.Load(opts...)
 		if err != nil {
 			return nil, fmt.Errorf("failed to load tts model: %w", err)
 		}
 		return &anyToAnyModel{
 			LLMConfig: cfgAnyToAny,
 			LLMClient: anyToAnyClient,
 			VADConfig: cfgVAD,
 			VADClient: VADClient,
 		}, nil
 	}
 	log.Debug().Msg("Loading a wrapped model")
 	// Otherwise we want to return a wrapped model, which is a "virtual" model that re-uses other models to perform operations
 	cfgLLM, err := cl.LoadBackendConfigFileByName(cfg.Pipeline.LLM, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load backend config: %w", err)
 	}
 	if !cfgLLM.Validate() {
 		return nil, fmt.Errorf("failed to validate config: %w", err)
 	}
 	cfgTTS, err := cl.LoadBackendConfigFileByName(cfg.Pipeline.TTS, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load backend config: %w", err)
 	}
 	if !cfgTTS.Validate() {
 		return nil, fmt.Errorf("failed to validate config: %w", err)
 	}
 	cfgSST, err := cl.LoadBackendConfigFileByName(cfg.Pipeline.Transcription, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load backend config: %w", err)
 	}
 	if !cfgSST.Validate() {
 		return nil, fmt.Errorf("failed to validate config: %w", err)
 	}
 	opts = backend.ModelOptions(*cfgTTS, appConfig)
 	ttsClient, err := ml.Load(opts...)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load tts model: %w", err)
 	}
 	opts = backend.ModelOptions(*cfgSST, appConfig)
 	transcriptionClient, err := ml.Load(opts...)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load SST model: %w", err)
 	}
 	opts = backend.ModelOptions(*cfgLLM, appConfig)
 	llmClient, err := ml.Load(opts...)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load LLM model: %w", err)
 	}
 	return &wrappedModel{
 		TTSConfig:           cfgTTS,
 		TranscriptionConfig: cfgSST,
 		LLMConfig:           cfgLLM,
 		TTSClient:           ttsClient,
 		TranscriptionClient: transcriptionClient,
 		LLMClient:           llmClient,
 		VADConfig: cfgVAD,
 		VADClient: VADClient,
 	}, nil
 }
--- a/core/http/endpoints/openai/request.go
+++ b/core/http/endpoints/openai/request.go
@@ -48,25 +48,6 @@ func readRequest(c *fiber.Ctx, cl *config.BackendConfigLoader, ml *model.ModelLo
 	return modelFile, input, err
 }
 // func readWSRequest(c *websocket.Conn, cl *config.BackendConfigLoader, ml *model.ModelLoader, o *config.ApplicationConfig, firstModel bool) (string, *schema.OpenAIRequest, error) {
 // 	input := new(schema.OpenAIRequest)
 // 	input.Model = c.Query("name")
 // 	received, _ := json.Marshal(input)
 // 	ctx, cancel := context.WithCancel(o.Context)
 // 	input.Context = ctx
 // 	input.Cancel = cancel
 // 	log.Debug().Msgf("Request received: %s", string(received))
 // 	modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, firstModel)
 // 	return modelFile, input, err
 // }
 func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIRequest) {
 	if input.Echo {
 		config.Echo = input.Echo
--- a/core/http/explorer.go
+++ b/core/http/explorer.go
@@ -7,7 +7,6 @@ import (
 	"github.com/gofiber/fiber/v2/middleware/favicon"
 	"github.com/gofiber/fiber/v2/middleware/filesystem"
 	"github.com/mudler/LocalAI/core/explorer"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/http/routes"
 )
@@ -23,7 +22,6 @@ func Explorer(db *explorer.Database) *fiber.App {
 	app := fiber.New(fiberCfg)
 	app.Use(middleware.StripPathPrefix())
 	routes.RegisterExplorerRoutes(app, db)
 	httpFS := http.FS(embedDirStatic)
--- a/core/http/middleware/auth.go
+++ b/core/http/middleware/auth.go
@@ -8,7 +8,6 @@ import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/gofiber/fiber/v2/middleware/keyauth"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/utils"
 )
 // This file contains the configuration generators and handler functions that are used along with the fiber/keyauth middleware
@@ -40,9 +39,7 @@ func getApiKeyErrorHandler(applicationConfig *config.ApplicationConfig) fiber.Er
 			if applicationConfig.OpaqueErrors {
 				return ctx.SendStatus(401)
 			}
-			return ctx.Status(401).Render("views/login", fiber.Map{
+			return ctx.Status(401).Render("views/login", nil)
 				"BaseURL": utils.BaseURL(ctx),
 			})
 		}
 		if applicationConfig.OpaqueErrors {
 			return ctx.SendStatus(500)
--- a/core/http/middleware/strippathprefix.go
+++ b/core/http/middleware/strippathprefix.go
@@ -1,36 +0,0 @@
 package middleware
 import (
 	"strings"
 	"github.com/gofiber/fiber/v2"
 )
 // StripPathPrefix returns a middleware that strips a path prefix from the request path.
 // The path prefix is obtained from the X-Forwarded-Prefix HTTP request header.
 func StripPathPrefix() fiber.Handler {
 	return func(c *fiber.Ctx) error {
 		for _, prefix := range c.GetReqHeaders()["X-Forwarded-Prefix"] {
 			if prefix != "" {
 				path := c.Path()
 				pos := len(prefix)
 				if prefix[pos-1] == '/' {
 					pos--
 				} else {
 					prefix += "/"
 				}
 				if strings.HasPrefix(path, prefix) {
 					c.Path(path[pos:])
 					break
 				} else if prefix[:pos] == path {
 					c.Redirect(prefix)
 					return nil
 				}
 			}
 		}
 		return c.Next()
 	}
 }
--- a/core/http/middleware/strippathprefix_test.go
+++ b/core/http/middleware/strippathprefix_test.go
@@ -1,121 +0,0 @@
 package middleware
 import (
 	"net/http/httptest"
 	"testing"
 	"github.com/gofiber/fiber/v2"
 	"github.com/stretchr/testify/require"
 )
 func TestStripPathPrefix(t *testing.T) {
 	var actualPath string
 	app := fiber.New()
 	app.Use(StripPathPrefix())
 	app.Get("/hello/world", func(c *fiber.Ctx) error {
 		actualPath = c.Path()
 		return nil
 	})
 	app.Get("/", func(c *fiber.Ctx) error {
 		actualPath = c.Path()
 		return nil
 	})
 	for _, tc := range []struct {
 		name         string
 		path         string
 		prefixHeader []string
 		expectStatus int
 		expectPath   string
 	}{
 		{
 			name:         "without prefix and header",
 			path:         "/hello/world",
 			expectStatus: 200,
 			expectPath:   "/hello/world",
 		},
 		{
 			name:         "without prefix and headers on root path",
 			path:         "/",
 			expectStatus: 200,
 			expectPath:   "/",
 		},
 		{
 			name:         "without prefix but header",
 			path:         "/hello/world",
 			prefixHeader: []string{"/otherprefix/"},
 			expectStatus: 200,
 			expectPath:   "/hello/world",
 		},
 		{
 			name:         "with prefix but non-matching header",
 			path:         "/prefix/hello/world",
 			prefixHeader: []string{"/otherprefix/"},
 			expectStatus: 404,
 		},
 		{
 			name:         "with prefix and matching header",
 			path:         "/myprefix/hello/world",
 			prefixHeader: []string{"/myprefix/"},
 			expectStatus: 200,
 			expectPath:   "/hello/world",
 		},
 		{
 			name:         "with prefix and 1st header matching",
 			path:         "/myprefix/hello/world",
 			prefixHeader: []string{"/myprefix/", "/otherprefix/"},
 			expectStatus: 200,
 			expectPath:   "/hello/world",
 		},
 		{
 			name:         "with prefix and 2nd header matching",
 			path:         "/myprefix/hello/world",
 			prefixHeader: []string{"/otherprefix/", "/myprefix/"},
 			expectStatus: 200,
 			expectPath:   "/hello/world",
 		},
 		{
 			name:         "with prefix and header not ending with slash",
 			path:         "/myprefix/hello/world",
 			prefixHeader: []string{"/myprefix"},
 			expectStatus: 200,
 			expectPath:   "/hello/world",
 		},
 		{
 			name:         "with prefix and non-matching header not ending with slash",
 			path:         "/myprefix-suffix/hello/world",
 			prefixHeader: []string{"/myprefix"},
 			expectStatus: 404,
 		},
 		{
 			name:         "redirect when prefix does not end with a slash",
 			path:         "/myprefix",
 			prefixHeader: []string{"/myprefix"},
 			expectStatus: 302,
 			expectPath:   "/myprefix/",
 		},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
 			actualPath = ""
 			req := httptest.NewRequest("GET", tc.path, nil)
 			if tc.prefixHeader != nil {
 				req.Header["X-Forwarded-Prefix"] = tc.prefixHeader
 			}
 			resp, err := app.Test(req, -1)
 			require.NoError(t, err)
 			require.Equal(t, tc.expectStatus, resp.StatusCode, "response status code")
 			if tc.expectStatus == 200 {
 				require.Equal(t, tc.expectPath, actualPath, "rewritten path")
 			} else if tc.expectStatus == 302 {
 				require.Equal(t, tc.expectPath, resp.Header.Get("Location"), "redirect location")
 			}
 		})
 	}
 }
--- a/core/http/render.go
+++ b/core/http/render.go
@@ -10,7 +10,6 @@ import (
 	"github.com/gofiber/fiber/v2"
 	fiberhtml "github.com/gofiber/template/html/v2"
 	"github.com/microcosm-cc/bluemonday"
 	"github.com/mudler/LocalAI/core/http/utils"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/russross/blackfriday"
 )
@@ -27,9 +26,7 @@ func notFoundHandler(c *fiber.Ctx) error {
 		})
 	} else {
 		// The client expects an HTML response
-		return c.Status(fiber.StatusNotFound).Render("views/404", fiber.Map{
+		return c.Status(fiber.StatusNotFound).Render("views/404", fiber.Map{})
 			"BaseURL": utils.BaseURL(c),
 		})
 	}
 }
--- a/core/http/routes/openai.go
+++ b/core/http/routes/openai.go
@@ -11,9 +11,6 @@ func RegisterOpenAIRoutes(app *fiber.App,
 	application *application.Application) {
 	// openAI compatible API endpoint
 	// realtime
 	app.Get("/v1/realtime", openai.Realtime(application))
 	// chat
 	app.Post("/v1/chat/completions",
 		openai.ChatEndpoint(
--- a/core/http/routes/ui.go
+++ b/core/http/routes/ui.go
@@ -6,21 +6,20 @@ import (
 	"sort"
 	"strings"
 	"github.com/microcosm-cc/bluemonday"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/http/elements"
 	"github.com/mudler/LocalAI/core/http/endpoints/localai"
 	"github.com/mudler/LocalAI/core/http/utils"
 	"github.com/mudler/LocalAI/core/p2p"
 	"github.com/mudler/LocalAI/core/services"
 	"github.com/mudler/LocalAI/internal"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/xsync"
 	"github.com/rs/zerolog/log"
 	"github.com/gofiber/fiber/v2"
 	"github.com/google/uuid"
 	"github.com/microcosm-cc/bluemonday"
 	"github.com/rs/zerolog/log"
 )
 type modelOpCache struct {
@@ -92,7 +91,6 @@ func RegisterUIRoutes(app *fiber.App,
 		app.Get("/p2p", func(c *fiber.Ctx) error {
 			summary := fiber.Map{
 				"Title":   "LocalAI - P2P dashboard",
 				"BaseURL": utils.BaseURL(c),
 				"Version": internal.PrintableVersion(),
 				//"Nodes":          p2p.GetAvailableNodes(""),
 				//"FederatedNodes": p2p.GetAvailableNodes(p2p.FederatedID),
@@ -151,7 +149,6 @@ func RegisterUIRoutes(app *fiber.App,
 			summary := fiber.Map{
 				"Title":            "LocalAI - Models",
 				"BaseURL":          utils.BaseURL(c),
 				"Version":          internal.PrintableVersion(),
 				"Models":           template.HTML(elements.ListModels(models, processingModels, galleryService)),
 				"Repositories":     appConfig.Galleries,
@@ -311,7 +308,6 @@ func RegisterUIRoutes(app *fiber.App,
 		summary := fiber.Map{
 			"Title":        "LocalAI - Chat with " + c.Params("model"),
 			"BaseURL":      utils.BaseURL(c),
 			"ModelsConfig": backendConfigs,
 			"Model":        c.Params("model"),
 			"Version":      internal.PrintableVersion(),
@@ -327,12 +323,11 @@ func RegisterUIRoutes(app *fiber.App,
 		if len(backendConfigs) == 0 {
 			// If no model is available redirect to the index which suggests how to install models
-			return c.Redirect(utils.BaseURL(c))
+			return c.Redirect("/")
 		}
 		summary := fiber.Map{
 			"Title":        "LocalAI - Talk",
 			"BaseURL":      utils.BaseURL(c),
 			"ModelsConfig": backendConfigs,
 			"Model":        backendConfigs[0],
 			"IsP2PEnabled": p2p.IsP2PEnabled(),
@@ -349,12 +344,11 @@ func RegisterUIRoutes(app *fiber.App,
 		if len(backendConfigs) == 0 {
 			// If no model is available redirect to the index which suggests how to install models
-			return c.Redirect(utils.BaseURL(c))
+			return c.Redirect("/")
 		}
 		summary := fiber.Map{
 			"Title":        "LocalAI - Chat with " + backendConfigs[0],
 			"BaseURL":      utils.BaseURL(c),
 			"ModelsConfig": backendConfigs,
 			"Model":        backendConfigs[0],
 			"Version":      internal.PrintableVersion(),
@@ -370,7 +364,6 @@ func RegisterUIRoutes(app *fiber.App,
 		summary := fiber.Map{
 			"Title":        "LocalAI - Generate images with " + c.Params("model"),
 			"BaseURL":      utils.BaseURL(c),
 			"ModelsConfig": backendConfigs,
 			"Model":        c.Params("model"),
 			"Version":      internal.PrintableVersion(),
@@ -387,12 +380,11 @@ func RegisterUIRoutes(app *fiber.App,
 		if len(backendConfigs) == 0 {
 			// If no model is available redirect to the index which suggests how to install models
-			return c.Redirect(utils.BaseURL(c))
+			return c.Redirect("/")
 		}
 		summary := fiber.Map{
 			"Title":        "LocalAI - Generate images with " + backendConfigs[0].Name,
 			"BaseURL":      utils.BaseURL(c),
 			"ModelsConfig": backendConfigs,
 			"Model":        backendConfigs[0].Name,
 			"Version":      internal.PrintableVersion(),
@@ -408,7 +400,6 @@ func RegisterUIRoutes(app *fiber.App,
 		summary := fiber.Map{
 			"Title":        "LocalAI - Generate images with " + c.Params("model"),
 			"BaseURL":      utils.BaseURL(c),
 			"ModelsConfig": backendConfigs,
 			"Model":        c.Params("model"),
 			"Version":      internal.PrintableVersion(),
@@ -425,12 +416,11 @@ func RegisterUIRoutes(app *fiber.App,
 		if len(backendConfigs) == 0 {
 			// If no model is available redirect to the index which suggests how to install models
-			return c.Redirect(utils.BaseURL(c))
+			return c.Redirect("/")
 		}
 		summary := fiber.Map{
 			"Title":        "LocalAI - Generate audio with " + backendConfigs[0].Name,
 			"BaseURL":      utils.BaseURL(c),
 			"ModelsConfig": backendConfigs,
 			"Model":        backendConfigs[0].Name,
 			"IsP2PEnabled": p2p.IsP2PEnabled(),
--- a/core/http/static/assets/font1.css
+++ b/core/http/static/assets/font1.css
@@ -7,33 +7,33 @@ https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&family=Roboto:wg
  font-style: normal;
  font-weight: 400;
  font-display: swap;
-  src: url(./UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuLyfMZg.ttf) format('truetype');
+  src: url(/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuLyfMZg.ttf) format('truetype');
 }
@font-face {
  font-family: 'Inter';
  font-style: normal;
  font-weight: 600;
  font-display: swap;
-  src: url(./UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuGKYMZg.ttf) format('truetype');
+  src: url(/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuGKYMZg.ttf) format('truetype');
 }
@font-face {
  font-family: 'Inter';
  font-style: normal;
  font-weight: 700;
  font-display: swap;
-  src: url(./UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuFuYMZg.ttf) format('truetype');
+  src: url(/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuFuYMZg.ttf) format('truetype');
 }
@font-face {
  font-family: 'Roboto';
  font-style: normal;
  font-weight: 400;
  font-display: swap;
-  src: url(./KFOmCnqEu92Fr1Me5Q.ttf) format('truetype');
+  src: url(/static/assets/KFOmCnqEu92Fr1Me5Q.ttf) format('truetype');
 }
@font-face {
  font-family: 'Roboto';
  font-style: normal;
  font-weight: 500;
  font-display: swap;
-  src: url(./KFOlCnqEu92Fr1MmEU9vAw.ttf) format('truetype');
+  src: url(/static/assets/KFOlCnqEu92Fr1MmEU9vAw.ttf) format('truetype');
 }
--- a/core/http/static/assets/font2.css
+++ b/core/http/static/assets/font2.css
@@ -7,33 +7,33 @@ https://fonts.googleapis.com/css?family=Roboto:300,400,500,700,900&display=swap
  font-style: normal;
  font-weight: 300;
  font-display: swap;
-  src: url(./KFOlCnqEu92Fr1MmSU5fBBc9.ttf) format('truetype');
+  src: url(/static/assets//KFOlCnqEu92Fr1MmSU5fBBc9.ttf) format('truetype');
 }
@font-face {
  font-family: 'Roboto';
  font-style: normal;
  font-weight: 400;
  font-display: swap;
-  src: url(./KFOmCnqEu92Fr1Mu4mxP.ttf) format('truetype');
+  src: url(/static/assets//KFOmCnqEu92Fr1Mu4mxP.ttf) format('truetype');
 }
@font-face {
  font-family: 'Roboto';
  font-style: normal;
  font-weight: 500;
  font-display: swap;
-  src: url(./KFOlCnqEu92Fr1MmEU9fBBc9.ttf) format('truetype');
+  src: url(/static/assets//KFOlCnqEu92Fr1MmEU9fBBc9.ttf) format('truetype');
 }
@font-face {
  font-family: 'Roboto';
  font-style: normal;
  font-weight: 700;
  font-display: swap;
-  src: url(./KFOlCnqEu92Fr1MmWUlfBBc9.ttf) format('truetype');
+  src: url(/static/assets//KFOlCnqEu92Fr1MmWUlfBBc9.ttf) format('truetype');
 }
@font-face {
  font-family: 'Roboto';
  font-style: normal;
  font-weight: 900;
  font-display: swap;
-  src: url(./KFOlCnqEu92Fr1MmYUtfBBc9.ttf) format('truetype');
+  src: url(/static/assets//KFOlCnqEu92Fr1MmYUtfBBc9.ttf) format('truetype');
 }
--- a/core/http/static/chat.js
+++ b/core/http/static/chat.js
@@ -143,7 +143,7 @@ function readInputImage() {
    // }
    // Source: https://stackoverflow.com/a/75751803/11386095
-    const response = await fetch("v1/chat/completions", {
+    const response = await fetch("/v1/chat/completions", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${key}`,
--- a/core/http/static/image.js
+++ b/core/http/static/image.js
@@ -48,7 +48,7 @@ async function promptDallE(key, input) {
  document.getElementById("input").disabled = true;
  const model = document.getElementById("image-model").value;
-  const response = await fetch("v1/images/generations", {
+  const response = await fetch("/v1/images/generations", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${key}`,
--- a/core/http/static/talk.js
+++ b/core/http/static/talk.js
@@ -122,7 +122,7 @@ async function sendAudioToWhisper(audioBlob) {
    formData.append('model', getWhisperModel());
    API_KEY = localStorage.getItem("key");
-    const response = await fetch('v1/audio/transcriptions', {
+    const response = await fetch('/v1/audio/transcriptions', {
        method: 'POST',
        headers: {
            'Authorization': `Bearer ${API_KEY}`
@@ -139,7 +139,7 @@ async function sendTextToChatGPT(text) {
    conversationHistory.push({ role: "user", content: text });
    API_KEY = localStorage.getItem("key");
-    const response = await fetch('v1/chat/completions', {
+    const response = await fetch('/v1/chat/completions', {
        method: 'POST',
        headers: {
            'Authorization': `Bearer ${API_KEY}`,
@@ -163,7 +163,7 @@ async function sendTextToChatGPT(text) {
 async function getTextToSpeechAudio(text) {
    API_KEY = localStorage.getItem("key");
-    const response = await fetch('v1/audio/speech', {
+    const response = await fetch('/v1/audio/speech', {
        method: 'POST',
        headers: {
--- a/core/http/static/tts.js
+++ b/core/http/static/tts.js
@@ -19,7 +19,7 @@ async function tts(key, input) {
  document.getElementById("input").disabled = true;
  const model = document.getElementById("tts-model").value;
-  const response = await fetch("tts", {
+  const response = await fetch("/tts", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${key}`,
--- a/core/http/utils/baseurl.go
+++ b/core/http/utils/baseurl.go
@@ -1,24 +0,0 @@
 package utils
 import (
 	"strings"
 	"github.com/gofiber/fiber/v2"
 )
 // BaseURL returns the base URL for the given HTTP request context.
 // It takes into account that the app may be exposed by a reverse-proxy under a different protocol, host and path.
 // The returned URL is guaranteed to end with `/`.
 // The method should be used in conjunction with the StripPathPrefix middleware.
 func BaseURL(c *fiber.Ctx) string {
 	path := c.Path()
 	origPath := c.OriginalURL()
 	if path != origPath && strings.HasSuffix(origPath, path) {
 		pathPrefix := origPath[:len(origPath)-len(path)+1]
 		return c.BaseURL() + pathPrefix
 	}
 	return c.BaseURL() + "/"
 }
--- a/core/http/utils/baseurl_test.go
+++ b/core/http/utils/baseurl_test.go
@@ -1,48 +0,0 @@
 package utils
 import (
 	"net/http/httptest"
 	"testing"
 	"github.com/gofiber/fiber/v2"
 	"github.com/stretchr/testify/require"
 )
 func TestBaseURL(t *testing.T) {
 	for _, tc := range []struct {
 		name      string
 		prefix    string
 		expectURL string
 	}{
 		{
 			name:      "without prefix",
 			prefix:    "/",
 			expectURL: "http://example.com/",
 		},
 		{
 			name:      "with prefix",
 			prefix:    "/myprefix/",
 			expectURL: "http://example.com/myprefix/",
 		},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
 			app := fiber.New()
 			actualURL := ""
 			app.Get(tc.prefix+"hello/world", func(c *fiber.Ctx) error {
 				if tc.prefix != "/" {
 					c.Path("/hello/world")
 				}
 				actualURL = BaseURL(c)
 				return nil
 			})
 			req := httptest.NewRequest("GET", tc.prefix+"hello/world", nil)
 			resp, err := app.Test(req, -1)
 			require.NoError(t, err)
 			require.Equal(t, 200, resp.StatusCode, "response status code")
 			require.Equal(t, tc.expectURL, actualURL, "base URL")
 		})
 	}
 }
--- a/core/http/views/404.html
+++ b/core/http/views/404.html
@@ -12,7 +12,7 @@
        <div class="header text-center py-12">
            <h1 class="text-5xl font-bold">Welcome to your LocalAI instance!</h1>
            <div class="mt-6">
-         <!--       <a href="./" aria-label="HomePage" alt="HomePage">
+         <!--       <a href="/" aria-label="HomePage" alt="HomePage">           
                    <img class="mx-auto w-1/4 h-auto" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo">            
                </a>
            -->
--- a/core/http/views/chat.html
+++ b/core/http/views/chat.html
@@ -28,7 +28,7 @@ SOFTWARE.
 <!doctype html>
 <html lang="en">
  {{template "views/partials/head" .}}
-  <script defer src="static/chat.js"></script>
+  <script defer src="/static/chat.js"></script>
  <style>
    body {
        overflow: hidden; 
@@ -101,9 +101,9 @@ SOFTWARE.
        {{ $model:=.Model}}
        {{ range .ModelsConfig }}
        {{ if eq . $model }}
-        <option value="chat/{{.}}" selected  class="bg-gray-700 text-white">{{.}}</option>
+        <option value="/chat/{{.}}" selected  class="bg-gray-700 text-white">{{.}}</option>
        {{ else }}
-        <option value="chat/{{.}}" class="bg-gray-700 text-white">{{.}}</option>
+        <option value="/chat/{{.}}" class="bg-gray-700 text-white">{{.}}</option>
        {{ end }}
        {{ end }}
      </select>
@@ -142,7 +142,7 @@ SOFTWARE.
      <div id="loader" class="my-2 loader" style="display: none;"></div>
      <input id="chat-model" type="hidden" value="{{.Model}}">
      <input id="input_image" type="file" style="display: none;" @change="fileName = $event.target.files[0].name">
-      <form id="prompt" action="chat/{{.Model}}" method="get" @submit.prevent="submitPrompt">
+      <form id="prompt" action="/chat/{{.Model}}" method="get" @submit.prevent="submitPrompt">
          <div class="relative w-full">
              <textarea
                  id="input"
--- a/core/http/views/explorer.html
+++ b/core/http/views/explorer.html
@@ -370,7 +370,7 @@
                }
            }
        </script>
-        <script src="static/p2panimation.js"></script>
+        <script src="/static/p2panimation.js"></script>
        {{template "views/partials/footer" .}}
    </div>
--- a/core/http/views/index.html
+++ b/core/http/views/index.html
@@ -20,7 +20,7 @@
            {{template "views/partials/inprogress" .}}
            {{ if eq (len .ModelsConfig) 0 }}
            <h2 class="text-center text-3xl font-semibold text-gray-100"> <i class="text-yellow-200 ml-2 fa-solid fa-triangle-exclamation animate-pulse"></i> Ouch! seems you don't have any models installed from the LocalAI gallery!</h2>
-            <p class="text-center mt-4 text-xl">..install something from the <a class="text-gray-400 hover:text-white ml-1 px-3 py-2 rounded" href="browse">🖼️ Gallery</a> or check the <a href="https://localai.io/basics/getting_started/" class="text-gray-400 hover:text-white ml-1 px-3 py-2 rounded"> <i class="fa-solid fa-book"></i> Getting started documentation </a></p>
+            <p class="text-center mt-4 text-xl">..install something from the <a class="text-gray-400 hover:text-white ml-1 px-3 py-2 rounded" href="/browse">🖼️ Gallery</a> or check the <a href="https://localai.io/basics/getting_started/" class="text-gray-400 hover:text-white ml-1 px-3 py-2 rounded"> <i class="fa-solid fa-book"></i> Getting started documentation </a></p>
            {{ if ne (len .Models) 0 }}
            <hr class="my-4">
@@ -66,7 +66,7 @@
                        {{ end }}
                    </td>
                    <td class="px-4 py-3 font-bold">
-                        <p class="font-bold text-white flex items-center"><i class="fas fa-brain pr-2"></i><a href="browse?term={{.Name}}">{{.Name}}</a></p>
+                        <p class="font-bold text-white flex items-center"><i class="fas fa-brain pr-2"></i><a href="/browse?term={{.Name}}">{{.Name}}</a></p>
                    </td>
                    <td class="px-4 py-3 font-bold">
                        {{ if .Backend }}
@@ -84,7 +84,7 @@
                    <td class="px-4 py-3">
                        <button
                            class="float-right inline-block rounded bg-red-800 px-6 pb-2.5 mb-3 pt-2.5 text-xs font-medium uppercase leading-normal text-white shadow-primary-3 transition duration-150 ease-in-out hover:bg-red-accent-300 hover:shadow-red-2 focus:bg-red-accent-300 focus:shadow-primary-2 focus:outline-none focus:ring-0 active:bg-red-600 active:shadow-primary-2 dark:shadow-black/30 dark:hover:shadow-dark-strong dark:focus:shadow-dark-strong dark:active:shadow-dark-strong"
-                            data-twe-ripple-color="light" data-twe-ripple-init="" hx-confirm="Are you sure you wish to delete the model?" hx-post="browse/delete/model/{{.Name}}" hx-swap="outerHTML"><i class="fa-solid fa-cancel pr-2"></i>Delete</button>
+                            data-twe-ripple-color="light" data-twe-ripple-init="" hx-confirm="Are you sure you wish to delete the model?" hx-post="/browse/delete/model/{{.Name}}" hx-swap="outerHTML"><i class="fa-solid fa-cancel pr-2"></i>Delete</button>
                    </td>
                {{ end }}
                {{ range .Models }}
--- a/core/http/views/login.html
+++ b/core/http/views/login.html
@@ -4,8 +4,6 @@
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Open Authenticated Website</title>
    <base href="{{.BaseURL}}" />
    <link rel="icon" type="image/x-icon" href="favicon.ico" />
 </head>
 <body>
    <h1>Authorization is required</h1>
--- a/core/http/views/models.html
+++ b/core/http/views/models.html
@@ -16,38 +16,38 @@
            <div class="text-center font-semibold text-gray-100">
                <h2>Filter by type:</h2>
-                <button  hx-post="browse/search/models"
+                <button  hx-post="/browse/search/models"
                    class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
                    hx-target="#search-results" 
                    hx-vals='{"search": "tts"}'
                hx-indicator=".htmx-indicator" >TTS</button> 
-                <button  hx-post="browse/search/models" 
+                <button  hx-post="/browse/search/models" 
                    class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
                    hx-target="#search-results" 
                    hx-vals='{"search": "stablediffusion"}'
                hx-indicator=".htmx-indicator" >Image generation</button> 
-                <button  hx-post="browse/search/models" \
+                <button  hx-post="/browse/search/models" \
                    class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
                    hx-target="#search-results" 
                    hx-vals='{"search": "llm"}'
                hx-indicator=".htmx-indicator" >Text generation</button> 
-                <button  hx-post="browse/search/models" 
+                <button  hx-post="/browse/search/models" 
                    class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
                    hx-target="#search-results" 
                    hx-vals='{"search": "multimodal"}'
                hx-indicator=".htmx-indicator" >Multimodal</button> 
-                <button  hx-post="browse/search/models" 
+                <button  hx-post="/browse/search/models" 
                    class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
                    hx-target="#search-results" 
                    hx-vals='{"search": "embedding"}'
                hx-indicator=".htmx-indicator" >Embeddings</button>
-                <button  hx-post="browse/search/models"
+                <button  hx-post="/browse/search/models"
                    class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
                    hx-target="#search-results" 
                    hx-vals='{"search": "rerank"}'
                hx-indicator=".htmx-indicator" >Rerankers</button> 
                <button  
-                    hx-post="browse/search/models"
+                    hx-post="/browse/search/models"
                    class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
                    hx-target="#search-results" 
                    hx-vals='{"search": "whisper"}'
@@ -57,7 +57,7 @@
            <div class="text-center text-xs font-semibold text-gray-100">
                Filter by tags:
                {{ range .AllTags }}
-                    <button  hx-post="browse/search/models" class="text-blue-500" hx-target="#search-results" 
+                    <button  hx-post="/browse/search/models" class="text-blue-500" hx-target="#search-results" 
                    hx-vals='{"search": "{{.}}"}'
                    hx-indicator=".htmx-indicator" >{{.}}</button> 
                {{ end }}
@@ -69,7 +69,7 @@
            <input class="form-control appearance-none block w-full mt-5 px-3 py-2 text-base font-normal text-gray-300 pb-2 mb-5 bg-gray-800 bg-clip-padding border border-solid border-gray-600 rounded transition ease-in-out m-0 focus:text-gray-300 focus:bg-gray-900 focus:border-blue-500 focus:outline-none" type="search" 
                name="search" placeholder="Begin Typing To Search models..." 
-                hx-post="browse/search/models" 
+                hx-post="/browse/search/models" 
                hx-trigger="input changed delay:500ms, search" 
                hx-target="#search-results" 
                hx-indicator=".htmx-indicator">
--- a/core/http/views/p2p.html
+++ b/core/http/views/p2p.html
@@ -48,11 +48,11 @@
            <!-- Federation Box -->
            <div class="bg-gray-800 p-6 rounded-lg shadow-lg mb-12 text-left">
-                <p class="text-xl font-semibold text-gray-200"> <i class="text-gray-200 fa-solid fa-circle-nodes"></i> Federated Nodes: <span hx-get="p2p/ui/workers-federation-stats" hx-trigger="every 1s"></span> </p>
+                <p class="text-xl font-semibold text-gray-200"> <i class="text-gray-200 fa-solid fa-circle-nodes"></i> Federated Nodes: <span hx-get="/p2p/ui/workers-federation-stats" hx-trigger="every 1s"></span> </p>
                <p class="mb-4">You can start LocalAI in federated mode to share your instance, or start the federated server to balance requests between nodes of the federation.</p>
                <div class="grid grid-cols-1 sm:grid-cols-2 md:grid-cols-3 gap-4 mb-12">
-                    <div hx-get="p2p/ui/workers-federation" hx-trigger="every 1s"></div>
+                    <div hx-get="/p2p/ui/workers-federation" hx-trigger="every 1s"></div>
                </div>
                <hr class="border-gray-700 mb-12">
@@ -123,11 +123,11 @@
            <div class="bg-gray-800 p-6 rounded-lg shadow-lg mb-12 text-left">
-                <p class="text-xl font-semibold text-gray-200"> <i class="text-gray-200 fa-solid fa-circle-nodes"></i> Workers (llama.cpp): <span hx-get="p2p/ui/workers-stats" hx-trigger="every 1s"></span> </p>
+                <p class="text-xl font-semibold text-gray-200"> <i class="text-gray-200 fa-solid fa-circle-nodes"></i> Workers (llama.cpp): <span hx-get="/p2p/ui/workers-stats" hx-trigger="every 1s"></span> </p>
                <p class="mb-4">You can start llama.cpp workers to distribute weights between the workers and offload part of the computation. To start a new worker, you can use the CLI or Docker.</p>
                <div class="grid grid-cols-1 sm:grid-cols-2 md:grid-cols-3 gap-4 mb-12">
-                    <div hx-get="p2p/ui/workers" hx-trigger="every 1s"></div>
+                    <div hx-get="/p2p/ui/workers" hx-trigger="every 1s"></div>
                </div>
                <hr class="border-gray-700 mb-12">
@@ -177,7 +177,7 @@
    {{template "views/partials/footer" .}}
 </div>
-<script src="static/p2panimation.js"></script>
+<script src="/static/p2panimation.js"></script>
 <style>
    .token {
        word-break: break-all;
--- a/core/http/views/partials/footer.html
+++ b/core/http/views/partials/footer.html
@@ -2,4 +2,4 @@
    LocalAI Version {{.Version}}<br>
    <a href='https://github.com/mudler/LocalAI' class="text-blue-400 hover:text-blue-600" target="_blank">LocalAI</a> © 2023-2024 <a href='https://mudler.pm' class="text-blue-400 hover:text-blue-600" target="_blank">Ettore Di Giacinto</a>
 </footer>
-<script src="static/assets/tw-elements.js"></script>
+<script src="/static/assets/tw-elements.js"></script>
--- a/core/http/views/partials/head.html
+++ b/core/http/views/partials/head.html
@@ -2,35 +2,33 @@
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{{.Title}}</title>
    <base href="{{.BaseURL}}" />
    <link rel="icon" type="image/x-icon" href="favicon.ico" />
    <link
    rel="stylesheet"
-    href="static/assets/highlightjs.css"
+    href="/static/assets/highlightjs.css"
  />
-  <script defer src="static/assets/highlightjs.js"></script>
+  <script defer src="/static/assets/highlightjs.js"></script>
    <script
    defer
-    src="static/assets/alpine.js"
+    src="/static/assets/alpine.js"
  ></script>
  <script
    defer
-    src="static/assets/marked.js"
+    src="/static/assets/marked.js"
  ></script>
  <script
    defer
-    src="static/assets/purify.js"
+    src="/static/assets/purify.js"
  ></script>
-  <link href="static/general.css" rel="stylesheet" />
+  <link href="/static/general.css" rel="stylesheet" />
-    <link href="static/assets/font1.css" rel="stylesheet">
+    <link href="/static/assets/font1.css" rel="stylesheet">
    <link
-    href="static/assets/font2.css"
+    href="/static/assets/font2.css"
    rel="stylesheet" />
  <link
    rel="stylesheet"
-    href="static/assets/tw-elements.css" />
+    href="/static/assets/tw-elements.css" />
-  <script src="static/assets/tailwindcss.js"></script>
+  <script src="/static/assets/tailwindcss.js"></script>
  <script>
    tailwind.config = {
      darkMode: "class",
@@ -56,11 +54,11 @@
      });
    }
  </script>
-  <link href="static/assets/fontawesome/css/fontawesome.css" rel="stylesheet" />
+  <link href="/static/assets/fontawesome/css/fontawesome.css" rel="stylesheet" />
-  <link href="static/assets/fontawesome/css/brands.css" rel="stylesheet" />
+  <link href="/static/assets/fontawesome/css/brands.css" rel="stylesheet" />
-  <link href="static/assets/fontawesome/css/solid.css" rel="stylesheet" />
+  <link href="/static/assets/fontawesome/css/solid.css" rel="stylesheet" />
-  <script src="static/assets/flowbite.min.js"></script>
+  <script src="/static/assets/flowbite.min.js"></script>
-  <script src="static/assets/htmx.js" crossorigin="anonymous"></script>
+  <script src="/static/assets/htmx.js" crossorigin="anonymous"></script>
  <!-- P2P Animation START -->
  <style>
    .animation-container {
--- a/core/http/views/partials/inprogress.html
+++ b/core/http/views/partials/inprogress.html
@@ -17,13 +17,13 @@
      <div class="flex items-center justify-between bg-slate-600 p-2 mb-2 rounded-md">
         <div class="flex items center">
-             <span class="text-gray-300"><a href="browse?term={{$parts._1}}"
+             <span class="text-gray-300"><a href="/browse?term={{$parts._1}}"
                 class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
                 >{{$modelName}}</a> {{if $repository}} (from the '{{$repository}}' repository) {{end}}</span>
         </div>
-         <div hx-get="browse/job/{{$value}}" hx-swap="outerHTML" hx-target="this" hx-trigger="done">
+         <div hx-get="/browse/job/{{$value}}" hx-swap="outerHTML" hx-target="this" hx-trigger="done">
             <h3 role="status" id="pblabel" >{{$op}}
-                 <div hx-get="browse/job/progress/{{$value}}" hx-trigger="every 600ms" 
+                 <div hx-get="/browse/job/progress/{{$value}}" hx-trigger="every 600ms" 
                 hx-target="this"
                 hx-swap="innerHTML"  ></div></h3>
         </div>     
--- a/core/http/views/partials/navbar.html
+++ b/core/http/views/partials/navbar.html
@@ -3,8 +3,8 @@
        <div class="flex items-center justify-between">
            <div class="flex items-center">
                <!-- Logo Image: Replace 'logo_url_here' with your actual logo URL -->
-                <a href="./" class="text-white text-xl font-bold"><img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo" class="h-10 mr-3 border-2 border-gray-300 shadow rounded"></a>
+                <a href="/" class="text-white text-xl font-bold"><img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo" class="h-10 mr-3 border-2 border-gray-300 shadow rounded"></a>
-                <a href="./" class="text-white text-xl font-bold">LocalAI</a>
+                <a href="/" class="text-white text-xl font-bold">LocalAI</a>
            </div>
            <!-- Menu button for small screens -->
            <div class="lg:hidden">
@@ -14,33 +14,33 @@
            </div>
            <!-- Navigation links -->
            <div class="hidden lg:flex lg:items-center lg:justify-end lg:flex-1 lg:w-0">
-                <a href="./" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-home pr-2"></i>Home</a>
+                <a href="/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-home pr-2"></i>Home</a>
                <a href="https://localai.io" class="text-gray-400 hover:text-white px-3 py-2 rounded" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
-                <a href="browse/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-brain pr-2"></i> Models</a>
+                <a href="/browse/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-brain pr-2"></i> Models</a>
-                <a href="chat/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-comments pr-2"></i> Chat</a>
+                <a href="/chat/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-comments pr-2"></i> Chat</a>
-                <a href="text2image/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-image pr-2"></i> Generate images</a>
+                <a href="/text2image/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-image pr-2"></i> Generate images</a>
-                <a href="tts/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-music pr-2"></i> TTS </a>
+                <a href="/tts/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-music pr-2"></i> TTS </a>
-                <a href="talk/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-phone pr-2"></i> Talk </a>
+                <a href="/talk/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-phone pr-2"></i> Talk </a>
                {{ if .IsP2PEnabled }}
-                <a href="p2p/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-circle-nodes"></i> Swarm </a>
+                <a href="/p2p/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-circle-nodes"></i> Swarm </a>
                {{ end }}
-                <a href="swagger/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-code pr-2"></i> API</a>
+                <a href="/swagger/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-code pr-2"></i> API</a>
            </div>
        </div>
        <!-- Collapsible menu for small screens -->
        <div class="hidden lg:hidden" id="mobile-menu">
            <div class="pt-4 pb-3 border-t border-gray-700">
-                <a href="./" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-home pr-2"></i>Home</a>
+                <a href="/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-home pr-2"></i>Home</a>
                <a href="https://localai.io" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
-                <a href="browse/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-brain pr-2"></i> Models</a>
+                <a href="/browse/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-brain pr-2"></i> Models</a>
-                <a href="chat/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-comments pr-2"></i> Chat</a>
+                <a href="/chat/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-comments pr-2"></i> Chat</a>
-                <a href="text2image/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-image pr-2"></i> Generate images</a>
+                <a href="/text2image/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-image pr-2"></i> Generate images</a>
-                <a href="tts/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-music pr-2"></i> TTS </a>
+                <a href="/tts/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-music pr-2"></i> TTS </a>
-                <a href="talk/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-phone pr-2"></i> Talk </a>
+                <a href="/talk/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-phone pr-2"></i> Talk </a>
                {{ if .IsP2PEnabled }}
-                <a href="p2p/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-circle-nodes"></i> Swarm </a>
+                <a href="/p2p/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-circle-nodes"></i> Swarm </a>
                {{ end }}
-                <a href="swagger/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-code pr-2"></i> API</a>
+                <a href="/swagger/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-code pr-2"></i> API</a>
            </div>
        </div>
    </div>
--- a/core/http/views/partials/navbar_explorer.html
+++ b/core/http/views/partials/navbar_explorer.html
@@ -3,8 +3,8 @@
        <div class="flex items-center justify-between">
            <div class="flex items-center">
                <!-- Logo Image: Replace 'logo_url_here' with your actual logo URL -->
-                <a href="./" class="text-white text-xl font-bold"><img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo" class="h-10 mr-3 border-2 border-gray-300 shadow rounded"></a>
+                <a href="/" class="text-white text-xl font-bold"><img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo" class="h-10 mr-3 border-2 border-gray-300 shadow rounded"></a>
-                <a href="./" class="text-white text-xl font-bold">LocalAI</a>
+                <a href="/" class="text-white text-xl font-bold">LocalAI</a>
            </div>
            <!-- Menu button for small screens -->
            <div class="lg:hidden">
@@ -14,7 +14,7 @@
            </div>
            <!-- Navigation links -->
            <div class="hidden lg:flex lg:items-center lg:justify-end lg:flex-1 lg:w-0">
-                <a href="./" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-home pr-2"></i>Home</a>
+                <a href="/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-home pr-2"></i>Home</a>
                <a href="https://localai.io" class="text-gray-400 hover:text-white px-3 py-2 rounded" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
                <a href="https://models.localai.io/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-brain pr-2"></i> Models</a>
            </div>
@@ -22,7 +22,7 @@
        <!-- Collapsible menu for small screens -->
        <div class="hidden lg:hidden" id="mobile-menu">
            <div class="pt-4 pb-3 border-t border-gray-700">
-                <a href="./" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-home pr-2"></i>Home</a>
+                <a href="/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-home pr-2"></i>Home</a>
                <a href="https://localai.io" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
                <a href="https://models.localai.io/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-brain pr-2"></i> Models</a>
            </div>
--- a/core/http/views/talk.html
+++ b/core/http/views/talk.html
@@ -1,7 +1,7 @@
 <!doctype html>
 <html lang="en">
  {{template "views/partials/head" .}}
-  <script defer src="static/talk.js"></script>
+  <script defer src="/static/talk.js"></script>
  <style>
    body {
        overflow: hidden; 
--- a/core/http/views/text2image.html
+++ b/core/http/views/text2image.html
@@ -1,7 +1,7 @@
 <!DOCTYPE html>
 <html lang="en">
 {{template "views/partials/head" .}}
-<script defer src="static/image.js"></script>
+<script defer src="/static/image.js"></script>
 <body class="bg-gray-900 text-gray-200">
 <div class="flex flex-col min-h-screen">
@@ -50,9 +50,9 @@
                {{ $model:=.Model}}
                {{ range .ModelsConfig }}
                {{ if eq .Name $model }}
-                <option value="text2image/{{.Name}}" selected class="bg-gray-700 text-white">{{.Name}}</option>
+                <option value="/text2image/{{.Name}}" selected class="bg-gray-700 text-white">{{.Name}}</option>
                {{ else }}
-                <option value="text2image/{{.Name}}" class="bg-gray-700 text-white">{{.Name}}</option>
+                <option value="/text2image/{{.Name}}" class="bg-gray-700 text-white">{{.Name}}</option>
                {{ end }}
                {{ end }}
              </select>
@@ -62,7 +62,7 @@
            <div class="mt-12">
              <input id="image-model" type="hidden" value="{{.Model}}">
-              <form id="genimage" action="text2image/{{.Model}}" method="get">
+              <form id="genimage" action="/text2image/{{.Model}}" method="get">
                <input
                  type="text"
                  id="input"
--- a/core/http/views/tts.html
+++ b/core/http/views/tts.html
@@ -1,7 +1,7 @@
 <!DOCTYPE html>
 <html lang="en">
 {{template "views/partials/head" .}}
-<script defer src="static/tts.js"></script>
+<script defer src="/static/tts.js"></script>
 <body class="bg-gray-900 text-gray-200">
 <div class="flex flex-col min-h-screen">
@@ -47,9 +47,9 @@
                {{ $model:=.Model}}
                {{ range .ModelsConfig }}
                {{ if eq .Name $model }}
-                <option value="tts/{{.Name}}" selected class="bg-gray-700 text-white">{{.Name}}</option>
+                <option value="/tts/{{.Name}}" selected class="bg-gray-700 text-white">{{.Name}}</option>
                {{ else }}
-                <option value="tts/{{.Name}}" class="bg-gray-700 text-white">{{.Name}}</option>
+                <option value="/tts/{{.Name}}" class="bg-gray-700 text-white">{{.Name}}</option>
                {{ end }}
                {{ end }}
              </select>
@@ -59,7 +59,7 @@
            <div class="mt-12">
              <input id="tts-model" type="hidden" value="{{.Model}}">
-              <form id="tts" action="tts/{{.Model}}" method="get">
+              <form id="tts" action="/tts/{{.Model}}" method="get">
                <input
                  type="text"
                  id="input"
--- a/docs/content/docs/features/distributed_inferencing.md
+++ b/docs/content/docs/features/distributed_inferencing.md
@@ -129,9 +129,6 @@ There are options that can be tweaked or parameters that can be set using enviro
 | Environment Variable | Description |
 |----------------------|-------------|
 | **LOCALAI_P2P** | Set to "true" to enable p2p |
 | **LOCALAI_FEDERATED** | Set to "true" to enable federated mode |
 | **FEDERATED_SERVER** | Set to "true" to enable federated server |
 | **LOCALAI_P2P_DISABLE_DHT** | Set to "true" to disable DHT and enable p2p layer to be local only (mDNS) |
 | **LOCALAI_P2P_ENABLE_LIMITS** | Set to "true" to enable connection limits and resources management (useful when running with poor connectivity or want to limit resources consumption) |
 | **LOCALAI_P2P_LISTEN_MADDRS** | Set to comma separated list of multiaddresses to override default libp2p 0.0.0.0 multiaddresses |
--- a/docs/content/docs/reference/compatibility-table.md
+++ b/docs/content/docs/reference/compatibility-table.md
@@ -16,8 +16,8 @@ LocalAI will attempt to automatically load models which are not explicitly confi
 | Backend and Bindings                                                             | Compatible models     | Completion/Chat endpoint | Capability | Embeddings support                | Token stream support | Acceleration |
 |----------------------------------------------------------------------------------|-----------------------|--------------------------|---------------------------|-----------------------------------|----------------------|--------------|
-| [llama.cpp]({{%relref "docs/features/text-generation#llama.cpp" %}})        | LLama, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes                      | GPT and Functions                        | yes | yes                  | CUDA, openCL, cuBLAS, Metal |
+| [llama.cpp]({{%relref "docs/features/text-generation#llama.cpp" %}})        | LLama, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes                      | GPT and Functions                        | yes** | yes                  | CUDA, openCL, cuBLAS, Metal |
-| [llama.cpp's ggml model (backward compatibility with old format, before GGUF)](https://github.com/ggerganov/llama.cpp) ([binding](https://github.com/go-skynet/go-llama.cpp))  | LLama, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes                      | GPT and Functions                        | yes | yes                  | CUDA, openCL, cuBLAS, Metal |
+| [llama.cpp's ggml model (backward compatibility with old format, before GGUF)](https://github.com/ggerganov/llama.cpp) ([binding](https://github.com/go-skynet/go-llama.cpp))  | LLama, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes                      | GPT and Functions                        | yes** | yes                  | CUDA, openCL, cuBLAS, Metal |
 | [whisper](https://github.com/ggerganov/whisper.cpp)         | whisper               | no                       | Audio                 | no                                | no                   | N/A |
 | [stablediffusion](https://github.com/EdVince/Stable-Diffusion-NCNN) ([binding](https://github.com/mudler/go-stable-diffusion))        | stablediffusion               | no                       | Image                 | no                                | no                   | N/A |
 | [langchain-huggingface](https://github.com/tmc/langchaingo)                                                                    | Any text generators available on HuggingFace through API | yes                      | GPT                        | no                                | no                   | N/A |
@@ -37,11 +37,14 @@ LocalAI will attempt to automatically load models which are not explicitly confi
 | `openvoice` | Open voice    | no                       | Audio generation and Voice cloning    | no                               | no                   | CPU/CUDA |
 | `parler-tts` | Open voice    | no                       | Audio generation and Voice cloning    | no                               | no                   | CPU/CUDA |
 | [rerankers](https://github.com/AnswerDotAI/rerankers) | Reranking API    | no                       | Reranking   | no                               | no                   | CPU/CUDA |
-| `transformers` | Various GPTs and quantization formats | yes                      | GPT, embeddings            | yes | yes*                  | CPU/CUDA/XPU |
+| `transformers` | Various GPTs and quantization formats | yes                      | GPT, embeddings            | yes | yes****                  | CPU/CUDA/XPU |
 | [bark-cpp](https://github.com/PABannier/bark.cpp)        | bark               | no                       | Audio-Only                 | no                                | no                   | yes |
 | [stablediffusion-cpp](https://github.com/leejet/stable-diffusion.cpp)         | stablediffusion-1, stablediffusion-2, stablediffusion-3, flux, PhotoMaker               | no                       | Image                 | no                                | no                   | N/A |
 | [silero-vad](https://github.com/snakers4/silero-vad) with [Golang bindings](https://github.com/streamer45/silero-vad-go) | Silero VAD    | no                       | Voice Activity Detection    | no                               | no                   | CPU |
 Note: any backend name listed above can be used in the `backend` field of the model configuration file (See [the advanced section]({{%relref "docs/advanced" %}})).
- \* Only for CUDA and OpenVINO CPU/XPU acceleration.
+- \* 7b ONLY
 - ** doesn't seem to be accurate
 - *** 7b and 40b with the `ggccv` format, for instance: https://huggingface.co/TheBloke/WizardLM-Uncensored-Falcon-40B-GGML
 - **** Only for CUDA and OpenVINO CPU/XPU acceleration.
--- a/docs/content/docs/reference/nvidia-l4t.md
+++ b/docs/content/docs/reference/nvidia-l4t.md
@@ -1,35 +0,0 @@
 +++
 disableToc = false
 title = "Running on Nvidia ARM64"
 weight = 27
 +++
 LocalAI can be run on Nvidia ARM64 devices, such as the Jetson Nano, Jetson Xavier NX, and Jetson AGX Xavier. The following instructions will guide you through building the LocalAI container for Nvidia ARM64 devices.
 ## Prerequisites
 - Docker engine installed (https://docs.docker.com/engine/install/ubuntu/)
 - Nvidia container toolkit installed (https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-ap)
 ## Build the container
 Build the LocalAI container for Nvidia ARM64 devices using the following command:
 ```bash
 git clone https://github.com/mudler/LocalAI
 cd LocalAI
 docker build --build-arg SKIP_DRIVERS=true --build-arg BUILD_TYPE=cublas --build-arg BASE_IMAGE=nvcr.io/nvidia/l4t-jetpack:r36.4.0 --build-arg IMAGE_TYPE=core -t localai-orin .
 ```
 ## Usage
 Run the LocalAI container on Nvidia ARM64 devices using the following command, where `/data/models` is the directory containing the models:
 ```bash
 docker run -e DEBUG=true -p 8080:8080 -v /data/models:/build/models  -ti --restart=always --name local-ai --runtime nvidia --gpus all localai-orin
 ```
 Note: `/data/models` is the directory containing the models. You can replace it with the directory containing your models.
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v2.25.0"
+  "version": "v2.24.2"
 }
--- a/docs/themes/hugo-theme-relearn
+++ b/docs/themes/hugo-theme-relearn
--- a/gallery/falcon3.yaml
+++ b/gallery/falcon3.yaml
@@ -1,40 +0,0 @@
 ---
 name: "falcon3"
 config_file: |
  mmap: true
  template:
    chat_message: |
      <|{{ .RoleName }}|>
      {{ if .FunctionCall -}}
      Function call:
      {{ else if eq .RoleName "tool" -}}
      Function response:
      {{ end -}}
      {{ if .Content -}}
      {{.Content }}
      {{ end -}}
      {{ if .FunctionCall -}}
      {{toJson .FunctionCall}}
      {{ end -}}
      {{ if eq .RoleName "assistant" }}<|endoftext|>{{ end }}
    function: |
      <|system|>
      You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
      {{range .Functions}}
      {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
      {{end}}
      For each function call return a json object with function name and arguments
      {{.Input }}
      <|im_start|>assistant
    chat: |
      {{.Input }}
      <|im_start|>assistant
    completion: |
      {{.Input}}
  context_size: 4096
  f16: true
  stopwords:
  - '<|endoftext|>'
  - '<dummy32000>'
  - '</s>'
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
--- a/Show More
+++ b/Show More