Compare commits

...

200 Commits

Author SHA1 Message Date
Ettore Di Giacinto
f272605b95 more robust approach
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
9a0982066f WIP - improve start and end of speech detection
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
30e3c47598 Improve audio detection
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
01aace3017 Tweak silero settings
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
5f2c83700c go tidy
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
90206830c1 WIP - to drop
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
7592984b64 Use template evaluator for preparing LLM prompt in wrapped mode
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
c526f05de5 Small adaptations
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
06e438d68b WIP
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
3dd1b300e9 wip
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
ebfe8dd119 gRPC client stubs
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
136fbd25f5 wip(vad)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
59531562a6 Fix lock handling
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
9273395e38 Move to debug calls
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
0318434b17 Attach context for VAD
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
9614422713 chore(vad): try to hook vad to received data from the API (WIP)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
a3fd8caaa6 feat(vad): hook vad detection
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
1796a1713d chore: extract realtime models into two categories
One is anyToAny models, which require a VAD model; the other is
wrappedModel, which requires a VAD model along with the other models in
the pipeline.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
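
A minimal Go sketch of the two categories described in this commit; the type names and fields are assumptions inferred from the commit message and the pipeline config shown a few commits below, not LocalAI's actual types.

```go
// Hypothetical sketch of the two realtime model categories; names and
// fields are inferred from the commit message, not LocalAI's real code.
package main

import "fmt"

// anyToAnyModel consumes and produces audio directly, so it only needs
// a VAD model to detect speech boundaries in the incoming stream.
type anyToAnyModel struct {
	VAD   string
	Model string
}

// wrappedModel wraps a conventional text LLM: VAD segments the audio,
// a transcription model turns speech into text, the LLM replies, and a
// TTS model synthesizes the answer.
type wrappedModel struct {
	VAD           string
	Transcription string
	LLM           string
	TTS           string
}

func main() {
	w := wrappedModel{
		VAD:           "silero-vad", // assumed name; the commits only mention silero
		Transcription: "whisper-base-q5_1",
		LLM:           "llama-3.2-1b-instruct:q4_k_m",
		TTS:           "voice-it-riccardo_fasol-x-low",
	}
	fmt.Printf("%+v\n", w)
}
```
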
Ettore Di Giacinto
4f69170273 feat: correctly detect when starting the vad server
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
60c99ddc50 refactor
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
b4fea58076 Load wrapper clients
Testing with:

```yaml
name: gpt-4o
pipeline:
 tts: voice-it-riccardo_fasol-x-low
 transcription: whisper-base-q5_1
 llm: llama-3.2-1b-instruct:q4_k_m
```

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
9e965033bb chore: simplify passing options to ModelOptions
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
05225c93e4 Fix route
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
65f4c12d1e setup ws upgrade
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
f45d11c734 Add model interface to sessions
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
9b6826d5ff audio
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
4ca7689f31 debug
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
dcb13a7e6f WIP
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
8f507c39c0 WIP
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 17:13:58 +01:00
Ettore Di Giacinto
d7dee3a5ec feat(diffusers): add support for Sana pipelines (#4603)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 11:13:16 +01:00
Ettore Di Giacinto
b8d74e52b1 chore(model gallery): add steiner-32b-preview (#4602)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 09:41:30 +01:00
Ettore Di Giacinto
62abe0d2c9 chore(model gallery): add qwen2.5-72b-rp-ink (#4601)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 09:33:19 +01:00
Ettore Di Giacinto
5414c294c4 chore(model gallery): add negative-anubis-70b-v1 (#4600)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 09:29:25 +01:00
Ettore Di Giacinto
1b3e89c89c chore(model gallery): add LocalAI-functioncall-phi-4-v0.3 (#4599)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 09:27:18 +01:00
Ettore Di Giacinto
69c6e5b192 chore(stablediffusion-ggml): disable sycl optimizations (#4598)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 09:17:55 +01:00
LocalAI [bot]
0c02512f15 chore: ⬆️ Update ggerganov/llama.cpp to 504af20ee4eae72080a56d59d744f6774f7901ce (#4597)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-14 09:07:20 +01:00
dependabot[bot]
b0ead0bf12 chore(deps): Bump securego/gosec from 2.21.4 to 2.22.0 (#4594)
Bumps [securego/gosec](https://github.com/securego/gosec) from 2.21.4 to 2.22.0.
- [Release notes](https://github.com/securego/gosec/releases)
- [Changelog](https://github.com/securego/gosec/blob/master/.goreleaser.yml)
- [Commits](https://github.com/securego/gosec/compare/v2.21.4...v2.22.0)

---
updated-dependencies:
- dependency-name: securego/gosec
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-01-13 21:17:11 +00:00
Ettore Di Giacinto
ab5adf40af chore(deps): bump llama.cpp to '924518e2e5726e81f3aeb2518fb85963a500e… (#4592)
chore(deps): bump llama.cpp to '924518e2e5726e81f3aeb2518fb85963a500e93a'

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-13 17:33:06 +01:00
Ettore Di Giacinto
8d82afb595 fix(stablediffusion-ggml): enable oneapi before build (#4593)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-13 10:11:48 +01:00
Ettore Di Giacinto
aea71dd2c6 fix(stablediffusion-ggml): correctly enable sycl (#4591)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 22:07:01 +01:00
Ettore Di Giacinto
9fdb44323d chore(model gallery): add LocalAI-functioncall-phi-4-v0.2 (#4589)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 18:50:41 +01:00
Ettore Di Giacinto
6a299c04a7 feat(stablediffusion-ggml): respect build type (#4581)
* feat(stablediffusion-ggml): respect build type

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* combine libraries

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 18:33:51 +01:00
Ettore Di Giacinto
9ce71fe427 fix(gallery): correct UL typo
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 11:50:40 +01:00
Ettore Di Giacinto
e8de7b52da chore(model gallery): add LocalAI-functioncall-phi-4-v0.1 (#4588)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 11:26:42 +01:00
Ettore Di Giacinto
1780ccadbc chore(model gallery): add finemath-llama-3b (#4587)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 10:40:26 +01:00
Ettore Di Giacinto
f8cffd05e5 chore(model gallery): add negative_llama_70b (#4586)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 10:36:01 +01:00
Ettore Di Giacinto
b898cd49b5 chore(model gallery): add sky-t1-32b-preview (#4585)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 10:33:29 +01:00
LocalAI [bot]
7cd33d10c9 chore: ⬆️ Update ggerganov/llama.cpp to c05e8c9934f94fde49bc1bc9dc51eed282605150 (#4579)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-11 23:25:09 +01:00
Ettore Di Giacinto
cd480dbe5c chore(model gallery): add rombos-qwen2.5-writer-32b (#4584)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-11 23:24:55 +01:00
Ettore Di Giacinto
cb8bf79ada chore(model gallery): add qwq-32b-preview-ideawhiz-v1 (#4583)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-11 22:45:37 +01:00
Ettore Di Giacinto
b206eab80f chore(model gallery): add nightwing3-10b-v0.1 (#4582)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-11 22:41:30 +01:00
LocalAI [bot]
80dc23fab9 chore(model-gallery): ⬆️ update checksum (#4580)
⬆️ Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-11 22:23:10 +01:00
LocalAI [bot]
844c0c422d docs: ⬆️ update docs version mudler/LocalAI (#4578)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-11 22:10:45 +01:00
LocalAI [bot]
07655c0c2e chore: ⬆️ Update ggerganov/llama.cpp to ba8a1f9c5b675459c55a83e3f97f10df3a66c788 (#4575)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-10 21:45:11 +00:00
Ettore Di Giacinto
bebfd19b45 chore(model gallery): add phi-3.5-moe-instruct (#4574)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-10 16:59:34 +01:00
Ettore Di Giacinto
6e34430d99 chore(model gallery): add chuluun-qwen2.5-72b-v0.01 (#4573)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-10 16:27:22 +01:00
Ettore Di Giacinto
0d08aaa29b chore(model gallery): add gwq-9b-preview2 (#4572)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-10 16:26:20 +01:00
Ettore Di Giacinto
66f9c06e7d docs: update README with langchain integration
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-10 09:20:33 +01:00
LocalAI [bot]
775adf871f chore: ⬆️ Update ggerganov/llama.cpp to 1204f9727005974587d6fc1dcd4d4f0ead87c856 (#4570)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-09 21:46:19 +00:00
Ettore Di Giacinto
a0fc19a3d6 chore(model gallery): add 70b-l3.3-cirrus-x1 (#4569)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-09 16:28:55 +01:00
Ettore Di Giacinto
7bd18662a7 chore(model gallery): add huatuogpt-o1-7b-v0.1 (#4568)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-09 16:01:57 +01:00
Ettore Di Giacinto
95b0739906 chore(model gallery): add minithinky-v2-1b-llama-3.2 (#4567)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-09 15:58:38 +01:00
Ettore Di Giacinto
cad7e9a1cd chore(model gallery): add 14b-qwen2.5-freya-x1 (#4566)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-09 15:55:56 +01:00
Ettore Di Giacinto
4426efab05 chore(deps): bump edgevpn to v0.29.0 (#4564)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-09 09:33:22 +01:00
Saarthak Verma
6765b17acd feat(downloader): resume partial downloads (#4537)
* feat(resume downloads): add basic tests

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>

* test(resume downloads): implement file download tc

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>

* test(resume downloads): add resume partial download test

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>

* feat(resume downloads): implement resumable downloads for interrupted transfers

- Adds support for resuming partially downloaded files
- Uses HTTP Range header to continue from last byte position
- Maintains download progress across interruptions
- Preserves partial downloads with .partial extension
- Validates SHA256 checksum after completion

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>

* fix(resume downloads): incorrect download percent on front end

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>

* feat(resume download): add range header check tc

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>

* feat(resume download): implement range header check

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>

---------

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>
2025-01-09 09:22:52 +01:00
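
The mechanism described above can be sketched in Go roughly as follows; this is a simplified illustration of the Range-header approach under the stated assumptions (file naming, error handling, and the example URL are illustrative, not LocalAI's downloader code).

```go
// Sketch: resume an interrupted download by continuing from the size of
// the existing .partial file via an HTTP Range request.
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
)

func resumeDownload(url, dest string) error {
	partial := dest + ".partial"
	f, err := os.OpenFile(partial, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o644)
	if err != nil {
		return err
	}
	defer f.Close()
	info, err := f.Stat()
	if err != nil {
		return err
	}

	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return err
	}
	// Ask the server to resume from the bytes we already have.
	req.Header.Set("Range", fmt.Sprintf("bytes=%d-", info.Size()))

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	switch resp.StatusCode {
	case http.StatusPartialContent: // 206: server honoured the Range header
	case http.StatusOK: // 200: range ignored, full body follows; start over
		if err := f.Truncate(0); err != nil {
			return err
		}
	default:
		return fmt.Errorf("unexpected status: %s", resp.Status)
	}
	if _, err := io.Copy(f, resp.Body); err != nil {
		return err
	}
	// A real implementation would verify the SHA256 checksum here before
	// renaming the .partial file into place.
	return os.Rename(partial, dest)
}

func main() {
	if err := resumeDownload("https://example.com/model.gguf", "model.gguf"); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
}
```
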
Ettore Di Giacinto
ae1340d59b chore: update labeler.yml to include go files (#4565)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-09 09:20:51 +01:00
Ettore Di Giacinto
fc52f179fe chore(model gallery): add phi-4 (#4562)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-08 23:26:25 +01:00
LocalAI [bot]
4f43a9a162 chore: ⬆️ Update ggerganov/llama.cpp to 8d59d911711b8f1ba9ec57c4b192ccd2628af033 (#4561)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-08 21:43:56 +00:00
Ettore Di Giacinto
20edd44463 chore(model gallery): add dolphin3.0-qwen2.5-3b (#4560)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-08 11:57:25 +01:00
Ettore Di Giacinto
1a4f9d8453 chore(model gallery): add dolphin3.0-qwen2.5-1.5b (#4559)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-08 11:55:44 +01:00
Ettore Di Giacinto
f2dd33b8f4 chore(model gallery): add dolphin3.0-qwen2.5-0.5b (#4558)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-08 11:53:46 +01:00
LocalAI [bot]
25e988868c chore: ⬆️ Update ggerganov/llama.cpp to 53ff6b9b9fb25ed0ec0a213e05534fe7c3d0040f (#4556)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-08 11:51:14 +01:00
Ettore Di Giacinto
ab344e4f47 docs: update compatibility-table.md (#4557)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-07 21:20:44 +01:00
Ettore Di Giacinto
fac7893dd6 chore(model gallery): add dolphin3.0-llama3.2-3b (#4555)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-07 20:27:29 +01:00
Ettore Di Giacinto
9be338cfe4 chore(model gallery): add dolphin3.0-llama3.2-1b (#4554)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-07 20:24:21 +01:00
Ettore Di Giacinto
b4d4f96919 chore(model gallery): add dolphin3.0-llama3.1-8b (#4553)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-07 20:23:35 +01:00
Max Goltzsche
8cc2d01caa feat(ui): path prefix support via HTTP header (#4497)
Makes the web app honour the `X-Forwarded-Prefix` HTTP request header that a reverse-proxy may send to inform the app that its public routes contain a path prefix.
For instance, this allows serving the webapp via a reverse-proxy/ingress controller under a path prefix/sub-path such as `/localai/`, while still being able to use the regular LocalAI routes/paths without a prefix when connecting directly to the LocalAI server.

Changes:
* Add new `StripPathPrefix` middleware to strip the path prefix (provided with the `X-Forwarded-Prefix` HTTP request header) from the request path prior to matching the HTTP route.
* Add a `BaseURL` utility function to build the base URL, honouring the `X-Forwarded-Prefix` HTTP request header.
* Generate the derived base URL into the HTML (`head.html` template) as `<base/>` tag.
* Make all webapp-internal URLs (within HTML+JS) relative in order to make the browser resolve them against the `<base/>` URL specified within each HTML page's header.
* Make font URLs within the CSS files relative to the CSS file.
* Generate redirect location URLs using the new `BaseURL` function.
* Use the new `BaseURL` function to generate absolute URLs within gallery JSON responses.

Closes #3095

TL;DR:
The header-based approach allows moving the path-prefix configuration concern completely to the reverse-proxy/ingress, as opposed to having to align the path-prefix configuration between LocalAI, the reverse-proxy, and potentially other internal LocalAI clients.
The gofiber swagger handler already supports path prefixes this way; see e2d9e9916d/swagger.go (L79)

Signed-off-by: Max Goltzsche <max.goltzsche@gmail.com>
2025-01-07 17:18:21 +01:00
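
An illustrative net/http version of the StripPathPrefix/BaseURL ideas above; LocalAI itself implements this as gofiber middleware, so handler names and wiring here are assumptions for illustration only.

```go
// Sketch of header-driven path-prefix handling, as described in the PR.
package main

import (
	"fmt"
	"net/http"
	"strings"
)

// stripForwardedPrefix removes the proxy-announced prefix from the
// request path before route matching, so internal routes stay unprefixed.
func stripForwardedPrefix(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if prefix := r.Header.Get("X-Forwarded-Prefix"); prefix != "" {
			p := strings.TrimPrefix(r.URL.Path, strings.TrimSuffix(prefix, "/"))
			if !strings.HasPrefix(p, "/") {
				p = "/" + p
			}
			r.URL.Path = p
		}
		next.ServeHTTP(w, r)
	})
}

// baseURL reconstructs the public base path for redirects and templates,
// honouring the same header (in the PR it is rendered into a <base/> tag).
func baseURL(r *http.Request) string {
	if prefix := r.Header.Get("X-Forwarded-Prefix"); prefix != "" {
		return strings.TrimSuffix(prefix, "/") + "/"
	}
	return "/"
}

func main() {
	mux := http.NewServeMux()
	mux.HandleFunc("/models", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintf(w, "base URL: %s\n", baseURL(r))
	})
	// With a proxy sending X-Forwarded-Prefix: /localai/, a request for
	// /localai/models is matched by the unprefixed /models route.
	http.ListenAndServe(":8080", stripForwardedPrefix(mux))
}
```
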
LocalAI [bot]
bf37eebecb chore: ⬆️ Update ggerganov/llama.cpp to ecebbd292d741ac084cf248146b2cfb17002aa1d (#4552)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-07 10:14:07 +01:00
dependabot[bot]
3f0850b58b chore(deps): Bump docs/themes/hugo-theme-relearn from d25f856 to 80e448e (#4549)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `d25f856` to `80e448e`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](d25f856477...80e448e5bd)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-01-06 20:20:34 +00:00
Ettore Di Giacinto
2ffa89b8b9 chore(model gallery): add 14b-qwen2.5-kunou-v1 (#4547)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-06 10:43:09 +01:00
Ettore Di Giacinto
d43adc0205 chore(model gallery): add triangulum-10b (#4546)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-06 10:42:45 +01:00
Ettore Di Giacinto
78b34505ab chore(model gallery): add 32b-qwen2.5-kunou-v1 (#4545)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-06 10:08:35 +01:00
LocalAI [bot]
e55a1bed59 chore: ⬆️ Update ggerganov/llama.cpp to b56f079e28fda692f11a8b59200ceb815b05d419 (#4544)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-05 21:43:06 +00:00
Ettore Di Giacinto
0d7550ad54 chore(deps): bump grpcio to 1.69.0 (#4543)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-05 15:01:49 +01:00
Ettore Di Giacinto
b5992255ac chore(model gallery): add qwentile2.5-32b-instruct (#4541)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-05 09:22:00 +01:00
Ettore Di Giacinto
e845cc0401 chore(model gallery): add llama-deepsync-3b (#4540)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-05 09:19:05 +01:00
Ettore Di Giacinto
a10033e8a4 chore(model gallery): add experimental-lwd-mirau-rp-14b-iq-imatrix (#4539)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-05 09:12:12 +01:00
LocalAI [bot]
6c6d840e6b chore: ⬆️ Update ggerganov/llama.cpp to 9394bbd484f802ce80d2858033583af3ef700d25 (#4536)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-04 21:43:08 +00:00
Ettore Di Giacinto
a8b3b3d6f4 chore(model gallery): add llama3.1-8b-prm-deepseek-data (#4535)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-04 09:48:34 +01:00
Ettore Di Giacinto
ec66f7e3b1 chore(model gallery): add codepy-deepthink-3b (#4534)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-04 09:45:07 +01:00
Ettore Di Giacinto
05841c2435 chore(model gallery): add drt-o1-7b (#4533)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-04 09:44:14 +01:00
Ettore Di Giacinto
c553d73748 chore(deps): bump llama.cpp to 4b0c638b9 (#4532)
deps(llama.cpp): bump to 4b0c638b9

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-04 09:40:08 +01:00
Ettore Di Giacinto
1006e8a2ed ci: disable arm jobs
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-03 21:58:04 +01:00
Ettore Di Giacinto
9bcfda171b ci: lower concurrent jobs
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-03 20:48:23 +01:00
Ettore Di Giacinto
baee4f7bd5 ci: split jobs
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-03 19:23:05 +01:00
Ettore Di Giacinto
286dc32fe0 ci(arm64): try building on self-hosted
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-03 19:18:18 +01:00
Ettore Di Giacinto
36e4c0fcf0 chore(model gallery): add nera_noctis-12b (#4530)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-03 09:10:03 +01:00
LocalAI [bot]
3c21c8789a chore: ⬆️ Update ggerganov/llama.cpp to 2f0ee84b9b02d2a98742308026f060ebdc2423f1 (#4528)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-02 21:43:37 +00:00
Ettore Di Giacinto
d9facbcee9 chore(model gallery): add l3.1-purosani-2-8b (#4527)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-02 10:46:11 +01:00
Ettore Di Giacinto
930280ecac chore(model gallery): add sainemo-remix (#4526)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-02 10:46:01 +01:00
Ettore Di Giacinto
3415e6ae74 chore(model gallery): add qwenwify2.5-32b-v4.5 (#4525)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-02 10:45:52 +01:00
Ettore Di Giacinto
f1082f3c6d chore(model gallery): add violet_twilight-v0.2 (#4524)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-01 14:41:48 +01:00
Ettore Di Giacinto
f345f7a795 chore(model gallery): add captain-eris-diogenes_twilight-v0.420-12b (#4523)
chore(model gallery): add captain-eris-diogenes_twilight-v0.420-12b-arm-imatrix

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-01 13:33:39 +01:00
Ettore Di Giacinto
1a2a7a57b3 chore(model gallery): add mn-12b-mag-mell-r1-iq-arm-imatrix (#4522)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-01 13:27:13 +01:00
Ettore Di Giacinto
ae80a2bd24 chore(model gallery): add smallthinker-3b-preview (#4521)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-01 13:26:48 +01:00
LocalAI [bot]
c30ecdd535 chore: ⬆️ Update ggerganov/llama.cpp to 0827b2c1da299805288abbd556d869318f2b121e (#4520)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-31 21:43:29 +00:00
Ettore Di Giacinto
f16c7cef92 chore(model gallery): add q2.5-veltha-14b-0.5 (#4519)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-31 11:23:29 +01:00
Ettore Di Giacinto
e1dd78bcea chore(model gallery): add huatuogpt-o1-8b (#4518)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-31 11:23:07 +01:00
dependabot[bot]
25acb0cbbc chore(deps): Bump docs/themes/hugo-theme-relearn from ec88e24 to d25f856 (#4515)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `ec88e24` to `d25f856`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](ec88e24f46...d25f856477)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-31 11:18:51 +01:00
LocalAI [bot]
7674c80bb6 chore: ⬆️ Update ggerganov/llama.cpp to 716bd6dec3e044e5c325386b5b0483392b24cefe (#4516)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-31 11:18:33 +01:00
Ettore Di Giacinto
e044970a5b chore(model gallery): add qwen2.5-32b-rp-ink (#4517)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-31 11:17:29 +01:00
LocalAI [bot]
639526d207 chore: ⬆️ Update leejet/stable-diffusion.cpp to dcf91f9e0f2cbf9da472ee2a556751ed4bab2d2a (#4509)
⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-30 21:44:53 +00:00
dependabot[bot]
998ff9fa22 chore(deps): Bump gradio from 3.48.0 to 5.9.1 in /backend/python/openvoice (#4514)
chore(deps): Bump gradio in /backend/python/openvoice

Bumps [gradio](https://github.com/gradio-app/gradio) from 3.48.0 to 5.9.1.
- [Release notes](https://github.com/gradio-app/gradio/releases)
- [Changelog](https://github.com/gradio-app/gradio/blob/main/CHANGELOG.md)
- [Commits](https://github.com/gradio-app/gradio/compare/gradio@3.48.0...gradio@5.9.1)

---
updated-dependencies:
- dependency-name: gradio
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-30 20:05:45 +00:00
LocalAI [bot]
7122c7472e chore: ⬆️ Update ggerganov/llama.cpp to a813badbbdf0d38705f249df7a0c99af5cdee678 (#4512)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-29 21:42:38 +00:00
LocalAI [bot]
671381267a chore: ⬆️ Update ggerganov/llama.cpp to f865ea149d71ef883e3780fced8a20a1464eccf4 (#4510)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-28 21:43:05 +00:00
Ettore Di Giacinto
d1762e098e chore(model gallery): add miscii-14b-1225 (#4508)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-28 10:56:59 +01:00
Ettore Di Giacinto
270d33504b chore(model gallery): add miscii-14b-1028 (#4507)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-28 10:54:47 +01:00
Ettore Di Giacinto
9b0983d027 chore(model gallery): add control-nanuq-8b (#4506)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-28 10:49:53 +01:00
Ettore Di Giacinto
afd0af987d Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-27 15:17:02 +01:00
Ettore Di Giacinto
58524d40c9 Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-27 15:13:06 +01:00
Ettore Di Giacinto
2a7222c6aa chore(model gallery): add falcon3-7b-instruct-abliterated (#4504)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-27 11:29:34 +01:00
Ettore Di Giacinto
0093985e7c chore(model gallery): add falcon3-10b-instruct-abliterated (#4503)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-27 11:24:13 +01:00
Ettore Di Giacinto
7f51e2dddf chore(model gallery): add falcon3-3b-instruct-abliterated (#4502)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-27 11:20:10 +01:00
Ettore Di Giacinto
f3bbdef77d chore(model gallery): add falcon3-1b-instruct-abliterated (#4501)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-27 11:15:56 +01:00
LocalAI [bot]
9cbf168dc0 chore: ⬆️ Update ggerganov/llama.cpp to d79d8f39b4da6deca4aea8bf130c6034c482b320 (#4500)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-26 21:44:36 +00:00
Ettore Di Giacinto
9572f0577b chore(model gallery): add teleut-7b-rp (#4499)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-26 10:52:57 +01:00
Ettore Di Giacinto
1a14c7d45a chore(model gallery): add qvq-72b-preview (#4498)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-26 10:47:54 +01:00
LocalAI [bot]
5c29e0cd4d chore: ⬆️ Update ggerganov/llama.cpp to 9ba399dfa7f115effc63d48e6860a94c9faa31b2 (#4496)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-25 21:43:06 +00:00
Ettore Di Giacinto
1a74af1492 chore(model gallery): add llama-3.1-8b-open-sft (#4495)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-25 11:47:33 +01:00
Ettore Di Giacinto
8f6332ab23 chore(model gallery): add dans-personalityengine-v1.1.0-12b (#4494)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-25 11:47:22 +01:00
Ettore Di Giacinto
816ae7a53a chore(model gallery): add fastllama-3.2-1b-instruct (#4493)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-25 11:47:11 +01:00
LocalAI [bot]
1d630e4185 chore(model-gallery): ⬆️ update checksum (#4492)
⬆️ Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-24 23:04:50 +01:00
LocalAI [bot]
bc8dd3ad14 chore: ⬆️ Update ggerganov/llama.cpp to 2cd43f4900ba0e34124fdcbf02a7f9df25a10a3d (#4491)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-24 21:44:11 +00:00
Ettore Di Giacinto
b969053701 chore(gallery): re-order
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-24 11:10:56 +01:00
Ettore Di Giacinto
60bf7c9dd7 chore(model gallery): add rombos-llm-70b-llama-3.3 (#4490)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-24 11:04:31 +01:00
Ettore Di Giacinto
d65c10cee7 chore(model gallery): add tqwendo-36b (#4489)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-24 11:04:19 +01:00
Ettore Di Giacinto
6c71698299 chore(model gallery): add l3.3-ms-evalebis-70b (#4488)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-24 10:59:36 +01:00
LocalAI [bot]
c7c275c7c8 chore(model-gallery): ⬆️ update checksum (#4487)
⬆️ Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-24 10:56:02 +01:00
LocalAI [bot]
d0adbee75d chore: ⬆️ Update ggerganov/llama.cpp to 32d6ee6385b3fc908b283f509b845f757a6e7206 (#4486)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-24 10:55:30 +01:00
dependabot[bot]
159a7f6df2 chore(deps): Bump docs/themes/hugo-theme-relearn from bd1f3d3 to ec88e24 (#4460)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `bd1f3d3` to `ec88e24`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](bd1f3d3432...ec88e24f46)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-23 22:15:38 +00:00
Ettore Di Giacinto
0eb2911aad chore(llava): update clip.patch (#4453)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-23 19:11:31 +01:00
Ettore Di Giacinto
cab9f88ca4 chore(docs): add nvidia l4t instructions (#4454)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-23 18:59:33 +01:00
Ettore Di Giacinto
a3b675b09e Delete .cirrus.yml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-23 18:31:50 +01:00
Ettore Di Giacinto
6477913e8f chore(ci): increase task timeout
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-23 16:43:32 +01:00
Ettore Di Giacinto
138cd97ce7 chore(ci): try to add CirrusCI to build arm64 images natively
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-23 15:37:57 +01:00
Ettore Di Giacinto
4dd9ac39b0 chore(ci): comment arm64 job until we find a native CI runner (#4452)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-23 12:34:39 +01:00
LocalAI [bot]
23499ddc8a chore: ⬆️ Update ggerganov/llama.cpp to ebdee9478ca7ba65497b9b96f7457698c6ee5115 (#4451)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-22 22:56:41 +00:00
Ettore Di Giacinto
8864156300 chore(nvidia-l4t): add l4t arm64 images (#4449)
chore(nvidia-l4t): add nvidia-l4t arm64 images

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-22 21:29:33 +01:00
Ettore Di Giacinto
478014ca18 feat(Dockerfile): allow to skip driver installation (#4447)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-22 21:28:38 +01:00
Ettore Di Giacinto
d45477b003 chore(model gallery): add llama-3.3-70b-instruct-ablated (#4448)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-22 08:38:47 +01:00
Ettore Di Giacinto
396fb88e33 chore(model gallery): add anubis-70b-v1 (#4446)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-22 08:33:35 +01:00
LocalAI [bot]
a429ec1b3f chore: ⬆️ Update ggerganov/llama.cpp to 5cd85b5e008de2ec398d6596e240187d627561e3 (#4445)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-21 21:44:44 +00:00
Ettore Di Giacinto
5b5fb9c22a chore(model gallery): add orca_mini_v8_1_70b (#4444)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-21 10:39:58 +01:00
LocalAI [bot]
801a87c3a6 chore: ⬆️ Update ggerganov/llama.cpp to eb5c3dc64bd967f2e23c87d9dec195f45468de60 (#4442)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-20 21:48:03 +00:00
Ettore Di Giacinto
badbd212f7 chore(model gallery): add tq2.5-14b-neon-v1 (#4441)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-20 16:11:16 +01:00
Ettore Di Giacinto
c4bbecc4d6 chore(model gallery): add tq2.5-14b-aletheia-v1 (#4440)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-20 16:08:23 +01:00
Ettore Di Giacinto
8a08e9ec67 fix(openvoice): pin numpy before installing torch (#4439)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-20 10:34:23 +01:00
LocalAI [bot]
61e486dbf5 chore: ⬆️ Update ggerganov/llama.cpp to d408bb9268a988c5a60a5746d3a6430386e7604d (#4437)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-19 23:03:47 +00:00
Ettore Di Giacinto
f2f387e1dd fix(openvoice): do not pin numpy (#4438)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-19 21:30:43 +01:00
Ettore Di Giacinto
3be9a08fc9 fix(deps): pin openvoice pytorch/torchaudio (#4436)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-19 18:24:33 +01:00
Ettore Di Giacinto
b325807c60 fix(intel): pin torch and intel-extensions (#4435)
* fix(intel): pin torch version

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(intel): pin intel packages version

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-19 15:39:32 +01:00
jtwolfe
ae9855a39e chore(docs): patch p2p detail in env and docs (#4434)
* Update distributed_inferencing.md

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

* Update .env

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

* Update distributed_inferencing.md

whoops

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

---------

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-19 15:19:31 +01:00
LocalAI [bot]
9ac62b589f chore: ⬆️ Update ggerganov/llama.cpp to cd920d0ac38ec243605a5a57c50941140a193f9e (#4433)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-19 12:15:30 +00:00
Ettore Di Giacinto
d12660a286 chore(model gallery): add llama-chat-summary-3.2-3b (#4432)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-19 09:56:19 +01:00
LocalAI [bot]
3d3bd2d10f chore: ⬆️ Update ggerganov/llama.cpp to 0bf2d10c5514ff61b99897a4a5054f846e384e1e (#4429)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-19 09:53:49 +01:00
Ettore Di Giacinto
b656d10556 chore(model gallery): add llama-song-stream-3b-instruct (#4431)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-19 09:48:33 +01:00
Ettore Di Giacinto
8c67f38ef6 chore(model gallery): add falcon3-10b-instruct (#4426)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-18 10:36:41 +01:00
Ettore Di Giacinto
4623728cd7 chore(model gallery): add qwen2-vl-72b-instruct (#4425)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-18 10:35:49 +01:00
Ettore Di Giacinto
5f804aa6e8 chore(model gallery): add falcon3-3b-instruct (#4424)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-18 10:32:31 +01:00
Ettore Di Giacinto
f52c6e3a31 chore(model gallery): add falcon3-1b-instruct (#4423)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-18 10:12:06 +01:00
Ettore Di Giacinto
0b4bb7a562 chore(model gallery): add llama-openreviewer-8b (#4422)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-18 09:49:48 +01:00
mintyleaf
2bc4b56a79 feat: stream tokens usage (#4415)
* Use pb.Reply instead of []byte with Reply.GetMessage() in the llama gRPC backend, so that the proper usage data is available in reply-streaming mode at the last [DONE] frame

* Fix 'hang' on empty message from the start

Seems like that empty message marker trick was unnecessary

---------

Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-18 09:48:50 +01:00
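
A rough Go illustration of why streaming the full reply message matters here: the final frame carries token counters that are lost if only the raw message bytes are forwarded. The `Reply` struct mirrors the `message`/`tokens`/`prompt_tokens` fields visible in the backend.proto hunk further down; the channel wiring is purely illustrative, not LocalAI's actual API.

```go
// Sketch: keep usage counters available at the final streamed frame.
package main

import "fmt"

// Mirrors the Reply proto fields shown in the diff below.
type Reply struct {
	Message      []byte
	Tokens       int32
	PromptTokens int32
}

func (r *Reply) GetMessage() []byte { return r.Message }

func streamReplies(replies <-chan *Reply, onToken func(string)) (usage Reply) {
	for r := range replies {
		onToken(string(r.GetMessage()))
		// The last frame carries the final counters; forwarding only
		// GetMessage() bytes would drop them before the [DONE] frame.
		usage.Tokens = r.Tokens
		usage.PromptTokens = r.PromptTokens
	}
	return usage
}

func main() {
	ch := make(chan *Reply, 2)
	ch <- &Reply{Message: []byte("hello")}
	ch <- &Reply{Tokens: 6, PromptTokens: 12} // final [DONE]-style frame
	close(ch)
	u := streamReplies(ch, func(s string) { fmt.Print(s) })
	fmt.Printf("\ncompletion=%d prompt=%d\n", u.Tokens, u.PromptTokens)
}
```
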
LocalAI [bot]
fc920cc58a chore: ⬆️ Update ggerganov/llama.cpp to 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 (#4421)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-17 22:15:14 +00:00
Ettore Di Giacinto
fdb560b8e5 chore(model gallery): add qwq-lcot-7b-instruct (#4419)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-17 10:10:37 +01:00
Ettore Di Giacinto
708cba0c1b chore(llama.cpp): bump, drop penalize_nl (#4418)
deps(llama.cpp): bump, drop penalize_nl

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-17 00:47:52 +01:00
Ettore Di Giacinto
24abf568cb chore(tests): stabilize tts test (#4417)
chore(tests): stabilize test

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-17 00:46:48 +01:00
Ettore Di Giacinto
7ca0e2d925 fix(python): remove pin to setuptools, pin python version (#4395)
fix(setuptools): remove pin

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-16 10:55:02 +01:00
Ettore Di Giacinto
037e8030bf chore(model gallery): add qwen2-7b-multilingual-rp (#4394)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-16 09:48:33 +01:00
Ettore Di Giacinto
472d11f884 chore(model gallery): add marco-o1-uncensored (#4393)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-16 09:48:23 +01:00
Ettore Di Giacinto
b40d5d12b7 chore(model gallery): add naturallm-7b-instruct (#4392)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-16 09:47:49 +01:00
LocalAI [bot]
6938618e30 chore: ⬆️ Update ggerganov/llama.cpp to a0974156f334acf8af5858d7ede5ab7d7490d415 (#4391)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-15 22:01:44 +00:00
Ettore Di Giacinto
5d9c530eaa fix(gallery): disable default embeddings
Do not always enable embeddings on llama32, but let specific model settings decide.

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-15 18:43:39 +01:00
Ettore Di Giacinto
9429a53db7 chore(model gallery): add neumind-math-7b-instruct (#4388)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-15 10:07:56 +01:00
Ettore Di Giacinto
1d6d301370 chore(model gallery): add fusechat-llama-3.1-8b-instruct (#4387)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-15 10:07:42 +01:00
Ettore Di Giacinto
8f2be82667 chore(model gallery): add fusechat-llama-3.2-3b-instruct (#4386)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-15 10:07:30 +01:00
LocalAI [bot]
cca911f3e5 chore: ⬆️ Update ggerganov/llama.cpp to e52aba537a34d51a65cddec6bc6dafc9031edc63 (#4385)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-15 09:59:20 +01:00
Jason Godsey
e37bbbaacc fix: correct gallery/index.yaml (#4384)
Update index.yaml

DBG YAML errors: line 2025: cannot unmarshal !!str `-https:...` into []string

Signed-off-by: Jason Godsey <godsey@users.noreply.github.com>
2024-12-14 21:25:51 +01:00
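
The quoted error is easy to reproduce: a missing space after a list dash makes YAML read the entry as a plain string instead of a sequence item. A minimal Go reproduction with gopkg.in/yaml.v3 follows; the `urls` key and the example URL are made up for illustration.

```go
// Reproduce "cannot unmarshal !!str `-https:...` into []string".
package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

func main() {
	bad := "urls:\n  -https://example.com/model.yaml\n"  // plain scalar, not a list
	good := "urls:\n  - https://example.com/model.yaml\n" // proper sequence item

	var v struct {
		URLs []string `yaml:"urls"`
	}
	fmt.Println("bad: ", yaml.Unmarshal([]byte(bad), &v))  // unmarshal error
	fmt.Println("good:", yaml.Unmarshal([]byte(good), &v), v.URLs)
}
```
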
Ettore Di Giacinto
59cbf38b4b fix(gallery): correct syntax typo
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-14 21:21:27 +01:00
Ettore Di Giacinto
432c31d904 chore(model gallery): add chronos-gold-12b-1.0 (#4381)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-14 11:27:25 +01:00
Ettore Di Giacinto
af33483687 chore(model gallery): add fusechat-qwen-2.5-7b-instruct (#4380)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-14 11:27:11 +01:00
Ettore Di Giacinto
5051074845 chore(model gallery): add fusechat-gemma-2-9b-instruct (#4379)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-14 11:26:40 +01:00
Ettore Di Giacinto
fc4a714992 feat(llama.cpp): bump and adapt to upstream changes (#4378)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-14 00:30:52 +01:00
Ettore Di Giacinto
0429e00746 chore(model gallery): add hermes-3-llama-3.2-3b (#4376)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-13 09:51:26 +01:00
Ettore Di Giacinto
73f1f25b9a chore(model gallery): add evathene-v1.3 (#4375)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-13 09:51:13 +01:00
Ettore Di Giacinto
044570fa85 chore(model gallery): add l3.3-ms-evayale-70b (#4374)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-13 09:50:41 +01:00
LocalAI [bot]
37527420de chore: ⬆️ Update ggerganov/llama.cpp to 274ec65af6e54039eb95cb44904af5c945dca1fa (#4372)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-12 21:44:54 +00:00
Ettore Di Giacinto
1854b8c612 chore(model gallery): add l3.3-70b-euryale-v2.3 (#4371)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-12 12:22:48 +01:00
Ettore Di Giacinto
b8824f2ad9 chore(model gallery): add deepthought-8b-llama-v0.01-alpha (#4370)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-12 12:07:57 +01:00
Ettore Di Giacinto
3ab83e91df chore(model gallery): add 72b-qwen2.5-kunou-v1 (#4369)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-12 12:07:41 +01:00
LocalAI [bot]
f2cb261797 chore: ⬆️ Update ggerganov/llama.cpp to 235f6e14bf0ed0211c51aeff14139038ae1000aa (#4366)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-12 09:23:36 +01:00
110 changed files with 4317 additions and 471 deletions

.env (9 changes)

@@ -82,6 +82,15 @@
# Enable to allow p2p mode
# LOCALAI_P2P=true
# Enable to use federated mode
# LOCALAI_FEDERATED=true
# Enable to start federation server
# FEDERATED_SERVER=true
# Define to use federation token
# TOKEN=""
### Watchdog settings
###
# Enables watchdog to kill backends that are inactive for too much time

.github/labeler.yml (vendored, 4 changes)

@@ -5,6 +5,10 @@ dependencies:
- any:
- changed-files:
- any-glob-to-any-file: 'Makefile'
- changed-files:
- any-glob-to-any-file: '*.mod'
- changed-files:
- any-glob-to-any-file: '*.sum'
kind/documentation:
- any:


@@ -280,6 +280,7 @@ jobs:
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
skip-drivers: ${{ matrix.skip-drivers }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -301,6 +302,7 @@ jobs:
latest-image: 'latest-cpu'
latest-image-aio: 'latest-aio-cpu'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -312,6 +314,7 @@ jobs:
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
@@ -323,6 +326,7 @@ jobs:
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -334,6 +338,7 @@ jobs:
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
@@ -344,6 +349,7 @@ jobs:
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'vulkan'
platforms: 'linux/amd64'
@@ -354,4 +360,45 @@ jobs:
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
# parallel-builds:
# uses: ./.github/workflows/image_build.yml
# with:
# tag-latest: ${{ matrix.tag-latest }}
# tag-suffix: ${{ matrix.tag-suffix }}
# ffmpeg: ${{ matrix.ffmpeg }}
# image-type: ${{ matrix.image-type }}
# build-type: ${{ matrix.build-type }}
# cuda-major-version: ${{ matrix.cuda-major-version }}
# cuda-minor-version: ${{ matrix.cuda-minor-version }}
# platforms: ${{ matrix.platforms }}
# runs-on: ${{ matrix.runs-on }}
# aio: ${{ matrix.aio }}
# base-image: ${{ matrix.base-image }}
# grpc-base-image: ${{ matrix.grpc-base-image }}
# makeflags: ${{ matrix.makeflags }}
# latest-image: ${{ matrix.latest-image }}
# latest-image-aio: ${{ matrix.latest-image-aio }}
# skip-drivers: ${{ matrix.skip-drivers }}
# secrets:
# dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
# dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
# quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
# quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
# strategy:
# matrix:
# include:
# - build-type: 'cublas'
# cuda-major-version: "12"
# cuda-minor-version: "0"
# platforms: 'linux/arm64'
# tag-latest: 'false'
# tag-suffix: '-nvidia-l4t-arm64-core'
# latest-image: 'latest-nvidia-l4t-arm64-core'
# ffmpeg: 'true'
# image-type: 'core'
# base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
# runs-on: 'self-hosted'
# makeflags: "--jobs=4 --output-sync=target"
# skip-drivers: 'true'


@@ -49,6 +49,10 @@ on:
description: 'FFMPEG'
default: ''
type: string
skip-drivers:
description: 'Skip drivers by default'
default: 'false'
type: string
image-type:
description: 'Image type'
default: ''
@@ -234,6 +238,7 @@ jobs:
GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
GRPC_VERSION=v1.65.0
MAKEFLAGS=${{ inputs.makeflags }}
SKIP_DRIVERS=${{ inputs.skip-drivers }}
context: .
file: ./Dockerfile
cache-from: type=gha
@@ -262,6 +267,7 @@ jobs:
GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
GRPC_VERSION=v1.65.0
MAKEFLAGS=${{ inputs.makeflags }}
SKIP_DRIVERS=${{ inputs.skip-drivers }}
context: .
file: ./Dockerfile
cache-from: type=gha


@@ -18,7 +18,7 @@ jobs:
if: ${{ github.actor != 'dependabot[bot]' }}
- name: Run Gosec Security Scanner
if: ${{ github.actor != 'dependabot[bot]' }}
uses: securego/gosec@v2.21.4
uses: securego/gosec@v2.22.0
with:
# we let the report trigger content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...'


@@ -115,12 +115,13 @@ FROM requirements-${IMAGE_TYPE} AS requirements-drivers
ARG BUILD_TYPE
ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=0
ARG SKIP_DRIVERS=false
ENV BUILD_TYPE=${BUILD_TYPE}
# Vulkan requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "vulkan" ]; then
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils wget gpg-agent && \
@@ -136,7 +137,7 @@ EOT
# CuBLAS requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ]; then
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils
@@ -162,7 +163,7 @@ RUN <<EOT bash
EOT
# If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
libclblast-dev && \
@@ -170,7 +171,7 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
rm -rf /var/lib/apt/lists/* \
; fi
RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
hipblas-dev \


@@ -8,7 +8,7 @@ DETECT_LIBS?=true
# llama.cpp versions
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=dafae66cc242eb766797194d3c85c5e502625623
CPPLLAMA_VERSION?=504af20ee4eae72080a56d59d744f6774f7901ce
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -32,7 +32,7 @@ BARKCPP_VERSION?=v1.0.0
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=9578fdcc4632dc3de5565f28e2fb16b7c18f8d48
STABLEDIFFUSION_GGML_VERSION?=dcf91f9e0f2cbf9da472ee2a556751ed4bab2d2a
ONNX_VERSION?=1.20.0
ONNX_ARCH?=x64
@@ -302,14 +302,8 @@ sources/stablediffusion-ggml.cpp:
git checkout $(STABLEDIFFUSION_GGML_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a: sources/stablediffusion-ggml.cpp
cd sources/stablediffusion-ggml.cpp && \
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) .. && \
cmake --build . --config Release
backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a
backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
$(MAKE) -C backend/go/image/stablediffusion-ggml build/libstable-diffusion.a
$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc


@@ -126,10 +126,10 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
## 🚀 [Features](https://localai.io/features/)
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
- 🎨 [Image generation](https://localai.io/features/image-generation)
- 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/)
- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
@@ -137,6 +137,7 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
- 📈 [Reranker API](https://localai.io/features/reranker/)
- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
- 🔊 Voice activity detection (Silero-VAD support)
- 🌍 Integrated WebUI!
## 💻 Usage
@@ -159,6 +160,7 @@ Model galleries
Other:
- Helm chart https://github.com/go-skynet/helm-charts
- VSCode extension https://github.com/badgooooor/localai-vscode-plugin
- Langchain: https://python.langchain.com/docs/integrations/providers/localai/
- Terminal utility https://github.com/djcopley/ShellOracle
- Local Smart assistant https://github.com/mudler/LocalAGI
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision


@@ -159,6 +159,7 @@ message Reply {
bytes message = 1;
int32 tokens = 2;
int32 prompt_tokens = 3;
bytes audio = 5;
}
message ModelOptions {


@@ -428,6 +428,7 @@ struct llama_server_context
{
llama_model *model = nullptr;
llama_context *ctx = nullptr;
const llama_vocab * vocab = nullptr;
clip_ctx *clp_ctx = nullptr;
@@ -439,6 +440,7 @@ struct llama_server_context
bool clean_kv_cache = true;
bool all_slots_are_idle = false;
bool add_bos_token = true;
bool has_eos_token = true;
int32_t n_ctx; // total context for all clients / slots
@@ -492,8 +494,8 @@ struct llama_server_context
}
common_init_result common_init = common_init_from_params(params);
model = common_init.model;
ctx = common_init.context;
model = common_init.model.release();
ctx = common_init.context.release();
if (model == nullptr)
{
LOG_ERR("unable to load model: %s", params.model.c_str());
@@ -502,7 +504,7 @@ struct llama_server_context
if (multimodal) {
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
const int n_embd_llm = llama_n_embd(model);
const int n_embd_llm = llama_model_n_embd(model);
if (n_embd_clip != n_embd_llm) {
LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
llama_free(ctx);
@@ -511,23 +513,15 @@ struct llama_server_context
}
}
vocab = llama_model_get_vocab(model);
n_ctx = llama_n_ctx(ctx);
add_bos_token = llama_add_bos_token(model);
add_bos_token = llama_vocab_get_add_bos(vocab);
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
return true;
}
void validate_model_chat_template(server_params & sparams) {
llama_chat_message chat[] = {{"user", "test"}};
std::vector<char> buf(1);
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
if (res < 0) {
LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
}
}
llama_client_slot* get_active_slot() {
for (llama_client_slot& slot : slots) {
// Check if the slot is currently processing
@@ -681,7 +675,6 @@ struct llama_server_context
slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
slot->sparams.seed = json_value(data, "seed", default_sparams.seed);
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
@@ -726,8 +719,8 @@ struct llama_server_context
slot->prompt = "";
}
if (json_value(data, "ignore_eos", false)) {
slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
if (json_value(data, "ignore_eos", false) && has_eos_token) {
slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
}
/*
slot->sparams.penalty_prompt_tokens.clear();
@@ -766,13 +759,13 @@ struct llama_server_context
}
}
*/
slot->sparams.logit_bias.clear();
const auto &logit_bias = data.find("logit_bias");
if (logit_bias != data.end() && logit_bias->is_array())
{
const int n_vocab = llama_n_vocab(model);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
for (const auto &el : *logit_bias)
{
if (el.is_array() && el.size() == 2)
@@ -801,7 +794,7 @@ struct llama_server_context
}
else if (el[0].is_string())
{
auto toks = common_tokenize(model, el[0].get<std::string>(), false);
auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
for (auto tok : toks)
{
slot->sparams.logit_bias.push_back({tok, bias});
@@ -1131,7 +1124,7 @@ struct llama_server_context
slot.has_next_token = false;
}
if (result.tok == llama_token_eos(model))
if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
{
slot.stopped_eos = true;
slot.has_next_token = false;
@@ -1213,13 +1206,12 @@ struct llama_server_context
{"mirostat", slot.sparams.mirostat},
{"mirostat_tau", slot.sparams.mirostat_tau},
{"mirostat_eta", slot.sparams.mirostat_eta},
{"penalize_nl", slot.sparams.penalize_nl},
{"stop", slot.params.antiprompt},
{"n_predict", slot.params.n_predict},
{"n_keep", params.n_keep},
{"ignore_eos", slot.sparams.ignore_eos},
{"stream", slot.params.stream},
// {"logit_bias", slot.sparams.logit_bias},
// {"logit_bias", slot.sparams.logit_bias},
{"n_probs", slot.sparams.n_probs},
{"min_keep", slot.sparams.min_keep},
{"grammar", slot.sparams.grammar},
@@ -1327,7 +1319,7 @@ struct llama_server_context
res.error = false;
res.stop = true;
const int n_embd = llama_n_embd(model);
const int n_embd = llama_model_n_embd(model);
if (!params.embedding)
{
LOG_WARNING("embedding disabled", {
@@ -1426,7 +1418,7 @@ struct llama_server_context
n_eval = n_batch;
}
const int n_embd = llama_n_embd(model);
const int n_embd = llama_model_n_embd(model);
float * embd = img.image_embedding + i * n_embd;
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
if (llama_decode(ctx, llava_batch.batch))
@@ -1707,11 +1699,11 @@ struct llama_server_context
suffix_tokens.erase(suffix_tokens.begin());
}
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_fim_pre(vocab));
prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_bos(vocab)); // always add BOS
prefix_tokens.insert(prefix_tokens.end(), llama_vocab_fim_suf(vocab));
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
prefix_tokens.push_back(llama_token_middle(model));
prefix_tokens.push_back(llama_vocab_fim_mid(vocab));
prompt_tokens = prefix_tokens;
}
else
@@ -2112,7 +2104,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
// slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
// slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
// slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
// slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
// slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
// slot->params.seed = json_value(data, "seed", default_params.seed);
// slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
@@ -2135,7 +2126,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
data["mirostat"] = predict->mirostat();
data["mirostat_tau"] = predict->mirostattau();
data["mirostat_eta"] = predict->mirostateta();
data["penalize_nl"] = predict->penalizenl();
data["n_keep"] = predict->nkeep();
data["seed"] = predict->seed();
data["grammar"] = predict->grammar();
@@ -2181,7 +2171,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
// llama.params.sparams.mirostat = predict->mirostat();
// llama.params.sparams.mirostat_tau = predict->mirostattau();
// llama.params.sparams.mirostat_eta = predict->mirostateta();
// llama.params.sparams.penalize_nl = predict->penalizenl();
// llama.params.n_keep = predict->nkeep();
// llama.params.seed = predict->seed();
// llama.params.sparams.grammar = predict->grammar();
@@ -2228,6 +2217,35 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
// }
// }
const std::vector<ggml_type> kv_cache_types = {
GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_BF16,
GGML_TYPE_Q8_0,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_IQ4_NL,
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
};
static ggml_type kv_cache_type_from_str(const std::string & s) {
for (const auto & type : kv_cache_types) {
if (ggml_type_name(type) == s) {
return type;
}
}
throw std::runtime_error("Unsupported cache type: " + s);
}
static std::string get_all_kv_cache_types() {
std::ostringstream msg;
for (const auto & type : kv_cache_types) {
msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
}
return msg.str();
}
static void params_parse(const backend::ModelOptions* request,
common_params & params) {
@@ -2242,10 +2260,10 @@ static void params_parse(const backend::ModelOptions* request,
// params.model_alias ??
params.model_alias = request->modelfile();
if (!request->cachetypekey().empty()) {
params.cache_type_k = request->cachetypekey();
params.cache_type_k = kv_cache_type_from_str(request->cachetypekey());
}
if (!request->cachetypevalue().empty()) {
params.cache_type_v = request->cachetypevalue();
params.cache_type_v = kv_cache_type_from_str(request->cachetypevalue());
}
params.n_ctx = request->contextsize();
//params.memory_f16 = request->f16memory();

View File

@@ -1,13 +1,13 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 342042ff..224db9b5 100644
index 3cd0d2fa..6c5e811a 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1;
+ patches_data[i] = i;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1;
+ patches_data[i] = i;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);

View File

@@ -2,20 +2,95 @@ INCLUDE_PATH := $(abspath ./)
LIBRARY_PATH := $(abspath ./)
AR?=ar
CMAKE_ARGS?=
BUILD_TYPE?=
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
# keep the C++ standard at C++17 (see CXXFLAGS below)
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DGGML_CUDA=ON
# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# to CMAKE_ARGS automatically
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DGGML_HIP=ON
# If it's OSX with metal, embed the metal library via -DGGML_METAL_EMBED_LIBRARY=ON
# If it's OSX without metal, disable Metal here
else ifeq ($(OS),Darwin)
ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DGGML_METAL=OFF
else
CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
TARGET+=--target ggml-metal
endif
endif
# ifeq ($(BUILD_TYPE),sycl_f16)
# CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
# endif
# ifeq ($(BUILD_TYPE),sycl_f32)
# CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
# endif
# warnings
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
# Find all .a archives in ARCHIVE_DIR
# (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
GGML_ARCHIVE_DIR := build/ggml/src/
ALL_ARCHIVES := $(shell find $(GGML_ARCHIVE_DIR) -type f -name '*.a')
# Name of the single merged library
COMBINED_LIB := libggmlall.a
# Rule to merge all the .a files into one
$(COMBINED_LIB): $(ALL_ARCHIVES)
@echo "Merging all .a into $(COMBINED_LIB)"
rm -f $@
mkdir -p merge-tmp
for a in $(ALL_ARCHIVES); do \
( cd merge-tmp && ar x ../$$a ); \
done
( cd merge-tmp && ar rcs ../$@ *.o )
# Ensure we have a proper index
ranlib $@
# Clean up
rm -rf merge-tmp
build/libstable-diffusion.a:
@echo "Building SD with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
cmake --build . --config Release"
else
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
cmake --build . --config Release
endif
$(MAKE) $(COMBINED_LIB)
gosd.o:
$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
libsd.a: gosd.o
cp $(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a ./libsd.a
cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
$(AR) rcs libsd.a gosd.o
clean:
rm -f gosd.o libsd.a
rm -rf gosd.o libsd.a build $(COMBINED_LIB)

View File

@@ -1,7 +1,7 @@
package main
// #cgo CXXFLAGS: -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/ggml/include
// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/build/ggml/src/ggml-cpu -L${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/build/ggml/src -lsd -lstdc++ -lm -lggml -lggml-base -lggml-cpu -lgomp
// #cgo LDFLAGS: -L${SRCDIR}/ -lsd -lstdc++ -lm -lggmlall -lgomp
// #include <gosd.h>
// #include <stdlib.h>
import "C"

View File

@@ -21,8 +21,8 @@ func (vad *VAD) Load(opts *pb.ModelOptions) error {
SampleRate: 16000,
//WindowSize: 1024,
Threshold: 0.5,
MinSilenceDurationMs: 0,
SpeechPadMs: 0,
MinSilenceDurationMs: 100,
SpeechPadMs: 30,
})
if err != nil {
return fmt.Errorf("create silero detector: %w", err)

View File

@@ -1,5 +1,6 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
torch
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools

View File

@@ -1,6 +1,6 @@
accelerate
auto-gptq==0.7.1
grpcio==1.68.1
grpcio==1.69.0
protobuf
certifi
transformers

View File

@@ -1,8 +1,9 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
torch
torchaudio
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
torchaudio==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools
transformers
accelerate

View File

@@ -1,4 +1,4 @@
bark==0.1.5
grpcio==1.68.1
grpcio==1.69.0
protobuf
certifi

View File

@@ -17,6 +17,9 @@
# LIMIT_TARGETS="cublas12"
# source $(dirname $0)/../common/libbackend.sh
#
PYTHON_VERSION="3.10"
function init() {
# Name of the backend (directory name)
BACKEND_NAME=${PWD##*/}
@@ -88,7 +91,7 @@ function getBuildProfile() {
# always result in an activated virtual environment
function ensureVenv() {
if [ ! -d "${EDIR}/venv" ]; then
uv venv ${EDIR}/venv
uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
echo "virtualenv created"
fi

View File

@@ -1,4 +1,5 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
torch
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]

View File

@@ -1,3 +1,3 @@
grpcio==1.68.1
grpcio==1.69.0
protobuf
grpcio-tools

View File

@@ -1,9 +1,10 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
torch
torchaudio
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
torchaudio==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools
transformers
accelerate
coqui-tts

View File

@@ -1,4 +1,4 @@
grpcio==1.68.1
grpcio==1.69.0
protobuf
certifi
packaging==24.1

View File

@@ -17,7 +17,7 @@ import backend_pb2_grpc
import grpc
from diffusers import StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
from diffusers.pipelines.stable_diffusion import safety_checker
@@ -275,6 +275,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if request.LowVRAM:
self.pipe.enable_model_cpu_offload()
elif request.PipelineType == "SanaPipeline":
self.pipe = SanaPipeline.from_pretrained(
request.Model,
variant="bf16",
torch_dtype=torch.bfloat16)
self.pipe.vae.to(torch.bfloat16)
self.pipe.text_encoder.to(torch.bfloat16)
if CLIPSKIP and request.CLIPSkip != 0:
self.clip_skip = request.CLIPSkip

View File

@@ -1,9 +1,10 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
torch
torchvision
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
torchvision==0.18.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools
diffusers
opencv-python
transformers

View File

@@ -1,5 +1,5 @@
setuptools
grpcio==1.68.1
grpcio==1.69.0
pillow
protobuf
certifi

View File

@@ -1,4 +1,4 @@
grpcio==1.68.1
grpcio==1.69.0
protobuf
certifi
wheel

View File

@@ -1,3 +1,3 @@
grpcio==1.68.1
grpcio==1.69.0
protobuf
certifi

View File

@@ -1,3 +1,7 @@
torch==2.4.1
git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git
git+https://github.com/myshell-ai/OpenVoice.git
whisper-timestamped
pydub==0.25.1
wavmark==0.0.3
eng_to_ipa==0.0.2

View File

@@ -1,4 +1,8 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118
git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git
git+https://github.com/myshell-ai/OpenVoice.git
whisper-timestamped
pydub==0.25.1
wavmark==0.0.3
eng_to_ipa==0.0.2

View File

@@ -1,3 +1,7 @@
torch==2.4.1
git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git
git+https://github.com/myshell-ai/OpenVoice.git
whisper-timestamped
pydub==0.25.1
wavmark==0.0.3
eng_to_ipa==0.0.2

View File

@@ -1,4 +1,8 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0
git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git
git+https://github.com/myshell-ai/OpenVoice.git
whisper-timestamped
pydub==0.25.1
wavmark==0.0.3
eng_to_ipa==0.0.2

View File

@@ -1,14 +1,15 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
torch
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
torchaudio==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
grpcio==1.68.1
grpcio==1.69.0
protobuf
librosa==0.9.1
faster-whisper==0.9.0
pydub==0.25.1
wavmark==0.0.3
numpy==1.22.0
eng_to_ipa==0.0.2
inflect==7.0.0
unidecode==1.3.7

View File

@@ -1,20 +1,17 @@
grpcio==1.68.1
grpcio==1.69.0
protobuf
librosa
faster-whisper
pydub==0.25.1
wavmark==0.0.3
numpy==1.22.0
eng_to_ipa==0.0.2
inflect
unidecode
whisper-timestamped
openai
python-dotenv
pypinyin
cn2an==0.5.22
numpy==1.22.0
networkx==2.8.8
jieba==0.42.1
gradio==3.48.0
gradio==5.9.1
langid==1.1.6
llvmlite==0.43.0
setuptools

View File

@@ -1,8 +1,8 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
torch
torchaudio
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
torchaudio==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
transformers
accelerate

View File

@@ -1,3 +1,4 @@
grpcio==1.68.1
grpcio==1.69.0
certifi
llvmlite==0.43.0
setuptools

View File

@@ -1,8 +1,9 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
intel-extension-for-pytorch==2.3.110+xpu
transformers
accelerate
torch
torch==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
rerankers[transformers]
optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools

View File

@@ -1,3 +1,3 @@
grpcio==1.68.1
grpcio==1.69.0
protobuf
certifi

View File

@@ -1,8 +1,9 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
torch
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
setuptools
accelerate
sentence-transformers==3.3.1
transformers

View File

@@ -1,4 +1,4 @@
grpcio==1.68.1
grpcio==1.69.0
protobuf
certifi
datasets

View File

@@ -1,7 +1,8 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
intel-extension-for-pytorch==2.3.110+xpu
transformers
oneccl_bind_pt==2.3.100+xpu
accelerate
torch
torch==2.3.1+cxx11.abi
optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools

View File

@@ -1,4 +1,4 @@
grpcio==1.68.1
grpcio==1.69.0
protobuf
scipy==1.14.0
certifi

View File

@@ -1,6 +1,7 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
torch
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
intel-extension-for-transformers
bitsandbytes

View File

@@ -1,4 +1,4 @@
grpcio==1.68.1
grpcio==1.69.0
protobuf
certifi
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
setuptools

View File

@@ -1,7 +1,7 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
intel-extension-for-pytorch==2.3.110+xpu
accelerate
torch
torchaudio
torch==2.3.1+cxx11.abi
torchaudio==2.3.1+cxx11.abi
optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
oneccl_bind_pt==2.3.100+xpu

View File

@@ -1,3 +1,4 @@
grpcio==1.68.1
grpcio==1.69.0
protobuf
certifi
certifi
setuptools

View File

@@ -1,8 +1,9 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
intel-extension-for-pytorch==2.3.110+xpu
accelerate
torch
torch==2.3.1+cxx11.abi
transformers
optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
bitsandbytes
setuptools
bitsandbytes
oneccl_bind_pt==2.3.100+xpu

View File

@@ -1,4 +1,4 @@
grpcio==1.68.1
grpcio==1.69.0
protobuf
certifi
setuptools

View File

@@ -22,8 +22,9 @@ import (
)
type LLMResponse struct {
Response string // should this be []byte?
Usage TokenUsage
Response string // should this be []byte?
Usage TokenUsage
AudioOutput string
}
type TokenUsage struct {
@@ -117,8 +118,12 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
ss := ""
var partialRune []byte
err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
partialRune = append(partialRune, chars...)
err := inferenceModel.PredictStream(ctx, opts, func(reply *proto.Reply) {
msg := reply.Message
partialRune = append(partialRune, msg...)
tokenUsage.Prompt = int(reply.PromptTokens)
tokenUsage.Completion = int(reply.Tokens)
for len(partialRune) > 0 {
r, size := utf8.DecodeRune(partialRune)
@@ -132,6 +137,10 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
partialRune = partialRune[size:]
}
if len(msg) == 0 {
tokenCallback("", tokenUsage)
}
})
return LLMResponse{
Response: ss,
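
The callback change above moves streaming from raw byte chunks (`func(chars []byte)`) to full `proto.Reply` messages, which also carry prompt and completion token counts; the `partialRune` buffer guards against multi-byte UTF-8 runes being split across chunks. A minimal, self-contained sketch of that accumulation technique (the `reply` struct is a hypothetical stand-in for the generated `proto.Reply`):

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

// reply mirrors only the proto.Reply fields used by the streaming callback
// (hypothetical local stand-in; the real type is generated from the proto file).
type reply struct {
	Message      []byte
	Tokens       int32
	PromptTokens int32
}

func main() {
	// Simulated stream: the two-byte rune "é" (0xC3 0xA9) split across chunks.
	chunks := []reply{
		{Message: []byte{0xC3}, Tokens: 1, PromptTokens: 5},
		{Message: []byte{0xA9, '!'}, Tokens: 2, PromptTokens: 5},
	}

	var partialRune []byte
	ss := ""
	for _, r := range chunks {
		partialRune = append(partialRune, r.Message...)
		// Decode only complete runes; keep any trailing bytes for the next chunk.
		for len(partialRune) > 0 {
			ru, size := utf8.DecodeRune(partialRune)
			if ru == utf8.RuneError && size == 1 {
				break // incomplete rune so far, wait for more bytes
			}
			ss += string(ru)
			partialRune = partialRune[size:]
		}
		// The real callback also records r.PromptTokens / r.Tokens into tokenUsage here.
	}
	fmt.Println(ss) // é!
}
```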

View File

@@ -38,6 +38,7 @@ type BackendConfig struct {
TemplateConfig TemplateConfig `yaml:"template"`
KnownUsecaseStrings []string `yaml:"known_usecases"`
KnownUsecases *BackendConfigUsecases `yaml:"-"`
Pipeline Pipeline `yaml:"pipeline"`
PromptStrings, InputStrings []string `yaml:"-"`
InputToken [][]int `yaml:"-"`
@@ -76,6 +77,18 @@ type BackendConfig struct {
Options []string `yaml:"options"`
}
// Pipeline defines other models to use for audio-to-audio
type Pipeline struct {
TTS string `yaml:"tts"`
LLM string `yaml:"llm"`
Transcription string `yaml:"transcription"`
VAD string `yaml:"vad"`
}
func (p Pipeline) IsNotConfigured() bool {
return p.LLM == "" || p.TTS == "" || p.Transcription == ""
}
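
For illustration, a hedged sketch of how this pipeline section deserializes from a model's YAML config (in the real config it sits under a `pipeline:` key of the model definition, and the model names below are hypothetical; the struct is re-declared locally so the snippet runs standalone):

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// Pipeline mirrors the struct introduced above.
type Pipeline struct {
	TTS           string `yaml:"tts"`
	LLM           string `yaml:"llm"`
	Transcription string `yaml:"transcription"`
	VAD           string `yaml:"vad"`
}

// IsNotConfigured mirrors the helper above: a wrapped pipeline needs at least
// TTS, LLM and transcription models (the VAD model is resolved separately).
func (p Pipeline) IsNotConfigured() bool {
	return p.LLM == "" || p.TTS == "" || p.Transcription == ""
}

func main() {
	// Hypothetical model names; any installed tts/llm/transcription/vad
	// models could be referenced here.
	raw := []byte(`
tts: my-tts-voice
llm: my-chat-model
transcription: my-whisper-model
vad: silero-vad
`)
	var p Pipeline
	if err := yaml.Unmarshal(raw, &p); err != nil {
		panic(err)
	}
	fmt.Printf("wrapped pipeline configured: %v\n", !p.IsNotConfigured())
}
```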
type File struct {
Filename string `yaml:"filename" json:"filename"`
SHA256 string `yaml:"sha256" json:"sha256"`

View File

@@ -7,6 +7,7 @@ import (
"net/http"
"github.com/dave-gray101/v2keyauth"
"github.com/gofiber/websocket/v2"
"github.com/mudler/LocalAI/pkg/utils"
"github.com/mudler/LocalAI/core/http/endpoints/localai"
@@ -87,6 +88,17 @@ func API(application *application.Application) (*fiber.App, error) {
router := fiber.New(fiberCfg)
router.Use(middleware.StripPathPrefix())
router.Use("/v1/realtime", func(c *fiber.Ctx) error {
if websocket.IsWebSocketUpgrade(c) {
// IsWebSocketUpgrade reports whether the client requested an upgrade to the WebSocket protocol
return c.Next()
}
return fiber.ErrUpgradeRequired
})
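
With this upgrade check in place, a client can open the realtime endpoint over WebSocket. A rough client-side sketch, assuming the server listens on localhost:8080; the `name` query parameter is taken from the commented-out readWSRequest helper further down and may change as the endpoint stabilizes, and gorilla/websocket is just one client library choice (the server side uses gofiber/websocket):

```go
package main

import (
	"log"
	"net/http"

	"github.com/gorilla/websocket"
)

func main() {
	// Hypothetical model name and API key placeholder.
	url := "ws://localhost:8080/v1/realtime?name=gpt-4o"
	header := http.Header{"Authorization": {"Bearer " + "YOUR_API_KEY"}}

	conn, _, err := websocket.DefaultDialer.Dial(url, header)
	if err != nil {
		log.Fatalf("dial: %v", err)
	}
	defer conn.Close()

	// Read server events until the connection closes.
	for {
		_, msg, err := conn.ReadMessage()
		if err != nil {
			log.Printf("read: %v", err)
			return
		}
		log.Printf("event: %s", msg)
	}
}
```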
router.Hooks().OnListen(func(listenData fiber.ListenData) error {
scheme := "http"
if listenData.TLS {

View File

@@ -237,6 +237,31 @@ func postInvalidRequest(url string) (error, int) {
return nil, resp.StatusCode
}
func getRequest(url string, header http.Header) (error, int, []byte) {
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return err, -1, nil
}
req.Header = header
client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
return err, -1, nil
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return err, -1, nil
}
return nil, resp.StatusCode, body
}
const bertEmbeddingsURL = `https://gist.githubusercontent.com/mudler/0a080b166b87640e8644b09c2aee6e3b/raw/f0e8c26bb72edc16d9fbafbfd6638072126ff225/bert-embeddings-gallery.yaml`
//go:embed backend-assets/*
@@ -345,6 +370,33 @@ var _ = Describe("API test", func() {
})
})
Context("URL routing Tests", func() {
It("Should support reverse-proxy when unauthenticated", func() {
err, sc, body := getRequest("http://127.0.0.1:9090/myprefix/", http.Header{
"X-Forwarded-Proto": {"https"},
"X-Forwarded-Host": {"example.org"},
"X-Forwarded-Prefix": {"/myprefix/"},
})
Expect(err).To(BeNil(), "error")
Expect(sc).To(Equal(401), "status code")
Expect(string(body)).To(ContainSubstring(`<base href="https://example.org/myprefix/" />`), "body")
})
It("Should support reverse-proxy when authenticated", func() {
err, sc, body := getRequest("http://127.0.0.1:9090/myprefix/", http.Header{
"Authorization": {bearerKey},
"X-Forwarded-Proto": {"https"},
"X-Forwarded-Host": {"example.org"},
"X-Forwarded-Prefix": {"/myprefix/"},
})
Expect(err).To(BeNil(), "error")
Expect(sc).To(Equal(200), "status code")
Expect(string(body)).To(ContainSubstring(`<base href="https://example.org/myprefix/" />`), "body")
})
})
Context("Applying models", func() {
It("applies models from a gallery", func() {
@@ -704,7 +756,7 @@ var _ = Describe("API test", func() {
Expect(err).ToNot(HaveOccurred(), fmt.Sprint(resp))
Expect(resp.StatusCode).To(Equal(200), fmt.Sprint(string(dat)))
Expect(resp.Header.Get("Content-Type")).To(Equal("audio/x-wav"))
Expect(resp.Header.Get("Content-Type")).To(Or(Equal("audio/x-wav"), Equal("audio/vnd.wave")))
})
It("installs and is capable to generate images", Label("stablediffusion"), func() {
if runtime.GOOS != "linux" {

View File

@@ -19,9 +19,11 @@ func ModelFromContext(ctx *fiber.Ctx, cl *config.BackendConfigLoader, loader *mo
if ctx.Params("model") != "" {
modelInput = ctx.Params("model")
}
if ctx.Query("model") != "" {
modelInput = ctx.Query("model")
}
// Set model from bearer token, if available
bearer := strings.TrimLeft(ctx.Get("authorization"), "Bear ") // "Bear " is a TrimLeft cutset (the characters B, e, a, r, space), which also strips a leading "Bearer "
bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)

View File

@@ -16,7 +16,7 @@ func installButton(galleryName string) elem.Node {
"class": "float-right inline-block rounded bg-primary px-6 pb-2.5 mb-3 pt-2.5 text-xs font-medium uppercase leading-normal text-white shadow-primary-3 transition duration-150 ease-in-out hover:bg-primary-accent-300 hover:shadow-primary-2 focus:bg-primary-accent-300 focus:shadow-primary-2 focus:outline-none focus:ring-0 active:bg-primary-600 active:shadow-primary-2 dark:shadow-black/30 dark:hover:shadow-dark-strong dark:focus:shadow-dark-strong dark:active:shadow-dark-strong",
"hx-swap": "outerHTML",
// post the Model ID as param
"hx-post": "/browse/install/model/" + galleryName,
"hx-post": "browse/install/model/" + galleryName,
},
elem.I(
attrs.Props{
@@ -36,7 +36,7 @@ func reInstallButton(galleryName string) elem.Node {
"hx-target": "#action-div-" + dropBadChars(galleryName),
"hx-swap": "outerHTML",
// post the Model ID as param
"hx-post": "/browse/install/model/" + galleryName,
"hx-post": "browse/install/model/" + galleryName,
},
elem.I(
attrs.Props{
@@ -80,7 +80,7 @@ func deleteButton(galleryID string) elem.Node {
"hx-target": "#action-div-" + dropBadChars(galleryID),
"hx-swap": "outerHTML",
// post the Model ID as param
"hx-post": "/browse/delete/model/" + galleryID,
"hx-post": "browse/delete/model/" + galleryID,
},
elem.I(
attrs.Props{

View File

@@ -47,7 +47,7 @@ func searchableElement(text, icon string) elem.Node {
// "value": text,
//"class": "inline-block bg-gray-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2",
"href": "#!",
"hx-post": "/browse/search/models",
"hx-post": "browse/search/models",
"hx-target": "#search-results",
// TODO: this doesn't work
// "hx-vals": `{ \"search\": \"` + text + `\" }`,

View File

@@ -64,7 +64,7 @@ func StartProgressBar(uid, progress, text string) string {
return elem.Div(
attrs.Props{
"hx-trigger": "done",
"hx-get": "/browse/job/" + uid,
"hx-get": "browse/job/" + uid,
"hx-swap": "outerHTML",
"hx-target": "this",
},
@@ -77,7 +77,7 @@ func StartProgressBar(uid, progress, text string) string {
},
elem.Text(bluemonday.StrictPolicy().Sanitize(text)), //Perhaps overly defensive
elem.Div(attrs.Props{
"hx-get": "/browse/job/progress/" + uid,
"hx-get": "browse/job/progress/" + uid,
"hx-trigger": "every 600ms",
"hx-target": "this",
"hx-swap": "innerHTML",

View File

@@ -6,6 +6,7 @@ import (
"github.com/gofiber/fiber/v2"
"github.com/mudler/LocalAI/core/explorer"
"github.com/mudler/LocalAI/core/http/utils"
"github.com/mudler/LocalAI/internal"
)
@@ -14,6 +15,7 @@ func Dashboard() func(*fiber.Ctx) error {
summary := fiber.Map{
"Title": "LocalAI API - " + internal.PrintableVersion(),
"Version": internal.PrintableVersion(),
"BaseURL": utils.BaseURL(c),
}
if string(c.Context().Request.Header.ContentType()) == "application/json" || len(c.Accepts("html")) == 0 {

View File

@@ -9,6 +9,7 @@ import (
"github.com/google/uuid"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/gallery"
"github.com/mudler/LocalAI/core/http/utils"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/core/services"
"github.com/rs/zerolog/log"
@@ -82,7 +83,8 @@ func (mgs *ModelGalleryEndpointService) ApplyModelGalleryEndpoint() func(c *fibe
Galleries: mgs.galleries,
ConfigURL: input.ConfigURL,
}
return c.JSON(schema.GalleryResponse{ID: uuid.String(), StatusURL: c.BaseURL() + "/models/jobs/" + uuid.String()})
return c.JSON(schema.GalleryResponse{ID: uuid.String(), StatusURL: fmt.Sprintf("%smodels/jobs/%s", utils.BaseURL(c), uuid.String())})
}
}
@@ -105,7 +107,7 @@ func (mgs *ModelGalleryEndpointService) DeleteModelGalleryEndpoint() func(c *fib
return err
}
return c.JSON(schema.GalleryResponse{ID: uuid.String(), StatusURL: c.BaseURL() + "/models/jobs/" + uuid.String()})
return c.JSON(schema.GalleryResponse{ID: uuid.String(), StatusURL: fmt.Sprintf("%smodels/jobs/%s", utils.BaseURL(c), uuid.String())})
}
}

View File

@@ -4,6 +4,7 @@ import (
"github.com/gofiber/fiber/v2"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/gallery"
"github.com/mudler/LocalAI/core/http/utils"
"github.com/mudler/LocalAI/core/p2p"
"github.com/mudler/LocalAI/core/services"
"github.com/mudler/LocalAI/internal"
@@ -32,6 +33,7 @@ func WelcomeEndpoint(appConfig *config.ApplicationConfig,
summary := fiber.Map{
"Title": "LocalAI API - " + internal.PrintableVersion(),
"Version": internal.PrintableVersion(),
"BaseURL": utils.BaseURL(c),
"Models": modelsWithoutConfig,
"ModelsConfig": backendConfigs,
"GalleryConfig": galleryConfigs,

View File

File diff suppressed because it is too large

View File

@@ -0,0 +1,186 @@
package openai
import (
"context"
"fmt"
"github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
grpcClient "github.com/mudler/LocalAI/pkg/grpc"
"github.com/mudler/LocalAI/pkg/grpc/proto"
model "github.com/mudler/LocalAI/pkg/model"
"github.com/rs/zerolog/log"
"google.golang.org/grpc"
)
var (
_ Model = new(wrappedModel)
_ Model = new(anyToAnyModel)
)
// wrappedModel represents a model which does not support Any-to-Any operations.
// This means that we fake an Any-to-Any model by overriding the gRPC client methods
// that are meant for Any-to-Any models, and instead call a pipeline (e.g. STT->LLM->TTS).
type wrappedModel struct {
TTSConfig *config.BackendConfig
TranscriptionConfig *config.BackendConfig
LLMConfig *config.BackendConfig
TTSClient grpcClient.Backend
TranscriptionClient grpcClient.Backend
LLMClient grpcClient.Backend
VADConfig *config.BackendConfig
VADClient grpcClient.Backend
}
// anyToAnyModel represents a model which supports Any-to-Any operations.
// We wrap this as well because we want to load two models: one for VAD and one for the actual model.
// In the future there could be models that accept only continuous audio input, so this design will be useful for that.
type anyToAnyModel struct {
LLMConfig *config.BackendConfig
LLMClient grpcClient.Backend
VADConfig *config.BackendConfig
VADClient grpcClient.Backend
}
func (m *wrappedModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) {
return m.VADClient.VAD(ctx, in)
}
func (m *anyToAnyModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) {
return m.VADClient.VAD(ctx, in)
}
func (m *wrappedModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) {
// TODO: Convert with pipeline (audio to text, text to llm, result to tts, and return it)
// sound.BufferAsWAV(audioData, "audio.wav")
return m.LLMClient.Predict(ctx, in)
}
func (m *wrappedModel) PredictStream(ctx context.Context, in *proto.PredictOptions, f func(reply *proto.Reply), opts ...grpc.CallOption) error {
// TODO: Convert with pipeline (audio to text, text to llm, result to tts, and return it)
return m.LLMClient.PredictStream(ctx, in, f)
}
func (m *anyToAnyModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) {
return m.LLMClient.Predict(ctx, in)
}
func (m *anyToAnyModel) PredictStream(ctx context.Context, in *proto.PredictOptions, f func(reply *proto.Reply), opts ...grpc.CallOption) error {
return m.LLMClient.PredictStream(ctx, in, f)
}
// newModel returns and loads either a wrapped model or a model that supports audio-to-audio
func newModel(cfg *config.BackendConfig, cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, modelName string) (Model, error) {
// Prepare VAD model
cfgVAD, err := cl.LoadBackendConfigFileByName(cfg.Pipeline.VAD, ml.ModelPath)
if err != nil {
return nil, fmt.Errorf("failed to load backend config: %w", err)
}
if !cfgVAD.Validate() {
return nil, fmt.Errorf("failed to validate VAD config")
}
opts := backend.ModelOptions(*cfgVAD, appConfig)
VADClient, err := ml.Load(opts...)
if err != nil {
return nil, fmt.Errorf("failed to load tts model: %w", err)
}
// If we don't have wrapped-pipeline definitions, just return a standard any-to-any model
if cfg.Pipeline.IsNotConfigured() {
// Load the model's own config and pair it with the VAD model loaded above
cfgAnyToAny, err := cl.LoadBackendConfigFileByName(cfg.Model, ml.ModelPath)
if err != nil {
return nil, fmt.Errorf("failed to load backend config: %w", err)
}
if !cfgAnyToAny.Validate() {
return nil, fmt.Errorf("failed to validate any-to-any config")
}
opts := backend.ModelOptions(*cfgAnyToAny, appConfig)
anyToAnyClient, err := ml.Load(opts...)
if err != nil {
return nil, fmt.Errorf("failed to load tts model: %w", err)
}
return &anyToAnyModel{
LLMConfig: cfgAnyToAny,
LLMClient: anyToAnyClient,
VADConfig: cfgVAD,
VADClient: VADClient,
}, nil
}
log.Debug().Msg("Loading a wrapped model")
// Otherwise we want to return a wrapped model, which is a "virtual" model that re-uses other models to perform operations
cfgLLM, err := cl.LoadBackendConfigFileByName(cfg.Pipeline.LLM, ml.ModelPath)
if err != nil {
return nil, fmt.Errorf("failed to load backend config: %w", err)
}
if !cfgLLM.Validate() {
return nil, fmt.Errorf("failed to validate LLM config")
}
cfgTTS, err := cl.LoadBackendConfigFileByName(cfg.Pipeline.TTS, ml.ModelPath)
if err != nil {
return nil, fmt.Errorf("failed to load backend config: %w", err)
}
if !cfgTTS.Validate() {
return nil, fmt.Errorf("failed to validate TTS config")
}
cfgSTT, err := cl.LoadBackendConfigFileByName(cfg.Pipeline.Transcription, ml.ModelPath)
if err != nil {
return nil, fmt.Errorf("failed to load backend config: %w", err)
}
if !cfgSTT.Validate() {
return nil, fmt.Errorf("failed to validate transcription config")
}
opts = backend.ModelOptions(*cfgTTS, appConfig)
ttsClient, err := ml.Load(opts...)
if err != nil {
return nil, fmt.Errorf("failed to load tts model: %w", err)
}
opts = backend.ModelOptions(*cfgSTT, appConfig)
transcriptionClient, err := ml.Load(opts...)
if err != nil {
return nil, fmt.Errorf("failed to load SST model: %w", err)
}
opts = backend.ModelOptions(*cfgLLM, appConfig)
llmClient, err := ml.Load(opts...)
if err != nil {
return nil, fmt.Errorf("failed to load LLM model: %w", err)
}
return &wrappedModel{
TTSConfig: cfgTTS,
TranscriptionConfig: cfgSTT,
LLMConfig: cfgLLM,
TTSClient: ttsClient,
TranscriptionClient: transcriptionClient,
LLMClient: llmClient,
VADConfig: cfgVAD,
VADClient: VADClient,
}, nil
}
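
A rough sketch of a call site, assuming it lives inside the same openai package (newModel is unexported); the actual realtime handler sits in the diff suppressed above, so the helper name, the model name, and the VADRequest field below are illustrative assumptions:

```go
// setupRealtime is a hypothetical helper: it assembles the realtime model
// and runs voice activity detection on one audio chunk.
func setupRealtime(ctx context.Context, cfg *config.BackendConfig, cl *config.BackendConfigLoader,
	ml *model.ModelLoader, appConfig *config.ApplicationConfig, audioChunk []float32) error {
	// Returns a wrappedModel (VAD + STT->LLM->TTS pipeline) or an anyToAnyModel,
	// depending on whether cfg.Pipeline is configured.
	m, err := newModel(cfg, cl, ml, appConfig, "gpt-4o") // hypothetical model name
	if err != nil {
		return fmt.Errorf("failed to set up realtime model: %w", err)
	}

	// VAD works the same on both variants; the Audio field is assumed
	// from the VAD proto definition.
	res, err := m.VAD(ctx, &proto.VADRequest{Audio: audioChunk})
	if err != nil {
		return err
	}
	_ = res // detected segments would drive start/end-of-speech handling
	return nil
}
```

Either variant satisfies the same Model interface, so the realtime handler does not need to know whether it is talking to a single audio-native model or a pipeline of three.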

View File

@@ -48,6 +48,25 @@ func readRequest(c *fiber.Ctx, cl *config.BackendConfigLoader, ml *model.ModelLo
return modelFile, input, err
}
// func readWSRequest(c *websocket.Conn, cl *config.BackendConfigLoader, ml *model.ModelLoader, o *config.ApplicationConfig, firstModel bool) (string, *schema.OpenAIRequest, error) {
// input := new(schema.OpenAIRequest)
// input.Model = c.Query("name")
// received, _ := json.Marshal(input)
// ctx, cancel := context.WithCancel(o.Context)
// input.Context = ctx
// input.Cancel = cancel
// log.Debug().Msgf("Request received: %s", string(received))
// modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, firstModel)
// return modelFile, input, err
// }
func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIRequest) {
if input.Echo {
config.Echo = input.Echo

View File

@@ -7,6 +7,7 @@ import (
"github.com/gofiber/fiber/v2/middleware/favicon"
"github.com/gofiber/fiber/v2/middleware/filesystem"
"github.com/mudler/LocalAI/core/explorer"
"github.com/mudler/LocalAI/core/http/middleware"
"github.com/mudler/LocalAI/core/http/routes"
)
@@ -22,6 +23,7 @@ func Explorer(db *explorer.Database) *fiber.App {
app := fiber.New(fiberCfg)
app.Use(middleware.StripPathPrefix())
routes.RegisterExplorerRoutes(app, db)
httpFS := http.FS(embedDirStatic)

View File

@@ -8,6 +8,7 @@ import (
"github.com/gofiber/fiber/v2"
"github.com/gofiber/fiber/v2/middleware/keyauth"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/http/utils"
)
// This file contains the configuration generators and handler functions that are used along with the fiber/keyauth middleware
@@ -39,7 +40,9 @@ func getApiKeyErrorHandler(applicationConfig *config.ApplicationConfig) fiber.Er
if applicationConfig.OpaqueErrors {
return ctx.SendStatus(401)
}
return ctx.Status(401).Render("views/login", nil)
return ctx.Status(401).Render("views/login", fiber.Map{
"BaseURL": utils.BaseURL(ctx),
})
}
if applicationConfig.OpaqueErrors {
return ctx.SendStatus(500)

View File

@@ -0,0 +1,36 @@
package middleware
import (
"strings"
"github.com/gofiber/fiber/v2"
)
// StripPathPrefix returns a middleware that strips a path prefix from the request path.
// The path prefix is obtained from the X-Forwarded-Prefix HTTP request header.
func StripPathPrefix() fiber.Handler {
return func(c *fiber.Ctx) error {
for _, prefix := range c.GetReqHeaders()["X-Forwarded-Prefix"] {
if prefix != "" {
path := c.Path()
pos := len(prefix)
if prefix[pos-1] == '/' {
pos--
} else {
prefix += "/"
}
if strings.HasPrefix(path, prefix) {
c.Path(path[pos:])
break
} else if prefix[:pos] == path {
c.Redirect(prefix)
return nil
}
}
}
return c.Next()
}
}

View File

@@ -0,0 +1,121 @@
package middleware
import (
"net/http/httptest"
"testing"
"github.com/gofiber/fiber/v2"
"github.com/stretchr/testify/require"
)
func TestStripPathPrefix(t *testing.T) {
var actualPath string
app := fiber.New()
app.Use(StripPathPrefix())
app.Get("/hello/world", func(c *fiber.Ctx) error {
actualPath = c.Path()
return nil
})
app.Get("/", func(c *fiber.Ctx) error {
actualPath = c.Path()
return nil
})
for _, tc := range []struct {
name string
path string
prefixHeader []string
expectStatus int
expectPath string
}{
{
name: "without prefix and header",
path: "/hello/world",
expectStatus: 200,
expectPath: "/hello/world",
},
{
name: "without prefix and headers on root path",
path: "/",
expectStatus: 200,
expectPath: "/",
},
{
name: "without prefix but header",
path: "/hello/world",
prefixHeader: []string{"/otherprefix/"},
expectStatus: 200,
expectPath: "/hello/world",
},
{
name: "with prefix but non-matching header",
path: "/prefix/hello/world",
prefixHeader: []string{"/otherprefix/"},
expectStatus: 404,
},
{
name: "with prefix and matching header",
path: "/myprefix/hello/world",
prefixHeader: []string{"/myprefix/"},
expectStatus: 200,
expectPath: "/hello/world",
},
{
name: "with prefix and 1st header matching",
path: "/myprefix/hello/world",
prefixHeader: []string{"/myprefix/", "/otherprefix/"},
expectStatus: 200,
expectPath: "/hello/world",
},
{
name: "with prefix and 2nd header matching",
path: "/myprefix/hello/world",
prefixHeader: []string{"/otherprefix/", "/myprefix/"},
expectStatus: 200,
expectPath: "/hello/world",
},
{
name: "with prefix and header not ending with slash",
path: "/myprefix/hello/world",
prefixHeader: []string{"/myprefix"},
expectStatus: 200,
expectPath: "/hello/world",
},
{
name: "with prefix and non-matching header not ending with slash",
path: "/myprefix-suffix/hello/world",
prefixHeader: []string{"/myprefix"},
expectStatus: 404,
},
{
name: "redirect when prefix does not end with a slash",
path: "/myprefix",
prefixHeader: []string{"/myprefix"},
expectStatus: 302,
expectPath: "/myprefix/",
},
} {
t.Run(tc.name, func(t *testing.T) {
actualPath = ""
req := httptest.NewRequest("GET", tc.path, nil)
if tc.prefixHeader != nil {
req.Header["X-Forwarded-Prefix"] = tc.prefixHeader
}
resp, err := app.Test(req, -1)
require.NoError(t, err)
require.Equal(t, tc.expectStatus, resp.StatusCode, "response status code")
if tc.expectStatus == 200 {
require.Equal(t, tc.expectPath, actualPath, "rewritten path")
} else if tc.expectStatus == 302 {
require.Equal(t, tc.expectPath, resp.Header.Get("Location"), "redirect location")
}
})
}
}

View File

@@ -10,6 +10,7 @@ import (
"github.com/gofiber/fiber/v2"
fiberhtml "github.com/gofiber/template/html/v2"
"github.com/microcosm-cc/bluemonday"
"github.com/mudler/LocalAI/core/http/utils"
"github.com/mudler/LocalAI/core/schema"
"github.com/russross/blackfriday"
)
@@ -26,7 +27,9 @@ func notFoundHandler(c *fiber.Ctx) error {
})
} else {
// The client expects an HTML response
return c.Status(fiber.StatusNotFound).Render("views/404", fiber.Map{})
return c.Status(fiber.StatusNotFound).Render("views/404", fiber.Map{
"BaseURL": utils.BaseURL(c),
})
}
}

View File

@@ -11,6 +11,9 @@ func RegisterOpenAIRoutes(app *fiber.App,
application *application.Application) {
// openAI compatible API endpoint
// realtime
app.Get("/v1/realtime", openai.Realtime(application))
// chat
app.Post("/v1/chat/completions",
openai.ChatEndpoint(

View File

@@ -6,20 +6,21 @@ import (
"sort"
"strings"
"github.com/microcosm-cc/bluemonday"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/gallery"
"github.com/mudler/LocalAI/core/http/elements"
"github.com/mudler/LocalAI/core/http/endpoints/localai"
"github.com/mudler/LocalAI/core/http/utils"
"github.com/mudler/LocalAI/core/p2p"
"github.com/mudler/LocalAI/core/services"
"github.com/mudler/LocalAI/internal"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/xsync"
"github.com/rs/zerolog/log"
"github.com/gofiber/fiber/v2"
"github.com/google/uuid"
"github.com/microcosm-cc/bluemonday"
"github.com/rs/zerolog/log"
)
type modelOpCache struct {
@@ -91,6 +92,7 @@ func RegisterUIRoutes(app *fiber.App,
app.Get("/p2p", func(c *fiber.Ctx) error {
summary := fiber.Map{
"Title": "LocalAI - P2P dashboard",
"BaseURL": utils.BaseURL(c),
"Version": internal.PrintableVersion(),
//"Nodes": p2p.GetAvailableNodes(""),
//"FederatedNodes": p2p.GetAvailableNodes(p2p.FederatedID),
@@ -149,6 +151,7 @@ func RegisterUIRoutes(app *fiber.App,
summary := fiber.Map{
"Title": "LocalAI - Models",
"BaseURL": utils.BaseURL(c),
"Version": internal.PrintableVersion(),
"Models": template.HTML(elements.ListModels(models, processingModels, galleryService)),
"Repositories": appConfig.Galleries,
@@ -308,6 +311,7 @@ func RegisterUIRoutes(app *fiber.App,
summary := fiber.Map{
"Title": "LocalAI - Chat with " + c.Params("model"),
"BaseURL": utils.BaseURL(c),
"ModelsConfig": backendConfigs,
"Model": c.Params("model"),
"Version": internal.PrintableVersion(),
@@ -323,11 +327,12 @@ func RegisterUIRoutes(app *fiber.App,
if len(backendConfigs) == 0 {
// If no model is available redirect to the index which suggests how to install models
return c.Redirect("/")
return c.Redirect(utils.BaseURL(c))
}
summary := fiber.Map{
"Title": "LocalAI - Talk",
"BaseURL": utils.BaseURL(c),
"ModelsConfig": backendConfigs,
"Model": backendConfigs[0],
"IsP2PEnabled": p2p.IsP2PEnabled(),
@@ -344,11 +349,12 @@ func RegisterUIRoutes(app *fiber.App,
if len(backendConfigs) == 0 {
// If no model is available redirect to the index which suggests how to install models
return c.Redirect("/")
return c.Redirect(utils.BaseURL(c))
}
summary := fiber.Map{
"Title": "LocalAI - Chat with " + backendConfigs[0],
"BaseURL": utils.BaseURL(c),
"ModelsConfig": backendConfigs,
"Model": backendConfigs[0],
"Version": internal.PrintableVersion(),
@@ -364,6 +370,7 @@ func RegisterUIRoutes(app *fiber.App,
summary := fiber.Map{
"Title": "LocalAI - Generate images with " + c.Params("model"),
"BaseURL": utils.BaseURL(c),
"ModelsConfig": backendConfigs,
"Model": c.Params("model"),
"Version": internal.PrintableVersion(),
@@ -380,11 +387,12 @@ func RegisterUIRoutes(app *fiber.App,
if len(backendConfigs) == 0 {
// If no model is available redirect to the index which suggests how to install models
return c.Redirect("/")
return c.Redirect(utils.BaseURL(c))
}
summary := fiber.Map{
"Title": "LocalAI - Generate images with " + backendConfigs[0].Name,
"BaseURL": utils.BaseURL(c),
"ModelsConfig": backendConfigs,
"Model": backendConfigs[0].Name,
"Version": internal.PrintableVersion(),
@@ -400,6 +408,7 @@ func RegisterUIRoutes(app *fiber.App,
summary := fiber.Map{
"Title": "LocalAI - Generate images with " + c.Params("model"),
"BaseURL": utils.BaseURL(c),
"ModelsConfig": backendConfigs,
"Model": c.Params("model"),
"Version": internal.PrintableVersion(),
@@ -416,11 +425,12 @@ func RegisterUIRoutes(app *fiber.App,
if len(backendConfigs) == 0 {
// If no model is available redirect to the index which suggests how to install models
return c.Redirect("/")
return c.Redirect(utils.BaseURL(c))
}
summary := fiber.Map{
"Title": "LocalAI - Generate audio with " + backendConfigs[0].Name,
"BaseURL": utils.BaseURL(c),
"ModelsConfig": backendConfigs,
"Model": backendConfigs[0].Name,
"IsP2PEnabled": p2p.IsP2PEnabled(),

View File

@@ -7,33 +7,33 @@ https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&family=Roboto:wg
font-style: normal;
font-weight: 400;
font-display: swap;
src: url(/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuLyfMZg.ttf) format('truetype');
src: url(./UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuLyfMZg.ttf) format('truetype');
}
@font-face {
font-family: 'Inter';
font-style: normal;
font-weight: 600;
font-display: swap;
src: url(/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuGKYMZg.ttf) format('truetype');
src: url(./UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuGKYMZg.ttf) format('truetype');
}
@font-face {
font-family: 'Inter';
font-style: normal;
font-weight: 700;
font-display: swap;
src: url(/static/assets/UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuFuYMZg.ttf) format('truetype');
src: url(./UcCO3FwrK3iLTeHuS_fvQtMwCp50KnMw2boKoduKmMEVuFuYMZg.ttf) format('truetype');
}
@font-face {
font-family: 'Roboto';
font-style: normal;
font-weight: 400;
font-display: swap;
src: url(/static/assets/KFOmCnqEu92Fr1Me5Q.ttf) format('truetype');
src: url(./KFOmCnqEu92Fr1Me5Q.ttf) format('truetype');
}
@font-face {
font-family: 'Roboto';
font-style: normal;
font-weight: 500;
font-display: swap;
src: url(/static/assets/KFOlCnqEu92Fr1MmEU9vAw.ttf) format('truetype');
src: url(./KFOlCnqEu92Fr1MmEU9vAw.ttf) format('truetype');
}

View File

@@ -7,33 +7,33 @@ https://fonts.googleapis.com/css?family=Roboto:300,400,500,700,900&display=swap
font-style: normal;
font-weight: 300;
font-display: swap;
src: url(/static/assets//KFOlCnqEu92Fr1MmSU5fBBc9.ttf) format('truetype');
src: url(./KFOlCnqEu92Fr1MmSU5fBBc9.ttf) format('truetype');
}
@font-face {
font-family: 'Roboto';
font-style: normal;
font-weight: 400;
font-display: swap;
src: url(/static/assets//KFOmCnqEu92Fr1Mu4mxP.ttf) format('truetype');
src: url(./KFOmCnqEu92Fr1Mu4mxP.ttf) format('truetype');
}
@font-face {
font-family: 'Roboto';
font-style: normal;
font-weight: 500;
font-display: swap;
src: url(/static/assets//KFOlCnqEu92Fr1MmEU9fBBc9.ttf) format('truetype');
src: url(./KFOlCnqEu92Fr1MmEU9fBBc9.ttf) format('truetype');
}
@font-face {
font-family: 'Roboto';
font-style: normal;
font-weight: 700;
font-display: swap;
src: url(/static/assets//KFOlCnqEu92Fr1MmWUlfBBc9.ttf) format('truetype');
src: url(./KFOlCnqEu92Fr1MmWUlfBBc9.ttf) format('truetype');
}
@font-face {
font-family: 'Roboto';
font-style: normal;
font-weight: 900;
font-display: swap;
src: url(/static/assets//KFOlCnqEu92Fr1MmYUtfBBc9.ttf) format('truetype');
src: url(./KFOlCnqEu92Fr1MmYUtfBBc9.ttf) format('truetype');
}

View File

@@ -143,7 +143,7 @@ function readInputImage() {
// }
// Source: https://stackoverflow.com/a/75751803/11386095
const response = await fetch("/v1/chat/completions", {
const response = await fetch("v1/chat/completions", {
method: "POST",
headers: {
Authorization: `Bearer ${key}`,

View File

@@ -48,7 +48,7 @@ async function promptDallE(key, input) {
document.getElementById("input").disabled = true;
const model = document.getElementById("image-model").value;
const response = await fetch("/v1/images/generations", {
const response = await fetch("v1/images/generations", {
method: "POST",
headers: {
Authorization: `Bearer ${key}`,

View File

@@ -122,7 +122,7 @@ async function sendAudioToWhisper(audioBlob) {
formData.append('model', getWhisperModel());
API_KEY = localStorage.getItem("key");
const response = await fetch('/v1/audio/transcriptions', {
const response = await fetch('v1/audio/transcriptions', {
method: 'POST',
headers: {
'Authorization': `Bearer ${API_KEY}`
@@ -139,7 +139,7 @@ async function sendTextToChatGPT(text) {
conversationHistory.push({ role: "user", content: text });
API_KEY = localStorage.getItem("key");
const response = await fetch('/v1/chat/completions', {
const response = await fetch('v1/chat/completions', {
method: 'POST',
headers: {
'Authorization': `Bearer ${API_KEY}`,
@@ -163,7 +163,7 @@ async function sendTextToChatGPT(text) {
async function getTextToSpeechAudio(text) {
API_KEY = localStorage.getItem("key");
const response = await fetch('/v1/audio/speech', {
const response = await fetch('v1/audio/speech', {
method: 'POST',
headers: {

View File

@@ -19,7 +19,7 @@ async function tts(key, input) {
document.getElementById("input").disabled = true;
const model = document.getElementById("tts-model").value;
const response = await fetch("/tts", {
const response = await fetch("tts", {
method: "POST",
headers: {
Authorization: `Bearer ${key}`,

View File

@@ -0,0 +1,24 @@
package utils
import (
"strings"
"github.com/gofiber/fiber/v2"
)
// BaseURL returns the base URL for the given HTTP request context.
// It takes into account that the app may be exposed by a reverse-proxy under a different protocol, host and path.
// The returned URL is guaranteed to end with `/`.
// The method should be used in conjunction with the StripPathPrefix middleware.
func BaseURL(c *fiber.Ctx) string {
path := c.Path()
origPath := c.OriginalURL()
if path != origPath && strings.HasSuffix(origPath, path) {
pathPrefix := origPath[:len(origPath)-len(path)+1]
return c.BaseURL() + pathPrefix
}
return c.BaseURL() + "/"
}

View File

@@ -0,0 +1,48 @@
package utils
import (
"net/http/httptest"
"testing"
"github.com/gofiber/fiber/v2"
"github.com/stretchr/testify/require"
)
func TestBaseURL(t *testing.T) {
for _, tc := range []struct {
name string
prefix string
expectURL string
}{
{
name: "without prefix",
prefix: "/",
expectURL: "http://example.com/",
},
{
name: "with prefix",
prefix: "/myprefix/",
expectURL: "http://example.com/myprefix/",
},
} {
t.Run(tc.name, func(t *testing.T) {
app := fiber.New()
actualURL := ""
app.Get(tc.prefix+"hello/world", func(c *fiber.Ctx) error {
if tc.prefix != "/" {
c.Path("/hello/world")
}
actualURL = BaseURL(c)
return nil
})
req := httptest.NewRequest("GET", tc.prefix+"hello/world", nil)
resp, err := app.Test(req, -1)
require.NoError(t, err)
require.Equal(t, 200, resp.StatusCode, "response status code")
require.Equal(t, tc.expectURL, actualURL, "base URL")
})
}
}

View File

@@ -12,7 +12,7 @@
<div class="header text-center py-12">
<h1 class="text-5xl font-bold">Welcome to your LocalAI instance!</h1>
<div class="mt-6">
<!-- <a href="/" aria-label="HomePage" alt="HomePage">
<!-- <a href="./" aria-label="HomePage" alt="HomePage">
<img class="mx-auto w-1/4 h-auto" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo">
</a>
-->

View File

@@ -28,7 +28,7 @@ SOFTWARE.
<!doctype html>
<html lang="en">
{{template "views/partials/head" .}}
<script defer src="/static/chat.js"></script>
<script defer src="static/chat.js"></script>
<style>
body {
overflow: hidden;
@@ -101,9 +101,9 @@ SOFTWARE.
{{ $model:=.Model}}
{{ range .ModelsConfig }}
{{ if eq . $model }}
<option value="/chat/{{.}}" selected class="bg-gray-700 text-white">{{.}}</option>
<option value="chat/{{.}}" selected class="bg-gray-700 text-white">{{.}}</option>
{{ else }}
<option value="/chat/{{.}}" class="bg-gray-700 text-white">{{.}}</option>
<option value="chat/{{.}}" class="bg-gray-700 text-white">{{.}}</option>
{{ end }}
{{ end }}
</select>
@@ -142,7 +142,7 @@ SOFTWARE.
<div id="loader" class="my-2 loader" style="display: none;"></div>
<input id="chat-model" type="hidden" value="{{.Model}}">
<input id="input_image" type="file" style="display: none;" @change="fileName = $event.target.files[0].name">
<form id="prompt" action="/chat/{{.Model}}" method="get" @submit.prevent="submitPrompt">
<form id="prompt" action="chat/{{.Model}}" method="get" @submit.prevent="submitPrompt">
<div class="relative w-full">
<textarea
id="input"

View File

@@ -370,7 +370,7 @@
}
}
</script>
<script src="/static/p2panimation.js"></script>
<script src="static/p2panimation.js"></script>
{{template "views/partials/footer" .}}
</div>

View File

@@ -20,7 +20,7 @@
{{template "views/partials/inprogress" .}}
{{ if eq (len .ModelsConfig) 0 }}
<h2 class="text-center text-3xl font-semibold text-gray-100"> <i class="text-yellow-200 ml-2 fa-solid fa-triangle-exclamation animate-pulse"></i> Ouch! seems you don't have any models installed from the LocalAI gallery!</h2>
<p class="text-center mt-4 text-xl">..install something from the <a class="text-gray-400 hover:text-white ml-1 px-3 py-2 rounded" href="/browse">🖼️ Gallery</a> or check the <a href="https://localai.io/basics/getting_started/" class="text-gray-400 hover:text-white ml-1 px-3 py-2 rounded"> <i class="fa-solid fa-book"></i> Getting started documentation </a></p>
<p class="text-center mt-4 text-xl">..install something from the <a class="text-gray-400 hover:text-white ml-1 px-3 py-2 rounded" href="browse">🖼️ Gallery</a> or check the <a href="https://localai.io/basics/getting_started/" class="text-gray-400 hover:text-white ml-1 px-3 py-2 rounded"> <i class="fa-solid fa-book"></i> Getting started documentation </a></p>
{{ if ne (len .Models) 0 }}
<hr class="my-4">
@@ -66,7 +66,7 @@
{{ end }}
</td>
<td class="px-4 py-3 font-bold">
<p class="font-bold text-white flex items-center"><i class="fas fa-brain pr-2"></i><a href="/browse?term={{.Name}}">{{.Name}}</a></p>
<p class="font-bold text-white flex items-center"><i class="fas fa-brain pr-2"></i><a href="browse?term={{.Name}}">{{.Name}}</a></p>
</td>
<td class="px-4 py-3 font-bold">
{{ if .Backend }}
@@ -84,7 +84,7 @@
<td class="px-4 py-3">
<button
class="float-right inline-block rounded bg-red-800 px-6 pb-2.5 mb-3 pt-2.5 text-xs font-medium uppercase leading-normal text-white shadow-primary-3 transition duration-150 ease-in-out hover:bg-red-accent-300 hover:shadow-red-2 focus:bg-red-accent-300 focus:shadow-primary-2 focus:outline-none focus:ring-0 active:bg-red-600 active:shadow-primary-2 dark:shadow-black/30 dark:hover:shadow-dark-strong dark:focus:shadow-dark-strong dark:active:shadow-dark-strong"
data-twe-ripple-color="light" data-twe-ripple-init="" hx-confirm="Are you sure you wish to delete the model?" hx-post="/browse/delete/model/{{.Name}}" hx-swap="outerHTML"><i class="fa-solid fa-cancel pr-2"></i>Delete</button>
data-twe-ripple-color="light" data-twe-ripple-init="" hx-confirm="Are you sure you wish to delete the model?" hx-post="browse/delete/model/{{.Name}}" hx-swap="outerHTML"><i class="fa-solid fa-cancel pr-2"></i>Delete</button>
</td>
{{ end }}
{{ range .Models }}


@@ -4,6 +4,8 @@
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Open Authenticated Website</title>
<base href="{{.BaseURL}}" />
<link rel="icon" type="image/x-icon" href="favicon.ico" />
</head>
<body>
<h1>Authorization is required</h1>


@@ -16,38 +16,38 @@
<div class="text-center font-semibold text-gray-100">
<h2>Filter by type:</h2>
<button hx-post="/browse/search/models"
<button hx-post="browse/search/models"
class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
hx-target="#search-results"
hx-vals='{"search": "tts"}'
hx-indicator=".htmx-indicator" >TTS</button>
<button hx-post="/browse/search/models"
<button hx-post="browse/search/models"
class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
hx-target="#search-results"
hx-vals='{"search": "stablediffusion"}'
hx-indicator=".htmx-indicator" >Image generation</button>
<button hx-post="/browse/search/models" \
<button hx-post="browse/search/models" \
class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
hx-target="#search-results"
hx-vals='{"search": "llm"}'
hx-indicator=".htmx-indicator" >Text generation</button>
<button hx-post="/browse/search/models"
<button hx-post="browse/search/models"
class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
hx-target="#search-results"
hx-vals='{"search": "multimodal"}'
hx-indicator=".htmx-indicator" >Multimodal</button>
<button hx-post="/browse/search/models"
<button hx-post="browse/search/models"
class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
hx-target="#search-results"
hx-vals='{"search": "embedding"}'
hx-indicator=".htmx-indicator" >Embeddings</button>
<button hx-post="/browse/search/models"
<button hx-post="browse/search/models"
class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
hx-target="#search-results"
hx-vals='{"search": "rerank"}'
hx-indicator=".htmx-indicator" >Rerankers</button>
<button
hx-post="/browse/search/models"
hx-post="browse/search/models"
class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
hx-target="#search-results"
hx-vals='{"search": "whisper"}'
@@ -57,7 +57,7 @@
<div class="text-center text-xs font-semibold text-gray-100">
Filter by tags:
{{ range .AllTags }}
<button hx-post="/browse/search/models" class="text-blue-500" hx-target="#search-results"
<button hx-post="browse/search/models" class="text-blue-500" hx-target="#search-results"
hx-vals='{"search": "{{.}}"}'
hx-indicator=".htmx-indicator" >{{.}}</button>
{{ end }}
@@ -69,7 +69,7 @@
<input class="form-control appearance-none block w-full mt-5 px-3 py-2 text-base font-normal text-gray-300 pb-2 mb-5 bg-gray-800 bg-clip-padding border border-solid border-gray-600 rounded transition ease-in-out m-0 focus:text-gray-300 focus:bg-gray-900 focus:border-blue-500 focus:outline-none" type="search"
name="search" placeholder="Begin Typing To Search models..."
hx-post="/browse/search/models"
hx-post="browse/search/models"
hx-trigger="input changed delay:500ms, search"
hx-target="#search-results"
hx-indicator=".htmx-indicator">


@@ -48,11 +48,11 @@
<!-- Federation Box -->
<div class="bg-gray-800 p-6 rounded-lg shadow-lg mb-12 text-left">
<p class="text-xl font-semibold text-gray-200"> <i class="text-gray-200 fa-solid fa-circle-nodes"></i> Federated Nodes: <span hx-get="/p2p/ui/workers-federation-stats" hx-trigger="every 1s"></span> </p>
<p class="text-xl font-semibold text-gray-200"> <i class="text-gray-200 fa-solid fa-circle-nodes"></i> Federated Nodes: <span hx-get="p2p/ui/workers-federation-stats" hx-trigger="every 1s"></span> </p>
<p class="mb-4">You can start LocalAI in federated mode to share your instance, or start the federated server to balance requests between nodes of the federation.</p>
<div class="grid grid-cols-1 sm:grid-cols-2 md:grid-cols-3 gap-4 mb-12">
<div hx-get="/p2p/ui/workers-federation" hx-trigger="every 1s"></div>
<div hx-get="p2p/ui/workers-federation" hx-trigger="every 1s"></div>
</div>
<hr class="border-gray-700 mb-12">
@@ -123,11 +123,11 @@
<div class="bg-gray-800 p-6 rounded-lg shadow-lg mb-12 text-left">
<p class="text-xl font-semibold text-gray-200"> <i class="text-gray-200 fa-solid fa-circle-nodes"></i> Workers (llama.cpp): <span hx-get="/p2p/ui/workers-stats" hx-trigger="every 1s"></span> </p>
<p class="text-xl font-semibold text-gray-200"> <i class="text-gray-200 fa-solid fa-circle-nodes"></i> Workers (llama.cpp): <span hx-get="p2p/ui/workers-stats" hx-trigger="every 1s"></span> </p>
<p class="mb-4">You can start llama.cpp workers to distribute weights between the workers and offload part of the computation. To start a new worker, you can use the CLI or Docker.</p>
<div class="grid grid-cols-1 sm:grid-cols-2 md:grid-cols-3 gap-4 mb-12">
<div hx-get="/p2p/ui/workers" hx-trigger="every 1s"></div>
<div hx-get="p2p/ui/workers" hx-trigger="every 1s"></div>
</div>
<hr class="border-gray-700 mb-12">
@@ -177,7 +177,7 @@
{{template "views/partials/footer" .}}
</div>
<script src="/static/p2panimation.js"></script>
<script src="static/p2panimation.js"></script>
<style>
.token {
word-break: break-all;


@@ -2,4 +2,4 @@
LocalAI Version {{.Version}}<br>
<a href='https://github.com/mudler/LocalAI' class="text-blue-400 hover:text-blue-600" target="_blank">LocalAI</a> © 2023-2024 <a href='https://mudler.pm' class="text-blue-400 hover:text-blue-600" target="_blank">Ettore Di Giacinto</a>
</footer>
<script src="/static/assets/tw-elements.js"></script>
<script src="static/assets/tw-elements.js"></script>


@@ -2,33 +2,35 @@
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{{.Title}}</title>
<base href="{{.BaseURL}}" />
<link rel="icon" type="image/x-icon" href="favicon.ico" />
<link
rel="stylesheet"
href="/static/assets/highlightjs.css"
href="static/assets/highlightjs.css"
/>
<script defer src="/static/assets/highlightjs.js"></script>
<script defer src="static/assets/highlightjs.js"></script>
<script
defer
src="/static/assets/alpine.js"
src="static/assets/alpine.js"
></script>
<script
defer
src="/static/assets/marked.js"
src="static/assets/marked.js"
></script>
<script
defer
src="/static/assets/purify.js"
src="static/assets/purify.js"
></script>
<link href="/static/general.css" rel="stylesheet" />
<link href="/static/assets/font1.css" rel="stylesheet">
<link href="static/general.css" rel="stylesheet" />
<link href="static/assets/font1.css" rel="stylesheet">
<link
href="/static/assets/font2.css"
href="static/assets/font2.css"
rel="stylesheet" />
<link
rel="stylesheet"
href="/static/assets/tw-elements.css" />
<script src="/static/assets/tailwindcss.js"></script>
href="static/assets/tw-elements.css" />
<script src="static/assets/tailwindcss.js"></script>
<script>
tailwind.config = {
darkMode: "class",
@@ -54,11 +56,11 @@
});
}
</script>
<link href="/static/assets/fontawesome/css/fontawesome.css" rel="stylesheet" />
<link href="/static/assets/fontawesome/css/brands.css" rel="stylesheet" />
<link href="/static/assets/fontawesome/css/solid.css" rel="stylesheet" />
<script src="/static/assets/flowbite.min.js"></script>
<script src="/static/assets/htmx.js" crossorigin="anonymous"></script>
<link href="static/assets/fontawesome/css/fontawesome.css" rel="stylesheet" />
<link href="static/assets/fontawesome/css/brands.css" rel="stylesheet" />
<link href="static/assets/fontawesome/css/solid.css" rel="stylesheet" />
<script src="static/assets/flowbite.min.js"></script>
<script src="static/assets/htmx.js" crossorigin="anonymous"></script>
<!-- P2P Animation START -->
<style>
.animation-container {


@@ -17,13 +17,13 @@
<div class="flex items-center justify-between bg-slate-600 p-2 mb-2 rounded-md">
<div class="flex items center">
<span class="text-gray-300"><a href="/browse?term={{$parts._1}}"
<span class="text-gray-300"><a href="browse?term={{$parts._1}}"
class="text-white-500 inline-block bg-blue-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2"
>{{$modelName}}</a> {{if $repository}} (from the '{{$repository}}' repository) {{end}}</span>
</div>
<div hx-get="/browse/job/{{$value}}" hx-swap="outerHTML" hx-target="this" hx-trigger="done">
<div hx-get="browse/job/{{$value}}" hx-swap="outerHTML" hx-target="this" hx-trigger="done">
<h3 role="status" id="pblabel" >{{$op}}
<div hx-get="/browse/job/progress/{{$value}}" hx-trigger="every 600ms"
<div hx-get="browse/job/progress/{{$value}}" hx-trigger="every 600ms"
hx-target="this"
hx-swap="innerHTML" ></div></h3>
</div>


@@ -3,8 +3,8 @@
<div class="flex items-center justify-between">
<div class="flex items-center">
<!-- Logo Image: Replace 'logo_url_here' with your actual logo URL -->
<a href="/" class="text-white text-xl font-bold"><img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo" class="h-10 mr-3 border-2 border-gray-300 shadow rounded"></a>
<a href="/" class="text-white text-xl font-bold">LocalAI</a>
<a href="./" class="text-white text-xl font-bold"><img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo" class="h-10 mr-3 border-2 border-gray-300 shadow rounded"></a>
<a href="./" class="text-white text-xl font-bold">LocalAI</a>
</div>
<!-- Menu button for small screens -->
<div class="lg:hidden">
@@ -14,33 +14,33 @@
</div>
<!-- Navigation links -->
<div class="hidden lg:flex lg:items-center lg:justify-end lg:flex-1 lg:w-0">
<a href="/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-home pr-2"></i>Home</a>
<a href="./" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-home pr-2"></i>Home</a>
<a href="https://localai.io" class="text-gray-400 hover:text-white px-3 py-2 rounded" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
<a href="/browse/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-brain pr-2"></i> Models</a>
<a href="/chat/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-comments pr-2"></i> Chat</a>
<a href="/text2image/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-image pr-2"></i> Generate images</a>
<a href="/tts/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-music pr-2"></i> TTS </a>
<a href="/talk/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-phone pr-2"></i> Talk </a>
<a href="browse/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-brain pr-2"></i> Models</a>
<a href="chat/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-comments pr-2"></i> Chat</a>
<a href="text2image/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-image pr-2"></i> Generate images</a>
<a href="tts/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-music pr-2"></i> TTS </a>
<a href="talk/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-phone pr-2"></i> Talk </a>
{{ if .IsP2PEnabled }}
<a href="/p2p/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-circle-nodes"></i> Swarm </a>
<a href="p2p/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fa-solid fa-circle-nodes"></i> Swarm </a>
{{ end }}
<a href="/swagger/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-code pr-2"></i> API</a>
<a href="swagger/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-code pr-2"></i> API</a>
</div>
</div>
<!-- Collapsible menu for small screens -->
<div class="hidden lg:hidden" id="mobile-menu">
<div class="pt-4 pb-3 border-t border-gray-700">
<a href="/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-home pr-2"></i>Home</a>
<a href="./" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-home pr-2"></i>Home</a>
<a href="https://localai.io" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
<a href="/browse/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-brain pr-2"></i> Models</a>
<a href="/chat/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-comments pr-2"></i> Chat</a>
<a href="/text2image/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-image pr-2"></i> Generate images</a>
<a href="/tts/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-music pr-2"></i> TTS </a>
<a href="/talk/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-phone pr-2"></i> Talk </a>
<a href="browse/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-brain pr-2"></i> Models</a>
<a href="chat/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-comments pr-2"></i> Chat</a>
<a href="text2image/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-image pr-2"></i> Generate images</a>
<a href="tts/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-music pr-2"></i> TTS </a>
<a href="talk/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-phone pr-2"></i> Talk </a>
{{ if .IsP2PEnabled }}
<a href="/p2p/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-circle-nodes"></i> Swarm </a>
<a href="p2p/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fa-solid fa-circle-nodes"></i> Swarm </a>
{{ end }}
<a href="/swagger/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-code pr-2"></i> API</a>
<a href="swagger/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-code pr-2"></i> API</a>
</div>
</div>
</div>


@@ -3,8 +3,8 @@
<div class="flex items-center justify-between">
<div class="flex items-center">
<!-- Logo Image: Replace 'logo_url_here' with your actual logo URL -->
<a href="/" class="text-white text-xl font-bold"><img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo" class="h-10 mr-3 border-2 border-gray-300 shadow rounded"></a>
<a href="/" class="text-white text-xl font-bold">LocalAI</a>
<a href="./" class="text-white text-xl font-bold"><img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo" class="h-10 mr-3 border-2 border-gray-300 shadow rounded"></a>
<a href="./" class="text-white text-xl font-bold">LocalAI</a>
</div>
<!-- Menu button for small screens -->
<div class="lg:hidden">
@@ -14,7 +14,7 @@
</div>
<!-- Navigation links -->
<div class="hidden lg:flex lg:items-center lg:justify-end lg:flex-1 lg:w-0">
<a href="/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-home pr-2"></i>Home</a>
<a href="./" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-home pr-2"></i>Home</a>
<a href="https://localai.io" class="text-gray-400 hover:text-white px-3 py-2 rounded" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
<a href="https://models.localai.io/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-brain pr-2"></i> Models</a>
</div>
@@ -22,7 +22,7 @@
<!-- Collapsible menu for small screens -->
<div class="hidden lg:hidden" id="mobile-menu">
<div class="pt-4 pb-3 border-t border-gray-700">
<a href="/" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-home pr-2"></i>Home</a>
<a href="./" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1"><i class="fas fa-home pr-2"></i>Home</a>
<a href="https://localai.io" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
<a href="https://models.localai.io/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-brain pr-2"></i> Models</a>
</div>


@@ -1,7 +1,7 @@
<!doctype html>
<html lang="en">
{{template "views/partials/head" .}}
<script defer src="/static/talk.js"></script>
<script defer src="static/talk.js"></script>
<style>
body {
overflow: hidden;


@@ -1,7 +1,7 @@
<!DOCTYPE html>
<html lang="en">
{{template "views/partials/head" .}}
<script defer src="/static/image.js"></script>
<script defer src="static/image.js"></script>
<body class="bg-gray-900 text-gray-200">
<div class="flex flex-col min-h-screen">
@@ -50,9 +50,9 @@
{{ $model:=.Model}}
{{ range .ModelsConfig }}
{{ if eq .Name $model }}
<option value="/text2image/{{.Name}}" selected class="bg-gray-700 text-white">{{.Name}}</option>
<option value="text2image/{{.Name}}" selected class="bg-gray-700 text-white">{{.Name}}</option>
{{ else }}
<option value="/text2image/{{.Name}}" class="bg-gray-700 text-white">{{.Name}}</option>
<option value="text2image/{{.Name}}" class="bg-gray-700 text-white">{{.Name}}</option>
{{ end }}
{{ end }}
</select>
@@ -62,7 +62,7 @@
<div class="mt-12">
<input id="image-model" type="hidden" value="{{.Model}}">
<form id="genimage" action="/text2image/{{.Model}}" method="get">
<form id="genimage" action="text2image/{{.Model}}" method="get">
<input
type="text"
id="input"


@@ -1,7 +1,7 @@
<!DOCTYPE html>
<html lang="en">
{{template "views/partials/head" .}}
<script defer src="/static/tts.js"></script>
<script defer src="static/tts.js"></script>
<body class="bg-gray-900 text-gray-200">
<div class="flex flex-col min-h-screen">
@@ -47,9 +47,9 @@
{{ $model:=.Model}}
{{ range .ModelsConfig }}
{{ if eq .Name $model }}
<option value="/tts/{{.Name}}" selected class="bg-gray-700 text-white">{{.Name}}</option>
<option value="tts/{{.Name}}" selected class="bg-gray-700 text-white">{{.Name}}</option>
{{ else }}
<option value="/tts/{{.Name}}" class="bg-gray-700 text-white">{{.Name}}</option>
<option value="tts/{{.Name}}" class="bg-gray-700 text-white">{{.Name}}</option>
{{ end }}
{{ end }}
</select>
@@ -59,7 +59,7 @@
<div class="mt-12">
<input id="tts-model" type="hidden" value="{{.Model}}">
<form id="tts" action="/tts/{{.Model}}" method="get">
<form id="tts" action="tts/{{.Model}}" method="get">
<input
type="text"
id="input"


@@ -129,6 +129,9 @@ There are options that can be tweaked or parameters that can be set using enviro
| Environment Variable | Description |
|----------------------|-------------|
| **LOCALAI_P2P** | Set to "true" to enable p2p |
| **LOCALAI_FEDERATED** | Set to "true" to enable federated mode |
| **FEDERATED_SERVER** | Set to "true" to enable federated server |
| **LOCALAI_P2P_DISABLE_DHT** | Set to "true" to disable DHT and enable p2p layer to be local only (mDNS) |
| **LOCALAI_P2P_ENABLE_LIMITS** | Set to "true" to enable connection limits and resources management (useful when running with poor connectivity or want to limit resources consumption) |
| **LOCALAI_P2P_LISTEN_MADDRS** | Set to comma separated list of multiaddresses to override default libp2p 0.0.0.0 multiaddresses |
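
For example, a minimal sketch of enabling p2p together with federated mode through these variables (the image name, tag, and port mapping below are illustrative, not prescriptive):

```bash
# Illustrative sketch: run LocalAI with p2p and federated mode enabled.
# Image/tag and port mapping are assumptions; adjust them to your setup.
docker run -ti -p 8080:8080 \
  -e LOCALAI_P2P=true \
  -e LOCALAI_FEDERATED=true \
  localai/localai:latest
```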


@@ -16,8 +16,8 @@ LocalAI will attempt to automatically load models which are not explicitly confi
| Backend and Bindings | Compatible models | Completion/Chat endpoint | Capability | Embeddings support | Token stream support | Acceleration |
|----------------------------------------------------------------------------------|-----------------------|--------------------------|---------------------------|-----------------------------------|----------------------|--------------|
-| [llama.cpp]({{%relref "docs/features/text-generation#llama.cpp" %}}) | LLama, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes | GPT and Functions | yes** | yes | CUDA, openCL, cuBLAS, Metal |
-| [llama.cpp's ggml model (backward compatibility with old format, before GGUF)](https://github.com/ggerganov/llama.cpp) ([binding](https://github.com/go-skynet/go-llama.cpp)) | LLama, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes | GPT and Functions | yes** | yes | CUDA, openCL, cuBLAS, Metal |
+| [llama.cpp]({{%relref "docs/features/text-generation#llama.cpp" %}}) | LLama, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes | GPT and Functions | yes | yes | CUDA, openCL, cuBLAS, Metal |
+| [llama.cpp's ggml model (backward compatibility with old format, before GGUF)](https://github.com/ggerganov/llama.cpp) ([binding](https://github.com/go-skynet/go-llama.cpp)) | LLama, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes | GPT and Functions | yes | yes | CUDA, openCL, cuBLAS, Metal |
| [whisper](https://github.com/ggerganov/whisper.cpp) | whisper | no | Audio | no | no | N/A |
| [stablediffusion](https://github.com/EdVince/Stable-Diffusion-NCNN) ([binding](https://github.com/mudler/go-stable-diffusion)) | stablediffusion | no | Image | no | no | N/A |
| [langchain-huggingface](https://github.com/tmc/langchaingo) | Any text generators available on HuggingFace through API | yes | GPT | no | no | N/A |
@@ -37,14 +37,11 @@ LocalAI will attempt to automatically load models which are not explicitly confi
| `openvoice` | Open voice | no | Audio generation and Voice cloning | no | no | CPU/CUDA |
| `parler-tts` | Open voice | no | Audio generation and Voice cloning | no | no | CPU/CUDA |
| [rerankers](https://github.com/AnswerDotAI/rerankers) | Reranking API | no | Reranking | no | no | CPU/CUDA |
-| `transformers` | Various GPTs and quantization formats | yes | GPT, embeddings | yes | yes**** | CPU/CUDA/XPU |
+| `transformers` | Various GPTs and quantization formats | yes | GPT, embeddings | yes | yes* | CPU/CUDA/XPU |
| [bark-cpp](https://github.com/PABannier/bark.cpp) | bark | no | Audio-Only | no | no | yes |
| [stablediffusion-cpp](https://github.com/leejet/stable-diffusion.cpp) | stablediffusion-1, stablediffusion-2, stablediffusion-3, flux, PhotoMaker | no | Image | no | no | N/A |
| [silero-vad](https://github.com/snakers4/silero-vad) with [Golang bindings](https://github.com/streamer45/silero-vad-go) | Silero VAD | no | Voice Activity Detection | no | no | CPU |
Note: any backend name listed above can be used in the `backend` field of the model configuration file (See [the advanced section]({{%relref "docs/advanced" %}})).
-- \* 7b ONLY
-- ** doesn't seem to be accurate
-- *** 7b and 40b with the `ggccv` format, for instance: https://huggingface.co/TheBloke/WizardLM-Uncensored-Falcon-40B-GGML
-- **** Only for CUDA and OpenVINO CPU/XPU acceleration.
+- \* Only for CUDA and OpenVINO CPU/XPU acceleration.
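
As a sketch of what that looks like in practice, a hypothetical model definition that selects its backend explicitly (the model name, backend value, and weights file below are placeholders, not a tested configuration):

```bash
# Hypothetical model config: `backend` picks one of the backends listed above.
# All names here are placeholders; point `model` at a real weights file.
mkdir -p models
cat > models/my-model.yaml <<'EOF'
name: my-model
backend: llama-cpp
parameters:
  model: my-model.Q4_K_M.gguf
EOF
```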


@@ -0,0 +1,35 @@
+++
disableToc = false
title = "Running on Nvidia ARM64"
weight = 27
+++
LocalAI can be run on Nvidia ARM64 devices, such as the Jetson Nano, Jetson Xavier NX, and Jetson AGX Xavier. The following instructions will guide you through building the LocalAI container for Nvidia ARM64 devices.
## Prerequisites
- Docker engine installed (https://docs.docker.com/engine/install/ubuntu/)
- Nvidia container toolkit installed (https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-ap)
## Build the container
Build the LocalAI container for Nvidia ARM64 devices using the following command:
```bash
git clone https://github.com/mudler/LocalAI
cd LocalAI
docker build --build-arg SKIP_DRIVERS=true --build-arg BUILD_TYPE=cublas --build-arg BASE_IMAGE=nvcr.io/nvidia/l4t-jetpack:r36.4.0 --build-arg IMAGE_TYPE=core -t localai-orin .
```
## Usage
Run the LocalAI container on Nvidia ARM64 devices using the following command, where `/data/models` is the directory containing the models:
```bash
docker run -e DEBUG=true -p 8080:8080 -v /data/models:/build/models -ti --restart=always --name local-ai --runtime nvidia --gpus all localai-orin
```
Note: replace `/data/models` with the directory containing your models.
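
As a quick smoke test, you can query the OpenAI-compatible models endpoint once the container is up (assuming the default `8080` port mapping used above):

```bash
# Should return a JSON list of the models LocalAI can serve.
curl http://localhost:8080/v1/models
```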


@@ -1,3 +1,3 @@
{
"version": "v2.24.2"
"version": "v2.25.0"
}

gallery/falcon3.yaml (new file, 40 lines)

@@ -0,0 +1,40 @@
---
name: "falcon3"
config_file: |
mmap: true
template:
chat_message: |
<|{{ .RoleName }}|>
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}
{{ if eq .RoleName "assistant" }}<|endoftext|>{{ end }}
function: |
<|system|>
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
{{.Input }}
<|im_start|>assistant
chat: |
{{.Input }}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- '<|endoftext|>'
- '<dummy32000>'
- '</s>'


(File diff suppressed because it is too large.)


@@ -0,0 +1,37 @@
---
name: "phi-4-chat"
config_file: |
mmap: true
function:
json_regex_match:
- "(?s)<Output>(.*?)</Output>"
capture_llm_results:
- (?s)<Thought>(.*?)</Thought>
replace_llm_results:
- key: (?s)<Thought>(.*?)</Thought>
value: ""
grammar:
properties_order: "name,arguments"
template:
chat_message: |
<|im_start|>{{ .RoleName }}<|im_sep|>
{{.Content}}<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant<|im_sep|>
completion: |
{{.Input}}
function: |
<|im_start|>system<|im_sep|>
You are an AI assistant that executes function calls, and these are the tools at your disposal:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
{{.Input}}<|im_end|>
context_size: 4096
f16: true
stopwords:
- <|end|>
- <|endoftext|>
- <|im_end|>

Some files were not shown because too many files have changed in this diff.