* wip
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* get rid of panics
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* expose it properly from the config
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* Simplify
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* forgot to commit
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* Remove focus on test
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* Small fixups
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* refactor: break down json grammar parser in different files
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* fix: patch to `refactor_grammars` - propagate errors (#3006)
propagate errors around
Signed-off-by: Dave Lee <dave@gray101.com>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: Dave Lee <dave@gray101.com>
Co-authored-by: Dave <dave@gray101.com>
* fix(cuda): downgrade to 12.0 to increase compatibility range
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* improve messaging
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* feat(functions): enhance parsing with broken JSON when we parse the raw results
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* breaking: make function name by default
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* feat(grammar): dynamically generate grammars with mutating keys
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* refactor: simplify condition
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* Update docs
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Fixes:
```
panic: invalid argument to IntN
goroutine 401 [running]:
math/rand/v2.(*Rand).IntN(...)
/home/mudler/_git/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.22.4.linux-amd64/src/math/rand/v2/rand.go:190
math/rand/v2.IntN(...)
/home/mudler/_git/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.22.4.linux-amd64/src/math/rand/v2/rand.go:307
github.com/mudler/LocalAI/core/cli.Proxy.func2()
/home/mudler/_git/LocalAI/core/cli/federated.go:104 +0x76e
created by github.com/mudler/LocalAI/core/cli.Proxy in goroutine 1
/home/mudler/_git/LocalAI/core/cli/federated.go:91 +0x3c5
```
This happens when no nodes are found and something tries to hit the federated
endpoint (and no tunnels are ready yet).
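A minimal sketch of the kind of guard that avoids this class of panic (`rand.IntN` panics when its argument is not positive); the function and variable names below are illustrative, not the actual LocalAI code:
```go
package main

import (
	"errors"
	"fmt"
	"math/rand/v2"
)

// selectNode picks a random worker address from the discovered tunnels.
// rand.IntN panics when n <= 0, so an empty list must be handled explicitly
// instead of indexing with a random value.
func selectNode(tunnels []string) (string, error) {
	if len(tunnels) == 0 {
		return "", errors.New("no federated nodes available yet")
	}
	return tunnels[rand.IntN(len(tunnels))], nil
}

func main() {
	if _, err := selectNode(nil); err != nil {
		fmt.Println("handled gracefully:", err)
	}
}
```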
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* docs(swagger): finish covering gallery section
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* docs: add section to explain how to install models with local-ai run
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* Minor docs adjustments
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* docs(swagger): cover more localai/openai endpoints
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* fix swagger descriptions for backend_monitor.go
Signed-off-by: Dave <dave@gray101.com>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: Dave <dave@gray101.com>
Co-authored-by: Dave <dave@gray101.com>
* minor cleanup to go.mod - importing ourself?
Signed-off-by: Dave Lee <dave@gray101.com>
* figured out why we were importing ourself and fixed it
Signed-off-by: Dave Lee <dave@gray101.com>
* set pull_request_target
Signed-off-by: Dave Lee <dave@gray101.com>
---------
Signed-off-by: Dave Lee <dave@gray101.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
* ci: add workflow to comment new Opened PRs
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* Update comment-pr.yaml
eliminate a stray ' character that was terminating the shell script by slightly rewriting the prompt
Signed-off-by: Dave <dave@gray101.com>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: Dave <dave@gray101.com>
Co-authored-by: Dave <dave@gray101.com>
* feat(llama.cpp): add embeddings
Also enable embeddings by default for llama.cpp models
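As a rough usage sketch (not part of this change set), the OpenAI-compatible embeddings endpoint can then be queried; the model name and port below are placeholders:
```bash
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"model": "my-model", "input": "A long time ago in a galaxy far, far away"}'
```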
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* fix(Makefile): prepare llama.cpp sources only once
Otherwise we keep cloning llama.cpp for each of the variants
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* do not set embeddings to false
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* docs: add embeddings to the YAML config reference
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* models(gallery): add deepseek-v2-lite
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* Update deepseek.yaml
The trailing space here is presumably part of the template string - try using a chomp keep to get yaml lint to accept it?
Signed-off-by: Dave <dave@gray101.com>
* Update deepseek.yaml
chomp didn't fix it; erase the space and see what happens.
Signed-off-by: Dave <dave@gray101.com>
* Update deepseek.yaml
Signed-off-by: Dave <dave@gray101.com>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: Dave <dave@gray101.com>
Co-authored-by: Dave <dave@gray101.com>
* feat(install.sh): support federated install
This adds support for federation by exposing:
- FEDERATED: true/false to share the instance
- FEDERATED_SERVER: true/false to start the federated load balancer (it
forwards requests to the federation)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* docs: update installer parameters
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Dave <dave@gray101.com>
* Git fetch specific branch instead of full tree during build
* Recursively create directories for all sources
---------
Signed-off-by: Dave <dave@gray101.com>
Signed-off-by: Dave Lee <dave@gray101.com>
Co-authored-by: Shane <dev@null.com>
Co-authored-by: Dave <dave@gray101.com>
* ci: Do not test the full matrix on PR
Hipblas and sycl take a long time to build from scratch for now. Until
we find a way to speed up image building, we are going to test these only
on master, and not for every open PR.
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* ci: do not run release workflow twice
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* feat(models): pull models from urls
When using `run`, we can now point directly to Hugging Face models via URL, for instance:
```bash
local-ai run huggingface://TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/tinyllama-1.1b-chat-v0.3.Q2_K.gguf
```
This will pull the gguf model and place it in the models folder - of course,
this depends on the gguf file being automatically detected by our guesser
mechanism for this to be effective.
Similarly, galleries can now refer to single files in API requests.
This also changes the download code: `yaml` files are now treated in the
same way, so config files are saved with the appropriate name
(and not hashed anymore).
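As a hedged illustration (the endpoint and payload are assumed from the existing model-gallery apply API, not spelled out in this commit), pointing the apply endpoint at a single file could look like:
```bash
curl http://localhost:8080/models/apply \
  -H "Content-Type: application/json" \
  -d '{"url": "huggingface://TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/tinyllama-1.1b-chat-v0.3.Q2_K.gguf"}'
```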
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* Adapt tests
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* use a sed hack to jam a missing line in place for grpc's abseil version.
Signed-off-by: Dave Lee <dave@gray101.com>
---------
Signed-off-by: Dave Lee <dave@gray101.com>
@@ -62,6 +66,8 @@ from diffusers.schedulers import (
PNDMScheduler,
UniPCMultistepScheduler,
)
# The scheduler list mapping was taken from here: https://github.com/neggles/animatediff-cli/blob/6f336f5f4b5e38e85d7f06f1744ef42d0a45f2a7/src/animatediff/schedulers.py#L39
# Credits to https://github.com/neggles
# See https://github.com/huggingface/diffusers/issues/4167 for more details on sched mapping from A1111
// ListAssistantsEndpoint is the OpenAI Assistant API endpoint to list assistants https://platform.openai.com/docs/api-reference/assistants/listAssistants
// @Summary List available assistants
// @Param limit query int false "Limit the number of assistants returned"
// @Param order query string false "Order of assistants returned"
// @Param after query string false "Return assistants created after the given ID"
// @Param before query string false "Return assistants created before the given ID"
// Because we're altering the existing assistants list we should just duplicate it for now.
@@ -230,13 +239,11 @@ func modelExists(cl *config.BackendConfigLoader, ml *model.ModelLoader, modelNam
return
}
// DeleteAssistantEndpoint is the OpenAI Assistant API endpoint to delete assistants https://platform.openai.com/docs/api-reference/assistants/deleteAssistant
<h2 class="text-center text-3xl font-semibold text-gray-100"><i class="text-yellow-200 ml-2 fa-solid fa-triangle-exclamation animate-pulse"></i> Ouch! seems you don't have any models installed!</h2>
<h2 class="text-center text-3xl font-semibold text-gray-100"><i class="text-yellow-200 ml-2 fa-solid fa-triangle-exclamation animate-pulse"></i> Ouch! seems you don't have any models installed from the LocalAI gallery!</h2>
<p class="text-center mt-4 text-xl">..install something from the <a class="text-gray-400 hover:text-white ml-1 px-3 py-2 rounded" href="/browse">🖼️ Gallery</a> or check the <a href="https://localai.io/basics/getting_started/" class="text-gray-400 hover:text-white ml-1 px-3 py-2 rounded"><i class="fa-solid fa-book"></i> Getting started documentation </a></p>
data-twe-ripple-color="light" data-twe-ripple-init="" hx-confirm="Are you sure you wish to delete the model?" hx-post="/browse/delete/model/{{.Name}}" hx-swap="outerHTML"><i class="fa-solid fa-cancel pr-2"></i>Delete</button>
<h5 class="mb-4 text-justify">LocalAI uses P2P technologies to enable distribution of work between peers. It is possible to share an instance with Federation and/or split the weights of a model across peers (only available with llama.cpp models). You can now share computational resources between your devices or your friends!</h5>
<!-- Warning box if p2p token is empty and p2p is enabled -->
<pclass="text-xl font-semibold text-white"><iclass="fa-solid fa-exclamation-triangle"></i> Warning: P2P mode is disabled or no token was specified</p>
<pclass="mb-4">You have to enable P2P mode by starting LocalAI with <code>--p2p</code>. Please restart the server with <code>--p2p</code> to generate a new token automatically that can be used to automatically discover other nodes. If you already have a token specify it with <code>export TOKEN=".."</code><ahref="https://localai.io/features/distribute/"target="_blank">
@@ -112,6 +112,8 @@ name: "" # Model name, used to identify the model in API calls.
# Precision settings for the model, reducing precision can enhance performance on some hardware.
f16: null # Whether to use 16-bit floating-point precision.
embeddings: true # Enable embeddings for the model.
# Concurrency settings for the application.
threads: null # Number of threads to use for processing.
@@ -150,7 +152,8 @@ function:
replace_function_results: [] # Placeholder to replace function call results with arbitrary strings or patterns.
replace_llm_results: [] # Replace language model results with arbitrary strings or patterns.
capture_llm_results: [] # Capture language model results as a text result, alongside the JSON, in function calls. For instance, if a model returns a block for "thinking" and a block for "response", this allows you to capture the thinking block.
return_name_in_function_response: false # Some models might prefer to use "name" rather than "function" when returning JSON data. This allows using "name" as a key in the JSON response.
function_name_key: "name"
function_arguments_key: "arguments"
# Feature gating flags to enable experimental or optional features.
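For illustration, these options might be combined under the `function:` section of a model configuration as below; the values simply restate the defaults shown above:
```yaml
function:
  replace_function_results: []
  replace_llm_results: []
  capture_llm_results: []
  return_name_in_function_response: false
  function_name_key: "name"
  function_arguments_key: "arguments"
```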
@@ -29,5 +29,9 @@ List of the Environment Variables:
| **THREADS** | Number of processor threads the application should use. Defaults to the number of logical cores minus one. |
| **VERSION** | Specifies the version of LocalAI to install. Defaults to the latest available version. |
| **MODELS_PATH** | Directory path where LocalAI models are stored (default is /usr/share/local-ai/models). |
| **P2P_TOKEN** | Token to use for the federation or for starting workers; see the [documentation]({{%relref "docs/features/distributed_inferencing" %}}) |
| **WORKER** | Set to "true" to make the instance a worker (a p2p token is required; see the [documentation]({{%relref "docs/features/distributed_inferencing" %}})) |
| **FEDERATED** | Set to "true" to share the instance with the federation (a p2p token is required; see the [documentation]({{%relref "docs/features/distributed_inferencing" %}})) |
| **FEDERATED_SERVER** | Set to "true" to run the instance as a federation server which forwards requests to the federation (a p2p token is required; see the [documentation]({{%relref "docs/features/distributed_inferencing" %}})) |
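As an illustrative sketch (the installer URL is taken from the LocalAI docs; combining the variables this way is an assumption):
```bash
# Share this instance with the federation:
curl https://localai.io/install.sh | P2P_TOKEN="<token>" FEDERATED=true sh

# Run this instance as the federation load balancer instead:
curl https://localai.io/install.sh | P2P_TOKEN="<token>" FEDERATED_SERVER=true sh
```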
We are looking into improving the installer, and as this is a first iteration any feedback is welcome! Open up an [issue](https://github.com/mudler/LocalAI/issues/new/choose) if something doesn't work for you!
> _Do you already have a model file? Skip to [Run models manually]({{%relref "docs/getting-started/models" %}})_.
To load models into LocalAI, you can either [use models manually]({{%relref "docs/getting-started/models" %}}) or configure LocalAI to pull the models from external sources, like Hugging Face, and configure the model.
To do that, you can point LocalAI to a URL of a YAML configuration file - however, LocalAI also has some popular model configurations embedded in the binary. Below you can find a list of the model configurations that LocalAI has pre-built; see [Model customization]({{%relref "docs/getting-started/customize-model" %}}) on how to configure models from URLs.
@@ -16,6 +16,10 @@ Here are answers to some of the most common questions.
Most gguf-based models should work, but newer models may require additions to the API. If a model doesn't work, please feel free to open an issue. However, be cautious about downloading models from the internet directly onto your machine, as there may be security vulnerabilities in llama.cpp or ggml that could be maliciously exploited. Some models can be found on Hugging Face: https://huggingface.co/models?search=gguf, and models from gpt4all are compatible too: https://github.com/nomic-ai/gpt4all.
### Benchmarking LocalAI and llama.cpp shows different results!
LocalAI applies a set of defaults when loading models with the llama.cpp backend; one of these is mirostat sampling - while it achieves better results, it slows down inference. You can disable this by setting `mirostat: 0` in the model config file. See also the advanced section ({{%relref "docs/advanced/advanced-usage" %}}) for more information and [this issue](https://github.com/mudler/LocalAI/issues/2780).
### What's the difference with Serge, or XXX?
LocalAI is a multi-model solution that doesn't focus on a specific model type (e.g., llama.cpp or alpaca.cpp); it handles all of these internally for faster inference and is easy to set up locally and deploy to Kubernetes.
This functionality enables LocalAI to distribute inference requests across multiple worker nodes, improving efficiency and performance. Nodes are automatically discovered and connect via p2p using a shared token, which ensures that communication between the nodes of the network is secure and private.
LocalAI supports two modes of distributed inferencing via p2p:
- **Federated Mode**: Requests are shared between the cluster and routed to a single worker node in the network based on the load balancer's decision.
- **Worker Mode** (aka "model sharding" or "splitting weights"): Requests are processed by all the workers, each of which contributes to the final inference result (by sharing the model weights).
## Usage
Starting LocalAI with `--p2p` generates a shared token for connecting multiple instances, and that's all you need to create AI clusters, eliminating the need for intricate network setups.
Simply navigate to the "Swarm" section in the WebUI and follow the on-screen instructions.
For fully shared instances, initiate LocalAI with `--p2p --federated` and adhere to the Swarm section's guidance. This feature, while still experimental, offers a tech preview quality experience.
### Federated mode
Federated mode allows you to launch multiple LocalAI instances and connect them together in a federated network. This mode is useful when you want to distribute the load of inference across multiple nodes but keep a single point of entry for the API. In the Swarm section of the WebUI, you can see the instructions to connect multiple instances together.
This will generate a token that you can use to connect other LocalAI instances to the network or others can use to join the network. If you already have a token, you can specify it using the `TOKEN` environment variable.
To start a load-balanced server that routes requests to the network, run it with the `TOKEN` set:
```bash
local-ai federated
```
To see all the available options, run `local-ai federated --help`.
The instructions are displayed in the "Swarm" section of the WebUI, guiding you through the process of connecting multiple instances.
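Putting this together, a minimal sketch (assuming the same shared token is exported on every machine) could look like:
```bash
# Single API entry point that load-balances requests across the federation:
TOKEN="<shared token>" local-ai federated

# Each instance that serves requests for the federation:
TOKEN="<shared token>" local-ai run --p2p --federated
```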
### Workers mode
{{% alert note %}}
This feature is available exclusively with llama-cpp compatible models.
This feature was introduced in [LocalAI pull request #2324](https://github.com/mudler/LocalAI/pull/2324) and is based on the upstream work in [llama.cpp pull request #6829](https://github.com/ggerganov/llama.cpp/pull/6829).
{{% /alert %}}
This functionality enables LocalAI to distribute inference requests across multiple worker nodes, improving efficiency and performance.
To connect multiple workers to a single LocalAI instance, first start a server in p2p mode:
```bash
local-ai run --p2p
```
### Starting Workers
Then navigate to the "Swarm" section of the WebUI to see the instructions for connecting multiple workers to the network.
Alternatively, you can build the RPC server following the llama.cpp [README](https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md), which is compatible with LocalAI.
### Starting LocalAI
To start the LocalAI server, which handles API requests, specify the worker addresses using the `LLAMACPP_GRPC_SERVERS` environment variable:
```bash
LLAMACPP_GRPC_SERVERS="address1:port,address2:port" local-ai run
```
The workload on the LocalAI server will then be distributed across the specified nodes.
## Peer-to-Peer Networking
Alternatively, you can build the RPC workers/server following the llama.cpp [README](https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md), which is compatible with LocalAI.
Workers can also connect to each other in a peer-to-peer network, distributing the workload in a decentralized manner.
A shared token between the server and the workers is required for communication within the peer-to-peer network. This feature supports both local network (using mDNS discovery) and DHT for communication across different networks.
The token is automatically generated when starting the server with the `--p2p` flag. Workers can be started with the token using `local-ai worker p2p-llama-cpp-rpc` and specifying the token via the environment variable `TOKEN` or with the `--token` argument.
A network is established between the server and workers using DHT and mDNS discovery protocols. The llama.cpp RPC server is automatically started and exposed to the peer-to-peer network, allowing the API server to connect.
When the HTTP server starts, it discovers workers in the network and creates port forwards to the local service. Llama.cpp is configured to use these services. For more details on the implementation, refer to [LocalAI pull request #2343](https://github.com/mudler/LocalAI/pull/2343).
### Usage
Use the WebUI to guide you in the process of starting new workers. This example shows the manual steps to highlight the process.
1. Start the server with `--p2p`:
```bash
./local-ai run --p2p
# 1:02AM INF loading environment variables from file envFile=.env
# 1:02AM INF Setting logging to info
# 1:02AM INF P2P mode enabled
# 1:02AM INF No token provided, generating one
# 1:02AM INF Generated Token:
# XXXXXXXXXXX
# 1:02AM INF Press a button to proceed
# Get the token in the Swarm section of the WebUI
```
Copy the token from the WebUI or via API call (e.g., `curl http://localhost:8000/p2p/token`) and save it for later use.
To reuse the same token later, restart the server with `--p2ptoken` or `P2P_TOKEN`.
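The next (elided) step starts the workers on the other machines; a hedged sketch of the manual command, based on the `local-ai worker p2p-llama-cpp-rpc` invocation described above (the exact flags may differ):
```bash
# On each worker machine, reuse the token generated by the server in step 1.
TOKEN="<token from step 1>" local-ai worker p2p-llama-cpp-rpc
```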
@@ -93,11 +120,7 @@ The server logs should indicate that new workers are being discovered.
3. Start inference as usual on the server initiated in step 1.
@@ -109,3 +132,20 @@ There are options that can be tweaked or parameters that can be set using enviro
| **LOCALAI_P2P_DISABLE_DHT** | Set to "true" to disable DHT and enable p2p layer to be local only (mDNS) |
| **LOCALAI_P2P_DISABLE_LIMITS** | Set to "true" to disable connection limits and resources management |
| **LOCALAI_P2P_TOKEN** | Set the token for the p2p network |
## Architecture
LocalAI uses https://github.com/libp2p/go-libp2p under the hood, the same project powering IPFS. Unlike other frameworks, LocalAI uses peer-to-peer networking without a single master server; instead, it relies on sub/gossip and ledger functionalities to achieve consensus across different peers.
[EdgeVPN](https://github.com/mudler/edgevpn) is used as a library to establish the network and expose the ledger functionality under a shared token, to ease automatic discovery and keep peer-to-peer networks separate and private.
When running in worker mode, the weights are split proportionally to the available memory; in federated mode, requests are distributed across the nodes, and each node has to load the model fully.
## Notes
- If running in p2p mode with container images, make sure you start the container with `--net host` or `network_mode: host` in the docker-compose file.
- Only a single model is supported currently.
- Ensure the server detects new workers before starting inference. Currently, additional workers cannot be added once inference has begun.
- For more details on the implementation, refer to [LocalAI pull request #2343](https://github.com/mudler/LocalAI/pull/2343)
After you have Go installed and working, you can install the required binaries for compiling the Go protobuf components via the following commands:
```bash
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
```
- Browse the Model Gallery from the Web Interface and install models with a couple of clicks. For more details, refer to the [Gallery Documentation]({{% relref "docs/features/model-gallery" %}}).
- Specify a model from the LocalAI gallery during startup, e.g., `local-ai run <model_gallery_name>`.
- Use a URI to specify a model file (e.g., `huggingface://...`, `oci://`, or `ollama://`) when starting LocalAI, e.g., `local-ai run huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf`.
- Specify a URL to a model configuration file when starting LocalAI, e.g., `local-ai run https://gist.githubusercontent.com/.../phi-2.yaml`.
- Manually install the models by copying the files into the models directory (`--models`).
# Run Models Manually
## Run and Install Models via the Gallery
To run models available in the LocalAI gallery, you can use the WebUI or specify the model name when starting LocalAI. Models can be found in the gallery via the Web interface, the [model gallery](https://models.localai.io), or the CLI with: `local-ai models list`.
To install a model from the gallery, use the model name as the URI. For example, to run LocalAI with the Hermes model, execute:
```bash
local-ai run hermes-2-theta-llama-3-8b
```
To install only the model, use:
```bash
local-ai models install hermes-2-theta-llama-3-8b
```
Note: The galleries available in LocalAI can be customized to point to a different URL or a local directory. For more information on how to set up your own gallery, see the [Gallery Documentation]({{% relref "docs/features/model-gallery" %}}).
## Run Models via URI
To run models via URI, specify a URI to a model file or a configuration file when starting LocalAI. Valid syntax includes:
- From OCIs: `oci://container_image:tag`, `ollama://model_id:tag`
- From configuration files: `https://gist.githubusercontent.com/.../phi-2.yaml`
Configuration files can be used to customize the model defaults and settings. For advanced configurations, refer to the [Customize Models section]({{% relref "docs/getting-started/customize-model" %}}).
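As a hedged sketch of such a configuration file: the `parameters`/`context_size` keys and all values below are assumptions for illustration, while `name`, `f16`, `embeddings`, and `threads` mirror the YAML reference quoted earlier on this page:
```yaml
name: phi-2                # Model name used in API calls
parameters:
  model: phi-2.Q8_0.gguf   # Model file inside the models directory (assumed layout)
context_size: 2048         # Example value
f16: true
embeddings: false
threads: 4
```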
### Examples
```bash
# Start LocalAI with the phi-2 model
local-ai run huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf
# Install and run a model from the Ollama OCI registry
local-ai run ollama://gemma:2b
# Run a model from a configuration file
local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
# Install and run a model from a standard OCI registry (e.g., Docker Hub)
local-ai run oci://localai/phi-2:latest
```
## Run Models Manually
Follow these steps to manually run models using LocalAI:
1. **Prepare Your Model and Configuration Files**:
Ensure you have a model file and, if necessary, a configuration YAML file. Customize model defaults and settings with a configuration file. For advanced configurations, refer to the [Advanced Documentation]({{% relref "docs/advanced" %}}).
2. **GPU Acceleration**:
For instructions on GPU acceleration, visit the [GPU Acceleration]({{% relref "docs/features/gpu-acceleration" %}}) page.
3. **Run LocalAI**:
Choose one of the following methods to run LocalAI:
@@ -160,5 +208,3 @@ For instructions on building LocalAI from source, see the [Build Section]({{% re
{{< /tabs >}}
For more model configurations, visit the [Examples Section](https://github.com/mudler/LocalAI/tree/master/examples/configurations).
@@ -38,13 +38,13 @@ For detailed instructions, see [Using container images]({{% relref "docs/getting
## Running LocalAI with All-in-One (AIO) Images
> _Already have a model file? Skip to [Run models manually]({{% relref "docs/getting-started/models" %}})_.
LocalAI's All-in-One (AIO) images are pre-configured with a set of models and backends to fully leverage almost all the features of LocalAI. If pre-configured models are not required, you can use the standard [images]({{% relref "docs/getting-started/container-images" %}}).
These images are available for both CPU and GPU environments. AIO images are designed for ease of use and require no additional configuration.
It is recommended to use AIO images if you prefer not to configure the models manually or via the web interface. For running specific models, refer to the [manual method]({{% relref "docs/getting-started/models" %}}).
The AIO images come pre-configured with the following features:
- Text to Speech (TTS)
@@ -66,5 +66,5 @@ Explore additional resources and community contributions:
- [Run from Container images]({{% relref "docs/getting-started/container-images" %}})
- [Examples to try from the CLI]({{% relref "docs/getting-started/try-it-out" %}})
- [Build LocalAI and the container image]({{% relref "docs/getting-started/build" %}})
@@ -17,10 +17,10 @@ After installation, install new models by navigating the model gallery, or by us
To install models with the WebUI, see the [Models section]({{%relref "docs/features/model-gallery" %}}).
With the CLI you can list the models with `local-ai models list` and install them with `local-ai models install <model-name>`.
You can also [run models manually]({{%relref "docs/getting-started/models" %}}) by copying files into the `models` directory.
{{% /alert %}}
You can test out the API endpoints using `curl`; a few examples are listed below. The models we are referring to here (`gpt-4`, `gpt-4-vision-preview`, `tts-1`, `whisper-1`) are the default models that come with the AIO images - you can also use any other model you have installed.
### Text Generation
@@ -193,4 +193,4 @@ Don't use the model file as `model` in the request unless you want to handle the
Use the model names as you would with OpenAI, as in the examples below. For instance, `gpt-4-vision-preview` or `gpt-4`.
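For example, a chat completion request against the default `gpt-4` alias might look like the following (host and port assume a default local setup):
```bash
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "How are you doing?"}]
  }'
```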