wire to grpc

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
wip reranking llama.cpp
2026-05-24 08:38:02 -04:00 · 2025-04-19 20:22:31 +02:00 · 2025-04-19 19:52:02 +02:00 · 2025-04-19 15:52:29 +02:00 · 2025-04-19 08:53:24 +02:00 · 2025-04-18 21:45:48 +00:00
366 changed files with 251800 additions and 7542 deletions
--- a/Generation/musicgen.bru
+++ b/Generation/musicgen.bru
@@ -1,23 +0,0 @@
 meta {
  name: musicgen
  type: http
  seq: 1
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/sound-generation
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model_id": "facebook/musicgen-small",
      "text": "Exciting 80s Newscast Interstitial",
      "duration_seconds": 8
  }
 }
--- a/Requests/backend
+++ b/Requests/backend
@@ -1,17 +0,0 @@
 meta {
  name: backend monitor
  type: http
  seq: 4
 }
 get {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/monitor
  body: json
  auth: none
 }
 body:json {
  {
    "model": "{{DEFAULT_MODEL}}"
  }
 }
--- a/monitor/backend-shutdown.bru
+++ b/monitor/backend-shutdown.bru
@@ -1,21 +0,0 @@
 meta {
  name: backend-shutdown
  type: http
  seq: 3
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/shutdown
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}"
  }
 }
--- a/Requests/bruno.json
+++ b/Requests/bruno.json
@@ -1,5 +0,0 @@
 {
  "version": "1",
  "name": "LocalAI Test Requests",
  "type": "collection"
 }
--- a/Requests/environments/localhost.bru
+++ b/Requests/environments/localhost.bru
@@ -1,6 +0,0 @@
 vars {
  HOST: localhost
  PORT: 8080
  DEFAULT_MODEL: gpt-3.5-turbo
  PROTOCOL: http://
 }
--- a/.bruno/LocalAI
+++ b/.bruno/LocalAI
@@ -1,11 +0,0 @@
 meta {
  name: get models list
  type: http
  seq: 2
 }
 get {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models
  body: none
  auth: none
 }
--- a/generation/Generate
+++ b/generation/Generate
@@ -1,25 +0,0 @@
 meta {
  name: Generate image
  type: http
  seq: 1
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
    "prompt": "<positive prompt>|<negative prompt>",
    "model": "model-name",
    "step": 51,
    "size": "1024x1024",
    "image": ""
  }
 }
--- a/text/-completions.bru
+++ b/text/-completions.bru
@@ -1,24 +0,0 @@
 meta {
  name: -completions
  type: http
  seq: 4
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/completions
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}",
      "prompt": "function downloadFile(string url, string outputPath) {",
      "max_tokens": 256,
      "temperature": 0.5
  }
 }
--- a/text/-edits.bru
+++ b/text/-edits.bru
@@ -1,23 +0,0 @@
 meta {
  name: -edits
  type: http
  seq: 5
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/edits
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}",
      "input": "What day of the wek is it?",
      "instruction": "Fix the spelling mistakes"
  }
 }
--- a/text/-embeddings.bru
+++ b/text/-embeddings.bru
@@ -1,22 +0,0 @@
 meta {
  name: -embeddings
  type: http
  seq: 6
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/embeddings
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}",
      "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
  }
 }
--- a/text/chat/chat
+++ b/text/chat/chat
@@ -1,30 +0,0 @@
 meta {
  name: chat completion -simple- 1 message-
  type: http
  seq: 4
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
    "model": "{{DEFAULT_MODEL}}",
    "messages": [
      {
        "role": "user",
        "content": "How could one use friction to cook an egg?"
      }
    ],
    "max_tokens": 256,
    "temperature": 0.2,
    "grammar": ""
  }
 }
--- a/text/chat/chat-completions
+++ b/text/chat/chat-completions
@@ -1,29 +0,0 @@
 meta {
  name: chat-completions -long-
  type: http
  seq: 5
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
       "model": "{{DEFAULT_MODEL}}",
       "messages": [{"role": "system", "content": "You are a helpful, intelligent pirate assistant. Always answer as helpfully as possible, while being a pirate and using pirate language. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."},
           {"role": "user", "content": "How could one use electricity to cook an egg?"},
           {"role": "assistant",
                  "content": "Shiver me timbers! Using electricity to cook an egg? Well, matey, I reckon that be a right curious idea! *adjusts spectacles* Now, I ain't sure if this be possible, but I'll do me best to help ye out.\n\nFirst things first, ye gotta understand that electricity be a powerful force, and it can't just cook an egg on its own. Ye see, electricity be like a mighty wind that can make things happen, but it needs somethin' to work with. So, if ye want to use electricity to cook an egg, ye gotta find a way to harness that power and make it do the cookin'.\n\nNow, I know what ye might be thinkin': \"How do I harness the power of electricity to cook an egg?\" Well, matey, there be a few ways to do it. One way be to use a special device called an \"electric frying pan.\" This be a pan that has a built-in heating element that gets hot when ye plug it into a wall socket. When the element gets hot, ye can crack an egg into the pan and watch as it cook"
              },
              {"role": "user", "content": "I don't have one of those, just a raw wire and plenty of power! How do we get it done?"}],
       "max_tokens": 1024,
       "temperature": 0.5
  }
 }
--- a/text/chat/chat-completions
+++ b/text/chat/chat-completions
@@ -1,25 +0,0 @@
 meta {
  name: chat-completions -stream-
  type: http
  seq: 6
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
       "model": "{{DEFAULT_MODEL}}",
       "messages": [{"role": "user", "content": "Explain how I can set sail on the ocean using only power generated by seagulls?"}],
       "max_tokens": 256,
       "temperature": 0.9,
       "stream": true
  }
 }
--- a/Requests/model
+++ b/Requests/model
@@ -1,22 +0,0 @@
 meta {
  name: add model gallery
  type: http
  seq: 10
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "url": "file:///home/dave/projects/model-gallery/huggingface/TheBloke__CodeLlama-7B-Instruct-GGML.yaml",
      "name": "test"
  }
 }
--- a/gallery/delete
+++ b/gallery/delete
@@ -1,21 +0,0 @@
 meta {
  name: delete model gallery
  type: http
  seq: 11
 }
 delete {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "name": "test"
  }
 }
--- a/Requests/model
+++ b/Requests/model
@@ -1,11 +0,0 @@
 meta {
  name: list MODELS in galleries
  type: http
  seq: 7
 }
 get {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/available
  body: none
  auth: none
 }
--- a/Requests/model
+++ b/Requests/model
@@ -1,11 +0,0 @@
 meta {
  name: list model GALLERIES
  type: http
  seq: 8
 }
 get {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
  body: none
  auth: none
 }
--- a/Requests/model
+++ b/Requests/model
@@ -1,11 +0,0 @@
 meta {
  name: model delete
  type: http
  seq: 7
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
  body: none
  auth: none
 }
--- a/Requests/model
+++ b/Requests/model
@@ -1,21 +0,0 @@
 meta {
  name: model gallery apply -gist-
  type: http
  seq: 12
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "id": "TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q2_K.bin"
  }
 }
--- a/Requests/model
+++ b/Requests/model
@@ -1,22 +0,0 @@
 meta {
  name: model gallery apply
  type: http
  seq: 9
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "id": "dave@TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q3_K_S.bin",
      "name": "codellama7b"
  }
 }
--- a/Requests/transcription/gb1.ogg
+++ b/Requests/transcription/gb1.ogg
--- a/Requests/transcription/transcribe.bru
+++ b/Requests/transcription/transcribe.bru
@@ -1,16 +0,0 @@
 meta {
  name: transcribe
  type: http
  seq: 1
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/audio/transcriptions
  body: multipartForm
  auth: none
 }
 body:multipart-form {
  file: @file(transcription/gb1.ogg)
  model: whisper-1
 }
--- a/Requests/tts/-tts.bru
+++ b/Requests/tts/-tts.bru
@@ -1,22 +0,0 @@
 meta {
  name: -tts
  type: http
  seq: 2
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}",
      "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
  }
 }
--- a/Requests/tts/musicgen.bru
+++ b/Requests/tts/musicgen.bru
@@ -1,23 +0,0 @@
 meta {
  name: musicgen
  type: http
  seq: 2
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "backend": "transformers",
      "model": "facebook/musicgen-small",
      "input": "80s Synths playing Jazz"
  }
 }
--- a/.devcontainer/docker-compose-devcontainer.yml
+++ b/.devcontainer/docker-compose-devcontainer.yml
@@ -7,7 +7,7 @@ services:
      args:
      - FFMPEG=true
      - IMAGE_TYPE=extras
-      - GO_TAGS=stablediffusion p2p tts
+      - GO_TAGS=p2p tts
    env_file:
      - ../.env
    ports:
--- a/.env
+++ b/.env
@@ -29,6 +29,9 @@
 ## Enable/Disable single backend (useful if only one GPU is available)
 # LOCALAI_SINGLE_ACTIVE_BACKEND=true
 # Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
 # LOCALAI_FORCE_BACKEND_SHUTDOWN=true
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
 ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
@@ -38,12 +41,12 @@
 ## Uncomment and set to true to enable rebuilding from source
 # REBUILD=true
-## Enable go tags, available: stablediffusion, tts
+## Enable go tags, available: p2p, tts
-## stablediffusion: image generation with stablediffusion
+## p2p: enable distributed inferencing
 ## tts: enables text-to-speech with go-piper 
 ## (requires REBUILD=true)
 #
-# GO_TAGS=stablediffusion
+# GO_TAGS=p2p
 ## Path where to store generated images
 # LOCALAI_IMAGE_PATH=/tmp/generated/images
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -29,10 +29,6 @@ updates:
    schedule:
      # Check for updates to GitHub Actions every weekday
      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/autogptq"
    schedule:
      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/bark"
    schedule:
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -1,4 +1,4 @@
-enhancements:
+enhancement:
 - head-branch: ['^feature', 'feature']
 dependencies:
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -9,7 +9,7 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - repository: "ggerganov/llama.cpp"
+          - repository: "ggml-org/llama.cpp"
            variable: "CPPLLAMA_VERSION"
            branch: "master"
          - repository: "ggerganov/whisper.cpp"
--- a/.github/workflows/dependabot_auto.yml
+++ b/.github/workflows/dependabot_auto.yml
@@ -14,7 +14,7 @@ jobs:
    steps:
      - name: Dependabot metadata
        id: metadata
-        uses: dependabot/fetch-metadata@v2.2.0
+        uses: dependabot/fetch-metadata@v2.3.0
        with:
          github-token: "${{ secrets.GITHUB_TOKEN }}"
          skip-commit-verification: true
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -33,7 +33,7 @@ jobs:
        run: |
          CGO_ENABLED=0 make build-api
      - name: rm
-        uses: appleboy/ssh-action@v1.2.0
+        uses: appleboy/ssh-action@v1.2.2
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
            rm: true
            target: ./local-ai
      - name: restarting
-        uses: appleboy/ssh-action@v1.2.0
+        uses: appleboy/ssh-action@v1.2.2
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
--- a/.github/workflows/generate_grpc_cache.yaml
+++ b/.github/workflows/generate_grpc_cache.yaml
@@ -2,9 +2,10 @@ name: 'generate and publish GRPC docker caches'
 on:
  workflow_dispatch:
-  push:
+
-    branches:
+  schedule:
-      - master
+    # daily at midnight
    - cron: '0 0 * * *'
 concurrency:
  group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -16,7 +17,7 @@ jobs:
      matrix:
        include:
          - grpc-base-image: ubuntu:22.04
-            runs-on: 'ubuntu-latest'
+            runs-on: 'arc-runner-set'
            platforms: 'linux/amd64,linux/arm64'
    runs-on: ${{matrix.runs-on}}
    steps:
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -15,7 +15,7 @@ jobs:
    strategy:
      matrix:
        include:
-          - base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
+          - base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64'
    runs-on: ${{matrix.runs-on}}
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -75,6 +75,7 @@ jobs:
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
            latest-image: 'latest-gpu-hipblas-core'
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
@@ -251,6 +252,7 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
            latest-image: 'latest-gpu-intel-f16-core'
          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
            tag-latest: 'false'
@@ -261,6 +263,7 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
            latest-image: 'latest-gpu-intel-f32-core'
  core-image-build:
    uses: ./.github/workflows/image_build.yml
@@ -339,6 +342,7 @@ jobs:
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
            latest-image: 'latest-gpu-nvidia-cuda-12-core'
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -351,17 +355,18 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
            latest-image: 'latest-gpu-nvidia-cuda-12-core'
          - build-type: 'vulkan'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-vulkan-ffmpeg-core'
            latest-image: 'latest-vulkan-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
            latest-image: 'latest-gpu-vulkan-core'
  gh-runner:
    uses: ./.github/workflows/image_build.yml
    with:
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -310,6 +310,11 @@ jobs:
          tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
          labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
      - name: Cleanup
        run: |
          docker builder prune -f
          docker system prune --force --volumes --all
      - name: Latest tag
        # run this on branches, when it is a tag and there is a latest-image defined
        if: github.event_name != 'pull_request' && inputs.latest-image != ''  && github.ref_type == 'tag'
--- a/.github/workflows/notify-models.yaml
+++ b/.github/workflows/notify-models.yaml
@@ -8,7 +8,7 @@ jobs:
  notify-discord:
    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
    env:
-        MODEL_NAME: hermes-2-theta-llama-3-8b
+        MODEL_NAME: gemma-3-12b-it
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
@@ -16,9 +16,9 @@ jobs:
        fetch-depth: 0 # needed to checkout all branches for this Action to work
    - uses: mudler/localai-github-action@v1
      with:
-        model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+        model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
        # Check the PR diff using the current branch and the base branch of the PR
-    - uses: GrantBirki/git-diff-action@v2.7.0
+    - uses: GrantBirki/git-diff-action@v2.8.0
      id: git-diff-action
      with:
            json_diff_file_output: diff.json
@@ -87,7 +87,7 @@ jobs:
  notify-twitter:
    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
    env:
-        MODEL_NAME: hermes-2-theta-llama-3-8b
+        MODEL_NAME: gemma-3-12b-it
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
@@ -99,7 +99,7 @@ jobs:
        docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
        until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready";  docker logs --tail 10 local-ai; sleep 2; done
      # Check the PR diff using the current branch and the base branch of the PR
-    - uses: GrantBirki/git-diff-action@v2.7.0
+    - uses: GrantBirki/git-diff-action@v2.8.0
      id: git-diff-action
      with:
            json_diff_file_output: diff.json
--- a/.github/workflows/notify-releases.yaml
+++ b/.github/workflows/notify-releases.yaml
@@ -14,7 +14,7 @@ jobs:
    steps:
    - uses: mudler/localai-github-action@v1
      with:
-        model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+        model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
    - name: Summarize
      id: summarize
      run: |
@@ -60,4 +60,4 @@ jobs:
        DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
      uses: Ilshidur/action-discord@master
      with:
-        args: ${{ steps.summarize.outputs.message }}
+        args: ${{ steps.summarize.outputs.message }}
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -237,40 +237,7 @@ jobs:
          detached: true
          connect-timeout-seconds: 180
          limit-access-to-actor: true
-  build-stablediffusion:
+
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache upx-ucl
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
      - name: Build stablediffusion
        run: |
          export PATH=$PATH:$GOPATH/bin
          make backend-assets/grpc/stablediffusion
          mkdir -p release && cp backend-assets/grpc/stablediffusion release
        env:
          GO_TAGS: stablediffusion
      - uses: actions/upload-artifact@v4
        with:
          name: stablediffusion
          path: release/
      - name: Release
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
            release/*
  build-macOS-x86_64:
    runs-on: macos-13
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.0
+        uses: securego/gosec@v2.22.3
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -78,57 +78,6 @@ jobs:
          make --jobs=5 --output-sync=target -C backend/python/diffusers
          make --jobs=5 --output-sync=target -C backend/python/diffusers test
  tests-parler-tts:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
          # Install UV
          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test parler-tts
        run: |
           make --jobs=5 --output-sync=target -C backend/python/parler-tts
           make --jobs=5 --output-sync=target -C backend/python/parler-tts test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
          limit-access-to-actor: true
  tests-openvoice:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
          # Install UV
          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test openvoice
        run: |
           make --jobs=5 --output-sync=target -C backend/python/openvoice
           make --jobs=5 --output-sync=target -C backend/python/openvoice test
  # tests-transformers-musicgen:
  #   runs-on: ubuntu-latest
  #   steps:
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -105,9 +105,7 @@ jobs:
          # Pre-build piper before we start tests in order to have shared libraries in place
          make sources/go-piper && \
          GO_TAGS="tts" make -C sources/go-piper piper.o && \
-          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
+          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/
          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
        env:
          CUDA_VERSION: 12-4
      - name: Cache grpc
@@ -129,7 +127,7 @@ jobs:
          cd grpc && cd cmake/build && sudo make --jobs 5 install
      - name: Test
        run: |
-          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
+          PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3.19
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -26,7 +26,7 @@
                "LOCALAI_P2P": "true",
                "LOCALAI_FEDERATED": "true"
            },
-            "buildFlags": ["-tags", "stablediffusion p2p tts", "-v"],
+            "buildFlags": ["-tags", "p2p tts", "-v"],
            "envFile": "${workspaceFolder}/.env",
            "cwd": "${workspaceRoot}"
        }
--- a/57
+++ b/57
@@ -15,8 +15,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
@@ -25,6 +24,7 @@ RUN apt-get update && \
        ca-certificates \
        curl libssl-dev \
        git \
        git-lfs \
        unzip upx-ucl && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
@@ -69,14 +69,10 @@ ENV PATH=/opt/rocm/bin:${PATH}
 # OpenBLAS requirements and stable diffusion
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
-        libopenblas-dev \
+        libopenblas-dev && \
        libopencv-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
 # Set up OpenCV
 RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
 WORKDIR /build
 ###################################
@@ -251,7 +247,7 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
 FROM requirements-drivers AS builder-base
-ARG GO_TAGS="stablediffusion tts p2p"
+ARG GO_TAGS="tts p2p"
 ARG GRPC_BACKENDS
 ARG MAKEFLAGS
 ARG LD_FLAGS="-s -w"
@@ -285,35 +281,12 @@ RUN <<EOT bash
    fi
 EOT
 ###################################
 ###################################
 # This first portion of builder holds the layers specifically used to build backend-assets/grpc/stablediffusion
 # In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
 FROM builder-base AS builder-sd
 # stablediffusion does not tolerate a newer version of abseil, copy only over enough elements to build it
 COPY Makefile .
 COPY go.mod .
 COPY go.sum .
 COPY backend/backend.proto ./backend/backend.proto
 COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
 COPY pkg/grpc ./pkg/grpc
 COPY pkg/stablediffusion ./pkg/stablediffusion
 RUN git init
 RUN make sources/go-stable-diffusion
 RUN touch prepare-sources
 # Actually build the backend
 RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion
 ###################################
 ###################################
 # The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
 # Adjustments to the build process should likely be made here.
-FROM builder-sd AS builder
+FROM builder-base AS builder
 # Install the pre-built GRPC
 COPY --from=grpc /opt/grpc /usr/local
@@ -331,7 +304,7 @@ RUN make prepare
 ## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
 ## (both will use CUDA or hipblas for the actual computation)
 RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
-        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
+        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
    else \
        make build; \
    fi
@@ -353,8 +326,6 @@ ARG FFMPEG
 COPY --from=grpc /opt/grpc /usr/local
 COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion /build/backend-assets/grpc/stablediffusion
 COPY .devcontainer-scripts /.devcontainer-scripts
 # Add FFmpeg
@@ -427,9 +398,6 @@ COPY --from=builder /build/local-ai ./
 # Copy shared libraries for piper
 COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
 # do not let stablediffusion rebuild (requires an older version of absl)
 COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
 # Change the shell to bash so we can use [[ tests below
 SHELL ["/bin/bash", "-c"]
 # We try to strike a balance between individual layer size (as that affects total push time) and total image size
@@ -443,8 +411,8 @@ RUN if [[ ( "${IMAGE_TYPE}" == "extras ")]]; then \
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/coqui \
    ; fi && \
-    if [[ ( "${EXTRA_BACKENDS}" =~ "parler-tts" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "faster-whisper" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-        make -C backend/python/parler-tts \
+        make -C backend/python/faster-whisper \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/diffusers \
@@ -453,9 +421,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/kokoro \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/openvoice \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/exllama2 \
    ; fi && \
@@ -466,17 +431,11 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/vllm \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/autogptq \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/bark \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/rerankers \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/mamba \
    ; fi
 # Make sure the models directory exists
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 MIT License
-Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)
+Copyright (c) 2023-2025 Ettore Di Giacinto (mudler@localai.io)
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/148
+++ b/148
@@ -6,9 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 # llama.cpp versions
-GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
+CPPLLAMA_VERSION?=6408210082cc0a61b992b487be7e2ff2efbb9e36
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
 CPPLLAMA_VERSION?=92bc493917d43b83e592349e138b54c90b1c3ea7
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -18,17 +16,13 @@ WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
 PIPER_REPO?=https://github.com/mudler/go-piper
 PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
 # stablediffusion version
 STABLEDIFFUSION_REPO?=https://github.com/mudler/go-stable-diffusion
 STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f
 # bark.cpp
 BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
 BARKCPP_VERSION?=v1.0.0
 # stablediffusion.cpp (ggml)
-STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
+STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=5eb15ef4d022bef4a391de4f5f6556e81fbb5024
+STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae
 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
@@ -155,7 +149,6 @@ ifeq ($(BUILD_TYPE),hipblas)
 	LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
 	export CXX=$(ROCM_HOME)/llvm/bin/clang++
 	export CC=$(ROCM_HOME)/llvm/bin/clang
 	# llama-ggml has no hipblas support, so override it here.
 	export STABLE_BUILD_TYPE=
 	export GGML_HIP=1
 	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -179,11 +172,6 @@ ifeq ($(STATIC),true)
 	LD_FLAGS+=-linkmode external -extldflags -static
 endif
 ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
 #	OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
 	OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
 endif
 ifeq ($(findstring tts,$(GO_TAGS)),tts)
 #	OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
 #	OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
@@ -195,8 +183,8 @@ endif
 ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -230,19 +218,6 @@ endif
 all: help
 ## go-llama.cpp
 sources/go-llama.cpp:
 	mkdir -p sources/go-llama.cpp
 	cd sources/go-llama.cpp && \
 	git init && \
 	git remote add origin $(GOLLAMA_REPO) && \
 	git fetch origin && \
 	git checkout $(GOLLAMA_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
 sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
 	$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
 ## bark.cpp
 sources/bark.cpp:
 	git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@@ -273,19 +248,6 @@ sources/go-piper:
 sources/go-piper/libpiper_binding.a: sources/go-piper
 	$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
 ## stable diffusion (onnx)
 sources/go-stable-diffusion:
 	mkdir -p sources/go-stable-diffusion
 	cd sources/go-stable-diffusion && \
 	git init && \
 	git remote add origin $(STABLEDIFFUSION_REPO) && \
 	git fetch origin && \
 	git checkout $(STABLEDIFFUSION_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
 sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
 	CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
 ## stablediffusion (ggml)
 sources/stablediffusion-ggml.cpp:
 	git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \
@@ -298,11 +260,7 @@ backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
 	$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
 backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
+	$(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/stablediffusion-ggml
 endif
 sources/onnxruntime:
 	mkdir -p sources/onnxruntime
@@ -331,21 +289,17 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
-get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp sources/go-stable-diffusion backend/cpp/llama/llama.cpp
+get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
 replace:
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
 	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
 	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
 prepare-sources: get-sources replace
 	$(GOCMD) mod download
@@ -353,9 +307,7 @@ prepare-sources: get-sources replace
 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C sources/go-llama.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-stable-diffusion clean
 	$(MAKE) -C sources/go-piper clean
 	$(MAKE) build
@@ -458,7 +410,7 @@ run: prepare ## run local-ai
 test-models/testmodel.ggml:
 	mkdir test-models
 	mkdir test-dir
-	wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
+	wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -470,11 +422,10 @@ prepare-test: grpcs
 test: prepare test-models/testmodel.ggml grpcs
 	@echo 'Running tests'
-	export GO_TAGS="tts stablediffusion debug"
+	export GO_TAGS="tts debug"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
 	$(MAKE) test-llama
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion
@@ -503,10 +454,6 @@ teardown-e2e:
 	rm -rf $(TEST_DIR) || true
 	docker stop $$(docker ps -q --filter ancestor=localai-tests)
 test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -558,18 +505,10 @@ protogen-go-clean:
 	$(RM) bin/*
 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen transformers-protogen parler-tts-protogen kokoro-protogen vllm-protogen openvoice-protogen
+protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean mamba-protogen-clean rerankers-protogen-clean transformers-protogen-clean parler-tts-protogen-clean kokoro-protogen-clean vllm-protogen-clean openvoice-protogen-clean
+protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
 .PHONY: autogptq-protogen
 autogptq-protogen:
 	$(MAKE) -C backend/python/autogptq protogen
 .PHONY: autogptq-protogen-clean
 autogptq-protogen-clean:
 	$(MAKE) -C backend/python/autogptq protogen-clean
 .PHONY: bark-protogen
 bark-protogen:
@@ -595,6 +534,14 @@ diffusers-protogen:
 diffusers-protogen-clean:
 	$(MAKE) -C backend/python/diffusers protogen-clean
 .PHONY: faster-whisper-protogen
 faster-whisper-protogen:
 	$(MAKE) -C backend/python/faster-whisper protogen
 .PHONY: faster-whisper-protogen-clean
 faster-whisper-protogen-clean:
 	$(MAKE) -C backend/python/faster-whisper protogen-clean
 .PHONY: exllama2-protogen
 exllama2-protogen:
 	$(MAKE) -C backend/python/exllama2 protogen
@@ -603,14 +550,6 @@ exllama2-protogen:
 exllama2-protogen-clean:
 	$(MAKE) -C backend/python/exllama2 protogen-clean
 .PHONY: mamba-protogen
 mamba-protogen:
 	$(MAKE) -C backend/python/mamba protogen
 .PHONY: mamba-protogen-clean
 mamba-protogen-clean:
 	$(MAKE) -C backend/python/mamba protogen-clean
 .PHONY: rerankers-protogen
 rerankers-protogen:
 	$(MAKE) -C backend/python/rerankers protogen
@@ -627,14 +566,6 @@ transformers-protogen:
 transformers-protogen-clean:
 	$(MAKE) -C backend/python/transformers protogen-clean
 .PHONY: parler-tts-protogen
 parler-tts-protogen:
 	$(MAKE) -C backend/python/parler-tts protogen
 .PHONY: parler-tts-protogen-clean
 parler-tts-protogen-clean:
 	$(MAKE) -C backend/python/parler-tts protogen-clean
 .PHONY: kokoro-protogen
 kokoro-protogen:
 	$(MAKE) -C backend/python/kokoro protogen
@@ -643,14 +574,6 @@ kokoro-protogen:
 kokoro-protogen-clean:
 	$(MAKE) -C backend/python/kokoro protogen-clean
 .PHONY: openvoice-protogen
 openvoice-protogen:
 	$(MAKE) -C backend/python/openvoice protogen
 .PHONY: openvoice-protogen-clean
 openvoice-protogen-clean:
 	$(MAKE) -C backend/python/openvoice protogen-clean
 .PHONY: vllm-protogen
 vllm-protogen:
 	$(MAKE) -C backend/python/vllm protogen
@@ -662,17 +585,14 @@ vllm-protogen-clean:
 ## GRPC
 # Note: it is duplicated in the Dockerfile
 prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/autogptq
 	$(MAKE) -C backend/python/bark
 	$(MAKE) -C backend/python/coqui
 	$(MAKE) -C backend/python/diffusers
 	$(MAKE) -C backend/python/faster-whisper
 	$(MAKE) -C backend/python/vllm
 	$(MAKE) -C backend/python/mamba
 	$(MAKE) -C backend/python/rerankers
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/kokoro
 	$(MAKE) -C backend/python/openvoice
 	$(MAKE) -C backend/python/exllama2
 prepare-test-extra: protogen-python
@@ -742,6 +662,13 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
 backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-avx512
 	$(MAKE) -C backend/cpp/llama-avx512 purge
 	$(info ${GREEN}I llama-cpp build info:avx512${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
 backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-avx
 	$(MAKE) -C backend/cpp/llama-avx purge
@@ -795,13 +722,6 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
 	mkdir -p backend-assets/util/
 	cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
 backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/llama-ggml
 endif
 backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
@@ -816,13 +736,6 @@ ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/piper
 endif
 backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/stablediffusion
 endif
 backend-assets/grpc/silero-vad: backend-assets/grpc backend-assets/lib/libonnxruntime.so.1
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/onnxruntime/include/" LIBRARY_PATH=$(CURDIR)/backend-assets/lib \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/silero-vad ./backend/go/vad/silero
@@ -883,7 +796,8 @@ docker-aio-all:
 docker-image-intel:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
+		--progress plain \
 		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -891,7 +805,7 @@ docker-image-intel:
 docker-image-intel-xpu:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -903,7 +817,7 @@ swagger:
 .PHONY: gen-assets
 gen-assets:
-	$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
+	$(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets
 ## Documentation
 docs/layouts/_default:
--- a/README.md
+++ b/README.md
@@ -1,7 +1,6 @@
 <h1 align="center">
  <br>
-  <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
+  <img height="300" src="./core/http/static/logo.png"> <br>
    LocalAI
 <br>
 </h1>
@@ -39,7 +38,7 @@
 </p>
 <p align="center">
-<a href="https://trendshift.io/repositories/1484" target="_blank"><img src="https://trendshift.io/api/badge/repositories/1484" alt="go-skynet%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
 </p>
 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
@@ -48,9 +47,58 @@
 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and more) locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
-![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
+
 ## 📚🆕 Local Stack Family
 🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:
 <table>
  <tr>
    <td width="50%" valign="top">
      <a href="https://github.com/mudler/LocalAGI">
        <img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
      </a>
    </td>
    <td width="50%" valign="top">
      <h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
      <p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
    </td>
  </tr>
  <tr>
    <td width="50%" valign="top">
      <a href="https://github.com/mudler/LocalRecall">
        <img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
      </a>
    </td>
    <td width="50%" valign="top">
      <h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
       <p>A RESTful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
    </td>
  </tr>
 </table>
 ## Screenshots
 | Talk Interface | Generate Audio |
 | --- | --- |
 | ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](./docs/assets/images/screenshots/screenshot_tts.png) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](./docs/assets/images/screenshots/screenshot_tts.png) |
 | Models Overview | Generate Images |
 | --- | --- |
 | ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](./docs/assets/images/screenshots/screenshot_gallery.png) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](./docs/assets/images/screenshots/screenshot_image.png) |
 | Chat Interface | Home |
 | --- | --- |
 | ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](./docs/assets/images/screenshots/screenshot_chat.png) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](./docs/assets/images/screenshots/screenshot_home.png) |
 | Login | Swarm |
 | --- | --- |
 |![Screenshot 2025-03-31 at 12-09-59 ](./docs/assets/images/screenshots/screenshot_login.png) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](./docs/assets/images/screenshots/screenshot_p2p.png) |
 ## 💻 Quickstart
 Run the installer script:
@@ -59,17 +107,21 @@ curl https://localai.io/install.sh | sh
 ```
 Or run with docker:
 ### CPU only image:
 ```bash
 # CPU only image:
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
-
+```
-# Nvidia GPU:
+### Nvidia GPU:
 ```bash
 docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
-
+```
-# CPU and GPU image (bigger size):
+### CPU and GPU image (bigger size):
 ```bash
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
-
+```
-# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
+### AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
 ```bash
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 ```
@@ -88,10 +140,13 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
 local-ai run oci://localai/phi-2:latest
 ```
-[💻 Getting started](https://localai.io/basics/getting_started/index.html)
+For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)
 ## 📰 Latest project news
 - Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
 - Apr 2025: WebUI overhaul, AIO images updates
 - Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
 - Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
 - Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
 - Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
@@ -105,19 +160,6 @@ local-ai run oci://localai/phi-2:latest
 Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
 ## 🔥🔥 Hot topics (looking for help):
 - Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
 - Realtime API https://github.com/mudler/LocalAI/issues/3714
 - WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
 - Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
 - Assistant API: https://github.com/mudler/LocalAI/issues/1273
 - Vulkan: https://github.com/mudler/LocalAI/issues/1647
 - Anthropic API: https://github.com/mudler/LocalAI/issues/1808
 If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
 ## 🚀 [Features](https://localai.io/features/)
 - 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
@@ -131,12 +173,10 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
 - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
 - 📈 [Reranker API](https://localai.io/features/reranker/)
 - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
 - [Agentic capabilities](https://github.com/mudler/LocalAGI)
 - 🔊 Voice activity detection (Silero-VAD support)
 - 🌍 Integrated WebUI!
 ## 💻 Usage
 Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
 ### 🔗 Community and integrations
@@ -212,7 +252,7 @@ A huge thank you to our generous sponsors who support this project covering CI e
 <p align="center">
  <a href="https://www.spectrocloud.com/" target="blank">
-    <img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
+    <img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
  </a>
  <a href="https://www.premai.io/" target="blank">
    <img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
--- a/aio/cpu/embeddings.yaml
+++ b/aio/cpu/embeddings.yaml
@@ -1,7 +1,7 @@
 name: text-embedding-ada-002
 embeddings: true
 name: text-embedding-ada-002
 parameters:
-  model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
+  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
 usage: |
    You can test this model with curl like this:
--- a/aio/cpu/image-gen.yaml
+++ b/aio/cpu/image-gen.yaml
@@ -1,56 +1,17 @@
 name: stablediffusion
-backend: stablediffusion
+backend: stablediffusion-ggml
 cfg_scale: 4.5
 options:
 - sampler:euler
 parameters:
-  model: stablediffusion_assets
+  model: stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf
-
+step: 25
 license: "BSD-3"
 urls:
 - https://github.com/EdVince/Stable-Diffusion-NCNN
 - https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
 description: |
     Stable Diffusion in NCNN with c++, supported txt2img and img2img
 download_files:
- filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
+- filename: "stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
-  sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
+  sha256: "b8944e9fe0b69b36ae1b5bb0185b3a7b8ef14347fe0fa9af6c64c4829022261f"
-  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
+  uri: "huggingface://second-state/stable-diffusion-v1-5-GGUF/stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
 - filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
  sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
 - filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
  sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
 - filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
  sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
 - filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
  sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
 - filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
  sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
 - filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
  sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
 - filename: "stablediffusion_assets/log_sigmas.bin"
  sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
 - filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
  sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
 - filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
  sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
 - filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
  sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
 - filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
  sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
 - filename: "stablediffusion_assets/vocab.txt"
  sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
 usage: |
        curl http://localhost:8080/v1/images/generations \
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -1,101 +1,57 @@
 name: gpt-4
 mmap: true
 parameters:
  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
 context_size: 8192
-
+f16: true
 stopwords:
 - "<|im_end|>"
 - "<dummy32000>"
 - "</tool_call>"
 - "<|eot_id|>"
 - "<|end_of_text|>"
 function:
  # disable injecting the "answer" tool
  disable_no_action: true
  grammar:
-    # This allows the grammar to also return messages
+    no_mixed_free_string: true
-    mixed_mode: true
+    schema_type: llama3.1 # or JSON is supported too (json)
-    # Suffix to add to the grammar
+  response_regex:
-    #prefix: '<tool_call>\n'
+  - <function=(?P<name>\w+)>(?P<arguments>.*)</function>
-    # Force parallel calls in the grammar
+mmap: true
-    # parallel_calls: true
+name: gpt-4
-
+parameters:
-  return_name_in_function_response: true
+  model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
-  # Without grammar uncomment the lines below
+stopwords:
-  # Warning: this is relying only on the capability of the
+- <|im_end|>
-  # LLM model to generate the correct function call.
+- <dummy32000>
-  json_regex_match: 
+- <|eot_id|>
-   - "(?s)<tool_call>(.*?)</tool_call>"
+- <|end_of_text|>
   - "(?s)<tool_call>(.*?)"
  replace_llm_results:
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
  replace_function_results: 
  # Replace everything that is not JSON array or object
  # 
  - key: '(?s)^[^{\[]*'
    value: ""
  - key: '(?s)[^}\]]*$'
    value: ""
  - key: "'([^']*?)'"
    value: "_DQUOTE_${1}_DQUOTE_"
  - key: '\\"'
    value: "__TEMP_QUOTE__"
  - key: "\'"
    value: "'"
  - key: "_DQUOTE_"
    value: '"'
  - key: "__TEMP_QUOTE__"
    value: '"'
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
 template:
  chat: |
-    {{.Input -}}
+    <|begin_of_text|><|start_header_id|>system<|end_header_id|>
-    <|im_start|>assistant
+    You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
    {{.Input }}
    <|start_header_id|>assistant<|end_header_id|>
  chat_message: |
-    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+    <|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
-    {{- if .FunctionCall }}
+    {{ if .FunctionCall -}}
-    <tool_call>
+    {{ else if eq .RoleName "tool" -}}
-    {{- else if eq .RoleName "tool" }}
+    The Function was executed and the response was:
-    <tool_response>
+    {{ end -}}
-    {{- end }}
+    {{ if .Content -}}
-    {{- if .Content}}
+    {{.Content -}}
-    {{.Content }}
+    {{ else if .FunctionCall -}}
-    {{- end }}
+    {{ range .FunctionCall }}
-    {{- if .FunctionCall}}
+    [{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
-    {{toJson .FunctionCall}}
+    {{ end }}
-    {{- end }}
+    {{ end -}}
-    {{- if .FunctionCall }}
+    <|eot_id|>
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |-
+  function: |
-    <|im_start|>system
+    <|start_header_id|>system<|end_header_id|>
-    You are a function calling AI model.
+    You are an expert in composing functions. You are given a question and a set of possible functions.
-    Here are the available tools:
+    Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
-    <tools>
+    If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
-    {{range .Functions}}
+    If you decide to invoke any of the function(s), you MUST put it in the format as follows:
-    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+    [func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
-    {{end}}
+    You SHOULD NOT include any other text in the response.
-    </tools>
+    Here is a list of functions in JSON format that you can invoke.
-    You should call the tools provided to you sequentially
+    {{toJson .Functions}}
-    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
+    <|eot_id|><|start_header_id|>user<|end_header_id|>
-    <scratchpad>
+    {{.Input}}
-    {step-by-step reasoning and plan in bullet points}
+    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
-    </scratchpad>
+
-    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
+download_files:
-    <tool_call>
+- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
-    {"arguments": <args-dict>, "name": <function-name>}
+  sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5
-    </tool_call><|im_end|>
+  uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
    {{.Input -}}
    <|im_start|>assistant
--- a/aio/cpu/vad.yaml
+++ b/aio/cpu/vad.yaml
@@ -0,0 +1,8 @@
 backend: silero-vad
 name: silero-vad
 parameters:
  model: silero-vad.onnx
 download_files:
 - filename: silero-vad.onnx
  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -1,31 +1,49 @@
 backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
 mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o
 roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
 mmproj: bakllava-mmproj.gguf
 parameters:
-  model: bakllava.gguf
+  model: minicpm-v-2_6-Q4_K_M.gguf
-
+stopwords:
 - <|im_end|>
 - <dummy32000>
 - </s>
 - <|endoftext|>
 template:
  chat: |
-    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{ .RoleName }}
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content }}
    {{ end -}}
    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
    {{ end -}}<|im_end|>
  completion: |
    {{.Input}}
-    ASSISTANT:
+  function: |
    <|im_start|>system
    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    For each function call return a json object with function name and arguments
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
 download_files:
- filename: bakllava.gguf
+- filename: minicpm-v-2_6-Q4_K_M.gguf
-  uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
+  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
- filename: bakllava-mmproj.gguf
+  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
-  uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
+- filename: minicpm-v-2_6-mmproj-f16.gguf
-
+  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
-usage: |
+  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "gpt-4-vision-preview",
        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/aio/entrypoint.sh
+++ b/aio/entrypoint.sh
@@ -129,7 +129,7 @@ detect_gpu
 detect_gpu_size
 PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
-export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
+export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vad.yaml,/aio/${PROFILE}/vision.yaml}"
 check_vars
--- a/aio/gpu-8g/embeddings.yaml
+++ b/aio/gpu-8g/embeddings.yaml
@@ -1,7 +1,7 @@
 embeddings: true
 name: text-embedding-ada-002
 backend: sentencetransformers
 parameters:
-  model: all-MiniLM-L6-v2
+  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
 usage: |
    You can test this model with curl like this:
--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@@ -1,101 +1,53 @@
-name: gpt-4
+context_size: 4096
-mmap: true
+f16: true
 parameters:
  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
 context_size: 8192
 stopwords:
 - "<|im_end|>"
 - "<dummy32000>"
 - "</tool_call>"
 - "<|eot_id|>"
 - "<|end_of_text|>"
 function:
-  # disable injecting the "answer" tool
+  capture_llm_results:
-  disable_no_action: true
+  - (?s)<Thought>(.*?)</Thought>
  grammar:
-    # This allows the grammar to also return messages
+    properties_order: name,arguments
-    mixed_mode: true
+  json_regex_match:
-    # Suffix to add to the grammar
+  - (?s)<Output>(.*?)</Output>
    #prefix: '<tool_call>\n'
    # Force parallel calls in the grammar
    # parallel_calls: true
  return_name_in_function_response: true
  # Without grammar uncomment the lines below
  # Warning: this is relying only on the capability of the
  # LLM model to generate the correct function call.
  json_regex_match: 
   - "(?s)<tool_call>(.*?)</tool_call>"
   - "(?s)<tool_call>(.*?)"
  replace_llm_results:
-  # Drop the scratchpad content from responses
+  - key: (?s)<Thought>(.*?)</Thought>
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
-  replace_function_results: 
+mmap: true
-  # Replace everything that is not JSON array or object
+name: gpt-4
-  # 
+parameters:
-  - key: '(?s)^[^{\[]*'
+  model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
-    value: ""
+stopwords:
-  - key: '(?s)[^}\]]*$'
+- <|im_end|>
-    value: ""
+- <dummy32000>
-  - key: "'([^']*?)'"
+- </s>
    value: "_DQUOTE_${1}_DQUOTE_"
  - key: '\\"'
    value: "__TEMP_QUOTE__"
  - key: "\'"
    value: "'"
  - key: "_DQUOTE_"
    value: '"'
  - key: "__TEMP_QUOTE__"
    value: '"'
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
 template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
-    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+    <|im_start|>{{ .RoleName }}
-    {{- if .FunctionCall }}
+    {{ if .FunctionCall -}}
-    <tool_call>
+    Function call:
-    {{- else if eq .RoleName "tool" }}
+    {{ else if eq .RoleName "tool" -}}
-    <tool_response>
+    Function response:
-    {{- end }}
+    {{ end -}}
-    {{- if .Content}}
+    {{ if .Content -}}
    {{.Content }}
-    {{- end }}
+    {{ end -}}
-    {{- if .FunctionCall}}
+    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
-    {{- end }}
+    {{ end -}}<|im_end|>
    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |-
+  function: |
    <|im_start|>system
-    You are a function calling AI model.
+    You are an AI assistant that executes function calls, and these are the tools at your disposal:
    Here are the available tools:
    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
-    </tools>
+    <|im_end|>
    You should call the tools provided to you sequentially
    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
    <scratchpad>
    {step-by-step reasoning and plan in bullet points}
    </scratchpad>
    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
    <tool_call>
    {"arguments": <args-dict>, "name": <function-name>}
    </tool_call><|im_end|>
    {{.Input -}}
-    <|im_start|>assistant
+    <|im_start|>assistant
 download_files:
 - filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
  sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
  uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
--- a/aio/gpu-8g/vad.yaml
+++ b/aio/gpu-8g/vad.yaml
@@ -0,0 +1,8 @@
 backend: silero-vad
 name: silero-vad
 parameters:
  model: silero-vad.onnx
 download_files:
 - filename: silero-vad.onnx
  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -1,35 +1,49 @@
 backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
 mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o
 roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
 mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
-  model: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  model: minicpm-v-2_6-Q4_K_M.gguf
-  temperature: 0.2
+stopwords:
-  top_k: 40
+- <|im_end|>
-  top_p: 0.95
+- <dummy32000>
-  seed: -1
+- </s>
-
+- <|endoftext|>
 template:
  chat: |
-    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{ .RoleName }}
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content }}
    {{ end -}}
    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
    {{ end -}}<|im_end|>
  completion: |
    {{.Input}}
-    ASSISTANT:
+  function: |
    <|im_start|>system
    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    For each function call return a json object with function name and arguments
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
 download_files:
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
+- filename: minicpm-v-2_6-Q4_K_M.gguf
-  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
+  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
- filename: llava-v1.6-7b-mmproj-f16.gguf
+  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
-  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
+- filename: minicpm-v-2_6-mmproj-f16.gguf
-
+  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
-usage: |
+  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "gpt-4-vision-preview",
        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/aio/intel/embeddings.yaml
+++ b/aio/intel/embeddings.yaml
@@ -1,7 +1,7 @@
 embeddings: true
 name: text-embedding-ada-002
 backend: sentencetransformers
 parameters:
-  model: all-MiniLM-L6-v2
+  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
 usage: |
    You can test this model with curl like this:
--- a/aio/intel/text-to-text.yaml
+++ b/aio/intel/text-to-text.yaml
@@ -1,103 +1,53 @@
-name: gpt-4
+context_size: 4096
-mmap: false
+f16: true
 context_size: 8192
 f16: false
 parameters:
  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
 stopwords:
 - "<|im_end|>"
 - "<dummy32000>"
 - "</tool_call>"
 - "<|eot_id|>"
 - "<|end_of_text|>"
 function:
-  # disable injecting the "answer" tool
+  capture_llm_results:
-  disable_no_action: true
+  - (?s)<Thought>(.*?)</Thought>
  grammar:
-    # This allows the grammar to also return messages
+    properties_order: name,arguments
-    mixed_mode: true
+  json_regex_match:
-    # Suffix to add to the grammar
+  - (?s)<Output>(.*?)</Output>
    #prefix: '<tool_call>\n'
    # Force parallel calls in the grammar
    # parallel_calls: true
  return_name_in_function_response: true
  # Without grammar uncomment the lines below
  # Warning: this is relying only on the capability of the
  # LLM model to generate the correct function call.
  json_regex_match: 
   - "(?s)<tool_call>(.*?)</tool_call>"
   - "(?s)<tool_call>(.*?)"
  replace_llm_results:
-  # Drop the scratchpad content from responses
+  - key: (?s)<Thought>(.*?)</Thought>
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
-  replace_function_results: 
+mmap: true
-  # Replace everything that is not JSON array or object
+name: gpt-4
-  # 
+parameters:
-  - key: '(?s)^[^{\[]*'
+  model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
-    value: ""
+stopwords:
-  - key: '(?s)[^}\]]*$'
+- <|im_end|>
-    value: ""
+- <dummy32000>
-  - key: "'([^']*?)'"
+- </s>
    value: "_DQUOTE_${1}_DQUOTE_"
  - key: '\\"'
    value: "__TEMP_QUOTE__"
  - key: "\'"
    value: "'"
  - key: "_DQUOTE_"
    value: '"'
  - key: "__TEMP_QUOTE__"
    value: '"'
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
 template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
-    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+    <|im_start|>{{ .RoleName }}
-    {{- if .FunctionCall }}
+    {{ if .FunctionCall -}}
-    <tool_call>
+    Function call:
-    {{- else if eq .RoleName "tool" }}
+    {{ else if eq .RoleName "tool" -}}
-    <tool_response>
+    Function response:
-    {{- end }}
+    {{ end -}}
-    {{- if .Content}}
+    {{ if .Content -}}
    {{.Content }}
-    {{- end }}
+    {{ end -}}
-    {{- if .FunctionCall}}
+    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
-    {{- end }}
+    {{ end -}}<|im_end|>
    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |-
+  function: |
    <|im_start|>system
-    You are a function calling AI model.
+    You are an AI assistant that executes function calls, and these are the tools at your disposal:
    Here are the available tools:
    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
-    </tools>
+    <|im_end|>
    You should call the tools provided to you sequentially
    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
    <scratchpad>
    {step-by-step reasoning and plan in bullet points}
    </scratchpad>
    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
    <tool_call>
    {"arguments": <args-dict>, "name": <function-name>}
    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
 download_files:
 - filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
  sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
  uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
--- a/aio/intel/vad.yaml
+++ b/aio/intel/vad.yaml
@@ -0,0 +1,8 @@
 backend: silero-vad
 name: silero-vad
 parameters:
  model: silero-vad.onnx
 download_files:
 - filename: silero-vad.onnx
  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/aio/intel/vision.yaml
+++ b/aio/intel/vision.yaml
@@ -1,35 +1,50 @@
 backend: llama-cpp
 context_size: 4096
-mmap: false
+f16: true
-f16: false
+mmap: true
 mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o
 roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
 mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
-  model: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  model: minicpm-v-2_6-Q4_K_M.gguf
-  temperature: 0.2
+stopwords:
-  top_k: 40
+- <|im_end|>
-  top_p: 0.95
+- <dummy32000>
-  seed: -1
+- </s>
-
+- <|endoftext|>
 template:
  chat: |
-    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{ .RoleName }}
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content }}
    {{ end -}}
    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
    {{ end -}}<|im_end|>
  completion: |
    {{.Input}}
-    ASSISTANT:
+  function: |
    <|im_start|>system
    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    For each function call return a json object with function name and arguments
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
 download_files:
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
+- filename: minicpm-v-2_6-Q4_K_M.gguf
-  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
+  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
- filename: llava-v1.6-7b-mmproj-f16.gguf
+  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
-  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
+- filename: minicpm-v-2_6-mmproj-f16.gguf
-
+  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
-usage: |
+  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "gpt-4-vision-preview",
        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -21,7 +21,8 @@ service Backend {
  rpc Status(HealthMessage) returns (StatusResponse) {}
  rpc StoresSet(StoresSetOptions) returns (Result) {}
-  rpc StoresReset(StoresResetOptions) returns (Result) {}
+  rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
  rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
  rpc Rerank(RerankRequest) returns (RerankResult) {}
@@ -77,10 +78,19 @@ message StoresSetOptions {
  repeated StoresValue Values = 2;
 }
-message StoresResetOptions {
+message StoresDeleteOptions {
  repeated StoresKey Keys = 1;
 }
 message StoresGetOptions {
  repeated StoresKey Keys = 1;
 }
 message StoresGetResult {
  repeated StoresKey Keys = 1;
  repeated StoresValue Values = 2;
 }
 message StoresFindOptions {
  StoresKey Key = 1;
  int32 TopK = 2;
@@ -153,6 +163,10 @@ message Reply {
  double timing_token_generation = 5;
 }
 message GrammarTrigger {
  string word = 1;
 }
 message ModelOptions {
  string Model = 1;
  int32 ContextSize = 2;
@@ -176,11 +190,7 @@ message ModelOptions {
  int32 NGQA = 20;
  string ModelFile = 21;
-  // AutoGPTQ
+
  string Device = 22;
  bool UseTriton = 23;
  string ModelBaseName = 24;
  bool UseFastTokenizer = 25;
  // Diffusers
  string PipelineType = 26;
@@ -214,6 +224,11 @@ message ModelOptions {
  int32  MaxModelLen = 54;
  int32  TensorParallelSize = 55;
  string LoadFormat = 58;
  bool   DisableLogStatus = 66;
  string DType = 67;
  int32  LimitImagePerPrompt = 68;
  int32  LimitVideoPerPrompt = 69;
  int32  LimitAudioPerPrompt = 70;
  string MMProj = 41;
@@ -237,6 +252,8 @@ message ModelOptions {
  string CacheTypeKey = 63;
  string CacheTypeValue = 64;
  repeated GrammarTrigger GrammarTriggers = 65;
 }
 message Result {
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -2,7 +2,7 @@
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is a hack for now, but it should be fixed in the future.
 set(TARGET myclip)
-add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
+add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
 install(TARGETS ${TARGET} LIBRARY)
 target_include_directories(myclip PUBLIC .)
 target_include_directories(myclip PUBLIC ../..)
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 TARGET?=--target grpc-server
 # Disable shared libs: we link against a static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
@@ -36,11 +36,18 @@ else ifeq ($(OS),Darwin)
 endif
 ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
 		-DCMAKE_C_COMPILER=icx \
 		-DCMAKE_CXX_COMPILER=icpx \
 		-DCMAKE_CXX_FLAGS="-fsycl" \
 		-DGGML_SYCL_F16=ON
 endif
 ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
 		-DCMAKE_C_COMPILER=icx \
 		-DCMAKE_CXX_COMPILER=icpx \
 		-DCMAKE_CXX_FLAGS="-fsycl"
 endif
 llama.cpp:
@@ -77,4 +84,4 @@ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 else
 	+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
 endif
-	cp llama.cpp/build/bin/grpc-server .
+	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -217,6 +217,7 @@ struct llama_client_slot
    bool infill = false;
    bool embedding = false;
    bool reranker = false;
    bool has_next_token = true;
    bool truncated = false;
    bool stopped_eos = false;
@@ -467,6 +468,10 @@ struct llama_server_context
    bool all_slots_are_idle = false;
    bool add_bos_token      = true;
    bool has_eos_token      = true;
    bool has_gpu = false;
    bool grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_triggers;
    int32_t n_ctx;  // total context for all clients / slots
@@ -505,12 +510,15 @@ struct llama_server_context
    bool load_model(const common_params &params_)
    {
        params = params_;
-        if (!params.mmproj.empty()) {
+        if (!params.mmproj.path.empty()) {
            multimodal = true;
            LOG_INFO("Multi Modal Mode Enabled", {});
-            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
+            clp_ctx = clip_init(params.mmproj.path.c_str(), clip_context_params {
                /* use_gpu */ has_gpu,
                /*verbosity=*/ GGML_LOG_LEVEL_INFO,
            });
            if(clp_ctx == nullptr) {
-                LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
+                LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
                return false;
            }
@@ -524,10 +532,16 @@ struct llama_server_context
        ctx = common_init.context.release();
        if (model == nullptr)
        {
-            LOG_ERR("unable to load model: %s", params.model.c_str());
+            LOG_ERR("unable to load model: %s", params.model.path.c_str());
            return false;
        }
        // Enable reranking if embeddings are enabled - moved after context initialization
        if (params.embedding) {
            params.reranking = true;
            LOG_INFO("Reranking enabled (embeddings are enabled)", {});
        }
        if (multimodal) {
            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
            const int n_embd_llm  = llama_model_n_embd(model);
@@ -706,6 +720,8 @@ struct llama_server_context
        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
        slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
        slot->sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
        slot->sparams.grammar_triggers = grammar_triggers;
        slot->sparams.grammar_lazy = grammar_lazy;
        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
            // Might be better to reject the request with a 400 ?
@@ -1150,6 +1166,14 @@ struct llama_server_context
            slot.has_next_token = false;
        }
        if (slot.n_past >= slot.n_ctx) {
            slot.truncated      = true;
            slot.stopped_limit = true;
            slot.has_next_token = false;
            LOG_VERBOSE("stopped due to running out of context capacity", {});
        }
        if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
        {
            slot.stopped_eos = true;
@@ -1337,7 +1361,7 @@ struct llama_server_context
        queue_results.send(res);
    }
-    void send_embedding(llama_client_slot &slot)
+    void send_embedding(llama_client_slot &slot, const llama_batch & batch)
    {
        task_result res;
        res.id = slot.task_id;
@@ -1359,16 +1383,96 @@ struct llama_server_context
        else
        {
            const float *data = llama_get_embeddings(ctx);
-            std::vector<float> embedding(data, data + n_embd);
+            std::vector<float> embd_res(n_embd, 0.0f);
            std::vector<std::vector<float>> embedding;
            for (int i = 0; i < batch.n_tokens; ++i) {
                if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
                    continue;
                }
                const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
                if (embd == NULL) {
                    embd = llama_get_embeddings_ith(ctx, i);
                }
                if (embd == NULL) {
                    LOG("failed to get embeddings");
                    continue;
                }
                // normalize only when there is pooling
                // TODO: configurable
                if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) {
                    common_embd_normalize(embd, embd_res.data(), n_embd, 2);
                    embedding.push_back(embd_res);
                } else {
                    embedding.push_back({ embd, embd + n_embd });
                }
            }
            // OAI compat
            res.result_json = json
            {
-                {"embedding", embedding },
+                {"embedding", embedding[0] },
            };
        }
        queue_results.send(res);
    }
-    void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id)
+    void send_rerank(llama_client_slot &slot, const llama_batch & batch)
    {
        // Builds the rerank result for `slot` from the embeddings produced for
        // `batch` and pushes it onto queue_results. The result JSON carries a
        // single "score" plus the slot's prompt token count under "tokens".
        task_result res;
        res.id = slot.task_id;
        res.multitask_id = slot.multitask_id;
        res.error = false;
        res.stop = true;
        float score = -1e6f; // Default score if we fail to get embeddings
        if (!params.reranking)
        {
            // Only a warning is logged here: res.error stays false, so the
            // default score above is what gets sent.
            LOG_WARNING("reranking disabled", {
                {"params.reranking", params.reranking},
            });
        }
        else if (ctx == nullptr)
        {
            // The result is still sent below, but flagged as an error.
            LOG_ERR("context is null, cannot perform reranking");
            res.error = true;
        }
        else
        {
            for (int i = 0; i < batch.n_tokens; ++i) {
                // Skip entries without the logits flag and entries whose first
                // sequence id is not this slot's id.
                if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
                    continue;
                }
                // Try the per-sequence embeddings first, then fall back to the
                // embeddings of the i-th batch entry.
                const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
                if (embd == NULL) {
                    embd = llama_get_embeddings_ith(ctx, i);
                }
                if (embd == NULL) {
                    LOG("failed to get embeddings");
                    continue;
                }
                // The score is the first float of the embedding buffer. If
                // several batch entries match, the last one wins.
                score = embd[0];
            }
        }
        // Format result as JSON similar to the embedding function
        res.result_json = json
        {
            {"score", score},
            {"tokens", slot.num_prompt_tokens}
        };
        queue_results.send(res);
    }
    void request_completion(int task_id, json data, bool infill, bool embedding, bool rerank, int multitask_id)
    {
        task_server task;
        task.id = task_id;
@@ -1376,6 +1480,7 @@ struct llama_server_context
        task.data = std::move(data);
        task.infill_mode = infill;
        task.embedding_mode = embedding;
        task.reranking_mode = rerank;
        task.type = TASK_TYPE_COMPLETION;
        task.multitask_id = multitask_id;
@@ -1507,7 +1612,7 @@ struct llama_server_context
            subtask_data["prompt"] = subtask_data["prompt"][i];
            // subtasks inherit everything else (infill mode, embedding mode, etc.)
-            request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
+            request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multiprompt_task.reranking_mode, multitask_id);
        }
    }
@@ -1546,6 +1651,7 @@ struct llama_server_context
                slot->infill       = task.infill_mode;
                slot->embedding    = task.embedding_mode;
                slot->reranker    = task.reranking_mode;
                slot->task_id      = task.id;
                slot->multitask_id = task.multitask_id;
@@ -1622,17 +1728,17 @@ struct llama_server_context
            {
                if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
                {
                    // this check is redundant (for good)
                    // we should never get here, because generation should already stopped in process_token()
                    // START LOCALAI changes
                    // Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
                    // See: https://github.com/mudler/LocalAI/issues/1333
                    // Context is exhausted, release the slot
                    slot.release();
                    send_final_response(slot);
-                    slot.cache_tokens.clear();
+                    slot.has_next_token = false;
-                    slot.n_past = 0;
+                    LOG_ERROR("context is exhausted, release the slot", {});
                    slot.truncated = false;
                    slot.has_next_token = true;
                    LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
                    continue;
                    // END LOCALAI changes
@@ -1983,7 +2089,15 @@ struct llama_server_context
                // prompt evaluated for embedding
                if (slot.embedding)
                {
-                    send_embedding(slot);
+                    send_embedding(slot, batch_view);
                    slot.release();
                    slot.i_batch = -1;
                    continue;
                }
                if (slot.reranker)
                {
                    send_rerank(slot, batch_view);
                    slot.release();
                    slot.i_batch = -1;
                    continue;
@@ -2077,7 +2191,11 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
 }
 std::function<void(int)> shutdown_handler;
-inline void signal_handler(int signal) { shutdown_handler(signal); }
+
 // Terminates the process with exit status 1; the signal number is not used.
 // NOTE(review): main() installs this as the SIGINT/SIGTERM handler. exit() is
 // not on the POSIX async-signal-safe list (unlike _Exit) - confirm this is
 // acceptable for a multi-threaded server.
 inline void signal_handler(int signal) {
    exit(1);
 }
 /////////////////////////////////
 ////////////////////////////////
@@ -2273,15 +2391,15 @@ static std::string get_all_kv_cache_types() {
 }
 static void params_parse(const backend::ModelOptions* request,
-                                common_params & params) {
+                                common_params & params, llama_server_context &llama) {
    // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
-    params.model = request->modelfile();
+    params.model.path = request->modelfile();
    if (!request->mmproj().empty()) {
    // get the directory of modelfile
-      std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
+      std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
-      params.mmproj = model_dir + "/"+ request->mmproj();
+      params.mmproj.path = model_dir + "/"+ request->mmproj();
    }
    //  params.model_alias ??
    params.model_alias =  request->modelfile();
@@ -2311,6 +2429,20 @@ static void params_parse(const backend::ModelOptions* request,
        add_rpc_devices(std::string(llama_grpc_servers));
    }
     // Decode options. Options are in the form optname:optvalue, or, for booleans, only optname.
    for (int i = 0; i < request->options_size(); i++) {
        std::string opt = request->options(i);
        char *optname = strtok(&opt[0], ":");
        char *optval = strtok(NULL, ":");
        if (optval == NULL) {
            optval = "true";
        }
        if (!strcmp(optname, "gpu")) {
            llama.has_gpu = true;
        }
    }
    // TODO: Add yarn
    if (!request->tensorsplit().empty()) {
@@ -2342,7 +2474,7 @@ static void params_parse(const backend::ModelOptions* request,
        scale_factor = request->lorascale();
     }
     // get the directory of modelfile
-     std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
+     std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
     params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
    }
    params.use_mlock = request->mlock();
@@ -2374,6 +2506,21 @@ static void params_parse(const backend::ModelOptions* request,
    if ( request->ropefreqscale() != 0.0f ) {
        params.rope_freq_scale = request->ropefreqscale();
    }
    if (request->grammartriggers_size() > 0) {
        LOG_INFO("configuring grammar triggers", {});
        llama.grammar_lazy = true;
        for (int i = 0; i < request->grammartriggers_size(); i++) {
            common_grammar_trigger trigger;
 	    trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
            trigger.value = request->grammartriggers(i).word();
 	    // trigger.at_start = request->grammartriggers(i).at_start();
            llama.grammar_triggers.push_back(trigger);
            LOG_INFO("grammar trigger", {
                { "word", trigger.value },
            });
        }
    }
 }
@@ -2389,7 +2536,7 @@ public:
  grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
    // Implement LoadModel RPC
    common_params params;
-    params_parse(request, params);
+    params_parse(request, params, llama);
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -2411,7 +2558,7 @@ public:
        json data = parse_options(true, request, llama);
        const int task_id = llama.queue_tasks.get_new_id();
        llama.queue_results.add_waiting_task_id(task_id);
-        llama.request_completion(task_id, data, false, false, -1);
+        llama.request_completion(task_id, data, false, false, false, -1);
        while (true)
        {
            task_result result = llama.queue_results.recv(task_id);
@@ -2465,7 +2612,7 @@ public:
        json data = parse_options(false, request, llama);
        const int task_id = llama.queue_tasks.get_new_id();
        llama.queue_results.add_waiting_task_id(task_id);
-        llama.request_completion(task_id, data, false, false, -1);
+        llama.request_completion(task_id, data, false, false, false, -1);
        std::string completion_text;
        task_result result = llama.queue_results.recv(task_id);
        if (!result.error && result.stop) {
@@ -2502,7 +2649,7 @@ public:
        json data = parse_options(false, request, llama);
        const int task_id = llama.queue_tasks.get_new_id();
        llama.queue_results.add_waiting_task_id(task_id);
-        llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, -1);
+        llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, false, -1);
        // get the result
        task_result result = llama.queue_results.recv(task_id);
        //std::cout << "Embedding result JSON" << result.result_json.dump() << std::endl;
@@ -2522,6 +2669,58 @@ public:
        return grpc::Status::OK;
    }
    grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
        // Tokenize the "prompt" entry of the parsed request options (the second
        // tokenize() argument is false) and copy each token into the response,
        // preserving order.
        json options = parse_options(false, request, llama);
        std::vector<llama_token> prompt_tokens = llama.tokenize(options["prompt"], false);
        for (const llama_token token_id : prompt_tokens) {
            response->add_tokens(token_id);
        }
        return grpc::Status::OK;
    }
    grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) {
        // Rerank RPC: queues a single reranking task built from the request's
        // query, documents and top_n, waits for its result, and fills
        // `rerankResult` with usage information and one entry per document.
        //
        // Returns grpc::Status::OK on success and an INTERNAL status when the
        // task reports an error (previously the error was dropped and the
        // caller received OK with an empty result list).
        //
        // NOTE(review): the task yields one score, which is assigned to every
        // document below, and top_n is only forwarded in the task data - it is
        // not applied to the returned list here.

        // Create a JSON object with the query and documents
        json data = {
            {"prompt", request->query()},
            {"documents", request->documents()},
            {"top_n", request->top_n()}
        };
        // Generate a new task ID
        const int task_id = llama.queue_tasks.get_new_id();
        llama.queue_results.add_waiting_task_id(task_id);
        // Queue the task with reranking mode enabled
        llama.request_completion(task_id, data, false, false, true, -1);
        // Get the result
        task_result result = llama.queue_results.recv(task_id);
        llama.queue_results.remove_waiting_task_id(task_id);
        // Surface task failures to the client instead of answering OK.
        if (result.error) {
            return grpc::Status(grpc::StatusCode::INTERNAL, "reranking task failed");
        }
        if (result.stop) {
            // Set usage information
            backend::Usage* usage = rerankResult->mutable_usage();
            usage->set_total_tokens(result.result_json.value("tokens", 0));
            usage->set_prompt_tokens(result.result_json.value("tokens", 0));
            // Get the score from the result
            float score = result.result_json.value("score", 0.0f);
            // Create document results for each input document
            for (int i = 0; i < request->documents_size(); i++) {
                backend::DocumentResult* doc_result = rerankResult->add_results();
                doc_result->set_index(i);
                doc_result->set_text(request->documents(i));
                doc_result->set_relevance_score(score);
            }
        }
        return grpc::Status::OK;
    }
    grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
        llama_client_slot* active_slot = llama.get_active_slot();
@@ -2554,7 +2753,9 @@ void RunServer(const std::string& server_address) {
  ServerBuilder builder;
  builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
  builder.RegisterService(&service);
-
+  builder.SetMaxMessageSize(50 * 1024 * 1024); // 50MB
  builder.SetMaxSendMessageSize(50 * 1024 * 1024); // 50MB
  builder.SetMaxReceiveMessageSize(50 * 1024 * 1024); // 50MB
  std::unique_ptr<Server> server(builder.BuildAndStart());
  std::cout << "Server listening on " << server_address << std::endl;
  server->Wait();
@@ -2563,6 +2764,20 @@ void RunServer(const std::string& server_address) {
 int main(int argc, char** argv) {
  std::string server_address("localhost:50051");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
    struct sigaction sigint_action;
    sigint_action.sa_handler = signal_handler;
    sigemptyset (&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT, &sigint_action, NULL);
    sigaction(SIGTERM, &sigint_action, NULL);
 #elif defined (_WIN32)
    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
    };
    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
  // Define long and short options
  struct option long_options[] = {
      {"addr", required_argument, nullptr, 'a'},
--- a/backend/cpp/llama/prepare.sh
+++ b/backend/cpp/llama/prepare.sh
@@ -21,6 +21,7 @@ fi
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is a hack for now, but it should be fixed in the future.
 cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
 cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
 cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
 echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
 cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
--- a/backend/cpp/llama/utils.hpp
+++ b/backend/cpp/llama/utils.hpp
@@ -61,6 +61,7 @@ struct task_server {
    json data;
    bool infill_mode = false;
    bool embedding_mode = false;
    bool reranking_mode = false;
    int multitask_id = -1;
 };
--- a/backend/go/image/stablediffusion-ggml/Makefile
+++ b/backend/go/image/stablediffusion-ggml/Makefile
@@ -8,6 +8,13 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 # keep standard at C11 and C++11
 CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
 GOCMD?=go
 CGO_LDFLAGS?=
 # Avoid parent make file overwriting CGO_LDFLAGS which is needed for hipblas
 CGO_LDFLAGS_SYCL=
 GO_TAGS?=
 LD_FLAGS?=
 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
@@ -21,7 +28,7 @@ else ifeq ($(BUILD_TYPE),openblas)
 # If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
-# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
+# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DGGML_HIP=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
@@ -36,16 +43,35 @@ else ifeq ($(OS),Darwin)
 	endif
 endif
-# ifeq ($(BUILD_TYPE),sycl_f16)
+ifeq ($(BUILD_TYPE),sycl_f16)
-# 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
-# endif
+		-DCMAKE_C_COMPILER=icx \
 		-DCMAKE_CXX_COMPILER=icpx \
 		-DSD_SYCL=ON \
 		-DGGML_SYCL_F16=ON
 	CC=icx
 	CXX=icpx
 	CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
 	CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
 	CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
 	CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
 endif
-# ifeq ($(BUILD_TYPE),sycl_f32)
+ifeq ($(BUILD_TYPE),sycl_f32)
-# 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
-# endif
+		-DCMAKE_C_COMPILER=icx \
 		-DCMAKE_CXX_COMPILER=icpx \
 		-DSD_SYCL=ON
 	CC=icx
 	CXX=icpx
 	CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
 	CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
 	CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
 	CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
 endif
 # warnings
-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+# CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
 # Find all .a archives in ARCHIVE_DIR
 # (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
@@ -86,11 +112,24 @@ endif
 	$(MAKE) $(COMBINED_LIB)
 gosd.o:
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
 	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
 else
 	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
 endif
 libsd.a: gosd.o
 	cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
 	$(AR) rcs libsd.a gosd.o
 stablediffusion-ggml:
 	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
 	CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
 ifneq ($(UPX),)
 	$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
 endif
 clean:
-	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
+	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
--- a/backend/go/image/stablediffusion-ggml/gosd.cpp
+++ b/backend/go/image/stablediffusion-ggml/gosd.cpp
@@ -35,6 +35,8 @@ const char* sample_method_str[] = {
    "ipndm",
    "ipndm_v",
    "lcm",
    "ddim_trailing",
    "tcd",
 };
 // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
@@ -173,6 +175,7 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
                            -1, //clip_skip
                            cfg_scale, // sfg_scale
                            3.5f,
 			    0, // eta
                            width,
                            height,
                            sample_method, 
--- a/backend/go/image/stablediffusion/main.go
+++ b/backend/go/image/stablediffusion/main.go
@@ -1,21 +0,0 @@
 package main
 // Note: this is started internally by LocalAI and a server is allocated for each model
 import (
 	"flag"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 var (
 	addr = flag.String("addr", "localhost:50051", "the address to connect to")
 )
 // main parses the command-line flags and starts the gRPC server for the Image
 // backend on the address given by -addr, panicking if StartServer returns an
 // error.
 func main() {
 	flag.Parse()
 	err := grpc.StartServer(*addr, &Image{})
 	if err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/image/stablediffusion/stablediffusion.go
+++ b/backend/go/image/stablediffusion/stablediffusion.go
@@ -1,33 +0,0 @@
 package main
 // This is a wrapper to satisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/stablediffusion"
 )
 type Image struct {
 	base.SingleThread
 	stablediffusion *stablediffusion.StableDiffusion
 }
 // Load creates the stable-diffusion instance from opts.ModelFile and stores
 // it on the receiver. The value returned by stablediffusion.New is stored
 // even when an error is returned alongside it.
 func (image *Image) Load(opts *pb.ModelOptions) error {
 	// Note: the Model here is a path to a directory containing the model files
 	instance, err := stablediffusion.New(opts.ModelFile)
 	image.stablediffusion = instance
 	return err
 }
 // GenerateImage forwards the request to the loaded stable-diffusion instance.
 // The numeric request fields are converted to int; the prompts and the
 // destination are passed through unchanged.
 func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
 	height, width := int(opts.Height), int(opts.Width)
 	mode, step, seed := int(opts.Mode), int(opts.Step), int(opts.Seed)
 	return image.stablediffusion.GenerateImage(
 		height, width, mode, step, seed,
 		opts.PositivePrompt, opts.NegativePrompt, opts.Dst)
 }
--- a/backend/go/llm/llama-ggml/llama.go
+++ b/backend/go/llm/llama-ggml/llama.go
@@ -1,204 +0,0 @@
 package main
 // This is a wrapper to satisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"fmt"
 	"github.com/go-skynet/go-llama.cpp"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
 type LLM struct {
 	base.SingleThread
 	llama *llama.LLama
 }
 // Load translates the gRPC model options into go-llama.cpp model options,
 // loads the model from opts.ModelFile and stores it on the receiver.
 // The error from llama.New is returned as-is; llm.llama is assigned whatever
 // llama.New returned, even when that error is non-nil.
 func (llm *LLM) Load(opts *pb.ModelOptions) error {
 	// RoPE defaults, used when the request leaves the fields at zero.
 	ropeFreqBase := float32(10000)
 	ropeFreqScale := float32(1)
 	if opts.RopeFreqBase != 0 {
 		ropeFreqBase = opts.RopeFreqBase
 	}
 	if opts.RopeFreqScale != 0 {
 		ropeFreqScale = opts.RopeFreqScale
 	}
 	llamaOpts := []llama.ModelOption{
 		llama.WithRopeFreqBase(ropeFreqBase),
 		llama.WithRopeFreqScale(ropeFreqScale),
 	}
 	// The options below are appended only when the request sets a non-zero
 	// (or true) value.
 	if opts.NGQA != 0 {
 		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
 	}
 	if opts.RMSNormEps != 0 {
 		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
 	}
 	if opts.ContextSize != 0 {
 		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
 	}
 	if opts.F16Memory {
 		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
 	}
 	if opts.Embeddings {
 		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
 	}
 	if opts.NGPULayers != 0 {
 		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
 	}
 	// These three are always forwarded, whatever their value.
 	llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
 	llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
 	llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
 	// Batch size falls back to 512 when the request does not set one.
 	if opts.NBatch != 0 {
 		llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
 	} else {
 		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
 	}
 	if opts.NUMA {
 		llamaOpts = append(llamaOpts, llama.EnableNUMA)
 	}
 	if opts.LowVRAM {
 		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
 	}
 	model, err := llama.New(opts.ModelFile, llamaOpts...)
 	llm.llama = model
 	return err
 }
 // buildPredictOptions converts the gRPC predict options into the list of
 // go-llama.cpp predict options shared by Predict, PredictStream and
 // Embeddings. Options are appended in a fixed order; most optional ones are
 // added only when the corresponding request field is non-zero or true.
 func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
 	// RoPE defaults, used when the request leaves the fields at zero.
 	ropeFreqBase := float32(10000)
 	ropeFreqScale := float32(1)
 	if opts.RopeFreqBase != 0 {
 		ropeFreqBase = opts.RopeFreqBase
 	}
 	if opts.RopeFreqScale != 0 {
 		ropeFreqScale = opts.RopeFreqScale
 	}
 	// Always-present sampling and prompt settings.
 	predictOptions := []llama.PredictOption{
 		llama.SetTemperature(opts.Temperature),
 		llama.SetTopP(opts.TopP),
 		llama.SetTopK(int(opts.TopK)),
 		llama.SetTokens(int(opts.Tokens)),
 		llama.SetThreads(int(opts.Threads)),
 		llama.WithGrammar(opts.Grammar),
 		llama.SetRopeFreqBase(ropeFreqBase),
 		llama.SetRopeFreqScale(ropeFreqScale),
 		llama.SetNegativePromptScale(opts.NegativePromptScale),
 		llama.SetNegativePrompt(opts.NegativePrompt),
 	}
 	if opts.PromptCacheAll {
 		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
 	}
 	if opts.PromptCacheRO {
 		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
 	}
 	// Expected absolute path
 	if opts.PromptCachePath != "" {
 		predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
 	}
 	if opts.Mirostat != 0 {
 		predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
 	}
 	if opts.MirostatETA != 0 {
 		predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
 	}
 	if opts.MirostatTAU != 0 {
 		predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
 	}
 	if opts.Debug {
 		predictOptions = append(predictOptions, llama.Debug)
 	}
 	// Stop words are forwarded unconditionally (possibly as an empty list).
 	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
 	// The presence penalty from the request is passed to llama.SetPenalty.
 	if opts.PresencePenalty != 0 {
 		predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
 	}
 	if opts.NKeep != 0 {
 		predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
 	}
 	if opts.Batch != 0 {
 		predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
 	}
 	if opts.F16KV {
 		predictOptions = append(predictOptions, llama.EnableF16KV)
 	}
 	if opts.IgnoreEOS {
 		predictOptions = append(predictOptions, llama.IgnoreEOS)
 	}
 	// A zero seed is treated as "not set" and is not forwarded.
 	if opts.Seed != 0 {
 		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
 	}
 	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
 	// The remaining settings are always forwarded, whatever their value.
 	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
 	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
 	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
 	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
 	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
 	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
 	predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
 	return predictOptions
 }
 // Predict runs a single completion for opts.Prompt using the options derived
 // from the request, returning the model's output and any error unchanged.
 func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
 	predictOptions := buildPredictOptions(opts)
 	return llm.llama.Predict(opts.Prompt, predictOptions...)
 }
 // PredictStream starts a prediction for opts.Prompt in a new goroutine and
 // returns nil straight away. Each token passed to the callback is sent on
 // results, and results is closed once Predict returns. A prediction error is
 // only printed to standard output; it is not reported to the caller.
 func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	predictOptions := buildPredictOptions(opts)
 	// The callback always returns true.
 	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
 		results <- token
 		return true
 	}))
 	go func() {
 		_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
 		if err != nil {
 			fmt.Println("err: ", err)
 		}
 		// Closing the channel is how the end of the stream is signalled.
 		close(results)
 	}()
 	return nil
 }
 // Embeddings returns the embedding vector for the request. When the request
 // carries pre-tokenized input (EmbeddingTokens) those tokens are embedded;
 // otherwise the text in opts.Embeddings is embedded.
 func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
 	options := buildPredictOptions(opts)
 	if len(opts.EmbeddingTokens) == 0 {
 		return llm.llama.Embeddings(opts.Embeddings, options...)
 	}
 	// Convert the request's token ids to int, as TokenEmbeddings expects.
 	tokenIDs := make([]int, len(opts.EmbeddingTokens))
 	for idx, raw := range opts.EmbeddingTokens {
 		tokenIDs[idx] = int(raw)
 	}
 	return llm.llama.TokenEmbeddings(tokenIDs, options...)
 }
--- a/backend/go/llm/llama-ggml/main.go
+++ b/backend/go/llm/llama-ggml/main.go
@@ -1,19 +0,0 @@
 package main
 import (
 	"flag"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 var (
 	addr = flag.String("addr", "localhost:50051", "the address to connect to")
 )
 // main parses the command-line flags and starts the gRPC server for the LLM
 // backend on the address given by -addr, panicking if StartServer returns an
 // error.
 func main() {
 	flag.Parse()
 	err := grpc.StartServer(*addr, &LLM{})
 	if err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/stores/store.go
+++ b/backend/go/stores/store.go
@@ -4,36 +4,101 @@ package main
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"container/heap"
 	"context"
 	"fmt"
 	"math"
-	"runtime"
+	"slices"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	chromem "github.com/philippgille/chromem-go"
 	"github.com/rs/zerolog/log"
 )
 // Store is the backend state for the stores service. It embeds
 // base.SingleThread and holds keys and values in two slices, together with
 // bookkeeping about key normalization and key length.
 type Store struct {
 	base.SingleThread
-	*chromem.DB
+
-	*chromem.Collection
+	// The sorted keys
 	keys [][]float32
 	// The sorted values
 	values [][]byte
 	// If for every K it holds that ||k||^2 = 1, then we can use the normalized distance functions
 	// TODO: Should we normalize incoming keys if they are not instead?
 	keysAreNormalized bool
 	// The first key decides the length of the keys
 	keyLen int
 }
 // Pair couples one key vector with the value bytes stored for it.
 // TODO: Only used for sorting using Go's builtin implementation. The interfaces are columnar because
 // that's theoretically best for memory layout and cache locality, but this isn't optimized yet.
 type Pair struct {
 	Key   []float32
 	Value []byte
 }
 // NewStore returns an empty Store: no keys or values, keysAreNormalized set
 // to true, and keyLen set to -1, the value StoresSet checks for before
 // recording the length of the first key.
 func NewStore() *Store {
-	return &Store{}
+	return &Store{
 		keys:              make([][]float32, 0),
 		values:            make([][]byte, 0),
 		keysAreNormalized: true,
 		keyLen:            -1,
 	}
 }
 // compareSlices orders two keys with slices.Compare after asserting that
 // they have the same length.
 func compareSlices(k1, k2 []float32) int {
 	lenA, lenB := len(k1), len(k2)
 	assert(lenA == lenB, fmt.Sprintf("compareSlices: len(k1) = %d, len(k2) = %d", lenA, lenB))
 	return slices.Compare(k1, k2)
 }
 // hasKey reports whether target occurs in unsortedSlice, scanning linearly
 // and stopping at the first key that compares equal.
 func hasKey(unsortedSlice [][]float32, target []float32) bool {
 	for _, candidate := range unsortedSlice {
 		if compareSlices(candidate, target) == 0 {
 			return true
 		}
 	}
 	return false
 }
 // findInSortedSlice binary-searches sortedSlice for target using
 // compareSlices and returns the result of slices.BinarySearchFunc: the
 // position and whether target was found there.
 func findInSortedSlice(sortedSlice [][]float32, target []float32) (int, bool) {
 	pos, found := slices.BinarySearchFunc(sortedSlice, target, compareSlices)
 	return pos, found
 }
 // isSortedPairs reports whether the pairs are in non-decreasing key order
 // according to compareSlices.
 func isSortedPairs(kvs []Pair) bool {
 	for idx := 0; idx+1 < len(kvs); idx++ {
 		prev, next := kvs[idx].Key, kvs[idx+1].Key
 		if compareSlices(prev, next) > 0 {
 			return false
 		}
 	}
 	return true
 }
 // isSortedKeys reports whether the keys are in non-decreasing order
 // according to compareSlices.
 func isSortedKeys(keys [][]float32) bool {
 	for idx := 0; idx+1 < len(keys); idx++ {
 		prev, next := keys[idx], keys[idx+1]
 		if compareSlices(prev, next) > 0 {
 			return false
 		}
 	}
 	return true
 }
 // sortIntoKeySlicese collects the Floats of every key into a new slice and
 // returns it sorted with compareSlices. The float slices themselves are not
 // copied.
 func sortIntoKeySlicese(keys []*pb.StoresKey) [][]float32 {
 	sorted := make([][]float32, 0, len(keys))
 	for _, key := range keys {
 		sorted = append(sorted, key.Floats)
 	}
 	slices.SortFunc(sorted, compareSlices)
 	assert(len(sorted) == len(keys), fmt.Sprintf("len(ks) = %d, len(keys) = %d", len(sorted), len(keys)))
 	assert(isSortedKeys(sorted), "keys are not sorted")
 	return sorted
 }
 // Load creates a new chromem database with a single collection named
 // "all-documents" and stores both on the receiver. opts is not used.
 // Returns the error from CreateCollection, if any.
 // NOTE(review): this diff shows the embedded *chromem.DB and
 // *chromem.Collection fields of Store as removed lines - confirm that
 // s.DB and s.Collection still exist on the side of the diff that keeps
 // this function.
 func (s *Store) Load(opts *pb.ModelOptions) error {
 	db := chromem.NewDB()
 	collection, err := db.CreateCollection("all-documents", nil, nil)
 	if err != nil {
 		return err
 	}
 	s.DB = db
 	s.Collection = collection
 	return nil
 }
@@ -46,25 +111,156 @@ func (s *Store) StoresSet(opts *pb.StoresSetOptions) error {
 	if len(opts.Keys) != len(opts.Values) {
 		return fmt.Errorf("len(keys) = %d, len(values) = %d", len(opts.Keys), len(opts.Values))
 	}
-	docs := []chromem.Document{}
+
 	if s.keyLen == -1 {
 		s.keyLen = len(opts.Keys[0].Floats)
 	} else {
 		if len(opts.Keys[0].Floats) != s.keyLen {
 			return fmt.Errorf("Try to add key with length %d when existing length is %d", len(opts.Keys[0].Floats), s.keyLen)
 		}
 	}
 	kvs := make([]Pair, len(opts.Keys))
 	for i, k := range opts.Keys {
-		docs = append(docs, chromem.Document{
+		if s.keysAreNormalized && !isNormalized(k.Floats) {
-			ID:      k.String(),
+			s.keysAreNormalized = false
-			Content: opts.Values[i].String(),
+			var sample []float32
-		})
+			if len(s.keys) > 5 {
 				sample = k.Floats[:5]
 			} else {
 				sample = k.Floats
 			}
 			log.Debug().Msgf("Key is not normalized: %v", sample)
 		}
 		kvs[i] = Pair{
 			Key:   k.Floats,
 			Value: opts.Values[i].Bytes,
 		}
 	}
-	return s.Collection.AddDocuments(context.Background(), docs, runtime.NumCPU())
+	slices.SortFunc(kvs, func(a, b Pair) int {
 		return compareSlices(a.Key, b.Key)
 	})
 	assert(len(kvs) == len(opts.Keys), fmt.Sprintf("len(kvs) = %d, len(opts.Keys) = %d", len(kvs), len(opts.Keys)))
 	assert(isSortedPairs(kvs), "keys are not sorted")
 	l := len(kvs) + len(s.keys)
 	merge_ks := make([][]float32, 0, l)
 	merge_vs := make([][]byte, 0, l)
 	i, j := 0, 0
 	for {
 		if i+j >= l {
 			break
 		}
 		if i >= len(kvs) {
 			merge_ks = append(merge_ks, s.keys[j])
 			merge_vs = append(merge_vs, s.values[j])
 			j++
 			continue
 		}
 		if j >= len(s.keys) {
 			merge_ks = append(merge_ks, kvs[i].Key)
 			merge_vs = append(merge_vs, kvs[i].Value)
 			i++
 			continue
 		}
 		c := compareSlices(kvs[i].Key, s.keys[j])
 		if c < 0 {
 			merge_ks = append(merge_ks, kvs[i].Key)
 			merge_vs = append(merge_vs, kvs[i].Value)
 			i++
 		} else if c > 0 {
 			merge_ks = append(merge_ks, s.keys[j])
 			merge_vs = append(merge_vs, s.values[j])
 			j++
 		} else {
 			merge_ks = append(merge_ks, kvs[i].Key)
 			merge_vs = append(merge_vs, kvs[i].Value)
 			i++
 			j++
 		}
 	}
 	assert(len(merge_ks) == l, fmt.Sprintf("len(merge_ks) = %d, l = %d", len(merge_ks), l))
 	assert(isSortedKeys(merge_ks), "merge keys are not sorted")
 	s.keys = merge_ks
 	s.values = merge_vs
 	return nil
 }
-func (s *Store) StoresReset(opts *pb.StoresResetOptions) error {
+func (s *Store) StoresDelete(opts *pb.StoresDeleteOptions) error {
-	err := s.DB.DeleteCollection("all-documents")
+	if len(opts.Keys) == 0 {
-	if err != nil {
+		return fmt.Errorf("no keys to delete")
 		return err
 	}
-	s.Collection, err = s.CreateCollection("all-documents", nil, nil)
+
-	return err
+	if len(opts.Keys) == 0 {
 		return fmt.Errorf("no keys to add")
 	}
 	if s.keyLen == -1 {
 		s.keyLen = len(opts.Keys[0].Floats)
 	} else {
 		if len(opts.Keys[0].Floats) != s.keyLen {
 			return fmt.Errorf("Trying to delete key with length %d when existing length is %d", len(opts.Keys[0].Floats), s.keyLen)
 		}
 	}
 	ks := sortIntoKeySlicese(opts.Keys)
 	l := len(s.keys) - len(ks)
 	merge_ks := make([][]float32, 0, l)
 	merge_vs := make([][]byte, 0, l)
 	tail_ks := s.keys
 	tail_vs := s.values
 	for _, k := range ks {
 		j, found := findInSortedSlice(tail_ks, k)
 		if found {
 			merge_ks = append(merge_ks, tail_ks[:j]...)
 			merge_vs = append(merge_vs, tail_vs[:j]...)
 			tail_ks = tail_ks[j+1:]
 			tail_vs = tail_vs[j+1:]
 		} else {
 			assert(!hasKey(s.keys, k), fmt.Sprintf("Key exists, but was not found: t=%d, %v", len(tail_ks), k))
 		}
 		log.Debug().Msgf("Delete: found = %v, t = %d, j = %d, len(merge_ks) = %d, len(merge_vs) = %d", found, len(tail_ks), j, len(merge_ks), len(merge_vs))
 	}
 	merge_ks = append(merge_ks, tail_ks...)
 	merge_vs = append(merge_vs, tail_vs...)
 	assert(len(merge_ks) <= len(s.keys), fmt.Sprintf("len(merge_ks) = %d, len(s.keys) = %d", len(merge_ks), len(s.keys)))
 	s.keys = merge_ks
 	s.values = merge_vs
 	assert(len(s.keys) >= l, fmt.Sprintf("len(s.keys) = %d, l = %d", len(s.keys), l))
 	assert(isSortedKeys(s.keys), "keys are not sorted")
 	assert(func() bool {
 		for _, k := range ks {
 			if _, found := findInSortedSlice(s.keys, k); found {
 				return false
 			}
 		}
 		return true
 	}(), "Keys to delete still present")
 	if len(s.keys) != l {
 		log.Debug().Msgf("Delete: Some keys not found: len(s.keys) = %d, l = %d", len(s.keys), l)
 	}
 	return nil
 }
 func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error) {
@@ -115,12 +311,16 @@ func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error)
 }
 func isNormalized(k []float32) bool {
-	var sum float32
+	var sum float64
 	for _, v := range k {
-		sum += v
+		v64 := float64(v)
 		sum += v64*v64
 	}
-	return sum == 1.0
+	s := math.Sqrt(sum)
 	return s >= 0.99 && s <= 1.01
 }
 // TODO: This we could replace with handwritten SIMD code
@@ -132,7 +332,7 @@ func normalizedCosineSimilarity(k1, k2 []float32) float32 {
 		dot += k1[i] * k2[i]
 	}
-	assert(dot >= -1 && dot <= 1, fmt.Sprintf("dot = %f", dot))
+	assert(dot >= -1.01 && dot <= 1.01, fmt.Sprintf("dot = %f", dot))
 	// 2.0 * (1.0 - dot) would be the Euclidean distance
 	return dot
@@ -222,7 +422,7 @@ func cosineSimilarity(k1, k2 []float32, mag1 float64) float32 {
 	sim := float32(dot / (mag1 * math.Sqrt(mag2)))
-	assert(sim >= -1 && sim <= 1, fmt.Sprintf("sim = %f", sim))
+	assert(sim >= -1.01 && sim <= 1.01, fmt.Sprintf("sim = %f", sim))
 	return sim
 }
--- a/backend/python/autogptq/README.md
+++ b/backend/python/autogptq/README.md
@@ -1,5 +0,0 @@
 # Creating a separate environment for the autogptq project
 ```
 make autogptq
 ```
--- a/backend/python/autogptq/backend.py
+++ b/backend/python/autogptq/backend.py
@@ -1,153 +0,0 @@
 #!/usr/bin/env python3
 from concurrent import futures
 import argparse
 import signal
 import sys
 import os
 import time
 import base64
 import grpc
 import backend_pb2
 import backend_pb2_grpc
 from auto_gptq import AutoGPTQForCausalLM
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from transformers import TextGenerationPipeline
 # Sleep interval used by the idle loop in serve(); the main thread only has to stay alive.
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """gRPC servicer that serves AutoGPTQ-quantized causal language models.

    ``LoadModel`` stores the model, tokenizer and model name on the instance;
    ``Predict`` builds a ``TextGenerationPipeline`` from them for each request.
    """

    def Health(self, request, context):
        """Return a fixed ``OK`` reply so callers can probe the backend."""
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        """Load the tokenizer and model named by ``request.Model``.

        The model path is resolved relative to the ``MODELS_PATH`` environment
        variable (default ``./``). The device defaults to ``cuda:0`` unless
        ``request.Device`` is set.

        Returns:
            ``backend_pb2.Result`` with ``success=True`` on success, or
            ``success=False`` and the error text when anything raises.
        """
        try:
            device = "cuda:0"
            if request.Device != "":
                device = request.Device
            # support loading local model files
            model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)
            # support model `Qwen/Qwen-VL-Chat-Int4`
            if "qwen-vl" in request.Model.lower():
                self.model_name = "Qwen-VL-Chat"
                model = AutoModelForCausalLM.from_pretrained(model_path,
                    trust_remote_code=request.TrustRemoteCode,
                    device_map="auto").eval()
            else:
                # recompile_vl_prompt() reads self.model_name for every
                # request, so it must be set for non-Qwen-VL models as well.
                self.model_name = request.Model
                model = AutoGPTQForCausalLM.from_quantized(model_path,
                    model_basename=request.ModelBaseName,
                    use_safetensors=True,
                    trust_remote_code=request.TrustRemoteCode,
                    device=device,
                    use_triton=request.UseTriton,
                    quantize_config=None)
            self.model = model
            self.tokenizer = tokenizer
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def Predict(self, request, context):
        """Generate text for ``request.Prompt`` and return it UTF-8 encoded.

        Zero-valued ``Penalty``, ``Tokens`` and ``TopP`` fall back to 1.0, 512
        and 0.95 respectively. The prompt is stripped from the generated text.
        Image files written for the prompt are removed afterwards, also when
        generation fails.
        """
        penalty = request.Penalty if request.Penalty != 0.0 else 1.0
        tokens = request.Tokens if request.Tokens != 0 else 512
        top_p = request.TopP if request.TopP != 0.0 else 0.95

        compiled_prompt, image_paths = self.recompile_vl_prompt(request)
        print(f"Prompt: {compiled_prompt}", file=sys.stderr)
        try:
            pipeline = TextGenerationPipeline(
                model=self.model,
                tokenizer=self.tokenizer,
                max_new_tokens=tokens,
                temperature=request.Temperature,
                top_p=top_p,
                repetition_penalty=penalty,
                )
            t = pipeline(compiled_prompt)[0]["generated_text"]
            print(f"generated_text: {t}", file=sys.stderr)
            # The pipeline output may echo the prompt; drop it from the reply.
            t = t.replace(compiled_prompt, "")
        finally:
            # house keeping. Remove the image files from /tmp folder, even
            # when the pipeline raised, so failed requests do not leak files.
            for img_path in image_paths:
                try:
                    os.remove(img_path)
                except OSError as e:
                    print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
        # NOTE(review): Health answers with backend_pb2.Reply(bytes) while this
        # returns backend_pb2.Result with a bytes message - confirm the message
        # type against backend.proto.
        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))

    def PredictStream(self, request, context):
        """Stream the prediction as a single message.

        Token-by-token streaming is not implemented yet. The whole result is
        yielded once so the handler is an iterator, which a response-streaming
        RPC handler has to be.
        """
        yield self.Predict(request, context)

    def recompile_vl_prompt(self, request):
        """Materialize request images and splice their paths into the prompt.

        For Qwen-VL models each base64 entry of ``request.Images`` is decoded
        and written to its own file under ``/tmp``, and the ``[img-N]``
        placeholder in the prompt is replaced by ``<img>path</img>,``. For any
        other model the prompt is returned unchanged.

        Returns:
            Tuple ``(prompt, image_paths)``; the caller deletes the files
            listed in ``image_paths``.
        """
        import tempfile

        prompt = request.Prompt
        image_paths = []
        # getattr: tolerate being called before a model name was recorded.
        if "qwen-vl" in getattr(self, "model_name", "").lower():
            for i, img in enumerate(request.Images):
                # Decode first so a malformed image does not leave an empty file behind.
                data = base64.b64decode(img)
                # mkstemp guarantees a unique name; a millisecond timestamp
                # collides when several images are written in the same loop.
                fd, img_path = tempfile.mkstemp(prefix="vl-", suffix=".jpg", dir="/tmp")
                with os.fdopen(fd, "wb") as f:
                    f.write(data)
                image_paths.append(img_path)
                prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
        return (prompt, image_paths)
 def serve(address):
    """Start the gRPC backend on ``address`` and block until it is terminated."""
    executor = futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    grpc_server = grpc.server(executor)
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), grpc_server)
    grpc_server.add_insecure_port(address)
    grpc_server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    def _shutdown(signum, frame):
        # Stop without a grace period and leave the process.
        print("Received termination signal. Shutting down...")
        grpc_server.stop(0)
        sys.exit(0)

    # Both SIGINT and SIGTERM trigger the same shutdown path.
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, _shutdown)

    # The server runs on worker threads; keep the main thread alive.
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        grpc_server.stop(0)
 if __name__ == "__main__":
    # Read the listen address from the command line and run the server.
    arg_parser = argparse.ArgumentParser(description="Run the gRPC server.")
    arg_parser.add_argument(
        "--addr",
        default="localhost:50051",
        help="The address to bind the server to.",
    )
    cli_args = arg_parser.parse_args()
    serve(cli_args.addr)
--- a/backend/python/autogptq/requirements-cublas11.txt
+++ b/backend/python/autogptq/requirements-cublas11.txt
@@ -1,2 +0,0 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.4.1+cu118
--- a/backend/python/autogptq/requirements-cublas12.txt
+++ b/backend/python/autogptq/requirements-cublas12.txt
@@ -1 +0,0 @@
 torch==2.4.1
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +0,0 @@
 accelerate
 auto-gptq==0.7.1
 grpcio==1.69.0
 protobuf
 certifi
 transformers
--- a/backend/python/bark/backend.py
+++ b/backend/python/bark/backend.py
@@ -61,7 +61,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)
 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.69.0
+grpcio==1.71.0
 protobuf
 certifi
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.69.0
+grpcio==1.71.0
 protobuf
 grpcio-tools
--- a/backend/python/coqui/backend.py
+++ b/backend/python/coqui/backend.py
@@ -86,7 +86,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)
 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/coqui/requirements-cpu.txt
+++ b/backend/python/coqui/requirements-cpu.txt
@@ -1,4 +1,4 @@
-transformers
+transformers==4.48.3
 accelerate
 torch==2.4.1
 coqui-tts
--- a/backend/python/coqui/requirements-cublas11.txt
+++ b/backend/python/coqui/requirements-cublas11.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.4.1+cu118
 torchaudio==2.4.1+cu118
-transformers
+transformers==4.48.3
 accelerate
 coqui-tts
--- a/backend/python/coqui/requirements-cublas12.txt
+++ b/backend/python/coqui/requirements-cublas12.txt
@@ -1,5 +1,5 @@
 torch==2.4.1
 torchaudio==2.4.1
-transformers
+transformers==4.48.3
 accelerate
 coqui-tts
--- a/backend/python/coqui/requirements-hipblas.txt
+++ b/backend/python/coqui/requirements-hipblas.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch==2.4.1+rocm6.0
 torchaudio==2.4.1+rocm6.0
-transformers
+transformers==4.48.3
 accelerate
 coqui-tts
--- a/backend/python/coqui/requirements-intel.txt
+++ b/backend/python/coqui/requirements-intel.txt
@@ -5,6 +5,6 @@ torchaudio==2.3.1+cxx11.abi
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
 setuptools
-transformers
+transformers==4.48.3
 accelerate
 coqui-tts
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.71.0
 protobuf
 certifi
 packaging==24.1
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -19,7 +19,7 @@ import grpc
 from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
    EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
-from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
+from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline, Lumina2Text2ImgPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
 from diffusers.utils import load_image, export_to_video
 from compel import Compel, ReturnedEmbeddingsType
@@ -159,6 +159,18 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                torchType = torch.float16
                variant = "fp16"
            options = request.Options
            # empty dict
            self.options = {}
            # The options are a list of strings in this form optname:optvalue
            # We are storing all the options in a dict so we can use it later when
            # generating the images
            for opt in options:
                key, value = opt.split(":")
                self.options[key] = value
            local = False
            modelFile = request.Model
@@ -275,6 +287,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                    if request.LowVRAM:
                        self.pipe.enable_model_cpu_offload()
            elif request.PipelineType == "Lumina2Text2ImgPipeline":
                self.pipe = Lumina2Text2ImgPipeline.from_pretrained(
                    request.Model,
                    torch_dtype=torch.bfloat16)
                if request.LowVRAM:
                    self.pipe.enable_model_cpu_offload()
            elif request.PipelineType == "SanaPipeline":
                self.pipe = SanaPipeline.from_pretrained(
                    request.Model,
@@ -441,6 +459,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        # create a dictionary of parameters by using the keys from EnableParameters and the values from defaults
        kwargs = {key: options.get(key) for key in keys if key in options}
        # populate kwargs from self.options.
        kwargs.update(self.options)
        # Set seed
        if request.seed > 0:
            kwargs["generator"] = torch.Generator(device=self.device).manual_seed(
@@ -501,7 +522,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.69.0
+grpcio==1.71.0
 pillow
 protobuf
 certifi
--- a/backend/python/exllama2/backend.py
+++ b/backend/python/exllama2/backend.py
@@ -105,7 +105,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.71.0
 protobuf
 certifi
 wheel
--- a/backend/python/faster-whisper/Makefile
+++ b/backend/python/faster-whisper/Makefile
@@ -1,6 +1,9 @@
-.PHONY: autogptq
+.DEFAULT_GOAL := install
-autogptq: protogen
+
 .PHONY: install
 install:
 	bash install.sh
 	$(MAKE) protogen
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
@@ -10,7 +13,7 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+	bash protogen.sh
 .PHONY: clean
 clean: protogen-clean
--- a/backend/python/faster-whisper/backend.py
+++ b/backend/python/faster-whisper/backend.py
@@ -0,0 +1,99 @@
 #!/usr/bin/env python3
 """
 This is an extra gRPC server of LocalAI for faster-whisper audio transcription
 """
 from concurrent import futures
 import time
 import argparse
 import signal
 import sys
 import os
 import backend_pb2
 import backend_pb2_grpc
 from faster_whisper import WhisperModel
 import grpc
 # Sleep interval used by the idle loop in serve(); the main thread only has to stay alive.
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
 # NOTE(review): not referenced anywhere else in the visible part of this file;
 # looks like a leftover from the coqui backend - confirm before removing.
 COQUI_LANGUAGE = os.environ.get('COQUI_LANGUAGE', None)
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    BackendServicer is the class that implements the gRPC service
    for audio transcription with faster-whisper.
    """

    def Health(self, request, context):
        """Return a fixed ``OK`` reply so callers can probe the backend."""
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        """Create the ``WhisperModel`` named by ``request.Model``.

        Returns:
            ``backend_pb2.Result`` with ``success=True`` on success, or
            ``success=False`` and the error text when loading raises.
        """
        # float16 is only requested on CUDA. On CPU the compute type is left
        # at "default" so the runtime picks a type the device supports instead
        # of rejecting an explicit float16 request.
        if request.CUDA:
            device, compute_type = "cuda", "float16"
        else:
            device, compute_type = "cpu", "default"
        try:
            print("Preparing models, please wait", file=sys.stderr)
            self.model = WhisperModel(request.Model, device=device, compute_type=compute_type)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def AudioTranscription(self, request, context):
        """Transcribe the audio file at ``request.dst``.

        Transcription is best effort: when an error occurs it is logged to
        stderr and whatever was transcribed before the failure is returned.

        Returns:
            ``backend_pb2.TranscriptResult`` with one segment per transcribed
            chunk (ids counted from 0) and the concatenated text.
        """
        result_segments = []
        texts = []
        try:
            segments, _info = self.model.transcribe(request.dst, beam_size=5, condition_on_previous_text=False)
            for index, segment in enumerate(segments):
                print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text), file=sys.stderr)
                result_segments.append(backend_pb2.TranscriptSegment(id=index, start=segment.start, end=segment.end, text=segment.text))
                texts.append(segment.text)
        except Exception as err:
            print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
        return backend_pb2.TranscriptResult(segments=result_segments, text="".join(texts))
 def serve(address):
    """Start the gRPC backend on ``address`` and block until it is terminated."""
    max_message_bytes = 50 * 1024 * 1024  # 50MB
    channel_options = [
        ('grpc.max_message_length', max_message_bytes),
        ('grpc.max_send_message_length', max_message_bytes),
        ('grpc.max_receive_message_length', max_message_bytes),
    ]
    worker_pool = futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    server = grpc.server(worker_pool, options=channel_options)
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    def on_termination(signum, frame):
        # Stop without a grace period and leave the process.
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    # Install the same handler for SIGINT and SIGTERM.
    signal.signal(signal.SIGINT, on_termination)
    signal.signal(signal.SIGTERM, on_termination)

    # The server runs on worker threads; keep the main thread alive.
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
 if __name__ == "__main__":
    # Read the listen address from the command line and run the server.
    cli = argparse.ArgumentParser(description="Run the gRPC server.")
    cli.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.")
    serve(cli.parse_args().addr)
--- a/backend/python/faster-whisper/install.sh
+++ b/backend/python/faster-whisper/install.sh
--- a/backend/python/faster-whisper/protogen.sh
+++ b/backend/python/faster-whisper/protogen.sh
--- a/backend/python/faster-whisper/requirements-cpu.txt
+++ b/backend/python/faster-whisper/requirements-cpu.txt
@@ -0,0 +1,8 @@
 faster-whisper
 opencv-python
 accelerate
 compel
 peft
 sentencepiece
 torch==2.4.1
 optimum-quanto
--- a/backend/python/faster-whisper/requirements-cublas11.txt
+++ b/backend/python/faster-whisper/requirements-cublas11.txt
@@ -0,0 +1,9 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.4.1+cu118
 faster-whisper
 opencv-python
 accelerate
 compel
 peft
 sentencepiece
 optimum-quanto
--- a/backend/python/faster-whisper/requirements-cublas12.txt
+++ b/backend/python/faster-whisper/requirements-cublas12.txt
@@ -0,0 +1,8 @@
 torch==2.4.1
 faster-whisper
 opencv-python
 accelerate
 compel
 peft
 sentencepiece
 optimum-quanto
--- a/backend/python/faster-whisper/requirements-hipblas.txt
+++ b/backend/python/faster-whisper/requirements-hipblas.txt
@@ -1,2 +1,3 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch==2.4.1+rocm6.0
+torch
 faster-whisper
--- a/Show More
+++ b/Show More
		`@@ -1,2 +0,0 @@`
			`--extra-index-url https://download.pytorch.org/whl/cu118`
			`torch==2.4.1+cu118`