Compare commits


1093 Commits

Author SHA1 Message Date
LocalAI [bot]
cd2b0c0e7c chore: ⬆️ Update ggml-org/llama.cpp to 72babea5dea56c8a8e8420ccf731b12a5cf37854 (#5743)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-27 23:46:27 +02:00
LocalAI [bot]
73d80c43a8 chore: ⬆️ Update ggml-org/whisper.cpp to c88ffbf9baeaae8c2cc0a4f496618314bb2ee9e0 (#5742)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-27 23:45:57 +02:00
LocalAI [bot]
665562b850 docs: ⬆️ update docs version mudler/LocalAI (#5741)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-27 22:23:43 +02:00
Ettore Di Giacinto
7a78e4f482 fix(backends gallery): meta packages do not have URIs (#5740)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-27 22:23:14 +02:00
Ettore Di Giacinto
6f41a6f934 fix(backends gallery): correctly identify gpu vendor (#5739)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-27 22:22:58 +02:00
Ettore Di Giacinto
bb54f2da2b feat(gallery): automatically install missing backends along models (#5736)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-27 18:25:44 +02:00
Ettore Di Giacinto
e1cc7ee107 fix(ci): enable tag-latest to auto (#5738)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-27 18:17:01 +02:00
Ettore Di Giacinto
cfc9dfa3d5 fix(ci): better handling of latest images for backends (#5735)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-27 10:32:58 +02:00
LocalAI [bot]
6a650e68cb chore: ⬆️ Update ggml-org/whisper.cpp to 32cf4e2aba799aff069011f37ca025401433cf9f (#5733)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-26 22:09:26 +02:00
LocalAI [bot]
5e1373877a chore: ⬆️ Update ggml-org/llama.cpp to 8846aace4934ad29651ea61b8c7e3f6b0556e3d2 (#5734)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-26 22:09:03 +02:00
Ettore Di Giacinto
b5b0ab26e7 fix(ci): remove non-existent input from build matrix
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-26 21:42:27 +02:00
Ettore Di Giacinto
9725bb4bbd chore(model gallery): add gemma-3n-e4b-it (#5731)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-26 19:36:50 +02:00
Ettore Di Giacinto
33b4275bbc chore(model gallery): add gemma-3n-e2b-it (#5730)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-26 19:35:49 +02:00
Ettore Di Giacinto
6644af10c6 feat: ⚠️ reduce images size and stop bundling sources (#5721)
feat: reduce images size and stop bundling sources

Do not copy sources anymore, and reduce the packages in the base images by
no longer using builder images.

If a rebuild is needed, just build the container image from scratch by
following the docs. We will gradually migrate all backends to the
gallery to keep the core small.

This PR is a breaking change: it also sets the base folders to /models
and /backends instead of /build/models and /build/backends.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-26 18:41:38 +02:00
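For anyone upgrading across this breaking change, the practical impact is on volume mounts: paths that previously targeted /build/models and /build/backends now need to target /models and /backends. A minimal sketch, assuming a typical docker run invocation (the image tag is illustrative):

```
# Before this change, models were mounted under /build:
# docker run -p 8080:8080 -v $PWD/models:/build/models localai/localai:latest

# After this change, mount directly at /models and /backends:
docker run -p 8080:8080 \
  -v $PWD/models:/models \
  -v $PWD/backends:/backends \
  localai/localai:latest
```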
Ettore Di Giacinto
7c4a2e9b85 chore(ci): ⚠️ fix latest tag by using docker meta action (#5722)
chore(ci): fix latest tag by using docker meta action

Also unify tagging names

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-26 18:40:25 +02:00
Ettore Di Giacinto
bcccee3909 fix(backends gallery): delete dangling dirs if installation failed (#5729)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-26 17:38:03 +02:00
Ettore Di Giacinto
c6f50ddd0c Revert "chore: ⬆️ Update leejet/stable-diffusion.cpp to 10c6501bd05a697e014f1bee3a84e5664290c489" (#5727)
Revert "chore: ⬆️ Update leejet/stable-diffusion.cpp to `10c6501bd05a…"

This reverts commit 30600dd5cb.
2025-06-26 13:25:25 +02:00
LocalAI [bot]
6613373b1b chore: ⬆️ Update ggml-org/whisper.cpp to 4daf7050ca2bf17f5166f45ac6da651c4e33f293 (#5725)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-26 13:00:20 +02:00
LocalAI [bot]
1659b3f795 chore: ⬆️ Update ggml-org/llama.cpp to 2bf9d539dd158345e3a3b096e16474af535265b4 (#5724)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-26 12:59:57 +02:00
LocalAI [bot]
30600dd5cb chore: ⬆️ Update leejet/stable-diffusion.cpp to 10c6501bd05a697e014f1bee3a84e5664290c489 (#4925)
⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-25 22:45:15 +00:00
Ettore Di Giacinto
179fcf5541 chore(model gallery): add menlo_jan-nano-128k (#5723)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-25 12:14:11 +02:00
LocalAI [bot]
9cb75086bb chore: ⬆️ Update ggml-org/whisper.cpp to 0083335ba0e9d6becbe0958903b0a27fc2ebaeed (#5718)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-25 09:47:33 +02:00
LocalAI [bot]
594bb462ab chore: ⬆️ Update ggml-org/llama.cpp to 73e53dc834c0a2336cd104473af6897197b96277 (#5719)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-24 22:47:48 +00:00
Ettore Di Giacinto
aa730a7b96 chore(model gallery): add delta-vector_austral-24b-winton (#5717)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-24 18:37:28 +02:00
Ettore Di Giacinto
0a454c527a chore(model gallery): add astrosage-70b (#5716)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-24 18:34:37 +02:00
Ettore Di Giacinto
cf86bcb984 chore(model gallery): add skywork_skywork-swe-32b (#5715)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-24 18:29:36 +02:00
Ettore Di Giacinto
a6d9988e84 feat(backend gallery): add meta packages (#5696)
* feat(backend gallery): add meta packages

So we can have meta packages such as "vllm" that automatically install
the corresponding package depending on the GPU currently detected in the
system.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat: use a metadata file

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-24 17:08:27 +02:00
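As an illustration of how a meta package such as "vllm" might be requested, here is a hedged sketch that assumes the backend gallery exposes an install endpoint analogous to the model gallery's /models/apply; the exact path and payload format may differ:

```
# Hypothetical call: endpoint and id format are assumed by analogy with /models/apply
curl http://localhost:8080/backends/apply \
  -H "Content-Type: application/json" \
  -d '{"id": "localai@vllm"}'
# The meta package would then resolve to the concrete vllm build
# matching the GPU vendor detected on the host
```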
Ettore Di Giacinto
f3a114342e chore(model gallery): add mistralai_mistral-small-3.2-24b-instruct-2506 (#5714)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-24 13:59:14 +02:00
LocalAI [bot]
0d275ccc03 chore: ⬆️ Update ggml-org/llama.cpp to ce82bd0117bd3598300b3a089d13d401b90279c7 (#5712)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-24 08:37:32 +02:00
LocalAI [bot]
58dba3f01c chore: ⬆️ Update ggml-org/whisper.cpp to a422176937c5bb20eb58d969995765f90d3c1a9b (#5713)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-23 22:45:31 +00:00
kilavvy
b68d6e8088 Docs: Fix typos (#5709)
* Update GPU-acceleration.md

Signed-off-by: kilavvy <140459108+kilavvy@users.noreply.github.com>

* Update image-generation.md

Signed-off-by: kilavvy <140459108+kilavvy@users.noreply.github.com>

---------

Signed-off-by: kilavvy <140459108+kilavvy@users.noreply.github.com>
2025-06-23 18:15:06 +02:00
LocalAI [bot]
2352cec7e6 chore: ⬆️ Update ggml-org/llama.cpp to 238005c2dc67426cf678baa2d54c881701693288 (#5710)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-22 22:43:41 +00:00
Ettore Di Giacinto
de72ae79b5 chore(model gallery): add ds-r1-qwen3-8b-arliai-rpr-v4-small-iq-imatrix (#5708)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-22 09:05:55 +02:00
Ettore Di Giacinto
884c07d5f9 chore(model gallery): add allura-org_q3-8b-kintsugi (#5707)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-22 09:04:49 +02:00
Ettore Di Giacinto
cca7cbef1e chore(model gallery): add qwen3-the-xiaolong-omega-directive-22b-uncensored-abliterated-i1 (#5706)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-22 09:01:08 +02:00
Ettore Di Giacinto
32cd0d03d4 chore(model gallery): add menlo_jan-nano (#5705)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-22 08:57:33 +02:00
Ettore Di Giacinto
ee4d9e83d0 Update stalebot.yml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-22 08:51:13 +02:00
LocalAI [bot]
5547e08a30 chore: ⬆️ Update ggml-org/llama.cpp to aa0ef5c578eef4c2adc7be1282f21bab5f3e8d26 (#5703)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-21 23:54:53 +02:00
LocalAI [bot]
ca7385c303 chore: ⬆️ Update ggml-org/whisper.cpp to e6c10cf3d5d60dc647eb6cd5e73d3c347149f746 (#5702)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-21 23:54:28 +02:00
Ettore Di Giacinto
28759e79d3 chore(model gallery): add qwen3-the-josiefied-omega-directive-22b-uncensored-abliterated-i1 (#5704)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-21 23:54:05 +02:00
Ettore Di Giacinto
40249b6b84 Update stalebot.yml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-21 22:38:23 +02:00
Ettore Di Giacinto
e09e47bada chore(ci): add stale bot (#5700)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-21 20:12:08 +02:00
Ettore Di Giacinto
3796558aeb Update quickstart.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-21 20:11:57 +02:00
LocalAI [bot]
cca4f010f8 chore: ⬆️ Update ggml-org/llama.cpp to 06cbedfca1587473df9b537f1dd4d6bfa2e3de13 (#5697)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-20 22:44:39 +00:00
Ettore Di Giacinto
be3ff482d0 chore(ci): try to optimize disk space when tagging latest (#5695)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-20 15:54:14 +02:00
LocalAI [bot]
af255cd0be chore: ⬆️ Update ggml-org/llama.cpp to 8f71d0f3e86ccbba059350058af8758cafed73e6 (#5692)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-20 15:53:55 +02:00
LocalAI [bot]
8000228d1b chore: ⬆️ Update ggml-org/whisper.cpp to 3e65f518ddf840b13b74794158aa95a2c8aa30cc (#5691)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-20 15:53:38 +02:00
Ettore Di Giacinto
79abe0ad77 Drop latest references to extras images 2025-06-20 15:51:16 +02:00
Ettore Di Giacinto
8131d11d1f Update quickstart.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-19 22:42:38 +02:00
LocalAI [bot]
beb01c91f3 docs: ⬆️ update docs version mudler/LocalAI (#5690)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-19 22:13:16 +02:00
Ettore Di Giacinto
1ccd64ff6a chore: drop extras references from docs
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-19 22:04:28 +02:00
Ettore Di Giacinto
fc7681c68c Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-19 21:46:09 +02:00
Ettore Di Giacinto
49d026a229 Update backends.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-19 19:47:09 +02:00
leopardracer
f9b968e19d Fix Typos and Improve Clarity in GPU Acceleration Documentation (#5688)
Update GPU-acceleration.md

Signed-off-by: leopardracer <136604165+leopardracer@users.noreply.github.com>
2025-06-19 15:41:13 +02:00
LocalAI [bot]
022d4a5ecb chore: ⬆️ Update ggml-org/whisper.cpp to ecb8f3c2b4e282d5ef416516bcbfb92821f06bf6 (#5686)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-19 08:23:42 +02:00
LocalAI [bot]
0e917eb01d chore: ⬆️ Update ggml-org/llama.cpp to 8d947136546773f6410756f37fcc5d3e65b8135d (#5685)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-19 08:23:23 +02:00
Ettore Di Giacinto
efde0eaf83 feat(backend gallery): display download progress (#5687)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-18 23:49:44 +02:00
Maxim Evtush
add8fc35a2 Fix Typos in Documentation and Python Comments (#5658)
* Update istftnet.py

Signed-off-by: Maxim Evtush <154841002+maximevtush@users.noreply.github.com>

* Update GPU-acceleration.md

Signed-off-by: Maxim Evtush <154841002+maximevtush@users.noreply.github.com>

---------

Signed-off-by: Maxim Evtush <154841002+maximevtush@users.noreply.github.com>
2025-06-18 22:11:13 +02:00
Ettore Di Giacinto
9bcf4c56f1 fix(backends gallery): propagate p2p settings to correctly draw menu (#5684)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-18 22:06:12 +02:00
Ettore Di Giacinto
3fcfaec7c8 chore(ci): move also other jobs to public runner (#5683)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-18 22:00:12 +02:00
Ettore Di Giacinto
a463d40a3e chore(ci): try to use public runners also for release builds (#5681)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-18 21:51:54 +02:00
Ettore Di Giacinto
1e1f0ee321 chore(backends): move bark-cpp to the backend gallery (#5682)
chore(bark-cpp): move out of the binary

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-18 19:48:50 +02:00
Ettore Di Giacinto
80b3139fa0 Update landing.yaml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-18 19:48:17 +02:00
LocalAI [bot]
5173d37acb chore: ⬆️ Update ggml-org/llama.cpp to 860a9e4eeff3eb2e7bd1cc38f65787cc6c8177af (#5678)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-18 10:01:46 +02:00
LocalAI [bot]
470e48a900 chore: ⬆️ Update ggml-org/whisper.cpp to f3ff80ea8da044e5b8833e7ba54ee174504c518d (#5677)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-18 10:01:08 +02:00
Ettore Di Giacinto
b706dddc93 chore(ci): switch to public runners for base images (#5680)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-17 22:38:50 +02:00
Ettore Di Giacinto
867db3f888 chore(docs): add backend url
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-17 22:35:21 +02:00
Ettore Di Giacinto
b79aa31398 chore: move backends docs
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-17 22:26:40 +02:00
Ettore Di Giacinto
fb9a09d49c chore(backend gallery): add description for remaining backends (#5679)
* chore(backend gallery): add description for remaining backends

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(backend gallery): add linter

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-17 22:21:44 +02:00
Ettore Di Giacinto
0a78f0ad2d chore(backend gallery): re-order and add description for vLLM (#5676)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-17 17:31:53 +02:00
Ettore Di Giacinto
d68660bd5a chore(deps): bump llama.cpp to 'e434e69183fd9e1031f4445002083178c331a28b' (#5665)
chore(deps): bump llama.cpp to 'e434e69183fd9e1031f4445002083178c331a28b'

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-17 17:00:10 +02:00
LocalAI [bot]
30ceee2dec chore: ⬆️ Update ggml-org/whisper.cpp to 2a4d6db7d90899aff3d58d70996916968e4e0d27 (#5661)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-17 09:21:05 +02:00
dependabot[bot]
18c38335fc chore(deps): bump securego/gosec from 2.22.4 to 2.22.5 (#5663)
Bumps [securego/gosec](https://github.com/securego/gosec) from 2.22.4 to 2.22.5.
- [Release notes](https://github.com/securego/gosec/releases)
- [Changelog](https://github.com/securego/gosec/blob/master/.goreleaser.yml)
- [Commits](https://github.com/securego/gosec/compare/v2.22.4...v2.22.5)

---
updated-dependencies:
- dependency-name: securego/gosec
  dependency-version: 2.22.5
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-06-16 23:12:27 +00:00
Ettore Di Giacinto
89040ff6f7 fix: add python symlink, use absolute python env path when running backends (#5664)
* fix: add python symlink, use absolute python env path when running backends

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(ci): do not push images when building PRs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-16 23:00:53 +02:00
Ettore Di Giacinto
de343700fd Don't run python_backend workflow on PR
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-16 11:06:56 +02:00
Ettore Di Giacinto
87d18ad951 chore: Add python3 to images (#5660)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-16 11:05:44 +02:00
Ettore Di Giacinto
912c8eff04 chore(ci): use public runner for extra backends (#5657)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-16 08:21:18 +02:00
LocalAI [bot]
481f30bde8 chore: ⬆️ Update ggml-org/llama.cpp to 30e5b01de2a0bcddc7c063c8ef0802703a958417 (#5659)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-15 23:03:40 +00:00
Ettore Di Giacinto
236ac30252 chore(ci): do not specify image-type anymore
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-15 17:28:40 +02:00
Ettore Di Giacinto
6f761e62e4 update README
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-15 16:06:43 +02:00
FT
1f29b5f38e Fix Typos and Improve Documentation Clarity (#5648)
* Update p2p.go

Signed-off-by: FT <140458077+zeevick10@users.noreply.github.com>

* Update GPU-acceleration.md

Signed-off-by: FT <140458077+zeevick10@users.noreply.github.com>

---------

Signed-off-by: FT <140458077+zeevick10@users.noreply.github.com>
2025-06-15 16:04:44 +02:00
LocalAI [bot]
33d702c5e0 chore: ⬆️ Update ggml-org/llama.cpp to 3cb203c89f60483e349f841684173446ed23c28f (#5644)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-15 16:03:13 +02:00
Ettore Di Giacinto
95ff236127 ci: do not fire python_backend on PRs
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-15 16:02:30 +02:00
Ettore Di Giacinto
2d64269763 feat: Add backend gallery (#5607)
* feat: Add backend gallery

This PR adds support for managing backends similarly to models. A backend
gallery is now available, which can be used to install and remove
extra backends.
The backend gallery can be configured similarly to a model gallery, and
API calls allow installing and removing backends at runtime, as well as
during the startup phase of LocalAI.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add backends docs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* wip: Backend Dockerfile for python backends

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat: drop extras images, build python backends separately

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fixup on all backends

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* test CI

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Tweaks

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Drop old backends leftovers

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fixup CI

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Move dockerfile upper

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fix proto

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Feature dropped for consistency - we prefer model galleries

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add missing packages in the build image

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* exllama is only available on cublas

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* pin torch on chatterbox

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fixups to index

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* CI

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Debug CI

* Install accelerator deps

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add target arch

* Add cuda minor version

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Use self-hosted runners

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* ci: use quay for test images

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fixups for vllm and chatterbox

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Small fixups on CI

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chatterbox is only available for nvidia

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Simplify CI builds

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Adapt test, use qwen3

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(model gallery): add jina-reranker-v1-tiny-en-gguf

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(gguf-parser): recover from potential panics that can happen while reading ggufs with gguf-parser

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Use reranker from llama.cpp in AIO images

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Limit concurrent jobs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-15 14:56:52 +02:00
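A rough sketch of what configuring the backend gallery at startup could look like, assuming it is set up analogously to model galleries; the LOCALAI_BACKEND_GALLERIES variable name is assumed by analogy with the model gallery's GALLERIES setting, and the index URL is a placeholder:

```
# Assumed configuration, mirroring how model galleries are declared
docker run -p 8080:8080 \
  -e LOCALAI_BACKEND_GALLERIES='[{"name":"localai","url":"<backend-gallery-index-url>"}]' \
  localai/localai:latest
```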
LocalAI [bot]
a7a6020328 chore: ⬆️ Update ggml-org/whisper.cpp to 705db0f728310c32bc96f4e355e2b18076932f75 (#5643)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-15 08:39:00 +02:00
Ettore Di Giacinto
40618164b2 chore: improve tests (#5646)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-14 10:07:05 +02:00
fuder.eth
eb8c29f90a Minor Documentation Updates: Clarified Comments in Python and Go Files (#5641)
* Update ui.go

Signed-off-by: fuder.eth <139509124+vtjl10@users.noreply.github.com>

* Update backend.py

Signed-off-by: fuder.eth <139509124+vtjl10@users.noreply.github.com>

---------

Signed-off-by: fuder.eth <139509124+vtjl10@users.noreply.github.com>
2025-06-13 19:55:25 +02:00
Gavin Mogan
63116a2c6a docs: Update docs metadata headers so when mentioned on Slack it doesn't say Hugo (#5642)
Update docs metadata headers so when mentioned on Slack it doesn't say Hugo

Signed-off-by: Gavin Mogan <github@gavinmogan.com>
2025-06-13 19:54:57 +02:00
LocalAI [bot]
311c2cf539 chore: ⬆️ Update ggml-org/llama.cpp to ed52f3668e633423054a4eab61bb7efee47025ab (#5636)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-12 23:33:33 +02:00
Ettore Di Giacinto
a6fcbd991d chore(model gallery): add yanfei-v2-qwen3-32b (#5639)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-12 22:24:13 +02:00
kilavvy
2e1dc8deef Fix Typos in Comments and Error Messages (#5637)
* Update initializers.go

Signed-off-by: kilavvy <140459108+kilavvy@users.noreply.github.com>

* Update base.go

Signed-off-by: kilavvy <140459108+kilavvy@users.noreply.github.com>

---------

Signed-off-by: kilavvy <140459108+kilavvy@users.noreply.github.com>
2025-06-12 18:34:32 +02:00
LocalAI [bot]
282e017b22 chore: ⬆️ Update ggml-org/whisper.cpp to ebbc874e85b518f963a87612f6d79f5c71a55e84 (#5635)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-11 23:47:00 +02:00
Ettore Di Giacinto
f86cb8be2d chore(model gallery): add qwen3-embedding-0.6b (#5634)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-11 11:40:41 +02:00
Ettore Di Giacinto
5c56ec4f87 chore(model gallery): add qwen3-embedding-8b (#5633)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-11 11:38:44 +02:00
Ettore Di Giacinto
dd2845a034 chore(model gallery): add qwen3-embedding-4b (#5632)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-11 11:31:43 +02:00
Ettore Di Giacinto
2e7db014b6 chore(model gallery): add openbuddy_openbuddy-r1-0528-distill-qwen3-32b-preview0-qat (#5631)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-11 11:27:30 +02:00
Ettore Di Giacinto
6faeee1d92 chore(model gallery): add baai_robobrain2.0-7b (#5630)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-11 11:17:32 +02:00
Ettore Di Giacinto
31d73eb934 chore(model gallery): add mistralai_magistral-small-2506 (#5629)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-11 11:11:44 +02:00
Ettore Di Giacinto
60863b9e52 chore(model gallery): add sophosympatheia_strawberrylemonade-l3-70b-v1.0 (#5628)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-11 11:08:17 +02:00
Ettore Di Giacinto
a9fc71e2f3 chore(model gallery): add kwaipilot_kwaicoder-autothink-preview (#5627)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-11 11:06:38 +02:00
leopardracer
ce9a9a30e0 Improve Comments and Documentation for MixedMode and ParseJSON Functions (#5626)
Update parse.go

Signed-off-by: leopardracer <136604165+leopardracer@users.noreply.github.com>
2025-06-11 09:46:53 +02:00
LocalAI [bot]
2693a21da5 chore: ⬆️ Update ggml-org/whisper.cpp to 2679bec6e09231c6fd59715fcba3eebc9e2f6076 (#5625)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-11 09:35:28 +02:00
LocalAI [bot]
d460eab18e chore: ⬆️ Update ggml-org/llama.cpp to 3678b838bb71eaccbaeb479ff38c2e12bfd2f960 (#5620)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-11 09:00:39 +02:00
LocalAI [bot]
c61e5fe266 chore: ⬆️ Update ggml-org/whisper.cpp to d78f08142381c1460604713e2f2ddf3331c7d816 (#5619)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-10 17:29:58 +02:00
Ettore Di Giacinto
88e570b5de fix(deps): pin grpcio (#5621)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-10 14:21:51 +02:00
Ettore Di Giacinto
6efa97ce0b chore(model gallery): add qwen2.5-omni-3b (#5606)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-09 10:54:42 +02:00
LocalAI [bot]
41cde5468a chore: ⬆️ Update ggml-org/llama.cpp to 247e5c6e447707bb4539bdf1913d206088a8fc69 (#5605)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-09 00:11:46 +02:00
Richard Palethorpe
d650647db9 fix(realtime): Use updated model on session update (#5604)
Signed-off-by: Richard Palethorpe <io@richiejp.com>
2025-06-09 00:11:05 +02:00
LocalAI [bot]
5bc7ef37a2 chore: ⬆️ Update ggml-org/llama.cpp to 5787b5da57e54dba760c2deeac1edf892e8fc450 (#5601)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-08 08:44:24 +02:00
Ettore Di Giacinto
e0a52807c8 chore(model gallery): add akhil-theerthala_kuvera-8b-v0.1.0 (#5600)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-07 08:59:20 +02:00
LocalAI [bot]
1a95a19f87 chore: ⬆️ Update ggml-org/llama.cpp to 745aa5319b9930068aff5e87cf5e9eef7227339b (#5598)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-07 08:59:05 +02:00
LocalAI [bot]
bcfc08e5bf chore: ⬆️ Update ggml-org/whisper.cpp to b175baa665bc35f97a2ca774174f07dfffb84e19 (#5597)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-07 08:57:52 +02:00
Ettore Di Giacinto
4d282ca963 chore(model gallery): add nbeerbower_qwen3-gutenberg-encore-14b (#5596)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-06 10:20:48 +02:00
Ettore Di Giacinto
525f49b69d chore(model gallery): add open-thoughts_openthinker3-7b (#5595)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-06 10:14:00 +02:00
LocalAI [bot]
786aa1de05 chore: ⬆️ Update ggml-org/llama.cpp to 1caae7fc6c77551cb1066515e0f414713eebb367 (#5593)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-06 00:10:02 +02:00
Ettore Di Giacinto
ea82deb16b chore(model gallery): add ultravox-v0_5-llama-3_1-8b (#5592)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-05 19:23:51 +02:00
Ettore Di Giacinto
b0891309ba chore(model gallery): add ultravox-v0_5-llama-3_2-1b (#5591)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-05 19:22:01 +02:00
Ettore Di Giacinto
b034cff149 feat: improve RAM estimation by using values from summary (#5525)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-05 19:16:26 +02:00
Ettore Di Giacinto
432f34f001 chore(model gallery): add goekdeniz-guelmez_josiefied-qwen3-14b-abliterated-v3 (#5590)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-05 19:16:04 +02:00
Gavin Mogan
cbd61dccd4 fix(install.sh): vulkan docker tag (#5589)
vulkan docker tag is not prefixed with gpu

```
regctl tag ls localai/localai | grep 2.29 | grep vulkan
v2.29.0-vulkan
```

Signed-off-by: Gavin Mogan <github@gavinmogan.com>
2025-06-05 08:12:16 +02:00
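Based on the regctl output above, the published Vulkan image tag carries no gpu- prefix, so pulling it directly looks like:

```
docker pull localai/localai:v2.29.0-vulkan
```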
LocalAI [bot]
0de0817d71 chore: ⬆️ Update ggml-org/whisper.cpp to 799eacdde40b3c562cfce1508da1354b90567f8f (#5586)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-05 08:11:38 +02:00
LocalAI [bot]
bf57d6e5ac chore: ⬆️ Update ggml-org/llama.cpp to 0d3984424f2973c49c4bcabe4cc0153b4f90c601 (#5585)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-05 08:11:12 +02:00
Ettore Di Giacinto
0b9603e010 chore(model gallery): add deepseek-ai_deepseek-r1-0528-qwen3-8b (#5580)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-04 15:28:45 +02:00
Ettore Di Giacinto
8d925217f6 chore(model gallery): add e-n-v-y_legion-v2.1-llama-70b-elarablated-v0.8-hf (#5579)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-04 11:12:37 +02:00
Ettore Di Giacinto
669a1ccae6 chore(model gallery): add nvidia_nemotron-research-reasoning-qwen-1.5b (#5578)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-04 11:07:10 +02:00
Ettore Di Giacinto
7a7d36ad63 chore(model gallery): add arcee-ai_homunculus (#5577)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-04 10:02:15 +02:00
Ettore Di Giacinto
8b889955b4 chore(deps): bump pytorch to 2.7 in vllm (#5576)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-04 08:56:45 +02:00
dependabot[bot]
a226555949 chore(deps): bump GrantBirki/git-diff-action from 2.8.0 to 2.8.1 (#5564)
Bumps [GrantBirki/git-diff-action](https://github.com/grantbirki/git-diff-action) from 2.8.0 to 2.8.1.
- [Release notes](https://github.com/grantbirki/git-diff-action/releases)
- [Commits](https://github.com/grantbirki/git-diff-action/compare/v2.8.0...v2.8.1)

---
updated-dependencies:
- dependency-name: GrantBirki/git-diff-action
  dependency-version: 2.8.1
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-06-04 08:41:47 +02:00
LocalAI [bot]
f38f17865a chore: ⬆️ Update ggml-org/whisper.cpp to 82f461eaa4e6a1ba29fc0dbdaa415a9934ee8a1d (#5575)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-04 08:41:26 +02:00
LocalAI [bot]
03f380701b chore: ⬆️ Update ggml-org/llama.cpp to 7e00e60ef86645a01fda738fef85b74afa016a34 (#5574)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-04 08:26:36 +02:00
Ettore Di Giacinto
65e2866c97 fix(chatterbox): install only with cuda 12 (#5573)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-03 14:57:47 +02:00
Ettore Di Giacinto
cd3cd899ad chore(deps): bump llama.cpp to '363757628848a27a435bbf22ff9476e9aeda5f40' (#5571)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-03 12:19:16 +02:00
LocalAI [bot]
c2ae3100e7 chore: ⬆️ Update ggml-org/whisper.cpp to e05af2457b7b4134ee626dc044294a19b096e62f (#5569)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-03 11:29:18 +02:00
Ettore Di Giacinto
ec0868e691 chore(deps): bump grpcio from 1.72.0 to 1.72.1 (#5570)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-03 09:59:43 +02:00
Ettore Di Giacinto
489c289916 Revert "fix(ci): try to add different mirrors to avoid 403 issues" (#5555)
Revert "fix(ci): try to add different mirrors to avoid 403 issues (#5554)"

This reverts commit 7c9f011d91.
2025-06-02 08:46:29 +02:00
LocalAI [bot]
ac5fb50bcc chore: ⬆️ Update ggml-org/whisper.cpp to 7fd6fa809749078aa00edf945e959c898f2bd1af (#5556)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-06-02 08:45:47 +02:00
Ettore Di Giacinto
7c9f011d91 fix(ci): try to add different mirrors to avoid 403 issues (#5554)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-01 08:48:53 +02:00
Ettore Di Giacinto
80f7f17843 chore(deps): bump llama.cpp to 'e562eece7cb476276bfc4cbb18deb7c0369b2233' (#5552)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-31 12:46:32 +02:00
LocalAI [bot]
f0c41d6405 chore: ⬆️ Update ggml-org/whisper.cpp to 98dfe8dc264b7d0d1daccfff9a9c043bcc2ece4b (#5542)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-31 08:51:15 +02:00
Ettore Di Giacinto
8472321a81 feat(ui): display thinking tags appropriately (#5540)
* fix(streaming): stream complete runes

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ui): display thinking tags separately

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-05-31 08:50:46 +02:00
Ettore Di Giacinto
3bac4724ac fix(streaming): stream complete runes (#5539)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-31 08:48:05 +02:00
Ettore Di Giacinto
59db154cbc feat(ui): allow to upload PDF and text files, also add support to multiple input files (#5538)
* Support file inputs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat: support multiple files

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* show preview of files

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-31 08:47:48 +02:00
Ettore Di Giacinto
1cc4525f15 fix: adapt test to error changes
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-30 17:43:59 +02:00
Ettore Di Giacinto
45c58752e5 feat(ui): add audio upload button in chat view (#5526)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-30 16:47:31 +02:00
Ettore Di Giacinto
d5c9c717b5 feat(chatterbox): add new backend (#5524)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-30 10:52:55 +02:00
Ettore Di Giacinto
dd7fa6b9f7 chore(deps): bump llama.cpp to 'e83ba3e460651b20a594e9f2f0f0bffb998d3ce1' (#5527)
chore(deps): bump llama.cpp to 'e83ba3e460651b20a594e9f2f0f0bffb998d3ce1'

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-30 10:29:01 +02:00
LocalAI [bot]
039c318607 chore: ⬆️ Update ggml-org/whisper.cpp to e5e900dd00747f747143ad30a697c8f21ddcd59e (#5522)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-30 08:34:52 +02:00
Ettore Di Giacinto
0870bf5af6 fix(input): correctly handle the case where a string list is passed as input (#5521)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-29 22:06:42 +02:00
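The string-list input case corresponds to the OpenAI-style payload that accepts an array of strings instead of a single one. A minimal sketch against the embeddings endpoint, assuming OpenAI compatibility (the model name is taken from the gallery additions elsewhere in this log and is only illustrative):

```
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen3-embedding-0.6b", "input": ["first text", "second text"]}'
```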
Ettore Di Giacinto
6073b9944e chore(model gallery): add moondream2-20250414 (#5518)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-29 10:47:11 +02:00
LocalAI [bot]
ef0e0f3777 chore: ⬆️ Update ggml-org/whisper.cpp to 1f5fdbecb411a61b8576242e5170c5ecef24b05a (#5515)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-29 09:45:23 +02:00
LocalAI [bot]
b7de9e0aa0 chore: ⬆️ Update ggml-org/llama.cpp to d98f2a35fcf4a8d3e660ad48cd19e2a1f3d5b2ef (#5514)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-29 09:44:36 +02:00
Ettore Di Giacinto
39292407a1 chore(model gallery): add pku-ds-lab_fairyr1-32b (#5517)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-29 09:43:45 +02:00
Ettore Di Giacinto
f257bf8d14 chore(model gallery): add pku-ds-lab_fairyr1-14b-preview (#5516)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-29 09:37:08 +02:00
Ettore Di Giacinto
8ca2fb5ef1 chore(model gallery): add qwen2.5-omni-7b (#5513)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-28 18:15:09 +02:00
LocalAI [bot]
3a790fed13 chore: ⬆️ Update ggml-org/whisper.cpp to 0ed00d9d30e8c984936ff9ed9a4fcd475d6d82e5 (#5510)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-28 09:00:22 +02:00
LocalAI [bot]
a334f28a07 chore: ⬆️ Update ggml-org/llama.cpp to a3c30846e410c91c11d7bf80978795a03bb03dee (#5509)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-28 01:39:38 +00:00
Ettore Di Giacinto
dc6663d121 fix(template): we do not always have .Name (#5508)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-27 18:44:24 +02:00
LocalAI [bot]
103caf9823 chore: ⬆️ Update ggml-org/llama.cpp to a26c4cc11ec7c6574e3691e90ecdbd67deeea35b (#5500)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-27 17:13:55 +02:00
Ettore Di Giacinto
4226d2d837 Update index.yaml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-05-27 10:24:37 +02:00
Ettore Di Giacinto
7434256fc9 chore(model gallery): add ms-24b-mullein-v0 (#5506)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-27 10:14:52 +02:00
Ettore Di Giacinto
86a0563ae1 chore(model gallery): add llama3-24b-mullein-v1 (#5505)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-27 10:13:40 +02:00
Ettore Di Giacinto
c68951cbfe chore(model gallery): add mrm8488_qwen3-14b-ft-limo (#5504)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-27 10:04:16 +02:00
Ettore Di Giacinto
8408084120 chore(model gallery): add luckyrp-24b (#5503)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-27 10:02:25 +02:00
Ettore Di Giacinto
0f2f4c7e23 chore(model gallery): add allura-org_q3-30b-a3b-designant (#5502)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-27 09:59:56 +02:00
Ettore Di Giacinto
5ffad3b004 chore(deps): remove pin on transformers (#5501)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-27 09:24:27 +02:00
Ettore Di Giacinto
e5ccd97b8c Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-05-26 20:00:31 +02:00
LocalAI [bot]
a3b08d46ec chore: ⬆️ Update ggml-org/whisper.cpp to ea9f206f18d86c4eb357db9fdc52e4d9dc24435e (#5464)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-05-26 19:56:44 +02:00
Ettore Di Giacinto
090f5065fc chore(deps): bump llama.cpp to 'fef693dc6b959a8e8ba11558fbeaad0b264dd457' (#5467)
Also try to use a smaller model for integration tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-26 17:19:46 +02:00
Ettore Di Giacinto
88de2ea01a feat(llama.cpp): add support for audio input (#5466)
* feat(llama.cpp): add support for audio input

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Adapt tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-26 16:06:03 +02:00
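A hedged sketch of sending audio to the chat endpoint, assuming LocalAI follows the OpenAI input_audio content-part shape; the model name is one of the omni models added elsewhere in this log:

```
# base64 -w0 assumes GNU coreutils; the request shape is assumed, not confirmed here
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwen2.5-omni-7b",
    "messages": [{
      "role": "user",
      "content": [
        {"type": "text", "text": "What is said in this clip?"},
        {"type": "input_audio", "input_audio": {"data": "'"$(base64 -w0 clip.wav)"'", "format": "wav"}}
      ]
    }]
  }'
```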
Ettore Di Giacinto
9650d490d4 chore(model gallery): add nvidia_acereason-nemotron-14b (#5463)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-26 10:08:10 +02:00
Ettore Di Giacinto
4de1c83764 chore(model gallery): add allura-org_q3-30b-a3b-pentiment (#5462)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-26 09:46:44 +02:00
Ettore Di Giacinto
e5978dc714 chore(model gallery): add medgemma-27b-text-it (#5461)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-26 09:44:13 +02:00
Ettore Di Giacinto
f784986e19 chore(model gallery): add medgemma-4b-it (#5460)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-26 09:41:09 +02:00
Richard Palethorpe
bf6426aef2 feat: Realtime API support reboot (#5392)
* feat(realtime): Initial Realtime API implementation

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore: go mod tidy

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* feat: Implement transcription only mode for realtime API

Reduce the scope of the realtime API for the initial release and make
transcription-only mode functional.

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* chore(build): Build backends on a separate layer to speed up core only changes

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: Richard Palethorpe <io@richiejp.com>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-25 22:25:05 +02:00
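For the transcription-only mode described above, a client would open a websocket session; a minimal sketch using websocat, assuming an OpenAI-style realtime endpoint (the path and query parameters are assumptions, not confirmed by this log):

```
# Hypothetical endpoint layout, assumed from OpenAI's realtime API shape
websocat "ws://localhost:8080/v1/realtime?model=whisper-1"
# Session updates (e.g. switching the model) would then be sent as JSON events over the socket
```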
LocalAI [bot]
4a91950848 chore: ⬆️ Update ggml-org/llama.cpp to d13d0f6135803822ec1cd7e3efb49360b88a1bdf (#5448)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-24 08:50:41 +02:00
LocalAI [bot]
4614ea1685 chore: ⬆️ Update ggml-org/whisper.cpp to 13d92d08ae26031545921243256aaaf0ee057943 (#5449)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-23 23:44:06 +00:00
Ettore Di Giacinto
f0bf59d1d9 chore(model gallery): add vulpecula-4b (#5445)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-23 09:51:21 +02:00
Ettore Di Giacinto
83dd678959 chore(model gallery): add whiterabbitneo_whiterabbitneo-v3-7b (#5444)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-23 09:46:28 +02:00
Ettore Di Giacinto
9d6c9f874a chore(model gallery): add arliai_qwq-32b-arliai-rpr-v4 (#5443)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-23 09:34:33 +02:00
LocalAI [bot]
c62f2bb336 chore: ⬆️ Update ggml-org/llama.cpp to 8a1d206f1d2b4e45918b589f3165b4be232f7ba8 (#5440)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-23 09:22:38 +02:00
LocalAI [bot]
38aeca6f9c chore: ⬆️ Update ggml-org/whisper.cpp to 78b31ca7824500e429ba026c1a9b48e0b41c50cb (#5439)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-23 06:22:41 +00:00
Ettore Di Giacinto
3b0cf52f6a feat(llama.cpp): add reranking (#5396)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-22 21:49:30 +02:00
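Reranking is presumably exposed through a Jina-style rerank endpoint; a request sketch assuming that shape (the endpoint path is an assumption, and the model name echoes the tiny Jina reranker added in the backend gallery commit above):

```
# Assumes a Jina-compatible rerank API; adjust path and model to your deployment
curl http://localhost:8080/v1/rerank \
  -H "Content-Type: application/json" \
  -d '{
    "model": "jina-reranker-v1-tiny-en",
    "query": "Which backend handles audio?",
    "documents": ["whisper.cpp transcribes audio", "stable-diffusion.cpp generates images"]
  }'
```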
LocalAI [bot]
bac3022044 chore: ⬆️ Update ggml-org/whisper.cpp to bd1cb0c8e3a04baa411dc12c1325b6a9f12ee7f4 (#5424)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-05-22 21:49:06 +02:00
LocalAI [bot]
cd41701524 chore: ⬆️ Update ggml-org/llama.cpp to 8e186ef0e764c7a620e402d1f76ebad60bf31c49 (#5423)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-05-22 21:48:51 +02:00
Ettore Di Giacinto
6a382a1afe fix(transformers): try to pin to working release (#5426)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-22 12:50:51 +02:00
Ettore Di Giacinto
8dcab2f9c7 chore(scripts): allow to specify quants (#5430)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-22 11:53:30 +02:00
Ettore Di Giacinto
1d1d5627f0 chore(model gallery): add delta-vector_archaeo-12b-v2 (#5429)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-22 11:38:48 +02:00
Ettore Di Giacinto
233b3369ad chore(model gallery): add mistralai_devstral-small-2505 (#5428)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-22 11:37:17 +02:00
Ettore Di Giacinto
c587ac0aef chore(model gallery): add nvidia_llama-3.1-nemotron-nano-4b-v1.1 (#5427)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-22 11:33:33 +02:00
David Thole
38c5d16b57 feat(docs): updating the documentation on fine tuning and advanced guide. (#5420)
Updating the documentation on fine-tuning and the advanced guide. This mirrors how modern versions of llama.cpp operate
2025-05-21 19:11:00 +02:00
LocalAI [bot]
ef6fc052eb chore: ⬆️ Update ggml-org/llama.cpp to b7a17463ec190aeee7b9077c606c910fb4688b84 (#5399)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-21 09:06:09 +02:00
LocalAI [bot]
7ff35c08ac chore: ⬆️ Update ggml-org/whisper.cpp to 62dc8f7d7b72ca8e75c57cd6a100712c631fa5d5 (#5398)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-21 09:00:42 +02:00
LocalAI [bot]
43f75ee7f3 chore(model-gallery): ⬆️ update checksum (#5422)
⬆️ Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-21 03:52:39 +00:00
Ettore Di Giacinto
82811a9630 fix(transformers): pin protobuf (#5421)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 20:28:31 +02:00
Ettore Di Giacinto
04a3d8e5ac feat(ui): add error page to display errors (#5418)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 12:17:27 +02:00
Ettore Di Giacinto
9af09b3f8c chore(model gallery): fixup
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 12:17:21 +02:00
Ettore Di Giacinto
0d590a4044 chore(model gallery): add smolvlm2-256m-video-instruct (#5417)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 12:03:02 +02:00
Ettore Di Giacinto
e0a54de4f5 chore(model gallery): add smolvlm2-500m-video-instruct (#5416)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 11:42:30 +02:00
Ettore Di Giacinto
6bc2ae5467 chore(model gallery): add smolvlm2-2.2b-instruct (#5415)
chore(model gallery): add smolvlm-instruct

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 11:36:22 +02:00
Ettore Di Giacinto
8caaf49f5d chore(model gallery): add smolvlm-instruct (#5414)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 11:35:01 +02:00
Ettore Di Giacinto
1db51044bb chore(model gallery): add smolvlm-500m-instruct (#5413)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 11:25:32 +02:00
Ettore Di Giacinto
ec21b58008 chore(model gallery): add smolvlm-256m-instruct (#5412)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 11:15:09 +02:00
Ettore Di Giacinto
996259b529 chore(model gallery): add facebook_kernelllm (#5411)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 10:31:09 +02:00
Ettore Di Giacinto
f2942cc0e1 chore(model gallery): add thedrummer_valkyrie-49b-v1 (#5410)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 10:28:27 +02:00
Ettore Di Giacinto
f8fbfd4fa3 chore(model gallery): add a-m-team_am-thinking-v1 (#5395)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-19 17:31:38 +02:00
Ettore Di Giacinto
41e239c67e chore(model gallery): add soob3123_grayline-qwen3-8b (#5394)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-19 17:02:43 +02:00
Ettore Di Giacinto
587827e779 chore(model gallery): add soob3123_grayline-qwen3-14b (#5393)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-19 15:59:07 +02:00
LocalAI [bot]
456b4982ef chore: ⬆️ Update ggml-org/llama.cpp to 6a2bc8bfb7cd502e5ebc72e36c97a6f848c21c2c (#5390)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-19 01:25:22 +00:00
Ettore Di Giacinto
159388cce8 chore: memoize detected GPUs (#5385)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-18 08:55:44 +02:00
LocalAI [bot]
cfc73c7773 chore: ⬆️ Update ggml-org/llama.cpp to e3a7cf6c5bf6a0a24217f88607b06e4405a2b5d9 (#5384)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-18 01:21:13 +00:00
Ettore Di Giacinto
6d5bde860b feat(llama.cpp): upgrade and use libmtmd (#5379)
* WIP

* wip

* wip

* Make it compile

* Update json.hpp

* this shouldn't be private for now

* Add logs

* Reset auto detected template

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Re-enable grammars

* This seems to be broken - 360a9c98e1 (diff-a18a8e64e12a01167d8e98fc)[…]cccf0d4eed09d76d879L2998-L3207

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Placeholder

* Simplify image loading

* use completion type

* disable streaming

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* correctly return timings

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Remove some debug logging

* Adapt tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Keep header

* embedding: do not use oai type

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Sync from server.cpp

* Use utils and json directly from llama.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Sync with upstream

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: copy json.hpp from the correct location

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: add httplib

* sync llama.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Embeddings: set OAICOMPAT_TYPE_EMBEDDING

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat: sync with server.cpp by including it

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* make it darwin-compatible

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-17 16:02:53 +02:00
LocalAI [bot]
6ef383033b chore: ⬆️ Update ggml-org/whisper.cpp to d1f114da61b1ae1e70b03104fad42c9dd666feeb (#5381)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-17 00:35:17 +00:00
Richard Palethorpe
cd494089d9 fix(flux): Set CFG=1 so that prompts are followed (#5378)
The recommendation with Flux is to set CFG to 1 as shown in the
stablediffusion-cpp README.

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2025-05-16 17:53:54 +02:00
LocalAI [bot]
3033845f94 chore: ⬆️ Update ggml-org/whisper.cpp to 20a20decd94badfd519a07ea91f0bba8b8fc4dea (#5374)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-16 12:46:16 +02:00
omahs
0f365ac204 fix: typos (#5376)
Signed-off-by: omahs <73983677+omahs@users.noreply.github.com>
2025-05-16 12:45:48 +02:00
Ettore Di Giacinto
525cf198be chore(model gallery): add primeintellect_intellect-2 (#5373)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-15 10:53:52 +02:00
Ettore Di Giacinto
658c2a4f55 chore(model gallery): add thedrummer_rivermind-lux-12b-v1 (#5372)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-15 10:51:55 +02:00
Ettore Di Giacinto
c987de090d chore(model gallery): add thedrummer_snowpiercer-15b-v1 (#5371)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-15 10:04:44 +02:00
Ettore Di Giacinto
04365843e6 chore(model gallery): add skywork_skywork-or1-7b (#5370)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-15 10:02:07 +02:00
Ettore Di Giacinto
1dc5781679 chore(model gallery): add skywork_skywork-or1-32b (#5369)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-15 09:58:51 +02:00
LocalAI [bot]
30704292de chore: ⬆️ Update ggml-org/whisper.cpp to f389d7e3e56bbbfec49fd333551927a0fcbb7213 (#5367)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-15 00:34:16 +00:00
Ettore Di Giacinto
e52c66c76e chore(docs/install.sh): image changes (#5354)
chore(docs): image changes

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-14 19:28:30 +02:00
LocalAI [bot]
cb28aef93b chore: ⬆️ Update ggml-org/whisper.cpp to f89056057511a1657af90bb28ef3f21e5b1f33cd (#5364)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-14 09:24:16 +02:00
LocalAI [bot]
029f97c2a2 docs: ⬆️ update docs version mudler/LocalAI (#5363)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-14 01:54:34 +00:00
Ettore Di Giacinto
3be71be696 fix(ci): tag latest against cpu-only image (#5362)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-13 22:00:41 +02:00
LocalAI [bot]
6adb019f8f chore: ⬆️ Update ggml-org/llama.cpp to de4c07f93783a1a96456a44dc16b9db538ee1618 (#5358)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-13 22:00:19 +02:00
LocalAI [bot]
fcaa0a2f01 chore: ⬆️ Update ggml-org/whisper.cpp to e41bc5c61ae66af6be2bd7011769bb821a83e8ae (#5357)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-13 21:59:50 +02:00
dependabot[bot]
fd17a3312c chore(deps): bump securego/gosec from 2.22.3 to 2.22.4 (#5356)
Bumps [securego/gosec](https://github.com/securego/gosec) from 2.22.3 to 2.22.4.
- [Release notes](https://github.com/securego/gosec/releases)
- [Changelog](https://github.com/securego/gosec/blob/master/.goreleaser.yml)
- [Commits](https://github.com/securego/gosec/compare/v2.22.3...v2.22.4)

---
updated-dependencies:
- dependency-name: securego/gosec
  dependency-version: 2.22.4
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-05-12 22:01:43 +02:00
dependabot[bot]
12d0fe610b chore(deps): bump dependabot/fetch-metadata from 2.3.0 to 2.4.0 (#5355)
Bumps [dependabot/fetch-metadata](https://github.com/dependabot/fetch-metadata) from 2.3.0 to 2.4.0.
- [Release notes](https://github.com/dependabot/fetch-metadata/releases)
- [Commits](https://github.com/dependabot/fetch-metadata/compare/v2.3.0...v2.4.0)

---
updated-dependencies:
- dependency-name: dependabot/fetch-metadata
  dependency-version: 2.4.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-05-12 22:01:19 +02:00
Ettore Di Giacinto
11c67d16b8 chore(ci): strip 'core' in the image suffix, identify python-based images with 'extras' (#5353)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-12 09:36:59 +02:00
LocalAI [bot]
63f7c86c4d chore: ⬆️ Update ggml-org/llama.cpp to 9a390c4829cd3058d26a2e2c09d16e3fd12bf1b1 (#5351)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-12 09:24:54 +02:00
LocalAI [bot]
ac89bf77bf chore: ⬆️ Update ggml-org/whisper.cpp to 2e310b841e0b4e7cf00890b53411dd9f8578f243 (#4785)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-12 01:30:35 +00:00
Ettore Di Giacinto
0395cc02fb chore(model gallery): add qwen_qwen2.5-vl-72b-instruct (#5349)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-11 09:46:32 +02:00
Ettore Di Giacinto
616972fca0 chore(model gallery): add qwen_qwen2.5-vl-7b-instruct (#5348)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-11 09:44:58 +02:00
Ettore Di Giacinto
942fbff62d chore(model gallery): add gryphe_pantheon-proto-rp-1.8-30b-a3b (#5347)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-11 09:39:28 +02:00
LocalAI [bot]
2612a0c910 chore: ⬆️ Update ggml-org/llama.cpp to 15e6125a397f6086c1dfdf7584acdb7c730313dc (#5345)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-11 09:21:46 +02:00
LocalAI [bot]
2dcb6d7247 chore(model-gallery): ⬆️ update checksum (#5346)
⬆️ Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-10 22:24:04 +02:00
Ettore Di Giacinto
6978eec69f feat(whisper.cpp): gpu support (#5344)
* fix(whisper.cpp): gpu support

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Try to fix apple tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-10 22:02:40 +02:00
LocalAI [bot]
2fcfe54466 chore: ⬆️ Update ggml-org/llama.cpp to 33eff4024084d1f0c8441b79f7208a52fad79858 (#5343)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-10 10:07:39 +02:00
Ettore Di Giacinto
4e7506a3be fix(whisper): add vulkan flag
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-05-10 08:46:21 +02:00
Ettore Di Giacinto
2a46217f90 Update Makefile
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-05-09 23:17:18 +02:00
Ettore Di Giacinto
31ff9dbd52 chore(Makefile): small cleanups, disable openmp on whisper
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-09 22:37:18 +02:00
Ettore Di Giacinto
9483abef03 fix(whisper/sycl): disable
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-09 22:36:09 +02:00
Ettore Di Giacinto
ce3e8b3e31 fix(whisper/sycl): use icx when running go build
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-09 21:48:09 +02:00
Ettore Di Giacinto
f3bb84c9a7 feat(whisper): link vulkan, hipblas and sycl
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-09 19:25:26 +02:00
Ettore Di Giacinto
ecb1297582 fix: specify icx and icpx only on whisper.cpp
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-09 10:58:30 +02:00
Ettore Di Giacinto
73fc702b3c fix: this is not needed
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-09 10:28:53 +02:00
Ettore Di Giacinto
e3af62ae1a feat: Add sycl support for whisper.cpp (#5341)
2025-05-09 09:31:02 +02:00
Ettore Di Giacinto
dc21604741 chore(deps): bump whisper.cpp (#5338)
* chore(deps): bump whisper.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* add libggml-metal

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fixups macOS arm64

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* adjust cublas for whisper.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-09 08:17:45 +02:00
LocalAI [bot]
5433f1a70e chore: ⬆️ Update ggml-org/llama.cpp to f05a6d71a0f3dbf0730b56a1abbad41c0f42e63d (#5340)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-08 23:13:28 +00:00
Ettore Di Giacinto
d5e032bdcd chore(model gallery): add gemma-3-12b-fornaxv.2-qat-cot (#5337)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-08 12:07:25 +02:00
Ettore Di Giacinto
de786f6586 chore(model gallery): add symiotic-14b-i1 (#5336)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-08 12:03:35 +02:00
Ettore Di Giacinto
8b9bc4aa6e chore(model gallery): add qwen3-14b-uncensored (#5335)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-08 11:59:26 +02:00
Ettore Di Giacinto
e6cea7d28e chore(model gallery): add cognition-ai_kevin-32b (#5334)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-08 11:57:12 +02:00
Ettore Di Giacinto
7d7d56f2ce chore(model gallery): add servicenow-ai_apriel-nemotron-15b-thinker (#5333)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-08 11:55:35 +02:00
Ettore Di Giacinto
1caae91ab6 chore(model gallery): add qwen3-4b-esper3-i1 (#5332)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-08 11:52:02 +02:00
LocalAI [bot]
e90f2cb0ca chore: ⬆️ Update ggml-org/llama.cpp to 814f795e063c257f33b921eab4073484238a151a (#5331)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-08 09:25:13 +02:00
Ettore Di Giacinto
5a4291fadd docs: update README badges
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-05-07 22:20:06 +02:00
Ettore Di Giacinto
91ef58ee5a chore(model gallery): add qwen3-14b-griffon-i1 (#5330)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-07 11:07:38 +02:00
LocalAI [bot]
a86e8c78f1 chore: ⬆️ Update ggml-org/llama.cpp to 91a86a6f354aa73a7aab7bc3d283be410fdc93a5 (#5329)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-06 23:39:10 +00:00
Ettore Di Giacinto
adb24214c6 chore(deps): bump llama.cpp to b34c859146630dff136943abc9852ca173a7c9d6 (#5323)
chore(deps): bump llama.cpp to 'b34c859146630dff136943abc9852ca173a7c9d6'

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-06 11:21:25 +02:00
Ettore Di Giacinto
f03a0430aa chore(model gallery): add claria-14b (#5326)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-06 10:48:03 +02:00
Ettore Di Giacinto
73bc12abc0 chore(model gallery): add goekdeniz-guelmez_josiefied-qwen3-8b-abliterated-v1 (#5325)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-06 10:38:20 +02:00
Ettore Di Giacinto
7fa437bbcc chore(model gallery): add huihui-ai_qwen3-14b-abliterated (#5324)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-06 10:35:55 +02:00
LocalAI [bot]
4a27c99928 chore(model-gallery): ⬆️ update checksum (#5321)
⬆️ Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-06 10:01:28 +02:00
Ettore Di Giacinto
6ce94834b6 fix(hipblas): do not build all cpu-specific flags (#5322)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-06 10:00:50 +02:00
dependabot[bot]
84a26458dc chore(deps): bump mxschmitt/action-tmate from 3.21 to 3.22 (#5319)
Bumps [mxschmitt/action-tmate](https://github.com/mxschmitt/action-tmate) from 3.21 to 3.22.
- [Release notes](https://github.com/mxschmitt/action-tmate/releases)
- [Changelog](https://github.com/mxschmitt/action-tmate/blob/master/RELEASE.md)
- [Commits](https://github.com/mxschmitt/action-tmate/compare/v3.21...v3.22)

---
updated-dependencies:
- dependency-name: mxschmitt/action-tmate
  dependency-version: '3.22'
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-05-05 22:17:59 +00:00
Ettore Di Giacinto
7aa377b6a9 fix(arm64): do not build with CPU instructions which are not available (#5318)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-05 17:30:00 +02:00
Ettore Di Giacinto
64e66dda4a chore(model gallery): add allura-org_remnant-qwen3-8b (#5317)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-05 11:09:07 +02:00
LocalAI [bot]
a085f61fdc chore: ⬆️ Update ggml-org/llama.cpp to 9fdfcdaeddd1ef57c6d041b89cd8fb7048a0f028 (#5316)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-04 23:00:25 +00:00
Ettore Di Giacinto
21bdfe5fa4 fix: use rice when embedding large binaries (#5309)
* fix(embed): use go-rice for large backend assets

Golang embed FS has a hard limit that we might exceed when providing
many binary alternatives.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* simplify golang deps

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(tests): switch to testcontainers and print logs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(tests): do not build a test binary

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* small fixup

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-04 16:42:42 +02:00
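The embed limit mentioned in the commit above is the motivation for go.rice: `embed.FS` bakes assets into the binary at compile time, while a rice box can be appended to the executable after building (`rice append`). A minimal sketch, with hypothetical box and asset names:

```go
package main

import (
	"fmt"

	rice "github.com/GeertJohan/go.rice"
)

func main() {
	// During development FindBox falls back to the on-disk folder;
	// after `rice append --exec <binary>` the same call reads data
	// appended to the executable, sidestepping compile-time limits.
	box, err := rice.FindBox("backend-assets") // hypothetical box name
	if err != nil {
		panic(err)
	}
	data, err := box.Bytes("llama-cpp") // hypothetical asset name
	if err != nil {
		panic(err)
	}
	fmt.Printf("loaded %d bytes\n", len(data))
}
```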
Ettore Di Giacinto
7ebd7b2454 chore(model gallery): add rei-v3-kto-12b (#5313)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-04 09:41:35 +02:00
Ettore Di Giacinto
6984749ea1 chore(model gallery): add kalomaze_qwen3-16b-a3b (#5312)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-04 09:39:38 +02:00
Ettore Di Giacinto
c0a206bc7a chore(model gallery): add qwen3-30b-a1.5b-high-speed (#5311)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-04 09:38:01 +02:00
LocalAI [bot]
01bbb31fb3 chore: ⬆️ Update ggml-org/llama.cpp to 36667c8edcded08063ed51c7d57e9e086bbfc903 (#5300)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-04 09:23:01 +02:00
Ettore Di Giacinto
72111c597d fix(gpu): do not assume the GPU being returned has node and mem (#5310)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-03 19:00:24 +02:00
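A sketch of the defensive access implied by the fix above, using hypothetical types: detection libraries can return GPUs whose topology node or memory fields are nil, so the code should degrade to zero rather than dereference them.

```go
package main

import "fmt"

// Hypothetical shapes; the real detection library's types differ.
type Memory struct{ TotalBytes int64 }
type Node struct{ Memory *Memory }
type GPU struct{ Node *Node }

// vramOf reports the GPU's memory, or 0 when node/mem are missing,
// instead of panicking on a nil dereference.
func vramOf(g *GPU) int64 {
	if g == nil || g.Node == nil || g.Node.Memory == nil {
		return 0
	}
	return g.Node.Memory.TotalBytes
}

func main() {
	fmt.Println(vramOf(&GPU{}))                                      // 0, no panic
	fmt.Println(vramOf(&GPU{Node: &Node{Memory: &Memory{8 << 30}}})) // 8589934592
}
```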
Ettore Di Giacinto
b2f9fc870b chore(defaults): enlarge defaults, drop gpu layers, which are inferred (#5308)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-03 18:44:51 +02:00
Ettore Di Giacinto
1fc6d469ac chore(deps): bump llama.cpp to '1d36b3670b285e69e58b9d687c770a2a0a192194' (#5307)
chore(deps): bump llama.cpp to '1d36b3670b285e69e58b9d687c770a2a0a192194'

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-03 18:44:40 +02:00
Ettore Di Giacinto
05848b2027 chore(model gallery): add smoothie-qwen3-8b (#5306)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-03 10:35:20 +02:00
Ettore Di Giacinto
1da0644aa3 chore(model gallery): add qwen-3-32b-medical-reasoning-i1 (#5305)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-03 10:24:07 +02:00
Ettore Di Giacinto
c087cd1377 chore(model gallery): add amoral-qwen3-14b (#5304)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-03 10:21:48 +02:00
Ettore Di Giacinto
c621412f6a chore(model gallery): add comet_12b_v.5-i1 (#5303)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-03 10:20:03 +02:00
Ettore Di Giacinto
5a8b1892cd chore(model gallery): add genericrpv3-4b (#5302)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-03 10:18:31 +02:00
Ettore Di Giacinto
5b20426863 chore(model gallery): add planetoid_27b_v.2 (#5301)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-03 10:14:33 +02:00
Ettore Di Giacinto
5c6cd50ed6 feat(llama.cpp): estimate vram usage (#5299)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-02 17:40:26 +02:00
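A common first-order estimate for a feature like this sums the on-disk weight size with the KV-cache footprint; the sketch below assumes an f16 cache and is not necessarily the commit's exact method.

```go
package main

import "fmt"

// estimateVRAM: loaded weights occupy roughly their on-disk size, and
// an f16 KV cache holds 2 tensors (K and V) per layer, each with
// nCtx*nEmbd elements at 2 bytes apiece.
func estimateVRAM(weightBytes int64, nLayers, nCtx, nEmbd int) int64 {
	kvCache := int64(2*nLayers*nCtx*nEmbd) * 2
	return weightBytes + kvCache
}

func main() {
	// e.g. a ~4.1GB quantized model with 32 layers, 4096 context and
	// 4096 embedding width: about 2GB of KV cache on top of weights.
	fmt.Println(estimateVRAM(4_100_000_000, 32, 4096, 4096))
}
```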
Ettore Di Giacinto
bace6516f1 chore(model gallery): add webthinker-qwq-32b-i1 (#5298)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-02 09:57:49 +02:00
Ettore Di Giacinto
3baadf6f27 chore(model gallery): add shuttleai_shuttle-3.5 (#5297)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-02 09:48:11 +02:00
Ettore Di Giacinto
8804c701b8 chore(model gallery): add microsoft_phi-4-reasoning (#5296)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-02 09:46:20 +02:00
Ettore Di Giacinto
7b3ceb19bb chore(model gallery): add microsoft_phi-4-reasoning-plus (#5295)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-02 09:43:38 +02:00
Ettore Di Giacinto
e7f3effea1 chore(model gallery): add furina-8b (#5294)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-02 09:39:22 +02:00
Ettore Di Giacinto
61694a2ffb chore(model gallery): add josiefied-qwen3-8b-abliterated-v1 (#5293)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-02 09:36:35 +02:00
LocalAI [bot]
573a3f104c chore: ⬆️ Update ggml-org/llama.cpp to d7a14c42a1883a34a6553cbfe30da1e1b84dfd6a (#5292)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-02 09:21:38 +02:00
Ettore Di Giacinto
0e8af53a5b chore: update quickstart
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-01 22:36:33 +02:00
Ettore Di Giacinto
960ffa808c chore(model gallery): add microsoft_phi-4-mini-reasoning (#5288)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-01 10:17:58 +02:00
Ettore Di Giacinto
92719568e5 chore(model gallery): add fast-math-qwen3-14b (#5287)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-01 10:14:51 +02:00
Ettore Di Giacinto
163939af71 chore(model gallery): add qwen3-8b-jailbroken (#5286)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-01 10:13:01 +02:00
Ettore Di Giacinto
399f1241dc chore(model gallery): add qwen3-30b-a3b-abliterated (#5285)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-01 10:07:42 +02:00
LocalAI [bot]
58c9ade2e8 chore: ⬆️ Update ggml-org/llama.cpp to 3e168bede4d27b35656ab8026015b87659ecbec2 (#5284)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-05-01 10:01:39 +02:00
Ettore Di Giacinto
6e1c93d84f fix(ci): comment out vllm tests
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-05-01 10:01:22 +02:00
Wyatt Neal
4076ea0494 fix: vllm missing logprobs (#5279)
* working to address missing items

referencing #3436 and #2930 - if I could test it, this might show that the
output from the vllm backend is processed and returned to the user

Signed-off-by: Wyatt Neal <wyatt.neal+git@gmail.com>

* adding in vllm tests to test-extras

Signed-off-by: Wyatt Neal <wyatt.neal+git@gmail.com>

* adding in tests to pipeline for execution

Signed-off-by: Wyatt Neal <wyatt.neal+git@gmail.com>

* removing todo block, test via pipeline

Signed-off-by: Wyatt Neal <wyatt.neal+git@gmail.com>

---------

Signed-off-by: Wyatt Neal <wyatt.neal+git@gmail.com>
2025-04-30 12:55:07 +00:00
Ettore Di Giacinto
26cbf77c0d chore(model gallery): add mlabonne_qwen3-4b-abliterated (#5283)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-30 11:09:58 +02:00
Ettore Di Giacinto
640790d628 chore(model gallery): add mlabonne_qwen3-8b-abliterated (#5282)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-30 11:08:26 +02:00
Ettore Di Giacinto
4132adea2f chore(model gallery): add mlabonne_qwen3-14b-abliterated (#5281)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-30 11:04:49 +02:00
LocalAI [bot]
2b2d907a3a chore: ⬆️ Update ggml-org/llama.cpp to e2e1ddb93a01ce282e304431b37e60b3cddb6114 (#5278)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-29 21:46:08 +00:00
Ettore Di Giacinto
6e8f4f584b fix(diffusers): consider options only in the form of key/value (#5277)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-29 17:08:55 +02:00
Richard Palethorpe
662cfc2b48 fix(aio): Fix copy-paste error in download files for gpt-4 model (#5276)
Signed-off-by: Richard Palethorpe <io@richiejp.com>
2025-04-29 17:08:16 +02:00
Ettore Di Giacinto
a25d355d66 chore(model gallery): add qwen3-0.6b (#5275)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-29 10:10:16 +02:00
Ettore Di Giacinto
6d1cfdbefc chore(model gallery): add qwen3-1.7b (#5274)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-29 10:06:03 +02:00
Ettore Di Giacinto
5ecc478968 chore(model gallery): add qwen3-4b (#5273)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-29 10:01:22 +02:00
Ettore Di Giacinto
aef5c4291b chore(model gallery): add qwen3-8b (#5272)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-29 09:59:17 +02:00
Ettore Di Giacinto
c059f912b9 chore(model gallery): add qwen3-14b (#5271)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-29 09:56:50 +02:00
LocalAI [bot]
bc1e059259 chore: ⬆️ Update ggml-org/llama.cpp to 5f5e39e1ba5dbea814e41f2a15e035d749a520bc (#5267)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-29 09:49:42 +02:00
LocalAI [bot]
38dc07793a chore(model-gallery): ⬆️ update checksum (#5268)
⬆️ Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-29 09:49:23 +02:00
Ettore Di Giacinto
da6ef0967d chore(model gallery): add qwen3-32b (#5270)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-29 09:48:28 +02:00
Ettore Di Giacinto
7a011e60bd chore(model gallery): add qwen3-30b-a3b (#5269)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-29 09:44:44 +02:00
dependabot[bot]
e13dd5b09f chore(deps): bump appleboy/scp-action from 0.1.7 to 1.0.0 (#5265)
Bumps [appleboy/scp-action](https://github.com/appleboy/scp-action) from 0.1.7 to 1.0.0.
- [Release notes](https://github.com/appleboy/scp-action/releases)
- [Changelog](https://github.com/appleboy/scp-action/blob/master/.goreleaser.yaml)
- [Commits](https://github.com/appleboy/scp-action/compare/v0.1.7...v1.0.0)

---
updated-dependencies:
- dependency-name: appleboy/scp-action
  dependency-version: 1.0.0
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-04-28 22:36:30 +00:00
Ettore Di Giacinto
86ee303bd6 chore(model gallery): add nvidia_openmath-nemotron-14b-kaggle (#5264)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-28 19:52:36 +02:00
Ettore Di Giacinto
978ee96fd3 chore(model gallery): add nvidia_openmath-nemotron-14b (#5263)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-28 19:43:49 +02:00
Ettore Di Giacinto
3ad5691db6 chore(model gallery): add nvidia_openmath-nemotron-7b (#5262)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-28 19:41:59 +02:00
Ettore Di Giacinto
0027681090 chore(model gallery): add nvidia_openmath-nemotron-1.5b (#5261)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-28 19:40:09 +02:00
Ettore Di Giacinto
8cba990edc chore(model gallery): add nvidia_openmath-nemotron-32b (#5260)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-28 19:36:57 +02:00
Simon Redman
88857696d4 fix(CUDA): Add note for how to run CUDA with SELinux (#5259)
* Add note to help run nvidia containers with SELinux

* Use correct CUDA container references as noted in the dockerhub overview

* Clean trailing whitespaces
2025-04-28 09:00:52 +02:00
LocalAI [bot]
23f347e687 chore: ⬆️ Update ggml-org/llama.cpp to ced44be34290fab450f8344efa047d8a08e723b4 (#5258)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-27 21:59:35 +00:00
Mohit Gaur
b6e3dc5f02 docs: update docs for DisableWebUI flag (#5256)
Signed-off-by: Mohit Gaur <56885276+Mohit-Gaur@users.noreply.github.com>
2025-04-27 16:02:02 +02:00
Alessandro Pirastru
69667521e2 fix(install/gpu): Fix Docker not being able to leverage the GPU on systems that have SELinux enforced (#5252)
* Update installation script for improved compatibility and clarity

- Renamed VERSION to LOCALAI_VERSION to avoid conflicts with system variables.
- Enhanced NVIDIA and CUDA repository installation for DNF5 compatibility.
- Adjusted default Fedora version handling for CUDA installation.
- Updated Docker image tag handling to use LOCALAI_VERSION consistently.
- Improved logging messages for repository and LocalAI binary downloads.
- Added a temporary bypass for nvidia-smi installation on Fedora Cloud Edition.

* feat: Add SELinux configuration for NVIDIA GPU support in containers

- Introduced `enable_selinux_container_booleans` function to handle SELinux configuration changes for GPU access.
- Included user confirmation prompt to enable SELinux `container_use_devices` boolean due to security implications.
- Added NVIDIA Container Runtime to Docker runtimes and restarted Docker to ensure proper GPU support.
- Applied SELinux adjustments conditionally for Fedora, RHEL, CentOS, Rocky, and openSUSE distributions.

Signed-off-by: Alessandro Pirastru <alessandro.pirastru.94@gmail.com>

* fix: Correct SELinux boolean parsing and add loop break

- Fixed incorrect parsing of `container_use_devices` boolean by changing the awk field from `$2` to `$3` to retrieve the correct value.
- Added a `break` statement after enabling the SELinux boolean to prevent unnecessary loop iterations after user prompt.

Signed-off-by: Alessandro Pirastru <alessandro.pirastru.94@gmail.com>

* fix: typo in install.sh

Signed-off-by: Alessandro Pirastru <57262788+Bloodis94@users.noreply.github.com>

---------

Signed-off-by: Alessandro Pirastru <alessandro.pirastru.94@gmail.com>
Signed-off-by: Alessandro Pirastru <57262788+Bloodis94@users.noreply.github.com>
2025-04-27 16:01:29 +02:00
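The `$2` vs `$3` detail above comes from the shape of `getsebool` output ("container_use_devices --> off"): the value is the third whitespace-separated field. A small Go illustration of the same parse:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Typical `getsebool container_use_devices` output: the boolean's
	// value is field 3, which is what the awk fix switched to.
	out := "container_use_devices --> off"
	fields := strings.Fields(out)
	if len(fields) == 3 {
		fmt.Println("value:", fields[2]) // "off"
	}
}
```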
LocalAI [bot]
2a92effc5d chore: ⬆️ Update ggml-org/llama.cpp to 77d5e9a76a7b4a8a7c5bf9cf6ebef91860123cba (#5254)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-27 09:21:02 +02:00
Simon Redman
a65e012aa2 docs(Vulkan): Add GPU docker documentation for Vulkan (#5255)
Add GPU docker documentation for Vulkan
2025-04-27 09:20:26 +02:00
Ettore Di Giacinto
8e9b41d05f chore(ci): build only images with ffmpeg included, simplify tags (#5251)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-27 08:23:25 +02:00
LocalAI [bot]
078da5c2f0 feat(swagger): update swagger (#5253)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-26 22:40:35 +00:00
Ettore Di Giacinto
c5af5d139c Update index.yaml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-04-26 18:42:22 +02:00
Ettore Di Giacinto
2c9279a542 feat(video-gen): add endpoint for video generation (#5247)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-26 18:05:01 +02:00
Ettore Di Giacinto
a67d22f5f2 chore(model gallery): add mmproj to gemma3 models (now working)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-26 17:31:40 +02:00
Ettore Di Giacinto
dc7c51dcc7 chore(model gallery): use the correct filename for gemma-3-27b-it-qat
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-26 17:27:50 +02:00
Ettore Di Giacinto
98df65c7aa chore(model gallery): add l3.3-genetic-lemonade-sunset-70b (#5250)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-26 17:19:20 +02:00
Ettore Di Giacinto
1559b6b522 chore(model gallery): add l3.3-geneticlemonade-unleashed-v2-70b (#5249)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-26 17:17:18 +02:00
Alessandro Pirastru
a0244e3fb4 feat(install): added complete process for installing nvidia drivers on fedora without pulling X11 (#5246)
* Update installation script for improved compatibility and clarity

- Renamed VERSION to LOCALAI_VERSION to avoid conflicts with system variables.
- Enhanced NVIDIA and CUDA repository installation for DNF5 compatibility.
- Adjusted default Fedora version handling for CUDA installation.
- Updated Docker image tag handling to use LOCALAI_VERSION consistently.
- Improved logging messages for repository and LocalAI binary downloads.
- Added a temporary bypass for nvidia-smi installation on Fedora Cloud Edition.

* Enhance log functions with ANSI color formatting

- Added ANSI escape codes for improved log styling: light blue for info, orange for warnings, and red for errors.
- Updated all log functions (`info`, `warn`, `fatal`) to include bold and colored output.

Signed-off-by: Alessandro Pirastru <alessandro.pirastru.94@gmail.com>

* feat: Enhance log functions with ANSI color formatting

- Added ANSI escape codes for improved log styling: light blue for info, orange for warnings, and red for errors.
- Updated all log functions (`info`, `warn`, `fatal`) to include bold and colored output.

Signed-off-by: Alessandro Pirastru <alessandro.pirastru.94@gmail.com>

* chore: ⬆️ Update ggml-org/llama.cpp to `ecda2ec4b347031a9b8a89ee2efc664ce63f599c` (#5238)

⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>

* fix(stablediffusion-ggml): Build with DSD CUDA, HIP and Metal flags (#5236)

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* feat(install): enhance script with choice functions and logs

- Added custom `choice_info`, `choice_warn`, and `choice_fatal` functions for interactive input logging.
- Adjusted Docker volume creation message for better clarity.
- Included NVIDIA driver check log for improved feedback to users.
- Added consistent logging before starting LocalAI Docker containers across configurations.

Signed-off-by: Alessandro Pirastru <alessandro.pirastru.94@gmail.com>

* feat(install): add Fedora NVIDIA driver installation option

- Introduced a new function to install NVIDIA kernel drivers on Fedora using akmod packages.
- Added user prompt to choose between installing drivers automatically or exiting for manual setup.
- Integrated the new function into the existing Fedora-specific CUDA toolkit installation workflow.

Signed-off-by: Alessandro Pirastru <alessandro.pirastru.94@gmail.com>

* fix(install): correct repository ID for DNF5 configuration

- Update repository ID from 'nome-repo' to 'nvidia-cuda' for DNF5.
- Ensures the correct repository name matches expected configuration.
- Fix prevents potential misconfiguration during installation process.

Signed-off-by: Alessandro Pirastru <alessandro.pirastru.94@gmail.com>

* feat(install): enhance NVIDIA driver handling on Fedora

- fixed `install_cuda_driver_yum` function call in `install_fedora_nvidia_kernel_drivers`
- Added `cuda-toolkit` for Fedora installations, as recommended by RPM Fusion.
- Adjusted driver repository commands for compatibility with DNF5.
- Improved URL and version handling for package manager installations.

Signed-off-by: Alessandro Pirastru <alessandro.pirastru.94@gmail.com>

* Refactor NVIDIA driver installation process in install.sh

- Removed redundant empty lines for cleaner formatting.
- Standardized URL formatting by removing unnecessary quotes around URLs.
- Reverted logic by removing Fedora-specific exclusions for cuda-toolkit and using `cuda-drivers` universally.
- Refined repository addition for `dnf` by explicitly setting `id` and `name` parameters for clarity and accuracy.
- Fixed minor formatting inconsistencies in parameter passing.

Signed-off-by: Alessandro Pirastru <alessandro.pirastru.94@gmail.com>

* feat: Update NVIDIA module installation warning in install script

- Clarified that Akmod installation may inhibit the reboot command.
- Added a cautionary note to the warning to inform users of potential risks.

Signed-off-by: Alessandro Pirastru <alessandro.pirastru.94@gmail.com>

* Update NVIDIA driver installation warning message

- Clarify prerequisites by noting the need for rpmfusion free/nonfree repos.
- Improve formatting of the warning box for better readability.
- Inform users that the script will install missing repos if necessary.

Signed-off-by: Alessandro Pirastru <alessandro.pirastru.94@gmail.com>

---------

Signed-off-by: Alessandro Pirastru <alessandro.pirastru.94@gmail.com>
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Signed-off-by: Richard Palethorpe <io@richiejp.com>
Co-authored-by: LocalAI [bot] <139863280+localai-bot@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Richard Palethorpe <io@richiejp.com>
2025-04-26 09:44:40 +02:00
LocalAI [bot]
d66396201a chore: ⬆️ Update ggml-org/llama.cpp to 295354ea6848a77bdee204ee1c971d9b92ffcca9 (#5245)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-26 00:05:16 +02:00
Ettore Di Giacinto
9628860c0e feat(llama.cpp/clip): inject gpu options if we detect GPUs (#5243)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-26 00:04:47 +02:00
Ettore Di Giacinto
cae9bf1308 chore(deps): bump grpcio to 1.72.0 (#5244)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-25 21:32:37 +02:00
Ettore Di Giacinto
5bb5da0760 fix(ci): add clang (#5242)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-25 16:20:05 +02:00
Ettore Di Giacinto
867973a850 chore(model gallery): add soob3123_veritas-12b (#5241)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-25 09:20:01 +02:00
LocalAI [bot]
701cd6b6d5 chore: ⬆️ Update ggml-org/llama.cpp to 226251ed56b85190e18a1cca963c45b888f4953c (#5240)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-25 08:42:22 +02:00
Richard Palethorpe
7f61d397d5 fix(stablediffusion-ggml): Build with DSD CUDA, HIP and Metal flags (#5236)
Signed-off-by: Richard Palethorpe <io@richiejp.com>
2025-04-24 10:27:17 +02:00
Alessandro Pirastru
1ae0b896fa fix: installation script compatibility with Fedora 41 and later, and unclear errors on headless Fedora (#5239)
Update installation script for improved compatibility and clarity

- Renamed VERSION to LOCALAI_VERSION to avoid conflicts with system variables.
- Enhanced NVIDIA and CUDA repository installation for DNF5 compatibility.
- Adjusted default Fedora version handling for CUDA installation.
- Updated Docker image tag handling to use LOCALAI_VERSION consistently.
- Improved logging messages for repository and LocalAI binary downloads.
- Added a temporary bypass for nvidia-smi installation on Fedora Cloud Edition.
2025-04-24 09:34:25 +02:00
LocalAI [bot]
3937407cb3 chore: ⬆️ Update ggml-org/llama.cpp to ecda2ec4b347031a9b8a89ee2efc664ce63f599c (#5238)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-24 09:32:08 +02:00
LocalAI [bot]
0e34ae4f3f chore: ⬆️ Update ggml-org/llama.cpp to 658987cfc9d752dca7758987390d5fb1a7a0a54a (#5234)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-23 09:13:49 +02:00
dependabot[bot]
a38b99ecb6 chore(deps): bump mxschmitt/action-tmate from 3.19 to 3.21 (#5231)
Bumps [mxschmitt/action-tmate](https://github.com/mxschmitt/action-tmate) from 3.19 to 3.21.
- [Release notes](https://github.com/mxschmitt/action-tmate/releases)
- [Changelog](https://github.com/mxschmitt/action-tmate/blob/master/RELEASE.md)
- [Commits](https://github.com/mxschmitt/action-tmate/compare/v3.19...v3.21)

---
updated-dependencies:
- dependency-name: mxschmitt/action-tmate
  dependency-version: '3.21'
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-04-22 10:27:10 +02:00
LocalAI [bot]
a4a4358182 chore: ⬆️ Update ggml-org/llama.cpp to 1d735c0b4fa0551c51c2f4ac888dd9a01f447985 (#5233)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-22 10:25:54 +02:00
Ettore Di Giacinto
4bc39c2db3 fix: typo on README link
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-04-21 22:13:14 +02:00
Ettore Di Giacinto
cc3df759f8 chore(docs): improve installer.sh docs (#5232)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-21 22:11:43 +02:00
LocalAI [bot]
378161060c chore: ⬆️ Update ggml-org/llama.cpp to 6602304814e679cc8c162bb760a034aceb4f8965 (#5228)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-20 21:44:33 +00:00
Ettore Di Giacinto
f2f788fe60 chore(model gallery): add starrysky-12b-i1 (#5224)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-20 10:26:30 +02:00
Ettore Di Giacinto
9fa8ed6b1e chore(model gallery): add amoral-gemma3-1b-v2 (#5223)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-20 10:23:24 +02:00
Ettore Di Giacinto
7fc37c5e29 chore(model gallery): add llama_3.3_70b_darkhorse-i1 (#5222)
chore(model gallery): add llama_3.3_70b_darkhorse-i1

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-20 10:20:58 +02:00
Ettore Di Giacinto
4bc4b1e8bc chore(model gallery): update gemma3 qat models
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-20 10:11:12 +02:00
LocalAI [bot]
e495b89f18 chore: ⬆️ Update ggml-org/llama.cpp to 00137157fca3d17b90380762b4d7cc158d385bd3 (#5218)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-19 23:50:35 +00:00
LocalAI [bot]
ba09eaea1b feat(swagger): update swagger (#5217)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-19 22:06:30 +02:00
Ettore Di Giacinto
61cc76c455 chore(autogptq): drop archived backend (#5214)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-19 15:52:29 +02:00
Ettore Di Giacinto
8abecb4a18 chore: bump grpc limits to 50MB (#5212)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-19 08:53:24 +02:00
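gRPC caps incoming messages at 4MB by default, which large payloads (audio, images, tensors) can exceed; both the server options and the client call options have to be raised. A grpc-go sketch with a hypothetical address:

```go
package main

import (
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

const maxMsgSize = 50 * 1024 * 1024 // 50MB, matching the commit above

func main() {
	// Server side: lift the default 4MB receive limit and set a
	// matching send limit.
	srv := grpc.NewServer(
		grpc.MaxRecvMsgSize(maxMsgSize),
		grpc.MaxSendMsgSize(maxMsgSize),
	)
	defer srv.Stop()

	// Client side: per-call limits must be raised as well.
	conn, err := grpc.Dial("localhost:50051", // hypothetical address
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithDefaultCallOptions(
			grpc.MaxCallRecvMsgSize(maxMsgSize),
			grpc.MaxCallSendMsgSize(maxMsgSize),
		),
	)
	if err != nil {
		panic(err)
	}
	defer conn.Close()
}
```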
LocalAI [bot]
8b3f76d8e6 chore: ⬆️ Update ggml-org/llama.cpp to 6408210082cc0a61b992b487be7e2ff2efbb9e36 (#5211)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-18 21:45:48 +00:00
Ettore Di Giacinto
4e0497f1a6 chore(model gallery): add pictor-1338-qwenp-1.5b (#5208)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-18 10:47:23 +02:00
Ettore Di Giacinto
ba88c9f451 chore(ci): use gemma-3-12b-it for model notifications (twitter)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-04-18 10:38:36 +02:00
Ettore Di Giacinto
a598285825 chore(model gallery): add google-gemma-3-27b-it-qat-q4_0-small (#5207)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-18 10:35:48 +02:00
Ettore Di Giacinto
cb7a172897 chore(ci): use gemma-3-12b-it for model notifications
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-04-18 10:20:33 +02:00
Ettore Di Giacinto
771be28dfb ci: use gemma3 for notifications of releases
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-04-18 10:19:52 +02:00
Ettore Di Giacinto
7d6b3eb42d chore(model gallery): add readyart_amoral-fallen-omega-gemma3-12b (#5206)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-18 10:17:39 +02:00
Ettore Di Giacinto
0bb33fab55 chore(model gallery): add ibm-granite_granite-3.3-2b-instruct (#5205)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-18 10:15:05 +02:00
Ettore Di Giacinto
e3bf7f77f7 chore(model gallery): add ibm-granite_granite-3.3-8b-instruct (#5204)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-18 09:59:17 +02:00
LocalAI [bot]
bd1707d339 chore: ⬆️ Update ggml-org/llama.cpp to 2f74c354c0f752ed9aabf7d3a350e6edebd7e744 (#5203)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-17 21:52:12 +00:00
Ettore Di Giacinto
0474804541 fix(ci): remove duplicate entry
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-17 19:51:21 +02:00
Ettore Di Giacinto
72693b3917 feat(install.sh): allow uninstalling with --uninstall (#5202)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-17 16:32:23 +02:00
Florian Bachmann
a03b70010f fix(talk): Talk interface sends content-type headers to chatgpt (#5200)
Talk interface sends content-type headers to chatgpt

Signed-off-by: baflo <834350+baflo@users.noreply.github.com>
2025-04-17 15:02:11 +02:00
Ettore Di Giacinto
e3717e5c1a chore(model gallery): add qwen2.5-14b-instruct-1m (#5201)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-17 10:42:22 +02:00
Ettore Di Giacinto
c8f6858218 chore(ci): add latest images for core (#5198)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-17 10:00:18 +02:00
Ettore Di Giacinto
06d7cc43ae chore(model gallery): add dreamgen_lucid-v1-nemo (#5196)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-17 09:10:09 +02:00
Ettore Di Giacinto
f2147cb850 chore(model gallery): add thedrummer_rivermind-12b-v1 (#5195)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-17 09:02:54 +02:00
Ettore Di Giacinto
75bb9f4c28 chore(model gallery): add menlo_rezero-v0.1-llama-3.2-3b-it-grpo-250404 (#5194)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-17 09:00:11 +02:00
LocalAI [bot]
a2ef4b1e07 chore: ⬆️ Update ggml-org/llama.cpp to 015022bb53387baa8b23817ac03743705c7d472b (#5192)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-04-17 08:04:37 +02:00
LocalAI [bot]
161c9fe2db docs: ⬆️ update docs version mudler/LocalAI (#5191)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-16 22:13:49 +02:00
Ettore Di Giacinto
7547463f81 Update quickstart.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-04-16 08:48:55 +02:00
Gianluca Boiano
32e4dfd47b chore(model gallery): add suno-ai bark-cpp model (#5187)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-04-16 08:22:46 +02:00
Gianluca Boiano
f67e5dec68 fix: bark-cpp: assign FLAG_TTS to bark-cpp backend (#5186)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-04-16 08:21:30 +02:00
LocalAI [bot]
297d54acea chore: ⬆️ Update ggml-org/llama.cpp to 80f19b41869728eeb6a26569957b92a773a2b2c6 (#5183)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-15 22:50:32 +00:00
Ettore Di Giacinto
56f44d448c chore(docs): decrease logo size, minor enhancements
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-15 22:00:51 +02:00
Richard Palethorpe
0f0fafacd9 fix(stablediffusion): Avoid overwriting SYCL specific flags from outer make call (#5181)
Signed-off-by: Richard Palethorpe <io@richiejp.com>
2025-04-15 19:31:25 +02:00
Ettore Di Giacinto
4f239bac89 feat: rebrand - LocalAGI and LocalRecall join the LocalAI stack family (#5159)
* wip

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Update lotusdocs and hugo

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* rephrasing

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Latest fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Adjust readme section

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-15 17:51:24 +02:00
Ettore Di Giacinto
04d74ac648 chore(model gallery): add m1-32b (#5182)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-15 17:17:17 +02:00
Richard Palethorpe
18c3dc33ee fix(stablediffusion): Pass ROCM LD CGO flags through to recursive make (#5179)
Signed-off-by: Richard Palethorpe <io@richiejp.com>
2025-04-15 09:27:29 +02:00
LocalAI [bot]
508cfa7369 chore: ⬆️ Update ggml-org/llama.cpp to d6d2c2ab8c8865784ba9fef37f2b2de3f2134d33 (#5178)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-14 23:10:16 +02:00
Ettore Di Giacinto
1f94cddbae chore(model gallery): add nvidia_llama-3.1-8b-ultralong-4m-instruct (#5177)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-14 12:30:55 +02:00
Ettore Di Giacinto
21ae7b4cd4 chore(model gallery): add nvidia_llama-3.1-8b-ultralong-1m-instruct (#5176)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-14 12:28:09 +02:00
Ettore Di Giacinto
bef22ab547 chore(model gallery): add skywork_skywork-or1-32b-preview (#5175)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-14 12:25:43 +02:00
Ettore Di Giacinto
eb04e8cdcf chore(model gallery): add skywork_skywork-or1-math-7b (#5174)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-14 12:23:44 +02:00
Ettore Di Giacinto
17e533a086 chore(model gallery): add skywork_skywork-or1-7b-preview (#5173)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-14 12:20:20 +02:00
qwerty108109
4fc68409ff Update README.md (#5172)
Modified the README.md to separate the different docker run commands, making them easier to copy into the terminal.

Signed-off-by: qwerty108109 <97707491+qwerty108109@users.noreply.github.com>
2025-04-14 10:48:10 +02:00
Richard Palethorpe
e587044449 fix(stablediffusion): Avoid GGML commit which causes CUDA compile error (#5170)
Signed-off-by: Richard Palethorpe <io@richiejp.com>
2025-04-14 09:29:09 +02:00
LocalAI [bot]
1f09db5161 chore: ⬆️ Update ggml-org/llama.cpp to 71e90e8813f90097701e62f7fce137d96ddf41e2 (#5171)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-13 21:46:07 +00:00
Ettore Di Giacinto
05b744f086 chore(model gallery): add daichi-12b (#5169)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-13 15:53:11 +02:00
Ettore Di Giacinto
89ca4bc02d chore(model gallery): add hamanasu-magnum-4b-i1 (#5168)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-13 14:37:59 +02:00
Ettore Di Giacinto
e626aa48a4 chore(model gallery): add hamanasu-adventure-4b-i1 (#5167)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-13 14:35:57 +02:00
Ettore Di Giacinto
752b5e0339 chore(model gallery): add mag-picaro-72b (#5166)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-13 14:34:14 +02:00
Ettore Di Giacinto
637d72d6e3 chore(model gallery): add lightthinker-qwen (#5165)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-13 14:31:05 +02:00
LocalAI [bot]
f3bfec580a chore: ⬆️ Update ggml-org/llama.cpp to bc091a4dc585af25c438c8473285a8cfec5c7695 (#5158)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-13 08:23:41 +00:00
Ettore Di Giacinto
165c1ddff3 chore(model gallery): add tesslate_gradience-t1-3b-preview (#5160)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-12 10:37:40 +02:00
Ettore Di Giacinto
fb83238e9e chore(model gallery): add zyphra_zr1-1.5b (#5157)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-11 10:06:05 +02:00
Ettore Di Giacinto
700bfa41c7 chore(model gallery): add agentica-org_deepcoder-1.5b-preview (#5156)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-11 10:03:59 +02:00
LocalAI [bot]
25bdc350df chore: ⬆️ Update ggml-org/llama.cpp to 64eda5deb9859e87a020e56bab5d2f9ca956f1de (#5155)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-10 21:44:55 +00:00
Richard Palethorpe
1b899e1a68 feat(stablediffusion): Enable SYCL (#5144)
* feat(sycl): Enable SYCL for stable diffusion

This is a pain because we compile with CGO, but SD is compiled with
CMake. I don't think we can easily use CMake to set the necessary
linker flags, and I could not find pkg-config calls that would fully
set them, so some of the flags are set manually.

See https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-link-line-advisor.html
for reference. I also resorted to searching the shared object files in
MKLROOT/lib for the symbols.

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(ci): Don't set nproc on cmake

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2025-04-10 15:20:53 +02:00
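Because the final link is driven by the Go toolchain rather than CMake, flags like these end up declared through cgo directives. A sketch only: `-fsycl` matches the commit, while the MKL library names are illustrative stand-ins for whatever the link-line advisor actually prescribes.

```go
// Package stablediffusion: hypothetical cgo stanza showing where
// manually-determined SYCL/oneMKL flags would live. Building it
// requires an Intel oneAPI environment.
package stablediffusion

/*
#cgo CXXFLAGS: -fsycl
#cgo LDFLAGS: -fsycl -lmkl_core -lmkl_sequential -lmkl_intel_ilp64
*/
import "C"
```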
Ettore Di Giacinto
3bf13f8c69 chore(model gallery): add soob3123_amoral-cogito-v1-preview-qwen-14b (#5154)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-10 10:07:56 +02:00
Ettore Di Giacinto
7a00729374 chore(model gallery): add trappu_magnum-picaro-0.7-v2-12b (#5153)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-10 10:03:42 +02:00
Ettore Di Giacinto
d484028532 feat(diffusers): add support for Lumina2Text2ImgPipeline (#4806)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-10 09:55:51 +02:00
LocalAI [bot]
0eb7fc2c41 chore: ⬆️ Update ggml-org/llama.cpp to d3bd7193ba66c15963fd1c59448f22019a8caf6e (#5152)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-09 22:01:25 +00:00
Ettore Di Giacinto
a69e30e0c9 chore(model gallery): add agentica-org_deepcoder-14b-preview (#5151)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-09 16:55:47 +02:00
Ettore Di Giacinto
9c018e6bff chore(model gallery): add deepcogito_cogito-v1-preview-llama-70b (#5150)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-09 16:54:59 +02:00
Ettore Di Giacinto
281e818047 chore(model gallery): add deepcogito_cogito-v1-preview-llama-70b (#5150)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-09 16:53:28 +02:00
Ettore Di Giacinto
270f0e2157 chore(model gallery): add deepcogito_cogito-v1-preview-qwen-32b (#5149)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-09 16:48:15 +02:00
Ettore Di Giacinto
673e59e76c chore(model gallery): add deepcogito_cogito-v1-preview-llama-3b (#5148)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-09 16:42:53 +02:00
LocalAI [bot]
5a8a2adb44 chore: ⬆️ Update ggml-org/llama.cpp to b32efad2bc42460637c3a364c9554ea8217b3d7f (#5146)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-04-09 15:39:04 +02:00
Ettore Di Giacinto
a7317d23bf chore(model gallery): add deepcogito_cogito-v1-preview-llama-8b (#5147)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-09 10:02:09 +02:00
Ettore Di Giacinto
2bab9b5fe2 fix: fix gallery name for cogito-v1-preview-qwen-14B
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-04-08 22:15:32 +02:00
Ettore Di Giacinto
081be3ba7d chore(model gallery): add cogito-v1-preview-qwen-14b (#5145)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-08 22:04:14 +02:00
Ettore Di Giacinto
25e6f21322 chore(deps): bump llama.cpp to 4ccea213bc629c4eef7b520f7f6c59ce9bbdaca0 (#5143)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-08 11:26:06 +02:00
Ettore Di Giacinto
b4df1c9cf3 fix(gemma): improve prompt for tool calls (#5142)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-08 10:12:42 +02:00
Ettore Di Giacinto
4fbd6609f2 chore(model gallery): add meta-llama_llama-4-scout-17b-16e-instruct (#5141)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-08 10:12:28 +02:00
Ettore Di Giacinto
7387932f89 chore(model gallery): add mensa-beta-14b-instruct-i1 (#5140)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-08 10:01:24 +02:00
Ettore Di Giacinto
59c37e67b2 chore(model gallery): add eurydice-24b-v2-i1 (#5139)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-08 09:56:29 +02:00
Ettore Di Giacinto
c09d227647 chore(model gallery): add watt-ai_watt-tool-70b (#5138)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-08 09:42:49 +02:00
Ettore Di Giacinto
547d322b28 chore(model gallery): add arliai_qwq-32b-arliai-rpr-v (#5137)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-08 09:40:26 +02:00
dependabot[bot]
a6f0bb410f chore(deps): bump securego/gosec from 2.22.0 to 2.22.3 (#5134)
Bumps [securego/gosec](https://github.com/securego/gosec) from 2.22.0 to 2.22.3.
- [Release notes](https://github.com/securego/gosec/releases)
- [Changelog](https://github.com/securego/gosec/blob/master/.goreleaser.yml)
- [Commits](https://github.com/securego/gosec/compare/v2.22.0...v2.22.3)

---
updated-dependencies:
- dependency-name: securego/gosec
  dependency-version: 2.22.3
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-04-07 21:09:45 +00:00
Ettore Di Giacinto
710f624ecd fix(webui): improve model display, do not block view (#5133)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-07 18:03:25 +02:00
LocalAI [bot]
5018452be7 chore: ⬆️ Update ggml-org/llama.cpp to 916c83bfe7f8b08ada609c3b8e583cf5301e594b (#5130)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-06 21:51:51 +00:00
Ettore Di Giacinto
ece239966f chore: ⬆️ Update ggml-org/llama.cpp to 6bf28f0111ff9f21b3c1b1eace20c590281e7ba6 (#5127)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-06 14:01:51 +02:00
Ettore Di Giacinto
3b8bc7e64c chore(model gallery): add open-thoughts_openthinker2-7b (#5129)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-06 10:53:22 +02:00
Ettore Di Giacinto
fc73b2b430 chore(model gallery): add open-thoughts_openthinker2-32b (#5128)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-06 10:48:21 +02:00
Ettore Di Giacinto
901dba6063 chore(model gallery): add gemma-3-27b-it-qat (#5124)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-05 08:46:49 +02:00
LocalAI [bot]
b88a7a4550 chore: ⬆️ Update ggml-org/llama.cpp to 3e1d29348b5d77269f6931500dd1c1a729d429c8 (#5123)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-04 21:49:53 +00:00
Ettore Di Giacinto
106e40845f chore(model gallery): add katanemo_arch-function-chat-3b (#5122) 2025-04-04 10:45:44 +02:00
Ettore Di Giacinto
0064bec8f5 chore(model gallery): add katanemo_arch-function-chat-1.5b (#5121) 2025-04-04 10:31:44 +02:00
Ettore Di Giacinto
9e6dbb0b5a chore(model gallery): add katanemo_arch-function-chat-7b (#5120)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-04 10:29:47 +02:00
Ettore Di Giacinto
d26e61388b chore(model gallery): add tesslate_synthia-s1-27b (#5119)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-04 10:27:52 +02:00
Ettore Di Giacinto
31a7084c75 chore(model gallery): add gemma-3-4b-it-qat (#5118)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-04 10:23:56 +02:00
Ettore Di Giacinto
128612a6fc chore(model gallery): add gemma-3-12b-it-qat (#5117)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-04 10:21:45 +02:00
LocalAI [bot]
6af3f46bc3 chore: ⬆️ Update ggml-org/llama.cpp to c262beddf29f3f3be5bbbf167b56029a19876956 (#5116)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-03 22:59:49 +00:00
Richard Palethorpe
d2cf8ef070 fix(sycl): kernel not found error by forcing -fsycl (#5115)
* chore(sycl): Update oneapi to 2025:1

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(sycl): Pass -fsycl flag as workaround

-fsycl should be set by llama.cpp's cmake file, but something goes wrong
and it doesn't appear to get added

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(build): Speed up llama build by using all CPUs

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2025-04-03 16:22:59 +02:00
Ettore Di Giacinto
259ad3cfe6 chore(model gallery): add all-hands_openhands-lm-1.5b-v0.1 (#5114)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-03 10:25:46 +02:00
Ettore Di Giacinto
18b320d577 chore(deps): bump llama.cpp to f01bd02376f919b05ee635f438311be8dfc91d7c (#5110)
chore(deps): bump llama.cpp to 'f01bd02376f919b05ee635f438311be8dfc91d7c'

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-03 10:23:14 +02:00
Ettore Di Giacinto
89e151f035 chore(model gallery): add all-hands_openhands-lm-7b-v0.1 (#5113)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-03 10:20:20 +02:00
Ettore Di Giacinto
22060f6410 chore(model gallery): add burtenshaw_gemmacoder3-12b (#5112)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-03 10:17:57 +02:00
Ettore Di Giacinto
7ee3288460 chore(model gallery): add all-hands_openhands-lm-32b-v0.1 (#5111)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-03 10:15:57 +02:00
LocalAI [bot]
cbbc954a8c chore: ⬆️ Update ggml-org/llama.cpp to f423981ac806bf031d83784bcb47d2721bc70f97 (#5108)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-02 09:22:53 +02:00
Ettore Di Giacinto
2c425e9c69 feat(loader): enhance single active backend by treating as singleton (#5107)
feat(loader): enhance single active backend by treating as singleton

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-01 20:58:11 +02:00
LocalAI [bot]
c59975ab05 chore: ⬆️ Update ggml-org/llama.cpp to c80a7759dab10657b9b6c3e87eef988a133b9b6a (#5105)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-01 00:01:34 +02:00
Ettore Di Giacinto
05f7004487 fix: race during stop of active backends (#5106)
* chore: drop double call to stop all backends, refactors

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: take the lock when cycling over the models to delete

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-01 00:01:10 +02:00
Ettore Di Giacinto
2f9203cd2a chore: drop remoteLibraryURL from kong vars (#5103)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-31 22:48:17 +02:00
LocalAI [bot]
f09b33f2ef docs: ⬆️ update docs version mudler/LocalAI (#5104)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-31 22:48:03 +02:00
Ettore Di Giacinto
65470b0ab1 Update README 2025-03-31 21:51:09 +02:00
Ettore Di Giacinto
9a23fe662b Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-03-31 19:35:34 +02:00
LocalAI [bot]
6d7ac09e96 chore: ⬆️ Update ggml-org/llama.cpp to 4663bd353c61c1136cd8a97b9908755e4ab30cec (#5100)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-30 21:59:30 +00:00
Ettore Di Giacinto
c2a39e3639 fix(llama.cpp): properly handle sigterm (#5099)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-30 18:08:29 +02:00
Ettore Di Giacinto
ae625a4d00 chore(model gallery): add hammer2.0-7b (#5098)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-30 09:50:21 +02:00
Ettore Di Giacinto
7f3a029596 chore(model gallery): add forgotten-abomination-70b-v5.0 (#5097)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-30 09:48:24 +02:00
Ettore Di Giacinto
b34cf00819 chore(model gallery): add galactic-qwen-14b-exp1 (#5096)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-30 09:46:51 +02:00
LocalAI [bot]
d4a10b4300 chore: ⬆️ Update ggml-org/llama.cpp to 0bb2919335d00ff0bc79d5015da95c422de51f03 (#5095)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-29 21:40:45 +00:00
Ettore Di Giacinto
9c74d74f7b feat(gguf): guess default context size from file (#5089)
feat(gguf): guess default context size from file

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-29 14:42:14 +01:00
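With this change, a model definition can omit `context_size` and LocalAI derives a sensible default from the GGUF metadata of the model file. A minimal sketch of a model YAML relying on that behavior (model name and file are hypothetical):

```yaml
name: my-model
parameters:
  model: my-model.Q4_K_M.gguf
# context_size is deliberately omitted: per the commit above, LocalAI
# now guesses a default from the GGUF file's metadata.
```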
Ettore Di Giacinto
679ee7bea4 chore(model gallery): add chaoticneutrals_very_berry_qwen2_7b (#5093)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-29 12:34:49 +01:00
Ettore Di Giacinto
77d7dc62c4 chore(model gallery): add tesslate_tessa-t1-3b (#5092)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-29 12:15:28 +01:00
Ettore Di Giacinto
699519d1fe chore(model gallery): add tesslate_tessa-t1-7b (#5091)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-29 12:12:01 +01:00
Ettore Di Giacinto
8faf39d34e chore(model gallery): add tesslate_tessa-t1-14b (#5090)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-29 11:58:39 +01:00
Ettore Di Giacinto
5d261a6fcd chore(model gallery): add tesslate_tessa-t1-32b (#5088)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-29 11:53:47 +01:00
Ettore Di Giacinto
22d5727089 chore(model gallery): add tarek07_legion-v2.1-llama-70b (#5087) 2025-03-29 11:27:06 +01:00
LocalAI [bot]
c965197d6f chore: ⬆️ Update ggml-org/llama.cpp to b4ae50810e4304d052e630784c14bde7e79e4132 (#5085)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-28 21:37:18 +00:00
Ettore Di Giacinto
994a6c4939 chore(model gallery): fallen-safeword-70b-r1-v4.1 (#5084) 2025-03-28 15:20:38 +01:00
Ettore Di Giacinto
f926d2a72b chore(model gallery): thoughtless-fallen-abomination-70b-r1-v4.1-i1 (#5083) 2025-03-28 15:11:54 +01:00
Ettore Di Giacinto
ddeb9ed93e chore(model gallery): qwen2.5-14b-instruct-1m-unalign-i1 (#5082) 2025-03-28 15:08:33 +01:00
Ettore Di Giacinto
c7e99c7b59 chore(model gallery): gemma-3-starshine-12b-i1 (#5081) 2025-03-28 14:50:39 +01:00
Ettore Di Giacinto
6fabc92e56 chore(model gallery): add soob3123_amoral-gemma3-12b-v2 (#5080)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-28 14:45:02 +01:00
LocalAI [bot]
4645b3c919 chore: ⬆️ Update ggml-org/llama.cpp to 5dec47dcd411fdf815a3708fd6194e2b13d19006 (#5079)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-27 23:32:33 +00:00
Dave
134fe2705c fix: ensure git-lfs is present (#5078)
devcontainer clean builds had an issue with git-lfs -- should this be installed for _all_ images for safety?

Signed-off-by: Dave Lee <dave@gray101.com>
2025-03-27 22:23:28 +01:00
LocalAI [bot]
3cca32ba7e chore: ⬆️ Update ggml-org/llama.cpp to b3298fa47a2d56ae892127ea038942ab1cada190 (#5077)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-27 10:47:07 +01:00
Ettore Di Giacinto
c069e61b26 chore(model gallery): add textsynth-8b-i1 (#5076)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-26 14:40:19 +01:00
Ettore Di Giacinto
7fa159e164 chore(model gallery): add blacksheep-24b-i1 (#5075)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-26 14:37:30 +01:00
Ettore Di Giacinto
5f92025617 chore(model gallery): add gemma-3-glitter-12b-i1 (#5074)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-26 10:19:00 +01:00
LocalAI [bot]
333e1bc732 chore: ⬆️ Update ggml-org/llama.cpp to ef19c71769681a0b3dde6bc90911728376e5d236 (#5073)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-26 09:51:20 +01:00
Ettore Di Giacinto
e90b97c144 chore(model gallery): add alamios_mistral-small-3.1-draft-0.5b (#5071)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-25 10:10:45 +01:00
Ettore Di Giacinto
747eeb1d46 chore(model gallery): add helpingai_helpingai3-raw (#5070)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-25 10:09:00 +01:00
Ettore Di Giacinto
5d2c53abc0 chore(model gallery): add jdineen_llama-3.1-8b-think (#5069)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-25 10:06:24 +01:00
LocalAI [bot]
0b1e721242 chore: ⬆️ Update ggml-org/llama.cpp to c95fa362b3587d1822558f7e28414521075f254f (#5068)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-24 21:37:16 +00:00
Ettore Di Giacinto
8c76a9ce99 chore(model gallery): add dusk_rainbow (#5066)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-24 09:49:32 +01:00
Ettore Di Giacinto
338321af5b chore(model gallery): add eximius_persona_5b (#5065)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-24 09:30:20 +01:00
Ettore Di Giacinto
2774a92484 chore(model gallery): add impish_llama_3b (#5064)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-24 09:27:04 +01:00
LocalAI [bot]
1a6bfb41a1 chore: ⬆️ Update ggml-org/llama.cpp to 77f9c6bbe55fccd9ea567794024cb80943947901 (#5062)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-23 21:37:14 +00:00
Ettore Di Giacinto
314981eaf8 chore(model gallery): add fiendish_llama_3b (#5061)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-23 10:00:19 +01:00
Ettore Di Giacinto
d7266c633d chore(model gallery): add sicariussicariistuff_x-ray_alpha (#5060)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-23 09:56:35 +01:00
Ettore Di Giacinto
eb4d5f2b95 chore(model gallery): add mawdistical_mawdistic-nightlife-24b (#5059)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-23 09:52:50 +01:00
Ettore Di Giacinto
c63b449ad6 chore(model gallery): add huihui-ai_gemma-3-1b-it-abliterated (#5058)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-23 09:35:05 +01:00
Ettore Di Giacinto
dd4a778c2c chore(model gallery): add thedrummer_fallen-gemma3-27b-v1 (#5057)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-23 09:32:58 +01:00
Ettore Di Giacinto
a0896d21d6 chore(model gallery): add thedrummer_fallen-gemma3-12b-v1 (#5056)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-23 09:31:37 +01:00
Ettore Di Giacinto
0e697f951a chore(model gallery): add thedrummer_fallen-gemma3-4b-v1 (#5055)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-23 09:30:17 +01:00
Ettore Di Giacinto
fa4bb9082d chore(model gallery): add knoveleng_open-rs3 (#5054)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-23 09:27:27 +01:00
LocalAI [bot]
8ff7b15441 chore: ⬆️ Update ggml-org/llama.cpp to ba932dfb50cc694645b1a148c72f8c06ee080b17 (#5053)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-22 22:18:55 +00:00
LocalAI [bot]
dd45f85a20 chore: ⬆️ Update ggml-org/llama.cpp to 4375415b4abf94fb36a5fd15f233ac0ee23c0bd1 (#5052)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-21 21:36:25 +00:00
Ettore Di Giacinto
decdd9e522 chore(model gallery): add luvgpt_phi3-uncensored-chat (#5051)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-21 09:11:07 +01:00
Ettore Di Giacinto
31a21d4a2c chore(model gallery): add sao10k_llama-3.3-70b-vulpecula-r1 (#5050)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-21 09:08:55 +01:00
Ettore Di Giacinto
2c129843a7 chore(model gallery): add qwen-writerdemo-7b-s500-i1 (#5049)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-21 09:03:39 +01:00
LocalAI [bot]
ce71a0bcfb chore: ⬆️ Update ggml-org/llama.cpp to e04643063b3d240b8c0fdba98677dff6ba346784 (#5047)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-20 21:34:51 +00:00
Ettore Di Giacinto
0a32c38317 chore(model gallery): add basic function template for gemma
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-20 09:32:21 +01:00
Ettore Di Giacinto
36f596f260 chore(model gallery): add soob3123_amoral-gemma3-4b (#5046)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-20 09:30:04 +01:00
Ettore Di Giacinto
953552545b chore(model gallery): add samsungsailmontreal_bytecraft (#5045)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-20 09:27:33 +01:00
Ettore Di Giacinto
835e55b1de chore(model gallery): add rootxhacker_apollo-v3-32b (#5044)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-20 09:20:42 +01:00
Ettore Di Giacinto
dcd2921eaa chore(model gallery): add gemma-3-4b-it-uncensored-dbl-x-i1 (#5043)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-20 09:17:20 +01:00
LocalAI [bot]
5e6459fd18 chore: ⬆️ Update ggml-org/llama.cpp to 568013d0cd3d5add37c376b3d5e959809b711fc7 (#5042)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-19 21:47:18 +00:00
Ettore Di Giacinto
50ddb3eb59 chore(model gallery): add nvidia_llama-3_3-nemotron-super-49b-v1 (#5041)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-19 09:37:27 +01:00
Ettore Di Giacinto
5eebfee4b5 chore(model gallery): add gryphe_pantheon-rp-1.8-24b-small-3.1 (#5040)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-19 09:32:47 +01:00
Ettore Di Giacinto
567919ea90 chore(model gallery): add mistralai_mistral-small-3.1-24b-instruct-2503 (#5039)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-19 09:29:23 +01:00
LocalAI [bot]
27a3997530 chore(model-gallery): ⬆️ update checksum (#5036)
⬆️ Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-19 09:18:40 +01:00
LocalAI [bot]
192ba2c657 chore: ⬆️ Update ggml-org/llama.cpp to d84635b1b085d54d6a21924e6171688d6e3dfb46 (#5035)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-18 22:23:39 +00:00
Ettore Di Giacinto
92abac9ca8 chore(model gallery): add soob3123_amoral-gemma3-12b (#5034)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-18 09:38:05 +01:00
Ettore Di Giacinto
04ebbbd73a chore(model gallery): add mlabonne_gemma-3-4b-it-abliterated (#5033)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-18 09:36:14 +01:00
Ettore Di Giacinto
55305e0d95 chore(model gallery): add mlabonne_gemma-3-12b-it-abliterated (#5032)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-18 09:32:41 +01:00
Ettore Di Giacinto
67623639e4 chore(model gallery): add mlabonne_gemma-3-27b-it-abliterated (#5031)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-18 09:30:25 +01:00
LocalAI [bot]
cc76def342 chore: ⬆️ Update ggml-org/llama.cpp to b1b132efcba216c873715c483809730bb253f4a1 (#5029)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-17 21:43:15 +00:00
Ettore Di Giacinto
4967fa5928 chore(model gallery): disable gemma3 mmproj
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-17 12:34:21 +01:00
Ettore Di Giacinto
2b98e4ec56 chore(model gallery): update gemma3 URLs
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-17 12:22:35 +01:00
Ettore Di Giacinto
fa1d058ee2 chore(model gallery): add mmproj files for gemma3 models (#5028)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-17 12:11:46 +01:00
Ettore Di Giacinto
a49a588bfa chore(model gallery): add readyart_forgotten-safeword-70b-3.6 (#5027)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-17 11:50:34 +01:00
LocalAI [bot]
ca7dda61c6 chore: ⬆️ Update ggml-org/llama.cpp to 8ba95dca2065c0073698afdfcda4c8a8f08bf0d9 (#5026)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-16 21:42:17 +00:00
Ettore Di Giacinto
ffedddd76d chore(model gallery): add beaverai_mn-2407-dsk-qwqify-v0.1-12b (#5024)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-16 09:33:19 +01:00
Ettore Di Giacinto
766c76ae8e chore(model gallery): add pocketdoc_dans-sakurakaze-v1.0.0-12b (#5023)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-16 09:29:48 +01:00
LocalAI [bot]
3096ff33e9 chore: ⬆️ Update ggml-org/llama.cpp to f4c3dd5daa3a79f713813cf1aabdc5886071061d (#5022)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-15 21:43:48 +00:00
Ettore Di Giacinto
90a7451da4 chore(model gallery): add allura-org_bigger-body-70b (#5021)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-15 14:43:51 +01:00
LocalAI [bot]
529a4b9ee8 chore: ⬆️ Update ggml-org/llama.cpp to 9f2250ba722738ec0e6ab684636268a79160c854 (#5019)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-14 21:45:54 +00:00
Ettore Di Giacinto
0567e104eb chore(model gallery): add eurollm-9b-instruct (#5017)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-14 09:25:44 +01:00
Ettore Di Giacinto
ecbeacd022 chore(model gallery): add prithivmlmods_viper-coder-32b-elite13 (#5016)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-14 09:20:27 +01:00
Ettore Di Giacinto
2772960e41 chore(model gallery): add nousresearch_deephermes-3-llama-3-3b-preview (#5015)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-14 09:16:17 +01:00
Ettore Di Giacinto
1b694191e2 chore(model gallery): add nousresearch_deephermes-3-mistral-24b-preview (#5014)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-14 09:13:27 +01:00
Ettore Di Giacinto
69578a5f8f chore(model gallery): add models/qgallouedec_gemma-3-27b-it-codeforces-sft (#5013)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-14 09:11:13 +01:00
LocalAI [bot]
7d96cfe72b chore: ⬆️ Update ggml-org/llama.cpp to 84d547554123a62e9ac77107cb20e4f6cc503af4 (#5011)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-13 22:30:17 +00:00
Ettore Di Giacinto
423514a5a5 fix(clip): do not imply GPU offload by default (#5010)
* fix(clip): do not imply GPUs by default

Until a better solution is found upstream, be conservative and default
to CPU.

https://github.com/ggml-org/llama.cpp/pull/12322
https://github.com/ggml-org/llama.cpp/pull/12322#issuecomment-2720970695

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* allow overriding gpu via backend options

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-13 15:14:11 +01:00
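The second part of this fix lets users opt back into GPU offload for clip through the model's backend options. A minimal sketch of a model YAML, assuming the option key is literally `gpu` (the exact key is an assumption, not confirmed by the commit message):

```yaml
name: my-multimodal-model
backend: llama-cpp
parameters:
  model: my-model.gguf
options:
# hypothetical option key: force clip/mmproj offload to the GPU
- gpu
```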
Ettore Di Giacinto
12568c7d6d chore(model gallery): add gemma-3-1b-it (#5009)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-13 09:48:40 +01:00
Ettore Di Giacinto
8d16a0a536 chore(model gallery): add gemma-3-4b-it (#5008)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-13 09:47:01 +01:00
Ettore Di Giacinto
87ca801f00 chore(model gallery): add gemma-3-12b-it (#5007)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-13 09:44:49 +01:00
Ettore Di Giacinto
e4ecbb6c30 chore(model gallery): add gemma-3-27b-it (#5003)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-13 08:28:28 +01:00
LocalAI [bot]
b1a67de2b9 chore: ⬆️ Update ggml-org/llama.cpp to f08f4b3187b691bb08a8884ed39ebaa94e956707 (#5006)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-13 01:01:30 +00:00
LocalAI [bot]
71a23910fe chore: ⬆️ Update ggml-org/llama.cpp to 80a02aa8588ef167d616f76f1781b104c245ace0 (#5004)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-12 16:26:09 +00:00
LocalAI [bot]
0ede31f9cf chore: ⬆️ Update ggml-org/llama.cpp to 10f2e81809bbb69ecfe64fc8b4686285f84b0c07 (#4996)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-12 14:13:04 +00:00
Ettore Di Giacinto
9f5dcf2d1e feat(aio): update AIO image defaults (#5002)
* feat(aio): update AIO image defaults

cpu:
 - text-to-text: llama3.1
 - embeddings: granite-embeddings
 - vision: moondream2

gpu/intel:
 - text-to-text: localai-functioncall-qwen2.5-7b-v0.5
 - embeddings: granite-embeddings
 - vision: minicpm

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(aio): use minicpm as moondream2 stopped working

https://github.com/ggml-org/llama.cpp/pull/12322#issuecomment-2717483759

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-12 12:55:06 +01:00
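For reference, the AIO images ship with those defaults preconfigured, so a single container serves text, embeddings, and vision out of the box. A minimal docker-compose sketch for the CPU image (the tag follows the project's published AIO naming and is an assumption here):

```yaml
services:
  localai:
    # assumed published tag for the CPU all-in-one image
    image: localai/localai:latest-aio-cpu
    ports:
    - "8080:8080"
```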
Ettore Di Giacinto
e878556e98 chore(model gallery): add trashpanda-org_qwq-32b-snowdrop-v0 (#5000)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-12 08:26:09 +01:00
Ettore Di Giacinto
b096928172 chore(model gallery): add open-r1_olympiccoder-7b (#4999)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-12 08:24:35 +01:00
Ettore Di Giacinto
db7442ae67 chore(model gallery): add open-r1_olympiccoder-32b (#4998)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-12 08:23:01 +01:00
Ettore Di Giacinto
b6cd430e08 chore(model gallery): add thedrummer_gemmasutra-small-4b-v1 (#4997)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-12 08:19:51 +01:00
LocalAI [bot]
478e50cda2 chore: ⬆️ Update ggml-org/llama.cpp to 2c9f833d17bb5b8ea89dec663b072b5420fc5438 (#4991)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-03-11 11:19:03 +00:00
Ettore Di Giacinto
1db2b9943c chore(deps): Bump grpcio to 1.71.0 (#4993)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-11 09:44:21 +01:00
Ettore Di Giacinto
ac41aa8b67 chore(model gallery): add openpipe_deductive-reasoning-qwen-32b (#4995)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-11 09:44:07 +01:00
Ettore Di Giacinto
156a98e2e7 chore(model gallery): add openpipe_deductive-reasoning-qwen-14b (#4994)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-11 09:40:38 +01:00
dependabot[bot]
d88ec1209e chore(deps): Bump docs/themes/hugo-theme-relearn from 4a4b60e to 9a020e7 (#4988)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `4a4b60e` to `9a020e7`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](4a4b60ef04...9a020e7ead)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-03-11 09:39:04 +01:00
dependabot[bot]
fde8dbfc80 chore(deps): Bump appleboy/ssh-action from 1.2.1 to 1.2.2 (#4978)
Bumps [appleboy/ssh-action](https://github.com/appleboy/ssh-action) from 1.2.1 to 1.2.2.
- [Release notes](https://github.com/appleboy/ssh-action/releases)
- [Changelog](https://github.com/appleboy/ssh-action/blob/master/.goreleaser.yaml)
- [Commits](https://github.com/appleboy/ssh-action/compare/v1.2.1...v1.2.2)

---
updated-dependencies:
- dependency-name: appleboy/ssh-action
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-03-11 08:29:31 +01:00
Ettore Di Giacinto
879dc73eba Revert "chore(deps): Bump intel-extension-for-pytorch from 2.3.110+xpu to 2.6.10+xpu in /backend/python/diffusers" (#4992)
Revert "chore(deps): Bump intel-extension-for-pytorch from 2.3.110+xpu to 2.6…"

This reverts commit 1dfc52de16.
2025-03-11 08:29:05 +01:00
dependabot[bot]
1dfc52de16 chore(deps): Bump intel-extension-for-pytorch from 2.3.110+xpu to 2.6.10+xpu in /backend/python/diffusers (#4973)
chore(deps): Bump intel-extension-for-pytorch

Bumps intel-extension-for-pytorch from 2.3.110+xpu to 2.6.10+xpu.

---
updated-dependencies:
- dependency-name: intel-extension-for-pytorch
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-03-10 21:14:43 +00:00
Ettore Di Giacinto
1331129485 fix(routes): do not gate generated artifacts via key (#4971)
fix(routes): do not gate generated images via key

We generate unique URIs for images.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-10 15:58:25 +01:00
Ettore Di Giacinto
1cd98062e5 chore(model gallery): add hyperllama3.1-v2-i1 (#4970)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-10 10:52:36 +01:00
Ettore Di Giacinto
9791d9b77a chore(model gallery): add opencrystal-l3-15b-v2.1-i1 (#4969)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-10 10:50:02 +01:00
Ettore Di Giacinto
8956452a45 chore(model gallery): add llmevollama-3.1-8b-v0.1-i1 (#4968)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-10 10:46:49 +01:00
LocalAI [bot]
f3659fa49c chore: ⬆️ Update ggml-org/llama.cpp to 1e2f78a00450593e2dfa458796fcdd9987300dfc (#4966)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-09 21:44:52 +00:00
Ettore Di Giacinto
585f2be793 chore(model gallery): add tower-babel_babel-9b-chat (#4964)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-09 12:09:56 +01:00
LocalAI [bot]
d13f160222 chore: ⬆️ Update ggml-org/llama.cpp to 0fd7ca7a210bd4abc995cd728491043491dbdef7 (#4963)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-08 21:41:26 +00:00
Ettore Di Giacinto
db5495b9d7 chore(model gallery): add goppa-ai_goppa-logillama (#4962)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-08 11:37:40 +01:00
Ettore Di Giacinto
3def1ae232 chore(model gallery): add huihui-ai_qwq-32b-abliterated (#4961)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-08 11:34:37 +01:00
Ettore Di Giacinto
c6ebead8e5 chore(model gallery): add steelskull_l3.3-electra-r1-70b (#4960)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-08 11:23:42 +01:00
LocalAI [bot]
cff4a950e0 chore: ⬆️ Update ggml-org/llama.cpp to 7ab364390f92b0b8d83f69821a536b424838f3f8 (#4959)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-07 22:54:28 +00:00
Ettore Di Giacinto
e4fa894153 fix(llama.cpp): correctly handle embeddings in batches (#4957)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-07 19:29:52 +01:00
Ettore Di Giacinto
69caccfa82 chore(model gallery): add granite embeddings models (#4956)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-06 23:17:40 +01:00
Ettore Di Giacinto
ab50c13160 chore(model gallery): add nomic-embed-text-v1.5 (#4955)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-06 23:10:39 +01:00
LocalAI [bot]
56d4e82b14 chore: ⬆️ Update ggml-org/llama.cpp to 3d652bfddfba09022525067e672c3c145c074649 (#4954)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-06 21:54:14 +00:00
Ettore Di Giacinto
09b5bd48bc chore(model gallery): add rombo-org_rombo-llm-v3.1-qwq-32b (#4953)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-06 10:42:45 +01:00
Ettore Di Giacinto
957dcfb6a9 chore(model gallery): add qwen_qwq-32b (#4952)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-06 10:28:03 +01:00
Ettore Di Giacinto
67f7bffd18 chore(deps): update llama.cpp and sync with upstream changes (#4950)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-06 00:40:58 +01:00
Ettore Di Giacinto
de81b42b49 feat(ui): remove api key handling and small ui adjustments (#4948)
* chore(ui): drop set api key button

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(ui): show in-progress installs in model view

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ui): improve text to image view

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-05 19:37:36 +01:00
Ettore Di Giacinto
06eb7e9fa7 chore(model gallery): add llama-3.3-magicalgirl-2.5-i1 (#4946)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-05 09:35:48 +01:00
Ettore Di Giacinto
45bc1ac566 chore(model gallery): add lolzinventor_meta-llama-3.1-8b-survivev3 (#4945)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-05 09:33:41 +01:00
Ettore Di Giacinto
02aafeff75 chore(model gallery): add llama-3.1-8b-instruct-uncensored-delmat-i1 (#4944)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-05 09:31:09 +01:00
Ettore Di Giacinto
6b46c52789 feat(ui): complete design overhaul (#4942)
This PR entirely changes the UI look and feel. It updates all
sections and also makes the UI mobile-ready.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-05 08:27:03 +01:00
LocalAI [bot]
d732e261a4 chore: ⬆️ Update ggml-org/llama.cpp to 5bbe6a9fe9a8796a9389c85accec89dbc4d91e39 (#4943)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-04 21:46:40 +00:00
Ettore Di Giacinto
807c574e91 chore(model gallery): add azura-qwen2.5-32b-i1 (#4941)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-04 10:33:15 +01:00
Ettore Di Giacinto
bb171a39b3 chore(model gallery): add llama-3.3-magicalgirl-2 (#4940)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-04 10:07:38 +01:00
Ettore Di Giacinto
941a4fc50e chore(model gallery): add boomer_qwen_72b-i1 (#4939)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-04 10:01:23 +01:00
Ettore Di Giacinto
afe65bd7bf chore(model gallery): add l3.3-geneticlemonade-unleashed-70b-i1 (#4938)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-04 09:56:31 +01:00
Ettore Di Giacinto
6f9762049c chore(model gallery): update qihoo360_tinyr1-32b-preview (#4937)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-04 09:53:23 +01:00
LocalAI [bot]
122970d70d chore: ⬆️ Update ggml-org/llama.cpp to dfd6b2c0be191b3abe2fd9c1b25deff01c6249d8 (#4936)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-03 21:50:31 +00:00
dependabot[bot]
8664b1c7a2 chore(deps): Bump docs/themes/hugo-theme-relearn from 02bba0f to 4a4b60e (#4934)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `02bba0f` to `4a4b60e`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](02bba0f199...4a4b60ef04)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-03-03 19:56:41 +00:00
Ettore Di Giacinto
c92166f38a chore(model gallery): add steelskull_l3.3-mokume-gane-r1-70b-v1.1 (#4933)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-03 09:22:22 +01:00
LocalAI [bot]
d616058b12 chore: ⬆️ Update ggml-org/llama.cpp to 14dec0c2f29ae56917907dbf2eed6b19438d0a0e (#4932)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-02 22:27:01 +00:00
Ettore Di Giacinto
a7b4001b75 feat: allow specifying a reply prefix (#4931)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-02 16:07:32 +01:00
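A minimal sketch of how the reply prefix might be configured in a model YAML; the `reply_prefix` field name is inferred from the feature name and should be treated as an assumption:

```yaml
name: my-model
parameters:
  model: my-model.gguf
# hypothetical field: text prepended to every generated reply
reply_prefix: "<think>"
```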
Ettore Di Giacinto
ff85f01459 chore(model gallery): add thedrummer_fallen-llama-3.3-r1-70b-v1 (#4930)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-02 10:29:28 +01:00
Ettore Di Giacinto
695f81a08b chore(model gallery): add qihoo360_tinyr1-32b-preview (#4929)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-02 10:24:17 +01:00
Ettore Di Giacinto
326be287da chore(model gallery): add ibm-granite_granite-3.2-2b-instruct (#4928)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-02 10:22:35 +01:00
Ettore Di Giacinto
0404d98190 chore(model gallery): add ibm-granite_granite-3.2-8b-instruct (#4927)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-02 10:19:27 +01:00
LocalAI [bot]
0a8ec1eb22 chore: ⬆️ Update ggml-org/llama.cpp to 1782cdfed60952f9ff333fc2ab5245f2be702453 (#4926)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-03-02 10:02:49 +01:00
Ettore Di Giacinto
d860932dcd fix(chatml): add endoftext stopword
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-01 21:16:10 +01:00
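The fix adds the `<|endoftext|>` token as a stopword for ChatML-style templates. For a manually configured model the equivalent looks roughly like this (a sketch; `stopwords` is the standard model-config field, and the tokens follow the ChatML convention):

```yaml
name: my-chatml-model
parameters:
  model: my-model.gguf
stopwords:
# stop generation at the ChatML end-of-turn and end-of-text markers
- "<|im_end|>"
- "<|endoftext|>"
```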
Ettore Di Giacinto
1cb137bd2d fix(deephermes): correct typo
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-03-01 17:07:12 +01:00
Ettore Di Giacinto
3c279e5568 chore(model gallery): add allenai_olmocr-7b-0225-preview (#4924)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-01 09:10:04 +01:00
Ettore Di Giacinto
fb55e3df57 chore(model gallery): add ozone-research_0x-lite (#4923)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-01 09:07:01 +01:00
Ettore Di Giacinto
de46fb6e2e chore(model gallery): add ozone-research_chirp-01 (#4922)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-01 09:05:03 +01:00
Ettore Di Giacinto
d7a0e3c5ea chore(model gallery): add microsoft_phi-4-mini-instruct (#4921)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-01 08:58:01 +01:00
LocalAI [bot]
0533ea817d chore: ⬆️ Update ggml-org/llama.cpp to 06c2b1561d8b882bc018554591f8c35eb04ad30e (#4920)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-28 22:18:14 +00:00
Ettore Di Giacinto
755e4fb5f4 feat(ui): improvements to index and models page (#4918)
- mobile-friendly index
- adjust color palette
- improve search experience

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-28 19:23:32 +01:00
LocalAI [bot]
e4fdde158f chore: ⬆️ Update ggml-org/llama.cpp to b95c8af37ccf169b0a3216b7ed691af0534e5091 (#4916)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-28 00:00:39 +00:00
Ettore Di Giacinto
6d0712fa6d fix(ui): not all models come from the gallery (#4915)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-27 19:12:41 +01:00
Ettore Di Giacinto
bbbb28e3ca fix(models): unify use-case identification (#4914)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-27 15:51:12 +01:00
Ettore Di Giacinto
3bf2e9d065 fix(ui): not all models have an Icon (#4913)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-27 10:52:19 +01:00
Ettore Di Giacinto
1461fd8777 chore(model gallery): add locutusque_thespis-llama-3.1-8b (#4912)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-27 10:02:44 +01:00
LocalAI [bot]
054860539a chore: ⬆️ Update ggml-org/llama.cpp to a800ae46da2ed7dac236aa6bf2b595da6b6294b5 (#4911)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-26 22:43:49 +00:00
Ettore Di Giacinto
c87870b18e feat(ui): improve chat interface (#4910)
* feat(ui): show more information in the chat view, with minor adjustments to the model gallery

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(ui): UI improvements

Visual improvements and bugfixes including:
- disable pagination during search
- fix scrolling on new message

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-26 18:27:18 +01:00
Ettore Di Giacinto
5ad2be9c45 feat(ui): small improvements to chat interface (#4907)
- Change chat colors
- Improve layout on small windows

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-26 11:10:40 +01:00
LocalAI [bot]
61a24746a1 chore: ⬆️ Update ggml-org/llama.cpp to d7cfe1ffe0f435d0048a6058d529daf76e072d9c (#4908)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-25 21:58:37 +00:00
Ettore Di Giacinto
d557eb9361 chore(model gallery): add latitudegames_wayfarer-large-70b-llama-3.3 (#4903)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-25 10:21:54 +01:00
Ettore Di Giacinto
a9a1a361a9 chore(model gallery): add perplexity-ai_r1-1776-distill-llama-70b (#4902)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-25 09:59:21 +01:00
Ettore Di Giacinto
12d070af80 chore(model gallery): add sicariussicariistuff_phi-line_14b (#4901)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-25 09:56:44 +01:00
LocalAI [bot]
8d40557bc8 chore: ⬆️ Update ggml-org/llama.cpp to 7a2c913e66353362d7f28d612fd3c9d51a831eda (#4899)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-25 09:51:02 +01:00
dependabot[bot]
5a5f3a899a chore(deps): Bump docs/themes/hugo-theme-relearn from 66bc366 to 02bba0f (#4898)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `66bc366` to `02bba0f`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](66bc366c47...02bba0f199)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-02-25 09:50:46 +01:00
dependabot[bot]
a2d1f133c8 chore(deps): Bump appleboy/ssh-action from 1.2.0 to 1.2.1 (#4896)
Bumps [appleboy/ssh-action](https://github.com/appleboy/ssh-action) from 1.2.0 to 1.2.1.
- [Release notes](https://github.com/appleboy/ssh-action/releases)
- [Changelog](https://github.com/appleboy/ssh-action/blob/master/.goreleaser.yaml)
- [Commits](https://github.com/appleboy/ssh-action/compare/v1.2.0...v1.2.1)

---
updated-dependencies:
- dependency-name: appleboy/ssh-action
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-02-24 21:01:39 +00:00
LocalAI [bot]
0ae6420c31 chore: ⬆️ Update ggml-org/llama.cpp to 7ad0779f5de84a68143b2c00ab5dc94a948925d3 (#4890)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-24 11:26:09 +01:00
Ettore Di Giacinto
3a3e05cf18 chore(model gallery): add flux.1dev-abliteratedv2 (#4895)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-24 10:11:32 +01:00
Ettore Di Giacinto
6a20388e25 chore(model gallery): add nohobby_l3.3-prikol-70b-extra (#4894)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-24 09:43:50 +01:00
Ettore Di Giacinto
06c836a937 chore(model gallery): add steelskull_l3.3-san-mai-r1-70b (#4893)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-24 09:41:06 +01:00
Ettore Di Giacinto
049a13fe78 chore(model gallery): add steelskull_l3.3-cu-mai-r1-70b (#4892)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-24 09:39:12 +01:00
Ettore Di Giacinto
30bf6c962f chore(stable-diffusion-ggml): update and adapt upstream changes (#4889)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-23 08:36:41 +01:00
LocalAI [bot]
a72b3a23c3 chore: ⬆️ Update ggml-org/llama.cpp to a28e0d5eb18c18e6a4598286158f427269b1444e (#4887)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-23 08:36:25 +01:00
Ettore Di Giacinto
e9971b168a feat(ui): paginate model gallery (#4886)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-22 21:38:00 +01:00
Ettore Di Giacinto
5b59b5e0c1 chore(model gallery): add steelskull_l3.3-mokume-gane-r1-70b (#4885)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-22 18:58:06 +01:00
Ettore Di Giacinto
8cfd712428 chore(model gallery): add arcee-ai_arcee-maestro-7b-preview (#4884)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-22 11:32:25 +01:00
Ettore Di Giacinto
21f7faa80d chore(model gallery): add ozone-ai_reverb-7b (#4883)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-22 11:28:27 +01:00
Ettore Di Giacinto
a6a0121118 chore(model gallery): add rombo-org_rombo-llm-v3.0-qwen-72b (#4882)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-22 11:19:04 +01:00
LocalAI [bot]
ba66aa33c5 chore: ⬆️ Update ggml-org/llama.cpp to 51f311e057723b7454d0ebe20f545a1a2c4db6b2 (#4881)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-21 21:51:02 +00:00
Ettore Di Giacinto
8fc024a770 chore(model gallery): add pocketdoc_dans-personalityengine-v1.2.0-24b (#4880)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-21 10:00:23 +01:00
Ettore Di Giacinto
52aa9d08aa chore(model gallery): add l3.1-8b-rp-ink (#4879)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-21 09:56:57 +01:00
Ettore Di Giacinto
4c9379c39e chore(model gallery): add smirki_uigen-t1.1-qwen-7b (#4878)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-21 09:54:42 +01:00
Ettore Di Giacinto
0ff2c39364 chore(model gallery): add smirki_uigen-t1.1-qwen-14b (#4877)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-21 09:52:20 +01:00
LocalAI [bot]
1af7e5dc49 chore: ⬆️ Update ggml-org/llama.cpp to c392e5094deaf2d1a7c18683214f007fad3fe42b (#4876)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-20 22:03:52 +00:00
Ettore Di Giacinto
af3bb64e42 fix(coqui): pin transformers (#4875)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-20 16:16:54 +01:00
Ettore Di Giacinto
77281f836e chore(model gallery): add internlm_oreal-7b (#4874)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-20 15:57:21 +01:00
Ettore Di Giacinto
550275811d chore(model gallery): add internlm_oreal-deepseek-r1-distill-qwen-7b (#4873)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-20 15:55:13 +01:00
Ettore Di Giacinto
c27ce6c54d chore(model gallery): add internlm_oreal-32b (#4872)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-20 15:52:28 +01:00
Ettore Di Giacinto
ac4991b069 chore(docs): update sponsor logo
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-20 15:31:41 +01:00
Ettore Di Giacinto
25bee71bb8 feat(ui): also filter tts and image models (#4871)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-20 15:02:18 +01:00
LocalAI [bot]
b993780a3b chore: ⬆️ Update ggml-org/llama.cpp to d04e7163c85a847bc61d58c22f2c503596db7aa8 (#4870)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-20 09:42:57 +01:00
Ettore Di Giacinto
ea0c9f1168 feat(ui): show only text models in the chat interface (#4869)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-19 17:34:30 +01:00
Ettore Di Giacinto
08311f275a chore(model gallery): add sentientagi_dobby-unhinged-llama-3.3-70b (#4868)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-19 10:36:36 +01:00
Ettore Di Giacinto
4de0f2f737 chore(model gallery): add open-r1_openr1-qwen-7b (#4867)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-19 10:04:01 +01:00
Ettore Di Giacinto
42ae807c41 chore(model gallery): add pygmalionai_pygmalion-3-12b (#4866)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-19 10:02:35 +01:00
LocalAI [bot]
94593ba4c3 chore: ⬆️ Update ggml-org/llama.cpp to 63e489c025d61c7ca5ec06c5d10f36e2b76aaa1d (#4865)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-19 09:19:28 +01:00
Brandon Beiler
6a6e1a0ea9 feat(vllm): Additional vLLM config options (Disable logging, dtype, and Per-Prompt media limits) (#4855)
* Adding the following vLLM config options: disable_log_status, dtype, limit_mm_per_prompt

Signed-off-by: TheDropZone <brandonbeiler@gmail.com>

* using " marks in the config.yaml file

Signed-off-by: TheDropZone <brandonbeiler@gmail.com>

* adding in a missing colon

Signed-off-by: TheDropZone <brandonbeiler@gmail.com>

---------

Signed-off-by: TheDropZone <brandonbeiler@gmail.com>
2025-02-18 19:27:58 +01:00
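A minimal sketch of a vLLM model YAML using the three new options; the field names mirror the commit message, and their exact placement in the config is an assumption:

```yaml
name: my-vllm-model
backend: vllm
parameters:
  # hypothetical Hugging Face model id
  model: Qwen/Qwen2.5-7B-Instruct
disable_log_status: true   # silence vLLM's periodic status logging
dtype: bfloat16            # computation dtype passed to vLLM
limit_mm_per_prompt:       # per-prompt media limits
  image: 1
```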
Ettore Di Giacinto
5b19af99ff feat(ui): detect model usage and display link (#4864)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-18 19:27:07 +01:00
Ettore Di Giacinto
28fb8e607a chore(model gallery): add nbeerbower_dumpling-qwen2.5-72b (#4862)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-18 12:44:59 +01:00
Ettore Di Giacinto
bb85b6ef00 feat: improve ui models list in the index (#4863)
* feat(ui): improve index

- Redirect to the chat view when clicking on a model

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Display a chat icon near the model

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-18 12:44:44 +01:00
Ettore Di Giacinto
b9b5a635ca chore(model gallery): add nbeerbower_dumpling-qwen2.5-32b-v2 (#4861)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-18 11:53:23 +01:00
Ettore Di Giacinto
131ea5b627 chore(model gallery): add nbeerbower_dumpling-qwen2.5-14b (#4860)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-18 11:51:29 +01:00
Ettore Di Giacinto
fac70e9642 chore(model gallery): add allenai_llama-3.1-tulu-3.1-8b (#4859)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-18 11:49:26 +01:00
Ettore Di Giacinto
7e76ea40fb chore(model gallery): add kubeguru-llama3.2-3b-v0.1 (#4858)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-18 11:47:00 +01:00
LocalAI [bot]
de09ae42ef chore: ⬆️ Update ggml-org/llama.cpp to 73e2ed3ce3492d3ed70193dd09ae8aa44779651d (#4854)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-18 09:11:07 +01:00
Ettore Di Giacinto
6424f0666d chore(deps): Bump edgevpn to v0.30.1 (#4840)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-17 16:51:22 +01:00
Ettore Di Giacinto
f3ae94ca70 chore: update Image generation docs and examples (#4841)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-17 16:51:06 +01:00
LocalAI [bot]
09c9f67a02 chore: ⬆️ Update ggml-org/llama.cpp to 2eea03d86a2d132c8245468c26290ce07a27a8e8 (#4839)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-17 10:55:30 +01:00
Ettore Di Giacinto
c264ca542d fix(ci): update repository for llama.cpp
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-02-17 09:33:34 +01:00
Bas Hulsken
bbf30d416d fix: change initialization order of llama-cpp-avx512 to go before avx2 variant (#4837)
changed the initialization order of the avx512 version of llama.cpp, which is now tried before the avx2 variant

Signed-off-by: Bas Hulsken <bhulsken@hotmail.com>
2025-02-17 09:32:21 +01:00
Ettore Di Giacinto
27617a1b06 chore(model gallery): add ozone-ai_0x-lite (#4835)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-16 09:23:26 +01:00
Ettore Di Giacinto
e84081769e chore(ci): cleanup before pulling images again 2025-02-16 09:20:22 +01:00
LocalAI [bot]
20119fc580 docs: ⬆️ update docs version mudler/LocalAI (#4834)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-15 22:45:11 +00:00
Ettore Di Giacinto
09941c0bfb chore(docs): update license year
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-15 18:17:15 +01:00
Ettore Di Giacinto
cabe0f4993 chore(model gallery): add davidbrowne17_llamathink-8b-instruct (#4833)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-15 17:31:46 +01:00
Ettore Di Giacinto
1977c7f190 chore(model gallery): add pygmalionai_eleusis-12b (#4832)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-15 17:21:30 +01:00
Ettore Di Giacinto
061e7c4eae chore(model gallery): add rombo-org_rombo-llm-v3.0-qwen-32b (#4830)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-15 10:58:27 +01:00
LocalAI [bot]
5313e660f6 chore: ⬆️ Update ggerganov/llama.cpp to 300907b2110cc17b4337334dc397e05de2d8f5e0 (#4829)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-14 21:51:49 +00:00
Ettore Di Giacinto
9e32fda304 fix(llama.cpp): improve context shift handling (#4820)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-14 14:55:03 +01:00
Ettore Di Giacinto
83202cae54 chore(model gallery): add nousresearch_deephermes-3-llama-3-8b-preview (#4828)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-14 12:25:00 +01:00
Ettore Di Giacinto
d96addfa9d chore(model gallery): add open-thoughts_openthinker-32b (#4827)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-14 12:03:46 +01:00
Ettore Di Giacinto
a715fe588d chore(model gallery): add sicariussicariistuff_phi-lthy4 (#4826)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-14 11:57:14 +01:00
LocalAI [bot]
2ac4a86bb4 chore: ⬆️ Update ggerganov/llama.cpp to 8a8c4ceb6050bd9392609114ca56ae6d26f5b8f5 (#4825)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-13 21:49:57 +00:00
Ettore Di Giacinto
8670d480a6 chore(model gallery): add nvidia_aceinstruct-72b (#4822)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-13 09:58:34 +01:00
Ettore Di Giacinto
af0b4ff237 chore(ci): update labels
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-02-13 09:58:19 +01:00
Ettore Di Giacinto
e694764065 chore(model gallery): add nvidia_aceinstruct-7b (#4821)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-13 09:44:53 +01:00
Ettore Di Giacinto
f3c27e0381 chore(model gallery): add nvidia_aceinstruct-1.5b (#4819)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-13 09:33:40 +01:00
LocalAI [bot]
bf44319d0d chore: ⬆️ Update ggerganov/llama.cpp to 0fb77f821f6e70ad8b8247a97d1022f0fef78991 (#4814)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-12 22:41:53 +00:00
Ettore Di Giacinto
5b133a640b chore(model gallery): add theskullery_l3.3-exp-unnamed-model-70b-v0.5 (#4813)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-12 11:05:51 +01:00
Ettore Di Giacinto
0030a3fe75 chore(model gallery): add simplescaling_s1.1-32b (#4812)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-12 11:03:05 +01:00
Ettore Di Giacinto
0a748b009e chore(ci): avoid cache hits until the ci gRPC job is fixed
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-12 09:11:40 +01:00
LocalAI [bot]
257e951def chore: ⬆️ Update ggerganov/llama.cpp to 90e4dba461b07e635fd1daf3b491c978c7dd0013 (#4810)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-12 00:13:28 +01:00
LocalAI [bot]
fbd82a2dd0 feat(swagger): update swagger (#4809)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-11 21:54:40 +00:00
Ettore Di Giacinto
5db321dad2 chore(ci): do not always regenerate the cache
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-11 16:36:00 +01:00
Ettore Di Giacinto
f5638a6354 feat(diffusers): allow to override image gen options (#4807)
Use the options field in the model to override kwargs if needed.

This allows specifying, from the model YAML config:

```yaml

options:
- foo:bar

```

Each option will then be passed directly when calling the diffusers pipeline, e.g.:

```python
pipe(
  foo="bar",
)
```

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-11 10:16:32 +01:00
Ettore Di Giacinto
5f64cc6328 Revert "chore(deps): Bump docs/themes/lotusdocs from f5785a2 to 975da91" (#4808)
Revert "chore(deps): Bump docs/themes/lotusdocs from `f5785a2` to `975da91` (…"

This reverts commit e57b750ca3.
2025-02-11 10:05:57 +01:00
Ettore Di Giacinto
28b10e8804 chore(swagger): update (#4805)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-11 09:51:01 +01:00
Ettore Di Giacinto
3277f5095d chore(model gallery): add agentica-org_deepscaler-1.5b-preview (#4804)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-11 09:47:19 +01:00
Ettore Di Giacinto
fe3ced2919 chore(ci): try again to bump parallelism in grpc jobs
As we moved these out to self-hosted

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-11 09:31:00 +01:00
LocalAI [bot]
45e37a07bb chore: ⬆️ Update ggerganov/llama.cpp to 19b392d58dc08c366d0b29bd3b9c6991fa4e1662 (#4803)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-11 09:17:02 +01:00
dependabot[bot]
e57b750ca3 chore(deps): Bump docs/themes/lotusdocs from f5785a2 to 975da91 (#4801)
Bumps [docs/themes/lotusdocs](https://github.com/colinwilson/lotusdocs) from `f5785a2` to `975da91`.
- [Release notes](https://github.com/colinwilson/lotusdocs/releases)
- [Commits](f5785a2399...975da91e83)

---
updated-dependencies:
- dependency-name: docs/themes/lotusdocs
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-02-10 22:27:14 +00:00
Ettore Di Giacinto
49df492268 chore(ci): run grpc build on self-hosted
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-10 19:44:50 +01:00
Ettore Di Giacinto
516cd660f1 chore(grpcio): reduce parallelism (#4799)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-10 18:56:13 +01:00
Ettore Di Giacinto
8fd3ace9a1 chore(grpcio): bump to 1.70 (#4798)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-10 18:38:53 +01:00
Ettore Di Giacinto
099469cb05 chore(tests): decrease parallelism for gRPC builds (#4797)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-10 12:59:59 +01:00
Ettore Di Giacinto
6be8c0c618 chore(model gallery): add localai-functioncall-qwen2.5-7b-v0.5 (#4796)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-10 12:07:35 +01:00
Dave
3cddf24747 feat: Centralized Request Processing middleware (#3847)
* squash past, centralize request middleware PR

Signed-off-by: Dave Lee <dave@gray101.com>

* migrate bruno request files to examples repo

Signed-off-by: Dave Lee <dave@gray101.com>

* fix

Signed-off-by: Dave Lee <dave@gray101.com>

* Update tests/e2e-aio/e2e_test.go

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

---------

Signed-off-by: Dave Lee <dave@gray101.com>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-02-10 12:06:16 +01:00
Ettore Di Giacinto
c330360785 chore(model gallery): add ilsp_llama-krikri-8b-instruct (#4795)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-10 09:54:54 +01:00
LocalAI [bot]
8cd51570e5 chore: ⬆️ Update ggerganov/llama.cpp to 19d3c8293b1f61acbe2dab1d49a17950fd788a4a (#4793)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-09 22:12:01 +00:00
Ettore Di Giacinto
0e7aa5cd15 chore(model gallery): add subtleone_qwen2.5-32b-erudite-writer (#4792)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-09 10:59:46 +01:00
Ettore Di Giacinto
e06a5f49de chore(model gallery): add huihui-ai_deepseek-r1-distill-llama-70b-abliterated (#4790)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-09 10:53:48 +01:00
Dave
fb2f847507 chore: migrate bruno request files to examples repo (#4788)
migrate bruno request files to examples repo

Signed-off-by: Dave Lee <dave@gray101.com>
2025-02-09 10:52:28 +01:00
LocalAI [bot]
e01acc88c9 chore: ⬆️ Update ggerganov/llama.cpp to e6e658319952f7ad269dc11275b9edddc721fc6d (#4787)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-08 21:57:40 +00:00
LocalAI [bot]
7a5912908a chore: ⬆️ Update ggerganov/llama.cpp to d2fe216fb2fb7ca8627618c9ea3a2e7886325780 (#4780)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-08 09:44:34 +01:00
Ettore Di Giacinto
4b1b942a7f chore(model gallery): add sicariussicariistuff_redemption_wind_24b (#4781)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-08 09:04:18 +01:00
Ettore Di Giacinto
230fe0098f chore(model gallery): add cognitivecomputations_dolphin3.0-mistral-24b (#4779)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-07 13:33:24 +01:00
Ettore Di Giacinto
cc163429dc chore(model gallery): add cognitivecomputations_dolphin3.0-r1-mistral-24b (#4778)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-07 13:31:49 +01:00
Ettore Di Giacinto
f670e0a91c chore(model gallery): add nohobby_l3.3-prikol-70b-v0.5 (#4777)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-07 13:29:53 +01:00
LocalAI [bot]
731674eee7 chore: ⬆️ Update ggerganov/llama.cpp to 8a59053f63fffc24e730cd3ea067760abfe4a919 (#4776)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-06 22:02:00 +00:00
Ettore Di Giacinto
cc1f6f913f fix(llama.cpp): disable mirostat as default (#2911)
Even though it can increase output quality, it has shown performance drawbacks noticeable enough to confuse users about the speed of LocalAI (see also https://github.com/mudler/LocalAI/issues/2780).

This changeset disables Mirostat by default (it can still be enabled manually).

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Dave <dave@gray101.com>
2025-02-06 19:39:59 +01:00
Ettore Di Giacinto
7f90ff7aec chore(llama-ggml): drop deprecated backend (#4775)
The GGML format is now dead: the next version of LocalAI already brings many breaking compatibility changes, so we take the occasion to also drop ggml (pre-gguf) support.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-06 18:36:23 +01:00
Ettore Di Giacinto
8d45670e41 fix(openai): consistently return stop reason (#4771)
We were not returning a stop reason when no tool was actually called (even if tools were specified).

Fixes: https://github.com/mudler/LocalAI/issues/4716

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-06 12:41:08 +01:00
Ettore Di Giacinto
e4b8ddb6a1 chore(model gallery): add black-ink-guild_pernicious_prophecy_70b (#4774)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-06 12:03:59 +01:00
Ettore Di Giacinto
a801561f81 chore(model gallery): add tiger-lab_qwen2.5-32b-instruct-cft (#4773)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-06 12:01:56 +01:00
Ettore Di Giacinto
16ced07102 chore(model gallery): add arliai_llama-3.3-70b-arliai-rpmax-v1.4 (#4772)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-06 11:59:14 +01:00
LocalAI [bot]
d35595372d chore: ⬆️ Update ggerganov/llama.cpp to d774ab3acc4fee41fbed6dbfc192b57d5f79f34b (#4770)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-06 09:02:51 +01:00
LocalAI [bot]
81be192279 chore: ⬆️ Update leejet/stable-diffusion.cpp to d46ed5e184b97c2018dc2e8105925bdb8775e02c (#4769)
⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-05 23:49:15 +00:00
Ettore Di Giacinto
28a1310890 chore(docs): enhance visibility
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 19:50:32 +01:00
Ettore Di Giacinto
2a702e9ca4 chore(docs): small updates
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 19:49:11 +01:00
Ettore Di Giacinto
3ecaea1b6e chore(docs): update sponsors in the website
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 19:41:55 +01:00
Ettore Di Giacinto
7daf5ac3e3 fix(gallery): do not return overrides and additional config (#4768)
When hitting /models/available we are interested in the model description, name, and a small amount of metadata. Configuration and overrides are internals that are required only for installation.

This also fixes a current bug where hitting /models/available fails if one of the gallery items has overrides with parameters defined.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 18:37:09 +01:00
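To illustrate the idea behind this fix, a minimal Go sketch; the type and field names are hypothetical, not LocalAI's actual gallery types. The point is that the listing type simply has no overrides field, so internals can never leak into the /models/available response:

```go
package main

import (
	"encoding/json"
	"fmt"
)

type galleryItem struct {
	Name        string         `json:"name"`
	Description string         `json:"description"`
	Overrides   map[string]any `json:"-"` // internal, used only at install time
}

// listEntry is what a listing endpoint would return per item.
type listEntry struct {
	Name        string `json:"name"`
	Description string `json:"description"`
}

func main() {
	item := galleryItem{
		Name:        "phi-4",
		Description: "example model",
		Overrides:   map[string]any{"context_size": 4096},
	}
	out, _ := json.Marshal(listEntry{Name: item.Name, Description: item.Description})
	fmt.Println(string(out)) // {"name":"phi-4","description":"example model"}
}
```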
Ettore Di Giacinto
7bc80c17f8 chore(model gallery): add LocalAI-functioncall-llama3.2-3b-v0.5 (#4766)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 10:19:31 +01:00
Ettore Di Giacinto
1996ceb293 chore(model gallery): add krutrim-ai-labs_krutrim-2-instruct (#4765)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 10:17:05 +01:00
Ettore Di Giacinto
0bc3dc43da chore(model gallery): add rubenroy_gilgamesh-72b (#4764)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 10:13:21 +01:00
Ettore Di Giacinto
3324c4e6cb chore(model gallery): add agi-0_art-skynet-3b (#4763)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 10:09:33 +01:00
LocalAI [bot]
7329db4e78 chore: ⬆️ Update ggerganov/llama.cpp to 3ec9fd4b77b6aca03a3c2bf678eae3f9517d6904 (#4762)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-04 21:48:49 +00:00
Ettore Di Giacinto
464686aee6 chore(model gallery): add suayptalha_maestro-10b (#4760)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-04 09:51:54 +01:00
Ettore Di Giacinto
bfa3d4ccff chore(model gallery): add nohobby_l3.3-prikol-70b-v0.4 (#4759)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-04 09:50:18 +01:00
Ettore Di Giacinto
6a91288c8c chore(model gallery): add fblgit_miniclaus-qw1.5b-unamgs-grpo (#4758)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-04 09:45:52 +01:00
dependabot[bot]
96cb407ee0 chore(deps): Bump docs/themes/hugo-theme-relearn from 5bcb9fe to 66bc366 (#4750)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `5bcb9fe` to `66bc366`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](5bcb9fe5e6...66bc366c47)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-02-04 08:57:19 +01:00
dependabot[bot]
5a19094d3a chore(deps): Bump sentence-transformers from 3.4.0 to 3.4.1 in /backend/python/transformers (#4748)
chore(deps): Bump sentence-transformers in /backend/python/transformers

Bumps [sentence-transformers](https://github.com/UKPLab/sentence-transformers) from 3.4.0 to 3.4.1.
- [Release notes](https://github.com/UKPLab/sentence-transformers/releases)
- [Commits](https://github.com/UKPLab/sentence-transformers/compare/v3.4.0...v3.4.1)

---
updated-dependencies:
- dependency-name: sentence-transformers
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-02-04 08:56:51 +01:00
LocalAI [bot]
e3b943ffcb chore: ⬆️ Update ggerganov/llama.cpp to 5598f475be3e31430fbe17ebb85654ec90dc201e (#4757)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-04 08:56:11 +01:00
dependabot[bot]
df30d6a482 chore(deps): Bump GrantBirki/git-diff-action from 2.7.0 to 2.8.0 (#4746)
Bumps [GrantBirki/git-diff-action](https://github.com/grantbirki/git-diff-action) from 2.7.0 to 2.8.0.
- [Release notes](https://github.com/grantbirki/git-diff-action/releases)
- [Commits](https://github.com/grantbirki/git-diff-action/compare/v2.7.0...v2.8.0)

---
updated-dependencies:
- dependency-name: GrantBirki/git-diff-action
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-02-03 22:21:40 +00:00
Ettore Di Giacinto
c3c27b7e3d chore(model gallery): small fixups to llama3.2-fcall template
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-03 17:58:57 +01:00
Ettore Di Giacinto
431716d4d6 fix(gallery): remove box token to llama3.2-fcall
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-02-03 16:10:33 +01:00
Ettore Di Giacinto
d290fd159f chore(model gallery): add LocalAI-functioncall-llama3.2-1b-v0.4 (#4740)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-03 15:55:49 +01:00
Ettore Di Giacinto
051faaf771 chore(model gallery): add uncensoredai_uncensoredlm-deepseek-r1-distill-qwen-14b (#4739)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-03 10:46:47 +01:00
Ettore Di Giacinto
41a2dfb0d9 chore(model gallery): add thedrummer_gemmasutra-pro-27b-v1.1 (#4738)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-03 10:37:24 +01:00
Ettore Di Giacinto
ed0094c3d0 chore(model gallery): add steelskull_l3.3-damascus-r1 (#4737)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-03 10:30:07 +01:00
LocalAI [bot]
52fadeded1 feat(swagger): update swagger (#4735)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-03 10:16:42 +01:00
LocalAI [bot]
a37fa8d9c4 chore: ⬆️ Update ggerganov/llama.cpp to 90f9b88afb6447d3929843a2aa98c0f11074762d (#4736)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-02 22:18:30 +00:00
Shraddha
03974a4dd4 feat: tokenization with llama.cpp (#4724)
feat: tokenization

Signed-off-by: shraddhazpy <shraddha@shraddhafive.in>
2025-02-02 17:39:43 +00:00
Ettore Di Giacinto
1d6afbd65d feat(llama.cpp): Add support to grammar triggers (#4733)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-02 13:25:03 +01:00
LocalAI [bot]
d79f02ea09 chore: ⬆️ Update ggerganov/llama.cpp to 53debe6f3c9cca87e9520a83ee8c14d88977afa4 (#4732)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-01 21:45:26 +00:00
Ettore Di Giacinto
ba2f426e3e chore(model gallery): add fuseo1-deekseekr1-qwq-skyt1-32b-preview (#4731)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-01 10:12:15 +01:00
LocalAI [bot]
732042e5c6 chore: ⬆️ Update ggerganov/llama.cpp to aa6fb1321333fae8853d0cdc26bcb5d438e650a1 (#4728)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-31 22:31:00 +00:00
Ettore Di Giacinto
f1763aabf2 chore(model gallery): add taid-llm-1.5b (#4727)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-31 14:53:39 +01:00
Ettore Di Giacinto
e0d90b173b chore(model gallery): add tinyswallow-1.5b-instruct (#4726)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-31 14:49:02 +01:00
Ettore Di Giacinto
ff07612bfa chore(model gallery): add mistral-small-24b-instruct-2501 (#4725)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-31 14:45:42 +01:00
LocalAI [bot]
7badaf78a0 chore: ⬆️ Update ggerganov/llama.cpp to 8b576b6c55bc4e6be898b47522f0ef402b93ef62 (#4722)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-31 11:31:46 +00:00
Ettore Di Giacinto
af41436f1b fix(tests): pin to branch for config used in tests (#4721)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-31 09:57:58 +01:00
LocalAI [bot]
cd5489ce47 chore(model-gallery): ⬆️ update checksum (#4723)
⬆️ Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-31 08:51:32 +01:00
Ettore Di Giacinto
60ec2cf751 chore(model gallery): add openthinker-7b (#4720)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-30 16:44:44 +01:00
Ettore Di Giacinto
244f4b564f chore(model gallery): add selene-1-mini-llama-3.1-8b (#4719)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-30 16:42:48 +01:00
Ettore Di Giacinto
f1d6d65417 chore(model gallery): add virtuoso-lite (#4718)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-30 16:38:35 +01:00
Ettore Di Giacinto
72e52c4f6a chore: drop embedded models (#4715)
Since the remote gallery was introduced, embedded models are completely superseded by it. In order to keep the code clean and remove redundant parts, let's simplify the usage.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-30 00:03:01 +01:00
LocalAI [bot]
1656e1a88e chore: ⬆️ Update ggerganov/llama.cpp to eb7cf15a808d4d7a71eef89cc6a9b96fe82989dc (#4717)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-29 21:45:38 +00:00
Ettore Di Giacinto
7f62b418a4 chore(docs): add documentation for l4t images
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-29 15:16:07 +01:00
Maximilian Kenfenheuer
1f4e66d638 chore(model gallery): add specific message templates for llama3.2 based models (#4707)
* chore(model gallery): add specific message templates for llama3.2 based models

Signed-off-by: Maximilian Kenfenheuer <maximilian.kenfenheuer@ksol.it>

* fix: yaml lint in llama3.2-quantized.yaml

Signed-off-by: Maximilian Kenfenheuer <maximilian.kenfenheuer@ksol.it>

* fix: yaml lint in llama3.2-quantized.yaml

Signed-off-by: Maximilian Kenfenheuer <maximilian.kenfenheuer@ksol.it>

---------

Signed-off-by: Maximilian Kenfenheuer <maximilian.kenfenheuer@ksol.it>
2025-01-29 10:19:48 +01:00
Maximilian Kenfenheuer
a37b2c765c docs: update advanced-usage.md to reflect changes in #4700 (#4709)
Signed-off-by: Maximilian Kenfenheuer <maximilian.kenfenheuer@ksol.it>
2025-01-28 22:58:35 +01:00
Maximilian Kenfenheuer
b4b67e00bd refactor: function argument parsing using named regex (#4708)
Signed-off-by: Maximilian Kenfenheuer <maximilian.kenfenheuer@ksol.it>
2025-01-28 22:58:02 +01:00
LocalAI [bot]
91e1ff5a95 chore: ⬆️ Update ggerganov/llama.cpp to cae9fb4361138b937464524eed907328731b81f6 (#4711)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-28 21:45:14 +00:00
dependabot[bot]
d9204ea3b5 chore(deps): Bump dependabot/fetch-metadata from 2.2.0 to 2.3.0 (#4701)
Bumps [dependabot/fetch-metadata](https://github.com/dependabot/fetch-metadata) from 2.2.0 to 2.3.0.
- [Release notes](https://github.com/dependabot/fetch-metadata/releases)
- [Commits](https://github.com/dependabot/fetch-metadata/compare/v2.2.0...v2.3.0)

---
updated-dependencies:
- dependency-name: dependabot/fetch-metadata
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-01-28 11:50:09 +01:00
LocalAI [bot]
3d0fbcb4f7 chore: ⬆️ Update ggerganov/llama.cpp to a4417ddda98fd0558fb4d802253e68a933704b59 (#4705)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-28 09:13:43 +01:00
dependabot[bot]
03f3df9a82 chore(deps): Bump docs/themes/hugo-theme-relearn from 8dad5ee to 5bcb9fe (#4704)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `8dad5ee` to `5bcb9fe`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](8dad5ee419...5bcb9fe5e6)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-01-28 09:13:00 +01:00
dependabot[bot]
fff35d5528 chore(deps): Bump sentence-transformers from 3.3.1 to 3.4.0 in /backend/python/transformers (#4702)
chore(deps): Bump sentence-transformers in /backend/python/transformers

Bumps [sentence-transformers](https://github.com/UKPLab/sentence-transformers) from 3.3.1 to 3.4.0.
- [Release notes](https://github.com/UKPLab/sentence-transformers/releases)
- [Commits](https://github.com/UKPLab/sentence-transformers/compare/v3.3.1...v3.4.0)

---
updated-dependencies:
- dependency-name: sentence-transformers
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-01-27 21:09:50 +00:00
Maximilian Kenfenheuer
539e94db73 feat: function argument parsing using named regex (#4700)
Signed-off-by: Maximilian Kenfenheuer <maximilian.kenfenheuer@ksol.it>
2025-01-27 15:53:05 +00:00
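A minimal Go sketch of argument parsing with named capture groups, as used by this feature; the pattern and group names here are illustrative, not the exact regex from #4700:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// named groups (?P<name>...) let the parser refer to captures by name
	re := regexp.MustCompile(`(?P<name>\w+)\s*=\s*"(?P<value>[^"]*)"`)
	input := `city = "Rome", unit = "celsius"`

	args := map[string]string{}
	for _, m := range re.FindAllStringSubmatch(input, -1) {
		// map each named group to its captured text
		groups := map[string]string{}
		for i, n := range re.SubexpNames() {
			if n != "" {
				groups[n] = m[i]
			}
		}
		args[groups["name"]] = groups["value"]
	}
	fmt.Println(args) // map[city:Rome unit:celsius]
}
```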
Ettore Di Giacinto
0f4f62cf3c chore(model gallery): add fuseo1-deepseekr1-qwq-32b-preview (#4699)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-27 09:51:06 +01:00
Ettore Di Giacinto
e7cffd7afa chore(model gallery): add fuseo1-deepseekr1-qwen2.5-instruct-32b-preview (#4698)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-27 09:31:47 +01:00
Ettore Di Giacinto
26d790a2b6 chore(model gallery): add fuseo1-deepseekr1-qwen2.5-coder-32b-preview-v0.1 (#4697)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-27 09:28:29 +01:00
Ettore Di Giacinto
5cf838c08d chore(model gallery): add confucius-o1-14b (#4696)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-27 09:26:00 +01:00
LocalAI [bot]
4db8f5cbce chore: ⬆️ Update ggerganov/llama.cpp to 178a7eb952d211b8d4232d5e50ae1b64519172a9 (#4694)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-26 21:44:54 +00:00
Ettore Di Giacinto
3b6b37a81b chore(model gallery): add deepseek-r1-qwen-2.5-32b-ablated (#4693)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-26 10:06:06 +01:00
Ettore Di Giacinto
8f5aa2d9de chore(model gallery): add dumpling-qwen2.5-32b (#4692)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-26 10:03:46 +01:00
Ettore Di Giacinto
a6bc8aa7c7 chore(model gallery): add l3.3-nevoria-r1-70b (#4691)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-26 10:01:37 +01:00
LocalAI [bot]
4ab107bc1a chore: ⬆️ Update ggerganov/llama.cpp to 26771a1491f3a4c3d5b99c4c267b81aca9a7dfa0 (#4690)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-25 21:44:14 +00:00
Ettore Di Giacinto
4c3710a531 chore(model gallery): add chuluun-qwen2.5-72b-v0.08 (#4689)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-25 11:07:31 +01:00
Ettore Di Giacinto
901b06284a chore(model gallery): add art-v0-3b (#4688)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-25 11:06:05 +01:00
Ettore Di Giacinto
8eef5a2c5e chore(model gallery): add lamarck-14b-v0.7 (#4687)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-25 11:04:12 +01:00
Gianluca Boiano
e9cace137b chore(model gallery): update deepseek-r1 prompt template (#4686)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-25 09:04:38 +01:00
LocalAI [bot]
9409c99738 chore: ⬆️ Update ggerganov/llama.cpp to c5d9effb49649db80a52caf5c0626de6f342f526 (#4685)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-24 21:45:54 +00:00
Ettore Di Giacinto
4d44ebc2f2 chore(deps): bump grpcio to 1.70.0 (#4682)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-24 10:18:22 +01:00
Gianluca Boiano
9a1182fa01 chore(model gallery): add flux.1, stablediffusion and whisper icons (#4680)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-24 08:29:02 +01:00
Gianluca Boiano
66e9ef3f33 chore(model gallery): add DeepSeek R1 14b, 32b and 70b (#4679)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-24 08:28:44 +01:00
Ettore Di Giacinto
8282414583 chore(downloader): support hf.co and hf:// URIs (#4677)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-24 08:27:22 +01:00
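For illustration, a Go sketch of translating an hf:// URI into a downloadable URL; the exact mapping LocalAI uses is assumed here to be the standard Hugging Face "resolve" URL layout (owner/repo/file on the main branch):

```go
package main

import (
	"fmt"
	"strings"
)

// hfToHTTP converts hf://owner/repo/path-to-file into a huggingface.co URL.
func hfToHTTP(uri string) (string, bool) {
	rest, ok := strings.CutPrefix(uri, "hf://")
	if !ok {
		return "", false
	}
	parts := strings.SplitN(rest, "/", 3) // owner, repo, file path
	if len(parts) != 3 {
		return "", false
	}
	return fmt.Sprintf("https://huggingface.co/%s/%s/resolve/main/%s",
		parts[0], parts[1], parts[2]), true
}

func main() {
	u, _ := hfToHTTP("hf://TheBloke/some-repo/model.Q4_K_M.gguf")
	fmt.Println(u)
}
```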
Gianluca Boiano
d1d7ce83d4 chore(model gallery): add MiniCPM-o-2.6-7.6b (#4676)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-24 08:27:02 +01:00
Ettore Di Giacinto
5177837ab0 chore: detect and enable avx512 builds (#4675)
chore(avx512): add support

Fixes https://github.com/mudler/LocalAI/issues/4662

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-24 08:26:44 +01:00
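A minimal sketch of CPU capability detection with golang.org/x/sys/cpu; how LocalAI actually maps these flags to build variants is assumed here for illustration:

```go
package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

func main() {
	// pick the most capable variant the CPU supports
	switch {
	case cpu.X86.HasAVX512F:
		fmt.Println("selecting avx512 build")
	case cpu.X86.HasAVX2:
		fmt.Println("selecting avx2 build")
	case cpu.X86.HasAVX:
		fmt.Println("selecting avx build")
	default:
		fmt.Println("selecting fallback build")
	}
}
```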
Ettore Di Giacinto
f9e368b7c4 chore(refactor): group cpu cap detection (#4674)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-23 16:35:44 +01:00
Ettore Di Giacinto
eef80b9880 chore(ci): cleanup tests
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-23 10:02:57 +01:00
Ettore Di Giacinto
073eaec729 chore(openvoice): drop backend (#4673)
The project (MeloTTS) has been quiet for a long time; newer backends are much more performant and offer better quality overall.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-23 10:00:36 +01:00
Ettore Di Giacinto
318225f631 chore(parler-tts): drop backend (#4672)
At this point we support more extensive backends that are SOTA and also support voice cloning and many other features. This backend is superseded and poses a significant maintenance burden: https://github.com/mudler/LocalAI/issues/3941 is still open because its deps pin old versions of grpc.

Closes https://github.com/mudler/LocalAI/issues/3941

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-23 09:46:16 +01:00
Ettore Di Giacinto
89429a439b feat(transformers): add support to Mamba (#4669)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-23 09:30:47 +01:00
LocalAI [bot]
200fe358f0 chore: ⬆️ Update ggerganov/llama.cpp to 6152129d05870cb38162c422c6ba80434e021e9f (#4668)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-23 08:06:43 +01:00
Ettore Di Giacinto
e426ab7c23 feat(faster-whisper): add backend (#4666)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-23 08:06:18 +01:00
LocalAI [bot]
715071b68d feat(swagger): update swagger (#4667)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-22 21:51:38 +01:00
Peter Cover
a05737c7e4 chore: fix some function names in comment (#4665)
Signed-off-by: petercover <raowanxiang@outlook.com>
2025-01-22 19:35:53 +01:00
Richard Palethorpe
e8eb0b2c50 fix(stores): Stores fixes and testing (#4663)
* fix(stores): Actually check a vector is a unit vector/normalized

Instead of just summing the components to see if they equal 1.0, take the actual magnitude/p-norm of the vector and check that it is approximately 1.0.

Note that this shouldn't change the order of results except in edge cases, if I am too lax with the precision of the equality comparison. However, it should improve performance for normalized vectors, which were being misclassified.

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(stores): Add tests for known results and triangle inequality

This adds some more tests to check the cosine similarity function has
some expected mathematical properties.

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2025-01-22 19:35:05 +01:00
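A Go sketch of the normalization check described above: compare the L2 norm (magnitude) against 1.0 within a tolerance instead of summing components. The tolerance value and function name are illustrative:

```go
package main

import (
	"fmt"
	"math"
)

const eps = 1e-4 // tolerance for the approximate equality; value is illustrative

// isNormalized reports whether v is (approximately) a unit vector.
func isNormalized(v []float32) bool {
	var sumSq float64
	for _, x := range v {
		sumSq += float64(x) * float64(x)
	}
	return math.Abs(math.Sqrt(sumSq)-1.0) < eps
}

func main() {
	fmt.Println(isNormalized([]float32{0.6, 0.8})) // true: magnitude is 1.0
	fmt.Println(isNormalized([]float32{0.5, 0.5})) // false, though the components sum to 1.0
}
```

The second case shows why summing components misclassifies: {0.5, 0.5} sums to 1.0 but has magnitude ≈0.707.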
Ettore Di Giacinto
e15d29aba2 chore(stablediffusion-ncn): drop in favor of ggml implementation (#4652)
* chore(stablediffusion-ncn): drop in favor of ggml implementation

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(ci): drop stablediffusion build

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(tests): add

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(tests): try to fixup current tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Try to fix tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Tests improvements

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(tests): use quality to specify step

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(tests): switch to sd-1.5

also increase prep time for downloading models

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-22 19:34:16 +01:00
Ettore Di Giacinto
10675ac28e Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-22 18:07:30 +01:00
Ettore Di Giacinto
0ec25b8b07 chore(model gallery): add sd-1.5-ggml and sd-3.5-medium-ggml (#4664)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-22 16:37:20 +01:00
LocalAI [bot]
e81ceff681 chore: ⬆️ Update ggerganov/llama.cpp to 6171c9d25820ccf676b243c172868819d882848f (#4661)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-21 22:04:29 +00:00
Ettore Di Giacinto
6831719e1e chore(model gallery): add deepseek-r1-distill-qwen-7b (#4660)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-21 15:09:36 +01:00
Gianluca Boiano
b264a91b3f chore(model gallery): add Deepseek-R1-Distill models (#4646)
* chore(model gallery): add Deepseek-R1-Distill-Llama-8b

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): add Deepseek-R1-Distill-Qwen-1.5b

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

---------

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-21 10:37:05 +01:00
LocalAI [bot]
1a08948e63 chore: ⬆️ Update ggerganov/llama.cpp to aea8ddd5165d525a449e2fc3839db77a71f4a318 (#4657)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-21 08:37:13 +01:00
dependabot[bot]
14a1e02f44 chore(deps): Bump docs/themes/hugo-theme-relearn from 80e448e to 8dad5ee (#4656)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `80e448e` to `8dad5ee`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](80e448e5bd...8dad5ee419)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-01-20 23:33:40 +00:00
Ettore Di Giacinto
2f09aa1b85 chore(model gallery): add sd-3.5-large-ggml (#4647)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-20 19:04:23 +01:00
Gianluca Boiano
a396040886 chore(model gallery): remove dead icons and update LLAVA and DeepSeek ones (#4645)
* chore(model gallery): update icons and add LLAVA ones

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): fix all complains related to yamllint

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

---------

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-20 16:13:19 +01:00
Ettore Di Giacinto
aeb1dca52e chore(model gallery): add l3.3-prikol-70b-v0.2 (#4643)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-20 11:03:35 +01:00
Ettore Di Giacinto
83a8d90c52 chore(model gallery): add l3.3-70b-magnum-v4-se (#4642)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-20 10:50:29 +01:00
Ettore Di Giacinto
adebd557ce chore(model gallery): add wayfarer-12b (#4641)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-20 10:45:10 +01:00
Gianluca Boiano
0c0e015b38 chore(model gallery): update icons and add missing ones (#4639)
* chore(model gallery): uniform github URLs for icons

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): add icons to phi models

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): add icons to QwenLM models

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): update icon for Arcee org

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): update icon for Meta org

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): update icon url for OpenCoder org

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): add icon for RWKV org

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): add icon for IBM-granite org

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): add icon for OpenBMB org

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): add icon for KatanemoLabs org

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): update icon for Meta-Llama-3.1-8B-Instruct-abliterated

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): update icon for hermes-3-llama-3.1-8b-lorablated

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): add icon for Google org

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

---------

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-20 10:40:46 +01:00
Gianluca Boiano
390bb3f58b fix(model gallery): minicpm-v-2.6 is based on qwen2 (#4638)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-20 10:35:05 +01:00
Gianluca Boiano
30739d94a4 chore(model gallery): add InternLM3-8b-Q4_K_M (#4637)
chore(model gallery): add InternLM3-8b-Q4_K_M

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-20 10:34:19 +01:00
LocalAI [bot]
83e2dd5dff chore: ⬆️ Update ggerganov/llama.cpp to 92bc493917d43b83e592349e138b54c90b1c3ea7 (#4640)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-19 22:34:32 +00:00
Ettore Di Giacinto
f496d0113b chore(deps): pin numba
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-19 09:07:56 +01:00
LocalAI [bot]
a752183fb5 chore: ⬆️ Update ggerganov/llama.cpp to a1649cc13f89946322358f92ea268ae1b7b5096c (#4635)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-19 08:38:33 +01:00
LocalAI [bot]
296b97925f chore: ⬆️ Update leejet/stable-diffusion.cpp to 5eb15ef4d022bef4a391de4f5f6556e81fbb5024 (#4636)
⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-18 22:21:27 +00:00
Gianluca Boiano
d0cc3047dc chore(model gallery): add MiniCPM-V-2.6-8b-q4_K_M (#4633)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-18 18:36:05 +01:00
Gianluca Boiano
032a33de49 chore: remove deprecated tinydream backend (#4631)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-18 18:35:30 +01:00
Ettore Di Giacinto
1e9bf19c8d feat(transformers): merge sentencetransformers backend (#4624)
* merge sentencetransformers

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add alias to silently redirect sentencetransformers to transformers

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add alias also for transformers-musicgen

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Drop from makefile

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Move tests from sentencetransformers

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Remove sentencetransformers

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Remove tests from CI (part of transformers)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Do not always try to load the tokenizer

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Adapt tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fix typo

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Tiny adjustments

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-18 18:30:30 +01:00
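A tiny Go sketch of the silent redirect described above: alias names resolve to the merged backend before dispatch. The alias pairs follow this PR; the lookup function itself is illustrative:

```go
package main

import "fmt"

// aliases silently redirected to the merged transformers backend
var backendAliases = map[string]string{
	"sentencetransformers":  "transformers",
	"transformers-musicgen": "transformers",
}

func resolveBackend(name string) string {
	if target, ok := backendAliases[name]; ok {
		return target
	}
	return name
}

func main() {
	fmt.Println(resolveBackend("sentencetransformers")) // transformers
}
```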
Gianluca Boiano
4bd8434ae0 fix(docs): add missing -core suffix to sycl images (#4630)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-18 15:47:49 +01:00
Ettore Di Giacinto
958f6eb722 chore(llama.cpp): update dependency (#4628)
Update to '3edfa7d3753c29e44b964c0ff424d2ea8d5fdee6' and adapt to upstream changes

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-18 11:55:13 +01:00
mintyleaf
96306a39a0 chore(docs): extra-Usage and Machine-Tag docs (#4627)
Rename LocalAI-Extra-Usage -> Extra-Usage, add MACHINE_TAG as cli flag option, add docs about extra-usage and machine-tag

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>
2025-01-18 08:58:38 +01:00
LocalAI [bot]
895cd7c76a feat(swagger): update swagger (#4625)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-18 08:57:49 +01:00
dependabot[bot]
cbdbe59f16 chore(deps): Bump scipy from 1.14.0 to 1.15.1 in /backend/python/transformers (#4621)
chore(deps): Bump scipy in /backend/python/transformers

Bumps [scipy](https://github.com/scipy/scipy) from 1.14.0 to 1.15.1.
- [Release notes](https://github.com/scipy/scipy/releases)
- [Commits](https://github.com/scipy/scipy/compare/v1.14.0...v1.15.1)

---
updated-dependencies:
- dependency-name: scipy
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-01-17 22:14:11 +00:00
Ettore Di Giacinto
ee7904f170 feat(transformers): add support to OuteTTS (#4622)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-17 19:33:25 +01:00
Ettore Di Giacinto
a761e01944 chore: alias transformers-musicgen to transformers (#4623)
chore: alias transformers-musicgen to transformers

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-17 18:16:17 +01:00
mintyleaf
96f8ec0402 feat: add machine tag and inference timings (#4577)
* Add machine tag option, add extraUsage option, grpc-server -> proto -> endpoint extraUsage data is broken for now

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>

* remove redundant timing fields, fix non-working timings output

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>

* use middleware for Machine-Tag only if tag is specified

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>

---------

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>
2025-01-17 17:05:58 +01:00
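A minimal Go sketch of the conditional tagging middleware described above: the response header is added only when a machine tag is configured. Written here with net/http for self-containment; the actual LocalAI wiring is assumed:

```go
package main

import "net/http"

// machineTag wraps next so every response carries the configured tag.
func machineTag(tag string, next http.Handler) http.Handler {
	if tag == "" {
		return next // no tag configured: skip the middleware entirely
	}
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Machine-Tag", tag)
		next.ServeHTTP(w, r)
	})
}

func main() {
	mux := http.NewServeMux()
	mux.HandleFunc("/v1/models", func(w http.ResponseWriter, r *http.Request) {})
	_ = http.ListenAndServe(":8080", machineTag("worker-01", mux))
}
```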
Ettore Di Giacinto
8027fdf1c7 feat(transformers): merge musicgen functionalities to a single backend (#4620)
* feat(transformers): merge musicgen functionalities to a single backend

So we optimize space

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* specify type in tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Some adaptations for the MusicgenForConditionalGeneration type

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-17 17:01:16 +01:00
Ettore Di Giacinto
212c8e1a6d Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-17 15:11:10 +01:00
LocalAI [bot]
78533d7230 chore: ⬆️ Update ggerganov/llama.cpp to 4dbc8b9cb71876e005724f4e8f73a3544646bcf5 (#4618)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-17 10:25:04 +01:00
Ettore Di Giacinto
b5eeb5c5ab ci(arm64): run in parallel
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-17 10:24:15 +01:00
Ettore Di Giacinto
b147ad0596 ci: try to build for arm64
Try to use the free arm64 runners from Github:
https://github.blog/changelog/2025-01-16-linux-arm64-hosted-runners-now-available-for-free-in-public-repositories-public-preview/

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-17 10:14:26 +01:00
Ettore Di Giacinto
7d0ac1ea3f chore(vall-e-x): Drop backend (#4619)
There are many new SOTA architectures that replace vall-e-x nowadays.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-17 09:35:10 +01:00
Ettore Di Giacinto
d08d97bebf chore(model gallery): fix typo
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-16 22:26:55 +01:00
Ettore Di Giacinto
acb2eb23c8 feat(tts): Add Kokoro backend (#4616)
* feat(kokoro): Add new TTS backend

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add kokoro to images

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Support combined voices

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Ignore pt and onnx

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add plbert and istfnet

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-16 22:23:09 +01:00
Ettore Di Giacinto
de4aa9fb1d chore(model gallery): add vikhr-qwen-2.5-1.5b-instruct (#4615)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-16 10:09:25 +01:00
Ettore Di Giacinto
560ba6f25e chore(model gallery): add drt-o1-14b (#4614)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-16 10:04:44 +01:00
Ettore Di Giacinto
8131ddd878 chore(model gallery): add uwu-7b-instruct (#4613)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-16 09:58:14 +01:00
LocalAI [bot]
26c3deb673 chore: ⬆️ Update ggerganov/llama.cpp to adc5dd92e8aea98f5e7ac84f6e1bc15de35130b5 (#4612)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-16 00:08:52 +00:00
Ettore Di Giacinto
6d20497d45 chore(model gallery): add lb-reranker-0.5b-v1.0 (#4611)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-15 15:54:12 +01:00
Ettore Di Giacinto
482c6b8be4 chore(model gallery): add l3.3-ms-nevoria-70b (#4610)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-15 15:51:50 +01:00
Ettore Di Giacinto
5bba5edf45 chore(model gallery): add qwerus-7b (#4609)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-15 15:46:45 +01:00
Ettore Di Giacinto
792b866727 Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-15 15:46:27 +01:00
LocalAI [bot]
f053f7bde2 chore: ⬆️ Update ggerganov/llama.cpp to b4d92a59a20eea400d8dd30844a339b76210daa0 (#4606)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-14 22:16:33 +00:00
Ettore Di Giacinto
d7dee3a5ec feat(diffusers): add support for Sana pipelines (#4603)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 11:13:16 +01:00
Ettore Di Giacinto
b8d74e52b1 chore(model gallery): add steiner-32b-preview (#4602)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 09:41:30 +01:00
Ettore Di Giacinto
62abe0d2c9 chore(model gallery): add qwen2.5-72b-rp-ink (#4601)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 09:33:19 +01:00
Ettore Di Giacinto
5414c294c4 chore(model gallery): add negative-anubis-70b-v1 (#4600)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 09:29:25 +01:00
Ettore Di Giacinto
1b3e89c89c chore(model gallery): add LocalAI-functioncall-phi-4-v0.3 (#4599)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 09:27:18 +01:00
Ettore Di Giacinto
69c6e5b192 chore(stablediffusion-ggml): disable sycl optimizations (#4598)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-14 09:17:55 +01:00
LocalAI [bot]
0c02512f15 chore: ⬆️ Update ggerganov/llama.cpp to 504af20ee4eae72080a56d59d744f6774f7901ce (#4597)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-14 09:07:20 +01:00
dependabot[bot]
b0ead0bf12 chore(deps): Bump securego/gosec from 2.21.4 to 2.22.0 (#4594)
Bumps [securego/gosec](https://github.com/securego/gosec) from 2.21.4 to 2.22.0.
- [Release notes](https://github.com/securego/gosec/releases)
- [Changelog](https://github.com/securego/gosec/blob/master/.goreleaser.yml)
- [Commits](https://github.com/securego/gosec/compare/v2.21.4...v2.22.0)

---
updated-dependencies:
- dependency-name: securego/gosec
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-01-13 21:17:11 +00:00
Ettore Di Giacinto
ab5adf40af chore(deps): bump llama.cpp to '924518e2e5726e81f3aeb2518fb85963a500e… (#4592)
chore(deps): bump llama.cpp to '924518e2e5726e81f3aeb2518fb85963a500e93a'

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-13 17:33:06 +01:00
Ettore Di Giacinto
8d82afb595 fix(stablediffusion-ggml): enable oneapi before build (#4593)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-13 10:11:48 +01:00
Ettore Di Giacinto
aea71dd2c6 fix(stablediffusion-ggml): correctly enable sycl (#4591)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 22:07:01 +01:00
Ettore Di Giacinto
9fdb44323d chore(model gallery): add LocalAI-functioncall-phi-4-v0.2 (#4589)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 18:50:41 +01:00
Ettore Di Giacinto
6a299c04a7 feat(stablediffusion-ggml): respect build type (#4581)
* feat(stablediffusion-ggml): respect build type

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* combine libraries

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 18:33:51 +01:00
Ettore Di Giacinto
9ce71fe427 fix(gallery): correct UL typo
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 11:50:40 +01:00
Ettore Di Giacinto
e8de7b52da chore(model gallery): add LocalAI-functioncall-phi-4-v0.1 (#4588)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 11:26:42 +01:00
Ettore Di Giacinto
1780ccadbc chore(model gallery): add finemath-llama-3b (#4587)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 10:40:26 +01:00
Ettore Di Giacinto
f8cffd05e5 chore(model gallery): add negative_llama_70b (#4586)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 10:36:01 +01:00
Ettore Di Giacinto
b898cd49b5 chore(model gallery): add sky-t1-32b-preview (#4585)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-12 10:33:29 +01:00
LocalAI [bot]
7cd33d10c9 chore: ⬆️ Update ggerganov/llama.cpp to c05e8c9934f94fde49bc1bc9dc51eed282605150 (#4579)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-11 23:25:09 +01:00
Ettore Di Giacinto
cd480dbe5c chore(model gallery): add rombos-qwen2.5-writer-32b (#4584)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-11 23:24:55 +01:00
Ettore Di Giacinto
cb8bf79ada chore(model gallery): add qwq-32b-preview-ideawhiz-v1 (#4583)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-11 22:45:37 +01:00
Ettore Di Giacinto
b206eab80f chore(model gallery): add nightwing3-10b-v0.1 (#4582)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-11 22:41:30 +01:00
LocalAI [bot]
80dc23fab9 chore(model-gallery): ⬆️ update checksum (#4580)
⬆️ Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-11 22:23:10 +01:00
LocalAI [bot]
844c0c422d docs: ⬆️ update docs version mudler/LocalAI (#4578)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-11 22:10:45 +01:00
LocalAI [bot]
07655c0c2e chore: ⬆️ Update ggerganov/llama.cpp to ba8a1f9c5b675459c55a83e3f97f10df3a66c788 (#4575)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-10 21:45:11 +00:00
Ettore Di Giacinto
bebfd19b45 chore(model gallery): add phi-3.5-moe-instruct (#4574)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-10 16:59:34 +01:00
Ettore Di Giacinto
6e34430d99 chore(model gallery): add chuluun-qwen2.5-72b-v0.01 (#4573)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-10 16:27:22 +01:00
Ettore Di Giacinto
0d08aaa29b chore(model gallery): add gwq-9b-preview2 (#4572)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-10 16:26:20 +01:00
Ettore Di Giacinto
66f9c06e7d docs: update README with langchain integration
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-10 09:20:33 +01:00
LocalAI [bot]
775adf871f chore: ⬆️ Update ggerganov/llama.cpp to 1204f9727005974587d6fc1dcd4d4f0ead87c856 (#4570)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-09 21:46:19 +00:00
Ettore Di Giacinto
a0fc19a3d6 chore(model gallery): add 70b-l3.3-cirrus-x1 (#4569)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-09 16:28:55 +01:00
Ettore Di Giacinto
7bd18662a7 chore(model gallery): add huatuogpt-o1-7b-v0.1 (#4568)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-09 16:01:57 +01:00
Ettore Di Giacinto
95b0739906 chore(model gallery): add minithinky-v2-1b-llama-3.2 (#4567)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-09 15:58:38 +01:00
Ettore Di Giacinto
cad7e9a1cd chore(model gallery): add 14b-qwen2.5-freya-x1 (#4566)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-09 15:55:56 +01:00
Ettore Di Giacinto
4426efab05 chore(deps): bump edgevpn to v0.29.0 (#4564)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-09 09:33:22 +01:00
Saarthak Verma
6765b17acd feat(dowloader): resume partial downloads (#4537)
* feat(resume downloads): add basic tests

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>

* test(resume downloads): implement file download tc

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>

* test(resume downloads): add resume partial download test

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>

* feat(resume downloads): implement resumable downloads for interrupted transfers

- Adds support for resuming partially downloaded files
- Uses HTTP Range header to continue from last byte position
- Maintains download progress across interruptions
- Preserves partial downloads with .partial extension
- Validates SHA256 checksum after completion

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>

* fix(resume downloads): incorrect download percent on front end

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>

* feat(resume download): add range header check tc

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>

* feat(resume download): implement range header check

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>

---------

Signed-off-by: Saarthak Verma <saarthakverma739@gmail.com>
2025-01-09 09:22:52 +01:00
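A Go sketch of the resume logic described above (see the list of behaviors in the commit): stat the .partial file and ask the server to continue from that byte offset via the Range header. Error handling and the SHA256 validation are elided; names are illustrative:

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
)

func resumeDownload(url, dest string) error {
	partial := dest + ".partial"
	var offset int64
	if fi, err := os.Stat(partial); err == nil {
		offset = fi.Size() // bytes already downloaded
	}

	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return err
	}
	if offset > 0 {
		req.Header.Set("Range", fmt.Sprintf("bytes=%d-", offset))
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// 206 means the server honoured the Range header; anything else restarts.
	flags := os.O_CREATE | os.O_WRONLY
	if resp.StatusCode == http.StatusPartialContent {
		flags |= os.O_APPEND
	} else {
		flags |= os.O_TRUNC
	}
	f, err := os.OpenFile(partial, flags, 0o644)
	if err != nil {
		return err
	}
	defer f.Close()
	if _, err := io.Copy(f, resp.Body); err != nil {
		return err // the .partial file is kept for the next attempt
	}
	// after a SHA256 check (omitted here), promote the partial file
	return os.Rename(partial, dest)
}

func main() {
	_ = resumeDownload("https://example.com/model.gguf", "model.gguf")
}
```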
Ettore Di Giacinto
ae1340d59b chore: update labeler.yml to include go files (#4565)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-09 09:20:51 +01:00
Ettore Di Giacinto
fc52f179fe chore(model gallery): add phi-4 (#4562)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-08 23:26:25 +01:00
LocalAI [bot]
4f43a9a162 chore: ⬆️ Update ggerganov/llama.cpp to 8d59d911711b8f1ba9ec57c4b192ccd2628af033 (#4561)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-08 21:43:56 +00:00
Ettore Di Giacinto
20edd44463 chore(model gallery): add dolphin3.0-qwen2.5-3b (#4560)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-08 11:57:25 +01:00
Ettore Di Giacinto
1a4f9d8453 chore(model gallery): add dolphin3.0-qwen2.5-1.5b (#4559)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-08 11:55:44 +01:00
Ettore Di Giacinto
f2dd33b8f4 chore(model gallery): add dolphin3.0-qwen2.5-0.5b (#4558)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-08 11:53:46 +01:00
LocalAI [bot]
25e988868c chore: ⬆️ Update ggerganov/llama.cpp to 53ff6b9b9fb25ed0ec0a213e05534fe7c3d0040f (#4556)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-08 11:51:14 +01:00
Ettore Di Giacinto
ab344e4f47 docs: update compatibility-table.md (#4557)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-07 21:20:44 +01:00
Ettore Di Giacinto
fac7893dd6 chore(model gallery): add dolphin3.0-llama3.2-3b (#4555)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-07 20:27:29 +01:00
Ettore Di Giacinto
9be338cfe4 chore(model gallery): add dolphin3.0-llama3.2-1b (#4554)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-07 20:24:21 +01:00
Ettore Di Giacinto
b4d4f96919 chore(model gallery): add dolphin3.0-llama3.1-8b (#4553)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-07 20:23:35 +01:00
Max Goltzsche
8cc2d01caa feat(ui): path prefix support via HTTP header (#4497)
Makes the web app honour the `X-Forwarded-Prefix` HTTP request header, which a reverse-proxy may send to inform the app that its public routes are served under a path prefix.
For instance, this allows serving the webapp via a reverse-proxy/ingress controller under a path prefix/sub-path such as `/localai/`, while the regular LocalAI routes/paths remain usable without a prefix when connecting directly to the LocalAI server.

Changes:
* Add new `StripPathPrefix` middleware to strip the path prefix (provided with the `X-Forwarded-Prefix` HTTP request header) from the request path prior to matching the HTTP route.
* Add a `BaseURL` utility function to build the base URL, honouring the `X-Forwarded-Prefix` HTTP request header.
* Generate the derived base URL into the HTML (`head.html` template) as `<base/>` tag.
* Make all webapp-internal URLs (within HTML+JS) relative in order to make the browser resolve them against the `<base/>` URL specified within each HTML page's header.
* Make font URLs within the CSS files relative to the CSS file.
* Generate redirect location URLs using the new `BaseURL` function.
* Use the new `BaseURL` function to generate absolute URLs within gallery JSON responses.

Closes #3095

TL;DR:
The header-based approach lets the path-prefix configuration live entirely at the reverse-proxy/ingress, instead of having to keep the prefix aligned between LocalAI, the reverse-proxy, and potentially other internal LocalAI clients; a minimal middleware sketch follows this entry.
The gofiber swagger handler already supports path prefixes this way, see e2d9e9916d/swagger.go (L79)

Signed-off-by: Max Goltzsche <max.goltzsche@gmail.com>
2025-01-07 17:18:21 +01:00
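A hypothetical sketch of the `StripPathPrefix` middleware and `BaseURL` helper described above, written against Fiber (the framework LocalAI's HTTP layer uses). The function bodies are illustrative assumptions; in particular, the real `BaseURL` builds a complete absolute URL, while this sketch returns only the path portion:

```go
// Minimal sketch (assumed logic) of stripping X-Forwarded-Prefix before route
// matching and deriving a base URL for the <base/> tag and redirects.
package main

import (
	"strings"

	"github.com/gofiber/fiber/v2"
)

// StripPathPrefix removes the prefix announced via X-Forwarded-Prefix from the
// request path prior to route matching, so routes are declared without it.
func StripPathPrefix() fiber.Handler {
	return func(c *fiber.Ctx) error {
		if prefix := c.Get("X-Forwarded-Prefix"); prefix != "" {
			prefix = "/" + strings.Trim(prefix, "/")
			if path := c.Path(); strings.HasPrefix(path, prefix) {
				stripped := strings.TrimPrefix(path, prefix)
				if stripped == "" {
					stripped = "/"
				}
				c.Path(stripped)
			}
		}
		return c.Next()
	}
}

// BaseURL returns the public base path, honouring the forwarded prefix
// (simplified: the real helper would build a complete absolute URL).
func BaseURL(c *fiber.Ctx) string {
	if prefix := c.Get("X-Forwarded-Prefix"); prefix != "" {
		return "/" + strings.Trim(prefix, "/") + "/"
	}
	return "/"
}

func main() {
	app := fiber.New()
	app.Use(StripPathPrefix())
	app.Get("/models", func(c *fiber.Ctx) error {
		// Emitted into the head.html template as <base href="..."/> in the real app.
		return c.SendString("base: " + BaseURL(c))
	})
	_ = app.Listen(":8080")
}
```

With this in place, a reverse-proxy only needs to set `X-Forwarded-Prefix: /localai` on forwarded requests; LocalAI itself needs no prefix configuration.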
LocalAI [bot]
bf37eebecb chore: ⬆️ Update ggerganov/llama.cpp to ecebbd292d741ac084cf248146b2cfb17002aa1d (#4552)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-07 10:14:07 +01:00
dependabot[bot]
3f0850b58b chore(deps): Bump docs/themes/hugo-theme-relearn from d25f856 to 80e448e (#4549)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `d25f856` to `80e448e`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](d25f856477...80e448e5bd)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-01-06 20:20:34 +00:00
Ettore Di Giacinto
2ffa89b8b9 chore(model gallery): add 14b-qwen2.5-kunou-v1 (#4547)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-06 10:43:09 +01:00
Ettore Di Giacinto
d43adc0205 chore(model gallery): add triangulum-10b (#4546)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-06 10:42:45 +01:00
Ettore Di Giacinto
78b34505ab chore(model gallery): add 32b-qwen2.5-kunou-v1 (#4545)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-06 10:08:35 +01:00
LocalAI [bot]
e55a1bed59 chore: ⬆️ Update ggerganov/llama.cpp to b56f079e28fda692f11a8b59200ceb815b05d419 (#4544)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-05 21:43:06 +00:00
Ettore Di Giacinto
0d7550ad54 chore(deps): bump grpcio to 1.69.0 (#4543)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-05 15:01:49 +01:00
Ettore Di Giacinto
b5992255ac chore(model gallery): add qwentile2.5-32b-instruct (#4541)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-05 09:22:00 +01:00
Ettore Di Giacinto
e845cc0401 chore(model gallery): add llama-deepsync-3b (#4540)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-05 09:19:05 +01:00
Ettore Di Giacinto
a10033e8a4 chore(model gallery): add experimental-lwd-mirau-rp-14b-iq-imatrix (#4539)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-05 09:12:12 +01:00
LocalAI [bot]
6c6d840e6b chore: ⬆️ Update ggerganov/llama.cpp to 9394bbd484f802ce80d2858033583af3ef700d25 (#4536)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-04 21:43:08 +00:00
Ettore Di Giacinto
a8b3b3d6f4 chore(model gallery): add llama3.1-8b-prm-deepseek-data (#4535)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-04 09:48:34 +01:00
Ettore Di Giacinto
ec66f7e3b1 chore(model gallery): add codepy-deepthink-3b (#4534)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-04 09:45:07 +01:00
Ettore Di Giacinto
05841c2435 chore(model gallery): add drt-o1-7b (#4533)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-04 09:44:14 +01:00
Ettore Di Giacinto
c553d73748 chore(deps): bump llama.cpp to 4b0c638b9 (#4532)
deps(llama.cpp): bump to 4b0c638b9

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-04 09:40:08 +01:00
Ettore Di Giacinto
1006e8a2ed ci: disable arm jobs
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-03 21:58:04 +01:00
Ettore Di Giacinto
9bcfda171b ci: lower concurrent jobs
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-03 20:48:23 +01:00
Ettore Di Giacinto
baee4f7bd5 ci: split jobs
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-03 19:23:05 +01:00
Ettore Di Giacinto
286dc32fe0 ci(arm64): try building on self-hosted
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-03 19:18:18 +01:00
Ettore Di Giacinto
36e4c0fcf0 chore(model gallery): add nera_noctis-12b (#4530)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-03 09:10:03 +01:00
LocalAI [bot]
3c21c8789a chore: ⬆️ Update ggerganov/llama.cpp to 2f0ee84b9b02d2a98742308026f060ebdc2423f1 (#4528)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-02 21:43:37 +00:00
Ettore Di Giacinto
d9facbcee9 chore(model gallery): add l3.1-purosani-2-8b (#4527)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-02 10:46:11 +01:00
Ettore Di Giacinto
930280ecac chore(model gallery): add sainemo-remix (#4526)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-02 10:46:01 +01:00
Ettore Di Giacinto
3415e6ae74 chore(model gallery): add qwenwify2.5-32b-v4.5 (#4525)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-02 10:45:52 +01:00
Ettore Di Giacinto
f1082f3c6d chore(model gallery): add violet_twilight-v0.2 (#4524)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-01 14:41:48 +01:00
Ettore Di Giacinto
f345f7a795 chore(model gallery): add captain-eris-diogenes_twilight-v0.420-12b (#4523)
chore(model gallery): add captain-eris-diogenes_twilight-v0.420-12b-arm-imatrix

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-01 13:33:39 +01:00
Ettore Di Giacinto
1a2a7a57b3 chore(model gallery): add mn-12b-mag-mell-r1-iq-arm-imatrix (#4522)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-01 13:27:13 +01:00
Ettore Di Giacinto
ae80a2bd24 chore(model gallery): add smallthinker-3b-preview (#4521)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-01 13:26:48 +01:00
LocalAI [bot]
c30ecdd535 chore: ⬆️ Update ggerganov/llama.cpp to 0827b2c1da299805288abbd556d869318f2b121e (#4520)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-31 21:43:29 +00:00
Ettore Di Giacinto
f16c7cef92 chore(model gallery): add q2.5-veltha-14b-0.5 (#4519)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-31 11:23:29 +01:00
Ettore Di Giacinto
e1dd78bcea chore(model gallery): add huatuogpt-o1-8b (#4518)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-31 11:23:07 +01:00
dependabot[bot]
25acb0cbbc chore(deps): Bump docs/themes/hugo-theme-relearn from ec88e24 to d25f856 (#4515)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `ec88e24` to `d25f856`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](ec88e24f46...d25f856477)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-31 11:18:51 +01:00
LocalAI [bot]
7674c80bb6 chore: ⬆️ Update ggerganov/llama.cpp to 716bd6dec3e044e5c325386b5b0483392b24cefe (#4516)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-31 11:18:33 +01:00
Ettore Di Giacinto
e044970a5b chore(model gallery): add qwen2.5-32b-rp-ink (#4517)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-31 11:17:29 +01:00
LocalAI [bot]
639526d207 chore: ⬆️ Update leejet/stable-diffusion.cpp to dcf91f9e0f2cbf9da472ee2a556751ed4bab2d2a (#4509)
⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-30 21:44:53 +00:00
dependabot[bot]
998ff9fa22 chore(deps): Bump gradio from 3.48.0 to 5.9.1 in /backend/python/openvoice (#4514)
chore(deps): Bump gradio in /backend/python/openvoice

Bumps [gradio](https://github.com/gradio-app/gradio) from 3.48.0 to 5.9.1.
- [Release notes](https://github.com/gradio-app/gradio/releases)
- [Changelog](https://github.com/gradio-app/gradio/blob/main/CHANGELOG.md)
- [Commits](https://github.com/gradio-app/gradio/compare/gradio@3.48.0...gradio@5.9.1)

---
updated-dependencies:
- dependency-name: gradio
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-30 20:05:45 +00:00
LocalAI [bot]
7122c7472e chore: ⬆️ Update ggerganov/llama.cpp to a813badbbdf0d38705f249df7a0c99af5cdee678 (#4512)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-29 21:42:38 +00:00
LocalAI [bot]
671381267a chore: ⬆️ Update ggerganov/llama.cpp to f865ea149d71ef883e3780fced8a20a1464eccf4 (#4510)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-28 21:43:05 +00:00
Ettore Di Giacinto
d1762e098e chore(model gallery): add miscii-14b-1225 (#4508)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-28 10:56:59 +01:00
Ettore Di Giacinto
270d33504b chore(model gallery): add miscii-14b-1028 (#4507)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-28 10:54:47 +01:00
Ettore Di Giacinto
9b0983d027 chore(model gallery): add control-nanuq-8b (#4506)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-28 10:49:53 +01:00
Ettore Di Giacinto
afd0af987d Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-27 15:17:02 +01:00
Ettore Di Giacinto
58524d40c9 Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-27 15:13:06 +01:00
Ettore Di Giacinto
2a7222c6aa chore(model gallery): add falcon3-7b-instruct-abliterated (#4504)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-27 11:29:34 +01:00
Ettore Di Giacinto
0093985e7c chore(model gallery): add falcon3-10b-instruct-abliterated (#4503)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-27 11:24:13 +01:00
Ettore Di Giacinto
7f51e2dddf chore(model gallery): add falcon3-3b-instruct-abliterated (#4502)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-27 11:20:10 +01:00
Ettore Di Giacinto
f3bbdef77d chore(model gallery): add falcon3-1b-instruct-abliterated (#4501)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-27 11:15:56 +01:00
LocalAI [bot]
9cbf168dc0 chore: ⬆️ Update ggerganov/llama.cpp to d79d8f39b4da6deca4aea8bf130c6034c482b320 (#4500)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-26 21:44:36 +00:00
Ettore Di Giacinto
9572f0577b chore(model gallery): add teleut-7b-rp (#4499)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-26 10:52:57 +01:00
Ettore Di Giacinto
1a14c7d45a chore(model gallery): add qvq-72b-preview (#4498)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-26 10:47:54 +01:00
LocalAI [bot]
5c29e0cd4d chore: ⬆️ Update ggerganov/llama.cpp to 9ba399dfa7f115effc63d48e6860a94c9faa31b2 (#4496)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-25 21:43:06 +00:00
Ettore Di Giacinto
1a74af1492 chore(model gallery): add llama-3.1-8b-open-sft (#4495)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-25 11:47:33 +01:00
Ettore Di Giacinto
8f6332ab23 chore(model gallery): add dans-personalityengine-v1.1.0-12b (#4494)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-25 11:47:22 +01:00
Ettore Di Giacinto
816ae7a53a chore(model gallery): add fastllama-3.2-1b-instruct (#4493)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-25 11:47:11 +01:00
LocalAI [bot]
1d630e4185 chore(model-gallery): ⬆️ update checksum (#4492)
⬆️ Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-24 23:04:50 +01:00
LocalAI [bot]
bc8dd3ad14 chore: ⬆️ Update ggerganov/llama.cpp to 2cd43f4900ba0e34124fdcbf02a7f9df25a10a3d (#4491)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-24 21:44:11 +00:00
Ettore Di Giacinto
b969053701 chore(gallery): re-order
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-24 11:10:56 +01:00
Ettore Di Giacinto
60bf7c9dd7 chore(model gallery): add rombos-llm-70b-llama-3.3 (#4490)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-24 11:04:31 +01:00
Ettore Di Giacinto
d65c10cee7 chore(model gallery): add tqwendo-36b (#4489)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-24 11:04:19 +01:00
Ettore Di Giacinto
6c71698299 chore(model gallery): add l3.3-ms-evalebis-70b (#4488)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-24 10:59:36 +01:00
LocalAI [bot]
c7c275c7c8 chore(model-gallery): ⬆️ update checksum (#4487)
⬆️ Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-24 10:56:02 +01:00
LocalAI [bot]
d0adbee75d chore: ⬆️ Update ggerganov/llama.cpp to 32d6ee6385b3fc908b283f509b845f757a6e7206 (#4486)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-24 10:55:30 +01:00
dependabot[bot]
159a7f6df2 chore(deps): Bump docs/themes/hugo-theme-relearn from bd1f3d3 to ec88e24 (#4460)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `bd1f3d3` to `ec88e24`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](bd1f3d3432...ec88e24f46)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-23 22:15:38 +00:00
Ettore Di Giacinto
0eb2911aad chore(llava): update clip.patch (#4453)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-23 19:11:31 +01:00
Ettore Di Giacinto
cab9f88ca4 chore(docs): add nvidia l4t instructions (#4454)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-23 18:59:33 +01:00
Ettore Di Giacinto
a3b675b09e Delete .cirrus.yml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-23 18:31:50 +01:00
Ettore Di Giacinto
6477913e8f chore(ci): increase task timeout
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-23 16:43:32 +01:00
Ettore Di Giacinto
138cd97ce7 chore(ci): try to add CirrusCI to build arm64 images natively
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-23 15:37:57 +01:00
Ettore Di Giacinto
4dd9ac39b0 chore(ci): comment arm64 job until we find a native CI runner (#4452)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-23 12:34:39 +01:00
LocalAI [bot]
23499ddc8a chore: ⬆️ Update ggerganov/llama.cpp to ebdee9478ca7ba65497b9b96f7457698c6ee5115 (#4451)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-22 22:56:41 +00:00
Ettore Di Giacinto
8864156300 chore(nvidia-l4t): add l4t arm64 images (#4449)
chore(nvidia-l4t): add nvidia-l4t arm64 images

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-22 21:29:33 +01:00
Ettore Di Giacinto
478014ca18 feat(Dockerfile): allow to skip driver installation (#4447)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-22 21:28:38 +01:00
Ettore Di Giacinto
d45477b003 chore(model gallery): add llama-3.3-70b-instruct-ablated (#4448)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-22 08:38:47 +01:00
Ettore Di Giacinto
396fb88e33 chore(model gallery): add anubis-70b-v1 (#4446)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-22 08:33:35 +01:00
LocalAI [bot]
a429ec1b3f chore: ⬆️ Update ggerganov/llama.cpp to 5cd85b5e008de2ec398d6596e240187d627561e3 (#4445)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-21 21:44:44 +00:00
Ettore Di Giacinto
5b5fb9c22a chore(model gallery): add orca_mini_v8_1_70b (#4444)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-21 10:39:58 +01:00
LocalAI [bot]
801a87c3a6 chore: ⬆️ Update ggerganov/llama.cpp to eb5c3dc64bd967f2e23c87d9dec195f45468de60 (#4442)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-20 21:48:03 +00:00
Ettore Di Giacinto
badbd212f7 chore(model gallery): add tq2.5-14b-neon-v1 (#4441)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-20 16:11:16 +01:00
Ettore Di Giacinto
c4bbecc4d6 chore(model gallery): add tq2.5-14b-aletheia-v1 (#4440)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-20 16:08:23 +01:00
Ettore Di Giacinto
8a08e9ec67 fix(openvoice): pin numpy before installing torch (#4439)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-20 10:34:23 +01:00
LocalAI [bot]
61e486dbf5 chore: ⬆️ Update ggerganov/llama.cpp to d408bb9268a988c5a60a5746d3a6430386e7604d (#4437)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-19 23:03:47 +00:00
Ettore Di Giacinto
f2f387e1dd fix(openvoice): do not pin numpy (#4438)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-19 21:30:43 +01:00
Ettore Di Giacinto
3be9a08fc9 fix(deps): pin openvoice pytorch/torchaudio (#4436)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-19 18:24:33 +01:00
Ettore Di Giacinto
b325807c60 fix(intel): pin torch and intel-extensions (#4435)
* fix(intel): pin torch version

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(intel): pin intel packages version

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-19 15:39:32 +01:00
jtwolfe
ae9855a39e chore(docs): patch p2p detail in env and docs (#4434)
* Update distributed_inferencing.md

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

* Update .env

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

* Update distributed_inferencing.md

whoops

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

---------

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-19 15:19:31 +01:00
LocalAI [bot]
9ac62b589f chore: ⬆️ Update ggerganov/llama.cpp to cd920d0ac38ec243605a5a57c50941140a193f9e (#4433)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-19 12:15:30 +00:00
Ettore Di Giacinto
d12660a286 chore(model gallery): add llama-chat-summary-3.2-3b (#4432)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-19 09:56:19 +01:00
LocalAI [bot]
3d3bd2d10f chore: ⬆️ Update ggerganov/llama.cpp to 0bf2d10c5514ff61b99897a4a5054f846e384e1e (#4429)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-19 09:53:49 +01:00
Ettore Di Giacinto
b656d10556 chore(model gallery): add llama-song-stream-3b-instruct (#4431)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-19 09:48:33 +01:00
Ettore Di Giacinto
8c67f38ef6 chore(model gallery): add falcon3-10b-instruct (#4426)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-18 10:36:41 +01:00
Ettore Di Giacinto
4623728cd7 chore(model gallery): add qwen2-vl-72b-instruct (#4425)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-18 10:35:49 +01:00
Ettore Di Giacinto
5f804aa6e8 chore(model gallery): add falcon3-3b-instruct (#4424)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-18 10:32:31 +01:00
Ettore Di Giacinto
f52c6e3a31 chore(model gallery): add falcon3-1b-instruct (#4423)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-18 10:12:06 +01:00
Ettore Di Giacinto
0b4bb7a562 chore(model gallery): add llama-openreviewer-8b (#4422)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-18 09:49:48 +01:00
mintyleaf
2bc4b56a79 feat: stream tokens usage (#4415)
* Use pb.Reply instead of []byte (obtained via Reply.GetMessage()) in the llama gRPC backend, so the proper usage data reaches streaming replies at the last [DONE] frame (illustrated after this entry)

* Fix 'hang' on empty message from the start

It seems the empty-message marker trick was unnecessary

---------

Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-18 09:48:50 +01:00
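An illustrative sketch of the idea in the commit above; the `Reply` type stands in for the actual protobuf message and is an assumption, not LocalAI's real definition:

```go
// Passing the whole reply (rather than just its message bytes) through the
// streaming path preserves the usage counters carried by the final frame.
package main

import "fmt"

// Reply stands in for the llama backend's streaming reply: text plus
// token-usage counters that are only populated on the last frame.
type Reply struct {
	Message          []byte
	PromptTokens     int32
	CompletionTokens int32
}

func main() {
	frames := []Reply{
		{Message: []byte("Hello")},
		{Message: []byte(", world")},
		{PromptTokens: 12, CompletionTokens: 7}, // final [DONE]-style frame
	}

	// Before: the callback received only reply.GetMessage() ([]byte), so the
	// usage fields of the last frame were discarded. After: it receives the
	// Reply itself and can surface usage when the stream finishes.
	for i, r := range frames {
		fmt.Printf("chunk %d: %q\n", i, r.Message)
		if i == len(frames)-1 {
			fmt.Printf("usage: prompt=%d completion=%d\n", r.PromptTokens, r.CompletionTokens)
		}
	}
}
```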
LocalAI [bot]
fc920cc58a chore: ⬆️ Update ggerganov/llama.cpp to 081b29bd2a3d91e7772e3910ce223dd63b8d7d26 (#4421)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-17 22:15:14 +00:00
Ettore Di Giacinto
fdb560b8e5 chore(model gallery): add qwq-lcot-7b-instruct (#4419)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-17 10:10:37 +01:00
Ettore Di Giacinto
708cba0c1b chore(llama.cpp): bump, drop penalize_nl (#4418)
deps(llama.cpp): bump, drop penalize_nl

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-17 00:47:52 +01:00
Ettore Di Giacinto
24abf568cb chore(tests): stabilize tts test (#4417)
chore(tests): stabilize test

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-17 00:46:48 +01:00
Ettore Di Giacinto
7ca0e2d925 fix(python): remove pin to setuptools, pin python version (#4395)
fix(setuptools): remove pin

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-16 10:55:02 +01:00
Ettore Di Giacinto
037e8030bf chore(model gallery): add qwen2-7b-multilingual-rp (#4394)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-16 09:48:33 +01:00
Ettore Di Giacinto
472d11f884 chore(model gallery): add marco-o1-uncensored (#4393)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-16 09:48:23 +01:00
Ettore Di Giacinto
b40d5d12b7 chore(model gallery): add naturallm-7b-instruct (#4392)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-16 09:47:49 +01:00
LocalAI [bot]
6938618e30 chore: ⬆️ Update ggerganov/llama.cpp to a0974156f334acf8af5858d7ede5ab7d7490d415 (#4391)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-15 22:01:44 +00:00
Ettore Di Giacinto
5d9c530eaa fix(gallery): disable default embeddings
Do not always enable embeddings on llama32, but let specific models settings

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-15 18:43:39 +01:00
Ettore Di Giacinto
9429a53db7 chore(model gallery): add neumind-math-7b-instruct (#4388)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-15 10:07:56 +01:00
Ettore Di Giacinto
1d6d301370 chore(model gallery): add fusechat-llama-3.1-8b-instruct (#4387)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-15 10:07:42 +01:00
Ettore Di Giacinto
8f2be82667 chore(model gallery): add fusechat-llama-3.2-3b-instruct (#4386)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-15 10:07:30 +01:00
LocalAI [bot]
cca911f3e5 chore: ⬆️ Update ggerganov/llama.cpp to e52aba537a34d51a65cddec6bc6dafc9031edc63 (#4385)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-15 09:59:20 +01:00
Jason Godsey
e37bbbaacc fix: correct gallery/index.yaml (#4384)
Update index.yaml

DBG YAML errors: line 2025: cannot unmarshal !!str `-https:...` into []string

Signed-off-by: Jason Godsey <godsey@users.noreply.github.com>
2024-12-14 21:25:51 +01:00
Ettore Di Giacinto
59cbf38b4b fix(gallery): correct syntax typo
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-14 21:21:27 +01:00
Ettore Di Giacinto
432c31d904 chore(model gallery): add chronos-gold-12b-1.0 (#4381)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-14 11:27:25 +01:00
Ettore Di Giacinto
af33483687 chore(model gallery): add fusechat-qwen-2.5-7b-instruct (#4380)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-14 11:27:11 +01:00
Ettore Di Giacinto
5051074845 chore(model gallery): add fusechat-gemma-2-9b-instruct (#4379)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-14 11:26:40 +01:00
Ettore Di Giacinto
fc4a714992 feat(llama.cpp): bump and adapt to upstream changes (#4378)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-14 00:30:52 +01:00
Ettore Di Giacinto
0429e00746 chore(model gallery): add hermes-3-llama-3.2-3b (#4376)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-13 09:51:26 +01:00
Ettore Di Giacinto
73f1f25b9a chore(model gallery): add evathene-v1.3 (#4375)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-13 09:51:13 +01:00
Ettore Di Giacinto
044570fa85 chore(model gallery): add l3.3-ms-evayale-70b (#4374)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-13 09:50:41 +01:00
LocalAI [bot]
37527420de chore: ⬆️ Update ggerganov/llama.cpp to 274ec65af6e54039eb95cb44904af5c945dca1fa (#4372)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-12 21:44:54 +00:00
Ettore Di Giacinto
1854b8c612 chore(model gallery): add l3.3-70b-euryale-v2.3 (#4371)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-12 12:22:48 +01:00
Ettore Di Giacinto
b8824f2ad9 chore(model gallery): add deepthought-8b-llama-v0.01-alpha (#4370)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-12 12:07:57 +01:00
Ettore Di Giacinto
3ab83e91df chore(model gallery): add 72b-qwen2.5-kunou-v1 (#4369)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-12 12:07:41 +01:00
LocalAI [bot]
f2cb261797 chore: ⬆️ Update ggerganov/llama.cpp to 235f6e14bf0ed0211c51aeff14139038ae1000aa (#4366)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-12 09:23:36 +01:00
Ettore Di Giacinto
c85f46a71d chore(model gallery): add sailor2-20b-chat (#4365)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-11 10:55:04 +01:00
Ettore Di Giacinto
75b283d83c chore(model gallery): add sailor2-8b-chat (#4364)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-11 10:51:39 +01:00
Ettore Di Giacinto
1918efdfdd chore(model gallery): add sailor2-1b-chat (#4363)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-11 10:32:18 +01:00
LocalAI [bot]
ec239a0cd0 docs: ⬆️ update docs version mudler/LocalAI (#4359)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-11 10:04:16 +01:00
LocalAI [bot]
b74a936178 chore: ⬆️ Update ggerganov/llama.cpp to dafae66cc242eb766797194d3c85c5e502625623 (#4360)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-10 21:45:42 +00:00
Ettore Di Giacinto
de1ddb8ba6 chore(model gallery): add b-nimita-l3-8b-v0.02 (#4357)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-10 09:42:47 +01:00
Ettore Di Giacinto
272763f625 chore(model gallery): add intellect-1-instruct (#4356)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-10 09:42:37 +01:00
Ettore Di Giacinto
3aff87a5cf chore(model gallery): add qwen2.5-math-14b-instruct (#4355)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-10 09:42:24 +01:00
LocalAI [bot]
885118e863 chore: ⬆️ Update ggerganov/llama.cpp to 26a8406ba9198eb6fdd8329fa717555b4f77f05f (#4353)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-10 09:10:58 +01:00
dependabot[bot]
a03a9b9e51 chore(deps): Bump docs/themes/hugo-theme-relearn from be85052 to bd1f3d3 (#4348)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `be85052` to `bd1f3d3`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](be85052efe...bd1f3d3432)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-09 20:09:26 +00:00
Ettore Di Giacinto
f45d6c746a chore(model gallery): add tulu-3.1-8b-supernova-smart (#4347)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-09 15:58:29 +01:00
Ettore Di Giacinto
5eceb5f67c chore(model gallery): add impish_mind_8b (#4344)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-09 10:24:30 +01:00
Ettore Di Giacinto
a9c0dd3a1e chore(model gallery): add qwen2.5-7b-homeranvita-nerdmix (#4343)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-09 10:24:15 +01:00
LocalAI [bot]
fb17e737f0 docs: ⬆️ update docs version mudler/LocalAI (#4341)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-09 09:19:25 +01:00
LocalAI [bot]
b5a21202ed chore: ⬆️ Update ggerganov/llama.cpp to e52522b8694ae73abf12feb18d29168674aa1c1b (#4342)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-08 22:54:06 +00:00
Ettore Di Giacinto
e147f1bd3e chore(model gallery): add bio-medical-llama-3-8b (#4339)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-08 18:43:26 +01:00
Ettore Di Giacinto
61839efed2 chore(model gallery): add virtuoso-small (#4338)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-08 18:01:25 +01:00
Ettore Di Giacinto
a0fe050055 chore(model gallery): add mn-chunky-lotus-12b (#4337)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-08 18:01:16 +01:00
Ettore Di Giacinto
f943c4b803 Revert "feat: include tokens usage for streamed output" (#4336)
Revert "feat: include tokens usage for streamed output (#4282)"

This reverts commit 0d6c3a7d57.
2024-12-08 17:53:36 +01:00
Ettore Di Giacinto
cea5a0ea42 feat(template): read jinja templates from gguf files (#4332)
* Read jinja templates as fallback

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Move templating out of model loader

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Test TemplateMessages

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Set role and content from transformers

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Tests: be more flexible

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* More jinja

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Small refactoring and adaptations

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-08 13:50:33 +01:00
LocalAI [bot]
f5e1527a5a chore: ⬆️ Update ggerganov/llama.cpp to 3573fa8e7b7f0865638b52b4e9b4d2006f0558a2 (#4335)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-07 21:51:45 +00:00
Ettore Di Giacinto
7184ca546f chore(model gallery): add llama-3.3-70b-instruct (#4333)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-07 10:39:20 +01:00
LocalAI [bot]
5592f5e820 chore: ⬆️ Update ggerganov/llama.cpp to c5ede3849fc021174862f9c0bf8273808d8f0d39 (#4330)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-06 21:46:51 +00:00
Ettore Di Giacinto
d4c1746c7d feat(llama.cpp): expose cache_type_k and cache_type_v for quant of kv cache (#4329)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-06 10:23:59 +01:00
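A hedged example of what this exposes in a model's YAML configuration: the `cache_type_k`/`cache_type_v` field names come from this change, while the model name and quantization values below are illustrative assumptions:

```yaml
# Sketch of a model config enabling a quantized KV cache (values illustrative).
name: my-model
parameters:
  model: my-model.gguf
cache_type_k: q8_0   # quantization type for the K cache
cache_type_v: q8_0   # quantization type for the V cache
```

Quantizing the KV cache trades a small amount of accuracy for a substantially smaller memory footprint on long contexts.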
LocalAI [bot]
88737e1d76 chore: ⬆️ Update ggerganov/llama.cpp to c9c6e01daedac542b174c235872569fce5385982 (#4328)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-06 09:15:21 +01:00
LocalAI [bot]
ba225f660b docs: ⬆️ update docs version mudler/LocalAI (#4327)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-05 21:54:00 +00:00
Ettore Di Giacinto
3127cd1352 chore(docs): update available backends (#4325)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-05 16:57:56 +01:00
PetrFlegr
b90d78d9f6 Updated links of yamls (#4324)
Updated links

Links to deployment*.yaml were changed

Signed-off-by: PetrFlegr <ptrflegr@gmail.com>
2024-12-05 16:06:51 +01:00
Ettore Di Giacinto
b86a3e4fa6 chore(model gallery): add math-iio-7b-instruct (#4323)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-05 10:05:35 +01:00
Ettore Di Giacinto
be907d993f chore(model gallery): add loki-v2.6-8b-1024k (#4321)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-05 10:02:02 +01:00
Ettore Di Giacinto
ab0f8648a3 chore(model gallery): add rp-naughty-v1.0c-8b (#4322)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-05 10:01:49 +01:00
LocalAI [bot]
c226149503 chore: ⬆️ Update leejet/stable-diffusion.cpp to 9578fdcc4632dc3de5565f28e2fb16b7c18f8d48 (#4320)
⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-05 09:09:11 +01:00
LocalAI [bot]
4a079f893c chore: ⬆️ Update ggerganov/llama.cpp to 59f4db10883a4f3e855cffbf2c3ab68430e95272 (#4319)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-04 22:19:35 +00:00
Ettore Di Giacinto
87b7648591 chore(ci): set auto-labeler for dependencies
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-04 18:35:54 +01:00
Ettore Di Giacinto
cf4f024420 Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-04 11:31:08 +01:00
Ettore Di Giacinto
3c0ac49d90 chore(model gallery): add bark-cpp-small (#4318)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-04 09:32:42 +01:00
Ettore Di Giacinto
4307ae5d52 chore(model gallery): add flux.1-dev-ggml (#4317)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-04 09:26:51 +01:00
Ettore Di Giacinto
50f71f73d7 chore(model gallery): add homercreativeanvita-mix-qw7b (#4316)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-04 09:17:23 +01:00
Ettore Di Giacinto
dc04a43868 chore(model gallery): add chatty-harry_v3.0 (#4315)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-04 09:15:47 +01:00
Ettore Di Giacinto
cc04b62d3a chore(model gallery): add cybercore-qwen-2.1-7b (#4314)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-04 09:15:35 +01:00
LocalAI [bot]
feb54e65c2 chore: ⬆️ Update ggerganov/llama.cpp to cc98896db858df7aa40d0e16a505883ef196a482 (#4312)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-12-03 23:53:11 +00:00
Ettore Di Giacinto
44a5dac312 feat(backend): add stablediffusion-ggml (#4289)
* feat(backend): add stablediffusion-ggml

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(ci): track stablediffusion-ggml

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Use default scheduler and sampler if not specified

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Move cfg scale out of diffusers block

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Make it working

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: set free_params_immediately to false to call the model in sequence

https://github.com/leejet/stable-diffusion.cpp/issues/366

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-03 22:41:22 +01:00
Ettore Di Giacinto
074b52bbfe chore(model gallery): add bggpt-gemma-2-2.6b-it-v1.0 (#4311)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-03 18:56:53 +01:00
Ettore Di Giacinto
236a60bab8 chore(model gallery): add qwen2.5-7b-homercreative-mix (#4310)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-03 18:56:03 +01:00
Ettore Di Giacinto
7b70f0543b chore(model gallery): add sparse-llama-3.1-8b-2of4 (#4309)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-03 18:55:49 +01:00
LocalAI [bot]
5f33962932 chore: ⬆️ Update ggerganov/llama.cpp to 8648c521010620c2daccfa1d26015c668ba2c717 (#4307)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-03 09:20:59 +01:00
dependabot[bot]
45b91d501e chore(deps): Bump docs/themes/hugo-theme-relearn from 28fce6b to be85052 (#4305)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `28fce6b` to `be85052`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](28fce6b04c...be85052efe)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-02 22:47:37 +00:00
Ettore Di Giacinto
e51792784a chore(deps): bump grpcio to 1.68.1 (#4301)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-02 19:13:26 +01:00
LocalAI [bot]
28594336e9 chore: ⬆️ Update ggerganov/llama.cpp to 5e1ed95583ca552a98d8528b73e1ff81249c2bf9 (#4299)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-12-01 21:44:59 +00:00
Ettore Di Giacinto
9c9359fc96 chore(model gallery): add teleut-7b (#4298)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-01 10:12:48 +01:00
Ettore Di Giacinto
bc5d1f255b chore(model gallery): add skywork-o1-open-llama-3.1-8b (#4297)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-01 10:12:35 +01:00
Ettore Di Giacinto
0fcefbc168 chore(model gallery): add volare-i1 (#4296)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-12-01 10:12:17 +01:00
LocalAI [bot]
9044b17e4d chore: ⬆️ Update ggerganov/llama.cpp to 0c39f44d70d058940fe2afe50cfc789e3e44d756 (#4295)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-11-30 21:46:07 +00:00
Ettore Di Giacinto
ad31daf03b chore(model gallery): add qwestion-24b (#4294)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-30 11:18:25 +01:00
Ettore Di Giacinto
1167487f5e chore(model gallery): add freyja-v4.95-maldv-7b-non-fiction-i1 (#4293)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-30 11:15:05 +01:00
Ettore Di Giacinto
61358e4d35 chore(model gallery): add q2.5-32b-slush-i1 (#4292)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-30 11:02:41 +01:00
LocalAI [bot]
2c8a87b1e4 chore: ⬆️ Update ggerganov/llama.cpp to 3a8e9af402f7893423bdab444aa16c5d9a2d429a (#4290)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-11-29 21:58:24 +00:00
LocalAI [bot]
55aad5f525 chore: ⬆️ Update ggerganov/llama.cpp to dc22344088a7ee81a1e4f096459b03a72f24ccdc (#4288)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-11-28 23:04:31 +00:00
Ettore Di Giacinto
58ff47de26 feat(bark-cpp): add new bark.cpp backend (#4287)
* feat(bark-cpp): add new bark.cpp backend

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* build on linux only for now

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* track bark.cpp in CI bumps

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Drop old entries from bumper

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* No need to test rwkv specifically, now part of llama.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-28 22:16:44 +01:00
mintyleaf
0d6c3a7d57 feat: include tokens usage for streamed output (#4282)
Use pb.Reply instead of []byte (obtained via Reply.GetMessage()) in the llama gRPC backend, so the proper usage data reaches streaming replies at the last [DONE] frame

Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-11-28 14:47:56 +01:00
Ettore Di Giacinto
e001fada6c chore(model gallery): add llama-smoltalk-3.2-1b-instruct (#4285)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-28 09:35:38 +01:00
Ettore Di Giacinto
f4547fcf8a chore(model gallery): add qwq-32b-preview (#4284)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-28 09:34:44 +01:00
Ettore Di Giacinto
7b75e9de2d fix(rwkv model): add stoptoken (#4283)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-28 09:34:35 +01:00
Ettore Di Giacinto
cbedf2f428 fix(llama.cpp): embed metal file into result binary for darwin (#4279)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-28 04:17:00 +00:00
LocalAI [bot]
0597f3b9e9 chore: ⬆️ Update ggerganov/llama.cpp to 3ad5451f3b75809e3033e4e577b9f60bcaf6676a (#4280)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-11-27 22:24:47 +00:00
Ettore Di Giacinto
5f688d7a8d chore(model): add eva-qwen2.5-72b-v0.2 to the gallery (#4278)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-27 19:02:05 +01:00
Ettore Di Giacinto
fa20628b3a chore(model): add llama-3.1-tulu-3-8b-sft to the gallery (#4277)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-27 19:01:54 +01:00
Ettore Di Giacinto
13bf048cfc chore(model): add llama-3.1-tulu-3-70b-dpo model config (#4276)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-27 18:44:53 +01:00
Ettore Di Giacinto
bdd6920910 chore(model): add q2.5-ms-mistoria-72b-v2 model config (#4275)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-27 16:56:55 +01:00
Ettore Di Giacinto
3c3050f68e feat(backends): Drop bert.cpp (#4272)
* feat(backends): Drop bert.cpp

use llama.cpp 3.2 as a drop-in replacement for bert.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(tests): make test more robust

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-27 16:34:28 +01:00
LocalAI [bot]
1688ba7f2a chore: ⬆️ Update ggerganov/llama.cpp to 30ec39832165627dd6ed98938df63adfc6e6a21a (#4273)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-11-27 15:19:15 +01:00
Ettore Di Giacinto
e8128a339a chore(scripts): handle summarization errors (#4271)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-26 14:51:55 +01:00
Ettore Di Giacinto
369110e6bf chore(model): add rwkv-6-world-7b to the gallery (#4270)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-26 14:51:37 +01:00
592 changed files with 271619 additions and 39183 deletions

View File

@@ -1,23 +0,0 @@
meta {
name: musicgen
type: http
seq: 1
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/sound-generation
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model_id": "facebook/musicgen-small",
"text": "Exciting 80s Newscast Interstitial",
"duration_seconds": 8
}
}

View File

@@ -1,17 +0,0 @@
meta {
name: backend monitor
type: http
seq: 4
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/monitor
body: json
auth: none
}
body:json {
{
"model": "{{DEFAULT_MODEL}}"
}
}

View File

@@ -1,21 +0,0 @@
meta {
name: backend-shutdown
type: http
seq: 3
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/shutdown
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}"
}
}

View File

@@ -1,5 +0,0 @@
{
"version": "1",
"name": "LocalAI Test Requests",
"type": "collection"
}

View File

@@ -1,6 +0,0 @@
vars {
HOST: localhost
PORT: 8080
DEFAULT_MODEL: gpt-3.5-turbo
PROTOCOL: http://
}

View File

@@ -1,11 +0,0 @@
meta {
name: get models list
type: http
seq: 2
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models
body: none
auth: none
}

View File

@@ -1,25 +0,0 @@
meta {
name: Generate image
type: http
seq: 1
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"prompt": "<positive prompt>|<negative prompt>",
"model": "model-name",
"step": 51,
"size": "1024x1024",
"image": ""
}
}

View File

@@ -1,24 +0,0 @@
meta {
name: -completions
type: http
seq: 4
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"prompt": "function downloadFile(string url, string outputPath) {",
"max_tokens": 256,
"temperature": 0.5
}
}

View File

@@ -1,23 +0,0 @@
meta {
name: -edits
type: http
seq: 5
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/edits
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"input": "What day of the wek is it?",
"instruction": "Fix the spelling mistakes"
}
}

View File

@@ -1,22 +0,0 @@
meta {
name: -embeddings
type: http
seq: 6
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/embeddings
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
}
}

View File

@@ -1,30 +0,0 @@
meta {
name: chat completion -simple- 1 message-
type: http
seq: 4
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"messages": [
{
"role": "user",
"content": "How could one use friction to cook an egg?"
}
],
"max_tokens": 256,
"temperature": 0.2,
"grammar": ""
}
}

View File

@@ -1,29 +0,0 @@
meta {
name: chat-completions -long-
type: http
seq: 5
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"messages": [{"role": "system", "content": "You are a helpful, intelligent pirate assistant. Always answer as helpfully as possible, while being a pirate and using pirate language. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."},
{"role": "user", "content": "How could one use electricity to cook an egg?"},
{"role": "assistant",
"content": "Shiver me timbers! Using electricity to cook an egg? Well, matey, I reckon that be a right curious idea! *adjusts spectacles* Now, I ain't sure if this be possible, but I'll do me best to help ye out.\n\nFirst things first, ye gotta understand that electricity be a powerful force, and it can't just cook an egg on its own. Ye see, electricity be like a mighty wind that can make things happen, but it needs somethin' to work with. So, if ye want to use electricity to cook an egg, ye gotta find a way to harness that power and make it do the cookin'.\n\nNow, I know what ye might be thinkin': \"How do I harness the power of electricity to cook an egg?\" Well, matey, there be a few ways to do it. One way be to use a special device called an \"electric frying pan.\" This be a pan that has a built-in heating element that gets hot when ye plug it into a wall socket. When the element gets hot, ye can crack an egg into the pan and watch as it cook"
},
{"role": "user", "content": "I don't have one of those, just a raw wire and plenty of power! How do we get it done?"}],
"max_tokens": 1024,
"temperature": 0.5
}
}

View File

@@ -1,25 +0,0 @@
meta {
name: chat-completions -stream-
type: http
seq: 6
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"messages": [{"role": "user", "content": "Explain how I can set sail on the ocean using only power generated by seagulls?"}],
"max_tokens": 256,
"temperature": 0.9,
"stream": true
}
}

View File

@@ -1,22 +0,0 @@
meta {
name: add model gallery
type: http
seq: 10
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"url": "file:///home/dave/projects/model-gallery/huggingface/TheBloke__CodeLlama-7B-Instruct-GGML.yaml",
"name": "test"
}
}

View File

@@ -1,21 +0,0 @@
meta {
name: delete model gallery
type: http
seq: 11
}
delete {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"name": "test"
}
}

View File

@@ -1,11 +0,0 @@
meta {
name: list MODELS in galleries
type: http
seq: 7
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/available
body: none
auth: none
}

View File

@@ -1,11 +0,0 @@
meta {
name: list model GALLERIES
type: http
seq: 8
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: none
auth: none
}

View File

@@ -1,11 +0,0 @@
meta {
name: model delete
type: http
seq: 7
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: none
auth: none
}

View File

@@ -1,21 +0,0 @@
meta {
name: model gallery apply -gist-
type: http
seq: 12
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"id": "TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q2_K.bin"
}
}


@@ -1,22 +0,0 @@
meta {
name: model gallery apply
type: http
seq: 9
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"id": "dave@TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q3_K_S.bin",
"name": "codellama7b"
}
}
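
Taken together, the gallery requests above form a small workflow: register a gallery, browse what it offers, then apply (install) a model from it. A rough curl sketch, assuming a local instance on localhost:8080; the gallery path and model id below are the illustrative values from this collection, not entries guaranteed to exist:

# 1. Register a gallery file under the name "test"
curl http://localhost:8080/models/galleries \
  -H "Content-Type: application/json" \
  -d '{"url": "file:///home/dave/projects/model-gallery/huggingface/TheBloke__CodeLlama-7B-Instruct-GGML.yaml", "name": "test"}'
# 2. List the models now available across registered galleries
curl http://localhost:8080/models/available
# 3. Apply one of them, optionally renaming it locally
curl http://localhost:8080/models/apply \
  -H "Content-Type: application/json" \
  -d '{"id": "dave@TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q3_K_S.bin", "name": "codellama7b"}'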

Binary file not shown.

@@ -1,16 +0,0 @@
meta {
name: transcribe
type: http
seq: 1
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/audio/transcriptions
body: multipartForm
auth: none
}
body:multipart-form {
file: @file(transcription/gb1.ogg)
model: whisper-1
}
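
The multipart request above is equivalent to a short curl invocation. A minimal sketch, assuming a local instance on localhost:8080 and that transcription/gb1.ogg exists relative to the working directory:

# Upload an audio file for transcription
curl http://localhost:8080/v1/audio/transcriptions \
  -F file=@transcription/gb1.ogg \
  -F model=whisper-1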


@@ -1,22 +0,0 @@
meta {
name: -tts
type: http
seq: 2
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
}
}
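
The TTS request above can be exercised the same way. A minimal sketch, assuming localhost:8080 and writing the synthesized audio to disk; the model name is a placeholder for {{DEFAULT_MODEL}}, and the output format depends on the configured backend:

curl http://localhost:8080/tts \
  -H "Content-Type: application/json" \
  -d '{"model": "<your-tts-model>", "input": "A STRANGE GAME."}' \
  -o output.audio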


@@ -1,23 +0,0 @@
meta {
name: musicgen
type: http
seq: 2
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"backend": "transformers-musicgen",
"model": "facebook/musicgen-small",
"input": "80s Synths playing Jazz"
}
}


@@ -6,8 +6,7 @@ services:
target: devcontainer
args:
- FFMPEG=true
- IMAGE_TYPE=extras
- GO_TAGS=stablediffusion p2p tts
- GO_TAGS=p2p tts
env_file:
- ../.env
ports:

.env

@@ -29,6 +29,9 @@
## Enable/Disable single backend (useful if only one GPU is available)
# LOCALAI_SINGLE_ACTIVE_BACKEND=true
# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
## Specify a build type. Available: cublas, openblas, clblas.
## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
@@ -38,12 +41,12 @@
## Uncomment and set to true to enable rebuilding from source
# REBUILD=true
## Enable go tags, available: stablediffusion, tts
## stablediffusion: image generation with stablediffusion
## Enable go tags, available: p2p, tts
## p2p: enable distributed inferencing
## tts: enables text-to-speech with go-piper
## (requires REBUILD=true)
#
# GO_TAGS=stablediffusion
# GO_TAGS=p2p
## Path where to store generated images
# LOCALAI_IMAGE_PATH=/tmp/generated/images
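
As the comments above note, GO_TAGS only takes effect together with REBUILD=true. A hedged sketch of wiring both through the container environment (image tag and port mapping are illustrative):

# Rebuild from source inside the container with p2p and tts enabled
docker run -e REBUILD=true -e GO_TAGS="p2p tts" \
  -p 8080:8080 localai/localai:latest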
@@ -73,7 +76,7 @@
### Define a list of GRPC Servers for llama-cpp workers to distribute the load
# https://github.com/ggerganov/llama.cpp/pull/6829
# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
# https://github.com/ggerganov/llama.cpp/blob/master/tools/rpc/README.md
# LLAMACPP_GRPC_SERVERS=""
### Enable to run parallel requests
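
The LLAMACPP_GRPC_SERVERS value above appears to take a comma-separated list of host:port pairs pointing at llama.cpp RPC workers (see the rpc README linked alongside it); an illustrative sketch, addresses assumed:

LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052"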
@@ -82,6 +85,15 @@
# Enable to allow p2p mode
# LOCALAI_P2P=true
# Enable to use federated mode
# LOCALAI_FEDERATED=true
# Enable to start federation server
# FEDERATED_SERVER=true
# Define to use federation token
# TOKEN=""
### Watchdog settings
###
# Enables watchdog to kill backends that are inactive for too much time
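
A hedged example of switching on the p2p/federation toggles above at container start (token value and image tag are illustrative):

# Join the p2p mesh in federated mode with a shared token
docker run -e LOCALAI_P2P=true -e LOCALAI_FEDERATED=true \
  -e TOKEN="<shared-token>" -p 8080:8080 localai/localai:latest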


@@ -29,10 +29,6 @@ updates:
schedule:
# Check for updates to GitHub Actions every weekday
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/autogptq"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/bark"
schedule:
@@ -65,10 +61,6 @@ updates:
directory: "/backend/python/openvoice"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/parler-tts"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/rerankers"
schedule:
@@ -81,14 +73,6 @@ updates:
directory: "/backend/python/transformers"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/transformers-musicgen"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/vall-e-x"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/vllm"
schedule:

.github/labeler.yml

@@ -1,6 +1,15 @@
enhancements:
enhancement:
- head-branch: ['^feature', 'feature']
dependencies:
- any:
- changed-files:
- any-glob-to-any-file: 'Makefile'
- changed-files:
- any-glob-to-any-file: '*.mod'
- changed-files:
- any-glob-to-any-file: '*.sum'
kind/documentation:
- any:
- changed-files:

.github/workflows/backend.yml (new file)

@@ -0,0 +1,511 @@
---
name: 'build backend container images'
on:
push:
branches:
- master
tags:
- '*'
#pull_request:
concurrency:
group: ci-backends-${{ github.head_ref || github.ref }}-${{ github.repository }}
cancel-in-progress: true
jobs:
backend-jobs:
uses: ./.github/workflows/backend_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
backend: ${{ matrix.backend }}
dockerfile: ${{ matrix.dockerfile }}
context: ${{ matrix.context }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
fail-fast: false
#max-parallel: ${{ github.event_name != 'pull_request' && 6 || 4 }}
matrix:
include:
# CUDA 11 builds
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-11-rerankers'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "rerankers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-11-vllm'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "vllm"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-11-transformers'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "transformers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-11-diffusers'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "diffusers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
# CUDA 11 additional backends
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-11-kokoro'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "kokoro"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-11-faster-whisper'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "faster-whisper"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-11-coqui'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "coqui"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-11-bark'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "bark"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-11-chatterbox'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "chatterbox"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
# CUDA 12 builds
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-12-rerankers'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "rerankers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-12-vllm'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "vllm"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-12-transformers'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "transformers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-12-diffusers'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "diffusers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
# CUDA 12 additional backends
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-12-kokoro'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "kokoro"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-12-faster-whisper'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "faster-whisper"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-12-coqui'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "coqui"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-12-bark'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "bark"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-12-chatterbox'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "chatterbox"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
# hipblas builds
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-rocm-hipblas-rerankers'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
backend: "rerankers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-rocm-hipblas-vllm'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
backend: "vllm"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-rocm-hipblas-transformers'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
backend: "transformers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-rocm-hipblas-diffusers'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
backend: "diffusers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
# ROCm additional backends
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-rocm-hipblas-kokoro'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
backend: "kokoro"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-rocm-hipblas-faster-whisper'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
backend: "faster-whisper"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-rocm-hipblas-coqui'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
backend: "coqui"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-rocm-hipblas-bark'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
backend: "bark"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
# sycl builds
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f32-rerankers'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "rerankers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'sycl_f16'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f16-rerankers'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "rerankers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f32-vllm'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "vllm"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'sycl_f16'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f16-vllm'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "vllm"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f32-transformers'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "transformers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'sycl_f16'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f16-transformers'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "transformers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f32-diffusers'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "diffusers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
# SYCL additional backends
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f32-kokoro'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "kokoro"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'sycl_f16'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f16-kokoro'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "kokoro"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f32-faster-whisper'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "faster-whisper"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'sycl_f16'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f16-faster-whisper'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "faster-whisper"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f32-coqui'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "coqui"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'sycl_f16'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f16-coqui'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "coqui"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f32-bark'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "bark"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'sycl_f16'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f16-bark'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
backend: "bark"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
# bark-cpp
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-bark-cpp'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "bark"
dockerfile: "./backend/Dockerfile.go"
context: "./"

.github/workflows/backend_build.yml (new file)

@@ -0,0 +1,235 @@
---
name: 'build python backend container images (reusable)'
on:
workflow_call:
inputs:
base-image:
description: 'Base image'
required: true
type: string
build-type:
description: 'Build type'
default: ''
type: string
cuda-major-version:
description: 'CUDA major version'
default: "12"
type: string
cuda-minor-version:
description: 'CUDA minor version'
default: "1"
type: string
platforms:
description: 'Platforms'
default: ''
type: string
tag-latest:
description: 'Tag latest'
default: ''
type: string
tag-suffix:
description: 'Tag suffix'
default: ''
type: string
runs-on:
description: 'Runs on'
required: true
default: ''
type: string
backend:
description: 'Backend to build'
required: true
type: string
context:
description: 'Build context'
required: true
type: string
dockerfile:
description: 'Build Dockerfile'
required: true
type: string
secrets:
dockerUsername:
required: true
dockerPassword:
required: true
quayUsername:
required: true
quayPassword:
required: true
jobs:
reusable_python_backend-build:
runs-on: ${{ inputs.runs-on }}
steps:
- name: Free Disk Space (Ubuntu)
if: inputs.runs-on == 'ubuntu-latest'
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Force Install GIT latest
run: |
sudo apt-get update \
&& sudo apt-get install -y software-properties-common \
&& sudo apt-get update \
&& sudo add-apt-repository -y ppa:git-core/ppa \
&& sudo apt-get update \
&& sudo apt-get install -y git
- name: Checkout
uses: actions/checkout@v4
- name: Release space from worker
if: inputs.runs-on == 'ubuntu-latest'
run: |
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
df -h
echo
sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
sudo apt-get remove --auto-remove android-sdk-platform-tools snapd || true
sudo apt-get purge --auto-remove android-sdk-platform-tools snapd || true
sudo rm -rf /usr/local/lib/android
sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
sudo rm -rf /usr/share/dotnet
sudo apt-get remove -y '^mono-.*' || true
sudo apt-get remove -y '^ghc-.*' || true
sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
sudo apt-get remove -y 'php.*' || true
sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
sudo apt-get remove -y '^google-.*' || true
sudo apt-get remove -y azure-cli || true
sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
sudo apt-get remove -y '^gfortran-.*' || true
sudo apt-get remove -y microsoft-edge-stable || true
sudo apt-get remove -y firefox || true
sudo apt-get remove -y powershell || true
sudo apt-get remove -y r-base-core || true
sudo apt-get autoremove -y
sudo apt-get clean
echo
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
sudo rm -rfv build || true
sudo rm -rf /usr/share/dotnet || true
sudo rm -rf /opt/ghc || true
sudo rm -rf "/usr/local/share/boost" || true
sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
df -h
- name: Docker meta
id: meta
if: github.event_name != 'pull_request'
uses: docker/metadata-action@v5
with:
images: |
quay.io/go-skynet/local-ai-backends
localai/localai-backends
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
type=sha
flavor: |
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.tag-suffix }},onlatest=true
- name: Docker meta for PR
id: meta_pull_request
if: github.event_name == 'pull_request'
uses: docker/metadata-action@v5
with:
images: |
quay.io/go-skynet/ci-tests
tags: |
type=ref,event=branch,suffix=${{ github.event.number }}-${{ inputs.backend }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
type=semver,pattern={{raw}},suffix=${{ github.event.number }}-${{ inputs.backend }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
type=sha,suffix=${{ github.event.number }}-${{ inputs.backend }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
flavor: |
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.tag-suffix }},onlatest=true
## End testing image
- name: Set up QEMU
uses: docker/setup-qemu-action@master
with:
platforms: all
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@master
- name: Login to DockerHub
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
username: ${{ secrets.dockerUsername }}
password: ${{ secrets.dockerPassword }}
- name: Login to Quay.io
# if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ secrets.quayUsername }}
password: ${{ secrets.quayPassword }}
- name: Build and push
uses: docker/build-push-action@v6
if: github.event_name != 'pull_request'
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BUILD_TYPE=${{ inputs.build-type }}
CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
BASE_IMAGE=${{ inputs.base-image }}
BACKEND=${{ inputs.backend }}
context: ./backend
file: ./backend/Dockerfile.python
cache-from: type=gha
platforms: ${{ inputs.platforms }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
- name: Build and push (PR)
uses: docker/build-push-action@v6
if: github.event_name == 'pull_request'
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BUILD_TYPE=${{ inputs.build-type }}
CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
BASE_IMAGE=${{ inputs.base-image }}
BACKEND=${{ inputs.backend }}
context: ./backend
file: ./backend/Dockerfile.python
cache-from: type=gha
platforms: ${{ inputs.platforms }}
push: true
tags: ${{ steps.meta_pull_request.outputs.tags }}
labels: ${{ steps.meta_pull_request.outputs.labels }}
- name: job summary
run: |
echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY


@@ -9,26 +9,17 @@ jobs:
fail-fast: false
matrix:
include:
- repository: "ggerganov/llama.cpp"
- repository: "ggml-org/llama.cpp"
variable: "CPPLLAMA_VERSION"
branch: "master"
- repository: "go-skynet/go-ggml-transformers.cpp"
variable: "GOGGMLTRANSFORMERS_VERSION"
branch: "master"
- repository: "donomii/go-rwkv.cpp"
variable: "RWKV_VERSION"
branch: "main"
- repository: "ggerganov/whisper.cpp"
- repository: "ggml-org/whisper.cpp"
variable: "WHISPER_CPP_VERSION"
branch: "master"
- repository: "go-skynet/go-bert.cpp"
variable: "BERT_VERSION"
branch: "master"
- repository: "go-skynet/bloomz.cpp"
variable: "BLOOMZ_VERSION"
- repository: "PABannier/bark.cpp"
variable: "BARKCPP_VERSION"
branch: "main"
- repository: "mudler/go-ggllm.cpp"
variable: "GOGGLLM_VERSION"
- repository: "leejet/stable-diffusion.cpp"
variable: "STABLEDIFFUSION_GGML_VERSION"
branch: "master"
- repository: "mudler/go-stable-diffusion"
variable: "STABLEDIFFUSION_VERSION"


@@ -5,7 +5,7 @@ on:
workflow_dispatch:
jobs:
checksum_check:
runs-on: arc-runner-set
runs-on: ubuntu-latest
steps:
- name: Force Install GIT latest
run: |


@@ -14,7 +14,7 @@ jobs:
steps:
- name: Dependabot metadata
id: metadata
uses: dependabot/fetch-metadata@v2.2.0
uses: dependabot/fetch-metadata@v2.4.0
with:
github-token: "${{ secrets.GITHUB_TOKEN }}"
skip-commit-verification: true


@@ -33,7 +33,7 @@ jobs:
run: |
CGO_ENABLED=0 make build-api
- name: rm
uses: appleboy/ssh-action@v1.2.0
uses: appleboy/ssh-action@v1.2.2
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -42,7 +42,7 @@ jobs:
script: |
sudo rm -rf local-ai/ || true
- name: copy file via ssh
uses: appleboy/scp-action@v0.1.7
uses: appleboy/scp-action@v1.0.0
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
rm: true
target: ./local-ai
- name: restarting
uses: appleboy/ssh-action@v1.2.0
uses: appleboy/ssh-action@v1.2.2
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}


@@ -2,9 +2,10 @@ name: 'generate and publish GRPC docker caches'
on:
workflow_dispatch:
push:
branches:
- master
schedule:
# daily at midnight
- cron: '0 0 * * *'
concurrency:
group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}


@@ -15,7 +15,7 @@ jobs:
strategy:
matrix:
include:
- base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
- base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
runs-on: 'ubuntu-latest'
platforms: 'linux/amd64'
runs-on: ${{matrix.runs-on}}


@@ -9,13 +9,12 @@ concurrency:
cancel-in-progress: true
jobs:
extras-image-build:
image-build:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
@@ -33,108 +32,42 @@ jobs:
# Pushing with all jobs in parallel
# eats the bandwidth of all the nodes
max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
fail-fast: false
matrix:
include:
# This is basically covered by the AIO test
# - build-type: ''
# platforms: 'linux/amd64'
# tag-latest: 'false'
# tag-suffix: '-ffmpeg'
# ffmpeg: 'true'
# image-type: 'extras'
# runs-on: 'arc-runner-set'
# base-image: "ubuntu:22.04"
# makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-ffmpeg'
tag-suffix: '-gpu-nvidia-cuda12-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
# - build-type: 'hipblas'
# platforms: 'linux/amd64'
# tag-latest: 'false'
# tag-suffix: '-hipblas'
# ffmpeg: 'false'
# image-type: 'extras'
# base-image: "rocm/dev-ubuntu-22.04:6.1"
# grpc-base-image: "ubuntu:22.04"
# runs-on: 'arc-runner-set'
# makeflags: "--jobs=3 --output-sync=target"
# - build-type: 'sycl_f16'
# platforms: 'linux/amd64'
# tag-latest: 'false'
# base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
# grpc-base-image: "ubuntu:22.04"
# tag-suffix: 'sycl-f16-ffmpeg'
# ffmpeg: 'true'
# image-type: 'extras'
# runs-on: 'arc-runner-set'
# makeflags: "--jobs=3 --output-sync=target"
# core-image-build:
# uses: ./.github/workflows/image_build.yml
# with:
# tag-latest: ${{ matrix.tag-latest }}
# tag-suffix: ${{ matrix.tag-suffix }}
# ffmpeg: ${{ matrix.ffmpeg }}
# image-type: ${{ matrix.image-type }}
# build-type: ${{ matrix.build-type }}
# cuda-major-version: ${{ matrix.cuda-major-version }}
# cuda-minor-version: ${{ matrix.cuda-minor-version }}
# platforms: ${{ matrix.platforms }}
# runs-on: ${{ matrix.runs-on }}
# base-image: ${{ matrix.base-image }}
# grpc-base-image: ${{ matrix.grpc-base-image }}
# makeflags: ${{ matrix.makeflags }}
# secrets:
# dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
# dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
# quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
# quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
# strategy:
# matrix:
# include:
# - build-type: ''
# platforms: 'linux/amd64'
# tag-latest: 'false'
# tag-suffix: '-ffmpeg-core'
# ffmpeg: 'true'
# image-type: 'core'
# runs-on: 'ubuntu-latest'
# base-image: "ubuntu:22.04"
# makeflags: "--jobs=4 --output-sync=target"
# - build-type: 'sycl_f16'
# platforms: 'linux/amd64'
# tag-latest: 'false'
# base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
# grpc-base-image: "ubuntu:22.04"
# tag-suffix: 'sycl-f16-ffmpeg-core'
# ffmpeg: 'true'
# image-type: 'core'
# runs-on: 'arc-runner-set'
# makeflags: "--jobs=3 --output-sync=target"
# - build-type: 'cublas'
# cuda-major-version: "12"
# cuda-minor-version: "0"
# platforms: 'linux/amd64'
# tag-latest: 'false'
# tag-suffix: '-cublas-cuda12-ffmpeg-core'
# ffmpeg: 'true'
# image-type: 'core'
# runs-on: 'ubuntu-latest'
# base-image: "ubuntu:22.04"
# makeflags: "--jobs=4 --output-sync=target"
# - build-type: 'vulkan'
# platforms: 'linux/amd64'
# tag-latest: 'false'
# tag-suffix: '-vulkan-ffmpeg-core'
# ffmpeg: 'true'
# image-type: 'core'
# runs-on: 'ubuntu-latest'
# base-image: "ubuntu:22.04"
# makeflags: "--jobs=4 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas'
ffmpeg: 'false'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: 'sycl-f16-ffmpeg'
ffmpeg: 'true'
runs-on: 'ubuntu-latest'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'vulkan'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-vulkan-ffmpeg-core'
ffmpeg: 'true'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"


@@ -19,7 +19,6 @@ jobs:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
@@ -29,238 +28,24 @@ jobs:
grpc-base-image: ${{ matrix.grpc-base-image }}
aio: ${{ matrix.aio }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
# Pushing with all jobs in parallel
# eats the bandwidth of all the nodes
max-parallel: 2
matrix:
include:
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-hipblas-ffmpeg'
tag-suffix: '-gpu-hipblas'
ffmpeg: 'true'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest'
makeflags: "--jobs=3 --output-sync=target"
aio: "-aio-gpu-hipblas"
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
latest-image: 'latest-gpu-hipblas'
latest-image-aio: 'latest-aio-gpu-hipblas'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas'
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-core'
ffmpeg: 'false'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
self-hosted-jobs:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
aio: ${{ matrix.aio }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
# Pushing with all jobs in parallel
# eats the bandwidth of all the nodes
max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
matrix:
include:
# Extra images
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: ''
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11'
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12'
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cublas-cuda11-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-11"
latest-image: 'latest-gpu-nvidia-cuda-11'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cublas-cuda12-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-12"
latest-image: 'latest-gpu-nvidia-cuda-12'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
makeflags: "--jobs=3 --output-sync=target"
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
image-type: 'extras'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
aio: "-aio-gpu-intel-f16"
latest-image: 'latest-gpu-intel-f16'
latest-image-aio: 'latest-aio-gpu-intel-f16'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
aio: "-aio-gpu-intel-f32"
latest-image: 'latest-gpu-intel-f32'
latest-image-aio: 'latest-aio-gpu-intel-f32'
makeflags: "--jobs=3 --output-sync=target"
# Core images
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-core'
ffmpeg: 'false'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-core'
ffmpeg: 'false'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
core-image-build:
uses: ./.github/workflows/image_build.yml
@@ -268,7 +53,6 @@ jobs:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
@@ -278,80 +62,113 @@ jobs:
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
skip-drivers: ${{ matrix.skip-drivers }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
#max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
matrix:
include:
- build-type: ''
platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto'
tag-suffix: '-ffmpeg-core'
tag-suffix: ''
ffmpeg: 'true'
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
runs-on: 'ubuntu-latest'
aio: "-aio-cpu"
latest-image: 'latest-cpu'
latest-image-aio: 'latest-aio-cpu'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-core'
ffmpeg: ''
image-type: 'core'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda11'
ffmpeg: 'true'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
aio: "-aio-gpu-nvidia-cuda-11"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-core'
ffmpeg: ''
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-ffmpeg-core'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda12'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
aio: "-aio-gpu-nvidia-cuda-12"
- build-type: 'vulkan'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-vulkan-ffmpeg-core'
latest-image: 'latest-vulkan-ffmpeg-core'
tag-latest: 'auto'
tag-suffix: '-vulkan'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
aio: "-aio-gpu-vulkan"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-gpu-intel-f16'
ffmpeg: 'true'
runs-on: 'ubuntu-latest'
makeflags: "--jobs=3 --output-sync=target"
aio: "-aio-gpu-intel-f16"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-gpu-intel-f32'
ffmpeg: 'true'
runs-on: 'ubuntu-latest'
makeflags: "--jobs=3 --output-sync=target"
aio: "-aio-gpu-intel-f32"
gh-runner:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
aio: ${{ matrix.aio }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
makeflags: ${{ matrix.makeflags }}
skip-drivers: ${{ matrix.skip-drivers }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
matrix:
include:
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64'
ffmpeg: 'true'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'true'
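
With tag-suffix '-nvidia-l4t-arm64' and tag-latest 'auto', the Jetson entry above should publish arm64 images along these lines (tag inferred from the matrix, not verified):

docker pull localai/localai:master-nvidia-l4t-arm64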


@@ -33,14 +33,6 @@ on:
description: 'Tag latest'
default: ''
type: string
latest-image:
description: 'Tag latest'
default: ''
type: string
latest-image-aio:
description: 'Tag latest'
default: ''
type: string
tag-suffix:
description: 'Tag suffix'
default: ''
@@ -49,9 +41,9 @@ on:
description: 'FFMPEG'
default: ''
type: string
image-type:
description: 'Image type'
default: ''
skip-drivers:
description: 'Skip drivers by default'
default: 'false'
type: string
runs-on:
description: 'Runs on'
@@ -81,6 +73,22 @@ jobs:
reusable_image-build:
runs-on: ${{ inputs.runs-on }}
steps:
- name: Free Disk Space (Ubuntu)
if: inputs.runs-on == 'ubuntu-latest'
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Force Install GIT latest
run: |
sudo apt-get update \
@@ -102,8 +110,8 @@ jobs:
df -h
echo
sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
sudo apt-get remove --auto-remove android-sdk-platform-tools || true
sudo apt-get purge --auto-remove android-sdk-platform-tools || true
sudo apt-get remove --auto-remove android-sdk-platform-tools snapd || true
sudo apt-get purge --auto-remove android-sdk-platform-tools snapd || true
sudo rm -rf /usr/local/lib/android
sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
sudo rm -rf /usr/share/dotnet
@@ -148,18 +156,18 @@ jobs:
type=sha
flavor: |
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.tag-suffix }}
suffix=${{ inputs.tag-suffix }},onlatest=true
- name: Docker meta for PR
id: meta_pull_request
if: github.event_name == 'pull_request'
uses: docker/metadata-action@v5
with:
images: |
ttl.sh/localai-ci-pr-${{ github.event.number }}
quay.io/go-skynet/ci-tests
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
type=sha
type=ref,event=branch,suffix=localai${{ github.event.number }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
type=semver,pattern={{raw}},suffix=localai${{ github.event.number }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
type=sha,suffix=localai${{ github.event.number }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
flavor: |
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.tag-suffix }}
@@ -175,7 +183,7 @@ jobs:
type=semver,pattern={{raw}}
flavor: |
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.aio }}
suffix=${{ inputs.aio }},onlatest=true
- name: Docker meta AIO (dockerhub)
if: inputs.aio != ''
@@ -188,7 +196,8 @@ jobs:
type=ref,event=branch
type=semver,pattern={{raw}}
flavor: |
suffix=${{ inputs.aio }}
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.aio }},onlatest=true
- name: Set up QEMU
uses: docker/setup-qemu-action@master
@@ -228,12 +237,12 @@ jobs:
CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
FFMPEG=${{ inputs.ffmpeg }}
IMAGE_TYPE=${{ inputs.image-type }}
BASE_IMAGE=${{ inputs.base-image }}
GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
GRPC_VERSION=v1.65.0
MAKEFLAGS=${{ inputs.makeflags }}
SKIP_DRIVERS=${{ inputs.skip-drivers }}
context: .
file: ./Dockerfile
cache-from: type=gha
@@ -256,23 +265,19 @@ jobs:
CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
FFMPEG=${{ inputs.ffmpeg }}
IMAGE_TYPE=${{ inputs.image-type }}
BASE_IMAGE=${{ inputs.base-image }}
GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
GRPC_VERSION=v1.65.0
MAKEFLAGS=${{ inputs.makeflags }}
SKIP_DRIVERS=${{ inputs.skip-drivers }}
context: .
file: ./Dockerfile
cache-from: type=gha
platforms: ${{ inputs.platforms }}
push: true
#push: true
tags: ${{ steps.meta_pull_request.outputs.tags }}
labels: ${{ steps.meta_pull_request.outputs.labels }}
- name: Testing image
if: github.event_name == 'pull_request'
run: |
echo "Image is available at ttl.sh/localai-ci-pr-${{ github.event.number }}:${{ steps.meta_pull_request.outputs.version }}" >> $GITHUB_STEP_SUMMARY
## End testing image
- name: Build and push AIO image
if: inputs.aio != ''
@@ -304,27 +309,6 @@ jobs:
tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
- name: Latest tag
# run this on branches, when it is a tag and there is a latest-image defined
if: github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag'
run: |
docker pull localai/localai:${{ steps.meta.outputs.version }}
docker tag localai/localai:${{ steps.meta.outputs.version }} localai/localai:${{ inputs.latest-image }}
docker push localai/localai:${{ inputs.latest-image }}
docker pull quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
docker tag quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image }}
docker push quay.io/go-skynet/local-ai:${{ inputs.latest-image }}
- name: Latest AIO tag
# run this on branches, when it is a tag and there is a latest-image defined
if: github.event_name != 'pull_request' && inputs.latest-image-aio != '' && github.ref_type == 'tag'
run: |
docker pull localai/localai:${{ steps.meta_aio_dockerhub.outputs.version }}
docker tag localai/localai:${{ steps.meta_aio_dockerhub.outputs.version }} localai/localai:${{ inputs.latest-image-aio }}
docker push localai/localai:${{ inputs.latest-image-aio }}
docker pull quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }}
docker tag quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
docker push quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
- name: job summary
run: |
echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY


@@ -8,7 +8,7 @@ jobs:
notify-discord:
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
env:
MODEL_NAME: hermes-2-theta-llama-3-8b
MODEL_NAME: gemma-3-12b-it
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
@@ -16,9 +16,9 @@ jobs:
fetch-depth: 0 # needed to checkout all branches for this Action to work
- uses: mudler/localai-github-action@v1
with:
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
# Check the PR diff using the current branch and the base branch of the PR
- uses: GrantBirki/git-diff-action@v2.7.0
- uses: GrantBirki/git-diff-action@v2.8.1
id: git-diff-action
with:
json_diff_file_output: diff.json
@@ -79,7 +79,7 @@ jobs:
args: ${{ steps.summarize.outputs.message }}
- name: Setup tmate session if fails
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180
@@ -87,7 +87,7 @@ jobs:
notify-twitter:
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
env:
MODEL_NAME: hermes-2-theta-llama-3-8b
MODEL_NAME: gemma-3-12b-it
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
@@ -99,7 +99,7 @@ jobs:
docker run -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready"; docker logs --tail 10 local-ai; sleep 2; done
# Check the PR diff using the current branch and the base branch of the PR
- uses: GrantBirki/git-diff-action@v2.7.0
- uses: GrantBirki/git-diff-action@v2.8.1
id: git-diff-action
with:
json_diff_file_output: diff.json
@@ -161,7 +161,7 @@ jobs:
TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
- name: Setup tmate session if fails
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180


@@ -14,7 +14,7 @@ jobs:
steps:
- uses: mudler/localai-github-action@v1
with:
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
- name: Summarize
id: summarize
run: |
@@ -60,4 +60,4 @@ jobs:
DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
uses: Ilshidur/action-discord@master
with:
args: ${{ steps.summarize.outputs.message }}
args: ${{ steps.summarize.outputs.message }}


@@ -36,6 +36,7 @@ jobs:
sudo apt-get update
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
make install-go-tools
- name: Install CUDA Dependencies
run: |
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
@@ -123,14 +124,70 @@ jobs:
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
build-linux:
runs-on: arc-runner-set
runs-on: ubuntu-latest
steps:
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Release space from worker
run: |
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
df -h
echo
sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
sudo apt-get remove --auto-remove android-sdk-platform-tools snapd || true
sudo apt-get purge --auto-remove android-sdk-platform-tools snapd || true
sudo rm -rf /usr/local/lib/android
sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
sudo rm -rf /usr/share/dotnet
sudo apt-get remove -y '^mono-.*' || true
sudo apt-get remove -y '^ghc-.*' || true
sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
sudo apt-get remove -y 'php.*' || true
sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
sudo apt-get remove -y '^google-.*' || true
sudo apt-get remove -y azure-cli || true
sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
sudo apt-get remove -y '^gfortran-.*' || true
sudo apt-get remove -y microsoft-edge-stable || true
sudo apt-get remove -y firefox || true
sudo apt-get remove -y powershell || true
sudo apt-get remove -y r-base-core || true
sudo apt-get autoremove -y
sudo apt-get clean
echo
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
sudo rm -rfv build || true
sudo rm -rf /usr/share/dotnet || true
sudo rm -rf /opt/ghc || true
sudo rm -rf "/usr/local/share/boost" || true
sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
df -h
- name: Force Install GIT latest
run: |
sudo apt-get update \
@@ -151,6 +208,7 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
make install-go-tools
- name: Intel Dependencies
run: |
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
@@ -175,17 +233,12 @@ jobs:
sudo apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates curl libnuma-dev gnupg
curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add -
sudo apt update
wget https://repo.radeon.com/amdgpu-install/6.4.1/ubuntu/noble/amdgpu-install_6.4.60401-1_all.deb
sudo apt install ./amdgpu-install_6.4.60401-1_all.deb
sudo apt update
printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | sudo tee /etc/apt/sources.list.d/rocm.list
printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | sudo tee /etc/apt/sources.list.d/amdgpu.list
printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y \
hipblas-dev rocm-dev \
rocblas-dev
sudo amdgpu-install --usecase=rocm
sudo apt-get clean
sudo rm -rf /var/lib/apt/lists/*
@@ -232,45 +285,12 @@ jobs:
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
build-stablediffusion:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- uses: actions/setup-go@v5
with:
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache upx-ucl
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
- name: Build stablediffusion
run: |
export PATH=$PATH:$GOPATH/bin
make backend-assets/grpc/stablediffusion
mkdir -p release && cp backend-assets/grpc/stablediffusion release
env:
GO_TAGS: stablediffusion
- uses: actions/upload-artifact@v4
with:
name: stablediffusion
path: release/
- name: Release
uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
with:
files: |
release/*
build-macOS-x86_64:
runs-on: macos-13
@@ -286,8 +306,7 @@ jobs:
- name: Dependencies
run: |
brew install protobuf grpc
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
make install-go-tools
- name: Build
id: build
run: |
@@ -308,7 +327,7 @@ jobs:
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180
@@ -328,8 +347,7 @@ jobs:
- name: Dependencies
run: |
brew install protobuf grpc libomp llvm
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
make install-go-tools
- name: Build
id: build
run: |
@@ -350,7 +368,7 @@ jobs:
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180


@@ -18,7 +18,7 @@ jobs:
if: ${{ github.actor != 'dependabot[bot]' }}
- name: Run Gosec Security Scanner
if: ${{ github.actor != 'dependabot[bot]' }}
uses: securego/gosec@v2.21.4
uses: securego/gosec@v2.22.5
with:
# we let the report content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...'

24
.github/workflows/stalebot.yml vendored Normal file

@@ -0,0 +1,24 @@
name: 'Close stale issues and PRs'
permissions:
issues: write
pull-requests: write
on:
schedule:
- cron: '30 1 * * *'
jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9
with:
stale-issue-message: 'This issue is stale because it has been open 90 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
stale-pr-message: 'This PR is stale because it has been open 90 days with no activity. Remove stale label or comment or this will be closed in 10 days.'
close-issue-message: 'This issue was closed because it has been stalled for 5 days with no activity.'
close-pr-message: 'This PR was closed because it has been stalled for 10 days with no activity.'
days-before-issue-stale: 90
days-before-pr-stale: 90
days-before-issue-close: 5
days-before-pr-close: 10
exempt-issue-labels: 'roadmap'
exempt-pr-labels: 'roadmap'


@@ -14,6 +14,28 @@ concurrency:
cancel-in-progress: true
jobs:
# Requires CUDA
# tests-chatterbox-tts:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential ffmpeg
# # Install UV
# curl -LsSf https://astral.sh/uv/install.sh | sh
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev
# pip install --user --no-cache-dir grpcio-tools==1.64.1
# - name: Test chatterbox-tts
# run: |
# make --jobs=5 --output-sync=target -C backend/python/chatterbox
# make --jobs=5 --output-sync=target -C backend/python/chatterbox test
tests-transformers:
runs-on: ubuntu-latest
steps:
@@ -35,30 +57,6 @@ jobs:
run: |
make --jobs=5 --output-sync=target -C backend/python/transformers
make --jobs=5 --output-sync=target -C backend/python/transformers test
tests-sentencetransformers:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test sentencetransformers
run: |
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test
tests-rerankers:
runs-on: ubuntu-latest
steps:
@@ -102,78 +100,47 @@ jobs:
make --jobs=5 --output-sync=target -C backend/python/diffusers
make --jobs=5 --output-sync=target -C backend/python/diffusers test
tests-parler-tts:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
#tests-vllm:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install -y build-essential ffmpeg
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev
# # Install UV
# curl -LsSf https://astral.sh/uv/install.sh | sh
# pip install --user --no-cache-dir grpcio-tools==1.64.1
# - name: Test vllm backend
# run: |
# make --jobs=5 --output-sync=target -C backend/python/vllm
# make --jobs=5 --output-sync=target -C backend/python/vllm test
# tests-transformers-musicgen:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential ffmpeg
# # Install UV
# curl -LsSf https://astral.sh/uv/install.sh | sh
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev
# pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test parler-tts
run: |
make --jobs=5 --output-sync=target -C backend/python/parler-tts
make --jobs=5 --output-sync=target -C backend/python/parler-tts test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
tests-openvoice:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test openvoice
run: |
make --jobs=5 --output-sync=target -C backend/python/openvoice
make --jobs=5 --output-sync=target -C backend/python/openvoice test
tests-transformers-musicgen:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test transformers-musicgen
run: |
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
# - name: Test transformers-musicgen
# run: |
# make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
# make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
# tests-bark:
# runs-on: ubuntu-latest
@@ -260,26 +227,6 @@ jobs:
# run: |
# make --jobs=5 --output-sync=target -C backend/python/vllm
# make --jobs=5 --output-sync=target -C backend/python/vllm test
tests-vallex:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test vall-e-x
run: |
make --jobs=5 --output-sync=target -C backend/python/vall-e-x
make --jobs=5 --output-sync=target -C backend/python/vall-e-x test
tests-coqui:
runs-on: ubuntu-latest


@@ -71,7 +71,7 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
sudo apt-get install -y libgmock-dev
sudo apt-get install -y libgmock-dev clang
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
@@ -96,19 +96,17 @@ jobs:
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest
# The python3-grpc-tools package in 22.04 is too old
pip install --user grpcio-tools
pip install --user grpcio-tools==1.71.0 grpcio==1.71.0
sudo rm -rfv /usr/bin/conda || true
PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers
make -C backend/python/transformers
# Pre-build piper before we start tests in order to have shared libraries in place
make sources/go-piper && \
GO_TAGS="tts" make -C sources/go-piper piper.o && \
sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
# Pre-build stable diffusion before we install a newer version of abseil (not compatible with the ncnn-based stablediffusion)
PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/
env:
CUDA_VERSION: 12-4
- name: Cache grpc
@@ -130,10 +128,10 @@ jobs:
cd grpc && cd cmake/build && sudo make --jobs 5 install
- name: Test
run: |
PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180
@@ -186,10 +184,11 @@ jobs:
rm protoc.zip
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest
PATH="$PATH:$HOME/go/bin" make protogen-go
- name: Build images
run: |
docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
docker build --build-arg FFMPEG=true --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
- name: Test
run: |
@@ -197,7 +196,7 @@ jobs:
make run-e2e-aio
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180
@@ -224,7 +223,8 @@ jobs:
- name: Dependencies
run: |
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
pip install --user --no-cache-dir grpcio-tools
pip install --user --no-cache-dir grpcio-tools==1.71.0 grpcio==1.71.0
go install github.com/GeertJohan/go.rice/rice@latest
- name: Test
run: |
export C_INCLUDE_PATH=/usr/local/include
@@ -235,7 +235,7 @@ jobs:
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180


@@ -8,7 +8,7 @@ jobs:
steps:
- name: 'Checkout'
uses: actions/checkout@master
- name: 'Yamllint'
- name: 'Yamllint model gallery'
uses: karancode/yamllint-github-action@master
with:
yamllint_file_or_dir: 'gallery'
@@ -16,3 +16,11 @@ jobs:
yamllint_comment: true
env:
GITHUB_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: 'Yamllint Backend gallery'
uses: karancode/yamllint-github-action@master
with:
yamllint_file_or_dir: 'backend'
yamllint_strict: false
yamllint_comment: true
env:
GITHUB_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}

1
.gitignore vendored

@@ -2,6 +2,7 @@
/sources/
__pycache__/
*.a
*.o
get-sources
prepare-sources
/backend/cpp/llama/grpc-server

2
.vscode/launch.json vendored

@@ -26,7 +26,7 @@
"LOCALAI_P2P": "true",
"LOCALAI_FEDERATED": "true"
},
"buildFlags": ["-tags", "stablediffusion p2p tts", "-v"],
"buildFlags": ["-tags", "p2p tts", "-v"],
"envFile": "${workspaceFolder}/.env",
"cwd": "${workspaceRoot}"
}


@@ -1,126 +1,32 @@
ARG IMAGE_TYPE=extras
ARG BASE_IMAGE=ubuntu:22.04
ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
ARG INTEL_BASE_IMAGE=${BASE_IMAGE}
# The requirements-core target is common to all images. Nothing should be placed in it unless every single build will use it.
FROM ${BASE_IMAGE} AS requirements-core
USER root
ARG GO_VERSION=1.22.6
ARG CMAKE_VERSION=3.26.4
ARG CMAKE_FROM_SOURCE=false
ARG TARGETARCH
ARG TARGETVARIANT
FROM ${BASE_IMAGE} AS requirements
ENV DEBIAN_FRONTEND=noninteractive
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ccache \
ca-certificates \
curl libssl-dev \
git \
unzip upx-ucl && \
ca-certificates curl wget espeak-ng libgomp1 \
python3 python-is-python3 ffmpeg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install CMake (the version in 22.04 is too old)
RUN <<EOT bash
if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
else
apt-get update && \
apt-get install -y \
cmake && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
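# Hedged usage sketch (not in the original file): the source build above can be
# requested at image build time via the CMAKE_FROM_SOURCE build-arg; the image
# tag below is a placeholder:
#   docker build --build-arg CMAKE_FROM_SOURCE=true -t localai:dev .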
# Install Go
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
# Install grpc compilers
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
RUN update-ca-certificates
RUN test -n "$TARGETARCH" \
|| (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
# Use the variables in subsequent instructions
RUN echo "Target Architecture: $TARGETARCH"
RUN echo "Target Variant: $TARGETVARIANT"
# Cuda
ENV PATH=/usr/local/cuda/bin:${PATH}
# HipBLAS requirements
ENV PATH=/opt/rocm/bin:${PATH}
# OpenBLAS requirements and stable diffusion
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libopenblas-dev \
libopencv-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Set up OpenCV
RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
WORKDIR /build
###################################
###################################
# The requirements-extras target is for any builds with IMAGE_TYPE=extras. It should not be placed in this target unless every IMAGE_TYPE=extras build will use it
FROM requirements-core AS requirements-extras
# Install uv as a system package
RUN curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=/usr/bin sh
ENV PATH="/root/.cargo/bin:${PATH}"
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
RUN apt-get update && \
apt-get install -y --no-install-recommends \
espeak-ng \
espeak \
python3-pip \
python-is-python3 \
python3-dev llvm \
python3-venv && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
pip install --upgrade pip
# Install grpcio-tools (the version in 22.04 is too old)
RUN pip install --user grpcio-tools
###################################
###################################
# The requirements-drivers target is for BUILD_TYPE specific items. If you need to install something specific to CUDA, or specific to ROCM, it goes here.
# This target will be built on top of requirements-core or requirements-extras as determined by the IMAGE_TYPE build-arg
FROM requirements-${IMAGE_TYPE} AS requirements-drivers
FROM requirements AS requirements-drivers
ARG BUILD_TYPE
ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=0
ARG SKIP_DRIVERS=false
ARG TARGETARCH
ARG TARGETVARIANT
ENV BUILD_TYPE=${BUILD_TYPE}
# Vulkan requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "vulkan" ]; then
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils wget gpg-agent && \
@@ -136,7 +42,7 @@ EOT
# CuBLAS requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ]; then
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils
@@ -162,7 +68,7 @@ RUN <<EOT bash
EOT
# If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
libclblast-dev && \
@@ -170,7 +76,7 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
rm -rf /var/lib/apt/lists/* \
; fi
RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
hipblas-dev \
@@ -182,6 +88,83 @@ RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
ldconfig \
; fi
# Cuda
ENV PATH=/usr/local/cuda/bin:${PATH}
# HipBLAS requirements
ENV PATH=/opt/rocm/bin:${PATH}
###################################
###################################
# The build-requirements target collects the build toolchain common to all builder images. Nothing should be placed here unless every single builder image will use it.
FROM requirements-drivers AS build-requirements
ARG GO_VERSION=1.22.6
ARG CMAKE_VERSION=3.26.4
ARG CMAKE_FROM_SOURCE=false
ARG TARGETARCH
ARG TARGETVARIANT
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ccache \
ca-certificates espeak-ng \
curl libssl-dev \
git \
git-lfs \
unzip upx-ucl python3 python-is-python3 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install CMake (the version in 22.04 is too old)
RUN <<EOT bash
if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
else
apt-get update && \
apt-get install -y \
cmake && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# Install Go
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
# Install grpc compilers and rice
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af && \
go install github.com/GeertJohan/go.rice/rice@latest
COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
RUN update-ca-certificates
# OpenBLAS requirements and stable diffusion
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libopenblas-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN test -n "$TARGETARCH" \
|| (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
# Use the variables in subsequent instructions
RUN echo "Target Architecture: $TARGETARCH"
RUN echo "Target Variant: $TARGETVARIANT"
WORKDIR /build
###################################
###################################
@@ -248,13 +231,14 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
# The builder-base target has the arguments, variables, and copies shared between full builder images and the uncompiled devcontainer
FROM requirements-drivers AS builder-base
FROM build-requirements AS builder-base
ARG GO_TAGS="stablediffusion tts p2p"
ARG GO_TAGS="tts p2p"
ARG GRPC_BACKENDS
ARG MAKEFLAGS
ARG LD_FLAGS="-s -w"
ARG TARGETARCH
ARG TARGETVARIANT
ENV GRPC_BACKENDS=${GRPC_BACKENDS}
ENV GO_TAGS=${GO_TAGS}
ENV MAKEFLAGS=${MAKEFLAGS}
@@ -284,53 +268,51 @@ RUN <<EOT bash
fi
EOT
###################################
###################################
# This first portion of builder holds the layers specifically used to build backend-assets/grpc/stablediffusion
# In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
FROM builder-base AS builder-sd
# Compile backends first in a separate stage
FROM builder-base AS builder-backends
ARG TARGETARCH
ARG TARGETVARIANT
# stablediffusion does not tolerate a newer version of abseil, so copy over only enough elements to build it
COPY Makefile .
COPY go.mod .
COPY go.sum .
COPY backend/backend.proto ./backend/backend.proto
COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
COPY pkg/grpc ./pkg/grpc
COPY pkg/stablediffusion ./pkg/stablediffusion
RUN git init
RUN make sources/go-stable-diffusion
RUN touch prepare-sources
COPY --from=grpc /opt/grpc /usr/local
# Actually build the backend
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion
WORKDIR /build
###################################
###################################
COPY ./Makefile .
COPY ./backend ./backend
COPY ./go.mod .
COPY ./go.sum .
COPY ./.git ./.git
# Some of the Go backends use libs from the main src; we could further optimize the caching by building the CPP backends before this point
COPY ./pkg/grpc ./pkg/grpc
COPY ./pkg/utils ./pkg/utils
COPY ./pkg/langchain ./pkg/langchain
RUN ls -l ./
RUN make backend-assets
RUN make prepare
RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make grpcs; \
else \
make grpcs; \
fi
# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
# Adjustments to the build process should likely be made here.
FROM builder-sd AS builder
FROM builder-backends AS builder
# Install the pre-built GRPC
COPY --from=grpc /opt/grpc /usr/local
# Rebuild with defaults backends
WORKDIR /build
COPY . .
COPY .git .
RUN make prepare
## Build the binary
## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
## (both will use CUDA or hipblas for the actual computation)
RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space
## Otherwise just run the normal build
RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
else \
make build; \
fi
@@ -348,26 +330,13 @@ RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
FROM builder-base AS devcontainer
ARG FFMPEG
COPY --from=grpc /opt/grpc /usr/local
COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion /build/backend-assets/grpc/stablediffusion
COPY .devcontainer-scripts /.devcontainer-scripts
# Add FFmpeg
RUN if [ "${FFMPEG}" = "true" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
ffmpeg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
; fi
RUN apt-get update && \
apt-get install -y --no-install-recommends \
ssh less wget
ssh less
# For the devcontainer, leave apt functional in case additional devtools are needed at runtime.
RUN go install github.com/go-delve/delve/cmd/dlv@latest
@@ -381,44 +350,16 @@ RUN go install github.com/mikefarah/yq/v4@latest
# If you cannot find a more suitable place for an addition, this layer is a suitable place for it.
FROM requirements-drivers
ARG FFMPEG
ARG BUILD_TYPE
ARG TARGETARCH
ARG IMAGE_TYPE=extras
ARG EXTRA_BACKENDS
ARG MAKEFLAGS
ENV BUILD_TYPE=${BUILD_TYPE}
ENV REBUILD=false
ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
ENV MAKEFLAGS=${MAKEFLAGS}
ARG CUDA_MAJOR_VERSION=12
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
ENV NVIDIA_VISIBLE_DEVICES=all
# Add FFmpeg
RUN if [ "${FFMPEG}" = "true" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
ffmpeg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
; fi
WORKDIR /
WORKDIR /build
# we start fresh & re-copy all assets because `make build` does not clean up nicely after itself
# so when `entrypoint.sh` runs `make build` again (which it does by default), the build would fail
# see https://github.com/go-skynet/LocalAI/pull/658#discussion_r1241971626 and
# https://github.com/go-skynet/LocalAI/pull/434
COPY . .
COPY --from=builder /build/sources ./sources/
COPY --from=grpc /opt/grpc /usr/local
RUN make prepare-sources
COPY ./entrypoint.sh .
# Copy the binary
COPY --from=builder /build/local-ai ./
@@ -426,67 +367,13 @@ COPY --from=builder /build/local-ai ./
# Copy shared libraries for piper
COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
# do not let stablediffusion rebuild (requires an older version of absl)
COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
# Change the shell to bash so we can use [[ tests below
SHELL ["/bin/bash", "-c"]
# We try to strike a balance between individual layer size (as that affects total push time) and total image size
# Splitting the backends into more groups with fewer items results in a larger image, but a smaller size for the largest layer
# Splitting the backends into fewer groups with more items results in a smaller image, but a larger size for the largest layer
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/coqui \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "parler-tts" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/parler-tts \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/diffusers \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/transformers-musicgen \
; fi
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/vall-e-x \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/openvoice \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "sentencetransformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/sentencetransformers \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/exllama2 \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "transformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/transformers \
; fi
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/vllm \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/autogptq \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/bark \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/rerankers \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/mamba \
; fi
# Make sure the models directory exists
RUN mkdir -p /build/models
RUN mkdir -p /models /backends
# Define the health check command
HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
CMD curl -f ${HEALTHCHECK_ENDPOINT} || exit 1
VOLUME /build/models
VOLUME /models /backends
EXPOSE 8080
ENTRYPOINT [ "/build/entrypoint.sh" ]
ENTRYPOINT [ "/entrypoint.sh" ]


@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)
Copyright (c) 2023-2025 Ettore Di Giacinto (mudler@localai.io)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

353
Makefile

@@ -6,29 +6,26 @@ BINARY_NAME=local-ai
DETECT_LIBS?=true
# llama.cpp versions
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=47f931c8f9a26c072d71224bc8013cc66ea9e445
CPPLLAMA_VERSION?=72babea5dea56c8a8e8420ccf731b12a5cf37854
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
# bert.cpp version
BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
BERT_VERSION?=710044b124545415f555e4260d16b146c725a6e4
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
WHISPER_CPP_VERSION?=c88ffbf9baeaae8c2cc0a4f496618314bb2ee9e0
# go-piper version
PIPER_REPO?=https://github.com/mudler/go-piper
PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
# stablediffusion version
STABLEDIFFUSION_REPO?=https://github.com/mudler/go-stable-diffusion
STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f
# bark.cpp
BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
BARKCPP_VERSION?=v1.0.0
# tinydream version
TINYDREAM_REPO?=https://github.com/M0Rf30/go-tiny-dream
TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae
# ONEAPI variables for SYCL
export ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
ONNX_VERSION?=1.20.0
ONNX_ARCH?=x64
@@ -36,8 +33,12 @@ ONNX_OS?=linux
export BUILD_TYPE?=
export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
export CMAKE_ARGS?=
export CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
export WHISPER_CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
export BACKEND_LIBS?=
export WHISPER_DIR=$(abspath ./sources/whisper.cpp)
export WHISPER_INCLUDE_PATH=$(WHISPER_DIR)/include:$(WHISPER_DIR)/ggml/include
export WHISPER_LIBRARY_PATH=$(WHISPER_DIR)/build/src/:$(WHISPER_DIR)/build/ggml/src
CGO_LDFLAGS?=
CGO_LDFLAGS_WHISPER?=
@@ -87,6 +88,7 @@ endif
# IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF
WHISPER_CMAKE_ARGS+=-DGGML_NATIVE=OFF
endif
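# Hedged usage sketch: since the flag is only appended when NATIVE=false, a
# portable (non-native) binary can be requested from the command line with:
#   make NATIVE=false build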
# Detect if we are running on arm64
@@ -114,13 +116,31 @@ ifeq ($(OS),Darwin)
# disable metal if on Darwin and any other value is explicitly passed.
else ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DGGML_METAL=OFF
WHISPER_CMAKE_ARGS+=-DGGML_METAL=OFF
export GGML_NO_ACCELERATE=1
export GGML_NO_METAL=1
GO_LDFLAGS_WHISPER+=-lggml-blas
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
endif
ifeq ($(BUILD_TYPE),metal)
# -lcblas removed: it seems to always be listed as a duplicate flag.
CGO_LDFLAGS += -framework Accelerate
CGO_LDFLAGS_WHISPER+=-lggml-metal -lggml-blas
CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
CMAKE_ARGS+=-DGGML_OPENMP=OFF
WHISPER_CMAKE_ARGS+=-DGGML_METAL=ON
WHISPER_CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
WHISPER_CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_EXAMPLES=OFF
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_TESTS=OFF
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_SERVER=OFF
WHISPER_CMAKE_ARGS+=-DGGML_OPENMP=OFF
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-metal/:$(WHISPER_DIR)/build/ggml/src/ggml-blas
else
CGO_LDFLAGS_WHISPER+=-lggml-blas
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
endif
else
CGO_LDFLAGS_WHISPER+=-lgomp
@@ -132,21 +152,29 @@ ifeq ($(BUILD_TYPE),openblas)
endif
ifeq ($(BUILD_TYPE),cublas)
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) -L$(CUDA_LIBPATH)/stubs/ -lcuda
export GGML_CUDA=1
CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
CMAKE_ARGS+=-DGGML_CUDA=ON
WHISPER_CMAKE_ARGS+=-DGGML_CUDA=ON
CGO_LDFLAGS_WHISPER+=-lcufft -lggml-cuda
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-cuda/
endif
ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DGGML_VULKAN=1
WHISPER_CMAKE_ARGS+=-DGGML_VULKAN=1
CGO_LDFLAGS_WHISPER+=-lggml-vulkan -lvulkan
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-vulkan/
endif
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
export GGML_SYCL=1
CMAKE_ARGS+=-DGGML_SYCL=ON
endif
ifeq ($(BUILD_TYPE),sycl_f16)
export GGML_SYCL_F16=1
CMAKE_ARGS+=-DGGML_SYCL_F16=ON
endif
ifeq ($(BUILD_TYPE),hipblas)
@@ -155,10 +183,9 @@ ifeq ($(BUILD_TYPE),hipblas)
LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang
# llama-ggml has no hipblas support, so override it here.
export STABLE_BUILD_TYPE=
export GGML_HIP=1
GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102
AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
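# Hedged usage sketch: GPU_TARGETS is assigned with ?=, so a single-architecture
# ROCm build can override it on the command line (the target below is illustrative):
#   make BUILD_TYPE=hipblas GPU_TARGETS=gfx1100 build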
@@ -179,16 +206,6 @@ ifeq ($(STATIC),true)
LD_FLAGS+=-linkmode external -extldflags -static
endif
ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
# OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
endif
ifeq ($(findstring tinydream,$(GO_TAGS)),tinydream)
# OPTIONAL_TARGETS+=go-tiny-dream/libtinydream.a
OPTIONAL_GRPC+=backend-assets/grpc/tinydream
endif
ifeq ($(findstring tts,$(GO_TAGS)),tts)
# OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
# OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
@@ -198,14 +215,20 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
endif
ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
ifeq ($(ONNX_OS),linux)
ifeq ($(ONNX_ARCH),x64)
ALL_GRPC_BACKENDS+=backend-assets/grpc/stablediffusion-ggml
endif
endif
ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store
ALL_GRPC_BACKENDS+=backend-assets/grpc/silero-vad
ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)
@@ -228,31 +251,22 @@ endif
all: help
## BERT embeddings
sources/go-bert.cpp:
mkdir -p sources/go-bert.cpp
cd sources/go-bert.cpp && \
git init && \
git remote add origin $(BERT_REPO) && \
git fetch origin && \
git checkout $(BERT_VERSION) && \
## bark.cpp
sources/bark.cpp:
git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
cd sources/bark.cpp && \
git checkout $(BARKCPP_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-bert.cpp/libgobert.a: sources/go-bert.cpp
$(MAKE) -C sources/go-bert.cpp libgobert.a
sources/bark.cpp/build/libbark.a: sources/bark.cpp
cd sources/bark.cpp && \
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) .. && \
cmake --build . --config Release
## go-llama.cpp
sources/go-llama.cpp:
mkdir -p sources/go-llama.cpp
cd sources/go-llama.cpp && \
git init && \
git remote add origin $(GOLLAMA_REPO) && \
git fetch origin && \
git checkout $(GOLLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
backend/go/bark/libbark.a: sources/bark.cpp/build/libbark.a
$(MAKE) -C backend/go/bark libbark.a
## go-piper
sources/go-piper:
@@ -267,18 +281,19 @@ sources/go-piper:
sources/go-piper/libpiper_binding.a: sources/go-piper
$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
## stable diffusion
sources/go-stable-diffusion:
mkdir -p sources/go-stable-diffusion
cd sources/go-stable-diffusion && \
git init && \
git remote add origin $(STABLEDIFFUSION_REPO) && \
git fetch origin && \
git checkout $(STABLEDIFFUSION_VERSION) && \
## stablediffusion (ggml)
sources/stablediffusion-ggml.cpp:
git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \
cd sources/stablediffusion-ggml.cpp && \
git checkout $(STABLEDIFFUSION_GGML_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
$(MAKE) -C backend/go/image/stablediffusion-ggml build/libstable-diffusion.a
$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
$(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml
sources/onnxruntime:
mkdir -p sources/onnxruntime
@@ -294,19 +309,6 @@ else
mv backend-assets/lib/libonnxruntime.so.$(ONNX_VERSION) backend-assets/lib/libonnxruntime.so.1
endif
## tiny-dream
sources/go-tiny-dream:
mkdir -p sources/go-tiny-dream
cd sources/go-tiny-dream && \
git init && \
git remote add origin $(TINYDREAM_REPO) && \
git fetch origin && \
git checkout $(TINYDREAM_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream
$(MAKE) -C sources/go-tiny-dream libtinydream.a
## whisper
sources/whisper.cpp:
mkdir -p sources/whisper.cpp
@@ -317,28 +319,21 @@ sources/whisper.cpp:
git checkout $(WHISPER_CPP_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
sources/whisper.cpp/build/src/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && cmake $(WHISPER_CMAKE_ARGS) . -B ./build
cd sources/whisper.cpp/build && cmake --build . --config Release
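# For reference, a hedged manual equivalent of the CMake-based build above, run
# from the repository root (assumes the sources/whisper.cpp checkout exists and
# the default WHISPER_CMAKE_ARGS of -DBUILD_SHARED_LIBS=OFF):
#   cd sources/whisper.cpp && cmake -DBUILD_SHARED_LIBS=OFF . -B ./build
#   cd build && cmake --build . --config Release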
get-sources: sources/go-llama.cpp sources/go-piper sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
replace:
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert.cpp
$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
dropreplace:
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-bert.cpp
$(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
prepare-sources: get-sources replace
$(GOCMD) mod download
@@ -346,12 +341,8 @@ prepare-sources: get-sources replace
## GENERIC
rebuild: ## Rebuilds the project
$(GOCMD) clean -cache
$(MAKE) -C sources/go-llama.cpp clean
$(MAKE) -C sources/whisper.cpp clean
$(MAKE) -C sources/go-stable-diffusion clean
$(MAKE) -C sources/go-bert.cpp clean
$(MAKE) -C sources/go-piper clean
$(MAKE) -C sources/go-tiny-dream clean
$(MAKE) build
prepare: prepare-sources $(OPTIONAL_TARGETS)
@@ -364,7 +355,9 @@ clean: ## Remove build related file
rm -rf release/
rm -rf backend-assets/*
$(MAKE) -C backend/cpp/grpc clean
$(MAKE) -C backend/go/bark clean
$(MAKE) -C backend/cpp/llama clean
$(MAKE) -C backend/go/image/stablediffusion-ggml clean
rm -rf backend/cpp/llama-* || true
$(MAKE) dropreplace
$(MAKE) protogen-clean
@@ -378,8 +371,14 @@ clean-tests:
clean-dc: clean
cp -r /build/backend-assets /workspace/backend-assets
## Install Go tools
install-go-tools:
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install github.com/GeertJohan/go.rice/rice@latest
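# Hedged usage note: install-go-tools is also wired in as a prerequisite of
# `build` and `protogen-go` below, but it can be run on its own (assumes
# $(go env GOPATH)/bin is on PATH):
#   make install-go-tools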
## Build:
build: prepare backend-assets grpcs ## Build the project
build: prepare backend-assets grpcs install-go-tools ## Build the project
$(info ${GREEN}I local-ai build info:${RESET})
$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
@@ -389,7 +388,9 @@ ifneq ($(BACKEND_LIBS),)
$(MAKE) backend-assets/lib
cp -f $(BACKEND_LIBS) backend-assets/lib/
endif
rm -rf $(BINARY_NAME) || true
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
rice append --exec $(BINARY_NAME)
build-minimal:
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build
@@ -451,7 +452,7 @@ run: prepare ## run local-ai
test-models/testmodel.ggml:
mkdir test-models
mkdir test-dir
wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
wget -q https://huggingface.co/mradermacher/gpt2-alpaca-gpt4-GGUF/resolve/main/gpt2-alpaca-gpt4.Q4_K_M.gguf -O test-models/testmodel.ggml
wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -461,13 +462,13 @@ prepare-test: grpcs
cp -rf backend-assets core/http
cp tests/models_fixtures/* test-models
## Test targets
test: prepare test-models/testmodel.ggml grpcs
@echo 'Running tests'
export GO_TAGS="tts stablediffusion debug"
export GO_TAGS="tts debug"
$(MAKE) prepare-test
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
$(MAKE) test-llama
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
$(MAKE) test-llama-gguf
$(MAKE) test-tts
$(MAKE) test-stablediffusion
@@ -496,10 +497,6 @@ teardown-e2e:
rm -rf $(TEST_DIR) || true
docker stop $$(docker ps -q --filter ancestor=localai-tests)
test-llama: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
test-llama-gguf: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -540,7 +537,7 @@ protogen: protogen-go protogen-python
protogen-clean: protogen-go-clean protogen-python-clean
.PHONY: protogen-go
protogen-go:
protogen-go: install-go-tools
mkdir -p pkg/grpc/proto
protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
backend/backend.proto
@@ -551,18 +548,10 @@ protogen-go-clean:
$(RM) bin/*
.PHONY: protogen-python
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
protogen-python: bark-protogen coqui-protogen chatterbox-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
.PHONY: protogen-python-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
.PHONY: autogptq-protogen
autogptq-protogen:
$(MAKE) -C backend/python/autogptq protogen
.PHONY: autogptq-protogen-clean
autogptq-protogen-clean:
$(MAKE) -C backend/python/autogptq protogen-clean
protogen-python-clean: bark-protogen-clean coqui-protogen-clean chatterbox-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
.PHONY: bark-protogen
bark-protogen:
@@ -584,10 +573,26 @@ coqui-protogen-clean:
diffusers-protogen:
$(MAKE) -C backend/python/diffusers protogen
.PHONY: chatterbox-protogen
chatterbox-protogen:
$(MAKE) -C backend/python/chatterbox protogen
.PHONY: diffusers-protogen-clean
diffusers-protogen-clean:
$(MAKE) -C backend/python/diffusers protogen-clean
.PHONY: chatterbox-protogen-clean
chatterbox-protogen-clean:
$(MAKE) -C backend/python/chatterbox protogen-clean
.PHONY: faster-whisper-protogen
faster-whisper-protogen:
$(MAKE) -C backend/python/faster-whisper protogen
.PHONY: faster-whisper-protogen-clean
faster-whisper-protogen-clean:
$(MAKE) -C backend/python/faster-whisper protogen-clean
.PHONY: exllama2-protogen
exllama2-protogen:
$(MAKE) -C backend/python/exllama2 protogen
@@ -596,14 +601,6 @@ exllama2-protogen:
exllama2-protogen-clean:
$(MAKE) -C backend/python/exllama2 protogen-clean
.PHONY: mamba-protogen
mamba-protogen:
$(MAKE) -C backend/python/mamba protogen
.PHONY: mamba-protogen-clean
mamba-protogen-clean:
$(MAKE) -C backend/python/mamba protogen-clean
.PHONY: rerankers-protogen
rerankers-protogen:
$(MAKE) -C backend/python/rerankers protogen
@@ -612,14 +609,6 @@ rerankers-protogen:
rerankers-protogen-clean:
$(MAKE) -C backend/python/rerankers protogen-clean
.PHONY: sentencetransformers-protogen
sentencetransformers-protogen:
$(MAKE) -C backend/python/sentencetransformers protogen
.PHONY: sentencetransformers-protogen-clean
sentencetransformers-protogen-clean:
$(MAKE) -C backend/python/sentencetransformers protogen-clean
.PHONY: transformers-protogen
transformers-protogen:
$(MAKE) -C backend/python/transformers protogen
@@ -628,37 +617,13 @@ transformers-protogen:
transformers-protogen-clean:
$(MAKE) -C backend/python/transformers protogen-clean
.PHONY: parler-tts-protogen
parler-tts-protogen:
$(MAKE) -C backend/python/parler-tts protogen
.PHONY: kokoro-protogen
kokoro-protogen:
$(MAKE) -C backend/python/kokoro protogen
.PHONY: parler-tts-protogen-clean
parler-tts-protogen-clean:
$(MAKE) -C backend/python/parler-tts protogen-clean
.PHONY: transformers-musicgen-protogen
transformers-musicgen-protogen:
$(MAKE) -C backend/python/transformers-musicgen protogen
.PHONY: transformers-musicgen-protogen-clean
transformers-musicgen-protogen-clean:
$(MAKE) -C backend/python/transformers-musicgen protogen-clean
.PHONY: vall-e-x-protogen
vall-e-x-protogen:
$(MAKE) -C backend/python/vall-e-x protogen
.PHONY: vall-e-x-protogen-clean
vall-e-x-protogen-clean:
$(MAKE) -C backend/python/vall-e-x protogen-clean
.PHONY: openvoice-protogen
openvoice-protogen:
$(MAKE) -C backend/python/openvoice protogen
.PHONY: openvoice-protogen-clean
openvoice-protogen-clean:
$(MAKE) -C backend/python/openvoice protogen-clean
.PHONY: kokoro-protogen-clean
kokoro-protogen-clean:
$(MAKE) -C backend/python/kokoro protogen-clean
.PHONY: vllm-protogen
vllm-protogen:
@@ -671,28 +636,28 @@ vllm-protogen-clean:
## GRPC
# Note: it is duplicated in the Dockerfile
prepare-extra-conda-environments: protogen-python
$(MAKE) -C backend/python/autogptq
$(MAKE) -C backend/python/bark
$(MAKE) -C backend/python/coqui
$(MAKE) -C backend/python/diffusers
$(MAKE) -C backend/python/chatterbox
$(MAKE) -C backend/python/faster-whisper
$(MAKE) -C backend/python/vllm
$(MAKE) -C backend/python/mamba
$(MAKE) -C backend/python/sentencetransformers
$(MAKE) -C backend/python/rerankers
$(MAKE) -C backend/python/transformers
$(MAKE) -C backend/python/transformers-musicgen
$(MAKE) -C backend/python/parler-tts
$(MAKE) -C backend/python/vall-e-x
$(MAKE) -C backend/python/openvoice
$(MAKE) -C backend/python/kokoro
$(MAKE) -C backend/python/exllama2
prepare-test-extra: protogen-python
$(MAKE) -C backend/python/transformers
$(MAKE) -C backend/python/diffusers
$(MAKE) -C backend/python/chatterbox
$(MAKE) -C backend/python/vllm
test-extra: prepare-test-extra
$(MAKE) -C backend/python/transformers test
$(MAKE) -C backend/python/diffusers test
$(MAKE) -C backend/python/chatterbox test
$(MAKE) -C backend/python/vllm test
backend-assets:
mkdir -p backend-assets
@@ -707,13 +672,6 @@ backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_bindin
backend-assets/grpc: protogen-go replace
mkdir -p backend-assets/grpc
backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert.cpp LIBRARY_PATH=$(CURDIR)/sources/go-bert.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/bert-embeddings
endif
backend-assets/grpc/huggingface: backend-assets/grpc
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/
ifneq ($(UPX),)
@@ -760,6 +718,13 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-avx512
$(MAKE) -C backend/cpp/llama-avx512 purge
$(info ${GREEN}I llama-cpp build info:avx512${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-avx
$(MAKE) -C backend/cpp/llama-avx purge
@@ -773,10 +738,6 @@ backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc backend/cpp/llama/ll
$(info ${GREEN}I llama-cpp build info:fallback${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
# TODO: every binary should have its own folder instead, so can have different metal implementations
ifeq ($(BUILD_TYPE),metal)
cp backend/cpp/llama-fallback/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
endif
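Each CPU variant above is an ordinary Make target and can be built in isolation; a sketch, assuming a configured local build environment (cmake, a C++ toolchain, and the fetched llama.cpp sources):
```bash
# Build only the AVX2 variant of the llama.cpp gRPC server
make backend-assets/grpc/llama-cpp-avx2
# Or the conservative fallback with all SIMD extensions disabled
make backend-assets/grpc/llama-cpp-fallback
```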
backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-cuda
@@ -817,11 +778,11 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
mkdir -p backend-assets/util/
cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/llama-ggml
$(UPX) backend-assets/grpc/bark-cpp
endif
backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
@@ -831,13 +792,6 @@ ifneq ($(UPX),)
$(UPX) backend-assets/grpc/piper
endif
backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/stablediffusion
endif
backend-assets/grpc/silero-vad: backend-assets/grpc backend-assets/lib/libonnxruntime.so.1
CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/onnxruntime/include/" LIBRARY_PATH=$(CURDIR)/backend-assets/lib \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/silero-vad ./backend/go/vad/silero
@@ -845,15 +799,8 @@ ifneq ($(UPX),)
$(UPX) backend-assets/grpc/silero-vad
endif
backend-assets/grpc/tinydream: sources/go-tiny-dream sources/go-tiny-dream/libtinydream.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/tinydream
endif
backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="${WHISPER_INCLUDE_PATH}" LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" LD_LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/whisper
@@ -905,17 +852,17 @@ docker-aio-all:
docker-image-intel:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg GO_TAGS="$(GO_TAGS)" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
docker-image-intel-xpu:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg GO_TAGS="$(GO_TAGS)" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
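Either Intel image can be rebuilt locally through these targets; a minimal sketch (DOCKER_IMAGE is a Makefile variable, and the tag here is illustrative):
```bash
make docker-image-intel DOCKER_IMAGE=localai:sycl-f32
```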
@@ -925,7 +872,7 @@ swagger:
.PHONY: gen-assets
gen-assets:
$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
$(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets
## Documentation
docs/layouts/_default:

README.md

@@ -1,7 +1,6 @@
<h1 align="center">
<br>
<img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
LocalAI
<img height="300" src="./core/http/static/logo.png"> <br>
<br>
</h1>
@@ -31,7 +30,7 @@
<p align="center">
<a href="https://twitter.com/LocalAI_API" target="blank">
<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
<img src="https://img.shields.io/badge/X-%23000000.svg?style=for-the-badge&logo=X&logoColor=white&label=LocalAI_API" alt="Follow LocalAI_API"/>
</a>
<a href="https://discord.gg/uJAeKSAGDy" target="blank">
<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
@@ -39,40 +38,142 @@
</p>
<p align="center">
<a href="https://trendshift.io/repositories/1484" target="_blank"><img src="https://trendshift.io/api/badge/repositories/1484" alt="go-skynet%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
>
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples)
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on
[![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/localaiofficial_bot)
[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API thats compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic, ...) API specifications for local AI inferencing. It allows you to run LLMs, generate images and audio (and more) locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
## 📚🆕 Local Stack Family
🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:
<table>
<tr>
<td width="50%" valign="top">
<a href="https://github.com/mudler/LocalAGI">
<img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
</a>
</td>
<td width="50%" valign="top">
<h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
<p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
</td>
</tr>
<tr>
<td width="50%" valign="top">
<a href="https://github.com/mudler/LocalRecall">
<img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
</a>
</td>
<td width="50%" valign="top">
<h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
<p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
</td>
</tr>
</table>
## Screenshots
| Talk Interface | Generate Audio |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](./docs/assets/images/screenshots/screenshot_tts.png) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](./docs/assets/images/screenshots/screenshot_tts.png) |
| Models Overview | Generate Images |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](./docs/assets/images/screenshots/screenshot_gallery.png) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](./docs/assets/images/screenshots/screenshot_image.png) |
| Chat Interface | Home |
| --- | --- |
| ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](./docs/assets/images/screenshots/screenshot_chat.png) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](./docs/assets/images/screenshots/screenshot_home.png) |
| Login | Swarm |
| --- | --- |
|![Screenshot 2025-03-31 at 12-09-59 ](./docs/assets/images/screenshots/screenshot_login.png) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](./docs/assets/images/screenshots/screenshot_p2p.png) |
## 💻 Quickstart
Run the installer script:
```bash
# Basic installation
curl https://localai.io/install.sh | sh
```
Or run with docker:
```bash
# CPU only image:
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
# Nvidia GPU:
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
# CPU and GPU image (bigger size):
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
```
For more installation options, see [Installer Options](https://localai.io/docs/advanced/installer/).
Or run with docker:
### CPU only image:
```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
```
### NVIDIA GPU Images:
```bash
# CUDA 12.0
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
# CUDA 11.7
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-11
# NVIDIA Jetson (L4T) ARM64
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-nvidia-l4t-arm64
```
### AMD GPU Images (ROCm):
```bash
docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-gpu-hipblas
```
### Intel GPU Images (oneAPI):
```bash
# Intel GPU with FP16 support
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f16
# Intel GPU with FP32 support
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f32
```
### Vulkan GPU Images:
```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-vulkan
```
### AIO Images (pre-downloaded models):
```bash
# CPU version
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
# NVIDIA CUDA 12 version
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
# NVIDIA CUDA 11 version
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-11
# Intel GPU version
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-gpu-intel-f16
# AMD GPU version
docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-aio-gpu-hipblas
```
For more information about the AIO images and pre-downloaded models, see [Container Documentation](https://localai.io/basics/container/).
To load models:
```bash
local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
local-ai run oci://localai/phi-2:latest
```
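Once the server is up and a model is loaded, the OpenAI-compatible API can be exercised directly; a minimal sketch, assuming the default port 8080 and a model named `gpt-4` as in the AIO presets:
```bash
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-4", "messages": [{"role": "user", "content": "Hello!"}]}'
```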
[💻 Getting started](https://localai.io/basics/getting_started/index.html)
For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)
## 📰 Latest project news
- June 2025: [Backend management](https://github.com/mudler/LocalAI/pull/5607) has been added. Attention: extras images are going to be deprecated from the next release! Read [the backend management PR](https://github.com/mudler/LocalAI/pull/5607).
- May 2025: [Audio input](https://github.com/mudler/LocalAI/pull/5466) and [Reranking](https://github.com/mudler/LocalAI/pull/5396) in llama.cpp backend, [Realtime API](https://github.com/mudler/LocalAI/pull/5392), support for Gemma, SmollVLM, and more multimodal models (available in the gallery).
- May 2025: Important: image name changes [See release](https://github.com/mudler/LocalAI/releases/tag/v2.29.0)
- Apr 2025: Rebrand, WebUI enhancements
- Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
- Apr 2025: WebUI overhaul, AIO images updates
- Feb 2025: Backend cleanup, breaking changes, new backends (kokoro, OuteTTS, faster-whisper), Nvidia L4T images
- Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
- Nov 2024: Voice activity detection models (**VAD**) added to the API: https://github.com/mudler/LocalAI/pull/4204
- Oct 2024: examples moved to [LocalAI-examples](https://github.com/mudler/LocalAI-examples)
- Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
- June 2024: 🆕 You can now browse the model gallery without LocalAI! Check out https://models.localai.io
- June 2024: Support for models from OCI registries: https://github.com/mudler/LocalAI/pull/2628
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723. P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
- May 2024: 🔥🔥 Decentralized P2P llama.cpp: https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs https://localai.io/features/distribute/
- May 2024: 🔥🔥 Openvoice: https://github.com/mudler/LocalAI/pull/2334
- May 2024: 🆕 Function calls without grammars and mixed mode: https://github.com/mudler/LocalAI/pull/2328
- May 2024: 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
- May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
- April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121
Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
## 🔥🔥 Hot topics (looking for help):
- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
- Realtime API https://github.com/mudler/LocalAI/issues/3714
- 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
- Assistant API: https://github.com/mudler/LocalAI/issues/1273
- Moderation endpoint: https://github.com/mudler/LocalAI/issues/999
- Vulkan: https://github.com/mudler/LocalAI/issues/1647
- Anthropic API: https://github.com/mudler/LocalAI/issues/1808
If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
## 🚀 [Features](https://localai.io/features/)
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
- 🎨 [Image generation](https://localai.io/features/image-generation)
- 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/)
- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
@@ -135,11 +226,10 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
- 📈 [Reranker API](https://localai.io/features/reranker/)
- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
- [Agentic capabilities](https://github.com/mudler/LocalAGI)
- 🔊 Voice activity detection (Silero-VAD support)
- 🌍 Integrated WebUI!
## 💻 Usage
Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
### 🔗 Community and integrations
@@ -157,6 +247,7 @@ Model galleries
Other:
- Helm chart https://github.com/go-skynet/helm-charts
- VSCode extension https://github.com/badgooooor/localai-vscode-plugin
- Langchain: https://python.langchain.com/docs/integrations/providers/localai/
- Terminal utility https://github.com/djcopley/ShellOracle
- Local Smart assistant https://github.com/mudler/LocalAGI
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
@@ -214,7 +305,7 @@ A huge thank you to our generous sponsors who support this project covering CI e
<p align="center">
<a href="https://www.spectrocloud.com/" target="blank">
<img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
<img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
</a>
<a href="https://www.premai.io/" target="blank">
<img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>


@@ -1,7 +1,7 @@
embeddings: true
name: text-embedding-ada-002
backend: bert-embeddings
parameters:
model: huggingface://mudler/all-MiniLM-L6-v2/ggml-model-q4_0.bin
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
usage: |
You can test this model with curl like this:
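(The usage snippet is truncated in this view; for reference, a minimal embeddings request could look like the sketch below, assuming a local instance on the default port.)
```bash
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"model": "text-embedding-ada-002", "input": "A test sentence"}'
```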


@@ -1,56 +1,17 @@
name: stablediffusion
backend: stablediffusion
backend: stablediffusion-ggml
cfg_scale: 4.5
options:
- sampler:euler
parameters:
model: stablediffusion_assets
license: "BSD-3"
urls:
- https://github.com/EdVince/Stable-Diffusion-NCNN
- https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
description: |
Stable Diffusion in NCNN with c++, supported txt2img and img2img
model: stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf
step: 25
download_files:
- filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
- filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
- filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
- filename: "stablediffusion_assets/log_sigmas.bin"
sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
- filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
- filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
- filename: "stablediffusion_assets/vocab.txt"
sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
- filename: "stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
sha256: "b8944e9fe0b69b36ae1b5bb0185b3a7b8ef14347fe0fa9af6c64c4829022261f"
uri: "huggingface://second-state/stable-diffusion-v1-5-GGUF/stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
usage: |
curl http://localhost:8080/v1/images/generations \


@@ -1,7 +1,13 @@
name: jina-reranker-v1-base-en
backend: rerankers
reranking: true
f16: true
parameters:
model: cross-encoder
model: jina-reranker-v1-tiny-en.f16.gguf
download_files:
- filename: jina-reranker-v1-tiny-en.f16.gguf
sha256: 5f696cf0d0f3d347c4a279eee8270e5918554cdac0ed1f632f2619e4e8341407
uri: huggingface://mradermacher/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en.f16.gguf
usage: |
You can test this model with curl like this:
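(Truncated here as well; a hedged sketch of a rerank request against the `/v1/rerank` route documented for the reranker feature:)
```bash
curl http://localhost:8080/v1/rerank \
  -H "Content-Type: application/json" \
  -d '{"model": "jina-reranker-v1-base-en", "query": "Organic skincare", "documents": ["doc one", "doc two"], "top_n": 2}'
```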


@@ -1,101 +1,57 @@
name: gpt-4
mmap: true
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
context_size: 8192
stopwords:
- "<|im_end|>"
- "<dummy32000>"
- "</tool_call>"
- "<|eot_id|>"
- "<|end_of_text|>"
f16: true
function:
# disable injecting the "answer" tool
disable_no_action: true
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Suffix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
no_mixed_free_string: true
schema_type: llama3.1 # or JSON is supported too (json)
response_regex:
- <function=(?P<name>\w+)>(?P<arguments>.*)</function>
mmap: true
name: gpt-4
parameters:
model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- <|eot_id|>
- <|end_of_text|>
template:
chat: |
{{.Input -}}
<|im_start|>assistant
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}
<tool_call>
{{- else if eq .RoleName "tool" }}
<tool_response>
{{- end }}
{{- if .Content}}
{{.Content }}
{{- end }}
{{- if .FunctionCall}}
{{toJson .FunctionCall}}
{{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
{{ if .FunctionCall -}}
{{ else if eq .RoleName "tool" -}}
The Function was executed and the response was:
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ else if .FunctionCall -}}
{{ range .FunctionCall }}
[{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
{{ end }}
{{ end -}}
<|eot_id|>
completion: |
{{.Input}}
function: |-
<|im_start|>system
You are a function calling AI model.
Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant
function: |
<|start_header_id|>system<|end_header_id|>
You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
If you decide to invoke any of the function(s), you MUST put it in the format as follows:
[func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
You SHOULD NOT include any other text in the response.
Here is a list of functions in JSON format that you can invoke.
{{toJson .Functions}}
<|eot_id|><|start_header_id|>user<|end_header_id|>
{{.Input}}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
download_files:
- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5
uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf

aio/cpu/vad.yaml (new file)

@@ -0,0 +1,8 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
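A request sketch for the new VAD model (hedged: the `/vad` route and raw-float `audio` payload follow the VAD API introduced in https://github.com/mudler/LocalAI/pull/4204; the exact shape may differ):
```bash
curl http://localhost:8080/vad \
  -H "Content-Type: application/json" \
  -d '{"model": "silero-vad", "audio": [0.0, 0.1, 0.2]}'
```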


@@ -1,31 +1,49 @@
backend: llama-cpp
context_size: 4096
f16: true
mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
name: gpt-4o
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: bakllava-mmproj.gguf
parameters:
model: bakllava.gguf
model: minicpm-v-2_6-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}}
ASSISTANT:
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files:
- filename: bakllava.gguf
uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
- filename: bakllava-mmproj.gguf
uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
- filename: minicpm-v-2_6-Q4_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd


@@ -129,10 +129,10 @@ detect_gpu
detect_gpu_size
PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vad.yaml,/aio/${PROFILE}/vision.yaml}"
check_vars
echo "===> Starting LocalAI[$PROFILE] with the following models: $MODELS"
exec /build/entrypoint.sh "$@"
exec /entrypoint.sh "$@"
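The profile and model list remain overridable at run time; a sketch using the AIO image and the profile names above:
```bash
# Force the gpu-8g profile and a custom model set instead of the auto-detected defaults
docker run -ti -p 8080:8080 \
  -e PROFILE=gpu-8g \
  -e MODELS=/aio/gpu-8g/text-to-text.yaml \
  localai/localai:latest-aio-gpu-nvidia-cuda-12
```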


@@ -1,7 +1,7 @@
embeddings: true
name: text-embedding-ada-002
backend: sentencetransformers
parameters:
model: all-MiniLM-L6-v2
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
usage: |
You can test this model with curl like this:


@@ -1,7 +1,13 @@
name: jina-reranker-v1-base-en
backend: rerankers
reranking: true
f16: true
parameters:
model: cross-encoder
model: jina-reranker-v1-tiny-en.f16.gguf
download_files:
- filename: jina-reranker-v1-tiny-en.f16.gguf
sha256: 5f696cf0d0f3d347c4a279eee8270e5918554cdac0ed1f632f2619e4e8341407
uri: huggingface://mradermacher/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en.f16.gguf
usage: |
You can test this model with curl like this:


@@ -1,101 +1,53 @@
name: gpt-4
mmap: true
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
context_size: 8192
stopwords:
- "<|im_end|>"
- "<dummy32000>"
- "</tool_call>"
- "<|eot_id|>"
- "<|end_of_text|>"
context_size: 4096
f16: true
function:
# disable injecting the "answer" tool
disable_no_action: true
capture_llm_results:
- (?s)<Thought>(.*?)</Thought>
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Suffix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
properties_order: name,arguments
json_regex_match:
- (?s)<Output>(.*?)</Output>
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
- key: (?s)<Thought>(.*?)</Thought>
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
mmap: true
name: gpt-4
parameters:
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}
<tool_call>
{{- else if eq .RoleName "tool" }}
<tool_response>
{{- end }}
{{- if .Content}}
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{- end }}
{{- if .FunctionCall}}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
{{ end -}}<|im_end|>
completion: |
{{.Input}}
function: |-
function: |
<|im_start|>system
You are a function calling AI model.
Here are the available tools:
<tools>
You are an AI assistant that executes function calls, and these are the tools at your disposal:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
<|im_end|>
{{.Input -}}
<|im_start|>assistant
<|im_start|>assistant
download_files:
- filename: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
sha256: 4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4
uri: huggingface://mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf

aio/gpu-8g/vad.yaml (new file)

@@ -0,0 +1,8 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808


@@ -1,35 +1,49 @@
backend: llama-cpp
context_size: 4096
f16: true
mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
name: gpt-4o
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters:
model: llava-v1.6-mistral-7b.Q5_K_M.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
model: minicpm-v-2_6-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}}
ASSISTANT:
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files:
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
- filename: llava-v1.6-7b-mmproj-f16.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
- filename: minicpm-v-2_6-Q4_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd


@@ -1,7 +1,7 @@
embeddings: true
name: text-embedding-ada-002
backend: sentencetransformers
parameters:
model: all-MiniLM-L6-v2
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
usage: |
You can test this model with curl like this:


@@ -1,7 +1,13 @@
name: jina-reranker-v1-base-en
backend: rerankers
reranking: true
f16: true
parameters:
model: cross-encoder
model: jina-reranker-v1-tiny-en.f16.gguf
download_files:
- filename: jina-reranker-v1-tiny-en.f16.gguf
sha256: 5f696cf0d0f3d347c4a279eee8270e5918554cdac0ed1f632f2619e4e8341407
uri: huggingface://mradermacher/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en.f16.gguf
usage: |
You can test this model with curl like this:


@@ -1,103 +1,53 @@
name: gpt-4
mmap: false
context_size: 8192
f16: false
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
stopwords:
- "<|im_end|>"
- "<dummy32000>"
- "</tool_call>"
- "<|eot_id|>"
- "<|end_of_text|>"
context_size: 4096
f16: true
function:
# disable injecting the "answer" tool
disable_no_action: true
capture_llm_results:
- (?s)<Thought>(.*?)</Thought>
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Suffix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
properties_order: name,arguments
json_regex_match:
- (?s)<Output>(.*?)</Output>
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
- key: (?s)<Thought>(.*?)</Thought>
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
mmap: true
name: gpt-4
parameters:
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}
<tool_call>
{{- else if eq .RoleName "tool" }}
<tool_response>
{{- end }}
{{- if .Content}}
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{- end }}
{{- if .FunctionCall}}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
{{ end -}}<|im_end|>
completion: |
{{.Input}}
function: |-
function: |
<|im_start|>system
You are a function calling AI model.
Here are the available tools:
<tools>
You are an AI assistant that executes function calls, and these are the tools at your disposal:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
<|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files:
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf

aio/intel/vad.yaml (new file)

@@ -0,0 +1,8 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808


@@ -1,35 +1,50 @@
backend: llama-cpp
context_size: 4096
mmap: false
f16: false
f16: true
mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
name: gpt-4o
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters:
model: llava-v1.6-mistral-7b.Q5_K_M.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
model: minicpm-v-2_6-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}}
ASSISTANT:
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files:
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
- filename: llava-v1.6-7b-mmproj-f16.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
- filename: minicpm-v-2_6-Q4_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd


@@ -1,6 +1,15 @@
package main
import "embed"
import (
rice "github.com/GeertJohan/go.rice"
)
//go:embed backend-assets/*
var backendAssets embed.FS
var backendAssets *rice.Box
func init() {
var err error
backendAssets, err = rice.FindBox("backend-assets")
if err != nil {
panic(err)
}
}
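With go.rice the assets are no longer compiled in via `//go:embed`: the box is resolved at run time, either from the `backend-assets` folder on disk or from data appended to the binary. A post-build sketch, assuming the `rice` CLI from github.com/GeertJohan/go.rice:
```bash
go install github.com/GeertJohan/go.rice/rice@latest
# Append the backend-assets box to the built binary so it ships as a single file
rice append --exec local-ai
```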

backend/Dockerfile.go (new file)

@@ -0,0 +1,131 @@
ARG BASE_IMAGE=ubuntu:22.04
FROM ${BASE_IMAGE} AS builder
ARG BACKEND=rerankers
ARG BUILD_TYPE
ENV BUILD_TYPE=${BUILD_TYPE}
ARG CUDA_MAJOR_VERSION
ARG CUDA_MINOR_VERSION
ARG SKIP_DRIVERS=false
ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETARCH
ARG TARGETVARIANT
ARG GO_VERSION=1.22.6
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ccache \
ca-certificates \
make \
curl unzip \
libssl-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Cuda
ENV PATH=/usr/local/cuda/bin:${PATH}
# HipBLAS requirements
ENV PATH=/opt/rocm/bin:${PATH}
# Vulkan requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils wget gpg-agent && \
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt-get update && \
apt-get install -y \
vulkan-sdk && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# CuBLAS requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils
if [ "amd64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
libclblast-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
; fi
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
hipblas-dev \
rocblas-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
# I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
# to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
ldconfig \
; fi
# Install Go
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin:/usr/local/bin
# Install grpc compilers
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
RUN echo "TARGETARCH: $TARGETARCH"
# We need protoc installed, and the version in 22.04 is too old. We will create one as part of installing the GRPC build below,
# but that will also bring in a newer version of absl, which stablediffusion cannot compile with. This version of protoc is only
# here so that we can generate the grpc code for the stablediffusion build
RUN <<EOT bash
if [ "amd64" = "$TARGETARCH" ]; then
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
fi
if [ "arm64" = "$TARGETARCH" ]; then
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
fi
EOT
COPY . /LocalAI
RUN cd /LocalAI && make backend-assets/grpc/bark-cpp
FROM scratch
COPY --from=builder /LocalAI/backend-assets/grpc/bark-cpp ./
COPY --from=builder /LocalAI/backend/go/bark/run.sh ./
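A build sketch for this Go backend image (assumptions: run from the repository root, which the `COPY . /LocalAI` step expects as context; the tag is illustrative):
```bash
docker build -f backend/Dockerfile.go \
  --build-arg BACKEND=bark-cpp \
  -t localai-backend-bark-cpp .
```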

backend/Dockerfile.python (new file)

@@ -0,0 +1,123 @@
ARG BASE_IMAGE=ubuntu:22.04
FROM ${BASE_IMAGE} AS builder
ARG BACKEND=rerankers
ARG BUILD_TYPE
ENV BUILD_TYPE=${BUILD_TYPE}
ARG CUDA_MAJOR_VERSION
ARG CUDA_MINOR_VERSION
ARG SKIP_DRIVERS=false
ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETARCH
ARG TARGETVARIANT
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ccache \
ca-certificates \
espeak-ng \
curl \
libssl-dev \
git \
git-lfs \
unzip \
upx-ucl \
curl python3-pip \
python-is-python3 \
python3-dev llvm \
python3-venv make && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
pip install --upgrade pip
# Cuda
ENV PATH=/usr/local/cuda/bin:${PATH}
# HipBLAS requirements
ENV PATH=/opt/rocm/bin:${PATH}
# Vulkan requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils wget gpg-agent && \
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt-get update && \
apt-get install -y \
vulkan-sdk && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# CuBLAS requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils
if [ "amd64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
libclblast-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
; fi
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
hipblas-dev \
rocblas-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
# I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
# to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
ldconfig \
; fi
# Install uv as a system package
RUN curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=/usr/bin sh
ENV PATH="/root/.cargo/bin:${PATH}"
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
# Install grpcio-tools (the version in 22.04 is too old)
RUN pip install --user grpcio-tools==1.71.0 grpcio==1.71.0
COPY python/${BACKEND} /${BACKEND}
COPY backend.proto /${BACKEND}/backend.proto
COPY python/common/ /${BACKEND}/common
RUN cd /${BACKEND} && make
FROM scratch
ARG BACKEND=rerankers
COPY --from=builder /${BACKEND}/ /

View File

@@ -14,6 +14,7 @@ service Backend {
rpc PredictStream(PredictOptions) returns (stream Reply) {}
rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
rpc GenerateImage(GenerateImageRequest) returns (Result) {}
rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
rpc TTS(TTSRequest) returns (Result) {}
rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
@@ -159,6 +160,13 @@ message Reply {
bytes message = 1;
int32 tokens = 2;
int32 prompt_tokens = 3;
double timing_prompt_processing = 4;
double timing_token_generation = 5;
bytes audio = 6;
}
message GrammarTrigger {
string word = 1;
}
message ModelOptions {
@@ -184,11 +192,7 @@ message ModelOptions {
int32 NGQA = 20;
string ModelFile = 21;
// AutoGPTQ
string Device = 22;
bool UseTriton = 23;
string ModelBaseName = 24;
bool UseFastTokenizer = 25;
// Diffusers
string PipelineType = 26;
@@ -222,6 +226,11 @@ message ModelOptions {
int32 MaxModelLen = 54;
int32 TensorParallelSize = 55;
string LoadFormat = 58;
bool DisableLogStatus = 66;
string DType = 67;
int32 LimitImagePerPrompt = 68;
int32 LimitVideoPerPrompt = 69;
int32 LimitAudioPerPrompt = 70;
string MMProj = 41;
@@ -240,6 +249,15 @@ message ModelOptions {
repeated string LoraAdapters = 60;
repeated float LoraScales = 61;
repeated string Options = 62;
string CacheTypeKey = 63;
string CacheTypeValue = 64;
repeated GrammarTrigger GrammarTriggers = 65;
bool Reranking = 71;
}
message Result {
@@ -287,6 +305,19 @@ message GenerateImageRequest {
int32 CLIPSkip = 11;
}
message GenerateVideoRequest {
string prompt = 1;
string start_image = 2; // Path or base64 encoded image for the start frame
string end_image = 3; // Path or base64 encoded image for the end frame
int32 width = 4;
int32 height = 5;
int32 num_frames = 6; // Number of frames to generate
int32 fps = 7; // Frames per second
int32 seed = 8;
float cfg_scale = 9; // Classifier-free guidance scale
string dst = 10; // Output path for the generated video
}
message TTSRequest {
string text = 1;
string model = 2;
@@ -343,4 +374,4 @@ message StatusResponse {
message Message {
string role = 1;
string content = 2;
}
}
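The new GenerateVideoRequest message is consumed through the generated bindings. A minimal sketch of populating it from Go, assuming the protoc-gen-go bindings in github.com/mudler/LocalAI/pkg/grpc/proto expose the usual CamelCase field names (the exact identifiers are assumptions, not confirmed by this diff):

package main

import (
	"fmt"

	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)

func main() {
	// Field names assume the standard protoc-gen-go mapping of the proto above.
	req := &pb.GenerateVideoRequest{
		Prompt:     "a sailboat drifting at sunset",
		StartImage: "start.png", // path or base64-encoded start frame
		Width:      512,
		Height:     288,
		NumFrames:  24, // number of frames to generate
		Fps:        8,  // frames per second
		Seed:       42,
		CfgScale:   7.0,       // classifier-free guidance scale
		Dst:        "out.mp4", // output path for the generated video
	}
	fmt.Printf("%+v\n", req)
}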

View File

@@ -1,20 +1,3 @@
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is a hack for now, but it should be fixed in the future.
set(TARGET myclip)
add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
install(TARGETS ${TARGET} LIBRARY)
target_include_directories(myclip PUBLIC .)
target_include_directories(myclip PUBLIC ../..)
target_include_directories(myclip PUBLIC ../../common)
target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if (NOT MSVC)
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
endif()
# END CLIP hack
set(TARGET grpc-server)
set(CMAKE_CXX_STANDARD 17)
cmake_minimum_required(VERSION 3.15)
@@ -74,8 +57,12 @@ add_library(hw_grpc_proto
${hw_proto_srcs}
${hw_proto_hdrs} )
add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp httplib.h)
target_include_directories(${TARGET} PRIVATE ../llava)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
absl::flags_parse
gRPC::${_REFLECTION}
gRPC::${_GRPC_GRPCPP}

View File

@@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
TARGET?=--target grpc-server
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
@@ -30,19 +30,24 @@ else ifeq ($(OS),Darwin)
CMAKE_ARGS+=-DGGML_METAL=OFF
else
CMAKE_ARGS+=-DGGML_METAL=ON
# Until this is tested properly, we disable embedded metal file
# as we already embed it as part of the LocalAI assets
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=OFF
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
TARGET+=--target ggml-metal
endif
endif
ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl" \
-DGGML_SYCL_F16=ON
endif
ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl"
endif
llama.cpp:
@@ -54,8 +59,8 @@ llama.cpp:
git checkout -b build $(LLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
llama.cpp/examples/grpc-server: llama.cpp
mkdir -p llama.cpp/examples/grpc-server
llama.cpp/tools/grpc-server: llama.cpp
mkdir -p llama.cpp/tools/grpc-server
bash prepare.sh
rebuild:
@@ -65,13 +70,13 @@ rebuild:
purge:
rm -rf llama.cpp/build
rm -rf llama.cpp/examples/grpc-server
rm -rf llama.cpp/tools/grpc-server
rm -rf grpc-server
clean: purge
rm -rf llama.cpp
grpc-server: llama.cpp llama.cpp/examples/grpc-server
grpc-server: llama.cpp llama.cpp/tools/grpc-server
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
@@ -79,4 +84,4 @@ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
else
+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
endif
cp llama.cpp/build/bin/grpc-server .
cp llama.cpp/build/bin/grpc-server .

View File

File diff suppressed because it is too large

View File

File diff suppressed because it is too large

View File

@@ -1,13 +1,13 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 342042ff..224db9b5 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1;
+ patches_data[i] = i;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 3cd0d2fa..6c5e811a 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1;
+ patches_data[i] = i;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);

View File

@@ -7,21 +7,46 @@ for patch in $(ls patches); do
patch -d llama.cpp/ -p1 < patches/$patch
done
cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
cp -rfv json.hpp llama.cpp/examples/grpc-server/
cp -rfv utils.hpp llama.cpp/examples/grpc-server/
if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
set -e
cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/tools/server/utils.hpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/
set +e
if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
echo "grpc-server already added"
else
echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
fi
set -e
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is a hack for now, but it should be fixed in the future.
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
# Now to keep maximum compatibility with the original server.cpp, we need to remove the index.html.gz.hpp and loading.html.hpp includes
# and remove the main function
# TODO: upstream this to the original server.cpp by extracting the upstream main function to a separate file
awk '
/int[ \t]+main[ \t]*\(/ { # If the line starts the main function
in_main=1; # Set a flag
open_braces=0; # Track number of open braces
}
in_main {
open_braces += gsub(/\{/, "{"); # Count opening braces
open_braces -= gsub(/\}/, "}"); # Count closing braces
if (open_braces == 0) { # If all braces are closed
in_main=0; # End skipping
}
next; # Skip lines inside main
}
!in_main # Print lines not inside main
' "llama.cpp/tools/server/server.cpp" > llama.cpp/tools/grpc-server/server.cpp
# remove index.html.gz.hpp and loading.html.hpp includes
if [[ "$OSTYPE" == "darwin"* ]]; then
# macOS
sed -i '' '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
else
# Linux and others
sed -i '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
fi

View File

@@ -1,483 +0,0 @@
// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
#pragma once
#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include "json.hpp"
#include "../llava/clip.h"
using json = nlohmann::json;
extern bool server_verbose;
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif
#if SERVER_VERBOSE != 1
#define LOG_VERBOSE(MSG, ...)
#else
#define LOG_VERBOSE(MSG, ...) \
do \
{ \
if (server_verbose) \
{ \
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
} \
} while (0)
#endif
#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
//
// parallel
//
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
SERVER_STATE_ERROR // An error occurred, load_model failed
};
enum task_type {
TASK_TYPE_COMPLETION,
TASK_TYPE_CANCEL,
TASK_TYPE_NEXT_RESPONSE
};
struct task_server {
int id = -1; // to be filled by llama_server_queue
int target_id;
task_type type;
json data;
bool infill_mode = false;
bool embedding_mode = false;
int multitask_id = -1;
};
struct task_result {
int id;
int multitask_id = -1;
bool stop;
bool error;
json result_json;
};
struct task_multi {
int id;
std::set<int> subtasks_remaining{};
std::vector<task_result> results{};
};
// TODO: can become a bool if we don't find a use for more states
enum slot_state
{
IDLE,
PROCESSING,
};
enum slot_command
{
NONE,
LOAD_PROMPT,
RELEASE,
};
struct slot_params
{
bool stream = true;
bool cache_prompt = false; // remember the prompt to avoid reprocessing the whole prompt
uint32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_predict = -1; // new tokens to predict
std::vector<std::string> antiprompt;
json input_prefix;
json input_suffix;
};
struct slot_image
{
int32_t id;
bool request_encode_image = false;
float * image_embedding = nullptr;
int32_t image_tokens = 0;
clip_image_u8 * img_data;
std::string prefix_prompt; // prompt that comes before this image
};
// completion token output with probabilities
struct completion_token_output
{
struct token_prob
{
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
llama_token tok;
std::string text_to_send;
};
static inline void server_log(const char *level, const char *function, int line,
const char *message, const nlohmann::ordered_json &extra)
{
nlohmann::ordered_json log
{
{"timestamp", time(nullptr)},
{"level", level},
{"function", function},
{"line", line},
{"message", message},
};
if (!extra.empty())
{
log.merge_patch(extra);
}
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
printf("%.*s\n", (int)str.size(), str.data());
fflush(stdout);
}
//
// server utils
//
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value)
{
// Fallback null to default value
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
: default_value;
}
inline std::string format_chatml(std::vector<json> messages)
{
std::ostringstream chatml_msgs;
for (auto it = messages.begin(); it != messages.end(); ++it) {
chatml_msgs << "<|im_start|>"
<< json_value(*it, "role", std::string("user")) << '\n';
chatml_msgs << json_value(*it, "content", std::string(""))
<< "<|im_end|>\n";
}
chatml_msgs << "<|im_start|>assistant" << '\n';
return chatml_msgs.str();
}
//
// work queue utils
//
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
std::vector<task_multi> queue_multitasks;
std::condition_variable condition_tasks;
// callback functions
std::function<void(task_server&)> callback_new_task;
std::function<void(task_multi&)> callback_finish_multitask;
std::function<void(void)> callback_all_task_finished;
// Add a new task to the end of the queue
int post(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
if (task.id == -1) {
task.id = id++;
}
queue_tasks.push_back(std::move(task));
condition_tasks.notify_one();
return task.id;
}
// Add a new task, but defer until one slot is available
void defer(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
queue_tasks_deferred.push_back(std::move(task));
}
// Get the next id for creating a new task
int get_new_id() {
std::unique_lock<std::mutex> lock(mutex_tasks);
return id++;
}
// Register function to process a new task
void on_new_task(std::function<void(task_server&)> callback) {
callback_new_task = callback;
}
// Register function to process a multitask
void on_finish_multitask(std::function<void(task_multi&)> callback) {
callback_finish_multitask = callback;
}
// Register the function to be called when the batch of tasks is finished
void on_all_tasks_finished(std::function<void(void)> callback) {
callback_all_task_finished = callback;
}
// Call when the state of one slot is changed
void notify_slot_changed() {
// move deferred tasks back to main loop
std::unique_lock<std::mutex> lock(mutex_tasks);
for (auto & task : queue_tasks_deferred) {
queue_tasks.push_back(std::move(task));
}
queue_tasks_deferred.clear();
}
// Start the main loop. This call is blocking
[[noreturn]]
void start_loop() {
while (true) {
// new task arrived
LOG_VERBOSE("have new task", {});
{
while (true)
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
lock.unlock();
break;
}
task_server task = queue_tasks.front();
queue_tasks.erase(queue_tasks.begin());
lock.unlock();
LOG_VERBOSE("callback_new_task", {});
callback_new_task(task);
}
LOG_VERBOSE("callback_all_task_finished", {});
// process and update all the multitasks
auto queue_iterator = queue_multitasks.begin();
while (queue_iterator != queue_multitasks.end())
{
if (queue_iterator->subtasks_remaining.empty())
{
// all subtasks done == multitask is done
task_multi current_multitask = *queue_iterator;
callback_finish_multitask(current_multitask);
// remove this multitask
queue_iterator = queue_multitasks.erase(queue_iterator);
}
else
{
++queue_iterator;
}
}
// all tasks in the current loop are finished
callback_all_task_finished();
}
LOG_VERBOSE("wait for new task", {});
// wait for new task
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
condition_tasks.wait(lock, [&]{
return !queue_tasks.empty();
});
}
}
}
}
//
// functions to manage multitasks
//
// add a multitask by specifying the ids of all its subtasks (a subtask is a task_server)
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
task_multi multi;
multi.id = multitask_id;
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
queue_multitasks.push_back(multi);
}
// update the remaining subtasks, appending results to the multitask
void update_multitask(int multitask_id, int subtask_id, task_result& result)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
for (auto& multitask : queue_multitasks)
{
if (multitask.id == multitask_id)
{
multitask.subtasks_remaining.erase(subtask_id);
multitask.results.push_back(result);
}
}
}
};
struct llama_server_response {
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
callback_multitask_t callback_update_multitask;
// for keeping track of all tasks waiting for the result
std::set<int> waiting_task_ids;
// the main result queue
std::vector<task_result> queue_results;
std::mutex mutex_results;
std::condition_variable condition_results;
void add_waiting_task_id(int task_id) {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.insert(task_id);
}
void remove_waiting_task_id(int task_id) {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(task_id);
}
// This function blocks the thread until there is a response for this task_id
task_result recv(int task_id) {
while (true)
{
std::unique_lock<std::mutex> lock(mutex_results);
condition_results.wait(lock, [&]{
return !queue_results.empty();
});
LOG_VERBOSE("condition_results unblock", {});
for (int i = 0; i < (int) queue_results.size(); i++)
{
if (queue_results[i].id == task_id)
{
assert(queue_results[i].multitask_id == -1);
task_result res = queue_results[i];
queue_results.erase(queue_results.begin() + i);
return res;
}
}
}
// should never reach here
}
// Register the function to update multitask
void on_multitask_update(callback_multitask_t callback) {
callback_update_multitask = callback;
}
// Send a new result to a waiting task_id
void send(task_result result) {
std::unique_lock<std::mutex> lock(mutex_results);
LOG_VERBOSE("send new result", {});
for (auto& task_id : waiting_task_ids) {
// LOG_TEE("waiting task id %i \n", task_id);
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
if (result.multitask_id == task_id)
{
LOG_VERBOSE("callback_update_multitask", {});
callback_update_multitask(task_id, result.id, result);
continue;
}
if (result.id == task_id)
{
LOG_VERBOSE("queue_results.push_back", {});
queue_results.push_back(result);
condition_results.notify_one();
return;
}
}
}
};
//
// base64 utils (TODO: move to common in the future)
//
static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
static inline bool is_base64(uint8_t c)
{
return (isalnum(c) || (c == '+') || (c == '/'));
}
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
{
int i = 0;
int j = 0;
int in_ = 0;
int in_len = encoded_string.size();
uint8_t char_array_4[4];
uint8_t char_array_3[3];
std::vector<uint8_t> ret;
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
{
char_array_4[i++] = encoded_string[in_]; in_++;
if (i == 4)
{
for (i = 0; i <4; i++)
{
char_array_4[i] = base64_chars.find(char_array_4[i]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (i = 0; (i < 3); i++)
{
ret.push_back(char_array_3[i]);
}
i = 0;
}
}
if (i)
{
for (j = i; j <4; j++)
{
char_array_4[j] = 0;
}
for (j = 0; j <4; j++)
{
char_array_4[j] = base64_chars.find(char_array_4[j]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (j = 0; (j < i - 1); j++)
{
ret.push_back(char_array_3[j]);
}
}
return ret;
}

25
backend/go/bark/Makefile Normal file
View File

@@ -0,0 +1,25 @@
INCLUDE_PATH := $(abspath ./)
LIBRARY_PATH := $(abspath ./)
AR?=ar
BUILD_TYPE?=
# keep the standard at C++17
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../sources/bark.cpp/examples -I$(INCLUDE_PATH)/../../../sources/bark.cpp/spm-headers -I$(INCLUDE_PATH)/../../../sources/bark.cpp -O3 -DNDEBUG -std=c++17 -fPIC
LDFLAGS = -L$(LIBRARY_PATH) -L$(LIBRARY_PATH)/../../../sources/bark.cpp/build/examples -lbark -lstdc++ -lm
# warnings
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
gobark.o:
$(CXX) $(CXXFLAGS) gobark.cpp -o gobark.o -c $(LDFLAGS)
libbark.a: gobark.o
cp $(INCLUDE_PATH)/../../../sources/bark.cpp/build/libbark.a ./
$(AR) rcs libbark.a gobark.o
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml.c.o
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-alloc.c.o
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-backend.c.o
clean:
rm -f gobark.o libbark.a

View File

@@ -0,0 +1,85 @@
#include <iostream>
#include <tuple>
#include "bark.h"
#include "gobark.h"
#include "common.h"
#include "ggml.h"
struct bark_context *c;
void bark_print_progress_callback(struct bark_context *bctx, enum bark_encoding_step step, int progress, void *user_data) {
if (step == bark_encoding_step::SEMANTIC) {
printf("\rGenerating semantic tokens... %d%%", progress);
} else if (step == bark_encoding_step::COARSE) {
printf("\rGenerating coarse tokens... %d%%", progress);
} else if (step == bark_encoding_step::FINE) {
printf("\rGenerating fine tokens... %d%%", progress);
}
fflush(stdout);
}
int load_model(char *model) {
// initialize bark context
struct bark_context_params ctx_params = bark_context_default_params();
bark_params params;
params.model_path = model;
// ctx_params.verbosity = verbosity;
ctx_params.progress_callback = bark_print_progress_callback;
ctx_params.progress_callback_user_data = nullptr;
struct bark_context *bctx = bark_load_model(params.model_path.c_str(), ctx_params, params.seed);
if (!bctx) {
fprintf(stderr, "%s: Could not load model\n", __func__);
return 1;
}
c = bctx;
return 0;
}
int tts(char *text,int threads, char *dst ) {
ggml_time_init();
const int64_t t_main_start_us = ggml_time_us();
// generate audio
if (!bark_generate_audio(c, text, threads)) {
fprintf(stderr, "%s: An error occurred. If the problem persists, feel free to open an issue to report it.\n", __func__);
return 1;
}
const float *audio_data = bark_get_audio_data(c);
if (audio_data == NULL) {
fprintf(stderr, "%s: Could not get audio data\n", __func__);
return 1;
}
const int audio_arr_size = bark_get_audio_data_size(c);
std::vector<float> audio_arr(audio_data, audio_data + audio_arr_size);
write_wav_on_disk(audio_arr, dst);
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
const int64_t t_load_us = bark_get_load_time(c);
const int64_t t_eval_us = bark_get_eval_time(c);
printf("\n\n");
printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
printf("%s: eval time = %8.2f ms\n", __func__, t_eval_us / 1000.0f);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
}
return 0;
}
int unload() {
bark_free(c);
return 0; // declared to return int, so report success explicitly
}

52
backend/go/bark/gobark.go Normal file
View File

@@ -0,0 +1,52 @@
package main
// #cgo CXXFLAGS: -I${SRCDIR}/../../../sources/bark.cpp/ -I${SRCDIR}/../../../sources/bark.cpp/encodec.cpp -I${SRCDIR}/../../../sources/bark.cpp/examples -I${SRCDIR}/../../../sources/bark.cpp/spm-headers
// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../sources/bark.cpp/build/examples -L${SRCDIR}/../../../sources/bark.cpp/build/encodec.cpp/ -lbark -lencodec -lcommon
// #include <gobark.h>
// #include <stdlib.h>
import "C"
import (
"fmt"
"unsafe"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type Bark struct {
base.SingleThread
threads int
}
func (sd *Bark) Load(opts *pb.ModelOptions) error {
sd.threads = int(opts.Threads)
modelFile := C.CString(opts.ModelFile)
defer C.free(unsafe.Pointer(modelFile))
ret := C.load_model(modelFile)
if ret != 0 {
return fmt.Errorf("inference failed")
}
return nil
}
func (sd *Bark) TTS(opts *pb.TTSRequest) error {
t := C.CString(opts.Text)
defer C.free(unsafe.Pointer(t))
dst := C.CString(opts.Dst)
defer C.free(unsafe.Pointer(dst))
threads := C.int(sd.threads)
ret := C.tts(t, threads, dst)
if ret != 0 {
return fmt.Errorf("inference failed")
}
return nil
}
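Both Load and TTS above rely on the standard cgo ownership pattern: every Go string crossing into C is copied with C.CString and must be freed by the caller, since Go's garbage collector does not track C allocations. A self-contained sketch of just that pattern, with a hypothetical C stub standing in for load_model/tts:

package main

/*
#include <stdlib.h>
// hypothetical stub standing in for load_model / tts
static int consume(const char *s) { return s == NULL; }
*/
import "C"

import "unsafe"

func main() {
	s := C.CString("hello") // malloc'd copy; Go's GC does not manage it
	defer C.free(unsafe.Pointer(s))
	_ = C.consume(s)
}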

8
backend/go/bark/gobark.h Normal file
View File

@@ -0,0 +1,8 @@
#ifdef __cplusplus
extern "C" {
#endif
int load_model(char *model);
int tts(char *text,int threads, char *dst );
#ifdef __cplusplus
}
#endif

View File

@@ -1,7 +1,6 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
@@ -15,7 +14,7 @@ var (
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Image{}); err != nil {
if err := grpc.StartServer(*addr, &Bark{}); err != nil {
panic(err)
}
}

3
backend/go/bark/run.sh Executable file
View File

@@ -0,0 +1,3 @@
#!/bin/bash
set -ex
exec ./bark-cpp

View File

@@ -0,0 +1,135 @@
INCLUDE_PATH := $(abspath ./)
LIBRARY_PATH := $(abspath ./)
AR?=ar
CMAKE_ARGS?=
BUILD_TYPE?=
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
# keep the standard at C++17
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
GOCMD?=go
CGO_LDFLAGS?=
# Avoid parent make file overwriting CGO_LDFLAGS which is needed for hipblas
CGO_LDFLAGS_SYCL=
GO_TAGS?=
LD_FLAGS?=
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DSD_CUDA=ON
# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# to CMAKE_ARGS automatically
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DSD_HIPBLAS=ON
# On OSX, Metal is disabled unless the build type is metal; with metal we
# enable it and embed the metal library (-DGGML_METAL_EMBED_LIBRARY=ON)
else ifeq ($(OS),Darwin)
ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DSD_METAL=OFF
else
CMAKE_ARGS+=-DSD_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
TARGET+=--target ggml-metal
endif
endif
ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DSD_SYCL=ON \
-DGGML_SYCL_F16=ON
CC=icx
CXX=icpx
CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
endif
ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DSD_SYCL=ON
CC=icx
CXX=icpx
CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
endif
# warnings
# CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
# Find all .a archives in ARCHIVE_DIR
# (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
GGML_ARCHIVE_DIR := build/ggml/src/
ALL_ARCHIVES := $(shell find $(GGML_ARCHIVE_DIR) -type f -name '*.a')
# Name of the single merged library
COMBINED_LIB := libggmlall.a
# Rule to merge all the .a files into one
$(COMBINED_LIB): $(ALL_ARCHIVES)
@echo "Merging all .a into $(COMBINED_LIB)"
rm -f $@
mkdir -p merge-tmp
for a in $(ALL_ARCHIVES); do \
( cd merge-tmp && ar x ../$$a ); \
done
( cd merge-tmp && ar rcs ../$@ *.o )
# Ensure we have a proper index
ranlib $@
# Clean up
rm -rf merge-tmp
build/libstable-diffusion.a:
@echo "Building SD with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
cmake --build . --config Release"
else
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
cmake --build . --config Release
endif
$(MAKE) $(COMBINED_LIB)
gosd.o:
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
else
$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
endif
libsd.a: gosd.o
cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
$(AR) rcs libsd.a gosd.o
stablediffusion-ggml:
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
ifneq ($(UPX),)
$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
endif
clean:
rm -rf gosd.o libsd.a build $(COMBINED_LIB)

View File

@@ -0,0 +1,231 @@
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <iostream>
#include <random>
#include <string>
#include <vector>
#include "gosd.h"
// #include "preprocessing.hpp"
#include "flux.hpp"
#include "stable-diffusion.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_STATIC
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#define STB_IMAGE_WRITE_STATIC
#include "stb_image_write.h"
#define STB_IMAGE_RESIZE_IMPLEMENTATION
#define STB_IMAGE_RESIZE_STATIC
#include "stb_image_resize.h"
// Names of the sampler methods, same order as enum sample_method in stable-diffusion.h
const char* sample_method_str[] = {
"euler_a",
"euler",
"heun",
"dpm2",
"dpm++2s_a",
"dpm++2m",
"dpm++2mv2",
"ipndm",
"ipndm_v",
"lcm",
"ddim_trailing",
"tcd",
};
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
const char* schedule_str[] = {
"default",
"discrete",
"karras",
"exponential",
"ays",
"gits",
};
sd_ctx_t* sd_c;
sample_method_t sample_method;
int load_model(char *model, char* options[], int threads, int diff) {
fprintf (stderr, "Loading model!\n");
char *stableDiffusionModel = "";
if (diff == 1 ) {
stableDiffusionModel = model;
model = "";
}
// decode options. Options are in the form optname:optval, or just optname for booleans.
char *clip_l_path = "";
char *clip_g_path = "";
char *t5xxl_path = "";
char *vae_path = "";
char *scheduler = "";
char *sampler = "";
// If options is not NULL, parse options
for (int i = 0; options[i] != NULL; i++) {
char *optname = strtok(options[i], ":");
char *optval = strtok(NULL, ":");
if (optval == NULL) {
optval = "true";
}
if (!strcmp(optname, "clip_l_path")) {
clip_l_path = optval;
}
if (!strcmp(optname, "clip_g_path")) {
clip_g_path = optval;
}
if (!strcmp(optname, "t5xxl_path")) {
t5xxl_path = optval;
}
if (!strcmp(optname, "vae_path")) {
vae_path = optval;
}
if (!strcmp(optname, "scheduler")) {
scheduler = optval;
}
if (!strcmp(optname, "sampler")) {
sampler = optval;
}
}
int sample_method_found = -1;
for (int m = 0; m < N_SAMPLE_METHODS; m++) {
if (!strcmp(sampler, sample_method_str[m])) {
sample_method_found = m;
}
}
if (sample_method_found == -1) {
fprintf(stderr, "Invalid sample method, default to EULER_A!\n");
sample_method_found = EULER_A;
}
sample_method = (sample_method_t)sample_method_found;
int schedule_found = -1;
for (int d = 0; d < N_SCHEDULES; d++) {
if (!strcmp(scheduler, schedule_str[d])) {
schedule_found = d;
fprintf (stderr, "Found scheduler: %s\n", scheduler);
}
}
if (schedule_found == -1) {
fprintf (stderr, "Invalid scheduler! using DEFAULT\n");
schedule_found = DEFAULT;
}
schedule_t schedule = (schedule_t)schedule_found;
fprintf (stderr, "Creating context\n");
sd_ctx_t* sd_ctx = new_sd_ctx(model,
clip_l_path,
clip_g_path,
t5xxl_path,
stableDiffusionModel,
vae_path,
"",
"",
"",
"",
"",
false,
false,
false,
threads,
SD_TYPE_COUNT,
STD_DEFAULT_RNG,
schedule,
false,
false,
false,
false);
if (sd_ctx == NULL) {
fprintf (stderr, "failed loading model (generic error)\n");
return 1;
}
fprintf (stderr, "Created context: OK\n");
sd_c = sd_ctx;
return 0;
}
int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed , char *dst, float cfg_scale) {
sd_image_t* results;
std::vector<int> skip_layers = {7, 8, 9};
fprintf (stderr, "Generating image\n");
results = txt2img(sd_c,
text,
negativeText,
-1, //clip_skip
cfg_scale, // cfg_scale
3.5f,
0, // eta
width,
height,
sample_method,
steps,
seed,
1,
NULL,
0.9f,
20.f,
false,
"",
skip_layers.data(),
skip_layers.size(),
0,
0.01,
0.2);
if (results == NULL) {
fprintf (stderr, "NO results\n");
return 1;
}
if (results[0].data == NULL) {
fprintf (stderr, "Results with no data\n");
return 1;
}
fprintf (stderr, "Writing PNG\n");
fprintf (stderr, "DST: %s\n", dst);
fprintf (stderr, "Width: %d\n", results[0].width);
fprintf (stderr, "Height: %d\n", results[0].height);
fprintf (stderr, "Channel: %d\n", results[0].channel);
fprintf (stderr, "Data: %p\n", results[0].data);
stbi_write_png(dst, results[0].width, results[0].height, results[0].channel,
results[0].data, 0, NULL);
fprintf (stderr, "Saved resulting image to '%s'\n", dst);
// TODO: free results. Why does it crash?
free(results[0].data);
results[0].data = NULL;
free(results);
fprintf (stderr, "gen_image is done", dst);
return 0;
}
int unload() {
free_sd_ctx(sd_c);
return 0; // declared to return int, so report success explicitly
}

View File

@@ -0,0 +1,96 @@
package main
// #cgo CXXFLAGS: -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/ggml/include
// #cgo LDFLAGS: -L${SRCDIR}/ -lsd -lstdc++ -lm -lggmlall -lgomp
// #include <gosd.h>
// #include <stdlib.h>
import "C"
import (
"fmt"
"os"
"path/filepath"
"strings"
"unsafe"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/utils"
)
type SDGGML struct {
base.SingleThread
threads int
sampleMethod string
cfgScale float32
}
func (sd *SDGGML) Load(opts *pb.ModelOptions) error {
sd.threads = int(opts.Threads)
modelFile := C.CString(opts.ModelFile)
defer C.free(unsafe.Pointer(modelFile))
var options **C.char
// prepare the options array to pass to C
size := C.size_t(unsafe.Sizeof((*C.char)(nil)))
// allocate one extra slot for the NULL terminator expected by load_model's option loop on the C side
length := C.size_t(len(opts.Options) + 1)
options = (**C.char)(C.malloc(length * size))
view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[0 : len(opts.Options)+1 : len(opts.Options)+1]
var diffusionModel int
var oo []string
for _, op := range opts.Options {
if op == "diffusion_model" {
diffusionModel = 1
continue
}
// If it's an option path, we resolve absolute path from the model path
if strings.Contains(op, ":") && strings.Contains(op, "path") {
data := strings.Split(op, ":")
data[1] = filepath.Join(opts.ModelPath, data[1])
if err := utils.VerifyPath(data[1], opts.ModelPath); err == nil {
oo = append(oo, strings.Join(data, ":"))
}
} else {
oo = append(oo, op)
}
}
fmt.Fprintf(os.Stderr, "Options: %+v\n", oo)
for i, x := range oo {
view[i] = C.CString(x)
}
view[len(oo)] = nil // terminate the array: C.malloc does not zero memory, and the C loop stops at NULL
sd.cfgScale = opts.CFGScale
ret := C.load_model(modelFile, options, C.int(opts.Threads), C.int(diffusionModel))
if ret != 0 {
return fmt.Errorf("could not load model")
}
return nil
}
func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
t := C.CString(opts.PositivePrompt)
defer C.free(unsafe.Pointer(t))
dst := C.CString(opts.Dst)
defer C.free(unsafe.Pointer(dst))
negative := C.CString(opts.NegativePrompt)
defer C.free(unsafe.Pointer(negative))
ret := C.gen_image(t, negative, C.int(opts.Width), C.int(opts.Height), C.int(opts.Step), C.int(opts.Seed), dst, C.float(sd.cfgScale))
if ret != 0 {
return fmt.Errorf("inference failed")
}
return nil
}
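The option-handling loop in Load can be read in isolation. Below is a simplified, self-contained sketch of the same idea; resolveOptions is illustrative only, and deliberately simplifies the original (SplitN instead of Split, and no utils.VerifyPath containment check):

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// resolveOptions splits "name:value" pairs, flags the diffusion_model marker,
// and rebases *_path values onto the model directory, mirroring Load above.
func resolveOptions(modelPath string, options []string) (diffusionModel bool, resolved []string) {
	for _, op := range options {
		if op == "diffusion_model" {
			diffusionModel = true
			continue
		}
		// Options that carry a path are made absolute relative to the model dir.
		if strings.Contains(op, ":") && strings.Contains(op, "path") {
			parts := strings.SplitN(op, ":", 2)
			parts[1] = filepath.Join(modelPath, parts[1])
			resolved = append(resolved, strings.Join(parts, ":"))
			continue
		}
		resolved = append(resolved, op)
	}
	return
}

func main() {
	diff, oo := resolveOptions("/models/sd", []string{"diffusion_model", "vae_path:vae.gguf", "sampler:euler"})
	fmt.Println(diff, oo) // true [vae_path:/models/sd/vae.gguf sampler:euler]
}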

View File

@@ -0,0 +1,8 @@
#ifdef __cplusplus
extern "C" {
#endif
int load_model(char *model, char* options[], int threads, int diffusionModel);
int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed, char *dst, float cfg_scale);
#ifdef __cplusplus
}
#endif

View File

@@ -1,7 +1,6 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
@@ -15,7 +14,7 @@ var (
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Image{}); err != nil {
if err := grpc.StartServer(*addr, &SDGGML{}); err != nil {
panic(err)
}
}

View File

@@ -1,33 +0,0 @@
package main
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/stablediffusion"
)
type Image struct {
base.SingleThread
stablediffusion *stablediffusion.StableDiffusion
}
func (image *Image) Load(opts *pb.ModelOptions) error {
var err error
// Note: the Model here is a path to a directory containing the model files
image.stablediffusion, err = stablediffusion.New(opts.ModelFile)
return err
}
func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
return image.stablediffusion.GenerateImage(
int(opts.Height),
int(opts.Width),
int(opts.Mode),
int(opts.Step),
int(opts.Seed),
opts.PositivePrompt,
opts.NegativePrompt,
opts.Dst)
}

View File

@@ -1,32 +0,0 @@
package main
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/tinydream"
)
type Image struct {
base.SingleThread
tinydream *tinydream.TinyDream
}
func (image *Image) Load(opts *pb.ModelOptions) error {
var err error
// Note: the Model here is a path to a directory containing the model files
image.tinydream, err = tinydream.New(opts.ModelFile)
return err
}
func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
return image.tinydream.GenerateImage(
int(opts.Height),
int(opts.Width),
int(opts.Step),
int(opts.Seed),
opts.PositivePrompt,
opts.NegativePrompt,
opts.Dst)
}

View File

@@ -1,34 +0,0 @@
package main
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
bert "github.com/go-skynet/go-bert.cpp"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type Embeddings struct {
base.SingleThread
bert *bert.Bert
}
func (llm *Embeddings) Load(opts *pb.ModelOptions) error {
model, err := bert.New(opts.ModelFile)
llm.bert = model
return err
}
func (llm *Embeddings) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
if len(opts.EmbeddingTokens) > 0 {
tokens := []int{}
for _, t := range opts.EmbeddingTokens {
tokens = append(tokens, int(t))
}
return llm.bert.TokenEmbeddings(tokens, bert.SetThreads(int(opts.Threads)))
}
return llm.bert.Embeddings(opts.Embeddings, bert.SetThreads(int(opts.Threads)))
}

View File

@@ -1,21 +0,0 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Embeddings{}); err != nil {
panic(err)
}
}

View File

@@ -1,204 +0,0 @@
package main
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"fmt"
"github.com/go-skynet/go-llama.cpp"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type LLM struct {
base.SingleThread
llama *llama.LLama
}
func (llm *LLM) Load(opts *pb.ModelOptions) error {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
llamaOpts := []llama.ModelOption{
llama.WithRopeFreqBase(ropeFreqBase),
llama.WithRopeFreqScale(ropeFreqScale),
}
if opts.NGQA != 0 {
llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
}
if opts.RMSNormEps != 0 {
llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
}
if opts.ContextSize != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
}
if opts.F16Memory {
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
}
if opts.Embeddings {
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
}
if opts.NGPULayers != 0 {
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
}
llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
if opts.NBatch != 0 {
llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
} else {
llamaOpts = append(llamaOpts, llama.SetNBatch(512))
}
if opts.NUMA {
llamaOpts = append(llamaOpts, llama.EnableNUMA)
}
if opts.LowVRAM {
llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
}
model, err := llama.New(opts.ModelFile, llamaOpts...)
llm.llama = model
return err
}
func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
predictOptions := []llama.PredictOption{
llama.SetTemperature(opts.Temperature),
llama.SetTopP(opts.TopP),
llama.SetTopK(int(opts.TopK)),
llama.SetTokens(int(opts.Tokens)),
llama.SetThreads(int(opts.Threads)),
llama.WithGrammar(opts.Grammar),
llama.SetRopeFreqBase(ropeFreqBase),
llama.SetRopeFreqScale(ropeFreqScale),
llama.SetNegativePromptScale(opts.NegativePromptScale),
llama.SetNegativePrompt(opts.NegativePrompt),
}
if opts.PromptCacheAll {
predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
}
if opts.PromptCacheRO {
predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
}
// Expected absolute path
if opts.PromptCachePath != "" {
predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
}
if opts.Mirostat != 0 {
predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
}
if opts.MirostatETA != 0 {
predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
}
if opts.MirostatTAU != 0 {
predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
}
if opts.Debug {
predictOptions = append(predictOptions, llama.Debug)
}
predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
if opts.PresencePenalty != 0 {
predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
}
if opts.NKeep != 0 {
predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
}
if opts.Batch != 0 {
predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
}
if opts.F16KV {
predictOptions = append(predictOptions, llama.EnableF16KV)
}
if opts.IgnoreEOS {
predictOptions = append(predictOptions, llama.IgnoreEOS)
}
if opts.Seed != 0 {
predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
}
//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
return predictOptions
}
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
}
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
predictOptions := buildPredictOptions(opts)
predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
results <- token
return true
}))
go func() {
_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
if err != nil {
fmt.Println("err: ", err)
}
close(results)
}()
return nil
}
func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
predictOptions := buildPredictOptions(opts)
if len(opts.EmbeddingTokens) > 0 {
tokens := []int{}
for _, t := range opts.EmbeddingTokens {
tokens = append(tokens, int(t))
}
return llm.llama.TokenEmbeddings(tokens, predictOptions...)
}
return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
}

View File

@@ -1,19 +0,0 @@
package main
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
panic(err)
}
}

View File

@@ -58,6 +58,9 @@ func (llm *LLM) Load(opts *pb.ModelOptions) error {
if opts.Embeddings {
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
}
if opts.Reranking {
llamaOpts = append(llamaOpts, llama.EnableReranking)
}
if opts.NGPULayers != 0 {
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
}
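On the client side, the counterpart of this hunk is the new Reranking flag on ModelOptions (field 71 in the proto change earlier in this diff). A minimal sketch, with the field names assumed from the usual generated bindings and a hypothetical model file:

package main

import (
	"fmt"

	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)

func main() {
	opts := &pb.ModelOptions{
		ModelFile: "bge-reranker-v2-m3.gguf", // hypothetical reranker model
		Reranking: true,                      // wired to llama.EnableReranking in the hunk above
	}
	fmt.Printf("%+v\n", opts)
}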

View File

@@ -311,12 +311,16 @@ func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error)
}
func isNormalized(k []float32) bool {
var sum float32
var sum float64
for _, v := range k {
sum += v
v64 := float64(v)
sum += v64*v64
}
return sum == 1.0
s := math.Sqrt(sum)
return s >= 0.99 && s <= 1.01
}
// TODO: This we could replace with handwritten SIMD code
@@ -328,7 +332,7 @@ func normalizedCosineSimilarity(k1, k2 []float32) float32 {
dot += k1[i] * k2[i]
}
assert(dot >= -1 && dot <= 1, fmt.Sprintf("dot = %f", dot))
assert(dot >= -1.01 && dot <= 1.01, fmt.Sprintf("dot = %f", dot))
// 2.0 * (1.0 - dot) would be the Euclidean distance
return dot
@@ -418,7 +422,7 @@ func cosineSimilarity(k1, k2 []float32, mag1 float64) float32 {
sim := float32(dot / (mag1 * math.Sqrt(mag2)))
assert(sim >= -1 && sim <= 1, fmt.Sprintf("sim = %f", sim))
assert(sim >= -1.01 && sim <= 1.01, fmt.Sprintf("sim = %f", sim))
return sim
}
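Assembled into a runnable form, the corrected check from this hunk looks as follows. The old version compared a float32 running sum to exactly 1.0, which almost never holds after rounding; the fix accumulates squared components in float64 and accepts a small tolerance on the resulting magnitude:

package main

import (
	"fmt"
	"math"
)

// isNormalized reports whether k has (approximately) unit length.
func isNormalized(k []float32) bool {
	var sum float64
	for _, v := range k {
		v64 := float64(v)
		sum += v64 * v64
	}
	s := math.Sqrt(sum)
	return s >= 0.99 && s <= 1.01
}

func main() {
	v := []float32{0.6, 0.8} // 0.36 + 0.64 = 1.0
	fmt.Println(isNormalized(v)) // true
}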

Some files were not shown because too many files have changed in this diff