fix(ci): install latest git

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-24 08:38:02 -04:00 · 2024-10-24 14:55:24 +02:00
371 changed files with 11396 additions and 10408 deletions
--- a/Requests/model
+++ b/Requests/model
@@ -1,11 +0,0 @@
 meta {
  name: model delete
  type: http
  seq: 7
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
  body: none
  auth: none
 }
--- a/Requests/transcription/gb1.ogg
+++ b/Requests/transcription/gb1.ogg
--- a/Requests/transcription/transcribe.bru
+++ b/Requests/transcription/transcribe.bru
@@ -1,16 +0,0 @@
 meta {
  name: transcribe
  type: http
  seq: 1
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/audio/transcriptions
  body: multipartForm
  auth: none
 }
 body:multipart-form {
  file: @file(transcription/gb1.ogg)
  model: whisper-1
 }
--- a/.env
+++ b/.env
@@ -82,15 +82,6 @@
 # Enable to allow p2p mode
 # LOCALAI_P2P=true
 # Enable to use federated mode
 # LOCALAI_FEDERATED=true
 # Enable to start federation server
 # FEDERATED_SERVER=true
 # Define to use federation token
 # TOKEN=""
 ### Watchdog settings
 ###
 # Enables watchdog to kill backends that are inactive for too long
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1 @@
 *.sh text eol=lf
 backend/cpp/llama/*.hpp linguist-vendored
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -1,15 +1,6 @@
 enhancements:
 - head-branch: ['^feature', 'feature']
 dependencies:
 - any:
  - changed-files:
    - any-glob-to-any-file: 'Makefile'
  - changed-files:
    - any-glob-to-any-file: '*.mod'
  - changed-files:
    - any-glob-to-any-file: '*.sum'
 kind/documentation:
 - any:
  - changed-files:
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -12,14 +12,23 @@ jobs:
          - repository: "ggerganov/llama.cpp"
            variable: "CPPLLAMA_VERSION"
            branch: "master"
          - repository: "go-skynet/go-ggml-transformers.cpp"
            variable: "GOGGMLTRANSFORMERS_VERSION"
            branch: "master"
          - repository: "donomii/go-rwkv.cpp"
            variable: "RWKV_VERSION"
            branch: "main"
          - repository: "ggerganov/whisper.cpp"
            variable: "WHISPER_CPP_VERSION"
            branch: "master"
-          - repository: "PABannier/bark.cpp"
+          - repository: "go-skynet/go-bert.cpp"
-            variable: "BARKCPP_VERSION"
+            variable: "BERT_VERSION"
            branch: "master"
          - repository: "go-skynet/bloomz.cpp"
            variable: "BLOOMZ_VERSION"
            branch: "main"
-          - repository: "leejet/stable-diffusion.cpp"
+          - repository: "mudler/go-ggllm.cpp"
-            variable: "STABLEDIFFUSION_GGML_VERSION"
+            variable: "GOGGLLM_VERSION"
            branch: "master"
          - repository: "mudler/go-stable-diffusion"
            variable: "STABLEDIFFUSION_VERSION"
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -23,7 +23,7 @@ jobs:
          sudo pip install --upgrade pip
          pip install huggingface_hub
      - name: 'Setup yq'
-        uses: dcarbone/install-yq-action@v1.3.1
+        uses: dcarbone/install-yq-action@v1.1.1
        with:
          version: 'v4.44.2'
          download-compressed: true
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -33,7 +33,7 @@ jobs:
        run: |
          CGO_ENABLED=0 make build-api
      - name: rm
-        uses: appleboy/ssh-action@v1.2.0
+        uses: appleboy/ssh-action@v1.1.0
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
            rm: true
            target: ./local-ai
      - name: restarting
-        uses: appleboy/ssh-action@v1.2.0
+        uses: appleboy/ssh-action@v1.1.0
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -15,7 +15,7 @@ jobs:
    strategy:
      matrix:
        include:
-          - base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
+          - base-image: intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04
            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64'
    runs-on: ${{matrix.runs-on}}
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -280,7 +280,6 @@ jobs:
      makeflags: ${{ matrix.makeflags }}
      latest-image: ${{ matrix.latest-image }}
      latest-image-aio: ${{ matrix.latest-image-aio }}
      skip-drivers: ${{ matrix.skip-drivers }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -302,7 +301,6 @@ jobs:
            latest-image: 'latest-cpu'
            latest-image-aio: 'latest-aio-cpu'
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -314,7 +312,6 @@ jobs:
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -326,7 +323,6 @@ jobs:
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -338,7 +334,6 @@ jobs:
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -349,7 +344,6 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'vulkan'
            platforms: 'linux/amd64'
@@ -360,45 +354,4 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
 #  parallel-builds:
 #    uses: ./.github/workflows/image_build.yml
 #    with:
 #      tag-latest: ${{ matrix.tag-latest }}
 #      tag-suffix: ${{ matrix.tag-suffix }}
 #      ffmpeg: ${{ matrix.ffmpeg }}
 #      image-type: ${{ matrix.image-type }}
 #      build-type: ${{ matrix.build-type }}
 #      cuda-major-version: ${{ matrix.cuda-major-version }}
 #      cuda-minor-version: ${{ matrix.cuda-minor-version }}
 #      platforms: ${{ matrix.platforms }}
 #      runs-on: ${{ matrix.runs-on }}
 #      aio: ${{ matrix.aio }}
 #      base-image: ${{ matrix.base-image }}
 #      grpc-base-image: ${{ matrix.grpc-base-image }}
 #      makeflags: ${{ matrix.makeflags }}
 #      latest-image: ${{ matrix.latest-image }}
 #      latest-image-aio: ${{ matrix.latest-image-aio }}
 #      skip-drivers: ${{ matrix.skip-drivers }}
 #    secrets:
 #      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
 #      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
 #      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
 #      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
 #    strategy:
 #      matrix:
 #        include:
 #          - build-type: 'cublas'
 #            cuda-major-version: "12"
 #            cuda-minor-version: "0"
 #            platforms: 'linux/arm64'
 #            tag-latest: 'false'
 #            tag-suffix: '-nvidia-l4t-arm64-core'
 #            latest-image: 'latest-nvidia-l4t-arm64-core'
 #            ffmpeg: 'true'
 #            image-type: 'core'
 #            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
 #            runs-on: 'self-hosted'
 #            makeflags: "--jobs=4 --output-sync=target"
 #            skip-drivers: 'true'
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -49,10 +49,6 @@ on:
        description: 'FFMPEG'
        default: ''
        type: string
      skip-drivers:
        description: 'Skip drivers by default'
        default: 'false'
        type: string
      image-type:
        description: 'Image type'
        default: ''
@@ -238,7 +234,6 @@ jobs:
            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
            GRPC_VERSION=v1.65.0
            MAKEFLAGS=${{ inputs.makeflags }}
            SKIP_DRIVERS=${{ inputs.skip-drivers }}
          context: .
          file: ./Dockerfile
          cache-from: type=gha
@@ -267,7 +262,6 @@ jobs:
            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
            GRPC_VERSION=v1.65.0
            MAKEFLAGS=${{ inputs.makeflags }}
            SKIP_DRIVERS=${{ inputs.skip-drivers }}
          context: .
          file: ./Dockerfile
          cache-from: type=gha
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.0
+        uses: securego/gosec@v2.21.4
        with:
          # we let the report content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -105,6 +105,14 @@ jobs:
  tests-parler-tts:
    runs-on: ubuntu-latest
    steps:
      - name: Force Install GIT latest
        run: |
          sudo apt-get update \
          && sudo apt-get install -y software-properties-common \
          && sudo apt-get update \
          && sudo add-apt-repository -y ppa:git-core/ppa \
          && sudo apt-get update \
          && sudo apt-get install -y git
      - name: Clone
        uses: actions/checkout@v4
        with:
@@ -123,13 +131,6 @@ jobs:
        run: |
           make --jobs=5 --output-sync=target -C backend/python/parler-tts
           make --jobs=5 --output-sync=target -C backend/python/parler-tts test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
          limit-access-to-actor: true
  tests-openvoice:
    runs-on: ubuntu-latest
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -224,7 +224,7 @@ jobs:
      - name: Dependencies
        run: |
          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
-          pip install --user --no-cache-dir grpcio-tools
+          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,6 @@
 /sources/
 __pycache__/
 *.a
 *.o
 get-sources
 prepare-sources
 /backend/cpp/llama/grpc-server
@@ -13,6 +12,7 @@ prepare-sources
 go-ggml-transformers
 go-gpt2
 go-rwkv
 whisper.cpp
 /bloomz
 go-bert
--- a/12
+++ b/12
@@ -85,8 +85,7 @@ WORKDIR /build
 # The requirements-extras target is for any builds with IMAGE_TYPE=extras. It should not be placed in this target unless every IMAGE_TYPE=extras build will use it
 FROM requirements-core AS requirements-extras
-# Install uv as a system package
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
 RUN curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=/usr/bin sh
 ENV PATH="/root/.cargo/bin:${PATH}"
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
@@ -115,13 +114,12 @@ FROM requirements-${IMAGE_TYPE} AS requirements-drivers
 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=12
 ARG CUDA_MINOR_VERSION=0
 ARG SKIP_DRIVERS=false
 ENV BUILD_TYPE=${BUILD_TYPE}
 # Vulkan requirements
 RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+    if [ "${BUILD_TYPE}" = "vulkan" ]; then
        apt-get update && \
        apt-get install -y  --no-install-recommends \
            software-properties-common pciutils wget gpg-agent && \
@@ -137,7 +135,7 @@ EOT
 # CuBLAS requirements
 RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+    if [ "${BUILD_TYPE}" = "cublas" ]; then
        apt-get update && \
        apt-get install -y  --no-install-recommends \
            software-properties-common pciutils
@@ -163,7 +161,7 @@ RUN <<EOT bash
 EOT
 # If we are building with clblas support, we need the libraries for the builds
-RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
        apt-get update && \
        apt-get install -y --no-install-recommends \
            libclblast-dev && \
@@ -171,7 +169,7 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
        rm -rf /var/lib/apt/lists/* \
    ; fi
-RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
        apt-get update && \
        apt-get install -y --no-install-recommends \
            hipblas-dev \
--- a/188
+++ b/188
@@ -8,15 +8,23 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=504af20ee4eae72080a56d59d744f6774f7901ce
+CPPLLAMA_VERSION?=0a1c750c80147687df267114c81956757cc14382
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
+WHISPER_CPP_VERSION?=0fbaac9c891055796456df7b9122a70c220f9ca1
 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
 BERT_VERSION?=710044b124545415f555e4260d16b146c725a6e4
 # go-piper version
 PIPER_REPO?=https://github.com/mudler/go-piper
-PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
+PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759
 # stablediffusion version
 STABLEDIFFUSION_REPO?=https://github.com/mudler/go-stable-diffusion
@@ -26,18 +34,6 @@ STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f
 TINYDREAM_REPO?=https://github.com/M0Rf30/go-tiny-dream
 TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057
 # bark.cpp
 BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
 BARKCPP_VERSION?=v1.0.0
 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
 STABLEDIFFUSION_GGML_VERSION?=dcf91f9e0f2cbf9da472ee2a556751ed4bab2d2a
 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
 ONNX_OS?=linux
 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
 export CMAKE_ARGS?=
@@ -49,7 +45,6 @@ CGO_LDFLAGS_WHISPER+=-lggml
 CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
 BUILD_ID?=
 NATIVE?=false
 TEST_DIR=/tmp/test
@@ -88,25 +83,7 @@ ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
 # IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif
 # Detect if we are running on arm64
 ifneq (,$(findstring aarch64,$(shell uname -m)))
 	ONNX_ARCH=aarch64
 endif
 ifeq ($(OS),Darwin)
 	ONNX_OS=osx
 	ifneq (,$(findstring aarch64,$(shell uname -m)))
 		ONNX_ARCH=arm64
 	else ifneq (,$(findstring arm64,$(shell uname -m)))
 		ONNX_ARCH=arm64
 	else
 		ONNX_ARCH=x86_64
 	endif
 	ifeq ($(OSX_SIGNING_IDENTITY),)
 		OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
@@ -161,10 +138,10 @@ ifeq ($(BUILD_TYPE),hipblas)
 	export CC=$(ROCM_HOME)/llvm/bin/clang
 	# llama-ggml has no hipblas support, so override it here.
 	export STABLE_BUILD_TYPE=
-	export GGML_HIP=1
+	export GGML_HIPBLAS=1
 	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
 	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
-	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
 	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
 endif
@@ -202,23 +179,16 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 endif
 ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
 ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
 ifeq ($(ONNX_OS),linux)
 ifeq ($(ONNX_ARCH),x64)
 	ALL_GRPC_BACKENDS+=backend-assets/grpc/bark-cpp
 	ALL_GRPC_BACKENDS+=backend-assets/grpc/stablediffusion-ggml
 endif
 endif
 ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store
 ALL_GRPC_BACKENDS+=backend-assets/grpc/silero-vad
 ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)
 # Use filter-out to remove the specified backends
 ALL_GRPC_BACKENDS := $(filter-out $(SKIP_GRPC_BACKEND),$(ALL_GRPC_BACKENDS))
@@ -239,6 +209,19 @@ endif
 all: help
 ## BERT embeddings
 sources/go-bert.cpp:
 	mkdir -p sources/go-bert.cpp
 	cd sources/go-bert.cpp && \
 	git init && \
 	git remote add origin $(BERT_REPO) && \
 	git fetch origin && \
 	git checkout $(BERT_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
 sources/go-bert.cpp/libgobert.a: sources/go-bert.cpp
 	$(MAKE) -C sources/go-bert.cpp libgobert.a
 ## go-llama.cpp
 sources/go-llama.cpp:
 	mkdir -p sources/go-llama.cpp
@@ -252,23 +235,6 @@ sources/go-llama.cpp:
 sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
 	$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
 ## bark.cpp
 sources/bark.cpp:
 	git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
 	cd sources/bark.cpp && \
 	git checkout $(BARKCPP_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
 sources/bark.cpp/build/libbark.a: sources/bark.cpp
 	cd sources/bark.cpp && \
 	mkdir -p build && \
 	cd build && \
 	cmake $(CMAKE_ARGS) .. && \
 	cmake --build . --config Release
 backend/go/bark/libbark.a: sources/bark.cpp/build/libbark.a
 	$(MAKE) -C backend/go/bark libbark.a
 ## go-piper
 sources/go-piper:
 	mkdir -p sources/go-piper
@@ -282,7 +248,21 @@ sources/go-piper:
 sources/go-piper/libpiper_binding.a: sources/go-piper
 	$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
-## stable diffusion (onnx)
+
 ## RWKV
 sources/go-rwkv.cpp:
 	mkdir -p sources/go-rwkv.cpp
 	cd sources/go-rwkv.cpp && \
 	git init && \
 	git remote add origin $(RWKV_REPO) && \
 	git fetch origin && \
 	git checkout $(RWKV_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
 sources/go-rwkv.cpp/librwkv.a: sources/go-rwkv.cpp
 	cd sources/go-rwkv.cpp && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..
 ## stable diffusion
 sources/go-stable-diffusion:
 	mkdir -p sources/go-stable-diffusion
 	cd sources/go-stable-diffusion && \
@@ -295,38 +275,6 @@ sources/go-stable-diffusion:
 sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
 	CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
 ## stablediffusion (ggml)
 sources/stablediffusion-ggml.cpp:
 	git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \
 	cd sources/stablediffusion-ggml.cpp && \
 	git checkout $(STABLEDIFFUSION_GGML_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
 backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
 	$(MAKE) -C backend/go/image/stablediffusion-ggml build/libstable-diffusion.a
 	$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
 backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/stablediffusion-ggml
 endif
 sources/onnxruntime:
 	mkdir -p sources/onnxruntime
 	curl -L https://github.com/microsoft/onnxruntime/releases/download/v$(ONNX_VERSION)/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz -o sources/onnxruntime/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz
 	cd sources/onnxruntime && tar -xvf onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz && rm onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz
 	cd sources/onnxruntime && mv onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION)/* ./
 backend-assets/lib/libonnxruntime.so.1: backend-assets/lib sources/onnxruntime
 	cp -rfv sources/onnxruntime/lib/* backend-assets/lib/
 ifeq ($(OS),Darwin)
 	mv backend-assets/lib/libonnxruntime.$(ONNX_VERSION).dylib backend-assets/lib/libonnxruntime.dylib
 else
 	mv backend-assets/lib/libonnxruntime.so.$(ONNX_VERSION) backend-assets/lib/libonnxruntime.so.1
 endif
 ## tiny-dream
 sources/go-tiny-dream:
 	mkdir -p sources/go-tiny-dream
@@ -353,19 +301,23 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
-get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
+get-sources: sources/go-llama.cpp sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
 replace:
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert.cpp
 	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
 	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/donomii/go-rwkv.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-bert.cpp
 	$(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
@@ -378,8 +330,10 @@ prepare-sources: get-sources replace
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C sources/go-llama.cpp clean
 	$(MAKE) -C sources/go-rwkv.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-stable-diffusion clean
 	$(MAKE) -C sources/go-bert.cpp clean
 	$(MAKE) -C sources/go-piper clean
 	$(MAKE) -C sources/go-tiny-dream clean
 	$(MAKE) build
@@ -394,9 +348,7 @@ clean: ## Remove build related file
 	rm -rf release/
 	rm -rf backend-assets/*
 	$(MAKE) -C backend/cpp/grpc clean
 	$(MAKE) -C backend/go/bark clean
 	$(MAKE) -C backend/cpp/llama clean
 	$(MAKE) -C backend/go/image/stablediffusion-ggml clean
 	rm -rf backend/cpp/llama-* || true
 	$(MAKE) dropreplace
 	$(MAKE) protogen-clean
@@ -487,6 +439,8 @@ test-models/testmodel.ggml:
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
 	wget -q https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
 	wget -q https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
 	cp tests/models_fixtures/* test-models
 prepare-test: grpcs
@@ -739,6 +693,13 @@ backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_bindin
 backend-assets/grpc: protogen-go replace
 	mkdir -p backend-assets/grpc
 backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert.cpp LIBRARY_PATH=$(CURDIR)/sources/go-bert.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/bert-embeddings
 endif
 backend-assets/grpc/huggingface: backend-assets/grpc
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/
 ifneq ($(UPX),)
@@ -798,6 +759,10 @@ backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc backend/cpp/llama/ll
 	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
 # TODO: every binary should have its own folder instead, so each can have a different metal implementation
 ifeq ($(BUILD_TYPE),metal)
 	cp backend/cpp/llama-fallback/llama.cpp/build/bin/default.metallib backend-assets/grpc/
 endif
 backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-cuda
@@ -810,7 +775,7 @@ backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc backend/cpp/llama/lla
 	cp -rf backend/cpp/llama backend/cpp/llama-hipblas
 	$(MAKE) -C backend/cpp/llama-hipblas purge
 	$(info ${GREEN}I llama-cpp build info:hipblas${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
+	BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas
 backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc backend/cpp/llama/llama.cpp
@@ -845,13 +810,6 @@ ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/llama-ggml
 endif
 backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/bark-cpp
 endif
 backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
 	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
@@ -859,6 +817,13 @@ ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/piper
 endif
 backend-assets/grpc/rwkv: sources/go-rwkv.cpp sources/go-rwkv.cpp/librwkv.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv.cpp LIBRARY_PATH=$(CURDIR)/sources/go-rwkv.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/rwkv
 endif
 backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
@@ -866,13 +831,6 @@ ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/stablediffusion
 endif
 backend-assets/grpc/silero-vad: backend-assets/grpc backend-assets/lib/libonnxruntime.so.1
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/onnxruntime/include/" LIBRARY_PATH=$(CURDIR)/backend-assets/lib \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/silero-vad ./backend/go/vad/silero
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/silero-vad
 endif
 backend-assets/grpc/tinydream: sources/go-tiny-dream sources/go-tiny-dream/libtinydream.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
@@ -933,7 +891,7 @@ docker-aio-all:
 docker-image-intel:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -941,7 +899,7 @@ docker-image-intel:
 docker-image-intel-xpu:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
--- a/README.md
+++ b/README.md
@@ -38,13 +38,9 @@
 </a>
 </p>
 <p align="center">
 <a href="https://trendshift.io/repositories/1484" target="_blank"><img src="https://trendshift.io/api/badge/repositories/1484" alt="go-skynet%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
 </p>
 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
 >
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) 
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/go-skynet/LocalAI/tree/master/examples/) 
 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
@@ -60,17 +56,14 @@ curl https://localai.io/install.sh | sh
 Or run with docker:
 ```bash
 # CPU only image:
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
 # Nvidia GPU:
 docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
 # CPU and GPU image (bigger size):
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
 # AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 # Alternative images:
 # - if you have an Nvidia GPU:
 # docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
 # - without preconfigured models
 # docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
 # - without preconfigured models for Nvidia GPUs
 # docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12 
 ```
 To load models:
@@ -92,10 +85,6 @@ local-ai run oci://localai/phi-2:latest
 ## 📰 Latest project news
 - Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
 - Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
 - Nov 2024: Voice activity detection models (**VAD**) added to the API: https://github.com/mudler/LocalAI/pull/4204
 - Oct 2024: examples moved to [LocalAI-examples](https://github.com/mudler/LocalAI-examples)
 - Aug 2024:  🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
 - July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
 - June 2024: 🆕 You can now browse the model gallery without LocalAI! Check out https://models.localai.io
@@ -126,10 +115,10 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
 ## 🚀 [Features](https://localai.io/features/)
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
+- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
 - 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
 - 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation](https://localai.io/features/image-generation)
+- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
 - 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/) 
 - 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
 - ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
@@ -137,7 +126,6 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
 - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
 - 📈 [Reranker API](https://localai.io/features/reranker/)
 - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
 - 🔊 Voice activity detection (Silero-VAD support)
 - 🌍 Integrated WebUI!
 ## 💻 Usage
@@ -160,7 +148,6 @@ Model galleries
 Other:
 - Helm chart https://github.com/go-skynet/helm-charts
 - VSCode extension https://github.com/badgooooor/localai-vscode-plugin
 - Langchain: https://python.langchain.com/docs/integrations/providers/localai/
 - Terminal utility https://github.com/djcopley/ShellOracle
 - Local Smart assistant https://github.com/mudler/LocalAGI
 - Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
@@ -168,9 +155,6 @@ Other:
 - Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
 - Shell-Pilot (interact with LLMs using LocalAI models via pure shell scripts on your Linux or macOS system) https://github.com/reid41/shell-pilot
 - Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
 - Another Telegram Bot https://github.com/JackBekket/Hellper
 - Auto-documentation https://github.com/JackBekket/Reflexia
 - GitHub bot which answers issues, using code and documentation as context https://github.com/JackBekket/GitHelper
 - Github Actions: https://github.com/marketplace/actions/start-localai
 - Examples: https://github.com/mudler/LocalAI/tree/master/examples/
@@ -245,6 +229,7 @@ LocalAI couldn't have been built without the help of great software already avai
 - https://github.com/antimatter15/alpaca.cpp
 - https://github.com/EdVince/Stable-Diffusion-NCNN
 - https://github.com/ggerganov/whisper.cpp
 - https://github.com/saharNooby/rwkv.cpp
 - https://github.com/rhasspy/piper
 ## 🤗 Contributors
--- a/aio/cpu/embeddings.yaml
+++ b/aio/cpu/embeddings.yaml
@@ -1,7 +1,7 @@
 name: text-embedding-ada-002
-embeddings: true
+backend: bert-embeddings
 parameters:
-  model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
+  model: huggingface://mudler/all-MiniLM-L6-v2/ggml-model-q4_0.bin
 usage: |
    You can test this model with curl like this:
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -28,8 +28,6 @@ service Backend {
  rpc Rerank(RerankRequest) returns (RerankResult) {}
  rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
  rpc VAD(VADRequest) returns (VADResponse) {}
 }
 // Define the empty request
@@ -159,7 +157,6 @@ message Reply {
  bytes message = 1;
  int32 tokens = 2;
  int32 prompt_tokens = 3;
  bytes audio = 5;
 }
 message ModelOptions {
@@ -236,16 +233,6 @@ message ModelOptions {
  bool FlashAttention = 56;
  bool NoKVOffload = 57;
  string ModelPath = 59;
  repeated string LoraAdapters = 60;
  repeated float LoraScales = 61;
  repeated string Options = 62;
  string CacheTypeKey = 63;
  string CacheTypeValue = 64;
 }
 message Result {
@@ -301,19 +288,6 @@ message TTSRequest {
  optional string language = 5;
 }
 // Request payload of the VAD rpc: the audio to analyse, as a flat list of
 // float samples.
 // NOTE(review): sample rate / channel layout are not visible here — confirm with the backend.
 message VADRequest {
  repeated float audio = 1;
 }
 // One detected segment, delimited by a start and an end position.
 // NOTE(review): presumably expressed in seconds — confirm against the VAD backend.
 message VADSegment {
  float start = 1;
  float end = 2;
 }
 // Response payload of the VAD rpc: the list of detected segments.
 message VADResponse {
  repeated VADSegment segments = 1;
 }
 message SoundGenerationRequest {
  string text = 1;
  string model = 2;
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -22,7 +22,7 @@ else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we also have to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIP=ON
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
 else ifeq ($(OS),Darwin)
@@ -30,7 +30,9 @@ else ifeq ($(OS),Darwin)
 		CMAKE_ARGS+=-DGGML_METAL=OFF
 	else
 		CMAKE_ARGS+=-DGGML_METAL=ON
-		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
+# Until this is tested properly, we disable the embedded metal file
 # as we already embed it as part of the LocalAI assets
 		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=OFF
 		TARGET+=--target ggml-metal
 	endif
 endif
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -203,7 +203,7 @@ struct llama_client_slot
    std::string stopping_word;
    // sampling
-    struct common_params_sampling sparams;
+    struct common_sampler_params sparams;
    common_sampler *ctx_sampling = nullptr;
    int32_t ga_i = 0;   // group-attention state
@@ -428,7 +428,6 @@ struct llama_server_context
 {
    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
    const llama_vocab * vocab = nullptr;
    clip_ctx *clp_ctx = nullptr;
@@ -440,7 +439,6 @@ struct llama_server_context
    bool clean_kv_cache     = true;
    bool all_slots_are_idle = false;
    bool add_bos_token      = true;
    bool has_eos_token      = true;
    int32_t n_ctx;  // total context for all clients / slots
@@ -494,8 +492,8 @@ struct llama_server_context
        }
        common_init_result common_init = common_init_from_params(params);
-        model = common_init.model.release();
+        model = common_init.model;
-        ctx = common_init.context.release();
+        ctx = common_init.context;
        if (model == nullptr)
        {
            LOG_ERR("unable to load model: %s", params.model.c_str());
@@ -504,7 +502,7 @@ struct llama_server_context
        if (multimodal) {
            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
-            const int n_embd_llm  = llama_model_n_embd(model);
+            const int n_embd_llm  = llama_n_embd(model);
            if (n_embd_clip != n_embd_llm) {
                LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
                llama_free(ctx);
@@ -513,15 +511,23 @@ struct llama_server_context
            }
        }
        vocab = llama_model_get_vocab(model);
        n_ctx = llama_n_ctx(ctx);
-        add_bos_token = llama_vocab_get_add_bos(vocab);
+        add_bos_token = llama_add_bos_token(model);
        has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
        return true;
    }
    // Checks whether llama_chat_apply_template can handle the chat template
    // shipped with the loaded model by formatting a single dummy user message.
    // When the call reports a negative result, an error is logged and
    // sparams.chat_template is overridden so that chatml is used instead.
    void validate_model_chat_template(server_params & sparams) {
        llama_chat_message chat[] = {{"user", "test"}};
        std::vector<char> buf(1);
        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
        if (res < 0) {
            // The "%s: " prefix consumes __func__; without a conversion the
            // argument was silently dropped and the caller name never logged.
            LOG_ERR("%s: The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
            sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
        }
    }
    llama_client_slot* get_active_slot() {
        for (llama_client_slot& slot : slots) {
            // Check if the slot is currently processing
@@ -656,7 +662,7 @@ struct llama_server_context
    bool launch_slot_with_data(llama_client_slot* &slot, json data) {
        slot_params default_params;
-        common_params_sampling default_sparams;
+        common_sampler_params default_sparams;
        slot->params.stream             = json_value(data, "stream",            false);
        slot->params.cache_prompt       = json_value(data, "cache_prompt",      false);
@@ -664,6 +670,7 @@ struct llama_server_context
        slot->sparams.top_k             = json_value(data, "top_k",             default_sparams.top_k);
        slot->sparams.top_p             = json_value(data, "top_p",             default_sparams.top_p);
        slot->sparams.min_p             = json_value(data, "min_p",             default_sparams.min_p);
        slot->sparams.tfs_z             = json_value(data, "tfs_z",             default_sparams.tfs_z);
        slot->sparams.typ_p             = json_value(data, "typical_p",         default_sparams.typ_p);
        slot->sparams.temp              = json_value(data, "temperature",       default_sparams.temp);
        slot->sparams.dynatemp_range    = json_value(data, "dynatemp_range",    default_sparams.dynatemp_range);
@@ -675,6 +682,7 @@ struct llama_server_context
        slot->sparams.mirostat          = json_value(data, "mirostat",          default_sparams.mirostat);
        slot->sparams.mirostat_tau      = json_value(data, "mirostat_tau",      default_sparams.mirostat_tau);
        slot->sparams.mirostat_eta      = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
        slot->sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
        slot->params.n_keep             = json_value(data, "n_keep",            slot->params.n_keep);
        slot->sparams.seed               = json_value(data, "seed",              default_sparams.seed);
        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
@@ -719,8 +727,8 @@ struct llama_server_context
            slot->prompt = "";
        }
-        if (json_value(data, "ignore_eos", false) && has_eos_token) {
+        if (json_value(data, "ignore_eos", false)) {
-                slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
+                slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
        }
        /*
        slot->sparams.penalty_prompt_tokens.clear();
@@ -759,13 +767,13 @@ struct llama_server_context
            }
        }
      */
        slot->sparams.logit_bias.clear();
        const auto &logit_bias = data.find("logit_bias");
        if (logit_bias != data.end() && logit_bias->is_array())
        {
-            const llama_vocab * vocab = llama_model_get_vocab(model);
+            const int n_vocab = llama_n_vocab(model);
            const int n_vocab = llama_vocab_n_tokens(vocab);
            for (const auto &el : *logit_bias)
            {
                if (el.is_array() && el.size() == 2)
@@ -794,7 +802,7 @@ struct llama_server_context
                    }
                    else if (el[0].is_string())
                    {
-                        auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
+                        auto toks = common_tokenize(model, el[0].get<std::string>(), false);
                        for (auto tok : toks)
                        {
                            slot->sparams.logit_bias.push_back({tok, bias});
@@ -1124,7 +1132,7 @@ struct llama_server_context
            slot.has_next_token = false;
        }
-        if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
+        if (result.tok == llama_token_eos(model))
        {
            slot.stopped_eos = true;
            slot.has_next_token = false;
@@ -1198,6 +1206,7 @@ struct llama_server_context
            {"top_k",             slot.sparams.top_k},
            {"top_p",             slot.sparams.top_p},
            {"min_p",             slot.sparams.min_p},
            {"tfs_z",             slot.sparams.tfs_z},
            {"typical_p",         slot.sparams.typ_p},
            {"repeat_last_n",     slot.sparams.penalty_last_n},
            {"repeat_penalty",    slot.sparams.penalty_repeat},
@@ -1206,12 +1215,13 @@ struct llama_server_context
            {"mirostat",          slot.sparams.mirostat},
            {"mirostat_tau",      slot.sparams.mirostat_tau},
            {"mirostat_eta",      slot.sparams.mirostat_eta},
            {"penalize_nl",       slot.sparams.penalize_nl},
            {"stop",              slot.params.antiprompt},
            {"n_predict",         slot.params.n_predict},
            {"n_keep",            params.n_keep},
            {"ignore_eos",        slot.sparams.ignore_eos},
            {"stream",            slot.params.stream},
-             //      {"logit_bias",        slot.sparams.logit_bias},
+      //      {"logit_bias",        slot.sparams.logit_bias},
            {"n_probs",           slot.sparams.n_probs},
            {"min_keep",          slot.sparams.min_keep},
            {"grammar",           slot.sparams.grammar},
@@ -1319,7 +1329,7 @@ struct llama_server_context
        res.error = false;
        res.stop = true;
-        const int n_embd = llama_model_n_embd(model);
+        const int n_embd = llama_n_embd(model);
        if (!params.embedding)
        {
            LOG_WARNING("embedding disabled", {
@@ -1418,7 +1428,7 @@ struct llama_server_context
                    n_eval = n_batch;
                }
-                const int n_embd = llama_model_n_embd(model);
+                const int n_embd = llama_n_embd(model);
                float * embd = img.image_embedding + i * n_embd;
                llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
                if (llama_decode(ctx, llava_batch.batch))
@@ -1699,11 +1709,11 @@ struct llama_server_context
                            suffix_tokens.erase(suffix_tokens.begin());
                        }
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_fim_pre(vocab));
+                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_bos(vocab)); // always add BOS
+                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
-                        prefix_tokens.insert(prefix_tokens.end(),   llama_vocab_fim_suf(vocab));
+                        prefix_tokens.insert(prefix_tokens.end(),   llama_token_suffix(model));
                        prefix_tokens.insert(prefix_tokens.end(),   suffix_tokens.begin(), suffix_tokens.end());
-                        prefix_tokens.push_back(llama_vocab_fim_mid(vocab));
+                        prefix_tokens.push_back(llama_token_middle(model));
                        prompt_tokens = prefix_tokens;
                    }
                    else
@@ -2095,6 +2105,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    //     slot->params.n_predict        = json_value(data, "n_predict",         default_params.n_predict);
    //     slot->sparams.top_k           = json_value(data, "top_k",             default_sparams.top_k);
    //     slot->sparams.top_p           = json_value(data, "top_p",             default_sparams.top_p);
    //     slot->sparams.tfs_z           = json_value(data, "tfs_z",             default_sparams.tfs_z);
    //     slot->sparams.typical_p       = json_value(data, "typical_p",         default_sparams.typical_p);
    //     slot->sparams.temp            = json_value(data, "temperature",       default_sparams.temp);
    //     slot->sparams.penalty_last_n  = json_value(data, "repeat_last_n",     default_sparams.penalty_last_n);
@@ -2104,6 +2115,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    //     slot->sparams.mirostat        = json_value(data, "mirostat",          default_sparams.mirostat);
    //     slot->sparams.mirostat_tau    = json_value(data, "mirostat_tau",      default_sparams.mirostat_tau);
    //     slot->sparams.mirostat_eta    = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
    //     slot->sparams.penalize_nl     = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
    //     slot->params.n_keep           = json_value(data, "n_keep",            slot->params.n_keep);
    //     slot->params.seed             = json_value(data, "seed",              default_params.seed);
    //     slot->sparams.grammar         = json_value(data, "grammar",           default_sparams.grammar);
@@ -2117,6 +2129,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    data["n_predict"] = predict->tokens() == 0 ? -1 : predict->tokens();
    data["top_k"] = predict->topk();
    data["top_p"] = predict->topp();
    data["tfs_z"] = predict->tailfreesamplingz();
    data["typical_p"] = predict->typicalp();
    data["temperature"] = predict->temperature();
    data["repeat_last_n"] = predict->repeat();
@@ -2126,6 +2139,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    data["mirostat"] = predict->mirostat();
    data["mirostat_tau"] = predict->mirostattau();
    data["mirostat_eta"] = predict->mirostateta();
    data["penalize_nl"] = predict->penalizenl();
    data["n_keep"] = predict->nkeep();
    data["seed"] = predict->seed();
    data["grammar"] = predict->grammar();
@@ -2162,6 +2176,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 //     llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
 //     llama.params.sparams.top_k = predict->topk();
 //     llama.params.sparams.top_p = predict->topp();
 //     llama.params.sparams.tfs_z = predict->tailfreesamplingz();
 //     llama.params.sparams.typical_p = predict->typicalp();
 //     llama.params.sparams.penalty_last_n = predict->repeat();
 //     llama.params.sparams.temp = predict->temperature();
@@ -2171,6 +2186,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 //     llama.params.sparams.mirostat = predict->mirostat();
 //     llama.params.sparams.mirostat_tau = predict->mirostattau();
 //     llama.params.sparams.mirostat_eta = predict->mirostateta();
 //     llama.params.sparams.penalize_nl = predict->penalizenl();
 //     llama.params.n_keep = predict->nkeep();
 //     llama.params.seed = predict->seed();
 //     llama.params.sparams.grammar = predict->grammar();
@@ -2217,35 +2233,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 //     }
 // }
 // ggml tensor types accepted as KV-cache types. kv_cache_type_from_str
 // resolves a name against this list and get_all_kv_cache_types prints it.
 const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,
    GGML_TYPE_F16,
    GGML_TYPE_BF16,
    GGML_TYPE_Q8_0,
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
    GGML_TYPE_IQ4_NL,
    GGML_TYPE_Q5_0,
    GGML_TYPE_Q5_1,
 };
 // Maps a type name to the matching entry of kv_cache_types by comparing it
 // with ggml_type_name() of each candidate, in list order.
 // Throws std::runtime_error when no entry carries that name.
 static ggml_type kv_cache_type_from_str(const std::string & s) {
    for (size_t idx = 0; idx < kv_cache_types.size(); ++idx) {
        const ggml_type candidate = kv_cache_types[idx];
        if (s == ggml_type_name(candidate)) {
            return candidate;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
 }
 // Builds a human-readable, ", "-separated list of the names of every entry
 // in kv_cache_types (no trailing separator).
 static std::string get_all_kv_cache_types() {
    std::ostringstream joined;
    const size_t count = kv_cache_types.size();
    for (size_t idx = 0; idx < count; ++idx) {
        joined << ggml_type_name(kv_cache_types[idx]);
        // separator goes between entries only, never after the last one
        if (idx + 1 != count) {
            joined << ", ";
        }
    }
    return joined.str();
 }
 static void params_parse(const backend::ModelOptions* request,
                                common_params & params) {
@@ -2259,12 +2246,6 @@ static void params_parse(const backend::ModelOptions* request,
    }
    //  params.model_alias ??
    params.model_alias =  request->modelfile();
    if (!request->cachetypekey().empty()) {
        params.cache_type_k = kv_cache_type_from_str(request->cachetypekey());
    }
    if (!request->cachetypevalue().empty()) {
        params.cache_type_v = kv_cache_type_from_str(request->cachetypevalue());
    }
    params.n_ctx = request->contextsize();
    //params.memory_f16 = request->f16memory();
    params.cpuparams.n_threads = request->threads();
@@ -2323,7 +2304,6 @@ static void params_parse(const backend::ModelOptions* request,
    params.use_mmap = request->mmap();
    params.flash_attn = request->flashattention();
    params.no_kv_offload = request->nokvoffload();
    params.ctx_shift = false; // We control context-shifting in any case (and we disable it as it could just lead to infinite loops)
    params.embedding = request->embeddings();
--- a/backend/cpp/llama/patches/01-llava.patch
+++ b/backend/cpp/llama/patches/01-llava.patch
@@ -1,13 +1,13 @@
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 3cd0d2fa..6c5e811a 100644
+index 342042ff..224db9b5 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
-@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
+@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
-                 struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+             struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-                 int* patches_data = (int*)malloc(ggml_nbytes(patches));
+             int* patches_data = (int*)malloc(ggml_nbytes(patches));
-                 for (int i = 0; i < num_patches; i++) {
+             for (int i = 0; i < num_patches; i++) {
-                    patches_data[i] = i + 1;
+-                patches_data[i] = i + 1;
-+                    patches_data[i] = i;
+                patches_data[i] = i;
-                 }
+             }
-                 ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
+             ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-                 free(patches_data);
+             free(patches_data);
--- a/backend/go/bark/Makefile
+++ b/backend/go/bark/Makefile
@@ -1,25 +0,0 @@
 INCLUDE_PATH := $(abspath ./)
 LIBRARY_PATH := $(abspath ./)
 AR?=ar
 BUILD_TYPE?=
 # build the wrapper as C++17 (see -std=c++17 in CXXFLAGS below)
 CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../sources/bark.cpp/examples -I$(INCLUDE_PATH)/../../../sources/bark.cpp/spm-headers -I$(INCLUDE_PATH)/../../../sources/bark.cpp -O3 -DNDEBUG -std=c++17 -fPIC
 LDFLAGS  = -L$(LIBRARY_PATH) -L$(LIBRARY_PATH)/../../../sources/bark.cpp/build/examples -lbark -lstdc++ -lm
 # warnings
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
 gobark.o:
 	$(CXX) $(CXXFLAGS) gobark.cpp -o gobark.o -c $(LDFLAGS)
 libbark.a: gobark.o
 	cp $(INCLUDE_PATH)/../../../sources/bark.cpp/build/libbark.a ./
 	$(AR) rcs libbark.a gobark.o
 	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml.c.o
 	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-alloc.c.o
 	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-backend.c.o
 clean:
 	rm -f gobark.o libbark.a
--- a/backend/go/bark/gobark.cpp
+++ b/backend/go/bark/gobark.cpp
@@ -1,85 +0,0 @@
 #include <iostream>
 #include <tuple>
 #include "bark.h"
 #include "gobark.h"
 #include "common.h"
 #include "ggml.h"
 struct bark_context *c;
 // Progress hook registered in load_model: rewrites the current console line
 // with the completion percentage of the encoding step in flight. Steps other
 // than SEMANTIC, COARSE and FINE print nothing; stdout is flushed on every
 // call. bctx and user_data are not used.
 void bark_print_progress_callback(struct bark_context *bctx, enum bark_encoding_step step, int progress, void *user_data) {
    switch (step) {
        case bark_encoding_step::SEMANTIC:
            printf("\rGenerating semantic tokens... %d%%", progress);
            break;
        case bark_encoding_step::COARSE:
            printf("\rGenerating coarse tokens... %d%%", progress);
            break;
        case bark_encoding_step::FINE:
            printf("\rGenerating fine tokens... %d%%", progress);
            break;
        default:
            break;
    }
    fflush(stdout);
 }
 // Loads the bark model stored at `model` and keeps the resulting context in
 // the file-level pointer `c` for later calls to tts()/unload().
 // Returns 0 on success, 1 when bark_load_model yields no context.
 int load_model(char *model) {
    // initialize bark context
    struct bark_context_params ctx_params = bark_context_default_params();
    bark_params params;
    params.model_path = model;
   // ctx_params.verbosity = verbosity;
    // report encoding progress on stdout; no user data is attached to the hook
    ctx_params.progress_callback = bark_print_progress_callback;
    ctx_params.progress_callback_user_data = nullptr;
    // NOTE(review): params.seed is read without being assigned in this function;
    // presumably bark_params provides a default value — confirm.
    struct bark_context *bctx = bark_load_model(params.model_path.c_str(), ctx_params, params.seed);
    if (!bctx) {
        fprintf(stderr, "%s: Could not load model\n", __func__);
        return 1;
    }
    // publish the context only after a successful load
    c = bctx;
    return 0;
 }
 // Generates audio for `text` with the context stored in `c`, using `threads`
 // threads, writes the result to the file path `dst` and prints timing figures.
 // Returns 0 on success, 1 when generation fails or no audio data is available.
 // NOTE(review): `c` is used without a null check, so load_model() presumably
 // has to succeed first — confirm with callers.
 int tts(char *text,int  threads, char *dst ) {
    ggml_time_init();
    // wall-clock start, used for the "total time" line below
    const int64_t t_main_start_us = ggml_time_us();
    // generate audio
    if (!bark_generate_audio(c, text, threads)) {
        fprintf(stderr, "%s: An error occured. If the problem persists, feel free to open an issue to report it.\n", __func__);
        return 1;
    }
    const float *audio_data = bark_get_audio_data(c);
    if (audio_data == NULL) {
        fprintf(stderr, "%s: Could not get audio data\n", __func__);
        return 1;
    }
    const int audio_arr_size = bark_get_audio_data_size(c);
    // copy the samples into a vector before handing them to the WAV writer
    std::vector<float> audio_arr(audio_data, audio_data + audio_arr_size);
    // NOTE(review): no result of write_wav_on_disk is checked here
    write_wav_on_disk(audio_arr, dst);
    // report timing
    {
        const int64_t t_main_end_us = ggml_time_us();
        const int64_t t_load_us = bark_get_load_time(c);
        const int64_t t_eval_us = bark_get_eval_time(c);
        printf("\n\n");
        // microsecond counters are printed as milliseconds
        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
        printf("%s:     eval time = %8.2f ms\n", __func__, t_eval_us / 1000.0f);
        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
    }
    return 0;
 }
 // Frees the bark context stored in `c`, if any, and resets the pointer so a
 // repeated call does not free the same context twice. Always returns 0.
 int unload() {
    if (c != nullptr) {
        bark_free(c);
        c = nullptr;
    }
    // a non-void function must return a value; falling off the end is
    // undefined behaviour in C++
    return 0;
 }
--- a/backend/go/bark/gobark.go
+++ b/backend/go/bark/gobark.go
@@ -1,52 +0,0 @@
 package main
 // #cgo CXXFLAGS: -I${SRCDIR}/../../../sources/bark.cpp/ -I${SRCDIR}/../../../sources/bark.cpp/encodec.cpp -I${SRCDIR}/../../../sources/bark.cpp/examples -I${SRCDIR}/../../../sources/bark.cpp/spm-headers
 // #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../sources/bark.cpp/build/examples -L${SRCDIR}/../../../sources/bark.cpp/build/encodec.cpp/ -lbark -lencodec -lcommon
 // #include <gobark.h>
 // #include <stdlib.h>
 import "C"
 import (
 	"fmt"
 	"unsafe"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
 // Bark is the backend type wrapping the bark.cpp C bridge. It embeds
 // base.SingleThread and remembers the thread count given at load time.
 type Bark struct {
 	base.SingleThread
 	// threads is set by Load from opts.Threads and passed to C.tts by TTS.
 	threads int
 }
 // Load stores the requested thread count and loads the model file named by
 // opts.ModelFile through the C bridge. It returns an error when the native
 // loader reports a non-zero status.
 func (sd *Bark) Load(opts *pb.ModelOptions) error {
 	sd.threads = int(opts.Threads)
 	modelFile := C.CString(opts.ModelFile)
 	// C.CString allocates on the C heap; release it once the call returns.
 	defer C.free(unsafe.Pointer(modelFile))
 	if ret := C.load_model(modelFile); ret != 0 {
 		// This is a load failure, not an inference failure: say so and name the file.
 		return fmt.Errorf("could not load model %q", opts.ModelFile)
 	}
 	return nil
 }
 // TTS hands opts.Text to the C bridge together with the thread count stored
 // by Load and the output path opts.Dst. It returns an error when the native
 // call reports a non-zero status.
 func (sd *Bark) TTS(opts *pb.TTSRequest) error {
 	text := C.CString(opts.Text)
 	defer C.free(unsafe.Pointer(text))
 	out := C.CString(opts.Dst)
 	defer C.free(unsafe.Pointer(out))
 	if rc := C.tts(text, C.int(sd.threads), out); rc != 0 {
 		return fmt.Errorf("inference failed")
 	}
 	return nil
 }
--- a/backend/go/bark/gobark.h
+++ b/backend/go/bark/gobark.h
@@ -1,8 +0,0 @@
 /* C-linkage entry points of the bark bridge, included from gobark.go via cgo. */
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Loads the model at the given path; returns 0 on success, 1 on failure. */
 int load_model(char *model);
 /* Generates audio for text with the given thread count and writes it to dst;
    returns 0 on success, 1 on failure. */
 int tts(char *text,int  threads, char *dst );
 #ifdef __cplusplus
 }
 #endif
--- a/backend/go/image/stablediffusion-ggml/Makefile
+++ b/backend/go/image/stablediffusion-ggml/Makefile
@@ -1,96 +0,0 @@
 INCLUDE_PATH := $(abspath ./)
 LIBRARY_PATH := $(abspath ./)
 AR?=ar
 CMAKE_ARGS?=
 BUILD_TYPE?=
 ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 # build the wrapper as C++17 (see -std=c++17 in CXXFLAGS below)
 CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DGGML_CUDA=ON
 # If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we also have to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DGGML_HIP=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
 else ifeq ($(OS),Darwin)
 	ifneq ($(BUILD_TYPE),metal)
 		CMAKE_ARGS+=-DGGML_METAL=OFF
 	else
 		CMAKE_ARGS+=-DGGML_METAL=ON
 		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
 		TARGET+=--target ggml-metal
 	endif
 endif
 # ifeq ($(BUILD_TYPE),sycl_f16)
 # 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
 # endif
 # ifeq ($(BUILD_TYPE),sycl_f32)
 # 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
 # endif
 # warnings
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
 # Find all .a archives in ARCHIVE_DIR
 # (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
 GGML_ARCHIVE_DIR := build/ggml/src/
 ALL_ARCHIVES := $(shell find $(GGML_ARCHIVE_DIR) -type f -name '*.a')
 # Name of the single merged library
 COMBINED_LIB := libggmlall.a
 # Rule to merge all the .a files into one
 $(COMBINED_LIB): $(ALL_ARCHIVES)
 	@echo "Merging all .a into $(COMBINED_LIB)"
 	rm -f $@
 	mkdir -p merge-tmp
 	for a in $(ALL_ARCHIVES); do \
 		( cd merge-tmp && ar x ../$$a ); \
 	done
 	( cd merge-tmp && ar rcs ../$@ *.o )
 	# Ensure we have a proper index
 	ranlib $@
 	# Clean up
 	rm -rf merge-tmp
 build/libstable-diffusion.a:
 	@echo "Building SD with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
 	mkdir -p build && \
 	cd build && \
 	cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
 	cmake --build . --config Release"
 else
 	mkdir -p build && \
 	cd build && \
 	cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
 	cmake --build . --config Release
 endif
 	$(MAKE) $(COMBINED_LIB)
 gosd.o:
 	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
 libsd.a: gosd.o
 	cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
 	$(AR) rcs libsd.a gosd.o
 clean:
 	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
--- a/backend/go/image/stablediffusion-ggml/gosd.cpp
+++ b/backend/go/image/stablediffusion-ggml/gosd.cpp
@@ -1,228 +0,0 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
 #include <iostream>
 #include <random>
 #include <string>
 #include <vector>
 #include "gosd.h"
 // #include "preprocessing.hpp"
 #include "flux.hpp"
 #include "stable-diffusion.h"
 #define STB_IMAGE_IMPLEMENTATION
 #define STB_IMAGE_STATIC
 #include "stb_image.h"
 #define STB_IMAGE_WRITE_IMPLEMENTATION
 #define STB_IMAGE_WRITE_STATIC
 #include "stb_image_write.h"
 #define STB_IMAGE_RESIZE_IMPLEMENTATION
 #define STB_IMAGE_RESIZE_STATIC
 #include "stb_image_resize.h"
 // Names of the sampler method, same order as enum sample_method in stable-diffusion.h
 // load_model indexes this table with 0..N_SAMPLE_METHODS-1 and casts the
 // matching index to sample_method_t, so it needs one entry per enum value.
 const char* sample_method_str[] = {
    "euler_a",
    "euler",
    "heun",
    "dpm2",
    "dpm++2s_a",
    "dpm++2m",
    "dpm++2mv2",
    "ipndm",
    "ipndm_v",
    "lcm",
 };
 // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
 // load_model indexes this table with 0..N_SCHEDULES-1 and casts the matching
 // index to schedule_t, so it needs one entry per enum value.
 const char* schedule_str[] = {
    "default",
    "discrete",
    "karras",
    "exponential",
    "ays",
    "gits",
 };
 // Context created by load_model; read by gen_image and released by unload.
 sd_ctx_t* sd_c;
 // Sampler selected by load_model from the "sampler" option; read by gen_image.
 sample_method_t sample_method;
 // load_model decodes the backend options and creates the global
 // stable-diffusion context (sd_c) that gen_image uses afterwards.
 //
 //   model    path of the model file.
 //   options  NULL-terminated array of "optname:optval" strings; a bare
 //            "optname" is read as "optname:true". May be NULL. Entries are
 //            split in place with strtok, so the strings are modified and the
 //            decoded values point into them.
 //   threads  thread count forwarded to new_sd_ctx.
 //   diff     when 1, `model` is passed as the fifth argument of new_sd_ctx
 //            and the first argument is left empty.
 //
 // Returns 0 on success and 1 when new_sd_ctx fails.
 int load_model(char *model, char* options[], int threads, int diff) {
    fprintf (stderr, "Loading model!\n");
    // const pointers: several of these refer to string literals, which must
    // not be bound to a plain char* in C++.
    const char *model_path = model;
    const char *stableDiffusionModel = "";
    if (diff == 1 ) {
        stableDiffusionModel = model;
        model_path = "";
    }
    // decode options. Options are in form optname:optval, or if booleans only optname.
    const char *clip_l_path  = "";
    const char *clip_g_path  = "";
    const char *t5xxl_path  = "";
    const char *vae_path  = "";
    const char *scheduler = "";
    const char *sampler = "";
    // If options is not NULL, parse options
    for (int i = 0; options != NULL && options[i] != NULL; i++) {
        char *optname = strtok(options[i], ":");
        if (optname == NULL) {
            // empty entry (or only separators): nothing to decode
            continue;
        }
        const char *optval = strtok(NULL, ":");
        if (optval == NULL) {
            optval = "true";
        }
        if (!strcmp(optname, "clip_l_path")) {
            clip_l_path = optval;
        } else if (!strcmp(optname, "clip_g_path")) {
            clip_g_path = optval;
        } else if (!strcmp(optname, "t5xxl_path")) {
            t5xxl_path = optval;
        } else if (!strcmp(optname, "vae_path")) {
            vae_path = optval;
        } else if (!strcmp(optname, "scheduler")) {
            scheduler = optval;
        } else if (!strcmp(optname, "sampler")) {
            sampler = optval;
        }
    }
    // Map the sampler name to its enum index; unknown names fall back to EULER_A.
    int sample_method_found = -1;
    for (int m = 0; m < N_SAMPLE_METHODS; m++) {
        if (!strcmp(sampler, sample_method_str[m])) {
            sample_method_found = m;
        }
    }
    if (sample_method_found == -1) {
        fprintf(stderr, "Invalid sample method, default to EULER_A!\n");
        sample_method_found = EULER_A;
    }
    sample_method = (sample_method_t)sample_method_found;
    // Map the scheduler name to its enum index; unknown names fall back to DEFAULT.
    int schedule_found            = -1;
    for (int d = 0; d < N_SCHEDULES; d++) {
        if (!strcmp(scheduler, schedule_str[d])) {
            schedule_found = d;
            fprintf (stderr, "Found scheduler: %s\n", scheduler);
        }
    }
    if (schedule_found == -1) {
        fprintf (stderr, "Invalid scheduler! using DEFAULT\n");
        schedule_found = DEFAULT;
    }
    schedule_t schedule = (schedule_t)schedule_found;
    fprintf (stderr, "Creating context\n");
    sd_ctx_t* sd_ctx = new_sd_ctx(model_path,
                                  clip_l_path,
                                  clip_g_path,
                                  t5xxl_path,
                                  stableDiffusionModel,
                                  vae_path,
                                  "",
                                  "",
                                  "",
                                  "",
                                  "",
                                  false,
                                  false,
                                  false,
                                  threads,
                                  SD_TYPE_COUNT,
                                  STD_DEFAULT_RNG,
                                  schedule,
                                  false,
                                  false,
                                  false,
                                  false);
    if (sd_ctx == NULL) {
        fprintf (stderr, "failed loading model (generic error)\n");
        return 1;
    }
    fprintf (stderr, "Created context: OK\n");
    sd_c = sd_ctx;
    return 0;
 }
 // gen_image runs txt2img on the context created by load_model and writes the
 // first returned image to `dst` as a PNG.
 //
 //   text, negativeText  positive and negative prompt.
 //   width, height       requested image size.
 //   steps               number of sampling steps.
 //   seed                RNG seed forwarded to txt2img.
 //   dst                 output file path.
 //   cfg_scale           guidance scale forwarded to txt2img.
 //
 // Returns 0 on success. Returns 1 when no context is loaded, when txt2img
 // yields no image data, or when the PNG cannot be written.
 int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed , char *dst, float cfg_scale) {
    if (sd_c == NULL) {
        fprintf (stderr, "gen_image called before a model was loaded\n");
        return 1;
    }
    sd_image_t* results;
    std::vector<int> skip_layers = {7, 8, 9};
    fprintf (stderr, "Generating image\n");
    results = txt2img(sd_c,
                      text,
                      negativeText,
                      -1, //clip_skip
                      cfg_scale, // cfg_scale
                      3.5f,
                      width,
                      height,
                      sample_method,
                      steps,
                      seed,
                      1,
                      NULL,
                      0.9f,
                      20.f,
                      false,
                      "",
                      skip_layers.data(),
                      skip_layers.size(),
                      0,
                      0.01,
                      0.2);
    if (results == NULL) {
        fprintf (stderr, "NO results\n");
        return 1;
    }
    if (results[0].data == NULL) {
        fprintf (stderr, "Results with no data\n");
        // the result array itself was still allocated; do not leak it
        free(results);
        return 1;
    }
    fprintf (stderr, "Writing PNG\n");
    fprintf (stderr, "DST: %s\n", dst);
    fprintf (stderr, "Width: %d\n", results[0].width);
    fprintf (stderr, "Height: %d\n", results[0].height);
    fprintf (stderr, "Channel: %d\n", results[0].channel);
    fprintf (stderr, "Data: %p\n", results[0].data);
    // stbi_write_png returns 0 on failure; remember it so a failed write is
    // not reported as success.
    int written = stbi_write_png(dst, results[0].width, results[0].height, results[0].channel,
                                 results[0].data, 0, NULL);
    // Release the pixel buffer and the result array. Only the first entry is
    // released - presumably a single image is requested above; confirm if the
    // txt2img arguments change.
    free(results[0].data);
    results[0].data = NULL;
    free(results);
    if (written == 0) {
        fprintf (stderr, "Failed writing image to '%s'\n", dst);
        return 1;
    }
    fprintf (stderr, "Saved resulting image to '%s'\n", dst);
    fprintf (stderr, "gen_image is done\n");
    return 0;
 }
 // unload releases the context created by load_model, if there is one, and
 // clears the global pointer so it cannot be used or freed a second time.
 // Always returns 0.
 int unload() {
    if (sd_c != NULL) {
        free_sd_ctx(sd_c);
        sd_c = NULL;
    }
    return 0;
 }
--- a/backend/go/image/stablediffusion-ggml/gosd.go
+++ b/backend/go/image/stablediffusion-ggml/gosd.go
@@ -1,96 +0,0 @@
 package main
 // #cgo CXXFLAGS: -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/ggml/include
 // #cgo LDFLAGS: -L${SRCDIR}/ -lsd -lstdc++ -lm -lggmlall -lgomp
 // #include <gosd.h>
 // #include <stdlib.h>
 import "C"
 import (
 	"fmt"
 	"os"
 	"path/filepath"
 	"strings"
 	"unsafe"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/utils"
 )
 // SDGGML is the backend type whose methods call the C functions load_model
 // and gen_image declared in gosd.h.
 type SDGGML struct {
 	base.SingleThread
 	// threads is set from opts.Threads in Load.
 	threads      int
 	// sampleMethod is not assigned by the methods visible in this file.
 	sampleMethod string
 	// cfgScale is set from opts.CFGScale in Load and passed to gen_image.
 	cfgScale     float32
 }
 // Load prepares the option list and asks the C side to load the model.
 //
 // The bare option "diffusion_model" is not forwarded; it makes load_model
 // receive diffusionModel=1. Options of the form "name:value" whose text
 // contains "path" have their value joined onto opts.ModelPath and are dropped
 // when utils.VerifyPath rejects the result. All other options are forwarded
 // unchanged.
 //
 // It returns an error when the options array cannot be allocated or when
 // load_model reports a non-zero status.
 func (sd *SDGGML) Load(opts *pb.ModelOptions) error {
 	sd.threads = int(opts.Threads)
 	sd.cfgScale = opts.CFGScale
 	modelFile := C.CString(opts.ModelFile)
 	defer C.free(unsafe.Pointer(modelFile))
 	var diffusionModel int
 	oo := make([]string, 0, len(opts.Options))
 	for _, op := range opts.Options {
 		if op == "diffusion_model" {
 			diffusionModel = 1
 			continue
 		}
 		// If it's an option path, we resolve absolute path from the model path
 		if strings.Contains(op, ":") && strings.Contains(op, "path") {
 			data := strings.Split(op, ":")
 			data[1] = filepath.Join(opts.ModelPath, data[1])
 			if err := utils.VerifyPath(data[1], opts.ModelPath); err == nil {
 				oo = append(oo, strings.Join(data, ":"))
 			}
 		} else {
 			oo = append(oo, op)
 		}
 	}
 	fmt.Fprintf(os.Stderr, "Options: %+v\n", oo)
 	// prepare the options array to pass to C.
 	// load_model walks the array until it finds a NULL entry, so size it from
 	// the filtered list plus one terminator slot, and use calloc so that every
 	// slot starts out as NULL.
 	n := len(oo) + 1
 	size := C.size_t(unsafe.Sizeof((*C.char)(nil)))
 	options := (**C.char)(C.calloc(C.size_t(n), size))
 	if options == nil {
 		return fmt.Errorf("could not allocate options array")
 	}
 	view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[:n:n]
 	for i, x := range oo {
 		view[i] = C.CString(x)
 	}
 	view[len(oo)] = nil
 	// NOTE(review): the option strings and the array are intentionally not
 	// freed here, as before - confirm whether the C side keeps pointers into
 	// them after load_model returns before adding a free.
 	ret := C.load_model(modelFile, options, C.int(opts.Threads), C.int(diffusionModel))
 	if ret != 0 {
 		return fmt.Errorf("could not load model")
 	}
 	return nil
 }
 // GenerateImage converts the request strings to C strings, calls gen_image
 // with the request's size, step count and seed plus the cfg scale stored by
 // Load, and frees the C strings on return. A non-zero status from gen_image
 // is reported as an error.
 func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
 	prompt := C.CString(opts.PositivePrompt)
 	defer C.free(unsafe.Pointer(prompt))
 	output := C.CString(opts.Dst)
 	defer C.free(unsafe.Pointer(output))
 	negPrompt := C.CString(opts.NegativePrompt)
 	defer C.free(unsafe.Pointer(negPrompt))
 	status := C.gen_image(
 		prompt,
 		negPrompt,
 		C.int(opts.Width),
 		C.int(opts.Height),
 		C.int(opts.Step),
 		C.int(opts.Seed),
 		output,
 		C.float(sd.cfgScale),
 	)
 	if status == 0 {
 		return nil
 	}
 	return fmt.Errorf("inference failed")
 }
--- a/backend/go/image/stablediffusion-ggml/gosd.h
+++ b/backend/go/image/stablediffusion-ggml/gosd.h
@@ -1,8 +0,0 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Creates the global context. `options` is a NULL-terminated array of
 // "optname:optval" strings. Returns 0 on success, non-zero on failure.
 int load_model(char *model, char* options[], int threads, int diffusionModel);
 // Generates one image and writes it to `dst`. Returns 0 on success, non-zero on failure.
 int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed, char *dst, float cfg_scale);
 // Releases the context created by load_model. Declared here so the definition
 // in gosd.cpp gets C linkage and can be called from cgo.
 int unload(void);
 #ifdef __cplusplus
 }
 #endif
--- a/backend/go/image/stablediffusion-ggml/main.go
+++ b/backend/go/image/stablediffusion-ggml/main.go
@@ -1,20 +0,0 @@
 package main
 // Note: this is started internally by LocalAI and a server is allocated for each model
 import (
 	"flag"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 // addr is the value of the -addr flag; main passes it to grpc.StartServer.
 var addr = flag.String("addr", "localhost:50051", "the address to connect to")
 // main parses the command line and serves an SDGGML backend on addr,
 // panicking if the server returns an error.
 func main() {
 	flag.Parse()
 	err := grpc.StartServer(*addr, &SDGGML{})
 	if err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/llm/bert/bert.go
+++ b/backend/go/llm/bert/bert.go
@@ -0,0 +1,34 @@
 package main
 // This is a wrapper to satisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	bert "github.com/go-skynet/go-bert.cpp"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
 // Embeddings is the backend type that computes embeddings with a bert model.
 type Embeddings struct {
 	base.SingleThread
 	// bert is the model handle returned by bert.New in Load.
 	bert *bert.Bert
 }
 // Load opens the model at opts.ModelFile with bert.New, stores the returned
 // handle on the receiver and returns bert.New's error unchanged.
 func (llm *Embeddings) Load(opts *pb.ModelOptions) error {
 	var err error
 	llm.bert, err = bert.New(opts.ModelFile)
 	return err
 }
 // Embeddings returns the embedding vector for the request. When
 // opts.EmbeddingTokens is non-empty the tokens are converted to int and
 // embedded via TokenEmbeddings; otherwise the text in opts.Embeddings is
 // embedded. opts.Threads is forwarded as the thread option in both cases.
 func (llm *Embeddings) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
 	if len(opts.EmbeddingTokens) == 0 {
 		return llm.bert.Embeddings(opts.Embeddings, bert.SetThreads(int(opts.Threads)))
 	}
 	ids := make([]int, len(opts.EmbeddingTokens))
 	for i, tok := range opts.EmbeddingTokens {
 		ids[i] = int(tok)
 	}
 	return llm.bert.TokenEmbeddings(ids, bert.SetThreads(int(opts.Threads)))
 }
--- a/backend/go/llm/bert/main.go
+++ b/backend/go/llm/bert/main.go
@@ -1,6 +1,7 @@
 package main
 // Note: this is started internally by LocalAI and a server is allocated for each model
 import (
 	"flag"
@@ -14,7 +15,7 @@ var (
 func main() {
 	flag.Parse()
-	if err := grpc.StartServer(*addr, &Bark{}); err != nil {
+	if err := grpc.StartServer(*addr, &Embeddings{}); err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/vad/silero/main.go
+++ b/backend/go/vad/silero/main.go
@@ -15,7 +15,7 @@ var (
 func main() {
 	flag.Parse()
-	if err := grpc.StartServer(*addr, &VAD{}); err != nil {
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/llm/rwkv/rwkv.go
+++ b/backend/go/llm/rwkv/rwkv.go
@@ -0,0 +1,95 @@
 package main
 // This is a wrapper to satisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"fmt"
 	"path/filepath"
 	"github.com/donomii/go-rwkv.cpp"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
 // tokenizerSuffix is appended to the model file's base name by Load to derive
 // the tokenizer file name when opts.Tokenizer is empty.
 const tokenizerSuffix = ".tokenizer.json"
 // LLM is the backend type that runs predictions on an rwkv model.
 type LLM struct {
 	base.SingleThread
 	// rwkv is the state returned by rwkv.LoadFiles in Load.
 	rwkv *rwkv.RwkvState
 }
 // Load loads the model and its tokenizer. The tokenizer file name is
 // opts.Tokenizer, or the model's base name plus tokenizerSuffix when that is
 // empty, and it is looked up in the directory of opts.ModelFile. An error is
 // returned when rwkv.LoadFiles yields nil.
 func (llm *LLM) Load(opts *pb.ModelOptions) error {
 	tokenizerName := opts.Tokenizer
 	if tokenizerName == "" {
 		tokenizerName = filepath.Base(opts.ModelFile) + tokenizerSuffix
 	}
 	tokenizerPath := filepath.Join(filepath.Dir(opts.ModelFile), tokenizerName)
 	state := rwkv.LoadFiles(opts.ModelFile, tokenizerPath, uint32(opts.GetThreads()))
 	if state == nil {
 		return fmt.Errorf("rwkv could not load model")
 	}
 	llm.rwkv = state
 	return nil
 }
 // Predict feeds opts.Prompt to the model and returns the generated response.
 // Generation stops at the first entry of opts.StopPrompts, or at a newline
 // when none is given. An error from ProcessInput is returned as-is.
 func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
 	stop := "\n"
 	if len(opts.StopPrompts) != 0 {
 		stop = opts.StopPrompts[0]
 	}
 	err := llm.rwkv.ProcessInput(opts.Prompt)
 	if err != nil {
 		return "", err
 	}
 	return llm.rwkv.GenerateResponse(int(opts.Tokens), stop, float32(opts.Temperature), float32(opts.TopP), nil), nil
 }
 // PredictStream starts a goroutine that feeds opts.Prompt to the model and
 // sends each generated piece on results. Generation stops at the first entry
 // of opts.StopPrompts, or at a newline when none is given.
 //
 // results is closed when the goroutine finishes, including when ProcessInput
 // fails, so a receiver waiting on the channel is always released. The method
 // itself returns nil immediately; a ProcessInput error is only printed.
 func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	go func() {
 		// Close on every exit path: previously the early return on error
 		// left the channel open and the receiver blocked forever.
 		defer close(results)
 		stopWord := "\n"
 		if len(opts.StopPrompts) > 0 {
 			stopWord = opts.StopPrompts[0]
 		}
 		if err := llm.rwkv.ProcessInput(opts.Prompt); err != nil {
 			fmt.Println("Error processing input: ", err)
 			return
 		}
 		llm.rwkv.GenerateResponse(int(opts.Tokens), stopWord, float32(opts.Temperature), float32(opts.TopP), func(s string) bool {
 			results <- s
 			return true
 		})
 	}()
 	return nil
 }
 // TokenizeString encodes opts.Prompt with the model's tokenizer and returns
 // the token IDs as int32 together with their count. An encoding error is
 // returned with an empty response.
 func (llm *LLM) TokenizeString(opts *pb.PredictOptions) (pb.TokenizationResponse, error) {
 	encoded, err := llm.rwkv.Tokenizer.Encode(opts.Prompt)
 	if err != nil {
 		return pb.TokenizationResponse{}, err
 	}
 	ids := make([]int32, 0, len(encoded))
 	for _, tok := range encoded {
 		ids = append(ids, int32(tok.ID))
 	}
 	return pb.TokenizationResponse{
 		Length: int32(len(encoded)),
 		Tokens: ids,
 	}, nil
 }
--- a/backend/go/vad/silero/vad.go
+++ b/backend/go/vad/silero/vad.go
@@ -1,54 +0,0 @@
 package main
 // This is a wrapper to satisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"fmt"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/streamer45/silero-vad-go/speech"
 )
 // VAD is the backend type that detects speech segments with a silero detector.
 type VAD struct {
 	base.SingleThread
 	// detector is the speech detector created by Load.
 	detector *speech.Detector
 }
 // Load creates the silero detector for the model at opts.ModelFile using a
 // fixed configuration (sample rate 16000, threshold 0.5, minimum silence
 // 100 ms, speech padding 30 ms) and stores it on the receiver. A creation
 // failure is returned wrapped.
 func (vad *VAD) Load(opts *pb.ModelOptions) error {
 	cfg := speech.DetectorConfig{
 		ModelPath:  opts.ModelFile,
 		SampleRate: 16000,
 		//WindowSize:           1024,
 		Threshold:            0.5,
 		MinSilenceDurationMs: 100,
 		SpeechPadMs:          30,
 	}
 	detector, err := speech.NewDetector(cfg)
 	if err != nil {
 		return fmt.Errorf("create silero detector: %w", err)
 	}
 	vad.detector = detector
 	return nil
 }
 // VAD runs the detector over req.Audio and returns one VADSegment per
 // detected segment, carrying its start and end as float32. A detection
 // failure is returned wrapped, with an empty response.
 func (vad *VAD) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
 	detected, err := vad.detector.Detect(req.Audio)
 	if err != nil {
 		return pb.VADResponse{}, fmt.Errorf("detect: %w", err)
 	}
 	out := make([]*pb.VADSegment, 0, len(detected))
 	for i := range detected {
 		out = append(out, &pb.VADSegment{
 			Start: float32(detected[i].SpeechStartAt),
 			End:   float32(detected[i].SpeechEndAt),
 		})
 	}
 	return pb.VADResponse{Segments: out}, nil
 }
--- a/backend/python/autogptq/requirements-intel.txt
+++ b/backend/python/autogptq/requirements-intel.txt
@@ -1,6 +1,5 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.69.0
+grpcio==1.67.0
 protobuf
 certifi
 transformers
--- a/backend/python/bark/requirements-intel.txt
+++ b/backend/python/bark/requirements-intel.txt
@@ -1,9 +1,8 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
-torchaudio==2.3.1+cxx11.abi
+torchaudio
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.69.0
+grpcio==1.67.0
 protobuf
 certifi
--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -17,9 +17,6 @@
 # LIMIT_TARGETS="cublas12"
 # source $(dirname $0)/../common/libbackend.sh
 #
 PYTHON_VERSION="3.10"
 function init() {
    # Name of the backend (directory name)
    BACKEND_NAME=${PWD##*/}
@@ -91,7 +88,7 @@ function getBuildProfile() {
 # always result in an activated virtual environment
 function ensureVenv() {
    if [ ! -d "${EDIR}/venv" ]; then
-        uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
+        uv venv ${EDIR}/venv
        echo "virtualenv created"
    fi
--- a/backend/python/common/template/Makefile
+++ b/backend/python/common/template/Makefile
@@ -1,9 +1,8 @@
 .DEFAULT_GOAL := install
 .PHONY: install
-install:
+install: protogen
 	bash install.sh
 	$(MAKE) protogen
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
@@ -13,7 +12,7 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
-	bash protogen.sh
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
 .PHONY: clean
 clean: protogen-clean
--- a/backend/python/common/template/protogen.sh
+++ b/backend/python/common/template/protogen.sh
@@ -1,6 +0,0 @@
 #!/bin/bash
 set -e
 source $(dirname $0)/../common/libbackend.sh
 python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/common/template/requirements-intel.txt
+++ b/backend/python/common/template/requirements-intel.txt
@@ -1,5 +1,4 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,3 +1,2 @@
-grpcio==1.69.0
+grpcio==1.67.0
-protobuf
+protobuf
 grpcio-tools
--- a/backend/python/coqui/requirements-intel.txt
+++ b/backend/python/coqui/requirements-intel.txt
@@ -1,10 +1,9 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
-torchaudio==2.3.1+cxx11.abi
+torchaudio
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
 coqui-tts
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.67.0
 protobuf
 certifi
 packaging==24.1
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -17,7 +17,7 @@ import backend_pb2_grpc
 import grpc
-from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
+from diffusers import StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
    EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
 from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
@@ -247,16 +247,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        use_safetensors=True,
                        variant=variant)
            elif request.PipelineType == "FluxPipeline":
                if fromSingleFile:
                    self.pipe = FluxPipeline.from_single_file(modelFile,
                                                              torch_dtype=torchType,
                                                              use_safetensors=True)
                else:
                    self.pipe = FluxPipeline.from_pretrained(
                        request.Model,
                        torch_dtype=torch.bfloat16)
-                if request.LowVRAM:
+                    if request.LowVRAM:
-                    self.pipe.enable_model_cpu_offload()
+                        self.pipe.enable_model_cpu_offload()
            elif request.PipelineType == "FluxTransformer2DModel":
                    dtype = torch.bfloat16
                    # specify from environment or default to "ChuckMcSneed/FLUX.1-dev"
@@ -275,13 +270,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                    if request.LowVRAM:
                        self.pipe.enable_model_cpu_offload()
            elif request.PipelineType == "SanaPipeline":
                self.pipe = SanaPipeline.from_pretrained(
                    request.Model,
                    variant="bf16",
                    torch_dtype=torch.bfloat16)
                self.pipe.vae.to(torch.bfloat16)
                self.pipe.text_encoder.to(torch.bfloat16)
            if CLIPSKIP and request.CLIPSkip != 0:
                self.clip_skip = request.CLIPSkip
@@ -308,34 +296,22 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                self.pipe.controlnet = self.controlnet
            else:
                self.controlnet = None
-
+            # Assume directory from request.ModelFile.
-            if request.LoraAdapter and not os.path.isabs(request.LoraAdapter):
+            # Only if request.LoraAdapter it's not an absolute path
            if request.LoraAdapter and request.ModelFile != "" and not os.path.isabs(request.LoraAdapter) and request.LoraAdapter:
                # get base path of modelFile
                modelFileBase = os.path.dirname(request.ModelFile)
                # modify LoraAdapter to be relative to modelFileBase
-                request.LoraAdapter = os.path.join(request.ModelPath, request.LoraAdapter)
+                request.LoraAdapter = os.path.join(modelFileBase, request.LoraAdapter)
            device = "cpu" if not request.CUDA else "cuda"
            self.device = device
            if request.LoraAdapter:
                # Check if its a local file and not a directory ( we load lora differently for a safetensor file )
                if os.path.exists(request.LoraAdapter) and not os.path.isdir(request.LoraAdapter):
                    # self.load_lora_weights(request.LoraAdapter, 1, device, torchType)
                    self.pipe.load_lora_weights(request.LoraAdapter)
                else:
                    self.pipe.unet.load_attn_procs(request.LoraAdapter)
            if len(request.LoraAdapters) > 0:
                i = 0
                adapters_name = []
                adapters_weights = []
                for adapter in request.LoraAdapters:
                    if not os.path.isabs(adapter):
                        adapter = os.path.join(request.ModelPath, adapter)
                    self.pipe.load_lora_weights(adapter, adapter_name=f"adapter_{i}")
                    adapters_name.append(f"adapter_{i}")
                    i += 1
                for adapters_weight in request.LoraScales:
                    adapters_weights.append(adapters_weight)
                self.pipe.set_adapters(adapters_name, adapter_weights=adapters_weights)
            if request.CUDA:
                self.pipe.to('cuda')
@@ -416,6 +392,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        # create a dictionary of values for the parameters
        options = {
            "negative_prompt": request.negative_prompt,
            "width": request.width,
            "height": request.height,
            "num_inference_steps": steps,
        }
@@ -433,13 +411,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        keys = options.keys()
        if request.EnableParameters != "":
-            keys = [key.strip() for key in request.EnableParameters.split(",")]
+            keys = request.EnableParameters.split(",")
        if request.EnableParameters == "none":
            keys = []
        # create a dictionary of parameters by using the keys from EnableParameters and the values from defaults
-        kwargs = {key: options.get(key) for key in keys if key in options}
+        kwargs = {key: options[key] for key in keys}
        # Set seed
        if request.seed > 0:
@@ -450,12 +428,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if self.PipelineType == "FluxPipeline":
            kwargs["max_sequence_length"] = 256
        if request.width:
            kwargs["width"] = request.width
        if request.height:
            kwargs["height"] = request.height
        if self.PipelineType == "FluxTransformer2DModel":
            kwargs["output_type"] = "pil"
            kwargs["generator"] = torch.Generator("cpu").manual_seed(0)
@@ -475,7 +447,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            export_to_video(video_frames, request.dst)
            return backend_pb2.Result(message="Media generated successfully", success=True)
        print(f"Generating image with {kwargs=}", file=sys.stderr)
        image = {}
        if COMPEL:
            conditioning, pooled = self.compel.build_conditioning_tensor(prompt)
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -1,10 +1,9 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
-torchvision==0.18.1+cxx11.abi
+torchvision
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 diffusers
 opencv-python
 transformers
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.69.0
+grpcio==1.67.0
 pillow
 protobuf
 certifi
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.67.0
 protobuf
 certifi
 wheel
--- a/backend/python/mamba/requirements.txt
+++ b/backend/python/mamba/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.69.0
+grpcio==1.67.0
 protobuf
 certifi
--- a/backend/python/openvoice/requirements-cpu.txt
+++ b/backend/python/openvoice/requirements-cpu.txt
@@ -1,7 +1,3 @@
 torch==2.4.1
 git+https://github.com/myshell-ai/MeloTTS.git
-git+https://github.com/myshell-ai/OpenVoice.git
+git+https://github.com/myshell-ai/OpenVoice.git
 whisper-timestamped
 pydub==0.25.1
 wavmark==0.0.3
 eng_to_ipa==0.0.2
--- a/backend/python/openvoice/requirements-cublas11.txt
+++ b/backend/python/openvoice/requirements-cublas11.txt
@@ -1,8 +1,4 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.4.1+cu118
 git+https://github.com/myshell-ai/MeloTTS.git
-git+https://github.com/myshell-ai/OpenVoice.git
+git+https://github.com/myshell-ai/OpenVoice.git
 whisper-timestamped
 pydub==0.25.1
 wavmark==0.0.3
 eng_to_ipa==0.0.2
--- a/backend/python/openvoice/requirements-cublas12.txt
+++ b/backend/python/openvoice/requirements-cublas12.txt
@@ -1,7 +1,3 @@
 torch==2.4.1
 git+https://github.com/myshell-ai/MeloTTS.git
-git+https://github.com/myshell-ai/OpenVoice.git
+git+https://github.com/myshell-ai/OpenVoice.git
 whisper-timestamped
 pydub==0.25.1
 wavmark==0.0.3
 eng_to_ipa==0.0.2
--- a/backend/python/openvoice/requirements-hipblas.txt
+++ b/backend/python/openvoice/requirements-hipblas.txt
@@ -1,8 +1,4 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch==2.4.1+rocm6.0
 git+https://github.com/myshell-ai/MeloTTS.git
-git+https://github.com/myshell-ai/OpenVoice.git
+git+https://github.com/myshell-ai/OpenVoice.git
 whisper-timestamped
 pydub==0.25.1
 wavmark==0.0.3
 eng_to_ipa==0.0.2
--- a/backend/python/openvoice/requirements-intel.txt
+++ b/backend/python/openvoice/requirements-intel.txt
@@ -1,15 +1,14 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
 torchaudio==2.3.1+cxx11.abi
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-grpcio==1.69.0
+grpcio==1.67.0
 protobuf
 librosa==0.9.1
 faster-whisper==0.9.0
 pydub==0.25.1
 wavmark==0.0.3
 numpy==1.22.0
 eng_to_ipa==0.0.2
 inflect==7.0.0
 unidecode==1.3.7
--- a/backend/python/openvoice/requirements.txt
+++ b/backend/python/openvoice/requirements.txt
@@ -1,17 +1,20 @@
-grpcio==1.69.0
+grpcio==1.67.0
 protobuf
 librosa
 faster-whisper
 pydub==0.25.1
 wavmark==0.0.3
 numpy==1.22.0
 eng_to_ipa==0.0.2
 inflect
 unidecode
 whisper-timestamped
 openai
 python-dotenv
 pypinyin
 cn2an==0.5.22
 numpy==1.22.0
 networkx==2.8.8
 jieba==0.42.1
-gradio==5.9.1
+gradio==3.48.0
 langid==1.1.6
 llvmlite==0.43.0
 setuptools
--- a/backend/python/parler-tts/Makefile
+++ b/backend/python/parler-tts/Makefile
@@ -12,10 +12,9 @@ export SKIP_CONDA=1
 endif
 .PHONY: parler-tts
-parler-tts:
+parler-tts: protogen
 	@echo "Installing $(CONDA_ENV_PATH)..."
 	bash install.sh $(CONDA_ENV_PATH)
 	$(MAKE) protogen
 .PHONY: run
 run: protogen
@@ -37,7 +36,7 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
-	bash protogen.sh
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
 .PHONY: clean
 clean: protogen-clean
--- a/backend/python/parler-tts/install.sh
+++ b/backend/python/parler-tts/install.sh
@@ -11,18 +11,16 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
 installRequirements
 # https://github.com/descriptinc/audiotools/issues/101
 # incompatible protobuf versions.
-PYDIR=python3.10
+# PYDIR=python3.10
-pyenv="${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/"
+# pyenv="${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/"
-if [ ! -d ${pyenv} ]; then
+# if [ ! -d ${pyenv} ]; then
-    echo "(parler-tts/install.sh): Error: ${pyenv} does not exist"
+#     echo "(parler-tts/install.sh): Error: ${pyenv} does not exist"
-    exit 1
+#     exit 1
-fi
+# fi
-curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${pyenv}/builder.py
+# curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${pyenv}/builder.py
--- a/backend/python/parler-tts/protogen.sh
+++ b/backend/python/parler-tts/protogen.sh
@@ -1,6 +0,0 @@
 #!/bin/bash
 set -e
 source $(dirname $0)/../common/libbackend.sh
 python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/parler-tts/requirements-after.txt
+++ b/backend/python/parler-tts/requirements-after.txt
@@ -1,4 +1,4 @@
 git+https://github.com/huggingface/parler-tts.git@8e465f1b5fcd223478e07175cb40494d19ffbe17
 llvmlite==0.43.0
 numba==0.60.0
-grpcio-tools==1.42.0
+git+https://github.com/descriptinc/audiotools
--- a/backend/python/parler-tts/requirements-intel.txt
+++ b/backend/python/parler-tts/requirements-intel.txt
@@ -1,8 +1,8 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
-torchaudio==2.3.1+cxx11.abi
+torchaudio
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
 setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/parler-tts/requirements.txt
+++ b/backend/python/parler-tts/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.67.0
 protobuf
 certifi
-llvmlite==0.43.0
+llvmlite==0.43.0
 setuptools
--- a/backend/python/rerankers/requirements-intel.txt
+++ b/backend/python/rerankers/requirements-intel.txt
@@ -1,9 +1,8 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
 transformers
 accelerate
-torch==2.3.1+cxx11.abi
+torch
 oneccl_bind_pt==2.3.100+xpu
 rerankers[transformers]
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/rerankers/requirements.txt
+++ b/backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.69.0
+grpcio==1.67.0
 protobuf
 certifi
--- a/backend/python/sentencetransformers/requirements-cpu.txt
+++ b/backend/python/sentencetransformers/requirements-cpu.txt
@@ -2,5 +2,5 @@ torch==2.4.1
 accelerate
 transformers
 bitsandbytes
-sentence-transformers==3.3.1
+sentence-transformers==3.2.0
 transformers
--- a/backend/python/sentencetransformers/requirements-cublas11.txt
+++ b/backend/python/sentencetransformers/requirements-cublas11.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.4.1+cu118
 accelerate
-sentence-transformers==3.3.1
+sentence-transformers==3.2.0
 transformers
--- a/backend/python/sentencetransformers/requirements-cublas12.txt
+++ b/backend/python/sentencetransformers/requirements-cublas12.txt
@@ -1,4 +1,4 @@
 torch==2.4.1
 accelerate
-sentence-transformers==3.3.1
+sentence-transformers==3.2.0
 transformers
--- a/backend/python/sentencetransformers/requirements-hipblas.txt
+++ b/backend/python/sentencetransformers/requirements-hipblas.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch==2.4.1+rocm6.0
 accelerate
-sentence-transformers==3.3.1
+sentence-transformers==3.2.0
 transformers
--- a/backend/python/sentencetransformers/requirements-intel.txt
+++ b/backend/python/sentencetransformers/requirements-intel.txt
@@ -1,9 +1,8 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
 accelerate
-sentence-transformers==3.3.1
+sentence-transformers==3.2.0
 transformers
--- a/backend/python/sentencetransformers/requirements.txt
+++ b/backend/python/sentencetransformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.67.0
 protobuf
 certifi
 datasets
--- a/backend/python/transformers-musicgen/requirements-intel.txt
+++ b/backend/python/transformers-musicgen/requirements-intel.txt
@@ -1,8 +1,7 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
 transformers
 oneccl_bind_pt==2.3.100+xpu
 accelerate
-torch==2.3.1+cxx11.abi
+torch
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/transformers-musicgen/requirements.txt
+++ b/backend/python/transformers-musicgen/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.67.0
 protobuf
 scipy==1.14.0
 certifi
--- a/backend/python/transformers/requirements-intel.txt
+++ b/backend/python/transformers/requirements-intel.txt
@@ -1,7 +1,6 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
 intel-extension-for-transformers
 bitsandbytes
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.67.0
 protobuf
 certifi
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vall-e-x/requirements-intel.txt
+++ b/backend/python/vall-e-x/requirements-intel.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
 accelerate
-torch==2.3.1+cxx11.abi
+torch
-torchaudio==2.3.1+cxx11.abi
+torchaudio
 optimum[openvino]
-oneccl_bind_pt==2.3.100+xpu
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vall-e-x/requirements.txt
+++ b/backend/python/vall-e-x/requirements.txt
@@ -1,4 +1,3 @@
-grpcio==1.69.0
+grpcio==1.67.0
 protobuf
-certifi
+certifi
 setuptools
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -22,7 +22,7 @@ if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE}" == "xtrue" ]; then
            git clone https://github.com/vllm-project/vllm
        fi
        pushd vllm
-            uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.68.1 protobuf bitsandbytes
+            uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.67.0 protobuf bitsandbytes
            uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
            VLLM_TARGET_DEVICE=cpu python setup.py install
        popd
--- a/backend/python/vllm/requirements-intel.txt
+++ b/backend/python/vllm/requirements-intel.txt
@@ -1,9 +1,8 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
 accelerate
-torch==2.3.1+cxx11.abi
+torch
 transformers
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
-bitsandbytes
+bitsandbytes
 oneccl_bind_pt==2.3.100+xpu
--- a/backend/python/vllm/requirements.txt
+++ b/backend/python/vllm/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.69.0
+grpcio==1.67.0
 protobuf
 certifi
 setuptools
--- a/core/application.go
+++ b/core/application.go
@@ -0,0 +1,38 @@
 package core
 import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/services"
 	"github.com/mudler/LocalAI/pkg/model"
 )
 // Application holds pointers to all initialized services, to make plumbing easy.
 // Perhaps a proper DI system is worth it in the future, but for now keep things simple.
 //
 // Entries that are commented out are not fields of the struct; they look like
 // placeholders for services to be added later (TODO confirm).
 type Application struct {
 	// Application-Level Config
 	ApplicationConfig *config.ApplicationConfig
 	// ApplicationState *ApplicationState
 	// Core Low-Level Services
 	BackendConfigLoader *config.BackendConfigLoader
 	ModelLoader         *model.ModelLoader
 	// Backend Services
 	// EmbeddingsBackendService      *backend.EmbeddingsBackendService
 	// ImageGenerationBackendService *backend.ImageGenerationBackendService
 	// LLMBackendService             *backend.LLMBackendService
 	// TranscriptionBackendService *backend.TranscriptionBackendService
 	// TextToSpeechBackendService  *backend.TextToSpeechBackendService
 	// LocalAI System Services
 	BackendMonitorService *services.BackendMonitorService
 	GalleryService        *services.GalleryService
 	LocalAIMetricsService *services.LocalAIMetricsService
 	// OpenAIService         *services.OpenAIService
 }
 // ApplicationState is currently an empty struct with no fields.
 //
 // TODO [NEXT PR?]: Break up ApplicationConfig.
 // Migrate over stuff that is not set via config at all - especially runtime stuff
 type ApplicationState struct {
 }
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -1,39 +0,0 @@
 package application
 import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/templates"
 )
 // Application groups the application configuration together with the backend
 // config loader, model loader and templates evaluator. All fields are
 // unexported; the accessor methods on *Application expose them read-only.
 type Application struct {
 	backendLoader      *config.BackendConfigLoader
 	modelLoader        *model.ModelLoader
 	applicationConfig  *config.ApplicationConfig
 	templatesEvaluator *templates.Evaluator
 }
 // newApplication returns an Application for appConfig.
 //
 // The backend config loader, the model loader and the templates evaluator are
 // each constructed from appConfig.ModelPath; the appConfig pointer itself is
 // stored as given (no copy is made). appConfig must be non-nil, because its
 // ModelPath field is dereferenced here.
 func newApplication(appConfig *config.ApplicationConfig) *Application {
 	return &Application{
 		backendLoader:      config.NewBackendConfigLoader(appConfig.ModelPath),
 		modelLoader:        model.NewModelLoader(appConfig.ModelPath),
 		applicationConfig:  appConfig,
 		templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath),
 	}
 }
 // BackendLoader returns the backend config loader held by the application.
 // The stored pointer is returned directly, not a copy.
 func (a *Application) BackendLoader() *config.BackendConfigLoader {
 	return a.backendLoader
 }
 // ModelLoader returns the model loader held by the application.
 // The stored pointer is returned directly, not a copy.
 func (a *Application) ModelLoader() *model.ModelLoader {
 	return a.modelLoader
 }
 // ApplicationConfig returns the application configuration held by the
 // application. The stored pointer is returned directly, so changes made
 // through it are visible to every other holder of the same pointer.
 func (a *Application) ApplicationConfig() *config.ApplicationConfig {
 	return a.applicationConfig
 }
 // TemplatesEvaluator returns the templates evaluator held by the application.
 // The stored pointer is returned directly, not a copy.
 func (a *Application) TemplatesEvaluator() *templates.Evaluator {
 	return a.templatesEvaluator
 }
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -11,9 +11,17 @@ import (
 func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
-	opts := ModelOptions(backendConfig, appConfig)
+	var inferenceModel interface{}
 	var err error
-	inferenceModel, err := loader.Load(opts...)
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{})
 	if backendConfig.Backend == "" {
 		inferenceModel, err = loader.GreedyLoader(opts...)
 	} else {
 		opts = append(opts, model.WithBackendString(backendConfig.Backend))
 		inferenceModel, err = loader.BackendLoader(opts...)
 	}
 	if err != nil {
 		return nil, err
 	}
--- a/core/backend/image.go
+++ b/core/backend/image.go
@@ -9,8 +9,9 @@ import (
 func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
-	opts := ModelOptions(backendConfig, appConfig)
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{})
-	inferenceModel, err := loader.Load(
+
 	inferenceModel, err := loader.BackendLoader(
 		opts...,
 	)
 	if err != nil {
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -16,15 +16,15 @@ import (
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/pkg/grpc"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	model "github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/utils"
 )
 type LLMResponse struct {
-	Response    string // should this be []byte?
+	Response string // should this be []byte?
-	Usage       TokenUsage
+	Usage    TokenUsage
 	AudioOutput string
 }
 type TokenUsage struct {
@@ -35,6 +35,15 @@ type TokenUsage struct {
 func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
 	var inferenceModel grpc.Backend
 	var err error
 	opts := ModelOptions(c, o, []model.Option{})
 	if c.Backend != "" {
 		opts = append(opts, model.WithBackendString(c.Backend))
 	}
 	// Check if the modelFile exists, if it doesn't try to load it from the gallery
 	if o.AutoloadGalleries { // experimental
 		if _, err := os.Stat(modelFile); os.IsNotExist(err) {
@@ -47,8 +56,12 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 		}
 	}
-	opts := ModelOptions(c, o)
+	if c.Backend == "" {
-	inferenceModel, err := loader.Load(opts...)
+		inferenceModel, err = loader.GreedyLoader(opts...)
 	} else {
 		inferenceModel, err = loader.BackendLoader(opts...)
 	}
 	if err != nil {
 		return nil, err
 	}
@@ -118,12 +131,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 			ss := ""
 			var partialRune []byte
-			err := inferenceModel.PredictStream(ctx, opts, func(reply *proto.Reply) {
+			err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
-				msg := reply.Message
+				partialRune = append(partialRune, chars...)
 				partialRune = append(partialRune, msg...)
 				tokenUsage.Prompt = int(reply.PromptTokens)
 				tokenUsage.Completion = int(reply.Tokens)
 				for len(partialRune) > 0 {
 					r, size := utf8.DecodeRune(partialRune)
@@ -137,10 +146,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 					partialRune = partialRune[size:]
 				}
 				if len(msg) == 0 {
 					tokenCallback("", tokenUsage)
 				}
 			})
 			return LLMResponse{
 				Response: ss,
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -11,7 +11,7 @@ import (
 	"github.com/rs/zerolog/log"
 )
-func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...model.Option) []model.Option {
+func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
 	name := c.Name
 	if name == "" {
 		name = c.Model
@@ -122,17 +122,14 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		CUDA:                 c.CUDA || c.Diffusers.CUDA,
 		SchedulerType:        c.Diffusers.SchedulerType,
 		PipelineType:         c.Diffusers.PipelineType,
-		CFGScale:             c.CFGScale,
+		CFGScale:             c.Diffusers.CFGScale,
 		LoraAdapter:          c.LoraAdapter,
 		LoraScale:            c.LoraScale,
 		LoraAdapters:         c.LoraAdapters,
 		LoraScales:           c.LoraScales,
 		F16Memory:            f16,
 		LoraBase:             c.LoraBase,
 		IMG2IMG:              c.Diffusers.IMG2IMG,
 		CLIPModel:            c.Diffusers.ClipModel,
 		CLIPSubfolder:        c.Diffusers.ClipSubFolder,
 		Options:              c.Options,
 		CLIPSkip:             int32(c.Diffusers.ClipSkip),
 		ControlNet:           c.Diffusers.ControlNet,
 		ContextSize:          int32(ctxSize),
@@ -151,8 +148,6 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		TensorParallelSize:   int32(c.TensorParallelSize),
 		MMProj:               c.MMProj,
 		FlashAttention:       c.FlashAttention,
 		CacheTypeKey:         c.CacheTypeK,
 		CacheTypeValue:       c.CacheTypeV,
 		NoKVOffload:          c.NoKVOffloading,
 		YarnExtFactor:        c.YarnExtFactor,
 		YarnAttnFactor:       c.YarnAttnFactor,
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -11,8 +11,8 @@ import (
 func Rerank(modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
-	opts := ModelOptions(backendConfig, appConfig, model.WithModel(modelFile))
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{model.WithModel(modelFile)})
-	rerankModel, err := loader.Load(opts...)
+	rerankModel, err := loader.BackendLoader(opts...)
 	if err != nil {
 		return nil, err
 	}
--- a/core/backend/soundgeneration.go
+++ b/core/backend/soundgeneration.go
@@ -25,8 +25,9 @@ func SoundGeneration(
 	backendConfig config.BackendConfig,
 ) (string, *proto.Result, error) {
-	opts := ModelOptions(backendConfig, appConfig, model.WithModel(modelFile))
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{model.WithModel(modelFile)})
-	soundGenModel, err := loader.Load(opts...)
+
 	soundGenModel, err := loader.BackendLoader(opts...)
 	if err != nil {
 		return "", nil, err
 	}
--- a/core/backend/stores.go
+++ b/core/backend/stores.go
@@ -8,15 +8,16 @@ import (
 )
 func StoreBackend(sl *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string) (grpc.Backend, error) {
-	if storeName == "" {
+    if storeName == "" {
-		storeName = "default"
+      storeName = "default"
-	}
+    }
-	sc := []model.Option{
+    sc := []model.Option{
-		model.WithBackendString(model.LocalStoreBackend),
+      model.WithBackendString(model.LocalStoreBackend),
-		model.WithAssetDir(appConfig.AssetsDestination),
+      model.WithAssetDir(appConfig.AssetsDestination),
-		model.WithModel(storeName),
+      model.WithModel(storeName),
-	}
+    }
-	return sl.Load(sc...)
+    return sl.BackendLoader(sc...)
 }
--- a/core/backend/token_metrics.go
+++ b/core/backend/token_metrics.go
@@ -15,8 +15,10 @@ func TokenMetrics(
 	appConfig *config.ApplicationConfig,
 	backendConfig config.BackendConfig) (*proto.MetricsResponse, error) {
-	opts := ModelOptions(backendConfig, appConfig, model.WithModel(modelFile))
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{
-	model, err := loader.Load(opts...)
+		model.WithModel(modelFile),
 	})
 	model, err := loader.BackendLoader(opts...)
 	if err != nil {
 		return nil, err
 	}
--- a/core/backend/tokenize.go
+++ b/core/backend/tokenize.go
@@ -14,13 +14,15 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
 	var inferenceModel grpc.Backend
 	var err error
-	opts := ModelOptions(backendConfig, appConfig, model.WithModel(modelFile))
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{
 		model.WithModel(modelFile),
 	})
 	if backendConfig.Backend == "" {
-		inferenceModel, err = loader.Load(opts...)
+		inferenceModel, err = loader.GreedyLoader(opts...)
 	} else {
 		opts = append(opts, model.WithBackendString(backendConfig.Backend))
-		inferenceModel, err = loader.Load(opts...)
+		inferenceModel, err = loader.BackendLoader(opts...)
 	}
 	if err != nil {
 		return schema.TokenizeResponse{}, err
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -18,9 +18,9 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
 		backendConfig.Backend = model.WhisperBackend
 	}
-	opts := ModelOptions(backendConfig, appConfig)
+	opts := ModelOptions(backendConfig, appConfig, []model.Option{})
-	transcriptionModel, err := ml.Load(opts...)
+	transcriptionModel, err := ml.BackendLoader(opts...)
 	if err != nil {
 		return nil, err
 	}
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -28,8 +28,11 @@ func ModelTTS(
 		bb = model.PiperBackend
 	}
-	opts := ModelOptions(backendConfig, appConfig, model.WithBackendString(bb), model.WithModel(modelFile))
+	opts := ModelOptions(config.BackendConfig{}, appConfig, []model.Option{
-	ttsModel, err := loader.Load(opts...)
+		model.WithBackendString(bb),
 		model.WithModel(modelFile),
 	})
 	ttsModel, err := loader.BackendLoader(opts...)
 	if err != nil {
 		return "", nil, err
 	}
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -6,12 +6,12 @@ import (
 	"strings"
 	"time"
 	"github.com/mudler/LocalAI/core/application"
 	cli_api "github.com/mudler/LocalAI/core/cli/api"
 	cliContext "github.com/mudler/LocalAI/core/cli/context"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http"
 	"github.com/mudler/LocalAI/core/p2p"
 	"github.com/mudler/LocalAI/core/startup"
 	"github.com/rs/zerolog"
 	"github.com/rs/zerolog/log"
 )
@@ -186,16 +186,16 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 	}
 	if r.PreloadBackendOnly {
-		_, err := application.New(opts...)
+		_, _, _, err := startup.Startup(opts...)
 		return err
 	}
-	app, err := application.New(opts...)
+	cl, ml, options, err := startup.Startup(opts...)
 	if err != nil {
 		return fmt.Errorf("failed basic startup tasks with error %s", err.Error())
 	}
-	appHTTP, err := http.API(app)
+	appHTTP, err := http.App(cl, ml, options)
 	if err != nil {
 		log.Error().Err(err).Msg("error during HTTP App construction")
 		return err
--- a/core/cli/worker/worker_p2p.go
+++ b/core/cli/worker/worker_p2p.go
@@ -76,14 +76,8 @@ func (r *P2P) Run(ctx *cliContext.Context) error {
 					"util",
 					"llama-cpp-rpc-server",
 				)
-				var extraArgs []string
+				extraArgs := strings.Split(r.ExtraLLamaCPPArgs, " ")
 				if r.ExtraLLamaCPPArgs != "" {
 					extraArgs = strings.Split(r.ExtraLLamaCPPArgs, " ")
 				}
 				args := append([]string{"--host", address, "--port", fmt.Sprint(port)}, extraArgs...)
 				log.Debug().Msgf("Starting llama-cpp-rpc-server on '%s:%d' with args: %+v (%d)", address, port, args, len(args))
 				args, grpcProcess = library.LoadLDSO(r.BackendAssetsPath, args, grpcProcess)
 				cmd := exec.Command(
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -38,7 +38,6 @@ type BackendConfig struct {
 	TemplateConfig      TemplateConfig         `yaml:"template"`
 	KnownUsecaseStrings []string               `yaml:"known_usecases"`
 	KnownUsecases       *BackendConfigUsecases `yaml:"-"`
 	Pipeline            Pipeline               `yaml:"pipeline"`
 	PromptStrings, InputStrings                []string               `yaml:"-"`
 	InputToken                                 [][]int                `yaml:"-"`
@@ -73,20 +72,6 @@ type BackendConfig struct {
 	Description string `yaml:"description"`
 	Usage       string `yaml:"usage"`
 	Options []string `yaml:"options"`
 }
 // Pipeline defines other models to use for audio-to-audio.
 // Every field is a plain string populated from the YAML key named in its tag;
 // presumably each one holds the name of a model — TODO confirm against callers.
 type Pipeline struct {
 	TTS           string `yaml:"tts"`
 	LLM           string `yaml:"llm"`
 	Transcription string `yaml:"transcription"`
 	VAD           string `yaml:"vad"`
 }
 // IsNotConfigured reports whether at least one of LLM, TTS or Transcription
 // is the empty string. VAD is not consulted, so a Pipeline whose VAD is empty
 // can still count as configured.
 func (p Pipeline) IsNotConfigured() bool {
 	return p.LLM == "" || p.TTS == "" || p.Transcription == ""
 }
 type File struct {
@@ -112,15 +97,16 @@ type GRPC struct {
 }
 type Diffusers struct {
-	CUDA             bool   `yaml:"cuda"`
+	CUDA             bool    `yaml:"cuda"`
-	PipelineType     string `yaml:"pipeline_type"`
+	PipelineType     string  `yaml:"pipeline_type"`
-	SchedulerType    string `yaml:"scheduler_type"`
+	SchedulerType    string  `yaml:"scheduler_type"`
-	EnableParameters string `yaml:"enable_parameters"` // A list of comma separated parameters to specify
+	EnableParameters string  `yaml:"enable_parameters"` // A list of comma separated parameters to specify
-	IMG2IMG          bool   `yaml:"img2img"`           // Image to Image Diffuser
+	CFGScale         float32 `yaml:"cfg_scale"`         // Classifier-Free Guidance Scale
-	ClipSkip         int    `yaml:"clip_skip"`         // Skip every N frames
+	IMG2IMG          bool    `yaml:"img2img"`           // Image to Image Diffuser
-	ClipModel        string `yaml:"clip_model"`        // Clip model to use
+	ClipSkip         int     `yaml:"clip_skip"`         // Skip every N frames
-	ClipSubFolder    string `yaml:"clip_subfolder"`    // Subfolder to use for clip model
+	ClipModel        string  `yaml:"clip_model"`        // Clip model to use
-	ControlNet       string `yaml:"control_net"`
+	ClipSubFolder    string  `yaml:"clip_subfolder"`    // Subfolder to use for clip model
 	ControlNet       string  `yaml:"control_net"`
 }
 // LLMConfig is a struct that holds the configuration that are
@@ -148,30 +134,26 @@ type LLMConfig struct {
 	TrimSpace       []string `yaml:"trimspace"`
 	TrimSuffix      []string `yaml:"trimsuffix"`
-	ContextSize          *int      `yaml:"context_size"`
+	ContextSize          *int    `yaml:"context_size"`
-	NUMA                 bool      `yaml:"numa"`
+	NUMA                 bool    `yaml:"numa"`
-	LoraAdapter          string    `yaml:"lora_adapter"`
+	LoraAdapter          string  `yaml:"lora_adapter"`
-	LoraBase             string    `yaml:"lora_base"`
+	LoraBase             string  `yaml:"lora_base"`
-	LoraAdapters         []string  `yaml:"lora_adapters"`
+	LoraScale            float32 `yaml:"lora_scale"`
-	LoraScales           []float32 `yaml:"lora_scales"`
+	NoMulMatQ            bool    `yaml:"no_mulmatq"`
-	LoraScale            float32   `yaml:"lora_scale"`
+	DraftModel           string  `yaml:"draft_model"`
-	NoMulMatQ            bool      `yaml:"no_mulmatq"`
+	NDraft               int32   `yaml:"n_draft"`
-	DraftModel           string    `yaml:"draft_model"`
+	Quantization         string  `yaml:"quantization"`
-	NDraft               int32     `yaml:"n_draft"`
+	LoadFormat           string  `yaml:"load_format"`
-	Quantization         string    `yaml:"quantization"`
+	GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
-	LoadFormat           string    `yaml:"load_format"`
+	TrustRemoteCode      bool    `yaml:"trust_remote_code"`      // vLLM
-	GPUMemoryUtilization float32   `yaml:"gpu_memory_utilization"` // vLLM
+	EnforceEager         bool    `yaml:"enforce_eager"`          // vLLM
-	TrustRemoteCode      bool      `yaml:"trust_remote_code"`      // vLLM
+	SwapSpace            int     `yaml:"swap_space"`             // vLLM
-	EnforceEager         bool      `yaml:"enforce_eager"`          // vLLM
+	MaxModelLen          int     `yaml:"max_model_len"`          // vLLM
-	SwapSpace            int       `yaml:"swap_space"`             // vLLM
+	TensorParallelSize   int     `yaml:"tensor_parallel_size"`   // vLLM
-	MaxModelLen          int       `yaml:"max_model_len"`          // vLLM
+	MMProj               string  `yaml:"mmproj"`
 	TensorParallelSize   int       `yaml:"tensor_parallel_size"`   // vLLM
 	MMProj               string    `yaml:"mmproj"`
-	FlashAttention bool   `yaml:"flash_attention"`
+	FlashAttention bool `yaml:"flash_attention"`
-	NoKVOffloading bool   `yaml:"no_kv_offloading"`
+	NoKVOffloading bool `yaml:"no_kv_offloading"`
 	CacheTypeK     string `yaml:"cache_type_k"`
 	CacheTypeV     string `yaml:"cache_type_v"`
 	RopeScaling string `yaml:"rope_scaling"`
 	ModelType   string `yaml:"type"`
@@ -180,8 +162,6 @@ type LLMConfig struct {
 	YarnAttnFactor float32 `yaml:"yarn_attn_factor"`
 	YarnBetaFast   float32 `yaml:"yarn_beta_fast"`
 	YarnBetaSlow   float32 `yaml:"yarn_beta_slow"`
 	CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
 }
 // AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
@@ -219,8 +199,6 @@ type TemplateConfig struct {
 	JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`
 	Multimodal string `yaml:"multimodal"`
 	JinjaTemplate bool `yaml:"jinja_template"`
 }
 func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
--- a/core/config/backend_config_loader.go
+++ b/core/config/backend_config_loader.go
@@ -140,7 +140,7 @@ func (bcl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath
 		}
 	}
-	cfg.SetDefaults(append(opts, ModelPath(modelPath))...)
+	cfg.SetDefaults(opts...)
 	return cfg, nil
 }
--- a/Show More
+++ b/Show More
`@@ -1,2 +1 @@`
	`*.sh text eol=lf`	`*.sh text eol=lf`
	`backend/cpp/llama/*.hpp linguist-vendored`