models(gallery): add phi-3 vision (#3890 )

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
chore: ⬆️ Update ggerganov/llama.cpp to 45f097645efb11b6d09a5b4adbbfd7c312ac0126 (#3889 )
2026-02-03 11:13:31 -05:00 · 2024-10-21 11:47:52 +02:00 · 2024-10-20 21:40:26 +00:00 · 2024-10-20 11:44:28 +02:00 · 2024-10-20 00:26:49 +02:00 · 2024-10-19 23:04:42 +02:00
193 changed files with 4166 additions and 1171 deletions
--- a/.devcontainer-scripts/utils.sh
+++ b/.devcontainer-scripts/utils.sh
@@ -9,6 +9,7 @@
 # Param 2: email
 #
 config_user() {
+    echo "Configuring git for $1 <$2>"
    local gcn=$(git config --global user.name)
    if [ -z "${gcn}" ]; then
        echo "Setting up git user / remote"
@@ -24,6 +25,7 @@ config_user() {
 # Param 2: remote url
 #
 config_remote() {
+    echo "Adding git remote and fetching $2 as $1"
    local gr=$(git remote -v | grep $1)
    if [ -z "${gr}" ]; then
        git remote add $1 $2
--- a/.github/check_and_update.py
+++ b/.github/check_and_update.py
@@ -29,9 +29,14 @@ def calculate_sha256(file_path):
 def manual_safety_check_hf(repo_id):
    scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan")
    scan = scanResponse.json()
-    if scan['hasUnsafeFile']:
-        return scan
-    return None
+    # Check if 'hasUnsafeFile' exists in the response
+    if 'hasUnsafeFile' in scan:
+        if scan['hasUnsafeFile']:
+            return scan
+        else:
+            return None
+    else:
+        return None

 download_type, repo_id_or_url = parse_uri(uri)

--- a/.github/ci/modelslist.go
+++ b/.github/ci/modelslist.go
@@ -6,6 +6,7 @@ import (
 	"io/ioutil"
 	"os"

+	"github.com/microcosm-cc/bluemonday"
 	"gopkg.in/yaml.v3"
 )

@@ -279,6 +280,12 @@ func main() {
 		return
 	}

+	// Ensure that all arbitrary text content is sanitized before display
+	for i, m := range models {
+		models[i].Name = bluemonday.StrictPolicy().Sanitize(m.Name)
+		models[i].Description = bluemonday.StrictPolicy().Sanitize(m.Description)
+	}
+
 	// render the template
 	data := struct {
 		Models          []*GalleryModel
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -9,6 +9,8 @@ updates:
    directory: "/"
    schedule:
      interval: "weekly"
+    ignore:
+    - dependency-name: "github.com/mudler/LocalAI/pkg/grpc/proto"
  - package-ecosystem: "github-actions"
    # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
    directory: "/"
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -33,7 +33,7 @@ jobs:
        run: |
          CGO_ENABLED=0 make build-api
      - name: rm
-        uses: appleboy/ssh-action@v1.0.3
+        uses: appleboy/ssh-action@v1.1.0
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
            rm: true
            target: ./local-ai
      - name: restarting
-        uses: appleboy/ssh-action@v1.0.3
+        uses: appleboy/ssh-action@v1.1.0
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -13,6 +13,78 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  hipblas-jobs:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
+      aio: ${{ matrix.aio }}
+      makeflags: ${{ matrix.makeflags }}
+      latest-image: ${{ matrix.latest-image }}
+      latest-image-aio: ${{ matrix.latest-image-aio }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      # Pushing with all jobs in parallel
+      # eats the bandwidth of all the nodes
+      max-parallel: 2
+      matrix:
+        include:
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-hipblas-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            aio: "-aio-gpu-hipblas"
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            latest-image: 'latest-gpu-hipblas'
+            latest-image-aio: 'latest-aio-gpu-hipblas'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas'
+            ffmpeg: 'false'
+            image-type: 'extras'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
  self-hosted-jobs:
    uses: ./.github/workflows/image_build.yml
    with:
@@ -39,7 +111,7 @@ jobs:
    strategy:
      # Pushing with all jobs in parallel
      # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 6 || 10 }}
+      max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
      matrix:
        include:
          # Extra images
@@ -122,29 +194,6 @@ jobs:
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'auto'
-            tag-suffix: '-hipblas-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            aio: "-aio-gpu-hipblas"
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            latest-image: 'latest-gpu-hipblas'
-            latest-image-aio: 'latest-aio-gpu-hipblas'
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas'
-            ffmpeg: 'false'
-            image-type: 'extras'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
@@ -212,26 +261,6 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-core'
-            ffmpeg: 'false'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"

  core-image-build:
    uses: ./.github/workflows/image_build.yml
--- a/.github/workflows/notify-models.yaml
+++ b/.github/workflows/notify-models.yaml
@@ -79,7 +79,7 @@ jobs:
        args: ${{ steps.summarize.outputs.message }}
    - name: Setup tmate session if fails
      if: ${{ failure() }}
-      uses: mxschmitt/action-tmate@v3.18
+      uses: mxschmitt/action-tmate@v3.19
      with:
        detached: true
        connect-timeout-seconds: 180
@@ -161,7 +161,7 @@ jobs:
        TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
    - name: Setup tmate session if fails
      if: ${{ failure() }}
-      uses: mxschmitt/action-tmate@v3.18
+      uses: mxschmitt/action-tmate@v3.19
      with:
        detached: true
        connect-timeout-seconds: 180
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -123,7 +123,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.18
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -232,7 +232,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.18
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -308,7 +308,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.18
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -350,7 +350,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.18
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.21.0
+        uses: securego/gosec@v2.21.4
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -133,7 +133,7 @@ jobs:
          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.18
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -178,17 +178,26 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
+      - name: Dependencies
+        run: |
+          # Install protoc
+          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+          rm protoc.zip
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+          PATH="$PATH:$HOME/go/bin" make protogen-go
      - name: Build images
        run: |
          docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
          BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
      - name: Test
        run: |
-          LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
+            PATH="$PATH:$HOME/go/bin" LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
            make run-e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.18
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -226,7 +235,7 @@ jobs:
          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.18
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,8 +15,6 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time
 - [Documentation](#documentation)
 - [Community and Communication](#community-and-communication)

-
-
 ## Getting Started

 ### Prerequisites
@@ -54,7 +52,7 @@ If you find a bug, have a feature request, or encounter any issues, please check

 ## Coding Guidelines

- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
+- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here.

 ## Testing

@@ -84,5 +82,3 @@ We are welcome the contribution of the documents, please open new PR or create a
 - You can reach out via the Github issue tracker.
 - Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
 - Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
-
---
--- a/49
+++ b/49
@@ -9,6 +9,8 @@ FROM ${BASE_IMAGE} AS requirements-core
 USER root

 ARG GO_VERSION=1.22.6
+ARG CMAKE_VERSION=3.26.4
+ARG CMAKE_FROM_SOURCE=false
 ARG TARGETARCH
 ARG TARGETVARIANT

@@ -21,13 +23,25 @@ RUN apt-get update && \
        build-essential \
        ccache \
        ca-certificates \
-        cmake \
-        curl \
+        curl libssl-dev \
        git \
        unzip upx-ucl && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

+# Install CMake (the version in 22.04 is too old)
+RUN <<EOT bash
+    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
+        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
+    else
+        apt-get update && \
+        apt-get install -y \
+            cmake && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
 # Install Go
 RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
 ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
@@ -188,6 +202,8 @@ FROM ${GRPC_BASE_IMAGE} AS grpc
 # This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
 ARG GRPC_MAKEFLAGS="-j4 -Otarget"
 ARG GRPC_VERSION=v1.65.0
+ARG CMAKE_FROM_SOURCE=false
+ARG CMAKE_VERSION=3.26.4

 ENV MAKEFLAGS=${GRPC_MAKEFLAGS}

@@ -196,12 +212,24 @@ WORKDIR /build
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
-        build-essential \
-        cmake \
+        build-essential curl libssl-dev \
        git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

+# Install CMake (the version in 22.04 is too old)
+RUN <<EOT bash
+    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
+        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
+    else
+        apt-get update && \
+        apt-get install -y \
+            cmake && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
 # We install GRPC to a different prefix here so that we can copy in only the build artifacts later
 # saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
 # and running make install in the target container
@@ -297,10 +325,10 @@ COPY .git .
 RUN make prepare

 ## Build the binary
-## If it's CUDA, we want to skip some of the llama-compat backends to save space
-## We only leave the most CPU-optimized variant and the fallback for the cublas build
-## (both will use CUDA for the actual computation)
-RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
+## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
+## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
+## (both will use CUDA or hipblas for the actual computation)
+RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
    else \
        make build; \
@@ -338,9 +366,8 @@ RUN if [ "${FFMPEG}" = "true" ]; then \

 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
-        ssh less && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+        ssh less wget
+# For the devcontainer, leave apt functional in case additional devtools are needed at runtime.

 RUN go install github.com/go-delve/delve/cmd/dlv@latest

--- a/23
+++ b/23
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1
+CPPLLAMA_VERSION?=45f097645efb11b6d09a5b4adbbfd7c312ac0126

 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
@@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=a551933542d956ae84634937acd2942eb40efaaf
+WHISPER_CPP_VERSION?=a5abfe6a90495f7bf19fe70d016ecc255e97359c

 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
@@ -359,6 +359,9 @@ clean-tests:
 	rm -rf test-dir
 	rm -rf core/http/backend-assets

+clean-dc: clean
+	cp -r /build/backend-assets /workspace/backend-assets
+
 ## Build:
 build: prepare backend-assets grpcs ## Build the project
 	$(info ${GREEN}I local-ai build info:${RESET})
@@ -465,15 +468,15 @@ run-e2e-image:
 	ls -liah $(abspath ./tests/e2e-fixtures)
 	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests

-run-e2e-aio:
+run-e2e-aio: protogen-go
 	@echo 'Running e2e AIO tests'
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio

 test-e2e:
 	@echo 'Running e2e tests'
 	BUILD_TYPE=$(BUILD_TYPE) \
 	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e

 teardown-e2e:
 	rm -rf $(TEST_DIR) || true
@@ -481,24 +484,24 @@ teardown-e2e:

 test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

 test-stablediffusion: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

 test-stores: backend-assets/grpc/local-store
 	mkdir -p tests/integration/backend-assets/grpc
 	cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts 1 -v -r tests/integration
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts $(TEST_FLAKES) -v -r tests/integration

 test-container:
 	docker build --target requirements -t local-ai-test-container .
--- a/README.md
+++ b/README.md
@@ -66,11 +66,24 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 # docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12 
 ```

+To load models:
+
+```bash
+# From the model gallery (see available models with `local-ai models list`, in the WebUI from the model tab, or visiting https://models.localai.io)
+local-ai run llama-3.2-1b-instruct:q4_k_m
+# Start LocalAI with the phi-2 model directly from huggingface
+local-ai run huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf
+# Install and run a model from the Ollama OCI registry
+local-ai run ollama://gemma:2b
+# Run a model from a configuration file
+local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
+# Install and run a model from a standard OCI registry (e.g., Docker Hub)
+local-ai run oci://localai/phi-2:latest
+```
+
 [💻 Getting started](https://localai.io/basics/getting_started/index.html)

-## 🔥🔥 Hot topics / Roadmap
-
-[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
+## 📰 Latest project news

 - Aug 2024:  🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
 - July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
@@ -83,8 +96,12 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 - May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
 - April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121

-Hot topics (looking for contributors):
+Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

+## 🔥🔥 Hot topics (looking for help):
+
+- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
+- Realtime API https://github.com/mudler/LocalAI/issues/3714
 - 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
 - WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4-vision-preview
+name: gpt-4o

 roles:
  user: "USER:"
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4-vision-preview
+name: gpt-4o

 roles:
  user: "USER:"
--- a/aio/intel/vision.yaml
+++ b/aio/intel/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 mmap: false
 f16: false
-name: gpt-4-vision-preview
+name: gpt-4o

 roles:
  user: "USER:"
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -26,6 +26,19 @@ service Backend {
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}

  rpc Rerank(RerankRequest) returns (RerankResult) {}
+
+  rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
+}
+
+// Define the empty request
+message MetricsRequest {}
+
+message MetricsResponse {
+  int32 slot_id = 1;
+  string prompt_json_for_slot = 2;  // Stores the prompt as a JSON string.
+  float tokens_per_second = 3;
+  int32 tokens_generated = 4;
+  int32 prompt_tokens_processed = 5;
 }

 message RerankRequest {
@@ -134,6 +147,9 @@ message PredictOptions {
  repeated string Images = 42;
  bool UseTokenizerTemplate = 43;
  repeated Message Messages = 44;
+  repeated string Videos = 45;
+  repeated string Audios = 46;
+  string CorrelationId = 47;
 }

 // The response message containing the result
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -13,6 +13,7 @@
 #include <getopt.h>
 #include "clip.h"
 #include "llava.h"
+#include "log.h"
 #include "stb_image.h"
 #include "common.h"
 #include "json.hpp"
@@ -112,7 +113,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
    std::string ret;
    for (; begin != end; ++begin)
    {
-        ret += llama_token_to_piece(ctx, *begin);
+        ret += common_token_to_piece(ctx, *begin);
    }
    return ret;
 }
@@ -120,7 +121,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
    // if the size is 1 and first bit is 1, meaning it's a partial character
    //   (size > 1 meaning it's already a known token)
    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
@@ -202,8 +203,8 @@ struct llama_client_slot
    std::string stopping_word;

    // sampling
-    struct gpt_sampler_params sparams;
-    gpt_sampler *ctx_sampling = nullptr;
+    struct common_sampler_params sparams;
+    common_sampler *ctx_sampling = nullptr;

    int32_t ga_i = 0;   // group-attention state
    int32_t ga_n = 1;   // group-attention factor
@@ -256,7 +257,7 @@ struct llama_client_slot
        images.clear();
    }

-    bool has_budget(gpt_params &global_params) {
+    bool has_budget(common_params &global_params) {
        if (params.n_predict == -1 && global_params.n_predict == -1)
        {
            return true; // limitless
@@ -390,6 +391,39 @@ struct llama_metrics {
    }
 };

+struct llava_embd_batch {
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id>   seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t>         logits;
+    llama_batch batch;
+    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+        pos     .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits  .resize(n_tokens);
+        seq_id_0.resize(1);
+        seq_id_0[0] = seq_id;
+        seq_ids [n_tokens] = nullptr;
+        batch = {
+            /*n_tokens       =*/ n_tokens,
+            /*tokens         =*/ nullptr,
+            /*embd           =*/ embd,
+            /*pos            =*/ pos.data(),
+            /*n_seq_id       =*/ n_seq_id.data(),
+            /*seq_id         =*/ seq_ids.data(),
+            /*logits         =*/ logits.data(),
+        };
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+};
+
 struct llama_server_context
 {
    llama_model *model = nullptr;
@@ -397,7 +431,7 @@ struct llama_server_context

    clip_ctx *clp_ctx = nullptr;

-    gpt_params params;
+    common_params params;

    llama_batch batch;

@@ -440,7 +474,7 @@ struct llama_server_context
        }
    }

-    bool load_model(const gpt_params &params_)
+    bool load_model(const common_params &params_)
    {
        params = params_;
        if (!params.mmproj.empty()) {
@@ -448,7 +482,7 @@ struct llama_server_context
            LOG_INFO("Multi Modal Mode Enabled", {});
            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
            if(clp_ctx == nullptr) {
-                LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
+                LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
                return false;
            }

@@ -457,12 +491,12 @@ struct llama_server_context
            }
        }

-        llama_init_result llama_init = llama_init_from_gpt_params(params);
-        model = llama_init.model;
-        ctx = llama_init.context;
+        common_init_result common_init = common_init_from_params(params);
+        model = common_init.model;
+        ctx = common_init.context;
        if (model == nullptr)
        {
-            LOG_ERROR("unable to load model", {{"model", params.model}});
+            LOG_ERR("unable to load model: %s", params.model.c_str());
            return false;
        }

@@ -470,7 +504,7 @@ struct llama_server_context
            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
            const int n_embd_llm  = llama_n_embd(model);
            if (n_embd_clip != n_embd_llm) {
-                LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
+                LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
                llama_free(ctx);
                llama_free_model(model);
                return false;
@@ -489,11 +523,21 @@ struct llama_server_context
        std::vector<char> buf(1);
        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
        if (res < 0) {
-            LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+            LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
            sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
        }
    }

+    llama_client_slot* get_active_slot() {
+        for (llama_client_slot& slot : slots) {
+            // Check if the slot is currently processing
+            if (slot.is_processing()) {
+                return &slot;  // Return the active slot
+            }
+        }
+        return nullptr;  // No active slot found
+    }
+
    void initialize() {
        // create slots
        all_slots_are_idle = true;
@@ -567,12 +611,12 @@ struct llama_server_context
                    std::vector<llama_token> p;
                    if (first)
                    {
-                        p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+                        p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                        first = false;
                    }
                    else
                    {
-                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
+                        p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                    }
                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
                }
@@ -589,7 +633,7 @@ struct llama_server_context
        else
        {
            auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+            prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
        }

        return prompt_tokens;
@@ -618,7 +662,7 @@ struct llama_server_context

    bool launch_slot_with_data(llama_client_slot* &slot, json data) {
        slot_params default_params;
-        gpt_sampler_params default_sparams;
+        common_sampler_params default_sparams;
 
        slot->params.stream             = json_value(data, "stream",            false);
        slot->params.cache_prompt       = json_value(data, "cache_prompt",      false);
@@ -758,7 +802,7 @@ struct llama_server_context
                    }
                    else if (el[0].is_string())
                    {
-                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
+                        auto toks = common_tokenize(model, el[0].get<std::string>(), false);
                        for (auto tok : toks)
                        {
                            slot->sparams.logit_bias.push_back({tok, bias});
@@ -790,7 +834,7 @@ struct llama_server_context
                        sampler_names.emplace_back(name);
                    }
                }
-                slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
+                slot->sparams.samplers = common_sampler_types_from_names(sampler_names, false);
        }
        else
        {
@@ -812,10 +856,11 @@ struct llama_server_context
                    img_sl.img_data = clip_image_u8_init();
                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
                    {
-                        LOG_ERROR("failed to load image", {
-                            {"slot_id",   slot->id},
-                            {"img_sl_id", img_sl.id}
-                        });
+                        LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d", 
+                             __func__,
+                             slot->id,
+                             img_sl.id
+                        );
                        return false;
                    }
                    LOG_VERBOSE("image loaded", {
@@ -853,12 +898,12 @@ struct llama_server_context
                                    }
                                }
                                if (!found) {
-                                    LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
+                                    LOG("ERROR: Image with id: %i, not found.\n", img_id);
                                    slot->images.clear();
                                    return false;
                                }
                            } catch (const std::invalid_argument& e) {
-                                LOG_TEE("Invalid image number id in prompt\n");
+                                LOG("Invalid image number id in prompt\n");
                                slot->images.clear();
                                return false;
                            }
@@ -873,9 +918,9 @@ struct llama_server_context

        if (slot->ctx_sampling != nullptr)
        {
-            gpt_sampler_free(slot->ctx_sampling);
+            common_sampler_free(slot->ctx_sampling);
        }
-        slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
+        slot->ctx_sampling = common_sampler_init(model, slot->sparams);
        //llama_set_rng_seed(ctx, slot->params.seed);
        slot->command = LOAD_PROMPT;

@@ -886,7 +931,7 @@ struct llama_server_context
            {"task_id", slot->task_id},
        });

-      //  LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
+      //  LOG("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());

        return true;
    }
@@ -902,13 +947,13 @@ struct llama_server_context
        system_tokens.clear();

        if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+            system_tokens = common_tokenize(ctx, system_prompt, add_bos_token);

-            llama_batch_clear(batch);
+            common_batch_clear(batch);

            for (int i = 0; i < (int)system_tokens.size(); ++i)
            {
-                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
+                common_batch_add(batch, system_tokens[i], i, { 0 }, false);
            }

            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
@@ -922,11 +967,10 @@ struct llama_server_context
                    batch.n_seq_id + i,
                    batch.seq_id   + i,
                    batch.logits   + i,
-                    0, 0, 0, // unused
                };
                if (llama_decode(ctx, batch_view) != 0)
                {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    LOG("%s: llama_decode() failed\n", __func__);
                    return;
                }
            }
@@ -938,7 +982,7 @@ struct llama_server_context
            }
        }

-        LOG_TEE("system prompt updated\n");
+        LOG("system prompt updated\n");
        system_need_update = false;
    }

@@ -997,7 +1041,7 @@ struct llama_server_context

    bool process_token(completion_token_output &result, llama_client_slot &slot) {
        // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        const std::string token_str = common_token_to_piece(ctx, result.tok);
        slot.sampled = result.tok;

        // search stop word and delete it
@@ -1120,7 +1164,7 @@ struct llama_server_context
            }

            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-                LOG_TEE("Error processing the given image");
+                LOG("Error processing the given image");
                return false;
            }

@@ -1132,7 +1176,7 @@ struct llama_server_context

    void send_error(task_server& task, const std::string &error)
    {
-        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
+        LOG("task %i - error: %s\n", task.id, error.c_str());
        task_result res;
        res.id = task.id;
        res.multitask_id = task.multitask_id;
@@ -1148,7 +1192,7 @@ struct llama_server_context
        samplers.reserve(slot.sparams.samplers.size());
        for (const auto & sampler : slot.sparams.samplers)
        {
-            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
+            samplers.emplace_back(common_sampler_type_to_str(sampler));
        }

        return json {
@@ -1204,7 +1248,7 @@ struct llama_server_context
        if (slot.sparams.n_probs > 0)
        {
            std::vector<completion_token_output> probs_output = {};
-            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
+            const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
            size_t probs_pos      = std::min(slot.sent_token_probs_index,                       slot.generated_token_probs.size());
            size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
            if (probs_pos < probs_stop_pos)
@@ -1256,7 +1300,7 @@ struct llama_server_context
            std::vector<completion_token_output> probs = {};
            if (!slot.params.stream && slot.stopped_word)
            {
-                const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
+                const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
                probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
            }
            else
@@ -1367,11 +1411,10 @@ struct llama_server_context
                    batch.n_seq_id + i,
                    batch.seq_id   + i,
                    batch.logits   + i,
-                    0, 0, 0, // unused
                };
                if (llama_decode(ctx, batch_view))
                {
-                    LOG_TEE("%s : failed to eval\n", __func__);
+                    LOG("%s : failed to eval\n", __func__);
                    return false;
                }
            }
@@ -1386,17 +1429,18 @@ struct llama_server_context
                }

                const int n_embd = llama_n_embd(model);
-                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
-                if (llama_decode(ctx, batch_img))
+                float * embd = img.image_embedding + i * n_embd;
+                llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
+                if (llama_decode(ctx, llava_batch.batch))
                {
-                    LOG_TEE("%s : failed to eval image\n", __func__);
+                    LOG("%s : failed to eval image\n", __func__);
                    return false;
                }
                slot.n_past += n_eval;
            }
            image_idx++;

-            llama_batch_clear(batch);
+            common_batch_clear(batch);

            // append prefix of next image
            const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
@@ -1406,7 +1450,7 @@ struct llama_server_context
            std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
            for (int i = 0; i < (int) append_tokens.size(); ++i)
            {
-                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
+                common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
                slot.n_past += 1;
            }
        }
@@ -1538,7 +1582,7 @@ struct llama_server_context
            update_system_prompt();
        }

-        llama_batch_clear(batch);
+        common_batch_clear(batch);

        if (all_slots_are_idle)
        {
@@ -1572,7 +1616,7 @@ struct llama_server_context
                    slot.n_past = 0;
                    slot.truncated = false;
                    slot.has_next_token = true;
-                    LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+                    LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());

                    continue;
                    // END LOCALAI changes
@@ -1616,7 +1660,7 @@ struct llama_server_context

            // TODO: we always have to take into account the "system_tokens"
            //       this is not great and needs to be improved somehow
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+            common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
            slot.n_past += 1;
        }

@@ -1710,7 +1754,7 @@ struct llama_server_context

                    if (!slot.params.cache_prompt)
                    {
-                        gpt_sampler_reset(slot.ctx_sampling);
+                        common_sampler_reset(slot.ctx_sampling);

                        slot.n_past = 0;
                        slot.n_past_se = 0;
@@ -1722,7 +1766,7 @@ struct llama_server_context
                        // push the prompt into the sampling context (do not apply grammar)
                        for (auto &token : prompt_tokens)
                        {
-                            gpt_sampler_accept(slot.ctx_sampling, token, false);
+                            common_sampler_accept(slot.ctx_sampling, token, false);
                        }

                        slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
@@ -1814,16 +1858,17 @@ struct llama_server_context
                                ga_i += ga_w/ga_n;
                            }
                        }
-                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
+                        common_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
                        slot_npast++;
                    }

                    if (has_images && !ingest_images(slot, n_batch))
                    {
-                        LOG_ERROR("failed processing images", {
-                            "slot_id", slot.id,
-                            "task_id", slot.task_id,
-                        });
+                        LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d", 
+                            __func__,
+                            slot.id,
+                            slot.task_id
+                        );
                        // FIXME @phymbert: to be properly tested
                        //  early returning without changing the slot state will block the slot for ever
                        // no one at the moment is checking the return value
@@ -1863,10 +1908,10 @@ struct llama_server_context
                        const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
                        const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;

-                        LOG_TEE("\n");
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-                        LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+                        LOG("\n");
+                        LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
+                        LOG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
+                        LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);

                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
@@ -1876,7 +1921,7 @@ struct llama_server_context

                        slot.ga_i += slot.ga_w / slot.ga_n;

-                        LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+                        LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
                    }
                    slot.n_past_se += n_tokens;
                }
@@ -1891,7 +1936,6 @@ struct llama_server_context
                batch.n_seq_id + i,
                batch.seq_id   + i,
                batch.logits   + i,
-                0, 0, 0, // unused
            };

            const int ret = llama_decode(ctx, batch_view);
@@ -1901,11 +1945,11 @@ struct llama_server_context
                if (n_batch == 1 || ret < 0)
                {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
                    return false;
                }

-                LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
+                LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);

                // retry with half the batch size to try to find a free slot in the KV cache
                n_batch /= 2;
@@ -1930,9 +1974,9 @@ struct llama_server_context
                }

                completion_token_output result;
-                const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
+                const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);

-                gpt_sampler_accept(slot.ctx_sampling, id, true);
+                common_sampler_accept(slot.ctx_sampling, id, true);

                slot.n_decoded += 1;
                if (slot.n_decoded == 1)
@@ -1943,7 +1987,7 @@ struct llama_server_context
                }

                result.tok = id;
-                const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
+                const auto * cur_p = common_sampler_get_candidates(slot.ctx_sampling);

                for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
                    result.probs.push_back({
@@ -1996,7 +2040,7 @@ static json format_partial_response(
 struct token_translator
 {
    llama_context * ctx;
-    std::string operator()(llama_token tok)                    const { return llama_token_to_piece(ctx, tok); }
+    std::string operator()(llama_token tok)                    const { return common_token_to_piece(ctx, tok); }
    std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
 };

@@ -2103,6 +2147,9 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    data["ignore_eos"] = predict->ignoreeos();
    data["embeddings"] = predict->embeddings();

+    // Add the correlationid to json data
+    data["correlation_id"] = predict->correlationid();
+
    // for each image in the request, add the image data
    //
    for (int i = 0; i < predict->images_size(); i++) {
@@ -2187,7 +2234,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 // }

 static void params_parse(const backend::ModelOptions* request,
-                                gpt_params & params) {
+                                common_params & params) {
   
    // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809

@@ -2295,7 +2342,7 @@ public:

  grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
    // Implement LoadModel RPC
-    gpt_params params;
+    common_params params;
    params_parse(request, params);

    llama_backend_init();
@@ -2341,6 +2388,11 @@ public:
                int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
                reply.set_prompt_tokens(tokens_evaluated);

+                // Log Request Correlation Id
+                LOG_VERBOSE("correlation:", {
+                    { "id", data["correlation_id"] }
+                });
+
                // Send the reply
                writer->Write(reply);

@@ -2364,6 +2416,12 @@ public:
        std::string completion_text;
        task_result result = llama.queue_results.recv(task_id);
        if (!result.error && result.stop) {
+            
+            // Log Request Correlation Id
+            LOG_VERBOSE("correlation:", {
+                { "id", data["correlation_id"] }
+            });
+
            completion_text = result.result_json.value("content", "");
            int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
            int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
@@ -2403,6 +2461,31 @@ public:

        return grpc::Status::OK;
    }
+
+    grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
+        llama_client_slot* active_slot = llama.get_active_slot();
+
+        if (active_slot != nullptr) {
+            // Calculate the tokens per second using existing logic
+            double tokens_per_second = 1e3 / active_slot->t_token_generation * active_slot->n_decoded;
+
+            // Populate the response with metrics
+            response->set_slot_id(active_slot->id);
+            response->set_prompt_json_for_slot(active_slot->prompt.dump());
+            response->set_tokens_per_second(tokens_per_second);
+            response->set_tokens_generated(active_slot->n_decoded);
+            response->set_prompt_tokens_processed(active_slot->num_prompt_tokens_processed);
+        } else {
+            // Handle case when no active slot exists
+            response->set_slot_id(0);
+            response->set_prompt_json_for_slot("");
+            response->set_tokens_per_second(0);
+            response->set_tokens_generated(0);
+            response->set_prompt_tokens_processed(0);
+        }
+
+        return grpc::Status::OK;
+    } 
 };

 void RunServer(const std::string& server_address) {
--- a/backend/python/autogptq/requirements-cublas11.txt
+++ b/backend/python/autogptq/requirements-cublas11.txt
@@ -1,2 +1,2 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-torch
+torch==2.4.1+cu118
--- a/backend/python/autogptq/requirements-cublas12.txt
+++ b/backend/python/autogptq/requirements-cublas12.txt
@@ -1 +1 @@
-torch
+torch==2.4.1
--- a/backend/python/autogptq/requirements-hipblas.txt
+++ b/backend/python/autogptq/requirements-hipblas.txt
@@ -1,2 +1,2 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch
+torch==2.4.1+rocm6.0
--- a/backend/python/autogptq/requirements-intel.txt
+++ b/backend/python/autogptq/requirements-intel.txt
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.66.1
+grpcio==1.67.0
 protobuf
 certifi
 transformers
--- a/backend/python/bark/requirements-cpu.txt
+++ b/backend/python/bark/requirements-cpu.txt
@@ -1,4 +1,4 @@
 transformers
 accelerate
-torch
-torchaudio
+torch==2.4.1
+torchaudio==2.4.1
--- a/backend/python/bark/requirements-cublas11.txt
+++ b/backend/python/bark/requirements-cublas11.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-torch
-torchaudio
+torch==2.4.1+cu118
+torchaudio==2.4.1+cu118
 transformers
 accelerate
--- a/backend/python/bark/requirements-cublas12.txt
+++ b/backend/python/bark/requirements-cublas12.txt
@@ -1,4 +1,4 @@
-torch
-torchaudio
+torch==2.4.1
+torchaudio==2.4.1
 transformers
 accelerate
--- a/backend/python/bark/requirements-hipblas.txt
+++ b/backend/python/bark/requirements-hipblas.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch
-torchaudio
+torch==2.4.1+rocm6.0
+torchaudio==2.4.1+rocm6.0
 transformers
 accelerate
--- a/backend/python/bark/requirements-intel.txt
+++ b/backend/python/bark/requirements-intel.txt
@@ -3,6 +3,6 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.66.1
+grpcio==1.67.0
 protobuf
 certifi
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,2 +1,2 @@
-grpcio==1.66.1
+grpcio==1.67.0
 protobuf
--- a/backend/python/coqui/requirements-cpu.txt
+++ b/backend/python/coqui/requirements-cpu.txt
@@ -1,3 +1,4 @@
 transformers
 accelerate
-torch
+torch==2.4.1
+coqui-tts
--- a/backend/python/coqui/requirements-cublas11.txt
+++ b/backend/python/coqui/requirements-cublas11.txt
@@ -1,5 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-torch
-torchaudio
+torch==2.4.1+cu118
+torchaudio==2.4.1+cu118
 transformers
-accelerate
+accelerate
+coqui-tts
--- a/backend/python/coqui/requirements-cublas12.txt
+++ b/backend/python/coqui/requirements-cublas12.txt
@@ -1,4 +1,5 @@
-torch
-torchaudio
+torch==2.4.1
+torchaudio==2.4.1
 transformers
-accelerate
+accelerate
+coqui-tts
--- a/backend/python/coqui/requirements-hipblas.txt
+++ b/backend/python/coqui/requirements-hipblas.txt
@@ -1,5 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch
-torchaudio
+torch==2.4.1+rocm6.0
+torchaudio==2.4.1+rocm6.0
 transformers
-accelerate
+accelerate
+coqui-tts
--- a/backend/python/coqui/requirements-intel.txt
+++ b/backend/python/coqui/requirements-intel.txt
@@ -3,6 +3,7 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
-accelerate
+accelerate
+coqui-tts
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
-TTS==0.22.0
-grpcio==1.66.1
+grpcio==1.67.0
 protobuf
-certifi
+certifi
+packaging==24.1
--- a/backend/python/coqui/test.py
+++ b/backend/python/coqui/test.py
@@ -19,7 +19,7 @@ class TestBackendServicer(unittest.TestCase):
        This method sets up the gRPC service by starting the server
        """
        self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
-        time.sleep(10)
+        time.sleep(30)

    def tearDown(self) -> None:
        """
--- a/backend/python/diffusers/requirements-cpu.txt
+++ b/backend/python/diffusers/requirements-cpu.txt
@@ -5,5 +5,5 @@ accelerate
 compel
 peft
 sentencepiece
-torch
+torch==2.4.1
 optimum-quanto
--- a/backend/python/diffusers/requirements-cublas11.txt
+++ b/backend/python/diffusers/requirements-cublas11.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-torch
+torch==2.4.1+cu118
 diffusers
 opencv-python
 transformers
--- a/backend/python/diffusers/requirements-cublas12.txt
+++ b/backend/python/diffusers/requirements-cublas12.txt
@@ -1,4 +1,4 @@
-torch
+torch==2.4.1
 diffusers
 opencv-python
 transformers
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -3,7 +3,7 @@ intel-extension-for-pytorch
 torch
 torchvision
 optimum[openvino]
-setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 diffusers
 opencv-python
 transformers
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.66.1
+grpcio==1.67.0
 pillow
 protobuf
 certifi
--- a/backend/python/exllama2/requirements-cpu.txt
+++ b/backend/python/exllama2/requirements-cpu.txt
@@ -1,3 +1,3 @@
 transformers
 accelerate
-torch
+torch==2.4.1
--- a/backend/python/exllama2/requirements-cublas11.txt
+++ b/backend/python/exllama2/requirements-cublas11.txt
@@ -1,4 +1,4 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-torch
+torch==2.4.1+cu118
 transformers
 accelerate
--- a/backend/python/exllama2/requirements-cublas12.txt
+++ b/backend/python/exllama2/requirements-cublas12.txt
@@ -1,3 +1,3 @@
-torch
+torch==2.4.1
 transformers
 accelerate
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.67.0
 protobuf
 certifi
 wheel
--- a/backend/python/mamba/requirements-cpu.txt
+++ b/backend/python/mamba/requirements-cpu.txt
@@ -1,2 +1,2 @@
-torch
+torch==2.4.1
 transformers
--- a/backend/python/mamba/requirements-cublas11.txt
+++ b/backend/python/mamba/requirements-cublas11.txt
@@ -1,3 +1,3 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-torch
+torch==2.4.1+cu118
 transformers
--- a/backend/python/mamba/requirements-cublas12.txt
+++ b/backend/python/mamba/requirements-cublas12.txt
@@ -1,2 +1,2 @@
-torch
+torch==2.4.1
 transformers
--- a/backend/python/mamba/requirements.txt
+++ b/backend/python/mamba/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.67.0
 protobuf
 certifi
--- a/backend/python/openvoice/requirements-cpu.txt
+++ b/backend/python/openvoice/requirements-cpu.txt
@@ -1 +1,3 @@
-torch
+torch==2.4.1
+git+https://github.com/myshell-ai/MeloTTS.git
+git+https://github.com/myshell-ai/OpenVoice.git
--- a/backend/python/openvoice/requirements-cublas11.txt
+++ b/backend/python/openvoice/requirements-cublas11.txt
@@ -1,2 +1,4 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-torch
+torch==2.4.1+cu118
+git+https://github.com/myshell-ai/MeloTTS.git
+git+https://github.com/myshell-ai/OpenVoice.git
--- a/backend/python/openvoice/requirements-cublas12.txt
+++ b/backend/python/openvoice/requirements-cublas12.txt
@@ -1 +1,3 @@
-torch
+torch==2.4.1
+git+https://github.com/myshell-ai/MeloTTS.git
+git+https://github.com/myshell-ai/OpenVoice.git
--- a/backend/python/openvoice/requirements-hipblas.txt
+++ b/backend/python/openvoice/requirements-hipblas.txt
@@ -1,2 +1,4 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch
+torch==2.4.1+rocm6.0
+git+https://github.com/myshell-ai/MeloTTS.git
+git+https://github.com/myshell-ai/OpenVoice.git
--- a/backend/python/openvoice/requirements-intel.txt
+++ b/backend/python/openvoice/requirements-intel.txt
@@ -2,22 +2,22 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-grpcio==1.66.1
+grpcio==1.67.0
 protobuf
 librosa==0.9.1
-faster-whisper==1.0.3
+faster-whisper==0.9.0
 pydub==0.25.1
 wavmark==0.0.3
-numpy==1.26.4
+numpy==1.22.0
 eng_to_ipa==0.0.2
 inflect==7.0.0
 unidecode==1.3.7
-whisper-timestamped==1.15.4
+whisper-timestamped==1.14.2
 openai
 python-dotenv
 pypinyin==0.50.0
 cn2an==0.5.22
 jieba==0.42.1
-gradio==4.38.1
 langid==1.1.6
 git+https://github.com/myshell-ai/MeloTTS.git
+git+https://github.com/myshell-ai/OpenVoice.git
--- a/backend/python/openvoice/requirements.txt
+++ b/backend/python/openvoice/requirements.txt
@@ -1,10 +1,10 @@
-grpcio==1.66.1
+grpcio==1.67.0
 protobuf
 librosa
 faster-whisper
 pydub==0.25.1
 wavmark==0.0.3
-numpy
+numpy==1.22.0
 eng_to_ipa==0.0.2
 inflect
 unidecode
@@ -13,8 +13,8 @@ openai
 python-dotenv
 pypinyin
 cn2an==0.5.22
+networkx==2.8.8
 jieba==0.42.1
-gradio
+gradio==3.48.0
 langid==1.1.6
-git+https://github.com/myshell-ai/MeloTTS.git
-git+https://github.com/myshell-ai/OpenVoice.git
+llvmlite==0.43.0
--- a/backend/python/openvoice/test.py
+++ b/backend/python/openvoice/test.py
@@ -19,7 +19,7 @@ class TestBackendServicer(unittest.TestCase):
        This method sets up the gRPC service by starting the server
        """
        self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
-        time.sleep(10)
+        time.sleep(30)

    def tearDown(self) -> None:
        """
--- a/backend/python/parler-tts/install.sh
+++ b/backend/python/parler-tts/install.sh
@@ -15,5 +15,12 @@ installRequirements

 # https://github.com/descriptinc/audiotools/issues/101
 # incompatible protobuf versions.
-PYDIR=$(ls ${MY_DIR}/venv/lib)
-curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/builder.py
+PYDIR=python3.10
+pyenv="${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/"
+
+if [ ! -d ${pyenv} ]; then
+    echo "(parler-tts/install.sh): Error: ${pyenv} does not exist"
+    exit 1
+fi
+
+curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${pyenv}/builder.py
--- a/backend/python/parler-tts/requirements-cpu.txt
+++ b/backend/python/parler-tts/requirements-cpu.txt
@@ -1,3 +1,3 @@
 transformers
 accelerate
-torch
+torch==2.4.1
--- a/backend/python/parler-tts/requirements-cublas11.txt
+++ b/backend/python/parler-tts/requirements-cublas11.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-torch
-torchaudio
+torch==2.4.1+cu118
+torchaudio==2.4.1+cu118
 transformers
 accelerate
--- a/backend/python/parler-tts/requirements-cublas12.txt
+++ b/backend/python/parler-tts/requirements-cublas12.txt
@@ -1,4 +1,4 @@
-torch
-torchaudio
+torch==2.4.1
+torchaudio==2.4.1
 transformers
 accelerate
--- a/backend/python/parler-tts/requirements-intel.txt
+++ b/backend/python/parler-tts/requirements-intel.txt
@@ -3,6 +3,6 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/parler-tts/requirements.txt
+++ b/backend/python/parler-tts/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.67.0
 protobuf
 certifi
 llvmlite==0.43.0
--- a/backend/python/rerankers/requirements-cpu.txt
+++ b/backend/python/rerankers/requirements-cpu.txt
@@ -1,4 +1,4 @@
 transformers
 accelerate
-torch
+torch==2.4.1
 rerankers[transformers]
--- a/backend/python/rerankers/requirements-cublas11.txt
+++ b/backend/python/rerankers/requirements-cublas11.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 transformers
 accelerate
-torch
+torch==2.4.1+cu118
 rerankers[transformers]
--- a/backend/python/rerankers/requirements-cublas12.txt
+++ b/backend/python/rerankers/requirements-cublas12.txt
@@ -1,4 +1,4 @@
 transformers
 accelerate
-torch
+torch==2.4.1
 rerankers[transformers]
--- a/backend/python/rerankers/requirements-hipblas.txt
+++ b/backend/python/rerankers/requirements-hipblas.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 transformers
 accelerate
-torch
+torch==2.4.1+rocm6.0
 rerankers[transformers]
--- a/backend/python/rerankers/requirements-intel.txt
+++ b/backend/python/rerankers/requirements-intel.txt
@@ -5,4 +5,4 @@ accelerate
 torch
 rerankers[transformers]
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/rerankers/requirements.txt
+++ b/backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.67.0
 protobuf
 certifi
--- a/backend/python/sentencetransformers/backend.py
+++ b/backend/python/sentencetransformers/backend.py
@@ -55,7 +55,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        """
        model_name = request.Model
        try:
-            self.model = SentenceTransformer(model_name)
+            self.model = SentenceTransformer(model_name, trust_remote_code=request.TrustRemoteCode)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

--- a/backend/python/sentencetransformers/requirements-cpu.txt
+++ b/backend/python/sentencetransformers/requirements-cpu.txt
@@ -1,6 +1,6 @@
-torch
+torch==2.4.1
 accelerate
 transformers
 bitsandbytes
-sentence-transformers==3.0.1
+sentence-transformers==3.2.0
 transformers
--- a/backend/python/sentencetransformers/requirements-cublas11.txt
+++ b/backend/python/sentencetransformers/requirements-cublas11.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-torch
+torch==2.4.1+cu118
 accelerate
-sentence-transformers==3.0.1
+sentence-transformers==3.2.0
 transformers
--- a/backend/python/sentencetransformers/requirements-cublas12.txt
+++ b/backend/python/sentencetransformers/requirements-cublas12.txt
@@ -1,4 +1,4 @@
-torch
+torch==2.4.1
 accelerate
-sentence-transformers==3.0.1
+sentence-transformers==3.2.0
 transformers
--- a/backend/python/sentencetransformers/requirements-hipblas.txt
+++ b/backend/python/sentencetransformers/requirements-hipblas.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch
+torch==2.4.1+rocm6.0
 accelerate
-sentence-transformers==3.0.1
+sentence-transformers==3.2.0
 transformers
--- a/backend/python/sentencetransformers/requirements-intel.txt
+++ b/backend/python/sentencetransformers/requirements-intel.txt
@@ -4,5 +4,5 @@ torch
 optimum[openvino]
 setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
 accelerate
-sentence-transformers==3.0.1
+sentence-transformers==3.2.0
 transformers
--- a/backend/python/sentencetransformers/requirements.txt
+++ b/backend/python/sentencetransformers/requirements.txt
@@ -1,3 +1,5 @@
-grpcio==1.66.1
+grpcio==1.67.0
 protobuf
-certifi
+certifi
+datasets
+einops
--- a/backend/python/transformers-musicgen/requirements-cpu.txt
+++ b/backend/python/transformers-musicgen/requirements-cpu.txt
@@ -1,3 +1,3 @@
 transformers
 accelerate
-torch
+torch==2.4.1
--- a/backend/python/transformers-musicgen/requirements-cublas11.txt
+++ b/backend/python/transformers-musicgen/requirements-cublas11.txt
@@ -1,4 +1,4 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 transformers
 accelerate
-torch
+torch==2.4.1+cu118
--- a/backend/python/transformers-musicgen/requirements-cublas12.txt
+++ b/backend/python/transformers-musicgen/requirements-cublas12.txt
@@ -1,3 +1,3 @@
 transformers
 accelerate
-torch
+torch==2.4.1
--- a/backend/python/transformers-musicgen/requirements-hipblas.txt
+++ b/backend/python/transformers-musicgen/requirements-hipblas.txt
@@ -1,4 +1,4 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 transformers
 accelerate
-torch
+torch==2.4.1+rocm6.0
--- a/backend/python/transformers-musicgen/requirements-intel.txt
+++ b/backend/python/transformers-musicgen/requirements-intel.txt
@@ -4,4 +4,4 @@ transformers
 accelerate
 torch
 optimum[openvino]
-setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/transformers-musicgen/requirements.txt
+++ b/backend/python/transformers-musicgen/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.67.0
 protobuf
 scipy==1.14.0
 certifi
--- a/backend/python/transformers/backend.py
+++ b/backend/python/transformers/backend.py
@@ -72,7 +72,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        Returns:
            A Result object that contains the result of the LoadModel operation.
        """
+
        model_name = request.Model
+        
+        # Check to see if the Model exists in the filesystem already.
+        if os.path.exists(request.ModelFile):
+            model_name = request.ModelFile

        compute = torch.float16
        if request.F16Memory == True:
--- a/backend/python/transformers/requirements-cpu.txt
+++ b/backend/python/transformers/requirements-cpu.txt
@@ -1,4 +1,4 @@
-torch
+torch==2.4.1
 accelerate
 transformers
 bitsandbytes
--- a/backend/python/transformers/requirements-cublas11.txt
+++ b/backend/python/transformers/requirements-cublas11.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-torch
+torch==2.4.1+cu118
 accelerate
 transformers
 bitsandbytes
--- a/backend/python/transformers/requirements-cublas12.txt
+++ b/backend/python/transformers/requirements-cublas12.txt
@@ -1,4 +1,4 @@
-torch
+torch==2.4.1
 accelerate
 transformers
 bitsandbytes
--- a/backend/python/transformers/requirements-hipblas.txt
+++ b/backend/python/transformers/requirements-hipblas.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch
+torch==2.4.1+rocm6.0
 accelerate
 transformers
 bitsandbytes
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.67.0
 protobuf
 certifi
 setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vall-e-x/requirements-cpu.txt
+++ b/backend/python/vall-e-x/requirements-cpu.txt
@@ -1,3 +1,3 @@
 accelerate
-torch
-torchaudio
+torch==2.4.1
+torchaudio==2.4.1
--- a/backend/python/vall-e-x/requirements-cublas11.txt
+++ b/backend/python/vall-e-x/requirements-cublas11.txt
@@ -1,4 +1,4 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 accelerate
-torch
-torchaudio
+torch==2.4.1+cu118
+torchaudio==2.4.1+cu118
--- a/backend/python/vall-e-x/requirements-cublas12.txt
+++ b/backend/python/vall-e-x/requirements-cublas12.txt
@@ -1,3 +1,3 @@
 accelerate
-torch
-torchaudio
+torch==2.4.1
+torchaudio==2.4.1
--- a/backend/python/vall-e-x/requirements-intel.txt
+++ b/backend/python/vall-e-x/requirements-intel.txt
@@ -4,4 +4,4 @@ accelerate
 torch
 torchaudio
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vall-e-x/requirements.txt
+++ b/backend/python/vall-e-x/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.1
+grpcio==1.67.0
 protobuf
 certifi
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -5,6 +5,8 @@ import argparse
 import signal
 import sys
 import os
+from typing import List
+from PIL import Image

 import backend_pb2
 import backend_pb2_grpc
@@ -15,6 +17,10 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.multimodal.utils import fetch_image
+from vllm.assets.video import VideoAsset
+import base64
+import io

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

@@ -105,6 +111,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        try:
            self.llm = AsyncLLMEngine.from_engine_args(engine_args)
        except Exception as err:
+            print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

        try:
@@ -117,7 +124,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
           )
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
-
+        print("Model loaded successfully", file=sys.stderr)
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    async def Predict(self, request, context):
@@ -196,15 +203,35 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if request.Seed != 0:
            sampling_params.seed = request.Seed

+        # Extract image paths and process images
        prompt = request.Prompt
-        
-        # If tokenizer template is enabled and messages are provided instead of prompt apply the tokenizer template
+
+        image_paths = request.Images
+        image_data = [self.load_image(img_path) for img_path in image_paths]
+
+        videos_path = request.Videos
+        video_data = [self.load_video(video_path) for video_path in videos_path]
+
+        # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)

-        # Generate text
+        # Generate text using the LLM engine
        request_id = random_uuid()
-        outputs = self.llm.generate(prompt, sampling_params, request_id)
+        print(f"Generating text with request_id: {request_id}", file=sys.stderr)
+        multi_modal_data = {}
+        if image_data:
+            multi_modal_data["image"] = image_data
+        if video_data:
+            multi_modal_data["video"] = video_data
+        outputs = self.llm.generate(
+            {
+            "prompt": prompt,
+            "multi_modal_data": multi_modal_data if multi_modal_data else None,
+            },
+            sampling_params=sampling_params,
+            request_id=request_id,
+        )

        # Stream the results
        generated_text = ""
@@ -227,9 +254,57 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if streaming:
            return

+        # Remove the image files from /tmp folder
+        for img_path in image_paths:
+            try:
+                os.remove(img_path)
+            except Exception as e:
+                print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
+
        # Sending the final generated text
        yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))

+    def load_image(self, image_path: str):
+        """
+        Load an image from the given file path or base64 encoded data.
+        
+        Args:
+            image_path (str): The path to the image file or base64 encoded data.
+
+        Returns:
+            Image: The loaded image.
+        """
+        try:
+
+            image_data = base64.b64decode(image_path)
+            image = Image.open(io.BytesIO(image_data))
+            return image
+        except Exception as e:
+            print(f"Error loading image {image_path}: {e}", file=sys.stderr)
+            return None
+
+    def load_video(self, video_path: str):
+        """
+        Load a video from the given file path.
+        
+        Args:
+            video_path (str): The path to the image file.
+
+        Returns:
+            Video: The loaded video.
+        """
+        try:
+            timestamp = str(int(time.time() * 1000))  # Generate timestamp
+            p = f"/tmp/vl-{timestamp}.data"  # Use timestamp in filename
+            with open(p, "wb") as f:
+                f.write(base64.b64decode(video_path))
+            video = VideoAsset(name=p).np_ndarrays
+            os.remove(p)
+            return video
+        except Exception as e:
+            print(f"Error loading video {video_path}: {e}", file=sys.stderr)
+            return None
+
 async def serve(address):
    # Start asyncio gRPC server
    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -13,4 +13,20 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi

-installRequirements
+# We don't embed this into the images as it is a large dependency and not always needed.
+# Besides, the speed inference are not actually usable in the current state for production use-cases.
+if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE}" == "xtrue" ]; then
+        ensureVenv
+        # https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
+        if [ ! -d vllm ]; then
+            git clone https://github.com/vllm-project/vllm
+        fi
+        pushd vllm
+            uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.67.0 protobuf bitsandbytes
+            uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+            VLLM_TARGET_DEVICE=cpu python setup.py install
+        popd
+        rm -rf vllm
+    else
+        installRequirements
+fi
--- a/backend/python/vllm/requirements-cpu.txt
+++ b/backend/python/vllm/requirements-cpu.txt
@@ -1,3 +1,3 @@
 accelerate
-torch
+torch==2.4.1
 transformers
--- a/backend/python/vllm/requirements-cublas11.txt
+++ b/backend/python/vllm/requirements-cublas11.txt
@@ -1,4 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 accelerate
-torch
-transformers
+torch==2.4.1+cu118
+transformers
+bitsandbytes
--- a/backend/python/vllm/requirements-cublas12.txt
+++ b/backend/python/vllm/requirements-cublas12.txt
@@ -1,3 +1,4 @@
 accelerate
-torch
-transformers
+torch==2.4.1
+transformers
+bitsandbytes
--- a/backend/python/vllm/requirements-hipblas.txt
+++ b/backend/python/vllm/requirements-hipblas.txt
@@ -1,4 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 accelerate
-torch
-transformers
+torch==2.4.1+rocm6.0
+transformers
+bitsandbytes
--- a/Show More
+++ b/Show More