diff --git a/.github/workflows/binary-releases.yml b/.github/workflows/binary-releases.yml
index e28c7181..c27339f3 100644
--- a/.github/workflows/binary-releases.yml
+++ b/.github/workflows/binary-releases.yml
@@ -61,13 +61,7 @@ jobs:
     runs-on: ubuntu-latest
     needs: get_commit_message
     if: >-
-      contains(needs.get_commit_message.outputs.message, '[binary build]') ||
-      github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'workflow_call' ||
-      (github.event_name == 'pull_request' &&
-      contains(github.event.pull_request.labels.*.name, '03 - Standalone Build')) ||
-      (github.event_name == 'push' &&
-      startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0')))
+      contains(needs.get_commit_message.outputs.message, '[binary build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '03 - Standalone Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))
     steps:
       - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3
         with:
diff --git a/.github/workflows/build-embedding.yml b/.github/workflows/build-embedding.yml
new file mode 100644
index 00000000..a8e976f7
--- /dev/null
+++ b/.github/workflows/build-embedding.yml
@@ -0,0 +1,155 @@
+name: Build Embedding Bentos
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - "main"
+    tags:
+      - "*"
+    paths:
+      - ".github/workflows/build-embedding.yml"
+      - "openllm-python/src/openllm/**"
+  pull_request:
+    branches:
+      - "main"
+    paths:
+      - ".github/workflows/build-embedding.yml"
+      - "openllm-python/src/openllm/**"
+# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun
+defaults:
+  run:
+    shell: bash --noprofile --norc -exo pipefail {0}
+env:
+  OPENLLM_OPT_MODEL_ID: facebook/opt-125m
+  BENTOML_HOME: ${{ github.workspace }}/bentoml
+  OPENLLM_DEV_BUILD: True
+  OPENLLM_DO_NOT_TRACK: True
+jobs:
+  build-and-push-embedding-bento:
+    name: Build embedding container
+    runs-on: ubuntu-latest
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    permissions:
+      contents: write
+      packages: write
+      # This is used to complete the identity challenge
+      # with sigstore/fulcio when running outside of PRs.
+      id-token: write
+      security-events: write
+    steps:
+      - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - uses: bentoml/setup-bentoml-action@59beefe94e2e8f8ebbedf555fc86bd5d1ae0a708 # ratchet:bentoml/setup-bentoml-action@v1
+        with:
+          bentoml-version: 'main'
+          python-version: '3.11'
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@102b1a064a9b145e56556e22b18b19c624538d94 # ratchet:rlespinasse/github-slug-action@v4.4.1
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@2b82ce82d56a2a04d2637cd93a637ae1b359c0a7 # ratchet:docker/setup-qemu-action@v2.2.0
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@4c0219f9ac95b02789c1075625400b2acbff50b1 # ratchet:docker/setup-buildx-action@v2.9.1
+        with:
+          install: true
+          driver-opts: |
+            image=moby/buildkit:master
+            network=host
+      - name: Install cosign
+        if: github.event_name != 'pull_request'
+        uses: sigstore/cosign-installer@6e04d228eb30da1757ee4e1dd75a0ec73a653e06 # ratchet:sigstore/cosign-installer@v3.1.1
+        with:
+          cosign-release: 'v2.1.1'
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@465a07811f14bebb1938fbed4728c6a1ff8901fc # ratchet:docker/login-action@v2.2.0
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Extract metadata tags and labels on PRs
+        if: github.event_name == 'pull_request'
+        id: meta-pr
+        uses: docker/metadata-action@818d4b7b91585d195f67373fd9cb0332e31a7175 # ratchet:docker/metadata-action@v4.6.0
+        with:
+          images: ghcr.io/bentoml/openllm-embedding
+          tags: |
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
+      - name: Extract metadata tags and labels for main, release or tag
+        if: github.event_name != 'pull_request'
+        id: meta
+        uses: docker/metadata-action@818d4b7b91585d195f67373fd9cb0332e31a7175 # ratchet:docker/metadata-action@v4.6.0
+        with:
+          flavor: latest=auto
+          images: ghcr.io/bentoml/openllm-embedding
+          tags: |
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
+      - name: Build OPT Bento with base embeddings
+        id: bento-tag
+        run: |
+          pip install -e ${{ github.workspace }}/openllm-python
+          openllm build opt --serialisation legacy --bento-version sha-${{ env.GITHUB_SHA_SHORT }} --machine --dockerfile-template - <<EOF
+            {% extends "python_debian.j2" %}
+            {% block SETUP_BENTO_BASE_IMAGE %}
+            FROM python:3.11-slim as base-container
+
+            ENV LANG=C.UTF-8
+            ENV LC_ALL=C.UTF-8
+            ENV PYTHONIOENCODING=UTF-8
+            ENV PYTHONUNBUFFERED=1
+
+            USER root
+            ENV DEBIAN_FRONTEND=noninteractive
+            RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache
+            {% call common.RUN(__enable_buildkit__) -%} {{ common.mount_cache(__lib_apt__) }} {{ common.mount_cache(__cache_apt__) }} {% endcall -%} set -eux && \
+                apt-get update -y && \
+                apt-get install -q -y --no-install-recommends --allow-remove-essential \
+                    ca-certificates gnupg2 bash build-essential {% if __options__system_packages is not none %}{{ __options__system_packages | join(' ') }}{% endif -%}
+            {% endblock %}
+          EOF
+          bento_tag=$(python -c "import openllm;print(str(openllm.build('opt',bento_version='sha-${{ env.GITHUB_SHA_SHORT }}',serialisation_format='legacy').tag))")
+          echo "tag=$bento_tag" >> $GITHUB_OUTPUT
+      - name: Build and push Embedding Bento
+        id: build-and-push
+        uses: bentoml/containerize-push-action@main # ratchet:exclude
+        with: # GIT_SHA reads github.sha: the env context only holds workflow-set variables, not the default GITHUB_SHA
+          bento-tag: ${{ steps.bento-tag.outputs.tag }}
+          platforms: linux/amd64
+          push: true
+          build-args: |
+            GIT_SHA=${{ github.sha }}
+            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
+          tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
+      - name: Sign the released image
+        if: ${{ github.event_name != 'pull_request' }}
+        env:
+          COSIGN_EXPERIMENTAL: "true"
+        run: echo "${{ steps.meta.outputs.tags }}" | xargs -I {} cosign sign --yes {}@${{ steps.build-and-push.outputs.digest }}
+      - name: Run Trivy in GitHub SBOM mode and submit results to Dependency Graph
+        uses: aquasecurity/trivy-action@559eb1224e654a86c844a795e6702a0742c60c72 # ratchet:aquasecurity/trivy-action@master
+        if: ${{ github.event_name != 'pull_request' }}
+        with:
+          image-ref: 'ghcr.io/bentoml/openllm-embedding:sha-${{ env.GITHUB_SHA_SHORT }}'
+          format: 'github'
+          output: 'dependency-results.sbom.json'
+          github-pat: ${{ secrets.UI_GITHUB_TOKEN }}
+          scanners: 'vuln'
+      - name: Run Trivy vulnerability scanner
+        uses: aquasecurity/trivy-action@559eb1224e654a86c844a795e6702a0742c60c72 # ratchet:aquasecurity/trivy-action@master
+        if: ${{ github.event_name != 'pull_request' }}
+        with:
+          image-ref: 'ghcr.io/bentoml/openllm-embedding:sha-${{ env.GITHUB_SHA_SHORT }}'
+          format: 'sarif'
+          output: 'trivy-results.sarif'
+          severity: 'CRITICAL'
+          scanners: 'vuln'
+      - name: Upload Trivy scan results to GitHub Security tab
+        uses: github/codeql-action/upload-sarif@a09933a12a80f87b87005513f0abb1494c27a716 # ratchet:github/codeql-action/upload-sarif@v2
+        if: ${{ github.event_name != 'pull_request' }}
+        with:
+          sarif_file: 'trivy-results.sarif'
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 286b82ce..0972cd8d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -41,12 +41,7 @@ jobs:
     runs-on: ubuntu-latest
     needs: get_commit_message
     if: >-
-      contains(needs.get_commit_message.outputs.message, '[ec2 build]') ||
-      github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'pull_request' &&
-      contains(github.event.pull_request.labels.*.name, '00 - EC2 Build')) ||
-      (github.event_name == 'push' &&
-      startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0')))
+      contains(needs.get_commit_message.outputs.message, '[ec2 build]') || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '00 - EC2 Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))
     env:
       EC2_INSTANCE_TYPE: g5.12xlarge
       EC2_AMI_ID: ami-089dafe9af191a0fd
@@ -211,12 +206,7 @@ jobs:
       - get_commit_message
     runs-on: ubuntu-latest
     if: >-
-      (contains(needs.get_commit_message.outputs.message, '[ec2 build]') ||
-      github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'pull_request' &&
-      contains(github.event.pull_request.labels.*.name, '00 - EC2 Build')) ||
-      (github.event_name == 'push' &&
-      startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0')))) && always()
+      (contains(needs.get_commit_message.outputs.message, '[ec2 build]') || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '00 - EC2 Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))) && always()
     steps:
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@5fd3084fc36e372ff1fff382a39b10d03659f355 # ratchet:aws-actions/configure-aws-credentials@v2
diff --git a/.github/workflows/clojure-frontend.yml b/.github/workflows/clojure-frontend.yml
index 9a168f69..2772c91a 100644
--- a/.github/workflows/clojure-frontend.yml
+++ b/.github/workflows/clojure-frontend.yml
@@ -2,13 +2,18 @@ name: Build Clojure UI
 on:
   workflow_dispatch:
   push:
+    branches: [main]
+    tags:
+      - "*"
     paths:
       - 'contrib/clojure/**'
+      - ".github/workflows/clojure-frontend.yml"
   pull_request:
     types: [labeled, opened, synchronize, reopened]
     branches: [main]
     paths:
       - 'contrib/clojure/**'
+      - ".github/workflows/clojure-frontend.yml"
 # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun
 defaults:
   run:
@@ -37,12 +42,7 @@ jobs:
     runs-on: ubuntu-latest
     needs: get_commit_message
     if: >-
-      contains(needs.get_commit_message.outputs.message, '[clojure-ui build]') ||
-      github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'pull_request' &&
-      contains(github.event.pull_request.labels.*.name, '01 - Clojure Build')) ||
-      (github.event_name == 'push' &&
-      startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0')))
+      contains(needs.get_commit_message.outputs.message, '[clojure-ui build]') || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '01 - Clojure Build')) || (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0')))
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
@@ -104,6 +104,7 @@ jobs:
         with:
           context: contrib/clojure
           file: contrib/clojure/Dockerfile
+          platforms: linux/amd64,linux/arm64
           push: true
           build-args: |
             GIT_SHA=${{ env.GITHUB_SHA }}
diff --git a/.github/workflows/compile-pypi.yml b/.github/workflows/compile-pypi.yml
index c48b817f..57d897ac 100644
--- a/.github/workflows/compile-pypi.yml
+++ b/.github/workflows/compile-pypi.yml
@@ -41,8 +41,8 @@ env:
   HATCH_VERBOSE: 2
   CIBW_BUILD_FRONTEND: build
   CIBW_ENVIRONMENT_PASS_LINUX: >
-    HATCH_BUILD_HOOKS_ENABLE
-    MYPYPATH
+    HATCH_BUILD_HOOKS_ENABLE MYPYPATH
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
   cancel-in-progress: true
@@ -70,13 +70,7 @@ jobs:
     runs-on: ubuntu-latest
     needs: get_commit_message
     if: >-
-      contains(needs.get_commit_message.outputs.message, '[wheel build]') ||
-      github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'workflow_call' ||
-      (github.event_name == 'pull_request'
-      && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) ||
-      (github.event_name == 'push' &&
-      startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0')))
+      contains(needs.get_commit_message.outputs.message, '[wheel build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))
     steps:
       - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3
         with:
@@ -121,13 +115,7 @@ jobs:
             macos_arch: "universal2"
     needs: get_commit_message
     if: >-
-      contains(needs.get_commit_message.outputs.message, '[wheel build]') ||
-      github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'workflow_call' ||
-      (github.event_name == 'pull_request'
-      && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) ||
-      (github.event_name == 'push' &&
-      startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0')))
+      contains(needs.get_commit_message.outputs.message, '[wheel build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))
     steps:
       - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3
         with:
diff --git a/README.md b/README.md
index 2e1d908e..3ce8b7fa 100644
--- a/README.md
+++ b/README.md
@@ -500,7 +500,7 @@ to see how you can do it yourself.
 
 ### Embeddings
 
-OpenLLM tentatively provides embeddings endpoint for supported models. This can
+OpenLLM provides an embeddings endpoint for embeddings calculation. This can
 be accessed via `/v1/embeddings`.
 
 To use via CLI, simply call `openllm embed`:
@@ -534,8 +534,10 @@ client.embed("I like to eat apples")
 ```
 
 > [!NOTE]
-> Currently, the following model family supports embeddings: Llama, T5
-> (Flan-T5, FastChat, etc.), ChatGLM
+> Currently, the following model families support embeddings calculation: Llama, T5 (Flan-T5, FastChat, etc.), ChatGLM.
+> For the remaining LLMs that don't have a specific embedding implementation,
+> we will use a generic [BertModel](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
+> for embeddings generation. The implementation is largely based on [`bentoml/sentence-embedding-bento`](https://github.com/bentoml/sentence-embedding-bento).
 
 ### Playground and Chat UI
 
@@ -543,7 +545,7 @@ The following UIs are currently available for OpenLLM:
 
 | UI                                                                                | Owner                                         | Type                 | Progress |
 |-----------------------------------------------------------------------------------|-----------------------------------------------|----------------------|----------|
-| [Clojure](https://github.com/bentoml/OpenLLM/blob/main/contrib/clojure/README.md) | [@ GutZuFusss](https://github.com/GutZuFusss) | Community-maintained | 🔧       |
+| [Clojure](https://github.com/bentoml/OpenLLM/blob/main/contrib/clojure/README.md) | [@GutZuFusss](https://github.com/GutZuFusss)  | Community-maintained | 🔧       |
 | TS                                                                                | BentoML Team                                  |                      | 🚧       |
 
 ## ⚙️ Integrations
diff --git a/changelog.d/227.feature.md b/changelog.d/227.feature.md
new file mode 100644
index 00000000..a4508a1a
--- /dev/null
+++ b/changelog.d/227.feature.md
@@ -0,0 +1,2 @@
+Added a generic embedding implementation largely based on https://github.com/bentoml/sentence-embedding-bento
+for all models that don't have a dedicated embeddings implementation.
diff --git a/openllm-python/README.md b/openllm-python/README.md
index 1327dd53..3ce8b7fa 100644
--- a/openllm-python/README.md
+++ b/openllm-python/README.md
@@ -6,6 +6,8 @@
     <h1 align="center">🦾 OpenLLM</h1>
     <a href="https://pypi.org/project/openllm">
         <img src="https://img.shields.io/pypi/v/openllm.svg?logo=pypi&label=PyPI&logoColor=gold" alt="pypi_status" />
+    </a><a href="https://test.pypi.org/project/openllm/">
+        <img src="https://img.shields.io/badge/Nightly-PyPI?logo=pypi&label=PyPI&color=gray&link=https%3A%2F%2Ftest.pypi.org%2Fproject%2Fopenllm%2F" alt="test_pypi_status" />
     </a><a href="https://twitter.com/bentomlai">
         <img src="https://badgen.net/badge/icon/@bentomlai/1DA1F2?icon=twitter&label=Follow%20Us" alt="Twitter" />
     </a><a href="https://l.bentoml.com/join-openllm-discord">
@@ -493,12 +495,12 @@ openllm build opt --model-id facebook/opt-6.7b --adapter-id ...
 
 OpenLLM encourages contributions by welcoming users to incorporate their custom
 LLMs into the ecosystem. Check out
-[Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/ADDING_NEW_MODEL.md)
+[Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/openllm-python/ADDING_NEW_MODEL.md)
 to see how you can do it yourself.
 
 ### Embeddings
 
-OpenLLM tentatively provides embeddings endpoint for supported models. This can
+OpenLLM provides an embeddings endpoint for embeddings calculation. This can
 be accessed via `/v1/embeddings`.
 
 To use via CLI, simply call `openllm embed`:
@@ -532,8 +534,19 @@ client.embed("I like to eat apples")
 ```
 
 > [!NOTE]
-> Currently, the following model family supports embeddings: Llama, T5
-> (Flan-T5, FastChat, etc.), ChatGLM
+> Currently, the following model families support embeddings calculation: Llama, T5 (Flan-T5, FastChat, etc.), ChatGLM.
+> For the remaining LLMs that don't have a specific embedding implementation,
+> we will use a generic [BertModel](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
+> for embeddings generation. The implementation is largely based on [`bentoml/sentence-embedding-bento`](https://github.com/bentoml/sentence-embedding-bento).
+
+### Playground and Chat UI
+
+The following UIs are currently available for OpenLLM:
+
+| UI                                                                                | Owner                                         | Type                 | Progress |
+|-----------------------------------------------------------------------------------|-----------------------------------------------|----------------------|----------|
+| [Clojure](https://github.com/bentoml/OpenLLM/blob/main/contrib/clojure/README.md) | [@GutZuFusss](https://github.com/GutZuFusss)  | Community-maintained | 🔧       |
+| TS                                                                                | BentoML Team                                  |                      | 🚧       |
 
 ## ⚙️ Integrations
 
diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml
index 206ebfb7..6451bc64 100644
--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -2,80 +2,80 @@
 [build-system]
 build-backend = "hatchling.build"
 requires = [
-    "hatchling==1.18.0",
-    "hatch-vcs==0.3.0",
-    "hatch-fancy-pypi-readme==23.1.0",
-    "hatch-mypyc==0.16.0",
+  "hatchling==1.18.0",
+  "hatch-vcs==0.3.0",
+  "hatch-fancy-pypi-readme==23.1.0",
+  "hatch-mypyc==0.16.0",
 ]
 
 [project]
 authors = [
-    {name = "Aaron Pham",email = "aarnphm@bentoml.com"},
-    {name = "BentoML Team",email = "contact@bentoml.com"},
+  { name = "Aaron Pham", email = "aarnphm@bentoml.com" },
+  { name = "BentoML Team", email = "contact@bentoml.com" },
 ]
 classifiers = [
-    "Development Status :: 5 - Production/Stable",
-    "Environment :: GPU :: NVIDIA CUDA",
-    "Environment :: GPU :: NVIDIA CUDA :: 12",
-    "Environment :: GPU :: NVIDIA CUDA :: 11.8",
-    "Environment :: GPU :: NVIDIA CUDA :: 11.7",
-    "License :: OSI Approved :: Apache Software License",
-    "Topic :: Scientific/Engineering :: Artificial Intelligence",
-    "Topic :: Software Development :: Libraries",
-    "Operating System :: OS Independent",
-    "Intended Audience :: Developers",
-    "Intended Audience :: Science/Research",
-    "Intended Audience :: System Administrators",
-    "Typing :: Typed",
-    "Programming Language :: Python",
-    "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
-    "Programming Language :: Python :: 3.10",
-    "Programming Language :: Python :: 3.11",
-    "Programming Language :: Python :: 3.12",
-    "Programming Language :: Python :: Implementation :: CPython",
-    "Programming Language :: Python :: Implementation :: PyPy",
+  "Development Status :: 5 - Production/Stable",
+  "Environment :: GPU :: NVIDIA CUDA",
+  "Environment :: GPU :: NVIDIA CUDA :: 12",
+  "Environment :: GPU :: NVIDIA CUDA :: 11.8",
+  "Environment :: GPU :: NVIDIA CUDA :: 11.7",
+  "License :: OSI Approved :: Apache Software License",
+  "Topic :: Scientific/Engineering :: Artificial Intelligence",
+  "Topic :: Software Development :: Libraries",
+  "Operating System :: OS Independent",
+  "Intended Audience :: Developers",
+  "Intended Audience :: Science/Research",
+  "Intended Audience :: System Administrators",
+  "Typing :: Typed",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3 :: Only",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: Implementation :: CPython",
+  "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-    "bentoml[grpc,io]>=1.0.25",
-    "transformers[torch,tokenizers,accelerate]>=4.29.0",
-    "safetensors",
-    "optimum",
-    "attrs>=23.1.0",
-    "cattrs>=23.1.0",
-    "orjson",
-    "inflection",
-    "tabulate[widechars]>=0.9.0",
-    "httpx",
-    "click>=8.1.3",
-    "typing_extensions",
-    "mypy_extensions",
-    "ghapi",
-    "cuda-python;platform_system!=\"Darwin\"",
-    "bitsandbytes<0.42",
+  "bentoml[grpc,io]>=1.0.25",
+  "transformers[torch,tokenizers,accelerate]>=4.29.0",
+  "safetensors",
+  "optimum",
+  "attrs>=23.1.0",
+  "cattrs>=23.1.0",
+  "orjson",
+  "inflection",
+  "tabulate[widechars]>=0.9.0",
+  "httpx",
+  "click>=8.1.3",
+  "typing_extensions",
+  "mypy_extensions",
+  "ghapi",
+  "cuda-python;platform_system!=\"Darwin\"",
+  "bitsandbytes<0.42",
 ]
 description = "OpenLLM: Operating LLMs in production"
 dynamic = ["version", "readme"]
 keywords = [
-    "MLOps",
-    "AI",
-    "BentoML",
-    "Model Serving",
-    "Model Deployment",
-    "LLMOps",
-    "Falcon",
-    "Vicuna",
-    "Llama 2",
-    "Fine tuning",
-    "Serverless",
-    "Large Language Model",
-    "Generative AI",
-    "StableLM",
-    "Alpaca",
-    "PyTorch",
-    "Transformers",
+  "MLOps",
+  "AI",
+  "BentoML",
+  "Model Serving",
+  "Model Deployment",
+  "LLMOps",
+  "Falcon",
+  "Vicuna",
+  "Llama 2",
+  "Fine tuning",
+  "Serverless",
+  "Large Language Model",
+  "Generative AI",
+  "StableLM",
+  "Alpaca",
+  "PyTorch",
+  "Transformers",
 ]
 license = "Apache-2.0"
 name = "openllm"
@@ -103,21 +103,21 @@ Twitter = "https://twitter.com/bentomlai"
 [project.optional-dependencies]
 agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
 all = [
-    "openllm[agents]",
-    "openllm[baichuan]",
-    "openllm[chatglm]",
-    "openllm[falcon]",
-    "openllm[fine-tune]",
-    "openllm[flan-t5]",
-    "openllm[ggml]",
-    "openllm[gptq]",
-    "openllm[llama]",
-    "openllm[mpt]",
-    "openllm[openai]",
-    "openllm[opt]",
-    "openllm[playground]",
-    "openllm[starcoder]",
-    "openllm[vllm]",
+  "openllm[agents]",
+  "openllm[baichuan]",
+  "openllm[chatglm]",
+  "openllm[falcon]",
+  "openllm[fine-tune]",
+  "openllm[flan-t5]",
+  "openllm[ggml]",
+  "openllm[gptq]",
+  "openllm[llama]",
+  "openllm[mpt]",
+  "openllm[openai]",
+  "openllm[opt]",
+  "openllm[playground]",
+  "openllm[starcoder]",
+  "openllm[vllm]",
 ]
 baichuan = ["cpm-kernels", "sentencepiece"]
 chatglm = ["cpm-kernels", "sentencepiece"]
@@ -141,12 +141,12 @@ source = "vcs"
 version-file = "src/openllm/_version.py"
 [tool.hatch.version.raw-options]
 git_describe_command = [
-    "git",
-    "describe",
-    "--dirty",
-    "--tags",
-    "--long",
-    "--first-parent",
+  "git",
+  "describe",
+  "--dirty",
+  "--tags",
+  "--long",
+  "--first-parent",
 ]
 local_scheme = "no-local-version"
 root = ".."
@@ -157,55 +157,54 @@ only-include = ["src/openllm"]
 sources = ["src"]
 [tool.hatch.build.targets.sdist]
 exclude = [
-    "/.git_archival.txt",
-    "tests",
-    "/.python-version-default",
-    "ADDING_NEW_MODEL.md",
+  "/.git_archival.txt",
+  "tests",
+  "/.python-version-default",
+  "ADDING_NEW_MODEL.md",
 ]
 [tool.hatch.build.targets.wheel.hooks.mypyc]
 dependencies = [
-    "hatch-mypyc==0.16.0",
-    "mypy==1.4.1",
-    # avoid https://github.com/pallets/click/issues/2558
-    "click==8.1.3",
-    "bentoml==1.1.1",
-    "transformers>=4.31.0",
-    "pandas-stubs",
-    "types-psutil",
-    "types-tabulate",
-    "types-PyYAML",
-    "types-protobuf",
+  "hatch-mypyc==0.16.0",
+  "mypy==1.4.1",
+  # avoid https://github.com/pallets/click/issues/2558
+  "click==8.1.3",
+  "bentoml==1.1.1",
+  "transformers>=4.31.0",
+  "pandas-stubs",
+  "types-psutil",
+  "types-tabulate",
+  "types-PyYAML",
+  "types-protobuf",
 ]
 enable-by-default = false
+exclude = ["src/openllm/_service.py", "src/openllm/_typing_compat.py"]
 include = [
-    "src/openllm/bundle",
-    "src/openllm/models/__init__.py",
-    "src/openllm/models/auto/__init__.py",
-    "src/openllm/utils/__init__.py",
-    "src/openllm/utils/codegen.py",
-    "src/openllm/__init__.py",
-    "src/openllm/_prompt.py",
-    "src/openllm/_schema.py",
-    "src/openllm/_quantisation.py",
-    "src/openllm/_generation.py",
-    "src/openllm/_strategies.py",
-    "src/openllm/exceptions.py",
-    "src/openllm/testing.py",
+  "src/openllm/models/__init__.py",
+  "src/openllm/models/auto/__init__.py",
+  "src/openllm/utils/__init__.py",
+  "src/openllm/__init__.py",
+  "src/openllm/_prompt.py",
+  "src/openllm/_schema.py",
+  "src/openllm/_quantisation.py",
+  "src/openllm/_generation.py",
+  "src/openllm/_strategies.py",
+  "src/openllm/exceptions.py",
+  "src/openllm/testing.py",
 ]
 # NOTE: This is consistent with pyproject.toml
 mypy-args = [
-    "--strict",
-    # this is because all transient library doesn't have types
-    "--allow-subclassing-any",
-    "--follow-imports=skip",
-    "--check-untyped-defs",
-    "--ignore-missing-imports",
-    "--no-warn-return-any",
-    "--warn-unreachable",
-    "--no-warn-no-return",
-    "--no-warn-unused-ignores",
-    "--exclude='/src\\/openllm\\/playground\\/**'",
-    "--exclude='/src\\/openllm\\/_typing_compat\\.py$'",
+  "--strict",
+  # this is because the transitive libraries don't have types
+  "--follow-imports=skip",
+  "--allow-subclassing-any",
+  "--check-untyped-defs",
+  "--ignore-missing-imports",
+  "--no-warn-return-any",
+  "--warn-unreachable",
+  "--no-warn-no-return",
+  "--no-warn-unused-ignores",
+  "--exclude='/src\\/openllm\\/playground\\/**'",
+  "--exclude='/src\\/openllm\\/_typing_compat\\.py$'",
 ]
 options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true }
 require-runtime-dependencies = true
diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py
index 20e2245c..eb9048d5 100644
--- a/openllm-python/src/openllm/__init__.py
+++ b/openllm-python/src/openllm/__init__.py
@@ -28,7 +28,9 @@ else:
   _warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")
 
 _import_structure: dict[str, list[str]] = {
-    "exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"], "_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"],
+    "exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"],
+    "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"],
+    "_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"], "_embeddings": ["GenericEmbeddingRunnable"], "_strategies": ["CascadingResourceStrategy", "get_resource"],
     "models.auto": ["AutoConfig", "CONFIG_MAPPING", "MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": ["ChatGLMConfig"], "models.baichuan": ["BaichuanConfig"], "models.dolly_v2": ["DollyV2Config"], "models.falcon": ["FalconConfig"], "models.flan_t5": ["FlanT5Config"], "models.gpt_neox": ["GPTNeoXConfig"], "models.llama": ["LlamaConfig"], "models.mpt": ["MPTConfig"], "models.opt": ["OPTConfig"], "models.stablelm": ["StableLMConfig"], "models.starcoder": ["StarCoderConfig"]
 }
 COMPILED = _Path(__file__).suffix in (".pyd", ".so")
@@ -40,6 +42,8 @@ if _t.TYPE_CHECKING:
   from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
   from ._quantisation import infer_quantisation_config as infer_quantisation_config
   from ._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs
+  from ._embeddings import GenericEmbeddingRunnable as GenericEmbeddingRunnable
+  from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
   from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc
   from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING, MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES, AutoConfig as AutoConfig
   from .models.baichuan import BaichuanConfig as BaichuanConfig
@@ -54,7 +58,7 @@ if _t.TYPE_CHECKING:
   from .models.stablelm import StableLMConfig as StableLMConfig
   from .models.starcoder import StarCoderConfig as StarCoderConfig
   from .serialisation import ggml as ggml, transformers as transformers
-  from openllm.utils import infer_auto_class as infer_auto_class
+  from .utils import infer_auto_class as infer_auto_class
 
 try:
   if not (utils.is_torch_available() and utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError
diff --git a/openllm-python/src/openllm/_embeddings.py b/openllm-python/src/openllm/_embeddings.py
new file mode 100644
index 00000000..1db41ca9
--- /dev/null
+++ b/openllm-python/src/openllm/_embeddings.py
@@ -0,0 +1,48 @@
+# See https://github.com/bentoml/sentence-embedding-bento for more information.
+from __future__ import annotations
+import bentoml, openllm, transformers, typing as t
+from huggingface_hub import snapshot_download
+from bentoml._internal.frameworks.transformers import MODULE_NAME, API_VERSION
+from bentoml._internal.models.model import ModelOptions, ModelSignature
+if t.TYPE_CHECKING: import torch
+
+_GENERIC_EMBEDDING_ID="sentence-transformers/all-MiniLM-L6-v2"
+_BENTOMODEL_ID="sentence-transformers--all-MiniLM-L6-v2"
+
+def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:  # Return the bentoml model registered under `ids`; create and populate it when the lookup raises NotFound.
+  try:
+    return bentoml.transformers.get(ids)
+  except bentoml.exceptions.NotFound:  # lookup failed: build a new model entry and download the files into it
+    model_signatures = {k: ModelSignature(batchable=False) for k in ("forward", "generate", "contrastive_search", "greedy_search", "sample", "beam_search", "beam_sample", "group_beam_search", "constrained_beam_search", "__call__")}
+    with bentoml.models.create(ids, module=MODULE_NAME, api_version=API_VERSION, options=ModelOptions(), context=openllm.utils.generate_context(framework_name="transformers"), labels={"runtime": "pt", "framework": "openllm"}, signatures=model_signatures) as bentomodel:
+      snapshot_download(_GENERIC_EMBEDDING_ID, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=["*.safetensors","*.h5","*.ot","*.pdf","*.md",".gitattributes","LICENSE.txt"])  # NOTE(review): always downloads _GENERIC_EMBEDDING_ID, whatever `ids` was passed - confirm callers never pass another id
+      return bentomodel
+
+class GenericEmbeddingRunnable(bentoml.Runnable):  # Runnable that embeds sentences with the tokenizer/model loaded from get_or_download().
+  SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
+  SUPPORTS_CPU_MULTI_THREADING = True
+  def __init__(self) -> None:
+    self.device = "cuda" if openllm.utils.device_count() > 0 else "cpu"
+    self._bentomodel = get_or_download()
+    self.tokenizer = transformers.AutoTokenizer.from_pretrained(self._bentomodel.path)
+    self.model = transformers.AutoModel.from_pretrained(self._bentomodel.path)
+    self.model.to(self.device)
+  @bentoml.Runnable.method(batchable=True, batch_dim=0)
+  def encode(self, sentences: list[str]) -> t.Sequence[openllm.LLMEmbeddings]:
+    import torch, torch.nn.functional as F
+    encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(self.device)
+    attention_mask = encoded_input["attention_mask"]
+    # Run the model with gradient tracking disabled to obtain the token embeddings
+    with torch.no_grad(): model_output = self.model(**encoded_input)
+    # Mask-aware mean pooling over tokens, then L2-normalise (p=2) along dim 1
+    sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1)
+    return [openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(), num_tokens=int(torch.sum(attention_mask).item()))]  # NOTE(review): always a one-element list regardless of len(sentences), while the method is declared batchable on dim 0 - confirm this matches what the batcher expects
+  @staticmethod
+  def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    import torch
+    # Average the token embeddings along dim 1, weighting each position by its attention-mask value
+    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)  # clamp keeps the divisor at or above 1e-9, so an all-zero mask row does not divide by zero
+
+__all__ = ["GenericEmbeddingRunnable"]
diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py
index 77313d89..6a43dde2 100644
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -926,7 +926,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
       scheduling_strategy = CascadingResourceStrategy
 
     generate_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
-    embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
+    embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True, batch_dim=0)))
     generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True)))
 
     # NOTE: returning the two langchain API's to the runner
@@ -1036,8 +1036,8 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
       logger.info("Successfully apply LoRA layer %s", adapter_name)
 
     @bentoml.Runnable.method(**method_signature(embeddings_sig))
-    def embeddings(__self: _Runnable, prompt: str | list[str]) -> LLMEmbeddings:
-      return self.embeddings([prompt] if isinstance(prompt, str) else prompt)
+    def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]:
+      return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)]
 
     @bentoml.Runnable.method(**method_signature(generate_sig))
     def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py
index 7c80ee61..7ca3b3ef 100644
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -1,9 +1,3 @@
-# mypy: disable-error-code="arg-type,misc"
-"""The service definition for running any LLMService.
-
-For line with comment '# openllm: ...', it must not be modified as it is managed internally by OpenLLM.
-Codegen can be found under 'openllm.utils.codegen'
-"""
 from __future__ import annotations
 import os, warnings, orjson, bentoml, openllm, typing as t
 from starlette.applications import Starlette
@@ -12,6 +6,7 @@ from starlette.routing import Route
 if t.TYPE_CHECKING:
   from starlette.requests import Request
   from starlette.responses import Response
+  from bentoml._internal.runner.runner import RunnerMethod
 # The following warnings from bitsandbytes, and probably not that important for users to see
 warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
 warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
@@ -20,7 +15,10 @@ model = os.environ.get("OPENLLM_MODEL", "{__model_name__}")  # openllm: model na
 adapter_map = os.environ.get("OPENLLM_ADAPTER_MAP", """{__model_adapter_map__}""")  # openllm: model adapter map
 llm_config = openllm.AutoConfig.for_model(model)
 runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map))
-svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner])
+generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, name="llm-generic-embedding", scheduling_strategy=openllm.CascadingResourceStrategy, max_batch_size=32, max_latency_ms=300)
+runners: list[bentoml.Runner] = [runner]  # annotated as a list, not t.Sequence, because it is appended to on the next line
+if not runner.supports_embeddings: runners.append(generic_embedding_runner)  # only register the generic embedder when the LLM runner has no embeddings of its own
+svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners)
 
 @svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}), output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)}))
 async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
@@ -33,11 +31,11 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
 def metadata_v1(_: str) -> openllm.MetadataOutput:
   return openllm.MetadataOutput(timeout=llm_config["timeout"], model_name=llm_config["model_name"], framework=llm_config["env"]["framework_value"], model_id=runner.llm.model_id, configuration=llm_config.model_dump_json().decode(), supports_embeddings=runner.supports_embeddings, supports_hf_agent=runner.supports_hf_agent)
 
-if runner.supports_embeddings:
-  @svc.api(route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), output=bentoml.io.JSON.from_sample({"embeddings": [0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076], "num_tokens": 20}))
-  async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
-    responses = await runner.embeddings.async_run(phrases)
-    return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"])
+@svc.api(route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), output=bentoml.io.JSON.from_sample({"embeddings": [0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076], "num_tokens": 20}))
+async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
+  embed_call: RunnerMethod[bentoml.Runnable | openllm.LLMRunnable[t.Any, t.Any], [list[str]], t.Sequence[openllm.LLMEmbeddings]] = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode
+  responses = (await embed_call.async_run(phrases))[0]  # the runner method is typed as returning a sequence of LLMEmbeddings; only its first element is used
+  return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"])
 
 if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
   async def hf_agent(request: Request) -> Response:
diff --git a/openllm-python/src/openllm/_strategies.py b/openllm-python/src/openllm/_strategies.py
index e40d43c1..63255fa0 100644
--- a/openllm-python/src/openllm/_strategies.py
+++ b/openllm-python/src/openllm/_strategies.py
@@ -330,3 +330,5 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
       if idx >= len(gpus): raise ValueError(f"Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}")
       dev = str(gpus[idx])
     return dev
+
+__all__=["CascadingResourceStrategy", "get_resource"]
diff --git a/openllm-python/src/openllm/_typing_compat.py b/openllm-python/src/openllm/_typing_compat.py
index 8fe8d92e..dd6b35eb 100644
--- a/openllm-python/src/openllm/_typing_compat.py
+++ b/openllm-python/src/openllm/_typing_compat.py
@@ -82,7 +82,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
   supports_embeddings: bool
   supports_hf_agent: bool
   has_adapters: bool
-  embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], LLMEmbeddings]
+  embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[LLMEmbeddings]]
   generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
   generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
   generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[t.Any, None, None]]
diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py
index 288cf487..9e3bf20d 100644
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -78,16 +78,15 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
   return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"])
 
 def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
-  _bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
-  _bentoml_config_options_opts = ["tracing.sample_rate=1.0", f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'api_server.traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}']
-  _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
+  from openllm.cli._factory import parse_config_options
+  environ = parse_config_options(llm.config, llm.config["timeout"], workers_per_resource, None, True, os.environ.copy())
   env: openllm.utils.EnvVarMixin = llm.config["env"]
   if env["framework_value"] == "vllm": serialisation_format = "legacy"
   env_dict = {
       env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'",
       env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}",
       "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format,
-      "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{_bentoml_config_options}'",
+      "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{environ['BENTOML_CONFIG_OPTIONS']}'",
   }
   if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1")
 
@@ -125,8 +124,7 @@ def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.A
   )
 
   bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath("/"))
-  # NOTE: the model_id_path here are only used for setting this environment variable within the container
-  # built with for BentoLLM.
+  # NOTE: the model_id_path here is only used for setting this environment variable within the container built for BentoLLM.
   service_fs_path = fs.path.join("src", llm.config["service_name"])
   service_path = bento._fs.getsyspath(service_fs_path)
   with open(service_path, "r") as f:
diff --git a/openllm-python/src/openllm/cli/_factory.py b/openllm-python/src/openllm/cli/_factory.py
index c93e649f..53b974ab 100644
--- a/openllm-python/src/openllm/cli/_factory.py
+++ b/openllm-python/src/openllm/cli/_factory.py
@@ -27,6 +27,7 @@ def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_res
   if device:
     if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
     else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
+  _bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
   if cors:
     _bentoml_config_options_opts.extend(["api_server.http.cors.enabled=true", 'api_server.http.cors.access_control_allow_origins="*"'])
     _bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(["GET", "OPTIONS", "POST", "HEAD", "PUT"])])
diff --git a/openllm-python/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py
index 0e1d39d3..30bdc717 100644
--- a/openllm-python/src/openllm/cli/_sdk.py
+++ b/openllm-python/src/openllm/cli/_sdk.py
@@ -77,7 +77,7 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30
   return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False)
 
 @inject
-def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
+def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, bento_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
   """Package a LLM into a Bento.
 
   The LLM will be built into a BentoService with the following structure:
@@ -92,6 +92,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
       model_name: The model name to start this LLM
       model_id: Optional model id for this given LLM
       model_version: Optional model version for this given LLM
+      bento_version: Optional bento version for this given BentoLLM
       quantize: Quantize the model weights. This is only applicable for PyTorch models.
                 Possible quantisation strategies:
                 - int8: Quantize the model with 8bit (bitsandbytes required)
@@ -126,7 +127,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
   Returns:
       ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
   """
-  args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format,]
+  args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format]
   if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
   if quantize: args.extend(["--quantize", quantize])
   if bettertransformer: args.append("--bettertransformer")
@@ -140,6 +141,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
   if overwrite: args.append("--overwrite")
   if adapter_map: args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()])
   if model_version: args.extend(["--model-version", model_version])
+  if bento_version: args.extend(["--bento-version", bento_version])
   if dockerfile_template: args.extend(["--dockerfile-template", dockerfile_template])
   if container_registry is None: container_registry = "ecr"
   if container_version_strategy is None: container_version_strategy = "release"
diff --git a/openllm-python/src/openllm/client/runtimes/http.py b/openllm-python/src/openllm/client/runtimes/http.py
index f59f6f5a..a8a73bf7 100644
--- a/openllm-python/src/openllm/client/runtimes/http.py
+++ b/openllm-python/src/openllm/client/runtimes/http.py
@@ -20,7 +20,6 @@ class HTTPClient(BaseClient[DictStrAny]):
 
   def health(self) -> t.Any: return self._cached.health()
   def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
-    if not self.supports_embeddings: raise ValueError("This model does not support embeddings.")
     if isinstance(prompt, str): prompt = [prompt]
     result = httpx.post(urljoin(self._address, f"/{self._api_version}/embeddings"), json=list(prompt), timeout=self.timeout).json() if in_async_context() else self.call("embeddings", list(prompt))
     return openllm.EmbeddingsOutput(**result)
@@ -62,7 +61,6 @@ class AsyncHTTPClient(BaseAsyncClient[DictStrAny]):
 
   async def health(self) -> t.Any: return await self._cached.async_health()
   async def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
-    if not self.supports_embeddings: raise ValueError("This model does not support embeddings.")
     if isinstance(prompt, str): prompt = [prompt]
     res = await self.acall("embeddings", list(prompt))
     return openllm.EmbeddingsOutput(**res)
diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py
index 5ab9148d..0e0c07b6 100644
--- a/openllm-python/src/openllm/serialisation/transformers/__init__.py
+++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py
@@ -106,8 +106,7 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
         else:
           # we will clone the all tings into the bentomodel path without loading model into memory
           snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
-    except Exception:
-      raise
+    except Exception: raise
     else:
       bentomodel.flush()  # type: ignore[no-untyped-call]
       bentomodel.save(_model_store)
@@ -117,7 +116,6 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
       # NOTE: We need to free up the cache after importing the model
       # in the case where users first run openllm start without the model available locally.
       if openllm.utils.is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
-
     return bentomodel
 
 def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
diff --git a/openllm-python/src/openllm/utils/codegen.py b/openllm-python/src/openllm/utils/codegen.py
index 49431d5c..994e8818 100644
--- a/openllm-python/src/openllm/utils/codegen.py
+++ b/openllm-python/src/openllm/utils/codegen.py
@@ -32,7 +32,7 @@ class ModelIdFormatter(ModelNameFormatter):
 class ModelAdapterMapFormatter(ModelNameFormatter):
   model_keyword: LiteralString = "__model_adapter_map__"
 
-_service_file = Path(os.path.abspath("__file__")).parent.parent/"_service.py"
+_service_file = Path(os.path.abspath(__file__)).parent.parent/"_service.py"
 def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
   from openllm.utils import DEBUG
   model_name = llm.config["model_name"]
diff --git a/pyproject.toml b/pyproject.toml
index 2c427fb3..b3368009 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,61 +1,61 @@
 [build-system]
 build-backend = "hatchling.build"
 requires = [
-    "hatchling==1.18.0",
-    "hatch-vcs==0.3.0",
-    "hatch-fancy-pypi-readme==23.1.0",
-    "hatch-mypyc==0.16.0",
+  "hatchling==1.18.0",
+  "hatch-vcs==0.3.0",
+  "hatch-fancy-pypi-readme==23.1.0",
+  "hatch-mypyc==0.16.0",
 ]
 [project]
 authors = [
-    { name = "Aaron Pham", email = "aarnphm@bentoml.com" },
-    { name = "BentoML Team", email = "contact@bentoml.com" },
+  { name = "Aaron Pham", email = "aarnphm@bentoml.com" },
+  { name = "BentoML Team", email = "contact@bentoml.com" },
 ]
 classifiers = [
-    "Development Status :: 5 - Production/Stable",
-    "Environment :: GPU :: NVIDIA CUDA",
-    "Environment :: GPU :: NVIDIA CUDA :: 12",
-    "Environment :: GPU :: NVIDIA CUDA :: 11.8",
-    "Environment :: GPU :: NVIDIA CUDA :: 11.7",
-    "License :: OSI Approved :: Apache Software License",
-    "Topic :: Scientific/Engineering :: Artificial Intelligence",
-    "Topic :: Software Development :: Libraries",
-    "Operating System :: OS Independent",
-    "Intended Audience :: Developers",
-    "Intended Audience :: Science/Research",
-    "Intended Audience :: System Administrators",
-    "Typing :: Typed",
-    "Programming Language :: Python",
-    "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
-    "Programming Language :: Python :: 3.10",
-    "Programming Language :: Python :: 3.11",
-    "Programming Language :: Python :: 3.12",
-    "Programming Language :: Python :: Implementation :: CPython",
-    "Programming Language :: Python :: Implementation :: PyPy",
+  "Development Status :: 5 - Production/Stable",
+  "Environment :: GPU :: NVIDIA CUDA",
+  "Environment :: GPU :: NVIDIA CUDA :: 12",
+  "Environment :: GPU :: NVIDIA CUDA :: 11.8",
+  "Environment :: GPU :: NVIDIA CUDA :: 11.7",
+  "License :: OSI Approved :: Apache Software License",
+  "Topic :: Scientific/Engineering :: Artificial Intelligence",
+  "Topic :: Software Development :: Libraries",
+  "Operating System :: OS Independent",
+  "Intended Audience :: Developers",
+  "Intended Audience :: Science/Research",
+  "Intended Audience :: System Administrators",
+  "Typing :: Typed",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3 :: Only",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: Implementation :: CPython",
+  "Programming Language :: Python :: Implementation :: PyPy",
 ]
 description = "OpenLLM monorepo"
 dynamic = ["version", "readme", "dependencies"]
 keywords = [
-    "MLOps",
-    "AI",
-    "BentoML",
-    "Model Serving",
-    "Model Deployment",
-    "LLMOps",
-    "Falcon",
-    "Vicuna",
-    "Llama 2",
-    "Fine tuning",
-    "Serverless",
-    "Large Language Model",
-    "Generative AI",
-    "StableLM",
-    "Alpaca",
-    "PyTorch",
-    "Transformers",
+  "MLOps",
+  "AI",
+  "BentoML",
+  "Model Serving",
+  "Model Deployment",
+  "LLMOps",
+  "Falcon",
+  "Vicuna",
+  "Llama 2",
+  "Fine tuning",
+  "Serverless",
+  "Large Language Model",
+  "Generative AI",
+  "StableLM",
+  "Alpaca",
+  "PyTorch",
+  "Transformers",
 ]
 license = "Apache-2.0"
 name = "openllm-monorepo"
@@ -74,12 +74,12 @@ fallback-version = "0.0.0"
 source = "vcs"
 [tool.hatch.version.raw-options]
 git_describe_command = [
-    "git",
-    "describe",
-    "--dirty",
-    "--tags",
-    "--long",
-    "--first-parent",
+  "git",
+  "describe",
+  "--dirty",
+  "--tags",
+  "--long",
+  "--first-parent",
 ]
 local_scheme = "no-local-version"
 [tool.hatch.metadata]
@@ -151,59 +151,59 @@ toplevel = ["openllm"]
 
 [tool.ruff]
 extend-exclude = [
-    "tools",
-    "examples",
-    "openllm-python/src/openllm/playground",
-    "openllm-python/src/openllm/__init__.py",
-    "openllm-python/src/openllm/_version.py",
-    "openllm-python/src/openllm/utils/dummy_*.py",
-    "openllm-python/src/openllm/models/__init__.py",
+  "tools",
+  "examples",
+  "openllm-python/src/openllm/playground",
+  "openllm-python/src/openllm/__init__.py",
+  "openllm-python/src/openllm/_version.py",
+  "openllm-python/src/openllm/utils/dummy_*.py",
+  "openllm-python/src/openllm/models/__init__.py",
 ]
 extend-include = ["*.ipynb"]
 extend-select = [
-    "B",    # flake8-bugbear
-    "I",    # isort
-    "G",    # flake8-logging-format
-    "D",    # pydocstyle
-    "W",    # pycodestyle
-    "Q",    # flake8-quotes
-    "FA",   # flake8-future-annotations
-    "S",    # flake8-bandit
-    "TCH",  # flake8-type-checking
-    "PLW",  # pylint-warning
-    "PLR",  # pylint-refactor
-    "PT",   # flake8-pytest-style
-    "PERF", # perflint
-    "FLY",  # flynt
-    "RUF",  # Ruff-specific rules
-    "YTT",  # flake8-2020
+  "B",    # flake8-bugbear
+  "I",    # isort
+  "G",    # flake8-logging-format
+  "D",    # pydocstyle
+  "W",    # pycodestyle
+  "Q",    # flake8-quotes
+  "FA",   # flake8-future-annotations
+  "S",    # flake8-bandit
+  "TCH",  # flake8-type-checking
+  "PLW",  # pylint-warning
+  "PLR",  # pylint-refactor
+  "PT",   # flake8-pytest-style
+  "PERF", # perflint
+  "FLY",  # flynt
+  "RUF",  # Ruff-specific rules
+  "YTT",  # flake8-2020
 ]
 fix = true
 ignore = [
-    "B027",    # Allow non-abstract empty methods in abstract base classes
-    "FBT003",  # Allow boolean positional values in function calls, like `dict.get(... True)`
-    "S105",    # Ignore checks for possible passwords
-    "S106",
-    "S107",
-    "S603",    # ignore subprocess.call
-    "PLR0911",
-    "PLR0912",
-    "PLR0913",
-    "PLR0915",
-    "PLR2004", # magic value to use constant
-    "E501",    # ignore line length violation
-    "E401",    # ignore multiple line import
-    "E702",
-    "I001",    # unsorted imports
-    "D103",    # Just missing docstring for magic methods.
-    "D102",
-    "D101",
-    "D100",
-    "TCH004",  # don't move runtime import out, just warn about it
-    "RUF012",  # mutable attributes to be used with ClassVar
-    "B905",    # zip warning about strict, only applicable for 3.10+
-    "D105",    # magic docstring
-    "E701",    # multiple statement on single line
+  "B027",    # Allow non-abstract empty methods in abstract base classes
+  "FBT003",  # Allow boolean positional values in function calls, like `dict.get(... True)`
+  "S105",    # Ignore checks for possible passwords
+  "S106",
+  "S107",
+  "S603",    # ignore subprocess.call
+  "PLR0911",
+  "PLR0912",
+  "PLR0913",
+  "PLR0915",
+  "PLR2004", # magic value to use constant
+  "E501",    # ignore line length violation
+  "E401",    # ignore multiple line import
+  "E702",
+  "I001",    # unsorted imports
+  "D103",    # Just missing docstring for magic methods.
+  "D102",
+  "D101",
+  "D100",
+  "TCH004",  # don't move runtime import out, just warn about it
+  "RUF012",  # mutable attributes to be used with ClassVar
+  "B905",    # zip warning about strict, only applicable for 3.10+
+  "D105",    # magic docstring
+  "E701",    # multiple statement on single line
 ]
 line-length = 768
 target-version = "py38"
@@ -212,10 +212,10 @@ unfixable = ["TCH004"]
 [tool.ruff.flake8-type-checking]
 exempt-modules = ["typing", "typing_extensions", "openllm._typing_compat"]
 runtime-evaluated-base-classes = [
-    "pydantic.BaseModel",
-    "openllm._configuration.LLMConfig",
-    "openllm._configuration.GenerationConfig",
-    "openllm._configuration.ModelSettings",
+  "pydantic.BaseModel",
+  "openllm._configuration.LLMConfig",
+  "openllm._configuration.GenerationConfig",
+  "openllm._configuration.ModelSettings",
 ]
 runtime-evaluated-decorators = ["attrs.define", "attrs.frozen", "trait"]
 [tool.ruff.pydocstyle]
@@ -228,12 +228,12 @@ force-single-line = false
 force-wrap-aliases = true
 known-first-party = ["openllm", "bentoml"]
 known-third-party = [
-    "transformers",
-    "click",
-    "huggingface_hub",
-    "torch",
-    "vllm",
-    "auto_gptq",
+  "transformers",
+  "click",
+  "huggingface_hub",
+  "torch",
+  "vllm",
+  "auto_gptq",
 ]
 lines-after-imports = 0
 lines-between-types = 0
@@ -305,44 +305,44 @@ openllm = ["openllm-python/src/openllm", "*/openllm-python/src/openllm"]
 [tool.coverage.run]
 branch = true
 omit = [
-    "__pypackages__/*",
-    "openllm-python/src/openllm/_version.py",
-    "openllm-python/src/openllm/playground/",
-    "openllm-python/src/openllm/__init__.py",
-    "openllm-python/src/openllm/__main__.py",
-    "openllm-python/src/openllm/utils/dummy_*.py",
-    "openllm-python/src/openllm/_typing_compat.py",
+  "__pypackages__/*",
+  "openllm-python/src/openllm/_version.py",
+  "openllm-python/src/openllm/playground/",
+  "openllm-python/src/openllm/__init__.py",
+  "openllm-python/src/openllm/__main__.py",
+  "openllm-python/src/openllm/utils/dummy_*.py",
+  "openllm-python/src/openllm/_typing_compat.py",
 ]
 source_pkgs = ["openllm"]
 [tool.coverage.report]
 exclude_lines = [
-    "no cov",
-    "pragma: no cover",
-    "if __name__ == .__main__.:",
-    "if t.TYPE_CHECKING:",
-    "if _t.TYPE_CHECKING:",
-    'if TYPE_CHECKING:',
-    'if typing.TYPE_CHECKING:',
-    'if t.TYPE_CHECKING and not MYPY:',
-    'if DEBUG:',
-    'if utils.DEBUG',
-    'if openllm.utils.DEBUG',
-    '@_overload',
-    '@overload',
-    '@t.overload',
-    '@typing.overload',
-    'raise NotImplementedError',
-    'raise NotImplemented',
-    'except MissingDependencyError:',
+  "no cov",
+  "pragma: no cover",
+  "if __name__ == .__main__.:",
+  "if t.TYPE_CHECKING:",
+  "if _t.TYPE_CHECKING:",
+  'if TYPE_CHECKING:',
+  'if typing.TYPE_CHECKING:',
+  'if t.TYPE_CHECKING and not MYPY:',
+  'if DEBUG:',
+  'if utils.DEBUG',
+  'if openllm.utils.DEBUG',
+  '@_overload',
+  '@overload',
+  '@t.overload',
+  '@typing.overload',
+  'raise NotImplementedError',
+  'raise NotImplemented',
+  'except MissingDependencyError:',
 ]
 omit = [
-    "__pypackages__/*",
-    "openllm-python/src/openllm/_version.py",
-    "openllm-python/src/openllm/playground/",
-    "openllm-python/src/openllm/__init__.py",
-    "openllm-python/src/openllm/__main__.py",
-    "openllm-python/src/openllm/utils/dummy_*.py",
-    "openllm-python/src/openllm/_typing_compat.py",
+  "__pypackages__/*",
+  "openllm-python/src/openllm/_version.py",
+  "openllm-python/src/openllm/playground/",
+  "openllm-python/src/openllm/__init__.py",
+  "openllm-python/src/openllm/__main__.py",
+  "openllm-python/src/openllm/utils/dummy_*.py",
+  "openllm-python/src/openllm/_typing_compat.py",
 ]
 precision = 2
 show_missing = true
@@ -353,19 +353,19 @@ show_missing = true
 [tool.pyright]
 analysis.useLibraryCodeForTypes = true
 exclude = [
-    "__pypackages__/*",
-    "openllm-python/src/openllm/playground/",
-    "openllm-python/src/openllm/models/",
-    "openllm-python/src/openllm/__init__.py",
-    "openllm-python/src/openllm/__main__.py",
-    "openllm-python/src/openllm/utils/dummy_*.py",
-    "openllm-python/src/openllm/_typing_compat.py",
-    "openllm-python/tests",
-    "cz.py",
-    "hatch_build.py",
-    "tools",
-    "examples",
-    "typings",
+  "__pypackages__/*",
+  "openllm-python/src/openllm/playground/",
+  "openllm-python/src/openllm/models/",
+  "openllm-python/src/openllm/__init__.py",
+  "openllm-python/src/openllm/__main__.py",
+  "openllm-python/src/openllm/utils/dummy_*.py",
+  "openllm-python/src/openllm/_typing_compat.py",
+  "openllm-python/tests",
+  "cz.py",
+  "hatch_build.py",
+  "tools",
+  "examples",
+  "typings",
 ]
 executionEnvironments = [{ root = "openllm-python/src" }]
 pythonVersion = "3.12"
@@ -382,14 +382,15 @@ typeCheckingMode = "strict"
 
 [tool.mypy]
 exclude = [
-    "examples",
-    "tools",
-    "cz.py",
-    "openllm-python/tests",
-    "openllm-python/src/openllm/playground",
-    "openllm-python/src/openllm/utils/dummy_*.py",
-    "openllm-python/src/openllm/models",
-    "openllm-python/src/openllm/_typing_compat.py",
+  "examples",
+  "tools",
+  "cz.py",
+  "openllm-python/tests",
+  "openllm-python/src/openllm/playground",
+  "openllm-python/src/openllm/utils/dummy_*.py",
+  "openllm-python/src/openllm/models",
+  "openllm-python/src/openllm/_typing_compat.py",
+  "openllm-python/src/openllm/_service.py",
 ]
 modules = ["openllm"]
 mypy_path = "typings"
@@ -404,24 +405,24 @@ warn_unused_ignores = false
 [[tool.mypy.overrides]]
 ignore_missing_imports = true
 module = [
-    "IPython.*",
-    "numpy.*",
-    "tensorflow.*",
-    "torch.*",
-    "optimum.*",
-    "inflection.*",
-    "huggingface_hub.*",
-    "click_option_group.*",
-    "peft.*",
-    "auto_gptq.*",
-    "vllm.*",
-    "orjson.*",
-    "httpx.*",
-    "cloudpickle.*",
-    "circus.*",
-    "grpc_health.v1.*",
-    "transformers.*",
-    "ghapi.*",
+  "IPython.*",
+  "numpy.*",
+  "tensorflow.*",
+  "torch.*",
+  "optimum.*",
+  "inflection.*",
+  "huggingface_hub.*",
+  "click_option_group.*",
+  "peft.*",
+  "auto_gptq.*",
+  "vllm.*",
+  "orjson.*",
+  "httpx.*",
+  "cloudpickle.*",
+  "circus.*",
+  "grpc_health.v1.*",
+  "transformers.*",
+  "ghapi.*",
 ]
 [[tool.mypy.overrides]]
 ignore_errors = true
diff --git a/tools/run-release-action b/tools/run-release-action
index b454becd..501a851d 100755
--- a/tools/run-release-action
+++ b/tools/run-release-action
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-set -e -o pipefail
+set -ex -o pipefail
 
 # Function to print script usage
 print_usage() {
@@ -55,21 +55,55 @@ else
     exit 1
 fi
 
+# Resolve the tag of the most recent tagged commit across all branches, not just the current branch
+version="$(git describe --tags "$(git rev-list --tags --max-count=1)")"
+VERSION="${version#v}"  # drop the leading "v" so only the dotted numbers remain
+# Save the current value of IFS to restore it later
+OLD_IFS=$IFS
+IFS='.'
+# Split the dotted version into the VERSION_BITS array (major, minor, patch)
+read -ra VERSION_BITS <<< "$VERSION"
+# Restore the original value of IFS
+IFS=$OLD_IFS
+VNUM1=${VERSION_BITS[0]}  # major
+VNUM2=${VERSION_BITS[1]}  # minor
+VNUM3=${VERSION_BITS[2]}  # patch
+
+if [[ $release == 'major' ]]; then
+    VNUM1=$((VNUM1 + 1))
+    VNUM2=0
+    VNUM3=0
+elif [[ $release == 'minor' ]]; then
+    VNUM2=$((VNUM2 + 1))
+    VNUM3=0
+else  # any other value of $release bumps the patch component
+    VNUM3=$((VNUM3 + 1))
+fi
+# Compose the expected name of the next release tag; the script later waits for it to appear on origin
+RELEASE_TAG="v$VNUM1.$VNUM2.$VNUM3"
+
 echo "Running release actions (create-releases.yml)..."
 echo '{"release_type": "'"$release"'"}' | gh workflow run create-releases.yml --repo bentoml/openllm --json
 
-sleep 10
+sleep 20
 
-echo "Getting 'create-releases.yml' workflow id..."
-WORKFLOW_ID=$(gh run list -w release --repo bentoml/openllm -L 1 --json databaseId | jq '.[]| .databaseId')
-
-echo "Waiting for workflow create-releases.yml to complete..."
+echo "Waiting for new tags to be released from 'create-releases.yml'"
 while true; do
-    STATUS=$(gh run view "$WORKFLOW_ID" --json status --jq '.status')
-    if [[ "${STATUS}" == "completed" ]]; then
+    git pull --autostash --no-edit --gpg-sign --ff origin main
+    if git ls-remote -t --exit-code origin "refs/tags/${RELEASE_TAG}" &> /dev/null; then
         break
     fi
     sleep 10
 done
 
-sleep 10
+# sleep 5
+# echo "Building OpenLLM container for ${RELEASE_TAG}..."
+# gh workflow run build.yml -R bentoml/openllm -r "${RELEASE_TAG}"
+#
+# sleep 5
+# echo "Building OpenLLM embedding container for ${RELEASE_TAG}..."
+# gh workflow run build-embedding.yml -R bentoml/openllm -r "${RELEASE_TAG}"
+#
+# sleep 5
+# echo "Building Clojure UI (community-maintained) for ${RELEASE_TAG}..."
+# gh workflow run clojure-frontend.yml -R bentoml/openllm -r "${RELEASE_TAG}"