feat(embedding): Adding generic endpoint (#227)

2026-05-24 16:44:39 -04:00 · 2023-08-17 15:17:00 -04:00
parent d5c4066ff4
commit 4140d160b8
23 changed files with 621 additions and 393 deletions
--- a/openllm-python/README.md
+++ b/openllm-python/README.md
@@ -6,6 +6,8 @@
    <h1 align="center">🦾 OpenLLM</h1>
    <a href="https://pypi.org/project/openllm">
        <img src="https://img.shields.io/pypi/v/openllm.svg?logo=pypi&label=PyPI&logoColor=gold" alt="pypi_status" />
+    </a><a href="https://test.pypi.org/project/openllm/">
+        <img src="https://img.shields.io/badge/Nightly-PyPI?logo=pypi&label=PyPI&color=gray&link=https%3A%2F%2Ftest.pypi.org%2Fproject%2Fopenllm%2F" alt="test_pypi_status" />
    </a><a href="https://twitter.com/bentomlai">
        <img src="https://badgen.net/badge/icon/@bentomlai/1DA1F2?icon=twitter&label=Follow%20Us" alt="Twitter" />
    </a><a href="https://l.bentoml.com/join-openllm-discord">
@@ -493,12 +495,12 @@ openllm build opt --model-id facebook/opt-6.7b --adapter-id ...

 OpenLLM encourages contributions by welcoming users to incorporate their custom
 LLMs into the ecosystem. Check out
-[Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/ADDING_NEW_MODEL.md)
+[Adding a New Model Guide](https://github.com/bentoml/OpenLLM/blob/main/openllm-python/ADDING_NEW_MODEL.md)
 to see how you can do it yourself.

 ### Embeddings

-OpenLLM tentatively provides embeddings endpoint for supported models. This can
+OpenLLM provides an embeddings endpoint for embeddings calculation. This can
 be accessed via `/v1/embeddings`.

 To use via CLI, simply call `openllm embed`:
@@ -532,8 +534,19 @@ client.embed("I like to eat apples")
 ```

 > [!NOTE]
-> Currently, the following model family supports embeddings: Llama, T5
-> (Flan-T5, FastChat, etc.), ChatGLM
+> Currently, the following model families support embeddings calculation: Llama, T5 (Flan-T5, FastChat, etc.), ChatGLM.
+> For the remaining LLMs that don't have a specific embedding implementation,
+> we will use a generic [BertModel](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
+> for embeddings generation. The implementation is largely based on [`bentoml/sentence-embedding-bento`](https://github.com/bentoml/sentence-embedding-bento).
+
+### Playground and Chat UI
+
+The following UIs are currently available for OpenLLM:
+
+| UI                                                                                | Owner                                         | Type                 | Progress |
+|-----------------------------------------------------------------------------------|-----------------------------------------------|----------------------|----------|
+| [Clojure](https://github.com/bentoml/OpenLLM/blob/main/contrib/clojure/README.md) | [@GutZuFusss](https://github.com/GutZuFusss)  | Community-maintained | 🔧       |
+| TS                                                                                | BentoML Team                                  |                      | 🚧       |

 ## ⚙️ Integrations

--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -2,80 +2,80 @@
 [build-system]
 build-backend = "hatchling.build"
 requires = [
-    "hatchling==1.18.0",
-    "hatch-vcs==0.3.0",
-    "hatch-fancy-pypi-readme==23.1.0",
-    "hatch-mypyc==0.16.0",
+  "hatchling==1.18.0",
+  "hatch-vcs==0.3.0",
+  "hatch-fancy-pypi-readme==23.1.0",
+  "hatch-mypyc==0.16.0",
 ]

 [project]
 authors = [
-    {name = "Aaron Pham",email = "aarnphm@bentoml.com"},
-    {name = "BentoML Team",email = "contact@bentoml.com"},
+  { name = "Aaron Pham", email = "aarnphm@bentoml.com" },
+  { name = "BentoML Team", email = "contact@bentoml.com" },
 ]
 classifiers = [
-    "Development Status :: 5 - Production/Stable",
-    "Environment :: GPU :: NVIDIA CUDA",
-    "Environment :: GPU :: NVIDIA CUDA :: 12",
-    "Environment :: GPU :: NVIDIA CUDA :: 11.8",
-    "Environment :: GPU :: NVIDIA CUDA :: 11.7",
-    "License :: OSI Approved :: Apache Software License",
-    "Topic :: Scientific/Engineering :: Artificial Intelligence",
-    "Topic :: Software Development :: Libraries",
-    "Operating System :: OS Independent",
-    "Intended Audience :: Developers",
-    "Intended Audience :: Science/Research",
-    "Intended Audience :: System Administrators",
-    "Typing :: Typed",
-    "Programming Language :: Python",
-    "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
-    "Programming Language :: Python :: 3.10",
-    "Programming Language :: Python :: 3.11",
-    "Programming Language :: Python :: 3.12",
-    "Programming Language :: Python :: Implementation :: CPython",
-    "Programming Language :: Python :: Implementation :: PyPy",
+  "Development Status :: 5 - Production/Stable",
+  "Environment :: GPU :: NVIDIA CUDA",
+  "Environment :: GPU :: NVIDIA CUDA :: 12",
+  "Environment :: GPU :: NVIDIA CUDA :: 11.8",
+  "Environment :: GPU :: NVIDIA CUDA :: 11.7",
+  "License :: OSI Approved :: Apache Software License",
+  "Topic :: Scientific/Engineering :: Artificial Intelligence",
+  "Topic :: Software Development :: Libraries",
+  "Operating System :: OS Independent",
+  "Intended Audience :: Developers",
+  "Intended Audience :: Science/Research",
+  "Intended Audience :: System Administrators",
+  "Typing :: Typed",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3 :: Only",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: Implementation :: CPython",
+  "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-    "bentoml[grpc,io]>=1.0.25",
-    "transformers[torch,tokenizers,accelerate]>=4.29.0",
-    "safetensors",
-    "optimum",
-    "attrs>=23.1.0",
-    "cattrs>=23.1.0",
-    "orjson",
-    "inflection",
-    "tabulate[widechars]>=0.9.0",
-    "httpx",
-    "click>=8.1.3",
-    "typing_extensions",
-    "mypy_extensions",
-    "ghapi",
-    "cuda-python;platform_system!=\"Darwin\"",
-    "bitsandbytes<0.42",
+  "bentoml[grpc,io]>=1.0.25",
+  "transformers[torch,tokenizers,accelerate]>=4.29.0",
+  "safetensors",
+  "optimum",
+  "attrs>=23.1.0",
+  "cattrs>=23.1.0",
+  "orjson",
+  "inflection",
+  "tabulate[widechars]>=0.9.0",
+  "httpx",
+  "click>=8.1.3",
+  "typing_extensions",
+  "mypy_extensions",
+  "ghapi",
+  "cuda-python;platform_system!=\"Darwin\"",
+  "bitsandbytes<0.42",
 ]
 description = "OpenLLM: Operating LLMs in production"
 dynamic = ["version", "readme"]
 keywords = [
-    "MLOps",
-    "AI",
-    "BentoML",
-    "Model Serving",
-    "Model Deployment",
-    "LLMOps",
-    "Falcon",
-    "Vicuna",
-    "Llama 2",
-    "Fine tuning",
-    "Serverless",
-    "Large Language Model",
-    "Generative AI",
-    "StableLM",
-    "Alpaca",
-    "PyTorch",
-    "Transformers",
+  "MLOps",
+  "AI",
+  "BentoML",
+  "Model Serving",
+  "Model Deployment",
+  "LLMOps",
+  "Falcon",
+  "Vicuna",
+  "Llama 2",
+  "Fine tuning",
+  "Serverless",
+  "Large Language Model",
+  "Generative AI",
+  "StableLM",
+  "Alpaca",
+  "PyTorch",
+  "Transformers",
 ]
 license = "Apache-2.0"
 name = "openllm"
@@ -103,21 +103,21 @@ Twitter = "https://twitter.com/bentomlai"
 [project.optional-dependencies]
 agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
 all = [
-    "openllm[agents]",
-    "openllm[baichuan]",
-    "openllm[chatglm]",
-    "openllm[falcon]",
-    "openllm[fine-tune]",
-    "openllm[flan-t5]",
-    "openllm[ggml]",
-    "openllm[gptq]",
-    "openllm[llama]",
-    "openllm[mpt]",
-    "openllm[openai]",
-    "openllm[opt]",
-    "openllm[playground]",
-    "openllm[starcoder]",
-    "openllm[vllm]",
+  "openllm[agents]",
+  "openllm[baichuan]",
+  "openllm[chatglm]",
+  "openllm[falcon]",
+  "openllm[fine-tune]",
+  "openllm[flan-t5]",
+  "openllm[ggml]",
+  "openllm[gptq]",
+  "openllm[llama]",
+  "openllm[mpt]",
+  "openllm[openai]",
+  "openllm[opt]",
+  "openllm[playground]",
+  "openllm[starcoder]",
+  "openllm[vllm]",
 ]
 baichuan = ["cpm-kernels", "sentencepiece"]
 chatglm = ["cpm-kernels", "sentencepiece"]
@@ -141,12 +141,12 @@ source = "vcs"
 version-file = "src/openllm/_version.py"
 [tool.hatch.version.raw-options]
 git_describe_command = [
-    "git",
-    "describe",
-    "--dirty",
-    "--tags",
-    "--long",
-    "--first-parent",
+  "git",
+  "describe",
+  "--dirty",
+  "--tags",
+  "--long",
+  "--first-parent",
 ]
 local_scheme = "no-local-version"
 root = ".."
@@ -157,55 +157,54 @@ only-include = ["src/openllm"]
 sources = ["src"]
 [tool.hatch.build.targets.sdist]
 exclude = [
-    "/.git_archival.txt",
-    "tests",
-    "/.python-version-default",
-    "ADDING_NEW_MODEL.md",
+  "/.git_archival.txt",
+  "tests",
+  "/.python-version-default",
+  "ADDING_NEW_MODEL.md",
 ]
 [tool.hatch.build.targets.wheel.hooks.mypyc]
 dependencies = [
-    "hatch-mypyc==0.16.0",
-    "mypy==1.4.1",
-    # avoid https://github.com/pallets/click/issues/2558
-    "click==8.1.3",
-    "bentoml==1.1.1",
-    "transformers>=4.31.0",
-    "pandas-stubs",
-    "types-psutil",
-    "types-tabulate",
-    "types-PyYAML",
-    "types-protobuf",
+  "hatch-mypyc==0.16.0",
+  "mypy==1.4.1",
+  # avoid https://github.com/pallets/click/issues/2558
+  "click==8.1.3",
+  "bentoml==1.1.1",
+  "transformers>=4.31.0",
+  "pandas-stubs",
+  "types-psutil",
+  "types-tabulate",
+  "types-PyYAML",
+  "types-protobuf",
 ]
 enable-by-default = false
+exclude = ["src/openllm/_service.py", "src/openllm/_typing_compat.py"]
 include = [
-    "src/openllm/bundle",
-    "src/openllm/models/__init__.py",
-    "src/openllm/models/auto/__init__.py",
-    "src/openllm/utils/__init__.py",
-    "src/openllm/utils/codegen.py",
-    "src/openllm/__init__.py",
-    "src/openllm/_prompt.py",
-    "src/openllm/_schema.py",
-    "src/openllm/_quantisation.py",
-    "src/openllm/_generation.py",
-    "src/openllm/_strategies.py",
-    "src/openllm/exceptions.py",
-    "src/openllm/testing.py",
+  "src/openllm/models/__init__.py",
+  "src/openllm/models/auto/__init__.py",
+  "src/openllm/utils/__init__.py",
+  "src/openllm/__init__.py",
+  "src/openllm/_prompt.py",
+  "src/openllm/_schema.py",
+  "src/openllm/_quantisation.py",
+  "src/openllm/_generation.py",
+  "src/openllm/_strategies.py",
+  "src/openllm/exceptions.py",
+  "src/openllm/testing.py",
 ]
 # NOTE: This is consistent with pyproject.toml
 mypy-args = [
-    "--strict",
-    # this is because all transient library doesn't have types
-    "--allow-subclassing-any",
-    "--follow-imports=skip",
-    "--check-untyped-defs",
-    "--ignore-missing-imports",
-    "--no-warn-return-any",
-    "--warn-unreachable",
-    "--no-warn-no-return",
-    "--no-warn-unused-ignores",
-    "--exclude='/src\\/openllm\\/playground\\/**'",
-    "--exclude='/src\\/openllm\\/_typing_compat\\.py$'",
+  "--strict",
+  # this is because the transitive libraries don't ship types
+  "--follow-imports=skip",
+  "--allow-subclassing-any",
+  "--check-untyped-defs",
+  "--ignore-missing-imports",
+  "--no-warn-return-any",
+  "--warn-unreachable",
+  "--no-warn-no-return",
+  "--no-warn-unused-ignores",
+  "--exclude='/src\\/openllm\\/playground\\/**'",
+  "--exclude='/src\\/openllm\\/_typing_compat\\.py$'",
 ]
 options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true }
 require-runtime-dependencies = true
--- a/openllm-python/src/openllm/__init__.py
+++ b/openllm-python/src/openllm/__init__.py
@@ -28,7 +28,9 @@ else:
  _warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")

 _import_structure: dict[str, list[str]] = {
-    "exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"], "_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"],
+    "exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"],
+    "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"],
+    "_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"], "_embeddings": ["GenericEmbeddingRunnable"], "_strategies": ["CascadingResourceStrategy", "get_resource"],
    "models.auto": ["AutoConfig", "CONFIG_MAPPING", "MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": ["ChatGLMConfig"], "models.baichuan": ["BaichuanConfig"], "models.dolly_v2": ["DollyV2Config"], "models.falcon": ["FalconConfig"], "models.flan_t5": ["FlanT5Config"], "models.gpt_neox": ["GPTNeoXConfig"], "models.llama": ["LlamaConfig"], "models.mpt": ["MPTConfig"], "models.opt": ["OPTConfig"], "models.stablelm": ["StableLMConfig"], "models.starcoder": ["StarCoderConfig"]
 }
 COMPILED = _Path(__file__).suffix in (".pyd", ".so")
@@ -40,6 +42,8 @@ if _t.TYPE_CHECKING:
  from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
  from ._quantisation import infer_quantisation_config as infer_quantisation_config
  from ._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs
+  from ._embeddings import GenericEmbeddingRunnable as GenericEmbeddingRunnable
+  from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
  from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc
  from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING, MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES, AutoConfig as AutoConfig
  from .models.baichuan import BaichuanConfig as BaichuanConfig
@@ -54,7 +58,7 @@ if _t.TYPE_CHECKING:
  from .models.stablelm import StableLMConfig as StableLMConfig
  from .models.starcoder import StarCoderConfig as StarCoderConfig
  from .serialisation import ggml as ggml, transformers as transformers
-  from openllm.utils import infer_auto_class as infer_auto_class
+  from .utils import infer_auto_class as infer_auto_class

 try:
  if not (utils.is_torch_available() and utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError
--- a/openllm-python/src/openllm/_embeddings.py
+++ b/openllm-python/src/openllm/_embeddings.py
@@ -0,0 +1,48 @@
+# See https://github.com/bentoml/sentence-embedding-bento for more information.
+from __future__ import annotations
+import bentoml, openllm, transformers, typing as t
+from huggingface_hub import snapshot_download
+from bentoml._internal.frameworks.transformers import MODULE_NAME, API_VERSION
+from bentoml._internal.models.model import ModelOptions, ModelSignature
+if t.TYPE_CHECKING: import torch
+
+_GENERIC_EMBEDDING_ID="sentence-transformers/all-MiniLM-L6-v2"
+_BENTOMODEL_ID="sentence-transformers--all-MiniLM-L6-v2"
+
+def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:  # return the model stored under `ids`, creating the store entry when it is missing
+  try:
+    return bentoml.transformers.get(ids)
+  except bentoml.exceptions.NotFound:  # not stored yet: create the entry and fill it with a Hub snapshot
+    model_signatures = {k: ModelSignature(batchable=False) for k in ("forward", "generate", "contrastive_search", "greedy_search", "sample", "beam_search", "beam_sample", "group_beam_search", "constrained_beam_search", "__call__")}  # every listed method is registered as non-batchable
+    with bentoml.models.create(ids, module=MODULE_NAME, api_version=API_VERSION, options=ModelOptions(), context=openllm.utils.generate_context(framework_name="transformers"), labels={"runtime": "pt", "framework": "openllm"}, signatures=model_signatures) as bentomodel:
+      snapshot_download(_GENERIC_EMBEDDING_ID, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=["*.safetensors","*.h5","*.ot","*.pdf","*.md",".gitattributes","LICENSE.txt"])  # NOTE(review): always fetches _GENERIC_EMBEDDING_ID, whatever `ids` is - confirm this is intended
+      return bentomodel
+
+class GenericEmbeddingRunnable(bentoml.Runnable):  # Runnable that embeds sentences with the model returned by get_or_download()
+  SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
+  SUPPORTS_CPU_MULTI_THREADING = True
+  def __init__(self) -> None:
+    self.device = "cuda" if openllm.utils.device_count() > 0 else "cpu"  # "cuda" when device_count() reports at least one device, otherwise "cpu"
+    self._bentomodel = get_or_download()  # called with its default id
+    self.tokenizer = transformers.AutoTokenizer.from_pretrained(self._bentomodel.path)
+    self.model = transformers.AutoModel.from_pretrained(self._bentomodel.path)
+    self.model.to(self.device)
+  @bentoml.Runnable.method(batchable=True, batch_dim=0)
+  def encode(self, sentences: list[str]) -> t.Sequence[openllm.LLMEmbeddings]:  # tokenize, run the model, mean-pool, L2-normalize
+    import torch, torch.nn.functional as F
+    encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(self.device)
+    attention_mask = encoded_input["attention_mask"]
+    # Compute token embeddings with gradient tracking disabled
+    with torch.no_grad(): model_output = self.model(**encoded_input)
+    # Mean-pool over tokens, then L2-normalize (p=2) along dim 1
+    sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1)
+    return [openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(), num_tokens=int(torch.sum(attention_mask).item()))]  # one LLMEmbeddings for the whole input; NOTE(review): confirm a one-element return is split correctly when batchable=True merges calls
+  @staticmethod
+  def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    import torch
+    # Mean Pooling - Take attention mask into account for correct averaging
+    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()  # mask expanded to the token-embedding size, as floats
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)  # masked sum over dim 1 divided by the mask sum; the clamp avoids division by zero
+
+__all__ = ["GenericEmbeddingRunnable"]
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -926,7 +926,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
      scheduling_strategy = CascadingResourceStrategy

    generate_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
-    embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
+    embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True, batch_dim=0)))
    generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True)))

    # NOTE: returning the two langchain API's to the runner
@@ -1036,8 +1036,8 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
      logger.info("Successfully apply LoRA layer %s", adapter_name)

    @bentoml.Runnable.method(**method_signature(embeddings_sig))
-    def embeddings(__self: _Runnable, prompt: str | list[str]) -> LLMEmbeddings:
-      return self.embeddings([prompt] if isinstance(prompt, str) else prompt)
+    def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]:
+      return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)]

    @bentoml.Runnable.method(**method_signature(generate_sig))
    def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -1,9 +1,3 @@
-# mypy: disable-error-code="arg-type,misc"
-"""The service definition for running any LLMService.
-
-For line with comment '# openllm: ...', it must not be modified as it is managed internally by OpenLLM.
-Codegen can be found under 'openllm.utils.codegen'
-"""
 from __future__ import annotations
 import os, warnings, orjson, bentoml, openllm, typing as t
 from starlette.applications import Starlette
@@ -12,6 +6,7 @@ from starlette.routing import Route
 if t.TYPE_CHECKING:
  from starlette.requests import Request
  from starlette.responses import Response
+  from bentoml._internal.runner.runner import RunnerMethod
 # The following warnings from bitsandbytes, and probably not that important for users to see
 warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
 warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
@@ -20,7 +15,10 @@ model = os.environ.get("OPENLLM_MODEL", "{__model_name__}")  # openllm: model na
 adapter_map = os.environ.get("OPENLLM_ADAPTER_MAP", """{__model_adapter_map__}""")  # openllm: model adapter map
 llm_config = openllm.AutoConfig.for_model(model)
 runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map))
-svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner])
+generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, name="llm-generic-embedding", scheduling_strategy=openllm.CascadingResourceStrategy, max_batch_size=32, max_latency_ms=300)  # fallback embedding runner, registered below only when the LLM runner lacks embeddings
+runners: list[bentoml.Runner] = [runner]  # annotated as list (not t.Sequence) because it is appended to on the next line
+if not runner.supports_embeddings: runners.append(generic_embedding_runner)
+svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners)

@svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}), output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)}))
 async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
@@ -33,11 +31,11 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
 def metadata_v1(_: str) -> openllm.MetadataOutput:
  return openllm.MetadataOutput(timeout=llm_config["timeout"], model_name=llm_config["model_name"], framework=llm_config["env"]["framework_value"], model_id=runner.llm.model_id, configuration=llm_config.model_dump_json().decode(), supports_embeddings=runner.supports_embeddings, supports_hf_agent=runner.supports_hf_agent)

-if runner.supports_embeddings:
-  @svc.api(route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), output=bentoml.io.JSON.from_sample({"embeddings": [0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076], "num_tokens": 20}))
-  async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
-    responses = await runner.embeddings.async_run(phrases)
-    return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"])
+@svc.api(route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), output=bentoml.io.JSON.from_sample({"embeddings": [0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076], "num_tokens": 20}))
+async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:  # embed `phrases` with the LLM runner when it supports embeddings, otherwise with the generic embedding runner
+  embed_call: RunnerMethod[bentoml.Runnable | openllm.LLMRunnable[t.Any, t.Any], [list[str]], t.Sequence[openllm.LLMEmbeddings]] = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode
+  responses = (await embed_call.async_run(phrases))[0]  # the runner method returns a sequence; only its first element is used
+  return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"])

 if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
  async def hf_agent(request: Request) -> Response:
--- a/openllm-python/src/openllm/_strategies.py
+++ b/openllm-python/src/openllm/_strategies.py
@@ -330,3 +330,5 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
      if idx >= len(gpus): raise ValueError(f"Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}")
      dev = str(gpus[idx])
    return dev
+
+__all__=["CascadingResourceStrategy", "get_resource"]
--- a/openllm-python/src/openllm/_typing_compat.py
+++ b/openllm-python/src/openllm/_typing_compat.py
@@ -82,7 +82,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
  supports_embeddings: bool
  supports_hf_agent: bool
  has_adapters: bool
-  embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], LLMEmbeddings]
+  embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[LLMEmbeddings]]
  generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
  generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
  generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[t.Any, None, None]]
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -78,16 +78,15 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
  return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"])

 def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
-  _bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
-  _bentoml_config_options_opts = ["tracing.sample_rate=1.0", f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'api_server.traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}']
-  _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
+  from openllm.cli._factory import parse_config_options
+  environ = parse_config_options(llm.config, llm.config["timeout"], workers_per_resource, None, True, os.environ.copy())
  env: openllm.utils.EnvVarMixin = llm.config["env"]
  if env["framework_value"] == "vllm": serialisation_format = "legacy"
  env_dict = {
      env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'",
      env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}",
      "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format,
-      "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{_bentoml_config_options}'",
+      "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{environ['BENTOML_CONFIG_OPTIONS']}'",
  }
  if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1")

@@ -125,8 +124,7 @@ def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.A
  )

  bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath("/"))
-  # NOTE: the model_id_path here are only used for setting this environment variable within the container
-  # built with for BentoLLM.
+  # NOTE: the model_id_path here is only used for setting this environment variable within the container built for BentoLLM.
  service_fs_path = fs.path.join("src", llm.config["service_name"])
  service_path = bento._fs.getsyspath(service_fs_path)
  with open(service_path, "r") as f:
--- a/openllm-python/src/openllm/cli/_factory.py
+++ b/openllm-python/src/openllm/cli/_factory.py
@@ -27,6 +27,7 @@ def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_res
  if device:
    if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
    else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
+  _bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
  if cors:
    _bentoml_config_options_opts.extend(["api_server.http.cors.enabled=true", 'api_server.http.cors.access_control_allow_origins="*"'])
    _bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(["GET", "OPTIONS", "POST", "HEAD", "PUT"])])
--- a/openllm-python/src/openllm/cli/_sdk.py
+++ b/openllm-python/src/openllm/cli/_sdk.py
@@ -77,7 +77,7 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30
  return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False)

@inject
-def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
+def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, bento_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
  """Package a LLM into a Bento.

  The LLM will be built into a BentoService with the following structure:
@@ -92,6 +92,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
      model_name: The model name to start this LLM
      model_id: Optional model id for this given LLM
      model_version: Optional model version for this given LLM
+      bento_version: Optional bento version for this given BentoLLM
      quantize: Quantize the model weights. This is only applicable for PyTorch models.
                Possible quantisation strategies:
                - int8: Quantize the model with 8bit (bitsandbytes required)
@@ -126,7 +127,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
  Returns:
      ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
  """
-  args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format,]
+  args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format]
  if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
  if quantize: args.extend(["--quantize", quantize])
  if bettertransformer: args.append("--bettertransformer")
@@ -140,6 +141,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
  if overwrite: args.append("--overwrite")
  if adapter_map: args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()])
  if model_version: args.extend(["--model-version", model_version])
+  if bento_version: args.extend(["--bento-version", bento_version])
  if dockerfile_template: args.extend(["--dockerfile-template", dockerfile_template])
  if container_registry is None: container_registry = "ecr"
  if container_version_strategy is None: container_version_strategy = "release"
--- a/openllm-python/src/openllm/client/runtimes/http.py
+++ b/openllm-python/src/openllm/client/runtimes/http.py
@@ -20,7 +20,6 @@ class HTTPClient(BaseClient[DictStrAny]):

  def health(self) -> t.Any: return self._cached.health()
  def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
-    if not self.supports_embeddings: raise ValueError("This model does not support embeddings.")
    if isinstance(prompt, str): prompt = [prompt]
    result = httpx.post(urljoin(self._address, f"/{self._api_version}/embeddings"), json=list(prompt), timeout=self.timeout).json() if in_async_context() else self.call("embeddings", list(prompt))
    return openllm.EmbeddingsOutput(**result)
@@ -62,7 +61,6 @@ class AsyncHTTPClient(BaseAsyncClient[DictStrAny]):

  async def health(self) -> t.Any: return await self._cached.async_health()
  async def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
-    if not self.supports_embeddings: raise ValueError("This model does not support embeddings.")
    if isinstance(prompt, str): prompt = [prompt]
    res = await self.acall("embeddings", list(prompt))
    return openllm.EmbeddingsOutput(**res)
--- a/openllm-python/src/openllm/serialisation/transformers/init.py
+++ b/openllm-python/src/openllm/serialisation/transformers/init.py
@@ -106,8 +106,7 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
        else:
          # we will clone the all tings into the bentomodel path without loading model into memory
          snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
-    except Exception:
-      raise
+    except Exception: raise
    else:
      bentomodel.flush()  # type: ignore[no-untyped-call]
      bentomodel.save(_model_store)
@@ -117,7 +116,6 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
      # NOTE: We need to free up the cache after importing the model
      # in the case where users first run openllm start without the model available locally.
      if openllm.utils.is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
-
    return bentomodel

 def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
--- a/openllm-python/src/openllm/utils/codegen.py
+++ b/openllm-python/src/openllm/utils/codegen.py
@@ -32,7 +32,7 @@ class ModelIdFormatter(ModelNameFormatter):
 class ModelAdapterMapFormatter(ModelNameFormatter):
  model_keyword: LiteralString = "__model_adapter_map__"

-_service_file = Path(os.path.abspath("__file__")).parent.parent/"_service.py"
+_service_file = Path(os.path.abspath(__file__)).parent.parent/"_service.py"
 def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
  from openllm.utils import DEBUG
  model_name = llm.config["model_name"]