feat(embedding): Adding generic endpoint (#227)

2026-06-11 18:09:52 -04:00 · 2023-08-17 15:17:00 -04:00
parent d5c4066ff4
commit 4140d160b8
23 changed files with 621 additions and 393 deletions
--- a/openllm-python/src/openllm/init.py
+++ b/openllm-python/src/openllm/init.py
@@ -28,7 +28,9 @@ else:
  _warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")

 _import_structure: dict[str, list[str]] = {
-    "exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"], "_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"],
+    "exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"],
+    "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"],
+    "_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"], "_embeddings": ["GenericEmbeddingRunnable"], "_strategies": ["CascadingResourceStrategy", "get_resource"],
    "models.auto": ["AutoConfig", "CONFIG_MAPPING", "MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": ["ChatGLMConfig"], "models.baichuan": ["BaichuanConfig"], "models.dolly_v2": ["DollyV2Config"], "models.falcon": ["FalconConfig"], "models.flan_t5": ["FlanT5Config"], "models.gpt_neox": ["GPTNeoXConfig"], "models.llama": ["LlamaConfig"], "models.mpt": ["MPTConfig"], "models.opt": ["OPTConfig"], "models.stablelm": ["StableLMConfig"], "models.starcoder": ["StarCoderConfig"]
 }
 COMPILED = _Path(__file__).suffix in (".pyd", ".so")
@@ -40,6 +42,8 @@ if _t.TYPE_CHECKING:
  from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
  from ._quantisation import infer_quantisation_config as infer_quantisation_config
  from ._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs
+  from ._embeddings import GenericEmbeddingRunnable as GenericEmbeddingRunnable
+  from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
  from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc
  from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING, MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES, AutoConfig as AutoConfig
  from .models.baichuan import BaichuanConfig as BaichuanConfig
@@ -54,7 +58,7 @@ if _t.TYPE_CHECKING:
  from .models.stablelm import StableLMConfig as StableLMConfig
  from .models.starcoder import StarCoderConfig as StarCoderConfig
  from .serialisation import ggml as ggml, transformers as transformers
-  from openllm.utils import infer_auto_class as infer_auto_class
+  from .utils import infer_auto_class as infer_auto_class

 try:
  if not (utils.is_torch_available() and utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError
--- a/openllm-python/src/openllm/_embeddings.py
+++ b/openllm-python/src/openllm/_embeddings.py
@@ -0,0 +1,48 @@
+# See https://github.com/bentoml/sentence-embedding-bento for more information.
+from __future__ import annotations
+import bentoml, openllm, transformers, typing as t
+from huggingface_hub import snapshot_download
+from bentoml._internal.frameworks.transformers import MODULE_NAME, API_VERSION
+from bentoml._internal.models.model import ModelOptions, ModelSignature
+if t.TYPE_CHECKING: import torch
+
+_GENERIC_EMBEDDING_ID="sentence-transformers/all-MiniLM-L6-v2"
+_BENTOMODEL_ID="sentence-transformers--all-MiniLM-L6-v2"
+
+def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
+  try:
+    return bentoml.transformers.get(ids)
+  except bentoml.exceptions.NotFound:
+    model_signatures = {k: ModelSignature(batchable=False) for k in ("forward", "generate", "contrastive_search", "greedy_search", "sample", "beam_search", "beam_sample", "group_beam_search", "constrained_beam_search", "__call__")}
+    with bentoml.models.create(ids, module=MODULE_NAME, api_version=API_VERSION, options=ModelOptions(), context=openllm.utils.generate_context(framework_name="transformers"), labels={"runtime": "pt", "framework": "openllm"}, signatures=model_signatures) as bentomodel:
+      snapshot_download(_GENERIC_EMBEDDING_ID, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=["*.safetensors","*.h5","*.ot","*.pdf","*.md",".gitattributes","LICENSE.txt"])
+      return bentomodel
+
+class GenericEmbeddingRunnable(bentoml.Runnable):
+  SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
+  SUPPORTS_CPU_MULTI_THREADING = True
+  def __init__(self) -> None:
+    self.device = "cuda" if openllm.utils.device_count() > 0 else "cpu"
+    self._bentomodel = get_or_download()
+    self.tokenizer = transformers.AutoTokenizer.from_pretrained(self._bentomodel.path)
+    self.model = transformers.AutoModel.from_pretrained(self._bentomodel.path)
+    self.model.to(self.device)
+  @bentoml.Runnable.method(batchable=True, batch_dim=0)
+  def encode(self, sentences: list[str]) -> t.Sequence[openllm.LLMEmbeddings]:
+    import torch, torch.nn.functional as F
+    encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(self.device)
+    attention_mask = encoded_input["attention_mask"]
+    # Compute token embeddings
+    with torch.no_grad(): model_output = self.model(**encoded_input)
+    # Perform pooling and normalize
+    sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1)
+    return [openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(), num_tokens=int(torch.sum(attention_mask).item()))]
+  @staticmethod
+  def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    import torch
+    # Mean Pooling - Take attention mask into account for correct averaging
+    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+__all__ = ["GenericEmbeddingRunnable"]
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -926,7 +926,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
      scheduling_strategy = CascadingResourceStrategy

    generate_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
-    embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
+    embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True, batch_dim=0)))
    generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True)))

    # NOTE: returning the two langchain API's to the runner
@@ -1036,8 +1036,8 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
      logger.info("Successfully apply LoRA layer %s", adapter_name)

    @bentoml.Runnable.method(**method_signature(embeddings_sig))
-    def embeddings(__self: _Runnable, prompt: str | list[str]) -> LLMEmbeddings:
-      return self.embeddings([prompt] if isinstance(prompt, str) else prompt)
+    def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]:
+      return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)]

    @bentoml.Runnable.method(**method_signature(generate_sig))
    def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -1,9 +1,3 @@
-# mypy: disable-error-code="arg-type,misc"
-"""The service definition for running any LLMService.
-
-For line with comment '# openllm: ...', it must not be modified as it is managed internally by OpenLLM.
-Codegen can be found under 'openllm.utils.codegen'
-"""
 from __future__ import annotations
 import os, warnings, orjson, bentoml, openllm, typing as t
 from starlette.applications import Starlette
@@ -12,6 +6,7 @@ from starlette.routing import Route
 if t.TYPE_CHECKING:
  from starlette.requests import Request
  from starlette.responses import Response
+  from bentoml._internal.runner.runner import RunnerMethod
 # The following warnings from bitsandbytes, and probably not that important for users to see
 warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
 warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
@@ -20,7 +15,10 @@ model = os.environ.get("OPENLLM_MODEL", "{__model_name__}")  # openllm: model na
 adapter_map = os.environ.get("OPENLLM_ADAPTER_MAP", """{__model_adapter_map__}""")  # openllm: model adapter map
 llm_config = openllm.AutoConfig.for_model(model)
 runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map))
-svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner])
+generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, name="llm-generic-embedding", scheduling_strategy=openllm.CascadingResourceStrategy, max_batch_size=32, max_latency_ms=300)
+runners: t.Sequence[bentoml.Runner] = [runner]
+if not runner.supports_embeddings: runners.append(generic_embedding_runner)
+svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners)

@svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}), output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)}))
 async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
@@ -33,11 +31,11 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
 def metadata_v1(_: str) -> openllm.MetadataOutput:
  return openllm.MetadataOutput(timeout=llm_config["timeout"], model_name=llm_config["model_name"], framework=llm_config["env"]["framework_value"], model_id=runner.llm.model_id, configuration=llm_config.model_dump_json().decode(), supports_embeddings=runner.supports_embeddings, supports_hf_agent=runner.supports_hf_agent)

-if runner.supports_embeddings:
-  @svc.api(route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), output=bentoml.io.JSON.from_sample({"embeddings": [0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076], "num_tokens": 20}))
-  async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
-    responses = await runner.embeddings.async_run(phrases)
-    return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"])
+@svc.api(route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), output=bentoml.io.JSON.from_sample({"embeddings": [0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076], "num_tokens": 20}))
+async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
+  embed_call: RunnerMethod[bentoml.Runnable | openllm.LLMRunnable[t.Any, t.Any], [list[str]], t.Sequence[openllm.LLMEmbeddings]] = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode
+  responses = (await embed_call.async_run(phrases))[0]
+  return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"])

 if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
  async def hf_agent(request: Request) -> Response:
--- a/openllm-python/src/openllm/_strategies.py
+++ b/openllm-python/src/openllm/_strategies.py
@@ -330,3 +330,5 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
      if idx >= len(gpus): raise ValueError(f"Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}")
      dev = str(gpus[idx])
    return dev
+
+__all__=["CascadingResourceStrategy", "get_resource"]
--- a/openllm-python/src/openllm/_typing_compat.py
+++ b/openllm-python/src/openllm/_typing_compat.py
@@ -82,7 +82,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
  supports_embeddings: bool
  supports_hf_agent: bool
  has_adapters: bool
-  embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], LLMEmbeddings]
+  embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[LLMEmbeddings]]
  generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
  generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
  generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[t.Any, None, None]]
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -78,16 +78,15 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
  return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"])

 def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
-  _bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
-  _bentoml_config_options_opts = ["tracing.sample_rate=1.0", f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'api_server.traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}']
-  _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
+  from openllm.cli._factory import parse_config_options
+  environ = parse_config_options(llm.config, llm.config["timeout"], workers_per_resource, None, True, os.environ.copy())
  env: openllm.utils.EnvVarMixin = llm.config["env"]
  if env["framework_value"] == "vllm": serialisation_format = "legacy"
  env_dict = {
      env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'",
      env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}",
      "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format,
-      "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{_bentoml_config_options}'",
+      "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{environ['BENTOML_CONFIG_OPTIONS']}'",
  }
  if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1")

@@ -125,8 +124,7 @@ def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.A
  )

  bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath("/"))
-  # NOTE: the model_id_path here are only used for setting this environment variable within the container
-  # built with for BentoLLM.
+  # NOTE: the model_id_path here are only used for setting this environment variable within the container built with for BentoLLM.
  service_fs_path = fs.path.join("src", llm.config["service_name"])
  service_path = bento._fs.getsyspath(service_fs_path)
  with open(service_path, "r") as f:
--- a/openllm-python/src/openllm/cli/_factory.py
+++ b/openllm-python/src/openllm/cli/_factory.py
@@ -27,6 +27,7 @@ def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_res
  if device:
    if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
    else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
+  _bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
  if cors:
    _bentoml_config_options_opts.extend(["api_server.http.cors.enabled=true", 'api_server.http.cors.access_control_allow_origins="*"'])
    _bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(["GET", "OPTIONS", "POST", "HEAD", "PUT"])])
--- a/openllm-python/src/openllm/cli/_sdk.py
+++ b/openllm-python/src/openllm/cli/_sdk.py
@@ -77,7 +77,7 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30
  return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False)

@inject
-def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
+def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, bento_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
  """Package a LLM into a Bento.

  The LLM will be built into a BentoService with the following structure:
@@ -92,6 +92,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
      model_name: The model name to start this LLM
      model_id: Optional model id for this given LLM
      model_version: Optional model version for this given LLM
+      bento_version: Optional bento veresion for this given BentoLLM
      quantize: Quantize the model weights. This is only applicable for PyTorch models.
                Possible quantisation strategies:
                - int8: Quantize the model with 8bit (bitsandbytes required)
@@ -126,7 +127,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
  Returns:
      ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
  """
-  args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format,]
+  args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format]
  if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
  if quantize: args.extend(["--quantize", quantize])
  if bettertransformer: args.append("--bettertransformer")
@@ -140,6 +141,7 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st
  if overwrite: args.append("--overwrite")
  if adapter_map: args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()])
  if model_version: args.extend(["--model-version", model_version])
+  if bento_version: args.extend(["--bento-version", bento_version])
  if dockerfile_template: args.extend(["--dockerfile-template", dockerfile_template])
  if container_registry is None: container_registry = "ecr"
  if container_version_strategy is None: container_version_strategy = "release"
--- a/openllm-python/src/openllm/client/runtimes/http.py
+++ b/openllm-python/src/openllm/client/runtimes/http.py
@@ -20,7 +20,6 @@ class HTTPClient(BaseClient[DictStrAny]):

  def health(self) -> t.Any: return self._cached.health()
  def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
-    if not self.supports_embeddings: raise ValueError("This model does not support embeddings.")
    if isinstance(prompt, str): prompt = [prompt]
    result = httpx.post(urljoin(self._address, f"/{self._api_version}/embeddings"), json=list(prompt), timeout=self.timeout).json() if in_async_context() else self.call("embeddings", list(prompt))
    return openllm.EmbeddingsOutput(**result)
@@ -62,7 +61,6 @@ class AsyncHTTPClient(BaseAsyncClient[DictStrAny]):

  async def health(self) -> t.Any: return await self._cached.async_health()
  async def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
-    if not self.supports_embeddings: raise ValueError("This model does not support embeddings.")
    if isinstance(prompt, str): prompt = [prompt]
    res = await self.acall("embeddings", list(prompt))
    return openllm.EmbeddingsOutput(**res)
--- a/openllm-python/src/openllm/serialisation/transformers/init.py
+++ b/openllm-python/src/openllm/serialisation/transformers/init.py
@@ -106,8 +106,7 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
        else:
          # we will clone the all tings into the bentomodel path without loading model into memory
          snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
-    except Exception:
-      raise
+    except Exception: raise
    else:
      bentomodel.flush()  # type: ignore[no-untyped-call]
      bentomodel.save(_model_store)
@@ -117,7 +116,6 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
      # NOTE: We need to free up the cache after importing the model
      # in the case where users first run openllm start without the model available locally.
      if openllm.utils.is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
-
    return bentomodel

 def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
--- a/openllm-python/src/openllm/utils/codegen.py
+++ b/openllm-python/src/openllm/utils/codegen.py
@@ -32,7 +32,7 @@ class ModelIdFormatter(ModelNameFormatter):
 class ModelAdapterMapFormatter(ModelNameFormatter):
  model_keyword: LiteralString = "__model_adapter_map__"

-_service_file = Path(os.path.abspath("__file__")).parent.parent/"_service.py"
+_service_file = Path(os.path.abspath(__file__)).parent.parent/"_service.py"
 def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
  from openllm.utils import DEBUG
  model_name = llm.config["model_name"]