refactor(breaking): unify LLM API (#283)

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2026-05-19 14:16:22 -04:00 · 2023-09-01 05:15:19 -04:00
parent 35601dab20
commit 3e45530abd
50 changed files with 881 additions and 1232 deletions
--- a/openllm-python/src/openllm/init.py
+++ b/openllm-python/src/openllm/init.py
@@ -47,7 +47,7 @@ _import_structure: dict[str, list[str]] = {
    "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"],
    "_quantisation": ["infer_quantisation_config"],
    "_embeddings": ["GenericEmbeddingRunnable"],
-    "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"],
+    "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "EmbeddingsOutput"],
    "_generation": [
        "StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList",
        "prepare_logits_processor"
@@ -72,7 +72,7 @@ COMPILED = _Path(__file__).suffix in (".pyd", ".so")
 if _t.TYPE_CHECKING:
  from . import bundle as bundle, cli as cli, client as client, models as models, playground as playground, serialisation as serialisation, testing as testing
  from ._generation import LogitsProcessorList as LogitsProcessorList, StopOnTokens as StopOnTokens, StoppingCriteriaList as StoppingCriteriaList, StopSequenceCriteria as StopSequenceCriteria, prepare_logits_processor as prepare_logits_processor
-  from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
+  from ._llm import LLM as LLM, EmbeddingsOutput as EmbeddingsOutput, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
  from ._quantisation import infer_quantisation_config as infer_quantisation_config
  from ._embeddings import GenericEmbeddingRunnable as GenericEmbeddingRunnable
  from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc
@@ -196,7 +196,12 @@ else:
 __lazy = openllm_core.utils.LazyModule(__name__,
                                       globals()["__file__"],
                                       _import_structure,
-                                       extra_objects={"COMPILED": COMPILED})
+                                       extra_objects={
+                                           "COMPILED": COMPILED,
+                                           "__openllm_migration__": {
+                                               "LLMEmbeddings": "EmbeddingsOutput"
+                                           }
+                                       })
 __all__ = __lazy.__all__
 __dir__ = __lazy.__dir__
 __getattr__ = __lazy.__getattr__
--- a/openllm-python/src/openllm/_assign.py
+++ b/openllm-python/src/openllm/_assign.py
@@ -0,0 +1,201 @@
+'''LLM assignment magik.'''
+from __future__ import annotations
+import functools
+import traceback
+import typing as t
+
+import openllm
+
+from openllm.exceptions import OpenLLMException
+from openllm_core._configuration import _object_getattribute
+from openllm_core._configuration import _setattr_class
+from openllm_core._schema import unmarshal_vllm_outputs
+from openllm_core._typing_compat import DictStrAny
+from openllm_core._typing_compat import ListStr
+from openllm_core._typing_compat import M
+from openllm_core._typing_compat import T
+from openllm_core._typing_compat import import_model_protocol
+from openllm_core._typing_compat import llm_post_init_protocol
+from openllm_core._typing_compat import load_model_protocol
+from openllm_core._typing_compat import load_tokenizer_protocol
+from openllm_core.utils import LazyLoader
+from openllm_core.utils import codegen
+from openllm_core.utils import device_count
+from openllm_core.utils import first_not_none
+from openllm_core.utils import is_torch_available
+
+if t.TYPE_CHECKING:
+  import torch
+  import vllm
+
+  import bentoml
+
+  from openllm._llm import LLM
+else:
+  torch = LazyLoader('torch', globals(), 'torch')
+  vllm = LazyLoader('vllm', globals(), 'vllm')
+
+def import_model(fn: import_model_protocol[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]:
+  '''Wrap ``fn`` so arguments held in ``self.llm_parameters`` and a trust_remote_code default are applied.'''
+  @functools.wraps(fn)
+  def inner(self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any) -> bentoml.Model:
+    resolved_trc = first_not_none(trust_remote_code, default=self.trust_remote_code)
+    (stored_decls, stored_attrs), _ = self.llm_parameters
+    # Stored positionals go first; caller keywords win over stored keywords on a key clash.
+    merged_attrs = {**stored_attrs, **attrs}
+    return fn(self, *stored_decls, *decls, trust_remote_code=resolved_trc, **merged_attrs)
+
+  return inner
+
+def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]:
+  '''Wrap ``fn``; for the 'vllm' backend build a ``vllm.LLMEngine`` instead of calling ``fn``.'''
+  @functools.wraps(fn)
+  def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine:
+    if self.__llm_backend__ != 'vllm':
+      (stored_decls, stored_attrs), _ = self.llm_parameters
+      return fn(self, *stored_decls, *decls, **{**stored_attrs, **attrs})
+    # TODO: Do some more processing with token_id once we support token streaming
+    try:
+      engine_args = vllm.EngineArgs(
+          model=self._bentomodel.path,
+          tokenizer=self._bentomodel.path if self.tokenizer_id == 'local' else self.tokenizer_id,
+          tokenizer_mode='auto',
+          tensor_parallel_size=1 if device_count() < 2 else device_count(),
+          dtype='auto',
+          worker_use_ray=False)
+      return vllm.LLMEngine.from_engine_args(engine_args)
+    except Exception as err:
+      traceback.print_exc()
+      raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from None
+
+  return inner
+
+def load_tokenizer(fn: load_tokenizer_protocol[M, T]) -> t.Callable[[LLM[M, T]], T]:
+  '''Wrap ``fn`` so keywords from ``self.llm_parameters[-1]`` are passed, overridable by the caller.'''
+  @functools.wraps(fn)
+  def inner(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T:
+    merged_attrs = dict(self.llm_parameters[-1], **tokenizer_attrs)
+    return fn(self, **merged_attrs)
+  return inner
+
+def llm_post_init(fn: llm_post_init_protocol[M, T]) -> t.Callable[[LLM[M, T]], None]:
+  '''Wrap ``fn`` so a 'pt' backend with torch available gets ``self.device`` set before ``fn`` runs.'''
+  @functools.wraps(fn)
+  def inner(self: LLM[M, T]) -> None:
+    if self.__llm_backend__ == 'pt' and is_torch_available():
+      device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
+      self.device = torch.device(device_name)
+    fn(self)
+  return inner
+
+def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
+  '''Return the ``__assign_llm_attr`` function that codegen builds from the assignment lines assembled here.'''
+  from ._llm import LLM
+  from ._llm import LLMFunction
+  from ._llm import LLMInterface
+  from ._llm import LLMSerialisation
+  # ``lines`` (source statements) and ``globs`` (their globals) are handed to codegen.generate_function below.
+  args: ListStr = []
+  globs: DictStrAny = {'cls': cls, '__wrapped_llm_post_init': llm_post_init, 'LLM': LLM}
+  # _cached_LLMFunction_get and _cached_LLMSerialisation_get
+  globs.update(
+      {f'_cached_{cl_.__name__}_get': _object_getattribute.__get__(cl_) for cl_ in {LLMSerialisation, LLMFunction}})
+  # llm_post_init implementation
+  lines: ListStr = [
+      f'_impl_{cls.__name__}_func=cls.llm_post_init',
+      _setattr_class('llm_post_init', f'__wrapped_llm_post_init(_impl_{cls.__name__}_func)')
+  ]
+  # Per hook: keep cls.<hook> unless it is what _cached_LLMSerialisation_get returns, else openllm.serialisation's.
+  serialisation_attr = {'import_model': import_model, 'load_model': load_model, 'load_tokenizer': load_tokenizer,}
+  for func, impl in serialisation_attr.items():
+    impl_name = f'__wrapped_{func}'
+    globs.update({f'__serialisation_{func}': getattr(openllm.serialisation, func, None), impl_name: impl})
+    cached_func_name = f'_cached_{cls.__name__}_func'
+    func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMSerialisation_get('{func}') else __serialisation_{func}"
+    lines.extend([
+        f'{cached_func_name}=cls.{func}', func_call,
+        _setattr_class(func, f'{impl_name}(_impl_{cls.__name__}_{func})')
+    ])
+
+  # assign vLLM implementation
+  if cls.__llm_backend__ == 'vllm':
+    vllm_func = {
+        f'_vllm_{it}': fn
+        for it, fn in zip(('generate', 'generate_iterator',
+                           'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate))
+    }
+    globs.update(vllm_func)
+    lines.extend([_setattr_class(it[6:], it) for it in vllm_func])  # it[6:] drops the '_vllm_' prefix
+
+  interface_anns = codegen.get_annotations(LLMInterface)
+
+  # cached attribute initialisation
+  def dunder_cached(key: str) -> str:
+    return f'__llm_{key}__'
+
+  st_attr = {'model', 'tokenizer', 'adapter_map'}
+  lines.extend([_setattr_class(dunder_cached(v), None) for v in st_attr])
+
+  # boolean for better LLM implementation resolver
+  def dunder_support(key: str) -> str:
+    return f'__llm_supports_{key}__'
+  # it[15:-2] strips the '__llm_supports_' prefix (15 chars) and the trailing '__' from each annotation name.
+  bool_attr = {it[15:-2] for it in interface_anns if it.startswith('__llm_supports_')}
+  lines.extend(
+      [_setattr_class(dunder_support(fn), f"cls.{fn} is not _cached_LLMFunction_get('{fn}')") for fn in bool_attr])
+  # NOTE(review): ``args`` is never appended to above, so the generated function receives only ``cls``.
+  return codegen.generate_function(cls,
+                                   '__assign_llm_attr',
+                                   lines,
+                                   args=('cls', *args),
+                                   globs=globs,
+                                   annotations={
+                                       'cls': 't.Type[LLM]',
+                                       'return': None
+                                   })
+
+def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]],
+                              **_: t.Any) -> str:
+  return generation_result[0]['outputs'][0]['text']  # text of the first output of the first result; rest ignored
+
+def vllm_generate_iterator(self: LLM['vllm.LLMEngine', T],
+                           prompt: str,
+                           /,
+                           *,
+                           echo: bool = False,
+                           stop: str | t.Iterable[str] | None = None,
+                           stop_token_ids: list[int] | None = None,
+                           **attrs: t.Any) -> t.Iterator[dict[str, t.Any]]:
+  '''Yield a {'text': [...], 'error_code': 0} dict per engine step output; ``attrs`` must carry ``request_id``.'''
+  request_id: str | None = attrs.pop('request_id', None)
+  if request_id is None: raise ValueError('request_id must not be None.')
+  # Build a fresh list so the caller's ``stop_token_ids`` is never mutated by appending the EOS id.
+  stop_token_ids = [*(stop_token_ids or ()), self.tokenizer.eos_token_id]
+  stop_: set[str] = set()
+  if isinstance(stop, str):
+    if stop != '': stop_.add(stop)
+  elif stop is not None: stop_.update(stop)  # any iterable of strings, not only ``list``
+  # ``is not None`` rather than truthiness: token id 0 is honoured while a missing EOS id is skipped.
+  stop_.update(self.tokenizer.decode(tid) for tid in stop_token_ids if tid is not None)
+  top_p = 1.0 if self.config['temperature'] <= 1e-5 else self.config['top_p']
+  config = self.config.model_construct_env(stop=list(stop_), top_p=top_p, **attrs)
+  self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=config.to_sampling_config())
+  while self.model.has_unfinished_requests():
+    for request_output in self.model.step():
+      prompt = request_output.prompt
+      if echo: text_outputs = [prompt + output.text for output in request_output.outputs]
+      else: text_outputs = [output.text for output in request_output.outputs]
+      yield {'text': text_outputs, 'error_code': 0}
+      if request_output.finished: break
+
+def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
+  '''Submit one request, step the engine until nothing is unfinished, return the unmarshalled finished outputs.'''
+  request_id: str | None = attrs.pop('request_id', None)
+  if request_id is None: raise ValueError('request_id must not be None.')
+  finished: list[vllm.RequestOutput] = []
+  # TODO: support prompt_token_ids
+  self.model.add_request(request_id=request_id, prompt=prompt,
+                         sampling_params=self.config.model_construct_env(**attrs).to_sampling_config())
+  while self.model.has_unfinished_requests():
+    finished += [step_out for step_out in self.model.step() if step_out.finished]
+  return [unmarshal_vllm_outputs(step_out) for step_out in finished]
--- a/openllm-python/src/openllm/_embeddings.py
+++ b/openllm-python/src/openllm/_embeddings.py
@@ -58,7 +58,7 @@ class GenericEmbeddingRunnable(bentoml.Runnable):
    self.model.to(self.device)

  @bentoml.Runnable.method(batchable=True, batch_dim=0)
-  def encode(self, sentences: list[str]) -> t.Sequence[openllm.LLMEmbeddings]:
+  def encode(self, sentences: list[str]) -> t.Sequence[openllm.EmbeddingsOutput]:
    import torch
    import torch.nn.functional as F
    encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(self.device)
@@ -69,8 +69,8 @@ class GenericEmbeddingRunnable(bentoml.Runnable):
    # Perform pooling and normalize
    sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1)
    return [
-        openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(),
-                              num_tokens=int(torch.sum(attention_mask).item()))
+        openllm.EmbeddingsOutput(embeddings=sentence_embeddings.cpu().numpy(),
+                                 num_tokens=int(torch.sum(attention_mask).item()))
    ]

  @staticmethod
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -78,7 +78,7 @@ async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[s
             'model_id': runner.llm.model_id,
             'timeout': 3600,
             'model_name': llm_config['model_name'],
-             'framework': runner.llm_framework,
+             'backend': runner.backend,
             'configuration': '',
             'supports_embeddings': runner.supports_embeddings,
             'supports_hf_agent': runner.supports_hf_agent
@@ -86,7 +86,7 @@ async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[s
 def metadata_v1(_: str) -> openllm.MetadataOutput:
  return openllm.MetadataOutput(timeout=llm_config['timeout'],
                                model_name=llm_config['model_name'],
-                                framework=llm_config['env']['framework_value'],
+                                backend=llm_config['env']['backend_value'],
                                model_id=runner.llm.model_id,
                                configuration=llm_config.model_dump_json().decode(),
                                supports_embeddings=runner.supports_embeddings,
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -86,17 +86,17 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
    packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}")

  env = llm.config['env']
-  framework_envvar = env['framework_value']
-  if framework_envvar == 'flax':
+  backend_envvar = env['backend_value']
+  if backend_envvar == 'flax':
    if not openllm_core.utils.is_flax_available():
-      raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'")
+      raise ValueError(f"Flax is not available, while {env.backend} is set to 'flax'")
    packages.extend(
        [importlib.metadata.version('flax'),
         importlib.metadata.version('jax'),
         importlib.metadata.version('jaxlib')])
-  elif framework_envvar == 'tf':
+  elif backend_envvar == 'tf':
    if not openllm_core.utils.is_tf_available():
-      raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'")
+      raise ValueError(f"TensorFlow is not available, while {env.backend} is set to 'tf'")
    candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu',
                  'intel-tensorflow', 'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos',
                 )
@@ -125,21 +125,22 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
  return PythonOptions(packages=packages,
                       wheels=wheels,
                       lock_packages=False,
-                       extra_index_url=['https://download.pytorch.org/whl/cu118'])
+                       extra_index_url=[
+                           'https://download.pytorch.org/whl/cu118',
+                           'https://huggingface.github.io/autogptq-index/whl/cu118/'
+                       ])

 def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float,
-                             quantize: LiteralString | None, bettertransformer: bool | None,
-                             adapter_map: dict[str, str | None] | None, dockerfile_template: str | None,
-                             runtime: t.Literal['ggml', 'transformers'], serialisation_format: t.Literal['safetensors',
-                                                                                                         'legacy'],
+                             quantize: LiteralString | None, adapter_map: dict[str, str | None] | None,
+                             dockerfile_template: str | None, serialisation_format: t.Literal['safetensors', 'legacy'],
                             container_registry: LiteralContainerRegistry,
                             container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
  from openllm.cli._factory import parse_config_options
  environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
  env: openllm_core.utils.EnvVarMixin = llm.config['env']
-  if env['framework_value'] == 'vllm': serialisation_format = 'legacy'
+  if env['backend_value'] == 'vllm': serialisation_format = 'legacy'
  env_dict = {
-      env.framework: env['framework_value'],
+      env.backend: env['backend_value'],
      env.config: f"'{llm.config.model_dump_json().decode()}'",
      env.model_id: f'/home/bentoml/bento/models/{llm.tag.path()}',
      'OPENLLM_MODEL': llm.config['model_name'],
@@ -152,14 +153,9 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
  if adapter_map: env_dict['BITSANDBYTES_NOWELCOME'] = os.environ.get('BITSANDBYTES_NOWELCOME', '1')

  # We need to handle None separately here, as env from subprocess doesn't accept None value.
-  _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'],
-                                        bettertransformer=bettertransformer,
-                                        quantize=quantize,
-                                        runtime=runtime)
+  _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], quantize=quantize)

-  env_dict[_env.bettertransformer] = str(_env['bettertransformer_value'])
  if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value'])
-  env_dict[_env.runtime] = _env['runtime_value']
  return DockerOptions(
      base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}',
      env=env_dict,
@@ -218,21 +214,19 @@ def create_bento(bento_tag: bentoml.Tag,
                 llm: openllm.LLM[t.Any, t.Any],
                 workers_per_resource: str | float,
                 quantize: LiteralString | None,
-                 bettertransformer: bool | None,
                 dockerfile_template: str | None,
                 adapter_map: dict[str, str | None] | None = None,
                 extra_dependencies: tuple[str, ...] | None = None,
-                 runtime: t.Literal['ggml', 'transformers'] = 'transformers',
                 serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
                 container_registry: LiteralContainerRegistry = 'ecr',
                 container_version_strategy: LiteralContainerVersionStrategy = 'release',
                 _bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
                 _model_store: ModelStore = Provide[BentoMLContainer.model_store]) -> bentoml.Bento:
-  framework_envvar = llm.config['env']['framework_value']
+  backend_envvar = llm.config['env']['backend_value']
  labels = dict(llm.identifying_params)
  labels.update({
      '_type': llm.llm_type,
-      '_framework': framework_envvar,
+      '_framework': backend_envvar,
      'start_name': llm.config['start_name'],
      'base_name_or_path': llm.model_id,
      'bundler': 'openllm.bundle'
@@ -265,8 +259,8 @@ def create_bento(bento_tag: bentoml.Tag,
                                  python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
                                  models=[llm_spec],
                                  docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize,
-                                                                  bettertransformer, adapter_map, dockerfile_template,
-                                                                  runtime, serialisation_format, container_registry,
+                                                                  adapter_map, dockerfile_template,
+                                                                  serialisation_format, container_registry,
                                                                  container_version_strategy))

  bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))
--- a/openllm-python/src/openllm/bundle/oci/init.py
+++ b/openllm-python/src/openllm/bundle/oci/init.py
@@ -94,7 +94,7 @@ class RefResolver:
  git_hash: str = attr.field()
  version: openllm_core.utils.VersionInfo = attr.field(converter=_convert_version_from_string)
  strategy: LiteralContainerVersionStrategy = attr.field()
-  _ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO)
+  _ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO, authenticate=False)

  @classmethod
  def _nightly_ref(cls) -> RefTuple:
--- a/openllm-python/src/openllm/cli/_factory.py
+++ b/openllm-python/src/openllm/cli/_factory.py
@@ -16,12 +16,15 @@ from click.shell_completion import CompletionItem

 import bentoml
 import openllm
+import openllm_core

 from bentoml._internal.configuration.containers import BentoMLContainer
 from openllm_core._typing_compat import Concatenate
 from openllm_core._typing_compat import DictStrAny
+from openllm_core._typing_compat import LiteralBackend
 from openllm_core._typing_compat import LiteralString
 from openllm_core._typing_compat import ParamSpec
+from openllm_core._typing_compat import get_literal_args
 from openllm_core.utils import DEBUG

 from . import termui
@@ -147,14 +150,12 @@ Available official model_id(s): [default: {llm_config['default_id']}]
  @click.pass_context
  def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None,
                workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...],
-                quantize: t.Literal['int8', 'int4', 'gptq'] | None, bettertransformer: bool | None,
-                runtime: t.Literal['ggml', 'transformers'], fast: bool, serialisation_format: t.Literal['safetensors',
-                                                                                                        'legacy'],
-                cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
+                quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend,
+                serialisation_format: t.Literal['safetensors', 'legacy'], cors: bool, adapter_id: str | None,
+                return_process: bool, **attrs: t.Any,
               ) -> LLMConfig | subprocess.Popen[bytes]:
-    fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
-    if serialisation_format == 'safetensors' and quantize is not None and os.environ.get(
-        'OPENLLM_SERIALIZATION_WARNING', str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
+    if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env(
+        'OPENLLM_SERIALIZATION_WARNING'):
      termui.echo(
          f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
          fg='yellow')
@@ -184,20 +185,18 @@ Available official model_id(s): [default: {llm_config['default_id']}]

    # Create a new model env to work with the envvar during CLI invocation
    env = openllm.utils.EnvVarMixin(config['model_name'],
-                                    config.default_implementation(),
+                                    backend,
                                    model_id=model_id or config['default_id'],
-                                    bettertransformer=bettertransformer,
-                                    quantize=quantize,
-                                    runtime=runtime)
-    prerequisite_check(ctx, config, quantize, adapter_map, int(1 / wpr))
+                                    quantize=quantize)
+    requirements = llm_config['requirements']
+    if requirements is not None and len(requirements) > 0:
+      missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
+      if len(missing_requirements) > 0:
+        termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')

    # NOTE: This is to set current configuration
    start_env = os.environ.copy()
    start_env = parse_config_options(config, server_timeout, wpr, device, cors, start_env)
-    if fast:
-      termui.echo(
-          f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'",
-          fg='yellow')

    start_env.update({
        'OPENLLM_MODEL': model,
@@ -205,21 +204,18 @@ Available official model_id(s): [default: {llm_config['default_id']}]
        'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()),
        'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(),
        'OPENLLM_SERIALIZATION': serialisation_format,
-        env.runtime: env['runtime_value'],
-        env.framework: env['framework_value']
+        env.backend: env['backend_value']
    })
    if env['model_id_value']: start_env[env.model_id] = str(env['model_id_value'])
-    # NOTE: quantize and bettertransformer value is already assigned within env
-    if bettertransformer is not None: start_env[env.bettertransformer] = str(env['bettertransformer_value'])
    if quantize is not None: start_env[env.quantize] = str(t.cast(str, env['quantize_value']))

-    llm = openllm.utils.infer_auto_class(env['framework_value']).for_model(model,
-                                                                           model_id=start_env[env.model_id],
-                                                                           model_version=model_version,
-                                                                           llm_config=config,
-                                                                           ensure_available=not fast,
-                                                                           adapter_map=adapter_map,
-                                                                           serialisation=serialisation_format)
+    llm = openllm.utils.infer_auto_class(env['backend_value']).for_model(model,
+                                                                         model_id=start_env[env.model_id],
+                                                                         model_version=model_version,
+                                                                         llm_config=config,
+                                                                         ensure_available=True,
+                                                                         adapter_map=adapter_map,
+                                                                         serialisation=serialisation_format)
    start_env.update({env.config: llm.config.model_dump_json().decode()})

    server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer(
@@ -268,21 +264,6 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, *

  return noop

-def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None,
-                       adapter_map: dict[str, str | None] | None, num_workers: int) -> None:
-  if adapter_map and not openllm.utils.is_peft_available():
-    ctx.fail(
-        "Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'")
-  if quantize and llm_config.default_implementation() == 'vllm':
-    ctx.fail(
-        f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization."
-    )
-  requirements = llm_config['requirements']
-  if requirements is not None and len(requirements) > 0:
-    missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
-    if len(missing_requirements) > 0:
-      termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')
-
 def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:

  def wrapper(fn: FC) -> t.Callable[[FC], FC]:
@@ -291,22 +272,21 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
        cog.optgroup.group(
            'General LLM Options',
            help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
-        model_id_option(factory=cog.optgroup, model_env=llm_config['env']), model_version_option(factory=cog.optgroup),
+        model_id_option(factory=cog.optgroup), model_version_option(factory=cog.optgroup),
        cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
        workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup),
-        fast_option(factory=cog.optgroup),
+        backend_option(factory=cog.optgroup),
        cog.optgroup.group('LLM Optimization Options',
                           help='''Optimization related options.

-            OpenLLM supports running model with [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/),
-            k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
+            OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.

            The following are either in our roadmap or currently being worked on:

            - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
            - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
            ''',
-                          ),
+                          ), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup),
        cog.optgroup.option('--device',
                            type=openllm.utils.dantic.CUDA,
                            multiple=True,
@@ -314,13 +294,6 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
                            callback=parse_device_callback,
                            help=f"Assign GPU devices (if available) for {llm_config['model_name']}.",
                            show_envvar=True),
-        cog.optgroup.option('--runtime',
-                            type=click.Choice(['ggml', 'transformers']),
-                            default='transformers',
-                            help='The runtime to use for the given model. Default is transformers.'),
-        quantize_option(factory=cog.optgroup, model_env=llm_config['env']),
-        bettertransformer_option(factory=cog.optgroup, model_env=llm_config['env']),
-        serialisation_option(factory=cog.optgroup),
        cog.optgroup.group('Fine-tuning related options',
                           help='''\
    Note that the argument `--adapter-id` can accept the following format:
@@ -439,18 +412,6 @@ def output_option(f: _AnyCallable | None = None,
                    shell_complete=complete_output_var,
                    **attrs)(f)

-def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option('--fast/--no-fast',
-                    show_default=True,
-                    default=False,
-                    envvar='OPENLLM_USE_LOCAL_LATEST',
-                    show_envvar=True,
-                    help='''Whether to skip checking if models is already in store.
-
-                                                                                                          This is useful if you already downloaded or setup the model beforehand.
-                                                                                                          ''',
-                    **attrs)(f)
-
 def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option('--cors/--no-cors',
                    show_default=True,
@@ -463,15 +424,12 @@ def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC
 def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f)

-def model_id_option(f: _AnyCallable | None = None,
-                    *,
-                    model_env: openllm.utils.EnvVarMixin | None = None,
-                    **attrs: t.Any) -> t.Callable[[FC], FC]:
+def model_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option('--model-id',
                    type=click.STRING,
                    default=None,
-                    envvar=model_env.model_id if model_env is not None else None,
-                    show_envvar=model_env is not None,
+                    envvar='OPENLLM_MODEL_ID',
+                    show_envvar=True,
                    help='Optional model_id name or path for (fine-tune) weight.',
                    **attrs)(f)

@@ -483,24 +441,31 @@ def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
      help='Optional model version to save for this model. It will be inferred automatically from model-id.',
      **attrs)(f)

+def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
+  # NOTE: The last two entries of LiteralBackend (ggml and mlc) are dropped because they are still WIP.
+  # XXX: drop the `[:-2]` truncation below once ggml and mlc are supported.
+  return cli_option('--backend',
+                    type=click.Choice(get_literal_args(LiteralBackend)[:-2]),
+                    default='pt',
+                    envvar='OPENLLM_BACKEND',
+                    show_envvar=True,
+                    help='The implementation for saving this LLM.',
+                    **attrs)(f)
+
 def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_argument('model_name',
                      type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]),
                      required=required,
                      **attrs)(f)

-def quantize_option(f: _AnyCallable | None = None,
-                    *,
-                    build: bool = False,
-                    model_env: openllm.utils.EnvVarMixin | None = None,
-                    **attrs: t.Any) -> t.Callable[[FC], FC]:
+def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option('--quantise',
                    '--quantize',
                    'quantize',
                    type=click.Choice(['int8', 'int4', 'gptq']),
                    default=None,
-                    envvar=model_env.quantize if model_env is not None else None,
-                    show_envvar=model_env is not None,
+                    envvar='OPENLLM_QUANTIZE',
+                    show_envvar=True,
                    help='''Dynamic quantization for running this LLM.

      The following quantization strategies are supported:
@@ -542,21 +507,6 @@ def workers_per_resource_option(f: _AnyCallable | None = None,
      > ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ''),
                    **attrs)(f)

-def bettertransformer_option(f: _AnyCallable | None = None,
-                             *,
-                             build: bool = False,
-                             model_env: openllm.utils.EnvVarMixin | None = None,
-                             **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--bettertransformer',
-      is_flag=True,
-      default=None,
-      envvar=model_env.bettertransformer if model_env is not None else None,
-      show_envvar=model_env is not None,
-      help='Apply FasterTransformer wrapper to serve model. This will applies during serving time.' if not build else
-      'Set default environment variable whether to serve this model with FasterTransformer in build time.',
-      **attrs)(f)
-
 def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option('--serialisation',
                    '--serialization',
@@ -586,22 +536,18 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
                    **attrs)(f)

 def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option('--container-registry',
-                    'container_registry',
-                    type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
-                    default='ecr',
-                    show_default=True,
-                    show_envvar=True,
-                    envvar='OPENLLM_CONTAINER_REGISTRY',
-                    callback=container_registry_callback,
-                    help='''The default container registry to get the base image for building BentoLLM.
-
-      Currently, it supports 'ecr', 'ghcr.io', 'docker.io'
-
-      \b
-      > [!NOTE] that in order to build the base image, you will need a GPUs to compile custom kernel. See ``openllm ext build-base-container`` for more information.
-      ''',
-                    **attrs)(f)
+  return cli_option(
+      '--container-registry',
+      'container_registry',
+      type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
+      default='ecr',
+      show_default=True,
+      show_envvar=True,
+      envvar='OPENLLM_CONTAINER_REGISTRY',
+      callback=container_registry_callback,
+      help=
+      'The default container registry to get the base image for building BentoLLM. Currently, it supports ecr, ghcr, docker',
+      **attrs)(f)

 _wpr_strategies = {'round_robin', 'conserved'}

--- a/openllm-python/src/openllm/cli/_sdk.py
+++ b/openllm-python/src/openllm/cli/_sdk.py
@@ -23,9 +23,9 @@ from ._factory import start_command_factory
 if t.TYPE_CHECKING:
  from bentoml._internal.bento import BentoStore
  from openllm_core._configuration import LLMConfig
+  from openllm_core._typing_compat import LiteralBackend
  from openllm_core._typing_compat import LiteralContainerRegistry
  from openllm_core._typing_compat import LiteralContainerVersionStrategy
-  from openllm_core._typing_compat import LiteralRuntime
  from openllm_core._typing_compat import LiteralString

 logger = logging.getLogger(__name__)
@@ -38,10 +38,8 @@ def _start(model_name: str,
           workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
           device: tuple[str, ...] | t.Literal['all'] | None = None,
           quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-           bettertransformer: bool | None = None,
-           runtime: t.Literal['ggml', 'transformers'] = 'transformers',
           adapter_map: dict[LiteralString, str | None] | None = None,
-           framework: LiteralRuntime | None = None,
+           backend: LiteralBackend | None = None,
           additional_args: list[str] | None = None,
           cors: bool = False,
           _serve_grpc: bool = False,
@@ -57,48 +55,42 @@ def _start(model_name: str,

  ``openllm.start`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as the CLI interaction.

-  > [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive.
-
  Args:
-      model_name: The model name to start this LLM
-      model_id: Optional model id for this given LLM
-      timeout: The server timeout
-      workers_per_resource: Number of workers per resource assigned.
-                            See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
-                            for more information. By default, this is set to 1.
+    model_name: The model name to start this LLM
+    model_id: Optional model id for this given LLM
+    timeout: The server timeout
+    workers_per_resource: Number of workers per resource assigned.
+                          See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
+                          for more information. By default, this is set to 1.

-                            > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
-                            > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
-                            > - ``conserved``: This will determine the number of available GPU resources, and only assign
-                            >                  one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is
-                            >                  equivalent to ``--workers-per-resource 0.25``.
-      device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. It also accepts 'all'
-      argument to assign all available GPUs to this LLM.
-      quantize: Quantize the model weights. This is only applicable for PyTorch models.
-                Possible quantisation strategies:
-                - int8: Quantize the model with 8bit (bitsandbytes required)
-                - int4: Quantize the model with 4bit (bitsandbytes required)
-                - gptq: Quantize the model with GPTQ (auto-gptq required)
-      bettertransformer: Convert given model to FastTransformer with PyTorch.
-      runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
-      cors: Whether to enable CORS for this LLM. By default, this is set to ``False``.
-      adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
-      framework: The framework to use for this LLM. By default, this is set to ``pt``.
-      additional_args: Additional arguments to pass to ``openllm start``.
+                          > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
+                          > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
+                          > - ``conserved``: This will determine the number of available GPU resources, and only assign
+                          >                  one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
+                          >                  equivalent to ``--workers-per-resource 0.25``.
+    device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. It also accepts 'all'
+    argument to assign all available GPUs to this LLM.
+    quantize: Quantize the model weights. This is only applicable for PyTorch models.
+              Possible quantisation strategies:
+              - int8: Quantize the model with 8bit (bitsandbytes required)
+              - int4: Quantize the model with 4bit (bitsandbytes required)
+              - gptq: Quantize the model with GPTQ (auto-gptq required)
+    cors: Whether to enable CORS for this LLM. By default, this is set to ``False``.
+    adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
+    backend: The backend to use for this LLM. By default, this is set to ``pt``.
+    additional_args: Additional arguments to pass to ``openllm start``.
  """
  from .entrypoint import start_command
  from .entrypoint import start_grpc_command
  llm_config = openllm.AutoConfig.for_model(model_name)
  _ModelEnv = openllm_core.utils.EnvVarMixin(model_name,
-                                             openllm_core.utils.first_not_none(
-                                                 framework, default=llm_config.default_implementation()),
+                                             backend=openllm_core.utils.first_not_none(
+                                                 backend, default=llm_config.default_backend()),
                                             model_id=model_id,
-                                             bettertransformer=bettertransformer,
-                                             quantize=quantize,
-                                             runtime=runtime)
-  os.environ[_ModelEnv.framework] = _ModelEnv['framework_value']
+                                             quantize=quantize)
+  os.environ[_ModelEnv.backend] = _ModelEnv['backend_value']

-  args: list[str] = ['--runtime', runtime]
+  args: list[str] = []
  if model_id: args.extend(['--model-id', model_id])
  if timeout: args.extend(['--server-timeout', str(timeout)])
  if workers_per_resource:
@@ -107,10 +99,7 @@ def _start(model_name: str,
        str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource
    ])
  if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)])
-  if quantize and bettertransformer:
-    raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
  if quantize: args.extend(['--quantize', str(quantize)])
-  elif bettertransformer: args.append('--bettertransformer')
  if cors: args.append('--cors')
  if adapter_map:
    args.extend(
@@ -134,12 +123,10 @@ def _build(model_name: str,
           model_version: str | None = None,
           bento_version: str | None = None,
           quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-           bettertransformer: bool | None = None,
           adapter_map: dict[str, str | None] | None = None,
           build_ctx: str | None = None,
           enable_features: tuple[str, ...] | None = None,
           workers_per_resource: float | None = None,
-           runtime: t.Literal['ggml', 'transformers'] = 'transformers',
           dockerfile_template: str | None = None,
           overwrite: bool = False,
           container_registry: LiteralContainerRegistry | None = None,
@@ -153,59 +140,50 @@ def _build(model_name: str,

  The LLM will be built into a BentoService with the following structure:
  if ``quantize`` is passed, it will instruct the model to be quantized dynamically during serving time.
-  if ``bettertransformer`` is passed, it will instruct the model to apply FasterTransformer during serving time.

  ``openllm.build`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as ``openllm build`` CLI.

-  > [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive.
-
  Args:
-      model_name: The model name to start this LLM
-      model_id: Optional model id for this given LLM
-      model_version: Optional model version for this given LLM
-      bento_version: Optional bento veresion for this given BentoLLM
-      quantize: Quantize the model weights. This is only applicable for PyTorch models.
-                Possible quantisation strategies:
-                - int8: Quantize the model with 8bit (bitsandbytes required)
-                - int4: Quantize the model with 4bit (bitsandbytes required)
-                - gptq: Quantize the model with GPTQ (auto-gptq required)
-      bettertransformer: Convert given model to FastTransformer with PyTorch.
-      adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
-      build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory.
-      enable_features: Additional OpenLLM features to be included with this BentoLLM.
-      workers_per_resource: Number of workers per resource assigned.
-                            See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
-                            for more information. By default, this is set to 1.
+    model_name: The model name to start this LLM
+    model_id: Optional model id for this given LLM
+    model_version: Optional model version for this given LLM
+    bento_version: Optional bento version for this given BentoLLM
+    quantize: Quantize the model weights. This is only applicable for PyTorch models.
+              Possible quantisation strategies:
+              - int8: Quantize the model with 8bit (bitsandbytes required)
+              - int4: Quantize the model with 4bit (bitsandbytes required)
+              - gptq: Quantize the model with GPTQ (auto-gptq required)
+    adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
+    build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory.
+    enable_features: Additional OpenLLM features to be included with this BentoLLM.
+    workers_per_resource: Number of workers per resource assigned.
+                          See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
+                          for more information. By default, this is set to 1.

-                            > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
-                            > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
-                            > - ``conserved``: This will determine the number of available GPU resources, and only assign
-                            >                  one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is
-                            >                  equivalent to ``--workers-per-resource 0.25``.
-      runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
-      dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template.
-      overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``.
-      push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.
-      containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
-                    Note that 'containerize' and 'push' are mutually exclusive
-                    container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
-      container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
-      container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
-      serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
-      additional_args: Additional arguments to pass to ``openllm build``.
-      bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.
+                          > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
+                          > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
+                          > - ``conserved``: This will determine the number of available GPU resources, and only assign
+                          >                  one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
+                          >                  equivalent to ``--workers-per-resource 0.25``.
+    dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template.
+    overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``.
+    push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.
+    containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
+                  Note that 'containerize' and 'push' are mutually exclusive
+                  container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
+    container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
+    container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
+    serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
+    additional_args: Additional arguments to pass to ``openllm build``.
+    bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.

  Returns:
      ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
  """
  args: list[str] = [
-      sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--runtime', runtime, '--serialisation',
-      serialisation_format
+      sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format
  ]
-  if quantize and bettertransformer:
-    raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
  if quantize: args.extend(['--quantize', quantize])
-  if bettertransformer: args.append('--bettertransformer')
  if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
  if push: args.extend(['--push'])
  if containerize: args.extend(['--containerize'])
@@ -241,8 +219,7 @@ def _import_model(model_name: str,
                  *,
                  model_id: str | None = None,
                  model_version: str | None = None,
-                  runtime: t.Literal['ggml', 'transformers'] = 'transformers',
-                  implementation: LiteralRuntime = 'pt',
+                  backend: LiteralBackend = 'pt',
                  quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
                  serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors',
                  additional_args: t.Sequence[str] | None = None) -> bentoml.Model:
@@ -259,28 +236,24 @@ def _import_model(model_name: str,
  > ``openllm.start`` will automatically invoke ``openllm.download`` under the hood.

  Args:
-      model_name: The model name to start this LLM
-      model_id: Optional model id for this given LLM
-      model_version: Optional model version for this given LLM
-      runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
-      implementation: The implementation to use for this LLM. By default, this is set to ``pt``.
-      quantize: Quantize the model weights. This is only applicable for PyTorch models.
-                Possible quantisation strategies:
-                - int8: Quantize the model with 8bit (bitsandbytes required)
-                - int4: Quantize the model with 4bit (bitsandbytes required)
-                - gptq: Quantize the model with GPTQ (auto-gptq required)
-      serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
-      Default behaviour is similar to ``safe_serialization=False``.
-      additional_args: Additional arguments to pass to ``openllm import``.
+    model_name: The model name to start this LLM
+    model_id: Optional model id for this given LLM
+    model_version: Optional model version for this given LLM
+    backend: The backend to use for this LLM. By default, this is set to ``pt``.
+    quantize: Quantize the model weights. This is only applicable for PyTorch models.
+              Possible quantisation strategies:
+              - int8: Quantize the model with 8bit (bitsandbytes required)
+              - int4: Quantize the model with 4bit (bitsandbytes required)
+              - gptq: Quantize the model with GPTQ (auto-gptq required)
+    serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
+    Default behaviour is similar to ``safe_serialization=False``.
+    additional_args: Additional arguments to pass to ``openllm import``.

  Returns:
-      ``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
+    ``bentoml.Model``: BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
  """
  from .entrypoint import import_command
-  args = [
-      model_name, '--runtime', runtime, '--implementation', implementation, '--machine', '--serialisation',
-      serialisation_format,
-  ]
+  args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation_format]
  if model_id is not None: args.append(model_id)
  if model_version is not None: args.extend(['--model-version', str(model_version)])
  if additional_args is not None: args.extend(additional_args)
--- a/openllm-python/src/openllm/cli/entrypoint.py
+++ b/openllm-python/src/openllm/cli/entrypoint.py
@@ -66,7 +66,7 @@ from openllm.models.auto import AutoLLM
 from openllm.utils import infer_auto_class
 from openllm_core._typing_compat import Concatenate
 from openllm_core._typing_compat import DictStrAny
-from openllm_core._typing_compat import LiteralRuntime
+from openllm_core._typing_compat import LiteralBackend
 from openllm_core._typing_compat import LiteralString
 from openllm_core._typing_compat import ParamSpec
 from openllm_core._typing_compat import Self
@@ -80,7 +80,6 @@ from openllm_core.utils import analytics
 from openllm_core.utils import bentoml_cattr
 from openllm_core.utils import compose
 from openllm_core.utils import configure_logging
-from openllm_core.utils import dantic
 from openllm_core.utils import first_not_none
 from openllm_core.utils import get_debug_mode
 from openllm_core.utils import get_quiet_mode
@@ -94,15 +93,13 @@ from . import termui
 from ._factory import FC
 from ._factory import LiteralOutput
 from ._factory import _AnyCallable
-from ._factory import bettertransformer_option
+from ._factory import backend_option
 from ._factory import container_registry_option
-from ._factory import fast_option
 from ._factory import machine_option
 from ._factory import model_id_option
 from ._factory import model_name_argument
 from ._factory import model_version_option
 from ._factory import output_option
-from ._factory import parse_device_callback
 from ._factory import quantize_option
 from ._factory import serialisation_option
 from ._factory import start_command_factory
@@ -205,21 +202,6 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):

    return t.cast(t.Callable[Concatenate[bool, P], t.Any], wrapper)

-  @staticmethod
-  def exception_handling(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[P, t.Any]:
-    command_name = attrs.get('name', func.__name__)
-
-    @functools.wraps(func)
-    def wrapper(*args: P.args, **attrs: P.kwargs) -> t.Any:
-      try:
-        return func(*args, **attrs)
-      except OpenLLMException as err:
-        raise click.ClickException(click.style(f"[{group.name}] '{command_name}' failed: " + err.message, fg='red')) from err
-      except KeyboardInterrupt:
-        pass
-
-    return wrapper
-
  def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
    if cmd_name in t.cast('Extensions', extension_command).list_commands(ctx):
      return t.cast('Extensions', extension_command).get_command(ctx, cmd_name)
@@ -253,11 +235,11 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
      name = name.replace('_', '-')
      kwargs.setdefault('help', inspect.getdoc(f))
      kwargs.setdefault('name', name)
-      wrapped = self.exception_handling(self.usage_tracking(self.common_params(f), self, **kwargs), self, **kwargs)
+      wrapped = self.usage_tracking(self.common_params(f), self, **kwargs)

      # move common parameters to end of the parameters list
      _memo = getattr(wrapped, '__click_params__', None)
-      if _memo is None: raise RuntimeError('Click command not register correctly.')
+      if _memo is None: raise ValueError('Click command not register correctly.')
      _object_setattr(wrapped, '__click_params__', _memo[-self.NUMBER_OF_COMMON_PARAMS:] + _memo[:-self.NUMBER_OF_COMMON_PARAMS])
      # NOTE: we need to call super of super to avoid conflict with BentoMLCommandGroup command setup
      cmd = super(BentoMLCommandGroup, self).command(*args, **kwargs)(wrapped)
@@ -348,11 +330,10 @@ _start_mapping = {
@click.argument('model_id', type=click.STRING, default=None, metavar='Optional[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=False)
@click.argument('converter', envvar='CONVERTER', type=click.STRING, default=None, required=False, metavar=None)
@model_version_option
-@click.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.')
@output_option
@quantize_option
@machine_option
-@click.option('--implementation', type=click.Choice(['pt', 'tf', 'flax', 'vllm']), default=None, help='The implementation for saving this LLM.')
+@backend_option
@serialisation_option
 def import_command(
    model_name: str,
@@ -360,9 +341,8 @@ def import_command(
    converter: str | None,
    model_version: str | None,
    output: LiteralOutput,
-    runtime: t.Literal['ggml', 'transformers'],
    machine: bool,
-    implementation: LiteralRuntime | None,
+    backend: LiteralBackend,
    quantize: t.Literal['int8', 'int4', 'gptq'] | None,
    serialisation_format: t.Literal['safetensors', 'legacy'],
 ) -> bentoml.Model:
@@ -415,45 +395,42 @@ def import_command(
  ```bash
  $ CONVERTER=llama2-hf openllm import llama /path/to/llama-2
  ```
-
-  > [!WARNING] This behaviour will override ``--runtime``. Therefore make sure that the LLM contains correct conversion strategies to both GGML and HF.
  """
  llm_config = AutoConfig.for_model(model_name)
-  env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, runtime=runtime, quantize=quantize)
-  impl: LiteralRuntime = first_not_none(implementation, default=env['framework_value'])
-  llm = infer_auto_class(impl).for_model(
+  env = EnvVarMixin(model_name, backend=llm_config.default_backend(), model_id=model_id, quantize=quantize)
+  backend = first_not_none(backend, default=env['backend_value'])
+  llm = infer_auto_class(backend).for_model(
      model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format
  )
  _previously_saved = False
  try:
    _ref = serialisation.get(llm)
    _previously_saved = True
-  except bentoml.exceptions.NotFound:
+  except openllm.exceptions.OpenLLMException:
    if not machine and output == 'pretty':
-      msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for implementation {llm.__llm_implementation__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
+      msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for backend {llm.__llm_backend__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
      termui.echo(msg, fg='yellow', nl=True)
    _ref = serialisation.get(llm, auto_import=True)
-    if impl == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
+    if backend == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
  if machine: return _ref
  elif output == 'pretty':
-    if _previously_saved: termui.echo(f"{model_name} with 'model_id={model_id}' is already setup for framework '{impl}': {_ref.tag!s}", nl=True, fg='yellow')
+    if _previously_saved: termui.echo(f"{model_name} with 'model_id={model_id}' is already setup for backend '{backend}': {_ref.tag!s}", nl=True, fg='yellow')
    else: termui.echo(f'Saved model: {_ref.tag}')
-  elif output == 'json': termui.echo(orjson.dumps({'previously_setup': _previously_saved, 'framework': impl, 'tag': str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode())
+  elif output == 'json': termui.echo(orjson.dumps({'previously_setup': _previously_saved, 'backend': backend, 'tag': str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode())
  else: termui.echo(_ref.tag)
  return _ref
+
@cli.command(context_settings={'token_normalize_func': inflection.underscore})
@model_name_argument
@model_id_option
@output_option
@machine_option
+@backend_option
@click.option('--bento-version', type=str, default=None, help='Optional bento version for this BentoLLM. Default is the the model revision.')
@click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.')
@workers_per_resource_option(factory=click, build=True)
-@click.option('--device', type=dantic.CUDA, multiple=True, envvar='CUDA_VISIBLE_DEVICES', callback=parse_device_callback, help='Set the device', show_envvar=True)
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Optimisation options')
@quantize_option(factory=cog.optgroup, build=True)
-@bettertransformer_option(factory=cog.optgroup)
-@click.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.')
@click.option(
    '--enable-features',
    multiple=True,
@@ -476,7 +453,6 @@ def import_command(
@click.option(
    '--container-version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='release', help="Default container version strategy for the image from '--container-registry'"
 )
-@fast_option
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options')
@cog.optgroup.option(
    '--containerize',
@@ -496,21 +472,18 @@ def build_command(
    bento_version: str | None,
    overwrite: bool,
    output: LiteralOutput,
-    runtime: t.Literal['ggml', 'transformers'],
    quantize: t.Literal['int8', 'int4', 'gptq'] | None,
    enable_features: tuple[str, ...] | None,
-    bettertransformer: bool | None,
    workers_per_resource: float | None,
    adapter_id: tuple[str, ...],
    build_ctx: str | None,
+    backend: LiteralBackend,
    machine: bool,
-    device: tuple[str, ...],
    model_version: str | None,
    dockerfile_template: t.TextIO | None,
    containerize: bool,
    push: bool,
    serialisation_format: t.Literal['safetensors', 'legacy'],
-    fast: bool,
    container_registry: LiteralContainerRegistry,
    container_version_strategy: LiteralContainerVersionStrategy,
    force_push: bool,
@@ -539,22 +512,21 @@ def build_command(
  _previously_built = False

  llm_config = AutoConfig.for_model(model_name)
-  env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, quantize=quantize, bettertransformer=bettertransformer, runtime=runtime)
+  env = EnvVarMixin(model_name, backend=backend, model_id=model_id, quantize=quantize)

  # NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
  # during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
  try:
-    os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), env.runtime: str(env['runtime_value']), 'OPENLLM_SERIALIZATION': serialisation_format})
+    os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation_format, 'OPENLLM_BACKEND': env['backend_value']})
    if env['model_id_value']: os.environ[env.model_id] = str(env['model_id_value'])
    if env['quantize_value']: os.environ[env.quantize] = str(env['quantize_value'])
-    os.environ[env.bettertransformer] = str(env['bettertransformer_value'])

-    llm = infer_auto_class(env['framework_value']).for_model(
-        model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs
+    llm = infer_auto_class(env['backend_value']).for_model(
+        model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=True, model_version=model_version, serialisation=serialisation_format, **attrs
    )

    labels = dict(llm.identifying_params)
-    labels.update({'_type': llm.llm_type, '_framework': env['framework_value']})
+    labels.update({'_type': llm.llm_type, '_framework': env['backend_value']})
    workers_per_resource = first_not_none(workers_per_resource, default=llm_config['workers_per_resource'])

    with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
@@ -603,10 +575,8 @@ def build_command(
            workers_per_resource=workers_per_resource,
            adapter_map=adapter_map,
            quantize=quantize,
-            bettertransformer=bettertransformer,
            extra_dependencies=enable_features,
            dockerfile_template=dockerfile_template_path,
-            runtime=runtime,
            container_registry=container_registry,
            container_version_strategy=container_version_strategy
        )
@@ -632,16 +602,17 @@ def build_command(

  if push: BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push)
  elif containerize:
-    backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker'))
+    container_backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker'))
    try:
-      bentoml.container.health(backend)
+      bentoml.container.health(container_backend)
    except subprocess.CalledProcessError:
-      raise OpenLLMException(f'Failed to use backend {backend}') from None
+      raise OpenLLMException(f'Failed to use backend {container_backend}') from None  # name the container builder; 'backend' is now the LLM backend option
    try:
-      bentoml.container.build(bento.tag, backend=backend, features=('grpc', 'io'))
+      bentoml.container.build(bento.tag, backend=container_backend, features=('grpc', 'io'))
    except Exception as err:
      raise OpenLLMException(f"Exception caught while containerizing '{bento.tag!s}':\n{err}") from err
  return bento
+
@cli.command()
@output_option
@click.option('--show-available', is_flag=True, default=False, help="Show available models in local store (mutually exclusive with '-o porcelain').")
@@ -667,21 +638,21 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
  else:
    failed_initialized: list[tuple[str, Exception]] = []

-    json_data: dict[str, dict[t.Literal['architecture', 'model_id', 'url', 'installation', 'cpu', 'gpu', 'runtime_impl'], t.Any] | t.Any] = {}
+    json_data: dict[str, dict[t.Literal['architecture', 'model_id', 'url', 'installation', 'cpu', 'gpu', 'backend'], t.Any] | t.Any] = {}
    converted: list[str] = []
    for m in models:
      config = AutoConfig.for_model(m)
-      runtime_impl: tuple[str, ...] = ()
-      if config['model_name'] in MODEL_MAPPING_NAMES: runtime_impl += ('pt',)
-      if config['model_name'] in MODEL_FLAX_MAPPING_NAMES: runtime_impl += ('flax',)
-      if config['model_name'] in MODEL_TF_MAPPING_NAMES: runtime_impl += ('tf',)
-      if config['model_name'] in MODEL_VLLM_MAPPING_NAMES: runtime_impl += ('vllm',)
+      backend: tuple[str, ...] = ()
+      if config['model_name'] in MODEL_MAPPING_NAMES: backend += ('pt',)
+      if config['model_name'] in MODEL_FLAX_MAPPING_NAMES: backend += ('flax',)
+      if config['model_name'] in MODEL_TF_MAPPING_NAMES: backend += ('tf',)
+      if config['model_name'] in MODEL_VLLM_MAPPING_NAMES: backend += ('vllm',)
      json_data[m] = {
          'architecture': config['architecture'],
          'model_id': config['model_ids'],
          'cpu': not config['requires_gpu'],
          'gpu': True,
-          'runtime_impl': runtime_impl,
+          'backend': backend,
          'installation': f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config['requirements'] else 'openllm',
      }
      converted.extend([normalise_model_name(i) for i in config['model_ids']])
@@ -708,10 +679,10 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
      import tabulate

      tabulate.PRESERVE_WHITESPACE = True
-      # llm, architecture, url, model_id, installation, cpu, gpu, runtime_impl
-      data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralRuntime, ...]]] = []
+      # llm, architecture, url, model_id, installation, cpu, gpu, backend
+      data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralBackend, ...]]] = []
      for m, v in json_data.items():
-        data.extend([(m, v['architecture'], v['model_id'], v['installation'], '❌' if not v['cpu'] else '✅', '✅', v['runtime_impl'],)])
+        data.extend([(m, v['architecture'], v['model_id'], v['installation'], '❌' if not v['cpu'] else '✅', '✅', v['backend'],)])
      column_widths = [
          int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 4),
      ]
--- a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
@@ -18,7 +18,7 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
                             prompt,
                             generation_config=self.config.model_construct_env(**attrs).to_generation_config())

-  def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
+  def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
    import torch
    import torch.nn.functional as F
    embeddings: list[list[float]] = []
@@ -30,4 +30,4 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
        data = F.normalize(torch.mean(outputs.hidden_states[-1].transpose(0, 1), dim=0), p=2, dim=0)
        embeddings.append(data.tolist())
        num_tokens += len(input_ids[0])
-    return openllm.LLMEmbeddings(embeddings=embeddings, num_tokens=num_tokens)
+    return openllm.EmbeddingsOutput(embeddings=embeddings, num_tokens=num_tokens)
--- a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -17,7 +17,7 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer
                              generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
          skip_special_tokens=True)

-  def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
+  def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
    import torch
    import torch.nn.functional as F
    embeddings: list[list[float]] = []
@@ -29,4 +29,4 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer
        data = F.normalize(torch.mean(outputs.encoder_last_hidden_state[0], dim=0), p=2, dim=0)
        embeddings.append(data.tolist())
        num_tokens += len(input_ids[0])
-    return openllm.LLMEmbeddings(embeddings=embeddings, num_tokens=num_tokens)
+    return openllm.EmbeddingsOutput(embeddings=embeddings, num_tokens=num_tokens)
--- a/openllm-python/src/openllm/models/llama/modeling_llama.py
+++ b/openllm-python/src/openllm/models/llama/modeling_llama.py
@@ -13,7 +13,7 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
    import torch
    return {'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {}

-  def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
+  def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
    import torch
    import torch.nn.functional as F
    encoding = self.tokenizer(prompts, padding=True, return_tensors='pt').to(self.device)
@@ -23,8 +23,8 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
      mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
      masked_embeddings = data * mask
      sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
-    return openllm.LLMEmbeddings(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
-                                 num_tokens=int(torch.sum(attention_mask).item()))
+    return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
+                                    num_tokens=int(torch.sum(attention_mask).item()))

  def generate_one(self, prompt: str, stop: list[str],
                   **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
--- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py
+++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py
@@ -33,10 +33,6 @@ def get_mpt_config(model_id_or_path: str,
 class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXTokenizerFast']):
  __openllm_internal__ = True

-  def llm_post_init(self) -> None:
-    import torch
-    self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-
  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    import torch
@@ -49,7 +45,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
    import torch
    import transformers
    _, tokenizer_attrs = self.llm_parameters
-    torch_dtype = attrs.pop('torch_dtype', self.dtype)
+    torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
    device_map = attrs.pop('device_map', None)
    attrs.pop('low_cpu_mem_usage', None)
    config = get_mpt_config(self.model_id,
@@ -75,7 +71,8 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken

  def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
+    import torch  # local import like the sibling methods; the dtype default below needs torch at call time
    import transformers
-    torch_dtype = attrs.pop('torch_dtype', self.dtype)
+    torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
    device_map = attrs.pop('device_map', None)
    trust_remote_code = attrs.pop('trust_remote_code', True)
    config = get_mpt_config(self._bentomodel.path,
--- a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
+++ b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
@@ -8,10 +8,6 @@ if t.TYPE_CHECKING:
 class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNeoXTokenizerFast']):
  __openllm_internal__ = True

-  def llm_post_init(self) -> None:
-    import torch
-    self.bettertransformer = True if not torch.cuda.is_available() else False
-
  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    import torch
--- a/openllm-python/src/openllm/serialisation/init.py
+++ b/openllm-python/src/openllm/serialisation/init.py
@@ -1,27 +1,9 @@
-"""Serialisation utilities for OpenLLM.
+'''Serialisation utilities for OpenLLM.

 Currently supports transformers for PyTorch, Tensorflow and Flax.

 Currently, GGML format is working in progress.
-
-## Usage
-
-```python
-import openllm
-
-llm = openllm.AutoLLM.for_model("dolly-v2")
-llm.save_pretrained("./path/to/local-dolly")
-```
-
-To use different runtime, specify directly in the `for_model` method:
-
-```python
-import openllm
-
-llm = openllm.AutoLLM.for_model("dolly-v2", runtime='ggml')
-llm.save_pretrained("./path/to/local-dolly")
-```
-"""
+'''
 from __future__ import annotations
 import importlib
 import typing as t
@@ -54,7 +36,7 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
  from .transformers._helpers import infer_tokenizers_from_llm
  from .transformers._helpers import process_config

-  config, *_ = process_config(llm._bentomodel.path, llm.__llm_trust_remote_code__)
+  config, *_ = process_config(llm._bentomodel.path, llm.trust_remote_code)
  bentomodel_fs = fs.open_fs(llm._bentomodel.path)
  if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
    with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, 'rb') as cofile:
@@ -62,12 +44,11 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
        tokenizer = cloudpickle.load(t.cast('t.IO[bytes]', cofile))['tokenizer']
      except KeyError:
        raise openllm.exceptions.OpenLLMException(
-            "Bento model does not have tokenizer. Make sure to save"
-            " the tokenizer within the model via 'custom_objects'."
-            " For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
+            "Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. "
+            "For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
  else:
    tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'),
-                                                               trust_remote_code=llm.__llm_trust_remote_code__,
+                                                               trust_remote_code=llm.trust_remote_code,
                                                               **tokenizer_attrs)

  if tokenizer.pad_token_id is None:
@@ -82,18 +63,20 @@ class _Caller(t.Protocol[P]):
  def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
    ...

-_extras = ['get', 'import_model', 'save_pretrained', 'load_model']
+_extras = ['get', 'import_model', 'load_model']

 def _make_dispatch_function(fn: str) -> _Caller[P]:

  def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
-    """Generic function dispatch to correct serialisation submodules based on LLM runtime.
+    """Generic function dispatch to correct serialisation submodules based on LLM backend.

-    > [!NOTE] See 'openllm.serialisation.transformers' if 'llm.runtime="transformers"'
+    > [!NOTE] See 'openllm.serialisation.transformers' if 'llm.__llm_backend__ in ("pt", "tf", "flax", "vllm")'

-    > [!NOTE] See 'openllm.serialisation.ggml' if 'llm.runtime="ggml"'
+    > [!NOTE] See 'openllm.serialisation.ggml' if 'llm.__llm_backend__="ggml"'
    """
-    return getattr(importlib.import_module(f'.{llm.runtime}', __name__), fn)(llm, *args, **kwargs)
+    serde = 'transformers'
+    if llm.__llm_backend__ == 'ggml': serde = 'ggml'
+    return getattr(importlib.import_module(f'.{serde}', __name__), fn)(llm, *args, **kwargs)

  return caller

@@ -105,9 +88,6 @@ if t.TYPE_CHECKING:
  def import_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> bentoml.Model:
    ...

-  def save_pretrained(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> None:
-    ...
-
  def load_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> M:
    ...

--- a/openllm-python/src/openllm/serialisation/ggml.py
+++ b/openllm-python/src/openllm/serialisation/ggml.py
@@ -5,10 +5,10 @@ This requires ctransformers to be installed.
 from __future__ import annotations
 import typing as t

-import bentoml
-import openllm
-
 if t.TYPE_CHECKING:
+  import bentoml
+  import openllm
+
  from openllm_core._typing_compat import M

 _conversion_strategy = {'pt': 'ggml'}
@@ -21,30 +21,7 @@ def import_model(llm: openllm.LLM[t.Any, t.Any],
  raise NotImplementedError('Currently work in progress.')

 def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Model:
-  '''Return an instance of ``bentoml.Model`` from given LLM instance.
-
-  By default, it will try to check the model in the local store.
-  If model is not found, and ``auto_import`` is set to True, it will try to import the model from HuggingFace Hub.
-
-  Otherwise, it will raises a ``bentoml.exceptions.NotFound``.
-  '''
-  try:
-    model = bentoml.models.get(llm.tag)
-    if model.info.module not in ('openllm.serialisation.ggml', __name__):
-      raise bentoml.exceptions.NotFound(
-          f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'."
-      )
-    if 'runtime' in model.info.labels and model.info.labels['runtime'] != llm.runtime:
-      raise openllm.exceptions.OpenLLMException(
-          f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
-    return model
-  except bentoml.exceptions.NotFound:
-    if auto_import:
-      return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__)
-    raise
+  raise NotImplementedError('Currently work in progress.')

 def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
  raise NotImplementedError('Currently work in progress.')
-
-def save_pretrained(llm: openllm.LLM[t.Any, t.Any], save_directory: str, **attrs: t.Any) -> None:
-  raise NotImplementedError('Currently work in progress.')
--- a/openllm-python/src/openllm/serialisation/transformers/init.py
+++ b/openllm-python/src/openllm/serialisation/transformers/init.py
@@ -5,6 +5,7 @@ import logging
 import typing as t

 from huggingface_hub import snapshot_download
+from packaging.version import Version
 from simple_di import Provide
 from simple_di import inject

@@ -28,22 +29,18 @@ if t.TYPE_CHECKING:
  import auto_gptq as autogptq
  import torch
  import torch.nn
-  import transformers
-  import vllm

  from bentoml._internal.models import ModelStore
  from openllm_core._typing_compat import DictStrAny
  from openllm_core._typing_compat import M
  from openllm_core._typing_compat import T
 else:
-  vllm = openllm.utils.LazyLoader('vllm', globals(), 'vllm')
  autogptq = openllm.utils.LazyLoader('autogptq', globals(), 'auto_gptq')
-  transformers = openllm.utils.LazyLoader('transformers', globals(), 'transformers')
  torch = openllm.utils.LazyLoader('torch', globals(), 'torch')

 logger = logging.getLogger(__name__)

-__all__ = ['import_model', 'get', 'load_model', 'save_pretrained']
+__all__ = ['import_model', 'get', 'load_model']

@inject
 def import_model(llm: openllm.LLM[M, T],
@@ -74,7 +71,7 @@ def import_model(llm: openllm.LLM[M, T],
  safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'),
                                                    default=llm._serialisation_format == 'safetensors')
  # Disable safe serialization with vLLM
-  if llm.__llm_implementation__ == 'vllm': safe_serialisation = False
+  if llm.__llm_backend__ == 'vllm': safe_serialisation = False
  metadata: DictStrAny = {
      'safe_serialisation': safe_serialisation,
      '_quantize': quantize_method is not None and quantize_method
@@ -95,8 +92,8 @@ def import_model(llm: openllm.LLM[M, T],
    # since saving int4 is not yet supported
    if 'quantization_config' in attrs and getattr(attrs['quantization_config'], 'load_in_4bit', False):
      attrs.pop('quantization_config')
-    if llm.__llm_implementation__ != 'flax': attrs['use_safetensors'] = safe_serialisation
-    metadata['_framework'] = 'pt' if llm.__llm_implementation__ == 'vllm' else llm.__llm_implementation__
+    if llm.__llm_backend__ != 'flax': attrs['use_safetensors'] = safe_serialisation
+    metadata['_framework'] = 'pt' if llm.__llm_backend__ == 'vllm' else llm.__llm_backend__

  tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id,
                                                             trust_remote_code=trust_remote_code,
@@ -108,7 +105,7 @@ def import_model(llm: openllm.LLM[M, T],
  imported_modules: list[types.ModuleType] = []
  bentomodel = bentoml.Model.create(llm.tag,
                                    module='openllm.serialisation.transformers',
-                                    api_version='v1',
+                                    api_version='v2',
                                    options=ModelOptions(),
                                    context=openllm.utils.generate_context(framework_name='openllm'),
                                    labels=openllm.utils.generate_labels(llm),
@@ -133,8 +130,7 @@ def import_model(llm: openllm.LLM[M, T],
                                                            trust_remote_code=trust_remote_code,
                                                            use_safetensors=safe_serialisation,
                                                            **hub_attrs,
-                                                            **attrs,
-                                                           )
+                                                            **attrs)
        update_model(bentomodel,
                     metadata={
                         '_pretrained_class': model.__class__.__name__,
@@ -192,27 +188,21 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
  '''
  try:
    model = bentoml.models.get(llm.tag)
-    if model.info.module not in ('openllm.serialisation.transformers'
-                                 'bentoml.transformers', 'bentoml._internal.frameworks.transformers',
-                                 __name__):  # NOTE: backward compatible with previous version of OpenLLM.
-      raise bentoml.exceptions.NotFound(
-          f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'."
-      )
-    if 'runtime' in model.info.labels and model.info.labels['runtime'] != llm.runtime:
+    if Version(model.info.api_version) < Version('v2'):
      raise openllm.exceptions.OpenLLMException(
-          f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
+          'Please run "openllm prune -y --include-bentos" and upgrade all saved model to latest release.')
+    if model.info.labels['backend'] != llm.__llm_backend__:
+      raise openllm.exceptions.OpenLLMException(
+          f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}."
+      )
    return model
-  except bentoml.exceptions.NotFound as err:
-    if auto_import: return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__)
-    raise err from None
+  except Exception as err:
+    if auto_import: return import_model(llm, trust_remote_code=llm.trust_remote_code)
+    raise openllm.exceptions.OpenLLMException(
+        f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err

 def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
-  '''Load the model from BentoML store.
-
-  By default, it will try to find check the model in the local store.
-  If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
-  '''
-  config, hub_attrs, attrs = process_config(llm.model_id, llm.__llm_trust_remote_code__, **attrs)
+  config, hub_attrs, attrs = process_config(llm.model_id, llm.trust_remote_code, **attrs)
  safe_serialization = openllm.utils.first_not_none(t.cast(
      t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)),
                                                    attrs.pop('safe_serialization', None),
@@ -229,7 +219,7 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
                                                       *decls,
                                                       quantize_config=t.cast('autogptq.BaseQuantizeConfig',
                                                                              llm.quantization_config),
-                                                       trust_remote_code=llm.__llm_trust_remote_code__,
+                                                       trust_remote_code=llm.trust_remote_code,
                                                       use_safetensors=safe_serialization,
                                                       **hub_attrs,
                                                       **attrs)
@@ -238,57 +228,9 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
  model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path,
                                                                *decls,
                                                                config=config,
-                                                                trust_remote_code=llm.__llm_trust_remote_code__,
+                                                                trust_remote_code=llm.trust_remote_code,
                                                                device_map=device_map,
                                                                **hub_attrs,
                                                                **attrs).eval()
-  # BetterTransformer is currently only supported on PyTorch.
-  if llm.bettertransformer and isinstance(model, transformers.PreTrainedModel): model = model.to_bettertransformer()
-  if llm.__llm_implementation__ in {'pt', 'vllm'}: check_unintialised_params(model)
+  if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model)
  return t.cast('M', model)
-
-def save_pretrained(llm: openllm.LLM[M, T],
-                    save_directory: str,
-                    is_main_process: bool = True,
-                    state_dict: DictStrAny | None = None,
-                    save_function: t.Any | None = None,
-                    push_to_hub: bool = False,
-                    max_shard_size: int | str = '10GB',
-                    safe_serialization: bool = False,
-                    variant: str | None = None,
-                    **attrs: t.Any) -> None:
-  save_function = t.cast(t.Callable[..., None], openllm.utils.first_not_none(save_function, default=torch.save))
-  model_save_attrs, tokenizer_save_attrs = openllm.utils.normalize_attrs_to_model_tokenizer_pair(**attrs)
-  safe_serialization = safe_serialization or llm._serialisation_format == 'safetensors'
-  # NOTE: disable safetensors for vllm
-  if llm.__llm_implementation__ == 'vllm': safe_serialization = False
-  if llm._quantize_method == 'gptq':
-    if not openllm.utils.is_autogptq_available():
-      raise openllm.exceptions.OpenLLMException(
-          "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
-      )
-    if llm.config['model_type'] != 'causal_lm':
-      raise openllm.exceptions.OpenLLMException(
-          f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
-    if not openllm.utils.lenient_issubclass(llm.model, autogptq.modeling.BaseGPTQForCausalLM):
-      raise ValueError(f'Model is not a BaseGPTQForCausalLM (type: {type(llm.model)})')
-    t.cast('autogptq.modeling.BaseGPTQForCausalLM', llm.model).save_quantized(save_directory,
-                                                                              use_safetensors=safe_serialization)
-  elif openllm.utils.LazyType['vllm.LLMEngine']('vllm.LLMEngine').isinstance(llm.model):
-    raise RuntimeError(
-        "vllm.LLMEngine cannot be serialisation directly. This happens when 'save_pretrained' is called directly after `openllm.AutoVLLM` is initialized."
-    )
-  elif isinstance(llm.model, transformers.Pipeline):
-    llm.model.save_pretrained(save_directory, safe_serialization=safe_serialization)
-  else:
-    # We can safely cast here since it will be the PreTrainedModel protocol.
-    t.cast('transformers.PreTrainedModel', llm.model).save_pretrained(save_directory,
-                                                                      is_main_process=is_main_process,
-                                                                      state_dict=state_dict,
-                                                                      save_function=save_function,
-                                                                      push_to_hub=push_to_hub,
-                                                                      max_shard_size=max_shard_size,
-                                                                      safe_serialization=safe_serialization,
-                                                                      variant=variant,
-                                                                      **model_save_attrs)
-  llm.tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)
--- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py
+++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py
@@ -76,7 +76,7 @@ def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.Pretra
    if type(config) in transformers.MODEL_FOR_CAUSAL_LM_MAPPING: idx = 0
    elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING: idx = 1
    else: raise openllm.exceptions.OpenLLMException(f'Model type {type(config)} is not supported yet.')
-    return getattr(transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[llm.__llm_implementation__][idx])
+    return getattr(transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[llm.__llm_backend__][idx])

 def check_unintialised_params(model: torch.nn.Module) -> None:
  unintialized = [n for n, param in model.named_parameters() if param.data.device == torch.device('meta')]
@@ -104,11 +104,11 @@ def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Mod
 def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
  infer_fn: tuple[str, ...] = ('__call__',)
  default_config = ModelSignature(batchable=False)
-  if llm.__llm_implementation__ in {'pt', 'vllm'}:
+  if llm.__llm_backend__ in {'pt', 'vllm'}:
    infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample',
                 'group_beam_search', 'constrained_beam_search',
                )
-  elif llm.__llm_implementation__ == 'tf':
+  elif llm.__llm_backend__ == 'tf':
    infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search',
                 'contrastive_search',
                )
--- a/openllm-python/src/openllm/serialisation/transformers/weights.py
+++ b/openllm-python/src/openllm/serialisation/transformers/weights.py
@@ -23,9 +23,9 @@ class HfIgnore:

  @classmethod
  def ignore_patterns(cls, llm: openllm.LLM[M, T]) -> list[str]:
-    if llm.__llm_implementation__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors]
-    elif llm.__llm_implementation__ == 'tf': base = [cls.flax, cls.pt]
-    elif llm.__llm_implementation__ == 'flax':
+    if llm.__llm_backend__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors]
+    elif llm.__llm_backend__ == 'tf': base = [cls.flax, cls.pt]
+    elif llm.__llm_backend__ == 'flax':
      base = [cls.tf, cls.pt, cls.safetensors]  # as of current, safetensors is not supported with flax
    else:
      base = [cls.tf, cls.flax]
--- a/openllm-python/src/openllm/testing.py
+++ b/openllm-python/src/openllm/testing.py
@@ -10,7 +10,7 @@ import bentoml
 import openllm

 if t.TYPE_CHECKING:
-  from openllm_core._typing_compat import LiteralRuntime
+  from openllm_core._typing_compat import LiteralBackend

 logger = logging.getLogger(__name__)

@@ -18,10 +18,9 @@ logger = logging.getLogger(__name__)
 def build_bento(model: str,
                model_id: str | None = None,
                quantize: t.Literal['int4', 'int8', 'gptq'] | None = None,
-                runtime: t.Literal['ggml', 'transformers'] = 'transformers',
                cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
  logger.info('Building BentoML for %s', model)
-  bento = openllm.build(model, model_id=model_id, quantize=quantize, runtime=runtime)
+  bento = openllm.build(model, model_id=model_id, quantize=quantize)
  yield bento
  if cleanup:
    logger.info('Deleting %s', bento.tag)
@@ -49,7 +48,7 @@ def build_container(bento: bentoml.Bento | str | bentoml.Tag,
@contextlib.contextmanager
 def prepare(model: str,
            model_id: str | None = None,
-            implementation: LiteralRuntime = 'pt',
+            implementation: LiteralBackend = 'pt',
            deployment_mode: t.Literal['container', 'local'] = 'local',
            clean_context: contextlib.ExitStack | None = None,
            cleanup: bool = True) -> t.Iterator[str]:
--- a/openllm-python/src/openllm/utils/init.py
+++ b/openllm-python/src/openllm/utils/init.py
@@ -16,11 +16,11 @@ from . import dummy_vllm_objects as dummy_vllm_objects
 if t.TYPE_CHECKING:
  import openllm

-  from openllm_core._typing_compat import LiteralRuntime
+  from openllm_core._typing_compat import LiteralBackend

 def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
  return {
-      'runtime': llm.runtime,
+      'backend': llm.__llm_backend__,
      'framework': 'openllm',
      'model_name': llm.config['model_name'],
      'architecture': llm.config['architecture'],
@@ -28,14 +28,13 @@ def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
  }

 def infer_auto_class(
-    implementation: LiteralRuntime
-) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
+    backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
+  '''Return the Auto class matching ``backend``; any other value raises ``RuntimeError``.'''
  import openllm
-  if implementation == 'tf': return openllm.AutoTFLLM
-  elif implementation == 'flax': return openllm.AutoFlaxLLM
-  elif implementation == 'pt': return openllm.AutoLLM
-  elif implementation == 'vllm': return openllm.AutoVLLM
-  else: raise RuntimeError(f"Unknown implementation: {implementation} (supported: 'pt', 'flax', 'tf', 'vllm')")
+  known = (('tf', 'AutoTFLLM'), ('flax', 'AutoFlaxLLM'), ('pt', 'AutoLLM'), ('vllm', 'AutoVLLM'))
+  for name, attr in known:
+    if backend == name: return getattr(openllm, attr)  # attribute is looked up only for the matching backend
+  raise RuntimeError(f"Unknown backend: {backend} (supported: 'pt', 'flax', 'tf', 'vllm')")

 __all__ = [
    'generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects',
--- a/openllm-python/tests/_strategies/_configuration.py
+++ b/openllm-python/tests/_strategies/_configuration.py
@@ -30,12 +30,10 @@ def model_settings(draw: st.DrawFn):
          st.booleans(),
      'requirements':
          st.none() | st.lists(st.text(), min_size=1),
-      'default_implementation':
+      'default_backend':
          st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])),
      'model_type':
          st.sampled_from(['causal_lm', 'seq2seq_lm']),
-      'runtime':
-          st.sampled_from(['transformers', 'ggml']),
      'name_type':
          st.sampled_from(['dasherize', 'lowercase']),
      'timeout':
--- a/openllm-python/tests/configuration_test.py
+++ b/openllm-python/tests/configuration_test.py
@@ -111,10 +111,7 @@ def patch_env(**attrs: t.Any):
    yield

 def test_struct_envvar():
-  with patch_env(**{
-      field_env_key('env_llm', 'field1'): '4',
-      field_env_key('env_llm', 'temperature', suffix='generation'): '0.2',
-  }):
+  with patch_env(**{field_env_key('field1'): '4', field_env_key('temperature', suffix='generation'): '0.2',}):

    class EnvLLM(openllm.LLMConfig):
      __config__ = {'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel',}
@@ -146,8 +143,8 @@ def test_struct_provided_fields():

 def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPatch):
  with monkeypatch.context() as mk:
-    mk.setenv(field_env_key('overwrite_with_env_available', 'field1'), str(4.0))
-    mk.setenv(field_env_key('overwrite_with_env_available', 'temperature', suffix='generation'), str(0.2))
+    mk.setenv(field_env_key('field1'), str(4.0))
+    mk.setenv(field_env_key('temperature', suffix='generation'), str(0.2))
    sent = make_llm_config('OverwriteWithEnvAvailable', {
        'default_id': 'asdfasdf',
        'model_ids': ['asdf', 'asdfasdfads'],
--- a/openllm-python/tests/conftest.py
+++ b/openllm-python/tests/conftest.py
@@ -8,9 +8,9 @@ import pytest
 import openllm

 if t.TYPE_CHECKING:
-  from openllm_core._typing_compat import LiteralRuntime
+  from openllm_core._typing_compat import LiteralBackend

-_FRAMEWORK_MAPPING = {
+_MODELING_MAPPING = {
    'flan_t5': 'google/flan-t5-small',
    'opt': 'facebook/opt-125m',
    'baichuan': 'baichuan-inc/Baichuan-7B',
@@ -22,19 +22,17 @@ _PROMPT_MAPPING = {

 def parametrise_local_llm(
    model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]:
-  if model not in _FRAMEWORK_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.")
-  runtime_impl: tuple[LiteralRuntime, ...] = tuple()
-  if model in openllm.MODEL_MAPPING_NAMES: runtime_impl += ('pt',)
-  if model in openllm.MODEL_FLAX_MAPPING_NAMES: runtime_impl += ('flax',)
-  if model in openllm.MODEL_TF_MAPPING_NAMES: runtime_impl += ('tf',)
-  for framework, prompt in itertools.product(runtime_impl, _PROMPT_MAPPING.keys()):
-    llm = openllm.Runner(model,
-                         model_id=_FRAMEWORK_MAPPING[model],
-                         ensure_available=True,
-                         implementation=framework,
-                         init_local=True,
-                        )
-    yield prompt, llm
+  '''Yield (prompt key, runner) pairs for each backend whose mapping lists ``model``; skip models without a model id.'''
+  if model not in _MODELING_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.")
+  # Probe the per-backend name mappings in a fixed order (pt, flax, tf) so the yielded cases keep a stable order.
+  mappings = (('pt', openllm.MODEL_MAPPING_NAMES), ('flax', openllm.MODEL_FLAX_MAPPING_NAMES),
+              ('tf', openllm.MODEL_TF_MAPPING_NAMES))
+  backends: tuple[LiteralBackend, ...] = tuple(name for name, names in mappings if model in names)
+  # product() over a dict iterates its keys.
+  for backend, prompt in itertools.product(backends, _PROMPT_MAPPING):
+    runner = openllm.Runner(
+        model, model_id=_MODELING_MAPPING[model], ensure_available=True, backend=backend, init_local=True)
+    yield prompt, runner

 def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
  if os.getenv('GITHUB_ACTIONS') is None:
--- a/openllm-python/tests/package_test.py
+++ b/openllm-python/tests/package_test.py
@@ -4,6 +4,7 @@ import os
 import typing as t

 import pytest
+import transformers

 import openllm

@@ -28,7 +29,7 @@ def test_general_build_with_internal_testing():
  bento = openllm.build('flan-t5', model_id=HF_INTERNAL_T5_TESTING)

  assert llm.llm_type == bento.info.labels['_type']
-  assert llm.config['env']['framework_value'] == bento.info.labels['_framework']
+  assert llm.config['env']['backend_value'] == bento.info.labels['_framework']

  bento = openllm.build('flan-t5', model_id=HF_INTERNAL_T5_TESTING)
  assert len(bento_store.list(bento.tag)) == 1
@@ -38,10 +39,11 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory):
  local_path = tmp_path_factory.mktemp('local_t5')
  llm = openllm.AutoLLM.for_model('flan-t5', model_id=HF_INTERNAL_T5_TESTING, ensure_available=True)

-  if llm.bettertransformer:
-    llm.__llm_model__ = llm.model.reverse_bettertransformer()
-
-  llm.save_pretrained(local_path)
+  if isinstance(llm.model, transformers.Pipeline):
+    llm.model.save_pretrained(str(local_path))
+  else:
+    llm.model.save_pretrained(str(local_path))
+    llm.tokenizer.save_pretrained(str(local_path))

  assert openllm.build('flan-t5', model_id=local_path.resolve().__fspath__(), model_version='local')