From 3e45530abde781618758d049de1d4d4ad6eb8369 Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Fri, 1 Sep 2023 05:15:19 -0400
Subject: [PATCH] refactor(breaking): unify LLM API (#283)

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---
 .github/SECURITY.md                           |   3 +-
 CHANGELOG.md                                  |   7 +-
 README.md                                     |  11 +-
 changelog.d/283.breaking.md                   |  20 +
 hatch.toml                                    |   1 -
 openllm-client/src/openllm_client/_base.py    |   8 +-
 openllm-client/src/openllm_client/py.typed    |   0
 .../side_bar/model_selection/db.cljs          |   4 +-
 .../src/openllm_core/_configuration.py        | 109 ++-
 openllm-core/src/openllm_core/_schema.py      |   4 +-
 openllm-core/src/openllm_core/_strategies.py  |   2 +-
 .../src/openllm_core/_typing_compat.py        |  42 +-
 .../config/configuration_dolly_v2.py          |   8 +-
 .../config/configuration_flan_t5.py           |   6 +-
 .../config/configuration_llama.py             |   9 +-
 .../openllm_core/config/configuration_opt.py  |   6 +-
 openllm-core/src/openllm_core/py.typed        |   0
 .../src/openllm_core/utils/__init__.py        |  24 +-
 .../src/openllm_core/utils/analytics.py       |   3 +-
 .../src/openllm_core/utils/codegen.py         |  11 +-
 .../src/openllm_core/utils/import_utils.py    |  58 +-
 openllm-python/src/openllm/__init__.py        |  11 +-
 openllm-python/src/openllm/_assign.py         | 201 ++++++
 openllm-python/src/openllm/_embeddings.py     |   6 +-
 openllm-python/src/openllm/_llm.py            | 665 +++++-------------
 openllm-python/src/openllm/_service.py        |   4 +-
 openllm-python/src/openllm/bundle/_package.py |  42 +-
 .../src/openllm/bundle/oci/__init__.py        |   2 +-
 openllm-python/src/openllm/cli/_factory.py    | 166 ++---
 openllm-python/src/openllm/cli/_sdk.py        | 179 ++---
 openllm-python/src/openllm/cli/entrypoint.py  | 101 +--
 .../models/chatglm/modeling_chatglm.py        |   4 +-
 .../models/flan_t5/modeling_flan_t5.py        |   4 +-
 .../openllm/models/llama/modeling_llama.py    |   6 +-
 .../src/openllm/models/mpt/modeling_mpt.py    |   8 +-
 .../models/stablelm/modeling_stablelm.py      |   4 -
 .../src/openllm/serialisation/__init__.py     |  44 +-
 .../src/openllm/serialisation/ggml.py         |  31 +-
 .../serialisation/transformers/__init__.py    | 100 +--
 .../serialisation/transformers/_helpers.py    |   6 +-
 .../serialisation/transformers/weights.py     |   6 +-
 openllm-python/src/openllm/testing.py         |   7 +-
 openllm-python/src/openllm/utils/__init__.py  |  17 +-
 .../tests/_strategies/_configuration.py       |   4 +-
 openllm-python/tests/configuration_test.py    |   9 +-
 openllm-python/tests/conftest.py              |  28 +-
 openllm-python/tests/package_test.py          |  12 +-
 pyproject.toml                                |   8 +-
 tools/update-config-stubs.py                  |  54 +-
 tools/update-dummy.py                         |  48 +-
 50 files changed, 881 insertions(+), 1232 deletions(-)
 create mode 100644 changelog.d/283.breaking.md
 create mode 100644 openllm-client/src/openllm_client/py.typed
 create mode 100644 openllm-core/src/openllm_core/py.typed
 create mode 100644 openllm-python/src/openllm/_assign.py

diff --git a/.github/SECURITY.md b/.github/SECURITY.md
index 9585a107..a0baf626 100644
--- a/.github/SECURITY.md
+++ b/.github/SECURITY.md
@@ -8,8 +8,7 @@ are backward compatible. We are more lenient with patch as the development can
 move quickly.
 
 If you are just using public API, then feel free to always upgrade. Whenever
-there is a breaking policies, it will become a `DeprecationWarning` with a
-period of 12 months before becoming broken.
+there is a breaking change, it will be announced and the old behaviour will then break.
 
 > [!WARNING]
 > Everything package under `openllm` that has an underscore prefixes
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 88e6804f..15d7e2f8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -230,7 +230,7 @@ No significant changes.
 
   ```bash
   docker run --rm --gpus all -it -v /home/ubuntu/.local/share/bentoml:/tmp/bentoml -e BENTOML_HOME=/tmp/bentoml \
-              -e OPENLLM_USE_LOCAL_LATEST=True -e OPENLLM_LLAMA_FRAMEWORK=vllm ghcr.io/bentoml/openllm:2b5e96f90ad314f54e07b5b31e386e7d688d9bb2 start llama --model-id meta-llama/Llama-2-7b-chat-hf --workers-per-resource conserved --debug`
+              -e OPENLLM_USE_LOCAL_LATEST=True -e OPENLLM_BACKEND=vllm ghcr.io/bentoml/openllm:2b5e96f90ad314f54e07b5b31e386e7d688d9bb2 start llama --model-id meta-llama/Llama-2-7b-chat-hf --workers-per-resource conserved --debug`
   ```
 
   In conjunction with this, OpenLLM now also have a set of small CLI utilities via ``openllm ext`` for ease-of-use
@@ -721,9 +721,6 @@ No significant changes.
   `openllm start` now support `--quantize int8` and `--quantize int4` `GPTQ`
   quantization support is on the roadmap and currently being worked on.
 
-  `openllm start` now also support `--bettertransformer` to use
-  `BetterTransformer` for serving.
-
   Refactored `openllm.LLMConfig` to be able to use with `__getitem__`:
   `openllm.DollyV2Config()['requirements']`.
 
@@ -732,8 +729,6 @@ No significant changes.
 
   Added `towncrier` workflow to easily generate changelog entries
 
-  Added `use_pipeline`, `bettertransformer` flag into ModelSettings
-
   `LLMConfig` now supported `__dataclass_transform__` protocol to help with
   type-checking
 
diff --git a/README.md b/README.md
index 44dd5cc5..8ba82b92 100644
--- a/README.md
+++ b/README.md
@@ -407,17 +407,19 @@ pip install "openllm[baichuan]"
 ### Runtime Implementations (Experimental)
 
 Different LLMs may have multiple runtime implementations. For instance, they
-might use Pytorch (`pt`), Tensorflow (`tf`), or Flax (`flax`).
+might use Pytorch (`pt`), Tensorflow (`tf`), Flax (`flax`) or vLLM (`vllm`).
 
 If you wish to specify a particular runtime for a model, you can do so by
-setting the `OPENLLM_{MODEL_NAME}_FRAMEWORK={runtime}` environment variable
+setting the `OPENLLM_BACKEND={runtime}` environment variable
 before running `openllm start`.
 
 For example, if you want to use the Tensorflow (`tf`) implementation for the
 `flan-t5` model, you can use the following command:
 
 ```bash
-OPENLLM_FLAN_T5_FRAMEWORK=tf openllm start flan-t5
+OPENLLM_BACKEND=tf openllm start flan-t5
+# or pass the backend with the CLI flag instead:
+openllm start flan-t5 --backend tf
 ```
 
 > [!NOTE]
@@ -425,6 +427,9 @@ OPENLLM_FLAN_T5_FRAMEWORK=tf openllm start flan-t5
 > [Jax's installation](https://github.com/google/jax#pip-installation-gpu-cuda-installed-via-pip-easier)
 > to make sure that you have Jax support for the corresponding CUDA version.
 
+> [!IMPORTANT]
+> The vLLM backend requires at least a GPU with the Ampere architecture or newer and CUDA 11.8.
+
 ### Quantisation
 
 OpenLLM supports quantisation with
diff --git a/changelog.d/283.breaking.md b/changelog.d/283.breaking.md
new file mode 100644
index 00000000..80321200
--- /dev/null
+++ b/changelog.d/283.breaking.md
@@ -0,0 +1,20 @@
+All environment variables are now simplified and no longer need the model-specific prefix.
+
+For example: `OPENLLM_LLAMA_GENERATION_MAX_NEW_TOKENS` now becomes `OPENLLM_GENERATION_MAX_NEW_TOKENS`.
+
+Some miscellaneous environment variables have been unified. To switch between backends, use `--backend` for both `start` and `build`:
+
+```bash
+openllm start llama --backend vllm
+```
+
+or set the environment variable `OPENLLM_BACKEND`:
+
+```bash
+OPENLLM_BACKEND=vllm openllm start llama
+```
+
+`openllm.Runner` now defaults to downloading the model the first time it is used if the model is not available, and uses the copy cached in the model store afterwards.
+
+Model serialisation has been updated to a new API version with clearer naming. Users are kindly asked to run `openllm prune -y --include-bentos` and update to
+this version of openllm.
diff --git a/hatch.toml b/hatch.toml
index 32ffac03..0a294b57 100644
--- a/hatch.toml
+++ b/hatch.toml
@@ -31,7 +31,6 @@ check-stubs = [
 inplace-changelog = "towncrier build --version main --keep"
 quality = [
     "./tools/dependencies.py",
-    "./tools/update-readme.py",
     "- ./tools/update-brew-tap.py",
     "bash ./tools/sync-readme.sh",
     "check-stubs",
diff --git a/openllm-client/src/openllm_client/_base.py b/openllm-client/src/openllm_client/_base.py
index fed7f87d..f2216958 100644
--- a/openllm-client/src/openllm_client/_base.py
+++ b/openllm-client/src/openllm_client/_base.py
@@ -28,7 +28,7 @@ if t.TYPE_CHECKING:
   import transformers
 
   from openllm_core._typing_compat import DictStrAny
-  from openllm_core._typing_compat import LiteralRuntime
+  from openllm_core._typing_compat import LiteralBackend
 
 logger = logging.getLogger(__name__)
 
@@ -98,7 +98,7 @@ class _ClientAttr:
       raise RuntimeError(
           "transformers is required to use HF agent. Install with 'pip install \"openllm-client[agents]\"'.")
     if not self.supports_hf_agent:
-      raise RuntimeError(f'{self.model_name} ({self.framework}) does not support running HF agent.')
+      raise RuntimeError(f'{self.model_name} ({self.backend}) does not support running HF agent.')
     if not is_transformers_supports_agent():
       raise RuntimeError(
           "Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'"
@@ -125,9 +125,9 @@ class _ClientAttr:
       raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None
 
   @property
-  def framework(self) -> LiteralRuntime:
+  def backend(self) -> LiteralBackend:
     try:
-      return self._metadata['framework']
+      return self._metadata['backend']
     except KeyError:
       raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None
 
diff --git a/openllm-client/src/openllm_client/py.typed b/openllm-client/src/openllm_client/py.typed
new file mode 100644
index 00000000..e69de29b
diff --git a/openllm-contrib/clojure/src/main/openllm/components/side_bar/model_selection/db.cljs b/openllm-contrib/clojure/src/main/openllm/components/side_bar/model_selection/db.cljs
index 6502f0e2..e6646dd0 100644
--- a/openllm-contrib/clojure/src/main/openllm/components/side_bar/model_selection/db.cljs
+++ b/openllm-contrib/clojure/src/main/openllm/components/side_bar/model_selection/db.cljs
@@ -31,10 +31,10 @@
 (s/def ::model_id (s/coll-of string? :kind vector?))                   ;; model_id is a vector of all models for a given model_type
 (s/def ::url string?)                                                  ;; url to the model's page
 (s/def ::requires_gpu boolean?)                                        ;; whether the model requires a gpu
-(s/def ::runtime_impl ::vec-of-runtimes?)                              ;; supported runtimes
+(s/def ::backend ::vec-of-runtimes?)                                   ;; supported runtimes
 (s/def ::installation string?)                                         ;; installation instructions (pip command)
 (s/def ::model-spec (s/keys :req-un [::model_id ::url ::requires_gpu   ;; the spec for a single model (aggregates all the above)
-                                     ::runtime_impl ::installation]))
+                                     ::backend ::installation]))
 (s/def ::all-models #(or loading-text                                  ;; -- this is the case when the file with the model data has not been loaded yet by the ::set-model-data effect
                          (s/map-of keyword? ::model-spec)))            ;; map of all models
 
diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py
index da25116a..043f9356 100644
--- a/openllm-core/src/openllm_core/_configuration.py
+++ b/openllm-core/src/openllm_core/_configuration.py
@@ -66,14 +66,13 @@ from ._typing_compat import AnyCallable
 from ._typing_compat import At
 from ._typing_compat import DictStrAny
 from ._typing_compat import ListStr
-from ._typing_compat import LiteralRuntime
+from ._typing_compat import LiteralBackend
 from ._typing_compat import LiteralString
 from ._typing_compat import NotRequired
 from ._typing_compat import Required
 from ._typing_compat import Self
 from ._typing_compat import overload
 from .exceptions import ForbiddenAttributeError
-from .utils import ENV_VARS_TRUE_VALUES
 from .utils import MYPY
 from .utils import LazyLoader
 from .utils import ReprMixin
@@ -312,7 +311,7 @@ class GenerationConfig(ReprMixin):
   eta_cutoff: float = dantic.Field(
       0.0,
       description=
-      '''Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. '''
+      'Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. '
   )
   diversity_penalty: float = dantic.Field(
       0.0,
@@ -387,17 +386,17 @@ class GenerationConfig(ReprMixin):
   output_attentions: bool = dantic.Field(
       False,
       description=
-      '''Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.'''
+      'Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.'
   )
   output_hidden_states: bool = dantic.Field(
       False,
       description=
-      '''Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.'''
+      'Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.'
   )
   output_scores: bool = dantic.Field(
       False,
-      description=
-      '''Whether or not to return the prediction scores. See `scores` under returned tensors for more details.''')
+      description='Whether or not to return the prediction scores. See `scores` under returned tensors for more details.'
+  )
   pad_token_id: int = dantic.Field(description='The id of the *padding* token.')
   bos_token_id: int = dantic.Field(description='The id of the *beginning-of-sequence* token.')
   eos_token_id: t.Union[int, t.List[int]] = dantic.Field(
@@ -565,7 +564,7 @@ class ModelSettings(t.TypedDict, total=False):
   architecture: Required[str]
 
   # default OpenLLM runtime imlementation
-  default_implementation: NotRequired[t.Dict[LiteralResourceSpec, LiteralRuntime]]
+  default_backend: NotRequired[t.Dict[LiteralResourceSpec, LiteralBackend]]
 
   # meta
   url: str
@@ -575,9 +574,7 @@ class ModelSettings(t.TypedDict, total=False):
   requirements: t.Optional[ListStr]
 
   # llm implementation specifics
-  bettertransformer: bool
   model_type: t.Literal['causal_lm', 'seq2seq_lm']
-  runtime: t.Literal['transformers', 'ggml']
 
   # naming convention, only name_type is needed to infer from the class
   # as the three below it can be determined automatically
@@ -597,7 +594,7 @@ class ModelSettings(t.TypedDict, total=False):
 
 _transformed_type: DictStrAny = {
     'fine_tune_strategies': t.Dict[AdapterType, FineTuneConfig],
-    'default_implementation': t.Dict[LiteralResourceSpec, LiteralRuntime]
+    'default_backend': t.Dict[LiteralResourceSpec, LiteralBackend]
 }
 
 @attr.define(frozen=False,
@@ -628,7 +625,7 @@ class _ModelSettingsAttr:
         ModelSettings(default_id='__default__',
                       model_ids=['__default__'],
                       architecture='PreTrainedModel',
-                      default_implementation={
+                      default_backend={
                           'cpu': 'pt',
                           'nvidia.com/gpu': 'pt'
                       },
@@ -641,8 +638,7 @@ class _ModelSettingsAttr:
                       tokenizer_class=None,
                       timeout=int(36e6),
                       service_name='',
-                      workers_per_resource=1.,
-                      runtime='transformers')))
+                      workers_per_resource=1.)))
 
   # NOTE: The below are dynamically generated by the field_transformer
   if t.TYPE_CHECKING:
@@ -650,15 +646,13 @@ class _ModelSettingsAttr:
     default_id: str
     model_ids: ListStr
     architecture: str
-    default_implementation: t.Dict[LiteralResourceSpec, LiteralRuntime]
+    default_backend: t.Dict[LiteralResourceSpec, LiteralBackend]
     url: str
     requires_gpu: bool
     trust_remote_code: bool
     service_name: str
     requirements: t.Optional[ListStr]
-    bettertransformer: bool
     model_type: t.Literal['causal_lm', 'seq2seq_lm']
-    runtime: t.Literal['transformers', 'ggml']
     name_type: t.Optional[t.Literal['dasherize', 'lowercase']]
     model_name: str
     start_name: str
@@ -670,15 +664,14 @@ class _ModelSettingsAttr:
     # update-config-stubs.py: attrs stop
 
 # a heuristic cascading implementation resolver based on available resources
-def get_default_implementation(
-    default_implementation_mapping: dict[LiteralResourceSpec, LiteralRuntime]) -> LiteralRuntime:
+def get_default_backend(backend_mapping: dict[LiteralResourceSpec, LiteralBackend]) -> LiteralBackend:
   available_spec = available_resource_spec()
-  if resource_spec('tpu') in available_spec: return default_implementation_mapping.get(resource_spec('tpu'), 'pt')
-  elif resource_spec('amd') in available_spec: return default_implementation_mapping.get(resource_spec('amd'), 'pt')
+  if resource_spec('tpu') in available_spec: return backend_mapping.get(resource_spec('tpu'), 'pt')
+  elif resource_spec('amd') in available_spec: return backend_mapping.get(resource_spec('amd'), 'pt')
   elif resource_spec('nvidia') in available_spec:
-    return default_implementation_mapping.get(resource_spec('nvidia'), 'pt')
+    return backend_mapping.get(resource_spec('nvidia'), 'pt')
   else:
-    return default_implementation_mapping.get(resource_spec('cpu'), 'pt')
+    return backend_mapping.get(resource_spec('cpu'), 'pt')
 
 def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ModelSettingsAttr:
   if 'generation_class' in cl_.__config__:
@@ -704,23 +697,17 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _
 
   model_name = _final_value_dct['model_name'] if 'model_name' in _final_value_dct else _settings_attr.model_name
   # if the default implementation dependencies doesn't exist, then always fallback to 'pt'
-  default_implementation = _settings_attr.default_implementation
-  for rs, runtime in default_implementation.items():
+  default_backend = _settings_attr.default_backend
+  for rs, runtime in default_backend.items():
     library_stub = 'torch' if runtime == 'pt' else runtime
-    if not BACKENDS_MAPPING[library_stub][0](): default_implementation[rs] = 'pt'
-  _final_value_dct['default_implementation'] = default_implementation
+    if not BACKENDS_MAPPING[library_stub][0](): default_backend[rs] = 'pt'
+  _final_value_dct['default_backend'] = default_backend
 
   env = openllm_core.utils.EnvVarMixin(model_name,
-                                       get_default_implementation(default_implementation),
-                                       model_id=_settings_attr.default_id,
-                                       bettertransformer=_settings_attr.bettertransformer)
+                                       backend=get_default_backend(default_backend),
+                                       model_id=_settings_attr.default_id)
   _final_value_dct['env'] = env
 
-  # bettertransformer support
-  if _settings_attr['bettertransformer'] is None:
-    _final_value_dct['bettertransformer'] = str(env['bettertransformer_value']).upper() in ENV_VARS_TRUE_VALUES
-  # if requires_gpu is True, then disable BetterTransformer for quantization.
-  if _settings_attr['requires_gpu']: _final_value_dct['bettertransformer'] = False
   _final_value_dct['service_name'] = f'generated_{model_name}_service.py'
 
   # NOTE: The key for fine-tune strategies is 'fine_tune_strategies'
@@ -775,16 +762,16 @@ class _ConfigAttr:
 
   @staticmethod
   def Field(default: t.Any = None, **attrs: t.Any) -> t.Any:
+    '''Field is an alias to the internal dantic utilities to easily create
+      attrs.fields with pydantic-compatible interface. For example:
+
+      ```python
+      class MyModelConfig(openllm.LLMConfig):
+          field1 = openllm.LLMConfig.Field(...)
+      ```
+    '''
     return dantic.Field(default, **attrs)
 
-  '''Field is a alias to the internal dantic utilities to easily create
-    attrs.fields with pydantic-compatible interface. For example:
-
-    ```python
-    class MyModelConfig(openllm.LLMConfig):
-        field1 = openllm.LLMConfig.Field(...)
-    ```
-    '''
   # NOTE: The following is handled via __init_subclass__, and is only used for TYPE_CHECKING
   if t.TYPE_CHECKING:
     # NOTE: public attributes to override
@@ -873,11 +860,8 @@ class _ConfigAttr:
             ```bash
             openllm start gpt-neox --model-id stabilityai/stablelm-tuned-alpha-3b
             ```'''
-    __openllm_default_implementation__: t.Dict[LiteralResourceSpec, LiteralRuntime] = Field(None)
-    '''The default runtime to run this LLM. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`.
-
-    It is a dictionary of key as the accelerator spec in k4s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM Runtime ('flax', 'tf', 'pt', 'vllm')
-    '''
+    __openllm_default_backend__: t.Dict[LiteralResourceSpec, LiteralBackend] = Field(None)
+    '''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')'''
     __openllm_url__: str = Field(None)
     '''The resolved url for this LLMConfig.'''
     __openllm_requires_gpu__: bool = Field(None)
@@ -885,18 +869,11 @@ class _ConfigAttr:
     __openllm_trust_remote_code__: bool = Field(None)
     '''Whether to always trust remote code'''
     __openllm_service_name__: str = Field(None)
-    """Generated service name for this LLMConfig. By default, it is 'generated_{model_name}_service.py'"""
+    '''Generated service name for this LLMConfig. By default, it is "generated_{model_name}_service.py"'''
     __openllm_requirements__: t.Optional[ListStr] = Field(None)
-    '''The default PyPI requirements needed to run this given LLM. By default, we will depend on
-        bentoml, torch, transformers.'''
-    __openllm_bettertransformer__: bool = Field(None)
-    '''Whether to use BetterTransformer for this given LLM. This depends per model architecture. By default, we will use BetterTransformer for T5 and StableLM models, and set to False for every other models.'''
+    '''The default PyPI requirements needed to run this given LLM. By default, we will depend on bentoml, torch, transformers.'''
     __openllm_model_type__: t.Literal['causal_lm', 'seq2seq_lm'] = Field(None)
-    '''The model type for this given LLM. By default, it should be causal language modeling.
-        Currently supported 'causal_lm' or 'seq2seq_lm'
-        '''
-    __openllm_runtime__: t.Literal['transformers', 'ggml'] = Field(None)
-    '''The runtime to use for this model. Possible values are `transformers` or `ggml`. See Llama for more information.'''
+    '''The model type for this given LLM. By default, it should be causal language modeling. Currently supported "causal_lm" or "seq2seq_lm"'''
     __openllm_name_type__: t.Optional[t.Literal['dasherize', 'lowercase']] = Field(None)
     '''The default name typed for this model. "dasherize" will convert the name to lowercase and
         replace spaces with dashes. "lowercase" will convert the name to lowercase. If this is not set, then both
@@ -1212,8 +1189,8 @@ class LLMConfig(_ConfigAttr):
       annotated_names.add(attr_name)
       val = cd.get(attr_name, attr.NOTHING)
       if not isinstance(val, _CountingAttr):
-        if val is attr.NOTHING: val = cls.Field(env=field_env_key(cls.__openllm_model_name__, attr_name))
-        else: val = cls.Field(default=val, env=field_env_key(cls.__openllm_model_name__, attr_name))
+        if val is attr.NOTHING: val = cls.Field(env=field_env_key(attr_name))
+        else: val = cls.Field(default=val, env=field_env_key(attr_name))
       these[attr_name] = val
     unannotated = ca_names - annotated_names
     if len(unannotated) > 0:
@@ -1293,7 +1270,7 @@ class LLMConfig(_ConfigAttr):
   @overload
   def __getitem__(self, item: t.Literal['architecture']) -> str: ...
   @overload
-  def __getitem__(self, item: t.Literal['default_implementation']) -> t.Dict[LiteralResourceSpec, LiteralRuntime]: ...
+  def __getitem__(self, item: t.Literal['default_backend']) -> t.Dict[LiteralResourceSpec, LiteralBackend]: ...
   @overload
   def __getitem__(self, item: t.Literal['url']) -> str: ...
   @overload
@@ -1305,12 +1282,8 @@ class LLMConfig(_ConfigAttr):
   @overload
   def __getitem__(self, item: t.Literal['requirements']) -> t.Optional[ListStr]: ...
   @overload
-  def __getitem__(self, item: t.Literal['bettertransformer']) -> bool: ...
-  @overload
   def __getitem__(self, item: t.Literal['model_type']) -> t.Literal['causal_lm', 'seq2seq_lm']: ...
   @overload
-  def __getitem__(self, item: t.Literal['runtime']) -> t.Literal['transformers', 'ggml']: ...
-  @overload
   def __getitem__(self, item: t.Literal['name_type']) -> t.Optional[t.Literal['dasherize', 'lowercase']]: ...
   @overload
   def __getitem__(self, item: t.Literal['model_name']) -> str: ...
@@ -1663,9 +1636,9 @@ class LLMConfig(_ConfigAttr):
     return _PEFT_TASK_TYPE_TARGET_MAPPING[cls.__openllm_model_type__]
 
   @classmethod
-  def default_implementation(cls) -> LiteralRuntime:
-    return first_not_none(cls.__openllm_env__['framework_value'],
-                          default=get_default_implementation(cls.__openllm_default_implementation__))
+  def default_backend(cls) -> LiteralBackend:
+    return first_not_none(cls.__openllm_env__['backend_value'],
+                          default=get_default_backend(cls.__openllm_default_backend__))
 
   def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
     '''This handler will sanitize all attrs and setup prompt text.
diff --git a/openllm-core/src/openllm_core/_schema.py b/openllm-core/src/openllm_core/_schema.py
index 04748034..9e28451b 100644
--- a/openllm-core/src/openllm_core/_schema.py
+++ b/openllm-core/src/openllm_core/_schema.py
@@ -1,4 +1,4 @@
-'''Schema definition for OpenLLM. This can be use for client interaction.'''
+'''Schema definition for OpenLLM. This schema is used throughout openllm core components library.'''
 from __future__ import annotations
 import functools
 import typing as t
@@ -77,7 +77,7 @@ class MetadataOutput:
   model_id: str
   timeout: int
   model_name: str
-  framework: str
+  backend: str
   configuration: str
   supports_embeddings: bool
   supports_hf_agent: bool
diff --git a/openllm-core/src/openllm_core/_strategies.py b/openllm-core/src/openllm_core/_strategies.py
index a6e9b5a0..a1b976c3 100644
--- a/openllm-core/src/openllm_core/_strategies.py
+++ b/openllm-core/src/openllm_core/_strategies.py
@@ -94,7 +94,7 @@ def _from_system(cls: type[DynResource]) -> list[str]:
   if visible_devices is None:
     if cls.resource_id == 'amd.com/gpu':
       if not psutil.LINUX:
-        if DEBUG: warnings.warn('AMD GPUs is currently only supported on Linux.', stacklevel=_STACK_LEVEL)
+        if DEBUG: logger.debug('AMD GPUs is currently only supported on Linux.')
         return []
       # ROCm does not currently have the rocm_smi wheel.
       # So we need to use the ctypes bindings directly.
diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py
index 85fec6bc..d28e53fd 100644
--- a/openllm-core/src/openllm_core/_typing_compat.py
+++ b/openllm-core/src/openllm_core/_typing_compat.py
@@ -21,6 +21,8 @@ if t.TYPE_CHECKING:
   from bentoml._internal.runner.runnable import RunnableMethod
   from bentoml._internal.runner.runner import RunnerMethod
   from bentoml._internal.runner.strategy import Strategy
+  from openllm._llm import LLM
+  from openllm_core._schema import EmbeddingsOutput
 
   from .utils.lazy import VersionInfo
 
@@ -35,6 +37,9 @@ T = t.TypeVar(
     't.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]'
 )
 
+def get_literal_args(typ: t.Any) -> tuple[str, ...]:
+  return getattr(typ, '__args__')
+
 AnyCallable = t.Callable[..., t.Any]
 DictStrAny = t.Dict[str, t.Any]
 ListAny = t.List[t.Any]
@@ -42,7 +47,7 @@ ListStr = t.List[str]
 TupleAny = t.Tuple[t.Any, ...]
 At = t.TypeVar('At', bound=attr.AttrsInstance)
 
-LiteralRuntime = t.Literal['pt', 'tf', 'flax', 'vllm']
+LiteralBackend = t.Literal['pt', 'tf', 'flax', 'vllm', 'ggml', 'mlc']
 AdapterType = t.Literal['lora', 'adalora', 'adaption_prompt', 'prefix_tuning', 'p_tuning', 'prompt_tuning', 'ia3']
 
 # TODO: support quay
@@ -78,10 +83,6 @@ class PeftAdapterOutput(t.TypedDict):
   result: t.Dict[str, peft.PeftConfig]
   error_msg: str
 
-class LLMEmbeddings(t.TypedDict):
-  embeddings: t.List[t.List[float]]
-  num_tokens: int
-
 class AdaptersTuple(TupleAny):
   adapter_id: str
   name: t.Optional[str]
@@ -98,7 +99,7 @@ class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
   SUPPORTED_RESOURCES = ('amd.com/gpu', 'nvidia.com/gpu', 'cpu')
   SUPPORTS_CPU_MULTI_THREADING = True
   __call__: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
-  embeddings: RunnableMethod[LLMRunnable[M, T], [list[str]], LLMEmbeddings]
+  embeddings: RunnableMethod[LLMRunnable[M, T], [list[str]], EmbeddingsOutput]
   generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
   generate_one: RunnableMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]]
   generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]]
@@ -108,15 +109,14 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
   __module__: str
   llm_type: str
   llm_tag: bentoml.Tag
-  llm_framework: LiteralRuntime
   identifying_params: dict[str, t.Any]
   llm: openllm.LLM[M, T]
   config: openllm.LLMConfig
-  implementation: LiteralRuntime
+  backend: LiteralBackend
   supports_embeddings: bool
   supports_hf_agent: bool
   has_adapters: bool
-  embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[LLMEmbeddings]]
+  embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[EmbeddingsOutput]]
   generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
   generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]]
   generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]]
@@ -139,7 +139,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
     ...
 
   @abc.abstractmethod
-  def embed(self, prompt: str | list[str]) -> LLMEmbeddings:
+  def embed(self, prompt: str | list[str]) -> EmbeddingsOutput:
     ...
 
   def run(self, prompt: str, **attrs: t.Any) -> t.Any:
@@ -161,3 +161,25 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
   @abc.abstractmethod
   def __repr_keys__(self) -> set[str]:
     ...
+
+class load_model_protocol(t.Generic[M, T], t.Protocol):
+  '''Callable shape of a model loader: receives the LLM plus extra positional/keyword arguments, returns ``M``.'''
+  def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
+    ...
+
+class load_tokenizer_protocol(t.Generic[M, T], t.Protocol):
+  '''Callable shape of a tokenizer loader: receives the LLM plus keyword arguments, returns ``T``.'''
+  def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T:
+    ...
+
+_R = t.TypeVar('_R', covariant=True)
+
+class import_model_protocol(t.Generic[_R, M, T], t.Protocol):
+  '''Callable shape of a model importer: like a loader, with a keyword-only ``trust_remote_code``; returns ``_R``.'''
+  def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R:
+    ...
+
+class llm_post_init_protocol(t.Generic[M, T], t.Protocol):
+  '''Callable shape of a post-init hook: receives the LLM and returns nothing (any result is discarded).'''
+  def __call__(self, llm: LLM[M, T]) -> None:
+    ...
diff --git a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py
index 6e822f5a..6ab24a99 100644
--- a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py
+++ b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py
@@ -48,14 +48,14 @@ def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str)
   treated specially and converted to a single, new token.  This retrieves the token ID each of these keys map to.
 
   Args:
-  tokenizer: the tokenizer
-  key: the key to convert to a single token
+    tokenizer: the tokenizer
+    key: the key to convert to a single token
 
   Raises:
-  RuntimeError: if more than one ID was generated
+    RuntimeError: if more than one ID was generated
 
   Returns:
-  int: the token ID for the given key.
+    int: the token ID for the given key.
   '''
   token_ids = tokenizer.encode(key)
   if len(token_ids) > 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
diff --git a/openllm-core/src/openllm_core/config/configuration_flan_t5.py b/openllm-core/src/openllm_core/config/configuration_flan_t5.py
index e0a73e91..1c5eddc9 100644
--- a/openllm-core/src/openllm_core/config/configuration_flan_t5.py
+++ b/openllm-core/src/openllm_core/config/configuration_flan_t5.py
@@ -17,14 +17,14 @@ Run a LLMServer for FLAN-T5 model.
 By default, this model will use the PyTorch model for inference. However, this model supports both Flax and Tensorflow.
 
 \b
-- To use Flax, set the environment variable ``OPENLLM_FLAN_T5_FRAMEWORK="flax"``
+- To use Flax, set the environment variable ``OPENLLM_BACKEND="flax"``
 
 \b
-- To use Tensorflow, set the environment variable ``OPENLLM_FLAN_T5_FRAMEWORK="tf"``
+- To use Tensorflow, set the environment variable ``OPENLLM_BACKEND="tf"``
 
 \b
 FLAN-T5 Runner will use google/flan-t5-large as the default model. To change to any other FLAN-T5
-saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_FLAN_T5_MODEL_ID='google/flan-t5-xxl'``
+saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_MODEL_ID='google/flan-t5-xxl'``
 or provide `--model-id` flag when running ``openllm start flan-t5``:
 
 \b
diff --git a/openllm-core/src/openllm_core/config/configuration_llama.py b/openllm-core/src/openllm_core/config/configuration_llama.py
index b6ce000d..b96a9785 100644
--- a/openllm-core/src/openllm_core/config/configuration_llama.py
+++ b/openllm-core/src/openllm_core/config/configuration_llama.py
@@ -19,11 +19,14 @@ By default, this model will use [vLLM](https://github.com/vllm-project/vllm) for
 This model will also supports PyTorch.
 
 \b
-- To use PyTorch, set the environment variable ``OPENLLM_LLAMA_FRAMEWORK="pt"``
+- To use PyTorch, set the environment variable ``OPENLLM_BACKEND="pt"``
+
+\b
+- To use vLLM, set the environment variable ``OPENLLM_BACKEND="vllm"``
 
 \b
 Llama Runner will use decapoda-research/llama-7b-hf as the default model. To change to any other Llama
-saved pretrained, or a fine-tune Llama, provide ``OPENLLM_LLAMA_MODEL_ID='openlm-research/open_llama_7b_v2'``
+saved pretrained, or a fine-tune Llama, provide ``OPENLLM_MODEL_ID='openlm-research/open_llama_7b_v2'``
 or provide `--model-id` flag when running ``openllm start llama``:
 
 \b
@@ -70,7 +73,7 @@ class LlamaConfig(openllm_core.LLMConfig):
           'lowercase',
       'url':
           'https://github.com/facebookresearch/llama',
-      'default_implementation': {
+      'default_backend': {
           'cpu': 'pt',
           'nvidia.com/gpu': 'pt'
       },
diff --git a/openllm-core/src/openllm_core/config/configuration_opt.py b/openllm-core/src/openllm_core/config/configuration_opt.py
index 2a5c323e..2ddf0cdc 100644
--- a/openllm-core/src/openllm_core/config/configuration_opt.py
+++ b/openllm-core/src/openllm_core/config/configuration_opt.py
@@ -18,14 +18,14 @@ Run a LLMServer for OPT model.
 By default, this model will use the PyTorch model for inference. However, this model supports both Flax and Tensorflow.
 
 \b
-- To use Flax, set the environment variable ``OPENLLM_OPT_FRAMEWORK="flax"``
+- To use Flax, set the environment variable ``OPENLLM_BACKEND="flax"``
 
 \b
-- To use Tensorflow, set the environment variable ``OPENLLM_OPT_FRAMEWORK="tf"``
+- To use Tensorflow, set the environment variable ``OPENLLM_BACKEND="tf"``
 
 \b
 OPT Runner will use facebook/opt-2.7b as the default model. To change to any other OPT
-saved pretrained, or a fine-tune OPT, provide ``OPENLLM_OPT_MODEL_ID='facebook/opt-6.7b'``
+saved pretrained, or a fine-tune OPT, provide ``OPENLLM_MODEL_ID='facebook/opt-6.7b'``
 or provide `--model-id` flag when running ``openllm start opt``:
 
 \b
diff --git a/openllm-core/src/openllm_core/py.typed b/openllm-core/src/openllm_core/py.typed
new file mode 100644
index 00000000..e69de29b
diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py
index 521170a8..94948144 100644
--- a/openllm-core/src/openllm_core/utils/__init__.py
+++ b/openllm-core/src/openllm_core/utils/__init__.py
@@ -47,12 +47,12 @@ logger = logging.getLogger(__name__)
 try:
   from typing import GenericAlias as _TypingGenericAlias  # type: ignore
 except ImportError:
-  _TypingGenericAlias = (
-  )  # type: ignore  # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
+  # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
+  _TypingGenericAlias = ()  # type: ignore
 if sys.version_info < (3, 10): _WithArgsTypes = (_TypingGenericAlias,)
 else:
-  _WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType
-                          )  # type: ignore #  _GenericAlias is the actual GenericAlias implementation
+  #  _GenericAlias is the actual GenericAlias implementation
+  _WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType)  # type: ignore
 
 DEV_DEBUG_VAR = 'OPENLLMDEVDEBUG'
 
@@ -96,6 +96,9 @@ def generate_hash_from_file(f: str, algorithm: t.Literal['md5', 'sha1'] = 'sha1'
 def device_count() -> int:
   return len(available_devices())
 
+def check_bool_env(env: str, default: bool = True) -> bool:
+  return os.environ.get(env, str(default)).upper() in ENV_VARS_TRUE_VALUES  # case-insensitive; unset -> str(default)
+
 # equivocal setattr to save one lookup per assignment
 _object_setattr = object.__setattr__
 
@@ -104,14 +107,16 @@ def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None:
   _setattr = functools.partial(setattr, obj) if isinstance(obj, type) else _object_setattr.__get__(obj)
   if not hasattr(obj, name): _setattr(name, value)
 
-def field_env_key(model_name: str, key: str, suffix: str | None = None) -> str:
-  return '_'.join(filter(None, map(str.upper, ['OPENLLM', model_name, suffix.strip('_') if suffix else '', key])))
+def field_env_key(key: str, suffix: str | None = None) -> str:
+  return '_'.join(filter(None, map(str.upper, ['OPENLLM', suffix.strip('_') if suffix else '', key])))
 
 # Special debug flag controled via OPENLLMDEVDEBUG
-DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get(DEV_DEBUG_VAR)))
+_DEBUG_LEVEL: int = int(os.environ[DEV_DEBUG_VAR]) if os.environ.get(DEV_DEBUG_VAR, '').isdecimal() else 0  # numeric verbosity, 0 if unset/non-numeric
+DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and
+                                     (check_bool_env(DEV_DEBUG_VAR, default=False) or _DEBUG_LEVEL > 0))  # a truthy flag or any positive level
+SHOW_CODEGEN: bool = DEBUG and _DEBUG_LEVEL > 3  # Whether to show the codegen for debug purposes (level above 3)
 # MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins
 MYPY = False
-SHOW_CODEGEN: bool = DEBUG and int(os.environ.get('OPENLLMDEVDEBUG', str(0))) > 3
 
 def get_debug_mode() -> bool:
   return DEBUG or _get_debug_mode()
@@ -193,6 +198,7 @@ def configure_logging() -> None:
     _LOGGING_CONFIG['loggers']['bentoml']['level'] = logging.ERROR
     _LOGGING_CONFIG['root']['level'] = logging.ERROR
   elif get_debug_mode() or DEBUG:
+    _LOGGING_CONFIG['handlers']['defaulthandler']['level'] = logging.DEBUG
     _LOGGING_CONFIG['loggers']['openllm']['level'] = logging.DEBUG
     _LOGGING_CONFIG['loggers']['bentoml']['level'] = logging.DEBUG
     _LOGGING_CONFIG['root']['level'] = logging.DEBUG
@@ -330,8 +336,8 @@ _import_structure: dict[str, list[str]] = {
     'analytics': [],
     'codegen': [],
     'dantic': [],
+    'lazy': [],
     'representation': ['ReprMixin'],
-    'lazy': ['LazyModule'],
     'import_utils': [
         'OPTIONAL_DEPENDENCIES', 'DummyMetaclass', 'EnvVarMixin', 'require_backends', 'is_cpm_kernels_available',
         'is_einops_available', 'is_flax_available', 'is_tf_available', 'is_vllm_available', 'is_torch_available',
diff --git a/openllm-core/src/openllm_core/utils/analytics.py b/openllm-core/src/openllm_core/utils/analytics.py
index 53a10f65..3e680ccf 100644
--- a/openllm-core/src/openllm_core/utils/analytics.py
+++ b/openllm-core/src/openllm_core/utils/analytics.py
@@ -24,11 +24,10 @@ logger = logging.getLogger(__name__)
 
 # This variable is a proxy that will control BENTOML_DO_NOT_TRACK
 OPENLLM_DO_NOT_TRACK = 'OPENLLM_DO_NOT_TRACK'
-DO_NOT_TRACK = os.environ.get(OPENLLM_DO_NOT_TRACK, str(False)).upper()
 
 @functools.lru_cache(maxsize=1)
 def do_not_track() -> bool:
-  return DO_NOT_TRACK in openllm_core.utils.ENV_VARS_TRUE_VALUES
+  return openllm_core.utils.check_bool_env(OPENLLM_DO_NOT_TRACK, default=False)  # unset keeps the previous default (False)
 
 @functools.lru_cache(maxsize=1)
 def _usage_event_debugging() -> bool:
diff --git a/openllm-core/src/openllm_core/utils/codegen.py b/openllm-core/src/openllm_core/utils/codegen.py
index 7c2cce5c..7141bdc1 100644
--- a/openllm-core/src/openllm_core/utils/codegen.py
+++ b/openllm-core/src/openllm_core/utils/codegen.py
@@ -96,7 +96,7 @@ def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]) -> type[t.
   else:
     attr_class_template.append('    pass')
   globs: DictStrAny = {'_attrs_itemgetter': itemgetter, '_attrs_property': property}
-  if SHOW_CODEGEN: logger.info('Generated class for %s:\n\n%s', attr_class_name, '\n'.join(attr_class_template))
+  if SHOW_CODEGEN: print(f'Generated class for {attr_class_name}:\n\n', '\n'.join(attr_class_template))
   _compile_and_eval('\n'.join(attr_class_template), globs)
   return globs[attr_class_name]
 
@@ -114,7 +114,7 @@ def generate_function(typ: type[t.Any],
                                       '\n    '.join(lines) if lines else 'pass')
   meth = _make_method(func_name, script, generate_unique_filename(typ, func_name), globs)
   if annotations: meth.__annotations__ = annotations
-  if SHOW_CODEGEN: logger.info('Generated script for %s:\n\n%s', typ, script)
+  if SHOW_CODEGEN: print(f'Generated script for {typ}:\n\n', script)
   return meth
 
 def make_env_transformer(cls: type[openllm_core.LLMConfig],
@@ -139,11 +139,8 @@ def make_env_transformer(cls: type[openllm_core.LLMConfig],
       '__model_name': model_name,
   })
   lines: ListStr = [
-      '__env = lambda field_name: __field_env(__model_name, field_name, __suffix)', 'return [', '    f.evolve(',
-      '        default=__populate_env(__default_callback(f.name, f.default), __env(f.name)),', '        metadata={',
-      "            'env': f.metadata.get('env', __env(f.name)),",
-      "            'description': f.metadata.get('description', '(not provided)'),", '        },', '    )',
-      '    for f in fields', ']'
+      '__env=lambda field_name:__field_env(field_name,__suffix)',
+      "return [f.evolve(default=__populate_env(__default_callback(f.name,f.default),__env(f.name)),metadata={'env':f.metadata.get('env',__env(f.name)),'description':f.metadata.get('description', '(not provided)')}) for f in fields]"
   ]
   fields_ann = 'list[attr.Attribute[t.Any]]'
   return generate_function(cls,
diff --git a/openllm-core/src/openllm_core/utils/import_utils.py b/openllm-core/src/openllm_core/utils/import_utils.py
index eb1d2474..8ea867b2 100644
--- a/openllm-core/src/openllm_core/utils/import_utils.py
+++ b/openllm-core/src/openllm_core/utils/import_utils.py
@@ -17,6 +17,7 @@ import openllm_core
 
 from bentoml._internal.utils import LazyLoader
 from bentoml._internal.utils import pkg
+from openllm_core._typing_compat import LiteralBackend
 from openllm_core._typing_compat import LiteralString
 from openllm_core._typing_compat import overload
 
@@ -24,7 +25,6 @@ from .representation import ReprMixin
 
 if t.TYPE_CHECKING:
   BackendOrderedDict = OrderedDict[str, t.Tuple[t.Callable[[], bool], str]]
-  from openllm_core._typing_compat import LiteralRuntime
 
 logger = logging.getLogger(__name__)
 OPTIONAL_DEPENDENCIES = {
@@ -336,9 +336,7 @@ class EnvVarMixin(ReprMixin):
   config: str
   model_id: str
   quantize: str
-  framework: str
-  bettertransformer: str
-  runtime: str
+  backend: str
 
   @overload
   def __getitem__(self, item: t.Literal['config']) -> str:
@@ -353,19 +351,11 @@ class EnvVarMixin(ReprMixin):
     ...
 
   @overload
-  def __getitem__(self, item: t.Literal['framework']) -> str:
+  def __getitem__(self, item: t.Literal['backend']) -> str:
     ...
 
   @overload
-  def __getitem__(self, item: t.Literal['bettertransformer']) -> str:
-    ...
-
-  @overload
-  def __getitem__(self, item: t.Literal['runtime']) -> str:
-    ...
-
-  @overload
-  def __getitem__(self, item: t.Literal['framework_value']) -> LiteralRuntime:
+  def __getitem__(self, item: t.Literal['backend_value']) -> LiteralBackend:
     ...
 
   @overload
@@ -376,14 +366,6 @@ class EnvVarMixin(ReprMixin):
   def __getitem__(self, item: t.Literal['model_id_value']) -> str | None:
     ...
 
-  @overload
-  def __getitem__(self, item: t.Literal['bettertransformer_value']) -> bool:
-    ...
-
-  @overload
-  def __getitem__(self, item: t.Literal['runtime_value']) -> t.Literal['ggml', 'transformers']:
-    ...
-
   def __getitem__(self, item: str | t.Any) -> t.Any:
     if item.endswith('_value') and hasattr(self, f'_{item}'): return object.__getattribute__(self, f'_{item}')()
     elif hasattr(self, item): return getattr(self, item)
@@ -391,50 +373,34 @@ class EnvVarMixin(ReprMixin):
 
   def __init__(self,
                model_name: str,
-               implementation: LiteralRuntime = 'pt',
+               backend: LiteralBackend = 'pt',
                model_id: str | None = None,
-               bettertransformer: bool | None = None,
-               quantize: LiteralString | None = None,
-               runtime: t.Literal['ggml', 'transformers'] = 'transformers') -> None:
+               quantize: LiteralString | None = None) -> None:
     '''EnvVarMixin is a mixin class that returns the value extracted from environment variables.'''
     from openllm_core.utils import field_env_key
     self.model_name = inflection.underscore(model_name)
-    self._implementation = implementation
+    self._backend = backend
     self._model_id = model_id
-    self._bettertransformer = bettertransformer
     self._quantize = quantize
-    self._runtime = runtime
-    for att in {'config', 'model_id', 'quantize', 'framework', 'bettertransformer', 'runtime'}:
-      setattr(self, att, field_env_key(self.model_name, att.upper()))
+    for att in {'config', 'model_id', 'quantize', 'backend'}:
+      setattr(self, att, field_env_key(att.upper()))
 
   def _quantize_value(self) -> t.Literal['int8', 'int4', 'gptq'] | None:
     from . import first_not_none
     return t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']],
                   first_not_none(os.environ.get(self['quantize']), default=self._quantize))
 
-  def _framework_value(self) -> LiteralRuntime:
+  def _backend_value(self) -> LiteralBackend:
     from . import first_not_none
-    return t.cast(LiteralRuntime, first_not_none(os.environ.get(self['framework']), default=self._implementation))
-
-  def _bettertransformer_value(self) -> bool:
-    from . import first_not_none
-    return t.cast(
-        bool,
-        first_not_none(os.environ.get(self['bettertransformer'], str(False)).upper() in ENV_VARS_TRUE_VALUES,
-                       default=self._bettertransformer))
+    return t.cast(LiteralBackend, first_not_none(os.environ.get(self['backend']), default=self._backend))
 
   def _model_id_value(self) -> str | None:
     from . import first_not_none
     return first_not_none(os.environ.get(self['model_id']), default=self._model_id)
 
-  def _runtime_value(self) -> t.Literal['ggml', 'transformers']:
-    from . import first_not_none
-    return t.cast(t.Literal['ggml', 'transformers'],
-                  first_not_none(os.environ.get(self['runtime']), default=self._runtime))
-
   @property
   def __repr_keys__(self) -> set[str]:
-    return {'config', 'model_id', 'quantize', 'framework', 'bettertransformer', 'runtime'}
+    return {'config', 'model_id', 'quantize', 'backend'}
 
   @property
   def start_docstring(self) -> str:
diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py
index 2060b332..a0772855 100644
--- a/openllm-python/src/openllm/__init__.py
+++ b/openllm-python/src/openllm/__init__.py
@@ -47,7 +47,7 @@ _import_structure: dict[str, list[str]] = {
     "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"],
     "_quantisation": ["infer_quantisation_config"],
     "_embeddings": ["GenericEmbeddingRunnable"],
-    "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"],
+    "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "EmbeddingsOutput"],
     "_generation": [
         "StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList",
         "prepare_logits_processor"
@@ -72,7 +72,7 @@ COMPILED = _Path(__file__).suffix in (".pyd", ".so")
 if _t.TYPE_CHECKING:
   from . import bundle as bundle, cli as cli, client as client, models as models, playground as playground, serialisation as serialisation, testing as testing
   from ._generation import LogitsProcessorList as LogitsProcessorList, StopOnTokens as StopOnTokens, StoppingCriteriaList as StoppingCriteriaList, StopSequenceCriteria as StopSequenceCriteria, prepare_logits_processor as prepare_logits_processor
-  from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
+  from ._llm import LLM as LLM, EmbeddingsOutput as EmbeddingsOutput, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
   from ._quantisation import infer_quantisation_config as infer_quantisation_config
   from ._embeddings import GenericEmbeddingRunnable as GenericEmbeddingRunnable
   from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc
@@ -196,7 +196,12 @@ else:
 __lazy = openllm_core.utils.LazyModule(__name__,
                                        globals()["__file__"],
                                        _import_structure,
-                                       extra_objects={"COMPILED": COMPILED})
+                                       extra_objects={
+                                           "COMPILED": COMPILED,
+                                           "__openllm_migration__": {
+                                               "LLMEmbeddings": "EmbeddingsOutput"
+                                           }
+                                       })
 __all__ = __lazy.__all__
 __dir__ = __lazy.__dir__
 __getattr__ = __lazy.__getattr__
diff --git a/openllm-python/src/openllm/_assign.py b/openllm-python/src/openllm/_assign.py
new file mode 100644
index 00000000..8026204e
--- /dev/null
+++ b/openllm-python/src/openllm/_assign.py
@@ -0,0 +1,201 @@
+'''LLM assignment magik.'''
+from __future__ import annotations
+import functools
+import traceback
+import typing as t
+
+import openllm
+
+from openllm.exceptions import OpenLLMException
+from openllm_core._configuration import _object_getattribute
+from openllm_core._configuration import _setattr_class
+from openllm_core._schema import unmarshal_vllm_outputs
+from openllm_core._typing_compat import DictStrAny
+from openllm_core._typing_compat import ListStr
+from openllm_core._typing_compat import M
+from openllm_core._typing_compat import T
+from openllm_core._typing_compat import import_model_protocol
+from openllm_core._typing_compat import llm_post_init_protocol
+from openllm_core._typing_compat import load_model_protocol
+from openllm_core._typing_compat import load_tokenizer_protocol
+from openllm_core.utils import LazyLoader
+from openllm_core.utils import codegen
+from openllm_core.utils import device_count
+from openllm_core.utils import first_not_none
+from openllm_core.utils import is_torch_available
+
+if t.TYPE_CHECKING:
+  import torch
+  import vllm
+
+  import bentoml
+
+  from openllm._llm import LLM
+else:
+  torch = LazyLoader('torch', globals(), 'torch')
+  vllm = LazyLoader('vllm', globals(), 'vllm')
+
+def import_model(fn: import_model_protocol[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]:
+  '''Wrap ``fn`` so the LLM's stored model arguments and its ``trust_remote_code`` setting are passed along.'''
+  @functools.wraps(fn)
+  def inner(self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any) -> bentoml.Model:
+    # Resolve the flag through first_not_none, using the LLM's own setting as the default.
+    remote_ok = first_not_none(trust_remote_code, default=self.trust_remote_code)
+    stored_decls, stored_attrs = self.llm_parameters[0]
+    # Stored positionals go first; call-time keywords override stored ones.
+    return fn(self, *stored_decls, *decls, trust_remote_code=remote_ok, **{**stored_attrs, **attrs})
+
+  return inner
+
+def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]:
+  '''Wrap ``fn``: the 'vllm' backend builds an ``LLMEngine``; any other backend calls ``fn`` with stored args.'''
+  @functools.wraps(fn)
+  def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine:
+    if self.__llm_backend__ != 'vllm':
+      stored_decls, stored_attrs = self.llm_parameters[0]
+      return fn(self, *stored_decls, *decls, **{**stored_attrs, **attrs})
+    # TODO: Do some more processing with token_id once we support token streaming
+    try:
+      model_path = self._bentomodel.path
+      engine_args = vllm.EngineArgs(model=model_path,
+                                    tokenizer=model_path if self.tokenizer_id == 'local' else self.tokenizer_id,
+                                    tokenizer_mode='auto',
+                                    tensor_parallel_size=max(1, device_count()),
+                                    dtype='auto',
+                                    worker_use_ray=False)
+      return vllm.LLMEngine.from_engine_args(engine_args)
+    except Exception as err:
+      traceback.print_exc()
+      raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from None
+
+  return inner
+
+def load_tokenizer(fn: load_tokenizer_protocol[M, T]) -> t.Callable[[LLM[M, T]], T]:
+  '''Wrap ``fn`` so the LLM's stored tokenizer kwargs are applied, with call-time kwargs taking precedence.'''
+  @functools.wraps(fn)
+  def inner(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T:
+    return fn(self, **dict(self.llm_parameters[-1], **tokenizer_attrs))
+
+  return inner
+
+def llm_post_init(fn: llm_post_init_protocol[M, T]) -> t.Callable[[LLM[M, T]], None]:
+  '''Wrap ``fn`` so a 'pt'-backend LLM gets ``self.device`` assigned (when torch is available) before ``fn`` runs.'''
+  @functools.wraps(fn)
+  def inner(self: LLM[M, T]) -> None:
+    uses_torch = self.__llm_backend__ == 'pt' and is_torch_available()
+    if uses_torch: self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    fn(self)
+
+  return inner
+
+def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
+  '''Build, via codegen, the ``__assign_llm_attr`` function for the given LLM subclass.'''
+  from ._llm import LLM
+  from ._llm import LLMFunction
+  from ._llm import LLMInterface
+  from ._llm import LLMSerialisation
+
+  args: ListStr = []
+  globs: DictStrAny = {'cls': cls, '__wrapped_llm_post_init': llm_post_init, 'LLM': LLM}
+  # _cached_LLMFunction_get and _cached_LLMSerialisation_get
+  globs.update(
+      {f'_cached_{cl_.__name__}_get': _object_getattribute.__get__(cl_) for cl_ in {LLMSerialisation, LLMFunction}})
+  # llm_post_init implementation: the class's own hook is passed through the llm_post_init wrapper
+  lines: ListStr = [
+      f'_impl_{cls.__name__}_func=cls.llm_post_init',
+      _setattr_class('llm_post_init', f'__wrapped_llm_post_init(_impl_{cls.__name__}_func)')
+  ]
+
+  serialisation_attr = {'import_model': import_model, 'load_model': load_model, 'load_tokenizer': load_tokenizer,}
+  for func, impl in serialisation_attr.items():
+    impl_name = f'__wrapped_{func}'
+    globs.update({f'__serialisation_{func}': getattr(openllm.serialisation, func, None), impl_name: impl})
+    cached_func_name = f'_cached_{cls.__name__}_func'  # same temporary name is reused for every func in the generated code
+    func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMSerialisation_get('{func}') else __serialisation_{func}"
+    lines.extend([
+        f'{cached_func_name}=cls.{func}', func_call,
+        _setattr_class(func, f'{impl_name}(_impl_{cls.__name__}_{func})')
+    ])
+
+  # assign vLLM implementation (only when the class declares the 'vllm' backend)
+  if cls.__llm_backend__ == 'vllm':
+    vllm_func = {
+        f'_vllm_{it}': fn
+        for it, fn in zip(('generate', 'generate_iterator',
+                           'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate))
+    }
+    globs.update(vllm_func)
+    lines.extend([_setattr_class(it[6:], it) for it in vllm_func])  # it[6:] drops the '_vllm_' prefix
+
+  interface_anns = codegen.get_annotations(LLMInterface)
+
+  # cached attribute initialisation: each __llm_<key>__ slot starts out as None
+  def dunder_cached(key: str) -> str:
+    return f'__llm_{key}__'
+
+  st_attr = {'model', 'tokenizer', 'adapter_map'}
+  lines.extend([_setattr_class(dunder_cached(v), None) for v in st_attr])
+
+  # boolean for better LLM implementation resolver
+  def dunder_support(key: str) -> str:
+    return f'__llm_supports_{key}__'
+
+  bool_attr = {it[15:-2] for it in interface_anns if it.startswith('__llm_supports_')}  # strip '__llm_supports_' and '__'
+  lines.extend(
+      [_setattr_class(dunder_support(fn), f"cls.{fn} is not _cached_LLMFunction_get('{fn}')") for fn in bool_attr])
+
+  return codegen.generate_function(cls,
+                                   '__assign_llm_attr',
+                                   lines,
+                                   args=('cls', *args),
+                                   globs=globs,
+                                   annotations={
+                                       'cls': 't.Type[LLM]',
+                                       'return': None
+                                   })
+
+def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str:
+  '''Return the ``text`` of the first output of the first generation result; ``prompt`` and extra kwargs are unused.'''
+  return generation_result[0]['outputs'][0]['text']
+
+def vllm_generate_iterator(self: LLM['vllm.LLMEngine', T], prompt: str, /, *, echo: bool = False,
+                           stop: str | t.Iterable[str] | None = None, stop_token_ids: list[int] | None = None,
+                           **attrs: t.Any) -> t.Iterator[dict[str, t.Any]]:
+  '''Stream generations for ``prompt`` from the vLLM engine.
+
+  ``request_id`` must be passed via ``attrs`` (``ValueError`` otherwise). Each engine output yields
+  ``{'text': [...], 'error_code': 0}``; with ``echo=True`` every text is prefixed with the prompt. ``stop`` strings
+  and the decoded ``stop_token_ids`` (plus the tokenizer's EOS id) are merged into the sampling config's stop list.
+  '''
+  request_id: str | None = attrs.pop('request_id', None)
+  if request_id is None: raise ValueError('request_id must not be None.')
+  # Build a fresh list so the caller's ``stop_token_ids`` is never mutated.
+  token_ids = [*(stop_token_ids or []), self.tokenizer.eos_token_id]
+  stop_: set[str] = set()
+  if isinstance(stop, str) and stop != '': stop_.add(stop)
+  elif stop is not None and not isinstance(stop, str): stop_.update(stop)  # any iterable, not only ``list``
+  # ``is not None`` (rather than truthiness) keeps token id 0 while still skipping a missing EOS id.
+  stop_.update(self.tokenizer.decode(tid) for tid in token_ids if tid is not None)
+
+  top_p = 1.0 if self.config['temperature'] <= 1e-5 else self.config['top_p']
+  config = self.config.model_construct_env(stop=list(stop_), top_p=top_p, **attrs)
+  self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=config.to_sampling_config())
+  while self.model.has_unfinished_requests():
+    for request_output in self.model.step():
+      prompt = request_output.prompt
+      if echo: text_outputs = [prompt + output.text for output in request_output.outputs]
+      else: text_outputs = [output.text for output in request_output.outputs]
+      yield {'text': text_outputs, 'error_code': 0}
+      if request_output.finished: break
+
+def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
+  '''Queue ``prompt`` on the vLLM engine, step it until nothing is unfinished, and return the unmarshalled outputs.'''
+  request_id: str | None = attrs.pop('request_id', None)
+  if request_id is None: raise ValueError('request_id must not be None.')
+  # TODO: support prompt_token_ids
+  sampling_params = self.config.model_construct_env(**attrs).to_sampling_config()
+  self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=sampling_params)
+  finished: list[vllm.RequestOutput] = []
+  while self.model.has_unfinished_requests():
+    finished += [result for result in self.model.step() if result.finished]
+  return [unmarshal_vllm_outputs(result) for result in finished]
diff --git a/openllm-python/src/openllm/_embeddings.py b/openllm-python/src/openllm/_embeddings.py
index 526ce67b..8a0cda9c 100644
--- a/openllm-python/src/openllm/_embeddings.py
+++ b/openllm-python/src/openllm/_embeddings.py
@@ -58,7 +58,7 @@ class GenericEmbeddingRunnable(bentoml.Runnable):
     self.model.to(self.device)
 
   @bentoml.Runnable.method(batchable=True, batch_dim=0)
-  def encode(self, sentences: list[str]) -> t.Sequence[openllm.LLMEmbeddings]:
+  def encode(self, sentences: list[str]) -> t.Sequence[openllm.EmbeddingsOutput]:
     import torch
     import torch.nn.functional as F
     encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(self.device)
@@ -69,8 +69,8 @@ class GenericEmbeddingRunnable(bentoml.Runnable):
     # Perform pooling and normalize
     sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1)
     return [
-        openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(),
-                              num_tokens=int(torch.sum(attention_mask).item()))
+        openllm.EmbeddingsOutput(embeddings=sentence_embeddings.cpu().numpy(),
+                                 num_tokens=int(torch.sum(attention_mask).item()))
     ]
 
   @staticmethod
diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py
index 80e0ae12..e7bccd8b 100644
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -1,14 +1,10 @@
 # mypy: disable-error-code="name-defined,attr-defined"
 from __future__ import annotations
 import abc
-import functools
 import gc
 import inspect
 import logging
 import os
-import pathlib
-import re
-import traceback
 import types
 import typing as t
 
@@ -26,29 +22,22 @@ import openllm_core
 from bentoml._internal.models.model import ModelSignature
 from openllm_core._configuration import FineTuneConfig
 from openllm_core._configuration import LLMConfig
-from openllm_core._configuration import _object_getattribute
-from openllm_core._configuration import _setattr_class
-from openllm_core._schema import unmarshal_vllm_outputs
+from openllm_core._schema import EmbeddingsOutput
 from openllm_core._typing_compat import AdaptersMapping
 from openllm_core._typing_compat import AdaptersTuple
 from openllm_core._typing_compat import AdapterType
-from openllm_core._typing_compat import AnyCallable
 from openllm_core._typing_compat import DictStrAny
-from openllm_core._typing_compat import ListStr
-from openllm_core._typing_compat import LiteralRuntime
+from openllm_core._typing_compat import LiteralBackend
 from openllm_core._typing_compat import LiteralString
-from openllm_core._typing_compat import LLMEmbeddings
 from openllm_core._typing_compat import LLMRunnable
 from openllm_core._typing_compat import LLMRunner
 from openllm_core._typing_compat import M
-from openllm_core._typing_compat import ModelSignatureDict as _ModelSignatureDict
-from openllm_core._typing_compat import NotRequired
+from openllm_core._typing_compat import ModelSignatureDict
 from openllm_core._typing_compat import PeftAdapterOutput
 from openllm_core._typing_compat import T
 from openllm_core._typing_compat import TupleAny
 from openllm_core._typing_compat import overload
 from openllm_core.utils import DEBUG
-from openllm_core.utils import ENV_VARS_TRUE_VALUES
 from openllm_core.utils import MYPY
 from openllm_core.utils import EnvVarMixin
 from openllm_core.utils import LazyLoader
@@ -61,11 +50,11 @@ from openllm_core.utils import first_not_none
 from openllm_core.utils import generate_hash_from_file
 from openllm_core.utils import is_peft_available
 from openllm_core.utils import is_torch_available
-from openllm_core.utils import non_intrusive_setattr
 from openllm_core.utils import normalize_attrs_to_model_tokenizer_pair
 from openllm_core.utils import resolve_filepath
 from openllm_core.utils import validate_is_path
 
+from ._assign import make_llm_attributes
 from ._quantisation import infer_quantisation_config
 from .exceptions import ForbiddenAttributeError
 from .exceptions import GpuNotAvailableError
@@ -73,17 +62,16 @@ from .exceptions import OpenLLMException
 from .utils import infer_auto_class
 
 if t.TYPE_CHECKING:
+
   import auto_gptq as autogptq
   import peft
   import torch
   import transformers
-  import vllm
 
   from openllm_core._configuration import PeftType
   from openllm_core.utils.representation import ReprArgs
 else:
   autogptq = LazyLoader('autogptq', globals(), 'auto_gptq')
-  vllm = LazyLoader('vllm', globals(), 'vllm')
   transformers = LazyLoader('transformers', globals(), 'transformers')
   torch = LazyLoader('torch', globals(), 'torch')
   peft = LazyLoader('peft', globals(), 'peft')
@@ -92,14 +80,10 @@ ResolvedAdaptersMapping = t.Dict[AdapterType, t.Dict[str, t.Tuple['peft.PeftConf
 
 logger = logging.getLogger(__name__)
 
-class ModelSignatureDict(t.TypedDict, total=False):
-  batchable: bool
-  batch_dim: t.Union[t.Tuple[int, int], int]
-  input_spec: NotRequired[t.Union[t.Any, t.Tuple[t.Any]]]
-  output_spec: NotRequired[t.Any]
-
 def normalise_model_name(name: str) -> str:
-  return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else re.sub('[^a-zA-Z0-9]+', '-', name)
+  if validate_is_path(name): return os.path.basename(resolve_filepath(name))
+  name = name.replace('/', '--')
+  return inflection.dasherize(name)
 
 # the below is similar to peft.utils.other.CONFIG_NAME
 PEFT_CONFIG_NAME = 'adapter_config.json'
@@ -137,36 +121,41 @@ def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapp
 
 _reserved_namespace = {'config_class', 'model', 'tokenizer', 'import_kwargs'}
 
-class LLMInterface(abc.ABC, t.Generic[M, T]):
-  '''This defines the loose contract for all openllm.LLM implementations.'''
+class LLMFunction(abc.ABC):
 
-  @property
-  def import_kwargs(self) -> tuple[DictStrAny, DictStrAny] | None:
-    """The default import kwargs to used when importing the model.
+  @abc.abstractmethod
+  def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any:
+    '''This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decides to do so).
 
-    This will be passed into 'openllm.LLM.import_model'.
-    It returns two dictionaries: one for model kwargs and one for tokenizer kwargs.
+    You can customize how the output of the LLM looks with this hook. By default, it is a simple echo.
 
-    Returns:
-        Optional tuple of model kwargs and tokenizer kwargs
-    """
-
-  def embeddings(self, prompts: list[str]) -> LLMEmbeddings:
-    '''The implementation for generating text embeddings from given prompt.
-
-    It takes the prompt and output the embeddings for this given LLM.
-
-    Returns:
-        The embeddings for the given prompt.
+    > [!NOTE]
+    > This will be used from the client side.
     '''
     raise NotImplementedError
 
   @abc.abstractmethod
   def generate(self, prompt: str, **preprocess_generate_kwds: t.Any) -> t.Any:
-    """The implementation for text generation from given prompt.
+    '''Text generation implementation for any given prompt.
 
-    It takes the prompt and 'generation_kwargs' from 'self.sanitize_parameters' and then pass it to 'self.model.generate'.
-    """
+    It takes the prompt and 'generation_kwargs'. The main implementation will parse all of the kwargs
+    correctly for you, so that subclass implementations don't have to repeat this boilerplate code.
+    '''
+    raise NotImplementedError
+
+  @abc.abstractmethod
+  def generate_iterator(self, prompt: str, /, **attrs: t.Any) -> t.Iterator[t.Any]:
+    '''The iterator implementation of generate.
+
+    This will be used for Token streaming and SSE support.
+
+    Args:
+      prompt: the input prompt
+      **attrs: Relevant attributes to be passed to the stream generation implementation.
+
+    Returns:
+      An iterator of incoming token generation. It will return a dictionary
+    '''
     raise NotImplementedError
 
   def generate_one(self, prompt: str, stop: list[str],
@@ -177,17 +166,20 @@ class LLMInterface(abc.ABC, t.Generic[M, T]):
     '''
     raise NotImplementedError
 
-  def generate_iterator(self, prompt: str, /, **attrs: t.Any) -> t.Iterator[t.Any]:
-    '''The iterator version of `generate` function.'''
-    raise NotImplementedError(
-        'Currently generate_iterator requires SSE (Server-side events) support, which is not yet implemented.')
+  def embeddings(self, prompts: list[str]) -> EmbeddingsOutput:
+    '''The implementation for generating text embeddings from given prompt.
 
-  def llm_post_init(self) -> None:
-    """This function can be implemented if you need to initialized any additional variables that doesn't concern OpenLLM internals."""
-    pass
+    It takes the prompt and outputs the embeddings for this given LLM.
+
+    Returns:
+        The embeddings for the given prompt.
+    '''
+    raise NotImplementedError
+
+class LLMSerialisation(abc.ABC, t.Generic[M, T]):
 
   def import_model(self, *args: t.Any, trust_remote_code: bool, **attrs: t.Any) -> bentoml.Model:
-    """This function can be implemented if default import_model doesn't satisfy your needs.
+    '''Import both model and tokenizer weights as a BentoML model.
 
     Note that tokenizer attrs can be accessed via ``llm.llm_parameters``.
 
@@ -196,7 +188,7 @@ class LLMInterface(abc.ABC, t.Generic[M, T]):
     ```
 
     By default, `model_decls` and `model_attrs` is already sanitised and concatenated into `args` and `attrs`
-    """
+    '''
     raise NotImplementedError
 
   def load_model(self, *args: t.Any, **attrs: t.Any) -> M:
@@ -213,40 +205,47 @@ class LLMInterface(abc.ABC, t.Generic[M, T]):
     '''
     raise NotImplementedError
 
-  def save_pretrained(self, save_directory: str | pathlib.Path, **attrs: t.Any) -> None:
-    '''This function defines how this model can be saved to local store.
+class LLMInterface(LLMFunction, LLMSerialisation[M, T], abc.ABC):
 
-    This will be called during ``import_model``. By default, it will use ``openllm.serialisation.save_pretrained``.
-    Additionally, the function signature are similar to ``transformers.PreTrainedModel.save_pretrained``
-    This is useful during fine tuning.
+  def llm_post_init(self) -> None:
+    '''This function can be implemented if you need to initialize any additional variables that don't concern OpenLLM internals.
+    By default, this will add `self.device` if the implementation is PyTorch.
+    '''
+    pass
+
+  def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
+    '''This handler will sanitize all attrs and setup prompt text.
+
+    It takes a prompt that is given by the user, attrs that can be parsed with the prompt.
+
+    Returns a tuple of three items: the prompt text, followed by:
+    - The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig
+    - The attributes dictionary that will be passed into `self.postprocess_generate`.
     '''
     raise NotImplementedError
 
+  @property
+  def import_kwargs(self) -> tuple[DictStrAny, DictStrAny] | None:
+    '''The default import kwargs to use when importing the model.
+
+    This will be passed into 'openllm.LLM.import_model'.
+    It returns two dictionaries: one for model kwargs and one for tokenizer kwargs.
+
+    Returns:
+        Optional tuple of model kwargs and tokenizer kwargs
+    '''
+
   # NOTE: All fields below are attributes that can be accessed by users.
   config_class: t.Type[LLMConfig]
   '''The config class to use for this LLM. If you are creating a custom LLM, you must specify this class.'''
-  bettertransformer: bool
-  '''Whether to load this LLM with FasterTransformer enabled. The order of loading is:
-
-    - If pass within `for_model`, `from_pretrained` or `__init__`.
-    - If `self.bettertransformer` is set within `llm_post_init`.
-    - Finally, if none of the above, default to self.config['bettertransformer']
-
-    > [!NOTE] that if LoRA is enabled, bettertransformer will be disabled.
-    '''
   device: 'torch.device'
   '''The device to be used for this LLM. If the implementation is 'pt', then it will be torch.device, else string.'''
   tokenizer_id: t.Union[t.Literal['local'], LiteralString]
   '''optional tokenizer_id for loading with vLLM if the model supports vLLM.'''
-  # NOTE: The following will be populated by __init_subclass__, note that these should be immutable.
-  __llm_trust_remote_code__: bool
-  '''This is used to determine during 'import_model' whether to trust remote code or not.
 
-    This works synonymous with `trust_remote_code` kwarg in transformers Auto classes. If not passed,
-    then by default fallback to config_class['trust_remote_code']
-    '''
-  __llm_implementation__: LiteralRuntime
-  '''This is used to determine which implementation that this LLM has.
+  # NOTE: The following will be populated by __init_subclass__, note that these should be immutable.
+  __llm_backend__: LiteralBackend
+  '''This is used to determine the framework implementation for this given LLM.
 
     Usually, this will inferred from class name, that follows the HuggingFace's naming convention:
 
@@ -254,16 +253,17 @@ class LLMInterface(abc.ABC, t.Generic[M, T]):
     - `TFOPTForConditionalGeneration` -> `tf`
     - `FlaxOPTForConditionalGeneration` -> `flax`
 
-    An additional naming for all VLLM backend: VLLMLlama -> `vllm`
+    For all VLLM backend: VLLMLlama -> `vllm`
+    For all GGML backend: GGMLLlama -> `ggml`
+    For all MLC backend: MLCLlama -> `mlc`
     '''
   __llm_model__: t.Optional[M]
   '''A reference to the actual model. Instead of access this directly, you should use `model` property instead.'''
   __llm_tokenizer__: t.Optional[T]
   '''A reference to the actual tokenizer. Instead of access this directly, you should use `tokenizer` property instead.'''
-  __llm_bentomodel__: t.Optional[bentoml.Model]
-  '''A reference to the bentomodel used for this LLM. Instead of access this directly, you should use `_bentomodel` property instead.'''
   __llm_adapter_map__: t.Optional[ResolvedAdaptersMapping]
   '''A reference to the the cached LoRA adapter mapping.'''
+
   __llm_supports_embeddings__: bool
   '''A boolean to determine whether models does implement ``LLM.embeddings``.'''
   __llm_supports_generate__: bool
@@ -272,243 +272,30 @@ class LLMInterface(abc.ABC, t.Generic[M, T]):
   '''A boolean to determine whether models does implement ``LLM.generate_one``.'''
   __llm_supports_generate_iterator__: bool
   '''A boolean to determine whether models does implement ``LLM.generate_iterator``.'''
-  if t.TYPE_CHECKING and not MYPY:
-
-    def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig,
-                                                                                        autogptq.BaseQuantizeConfig]],
-                       model_id: str, runtime: t.Literal['ggml', 'transformers'], model_decls: TupleAny,
-                       model_attrs: DictStrAny, tokenizer_attrs: DictStrAny, tag: bentoml.Tag,
-                       adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str],
-                       quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']],
-                       serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any) -> None:
-      '''Generated __attrs_init__ for openllm.LLM.'''
-
-_R = t.TypeVar('_R', covariant=True)
-
-class _import_model_wrapper(t.Generic[_R, M, T], t.Protocol):
-
-  def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R:
-    ...
-
-class _load_model_wrapper(t.Generic[M, T], t.Protocol):
-
-  def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
-    ...
-
-class _load_tokenizer_wrapper(t.Generic[M, T], t.Protocol):
-
-  def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T:
-    ...
-
-class _llm_post_init_wrapper(t.Generic[M, T], t.Protocol):
-
-  def __call__(self, llm: LLM[M, T]) -> T:
-    ...
-
-class _save_pretrained_wrapper(t.Generic[M, T], t.Protocol):
-
-  def __call__(self, llm: LLM[M, T], save_directory: str | pathlib.Path, **attrs: t.Any) -> None:
-    ...
-
-_object_setattr = object.__setattr__
-
-# NOTE: the following wrapper are a light meta ops for wrapping default params to internal methods implementation.
-def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]:
-
-  @functools.wraps(f)
-  def wrapper(self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any) -> bentoml.Model:
-    trust_remote_code = first_not_none(trust_remote_code, default=self.__llm_trust_remote_code__)
-    (model_decls, model_attrs), _ = self.llm_parameters
-    decls = (*model_decls, *decls)
-    attrs = {**model_attrs, **attrs}
-    return f(self, *decls, trust_remote_code=trust_remote_code, **attrs)
-
-  return wrapper
 
 _DEFAULT_TOKENIZER = 'hf-internal-testing/llama-tokenizer'
 
-def get_engine_args(llm: LLM[M, T], tokenizer: str = _DEFAULT_TOKENIZER) -> vllm.EngineArgs:
-  return vllm.EngineArgs(model=llm._bentomodel.path,
-                         tokenizer=tokenizer,
-                         tokenizer_mode='auto',
-                         tensor_parallel_size=1 if device_count() < 2 else device_count(),
-                         dtype='auto',
-                         worker_use_ray=False)
-
-def _wrapped_load_model(f: _load_model_wrapper[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]:
-
-  @functools.wraps(f)
-  def wrapper(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine:
-    if self.__llm_implementation__ == 'vllm':
-      # TODO: Do some more processing with token_id once we support token streaming
-      try:
-        return vllm.LLMEngine.from_engine_args(
-            get_engine_args(self,
-                            tokenizer=self._bentomodel.path if self.tokenizer_id == 'local' else self.tokenizer_id))
-      except Exception as err:
-        traceback.print_exc()
-        raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from None
-    else:
-      (model_decls, model_attrs), _ = self.llm_parameters
-      return f(self, *(*model_decls, *decls), **{**model_attrs, **attrs})
-
-  return wrapper
-
-def _wrapped_load_tokenizer(f: _load_tokenizer_wrapper[M, T]) -> t.Callable[[LLM[M, T]], T]:
-
-  @functools.wraps(f)
-  def wrapper(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T:
-    return f(self, **{**self.llm_parameters[-1], **tokenizer_attrs})
-
-  return wrapper
-
-def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M, T]], None]:
-
-  @functools.wraps(f)
-  def wrapper(self: LLM[M, T]) -> None:
-    if self.__llm_implementation__ == 'pt' and is_torch_available():
-      self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    f(self)
-
-  return wrapper
-
-def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]) -> t.Callable[[LLM[M, T], str | pathlib.Path], None]:
-
-  @functools.wraps(f)
-  def wrapper(self: LLM[M, T], save_directory: str | pathlib.Path, **attrs: t.Any) -> None:
-    if isinstance(save_directory, pathlib.Path): save_directory = str(save_directory)
-    if self.__llm_model__ is None: raise RuntimeError("Cannot 'save_pretrained' with unload model instance.")
-    if self.bettertransformer and self.__llm_implementation__ == 'pt':
-      _object_setattr(self, '__llm_model__',
-                      t.cast('transformers.PreTrainedModel', self.__llm_model__).reverse_bettertransformer())
-    f(self, save_directory, **attrs)
-
-  return wrapper
-
-def _update_docstring(cls: LLM[M, T], fn: str) -> AnyCallable:
-  # update docstring for given entrypoint
-  original_fn = getattr(cls, fn, getattr(LLMInterface, fn))
-  original_fn.__doc__ = original_fn.__doc__ or f'''\
-    {cls.__name__}'s implementation for {fn}.
-
-    Note that if LoRA is enabled (via either SDK or CLI), `self.model` will become a `peft.PeftModel`
-    The original model can then be accessed with 'self.model.get_base_model()'.
-    '''
-  setattr(cls, fn, original_fn)
-  return original_fn
-
-def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
-  attributes = {
-      'import_model': _wrapped_import_model,
-      'load_model': _wrapped_load_model,
-      'load_tokenizer': _wrapped_load_tokenizer,
-      'llm_post_init': _wrapped_llm_post_init,
-      'save_pretrained': _wrapped_save_pretrained
-  }
-  args: ListStr = []
-  anns: DictStrAny = {}
-  lines: ListStr = []
-  globs: DictStrAny = {
-      'cls': cls,
-      '_cached_LLMInterface_get': _object_getattribute.__get__(LLMInterface),
-      '__gen_docstring': _update_docstring
-  }
-  # function initialisation
-  for func, impl in attributes.items():
-    impl_name = f'__wrapped_{func}'
-    globs.update({f'__serialisation_{func}': getattr(openllm.serialisation, func, None), impl_name: impl})
-    cached_func_name = f'_cached_{cls.__name__}_func'
-    if func == 'llm_post_init': func_call = f'_impl_{cls.__name__}_{func}={cached_func_name}'
-    else:
-      func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMInterface_get('{func}') else __serialisation_{func}"
-    lines.extend([
-        f'{cached_func_name}=cls.{func}', func_call,
-        _setattr_class(func, f'{impl_name}(_impl_{cls.__name__}_{func})'),
-    ])
-
-  # assign vllm specific implementation
-  if cls.__llm_implementation__ == 'vllm':
-    globs.update({
-        '_vllm_generate': vllm_generate,
-        '_vllm_postprocess_generate': vllm_postprocess_generate,
-        '_vllm_generate_iterator': vllm_generate_iterator
-    })
-    lines.extend(
-        [_setattr_class(it, f'_vllm_{it}') for it in {'generate', 'postprocess_generate', 'generate_iterator'}])
-
-  # cached attribute initialisation
-  interface_anns = codegen.get_annotations(LLMInterface)
-  for v in {'bentomodel', 'model', 'tokenizer', 'adapter_map'}:
-    lines.append(_setattr_class(f'__llm_{v}__', None))
-    anns[f'__llm_{v}__'] = interface_anns.get(f'__llm_{v}__')
-
-  # boolean to determine whether LLM has defined an implementation for a function
-  for fn in {'generate', 'generate_one', 'generate_iterator', 'embeddings'}:
-    key = f'__llm_supports_{fn}__'
-    lines.extend([
-        _setattr_class(key, f"cls.{fn} is not _cached_LLMInterface_get('{fn}')"), f"__gen_docstring(cls, '{fn}')",
-    ])
-    anns[key] = interface_anns.get(key)
-  return codegen.generate_function(cls, '__assign_llm_attr', lines, args=('cls', *args), globs=globs, annotations=anns)
-
-def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]],
-                              **_: t.Any) -> str:
-  return generation_result[0]['outputs'][0]['text']
-
-def vllm_generate_iterator(self: LLM['vllm.LLMEngine', T],
-                           prompt: str,
-                           /,
-                           *,
-                           echo: bool = False,
-                           stop: str | t.Iterable[str] | None = None,
-                           stop_token_ids: list[int] | None = None,
-                           **attrs: t.Any) -> t.Iterator[dict[str, t.Any]]:
-  request_id: str = attrs.pop('request_id', None)
-  if request_id is None: raise ValueError('request_id must not be None.')
-  if stop_token_ids is None: stop_token_ids = []
-  stop_token_ids.append(self.tokenizer.eos_token_id)
-  stop_ = set()
-  if isinstance(stop, str) and stop != '': stop_.add(stop)
-  elif isinstance(stop, list) and stop != []: stop_.update(stop)
-  for tid in stop_token_ids:
-    if tid: stop_.add(self.tokenizer.decode(tid))
-
-  if self.config['temperature'] <= 1e-5: top_p = 1.0
-  else: top_p = self.config['top_p']
-  config = self.config.model_construct_env(stop=list(stop_), top_p=top_p, **attrs)
-  self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=config.to_sampling_config())
-  while self.model.has_unfinished_requests():
-    for request_output in self.model.step():
-      prompt = request_output.prompt
-      if echo: text_outputs = [prompt + output.text for output in request_output.outputs]
-      else: text_outputs = [output.text for output in request_output.outputs]
-      yield {'text': text_outputs, 'error_code': 0}
-      if request_output.finished: break
-
-def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
-  request_id: str = attrs.pop('request_id', None)
-  if request_id is None: raise ValueError('request_id must not be None.')
-  outputs: list[vllm.RequestOutput] = []
-  # TODO: support prompt_token_ids
-  self.model.add_request(request_id=request_id,
-                         prompt=prompt,
-                         sampling_params=self.config.model_construct_env(**attrs).to_sampling_config())
-  while self.model.has_unfinished_requests():
-    outputs.extend([r for r in self.model.step() if r.finished])
-  return [unmarshal_vllm_outputs(i) for i in outputs]
-
 _AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class('AdaptersTuple', ['adapter_id', 'name', 'config'])
 
 @attr.define(slots=True, repr=False, init=False)
 class LLM(LLMInterface[M, T], ReprMixin):
   if t.TYPE_CHECKING: __name__: str
+  if t.TYPE_CHECKING and not MYPY:
+
+    def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig,
+                                                                                        autogptq.BaseQuantizeConfig]],
+                       model_id: str, model_decls: TupleAny, model_attrs: DictStrAny, tokenizer_attrs: DictStrAny,
+                       tag: bentoml.Tag, adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str],
+                       quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']],
+                       serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any) -> None:
+      '''Generated __attrs_init__ for openllm.LLM.'''
+
   config: LLMConfig
-  '''The config instance to use for this LLM. This will be created based on config_class and available
-    when initialising the LLM.'''
+  '''The config instance to use for this LLM. This will be created based on config_class and available when initialising the LLM.'''
   quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None
   '''Quantisation config for quantised model on the fly.'''
+
   _model_id: str
-  _runtime: t.Literal['ggml', 'transformers']
   _model_decls: TupleAny
   _model_attrs: DictStrAny
   _tokenizer_attrs: DictStrAny
@@ -519,31 +306,28 @@ class LLM(LLMInterface[M, T], ReprMixin):
   _serialisation_format: t.Literal['safetensors', 'legacy']
   _local: bool
 
-  @staticmethod
-  def _infer_implementation_from_name(name: str) -> tuple[LiteralRuntime, str]:
-    if name.startswith('Flax'): return 'flax', name[4:]
-    elif name.startswith('TF'): return 'tf', name[2:]
-    elif name.startswith('VLLM'): return 'vllm', name[4:]
-    else: return 'pt', name
-
   def __init_subclass__(cls: type[LLM[M, T]]) -> None:
     cd = cls.__dict__
-    implementation, config_class_name = cls._infer_implementation_from_name(cls.__name__)
-    cls.__llm_implementation__ = implementation
-    config_class = openllm.AutoConfig.infer_class_from_name(config_class_name)
-    if '__openllm_internal__' in cd:
-      if 'config_class' not in cd: cls.config_class = config_class
-    elif 'config_class' not in cd:
+    if cls.__name__.startswith('Flax'):
+      cls.__llm_backend__, config_class = 'flax', openllm.AutoConfig.infer_class_from_name(cls.__name__[4:])
+    elif cls.__name__.startswith('TF'):
+      cls.__llm_backend__, config_class = 'tf', openllm.AutoConfig.infer_class_from_name(cls.__name__[2:])
+    elif cls.__name__.startswith('VLLM'):
+      cls.__llm_backend__, config_class = 'vllm', openllm.AutoConfig.infer_class_from_name(cls.__name__[4:])
+    else:
+      cls.__llm_backend__, config_class = 'pt', openllm.AutoConfig.infer_class_from_name(cls.__name__)
+    if '__openllm_internal__' not in cd and 'config_class' not in cd:
       raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.")
-    _make_assignment_script(cls)(cls)
-    if 'tokenizer_id' not in cd and cls.__llm_implementation__ == 'vllm': cls.tokenizer_id = _DEFAULT_TOKENIZER
+    if '__openllm_internal__' in cd and 'config_class' not in cd: cls.config_class = config_class
+    if 'tokenizer_id' not in cd and cls.__llm_backend__ == 'vllm': cls.tokenizer_id = _DEFAULT_TOKENIZER
+    make_llm_attributes(cls)(cls)
 
   @overload
   def __getitem__(self, item: t.Literal['trust_remote_code']) -> bool:
     ...
 
   @overload
-  def __getitem__(self, item: t.Literal['implementation']) -> LiteralRuntime:
+  def __getitem__(self, item: t.Literal['backend']) -> LiteralBackend:
     ...
 
   @overload
@@ -554,10 +338,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
   def __getitem__(self, item: t.Literal['tokenizer']) -> T | None:
     ...
 
-  @overload
-  def __getitem__(self, item: t.Literal['bentomodel']) -> bentoml.Model | None:
-    ...
-
   @overload
   def __getitem__(self, item: t.Literal['adapter_map']) -> ResolvedAdaptersMapping | None:
     ...
@@ -586,58 +366,20 @@ class LLM(LLMInterface[M, T], ReprMixin):
     elif hasattr(self, item): return getattr(self, item)
     else: raise KeyError(item)
 
-  @overload
-  @classmethod
-  def from_pretrained(cls,
-                      model_id: str | None = ...,
-                      model_version: str | None = ...,
-                      llm_config: LLMConfig | None = ...,
-                      *args: t.Any,
-                      runtime: t.Literal['ggml', 'transformers'] | None = ...,
-                      quantize: t.Literal['int8', 'int4'] = ...,
-                      bettertransformer: str | bool | None = ...,
-                      adapter_id: str | None = ...,
-                      adapter_name: str | None = ...,
-                      adapter_map: dict[str, str | None] | None = ...,
-                      quantization_config: transformers.BitsAndBytesConfig | None = ...,
-                      serialisation: t.Literal['safetensors', 'legacy'] = ...,
-                      **attrs: t.Any) -> LLM[M, T]:
-    ...
-
-  @overload
-  @classmethod
-  def from_pretrained(cls,
-                      model_id: str | None = ...,
-                      model_version: str | None = ...,
-                      llm_config: LLMConfig | None = ...,
-                      *args: t.Any,
-                      runtime: t.Literal['ggml', 'transformers'] | None = ...,
-                      quantize: t.Literal['gptq'] = ...,
-                      bettertransformer: str | bool | None = ...,
-                      adapter_id: str | None = ...,
-                      adapter_name: str | None = ...,
-                      adapter_map: dict[str, str | None] | None = ...,
-                      quantization_config: autogptq.BaseQuantizeConfig | None = ...,
-                      serialisation: t.Literal['safetensors', 'legacy'] = ...,
-                      **attrs: t.Any) -> LLM[M, T]:
-    ...
-
   @classmethod
   def from_pretrained(cls,
                       model_id: str | None = None,
                       model_version: str | None = None,
                       llm_config: LLMConfig | None = None,
                       *args: t.Any,
-                      runtime: t.Literal['ggml', 'transformers'] | None = None,
                       quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-                      bettertransformer: str | bool | None = None,
                       adapter_id: str | None = None,
                       adapter_name: str | None = None,
                       adapter_map: dict[str, str | None] | None = None,
                       quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
                       serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors',
                       **attrs: t.Any) -> LLM[M, T]:
-    """Instantiate a pretrained LLM.
+    '''Instantiate a pretrained LLM.
 
     ``LLM.from_pretrained`` follows the same design principle as HuggingFace's `from_pretrained` method, plus the following:
 
@@ -646,7 +388,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
     > This is most notable during serving time.
 
     - quantize: quantize the model with the given quantization method. Currently supported int8, int4 quantization
-    - bettertransformer: Apply FasterTransformer to given pretrained weight
 
     > Currently, the above two options are mutually exclusive.
 
@@ -682,17 +423,15 @@ class LLM(LLMInterface[M, T], ReprMixin):
                     will use `config_class` to construct default configuration.
         quantize: The quantization to use for this LLM. Defaults to None. Possible values
                   include int8, int4 and gptq.
-        runtime: Optional runtime to run this LLM. Default to 'transformers'. 'ggml' supports is working in progress.
         quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `autogtpq.BaseQuantizeConfig`) to use. Note that this is mutually exclusive with `quantize`
         serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
                       Default behaviour is similar to ``safe_serialization=False``.
-        bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
         adapter_id: The [LoRA](https://arxiv.org/pdf/2106.09685.pdf) pretrained id or local path to use for this LLM. Defaults to None.
         adapter_name: The adapter name to use for this LLM. Defaults to None.
         adapter_map: The adapter map to use for this LLM. Defaults to None. Note that this is mutually exclusive with adapter_id/adapter_name arguments.
         *args: The args to be passed to the model.
         **attrs: The kwargs to be passed to the model.
-    """
+    '''
     cfg_cls = cls.config_class
     _local = False
     _model_id: str = first_not_none(model_id,
@@ -712,7 +451,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
     if quantization_config is None and quantize is not None:
       quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs)
     if quantize == 'gptq': serialisation = 'safetensors'
-    elif cls.__llm_implementation__ == 'vllm': serialisation = 'legacy'  # Currently working-in-progress
+    elif cls.__llm_backend__ == 'vllm': serialisation = 'legacy'  # Currently working-in-progress
 
     # NOTE: LoRA adapter setup
     if adapter_map and adapter_id:
@@ -749,14 +488,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
                _tag=_tag,
                _serialisation_format=serialisation,
                _local=_local,
-               bettertransformer=str(
-                   first_not_none(bettertransformer,
-                                  os.environ.get(cfg_cls.__openllm_env__['bettertransformer']),
-                                  default=None)).upper() in ENV_VARS_TRUE_VALUES,
-               _runtime=first_not_none(runtime,
-                                       t.cast(t.Optional[t.Literal['ggml', 'transformers']],
-                                              os.environ.get(cfg_cls.__openllm_env__['runtime'])),
-                                       default=cfg_cls.__openllm_runtime__),
                _adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
                **attrs)
 
@@ -765,9 +496,9 @@ class LLM(LLMInterface[M, T], ReprMixin):
   def _generate_tag_str(cls, model_id: str, model_version: str | None) -> str:
     '''Generate a compliant ``bentoml.Tag`` from model_id.
 
-    If model_id is a pretrained_id from HF, then it will have the following format: <framework>-<normalise_model_id>:<revision>
+    If model_id is a pretrained_id from HF, then it will have the following format: <backend>-<normalise_model_id>:<revision>
     If model_id contains the revision itself, then the same format above
-    If model_id is a path, then it will be <framework>-<basename_of_path>:<generated_sha1> if model_version is not passesd, otherwise <framework>-<basename_of_path>:<model_version>
+    If model_id is a path, then it will be <backend>-<basename_of_path>:<generated_sha1> if model_version is not passed, otherwise <backend>-<basename_of_path>:<model_version>
 
     > [!NOTE] here that the generated SHA1 for path cases is that it will be based on last modified time.
 
@@ -788,12 +519,11 @@ class LLM(LLMInterface[M, T], ReprMixin):
       if model_version is not None:
         logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.",
                        maybe_revision[0], model_version)
-      return f'{cls.__llm_implementation__}-{model_name}:{maybe_revision[0]}'
+      return f'{cls.__llm_backend__}-{model_name}:{maybe_revision[0]}'
 
-    tag_name = f'{cls.__llm_implementation__}-{model_name}'
-    if os.environ.get('OPENLLM_USE_LOCAL_LATEST', str(False)).upper() in ENV_VARS_TRUE_VALUES:
-      return bentoml_cattr.unstructure(
-          bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag)
+    tag_name = f'{cls.__llm_backend__}-{model_name}'
+    if openllm_core.utils.check_bool_env('OPENLLM_USE_LOCAL_LATEST', False):
+      return str(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag)
     if validate_is_path(model_id):
       model_id, model_version = resolve_filepath(model_id), first_not_none(model_version,
                                                                            default=generate_hash_from_file(model_id))
@@ -811,14 +541,13 @@ class LLM(LLMInterface[M, T], ReprMixin):
   def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag:
     return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs))
 
-  def __init__(self, *args: t.Any, model_id: str, llm_config: LLMConfig, bettertransformer: bool | None,
+  def __init__(self, *args: t.Any, model_id: str, llm_config: LLMConfig,
                quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
                _adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag,
-               _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, _runtime: t.Literal['ggml', 'transformers'],
-               _model_version: str, _serialisation_format: t.Literal['safetensors',
-                                                                     'legacy'], _local: bool, **attrs: t.Any,
+               _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, _model_version: str,
+               _serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any,
               ):
-    """Initialize the LLM with given pretrained model.
+    '''Initialize the LLM with given pretrained model.
 
     > [!WARNING]
     > To initializing any LLM, you should use `openllm.AutoLLM` or `openllm.LLM.from_pretrained` instead.
@@ -896,15 +625,14 @@ class LLM(LLMInterface[M, T], ReprMixin):
         model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used.
         llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
                     will use `config_class` to construct default configuration.
-        bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
         quantization_config: ``transformers.BitsAndBytesConfig`` configuration, or 'gptq' denoting this model to be loaded with GPTQ.
         *args: The args to be passed to the model.
         **attrs: The kwargs to be passed to the model.
-    """
+    '''
     # low_cpu_mem_usage is only available for model
     # this is helpful on system with low memory to avoid OOM
     low_cpu_mem_usage = attrs.pop('low_cpu_mem_usage', True)
-    if self.__llm_implementation__ == 'pt':
+    if self.__llm_backend__ == 'pt':
       attrs.update({'low_cpu_mem_usage': low_cpu_mem_usage, 'quantization_config': quantization_config})
     model_kwds: DictStrAny = {}
     tokenizer_kwds: DictStrAny = {}
@@ -915,25 +643,15 @@ class LLM(LLMInterface[M, T], ReprMixin):
     # parsing tokenizer and model kwargs, as the hierachy is param pass > default
     normalized_model_kwds, normalized_tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(**attrs)
     # NOTE: Save the args and kwargs for latter load
-    self.__attrs_init__(llm_config, quantization_config, model_id, _runtime, args, {
+    self.__attrs_init__(llm_config, quantization_config, model_id, args, {
         **model_kwds,
         **normalized_model_kwds
     }, {
         **tokenizer_kwds,
         **normalized_tokenizer_kwds
     }, _tag, _adapters_mapping, _model_version, _quantize_method, _serialisation_format, _local)
-    # handle trust_remote_code
-    _from_env = os.getenv('TRUST_REMOTE_CODE', None)
-    self.__llm_trust_remote_code__ = first_not_none(
-        str(_from_env).upper() in ENV_VARS_TRUE_VALUES if _from_env else None,
-        default=self._model_attrs.pop('trust_remote_code', self.config['trust_remote_code']))
 
     self.llm_post_init()
-    # we set it here so that we allow subclass to overwrite bettertransformer in llm_post_init
-    if bettertransformer is True: self.bettertransformer = bettertransformer
-    else: non_intrusive_setattr(self, 'bettertransformer', self.config['bettertransformer'])
-    # If lora is passed, the disable bettertransformer
-    if _adapters_mapping and self.bettertransformer is True: self.bettertransformer = False
 
   def __setattr__(self, attr: str, value: t.Any) -> None:
     if attr in _reserved_namespace:
@@ -942,6 +660,11 @@ class LLM(LLMInterface[M, T], ReprMixin):
       )
     super().__setattr__(attr, value)
 
+  @property
+  def trust_remote_code(self) -> bool:
+    return first_not_none(openllm_core.utils.check_bool_env('TRUST_REMOTE_CODE'),
+                          default=self.config['trust_remote_code'])
+
   @property
   def adapters_mapping(self) -> AdaptersMapping | None:
     return self._adapters_mapping
@@ -952,21 +675,18 @@ class LLM(LLMInterface[M, T], ReprMixin):
 
   @property
   def __repr_keys__(self) -> set[str]:
-    return {'model_id', 'runner_name', 'config', 'adapters_mapping', 'runtime', 'tag'}
+    return {'model_id', 'runner_name', 'config', 'adapters_mapping', 'tag'}
 
   def __repr_args__(self) -> ReprArgs:
     for k in self.__repr_keys__:
       if k == 'config': yield k, self.config.model_dump(flatten=True)
       else: yield k, getattr(self, k)
+    yield 'backend', self.__llm_backend__
 
   @property
   def model_id(self) -> str:
     return self._model_id
 
-  @property
-  def runtime(self) -> t.Literal['ggml', 'transformers']:
-    return self._runtime
-
   @property
   def runner_name(self) -> str:
     return f"llm-{self.config['start_name']}-runner"
@@ -995,15 +715,13 @@ class LLM(LLMInterface[M, T], ReprMixin):
     return openllm.import_model(self.config['start_name'],
                                 model_id=self.model_id,
                                 model_version=self._model_version,
-                                runtime=self.runtime,
-                                implementation=self.__llm_implementation__,
+                                backend=self.__llm_backend__,
                                 quantize=self._quantize_method,
                                 serialisation_format=self._serialisation_format)
 
   @property
   def _bentomodel(self) -> bentoml.Model:
-    if self.__llm_bentomodel__ is None: self.__llm_bentomodel__ = openllm.serialisation.get(self)
-    return self.__llm_bentomodel__
+    return openllm.serialisation.get(self, auto_import=True)
 
   def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
     '''This handler will sanitize all attrs and setup prompt text.
@@ -1024,7 +742,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
     > [!NOTE]
     > This will be used from the client side.
     '''
-    if isinstance(generation_result, dict): return generation_result['text']
+    if isinstance(generation_result, dict) and 'text' in generation_result: return generation_result['text']
     return self.config.postprocess_generate(prompt, generation_result, **attrs)
 
   @property
@@ -1036,7 +754,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
     if self.__llm_model__ is None:
       model = self.load_model(*self._model_decls, **self._model_attrs)
       # If OOM, then it is probably you don't have enough VRAM to run this model.
-      if self.__llm_implementation__ == 'pt' and is_torch_available():
+      if self.__llm_backend__ == 'pt' and is_torch_available():
         loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(
             model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False)
         if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
@@ -1055,12 +773,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
     if self.__llm_tokenizer__ is None: self.__llm_tokenizer__ = self.load_tokenizer(**self._tokenizer_attrs)
     return self.__llm_tokenizer__
 
-  def _default_ft_config(self, _adapter_type: AdapterType, inference_mode: bool) -> FineTuneConfig:
-    strategy = first_not_none(self.config['fine_tune_strategies'].get(_adapter_type),
-                              default=FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type),
-                                                     llm_config_class=self.config_class))
-    return strategy.eval() if inference_mode else strategy.train()
-
   def _transpose_adapter_mapping(self, inference_mode: bool = True, use_cache: bool = True) -> ResolvedAdaptersMapping:
     if self._adapters_mapping is None: raise ValueError('LoRA mapping is not set up correctly.')
     # early out if we already serialized everything.
@@ -1072,7 +784,10 @@ class LLM(LLMInterface[M, T], ReprMixin):
     # then we will raise Error when the optional_name is set to None in next iteration.
     _converted_first_none = False
     for _adapter_type, _adapters_tuples in self._adapters_mapping.items():
-      default_config = self._default_ft_config(_adapter_type, inference_mode)
+      strategy = first_not_none(self.config['fine_tune_strategies'].get(_adapter_type),
+                                default=FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type),
+                                                       llm_config_class=self.config_class))
+      default_config = strategy.eval() if inference_mode else strategy.train()
       for adapter in _adapters_tuples:
         if not adapter.name and _converted_first_none:
           raise ValueError(
@@ -1173,9 +888,8 @@ class LLM(LLMInterface[M, T], ReprMixin):
         # BUG: This hits during inference, need fixing
         model = peft_class.from_pretrained(self.__llm_model__, peft_model_id, **kwargs)
       else:
-        model = peft_class(
-            self.__llm_model__,
-            default_config)  # in this case, the given base_model_name_or_path is None. This will be hit during training
+        # in this case, the given base_model_name_or_path is None. This will be hit during training
+        model = peft_class(self.__llm_model__, default_config)
     return model
 
   # order of these fields matter here, make sure to sync it with
@@ -1186,7 +900,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
       max_batch_size: int | None = None,
       max_latency_ms: int | None = None,
       scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy) -> LLMRunner[M, T]:
-    """Convert this LLM into a Runner.
+    '''Convert this LLM into a Runner.
 
     Args:
       models: Any additional ``bentoml.Model`` to be included in this given models.
@@ -1205,7 +919,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
     > - 'name': will be generated by OpenLLM, hence users don't shouldn't worry about this. The generated name will be 'llm-<model-start-name>-runner' (ex: llm-dolly-v2-runner, llm-chatglm-runner)
     > - 'embedded': Will be disabled by default. There is no reason to run LLM in embedded mode.
     > - 'method_configs': The method configs for the runner will be managed internally by OpenLLM.
-    """
+    '''
     models = models if models is not None else []
 
     try:
@@ -1213,10 +927,9 @@ class LLM(LLMInterface[M, T], ReprMixin):
     except bentoml.exceptions.NotFound as err:
       raise RuntimeError(f'Failed to locate {self._bentomodel}:{err}') from None
 
-    generate_sig = ModelSignature.from_dict(t.cast('_ModelSignatureDict', ModelSignatureDict(batchable=False)))
-    embeddings_sig = ModelSignature.from_dict(
-        t.cast('_ModelSignatureDict', ModelSignatureDict(batchable=True, batch_dim=0)))
-    generate_iterator_sig = ModelSignature.from_dict(t.cast('_ModelSignatureDict', ModelSignatureDict(batchable=False)))
+    generate_sig = ModelSignature.from_dict(ModelSignatureDict(batchable=False))
+    embeddings_sig = ModelSignature.from_dict(ModelSignatureDict(batchable=True, batch_dim=0))
+    generate_iterator_sig = ModelSignature.from_dict(ModelSignatureDict(batchable=False))
 
     # NOTE: returning the two langchain API's to the runner
     return llm_runner_class(self)(llm_runnable_class(self, embeddings_sig, generate_sig, generate_iterator_sig),
@@ -1232,8 +945,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
                                       'generate_one': generate_sig,
                                       'generate_iterator': generate_iterator_sig
                                   }),
-                                  scheduling_strategy=scheduling_strategy,
-                                 )
+                                  scheduling_strategy=scheduling_strategy)
 
   # NOTE: Scikit API
   def predict(self, prompt: str, **attrs: t.Any) -> t.Any:
@@ -1406,7 +1118,6 @@ def Runner(model_name: str,
            max_batch_size: int | None = ...,
            max_latency_ms: int | None = ...,
            method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = ...,
-           embedded: t.Literal[True, False] = ...,
            scheduling_strategy: type[bentoml.Strategy] | None = ...,
            **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
   ...
@@ -1414,9 +1125,9 @@ def Runner(model_name: str,
 @overload
 def Runner(model_name: str,
            *,
-           ensure_available: bool | None = None,
+           ensure_available: bool = ...,
            init_local: bool = ...,
-           implementation: LiteralRuntime | None = None,
+           backend: LiteralBackend | None = None,
            llm_config: LLMConfig | None = None,
            **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
   ...
@@ -1427,9 +1138,7 @@ def Runner(model_name: str,
            model_id: str | None = ...,
            model_version: str | None = ...,
            llm_config: LLMConfig | None = ...,
-           runtime: t.Literal['ggml', 'transformers'] | None = ...,
            quantize: t.Literal['int8', 'int4', 'gptq'] | None = ...,
-           bettertransformer: str | bool | None = ...,
            adapter_id: str | None = ...,
            adapter_name: str | None = ...,
            adapter_map: dict[str, str | None] | None = ...,
@@ -1439,12 +1148,12 @@ def Runner(model_name: str,
   ...
 
 def Runner(model_name: str,
-           ensure_available: bool | None = None,
+           ensure_available: bool = False,
            init_local: bool = False,
-           implementation: LiteralRuntime | None = None,
+           backend: LiteralBackend | None = None,
            llm_config: LLMConfig | None = None,
            **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
-  """Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'.
+  '''Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'.
 
   The behaviour of ensure_available that is synonymous to `AutoLLM.for_model` depends on `init_local`.
   By default, `ensure_available` is synonymous to `init_local`, meaning on the service when creating
@@ -1466,38 +1175,33 @@ def Runner(model_name: str,
   Args:
     model_name: Supported model name from 'openllm models'
     ensure_available: If True, it will download the model if it is not available. If False, it will skip downloading the model.
-    If False, make sure the model is available locally.
-    implementation: The given Runner implementation one choose for this Runner. By default, it is retrieved from the enviroment variable
-    of the respected model_name. For example: 'flan-t5' -> "OPENLLM_FLAN_T5_FRAMEWORK"
+                      If False, make sure the model is available locally.
+    backend: The Runner implementation (backend) to use for this Runner. If `OPENLLM_BACKEND` is set, it will be respected.
     llm_config: Optional ``openllm.LLMConfig`` to initialise this ``openllm.LLMRunner``.
-    init_local: If True, it will initialize the model locally. This is useful if you want to
-    run the model locally. (Symmetrical to bentoml.Runner.init_local())
-    **attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs
-    behaviour
-  """
+    init_local: If True, it will initialize the model locally. This is useful if you want to run the model locally. (Symmetrical to bentoml.Runner.init_local())
+    **attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs behaviour
+  '''
   if llm_config is not None:
     attrs.update({
         'model_id':
             llm_config['env']['model_id_value'],
-        'bettertransformer':
-            llm_config['env']['bettertransformer_value'],
         'quantize':
             llm_config['env']['quantize_value'],
-        'runtime':
-            llm_config['env']['runtime_value'],
         'serialisation':
             first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors')
     })
 
-  default_implementation = llm_config.default_implementation() if llm_config is not None else 'pt'
-  implementation = t.cast(
-      LiteralRuntime,
-      first_not_none(implementation, default=EnvVarMixin(model_name, default_implementation)['framework_value']))
-  runner = infer_auto_class(implementation).create_runner(
-      model_name,
-      llm_config=llm_config,
-      ensure_available=ensure_available if ensure_available is not None else init_local,
-      **attrs)
+  backend = t.cast(
+      LiteralBackend,
+      first_not_none(backend,
+                     default=EnvVarMixin(
+                         model_name,
+                         backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value']))
+  if init_local: ensure_available = True
+  runner = infer_auto_class(backend).create_runner(model_name,
+                                                   llm_config=llm_config,
+                                                   ensure_available=ensure_available,
+                                                   **attrs)
   if init_local: runner.init_local(quiet=True)
   return runner
 
@@ -1514,12 +1218,11 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
   class _Runnable(bentoml.Runnable):
     SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
     SUPPORTS_CPU_MULTI_THREADING = True
-    framework = self.__llm_implementation__
+    backend = self.__llm_backend__
 
     def __init__(__self: _Runnable):
-      # NOTE: The side effect of this line
-      # is that it will load the imported model during
-      # runner startup. So don't remove it!!
+      # NOTE: The side effect of this line is that it will load the
+      # imported model during runner startup. So don't remove it!!
       if not self.model: raise RuntimeError('Failed to load the model correctly (See traceback above)')
       if self.adapters_mapping is not None:
         logger.info('Applying LoRA to %s...', self.runner_name)
@@ -1531,37 +1234,37 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
       if adapter_name != 'default': self.model.set_adapter(adapter_name)
       logger.info('Successfully apply LoRA layer %s', adapter_name)
 
-    @bentoml.Runnable.method(**method_signature(embeddings_sig))
-    def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]:
+    @bentoml.Runnable.method(**method_signature(embeddings_sig))  # type: ignore
+    def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[EmbeddingsOutput]:
       return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)]
 
-    @bentoml.Runnable.method(**method_signature(generate_sig))
+    @bentoml.Runnable.method(**method_signature(generate_sig))  # type: ignore
     def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
       adapter_name = attrs.pop('adapter_name', None)
       if adapter_name is not None: __self.set_adapter(adapter_name)
       return self.generate(prompt, **attrs)
 
-    @bentoml.Runnable.method(**method_signature(generate_sig))
+    @bentoml.Runnable.method(**method_signature(generate_sig))  # type: ignore
     def generate(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
       adapter_name = attrs.pop('adapter_name', None)
       if adapter_name is not None: __self.set_adapter(adapter_name)
-      if __self.framework == 'vllm': attrs.setdefault('request_id', openllm_core.utils.gen_random_uuid())
+      if __self.backend == 'vllm': attrs.setdefault('request_id', openllm_core.utils.gen_random_uuid())
       return self.generate(prompt, **attrs)
 
-    @bentoml.Runnable.method(**method_signature(generate_sig))
+    @bentoml.Runnable.method(**method_signature(generate_sig))  # type: ignore
     def generate_one(__self: _Runnable, prompt: str, stop: list[str],
                      **attrs: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]:
       adapter_name = attrs.pop('adapter_name', None)
       if adapter_name is not None: __self.set_adapter(adapter_name)
       return self.generate_one(prompt, stop, **attrs)
 
-    @bentoml.Runnable.method(**method_signature(generate_iterator_sig))
+    @bentoml.Runnable.method(**method_signature(generate_iterator_sig))  # type: ignore
     def generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.Generator[str, None, str]:
       adapter_name = attrs.pop('adapter_name', None)
       if adapter_name is not None: __self.set_adapter(adapter_name)
       pre = 0
       for outputs in self.generate_iterator(prompt, request_id=openllm_core.utils.gen_random_uuid(), **attrs):
-        output_text = outputs['text'][0] if __self.framework == 'vllm' else outputs['text']
+        output_text = outputs['text'][0] if __self.backend == 'vllm' else outputs['text']
         output_text = output_text.strip().split(' ')
         now = len(output_text) - 1
         if now > pre:
@@ -1609,20 +1312,20 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
     prompt, generate_kwargs, postprocess_kwargs = self.sanitize_parameters(prompt, **kwargs)
     return self.postprocess_generate(prompt, __self.generate.run(prompt, **generate_kwargs), **postprocess_kwargs)
 
-  def _wrapped_embeddings_run(__self: LLMRunner[M, T], prompt: str | list[str]) -> LLMEmbeddings:
-    """``llm.embed`` is a light wrapper around runner.embeedings.run().
+  def _wrapped_embeddings_run(__self: LLMRunner[M, T], prompt: str | list[str]) -> EmbeddingsOutput:
+    '''``llm.embed`` is a light wrapper around runner.embeddings.run().
 
     Usage:
 
     ```python
-    runner = openllm.Runner('llama', implementation='pt')
+    runner = openllm.Runner('llama', backend='pt')
     runner.embed("What is the meaning of life?")
     ```
-    """
+    '''
     return __self.embeddings.run([prompt] if isinstance(prompt, str) else prompt)
 
   def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]:
-    return {'config', 'llm_type', 'runner_methods', 'runtime', 'llm_tag'}
+    return {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'}
 
   def _wrapped_repr_args(__self: LLMRunner[M, T]) -> ReprArgs:
     yield 'runner_methods', {
@@ -1633,19 +1336,17 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
     }
     yield 'config', self.config.model_dump(flatten=True)
     yield 'llm_type', __self.llm_type
-    yield 'runtime', self.runtime
+    yield 'backend', self.__llm_backend__
     yield 'llm_tag', self.tag
-    yield 'llm_framework', self.__llm_implementation__
 
   return types.new_class(self.__class__.__name__ + 'Runner', (bentoml.Runner,),
                          exec_body=lambda ns: ns.update({
                              'llm_type': self.llm_type,
                              'identifying_params': self.identifying_params,
-                             'llm_framework': self.__llm_implementation__,
                              'llm_tag': self.tag,
                              'llm': self,
                              'config': self.config,
-                             'implementation': self.__llm_implementation__,
+                             'backend': self.__llm_backend__,
                              'peft_adapters': property(fget=available_adapters),
                              'download_model': self.ensure_model_id_exists,
                              '__call__': _wrapped_generate_run,
@@ -1660,4 +1361,4 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
                              'has_adapters': self._adapters_mapping is not None
                          }))
 
-__all__ = ['LLMRunner', 'LLMRunnable', 'Runner', 'LLM', 'llm_runner_class', 'llm_runnable_class', 'LLMEmbeddings']
+__all__ = ['LLMRunner', 'LLMRunnable', 'Runner', 'LLM', 'llm_runner_class', 'llm_runnable_class', 'EmbeddingsOutput']
diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py
index 129647f5..8d910ad3 100644
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -78,7 +78,7 @@ async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[s
              'model_id': runner.llm.model_id,
              'timeout': 3600,
              'model_name': llm_config['model_name'],
-             'framework': runner.llm_framework,
+             'backend': runner.backend,
              'configuration': '',
              'supports_embeddings': runner.supports_embeddings,
              'supports_hf_agent': runner.supports_hf_agent
@@ -86,7 +86,7 @@ async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[s
 def metadata_v1(_: str) -> openllm.MetadataOutput:
   return openllm.MetadataOutput(timeout=llm_config['timeout'],
                                 model_name=llm_config['model_name'],
-                                framework=llm_config['env']['framework_value'],
+                                backend=llm_config['env']['backend_value'],
                                 model_id=runner.llm.model_id,
                                 configuration=llm_config.model_dump_json().decode(),
                                 supports_embeddings=runner.supports_embeddings,
diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py
index 32f15de4..f56dfc56 100644
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -86,17 +86,17 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
     packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}")
 
   env = llm.config['env']
-  framework_envvar = env['framework_value']
-  if framework_envvar == 'flax':
+  backend_envvar = env['backend_value']
+  if backend_envvar == 'flax':
     if not openllm_core.utils.is_flax_available():
-      raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'")
+      raise ValueError(f"Flax is not available, while {env.backend} is set to 'flax'")
     packages.extend(
         [importlib.metadata.version('flax'),
          importlib.metadata.version('jax'),
          importlib.metadata.version('jaxlib')])
-  elif framework_envvar == 'tf':
+  elif backend_envvar == 'tf':
     if not openllm_core.utils.is_tf_available():
-      raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'")
+      raise ValueError(f"TensorFlow is not available, while {env.backend} is set to 'tf'")
     candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu',
                   'intel-tensorflow', 'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos',
                  )
@@ -125,21 +125,22 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
   return PythonOptions(packages=packages,
                        wheels=wheels,
                        lock_packages=False,
-                       extra_index_url=['https://download.pytorch.org/whl/cu118'])
+                       extra_index_url=[
+                           'https://download.pytorch.org/whl/cu118',
+                           'https://huggingface.github.io/autogptq-index/whl/cu118/'
+                       ])
 
 def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float,
-                             quantize: LiteralString | None, bettertransformer: bool | None,
-                             adapter_map: dict[str, str | None] | None, dockerfile_template: str | None,
-                             runtime: t.Literal['ggml', 'transformers'], serialisation_format: t.Literal['safetensors',
-                                                                                                         'legacy'],
+                             quantize: LiteralString | None, adapter_map: dict[str, str | None] | None,
+                             dockerfile_template: str | None, serialisation_format: t.Literal['safetensors', 'legacy'],
                              container_registry: LiteralContainerRegistry,
                              container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
   from openllm.cli._factory import parse_config_options
   environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
   env: openllm_core.utils.EnvVarMixin = llm.config['env']
-  if env['framework_value'] == 'vllm': serialisation_format = 'legacy'
+  if env['backend_value'] == 'vllm': serialisation_format = 'legacy'
   env_dict = {
-      env.framework: env['framework_value'],
+      env.backend: env['backend_value'],
       env.config: f"'{llm.config.model_dump_json().decode()}'",
       env.model_id: f'/home/bentoml/bento/models/{llm.tag.path()}',
       'OPENLLM_MODEL': llm.config['model_name'],
@@ -152,14 +153,9 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
   if adapter_map: env_dict['BITSANDBYTES_NOWELCOME'] = os.environ.get('BITSANDBYTES_NOWELCOME', '1')
 
   # We need to handle None separately here, as env from subprocess doesn't accept None value.
-  _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'],
-                                        bettertransformer=bettertransformer,
-                                        quantize=quantize,
-                                        runtime=runtime)
+  _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], quantize=quantize)
 
-  env_dict[_env.bettertransformer] = str(_env['bettertransformer_value'])
   if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value'])
-  env_dict[_env.runtime] = _env['runtime_value']
   return DockerOptions(
       base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}',
       env=env_dict,
@@ -218,21 +214,19 @@ def create_bento(bento_tag: bentoml.Tag,
                  llm: openllm.LLM[t.Any, t.Any],
                  workers_per_resource: str | float,
                  quantize: LiteralString | None,
-                 bettertransformer: bool | None,
                  dockerfile_template: str | None,
                  adapter_map: dict[str, str | None] | None = None,
                  extra_dependencies: tuple[str, ...] | None = None,
-                 runtime: t.Literal['ggml', 'transformers'] = 'transformers',
                  serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
                  container_registry: LiteralContainerRegistry = 'ecr',
                  container_version_strategy: LiteralContainerVersionStrategy = 'release',
                  _bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
                  _model_store: ModelStore = Provide[BentoMLContainer.model_store]) -> bentoml.Bento:
-  framework_envvar = llm.config['env']['framework_value']
+  backend_envvar = llm.config['env']['backend_value']
   labels = dict(llm.identifying_params)
   labels.update({
       '_type': llm.llm_type,
-      '_framework': framework_envvar,
+      '_framework': backend_envvar,
       'start_name': llm.config['start_name'],
       'base_name_or_path': llm.model_id,
       'bundler': 'openllm.bundle'
@@ -265,8 +259,8 @@ def create_bento(bento_tag: bentoml.Tag,
                                   python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
                                   models=[llm_spec],
                                   docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize,
-                                                                  bettertransformer, adapter_map, dockerfile_template,
-                                                                  runtime, serialisation_format, container_registry,
+                                                                  adapter_map, dockerfile_template,
+                                                                  serialisation_format, container_registry,
                                                                   container_version_strategy))
 
   bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))
diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py
index 735a67a1..10a8dd97 100644
--- a/openllm-python/src/openllm/bundle/oci/__init__.py
+++ b/openllm-python/src/openllm/bundle/oci/__init__.py
@@ -94,7 +94,7 @@ class RefResolver:
   git_hash: str = attr.field()
   version: openllm_core.utils.VersionInfo = attr.field(converter=_convert_version_from_string)
   strategy: LiteralContainerVersionStrategy = attr.field()
-  _ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO)
+  _ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
 
   @classmethod
   def _nightly_ref(cls) -> RefTuple:
diff --git a/openllm-python/src/openllm/cli/_factory.py b/openllm-python/src/openllm/cli/_factory.py
index 974b8d84..195a6143 100644
--- a/openllm-python/src/openllm/cli/_factory.py
+++ b/openllm-python/src/openllm/cli/_factory.py
@@ -16,12 +16,15 @@ from click.shell_completion import CompletionItem
 
 import bentoml
 import openllm
+import openllm_core
 
 from bentoml._internal.configuration.containers import BentoMLContainer
 from openllm_core._typing_compat import Concatenate
 from openllm_core._typing_compat import DictStrAny
+from openllm_core._typing_compat import LiteralBackend
 from openllm_core._typing_compat import LiteralString
 from openllm_core._typing_compat import ParamSpec
+from openllm_core._typing_compat import get_literal_args
 from openllm_core.utils import DEBUG
 
 from . import termui
@@ -147,14 +150,12 @@ Available official model_id(s): [default: {llm_config['default_id']}]
   @click.pass_context
   def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None,
                 workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...],
-                quantize: t.Literal['int8', 'int4', 'gptq'] | None, bettertransformer: bool | None,
-                runtime: t.Literal['ggml', 'transformers'], fast: bool, serialisation_format: t.Literal['safetensors',
-                                                                                                        'legacy'],
-                cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
+                quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend,
+                serialisation_format: t.Literal['safetensors', 'legacy'], cors: bool, adapter_id: str | None,
+                return_process: bool, **attrs: t.Any,
                ) -> LLMConfig | subprocess.Popen[bytes]:
-    fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
-    if serialisation_format == 'safetensors' and quantize is not None and os.environ.get(
-        'OPENLLM_SERIALIZATION_WARNING', str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
+    if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env(
+        'OPENLLM_SERIALIZATION_WARNING'):
       termui.echo(
           f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
           fg='yellow')
@@ -184,20 +185,18 @@ Available official model_id(s): [default: {llm_config['default_id']}]
 
     # Create a new model env to work with the envvar during CLI invocation
     env = openllm.utils.EnvVarMixin(config['model_name'],
-                                    config.default_implementation(),
+                                    backend,
                                     model_id=model_id or config['default_id'],
-                                    bettertransformer=bettertransformer,
-                                    quantize=quantize,
-                                    runtime=runtime)
-    prerequisite_check(ctx, config, quantize, adapter_map, int(1 / wpr))
+                                    quantize=quantize)
+    requirements = llm_config['requirements']
+    if requirements is not None and len(requirements) > 0:
+      missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
+      if len(missing_requirements) > 0:
+        termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')
 
     # NOTE: This is to set current configuration
     start_env = os.environ.copy()
     start_env = parse_config_options(config, server_timeout, wpr, device, cors, start_env)
-    if fast:
-      termui.echo(
-          f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'",
-          fg='yellow')
 
     start_env.update({
         'OPENLLM_MODEL': model,
@@ -205,21 +204,18 @@ Available official model_id(s): [default: {llm_config['default_id']}]
         'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()),
         'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(),
         'OPENLLM_SERIALIZATION': serialisation_format,
-        env.runtime: env['runtime_value'],
-        env.framework: env['framework_value']
+        env.backend: env['backend_value']
     })
     if env['model_id_value']: start_env[env.model_id] = str(env['model_id_value'])
-    # NOTE: quantize and bettertransformer value is already assigned within env
-    if bettertransformer is not None: start_env[env.bettertransformer] = str(env['bettertransformer_value'])
     if quantize is not None: start_env[env.quantize] = str(t.cast(str, env['quantize_value']))
 
-    llm = openllm.utils.infer_auto_class(env['framework_value']).for_model(model,
-                                                                           model_id=start_env[env.model_id],
-                                                                           model_version=model_version,
-                                                                           llm_config=config,
-                                                                           ensure_available=not fast,
-                                                                           adapter_map=adapter_map,
-                                                                           serialisation=serialisation_format)
+    llm = openllm.utils.infer_auto_class(env['backend_value']).for_model(model,
+                                                                         model_id=start_env[env.model_id],
+                                                                         model_version=model_version,
+                                                                         llm_config=config,
+                                                                         ensure_available=True,
+                                                                         adapter_map=adapter_map,
+                                                                         serialisation=serialisation_format)
     start_env.update({env.config: llm.config.model_dump_json().decode()})
 
     server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer(
@@ -268,21 +264,6 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, *
 
   return noop
 
-def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None,
-                       adapter_map: dict[str, str | None] | None, num_workers: int) -> None:
-  if adapter_map and not openllm.utils.is_peft_available():
-    ctx.fail(
-        "Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'")
-  if quantize and llm_config.default_implementation() == 'vllm':
-    ctx.fail(
-        f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization."
-    )
-  requirements = llm_config['requirements']
-  if requirements is not None and len(requirements) > 0:
-    missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
-    if len(missing_requirements) > 0:
-      termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')
-
 def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
 
   def wrapper(fn: FC) -> t.Callable[[FC], FC]:
@@ -291,22 +272,21 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
         cog.optgroup.group(
             'General LLM Options',
             help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
-        model_id_option(factory=cog.optgroup, model_env=llm_config['env']), model_version_option(factory=cog.optgroup),
+        model_id_option(factory=cog.optgroup), model_version_option(factory=cog.optgroup),
         cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
         workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup),
-        fast_option(factory=cog.optgroup),
+        backend_option(factory=cog.optgroup),
         cog.optgroup.group('LLM Optimization Options',
                            help='''Optimization related options.
 
-            OpenLLM supports running model with [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/),
-            k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
+            OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
 
             The following are either in our roadmap or currently being worked on:
 
             - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
             - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
             ''',
-                          ),
+                          ), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup),
         cog.optgroup.option('--device',
                             type=openllm.utils.dantic.CUDA,
                             multiple=True,
@@ -314,13 +294,6 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
                             callback=parse_device_callback,
                             help=f"Assign GPU devices (if available) for {llm_config['model_name']}.",
                             show_envvar=True),
-        cog.optgroup.option('--runtime',
-                            type=click.Choice(['ggml', 'transformers']),
-                            default='transformers',
-                            help='The runtime to use for the given model. Default is transformers.'),
-        quantize_option(factory=cog.optgroup, model_env=llm_config['env']),
-        bettertransformer_option(factory=cog.optgroup, model_env=llm_config['env']),
-        serialisation_option(factory=cog.optgroup),
         cog.optgroup.group('Fine-tuning related options',
                            help='''\
     Note that the argument `--adapter-id` can accept the following format:
@@ -439,18 +412,6 @@ def output_option(f: _AnyCallable | None = None,
                     shell_complete=complete_output_var,
                     **attrs)(f)
 
-def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option('--fast/--no-fast',
-                    show_default=True,
-                    default=False,
-                    envvar='OPENLLM_USE_LOCAL_LATEST',
-                    show_envvar=True,
-                    help='''Whether to skip checking if models is already in store.
-
-                                                                                                          This is useful if you already downloaded or setup the model beforehand.
-                                                                                                          ''',
-                    **attrs)(f)
-
 def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option('--cors/--no-cors',
                     show_default=True,
@@ -463,15 +424,12 @@ def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC
 def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f)
 
-def model_id_option(f: _AnyCallable | None = None,
-                    *,
-                    model_env: openllm.utils.EnvVarMixin | None = None,
-                    **attrs: t.Any) -> t.Callable[[FC], FC]:
+def model_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option('--model-id',
                     type=click.STRING,
                     default=None,
-                    envvar=model_env.model_id if model_env is not None else None,
-                    show_envvar=model_env is not None,
+                    envvar='OPENLLM_MODEL_ID',
+                    show_envvar=True,
                     help='Optional model_id name or path for (fine-tune) weight.',
                     **attrs)(f)
 
@@ -483,24 +441,31 @@ def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
       help='Optional model version to save for this model. It will be inferred automatically from model-id.',
       **attrs)(f)
 
+def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
+  # NOTE: The last two items of LiteralBackend (ggml and mlc) are dropped as those backends are still WIP.
+  # XXX: Remove the [:-2] slice once ggml and mlc are supported.
+  return cli_option('--backend',
+                    type=click.Choice(get_literal_args(LiteralBackend)[:-2]),
+                    default='pt',
+                    envvar='OPENLLM_BACKEND',
+                    show_envvar=True,
+                    help='The implementation for saving this LLM.',
+                    **attrs)(f)
+
 def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_argument('model_name',
                       type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]),
                       required=required,
                       **attrs)(f)
 
-def quantize_option(f: _AnyCallable | None = None,
-                    *,
-                    build: bool = False,
-                    model_env: openllm.utils.EnvVarMixin | None = None,
-                    **attrs: t.Any) -> t.Callable[[FC], FC]:
+def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option('--quantise',
                     '--quantize',
                     'quantize',
                     type=click.Choice(['int8', 'int4', 'gptq']),
                     default=None,
-                    envvar=model_env.quantize if model_env is not None else None,
-                    show_envvar=model_env is not None,
+                    envvar='OPENLLM_QUANTIZE',
+                    show_envvar=True,
                     help='''Dynamic quantization for running this LLM.
 
       The following quantization strategies are supported:
@@ -542,21 +507,6 @@ def workers_per_resource_option(f: _AnyCallable | None = None,
       > ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ''),
                     **attrs)(f)
 
-def bettertransformer_option(f: _AnyCallable | None = None,
-                             *,
-                             build: bool = False,
-                             model_env: openllm.utils.EnvVarMixin | None = None,
-                             **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--bettertransformer',
-      is_flag=True,
-      default=None,
-      envvar=model_env.bettertransformer if model_env is not None else None,
-      show_envvar=model_env is not None,
-      help='Apply FasterTransformer wrapper to serve model. This will applies during serving time.' if not build else
-      'Set default environment variable whether to serve this model with FasterTransformer in build time.',
-      **attrs)(f)
-
 def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option('--serialisation',
                     '--serialization',
@@ -586,22 +536,18 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
                     **attrs)(f)
 
 def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option('--container-registry',
-                    'container_registry',
-                    type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
-                    default='ecr',
-                    show_default=True,
-                    show_envvar=True,
-                    envvar='OPENLLM_CONTAINER_REGISTRY',
-                    callback=container_registry_callback,
-                    help='''The default container registry to get the base image for building BentoLLM.
-
-      Currently, it supports 'ecr', 'ghcr.io', 'docker.io'
-
-      \b
-      > [!NOTE] that in order to build the base image, you will need a GPUs to compile custom kernel. See ``openllm ext build-base-container`` for more information.
-      ''',
-                    **attrs)(f)
+  return cli_option(
+      '--container-registry',
+      'container_registry',
+      type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
+      default='ecr',
+      show_default=True,
+      show_envvar=True,
+      envvar='OPENLLM_CONTAINER_REGISTRY',
+      callback=container_registry_callback,
+      help=
+      'The default container registry to get the base image for building BentoLLM. Currently, it supports ecr, ghcr, docker',
+      **attrs)(f)
 
 _wpr_strategies = {'round_robin', 'conserved'}
 
diff --git a/openllm-python/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py
index c43779ba..3ac7a11a 100644
--- a/openllm-python/src/openllm/cli/_sdk.py
+++ b/openllm-python/src/openllm/cli/_sdk.py
@@ -23,9 +23,9 @@ from ._factory import start_command_factory
 if t.TYPE_CHECKING:
   from bentoml._internal.bento import BentoStore
   from openllm_core._configuration import LLMConfig
+  from openllm_core._typing_compat import LiteralBackend
   from openllm_core._typing_compat import LiteralContainerRegistry
   from openllm_core._typing_compat import LiteralContainerVersionStrategy
-  from openllm_core._typing_compat import LiteralRuntime
   from openllm_core._typing_compat import LiteralString
 
 logger = logging.getLogger(__name__)
@@ -38,10 +38,8 @@ def _start(model_name: str,
            workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
            device: tuple[str, ...] | t.Literal['all'] | None = None,
            quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-           bettertransformer: bool | None = None,
-           runtime: t.Literal['ggml', 'transformers'] = 'transformers',
            adapter_map: dict[LiteralString, str | None] | None = None,
-           framework: LiteralRuntime | None = None,
+           backend: LiteralBackend | None = None,
            additional_args: list[str] | None = None,
            cors: bool = False,
            _serve_grpc: bool = False,
@@ -57,48 +55,42 @@ def _start(model_name: str,
 
   ``openllm.start`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as the CLI interaction.
 
-  > [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive.
-
   Args:
-      model_name: The model name to start this LLM
-      model_id: Optional model id for this given LLM
-      timeout: The server timeout
-      workers_per_resource: Number of workers per resource assigned.
-                            See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
-                            for more information. By default, this is set to 1.
+    model_name: The model name to start this LLM
+    model_id: Optional model id for this given LLM
+    timeout: The server timeout
+    workers_per_resource: Number of workers per resource assigned.
+                          See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
+                          for more information. By default, this is set to 1.
 
-                            > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
-                            > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
-                            > - ``conserved``: This will determine the number of available GPU resources, and only assign
-                            >                  one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is
-                            >                  equivalent to ``--workers-per-resource 0.25``.
-      device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. It also accepts 'all'
-      argument to assign all available GPUs to this LLM.
-      quantize: Quantize the model weights. This is only applicable for PyTorch models.
-                Possible quantisation strategies:
-                - int8: Quantize the model with 8bit (bitsandbytes required)
-                - int4: Quantize the model with 4bit (bitsandbytes required)
-                - gptq: Quantize the model with GPTQ (auto-gptq required)
-      bettertransformer: Convert given model to FastTransformer with PyTorch.
-      runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
-      cors: Whether to enable CORS for this LLM. By default, this is set to ``False``.
-      adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
-      framework: The framework to use for this LLM. By default, this is set to ``pt``.
-      additional_args: Additional arguments to pass to ``openllm start``.
+                          > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
+                          > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
+                          > - ``conserved``: This will determine the number of available GPU resources, and only assign
+                          >                  one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
+                          >                  equivalent to ``--workers-per-resource 0.25``.
+    device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. It also accepts 'all'
+    argument to assign all available GPUs to this LLM.
+    quantize: Quantize the model weights. This is only applicable for PyTorch models.
+              Possible quantisation strategies:
+              - int8: Quantize the model with 8bit (bitsandbytes required)
+              - int4: Quantize the model with 4bit (bitsandbytes required)
+              - gptq: Quantize the model with GPTQ (auto-gptq required)
+    cors: Whether to enable CORS for this LLM. By default, this is set to ``False``.
+    adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
+    backend: The backend to use for this LLM. By default, this is set to ``pt``.
+    additional_args: Additional arguments to pass to ``openllm start``.
   """
   from .entrypoint import start_command
   from .entrypoint import start_grpc_command
   llm_config = openllm.AutoConfig.for_model(model_name)
   _ModelEnv = openllm_core.utils.EnvVarMixin(model_name,
-                                             openllm_core.utils.first_not_none(
-                                                 framework, default=llm_config.default_implementation()),
+                                             backend=openllm_core.utils.first_not_none(
+                                                 backend, default=llm_config.default_backend()),
                                              model_id=model_id,
-                                             bettertransformer=bettertransformer,
-                                             quantize=quantize,
-                                             runtime=runtime)
-  os.environ[_ModelEnv.framework] = _ModelEnv['framework_value']
+                                             quantize=quantize)
+  os.environ[_ModelEnv.backend] = _ModelEnv['backend_value']
 
-  args: list[str] = ['--runtime', runtime]
+  args: list[str] = []
   if model_id: args.extend(['--model-id', model_id])
   if timeout: args.extend(['--server-timeout', str(timeout)])
   if workers_per_resource:
@@ -107,10 +99,7 @@ def _start(model_name: str,
         str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource
     ])
   if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)])
-  if quantize and bettertransformer:
-    raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
   if quantize: args.extend(['--quantize', str(quantize)])
-  elif bettertransformer: args.append('--bettertransformer')
   if cors: args.append('--cors')
   if adapter_map:
     args.extend(
@@ -134,12 +123,10 @@ def _build(model_name: str,
            model_version: str | None = None,
            bento_version: str | None = None,
            quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-           bettertransformer: bool | None = None,
            adapter_map: dict[str, str | None] | None = None,
            build_ctx: str | None = None,
            enable_features: tuple[str, ...] | None = None,
            workers_per_resource: float | None = None,
-           runtime: t.Literal['ggml', 'transformers'] = 'transformers',
            dockerfile_template: str | None = None,
            overwrite: bool = False,
            container_registry: LiteralContainerRegistry | None = None,
@@ -153,59 +140,50 @@ def _build(model_name: str,
 
   The LLM will be built into a BentoService with the following structure:
   if ``quantize`` is passed, it will instruct the model to be quantized dynamically during serving time.
-  if ``bettertransformer`` is passed, it will instruct the model to apply FasterTransformer during serving time.
 
   ``openllm.build`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as ``openllm build`` CLI.
 
-  > [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive.
-
   Args:
-      model_name: The model name to start this LLM
-      model_id: Optional model id for this given LLM
-      model_version: Optional model version for this given LLM
-      bento_version: Optional bento veresion for this given BentoLLM
-      quantize: Quantize the model weights. This is only applicable for PyTorch models.
-                Possible quantisation strategies:
-                - int8: Quantize the model with 8bit (bitsandbytes required)
-                - int4: Quantize the model with 4bit (bitsandbytes required)
-                - gptq: Quantize the model with GPTQ (auto-gptq required)
-      bettertransformer: Convert given model to FastTransformer with PyTorch.
-      adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
-      build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory.
-      enable_features: Additional OpenLLM features to be included with this BentoLLM.
-      workers_per_resource: Number of workers per resource assigned.
-                            See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
-                            for more information. By default, this is set to 1.
+    model_name: The model name to start this LLM
+    model_id: Optional model id for this given LLM
+    model_version: Optional model version for this given LLM
+    bento_version: Optional bento version for this given BentoLLM
+    quantize: Quantize the model weights. This is only applicable for PyTorch models.
+              Possible quantisation strategies:
+              - int8: Quantize the model with 8bit (bitsandbytes required)
+              - int4: Quantize the model with 4bit (bitsandbytes required)
+              - gptq: Quantize the model with GPTQ (auto-gptq required)
+    adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
+    build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory.
+    enable_features: Additional OpenLLM features to be included with this BentoLLM.
+    workers_per_resource: Number of workers per resource assigned.
+                          See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
+                          for more information. By default, this is set to 1.
 
-                            > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
-                            > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
-                            > - ``conserved``: This will determine the number of available GPU resources, and only assign
-                            >                  one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is
-                            >                  equivalent to ``--workers-per-resource 0.25``.
-      runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
-      dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template.
-      overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``.
-      push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.
-      containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
-                    Note that 'containerize' and 'push' are mutually exclusive
-                    container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
-      container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
-      container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
-      serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
-      additional_args: Additional arguments to pass to ``openllm build``.
-      bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.
+                          > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
+                          > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
+                          > - ``conserved``: This will determine the number of available GPU resources, and only assign
+                          >                  one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
+                          >                  equivalent to ``--workers-per-resource 0.25``.
+    dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template.
+    overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``.
+    push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.
+    containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
+                  Note that 'containerize' and 'push' are mutually exclusive
+                  container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
+    container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
+    container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
+    serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
+    additional_args: Additional arguments to pass to ``openllm build``.
+    bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.
 
   Returns:
       ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
   """
   args: list[str] = [
-      sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--runtime', runtime, '--serialisation',
-      serialisation_format
+      sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format
   ]
-  if quantize and bettertransformer:
-    raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
   if quantize: args.extend(['--quantize', quantize])
-  if bettertransformer: args.append('--bettertransformer')
   if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
   if push: args.extend(['--push'])
   if containerize: args.extend(['--containerize'])
@@ -241,8 +219,7 @@ def _import_model(model_name: str,
                   *,
                   model_id: str | None = None,
                   model_version: str | None = None,
-                  runtime: t.Literal['ggml', 'transformers'] = 'transformers',
-                  implementation: LiteralRuntime = 'pt',
+                  backend: LiteralBackend = 'pt',
                   quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
                   serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors',
                   additional_args: t.Sequence[str] | None = None) -> bentoml.Model:
@@ -259,28 +236,24 @@ def _import_model(model_name: str,
   > ``openllm.start`` will automatically invoke ``openllm.download`` under the hood.
 
   Args:
-      model_name: The model name to start this LLM
-      model_id: Optional model id for this given LLM
-      model_version: Optional model version for this given LLM
-      runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
-      implementation: The implementation to use for this LLM. By default, this is set to ``pt``.
-      quantize: Quantize the model weights. This is only applicable for PyTorch models.
-                Possible quantisation strategies:
-                - int8: Quantize the model with 8bit (bitsandbytes required)
-                - int4: Quantize the model with 4bit (bitsandbytes required)
-                - gptq: Quantize the model with GPTQ (auto-gptq required)
-      serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
-      Default behaviour is similar to ``safe_serialization=False``.
-      additional_args: Additional arguments to pass to ``openllm import``.
+    model_name: The model name to start this LLM
+    model_id: Optional model id for this given LLM
+    model_version: Optional model version for this given LLM
+    backend: The backend to use for this LLM. By default, this is set to ``pt``.
+    quantize: Quantize the model weights. This is only applicable for PyTorch models.
+              Possible quantisation strategies:
+              - int8: Quantize the model with 8bit (bitsandbytes required)
+              - int4: Quantize the model with 4bit (bitsandbytes required)
+              - gptq: Quantize the model with GPTQ (auto-gptq required)
+    serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
+    Default is 'safetensors', which is equivalent to ``safe_serialization=True``.
+    additional_args: Additional arguments to pass to ``openllm import``.
 
   Returns:
-      ``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
+    ``bentoml.Model``: BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
   """
   from .entrypoint import import_command
-  args = [
-      model_name, '--runtime', runtime, '--implementation', implementation, '--machine', '--serialisation',
-      serialisation_format,
-  ]
+  args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation_format]
   if model_id is not None: args.append(model_id)
   if model_version is not None: args.extend(['--model-version', str(model_version)])
   if additional_args is not None: args.extend(additional_args)
diff --git a/openllm-python/src/openllm/cli/entrypoint.py b/openllm-python/src/openllm/cli/entrypoint.py
index 7545176d..93445c34 100644
--- a/openllm-python/src/openllm/cli/entrypoint.py
+++ b/openllm-python/src/openllm/cli/entrypoint.py
@@ -66,7 +66,7 @@ from openllm.models.auto import AutoLLM
 from openllm.utils import infer_auto_class
 from openllm_core._typing_compat import Concatenate
 from openllm_core._typing_compat import DictStrAny
-from openllm_core._typing_compat import LiteralRuntime
+from openllm_core._typing_compat import LiteralBackend
 from openllm_core._typing_compat import LiteralString
 from openllm_core._typing_compat import ParamSpec
 from openllm_core._typing_compat import Self
@@ -80,7 +80,6 @@ from openllm_core.utils import analytics
 from openllm_core.utils import bentoml_cattr
 from openllm_core.utils import compose
 from openllm_core.utils import configure_logging
-from openllm_core.utils import dantic
 from openllm_core.utils import first_not_none
 from openllm_core.utils import get_debug_mode
 from openllm_core.utils import get_quiet_mode
@@ -94,15 +93,13 @@ from . import termui
 from ._factory import FC
 from ._factory import LiteralOutput
 from ._factory import _AnyCallable
-from ._factory import bettertransformer_option
+from ._factory import backend_option
 from ._factory import container_registry_option
-from ._factory import fast_option
 from ._factory import machine_option
 from ._factory import model_id_option
 from ._factory import model_name_argument
 from ._factory import model_version_option
 from ._factory import output_option
-from ._factory import parse_device_callback
 from ._factory import quantize_option
 from ._factory import serialisation_option
 from ._factory import start_command_factory
@@ -205,21 +202,6 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
 
     return t.cast(t.Callable[Concatenate[bool, P], t.Any], wrapper)
 
-  @staticmethod
-  def exception_handling(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[P, t.Any]:
-    command_name = attrs.get('name', func.__name__)
-
-    @functools.wraps(func)
-    def wrapper(*args: P.args, **attrs: P.kwargs) -> t.Any:
-      try:
-        return func(*args, **attrs)
-      except OpenLLMException as err:
-        raise click.ClickException(click.style(f"[{group.name}] '{command_name}' failed: " + err.message, fg='red')) from err
-      except KeyboardInterrupt:
-        pass
-
-    return wrapper
-
   def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
     if cmd_name in t.cast('Extensions', extension_command).list_commands(ctx):
       return t.cast('Extensions', extension_command).get_command(ctx, cmd_name)
@@ -253,11 +235,11 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
       name = name.replace('_', '-')
       kwargs.setdefault('help', inspect.getdoc(f))
       kwargs.setdefault('name', name)
-      wrapped = self.exception_handling(self.usage_tracking(self.common_params(f), self, **kwargs), self, **kwargs)
+      wrapped = self.usage_tracking(self.common_params(f), self, **kwargs)
 
       # move common parameters to end of the parameters list
       _memo = getattr(wrapped, '__click_params__', None)
-      if _memo is None: raise RuntimeError('Click command not register correctly.')
+      if _memo is None: raise ValueError('Click command not register correctly.')
       _object_setattr(wrapped, '__click_params__', _memo[-self.NUMBER_OF_COMMON_PARAMS:] + _memo[:-self.NUMBER_OF_COMMON_PARAMS])
       # NOTE: we need to call super of super to avoid conflict with BentoMLCommandGroup command setup
       cmd = super(BentoMLCommandGroup, self).command(*args, **kwargs)(wrapped)
@@ -348,11 +330,10 @@ _start_mapping = {
 @click.argument('model_id', type=click.STRING, default=None, metavar='Optional[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=False)
 @click.argument('converter', envvar='CONVERTER', type=click.STRING, default=None, required=False, metavar=None)
 @model_version_option
-@click.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.')
 @output_option
 @quantize_option
 @machine_option
-@click.option('--implementation', type=click.Choice(['pt', 'tf', 'flax', 'vllm']), default=None, help='The implementation for saving this LLM.')
+@backend_option
 @serialisation_option
 def import_command(
     model_name: str,
@@ -360,9 +341,8 @@ def import_command(
     converter: str | None,
     model_version: str | None,
     output: LiteralOutput,
-    runtime: t.Literal['ggml', 'transformers'],
     machine: bool,
-    implementation: LiteralRuntime | None,
+    backend: LiteralBackend,
     quantize: t.Literal['int8', 'int4', 'gptq'] | None,
     serialisation_format: t.Literal['safetensors', 'legacy'],
 ) -> bentoml.Model:
@@ -415,45 +395,42 @@ def import_command(
   ```bash
   $ CONVERTER=llama2-hf openllm import llama /path/to/llama-2
   ```
-
-  > [!WARNING] This behaviour will override ``--runtime``. Therefore make sure that the LLM contains correct conversion strategies to both GGML and HF.
   """
   llm_config = AutoConfig.for_model(model_name)
-  env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, runtime=runtime, quantize=quantize)
-  impl: LiteralRuntime = first_not_none(implementation, default=env['framework_value'])
-  llm = infer_auto_class(impl).for_model(
+  env = EnvVarMixin(model_name, backend=llm_config.default_backend(), model_id=model_id, quantize=quantize)
+  backend = first_not_none(backend, default=env['backend_value'])
+  llm = infer_auto_class(backend).for_model(
       model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format
   )
   _previously_saved = False
   try:
     _ref = serialisation.get(llm)
     _previously_saved = True
-  except bentoml.exceptions.NotFound:
+  except openllm.exceptions.OpenLLMException:
     if not machine and output == 'pretty':
-      msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for implementation {llm.__llm_implementation__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
+      msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for backend {llm.__llm_backend__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
       termui.echo(msg, fg='yellow', nl=True)
     _ref = serialisation.get(llm, auto_import=True)
-    if impl == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
+    if backend == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
   if machine: return _ref
   elif output == 'pretty':
-    if _previously_saved: termui.echo(f"{model_name} with 'model_id={model_id}' is already setup for framework '{impl}': {_ref.tag!s}", nl=True, fg='yellow')
+    if _previously_saved: termui.echo(f"{model_name} with 'model_id={model_id}' is already setup for backend '{backend}': {_ref.tag!s}", nl=True, fg='yellow')
     else: termui.echo(f'Saved model: {_ref.tag}')
-  elif output == 'json': termui.echo(orjson.dumps({'previously_setup': _previously_saved, 'framework': impl, 'tag': str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode())
+  elif output == 'json': termui.echo(orjson.dumps({'previously_setup': _previously_saved, 'backend': backend, 'tag': str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode())
   else: termui.echo(_ref.tag)
   return _ref
+
 @cli.command(context_settings={'token_normalize_func': inflection.underscore})
 @model_name_argument
 @model_id_option
 @output_option
 @machine_option
+@backend_option
 @click.option('--bento-version', type=str, default=None, help='Optional bento version for this BentoLLM. Default is the the model revision.')
 @click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.')
 @workers_per_resource_option(factory=click, build=True)
-@click.option('--device', type=dantic.CUDA, multiple=True, envvar='CUDA_VISIBLE_DEVICES', callback=parse_device_callback, help='Set the device', show_envvar=True)
 @cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Optimisation options')
 @quantize_option(factory=cog.optgroup, build=True)
-@bettertransformer_option(factory=cog.optgroup)
-@click.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.')
 @click.option(
     '--enable-features',
     multiple=True,
@@ -476,7 +453,6 @@ def import_command(
 @click.option(
     '--container-version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='release', help="Default container version strategy for the image from '--container-registry'"
 )
-@fast_option
 @cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options')
 @cog.optgroup.option(
     '--containerize',
@@ -496,21 +472,18 @@ def build_command(
     bento_version: str | None,
     overwrite: bool,
     output: LiteralOutput,
-    runtime: t.Literal['ggml', 'transformers'],
     quantize: t.Literal['int8', 'int4', 'gptq'] | None,
     enable_features: tuple[str, ...] | None,
-    bettertransformer: bool | None,
     workers_per_resource: float | None,
     adapter_id: tuple[str, ...],
     build_ctx: str | None,
+    backend: LiteralBackend,
     machine: bool,
-    device: tuple[str, ...],
     model_version: str | None,
     dockerfile_template: t.TextIO | None,
     containerize: bool,
     push: bool,
     serialisation_format: t.Literal['safetensors', 'legacy'],
-    fast: bool,
     container_registry: LiteralContainerRegistry,
     container_version_strategy: LiteralContainerVersionStrategy,
     force_push: bool,
@@ -539,22 +512,21 @@ def build_command(
   _previously_built = False
 
   llm_config = AutoConfig.for_model(model_name)
-  env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, quantize=quantize, bettertransformer=bettertransformer, runtime=runtime)
+  env = EnvVarMixin(model_name, backend=backend, model_id=model_id, quantize=quantize)
 
   # NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
   # during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
   try:
-    os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), env.runtime: str(env['runtime_value']), 'OPENLLM_SERIALIZATION': serialisation_format})
+    os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation_format, 'OPENLLM_BACKEND': env['backend_value']})
     if env['model_id_value']: os.environ[env.model_id] = str(env['model_id_value'])
     if env['quantize_value']: os.environ[env.quantize] = str(env['quantize_value'])
-    os.environ[env.bettertransformer] = str(env['bettertransformer_value'])
 
-    llm = infer_auto_class(env['framework_value']).for_model(
-        model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs
+    llm = infer_auto_class(env['backend_value']).for_model(
+        model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=True, model_version=model_version, serialisation=serialisation_format, **attrs
     )
 
     labels = dict(llm.identifying_params)
-    labels.update({'_type': llm.llm_type, '_framework': env['framework_value']})
+    labels.update({'_type': llm.llm_type, '_framework': env['backend_value']})
     workers_per_resource = first_not_none(workers_per_resource, default=llm_config['workers_per_resource'])
 
     with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
@@ -603,10 +575,8 @@ def build_command(
             workers_per_resource=workers_per_resource,
             adapter_map=adapter_map,
             quantize=quantize,
-            bettertransformer=bettertransformer,
             extra_dependencies=enable_features,
             dockerfile_template=dockerfile_template_path,
-            runtime=runtime,
             container_registry=container_registry,
             container_version_strategy=container_version_strategy
         )
@@ -632,16 +602,17 @@ def build_command(
 
   if push: BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push)
   elif containerize:
-    backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker'))
+    container_backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker'))
     try:
-      bentoml.container.health(backend)
+      bentoml.container.health(container_backend)
     except subprocess.CalledProcessError:
-      raise OpenLLMException(f'Failed to use backend {backend}') from None
+      raise OpenLLMException(f'Failed to use backend {container_backend}') from None
     try:
-      bentoml.container.build(bento.tag, backend=backend, features=('grpc', 'io'))
+      bentoml.container.build(bento.tag, backend=container_backend, features=('grpc', 'io'))
     except Exception as err:
       raise OpenLLMException(f"Exception caught while containerizing '{bento.tag!s}':\n{err}") from err
   return bento
+
 @cli.command()
 @output_option
 @click.option('--show-available', is_flag=True, default=False, help="Show available models in local store (mutually exclusive with '-o porcelain').")
@@ -667,21 +638,21 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
   else:
     failed_initialized: list[tuple[str, Exception]] = []
 
-    json_data: dict[str, dict[t.Literal['architecture', 'model_id', 'url', 'installation', 'cpu', 'gpu', 'runtime_impl'], t.Any] | t.Any] = {}
+    json_data: dict[str, dict[t.Literal['architecture', 'model_id', 'url', 'installation', 'cpu', 'gpu', 'backend'], t.Any] | t.Any] = {}
     converted: list[str] = []
     for m in models:
       config = AutoConfig.for_model(m)
-      runtime_impl: tuple[str, ...] = ()
-      if config['model_name'] in MODEL_MAPPING_NAMES: runtime_impl += ('pt',)
-      if config['model_name'] in MODEL_FLAX_MAPPING_NAMES: runtime_impl += ('flax',)
-      if config['model_name'] in MODEL_TF_MAPPING_NAMES: runtime_impl += ('tf',)
-      if config['model_name'] in MODEL_VLLM_MAPPING_NAMES: runtime_impl += ('vllm',)
+      backend: tuple[str, ...] = ()
+      if config['model_name'] in MODEL_MAPPING_NAMES: backend += ('pt',)
+      if config['model_name'] in MODEL_FLAX_MAPPING_NAMES: backend += ('flax',)
+      if config['model_name'] in MODEL_TF_MAPPING_NAMES: backend += ('tf',)
+      if config['model_name'] in MODEL_VLLM_MAPPING_NAMES: backend += ('vllm',)
       json_data[m] = {
           'architecture': config['architecture'],
           'model_id': config['model_ids'],
           'cpu': not config['requires_gpu'],
           'gpu': True,
-          'runtime_impl': runtime_impl,
+          'backend': backend,
           'installation': f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config['requirements'] else 'openllm',
       }
       converted.extend([normalise_model_name(i) for i in config['model_ids']])
@@ -708,10 +679,10 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
       import tabulate
 
       tabulate.PRESERVE_WHITESPACE = True
-      # llm, architecture, url, model_id, installation, cpu, gpu, runtime_impl
-      data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralRuntime, ...]]] = []
+      # llm, architecture, url, model_id, installation, cpu, gpu, backend
+      data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralBackend, ...]]] = []
       for m, v in json_data.items():
-        data.extend([(m, v['architecture'], v['model_id'], v['installation'], '❌' if not v['cpu'] else '✅', '✅', v['runtime_impl'],)])
+        data.extend([(m, v['architecture'], v['model_id'], v['installation'], '❌' if not v['cpu'] else '✅', '✅', v['backend'],)])
       column_widths = [
           int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 4),
       ]
diff --git a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
index 704c3833..93f286e7 100644
--- a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
@@ -18,7 +18,7 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
                              prompt,
                              generation_config=self.config.model_construct_env(**attrs).to_generation_config())
 
-  def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
+  def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
     import torch
     import torch.nn.functional as F
     embeddings: list[list[float]] = []
@@ -30,4 +30,4 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
         data = F.normalize(torch.mean(outputs.hidden_states[-1].transpose(0, 1), dim=0), p=2, dim=0)
         embeddings.append(data.tolist())
         num_tokens += len(input_ids[0])
-    return openllm.LLMEmbeddings(embeddings=embeddings, num_tokens=num_tokens)
+    return openllm.EmbeddingsOutput(embeddings=embeddings, num_tokens=num_tokens)
diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
index 22f94531..51a76400 100644
--- a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -17,7 +17,7 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer
                               generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
           skip_special_tokens=True)
 
-  def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
+  def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
     import torch
     import torch.nn.functional as F
     embeddings: list[list[float]] = []
@@ -29,4 +29,4 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer
         data = F.normalize(torch.mean(outputs.encoder_last_hidden_state[0], dim=0), p=2, dim=0)
         embeddings.append(data.tolist())
         num_tokens += len(input_ids[0])
-    return openllm.LLMEmbeddings(embeddings=embeddings, num_tokens=num_tokens)
+    return openllm.EmbeddingsOutput(embeddings=embeddings, num_tokens=num_tokens)
diff --git a/openllm-python/src/openllm/models/llama/modeling_llama.py b/openllm-python/src/openllm/models/llama/modeling_llama.py
index 79946d4c..b259fba8 100644
--- a/openllm-python/src/openllm/models/llama/modeling_llama.py
+++ b/openllm-python/src/openllm/models/llama/modeling_llama.py
@@ -13,7 +13,7 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
     import torch
     return {'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {}
 
-  def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
+  def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
     import torch
     import torch.nn.functional as F
     encoding = self.tokenizer(prompts, padding=True, return_tensors='pt').to(self.device)
@@ -23,8 +23,8 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
       mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
       masked_embeddings = data * mask
       sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
-    return openllm.LLMEmbeddings(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
-                                 num_tokens=int(torch.sum(attention_mask).item()))
+    return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
+                                    num_tokens=int(torch.sum(attention_mask).item()))
 
   def generate_one(self, prompt: str, stop: list[str],
                    **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
diff --git a/openllm-python/src/openllm/models/mpt/modeling_mpt.py b/openllm-python/src/openllm/models/mpt/modeling_mpt.py
index 7ef664bc..33553246 100644
--- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py
+++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py
@@ -33,10 +33,6 @@ def get_mpt_config(model_id_or_path: str,
 class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXTokenizerFast']):
   __openllm_internal__ = True
 
-  def llm_post_init(self) -> None:
-    import torch
-    self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-
   @property
   def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
     import torch
@@ -49,7 +45,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
     import torch
     import transformers
     _, tokenizer_attrs = self.llm_parameters
-    torch_dtype = attrs.pop('torch_dtype', self.dtype)
+    torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
     device_map = attrs.pop('device_map', None)
     attrs.pop('low_cpu_mem_usage', None)
     config = get_mpt_config(self.model_id,
@@ -75,7 +71,8 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
 
   def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
+    import torch  # required by the torch_dtype default below
     import transformers
-    torch_dtype = attrs.pop('torch_dtype', self.dtype)
+    torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
     device_map = attrs.pop('device_map', None)
     trust_remote_code = attrs.pop('trust_remote_code', True)
     config = get_mpt_config(self._bentomodel.path,
diff --git a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
index 0594c856..0a8c9b82 100644
--- a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
+++ b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
@@ -8,10 +8,6 @@ if t.TYPE_CHECKING:
 class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNeoXTokenizerFast']):
   __openllm_internal__ = True
 
-  def llm_post_init(self) -> None:
-    import torch
-    self.bettertransformer = True if not torch.cuda.is_available() else False
-
   @property
   def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
     import torch
diff --git a/openllm-python/src/openllm/serialisation/__init__.py b/openllm-python/src/openllm/serialisation/__init__.py
index ea4e325c..1714d65b 100644
--- a/openllm-python/src/openllm/serialisation/__init__.py
+++ b/openllm-python/src/openllm/serialisation/__init__.py
@@ -1,27 +1,9 @@
-"""Serialisation utilities for OpenLLM.
+'''Serialisation utilities for OpenLLM.
 
 Currently supports transformers for PyTorch, Tensorflow and Flax.
 
 Currently, GGML format is working in progress.
-
-## Usage
-
-```python
-import openllm
-
-llm = openllm.AutoLLM.for_model("dolly-v2")
-llm.save_pretrained("./path/to/local-dolly")
-```
-
-To use different runtime, specify directly in the `for_model` method:
-
-```python
-import openllm
-
-llm = openllm.AutoLLM.for_model("dolly-v2", runtime='ggml')
-llm.save_pretrained("./path/to/local-dolly")
-```
-"""
+'''
 from __future__ import annotations
 import importlib
 import typing as t
@@ -54,7 +36,7 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
   from .transformers._helpers import infer_tokenizers_from_llm
   from .transformers._helpers import process_config
 
-  config, *_ = process_config(llm._bentomodel.path, llm.__llm_trust_remote_code__)
+  config, *_ = process_config(llm._bentomodel.path, llm.trust_remote_code)
   bentomodel_fs = fs.open_fs(llm._bentomodel.path)
   if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
     with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, 'rb') as cofile:
@@ -62,12 +44,11 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
         tokenizer = cloudpickle.load(t.cast('t.IO[bytes]', cofile))['tokenizer']
       except KeyError:
         raise openllm.exceptions.OpenLLMException(
-            "Bento model does not have tokenizer. Make sure to save"
-            " the tokenizer within the model via 'custom_objects'."
-            " For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
+            "Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. "
+            "For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
   else:
     tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'),
-                                                               trust_remote_code=llm.__llm_trust_remote_code__,
+                                                               trust_remote_code=llm.trust_remote_code,
                                                                **tokenizer_attrs)
 
   if tokenizer.pad_token_id is None:
@@ -82,18 +63,20 @@ class _Caller(t.Protocol[P]):
   def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
     ...
 
-_extras = ['get', 'import_model', 'save_pretrained', 'load_model']
+_extras = ['get', 'import_model', 'load_model']
 
 def _make_dispatch_function(fn: str) -> _Caller[P]:
 
   def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
     """Generic function dispatch to correct serialisation submodules based on LLM runtime.
 
-    > [!NOTE] See 'openllm.serialisation.transformers' if 'llm.runtime="transformers"'
+    > [!NOTE] See 'openllm.serialisation.transformers' if 'llm.__llm_backend__ in ("pt", "tf", "flax", "vllm")'
 
-    > [!NOTE] See 'openllm.serialisation.ggml' if 'llm.runtime="ggml"'
+    > [!NOTE] See 'openllm.serialisation.ggml' if 'llm.__llm_backend__="ggml"'
     """
-    return getattr(importlib.import_module(f'.{llm.runtime}', __name__), fn)(llm, *args, **kwargs)
+    serde = 'transformers'
+    if llm.__llm_backend__ == 'ggml': serde = 'ggml'
+    return getattr(importlib.import_module(f'.{serde}', __name__), fn)(llm, *args, **kwargs)
 
   return caller
 
@@ -105,9 +88,6 @@ if t.TYPE_CHECKING:
   def import_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> bentoml.Model:
     ...
 
-  def save_pretrained(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> None:
-    ...
-
   def load_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> M:
     ...
 
diff --git a/openllm-python/src/openllm/serialisation/ggml.py b/openllm-python/src/openllm/serialisation/ggml.py
index d0539672..fd4397cc 100644
--- a/openllm-python/src/openllm/serialisation/ggml.py
+++ b/openllm-python/src/openllm/serialisation/ggml.py
@@ -5,10 +5,10 @@ This requires ctransformers to be installed.
 from __future__ import annotations
 import typing as t
 
-import bentoml
-import openllm
-
 if t.TYPE_CHECKING:
+  import bentoml
+  import openllm
+
   from openllm_core._typing_compat import M
 
 _conversion_strategy = {'pt': 'ggml'}
@@ -21,30 +21,7 @@ def import_model(llm: openllm.LLM[t.Any, t.Any],
   raise NotImplementedError('Currently work in progress.')
 
 def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Model:
-  '''Return an instance of ``bentoml.Model`` from given LLM instance.
-
-  By default, it will try to check the model in the local store.
-  If model is not found, and ``auto_import`` is set to True, it will try to import the model from HuggingFace Hub.
-
-  Otherwise, it will raises a ``bentoml.exceptions.NotFound``.
-  '''
-  try:
-    model = bentoml.models.get(llm.tag)
-    if model.info.module not in ('openllm.serialisation.ggml', __name__):
-      raise bentoml.exceptions.NotFound(
-          f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'."
-      )
-    if 'runtime' in model.info.labels and model.info.labels['runtime'] != llm.runtime:
-      raise openllm.exceptions.OpenLLMException(
-          f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
-    return model
-  except bentoml.exceptions.NotFound:
-    if auto_import:
-      return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__)
-    raise
+  raise NotImplementedError('Currently work in progress.')
 
 def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
   raise NotImplementedError('Currently work in progress.')
-
-def save_pretrained(llm: openllm.LLM[t.Any, t.Any], save_directory: str, **attrs: t.Any) -> None:
-  raise NotImplementedError('Currently work in progress.')
diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py
index dc2d8e2b..c75e3636 100644
--- a/openllm-python/src/openllm/serialisation/transformers/__init__.py
+++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py
@@ -5,6 +5,7 @@ import logging
 import typing as t
 
 from huggingface_hub import snapshot_download
+from packaging.version import Version
 from simple_di import Provide
 from simple_di import inject
 
@@ -28,22 +29,18 @@ if t.TYPE_CHECKING:
   import auto_gptq as autogptq
   import torch
   import torch.nn
-  import transformers
-  import vllm
 
   from bentoml._internal.models import ModelStore
   from openllm_core._typing_compat import DictStrAny
   from openllm_core._typing_compat import M
   from openllm_core._typing_compat import T
 else:
-  vllm = openllm.utils.LazyLoader('vllm', globals(), 'vllm')
   autogptq = openllm.utils.LazyLoader('autogptq', globals(), 'auto_gptq')
-  transformers = openllm.utils.LazyLoader('transformers', globals(), 'transformers')
   torch = openllm.utils.LazyLoader('torch', globals(), 'torch')
 
 logger = logging.getLogger(__name__)
 
-__all__ = ['import_model', 'get', 'load_model', 'save_pretrained']
+__all__ = ['import_model', 'get', 'load_model']
 
 @inject
 def import_model(llm: openllm.LLM[M, T],
@@ -74,7 +71,7 @@ def import_model(llm: openllm.LLM[M, T],
   safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'),
                                                     default=llm._serialisation_format == 'safetensors')
   # Disable safe serialization with vLLM
-  if llm.__llm_implementation__ == 'vllm': safe_serialisation = False
+  if llm.__llm_backend__ == 'vllm': safe_serialisation = False
   metadata: DictStrAny = {
       'safe_serialisation': safe_serialisation,
       '_quantize': quantize_method is not None and quantize_method
@@ -95,8 +92,8 @@ def import_model(llm: openllm.LLM[M, T],
     # since saving int4 is not yet supported
     if 'quantization_config' in attrs and getattr(attrs['quantization_config'], 'load_in_4bit', False):
       attrs.pop('quantization_config')
-    if llm.__llm_implementation__ != 'flax': attrs['use_safetensors'] = safe_serialisation
-    metadata['_framework'] = 'pt' if llm.__llm_implementation__ == 'vllm' else llm.__llm_implementation__
+    if llm.__llm_backend__ != 'flax': attrs['use_safetensors'] = safe_serialisation
+    metadata['_framework'] = 'pt' if llm.__llm_backend__ == 'vllm' else llm.__llm_backend__
 
   tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id,
                                                              trust_remote_code=trust_remote_code,
@@ -108,7 +105,7 @@ def import_model(llm: openllm.LLM[M, T],
   imported_modules: list[types.ModuleType] = []
   bentomodel = bentoml.Model.create(llm.tag,
                                     module='openllm.serialisation.transformers',
-                                    api_version='v1',
+                                    api_version='v2',
                                     options=ModelOptions(),
                                     context=openllm.utils.generate_context(framework_name='openllm'),
                                     labels=openllm.utils.generate_labels(llm),
@@ -133,8 +130,7 @@ def import_model(llm: openllm.LLM[M, T],
                                                             trust_remote_code=trust_remote_code,
                                                             use_safetensors=safe_serialisation,
                                                             **hub_attrs,
-                                                            **attrs,
-                                                           )
+                                                            **attrs)
         update_model(bentomodel,
                      metadata={
                          '_pretrained_class': model.__class__.__name__,
@@ -192,27 +188,21 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
   '''
   try:
     model = bentoml.models.get(llm.tag)
-    if model.info.module not in ('openllm.serialisation.transformers'
-                                 'bentoml.transformers', 'bentoml._internal.frameworks.transformers',
-                                 __name__):  # NOTE: backward compatible with previous version of OpenLLM.
-      raise bentoml.exceptions.NotFound(
-          f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'."
-      )
-    if 'runtime' in model.info.labels and model.info.labels['runtime'] != llm.runtime:
+    if Version(model.info.api_version) < Version('v2'):
       raise openllm.exceptions.OpenLLMException(
-          f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
+          'Please run "openllm prune -y --include-bentos" and upgrade all saved model to latest release.')
+    if model.info.labels['backend'] != llm.__llm_backend__:
+      raise openllm.exceptions.OpenLLMException(
+          f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}."
+      )
     return model
-  except bentoml.exceptions.NotFound as err:
-    if auto_import: return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__)
-    raise err from None
+  except Exception as err:
+    if auto_import: return import_model(llm, trust_remote_code=llm.trust_remote_code)
+    raise openllm.exceptions.OpenLLMException(
+        f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err
 
 def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
-  '''Load the model from BentoML store.
-
-  By default, it will try to find check the model in the local store.
-  If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
-  '''
-  config, hub_attrs, attrs = process_config(llm.model_id, llm.__llm_trust_remote_code__, **attrs)
+  config, hub_attrs, attrs = process_config(llm.model_id, llm.trust_remote_code, **attrs)
   safe_serialization = openllm.utils.first_not_none(t.cast(
       t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)),
                                                     attrs.pop('safe_serialization', None),
@@ -229,7 +219,7 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
                                                        *decls,
                                                        quantize_config=t.cast('autogptq.BaseQuantizeConfig',
                                                                               llm.quantization_config),
-                                                       trust_remote_code=llm.__llm_trust_remote_code__,
+                                                       trust_remote_code=llm.trust_remote_code,
                                                        use_safetensors=safe_serialization,
                                                        **hub_attrs,
                                                        **attrs)
@@ -238,57 +228,9 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
   model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path,
                                                                 *decls,
                                                                 config=config,
-                                                                trust_remote_code=llm.__llm_trust_remote_code__,
+                                                                trust_remote_code=llm.trust_remote_code,
                                                                 device_map=device_map,
                                                                 **hub_attrs,
                                                                 **attrs).eval()
-  # BetterTransformer is currently only supported on PyTorch.
-  if llm.bettertransformer and isinstance(model, transformers.PreTrainedModel): model = model.to_bettertransformer()
-  if llm.__llm_implementation__ in {'pt', 'vllm'}: check_unintialised_params(model)
+  if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model)
   return t.cast('M', model)
-
-def save_pretrained(llm: openllm.LLM[M, T],
-                    save_directory: str,
-                    is_main_process: bool = True,
-                    state_dict: DictStrAny | None = None,
-                    save_function: t.Any | None = None,
-                    push_to_hub: bool = False,
-                    max_shard_size: int | str = '10GB',
-                    safe_serialization: bool = False,
-                    variant: str | None = None,
-                    **attrs: t.Any) -> None:
-  save_function = t.cast(t.Callable[..., None], openllm.utils.first_not_none(save_function, default=torch.save))
-  model_save_attrs, tokenizer_save_attrs = openllm.utils.normalize_attrs_to_model_tokenizer_pair(**attrs)
-  safe_serialization = safe_serialization or llm._serialisation_format == 'safetensors'
-  # NOTE: disable safetensors for vllm
-  if llm.__llm_implementation__ == 'vllm': safe_serialization = False
-  if llm._quantize_method == 'gptq':
-    if not openllm.utils.is_autogptq_available():
-      raise openllm.exceptions.OpenLLMException(
-          "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
-      )
-    if llm.config['model_type'] != 'causal_lm':
-      raise openllm.exceptions.OpenLLMException(
-          f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
-    if not openllm.utils.lenient_issubclass(llm.model, autogptq.modeling.BaseGPTQForCausalLM):
-      raise ValueError(f'Model is not a BaseGPTQForCausalLM (type: {type(llm.model)})')
-    t.cast('autogptq.modeling.BaseGPTQForCausalLM', llm.model).save_quantized(save_directory,
-                                                                              use_safetensors=safe_serialization)
-  elif openllm.utils.LazyType['vllm.LLMEngine']('vllm.LLMEngine').isinstance(llm.model):
-    raise RuntimeError(
-        "vllm.LLMEngine cannot be serialisation directly. This happens when 'save_pretrained' is called directly after `openllm.AutoVLLM` is initialized."
-    )
-  elif isinstance(llm.model, transformers.Pipeline):
-    llm.model.save_pretrained(save_directory, safe_serialization=safe_serialization)
-  else:
-    # We can safely cast here since it will be the PreTrainedModel protocol.
-    t.cast('transformers.PreTrainedModel', llm.model).save_pretrained(save_directory,
-                                                                      is_main_process=is_main_process,
-                                                                      state_dict=state_dict,
-                                                                      save_function=save_function,
-                                                                      push_to_hub=push_to_hub,
-                                                                      max_shard_size=max_shard_size,
-                                                                      safe_serialization=safe_serialization,
-                                                                      variant=variant,
-                                                                      **model_save_attrs)
-  llm.tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)
diff --git a/openllm-python/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py
index 643a40f6..b325fd85 100644
--- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py
+++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py
@@ -76,7 +76,7 @@ def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.Pretra
     if type(config) in transformers.MODEL_FOR_CAUSAL_LM_MAPPING: idx = 0
     elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING: idx = 1
     else: raise openllm.exceptions.OpenLLMException(f'Model type {type(config)} is not supported yet.')
-    return getattr(transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[llm.__llm_implementation__][idx])
+    return getattr(transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[llm.__llm_backend__][idx])
 
 def check_unintialised_params(model: torch.nn.Module) -> None:
   unintialized = [n for n, param in model.named_parameters() if param.data.device == torch.device('meta')]
@@ -104,11 +104,11 @@ def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Mod
 def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
   infer_fn: tuple[str, ...] = ('__call__',)
   default_config = ModelSignature(batchable=False)
-  if llm.__llm_implementation__ in {'pt', 'vllm'}:
+  if llm.__llm_backend__ in {'pt', 'vllm'}:
     infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample',
                  'group_beam_search', 'constrained_beam_search',
                 )
-  elif llm.__llm_implementation__ == 'tf':
+  elif llm.__llm_backend__ == 'tf':
     infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search',
                  'contrastive_search',
                 )
diff --git a/openllm-python/src/openllm/serialisation/transformers/weights.py b/openllm-python/src/openllm/serialisation/transformers/weights.py
index 0acb0a25..e1218444 100644
--- a/openllm-python/src/openllm/serialisation/transformers/weights.py
+++ b/openllm-python/src/openllm/serialisation/transformers/weights.py
@@ -23,9 +23,9 @@ class HfIgnore:
 
   @classmethod
   def ignore_patterns(cls, llm: openllm.LLM[M, T]) -> list[str]:
-    if llm.__llm_implementation__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors]
-    elif llm.__llm_implementation__ == 'tf': base = [cls.flax, cls.pt]
-    elif llm.__llm_implementation__ == 'flax':
+    if llm.__llm_backend__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors]
+    elif llm.__llm_backend__ == 'tf': base = [cls.flax, cls.pt]
+    elif llm.__llm_backend__ == 'flax':
       base = [cls.tf, cls.pt, cls.safetensors]  # as of current, safetensors is not supported with flax
     else:
       base = [cls.tf, cls.flax]
diff --git a/openllm-python/src/openllm/testing.py b/openllm-python/src/openllm/testing.py
index 4c4ee109..5736d1da 100644
--- a/openllm-python/src/openllm/testing.py
+++ b/openllm-python/src/openllm/testing.py
@@ -10,7 +10,7 @@ import bentoml
 import openllm
 
 if t.TYPE_CHECKING:
-  from openllm_core._typing_compat import LiteralRuntime
+  from openllm_core._typing_compat import LiteralBackend
 
 logger = logging.getLogger(__name__)
 
@@ -18,10 +18,9 @@ logger = logging.getLogger(__name__)
 def build_bento(model: str,
                 model_id: str | None = None,
                 quantize: t.Literal['int4', 'int8', 'gptq'] | None = None,
-                runtime: t.Literal['ggml', 'transformers'] = 'transformers',
                 cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
   logger.info('Building BentoML for %s', model)
-  bento = openllm.build(model, model_id=model_id, quantize=quantize, runtime=runtime)
+  bento = openllm.build(model, model_id=model_id, quantize=quantize)
   yield bento
   if cleanup:
     logger.info('Deleting %s', bento.tag)
@@ -49,7 +48,7 @@ def build_container(bento: bentoml.Bento | str | bentoml.Tag,
 @contextlib.contextmanager
 def prepare(model: str,
             model_id: str | None = None,
-            implementation: LiteralRuntime = 'pt',
+            implementation: LiteralBackend = 'pt',
             deployment_mode: t.Literal['container', 'local'] = 'local',
             clean_context: contextlib.ExitStack | None = None,
             cleanup: bool = True) -> t.Iterator[str]:
diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py
index fdeed2c5..4033d3fb 100644
--- a/openllm-python/src/openllm/utils/__init__.py
+++ b/openllm-python/src/openllm/utils/__init__.py
@@ -16,11 +16,11 @@ from . import dummy_vllm_objects as dummy_vllm_objects
 if t.TYPE_CHECKING:
   import openllm
 
-  from openllm_core._typing_compat import LiteralRuntime
+  from openllm_core._typing_compat import LiteralBackend
 
 def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
   return {
-      'runtime': llm.runtime,
+      'backend': llm.__llm_backend__,
       'framework': 'openllm',
       'model_name': llm.config['model_name'],
       'architecture': llm.config['architecture'],
@@ -28,14 +28,13 @@ def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
   }
 
 def infer_auto_class(
-    implementation: LiteralRuntime
-) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
+    backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
   import openllm
-  if implementation == 'tf': return openllm.AutoTFLLM
-  elif implementation == 'flax': return openllm.AutoFlaxLLM
-  elif implementation == 'pt': return openllm.AutoLLM
-  elif implementation == 'vllm': return openllm.AutoVLLM
-  else: raise RuntimeError(f"Unknown implementation: {implementation} (supported: 'pt', 'flax', 'tf', 'vllm')")
+  if backend == 'tf': return openllm.AutoTFLLM
+  elif backend == 'flax': return openllm.AutoFlaxLLM
+  elif backend == 'pt': return openllm.AutoLLM
+  elif backend == 'vllm': return openllm.AutoVLLM
+  else: raise RuntimeError(f"Unknown backend: {backend} (supported: 'pt', 'flax', 'tf', 'vllm')")
 
 __all__ = [
     'generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects',
diff --git a/openllm-python/tests/_strategies/_configuration.py b/openllm-python/tests/_strategies/_configuration.py
index ee484747..813df70d 100644
--- a/openllm-python/tests/_strategies/_configuration.py
+++ b/openllm-python/tests/_strategies/_configuration.py
@@ -30,12 +30,10 @@ def model_settings(draw: st.DrawFn):
           st.booleans(),
       'requirements':
           st.none() | st.lists(st.text(), min_size=1),
-      'default_implementation':
+      'default_backend':
           st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])),
       'model_type':
           st.sampled_from(['causal_lm', 'seq2seq_lm']),
-      'runtime':
-          st.sampled_from(['transformers', 'ggml']),
       'name_type':
           st.sampled_from(['dasherize', 'lowercase']),
       'timeout':
diff --git a/openllm-python/tests/configuration_test.py b/openllm-python/tests/configuration_test.py
index 0dcbc5e2..147ebc66 100644
--- a/openllm-python/tests/configuration_test.py
+++ b/openllm-python/tests/configuration_test.py
@@ -111,10 +111,7 @@ def patch_env(**attrs: t.Any):
     yield
 
 def test_struct_envvar():
-  with patch_env(**{
-      field_env_key('env_llm', 'field1'): '4',
-      field_env_key('env_llm', 'temperature', suffix='generation'): '0.2',
-  }):
+  with patch_env(**{field_env_key('field1'): '4', field_env_key('temperature', suffix='generation'): '0.2',}):
 
     class EnvLLM(openllm.LLMConfig):
       __config__ = {'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel',}
@@ -146,8 +143,8 @@ def test_struct_provided_fields():
 
 def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPatch):
   with monkeypatch.context() as mk:
-    mk.setenv(field_env_key('overwrite_with_env_available', 'field1'), str(4.0))
-    mk.setenv(field_env_key('overwrite_with_env_available', 'temperature', suffix='generation'), str(0.2))
+    mk.setenv(field_env_key('field1'), str(4.0))
+    mk.setenv(field_env_key('temperature', suffix='generation'), str(0.2))
     sent = make_llm_config('OverwriteWithEnvAvailable', {
         'default_id': 'asdfasdf',
         'model_ids': ['asdf', 'asdfasdfads'],
diff --git a/openllm-python/tests/conftest.py b/openllm-python/tests/conftest.py
index 5eed1c6a..959b6e11 100644
--- a/openllm-python/tests/conftest.py
+++ b/openllm-python/tests/conftest.py
@@ -8,9 +8,9 @@ import pytest
 import openllm
 
 if t.TYPE_CHECKING:
-  from openllm_core._typing_compat import LiteralRuntime
+  from openllm_core._typing_compat import LiteralBackend
 
-_FRAMEWORK_MAPPING = {
+_MODELING_MAPPING = {
     'flan_t5': 'google/flan-t5-small',
     'opt': 'facebook/opt-125m',
     'baichuan': 'baichuan-inc/Baichuan-7B',
@@ -22,19 +22,17 @@ _PROMPT_MAPPING = {
 
 def parametrise_local_llm(
     model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]:
-  if model not in _FRAMEWORK_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.")
-  runtime_impl: tuple[LiteralRuntime, ...] = tuple()
-  if model in openllm.MODEL_MAPPING_NAMES: runtime_impl += ('pt',)
-  if model in openllm.MODEL_FLAX_MAPPING_NAMES: runtime_impl += ('flax',)
-  if model in openllm.MODEL_TF_MAPPING_NAMES: runtime_impl += ('tf',)
-  for framework, prompt in itertools.product(runtime_impl, _PROMPT_MAPPING.keys()):
-    llm = openllm.Runner(model,
-                         model_id=_FRAMEWORK_MAPPING[model],
-                         ensure_available=True,
-                         implementation=framework,
-                         init_local=True,
-                        )
-    yield prompt, llm
+  if model not in _MODELING_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.")
+  backends: tuple[LiteralBackend, ...] = tuple()
+  if model in openllm.MODEL_MAPPING_NAMES: backends += ('pt',)
+  if model in openllm.MODEL_FLAX_MAPPING_NAMES: backends += ('flax',)
+  if model in openllm.MODEL_TF_MAPPING_NAMES: backends += ('tf',)
+  for backend, prompt in itertools.product(backends, _PROMPT_MAPPING.keys()):
+    yield prompt, openllm.Runner(model,
+                                 model_id=_MODELING_MAPPING[model],
+                                 ensure_available=True,
+                                 backend=backend,
+                                 init_local=True)
 
 def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
   if os.getenv('GITHUB_ACTIONS') is None:
diff --git a/openllm-python/tests/package_test.py b/openllm-python/tests/package_test.py
index 1d1fc74f..17c121e6 100644
--- a/openllm-python/tests/package_test.py
+++ b/openllm-python/tests/package_test.py
@@ -4,6 +4,7 @@ import os
 import typing as t
 
 import pytest
+import transformers
 
 import openllm
 
@@ -28,7 +29,7 @@ def test_general_build_with_internal_testing():
   bento = openllm.build('flan-t5', model_id=HF_INTERNAL_T5_TESTING)
 
   assert llm.llm_type == bento.info.labels['_type']
-  assert llm.config['env']['framework_value'] == bento.info.labels['_framework']
+  assert llm.config['env']['backend_value'] == bento.info.labels['_framework']
 
   bento = openllm.build('flan-t5', model_id=HF_INTERNAL_T5_TESTING)
   assert len(bento_store.list(bento.tag)) == 1
@@ -38,10 +39,11 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory):
   local_path = tmp_path_factory.mktemp('local_t5')
   llm = openllm.AutoLLM.for_model('flan-t5', model_id=HF_INTERNAL_T5_TESTING, ensure_available=True)
 
-  if llm.bettertransformer:
-    llm.__llm_model__ = llm.model.reverse_bettertransformer()
-
-  llm.save_pretrained(local_path)
+  if isinstance(llm.model, transformers.Pipeline):
+    llm.model.save_pretrained(str(local_path))
+  else:
+    llm.model.save_pretrained(str(local_path))
+    llm.tokenizer.save_pretrained(str(local_path))
 
   assert openllm.build('flan-t5', model_id=local_path.resolve().__fspath__(), model_version='local')
 
diff --git a/pyproject.toml b/pyproject.toml
index 462251c9..fe922a8b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -266,10 +266,6 @@ USE_TABS = false
 BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 1
 BLANK_LINES_BETWEEN_TOP_LEVEL_IMPORTS_AND_VARIABLES = 1
 DISABLE_ENDING_COMMA_HEURISTIC = true
-# DEDENT_CLOSING_BRACKETS = true
-# INDENT_CLOSING_BRACKETS = false
-# COALESCE_BRACKETS = true
-# EACH_DICT_ENTRY_ON_SEPARATE_LINE = true
 # ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT = true
 # ALLOW_MULTILINE_DICTIONARY_KEYS = false
 # ALLOW_MULTILINE_LAMBDAS = false
@@ -279,6 +275,10 @@ DISABLE_ENDING_COMMA_HEURISTIC = true
 # BLANK_LINE_BEFORE_CLASS_DOCSTRING = false
 # BLANK_LINE_BEFORE_MODULE_DOCSTRING = false
 # BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false
+# DEDENT_CLOSING_BRACKETS = true
+# INDENT_CLOSING_BRACKETS = false
+# COALESCE_BRACKETS = true
+# EACH_DICT_ENTRY_ON_SEPARATE_LINE = true
 # CONTINUATION_ALIGN_STYLE = "SPACE"
 # INDENT_BLANK_LINES = false
 # NO_SPACES_AROUND_SELECTED_BINARY_OPERATORS = true
diff --git a/tools/update-config-stubs.py b/tools/update-config-stubs.py
index 27e1225d..1daf0ab2 100755
--- a/tools/update-config-stubs.py
+++ b/tools/update-config-stubs.py
@@ -50,42 +50,32 @@ _value_docstring = {
             ```bash
             openllm start gpt-neox --model-id stabilityai/stablelm-tuned-alpha-3b
             ```''',
-    'default_implementation':
-        '''The default runtime to run this LLM. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`.
-
-    It is a dictionary of key as the accelerator spec in k4s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM Runtime ('flax', 'tf', 'pt', 'vllm')
-    ''',
+    'default_backend':
+        '''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')''',
     'url':
-        '''The resolved url for this LLMConfig.''',
+        'The resolved url for this LLMConfig.',
     'requires_gpu':
-        '''Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU.''',
+        'Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU.',
     'trust_remote_code':
-        '''Whether to always trust remote code''',
+        'Whether to always trust remote code',
     'service_name':
-        """Generated service name for this LLMConfig. By default, it is 'generated_{model_name}_service.py'""",
+        "Generated service name for this LLMConfig. By default, it is \"generated_{model_name}_service.py\"",
     'requirements':
-        '''The default PyPI requirements needed to run this given LLM. By default, we will depend on
-        bentoml, torch, transformers.''',
-    'bettertransformer':
-        '''Whether to use BetterTransformer for this given LLM. This depends per model architecture. By default, we will use BetterTransformer for T5 and StableLM models, and set to False for every other models.''',
+        'The default PyPI requirements needed to run this given LLM. By default, we will depend on bentoml, torch, transformers.',
     'model_type':
-        '''The model type for this given LLM. By default, it should be causal language modeling.
-        Currently supported 'causal_lm' or 'seq2seq_lm'
-        ''',
-    'runtime':
-        '''The runtime to use for this model. Possible values are `transformers` or `ggml`. See Llama for more information.''',
+        'The model type for this given LLM. By default, it should be causal language modeling. Currently supported "causal_lm" or "seq2seq_lm"',
     'name_type':
         '''The default name typed for this model. "dasherize" will convert the name to lowercase and
         replace spaces with dashes. "lowercase" will convert the name to lowercase. If this is not set, then both
         `model_name` and `start_name` must be specified.''',
     'model_name':
-        '''The normalized version of __openllm_start_name__, determined by __openllm_name_type__''',
+        'The normalized version of __openllm_start_name__, determined by __openllm_name_type__',
     'start_name':
-        '''Default name to be used with `openllm start`''',
+        'Default name to be used with `openllm start`',
     'env':
-        '''A EnvVarMixin instance for this LLMConfig.''',
+        'A EnvVarMixin instance for this LLMConfig.',
     'timeout':
-        '''The default timeout to be set for this given LLM.''',
+        'The default timeout to be set for this given LLM.',
     'workers_per_resource':
         '''The number of workers per resource. This is used to determine the number of workers to use for this model.
         For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
@@ -97,9 +87,9 @@ _value_docstring = {
         By default, it is set to 1.
         ''',
     'fine_tune_strategies':
-        '''The fine-tune strategies for this given LLM.''',
+        'The fine-tune strategies for this given LLM.',
     'tokenizer_class':
-        '''Optional tokenizer class for this given LLM. See Llama for example.''',
+        'Optional tokenizer class for this given LLM. See Llama for example.',
 }
 
 _transformed = {'fine_tune_strategies': 't.Dict[AdapterType, FineTuneConfig]'}
@@ -125,7 +115,7 @@ def main() -> int:
     config_attr_lines.extend([
         ' ' * 4 + line for line in [
             f'__openllm_{keys}__: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))} = Field(None)\n',
-            f'"""{_value_docstring[keys]}"""\n',
+            f"'''{_value_docstring[keys]}'''\n",
         ]
     ])
   # NOTE: inline runtime __getitem__ overload process
@@ -135,7 +125,7 @@ def main() -> int:
     lines.extend([
         ' ' * 2 + line for line in [
             '@overload\n',
-            f'def __getitem__(self, item: t.Literal["{keys}"]) -> {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}: ...\n',
+            f"def __getitem__(self, item: t.Literal['{keys}']) -> {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}: ...\n",
         ]
     ])
   # special case variables: generation_class, extras, sampling_class
@@ -143,10 +133,10 @@ def main() -> int:
   lines.extend([
       ' ' * 2 + line for line in [
           '@overload\n',
-          'def __getitem__(self, item: t.Literal["generation_class"]) -> t.Type[openllm_core.GenerationConfig]: ...\n',
+          "def __getitem__(self, item: t.Literal['generation_class']) -> t.Type[openllm_core.GenerationConfig]: ...\n",
           '@overload\n',
-          'def __getitem__(self, item: t.Literal["sampling_class"]) -> t.Type[openllm_core.SamplingParams]: ...\n',
-          '@overload\n', 'def __getitem__(self, item: t.Literal["extras"]) -> t.Dict[str, t.Any]: ...\n',
+          "def __getitem__(self, item: t.Literal['sampling_class']) -> t.Type[openllm_core.SamplingParams]: ...\n",
+          '@overload\n', "def __getitem__(self, item: t.Literal['extras']) -> t.Dict[str, t.Any]: ...\n",
       ]
   ])
   lines.append(' ' * 2 + '# NOTE: GenerationConfig arguments\n')
@@ -154,20 +144,20 @@ def main() -> int:
   for keys, type_pep563 in generation_config_anns.items():
     lines.extend([
         ' ' * 2 + line
-        for line in ['@overload\n', f'def __getitem__(self, item: t.Literal["{keys}"]) -> {type_pep563}: ...\n']
+        for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n"]
     ])
   lines.append(' ' * 2 + '# NOTE: SamplingParams arguments\n')
   for keys, type_pep563 in codegen.get_annotations(SamplingParams).items():
     if keys not in generation_config_anns:
       lines.extend([
           ' ' * 2 + line
-          for line in ['@overload\n', f'def __getitem__(self, item: t.Literal["{keys}"]) -> {type_pep563}: ...\n',]
+          for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n",]
       ])
   lines.append(' ' * 2 + '# NOTE: PeftType arguments\n')
   for keys in PeftType._member_names_:
     lines.extend([
         ' ' * 2 + line for line in
-        ['@overload\n', f'def __getitem__(self, item: t.Literal["{keys.lower()}"]) -> dict[str, t.Any]: ...\n',]
+        ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys.lower()}']) -> dict[str, t.Any]: ...\n",]
     ])
 
   processed = processed[:start_attrs_idx] + [
diff --git a/tools/update-dummy.py b/tools/update-dummy.py
index 76f86342..104430de 100755
--- a/tools/update-dummy.py
+++ b/tools/update-dummy.py
@@ -7,7 +7,7 @@ _ROOT = Path(__file__).parent.parent
 
 sys.path.insert(0, (_ROOT / 'openllm-core' / 'src').__fspath__())
 sys.path.insert(1, (_ROOT / 'openllm-python' / 'src').__fspath__())
-from openllm_core._configuration import LiteralRuntime
+from openllm_core._typing_compat import LiteralBackend
 from openllm.models import auto
 from openllm import CONFIG_MAPPING
 
@@ -17,31 +17,31 @@ config_requirements = {
     k: [_.replace('-', '_') for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None
     for k, v in CONFIG_MAPPING.items()
 }
-_dependencies: dict[LiteralRuntime, str] = {
-    k: v for k, v in zip(LiteralRuntime.__args__, ('torch', 'tensorflow', 'flax', 'vllm'))
+_dependencies: dict[LiteralBackend, str] = {
+    k: v for k, v in zip(LiteralBackend.__args__[:-2], ('torch', 'tensorflow', 'flax', 'vllm'))
 }
 _auto: dict[str, str] = {
-    k: v for k, v in zip(LiteralRuntime.__args__, ('AutoLLM', 'AutoTFLLM', 'AutoFlaxLLM', 'AutoVLLM'))
+    k: v for k, v in zip(LiteralBackend.__args__[:-2], ('AutoLLM', 'AutoTFLLM', 'AutoFlaxLLM', 'AutoVLLM'))
 }
 
-def get_target_dummy_file(framework: LiteralRuntime) -> Path:
-  return _ROOT / 'openllm-python' / 'src' / 'openllm' / 'utils' / f'dummy_{framework}_objects.py'
+def get_target_dummy_file(backend: LiteralBackend) -> Path:
+  return _ROOT / 'openllm-python' / 'src' / 'openllm' / 'utils' / f'dummy_{backend}_objects.py'
 
-def mapping_names(framework: LiteralRuntime):
-  return 'MODEL_MAPPING_NAMES' if framework == 'pt' else f'MODEL_{framework.upper()}_MAPPING_NAMES'
+def mapping_names(backend: LiteralBackend):
+  return 'MODEL_MAPPING_NAMES' if backend == 'pt' else f'MODEL_{backend.upper()}_MAPPING_NAMES'
 
-def get_mapping(framework: LiteralRuntime) -> OrderedDict[t.Any, t.Any]:
-  return getattr(auto, mapping_names(framework))
+def get_mapping(backend: LiteralBackend) -> OrderedDict[t.Any, t.Any]:
+  return getattr(auto, mapping_names(backend))
 
-def make_class_stub(model_name: str, framework: LiteralRuntime, indentation: int = 2, auto: bool = False) -> list[str]:
+def make_class_stub(model_name: str, backend: LiteralBackend, indentation: int = 2, auto: bool = False) -> list[str]:
   _dep_list: list[str] = [
       f'"{v}"' for v in [
-          _dependencies[framework], *(t.cast(t.List[str], config_requirements[model_name])
-                                      if model_name != '__default__' and config_requirements[model_name] else [])
+          _dependencies[backend], *(t.cast(t.List[str], config_requirements[model_name]
+                                          ) if model_name != '__default__' and config_requirements[model_name] else [])
       ]
   ]
-  if auto: cl_ = _auto[framework]
-  else: cl_ = get_mapping(framework)[model_name]
+  if auto: cl_ = _auto[backend]
+  else: cl_ = get_mapping(backend)[model_name]
   lines = [
       f'class {cl_}(metaclass=_DummyMetaclass):', ' ' * indentation + f"_backends=[{','.join(_dep_list)}]",
       ' ' * indentation +
@@ -49,28 +49,28 @@ def make_class_stub(model_name: str, framework: LiteralRuntime, indentation: int
   ]
   return lines
 
-def write_stub(framework: LiteralRuntime, _path: str) -> list[str]:
+def write_stub(backend: LiteralBackend, _path: str) -> list[str]:
   base = [
       f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!', f'# To update this, run ./{_path}',
       'from __future__ import annotations', 'import typing as _t',
       'from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends',
   ]
-  base.extend([v for it in [make_class_stub(k, framework) for k in get_mapping(framework)] for v in it])
+  base.extend([v for it in [make_class_stub(k, backend) for k in get_mapping(backend)] for v in it])
   # autoclass
-  base.extend(make_class_stub('__default__', framework, auto=True))
+  base.extend(make_class_stub('__default__', backend, auto=True))
   # mapping and export
-  _imports = [f'"{v}"' for v in get_mapping(framework).values()]
+  _imports = [f'"{v}"' for v in get_mapping(backend).values()]
   base += [
-      f'{mapping_names(framework)}:_t.Any=None',
-      f"__all__:list[str]=[\"{mapping_names(framework)}\",\"{_auto[framework]}\",{','.join(_imports)}]\n"
+      f'{mapping_names(backend)}:_t.Any=None',
+      f"__all__:list[str]=[\"{mapping_names(backend)}\",\"{_auto[backend]}\",{','.join(_imports)}]\n"
   ]
   return base
 
 def main() -> int:
   _path = os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__))
-  for framework in _dependencies:
-    with get_target_dummy_file(framework).open('w') as f:
-      f.write('\n'.join(write_stub(framework, _path)))
+  for backend in _dependencies:
+    with get_target_dummy_file(backend).open('w') as f:
+      f.write('\n'.join(write_stub(backend, _path)))
   return 0
 
 if __name__ == '__main__': raise SystemExit(main())