diff --git a/changelog.d/663.feature.md b/changelog.d/663.feature.md new file mode 100644 index 00000000..a8ee89bf --- /dev/null +++ b/changelog.d/663.feature.md @@ -0,0 +1,3 @@ +Type hints for all exposed APIs are now provided through stubs. This means REPLs +and static analysis tools like mypy can resolve types for the library instantly, without +having to infer them from runtime function signatures. diff --git a/mypy.ini b/mypy.ini index 9648d989..7962f935 100644 --- a/mypy.ini +++ b/mypy.ini @@ -7,4 +7,4 @@ warn_unused_configs = True ignore_missing_imports = true check_untyped_defs = true warn_unreachable = true -files = openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-core/src/openllm_core/_typing_compat.py, openllm-client/src/openllm_client/_typing_compat.py, openllm-python/src/openllm/__init__.pyi, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/serialisation/__init__.pyi +files = openllm-python/src/openllm/bundle/__init__.pyi, openllm-python/src/openllm/serialisation/__init__.pyi, openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-python/src/openllm/__init__.pyi, openllm-client/src/openllm_client/_typing_compat.py, openllm-core/src/openllm_core/_typing_compat.py, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/bundle/_package.pyi, openllm-python/src/openllm/_runners.pyi, openllm-python/src/openllm/_quantisation.pyi, openllm-python/src/openllm/_llm.pyi, openllm-python/src/openllm/_generation.pyi diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py index c926635f..7a9c4065 100644 --- a/openllm-core/src/openllm_core/_typing_compat.py +++ b/openllm-core/src/openllm_core/_typing_compat.py @@ -88,9 +88,3 @@ class AdapterTuple(TupleAny): AdapterMap = t.Dict[AdapterType, t.Tuple[AdapterTuple, ...]] - - -class RefTuple(TupleAny): - git_hash: str - version: VersionInfo - strategy: LiteralContainerVersionStrategy diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py index 2c38a3f1..680e8566 100644 --- a/openllm-core/src/openllm_core/config/configuration_auto.py +++ b/openllm-core/src/openllm_core/config/configuration_auto.py @@ -121,6 +121,9 @@ class AutoConfig: # update-config-stubs.py: auto stubs start @t.overload @classmethod + def for_model(cls,model_name:t.Literal['baichuan'],**attrs:t.Any)->openllm_core.config.BaichuanConfig:... + @t.overload + @classmethod def for_model(cls,model_name:t.Literal['chatglm'],**attrs:t.Any)->openllm_core.config.ChatGLMConfig:... @t.overload @classmethod @@ -139,6 +142,9 @@ class AutoConfig: def for_model(cls,model_name:t.Literal['llama'],**attrs:t.Any)->openllm_core.config.LlamaConfig:... @t.overload @classmethod + def for_model(cls,model_name:t.Literal['mistral'],**attrs:t.Any)->openllm_core.config.MistralConfig:... + @t.overload + @classmethod def for_model(cls,model_name:t.Literal['mpt'],**attrs:t.Any)->openllm_core.config.MPTConfig:... @t.overload @classmethod @@ -151,13 +157,7 @@ class AutoConfig: def for_model(cls,model_name:t.Literal['starcoder'],**attrs:t.Any)->openllm_core.config.StarCoderConfig:... @t.overload @classmethod - def for_model(cls,model_name:t.Literal['mistral'],**attrs:t.Any)->openllm_core.config.MistralConfig:... - @t.overload - @classmethod def for_model(cls,model_name:t.Literal['yi'],**attrs:t.Any)->openllm_core.config.YiConfig:...
- @t.overload - @classmethod - def for_model(cls,model_name:t.Literal['baichuan'],**attrs:t.Any)->openllm_core.config.BaichuanConfig:... # update-config-stubs.py: auto stubs stop # fmt: on diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index a3eda552..835cb195 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -350,13 +350,16 @@ T = t.TypeVar('T') K = t.TypeVar('K') -# yapf: disable @overload def first_not_none(*args: T | None, default: T) -> T: ... + + @overload def first_not_none(*args: T | None) -> T | None: ... -def first_not_none(*args: T | None, default: None | T = None) -> T | None: return next((arg for arg in args if arg is not None), default) -# yapf: enable + + +def first_not_none(*args: T | None, default: T | None = None) -> T | None: + return next((arg for arg in args if arg is not None), default) def resolve_filepath(path: str, ctx: str | None = None) -> str: diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml index 0995afd8..4a4bb9d3 100644 --- a/openllm-python/pyproject.toml +++ b/openllm-python/pyproject.toml @@ -38,7 +38,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "bentoml[io]>=1.1.2", + "bentoml[io]>=1.1.9", "transformers[torch,tokenizers]>=4.35.0", "openllm-client", "openllm-core", diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py index 3d48570b..2703d9ec 100644 --- a/openllm-python/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -1,14 +1,3 @@ -"""OpenLLM. - -An open platform for operating large language models in production. Fine-tune, serve, -deploy, and monitor any LLMs with ease. - -* Built-in support for StableLM, Llama 2, Dolly, Flan-T5, Vicuna -* Option to bring your own fine-tuned LLMs -* Online Serving with HTTP, gRPC, SSE(coming soon) or custom API -* Native integration with BentoML and LangChain for custom LLM apps -""" - import logging as _logging import os as _os import pathlib as _pathlib @@ -57,13 +46,14 @@ __lazy = utils.LazyModule( 'entrypoints': ['mount_entrypoints'], 'serialisation': ['ggml', 'transformers'], '_quantisation': ['infer_quantisation_config'], - '_llm': ['LLM', 'LLMRunner', 'LLMRunnable'], + '_llm': ['LLM'], '_generation': [ 'StopSequenceCriteria', 'StopOnTokens', - 'LogitsProcessorList', - 'StoppingCriteriaList', 'prepare_logits_processor', + 'get_context_length', + 'is_sentence_complete', + 'is_partial_stop', ], }, extra_objects={ diff --git a/openllm-python/src/openllm/__init__.pyi b/openllm-python/src/openllm/__init__.pyi index 531ee8b7..6bb92cb9 100644 --- a/openllm-python/src/openllm/__init__.pyi +++ b/openllm-python/src/openllm/__init__.pyi @@ -1,3 +1,21 @@ +"""OpenLLM. +=========== + +An open platform for operating large language models in production. +Fine-tune, serve, deploy, and monitor any LLMs with ease. 
+ +* Built-in support for Mistral, Llama 2, Yi, StableLM, Dolly, Flan-T5, Vicuna +* Option to bring your own fine-tuned LLMs +* Online Serving with HTTP, gRPC, SSE or custom API +* Native integration with BentoML, LangChain, OpenAI-compatible endpoints, LlamaIndex for custom LLM apps +""" + +# fmt: off +# update-config-stubs.py: import stubs start +from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING,CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,AutoConfig as AutoConfig,BaichuanConfig as BaichuanConfig,ChatGLMConfig as ChatGLMConfig,DollyV2Config as DollyV2Config,FalconConfig as FalconConfig,FlanT5Config as FlanT5Config,GPTNeoXConfig as GPTNeoXConfig,LlamaConfig as LlamaConfig,MistralConfig as MistralConfig,MPTConfig as MPTConfig,OPTConfig as OPTConfig,StableLMConfig as StableLMConfig,StarCoderConfig as StarCoderConfig,YiConfig as YiConfig +# update-config-stubs.py: import stubs stop +# fmt: on + import openllm_cli as _cli from openllm_cli._sdk import ( build as build, @@ -16,23 +34,6 @@ from openllm_core._schemas import ( GenerationOutput as GenerationOutput, MetadataOutput as MetadataOutput, ) -from openllm_core.config import ( - CONFIG_MAPPING as CONFIG_MAPPING, - CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, - AutoConfig as AutoConfig, - BaichuanConfig as BaichuanConfig, - ChatGLMConfig as ChatGLMConfig, - DollyV2Config as DollyV2Config, - FalconConfig as FalconConfig, - FlanT5Config as FlanT5Config, - GPTNeoXConfig as GPTNeoXConfig, - LlamaConfig as LlamaConfig, - MistralConfig as MistralConfig, - MPTConfig as MPTConfig, - OPTConfig as OPTConfig, - StableLMConfig as StableLMConfig, - StarCoderConfig as StarCoderConfig, -) from . import ( bundle as bundle, @@ -44,13 +45,14 @@ from . import ( ) from ._deprecated import Runner as Runner from ._generation import ( - LogitsProcessorList as LogitsProcessorList, StopOnTokens as StopOnTokens, - StoppingCriteriaList as StoppingCriteriaList, StopSequenceCriteria as StopSequenceCriteria, prepare_logits_processor as prepare_logits_processor, + is_partial_stop as is_partial_stop, + is_sentence_complete as is_sentence_complete, + get_context_length as get_context_length, ) -from ._llm import LLM as LLM, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner +from ._llm import LLM as LLM from ._quantisation import infer_quantisation_config as infer_quantisation_config from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource from .client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient diff --git a/openllm-python/src/openllm/_generation.py b/openllm-python/src/openllm/_generation.py index d6d4eaef..c3b4fa5b 100644 --- a/openllm-python/src/openllm/_generation.py +++ b/openllm-python/src/openllm/_generation.py @@ -1,43 +1,24 @@ -# mypy: disable-error-code="misc" -from __future__ import annotations -import typing as t - import transformers -if t.TYPE_CHECKING: - import torch - - import openllm - -# reexport from transformers -LogitsProcessorList = transformers.LogitsProcessorList -StoppingCriteriaList = transformers.StoppingCriteriaList - class StopSequenceCriteria(transformers.StoppingCriteria): - def __init__( - self, - stop_sequences: str | list[str], - tokenizer: transformers.PreTrainedTokenizer - | transformers.PreTrainedTokenizerBase - | transformers.PreTrainedTokenizerFast, - ): + def __init__(self, stop_sequences, tokenizer): if isinstance(stop_sequences, str): stop_sequences = [stop_sequences] self.stop_sequences, self.tokenizer = stop_sequences, tokenizer - def
__call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool: + def __call__(self, input_ids, scores, **kwargs): return any( self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences ) class StopOnTokens(transformers.StoppingCriteria): - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool: + def __call__(self, input_ids, scores, **kwargs): return input_ids[0][-1] in {50278, 50279, 50277, 1, 0} -def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsProcessorList: +def prepare_logits_processor(config): generation_config = config.generation_config logits_processor = transformers.LogitsProcessorList() if generation_config['temperature'] >= 1e-5 and generation_config['temperature'] != 1.0: @@ -55,7 +36,7 @@ def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsPr SEQLEN_KEYS = ['max_sequence_length', 'seq_length', 'max_position_embeddings', 'max_seq_len', 'model_max_length'] -def get_context_length(config: transformers.PretrainedConfig) -> int: +def get_context_length(config): rope_scaling = getattr(config, 'rope_scaling', None) rope_scaling_factor = config.rope_scaling['factor'] if rope_scaling else 1.0 for key in SEQLEN_KEYS: @@ -64,11 +45,11 @@ def get_context_length(config: transformers.PretrainedConfig) -> int: return 2048 -def is_sentence_complete(output: str) -> bool: +def is_sentence_complete(output): return output.endswith(('.', '?', '!', '...', '。', '?', '!', '…', '"', "'", '”')) -def is_partial_stop(output: str, stop_str: str) -> bool: +def is_partial_stop(output, stop_str): """Check whether the output contains a partial stop str.""" for i in range(min(len(output), len(stop_str))): if stop_str.startswith(output[-i:]): diff --git a/openllm-python/src/openllm/_generation.pyi b/openllm-python/src/openllm/_generation.pyi new file mode 100644 index 00000000..c727f6be --- /dev/null +++ b/openllm-python/src/openllm/_generation.pyi @@ -0,0 +1,28 @@ +from typing import Any, List, Union + +from torch import FloatTensor, LongTensor +from transformers import ( + LogitsProcessorList, + PretrainedConfig, + PreTrainedTokenizer, + PreTrainedTokenizerBase, + PreTrainedTokenizerFast, +) + +from openllm_core import LLMConfig + +Tokenizer = Union[PreTrainedTokenizerBase, PreTrainedTokenizer, PreTrainedTokenizerFast] + +class StopSequenceCriteria: + stop_sequences: List[str] + tokenizer: Tokenizer + def __init__(self, stop_sequences: Union[str, List[str]], tokenizer: Tokenizer) -> None: ... + def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ... + +class StopOnTokens: + def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ... + +def prepare_logits_processor(config: LLMConfig) -> LogitsProcessorList: ... +def get_context_length(config: PretrainedConfig) -> int: ... +def is_sentence_complete(output: str) -> bool: ... +def is_partial_stop(output: str, stop_str: str) -> bool: ... 
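The `_generation.py` runtime above intentionally drops its inline annotations; the new `_generation.pyi` stub is what editors and mypy read instead. As a usage illustration only (not part of the diff; the `gpt2` checkpoint and the stop strings are placeholder choices), the helpers can be wired into a plain `transformers` generate call like this:

```python
import transformers

from openllm._generation import StopSequenceCriteria, get_context_length

tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')  # placeholder checkpoint
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')

# The stub types stop_sequences as Union[str, List[str]] and accepts any PreTrainedTokenizer*
# flavour, so a missing tokenizer argument is now a static error instead of a runtime surprise.
stopping = transformers.StoppingCriteriaList([StopSequenceCriteria(['\n\n', '###'], tokenizer)])

inputs = tokenizer('OpenLLM is', return_tensors='pt')
# get_context_length falls back to 2048 when the config exposes none of the known max-length keys.
budget = min(32, get_context_length(model.config))
out = model.generate(**inputs, max_new_tokens=budget, stopping_criteria=stopping)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```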
diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 61965537..2ee2387d 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -1,6 +1,4 @@ -# mypy: disable-error-code="name-defined,attr-defined" from __future__ import annotations -import abc import functools import logging import os @@ -10,14 +8,12 @@ import typing as t import attr import inflection import orjson -from huggingface_hub import hf_hub_download import bentoml import openllm -import openllm_core from bentoml._internal.models.model import ModelSignature from bentoml._internal.runner.runner_handle import DummyRunnerHandle -from openllm_core._schemas import CompletionChunk, GenerationOutput +from openllm_core._schemas import GenerationOutput from openllm_core._typing_compat import ( AdapterMap, AdapterTuple, @@ -43,32 +39,27 @@ from openllm_core.utils import ( converter, first_not_none, flatten_attrs, + gen_random_uuid, generate_hash_from_file, get_debug_mode, get_disable_warnings, get_quiet_mode, is_peft_available, + is_vllm_available, resolve_filepath, validate_is_path, ) -from ._quantisation import infer_quantisation_config -from ._strategies import CascadingResourceStrategy from .exceptions import ForbiddenAttributeError, OpenLLMException from .serialisation.constants import PEFT_CONFIG_NAME if t.TYPE_CHECKING: - import torch import transformers from peft.config import PeftConfig - from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM - from bentoml._internal.runner.runnable import RunnableMethod - from bentoml._internal.runner.runner import RunnerMethod - from bentoml._internal.runner.runner_handle import RunnerHandle - from bentoml._internal.runner.strategy import Strategy from openllm_core._configuration import LLMConfig - from openllm_core.utils.representation import ReprArgs + + from ._runners import Runner ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]] @@ -84,16 +75,15 @@ def normalise_model_name(name: str) -> str: return inflection.dasherize(name) -def resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap: - """Resolve the type of the PeftConfig given the adapter_map. +def _resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap: + try: + from huggingface_hub import hf_hub_download + except ImportError: + raise MissingDependencyError( + "Failed to import 'huggingface_hub'. Make sure to do 'pip install \"openllm[fine-tune]\"'" + ) from None - This is similar to how PeftConfig resolve its config type. - - Args: - adapter_map: The given mapping from either SDK or CLI. See CLI docs for more information. 
- """ resolved: AdapterMap = {} - _has_set_default = False for path_or_adapter_id, name in adapter_map.items(): if name is None: raise ValueError('Adapter name must be specified.') @@ -107,7 +97,7 @@ def resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap: with open(config_file, 'r') as file: resolved_config = orjson.loads(file.read()) # all peft_type should be available in PEFT_CONFIG_NAME - _peft_type: AdapterType = resolved_config['peft_type'].lower() + _peft_type = resolved_config['peft_type'].lower() if _peft_type not in resolved: resolved[_peft_type] = () resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),) @@ -151,7 +141,7 @@ class LLM(t.Generic[M, T], ReprMixin): __llm_config__: LLMConfig | None = None __llm_backend__: LiteralBackend = None # type: ignore __llm_quantization_config__: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None = None - __llm_runner__: t.Optional[LLMRunner[M, T]] = None + __llm_runner__: t.Optional[Runner[M, T]] = None __llm_model__: t.Optional[M] = None __llm_tokenizer__: t.Optional[T] = None __llm_adapter_map__: t.Optional[ResolvedAdapterMap] = None @@ -159,35 +149,29 @@ class LLM(t.Generic[M, T], ReprMixin): def __init__( self, - model_id: str, - model_version: str | None = None, - model_tag: str | bentoml.Tag | None = None, - prompt_template: PromptTemplate | str | None = None, - system_message: str | None = None, - llm_config: LLMConfig | None = None, - backend: LiteralBackend | None = None, - *args: t.Any, - quantize: LiteralQuantise | None = None, - quantization_config: transformers.BitsAndBytesConfig - | transformers.GPTQConfig - | transformers.AwqConfig - | None = None, - adapter_map: dict[str, str] | None = None, - serialisation: LiteralSerialisation = 'safetensors', - trust_remote_code: bool = False, - embedded: bool = False, - torch_dtype: LiteralDtype | t.Literal['auto', 'half', 'float'] = 'auto', - **attrs: t.Any, + model_id, + model_version=None, + model_tag=None, + prompt_template=None, + system_message=None, + llm_config=None, + backend=None, + *args, + quantize=None, + quantization_config=None, + adapter_map=None, + serialisation='safetensors', + trust_remote_code=False, + embedded=False, + torch_dtype='auto', + low_cpu_mem_usage=True, + **attrs, ): - # low_cpu_mem_usage is only available for model this is helpful on system with low memory to avoid OOM - low_cpu_mem_usage = attrs.pop('low_cpu_mem_usage', True) _local = False if validate_is_path(model_id): model_id, _local = resolve_filepath(model_id), True - backend = first_not_none( - backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if openllm.utils.is_vllm_available() else 'pt' - ) + backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt') torch_dtype = first_not_none(os.getenv('TORCH_DTYPE'), torch_dtype, default='auto') quantize = first_not_none(quantize, os.getenv('OPENLLM_QUANTIZE'), default=None) # elif quantization_config is None and quantize is not None: @@ -215,7 +199,7 @@ class LLM(t.Generic[M, T], ReprMixin): quantization_config=quantization_config, quantise=quantize, model_decls=args, - adapter_map=resolve_peft_config_type(adapter_map) if adapter_map is not None else None, + adapter_map=_resolve_peft_config_type(adapter_map) if adapter_map is not None else None, serialisation=serialisation, local=_local, prompt_template=prompt_template, @@ -244,7 +228,7 @@ class LLM(t.Generic[M, T], ReprMixin): self.runner.init_local(quiet=True) 
@property - def _torch_dtype(self) -> torch.dtype: + def _torch_dtype(self): import torch import transformers @@ -298,11 +282,15 @@ class LLM(t.Generic[M, T], ReprMixin): super().__setattr__(attr, value) @property - def _model_attrs(self) -> dict[str, t.Any]: + def _model_attrs(self): return {**self.import_kwargs[0], **self.__model_attrs} + @_model_attrs.setter + def _model_attrs(self, value): + self.__model_attrs = value + @property - def _tokenizer_attrs(self) -> dict[str, t.Any]: + def _tokenizer_attrs(self): return {**self.import_kwargs[1], **self.__tokenizer_attrs} @property @@ -319,41 +307,42 @@ class LLM(t.Generic[M, T], ReprMixin): def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: import torch - return {'device_map': 'auto' if torch.cuda.is_available() else None, 'torch_dtype': self._torch_dtype}, { - 'padding_side': 'left', - 'truncation_side': 'left', - } + model_attrs = {'device_map': 'auto' if torch.cuda.is_available() else None, 'torch_dtype': self._torch_dtype} + tokenizer_attrs = {'padding_side': 'left', 'truncation_side': 'left'} + return model_attrs, tokenizer_attrs @property - def trust_remote_code(self) -> bool: + def trust_remote_code(self): env = os.getenv('TRUST_REMOTE_CODE') if env is not None: return str(env).upper() in ENV_VARS_TRUE_VALUES return self.__llm_trust_remote_code__ @property - def runner_name(self) -> str: + def runner_name(self): return f"llm-{self.config['start_name']}-runner" @property - def model_id(self) -> str: + def model_id(self): return self._model_id @property - def revision(self) -> str: - return t.cast(str, self._revision) + def revision(self): + return self._revision @property - def tag(self) -> bentoml.Tag: + def tag(self): return self._tag @property - def bentomodel(self) -> bentoml.Model: + def bentomodel(self): return openllm.serialisation.get(self) @property - def quantization_config(self) -> transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig: + def quantization_config(self): if self.__llm_quantization_config__ is None: + from ._quantisation import infer_quantisation_config + if self._quantization_config is not None: self.__llm_quantization_config__ = self._quantization_config elif self._quantise is not None: @@ -365,55 +354,55 @@ class LLM(t.Generic[M, T], ReprMixin): return self.__llm_quantization_config__ @property - def has_adapters(self) -> bool: + def has_adapters(self): return self._adapter_map is not None @property - def local(self) -> bool: + def local(self): return self._local @property - def quantise(self) -> LiteralQuantise | None: + def quantise(self): return self._quantise # NOTE: The section below defines a loose contract with langchain's LLM interface. @property - def llm_type(self) -> str: + def llm_type(self): return normalise_model_name(self._model_id) @property - def identifying_params(self) -> DictStrAny: + def llm_parameters(self): + return (self._model_decls, self._model_attrs), self._tokenizer_attrs + + @property + def identifying_params(self): return { 'configuration': self.config.model_dump_json().decode(), 'model_ids': orjson.dumps(self.config['model_ids']).decode(), 'model_id': self.model_id, } - @property - def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]: - return (self._model_decls, self._model_attrs), self._tokenizer_attrs - # NOTE: This section is the actual model, tokenizer, and config reference here. 
@property - def config(self) -> LLMConfig: + def config(self): if self.__llm_config__ is None: self.__llm_config__ = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs) return self.__llm_config__ @property - def tokenizer(self) -> T: + def tokenizer(self): if self.__llm_tokenizer__ is None: self.__llm_tokenizer__ = openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1]) return self.__llm_tokenizer__ @property - def runner(self) -> LLMRunner[M, T]: + def runner(self): if self.__llm_runner__ is None: self.__llm_runner__ = _RunnerFactory(self) return self.__llm_runner__ @property - def model(self) -> M: + def model(self): if self.__llm_model__ is None: model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs) # If OOM, then it is probably you don't have enough VRAM to run this model. @@ -439,7 +428,7 @@ class LLM(t.Generic[M, T], ReprMixin): return self.__llm_model__ @property - def adapter_map(self) -> ResolvedAdapterMap: + def adapter_map(self): try: import peft as _ # noqa: F401 except ImportError as err: @@ -461,9 +450,7 @@ class LLM(t.Generic[M, T], ReprMixin): self.__llm_adapter_map__ = _map return self.__llm_adapter_map__ - def prepare_for_training( - self, adapter_type: AdapterType = 'lora', use_gradient_checking: bool = True, **attrs: t.Any - ) -> tuple[PeftModel | PeftModelForCausalLM | PeftModelForSeq2SeqLM, T]: + def prepare_for_training(self, adapter_type='lora', use_gradient_checking=True, **attrs): from peft.mapping import get_peft_model from peft.utils.other import prepare_model_for_kbit_training @@ -484,15 +471,8 @@ class LLM(t.Generic[M, T], ReprMixin): return model, self.tokenizer async def generate( - self, - prompt: str | None, - prompt_token_ids: list[int] | None = None, - stop: str | t.Iterable[str] | None = None, - stop_token_ids: list[int] | None = None, - request_id: str | None = None, - adapter_name: str | None = None, - **attrs: t.Any, - ) -> GenerationOutput: + self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs + ): config = self.config.model_construct_env(**attrs) texts: list[list[str]] = [[]] * config['n'] token_ids: list[list[int]] = [[]] * config['n'] @@ -515,15 +495,8 @@ class LLM(t.Generic[M, T], ReprMixin): ) async def generate_iterator( - self, - prompt: str | None, - prompt_token_ids: list[int] | None = None, - stop: str | t.Iterable[str] | None = None, - stop_token_ids: list[int] | None = None, - request_id: str | None = None, - adapter_name: str | None = None, - **attrs: t.Any, - ) -> t.AsyncGenerator[GenerationOutput, None]: + self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs + ): if isinstance(self.runner._runner_handle, DummyRunnerHandle): if os.getenv('BENTO_PATH') is not None: raise RuntimeError('Runner client failed to set up correctly.') @@ -551,14 +524,13 @@ class LLM(t.Generic[M, T], ReprMixin): raise ValueError('Either prompt or prompt_token_ids must be specified.') prompt_token_ids = self.tokenizer.encode(prompt) - if request_id is None: - request_id = openllm_core.utils.gen_random_uuid() + request_id = gen_random_uuid() if request_id is None else request_id previous_texts, previous_num_tokens = [''] * config['n'], [0] * config['n'] async for out in self.runner.generate_iterator.async_stream( - prompt_token_ids, request_id, stop, adapter_name, **config.model_dump(flatten=True) + prompt_token_ids, request_id, stop=stop, 
adapter_name=adapter_name, **config.model_dump(flatten=True) ): generated = GenerationOutput.from_runner(out).with_options(prompt=prompt) - delta_outputs = t.cast(t.List[CompletionChunk], [None] * len(generated.outputs)) + delta_outputs = [None] * len(generated.outputs) if generated.finished: break for output in generated.outputs: @@ -570,44 +542,37 @@ class LLM(t.Generic[M, T], ReprMixin): def _RunnerFactory( - self: openllm.LLM[M, T], - /, - models: list[bentoml.Model] | None = None, - max_batch_size: int | None = None, - max_latency_ms: int | None = None, - scheduling_strategy: type[bentoml.Strategy] = CascadingResourceStrategy, - *, - backend: LiteralBackend | None = None, -) -> LLMRunner[M, T]: + llm, /, models=None, max_batch_size=None, max_latency_ms=None, scheduling_strategy=None, *, backend=None +): from ._runners import runnable - backend = t.cast( - LiteralBackend, first_not_none(backend, os.environ.get('OPENLLM_BACKEND'), default=self.__llm_backend__) - ) + if scheduling_strategy is None: + from ._strategies import CascadingResourceStrategy + + scheduling_strategy = CascadingResourceStrategy + + backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND', default=llm.__llm_backend__)) models = models if models is not None else [] try: - models.append(self.bentomodel) + models.append(llm.bentomodel) except bentoml.exceptions.NotFound as err: - raise RuntimeError(f'Failed to locate {self.bentomodel}:{err}') from err + raise RuntimeError(f'Failed to locate {llm.bentomodel}:{err}') from err - if self._prompt_template: - prompt_template = self._prompt_template.to_string() - elif hasattr(self.config, 'default_prompt_template'): - prompt_template = self.config.default_prompt_template + if llm._prompt_template: + prompt_template = llm._prompt_template.to_string() + elif hasattr(llm.config, 'default_prompt_template'): + prompt_template = llm.config.default_prompt_template else: prompt_template = None - if self._system_message: - system_message = self._system_message - elif hasattr(self.config, 'default_system_message'): - system_message = self.config.default_system_message + if llm._system_message: + system_message = llm._system_message + elif hasattr(llm.config, 'default_system_message'): + system_message = llm.config.default_system_message else: system_message = None - def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]: - return {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'} - - def _wrapped_repr_args(_: LLMRunner[M, T]) -> ReprArgs: + def _wrapped_repr_args(_): yield ( 'runner_methods', { @@ -618,89 +583,40 @@ def _RunnerFactory( for method in _.runner_methods }, ) - yield 'config', self.config.model_dump(flatten=True) - yield 'llm_type', self.llm_type + yield 'config', llm.config.model_dump(flatten=True) + yield 'llm_type', llm.llm_type yield 'backend', backend - yield 'llm_tag', self.tag + yield 'llm_tag', llm.tag return types.new_class( - self.__class__.__name__ + 'Runner', + llm.config.__class__.__name__[:-6] + 'Runner', (bentoml.Runner,), exec_body=lambda ns: ns.update( { - 'llm_type': self.llm_type, - 'identifying_params': self.identifying_params, - 'llm_tag': self.tag, - 'llm': self, - 'config': self.config, + 'llm_type': llm.llm_type, + 'identifying_params': llm.identifying_params, + 'llm_tag': llm.tag, + 'llm': llm, + 'config': llm.config, 'backend': backend, - '__module__': self.__module__, + '__doc__': llm.config.__class__.__doc__ or f'Generated Runner class for {llm.config["model_name"]}', + '__module__': llm.__module__, '__repr__': 
ReprMixin.__repr__, - '__repr_keys__': property(_wrapped_repr_keys), + '__repr_keys__': property(lambda _: {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'}), '__repr_args__': _wrapped_repr_args, - 'has_adapters': self.has_adapters, + 'has_adapters': llm.has_adapters, 'prompt_template': prompt_template, 'system_message': system_message, } ), )( runnable(backend), - name=self.runner_name, + name=llm.runner_name, embedded=False, models=models, max_batch_size=max_batch_size, max_latency_ms=max_latency_ms, scheduling_strategy=scheduling_strategy, - runnable_init_params=dict(llm=self), + runnable_init_params={'llm': llm}, method_configs=converter.unstructure({'generate_iterator': ModelSignature(batchable=False)}), ) - - -@t.final -class LLMRunnable(bentoml.Runnable, t.Generic[M, T]): - SUPPORTED_RESOURCES = ('amd.com/gpu', 'nvidia.com/gpu', 'cpu') - SUPPORTS_CPU_MULTI_THREADING = True - generate_iterator: RunnableMethod[LLMRunnable[M, T], [list[int], str, str | t.Iterable[str] | None, str | None], str] - - -@t.final -class LLMRunner(t.Protocol[M, T]): - __doc__: str - __module__: str - llm_type: str - llm_tag: bentoml.Tag - identifying_params: dict[str, t.Any] - llm: openllm.LLM[M, T] - config: openllm.LLMConfig - backend: LiteralBackend - has_adapters: bool - system_message: str | None - prompt_template: str | None - generate_iterator: RunnerMethod[LLMRunnable[M, T], [list[int], str, str | t.Iterable[str] | None, str | None], str] - - runner_methods: list[RunnerMethod[t.Any, t.Any, t.Any]] - scheduling_strategy: type[Strategy] - workers_per_resource: int | float - runnable_init_params: dict[str, t.Any] - _runner_handle: RunnerHandle - - def __init__( - self, - runnable_class: type[LLMRunnable[M, T]], - *, - runnable_init_params: dict[str, t.Any] | None = ..., - name: str | None = ..., - scheduling_strategy: type[Strategy] = ..., - models: list[bentoml.Model] | None = ..., - max_batch_size: int | None = ..., - max_latency_ms: int | None = ..., - method_configs: dict[str, dict[str, int]] | None = ..., - embedded: bool = False, - ) -> None: ... - - @property - @abc.abstractmethod - def __repr_keys__(self) -> set[str]: ... 
- - -__all__ = ['LLMRunner', 'LLMRunnable', 'LLM'] diff --git a/openllm-python/src/openllm/_llm.pyi b/openllm-python/src/openllm/_llm.pyi new file mode 100644 index 00000000..bac2b346 --- /dev/null +++ b/openllm-python/src/openllm/_llm.pyi @@ -0,0 +1,158 @@ +from typing import Any, AsyncGenerator, Dict, Generic, Iterable, List, Literal, Optional, Set, Tuple, TypedDict, Union + +import attr +import torch +from peft.config import PeftConfig +from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM + +from bentoml import Model, Tag +from openllm_core import LLMConfig +from openllm_core._schemas import GenerationOutput +from openllm_core._typing_compat import ( + AdapterMap, + AdapterType, + LiteralBackend, + LiteralDtype, + LiteralQuantise, + LiteralSerialisation, + M, + T, +) +from openllm_core.prompts import PromptTemplate +from openllm_core.utils.representation import ReprArgs + +from ._quantisation import QuantizationConfig +from ._runners import Runner + +InjectedModel = Union[PeftModel | PeftModelForCausalLM | PeftModelForSeq2SeqLM] + +class IdentifyingParams(TypedDict): + configuration: str + model_ids: str + model_id: str + +ResolvedAdapterMap = Dict[AdapterType, Dict[str, Tuple[PeftConfig, str]]] +Dtype = Union[LiteralDtype, Literal['auto', 'half', 'float']] + +@attr.define(slots=True, repr=False, init=False) +class LLM(Generic[M, T]): + _model_id: str + _revision: Optional[str] + _quantization_config: Optional[QuantizationConfig] + _quantise: Optional[LiteralQuantise] + _model_decls: Tuple[Any, ...] + __model_attrs: Dict[str, Any] + __tokenizer_attrs: Dict[str, Any] + _tag: Tag + _adapter_map: Optional[AdapterMap] + _serialisation: LiteralSerialisation + _local: bool + _prompt_template: Optional[PromptTemplate] + _system_message: Optional[str] + + __llm_torch_dtype__: Dtype = ... + __llm_config__: Optional[LLMConfig] = ... + __llm_backend__: LiteralBackend = ... + __llm_quantization_config__: Optional[QuantizationConfig] = ... + __llm_runner__: Optional[Runner[M, T]] = ... + __llm_model__: Optional[M] = ... + __llm_tokenizer__: Optional[T] = ... + __llm_adapter_map__: Optional[ResolvedAdapterMap] = ... + __llm_trust_remote_code__: bool = ... + + @property + def __repr_keys__(self) -> Set[str]: ... + def __repr__(self) -> str: ... + def __str__(self) -> str: ... + def __repr_name__(self) -> str: ... + def __repr_str__(self, join_str: str) -> str: ... + def __repr_args__(self) -> ReprArgs: ... + def __init__( + self, + model_id: str, + model_version: Optional[str] = ..., + model_tag: Optional[Union[str, Tag]] = ..., + prompt_template: Optional[Union[str, PromptTemplate]] = ..., + system_message: Optional[str] = ..., + llm_config: Optional[LLMConfig] = ..., + backend: Optional[LiteralBackend] = ..., + *args: Any, + quantize: Optional[LiteralQuantise] = ..., + quantization_config: Optional[QuantizationConfig] = ..., + adapter_map: Optional[Dict[str, str]] = ..., + serialisation: LiteralSerialisation = ..., + trust_remote_code: bool = ..., + embedded: bool = ..., + torch_dtype: Dtype = ..., + low_cpu_mem_usage: bool = ..., + **attrs: Any, + ) -> None: ... + @property + def _torch_dtype(self) -> torch.dtype: ... + @property + def _model_attrs(self) -> Dict[str, Any]: ... + @_model_attrs.setter + def _model_attrs(self, model_attrs: Dict[str, Any]) -> None: ... + @property + def _tokenizer_attrs(self) -> Dict[str, Any]: ... + @property + def import_kwargs(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: ... + @property + def trust_remote_code(self) -> bool: ... 
+ @property + def runner_name(self) -> str: ... + @property + def model_id(self) -> str: ... + @property + def revision(self) -> str: ... + @property + def tag(self) -> Tag: ... + @property + def bentomodel(self) -> Model: ... + @property + def quantization_config(self) -> QuantizationConfig: ... + @property + def has_adapters(self) -> bool: ... + @property + def local(self) -> bool: ... + @property + def quantise(self) -> Optional[LiteralQuantise]: ... + @property + def llm_type(self) -> str: ... + @property + def identifying_params(self) -> IdentifyingParams: ... + @property + def llm_parameters(self) -> Tuple[Tuple[Tuple[Any, ...], Dict[str, Any]], Dict[str, Any]]: ... + @property + def config(self) -> LLMConfig: ... + @property + def tokenizer(self) -> T: ... + @property + def model(self) -> M: ... + @property + def runner(self) -> Runner[M, T]: ... + @property + def adapter_map(self) -> ResolvedAdapterMap: ... + def prepare_for_training( + self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any + ) -> Tuple[InjectedModel, T]: ... + async def generate( + self, + prompt: Optional[str], + prompt_token_ids: Optional[List[int]] = ..., + stop: Optional[Union[str, Iterable[str]]] = ..., + stop_token_ids: Optional[List[int]] = ..., + request_id: Optional[str] = ..., + adapter_name: Optional[str] = ..., + **attrs: Any, + ) -> GenerationOutput: ... + async def generate_iterator( + self, + prompt: Optional[str], + prompt_token_ids: Optional[List[int]] = ..., + stop: Optional[Union[str, Iterable[str]]] = ..., + stop_token_ids: Optional[List[int]] = ..., + request_id: Optional[str] = ..., + adapter_name: Optional[str] = ..., + **attrs: Any, + ) -> AsyncGenerator[GenerationOutput, None]: ... diff --git a/openllm-python/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py index 68724559..96a799e2 100644 --- a/openllm-python/src/openllm/_quantisation.py +++ b/openllm-python/src/openllm/_quantisation.py @@ -1,12 +1,5 @@ -# mypy: disable-error-code="name-defined,no-redef" from __future__ import annotations -import logging -import typing as t -import torch -import transformers - -from openllm_core._typing_compat import LiteralQuantise, overload from openllm_core.exceptions import MissingDependencyError from openllm_core.utils import ( is_autoawq_available, @@ -15,35 +8,11 @@ from openllm_core.utils import ( is_optimum_supports_gptq, ) -if t.TYPE_CHECKING: - from openllm_core._typing_compat import DictStrAny - from ._llm import LLM +def infer_quantisation_config(llm, quantise, **attrs): + import torch + import transformers -logger = logging.getLogger(__name__) - - -@overload -def infer_quantisation_config( - self: LLM[t.Any, t.Any], quantise: t.Literal['int8', 'int4'], **attrs: t.Any -) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]: ... - - -@overload -def infer_quantisation_config( - self: LLM[t.Any, t.Any], quantise: t.Literal['gptq'], **attrs: t.Any -) -> tuple[transformers.GPTQConfig, DictStrAny]: ... - - -@overload -def infer_quantisation_config( - self: LLM[t.Any, t.Any], quantise: t.Literal['awq'], **attrs: t.Any -) -> tuple[transformers.AwqConfig, DictStrAny]: ... 
- - -def infer_quantisation_config( - self: LLM[t.Any, t.Any], quantise: LiteralQuantise, **attrs: t.Any -) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig, DictStrAny]: # 8 bit configuration int8_threshold = attrs.pop('llm_int8_threshhold', 6.0) int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False) @@ -54,12 +23,17 @@ def infer_quantisation_config( bits = attrs.pop('bits', 4) group_size = attrs.pop('group_size', 128) - def create_awq_config() -> transformers.AwqConfig: + # 4 bit configuration + int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16) + int4_quant_type = attrs.pop('bnb_4bit_quant_type', 'nf4') + int4_use_double_quant = attrs.pop('bnb_4bit_use_double_quant', True) + + def create_awq_config(): zero_point = attrs.pop('zero_point', True) return transformers.AwqConfig(bits=bits, group_size=group_size, zero_point=zero_point) - def create_gptq_config() -> transformers.GPTQConfig: - gptq_tokenizer = attrs.pop('tokenizer', self.model_id) + def create_gptq_config(): + gptq_tokenizer = attrs.pop('tokenizer', llm.model_id) gptq_dataset = attrs.pop('dataset', 'c4') gptq_damp_percent = attrs.pop('damp_percent', 0.1) gptq_desc_act = attrs.pop('desc_act', False) @@ -94,10 +68,9 @@ def infer_quantisation_config( exllama_config={'version': 1}, ) # XXX: See how to migrate to v2 - def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig: + def create_int8_config(int8_skip_modules): # if int8_skip_modules is None: int8_skip_modules = [] # if 'lm_head' not in int8_skip_modules and self.config_class.__openllm_model_type__ == 'causal_lm': - # logger.debug("Skipping 'lm_head' for quantization for %s", self.__name__) # int8_skip_modules.append('lm_head') return transformers.BitsAndBytesConfig( load_in_8bit=True, @@ -107,10 +80,13 @@ def infer_quantisation_config( llm_int8_has_fp16_weight=int8_has_fp16_weight, ) - # 4 bit configuration - int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16) - int4_quant_type = attrs.pop('bnb_4bit_quant_type', 'nf4') - int4_use_double_quant = attrs.pop('bnb_4bit_use_double_quant', True) + def create_int4_config(): + return transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=int4_compute_dtype, + bnb_4bit_quant_type=int4_quant_type, + bnb_4bit_use_double_quant=int4_use_double_quant, + ) # NOTE: Quantization setup quantize is a openllm.LLM feature, where we can quantize the model with bitsandbytes or quantization aware training. if not is_bitsandbytes_available(): @@ -120,23 +96,18 @@ def infer_quantisation_config( if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules) elif quantise == 'int4': - quantisation_config = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=int4_compute_dtype, - bnb_4bit_quant_type=int4_quant_type, - bnb_4bit_use_double_quant=int4_use_double_quant, - ) + quantisation_config = create_int4_config() elif quantise == 'gptq': if not is_autogptq_available() or not is_optimum_supports_gptq(): raise MissingDependencyError( - "'quantize=\"gptq\"' requires 'auto-gptq' and 'optimum>=0.12' to be installed (missing or failed to import). Make sure to do 'pip install \"openllm[gptq]\"'" + "GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. 
Do it with 'pip install \"openllm[gptq]\"'" ) else: quantisation_config = create_gptq_config() elif quantise == 'awq': if not is_autoawq_available(): raise MissingDependencyError( - "quantize='awq' requires 'auto-awq' to be installed (missing or failed to import). Make sure to do 'pip install \"openllm[awq]\"'." + "AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'." ) else: quantisation_config = create_awq_config() diff --git a/openllm-python/src/openllm/_quantisation.pyi b/openllm-python/src/openllm/_quantisation.pyi new file mode 100644 index 00000000..d41809f7 --- /dev/null +++ b/openllm-python/src/openllm/_quantisation.pyi @@ -0,0 +1,26 @@ +from typing import Any, Dict, Literal, Union + +from transformers import AwqConfig, BitsAndBytesConfig, GPTQConfig + +from openllm_core._typing_compat import LiteralQuantise, M, T, overload + +from ._llm import LLM + +QuantizationConfig = Union[BitsAndBytesConfig, GPTQConfig, AwqConfig] + +@overload +def infer_quantisation_config( + self: LLM[M, T], quantise: Literal['int8', 'int4'], **attrs: Any +) -> tuple[BitsAndBytesConfig, Dict[str, Any]]: ... +@overload +def infer_quantisation_config( + self: LLM[M, T], quantise: Literal['gptq'], **attrs: Any +) -> tuple[GPTQConfig, Dict[str, Any]]: ... +@overload +def infer_quantisation_config( + self: LLM[M, T], quantise: Literal['awq'], **attrs: Any +) -> tuple[AwqConfig, Dict[str, Any]]: ... +@overload +def infer_quantisation_config( + self: LLM[M, T], quantise: LiteralQuantise, **attrs: Any +) -> tuple[QuantizationConfig, Dict[str, Any]]: ... diff --git a/openllm-python/src/openllm/_runners.py b/openllm-python/src/openllm/_runners.py index bc4b2bf7..16b26916 100644 --- a/openllm-python/src/openllm/_runners.py +++ b/openllm-python/src/openllm/_runners.py @@ -9,27 +9,14 @@ import torch import bentoml import openllm from openllm_core._schemas import CompletionChunk, GenerationOutput -from openllm_core._typing_compat import LiteralBackend, M, T from openllm_core.exceptions import OpenLLMException from openllm_core.utils import first_not_none, is_vllm_available -if t.TYPE_CHECKING: - import vllm - - from openllm_core._schemas import FinishReason -else: - vllm = openllm.utils.LazyLoader('vllm', globals(), 'vllm') - -_DEFAULT_TOKENIZER = 'hf-internal-testing/llama-tokenizer' - __all__ = ['runnable'] -def runnable(backend: LiteralBackend | None = None) -> type[bentoml.Runnable]: - backend = t.cast( - LiteralBackend, - first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt'), - ) +def runnable(backend=None): + backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt') return vLLMRunnable if backend == 'vllm' else PyTorchRunnable @@ -37,7 +24,11 @@ class vLLMRunnable(bentoml.Runnable): SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu') SUPPORTS_CPU_MULTI_THREADING = True - def __init__(self, llm: openllm.LLM[M, T]) -> None: + def __init__(self, llm): + try: + import vllm + except ImportError: + raise OpenLLMException('vLLM is not installed. 
Please install it via `pip install "openllm[vllm]"`.') from None self.config = llm.config num_gpus, dev = 1, openllm.utils.device_count() if dev >= 2: @@ -64,14 +55,7 @@ class vLLMRunnable(bentoml.Runnable): raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err @bentoml.Runnable.method(batchable=False) - async def generate_iterator( - self, - prompt_token_ids: list[int], - request_id: str, - stop: str | t.Iterable[str] | None = None, - adapter_name: str | None = None, - **attrs: t.Any, - ) -> t.AsyncGenerator[str, None]: + async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs): if adapter_name is not None: raise NotImplementedError('Adapter is not supported with vLLM.') stop_: set[str] = set() @@ -99,28 +83,19 @@ class PyTorchRunnable(bentoml.Runnable): SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu') SUPPORTS_CPU_MULTI_THREADING = True - def __init__(self, llm: openllm.LLM[M, T]) -> None: + def __init__(self, llm): self.model = llm.model self.tokenizer = llm.tokenizer self.config = llm.config @bentoml.Runnable.method(batchable=False) - async def generate_iterator( - self, - prompt_token_ids: list[int], - request_id: str, - stop: str | t.Iterable[str] | None = None, - adapter_name: str | None = None, - **attrs: t.Any, - ) -> t.AsyncGenerator[str, None]: + async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs): if adapter_name is not None: self.model.set_adapter(adapter_name) async for generation_output in self.forward(prompt_token_ids, request_id, stop=stop, **attrs): yield generation_output.model_dump_json() - async def forward( - self, prompt_token_ids: list[int], request_id: str, stop: str | t.Iterable[str] | None = None, **attrs: t.Any - ) -> t.AsyncGenerator[GenerationOutput, None]: + async def forward(self, prompt_token_ids, request_id, stop=None, **attrs): from ._generation import is_partial_stop, prepare_logits_processor stop_: set[str] = set() @@ -142,7 +117,7 @@ class PyTorchRunnable(bentoml.Runnable): logits_processor = prepare_logits_processor(config) past_key_values = out = token = None - finish_reason: t.Optional[FinishReason] = None + finish_reason = None for i in range(config['max_new_tokens']): if i == 0: # prefill out = self.model(torch.as_tensor([prompt_token_ids], device=self.model.device), use_cache=True) diff --git a/openllm-python/src/openllm/_runners.pyi b/openllm-python/src/openllm/_runners.pyi new file mode 100644 index 00000000..a1ab4d7f --- /dev/null +++ b/openllm-python/src/openllm/_runners.pyi @@ -0,0 +1,126 @@ +from typing import ( + Any, + AsyncGenerator, + Dict, + Generic, + Iterable, + List, + Literal, + Optional, + Protocol, + Tuple, + Type, + TypeVar, + Union, + final, +) + +from bentoml import Model, Strategy, Tag +from bentoml._internal.runner.runner_handle import RunnerHandle +from openllm_core import LLMConfig +from openllm_core._typing_compat import LiteralBackend, T, overload + +from ._llm import LLM + +try: + from vllm import AsyncLLMEngine +except ImportError: + AsyncLLMEngine = Any + +try: + from transformers import PreTrainedModel +except ImportError: + PreTrainedModel = Any + +Mo = TypeVar('Mo') + +class _Runnable(Protocol[Mo]): + SUPPORTED_RESOURCES: Tuple[Literal['nvidia.com/gpu'], Literal['amd.com/gpu'], Literal['cpu']] = ... + SUPPORTS_CPU_MULTI_THREADING: bool = ... + config: LLMConfig = ... + model: Mo = ... + def __init__(self, llm: LLM[Mo, T]) -> None: ... 
+ async def generate_iterator( + self, + prompt_token_ids: List[int], + request_id: str, + stop: Optional[Union[str, Iterable[str]]] = ..., + adapter_name: Optional[str] = ..., + **attrs: Any, + ) -> AsyncGenerator[str, None]: ... + +In = TypeVar('In') +Ret = TypeVar('Ret') + +class RunnerMethod(Generic[In, Ret]): ... + +@final +class vLLMRunnable(_Runnable[AsyncLLMEngine]): ... + +@final +class PyTorchRunnable(_Runnable[PreTrainedModel]): + tokenizer: Any + +@overload +def runnable(backend: Literal['vllm']) -> Type[vLLMRunnable]: ... +@overload +def runnable(backend: Literal['pt']) -> Type[PyTorchRunnable]: ... +@overload +def runnable(backend: Optional[str] = ...) -> Type[Union[vLLMRunnable, PyTorchRunnable]]: ... + +class Runner(Protocol[Mo, T]): + __doc__: str = ... + __module__: str = ... + llm_type: str = ... + llm_tag: Tag = ... + identifying_params: Dict[str, Any] = ... + llm: LLM[Mo, T] = ... + config: LLMConfig = ... + backend: LiteralBackend = ... + has_adapters: bool = ... + prompt_template: Optional[str] = ... + system_message: Optional[str] = ... + + class generate_iterator(RunnerMethod[List[int], AsyncGenerator[str, None]]): + @staticmethod + def async_stream( + prompt_token_ids: List[int], + request_id: str, + stop: Optional[Union[Iterable[str], str]] = ..., + adapter_name: Optional[str] = ..., + **attrs: Any, + ) -> AsyncGenerator[str, None]: ... + + def __init__( + self, + runnable_class: Type[_Runnable[Mo]], + *, + runnable_init_params: Optional[Dict[str, Any]] = ..., + name: Optional[str] = ..., + scheduling_strategy: Type[Strategy] = ..., + models: Optional[List[Model]] = ..., + max_batch_size: Optional[int] = ..., + max_latency_ms: Optional[int] = ..., + method_configs: Optional[Dict[str, Dict[str, int]]] = ..., + embedded: bool = ..., + ) -> None: ... + + name: str = ... + models: List[Model] = ... + resource_config: Dict[str, Any] + runnable_class: Type[_Runnable[Mo]] + embedded: bool + runner_methods: List[RunnerMethod[Any, Any]] + scheduling_strategy: Type[Strategy] + workers_per_resource: Union[int, float] = ... + runnable_init_params: Dict[str, Any] = ... + _runner_handle: RunnerHandle = ... + + def init_local(self, quiet: bool = False) -> None: ... + def init_client(self, handle_class: Optional[Type[RunnerHandle]] = ..., *args: Any, **kwargs: Any) -> None: ... + async def runner_handle_is_ready(self, timeout: int = ...) -> bool: ... + def destroy(self) -> None: ... + @property + def scheduled_worker_count(self) -> int: ... + @property + def scheduled_worker_env_map(self) -> Dict[int, Dict[str, Any]]: ... 
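Tying the `_runners.pyi` stub back to `_llm.py`: `runnable()` resolves the backend-specific Runnable class via the overloads above, and the `Runner` protocol is the type of `LLM.runner`, so a streaming call is typed end to end. A minimal sketch, not part of the diff — the model id is a placeholder and it assumes the weights can be pulled into the local store:

```python
import asyncio

import openllm
from openllm._runners import runnable

RunnableCls = runnable('pt')  # narrowed to Type[PyTorchRunnable] by the stub's overloads

async def main() -> None:
    llm = openllm.LLM('facebook/opt-125m', backend='pt')  # placeholder model id
    # generate_iterator initialises the runner locally on first use and yields GenerationOutput chunks.
    async for chunk in llm.generate_iterator('What do type stubs buy us?', max_new_tokens=32):
        print(chunk.outputs[0].text, end='', flush=True)

asyncio.run(main())
```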
diff --git a/openllm-python/src/openllm/_service_vars.py b/openllm-python/src/openllm/_service_vars.py index b7b2821a..49ceedc3 100644 --- a/openllm-python/src/openllm/_service_vars.py +++ b/openllm-python/src/openllm/_service_vars.py @@ -1,4 +1,3 @@ -from __future__ import annotations import os model_id = os.environ['OPENLLM_MODEL_ID'] # openllm: model name diff --git a/openllm-python/src/openllm/_service_vars_pkg.py b/openllm-python/src/openllm/_service_vars_pkg.py index 773a24f8..f7ed217b 100644 --- a/openllm-python/src/openllm/_service_vars_pkg.py +++ b/openllm-python/src/openllm/_service_vars_pkg.py @@ -1,5 +1,3 @@ -from __future__ import annotations - model_id = '{__model_id__}' # openllm: model id model_tag = '{__model_tag__}' # openllm: model tag adapter_map = """{__model_adapter_map__}""" # openllm: model adapter map diff --git a/openllm-python/src/openllm/bundle/__init__.py b/openllm-python/src/openllm/bundle/__init__.py index ea8b0e10..bf508134 100644 --- a/openllm-python/src/openllm/bundle/__init__.py +++ b/openllm-python/src/openllm/bundle/__init__.py @@ -1,36 +1,15 @@ import os -import typing as t from openllm_core.utils import LazyModule -_import_structure = { - '_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'], - 'oci': [ - 'CONTAINER_NAMES', - 'get_base_container_tag', - 'get_base_container_name', - 'supported_registries', - 'RefResolver', - ], -} - -if t.TYPE_CHECKING: - from . import _package as _package, oci as oci - from ._package import ( - build_editable as build_editable, - construct_docker_options as construct_docker_options, - construct_python_options as construct_python_options, - create_bento as create_bento, - ) - from .oci import ( - CONTAINER_NAMES as CONTAINER_NAMES, - RefResolver as RefResolver, - get_base_container_name as get_base_container_name, - get_base_container_tag as get_base_container_tag, - supported_registries as supported_registries, - ) - -__lazy = LazyModule(__name__, os.path.abspath('__file__'), _import_structure) +__lazy = LazyModule( + __name__, + os.path.abspath('__file__'), + { + '_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'], + 'oci': ['CONTAINER_NAMES', 'supported_registries', 'RefResolver'], + }, +) __all__ = __lazy.__all__ __dir__ = __lazy.__dir__ __getattr__ = __lazy.__getattr__ diff --git a/openllm-python/src/openllm/bundle/__init__.pyi b/openllm-python/src/openllm/bundle/__init__.pyi new file mode 100644 index 00000000..46cb314d --- /dev/null +++ b/openllm-python/src/openllm/bundle/__init__.pyi @@ -0,0 +1,32 @@ +from typing import Optional + +import attr + +from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy +from openllm_core.utils.lazy import VersionInfo + +from . import _package as _package, oci as oci +from ._package import ( + build_editable as build_editable, + construct_docker_options as construct_docker_options, + construct_python_options as construct_python_options, + create_bento as create_bento, +) + +CONTAINER_NAMES: dict[LiteralContainerRegistry, str] = ... +supported_registries: list[str] = ... + +@attr.attrs(eq=False, order=False, slots=True, frozen=True) +class RefResolver: + git_hash: str + version: VersionInfo + strategy: LiteralContainerVersionStrategy + + @classmethod + def from_strategy(cls, strategy_or_version: Optional[LiteralContainerVersionStrategy] = ...) -> RefResolver: ... + @property + def tag(self) -> str: ... 
+ @staticmethod + def construct_base_image( + reg: LiteralContainerRegistry, strategy: Optional[LiteralContainerVersionStrategy] = ... + ) -> str: ... diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index 148d14e1..26daf585 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -1,16 +1,12 @@ # mypy: disable-error-code="misc" from __future__ import annotations import importlib.metadata -import inspect import logging import os import string import typing as t from pathlib import Path -import fs -import fs.copy -import fs.errors import orjson from simple_di import Provide, inject @@ -18,38 +14,27 @@ import bentoml import openllm_core from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions from bentoml._internal.configuration.containers import BentoMLContainer +from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg from . import oci if t.TYPE_CHECKING: - from fs.base import FS - - import openllm - from bentoml._internal.bento import BentoStore - from bentoml._internal.models.model import ModelStore - from openllm_core._typing_compat import ( - LiteralContainerRegistry, - LiteralContainerVersionStrategy, - LiteralSerialisation, - LiteralString, - ) + from openllm_core._typing_compat import LiteralString logger = logging.getLogger(__name__) OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD' -def build_editable( - path: str, package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm' -) -> str | None: +def build_editable(path, package='openllm'): """Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.""" - if not openllm_core.utils.check_bool_env(OPENLLM_DEV_BUILD, default=False): + if not check_bool_env(OPENLLM_DEV_BUILD, default=False): return None # We need to build the package in editable mode, so that we can import it from build import ProjectBuilder from build.env import IsolatedEnvBuilder - module_location = openllm_core.utils.pkg.source_locations(package) + module_location = pkg.source_locations(package) if not module_location: raise RuntimeError( 'Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.' @@ -68,12 +53,7 @@ def build_editable( ) -def construct_python_options( - llm: openllm.LLM[t.Any, t.Any], - llm_fs: FS, - extra_dependencies: tuple[str, ...] | None = None, - adapter_map: dict[str, str] | None = None, -) -> PythonOptions: +def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None): packages = ['openllm', 'scipy'] # apparently bnb misses this one if adapter_map is not None: packages += ['openllm[fine-tune]'] @@ -88,24 +68,18 @@ def construct_python_options( if req is not None: packages.extend(req) if str(os.environ.get('BENTOML_BUNDLE_LOCAL_BUILD', False)).lower() == 'false': - packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}") + packages.append(f"bentoml>={'.'.join([str(i) for i in pkg.pkg_version_info('bentoml')])}") - if not openllm_core.utils.is_torch_available(): - raise ValueError('PyTorch is not available. 
Make sure to have it locally installed.') - packages.extend( - ['torch==2.0.1+cu118', 'vllm==0.2.1.post1', 'xformers==0.0.22', 'bentoml[tracing]==1.1.9'] - ) # XXX: Currently locking this for correctness - wheels: list[str] = [] - built_wheels = [ - build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p)) - for p in ('openllm_core', 'openllm_client', 'openllm') - ] + # XXX: Currently locking this for correctness + packages.extend(['torch==2.0.1+cu118', 'vllm==0.2.1.post1', 'xformers==0.0.22', 'bentoml[tracing]==1.1.9']) + wheels = [] + built_wheels = [build_editable(llm_fs.getsyspath('/'), p) for p in ('openllm_core', 'openllm_client', 'openllm')] if all(i for i in built_wheels): wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)]) return PythonOptions( packages=packages, wheels=wheels, - lock_packages=False, + lock_packages=True, extra_index_url=[ 'https://download.pytorch.org/whl/cu118', 'https://huggingface.github.io/autogptq-index/whl/cu118/', @@ -114,15 +88,8 @@ def construct_python_options( def construct_docker_options( - llm: openllm.LLM[t.Any, t.Any], - _: FS, - quantize: LiteralString | None, - adapter_map: dict[str, str] | None, - dockerfile_template: str | None, - serialisation: LiteralSerialisation, - container_registry: LiteralContainerRegistry, - container_version_strategy: LiteralContainerVersionStrategy, -) -> DockerOptions: + llm, _, quantize, adapter_map, dockerfile_template, serialisation, container_registry, container_version_strategy +): from openllm_cli._factory import parse_config_options environ = parse_config_options(llm.config, llm.config['timeout'], 1.0, None, True, os.environ.copy()) @@ -145,7 +112,7 @@ def construct_docker_options( if quantize: env_dict['OPENLLM_QUANTIZE'] = str(quantize) return DockerOptions( - base_image=f'{oci.get_base_container_name(container_registry)}:{oci.get_base_container_tag(container_version_strategy)}', + base_image=oci.RefResolver.construct_base_image(container_registry, container_version_strategy), env=env_dict, dockerfile_template=dockerfile_template, ) @@ -160,21 +127,13 @@ class _ServiceVarsFormatter(string.Formatter): keyword: LiteralString = '__model_name__' identifier: LiteralString = '# openllm: model name' - def __init__(self, target: str): - """The formatter that extends model_name to be formatted the 'service.py'.""" + def __init__(self, target): super().__init__() self.target = target - def vformat(self, format_string: str, *args: t.Any, **attrs: t.Any) -> t.Any: + def vformat(self, format_string, *args, **attrs) -> str: return super().vformat(format_string, (), {self.keyword: self.target}) - def can_format(self, value: str) -> bool: - try: - self.parse(value) - return True - except ValueError: - return False - def parse_line(self, line: str, nl: bool = True) -> str: if self.identifier not in line: return line @@ -201,9 +160,7 @@ _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py' _service_vars_file = Path(os.path.abspath(__file__)).parent.parent / '_service_vars_pkg.py' -def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] | None, llm_fs: FS) -> None: - from openllm_core.utils import DEBUG - +def write_service(llm, llm_fs, adapter_map): model_id_formatter = ModelIdFormatter(llm.model_id) model_tag_formatter = ModelTagFormatter(str(llm.tag)) adapter_map_formatter = ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()) @@ -222,8 +179,8 @@ def write_service(llm: 
openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] | src_contents[i] = adapter_map_formatter.parse_line(it) script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n" + ''.join(src_contents) - if DEBUG: - logger.info('Generated script:\n%s', script) + if SHOW_CODEGEN: + logger.info('Generated _service_vars.py:\n%s', script) llm_fs.writetext('_service_vars.py', script) logger.debug( @@ -236,22 +193,20 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] | @inject def create_bento( - bento_tag: bentoml.Tag, - llm_fs: FS, - llm: openllm.LLM[t.Any, t.Any], - quantize: LiteralString | None, - dockerfile_template: str | None, - adapter_map: dict[str, str] | None = None, - extra_dependencies: tuple[str, ...] | None = None, - serialisation: LiteralSerialisation | None = None, - container_registry: LiteralContainerRegistry = 'ecr', - container_version_strategy: LiteralContainerVersionStrategy = 'release', - _bento_store: BentoStore = Provide[BentoMLContainer.bento_store], - _model_store: ModelStore = Provide[BentoMLContainer.model_store], -) -> bentoml.Bento: - _serialisation: LiteralSerialisation = openllm_core.utils.first_not_none( - serialisation, default=llm.config['serialisation'] - ) + bento_tag, + llm_fs, + llm, + quantize, + dockerfile_template, + adapter_map=None, + extra_dependencies=None, + serialisation=None, + container_registry='ecr', + container_version_strategy='release', + _bento_store=Provide[BentoMLContainer.bento_store], + _model_store=Provide[BentoMLContainer.model_store], +): + _serialisation = openllm_core.utils.first_not_none(serialisation, default=llm.config['serialisation']) labels = dict(llm.identifying_params) labels.update( { @@ -270,47 +225,31 @@ def create_bento( labels.update(adapter_map) logger.debug("Building Bento '%s' with model backend '%s'", bento_tag, llm.__llm_backend__) # add service.py definition to this temporary folder - write_service(llm, adapter_map, llm_fs) + write_service(llm, llm_fs, adapter_map) - llm_spec = ModelSpec.from_item({'tag': str(llm.tag), 'alias': llm.tag.name}) - build_config = BentoBuildConfig( - service=f"{llm.config['service_name']}:svc", - name=bento_tag.name, - labels=labels, - models=[llm_spec], - description=f"OpenLLM service for {llm.config['start_name']}", - include=list(llm_fs.walk.files()), - exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'], - python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), - docker=construct_docker_options( - llm, - llm_fs, - quantize, - adapter_map, - dockerfile_template, - _serialisation, - container_registry, - container_version_strategy, + bento = bentoml.Bento.create( + version=bento_tag.version, + build_ctx=llm_fs.getsyspath('/'), + build_config=BentoBuildConfig( + service=f"{llm.config['service_name']}:svc", + name=bento_tag.name, + labels=labels, + models=[ModelSpec.from_item({'tag': str(llm.tag), 'alias': llm.tag.name})], + description=f"OpenLLM service for {llm.config['start_name']}", + include=list(llm_fs.walk.files()), + exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'], + python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), + docker=construct_docker_options( + llm, + llm_fs, + quantize, + adapter_map, + dockerfile_template, + _serialisation, + container_registry, + container_version_strategy, + ), ), ) - bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/')) - # NOTE: the model_id_path here 
are only used for setting this environment variable within the container built with for BentoLLM. - service_fs_path = fs.path.join('src', llm.config['service_name']) - service_path = bento._fs.getsyspath(service_fs_path) - with open(service_path, 'r') as f: - service_contents = f.readlines() - - for it in service_contents: - if '__bento_name__' in it: - service_contents[service_contents.index(it)] = it.format(__bento_name__=str(bento.tag)) - - script = ''.join(service_contents) - if openllm_core.utils.DEBUG: - logger.info('Generated script:\n%s', script) - - bento._fs.writetext(service_fs_path, script) - if 'model_store' in inspect.signature(bento.save).parameters: - return bento.save(bento_store=_bento_store, model_store=_model_store) - # backward arguments. `model_store` is added recently - return bento.save(bento_store=_bento_store) + return bento.save(bento_store=_bento_store, model_store=_model_store) diff --git a/openllm-python/src/openllm/bundle/_package.pyi b/openllm-python/src/openllm/bundle/_package.pyi new file mode 100644 index 00000000..3289b3bd --- /dev/null +++ b/openllm-python/src/openllm/bundle/_package.pyi @@ -0,0 +1,52 @@ +from typing import Dict, Optional, Tuple + +from fs.base import FS +from typing_extensions import LiteralString + +from bentoml import Bento, Tag +from bentoml._internal.bento import BentoStore +from bentoml._internal.bento.build_config import DockerOptions, PythonOptions +from bentoml._internal.models.model import ModelStore +from openllm_core._typing_compat import ( + LiteralContainerRegistry, + LiteralContainerVersionStrategy, + LiteralQuantise, + LiteralSerialisation, + M, + T, +) + +from .._llm import LLM + +def build_editable(path: str, package: LiteralString) -> Optional[str]: ... +def construct_python_options( + llm: LLM[M, T], + llm_fs: FS, + extra_dependencies: Optional[Tuple[str, ...]] = ..., + adapter_map: Optional[Dict[str, str]] = ..., +) -> PythonOptions: ... +def construct_docker_options( + llm: LLM[M, T], + llm_fs: FS, + quantize: Optional[LiteralQuantise], + adapter_map: Optional[Dict[str, str]], + dockerfile_template: Optional[str], + serialisation: LiteralSerialisation, + container_registry: LiteralContainerRegistry, + container_version_strategy: LiteralContainerVersionStrategy, +) -> DockerOptions: ... +def write_service(llm: LLM[M, T], llm_fs: FS, adapter_map: Optional[Dict[str, str]]) -> None: ... +def create_bento( + bento_tag: Tag, + llm_fs: FS, + llm: LLM[M, T], + quantize: Optional[LiteralQuantise], + dockerfile_template: Optional[str], + adapter_map: Optional[Dict[str, str]] = ..., + extra_dependencies: Optional[Tuple[str, ...]] = ..., + serialisation: Optional[LiteralSerialisation] = ..., + container_registry: LiteralContainerRegistry = ..., + container_version_strategy: LiteralContainerVersionStrategy = ..., + _bento_store: BentoStore = ..., + _model_store: ModelStore = ..., +) -> Bento: ... 
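A quick sketch of the consolidated base-image helper may help reviewers: the bundle stubs above re-export RefResolver, and the oci changes below fold get_base_container_name/get_base_container_tag into a single RefResolver.construct_base_image static method. The snippet assumes the API exactly as declared in these stubs; the resolved tag depends on the latest GitHub release at call time, so the value in the comment is only illustrative.

    import openllm

    # registry keys mirror _CONTAINER_REGISTRY: 'docker', 'gh' or 'ecr'; strategy 'release'
    # resolves the newest GitHub release, while 'latest'/'nightly' pin the moving tag
    base_image = openllm.bundle.RefResolver.construct_base_image('ecr', 'release')
    # e.g. 'public.ecr.aws/y5w8i4y6/bentoml/openllm:<resolved release version>'

This is the same call path create_bento now takes through construct_docker_options.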
diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py index 8bb4d19d..ddb35c4f 100644 --- a/openllm-python/src/openllm/bundle/oci/__init__.py +++ b/openllm-python/src/openllm/bundle/oci/__init__.py @@ -1,26 +1,21 @@ -# mypy: disable-error-code="misc" from __future__ import annotations import functools import importlib import logging import os import pathlib -import typing as t import attr +from openllm_core._typing_compat import LiteralContainerVersionStrategy from openllm_core.exceptions import OpenLLMException -from openllm_core.utils import codegen from openllm_core.utils.lazy import VersionInfo -if t.TYPE_CHECKING: - from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy, RefTuple - logger = logging.getLogger(__name__) ROOT_DIR = pathlib.Path(os.path.abspath('__file__')).parent.parent.parent -_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = { +_CONTAINER_REGISTRY = { 'docker': 'docker.io/bentoml/openllm', 'gh': 'ghcr.io/bentoml/openllm', 'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm', @@ -30,80 +25,48 @@ _CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = { _OWNER, _REPO = 'bentoml', 'openllm' -def _convert_version_from_string(s: str) -> VersionInfo: - return VersionInfo.from_version_string(s) - - -_RefTuple: type[RefTuple] = codegen.make_attr_tuple_class('_RefTuple', ['git_hash', 'version', 'strategy']) - - @attr.attrs(eq=False, order=False, slots=True, frozen=True) class RefResolver: git_hash: str = attr.field() - version: VersionInfo = attr.field(converter=_convert_version_from_string) + version: VersionInfo = attr.field(converter=lambda s: VersionInfo.from_version_string(s)) strategy: LiteralContainerVersionStrategy = attr.field() - @classmethod - def _release_ref(cls, version_str: str | None = None) -> RefTuple: - try: - from ghapi.all import GhApi - - ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False) - meta = t.cast(t.Dict[str, t.Any], ghapi.repos.get_latest_release()) - except Exception as err: - raise OpenLLMException('Failed to determine latest release version.') from err - _use_base_strategy = version_str is None - if version_str is None: - # NOTE: This strategy will only support openllm>0.2.12 - version_str = meta['name'].lstrip('v') - version = (ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str) - else: - version = ('', version_str) - return _RefTuple((*version, 'release' if _use_base_strategy else 'custom')) - @classmethod @functools.lru_cache(maxsize=64) - def from_strategy(cls, strategy_or_version: LiteralContainerVersionStrategy | None = None) -> RefResolver: + def from_strategy(cls, strategy_or_version=None): # using default strategy if strategy_or_version is None or strategy_or_version == 'release': - return cls(*cls._release_ref()) + try: + from ghapi.all import GhApi + + ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False) + meta = ghapi.repos.get_latest_release() + git_hash = ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'] + except Exception as err: + raise OpenLLMException('Failed to determine latest release version.') from err + return cls(git_hash=git_hash, version=meta['name'].lstrip('v'), strategy='release') elif strategy_or_version in ('latest', 'nightly'): # latest is nightly return cls(git_hash='latest', version='0.0.0', strategy='latest') else: raise ValueError(f'Unknown strategy: {strategy_or_version}') @property - def tag(self) -> str: + def tag(self): return 'latest' if 
self.strategy in {'latest', 'nightly'} else repr(self.version) - -@functools.lru_cache(maxsize=256) -def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str: - return RefResolver.from_strategy(strategy).tag + @staticmethod + def construct_base_image(reg, strategy=None): + return f'{_CONTAINER_REGISTRY[reg]}:{RefResolver.from_strategy(strategy).tag}' -def get_base_container_name(reg: LiteralContainerRegistry) -> str: - return _CONTAINER_REGISTRY[reg] +__all__ = ['CONTAINER_NAMES', 'RefResolver', 'supported_registries'] -if t.TYPE_CHECKING: - CONTAINER_NAMES: dict[LiteralContainerRegistry, str] - supported_registries: list[str] - -__all__ = [ - 'CONTAINER_NAMES', - 'get_base_container_tag', - 'get_base_container_name', - 'supported_registries', - 'RefResolver', -] - - -def __dir__() -> list[str]: +def __dir__(): return sorted(__all__) -def __getattr__(name: str) -> t.Any: +def __getattr__(name): if name == 'supported_registries': return functools.lru_cache(1)(lambda: list(_CONTAINER_REGISTRY))() elif name == 'CONTAINER_NAMES': diff --git a/openllm-python/src/openllm/testing.py b/openllm-python/src/openllm/testing.py index 1fceeaa0..f51355bf 100644 --- a/openllm-python/src/openllm/testing.py +++ b/openllm-python/src/openllm/testing.py @@ -1,5 +1,3 @@ -"""Tests utilities for OpenLLM.""" - from __future__ import annotations import contextlib import logging diff --git a/openllm-python/src/openllm_cli/_factory.py b/openllm-python/src/openllm_cli/_factory.py index fff432c6..b55f27eb 100644 --- a/openllm-python/src/openllm_cli/_factory.py +++ b/openllm-python/src/openllm_cli/_factory.py @@ -429,7 +429,7 @@ def workers_per_resource_option( - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models. - - ``conserved``: This will determine the number of available GPU resources, and only assign one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``. + - ``conserved``: This will determine the number of available GPU resources and assign a single worker to use all of them. For example, if there are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``. """ + ( """\n diff --git a/openllm-python/src/openllm_cli/entrypoint.py b/openllm-python/src/openllm_cli/entrypoint.py index 7741fd6f..c809c110 100644 --- a/openllm-python/src/openllm_cli/entrypoint.py +++ b/openllm-python/src/openllm_cli/entrypoint.py @@ -77,6 +77,7 @@ from openllm_core.utils import ( compose, configure_logging, first_not_none, + gen_random_uuid, get_debug_mode, get_disable_warnings, get_quiet_mode, @@ -986,7 +987,6 @@ def build_command( > To build the bento with compiled OpenLLM, make sure to prepend HATCH_BUILD_HOOKS_ENABLE=1. Make sure that the deployment > target also use the same Python version and architecture as build machine.
""" - from openllm._llm import normalise_model_name from openllm.serialisation.transformers.weights import has_safetensors_weights if model_id in openllm.CONFIG_MAPPING: @@ -1046,7 +1046,7 @@ def build_command( labels = dict(llm.identifying_params) labels.update({'_type': llm.llm_type, '_framework': llm.__llm_backend__}) - with fs.open_fs(f'temp://llm_{normalise_model_name(model_id)}') as llm_fs: + with fs.open_fs(f'temp://llm_{gen_random_uuid()}') as llm_fs: dockerfile_template_path = None if dockerfile_template: with dockerfile_template: diff --git a/openllm-python/src/openllm_cli/extension/build_base_container.py b/openllm-python/src/openllm_cli/extension/build_base_container.py index a5783c79..93d560c2 100644 --- a/openllm-python/src/openllm_cli/extension/build_base_container.py +++ b/openllm-python/src/openllm_cli/extension/build_base_container.py @@ -43,16 +43,13 @@ def build_container( "This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'" ) if not registries: - tags: dict[str | LiteralContainerRegistry, str] = { - alias: f'{value}:{openllm.bundle.get_base_container_tag(version_strategy)}' - for alias, value in openllm.bundle.CONTAINER_NAMES.items() + tags = { + alias: openllm.bundle.RefResolver.construct_base_image(alias, version_strategy) + for alias in openllm.bundle.CONTAINER_NAMES } else: registries = [registries] if isinstance(registries, str) else list(registries) - tags = { - name: f'{openllm.bundle.CONTAINER_NAMES[name]}:{openllm.bundle.get_base_container_tag(version_strategy)}' - for name in registries - } + tags = {name: openllm.bundle.RefResolver.construct_base_image(name, version_strategy) for name in registries} try: outputs = _BUILDER.build( file=pathlib.Path(__file__).parent.joinpath('Dockerfile').resolve().__fspath__(), diff --git a/openllm-python/tests/conftest.py b/openllm-python/tests/conftest.py index 8561b899..e49b2656 100644 --- a/openllm-python/tests/conftest.py +++ b/openllm-python/tests/conftest.py @@ -20,9 +20,7 @@ _PROMPT_MAPPING = { } -def parametrise_local_llm( - model: str -) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]: +def parametrise_local_llm(model: str) -> t.Generator[tuple[str, openllm.LLM[t.Any, t.Any]], None, None]: if model not in _MODELING_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.") backends: tuple[LiteralBackend, ...] 
= ('pt',) diff --git a/ruff.toml b/ruff.toml index 00e5ceb7..0b6f16c5 100644 --- a/ruff.toml +++ b/ruff.toml @@ -121,3 +121,4 @@ docstring-quotes = "double" "openllm-python/src/openllm/_llm.py" = ["F811"] "openllm-core/src/openllm_core/utils/import_utils.py" = ["PLW0603", "F811"] "openllm-core/src/openllm_core/_configuration.py" = ["F811", "Q001"] +"openllm-python/src/openllm/__init__.pyi" = ["I001"] diff --git a/tools/dependencies.py b/tools/dependencies.py index c3b66f35..ce55507c 100755 --- a/tools/dependencies.py +++ b/tools/dependencies.py @@ -140,7 +140,7 @@ class Dependencies: return cls(*decls) -lower_bentoml_constraint = '1.1.2' +lower_bentoml_constraint = '1.1.9' _BENTOML_EXT = ['io'] _TRANSFORMERS_EXT = ['torch', 'tokenizers'] diff --git a/tools/update-config-stubs.py b/tools/update-config-stubs.py index 9340b83e..7d4c5007 100755 --- a/tools/update-config-stubs.py +++ b/tools/update-config-stubs.py @@ -14,10 +14,14 @@ END_ATTRS_COMMENT = f'# {os.path.basename(__file__)}: attrs stop\n' # Stubs for auto class START_AUTO_STUBS_COMMENT = f'# {os.path.basename(__file__)}: auto stubs start\n' END_AUTO_STUBS_COMMENT = f'# {os.path.basename(__file__)}: auto stubs stop\n' +# Stubs for actual imports +START_IMPORT_STUBS_COMMENT = f'# {os.path.basename(__file__)}: import stubs start\n' +END_IMPORT_STUBS_COMMENT = f'# {os.path.basename(__file__)}: import stubs stop\n' ROOT = Path(__file__).parent.parent _TARGET_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / '_configuration.py' _TARGET_AUTO_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / 'config' / 'configuration_auto.py' +_TARGET_INIT_FILE = ROOT / 'openllm-python' / 'src' / 'openllm' / '__init__.pyi' sys.path.insert(0, (ROOT / 'openllm-core' / 'src').__fspath__()) from openllm_core._configuration import GenerationConfig, ModelSettings, SamplingParams @@ -216,6 +220,22 @@ def main() -> int: ) with _TARGET_AUTO_FILE.open('w') as f: f.writelines(processed) + + with _TARGET_INIT_FILE.open('r') as f: + processed = f.readlines() + start_import_stubs_idx, end_import_stubs_idx = ( + processed.index(START_IMPORT_STUBS_COMMENT), + processed.index(END_IMPORT_STUBS_COMMENT), + ) + lines = f'from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING,CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,AutoConfig as AutoConfig,{",".join([a+" as "+a for a in CONFIG_MAPPING_NAMES.values()])}\n' + processed = ( + processed[:start_import_stubs_idx] + + [START_IMPORT_STUBS_COMMENT, lines, END_IMPORT_STUBS_COMMENT] + + processed[end_import_stubs_idx + 1 :] + ) + with _TARGET_INIT_FILE.open('w') as f: + f.writelines(processed) + return 0 diff --git a/tools/update-mypy.py b/tools/update-mypy.py new file mode 100755 index 00000000..bf5cbfb3 --- /dev/null +++ b/tools/update-mypy.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +import concurrent.futures +import configparser +import os +from typing import List + + +# Function to find .pyi files in a given directory +def pyi_in_subdir(directory: str, git_root: str) -> List[str]: + pyi_files = [] + for root, _, files in os.walk(directory): + for file in files: + if file.endswith('.pyi') or file == '_typing_compat.py': + full_path = os.path.join(root, file) + # Convert to relative path with respect to the git root + relative_path = os.path.relpath(full_path, git_root) + pyi_files.append(relative_path) + return pyi_files + + +def find_pyi_files(git_root: str) -> List[str]: + # List all subdirectories + subdirectories = [ + os.path.join(git_root, name) for name in os.listdir(git_root) if
os.path.isdir(os.path.join(git_root, name)) + ] + + # Use a thread pool to execute searches concurrently + with concurrent.futures.ThreadPoolExecutor() as executor: + # Map of future to subdirectory + future_to_subdir = {executor.submit(pyi_in_subdir, subdir, git_root): subdir for subdir in subdirectories} + + all_pyi_files = set() + for future in concurrent.futures.as_completed(future_to_subdir): + pyi_files = future.result() + all_pyi_files.update(pyi_files) + + return list(all_pyi_files) + + +# Function to update mypy.ini file +def update_mypy_ini(pyi_files: List[str], mypy_ini_path: str) -> int: + config = configparser.ConfigParser() + config.read(mypy_ini_path) + + # Existing files from mypy.ini + existing_files = config.get('mypy', 'files', fallback='').split(', ') + + # Add new .pyi files if they are not already in the list + updated_files = existing_files + [f for f in pyi_files if f not in existing_files] + + # Update the 'files' entry + config['mypy']['files'] = ', '.join(updated_files) + + # Write changes back to mypy.ini + with open(mypy_ini_path, 'w') as configfile: + config.write(configfile) + return 0 + + +ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +MYPY_CONFIG = os.path.join(ROOT, 'mypy.ini') + +if __name__ == '__main__': + raise SystemExit(update_mypy_ini(find_pyi_files(ROOT), MYPY_CONFIG))
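For completeness, the new tools/update-mypy.py walks every top-level directory for *.pyi stubs (plus _typing_compat.py) and appends anything missing to the files entry of mypy.ini. A minimal sketch of calling its two helpers from the repository root; loading the script via importlib is an assumption made here, since its filename contains a dash and it is normally just executed directly:

    import importlib.util
    import os

    # load tools/update-mypy.py as a module so its helpers can be called (sketch only)
    spec = importlib.util.spec_from_file_location('update_mypy', 'tools/update-mypy.py')
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)

    stubs = mod.find_pyi_files(os.getcwd())  # e.g. 'openllm-python/src/openllm/bundle/_package.pyi'
    mod.update_mypy_ini(stubs, os.path.join(os.getcwd(), 'mypy.ini'))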