feat(type): provide structured annotations stubs (#663)

* feat(type): provide client stubs

separation of concerns for a more concise code base

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* docs: update changelog

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron Pham
2023-11-16 02:58:45 -05:00
committed by GitHub
parent c6264f3af7
commit 4a6f13ddd2
32 changed files with 795 additions and 582 deletions
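
The recurring pattern in this commit is to strip inline annotations from the runtime modules and move the full signatures into sibling `.pyi` stubs. A minimal sketch of that split, using a hypothetical helper:

# helpers.py - runtime module, kept annotation-free (hypothetical file)
def dasherise(name):
    return name.replace('_', '-')

# helpers.pyi - sibling stub that carries the signatures for type checkers
def dasherise(name: str) -> str: ...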


@@ -1,14 +1,3 @@
"""OpenLLM.
An open platform for operating large language models in production. Fine-tune, serve,
deploy, and monitor any LLMs with ease.
* Built-in support for StableLM, Llama 2, Dolly, Flan-T5, Vicuna
* Option to bring your own fine-tuned LLMs
* Online Serving with HTTP, gRPC, SSE(coming soon) or custom API
* Native integration with BentoML and LangChain for custom LLM apps
"""
import logging as _logging
import os as _os
import pathlib as _pathlib
@@ -57,13 +46,14 @@ __lazy = utils.LazyModule(
'entrypoints': ['mount_entrypoints'],
'serialisation': ['ggml', 'transformers'],
'_quantisation': ['infer_quantisation_config'],
'_llm': ['LLM', 'LLMRunner', 'LLMRunnable'],
'_llm': ['LLM'],
'_generation': [
'StopSequenceCriteria',
'StopOnTokens',
'LogitsProcessorList',
'StoppingCriteriaList',
'prepare_logits_processor',
'get_context_length',
'is_sentence_complete',
'is_partial_stop',
],
},
extra_objects={


@@ -1,3 +1,21 @@
"""OpenLLM.
===========
An open platform for operating large language models in production.
Fine-tune, serve, deploy, and monitor any LLMs with ease.
* Built-in support for Mistral, Llama 2, Yi, StableLM, Dolly, Flan-T5, Vicuna
* Option to bring your own fine-tuned LLMs
* Online Serving with HTTP, gRPC, SSE or custom API
* Native integration with BentoML, LangChain, OpenAI compatible endpoints, LlamaIndex for custom LLM apps
"""
# fmt: off
# update-config-stubs.py: import stubs start
from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING,CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,AutoConfig as AutoConfig,BaichuanConfig as BaichuanConfig,ChatGLMConfig as ChatGLMConfig,DollyV2Config as DollyV2Config,FalconConfig as FalconConfig,FlanT5Config as FlanT5Config,GPTNeoXConfig as GPTNeoXConfig,LlamaConfig as LlamaConfig,MistralConfig as MistralConfig,MPTConfig as MPTConfig,OPTConfig as OPTConfig,StableLMConfig as StableLMConfig,StarCoderConfig as StarCoderConfig,YiConfig as YiConfig
# update-config-stubs.py: import stubs stop
# fmt: on
import openllm_cli as _cli
from openllm_cli._sdk import (
build as build,
@@ -16,23 +34,6 @@ from openllm_core._schemas import (
GenerationOutput as GenerationOutput,
MetadataOutput as MetadataOutput,
)
from openllm_core.config import (
CONFIG_MAPPING as CONFIG_MAPPING,
CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,
AutoConfig as AutoConfig,
BaichuanConfig as BaichuanConfig,
ChatGLMConfig as ChatGLMConfig,
DollyV2Config as DollyV2Config,
FalconConfig as FalconConfig,
FlanT5Config as FlanT5Config,
GPTNeoXConfig as GPTNeoXConfig,
LlamaConfig as LlamaConfig,
MistralConfig as MistralConfig,
MPTConfig as MPTConfig,
OPTConfig as OPTConfig,
StableLMConfig as StableLMConfig,
StarCoderConfig as StarCoderConfig,
)
from . import (
bundle as bundle,
@@ -44,13 +45,14 @@ from . import (
)
from ._deprecated import Runner as Runner
from ._generation import (
LogitsProcessorList as LogitsProcessorList,
StopOnTokens as StopOnTokens,
StoppingCriteriaList as StoppingCriteriaList,
StopSequenceCriteria as StopSequenceCriteria,
prepare_logits_processor as prepare_logits_processor,
is_partial_stop as is_partial_stop,
is_sentence_complete as is_sentence_complete,
get_context_length as get_context_length,
)
from ._llm import LLM as LLM, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner
from ._llm import LLM as LLM
from ._quantisation import infer_quantisation_config as infer_quantisation_config
from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
from .client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient
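
With the stub above in place, the public surface remains a plain `import openllm`. A hedged usage sketch; the model name is illustrative and `AutoConfig.for_model` is assumed from this package's config API:

import openllm

print(sorted(openllm.CONFIG_MAPPING_NAMES))  # registered model families
cfg = openllm.AutoConfig.for_model('llama')  # assumed AutoConfig helper
print(cfg['max_new_tokens'])                 # LLMConfig supports item access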


@@ -1,43 +1,24 @@
# mypy: disable-error-code="misc"
from __future__ import annotations
import typing as t
import transformers
if t.TYPE_CHECKING:
import torch
import openllm
# reexport from transformers
LogitsProcessorList = transformers.LogitsProcessorList
StoppingCriteriaList = transformers.StoppingCriteriaList
class StopSequenceCriteria(transformers.StoppingCriteria):
def __init__(
self,
stop_sequences: str | list[str],
tokenizer: transformers.PreTrainedTokenizer
| transformers.PreTrainedTokenizerBase
| transformers.PreTrainedTokenizerFast,
):
def __init__(self, stop_sequences, tokenizer):
if isinstance(stop_sequences, str):
stop_sequences = [stop_sequences]
self.stop_sequences, self.tokenizer = stop_sequences, tokenizer
def __call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool:
def __call__(self, input_ids, scores, **kwargs):
return any(
self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences
)
class StopOnTokens(transformers.StoppingCriteria):
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool:
def __call__(self, input_ids, scores, **kwargs):
return input_ids[0][-1] in {50278, 50279, 50277, 1, 0}
def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsProcessorList:
def prepare_logits_processor(config):
generation_config = config.generation_config
logits_processor = transformers.LogitsProcessorList()
if generation_config['temperature'] >= 1e-5 and generation_config['temperature'] != 1.0:
@@ -55,7 +36,7 @@ def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsPr
SEQLEN_KEYS = ['max_sequence_length', 'seq_length', 'max_position_embeddings', 'max_seq_len', 'model_max_length']
def get_context_length(config: transformers.PretrainedConfig) -> int:
def get_context_length(config):
rope_scaling = getattr(config, 'rope_scaling', None)
rope_scaling_factor = config.rope_scaling['factor'] if rope_scaling else 1.0
for key in SEQLEN_KEYS:
@@ -64,11 +45,11 @@ def get_context_length(config: transformers.PretrainedConfig) -> int:
return 2048
def is_sentence_complete(output: str) -> bool:
def is_sentence_complete(output):
return output.endswith(('.', '?', '!', '...', '。', '？', '！', '…', '"', "'", '”'))
def is_partial_stop(output: str, stop_str: str) -> bool:
def is_partial_stop(output, stop_str):
"""Check whether the output contains a partial stop str."""
for i in range(min(len(output), len(stop_str))):
if stop_str.startswith(output[-i:]):


@@ -0,0 +1,28 @@
from typing import Any, List, Union
from torch import FloatTensor, LongTensor
from transformers import (
LogitsProcessorList,
PretrainedConfig,
PreTrainedTokenizer,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
)
from openllm_core import LLMConfig
Tokenizer = Union[PreTrainedTokenizerBase, PreTrainedTokenizer, PreTrainedTokenizerFast]
class StopSequenceCriteria:
stop_sequences: List[str]
tokenizer: Tokenizer
def __init__(self, stop_sequences: Union[str, List[str]], tokenizer: Tokenizer) -> None: ...
def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ...
class StopOnTokens:
def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ...
def prepare_logits_processor(config: LLMConfig) -> LogitsProcessorList: ...
def get_context_length(config: PretrainedConfig) -> int: ...
def is_sentence_complete(output: str) -> bool: ...
def is_partial_stop(output: str, stop_str: str) -> bool: ...
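
For context, a hedged sketch of wiring these helpers into a plain `transformers` generation loop; the checkpoint and stop sequence are illustrative:

import transformers
from openllm._generation import StopSequenceCriteria, StoppingCriteriaList, get_context_length

tok = transformers.AutoTokenizer.from_pretrained('gpt2')
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')
print(get_context_length(model.config))  # context window derived from the HF config

inputs = tok('Q: What is a type stub?\nA:', return_tensors='pt')
stopping = StoppingCriteriaList([StopSequenceCriteria('\nQ:', tok)])
out = model.generate(**inputs, max_new_tokens=64, stopping_criteria=stopping)
print(tok.decode(out[0], skip_special_tokens=True))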


@@ -1,6 +1,4 @@
# mypy: disable-error-code="name-defined,attr-defined"
from __future__ import annotations
import abc
import functools
import logging
import os
@@ -10,14 +8,12 @@ import typing as t
import attr
import inflection
import orjson
from huggingface_hub import hf_hub_download
import bentoml
import openllm
import openllm_core
from bentoml._internal.models.model import ModelSignature
from bentoml._internal.runner.runner_handle import DummyRunnerHandle
from openllm_core._schemas import CompletionChunk, GenerationOutput
from openllm_core._schemas import GenerationOutput
from openllm_core._typing_compat import (
AdapterMap,
AdapterTuple,
@@ -43,32 +39,27 @@ from openllm_core.utils import (
converter,
first_not_none,
flatten_attrs,
gen_random_uuid,
generate_hash_from_file,
get_debug_mode,
get_disable_warnings,
get_quiet_mode,
is_peft_available,
is_vllm_available,
resolve_filepath,
validate_is_path,
)
from ._quantisation import infer_quantisation_config
from ._strategies import CascadingResourceStrategy
from .exceptions import ForbiddenAttributeError, OpenLLMException
from .serialisation.constants import PEFT_CONFIG_NAME
if t.TYPE_CHECKING:
import torch
import transformers
from peft.config import PeftConfig
from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM
from bentoml._internal.runner.runnable import RunnableMethod
from bentoml._internal.runner.runner import RunnerMethod
from bentoml._internal.runner.runner_handle import RunnerHandle
from bentoml._internal.runner.strategy import Strategy
from openllm_core._configuration import LLMConfig
from openllm_core.utils.representation import ReprArgs
from ._runners import Runner
ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]]
@@ -84,16 +75,15 @@ def normalise_model_name(name: str) -> str:
return inflection.dasherize(name)
def resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
"""Resolve the type of the PeftConfig given the adapter_map.
def _resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
try:
from huggingface_hub import hf_hub_download
except ImportError:
raise MissingDependencyError(
"Failed to import 'huggingface_hub'. Make sure to do 'pip install \"openllm[fine-tune]\"'"
) from None
This is similar to how PeftConfig resolves its config type.
Args:
adapter_map: The given mapping from either SDK or CLI. See CLI docs for more information.
"""
resolved: AdapterMap = {}
_has_set_default = False
for path_or_adapter_id, name in adapter_map.items():
if name is None:
raise ValueError('Adapter name must be specified.')
@@ -107,7 +97,7 @@ def resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
with open(config_file, 'r') as file:
resolved_config = orjson.loads(file.read())
# all peft_type should be available in PEFT_CONFIG_NAME
_peft_type: AdapterType = resolved_config['peft_type'].lower()
_peft_type = resolved_config['peft_type'].lower()
if _peft_type not in resolved:
resolved[_peft_type] = ()
resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
@@ -151,7 +141,7 @@ class LLM(t.Generic[M, T], ReprMixin):
__llm_config__: LLMConfig | None = None
__llm_backend__: LiteralBackend = None # type: ignore
__llm_quantization_config__: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None = None
__llm_runner__: t.Optional[LLMRunner[M, T]] = None
__llm_runner__: t.Optional[Runner[M, T]] = None
__llm_model__: t.Optional[M] = None
__llm_tokenizer__: t.Optional[T] = None
__llm_adapter_map__: t.Optional[ResolvedAdapterMap] = None
@@ -159,35 +149,29 @@ class LLM(t.Generic[M, T], ReprMixin):
def __init__(
self,
model_id: str,
model_version: str | None = None,
model_tag: str | bentoml.Tag | None = None,
prompt_template: PromptTemplate | str | None = None,
system_message: str | None = None,
llm_config: LLMConfig | None = None,
backend: LiteralBackend | None = None,
*args: t.Any,
quantize: LiteralQuantise | None = None,
quantization_config: transformers.BitsAndBytesConfig
| transformers.GPTQConfig
| transformers.AwqConfig
| None = None,
adapter_map: dict[str, str] | None = None,
serialisation: LiteralSerialisation = 'safetensors',
trust_remote_code: bool = False,
embedded: bool = False,
torch_dtype: LiteralDtype | t.Literal['auto', 'half', 'float'] = 'auto',
**attrs: t.Any,
model_id,
model_version=None,
model_tag=None,
prompt_template=None,
system_message=None,
llm_config=None,
backend=None,
*args,
quantize=None,
quantization_config=None,
adapter_map=None,
serialisation='safetensors',
trust_remote_code=False,
embedded=False,
torch_dtype='auto',
low_cpu_mem_usage=True,
**attrs,
):
# low_cpu_mem_usage is only available for PyTorch models; it helps avoid OOM on systems with low memory
low_cpu_mem_usage = attrs.pop('low_cpu_mem_usage', True)
_local = False
if validate_is_path(model_id):
model_id, _local = resolve_filepath(model_id), True
backend = first_not_none(
backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if openllm.utils.is_vllm_available() else 'pt'
)
backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt')
torch_dtype = first_not_none(os.getenv('TORCH_DTYPE'), torch_dtype, default='auto')
quantize = first_not_none(quantize, os.getenv('OPENLLM_QUANTIZE'), default=None)
# elif quantization_config is None and quantize is not None:
@@ -215,7 +199,7 @@ class LLM(t.Generic[M, T], ReprMixin):
quantization_config=quantization_config,
quantise=quantize,
model_decls=args,
adapter_map=resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
adapter_map=_resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
serialisation=serialisation,
local=_local,
prompt_template=prompt_template,
@@ -244,7 +228,7 @@ class LLM(t.Generic[M, T], ReprMixin):
self.runner.init_local(quiet=True)
@property
def _torch_dtype(self) -> torch.dtype:
def _torch_dtype(self):
import torch
import transformers
@@ -298,11 +282,15 @@ class LLM(t.Generic[M, T], ReprMixin):
super().__setattr__(attr, value)
@property
def _model_attrs(self) -> dict[str, t.Any]:
def _model_attrs(self):
return {**self.import_kwargs[0], **self.__model_attrs}
@_model_attrs.setter
def _model_attrs(self, value):
self.__model_attrs = value
@property
def _tokenizer_attrs(self) -> dict[str, t.Any]:
def _tokenizer_attrs(self):
return {**self.import_kwargs[1], **self.__tokenizer_attrs}
@property
@@ -319,41 +307,42 @@ class LLM(t.Generic[M, T], ReprMixin):
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
import torch
return {'device_map': 'auto' if torch.cuda.is_available() else None, 'torch_dtype': self._torch_dtype}, {
'padding_side': 'left',
'truncation_side': 'left',
}
model_attrs = {'device_map': 'auto' if torch.cuda.is_available() else None, 'torch_dtype': self._torch_dtype}
tokenizer_attrs = {'padding_side': 'left', 'truncation_side': 'left'}
return model_attrs, tokenizer_attrs
@property
def trust_remote_code(self) -> bool:
def trust_remote_code(self):
env = os.getenv('TRUST_REMOTE_CODE')
if env is not None:
return str(env).upper() in ENV_VARS_TRUE_VALUES
return self.__llm_trust_remote_code__
@property
def runner_name(self) -> str:
def runner_name(self):
return f"llm-{self.config['start_name']}-runner"
@property
def model_id(self) -> str:
def model_id(self):
return self._model_id
@property
def revision(self) -> str:
return t.cast(str, self._revision)
def revision(self):
return self._revision
@property
def tag(self) -> bentoml.Tag:
def tag(self):
return self._tag
@property
def bentomodel(self) -> bentoml.Model:
def bentomodel(self):
return openllm.serialisation.get(self)
@property
def quantization_config(self) -> transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig:
def quantization_config(self):
if self.__llm_quantization_config__ is None:
from ._quantisation import infer_quantisation_config
if self._quantization_config is not None:
self.__llm_quantization_config__ = self._quantization_config
elif self._quantise is not None:
@@ -365,55 +354,55 @@ class LLM(t.Generic[M, T], ReprMixin):
return self.__llm_quantization_config__
@property
def has_adapters(self) -> bool:
def has_adapters(self):
return self._adapter_map is not None
@property
def local(self) -> bool:
def local(self):
return self._local
@property
def quantise(self) -> LiteralQuantise | None:
def quantise(self):
return self._quantise
# NOTE: The section below defines a loose contract with langchain's LLM interface.
@property
def llm_type(self) -> str:
def llm_type(self):
return normalise_model_name(self._model_id)
@property
def identifying_params(self) -> DictStrAny:
def llm_parameters(self):
return (self._model_decls, self._model_attrs), self._tokenizer_attrs
@property
def identifying_params(self):
return {
'configuration': self.config.model_dump_json().decode(),
'model_ids': orjson.dumps(self.config['model_ids']).decode(),
'model_id': self.model_id,
}
@property
def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]:
return (self._model_decls, self._model_attrs), self._tokenizer_attrs
# NOTE: This section is the actual model, tokenizer, and config reference here.
@property
def config(self) -> LLMConfig:
def config(self):
if self.__llm_config__ is None:
self.__llm_config__ = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs)
return self.__llm_config__
@property
def tokenizer(self) -> T:
def tokenizer(self):
if self.__llm_tokenizer__ is None:
self.__llm_tokenizer__ = openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1])
return self.__llm_tokenizer__
@property
def runner(self) -> LLMRunner[M, T]:
def runner(self):
if self.__llm_runner__ is None:
self.__llm_runner__ = _RunnerFactory(self)
return self.__llm_runner__
@property
def model(self) -> M:
def model(self):
if self.__llm_model__ is None:
model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
# If OOM, then it is probably you don't have enough VRAM to run this model.
@@ -439,7 +428,7 @@ class LLM(t.Generic[M, T], ReprMixin):
return self.__llm_model__
@property
def adapter_map(self) -> ResolvedAdapterMap:
def adapter_map(self):
try:
import peft as _ # noqa: F401
except ImportError as err:
@@ -461,9 +450,7 @@ class LLM(t.Generic[M, T], ReprMixin):
self.__llm_adapter_map__ = _map
return self.__llm_adapter_map__
def prepare_for_training(
self, adapter_type: AdapterType = 'lora', use_gradient_checking: bool = True, **attrs: t.Any
) -> tuple[PeftModel | PeftModelForCausalLM | PeftModelForSeq2SeqLM, T]:
def prepare_for_training(self, adapter_type='lora', use_gradient_checking=True, **attrs):
from peft.mapping import get_peft_model
from peft.utils.other import prepare_model_for_kbit_training
@@ -484,15 +471,8 @@ class LLM(t.Generic[M, T], ReprMixin):
return model, self.tokenizer
async def generate(
self,
prompt: str | None,
prompt_token_ids: list[int] | None = None,
stop: str | t.Iterable[str] | None = None,
stop_token_ids: list[int] | None = None,
request_id: str | None = None,
adapter_name: str | None = None,
**attrs: t.Any,
) -> GenerationOutput:
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
):
config = self.config.model_construct_env(**attrs)
texts: list[list[str]] = [[]] * config['n']
token_ids: list[list[int]] = [[]] * config['n']
@@ -515,15 +495,8 @@ class LLM(t.Generic[M, T], ReprMixin):
)
async def generate_iterator(
self,
prompt: str | None,
prompt_token_ids: list[int] | None = None,
stop: str | t.Iterable[str] | None = None,
stop_token_ids: list[int] | None = None,
request_id: str | None = None,
adapter_name: str | None = None,
**attrs: t.Any,
) -> t.AsyncGenerator[GenerationOutput, None]:
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
):
if isinstance(self.runner._runner_handle, DummyRunnerHandle):
if os.getenv('BENTO_PATH') is not None:
raise RuntimeError('Runner client failed to set up correctly.')
@@ -551,14 +524,13 @@ class LLM(t.Generic[M, T], ReprMixin):
raise ValueError('Either prompt or prompt_token_ids must be specified.')
prompt_token_ids = self.tokenizer.encode(prompt)
if request_id is None:
request_id = openllm_core.utils.gen_random_uuid()
request_id = gen_random_uuid() if request_id is None else request_id
previous_texts, previous_num_tokens = [''] * config['n'], [0] * config['n']
async for out in self.runner.generate_iterator.async_stream(
prompt_token_ids, request_id, stop, adapter_name, **config.model_dump(flatten=True)
prompt_token_ids, request_id, stop=stop, adapter_name=adapter_name, **config.model_dump(flatten=True)
):
generated = GenerationOutput.from_runner(out).with_options(prompt=prompt)
delta_outputs = t.cast(t.List[CompletionChunk], [None] * len(generated.outputs))
delta_outputs = [None] * len(generated.outputs)
if generated.finished:
break
for output in generated.outputs:
@@ -570,44 +542,37 @@ class LLM(t.Generic[M, T], ReprMixin):
def _RunnerFactory(
self: openllm.LLM[M, T],
/,
models: list[bentoml.Model] | None = None,
max_batch_size: int | None = None,
max_latency_ms: int | None = None,
scheduling_strategy: type[bentoml.Strategy] = CascadingResourceStrategy,
*,
backend: LiteralBackend | None = None,
) -> LLMRunner[M, T]:
llm, /, models=None, max_batch_size=None, max_latency_ms=None, scheduling_strategy=None, *, backend=None
):
from ._runners import runnable
backend = t.cast(
LiteralBackend, first_not_none(backend, os.environ.get('OPENLLM_BACKEND'), default=self.__llm_backend__)
)
if scheduling_strategy is None:
from ._strategies import CascadingResourceStrategy
scheduling_strategy = CascadingResourceStrategy
backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND', default=llm.__llm_backend__))
models = models if models is not None else []
try:
models.append(self.bentomodel)
models.append(llm.bentomodel)
except bentoml.exceptions.NotFound as err:
raise RuntimeError(f'Failed to locate {self.bentomodel}:{err}') from err
raise RuntimeError(f'Failed to locate {llm.bentomodel}:{err}') from err
if self._prompt_template:
prompt_template = self._prompt_template.to_string()
elif hasattr(self.config, 'default_prompt_template'):
prompt_template = self.config.default_prompt_template
if llm._prompt_template:
prompt_template = llm._prompt_template.to_string()
elif hasattr(llm.config, 'default_prompt_template'):
prompt_template = llm.config.default_prompt_template
else:
prompt_template = None
if self._system_message:
system_message = self._system_message
elif hasattr(self.config, 'default_system_message'):
system_message = self.config.default_system_message
if llm._system_message:
system_message = llm._system_message
elif hasattr(llm.config, 'default_system_message'):
system_message = llm.config.default_system_message
else:
system_message = None
def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]:
return {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'}
def _wrapped_repr_args(_: LLMRunner[M, T]) -> ReprArgs:
def _wrapped_repr_args(_):
yield (
'runner_methods',
{
@@ -618,89 +583,40 @@ def _RunnerFactory(
for method in _.runner_methods
},
)
yield 'config', self.config.model_dump(flatten=True)
yield 'llm_type', self.llm_type
yield 'config', llm.config.model_dump(flatten=True)
yield 'llm_type', llm.llm_type
yield 'backend', backend
yield 'llm_tag', self.tag
yield 'llm_tag', llm.tag
return types.new_class(
self.__class__.__name__ + 'Runner',
llm.config.__class__.__name__[:-6] + 'Runner',
(bentoml.Runner,),
exec_body=lambda ns: ns.update(
{
'llm_type': self.llm_type,
'identifying_params': self.identifying_params,
'llm_tag': self.tag,
'llm': self,
'config': self.config,
'llm_type': llm.llm_type,
'identifying_params': llm.identifying_params,
'llm_tag': llm.tag,
'llm': llm,
'config': llm.config,
'backend': backend,
'__module__': self.__module__,
'__doc__': llm.config.__class__.__doc__ or f'Generated Runner class for {llm.config["model_name"]}',
'__module__': llm.__module__,
'__repr__': ReprMixin.__repr__,
'__repr_keys__': property(_wrapped_repr_keys),
'__repr_keys__': property(lambda _: {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'}),
'__repr_args__': _wrapped_repr_args,
'has_adapters': self.has_adapters,
'has_adapters': llm.has_adapters,
'prompt_template': prompt_template,
'system_message': system_message,
}
),
)(
runnable(backend),
name=self.runner_name,
name=llm.runner_name,
embedded=False,
models=models,
max_batch_size=max_batch_size,
max_latency_ms=max_latency_ms,
scheduling_strategy=scheduling_strategy,
runnable_init_params=dict(llm=self),
runnable_init_params={'llm': llm},
method_configs=converter.unstructure({'generate_iterator': ModelSignature(batchable=False)}),
)
@t.final
class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
SUPPORTED_RESOURCES = ('amd.com/gpu', 'nvidia.com/gpu', 'cpu')
SUPPORTS_CPU_MULTI_THREADING = True
generate_iterator: RunnableMethod[LLMRunnable[M, T], [list[int], str, str | t.Iterable[str] | None, str | None], str]
@t.final
class LLMRunner(t.Protocol[M, T]):
__doc__: str
__module__: str
llm_type: str
llm_tag: bentoml.Tag
identifying_params: dict[str, t.Any]
llm: openllm.LLM[M, T]
config: openllm.LLMConfig
backend: LiteralBackend
has_adapters: bool
system_message: str | None
prompt_template: str | None
generate_iterator: RunnerMethod[LLMRunnable[M, T], [list[int], str, str | t.Iterable[str] | None, str | None], str]
runner_methods: list[RunnerMethod[t.Any, t.Any, t.Any]]
scheduling_strategy: type[Strategy]
workers_per_resource: int | float
runnable_init_params: dict[str, t.Any]
_runner_handle: RunnerHandle
def __init__(
self,
runnable_class: type[LLMRunnable[M, T]],
*,
runnable_init_params: dict[str, t.Any] | None = ...,
name: str | None = ...,
scheduling_strategy: type[Strategy] = ...,
models: list[bentoml.Model] | None = ...,
max_batch_size: int | None = ...,
max_latency_ms: int | None = ...,
method_configs: dict[str, dict[str, int]] | None = ...,
embedded: bool = False,
) -> None: ...
@property
@abc.abstractmethod
def __repr_keys__(self) -> set[str]: ...
__all__ = ['LLMRunner', 'LLMRunnable', 'LLM']


@@ -0,0 +1,158 @@
from typing import Any, AsyncGenerator, Dict, Generic, Iterable, List, Literal, Optional, Set, Tuple, TypedDict, Union
import attr
import torch
from peft.config import PeftConfig
from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM
from bentoml import Model, Tag
from openllm_core import LLMConfig
from openllm_core._schemas import GenerationOutput
from openllm_core._typing_compat import (
AdapterMap,
AdapterType,
LiteralBackend,
LiteralDtype,
LiteralQuantise,
LiteralSerialisation,
M,
T,
)
from openllm_core.prompts import PromptTemplate
from openllm_core.utils.representation import ReprArgs
from ._quantisation import QuantizationConfig
from ._runners import Runner
InjectedModel = Union[PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM]
class IdentifyingParams(TypedDict):
configuration: str
model_ids: str
model_id: str
ResolvedAdapterMap = Dict[AdapterType, Dict[str, Tuple[PeftConfig, str]]]
Dtype = Union[LiteralDtype, Literal['auto', 'half', 'float']]
@attr.define(slots=True, repr=False, init=False)
class LLM(Generic[M, T]):
_model_id: str
_revision: Optional[str]
_quantization_config: Optional[QuantizationConfig]
_quantise: Optional[LiteralQuantise]
_model_decls: Tuple[Any, ...]
__model_attrs: Dict[str, Any]
__tokenizer_attrs: Dict[str, Any]
_tag: Tag
_adapter_map: Optional[AdapterMap]
_serialisation: LiteralSerialisation
_local: bool
_prompt_template: Optional[PromptTemplate]
_system_message: Optional[str]
__llm_torch_dtype__: Dtype = ...
__llm_config__: Optional[LLMConfig] = ...
__llm_backend__: LiteralBackend = ...
__llm_quantization_config__: Optional[QuantizationConfig] = ...
__llm_runner__: Optional[Runner[M, T]] = ...
__llm_model__: Optional[M] = ...
__llm_tokenizer__: Optional[T] = ...
__llm_adapter_map__: Optional[ResolvedAdapterMap] = ...
__llm_trust_remote_code__: bool = ...
@property
def __repr_keys__(self) -> Set[str]: ...
def __repr__(self) -> str: ...
def __str__(self) -> str: ...
def __repr_name__(self) -> str: ...
def __repr_str__(self, join_str: str) -> str: ...
def __repr_args__(self) -> ReprArgs: ...
def __init__(
self,
model_id: str,
model_version: Optional[str] = ...,
model_tag: Optional[Union[str, Tag]] = ...,
prompt_template: Optional[Union[str, PromptTemplate]] = ...,
system_message: Optional[str] = ...,
llm_config: Optional[LLMConfig] = ...,
backend: Optional[LiteralBackend] = ...,
*args: Any,
quantize: Optional[LiteralQuantise] = ...,
quantization_config: Optional[QuantizationConfig] = ...,
adapter_map: Optional[Dict[str, str]] = ...,
serialisation: LiteralSerialisation = ...,
trust_remote_code: bool = ...,
embedded: bool = ...,
torch_dtype: Dtype = ...,
low_cpu_mem_usage: bool = ...,
**attrs: Any,
) -> None: ...
@property
def _torch_dtype(self) -> torch.dtype: ...
@property
def _model_attrs(self) -> Dict[str, Any]: ...
@_model_attrs.setter
def _model_attrs(self, model_attrs: Dict[str, Any]) -> None: ...
@property
def _tokenizer_attrs(self) -> Dict[str, Any]: ...
@property
def import_kwargs(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: ...
@property
def trust_remote_code(self) -> bool: ...
@property
def runner_name(self) -> str: ...
@property
def model_id(self) -> str: ...
@property
def revision(self) -> str: ...
@property
def tag(self) -> Tag: ...
@property
def bentomodel(self) -> Model: ...
@property
def quantization_config(self) -> QuantizationConfig: ...
@property
def has_adapters(self) -> bool: ...
@property
def local(self) -> bool: ...
@property
def quantise(self) -> Optional[LiteralQuantise]: ...
@property
def llm_type(self) -> str: ...
@property
def identifying_params(self) -> IdentifyingParams: ...
@property
def llm_parameters(self) -> Tuple[Tuple[Tuple[Any, ...], Dict[str, Any]], Dict[str, Any]]: ...
@property
def config(self) -> LLMConfig: ...
@property
def tokenizer(self) -> T: ...
@property
def model(self) -> M: ...
@property
def runner(self) -> Runner[M, T]: ...
@property
def adapter_map(self) -> ResolvedAdapterMap: ...
def prepare_for_training(
self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any
) -> Tuple[InjectedModel, T]: ...
async def generate(
self,
prompt: Optional[str],
prompt_token_ids: Optional[List[int]] = ...,
stop: Optional[Union[str, Iterable[str]]] = ...,
stop_token_ids: Optional[List[int]] = ...,
request_id: Optional[str] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> GenerationOutput: ...
async def generate_iterator(
self,
prompt: Optional[str],
prompt_token_ids: Optional[List[int]] = ...,
stop: Optional[Union[str, Iterable[str]]] = ...,
stop_token_ids: Optional[List[int]] = ...,
request_id: Optional[str] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> AsyncGenerator[GenerationOutput, None]: ...
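
A hedged end-to-end sketch of the `LLM` API this stub describes; the model id is illustrative and the PyTorch backend is assumed to be available locally:

import asyncio
import openllm

async def main():
    llm = openllm.LLM('facebook/opt-125m', backend='pt')
    result = await llm.generate('What is a type stub?', max_new_tokens=32)
    print(result.outputs[0].text)  # GenerationOutput holds one entry per candidate

asyncio.run(main())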


@@ -1,12 +1,5 @@
# mypy: disable-error-code="name-defined,no-redef"
from __future__ import annotations
import logging
import typing as t
import torch
import transformers
from openllm_core._typing_compat import LiteralQuantise, overload
from openllm_core.exceptions import MissingDependencyError
from openllm_core.utils import (
is_autoawq_available,
@@ -15,35 +8,11 @@ from openllm_core.utils import (
is_optimum_supports_gptq,
)
if t.TYPE_CHECKING:
from openllm_core._typing_compat import DictStrAny
from ._llm import LLM
def infer_quantisation_config(llm, quantise, **attrs):
import torch
import transformers
logger = logging.getLogger(__name__)
@overload
def infer_quantisation_config(
self: LLM[t.Any, t.Any], quantise: t.Literal['int8', 'int4'], **attrs: t.Any
) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]: ...
@overload
def infer_quantisation_config(
self: LLM[t.Any, t.Any], quantise: t.Literal['gptq'], **attrs: t.Any
) -> tuple[transformers.GPTQConfig, DictStrAny]: ...
@overload
def infer_quantisation_config(
self: LLM[t.Any, t.Any], quantise: t.Literal['awq'], **attrs: t.Any
) -> tuple[transformers.AwqConfig, DictStrAny]: ...
def infer_quantisation_config(
self: LLM[t.Any, t.Any], quantise: LiteralQuantise, **attrs: t.Any
) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig, DictStrAny]:
# 8 bit configuration
int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
@@ -54,12 +23,17 @@ def infer_quantisation_config(
bits = attrs.pop('bits', 4)
group_size = attrs.pop('group_size', 128)
def create_awq_config() -> transformers.AwqConfig:
# 4 bit configuration
int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16)
int4_quant_type = attrs.pop('bnb_4bit_quant_type', 'nf4')
int4_use_double_quant = attrs.pop('bnb_4bit_use_double_quant', True)
def create_awq_config():
zero_point = attrs.pop('zero_point', True)
return transformers.AwqConfig(bits=bits, group_size=group_size, zero_point=zero_point)
def create_gptq_config() -> transformers.GPTQConfig:
gptq_tokenizer = attrs.pop('tokenizer', self.model_id)
def create_gptq_config():
gptq_tokenizer = attrs.pop('tokenizer', llm.model_id)
gptq_dataset = attrs.pop('dataset', 'c4')
gptq_damp_percent = attrs.pop('damp_percent', 0.1)
gptq_desc_act = attrs.pop('desc_act', False)
@@ -94,10 +68,9 @@ def infer_quantisation_config(
exllama_config={'version': 1},
) # XXX: See how to migrate to v2
def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig:
def create_int8_config(int8_skip_modules):
# if int8_skip_modules is None: int8_skip_modules = []
# if 'lm_head' not in int8_skip_modules and self.config_class.__openllm_model_type__ == 'causal_lm':
# logger.debug("Skipping 'lm_head' for quantization for %s", self.__name__)
# int8_skip_modules.append('lm_head')
return transformers.BitsAndBytesConfig(
load_in_8bit=True,
@@ -107,10 +80,13 @@ def infer_quantisation_config(
llm_int8_has_fp16_weight=int8_has_fp16_weight,
)
# 4 bit configuration
int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16)
int4_quant_type = attrs.pop('bnb_4bit_quant_type', 'nf4')
int4_use_double_quant = attrs.pop('bnb_4bit_use_double_quant', True)
def create_int4_config():
return transformers.BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=int4_compute_dtype,
bnb_4bit_quant_type=int4_quant_type,
bnb_4bit_use_double_quant=int4_use_double_quant,
)
# NOTE: Quantization setup. `quantize` is an openllm.LLM feature that quantizes the model with bitsandbytes or quantization-aware training.
if not is_bitsandbytes_available():
@@ -120,23 +96,18 @@ def infer_quantisation_config(
if quantise == 'int8':
quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == 'int4':
quantisation_config = transformers.BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=int4_compute_dtype,
bnb_4bit_quant_type=int4_quant_type,
bnb_4bit_use_double_quant=int4_use_double_quant,
)
quantisation_config = create_int4_config()
elif quantise == 'gptq':
if not is_autogptq_available() or not is_optimum_supports_gptq():
raise MissingDependencyError(
"'quantize=\"gptq\"' requires 'auto-gptq' and 'optimum>=0.12' to be installed (missing or failed to import). Make sure to do 'pip install \"openllm[gptq]\"'"
"GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'"
)
else:
quantisation_config = create_gptq_config()
elif quantise == 'awq':
if not is_autoawq_available():
raise MissingDependencyError(
"quantize='awq' requires 'auto-awq' to be installed (missing or failed to import). Make sure to do 'pip install \"openllm[awq]\"'."
"AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'."
)
else:
quantisation_config = create_awq_config()


@@ -0,0 +1,26 @@
from typing import Any, Dict, Literal, Union
from transformers import AwqConfig, BitsAndBytesConfig, GPTQConfig
from openllm_core._typing_compat import LiteralQuantise, M, T, overload
from ._llm import LLM
QuantizationConfig = Union[BitsAndBytesConfig, GPTQConfig, AwqConfig]
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['int8', 'int4'], **attrs: Any
) -> tuple[BitsAndBytesConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['gptq'], **attrs: Any
) -> tuple[GPTQConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['awq'], **attrs: Any
) -> tuple[AwqConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: LiteralQuantise, **attrs: Any
) -> tuple[QuantizationConfig, Dict[str, Any]]: ...
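
These overloads mainly let type checkers narrow the returned config per quantisation scheme. A hedged sketch, assuming bitsandbytes is installed; the model id is illustrative:

import openllm
from openllm._quantisation import infer_quantisation_config

llm = openllm.LLM('facebook/opt-125m', backend='pt')
config, extra_attrs = infer_quantisation_config(llm, 'int8')
print(type(config).__name__)  # BitsAndBytesConfig; 'gptq'/'awq' narrow to GPTQConfig/AwqConfig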


@@ -9,27 +9,14 @@ import torch
import bentoml
import openllm
from openllm_core._schemas import CompletionChunk, GenerationOutput
from openllm_core._typing_compat import LiteralBackend, M, T
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils import first_not_none, is_vllm_available
if t.TYPE_CHECKING:
import vllm
from openllm_core._schemas import FinishReason
else:
vllm = openllm.utils.LazyLoader('vllm', globals(), 'vllm')
_DEFAULT_TOKENIZER = 'hf-internal-testing/llama-tokenizer'
__all__ = ['runnable']
def runnable(backend: LiteralBackend | None = None) -> type[bentoml.Runnable]:
backend = t.cast(
LiteralBackend,
first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt'),
)
def runnable(backend=None):
backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt')
return vLLMRunnable if backend == 'vllm' else PyTorchRunnable
@@ -37,7 +24,11 @@ class vLLMRunnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
SUPPORTS_CPU_MULTI_THREADING = True
def __init__(self, llm: openllm.LLM[M, T]) -> None:
def __init__(self, llm):
try:
import vllm
except ImportError:
raise OpenLLMException('vLLM is not installed. Please install it via `pip install "openllm[vllm]"`.') from None
self.config = llm.config
num_gpus, dev = 1, openllm.utils.device_count()
if dev >= 2:
@@ -64,14 +55,7 @@ class vLLMRunnable(bentoml.Runnable):
raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err
@bentoml.Runnable.method(batchable=False)
async def generate_iterator(
self,
prompt_token_ids: list[int],
request_id: str,
stop: str | t.Iterable[str] | None = None,
adapter_name: str | None = None,
**attrs: t.Any,
) -> t.AsyncGenerator[str, None]:
async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
if adapter_name is not None:
raise NotImplementedError('Adapter is not supported with vLLM.')
stop_: set[str] = set()
@@ -99,28 +83,19 @@ class PyTorchRunnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
SUPPORTS_CPU_MULTI_THREADING = True
def __init__(self, llm: openllm.LLM[M, T]) -> None:
def __init__(self, llm):
self.model = llm.model
self.tokenizer = llm.tokenizer
self.config = llm.config
@bentoml.Runnable.method(batchable=False)
async def generate_iterator(
self,
prompt_token_ids: list[int],
request_id: str,
stop: str | t.Iterable[str] | None = None,
adapter_name: str | None = None,
**attrs: t.Any,
) -> t.AsyncGenerator[str, None]:
async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
if adapter_name is not None:
self.model.set_adapter(adapter_name)
async for generation_output in self.forward(prompt_token_ids, request_id, stop=stop, **attrs):
yield generation_output.model_dump_json()
async def forward(
self, prompt_token_ids: list[int], request_id: str, stop: str | t.Iterable[str] | None = None, **attrs: t.Any
) -> t.AsyncGenerator[GenerationOutput, None]:
async def forward(self, prompt_token_ids, request_id, stop=None, **attrs):
from ._generation import is_partial_stop, prepare_logits_processor
stop_: set[str] = set()
@@ -142,7 +117,7 @@ class PyTorchRunnable(bentoml.Runnable):
logits_processor = prepare_logits_processor(config)
past_key_values = out = token = None
finish_reason: t.Optional[FinishReason] = None
finish_reason = None
for i in range(config['max_new_tokens']):
if i == 0: # prefill
out = self.model(torch.as_tensor([prompt_token_ids], device=self.model.device), use_cache=True)


@@ -0,0 +1,126 @@
from typing import (
Any,
AsyncGenerator,
Dict,
Generic,
Iterable,
List,
Literal,
Optional,
Protocol,
Tuple,
Type,
TypeVar,
Union,
final,
)
from bentoml import Model, Strategy, Tag
from bentoml._internal.runner.runner_handle import RunnerHandle
from openllm_core import LLMConfig
from openllm_core._typing_compat import LiteralBackend, T, overload
from ._llm import LLM
try:
from vllm import AsyncLLMEngine
except ImportError:
AsyncLLMEngine = Any
try:
from transformers import PreTrainedModel
except ImportError:
PreTrainedModel = Any
Mo = TypeVar('Mo')
class _Runnable(Protocol[Mo]):
SUPPORTED_RESOURCES: Tuple[Literal['nvidia.com/gpu'], Literal['amd.com/gpu'], Literal['cpu']] = ...
SUPPORTS_CPU_MULTI_THREADING: bool = ...
config: LLMConfig = ...
model: Mo = ...
def __init__(self, llm: LLM[Mo, T]) -> None: ...
async def generate_iterator(
self,
prompt_token_ids: List[int],
request_id: str,
stop: Optional[Union[str, Iterable[str]]] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> AsyncGenerator[str, None]: ...
In = TypeVar('In')
Ret = TypeVar('Ret')
class RunnerMethod(Generic[In, Ret]): ...
@final
class vLLMRunnable(_Runnable[AsyncLLMEngine]): ...
@final
class PyTorchRunnable(_Runnable[PreTrainedModel]):
tokenizer: Any
@overload
def runnable(backend: Literal['vllm']) -> Type[vLLMRunnable]: ...
@overload
def runnable(backend: Literal['pt']) -> Type[PyTorchRunnable]: ...
@overload
def runnable(backend: Optional[str] = ...) -> Type[Union[vLLMRunnable, PyTorchRunnable]]: ...
class Runner(Protocol[Mo, T]):
__doc__: str = ...
__module__: str = ...
llm_type: str = ...
llm_tag: Tag = ...
identifying_params: Dict[str, Any] = ...
llm: LLM[Mo, T] = ...
config: LLMConfig = ...
backend: LiteralBackend = ...
has_adapters: bool = ...
prompt_template: Optional[str] = ...
system_message: Optional[str] = ...
class generate_iterator(RunnerMethod[List[int], AsyncGenerator[str, None]]):
@staticmethod
def async_stream(
prompt_token_ids: List[int],
request_id: str,
stop: Optional[Union[Iterable[str], str]] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> AsyncGenerator[str, None]: ...
def __init__(
self,
runnable_class: Type[_Runnable[Mo]],
*,
runnable_init_params: Optional[Dict[str, Any]] = ...,
name: Optional[str] = ...,
scheduling_strategy: Type[Strategy] = ...,
models: Optional[List[Model]] = ...,
max_batch_size: Optional[int] = ...,
max_latency_ms: Optional[int] = ...,
method_configs: Optional[Dict[str, Dict[str, int]]] = ...,
embedded: bool = ...,
) -> None: ...
name: str = ...
models: List[Model] = ...
resource_config: Dict[str, Any]
runnable_class: Type[_Runnable[Mo]]
embedded: bool
runner_methods: List[RunnerMethod[Any, Any]]
scheduling_strategy: Type[Strategy]
workers_per_resource: Union[int, float] = ...
runnable_init_params: Dict[str, Any] = ...
_runner_handle: RunnerHandle = ...
def init_local(self, quiet: bool = False) -> None: ...
def init_client(self, handle_class: Optional[Type[RunnerHandle]] = ..., *args: Any, **kwargs: Any) -> None: ...
async def runner_handle_is_ready(self, timeout: int = ...) -> bool: ...
def destroy(self) -> None: ...
@property
def scheduled_worker_count(self) -> int: ...
@property
def scheduled_worker_env_map(self) -> Dict[int, Dict[str, Any]]: ...
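
A hedged sketch of driving the `Runner` protocol above directly (normally `LLM.generate_iterator` wraps this); the model id and request id are illustrative:

import asyncio
import openllm

async def stream(prompt):
    llm = openllm.LLM('facebook/opt-125m', backend='pt')  # illustrative model
    llm.runner.init_local(quiet=True)                     # attach an in-process handle
    token_ids = llm.tokenizer.encode(prompt)
    async for chunk in llm.runner.generate_iterator.async_stream(
        token_ids, 'request-0', stop=None, adapter_name=None
    ):
        print(chunk)  # each chunk is a JSON-encoded GenerationOutput

asyncio.run(stream('Hello'))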


@@ -1,4 +1,3 @@
from __future__ import annotations
import os
model_id = os.environ['OPENLLM_MODEL_ID'] # openllm: model name


@@ -1,5 +1,3 @@
from __future__ import annotations
model_id = '{__model_id__}' # openllm: model id
model_tag = '{__model_tag__}' # openllm: model tag
adapter_map = """{__model_adapter_map__}""" # openllm: model adapter map
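
These placeholder lines are rewritten at build time by the marker-aware formatters in `bundle/_package.py`; a minimal sketch of that substitution:

# Minimal sketch of the marker-based substitution; the formatter subclasses
# in bundle/_package.py are the real implementation.
line = "model_id = '{__model_id__}'  # openllm: model id"
print(line.format(__model_id__='facebook/opt-125m'))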


@@ -1,36 +1,15 @@
import os
import typing as t
from openllm_core.utils import LazyModule
_import_structure = {
'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'],
'oci': [
'CONTAINER_NAMES',
'get_base_container_tag',
'get_base_container_name',
'supported_registries',
'RefResolver',
],
}
if t.TYPE_CHECKING:
from . import _package as _package, oci as oci
from ._package import (
build_editable as build_editable,
construct_docker_options as construct_docker_options,
construct_python_options as construct_python_options,
create_bento as create_bento,
)
from .oci import (
CONTAINER_NAMES as CONTAINER_NAMES,
RefResolver as RefResolver,
get_base_container_name as get_base_container_name,
get_base_container_tag as get_base_container_tag,
supported_registries as supported_registries,
)
__lazy = LazyModule(__name__, os.path.abspath('__file__'), _import_structure)
__lazy = LazyModule(
__name__,
os.path.abspath('__file__'),
{
'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'],
'oci': ['CONTAINER_NAMES', 'supported_registries', 'RefResolver'],
},
)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__
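
The `LazyModule` wiring above defers submodule imports until first attribute access; a simplified sketch of the underlying PEP 562 pattern (not OpenLLM's actual implementation):

# Simplified lazy-module sketch, assuming the same mapping shape as above.
import importlib

_import_structure = {'_package': ['create_bento'], 'oci': ['RefResolver']}
_attr_to_module = {attr: mod for mod, attrs in _import_structure.items() for attr in attrs}

def __getattr__(name):
    # Import the owning submodule only when the attribute is first requested.
    if name in _attr_to_module:
        module = importlib.import_module(f'.{_attr_to_module[name]}', __name__)
        return getattr(module, name)
    raise AttributeError(name)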


@@ -0,0 +1,32 @@
from typing import Optional
import attr
from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
from openllm_core.utils.lazy import VersionInfo
from . import _package as _package, oci as oci
from ._package import (
build_editable as build_editable,
construct_docker_options as construct_docker_options,
construct_python_options as construct_python_options,
create_bento as create_bento,
)
CONTAINER_NAMES: dict[LiteralContainerRegistry, str] = ...
supported_registries: list[str] = ...
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
git_hash: str
version: VersionInfo
strategy: LiteralContainerVersionStrategy
@classmethod
def from_strategy(cls, strategy_or_version: Optional[LiteralContainerVersionStrategy] = ...) -> RefResolver: ...
@property
def tag(self) -> str: ...
@staticmethod
def construct_base_image(
reg: LiteralContainerRegistry, strategy: Optional[LiteralContainerVersionStrategy] = ...
) -> str: ...
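
A hedged usage sketch of the consolidated container-reference API declared above; the 'release' strategy needs network access to the GitHub API:

from openllm.bundle import CONTAINER_NAMES, RefResolver

print(CONTAINER_NAMES['gh'])                               # ghcr.io/bentoml/openllm
print(RefResolver.construct_base_image('gh', 'latest'))    # ...:latest, no API call needed
print(RefResolver.construct_base_image('ecr', 'release'))  # pinned to the latest release tag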


@@ -1,16 +1,12 @@
# mypy: disable-error-code="misc"
from __future__ import annotations
import importlib.metadata
import inspect
import logging
import os
import string
import typing as t
from pathlib import Path
import fs
import fs.copy
import fs.errors
import orjson
from simple_di import Provide, inject
@@ -18,38 +14,27 @@ import bentoml
import openllm_core
from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions
from bentoml._internal.configuration.containers import BentoMLContainer
from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg
from . import oci
if t.TYPE_CHECKING:
from fs.base import FS
import openllm
from bentoml._internal.bento import BentoStore
from bentoml._internal.models.model import ModelStore
from openllm_core._typing_compat import (
LiteralContainerRegistry,
LiteralContainerVersionStrategy,
LiteralSerialisation,
LiteralString,
)
from openllm_core._typing_compat import LiteralString
logger = logging.getLogger(__name__)
OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
def build_editable(
path: str, package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm'
) -> str | None:
def build_editable(path, package='openllm'):
"""Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set."""
if not openllm_core.utils.check_bool_env(OPENLLM_DEV_BUILD, default=False):
if not check_bool_env(OPENLLM_DEV_BUILD, default=False):
return None
# We need to build the package in editable mode, so that we can import it
from build import ProjectBuilder
from build.env import IsolatedEnvBuilder
module_location = openllm_core.utils.pkg.source_locations(package)
module_location = pkg.source_locations(package)
if not module_location:
raise RuntimeError(
'Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.'
@@ -68,12 +53,7 @@ def build_editable(
)
def construct_python_options(
llm: openllm.LLM[t.Any, t.Any],
llm_fs: FS,
extra_dependencies: tuple[str, ...] | None = None,
adapter_map: dict[str, str] | None = None,
) -> PythonOptions:
def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None):
packages = ['openllm', 'scipy'] # apparently bnb misses this one
if adapter_map is not None:
packages += ['openllm[fine-tune]']
@@ -88,24 +68,18 @@ def construct_python_options(
if req is not None:
packages.extend(req)
if str(os.environ.get('BENTOML_BUNDLE_LOCAL_BUILD', False)).lower() == 'false':
packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}")
packages.append(f"bentoml>={'.'.join([str(i) for i in pkg.pkg_version_info('bentoml')])}")
if not openllm_core.utils.is_torch_available():
raise ValueError('PyTorch is not available. Make sure to have it locally installed.')
packages.extend(
['torch==2.0.1+cu118', 'vllm==0.2.1.post1', 'xformers==0.0.22', 'bentoml[tracing]==1.1.9']
) # XXX: Currently locking this for correctness
wheels: list[str] = []
built_wheels = [
build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p))
for p in ('openllm_core', 'openllm_client', 'openllm')
]
# XXX: Currently locking this for correctness
packages.extend(['torch==2.0.1+cu118', 'vllm==0.2.1.post1', 'xformers==0.0.22', 'bentoml[tracing]==1.1.9'])
wheels = []
built_wheels = [build_editable(llm_fs.getsyspath('/'), p) for p in ('openllm_core', 'openllm_client', 'openllm')]
if all(i for i in built_wheels):
wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)])
return PythonOptions(
packages=packages,
wheels=wheels,
lock_packages=False,
lock_packages=True,
extra_index_url=[
'https://download.pytorch.org/whl/cu118',
'https://huggingface.github.io/autogptq-index/whl/cu118/',
@@ -114,15 +88,8 @@ def construct_python_options(
def construct_docker_options(
llm: openllm.LLM[t.Any, t.Any],
_: FS,
quantize: LiteralString | None,
adapter_map: dict[str, str] | None,
dockerfile_template: str | None,
serialisation: LiteralSerialisation,
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy,
) -> DockerOptions:
llm, _, quantize, adapter_map, dockerfile_template, serialisation, container_registry, container_version_strategy
):
from openllm_cli._factory import parse_config_options
environ = parse_config_options(llm.config, llm.config['timeout'], 1.0, None, True, os.environ.copy())
@@ -145,7 +112,7 @@ def construct_docker_options(
if quantize:
env_dict['OPENLLM_QUANTIZE'] = str(quantize)
return DockerOptions(
base_image=f'{oci.get_base_container_name(container_registry)}:{oci.get_base_container_tag(container_version_strategy)}',
base_image=oci.RefResolver.construct_base_image(container_registry, container_version_strategy),
env=env_dict,
dockerfile_template=dockerfile_template,
)
@@ -160,21 +127,13 @@ class _ServiceVarsFormatter(string.Formatter):
keyword: LiteralString = '__model_name__'
identifier: LiteralString = '# openllm: model name'
def __init__(self, target: str):
"""The formatter that extends model_name to be formatted the 'service.py'."""
def __init__(self, target):
super().__init__()
self.target = target
def vformat(self, format_string: str, *args: t.Any, **attrs: t.Any) -> t.Any:
def vformat(self, format_string, *args, **attrs) -> str:
return super().vformat(format_string, (), {self.keyword: self.target})
def can_format(self, value: str) -> bool:
try:
self.parse(value)
return True
except ValueError:
return False
def parse_line(self, line: str, nl: bool = True) -> str:
if self.identifier not in line:
return line
@@ -201,9 +160,7 @@ _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
_service_vars_file = Path(os.path.abspath(__file__)).parent.parent / '_service_vars_pkg.py'
def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] | None, llm_fs: FS) -> None:
from openllm_core.utils import DEBUG
def write_service(llm, llm_fs, adapter_map):
model_id_formatter = ModelIdFormatter(llm.model_id)
model_tag_formatter = ModelTagFormatter(str(llm.tag))
adapter_map_formatter = ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode())
@@ -222,8 +179,8 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] |
src_contents[i] = adapter_map_formatter.parse_line(it)
script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n" + ''.join(src_contents)
if DEBUG:
logger.info('Generated script:\n%s', script)
if SHOW_CODEGEN:
logger.info('Generated _service_vars.py:\n%s', script)
llm_fs.writetext('_service_vars.py', script)
logger.debug(
@@ -236,22 +193,20 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] |
@inject
def create_bento(
bento_tag: bentoml.Tag,
llm_fs: FS,
llm: openllm.LLM[t.Any, t.Any],
quantize: LiteralString | None,
dockerfile_template: str | None,
adapter_map: dict[str, str] | None = None,
extra_dependencies: tuple[str, ...] | None = None,
serialisation: LiteralSerialisation | None = None,
container_registry: LiteralContainerRegistry = 'ecr',
container_version_strategy: LiteralContainerVersionStrategy = 'release',
_bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
_model_store: ModelStore = Provide[BentoMLContainer.model_store],
) -> bentoml.Bento:
_serialisation: LiteralSerialisation = openllm_core.utils.first_not_none(
serialisation, default=llm.config['serialisation']
)
bento_tag,
llm_fs,
llm,
quantize,
dockerfile_template,
adapter_map=None,
extra_dependencies=None,
serialisation=None,
container_registry='ecr',
container_version_strategy='release',
_bento_store=Provide[BentoMLContainer.bento_store],
_model_store=Provide[BentoMLContainer.model_store],
):
_serialisation = openllm_core.utils.first_not_none(serialisation, default=llm.config['serialisation'])
labels = dict(llm.identifying_params)
labels.update(
{
@@ -270,47 +225,31 @@ def create_bento(
labels.update(adapter_map)
logger.debug("Building Bento '%s' with model backend '%s'", bento_tag, llm.__llm_backend__)
# add service.py definition to this temporary folder
write_service(llm, adapter_map, llm_fs)
write_service(llm, llm_fs, adapter_map)
llm_spec = ModelSpec.from_item({'tag': str(llm.tag), 'alias': llm.tag.name})
build_config = BentoBuildConfig(
service=f"{llm.config['service_name']}:svc",
name=bento_tag.name,
labels=labels,
models=[llm_spec],
description=f"OpenLLM service for {llm.config['start_name']}",
include=list(llm_fs.walk.files()),
exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
docker=construct_docker_options(
llm,
llm_fs,
quantize,
adapter_map,
dockerfile_template,
_serialisation,
container_registry,
container_version_strategy,
bento = bentoml.Bento.create(
version=bento_tag.version,
build_ctx=llm_fs.getsyspath('/'),
build_config=BentoBuildConfig(
service=f"{llm.config['service_name']}:svc",
name=bento_tag.name,
labels=labels,
models=[ModelSpec.from_item({'tag': str(llm.tag), 'alias': llm.tag.name})],
description=f"OpenLLM service for {llm.config['start_name']}",
include=list(llm_fs.walk.files()),
exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
docker=construct_docker_options(
llm,
llm_fs,
quantize,
adapter_map,
dockerfile_template,
_serialisation,
container_registry,
container_version_strategy,
),
),
)
bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))
# NOTE: the model_id_path here are only used for setting this environment variable within the container built with for BentoLLM.
service_fs_path = fs.path.join('src', llm.config['service_name'])
service_path = bento._fs.getsyspath(service_fs_path)
with open(service_path, 'r') as f:
service_contents = f.readlines()
for it in service_contents:
if '__bento_name__' in it:
service_contents[service_contents.index(it)] = it.format(__bento_name__=str(bento.tag))
script = ''.join(service_contents)
if openllm_core.utils.DEBUG:
logger.info('Generated script:\n%s', script)
bento._fs.writetext(service_fs_path, script)
if 'model_store' in inspect.signature(bento.save).parameters:
return bento.save(bento_store=_bento_store, model_store=_model_store)
# backwards-compatible arguments: `model_store` was only added recently
return bento.save(bento_store=_bento_store)
return bento.save(bento_store=_bento_store, model_store=_model_store)


@@ -0,0 +1,52 @@
from typing import Dict, Optional, Tuple
from fs.base import FS
from typing_extensions import LiteralString
from bentoml import Bento, Tag
from bentoml._internal.bento import BentoStore
from bentoml._internal.bento.build_config import DockerOptions, PythonOptions
from bentoml._internal.models.model import ModelStore
from openllm_core._typing_compat import (
LiteralContainerRegistry,
LiteralContainerVersionStrategy,
LiteralQuantise,
LiteralSerialisation,
M,
T,
)
from .._llm import LLM
def build_editable(path: str, package: LiteralString) -> Optional[str]: ...
def construct_python_options(
llm: LLM[M, T],
llm_fs: FS,
extra_dependencies: Optional[Tuple[str, ...]] = ...,
adapter_map: Optional[Dict[str, str]] = ...,
) -> PythonOptions: ...
def construct_docker_options(
llm: LLM[M, T],
llm_fs: FS,
quantize: Optional[LiteralQuantise],
adapter_map: Optional[Dict[str, str]],
dockerfile_template: Optional[str],
serialisation: LiteralSerialisation,
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy,
) -> DockerOptions: ...
def write_service(llm: LLM[M, T], llm_fs: FS, adapter_map: Optional[Dict[str, str]]) -> None: ...
def create_bento(
bento_tag: Tag,
llm_fs: FS,
llm: LLM[M, T],
quantize: Optional[LiteralQuantise],
dockerfile_template: Optional[str],
adapter_map: Optional[Dict[str, str]] = ...,
extra_dependencies: Optional[Tuple[str, ...]] = ...,
serialisation: Optional[LiteralSerialisation] = ...,
container_registry: LiteralContainerRegistry = ...,
container_version_strategy: LiteralContainerVersionStrategy = ...,
_bento_store: BentoStore = ...,
_model_store: ModelStore = ...,
) -> Bento: ...


@@ -1,26 +1,21 @@
# mypy: disable-error-code="misc"
from __future__ import annotations
import functools
import importlib
import logging
import os
import pathlib
import typing as t
import attr
from openllm_core._typing_compat import LiteralContainerVersionStrategy
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils import codegen
from openllm_core.utils.lazy import VersionInfo
if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy, RefTuple
logger = logging.getLogger(__name__)
ROOT_DIR = pathlib.Path(os.path.abspath('__file__')).parent.parent.parent
_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {
_CONTAINER_REGISTRY = {
'docker': 'docker.io/bentoml/openllm',
'gh': 'ghcr.io/bentoml/openllm',
'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm',
@@ -30,80 +25,48 @@ _CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {
_OWNER, _REPO = 'bentoml', 'openllm'
def _convert_version_from_string(s: str) -> VersionInfo:
return VersionInfo.from_version_string(s)
_RefTuple: type[RefTuple] = codegen.make_attr_tuple_class('_RefTuple', ['git_hash', 'version', 'strategy'])
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
git_hash: str = attr.field()
version: VersionInfo = attr.field(converter=_convert_version_from_string)
version: VersionInfo = attr.field(converter=lambda s: VersionInfo.from_version_string(s))
strategy: LiteralContainerVersionStrategy = attr.field()
@classmethod
def _release_ref(cls, version_str: str | None = None) -> RefTuple:
try:
from ghapi.all import GhApi
ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
meta = t.cast(t.Dict[str, t.Any], ghapi.repos.get_latest_release())
except Exception as err:
raise OpenLLMException('Failed to determine latest release version.') from err
_use_base_strategy = version_str is None
if version_str is None:
# NOTE: This strategy will only support openllm>0.2.12
version_str = meta['name'].lstrip('v')
version = (ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str)
else:
version = ('', version_str)
return _RefTuple((*version, 'release' if _use_base_strategy else 'custom'))
@classmethod
@functools.lru_cache(maxsize=64)
def from_strategy(cls, strategy_or_version: LiteralContainerVersionStrategy | None = None) -> RefResolver:
def from_strategy(cls, strategy_or_version=None):
# using default strategy
if strategy_or_version is None or strategy_or_version == 'release':
return cls(*cls._release_ref())
try:
from ghapi.all import GhApi
ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
meta = ghapi.repos.get_latest_release()
git_hash = ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha']
except Exception as err:
raise OpenLLMException('Failed to determine latest release version.') from err
return cls(git_hash=git_hash, version=meta['name'].lstrip('v'), strategy='release')
elif strategy_or_version in ('latest', 'nightly'): # latest is nightly
return cls(git_hash='latest', version='0.0.0', strategy='latest')
else:
raise ValueError(f'Unknown strategy: {strategy_or_version}')
@property
def tag(self) -> str:
def tag(self):
return 'latest' if self.strategy in {'latest', 'nightly'} else repr(self.version)
@functools.lru_cache(maxsize=256)
def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str:
return RefResolver.from_strategy(strategy).tag
@staticmethod
def construct_base_image(reg, strategy=None):
return f'{_CONTAINER_REGISTRY[reg]}:{RefResolver.from_strategy(strategy).tag}'
def get_base_container_name(reg: LiteralContainerRegistry) -> str:
return _CONTAINER_REGISTRY[reg]
__all__ = ['CONTAINER_NAMES', 'RefResolver', 'supported_registries']
if t.TYPE_CHECKING:
CONTAINER_NAMES: dict[LiteralContainerRegistry, str]
supported_registries: list[str]
__all__ = [
'CONTAINER_NAMES',
'get_base_container_tag',
'get_base_container_name',
'supported_registries',
'RefResolver',
]
def __dir__() -> list[str]:
def __dir__():
return sorted(__all__)
def __getattr__(name: str) -> t.Any:
def __getattr__(name):
if name == 'supported_registries':
return functools.lru_cache(1)(lambda: list(_CONTAINER_REGISTRY))()
elif name == 'CONTAINER_NAMES':


@@ -1,5 +1,3 @@
"""Tests utilities for OpenLLM."""
from __future__ import annotations
import contextlib
import logging