chore(logger): fix warnings and streamline style (#717)

Sorry, but there is too much wasted spacing in `_llm.py`, and I'm unhappy and unproductive any time I look at it or want to do anything with it. --------- Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2026-05-18 21:54:11 -04:00 · 2023-11-21 18:54:51 -05:00
parent d53cf234bd
commit 77bd6f090a
6 changed files with 437 additions and 521 deletions
--- a/.ruff.toml
+++ b/.ruff.toml
@@ -1,77 +1,75 @@
 extend-exclude = [
-  "tools",
-  "examples",
-  "openllm-python/src/openllm/__init__.py",
-  "openllm-python/src/openllm/_version.py",
-  "openllm-python/src/openllm/models/__init__.py",
-  "openllm-python/src/openllm_cli/playground",
-  "openllm-client/src/openllm_client/pb/**",
+    "tools",
+    "examples",
+    "openllm-python/src/openllm/__init__.py",
+    "openllm-python/src/openllm/_version.py",
+    "openllm-python/src/openllm/models/__init__.py",
+    "openllm-python/src/openllm_cli/playground",
+    "openllm-client/src/openllm_client/pb/**",
 ]
 extend-include = ["*.ipynb"]
 extend-select = [
-  "E",
-  "F",
-  "B",
-  "PIE",
-  "I",    # isort
-  "G",    # flake8-logging-format
-  "W",    # pycodestyle
-  "Q",    # flake8-quotes
-  "FA",   # flake8-future-annotations
-  "TCH",  # flake8-type-checking
-  "PLW",  # pylint-warning
-  "PLR",  # pylint-refactor
-  "PT",   # flake8-pytest-style
-  "PERF", # perflint
-  "FLY",  # flynt
-  "RUF",  # Ruff-specific rules
-  "YTT",  # flake8-2020
+    "E",
+    "F",
+    "B",
+    "PIE",
+    "G",    # flake8-logging-format
+    "W",    # pycodestyle
+    "Q",    # flake8-quotes
+    "FA",   # flake8-future-annotations
+    "TCH",  # flake8-type-checking
+    "PLW",  # pylint-warning
+    "PLR",  # pylint-refactor
+    "PT",   # flake8-pytest-style
+    "PERF", # perflint
+    "RUF",  # Ruff-specific rules
+    "YTT",  # flake8-2020
 ]
 fix = true
 ignore = [
-  "PLR0911",
-  "PLR0912",
-  "PLR0913",
-  "PLR0915",
-  "PLR2004", # magic value to use constant
-  "E501",    # ignore line length violation
-  "E401",    # ignore multiple line import
-  "E702",
-  "TCH004",  # don't move runtime import out, just warn about it
-  "RUF012",  # mutable attributes to be used with ClassVar
-  "E701",    # multiple statement on single line
+    "PLR0911",
+    "PLR0912",
+    "PLR0913",
+    "PLR0915",
+    "PLR2004", # magic value to use constant
+    "E501",    # ignore line length violation
+    "E401",    # ignore multiple line import
+    "E702",
+    "TCH004",  # don't move runtime import out, just warn about it
+    "RUF012",  # mutable attributes to be used with ClassVar
+    "E701",    # multiple statement on single line
 ]
 line-length = 119
 indent-width = 2
 target-version = "py38"
 typing-modules = [
-  "openllm_core._typing_compat",
-  "openllm_client._typing_compat",
+    "openllm_core._typing_compat",
+    "openllm_client._typing_compat",
 ]
 unfixable = ["TCH004"]

 [lint.flake8-type-checking]
 exempt-modules = [
-  "typing",
-  "typing_extensions",
-  "openllm_core._typing_compat",
-  "openllm_client._typing_compat",
+    "typing",
+    "typing_extensions",
+    "openllm_core._typing_compat",
+    "openllm_client._typing_compat",
 ]
 runtime-evaluated-base-classes = [
-  "openllm_core._configuration.LLMConfig",
-  "openllm_core._configuration.GenerationConfig",
-  "openllm_core._configuration.SamplingParams",
-  "openllm_core._configuration.ModelSettings",
-  "openllm.LLMConfig",
+    "openllm_core._configuration.LLMConfig",
+    "openllm_core._configuration.GenerationConfig",
+    "openllm_core._configuration.SamplingParams",
+    "openllm_core._configuration.ModelSettings",
+    "openllm.LLMConfig",
 ]
 runtime-evaluated-decorators = [
-  "attrs.define",
-  "attrs.frozen",
-  "trait",
-  "attr.attrs",
-  'attr.define',
-  '_attr.define',
-  'attr.frozen',
+    "attrs.define",
+    "attrs.frozen",
+    "trait",
+    "attr.attrs",
+    'attr.define',
+    '_attr.define',
+    'attr.frozen',
 ]

 [format]
@@ -87,29 +85,6 @@ convention = "google"
 ignore-overlong-task-comments = true
 max-line-length = 119

-[lint.isort]
-combine-as-imports = true
-known-first-party = [
-  "openllm",
-  "bentoml",
-  "openllm_core",
-  "openllm_client",
-  "openllm_cli",
-]
-known-third-party = [
-  "transformers",
-  "click",
-  "huggingface_hub",
-  "torch",
-  "vllm",
-  "auto_gptq",
-  "peft",
-  "click_option_group",
-]
-split-on-trailing-comma = false
-no-lines-before = ["future", "standard-library"]
-relative-imports-order = "closest-to-furthest"
-
 [lint.flake8-quotes]
 avoid-escape = false
 inline-quotes = "single"
@@ -121,5 +96,4 @@ docstring-quotes = "double"
 "openllm-python/src/openllm/_llm.py" = ["F811"]
 "openllm-core/src/openllm_core/utils/import_utils.py" = ["PLW0603", "F811"]
 "openllm-core/src/openllm_core/_configuration.py" = ["F811", "Q001"]
-"openllm-python/src/openllm/__init__.pyi" = ["I001"]
 "openllm-python/src/openllm/_service_vars_pkg.py" = ["F821"]
--- a/openllm-core/src/openllm_core/utils/__init__.py
+++ b/openllm-core/src/openllm_core/utils/__init__.py
@@ -36,7 +36,7 @@ logger = logging.getLogger(__name__)
 _T=t.TypeVar('_T')
@functools.lru_cache(maxsize=1)
 def _WithArgsTypes()->tuple[type[t.Any],...]:
-  try:from typing import GenericAlias as _TypingGenericAlias  # type: ignore # noqa: I001
+  try:from typing import GenericAlias as _TypingGenericAlias  # type: ignore
  except ImportError:_TypingGenericAlias = ()  # type: ignore # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
  #  _GenericAlias is the actual GenericAlias implementation
  return (_TypingGenericAlias,) if sys.version_info<(3,10) else (t._GenericAlias, types.GenericAlias, types.UnionType) # type: ignore
@@ -69,7 +69,7 @@ def getenv(env:str,default:t.Any=None,var:t.Sequence[str]|None=None)->t.Any:
  if var is not None:env_key=set(var)|env_key
  def callback(k:str)->t.Any:
    _var = os.getenv(k)
-    if _var and k.startswith('OPENLLM_') and not get_disable_warnings() and not get_quiet_mode():logger.warning("Using '%s' environment is deprecated, use '%s' instead.",k.upper(),k[8:].upper())
+    if _var and k.startswith('OPENLLM_'):logger.warning("Using '%s' environment is deprecated, use '%s' instead.",k.upper(),k[8:].upper())
    return _var
  return first_not_none(*(callback(k) for k in env_key),default=default)
 def field_env_key(key:str,suffix:str|None=None)->str:return '_'.join(filter(None,map(str.upper,['OPENLLM',suffix.strip('_') if suffix else '',key])))
@@ -80,13 +80,13 @@ def get_quiet_mode()->bool:
  return False
 def get_disable_warnings()->bool:return check_bool_env(WARNING_ENV_VAR, False)
 def set_disable_warnings(disable:bool=True)->None:
-  if get_disable_warnings():os.environ[WARNING_ENV_VAR]=str(disable)
+  if disable:os.environ[WARNING_ENV_VAR]=str(disable)
 def set_debug_mode(enabled:bool,level:int=1)->None:
  if enabled:os.environ[DEV_DEBUG_VAR] = str(level)
  os.environ.update({DEBUG_ENV_VAR:str(enabled),QUIET_ENV_VAR:str(not enabled),_GRPC_DEBUG_ENV_VAR:'DEBUG' if enabled else 'ERROR','CT2_VERBOSE':'3'})
-  set_disable_warnings(enabled)
+  set_disable_warnings(not enabled)
 def set_quiet_mode(enabled:bool)->None:
-  os.environ.update({QUIET_ENV_VAR:str(enabled),_GRPC_DEBUG_ENV_VAR:'NONE','CT2_VERBOSE':'-1'})
+  os.environ.update({QUIET_ENV_VAR:str(enabled),DEBUG_ENV_VAR:str(not enabled),_GRPC_DEBUG_ENV_VAR:'NONE','CT2_VERBOSE':'-1'})
  set_disable_warnings(enabled)
 def gen_random_uuid(prefix:str|None=None)->str:return '-'.join([prefix or 'openllm', str(uuid.uuid4().hex)])
 # NOTE: `compose` any number of unary functions into a single unary function
@@ -113,11 +113,8 @@ def generate_context(framework_name:str):
  return ModelContext(framework_name=framework_name,framework_versions=framework_versions)
@functools.lru_cache(maxsize=1)
 def in_notebook()->bool:
-  try:from IPython.core.getipython import get_ipython; return 'IPKernelApp' in get_ipython().config  # noqa: I001
-  except (ImportError, AttributeError):return False
-# Used to filter out INFO log
-class InfoFilter(logging.Filter):
-  def filter(self,record:logging.LogRecord)->bool:return logging.INFO<=record.levelno<logging.WARNING
+  try:from IPython.core.getipython import get_ipython;return 'IPKernelApp' in get_ipython().config
+  except Exception:return False
 _TOKENIZER_PREFIX = '_tokenizer_'
 def flatten_attrs(**attrs:t.Any)->tuple[dict[str,t.Any],dict[str, t.Any]]:
  tokenizer_attrs = {k[len(_TOKENIZER_PREFIX):]:v for k,v in attrs.items() if k.startswith(_TOKENIZER_PREFIX)}
@@ -130,31 +127,31 @@ DEBUG=sys.flags.dev_mode or (not sys.flags.ignore_environment and check_bool_env
 SHOW_CODEGEN=DEBUG and (os.environ.get(DEV_DEBUG_VAR,str(0)).isdigit() and int(os.environ.get(DEV_DEBUG_VAR,str(0)))>3)
 # MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins
 MYPY=False
-# fmt: on
-
-
 class ExceptionFilter(logging.Filter):
  def __init__(self, exclude_exceptions: list[type[Exception]] | None = None, **kwargs: t.Any):
-    if exclude_exceptions is None:
-      exclude_exceptions = []
+    if exclude_exceptions is None:exclude_exceptions=[]
    try:
      from circus.exc import ConflictError
-
-      if ConflictError not in exclude_exceptions:
-        exclude_exceptions.append(ConflictError)
+      if ConflictError not in exclude_exceptions:exclude_exceptions.append(ConflictError)
    except ImportError:
      pass
    super(ExceptionFilter, self).__init__(**kwargs)
-    self.EXCLUDE_EXCEPTIONS = exclude_exceptions
-
-  def filter(self, record: logging.LogRecord) -> bool:
+    self.EXCLUDE_EXCEPTIONS=exclude_exceptions
+  def filter(self,record:logging.LogRecord)->bool:
    if record.exc_info:
-      etype, _, _ = record.exc_info
+      etype,_,_=record.exc_info
      if etype is not None:
        for exc in self.EXCLUDE_EXCEPTIONS:
-          if issubclass(etype, exc):
-            return False
+          if issubclass(etype, exc):return False
    return True
+# Used to filter out INFO log
+class InfoFilter(logging.Filter):
+  def filter(self,record:logging.LogRecord)->bool:return logging.INFO<=record.levelno<logging.WARNING
+class WarningFilter(logging.Filter): # FIXME: Why does this not work?
+  def filter(self,record:logging.LogRecord)->bool:
+    if get_disable_warnings():return record.levelno>=logging.ERROR
+    return True
+# fmt: on


 _LOGGING_CONFIG: dict[str, t.Any] = {
@@ -163,11 +160,12 @@ _LOGGING_CONFIG: dict[str, t.Any] = {
  'filters': {
    'excfilter': {'()': 'openllm_core.utils.ExceptionFilter'},
    'infofilter': {'()': 'openllm_core.utils.InfoFilter'},
+    'warningfilter': {'()': 'openllm_core.utils.WarningFilter'},
  },
  'handlers': {
    'bentomlhandler': {
      'class': 'logging.StreamHandler',
-      'filters': ['excfilter', 'infofilter'],
+      'filters': ['excfilter', 'warningfilter', 'infofilter'],
      'stream': 'ext://sys.stdout',
    },
    'defaulthandler': {'class': 'logging.StreamHandler', 'level': logging.WARNING},
@@ -195,6 +193,9 @@ def configure_logging() -> None:
    _LOGGING_CONFIG['loggers']['bentoml']['level'] = logging.INFO
    _LOGGING_CONFIG['root']['level'] = logging.INFO

+  if get_disable_warnings():  # HACK: This is a hack to disable warnings
+    _LOGGING_CONFIG['loggers']['openllm']['level'] = logging.ERROR
+
  logging.config.dictConfig(_LOGGING_CONFIG)


@@ -241,24 +242,3 @@ __lazy = LazyModule(
 __all__ = __lazy.__all__
 __dir__ = __lazy.__dir__
 __getattr__ = __lazy.__getattr__
-
-if t.TYPE_CHECKING:
-  from . import analytics as analytics, codegen as codegen, dantic as dantic, serde as serde
-  from .import_utils import (
-    OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES,
-    is_autoawq_available as is_autoawq_available,
-    is_autogptq_available as is_autogptq_available,
-    is_bentoml_available as is_bentoml_available,
-    is_bitsandbytes_available as is_bitsandbytes_available,
-    is_ctranslate_available as is_ctranslate_available,
-    is_grpc_available as is_grpc_available,
-    is_jupyter_available as is_jupyter_available,
-    is_jupytext_available as is_jupytext_available,
-    is_notebook_available as is_notebook_available,
-    is_peft_available as is_peft_available,
-    is_torch_available as is_torch_available,
-    is_transformers_available as is_transformers_available,
-    is_vllm_available as is_vllm_available,
-  )
-  from .representation import ReprMixin as ReprMixin
-  from .serde import converter as converter
--- a/openllm-python/src/openllm/__main__.py
+++ b/openllm-python/src/openllm/__main__.py
@@ -1,2 +1,2 @@
 # fmt: off
-if __name__ == '__main__':from openllm_cli.entrypoint import cli;cli()  # noqa
+if __name__ == '__main__':from openllm_cli.entrypoint import cli;cli()
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -1,15 +1,8 @@
 from __future__ import annotations
-import functools
-import logging
-import os
+import functools, logging, os, warnings
 import typing as t
-
-import attr
-import inflection
-import orjson
-
-import bentoml
-import openllm
+import attr, inflection, orjson
+import bentoml, openllm
 from openllm_core._schemas import GenerationOutput
 from openllm_core._typing_compat import (
  AdapterMap,
@@ -35,8 +28,6 @@ from openllm_core.utils import (
  flatten_attrs,
  gen_random_uuid,
  generate_hash_from_file,
-  get_disable_warnings,
-  get_quiet_mode,
  getenv,
  is_ctranslate_available,
  is_peft_available,
@@ -49,365 +40,18 @@ from .exceptions import ForbiddenAttributeError, OpenLLMException
 from .serialisation.constants import PEFT_CONFIG_NAME

 if t.TYPE_CHECKING:
-  import torch
-  import transformers
+  import torch, transformers
  from peft.config import PeftConfig
-
  from openllm_core._configuration import LLMConfig
-
  from ._runners import Runner

-ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]]
-
 logger = logging.getLogger(__name__)
-
-
-def normalise_model_name(name: str) -> str:
-  if validate_is_path(name):
-    return os.path.basename(resolve_filepath(name))
-  name = name.replace('/', '--')
-  return inflection.dasherize(name)
-
-
-def _resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
-  if not is_peft_available():
-    raise RuntimeError("Requires 'peft' to be installed. Do 'pip install \"openllm[fine-tune]\"'")
-  from huggingface_hub import hf_hub_download
-
-  resolved: AdapterMap = {}
-  for path_or_adapter_id, name in adapter_map.items():
-    if name is None:
-      raise ValueError('Adapter name must be specified.')
-    if os.path.isfile(os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)):
-      config_file = os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)
-    else:
-      try:
-        config_file = hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)
-      except Exception as err:
-        raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err
-    with open(config_file, 'r') as file:
-      resolved_config = orjson.loads(file.read())
-    # all peft_type should be available in PEFT_CONFIG_NAME
-    _peft_type = resolved_config['peft_type'].lower()
-    if _peft_type not in resolved:
-      resolved[_peft_type] = ()
-    resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
-  return resolved
-
-
-_reserved_namespace = {'model', 'tokenizer', 'runner', 'import_kwargs'}
 _AdapterTuple: type[AdapterTuple] = codegen.make_attr_tuple_class('AdapterTuple', ['adapter_id', 'name', 'config'])
-
-
-@functools.lru_cache(maxsize=1)
-def _torch_dtype_mapping():
-  import torch
-
-  return {
-    'half': torch.float16,
-    'float': torch.float32,
-    'float16': torch.float16,
-    'float32': torch.float32,
-    'bfloat16': torch.bfloat16,
-  }
+ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]]


@attr.define(slots=True, repr=False, init=False)
 class LLM(t.Generic[M, T], ReprMixin):
-  _model_id: str
-  _revision: str | None
-  _quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None
-  _quantise: LiteralQuantise | None
-  _model_decls: TupleAny
-  __model_attrs: DictStrAny
-  __tokenizer_attrs: DictStrAny
-  _tag: bentoml.Tag
-  _adapter_map: AdapterMap | None
-  _serialisation: LiteralSerialisation
-  _local: bool
-  _max_model_len: int | None
-
-  __llm_dtype__: LiteralDtype | t.Literal['auto', 'half', 'float'] = 'auto'
-  __llm_torch_dtype__: 'torch.dtype' = None
-  __llm_config__: LLMConfig | None = None
-  __llm_backend__: LiteralBackend = None  # type: ignore
-  __llm_quantization_config__: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None = None
-  __llm_runner__: t.Optional[Runner[M, T]] = None
-  __llm_model__: t.Optional[M] = None
-  __llm_tokenizer__: t.Optional[T] = None
-  __llm_adapter_map__: t.Optional[ResolvedAdapterMap] = None
-  __llm_trust_remote_code__: bool = False
-
-  def __init__(
-    self,
-    model_id,
-    model_version=None,
-    model_tag=None,
-    llm_config=None,
-    backend=None,
-    *args,
-    quantize=None,
-    quantization_config=None,
-    adapter_map=None,
-    serialisation='safetensors',
-    trust_remote_code=False,
-    embedded=False,
-    dtype='auto',
-    low_cpu_mem_usage=True,
-    max_model_len=None,
-    _eager=True,
-    **attrs,
-  ):
-    # fmt: off
-    torch_dtype = attrs.pop('torch_dtype',None)  # backward compatible
-    if torch_dtype is not None:logger.warning('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.');dtype=torch_dtype
-    _local = False
-    if validate_is_path(model_id):model_id,_local=resolve_filepath(model_id),True
-    backend=first_not_none(getenv('backend',default=backend),default=self._cascade_backend())
-    dtype=first_not_none(getenv('dtype',default=dtype,var=['TORCH_DTYPE']),default='auto')
-    quantize=first_not_none(getenv('quantize',default=quantize,var=['QUANITSE']),default=None)
-    attrs.update({'low_cpu_mem_usage':low_cpu_mem_usage})
-    # parsing tokenizer and model kwargs, as the hierarchy is param pass > default
-    model_attrs, tokenizer_attrs = flatten_attrs(**attrs)
-    if model_tag is None:
-      model_tag,model_version=self._make_tag_components(model_id,model_version,backend=backend)
-      if model_version:model_tag=f'{model_tag}:{model_version}'
-    # fmt: on
-
-    self.__attrs_init__(
-      model_id=model_id,
-      revision=model_version,
-      tag=bentoml.Tag.from_taglike(model_tag),
-      quantization_config=quantization_config,
-      quantise=self._resolve_quantise(quantize, backend),
-      model_decls=args,
-      adapter_map=_resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
-      serialisation=serialisation,
-      local=_local,
-      max_model_len=max_model_len,
-      LLM__model_attrs=model_attrs,
-      LLM__tokenizer_attrs=tokenizer_attrs,
-      llm_dtype__=dtype.lower(),
-      llm_backend__=backend,
-      llm_config__=llm_config,
-      llm_trust_remote_code__=trust_remote_code,
-    )
-
-    if _eager:
-      try:
-        model = bentoml.models.get(self.tag)
-      except bentoml.exceptions.NotFound:
-        model = openllm.serialisation.import_model(self, trust_remote_code=self.trust_remote_code)
-      # resolve the tag
-      self._tag = model.tag
-    if not _eager and embedded:
-      raise RuntimeError("Embedded mode is not supported when '_eager' is False.")
-    if embedded and not get_disable_warnings() and not get_quiet_mode():
-      logger.warning(
-        'You are using embedded mode, which means the models will be loaded into memory. This is often not recommended in production and should only be used for local development only.'
-      )
-      self.runner.init_local(quiet=True)
-
-  # fmt: off
-  def _resolve_quantise(self, quantise, backend):
-    if backend in ('pt', 'vllm'):return quantise
-    if backend=='ctranslate':return self._resolve_ctranslate_quantise(quantise)
-    raise NotImplementedError(f"Quantisation is not supported for backend '{backend}'")
-  def _resolve_ctranslate_quantise(self,quantise):
-    if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}:raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
-    if quantise == 'int8':quantise='int8_float16' if self._has_gpus else 'int8_float32'
-    return quantise
-  @apply(lambda val:tuple(str.lower(i) if i else i for i in val))
-  def _make_tag_components(self,model_id:str,model_version:str|None,backend:str)->tuple[str,str|None]:
-    model_id,*maybe_revision=model_id.rsplit(':')
-    if len(maybe_revision)>0:
-      if model_version is not None:logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.",maybe_revision[0],model_version)
-      model_version = maybe_revision[0]
-    if validate_is_path(model_id):model_id,model_version=resolve_filepath(model_id),first_not_none(model_version,default=generate_hash_from_file(model_id))
-    return f'{backend}-{normalise_model_name(model_id)}',model_version
-  @functools.cached_property
-  def _has_gpus(self):
-    try:
-      from cuda import cuda
-      err,*_=cuda.cuInit(0)
-      if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to initialise CUDA runtime binding.')
-      err,num_gpus=cuda.cuDeviceGetCount()
-      if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to get CUDA device count.')
-      return True
-    except (ImportError, RuntimeError):return False
-  @property
-  def _torch_dtype(self):
-    import torch, transformers  # noqa: I001
-    _map=_torch_dtype_mapping()
-    if not isinstance(self.__llm_torch_dtype__,torch.dtype):
-      try:hf_config=transformers.AutoConfig.from_pretrained(self.bentomodel.path,trust_remote_code=self.trust_remote_code)
-      except OpenLLMException:hf_config=transformers.AutoConfig.from_pretrained(self.model_id,trust_remote_code=self.trust_remote_code)
-      config_dtype=getattr(hf_config,'torch_dtype',None)
-      if config_dtype is None:config_dtype=torch.float32
-      if self.__llm_dtype__=='auto':
-        if config_dtype==torch.float32:torch_dtype=torch.float16
-        else:torch_dtype=config_dtype
-      else:
-        if self.__llm_dtype__ not in _map:raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'")
-        torch_dtype=_map[self.__llm_dtype__]
-      self.__llm_torch_dtype__=torch_dtype
-    return self.__llm_torch_dtype__
-  @property
-  def _model_attrs(self):return {**self.import_kwargs[0],**self.__model_attrs}
-  @_model_attrs.setter
-  def _model_attrs(self, value):self.__model_attrs = value
-  @property
-  def _tokenizer_attrs(self):return {**self.import_kwargs[1],**self.__tokenizer_attrs}
-  def _cascade_backend(self)->LiteralBackend:
-    if self._has_gpus:
-      if is_vllm_available():return 'vllm'
-      elif is_ctranslate_available():return 'ctranslate'  # XXX: base OpenLLM image should always include vLLM
-    elif is_ctranslate_available():return 'ctranslate'
-    else:return 'pt'
-  def __setattr__(self,attr,value):
-    if attr in _reserved_namespace:raise ForbiddenAttributeError(f'{attr} should not be set during runtime.')
-    super().__setattr__(attr, value)
-  def __del__(self):del self.__llm_model__,self.__llm_tokenizer__,self.__llm_adapter_map__
-  @property
-  def __repr_keys__(self):return {'model_id','revision','backend','type'}
-  def __repr_args__(self):
-    yield 'model_id',self._model_id if not self._local else self.tag.name
-    yield 'revision',self._revision if self._revision else self.tag.version
-    yield 'backend',self.__llm_backend__
-    yield 'type',self.llm_type
-  @property
-  def import_kwargs(self):return {'device_map':'auto' if self._has_gpus else None,'torch_dtype':self._torch_dtype},{'padding_side':'left','truncation_side':'left'}
-  @property
-  def trust_remote_code(self):
-    env=os.getenv('TRUST_REMOTE_CODE')
-    if env is not None:return str(env).upper() in ENV_VARS_TRUE_VALUES
-    return self.__llm_trust_remote_code__
-  @property
-  def model_id(self):return self._model_id
-  @property
-  def revision(self):return self._revision
-  @property
-  def tag(self):return self._tag
-  @property
-  def bentomodel(self):return openllm.serialisation.get(self)
-  @property
-  def quantization_config(self):
-    if self.__llm_quantization_config__ is None:
-      from ._quantisation import infer_quantisation_config
-      if self._quantization_config is not None:self.__llm_quantization_config__ = self._quantization_config
-      elif self._quantise is not None:self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(self,self._quantise,**self._model_attrs)
-      else:raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
-    return self.__llm_quantization_config__
-  @property
-  def has_adapters(self):return self._adapter_map is not None
-  @property
-  def local(self):return self._local
-  @property
-  def quantise(self):return self._quantise
-  @property
-  def llm_type(self):return normalise_model_name(self._model_id)
-  @property
-  def llm_parameters(self):return (self._model_decls,self._model_attrs),self._tokenizer_attrs
-  @property
-  def identifying_params(self):return {'configuration':self.config.model_dump_json().decode(),'model_ids':orjson.dumps(self.config['model_ids']).decode(),'model_id':self.model_id}
-  @property
-  def tokenizer(self):
-    if self.__llm_tokenizer__ is None:self.__llm_tokenizer__=openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1])
-    return self.__llm_tokenizer__
-  @property
-  def runner(self):
-    from ._runners import runner
-    if self.__llm_runner__ is None:self.__llm_runner__=runner(self)
-    return self.__llm_runner__
-  def prepare(self,adapter_type='lora',use_gradient_checking=True,**attrs):
-    if self.__llm_backend__!='pt':raise RuntimeError('Fine tuning is only supported for PyTorch backend.')
-    from peft.mapping import get_peft_model
-    from peft.utils.other import prepare_model_for_kbit_training
-    model=get_peft_model(
-      prepare_model_for_kbit_training(self.model,use_gradient_checkpointing=use_gradient_checking),
-      self.config['fine_tune_strategies']
-      .get(adapter_type,self.config.make_fine_tune_config(adapter_type))
-      .train()
-      .with_config(**attrs)
-      .build(),
-    )
-    if DEBUG:model.print_trainable_parameters()
-    return model,self.tokenizer
-  def prepare_for_training(self,*args,**attrs):logger.warning('`prepare_for_training` is deprecated and will be removed in the future. Please use `prepare` instead.');return self.prepare(*args,**attrs)
-  # fmt: on
-
-  @property
-  def adapter_map(self):
-    if not is_peft_available():
-      raise MissingDependencyError("Failed to import 'peft'. Make sure to do 'pip install \"openllm[fine-tune]\"'")
-    if not self.has_adapters:
-      raise AttributeError('Adapter map is not available.')
-    assert self._adapter_map is not None
-    if self.__llm_adapter_map__ is None:
-      _map: ResolvedAdapterMap = {k: {} for k in self._adapter_map}
-      for adapter_type, adapter_tuple in self._adapter_map.items():
-        base = first_not_none(
-          self.config['fine_tune_strategies'].get(adapter_type),
-          default=self.config.make_fine_tune_config(adapter_type),
-        )
-        for adapter in adapter_tuple:
-          _map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id)
-      self.__llm_adapter_map__ = _map
-    return self.__llm_adapter_map__
-
-  @property
-  def model(self):
-    if self.__llm_model__ is None:
-      model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
-      # If OOM, then it is probably you don't have enough VRAM to run this model.
-      if self.__llm_backend__ == 'pt':
-        import torch
-
-        loaded_in_kbit = (
-          getattr(model, 'is_loaded_in_8bit', False)
-          or getattr(model, 'is_loaded_in_4bit', False)
-          or getattr(model, 'is_quantized', False)
-        )
-        if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
-          try:
-            model = model.to('cuda')
-          except Exception as err:
-            raise OpenLLMException(f'Failed to load model into GPU: {err}.\n') from err
-        if self.has_adapters:
-          logger.debug('Applying the following adapters: %s', self.adapter_map)
-          for adapter_dict in self.adapter_map.values():
-            for adapter_name, (peft_config, peft_model_id) in adapter_dict.items():
-              model.load_adapter(peft_model_id, adapter_name, peft_config=peft_config)
-      self.__llm_model__ = model
-    return self.__llm_model__
-
-  @property
-  def config(self):
-    import transformers
-
-    if self.__llm_config__ is None:
-      if self.__llm_backend__ == 'ctranslate':
-        try:
-          config = transformers.AutoConfig.from_pretrained(
-            self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code
-          )
-        except OpenLLMException:
-          config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
-        for architecture in config.architectures:
-          if architecture in openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE():
-            config = openllm.AutoConfig.infer_class_from_name(
-              openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture]
-            ).model_construct_env(**self._model_attrs)
-            break
-          else:
-            raise OpenLLMException(
-              f"Failed to infer the configuration class from the given model. Make sure the model is a supported model. Supported models are: {', '.join(openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE.keys())}"
-            )
-      else:
-        config = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs)
-      self.__llm_config__ = config
-    return self.__llm_config__
-
  async def generate(
    self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
  ) -> GenerationOutput:
@@ -495,3 +139,325 @@ class LLM(t.Generic[M, T], ReprMixin):
        yield generated.with_options(outputs=delta_outputs)
    except Exception as err:
      raise RuntimeError(f'Exception caught during generation: {err}') from err
+
+  # NOTE: If you are here to see how generate_iterator and generate work, see above.
+  # What follows is mainly internal implementation detail that you don't have to worry about.
+  # fmt: off
+
+  _model_id:str
+  _revision:t.Optional[str]
+  _quantization_config:t.Optional[t.Union[transformers.BitsAndBytesConfig,transformers.GPTQConfig,transformers.AwqConfig]]
+  _quantise: t.Optional[LiteralQuantise]
+  _model_decls:TupleAny
+  __model_attrs:DictStrAny
+  __tokenizer_attrs:DictStrAny
+  _tag:bentoml.Tag
+  _adapter_map:t.Optional[AdapterMap]
+  _serialisation:LiteralSerialisation
+  _local:bool
+  _max_model_len:t.Optional[int]
+
+  __llm_dtype__: t.Union[LiteralDtype,t.Literal['auto', 'half', 'float']]='auto'
+  __llm_torch_dtype__:'torch.dtype'=None
+  __llm_config__:t.Optional[LLMConfig]=None
+  __llm_backend__:LiteralBackend=None
+  __llm_quantization_config__:t.Optional[t.Union[transformers.BitsAndBytesConfig,transformers.GPTQConfig,transformers.AwqConfig]]=None
+  __llm_runner__:t.Optional[Runner[M, T]]=None
+  __llm_model__:t.Optional[M]=None
+  __llm_tokenizer__:t.Optional[T]=None
+  __llm_adapter_map__:t.Optional[ResolvedAdapterMap]=None
+  __llm_trust_remote_code__:bool=False
+
+  def __init__(
+    self,
+    model_id,
+    model_version=None,
+    model_tag=None,
+    llm_config=None,
+    backend=None,
+    *args,
+    quantize=None,
+    quantization_config=None,
+    adapter_map=None,
+    serialisation='safetensors',
+    trust_remote_code=False,
+    embedded=False,
+    dtype='auto',
+    low_cpu_mem_usage=True,
+    max_model_len=None,
+    _eager=True,
+    **attrs,
+  ):
+    """Resolve backend, dtype and quantisation from the arguments and ``getenv``, split ``attrs`` into model/tokenizer kwargs, initialise the attrs fields and, when ``_eager``, get the model from BentoML or import it if not found."""
+    torch_dtype=attrs.pop('torch_dtype',None)  # backward compatible
+    if torch_dtype is not None:warnings.warn('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',DeprecationWarning,stacklevel=3);dtype=torch_dtype  # NOTE: must be `warnings.warn`; the former `warnings.warns` raised AttributeError
+    _local = False
+    if validate_is_path(model_id):model_id,_local=resolve_filepath(model_id),True
+    backend=first_not_none(getenv('backend',default=backend),default=self._cascade_backend())
+    dtype=first_not_none(getenv('dtype',default=dtype,var=['TORCH_DTYPE']),default='auto')
+    quantize=first_not_none(getenv('quantize',default=quantize,var=['QUANITSE']),default=None)  # NOTE(review): 'QUANITSE' looks like a misspelling of 'QUANTISE' -- confirm before renaming the variable
+    attrs.update({'low_cpu_mem_usage':low_cpu_mem_usage})
+    # parsing tokenizer and model kwargs, as the hierarchy is param pass > default
+    model_attrs,tokenizer_attrs=flatten_attrs(**attrs)
+    if model_tag is None:
+      model_tag,model_version=self._make_tag_components(model_id,model_version,backend=backend)
+      if model_version:model_tag=f'{model_tag}:{model_version}'
+    self.__attrs_init__(
+      model_id=model_id,
+      revision=model_version,
+      tag=bentoml.Tag.from_taglike(model_tag),
+      quantization_config=quantization_config,
+      quantise=getattr(self._Quantise,backend)(self,quantize),
+      model_decls=args,
+      adapter_map=convert_peft_config_type(adapter_map) if adapter_map is not None else None,
+      serialisation=serialisation,
+      local=_local,
+      max_model_len=max_model_len,
+      LLM__model_attrs=model_attrs,
+      LLM__tokenizer_attrs=tokenizer_attrs,
+      llm_dtype__=dtype.lower(),
+      llm_backend__=backend,
+      llm_config__=llm_config,
+      llm_trust_remote_code__=trust_remote_code,
+    )
+
+    if _eager:
+      try:
+        model=bentoml.models.get(self.tag)
+      except bentoml.exceptions.NotFound:
+        model=openllm.serialisation.import_model(self,trust_remote_code=self.trust_remote_code)
+      # resolve the tag
+      self._tag=model.tag
+    if not _eager and embedded:raise RuntimeError("Embedded mode is not supported when '_eager' is False.")
+    if embedded:logger.warning('Models will be loaded into memory. NOT RECOMMENDED in production and SHOULD ONLY used for development.');self.runner.init_local(quiet=True)
+  class _Quantise:  # per-backend handlers for the 'quantize' argument; __init__ selects one with getattr(self._Quantise, backend)
+    @staticmethod
+    def pt(llm:LLM,quantise=None):return quantise  # returned unchanged
+    @staticmethod
+    def vllm(llm:LLM,quantise=None):return quantise  # returned unchanged
+    @staticmethod
+    def ctranslate(llm:LLM,quantise=None):
+      if quantise in {'int4','awq','gptq','squeezellm'}:raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
+      if quantise=='int8':quantise='int8_float16' if llm._has_gpus else 'int8_float32'  # plain 'int8' is expanded depending on whether GPUs were detected
+      return quantise
+  @apply(lambda val:tuple(str.lower(i) if i else i for i in val))  # NOTE(review): presumably post-processes the return value, lower-casing each truthy item -- confirm against `apply`
+  def _make_tag_components(self,model_id:str,model_version:str|None,backend:str)->tuple[str,str|None]:  # returns (tag name, version); __init__ joins them into the tag
+    model_id,*maybe_revision=model_id.rsplit(':')  # rsplit without maxsplit splits on every ':'; anything after the first one lands in maybe_revision
+    if len(maybe_revision)>0:
+      if model_version is not None:logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.",maybe_revision[0],model_version)
+      model_version = maybe_revision[0]  # a revision embedded in model_id overrides the model_version argument
+    if validate_is_path(model_id):model_id,model_version=resolve_filepath(model_id),first_not_none(model_version,default=generate_hash_from_file(model_id))  # the right-hand side is evaluated first, so generate_hash_from_file receives the unresolved path
+    return f'{backend}-{normalise_model_name(model_id)}',model_version
+  @functools.cached_property
+  def _has_gpus(self):  # True when the CUDA driver initialises and reports at least one device; cached on the instance
+    try:
+      from cuda import cuda
+      err,*_=cuda.cuInit(0)
+      if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to initialise CUDA runtime binding.')
+      err,num_gpus=cuda.cuDeviceGetCount()
+      if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to get CUDA device count.')
+      return num_gpus>0  # the count used to be ignored, so a successful call reporting zero devices still returned True
+    except (ImportError, RuntimeError):return False  # missing `cuda` bindings or a driver error both mean "no GPU"
+  @property
+  def _torch_dtype(self):  # torch.dtype derived from __llm_dtype__ and the HF config; stored in __llm_torch_dtype__ after the first computation
+    import torch, transformers
+    _map=_torch_dtype_mapping()
+    if not isinstance(self.__llm_torch_dtype__,torch.dtype):  # not resolved yet (the class-level default is None)
+      try:hf_config=transformers.AutoConfig.from_pretrained(self.bentomodel.path,trust_remote_code=self.trust_remote_code)  # first try the config stored with the bentomodel
+      except OpenLLMException:hf_config=transformers.AutoConfig.from_pretrained(self.model_id,trust_remote_code=self.trust_remote_code)  # otherwise load it by model id
+      config_dtype=getattr(hf_config,'torch_dtype',None)
+      if config_dtype is None:config_dtype=torch.float32  # a config without torch_dtype is treated as float32
+      if self.__llm_dtype__=='auto':
+        if config_dtype==torch.float32:torch_dtype=torch.float16  # 'auto' maps a float32 config to float16
+        else:torch_dtype=config_dtype  # any other config dtype is kept as is
+      else:
+        if self.__llm_dtype__ not in _map:raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'")
+        torch_dtype=_map[self.__llm_dtype__]  # explicit dtype string -> torch.dtype
+      self.__llm_torch_dtype__=torch_dtype
+    return self.__llm_torch_dtype__
+  @property
+  def _model_attrs(self):return {**self.import_kwargs[0],**self.__model_attrs}
+  @_model_attrs.setter
+  def _model_attrs(self, value):self.__model_attrs = value
+  @property
+  def _tokenizer_attrs(self):return {**self.import_kwargs[1],**self.__tokenizer_attrs}
+  def _cascade_backend(self)->LiteralBackend:
+    # Default backend when none is given: vLLM (only with GPUs), then CTranslate2, then PyTorch.
+    if self._has_gpus and is_vllm_available():return 'vllm'
+    if is_ctranslate_available():return 'ctranslate'  # XXX: base OpenLLM image should always include vLLM
+    # A GPU host with neither vLLM nor CTranslate2 used to fall through and return None; it now gets 'pt' as well.
+    return 'pt'
+  def __setattr__(self,attr,value):
+    if attr in {'model', 'tokenizer', 'runner', 'import_kwargs'}:raise ForbiddenAttributeError(f'{attr} should not be set during runtime.')
+    super().__setattr__(attr, value)
+  def __del__(self):del self.__llm_model__,self.__llm_tokenizer__,self.__llm_adapter_map__
+  @property
+  def __repr_keys__(self):return {'model_id','revision','backend','type'}
+  def __repr_args__(self):
+    yield 'model_id',self._model_id if not self._local else self.tag.name
+    yield 'revision',self._revision if self._revision else self.tag.version
+    yield 'backend',self.__llm_backend__
+    yield 'type',self.llm_type
+  @property
+  def import_kwargs(self):return {'device_map':'auto' if self._has_gpus else None,'torch_dtype':self._torch_dtype},{'padding_side':'left','truncation_side':'left'}
+  @property
+  def trust_remote_code(self):
+    env=os.getenv('TRUST_REMOTE_CODE')
+    if env is not None:return str(env).upper() in ENV_VARS_TRUE_VALUES
+    return self.__llm_trust_remote_code__
+  @property
+  def model_id(self):return self._model_id
+  @property
+  def revision(self):return self._revision
+  @property
+  def tag(self):return self._tag
+  @property
+  def bentomodel(self):return openllm.serialisation.get(self)
+  @property
+  def quantization_config(self):
+    if self.__llm_quantization_config__ is None:
+      from ._quantisation import infer_quantisation_config
+      if self._quantization_config is not None:self.__llm_quantization_config__ = self._quantization_config
+      elif self._quantise is not None:self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(self,self._quantise,**self._model_attrs)
+      else:raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
+    return self.__llm_quantization_config__
+  @property
+  def has_adapters(self):return self._adapter_map is not None
+  @property
+  def local(self):return self._local
+  @property
+  def quantise(self):return self._quantise
+  @property
+  def llm_type(self):return normalise_model_name(self._model_id)
+  @property
+  def llm_parameters(self):return (self._model_decls,self._model_attrs),self._tokenizer_attrs
+  @property
+  def identifying_params(self):return {'configuration':self.config.model_dump_json().decode(),'model_ids':orjson.dumps(self.config['model_ids']).decode(),'model_id':self.model_id}
+  @property
+  def tokenizer(self):
+    if self.__llm_tokenizer__ is None:self.__llm_tokenizer__=openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1])
+    return self.__llm_tokenizer__
+  @property
+  def runner(self):
+    from ._runners import runner
+    if self.__llm_runner__ is None:self.__llm_runner__=runner(self)
+    return self.__llm_runner__
+  def prepare(self,adapter_type='lora',use_gradient_checking=True,**attrs):
+    if self.__llm_backend__!='pt':raise RuntimeError('Fine tuning is only supported for PyTorch backend.')
+    from peft.mapping import get_peft_model
+    from peft.utils.other import prepare_model_for_kbit_training
+    model=get_peft_model(
+      prepare_model_for_kbit_training(self.model,use_gradient_checkpointing=use_gradient_checking),
+      self.config['fine_tune_strategies']
+      .get(adapter_type,self.config.make_fine_tune_config(adapter_type))
+      .train()
+      .with_config(**attrs)
+      .build(),
+    )
+    if DEBUG:model.print_trainable_parameters()
+    return model,self.tokenizer
+  def prepare_for_training(self,*args,**attrs):logger.warning('`prepare_for_training` is deprecated and will be removed in the future. Please use `prepare` instead.');return self.prepare(*args,**attrs)
+
+  @property
+  def adapter_map(self):  # resolved adapters as {adapter_type: {adapter name: (built config, adapter_id)}}; built once and kept in __llm_adapter_map__
+    if not is_peft_available():
+      raise MissingDependencyError("Failed to import 'peft'. Make sure to do 'pip install \"openllm[fine-tune]\"'")
+    if not self.has_adapters:
+      raise AttributeError('Adapter map is not available.')
+    assert self._adapter_map is not None  # already implied by has_adapters; repeated for type checkers
+    if self.__llm_adapter_map__ is None:
+      _map: ResolvedAdapterMap = {k: {} for k in self._adapter_map}
+      for adapter_type, adapter_tuple in self._adapter_map.items():
+        base = first_not_none(  # strategy configured for this adapter type, else one made by make_fine_tune_config
+          self.config['fine_tune_strategies'].get(adapter_type),
+          default=self.config.make_fine_tune_config(adapter_type),
+        )
+        for adapter in adapter_tuple:
+          _map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id)
+      self.__llm_adapter_map__ = _map
+    return self.__llm_adapter_map__
+
+  @property
+  def model(self):  # loads the model on first access and keeps it in __llm_model__
+    if self.__llm_model__ is None:
+      model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
+      # An OOM here most likely means there is not enough VRAM to run this model.
+      if self.__llm_backend__ == 'pt':
+        import torch
+
+        loaded_in_kbit = (  # any of these flags set on the model means it is not moved with .to('cuda') below
+          getattr(model, 'is_loaded_in_8bit', False)
+          or getattr(model, 'is_loaded_in_4bit', False)
+          or getattr(model, 'is_quantized', False)
+        )
+        if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:  # only with exactly one CUDA device
+          try:
+            model = model.to('cuda')
+          except Exception as err:
+            raise OpenLLMException(f'Failed to load model into GPU: {err}.\n') from err
+        if self.has_adapters:
+          logger.debug('Applying the following adapters: %s', self.adapter_map)
+          for adapter_dict in self.adapter_map.values():
+            for adapter_name, (peft_config, peft_model_id) in adapter_dict.items():
+              model.load_adapter(peft_model_id, adapter_name, peft_config=peft_config)
+      self.__llm_model__ = model
+    return self.__llm_model__
+
+  @property
+  def config(self):
+    """Return the configuration for this LLM, building it on first access and caching it in ``__llm_config__``; raises OpenLLMException when no architecture of a 'ctranslate' model is known."""
+    import transformers
+    if self.__llm_config__ is None:
+      if self.__llm_backend__ == 'ctranslate':
+        try:
+          config = transformers.AutoConfig.from_pretrained(
+            self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code
+          )
+        except OpenLLMException:
+          config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
+        for architecture in config.architectures or ():  # `architectures` may be None on a HF config
+          if architecture in openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE():
+            config = openllm.AutoConfig.infer_class_from_name(
+              openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture]
+            ).model_construct_env(**self._model_attrs)
+            break
+        else:  # for/else: reached only when no architecture matched (it used to hang off the `if` and fire on the first miss)
+          raise OpenLLMException(
+            f"Failed to infer the configuration class from the given model. Make sure the model is a supported model. Supported models are: {', '.join(openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE().keys())}"
+          )
+      else:
+        config = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs)
+      self.__llm_config__ = config
+    return self.__llm_config__
+
+
+# fmt: off
+@functools.lru_cache(maxsize=1)
+def _torch_dtype_mapping()->dict[str,torch.dtype]:
+  import torch; return {
+    'half': torch.float16,
+    'float': torch.float32,
+    'float16': torch.float16,
+    'float32': torch.float32,
+    'bfloat16': torch.bfloat16,
+  }
+def normalise_model_name(name:str)->str:return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else inflection.dasherize(name.replace('/','--'))
+def convert_peft_config_type(adapter_map:dict[str, str])->AdapterMap:  # maps {path or hub id: adapter name} to {peft type: tuple of _AdapterTuple}
+  if not is_peft_available():raise RuntimeError("LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'")
+  from huggingface_hub import hf_hub_download
+
+  resolved:AdapterMap={}
+  for path_or_adapter_id, name in adapter_map.items():
+    if name is None:raise ValueError('Adapter name must be specified.')
+    if os.path.isfile(os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)):  # a local directory containing the PEFT config file
+      config_file=os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)
+    else:
+      try:
+        config_file=hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)  # otherwise treat the key as a hub repo id and download the config
+      except Exception as err:
+        raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err
+    with open(config_file, 'r') as file:resolved_config=orjson.loads(file.read())
+    _peft_type=resolved_config['peft_type'].lower()  # adapters are grouped by the lower-cased 'peft_type' from their config
+    if _peft_type not in resolved:resolved[_peft_type]=()
+    resolved[_peft_type]+=(_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
+  return resolved
--- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py
+++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py
@@ -6,7 +6,6 @@ import typing as t
 import transformers

 from openllm.serialisation.constants import HUB_ATTRS
-from openllm_core.utils import get_disable_warnings, get_quiet_mode

 logger = logging.getLogger(__name__)

@@ -44,10 +43,9 @@ def infer_autoclass_from_llm(llm, config, /):
    # in case this model doesn't use the correct auto class for model type, for example like chatglm
    # where it uses AutoModel instead of AutoModelForCausalLM. Then we fallback to AutoModel
    if autoclass not in config.auto_map:
-      if not get_disable_warnings() and not get_quiet_mode():
-        logger.warning(
-          "OpenLLM failed to determine compatible Auto classes to load %s. Falling back to 'AutoModel'.\nTip: Make sure to specify 'AutoModelForCausalLM' or 'AutoModelForSeq2SeqLM' in your 'config.auto_map'. If your model type is yet to be supported, please file an issues on our GitHub tracker.",
-          llm._model_id,
-        )
+      logger.warning(
+        "OpenLLM failed to determine compatible Auto classes to load %s. Falling back to 'AutoModel'.\nTip: Make sure to specify 'AutoModelForCausalLM' or 'AutoModelForSeq2SeqLM' in your 'config.auto_map'. If your model type is yet to be supported, please file an issues on our GitHub tracker.",
+        llm._model_id,
+      )
      autoclass = 'AutoModel'
  return getattr(transformers, autoclass)
--- a/openllm-python/src/openllm_cli/entrypoint.py
+++ b/openllm-python/src/openllm_cli/entrypoint.py
@@ -57,7 +57,6 @@ from openllm_core.utils import (
  first_not_none,
  gen_random_uuid,
  get_debug_mode,
-  get_disable_warnings,
  get_quiet_mode,
  is_torch_available,
  pkg,
@@ -94,7 +93,7 @@ else:
  torch = LazyLoader('torch', globals(), 'torch')

 P = ParamSpec('P')
-logger = logging.getLogger(__name__)
+logger = logging.getLogger('openllm')
 OPENLLM_FIGLET = """\
 ██████╗ ██████╗ ███████╗███╗   ██╗██╗     ██╗     ███╗   ███╗
 ██╔═══██╗██╔══██╗██╔════╝████╗  ██║██║     ██║     ████╗ ████║
@@ -123,21 +122,19 @@ _EXT_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), 'extension


 def backend_warning(backend: LiteralBackend, build: bool = False) -> None:
-  if backend == 'pt' and (not get_disable_warnings()) and not get_quiet_mode():
+  if backend == 'pt':
    if openllm.utils.is_vllm_available():
-      termui.warning(
+      logger.warning(
        'vLLM is available, but using PyTorch backend instead. Note that vLLM is a lot more performant and should always be used in production (by explicitly set --backend vllm).'
      )
    else:
-      termui.warning(
+      logger.warning(
        'vLLM is not available. Note that PyTorch backend is not as performant as vLLM and you should always consider using vLLM for production.'
      )
    if build:
-      termui.info(
+      logger.info(
        "Tip: You can set '--backend vllm' to package your Bento with vLLM backend regardless if vLLM is available locally."
      )
-    if not get_debug_mode():
-      termui.info("To disable these warnings, set 'OPENLLM_DISABLE_WARNING=True'")


 class Extensions(click.MultiCommand):
@@ -419,22 +416,22 @@ def start_command(
      serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
    ),
  )
-  if serialisation == 'safetensors' and quantize is not None and not get_disable_warnings() and not get_quiet_mode():
-    termui.warning(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format.")
-    termui.warning(
-      f"Make sure to check out '{model_id}' repository to see if the weights is in '{serialisation}' format if unsure."
+  if serialisation == 'safetensors' and quantize is not None:
+    logger.warning("'--quantize=%s' might not work with 'safetensors' serialisation format.", quantize)
+    logger.warning(
+      "Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.",
+      model_id,
+      serialisation,
    )
-    termui.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.")
-    if not get_debug_mode():
-      termui.info("To disable these warnings, set 'OPENLLM_DISABLE_WARNING=True'")
+    logger.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.")

  import torch

  if backend == 'pt' and not torch.cuda.is_available():
    if dtype == 'auto':
      dtype = 'float'
-    elif dtype not in {'float', 'float32'} and not get_disable_warnings() and not get_quiet_mode():
-      termui.warning('"bfloat16" and "half" are not supported on CPU. OpenLLM will default fallback to "float32".')
+    elif dtype not in {'float', 'float32'}:
+      logger.warning('"bfloat16" and "half" are not supported on CPU. OpenLLM will default fallback to "float32".')
    dtype = 'float'  # we need to cast back to full precision if cuda is not available
  llm = openllm.LLM[t.Any, t.Any](
    model_id=model_id,
@@ -549,22 +546,22 @@ def start_grpc_command(
  serialisation = first_not_none(
    serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
  )
-  if serialisation == 'safetensors' and quantize is not None and not get_disable_warnings() and not get_quiet_mode():
-    termui.warning(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format.")
-    termui.warning(
-      f"Make sure to check out '{model_id}' repository to see if the weights is in '{serialisation}' format if unsure."
+  if serialisation == 'safetensors' and quantize is not None:
+    logger.warning("'--quantize=%s' might not work with 'safetensors' serialisation format.", quantize)
+    logger.warning(
+      "Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.",
+      model_id,
+      serialisation,
    )
-    termui.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.")
-    if not get_debug_mode():
-      termui.info("To disable these warnings, set 'OPENLLM_DISABLE_WARNING=True'")
+    logger.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.")

  import torch

  if backend == 'pt' and not torch.cuda.is_available():
    if dtype == 'auto':
      dtype = 'float'
-    elif dtype not in {'float', 'float32'} and not get_disable_warnings() and not get_quiet_mode():
-      termui.warning('"bfloat16" and "half" are not supported on CPU. OpenLLM will default fallback to "float32".')
+    elif dtype not in {'float', 'float32'}:
+      logger.warning('"bfloat16" and "half" are not supported on CPU. OpenLLM will default fallback to "float32".')
    dtype = 'float'  # we need to cast back to full precision if cuda is not available
  llm = openllm.LLM[t.Any, t.Any](
    model_id=model_id,
@@ -1095,13 +1092,14 @@ def build_command(

  push_cmd = f'bentoml push {bento_tag}'
  cloud_context = get_current_bentocloud_context()
-  if cloud_context is None and (not get_disable_warnings()) and not get_quiet_mode():
+  if cloud_context is None:
    available_context = [c.name for c in cloud_config.contexts]
    if not available_context:
-      termui.warning('No default BentoCloud context found. Please login with `bentoml cloud login` first.')
+      logger.warning('No default BentoCloud context found. Please login with `bentoml cloud login` first.')
    else:
-      termui.warning(
-        f'No context is passed, but the following context is available: {available_context}. Make sure to specify the argument "--context" for specific context you want to push to.'
+      logger.warning(
+        'No context is passed, but the following context is available: %s. Make sure to specify the argument "--context" for specific context you want to push to.',
+        available_context,
      )
  else:
    push_cmd += f' --context {cloud_context}'