Mirror of https://github.com/bentoml/OpenLLM.git
chore(logger): fix warnings and streamline style (#717)
Sorry, but there is too much wasted spacing in `_llm.py`, and I'm unhappy and unproductive any time I look at it or want to do anything with it.
---------
Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
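The gist of the style change, as a minimal sketch assembled from the diff below (imports and simple statements are grouped into compact one-liners inside `# fmt: off` / `# fmt: on` regions so the formatter leaves them alone):

# Before: one import per line, spread vertically by the formatter.
import functools
import logging
import os

# After: grouped one-liners, with formatting disabled for the region.
# fmt: off
import functools, logging, os, warnings
import attr, inflection, orjson
# fmt: on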
@@ -1,2 +1,2 @@
# fmt: off
if __name__ == '__main__':from openllm_cli.entrypoint import cli;cli() # noqa
if __name__ == '__main__':from openllm_cli.entrypoint import cli;cli()
@@ -1,15 +1,8 @@
from __future__ import annotations
import functools
import logging
import os
import functools, logging, os, warnings
import typing as t

import attr
import inflection
import orjson

import bentoml
import openllm
import attr, inflection, orjson
import bentoml, openllm
from openllm_core._schemas import GenerationOutput
from openllm_core._typing_compat import (
AdapterMap,
@@ -35,8 +28,6 @@ from openllm_core.utils import (
flatten_attrs,
gen_random_uuid,
generate_hash_from_file,
get_disable_warnings,
get_quiet_mode,
getenv,
is_ctranslate_available,
is_peft_available,
@@ -49,365 +40,18 @@ from .exceptions import ForbiddenAttributeError, OpenLLMException
from .serialisation.constants import PEFT_CONFIG_NAME

if t.TYPE_CHECKING:
import torch
import transformers
import torch, transformers
from peft.config import PeftConfig

from openllm_core._configuration import LLMConfig

from ._runners import Runner

ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]]

logger = logging.getLogger(__name__)

def normalise_model_name(name: str) -> str:
if validate_is_path(name):
return os.path.basename(resolve_filepath(name))
name = name.replace('/', '--')
return inflection.dasherize(name)


def _resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
if not is_peft_available():
raise RuntimeError("Requires 'peft' to be installed. Do 'pip install \"openllm[fine-tune]\"'")
from huggingface_hub import hf_hub_download

resolved: AdapterMap = {}
for path_or_adapter_id, name in adapter_map.items():
if name is None:
raise ValueError('Adapter name must be specified.')
if os.path.isfile(os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)):
config_file = os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)
else:
try:
config_file = hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)
except Exception as err:
raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err
with open(config_file, 'r') as file:
resolved_config = orjson.loads(file.read())
# all peft_type should be available in PEFT_CONFIG_NAME
_peft_type = resolved_config['peft_type'].lower()
if _peft_type not in resolved:
resolved[_peft_type] = ()
resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
return resolved


_reserved_namespace = {'model', 'tokenizer', 'runner', 'import_kwargs'}
_AdapterTuple: type[AdapterTuple] = codegen.make_attr_tuple_class('AdapterTuple', ['adapter_id', 'name', 'config'])


@functools.lru_cache(maxsize=1)
def _torch_dtype_mapping():
import torch

return {
'half': torch.float16,
'float': torch.float32,
'float16': torch.float16,
'float32': torch.float32,
'bfloat16': torch.bfloat16,
}
ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]]

@attr.define(slots=True, repr=False, init=False)
class LLM(t.Generic[M, T], ReprMixin):
_model_id: str
_revision: str | None
_quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None
_quantise: LiteralQuantise | None
_model_decls: TupleAny
__model_attrs: DictStrAny
__tokenizer_attrs: DictStrAny
_tag: bentoml.Tag
_adapter_map: AdapterMap | None
_serialisation: LiteralSerialisation
_local: bool
_max_model_len: int | None

__llm_dtype__: LiteralDtype | t.Literal['auto', 'half', 'float'] = 'auto'
__llm_torch_dtype__: 'torch.dtype' = None
__llm_config__: LLMConfig | None = None
__llm_backend__: LiteralBackend = None  # type: ignore
__llm_quantization_config__: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None = None
__llm_runner__: t.Optional[Runner[M, T]] = None
__llm_model__: t.Optional[M] = None
__llm_tokenizer__: t.Optional[T] = None
__llm_adapter_map__: t.Optional[ResolvedAdapterMap] = None
__llm_trust_remote_code__: bool = False

def __init__(
self,
model_id,
model_version=None,
model_tag=None,
llm_config=None,
backend=None,
*args,
quantize=None,
quantization_config=None,
adapter_map=None,
serialisation='safetensors',
trust_remote_code=False,
embedded=False,
dtype='auto',
low_cpu_mem_usage=True,
max_model_len=None,
_eager=True,
**attrs,
):
# fmt: off
torch_dtype = attrs.pop('torch_dtype',None) # backward compatible
if torch_dtype is not None:logger.warning('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.');dtype=torch_dtype
_local = False
if validate_is_path(model_id):model_id,_local=resolve_filepath(model_id),True
backend=first_not_none(getenv('backend',default=backend),default=self._cascade_backend())
dtype=first_not_none(getenv('dtype',default=dtype,var=['TORCH_DTYPE']),default='auto')
quantize=first_not_none(getenv('quantize',default=quantize,var=['QUANITSE']),default=None)
attrs.update({'low_cpu_mem_usage':low_cpu_mem_usage})
# parsing tokenizer and model kwargs, as the hierarchy is param pass > default
model_attrs, tokenizer_attrs = flatten_attrs(**attrs)
if model_tag is None:
model_tag,model_version=self._make_tag_components(model_id,model_version,backend=backend)
if model_version:model_tag=f'{model_tag}:{model_version}'
# fmt: on

self.__attrs_init__(
model_id=model_id,
revision=model_version,
tag=bentoml.Tag.from_taglike(model_tag),
quantization_config=quantization_config,
quantise=self._resolve_quantise(quantize, backend),
model_decls=args,
adapter_map=_resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
serialisation=serialisation,
local=_local,
max_model_len=max_model_len,
LLM__model_attrs=model_attrs,
LLM__tokenizer_attrs=tokenizer_attrs,
llm_dtype__=dtype.lower(),
llm_backend__=backend,
llm_config__=llm_config,
llm_trust_remote_code__=trust_remote_code,
)

if _eager:
try:
model = bentoml.models.get(self.tag)
except bentoml.exceptions.NotFound:
model = openllm.serialisation.import_model(self, trust_remote_code=self.trust_remote_code)
# resolve the tag
self._tag = model.tag
if not _eager and embedded:
raise RuntimeError("Embedded mode is not supported when '_eager' is False.")
if embedded and not get_disable_warnings() and not get_quiet_mode():
logger.warning(
'You are using embedded mode, which means the models will be loaded into memory. This is often not recommended in production and should only be used for local development only.'
)
self.runner.init_local(quiet=True)
# fmt: off
def _resolve_quantise(self, quantise, backend):
if backend in ('pt', 'vllm'):return quantise
if backend=='ctranslate':return self._resolve_ctranslate_quantise(quantise)
raise NotImplementedError(f"Quantisation is not supported for backend '{backend}'")
def _resolve_ctranslate_quantise(self,quantise):
if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}:raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
if quantise == 'int8':quantise='int8_float16' if self._has_gpus else 'int8_float32'
return quantise
@apply(lambda val:tuple(str.lower(i) if i else i for i in val))
def _make_tag_components(self,model_id:str,model_version:str|None,backend:str)->tuple[str,str|None]:
model_id,*maybe_revision=model_id.rsplit(':')
if len(maybe_revision)>0:
if model_version is not None:logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.",maybe_revision[0],model_version)
model_version = maybe_revision[0]
if validate_is_path(model_id):model_id,model_version=resolve_filepath(model_id),first_not_none(model_version,default=generate_hash_from_file(model_id))
return f'{backend}-{normalise_model_name(model_id)}',model_version
@functools.cached_property
def _has_gpus(self):
try:
from cuda import cuda
err,*_=cuda.cuInit(0)
if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to initialise CUDA runtime binding.')
err,num_gpus=cuda.cuDeviceGetCount()
if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to get CUDA device count.')
return True
except (ImportError, RuntimeError):return False
@property
def _torch_dtype(self):
import torch, transformers # noqa: I001
_map=_torch_dtype_mapping()
if not isinstance(self.__llm_torch_dtype__,torch.dtype):
try:hf_config=transformers.AutoConfig.from_pretrained(self.bentomodel.path,trust_remote_code=self.trust_remote_code)
except OpenLLMException:hf_config=transformers.AutoConfig.from_pretrained(self.model_id,trust_remote_code=self.trust_remote_code)
config_dtype=getattr(hf_config,'torch_dtype',None)
if config_dtype is None:config_dtype=torch.float32
if self.__llm_dtype__=='auto':
if config_dtype==torch.float32:torch_dtype=torch.float16
else:torch_dtype=config_dtype
else:
if self.__llm_dtype__ not in _map:raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'")
torch_dtype=_map[self.__llm_dtype__]
self.__llm_torch_dtype__=torch_dtype
return self.__llm_torch_dtype__
@property
def _model_attrs(self):return {**self.import_kwargs[0],**self.__model_attrs}
@_model_attrs.setter
def _model_attrs(self, value):self.__model_attrs = value
@property
def _tokenizer_attrs(self):return {**self.import_kwargs[1],**self.__tokenizer_attrs}
def _cascade_backend(self)->LiteralBackend:
if self._has_gpus:
if is_vllm_available():return 'vllm'
elif is_ctranslate_available():return 'ctranslate' # XXX: base OpenLLM image should always include vLLM
elif is_ctranslate_available():return 'ctranslate'
else:return 'pt'
def __setattr__(self,attr,value):
if attr in _reserved_namespace:raise ForbiddenAttributeError(f'{attr} should not be set during runtime.')
super().__setattr__(attr, value)
def __del__(self):del self.__llm_model__,self.__llm_tokenizer__,self.__llm_adapter_map__
@property
def __repr_keys__(self):return {'model_id','revision','backend','type'}
def __repr_args__(self):
yield 'model_id',self._model_id if not self._local else self.tag.name
yield 'revision',self._revision if self._revision else self.tag.version
yield 'backend',self.__llm_backend__
yield 'type',self.llm_type
@property
def import_kwargs(self):return {'device_map':'auto' if self._has_gpus else None,'torch_dtype':self._torch_dtype},{'padding_side':'left','truncation_side':'left'}
@property
def trust_remote_code(self):
env=os.getenv('TRUST_REMOTE_CODE')
if env is not None:return str(env).upper() in ENV_VARS_TRUE_VALUES
return self.__llm_trust_remote_code__
@property
def model_id(self):return self._model_id
@property
def revision(self):return self._revision
@property
def tag(self):return self._tag
@property
def bentomodel(self):return openllm.serialisation.get(self)
@property
def quantization_config(self):
if self.__llm_quantization_config__ is None:
from ._quantisation import infer_quantisation_config
if self._quantization_config is not None:self.__llm_quantization_config__ = self._quantization_config
elif self._quantise is not None:self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(self,self._quantise,**self._model_attrs)
else:raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
return self.__llm_quantization_config__
@property
def has_adapters(self):return self._adapter_map is not None
@property
def local(self):return self._local
@property
def quantise(self):return self._quantise
@property
def llm_type(self):return normalise_model_name(self._model_id)
@property
def llm_parameters(self):return (self._model_decls,self._model_attrs),self._tokenizer_attrs
@property
def identifying_params(self):return {'configuration':self.config.model_dump_json().decode(),'model_ids':orjson.dumps(self.config['model_ids']).decode(),'model_id':self.model_id}
@property
def tokenizer(self):
if self.__llm_tokenizer__ is None:self.__llm_tokenizer__=openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1])
return self.__llm_tokenizer__
@property
def runner(self):
from ._runners import runner
if self.__llm_runner__ is None:self.__llm_runner__=runner(self)
return self.__llm_runner__
def prepare(self,adapter_type='lora',use_gradient_checking=True,**attrs):
if self.__llm_backend__!='pt':raise RuntimeError('Fine tuning is only supported for PyTorch backend.')
from peft.mapping import get_peft_model
from peft.utils.other import prepare_model_for_kbit_training
model=get_peft_model(
prepare_model_for_kbit_training(self.model,use_gradient_checkpointing=use_gradient_checking),
self.config['fine_tune_strategies']
.get(adapter_type,self.config.make_fine_tune_config(adapter_type))
.train()
.with_config(**attrs)
.build(),
)
if DEBUG:model.print_trainable_parameters()
return model,self.tokenizer
def prepare_for_training(self,*args,**attrs):logger.warning('`prepare_for_training` is deprecated and will be removed in the future. Please use `prepare` instead.');return self.prepare(*args,**attrs)
# fmt: on
@property
def adapter_map(self):
if not is_peft_available():
raise MissingDependencyError("Failed to import 'peft'. Make sure to do 'pip install \"openllm[fine-tune]\"'")
if not self.has_adapters:
raise AttributeError('Adapter map is not available.')
assert self._adapter_map is not None
if self.__llm_adapter_map__ is None:
_map: ResolvedAdapterMap = {k: {} for k in self._adapter_map}
for adapter_type, adapter_tuple in self._adapter_map.items():
base = first_not_none(
self.config['fine_tune_strategies'].get(adapter_type),
default=self.config.make_fine_tune_config(adapter_type),
)
for adapter in adapter_tuple:
_map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id)
self.__llm_adapter_map__ = _map
return self.__llm_adapter_map__
@property
def model(self):
if self.__llm_model__ is None:
model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
# If OOM, then it is probably you don't have enough VRAM to run this model.
if self.__llm_backend__ == 'pt':
import torch

loaded_in_kbit = (
getattr(model, 'is_loaded_in_8bit', False)
or getattr(model, 'is_loaded_in_4bit', False)
or getattr(model, 'is_quantized', False)
)
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
try:
model = model.to('cuda')
except Exception as err:
raise OpenLLMException(f'Failed to load model into GPU: {err}.\n') from err
if self.has_adapters:
logger.debug('Applying the following adapters: %s', self.adapter_map)
for adapter_dict in self.adapter_map.values():
for adapter_name, (peft_config, peft_model_id) in adapter_dict.items():
model.load_adapter(peft_model_id, adapter_name, peft_config=peft_config)
self.__llm_model__ = model
return self.__llm_model__
@property
def config(self):
import transformers

if self.__llm_config__ is None:
if self.__llm_backend__ == 'ctranslate':
try:
config = transformers.AutoConfig.from_pretrained(
self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code
)
except OpenLLMException:
config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
for architecture in config.architectures:
if architecture in openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE():
config = openllm.AutoConfig.infer_class_from_name(
openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture]
).model_construct_env(**self._model_attrs)
break
else:
raise OpenLLMException(
f"Failed to infer the configuration class from the given model. Make sure the model is a supported model. Supported models are: {', '.join(openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE.keys())}"
)
else:
config = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs)
self.__llm_config__ = config
return self.__llm_config__
async def generate(
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
) -> GenerationOutput:
@@ -495,3 +139,325 @@ class LLM(t.Generic[M, T], ReprMixin):
yield generated.with_options(outputs=delta_outputs)
except Exception as err:
raise RuntimeError(f'Exception caught during generation: {err}') from err

# NOTE: If you are here to see how generate_iterator and generate works, see above.
# The below are mainly for internal implementation that you don't have to worry about.
# fmt: off
_model_id:str
_revision:t.Optional[str]
_quantization_config:t.Optional[t.Union[transformers.BitsAndBytesConfig,transformers.GPTQConfig,transformers.AwqConfig]]
_quantise: t.Optional[LiteralQuantise]
_model_decls:TupleAny
__model_attrs:DictStrAny
__tokenizer_attrs:DictStrAny
_tag:bentoml.Tag
_adapter_map:t.Optional[AdapterMap]
_serialisation:LiteralSerialisation
_local:bool
_max_model_len:t.Optional[int]

__llm_dtype__: t.Union[LiteralDtype,t.Literal['auto', 'half', 'float']]='auto'
__llm_torch_dtype__:'torch.dtype'=None
__llm_config__:t.Optional[LLMConfig]=None
__llm_backend__:LiteralBackend=None
__llm_quantization_config__:t.Optional[t.Union[transformers.BitsAndBytesConfig,transformers.GPTQConfig,transformers.AwqConfig]]=None
__llm_runner__:t.Optional[Runner[M, T]]=None
__llm_model__:t.Optional[M]=None
__llm_tokenizer__:t.Optional[T]=None
__llm_adapter_map__:t.Optional[ResolvedAdapterMap]=None
__llm_trust_remote_code__:bool=False
def __init__(
self,
model_id,
model_version=None,
model_tag=None,
llm_config=None,
backend=None,
*args,
quantize=None,
quantization_config=None,
adapter_map=None,
serialisation='safetensors',
trust_remote_code=False,
embedded=False,
dtype='auto',
low_cpu_mem_usage=True,
max_model_len=None,
_eager=True,
**attrs,
):
torch_dtype=attrs.pop('torch_dtype',None) # backward compatible
if torch_dtype is not None:warnings.warn('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',DeprecationWarning,stacklevel=3);dtype=torch_dtype
_local = False
if validate_is_path(model_id):model_id,_local=resolve_filepath(model_id),True
backend=first_not_none(getenv('backend',default=backend),default=self._cascade_backend())
dtype=first_not_none(getenv('dtype',default=dtype,var=['TORCH_DTYPE']),default='auto')
quantize=first_not_none(getenv('quantize',default=quantize,var=['QUANITSE']),default=None)
attrs.update({'low_cpu_mem_usage':low_cpu_mem_usage})
# parsing tokenizer and model kwargs, as the hierarchy is param pass > default
model_attrs,tokenizer_attrs=flatten_attrs(**attrs)
if model_tag is None:
model_tag,model_version=self._make_tag_components(model_id,model_version,backend=backend)
if model_version:model_tag=f'{model_tag}:{model_version}'
self.__attrs_init__(
model_id=model_id,
revision=model_version,
tag=bentoml.Tag.from_taglike(model_tag),
quantization_config=quantization_config,
quantise=getattr(self._Quantise,backend)(self,quantize),
model_decls=args,
adapter_map=convert_peft_config_type(adapter_map) if adapter_map is not None else None,
serialisation=serialisation,
local=_local,
max_model_len=max_model_len,
LLM__model_attrs=model_attrs,
LLM__tokenizer_attrs=tokenizer_attrs,
llm_dtype__=dtype.lower(),
llm_backend__=backend,
llm_config__=llm_config,
llm_trust_remote_code__=trust_remote_code,
)
if _eager:
try:
model=bentoml.models.get(self.tag)
except bentoml.exceptions.NotFound:
model=openllm.serialisation.import_model(self,trust_remote_code=self.trust_remote_code)
# resolve the tag
self._tag=model.tag
if not _eager and embedded:raise RuntimeError("Embedded mode is not supported when '_eager' is False.")
if embedded:logger.warning('Models will be loaded into memory. NOT RECOMMENDED in production and SHOULD ONLY used for development.');self.runner.init_local(quiet=True)
class _Quantise:
@staticmethod
def pt(llm:LLM,quantise=None):return quantise
@staticmethod
def vllm(llm:LLM,quantise=None):return quantise
@staticmethod
def ctranslate(llm:LLM,quantise=None):
if quantise in {'int4','awq','gptq','squeezellm'}:raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
if quantise=='int8':quantise='int8_float16' if llm._has_gpus else 'int8_float32'
return quantise
@apply(lambda val:tuple(str.lower(i) if i else i for i in val))
def _make_tag_components(self,model_id:str,model_version:str|None,backend:str)->tuple[str,str|None]:
model_id,*maybe_revision=model_id.rsplit(':')
if len(maybe_revision)>0:
if model_version is not None:logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.",maybe_revision[0],model_version)
model_version = maybe_revision[0]
if validate_is_path(model_id):model_id,model_version=resolve_filepath(model_id),first_not_none(model_version,default=generate_hash_from_file(model_id))
return f'{backend}-{normalise_model_name(model_id)}',model_version
@functools.cached_property
def _has_gpus(self):
try:
from cuda import cuda
err,*_=cuda.cuInit(0)
if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to initialise CUDA runtime binding.')
err,num_gpus=cuda.cuDeviceGetCount()
if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to get CUDA device count.')
return True
except (ImportError, RuntimeError):return False
@property
def _torch_dtype(self):
import torch, transformers
_map=_torch_dtype_mapping()
if not isinstance(self.__llm_torch_dtype__,torch.dtype):
try:hf_config=transformers.AutoConfig.from_pretrained(self.bentomodel.path,trust_remote_code=self.trust_remote_code)
except OpenLLMException:hf_config=transformers.AutoConfig.from_pretrained(self.model_id,trust_remote_code=self.trust_remote_code)
config_dtype=getattr(hf_config,'torch_dtype',None)
if config_dtype is None:config_dtype=torch.float32
if self.__llm_dtype__=='auto':
if config_dtype==torch.float32:torch_dtype=torch.float16
else:torch_dtype=config_dtype
else:
if self.__llm_dtype__ not in _map:raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'")
torch_dtype=_map[self.__llm_dtype__]
self.__llm_torch_dtype__=torch_dtype
return self.__llm_torch_dtype__
@property
def _model_attrs(self):return {**self.import_kwargs[0],**self.__model_attrs}
@_model_attrs.setter
def _model_attrs(self, value):self.__model_attrs = value
@property
def _tokenizer_attrs(self):return {**self.import_kwargs[1],**self.__tokenizer_attrs}
def _cascade_backend(self)->LiteralBackend:
if self._has_gpus:
if is_vllm_available():return 'vllm'
elif is_ctranslate_available():return 'ctranslate' # XXX: base OpenLLM image should always include vLLM
elif is_ctranslate_available():return 'ctranslate'
else:return 'pt'
def __setattr__(self,attr,value):
if attr in {'model', 'tokenizer', 'runner', 'import_kwargs'}:raise ForbiddenAttributeError(f'{attr} should not be set during runtime.')
super().__setattr__(attr, value)
def __del__(self):del self.__llm_model__,self.__llm_tokenizer__,self.__llm_adapter_map__
@property
def __repr_keys__(self):return {'model_id','revision','backend','type'}
def __repr_args__(self):
yield 'model_id',self._model_id if not self._local else self.tag.name
yield 'revision',self._revision if self._revision else self.tag.version
yield 'backend',self.__llm_backend__
yield 'type',self.llm_type
@property
def import_kwargs(self):return {'device_map':'auto' if self._has_gpus else None,'torch_dtype':self._torch_dtype},{'padding_side':'left','truncation_side':'left'}
@property
def trust_remote_code(self):
env=os.getenv('TRUST_REMOTE_CODE')
if env is not None:return str(env).upper() in ENV_VARS_TRUE_VALUES
return self.__llm_trust_remote_code__
@property
def model_id(self):return self._model_id
@property
def revision(self):return self._revision
@property
def tag(self):return self._tag
@property
def bentomodel(self):return openllm.serialisation.get(self)
@property
def quantization_config(self):
if self.__llm_quantization_config__ is None:
from ._quantisation import infer_quantisation_config
if self._quantization_config is not None:self.__llm_quantization_config__ = self._quantization_config
elif self._quantise is not None:self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(self,self._quantise,**self._model_attrs)
else:raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
return self.__llm_quantization_config__
@property
def has_adapters(self):return self._adapter_map is not None
@property
def local(self):return self._local
@property
def quantise(self):return self._quantise
@property
def llm_type(self):return normalise_model_name(self._model_id)
@property
def llm_parameters(self):return (self._model_decls,self._model_attrs),self._tokenizer_attrs
@property
def identifying_params(self):return {'configuration':self.config.model_dump_json().decode(),'model_ids':orjson.dumps(self.config['model_ids']).decode(),'model_id':self.model_id}
@property
def tokenizer(self):
if self.__llm_tokenizer__ is None:self.__llm_tokenizer__=openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1])
return self.__llm_tokenizer__
@property
def runner(self):
from ._runners import runner
if self.__llm_runner__ is None:self.__llm_runner__=runner(self)
return self.__llm_runner__
def prepare(self,adapter_type='lora',use_gradient_checking=True,**attrs):
if self.__llm_backend__!='pt':raise RuntimeError('Fine tuning is only supported for PyTorch backend.')
from peft.mapping import get_peft_model
from peft.utils.other import prepare_model_for_kbit_training
model=get_peft_model(
prepare_model_for_kbit_training(self.model,use_gradient_checkpointing=use_gradient_checking),
self.config['fine_tune_strategies']
.get(adapter_type,self.config.make_fine_tune_config(adapter_type))
.train()
.with_config(**attrs)
.build(),
)
if DEBUG:model.print_trainable_parameters()
return model,self.tokenizer
def prepare_for_training(self,*args,**attrs):logger.warning('`prepare_for_training` is deprecated and will be removed in the future. Please use `prepare` instead.');return self.prepare(*args,**attrs)
@property
def adapter_map(self):
if not is_peft_available():
raise MissingDependencyError("Failed to import 'peft'. Make sure to do 'pip install \"openllm[fine-tune]\"'")
if not self.has_adapters:
raise AttributeError('Adapter map is not available.')
assert self._adapter_map is not None
if self.__llm_adapter_map__ is None:
_map: ResolvedAdapterMap = {k: {} for k in self._adapter_map}
for adapter_type, adapter_tuple in self._adapter_map.items():
base = first_not_none(
self.config['fine_tune_strategies'].get(adapter_type),
default=self.config.make_fine_tune_config(adapter_type),
)
for adapter in adapter_tuple:
_map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id)
self.__llm_adapter_map__ = _map
return self.__llm_adapter_map__
@property
def model(self):
if self.__llm_model__ is None:
model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
# If OOM, then it is probably you don't have enough VRAM to run this model.
if self.__llm_backend__ == 'pt':
import torch

loaded_in_kbit = (
getattr(model, 'is_loaded_in_8bit', False)
or getattr(model, 'is_loaded_in_4bit', False)
or getattr(model, 'is_quantized', False)
)
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
try:
model = model.to('cuda')
except Exception as err:
raise OpenLLMException(f'Failed to load model into GPU: {err}.\n') from err
if self.has_adapters:
logger.debug('Applying the following adapters: %s', self.adapter_map)
for adapter_dict in self.adapter_map.values():
for adapter_name, (peft_config, peft_model_id) in adapter_dict.items():
model.load_adapter(peft_model_id, adapter_name, peft_config=peft_config)
self.__llm_model__ = model
return self.__llm_model__
@property
def config(self):
import transformers

if self.__llm_config__ is None:
if self.__llm_backend__ == 'ctranslate':
try:
config = transformers.AutoConfig.from_pretrained(
self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code
)
except OpenLLMException:
config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
for architecture in config.architectures:
if architecture in openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE():
config = openllm.AutoConfig.infer_class_from_name(
openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture]
).model_construct_env(**self._model_attrs)
break
else:
raise OpenLLMException(
f"Failed to infer the configuration class from the given model. Make sure the model is a supported model. Supported models are: {', '.join(openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE.keys())}"
)
else:
config = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs)
self.__llm_config__ = config
return self.__llm_config__

# fmt: off
@functools.lru_cache(maxsize=1)
def _torch_dtype_mapping()->dict[str,torch.dtype]:
import torch; return {
'half': torch.float16,
'float': torch.float32,
'float16': torch.float16,
'float32': torch.float32,
'bfloat16': torch.bfloat16,
}
def normalise_model_name(name:str)->str:return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else inflection.dasherize(name.replace('/','--'))
def convert_peft_config_type(adapter_map:dict[str, str])->AdapterMap:
if not is_peft_available():raise RuntimeError("LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'")
from huggingface_hub import hf_hub_download

resolved:AdapterMap={}
for path_or_adapter_id, name in adapter_map.items():
if name is None:raise ValueError('Adapter name must be specified.')
if os.path.isfile(os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)):
config_file=os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)
else:
try:
config_file=hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)
except Exception as err:
raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err
with open(config_file, 'r') as file:resolved_config=orjson.loads(file.read())
_peft_type=resolved_config['peft_type'].lower()
if _peft_type not in resolved:resolved[_peft_type]=()
resolved[_peft_type]+=(_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
return resolved
@@ -6,7 +6,6 @@ import typing as t
import transformers

from openllm.serialisation.constants import HUB_ATTRS
from openllm_core.utils import get_disable_warnings, get_quiet_mode

logger = logging.getLogger(__name__)
@@ -44,10 +43,9 @@ def infer_autoclass_from_llm(llm, config, /):
# in case this model doesn't use the correct auto class for model type, for example like chatglm
# where it uses AutoModel instead of AutoModelForCausalLM. Then we fallback to AutoModel
if autoclass not in config.auto_map:
if not get_disable_warnings() and not get_quiet_mode():
logger.warning(
"OpenLLM failed to determine compatible Auto classes to load %s. Falling back to 'AutoModel'.\nTip: Make sure to specify 'AutoModelForCausalLM' or 'AutoModelForSeq2SeqLM' in your 'config.auto_map'. If your model type is yet to be supported, please file an issues on our GitHub tracker.",
llm._model_id,
)
logger.warning(
"OpenLLM failed to determine compatible Auto classes to load %s. Falling back to 'AutoModel'.\nTip: Make sure to specify 'AutoModelForCausalLM' or 'AutoModelForSeq2SeqLM' in your 'config.auto_map'. If your model type is yet to be supported, please file an issues on our GitHub tracker.",
llm._model_id,
)
autoclass = 'AutoModel'
return getattr(transformers, autoclass)
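For orientation, a minimal usage sketch of the refactored class based on the constructor and properties shown in the diff above; the model id is a hypothetical placeholder, and backend/dtype resolution follows the same cascade and environment-variable fallbacks as in `__init__`:

import openllm

# Hypothetical example: 'facebook/opt-125m' is only an illustration.
# Constructing the LLM eagerly resolves the Bento tag (importing the model
# into the local store on first use), so this needs model access.
llm = openllm.LLM('facebook/opt-125m', backend='pt', dtype='auto')
print(llm.tag, llm.llm_type)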