diff --git a/.ruff.toml b/.ruff.toml index 584858c3..7db838d3 100644 --- a/.ruff.toml +++ b/.ruff.toml @@ -1,77 +1,75 @@ extend-exclude = [ - "tools", - "examples", - "openllm-python/src/openllm/__init__.py", - "openllm-python/src/openllm/_version.py", - "openllm-python/src/openllm/models/__init__.py", - "openllm-python/src/openllm_cli/playground", - "openllm-client/src/openllm_client/pb/**", + "tools", + "examples", + "openllm-python/src/openllm/__init__.py", + "openllm-python/src/openllm/_version.py", + "openllm-python/src/openllm/models/__init__.py", + "openllm-python/src/openllm_cli/playground", + "openllm-client/src/openllm_client/pb/**", ] extend-include = ["*.ipynb"] extend-select = [ - "E", - "F", - "B", - "PIE", - "I", # isort - "G", # flake8-logging-format - "W", # pycodestyle - "Q", # flake8-quotes - "FA", # flake8-future-annotations - "TCH", # flake8-type-checking - "PLW", # pylint-warning - "PLR", # pylint-refactor - "PT", # flake8-pytest-style - "PERF", # perflint - "FLY", # flynt - "RUF", # Ruff-specific rules - "YTT", # flake8-2020 + "E", + "F", + "B", + "PIE", + "G", # flake8-logging-format + "W", # pycodestyle + "Q", # flake8-quotes + "FA", # flake8-future-annotations + "TCH", # flake8-type-checking + "PLW", # pylint-warning + "PLR", # pylint-refactor + "PT", # flake8-pytest-style + "PERF", # perflint + "RUF", # Ruff-specific rules + "YTT", # flake8-2020 ] fix = true ignore = [ - "PLR0911", - "PLR0912", - "PLR0913", - "PLR0915", - "PLR2004", # magic value to use constant - "E501", # ignore line length violation - "E401", # ignore multiple line import - "E702", - "TCH004", # don't move runtime import out, just warn about it - "RUF012", # mutable attributes to be used with ClassVar - "E701", # multiple statement on single line + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + "PLR2004", # magic value to use constant + "E501", # ignore line length violation + "E401", # ignore multiple line import + "E702", + "TCH004", # don't move runtime import out, just warn about it + "RUF012", # mutable attributes to be used with ClassVar + "E701", # multiple statement on single line ] line-length = 119 indent-width = 2 target-version = "py38" typing-modules = [ - "openllm_core._typing_compat", - "openllm_client._typing_compat", + "openllm_core._typing_compat", + "openllm_client._typing_compat", ] unfixable = ["TCH004"] [lint.flake8-type-checking] exempt-modules = [ - "typing", - "typing_extensions", - "openllm_core._typing_compat", - "openllm_client._typing_compat", + "typing", + "typing_extensions", + "openllm_core._typing_compat", + "openllm_client._typing_compat", ] runtime-evaluated-base-classes = [ - "openllm_core._configuration.LLMConfig", - "openllm_core._configuration.GenerationConfig", - "openllm_core._configuration.SamplingParams", - "openllm_core._configuration.ModelSettings", - "openllm.LLMConfig", + "openllm_core._configuration.LLMConfig", + "openllm_core._configuration.GenerationConfig", + "openllm_core._configuration.SamplingParams", + "openllm_core._configuration.ModelSettings", + "openllm.LLMConfig", ] runtime-evaluated-decorators = [ - "attrs.define", - "attrs.frozen", - "trait", - "attr.attrs", - 'attr.define', - '_attr.define', - 'attr.frozen', + "attrs.define", + "attrs.frozen", + "trait", + "attr.attrs", + 'attr.define', + '_attr.define', + 'attr.frozen', ] [format] @@ -87,29 +85,6 @@ convention = "google" ignore-overlong-task-comments = true max-line-length = 119 -[lint.isort] -combine-as-imports = true -known-first-party = [ - "openllm", - "bentoml", - 
"openllm_core", - "openllm_client", - "openllm_cli", -] -known-third-party = [ - "transformers", - "click", - "huggingface_hub", - "torch", - "vllm", - "auto_gptq", - "peft", - "click_option_group", -] -split-on-trailing-comma = false -no-lines-before = ["future", "standard-library"] -relative-imports-order = "closest-to-furthest" - [lint.flake8-quotes] avoid-escape = false inline-quotes = "single" @@ -121,5 +96,4 @@ docstring-quotes = "double" "openllm-python/src/openllm/_llm.py" = ["F811"] "openllm-core/src/openllm_core/utils/import_utils.py" = ["PLW0603", "F811"] "openllm-core/src/openllm_core/_configuration.py" = ["F811", "Q001"] -"openllm-python/src/openllm/__init__.pyi" = ["I001"] "openllm-python/src/openllm/_service_vars_pkg.py" = ["F821"] diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index d0963bf1..69da3a52 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -36,7 +36,7 @@ logger = logging.getLogger(__name__) _T=t.TypeVar('_T') @functools.lru_cache(maxsize=1) def _WithArgsTypes()->tuple[type[t.Any],...]: - try:from typing import GenericAlias as _TypingGenericAlias # type: ignore # noqa: I001 + try:from typing import GenericAlias as _TypingGenericAlias # type: ignore except ImportError:_TypingGenericAlias = () # type: ignore # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on) # _GenericAlias is the actual GenericAlias implementation return (_TypingGenericAlias,) if sys.version_info<(3,10) else (t._GenericAlias, types.GenericAlias, types.UnionType) # type: ignore @@ -69,7 +69,7 @@ def getenv(env:str,default:t.Any=None,var:t.Sequence[str]|None=None)->t.Any: if var is not None:env_key=set(var)|env_key def callback(k:str)->t.Any: _var = os.getenv(k) - if _var and k.startswith('OPENLLM_') and not get_disable_warnings() and not get_quiet_mode():logger.warning("Using '%s' environment is deprecated, use '%s' instead.",k.upper(),k[8:].upper()) + if _var and k.startswith('OPENLLM_'):logger.warning("Using '%s' environment is deprecated, use '%s' instead.",k.upper(),k[8:].upper()) return _var return first_not_none(*(callback(k) for k in env_key),default=default) def field_env_key(key:str,suffix:str|None=None)->str:return '_'.join(filter(None,map(str.upper,['OPENLLM',suffix.strip('_') if suffix else '',key]))) @@ -80,13 +80,13 @@ def get_quiet_mode()->bool: return False def get_disable_warnings()->bool:return check_bool_env(WARNING_ENV_VAR, False) def set_disable_warnings(disable:bool=True)->None: - if get_disable_warnings():os.environ[WARNING_ENV_VAR]=str(disable) + if disable:os.environ[WARNING_ENV_VAR]=str(disable) def set_debug_mode(enabled:bool,level:int=1)->None: if enabled:os.environ[DEV_DEBUG_VAR] = str(level) os.environ.update({DEBUG_ENV_VAR:str(enabled),QUIET_ENV_VAR:str(not enabled),_GRPC_DEBUG_ENV_VAR:'DEBUG' if enabled else 'ERROR','CT2_VERBOSE':'3'}) - set_disable_warnings(enabled) + set_disable_warnings(not enabled) def set_quiet_mode(enabled:bool)->None: - os.environ.update({QUIET_ENV_VAR:str(enabled),_GRPC_DEBUG_ENV_VAR:'NONE','CT2_VERBOSE':'-1'}) + os.environ.update({QUIET_ENV_VAR:str(enabled),DEBUG_ENV_VAR:str(not enabled),_GRPC_DEBUG_ENV_VAR:'NONE','CT2_VERBOSE':'-1'}) set_disable_warnings(enabled) def gen_random_uuid(prefix:str|None=None)->str:return '-'.join([prefix or 'openllm', str(uuid.uuid4().hex)]) # NOTE: `compose` any number of unary functions into a single unary function @@ -113,11 +113,8 @@ def 
generate_context(framework_name:str): return ModelContext(framework_name=framework_name,framework_versions=framework_versions) @functools.lru_cache(maxsize=1) def in_notebook()->bool: -  try:from IPython.core.getipython import get_ipython; return 'IPKernelApp' in get_ipython().config # noqa: I001 -  except (ImportError, AttributeError):return False -# Used to filter out INFO log -class InfoFilter(logging.Filter): -  def filter(self,record:logging.LogRecord)->bool:return logging.INFO<=record.levelno<logging.WARNING +  try:from IPython.core.getipython import get_ipython; return 'IPKernelApp' in get_ipython().config +  except (ImportError, AttributeError):return False def flatten_attrs(**attrs:t.Any)->tuple[dict[str,t.Any],dict[str, t.Any]]: tokenizer_attrs = {k[len(_TOKENIZER_PREFIX):]:v for k,v in attrs.items() if k.startswith(_TOKENIZER_PREFIX)} @@ -130,31 +127,31 @@ DEBUG=sys.flags.dev_mode or (not sys.flags.ignore_environment and check_bool_env SHOW_CODEGEN=DEBUG and (os.environ.get(DEV_DEBUG_VAR,str(0)).isdigit() and int(os.environ.get(DEV_DEBUG_VAR,str(0)))>3) # MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins MYPY=False -# fmt: on - - class ExceptionFilter(logging.Filter): def __init__(self, exclude_exceptions: list[type[Exception]] | None = None, **kwargs: t.Any): - if exclude_exceptions is None: - exclude_exceptions = [] + if exclude_exceptions is None:exclude_exceptions=[] try: from circus.exc import ConflictError - - if ConflictError not in exclude_exceptions: - exclude_exceptions.append(ConflictError) + if ConflictError not in exclude_exceptions:exclude_exceptions.append(ConflictError) except ImportError: pass super(ExceptionFilter, self).__init__(**kwargs) - self.EXCLUDE_EXCEPTIONS = exclude_exceptions - - def filter(self, record: logging.LogRecord) -> bool: + self.EXCLUDE_EXCEPTIONS=exclude_exceptions + def filter(self,record:logging.LogRecord)->bool: if record.exc_info: - etype, _, _ = record.exc_info + etype,_,_=record.exc_info if etype is not None: for exc in self.EXCLUDE_EXCEPTIONS: - if issubclass(etype, exc): - return False + if issubclass(etype, exc):return False return True +# Used to filter out INFO log +class InfoFilter(logging.Filter): +  def filter(self,record:logging.LogRecord)->bool:return logging.INFO<=record.levelno<logging.WARNING +class WarningFilter(logging.Filter): +  def filter(self,record:logging.LogRecord)->bool: +    if get_disable_warnings():return record.levelno>=logging.ERROR +    return True +# fmt: on _LOGGING_CONFIG: dict[str, t.Any] = { @@ -163,11 +160,12 @@ _LOGGING_CONFIG: dict[str, t.Any] = { 'filters': { 'excfilter': {'()': 'openllm_core.utils.ExceptionFilter'}, 'infofilter': {'()': 'openllm_core.utils.InfoFilter'}, + 'warningfilter': {'()': 'openllm_core.utils.WarningFilter'}, }, 'handlers': { 'bentomlhandler': { 'class': 'logging.StreamHandler', - 'filters': ['excfilter', 'infofilter'], + 'filters': ['excfilter', 'warningfilter', 'infofilter'], 'stream': 'ext://sys.stdout', }, 'defaulthandler': {'class': 'logging.StreamHandler', 'level': logging.WARNING}, @@ -195,6 +193,9 @@ def configure_logging() -> None: _LOGGING_CONFIG['loggers']['bentoml']['level'] = logging.INFO _LOGGING_CONFIG['root']['level'] = logging.INFO + if get_disable_warnings(): # HACK: This is a hack to disable warnings + _LOGGING_CONFIG['loggers']['openllm']['level'] = logging.ERROR + logging.config.dictConfig(_LOGGING_CONFIG) @@ -241,24 +242,3 @@ __lazy = LazyModule( __all__ = __lazy.__all__ __dir__ = __lazy.__dir__ __getattr__ = __lazy.__getattr__ - -if t.TYPE_CHECKING: - from . 
import analytics as analytics, codegen as codegen, dantic as dantic, serde as serde - from .import_utils import ( - OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES, - is_autoawq_available as is_autoawq_available, - is_autogptq_available as is_autogptq_available, - is_bentoml_available as is_bentoml_available, - is_bitsandbytes_available as is_bitsandbytes_available, - is_ctranslate_available as is_ctranslate_available, - is_grpc_available as is_grpc_available, - is_jupyter_available as is_jupyter_available, - is_jupytext_available as is_jupytext_available, - is_notebook_available as is_notebook_available, - is_peft_available as is_peft_available, - is_torch_available as is_torch_available, - is_transformers_available as is_transformers_available, - is_vllm_available as is_vllm_available, - ) - from .representation import ReprMixin as ReprMixin - from .serde import converter as converter diff --git a/openllm-python/src/openllm/__main__.py b/openllm-python/src/openllm/__main__.py index 99866462..581b46ee 100644 --- a/openllm-python/src/openllm/__main__.py +++ b/openllm-python/src/openllm/__main__.py @@ -1,2 +1,2 @@ # fmt: off -if __name__ == '__main__':from openllm_cli.entrypoint import cli;cli() # noqa +if __name__ == '__main__':from openllm_cli.entrypoint import cli;cli() diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 31b63d72..5a6ac73f 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -1,15 +1,8 @@ from __future__ import annotations -import functools -import logging -import os +import functools, logging, os, warnings import typing as t - -import attr -import inflection -import orjson - -import bentoml -import openllm +import attr, inflection, orjson +import bentoml, openllm from openllm_core._schemas import GenerationOutput from openllm_core._typing_compat import ( AdapterMap, @@ -35,8 +28,6 @@ from openllm_core.utils import ( flatten_attrs, gen_random_uuid, generate_hash_from_file, - get_disable_warnings, - get_quiet_mode, getenv, is_ctranslate_available, is_peft_available, @@ -49,365 +40,18 @@ from .exceptions import ForbiddenAttributeError, OpenLLMException from .serialisation.constants import PEFT_CONFIG_NAME if t.TYPE_CHECKING: - import torch - import transformers + import torch, transformers from peft.config import PeftConfig - from openllm_core._configuration import LLMConfig - from ._runners import Runner -ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]] - logger = logging.getLogger(__name__) - - -def normalise_model_name(name: str) -> str: - if validate_is_path(name): - return os.path.basename(resolve_filepath(name)) - name = name.replace('/', '--') - return inflection.dasherize(name) - - -def _resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap: - if not is_peft_available(): - raise RuntimeError("Requires 'peft' to be installed. 
Do 'pip install \"openllm[fine-tune]\"'") - from huggingface_hub import hf_hub_download - - resolved: AdapterMap = {} - for path_or_adapter_id, name in adapter_map.items(): - if name is None: - raise ValueError('Adapter name must be specified.') - if os.path.isfile(os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)): - config_file = os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME) - else: - try: - config_file = hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME) - except Exception as err: - raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err - with open(config_file, 'r') as file: - resolved_config = orjson.loads(file.read()) - # all peft_type should be available in PEFT_CONFIG_NAME - _peft_type = resolved_config['peft_type'].lower() - if _peft_type not in resolved: - resolved[_peft_type] = () - resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),) - return resolved - - -_reserved_namespace = {'model', 'tokenizer', 'runner', 'import_kwargs'} _AdapterTuple: type[AdapterTuple] = codegen.make_attr_tuple_class('AdapterTuple', ['adapter_id', 'name', 'config']) - - -@functools.lru_cache(maxsize=1) -def _torch_dtype_mapping(): - import torch - - return { - 'half': torch.float16, - 'float': torch.float32, - 'float16': torch.float16, - 'float32': torch.float32, - 'bfloat16': torch.bfloat16, - } +ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]] @attr.define(slots=True, repr=False, init=False) class LLM(t.Generic[M, T], ReprMixin): - _model_id: str - _revision: str | None - _quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None - _quantise: LiteralQuantise | None - _model_decls: TupleAny - __model_attrs: DictStrAny - __tokenizer_attrs: DictStrAny - _tag: bentoml.Tag - _adapter_map: AdapterMap | None - _serialisation: LiteralSerialisation - _local: bool - _max_model_len: int | None - - __llm_dtype__: LiteralDtype | t.Literal['auto', 'half', 'float'] = 'auto' - __llm_torch_dtype__: 'torch.dtype' = None - __llm_config__: LLMConfig | None = None - __llm_backend__: LiteralBackend = None # type: ignore - __llm_quantization_config__: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None = None - __llm_runner__: t.Optional[Runner[M, T]] = None - __llm_model__: t.Optional[M] = None - __llm_tokenizer__: t.Optional[T] = None - __llm_adapter_map__: t.Optional[ResolvedAdapterMap] = None - __llm_trust_remote_code__: bool = False - - def __init__( - self, - model_id, - model_version=None, - model_tag=None, - llm_config=None, - backend=None, - *args, - quantize=None, - quantization_config=None, - adapter_map=None, - serialisation='safetensors', - trust_remote_code=False, - embedded=False, - dtype='auto', - low_cpu_mem_usage=True, - max_model_len=None, - _eager=True, - **attrs, - ): - # fmt: off - torch_dtype = attrs.pop('torch_dtype',None) # backward compatible - if torch_dtype is not None:logger.warning('The argument "torch_dtype" is deprecated and will be removed in the future. 
Please use "dtype" instead.');dtype=torch_dtype - _local = False - if validate_is_path(model_id):model_id,_local=resolve_filepath(model_id),True - backend=first_not_none(getenv('backend',default=backend),default=self._cascade_backend()) - dtype=first_not_none(getenv('dtype',default=dtype,var=['TORCH_DTYPE']),default='auto') - quantize=first_not_none(getenv('quantize',default=quantize,var=['QUANITSE']),default=None) - attrs.update({'low_cpu_mem_usage':low_cpu_mem_usage}) - # parsing tokenizer and model kwargs, as the hierarchy is param pass > default - model_attrs, tokenizer_attrs = flatten_attrs(**attrs) - if model_tag is None: - model_tag,model_version=self._make_tag_components(model_id,model_version,backend=backend) - if model_version:model_tag=f'{model_tag}:{model_version}' - # fmt: on - - self.__attrs_init__( - model_id=model_id, - revision=model_version, - tag=bentoml.Tag.from_taglike(model_tag), - quantization_config=quantization_config, - quantise=self._resolve_quantise(quantize, backend), - model_decls=args, - adapter_map=_resolve_peft_config_type(adapter_map) if adapter_map is not None else None, - serialisation=serialisation, - local=_local, - max_model_len=max_model_len, - LLM__model_attrs=model_attrs, - LLM__tokenizer_attrs=tokenizer_attrs, - llm_dtype__=dtype.lower(), - llm_backend__=backend, - llm_config__=llm_config, - llm_trust_remote_code__=trust_remote_code, - ) - - if _eager: - try: - model = bentoml.models.get(self.tag) - except bentoml.exceptions.NotFound: - model = openllm.serialisation.import_model(self, trust_remote_code=self.trust_remote_code) - # resolve the tag - self._tag = model.tag - if not _eager and embedded: - raise RuntimeError("Embedded mode is not supported when '_eager' is False.") - if embedded and not get_disable_warnings() and not get_quiet_mode(): - logger.warning( - 'You are using embedded mode, which means the models will be loaded into memory. This is often not recommended in production and should only be used for local development only.' 
- ) - self.runner.init_local(quiet=True) - - # fmt: off - def _resolve_quantise(self, quantise, backend): - if backend in ('pt', 'vllm'):return quantise - if backend=='ctranslate':return self._resolve_ctranslate_quantise(quantise) - raise NotImplementedError(f"Quantisation is not supported for backend '{backend}'") - def _resolve_ctranslate_quantise(self,quantise): - if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}:raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'") - if quantise == 'int8':quantise='int8_float16' if self._has_gpus else 'int8_float32' - return quantise - @apply(lambda val:tuple(str.lower(i) if i else i for i in val)) - def _make_tag_components(self,model_id:str,model_version:str|None,backend:str)->tuple[str,str|None]: - model_id,*maybe_revision=model_id.rsplit(':') - if len(maybe_revision)>0: - if model_version is not None:logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.",maybe_revision[0],model_version) - model_version = maybe_revision[0] - if validate_is_path(model_id):model_id,model_version=resolve_filepath(model_id),first_not_none(model_version,default=generate_hash_from_file(model_id)) - return f'{backend}-{normalise_model_name(model_id)}',model_version - @functools.cached_property - def _has_gpus(self): - try: - from cuda import cuda - err,*_=cuda.cuInit(0) - if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to initialise CUDA runtime binding.') - err,num_gpus=cuda.cuDeviceGetCount() - if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to get CUDA device count.') - return True - except (ImportError, RuntimeError):return False - @property - def _torch_dtype(self): - import torch, transformers # noqa: I001 - _map=_torch_dtype_mapping() - if not isinstance(self.__llm_torch_dtype__,torch.dtype): - try:hf_config=transformers.AutoConfig.from_pretrained(self.bentomodel.path,trust_remote_code=self.trust_remote_code) - except OpenLLMException:hf_config=transformers.AutoConfig.from_pretrained(self.model_id,trust_remote_code=self.trust_remote_code) - config_dtype=getattr(hf_config,'torch_dtype',None) - if config_dtype is None:config_dtype=torch.float32 - if self.__llm_dtype__=='auto': - if config_dtype==torch.float32:torch_dtype=torch.float16 - else:torch_dtype=config_dtype - else: - if self.__llm_dtype__ not in _map:raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'") - torch_dtype=_map[self.__llm_dtype__] - self.__llm_torch_dtype__=torch_dtype - return self.__llm_torch_dtype__ - @property - def _model_attrs(self):return {**self.import_kwargs[0],**self.__model_attrs} - @_model_attrs.setter - def _model_attrs(self, value):self.__model_attrs = value - @property - def _tokenizer_attrs(self):return {**self.import_kwargs[1],**self.__tokenizer_attrs} - def _cascade_backend(self)->LiteralBackend: - if self._has_gpus: - if is_vllm_available():return 'vllm' - elif is_ctranslate_available():return 'ctranslate' # XXX: base OpenLLM image should always include vLLM - elif is_ctranslate_available():return 'ctranslate' - else:return 'pt' - def __setattr__(self,attr,value): - if attr in _reserved_namespace:raise ForbiddenAttributeError(f'{attr} should not be set during runtime.') - super().__setattr__(attr, value) - def __del__(self):del self.__llm_model__,self.__llm_tokenizer__,self.__llm_adapter_map__ - @property - def __repr_keys__(self):return {'model_id','revision','backend','type'} - def __repr_args__(self): - yield 'model_id',self._model_id if not self._local else 
self.tag.name - yield 'revision',self._revision if self._revision else self.tag.version - yield 'backend',self.__llm_backend__ - yield 'type',self.llm_type - @property - def import_kwargs(self):return {'device_map':'auto' if self._has_gpus else None,'torch_dtype':self._torch_dtype},{'padding_side':'left','truncation_side':'left'} - @property - def trust_remote_code(self): - env=os.getenv('TRUST_REMOTE_CODE') - if env is not None:return str(env).upper() in ENV_VARS_TRUE_VALUES - return self.__llm_trust_remote_code__ - @property - def model_id(self):return self._model_id - @property - def revision(self):return self._revision - @property - def tag(self):return self._tag - @property - def bentomodel(self):return openllm.serialisation.get(self) - @property - def quantization_config(self): - if self.__llm_quantization_config__ is None: - from ._quantisation import infer_quantisation_config - if self._quantization_config is not None:self.__llm_quantization_config__ = self._quantization_config - elif self._quantise is not None:self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(self,self._quantise,**self._model_attrs) - else:raise ValueError("Either 'quantization_config' or 'quantise' must be specified.") - return self.__llm_quantization_config__ - @property - def has_adapters(self):return self._adapter_map is not None - @property - def local(self):return self._local - @property - def quantise(self):return self._quantise - @property - def llm_type(self):return normalise_model_name(self._model_id) - @property - def llm_parameters(self):return (self._model_decls,self._model_attrs),self._tokenizer_attrs - @property - def identifying_params(self):return {'configuration':self.config.model_dump_json().decode(),'model_ids':orjson.dumps(self.config['model_ids']).decode(),'model_id':self.model_id} - @property - def tokenizer(self): - if self.__llm_tokenizer__ is None:self.__llm_tokenizer__=openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1]) - return self.__llm_tokenizer__ - @property - def runner(self): - from ._runners import runner - if self.__llm_runner__ is None:self.__llm_runner__=runner(self) - return self.__llm_runner__ - def prepare(self,adapter_type='lora',use_gradient_checking=True,**attrs): - if self.__llm_backend__!='pt':raise RuntimeError('Fine tuning is only supported for PyTorch backend.') - from peft.mapping import get_peft_model - from peft.utils.other import prepare_model_for_kbit_training - model=get_peft_model( - prepare_model_for_kbit_training(self.model,use_gradient_checkpointing=use_gradient_checking), - self.config['fine_tune_strategies'] - .get(adapter_type,self.config.make_fine_tune_config(adapter_type)) - .train() - .with_config(**attrs) - .build(), - ) - if DEBUG:model.print_trainable_parameters() - return model,self.tokenizer - def prepare_for_training(self,*args,**attrs):logger.warning('`prepare_for_training` is deprecated and will be removed in the future. Please use `prepare` instead.');return self.prepare(*args,**attrs) - # fmt: on - - @property - def adapter_map(self): - if not is_peft_available(): - raise MissingDependencyError("Failed to import 'peft'. 
Make sure to do 'pip install \"openllm[fine-tune]\"'") - if not self.has_adapters: - raise AttributeError('Adapter map is not available.') - assert self._adapter_map is not None - if self.__llm_adapter_map__ is None: - _map: ResolvedAdapterMap = {k: {} for k in self._adapter_map} - for adapter_type, adapter_tuple in self._adapter_map.items(): - base = first_not_none( - self.config['fine_tune_strategies'].get(adapter_type), - default=self.config.make_fine_tune_config(adapter_type), - ) - for adapter in adapter_tuple: - _map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id) - self.__llm_adapter_map__ = _map - return self.__llm_adapter_map__ - - @property - def model(self): - if self.__llm_model__ is None: - model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs) - # If OOM, then it is probably you don't have enough VRAM to run this model. - if self.__llm_backend__ == 'pt': - import torch - - loaded_in_kbit = ( - getattr(model, 'is_loaded_in_8bit', False) - or getattr(model, 'is_loaded_in_4bit', False) - or getattr(model, 'is_quantized', False) - ) - if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit: - try: - model = model.to('cuda') - except Exception as err: - raise OpenLLMException(f'Failed to load model into GPU: {err}.\n') from err - if self.has_adapters: - logger.debug('Applying the following adapters: %s', self.adapter_map) - for adapter_dict in self.adapter_map.values(): - for adapter_name, (peft_config, peft_model_id) in adapter_dict.items(): - model.load_adapter(peft_model_id, adapter_name, peft_config=peft_config) - self.__llm_model__ = model - return self.__llm_model__ - - @property - def config(self): - import transformers - - if self.__llm_config__ is None: - if self.__llm_backend__ == 'ctranslate': - try: - config = transformers.AutoConfig.from_pretrained( - self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code - ) - except OpenLLMException: - config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code) - for architecture in config.architectures: - if architecture in openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE(): - config = openllm.AutoConfig.infer_class_from_name( - openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture] - ).model_construct_env(**self._model_attrs) - break - else: - raise OpenLLMException( - f"Failed to infer the configuration class from the given model. Make sure the model is a supported model. Supported models are: {', '.join(openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE.keys())}" - ) - else: - config = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs) - self.__llm_config__ = config - return self.__llm_config__ - async def generate( self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs ) -> GenerationOutput: @@ -495,3 +139,325 @@ class LLM(t.Generic[M, T], ReprMixin): yield generated.with_options(outputs=delta_outputs) except Exception as err: raise RuntimeError(f'Exception caught during generation: {err}') from err + + # NOTE: If you are here to see how generate_iterator and generate works, see above. + # The below are mainly for internal implementation that you don't have to worry about. 
+  # fmt: off + +  _model_id:str +  _revision:t.Optional[str] +  _quantization_config:t.Optional[t.Union[transformers.BitsAndBytesConfig,transformers.GPTQConfig,transformers.AwqConfig]] +  _quantise: t.Optional[LiteralQuantise] +  _model_decls:TupleAny +  __model_attrs:DictStrAny +  __tokenizer_attrs:DictStrAny +  _tag:bentoml.Tag +  _adapter_map:t.Optional[AdapterMap] +  _serialisation:LiteralSerialisation +  _local:bool +  _max_model_len:t.Optional[int] + +  __llm_dtype__: t.Union[LiteralDtype,t.Literal['auto', 'half', 'float']]='auto' +  __llm_torch_dtype__:'torch.dtype'=None +  __llm_config__:t.Optional[LLMConfig]=None +  __llm_backend__:LiteralBackend=None +  __llm_quantization_config__:t.Optional[t.Union[transformers.BitsAndBytesConfig,transformers.GPTQConfig,transformers.AwqConfig]]=None +  __llm_runner__:t.Optional[Runner[M, T]]=None +  __llm_model__:t.Optional[M]=None +  __llm_tokenizer__:t.Optional[T]=None +  __llm_adapter_map__:t.Optional[ResolvedAdapterMap]=None +  __llm_trust_remote_code__:bool=False + +  def __init__( +    self, +    model_id, +    model_version=None, +    model_tag=None, +    llm_config=None, +    backend=None, +    *args, +    quantize=None, +    quantization_config=None, +    adapter_map=None, +    serialisation='safetensors', +    trust_remote_code=False, +    embedded=False, +    dtype='auto', +    low_cpu_mem_usage=True, +    max_model_len=None, +    _eager=True, +    **attrs, +  ): +    torch_dtype=attrs.pop('torch_dtype',None) # backward compatible +    if torch_dtype is not None:warnings.warn('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',DeprecationWarning,stacklevel=3);dtype=torch_dtype +    _local = False +    if validate_is_path(model_id):model_id,_local=resolve_filepath(model_id),True +    backend=first_not_none(getenv('backend',default=backend),default=self._cascade_backend()) +    dtype=first_not_none(getenv('dtype',default=dtype,var=['TORCH_DTYPE']),default='auto') +    quantize=first_not_none(getenv('quantize',default=quantize,var=['QUANITSE']),default=None) +    attrs.update({'low_cpu_mem_usage':low_cpu_mem_usage}) +    # parsing tokenizer and model kwargs, as the hierarchy is param pass > default +    model_attrs,tokenizer_attrs=flatten_attrs(**attrs) +    if model_tag is None: +      model_tag,model_version=self._make_tag_components(model_id,model_version,backend=backend) +      if model_version:model_tag=f'{model_tag}:{model_version}' + +    self.__attrs_init__( +      model_id=model_id, +      revision=model_version, +      tag=bentoml.Tag.from_taglike(model_tag), +      quantization_config=quantization_config, +      quantise=getattr(self._Quantise,backend)(self,quantize), +      model_decls=args, +      adapter_map=convert_peft_config_type(adapter_map) if adapter_map is not None else None, +      serialisation=serialisation, +      local=_local, +      max_model_len=max_model_len, +      LLM__model_attrs=model_attrs, +      LLM__tokenizer_attrs=tokenizer_attrs, +      llm_dtype__=dtype.lower(), +      llm_backend__=backend, +      llm_config__=llm_config, +      llm_trust_remote_code__=trust_remote_code, +    ) + +    if _eager: +      try: +        model=bentoml.models.get(self.tag) +      except bentoml.exceptions.NotFound: +        model=openllm.serialisation.import_model(self,trust_remote_code=self.trust_remote_code) +      # resolve the tag +      self._tag=model.tag +    if not _eager and embedded:raise RuntimeError("Embedded mode is not supported when '_eager' is False.") +    if embedded:logger.warning('Models will be loaded into memory. 
NOT RECOMMENDED in production and SHOULD ONLY be used for development.');self.runner.init_local(quiet=True) +  class _Quantise: +    @staticmethod +    def pt(llm:LLM,quantise=None):return quantise +    @staticmethod +    def vllm(llm:LLM,quantise=None):return quantise +    @staticmethod +    def ctranslate(llm:LLM,quantise=None): +      if quantise in {'int4','awq','gptq','squeezellm'}:raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'") +      if quantise=='int8':quantise='int8_float16' if llm._has_gpus else 'int8_float32' +      return quantise +  @apply(lambda val:tuple(str.lower(i) if i else i for i in val)) +  def _make_tag_components(self,model_id:str,model_version:str|None,backend:str)->tuple[str,str|None]: +    model_id,*maybe_revision=model_id.rsplit(':') +    if len(maybe_revision)>0: +      if model_version is not None:logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.",maybe_revision[0],model_version) +      model_version = maybe_revision[0] +    if validate_is_path(model_id):model_id,model_version=resolve_filepath(model_id),first_not_none(model_version,default=generate_hash_from_file(model_id)) +    return f'{backend}-{normalise_model_name(model_id)}',model_version +  @functools.cached_property +  def _has_gpus(self): +    try: +      from cuda import cuda +      err,*_=cuda.cuInit(0) +      if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to initialise CUDA runtime binding.') +      err,num_gpus=cuda.cuDeviceGetCount() +      if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to get CUDA device count.') +      return True +    except (ImportError, RuntimeError):return False +  @property +  def _torch_dtype(self): +    import torch, transformers +    _map=_torch_dtype_mapping() +    if not isinstance(self.__llm_torch_dtype__,torch.dtype): +      try:hf_config=transformers.AutoConfig.from_pretrained(self.bentomodel.path,trust_remote_code=self.trust_remote_code) +      except OpenLLMException:hf_config=transformers.AutoConfig.from_pretrained(self.model_id,trust_remote_code=self.trust_remote_code) +      config_dtype=getattr(hf_config,'torch_dtype',None) +      if config_dtype is None:config_dtype=torch.float32 +      if self.__llm_dtype__=='auto': +        if config_dtype==torch.float32:torch_dtype=torch.float16 +        else:torch_dtype=config_dtype +      else: +        if self.__llm_dtype__ not in _map:raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'") +        torch_dtype=_map[self.__llm_dtype__] +      self.__llm_torch_dtype__=torch_dtype +    return self.__llm_torch_dtype__ +  @property +  def _model_attrs(self):return {**self.import_kwargs[0],**self.__model_attrs} +  @_model_attrs.setter +  def _model_attrs(self, value):self.__model_attrs = value +  @property +  def _tokenizer_attrs(self):return {**self.import_kwargs[1],**self.__tokenizer_attrs} +  def _cascade_backend(self)->LiteralBackend: +    if self._has_gpus: +      if is_vllm_available():return 'vllm' +      elif is_ctranslate_available():return 'ctranslate' # XXX: base OpenLLM image should always include vLLM +    elif is_ctranslate_available():return 'ctranslate' +    else:return 'pt' +  def __setattr__(self,attr,value): +    if attr in {'model', 'tokenizer', 'runner', 'import_kwargs'}:raise ForbiddenAttributeError(f'{attr} should not be set during runtime.') +    super().__setattr__(attr, value) +  def __del__(self):del self.__llm_model__,self.__llm_tokenizer__,self.__llm_adapter_map__ +  @property +  def __repr_keys__(self):return {'model_id','revision','backend','type'} +  def __repr_args__(self): +    yield 'model_id',self._model_id if not self._local else self.tag.name +    yield 'revision',self._revision 
if self._revision else self.tag.version + yield 'backend',self.__llm_backend__ + yield 'type',self.llm_type + @property + def import_kwargs(self):return {'device_map':'auto' if self._has_gpus else None,'torch_dtype':self._torch_dtype},{'padding_side':'left','truncation_side':'left'} + @property + def trust_remote_code(self): + env=os.getenv('TRUST_REMOTE_CODE') + if env is not None:return str(env).upper() in ENV_VARS_TRUE_VALUES + return self.__llm_trust_remote_code__ + @property + def model_id(self):return self._model_id + @property + def revision(self):return self._revision + @property + def tag(self):return self._tag + @property + def bentomodel(self):return openllm.serialisation.get(self) + @property + def quantization_config(self): + if self.__llm_quantization_config__ is None: + from ._quantisation import infer_quantisation_config + if self._quantization_config is not None:self.__llm_quantization_config__ = self._quantization_config + elif self._quantise is not None:self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(self,self._quantise,**self._model_attrs) + else:raise ValueError("Either 'quantization_config' or 'quantise' must be specified.") + return self.__llm_quantization_config__ + @property + def has_adapters(self):return self._adapter_map is not None + @property + def local(self):return self._local + @property + def quantise(self):return self._quantise + @property + def llm_type(self):return normalise_model_name(self._model_id) + @property + def llm_parameters(self):return (self._model_decls,self._model_attrs),self._tokenizer_attrs + @property + def identifying_params(self):return {'configuration':self.config.model_dump_json().decode(),'model_ids':orjson.dumps(self.config['model_ids']).decode(),'model_id':self.model_id} + @property + def tokenizer(self): + if self.__llm_tokenizer__ is None:self.__llm_tokenizer__=openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1]) + return self.__llm_tokenizer__ + @property + def runner(self): + from ._runners import runner + if self.__llm_runner__ is None:self.__llm_runner__=runner(self) + return self.__llm_runner__ + def prepare(self,adapter_type='lora',use_gradient_checking=True,**attrs): + if self.__llm_backend__!='pt':raise RuntimeError('Fine tuning is only supported for PyTorch backend.') + from peft.mapping import get_peft_model + from peft.utils.other import prepare_model_for_kbit_training + model=get_peft_model( + prepare_model_for_kbit_training(self.model,use_gradient_checkpointing=use_gradient_checking), + self.config['fine_tune_strategies'] + .get(adapter_type,self.config.make_fine_tune_config(adapter_type)) + .train() + .with_config(**attrs) + .build(), + ) + if DEBUG:model.print_trainable_parameters() + return model,self.tokenizer + def prepare_for_training(self,*args,**attrs):logger.warning('`prepare_for_training` is deprecated and will be removed in the future. Please use `prepare` instead.');return self.prepare(*args,**attrs) + + @property + def adapter_map(self): + if not is_peft_available(): + raise MissingDependencyError("Failed to import 'peft'. 
Make sure to do 'pip install \"openllm[fine-tune]\"'") + if not self.has_adapters: + raise AttributeError('Adapter map is not available.') + assert self._adapter_map is not None + if self.__llm_adapter_map__ is None: + _map: ResolvedAdapterMap = {k: {} for k in self._adapter_map} + for adapter_type, adapter_tuple in self._adapter_map.items(): + base = first_not_none( + self.config['fine_tune_strategies'].get(adapter_type), + default=self.config.make_fine_tune_config(adapter_type), + ) + for adapter in adapter_tuple: + _map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id) + self.__llm_adapter_map__ = _map + return self.__llm_adapter_map__ + + @property + def model(self): + if self.__llm_model__ is None: + model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs) + # If OOM, then it is probably you don't have enough VRAM to run this model. + if self.__llm_backend__ == 'pt': + import torch + + loaded_in_kbit = ( + getattr(model, 'is_loaded_in_8bit', False) + or getattr(model, 'is_loaded_in_4bit', False) + or getattr(model, 'is_quantized', False) + ) + if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit: + try: + model = model.to('cuda') + except Exception as err: + raise OpenLLMException(f'Failed to load model into GPU: {err}.\n') from err + if self.has_adapters: + logger.debug('Applying the following adapters: %s', self.adapter_map) + for adapter_dict in self.adapter_map.values(): + for adapter_name, (peft_config, peft_model_id) in adapter_dict.items(): + model.load_adapter(peft_model_id, adapter_name, peft_config=peft_config) + self.__llm_model__ = model + return self.__llm_model__ + + @property + def config(self): + import transformers + + if self.__llm_config__ is None: + if self.__llm_backend__ == 'ctranslate': + try: + config = transformers.AutoConfig.from_pretrained( + self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code + ) + except OpenLLMException: + config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code) + for architecture in config.architectures: + if architecture in openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE(): + config = openllm.AutoConfig.infer_class_from_name( + openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture] + ).model_construct_env(**self._model_attrs) + break + else: + raise OpenLLMException( + f"Failed to infer the configuration class from the given model. Make sure the model is a supported model. Supported models are: {', '.join(openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE.keys())}" + ) + else: + config = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs) + self.__llm_config__ = config + return self.__llm_config__ + + +# fmt: off +@functools.lru_cache(maxsize=1) +def _torch_dtype_mapping()->dict[str,torch.dtype]: + import torch; return { + 'half': torch.float16, + 'float': torch.float32, + 'float16': torch.float16, + 'float32': torch.float32, + 'bfloat16': torch.bfloat16, + } +def normalise_model_name(name:str)->str:return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else inflection.dasherize(name.replace('/','--')) +def convert_peft_config_type(adapter_map:dict[str, str])->AdapterMap: + if not is_peft_available():raise RuntimeError("LoRA adapter requires 'peft' to be installed. 
Make sure to do 'pip install \"openllm[fine-tune]\"'") + from huggingface_hub import hf_hub_download + + resolved:AdapterMap={} + for path_or_adapter_id, name in adapter_map.items(): + if name is None:raise ValueError('Adapter name must be specified.') + if os.path.isfile(os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)): + config_file=os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME) + else: + try: + config_file=hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME) + except Exception as err: + raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err + with open(config_file, 'r') as file:resolved_config=orjson.loads(file.read()) + _peft_type=resolved_config['peft_type'].lower() + if _peft_type not in resolved:resolved[_peft_type]=() + resolved[_peft_type]+=(_AdapterTuple((path_or_adapter_id, name, resolved_config)),) + return resolved diff --git a/openllm-python/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py index 5664e1ea..3ab49df5 100644 --- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py +++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py @@ -6,7 +6,6 @@ import typing as t import transformers from openllm.serialisation.constants import HUB_ATTRS -from openllm_core.utils import get_disable_warnings, get_quiet_mode logger = logging.getLogger(__name__) @@ -44,10 +43,9 @@ def infer_autoclass_from_llm(llm, config, /): # in case this model doesn't use the correct auto class for model type, for example like chatglm # where it uses AutoModel instead of AutoModelForCausalLM. Then we fallback to AutoModel if autoclass not in config.auto_map: - if not get_disable_warnings() and not get_quiet_mode(): - logger.warning( - "OpenLLM failed to determine compatible Auto classes to load %s. Falling back to 'AutoModel'.\nTip: Make sure to specify 'AutoModelForCausalLM' or 'AutoModelForSeq2SeqLM' in your 'config.auto_map'. If your model type is yet to be supported, please file an issues on our GitHub tracker.", - llm._model_id, - ) + logger.warning( + "OpenLLM failed to determine compatible Auto classes to load %s. Falling back to 'AutoModel'.\nTip: Make sure to specify 'AutoModelForCausalLM' or 'AutoModelForSeq2SeqLM' in your 'config.auto_map'. 
If your model type is yet to be supported, please file an issues on our GitHub tracker.", + llm._model_id, + ) autoclass = 'AutoModel' return getattr(transformers, autoclass) diff --git a/openllm-python/src/openllm_cli/entrypoint.py b/openllm-python/src/openllm_cli/entrypoint.py index 4f0286cc..97075252 100644 --- a/openllm-python/src/openllm_cli/entrypoint.py +++ b/openllm-python/src/openllm_cli/entrypoint.py @@ -57,7 +57,6 @@ from openllm_core.utils import ( first_not_none, gen_random_uuid, get_debug_mode, - get_disable_warnings, get_quiet_mode, is_torch_available, pkg, @@ -94,7 +93,7 @@ else: torch = LazyLoader('torch', globals(), 'torch') P = ParamSpec('P') -logger = logging.getLogger(__name__) +logger = logging.getLogger('openllm') OPENLLM_FIGLET = """\ ██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗ ██╔═══██╗██╔══██╗██╔════╝████╗ ██║██║ ██║ ████╗ ████║ @@ -123,21 +122,19 @@ _EXT_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), 'extension def backend_warning(backend: LiteralBackend, build: bool = False) -> None: - if backend == 'pt' and (not get_disable_warnings()) and not get_quiet_mode(): + if backend == 'pt': if openllm.utils.is_vllm_available(): - termui.warning( + logger.warning( 'vLLM is available, but using PyTorch backend instead. Note that vLLM is a lot more performant and should always be used in production (by explicitly set --backend vllm).' ) else: - termui.warning( + logger.warning( 'vLLM is not available. Note that PyTorch backend is not as performant as vLLM and you should always consider using vLLM for production.' ) if build: - termui.info( + logger.info( "Tip: You can set '--backend vllm' to package your Bento with vLLM backend regardless if vLLM is available locally." ) - if not get_debug_mode(): - termui.info("To disable these warnings, set 'OPENLLM_DISABLE_WARNING=True'") class Extensions(click.MultiCommand): @@ -419,22 +416,22 @@ def start_command( serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy' ), ) - if serialisation == 'safetensors' and quantize is not None and not get_disable_warnings() and not get_quiet_mode(): - termui.warning(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format.") - termui.warning( - f"Make sure to check out '{model_id}' repository to see if the weights is in '{serialisation}' format if unsure." + if serialisation == 'safetensors' and quantize is not None: + logger.warning("'--quantize=%s' might not work with 'safetensors' serialisation format.", quantize) + logger.warning( + "Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.", + model_id, + serialisation, ) - termui.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.") - if not get_debug_mode(): - termui.info("To disable these warnings, set 'OPENLLM_DISABLE_WARNING=True'") + logger.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.") import torch if backend == 'pt' and not torch.cuda.is_available(): if dtype == 'auto': dtype = 'float' - elif dtype not in {'float', 'float32'} and not get_disable_warnings() and not get_quiet_mode(): - termui.warning('"bfloat16" and "half" are not supported on CPU. OpenLLM will default fallback to "float32".') + elif dtype not in {'float', 'float32'}: + logger.warning('"bfloat16" and "half" are not supported on CPU. 
OpenLLM will default fallback to "float32".') dtype = 'float' # we need to cast back to full precision if cuda is not available llm = openllm.LLM[t.Any, t.Any]( model_id=model_id, @@ -549,22 +546,22 @@ def start_grpc_command( serialisation = first_not_none( serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy' ) - if serialisation == 'safetensors' and quantize is not None and not get_disable_warnings() and not get_quiet_mode(): - termui.warning(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format.") - termui.warning( - f"Make sure to check out '{model_id}' repository to see if the weights is in '{serialisation}' format if unsure." + if serialisation == 'safetensors' and quantize is not None: + logger.warning("'--quantize=%s' might not work with 'safetensors' serialisation format.", quantize) + logger.warning( + "Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.", + model_id, + serialisation, ) - termui.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.") - if not get_debug_mode(): - termui.info("To disable these warnings, set 'OPENLLM_DISABLE_WARNING=True'") + logger.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.") import torch if backend == 'pt' and not torch.cuda.is_available(): if dtype == 'auto': dtype = 'float' - elif dtype not in {'float', 'float32'} and not get_disable_warnings() and not get_quiet_mode(): - termui.warning('"bfloat16" and "half" are not supported on CPU. OpenLLM will default fallback to "float32".') + elif dtype not in {'float', 'float32'}: + logger.warning('"bfloat16" and "half" are not supported on CPU. OpenLLM will default fallback to "float32".') dtype = 'float' # we need to cast back to full precision if cuda is not available llm = openllm.LLM[t.Any, t.Any]( model_id=model_id, @@ -1095,13 +1092,14 @@ def build_command( push_cmd = f'bentoml push {bento_tag}' cloud_context = get_current_bentocloud_context() - if cloud_context is None and (not get_disable_warnings()) and not get_quiet_mode(): + if cloud_context is None: available_context = [c.name for c in cloud_config.contexts] if not available_context: - termui.warning('No default BentoCloud context found. Please login with `bentoml cloud login` first.') + logger.warning('No default BentoCloud context found. Please login with `bentoml cloud login` first.') else: - termui.warning( - f'No context is passed, but the following context is available: {available_context}. Make sure to specify the argument "--context" for specific context you want to push to.' + logger.warning( + 'No context is passed, but the following context is available: %s. Make sure to specify the argument "--context" for specific context you want to push to.', + available_context, ) else: push_cmd += f' --context {cloud_context}'
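
Note on the warning-suppression change in this diff: gating moves out of per-call-site get_disable_warnings()/get_quiet_mode() checks and into the logging pipeline itself (the new WarningFilter attached to the bentomlhandler, plus the ERROR-level override for the 'openllm' logger in configure_logging), which is why the CLI code above can now call plain logger.warning(...). The sketch below is a minimal, self-contained illustration of that dictConfig/filter pattern, not the library code: the 'demo' logger, the warnings_disabled() helper, and the loose env parsing are assumptions made for the example; only the OPENLLM_DISABLE_WARNING variable name and the WarningFilter idea come from this diff.

# Standalone sketch of filter-based warning suppression (illustrative names, not openllm internals).
import logging
import logging.config
import os

DISABLE_WARNINGS_ENV = 'OPENLLM_DISABLE_WARNING'  # name taken from the CLI hint removed in this diff

def warnings_disabled() -> bool:
  # Loose truthiness check; openllm's check_bool_env is the real implementation.
  return os.environ.get(DISABLE_WARNINGS_ENV, '').upper() in {'1', 'TRUE', 'YES'}

class WarningFilter(logging.Filter):
  # Once warnings are disabled, only ERROR and above pass through handlers using this filter.
  def filter(self, record: logging.LogRecord) -> bool:
    if warnings_disabled():
      return record.levelno >= logging.ERROR
    return True

LOGGING_CONFIG = {
  'version': 1,
  'disable_existing_loggers': False,
  'filters': {'warningfilter': {'()': WarningFilter}},
  'handlers': {
    'stdout': {
      'class': 'logging.StreamHandler',
      'filters': ['warningfilter'],
      'stream': 'ext://sys.stdout',
    }
  },
  'loggers': {'demo': {'handlers': ['stdout'], 'level': logging.INFO, 'propagate': False}},
}

if __name__ == '__main__':
  logging.config.dictConfig(LOGGING_CONFIG)
  log = logging.getLogger('demo')
  log.warning('visible: warnings are enabled')      # passes the filter
  os.environ[DISABLE_WARNINGS_ENV] = 'True'
  log.warning('suppressed: dropped by WarningFilter')  # below ERROR, filtered out
  log.error('still visible: ERROR always passes')

Because the filter consults the environment variable at emit time, toggling OPENLLM_DISABLE_WARNING takes effect for every warning without touching individual call sites, which is the design the call-site guard removals above rely on.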