mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-05-18 21:54:11 -04:00
chore(logger): fix warnings and streamline style (#717)
Sorry, but there is too much wasted spacing in `_llm.py`, and I'm unhappy and unproductive any time I look at it or want to do anything with it --------- Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
128
.ruff.toml
128
.ruff.toml
@@ -1,77 +1,75 @@
|
||||
extend-exclude = [
|
||||
"tools",
|
||||
"examples",
|
||||
"openllm-python/src/openllm/__init__.py",
|
||||
"openllm-python/src/openllm/_version.py",
|
||||
"openllm-python/src/openllm/models/__init__.py",
|
||||
"openllm-python/src/openllm_cli/playground",
|
||||
"openllm-client/src/openllm_client/pb/**",
|
||||
"tools",
|
||||
"examples",
|
||||
"openllm-python/src/openllm/__init__.py",
|
||||
"openllm-python/src/openllm/_version.py",
|
||||
"openllm-python/src/openllm/models/__init__.py",
|
||||
"openllm-python/src/openllm_cli/playground",
|
||||
"openllm-client/src/openllm_client/pb/**",
|
||||
]
|
||||
extend-include = ["*.ipynb"]
|
||||
extend-select = [
|
||||
"E",
|
||||
"F",
|
||||
"B",
|
||||
"PIE",
|
||||
"I", # isort
|
||||
"G", # flake8-logging-format
|
||||
"W", # pycodestyle
|
||||
"Q", # flake8-quotes
|
||||
"FA", # flake8-future-annotations
|
||||
"TCH", # flake8-type-checking
|
||||
"PLW", # pylint-warning
|
||||
"PLR", # pylint-refactor
|
||||
"PT", # flake8-pytest-style
|
||||
"PERF", # perflint
|
||||
"FLY", # flynt
|
||||
"RUF", # Ruff-specific rules
|
||||
"YTT", # flake8-2020
|
||||
"E",
|
||||
"F",
|
||||
"B",
|
||||
"PIE",
|
||||
"G", # flake8-logging-format
|
||||
"W", # pycodestyle
|
||||
"Q", # flake8-quotes
|
||||
"FA", # flake8-future-annotations
|
||||
"TCH", # flake8-type-checking
|
||||
"PLW", # pylint-warning
|
||||
"PLR", # pylint-refactor
|
||||
"PT", # flake8-pytest-style
|
||||
"PERF", # perflint
|
||||
"RUF", # Ruff-specific rules
|
||||
"YTT", # flake8-2020
|
||||
]
|
||||
fix = true
|
||||
ignore = [
|
||||
"PLR0911",
|
||||
"PLR0912",
|
||||
"PLR0913",
|
||||
"PLR0915",
|
||||
"PLR2004", # magic value to use constant
|
||||
"E501", # ignore line length violation
|
||||
"E401", # ignore multiple line import
|
||||
"E702",
|
||||
"TCH004", # don't move runtime import out, just warn about it
|
||||
"RUF012", # mutable attributes to be used with ClassVar
|
||||
"E701", # multiple statement on single line
|
||||
"PLR0911",
|
||||
"PLR0912",
|
||||
"PLR0913",
|
||||
"PLR0915",
|
||||
"PLR2004", # magic value to use constant
|
||||
"E501", # ignore line length violation
|
||||
"E401", # ignore multiple line import
|
||||
"E702",
|
||||
"TCH004", # don't move runtime import out, just warn about it
|
||||
"RUF012", # mutable attributes to be used with ClassVar
|
||||
"E701", # multiple statement on single line
|
||||
]
|
||||
line-length = 119
|
||||
indent-width = 2
|
||||
target-version = "py38"
|
||||
typing-modules = [
|
||||
"openllm_core._typing_compat",
|
||||
"openllm_client._typing_compat",
|
||||
"openllm_core._typing_compat",
|
||||
"openllm_client._typing_compat",
|
||||
]
|
||||
unfixable = ["TCH004"]
|
||||
|
||||
[lint.flake8-type-checking]
|
||||
exempt-modules = [
|
||||
"typing",
|
||||
"typing_extensions",
|
||||
"openllm_core._typing_compat",
|
||||
"openllm_client._typing_compat",
|
||||
"typing",
|
||||
"typing_extensions",
|
||||
"openllm_core._typing_compat",
|
||||
"openllm_client._typing_compat",
|
||||
]
|
||||
runtime-evaluated-base-classes = [
|
||||
"openllm_core._configuration.LLMConfig",
|
||||
"openllm_core._configuration.GenerationConfig",
|
||||
"openllm_core._configuration.SamplingParams",
|
||||
"openllm_core._configuration.ModelSettings",
|
||||
"openllm.LLMConfig",
|
||||
"openllm_core._configuration.LLMConfig",
|
||||
"openllm_core._configuration.GenerationConfig",
|
||||
"openllm_core._configuration.SamplingParams",
|
||||
"openllm_core._configuration.ModelSettings",
|
||||
"openllm.LLMConfig",
|
||||
]
|
||||
runtime-evaluated-decorators = [
|
||||
"attrs.define",
|
||||
"attrs.frozen",
|
||||
"trait",
|
||||
"attr.attrs",
|
||||
'attr.define',
|
||||
'_attr.define',
|
||||
'attr.frozen',
|
||||
"attrs.define",
|
||||
"attrs.frozen",
|
||||
"trait",
|
||||
"attr.attrs",
|
||||
'attr.define',
|
||||
'_attr.define',
|
||||
'attr.frozen',
|
||||
]
|
||||
|
||||
[format]
|
||||
@@ -87,29 +85,6 @@ convention = "google"
|
||||
ignore-overlong-task-comments = true
|
||||
max-line-length = 119
|
||||
|
||||
[lint.isort]
|
||||
combine-as-imports = true
|
||||
known-first-party = [
|
||||
"openllm",
|
||||
"bentoml",
|
||||
"openllm_core",
|
||||
"openllm_client",
|
||||
"openllm_cli",
|
||||
]
|
||||
known-third-party = [
|
||||
"transformers",
|
||||
"click",
|
||||
"huggingface_hub",
|
||||
"torch",
|
||||
"vllm",
|
||||
"auto_gptq",
|
||||
"peft",
|
||||
"click_option_group",
|
||||
]
|
||||
split-on-trailing-comma = false
|
||||
no-lines-before = ["future", "standard-library"]
|
||||
relative-imports-order = "closest-to-furthest"
|
||||
|
||||
[lint.flake8-quotes]
|
||||
avoid-escape = false
|
||||
inline-quotes = "single"
|
||||
@@ -121,5 +96,4 @@ docstring-quotes = "double"
|
||||
"openllm-python/src/openllm/_llm.py" = ["F811"]
|
||||
"openllm-core/src/openllm_core/utils/import_utils.py" = ["PLW0603", "F811"]
|
||||
"openllm-core/src/openllm_core/_configuration.py" = ["F811", "Q001"]
|
||||
"openllm-python/src/openllm/__init__.pyi" = ["I001"]
|
||||
"openllm-python/src/openllm/_service_vars_pkg.py" = ["F821"]
|
||||
|
||||
@@ -36,7 +36,7 @@ logger = logging.getLogger(__name__)
|
||||
_T=t.TypeVar('_T')
|
||||
@functools.lru_cache(maxsize=1)
|
||||
def _WithArgsTypes()->tuple[type[t.Any],...]:
|
||||
try:from typing import GenericAlias as _TypingGenericAlias # type: ignore # noqa: I001
|
||||
try:from typing import GenericAlias as _TypingGenericAlias # type: ignore
|
||||
except ImportError:_TypingGenericAlias = () # type: ignore # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
|
||||
# _GenericAlias is the actual GenericAlias implementation
|
||||
return (_TypingGenericAlias,) if sys.version_info<(3,10) else (t._GenericAlias, types.GenericAlias, types.UnionType) # type: ignore
|
||||
@@ -69,7 +69,7 @@ def getenv(env:str,default:t.Any=None,var:t.Sequence[str]|None=None)->t.Any:
|
||||
if var is not None:env_key=set(var)|env_key
|
||||
def callback(k:str)->t.Any:
|
||||
_var = os.getenv(k)
|
||||
if _var and k.startswith('OPENLLM_') and not get_disable_warnings() and not get_quiet_mode():logger.warning("Using '%s' environment is deprecated, use '%s' instead.",k.upper(),k[8:].upper())
|
||||
if _var and k.startswith('OPENLLM_'):logger.warning("Using '%s' environment is deprecated, use '%s' instead.",k.upper(),k[8:].upper())
|
||||
return _var
|
||||
return first_not_none(*(callback(k) for k in env_key),default=default)
|
||||
def field_env_key(key: str, suffix: str | None = None) -> str:
  """Build the upper-cased, underscore-joined environment variable name for ``key``.

  The name is assembled from ``OPENLLM``, the optional ``suffix`` (with leading and
  trailing underscores stripped) and ``key``. Empty segments are dropped, so a missing
  or all-underscore ``suffix`` does not produce a doubled underscore.

  Args:
    key: Field name to append as the final segment.
    suffix: Optional middle segment placed between ``OPENLLM`` and ``key``.

  Returns:
    The segments upper-cased and joined with ``_``.
  """
  middle = suffix.strip('_') if suffix else ''
  upper_segments = [str.upper(segment) for segment in ('OPENLLM', middle, key)]
  # Falsy (empty) segments are skipped so the separator never appears twice in a row.
  return '_'.join(segment for segment in upper_segments if segment)
|
||||
@@ -80,13 +80,13 @@ def get_quiet_mode()->bool:
|
||||
return False
|
||||
def get_disable_warnings() -> bool:
  """Report whether warnings are disabled via the ``WARNING_ENV_VAR`` environment flag.

  Returns:
    The result of ``check_bool_env`` for ``WARNING_ENV_VAR``, using ``False`` as the default.
  """
  warnings_disabled = check_bool_env(WARNING_ENV_VAR, False)
  return warnings_disabled
|
||||
def set_disable_warnings(disable:bool=True)->None:
|
||||
if get_disable_warnings():os.environ[WARNING_ENV_VAR]=str(disable)
|
||||
if disable:os.environ[WARNING_ENV_VAR]=str(disable)
|
||||
def set_debug_mode(enabled:bool,level:int=1)->None:
|
||||
if enabled:os.environ[DEV_DEBUG_VAR] = str(level)
|
||||
os.environ.update({DEBUG_ENV_VAR:str(enabled),QUIET_ENV_VAR:str(not enabled),_GRPC_DEBUG_ENV_VAR:'DEBUG' if enabled else 'ERROR','CT2_VERBOSE':'3'})
|
||||
set_disable_warnings(enabled)
|
||||
set_disable_warnings(not enabled)
|
||||
def set_quiet_mode(enabled:bool)->None:
|
||||
os.environ.update({QUIET_ENV_VAR:str(enabled),_GRPC_DEBUG_ENV_VAR:'NONE','CT2_VERBOSE':'-1'})
|
||||
os.environ.update({QUIET_ENV_VAR:str(enabled),DEBUG_ENV_VAR:str(not enabled),_GRPC_DEBUG_ENV_VAR:'NONE','CT2_VERBOSE':'-1'})
|
||||
set_disable_warnings(enabled)
|
||||
def gen_random_uuid(prefix: str | None = None) -> str:
  """Return a random identifier of the form ``<prefix>-<32 hex characters>``.

  Args:
    prefix: Leading label for the identifier. ``None`` or an empty string falls back
      to ``'openllm'``.

  Returns:
    The label and the hex digest of a freshly generated UUID4, joined by ``-``.
  """
  label = prefix if prefix else 'openllm'
  random_hex = uuid.uuid4().hex
  return '-'.join((label, random_hex))
|
||||
# NOTE: `compose` any number of unary functions into a single unary function
|
||||
@@ -113,11 +113,8 @@ def generate_context(framework_name:str):
|
||||
return ModelContext(framework_name=framework_name,framework_versions=framework_versions)
|
||||
@functools.lru_cache(maxsize=1)
|
||||
def in_notebook()->bool:
|
||||
try:from IPython.core.getipython import get_ipython; return 'IPKernelApp' in get_ipython().config # noqa: I001
|
||||
except (ImportError, AttributeError):return False
|
||||
# Used to filter out INFO log
|
||||
class InfoFilter(logging.Filter):
|
||||
def filter(self,record:logging.LogRecord)->bool:return logging.INFO<=record.levelno<logging.WARNING
|
||||
try:from IPython.core.getipython import get_ipython;return 'IPKernelApp' in get_ipython().config
|
||||
except Exception:return False
|
||||
_TOKENIZER_PREFIX = '_tokenizer_'
|
||||
def flatten_attrs(**attrs:t.Any)->tuple[dict[str,t.Any],dict[str, t.Any]]:
|
||||
tokenizer_attrs = {k[len(_TOKENIZER_PREFIX):]:v for k,v in attrs.items() if k.startswith(_TOKENIZER_PREFIX)}
|
||||
@@ -130,31 +127,31 @@ DEBUG=sys.flags.dev_mode or (not sys.flags.ignore_environment and check_bool_env
|
||||
SHOW_CODEGEN=DEBUG and (os.environ.get(DEV_DEBUG_VAR,str(0)).isdigit() and int(os.environ.get(DEV_DEBUG_VAR,str(0)))>3)
|
||||
# MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins
|
||||
MYPY=False
|
||||
# fmt: on
|
||||
|
||||
|
||||
class ExceptionFilter(logging.Filter):
|
||||
def __init__(self, exclude_exceptions: list[type[Exception]] | None = None, **kwargs: t.Any):
|
||||
if exclude_exceptions is None:
|
||||
exclude_exceptions = []
|
||||
if exclude_exceptions is None:exclude_exceptions=[]
|
||||
try:
|
||||
from circus.exc import ConflictError
|
||||
|
||||
if ConflictError not in exclude_exceptions:
|
||||
exclude_exceptions.append(ConflictError)
|
||||
if ConflictError not in exclude_exceptions:exclude_exceptions.append(ConflictError)
|
||||
except ImportError:
|
||||
pass
|
||||
super(ExceptionFilter, self).__init__(**kwargs)
|
||||
self.EXCLUDE_EXCEPTIONS = exclude_exceptions
|
||||
|
||||
def filter(self, record: logging.LogRecord) -> bool:
|
||||
self.EXCLUDE_EXCEPTIONS=exclude_exceptions
|
||||
def filter(self,record:logging.LogRecord)->bool:
|
||||
if record.exc_info:
|
||||
etype, _, _ = record.exc_info
|
||||
etype,_,_=record.exc_info
|
||||
if etype is not None:
|
||||
for exc in self.EXCLUDE_EXCEPTIONS:
|
||||
if issubclass(etype, exc):
|
||||
return False
|
||||
if issubclass(etype, exc):return False
|
||||
return True
|
||||
# Used to filter out INFO log
|
||||
class InfoFilter(logging.Filter):
|
||||
def filter(self,record:logging.LogRecord)->bool:return logging.INFO<=record.levelno<logging.WARNING
|
||||
class WarningFilter(logging.Filter): # FIXME: Why does this not work?
|
||||
def filter(self,record:logging.LogRecord)->bool:
|
||||
if get_disable_warnings():return record.levelno>=logging.ERROR
|
||||
return True
|
||||
# fmt: on
|
||||
|
||||
|
||||
_LOGGING_CONFIG: dict[str, t.Any] = {
|
||||
@@ -163,11 +160,12 @@ _LOGGING_CONFIG: dict[str, t.Any] = {
|
||||
'filters': {
|
||||
'excfilter': {'()': 'openllm_core.utils.ExceptionFilter'},
|
||||
'infofilter': {'()': 'openllm_core.utils.InfoFilter'},
|
||||
'warningfilter': {'()': 'openllm_core.utils.WarningFilter'},
|
||||
},
|
||||
'handlers': {
|
||||
'bentomlhandler': {
|
||||
'class': 'logging.StreamHandler',
|
||||
'filters': ['excfilter', 'infofilter'],
|
||||
'filters': ['excfilter', 'warningfilter', 'infofilter'],
|
||||
'stream': 'ext://sys.stdout',
|
||||
},
|
||||
'defaulthandler': {'class': 'logging.StreamHandler', 'level': logging.WARNING},
|
||||
@@ -195,6 +193,9 @@ def configure_logging() -> None:
|
||||
_LOGGING_CONFIG['loggers']['bentoml']['level'] = logging.INFO
|
||||
_LOGGING_CONFIG['root']['level'] = logging.INFO
|
||||
|
||||
if get_disable_warnings(): # HACK: This is a hack to disable warnings
|
||||
_LOGGING_CONFIG['loggers']['openllm']['level'] = logging.ERROR
|
||||
|
||||
logging.config.dictConfig(_LOGGING_CONFIG)
|
||||
|
||||
|
||||
@@ -241,24 +242,3 @@ __lazy = LazyModule(
|
||||
__all__ = __lazy.__all__
|
||||
__dir__ = __lazy.__dir__
|
||||
__getattr__ = __lazy.__getattr__
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from . import analytics as analytics, codegen as codegen, dantic as dantic, serde as serde
|
||||
from .import_utils import (
|
||||
OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES,
|
||||
is_autoawq_available as is_autoawq_available,
|
||||
is_autogptq_available as is_autogptq_available,
|
||||
is_bentoml_available as is_bentoml_available,
|
||||
is_bitsandbytes_available as is_bitsandbytes_available,
|
||||
is_ctranslate_available as is_ctranslate_available,
|
||||
is_grpc_available as is_grpc_available,
|
||||
is_jupyter_available as is_jupyter_available,
|
||||
is_jupytext_available as is_jupytext_available,
|
||||
is_notebook_available as is_notebook_available,
|
||||
is_peft_available as is_peft_available,
|
||||
is_torch_available as is_torch_available,
|
||||
is_transformers_available as is_transformers_available,
|
||||
is_vllm_available as is_vllm_available,
|
||||
)
|
||||
from .representation import ReprMixin as ReprMixin
|
||||
from .serde import converter as converter
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
# fmt: off
|
||||
if __name__ == '__main__':from openllm_cli.entrypoint import cli;cli() # noqa
|
||||
if __name__ == '__main__':from openllm_cli.entrypoint import cli;cli()
|
||||
|
||||
@@ -1,15 +1,8 @@
|
||||
from __future__ import annotations
|
||||
import functools
|
||||
import logging
|
||||
import os
|
||||
import functools, logging, os, warnings
|
||||
import typing as t
|
||||
|
||||
import attr
|
||||
import inflection
|
||||
import orjson
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
import attr, inflection, orjson
|
||||
import bentoml, openllm
|
||||
from openllm_core._schemas import GenerationOutput
|
||||
from openllm_core._typing_compat import (
|
||||
AdapterMap,
|
||||
@@ -35,8 +28,6 @@ from openllm_core.utils import (
|
||||
flatten_attrs,
|
||||
gen_random_uuid,
|
||||
generate_hash_from_file,
|
||||
get_disable_warnings,
|
||||
get_quiet_mode,
|
||||
getenv,
|
||||
is_ctranslate_available,
|
||||
is_peft_available,
|
||||
@@ -49,365 +40,18 @@ from .exceptions import ForbiddenAttributeError, OpenLLMException
|
||||
from .serialisation.constants import PEFT_CONFIG_NAME
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
import transformers
|
||||
import torch, transformers
|
||||
from peft.config import PeftConfig
|
||||
|
||||
from openllm_core._configuration import LLMConfig
|
||||
|
||||
from ._runners import Runner
|
||||
|
||||
ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]]
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def normalise_model_name(name: str) -> str:
|
||||
if validate_is_path(name):
|
||||
return os.path.basename(resolve_filepath(name))
|
||||
name = name.replace('/', '--')
|
||||
return inflection.dasherize(name)
|
||||
|
||||
|
||||
def _resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
|
||||
if not is_peft_available():
|
||||
raise RuntimeError("Requires 'peft' to be installed. Do 'pip install \"openllm[fine-tune]\"'")
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
resolved: AdapterMap = {}
|
||||
for path_or_adapter_id, name in adapter_map.items():
|
||||
if name is None:
|
||||
raise ValueError('Adapter name must be specified.')
|
||||
if os.path.isfile(os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)):
|
||||
config_file = os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)
|
||||
else:
|
||||
try:
|
||||
config_file = hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)
|
||||
except Exception as err:
|
||||
raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err
|
||||
with open(config_file, 'r') as file:
|
||||
resolved_config = orjson.loads(file.read())
|
||||
# all peft_type should be available in PEFT_CONFIG_NAME
|
||||
_peft_type = resolved_config['peft_type'].lower()
|
||||
if _peft_type not in resolved:
|
||||
resolved[_peft_type] = ()
|
||||
resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
|
||||
return resolved
|
||||
|
||||
|
||||
_reserved_namespace = {'model', 'tokenizer', 'runner', 'import_kwargs'}
|
||||
_AdapterTuple: type[AdapterTuple] = codegen.make_attr_tuple_class('AdapterTuple', ['adapter_id', 'name', 'config'])
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=1)
|
||||
def _torch_dtype_mapping():
|
||||
import torch
|
||||
|
||||
return {
|
||||
'half': torch.float16,
|
||||
'float': torch.float32,
|
||||
'float16': torch.float16,
|
||||
'float32': torch.float32,
|
||||
'bfloat16': torch.bfloat16,
|
||||
}
|
||||
ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]]
|
||||
|
||||
|
||||
@attr.define(slots=True, repr=False, init=False)
|
||||
class LLM(t.Generic[M, T], ReprMixin):
|
||||
_model_id: str
|
||||
_revision: str | None
|
||||
_quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None
|
||||
_quantise: LiteralQuantise | None
|
||||
_model_decls: TupleAny
|
||||
__model_attrs: DictStrAny
|
||||
__tokenizer_attrs: DictStrAny
|
||||
_tag: bentoml.Tag
|
||||
_adapter_map: AdapterMap | None
|
||||
_serialisation: LiteralSerialisation
|
||||
_local: bool
|
||||
_max_model_len: int | None
|
||||
|
||||
__llm_dtype__: LiteralDtype | t.Literal['auto', 'half', 'float'] = 'auto'
|
||||
__llm_torch_dtype__: 'torch.dtype' = None
|
||||
__llm_config__: LLMConfig | None = None
|
||||
__llm_backend__: LiteralBackend = None # type: ignore
|
||||
__llm_quantization_config__: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None = None
|
||||
__llm_runner__: t.Optional[Runner[M, T]] = None
|
||||
__llm_model__: t.Optional[M] = None
|
||||
__llm_tokenizer__: t.Optional[T] = None
|
||||
__llm_adapter_map__: t.Optional[ResolvedAdapterMap] = None
|
||||
__llm_trust_remote_code__: bool = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_id,
|
||||
model_version=None,
|
||||
model_tag=None,
|
||||
llm_config=None,
|
||||
backend=None,
|
||||
*args,
|
||||
quantize=None,
|
||||
quantization_config=None,
|
||||
adapter_map=None,
|
||||
serialisation='safetensors',
|
||||
trust_remote_code=False,
|
||||
embedded=False,
|
||||
dtype='auto',
|
||||
low_cpu_mem_usage=True,
|
||||
max_model_len=None,
|
||||
_eager=True,
|
||||
**attrs,
|
||||
):
|
||||
# fmt: off
|
||||
torch_dtype = attrs.pop('torch_dtype',None) # backward compatible
|
||||
if torch_dtype is not None:logger.warning('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.');dtype=torch_dtype
|
||||
_local = False
|
||||
if validate_is_path(model_id):model_id,_local=resolve_filepath(model_id),True
|
||||
backend=first_not_none(getenv('backend',default=backend),default=self._cascade_backend())
|
||||
dtype=first_not_none(getenv('dtype',default=dtype,var=['TORCH_DTYPE']),default='auto')
|
||||
quantize=first_not_none(getenv('quantize',default=quantize,var=['QUANITSE']),default=None)
|
||||
attrs.update({'low_cpu_mem_usage':low_cpu_mem_usage})
|
||||
# parsing tokenizer and model kwargs, as the hierarchy is param pass > default
|
||||
model_attrs, tokenizer_attrs = flatten_attrs(**attrs)
|
||||
if model_tag is None:
|
||||
model_tag,model_version=self._make_tag_components(model_id,model_version,backend=backend)
|
||||
if model_version:model_tag=f'{model_tag}:{model_version}'
|
||||
# fmt: on
|
||||
|
||||
self.__attrs_init__(
|
||||
model_id=model_id,
|
||||
revision=model_version,
|
||||
tag=bentoml.Tag.from_taglike(model_tag),
|
||||
quantization_config=quantization_config,
|
||||
quantise=self._resolve_quantise(quantize, backend),
|
||||
model_decls=args,
|
||||
adapter_map=_resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
|
||||
serialisation=serialisation,
|
||||
local=_local,
|
||||
max_model_len=max_model_len,
|
||||
LLM__model_attrs=model_attrs,
|
||||
LLM__tokenizer_attrs=tokenizer_attrs,
|
||||
llm_dtype__=dtype.lower(),
|
||||
llm_backend__=backend,
|
||||
llm_config__=llm_config,
|
||||
llm_trust_remote_code__=trust_remote_code,
|
||||
)
|
||||
|
||||
if _eager:
|
||||
try:
|
||||
model = bentoml.models.get(self.tag)
|
||||
except bentoml.exceptions.NotFound:
|
||||
model = openllm.serialisation.import_model(self, trust_remote_code=self.trust_remote_code)
|
||||
# resolve the tag
|
||||
self._tag = model.tag
|
||||
if not _eager and embedded:
|
||||
raise RuntimeError("Embedded mode is not supported when '_eager' is False.")
|
||||
if embedded and not get_disable_warnings() and not get_quiet_mode():
|
||||
logger.warning(
|
||||
'You are using embedded mode, which means the models will be loaded into memory. This is often not recommended in production and should only be used for local development only.'
|
||||
)
|
||||
self.runner.init_local(quiet=True)
|
||||
|
||||
# fmt: off
|
||||
def _resolve_quantise(self, quantise, backend):
|
||||
if backend in ('pt', 'vllm'):return quantise
|
||||
if backend=='ctranslate':return self._resolve_ctranslate_quantise(quantise)
|
||||
raise NotImplementedError(f"Quantisation is not supported for backend '{backend}'")
|
||||
def _resolve_ctranslate_quantise(self,quantise):
|
||||
if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}:raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
|
||||
if quantise == 'int8':quantise='int8_float16' if self._has_gpus else 'int8_float32'
|
||||
return quantise
|
||||
@apply(lambda val:tuple(str.lower(i) if i else i for i in val))
|
||||
def _make_tag_components(self,model_id:str,model_version:str|None,backend:str)->tuple[str,str|None]:
|
||||
model_id,*maybe_revision=model_id.rsplit(':')
|
||||
if len(maybe_revision)>0:
|
||||
if model_version is not None:logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.",maybe_revision[0],model_version)
|
||||
model_version = maybe_revision[0]
|
||||
if validate_is_path(model_id):model_id,model_version=resolve_filepath(model_id),first_not_none(model_version,default=generate_hash_from_file(model_id))
|
||||
return f'{backend}-{normalise_model_name(model_id)}',model_version
|
||||
@functools.cached_property
|
||||
def _has_gpus(self):
|
||||
try:
|
||||
from cuda import cuda
|
||||
err,*_=cuda.cuInit(0)
|
||||
if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to initialise CUDA runtime binding.')
|
||||
err,num_gpus=cuda.cuDeviceGetCount()
|
||||
if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to get CUDA device count.')
|
||||
return True
|
||||
except (ImportError, RuntimeError):return False
|
||||
@property
|
||||
def _torch_dtype(self):
|
||||
import torch, transformers # noqa: I001
|
||||
_map=_torch_dtype_mapping()
|
||||
if not isinstance(self.__llm_torch_dtype__,torch.dtype):
|
||||
try:hf_config=transformers.AutoConfig.from_pretrained(self.bentomodel.path,trust_remote_code=self.trust_remote_code)
|
||||
except OpenLLMException:hf_config=transformers.AutoConfig.from_pretrained(self.model_id,trust_remote_code=self.trust_remote_code)
|
||||
config_dtype=getattr(hf_config,'torch_dtype',None)
|
||||
if config_dtype is None:config_dtype=torch.float32
|
||||
if self.__llm_dtype__=='auto':
|
||||
if config_dtype==torch.float32:torch_dtype=torch.float16
|
||||
else:torch_dtype=config_dtype
|
||||
else:
|
||||
if self.__llm_dtype__ not in _map:raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'")
|
||||
torch_dtype=_map[self.__llm_dtype__]
|
||||
self.__llm_torch_dtype__=torch_dtype
|
||||
return self.__llm_torch_dtype__
|
||||
@property
|
||||
def _model_attrs(self):return {**self.import_kwargs[0],**self.__model_attrs}
|
||||
@_model_attrs.setter
|
||||
def _model_attrs(self, value):self.__model_attrs = value
|
||||
@property
|
||||
def _tokenizer_attrs(self):return {**self.import_kwargs[1],**self.__tokenizer_attrs}
|
||||
def _cascade_backend(self)->LiteralBackend:
|
||||
if self._has_gpus:
|
||||
if is_vllm_available():return 'vllm'
|
||||
elif is_ctranslate_available():return 'ctranslate' # XXX: base OpenLLM image should always include vLLM
|
||||
elif is_ctranslate_available():return 'ctranslate'
|
||||
else:return 'pt'
|
||||
def __setattr__(self,attr,value):
|
||||
if attr in _reserved_namespace:raise ForbiddenAttributeError(f'{attr} should not be set during runtime.')
|
||||
super().__setattr__(attr, value)
|
||||
def __del__(self):del self.__llm_model__,self.__llm_tokenizer__,self.__llm_adapter_map__
|
||||
@property
|
||||
def __repr_keys__(self):return {'model_id','revision','backend','type'}
|
||||
def __repr_args__(self):
|
||||
yield 'model_id',self._model_id if not self._local else self.tag.name
|
||||
yield 'revision',self._revision if self._revision else self.tag.version
|
||||
yield 'backend',self.__llm_backend__
|
||||
yield 'type',self.llm_type
|
||||
@property
|
||||
def import_kwargs(self):return {'device_map':'auto' if self._has_gpus else None,'torch_dtype':self._torch_dtype},{'padding_side':'left','truncation_side':'left'}
|
||||
@property
|
||||
def trust_remote_code(self):
|
||||
env=os.getenv('TRUST_REMOTE_CODE')
|
||||
if env is not None:return str(env).upper() in ENV_VARS_TRUE_VALUES
|
||||
return self.__llm_trust_remote_code__
|
||||
@property
|
||||
def model_id(self):return self._model_id
|
||||
@property
|
||||
def revision(self):return self._revision
|
||||
@property
|
||||
def tag(self):return self._tag
|
||||
@property
|
||||
def bentomodel(self):return openllm.serialisation.get(self)
|
||||
@property
|
||||
def quantization_config(self):
|
||||
if self.__llm_quantization_config__ is None:
|
||||
from ._quantisation import infer_quantisation_config
|
||||
if self._quantization_config is not None:self.__llm_quantization_config__ = self._quantization_config
|
||||
elif self._quantise is not None:self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(self,self._quantise,**self._model_attrs)
|
||||
else:raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
|
||||
return self.__llm_quantization_config__
|
||||
@property
|
||||
def has_adapters(self):return self._adapter_map is not None
|
||||
@property
|
||||
def local(self):return self._local
|
||||
@property
|
||||
def quantise(self):return self._quantise
|
||||
@property
|
||||
def llm_type(self):return normalise_model_name(self._model_id)
|
||||
@property
|
||||
def llm_parameters(self):return (self._model_decls,self._model_attrs),self._tokenizer_attrs
|
||||
@property
|
||||
def identifying_params(self):return {'configuration':self.config.model_dump_json().decode(),'model_ids':orjson.dumps(self.config['model_ids']).decode(),'model_id':self.model_id}
|
||||
@property
|
||||
def tokenizer(self):
|
||||
if self.__llm_tokenizer__ is None:self.__llm_tokenizer__=openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1])
|
||||
return self.__llm_tokenizer__
|
||||
@property
|
||||
def runner(self):
|
||||
from ._runners import runner
|
||||
if self.__llm_runner__ is None:self.__llm_runner__=runner(self)
|
||||
return self.__llm_runner__
|
||||
def prepare(self,adapter_type='lora',use_gradient_checking=True,**attrs):
|
||||
if self.__llm_backend__!='pt':raise RuntimeError('Fine tuning is only supported for PyTorch backend.')
|
||||
from peft.mapping import get_peft_model
|
||||
from peft.utils.other import prepare_model_for_kbit_training
|
||||
model=get_peft_model(
|
||||
prepare_model_for_kbit_training(self.model,use_gradient_checkpointing=use_gradient_checking),
|
||||
self.config['fine_tune_strategies']
|
||||
.get(adapter_type,self.config.make_fine_tune_config(adapter_type))
|
||||
.train()
|
||||
.with_config(**attrs)
|
||||
.build(),
|
||||
)
|
||||
if DEBUG:model.print_trainable_parameters()
|
||||
return model,self.tokenizer
|
||||
def prepare_for_training(self,*args,**attrs):logger.warning('`prepare_for_training` is deprecated and will be removed in the future. Please use `prepare` instead.');return self.prepare(*args,**attrs)
|
||||
# fmt: on
|
||||
|
||||
@property
|
||||
def adapter_map(self):
|
||||
if not is_peft_available():
|
||||
raise MissingDependencyError("Failed to import 'peft'. Make sure to do 'pip install \"openllm[fine-tune]\"'")
|
||||
if not self.has_adapters:
|
||||
raise AttributeError('Adapter map is not available.')
|
||||
assert self._adapter_map is not None
|
||||
if self.__llm_adapter_map__ is None:
|
||||
_map: ResolvedAdapterMap = {k: {} for k in self._adapter_map}
|
||||
for adapter_type, adapter_tuple in self._adapter_map.items():
|
||||
base = first_not_none(
|
||||
self.config['fine_tune_strategies'].get(adapter_type),
|
||||
default=self.config.make_fine_tune_config(adapter_type),
|
||||
)
|
||||
for adapter in adapter_tuple:
|
||||
_map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id)
|
||||
self.__llm_adapter_map__ = _map
|
||||
return self.__llm_adapter_map__
|
||||
|
||||
@property
|
||||
def model(self):
|
||||
if self.__llm_model__ is None:
|
||||
model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
|
||||
# If OOM, then it is probably you don't have enough VRAM to run this model.
|
||||
if self.__llm_backend__ == 'pt':
|
||||
import torch
|
||||
|
||||
loaded_in_kbit = (
|
||||
getattr(model, 'is_loaded_in_8bit', False)
|
||||
or getattr(model, 'is_loaded_in_4bit', False)
|
||||
or getattr(model, 'is_quantized', False)
|
||||
)
|
||||
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
|
||||
try:
|
||||
model = model.to('cuda')
|
||||
except Exception as err:
|
||||
raise OpenLLMException(f'Failed to load model into GPU: {err}.\n') from err
|
||||
if self.has_adapters:
|
||||
logger.debug('Applying the following adapters: %s', self.adapter_map)
|
||||
for adapter_dict in self.adapter_map.values():
|
||||
for adapter_name, (peft_config, peft_model_id) in adapter_dict.items():
|
||||
model.load_adapter(peft_model_id, adapter_name, peft_config=peft_config)
|
||||
self.__llm_model__ = model
|
||||
return self.__llm_model__
|
||||
|
||||
@property
|
||||
def config(self):
|
||||
import transformers
|
||||
|
||||
if self.__llm_config__ is None:
|
||||
if self.__llm_backend__ == 'ctranslate':
|
||||
try:
|
||||
config = transformers.AutoConfig.from_pretrained(
|
||||
self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code
|
||||
)
|
||||
except OpenLLMException:
|
||||
config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
|
||||
for architecture in config.architectures:
|
||||
if architecture in openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE():
|
||||
config = openllm.AutoConfig.infer_class_from_name(
|
||||
openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture]
|
||||
).model_construct_env(**self._model_attrs)
|
||||
break
|
||||
else:
|
||||
raise OpenLLMException(
|
||||
f"Failed to infer the configuration class from the given model. Make sure the model is a supported model. Supported models are: {', '.join(openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE.keys())}"
|
||||
)
|
||||
else:
|
||||
config = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs)
|
||||
self.__llm_config__ = config
|
||||
return self.__llm_config__
|
||||
|
||||
async def generate(
|
||||
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
|
||||
) -> GenerationOutput:
|
||||
@@ -495,3 +139,325 @@ class LLM(t.Generic[M, T], ReprMixin):
|
||||
yield generated.with_options(outputs=delta_outputs)
|
||||
except Exception as err:
|
||||
raise RuntimeError(f'Exception caught during generation: {err}') from err
|
||||
|
||||
# NOTE: If you are here to see how generate_iterator and generate work, see above.
# The code below is mainly internal implementation that you don't have to worry about.
|
||||
# fmt: off
|
||||
|
||||
# Fields populated by ``__attrs_init__`` (see the keyword arguments passed in ``__init__``).
_model_id: str
_revision: t.Optional[str]
_quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]]
_quantise: t.Optional[LiteralQuantise]
_model_decls: TupleAny
__model_attrs: DictStrAny
__tokenizer_attrs: DictStrAny
_tag: bentoml.Tag
_adapter_map: t.Optional[AdapterMap]
_serialisation: LiteralSerialisation
_local: bool
_max_model_len: t.Optional[int]

# Settings and lazily-filled caches; the properties further down read and write these.
__llm_dtype__: t.Union[LiteralDtype, t.Literal['auto', 'half', 'float']] = 'auto'
__llm_torch_dtype__: 'torch.dtype' = None
__llm_config__: t.Optional[LLMConfig] = None
__llm_backend__: LiteralBackend = None
__llm_quantization_config__: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]] = None
__llm_runner__: t.Optional[Runner[M, T]] = None
__llm_model__: t.Optional[M] = None
__llm_tokenizer__: t.Optional[T] = None
__llm_adapter_map__: t.Optional[ResolvedAdapterMap] = None
__llm_trust_remote_code__: bool = False
|
||||
|
||||
def __init__(
  self,
  model_id,
  model_version=None,
  model_tag=None,
  llm_config=None,
  backend=None,
  *args,
  quantize=None,
  quantization_config=None,
  adapter_map=None,
  serialisation='safetensors',
  trust_remote_code=False,
  embedded=False,
  dtype='auto',
  low_cpu_mem_usage=True,
  max_model_len=None,
  _eager=True,
  **attrs,
):
  """Resolve model id, backend, dtype and quantisation, then initialise the instance.

  Positional ``*args`` are stored as model declarations; remaining ``**attrs`` are split
  into model and tokenizer keyword arguments by ``flatten_attrs``. When ``_eager`` is true
  the model is looked up in the BentoML store and imported if missing.

  Raises:
    RuntimeError: if ``embedded`` is requested while ``_eager`` is False.
  """
  torch_dtype = attrs.pop('torch_dtype', None)  # backward compatible
  if torch_dtype is not None:
    # ``warnings.warn`` is the stdlib API; ``warnings.warns`` does not exist.
    warnings.warn(
      'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',
      DeprecationWarning,
      stacklevel=3,
    )
    dtype = torch_dtype
  _local = False
  if validate_is_path(model_id):
    model_id, _local = resolve_filepath(model_id), True
  backend = first_not_none(getenv('backend', default=backend), default=self._cascade_backend())
  dtype = first_not_none(getenv('dtype', default=dtype, var=['TORCH_DTYPE']), default='auto')
  # NOTE(review): 'QUANITSE' looks like a typo for 'QUANTISE'; kept as-is because changing
  # the looked-up name alters runtime behaviour. Confirm against ``getenv`` before fixing.
  quantize = first_not_none(getenv('quantize', default=quantize, var=['QUANITSE']), default=None)
  attrs.update({'low_cpu_mem_usage': low_cpu_mem_usage})
  # parsing tokenizer and model kwargs, as the hierarchy is param pass > default
  model_attrs, tokenizer_attrs = flatten_attrs(**attrs)
  if model_tag is None:
    model_tag, model_version = self._make_tag_components(model_id, model_version, backend=backend)
    if model_version:
      model_tag = f'{model_tag}:{model_version}'

  self.__attrs_init__(
    model_id=model_id,
    revision=model_version,
    tag=bentoml.Tag.from_taglike(model_tag),
    quantization_config=quantization_config,
    quantise=getattr(self._Quantise, backend)(self, quantize),
    model_decls=args,
    adapter_map=convert_peft_config_type(adapter_map) if adapter_map is not None else None,
    serialisation=serialisation,
    local=_local,
    max_model_len=max_model_len,
    LLM__model_attrs=model_attrs,
    LLM__tokenizer_attrs=tokenizer_attrs,
    llm_dtype__=dtype.lower(),
    llm_backend__=backend,
    llm_config__=llm_config,
    llm_trust_remote_code__=trust_remote_code,
  )

  if _eager:
    try:
      model = bentoml.models.get(self.tag)
    except bentoml.exceptions.NotFound:
      model = openllm.serialisation.import_model(self, trust_remote_code=self.trust_remote_code)
    # resolve the tag
    self._tag = model.tag
  if not _eager and embedded:
    raise RuntimeError("Embedded mode is not supported when '_eager' is False.")
  if embedded:
    logger.warning('Models will be loaded into memory. NOT RECOMMENDED in production and SHOULD ONLY used for development.')
    self.runner.init_local(quiet=True)
|
||||
class _Quantise:
  """Namespace of per-backend hooks that normalise the requested quantisation value.

  ``__init__`` looks the hook up by backend name and calls it with the LLM and the value.
  """

  @staticmethod
  def pt(llm: LLM, quantise=None):
    """Return ``quantise`` unchanged."""
    return quantise

  @staticmethod
  def vllm(llm: LLM, quantise=None):
    """Return ``quantise`` unchanged."""
    return quantise

  @staticmethod
  def ctranslate(llm: LLM, quantise=None):
    """Reject unsupported schemes and expand ``'int8'`` based on ``llm._has_gpus``."""
    unsupported = {'int4', 'awq', 'gptq', 'squeezellm'}
    if quantise in unsupported:
      raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
    if quantise == 'int8':
      return 'int8_float16' if llm._has_gpus else 'int8_float32'
    return quantise
|
||||
@apply(lambda val: tuple(str.lower(i) if i else i for i in val))
def _make_tag_components(self, model_id: str, model_version: str | None, backend: str) -> tuple[str, str | None]:
  """Build the ``(tag name, version)`` pair for ``model_id`` under ``backend``.

  A revision embedded in ``model_id`` after a ``:`` takes precedence over ``model_version``.
  For filesystem paths the version defaults to a hash generated from the path.
  """
  # NOTE(review): ``rsplit(':')`` has no maxsplit, so every ':' splits; only the first two
  # pieces are used below.
  pieces = model_id.rsplit(':')
  model_id, maybe_revision = pieces[0], pieces[1:]
  if maybe_revision:
    if model_version is not None:
      logger.warning(
        "revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.",
        maybe_revision[0],
        model_version,
      )
    model_version = maybe_revision[0]
  if validate_is_path(model_id):
    resolved_path = resolve_filepath(model_id)
    # The fallback hash is computed from the unresolved path, as in the tuple assignment form.
    model_version = first_not_none(model_version, default=generate_hash_from_file(model_id))
    model_id = resolved_path
  return f'{backend}-{normalise_model_name(model_id)}', model_version
|
||||
@functools.cached_property
def _has_gpus(self):
  """Return True when the CUDA driver binding initialises and the device count query succeeds.

  Any import failure or non-success CUDA status yields False. The reported device count
  itself is not inspected.
  """
  try:
    from cuda import cuda

    status, *_ = cuda.cuInit(0)
    if status != cuda.CUresult.CUDA_SUCCESS:
      raise RuntimeError('Failed to initialise CUDA runtime binding.')
    status, _device_count = cuda.cuDeviceGetCount()
    if status != cuda.CUresult.CUDA_SUCCESS:
      raise RuntimeError('Failed to get CUDA device count.')
  except (ImportError, RuntimeError):
    return False
  return True
|
||||
@property
def _torch_dtype(self):
  """Resolve and cache the ``torch.dtype`` used to load the model.

  With dtype ``'auto'`` the Hugging Face config's ``torch_dtype`` is used, except that
  ``float32`` (or a missing value) becomes ``float16``. Any other dtype string is looked
  up in ``_torch_dtype_mapping()``.

  Raises:
    ValueError: if the configured dtype string is not in the mapping.
  """
  import torch
  import transformers

  dtype_by_name = _torch_dtype_mapping()
  if isinstance(self.__llm_torch_dtype__, torch.dtype):
    return self.__llm_torch_dtype__

  try:
    hf_config = transformers.AutoConfig.from_pretrained(self.bentomodel.path, trust_remote_code=self.trust_remote_code)
  except OpenLLMException:
    hf_config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
  config_dtype = getattr(hf_config, 'torch_dtype', None)
  if config_dtype is None:
    config_dtype = torch.float32

  if self.__llm_dtype__ == 'auto':
    resolved = torch.float16 if config_dtype == torch.float32 else config_dtype
  elif self.__llm_dtype__ not in dtype_by_name:
    raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'")
  else:
    resolved = dtype_by_name[self.__llm_dtype__]
  self.__llm_torch_dtype__ = resolved
  return self.__llm_torch_dtype__
|
||||
@property
def _model_attrs(self):
  """Model keyword arguments: the defaults from ``import_kwargs`` overlaid with stored ones."""
  merged = dict(self.import_kwargs[0])
  merged.update(self.__model_attrs)
  return merged

@_model_attrs.setter
def _model_attrs(self, value):
  """Replace the stored model keyword arguments."""
  self.__model_attrs = value
|
||||
@property
def _tokenizer_attrs(self):
  """Tokenizer keyword arguments: the defaults from ``import_kwargs`` overlaid with stored ones."""
  merged = dict(self.import_kwargs[1])
  merged.update(self.__tokenizer_attrs)
  return merged
|
||||
def _cascade_backend(self) -> LiteralBackend:
  """Pick the default backend from what is available.

  Preference is vLLM (only when GPUs are detected), then CTranslate, then PyTorch.
  ``'pt'`` is always returned as the final fallback, including when GPUs are present
  but neither vLLM nor CTranslate is installed.
  """
  if self._has_gpus and is_vllm_available():
    return 'vllm'
  # XXX: base OpenLLM image should always include vLLM
  if is_ctranslate_available():
    return 'ctranslate'
  return 'pt'
|
||||
def __setattr__(self, attr, value):
  """Set ``attr`` unless it is one of the names that must not be assigned at runtime.

  Raises:
    ForbiddenAttributeError: for 'model', 'tokenizer', 'runner' and 'import_kwargs'.
  """
  forbidden = {'model', 'tokenizer', 'runner', 'import_kwargs'}
  if attr in forbidden:
    raise ForbiddenAttributeError(f'{attr} should not be set during runtime.')
  super().__setattr__(attr, value)
|
||||
def __del__(self):
  """Drop the cached model, tokenizer and adapter map when the instance is finalised.

  Each attribute is removed independently and a missing one is ignored, so finalising a
  partially-initialised instance neither raises nor skips the remaining deletions.
  """
  for cached in ('__llm_model__', '__llm_tokenizer__', '__llm_adapter_map__'):
    try:
      delattr(self, cached)
    except AttributeError:
      # Never set on this instance (for example only the class-level default exists).
      pass
|
||||
@property
def __repr_keys__(self):
  """Names of the fields yielded by ``__repr_args__``."""
  keys = {'model_id', 'revision', 'backend', 'type'}
  return keys
|
||||
def __repr_args__(self):
  """Yield ``(name, value)`` pairs describing this instance.

  Local models report the tag name as their id; a missing revision falls back to the
  tag version.
  """
  shown_id = self.tag.name if self._local else self._model_id
  yield 'model_id', shown_id
  shown_revision = self._revision if self._revision else self.tag.version
  yield 'revision', shown_revision
  yield 'backend', self.__llm_backend__
  yield 'type', self.llm_type
|
||||
@property
def import_kwargs(self):
  """Return the default ``(model kwargs, tokenizer kwargs)`` pair.

  ``device_map`` is ``'auto'`` only when GPUs are detected, otherwise ``None``.
  """
  model_defaults = {'device_map': 'auto' if self._has_gpus else None, 'torch_dtype': self._torch_dtype}
  tokenizer_defaults = {'padding_side': 'left', 'truncation_side': 'left'}
  return model_defaults, tokenizer_defaults
|
||||
@property
def trust_remote_code(self):
  """Whether remote code is trusted.

  The ``TRUST_REMOTE_CODE`` environment variable, when set, overrides the stored value.
  """
  override = os.getenv('TRUST_REMOTE_CODE')
  if override is None:
    return self.__llm_trust_remote_code__
  return str(override).upper() in ENV_VARS_TRUE_VALUES
|
||||
@property
def model_id(self):
  """The stored model id."""
  return self._model_id
|
||||
@property
def revision(self):
  """The stored model revision (may be ``None``)."""
  return self._revision
|
||||
@property
def tag(self):
  """The stored BentoML tag for this model."""
  return self._tag
|
||||
@property
def bentomodel(self):
  """Look this model up through ``openllm.serialisation.get`` on every access."""
  return openllm.serialisation.get(self)
|
||||
@property
def quantization_config(self):
  """Return the quantisation config, resolving and caching it on first access.

  An explicitly supplied config wins; otherwise one is inferred from the quantise
  setting, which also replaces the stored model attributes.

  Raises:
    ValueError: if neither a config nor a quantise setting is available.
  """
  if self.__llm_quantization_config__ is not None:
    return self.__llm_quantization_config__

  from ._quantisation import infer_quantisation_config

  if self._quantization_config is not None:
    self.__llm_quantization_config__ = self._quantization_config
  elif self._quantise is not None:
    self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(
      self, self._quantise, **self._model_attrs
    )
  else:
    raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
  return self.__llm_quantization_config__
|
||||
@property
def has_adapters(self):
  """True when an adapter map was supplied."""
  adapters = self._adapter_map
  return adapters is not None
|
||||
@property
def local(self):
  """The stored flag set when ``model_id`` was given as a filesystem path."""
  return self._local
|
||||
@property
def quantise(self):
  """The stored quantisation setting (may be ``None``)."""
  return self._quantise
|
||||
@property
def llm_type(self):
  """The model id passed through ``normalise_model_name``."""
  return normalise_model_name(self._model_id)
|
||||
@property
def llm_parameters(self):
  """Return ``((model declarations, model kwargs), tokenizer kwargs)``."""
  model_parameters = (self._model_decls, self._model_attrs)
  return model_parameters, self._tokenizer_attrs
|
||||
@property
def identifying_params(self):
  """Return a dict of serialised configuration, the config's model ids and this model id."""
  return {
    'configuration': self.config.model_dump_json().decode(),
    'model_ids': orjson.dumps(self.config['model_ids']).decode(),
    'model_id': self.model_id,
  }
|
||||
@property
def tokenizer(self):
  """Return the tokenizer, loading and caching it on first access."""
  if self.__llm_tokenizer__ is None:
    tokenizer_kwargs = self.llm_parameters[-1]
    self.__llm_tokenizer__ = openllm.serialisation.load_tokenizer(self, **tokenizer_kwargs)
  return self.__llm_tokenizer__
|
||||
@property
def runner(self):
  """Return the runner for this LLM, creating and caching it on first access."""
  from ._runners import runner as make_runner

  if self.__llm_runner__ is None:
    self.__llm_runner__ = make_runner(self)
  return self.__llm_runner__
|
||||
def prepare(self, adapter_type='lora', use_gradient_checking=True, **attrs):
  """Wrap the model with a PEFT adapter for fine-tuning and return ``(model, tokenizer)``.

  ``attrs`` are forwarded to the fine-tune config's ``with_config``.

  Raises:
    RuntimeError: if the backend is not PyTorch.
  """
  if self.__llm_backend__ != 'pt':
    raise RuntimeError('Fine tuning is only supported for PyTorch backend.')
  from peft.mapping import get_peft_model
  from peft.utils.other import prepare_model_for_kbit_training

  # Same evaluation order as the nested call form: prepare the base model first, then build the config.
  base_model = prepare_model_for_kbit_training(self.model, use_gradient_checkpointing=use_gradient_checking)
  strategy = self.config['fine_tune_strategies'].get(adapter_type, self.config.make_fine_tune_config(adapter_type))
  peft_config = strategy.train().with_config(**attrs).build()
  model = get_peft_model(base_model, peft_config)
  if DEBUG:
    model.print_trainable_parameters()
  return model, self.tokenizer
|
||||
def prepare_for_training(self, *args, **attrs):
  """Deprecated alias of ``prepare``: logs a warning and forwards all arguments."""
  logger.warning('`prepare_for_training` is deprecated and will be removed in the future. Please use `prepare` instead.')
  return self.prepare(*args, **attrs)
|
||||
|
||||
@property
def adapter_map(self):
  """Return the resolved adapter map, building and caching it on first access.

  The result maps adapter type -> adapter name -> ``(built config, adapter id)``.

  Raises:
    MissingDependencyError: if ``peft`` is not available.
    AttributeError: if no adapter map was supplied.
  """
  if not is_peft_available():
    raise MissingDependencyError("Failed to import 'peft'. Make sure to do 'pip install \"openllm[fine-tune]\"'")
  if not self.has_adapters:
    raise AttributeError('Adapter map is not available.')
  assert self._adapter_map is not None
  if self.__llm_adapter_map__ is not None:
    return self.__llm_adapter_map__

  resolved: ResolvedAdapterMap = {adapter_type: {} for adapter_type in self._adapter_map}
  for adapter_type, adapters in self._adapter_map.items():
    base = first_not_none(
      self.config['fine_tune_strategies'].get(adapter_type),
      default=self.config.make_fine_tune_config(adapter_type),
    )
    for adapter in adapters:
      built = base.with_config(**adapter.config).build()
      resolved[adapter_type][adapter.name] = (built, adapter.adapter_id)
  self.__llm_adapter_map__ = resolved
  return self.__llm_adapter_map__
|
||||
|
||||
@property
def model(self):
  """Return the model, loading and caching it on first access.

  For the PyTorch backend a non-quantised model is moved to CUDA when exactly one CUDA
  device is available, and any configured adapters are loaded onto the model.

  Raises:
    OpenLLMException: if moving the model to the GPU fails.
  """
  if self.__llm_model__ is not None:
    return self.__llm_model__

  model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
  # If OOM, then it is probably you don't have enough VRAM to run this model.
  if self.__llm_backend__ == 'pt':
    import torch

    kbit_flags = ('is_loaded_in_8bit', 'is_loaded_in_4bit', 'is_quantized')
    loaded_in_kbit = any(getattr(model, flag, False) for flag in kbit_flags)
    if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
      try:
        model = model.to('cuda')
      except Exception as err:
        raise OpenLLMException(f'Failed to load model into GPU: {err}.\n') from err
    # NOTE(review): the scraped source lost its indentation; this step is assumed to sit inside the 'pt' branch.
    if self.has_adapters:
      logger.debug('Applying the following adapters: %s', self.adapter_map)
      for adapter_dict in self.adapter_map.values():
        for adapter_name, (peft_config, peft_model_id) in adapter_dict.items():
          model.load_adapter(peft_model_id, adapter_name, peft_config=peft_config)
  self.__llm_model__ = model
  return self.__llm_model__
|
||||
|
||||
@property
def config(self):
  """Return the OpenLLM configuration for this model, building and caching it on first access.

  For the ``'ctranslate'`` backend the Hugging Face config is loaded (from the stored
  ``/hf`` path, falling back to ``model_id``) and its ``architectures`` are matched against
  ``openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()``. For every other backend
  the configuration class is inferred from this instance.

  Raises:
    OpenLLMException: if none of the model's architectures is a known one.
  """
  import transformers

  if self.__llm_config__ is None:
    if self.__llm_backend__ == 'ctranslate':
      try:
        config = transformers.AutoConfig.from_pretrained(
          self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code
        )
      except OpenLLMException:
        config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
      # The mapping is exposed as a callable; call it once and reuse the result.
      architecture_map = openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()
      # ``architectures`` may be absent on some configs; treat that as "no match".
      for architecture in config.architectures or ():
        if architecture in architecture_map:
          config = openllm.AutoConfig.infer_class_from_name(architecture_map[architecture]).model_construct_env(
            **self._model_attrs
          )
          break
      else:
        raise OpenLLMException(
          f"Failed to infer the configuration class from the given model. Make sure the model is a supported model. Supported models are: {', '.join(architecture_map.keys())}"
        )
    else:
      config = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs)
    self.__llm_config__ = config
  return self.__llm_config__
|
||||
|
||||
|
||||
# fmt: off
|
||||
@functools.lru_cache(maxsize=1)
def _torch_dtype_mapping() -> dict[str, torch.dtype]:
  """Return the mapping from dtype names to ``torch.dtype`` objects (cached after the first call)."""
  import torch

  names = (
    ('half', 'float16'),
    ('float', 'float32'),
    ('float16', 'float16'),
    ('float32', 'float32'),
    ('bfloat16', 'bfloat16'),
  )
  return {alias: getattr(torch, torch_name) for alias, torch_name in names}
|
||||
def normalise_model_name(name: str) -> str:
  """Return a tag-friendly name: the basename for a filesystem path, otherwise a dasherized id with '/' replaced by '--'."""
  if validate_is_path(name):
    return os.path.basename(resolve_filepath(name))
  return inflection.dasherize(name.replace('/', '--'))
|
||||
def convert_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
  """Group adapters by the ``peft_type`` declared in each adapter's PEFT config file.

  Each key of ``adapter_map`` is a local directory or a Hub id and each value is the
  adapter name. The config file is read from the directory when present, otherwise
  fetched with ``hf_hub_download``.

  Raises:
    RuntimeError: if ``peft`` is not installed.
    ValueError: if an adapter name is ``None`` or the config file cannot be found.
  """
  if not is_peft_available():
    raise RuntimeError("LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'")
  from huggingface_hub import hf_hub_download

  resolved: AdapterMap = {}
  for path_or_adapter_id, name in adapter_map.items():
    if name is None:
      raise ValueError('Adapter name must be specified.')
    local_candidate = os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)
    if os.path.isfile(local_candidate):
      config_file = local_candidate
    else:
      try:
        config_file = hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)
      except Exception as err:
        raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err
    with open(config_file, 'r') as file:
      resolved_config = orjson.loads(file.read())
    _peft_type = resolved_config['peft_type'].lower()
    entry = _AdapterTuple((path_or_adapter_id, name, resolved_config))
    resolved[_peft_type] = resolved.get(_peft_type, ()) + (entry,)
  return resolved
|
||||
|
||||
@@ -6,7 +6,6 @@ import typing as t
|
||||
import transformers
|
||||
|
||||
from openllm.serialisation.constants import HUB_ATTRS
|
||||
from openllm_core.utils import get_disable_warnings, get_quiet_mode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -44,10 +43,9 @@ def infer_autoclass_from_llm(llm, config, /):
|
||||
# in case this model doesn't use the correct auto class for model type, for example like chatglm
|
||||
# where it uses AutoModel instead of AutoModelForCausalLM. Then we fallback to AutoModel
|
||||
if autoclass not in config.auto_map:
|
||||
if not get_disable_warnings() and not get_quiet_mode():
|
||||
logger.warning(
|
||||
"OpenLLM failed to determine compatible Auto classes to load %s. Falling back to 'AutoModel'.\nTip: Make sure to specify 'AutoModelForCausalLM' or 'AutoModelForSeq2SeqLM' in your 'config.auto_map'. If your model type is yet to be supported, please file an issues on our GitHub tracker.",
|
||||
llm._model_id,
|
||||
)
|
||||
logger.warning(
|
||||
"OpenLLM failed to determine compatible Auto classes to load %s. Falling back to 'AutoModel'.\nTip: Make sure to specify 'AutoModelForCausalLM' or 'AutoModelForSeq2SeqLM' in your 'config.auto_map'. If your model type is yet to be supported, please file an issues on our GitHub tracker.",
|
||||
llm._model_id,
|
||||
)
|
||||
autoclass = 'AutoModel'
|
||||
return getattr(transformers, autoclass)
|
||||
|
||||
@@ -57,7 +57,6 @@ from openllm_core.utils import (
|
||||
first_not_none,
|
||||
gen_random_uuid,
|
||||
get_debug_mode,
|
||||
get_disable_warnings,
|
||||
get_quiet_mode,
|
||||
is_torch_available,
|
||||
pkg,
|
||||
@@ -94,7 +93,7 @@ else:
|
||||
torch = LazyLoader('torch', globals(), 'torch')
|
||||
|
||||
P = ParamSpec('P')
|
||||
logger = logging.getLogger(__name__)
|
||||
logger = logging.getLogger('openllm')
|
||||
OPENLLM_FIGLET = """\
|
||||
██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗
|
||||
██╔═══██╗██╔══██╗██╔════╝████╗ ██║██║ ██║ ████╗ ████║
|
||||
@@ -123,21 +122,19 @@ _EXT_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), 'extension
|
||||
|
||||
|
||||
def backend_warning(backend: LiteralBackend, build: bool = False) -> None:
|
||||
if backend == 'pt' and (not get_disable_warnings()) and not get_quiet_mode():
|
||||
if backend == 'pt':
|
||||
if openllm.utils.is_vllm_available():
|
||||
termui.warning(
|
||||
logger.warning(
|
||||
'vLLM is available, but using PyTorch backend instead. Note that vLLM is a lot more performant and should always be used in production (by explicitly set --backend vllm).'
|
||||
)
|
||||
else:
|
||||
termui.warning(
|
||||
logger.warning(
|
||||
'vLLM is not available. Note that PyTorch backend is not as performant as vLLM and you should always consider using vLLM for production.'
|
||||
)
|
||||
if build:
|
||||
termui.info(
|
||||
logger.info(
|
||||
"Tip: You can set '--backend vllm' to package your Bento with vLLM backend regardless if vLLM is available locally."
|
||||
)
|
||||
if not get_debug_mode():
|
||||
termui.info("To disable these warnings, set 'OPENLLM_DISABLE_WARNING=True'")
|
||||
|
||||
|
||||
class Extensions(click.MultiCommand):
|
||||
@@ -419,22 +416,22 @@ def start_command(
|
||||
serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
|
||||
),
|
||||
)
|
||||
if serialisation == 'safetensors' and quantize is not None and not get_disable_warnings() and not get_quiet_mode():
|
||||
termui.warning(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format.")
|
||||
termui.warning(
|
||||
f"Make sure to check out '{model_id}' repository to see if the weights is in '{serialisation}' format if unsure."
|
||||
if serialisation == 'safetensors' and quantize is not None:
|
||||
logger.warning("'--quantize=%s' might not work with 'safetensors' serialisation format.", quantize)
|
||||
logger.warning(
|
||||
"Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.",
|
||||
model_id,
|
||||
serialisation,
|
||||
)
|
||||
termui.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.")
|
||||
if not get_debug_mode():
|
||||
termui.info("To disable these warnings, set 'OPENLLM_DISABLE_WARNING=True'")
|
||||
logger.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.")
|
||||
|
||||
import torch
|
||||
|
||||
if backend == 'pt' and not torch.cuda.is_available():
|
||||
if dtype == 'auto':
|
||||
dtype = 'float'
|
||||
elif dtype not in {'float', 'float32'} and not get_disable_warnings() and not get_quiet_mode():
|
||||
termui.warning('"bfloat16" and "half" are not supported on CPU. OpenLLM will default fallback to "float32".')
|
||||
elif dtype not in {'float', 'float32'}:
|
||||
logger.warning('"bfloat16" and "half" are not supported on CPU. OpenLLM will default fallback to "float32".')
|
||||
dtype = 'float' # we need to cast back to full precision if cuda is not available
|
||||
llm = openllm.LLM[t.Any, t.Any](
|
||||
model_id=model_id,
|
||||
@@ -549,22 +546,22 @@ def start_grpc_command(
|
||||
serialisation = first_not_none(
|
||||
serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
|
||||
)
|
||||
if serialisation == 'safetensors' and quantize is not None and not get_disable_warnings() and not get_quiet_mode():
|
||||
termui.warning(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format.")
|
||||
termui.warning(
|
||||
f"Make sure to check out '{model_id}' repository to see if the weights is in '{serialisation}' format if unsure."
|
||||
if serialisation == 'safetensors' and quantize is not None:
|
||||
logger.warning("'--quantize=%s' might not work with 'safetensors' serialisation format.", quantize)
|
||||
logger.warning(
|
||||
"Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.",
|
||||
model_id,
|
||||
serialisation,
|
||||
)
|
||||
termui.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.")
|
||||
if not get_debug_mode():
|
||||
termui.info("To disable these warnings, set 'OPENLLM_DISABLE_WARNING=True'")
|
||||
logger.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.")
|
||||
|
||||
import torch
|
||||
|
||||
if backend == 'pt' and not torch.cuda.is_available():
|
||||
if dtype == 'auto':
|
||||
dtype = 'float'
|
||||
elif dtype not in {'float', 'float32'} and not get_disable_warnings() and not get_quiet_mode():
|
||||
termui.warning('"bfloat16" and "half" are not supported on CPU. OpenLLM will default fallback to "float32".')
|
||||
elif dtype not in {'float', 'float32'}:
|
||||
logger.warning('"bfloat16" and "half" are not supported on CPU. OpenLLM will default fallback to "float32".')
|
||||
dtype = 'float' # we need to cast back to full precision if cuda is not available
|
||||
llm = openllm.LLM[t.Any, t.Any](
|
||||
model_id=model_id,
|
||||
@@ -1095,13 +1092,14 @@ def build_command(
|
||||
|
||||
push_cmd = f'bentoml push {bento_tag}'
|
||||
cloud_context = get_current_bentocloud_context()
|
||||
if cloud_context is None and (not get_disable_warnings()) and not get_quiet_mode():
|
||||
if cloud_context is None:
|
||||
available_context = [c.name for c in cloud_config.contexts]
|
||||
if not available_context:
|
||||
termui.warning('No default BentoCloud context found. Please login with `bentoml cloud login` first.')
|
||||
logger.warning('No default BentoCloud context found. Please login with `bentoml cloud login` first.')
|
||||
else:
|
||||
termui.warning(
|
||||
f'No context is passed, but the following context is available: {available_context}. Make sure to specify the argument "--context" for specific context you want to push to.'
|
||||
logger.warning(
|
||||
'No context is passed, but the following context is available: %s. Make sure to specify the argument "--context" for specific context you want to push to.',
|
||||
available_context,
|
||||
)
|
||||
else:
|
||||
push_cmd += f' --context {cloud_context}'
|
||||
|
||||
Reference in New Issue
Block a user