From 6f724416c01b4fbffbf47ef5b16a0e8e1c8559e2 Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Sat, 17 Jun 2023 08:56:14 -0400 Subject: [PATCH] perf: build quantization and better transformer behaviour (#28) Fixes quantization_config and low_cpu_mem_usage to be available on PyTorch implementation only See changelog for more details on #28 --- .github/workflows/ci.yml | 16 - .pre-commit-config.yaml | 9 +- changelog.d/27.feature.md | 28 +- changelog.d/28.change.md | 14 + src/openllm/__init__.py | 26 +- src/openllm/_configuration.py | 373 ++++---- src/openllm/_llm.py | 123 ++- src/openllm/_package.py | 216 +++-- src/openllm/_service.py | 13 +- src/openllm/cli.py | 884 +++++++++--------- src/openllm/models/auto/factory.py | 15 +- .../models/chatglm/modeling_chatglm.py | 36 +- .../models/dolly_v2/modeling_dolly_v2.py | 26 +- .../models/flan_t5/modeling_flan_t5.py | 20 +- src/openllm/models/opt/modeling_opt.py | 20 +- .../models/starcoder/modeling_starcoder.py | 30 +- src/openllm/utils/__init__.py | 9 +- src/openllm/utils/dantic.py | 7 +- src/openllm/utils/import_utils.py | 87 +- src/openllm/utils/lazy.py | 38 +- src/openllm_client/_prompt.py | 7 +- src/openllm_client/runtimes/base.py | 11 +- tests/test_configuration.py | 4 +- 23 files changed, 1159 insertions(+), 853 deletions(-) create mode 100644 changelog.d/28.change.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a4df22a1..e0f06166 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,22 +28,6 @@ defaults: run: shell: bash --noprofile --norc -exo pipefail {0} jobs: - codestyle_check: - runs-on: ubuntu-latest - if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }} - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Setup CI - uses: ./.github/actions/setup-repo - - name: Running changelog check - run: hatch run changelog - - name: Format and lint check - run: hatch run fmt - - name: 
Type check - if: ${{ github.event_name == 'pull_request' }} - run: git diff --name-only --diff-filter=AM "origin/$GITHUB_BASE_REF" -z -- '*.py{,i}' | xargs -0 --no-run-if-empty hatch run dev:typing tests: runs-on: ubuntu-latest if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6b63fa2a..a0a22fe1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,7 +14,7 @@ ci: autoupdate_schedule: weekly - skip: [check-models-table-update, check-models-table-update] + skip: [check-models-table-update, check-models-table-update, changelog-dry-run] exclude: '.*\.(css|js|svg)$' repos: - repo: https://github.com/charliermarsh/ruff-pre-commit @@ -51,13 +51,16 @@ repos: typings/.*| .github/.* )$ - - repo: local - hooks: - id: check-models-table-update name: check if table in README.md is up-to-date entry: ./tools/assert-model-table-latest language: script files: README.md + - id: changelog-dry-run + name: Running changelog dry-run + entry: hatch run changelog + language: system + files: CHANGELOG.md - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 hooks: diff --git a/changelog.d/27.feature.md b/changelog.d/27.feature.md index 8ebb70cb..9239952c 100644 --- a/changelog.d/27.feature.md +++ b/changelog.d/27.feature.md @@ -1,14 +1,22 @@ Added support for quantization during serving time. -`openllm start` now support `--quantize 8bit` and `--quantize 4bit` -`GPTQ` quantization support is on the roadmap and currently -being worked on. + +`openllm start` now supports `--quantize int8` and `--quantize int4`. `GPTQ` +quantization support is on the roadmap and is currently being worked on. + `openllm start` now also support `--bettertransformer` to use -`BetterTransformer` for serving -Refactored `openllm.LLMConfig` to be able to use with `__getitem__` -to acecss the config value: `openllm.DollyV2Config()['requirements']` -the order being: `__openllm_*__ > self. 
> __openllm_generation_class__ > __openllm_extras__` +`BetterTransformer` for serving. + +Refactored `openllm.LLMConfig` to be able to use with `__getitem__`: +`openllm.DollyV2Config()['requirements']`. + +The access order being: +`__openllm_*__ > self. > __openllm_generation_class__ > __openllm_extras__`. + Added `towncrier` workflow to easily generate changelog entries + Added `use_pipeline`, `bettertransformer` flag into ModelSettings -`LLMConfig` now supported `__dataclass_transform__` protocol to help -with type-checking -Changed `openllm download-models` to `openllm download` + +`LLMConfig` now supports the `__dataclass_transform__` protocol to help with +type-checking + +`openllm download-models` now becomes `openllm download` diff --git a/changelog.d/28.change.md b/changelog.d/28.change.md new file mode 100644 index 00000000..e8a54090 --- /dev/null +++ b/changelog.d/28.change.md @@ -0,0 +1,14 @@ +`--quantize` now takes `int8, int4` instead of `8bit, 4bit` to be consistent +with bitsandbytes concept. + +`openllm CLI` now caches all available model commands, allowing faster startup time. + +Fixes `openllm start model-id --debug` to filter out debug log messages from +`bentoml.Server`. + +`--model-id` from `openllm start` now supports choices for easier selection. + +Updated `ModelConfig` implementation with `__getitem__` and auto-generated values. + +Cleanup CLI and improve loading time, `openllm start` should be 'blazingly +fast'. diff --git a/src/openllm/__init__.py b/src/openllm/__init__.py index cb03cd17..76c9a455 100644 --- a/src/openllm/__init__.py +++ b/src/openllm/__init__.py @@ -25,7 +25,7 @@ deploy, and monitor any LLMs with ease. """ from __future__ import annotations -import logging as _ +import logging import typing as t from . 
import utils as utils @@ -33,15 +33,11 @@ from .__about__ import __version__ as __version__ from .exceptions import MissingDependencyError if utils.DEBUG: - from bentoml._internal.configuration import set_debug_mode, set_quiet_mode + utils.set_debug_mode(True) + utils.set_quiet_mode(False) - set_debug_mode(True) - set_quiet_mode(False) - - from bentoml._internal.log import configure_logging - - configure_logging() - _.basicConfig(level=_.NOTSET) + utils.configure_logging() + logging.basicConfig(level=logging.NOTSET) _import_structure = { @@ -147,7 +143,6 @@ if t.TYPE_CHECKING: from . import exceptions as exceptions from . import models as models from . import playground as playground - # Specific types import from ._configuration import LLMConfig as LLMConfig from ._llm import LLM as LLM @@ -160,7 +155,8 @@ if t.TYPE_CHECKING: from .cli import start as start from .cli import start_grpc as start_grpc from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING - from .models.auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES + from .models.auto import \ + MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES from .models.auto import MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES from .models.auto import AutoConfig as AutoConfig @@ -234,5 +230,11 @@ else: globals()["__file__"], _import_structure, module_spec=__spec__, - extra_objects={"__version__": __version__}, + extra_objects={ + "__version__": __version__, + # The below is a special mapping that allows openllm to be used as a dictionary. + # This is purely for convenience's sake, and should not be used in performance critical + # code. This is also not considered as a public API. 
+ "__openllm_special__": {"flax": "AutoFlaxLLM", "tf": "AutoTFLLM", "pt": "AutoLLM"}, + }, ) diff --git a/src/openllm/_configuration.py b/src/openllm/_configuration.py index 279e64b8..174cf995 100644 --- a/src/openllm/_configuration.py +++ b/src/openllm/_configuration.py @@ -395,16 +395,7 @@ bentoml_cattr.register_unstructure_hook_factory( ) -def _populate_value_from_env_var( - key: str, transform: t.Callable[[str], str] | None = None, fallback: t.Any = None -) -> t.Any: - if transform is not None and callable(transform): - key = transform(key) - - return os.environ.get(key, fallback) - - -def _field_env_key(model_name: str, key: str, suffix: str | None = None) -> str: +def _field_env_key(model_name: str, key: str, suffix: str | t.Literal[""] | None = None) -> str: return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key]))) @@ -425,6 +416,7 @@ class ModelSettings(t.TypedDict, total=False): url: str requires_gpu: bool trust_remote_code: bool + service_name: NotRequired[str] requirements: t.Optional[ListStr] # llm implementation specifics @@ -448,128 +440,174 @@ class ModelSettings(t.TypedDict, total=False): generation_class: t.Type[GenerationConfig] -_ModelSettings: type[attr.AttrsInstance] = codegen.add_method_dunders( - type("__openllm_internal__", (ModelSettings,), {"__module__": "openllm._configuration"}), - attr.make_class( - "ModelSettings", - { - k: dantic.Field( +def _settings_field_transformer( + _: type[attr.AttrsInstance], __: list[attr.Attribute[t.Any]] +) -> list[attr.Attribute[t.Any]]: + return [ + attr.Attribute.from_counting_attr( + k, + dantic.Field( kw_only=False if t.get_origin(ann) is not Required else True, auto_default=True, use_default_converter=False, type=ann, - metadata={ - "target": f"__openllm_{k}__", - "required": False if t.get_origin(ann) is NotRequired else t.get_origin(ann) is Required, - }, + metadata={"target": f"__openllm_{k}__"}, description=f"ModelSettings field for {k}.", - ) 
- for k, ann in t.get_type_hints(ModelSettings).items() - }, - bases=(DictStrAny,), - slots=True, - weakref_slot=True, - collect_by_mro=True, - ), - _overwrite_doc="Internal attrs representation of ModelSettings.", -) + ), + ) + for k, ann in t.get_type_hints(ModelSettings).items() + ] -def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]): +@attr.define(slots=True, field_transformer=_settings_field_transformer, frozen=False) +class _ModelSettingsAttr: + """Internal attrs representation of ModelSettings.""" + + def __getitem__(self, key: str) -> t.Any: + if key in codegen.get_annotations(ModelSettings): + return _object_getattribute(self, key) + raise KeyError(key) + + @classmethod + def default(cls) -> _ModelSettingsAttr: + _ = ModelSettings( + default_id="__default__", + model_ids=["__default__"], + name_type="dasherize", + requires_gpu=False, + url="", + use_pipeline=False, + model_type="causal_lm", + trust_remote_code=False, + requirements=None, + timeout=3600, + service_name="", + workers_per_resource=1, + runtime="transformers", + ) + return cls(**t.cast(DictStrAny, _)) + + +def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]): if not lenient_issubclass(cl_, LLMConfig): - raise RuntimeError(f"Given LLMConfig must be a subclass type of 'LLMConfig', got '{cl_}' instead.") + raise RuntimeError(f"Given '{cl_}' must be a subclass type of 'LLMConfig', got '{cl_}' instead.") if not hasattr(cl_, "__config__") or getattr(cl_, "__config__") is None: raise RuntimeError("Given LLMConfig must have '__config__' that is not None defined.") - settings = cl_.__config__ - assert settings + assert cl_.__config__ is not None - required = [i.name for i in attr.fields(cls) if i.metadata.get("required", False)] - - missing = set(required) - set(settings.keys()) - - if len(missing) > 0: - raise ValueError(f"The following keys are required under '__config__': {required} (missing: {missing})") - - if "generation_class" in settings: + if 
"generation_class" in cl_.__config__: raise ValueError( "'generation_class' shouldn't be defined in '__config__', rather defining " - f"all required attributes under '{cl_}.GenerationConfig' when defining the class." + f"all required attributes under '{cl_}.GenerationConfig' instead." ) - if not settings["default_id"] or not settings["model_ids"]: + _cl_name = cl_.__name__.replace("Config", "") + + _settings_attr = _ModelSettingsAttr.default() + try: + cls(**t.cast(DictStrAny, cl_.__config__)) + _settings_attr = attr.evolve(_settings_attr, **t.cast(DictStrAny, cl_.__config__)) + except TypeError: raise ValueError("Either 'default_id' or 'model_ids' are emptied under '__config__' (required fields).") - # NOTE: value in __config__ can be None, hense we use setdefault - # to update in-place - _cl_name = cl_.__name__.replace("Config", "") - name_type = settings.setdefault("name_type", "dasherize") - model_name = settings.setdefault( - "model_name", inflection.underscore(_cl_name) if name_type == "dasherize" else _cl_name.lower() + _final_value_dct: DictStrAny = { + "model_name": inflection.underscore(_cl_name) + if _settings_attr["name_type"] == "dasherize" + else _cl_name.lower() + } + _final_value_dct["start_name"] = ( + inflection.dasherize(_final_value_dct["model_name"]) + if _settings_attr["name_type"] == "dasherize" + else _final_value_dct["model_name"] ) - partialed = functools.partial(_field_env_key, model_name=model_name, suffix="generation") + env = openllm.utils.ModelEnv(_final_value_dct["model_name"]) + _final_value_dct["env"] = env - def auto_env_transformers(_: t.Any, fields: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]: - _has_own_gen = codegen.has_own_attribute(cl_, "GenerationConfig") - return [ - f.evolve( - default=_populate_value_from_env_var( - partialed(key=f.name), - fallback=getattr(cl_.GenerationConfig, f.name, f.default) if _has_own_gen else f.default, - ), - metadata={"env": partialed(key=f.name), "description": 
f.metadata.get("description", "(not provided)")}, - converter=None, - ) - for f in fields - ] + # bettertransformer support + if _settings_attr["bettertransformer"] is None: + _final_value_dct["bettertransformer"] = ( + os.environ.get(env.bettertransformer, str(False)).upper() in ENV_VARS_TRUE_VALUES + ) + if _settings_attr["requires_gpu"]: + # if requires_gpu is True, then disable BetterTransformer for quantization. + _final_value_dct["bettertransformer"] = False - settings.setdefault( - "generation_class", - attr.make_class( - f"{_cl_name}GenerationConfig", - [], - bases=(GenerationConfig,), - slots=True, - weakref_slot=True, - frozen=False, - repr=True, - collect_by_mro=True, - field_transformer=auto_env_transformers, + _final_value_dct["service_name"] = f"generated_{_final_value_dct['model_name']}_service.py" + _final_value_dct["generation_class"] = attr.make_class( + f"{_cl_name}GenerationConfig", + [], + bases=(GenerationConfig,), + slots=True, + weakref_slot=True, + frozen=True, + repr=True, + collect_by_mro=True, + field_transformer=_make_env_transformer( + cl_, + _final_value_dct["model_name"], + suffix="generation", + default_callback=lambda field_name, field_default: getattr(cl_.GenerationConfig, field_name, field_default) + if codegen.has_own_attribute(cl_, "GenerationConfig") + else field_default, + globs={"cl_": cl_}, ), ) - env = settings.setdefault("env", openllm.utils.ModelEnv(model_name)) - requires_gpu = settings.setdefault("requires_gpu", False) + return attr.evolve(_settings_attr, **_final_value_dct) - # bettertransformer support - bettertransformer = settings.setdefault( - "bettertransformer", - os.environ.get(env.bettertransformer, str(False)).upper() in ENV_VARS_TRUE_VALUES, + +bentoml_cattr.register_structure_hook(_ModelSettingsAttr, structure_settings) + + +def _make_env_transformer( + cls: type[LLMConfig], + model_name: str, + suffix: t.LiteralString | None = None, + default_callback: t.Callable[[str, t.Any], t.Any] | None = None, + 
globs: DictStrAny | None = None, +): + def identity(_: str, x_value: t.Any) -> t.Any: + return x_value + + default_callback = identity if default_callback is None else default_callback + + globs = {} if globs is None else globs + globs.update( + { + "functools": functools, + "__populate_env": dantic.env_converter, + "__default_callback": default_callback, + "__field_env": _field_env_key, + "__suffix": suffix or "", + "__model_name": model_name, + } ) - if requires_gpu: - # For all models that requires GPU, no need to offload it to BetterTransformer - # use bitsandbytes or gptq instead for latency improvement - if bettertransformer: - logger.debug("Model requires GPU by default, disabling bettertransformer.") - bettertransformer = False - settings["bettertransformer"] = bettertransformer - # default value - settings.setdefault("url", "") - settings.setdefault("use_pipeline", False) - settings.setdefault("model_type", "causal_lm") - settings.setdefault("trust_remote_code", False) - settings.setdefault("requirements", None) - settings.setdefault("timeout", 3600) - settings.setdefault("workers_per_resource", 1) - settings.setdefault("runtime", "transformers") - settings.setdefault("start_name", inflection.dasherize(model_name) if name_type == "dasherize" else model_name) + lines: ListStr = [ + "__env = lambda field_name: __field_env(__model_name, field_name, __suffix)", + "return [", + " f.evolve(", + " default=__populate_env(__default_callback(f.name, f.default), __env(f.name)),", + " metadata={", + " 'env': f.metadata.get('env', __env(f.name)),", + " 'description': f.metadata.get('description', '(not provided)'),", + " },", + " )", + " for f in fields", + "]", + ] + fields_ann = "list[attr.Attribute[t.Any]]" - return cls(**settings) - - -bentoml_cattr.register_structure_hook(_ModelSettings, structure_settings) + return codegen.generate_function( + cls, + "__auto_env", + lines, + args=("_", "fields"), + globs=globs, + annotations={"_": "type[LLMConfig]", "fields": 
fields_ann, "return": fields_ann}, + ) def _setattr_class(attr_name: str, value_var: t.Any, add_dunder: bool = False): @@ -577,6 +615,10 @@ def _setattr_class(attr_name: str, value_var: t.Any, add_dunder: bool = False): Use the builtin setattr to set *attr_name* to *value_var*. We can't use the cached object.__setattr__ since we are setting attributes to a class. + + If add_dunder to True, the generated globs should include a __add_dunder + value that will be used to add the dunder methods to the class for given + value_var """ val = f"__add_dunder(cls, {value_var})" if add_dunder else value_var return f"setattr(cls, '{attr_name}', {val})" @@ -742,6 +784,23 @@ class LLMConfig: # NOTE: The following will be populated from __config__ and also # considered to be public API. + __openllm_default_id__: str = Field(None) + """Return the default model to use when using 'openllm start '. + This could be one of the keys in 'self.model_ids' or custom users model. + + This field is required when defining under '__config__'. + """ + + __openllm_model_ids__: ListStr = Field(None) + """A list of supported pretrained models tag for this given runnable. + + For example: + For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base", + "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"] + + This field is required when defining under '__config__'. + """ + __openllm_url__: str = Field(None, init=False) """The resolved url for this LLMConfig.""" @@ -751,46 +810,13 @@ class LLMConfig: __openllm_trust_remote_code__: bool = Field(False) """Whether to always trust remote code""" + __openllm_service_name__: str = Field(None) + """Generated service name for this LLMConfig. By default, it is 'generated_{model_name}_service.py'""" + __openllm_requirements__: ListStr | None = Field(None) """The default PyPI requirements needed to run this given LLM. 
By default, we will depend on bentoml, torch, transformers.""" - __openllm_env__: openllm.utils.ModelEnv = Field(None, init=False) - """A ModelEnv instance for this LLMConfig.""" - - __openllm_model_name__: str = Field("") - """The normalized version of __openllm_start_name__, determined by __openllm_name_type__""" - - __openllm_model_type__: t.Literal["causal_lm", "seq2seq_lm"] = Field("causal_lm") - """The model type for this given LLM. By default, it should be causal language modeling. - Currently supported 'causal_lm' or 'seq2seq_lm' - """ - - __openllm_start_name__: str = Field("") - """Default name to be used with `openllm start`""" - - __openllm_name_type__: t.Literal["dasherize", "lowercase"] = Field("dasherize") - """the default name typed for this model. "dasherize" will convert the name to lowercase and - replace spaces with dashes. "lowercase" will convert the name to lowercase.""" - - __openllm_timeout__: int = Field(36000) - """The default timeout to be set for this given LLM.""" - - __openllm_workers_per_resource__: int | float = Field(1) - """The number of workers per resource. This is used to determine the number of workers to use for this model. - For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then - OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource. - - See StarCoder for more advanced usage. See - https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy for more details. - - By default, it is set to 1. - """ - - __openllm_runtime__: t.Literal["transformers", "cpp"] = Field("transformers") - """The runtime to use for this model. Possible values are `transformers` or `cpp`. See - LlaMA for more information.""" - __openllm_use_pipeline__: bool = Field(False) """Whether this LLM will use HuggingFace Pipeline API. By default, this is set to False. 
The reason for this to be here is because we want to access this object before loading @@ -804,16 +830,40 @@ class LLMConfig: and set to False for every other models. """ - __openllm_default_id__: str = Field(None) - """Return the default model to use when using 'openllm start '. - This could be one of the keys in 'self.model_ids' or custom users model.""" + __openllm_model_type__: t.Literal["causal_lm", "seq2seq_lm"] = Field("causal_lm") + """The model type for this given LLM. By default, it should be causal language modeling. + Currently supported 'causal_lm' or 'seq2seq_lm' + """ - __openllm_model_ids__: ListStr = Field(None) - """A list of supported pretrained models tag for this given runnable. + __openllm_runtime__: t.Literal["transformers", "cpp"] = Field("transformers") + """The runtime to use for this model. Possible values are `transformers` or `cpp`. See + LlaMA for more information.""" - For example: - For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base", - "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"] + __openllm_name_type__: t.Literal["dasherize", "lowercase"] = Field("dasherize") + """the default name typed for this model. "dasherize" will convert the name to lowercase and + replace spaces with dashes. "lowercase" will convert the name to lowercase.""" + + __openllm_model_name__: str = Field(None) + """The normalized version of __openllm_start_name__, determined by __openllm_name_type__""" + + __openllm_start_name__: str = Field(None) + """Default name to be used with `openllm start`""" + + __openllm_env__: openllm.utils.ModelEnv = Field(None, init=False) + """A ModelEnv instance for this LLMConfig.""" + + __openllm_timeout__: int = Field(36000) + """The default timeout to be set for this given LLM.""" + + __openllm_workers_per_resource__: int | float = Field(1) + """The number of workers per resource. This is used to determine the number of workers to use for this model. 
+ For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then + OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource. + + See StarCoder for more advanced usage. See + https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy for more details. + + By default, it is set to 1. """ __openllm_generation_class__: type[GenerationConfig] = Field(None, init=False) @@ -835,23 +885,10 @@ class LLMConfig: cls.__name__ = f"{cls.__name__}Config" # NOTE: auto assignment attributes generated from __config__ - _make_assignment_script(cls, bentoml_cattr.structure(cls, _ModelSettings))(cls) + _make_assignment_script(cls, bentoml_cattr.structure(cls, _ModelSettingsAttr))(cls) # process a fields under cls.__dict__ and auto convert them with dantic.Field cd = cls.__dict__ anns = codegen.get_annotations(cls) - partialed = functools.partial(_field_env_key, model_name=cls.__openllm_model_name__) - - def auto_config_env(_: type[LLMConfig], attrs: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]: - return [ - a.evolve( - default=_populate_value_from_env_var(partialed(key=a.name), fallback=a.default), - metadata={ - "env": a.metadata.get("env", partialed(key=a.name)), - "description": a.metadata.get("description", "(not provided)"), - }, - ) - for a in attrs - ] # _CountingAttr is the underlying representation of attr.field ca_names = {name for name, attr in cd.items() if isinstance(attr, _CountingAttr)} @@ -864,9 +901,9 @@ class LLMConfig: val = cd.get(attr_name, attr.NOTHING) if not LazyType["_CountingAttr[t.Any]"](_CountingAttr).isinstance(val): if val is attr.NOTHING: - val = cls.Field(env=partialed(key=attr_name)) + val = cls.Field(env=_field_env_key(cls.__openllm_model_name__, attr_name)) else: - val = cls.Field(default=val, env=partialed(key=attr_name)) + val = cls.Field(default=val, env=_field_env_key(cls.__openllm_model_name__, attr_name)) 
these[attr_name] = val unannotated = ca_names - annotated_names if len(unannotated) > 0: @@ -894,7 +931,7 @@ class LLMConfig: False, # disable auto_attribs, since we already handle these False, # disable kw_only True, # collect_by_mro - field_transformer=auto_config_env, + field_transformer=_make_env_transformer(cls, cls.__openllm_model_name__), ) _weakref_slot = True # slots = True _base_names = {a.name for a in base_attrs} @@ -910,7 +947,7 @@ class LLMConfig: _make_init( cls, # cls (the attrs-decorated class) attrs, # tuple of attr.Attribute of cls - _has_pre_init, # pre_initjk + _has_pre_init, # pre_init _has_post_init, # post_init False, # frozen True, # slots @@ -1047,14 +1084,14 @@ class LLMConfig: def __getattribute__(self, item: str) -> t.Any: if item in _reserved_namespace: raise ForbiddenAttributeError( - f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified." + f"'{item}' belongs to a private namespace for {self.__class__} and should not be access nor modified." 
) return _object_getattribute.__get__(self)(item) @classmethod def check_if_gpu_is_available(cls, implementation: str | None = None, force: bool = False): if implementation is None: - implementation = cls.__openllm_env__.get_framework_env() + implementation = cls.__openllm_env__["framework_value"] try: if cls.__openllm_requires_gpu__ or force: @@ -1091,7 +1128,7 @@ class LLMConfig: """ attrs = {k: v for k, v in attrs.items() if v is not None} - model_config = cls.__openllm_env__.model_config + model_config = cls.__openllm_env__.config env_json_string = os.environ.get(model_config, None) diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py index 75f76929..4c06a598 100644 --- a/src/openllm/_llm.py +++ b/src/openllm/_llm.py @@ -35,14 +35,17 @@ from bentoml._internal.types import ModelSignatureDict import openllm from .exceptions import ForbiddenAttributeError, OpenLLMException -from .utils import (LazyLoader, bentoml_cattr, is_bitsandbytes_available, - non_intrusive_setattr) +from .utils import (DEBUG, LazyLoader, ModelEnv, bentoml_cattr, first_not_none, + get_debug_mode, is_bitsandbytes_available, + is_torch_available, non_intrusive_setattr, pkg) if t.TYPE_CHECKING: import torch import transformers from bentoml._internal.runner.strategy import Strategy + from .models.auto.factory import _BaseAutoLLMClass + class LLMRunner(bentoml.Runner): __doc__: str __module__: str @@ -170,7 +173,7 @@ def import_model( # NOTE: We need to free up the cache after importing the model # in the case where users first run openllm start without the model # available locally. 
- if openllm.utils.is_torch_available() and torch.cuda.is_available(): + if is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache() @@ -314,16 +317,25 @@ class LLM(LLMInterface, t.Generic[_M, _T]): model_id: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, + quantize: t.Literal["int8", "int4", "gptq"] | None = None, + bettertransformer: bool | None = None, **attrs: t.Any, ) -> LLM[_M, _T]: - return cls(model_id=model_id, llm_config=llm_config, *args, **attrs) + return cls( + model_id=model_id, + llm_config=llm_config, + *args, + quantize=quantize, + bettertransformer=bettertransformer, + **attrs, + ) def __init__( self, model_id: str | None = None, llm_config: openllm.LLMConfig | None = None, *args: t.Any, - quantize: t.Literal["8bit", "4bit", "gptq"] | None = None, + quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, **attrs: t.Any, ): @@ -402,7 +414,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]): llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM will use `config_class` to construct default configuration. quantize: The quantization to use for this LLM. Defaults to None. Possible values - include 8bit, 4bit and gptq. + include int8, int4 and gptq. bettertransformer: Whether to use BetterTransformer with this model. Defaults to False. *args: The args to be passed to the model. **attrs: The kwargs to be passed to the model. @@ -431,6 +443,14 @@ class LLM(LLMInterface, t.Generic[_M, _T]): int4_quant_type = attrs.pop("llm_bnb_4bit_quant_type", "nf4") int4_use_double_quant = attrs.pop("llm_bnb_4bit_use_double_quant", True) + if llm_config is not None: + logger.debug("Using given 'llm_config=(%s)' to initialize LLM.", llm_config) + self.config = llm_config + else: + self.config = self.config_class.model_construct_env(**attrs) + # The rests of the kwargs that is not used by the config class should be stored into __openllm_extras__. 
+ attrs = self.config["extras"] + if quantization_config and quantize: raise ValueError( """'quantization_config' and 'quantize' are mutually exclusive. Either customise @@ -452,7 +472,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]): self, quantize, ) - if quantize == "8bit": + if quantize == "int8": if int8_skip_modules is None: int8_skip_modules = [] if "lm_head" not in int8_skip_modules and self.config["model_type"] == "causal_lm": @@ -465,8 +485,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]): llm_int8_skip_modules=int8_skip_modules, llm_int8_has_fp16_weight=int8_has_fp16_weight, ) - elif quantize == "4bit": - trf_versions = openllm.utils.pkg.pkg_version_info("transformers") + elif quantize == "int4": + trf_versions = pkg.pkg_version_info("transformers") supports_kbits = trf_versions[:2] >= (4, 30) if supports_kbits: quantization_config = transformers.BitsAndBytesConfig( @@ -477,7 +497,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]): ) else: logger.warning( - "'quantize' is set to 4bit, while the current transformers version %s does not support " + "'quantize' is set to int4, while the current transformers version %s does not support " "k-bit quantization. 
k-bit quantization is supported since transformers 4.30, therefore " "make sure to install the latest version of transformers either via PyPI or " "from git source: 'pip install git+https://github.com/huggingface/transformers'.", @@ -495,20 +515,12 @@ class LLM(LLMInterface, t.Generic[_M, _T]): ) raise NotImplementedError("GPTQ is not supported yet.") else: - raise ValueError(f"'quantize' must be one of ['8bit', '4bit', 'gptq'], got {quantize} instead.") + raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantize} instead.") - attrs.update({"quantization_config": quantization_config}) - - if llm_config is not None: - logger.debug("Using given 'llm_config=(%s)' to initialize LLM", llm_config) - self.config = llm_config - else: - self.config = self.config_class.model_construct_env(**attrs) - # The rests of the kwargs that is not used by the config class should be stored into __openllm_extras__. - attrs = self.config["extras"] - - if not self.config["use_pipeline"]: - attrs["low_cpu_mem_usage"] = low_cpu_mem_usage + if self.__llm_implementation__ == "pt": + if not self.config["use_pipeline"]: + attrs["low_cpu_mem_usage"] = low_cpu_mem_usage + attrs["quantization_config"] = quantization_config model_kwds, tokenizer_kwds = {}, {} if self.__llm_init_kwargs__: @@ -527,8 +539,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]): model_id = os.environ.get(self.config["env"].model_id, self.config["default_id"]) # NOTE: This is the actual given path or pretrained weight for this LLM. 
- if t.TYPE_CHECKING: - assert model_id is not None + assert model_id is not None self._model_id = model_id # parsing tokenizer and model kwargs @@ -590,6 +601,16 @@ class LLM(LLMInterface, t.Generic[_M, _T]): "model_ids": orjson.dumps(self.config["model_ids"]).decode(), } + @property + def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], dict[str, t.Any]], dict[str, t.Any]]: + """Returning the processed model and tokenizer parameters to be used with + 'import_model' or any other place that requires loading model and tokenizer. + + See 'openllm.cli.download_models' for example usage. + It returns a tuple of (model_args, model_kwargs) & tokenizer_kwargs + """ + return (self._model_args, self._model_attrs), self._tokenizer_attrs + @staticmethod def make_tag( model_id: str | None = None, @@ -638,6 +659,10 @@ class LLM(LLMInterface, t.Generic[_M, _T]): return bentoml.Tag.from_taglike(f"{implementation}-{name}:{model_version}") def ensure_model_id_exists(self) -> bentoml.Model: + """This utility function will download the model if it doesn't exist yet. + Make sure to call this function if 'ensure_available' is not set during + Auto LLM initialisation. + """ output = subprocess.check_output( [ sys.executable, @@ -651,7 +676,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]): "porcelain", ] ) - if openllm.utils.DEBUG: + if DEBUG or get_debug_mode(): # NOTE: This usually only concern BentoML devs. pattern = r"^__tag__:[^:\n]+:[^:\n]+" matched = re.search(pattern, output.decode("utf-8").strip(), re.MULTILINE) @@ -665,7 +690,15 @@ class LLM(LLMInterface, t.Generic[_M, _T]): @property def _bentomodel(self) -> bentoml.Model: if self.__llm_bentomodel__ is None: - self.__llm_bentomodel__ = self.ensure_model_id_exists() + # NOTE: Since PR#28, self.__llm_bentomodel__ changed from + # ensure_model_id_exists() into just returning the model ref. 
+ # This is because we want to save a few seconds of loading time, + # as openllm.Runner and openllm.AutoLLM initialisation is around 700ms + # before #28. + # If users want to make sure to have the model downloaded, + # one should invoke `LLM.ensure_model_id_exists()` manually, + # or pass `ensure_available=True` into the Auto LLM initialisation. + self.__llm_bentomodel__ = bentoml.transformers.get(self.tag) return self.__llm_bentomodel__ @property @@ -729,13 +762,14 @@ class LLM(LLMInterface, t.Generic[_M, _T]): ) return self.__llm_tokenizer__ + # order of these fields matter here, make sure to sync it with + # openllm.models.auto.factory._BaseAutoLLMClass.for_model def to_runner( self, models: list[bentoml.Model] | None = None, max_batch_size: int | None = None, max_latency_ms: int | None = None, method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = None, - embedded: bool = False, scheduling_strategy: type[Strategy] | None = None, ) -> LLMRunner: """Convert this LLM into a Runner. @@ -753,6 +787,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]): NOTE: There are some difference between bentoml.models.get().to_runner() and LLM.to_runner(): 'name'. - 'name': will be generated by OpenLLM, hence users don't shouldn't worry about this. The generated name will be 'llm--runner' (ex: llm-dolly-v2-runner, llm-chatglm-runner) + - 'embedded': Will be disabled by default. There is no reason to run LLM in embedded mode. 
""" models = models if models is not None else [] models.append(self._bentomodel) @@ -768,10 +803,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]): method_configs = {"generate": generate_sig, "generate_iterator": generate_iterator_sig} else: signatures = ModelSignature.convert_signatures_dict(method_configs) - generate_sig = openllm.utils.first_not_none(signatures.get("generate"), default=generate_sig) - generate_iterator_sig = openllm.utils.first_not_none( - signatures.get("generate_iterator"), default=generate_iterator_sig - ) + generate_sig = first_not_none(signatures.get("generate"), default=generate_sig) + generate_iterator_sig = first_not_none(signatures.get("generate_iterator"), default=generate_iterator_sig) class _Runnable(bentoml.Runnable): SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu") @@ -860,11 +893,11 @@ class LLM(LLMInterface, t.Generic[_M, _T]): }, ), name=self.runner_name, + embedded=False, models=models, max_batch_size=max_batch_size, max_latency_ms=max_latency_ms, method_configs=bentoml_cattr.unstructure(method_configs), - embedded=embedded, scheduling_strategy=scheduling_strategy, ) @@ -918,22 +951,28 @@ def Runner( ... -def Runner(model_name: str, **attrs: t.Any) -> LLMRunner: +def Runner(model_name: str, ensure_available: bool = True, init_local: bool = False, **attrs: t.Any) -> LLMRunner: """Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models' Args: model_name: Supported model name from 'openllm models' + ensure_available: If True, it will ensure the model is available before creating the runner. + Set to False for faster creation time. Note that you will need to make sure + the model for this 'model_id' is available before calling the runner. + One can do this by doing the following: + ```python + runner = openllm.Runner("dolly-v2", ensure_available=False) + runner.llm.ensure_model_id_exists() + ``` + init_local: If True, it will initialize the model locally. 
This is useful if you want to + run the model locally. (Symmetrical to bentoml.Runner.init_local()) **attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs behaviour """ - init_local = attrs.pop("init_local", False) - ModelEnv = openllm.utils.ModelEnv(model_name) - if ModelEnv.get_framework_env() == "flax": - runner = openllm.AutoFlaxLLM.create_runner(model_name, **attrs) - elif ModelEnv.get_framework_env() == "tf": - runner = openllm.AutoTFLLM.create_runner(model_name, **attrs) - else: - runner = openllm.AutoLLM.create_runner(model_name, **attrs) + runner = t.cast( + "_BaseAutoLLMClass", + openllm[ModelEnv(model_name)["framework_value"]], # type: ignore (internal API) + ).create_runner(model_name, ensure_available=ensure_available, **attrs) if init_local: runner.init_local(quiet=True) diff --git a/src/openllm/_package.py b/src/openllm/_package.py index 624f741e..3ffbf226 100644 --- a/src/openllm/_package.py +++ b/src/openllm/_package.py @@ -29,12 +29,15 @@ from bentoml._internal.bento.build_config import DockerOptions, PythonOptions from bentoml._internal.configuration import get_debug_mode import openllm -import openllm.utils as utils -from openllm.utils import pkg + +from .utils import (ModelEnv, codegen, first_not_none, is_flax_available, + is_tf_available, is_torch_available, pkg) if t.TYPE_CHECKING: from fs.base import FS + from .models.auto.factory import _BaseAutoLLMClass + logger = logging.getLogger(__name__) OPENLLM_DEV_BUILD = "OPENLLM_DEV_BUILD" @@ -82,10 +85,10 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth if not (str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false"): packages.append(f"bentoml>={'.'.join([str(i) for i in pkg.pkg_version_info('bentoml')])}") - env = llm.config["env"] - to_use_framework = env.get_framework_env() - if to_use_framework == "flax": - assert utils.is_flax_available(), f"Flax is not available, while {env.framework} is 
set to 'flax'" + env: ModelEnv = llm.config["env"] + framework_envvar = env["framework_value"] + if framework_envvar == "flax": + assert is_flax_available(), f"Flax is not available, while {env.framework} is set to 'flax'" packages.extend( [ f"flax>={importlib.metadata.version('flax')}", @@ -93,8 +96,8 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth f"jaxlib>={importlib.metadata.version('jaxlib')}", ] ) - elif to_use_framework == "tf": - assert utils.is_tf_available(), f"TensorFlow is not available, while {env.framework} is set to 'tf'" + elif framework_envvar == "tf": + assert is_tf_available(), f"TensorFlow is not available, while {env.framework} is set to 'tf'" candidates = ( "tensorflow", "tensorflow-cpu", @@ -116,7 +119,7 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth except importlib.metadata.PackageNotFoundError: pass else: - assert utils.is_torch_available(), "PyTorch is not available. Make sure to have it locally installed." + assert is_torch_available(), "PyTorch is not available. Make sure to have it locally installed." 
packages.extend([f"torch>={importlib.metadata.version('torch')}"]) wheels: list[str] = [] @@ -127,7 +130,13 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth return PythonOptions(packages=packages, wheels=wheels, lock_packages=True) -def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float) -> DockerOptions: +def construct_docker_options( + llm: openllm.LLM[t.Any, t.Any], + _: FS, + workers_per_resource: int | float, + quantize: t.LiteralString | None, + bettertransformer: bool | None, +) -> DockerOptions: _bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "") _bentoml_config_options_opts = [ "api_server.traffic.timeout=36000", # NOTE: Currently we hardcode this value @@ -135,39 +144,112 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_ f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}', ] _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts) - env = llm.config["env"] - return DockerOptions( - cuda_version="11.6", # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version - env={ - env.framework: env.get_framework_env(), - "OPENLLM_MODEL": llm.config["model_name"], - "OPENLLM_MODEL_ID": llm.model_id, - "BENTOML_DEBUG": str(get_debug_mode()), - "BENTOML_CONFIG_OPTIONS": _bentoml_config_options, - }, - system_packages=["git"], + env: ModelEnv = llm.config["env"] + + env_dict = { + env.framework: env.framework_value, + env.config: llm.config.model_dump_json().decode(), + "OPENLLM_MODEL": llm.config["model_name"], + "OPENLLM_MODEL_ID": llm.model_id, + "BENTOML_DEBUG": str(get_debug_mode()), + "BENTOML_CONFIG_OPTIONS": _bentoml_config_options, + } + + # We need to handle None separately here, as env from subprocess doesn't + # accept None value. 
+ _env = ModelEnv(llm.config["model_name"], bettertransformer=bettertransformer, quantize=quantize) + + if _env.bettertransformer_value is not None: + env_dict[_env.bettertransformer] = _env.bettertransformer_value + if _env.quantize_value is not None: + env_dict[_env.quantize] = _env.quantize_value + + # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version + return DockerOptions(cuda_version="11.6", env=env_dict, system_packages=["git"]) + + +@t.overload +def build( + model_name: str, + *, + model_id: str | None = ..., + quantize: t.LiteralString | None = ..., + bettertransformer: bool | None = ..., + _workers_per_resource: int | float | None = ..., + _overwrite_existing_bento: bool = ..., + __cli__: t.Literal[False] = ..., + **attrs: t.Any, +) -> bentoml.Bento: + ... + + +@t.overload +def build( + model_name: str, + *, + model_id: str | None = ..., + quantize: t.LiteralString | None = ..., + bettertransformer: bool | None = ..., + _workers_per_resource: int | float | None = ..., + _overwrite_existing_bento: bool = ..., + __cli__: t.Literal[True] = ..., + **attrs: t.Any, +) -> tuple[bentoml.Bento, bool]: + ... + + +def _build_bento( + bento_tag: bentoml.Tag, + service_name: str, + llm_fs: FS, + llm: openllm.LLM[t.Any, t.Any], + workers_per_resource: int | float, + quantize: t.LiteralString | None, + bettertransformer: bool | None, +) -> bentoml.Bento: + framework_envvar = llm.config["env"]["framework_value"] + labels = dict(llm.identifying_params) + labels.update({"_type": llm.llm_type, "_framework": framework_envvar}) + logger.info("Building Bento for LLM '%s'", llm.config["start_name"]) + return bentoml.bentos.build( + f"{service_name}:svc", + name=bento_tag.name, + labels=labels, + description=f"OpenLLM service for {llm.config['start_name']}", + include=[ + f for f in llm_fs.walk.files(filter=["*.py"]) + ], # NOTE: By default, we are using _service.py as the default service, for now. 
+ exclude=["/venv", "__pycache__/", "*.py[cod]", "*$py.class"], + python=construct_python_options(llm, llm_fs), + docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer), + version=bento_tag.version, + build_ctx=llm_fs.getsyspath("/"), ) -@t.overload -def build(model_name: str, *, __cli__: t.Literal[False] = ..., **attrs: t.Any) -> bentoml.Bento: - ... +def build( + model_name: str, + *, + model_id: str | None = None, + quantize: t.LiteralString | None = None, + bettertransformer: bool | None = None, + _workers_per_resource: int | float | None = None, + _overwrite_existing_bento: bool = False, + __cli__: bool = False, + **attrs: t.Any, +) -> tuple[bentoml.Bento, bool] | bentoml.Bento: + """Package a LLM into a Bento. + The LLM will be built into a BentoService with the following structure: + if quantize is passed, it will instruct the model to be quantized dynamically during serving time. + if bettertransformer is passed, it will instruct the model to use BetterTransformer during serving time. -@t.overload -def build(model_name: str, *, __cli__: t.Literal[True] = ..., **attrs: t.Any) -> tuple[bentoml.Bento, bool]: - ... + Other parameters including model_name, model_id and attrs will be passed to the LLM class itself. 
+ """ - -def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[bentoml.Bento, bool] | bentoml.Bento: - """Package a LLM into a Bento.""" - - overwrite_existing_bento = attrs.pop("_overwrite_existing_bento", False) + _previously_built = False current_model_envvar = os.environ.pop("OPENLLM_MODEL", None) current_model_id_envvar = os.environ.pop("OPENLLM_MODEL_ID", None) - _previously_built = False - workers_per_resource = attrs.pop("_workers_per_resource", None) - model_id: str = attrs.pop("model_id", None) llm_config = openllm.AutoConfig.for_model(model_name) @@ -178,52 +260,58 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be try: os.environ["OPENLLM_MODEL"] = inflection.underscore(model_name) - to_use_framework = llm_config["env"].get_framework_env() - if to_use_framework == "flax": - llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs) - elif to_use_framework == "tf": - llm = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs) - else: - llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs) + framework_envvar = llm_config["env"]["framework_value"] + llm = t.cast( + "_BaseAutoLLMClass", + openllm[framework_envvar], # type: ignore (internal API) + ).for_model( + model_name, + model_id=model_id, + llm_config=llm_config, + quantize=quantize, + bettertransformer=bettertransformer, + **attrs, + ) os.environ["OPENLLM_MODEL_ID"] = llm.model_id labels = dict(llm.identifying_params) - labels.update({"_type": llm.llm_type, "_framework": to_use_framework}) + labels.update({"_type": llm.llm_type, "_framework": framework_envvar}) service_name = f"generated_{llm_config['model_name']}_service.py" - workers_per_resource = utils.first_not_none(workers_per_resource, default=llm_config["workers_per_resource"]) + workers_per_resource = first_not_none(_workers_per_resource, 
default=llm_config["workers_per_resource"]) with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs: # add service.py definition to this temporary folder - utils.codegen.write_service(model_name, llm.model_id, service_name, llm_fs) + codegen.write_service(model_name, llm.model_id, service_name, llm_fs) bento_tag = bentoml.Tag.from_taglike(f"{llm.llm_type}-service:{llm.tag.version}") try: bento = bentoml.get(bento_tag) - if overwrite_existing_bento: + if _overwrite_existing_bento: + logger.info("Overwriting previously saved Bento.") bentoml.delete(bento_tag) - raise bentoml.exceptions.NotFound("Overwriting previously saved Bento.") + bento = _build_bento( + bento_tag, + service_name, + llm_fs, + llm, + workers_per_resource=workers_per_resource, + quantize=quantize, + bettertransformer=bettertransformer, + ) _previously_built = True except bentoml.exceptions.NotFound: logger.info("Building Bento for LLM '%s'", llm_config["start_name"]) - bento = bentoml.bentos.build( - f"{service_name}:svc", - name=bento_tag.name, - labels=labels, - description=f"OpenLLM service for {llm_config['start_name']}", - include=[ - f for f in llm_fs.walk.files(filter=["*.py"]) - ], # NOTE: By default, we are using _service.py as the default service, for now. 
- exclude=["/venv", "__pycache__/", "*.py[cod]", "*$py.class"], - python=construct_python_options(llm, llm_fs), - docker=construct_docker_options(llm, llm_fs, workers_per_resource), - version=bento_tag.version, - build_ctx=llm_fs.getsyspath("/"), + bento = _build_bento( + bento_tag, + service_name, + llm_fs, + llm, + workers_per_resource=workers_per_resource, + quantize=quantize, + bettertransformer=bettertransformer, ) - if __cli__: - return bento, _previously_built - else: - return bento + return (bento, _previously_built) if __cli__ else bento except Exception as e: logger.error("\nException caught during building LLM %s: \n", model_name, exc_info=e) raise diff --git a/src/openllm/_service.py b/src/openllm/_service.py index 6127d646..55f4bab7 100644 --- a/src/openllm/_service.py +++ b/src/openllm/_service.py @@ -34,7 +34,16 @@ model = os.environ.get("OPENLLM_MODEL", "{__model_name__}") # openllm: model na model_id = os.environ.get("OPENLLM_MODEL_ID", "{__model_id__}") # openllm: model id llm_config = openllm.AutoConfig.for_model(model) -runner = openllm.Runner(model, model_id=model_id, llm_config=llm_config) + +runner = openllm.Runner( + model, + model_id=model_id, + llm_config=llm_config, + bettertransformer=llm_config["env"]["bettertransformer_value"], + quantize=llm_config["env"]["quantize_value"], + ensure_available=False, + init_local=False, +) svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner]) @@ -57,6 +66,6 @@ def metadata_v1(_: str) -> openllm.MetadataOutput: model_id=model_id, timeout=llm_config["timeout"], model_name=llm_config["model_name"], - framework=llm_config["env"].get_framework_env(), + framework=llm_config["env"]["framework_value"], configuration=llm_config.model_dump_json().decode(), ) diff --git a/src/openllm/cli.py b/src/openllm/cli.py index 1b6aba31..dad8f7c8 100644 --- a/src/openllm/cli.py +++ b/src/openllm/cli.py @@ -41,12 +41,18 @@ from simple_di import Provide, inject import openllm from .__about__ 
import __version__ +from .exceptions import OpenLLMException +from .utils import (DEBUG, LazyLoader, LazyType, ModelEnv, analytics, + bentoml_cattr, configure_logging, configure_server_logging, + first_not_none, get_debug_mode, get_quiet_mode, gpu_count, + is_torch_available, set_debug_mode, set_quiet_mode) if t.TYPE_CHECKING: import torch from bentoml._internal.models import ModelStore from ._types import ClickFunctionWrapper, F, P + from .models.auto.factory import _BaseAutoLLMClass ServeCommand = t.Literal["serve", "serve-grpc"] OutputLiteral = t.Literal["json", "pretty", "porcelain"] @@ -54,11 +60,9 @@ if t.TYPE_CHECKING: TupleStrAny = tuple[str, ...] else: TupleStrAny = tuple - torch = openllm.utils.LazyLoader("torch", globals(), "torch") + torch = LazyLoader("torch", globals(), "torch") -openllm.utils.configure_logging() - logger = logging.getLogger(__name__) COLUMNS = int(os.environ.get("COLUMNS", 120)) @@ -75,243 +79,161 @@ OPENLLM_FIGLET = """\ """ +class NargsOptions(cog.GroupedOption): + """An option that supports nargs=-1. + Derived from https://stackoverflow.com/a/48394004/8643197 + + We mk add_to_parser to handle multiple value that is passed into this specific + options. 
+ """ + + def __init__(self, *args: t.Any, **attrs: t.Any): + nargs = attrs.pop("nargs", -1) + if nargs != -1: + raise OpenLLMException(f"'nargs' is set, and must be -1 instead of {nargs}") + super(NargsOptions, self).__init__(*args, **attrs) + self._prev_parser_process: t.Callable[[t.Any, click.parser.ParsingState], None] | None = None + self._nargs_parser: click.parser.Option | None = None + + def add_to_parser(self, parser: click.OptionParser, ctx: click.Context) -> None: + def _parser(value: t.Any, state: click.parser.ParsingState): + # method to hook to the parser.process + done = False + value = [value] + # grab everything up to the next option + assert self._nargs_parser is not None + while state.rargs and not done: + for prefix in self._nargs_parser.prefixes: + if state.rargs[0].startswith(prefix): + done = True + if not done: + value.append(state.rargs.pop(0)) + value = tuple(value) + + # call the actual process + assert self._prev_parser_process is not None + self._prev_parser_process(value, state) + + retval = super(NargsOptions, self).add_to_parser(parser, ctx) + for name in self.opts: + our_parser = parser._long_opt.get(name) or parser._short_opt.get(name) + if our_parser: + self._nargs_parser = our_parser + self._prev_parser_process = our_parser.process + our_parser.process = _parser + break + return retval + + +def parse_device_callback( + _: click.Context, params: click.Parameter, value: tuple[str, ...] | tuple[t.Literal["all"] | str] | None +) -> t.Any: + if value is None: + return value + + if not LazyType(TupleStrAny).isinstance(value): + raise RuntimeError(f"{params} only accept multiple values.") + + # NOTE: --device all is a special case + if len(value) == 1 and value[0] == "all": + return gpu_count() + + parsed: tuple[str, ...] 
= tuple() + for v in value: + if v == ",": + # NOTE: This hits when CUDA_VISIBLE_DEVICES is set + continue + if "," in v: + parsed += tuple(v.split(",")) + else: + parsed += tuple(v.split()) + return tuple(filter(lambda x: x, parsed)) + + def _echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.Any) -> None: call = click.echo if _with_style: - attrs["fg"] = fg if not openllm.utils.get_debug_mode() else None + attrs["fg"] = fg if not get_debug_mode() else None call = click.secho call(text, **attrs) -def quantize_option(factory: t.Any): - help_str = """Running this model in quantized mode. - Note that GPTQ is currently working in progress and will be available soon. +output_option = click.option( + "-o", + "--output", + type=click.Choice(["json", "pretty", "porcelain"]), + default="pretty", + help="Showing output type.", + show_default=True, + envvar="OPENLLM_OUTPUT", + show_envvar=True, +) + + +def model_id_option(factory: t.Any, model_env: ModelEnv | None = None, click_type: click.ParamType | None = None): + envvar = None + if model_env is not None: + envvar = model_env.model_id + return factory.option( + "--model-id", + type=click_type if click_type else click.STRING, + default=None, + help="Optional model_id name or path for (fine-tune) weight.", + envvar=envvar, + show_envvar=True if envvar is not None else False, + ) + + +def workers_per_resource_option(factory: t.Any, build: bool = False): + help_str = """Number of workers per resource assigned. + See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy + for more information. By default, this is set to 1.""" + if build: + help_str += """\n + NOTE: The workers value passed into 'build' will determine how the LLM can + be provisioned in Kubernetes as well as in standalone container. 
This will + ensure it has the same effect with 'openllm start --workers ...'""" + return factory.option( + "--workers-per-resource", + default=None, + type=click.FLOAT, + help=help_str, + required=False, + ) + + +def quantize_option(factory: t.Any, build: bool = False): + help_str = ( + "Running this model in quantized mode." if not build else "Set quantization mode for serving in deployment." + ) + help_str += """\n + + GPTQ is currently working in progress and will be available soon. NOTE: Quantization is only available for PyTorch models. """ return factory.option( "--quantize", - type=click.Choice(["8bit", "4bit", "gptq"]), + type=click.Choice(["int8", "int4", "gptq"]), default=None, help=help_str, ) -def bettertransformer_option(factory: t.Any): +def bettertransformer_option(factory: t.Any, model_env: ModelEnv | None = None): + envvar = None + if model_env is not None: + envvar = model_env.model_id return factory.option( "--bettertransformer", is_flag=True, default=None, - help="Use BetterTransformer wrapper to serve model", + help="Use BetterTransformer wrapper to serve model. This will applies during serving time.", + envvar=envvar, + show_envvar=True if envvar is not None else False, ) -def start_model_command( - model_name: str, - group: click.Group, - _context_settings: dict[str, t.Any] | None = None, - _serve_grpc: bool = False, -) -> click.Command: - """Generate a 'click.Command' for any given LLM. - - Args: - model_name: The name of the model - factory: The click.Group to add the command to - _context_settings: The context settings to use for the command - _serve_grpc: Whether to serve the model via gRPC or HTTP - - Returns: - The click.Command for starting the model server - - Note that the internal commands will return the llm_config and a boolean determine - whether the server is run with GPU or not. 
- """ - from bentoml._internal.configuration.containers import BentoMLContainer - - openllm.utils.configure_logging() - - llm_config = openllm.AutoConfig.for_model(model_name) - env = llm_config["env"] - - docstring = f"""\ -{env.start_docstring} -\b -Available model_id(s): {llm_config['model_ids']} [default: {llm_config['default_id']}] -""" - command_attrs: dict[str, t.Any] = { - "name": llm_config["model_name"], - "context_settings": _context_settings or {}, - "short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)", - "help": docstring, - } - - aliases: list[str] = [] - if llm_config["name_type"] == "dasherize": - aliases.append(llm_config["start_name"]) - - command_attrs["aliases"] = aliases if len(aliases) > 0 else None - - serve_decorator = _http_server_args if not _serve_grpc else _grpc_server_args - - available_gpu = openllm.utils.gpu_count() - if llm_config["requires_gpu"] and len(available_gpu) < 1: - # NOTE: The model requires GPU, therefore we will return a dummy command - command_attrs.update( - { - "short_help": "(Disabled because there is no GPU available)", - "help": f"""{model_name} is currently not available to run on your - local machine because it requires GPU for faster inference.""", - } - ) - - @group.command(**command_attrs) - def noop() -> openllm.LLMConfig: - _echo("No GPU available, therefore this command is disabled", fg="red") - openllm.utils.analytics.track_start_init(llm_config) - return llm_config - - return noop - - @group.command(**command_attrs) - @llm_config.to_click_options - @serve_decorator - @cog.optgroup.group("General LLM Options") - @cog.optgroup.option( - "--server-timeout", - type=int, - default=None, - help="Server timeout in seconds", - ) - @model_id_option(cog.optgroup, model_env=env) - @cog.optgroup.option( - "--device", - type=tuple, - cls=NargsOptions, - nargs=-1, - envvar="CUDA_VISIBLE_DEVICES", - callback=parse_device_callback, - help=f"Assign GPU devices (if available) for 
{model_name}.", - show_envvar=True, - ) - @workers_per_resource_option(cog.optgroup) - @quantize_option(cog.optgroup) - @bettertransformer_option(cog.optgroup) - def model_start( - server_timeout: int | None, - model_id: str | None, - workers_per_resource: float | None, - device: tuple[str, ...] | None, - quantize: t.Literal["8bit", "4bit", "gptq"] | None, - bettertransformer: bool | None, - **attrs: t.Any, - ) -> openllm.LLMConfig: - config, server_attrs = llm_config.model_validate_click(**attrs) - - if quantize and env.get_framework_env() != "pt": - _echo("Quantization is only available for PyTorch models.", fg="yellow") - - if env.get_framework_env() == "flax": - llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config, ensure_available=True) - elif env.get_framework_env() == "tf": - llm = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config, ensure_available=True) - else: - llm = openllm.AutoLLM.for_model( - model_name, - model_id=model_id, - llm_config=config, - quantize=quantize, - bettertransformer=bettertransformer, - ensure_available=True, - ) - - requirements = config["requirements"] - if requirements is not None and len(requirements) > 0: - _echo( - f"Make sure to have the following dependencies available: {requirements}", - fg="yellow", - ) - - workers_per_resource = openllm.utils.first_not_none( - workers_per_resource, default=config["workers_per_resource"] - ) - server_timeout = openllm.utils.first_not_none(server_timeout, default=config["timeout"]) - - num_workers = int(1 / workers_per_resource) - if num_workers > 1: - _echo( - f"{model_name} requires at least {num_workers} GPUs/CPUs available per worker." - " Make sure that it has available resources to run inference.", - fg="yellow", - ) - - server_attrs.update({"working_dir": os.path.dirname(__file__)}) - if _serve_grpc: - server_attrs["grpc_protocol_version"] = "v1" - # NOTE: currently, theres no development args in bentoml.Server. 
To be fixed upstream. - development = server_attrs.pop("development") - server_attrs.setdefault("production", not development) - - start_env = os.environ.copy() - - # NOTE: This is to set current configuration - _bentoml_config_options = start_env.pop("BENTOML_CONFIG_OPTIONS", "") - _bentoml_config_options_opts = [ - "tracing.sample_rate=1.0", - f"api_server.traffic.timeout={server_timeout}", - f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}', - f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}', - ] - if device: - if len(device) > 1: - for idx, dev in enumerate(device): - _bentoml_config_options_opts.append( - f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' - ) - else: - _bentoml_config_options_opts.append( - f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]' - ) - - _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts) - - start_env.update( - { - env.framework: env.get_framework_env(), - env.model_config: llm.config.model_dump_json().decode(), - "OPENLLM_MODEL": model_name, - "OPENLLM_MODEL_ID": llm.model_id, - "BENTOML_DEBUG": str(openllm.utils.get_debug_mode()), - "BENTOML_CONFIG_OPTIONS": _bentoml_config_options, - "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()), - } - ) - - if t.TYPE_CHECKING: - server_cls: type[bentoml.HTTPServer] if not _serve_grpc else type[bentoml.GrpcServer] - - server_cls = getattr(bentoml, "HTTPServer" if not _serve_grpc else "GrpcServer") - server_attrs["timeout"] = 90 - server = server_cls("_service.py:svc", **server_attrs) - - try: - openllm.utils.analytics.track_start_init(llm.config) - server.start(env=start_env, text=True, blocking=True) - except Exception as err: - _echo(f"Error caught while starting LLM Server:\n{err}", fg="red") - raise - else: - if not openllm.utils.get_debug_mode(): - 
_echo( - f"\nšŸš€ Next step: run 'openllm build {model_name}' to create a Bento for {model_name}", - fg="blue", - ) - - # NOTE: Return the configuration for telemetry purposes. - return llm_config - - return model_start - - class OpenLLMCommandGroup(BentoMLCommandGroup): NUMBER_OF_COMMON_PARAMS = 3 @@ -333,19 +255,19 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): "--do-not-track", is_flag=True, default=False, - envvar=openllm.utils.analytics.OPENLLM_DO_NOT_TRACK, + envvar=analytics.OPENLLM_DO_NOT_TRACK, help="Do not send usage info", ) @functools.wraps(f) def wrapper(quiet: bool, debug: bool, *args: P.args, **attrs: P.kwargs) -> t.Any: if quiet: - openllm.utils.set_quiet_mode(True) + set_quiet_mode(True) if debug: logger.warning("'--quiet' passed; ignoring '--verbose/--debug'") elif debug: - openllm.utils.set_debug_mode(True) + set_debug_mode(True) - openllm.utils.configure_logging() + configure_logging() return f(*args, **attrs) @@ -363,26 +285,26 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): @functools.wraps(func) def wrapper(do_not_track: bool, *args: P.args, **attrs: P.kwargs) -> t.Any: if do_not_track: - with openllm.utils.analytics.set_bentoml_tracking(): + with analytics.set_bentoml_tracking(): return func(*args, **attrs) start_time = time.time_ns() - with openllm.utils.analytics.set_bentoml_tracking(): + with analytics.set_bentoml_tracking(): assert group.name is not None, "group.name should not be None" - event = openllm.utils.analytics.OpenllmCliEvent(cmd_group=group.name, cmd_name=command_name) + event = analytics.OpenllmCliEvent(cmd_group=group.name, cmd_name=command_name) try: return_value = func(*args, **attrs) duration_in_ms = (time.time_ns() - start_time) / 1e6 event.duration_in_ms = duration_in_ms - openllm.utils.analytics.track(event) + analytics.track(event) return return_value except Exception as e: duration_in_ms = (time.time_ns() - start_time) / 1e6 event.duration_in_ms = duration_in_ms event.error_type = type(e).__name__ 
event.return_code = 2 if isinstance(e, KeyboardInterrupt) else 1 - openllm.utils.analytics.track(event) + analytics.track(event) raise return t.cast("ClickFunctionWrapper[..., t.Any]", wrapper) @@ -400,7 +322,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): def wrapper(*args: P.args, **attrs: P.kwargs) -> t.Any: try: return func(*args, **attrs) - except openllm.exceptions.OpenLLMException as err: + except OpenLLMException as err: raise click.ClickException( click.style(f"[{group.name}] '{command_name}' failed: " + err.message, fg="red") ) from err @@ -409,26 +331,18 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): return t.cast("ClickFunctionWrapper[..., t.Any]", wrapper) - def __init__(self, *args: t.Any, **attrs: t.Any) -> None: - super(OpenLLMCommandGroup, self).__init__(*args, **attrs) - # these two dictionaries will store known aliases for commands and groups - self._cached_http: dict[str, t.Any] = {} - self._cached_grpc: dict[str, t.Any] = {} - def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None: cmd_name = self.resolve_alias(cmd_name) if ctx.command.name == "start": - if cmd_name not in self._cached_http: - self._cached_http[cmd_name] = start_model_command( - cmd_name, self, _context_settings=ctx.command.context_settings - ) - return self._cached_http[cmd_name] + try: + return _cached_http[cmd_name] + except KeyError: + raise click.BadArgumentUsage(f"{cmd_name} is not a valid model identifier supported by OpenLLM.") elif ctx.command.name == "start-grpc": - if cmd_name not in self._cached_grpc: - self._cached_grpc[cmd_name] = start_model_command( - cmd_name, self, _context_settings=ctx.command.context_settings, _serve_grpc=True - ) - return self._cached_grpc[cmd_name] + try: + return _cached_grpc[cmd_name] + except KeyError: + raise click.BadArgumentUsage(f"{cmd_name} is not a valid model identifier supported by OpenLLM.") return super().get_command(ctx, cmd_name) def list_commands(self, ctx: click.Context) -> list[str]: @@ 
-484,6 +398,42 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): return t.cast("F[[t.Callable[..., t.Any]], click.Command]", wrapper) +@click.group(cls=OpenLLMCommandGroup, context_settings=_CONTEXT_SETTINGS, name="openllm") +@click.version_option(__version__, "--version", "-v") +def cli(): + """ + \b + ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•—ā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā•—ā–ˆā–ˆā•— ā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā•— + ā–ˆā–ˆā•”ā•ā•ā•ā–ˆā–ˆā•—ā–ˆā–ˆā•”ā•ā•ā–ˆā–ˆā•—ā–ˆā–ˆā•”ā•ā•ā•ā•ā•ā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā•‘ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ ā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā–ˆā•‘ + ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•”ā•ā–ˆā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā•”ā–ˆā–ˆā•— ā–ˆā–ˆā•‘ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ ā–ˆā–ˆā•”ā–ˆā–ˆā–ˆā–ˆā•”ā–ˆā–ˆā•‘ + ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ā–ˆā–ˆā•”ā•ā•ā•ā• ā–ˆā–ˆā•”ā•ā•ā• ā–ˆā–ˆā•‘ā•šā–ˆā–ˆā•—ā–ˆā–ˆā•‘ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ā•šā–ˆā–ˆā•”ā•ā–ˆā–ˆā•‘ + ā•šā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•”ā•ā–ˆā–ˆā•‘ ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•—ā–ˆā–ˆā•‘ ā•šā–ˆā–ˆā–ˆā–ˆā•‘ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•—ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•—ā–ˆā–ˆā•‘ ā•šā•ā• ā–ˆā–ˆā•‘ + ā•šā•ā•ā•ā•ā•ā• ā•šā•ā• ā•šā•ā•ā•ā•ā•ā•ā•ā•šā•ā• ā•šā•ā•ā•ā•ā•šā•ā•ā•ā•ā•ā•ā•ā•šā•ā•ā•ā•ā•ā•ā•ā•šā•ā• ā•šā•ā• + + \b + An open platform for operating large language models in production. + Fine-tune, serve, deploy, and monitor any LLMs with ease. + """ + + +@cli.group(cls=OpenLLMCommandGroup, context_settings=_CONTEXT_SETTINGS, name="start") +def start_cli(): + """ + Start any LLM as a REST server. + + $ openllm start -- ... + """ + + +@cli.group(cls=OpenLLMCommandGroup, context_settings=_CONTEXT_SETTINGS, name="start-grpc") +def start_grpc_cli(): + """ + Start any LLM as a gRPC server. + + $ openllm start-grpc -- ... + """ + + # NOTE: A list of bentoml option that is not needed for parsing. # NOTE: User shouldn't set '--working-dir', as OpenLLM will setup this. 
# NOTE: production is also deprecated @@ -533,75 +483,244 @@ _http_server_args = parse_serve_args(False) _grpc_server_args = parse_serve_args(True) -class NargsOptions(cog.GroupedOption): - """An option that supports nargs=-1. - Derived from https://stackoverflow.com/a/48394004/8643197 +def start_model_command( + model_name: str, + _context_settings: dict[str, t.Any] | None = None, + _serve_grpc: bool = False, +) -> click.Command: + """Generate a 'click.Command' for any given LLM. - We mk add_to_parser to handle multiple value that is passed into this specific - options. + Args: + model_name: The name of the model + factory: The click.Group to add the command to + _context_settings: The context settings to use for the command + _serve_grpc: Whether to serve the model via gRPC or HTTP + + Returns: + The click.Command for starting the model server + + Note that the internal commands will return the llm_config and a boolean determine + whether the server is run with GPU or not. """ + from bentoml._internal.configuration.containers import BentoMLContainer - def __init__(self, *args: t.Any, **attrs: t.Any): - nargs = attrs.pop("nargs", -1) - if nargs != -1: - raise openllm.exceptions.OpenLLMException(f"'nargs' is set, and must be -1 instead of {nargs}") - super(NargsOptions, self).__init__(*args, **attrs) - self._prev_parser_process: t.Callable[[t.Any, click.parser.ParsingState], None] | None = None - self._nargs_parser: click.parser.Option | None = None + configure_logging() - def add_to_parser(self, parser: click.OptionParser, ctx: click.Context) -> None: - def _parser(value: t.Any, state: click.parser.ParsingState): - # method to hook to the parser.process - done = False - value = [value] - # grab everything up to the next option - assert self._nargs_parser is not None - while state.rargs and not done: - for prefix in self._nargs_parser.prefixes: - if state.rargs[0].startswith(prefix): - done = True - if not done: - value.append(state.rargs.pop(0)) - value = 
tuple(value) + llm_config = openllm.AutoConfig.for_model(model_name) + env: ModelEnv = llm_config["env"] - # call the actual process - assert self._prev_parser_process is not None - self._prev_parser_process(value, state) + docstring = f"""\ +{env.start_docstring} +\b +Available model_id(s): {llm_config['model_ids']} [default: {llm_config['default_id']}] +""" + command_attrs: dict[str, t.Any] = { + "name": llm_config["model_name"], + "context_settings": _context_settings or {}, + "short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)", + "help": docstring, + } - retval = super(NargsOptions, self).add_to_parser(parser, ctx) - for name in self.opts: - our_parser = parser._long_opt.get(name) or parser._short_opt.get(name) - if our_parser: - self._nargs_parser = our_parser - self._prev_parser_process = our_parser.process - our_parser.process = _parser - break - return retval + aliases: list[str] = [] + if llm_config["name_type"] == "dasherize": + aliases.append(llm_config["start_name"]) + command_attrs["aliases"] = aliases if len(aliases) > 0 else None -def parse_device_callback( - _: click.Context, params: click.Parameter, value: tuple[str, ...] 
| tuple[t.Literal["all"] | str] | None -) -> t.Any: - if value is None: - return value + serve_decorator = _http_server_args if not _serve_grpc else _grpc_server_args + group = start_cli if not _serve_grpc else start_grpc_cli - if not openllm.utils.LazyType(TupleStrAny).isinstance(value): - raise RuntimeError(f"{params} only accept multiple values.") + available_gpu = gpu_count() + if llm_config["requires_gpu"] and len(available_gpu) < 1: + # NOTE: The model requires GPU, therefore we will return a dummy command + command_attrs.update( + { + "short_help": "(Disabled because there is no GPU available)", + "help": f"""{model_name} is currently not available to run on your + local machine because it requires GPU for faster inference.""", + } + ) - # NOTE: --device all is a special case - if len(value) == 1 and value[0] == "all": - return openllm.utils.gpu_count() + @group.command(**command_attrs) + def noop() -> openllm.LLMConfig: + _echo("No GPU available, therefore this command is disabled", fg="red") + analytics.track_start_init(llm_config) + return llm_config - parsed: tuple[str, ...] 
= tuple() - for v in value: - if v == ",": - # NOTE: This hits when CUDA_VISIBLE_DEVICES is set - continue - if "," in v: - parsed += tuple(v.split(",")) + return noop + + @group.command(**command_attrs) + @llm_config.to_click_options + @serve_decorator + @cog.optgroup.group("General LLM Options") + @cog.optgroup.option( + "--server-timeout", + type=int, + default=None, + help="Server timeout in seconds", + ) + @workers_per_resource_option(cog.optgroup) + @model_id_option(cog.optgroup, model_env=env, click_type=click.Choice(llm_config["model_ids"])) + @cog.optgroup.option( + "--device", + type=tuple, + cls=NargsOptions, + nargs=-1, + envvar="CUDA_VISIBLE_DEVICES", + callback=parse_device_callback, + help=f"Assign GPU devices (if available) for {model_name}.", + show_envvar=True, + ) + @quantize_option(cog.optgroup) + @bettertransformer_option(cog.optgroup, model_env=env) + @cog.optgroup.option( + "--fast", + is_flag=True, + default=False, + help="Bypass auto model checks and setup. This option is ahead-of-serving time.", + ) + @click.pass_context + def model_start( + ctx: click.Context, + server_timeout: int | None, + model_id: str | None, + workers_per_resource: float | None, + device: tuple[str, ...] | None, + quantize: t.Literal["int8", "int4", "gptq"] | None, + bettertransformer: bool | None, + fast: bool, + **attrs: t.Any, + ) -> openllm.LLMConfig: + config, server_attrs = llm_config.model_validate_click(**attrs) + + # Create a new model env to work with the envvar during CLI invocation + env = ModelEnv(config["model_name"]) + framework_envvar = env.framework_value + + if quantize: + gpu_available = gpu_count() + if len(gpu_available) < 1: + _echo(f"Quantization requires at least 1 GPU (got {len(gpu_available)})", fg="red") + ctx.exit(1) + if framework_envvar != "pt": + _echo("Quantization is currently only available for PyTorch models.", fg="red") + ctx.exit(1) + + # We need to handle None separately here, as env from subprocess doesn't + # accept None value. 
+ env = ModelEnv(env.model_name, bettertransformer=bettertransformer, quantize=quantize) + + requirements = config["requirements"] + if requirements is not None and len(requirements) > 0: + _echo( + f"Make sure to have the following dependencies available: {requirements}", + fg="yellow", + ) + + workers_per_resource = first_not_none(workers_per_resource, default=config["workers_per_resource"]) + server_timeout = first_not_none(server_timeout, default=config["timeout"]) + + num_workers = int(1 / workers_per_resource) + if num_workers > 1: + _echo( + f"Running '{model_name}' requires at least {num_workers} GPUs/CPUs available per worker." + " Make sure that it has available resources for inference.", + fg="yellow", + ) + + server_attrs.update({"working_dir": os.path.dirname(__file__)}) + if _serve_grpc: + server_attrs["grpc_protocol_version"] = "v1" + # NOTE: currently, theres no development args in bentoml.Server. To be fixed upstream. + development = server_attrs.pop("development") + server_attrs.setdefault("production", not development) + + # NOTE: This is to set current configuration + start_env = os.environ.copy() + _bentoml_config_options_env = start_env.pop("BENTOML_CONFIG_OPTIONS", "") + _bentoml_config_options_opts = [ + "tracing.sample_rate=1.0", + f"api_server.traffic.timeout={server_timeout}", + f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}', + f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}', + ] + if device: + if len(device) > 1: + for idx, dev in enumerate(device): + _bentoml_config_options_opts.append( + f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' + ) + else: + _bentoml_config_options_opts.append( + f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]' + ) + + _bentoml_config_options_env += ( + " " if _bentoml_config_options_env else "" + " ".join(_bentoml_config_options_opts) + ) + + 
automodel_attrs = { + "model_id": model_id, + "llm_config": config, + "ensure_available": not fast, + } + + if framework_envvar == "pt": + automodel_attrs.update({"quantize": quantize, "bettertransformer": bettertransformer}) + + llm = t.cast( + "_BaseAutoLLMClass", + openllm[framework_envvar], # type: ignore (internal API) + ).for_model(model_name, **automodel_attrs) + + start_env.update( + { + env.framework: env.framework_value, + env.config: llm.config.model_dump_json().decode(), + "OPENLLM_MODEL": model_name, + "OPENLLM_MODEL_ID": llm.model_id, + "BENTOML_DEBUG": str(get_debug_mode()), + "BENTOML_CONFIG_OPTIONS": _bentoml_config_options_env, + "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()), + } + ) + + if env.bettertransformer_value is not None: + start_env[env.bettertransformer] = env.bettertransformer_value + if env.quantize_value is not None: + start_env[env.quantize] = env.quantize_value + + if t.TYPE_CHECKING: + server_cls: type[bentoml.HTTPServer] if not _serve_grpc else type[bentoml.GrpcServer] + + server_cls = getattr(bentoml, "HTTPServer" if not _serve_grpc else "GrpcServer") + server_attrs["timeout"] = server_timeout + server = server_cls("_service.py:svc", **server_attrs) + + try: + analytics.track_start_init(llm.config) + server.start(env=start_env, text=True, blocking=True) + except Exception as err: + _echo(f"Error caught while starting LLM Server:\n{err}", fg="red") + raise else: - parsed += tuple(v.split()) - return tuple(filter(lambda x: x, parsed)) + if not get_debug_mode(): + _echo( + f"\nšŸš€ Next step: run 'openllm build {model_name}' to create a Bento for {model_name}", + fg="blue", + ) + + # NOTE: Return the configuration for telemetry purposes. 
+ return llm_config + + return model_start + + +_cached_http = {key: start_model_command(key, _context_settings=_CONTEXT_SETTINGS) for key in openllm.CONFIG_MAPPING} +_cached_grpc = { + key: start_model_command(key, _context_settings=_CONTEXT_SETTINGS, _serve_grpc=True) + for key in openllm.CONFIG_MAPPING +} def _start( @@ -612,113 +731,32 @@ def _start( """Python API to start a LLM server.""" _serve_grpc = attrs.pop("_serve_grpc", False) - _ModelEnv = openllm.utils.ModelEnv(model_name) + _ModelEnv = ModelEnv(model_name) if framework is not None: os.environ[_ModelEnv.framework] = framework - start_model_command(model_name, t.cast(OpenLLMCommandGroup, cli), _serve_grpc=_serve_grpc)( - standalone_mode=False, **attrs - ) + start_model_command(model_name, _serve_grpc=_serve_grpc)(standalone_mode=False, **attrs) start = functools.partial(_start, _serve_grpc=False) start_grpc = functools.partial(_start, _serve_grpc=True) -output_option = click.option( - "-o", - "--output", - type=click.Choice(["json", "pretty", "porcelain"]), - default="pretty", - help="Showing output type.", - show_default=True, - envvar="OPENLLM_OUTPUT", - show_envvar=True, -) - - -def model_id_option(factory: t.Any, model_env: openllm.utils.ModelEnv | None = None): - envvar = None - if model_env is not None: - envvar = model_env.model_id - return factory.option( - "--model-id", - type=click.STRING, - default=None, - help="Optional model_id name or path for (fine-tune) weight.", - envvar=envvar, - show_envvar=True if envvar is not None else False, - ) - - -def workers_per_resource_option(factory: t.Any, build: bool = False): - help_str = """Number of workers per resource assigned. - See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy - for more information. By default, this is set to 1.""" - if build: - help_str += """\n - NOTE: The workers value passed into 'build' will determine how the LLM can - be provisioned in Kubernetes as well as in standalone container. 
This will - ensure it has the same effect with 'openllm start --workers ...'""" - return factory.option( - "--workers-per-resource", - default=None, - type=click.FLOAT, - help=help_str, - required=False, - ) - - -@click.group(cls=OpenLLMCommandGroup, context_settings=_CONTEXT_SETTINGS, name="openllm") -@click.version_option(__version__, "--version", "-v") -def cli(): - """ - \b - ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•—ā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā•—ā–ˆā–ˆā•— ā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā•— - ā–ˆā–ˆā•”ā•ā•ā•ā–ˆā–ˆā•—ā–ˆā–ˆā•”ā•ā•ā–ˆā–ˆā•—ā–ˆā–ˆā•”ā•ā•ā•ā•ā•ā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā•‘ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ ā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā–ˆā•‘ - ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•”ā•ā–ˆā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā•”ā–ˆā–ˆā•— ā–ˆā–ˆā•‘ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ ā–ˆā–ˆā•”ā–ˆā–ˆā–ˆā–ˆā•”ā–ˆā–ˆā•‘ - ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ā–ˆā–ˆā•”ā•ā•ā•ā• ā–ˆā–ˆā•”ā•ā•ā• ā–ˆā–ˆā•‘ā•šā–ˆā–ˆā•—ā–ˆā–ˆā•‘ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ā•šā–ˆā–ˆā•”ā•ā–ˆā–ˆā•‘ - ā•šā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•”ā•ā–ˆā–ˆā•‘ ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•—ā–ˆā–ˆā•‘ ā•šā–ˆā–ˆā–ˆā–ˆā•‘ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•—ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•—ā–ˆā–ˆā•‘ ā•šā•ā• ā–ˆā–ˆā•‘ - ā•šā•ā•ā•ā•ā•ā• ā•šā•ā• ā•šā•ā•ā•ā•ā•ā•ā•ā•šā•ā• ā•šā•ā•ā•ā•ā•šā•ā•ā•ā•ā•ā•ā•ā•šā•ā•ā•ā•ā•ā•ā•ā•šā•ā• ā•šā•ā• - - \b - An open platform for operating large language models in production. - Fine-tune, serve, deploy, and monitor any LLMs with ease. - """ - - -@cli.group(cls=OpenLLMCommandGroup, context_settings=_CONTEXT_SETTINGS, name="start") -def start_cli(): - """ - Start any LLM as a REST server. - - $ openllm start -- ... - """ - - -@cli.group(cls=OpenLLMCommandGroup, context_settings=_CONTEXT_SETTINGS, name="start-grpc") -def start_grpc_cli(): - """ - Start any LLM as a gRPC server. - - $ openllm start-grpc -- ... 
- """ - - @cli.command() @click.argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()])) @model_id_option(click) @output_option @click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.") @workers_per_resource_option(click, build=True) -@quantize_option(click) -@bettertransformer_option(click) +@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name="Optimisation options.") +@quantize_option(cog.optgroup, build=True) +@bettertransformer_option(cog.optgroup) def build( model_name: str, model_id: str | None, overwrite: bool, output: OutputLiteral, - quantize: t.Literal["8bit", "4bit", "gptq"] | None, + quantize: t.Literal["int8", "int4", "gptq"] | None, bettertransformer: bool | None, workers_per_resource: float | None, ): @@ -731,8 +769,8 @@ def build( to have https://github.com/NVIDIA/nvidia-container-toolkit install locally. """ if output == "porcelain": - openllm.utils.set_quiet_mode(True) - openllm.utils.configure_server_logging() + set_quiet_mode(True) + configure_server_logging() if output == "pretty": if overwrite: @@ -749,7 +787,7 @@ def build( ) if output == "pretty": - if not openllm.utils.get_quiet_mode(): + if not get_quiet_mode(): _echo("\n" + OPENLLM_FIGLET, fg="white") if not _previously_built: _echo(f"Successfully built {bento}.", fg="green") @@ -827,7 +865,7 @@ def models(output: OutputLiteral, show_available: bool): "installation": "pip install openllm" if m not in extras else f'pip install "openllm[{m}]"', } converted.extend([convert_transformers_model_name(i) for i in config["model_ids"]]) - if openllm.utils.DEBUG: + if DEBUG: try: openllm.AutoLLM.for_model(m, llm_config=config) except Exception as err: @@ -835,7 +873,8 @@ def models(output: OutputLiteral, show_available: bool): ids_in_local_store = None if show_available: - ids_in_local_store = [i for i in bentoml.models.list() if any(n in i.tag.name for n in converted)] + 
ids_in_local_store = {k: [i for i in bentoml.models.list() if k in i.tag.name] for k in json_data.keys()} + ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v} if output == "pretty": import tabulate @@ -888,7 +927,7 @@ def models(output: OutputLiteral, show_available: bool): ) _echo(formatted_table, fg="white") - if openllm.utils.DEBUG and len(failed_initialized) > 0: + if DEBUG and len(failed_initialized) > 0: _echo("\nThe following models are supported but failed to initialize:\n") for m, err in failed_initialized: _echo(f"- {m}: ", fg="blue", nl=False) @@ -896,14 +935,28 @@ def models(output: OutputLiteral, show_available: bool): if show_available: assert ids_in_local_store - _echo("The following models are available in local store:\n", fg="white") - for i in ids_in_local_store: - _echo(f"- {i}", fg="white") + + _available = [[k + "\n\n" * len(v), [str(i.tag) for i in v]] for k, v in ids_in_local_store.items()] + column_widths = [int(COLUMNS / 6), int(COLUMNS / 2)] + table = tabulate.tabulate( + _available, + tablefmt="fancy_grid", + headers=["Model Id", "Models"], + maxcolwidths=column_widths, + ) + _echo("The following models are available in local store:\n", fg="magenta") + + formatted_table = "" + for line in table.split("\n"): + formatted_table += ( + "".join(f"{cell:{width}}" for cell, width in zip(line.split("\t"), column_widths)) + "\n" + ) + _echo(formatted_table, fg="white") else: dumped: dict[str, t.Any] = json_data if show_available: assert ids_in_local_store - dumped["local"] = [openllm.utils.bentoml_cattr.unstructure(i.tag) for i in ids_in_local_store] + dumped["local"] = [bentoml_cattr.unstructure(i.tag) for m in ids_in_local_store.values() for i in m] _echo( orjson.dumps( dumped, @@ -977,12 +1030,10 @@ def query_( else openllm.client.GrpcClient(endpoint, timeout=timeout) ) - if client.framework == "flax": - model = openllm.AutoFlaxLLM.for_model(client.model_name) - elif client.framework == "tf": - model = 
openllm.AutoTFLLM.for_model(client.model_name) - else: - model = openllm.AutoLLM.for_model(client.model_name) + model = t.cast( + "_BaseAutoLLMClass", + openllm[client.framework], # type: ignore (internal API) + ).for_model(client.model_name) if output != "porcelain": _echo(f"Processing query: {query}\n", fg="white") @@ -1012,17 +1063,15 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral Note: This is useful for development and setup for fine-tune. """ if output == "porcelain": - openllm.utils.set_quiet_mode(True) - openllm.utils.configure_logging() + set_quiet_mode(True) + configure_logging() config = openllm.AutoConfig.for_model(model_name) - envvar = config["env"].get_framework_env() - if envvar == "flax": - model = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config) - elif envvar == "tf": - model = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config) - else: - model = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config) + envvar = config["env"]["framework_value"] + model = t.cast( + "_BaseAutoLLMClass", + openllm[envvar], # type: ignore (internal API) + ).for_model(model_name, model_id=model_id, llm_config=config) try: _ref = bentoml.transformers.get(model.tag) @@ -1036,7 +1085,7 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral fg="white", ) else: - if openllm.utils.DEBUG: + if DEBUG or get_debug_mode(): # NOTE: When debug is enabled, # We will prefix the tag with __tag__ and we can use regex to correctly # get the tag from 'bentoml.bentos.build|build_bentofile' @@ -1052,13 +1101,14 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral nl=True, ) + (model_args, model_attrs), tokenizer_attrs = model.llm_parameters _ref = model.import_model( model.model_id, model.tag, - *model._model_args, - tokenizer_kwds=model._tokenizer_attrs, + *model_args, + tokenizer_kwds=tokenizer_attrs, 
trust_remote_code=model.__llm_trust_remote_code__, - **model._model_attrs, + **model_attrs, ) if output == "pretty": _echo(f"Saved model: {_ref.tag}") @@ -1070,7 +1120,7 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral ).decode() ) else: - if openllm.utils.DEBUG: + if DEBUG or get_debug_mode(): # NOTE: When debug is enabled, # We will prefix the tag with __tag__ and we can use regex to correctly # get the tag from 'bentoml.bentos.build|build_bentofile' @@ -1078,7 +1128,7 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral else: _echo(_ref.tag) finally: - if openllm.utils.is_torch_available() and torch.cuda.is_available(): + if is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache() return _ref diff --git a/src/openllm/models/auto/factory.py b/src/openllm/models/auto/factory.py index 231d34d0..638ba67c 100644 --- a/src/openllm/models/auto/factory.py +++ b/src/openllm/models/auto/factory.py @@ -94,22 +94,21 @@ class _BaseAutoLLMClass: >>> llm = openllm.AutoLLM.for_model("flan-t5") ``` """ - runner_kwargs_name = [ + # order matters here + runner_kwargs_name = { "models", "max_batch_size", "max_latency_ms", "method_configs", - "embedded", "scheduling_strategy", - ] + } to_runner_attrs = {k: v for k, v in attrs.items() if k in runner_kwargs_name} - for k in to_runner_attrs: - del attrs[k] - normalized = inflection.underscore(model_name) - if cls._model_mapping.get(normalized, None, mapping_type="name2model"): + attrs = {k: v for k, v in attrs.items() if k not in to_runner_attrs} + if cls._model_mapping.get(inflection.underscore(model_name), None, mapping_type="name2model"): if not isinstance(llm_config, openllm.LLMConfig): # The rest of kwargs is now passed to config - llm_config = AutoConfig.for_model(normalized, **attrs) + llm_config = AutoConfig.for_model(model_name, **attrs) + attrs = llm_config.__openllm_extras__ # the rest of attrs will be saved to __openllm_extras__ llm = 
cls._model_mapping[type(llm_config)].from_pretrained( model_id, diff --git a/src/openllm/models/chatglm/modeling_chatglm.py b/src/openllm/models/chatglm/modeling_chatglm.py index 2acd9b11..1e947ad5 100644 --- a/src/openllm/models/chatglm/modeling_chatglm.py +++ b/src/openllm/models/chatglm/modeling_chatglm.py @@ -104,25 +104,25 @@ class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrain chat_history.append((prompt, generation_result)) return "".join(generation_result) - @torch.inference_mode() - def generate(self, prompt: str, use_default_prompt_template: bool = True, **attrs: t.Any) -> str: - self.model.eval() + def generate(self, prompt: str, **attrs: t.Any) -> str: + with torch.inference_mode(): + self.model.eval() - # Only use half precision if the model is not yet quantized - if self.config.use_half_precision: - self.model.half() + # Only use half precision if the model is not yet quantized + if self.config.use_half_precision: + self.model.half() - self.model.cuda() + self.model.cuda() - logit_processor: list[LogitsProcessor] = LogitsProcessorList() - logit_processor.append(InvalidScoreLogitsProcessor()) + logit_processor: list[LogitsProcessor] = LogitsProcessorList() + logit_processor.append(InvalidScoreLogitsProcessor()) - inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device) - outputs = self.model.generate( - **inputs, - generation_config=self.config.model_construct_env(do_sample=True, **attrs).to_generation_config(), - logits_processor=logit_processor, - ) - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :] - response = self.tokenizer.decode(outputs) - return self.model.process_response(response) + inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device) + outputs = self.model.generate( + **inputs, + generation_config=self.config.model_construct_env(do_sample=True, **attrs).to_generation_config(), + logits_processor=logit_processor, + ) + outputs = 
outputs.tolist()[0][len(inputs["input_ids"][0]) :] + response = self.tokenizer.decode(outputs) + return self.model.process_response(response) diff --git a/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/src/openllm/models/dolly_v2/modeling_dolly_v2.py index 3175d692..60a44cf1 100644 --- a/src/openllm/models/dolly_v2/modeling_dolly_v2.py +++ b/src/openllm/models/dolly_v2/modeling_dolly_v2.py @@ -98,19 +98,19 @@ class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedToken ) -> str: return generation_result[0]["generated_text"] - @torch.inference_mode() def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]: - self.model.tokenizer = self.tokenizer - llm_config = self.config.model_construct_env(**attrs) - decoded: list[dict[t.Literal["generated_text"], str]] = self.model( - prompt, generation_config=llm_config.to_generation_config() - ) + with torch.inference_mode(): + self.model.tokenizer = self.tokenizer + llm_config = self.config.model_construct_env(**attrs) + decoded: list[dict[t.Literal["generated_text"], str]] = self.model( + prompt, generation_config=llm_config.to_generation_config() + ) - if llm_config.return_full_text: - return [ - {k: f"{DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt)}\n{generated}"} - for i in decoded - for k, generated in i.items() - ] + if llm_config.return_full_text: + return [ + {k: f"{DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt)}\n{generated}"} + for i in decoded + for k, generated in i.items() + ] - return decoded + return decoded diff --git a/src/openllm/models/flan_t5/modeling_flan_t5.py b/src/openllm/models/flan_t5/modeling_flan_t5.py index d2162e33..92edae8a 100644 --- a/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -74,14 +74,14 @@ class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformer def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) 
-> str: return generation_result[0] - @torch.inference_mode() def generate(self, prompt: str, **attrs: t.Any) -> list[str]: - if torch.cuda.is_available(): - self.model.cuda() - input_ids = t.cast("torch.Tensor", self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device) - result_tensor = self.model.generate( - input_ids, - do_sample=True, - generation_config=self.config.model_construct_env(**attrs).to_generation_config(), - ) - return self.tokenizer.batch_decode(result_tensor, skip_special_tokens=True) + with torch.inference_mode(): + if torch.cuda.is_available(): + self.model.cuda() + input_ids = t.cast("torch.Tensor", self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device) + result_tensor = self.model.generate( + input_ids, + do_sample=True, + generation_config=self.config.model_construct_env(**attrs).to_generation_config(), + ) + return self.tokenizer.batch_decode(result_tensor, skip_special_tokens=True) diff --git a/src/openllm/models/opt/modeling_opt.py b/src/openllm/models/opt/modeling_opt.py index 36a63df6..f8b50649 100644 --- a/src/openllm/models/opt/modeling_opt.py +++ b/src/openllm/models/opt/modeling_opt.py @@ -129,15 +129,15 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer else: return "\n".join(generation_result) - @torch.inference_mode() def generate(self, prompt: str, **attrs: t.Any) -> list[str]: - if torch.cuda.is_available() and torch.cuda.device_count() == 1: - self.model.cuda() + with torch.inference_mode(): + if torch.cuda.is_available() and torch.cuda.device_count() == 1: + self.model.cuda() - input_ids = t.cast(torch.Tensor, self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device) - generated_tensors = self.model.generate( - input_ids, - do_sample=True, - generation_config=self.config.model_construct_env(**attrs).to_generation_config(), - ) - return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True) + input_ids = t.cast(torch.Tensor, 
self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device) + generated_tensors = self.model.generate( + input_ids, + do_sample=True, + generation_config=self.config.model_construct_env(**attrs).to_generation_config(), + ) + return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True) diff --git a/src/openllm/models/starcoder/modeling_starcoder.py b/src/openllm/models/starcoder/modeling_starcoder.py index 5c7e459a..e92e6c93 100644 --- a/src/openllm/models/starcoder/modeling_starcoder.py +++ b/src/openllm/models/starcoder/modeling_starcoder.py @@ -120,18 +120,20 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers. def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] - @torch.inference_mode() def generate(self, prompt: str, **attrs: t.Any) -> list[str]: - inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device) - result_tensor = self.model.generate( - inputs, - do_sample=True, - pad_token_id=self.tokenizer.eos_token_id, - # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder - generation_config=self.config.model_construct_env(**attrs).to_generation_config(), - ) - # TODO: We will probably want to return the tokenizer here so that we can manually process this - # return (skip_special_tokens=False, clean_up_tokenization_spaces=False)) - return self.tokenizer.batch_decode( - result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True - ) + with torch.inference_mode(): + inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device) + result_tensor = self.model.generate( + inputs, + do_sample=True, + pad_token_id=self.tokenizer.eos_token_id, + # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder + 
generation_config=self.config.model_construct_env(**attrs).to_generation_config(), + ) + # TODO: We will probably want to return the tokenizer here so that we can manually process this + # return (skip_special_tokens=False, clean_up_tokenization_spaces=False)) + return self.tokenizer.batch_decode( + result_tensor[0], + skip_special_tokens=True, + clean_up_tokenization_spaces=True, + ) diff --git a/src/openllm/utils/__init__.py b/src/openllm/utils/__init__.py index cde27c03..0c6a7169 100644 --- a/src/openllm/utils/__init__.py +++ b/src/openllm/utils/__init__.py @@ -35,6 +35,11 @@ from bentoml._internal.utils import (LazyLoader, bentoml_cattr, from .lazy import LazyModule +# NOTE: This set contains the names of modules +# that are available above and are whitelisted +# to be included in the extra_objects map. +_whitelist_modules = {"pkg"} + logger = logging.getLogger(__name__) try: @@ -86,7 +91,9 @@ DEBUG = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.envi # XXX: define all classes, functions import above this line # since _extras will be the locals() import from this file. _extras: dict[str, t.Any] = { - k: v for k, v in locals().items() if not isinstance(v, types.ModuleType) and not k.startswith("_") + k: v + for k, v in locals().items() + if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith("_")) } _import_structure = { diff --git a/src/openllm/utils/dantic.py b/src/openllm/utils/dantic.py index ac598bb8..6fe409aa 100644 --- a/src/openllm/utils/dantic.py +++ b/src/openllm/utils/dantic.py @@ -97,7 +97,7 @@ def attrs_to_options( ) -def _default_converter(value: t.Any, env: str | None) -> t.Any: +def env_converter(value: t.Any, env: str | None = None) -> t.Any: if env is not None: value = os.environ.get(env, value) if value is not None and isinstance(value, str): @@ -135,7 +135,8 @@ def Field( on kw_only. If kw_only=True, the this field will become 'Required' and the default value is omitted.
If kw_only=False, then the default value will be used as before. use_default_converter: a bool indicating whether to use the default converter. Defaults - to True. If set to False, then the default converter will not be used. + to True. If set to False, then the default converter will not be used. The default + converter replaces the given value with this Field's environment variable when it is set. **kwargs: The rest of the arguments are passed to attr.field """ metadata = attrs.pop("metadata", {}) @@ -148,7 +149,7 @@ def Field( converter = attrs.pop("converter", None) if use_default_converter: - converter = functools.partial(_default_converter, env=env) + converter = functools.partial(env_converter, env=env) if ge is not None: piped.append(attr.validators.ge(ge)) diff --git a/src/openllm/utils/import_utils.py b/src/openllm/utils/import_utils.py index 324710b0..4d092cf8 100644 --- a/src/openllm/utils/import_utils.py +++ b/src/openllm/utils/import_utils.py @@ -15,6 +15,8 @@ """ Some imports utils are vendorred from transformers/utils/import_utils.py for performance reasons.
""" +from __future__ import annotations + import importlib import importlib.metadata import importlib.util @@ -24,7 +26,6 @@ import typing as t from abc import ABCMeta from collections import OrderedDict -import attr import inflection from bentoml._internal.utils import LazyLoader from packaging import version @@ -236,31 +237,73 @@ def require_backends(o: t.Any, backends: t.MutableSequence[str]): raise ImportError("".join(failed)) -@attr.define class ModelEnv: - model_name: str = attr.field(converter=inflection.underscore) + model_name: str - @property - def framework(self) -> str: - return f"OPENLLM_{self.model_name.upper()}_FRAMEWORK" + if t.TYPE_CHECKING: + config: property + model_id: property + quantize: property + framework: property + bettertransformer: property - @property - def model_config(self) -> str: - return f"OPENLLM_{self.model_name.upper()}_CONFIG" + framework_value: property + quantize_value: property + bettertransformer_value: property - @property - def model_id(self) -> str: - return f"OPENLLM_{self.model_name.upper()}_MODEL_ID" + def __getitem__(self, item: str | t.Any) -> t.Any: + if hasattr(self, item): + return getattr(self, item) + raise KeyError(f"Key {item} not found in {self}") - @property - def bettertransformer(self) -> str: - return f"OPENLLM_{self.model_name.upper()}_BETTERTRANSFORMER" + def __new__(cls, model_name: str, bettertransformer: bool | None = None, quantize: t.LiteralString | None = None): + from .._configuration import _field_env_key + from . 
import codegen - def gen_env_key(self, key: str) -> str: - return f"OPENLLM_{self.model_name.upper()}_{key.upper()}" + model_name = inflection.underscore(model_name) - def convert_to_bettertransformer(self) -> bool: - return os.environ.get(self.bettertransformer, str(False)).lower() == "true" + res = super().__new__(cls) + res.model_name = model_name + + # gen properties env key + attributes = {"config", "model_id", "quantize", "framework", "bettertransformer"} + for att in attributes: + setattr(res, att, _field_env_key(model_name, att.upper())) + + # gen properties env value + attributes_with_values = { + "quantize": (bool, quantize), + "bettertransformer": (bool, bettertransformer), + "framework": (str, "pt"), + } + globs: dict[str, t.Any] = { + "__bool_vars_value": ENV_VARS_TRUE_VALUES, + "__env_get": os.environ.get, + "self": res, + } + + for attribute, (default_type, default_value) in attributes_with_values.items(): + lines: list[str] = [] + if default_type is bool: + lines.append( + f"return str(__env_get(self['{attribute}'], str(__env_default)).upper() in __bool_vars_value)" + ) + else: + lines.append(f"return __env_get(self['{attribute}'], __env_default)") + + setattr( + res, + f"{attribute}_value", + codegen.generate_function( + cls, + "_env_get_" + attribute, + lines, + ("__env_default",), + globs, + )(default_value), + ) + + return res @property def start_docstring(self) -> str: @@ -269,9 +312,3 @@ class ModelEnv: @property def module(self) -> LazyLoader: return LazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}") - - def get_framework_env(self) -> t.Literal["pt", "flax", "tf"]: - envvar = os.environ.get(self.framework, "pt") - if envvar not in ("pt", "tf", "flax"): - raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'") - return envvar diff --git a/src/openllm/utils/lazy.py b/src/openllm/utils/lazy.py index 32147f59..149e5634 100644 --- a/src/openllm/utils/lazy.py +++ 
b/src/openllm/utils/lazy.py @@ -21,6 +21,19 @@ import os import types import typing as t +from ..exceptions import ForbiddenAttributeError, OpenLLMException + + +class UsageNotAllowedError(OpenLLMException): + """Raised when LazyModule.__getitem__ is forbidden.""" + + +class MissingAttributesError(OpenLLMException): + """Raised when the given key is not available in the LazyModule special mapping.""" + + +_reserved_namespace = {"__openllm_special__"} + class LazyModule(types.ModuleType): """ @@ -49,9 +62,7 @@ class LazyModule(types.ModuleType): for value in values: self._class_to_module[value] = key # Needed for autocompletion in an IDE - self.__all__ = ( - list(import_structure.keys()) + list(itertools.chain(*import_structure.values())) + list(_extra_objects) - ) + self.__all__ = list(import_structure.keys()) + list(itertools.chain(*import_structure.values())) self.__file__ = module_file self.__spec__ = module_spec self.__path__ = [os.path.dirname(module_file)] @@ -71,13 +82,30 @@ class LazyModule(types.ModuleType): result.append(attribute) return result + def __getitem__(self, key: str) -> t.Any: + if self._objects.get("__openllm_special__") is None: + raise UsageNotAllowedError(f"'{self._name}' is not allowed to be used as a dict.") + _special_mapping = self._objects.get("__openllm_special__", {}) + try: + if key in _special_mapping: + return getattr(self, _special_mapping.__getitem__(key)) + raise MissingAttributesError(f"Requested '{key}' is not available in given mapping.") + except AttributeError as e: + raise KeyError(f"'{self._name}' has no attribute {_special_mapping[key]}") from e + except Exception as e: + raise KeyError(f"Failed to lookup '{key}' in '{self._name}'") from e + def __getattr__(self, name: str) -> t.Any: + if name in _reserved_namespace: + raise ForbiddenAttributeError( + f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified."
+ ) if name in self._objects: - return self._objects[name] + return self._objects.__getitem__(name) if name in self._modules: value = self._get_module(name) elif name in self._class_to_module.keys(): - module = self._get_module(self._class_to_module[name]) + module = self._get_module(self._class_to_module.__getitem__(name)) value = getattr(module, name) else: raise AttributeError(f"module {self.__name__} has no attribute {name}") diff --git a/src/openllm_client/_prompt.py b/src/openllm_client/_prompt.py index 033ffbb0..9aba658a 100644 --- a/src/openllm_client/_prompt.py +++ b/src/openllm_client/_prompt.py @@ -13,9 +13,10 @@ # limitations under the License. from __future__ import annotations -import dataclasses import typing as t +import attr + import openllm from openllm._prompt import PromptFormatter @@ -34,13 +35,11 @@ class PartialDict(DictStrStr): return "{" + key + "}" -@dataclasses.dataclass(slots=True) +@attr.define(slots=True) class PromptTemplate: template: str input_variables: t.Sequence[str] - model_config = {"extra": "forbid"} - def to_str(self, __partial_dict__: PartialDict | None = None, **attrs: str) -> str: """Generate a prompt from the template and input variables""" if __partial_dict__: diff --git a/src/openllm_client/runtimes/base.py b/src/openllm_client/runtimes/base.py index fe019dac..9a316eb9 100644 --- a/src/openllm_client/runtimes/base.py +++ b/src/openllm_client/runtimes/base.py @@ -25,6 +25,7 @@ import httpx import openllm if t.TYPE_CHECKING: + from openllm.models.auto.factory import _BaseAutoLLMClass class AnnotatedClient(bentoml.client.Client): def health(self, *args: t.Any, **attrs: t.Any) -> t.Any: @@ -107,12 +108,10 @@ class ClientMixin: @property def llm(self) -> openllm.LLM[t.Any, t.Any]: if self.__llm__ is None: - if self.framework == "flax": - self.__llm__ = openllm.AutoFlaxLLM.for_model(self.model_name) - elif self.framework == "tf": - self.__llm__ = openllm.AutoTFLLM.for_model(self.model_name) - else: - self.__llm__ = 
openllm.AutoLLM.for_model(self.model_name) + self.__llm__ = t.cast( + "_BaseAutoLLMClass", + openllm[self.framework], # type: ignore (internal API) + ).for_model(self.model_name) return self.__llm__ @property diff --git a/tests/test_configuration.py b/tests/test_configuration.py index c18c6f54..d65e2911 100644 --- a/tests/test_configuration.py +++ b/tests/test_configuration.py @@ -33,10 +33,10 @@ logger = logging.getLogger(__name__) def test_missing_default(): - with pytest.raises(ValueError, match="The following keys are required*"): + with pytest.raises(ValueError, match="Either 'default_id' or 'model_ids'*"): make_llm_config("MissingDefaultId", {"name_type": "lowercase", "requirements": ["bentoml"]}) - with pytest.raises(ValueError, match="The following keys are required*"): + with pytest.raises(ValueError, match="Either 'default_id' or 'model_ids'*"): make_llm_config("MissingModelId", {"default_id": "huggingface/t5-tiny-testing", "requirements": ["bentoml"]})