From 3e45530abde781618758d049de1d4d4ad6eb8369 Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Fri, 1 Sep 2023 05:15:19 -0400 Subject: [PATCH] refactor(breaking): unify LLM API (#283) Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --- .github/SECURITY.md | 3 +- CHANGELOG.md | 7 +- README.md | 11 +- changelog.d/283.breaking.md | 20 + hatch.toml | 1 - openllm-client/src/openllm_client/_base.py | 8 +- openllm-client/src/openllm_client/py.typed | 0 .../side_bar/model_selection/db.cljs | 4 +- .../src/openllm_core/_configuration.py | 109 ++- openllm-core/src/openllm_core/_schema.py | 4 +- openllm-core/src/openllm_core/_strategies.py | 2 +- .../src/openllm_core/_typing_compat.py | 42 +- .../config/configuration_dolly_v2.py | 8 +- .../config/configuration_flan_t5.py | 6 +- .../config/configuration_llama.py | 9 +- .../openllm_core/config/configuration_opt.py | 6 +- openllm-core/src/openllm_core/py.typed | 0 .../src/openllm_core/utils/__init__.py | 24 +- .../src/openllm_core/utils/analytics.py | 3 +- .../src/openllm_core/utils/codegen.py | 11 +- .../src/openllm_core/utils/import_utils.py | 58 +- openllm-python/src/openllm/__init__.py | 11 +- openllm-python/src/openllm/_assign.py | 201 ++++++ openllm-python/src/openllm/_embeddings.py | 6 +- openllm-python/src/openllm/_llm.py | 665 +++++------------- openllm-python/src/openllm/_service.py | 4 +- openllm-python/src/openllm/bundle/_package.py | 42 +- .../src/openllm/bundle/oci/__init__.py | 2 +- openllm-python/src/openllm/cli/_factory.py | 166 ++--- openllm-python/src/openllm/cli/_sdk.py | 179 ++--- openllm-python/src/openllm/cli/entrypoint.py | 101 +-- .../models/chatglm/modeling_chatglm.py | 4 +- .../models/flan_t5/modeling_flan_t5.py | 4 +- .../openllm/models/llama/modeling_llama.py | 6 +- .../src/openllm/models/mpt/modeling_mpt.py | 8 +- .../models/stablelm/modeling_stablelm.py | 4 - 
.../src/openllm/serialisation/__init__.py | 44 +- .../src/openllm/serialisation/ggml.py | 31 +- .../serialisation/transformers/__init__.py | 100 +-- .../serialisation/transformers/_helpers.py | 6 +- .../serialisation/transformers/weights.py | 6 +- openllm-python/src/openllm/testing.py | 7 +- openllm-python/src/openllm/utils/__init__.py | 17 +- .../tests/_strategies/_configuration.py | 4 +- openllm-python/tests/configuration_test.py | 9 +- openllm-python/tests/conftest.py | 28 +- openllm-python/tests/package_test.py | 12 +- pyproject.toml | 8 +- tools/update-config-stubs.py | 54 +- tools/update-dummy.py | 48 +- 50 files changed, 881 insertions(+), 1232 deletions(-) create mode 100644 changelog.d/283.breaking.md create mode 100644 openllm-client/src/openllm_client/py.typed create mode 100644 openllm-core/src/openllm_core/py.typed create mode 100644 openllm-python/src/openllm/_assign.py diff --git a/.github/SECURITY.md b/.github/SECURITY.md index 9585a107..a0baf626 100644 --- a/.github/SECURITY.md +++ b/.github/SECURITY.md @@ -8,8 +8,7 @@ are backward compatible. We are more lenient with patch as the development can move quickly. If you are just using public API, then feel free to always upgrade. Whenever -there is a breaking policies, it will become a `DeprecationWarning` with a -period of 12 months before becoming broken. +there is a breaking policies, it will be announced and will be broken. > [!WARNING] > Everything package under `openllm` that has an underscore prefixes diff --git a/CHANGELOG.md b/CHANGELOG.md index 88e6804f..15d7e2f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -230,7 +230,7 @@ No significant changes. 
```bash docker run --rm --gpus all -it -v /home/ubuntu/.local/share/bentoml:/tmp/bentoml -e BENTOML_HOME=/tmp/bentoml \ - -e OPENLLM_USE_LOCAL_LATEST=True -e OPENLLM_LLAMA_FRAMEWORK=vllm ghcr.io/bentoml/openllm:2b5e96f90ad314f54e07b5b31e386e7d688d9bb2 start llama --model-id meta-llama/Llama-2-7b-chat-hf --workers-per-resource conserved --debug` + -e OPENLLM_USE_LOCAL_LATEST=True -e OPENLLM_BACKEND=vllm ghcr.io/bentoml/openllm:2b5e96f90ad314f54e07b5b31e386e7d688d9bb2 start llama --model-id meta-llama/Llama-2-7b-chat-hf --workers-per-resource conserved --debug` ``` In conjunction with this, OpenLLM now also have a set of small CLI utilities via ``openllm ext`` for ease-of-use @@ -721,9 +721,6 @@ No significant changes. `openllm start` now support `--quantize int8` and `--quantize int4` `GPTQ` quantization support is on the roadmap and currently being worked on. - `openllm start` now also support `--bettertransformer` to use - `BetterTransformer` for serving. - Refactored `openllm.LLMConfig` to be able to use with `__getitem__`: `openllm.DollyV2Config()['requirements']`. @@ -732,8 +729,6 @@ No significant changes. Added `towncrier` workflow to easily generate changelog entries - Added `use_pipeline`, `bettertransformer` flag into ModelSettings - `LLMConfig` now supported `__dataclass_transform__` protocol to help with type-checking diff --git a/README.md b/README.md index 44dd5cc5..8ba82b92 100644 --- a/README.md +++ b/README.md @@ -407,17 +407,19 @@ pip install "openllm[baichuan]" ### Runtime Implementations (Experimental) Different LLMs may have multiple runtime implementations. For instance, they -might use Pytorch (`pt`), Tensorflow (`tf`), or Flax (`flax`). +might use Pytorch (`pt`), Tensorflow (`tf`), Flax (`flax`) or vLLM (`vllm`). 
If you wish to specify a particular runtime for a model, you can do so by -setting the `OPENLLM_{MODEL_NAME}_FRAMEWORK={runtime}` environment variable +setting the `OPENLLM_BACKEND={runtime}` environment variable before running `openllm start`. For example, if you want to use the Tensorflow (`tf`) implementation for the `flan-t5` model, you can use the following command: ```bash -OPENLLM_FLAN_T5_FRAMEWORK=tf openllm start flan-t5 +OPENLLM_BACKEND=tf openllm start flan-t5 + +openllm start flan-t5 --backend tf ``` > [!NOTE] @@ -425,6 +427,9 @@ OPENLLM_FLAN_T5_FRAMEWORK=tf openllm start flan-t5 > [Jax's installation](https://github.com/google/jax#pip-installation-gpu-cuda-installed-via-pip-easier) > to make sure that you have Jax support for the corresponding CUDA version. +> [!IMPORTANT] +> To use the vLLM backend, a GPU with Ampere or newer architecture and at least CUDA 11.8 are required. + ### Quantisation OpenLLM supports quantisation with diff --git a/changelog.d/283.breaking.md b/changelog.d/283.breaking.md new file mode 100644 index 00000000..80321200 --- /dev/null +++ b/changelog.d/283.breaking.md @@ -0,0 +1,20 @@ +All environment variables are now simplified and no longer need the model-specific prefix. + +For example: `OPENLLM_LLAMA_GENERATION_MAX_NEW_TOKENS` now becomes `OPENLLM_GENERATION_MAX_NEW_TOKENS`. + +Some miscellaneous environment variables are also unified.
To switch between backends, use `--backend` with both `start` and `build`: + +```bash +openllm start llama --backend vllm +``` + +or the environment variable `OPENLLM_BACKEND` + +```bash +OPENLLM_BACKEND=vllm openllm start llama +``` + +`openllm.Runner` will now, by default, try to download the model the first time if it is not available, and use the copy cached in the model store on subsequent runs. + +Model serialisation has been updated to a new API version with clearer naming. Users are kindly asked to run `openllm prune -y --include-bentos` and update to +this current version of openllm. diff --git a/hatch.toml b/hatch.toml index 32ffac03..0a294b57 100644 --- a/hatch.toml +++ b/hatch.toml @@ -31,7 +31,6 @@ check-stubs = [ inplace-changelog = "towncrier build --version main --keep" quality = [ "./tools/dependencies.py", - "./tools/update-readme.py", "- ./tools/update-brew-tap.py", "bash ./tools/sync-readme.sh", "check-stubs", diff --git a/openllm-client/src/openllm_client/_base.py b/openllm-client/src/openllm_client/_base.py index fed7f87d..f2216958 100644 --- a/openllm-client/src/openllm_client/_base.py +++ b/openllm-client/src/openllm_client/_base.py @@ -28,7 +28,7 @@ if t.TYPE_CHECKING: import transformers from openllm_core._typing_compat import DictStrAny - from openllm_core._typing_compat import LiteralRuntime + from openllm_core._typing_compat import LiteralBackend logger = logging.getLogger(__name__) @@ -98,7 +98,7 @@ class _ClientAttr: raise RuntimeError( "transformers is required to use HF agent. Install with 'pip install \"openllm-client[agents]\"'.") if not self.supports_hf_agent: - raise RuntimeError(f'{self.model_name} ({self.framework}) does not support running HF agent.') + raise RuntimeError(f'{self.model_name} ({self.backend}) does not support running HF agent.') if not is_transformers_supports_agent(): raise RuntimeError( "Current 'transformers' does not support Agent.
Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'" @@ -125,9 +125,9 @@ class _ClientAttr: raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None @property - def framework(self) -> LiteralRuntime: + def backend(self) -> LiteralBackend: try: - return self._metadata['framework'] + return self._metadata['backend'] except KeyError: raise RuntimeError('Malformed service endpoint. (Possible malicious)') from None diff --git a/openllm-client/src/openllm_client/py.typed b/openllm-client/src/openllm_client/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/openllm-contrib/clojure/src/main/openllm/components/side_bar/model_selection/db.cljs b/openllm-contrib/clojure/src/main/openllm/components/side_bar/model_selection/db.cljs index 6502f0e2..e6646dd0 100644 --- a/openllm-contrib/clojure/src/main/openllm/components/side_bar/model_selection/db.cljs +++ b/openllm-contrib/clojure/src/main/openllm/components/side_bar/model_selection/db.cljs @@ -31,10 +31,10 @@ (s/def ::model_id (s/coll-of string? :kind vector?)) ;; model_id is a vector of all models for a given model_type (s/def ::url string?) ;; url to the model's page (s/def ::requires_gpu boolean?) ;; whether the model requires a gpu -(s/def ::runtime_impl ::vec-of-runtimes?) ;; supported runtimes +(s/def ::backend ::vec-of-runtimes?) ;; supported runtimes (s/def ::installation string?) ;; installation instructions (pip command) (s/def ::model-spec (s/keys :req-un [::model_id ::url ::requires_gpu ;; the spec for a single model (aggregates all the above) - ::runtime_impl ::installation])) + ::backend ::installation])) (s/def ::all-models #(or loading-text ;; -- this is the case when the file with the model data has not been loaded yet by the ::set-model-data effect (s/map-of keyword? 
::model-spec))) ;; map of all models diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py index da25116a..043f9356 100644 --- a/openllm-core/src/openllm_core/_configuration.py +++ b/openllm-core/src/openllm_core/_configuration.py @@ -66,14 +66,13 @@ from ._typing_compat import AnyCallable from ._typing_compat import At from ._typing_compat import DictStrAny from ._typing_compat import ListStr -from ._typing_compat import LiteralRuntime +from ._typing_compat import LiteralBackend from ._typing_compat import LiteralString from ._typing_compat import NotRequired from ._typing_compat import Required from ._typing_compat import Self from ._typing_compat import overload from .exceptions import ForbiddenAttributeError -from .utils import ENV_VARS_TRUE_VALUES from .utils import MYPY from .utils import LazyLoader from .utils import ReprMixin @@ -312,7 +311,7 @@ class GenerationConfig(ReprMixin): eta_cutoff: float = dantic.Field( 0.0, description= - '''Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. ''' + 'Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. 
In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. ' ) diversity_penalty: float = dantic.Field( 0.0, @@ -387,17 +386,17 @@ class GenerationConfig(ReprMixin): output_attentions: bool = dantic.Field( False, description= - '''Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.''' + 'Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.' ) output_hidden_states: bool = dantic.Field( False, description= - '''Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.''' + 'Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.' ) output_scores: bool = dantic.Field( False, - description= - '''Whether or not to return the prediction scores. See `scores` under returned tensors for more details.''') + description='Whether or not to return the prediction scores. See `scores` under returned tensors for more details.' 
+ ) pad_token_id: int = dantic.Field(description='The id of the *padding* token.') bos_token_id: int = dantic.Field(description='The id of the *beginning-of-sequence* token.') eos_token_id: t.Union[int, t.List[int]] = dantic.Field( @@ -565,7 +564,7 @@ class ModelSettings(t.TypedDict, total=False): architecture: Required[str] # default OpenLLM runtime imlementation - default_implementation: NotRequired[t.Dict[LiteralResourceSpec, LiteralRuntime]] + default_backend: NotRequired[t.Dict[LiteralResourceSpec, LiteralBackend]] # meta url: str @@ -575,9 +574,7 @@ class ModelSettings(t.TypedDict, total=False): requirements: t.Optional[ListStr] # llm implementation specifics - bettertransformer: bool model_type: t.Literal['causal_lm', 'seq2seq_lm'] - runtime: t.Literal['transformers', 'ggml'] # naming convention, only name_type is needed to infer from the class # as the three below it can be determined automatically @@ -597,7 +594,7 @@ class ModelSettings(t.TypedDict, total=False): _transformed_type: DictStrAny = { 'fine_tune_strategies': t.Dict[AdapterType, FineTuneConfig], - 'default_implementation': t.Dict[LiteralResourceSpec, LiteralRuntime] + 'default_backend': t.Dict[LiteralResourceSpec, LiteralBackend] } @attr.define(frozen=False, @@ -628,7 +625,7 @@ class _ModelSettingsAttr: ModelSettings(default_id='__default__', model_ids=['__default__'], architecture='PreTrainedModel', - default_implementation={ + default_backend={ 'cpu': 'pt', 'nvidia.com/gpu': 'pt' }, @@ -641,8 +638,7 @@ class _ModelSettingsAttr: tokenizer_class=None, timeout=int(36e6), service_name='', - workers_per_resource=1., - runtime='transformers'))) + workers_per_resource=1.))) # NOTE: The below are dynamically generated by the field_transformer if t.TYPE_CHECKING: @@ -650,15 +646,13 @@ class _ModelSettingsAttr: default_id: str model_ids: ListStr architecture: str - default_implementation: t.Dict[LiteralResourceSpec, LiteralRuntime] + default_backend: t.Dict[LiteralResourceSpec, LiteralBackend] url: str 
requires_gpu: bool trust_remote_code: bool service_name: str requirements: t.Optional[ListStr] - bettertransformer: bool model_type: t.Literal['causal_lm', 'seq2seq_lm'] - runtime: t.Literal['transformers', 'ggml'] name_type: t.Optional[t.Literal['dasherize', 'lowercase']] model_name: str start_name: str @@ -670,15 +664,14 @@ class _ModelSettingsAttr: # update-config-stubs.py: attrs stop # a heuristic cascading implementation resolver based on available resources -def get_default_implementation( - default_implementation_mapping: dict[LiteralResourceSpec, LiteralRuntime]) -> LiteralRuntime: +def get_default_backend(backend_mapping: dict[LiteralResourceSpec, LiteralBackend]) -> LiteralBackend: available_spec = available_resource_spec() - if resource_spec('tpu') in available_spec: return default_implementation_mapping.get(resource_spec('tpu'), 'pt') - elif resource_spec('amd') in available_spec: return default_implementation_mapping.get(resource_spec('amd'), 'pt') + if resource_spec('tpu') in available_spec: return backend_mapping.get(resource_spec('tpu'), 'pt') + elif resource_spec('amd') in available_spec: return backend_mapping.get(resource_spec('amd'), 'pt') elif resource_spec('nvidia') in available_spec: - return default_implementation_mapping.get(resource_spec('nvidia'), 'pt') + return backend_mapping.get(resource_spec('nvidia'), 'pt') else: - return default_implementation_mapping.get(resource_spec('cpu'), 'pt') + return backend_mapping.get(resource_spec('cpu'), 'pt') def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ModelSettingsAttr: if 'generation_class' in cl_.__config__: @@ -704,23 +697,17 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ model_name = _final_value_dct['model_name'] if 'model_name' in _final_value_dct else _settings_attr.model_name # if the default implementation dependencies doesn't exist, then always fallback to 'pt' - default_implementation = 
_settings_attr.default_implementation - for rs, runtime in default_implementation.items(): + default_backend = _settings_attr.default_backend + for rs, runtime in default_backend.items(): library_stub = 'torch' if runtime == 'pt' else runtime - if not BACKENDS_MAPPING[library_stub][0](): default_implementation[rs] = 'pt' - _final_value_dct['default_implementation'] = default_implementation + if not BACKENDS_MAPPING[library_stub][0](): default_backend[rs] = 'pt' + _final_value_dct['default_backend'] = default_backend env = openllm_core.utils.EnvVarMixin(model_name, - get_default_implementation(default_implementation), - model_id=_settings_attr.default_id, - bettertransformer=_settings_attr.bettertransformer) + backend=get_default_backend(default_backend), + model_id=_settings_attr.default_id) _final_value_dct['env'] = env - # bettertransformer support - if _settings_attr['bettertransformer'] is None: - _final_value_dct['bettertransformer'] = str(env['bettertransformer_value']).upper() in ENV_VARS_TRUE_VALUES - # if requires_gpu is True, then disable BetterTransformer for quantization. - if _settings_attr['requires_gpu']: _final_value_dct['bettertransformer'] = False _final_value_dct['service_name'] = f'generated_{model_name}_service.py' # NOTE: The key for fine-tune strategies is 'fine_tune_strategies' @@ -775,16 +762,16 @@ class _ConfigAttr: @staticmethod def Field(default: t.Any = None, **attrs: t.Any) -> t.Any: + '''Field is a alias to the internal dantic utilities to easily create + attrs.fields with pydantic-compatible interface. For example: + + ```python + class MyModelConfig(openllm.LLMConfig): + field1 = openllm.LLMConfig.Field(...) + ``` + ''' return dantic.Field(default, **attrs) - '''Field is a alias to the internal dantic utilities to easily create - attrs.fields with pydantic-compatible interface. For example: - - ```python - class MyModelConfig(openllm.LLMConfig): - field1 = openllm.LLMConfig.Field(...) 
- ``` - ''' # NOTE: The following is handled via __init_subclass__, and is only used for TYPE_CHECKING if t.TYPE_CHECKING: # NOTE: public attributes to override @@ -873,11 +860,8 @@ class _ConfigAttr: ```bash openllm start gpt-neox --model-id stabilityai/stablelm-tuned-alpha-3b ```''' - __openllm_default_implementation__: t.Dict[LiteralResourceSpec, LiteralRuntime] = Field(None) - '''The default runtime to run this LLM. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. - - It is a dictionary of key as the accelerator spec in k4s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM Runtime ('flax', 'tf', 'pt', 'vllm') - ''' + __openllm_default_backend__: t.Dict[LiteralResourceSpec, LiteralBackend] = Field(None) + '''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')''' __openllm_url__: str = Field(None) '''The resolved url for this LLMConfig.''' __openllm_requires_gpu__: bool = Field(None) @@ -885,18 +869,11 @@ class _ConfigAttr: __openllm_trust_remote_code__: bool = Field(None) '''Whether to always trust remote code''' __openllm_service_name__: str = Field(None) - """Generated service name for this LLMConfig. By default, it is 'generated_{model_name}_service.py'""" + '''Generated service name for this LLMConfig. By default, it is "generated_{model_name}_service.py"''' __openllm_requirements__: t.Optional[ListStr] = Field(None) - '''The default PyPI requirements needed to run this given LLM. 
By default, we will depend on - bentoml, torch, transformers.''' - __openllm_bettertransformer__: bool = Field(None) - '''Whether to use BetterTransformer for this given LLM. This depends per model architecture. By default, we will use BetterTransformer for T5 and StableLM models, and set to False for every other models.''' + '''The default PyPI requirements needed to run this given LLM. By default, we will depend on bentoml, torch, transformers.''' __openllm_model_type__: t.Literal['causal_lm', 'seq2seq_lm'] = Field(None) - '''The model type for this given LLM. By default, it should be causal language modeling. - Currently supported 'causal_lm' or 'seq2seq_lm' - ''' - __openllm_runtime__: t.Literal['transformers', 'ggml'] = Field(None) - '''The runtime to use for this model. Possible values are `transformers` or `ggml`. See Llama for more information.''' + '''The model type for this given LLM. By default, it should be causal language modeling. Currently supported "causal_lm" or "seq2seq_lm"''' __openllm_name_type__: t.Optional[t.Literal['dasherize', 'lowercase']] = Field(None) '''The default name typed for this model. "dasherize" will convert the name to lowercase and replace spaces with dashes. "lowercase" will convert the name to lowercase. 
If this is not set, then both @@ -1212,8 +1189,8 @@ class LLMConfig(_ConfigAttr): annotated_names.add(attr_name) val = cd.get(attr_name, attr.NOTHING) if not isinstance(val, _CountingAttr): - if val is attr.NOTHING: val = cls.Field(env=field_env_key(cls.__openllm_model_name__, attr_name)) - else: val = cls.Field(default=val, env=field_env_key(cls.__openllm_model_name__, attr_name)) + if val is attr.NOTHING: val = cls.Field(env=field_env_key(attr_name)) + else: val = cls.Field(default=val, env=field_env_key(attr_name)) these[attr_name] = val unannotated = ca_names - annotated_names if len(unannotated) > 0: @@ -1293,7 +1270,7 @@ class LLMConfig(_ConfigAttr): @overload def __getitem__(self, item: t.Literal['architecture']) -> str: ... @overload - def __getitem__(self, item: t.Literal['default_implementation']) -> t.Dict[LiteralResourceSpec, LiteralRuntime]: ... + def __getitem__(self, item: t.Literal['default_backend']) -> t.Dict[LiteralResourceSpec, LiteralBackend]: ... @overload def __getitem__(self, item: t.Literal['url']) -> str: ... @overload @@ -1305,12 +1282,8 @@ class LLMConfig(_ConfigAttr): @overload def __getitem__(self, item: t.Literal['requirements']) -> t.Optional[ListStr]: ... @overload - def __getitem__(self, item: t.Literal['bettertransformer']) -> bool: ... - @overload def __getitem__(self, item: t.Literal['model_type']) -> t.Literal['causal_lm', 'seq2seq_lm']: ... @overload - def __getitem__(self, item: t.Literal['runtime']) -> t.Literal['transformers', 'ggml']: ... - @overload def __getitem__(self, item: t.Literal['name_type']) -> t.Optional[t.Literal['dasherize', 'lowercase']]: ... @overload def __getitem__(self, item: t.Literal['model_name']) -> str: ... 
@@ -1663,9 +1636,9 @@ class LLMConfig(_ConfigAttr): return _PEFT_TASK_TYPE_TARGET_MAPPING[cls.__openllm_model_type__] @classmethod - def default_implementation(cls) -> LiteralRuntime: - return first_not_none(cls.__openllm_env__['framework_value'], - default=get_default_implementation(cls.__openllm_default_implementation__)) + def default_backend(cls) -> LiteralBackend: + return first_not_none(cls.__openllm_env__['backend_value'], + default=get_default_backend(cls.__openllm_default_backend__)) def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]: '''This handler will sanitize all attrs and setup prompt text. diff --git a/openllm-core/src/openllm_core/_schema.py b/openllm-core/src/openllm_core/_schema.py index 04748034..9e28451b 100644 --- a/openllm-core/src/openllm_core/_schema.py +++ b/openllm-core/src/openllm_core/_schema.py @@ -1,4 +1,4 @@ -'''Schema definition for OpenLLM. This can be use for client interaction.''' +'''Schema definition for OpenLLM. This schema is used throughout openllm core components library.''' from __future__ import annotations import functools import typing as t @@ -77,7 +77,7 @@ class MetadataOutput: model_id: str timeout: int model_name: str - framework: str + backend: str configuration: str supports_embeddings: bool supports_hf_agent: bool diff --git a/openllm-core/src/openllm_core/_strategies.py b/openllm-core/src/openllm_core/_strategies.py index a6e9b5a0..a1b976c3 100644 --- a/openllm-core/src/openllm_core/_strategies.py +++ b/openllm-core/src/openllm_core/_strategies.py @@ -94,7 +94,7 @@ def _from_system(cls: type[DynResource]) -> list[str]: if visible_devices is None: if cls.resource_id == 'amd.com/gpu': if not psutil.LINUX: - if DEBUG: warnings.warn('AMD GPUs is currently only supported on Linux.', stacklevel=_STACK_LEVEL) + if DEBUG: logger.debug('AMD GPUs is currently only supported on Linux.') return [] # ROCm does not currently have the rocm_smi wheel. 
# So we need to use the ctypes bindings directly. diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py index 85fec6bc..d28e53fd 100644 --- a/openllm-core/src/openllm_core/_typing_compat.py +++ b/openllm-core/src/openllm_core/_typing_compat.py @@ -21,6 +21,8 @@ if t.TYPE_CHECKING: from bentoml._internal.runner.runnable import RunnableMethod from bentoml._internal.runner.runner import RunnerMethod from bentoml._internal.runner.strategy import Strategy + from openllm._llm import LLM + from openllm_core._schema import EmbeddingsOutput from .utils.lazy import VersionInfo @@ -35,6 +37,9 @@ T = t.TypeVar( 't.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]' ) +def get_literal_args(typ: t.Any) -> tuple[str, ...]: + return getattr(typ, '__args__') + AnyCallable = t.Callable[..., t.Any] DictStrAny = t.Dict[str, t.Any] ListAny = t.List[t.Any] @@ -42,7 +47,7 @@ ListStr = t.List[str] TupleAny = t.Tuple[t.Any, ...] 
At = t.TypeVar('At', bound=attr.AttrsInstance) -LiteralRuntime = t.Literal['pt', 'tf', 'flax', 'vllm'] +LiteralBackend = t.Literal['pt', 'tf', 'flax', 'vllm', 'ggml', 'mlc'] AdapterType = t.Literal['lora', 'adalora', 'adaption_prompt', 'prefix_tuning', 'p_tuning', 'prompt_tuning', 'ia3'] # TODO: support quay @@ -78,10 +83,6 @@ class PeftAdapterOutput(t.TypedDict): result: t.Dict[str, peft.PeftConfig] error_msg: str -class LLMEmbeddings(t.TypedDict): - embeddings: t.List[t.List[float]] - num_tokens: int - class AdaptersTuple(TupleAny): adapter_id: str name: t.Optional[str] @@ -98,7 +99,7 @@ class LLMRunnable(bentoml.Runnable, t.Generic[M, T]): SUPPORTED_RESOURCES = ('amd.com/gpu', 'nvidia.com/gpu', 'cpu') SUPPORTS_CPU_MULTI_THREADING = True __call__: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]] - embeddings: RunnableMethod[LLMRunnable[M, T], [list[str]], LLMEmbeddings] + embeddings: RunnableMethod[LLMRunnable[M, T], [list[str]], EmbeddingsOutput] generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]] generate_one: RunnableMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]] generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]] @@ -108,15 +109,14 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]): __module__: str llm_type: str llm_tag: bentoml.Tag - llm_framework: LiteralRuntime identifying_params: dict[str, t.Any] llm: openllm.LLM[M, T] config: openllm.LLMConfig - implementation: LiteralRuntime + backend: LiteralBackend supports_embeddings: bool supports_hf_agent: bool has_adapters: bool - embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[LLMEmbeddings]] + embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[EmbeddingsOutput]] generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]] generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]] generate_iterator: 
RunnerMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]] @@ -139,7 +139,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]): ... @abc.abstractmethod - def embed(self, prompt: str | list[str]) -> LLMEmbeddings: + def embed(self, prompt: str | list[str]) -> EmbeddingsOutput: ... def run(self, prompt: str, **attrs: t.Any) -> t.Any: @@ -161,3 +161,25 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]): @abc.abstractmethod def __repr_keys__(self) -> set[str]: ... + +class load_model_protocol(t.Generic[M, T], t.Protocol): + + def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: + ... + +class load_tokenizer_protocol(t.Generic[M, T], t.Protocol): + + def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T: + ... + +_R = t.TypeVar('_R', covariant=True) + +class import_model_protocol(t.Generic[_R, M, T], t.Protocol): + + def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R: + ... + +class llm_post_init_protocol(t.Generic[M, T], t.Protocol): + + def __call__(self, llm: LLM[M, T]) -> T: + ... diff --git a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py index 6e822f5a..6ab24a99 100644 --- a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py +++ b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py @@ -48,14 +48,14 @@ def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str) treated specially and converted to a single, new token. This retrieves the token ID each of these keys map to. Args: - tokenizer: the tokenizer - key: the key to convert to a single token + tokenizer: the tokenizer + key: the key to convert to a single token Raises: - RuntimeError: if more than one ID was generated + RuntimeError: if more than one ID was generated Returns: - int: the token ID for the given key. + int: the token ID for the given key. 
''' token_ids = tokenizer.encode(key) if len(token_ids) > 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}") diff --git a/openllm-core/src/openllm_core/config/configuration_flan_t5.py b/openllm-core/src/openllm_core/config/configuration_flan_t5.py index e0a73e91..1c5eddc9 100644 --- a/openllm-core/src/openllm_core/config/configuration_flan_t5.py +++ b/openllm-core/src/openllm_core/config/configuration_flan_t5.py @@ -17,14 +17,14 @@ Run a LLMServer for FLAN-T5 model. By default, this model will use the PyTorch model for inference. However, this model supports both Flax and Tensorflow. \b -- To use Flax, set the environment variable ``OPENLLM_FLAN_T5_FRAMEWORK="flax"`` +- To use Flax, set the environment variable ``OPENLLM_BACKEND="flax"`` \b -- To use Tensorflow, set the environment variable ``OPENLLM_FLAN_T5_FRAMEWORK="tf"`` +- To use Tensorflow, set the environment variable ``OPENLLM_BACKEND="tf"`` \b FLAN-T5 Runner will use google/flan-t5-large as the default model. To change to any other FLAN-T5 -saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_FLAN_T5_MODEL_ID='google/flan-t5-xxl'`` +saved pretrained, or a fine-tune FLAN-T5, provide ``OPENLLM_MODEL_ID='google/flan-t5-xxl'`` or provide `--model-id` flag when running ``openllm start flan-t5``: \b diff --git a/openllm-core/src/openllm_core/config/configuration_llama.py b/openllm-core/src/openllm_core/config/configuration_llama.py index b6ce000d..b96a9785 100644 --- a/openllm-core/src/openllm_core/config/configuration_llama.py +++ b/openllm-core/src/openllm_core/config/configuration_llama.py @@ -19,11 +19,14 @@ By default, this model will use [vLLM](https://github.com/vllm-project/vllm) for This model will also supports PyTorch. 
\b -- To use PyTorch, set the environment variable ``OPENLLM_LLAMA_FRAMEWORK="pt"`` +- To use PyTorch, set the environment variable ``OPENLLM_BACKEND="pt"`` + +\b +- To use vLLM, set the environment variable ``OPENLLM_BACKEND="vllm"`` \b Llama Runner will use decapoda-research/llama-7b-hf as the default model. To change to any other Llama -saved pretrained, or a fine-tune Llama, provide ``OPENLLM_LLAMA_MODEL_ID='openlm-research/open_llama_7b_v2'`` +saved pretrained, or a fine-tune Llama, provide ``OPENLLM_MODEL_ID='openlm-research/open_llama_7b_v2'`` or provide `--model-id` flag when running ``openllm start llama``: \b @@ -70,7 +73,7 @@ class LlamaConfig(openllm_core.LLMConfig): 'lowercase', 'url': 'https://github.com/facebookresearch/llama', - 'default_implementation': { + 'default_backend': { 'cpu': 'pt', 'nvidia.com/gpu': 'pt' }, diff --git a/openllm-core/src/openllm_core/config/configuration_opt.py b/openllm-core/src/openllm_core/config/configuration_opt.py index 2a5c323e..2ddf0cdc 100644 --- a/openllm-core/src/openllm_core/config/configuration_opt.py +++ b/openllm-core/src/openllm_core/config/configuration_opt.py @@ -18,14 +18,14 @@ Run a LLMServer for OPT model. By default, this model will use the PyTorch model for inference. However, this model supports both Flax and Tensorflow. \b -- To use Flax, set the environment variable ``OPENLLM_OPT_FRAMEWORK="flax"`` +- To use Flax, set the environment variable ``OPENLLM_BACKEND="flax"`` \b -- To use Tensorflow, set the environment variable ``OPENLLM_OPT_FRAMEWORK="tf"`` +- To use Tensorflow, set the environment variable ``OPENLLM_BACKEND="tf"`` \b OPT Runner will use facebook/opt-2.7b as the default model. 
To change to any other OPT -saved pretrained, or a fine-tune OPT, provide ``OPENLLM_OPT_MODEL_ID='facebook/opt-6.7b'`` +saved pretrained, or a fine-tune OPT, provide ``OPENLLM_MODEL_ID='facebook/opt-6.7b'`` or provide `--model-id` flag when running ``openllm start opt``: \b diff --git a/openllm-core/src/openllm_core/py.typed b/openllm-core/src/openllm_core/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index 521170a8..94948144 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -47,12 +47,12 @@ logger = logging.getLogger(__name__) try: from typing import GenericAlias as _TypingGenericAlias # type: ignore except ImportError: - _TypingGenericAlias = ( - ) # type: ignore # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on) + # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] 
and so on) + _TypingGenericAlias = () # type: ignore if sys.version_info < (3, 10): _WithArgsTypes = (_TypingGenericAlias,) else: - _WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType - ) # type: ignore # _GenericAlias is the actual GenericAlias implementation + # _GenericAlias is the actual GenericAlias implementation + _WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType) # type: ignore DEV_DEBUG_VAR = 'OPENLLMDEVDEBUG' @@ -96,6 +96,9 @@ def generate_hash_from_file(f: str, algorithm: t.Literal['md5', 'sha1'] = 'sha1' def device_count() -> int: return len(available_devices()) +def check_bool_env(env: str, default: bool = True) -> bool: + return os.environ.get(env, str(default)).upper() in ENV_VARS_TRUE_VALUES + # equivocal setattr to save one lookup per assignment _object_setattr = object.__setattr__ @@ -104,14 +107,16 @@ def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None: _setattr = functools.partial(setattr, obj) if isinstance(obj, type) else _object_setattr.__get__(obj) if not hasattr(obj, name): _setattr(name, value) -def field_env_key(model_name: str, key: str, suffix: str | None = None) -> str: - return '_'.join(filter(None, map(str.upper, ['OPENLLM', model_name, suffix.strip('_') if suffix else '', key]))) +def field_env_key(key: str, suffix: str | None = None) -> str: + return '_'.join(filter(None, map(str.upper, ['OPENLLM', suffix.strip('_') if suffix else '', key]))) # Special debug flag controled via OPENLLMDEVDEBUG -DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get(DEV_DEBUG_VAR))) +DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and check_bool_env(DEV_DEBUG_VAR, default=False)) +# Whether to show the codenge for debug purposes +SHOW_CODEGEN: bool = DEBUG and (os.environ.get(DEV_DEBUG_VAR, str(0)).isdigit() and + int(os.environ.get(DEV_DEBUG_VAR, str(0))) > 3) # MYPY is like t.TYPE_CHECKING, but reserved for Mypy 
plugins MYPY = False -SHOW_CODEGEN: bool = DEBUG and int(os.environ.get('OPENLLMDEVDEBUG', str(0))) > 3 def get_debug_mode() -> bool: return DEBUG or _get_debug_mode() @@ -193,6 +198,7 @@ def configure_logging() -> None: _LOGGING_CONFIG['loggers']['bentoml']['level'] = logging.ERROR _LOGGING_CONFIG['root']['level'] = logging.ERROR elif get_debug_mode() or DEBUG: + _LOGGING_CONFIG['handlers']['defaulthandler']['level'] = logging.DEBUG _LOGGING_CONFIG['loggers']['openllm']['level'] = logging.DEBUG _LOGGING_CONFIG['loggers']['bentoml']['level'] = logging.DEBUG _LOGGING_CONFIG['root']['level'] = logging.DEBUG @@ -330,8 +336,8 @@ _import_structure: dict[str, list[str]] = { 'analytics': [], 'codegen': [], 'dantic': [], + 'lazy': [], 'representation': ['ReprMixin'], - 'lazy': ['LazyModule'], 'import_utils': [ 'OPTIONAL_DEPENDENCIES', 'DummyMetaclass', 'EnvVarMixin', 'require_backends', 'is_cpm_kernels_available', 'is_einops_available', 'is_flax_available', 'is_tf_available', 'is_vllm_available', 'is_torch_available', diff --git a/openllm-core/src/openllm_core/utils/analytics.py b/openllm-core/src/openllm_core/utils/analytics.py index 53a10f65..3e680ccf 100644 --- a/openllm-core/src/openllm_core/utils/analytics.py +++ b/openllm-core/src/openllm_core/utils/analytics.py @@ -24,11 +24,10 @@ logger = logging.getLogger(__name__) # This variable is a proxy that will control BENTOML_DO_NOT_TRACK OPENLLM_DO_NOT_TRACK = 'OPENLLM_DO_NOT_TRACK' -DO_NOT_TRACK = os.environ.get(OPENLLM_DO_NOT_TRACK, str(False)).upper() @functools.lru_cache(maxsize=1) def do_not_track() -> bool: - return DO_NOT_TRACK in openllm_core.utils.ENV_VARS_TRUE_VALUES + return openllm_core.utils.check_bool_env(OPENLLM_DO_NOT_TRACK) @functools.lru_cache(maxsize=1) def _usage_event_debugging() -> bool: diff --git a/openllm-core/src/openllm_core/utils/codegen.py b/openllm-core/src/openllm_core/utils/codegen.py index 7c2cce5c..7141bdc1 100644 --- a/openllm-core/src/openllm_core/utils/codegen.py +++ 
b/openllm-core/src/openllm_core/utils/codegen.py @@ -96,7 +96,7 @@ def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]) -> type[t. else: attr_class_template.append(' pass') globs: DictStrAny = {'_attrs_itemgetter': itemgetter, '_attrs_property': property} - if SHOW_CODEGEN: logger.info('Generated class for %s:\n\n%s', attr_class_name, '\n'.join(attr_class_template)) + if SHOW_CODEGEN: print(f'Generated class for {attr_class_name}:\n\n', '\n'.join(attr_class_template)) _compile_and_eval('\n'.join(attr_class_template), globs) return globs[attr_class_name] @@ -114,7 +114,7 @@ def generate_function(typ: type[t.Any], '\n '.join(lines) if lines else 'pass') meth = _make_method(func_name, script, generate_unique_filename(typ, func_name), globs) if annotations: meth.__annotations__ = annotations - if SHOW_CODEGEN: logger.info('Generated script for %s:\n\n%s', typ, script) + if SHOW_CODEGEN: print('Generated script for {typ}:\n\n', script) return meth def make_env_transformer(cls: type[openllm_core.LLMConfig], @@ -139,11 +139,8 @@ def make_env_transformer(cls: type[openllm_core.LLMConfig], '__model_name': model_name, }) lines: ListStr = [ - '__env = lambda field_name: __field_env(__model_name, field_name, __suffix)', 'return [', ' f.evolve(', - ' default=__populate_env(__default_callback(f.name, f.default), __env(f.name)),', ' metadata={', - " 'env': f.metadata.get('env', __env(f.name)),", - " 'description': f.metadata.get('description', '(not provided)'),", ' },', ' )', - ' for f in fields', ']' + '__env=lambda field_name:__field_env(field_name,__suffix)', + "return [f.evolve(default=__populate_env(__default_callback(f.name,f.default),__env(f.name)),metadata={'env':f.metadata.get('env',__env(f.name)),'description':f.metadata.get('description', '(not provided)')}) for f in fields]" ] fields_ann = 'list[attr.Attribute[t.Any]]' return generate_function(cls, diff --git a/openllm-core/src/openllm_core/utils/import_utils.py 
b/openllm-core/src/openllm_core/utils/import_utils.py index eb1d2474..8ea867b2 100644 --- a/openllm-core/src/openllm_core/utils/import_utils.py +++ b/openllm-core/src/openllm_core/utils/import_utils.py @@ -17,6 +17,7 @@ import openllm_core from bentoml._internal.utils import LazyLoader from bentoml._internal.utils import pkg +from openllm_core._typing_compat import LiteralBackend from openllm_core._typing_compat import LiteralString from openllm_core._typing_compat import overload @@ -24,7 +25,6 @@ from .representation import ReprMixin if t.TYPE_CHECKING: BackendOrderedDict = OrderedDict[str, t.Tuple[t.Callable[[], bool], str]] - from openllm_core._typing_compat import LiteralRuntime logger = logging.getLogger(__name__) OPTIONAL_DEPENDENCIES = { @@ -336,9 +336,7 @@ class EnvVarMixin(ReprMixin): config: str model_id: str quantize: str - framework: str - bettertransformer: str - runtime: str + backend: str @overload def __getitem__(self, item: t.Literal['config']) -> str: @@ -353,19 +351,11 @@ class EnvVarMixin(ReprMixin): ... @overload - def __getitem__(self, item: t.Literal['framework']) -> str: + def __getitem__(self, item: t.Literal['backend']) -> str: ... @overload - def __getitem__(self, item: t.Literal['bettertransformer']) -> str: - ... - - @overload - def __getitem__(self, item: t.Literal['runtime']) -> str: - ... - - @overload - def __getitem__(self, item: t.Literal['framework_value']) -> LiteralRuntime: + def __getitem__(self, item: t.Literal['backend_value']) -> LiteralBackend: ... @overload @@ -376,14 +366,6 @@ class EnvVarMixin(ReprMixin): def __getitem__(self, item: t.Literal['model_id_value']) -> str | None: ... - @overload - def __getitem__(self, item: t.Literal['bettertransformer_value']) -> bool: - ... - - @overload - def __getitem__(self, item: t.Literal['runtime_value']) -> t.Literal['ggml', 'transformers']: - ... 
- def __getitem__(self, item: str | t.Any) -> t.Any: if item.endswith('_value') and hasattr(self, f'_{item}'): return object.__getattribute__(self, f'_{item}')() elif hasattr(self, item): return getattr(self, item) @@ -391,50 +373,34 @@ class EnvVarMixin(ReprMixin): def __init__(self, model_name: str, - implementation: LiteralRuntime = 'pt', + backend: LiteralBackend = 'pt', model_id: str | None = None, - bettertransformer: bool | None = None, - quantize: LiteralString | None = None, - runtime: t.Literal['ggml', 'transformers'] = 'transformers') -> None: + quantize: LiteralString | None = None) -> None: '''EnvVarMixin is a mixin class that returns the value extracted from environment variables.''' from openllm_core.utils import field_env_key self.model_name = inflection.underscore(model_name) - self._implementation = implementation + self._backend = backend self._model_id = model_id - self._bettertransformer = bettertransformer self._quantize = quantize - self._runtime = runtime - for att in {'config', 'model_id', 'quantize', 'framework', 'bettertransformer', 'runtime'}: - setattr(self, att, field_env_key(self.model_name, att.upper())) + for att in {'config', 'model_id', 'quantize', 'backend'}: + setattr(self, att, field_env_key(att.upper())) def _quantize_value(self) -> t.Literal['int8', 'int4', 'gptq'] | None: from . import first_not_none return t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], first_not_none(os.environ.get(self['quantize']), default=self._quantize)) - def _framework_value(self) -> LiteralRuntime: + def _backend_value(self) -> LiteralBackend: from . import first_not_none - return t.cast(LiteralRuntime, first_not_none(os.environ.get(self['framework']), default=self._implementation)) - - def _bettertransformer_value(self) -> bool: - from . 
import first_not_none - return t.cast( - bool, - first_not_none(os.environ.get(self['bettertransformer'], str(False)).upper() in ENV_VARS_TRUE_VALUES, - default=self._bettertransformer)) + return t.cast(LiteralBackend, first_not_none(os.environ.get(self['backend']), default=self._backend)) def _model_id_value(self) -> str | None: from . import first_not_none return first_not_none(os.environ.get(self['model_id']), default=self._model_id) - def _runtime_value(self) -> t.Literal['ggml', 'transformers']: - from . import first_not_none - return t.cast(t.Literal['ggml', 'transformers'], - first_not_none(os.environ.get(self['runtime']), default=self._runtime)) - @property def __repr_keys__(self) -> set[str]: - return {'config', 'model_id', 'quantize', 'framework', 'bettertransformer', 'runtime'} + return {'config', 'model_id', 'quantize', 'backend'} @property def start_docstring(self) -> str: diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py index 2060b332..a0772855 100644 --- a/openllm-python/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -47,7 +47,7 @@ _import_structure: dict[str, list[str]] = { "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], "_quantisation": ["infer_quantisation_config"], "_embeddings": ["GenericEmbeddingRunnable"], - "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], + "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "EmbeddingsOutput"], "_generation": [ "StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor" @@ -72,7 +72,7 @@ COMPILED = _Path(__file__).suffix in (".pyd", ".so") if _t.TYPE_CHECKING: from . 
import bundle as bundle, cli as cli, client as client, models as models, playground as playground, serialisation as serialisation, testing as testing from ._generation import LogitsProcessorList as LogitsProcessorList, StopOnTokens as StopOnTokens, StoppingCriteriaList as StoppingCriteriaList, StopSequenceCriteria as StopSequenceCriteria, prepare_logits_processor as prepare_logits_processor - from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner + from ._llm import LLM as LLM, EmbeddingsOutput as EmbeddingsOutput, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner from ._quantisation import infer_quantisation_config as infer_quantisation_config from ._embeddings import GenericEmbeddingRunnable as GenericEmbeddingRunnable from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc @@ -196,7 +196,12 @@ else: __lazy = openllm_core.utils.LazyModule(__name__, globals()["__file__"], _import_structure, - extra_objects={"COMPILED": COMPILED}) + extra_objects={ + "COMPILED": COMPILED, + "__openllm_migration__": { + "LLMEmbeddings": "EmbeddingsOutput" + } + }) __all__ = __lazy.__all__ __dir__ = __lazy.__dir__ __getattr__ = __lazy.__getattr__ diff --git a/openllm-python/src/openllm/_assign.py b/openllm-python/src/openllm/_assign.py new file mode 100644 index 00000000..8026204e --- /dev/null +++ b/openllm-python/src/openllm/_assign.py @@ -0,0 +1,201 @@ +'''LLM assignment magik.''' +from __future__ import annotations +import functools +import traceback +import typing as t + +import openllm + +from openllm.exceptions import OpenLLMException +from openllm_core._configuration import _object_getattribute +from openllm_core._configuration import _setattr_class +from openllm_core._schema import unmarshal_vllm_outputs +from openllm_core._typing_compat import DictStrAny +from openllm_core._typing_compat import 
ListStr +from openllm_core._typing_compat import M +from openllm_core._typing_compat import T +from openllm_core._typing_compat import import_model_protocol +from openllm_core._typing_compat import llm_post_init_protocol +from openllm_core._typing_compat import load_model_protocol +from openllm_core._typing_compat import load_tokenizer_protocol +from openllm_core.utils import LazyLoader +from openllm_core.utils import codegen +from openllm_core.utils import device_count +from openllm_core.utils import first_not_none +from openllm_core.utils import is_torch_available + +if t.TYPE_CHECKING: + import torch + import vllm + + import bentoml + + from openllm._llm import LLM +else: + torch = LazyLoader('torch', globals(), 'torch') + vllm = LazyLoader('vllm', globals(), 'vllm') + +def import_model(fn: import_model_protocol[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]: + + @functools.wraps(fn) + def inner(self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any) -> bentoml.Model: + trust_remote_code = first_not_none(trust_remote_code, default=self.trust_remote_code) + (model_decls, model_attrs), _ = self.llm_parameters + decls = (*model_decls, *decls) + attrs = {**model_attrs, **attrs} + return fn(self, *decls, trust_remote_code=trust_remote_code, **attrs) + + return inner + +def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]: + + @functools.wraps(fn) + def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine: + if self.__llm_backend__ == 'vllm': + # TODO: Do some more processing with token_id once we support token streaming + try: + return vllm.LLMEngine.from_engine_args( + vllm.EngineArgs(model=self._bentomodel.path, + tokenizer=self._bentomodel.path if self.tokenizer_id == 'local' else self.tokenizer_id, + tokenizer_mode='auto', + tensor_parallel_size=1 if device_count() < 2 else device_count(), + dtype='auto', + worker_use_ray=False)) + except Exception as 
err: + traceback.print_exc() + raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from None + else: + (model_decls, model_attrs), _ = self.llm_parameters + return fn(self, *(*model_decls, *decls), **{**model_attrs, **attrs}) + + return inner + +def load_tokenizer(fn: load_tokenizer_protocol[M, T]) -> t.Callable[[LLM[M, T]], T]: + + @functools.wraps(fn) + def inner(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T: + return fn(self, **{**self.llm_parameters[-1], **tokenizer_attrs}) + + return inner + +def llm_post_init(fn: llm_post_init_protocol[M, T]) -> t.Callable[[LLM[M, T]], None]: + + @functools.wraps(fn) + def inner(self: LLM[M, T]) -> None: + if self.__llm_backend__ == 'pt' and is_torch_available(): + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + fn(self) + + return inner + +def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]: + '''Make LLM attributes for the given LLM subclass.''' + from ._llm import LLM + from ._llm import LLMFunction + from ._llm import LLMInterface + from ._llm import LLMSerialisation + + args: ListStr = [] + globs: DictStrAny = {'cls': cls, '__wrapped_llm_post_init': llm_post_init, 'LLM': LLM} + # _cached_LLMFunction_get and _ccached_LLMSerialisation_get + globs.update( + {f'_cached_{cl_.__name__}_get': _object_getattribute.__get__(cl_) for cl_ in {LLMSerialisation, LLMFunction}}) + # llm_post_init implementation + lines: ListStr = [ + f'_impl_{cls.__name__}_func=cls.llm_post_init', + _setattr_class('llm_post_init', f'__wrapped_llm_post_init(_impl_{cls.__name__}_func)') + ] + + serialisation_attr = {'import_model': import_model, 'load_model': load_model, 'load_tokenizer': load_tokenizer,} + for func, impl in serialisation_attr.items(): + impl_name = f'__wrapped_{func}' + globs.update({f'__serialisation_{func}': getattr(openllm.serialisation, func, None), impl_name: impl}) + cached_func_name = f'_cached_{cls.__name__}_func' + func_call 
= f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMSerialisation_get('{func}') else __serialisation_{func}" + lines.extend([ + f'{cached_func_name}=cls.{func}', func_call, + _setattr_class(func, f'{impl_name}(_impl_{cls.__name__}_{func})') + ]) + + # assign vLLM implementation + if cls.__llm_backend__ == 'vllm': + vllm_func = { + f'_vllm_{it}': fn + for it, fn in zip(('generate', 'generate_iterator', + 'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate)) + } + globs.update(vllm_func) + lines.extend([_setattr_class(it[6:], it) for it in vllm_func]) + + interface_anns = codegen.get_annotations(LLMInterface) + + # cached attribute initialisation + def dunder_cached(key: str) -> str: + return f'__llm_{key}__' + + st_attr = {'model', 'tokenizer', 'adapter_map'} + lines.extend([_setattr_class(dunder_cached(v), None) for v in st_attr]) + + # boolean for better LLM implementation resolver + def dunder_support(key: str) -> str: + return f'__llm_supports_{key}__' + + bool_attr = {it[15:-2] for it in interface_anns if it.startswith('__llm_supports_')} + lines.extend( + [_setattr_class(dunder_support(fn), f"cls.{fn} is not _cached_LLMFunction_get('{fn}')") for fn in bool_attr]) + + return codegen.generate_function(cls, + '__assign_llm_attr', + lines, + args=('cls', *args), + globs=globs, + annotations={ + 'cls': 't.Type[LLM]', + 'return': None + }) + +def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]], + **_: t.Any) -> str: + return generation_result[0]['outputs'][0]['text'] + +def vllm_generate_iterator(self: LLM['vllm.LLMEngine', T], + prompt: str, + /, + *, + echo: bool = False, + stop: str | t.Iterable[str] | None = None, + stop_token_ids: list[int] | None = None, + **attrs: t.Any) -> t.Iterator[dict[str, t.Any]]: + request_id: str | None = attrs.pop('request_id', None) + if request_id is None: raise ValueError('request_id must 
not be None.') + if stop_token_ids is None: stop_token_ids = [] + stop_token_ids.append(self.tokenizer.eos_token_id) + stop_: set[str] = set() + if isinstance(stop, str) and stop != '': stop_.add(stop) + elif isinstance(stop, list) and stop != []: stop_.update(stop) + for tid in stop_token_ids: + if tid: stop_.add(self.tokenizer.decode(tid)) + + if self.config['temperature'] <= 1e-5: top_p = 1.0 + else: top_p = self.config['top_p'] + config = self.config.model_construct_env(stop=list(stop_), top_p=top_p, **attrs) + self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=config.to_sampling_config()) + while self.model.has_unfinished_requests(): + for request_output in self.model.step(): + prompt = request_output.prompt + if echo: text_outputs = [prompt + output.text for output in request_output.outputs] + else: text_outputs = [output.text for output in request_output.outputs] + yield {'text': text_outputs, 'error_code': 0} + if request_output.finished: break + +def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]: + request_id: str | None = attrs.pop('request_id', None) + if request_id is None: raise ValueError('request_id must not be None.') + outputs: list[vllm.RequestOutput] = [] + # TODO: support prompt_token_ids + self.model.add_request(request_id=request_id, + prompt=prompt, + sampling_params=self.config.model_construct_env(**attrs).to_sampling_config()) + while self.model.has_unfinished_requests(): + outputs.extend([r for r in self.model.step() if r.finished]) + return [unmarshal_vllm_outputs(i) for i in outputs] diff --git a/openllm-python/src/openllm/_embeddings.py b/openllm-python/src/openllm/_embeddings.py index 526ce67b..8a0cda9c 100644 --- a/openllm-python/src/openllm/_embeddings.py +++ b/openllm-python/src/openllm/_embeddings.py @@ -58,7 +58,7 @@ class GenericEmbeddingRunnable(bentoml.Runnable): self.model.to(self.device) @bentoml.Runnable.method(batchable=True, batch_dim=0) - def 
encode(self, sentences: list[str]) -> t.Sequence[openllm.LLMEmbeddings]: + def encode(self, sentences: list[str]) -> t.Sequence[openllm.EmbeddingsOutput]: import torch import torch.nn.functional as F encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(self.device) @@ -69,8 +69,8 @@ class GenericEmbeddingRunnable(bentoml.Runnable): # Perform pooling and normalize sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1) return [ - openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(), - num_tokens=int(torch.sum(attention_mask).item())) + openllm.EmbeddingsOutput(embeddings=sentence_embeddings.cpu().numpy(), + num_tokens=int(torch.sum(attention_mask).item())) ] @staticmethod diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 80e0ae12..e7bccd8b 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -1,14 +1,10 @@ # mypy: disable-error-code="name-defined,attr-defined" from __future__ import annotations import abc -import functools import gc import inspect import logging import os -import pathlib -import re -import traceback import types import typing as t @@ -26,29 +22,22 @@ import openllm_core from bentoml._internal.models.model import ModelSignature from openllm_core._configuration import FineTuneConfig from openllm_core._configuration import LLMConfig -from openllm_core._configuration import _object_getattribute -from openllm_core._configuration import _setattr_class -from openllm_core._schema import unmarshal_vllm_outputs +from openllm_core._schema import EmbeddingsOutput from openllm_core._typing_compat import AdaptersMapping from openllm_core._typing_compat import AdaptersTuple from openllm_core._typing_compat import AdapterType -from openllm_core._typing_compat import AnyCallable from openllm_core._typing_compat import DictStrAny -from openllm_core._typing_compat import ListStr -from 
openllm_core._typing_compat import LiteralRuntime +from openllm_core._typing_compat import LiteralBackend from openllm_core._typing_compat import LiteralString -from openllm_core._typing_compat import LLMEmbeddings from openllm_core._typing_compat import LLMRunnable from openllm_core._typing_compat import LLMRunner from openllm_core._typing_compat import M -from openllm_core._typing_compat import ModelSignatureDict as _ModelSignatureDict -from openllm_core._typing_compat import NotRequired +from openllm_core._typing_compat import ModelSignatureDict from openllm_core._typing_compat import PeftAdapterOutput from openllm_core._typing_compat import T from openllm_core._typing_compat import TupleAny from openllm_core._typing_compat import overload from openllm_core.utils import DEBUG -from openllm_core.utils import ENV_VARS_TRUE_VALUES from openllm_core.utils import MYPY from openllm_core.utils import EnvVarMixin from openllm_core.utils import LazyLoader @@ -61,11 +50,11 @@ from openllm_core.utils import first_not_none from openllm_core.utils import generate_hash_from_file from openllm_core.utils import is_peft_available from openllm_core.utils import is_torch_available -from openllm_core.utils import non_intrusive_setattr from openllm_core.utils import normalize_attrs_to_model_tokenizer_pair from openllm_core.utils import resolve_filepath from openllm_core.utils import validate_is_path +from ._assign import make_llm_attributes from ._quantisation import infer_quantisation_config from .exceptions import ForbiddenAttributeError from .exceptions import GpuNotAvailableError @@ -73,17 +62,16 @@ from .exceptions import OpenLLMException from .utils import infer_auto_class if t.TYPE_CHECKING: + import auto_gptq as autogptq import peft import torch import transformers - import vllm from openllm_core._configuration import PeftType from openllm_core.utils.representation import ReprArgs else: autogptq = LazyLoader('autogptq', globals(), 'auto_gptq') - vllm = LazyLoader('vllm', 
globals(), 'vllm') transformers = LazyLoader('transformers', globals(), 'transformers') torch = LazyLoader('torch', globals(), 'torch') peft = LazyLoader('peft', globals(), 'peft') @@ -92,14 +80,10 @@ ResolvedAdaptersMapping = t.Dict[AdapterType, t.Dict[str, t.Tuple['peft.PeftConf logger = logging.getLogger(__name__) -class ModelSignatureDict(t.TypedDict, total=False): - batchable: bool - batch_dim: t.Union[t.Tuple[int, int], int] - input_spec: NotRequired[t.Union[t.Any, t.Tuple[t.Any]]] - output_spec: NotRequired[t.Any] - def normalise_model_name(name: str) -> str: - return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else re.sub('[^a-zA-Z0-9]+', '-', name) + if validate_is_path(name): return os.path.basename(resolve_filepath(name)) + name = name.replace('/', '--') + return inflection.dasherize(name) # the below is similar to peft.utils.other.CONFIG_NAME PEFT_CONFIG_NAME = 'adapter_config.json' @@ -137,36 +121,41 @@ def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapp _reserved_namespace = {'config_class', 'model', 'tokenizer', 'import_kwargs'} -class LLMInterface(abc.ABC, t.Generic[M, T]): - '''This defines the loose contract for all openllm.LLM implementations.''' +class LLMFunction(abc.ABC): - @property - def import_kwargs(self) -> tuple[DictStrAny, DictStrAny] | None: - """The default import kwargs to used when importing the model. + @abc.abstractmethod + def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any: + '''This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decide to do so.). - This will be passed into 'openllm.LLM.import_model'. - It returns two dictionaries: one for model kwargs and one for tokenizer kwargs. + You can customize how the output of the LLM looks with this hook. By default, it is a simple echo. 
- Returns: - Optional tuple of model kwargs and tokenizer kwargs - """ - - def embeddings(self, prompts: list[str]) -> LLMEmbeddings: - '''The implementation for generating text embeddings from given prompt. - - It takes the prompt and output the embeddings for this given LLM. - - Returns: - The embeddings for the given prompt. + > [!NOTE] + > This will be used from the client side. ''' raise NotImplementedError @abc.abstractmethod def generate(self, prompt: str, **preprocess_generate_kwds: t.Any) -> t.Any: - """The implementation for text generation from given prompt. + '''Text generation implementation for any given prompt. - It takes the prompt and 'generation_kwargs' from 'self.sanitize_parameters' and then pass it to 'self.model.generate'. - """ + It takes the prompt and 'generation_kwargs'. The main implementation will parse all of kwargs + correctly for you, so that subclass implementation don't have to repeat some of these boilercode. + ''' + raise NotImplementedError + + @abc.abstractmethod + def generate_iterator(self, prompt: str, /, **attrs: t.Any) -> t.Iterator[t.Any]: + '''The iterator implementation of generate. + + This will be used for Token streaming and SSE support. + + Args: + prompt: the input prompt + **attrs: Relevant attributes to be pass to the stream generation implementation. + + Returns: + An iterator of incoming token generation. 
It will returns a dictionary + ''' raise NotImplementedError def generate_one(self, prompt: str, stop: list[str], @@ -177,17 +166,20 @@ class LLMInterface(abc.ABC, t.Generic[M, T]): ''' raise NotImplementedError - def generate_iterator(self, prompt: str, /, **attrs: t.Any) -> t.Iterator[t.Any]: - '''The iterator version of `generate` function.''' - raise NotImplementedError( - 'Currently generate_iterator requires SSE (Server-side events) support, which is not yet implemented.') + def embeddings(self, prompts: list[str]) -> EmbeddingsOutput: + '''The implementation for generating text embeddings from given prompt. - def llm_post_init(self) -> None: - """This function can be implemented if you need to initialized any additional variables that doesn't concern OpenLLM internals.""" - pass + It takes the prompt and output the embeddings for this given LLM. + + Returns: + The embeddings for the given prompt. + ''' + raise NotImplementedError + +class LLMSerialisation(abc.ABC, t.Generic[M, T]): def import_model(self, *args: t.Any, trust_remote_code: bool, **attrs: t.Any) -> bentoml.Model: - """This function can be implemented if default import_model doesn't satisfy your needs. + '''Import both model and tokenizer weights into as a BentoML models. Note that tokenizer attrs can be accessed via ``llm.llm_parameters``. @@ -196,7 +188,7 @@ class LLMInterface(abc.ABC, t.Generic[M, T]): ``` By default, `model_decls` and `model_attrs` is already sanitised and concatenated into `args` and `attrs` - """ + ''' raise NotImplementedError def load_model(self, *args: t.Any, **attrs: t.Any) -> M: @@ -213,40 +205,47 @@ class LLMInterface(abc.ABC, t.Generic[M, T]): ''' raise NotImplementedError - def save_pretrained(self, save_directory: str | pathlib.Path, **attrs: t.Any) -> None: - '''This function defines how this model can be saved to local store. +class LLMInterface(LLMFunction, LLMSerialisation[M, T], abc.ABC): - This will be called during ``import_model``. 
By default, it will use ``openllm.serialisation.save_pretrained``. - Additionally, the function signature are similar to ``transformers.PreTrainedModel.save_pretrained`` - This is useful during fine tuning. + def llm_post_init(self) -> None: + '''This function can be implemented if you need to initialized any additional variables that doesn't concern OpenLLM internals. + By default, this will add `self.device` if the implementation is PyTorch. + ''' + pass + + def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]: + '''This handler will sanitize all attrs and setup prompt text. + + It takes a prompt that is given by the user, attrs that can be parsed with the prompt. + + Returns a tuple of three items: + - The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig + - The attributes dictionary that will be passed into `self.postprocess_generate`. ''' raise NotImplementedError + @property + def import_kwargs(self) -> tuple[DictStrAny, DictStrAny] | None: + '''The default import kwargs to used when importing the model. + + This will be passed into 'openllm.LLM.import_model'. + It returns two dictionaries: one for model kwargs and one for tokenizer kwargs. + + Returns: + Optional tuple of model kwargs and tokenizer kwargs + ''' + # NOTE: All fields below are attributes that can be accessed by users. config_class: t.Type[LLMConfig] '''The config class to use for this LLM. If you are creating a custom LLM, you must specify this class.''' - bettertransformer: bool - '''Whether to load this LLM with FasterTransformer enabled. The order of loading is: - - - If pass within `for_model`, `from_pretrained` or `__init__`. - - If `self.bettertransformer` is set within `llm_post_init`. - - Finally, if none of the above, default to self.config['bettertransformer'] - - > [!NOTE] that if LoRA is enabled, bettertransformer will be disabled. - ''' device: 'torch.device' '''The device to be used for this LLM. 
If the implementation is 'pt', then it will be torch.device, else string.''' tokenizer_id: t.Union[t.Literal['local'], LiteralString] '''optional tokenizer_id for loading with vLLM if the model supports vLLM.''' - # NOTE: The following will be populated by __init_subclass__, note that these should be immutable. - __llm_trust_remote_code__: bool - '''This is used to determine during 'import_model' whether to trust remote code or not. - This works synonymous with `trust_remote_code` kwarg in transformers Auto classes. If not passed, - then by default fallback to config_class['trust_remote_code'] - ''' - __llm_implementation__: LiteralRuntime - '''This is used to determine which implementation that this LLM has. + # NOTE: The following will be populated by __init_subclass__, note that these should be immutable. + __llm_backend__: LiteralBackend + '''This is used to determine which framework implementation for this given LLM. Usually, this will inferred from class name, that follows the HuggingFace's naming convention: @@ -254,16 +253,17 @@ class LLMInterface(abc.ABC, t.Generic[M, T]): - `TFOPTForConditionalGeneration` -> `tf` - `FlaxOPTForConditionalGeneration` -> `flax` - An additional naming for all VLLM backend: VLLMLlama -> `vllm` + For all VLLM backend: VLLMLlama -> `vllm` + For all GGML backend: GGMLLlama -> `ggml` + For all MLC backend: MLCLlama -> `mlc` ''' __llm_model__: t.Optional[M] '''A reference to the actual model. Instead of access this directly, you should use `model` property instead.''' __llm_tokenizer__: t.Optional[T] '''A reference to the actual tokenizer. Instead of access this directly, you should use `tokenizer` property instead.''' - __llm_bentomodel__: t.Optional[bentoml.Model] - '''A reference to the bentomodel used for this LLM. 
Instead of access this directly, you should use `_bentomodel` property instead.''' __llm_adapter_map__: t.Optional[ResolvedAdaptersMapping] '''A reference to the the cached LoRA adapter mapping.''' + __llm_supports_embeddings__: bool '''A boolean to determine whether models does implement ``LLM.embeddings``.''' __llm_supports_generate__: bool @@ -272,243 +272,30 @@ class LLMInterface(abc.ABC, t.Generic[M, T]): '''A boolean to determine whether models does implement ``LLM.generate_one``.''' __llm_supports_generate_iterator__: bool '''A boolean to determine whether models does implement ``LLM.generate_iterator``.''' - if t.TYPE_CHECKING and not MYPY: - - def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, - autogptq.BaseQuantizeConfig]], - model_id: str, runtime: t.Literal['ggml', 'transformers'], model_decls: TupleAny, - model_attrs: DictStrAny, tokenizer_attrs: DictStrAny, tag: bentoml.Tag, - adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str], - quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']], - serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any) -> None: - '''Generated __attrs_init__ for openllm.LLM.''' - -_R = t.TypeVar('_R', covariant=True) - -class _import_model_wrapper(t.Generic[_R, M, T], t.Protocol): - - def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R: - ... - -class _load_model_wrapper(t.Generic[M, T], t.Protocol): - - def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: - ... - -class _load_tokenizer_wrapper(t.Generic[M, T], t.Protocol): - - def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T: - ... - -class _llm_post_init_wrapper(t.Generic[M, T], t.Protocol): - - def __call__(self, llm: LLM[M, T]) -> T: - ... 
- -class _save_pretrained_wrapper(t.Generic[M, T], t.Protocol): - - def __call__(self, llm: LLM[M, T], save_directory: str | pathlib.Path, **attrs: t.Any) -> None: - ... - -_object_setattr = object.__setattr__ - -# NOTE: the following wrapper are a light meta ops for wrapping default params to internal methods implementation. -def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]: - - @functools.wraps(f) - def wrapper(self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any) -> bentoml.Model: - trust_remote_code = first_not_none(trust_remote_code, default=self.__llm_trust_remote_code__) - (model_decls, model_attrs), _ = self.llm_parameters - decls = (*model_decls, *decls) - attrs = {**model_attrs, **attrs} - return f(self, *decls, trust_remote_code=trust_remote_code, **attrs) - - return wrapper _DEFAULT_TOKENIZER = 'hf-internal-testing/llama-tokenizer' -def get_engine_args(llm: LLM[M, T], tokenizer: str = _DEFAULT_TOKENIZER) -> vllm.EngineArgs: - return vllm.EngineArgs(model=llm._bentomodel.path, - tokenizer=tokenizer, - tokenizer_mode='auto', - tensor_parallel_size=1 if device_count() < 2 else device_count(), - dtype='auto', - worker_use_ray=False) - -def _wrapped_load_model(f: _load_model_wrapper[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]: - - @functools.wraps(f) - def wrapper(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine: - if self.__llm_implementation__ == 'vllm': - # TODO: Do some more processing with token_id once we support token streaming - try: - return vllm.LLMEngine.from_engine_args( - get_engine_args(self, - tokenizer=self._bentomodel.path if self.tokenizer_id == 'local' else self.tokenizer_id)) - except Exception as err: - traceback.print_exc() - raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from None - else: - (model_decls, model_attrs), _ = self.llm_parameters - return f(self, 
*(*model_decls, *decls), **{**model_attrs, **attrs}) - - return wrapper - -def _wrapped_load_tokenizer(f: _load_tokenizer_wrapper[M, T]) -> t.Callable[[LLM[M, T]], T]: - - @functools.wraps(f) - def wrapper(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T: - return f(self, **{**self.llm_parameters[-1], **tokenizer_attrs}) - - return wrapper - -def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M, T]], None]: - - @functools.wraps(f) - def wrapper(self: LLM[M, T]) -> None: - if self.__llm_implementation__ == 'pt' and is_torch_available(): - self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - f(self) - - return wrapper - -def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]) -> t.Callable[[LLM[M, T], str | pathlib.Path], None]: - - @functools.wraps(f) - def wrapper(self: LLM[M, T], save_directory: str | pathlib.Path, **attrs: t.Any) -> None: - if isinstance(save_directory, pathlib.Path): save_directory = str(save_directory) - if self.__llm_model__ is None: raise RuntimeError("Cannot 'save_pretrained' with unload model instance.") - if self.bettertransformer and self.__llm_implementation__ == 'pt': - _object_setattr(self, '__llm_model__', - t.cast('transformers.PreTrainedModel', self.__llm_model__).reverse_bettertransformer()) - f(self, save_directory, **attrs) - - return wrapper - -def _update_docstring(cls: LLM[M, T], fn: str) -> AnyCallable: - # update docstring for given entrypoint - original_fn = getattr(cls, fn, getattr(LLMInterface, fn)) - original_fn.__doc__ = original_fn.__doc__ or f'''\ - {cls.__name__}'s implementation for {fn}. - - Note that if LoRA is enabled (via either SDK or CLI), `self.model` will become a `peft.PeftModel` - The original model can then be accessed with 'self.model.get_base_model()'. 
- ''' - setattr(cls, fn, original_fn) - return original_fn - -def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]: - attributes = { - 'import_model': _wrapped_import_model, - 'load_model': _wrapped_load_model, - 'load_tokenizer': _wrapped_load_tokenizer, - 'llm_post_init': _wrapped_llm_post_init, - 'save_pretrained': _wrapped_save_pretrained - } - args: ListStr = [] - anns: DictStrAny = {} - lines: ListStr = [] - globs: DictStrAny = { - 'cls': cls, - '_cached_LLMInterface_get': _object_getattribute.__get__(LLMInterface), - '__gen_docstring': _update_docstring - } - # function initialisation - for func, impl in attributes.items(): - impl_name = f'__wrapped_{func}' - globs.update({f'__serialisation_{func}': getattr(openllm.serialisation, func, None), impl_name: impl}) - cached_func_name = f'_cached_{cls.__name__}_func' - if func == 'llm_post_init': func_call = f'_impl_{cls.__name__}_{func}={cached_func_name}' - else: - func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMInterface_get('{func}') else __serialisation_{func}" - lines.extend([ - f'{cached_func_name}=cls.{func}', func_call, - _setattr_class(func, f'{impl_name}(_impl_{cls.__name__}_{func})'), - ]) - - # assign vllm specific implementation - if cls.__llm_implementation__ == 'vllm': - globs.update({ - '_vllm_generate': vllm_generate, - '_vllm_postprocess_generate': vllm_postprocess_generate, - '_vllm_generate_iterator': vllm_generate_iterator - }) - lines.extend( - [_setattr_class(it, f'_vllm_{it}') for it in {'generate', 'postprocess_generate', 'generate_iterator'}]) - - # cached attribute initialisation - interface_anns = codegen.get_annotations(LLMInterface) - for v in {'bentomodel', 'model', 'tokenizer', 'adapter_map'}: - lines.append(_setattr_class(f'__llm_{v}__', None)) - anns[f'__llm_{v}__'] = interface_anns.get(f'__llm_{v}__') - - # boolean to determine whether LLM has defined an implementation for a function - for 
fn in {'generate', 'generate_one', 'generate_iterator', 'embeddings'}: - key = f'__llm_supports_{fn}__' - lines.extend([ - _setattr_class(key, f"cls.{fn} is not _cached_LLMInterface_get('{fn}')"), f"__gen_docstring(cls, '{fn}')", - ]) - anns[key] = interface_anns.get(key) - return codegen.generate_function(cls, '__assign_llm_attr', lines, args=('cls', *args), globs=globs, annotations=anns) - -def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]], - **_: t.Any) -> str: - return generation_result[0]['outputs'][0]['text'] - -def vllm_generate_iterator(self: LLM['vllm.LLMEngine', T], - prompt: str, - /, - *, - echo: bool = False, - stop: str | t.Iterable[str] | None = None, - stop_token_ids: list[int] | None = None, - **attrs: t.Any) -> t.Iterator[dict[str, t.Any]]: - request_id: str = attrs.pop('request_id', None) - if request_id is None: raise ValueError('request_id must not be None.') - if stop_token_ids is None: stop_token_ids = [] - stop_token_ids.append(self.tokenizer.eos_token_id) - stop_ = set() - if isinstance(stop, str) and stop != '': stop_.add(stop) - elif isinstance(stop, list) and stop != []: stop_.update(stop) - for tid in stop_token_ids: - if tid: stop_.add(self.tokenizer.decode(tid)) - - if self.config['temperature'] <= 1e-5: top_p = 1.0 - else: top_p = self.config['top_p'] - config = self.config.model_construct_env(stop=list(stop_), top_p=top_p, **attrs) - self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=config.to_sampling_config()) - while self.model.has_unfinished_requests(): - for request_output in self.model.step(): - prompt = request_output.prompt - if echo: text_outputs = [prompt + output.text for output in request_output.outputs] - else: text_outputs = [output.text for output in request_output.outputs] - yield {'text': text_outputs, 'error_code': 0} - if request_output.finished: break - -def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, 
**attrs: t.Any) -> list[dict[str, t.Any]]: - request_id: str = attrs.pop('request_id', None) - if request_id is None: raise ValueError('request_id must not be None.') - outputs: list[vllm.RequestOutput] = [] - # TODO: support prompt_token_ids - self.model.add_request(request_id=request_id, - prompt=prompt, - sampling_params=self.config.model_construct_env(**attrs).to_sampling_config()) - while self.model.has_unfinished_requests(): - outputs.extend([r for r in self.model.step() if r.finished]) - return [unmarshal_vllm_outputs(i) for i in outputs] - _AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class('AdaptersTuple', ['adapter_id', 'name', 'config']) @attr.define(slots=True, repr=False, init=False) class LLM(LLMInterface[M, T], ReprMixin): if t.TYPE_CHECKING: __name__: str + if t.TYPE_CHECKING and not MYPY: + + def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, + autogptq.BaseQuantizeConfig]], + model_id: str, model_decls: TupleAny, model_attrs: DictStrAny, tokenizer_attrs: DictStrAny, + tag: bentoml.Tag, adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str], + quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']], + serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any) -> None: + '''Generated __attrs_init__ for openllm.LLM.''' + config: LLMConfig - '''The config instance to use for this LLM. This will be created based on config_class and available - when initialising the LLM.''' + '''The config instance to use for this LLM. 
This will be created based on config_class and available when initialising the LLM.''' quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None '''Quantisation config for quantised model on the fly.''' + _model_id: str - _runtime: t.Literal['ggml', 'transformers'] _model_decls: TupleAny _model_attrs: DictStrAny _tokenizer_attrs: DictStrAny @@ -519,31 +306,28 @@ class LLM(LLMInterface[M, T], ReprMixin): _serialisation_format: t.Literal['safetensors', 'legacy'] _local: bool - @staticmethod - def _infer_implementation_from_name(name: str) -> tuple[LiteralRuntime, str]: - if name.startswith('Flax'): return 'flax', name[4:] - elif name.startswith('TF'): return 'tf', name[2:] - elif name.startswith('VLLM'): return 'vllm', name[4:] - else: return 'pt', name - def __init_subclass__(cls: type[LLM[M, T]]) -> None: cd = cls.__dict__ - implementation, config_class_name = cls._infer_implementation_from_name(cls.__name__) - cls.__llm_implementation__ = implementation - config_class = openllm.AutoConfig.infer_class_from_name(config_class_name) - if '__openllm_internal__' in cd: - if 'config_class' not in cd: cls.config_class = config_class - elif 'config_class' not in cd: + if cls.__name__.startswith('Flax'): + cls.__llm_backend__, config_class = 'flax', openllm.AutoConfig.infer_class_from_name(cls.__name__[4:]) + elif cls.__name__.startswith('TF'): + cls.__llm_backend__, config_class = 'tf', openllm.AutoConfig.infer_class_from_name(cls.__name__[2:]) + elif cls.__name__.startswith('VLLM'): + cls.__llm_backend__, config_class = 'vllm', openllm.AutoConfig.infer_class_from_name(cls.__name__[4:]) + else: + cls.__llm_backend__, config_class = 'pt', openllm.AutoConfig.infer_class_from_name(cls.__name__) + if '__openllm_internal__' not in cd and 'config_class' not in cd: raise RuntimeError("Missing required key 'config_class'. 
Make sure to define it within the LLM subclass.") - _make_assignment_script(cls)(cls) - if 'tokenizer_id' not in cd and cls.__llm_implementation__ == 'vllm': cls.tokenizer_id = _DEFAULT_TOKENIZER + if '__openllm_internal__' in cd and 'config_class' not in cd: cls.config_class = config_class + if 'tokenizer_id' not in cd and cls.__llm_backend__ == 'vllm': cls.tokenizer_id = _DEFAULT_TOKENIZER + make_llm_attributes(cls)(cls) @overload def __getitem__(self, item: t.Literal['trust_remote_code']) -> bool: ... @overload - def __getitem__(self, item: t.Literal['implementation']) -> LiteralRuntime: + def __getitem__(self, item: t.Literal['backend']) -> LiteralBackend: ... @overload @@ -554,10 +338,6 @@ class LLM(LLMInterface[M, T], ReprMixin): def __getitem__(self, item: t.Literal['tokenizer']) -> T | None: ... - @overload - def __getitem__(self, item: t.Literal['bentomodel']) -> bentoml.Model | None: - ... - @overload def __getitem__(self, item: t.Literal['adapter_map']) -> ResolvedAdaptersMapping | None: ... @@ -586,58 +366,20 @@ class LLM(LLMInterface[M, T], ReprMixin): elif hasattr(self, item): return getattr(self, item) else: raise KeyError(item) - @overload - @classmethod - def from_pretrained(cls, - model_id: str | None = ..., - model_version: str | None = ..., - llm_config: LLMConfig | None = ..., - *args: t.Any, - runtime: t.Literal['ggml', 'transformers'] | None = ..., - quantize: t.Literal['int8', 'int4'] = ..., - bettertransformer: str | bool | None = ..., - adapter_id: str | None = ..., - adapter_name: str | None = ..., - adapter_map: dict[str, str | None] | None = ..., - quantization_config: transformers.BitsAndBytesConfig | None = ..., - serialisation: t.Literal['safetensors', 'legacy'] = ..., - **attrs: t.Any) -> LLM[M, T]: - ... 
- - @overload - @classmethod - def from_pretrained(cls, - model_id: str | None = ..., - model_version: str | None = ..., - llm_config: LLMConfig | None = ..., - *args: t.Any, - runtime: t.Literal['ggml', 'transformers'] | None = ..., - quantize: t.Literal['gptq'] = ..., - bettertransformer: str | bool | None = ..., - adapter_id: str | None = ..., - adapter_name: str | None = ..., - adapter_map: dict[str, str | None] | None = ..., - quantization_config: autogptq.BaseQuantizeConfig | None = ..., - serialisation: t.Literal['safetensors', 'legacy'] = ..., - **attrs: t.Any) -> LLM[M, T]: - ... - @classmethod def from_pretrained(cls, model_id: str | None = None, model_version: str | None = None, llm_config: LLMConfig | None = None, *args: t.Any, - runtime: t.Literal['ggml', 'transformers'] | None = None, quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, - bettertransformer: str | bool | None = None, adapter_id: str | None = None, adapter_name: str | None = None, adapter_map: dict[str, str | None] | None = None, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors', **attrs: t.Any) -> LLM[M, T]: - """Instantiate a pretrained LLM. + '''Instantiate a pretrained LLM. ``LLM.from_pretrained`` follows the same design principle as HuggingFace's `from_pretrained` method, plus the following: @@ -646,7 +388,6 @@ class LLM(LLMInterface[M, T], ReprMixin): > This is most notable during serving time. - quantize: quantize the model with the given quantization method. Currently supported int8, int4 quantization - - bettertransformer: Apply FasterTransformer to given pretrained weight > Currently, the above two options are mutually exclusive. @@ -682,17 +423,15 @@ class LLM(LLMInterface[M, T], ReprMixin): will use `config_class` to construct default configuration. quantize: The quantization to use for this LLM. Defaults to None. 
Possible values include int8, int4 and gptq. - runtime: Optional runtime to run this LLM. Default to 'transformers'. 'ggml' supports is working in progress. quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `autogtpq.BaseQuantizeConfig`) to use. Note that this is mutually exclusive with `quantize` serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors. Default behaviour is similar to ``safe_serialization=False``. - bettertransformer: Whether to use BetterTransformer with this model. Defaults to False. adapter_id: The [LoRA](https://arxiv.org/pdf/2106.09685.pdf) pretrained id or local path to use for this LLM. Defaults to None. adapter_name: The adapter name to use for this LLM. Defaults to None. adapter_map: The adapter map to use for this LLM. Defaults to None. Note that this is mutually exclusive with adapter_id/adapter_name arguments. *args: The args to be passed to the model. **attrs: The kwargs to be passed to the model. 
- """ + ''' cfg_cls = cls.config_class _local = False _model_id: str = first_not_none(model_id, @@ -712,7 +451,7 @@ class LLM(LLMInterface[M, T], ReprMixin): if quantization_config is None and quantize is not None: quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs) if quantize == 'gptq': serialisation = 'safetensors' - elif cls.__llm_implementation__ == 'vllm': serialisation = 'legacy' # Currently working-in-progress + elif cls.__llm_backend__ == 'vllm': serialisation = 'legacy' # Currently working-in-progress # NOTE: LoRA adapter setup if adapter_map and adapter_id: @@ -749,14 +488,6 @@ class LLM(LLMInterface[M, T], ReprMixin): _tag=_tag, _serialisation_format=serialisation, _local=_local, - bettertransformer=str( - first_not_none(bettertransformer, - os.environ.get(cfg_cls.__openllm_env__['bettertransformer']), - default=None)).upper() in ENV_VARS_TRUE_VALUES, - _runtime=first_not_none(runtime, - t.cast(t.Optional[t.Literal['ggml', 'transformers']], - os.environ.get(cfg_cls.__openllm_env__['runtime'])), - default=cfg_cls.__openllm_runtime__), _adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None, **attrs) @@ -765,9 +496,9 @@ class LLM(LLMInterface[M, T], ReprMixin): def _generate_tag_str(cls, model_id: str, model_version: str | None) -> str: '''Generate a compliant ``bentoml.Tag`` from model_id. - If model_id is a pretrained_id from HF, then it will have the following format: -: + If model_id is a pretrained_id from HF, then it will have the following format: -: If model_id contains the revision itself, then the same format above - If model_id is a path, then it will be -: if model_version is not passesd, otherwise -: + If model_id is a path, then it will be -: if model_version is not passesd, otherwise -: > [!NOTE] here that the generated SHA1 for path cases is that it will be based on last modified time. 
@@ -788,12 +519,11 @@ class LLM(LLMInterface[M, T], ReprMixin): if model_version is not None: logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.", maybe_revision[0], model_version) - return f'{cls.__llm_implementation__}-{model_name}:{maybe_revision[0]}' + return f'{cls.__llm_backend__}-{model_name}:{maybe_revision[0]}' - tag_name = f'{cls.__llm_implementation__}-{model_name}' - if os.environ.get('OPENLLM_USE_LOCAL_LATEST', str(False)).upper() in ENV_VARS_TRUE_VALUES: - return bentoml_cattr.unstructure( - bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag) + tag_name = f'{cls.__llm_backend__}-{model_name}' + if openllm_core.utils.check_bool_env('OPENLLM_USE_LOCAL_LATEST', False): + return str(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag) if validate_is_path(model_id): model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id)) @@ -811,14 +541,13 @@ class LLM(LLMInterface[M, T], ReprMixin): def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag: return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs)) - def __init__(self, *args: t.Any, model_id: str, llm_config: LLMConfig, bettertransformer: bool | None, + def __init__(self, *args: t.Any, model_id: str, llm_config: LLMConfig, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag, - _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, _runtime: t.Literal['ggml', 'transformers'], - _model_version: str, _serialisation_format: t.Literal['safetensors', - 'legacy'], _local: bool, **attrs: t.Any, + _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, _model_version: str, + _serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any, ): - 
"""Initialize the LLM with given pretrained model. + '''Initialize the LLM with given pretrained model. > [!WARNING] > To initializing any LLM, you should use `openllm.AutoLLM` or `openllm.LLM.from_pretrained` instead. @@ -896,15 +625,14 @@ class LLM(LLMInterface[M, T], ReprMixin): model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used. llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM will use `config_class` to construct default configuration. - bettertransformer: Whether to use BetterTransformer with this model. Defaults to False. quantization_config: ``transformers.BitsAndBytesConfig`` configuration, or 'gptq' denoting this model to be loaded with GPTQ. *args: The args to be passed to the model. **attrs: The kwargs to be passed to the model. - """ + ''' # low_cpu_mem_usage is only available for model # this is helpful on system with low memory to avoid OOM low_cpu_mem_usage = attrs.pop('low_cpu_mem_usage', True) - if self.__llm_implementation__ == 'pt': + if self.__llm_backend__ == 'pt': attrs.update({'low_cpu_mem_usage': low_cpu_mem_usage, 'quantization_config': quantization_config}) model_kwds: DictStrAny = {} tokenizer_kwds: DictStrAny = {} @@ -915,25 +643,15 @@ class LLM(LLMInterface[M, T], ReprMixin): # parsing tokenizer and model kwargs, as the hierachy is param pass > default normalized_model_kwds, normalized_tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(**attrs) # NOTE: Save the args and kwargs for latter load - self.__attrs_init__(llm_config, quantization_config, model_id, _runtime, args, { + self.__attrs_init__(llm_config, quantization_config, model_id, args, { **model_kwds, **normalized_model_kwds }, { **tokenizer_kwds, **normalized_tokenizer_kwds }, _tag, _adapters_mapping, _model_version, _quantize_method, _serialisation_format, _local) - # handle trust_remote_code - _from_env = os.getenv('TRUST_REMOTE_CODE', None) - self.__llm_trust_remote_code__ = 
first_not_none( - str(_from_env).upper() in ENV_VARS_TRUE_VALUES if _from_env else None, - default=self._model_attrs.pop('trust_remote_code', self.config['trust_remote_code'])) self.llm_post_init() - # we set it here so that we allow subclass to overwrite bettertransformer in llm_post_init - if bettertransformer is True: self.bettertransformer = bettertransformer - else: non_intrusive_setattr(self, 'bettertransformer', self.config['bettertransformer']) - # If lora is passed, the disable bettertransformer - if _adapters_mapping and self.bettertransformer is True: self.bettertransformer = False def __setattr__(self, attr: str, value: t.Any) -> None: if attr in _reserved_namespace: @@ -942,6 +660,11 @@ class LLM(LLMInterface[M, T], ReprMixin): ) super().__setattr__(attr, value) + @property + def trust_remote_code(self) -> bool: + return first_not_none(openllm_core.utils.check_bool_env('TRUST_REMOTE_CODE'), + default=self.config['trust_remote_code']) + @property def adapters_mapping(self) -> AdaptersMapping | None: return self._adapters_mapping @@ -952,21 +675,18 @@ class LLM(LLMInterface[M, T], ReprMixin): @property def __repr_keys__(self) -> set[str]: - return {'model_id', 'runner_name', 'config', 'adapters_mapping', 'runtime', 'tag'} + return {'model_id', 'runner_name', 'config', 'adapters_mapping', 'tag'} def __repr_args__(self) -> ReprArgs: for k in self.__repr_keys__: if k == 'config': yield k, self.config.model_dump(flatten=True) else: yield k, getattr(self, k) + yield 'backend', self.__llm_backend__ @property def model_id(self) -> str: return self._model_id - @property - def runtime(self) -> t.Literal['ggml', 'transformers']: - return self._runtime - @property def runner_name(self) -> str: return f"llm-{self.config['start_name']}-runner" @@ -995,15 +715,13 @@ class LLM(LLMInterface[M, T], ReprMixin): return openllm.import_model(self.config['start_name'], model_id=self.model_id, model_version=self._model_version, - runtime=self.runtime, - 
implementation=self.__llm_implementation__, + backend=self.__llm_backend__, quantize=self._quantize_method, serialisation_format=self._serialisation_format) @property def _bentomodel(self) -> bentoml.Model: - if self.__llm_bentomodel__ is None: self.__llm_bentomodel__ = openllm.serialisation.get(self) - return self.__llm_bentomodel__ + return openllm.serialisation.get(self, auto_import=True) def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]: '''This handler will sanitize all attrs and setup prompt text. @@ -1024,7 +742,7 @@ class LLM(LLMInterface[M, T], ReprMixin): > [!NOTE] > This will be used from the client side. ''' - if isinstance(generation_result, dict): return generation_result['text'] + if isinstance(generation_result, dict) and 'text' in generation_result: return generation_result['text'] return self.config.postprocess_generate(prompt, generation_result, **attrs) @property @@ -1036,7 +754,7 @@ class LLM(LLMInterface[M, T], ReprMixin): if self.__llm_model__ is None: model = self.load_model(*self._model_decls, **self._model_attrs) # If OOM, then it is probably you don't have enough VRAM to run this model. 
- if self.__llm_implementation__ == 'pt' and is_torch_available(): + if self.__llm_backend__ == 'pt' and is_torch_available(): loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr( model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False) if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit: @@ -1055,12 +773,6 @@ class LLM(LLMInterface[M, T], ReprMixin): if self.__llm_tokenizer__ is None: self.__llm_tokenizer__ = self.load_tokenizer(**self._tokenizer_attrs) return self.__llm_tokenizer__ - def _default_ft_config(self, _adapter_type: AdapterType, inference_mode: bool) -> FineTuneConfig: - strategy = first_not_none(self.config['fine_tune_strategies'].get(_adapter_type), - default=FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type), - llm_config_class=self.config_class)) - return strategy.eval() if inference_mode else strategy.train() - def _transpose_adapter_mapping(self, inference_mode: bool = True, use_cache: bool = True) -> ResolvedAdaptersMapping: if self._adapters_mapping is None: raise ValueError('LoRA mapping is not set up correctly.') # early out if we already serialized everything. @@ -1072,7 +784,10 @@ class LLM(LLMInterface[M, T], ReprMixin): # then we will raise Error when the optional_name is set to None in next iteration. 
_converted_first_none = False for _adapter_type, _adapters_tuples in self._adapters_mapping.items(): - default_config = self._default_ft_config(_adapter_type, inference_mode) + strategy = first_not_none(self.config['fine_tune_strategies'].get(_adapter_type), + default=FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type), + llm_config_class=self.config_class)) + default_config = strategy.eval() if inference_mode else strategy.train() for adapter in _adapters_tuples: if not adapter.name and _converted_first_none: raise ValueError( @@ -1173,9 +888,8 @@ class LLM(LLMInterface[M, T], ReprMixin): # BUG: This hits during inference, need fixing model = peft_class.from_pretrained(self.__llm_model__, peft_model_id, **kwargs) else: - model = peft_class( - self.__llm_model__, - default_config) # in this case, the given base_model_name_or_path is None. This will be hit during training + # in this case, the given base_model_name_or_path is None. This will be hit during training + model = peft_class(self.__llm_model__, default_config) return model # order of these fields matter here, make sure to sync it with @@ -1186,7 +900,7 @@ class LLM(LLMInterface[M, T], ReprMixin): max_batch_size: int | None = None, max_latency_ms: int | None = None, scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy) -> LLMRunner[M, T]: - """Convert this LLM into a Runner. + '''Convert this LLM into a Runner. Args: models: Any additional ``bentoml.Model`` to be included in this given models. @@ -1205,7 +919,7 @@ class LLM(LLMInterface[M, T], ReprMixin): > - 'name': will be generated by OpenLLM, hence users don't shouldn't worry about this. The generated name will be 'llm--runner' (ex: llm-dolly-v2-runner, llm-chatglm-runner) > - 'embedded': Will be disabled by default. There is no reason to run LLM in embedded mode. > - 'method_configs': The method configs for the runner will be managed internally by OpenLLM. 
- """ + ''' models = models if models is not None else [] try: @@ -1213,10 +927,9 @@ class LLM(LLMInterface[M, T], ReprMixin): except bentoml.exceptions.NotFound as err: raise RuntimeError(f'Failed to locate {self._bentomodel}:{err}') from None - generate_sig = ModelSignature.from_dict(t.cast('_ModelSignatureDict', ModelSignatureDict(batchable=False))) - embeddings_sig = ModelSignature.from_dict( - t.cast('_ModelSignatureDict', ModelSignatureDict(batchable=True, batch_dim=0))) - generate_iterator_sig = ModelSignature.from_dict(t.cast('_ModelSignatureDict', ModelSignatureDict(batchable=False))) + generate_sig = ModelSignature.from_dict(ModelSignatureDict(batchable=False)) + embeddings_sig = ModelSignature.from_dict(ModelSignatureDict(batchable=True, batch_dim=0)) + generate_iterator_sig = ModelSignature.from_dict(ModelSignatureDict(batchable=False)) # NOTE: returning the two langchain API's to the runner return llm_runner_class(self)(llm_runnable_class(self, embeddings_sig, generate_sig, generate_iterator_sig), @@ -1232,8 +945,7 @@ class LLM(LLMInterface[M, T], ReprMixin): 'generate_one': generate_sig, 'generate_iterator': generate_iterator_sig }), - scheduling_strategy=scheduling_strategy, - ) + scheduling_strategy=scheduling_strategy) # NOTE: Scikit API def predict(self, prompt: str, **attrs: t.Any) -> t.Any: @@ -1406,7 +1118,6 @@ def Runner(model_name: str, max_batch_size: int | None = ..., max_latency_ms: int | None = ..., method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = ..., - embedded: t.Literal[True, False] = ..., scheduling_strategy: type[bentoml.Strategy] | None = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ... 
@@ -1414,9 +1125,9 @@ def Runner(model_name: str, @overload def Runner(model_name: str, *, - ensure_available: bool | None = None, + ensure_available: bool = ..., init_local: bool = ..., - implementation: LiteralRuntime | None = None, + backend: LiteralBackend | None = None, llm_config: LLMConfig | None = None, **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ... @@ -1427,9 +1138,7 @@ def Runner(model_name: str, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., - runtime: t.Literal['ggml', 'transformers'] | None = ..., quantize: t.Literal['int8', 'int4', 'gptq'] | None = ..., - bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., @@ -1439,12 +1148,12 @@ def Runner(model_name: str, ... def Runner(model_name: str, - ensure_available: bool | None = None, + ensure_available: bool = False, init_local: bool = False, - implementation: LiteralRuntime | None = None, + backend: LiteralBackend | None = None, llm_config: LLMConfig | None = None, **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: - """Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'. + '''Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'. The behaviour of ensure_available that is synonymous to `AutoLLM.for_model` depends on `init_local`. By default, `ensure_available` is synonymous to `init_local`, meaning on the service when creating @@ -1466,38 +1175,33 @@ def Runner(model_name: str, Args: model_name: Supported model name from 'openllm models' ensure_available: If True, it will download the model if it is not available. If False, it will skip downloading the model. - If False, make sure the model is available locally. - implementation: The given Runner implementation one choose for this Runner. By default, it is retrieved from the enviroment variable - of the respected model_name. 
For example: 'flan-t5' -> "OPENLLM_FLAN_T5_FRAMEWORK" + If False, make sure the model is available locally. + backend: The given Runner implementation one choose for this Runner. If `OPENLLM_BACKEND` is set, it will respect it. llm_config: Optional ``openllm.LLMConfig`` to initialise this ``openllm.LLMRunner``. - init_local: If True, it will initialize the model locally. This is useful if you want to - run the model locally. (Symmetrical to bentoml.Runner.init_local()) - **attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs - behaviour - """ + init_local: If True, it will initialize the model locally. This is useful if you want to run the model locally. (Symmetrical to bentoml.Runner.init_local()) + **attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs behaviour + ''' if llm_config is not None: attrs.update({ 'model_id': llm_config['env']['model_id_value'], - 'bettertransformer': - llm_config['env']['bettertransformer_value'], 'quantize': llm_config['env']['quantize_value'], - 'runtime': - llm_config['env']['runtime_value'], 'serialisation': first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors') }) - default_implementation = llm_config.default_implementation() if llm_config is not None else 'pt' - implementation = t.cast( - LiteralRuntime, - first_not_none(implementation, default=EnvVarMixin(model_name, default_implementation)['framework_value'])) - runner = infer_auto_class(implementation).create_runner( - model_name, - llm_config=llm_config, - ensure_available=ensure_available if ensure_available is not None else init_local, - **attrs) + backend = t.cast( + LiteralBackend, + first_not_none(backend, + default=EnvVarMixin( + model_name, + backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value'])) + if init_local: ensure_available = True + runner = 
infer_auto_class(backend).create_runner(model_name, + llm_config=llm_config, + ensure_available=ensure_available, + **attrs) if init_local: runner.init_local(quiet=True) return runner @@ -1514,12 +1218,11 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate class _Runnable(bentoml.Runnable): SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu') SUPPORTS_CPU_MULTI_THREADING = True - framework = self.__llm_implementation__ + backend = self.__llm_backend__ def __init__(__self: _Runnable): - # NOTE: The side effect of this line - # is that it will load the imported model during - # runner startup. So don't remove it!! + # NOTE: The side effect of this line is that it will load the + # imported model during runner startup. So don't remove it!! if not self.model: raise RuntimeError('Failed to load the model correctly (See traceback above)') if self.adapters_mapping is not None: logger.info('Applying LoRA to %s...', self.runner_name) @@ -1531,37 +1234,37 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate if adapter_name != 'default': self.model.set_adapter(adapter_name) logger.info('Successfully apply LoRA layer %s', adapter_name) - @bentoml.Runnable.method(**method_signature(embeddings_sig)) - def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]: + @bentoml.Runnable.method(**method_signature(embeddings_sig)) # type: ignore + def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[EmbeddingsOutput]: return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)] - @bentoml.Runnable.method(**method_signature(generate_sig)) + @bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]: adapter_name = attrs.pop('adapter_name', None) if adapter_name is not None: __self.set_adapter(adapter_name) return self.generate(prompt, **attrs) - 
@bentoml.Runnable.method(**method_signature(generate_sig)) + @bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore def generate(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]: adapter_name = attrs.pop('adapter_name', None) if adapter_name is not None: __self.set_adapter(adapter_name) - if __self.framework == 'vllm': attrs.setdefault('request_id', openllm_core.utils.gen_random_uuid()) + if __self.backend == 'vllm': attrs.setdefault('request_id', openllm_core.utils.gen_random_uuid()) return self.generate(prompt, **attrs) - @bentoml.Runnable.method(**method_signature(generate_sig)) + @bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore def generate_one(__self: _Runnable, prompt: str, stop: list[str], **attrs: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]: adapter_name = attrs.pop('adapter_name', None) if adapter_name is not None: __self.set_adapter(adapter_name) return self.generate_one(prompt, stop, **attrs) - @bentoml.Runnable.method(**method_signature(generate_iterator_sig)) + @bentoml.Runnable.method(**method_signature(generate_iterator_sig)) # type: ignore def generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.Generator[str, None, str]: adapter_name = attrs.pop('adapter_name', None) if adapter_name is not None: __self.set_adapter(adapter_name) pre = 0 for outputs in self.generate_iterator(prompt, request_id=openllm_core.utils.gen_random_uuid(), **attrs): - output_text = outputs['text'][0] if __self.framework == 'vllm' else outputs['text'] + output_text = outputs['text'][0] if __self.backend == 'vllm' else outputs['text'] output_text = output_text.strip().split(' ') now = len(output_text) - 1 if now > pre: @@ -1609,20 +1312,20 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]: prompt, generate_kwargs, postprocess_kwargs = self.sanitize_parameters(prompt, **kwargs) return self.postprocess_generate(prompt, __self.generate.run(prompt, **generate_kwargs), 
**postprocess_kwargs) - def _wrapped_embeddings_run(__self: LLMRunner[M, T], prompt: str | list[str]) -> LLMEmbeddings: - """``llm.embed`` is a light wrapper around runner.embeedings.run(). + def _wrapped_embeddings_run(__self: LLMRunner[M, T], prompt: str | list[str]) -> EmbeddingsOutput: + '''``llm.embed`` is a light wrapper around runner.embeedings.run(). Usage: ```python - runner = openllm.Runner('llama', implementation='pt') + runner = openllm.Runner('llama', backend='pt') runner.embed("What is the meaning of life?") ``` - """ + ''' return __self.embeddings.run([prompt] if isinstance(prompt, str) else prompt) def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]: - return {'config', 'llm_type', 'runner_methods', 'runtime', 'llm_tag'} + return {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'} def _wrapped_repr_args(__self: LLMRunner[M, T]) -> ReprArgs: yield 'runner_methods', { @@ -1633,19 +1336,17 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]: } yield 'config', self.config.model_dump(flatten=True) yield 'llm_type', __self.llm_type - yield 'runtime', self.runtime + yield 'backend', self.__llm_backend__ yield 'llm_tag', self.tag - yield 'llm_framework', self.__llm_implementation__ return types.new_class(self.__class__.__name__ + 'Runner', (bentoml.Runner,), exec_body=lambda ns: ns.update({ 'llm_type': self.llm_type, 'identifying_params': self.identifying_params, - 'llm_framework': self.__llm_implementation__, 'llm_tag': self.tag, 'llm': self, 'config': self.config, - 'implementation': self.__llm_implementation__, + 'backend': self.__llm_backend__, 'peft_adapters': property(fget=available_adapters), 'download_model': self.ensure_model_id_exists, '__call__': _wrapped_generate_run, @@ -1660,4 +1361,4 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]: 'has_adapters': self._adapters_mapping is not None })) -__all__ = ['LLMRunner', 'LLMRunnable', 'Runner', 'LLM', 'llm_runner_class', 'llm_runnable_class', 
'LLMEmbeddings'] +__all__ = ['LLMRunner', 'LLMRunnable', 'Runner', 'LLM', 'llm_runner_class', 'llm_runnable_class', 'EmbeddingsOutput'] diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index 129647f5..8d910ad3 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -78,7 +78,7 @@ async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[s 'model_id': runner.llm.model_id, 'timeout': 3600, 'model_name': llm_config['model_name'], - 'framework': runner.llm_framework, + 'backend': runner.backend, 'configuration': '', 'supports_embeddings': runner.supports_embeddings, 'supports_hf_agent': runner.supports_hf_agent @@ -86,7 +86,7 @@ async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[s def metadata_v1(_: str) -> openllm.MetadataOutput: return openllm.MetadataOutput(timeout=llm_config['timeout'], model_name=llm_config['model_name'], - framework=llm_config['env']['framework_value'], + backend=llm_config['env']['backend_value'], model_id=runner.llm.model_id, configuration=llm_config.model_dump_json().decode(), supports_embeddings=runner.supports_embeddings, diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index 32f15de4..f56dfc56 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -86,17 +86,17 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}") env = llm.config['env'] - framework_envvar = env['framework_value'] - if framework_envvar == 'flax': + backend_envvar = env['backend_value'] + if backend_envvar == 'flax': if not openllm_core.utils.is_flax_available(): - raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'") + raise ValueError(f"Flax is not available, while 
{env.backend} is set to 'flax'") packages.extend( [importlib.metadata.version('flax'), importlib.metadata.version('jax'), importlib.metadata.version('jaxlib')]) - elif framework_envvar == 'tf': + elif backend_envvar == 'tf': if not openllm_core.utils.is_tf_available(): - raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'") + raise ValueError(f"TensorFlow is not available, while {env.backend} is set to 'tf'") candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu', 'intel-tensorflow', 'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos', ) @@ -125,21 +125,22 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, - extra_index_url=['https://download.pytorch.org/whl/cu118']) + extra_index_url=[ + 'https://download.pytorch.org/whl/cu118', + 'https://huggingface.github.io/autogptq-index/whl/cu118/' + ]) def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, - quantize: LiteralString | None, bettertransformer: bool | None, - adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, - runtime: t.Literal['ggml', 'transformers'], serialisation_format: t.Literal['safetensors', - 'legacy'], + quantize: LiteralString | None, adapter_map: dict[str, str | None] | None, + dockerfile_template: str | None, serialisation_format: t.Literal['safetensors', 'legacy'], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions: from openllm.cli._factory import parse_config_options environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy()) env: openllm_core.utils.EnvVarMixin = llm.config['env'] - if env['framework_value'] == 'vllm': serialisation_format = 'legacy' + if env['backend_value'] == 'vllm': serialisation_format = 
'legacy' env_dict = { - env.framework: env['framework_value'], + env.backend: env['backend_value'], env.config: f"'{llm.config.model_dump_json().decode()}'", env.model_id: f'/home/bentoml/bento/models/{llm.tag.path()}', 'OPENLLM_MODEL': llm.config['model_name'], @@ -152,14 +153,9 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_ if adapter_map: env_dict['BITSANDBYTES_NOWELCOME'] = os.environ.get('BITSANDBYTES_NOWELCOME', '1') # We need to handle None separately here, as env from subprocess doesn't accept None value. - _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], - bettertransformer=bettertransformer, - quantize=quantize, - runtime=runtime) + _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], quantize=quantize) - env_dict[_env.bettertransformer] = str(_env['bettertransformer_value']) if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value']) - env_dict[_env.runtime] = _env['runtime_value'] return DockerOptions( base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', env=env_dict, @@ -218,21 +214,19 @@ def create_bento(bento_tag: bentoml.Tag, llm: openllm.LLM[t.Any, t.Any], workers_per_resource: str | float, quantize: LiteralString | None, - bettertransformer: bool | None, dockerfile_template: str | None, adapter_map: dict[str, str | None] | None = None, extra_dependencies: tuple[str, ...] 
| None = None, - runtime: t.Literal['ggml', 'transformers'] = 'transformers', serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors', container_registry: LiteralContainerRegistry = 'ecr', container_version_strategy: LiteralContainerVersionStrategy = 'release', _bento_store: BentoStore = Provide[BentoMLContainer.bento_store], _model_store: ModelStore = Provide[BentoMLContainer.model_store]) -> bentoml.Bento: - framework_envvar = llm.config['env']['framework_value'] + backend_envvar = llm.config['env']['backend_value'] labels = dict(llm.identifying_params) labels.update({ '_type': llm.llm_type, - '_framework': framework_envvar, + '_framework': backend_envvar, 'start_name': llm.config['start_name'], 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle' @@ -265,8 +259,8 @@ def create_bento(bento_tag: bentoml.Tag, python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), models=[llm_spec], docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, - bettertransformer, adapter_map, dockerfile_template, - runtime, serialisation_format, container_registry, + adapter_map, dockerfile_template, + serialisation_format, container_registry, container_version_strategy)) bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/')) diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py index 735a67a1..10a8dd97 100644 --- a/openllm-python/src/openllm/bundle/oci/__init__.py +++ b/openllm-python/src/openllm/bundle/oci/__init__.py @@ -94,7 +94,7 @@ class RefResolver: git_hash: str = attr.field() version: openllm_core.utils.VersionInfo = attr.field(converter=_convert_version_from_string) strategy: LiteralContainerVersionStrategy = attr.field() - _ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO) + _ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO, authenticate=False) 
@classmethod def _nightly_ref(cls) -> RefTuple: diff --git a/openllm-python/src/openllm/cli/_factory.py b/openllm-python/src/openllm/cli/_factory.py index 974b8d84..195a6143 100644 --- a/openllm-python/src/openllm/cli/_factory.py +++ b/openllm-python/src/openllm/cli/_factory.py @@ -16,12 +16,15 @@ from click.shell_completion import CompletionItem import bentoml import openllm +import openllm_core from bentoml._internal.configuration.containers import BentoMLContainer from openllm_core._typing_compat import Concatenate from openllm_core._typing_compat import DictStrAny +from openllm_core._typing_compat import LiteralBackend from openllm_core._typing_compat import LiteralString from openllm_core._typing_compat import ParamSpec +from openllm_core._typing_compat import get_literal_args from openllm_core.utils import DEBUG from . import termui @@ -147,14 +150,12 @@ Available official model_id(s): [default: {llm_config['default_id']}] @click.pass_context def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None, workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...], - quantize: t.Literal['int8', 'int4', 'gptq'] | None, bettertransformer: bool | None, - runtime: t.Literal['ggml', 'transformers'], fast: bool, serialisation_format: t.Literal['safetensors', - 'legacy'], - cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any, + quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend, + serialisation_format: t.Literal['safetensors', 'legacy'], cors: bool, adapter_id: str | None, + return_process: bool, **attrs: t.Any, ) -> LLMConfig | subprocess.Popen[bytes]: - fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES - if serialisation_format == 'safetensors' and quantize is not None and os.environ.get( - 'OPENLLM_SERIALIZATION_WARNING', str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES: + if serialisation_format == 'safetensors' 
and quantize is not None and openllm_core.utils.check_bool_env( + 'OPENLLM_SERIALIZATION_WARNING'): termui.echo( f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.", fg='yellow') @@ -184,20 +185,18 @@ Available official model_id(s): [default: {llm_config['default_id']}] # Create a new model env to work with the envvar during CLI invocation env = openllm.utils.EnvVarMixin(config['model_name'], - config.default_implementation(), + backend, model_id=model_id or config['default_id'], - bettertransformer=bettertransformer, - quantize=quantize, - runtime=runtime) - prerequisite_check(ctx, config, quantize, adapter_map, int(1 / wpr)) + quantize=quantize) + requirements = llm_config['requirements'] + if requirements is not None and len(requirements) > 0: + missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None] + if len(missing_requirements) > 0: + termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow') # NOTE: This is to set current configuration start_env = os.environ.copy() start_env = parse_config_options(config, server_timeout, wpr, device, cors, start_env) - if fast: - termui.echo( - f"Fast mode is enabled. 
Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", - fg='yellow') start_env.update({ 'OPENLLM_MODEL': model, @@ -205,21 +204,18 @@ Available official model_id(s): [default: {llm_config['default_id']}] 'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()), 'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(), 'OPENLLM_SERIALIZATION': serialisation_format, - env.runtime: env['runtime_value'], - env.framework: env['framework_value'] + env.backend: env['backend_value'] }) if env['model_id_value']: start_env[env.model_id] = str(env['model_id_value']) - # NOTE: quantize and bettertransformer value is already assigned within env - if bettertransformer is not None: start_env[env.bettertransformer] = str(env['bettertransformer_value']) if quantize is not None: start_env[env.quantize] = str(t.cast(str, env['quantize_value'])) - llm = openllm.utils.infer_auto_class(env['framework_value']).for_model(model, - model_id=start_env[env.model_id], - model_version=model_version, - llm_config=config, - ensure_available=not fast, - adapter_map=adapter_map, - serialisation=serialisation_format) + llm = openllm.utils.infer_auto_class(env['backend_value']).for_model(model, + model_id=start_env[env.model_id], + model_version=model_version, + llm_config=config, + ensure_available=True, + adapter_map=adapter_map, + serialisation=serialisation_format) start_env.update({env.config: llm.config.model_dump_json().decode()}) server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer( @@ -268,21 +264,6 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, * return noop -def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None, - adapter_map: dict[str, str | None] | None, num_workers: int) -> None: - if adapter_map and not openllm.utils.is_peft_available(): - ctx.fail( - 
"Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'") - if quantize and llm_config.default_implementation() == 'vllm': - ctx.fail( - f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization." - ) - requirements = llm_config['requirements'] - if requirements is not None and len(requirements) > 0: - missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None] - if len(missing_requirements) > 0: - termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow') - def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]: def wrapper(fn: FC) -> t.Callable[[FC], FC]: @@ -291,22 +272,21 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab cog.optgroup.group( 'General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."), - model_id_option(factory=cog.optgroup, model_env=llm_config['env']), model_version_option(factory=cog.optgroup), + model_id_option(factory=cog.optgroup), model_version_option(factory=cog.optgroup), cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'), workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup), - fast_option(factory=cog.optgroup), + backend_option(factory=cog.optgroup), cog.optgroup.group('LLM Optimization Options', help='''Optimization related options. - OpenLLM supports running model with [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/), - k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM. + OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM. 
The following are either in our roadmap or currently being worked on: - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/) - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml) ''', - ), + ), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup), cog.optgroup.option('--device', type=openllm.utils.dantic.CUDA, multiple=True, @@ -314,13 +294,6 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab callback=parse_device_callback, help=f"Assign GPU devices (if available) for {llm_config['model_name']}.", show_envvar=True), - cog.optgroup.option('--runtime', - type=click.Choice(['ggml', 'transformers']), - default='transformers', - help='The runtime to use for the given model. Default is transformers.'), - quantize_option(factory=cog.optgroup, model_env=llm_config['env']), - bettertransformer_option(factory=cog.optgroup, model_env=llm_config['env']), - serialisation_option(factory=cog.optgroup), cog.optgroup.group('Fine-tuning related options', help='''\ Note that the argument `--adapter-id` can accept the following format: @@ -439,18 +412,6 @@ def output_option(f: _AnyCallable | None = None, shell_complete=complete_output_var, **attrs)(f) -def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_option('--fast/--no-fast', - show_default=True, - default=False, - envvar='OPENLLM_USE_LOCAL_LATEST', - show_envvar=True, - help='''Whether to skip checking if models is already in store. - - This is useful if you already downloaded or setup the model beforehand. 
- ''', - **attrs)(f) - def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option('--cors/--no-cors', show_default=True, @@ -463,15 +424,12 @@ def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f) -def model_id_option(f: _AnyCallable | None = None, - *, - model_env: openllm.utils.EnvVarMixin | None = None, - **attrs: t.Any) -> t.Callable[[FC], FC]: +def model_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option('--model-id', type=click.STRING, default=None, - envvar=model_env.model_id if model_env is not None else None, - show_envvar=model_env is not None, + envvar='OPENLLM_MODEL_ID', + show_envvar=True, help='Optional model_id name or path for (fine-tune) weight.', **attrs)(f) @@ -483,24 +441,31 @@ def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal help='Optional model version to save for this model. 
It will be inferred automatically from model-id.', **attrs)(f) +def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: + # NOTE: LiteralBackend needs to remove the last two item as ggml and mlc is wip + # XXX: remove the check for __args__ once we have ggml and mlc supports + return cli_option('--backend', + type=click.Choice(get_literal_args(LiteralBackend)[:-2]), + default='pt', + envvar='OPENLLM_BACKEND', + show_envvar=True, + help='The implementation for saving this LLM.', + **attrs)(f) + def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f) -def quantize_option(f: _AnyCallable | None = None, - *, - build: bool = False, - model_env: openllm.utils.EnvVarMixin | None = None, - **attrs: t.Any) -> t.Callable[[FC], FC]: +def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option('--quantise', '--quantize', 'quantize', type=click.Choice(['int8', 'int4', 'gptq']), default=None, - envvar=model_env.quantize if model_env is not None else None, - show_envvar=model_env is not None, + envvar='OPENLLM_QUANTIZE', + show_envvar=True, help='''Dynamic quantization for running this LLM. 
The following quantization strategies are supported: @@ -542,21 +507,6 @@ def workers_per_resource_option(f: _AnyCallable | None = None, > ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ''), **attrs)(f) -def bettertransformer_option(f: _AnyCallable | None = None, - *, - build: bool = False, - model_env: openllm.utils.EnvVarMixin | None = None, - **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_option( - '--bettertransformer', - is_flag=True, - default=None, - envvar=model_env.bettertransformer if model_env is not None else None, - show_envvar=model_env is not None, - help='Apply FasterTransformer wrapper to serve model. This will applies during serving time.' if not build else - 'Set default environment variable whether to serve this model with FasterTransformer in build time.', - **attrs)(f) - def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option('--serialisation', '--serialization', @@ -586,22 +536,18 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal **attrs)(f) def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_option('--container-registry', - 'container_registry', - type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)), - default='ecr', - show_default=True, - show_envvar=True, - envvar='OPENLLM_CONTAINER_REGISTRY', - callback=container_registry_callback, - help='''The default container registry to get the base image for building BentoLLM. - - Currently, it supports 'ecr', 'ghcr.io', 'docker.io' - - \b - > [!NOTE] that in order to build the base image, you will need a GPUs to compile custom kernel. See ``openllm ext build-base-container`` for more information. 
- ''', - **attrs)(f) + return cli_option( + '--container-registry', + 'container_registry', + type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)), + default='ecr', + show_default=True, + show_envvar=True, + envvar='OPENLLM_CONTAINER_REGISTRY', + callback=container_registry_callback, + help= + 'The default container registry to get the base image for building BentoLLM. Currently, it supports ecr, ghcr, docker', + **attrs)(f) _wpr_strategies = {'round_robin', 'conserved'} diff --git a/openllm-python/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py index c43779ba..3ac7a11a 100644 --- a/openllm-python/src/openllm/cli/_sdk.py +++ b/openllm-python/src/openllm/cli/_sdk.py @@ -23,9 +23,9 @@ from ._factory import start_command_factory if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore from openllm_core._configuration import LLMConfig + from openllm_core._typing_compat import LiteralBackend from openllm_core._typing_compat import LiteralContainerRegistry from openllm_core._typing_compat import LiteralContainerVersionStrategy - from openllm_core._typing_compat import LiteralRuntime from openllm_core._typing_compat import LiteralString logger = logging.getLogger(__name__) @@ -38,10 +38,8 @@ def _start(model_name: str, workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None, device: tuple[str, ...] | t.Literal['all'] | None = None, quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, - bettertransformer: bool | None = None, - runtime: t.Literal['ggml', 'transformers'] = 'transformers', adapter_map: dict[LiteralString, str | None] | None = None, - framework: LiteralRuntime | None = None, + backend: LiteralBackend | None = None, additional_args: list[str] | None = None, cors: bool = False, _serve_grpc: bool = False, @@ -57,48 +55,42 @@ def _start(model_name: str, ``openllm.start`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as the CLI interaction. 
- > [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive. - Args: - model_name: The model name to start this LLM - model_id: Optional model id for this given LLM - timeout: The server timeout - workers_per_resource: Number of workers per resource assigned. - See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy) - for more information. By default, this is set to 1. + model_name: The model name to start this LLM + model_id: Optional model id for this given LLM + timeout: The server timeout + workers_per_resource: Number of workers per resource assigned. + See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy) + for more information. By default, this is set to 1. - > [!NOTE] ``--workers-per-resource`` will also accept the following strategies: - > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models. - > - ``conserved``: This will determine the number of available GPU resources, and only assign - > one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is - > equivalent to ``--workers-per-resource 0.25``. - device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. It also accepts 'all' - argument to assign all available GPUs to this LLM. - quantize: Quantize the model weights. This is only applicable for PyTorch models. - Possible quantisation strategies: - - int8: Quantize the model with 8bit (bitsandbytes required) - - int4: Quantize the model with 4bit (bitsandbytes required) - - gptq: Quantize the model with GPTQ (auto-gptq required) - bettertransformer: Convert given model to FastTransformer with PyTorch. - runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML. - cors: Whether to enable CORS for this LLM. 
By default, this is set to ``False``. - adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``. - framework: The framework to use for this LLM. By default, this is set to ``pt``. - additional_args: Additional arguments to pass to ``openllm start``. + > [!NOTE] ``--workers-per-resource`` will also accept the following strategies: + > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models. + > - ``conserved``: This will determine the number of available GPU resources, and only assign + > one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is + > equivalent to ``--workers-per-resource 0.25``. + device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. It also accepts 'all' + argument to assign all available GPUs to this LLM. + quantize: Quantize the model weights. This is only applicable for PyTorch models. + Possible quantisation strategies: + - int8: Quantize the model with 8bit (bitsandbytes required) + - int4: Quantize the model with 4bit (bitsandbytes required) + - gptq: Quantize the model with GPTQ (auto-gptq required) + cors: Whether to enable CORS for this LLM. By default, this is set to ``False``. + adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``. + backend: The backend to use for this LLM. By default, this is set to ``pt``. + additional_args: Additional arguments to pass to ``openllm start``.
""" from .entrypoint import start_command from .entrypoint import start_grpc_command llm_config = openllm.AutoConfig.for_model(model_name) _ModelEnv = openllm_core.utils.EnvVarMixin(model_name, - openllm_core.utils.first_not_none( - framework, default=llm_config.default_implementation()), + backend=openllm_core.utils.first_not_none( + backend, default=llm_config.default_backend()), model_id=model_id, - bettertransformer=bettertransformer, - quantize=quantize, - runtime=runtime) - os.environ[_ModelEnv.framework] = _ModelEnv['framework_value'] + quantize=quantize) + os.environ[_ModelEnv.backend] = _ModelEnv['backend_value'] - args: list[str] = ['--runtime', runtime] + args: list[str] = [] if model_id: args.extend(['--model-id', model_id]) if timeout: args.extend(['--server-timeout', str(timeout)]) if workers_per_resource: @@ -107,10 +99,7 @@ def _start(model_name: str, str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource ]) if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)]) - if quantize and bettertransformer: - raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.") if quantize: args.extend(['--quantize', str(quantize)]) - elif bettertransformer: args.append('--bettertransformer') if cors: args.append('--cors') if adapter_map: args.extend( @@ -134,12 +123,10 @@ def _build(model_name: str, model_version: str | None = None, bento_version: str | None = None, quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, - bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] 
| None = None, workers_per_resource: float | None = None, - runtime: t.Literal['ggml', 'transformers'] = 'transformers', dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, @@ -153,59 +140,50 @@ def _build(model_name: str, The LLM will be built into a BentoService with the following structure: if ``quantize`` is passed, it will instruct the model to be quantized dynamically during serving time. - if ``bettertransformer`` is passed, it will instruct the model to apply FasterTransformer during serving time. ``openllm.build`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as ``openllm build`` CLI. - > [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive. - Args: - model_name: The model name to start this LLM - model_id: Optional model id for this given LLM - model_version: Optional model version for this given LLM - bento_version: Optional bento veresion for this given BentoLLM - quantize: Quantize the model weights. This is only applicable for PyTorch models. - Possible quantisation strategies: - - int8: Quantize the model with 8bit (bitsandbytes required) - - int4: Quantize the model with 4bit (bitsandbytes required) - - gptq: Quantize the model with GPTQ (auto-gptq required) - bettertransformer: Convert given model to FastTransformer with PyTorch. - adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``. - build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory. - enable_features: Additional OpenLLM features to be included with this BentoLLM. - workers_per_resource: Number of workers per resource assigned. - See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy) - for more information. By default, this is set to 1. 
+ model_name: The model name to start this LLM + model_id: Optional model id for this given LLM + model_version: Optional model version for this given LLM + bento_version: Optional bento version for this given BentoLLM + quantize: Quantize the model weights. This is only applicable for PyTorch models. + Possible quantisation strategies: + - int8: Quantize the model with 8bit (bitsandbytes required) + - int4: Quantize the model with 4bit (bitsandbytes required) + - gptq: Quantize the model with GPTQ (auto-gptq required) + adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``. + build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory. + enable_features: Additional OpenLLM features to be included with this BentoLLM. + workers_per_resource: Number of workers per resource assigned. + See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy) + for more information. By default, this is set to 1. - > [!NOTE] ``--workers-per-resource`` will also accept the following strategies: - > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models. - > - ``conserved``: This will determine the number of available GPU resources, and only assign - > one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is - > equivalent to ``--workers-per-resource 0.25``. - runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML. - dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template. - overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``. - push: Whether to push the result bento to BentoCloud.
Make sure to login with 'bentoml cloud login' first. - containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'. - Note that 'containerize' and 'push' are mutually exclusive - container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR. - container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR. - container_version_strategy: The container version strategy. Default to the latest release of OpenLLM. - serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True` - additional_args: Additional arguments to pass to ``openllm build``. - bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store. + > [!NOTE] ``--workers-per-resource`` will also accept the following strategies: + > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models. + > - ``conserved``: This will determine the number of available GPU resources, and only assign + > one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is + > equivalent to ``--workers-per-resource 0.25``. + dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template. + overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``. + push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first. + containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
+ Note that 'containerize' and 'push' are mutually exclusive + container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR. + container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR. + container_version_strategy: The container version strategy. Default to the latest release of OpenLLM. + serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True` + additional_args: Additional arguments to pass to ``openllm build``. + bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store. Returns: ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud. """ args: list[str] = [ - sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--runtime', runtime, '--serialisation', - serialisation_format + sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format ] - if quantize and bettertransformer: - raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.") if quantize: args.extend(['--quantize', quantize]) - if bettertransformer: args.append('--bettertransformer') if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.") if push: args.extend(['--push']) if containerize: args.extend(['--containerize']) @@ -241,8 +219,7 @@ def _import_model(model_name: str, *, model_id: str | None = None, model_version: str | None = None, - runtime: t.Literal['ggml', 'transformers'] = 'transformers', - implementation: LiteralRuntime = 'pt', + backend: LiteralBackend = 'pt', quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors', additional_args: t.Sequence[str] | None = None) -> bentoml.Model: @@ -259,28 
+236,24 @@ def _import_model(model_name: str, > ``openllm.start`` will automatically invoke ``openllm.download`` under the hood. Args: - model_name: The model name to start this LLM - model_id: Optional model id for this given LLM - model_version: Optional model version for this given LLM - runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML. - implementation: The implementation to use for this LLM. By default, this is set to ``pt``. - quantize: Quantize the model weights. This is only applicable for PyTorch models. - Possible quantisation strategies: - - int8: Quantize the model with 8bit (bitsandbytes required) - - int4: Quantize the model with 4bit (bitsandbytes required) - - gptq: Quantize the model with GPTQ (auto-gptq required) - serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors. - Default behaviour is similar to ``safe_serialization=False``. - additional_args: Additional arguments to pass to ``openllm import``. + model_name: The model name to start this LLM + model_id: Optional model id for this given LLM + model_version: Optional model version for this given LLM + backend: The backend to use for this LLM. By default, this is set to ``pt``. + quantize: Quantize the model weights. This is only applicable for PyTorch models. + Possible quantisation strategies: + - int8: Quantize the model with 8bit (bitsandbytes required) + - int4: Quantize the model with 4bit (bitsandbytes required) + - gptq: Quantize the model with GPTQ (auto-gptq required) + serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors. + Default behaviour is similar to ``safe_serialization=False``. + additional_args: Additional arguments to pass to ``openllm import``. Returns: - ``bentoml.Model``:BentoModel of the given LLM. 
This can be used to serve the LLM or can be pushed to BentoCloud. + ``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud. """ from .entrypoint import import_command - args = [ - model_name, '--runtime', runtime, '--implementation', implementation, '--machine', '--serialisation', - serialisation_format, - ] + args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation_format] if model_id is not None: args.append(model_id) if model_version is not None: args.extend(['--model-version', str(model_version)]) if additional_args is not None: args.extend(additional_args) diff --git a/openllm-python/src/openllm/cli/entrypoint.py b/openllm-python/src/openllm/cli/entrypoint.py index 7545176d..93445c34 100644 --- a/openllm-python/src/openllm/cli/entrypoint.py +++ b/openllm-python/src/openllm/cli/entrypoint.py @@ -66,7 +66,7 @@ from openllm.models.auto import AutoLLM from openllm.utils import infer_auto_class from openllm_core._typing_compat import Concatenate from openllm_core._typing_compat import DictStrAny -from openllm_core._typing_compat import LiteralRuntime +from openllm_core._typing_compat import LiteralBackend from openllm_core._typing_compat import LiteralString from openllm_core._typing_compat import ParamSpec from openllm_core._typing_compat import Self @@ -80,7 +80,6 @@ from openllm_core.utils import analytics from openllm_core.utils import bentoml_cattr from openllm_core.utils import compose from openllm_core.utils import configure_logging -from openllm_core.utils import dantic from openllm_core.utils import first_not_none from openllm_core.utils import get_debug_mode from openllm_core.utils import get_quiet_mode @@ -94,15 +93,13 @@ from . 
import termui from ._factory import FC from ._factory import LiteralOutput from ._factory import _AnyCallable -from ._factory import bettertransformer_option +from ._factory import backend_option from ._factory import container_registry_option -from ._factory import fast_option from ._factory import machine_option from ._factory import model_id_option from ._factory import model_name_argument from ._factory import model_version_option from ._factory import output_option -from ._factory import parse_device_callback from ._factory import quantize_option from ._factory import serialisation_option from ._factory import start_command_factory @@ -205,21 +202,6 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): return t.cast(t.Callable[Concatenate[bool, P], t.Any], wrapper) - @staticmethod - def exception_handling(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[P, t.Any]: - command_name = attrs.get('name', func.__name__) - - @functools.wraps(func) - def wrapper(*args: P.args, **attrs: P.kwargs) -> t.Any: - try: - return func(*args, **attrs) - except OpenLLMException as err: - raise click.ClickException(click.style(f"[{group.name}] '{command_name}' failed: " + err.message, fg='red')) from err - except KeyboardInterrupt: - pass - - return wrapper - def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None: if cmd_name in t.cast('Extensions', extension_command).list_commands(ctx): return t.cast('Extensions', extension_command).get_command(ctx, cmd_name) @@ -253,11 +235,11 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): name = name.replace('_', '-') kwargs.setdefault('help', inspect.getdoc(f)) kwargs.setdefault('name', name) - wrapped = self.exception_handling(self.usage_tracking(self.common_params(f), self, **kwargs), self, **kwargs) + wrapped = self.usage_tracking(self.common_params(f), self, **kwargs) # move common parameters to end of the parameters list _memo = getattr(wrapped, '__click_params__', None) - if _memo 
is None: raise RuntimeError('Click command not register correctly.') + if _memo is None: raise ValueError('Click command not register correctly.') _object_setattr(wrapped, '__click_params__', _memo[-self.NUMBER_OF_COMMON_PARAMS:] + _memo[:-self.NUMBER_OF_COMMON_PARAMS]) # NOTE: we need to call super of super to avoid conflict with BentoMLCommandGroup command setup cmd = super(BentoMLCommandGroup, self).command(*args, **kwargs)(wrapped) @@ -348,11 +330,10 @@ _start_mapping = { @click.argument('model_id', type=click.STRING, default=None, metavar='Optional[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=False) @click.argument('converter', envvar='CONVERTER', type=click.STRING, default=None, required=False, metavar=None) @model_version_option -@click.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.') @output_option @quantize_option @machine_option -@click.option('--implementation', type=click.Choice(['pt', 'tf', 'flax', 'vllm']), default=None, help='The implementation for saving this LLM.') +@backend_option @serialisation_option def import_command( model_name: str, @@ -360,9 +341,8 @@ def import_command( converter: str | None, model_version: str | None, output: LiteralOutput, - runtime: t.Literal['ggml', 'transformers'], machine: bool, - implementation: LiteralRuntime | None, + backend: LiteralBackend, quantize: t.Literal['int8', 'int4', 'gptq'] | None, serialisation_format: t.Literal['safetensors', 'legacy'], ) -> bentoml.Model: @@ -415,45 +395,42 @@ def import_command( ```bash $ CONVERTER=llama2-hf openllm import llama /path/to/llama-2 ``` - - > [!WARNING] This behaviour will override ``--runtime``. Therefore make sure that the LLM contains correct conversion strategies to both GGML and HF. 
""" llm_config = AutoConfig.for_model(model_name) - env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, runtime=runtime, quantize=quantize) - impl: LiteralRuntime = first_not_none(implementation, default=env['framework_value']) - llm = infer_auto_class(impl).for_model( + env = EnvVarMixin(model_name, backend=llm_config.default_backend(), model_id=model_id, quantize=quantize) + backend = first_not_none(backend, default=env['backend_value']) + llm = infer_auto_class(backend).for_model( model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format ) _previously_saved = False try: _ref = serialisation.get(llm) _previously_saved = True - except bentoml.exceptions.NotFound: + except openllm.exceptions.OpenLLMException: if not machine and output == 'pretty': - msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for implementation {llm.__llm_implementation__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..." + msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for backend {llm.__llm_backend__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..." 
termui.echo(msg, fg='yellow', nl=True) _ref = serialisation.get(llm, auto_import=True) - if impl == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache() + if backend == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache() if machine: return _ref elif output == 'pretty': - if _previously_saved: termui.echo(f"{model_name} with 'model_id={model_id}' is already setup for framework '{impl}': {_ref.tag!s}", nl=True, fg='yellow') + if _previously_saved: termui.echo(f"{model_name} with 'model_id={model_id}' is already setup for backend '{backend}': {_ref.tag!s}", nl=True, fg='yellow') else: termui.echo(f'Saved model: {_ref.tag}') - elif output == 'json': termui.echo(orjson.dumps({'previously_setup': _previously_saved, 'framework': impl, 'tag': str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode()) + elif output == 'json': termui.echo(orjson.dumps({'previously_setup': _previously_saved, 'backend': backend, 'tag': str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode()) else: termui.echo(_ref.tag) return _ref + @cli.command(context_settings={'token_normalize_func': inflection.underscore}) @model_name_argument @model_id_option @output_option @machine_option +@backend_option @click.option('--bento-version', type=str, default=None, help='Optional bento version for this BentoLLM. 
Default is the the model revision.') @click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.') @workers_per_resource_option(factory=click, build=True) -@click.option('--device', type=dantic.CUDA, multiple=True, envvar='CUDA_VISIBLE_DEVICES', callback=parse_device_callback, help='Set the device', show_envvar=True) @cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Optimisation options') @quantize_option(factory=cog.optgroup, build=True) -@bettertransformer_option(factory=cog.optgroup) -@click.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.') @click.option( '--enable-features', multiple=True, @@ -476,7 +453,6 @@ def import_command( @click.option( '--container-version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='release', help="Default container version strategy for the image from '--container-registry'" ) -@fast_option @cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options') @cog.optgroup.option( '--containerize', @@ -496,21 +472,18 @@ def build_command( bento_version: str | None, overwrite: bool, output: LiteralOutput, - runtime: t.Literal['ggml', 'transformers'], quantize: t.Literal['int8', 'int4', 'gptq'] | None, enable_features: tuple[str, ...] 
| None, - bettertransformer: bool | None, workers_per_resource: float | None, adapter_id: tuple[str, ...], build_ctx: str | None, + backend: LiteralBackend, machine: bool, - device: tuple[str, ...], model_version: str | None, dockerfile_template: t.TextIO | None, containerize: bool, push: bool, serialisation_format: t.Literal['safetensors', 'legacy'], - fast: bool, container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy, force_push: bool, @@ -539,22 +512,21 @@ def build_command( _previously_built = False llm_config = AutoConfig.for_model(model_name) - env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, quantize=quantize, bettertransformer=bettertransformer, runtime=runtime) + env = EnvVarMixin(model_name, backend=backend, model_id=model_id, quantize=quantize) # NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError # during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path try: - os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), env.runtime: str(env['runtime_value']), 'OPENLLM_SERIALIZATION': serialisation_format}) + os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation_format, 'OPENLLM_BACKEND': env['backend_value']}) if env['model_id_value']: os.environ[env.model_id] = str(env['model_id_value']) if env['quantize_value']: os.environ[env.quantize] = str(env['quantize_value']) - os.environ[env.bettertransformer] = str(env['bettertransformer_value']) - llm = infer_auto_class(env['framework_value']).for_model( - model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs + llm = infer_auto_class(env['backend_value']).for_model( + model_name, model_id=env['model_id_value'], llm_config=llm_config, 
ensure_available=True, model_version=model_version, serialisation=serialisation_format, **attrs ) labels = dict(llm.identifying_params) - labels.update({'_type': llm.llm_type, '_framework': env['framework_value']}) + labels.update({'_type': llm.llm_type, '_framework': env['backend_value']}) workers_per_resource = first_not_none(workers_per_resource, default=llm_config['workers_per_resource']) with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs: @@ -603,10 +575,8 @@ def build_command( workers_per_resource=workers_per_resource, adapter_map=adapter_map, quantize=quantize, - bettertransformer=bettertransformer, extra_dependencies=enable_features, dockerfile_template=dockerfile_template_path, - runtime=runtime, container_registry=container_registry, container_version_strategy=container_version_strategy ) @@ -632,16 +602,17 @@ def build_command( if push: BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push) elif containerize: - backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker')) + container_backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker')) try: - bentoml.container.health(backend) + bentoml.container.health(container_backend) except subprocess.CalledProcessError: raise OpenLLMException(f'Failed to use backend {backend}') from None try: - bentoml.container.build(bento.tag, backend=backend, features=('grpc', 'io')) + bentoml.container.build(bento.tag, backend=container_backend, features=('grpc', 'io')) except Exception as err: raise OpenLLMException(f"Exception caught while containerizing '{bento.tag!s}':\n{err}") from err return bento + @cli.command() @output_option @click.option('--show-available', is_flag=True, default=False, help="Show available models in local store (mutually exclusive with '-o porcelain').") @@ -667,21 +638,21 @@ def models_command(ctx: click.Context, output: LiteralOutput, 
show_available: bo else: failed_initialized: list[tuple[str, Exception]] = [] - json_data: dict[str, dict[t.Literal['architecture', 'model_id', 'url', 'installation', 'cpu', 'gpu', 'runtime_impl'], t.Any] | t.Any] = {} + json_data: dict[str, dict[t.Literal['architecture', 'model_id', 'url', 'installation', 'cpu', 'gpu', 'backend'], t.Any] | t.Any] = {} converted: list[str] = [] for m in models: config = AutoConfig.for_model(m) - runtime_impl: tuple[str, ...] = () - if config['model_name'] in MODEL_MAPPING_NAMES: runtime_impl += ('pt',) - if config['model_name'] in MODEL_FLAX_MAPPING_NAMES: runtime_impl += ('flax',) - if config['model_name'] in MODEL_TF_MAPPING_NAMES: runtime_impl += ('tf',) - if config['model_name'] in MODEL_VLLM_MAPPING_NAMES: runtime_impl += ('vllm',) + backend: tuple[str, ...] = () + if config['model_name'] in MODEL_MAPPING_NAMES: backend += ('pt',) + if config['model_name'] in MODEL_FLAX_MAPPING_NAMES: backend += ('flax',) + if config['model_name'] in MODEL_TF_MAPPING_NAMES: backend += ('tf',) + if config['model_name'] in MODEL_VLLM_MAPPING_NAMES: backend += ('vllm',) json_data[m] = { 'architecture': config['architecture'], 'model_id': config['model_ids'], 'cpu': not config['requires_gpu'], 'gpu': True, - 'runtime_impl': runtime_impl, + 'backend': backend, 'installation': f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config['requirements'] else 'openllm', } converted.extend([normalise_model_name(i) for i in config['model_ids']]) @@ -708,10 +679,10 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo import tabulate tabulate.PRESERVE_WHITESPACE = True - # llm, architecture, url, model_id, installation, cpu, gpu, runtime_impl - data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralRuntime, ...]]] = [] + # llm, architecture, url, model_id, installation, cpu, gpu, backend + data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralBackend, 
...]]] = [] for m, v in json_data.items(): - data.extend([(m, v['architecture'], v['model_id'], v['installation'], '❌' if not v['cpu'] else '✅', '✅', v['runtime_impl'],)]) + data.extend([(m, v['architecture'], v['model_id'], v['installation'], '❌' if not v['cpu'] else '✅', '✅', v['backend'],)]) column_widths = [ int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 4), ] diff --git a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py index 704c3833..93f286e7 100644 --- a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py +++ b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py @@ -18,7 +18,7 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config()) - def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings: + def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput: import torch import torch.nn.functional as F embeddings: list[list[float]] = [] @@ -30,4 +30,4 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain data = F.normalize(torch.mean(outputs.hidden_states[-1].transpose(0, 1), dim=0), p=2, dim=0) embeddings.append(data.tolist()) num_tokens += len(input_ids[0]) - return openllm.LLMEmbeddings(embeddings=embeddings, num_tokens=num_tokens) + return openllm.EmbeddingsOutput(embeddings=embeddings, num_tokens=num_tokens) diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py index 22f94531..51a76400 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -17,7 +17,7 @@ class 
FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) - def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings: + def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput: import torch import torch.nn.functional as F embeddings: list[list[float]] = [] @@ -29,4 +29,4 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer data = F.normalize(torch.mean(outputs.encoder_last_hidden_state[0], dim=0), p=2, dim=0) embeddings.append(data.tolist()) num_tokens += len(input_ids[0]) - return openllm.LLMEmbeddings(embeddings=embeddings, num_tokens=num_tokens) + return openllm.EmbeddingsOutput(embeddings=embeddings, num_tokens=num_tokens) diff --git a/openllm-python/src/openllm/models/llama/modeling_llama.py b/openllm-python/src/openllm/models/llama/modeling_llama.py index 79946d4c..b259fba8 100644 --- a/openllm-python/src/openllm/models/llama/modeling_llama.py +++ b/openllm-python/src/openllm/models/llama/modeling_llama.py @@ -13,7 +13,7 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke import torch return {'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {} - def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings: + def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput: import torch import torch.nn.functional as F encoding = self.tokenizer(prompts, padding=True, return_tensors='pt').to(self.device) @@ -23,8 +23,8 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke mask = attention_mask.unsqueeze(-1).expand(data.size()).float() masked_embeddings = data * mask sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1) - return openllm.LLMEmbeddings(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(), - 
num_tokens=int(torch.sum(attention_mask).item())) + return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(), + num_tokens=int(torch.sum(attention_mask).item())) def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]: diff --git a/openllm-python/src/openllm/models/mpt/modeling_mpt.py b/openllm-python/src/openllm/models/mpt/modeling_mpt.py index 7ef664bc..33553246 100644 --- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py +++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py @@ -33,10 +33,6 @@ def get_mpt_config(model_id_or_path: str, class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXTokenizerFast']): __openllm_internal__ = True - def llm_post_init(self) -> None: - import torch - self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32 - @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: import torch @@ -49,7 +45,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken import torch import transformers _, tokenizer_attrs = self.llm_parameters - torch_dtype = attrs.pop('torch_dtype', self.dtype) + torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32) device_map = attrs.pop('device_map', None) attrs.pop('low_cpu_mem_usage', None) config = get_mpt_config(self.model_id, @@ -75,7 +71,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel: import transformers - torch_dtype = attrs.pop('torch_dtype', self.dtype) + torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32) device_map = attrs.pop('device_map', None) trust_remote_code = attrs.pop('trust_remote_code', True) config = get_mpt_config(self._bentomodel.path, diff --git 
a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py index 0594c856..0a8c9b82 100644 --- a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py +++ b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py @@ -8,10 +8,6 @@ if t.TYPE_CHECKING: class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNeoXTokenizerFast']): __openllm_internal__ = True - def llm_post_init(self) -> None: - import torch - self.bettertransformer = True if not torch.cuda.is_available() else False - @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: import torch diff --git a/openllm-python/src/openllm/serialisation/__init__.py b/openllm-python/src/openllm/serialisation/__init__.py index ea4e325c..1714d65b 100644 --- a/openllm-python/src/openllm/serialisation/__init__.py +++ b/openllm-python/src/openllm/serialisation/__init__.py @@ -1,27 +1,9 @@ -"""Serialisation utilities for OpenLLM. +'''Serialisation utilities for OpenLLM. Currently supports transformers for PyTorch, Tensorflow and Flax. Currently, GGML format is working in progress. 
- -## Usage - -```python -import openllm - -llm = openllm.AutoLLM.for_model("dolly-v2") -llm.save_pretrained("./path/to/local-dolly") -``` - -To use different runtime, specify directly in the `for_model` method: - -```python -import openllm - -llm = openllm.AutoLLM.for_model("dolly-v2", runtime='ggml') -llm.save_pretrained("./path/to/local-dolly") -``` -""" +''' from __future__ import annotations import importlib import typing as t @@ -54,7 +36,7 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T: from .transformers._helpers import infer_tokenizers_from_llm from .transformers._helpers import process_config - config, *_ = process_config(llm._bentomodel.path, llm.__llm_trust_remote_code__) + config, *_ = process_config(llm._bentomodel.path, llm.trust_remote_code) bentomodel_fs = fs.open_fs(llm._bentomodel.path) if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME): with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, 'rb') as cofile: @@ -62,12 +44,11 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T: tokenizer = cloudpickle.load(t.cast('t.IO[bytes]', cofile))['tokenizer'] except KeyError: raise openllm.exceptions.OpenLLMException( - "Bento model does not have tokenizer. Make sure to save" - " the tokenizer within the model via 'custom_objects'." - " For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None + "Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. 
" + "For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None else: tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'), - trust_remote_code=llm.__llm_trust_remote_code__, + trust_remote_code=llm.trust_remote_code, **tokenizer_attrs) if tokenizer.pad_token_id is None: @@ -82,18 +63,20 @@ class _Caller(t.Protocol[P]): def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any: ... -_extras = ['get', 'import_model', 'save_pretrained', 'load_model'] +_extras = ['get', 'import_model', 'load_model'] def _make_dispatch_function(fn: str) -> _Caller[P]: def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any: """Generic function dispatch to correct serialisation submodules based on LLM runtime. - > [!NOTE] See 'openllm.serialisation.transformers' if 'llm.runtime="transformers"' + > [!NOTE] See 'openllm.serialisation.transformers' if 'llm.__llm_backend__ in ("pt", "tf", "flax", "vllm")' - > [!NOTE] See 'openllm.serialisation.ggml' if 'llm.runtime="ggml"' + > [!NOTE] See 'openllm.serialisation.ggml' if 'llm.__llm_backend__="ggml"' """ - return getattr(importlib.import_module(f'.{llm.runtime}', __name__), fn)(llm, *args, **kwargs) + serde = 'transformers' + if llm.__llm_backend__ == 'ggml': serde = 'ggml' + return getattr(importlib.import_module(f'.{serde}', __name__), fn)(llm, *args, **kwargs) return caller @@ -105,9 +88,6 @@ if t.TYPE_CHECKING: def import_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> bentoml.Model: ... - def save_pretrained(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> None: - ... - def load_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> M: ... 
diff --git a/openllm-python/src/openllm/serialisation/ggml.py b/openllm-python/src/openllm/serialisation/ggml.py index d0539672..fd4397cc 100644 --- a/openllm-python/src/openllm/serialisation/ggml.py +++ b/openllm-python/src/openllm/serialisation/ggml.py @@ -5,10 +5,10 @@ This requires ctransformers to be installed. from __future__ import annotations import typing as t -import bentoml -import openllm - if t.TYPE_CHECKING: + import bentoml + import openllm + from openllm_core._typing_compat import M _conversion_strategy = {'pt': 'ggml'} @@ -21,30 +21,7 @@ def import_model(llm: openllm.LLM[t.Any, t.Any], raise NotImplementedError('Currently work in progress.') def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Model: - '''Return an instance of ``bentoml.Model`` from given LLM instance. - - By default, it will try to check the model in the local store. - If model is not found, and ``auto_import`` is set to True, it will try to import the model from HuggingFace Hub. - - Otherwise, it will raises a ``bentoml.exceptions.NotFound``. - ''' - try: - model = bentoml.models.get(llm.tag) - if model.info.module not in ('openllm.serialisation.ggml', __name__): - raise bentoml.exceptions.NotFound( - f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'." 
- ) - if 'runtime' in model.info.labels and model.info.labels['runtime'] != llm.runtime: - raise openllm.exceptions.OpenLLMException( - f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.") - return model - except bentoml.exceptions.NotFound: - if auto_import: - return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__) - raise + raise NotImplementedError('Currently work in progress.') def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M: raise NotImplementedError('Currently work in progress.') - -def save_pretrained(llm: openllm.LLM[t.Any, t.Any], save_directory: str, **attrs: t.Any) -> None: - raise NotImplementedError('Currently work in progress.') diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py index dc2d8e2b..c75e3636 100644 --- a/openllm-python/src/openllm/serialisation/transformers/__init__.py +++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py @@ -5,6 +5,7 @@ import logging import typing as t from huggingface_hub import snapshot_download +from packaging.version import Version from simple_di import Provide from simple_di import inject @@ -28,22 +29,18 @@ if t.TYPE_CHECKING: import auto_gptq as autogptq import torch import torch.nn - import transformers - import vllm from bentoml._internal.models import ModelStore from openllm_core._typing_compat import DictStrAny from openllm_core._typing_compat import M from openllm_core._typing_compat import T else: - vllm = openllm.utils.LazyLoader('vllm', globals(), 'vllm') autogptq = openllm.utils.LazyLoader('autogptq', globals(), 'auto_gptq') - transformers = openllm.utils.LazyLoader('transformers', globals(), 'transformers') torch = openllm.utils.LazyLoader('torch', globals(), 'torch') logger = logging.getLogger(__name__) -__all__ = ['import_model', 'get', 'load_model', 'save_pretrained'] +__all__ = 
['import_model', 'get', 'load_model'] @inject def import_model(llm: openllm.LLM[M, T], @@ -74,7 +71,7 @@ def import_model(llm: openllm.LLM[M, T], safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation_format == 'safetensors') # Disable safe serialization with vLLM - if llm.__llm_implementation__ == 'vllm': safe_serialisation = False + if llm.__llm_backend__ == 'vllm': safe_serialisation = False metadata: DictStrAny = { 'safe_serialisation': safe_serialisation, '_quantize': quantize_method is not None and quantize_method @@ -95,8 +92,8 @@ def import_model(llm: openllm.LLM[M, T], # since saving int4 is not yet supported if 'quantization_config' in attrs and getattr(attrs['quantization_config'], 'load_in_4bit', False): attrs.pop('quantization_config') - if llm.__llm_implementation__ != 'flax': attrs['use_safetensors'] = safe_serialisation - metadata['_framework'] = 'pt' if llm.__llm_implementation__ == 'vllm' else llm.__llm_implementation__ + if llm.__llm_backend__ != 'flax': attrs['use_safetensors'] = safe_serialisation + metadata['_framework'] = 'pt' if llm.__llm_backend__ == 'vllm' else llm.__llm_backend__ tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id, trust_remote_code=trust_remote_code, @@ -108,7 +105,7 @@ def import_model(llm: openllm.LLM[M, T], imported_modules: list[types.ModuleType] = [] bentomodel = bentoml.Model.create(llm.tag, module='openllm.serialisation.transformers', - api_version='v1', + api_version='v2', options=ModelOptions(), context=openllm.utils.generate_context(framework_name='openllm'), labels=openllm.utils.generate_labels(llm), @@ -133,8 +130,7 @@ def import_model(llm: openllm.LLM[M, T], trust_remote_code=trust_remote_code, use_safetensors=safe_serialisation, **hub_attrs, - **attrs, - ) + **attrs) update_model(bentomodel, metadata={ '_pretrained_class': model.__class__.__name__, @@ -192,27 +188,21 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> 
bentoml.Model: ''' try: model = bentoml.models.get(llm.tag) - if model.info.module not in ('openllm.serialisation.transformers' - 'bentoml.transformers', 'bentoml._internal.frameworks.transformers', - __name__): # NOTE: backward compatible with previous version of OpenLLM. - raise bentoml.exceptions.NotFound( - f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'." - ) - if 'runtime' in model.info.labels and model.info.labels['runtime'] != llm.runtime: + if Version(model.info.api_version) < Version('v2'): raise openllm.exceptions.OpenLLMException( - f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.") + 'Please run "openllm prune -y --include-bentos" and upgrade all saved model to latest release.') + if model.info.labels['backend'] != llm.__llm_backend__: + raise openllm.exceptions.OpenLLMException( + f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}." + ) return model - except bentoml.exceptions.NotFound as err: - if auto_import: return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__) - raise err from None + except Exception as err: + if auto_import: return import_model(llm, trust_remote_code=llm.trust_remote_code) + raise openllm.exceptions.OpenLLMException( + f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: - '''Load the model from BentoML store. - - By default, it will try to find check the model in the local store. - If model is not found, it will raises a ``bentoml.exceptions.NotFound``. 
- ''' - config, hub_attrs, attrs = process_config(llm.model_id, llm.__llm_trust_remote_code__, **attrs) + config, hub_attrs, attrs = process_config(llm.model_id, llm.trust_remote_code, **attrs) safe_serialization = openllm.utils.first_not_none(t.cast( t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)), attrs.pop('safe_serialization', None), @@ -229,7 +219,7 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: *decls, quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config), - trust_remote_code=llm.__llm_trust_remote_code__, + trust_remote_code=llm.trust_remote_code, use_safetensors=safe_serialization, **hub_attrs, **attrs) @@ -238,57 +228,9 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path, *decls, config=config, - trust_remote_code=llm.__llm_trust_remote_code__, + trust_remote_code=llm.trust_remote_code, device_map=device_map, **hub_attrs, **attrs).eval() - # BetterTransformer is currently only supported on PyTorch. 
- if llm.bettertransformer and isinstance(model, transformers.PreTrainedModel): model = model.to_bettertransformer() - if llm.__llm_implementation__ in {'pt', 'vllm'}: check_unintialised_params(model) + if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model) return t.cast('M', model) - -def save_pretrained(llm: openllm.LLM[M, T], - save_directory: str, - is_main_process: bool = True, - state_dict: DictStrAny | None = None, - save_function: t.Any | None = None, - push_to_hub: bool = False, - max_shard_size: int | str = '10GB', - safe_serialization: bool = False, - variant: str | None = None, - **attrs: t.Any) -> None: - save_function = t.cast(t.Callable[..., None], openllm.utils.first_not_none(save_function, default=torch.save)) - model_save_attrs, tokenizer_save_attrs = openllm.utils.normalize_attrs_to_model_tokenizer_pair(**attrs) - safe_serialization = safe_serialization or llm._serialisation_format == 'safetensors' - # NOTE: disable safetensors for vllm - if llm.__llm_implementation__ == 'vllm': safe_serialization = False - if llm._quantize_method == 'gptq': - if not openllm.utils.is_autogptq_available(): - raise openllm.exceptions.OpenLLMException( - "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'" - ) - if llm.config['model_type'] != 'causal_lm': - raise openllm.exceptions.OpenLLMException( - f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") - if not openllm.utils.lenient_issubclass(llm.model, autogptq.modeling.BaseGPTQForCausalLM): - raise ValueError(f'Model is not a BaseGPTQForCausalLM (type: {type(llm.model)})') - t.cast('autogptq.modeling.BaseGPTQForCausalLM', llm.model).save_quantized(save_directory, - use_safetensors=safe_serialization) - elif openllm.utils.LazyType['vllm.LLMEngine']('vllm.LLMEngine').isinstance(llm.model): - raise RuntimeError( - "vllm.LLMEngine cannot be serialisation directly. 
This happens when 'save_pretrained' is called directly after `openllm.AutoVLLM` is initialized." - ) - elif isinstance(llm.model, transformers.Pipeline): - llm.model.save_pretrained(save_directory, safe_serialization=safe_serialization) - else: - # We can safely cast here since it will be the PreTrainedModel protocol. - t.cast('transformers.PreTrainedModel', llm.model).save_pretrained(save_directory, - is_main_process=is_main_process, - state_dict=state_dict, - save_function=save_function, - push_to_hub=push_to_hub, - max_shard_size=max_shard_size, - safe_serialization=safe_serialization, - variant=variant, - **model_save_attrs) - llm.tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs) diff --git a/openllm-python/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py index 643a40f6..b325fd85 100644 --- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py +++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py @@ -76,7 +76,7 @@ def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.Pretra if type(config) in transformers.MODEL_FOR_CAUSAL_LM_MAPPING: idx = 0 elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING: idx = 1 else: raise openllm.exceptions.OpenLLMException(f'Model type {type(config)} is not supported yet.') - return getattr(transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[llm.__llm_implementation__][idx]) + return getattr(transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[llm.__llm_backend__][idx]) def check_unintialised_params(model: torch.nn.Module) -> None: unintialized = [n for n, param in model.named_parameters() if param.data.device == torch.device('meta')] @@ -104,11 +104,11 @@ def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Mod def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType: infer_fn: tuple[str, ...] 
= ('__call__',) default_config = ModelSignature(batchable=False) - if llm.__llm_implementation__ in {'pt', 'vllm'}: + if llm.__llm_backend__ in {'pt', 'vllm'}: infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', 'constrained_beam_search', ) - elif llm.__llm_implementation__ == 'tf': + elif llm.__llm_backend__ == 'tf': infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search', 'contrastive_search', ) diff --git a/openllm-python/src/openllm/serialisation/transformers/weights.py b/openllm-python/src/openllm/serialisation/transformers/weights.py index 0acb0a25..e1218444 100644 --- a/openllm-python/src/openllm/serialisation/transformers/weights.py +++ b/openllm-python/src/openllm/serialisation/transformers/weights.py @@ -23,9 +23,9 @@ class HfIgnore: @classmethod def ignore_patterns(cls, llm: openllm.LLM[M, T]) -> list[str]: - if llm.__llm_implementation__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors] - elif llm.__llm_implementation__ == 'tf': base = [cls.flax, cls.pt] - elif llm.__llm_implementation__ == 'flax': + if llm.__llm_backend__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors] + elif llm.__llm_backend__ == 'tf': base = [cls.flax, cls.pt] + elif llm.__llm_backend__ == 'flax': base = [cls.tf, cls.pt, cls.safetensors] # as of current, safetensors is not supported with flax else: base = [cls.tf, cls.flax] diff --git a/openllm-python/src/openllm/testing.py b/openllm-python/src/openllm/testing.py index 4c4ee109..5736d1da 100644 --- a/openllm-python/src/openllm/testing.py +++ b/openllm-python/src/openllm/testing.py @@ -10,7 +10,7 @@ import bentoml import openllm if t.TYPE_CHECKING: - from openllm_core._typing_compat import LiteralRuntime + from openllm_core._typing_compat import LiteralBackend logger = logging.getLogger(__name__) @@ -18,10 +18,9 @@ logger = logging.getLogger(__name__) def build_bento(model: str, 
model_id: str | None = None, quantize: t.Literal['int4', 'int8', 'gptq'] | None = None, - runtime: t.Literal['ggml', 'transformers'] = 'transformers', cleanup: bool = False) -> t.Iterator[bentoml.Bento]: logger.info('Building BentoML for %s', model) - bento = openllm.build(model, model_id=model_id, quantize=quantize, runtime=runtime) + bento = openllm.build(model, model_id=model_id, quantize=quantize) yield bento if cleanup: logger.info('Deleting %s', bento.tag) @@ -49,7 +48,7 @@ def build_container(bento: bentoml.Bento | str | bentoml.Tag, @contextlib.contextmanager def prepare(model: str, model_id: str | None = None, - implementation: LiteralRuntime = 'pt', + implementation: LiteralBackend = 'pt', deployment_mode: t.Literal['container', 'local'] = 'local', clean_context: contextlib.ExitStack | None = None, cleanup: bool = True) -> t.Iterator[str]: diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py index fdeed2c5..4033d3fb 100644 --- a/openllm-python/src/openllm/utils/__init__.py +++ b/openllm-python/src/openllm/utils/__init__.py @@ -16,11 +16,11 @@ from . 
import dummy_vllm_objects as dummy_vllm_objects if t.TYPE_CHECKING: import openllm - from openllm_core._typing_compat import LiteralRuntime + from openllm_core._typing_compat import LiteralBackend def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]: return { - 'runtime': llm.runtime, + 'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], 'architecture': llm.config['architecture'], @@ -28,14 +28,13 @@ def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]: } def infer_auto_class( - implementation: LiteralRuntime -) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]: + backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]: import openllm - if implementation == 'tf': return openllm.AutoTFLLM - elif implementation == 'flax': return openllm.AutoFlaxLLM - elif implementation == 'pt': return openllm.AutoLLM - elif implementation == 'vllm': return openllm.AutoVLLM - else: raise RuntimeError(f"Unknown implementation: {implementation} (supported: 'pt', 'flax', 'tf', 'vllm')") + if backend == 'tf': return openllm.AutoTFLLM + elif backend == 'flax': return openllm.AutoFlaxLLM + elif backend == 'pt': return openllm.AutoLLM + elif backend == 'vllm': return openllm.AutoVLLM + else: raise RuntimeError(f"Unknown backend: {backend} (supported: 'pt', 'flax', 'tf', 'vllm')") __all__ = [ 'generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects', diff --git a/openllm-python/tests/_strategies/_configuration.py b/openllm-python/tests/_strategies/_configuration.py index ee484747..813df70d 100644 --- a/openllm-python/tests/_strategies/_configuration.py +++ b/openllm-python/tests/_strategies/_configuration.py @@ -30,12 +30,10 @@ def model_settings(draw: st.DrawFn): st.booleans(), 'requirements': st.none() | st.lists(st.text(), min_size=1), - 'default_implementation': + 
'default_backend': st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])), 'model_type': st.sampled_from(['causal_lm', 'seq2seq_lm']), - 'runtime': - st.sampled_from(['transformers', 'ggml']), 'name_type': st.sampled_from(['dasherize', 'lowercase']), 'timeout': diff --git a/openllm-python/tests/configuration_test.py b/openllm-python/tests/configuration_test.py index 0dcbc5e2..147ebc66 100644 --- a/openllm-python/tests/configuration_test.py +++ b/openllm-python/tests/configuration_test.py @@ -111,10 +111,7 @@ def patch_env(**attrs: t.Any): yield def test_struct_envvar(): - with patch_env(**{ - field_env_key('env_llm', 'field1'): '4', - field_env_key('env_llm', 'temperature', suffix='generation'): '0.2', - }): + with patch_env(**{field_env_key('field1'): '4', field_env_key('temperature', suffix='generation'): '0.2',}): class EnvLLM(openllm.LLMConfig): __config__ = {'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel',} @@ -146,8 +143,8 @@ def test_struct_provided_fields(): def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mk: - mk.setenv(field_env_key('overwrite_with_env_available', 'field1'), str(4.0)) - mk.setenv(field_env_key('overwrite_with_env_available', 'temperature', suffix='generation'), str(0.2)) + mk.setenv(field_env_key('field1'), str(4.0)) + mk.setenv(field_env_key('temperature', suffix='generation'), str(0.2)) sent = make_llm_config('OverwriteWithEnvAvailable', { 'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], diff --git a/openllm-python/tests/conftest.py b/openllm-python/tests/conftest.py index 5eed1c6a..959b6e11 100644 --- a/openllm-python/tests/conftest.py +++ b/openllm-python/tests/conftest.py @@ -8,9 +8,9 @@ import pytest import openllm if t.TYPE_CHECKING: - from openllm_core._typing_compat import LiteralRuntime + from openllm_core._typing_compat import LiteralBackend 
-_FRAMEWORK_MAPPING = { +_MODELING_MAPPING = { 'flan_t5': 'google/flan-t5-small', 'opt': 'facebook/opt-125m', 'baichuan': 'baichuan-inc/Baichuan-7B', @@ -22,19 +22,17 @@ _PROMPT_MAPPING = { def parametrise_local_llm( model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]: - if model not in _FRAMEWORK_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.") - runtime_impl: tuple[LiteralRuntime, ...] = tuple() - if model in openllm.MODEL_MAPPING_NAMES: runtime_impl += ('pt',) - if model in openllm.MODEL_FLAX_MAPPING_NAMES: runtime_impl += ('flax',) - if model in openllm.MODEL_TF_MAPPING_NAMES: runtime_impl += ('tf',) - for framework, prompt in itertools.product(runtime_impl, _PROMPT_MAPPING.keys()): - llm = openllm.Runner(model, - model_id=_FRAMEWORK_MAPPING[model], - ensure_available=True, - implementation=framework, - init_local=True, - ) - yield prompt, llm + if model not in _MODELING_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.") + backends: tuple[LiteralBackend, ...] 
= tuple() + if model in openllm.MODEL_MAPPING_NAMES: backends += ('pt',) + if model in openllm.MODEL_FLAX_MAPPING_NAMES: backends += ('flax',) + if model in openllm.MODEL_TF_MAPPING_NAMES: backends += ('tf',) + for backend, prompt in itertools.product(backends, _PROMPT_MAPPING.keys()): + yield prompt, openllm.Runner(model, + model_id=_MODELING_MAPPING[model], + ensure_available=True, + backend=backend, + init_local=True) def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: if os.getenv('GITHUB_ACTIONS') is None: diff --git a/openllm-python/tests/package_test.py b/openllm-python/tests/package_test.py index 1d1fc74f..17c121e6 100644 --- a/openllm-python/tests/package_test.py +++ b/openllm-python/tests/package_test.py @@ -4,6 +4,7 @@ import os import typing as t import pytest +import transformers import openllm @@ -28,7 +29,7 @@ def test_general_build_with_internal_testing(): bento = openllm.build('flan-t5', model_id=HF_INTERNAL_T5_TESTING) assert llm.llm_type == bento.info.labels['_type'] - assert llm.config['env']['framework_value'] == bento.info.labels['_framework'] + assert llm.config['env']['backend_value'] == bento.info.labels['_framework'] bento = openllm.build('flan-t5', model_id=HF_INTERNAL_T5_TESTING) assert len(bento_store.list(bento.tag)) == 1 @@ -38,10 +39,11 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory): local_path = tmp_path_factory.mktemp('local_t5') llm = openllm.AutoLLM.for_model('flan-t5', model_id=HF_INTERNAL_T5_TESTING, ensure_available=True) - if llm.bettertransformer: - llm.__llm_model__ = llm.model.reverse_bettertransformer() - - llm.save_pretrained(local_path) + if isinstance(llm.model, transformers.Pipeline): + llm.model.save_pretrained(str(local_path)) + else: + llm.model.save_pretrained(str(local_path)) + llm.tokenizer.save_pretrained(str(local_path)) assert openllm.build('flan-t5', model_id=local_path.resolve().__fspath__(), model_version='local') diff --git a/pyproject.toml b/pyproject.toml index 
462251c9..fe922a8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -266,10 +266,6 @@ USE_TABS = false BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 1 BLANK_LINES_BETWEEN_TOP_LEVEL_IMPORTS_AND_VARIABLES = 1 DISABLE_ENDING_COMMA_HEURISTIC = true -# DEDENT_CLOSING_BRACKETS = true -# INDENT_CLOSING_BRACKETS = false -# COALESCE_BRACKETS = true -# EACH_DICT_ENTRY_ON_SEPARATE_LINE = true # ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT = true # ALLOW_MULTILINE_DICTIONARY_KEYS = false # ALLOW_MULTILINE_LAMBDAS = false @@ -279,6 +275,10 @@ DISABLE_ENDING_COMMA_HEURISTIC = true # BLANK_LINE_BEFORE_CLASS_DOCSTRING = false # BLANK_LINE_BEFORE_MODULE_DOCSTRING = false # BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false +# DEDENT_CLOSING_BRACKETS = true +# INDENT_CLOSING_BRACKETS = false +# COALESCE_BRACKETS = true +# EACH_DICT_ENTRY_ON_SEPARATE_LINE = true # CONTINUATION_ALIGN_STYLE = "SPACE" # INDENT_BLANK_LINES = false # NO_SPACES_AROUND_SELECTED_BINARY_OPERATORS = true diff --git a/tools/update-config-stubs.py b/tools/update-config-stubs.py index 27e1225d..1daf0ab2 100755 --- a/tools/update-config-stubs.py +++ b/tools/update-config-stubs.py @@ -50,42 +50,32 @@ _value_docstring = { ```bash openllm start gpt-neox --model-id stabilityai/stablelm-tuned-alpha-3b ```''', - 'default_implementation': - '''The default runtime to run this LLM. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. - - It is a dictionary of key as the accelerator spec in k4s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM Runtime ('flax', 'tf', 'pt', 'vllm') - ''', + 'default_backend': + '''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. 
It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')''', 'url': - '''The resolved url for this LLMConfig.''', + 'The resolved url for this LLMConfig.', 'requires_gpu': - '''Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU.''', + 'Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU.', 'trust_remote_code': - '''Whether to always trust remote code''', + 'Whether to always trust remote code', 'service_name': - """Generated service name for this LLMConfig. By default, it is 'generated_{model_name}_service.py'""", + "Generated service name for this LLMConfig. By default, it is \"generated_{model_name}_service.py\"", 'requirements': - '''The default PyPI requirements needed to run this given LLM. By default, we will depend on - bentoml, torch, transformers.''', - 'bettertransformer': - '''Whether to use BetterTransformer for this given LLM. This depends per model architecture. By default, we will use BetterTransformer for T5 and StableLM models, and set to False for every other models.''', + 'The default PyPI requirements needed to run this given LLM. By default, we will depend on bentoml, torch, transformers.', 'model_type': - '''The model type for this given LLM. By default, it should be causal language modeling. - Currently supported 'causal_lm' or 'seq2seq_lm' - ''', - 'runtime': - '''The runtime to use for this model. Possible values are `transformers` or `ggml`. See Llama for more information.''', + 'The model type for this given LLM. By default, it should be causal language modeling. Currently supported "causal_lm" or "seq2seq_lm"', 'name_type': '''The default name typed for this model. "dasherize" will convert the name to lowercase and replace spaces with dashes. 
"lowercase" will convert the name to lowercase. If this is not set, then both `model_name` and `start_name` must be specified.''', 'model_name': - '''The normalized version of __openllm_start_name__, determined by __openllm_name_type__''', + 'The normalized version of __openllm_start_name__, determined by __openllm_name_type__', 'start_name': - '''Default name to be used with `openllm start`''', + 'Default name to be used with `openllm start`', 'env': - '''A EnvVarMixin instance for this LLMConfig.''', + 'A EnvVarMixin instance for this LLMConfig.', 'timeout': - '''The default timeout to be set for this given LLM.''', + 'The default timeout to be set for this given LLM.', 'workers_per_resource': '''The number of workers per resource. This is used to determine the number of workers to use for this model. For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then @@ -97,9 +87,9 @@ _value_docstring = { By default, it is set to 1. ''', 'fine_tune_strategies': - '''The fine-tune strategies for this given LLM.''', + 'The fine-tune strategies for this given LLM.', 'tokenizer_class': - '''Optional tokenizer class for this given LLM. See Llama for example.''', + 'Optional tokenizer class for this given LLM. 
See Llama for example.', } _transformed = {'fine_tune_strategies': 't.Dict[AdapterType, FineTuneConfig]'} @@ -125,7 +115,7 @@ def main() -> int: config_attr_lines.extend([ ' ' * 4 + line for line in [ f'__openllm_{keys}__: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))} = Field(None)\n', - f'"""{_value_docstring[keys]}"""\n', + f"'''{_value_docstring[keys]}'''\n", ] ]) # NOTE: inline runtime __getitem__ overload process @@ -135,7 +125,7 @@ def main() -> int: lines.extend([ ' ' * 2 + line for line in [ '@overload\n', - f'def __getitem__(self, item: t.Literal["{keys}"]) -> {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}: ...\n', + f"def __getitem__(self, item: t.Literal['{keys}']) -> {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}: ...\n", ] ]) # special case variables: generation_class, extras, sampling_class @@ -143,10 +133,10 @@ def main() -> int: lines.extend([ ' ' * 2 + line for line in [ '@overload\n', - 'def __getitem__(self, item: t.Literal["generation_class"]) -> t.Type[openllm_core.GenerationConfig]: ...\n', + "def __getitem__(self, item: t.Literal['generation_class']) -> t.Type[openllm_core.GenerationConfig]: ...\n", '@overload\n', - 'def __getitem__(self, item: t.Literal["sampling_class"]) -> t.Type[openllm_core.SamplingParams]: ...\n', - '@overload\n', 'def __getitem__(self, item: t.Literal["extras"]) -> t.Dict[str, t.Any]: ...\n', + "def __getitem__(self, item: t.Literal['sampling_class']) -> t.Type[openllm_core.SamplingParams]: ...\n", + '@overload\n', "def __getitem__(self, item: t.Literal['extras']) -> t.Dict[str, t.Any]: ...\n", ] ]) lines.append(' ' * 2 + '# NOTE: GenerationConfig arguments\n') @@ -154,20 +144,20 @@ def main() -> int: for keys, type_pep563 in generation_config_anns.items(): lines.extend([ ' ' * 2 + line - for line in ['@overload\n', f'def __getitem__(self, item: t.Literal["{keys}"]) -> {type_pep563}: ...\n'] + for line in ['@overload\n', f"def 
__getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n"] ]) lines.append(' ' * 2 + '# NOTE: SamplingParams arguments\n') for keys, type_pep563 in codegen.get_annotations(SamplingParams).items(): if keys not in generation_config_anns: lines.extend([ ' ' * 2 + line - for line in ['@overload\n', f'def __getitem__(self, item: t.Literal["{keys}"]) -> {type_pep563}: ...\n',] + for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n",] ]) lines.append(' ' * 2 + '# NOTE: PeftType arguments\n') for keys in PeftType._member_names_: lines.extend([ ' ' * 2 + line for line in - ['@overload\n', f'def __getitem__(self, item: t.Literal["{keys.lower()}"]) -> dict[str, t.Any]: ...\n',] + ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys.lower()}']) -> dict[str, t.Any]: ...\n",] ]) processed = processed[:start_attrs_idx] + [ diff --git a/tools/update-dummy.py b/tools/update-dummy.py index 76f86342..104430de 100755 --- a/tools/update-dummy.py +++ b/tools/update-dummy.py @@ -7,7 +7,7 @@ _ROOT = Path(__file__).parent.parent sys.path.insert(0, (_ROOT / 'openllm-core' / 'src').__fspath__()) sys.path.insert(1, (_ROOT / 'openllm-python' / 'src').__fspath__()) -from openllm_core._configuration import LiteralRuntime +from openllm_core._typing_compat import LiteralBackend from openllm.models import auto from openllm import CONFIG_MAPPING @@ -17,31 +17,31 @@ config_requirements = { k: [_.replace('-', '_') for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k, v in CONFIG_MAPPING.items() } -_dependencies: dict[LiteralRuntime, str] = { - k: v for k, v in zip(LiteralRuntime.__args__, ('torch', 'tensorflow', 'flax', 'vllm')) +_dependencies: dict[LiteralBackend, str] = { + k: v for k, v in zip(LiteralBackend.__args__[:-2], ('torch', 'tensorflow', 'flax', 'vllm')) } _auto: dict[str, str] = { - k: v for k, v in zip(LiteralRuntime.__args__, ('AutoLLM', 'AutoTFLLM', 'AutoFlaxLLM', 'AutoVLLM')) + k: v 
for k, v in zip(LiteralBackend.__args__[:-2], ('AutoLLM', 'AutoTFLLM', 'AutoFlaxLLM', 'AutoVLLM')) } -def get_target_dummy_file(framework: LiteralRuntime) -> Path: - return _ROOT / 'openllm-python' / 'src' / 'openllm' / 'utils' / f'dummy_{framework}_objects.py' +def get_target_dummy_file(backend: LiteralBackend) -> Path: + return _ROOT / 'openllm-python' / 'src' / 'openllm' / 'utils' / f'dummy_{backend}_objects.py' -def mapping_names(framework: LiteralRuntime): - return 'MODEL_MAPPING_NAMES' if framework == 'pt' else f'MODEL_{framework.upper()}_MAPPING_NAMES' +def mapping_names(backend: LiteralBackend): + return 'MODEL_MAPPING_NAMES' if backend == 'pt' else f'MODEL_{backend.upper()}_MAPPING_NAMES' -def get_mapping(framework: LiteralRuntime) -> OrderedDict[t.Any, t.Any]: - return getattr(auto, mapping_names(framework)) +def get_mapping(backend: LiteralBackend) -> OrderedDict[t.Any, t.Any]: + return getattr(auto, mapping_names(backend)) -def make_class_stub(model_name: str, framework: LiteralRuntime, indentation: int = 2, auto: bool = False) -> list[str]: +def make_class_stub(model_name: str, backend: LiteralBackend, indentation: int = 2, auto: bool = False) -> list[str]: _dep_list: list[str] = [ f'"{v}"' for v in [ - _dependencies[framework], *(t.cast(t.List[str], config_requirements[model_name]) - if model_name != '__default__' and config_requirements[model_name] else []) + _dependencies[backend], *(t.cast(t.List[str], config_requirements[model_name] + ) if model_name != '__default__' and config_requirements[model_name] else []) ] ] - if auto: cl_ = _auto[framework] - else: cl_ = get_mapping(framework)[model_name] + if auto: cl_ = _auto[backend] + else: cl_ = get_mapping(backend)[model_name] lines = [ f'class {cl_}(metaclass=_DummyMetaclass):', ' ' * indentation + f"_backends=[{','.join(_dep_list)}]", ' ' * indentation + @@ -49,28 +49,28 @@ def make_class_stub(model_name: str, framework: LiteralRuntime, indentation: int ] return lines -def write_stub(framework: 
LiteralRuntime, _path: str) -> list[str]: +def write_stub(backend: LiteralBackend, _path: str) -> list[str]: base = [ f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!', f'# To update this, run ./{_path}', 'from __future__ import annotations', 'import typing as _t', 'from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends', ] - base.extend([v for it in [make_class_stub(k, framework) for k in get_mapping(framework)] for v in it]) + base.extend([v for it in [make_class_stub(k, backend) for k in get_mapping(backend)] for v in it]) # autoclass - base.extend(make_class_stub('__default__', framework, auto=True)) + base.extend(make_class_stub('__default__', backend, auto=True)) # mapping and export - _imports = [f'"{v}"' for v in get_mapping(framework).values()] + _imports = [f'"{v}"' for v in get_mapping(backend).values()] base += [ - f'{mapping_names(framework)}:_t.Any=None', - f"__all__:list[str]=[\"{mapping_names(framework)}\",\"{_auto[framework]}\",{','.join(_imports)}]\n" + f'{mapping_names(backend)}:_t.Any=None', + f"__all__:list[str]=[\"{mapping_names(backend)}\",\"{_auto[backend]}\",{','.join(_imports)}]\n" ] return base def main() -> int: _path = os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__)) - for framework in _dependencies: - with get_target_dummy_file(framework).open('w') as f: - f.write('\n'.join(write_stub(framework, _path))) + for backend in _dependencies: + with get_target_dummy_file(backend).open('w') as f: + f.write('\n'.join(write_stub(backend, _path))) return 0 if __name__ == '__main__': raise SystemExit(main())