diff --git a/src/openllm/_configuration.py b/src/openllm/_configuration.py index 3aa8c435..28dc39c3 100644 --- a/src/openllm/_configuration.py +++ b/src/openllm/_configuration.py @@ -38,7 +38,6 @@ class FlanT5Config(openllm.LLMConfig): """ from __future__ import annotations -import copy import os import types import typing as t @@ -55,7 +54,7 @@ from click_option_group import optgroup import openllm from .exceptions import GpuNotAvailableError, OpenLLMException -from .utils import _object_setattr +from .utils import LazyType from .utils.dantic import allows_multiple, parse_default if t.TYPE_CHECKING: @@ -70,15 +69,20 @@ if t.TYPE_CHECKING: import transformers from pydantic.fields import FieldInfo from transformers.generation.beam_constraints import Constraint + + DictStrAny = dict[str, t.Any] else: from transformers.utils.dummy_pt_objects import Constraint + DictStrAny = dict transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers") torch = openllm.utils.LazyLoader("torch", globals(), "torch") tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow") __all__ = ["LLMConfig", "ModelSignature"] +_object_setattr = object.__setattr__ + def field_to_options( name: str, field: FieldInfo, model_name: str, suffix_generation: bool = False @@ -87,6 +91,7 @@ def field_to_options( envvar = field.json_schema_extra.get("env") if field.json_schema_extra else None dasherized = inflection.dasherize(name) underscored = inflection.underscore(name) + full_option_name = f"--{dasherized}" if field.annotation is bool: full_option_name += f"/--no-{dasherized}" @@ -101,7 +106,7 @@ def field_to_options( type=field.annotation, required=field.is_required(), default=parse_default(field.default, field.annotation), - show_default=False, + show_default=True if field.default else False, multiple=allows_multiple(field.annotation), help=field.description, show_envvar=True if envvar else False, @@ -109,14 +114,13 @@ def field_to_options( ) -def generate_kwargs_from_envvar(model: GenerationConfig | LLMConfig) -> dict[str, str]: - kwargs: dict[str, t.Any] = {} - for key, field in model.model_fields.items(): - if field.json_schema_extra is not None: - if "env" not in field.json_schema_extra: - raise RuntimeError(f"Invalid {model} passed. Only accept LLMConfig or LLMConfig.generation_config") - kwargs[key] = os.environ.get(field.json_schema_extra["env"], field.default) - return {k: v for k, v in kwargs.items() if v is not None} +def generate_kwargs_from_envvar(model: GenerationConfig | LLMConfig) -> dict[str, t.Any]: + # NOTE: We can safe cast here since all of the fields in GenerationConfig or LLMConfig + # will have a `env` field in `json_schema_extra` + return { + key: os.environ.get(t.cast("dict[str, t.Any]", field.json_schema_extra)["env"], field.default) + for key, field in model.model_fields.items() + } class GenerationConfig(pydantic.BaseModel): @@ -136,7 +140,7 @@ class GenerationConfig(pydantic.BaseModel): description="""The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.""", ) - min_new_tokens: t.Optional[int] = pydantic.Field( + min_new_tokens: int = pydantic.Field( None, description="The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt." 
) early_stopping: bool = pydantic.Field( @@ -150,7 +154,7 @@ class GenerationConfig(pydantic.BaseModel): (canonical beam search algorithm) """, ) - max_time: t.Optional[float] = pydantic.Field( + max_time: float = pydantic.Field( None, description="""The maximum amount of time you allow the computation to run for in seconds. generation will still finish the current pass after allocated time has been passed.""", @@ -163,7 +167,7 @@ class GenerationConfig(pydantic.BaseModel): description="""Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.""", ) - penalty_alpha: t.Optional[float] = pydantic.Field( + penalty_alpha: float = pydantic.Field( None, description="""The values balance the model confidence and the degeneration penalty in contrastive search decoding.""", @@ -242,14 +246,15 @@ class GenerationConfig(pydantic.BaseModel): no_repeat_ngram_size: int = pydantic.Field( 0, description="If set to int > 0, all ngrams of that size can only occur once." ) - bad_words_ids: t.Optional[t.List[t.List[int]]] = pydantic.Field( + bad_words_ids: t.List[t.List[int]] = pydantic.Field( None, description="""List of token ids that are not allowed to be generated. In order to get the token ids of the words that should not appear in the generated text, use `tokenizer(bad_words, add_prefix_space=True, add_special_tokens=False).input_ids`. """, ) - force_words_ids: t.Optional[t.Union[t.List[t.List[int]], t.List[t.List[t.List[int]]]]] = pydantic.Field( + # NOTE: t.Union is not yet supported on CLI, but the environment variable should already be available. + force_words_ids: t.Union[t.List[t.List[int]], t.List[t.List[t.List[int]]]] = pydantic.Field( None, description="""List of token ids that must be generated. If given a `List[List[int]]`, this is treated as a simple list of words that must be included, the opposite to `bad_words_ids`. @@ -265,13 +270,13 @@ class GenerationConfig(pydantic.BaseModel): algorithms suppose the score logits are normalized but some logit processors or warpers break the normalization. """, ) - constraints: t.Optional[t.List["Constraint"]] = pydantic.Field( + constraints: t.List["Constraint"] = pydantic.Field( None, description="""Custom constraints that can be added to the generation to ensure that the output will contain the use of certain tokens as defined by ``Constraint`` objects, in the most sensible way possible. """, ) - forced_bos_token_id: t.Optional[int] = pydantic.Field( + forced_bos_token_id: int = pydantic.Field( None, description="""The id of the token to force as the first generated token after the ``decoder_start_token_id``. Useful for multilingual models like @@ -279,7 +284,7 @@ class GenerationConfig(pydantic.BaseModel): to be the target language token. """, ) - forced_eos_token_id: t.Optional[t.Union[int, t.List[int]]] = pydantic.Field( + forced_eos_token_id: t.Union[int, t.List[int]] = pydantic.Field( None, description="""The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a list to set multiple *end-of-sequence* tokens.""", @@ -289,26 +294,26 @@ class GenerationConfig(pydantic.BaseModel): description="""Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash. 
Note that using `remove_invalid_values` can slow down generation.""", ) - exponential_decay_length_penalty: t.Optional[t.Tuple[int, float]] = pydantic.Field( + exponential_decay_length_penalty: t.Tuple[int, float] = pydantic.Field( None, description="""This tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty starts and `decay_factor` represents the factor of exponential decay """, ) - suppress_tokens: t.Optional[t.List[int]] = pydantic.Field( + suppress_tokens: t.List[int] = pydantic.Field( None, description="""A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their log probs to `-inf` so that they are not sampled. """, ) - begin_suppress_tokens: t.Optional[t.List[int]] = pydantic.Field( + begin_suppress_tokens: t.List[int] = pydantic.Field( None, description="""A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled. """, ) - forced_decoder_ids: t.Optional[t.List[t.List[int]]] = pydantic.Field( + forced_decoder_ids: t.List[t.List[int]] = pydantic.Field( None, description="""A list of pairs of integers which indicates a mapping from generation indices to token indices that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always @@ -338,9 +343,9 @@ class GenerationConfig(pydantic.BaseModel): ) # NOTE: Special tokens that can be used at generation time - pad_token_id: t.Optional[int] = pydantic.Field(None, description="The id of the *padding* token.") - bos_token_id: t.Optional[int] = pydantic.Field(None, description="The id of the *beginning-of-sequence* token.") - eos_token_id: t.Optional[t.Union[int, t.List[int]]] = pydantic.Field( + pad_token_id: int = pydantic.Field(None, description="The id of the *padding* token.") + bos_token_id: int = pydantic.Field(None, description="The id of the *beginning-of-sequence* token.") + eos_token_id: t.Union[int, t.List[int]] = pydantic.Field( None, description="""The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.""", @@ -353,7 +358,7 @@ class GenerationConfig(pydantic.BaseModel): `encoder_input_ids` cannot occur in the `decoder_input_ids`. """, ) - decoder_start_token_id: t.Optional[int] = pydantic.Field( + decoder_start_token_id: int = pydantic.Field( None, description="""If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token. @@ -361,7 +366,7 @@ class GenerationConfig(pydantic.BaseModel): ) # NOTE: pydantic definition - model_config = dict(arbitrary_types_allowed=True, extra="forbid") + model_config = {"extra": "forbid", "arbitrary_types_allowed": True} if t.TYPE_CHECKING: # The following is handled via __pydantic_init_subclass__ @@ -395,28 +400,11 @@ class GenerationConfig(pydantic.BaseModel): # NOTE: I don't know how to do this more efficiently in pydantic v2 yet, will probably # need to consult the pydantic team on this. 
for key, field in self.model_fields.items(): - json_schema: dict[str, t.Any] = ( - copy.deepcopy(field.json_schema_extra) if field.json_schema_extra is not None else {} - ) - env_key = f"OPENLLM_{self.__openllm_env_name__}_GENERATION_{key.upper()}" - if "env" in json_schema: - field.default = os.environ.get(json_schema["env"], field.default) + if not field.json_schema_extra: + field.json_schema_extra = {} + if "env" in field.json_schema_extra: continue - json_schema["env"] = env_key - # then assign json_schema back to field - field.json_schema_extra = json_schema - field.default = os.environ.get(env_key, field.default) - - def to_click_options(self, f: F[P]) -> t.Callable[[t.Callable[..., t.Any]], click.Command]: - for name, field in self.model_fields.items(): - if t.get_origin(field.annotation) is t.Union: - # NOTE: Union type is currently not yet supported, we probably just need to use environment instead. - continue - f = field_to_options(name, field, self.__openllm_model_name__, suffix_generation=True)(f) - return optgroup.group( - f"{self.__class__.__name__} generation options", - help=f"[Auto-generated from '{self.__class__.__qualname__}']", - )(f) + field.json_schema_extra["env"] = f"OPENLLM_{self.__openllm_env_name__}_GENERATION_{key.upper()}" class LLMConfig(pydantic.BaseModel, ABC): @@ -428,23 +416,15 @@ class LLMConfig(pydantic.BaseModel, ABC): return getattr(self.generation_config, attr) return getattr(self, attr) - def __repr_args__(self) -> ReprArgs: - """Overwrite from default BaseModel and don't show __pydantic_extra__.""" - yield from ( - (k, v) - for k, v in self.__dict__.items() - if not k.startswith("_") and (k not in self.model_fields or self.model_fields[k].repr) - ) - yield from ((k, getattr(self, k)) for k, v in self.model_computed_fields.items() if v.repr) - if t.TYPE_CHECKING: # The following is handled via __pydantic_init_subclass__, and is only used for TYPE_CHECKING - __openllm_model_name__: str = "" - __openllm_start_name__: str = "" - __openllm_timeout__: int = 0 + __openllm_model_name__: str + __openllm_start_name__: str + __openllm_timeout__: int = 3600 __openllm_name_type__: t.Literal["dasherize", "lowercase"] = "dasherize" __openllm_trust_remote_code__: bool = False __openllm_requires_gpu__: bool = False + __openllm_env__: openllm.utils.ModelEnv GenerationConfig: type[t.Any] = GenerationConfig def __init_subclass__( @@ -488,6 +468,8 @@ class LLMConfig(pydantic.BaseModel, ABC): cls.__openllm_model_name__ = cls.__name__.replace("Config", "").lower() cls.__openllm_start_name__ = cls.__openllm_model_name__ + cls.__openllm_env__ = openllm.utils.ModelEnv(cls.__openllm_model_name__) + if hasattr(cls, "GenerationConfig"): cls.generation_config = t.cast( "type[GenerationConfig]", @@ -508,12 +490,10 @@ class LLMConfig(pydantic.BaseModel, ABC): def model_post_init(self, _: t.Any): if self.__pydantic_extra__: - generation_config = self.__pydantic_extra__.pop("generation_config", None) + generation_config: dict[str, t.Any] | None = self.__pydantic_extra__.pop("generation_config", None) if generation_config is not None: - assert isinstance(generation_config, dict), "generation_config must be a dict." - self.generation_config = self.generation_config.model_copy( - update=t.cast("dict[str, t.Any]", generation_config), deep=True - ) + assert LazyType[DictStrAny](dict).isinstance(generation_config), "generation_config must be a dict." 
+ self.generation_config = self.generation_config.model_copy(update=generation_config, deep=True) else: # The rest of the extras fields should just be the generation_config. self.generation_config = self.generation_config.model_copy(update=self.__pydantic_extra__, deep=True) @@ -551,64 +531,48 @@ class LLMConfig(pydantic.BaseModel, ABC): except pydantic.ValidationError as e: raise openllm.exceptions.ValidationError(f"Failed to dump configuration to dict: {e}") from e - def with_options(self, __llm_config__: LLMConfig | None = None, **attrs: t.Any) -> LLMConfig: + @classmethod + def model_construct_env(cls, __llm_config__: LLMConfig | None = None, **attrs: t.Any) -> LLMConfig: """A helpers that respect configuration values that sets from environment variables for any given configuration class. """ - from_env_ = self.from_env() - # filtered out None values - attrs = {k: v for k, v in attrs.items() if v is not None} - generation_keys = {k for k in attrs if k in self.generation_config.model_fields} - - generation_attrs = {k: v for k, v in attrs.items() if k in generation_keys} - config_attrs = {k: v for k, v in attrs.items() if k not in generation_keys} - - # NOTE: first set the default config kwargs. - # We will always respect envvar as default, then the one that is pass - attrs = {**generate_kwargs_from_envvar(self), **config_attrs} + env_json_string = os.environ.get(cls.__openllm_env__.model_config, None) + if env_json_string is not None: + try: + self = cls.model_construct(**orjson.loads(env_json_string)) + except pydantic.ValidationError as e: + raise RuntimeError(f"Failed to parse '{cls.__openllm_env__.model_config}' as valid JSON string.") from e + else: + self = cls.model_construct() if __llm_config__ is not None: # NOTE: Only hit this branch on the server. Client shouldn't use __llm_config__ - attrs = {**attrs, **__llm_config__.model_dump()} + # as it is not set. 
+ return self.model_construct(**__llm_config__.model_dump(flatten=True)) - # NOTE: Then we setup generation config values - attrs["generation_config"] = { - **generate_kwargs_from_envvar(self.generation_config), - **attrs.get("generation_config", {}), - **generation_attrs, - } + # filtered out None values + attrs = {k: v for k, v in attrs.items() if v is not None} - if from_env_: - return from_env_.model_construct(**attrs) - return self.model_construct(**attrs) + construct_attrs = generate_kwargs_from_envvar(self) + construct_attrs.update(generate_kwargs_from_envvar(self.generation_config)) + construct_attrs.update(attrs) - @classmethod - def from_env(cls) -> LLMConfig | None: - envvar = openllm.utils.MODEL_CONFIG_ENV_VAR(cls.__openllm_model_name__) - env_json_string = os.environ.get(envvar, None) - if env_json_string is None: - return - - try: - return cls.model_construct(**orjson.loads(env_json_string)) - except pydantic.ValidationError as e: - raise RuntimeError(f"Failed to parse environment variable '{envvar}' as a valid JSON string.") from e + return self.model_construct(**construct_attrs) def model_validate_click(self, **attrs: t.Any) -> tuple[LLMConfig, dict[str, t.Any]]: """Parse given click attributes into a LLMConfig and return the remaining click attributes.""" - llm_config_attrs = { - k[len(self.__openllm_model_name__) + 1 :]: v - for k, v in attrs.items() - if k[len(self.__openllm_model_name__) + 1 :] in self.model_fields - } - llm_config_attrs["generation_config"] = { - k[len(self.__openllm_model_name__ + "_generation") + 1 :]: v - for k, v in attrs.items() - if k[len(self.__openllm_model_name__ + "_generation") + 1 :] in self.generation_config.model_fields - } - return self.with_options(**llm_config_attrs), { - k: v for k, v in attrs.items() if not k.startswith(self.__openllm_model_name__) - } + llm_config_attrs = {} + key_to_remove: list[str] = [] + + for k, v in attrs.items(): + if k.startswith(f"{self.__openllm_model_name__}_"): + llm_config_attrs[k[len(self.__openllm_model_name__) + 1 :]] = v + key_to_remove.append(k) + elif k.startswith(f"{self.__openllm_model_name__}_generation_"): + llm_config_attrs[k[len(self.__openllm_model_name__ + "_generation") + 1 :]] = v + key_to_remove.append(k) + + return self.model_construct_env(**llm_config_attrs), {k: v for k, v in attrs.items() if k not in key_to_remove} @t.overload def to_generation_config(self, return_as_dict: t.Literal[True] = ...) -> dict[str, t.Any]: @@ -627,17 +591,25 @@ class LLMConfig(pydantic.BaseModel, ABC): return config - def to_click_options(self, f: F[P]) -> t.Callable[[t.Callable[..., t.Any]], click.Command]: + def to_click_options(self, f: F[P]) -> t.Callable[[F[P]], click.Command]: """ Convert current model to click options. This can be used as a decorator for click commands. Note that the identifier for all LLMConfig will be prefixed with '_*', and the generation config will be prefixed with '_generation_*'. """ - wrapped_generation = self.generation_config.to_click_options(f) + + for name, field in self.generation_config.model_fields.items(): + if t.get_origin(field.annotation) is t.Union: + # NOTE: Union type is currently not yet supported, we probably just need to use environment instead. 
+ continue + f = field_to_options(name, field, self.__openllm_model_name__, suffix_generation=True)(f) + f = optgroup.group(f"{self.__class__.__name__} generation options")(f) + if len(self.model_fields.values()) == 0: - return wrapped_generation + return f for name, field in self.model_fields.items(): - wrapped_generation = field_to_options(name, field, self.__openllm_model_name__)(wrapped_generation) - return optgroup.group( - f"{self.__class__.__name__} options", help=f"[Auto-generated from '{self.__class__.__qualname__}']" - )(wrapped_generation) + if t.get_origin(field.annotation) is t.Union: + # NOTE: Union type is currently not yet supported, we probably just need to use environment instead. + continue + f = field_to_options(name, field, self.__openllm_model_name__)(f) + return optgroup.group(f"{self.__class__.__name__} options")(f) diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py index a9a99021..5b8044a9 100644 --- a/src/openllm/_llm.py +++ b/src/openllm/_llm.py @@ -440,8 +440,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass): # NOTE: The section below defines a loose contract with langchain's LLM interface. @property def llm_type(self) -> str: - assert self.default_model is not None - return openllm.utils.convert_transformers_model_name(self.default_model) + return openllm.utils.convert_transformers_model_name(self._pretrained) @property def identifying_params(self) -> dict[str, t.Any]: @@ -637,10 +636,10 @@ def Runner(start_name: str, **attrs: t.Any) -> bentoml.Runner: behaviour """ init_local = attrs.pop("init_local", False) - envvar = openllm.utils.get_framework_env(start_name) - if envvar == "flax": + ModelEnv = openllm.utils.ModelEnv(start_name) + if ModelEnv.get_framework_env() == "flax": runner = openllm.AutoFlaxLLM.create_runner(start_name, **attrs) - elif envvar == "tf": + elif ModelEnv.get_framework_env() == "tf": runner = openllm.AutoTFLLM.create_runner(start_name, **attrs) else: runner = openllm.AutoLLM.create_runner(start_name, **attrs) diff --git a/src/openllm/_package.py b/src/openllm/_package.py index 1255c7c2..c3649133 100644 --- a/src/openllm/_package.py +++ b/src/openllm/_package.py @@ -70,6 +70,8 @@ def build_editable(path: str) -> str | None: def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions: packages: list[str] = [] + + ModelEnv = openllm.utils.ModelEnv(llm.__openllm_start_name__) if llm.requirements is not None: packages.extend(llm.requirements) @@ -89,11 +91,9 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions: ] ) - to_use_framework = utils.get_framework_env(llm.__openllm_start_name__) + to_use_framework = ModelEnv.get_framework_env() if to_use_framework == "flax": - assert ( - utils.is_flax_available() - ), f"Flax is not available, while {utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__)} is set to 'flax'" + assert utils.is_flax_available(), f"Flax is not available, while {ModelEnv.framework} is set to 'flax'" packages.extend( [ f"flax>={importlib.metadata.version('flax')}", @@ -102,9 +102,7 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions: ] ) elif to_use_framework == "tf": - assert ( - utils.is_tf_available() - ), f"TensorFlow is not available, while {utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__)} is set to 'tf'" + assert utils.is_tf_available(), f"TensorFlow is not available, while {ModelEnv.framework} is set to 'tf'" candidates = ( "tensorflow", "tensorflow-cpu", @@ -137,11 +135,12 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> 
PythonOptions: return PythonOptions(packages=packages, wheels=wheels, lock_packages=True) -def construct_docker_options(llm: openllm.LLM, llm_fs: FS) -> DockerOptions: +def construct_docker_options(llm: openllm.LLM, _: FS) -> DockerOptions: + ModelEnv = openllm.utils.ModelEnv(llm.__openllm_start_name__) return DockerOptions( cuda_version="11.6", # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version env={ - utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__): utils.get_framework_env(llm.__openllm_start_name__), + ModelEnv.framework: ModelEnv.get_framework_env(), "OPENLLM_MODEL": llm.config.__openllm_model_name__, }, system_packages=["git"], @@ -165,14 +164,16 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be current_model_envvar = os.environ.pop("OPENLLM_MODEL", None) _previously_built = False - logger.debug("Packing '%s' into a Bento with kwargs=%s...", model_name, attrs) + ModelEnv = openllm.utils.ModelEnv(model_name) + + logger.info("Packing '%s' into a Bento with kwargs=%s...", model_name, attrs) # NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError # during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path try: os.environ["OPENLLM_MODEL"] = inflection.underscore(model_name) - to_use_framework = openllm.utils.get_framework_env(model_name) + to_use_framework = ModelEnv.get_framework_env() if to_use_framework == "flax": llm = openllm.AutoFlaxLLM.for_model(model_name, **attrs) elif to_use_framework == "tf": diff --git a/src/openllm/_service.py b/src/openllm/_service.py index 8385cd90..8979facf 100644 --- a/src/openllm/_service.py +++ b/src/openllm/_service.py @@ -30,7 +30,7 @@ svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", r route="/v1/generate", ) async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput: - config = llm_config.with_options(__llm_config__=qa.llm_config).model_dump() + config = llm_config.model_construct_env(__llm_config__=qa.llm_config).model_dump() responses = await runner.generate.async_run(qa.prompt, **config) return openllm.GenerationOutput(responses=responses, configuration=config) @@ -39,5 +39,5 @@ async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput: def metadata_v1(_: str) -> dict[str, t.Any]: return { "model_name": llm_config.__openllm_model_name__, - "framework": openllm.utils.get_framework_env(llm_config.__openllm_model_name__), + "framework": llm_config.__openllm_env__.get_framework_env(), } diff --git a/src/openllm/cli.py b/src/openllm/cli.py index 7241dfba..ded1313a 100644 --- a/src/openllm/cli.py +++ b/src/openllm/cli.py @@ -314,7 +314,7 @@ def start_model_command( Note that the internal commands will return the llm_config and a boolean determine whether the server is run with GPU or not. 
""" - envvar = openllm.utils.get_framework_env(model_name) + ModelEnv = openllm.utils.ModelEnv(model_name) model_command_decr: dict[str, t.Any] = { "name": inflection.underscore(model_name), "context_settings": _context_settings or {}, @@ -330,16 +330,15 @@ def start_model_command( { "name": config.__openllm_model_name__, "short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)", - "help": getattr( - openllm.utils.get_lazy_module(model_name), - f"START_{inflection.underscore(model_name).upper()}_COMMAND_DOCSTRING", - ), + "help": ModelEnv.start_docstring, "aliases": aliases if len(aliases) > 0 else None, } ) + gpu_available = False try: - config.check_if_gpu_is_available(envvar) + config.check_if_gpu_is_available(ModelEnv.get_framework_env()) + gpu_available = True except openllm.exceptions.GpuNotAvailableError: # NOTE: The model requires GPU, therefore we will return a dummy command model_command_decr.update( @@ -353,7 +352,7 @@ def start_model_command( @factory.command(**model_command_decr) def noop() -> openllm.LLMConfig: click.secho("No GPU available, therefore this command is disabled", fg="red") - openllm.utils.analytics.track_start_init(config, False) + openllm.utils.analytics.track_start_init(config, gpu_available) return config return noop @@ -371,15 +370,24 @@ def start_model_command( configure_logging() - updated_config, server_kwds = config.model_validate_click(**attrs) - openllm.utils.analytics.track_start_init(updated_config, False) + updated_config, server_attrs = config.model_validate_click(**attrs) - server_kwds.update({"working_dir": os.path.dirname(__file__)}) + # NOTE: check for GPU one more time in cases this model doesn't requires GPU but users can still + # run this model on GPU + try: + updated_config.check_if_gpu_is_available(ModelEnv.get_framework_env()) + gpu_available = True + except openllm.exceptions.GpuNotAvailableError: + gpu_available = False + + openllm.utils.analytics.track_start_init(updated_config, gpu_available) + + server_attrs.update({"working_dir": os.path.dirname(__file__)}) if _serve_grpc: - server_kwds["grpc_protocol_version"] = "v1" + server_attrs["grpc_protocol_version"] = "v1" # NOTE: currently, theres no development args in bentoml.Server. To be fixed upstream. 
- development = server_kwds.pop("development") - server_kwds.setdefault("production", not development) + development = server_attrs.pop("development") + server_attrs.setdefault("production", not development) start_env = os.environ.copy() @@ -395,17 +403,17 @@ def start_model_command( start_env.update( { - openllm.utils.FRAMEWORK_ENV_VAR(model_name): envvar, - openllm.utils.MODEL_CONFIG_ENV_VAR(model_name): updated_config.model_dump_json(), + ModelEnv.framework: ModelEnv.get_framework_env(), + ModelEnv.model_config: updated_config.model_dump_json(), "OPENLLM_MODEL": model_name, "BENTOML_DEBUG": str(get_debug_mode()), "BENTOML_CONFIG_OPTIONS": _bentoml_config_options, } ) - if envvar == "flax": + if ModelEnv.get_framework_env() == "flax": llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained) - elif envvar == "tf": + elif ModelEnv.get_framework_env() == "tf": llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained) else: llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained) @@ -416,7 +424,7 @@ def start_model_command( ) click.secho(f"Starting LLM Server for '{model_name}'\n", fg="blue") server_cls = getattr(bentoml, "HTTPServer" if not _serve_grpc else "GrpcServer") - server: bentoml.server.Server = server_cls("_service.py:svc", **server_kwds) + server: bentoml.server.Server = server_cls("_service.py:svc", **server_attrs) server.timeout = 90 try: @@ -449,8 +457,10 @@ def _start( _serve_grpc = attrs.pop("_serve_grpc", False) + ModelEnv = openllm.utils.ModelEnv(model_name) + if framework is not None: - os.environ[openllm.utils.FRAMEWORK_ENV_VAR(model_name)] = framework + os.environ[ModelEnv.framework] = framework start_model_command(model_name, t.cast(OpenLLMCommandGroup, cli), _serve_grpc=_serve_grpc)( standalone_mode=False, **attrs ) @@ -585,9 +595,12 @@ def list_supported_models(output: t.Literal["json", "pretty", "porcelain"]): except Exception as err: failed_initialized.append((m, err)) _console.print(table) - _console.print("\n[bold yellow] The following models are supported but failed to initialize:[/bold yellow]\n") - for m, err in failed_initialized: - _console.print(Text(f"- {m}: ") + Text(f"{err}\n", style="bold red")) + if len(failed_initialized) > 0: + _console.print( + "\n[bold yellow] The following models are supported but failed to initialize:[/bold yellow]\n" + ) + for m, err in failed_initialized: + _console.print(Text(f"- {m}: ") + Text(f"{err}\n", style="bold red")) elif output == "json": result_json: dict[str, dict[t.Literal["variants", "description"], t.Any]] = {} for m in models: diff --git a/src/openllm/models/auto/configuration_auto.py b/src/openllm/models/auto/configuration_auto.py index 9405a1c3..f7861af0 100644 --- a/src/openllm/models/auto/configuration_auto.py +++ b/src/openllm/models/auto/configuration_auto.py @@ -53,7 +53,7 @@ class _LazyConfigMapping(ConfigOrderedDict): value = self._mapping[key] module_name = inflection.underscore(key) if module_name not in self._modules: - self._modules[module_name] = openllm.utils.get_lazy_module(module_name) + self._modules[module_name] = openllm.utils.ModelEnv(module_name).module if hasattr(self._modules[module_name], value): return getattr(self._modules[module_name], value) @@ -93,10 +93,10 @@ class AutoConfig: raise EnvironmentError("Cannot instantiate Config. 
Please use `Config.for_model(model_name)` instead.") @classmethod - def for_model(cls, model_name: str, *args: t.Any, **attrs: t.Any) -> openllm.LLMConfig: + def for_model(cls, model_name: str, **attrs: t.Any) -> openllm.LLMConfig: model_name = inflection.underscore(model_name) if model_name in CONFIG_MAPPING: - return CONFIG_MAPPING[model_name]().with_options(*args, **attrs) + return CONFIG_MAPPING[model_name].model_construct_env(**attrs) raise ValueError( f"Unrecognized configuration class for {model_name}. " f"Model name should be one of {', '.join(CONFIG_MAPPING.keys())}." diff --git a/src/openllm/models/auto/factory.py b/src/openllm/models/auto/factory.py index 59481337..40a575cd 100644 --- a/src/openllm/models/auto/factory.py +++ b/src/openllm/models/auto/factory.py @@ -190,7 +190,7 @@ class _LazyAutoMapping(ConfigModelOrderedDict): def _load_attr_from_module(self, model_type: str, attr: str) -> t.Any: module_name = inflection.underscore(model_type) if module_name not in self._modules: - self._modules[module_name] = openllm.utils.get_lazy_module(module_name) + self._modules[module_name] = openllm.utils.ModelEnv(module_name).module return getattribute_from_module(self._modules[module_name], attr) def keys(self): diff --git a/src/openllm/models/chatglm/modeling_chatglm.py b/src/openllm/models/chatglm/modeling_chatglm.py index bb48e312..38d8d9b9 100644 --- a/src/openllm/models/chatglm/modeling_chatglm.py +++ b/src/openllm/models/chatglm/modeling_chatglm.py @@ -81,7 +81,7 @@ class ChatGLM(openllm.LLM): else: prompt_text = prompt - generation_config = self.config.with_options( + generation_config = self.config.model_construct_env( max_new_tokens=max_new_tokens, num_beams=num_beams, top_p=top_p, @@ -123,7 +123,7 @@ class ChatGLM(openllm.LLM): inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device) outputs = self.model.generate( **inputs, - generation_config=self.config.with_options( + generation_config=self.config.model_construct_env( max_new_tokens=max_new_tokens, num_beams=num_beams, top_p=top_p, diff --git a/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/src/openllm/models/dolly_v2/modeling_dolly_v2.py index 652847a5..5392464f 100644 --- a/src/openllm/models/dolly_v2/modeling_dolly_v2.py +++ b/src/openllm/models/dolly_v2/modeling_dolly_v2.py @@ -79,7 +79,7 @@ class DollyV2(openllm.LLM): ) -> tuple[str, dict[str, t.Any]]: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt) - generation_config = self.config.with_options( + generation_config = self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, @@ -112,7 +112,7 @@ class DollyV2(openllm.LLM): end_key_token_id = None eos_token_id = None - llm_config = self.config.with_options( + llm_config = self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, diff --git a/src/openllm/models/falcon/modeling_falcon.py b/src/openllm/models/falcon/modeling_falcon.py index 7edfa34d..9a544790 100644 --- a/src/openllm/models/falcon/modeling_falcon.py +++ b/src/openllm/models/falcon/modeling_falcon.py @@ -66,7 +66,7 @@ class Falcon(openllm.LLM): eos_token_id: int | None = None, **attrs: t.Any, ) -> tuple[str, dict[str, t.Any]]: - generation_config = self.config.with_options( + generation_config = self.config.model_construct_env( max_new_tokens=max_new_tokens, top_k=top_k, num_return_sequences=num_return_sequences, @@ -95,7 +95,7 @@ class Falcon(openllm.LLM): return self.model( prompt, do_sample=True, - 
generation_config=self.config.with_options( + generation_config=self.config.model_construct_env( max_new_tokens=max_new_tokens, top_k=top_k, num_return_sequences=num_return_sequences, diff --git a/src/openllm/models/flan_t5/modeling_flan_t5.py b/src/openllm/models/flan_t5/modeling_flan_t5.py index 40c90477..fc578168 100644 --- a/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -48,7 +48,7 @@ class FlanT5(openllm.LLM): repetition_penalty: float | None = None, **attrs: t.Any, ) -> tuple[str, dict[str, t.Any]]: - return prompt, self.config.with_options( + return prompt, self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, @@ -75,7 +75,7 @@ class FlanT5(openllm.LLM): result_tensor = self.model.generate( input_ids, do_sample=True, - generation_config=self.config.with_options( + generation_config=self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, diff --git a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py index 4253b84f..88d8f7de 100644 --- a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py @@ -41,7 +41,7 @@ class FlaxFlanT5(openllm.LLM): repetition_penalty: float | None = None, **attrs: t.Any, ) -> tuple[str, dict[str, t.Any]]: - return prompt, self.config.with_options( + return prompt, self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, @@ -67,7 +67,7 @@ class FlaxFlanT5(openllm.LLM): result_tensor = self.model.generate( input_ids, do_sample=True, - generation_config=self.config.with_options( + generation_config=self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, diff --git a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py index 9dac1ad0..9ead74ba 100644 --- a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py @@ -41,7 +41,7 @@ class TFFlanT5(openllm.LLM): repetition_penalty: float | None = None, **attrs: t.Any, ) -> tuple[str, dict[str, t.Any]]: - return prompt, self.config.with_options( + return prompt, self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, @@ -67,7 +67,7 @@ class TFFlanT5(openllm.LLM): outputs = self.model.generate( input_ids, do_sample=True, - generation_config=self.config.with_options( + generation_config=self.config.model_construct_env( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, diff --git a/src/openllm/models/starcoder/modeling_starcoder.py b/src/openllm/models/starcoder/modeling_starcoder.py index 9db4e6bc..2f3626ea 100644 --- a/src/openllm/models/starcoder/modeling_starcoder.py +++ b/src/openllm/models/starcoder/modeling_starcoder.py @@ -116,7 +116,7 @@ class StarCoder(openllm.LLM): raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err prompt = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}" - return prompt, self.config.with_options( + return prompt, self.config.model_construct_env( top_p=top_p, temperature=temperature, max_new_tokens=max_new_tokens, @@ -154,7 +154,7 @@ class StarCoder(openllm.LLM): result_tensor = self.model.generate( inputs, do_sample=True, - generation_config=self.config.with_options( + generation_config=self.config.model_construct_env( top_p=top_p, temperature=temperature, 
max_new_tokens=max_new_tokens, diff --git a/src/openllm/utils/__init__.py b/src/openllm/utils/__init__.py index d7752322..ea32d0b1 100644 --- a/src/openllm/utils/__init__.py +++ b/src/openllm/utils/__init__.py @@ -26,6 +26,7 @@ import re import types import typing as t +import attrs import bentoml import inflection from bentoml._internal.types import LazyType as LazyType @@ -56,27 +57,37 @@ else: logger = logging.getLogger(__name__) -_object_setattr = object.__setattr__ - def get_lazy_module(model_name: str) -> LazyLoader: snaked_model_name = inflection.underscore(model_name) return LazyLoader(snaked_model_name, globals(), f"openllm.models.{snaked_model_name}") -def FRAMEWORK_ENV_VAR(model_name: str) -> str: - return f"OPENLLM_{inflection.underscore(model_name).upper()}_FRAMEWORK" +@attrs.define +class ModelEnv: + model_name: str = attrs.field(converter=inflection.underscore) + @property + def framework(self) -> str: + return f"OPENLLM_{self.model_name.upper()}_FRAMEWORK" -def MODEL_CONFIG_ENV_VAR(model_name: str) -> str: - return f"OPENLLM_{inflection.underscore(model_name).upper()}_CONFIG" + @property + def model_config(self) -> str: + return f"OPENLLM_{self.model_name.upper()}_CONFIG" + @property + def start_docstring(self) -> str: + return getattr(self.module, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING") -def get_framework_env(model_name: str) -> t.Literal["pt", "flax", "tf"]: - envvar = os.environ.get(FRAMEWORK_ENV_VAR(model_name), "pt") - if envvar not in ("pt", "tf", "flax"): - raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'") - return envvar + @property + def module(self) -> LazyLoader: + return LazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}") + + def get_framework_env(self) -> t.Literal["pt", "flax", "tf"]: + envvar = os.environ.get(self.framework, "pt") + if envvar not in ("pt", "tf", "flax"): + raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'") + return envvar def convert_transformers_model_name(name: str) -> str: diff --git a/src/openllm_client/_prompt.py b/src/openllm_client/_prompt.py index 41b7afbc..082d9702 100644 --- a/src/openllm_client/_prompt.py +++ b/src/openllm_client/_prompt.py @@ -79,7 +79,7 @@ class PromptTemplate: @classmethod def from_default(cls, model: str) -> PromptTemplate: - template = getattr(openllm.utils.get_lazy_module(model), "DEFAULT_PROMPT_TEMPLATE") + template = getattr(openllm.utils.ModelEnv(model).module, "DEFAULT_PROMPT_TEMPLATE") if template is None: raise ValueError(f"Model {model} does not have a default prompt template.") return cls.from_template(template) diff --git a/src/openllm_client/runtimes/base.py b/src/openllm_client/runtimes/base.py index 389065a1..31d6f8b4 100644 --- a/src/openllm_client/runtimes/base.py +++ b/src/openllm_client/runtimes/base.py @@ -109,7 +109,7 @@ class BaseClient(ClientMixin): def query(self, prompt: str, **attrs: t.Any) -> dict[str, t.Any] | str: return_raw_response = attrs.pop("return_raw_response", False) prompt, attrs = self.llm.preprocess_parameters(prompt, **attrs) - inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.with_options(**attrs)) + inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**attrs)) r = openllm.GenerationOutput(**self.call("generate", inputs)) if return_raw_response: @@ -132,7 +132,7 @@ class BaseAsyncClient(ClientMixin): async def query(self, prompt: str, **attrs: t.Any) -> dict[str, t.Any] | str: 
return_raw_response = attrs.pop("return_raw_response", False) prompt, attrs = self.llm.preprocess_parameters(prompt, **attrs) - inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.with_options(**attrs)) + inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**attrs)) res = await self.acall("generate", inputs) r = openllm.GenerationOutput(**res)
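
The recurring change in this patch is the new ModelEnv helper in openllm.utils, which folds the old FRAMEWORK_ENV_VAR, MODEL_CONFIG_ENV_VAR, get_framework_env and get_lazy_module free functions into one attrs class. A minimal usage sketch follows; the model name "flan-t5" is only an illustrative assumption, not something this patch pins down.

    import os
    import openllm

    env = openllm.utils.ModelEnv("flan-t5")  # name is underscored by the attrs converter -> "flan_t5"

    env.framework      # "OPENLLM_FLAN_T5_FRAMEWORK"
    env.model_config   # "OPENLLM_FLAN_T5_CONFIG"
    env.module         # lazy handle to openllm.models.flan_t5

    os.environ[env.framework] = "tf"
    env.get_framework_env()  # "tf"; falls back to "pt" when the variable is unset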
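
Downstream, Runner and start_model_command read the same variable to pick the implementation. A short sketch, assuming a Flax implementation exists for the model (as modeling_flax_flan_t5.py suggests for flan-t5):

    import os
    import openllm

    # With the framework variable set, Runner dispatches to the Flax auto class.
    os.environ["OPENLLM_FLAN_T5_FRAMEWORK"] = "flax"
    runner = openllm.Runner("flan-t5")  # resolves to openllm.AutoFlaxLLM.create_runner(...)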
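
with_options and from_env are folded into the single classmethod model_construct_env. As far as this diff shows, values resolve roughly in this order: the per-model *_CONFIG JSON blob if present, then per-field environment variables, then explicit keyword arguments. A hedged sketch; the generation variable name assumes __openllm_env_name__ resolves to FLAN_T5:

    import os
    import openllm

    # Per-field generation defaults can come from the environment...
    os.environ["OPENLLM_FLAN_T5_GENERATION_TEMPERATURE"] = "0.75"

    # ...while explicit keyword arguments passed through model_construct_env
    # are applied last, mirroring how the model implementations call it.
    config = openllm.AutoConfig.for_model("flan-t5", max_new_tokens=256)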
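
Finally, the start command serializes the resolved configuration into the per-model *_CONFIG variable before launching the server, and _service.py reads it back through model_construct_env. A rough sketch of that handoff, under the same naming assumptions:

    import os
    import openllm

    config = openllm.AutoConfig.for_model("flan-t5", temperature=0.9)

    # CLI side: what start_model_command places into the subprocess environment.
    os.environ["OPENLLM_FLAN_T5_CONFIG"] = config.model_dump_json()

    # Service side: model_construct_env rebuilds the config from that JSON blob.
    restored = openllm.AutoConfig.for_model("flan-t5")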