perf: reduce unnecessary object creation for config class

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron
2023-05-28 05:22:22 -07:00
parent 3fb1e5338a
commit 0df8d8b9a6
17 changed files with 187 additions and 191 deletions

View File

@@ -38,7 +38,6 @@ class FlanT5Config(openllm.LLMConfig):
"""
from __future__ import annotations
import copy
import os
import types
import typing as t
@@ -55,7 +54,7 @@ from click_option_group import optgroup
import openllm
from .exceptions import GpuNotAvailableError, OpenLLMException
from .utils import _object_setattr
from .utils import LazyType
from .utils.dantic import allows_multiple, parse_default
if t.TYPE_CHECKING:
@@ -70,15 +69,20 @@ if t.TYPE_CHECKING:
import transformers
from pydantic.fields import FieldInfo
from transformers.generation.beam_constraints import Constraint
DictStrAny = dict[str, t.Any]
else:
from transformers.utils.dummy_pt_objects import Constraint
DictStrAny = dict
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow")
__all__ = ["LLMConfig", "ModelSignature"]
_object_setattr = object.__setattr__
def field_to_options(
name: str, field: FieldInfo, model_name: str, suffix_generation: bool = False
@@ -87,6 +91,7 @@ def field_to_options(
envvar = field.json_schema_extra.get("env") if field.json_schema_extra else None
dasherized = inflection.dasherize(name)
underscored = inflection.underscore(name)
full_option_name = f"--{dasherized}"
if field.annotation is bool:
full_option_name += f"/--no-{dasherized}"
@@ -101,7 +106,7 @@ def field_to_options(
type=field.annotation,
required=field.is_required(),
default=parse_default(field.default, field.annotation),
show_default=False,
show_default=True if field.default else False,
multiple=allows_multiple(field.annotation),
help=field.description,
show_envvar=True if envvar else False,
@@ -109,14 +114,13 @@ def field_to_options(
)
def generate_kwargs_from_envvar(model: GenerationConfig | LLMConfig) -> dict[str, str]:
kwargs: dict[str, t.Any] = {}
for key, field in model.model_fields.items():
if field.json_schema_extra is not None:
if "env" not in field.json_schema_extra:
raise RuntimeError(f"Invalid {model} passed. Only accept LLMConfig or LLMConfig.generation_config")
kwargs[key] = os.environ.get(field.json_schema_extra["env"], field.default)
return {k: v for k, v in kwargs.items() if v is not None}
def generate_kwargs_from_envvar(model: GenerationConfig | LLMConfig) -> dict[str, t.Any]:
# NOTE: We can safely cast here since all of the fields in GenerationConfig or LLMConfig
# will have an `env` key in `json_schema_extra`
return {
key: os.environ.get(t.cast("dict[str, t.Any]", field.json_schema_extra)["env"], field.default)
for key, field in model.model_fields.items()
}
class GenerationConfig(pydantic.BaseModel):
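A quick sketch of how the simplified generate_kwargs_from_envvar above is meant to behave, assuming it is called from within the same module and that the FlanT5 generation config has already been stamped with OPENLLM_FLAN_T5_GENERATION_* variables during __pydantic_init_subclass__ (the model, field, and variable names here are illustrative):

import os
import openllm

os.environ["OPENLLM_FLAN_T5_GENERATION_TEMPERATURE"] = "0.75"
cfg = openllm.AutoConfig.for_model("flan-t5")
overrides = generate_kwargs_from_envvar(cfg.generation_config)
# Values read from the environment come back as raw strings; fields without a
# matching environment variable fall back to field.default (possibly None).
assert overrides["temperature"] == "0.75"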
@@ -136,7 +140,7 @@ class GenerationConfig(pydantic.BaseModel):
description="""The minimum length of the sequence to be generated. Corresponds to the length of the
input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.""",
)
min_new_tokens: t.Optional[int] = pydantic.Field(
min_new_tokens: int = pydantic.Field(
None, description="The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt."
)
early_stopping: bool = pydantic.Field(
@@ -150,7 +154,7 @@ class GenerationConfig(pydantic.BaseModel):
(canonical beam search algorithm)
""",
)
max_time: t.Optional[float] = pydantic.Field(
max_time: float = pydantic.Field(
None,
description="""The maximum amount of time you allow the computation to run for in seconds. generation will
still finish the current pass after allocated time has been passed.""",
@@ -163,7 +167,7 @@ class GenerationConfig(pydantic.BaseModel):
description="""Number of groups to divide `num_beams` into in order to ensure diversity among different
groups of beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.""",
)
penalty_alpha: t.Optional[float] = pydantic.Field(
penalty_alpha: float = pydantic.Field(
None,
description="""The values balance the model confidence and the degeneration penalty in
contrastive search decoding.""",
@@ -242,14 +246,15 @@ class GenerationConfig(pydantic.BaseModel):
no_repeat_ngram_size: int = pydantic.Field(
0, description="If set to int > 0, all ngrams of that size can only occur once."
)
bad_words_ids: t.Optional[t.List[t.List[int]]] = pydantic.Field(
bad_words_ids: t.List[t.List[int]] = pydantic.Field(
None,
description="""List of token ids that are not allowed to be generated. In order to get the token ids
of the words that should not appear in the generated text, use
`tokenizer(bad_words, add_prefix_space=True, add_special_tokens=False).input_ids`.
""",
)
force_words_ids: t.Optional[t.Union[t.List[t.List[int]], t.List[t.List[t.List[int]]]]] = pydantic.Field(
# NOTE: t.Union is not yet supported on CLI, but the environment variable should already be available.
force_words_ids: t.Union[t.List[t.List[int]], t.List[t.List[t.List[int]]]] = pydantic.Field(
None,
description="""List of token ids that must be generated. If given a `List[List[int]]`, this is treated
as a simple list of words that must be included, the opposite to `bad_words_ids`.
@@ -265,13 +270,13 @@ class GenerationConfig(pydantic.BaseModel):
algorithms suppose the score logits are normalized but some logit processors or warpers break the normalization.
""",
)
constraints: t.Optional[t.List["Constraint"]] = pydantic.Field(
constraints: t.List["Constraint"] = pydantic.Field(
None,
description="""Custom constraints that can be added to the generation to ensure that the output
will contain the use of certain tokens as defined by ``Constraint`` objects, in the most sensible way possible.
""",
)
forced_bos_token_id: t.Optional[int] = pydantic.Field(
forced_bos_token_id: int = pydantic.Field(
None,
description="""The id of the token to force as the first generated token after the
``decoder_start_token_id``. Useful for multilingual models like
@@ -279,7 +284,7 @@ class GenerationConfig(pydantic.BaseModel):
to be the target language token.
""",
)
forced_eos_token_id: t.Optional[t.Union[int, t.List[int]]] = pydantic.Field(
forced_eos_token_id: t.Union[int, t.List[int]] = pydantic.Field(
None,
description="""The id of the token to force as the last generated token when `max_length` is reached.
Optionally, use a list to set multiple *end-of-sequence* tokens.""",
@@ -289,26 +294,26 @@ class GenerationConfig(pydantic.BaseModel):
description="""Whether to remove possible *nan* and *inf* outputs of the model to prevent the
generation method to crash. Note that using `remove_invalid_values` can slow down generation.""",
)
exponential_decay_length_penalty: t.Optional[t.Tuple[int, float]] = pydantic.Field(
exponential_decay_length_penalty: t.Tuple[int, float] = pydantic.Field(
None,
description="""This tuple adds an exponentially increasing length penalty, after a certain amount of tokens
have been generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index`
indicates where penalty starts and `decay_factor` represents the factor of exponential decay
""",
)
suppress_tokens: t.Optional[t.List[int]] = pydantic.Field(
suppress_tokens: t.List[int] = pydantic.Field(
None,
description="""A list of tokens that will be suppressed at generation. The `SupressTokens` logit
processor will set their log probs to `-inf` so that they are not sampled.
""",
)
begin_suppress_tokens: t.Optional[t.List[int]] = pydantic.Field(
begin_suppress_tokens: t.List[int] = pydantic.Field(
None,
description="""A list of tokens that will be suppressed at the beginning of the generation. The
`SupressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled.
""",
)
forced_decoder_ids: t.Optional[t.List[t.List[int]]] = pydantic.Field(
forced_decoder_ids: t.List[t.List[int]] = pydantic.Field(
None,
description="""A list of pairs of integers which indicates a mapping from generation indices to token indices
that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always
@@ -338,9 +343,9 @@ class GenerationConfig(pydantic.BaseModel):
)
# NOTE: Special tokens that can be used at generation time
pad_token_id: t.Optional[int] = pydantic.Field(None, description="The id of the *padding* token.")
bos_token_id: t.Optional[int] = pydantic.Field(None, description="The id of the *beginning-of-sequence* token.")
eos_token_id: t.Optional[t.Union[int, t.List[int]]] = pydantic.Field(
pad_token_id: int = pydantic.Field(None, description="The id of the *padding* token.")
bos_token_id: int = pydantic.Field(None, description="The id of the *beginning-of-sequence* token.")
eos_token_id: t.Union[int, t.List[int]] = pydantic.Field(
None,
description="""The id of the *end-of-sequence* token. Optionally, use a list to set
multiple *end-of-sequence* tokens.""",
@@ -353,7 +358,7 @@ class GenerationConfig(pydantic.BaseModel):
`encoder_input_ids` cannot occur in the `decoder_input_ids`.
""",
)
decoder_start_token_id: t.Optional[int] = pydantic.Field(
decoder_start_token_id: int = pydantic.Field(
None,
description="""If an encoder-decoder model starts decoding with a
different token than *bos*, the id of that token.
@@ -361,7 +366,7 @@ class GenerationConfig(pydantic.BaseModel):
)
# NOTE: pydantic definition
model_config = dict(arbitrary_types_allowed=True, extra="forbid")
model_config = {"extra": "forbid", "arbitrary_types_allowed": True}
if t.TYPE_CHECKING:
# The following is handled via __pydantic_init_subclass__
@@ -395,28 +400,11 @@ class GenerationConfig(pydantic.BaseModel):
# NOTE: I don't know how to do this more efficiently in pydantic v2 yet, will probably
# need to consult the pydantic team on this.
for key, field in self.model_fields.items():
json_schema: dict[str, t.Any] = (
copy.deepcopy(field.json_schema_extra) if field.json_schema_extra is not None else {}
)
env_key = f"OPENLLM_{self.__openllm_env_name__}_GENERATION_{key.upper()}"
if "env" in json_schema:
field.default = os.environ.get(json_schema["env"], field.default)
if not field.json_schema_extra:
field.json_schema_extra = {}
if "env" in field.json_schema_extra:
continue
json_schema["env"] = env_key
# then assign json_schema back to field
field.json_schema_extra = json_schema
field.default = os.environ.get(env_key, field.default)
def to_click_options(self, f: F[P]) -> t.Callable[[t.Callable[..., t.Any]], click.Command]:
for name, field in self.model_fields.items():
if t.get_origin(field.annotation) is t.Union:
# NOTE: Union type is currently not yet supported, we probably just need to use environment instead.
continue
f = field_to_options(name, field, self.__openllm_model_name__, suffix_generation=True)(f)
return optgroup.group(
f"{self.__class__.__name__} generation options",
help=f"[Auto-generated from '{self.__class__.__qualname__}']",
)(f)
field.json_schema_extra["env"] = f"OPENLLM_{self.__openllm_env_name__}_GENERATION_{key.upper()}"
class LLMConfig(pydantic.BaseModel, ABC):
@@ -428,23 +416,15 @@ class LLMConfig(pydantic.BaseModel, ABC):
return getattr(self.generation_config, attr)
return getattr(self, attr)
def __repr_args__(self) -> ReprArgs:
"""Overwrite from default BaseModel and don't show __pydantic_extra__."""
yield from (
(k, v)
for k, v in self.__dict__.items()
if not k.startswith("_") and (k not in self.model_fields or self.model_fields[k].repr)
)
yield from ((k, getattr(self, k)) for k, v in self.model_computed_fields.items() if v.repr)
if t.TYPE_CHECKING:
# The following is handled via __pydantic_init_subclass__, and is only used for TYPE_CHECKING
__openllm_model_name__: str = ""
__openllm_start_name__: str = ""
__openllm_timeout__: int = 0
__openllm_model_name__: str
__openllm_start_name__: str
__openllm_timeout__: int = 3600
__openllm_name_type__: t.Literal["dasherize", "lowercase"] = "dasherize"
__openllm_trust_remote_code__: bool = False
__openllm_requires_gpu__: bool = False
__openllm_env__: openllm.utils.ModelEnv
GenerationConfig: type[t.Any] = GenerationConfig
def __init_subclass__(
@@ -488,6 +468,8 @@ class LLMConfig(pydantic.BaseModel, ABC):
cls.__openllm_model_name__ = cls.__name__.replace("Config", "").lower()
cls.__openllm_start_name__ = cls.__openllm_model_name__
cls.__openllm_env__ = openllm.utils.ModelEnv(cls.__openllm_model_name__)
if hasattr(cls, "GenerationConfig"):
cls.generation_config = t.cast(
"type[GenerationConfig]",
@@ -508,12 +490,10 @@ class LLMConfig(pydantic.BaseModel, ABC):
def model_post_init(self, _: t.Any):
if self.__pydantic_extra__:
generation_config = self.__pydantic_extra__.pop("generation_config", None)
generation_config: dict[str, t.Any] | None = self.__pydantic_extra__.pop("generation_config", None)
if generation_config is not None:
assert isinstance(generation_config, dict), "generation_config must be a dict."
self.generation_config = self.generation_config.model_copy(
update=t.cast("dict[str, t.Any]", generation_config), deep=True
)
assert LazyType[DictStrAny](dict).isinstance(generation_config), "generation_config must be a dict."
self.generation_config = self.generation_config.model_copy(update=generation_config, deep=True)
else:
# The rest of the extras fields should just be the generation_config.
self.generation_config = self.generation_config.model_copy(update=self.__pydantic_extra__, deep=True)
@@ -551,64 +531,48 @@ class LLMConfig(pydantic.BaseModel, ABC):
except pydantic.ValidationError as e:
raise openllm.exceptions.ValidationError(f"Failed to dump configuration to dict: {e}") from e
def with_options(self, __llm_config__: LLMConfig | None = None, **attrs: t.Any) -> LLMConfig:
@classmethod
def model_construct_env(cls, __llm_config__: LLMConfig | None = None, **attrs: t.Any) -> LLMConfig:
"""A helper that respects configuration values set
from environment variables for any given configuration class.
"""
from_env_ = self.from_env()
# filtered out None values
attrs = {k: v for k, v in attrs.items() if v is not None}
generation_keys = {k for k in attrs if k in self.generation_config.model_fields}
generation_attrs = {k: v for k, v in attrs.items() if k in generation_keys}
config_attrs = {k: v for k, v in attrs.items() if k not in generation_keys}
# NOTE: first set the default config kwargs.
# We will always respect envvar as default, then the one that is pass
attrs = {**generate_kwargs_from_envvar(self), **config_attrs}
env_json_string = os.environ.get(cls.__openllm_env__.model_config, None)
if env_json_string is not None:
try:
self = cls.model_construct(**orjson.loads(env_json_string))
except pydantic.ValidationError as e:
raise RuntimeError(f"Failed to parse '{cls.__openllm_env__.model_config}' as valid JSON string.") from e
else:
self = cls.model_construct()
if __llm_config__ is not None:
# NOTE: Only hit this branch on the server. Client shouldn't use __llm_config__
attrs = {**attrs, **__llm_config__.model_dump()}
# as it is not set.
return self.model_construct(**__llm_config__.model_dump(flatten=True))
# NOTE: Then we setup generation config values
attrs["generation_config"] = {
**generate_kwargs_from_envvar(self.generation_config),
**attrs.get("generation_config", {}),
**generation_attrs,
}
# filtered out None values
attrs = {k: v for k, v in attrs.items() if v is not None}
if from_env_:
return from_env_.model_construct(**attrs)
return self.model_construct(**attrs)
construct_attrs = generate_kwargs_from_envvar(self)
construct_attrs.update(generate_kwargs_from_envvar(self.generation_config))
construct_attrs.update(attrs)
@classmethod
def from_env(cls) -> LLMConfig | None:
envvar = openllm.utils.MODEL_CONFIG_ENV_VAR(cls.__openllm_model_name__)
env_json_string = os.environ.get(envvar, None)
if env_json_string is None:
return
try:
return cls.model_construct(**orjson.loads(env_json_string))
except pydantic.ValidationError as e:
raise RuntimeError(f"Failed to parse environment variable '{envvar}' as a valid JSON string.") from e
return self.model_construct(**construct_attrs)
def model_validate_click(self, **attrs: t.Any) -> tuple[LLMConfig, dict[str, t.Any]]:
"""Parse given click attributes into a LLMConfig and return the remaining click attributes."""
llm_config_attrs = {
k[len(self.__openllm_model_name__) + 1 :]: v
for k, v in attrs.items()
if k[len(self.__openllm_model_name__) + 1 :] in self.model_fields
}
llm_config_attrs["generation_config"] = {
k[len(self.__openllm_model_name__ + "_generation") + 1 :]: v
for k, v in attrs.items()
if k[len(self.__openllm_model_name__ + "_generation") + 1 :] in self.generation_config.model_fields
}
return self.with_options(**llm_config_attrs), {
k: v for k, v in attrs.items() if not k.startswith(self.__openllm_model_name__)
}
llm_config_attrs = {}
key_to_remove: list[str] = []
for k, v in attrs.items():
# NOTE: check the longer generation prefix first so it is not swallowed by the plain model-name prefix
if k.startswith(f"{self.__openllm_model_name__}_generation_"):
llm_config_attrs[k[len(self.__openllm_model_name__ + "_generation") + 1 :]] = v
key_to_remove.append(k)
elif k.startswith(f"{self.__openllm_model_name__}_"):
llm_config_attrs[k[len(self.__openllm_model_name__) + 1 :]] = v
key_to_remove.append(k)
return self.model_construct_env(**llm_config_attrs), {k: v for k, v in attrs.items() if k not in key_to_remove}
@t.overload
def to_generation_config(self, return_as_dict: t.Literal[True] = ...) -> dict[str, t.Any]:
@@ -627,17 +591,25 @@ class LLMConfig(pydantic.BaseModel, ABC):
return config
def to_click_options(self, f: F[P]) -> t.Callable[[t.Callable[..., t.Any]], click.Command]:
def to_click_options(self, f: F[P]) -> t.Callable[[F[P]], click.Command]:
"""
Convert current model to click options. This can be used as a decorator for click commands.
Note that the identifier for all LLMConfig will be prefixed with '<model_name>_*', and the generation config
will be prefixed with '<model_name>_generation_*'.
"""
wrapped_generation = self.generation_config.to_click_options(f)
for name, field in self.generation_config.model_fields.items():
if t.get_origin(field.annotation) is t.Union:
# NOTE: Union types are not yet supported on the CLI; we probably just need to use environment variables instead.
continue
f = field_to_options(name, field, self.__openllm_model_name__, suffix_generation=True)(f)
f = optgroup.group(f"{self.__class__.__name__} generation options")(f)
if len(self.model_fields.values()) == 0:
return wrapped_generation
return f
for name, field in self.model_fields.items():
wrapped_generation = field_to_options(name, field, self.__openllm_model_name__)(wrapped_generation)
return optgroup.group(
f"{self.__class__.__name__} options", help=f"[Auto-generated from '{self.__class__.__qualname__}']"
)(wrapped_generation)
if t.get_origin(field.annotation) is t.Union:
# NOTE: Union types are not yet supported on the CLI; we probably just need to use environment variables instead.
continue
f = field_to_options(name, field, self.__openllm_model_name__)(f)
return optgroup.group(f"{self.__class__.__name__} options")(f)
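Putting the pieces together, a hedged sketch of the naming contract shared by to_click_options and model_validate_click, assuming a FlanT5Config instance obtained via AutoConfig (field names and values are illustrative):

import openllm

flan_t5_config = openllm.AutoConfig.for_model("flan-t5")
# to_click_options prefixes identifiers with '<model_name>_' and '<model_name>_generation_';
# model_validate_click strips them back off and hands untouched keys back to the caller.
attrs = {
    "flan_t5_generation_temperature": 0.75,  # routed into generation_config.temperature
    "pretrained": "google/flan-t5-large",    # not prefixed, returned unchanged
}
config, remaining = flan_t5_config.model_validate_click(**attrs)
assert remaining == {"pretrained": "google/flan-t5-large"}
assert config.generation_config.temperature == 0.75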

View File

@@ -440,8 +440,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
# NOTE: The section below defines a loose contract with langchain's LLM interface.
@property
def llm_type(self) -> str:
assert self.default_model is not None
return openllm.utils.convert_transformers_model_name(self.default_model)
return openllm.utils.convert_transformers_model_name(self._pretrained)
@property
def identifying_params(self) -> dict[str, t.Any]:
@@ -637,10 +636,10 @@ def Runner(start_name: str, **attrs: t.Any) -> bentoml.Runner:
behaviour
"""
init_local = attrs.pop("init_local", False)
envvar = openllm.utils.get_framework_env(start_name)
if envvar == "flax":
ModelEnv = openllm.utils.ModelEnv(start_name)
if ModelEnv.get_framework_env() == "flax":
runner = openllm.AutoFlaxLLM.create_runner(start_name, **attrs)
elif envvar == "tf":
elif ModelEnv.get_framework_env() == "tf":
runner = openllm.AutoTFLLM.create_runner(start_name, **attrs)
else:
runner = openllm.AutoLLM.create_runner(start_name, **attrs)
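For reference, a small sketch of how the Runner dispatch above is driven by the per-model framework variable, assuming Runner is exported at the package root and that flan-t5 maps to a ModelEnv named flan_t5 (names and values are illustrative):

import os
import openllm

os.environ["OPENLLM_FLAN_T5_FRAMEWORK"] = "tf"
# ModelEnv("flan-t5").get_framework_env() reads the variable above, so this call
# dispatches to openllm.AutoTFLLM.create_runner instead of the PyTorch default.
runner = openllm.Runner("flan-t5", init_local=False)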

View File

@@ -70,6 +70,8 @@ def build_editable(path: str) -> str | None:
def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions:
packages: list[str] = []
ModelEnv = openllm.utils.ModelEnv(llm.__openllm_start_name__)
if llm.requirements is not None:
packages.extend(llm.requirements)
@@ -89,11 +91,9 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions:
]
)
to_use_framework = utils.get_framework_env(llm.__openllm_start_name__)
to_use_framework = ModelEnv.get_framework_env()
if to_use_framework == "flax":
assert (
utils.is_flax_available()
), f"Flax is not available, while {utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__)} is set to 'flax'"
assert utils.is_flax_available(), f"Flax is not available, while {ModelEnv.framework} is set to 'flax'"
packages.extend(
[
f"flax>={importlib.metadata.version('flax')}",
@@ -102,9 +102,7 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions:
]
)
elif to_use_framework == "tf":
assert (
utils.is_tf_available()
), f"TensorFlow is not available, while {utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__)} is set to 'tf'"
assert utils.is_tf_available(), f"TensorFlow is not available, while {ModelEnv.framework} is set to 'tf'"
candidates = (
"tensorflow",
"tensorflow-cpu",
@@ -137,11 +135,12 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions:
return PythonOptions(packages=packages, wheels=wheels, lock_packages=True)
def construct_docker_options(llm: openllm.LLM, llm_fs: FS) -> DockerOptions:
def construct_docker_options(llm: openllm.LLM, _: FS) -> DockerOptions:
ModelEnv = openllm.utils.ModelEnv(llm.__openllm_start_name__)
return DockerOptions(
cuda_version="11.6", # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version
env={
utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__): utils.get_framework_env(llm.__openllm_start_name__),
ModelEnv.framework: ModelEnv.get_framework_env(),
"OPENLLM_MODEL": llm.config.__openllm_model_name__,
},
system_packages=["git"],
@@ -165,14 +164,16 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
current_model_envvar = os.environ.pop("OPENLLM_MODEL", None)
_previously_built = False
logger.debug("Packing '%s' into a Bento with kwargs=%s...", model_name, attrs)
ModelEnv = openllm.utils.ModelEnv(model_name)
logger.info("Packing '%s' into a Bento with kwargs=%s...", model_name, attrs)
# NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
# during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
try:
os.environ["OPENLLM_MODEL"] = inflection.underscore(model_name)
to_use_framework = openllm.utils.get_framework_env(model_name)
to_use_framework = ModelEnv.get_framework_env()
if to_use_framework == "flax":
llm = openllm.AutoFlaxLLM.for_model(model_name, **attrs)
elif to_use_framework == "tf":

View File

@@ -30,7 +30,7 @@ svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", r
route="/v1/generate",
)
async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput:
config = llm_config.with_options(__llm_config__=qa.llm_config).model_dump()
config = llm_config.model_construct_env(__llm_config__=qa.llm_config).model_dump()
responses = await runner.generate.async_run(qa.prompt, **config)
return openllm.GenerationOutput(responses=responses, configuration=config)
@@ -39,5 +39,5 @@ async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput:
def metadata_v1(_: str) -> dict[str, t.Any]:
return {
"model_name": llm_config.__openllm_model_name__,
"framework": openllm.utils.get_framework_env(llm_config.__openllm_model_name__),
"framework": llm_config.__openllm_env__.get_framework_env(),
}

View File

@@ -314,7 +314,7 @@ def start_model_command(
Note that the internal commands will return the llm_config and a boolean determining
whether the server is run with GPU or not.
"""
envvar = openllm.utils.get_framework_env(model_name)
ModelEnv = openllm.utils.ModelEnv(model_name)
model_command_decr: dict[str, t.Any] = {
"name": inflection.underscore(model_name),
"context_settings": _context_settings or {},
@@ -330,16 +330,15 @@ def start_model_command(
{
"name": config.__openllm_model_name__,
"short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)",
"help": getattr(
openllm.utils.get_lazy_module(model_name),
f"START_{inflection.underscore(model_name).upper()}_COMMAND_DOCSTRING",
),
"help": ModelEnv.start_docstring,
"aliases": aliases if len(aliases) > 0 else None,
}
)
gpu_available = False
try:
config.check_if_gpu_is_available(envvar)
config.check_if_gpu_is_available(ModelEnv.get_framework_env())
gpu_available = True
except openllm.exceptions.GpuNotAvailableError:
# NOTE: The model requires GPU, therefore we will return a dummy command
model_command_decr.update(
@@ -353,7 +352,7 @@ def start_model_command(
@factory.command(**model_command_decr)
def noop() -> openllm.LLMConfig:
click.secho("No GPU available, therefore this command is disabled", fg="red")
openllm.utils.analytics.track_start_init(config, False)
openllm.utils.analytics.track_start_init(config, gpu_available)
return config
return noop
@@ -371,15 +370,24 @@ def start_model_command(
configure_logging()
updated_config, server_kwds = config.model_validate_click(**attrs)
openllm.utils.analytics.track_start_init(updated_config, False)
updated_config, server_attrs = config.model_validate_click(**attrs)
server_kwds.update({"working_dir": os.path.dirname(__file__)})
# NOTE: check for GPU one more time in case this model doesn't require a GPU but users can still
# run this model on GPU
try:
updated_config.check_if_gpu_is_available(ModelEnv.get_framework_env())
gpu_available = True
except openllm.exceptions.GpuNotAvailableError:
gpu_available = False
openllm.utils.analytics.track_start_init(updated_config, gpu_available)
server_attrs.update({"working_dir": os.path.dirname(__file__)})
if _serve_grpc:
server_kwds["grpc_protocol_version"] = "v1"
server_attrs["grpc_protocol_version"] = "v1"
# NOTE: currently, there are no development args in bentoml.Server. To be fixed upstream.
development = server_kwds.pop("development")
server_kwds.setdefault("production", not development)
development = server_attrs.pop("development")
server_attrs.setdefault("production", not development)
start_env = os.environ.copy()
@@ -395,17 +403,17 @@ def start_model_command(
start_env.update(
{
openllm.utils.FRAMEWORK_ENV_VAR(model_name): envvar,
openllm.utils.MODEL_CONFIG_ENV_VAR(model_name): updated_config.model_dump_json(),
ModelEnv.framework: ModelEnv.get_framework_env(),
ModelEnv.model_config: updated_config.model_dump_json(),
"OPENLLM_MODEL": model_name,
"BENTOML_DEBUG": str(get_debug_mode()),
"BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
}
)
if envvar == "flax":
if ModelEnv.get_framework_env() == "flax":
llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained)
elif envvar == "tf":
elif ModelEnv.get_framework_env() == "tf":
llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained)
else:
llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained)
@@ -416,7 +424,7 @@ def start_model_command(
)
click.secho(f"Starting LLM Server for '{model_name}'\n", fg="blue")
server_cls = getattr(bentoml, "HTTPServer" if not _serve_grpc else "GrpcServer")
server: bentoml.server.Server = server_cls("_service.py:svc", **server_kwds)
server: bentoml.server.Server = server_cls("_service.py:svc", **server_attrs)
server.timeout = 90
try:
@@ -449,8 +457,10 @@ def _start(
_serve_grpc = attrs.pop("_serve_grpc", False)
ModelEnv = openllm.utils.ModelEnv(model_name)
if framework is not None:
os.environ[openllm.utils.FRAMEWORK_ENV_VAR(model_name)] = framework
os.environ[ModelEnv.framework] = framework
start_model_command(model_name, t.cast(OpenLLMCommandGroup, cli), _serve_grpc=_serve_grpc)(
standalone_mode=False, **attrs
)
@@ -585,9 +595,12 @@ def list_supported_models(output: t.Literal["json", "pretty", "porcelain"]):
except Exception as err:
failed_initialized.append((m, err))
_console.print(table)
_console.print("\n[bold yellow] The following models are supported but failed to initialize:[/bold yellow]\n")
for m, err in failed_initialized:
_console.print(Text(f"- {m}: ") + Text(f"{err}\n", style="bold red"))
if len(failed_initialized) > 0:
_console.print(
"\n[bold yellow] The following models are supported but failed to initialize:[/bold yellow]\n"
)
for m, err in failed_initialized:
_console.print(Text(f"- {m}: ") + Text(f"{err}\n", style="bold red"))
elif output == "json":
result_json: dict[str, dict[t.Literal["variants", "description"], t.Any]] = {}
for m in models:

View File

@@ -53,7 +53,7 @@ class _LazyConfigMapping(ConfigOrderedDict):
value = self._mapping[key]
module_name = inflection.underscore(key)
if module_name not in self._modules:
self._modules[module_name] = openllm.utils.get_lazy_module(module_name)
self._modules[module_name] = openllm.utils.ModelEnv(module_name).module
if hasattr(self._modules[module_name], value):
return getattr(self._modules[module_name], value)
@@ -93,10 +93,10 @@ class AutoConfig:
raise EnvironmentError("Cannot instantiate Config. Please use `Config.for_model(model_name)` instead.")
@classmethod
def for_model(cls, model_name: str, *args: t.Any, **attrs: t.Any) -> openllm.LLMConfig:
def for_model(cls, model_name: str, **attrs: t.Any) -> openllm.LLMConfig:
model_name = inflection.underscore(model_name)
if model_name in CONFIG_MAPPING:
return CONFIG_MAPPING[model_name]().with_options(*args, **attrs)
return CONFIG_MAPPING[model_name].model_construct_env(**attrs)
raise ValueError(
f"Unrecognized configuration class for {model_name}. "
f"Model name should be one of {', '.join(CONFIG_MAPPING.keys())}."

View File

@@ -190,7 +190,7 @@ class _LazyAutoMapping(ConfigModelOrderedDict):
def _load_attr_from_module(self, model_type: str, attr: str) -> t.Any:
module_name = inflection.underscore(model_type)
if module_name not in self._modules:
self._modules[module_name] = openllm.utils.get_lazy_module(module_name)
self._modules[module_name] = openllm.utils.ModelEnv(module_name).module
return getattribute_from_module(self._modules[module_name], attr)
def keys(self):

View File

@@ -81,7 +81,7 @@ class ChatGLM(openllm.LLM):
else:
prompt_text = prompt
generation_config = self.config.with_options(
generation_config = self.config.model_construct_env(
max_new_tokens=max_new_tokens,
num_beams=num_beams,
top_p=top_p,
@@ -123,7 +123,7 @@ class ChatGLM(openllm.LLM):
inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
outputs = self.model.generate(
**inputs,
generation_config=self.config.with_options(
generation_config=self.config.model_construct_env(
max_new_tokens=max_new_tokens,
num_beams=num_beams,
top_p=top_p,

View File

@@ -79,7 +79,7 @@ class DollyV2(openllm.LLM):
) -> tuple[str, dict[str, t.Any]]:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt)
generation_config = self.config.with_options(
generation_config = self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,
@@ -112,7 +112,7 @@ class DollyV2(openllm.LLM):
end_key_token_id = None
eos_token_id = None
llm_config = self.config.with_options(
llm_config = self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,

View File

@@ -66,7 +66,7 @@ class Falcon(openllm.LLM):
eos_token_id: int | None = None,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any]]:
generation_config = self.config.with_options(
generation_config = self.config.model_construct_env(
max_new_tokens=max_new_tokens,
top_k=top_k,
num_return_sequences=num_return_sequences,
@@ -95,7 +95,7 @@ class Falcon(openllm.LLM):
return self.model(
prompt,
do_sample=True,
generation_config=self.config.with_options(
generation_config=self.config.model_construct_env(
max_new_tokens=max_new_tokens,
top_k=top_k,
num_return_sequences=num_return_sequences,

View File

@@ -48,7 +48,7 @@ class FlanT5(openllm.LLM):
repetition_penalty: float | None = None,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any]]:
return prompt, self.config.with_options(
return prompt, self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,
@@ -75,7 +75,7 @@ class FlanT5(openllm.LLM):
result_tensor = self.model.generate(
input_ids,
do_sample=True,
generation_config=self.config.with_options(
generation_config=self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,

View File

@@ -41,7 +41,7 @@ class FlaxFlanT5(openllm.LLM):
repetition_penalty: float | None = None,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any]]:
return prompt, self.config.with_options(
return prompt, self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,
@@ -67,7 +67,7 @@ class FlaxFlanT5(openllm.LLM):
result_tensor = self.model.generate(
input_ids,
do_sample=True,
generation_config=self.config.with_options(
generation_config=self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,

View File

@@ -41,7 +41,7 @@ class TFFlanT5(openllm.LLM):
repetition_penalty: float | None = None,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any]]:
return prompt, self.config.with_options(
return prompt, self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,
@@ -67,7 +67,7 @@ class TFFlanT5(openllm.LLM):
outputs = self.model.generate(
input_ids,
do_sample=True,
generation_config=self.config.with_options(
generation_config=self.config.model_construct_env(
max_new_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,

View File

@@ -116,7 +116,7 @@ class StarCoder(openllm.LLM):
raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err
prompt = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
return prompt, self.config.with_options(
return prompt, self.config.model_construct_env(
top_p=top_p,
temperature=temperature,
max_new_tokens=max_new_tokens,
@@ -154,7 +154,7 @@ class StarCoder(openllm.LLM):
result_tensor = self.model.generate(
inputs,
do_sample=True,
generation_config=self.config.with_options(
generation_config=self.config.model_construct_env(
top_p=top_p,
temperature=temperature,
max_new_tokens=max_new_tokens,

View File

@@ -26,6 +26,7 @@ import re
import types
import typing as t
import attrs
import bentoml
import inflection
from bentoml._internal.types import LazyType as LazyType
@@ -56,27 +57,37 @@ else:
logger = logging.getLogger(__name__)
_object_setattr = object.__setattr__
def get_lazy_module(model_name: str) -> LazyLoader:
snaked_model_name = inflection.underscore(model_name)
return LazyLoader(snaked_model_name, globals(), f"openllm.models.{snaked_model_name}")
def FRAMEWORK_ENV_VAR(model_name: str) -> str:
return f"OPENLLM_{inflection.underscore(model_name).upper()}_FRAMEWORK"
@attrs.define
class ModelEnv:
model_name: str = attrs.field(converter=inflection.underscore)
@property
def framework(self) -> str:
return f"OPENLLM_{self.model_name.upper()}_FRAMEWORK"
def MODEL_CONFIG_ENV_VAR(model_name: str) -> str:
return f"OPENLLM_{inflection.underscore(model_name).upper()}_CONFIG"
@property
def model_config(self) -> str:
return f"OPENLLM_{self.model_name.upper()}_CONFIG"
@property
def start_docstring(self) -> str:
return getattr(self.module, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING")
def get_framework_env(model_name: str) -> t.Literal["pt", "flax", "tf"]:
envvar = os.environ.get(FRAMEWORK_ENV_VAR(model_name), "pt")
if envvar not in ("pt", "tf", "flax"):
raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'")
return envvar
@property
def module(self) -> LazyLoader:
return LazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}")
def get_framework_env(self) -> t.Literal["pt", "flax", "tf"]:
envvar = os.environ.get(self.framework, "pt")
if envvar not in ("pt", "tf", "flax"):
raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'")
return envvar
def convert_transformers_model_name(name: str) -> str:
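The attrs-based ModelEnv above folds the former free functions (FRAMEWORK_ENV_VAR, MODEL_CONFIG_ENV_VAR, get_framework_env, get_lazy_module) into one object; a brief sketch of the accessors, with an illustrative model name:

import os
from openllm.utils import ModelEnv

env = ModelEnv("flan-t5")      # the attrs converter normalizes the name to "flan_t5"
print(env.framework)           # "OPENLLM_FLAN_T5_FRAMEWORK"
print(env.model_config)        # "OPENLLM_FLAN_T5_CONFIG"

os.environ[env.framework] = "flax"
assert env.get_framework_env() == "flax"  # anything outside {"pt", "tf", "flax"} raises ValueError
# env.module lazily resolves openllm.models.flan_t5, and env.start_docstring reads
# START_FLAN_T5_COMMAND_DOCSTRING from that module.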

View File

@@ -79,7 +79,7 @@ class PromptTemplate:
@classmethod
def from_default(cls, model: str) -> PromptTemplate:
template = getattr(openllm.utils.get_lazy_module(model), "DEFAULT_PROMPT_TEMPLATE")
template = getattr(openllm.utils.ModelEnv(model).module, "DEFAULT_PROMPT_TEMPLATE")
if template is None:
raise ValueError(f"Model {model} does not have a default prompt template.")
return cls.from_template(template)

View File

@@ -109,7 +109,7 @@ class BaseClient(ClientMixin):
def query(self, prompt: str, **attrs: t.Any) -> dict[str, t.Any] | str:
return_raw_response = attrs.pop("return_raw_response", False)
prompt, attrs = self.llm.preprocess_parameters(prompt, **attrs)
inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.with_options(**attrs))
inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**attrs))
r = openllm.GenerationOutput(**self.call("generate", inputs))
if return_raw_response:
@@ -132,7 +132,7 @@ class BaseAsyncClient(ClientMixin):
async def query(self, prompt: str, **attrs: t.Any) -> dict[str, t.Any] | str:
return_raw_response = attrs.pop("return_raw_response", False)
prompt, attrs = self.llm.preprocess_parameters(prompt, **attrs)
inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.with_options(**attrs))
inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**attrs))
res = await self.acall("generate", inputs)
r = openllm.GenerationOutput(**res)
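Finally, a hedged sketch of how a client call now feeds generation overrides through model_construct_env; the HTTPClient class name and server address are assumptions, since the diff only shows the shared BaseClient/BaseAsyncClient mixins:

import openllm

client = openllm.client.HTTPClient("http://localhost:3000")  # hypothetical client class and address
# Extra keyword arguments are folded into llm_config via self.config.model_construct_env(**attrs)
# and shipped to the /v1/generate endpoint as part of GenerationInput.
result = client.query("Explain LLM serving in one sentence.", temperature=0.2, max_new_tokens=64)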