mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-05-05 14:22:43 -04:00
perf: reduce unnecessary object creation for config class
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -38,7 +38,6 @@ class FlanT5Config(openllm.LLMConfig):
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import os
|
||||
import types
|
||||
import typing as t
|
||||
@@ -55,7 +54,7 @@ from click_option_group import optgroup
|
||||
import openllm
|
||||
|
||||
from .exceptions import GpuNotAvailableError, OpenLLMException
|
||||
from .utils import _object_setattr
|
||||
from .utils import LazyType
|
||||
from .utils.dantic import allows_multiple, parse_default
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
@@ -70,15 +69,20 @@ if t.TYPE_CHECKING:
|
||||
import transformers
|
||||
from pydantic.fields import FieldInfo
|
||||
from transformers.generation.beam_constraints import Constraint
|
||||
|
||||
DictStrAny = dict[str, t.Any]
|
||||
else:
|
||||
from transformers.utils.dummy_pt_objects import Constraint
|
||||
|
||||
DictStrAny = dict
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow")
|
||||
|
||||
__all__ = ["LLMConfig", "ModelSignature"]
|
||||
|
||||
_object_setattr = object.__setattr__
|
||||
|
||||
|
||||
def field_to_options(
|
||||
name: str, field: FieldInfo, model_name: str, suffix_generation: bool = False
|
||||
@@ -87,6 +91,7 @@ def field_to_options(
|
||||
envvar = field.json_schema_extra.get("env") if field.json_schema_extra else None
|
||||
dasherized = inflection.dasherize(name)
|
||||
underscored = inflection.underscore(name)
|
||||
|
||||
full_option_name = f"--{dasherized}"
|
||||
if field.annotation is bool:
|
||||
full_option_name += f"/--no-{dasherized}"
|
||||
@@ -101,7 +106,7 @@ def field_to_options(
|
||||
type=field.annotation,
|
||||
required=field.is_required(),
|
||||
default=parse_default(field.default, field.annotation),
|
||||
show_default=False,
|
||||
show_default=True if field.default else False,
|
||||
multiple=allows_multiple(field.annotation),
|
||||
help=field.description,
|
||||
show_envvar=True if envvar else False,
|
||||
@@ -109,14 +114,13 @@ def field_to_options(
|
||||
)
|
||||
|
||||
|
||||
def generate_kwargs_from_envvar(model: GenerationConfig | LLMConfig) -> dict[str, str]:
|
||||
kwargs: dict[str, t.Any] = {}
|
||||
for key, field in model.model_fields.items():
|
||||
if field.json_schema_extra is not None:
|
||||
if "env" not in field.json_schema_extra:
|
||||
raise RuntimeError(f"Invalid {model} passed. Only accept LLMConfig or LLMConfig.generation_config")
|
||||
kwargs[key] = os.environ.get(field.json_schema_extra["env"], field.default)
|
||||
return {k: v for k, v in kwargs.items() if v is not None}
|
||||
def generate_kwargs_from_envvar(model: GenerationConfig | LLMConfig) -> dict[str, t.Any]:
|
||||
# NOTE: We can safe cast here since all of the fields in GenerationConfig or LLMConfig
|
||||
# will have a `env` field in `json_schema_extra`
|
||||
return {
|
||||
key: os.environ.get(t.cast("dict[str, t.Any]", field.json_schema_extra)["env"], field.default)
|
||||
for key, field in model.model_fields.items()
|
||||
}
|
||||
|
||||
|
||||
class GenerationConfig(pydantic.BaseModel):
|
||||
@@ -136,7 +140,7 @@ class GenerationConfig(pydantic.BaseModel):
|
||||
description="""The minimum length of the sequence to be generated. Corresponds to the length of the
|
||||
input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.""",
|
||||
)
|
||||
min_new_tokens: t.Optional[int] = pydantic.Field(
|
||||
min_new_tokens: int = pydantic.Field(
|
||||
None, description="The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt."
|
||||
)
|
||||
early_stopping: bool = pydantic.Field(
|
||||
@@ -150,7 +154,7 @@ class GenerationConfig(pydantic.BaseModel):
|
||||
(canonical beam search algorithm)
|
||||
""",
|
||||
)
|
||||
max_time: t.Optional[float] = pydantic.Field(
|
||||
max_time: float = pydantic.Field(
|
||||
None,
|
||||
description="""The maximum amount of time you allow the computation to run for in seconds. generation will
|
||||
still finish the current pass after allocated time has been passed.""",
|
||||
@@ -163,7 +167,7 @@ class GenerationConfig(pydantic.BaseModel):
|
||||
description="""Number of groups to divide `num_beams` into in order to ensure diversity among different
|
||||
groups of beams. [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.""",
|
||||
)
|
||||
penalty_alpha: t.Optional[float] = pydantic.Field(
|
||||
penalty_alpha: float = pydantic.Field(
|
||||
None,
|
||||
description="""The values balance the model confidence and the degeneration penalty in
|
||||
contrastive search decoding.""",
|
||||
@@ -242,14 +246,15 @@ class GenerationConfig(pydantic.BaseModel):
|
||||
no_repeat_ngram_size: int = pydantic.Field(
|
||||
0, description="If set to int > 0, all ngrams of that size can only occur once."
|
||||
)
|
||||
bad_words_ids: t.Optional[t.List[t.List[int]]] = pydantic.Field(
|
||||
bad_words_ids: t.List[t.List[int]] = pydantic.Field(
|
||||
None,
|
||||
description="""List of token ids that are not allowed to be generated. In order to get the token ids
|
||||
of the words that should not appear in the generated text, use
|
||||
`tokenizer(bad_words, add_prefix_space=True, add_special_tokens=False).input_ids`.
|
||||
""",
|
||||
)
|
||||
force_words_ids: t.Optional[t.Union[t.List[t.List[int]], t.List[t.List[t.List[int]]]]] = pydantic.Field(
|
||||
# NOTE: t.Union is not yet supported on CLI, but the environment variable should already be available.
|
||||
force_words_ids: t.Union[t.List[t.List[int]], t.List[t.List[t.List[int]]]] = pydantic.Field(
|
||||
None,
|
||||
description="""List of token ids that must be generated. If given a `List[List[int]]`, this is treated
|
||||
as a simple list of words that must be included, the opposite to `bad_words_ids`.
|
||||
@@ -265,13 +270,13 @@ class GenerationConfig(pydantic.BaseModel):
|
||||
algorithms suppose the score logits are normalized but some logit processors or warpers break the normalization.
|
||||
""",
|
||||
)
|
||||
constraints: t.Optional[t.List["Constraint"]] = pydantic.Field(
|
||||
constraints: t.List["Constraint"] = pydantic.Field(
|
||||
None,
|
||||
description="""Custom constraints that can be added to the generation to ensure that the output
|
||||
will contain the use of certain tokens as defined by ``Constraint`` objects, in the most sensible way possible.
|
||||
""",
|
||||
)
|
||||
forced_bos_token_id: t.Optional[int] = pydantic.Field(
|
||||
forced_bos_token_id: int = pydantic.Field(
|
||||
None,
|
||||
description="""The id of the token to force as the first generated token after the
|
||||
``decoder_start_token_id``. Useful for multilingual models like
|
||||
@@ -279,7 +284,7 @@ class GenerationConfig(pydantic.BaseModel):
|
||||
to be the target language token.
|
||||
""",
|
||||
)
|
||||
forced_eos_token_id: t.Optional[t.Union[int, t.List[int]]] = pydantic.Field(
|
||||
forced_eos_token_id: t.Union[int, t.List[int]] = pydantic.Field(
|
||||
None,
|
||||
description="""The id of the token to force as the last generated token when `max_length` is reached.
|
||||
Optionally, use a list to set multiple *end-of-sequence* tokens.""",
|
||||
@@ -289,26 +294,26 @@ class GenerationConfig(pydantic.BaseModel):
|
||||
description="""Whether to remove possible *nan* and *inf* outputs of the model to prevent the
|
||||
generation method to crash. Note that using `remove_invalid_values` can slow down generation.""",
|
||||
)
|
||||
exponential_decay_length_penalty: t.Optional[t.Tuple[int, float]] = pydantic.Field(
|
||||
exponential_decay_length_penalty: t.Tuple[int, float] = pydantic.Field(
|
||||
None,
|
||||
description="""This tuple adds an exponentially increasing length penalty, after a certain amount of tokens
|
||||
have been generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index`
|
||||
indicates where penalty starts and `decay_factor` represents the factor of exponential decay
|
||||
""",
|
||||
)
|
||||
suppress_tokens: t.Optional[t.List[int]] = pydantic.Field(
|
||||
suppress_tokens: t.List[int] = pydantic.Field(
|
||||
None,
|
||||
description="""A list of tokens that will be suppressed at generation. The `SupressTokens` logit
|
||||
processor will set their log probs to `-inf` so that they are not sampled.
|
||||
""",
|
||||
)
|
||||
begin_suppress_tokens: t.Optional[t.List[int]] = pydantic.Field(
|
||||
begin_suppress_tokens: t.List[int] = pydantic.Field(
|
||||
None,
|
||||
description="""A list of tokens that will be suppressed at the beginning of the generation. The
|
||||
`SupressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled.
|
||||
""",
|
||||
)
|
||||
forced_decoder_ids: t.Optional[t.List[t.List[int]]] = pydantic.Field(
|
||||
forced_decoder_ids: t.List[t.List[int]] = pydantic.Field(
|
||||
None,
|
||||
description="""A list of pairs of integers which indicates a mapping from generation indices to token indices
|
||||
that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always
|
||||
@@ -338,9 +343,9 @@ class GenerationConfig(pydantic.BaseModel):
|
||||
)
|
||||
|
||||
# NOTE: Special tokens that can be used at generation time
|
||||
pad_token_id: t.Optional[int] = pydantic.Field(None, description="The id of the *padding* token.")
|
||||
bos_token_id: t.Optional[int] = pydantic.Field(None, description="The id of the *beginning-of-sequence* token.")
|
||||
eos_token_id: t.Optional[t.Union[int, t.List[int]]] = pydantic.Field(
|
||||
pad_token_id: int = pydantic.Field(None, description="The id of the *padding* token.")
|
||||
bos_token_id: int = pydantic.Field(None, description="The id of the *beginning-of-sequence* token.")
|
||||
eos_token_id: t.Union[int, t.List[int]] = pydantic.Field(
|
||||
None,
|
||||
description="""The id of the *end-of-sequence* token. Optionally, use a list to set
|
||||
multiple *end-of-sequence* tokens.""",
|
||||
@@ -353,7 +358,7 @@ class GenerationConfig(pydantic.BaseModel):
|
||||
`encoder_input_ids` cannot occur in the `decoder_input_ids`.
|
||||
""",
|
||||
)
|
||||
decoder_start_token_id: t.Optional[int] = pydantic.Field(
|
||||
decoder_start_token_id: int = pydantic.Field(
|
||||
None,
|
||||
description="""If an encoder-decoder model starts decoding with a
|
||||
different token than *bos*, the id of that token.
|
||||
@@ -361,7 +366,7 @@ class GenerationConfig(pydantic.BaseModel):
|
||||
)
|
||||
|
||||
# NOTE: pydantic definition
|
||||
model_config = dict(arbitrary_types_allowed=True, extra="forbid")
|
||||
model_config = {"extra": "forbid", "arbitrary_types_allowed": True}
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
# The following is handled via __pydantic_init_subclass__
|
||||
@@ -395,28 +400,11 @@ class GenerationConfig(pydantic.BaseModel):
|
||||
# NOTE: I don't know how to do this more efficiently in pydantic v2 yet, will probably
|
||||
# need to consult the pydantic team on this.
|
||||
for key, field in self.model_fields.items():
|
||||
json_schema: dict[str, t.Any] = (
|
||||
copy.deepcopy(field.json_schema_extra) if field.json_schema_extra is not None else {}
|
||||
)
|
||||
env_key = f"OPENLLM_{self.__openllm_env_name__}_GENERATION_{key.upper()}"
|
||||
if "env" in json_schema:
|
||||
field.default = os.environ.get(json_schema["env"], field.default)
|
||||
if not field.json_schema_extra:
|
||||
field.json_schema_extra = {}
|
||||
if "env" in field.json_schema_extra:
|
||||
continue
|
||||
json_schema["env"] = env_key
|
||||
# then assign json_schema back to field
|
||||
field.json_schema_extra = json_schema
|
||||
field.default = os.environ.get(env_key, field.default)
|
||||
|
||||
def to_click_options(self, f: F[P]) -> t.Callable[[t.Callable[..., t.Any]], click.Command]:
|
||||
for name, field in self.model_fields.items():
|
||||
if t.get_origin(field.annotation) is t.Union:
|
||||
# NOTE: Union type is currently not yet supported, we probably just need to use environment instead.
|
||||
continue
|
||||
f = field_to_options(name, field, self.__openllm_model_name__, suffix_generation=True)(f)
|
||||
return optgroup.group(
|
||||
f"{self.__class__.__name__} generation options",
|
||||
help=f"[Auto-generated from '{self.__class__.__qualname__}']",
|
||||
)(f)
|
||||
field.json_schema_extra["env"] = f"OPENLLM_{self.__openllm_env_name__}_GENERATION_{key.upper()}"
|
||||
|
||||
|
||||
class LLMConfig(pydantic.BaseModel, ABC):
|
||||
@@ -428,23 +416,15 @@ class LLMConfig(pydantic.BaseModel, ABC):
|
||||
return getattr(self.generation_config, attr)
|
||||
return getattr(self, attr)
|
||||
|
||||
def __repr_args__(self) -> ReprArgs:
|
||||
"""Overwrite from default BaseModel and don't show __pydantic_extra__."""
|
||||
yield from (
|
||||
(k, v)
|
||||
for k, v in self.__dict__.items()
|
||||
if not k.startswith("_") and (k not in self.model_fields or self.model_fields[k].repr)
|
||||
)
|
||||
yield from ((k, getattr(self, k)) for k, v in self.model_computed_fields.items() if v.repr)
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
# The following is handled via __pydantic_init_subclass__, and is only used for TYPE_CHECKING
|
||||
__openllm_model_name__: str = ""
|
||||
__openllm_start_name__: str = ""
|
||||
__openllm_timeout__: int = 0
|
||||
__openllm_model_name__: str
|
||||
__openllm_start_name__: str
|
||||
__openllm_timeout__: int = 3600
|
||||
__openllm_name_type__: t.Literal["dasherize", "lowercase"] = "dasherize"
|
||||
__openllm_trust_remote_code__: bool = False
|
||||
__openllm_requires_gpu__: bool = False
|
||||
__openllm_env__: openllm.utils.ModelEnv
|
||||
GenerationConfig: type[t.Any] = GenerationConfig
|
||||
|
||||
def __init_subclass__(
|
||||
@@ -488,6 +468,8 @@ class LLMConfig(pydantic.BaseModel, ABC):
|
||||
cls.__openllm_model_name__ = cls.__name__.replace("Config", "").lower()
|
||||
cls.__openllm_start_name__ = cls.__openllm_model_name__
|
||||
|
||||
cls.__openllm_env__ = openllm.utils.ModelEnv(cls.__openllm_model_name__)
|
||||
|
||||
if hasattr(cls, "GenerationConfig"):
|
||||
cls.generation_config = t.cast(
|
||||
"type[GenerationConfig]",
|
||||
@@ -508,12 +490,10 @@ class LLMConfig(pydantic.BaseModel, ABC):
|
||||
|
||||
def model_post_init(self, _: t.Any):
|
||||
if self.__pydantic_extra__:
|
||||
generation_config = self.__pydantic_extra__.pop("generation_config", None)
|
||||
generation_config: dict[str, t.Any] | None = self.__pydantic_extra__.pop("generation_config", None)
|
||||
if generation_config is not None:
|
||||
assert isinstance(generation_config, dict), "generation_config must be a dict."
|
||||
self.generation_config = self.generation_config.model_copy(
|
||||
update=t.cast("dict[str, t.Any]", generation_config), deep=True
|
||||
)
|
||||
assert LazyType[DictStrAny](dict).isinstance(generation_config), "generation_config must be a dict."
|
||||
self.generation_config = self.generation_config.model_copy(update=generation_config, deep=True)
|
||||
else:
|
||||
# The rest of the extras fields should just be the generation_config.
|
||||
self.generation_config = self.generation_config.model_copy(update=self.__pydantic_extra__, deep=True)
|
||||
@@ -551,64 +531,48 @@ class LLMConfig(pydantic.BaseModel, ABC):
|
||||
except pydantic.ValidationError as e:
|
||||
raise openllm.exceptions.ValidationError(f"Failed to dump configuration to dict: {e}") from e
|
||||
|
||||
def with_options(self, __llm_config__: LLMConfig | None = None, **attrs: t.Any) -> LLMConfig:
|
||||
@classmethod
|
||||
def model_construct_env(cls, __llm_config__: LLMConfig | None = None, **attrs: t.Any) -> LLMConfig:
|
||||
"""A helpers that respect configuration values that
|
||||
sets from environment variables for any given configuration class.
|
||||
"""
|
||||
from_env_ = self.from_env()
|
||||
# filtered out None values
|
||||
attrs = {k: v for k, v in attrs.items() if v is not None}
|
||||
generation_keys = {k for k in attrs if k in self.generation_config.model_fields}
|
||||
|
||||
generation_attrs = {k: v for k, v in attrs.items() if k in generation_keys}
|
||||
config_attrs = {k: v for k, v in attrs.items() if k not in generation_keys}
|
||||
|
||||
# NOTE: first set the default config kwargs.
|
||||
# We will always respect envvar as default, then the one that is pass
|
||||
attrs = {**generate_kwargs_from_envvar(self), **config_attrs}
|
||||
env_json_string = os.environ.get(cls.__openllm_env__.model_config, None)
|
||||
if env_json_string is not None:
|
||||
try:
|
||||
self = cls.model_construct(**orjson.loads(env_json_string))
|
||||
except pydantic.ValidationError as e:
|
||||
raise RuntimeError(f"Failed to parse '{cls.__openllm_env__.model_config}' as valid JSON string.") from e
|
||||
else:
|
||||
self = cls.model_construct()
|
||||
|
||||
if __llm_config__ is not None:
|
||||
# NOTE: Only hit this branch on the server. Client shouldn't use __llm_config__
|
||||
attrs = {**attrs, **__llm_config__.model_dump()}
|
||||
# as it is not set.
|
||||
return self.model_construct(**__llm_config__.model_dump(flatten=True))
|
||||
|
||||
# NOTE: Then we setup generation config values
|
||||
attrs["generation_config"] = {
|
||||
**generate_kwargs_from_envvar(self.generation_config),
|
||||
**attrs.get("generation_config", {}),
|
||||
**generation_attrs,
|
||||
}
|
||||
# filtered out None values
|
||||
attrs = {k: v for k, v in attrs.items() if v is not None}
|
||||
|
||||
if from_env_:
|
||||
return from_env_.model_construct(**attrs)
|
||||
return self.model_construct(**attrs)
|
||||
construct_attrs = generate_kwargs_from_envvar(self)
|
||||
construct_attrs.update(generate_kwargs_from_envvar(self.generation_config))
|
||||
construct_attrs.update(attrs)
|
||||
|
||||
@classmethod
|
||||
def from_env(cls) -> LLMConfig | None:
|
||||
envvar = openllm.utils.MODEL_CONFIG_ENV_VAR(cls.__openllm_model_name__)
|
||||
env_json_string = os.environ.get(envvar, None)
|
||||
if env_json_string is None:
|
||||
return
|
||||
|
||||
try:
|
||||
return cls.model_construct(**orjson.loads(env_json_string))
|
||||
except pydantic.ValidationError as e:
|
||||
raise RuntimeError(f"Failed to parse environment variable '{envvar}' as a valid JSON string.") from e
|
||||
return self.model_construct(**construct_attrs)
|
||||
|
||||
def model_validate_click(self, **attrs: t.Any) -> tuple[LLMConfig, dict[str, t.Any]]:
|
||||
"""Parse given click attributes into a LLMConfig and return the remaining click attributes."""
|
||||
llm_config_attrs = {
|
||||
k[len(self.__openllm_model_name__) + 1 :]: v
|
||||
for k, v in attrs.items()
|
||||
if k[len(self.__openllm_model_name__) + 1 :] in self.model_fields
|
||||
}
|
||||
llm_config_attrs["generation_config"] = {
|
||||
k[len(self.__openllm_model_name__ + "_generation") + 1 :]: v
|
||||
for k, v in attrs.items()
|
||||
if k[len(self.__openllm_model_name__ + "_generation") + 1 :] in self.generation_config.model_fields
|
||||
}
|
||||
return self.with_options(**llm_config_attrs), {
|
||||
k: v for k, v in attrs.items() if not k.startswith(self.__openllm_model_name__)
|
||||
}
|
||||
llm_config_attrs = {}
|
||||
key_to_remove: list[str] = []
|
||||
|
||||
for k, v in attrs.items():
|
||||
if k.startswith(f"{self.__openllm_model_name__}_"):
|
||||
llm_config_attrs[k[len(self.__openllm_model_name__) + 1 :]] = v
|
||||
key_to_remove.append(k)
|
||||
elif k.startswith(f"{self.__openllm_model_name__}_generation_"):
|
||||
llm_config_attrs[k[len(self.__openllm_model_name__ + "_generation") + 1 :]] = v
|
||||
key_to_remove.append(k)
|
||||
|
||||
return self.model_construct_env(**llm_config_attrs), {k: v for k, v in attrs.items() if k not in key_to_remove}
|
||||
|
||||
@t.overload
|
||||
def to_generation_config(self, return_as_dict: t.Literal[True] = ...) -> dict[str, t.Any]:
|
||||
@@ -627,17 +591,25 @@ class LLMConfig(pydantic.BaseModel, ABC):
|
||||
|
||||
return config
|
||||
|
||||
def to_click_options(self, f: F[P]) -> t.Callable[[t.Callable[..., t.Any]], click.Command]:
|
||||
def to_click_options(self, f: F[P]) -> t.Callable[[F[P]], click.Command]:
|
||||
"""
|
||||
Convert current model to click options. This can be used as a decorator for click commands.
|
||||
Note that the identifier for all LLMConfig will be prefixed with '<model_name>_*', and the generation config
|
||||
will be prefixed with '<model_name>_generation_*'.
|
||||
"""
|
||||
wrapped_generation = self.generation_config.to_click_options(f)
|
||||
|
||||
for name, field in self.generation_config.model_fields.items():
|
||||
if t.get_origin(field.annotation) is t.Union:
|
||||
# NOTE: Union type is currently not yet supported, we probably just need to use environment instead.
|
||||
continue
|
||||
f = field_to_options(name, field, self.__openllm_model_name__, suffix_generation=True)(f)
|
||||
f = optgroup.group(f"{self.__class__.__name__} generation options")(f)
|
||||
|
||||
if len(self.model_fields.values()) == 0:
|
||||
return wrapped_generation
|
||||
return f
|
||||
for name, field in self.model_fields.items():
|
||||
wrapped_generation = field_to_options(name, field, self.__openllm_model_name__)(wrapped_generation)
|
||||
return optgroup.group(
|
||||
f"{self.__class__.__name__} options", help=f"[Auto-generated from '{self.__class__.__qualname__}']"
|
||||
)(wrapped_generation)
|
||||
if t.get_origin(field.annotation) is t.Union:
|
||||
# NOTE: Union type is currently not yet supported, we probably just need to use environment instead.
|
||||
continue
|
||||
f = field_to_options(name, field, self.__openllm_model_name__)(f)
|
||||
return optgroup.group(f"{self.__class__.__name__} options")(f)
|
||||
|
||||
@@ -440,8 +440,7 @@ class LLM(LLMInterface, metaclass=LLMMetaclass):
|
||||
# NOTE: The section below defines a loose contract with langchain's LLM interface.
|
||||
@property
|
||||
def llm_type(self) -> str:
|
||||
assert self.default_model is not None
|
||||
return openllm.utils.convert_transformers_model_name(self.default_model)
|
||||
return openllm.utils.convert_transformers_model_name(self._pretrained)
|
||||
|
||||
@property
|
||||
def identifying_params(self) -> dict[str, t.Any]:
|
||||
@@ -637,10 +636,10 @@ def Runner(start_name: str, **attrs: t.Any) -> bentoml.Runner:
|
||||
behaviour
|
||||
"""
|
||||
init_local = attrs.pop("init_local", False)
|
||||
envvar = openllm.utils.get_framework_env(start_name)
|
||||
if envvar == "flax":
|
||||
ModelEnv = openllm.utils.ModelEnv(start_name)
|
||||
if ModelEnv.get_framework_env() == "flax":
|
||||
runner = openllm.AutoFlaxLLM.create_runner(start_name, **attrs)
|
||||
elif envvar == "tf":
|
||||
elif ModelEnv.get_framework_env() == "tf":
|
||||
runner = openllm.AutoTFLLM.create_runner(start_name, **attrs)
|
||||
else:
|
||||
runner = openllm.AutoLLM.create_runner(start_name, **attrs)
|
||||
|
||||
@@ -70,6 +70,8 @@ def build_editable(path: str) -> str | None:
|
||||
|
||||
def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions:
|
||||
packages: list[str] = []
|
||||
|
||||
ModelEnv = openllm.utils.ModelEnv(llm.__openllm_start_name__)
|
||||
if llm.requirements is not None:
|
||||
packages.extend(llm.requirements)
|
||||
|
||||
@@ -89,11 +91,9 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions:
|
||||
]
|
||||
)
|
||||
|
||||
to_use_framework = utils.get_framework_env(llm.__openllm_start_name__)
|
||||
to_use_framework = ModelEnv.get_framework_env()
|
||||
if to_use_framework == "flax":
|
||||
assert (
|
||||
utils.is_flax_available()
|
||||
), f"Flax is not available, while {utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__)} is set to 'flax'"
|
||||
assert utils.is_flax_available(), f"Flax is not available, while {ModelEnv.framework} is set to 'flax'"
|
||||
packages.extend(
|
||||
[
|
||||
f"flax>={importlib.metadata.version('flax')}",
|
||||
@@ -102,9 +102,7 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions:
|
||||
]
|
||||
)
|
||||
elif to_use_framework == "tf":
|
||||
assert (
|
||||
utils.is_tf_available()
|
||||
), f"TensorFlow is not available, while {utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__)} is set to 'tf'"
|
||||
assert utils.is_tf_available(), f"TensorFlow is not available, while {ModelEnv.framework} is set to 'tf'"
|
||||
candidates = (
|
||||
"tensorflow",
|
||||
"tensorflow-cpu",
|
||||
@@ -137,11 +135,12 @@ def construct_python_options(llm: openllm.LLM, llm_fs: FS) -> PythonOptions:
|
||||
return PythonOptions(packages=packages, wheels=wheels, lock_packages=True)
|
||||
|
||||
|
||||
def construct_docker_options(llm: openllm.LLM, llm_fs: FS) -> DockerOptions:
|
||||
def construct_docker_options(llm: openllm.LLM, _: FS) -> DockerOptions:
|
||||
ModelEnv = openllm.utils.ModelEnv(llm.__openllm_start_name__)
|
||||
return DockerOptions(
|
||||
cuda_version="11.6", # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version
|
||||
env={
|
||||
utils.FRAMEWORK_ENV_VAR(llm.__openllm_start_name__): utils.get_framework_env(llm.__openllm_start_name__),
|
||||
ModelEnv.framework: ModelEnv.get_framework_env(),
|
||||
"OPENLLM_MODEL": llm.config.__openllm_model_name__,
|
||||
},
|
||||
system_packages=["git"],
|
||||
@@ -165,14 +164,16 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
|
||||
current_model_envvar = os.environ.pop("OPENLLM_MODEL", None)
|
||||
_previously_built = False
|
||||
|
||||
logger.debug("Packing '%s' into a Bento with kwargs=%s...", model_name, attrs)
|
||||
ModelEnv = openllm.utils.ModelEnv(model_name)
|
||||
|
||||
logger.info("Packing '%s' into a Bento with kwargs=%s...", model_name, attrs)
|
||||
|
||||
# NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
|
||||
# during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
|
||||
try:
|
||||
os.environ["OPENLLM_MODEL"] = inflection.underscore(model_name)
|
||||
|
||||
to_use_framework = openllm.utils.get_framework_env(model_name)
|
||||
to_use_framework = ModelEnv.get_framework_env()
|
||||
if to_use_framework == "flax":
|
||||
llm = openllm.AutoFlaxLLM.for_model(model_name, **attrs)
|
||||
elif to_use_framework == "tf":
|
||||
|
||||
@@ -30,7 +30,7 @@ svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", r
|
||||
route="/v1/generate",
|
||||
)
|
||||
async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput:
|
||||
config = llm_config.with_options(__llm_config__=qa.llm_config).model_dump()
|
||||
config = llm_config.model_construct_env(__llm_config__=qa.llm_config).model_dump()
|
||||
responses = await runner.generate.async_run(qa.prompt, **config)
|
||||
return openllm.GenerationOutput(responses=responses, configuration=config)
|
||||
|
||||
@@ -39,5 +39,5 @@ async def generate_v1(qa: openllm.GenerationInput) -> openllm.GenerationOutput:
|
||||
def metadata_v1(_: str) -> dict[str, t.Any]:
|
||||
return {
|
||||
"model_name": llm_config.__openllm_model_name__,
|
||||
"framework": openllm.utils.get_framework_env(llm_config.__openllm_model_name__),
|
||||
"framework": llm_config.__openllm_env__.get_framework_env(),
|
||||
}
|
||||
|
||||
@@ -314,7 +314,7 @@ def start_model_command(
|
||||
Note that the internal commands will return the llm_config and a boolean determine
|
||||
whether the server is run with GPU or not.
|
||||
"""
|
||||
envvar = openllm.utils.get_framework_env(model_name)
|
||||
ModelEnv = openllm.utils.ModelEnv(model_name)
|
||||
model_command_decr: dict[str, t.Any] = {
|
||||
"name": inflection.underscore(model_name),
|
||||
"context_settings": _context_settings or {},
|
||||
@@ -330,16 +330,15 @@ def start_model_command(
|
||||
{
|
||||
"name": config.__openllm_model_name__,
|
||||
"short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)",
|
||||
"help": getattr(
|
||||
openllm.utils.get_lazy_module(model_name),
|
||||
f"START_{inflection.underscore(model_name).upper()}_COMMAND_DOCSTRING",
|
||||
),
|
||||
"help": ModelEnv.start_docstring,
|
||||
"aliases": aliases if len(aliases) > 0 else None,
|
||||
}
|
||||
)
|
||||
|
||||
gpu_available = False
|
||||
try:
|
||||
config.check_if_gpu_is_available(envvar)
|
||||
config.check_if_gpu_is_available(ModelEnv.get_framework_env())
|
||||
gpu_available = True
|
||||
except openllm.exceptions.GpuNotAvailableError:
|
||||
# NOTE: The model requires GPU, therefore we will return a dummy command
|
||||
model_command_decr.update(
|
||||
@@ -353,7 +352,7 @@ def start_model_command(
|
||||
@factory.command(**model_command_decr)
|
||||
def noop() -> openllm.LLMConfig:
|
||||
click.secho("No GPU available, therefore this command is disabled", fg="red")
|
||||
openllm.utils.analytics.track_start_init(config, False)
|
||||
openllm.utils.analytics.track_start_init(config, gpu_available)
|
||||
return config
|
||||
|
||||
return noop
|
||||
@@ -371,15 +370,24 @@ def start_model_command(
|
||||
|
||||
configure_logging()
|
||||
|
||||
updated_config, server_kwds = config.model_validate_click(**attrs)
|
||||
openllm.utils.analytics.track_start_init(updated_config, False)
|
||||
updated_config, server_attrs = config.model_validate_click(**attrs)
|
||||
|
||||
server_kwds.update({"working_dir": os.path.dirname(__file__)})
|
||||
# NOTE: check for GPU one more time in cases this model doesn't requires GPU but users can still
|
||||
# run this model on GPU
|
||||
try:
|
||||
updated_config.check_if_gpu_is_available(ModelEnv.get_framework_env())
|
||||
gpu_available = True
|
||||
except openllm.exceptions.GpuNotAvailableError:
|
||||
gpu_available = False
|
||||
|
||||
openllm.utils.analytics.track_start_init(updated_config, gpu_available)
|
||||
|
||||
server_attrs.update({"working_dir": os.path.dirname(__file__)})
|
||||
if _serve_grpc:
|
||||
server_kwds["grpc_protocol_version"] = "v1"
|
||||
server_attrs["grpc_protocol_version"] = "v1"
|
||||
# NOTE: currently, theres no development args in bentoml.Server. To be fixed upstream.
|
||||
development = server_kwds.pop("development")
|
||||
server_kwds.setdefault("production", not development)
|
||||
development = server_attrs.pop("development")
|
||||
server_attrs.setdefault("production", not development)
|
||||
|
||||
start_env = os.environ.copy()
|
||||
|
||||
@@ -395,17 +403,17 @@ def start_model_command(
|
||||
|
||||
start_env.update(
|
||||
{
|
||||
openllm.utils.FRAMEWORK_ENV_VAR(model_name): envvar,
|
||||
openllm.utils.MODEL_CONFIG_ENV_VAR(model_name): updated_config.model_dump_json(),
|
||||
ModelEnv.framework: ModelEnv.get_framework_env(),
|
||||
ModelEnv.model_config: updated_config.model_dump_json(),
|
||||
"OPENLLM_MODEL": model_name,
|
||||
"BENTOML_DEBUG": str(get_debug_mode()),
|
||||
"BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
|
||||
}
|
||||
)
|
||||
|
||||
if envvar == "flax":
|
||||
if ModelEnv.get_framework_env() == "flax":
|
||||
llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained)
|
||||
elif envvar == "tf":
|
||||
elif ModelEnv.get_framework_env() == "tf":
|
||||
llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained)
|
||||
else:
|
||||
llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained)
|
||||
@@ -416,7 +424,7 @@ def start_model_command(
|
||||
)
|
||||
click.secho(f"Starting LLM Server for '{model_name}'\n", fg="blue")
|
||||
server_cls = getattr(bentoml, "HTTPServer" if not _serve_grpc else "GrpcServer")
|
||||
server: bentoml.server.Server = server_cls("_service.py:svc", **server_kwds)
|
||||
server: bentoml.server.Server = server_cls("_service.py:svc", **server_attrs)
|
||||
server.timeout = 90
|
||||
|
||||
try:
|
||||
@@ -449,8 +457,10 @@ def _start(
|
||||
|
||||
_serve_grpc = attrs.pop("_serve_grpc", False)
|
||||
|
||||
ModelEnv = openllm.utils.ModelEnv(model_name)
|
||||
|
||||
if framework is not None:
|
||||
os.environ[openllm.utils.FRAMEWORK_ENV_VAR(model_name)] = framework
|
||||
os.environ[ModelEnv.framework] = framework
|
||||
start_model_command(model_name, t.cast(OpenLLMCommandGroup, cli), _serve_grpc=_serve_grpc)(
|
||||
standalone_mode=False, **attrs
|
||||
)
|
||||
@@ -585,9 +595,12 @@ def list_supported_models(output: t.Literal["json", "pretty", "porcelain"]):
|
||||
except Exception as err:
|
||||
failed_initialized.append((m, err))
|
||||
_console.print(table)
|
||||
_console.print("\n[bold yellow] The following models are supported but failed to initialize:[/bold yellow]\n")
|
||||
for m, err in failed_initialized:
|
||||
_console.print(Text(f"- {m}: ") + Text(f"{err}\n", style="bold red"))
|
||||
if len(failed_initialized) > 0:
|
||||
_console.print(
|
||||
"\n[bold yellow] The following models are supported but failed to initialize:[/bold yellow]\n"
|
||||
)
|
||||
for m, err in failed_initialized:
|
||||
_console.print(Text(f"- {m}: ") + Text(f"{err}\n", style="bold red"))
|
||||
elif output == "json":
|
||||
result_json: dict[str, dict[t.Literal["variants", "description"], t.Any]] = {}
|
||||
for m in models:
|
||||
|
||||
@@ -53,7 +53,7 @@ class _LazyConfigMapping(ConfigOrderedDict):
|
||||
value = self._mapping[key]
|
||||
module_name = inflection.underscore(key)
|
||||
if module_name not in self._modules:
|
||||
self._modules[module_name] = openllm.utils.get_lazy_module(module_name)
|
||||
self._modules[module_name] = openllm.utils.ModelEnv(module_name).module
|
||||
if hasattr(self._modules[module_name], value):
|
||||
return getattr(self._modules[module_name], value)
|
||||
|
||||
@@ -93,10 +93,10 @@ class AutoConfig:
|
||||
raise EnvironmentError("Cannot instantiate Config. Please use `Config.for_model(model_name)` instead.")
|
||||
|
||||
@classmethod
|
||||
def for_model(cls, model_name: str, *args: t.Any, **attrs: t.Any) -> openllm.LLMConfig:
|
||||
def for_model(cls, model_name: str, **attrs: t.Any) -> openllm.LLMConfig:
|
||||
model_name = inflection.underscore(model_name)
|
||||
if model_name in CONFIG_MAPPING:
|
||||
return CONFIG_MAPPING[model_name]().with_options(*args, **attrs)
|
||||
return CONFIG_MAPPING[model_name].model_construct_env(**attrs)
|
||||
raise ValueError(
|
||||
f"Unrecognized configuration class for {model_name}. "
|
||||
f"Model name should be one of {', '.join(CONFIG_MAPPING.keys())}."
|
||||
|
||||
@@ -190,7 +190,7 @@ class _LazyAutoMapping(ConfigModelOrderedDict):
|
||||
def _load_attr_from_module(self, model_type: str, attr: str) -> t.Any:
|
||||
module_name = inflection.underscore(model_type)
|
||||
if module_name not in self._modules:
|
||||
self._modules[module_name] = openllm.utils.get_lazy_module(module_name)
|
||||
self._modules[module_name] = openllm.utils.ModelEnv(module_name).module
|
||||
return getattribute_from_module(self._modules[module_name], attr)
|
||||
|
||||
def keys(self):
|
||||
|
||||
@@ -81,7 +81,7 @@ class ChatGLM(openllm.LLM):
|
||||
else:
|
||||
prompt_text = prompt
|
||||
|
||||
generation_config = self.config.with_options(
|
||||
generation_config = self.config.model_construct_env(
|
||||
max_new_tokens=max_new_tokens,
|
||||
num_beams=num_beams,
|
||||
top_p=top_p,
|
||||
@@ -123,7 +123,7 @@ class ChatGLM(openllm.LLM):
|
||||
inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
|
||||
outputs = self.model.generate(
|
||||
**inputs,
|
||||
generation_config=self.config.with_options(
|
||||
generation_config=self.config.model_construct_env(
|
||||
max_new_tokens=max_new_tokens,
|
||||
num_beams=num_beams,
|
||||
top_p=top_p,
|
||||
|
||||
@@ -79,7 +79,7 @@ class DollyV2(openllm.LLM):
|
||||
) -> tuple[str, dict[str, t.Any]]:
|
||||
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt)
|
||||
|
||||
generation_config = self.config.with_options(
|
||||
generation_config = self.config.model_construct_env(
|
||||
max_new_tokens=max_new_tokens,
|
||||
temperature=temperature,
|
||||
top_k=top_k,
|
||||
@@ -112,7 +112,7 @@ class DollyV2(openllm.LLM):
|
||||
end_key_token_id = None
|
||||
eos_token_id = None
|
||||
|
||||
llm_config = self.config.with_options(
|
||||
llm_config = self.config.model_construct_env(
|
||||
max_new_tokens=max_new_tokens,
|
||||
temperature=temperature,
|
||||
top_k=top_k,
|
||||
|
||||
@@ -66,7 +66,7 @@ class Falcon(openllm.LLM):
|
||||
eos_token_id: int | None = None,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any]]:
|
||||
generation_config = self.config.with_options(
|
||||
generation_config = self.config.model_construct_env(
|
||||
max_new_tokens=max_new_tokens,
|
||||
top_k=top_k,
|
||||
num_return_sequences=num_return_sequences,
|
||||
@@ -95,7 +95,7 @@ class Falcon(openllm.LLM):
|
||||
return self.model(
|
||||
prompt,
|
||||
do_sample=True,
|
||||
generation_config=self.config.with_options(
|
||||
generation_config=self.config.model_construct_env(
|
||||
max_new_tokens=max_new_tokens,
|
||||
top_k=top_k,
|
||||
num_return_sequences=num_return_sequences,
|
||||
|
||||
@@ -48,7 +48,7 @@ class FlanT5(openllm.LLM):
|
||||
repetition_penalty: float | None = None,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any]]:
|
||||
return prompt, self.config.with_options(
|
||||
return prompt, self.config.model_construct_env(
|
||||
max_new_tokens=max_new_tokens,
|
||||
temperature=temperature,
|
||||
top_k=top_k,
|
||||
@@ -75,7 +75,7 @@ class FlanT5(openllm.LLM):
|
||||
result_tensor = self.model.generate(
|
||||
input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.with_options(
|
||||
generation_config=self.config.model_construct_env(
|
||||
max_new_tokens=max_new_tokens,
|
||||
temperature=temperature,
|
||||
top_k=top_k,
|
||||
|
||||
@@ -41,7 +41,7 @@ class FlaxFlanT5(openllm.LLM):
|
||||
repetition_penalty: float | None = None,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any]]:
|
||||
return prompt, self.config.with_options(
|
||||
return prompt, self.config.model_construct_env(
|
||||
max_new_tokens=max_new_tokens,
|
||||
temperature=temperature,
|
||||
top_k=top_k,
|
||||
@@ -67,7 +67,7 @@ class FlaxFlanT5(openllm.LLM):
|
||||
result_tensor = self.model.generate(
|
||||
input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.with_options(
|
||||
generation_config=self.config.model_construct_env(
|
||||
max_new_tokens=max_new_tokens,
|
||||
temperature=temperature,
|
||||
top_k=top_k,
|
||||
|
||||
@@ -41,7 +41,7 @@ class TFFlanT5(openllm.LLM):
|
||||
repetition_penalty: float | None = None,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[str, dict[str, t.Any]]:
|
||||
return prompt, self.config.with_options(
|
||||
return prompt, self.config.model_construct_env(
|
||||
max_new_tokens=max_new_tokens,
|
||||
temperature=temperature,
|
||||
top_k=top_k,
|
||||
@@ -67,7 +67,7 @@ class TFFlanT5(openllm.LLM):
|
||||
outputs = self.model.generate(
|
||||
input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.with_options(
|
||||
generation_config=self.config.model_construct_env(
|
||||
max_new_tokens=max_new_tokens,
|
||||
temperature=temperature,
|
||||
top_k=top_k,
|
||||
|
||||
@@ -116,7 +116,7 @@ class StarCoder(openllm.LLM):
|
||||
raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err
|
||||
prompt = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
|
||||
|
||||
return prompt, self.config.with_options(
|
||||
return prompt, self.config.model_construct_env(
|
||||
top_p=top_p,
|
||||
temperature=temperature,
|
||||
max_new_tokens=max_new_tokens,
|
||||
@@ -154,7 +154,7 @@ class StarCoder(openllm.LLM):
|
||||
result_tensor = self.model.generate(
|
||||
inputs,
|
||||
do_sample=True,
|
||||
generation_config=self.config.with_options(
|
||||
generation_config=self.config.model_construct_env(
|
||||
top_p=top_p,
|
||||
temperature=temperature,
|
||||
max_new_tokens=max_new_tokens,
|
||||
|
||||
@@ -26,6 +26,7 @@ import re
|
||||
import types
|
||||
import typing as t
|
||||
|
||||
import attrs
|
||||
import bentoml
|
||||
import inflection
|
||||
from bentoml._internal.types import LazyType as LazyType
|
||||
@@ -56,27 +57,37 @@ else:
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_object_setattr = object.__setattr__
|
||||
|
||||
|
||||
def get_lazy_module(model_name: str) -> LazyLoader:
|
||||
snaked_model_name = inflection.underscore(model_name)
|
||||
return LazyLoader(snaked_model_name, globals(), f"openllm.models.{snaked_model_name}")
|
||||
|
||||
|
||||
def FRAMEWORK_ENV_VAR(model_name: str) -> str:
|
||||
return f"OPENLLM_{inflection.underscore(model_name).upper()}_FRAMEWORK"
|
||||
@attrs.define
|
||||
class ModelEnv:
|
||||
model_name: str = attrs.field(converter=inflection.underscore)
|
||||
|
||||
@property
|
||||
def framework(self) -> str:
|
||||
return f"OPENLLM_{self.model_name.upper()}_FRAMEWORK"
|
||||
|
||||
def MODEL_CONFIG_ENV_VAR(model_name: str) -> str:
|
||||
return f"OPENLLM_{inflection.underscore(model_name).upper()}_CONFIG"
|
||||
@property
|
||||
def model_config(self) -> str:
|
||||
return f"OPENLLM_{self.model_name.upper()}_CONFIG"
|
||||
|
||||
@property
|
||||
def start_docstring(self) -> str:
|
||||
return getattr(self.module, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING")
|
||||
|
||||
def get_framework_env(model_name: str) -> t.Literal["pt", "flax", "tf"]:
|
||||
envvar = os.environ.get(FRAMEWORK_ENV_VAR(model_name), "pt")
|
||||
if envvar not in ("pt", "tf", "flax"):
|
||||
raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'")
|
||||
return envvar
|
||||
@property
|
||||
def module(self) -> LazyLoader:
|
||||
return LazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}")
|
||||
|
||||
def get_framework_env(self) -> t.Literal["pt", "flax", "tf"]:
|
||||
envvar = os.environ.get(self.framework, "pt")
|
||||
if envvar not in ("pt", "tf", "flax"):
|
||||
raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'")
|
||||
return envvar
|
||||
|
||||
|
||||
def convert_transformers_model_name(name: str) -> str:
|
||||
|
||||
@@ -79,7 +79,7 @@ class PromptTemplate:
|
||||
|
||||
@classmethod
|
||||
def from_default(cls, model: str) -> PromptTemplate:
|
||||
template = getattr(openllm.utils.get_lazy_module(model), "DEFAULT_PROMPT_TEMPLATE")
|
||||
template = getattr(openllm.utils.ModelEnv(model).module, "DEFAULT_PROMPT_TEMPLATE")
|
||||
if template is None:
|
||||
raise ValueError(f"Model {model} does not have a default prompt template.")
|
||||
return cls.from_template(template)
|
||||
|
||||
@@ -109,7 +109,7 @@ class BaseClient(ClientMixin):
|
||||
def query(self, prompt: str, **attrs: t.Any) -> dict[str, t.Any] | str:
|
||||
return_raw_response = attrs.pop("return_raw_response", False)
|
||||
prompt, attrs = self.llm.preprocess_parameters(prompt, **attrs)
|
||||
inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.with_options(**attrs))
|
||||
inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**attrs))
|
||||
r = openllm.GenerationOutput(**self.call("generate", inputs))
|
||||
|
||||
if return_raw_response:
|
||||
@@ -132,7 +132,7 @@ class BaseAsyncClient(ClientMixin):
|
||||
async def query(self, prompt: str, **attrs: t.Any) -> dict[str, t.Any] | str:
|
||||
return_raw_response = attrs.pop("return_raw_response", False)
|
||||
prompt, attrs = self.llm.preprocess_parameters(prompt, **attrs)
|
||||
inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.with_options(**attrs))
|
||||
inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**attrs))
|
||||
res = await self.acall("generate", inputs)
|
||||
r = openllm.GenerationOutput(**res)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user