perf: build quantization and better transformer behaviour (#28)

Fixes quantization_config and low_cpu_mem_usage to be available on PyTorch implementation only

See changelog for more details on #28
This commit is contained in:
Aaron Pham
2023-06-17 08:56:14 -04:00
committed by GitHub
parent 233d4697b5
commit 6f724416c0
23 changed files with 1159 additions and 853 deletions

View File

@@ -28,22 +28,6 @@ defaults:
run:
shell: bash --noprofile --norc -exo pipefail {0}
jobs:
codestyle_check:
runs-on: ubuntu-latest
if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Setup CI
uses: ./.github/actions/setup-repo
- name: Running changelog check
run: hatch run changelog
- name: Format and lint check
run: hatch run fmt
- name: Type check
if: ${{ github.event_name == 'pull_request' }}
run: git diff --name-only --diff-filter=AM "origin/$GITHUB_BASE_REF" -z -- '*.py{,i}' | xargs -0 --no-run-if-empty hatch run dev:typing
tests:
runs-on: ubuntu-latest
if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}

View File

@@ -14,7 +14,7 @@
ci:
autoupdate_schedule: weekly
skip: [check-models-table-update, check-models-table-update]
skip: [check-models-table-update, check-models-table-update, changelog-dry-run]
exclude: '.*\.(css|js|svg)$'
repos:
- repo: https://github.com/charliermarsh/ruff-pre-commit
@@ -51,13 +51,16 @@ repos:
typings/.*|
.github/.*
)$
- repo: local
hooks:
- id: check-models-table-update
name: check if table in README.md is up-to-date
entry: ./tools/assert-model-table-latest
language: script
files: README.md
- id: changelog-dry-run
name: Running changelog dry-run
entry: hatch run changelog
language: system
files: CHANGELOG.md
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:

View File

@@ -1,14 +1,22 @@
Added support for quantization during serving time.
`openllm start` now support `--quantize 8bit` and `--quantize 4bit`
`GPTQ` quantization support is on the roadmap and currently
being worked on.
`openllm start` now supports `--quantize int8` and `--quantize int4`. `GPTQ`
quantization support is on the roadmap and currently being worked on.
`openllm start` now also supports `--bettertransformer` to use
`BetterTransformer` for serving
Refactored `openllm.LLMConfig` to be able to use with `__getitem__`
to acecss the config value: `openllm.DollyV2Config()['requirements']`
the order being: `__openllm_*__ > self.<key> > __openllm_generation_class__ > __openllm_extras__`
`BetterTransformer` for serving.
Refactored `openllm.LLMConfig` to be able to use with `__getitem__`:
`openllm.DollyV2Config()['requirements']`.
The access order being:
`__openllm_*__ > self.<key> > __openllm_generation_class__ > __openllm_extras__`.
Added `towncrier` workflow to easily generate changelog entries
Added `use_pipeline`, `bettertransformer` flag into ModelSettings
`LLMConfig` now supported `__dataclass_transform__` protocol to help
with type-checking
Changed `openllm download-models` to `openllm download`
`LLMConfig` now supports the `__dataclass_transform__` protocol to help with
type-checking
`openllm download-models` now becomes `openllm download`

14
changelog.d/28.change.md Normal file
View File

@@ -0,0 +1,14 @@
`--quantize` now takes `int8, int4` instead of `8bit, 4bit` to be consistent
with bitsandbytes concept.
`openllm CLI` now caches all available model commands, allowing faster startup time.
Fixes `openllm start model-id --debug` to filter out debug log messages from
`bentoml.Server`.
`--model-id` from `openllm start` now supports choices for easier selection.
Updated `ModelConfig` implementation with `__getitem__` and auto-generated values.
Cleaned up the CLI and improved loading time; `openllm start` should be 'blazingly
fast'.

View File

@@ -25,7 +25,7 @@ deploy, and monitor any LLMs with ease.
"""
from __future__ import annotations
import logging as _
import logging
import typing as t
from . import utils as utils
@@ -33,15 +33,11 @@ from .__about__ import __version__ as __version__
from .exceptions import MissingDependencyError
if utils.DEBUG:
from bentoml._internal.configuration import set_debug_mode, set_quiet_mode
utils.set_debug_mode(True)
utils.set_quiet_mode(False)
set_debug_mode(True)
set_quiet_mode(False)
from bentoml._internal.log import configure_logging
configure_logging()
_.basicConfig(level=_.NOTSET)
utils.configure_logging()
logging.basicConfig(level=logging.NOTSET)
_import_structure = {
@@ -147,7 +143,6 @@ if t.TYPE_CHECKING:
from . import exceptions as exceptions
from . import models as models
from . import playground as playground
# Specific types import
from ._configuration import LLMConfig as LLMConfig
from ._llm import LLM as LLM
@@ -160,7 +155,8 @@ if t.TYPE_CHECKING:
from .cli import start as start
from .cli import start_grpc as start_grpc
from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING
from .models.auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
from .models.auto import \
MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
from .models.auto import MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES
from .models.auto import AutoConfig as AutoConfig
@@ -234,5 +230,11 @@ else:
globals()["__file__"],
_import_structure,
module_spec=__spec__,
extra_objects={"__version__": __version__},
extra_objects={
"__version__": __version__,
# The below is a special mapping that allows openllm to be used as a dictionary.
# This is purely for convenience's sake, and should not be used in performance-critical
# code. This is also not considered a public API.
"__openllm_special__": {"flax": "AutoFlaxLLM", "tf": "AutoTFLLM", "pt": "AutoLLM"},
},
)

View File

@@ -395,16 +395,7 @@ bentoml_cattr.register_unstructure_hook_factory(
)
def _populate_value_from_env_var(
key: str, transform: t.Callable[[str], str] | None = None, fallback: t.Any = None
) -> t.Any:
if transform is not None and callable(transform):
key = transform(key)
return os.environ.get(key, fallback)
def _field_env_key(model_name: str, key: str, suffix: str | None = None) -> str:
def _field_env_key(model_name: str, key: str, suffix: str | t.Literal[""] | None = None) -> str:
return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key])))
@@ -425,6 +416,7 @@ class ModelSettings(t.TypedDict, total=False):
url: str
requires_gpu: bool
trust_remote_code: bool
service_name: NotRequired[str]
requirements: t.Optional[ListStr]
# llm implementation specifics
@@ -448,128 +440,174 @@ class ModelSettings(t.TypedDict, total=False):
generation_class: t.Type[GenerationConfig]
_ModelSettings: type[attr.AttrsInstance] = codegen.add_method_dunders(
type("__openllm_internal__", (ModelSettings,), {"__module__": "openllm._configuration"}),
attr.make_class(
"ModelSettings",
{
k: dantic.Field(
def _settings_field_transformer(
_: type[attr.AttrsInstance], __: list[attr.Attribute[t.Any]]
) -> list[attr.Attribute[t.Any]]:
return [
attr.Attribute.from_counting_attr(
k,
dantic.Field(
kw_only=False if t.get_origin(ann) is not Required else True,
auto_default=True,
use_default_converter=False,
type=ann,
metadata={
"target": f"__openllm_{k}__",
"required": False if t.get_origin(ann) is NotRequired else t.get_origin(ann) is Required,
},
metadata={"target": f"__openllm_{k}__"},
description=f"ModelSettings field for {k}.",
)
for k, ann in t.get_type_hints(ModelSettings).items()
},
bases=(DictStrAny,),
slots=True,
weakref_slot=True,
collect_by_mro=True,
),
_overwrite_doc="Internal attrs representation of ModelSettings.",
)
),
)
for k, ann in t.get_type_hints(ModelSettings).items()
]
def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
@attr.define(slots=True, field_transformer=_settings_field_transformer, frozen=False)
class _ModelSettingsAttr:
"""Internal attrs representation of ModelSettings."""
# No fields are declared in this body; the field list is produced by the
# `_settings_field_transformer` passed to `attr.define` above.
# Dict-style read access: only keys annotated on ModelSettings are accepted,
# any other key raises KeyError.
def __getitem__(self, key: str) -> t.Any:
if key in codegen.get_annotations(ModelSettings):
# NOTE(review): `_object_getattribute` is defined outside this view;
# presumably it is `object.__getattribute__` -- confirm.
return _object_getattribute(self, key)
raise KeyError(key)
# Alternate constructor: returns an instance populated with baseline values.
@classmethod
def default(cls) -> _ModelSettingsAttr:
# The baseline values are assembled as a ModelSettings dict first, then
# expanded as keyword arguments into the attrs-generated constructor.
# "__default__" is a placeholder for default_id / model_ids.
_ = ModelSettings(
default_id="__default__",
model_ids=["__default__"],
name_type="dasherize",
requires_gpu=False,
url="",
use_pipeline=False,
model_type="causal_lm",
trust_remote_code=False,
requirements=None,
timeout=3600,
service_name="",
workers_per_resource=1,
runtime="transformers",
)
return cls(**t.cast(DictStrAny, _))
def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]):
if not lenient_issubclass(cl_, LLMConfig):
raise RuntimeError(f"Given LLMConfig must be a subclass type of 'LLMConfig', got '{cl_}' instead.")
raise RuntimeError(f"Given '{cl_}' must be a subclass type of 'LLMConfig', got '{cl_}' instead.")
if not hasattr(cl_, "__config__") or getattr(cl_, "__config__") is None:
raise RuntimeError("Given LLMConfig must have '__config__' that is not None defined.")
settings = cl_.__config__
assert settings
assert cl_.__config__ is not None
required = [i.name for i in attr.fields(cls) if i.metadata.get("required", False)]
missing = set(required) - set(settings.keys())
if len(missing) > 0:
raise ValueError(f"The following keys are required under '__config__': {required} (missing: {missing})")
if "generation_class" in settings:
if "generation_class" in cl_.__config__:
raise ValueError(
"'generation_class' shouldn't be defined in '__config__', rather defining "
f"all required attributes under '{cl_}.GenerationConfig' when defining the class."
f"all required attributes under '{cl_}.GenerationConfig' instead."
)
if not settings["default_id"] or not settings["model_ids"]:
_cl_name = cl_.__name__.replace("Config", "")
_settings_attr = _ModelSettingsAttr.default()
try:
cls(**t.cast(DictStrAny, cl_.__config__))
_settings_attr = attr.evolve(_settings_attr, **t.cast(DictStrAny, cl_.__config__))
except TypeError:
raise ValueError("Either 'default_id' or 'model_ids' are emptied under '__config__' (required fields).")
# NOTE: value in __config__ can be None, hense we use setdefault
# to update in-place
_cl_name = cl_.__name__.replace("Config", "")
name_type = settings.setdefault("name_type", "dasherize")
model_name = settings.setdefault(
"model_name", inflection.underscore(_cl_name) if name_type == "dasherize" else _cl_name.lower()
_final_value_dct: DictStrAny = {
"model_name": inflection.underscore(_cl_name)
if _settings_attr["name_type"] == "dasherize"
else _cl_name.lower()
}
_final_value_dct["start_name"] = (
inflection.dasherize(_final_value_dct["model_name"])
if _settings_attr["name_type"] == "dasherize"
else _final_value_dct["model_name"]
)
partialed = functools.partial(_field_env_key, model_name=model_name, suffix="generation")
env = openllm.utils.ModelEnv(_final_value_dct["model_name"])
_final_value_dct["env"] = env
def auto_env_transformers(_: t.Any, fields: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]:
_has_own_gen = codegen.has_own_attribute(cl_, "GenerationConfig")
return [
f.evolve(
default=_populate_value_from_env_var(
partialed(key=f.name),
fallback=getattr(cl_.GenerationConfig, f.name, f.default) if _has_own_gen else f.default,
),
metadata={"env": partialed(key=f.name), "description": f.metadata.get("description", "(not provided)")},
converter=None,
)
for f in fields
]
# bettertransformer support
if _settings_attr["bettertransformer"] is None:
_final_value_dct["bettertransformer"] = (
os.environ.get(env.bettertransformer, str(False)).upper() in ENV_VARS_TRUE_VALUES
)
if _settings_attr["requires_gpu"]:
# if requires_gpu is True, then disable BetterTransformer for quantization.
_final_value_dct["bettertransformer"] = False
settings.setdefault(
"generation_class",
attr.make_class(
f"{_cl_name}GenerationConfig",
[],
bases=(GenerationConfig,),
slots=True,
weakref_slot=True,
frozen=False,
repr=True,
collect_by_mro=True,
field_transformer=auto_env_transformers,
_final_value_dct["service_name"] = f"generated_{_final_value_dct['model_name']}_service.py"
_final_value_dct["generation_class"] = attr.make_class(
f"{_cl_name}GenerationConfig",
[],
bases=(GenerationConfig,),
slots=True,
weakref_slot=True,
frozen=True,
repr=True,
collect_by_mro=True,
field_transformer=_make_env_transformer(
cl_,
_final_value_dct["model_name"],
suffix="generation",
default_callback=lambda field_name, field_default: getattr(cl_.GenerationConfig, field_name, field_default)
if codegen.has_own_attribute(cl_, "GenerationConfig")
else field_default,
globs={"cl_": cl_},
),
)
env = settings.setdefault("env", openllm.utils.ModelEnv(model_name))
requires_gpu = settings.setdefault("requires_gpu", False)
return attr.evolve(_settings_attr, **_final_value_dct)
# bettertransformer support
bettertransformer = settings.setdefault(
"bettertransformer",
os.environ.get(env.bettertransformer, str(False)).upper() in ENV_VARS_TRUE_VALUES,
bentoml_cattr.register_structure_hook(_ModelSettingsAttr, structure_settings)
def _make_env_transformer(
cls: type[LLMConfig],
model_name: str,
suffix: t.LiteralString | None = None,
default_callback: t.Callable[[str, t.Any], t.Any] | None = None,
globs: DictStrAny | None = None,
):
def identity(_: str, x_value: t.Any) -> t.Any:
return x_value
default_callback = identity if default_callback is None else default_callback
globs = {} if globs is None else globs
globs.update(
{
"functools": functools,
"__populate_env": dantic.env_converter,
"__default_callback": default_callback,
"__field_env": _field_env_key,
"__suffix": suffix or "",
"__model_name": model_name,
}
)
if requires_gpu:
# For all models that requires GPU, no need to offload it to BetterTransformer
# use bitsandbytes or gptq instead for latency improvement
if bettertransformer:
logger.debug("Model requires GPU by default, disabling bettertransformer.")
bettertransformer = False
settings["bettertransformer"] = bettertransformer
# default value
settings.setdefault("url", "")
settings.setdefault("use_pipeline", False)
settings.setdefault("model_type", "causal_lm")
settings.setdefault("trust_remote_code", False)
settings.setdefault("requirements", None)
settings.setdefault("timeout", 3600)
settings.setdefault("workers_per_resource", 1)
settings.setdefault("runtime", "transformers")
settings.setdefault("start_name", inflection.dasherize(model_name) if name_type == "dasherize" else model_name)
lines: ListStr = [
"__env = lambda field_name: __field_env(__model_name, field_name, __suffix)",
"return [",
" f.evolve(",
" default=__populate_env(__default_callback(f.name, f.default), __env(f.name)),",
" metadata={",
" 'env': f.metadata.get('env', __env(f.name)),",
" 'description': f.metadata.get('description', '(not provided)'),",
" },",
" )",
" for f in fields",
"]",
]
fields_ann = "list[attr.Attribute[t.Any]]"
return cls(**settings)
bentoml_cattr.register_structure_hook(_ModelSettings, structure_settings)
return codegen.generate_function(
cls,
"__auto_env",
lines,
args=("_", "fields"),
globs=globs,
annotations={"_": "type[LLMConfig]", "fields": fields_ann, "return": fields_ann},
)
def _setattr_class(attr_name: str, value_var: t.Any, add_dunder: bool = False):
@@ -577,6 +615,10 @@ def _setattr_class(attr_name: str, value_var: t.Any, add_dunder: bool = False):
Use the builtin setattr to set *attr_name* to *value_var*.
We can't use the cached object.__setattr__ since we are setting
attributes to a class.
If add_dunder to True, the generated globs should include a __add_dunder
value that will be used to add the dunder methods to the class for given
value_var
"""
val = f"__add_dunder(cls, {value_var})" if add_dunder else value_var
return f"setattr(cls, '{attr_name}', {val})"
@@ -742,6 +784,23 @@ class LLMConfig:
# NOTE: The following will be populated from __config__ and also
# considered to be public API.
__openllm_default_id__: str = Field(None)
"""Return the default model to use when using 'openllm start <model_id>'.
This could be one of the keys in 'self.model_ids' or custom users model.
This field is required when defining under '__config__'.
"""
__openllm_model_ids__: ListStr = Field(None)
"""A list of supported pretrained models tag for this given runnable.
For example:
For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base",
"google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"]
This field is required when defining under '__config__'.
"""
__openllm_url__: str = Field(None, init=False)
"""The resolved url for this LLMConfig."""
@@ -751,46 +810,13 @@ class LLMConfig:
__openllm_trust_remote_code__: bool = Field(False)
"""Whether to always trust remote code"""
__openllm_service_name__: str = Field(None)
"""Generated service name for this LLMConfig. By default, it is 'generated_{model_name}_service.py'"""
__openllm_requirements__: ListStr | None = Field(None)
"""The default PyPI requirements needed to run this given LLM. By default, we will depend on
bentoml, torch, transformers."""
__openllm_env__: openllm.utils.ModelEnv = Field(None, init=False)
"""A ModelEnv instance for this LLMConfig."""
__openllm_model_name__: str = Field("")
"""The normalized version of __openllm_start_name__, determined by __openllm_name_type__"""
__openllm_model_type__: t.Literal["causal_lm", "seq2seq_lm"] = Field("causal_lm")
"""The model type for this given LLM. By default, it should be causal language modeling.
Currently supported 'causal_lm' or 'seq2seq_lm'
"""
__openllm_start_name__: str = Field("")
"""Default name to be used with `openllm start`"""
__openllm_name_type__: t.Literal["dasherize", "lowercase"] = Field("dasherize")
"""the default name typed for this model. "dasherize" will convert the name to lowercase and
replace spaces with dashes. "lowercase" will convert the name to lowercase."""
__openllm_timeout__: int = Field(36000)
"""The default timeout to be set for this given LLM."""
__openllm_workers_per_resource__: int | float = Field(1)
"""The number of workers per resource. This is used to determine the number of workers to use for this model.
For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource.
See StarCoder for more advanced usage. See
https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy for more details.
By default, it is set to 1.
"""
__openllm_runtime__: t.Literal["transformers", "cpp"] = Field("transformers")
"""The runtime to use for this model. Possible values are `transformers` or `cpp`. See
LlaMA for more information."""
__openllm_use_pipeline__: bool = Field(False)
"""Whether this LLM will use HuggingFace Pipeline API. By default, this is set to False.
The reason for this to be here is because we want to access this object before loading
@@ -804,16 +830,40 @@ class LLMConfig:
and set to False for every other models.
"""
__openllm_default_id__: str = Field(None)
"""Return the default model to use when using 'openllm start <model_id>'.
This could be one of the keys in 'self.model_ids' or custom users model."""
__openllm_model_type__: t.Literal["causal_lm", "seq2seq_lm"] = Field("causal_lm")
"""The model type for this given LLM. By default, it should be causal language modeling.
Currently supported 'causal_lm' or 'seq2seq_lm'
"""
__openllm_model_ids__: ListStr = Field(None)
"""A list of supported pretrained models tag for this given runnable.
__openllm_runtime__: t.Literal["transformers", "cpp"] = Field("transformers")
"""The runtime to use for this model. Possible values are `transformers` or `cpp`. See
LlaMA for more information."""
For example:
For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base",
"google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"]
__openllm_name_type__: t.Literal["dasherize", "lowercase"] = Field("dasherize")
"""the default name typed for this model. "dasherize" will convert the name to lowercase and
replace spaces with dashes. "lowercase" will convert the name to lowercase."""
__openllm_model_name__: str = Field(None)
"""The normalized version of __openllm_start_name__, determined by __openllm_name_type__"""
__openllm_start_name__: str = Field(None)
"""Default name to be used with `openllm start`"""
__openllm_env__: openllm.utils.ModelEnv = Field(None, init=False)
"""A ModelEnv instance for this LLMConfig."""
__openllm_timeout__: int = Field(36000)
"""The default timeout to be set for this given LLM."""
__openllm_workers_per_resource__: int | float = Field(1)
"""The number of workers per resource. This is used to determine the number of workers to use for this model.
For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource.
See StarCoder for more advanced usage. See
https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy for more details.
By default, it is set to 1.
"""
__openllm_generation_class__: type[GenerationConfig] = Field(None, init=False)
@@ -835,23 +885,10 @@ class LLMConfig:
cls.__name__ = f"{cls.__name__}Config"
# NOTE: auto assignment attributes generated from __config__
_make_assignment_script(cls, bentoml_cattr.structure(cls, _ModelSettings))(cls)
_make_assignment_script(cls, bentoml_cattr.structure(cls, _ModelSettingsAttr))(cls)
# process a fields under cls.__dict__ and auto convert them with dantic.Field
cd = cls.__dict__
anns = codegen.get_annotations(cls)
partialed = functools.partial(_field_env_key, model_name=cls.__openllm_model_name__)
def auto_config_env(_: type[LLMConfig], attrs: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]:
return [
a.evolve(
default=_populate_value_from_env_var(partialed(key=a.name), fallback=a.default),
metadata={
"env": a.metadata.get("env", partialed(key=a.name)),
"description": a.metadata.get("description", "(not provided)"),
},
)
for a in attrs
]
# _CountingAttr is the underlying representation of attr.field
ca_names = {name for name, attr in cd.items() if isinstance(attr, _CountingAttr)}
@@ -864,9 +901,9 @@ class LLMConfig:
val = cd.get(attr_name, attr.NOTHING)
if not LazyType["_CountingAttr[t.Any]"](_CountingAttr).isinstance(val):
if val is attr.NOTHING:
val = cls.Field(env=partialed(key=attr_name))
val = cls.Field(env=_field_env_key(cls.__openllm_model_name__, attr_name))
else:
val = cls.Field(default=val, env=partialed(key=attr_name))
val = cls.Field(default=val, env=_field_env_key(cls.__openllm_model_name__, attr_name))
these[attr_name] = val
unannotated = ca_names - annotated_names
if len(unannotated) > 0:
@@ -894,7 +931,7 @@ class LLMConfig:
False, # disable auto_attribs, since we already handle these
False, # disable kw_only
True, # collect_by_mro
field_transformer=auto_config_env,
field_transformer=_make_env_transformer(cls, cls.__openllm_model_name__),
)
_weakref_slot = True # slots = True
_base_names = {a.name for a in base_attrs}
@@ -910,7 +947,7 @@ class LLMConfig:
_make_init(
cls, # cls (the attrs-decorated class)
attrs, # tuple of attr.Attribute of cls
_has_pre_init, # pre_initjk
_has_pre_init, # pre_init
_has_post_init, # post_init
False, # frozen
True, # slots
@@ -1047,14 +1084,14 @@ class LLMConfig:
def __getattribute__(self, item: str) -> t.Any:
if item in _reserved_namespace:
raise ForbiddenAttributeError(
f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified."
f"'{item}' belongs to a private namespace for {self.__class__} and should not be access nor modified."
)
return _object_getattribute.__get__(self)(item)
@classmethod
def check_if_gpu_is_available(cls, implementation: str | None = None, force: bool = False):
if implementation is None:
implementation = cls.__openllm_env__.get_framework_env()
implementation = cls.__openllm_env__["framework_value"]
try:
if cls.__openllm_requires_gpu__ or force:
@@ -1091,7 +1128,7 @@ class LLMConfig:
"""
attrs = {k: v for k, v in attrs.items() if v is not None}
model_config = cls.__openllm_env__.model_config
model_config = cls.__openllm_env__.config
env_json_string = os.environ.get(model_config, None)

View File

@@ -35,14 +35,17 @@ from bentoml._internal.types import ModelSignatureDict
import openllm
from .exceptions import ForbiddenAttributeError, OpenLLMException
from .utils import (LazyLoader, bentoml_cattr, is_bitsandbytes_available,
non_intrusive_setattr)
from .utils import (DEBUG, LazyLoader, ModelEnv, bentoml_cattr, first_not_none,
get_debug_mode, is_bitsandbytes_available,
is_torch_available, non_intrusive_setattr, pkg)
if t.TYPE_CHECKING:
import torch
import transformers
from bentoml._internal.runner.strategy import Strategy
from .models.auto.factory import _BaseAutoLLMClass
class LLMRunner(bentoml.Runner):
__doc__: str
__module__: str
@@ -170,7 +173,7 @@ def import_model(
# NOTE: We need to free up the cache after importing the model
# in the case where users first run openllm start without the model
# available locally.
if openllm.utils.is_torch_available() and torch.cuda.is_available():
if is_torch_available() and torch.cuda.is_available():
torch.cuda.empty_cache()
@@ -314,16 +317,25 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
model_id: str | None = None,
llm_config: openllm.LLMConfig | None = None,
*args: t.Any,
quantize: t.Literal["int8", "int4", "gptq"] | None = None,
bettertransformer: bool | None = None,
**attrs: t.Any,
) -> LLM[_M, _T]:
return cls(model_id=model_id, llm_config=llm_config, *args, **attrs)
return cls(
model_id=model_id,
llm_config=llm_config,
*args,
quantize=quantize,
bettertransformer=bettertransformer,
**attrs,
)
def __init__(
self,
model_id: str | None = None,
llm_config: openllm.LLMConfig | None = None,
*args: t.Any,
quantize: t.Literal["8bit", "4bit", "gptq"] | None = None,
quantize: t.Literal["int8", "int4", "gptq"] | None = None,
bettertransformer: bool | None = None,
**attrs: t.Any,
):
@@ -402,7 +414,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
will use `config_class` to construct default configuration.
quantize: The quantization to use for this LLM. Defaults to None. Possible values
include 8bit, 4bit and gptq.
include int8, int4 and gptq.
bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
*args: The args to be passed to the model.
**attrs: The kwargs to be passed to the model.
@@ -431,6 +443,14 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
int4_quant_type = attrs.pop("llm_bnb_4bit_quant_type", "nf4")
int4_use_double_quant = attrs.pop("llm_bnb_4bit_use_double_quant", True)
if llm_config is not None:
logger.debug("Using given 'llm_config=(%s)' to initialize LLM.", llm_config)
self.config = llm_config
else:
self.config = self.config_class.model_construct_env(**attrs)
# The rest of the kwargs that are not used by the config class are stored in __openllm_extras__.
attrs = self.config["extras"]
if quantization_config and quantize:
raise ValueError(
"""'quantization_config' and 'quantize' are mutually exclusive. Either customise
@@ -452,7 +472,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
self,
quantize,
)
if quantize == "8bit":
if quantize == "int8":
if int8_skip_modules is None:
int8_skip_modules = []
if "lm_head" not in int8_skip_modules and self.config["model_type"] == "causal_lm":
@@ -465,8 +485,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
llm_int8_skip_modules=int8_skip_modules,
llm_int8_has_fp16_weight=int8_has_fp16_weight,
)
elif quantize == "4bit":
trf_versions = openllm.utils.pkg.pkg_version_info("transformers")
elif quantize == "int4":
trf_versions = pkg.pkg_version_info("transformers")
supports_kbits = trf_versions[:2] >= (4, 30)
if supports_kbits:
quantization_config = transformers.BitsAndBytesConfig(
@@ -477,7 +497,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
)
else:
logger.warning(
"'quantize' is set to 4bit, while the current transformers version %s does not support "
"'quantize' is set to int4, while the current transformers version %s does not support "
"k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore "
"make sure to install the latest version of transformers either via PyPI or "
"from git source: 'pip install git+https://github.com/huggingface/transformers'.",
@@ -495,20 +515,12 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
)
raise NotImplementedError("GPTQ is not supported yet.")
else:
raise ValueError(f"'quantize' must be one of ['8bit', '4bit', 'gptq'], got {quantize} instead.")
raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantize} instead.")
attrs.update({"quantization_config": quantization_config})
if llm_config is not None:
logger.debug("Using given 'llm_config=(%s)' to initialize LLM", llm_config)
self.config = llm_config
else:
self.config = self.config_class.model_construct_env(**attrs)
# The rests of the kwargs that is not used by the config class should be stored into __openllm_extras__.
attrs = self.config["extras"]
if not self.config["use_pipeline"]:
attrs["low_cpu_mem_usage"] = low_cpu_mem_usage
if self.__llm_implementation__ == "pt":
if not self.config["use_pipeline"]:
attrs["low_cpu_mem_usage"] = low_cpu_mem_usage
attrs["quantization_config"] = quantization_config
model_kwds, tokenizer_kwds = {}, {}
if self.__llm_init_kwargs__:
@@ -527,8 +539,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
model_id = os.environ.get(self.config["env"].model_id, self.config["default_id"])
# NOTE: This is the actual given path or pretrained weight for this LLM.
if t.TYPE_CHECKING:
assert model_id is not None
assert model_id is not None
self._model_id = model_id
# parsing tokenizer and model kwargs
@@ -590,6 +601,16 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
"model_ids": orjson.dumps(self.config["model_ids"]).decode(),
}
@property
def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], dict[str, t.Any]], dict[str, t.Any]]:
"""Return the processed model and tokenizer parameters.
These are meant to be used with 'import_model' or any other place that requires
loading the model and tokenizer.
See 'openllm.cli.download_models' for example usage.
Returns:
A pair of the form ((model_args, model_kwargs), tokenizer_kwargs).
"""
return (self._model_args, self._model_attrs), self._tokenizer_attrs
@staticmethod
def make_tag(
model_id: str | None = None,
@@ -638,6 +659,10 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
return bentoml.Tag.from_taglike(f"{implementation}-{name}:{model_version}")
def ensure_model_id_exists(self) -> bentoml.Model:
"""This utility function will download the model if it doesn't exist yet.
Make sure to call this function if 'ensure_available' is not set during
Auto LLM initialisation.
"""
output = subprocess.check_output(
[
sys.executable,
@@ -651,7 +676,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
"porcelain",
]
)
if openllm.utils.DEBUG:
if DEBUG or get_debug_mode():
# NOTE: This usually only concern BentoML devs.
pattern = r"^__tag__:[^:\n]+:[^:\n]+"
matched = re.search(pattern, output.decode("utf-8").strip(), re.MULTILINE)
@@ -665,7 +690,15 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
@property
def _bentomodel(self) -> bentoml.Model:
if self.__llm_bentomodel__ is None:
self.__llm_bentomodel__ = self.ensure_model_id_exists()
# NOTE: Since PR#28, self.__llm_bentomodel__ changed from
# ensure_model_id_exists() into just returning the model ref.
# This is because we want to save a few seconds of loading time,
# as openllm.Runner and openllm.AutoLLM initialisation is around 700ms
# before #28.
# If users want to make sure to have the model downloaded,
# one should invoke `LLM.ensure_model_id_exists()` manually,
# or pass `ensure_available=True` into the Auto LLM initialisation.
self.__llm_bentomodel__ = bentoml.transformers.get(self.tag)
return self.__llm_bentomodel__
@property
@@ -729,13 +762,14 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
)
return self.__llm_tokenizer__
# order of these fields matter here, make sure to sync it with
# openllm.models.auto.factory._BaseAutoLLMClass.for_model
def to_runner(
self,
models: list[bentoml.Model] | None = None,
max_batch_size: int | None = None,
max_latency_ms: int | None = None,
method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = None,
embedded: bool = False,
scheduling_strategy: type[Strategy] | None = None,
) -> LLMRunner:
"""Convert this LLM into a Runner.
@@ -753,6 +787,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
NOTE: There are some differences between bentoml.models.get().to_runner() and LLM.to_runner(): 'name'.
- 'name': will be generated by OpenLLM, hence users shouldn't worry about this.
The generated name will be 'llm-<model-start-name>-runner' (ex: llm-dolly-v2-runner, llm-chatglm-runner)
- 'embedded': Will be disabled by default. There is no reason to run LLM in embedded mode.
"""
models = models if models is not None else []
models.append(self._bentomodel)
@@ -768,10 +803,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
method_configs = {"generate": generate_sig, "generate_iterator": generate_iterator_sig}
else:
signatures = ModelSignature.convert_signatures_dict(method_configs)
generate_sig = openllm.utils.first_not_none(signatures.get("generate"), default=generate_sig)
generate_iterator_sig = openllm.utils.first_not_none(
signatures.get("generate_iterator"), default=generate_iterator_sig
)
generate_sig = first_not_none(signatures.get("generate"), default=generate_sig)
generate_iterator_sig = first_not_none(signatures.get("generate_iterator"), default=generate_iterator_sig)
class _Runnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
@@ -860,11 +893,11 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
},
),
name=self.runner_name,
embedded=False,
models=models,
max_batch_size=max_batch_size,
max_latency_ms=max_latency_ms,
method_configs=bentoml_cattr.unstructure(method_configs),
embedded=embedded,
scheduling_strategy=scheduling_strategy,
)
@@ -918,22 +951,28 @@ def Runner(
...
def Runner(model_name: str, **attrs: t.Any) -> LLMRunner:
def Runner(model_name: str, ensure_available: bool = True, init_local: bool = False, **attrs: t.Any) -> LLMRunner:
"""Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'
Args:
model_name: Supported model name from 'openllm models'
ensure_available: If True, it will ensure the model is available before creating the runner.
Set to False for faster creation time. Note that you will need to make sure
the model for this 'model_id' is available before calling the runner.
One can do this by doing the following:
```python
runner = openllm.Runner("dolly-v2", ensure_available=False)
runner.llm.ensure_model_id_exists()
```
init_local: If True, it will initialize the model locally. This is useful if you want to
run the model locally. (Symmetrical to bentoml.Runner.init_local())
**attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs
behaviour
"""
init_local = attrs.pop("init_local", False)
ModelEnv = openllm.utils.ModelEnv(model_name)
if ModelEnv.get_framework_env() == "flax":
runner = openllm.AutoFlaxLLM.create_runner(model_name, **attrs)
elif ModelEnv.get_framework_env() == "tf":
runner = openllm.AutoTFLLM.create_runner(model_name, **attrs)
else:
runner = openllm.AutoLLM.create_runner(model_name, **attrs)
runner = t.cast(
"_BaseAutoLLMClass",
openllm[ModelEnv(model_name)["framework_value"]], # type: ignore (internal API)
).create_runner(model_name, ensure_available=ensure_available, **attrs)
if init_local:
runner.init_local(quiet=True)

View File

@@ -29,12 +29,15 @@ from bentoml._internal.bento.build_config import DockerOptions, PythonOptions
from bentoml._internal.configuration import get_debug_mode
import openllm
import openllm.utils as utils
from openllm.utils import pkg
from .utils import (ModelEnv, codegen, first_not_none, is_flax_available,
is_tf_available, is_torch_available, pkg)
if t.TYPE_CHECKING:
from fs.base import FS
from .models.auto.factory import _BaseAutoLLMClass
logger = logging.getLogger(__name__)
OPENLLM_DEV_BUILD = "OPENLLM_DEV_BUILD"
@@ -82,10 +85,10 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
if not (str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false"):
packages.append(f"bentoml>={'.'.join([str(i) for i in pkg.pkg_version_info('bentoml')])}")
env = llm.config["env"]
to_use_framework = env.get_framework_env()
if to_use_framework == "flax":
assert utils.is_flax_available(), f"Flax is not available, while {env.framework} is set to 'flax'"
env: ModelEnv = llm.config["env"]
framework_envvar = env["framework_value"]
if framework_envvar == "flax":
assert is_flax_available(), f"Flax is not available, while {env.framework} is set to 'flax'"
packages.extend(
[
f"flax>={importlib.metadata.version('flax')}",
@@ -93,8 +96,8 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
f"jaxlib>={importlib.metadata.version('jaxlib')}",
]
)
elif to_use_framework == "tf":
assert utils.is_tf_available(), f"TensorFlow is not available, while {env.framework} is set to 'tf'"
elif framework_envvar == "tf":
assert is_tf_available(), f"TensorFlow is not available, while {env.framework} is set to 'tf'"
candidates = (
"tensorflow",
"tensorflow-cpu",
@@ -116,7 +119,7 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
except importlib.metadata.PackageNotFoundError:
pass
else:
assert utils.is_torch_available(), "PyTorch is not available. Make sure to have it locally installed."
assert is_torch_available(), "PyTorch is not available. Make sure to have it locally installed."
packages.extend([f"torch>={importlib.metadata.version('torch')}"])
wheels: list[str] = []
@@ -127,7 +130,13 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
return PythonOptions(packages=packages, wheels=wheels, lock_packages=True)
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float) -> DockerOptions:
def construct_docker_options(
llm: openllm.LLM[t.Any, t.Any],
_: FS,
workers_per_resource: int | float,
quantize: t.LiteralString | None,
bettertransformer: bool | None,
) -> DockerOptions:
_bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
_bentoml_config_options_opts = [
"api_server.traffic.timeout=36000", # NOTE: Currently we hardcode this value
@@ -135,39 +144,112 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
]
_bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
env = llm.config["env"]
return DockerOptions(
cuda_version="11.6", # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version
env={
env.framework: env.get_framework_env(),
"OPENLLM_MODEL": llm.config["model_name"],
"OPENLLM_MODEL_ID": llm.model_id,
"BENTOML_DEBUG": str(get_debug_mode()),
"BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
},
system_packages=["git"],
env: ModelEnv = llm.config["env"]
env_dict = {
env.framework: env.framework_value,
env.config: llm.config.model_dump_json().decode(),
"OPENLLM_MODEL": llm.config["model_name"],
"OPENLLM_MODEL_ID": llm.model_id,
"BENTOML_DEBUG": str(get_debug_mode()),
"BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
}
# We need to handle None separately here, as env from subprocess doesn't
# accept None value.
_env = ModelEnv(llm.config["model_name"], bettertransformer=bettertransformer, quantize=quantize)
if _env.bettertransformer_value is not None:
env_dict[_env.bettertransformer] = _env.bettertransformer_value
if _env.quantize_value is not None:
env_dict[_env.quantize] = _env.quantize_value
# NOTE: Torch 2.0 currently only supports 11.6 as the latest CUDA version
return DockerOptions(cuda_version="11.6", env=env_dict, system_packages=["git"])
@t.overload
def build(
model_name: str,
*,
model_id: str | None = ...,
quantize: t.LiteralString | None = ...,
bettertransformer: bool | None = ...,
_workers_per_resource: int | float | None = ...,
_overwrite_existing_bento: bool = ...,
__cli__: t.Literal[False] = ...,
**attrs: t.Any,
) -> bentoml.Bento:
...
@t.overload
def build(
model_name: str,
*,
model_id: str | None = ...,
quantize: t.LiteralString | None = ...,
bettertransformer: bool | None = ...,
_workers_per_resource: int | float | None = ...,
_overwrite_existing_bento: bool = ...,
__cli__: t.Literal[True] = ...,
**attrs: t.Any,
) -> tuple[bentoml.Bento, bool]:
...
def _build_bento(
    bento_tag: bentoml.Tag,
    service_name: str,
    llm_fs: FS,
    llm: openllm.LLM[t.Any, t.Any],
    workers_per_resource: int | float,
    quantize: t.LiteralString | None,
    bettertransformer: bool | None,
) -> bentoml.Bento:
    """Build the Bento that serves ``llm`` from the generated service module.

    Args:
        bento_tag: Tag supplying the name and version of the Bento to build.
        service_name: Name of the generated service module; the Bento entry
            point is ``f"{service_name}:svc"``.
        llm_fs: Filesystem used as the build context. Every ``*.py`` file
            found in it is included in the Bento.
        llm: The LLM whose config, type and identifying params are used for
            the Bento labels and description.
        workers_per_resource: Forwarded to ``construct_docker_options``.
        quantize: Forwarded to ``construct_docker_options``.
        bettertransformer: Forwarded to ``construct_docker_options``.

    Returns:
        The Bento returned by ``bentoml.bentos.build``.
    """
    framework_envvar = llm.config["env"]["framework_value"]

    # Start from the LLM identifying params, then attach the OpenLLM-specific labels.
    labels = dict(llm.identifying_params)
    labels["_type"] = llm.llm_type
    labels["_framework"] = framework_envvar

    start_name = llm.config["start_name"]
    logger.info("Building Bento for LLM '%s'", start_name)

    return bentoml.bentos.build(
        f"{service_name}:svc",
        name=bento_tag.name,
        labels=labels,
        description=f"OpenLLM service for {start_name}",
        # NOTE: By default, we are using _service.py as the default service, for now.
        include=list(llm_fs.walk.files(filter=["*.py"])),
        exclude=["/venv", "__pycache__/", "*.py[cod]", "*$py.class"],
        python=construct_python_options(llm, llm_fs),
        docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer),
        version=bento_tag.version,
        build_ctx=llm_fs.getsyspath("/"),
    )
@t.overload
def build(model_name: str, *, __cli__: t.Literal[False] = ..., **attrs: t.Any) -> bentoml.Bento:
...
def build(
model_name: str,
*,
model_id: str | None = None,
quantize: t.LiteralString | None = None,
bettertransformer: bool | None = None,
_workers_per_resource: int | float | None = None,
_overwrite_existing_bento: bool = False,
__cli__: bool = False,
**attrs: t.Any,
) -> tuple[bentoml.Bento, bool] | bentoml.Bento:
"""Package a LLM into a Bento.
The LLM will be built into a BentoService with the following structure:
if quantize is passed, it will instruct the model to be quantized dynamically during serving time.
if bettertransformer is passed, it will instruct the model to use BetterTransformer during serving time.
@t.overload
def build(model_name: str, *, __cli__: t.Literal[True] = ..., **attrs: t.Any) -> tuple[bentoml.Bento, bool]:
...
Other parameters including model_name, model_id and attrs will be passed to the LLM class itself.
"""
def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[bentoml.Bento, bool] | bentoml.Bento:
"""Package a LLM into a Bento."""
overwrite_existing_bento = attrs.pop("_overwrite_existing_bento", False)
_previously_built = False
current_model_envvar = os.environ.pop("OPENLLM_MODEL", None)
current_model_id_envvar = os.environ.pop("OPENLLM_MODEL_ID", None)
_previously_built = False
workers_per_resource = attrs.pop("_workers_per_resource", None)
model_id: str = attrs.pop("model_id", None)
llm_config = openllm.AutoConfig.for_model(model_name)
@@ -178,52 +260,58 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
try:
os.environ["OPENLLM_MODEL"] = inflection.underscore(model_name)
to_use_framework = llm_config["env"].get_framework_env()
if to_use_framework == "flax":
llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs)
elif to_use_framework == "tf":
llm = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs)
else:
llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs)
framework_envvar = llm_config["env"]["framework_value"]
llm = t.cast(
"_BaseAutoLLMClass",
openllm[framework_envvar], # type: ignore (internal API)
).for_model(
model_name,
model_id=model_id,
llm_config=llm_config,
quantize=quantize,
bettertransformer=bettertransformer,
**attrs,
)
os.environ["OPENLLM_MODEL_ID"] = llm.model_id
labels = dict(llm.identifying_params)
labels.update({"_type": llm.llm_type, "_framework": to_use_framework})
labels.update({"_type": llm.llm_type, "_framework": framework_envvar})
service_name = f"generated_{llm_config['model_name']}_service.py"
workers_per_resource = utils.first_not_none(workers_per_resource, default=llm_config["workers_per_resource"])
workers_per_resource = first_not_none(_workers_per_resource, default=llm_config["workers_per_resource"])
with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
# add service.py definition to this temporary folder
utils.codegen.write_service(model_name, llm.model_id, service_name, llm_fs)
codegen.write_service(model_name, llm.model_id, service_name, llm_fs)
bento_tag = bentoml.Tag.from_taglike(f"{llm.llm_type}-service:{llm.tag.version}")
try:
bento = bentoml.get(bento_tag)
if overwrite_existing_bento:
if _overwrite_existing_bento:
logger.info("Overwriting previously saved Bento.")
bentoml.delete(bento_tag)
raise bentoml.exceptions.NotFound("Overwriting previously saved Bento.")
bento = _build_bento(
bento_tag,
service_name,
llm_fs,
llm,
workers_per_resource=workers_per_resource,
quantize=quantize,
bettertransformer=bettertransformer,
)
_previously_built = True
except bentoml.exceptions.NotFound:
logger.info("Building Bento for LLM '%s'", llm_config["start_name"])
bento = bentoml.bentos.build(
f"{service_name}:svc",
name=bento_tag.name,
labels=labels,
description=f"OpenLLM service for {llm_config['start_name']}",
include=[
f for f in llm_fs.walk.files(filter=["*.py"])
], # NOTE: By default, we are using _service.py as the default service, for now.
exclude=["/venv", "__pycache__/", "*.py[cod]", "*$py.class"],
python=construct_python_options(llm, llm_fs),
docker=construct_docker_options(llm, llm_fs, workers_per_resource),
version=bento_tag.version,
build_ctx=llm_fs.getsyspath("/"),
bento = _build_bento(
bento_tag,
service_name,
llm_fs,
llm,
workers_per_resource=workers_per_resource,
quantize=quantize,
bettertransformer=bettertransformer,
)
if __cli__:
return bento, _previously_built
else:
return bento
return (bento, _previously_built) if __cli__ else bento
except Exception as e:
logger.error("\nException caught during building LLM %s: \n", model_name, exc_info=e)
raise

View File

@@ -34,7 +34,16 @@ model = os.environ.get("OPENLLM_MODEL", "{__model_name__}") # openllm: model na
model_id = os.environ.get("OPENLLM_MODEL_ID", "{__model_id__}") # openllm: model id
llm_config = openllm.AutoConfig.for_model(model)
runner = openllm.Runner(model, model_id=model_id, llm_config=llm_config)
runner = openllm.Runner(
model,
model_id=model_id,
llm_config=llm_config,
bettertransformer=llm_config["env"]["bettertransformer_value"],
quantize=llm_config["env"]["quantize_value"],
ensure_available=False,
init_local=False,
)
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner])
@@ -57,6 +66,6 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
model_id=model_id,
timeout=llm_config["timeout"],
model_name=llm_config["model_name"],
framework=llm_config["env"].get_framework_env(),
framework=llm_config["env"]["framework_value"],
configuration=llm_config.model_dump_json().decode(),
)

View File

File diff suppressed because it is too large Load Diff

View File

@@ -94,22 +94,21 @@ class _BaseAutoLLMClass:
>>> llm = openllm.AutoLLM.for_model("flan-t5")
```
"""
runner_kwargs_name = [
# order matters here
runner_kwargs_name = {
"models",
"max_batch_size",
"max_latency_ms",
"method_configs",
"embedded",
"scheduling_strategy",
]
}
to_runner_attrs = {k: v for k, v in attrs.items() if k in runner_kwargs_name}
for k in to_runner_attrs:
del attrs[k]
normalized = inflection.underscore(model_name)
if cls._model_mapping.get(normalized, None, mapping_type="name2model"):
attrs = {k: v for k, v in attrs.items() if k not in to_runner_attrs}
if cls._model_mapping.get(inflection.underscore(model_name), None, mapping_type="name2model"):
if not isinstance(llm_config, openllm.LLMConfig):
# The rest of kwargs is now passed to config
llm_config = AutoConfig.for_model(normalized, **attrs)
llm_config = AutoConfig.for_model(model_name, **attrs)
attrs = llm_config.__openllm_extras__
# the rest of attrs will be saved to __openllm_extras__
llm = cls._model_mapping[type(llm_config)].from_pretrained(
model_id,

View File

@@ -104,25 +104,25 @@ class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrain
chat_history.append((prompt, generation_result))
return "".join(generation_result)
@torch.inference_mode()
def generate(self, prompt: str, use_default_prompt_template: bool = True, **attrs: t.Any) -> str:
self.model.eval()
def generate(self, prompt: str, **attrs: t.Any) -> str:
with torch.inference_mode():
self.model.eval()
# Only use half precision if the model is not yet quantized
if self.config.use_half_precision:
self.model.half()
# Only use half precision if the model is not yet quantized
if self.config.use_half_precision:
self.model.half()
self.model.cuda()
self.model.cuda()
logit_processor: list[LogitsProcessor] = LogitsProcessorList()
logit_processor.append(InvalidScoreLogitsProcessor())
logit_processor: list[LogitsProcessor] = LogitsProcessorList()
logit_processor.append(InvalidScoreLogitsProcessor())
inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
outputs = self.model.generate(
**inputs,
generation_config=self.config.model_construct_env(do_sample=True, **attrs).to_generation_config(),
logits_processor=logit_processor,
)
outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :]
response = self.tokenizer.decode(outputs)
return self.model.process_response(response)
inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
outputs = self.model.generate(
**inputs,
generation_config=self.config.model_construct_env(do_sample=True, **attrs).to_generation_config(),
logits_processor=logit_processor,
)
outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :]
response = self.tokenizer.decode(outputs)
return self.model.process_response(response)

View File

@@ -98,19 +98,19 @@ class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedToken
) -> str:
return generation_result[0]["generated_text"]
@torch.inference_mode()
def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
self.model.tokenizer = self.tokenizer
llm_config = self.config.model_construct_env(**attrs)
decoded: list[dict[t.Literal["generated_text"], str]] = self.model(
prompt, generation_config=llm_config.to_generation_config()
)
with torch.inference_mode():
self.model.tokenizer = self.tokenizer
llm_config = self.config.model_construct_env(**attrs)
decoded: list[dict[t.Literal["generated_text"], str]] = self.model(
prompt, generation_config=llm_config.to_generation_config()
)
if llm_config.return_full_text:
return [
{k: f"{DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt)}\n{generated}"}
for i in decoded
for k, generated in i.items()
]
if llm_config.return_full_text:
return [
{k: f"{DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt)}\n{generated}"}
for i in decoded
for k, generated in i.items()
]
return decoded
return decoded

View File

@@ -74,14 +74,14 @@ class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformer
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
return generation_result[0]
@torch.inference_mode()
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
if torch.cuda.is_available():
self.model.cuda()
input_ids = t.cast("torch.Tensor", self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device)
result_tensor = self.model.generate(
input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
)
return self.tokenizer.batch_decode(result_tensor, skip_special_tokens=True)
with torch.inference_mode():
if torch.cuda.is_available():
self.model.cuda()
input_ids = t.cast("torch.Tensor", self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device)
result_tensor = self.model.generate(
input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
)
return self.tokenizer.batch_decode(result_tensor, skip_special_tokens=True)

View File

@@ -129,15 +129,15 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer
else:
return "\n".join(generation_result)
@torch.inference_mode()
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
if torch.cuda.is_available() and torch.cuda.device_count() == 1:
self.model.cuda()
with torch.inference_mode():
if torch.cuda.is_available() and torch.cuda.device_count() == 1:
self.model.cuda()
input_ids = t.cast(torch.Tensor, self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device)
generated_tensors = self.model.generate(
input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
)
return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
input_ids = t.cast(torch.Tensor, self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device)
generated_tensors = self.model.generate(
input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
)
return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)

View File

@@ -120,18 +120,20 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
return generation_result[0]
@torch.inference_mode()
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device)
result_tensor = self.model.generate(
inputs,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
# eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
)
# TODO: We will probably want to return the tokenizer here so that we can manually process this
# return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
return self.tokenizer.batch_decode(
result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True
)
with torch.inference_mode():
inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device)
result_tensor = self.model.generate(
inputs,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
# eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
)
# TODO: We will probably want to return the tokenizer here so that we can manually process this
# return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
return self.tokenizer.batch_decode(
result_tensor[0],
skip_special_tokens=True,
clean_up_tokenization_spaces=True,
)

View File

@@ -35,6 +35,11 @@ from bentoml._internal.utils import (LazyLoader, bentoml_cattr,
from .lazy import LazyModule
# NOTE: This set contains the names of modules
# that are available above and are whitelisted
# to be included in the extra_objects map.
_whitelist_modules = {"pkg"}
logger = logging.getLogger(__name__)
try:
@@ -86,7 +91,9 @@ DEBUG = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.envi
# XXX: define all classes, functions import above this line
# since _extras will be the locals() import from this file.
_extras: dict[str, t.Any] = {
k: v for k, v in locals().items() if not isinstance(v, types.ModuleType) and not k.startswith("_")
k: v
for k, v in locals().items()
if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith("_"))
}
_import_structure = {

View File

@@ -97,7 +97,7 @@ def attrs_to_options(
)
def _default_converter(value: t.Any, env: str | None) -> t.Any:
def env_converter(value: t.Any, env: str | None = None) -> t.Any:
if env is not None:
value = os.environ.get(env, value)
if value is not None and isinstance(value, str):
@@ -135,7 +135,8 @@ def Field(
on kw_only. If kw_only=True, then this field will become 'Required' and the default
value is omitted. If kw_only=False, then the default value will be used as before.
use_default_converter: a bool indicating whether to use the default converter. Defaults
to True. If set to False, then the default converter will not be used.
to True. If set to False, then the default converter will not be used. The default
converter converts a given value from the environment variable for this given Field.
**kwargs: The rest of the arguments are passed to attr.field
"""
metadata = attrs.pop("metadata", {})
@@ -148,7 +149,7 @@ def Field(
converter = attrs.pop("converter", None)
if use_default_converter:
converter = functools.partial(_default_converter, env=env)
converter = functools.partial(env_converter, env=env)
if ge is not None:
piped.append(attr.validators.ge(ge))

View File

@@ -15,6 +15,8 @@
"""
Some import utils are vendored from transformers/utils/import_utils.py for performance reasons.
"""
from __future__ import annotations
import importlib
import importlib.metadata
import importlib.util
@@ -24,7 +26,6 @@ import typing as t
from abc import ABCMeta
from collections import OrderedDict
import attr
import inflection
from bentoml._internal.utils import LazyLoader
from packaging import version
@@ -236,31 +237,73 @@ def require_backends(o: t.Any, backends: t.MutableSequence[str]):
raise ImportError("".join(failed))
@attr.define
class ModelEnv:
model_name: str = attr.field(converter=inflection.underscore)
model_name: str
@property
def framework(self) -> str:
return f"OPENLLM_{self.model_name.upper()}_FRAMEWORK"
if t.TYPE_CHECKING:
config: property
model_id: property
quantize: property
framework: property
bettertransformer: property
@property
def model_config(self) -> str:
return f"OPENLLM_{self.model_name.upper()}_CONFIG"
framework_value: property
quantize_value: property
bettertransformer_value: property
@property
def model_id(self) -> str:
return f"OPENLLM_{self.model_name.upper()}_MODEL_ID"
def __getitem__(self, item: str | t.Any) -> t.Any:
if hasattr(self, item):
return getattr(self, item)
raise KeyError(f"Key {item} not found in {self}")
@property
def bettertransformer(self) -> str:
return f"OPENLLM_{self.model_name.upper()}_BETTERTRANSFORMER"
def __new__(cls, model_name: str, bettertransformer: bool | None = None, quantize: t.LiteralString | None = None):
from .._configuration import _field_env_key
from . import codegen
def gen_env_key(self, key: str) -> str:
return f"OPENLLM_{self.model_name.upper()}_{key.upper()}"
model_name = inflection.underscore(model_name)
def convert_to_bettertransformer(self) -> bool:
return os.environ.get(self.bettertransformer, str(False)).lower() == "true"
res = super().__new__(cls)
res.model_name = model_name
# gen properties env key
attributes = {"config", "model_id", "quantize", "framework", "bettertransformer"}
for att in attributes:
setattr(res, att, _field_env_key(model_name, att.upper()))
# gen properties env value
attributes_with_values = {
"quantize": (bool, quantize),
"bettertransformer": (bool, bettertransformer),
"framework": (str, "pt"),
}
globs: dict[str, t.Any] = {
"__bool_vars_value": ENV_VARS_TRUE_VALUES,
"__env_get": os.environ.get,
"self": res,
}
for attribute, (default_type, default_value) in attributes_with_values.items():
lines: list[str] = []
if default_type is bool:
lines.append(
f"return str(__env_get(self['{attribute}'], str(__env_default)).upper() in __bool_vars_value)"
)
else:
lines.append(f"return __env_get(self['{attribute}'], __env_default)")
setattr(
res,
f"{attribute}_value",
codegen.generate_function(
cls,
"_env_get_" + attribute,
lines,
("__env_default",),
globs,
)(default_value),
)
return res
@property
def start_docstring(self) -> str:
@@ -269,9 +312,3 @@ class ModelEnv:
@property
def module(self) -> LazyLoader:
return LazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}")
def get_framework_env(self) -> t.Literal["pt", "flax", "tf"]:
envvar = os.environ.get(self.framework, "pt")
if envvar not in ("pt", "tf", "flax"):
raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'")
return envvar

View File

@@ -21,6 +21,19 @@ import os
import types
import typing as t
from ..exceptions import ForbiddenAttributeError, OpenLLMException
class UsageNotAllowedError(OpenLLMException):
    """Raised when LazyModule.__getitem__ is used on a module with no '__openllm_special__' entry."""
class MissingAttributesError(OpenLLMException):
    """Raised when the given key is not available in the LazyModule special mapping."""
_reserved_namespace = {"__openllm_special__"}
class LazyModule(types.ModuleType):
"""
@@ -49,9 +62,7 @@ class LazyModule(types.ModuleType):
for value in values:
self._class_to_module[value] = key
# Needed for autocompletion in an IDE
self.__all__ = (
list(import_structure.keys()) + list(itertools.chain(*import_structure.values())) + list(_extra_objects)
)
self.__all__ = list(import_structure.keys()) + list(itertools.chain(*import_structure.values()))
self.__file__ = module_file
self.__spec__ = module_spec
self.__path__ = [os.path.dirname(module_file)]
@@ -71,13 +82,30 @@ class LazyModule(types.ModuleType):
result.append(attribute)
return result
def __getitem__(self, key: str) -> t.Any:
    """Resolve ``key`` through the '__openllm_special__' mapping of this module.

    The mapping translates ``key`` into an attribute name, which is then
    looked up on the module itself.

    Raises:
        UsageNotAllowedError: If no '__openllm_special__' entry is registered.
        KeyError: If ``key`` is not in the mapping, or if looking up the
            mapped attribute fails. The underlying error is chained as the cause.
    """
    special = self._objects.get("__openllm_special__")
    if special is None:
        raise UsageNotAllowedError(f"'{self._name}' is not allowed to be used as a dict.")
    try:
        if key not in special:
            # Raised inside the try on purpose: the generic handler below
            # turns it into a KeyError with this error as its cause.
            raise MissingAttributesError(f"Requested '{key}' is not available in given mapping.")
        return getattr(self, special[key])
    except AttributeError as e:
        raise KeyError(f"'{self._name}' has no attribute {special[key]}") from e
    except Exception as e:
        raise KeyError(f"Failed to lookup '{key}' in '{self._name}'") from e
def __getattr__(self, name: str) -> t.Any:
if name in _reserved_namespace:
raise ForbiddenAttributeError(
f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified."
)
if name in self._objects:
return self._objects[name]
return self._objects.__getitem__(name)
if name in self._modules:
value = self._get_module(name)
elif name in self._class_to_module.keys():
module = self._get_module(self._class_to_module[name])
module = self._get_module(self._class_to_module.__getitem__(name))
value = getattr(module, name)
else:
raise AttributeError(f"module {self.__name__} has no attribute {name}")

View File

@@ -13,9 +13,10 @@
# limitations under the License.
from __future__ import annotations
import dataclasses
import typing as t
import attr
import openllm
from openllm._prompt import PromptFormatter
@@ -34,13 +35,11 @@ class PartialDict(DictStrStr):
return "{" + key + "}"
@dataclasses.dataclass(slots=True)
@attr.define(slots=True)
class PromptTemplate:
template: str
input_variables: t.Sequence[str]
model_config = {"extra": "forbid"}
def to_str(self, __partial_dict__: PartialDict | None = None, **attrs: str) -> str:
"""Generate a prompt from the template and input variables"""
if __partial_dict__:

View File

@@ -25,6 +25,7 @@ import httpx
import openllm
if t.TYPE_CHECKING:
from openllm.models.auto.factory import _BaseAutoLLMClass
class AnnotatedClient(bentoml.client.Client):
def health(self, *args: t.Any, **attrs: t.Any) -> t.Any:
@@ -107,12 +108,10 @@ class ClientMixin:
@property
def llm(self) -> openllm.LLM[t.Any, t.Any]:
if self.__llm__ is None:
if self.framework == "flax":
self.__llm__ = openllm.AutoFlaxLLM.for_model(self.model_name)
elif self.framework == "tf":
self.__llm__ = openllm.AutoTFLLM.for_model(self.model_name)
else:
self.__llm__ = openllm.AutoLLM.for_model(self.model_name)
self.__llm__ = t.cast(
"_BaseAutoLLMClass",
openllm[self.framework], # type: ignore (internal API)
).for_model(self.model_name)
return self.__llm__
@property

View File

@@ -33,10 +33,10 @@ logger = logging.getLogger(__name__)
def test_missing_default():
with pytest.raises(ValueError, match="The following keys are required*"):
with pytest.raises(ValueError, match="Either 'default_id' or 'model_ids'*"):
make_llm_config("MissingDefaultId", {"name_type": "lowercase", "requirements": ["bentoml"]})
with pytest.raises(ValueError, match="The following keys are required*"):
with pytest.raises(ValueError, match="Either 'default_id' or 'model_ids'*"):
make_llm_config("MissingModelId", {"default_id": "huggingface/t5-tiny-testing", "requirements": ["bentoml"]})