mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-05-19 05:57:39 -04:00
perf: build quantization and better transformer behaviour (#28)
Fixes quantization_config and low_cpu_mem_usage to be available on the PyTorch implementation only. See the changelog for more details on #28.
This commit is contained in:
16
.github/workflows/ci.yml
vendored
16
.github/workflows/ci.yml
vendored
@@ -28,22 +28,6 @@ defaults:
|
||||
run:
|
||||
shell: bash --noprofile --norc -exo pipefail {0}
|
||||
jobs:
|
||||
codestyle_check:
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Setup CI
|
||||
uses: ./.github/actions/setup-repo
|
||||
- name: Running changelog check
|
||||
run: hatch run changelog
|
||||
- name: Format and lint check
|
||||
run: hatch run fmt
|
||||
- name: Type check
|
||||
if: ${{ github.event_name == 'pull_request' }}
|
||||
run: git diff --name-only --diff-filter=AM "origin/$GITHUB_BASE_REF" -z -- '*.py{,i}' | xargs -0 --no-run-if-empty hatch run dev:typing
|
||||
tests:
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
ci:
|
||||
autoupdate_schedule: weekly
|
||||
skip: [check-models-table-update, check-models-table-update]
|
||||
skip: [check-models-table-update, check-models-table-update, changelog-dry-run]
|
||||
exclude: '.*\.(css|js|svg)$'
|
||||
repos:
|
||||
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
||||
@@ -51,13 +51,16 @@ repos:
|
||||
typings/.*|
|
||||
.github/.*
|
||||
)$
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: check-models-table-update
|
||||
name: check if table in README.md is up-to-date
|
||||
entry: ./tools/assert-model-table-latest
|
||||
language: script
|
||||
files: README.md
|
||||
- id: changelog-dry-run
|
||||
name: Running changelog dry-run
|
||||
entry: hatch run changelog
|
||||
language: system
|
||||
files: CHANGELOG.md
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v4.4.0
|
||||
hooks:
|
||||
|
||||
@@ -1,14 +1,22 @@
|
||||
Added support for quantization during serving time.
|
||||
`openllm start` now support `--quantize 8bit` and `--quantize 4bit`
|
||||
`GPTQ` quantization support is on the roadmap and currently
|
||||
being worked on.
|
||||
|
||||
`openllm start` now supports `--quantize int8` and `--quantize int4`. `GPTQ`
|
||||
quantization support is on the roadmap and currently being worked on.
|
||||
|
||||
`openllm start` now also supports `--bettertransformer` to use
|
||||
`BetterTransformer` for serving
|
||||
Refactored `openllm.LLMConfig` to be able to use with `__getitem__`
|
||||
to access the config value: `openllm.DollyV2Config()['requirements']`
|
||||
the order being: `__openllm_*__ > self.<key> > __openllm_generation_class__ > __openllm_extras__`
|
||||
`BetterTransformer` for serving.
|
||||
|
||||
Refactored `openllm.LLMConfig` to be able to use with `__getitem__`:
|
||||
`openllm.DollyV2Config()['requirements']`.
|
||||
|
||||
The access order being:
|
||||
`__openllm_*__ > self.<key> > __openllm_generation_class__ > __openllm_extras__`.
|
||||
|
||||
Added `towncrier` workflow to easily generate changelog entries
|
||||
|
||||
Added `use_pipeline`, `bettertransformer` flag into ModelSettings
|
||||
`LLMConfig` now supported `__dataclass_transform__` protocol to help
|
||||
with type-checking
|
||||
Changed `openllm download-models` to `openllm download`
|
||||
|
||||
`LLMConfig` now supports the `__dataclass_transform__` protocol to help with
|
||||
type-checking
|
||||
|
||||
`openllm download-models` now becomes `openllm download`
|
||||
|
||||
14
changelog.d/28.change.md
Normal file
14
changelog.d/28.change.md
Normal file
@@ -0,0 +1,14 @@
|
||||
`--quantize` now takes `int8, int4` instead of `8bit, 4bit` to be consistent
|
||||
with bitsandbytes concept.
|
||||
|
||||
The `openllm` CLI now caches all available model commands, allowing faster startup time.
|
||||
|
||||
Fixes `openllm start model-id --debug` to filter out debug log messages from
|
||||
`bentoml.Server`.
|
||||
|
||||
`--model-id` from `openllm start` now supports choices for easier selection.
|
||||
|
||||
Updated `ModelConfig` implementation with `__getitem__` and auto generation value.
|
||||
|
||||
Cleanup CLI and improve loading time, `openllm start` should be 'blazingly
|
||||
fast'.
|
||||
@@ -25,7 +25,7 @@ deploy, and monitor any LLMs with ease.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging as _
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
from . import utils as utils
|
||||
@@ -33,15 +33,11 @@ from .__about__ import __version__ as __version__
|
||||
from .exceptions import MissingDependencyError
|
||||
|
||||
if utils.DEBUG:
|
||||
from bentoml._internal.configuration import set_debug_mode, set_quiet_mode
|
||||
utils.set_debug_mode(True)
|
||||
utils.set_quiet_mode(False)
|
||||
|
||||
set_debug_mode(True)
|
||||
set_quiet_mode(False)
|
||||
|
||||
from bentoml._internal.log import configure_logging
|
||||
|
||||
configure_logging()
|
||||
_.basicConfig(level=_.NOTSET)
|
||||
utils.configure_logging()
|
||||
logging.basicConfig(level=logging.NOTSET)
|
||||
|
||||
|
||||
_import_structure = {
|
||||
@@ -147,7 +143,6 @@ if t.TYPE_CHECKING:
|
||||
from . import exceptions as exceptions
|
||||
from . import models as models
|
||||
from . import playground as playground
|
||||
|
||||
# Specific types import
|
||||
from ._configuration import LLMConfig as LLMConfig
|
||||
from ._llm import LLM as LLM
|
||||
@@ -160,7 +155,8 @@ if t.TYPE_CHECKING:
|
||||
from .cli import start as start
|
||||
from .cli import start_grpc as start_grpc
|
||||
from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING
|
||||
from .models.auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
|
||||
from .models.auto import \
|
||||
MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
|
||||
from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
|
||||
from .models.auto import MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES
|
||||
from .models.auto import AutoConfig as AutoConfig
|
||||
@@ -234,5 +230,11 @@ else:
|
||||
globals()["__file__"],
|
||||
_import_structure,
|
||||
module_spec=__spec__,
|
||||
extra_objects={"__version__": __version__},
|
||||
extra_objects={
|
||||
"__version__": __version__,
|
||||
# The below is a special mapping that allows openllm to be used as a dictionary.
|
||||
# This is purely for convenience sake, and should not be used in performance critical
|
||||
# code. This is also not considered as a public API.
|
||||
"__openllm_special__": {"flax": "AutoFlaxLLM", "tf": "AutoTFLLM", "pt": "AutoLLM"},
|
||||
},
|
||||
)
|
||||
|
||||
@@ -395,16 +395,7 @@ bentoml_cattr.register_unstructure_hook_factory(
|
||||
)
|
||||
|
||||
|
||||
def _populate_value_from_env_var(
|
||||
key: str, transform: t.Callable[[str], str] | None = None, fallback: t.Any = None
|
||||
) -> t.Any:
|
||||
if transform is not None and callable(transform):
|
||||
key = transform(key)
|
||||
|
||||
return os.environ.get(key, fallback)
|
||||
|
||||
|
||||
def _field_env_key(model_name: str, key: str, suffix: str | None = None) -> str:
|
||||
def _field_env_key(model_name: str, key: str, suffix: str | t.Literal[""] | None = None) -> str:
|
||||
return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key])))
|
||||
|
||||
|
||||
@@ -425,6 +416,7 @@ class ModelSettings(t.TypedDict, total=False):
|
||||
url: str
|
||||
requires_gpu: bool
|
||||
trust_remote_code: bool
|
||||
service_name: NotRequired[str]
|
||||
requirements: t.Optional[ListStr]
|
||||
|
||||
# llm implementation specifics
|
||||
@@ -448,128 +440,174 @@ class ModelSettings(t.TypedDict, total=False):
|
||||
generation_class: t.Type[GenerationConfig]
|
||||
|
||||
|
||||
_ModelSettings: type[attr.AttrsInstance] = codegen.add_method_dunders(
|
||||
type("__openllm_internal__", (ModelSettings,), {"__module__": "openllm._configuration"}),
|
||||
attr.make_class(
|
||||
"ModelSettings",
|
||||
{
|
||||
k: dantic.Field(
|
||||
def _settings_field_transformer(
|
||||
_: type[attr.AttrsInstance], __: list[attr.Attribute[t.Any]]
|
||||
) -> list[attr.Attribute[t.Any]]:
|
||||
return [
|
||||
attr.Attribute.from_counting_attr(
|
||||
k,
|
||||
dantic.Field(
|
||||
kw_only=False if t.get_origin(ann) is not Required else True,
|
||||
auto_default=True,
|
||||
use_default_converter=False,
|
||||
type=ann,
|
||||
metadata={
|
||||
"target": f"__openllm_{k}__",
|
||||
"required": False if t.get_origin(ann) is NotRequired else t.get_origin(ann) is Required,
|
||||
},
|
||||
metadata={"target": f"__openllm_{k}__"},
|
||||
description=f"ModelSettings field for {k}.",
|
||||
)
|
||||
for k, ann in t.get_type_hints(ModelSettings).items()
|
||||
},
|
||||
bases=(DictStrAny,),
|
||||
slots=True,
|
||||
weakref_slot=True,
|
||||
collect_by_mro=True,
|
||||
),
|
||||
_overwrite_doc="Internal attrs representation of ModelSettings.",
|
||||
)
|
||||
),
|
||||
)
|
||||
for k, ann in t.get_type_hints(ModelSettings).items()
|
||||
]
|
||||
|
||||
|
||||
def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
|
||||
@attr.define(slots=True, field_transformer=_settings_field_transformer, frozen=False)
|
||||
class _ModelSettingsAttr:
|
||||
"""Internal attrs representation of ModelSettings."""
|
||||
|
||||
def __getitem__(self, key: str) -> t.Any:
|
||||
if key in codegen.get_annotations(ModelSettings):
|
||||
return _object_getattribute(self, key)
|
||||
raise KeyError(key)
|
||||
|
||||
@classmethod
|
||||
def default(cls) -> _ModelSettingsAttr:
|
||||
_ = ModelSettings(
|
||||
default_id="__default__",
|
||||
model_ids=["__default__"],
|
||||
name_type="dasherize",
|
||||
requires_gpu=False,
|
||||
url="",
|
||||
use_pipeline=False,
|
||||
model_type="causal_lm",
|
||||
trust_remote_code=False,
|
||||
requirements=None,
|
||||
timeout=3600,
|
||||
service_name="",
|
||||
workers_per_resource=1,
|
||||
runtime="transformers",
|
||||
)
|
||||
return cls(**t.cast(DictStrAny, _))
|
||||
|
||||
|
||||
def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]):
|
||||
if not lenient_issubclass(cl_, LLMConfig):
|
||||
raise RuntimeError(f"Given LLMConfig must be a subclass type of 'LLMConfig', got '{cl_}' instead.")
|
||||
raise RuntimeError(f"Given '{cl_}' must be a subclass type of 'LLMConfig', got '{cl_}' instead.")
|
||||
|
||||
if not hasattr(cl_, "__config__") or getattr(cl_, "__config__") is None:
|
||||
raise RuntimeError("Given LLMConfig must have '__config__' that is not None defined.")
|
||||
|
||||
settings = cl_.__config__
|
||||
assert settings
|
||||
assert cl_.__config__ is not None
|
||||
|
||||
required = [i.name for i in attr.fields(cls) if i.metadata.get("required", False)]
|
||||
|
||||
missing = set(required) - set(settings.keys())
|
||||
|
||||
if len(missing) > 0:
|
||||
raise ValueError(f"The following keys are required under '__config__': {required} (missing: {missing})")
|
||||
|
||||
if "generation_class" in settings:
|
||||
if "generation_class" in cl_.__config__:
|
||||
raise ValueError(
|
||||
"'generation_class' shouldn't be defined in '__config__', rather defining "
|
||||
f"all required attributes under '{cl_}.GenerationConfig' when defining the class."
|
||||
f"all required attributes under '{cl_}.GenerationConfig' instead."
|
||||
)
|
||||
|
||||
if not settings["default_id"] or not settings["model_ids"]:
|
||||
_cl_name = cl_.__name__.replace("Config", "")
|
||||
|
||||
_settings_attr = _ModelSettingsAttr.default()
|
||||
try:
|
||||
cls(**t.cast(DictStrAny, cl_.__config__))
|
||||
_settings_attr = attr.evolve(_settings_attr, **t.cast(DictStrAny, cl_.__config__))
|
||||
except TypeError:
|
||||
raise ValueError("Either 'default_id' or 'model_ids' are emptied under '__config__' (required fields).")
|
||||
|
||||
# NOTE: value in __config__ can be None, hence we use setdefault
|
||||
# to update in-place
|
||||
_cl_name = cl_.__name__.replace("Config", "")
|
||||
name_type = settings.setdefault("name_type", "dasherize")
|
||||
model_name = settings.setdefault(
|
||||
"model_name", inflection.underscore(_cl_name) if name_type == "dasherize" else _cl_name.lower()
|
||||
_final_value_dct: DictStrAny = {
|
||||
"model_name": inflection.underscore(_cl_name)
|
||||
if _settings_attr["name_type"] == "dasherize"
|
||||
else _cl_name.lower()
|
||||
}
|
||||
_final_value_dct["start_name"] = (
|
||||
inflection.dasherize(_final_value_dct["model_name"])
|
||||
if _settings_attr["name_type"] == "dasherize"
|
||||
else _final_value_dct["model_name"]
|
||||
)
|
||||
partialed = functools.partial(_field_env_key, model_name=model_name, suffix="generation")
|
||||
env = openllm.utils.ModelEnv(_final_value_dct["model_name"])
|
||||
_final_value_dct["env"] = env
|
||||
|
||||
def auto_env_transformers(_: t.Any, fields: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]:
|
||||
_has_own_gen = codegen.has_own_attribute(cl_, "GenerationConfig")
|
||||
return [
|
||||
f.evolve(
|
||||
default=_populate_value_from_env_var(
|
||||
partialed(key=f.name),
|
||||
fallback=getattr(cl_.GenerationConfig, f.name, f.default) if _has_own_gen else f.default,
|
||||
),
|
||||
metadata={"env": partialed(key=f.name), "description": f.metadata.get("description", "(not provided)")},
|
||||
converter=None,
|
||||
)
|
||||
for f in fields
|
||||
]
|
||||
# bettertransformer support
|
||||
if _settings_attr["bettertransformer"] is None:
|
||||
_final_value_dct["bettertransformer"] = (
|
||||
os.environ.get(env.bettertransformer, str(False)).upper() in ENV_VARS_TRUE_VALUES
|
||||
)
|
||||
if _settings_attr["requires_gpu"]:
|
||||
# if requires_gpu is True, then disable BetterTransformer for quantization.
|
||||
_final_value_dct["bettertransformer"] = False
|
||||
|
||||
settings.setdefault(
|
||||
"generation_class",
|
||||
attr.make_class(
|
||||
f"{_cl_name}GenerationConfig",
|
||||
[],
|
||||
bases=(GenerationConfig,),
|
||||
slots=True,
|
||||
weakref_slot=True,
|
||||
frozen=False,
|
||||
repr=True,
|
||||
collect_by_mro=True,
|
||||
field_transformer=auto_env_transformers,
|
||||
_final_value_dct["service_name"] = f"generated_{_final_value_dct['model_name']}_service.py"
|
||||
_final_value_dct["generation_class"] = attr.make_class(
|
||||
f"{_cl_name}GenerationConfig",
|
||||
[],
|
||||
bases=(GenerationConfig,),
|
||||
slots=True,
|
||||
weakref_slot=True,
|
||||
frozen=True,
|
||||
repr=True,
|
||||
collect_by_mro=True,
|
||||
field_transformer=_make_env_transformer(
|
||||
cl_,
|
||||
_final_value_dct["model_name"],
|
||||
suffix="generation",
|
||||
default_callback=lambda field_name, field_default: getattr(cl_.GenerationConfig, field_name, field_default)
|
||||
if codegen.has_own_attribute(cl_, "GenerationConfig")
|
||||
else field_default,
|
||||
globs={"cl_": cl_},
|
||||
),
|
||||
)
|
||||
|
||||
env = settings.setdefault("env", openllm.utils.ModelEnv(model_name))
|
||||
requires_gpu = settings.setdefault("requires_gpu", False)
|
||||
return attr.evolve(_settings_attr, **_final_value_dct)
|
||||
|
||||
# bettertransformer support
|
||||
bettertransformer = settings.setdefault(
|
||||
"bettertransformer",
|
||||
os.environ.get(env.bettertransformer, str(False)).upper() in ENV_VARS_TRUE_VALUES,
|
||||
|
||||
bentoml_cattr.register_structure_hook(_ModelSettingsAttr, structure_settings)
|
||||
|
||||
|
||||
def _make_env_transformer(
|
||||
cls: type[LLMConfig],
|
||||
model_name: str,
|
||||
suffix: t.LiteralString | None = None,
|
||||
default_callback: t.Callable[[str, t.Any], t.Any] | None = None,
|
||||
globs: DictStrAny | None = None,
|
||||
):
|
||||
def identity(_: str, x_value: t.Any) -> t.Any:
|
||||
return x_value
|
||||
|
||||
default_callback = identity if default_callback is None else default_callback
|
||||
|
||||
globs = {} if globs is None else globs
|
||||
globs.update(
|
||||
{
|
||||
"functools": functools,
|
||||
"__populate_env": dantic.env_converter,
|
||||
"__default_callback": default_callback,
|
||||
"__field_env": _field_env_key,
|
||||
"__suffix": suffix or "",
|
||||
"__model_name": model_name,
|
||||
}
|
||||
)
|
||||
if requires_gpu:
|
||||
# For all models that requires GPU, no need to offload it to BetterTransformer
|
||||
# use bitsandbytes or gptq instead for latency improvement
|
||||
if bettertransformer:
|
||||
logger.debug("Model requires GPU by default, disabling bettertransformer.")
|
||||
bettertransformer = False
|
||||
settings["bettertransformer"] = bettertransformer
|
||||
|
||||
# default value
|
||||
settings.setdefault("url", "")
|
||||
settings.setdefault("use_pipeline", False)
|
||||
settings.setdefault("model_type", "causal_lm")
|
||||
settings.setdefault("trust_remote_code", False)
|
||||
settings.setdefault("requirements", None)
|
||||
settings.setdefault("timeout", 3600)
|
||||
settings.setdefault("workers_per_resource", 1)
|
||||
settings.setdefault("runtime", "transformers")
|
||||
settings.setdefault("start_name", inflection.dasherize(model_name) if name_type == "dasherize" else model_name)
|
||||
lines: ListStr = [
|
||||
"__env = lambda field_name: __field_env(__model_name, field_name, __suffix)",
|
||||
"return [",
|
||||
" f.evolve(",
|
||||
" default=__populate_env(__default_callback(f.name, f.default), __env(f.name)),",
|
||||
" metadata={",
|
||||
" 'env': f.metadata.get('env', __env(f.name)),",
|
||||
" 'description': f.metadata.get('description', '(not provided)'),",
|
||||
" },",
|
||||
" )",
|
||||
" for f in fields",
|
||||
"]",
|
||||
]
|
||||
fields_ann = "list[attr.Attribute[t.Any]]"
|
||||
|
||||
return cls(**settings)
|
||||
|
||||
|
||||
bentoml_cattr.register_structure_hook(_ModelSettings, structure_settings)
|
||||
return codegen.generate_function(
|
||||
cls,
|
||||
"__auto_env",
|
||||
lines,
|
||||
args=("_", "fields"),
|
||||
globs=globs,
|
||||
annotations={"_": "type[LLMConfig]", "fields": fields_ann, "return": fields_ann},
|
||||
)
|
||||
|
||||
|
||||
def _setattr_class(attr_name: str, value_var: t.Any, add_dunder: bool = False):
|
||||
@@ -577,6 +615,10 @@ def _setattr_class(attr_name: str, value_var: t.Any, add_dunder: bool = False):
|
||||
Use the builtin setattr to set *attr_name* to *value_var*.
|
||||
We can't use the cached object.__setattr__ since we are setting
|
||||
attributes to a class.
|
||||
|
||||
If add_dunder to True, the generated globs should include a __add_dunder
|
||||
value that will be used to add the dunder methods to the class for given
|
||||
value_var
|
||||
"""
|
||||
val = f"__add_dunder(cls, {value_var})" if add_dunder else value_var
|
||||
return f"setattr(cls, '{attr_name}', {val})"
|
||||
@@ -742,6 +784,23 @@ class LLMConfig:
|
||||
|
||||
# NOTE: The following will be populated from __config__ and also
|
||||
# considered to be public API.
|
||||
__openllm_default_id__: str = Field(None)
|
||||
"""Return the default model to use when using 'openllm start <model_id>'.
|
||||
This could be one of the keys in 'self.model_ids' or custom users model.
|
||||
|
||||
This field is required when defining under '__config__'.
|
||||
"""
|
||||
|
||||
__openllm_model_ids__: ListStr = Field(None)
|
||||
"""A list of supported pretrained models tag for this given runnable.
|
||||
|
||||
For example:
|
||||
For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base",
|
||||
"google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"]
|
||||
|
||||
This field is required when defining under '__config__'.
|
||||
"""
|
||||
|
||||
__openllm_url__: str = Field(None, init=False)
|
||||
"""The resolved url for this LLMConfig."""
|
||||
|
||||
@@ -751,46 +810,13 @@ class LLMConfig:
|
||||
__openllm_trust_remote_code__: bool = Field(False)
|
||||
"""Whether to always trust remote code"""
|
||||
|
||||
__openllm_service_name__: str = Field(None)
|
||||
"""Generated service name for this LLMConfig. By default, it is 'generated_{model_name}_service.py'"""
|
||||
|
||||
__openllm_requirements__: ListStr | None = Field(None)
|
||||
"""The default PyPI requirements needed to run this given LLM. By default, we will depend on
|
||||
bentoml, torch, transformers."""
|
||||
|
||||
__openllm_env__: openllm.utils.ModelEnv = Field(None, init=False)
|
||||
"""A ModelEnv instance for this LLMConfig."""
|
||||
|
||||
__openllm_model_name__: str = Field("")
|
||||
"""The normalized version of __openllm_start_name__, determined by __openllm_name_type__"""
|
||||
|
||||
__openllm_model_type__: t.Literal["causal_lm", "seq2seq_lm"] = Field("causal_lm")
|
||||
"""The model type for this given LLM. By default, it should be causal language modeling.
|
||||
Currently supported 'causal_lm' or 'seq2seq_lm'
|
||||
"""
|
||||
|
||||
__openllm_start_name__: str = Field("")
|
||||
"""Default name to be used with `openllm start`"""
|
||||
|
||||
__openllm_name_type__: t.Literal["dasherize", "lowercase"] = Field("dasherize")
|
||||
"""the default name typed for this model. "dasherize" will convert the name to lowercase and
|
||||
replace spaces with dashes. "lowercase" will convert the name to lowercase."""
|
||||
|
||||
__openllm_timeout__: int = Field(36000)
|
||||
"""The default timeout to be set for this given LLM."""
|
||||
|
||||
__openllm_workers_per_resource__: int | float = Field(1)
|
||||
"""The number of workers per resource. This is used to determine the number of workers to use for this model.
|
||||
For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
|
||||
OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource.
|
||||
|
||||
See StarCoder for more advanced usage. See
|
||||
https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy for more details.
|
||||
|
||||
By default, it is set to 1.
|
||||
"""
|
||||
|
||||
__openllm_runtime__: t.Literal["transformers", "cpp"] = Field("transformers")
|
||||
"""The runtime to use for this model. Possible values are `transformers` or `cpp`. See
|
||||
LlaMA for more information."""
|
||||
|
||||
__openllm_use_pipeline__: bool = Field(False)
|
||||
"""Whether this LLM will use HuggingFace Pipeline API. By default, this is set to False.
|
||||
The reason for this to be here is because we want to access this object before loading
|
||||
@@ -804,16 +830,40 @@ class LLMConfig:
|
||||
and set to False for every other models.
|
||||
"""
|
||||
|
||||
__openllm_default_id__: str = Field(None)
|
||||
"""Return the default model to use when using 'openllm start <model_id>'.
|
||||
This could be one of the keys in 'self.model_ids' or custom users model."""
|
||||
__openllm_model_type__: t.Literal["causal_lm", "seq2seq_lm"] = Field("causal_lm")
|
||||
"""The model type for this given LLM. By default, it should be causal language modeling.
|
||||
Currently supported 'causal_lm' or 'seq2seq_lm'
|
||||
"""
|
||||
|
||||
__openllm_model_ids__: ListStr = Field(None)
|
||||
"""A list of supported pretrained models tag for this given runnable.
|
||||
__openllm_runtime__: t.Literal["transformers", "cpp"] = Field("transformers")
|
||||
"""The runtime to use for this model. Possible values are `transformers` or `cpp`. See
|
||||
LlaMA for more information."""
|
||||
|
||||
For example:
|
||||
For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base",
|
||||
"google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"]
|
||||
__openllm_name_type__: t.Literal["dasherize", "lowercase"] = Field("dasherize")
|
||||
"""the default name typed for this model. "dasherize" will convert the name to lowercase and
|
||||
replace spaces with dashes. "lowercase" will convert the name to lowercase."""
|
||||
|
||||
__openllm_model_name__: str = Field(None)
|
||||
"""The normalized version of __openllm_start_name__, determined by __openllm_name_type__"""
|
||||
|
||||
__openllm_start_name__: str = Field(None)
|
||||
"""Default name to be used with `openllm start`"""
|
||||
|
||||
__openllm_env__: openllm.utils.ModelEnv = Field(None, init=False)
|
||||
"""A ModelEnv instance for this LLMConfig."""
|
||||
|
||||
__openllm_timeout__: int = Field(36000)
|
||||
"""The default timeout to be set for this given LLM."""
|
||||
|
||||
__openllm_workers_per_resource__: int | float = Field(1)
|
||||
"""The number of workers per resource. This is used to determine the number of workers to use for this model.
|
||||
For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
|
||||
OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource.
|
||||
|
||||
See StarCoder for more advanced usage. See
|
||||
https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy for more details.
|
||||
|
||||
By default, it is set to 1.
|
||||
"""
|
||||
|
||||
__openllm_generation_class__: type[GenerationConfig] = Field(None, init=False)
|
||||
@@ -835,23 +885,10 @@ class LLMConfig:
|
||||
cls.__name__ = f"{cls.__name__}Config"
|
||||
|
||||
# NOTE: auto assignment attributes generated from __config__
|
||||
_make_assignment_script(cls, bentoml_cattr.structure(cls, _ModelSettings))(cls)
|
||||
_make_assignment_script(cls, bentoml_cattr.structure(cls, _ModelSettingsAttr))(cls)
|
||||
# process a fields under cls.__dict__ and auto convert them with dantic.Field
|
||||
cd = cls.__dict__
|
||||
anns = codegen.get_annotations(cls)
|
||||
partialed = functools.partial(_field_env_key, model_name=cls.__openllm_model_name__)
|
||||
|
||||
def auto_config_env(_: type[LLMConfig], attrs: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]:
|
||||
return [
|
||||
a.evolve(
|
||||
default=_populate_value_from_env_var(partialed(key=a.name), fallback=a.default),
|
||||
metadata={
|
||||
"env": a.metadata.get("env", partialed(key=a.name)),
|
||||
"description": a.metadata.get("description", "(not provided)"),
|
||||
},
|
||||
)
|
||||
for a in attrs
|
||||
]
|
||||
|
||||
# _CountingAttr is the underlying representation of attr.field
|
||||
ca_names = {name for name, attr in cd.items() if isinstance(attr, _CountingAttr)}
|
||||
@@ -864,9 +901,9 @@ class LLMConfig:
|
||||
val = cd.get(attr_name, attr.NOTHING)
|
||||
if not LazyType["_CountingAttr[t.Any]"](_CountingAttr).isinstance(val):
|
||||
if val is attr.NOTHING:
|
||||
val = cls.Field(env=partialed(key=attr_name))
|
||||
val = cls.Field(env=_field_env_key(cls.__openllm_model_name__, attr_name))
|
||||
else:
|
||||
val = cls.Field(default=val, env=partialed(key=attr_name))
|
||||
val = cls.Field(default=val, env=_field_env_key(cls.__openllm_model_name__, attr_name))
|
||||
these[attr_name] = val
|
||||
unannotated = ca_names - annotated_names
|
||||
if len(unannotated) > 0:
|
||||
@@ -894,7 +931,7 @@ class LLMConfig:
|
||||
False, # disable auto_attribs, since we already handle these
|
||||
False, # disable kw_only
|
||||
True, # collect_by_mro
|
||||
field_transformer=auto_config_env,
|
||||
field_transformer=_make_env_transformer(cls, cls.__openllm_model_name__),
|
||||
)
|
||||
_weakref_slot = True # slots = True
|
||||
_base_names = {a.name for a in base_attrs}
|
||||
@@ -910,7 +947,7 @@ class LLMConfig:
|
||||
_make_init(
|
||||
cls, # cls (the attrs-decorated class)
|
||||
attrs, # tuple of attr.Attribute of cls
|
||||
_has_pre_init, # pre_initjk
|
||||
_has_pre_init, # pre_init
|
||||
_has_post_init, # post_init
|
||||
False, # frozen
|
||||
True, # slots
|
||||
@@ -1047,14 +1084,14 @@ class LLMConfig:
|
||||
def __getattribute__(self, item: str) -> t.Any:
|
||||
if item in _reserved_namespace:
|
||||
raise ForbiddenAttributeError(
|
||||
f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified."
|
||||
f"'{item}' belongs to a private namespace for {self.__class__} and should not be access nor modified."
|
||||
)
|
||||
return _object_getattribute.__get__(self)(item)
|
||||
|
||||
@classmethod
|
||||
def check_if_gpu_is_available(cls, implementation: str | None = None, force: bool = False):
|
||||
if implementation is None:
|
||||
implementation = cls.__openllm_env__.get_framework_env()
|
||||
implementation = cls.__openllm_env__["framework_value"]
|
||||
|
||||
try:
|
||||
if cls.__openllm_requires_gpu__ or force:
|
||||
@@ -1091,7 +1128,7 @@ class LLMConfig:
|
||||
"""
|
||||
attrs = {k: v for k, v in attrs.items() if v is not None}
|
||||
|
||||
model_config = cls.__openllm_env__.model_config
|
||||
model_config = cls.__openllm_env__.config
|
||||
|
||||
env_json_string = os.environ.get(model_config, None)
|
||||
|
||||
|
||||
@@ -35,14 +35,17 @@ from bentoml._internal.types import ModelSignatureDict
|
||||
import openllm
|
||||
|
||||
from .exceptions import ForbiddenAttributeError, OpenLLMException
|
||||
from .utils import (LazyLoader, bentoml_cattr, is_bitsandbytes_available,
|
||||
non_intrusive_setattr)
|
||||
from .utils import (DEBUG, LazyLoader, ModelEnv, bentoml_cattr, first_not_none,
|
||||
get_debug_mode, is_bitsandbytes_available,
|
||||
is_torch_available, non_intrusive_setattr, pkg)
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
import transformers
|
||||
from bentoml._internal.runner.strategy import Strategy
|
||||
|
||||
from .models.auto.factory import _BaseAutoLLMClass
|
||||
|
||||
class LLMRunner(bentoml.Runner):
|
||||
__doc__: str
|
||||
__module__: str
|
||||
@@ -170,7 +173,7 @@ def import_model(
|
||||
# NOTE: We need to free up the cache after importing the model
|
||||
# in the case where users first run openllm start without the model
|
||||
# available locally.
|
||||
if openllm.utils.is_torch_available() and torch.cuda.is_available():
|
||||
if is_torch_available() and torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
|
||||
@@ -314,16 +317,25 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
model_id: str | None = None,
|
||||
llm_config: openllm.LLMConfig | None = None,
|
||||
*args: t.Any,
|
||||
quantize: t.Literal["int8", "int4", "gptq"] | None = None,
|
||||
bettertransformer: bool | None = None,
|
||||
**attrs: t.Any,
|
||||
) -> LLM[_M, _T]:
|
||||
return cls(model_id=model_id, llm_config=llm_config, *args, **attrs)
|
||||
return cls(
|
||||
model_id=model_id,
|
||||
llm_config=llm_config,
|
||||
*args,
|
||||
quantize=quantize,
|
||||
bettertransformer=bettertransformer,
|
||||
**attrs,
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_id: str | None = None,
|
||||
llm_config: openllm.LLMConfig | None = None,
|
||||
*args: t.Any,
|
||||
quantize: t.Literal["8bit", "4bit", "gptq"] | None = None,
|
||||
quantize: t.Literal["int8", "int4", "gptq"] | None = None,
|
||||
bettertransformer: bool | None = None,
|
||||
**attrs: t.Any,
|
||||
):
|
||||
@@ -402,7 +414,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
|
||||
will use `config_class` to construct default configuration.
|
||||
quantize: The quantization to use for this LLM. Defaults to None. Possible values
|
||||
include 8bit, 4bit and gptq.
|
||||
include int8, int4 and gptq.
|
||||
bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
|
||||
*args: The args to be passed to the model.
|
||||
**attrs: The kwargs to be passed to the model.
|
||||
@@ -431,6 +443,14 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
int4_quant_type = attrs.pop("llm_bnb_4bit_quant_type", "nf4")
|
||||
int4_use_double_quant = attrs.pop("llm_bnb_4bit_use_double_quant", True)
|
||||
|
||||
if llm_config is not None:
|
||||
logger.debug("Using given 'llm_config=(%s)' to initialize LLM.", llm_config)
|
||||
self.config = llm_config
|
||||
else:
|
||||
self.config = self.config_class.model_construct_env(**attrs)
|
||||
# The rest of the kwargs that are not used by the config class should be stored in __openllm_extras__.
|
||||
attrs = self.config["extras"]
|
||||
|
||||
if quantization_config and quantize:
|
||||
raise ValueError(
|
||||
"""'quantization_config' and 'quantize' are mutually exclusive. Either customise
|
||||
@@ -452,7 +472,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
self,
|
||||
quantize,
|
||||
)
|
||||
if quantize == "8bit":
|
||||
if quantize == "int8":
|
||||
if int8_skip_modules is None:
|
||||
int8_skip_modules = []
|
||||
if "lm_head" not in int8_skip_modules and self.config["model_type"] == "causal_lm":
|
||||
@@ -465,8 +485,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
llm_int8_skip_modules=int8_skip_modules,
|
||||
llm_int8_has_fp16_weight=int8_has_fp16_weight,
|
||||
)
|
||||
elif quantize == "4bit":
|
||||
trf_versions = openllm.utils.pkg.pkg_version_info("transformers")
|
||||
elif quantize == "int4":
|
||||
trf_versions = pkg.pkg_version_info("transformers")
|
||||
supports_kbits = trf_versions[:2] >= (4, 30)
|
||||
if supports_kbits:
|
||||
quantization_config = transformers.BitsAndBytesConfig(
|
||||
@@ -477,7 +497,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
"'quantize' is set to 4bit, while the current transformers version %s does not support "
|
||||
"'quantize' is set to int4, while the current transformers version %s does not support "
|
||||
"k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore "
|
||||
"make sure to install the latest version of transformers either via PyPI or "
|
||||
"from git source: 'pip install git+https://github.com/huggingface/transformers'.",
|
||||
@@ -495,20 +515,12 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
)
|
||||
raise NotImplementedError("GPTQ is not supported yet.")
|
||||
else:
|
||||
raise ValueError(f"'quantize' must be one of ['8bit', '4bit', 'gptq'], got {quantize} instead.")
|
||||
raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantize} instead.")
|
||||
|
||||
attrs.update({"quantization_config": quantization_config})
|
||||
|
||||
if llm_config is not None:
|
||||
logger.debug("Using given 'llm_config=(%s)' to initialize LLM", llm_config)
|
||||
self.config = llm_config
|
||||
else:
|
||||
self.config = self.config_class.model_construct_env(**attrs)
|
||||
# The rest of the kwargs that are not used by the config class should be stored in __openllm_extras__.
|
||||
attrs = self.config["extras"]
|
||||
|
||||
if not self.config["use_pipeline"]:
|
||||
attrs["low_cpu_mem_usage"] = low_cpu_mem_usage
|
||||
if self.__llm_implementation__ == "pt":
|
||||
if not self.config["use_pipeline"]:
|
||||
attrs["low_cpu_mem_usage"] = low_cpu_mem_usage
|
||||
attrs["quantization_config"] = quantization_config
|
||||
|
||||
model_kwds, tokenizer_kwds = {}, {}
|
||||
if self.__llm_init_kwargs__:
|
||||
@@ -527,8 +539,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
model_id = os.environ.get(self.config["env"].model_id, self.config["default_id"])
|
||||
|
||||
# NOTE: This is the actual given path or pretrained weight for this LLM.
|
||||
if t.TYPE_CHECKING:
|
||||
assert model_id is not None
|
||||
assert model_id is not None
|
||||
self._model_id = model_id
|
||||
|
||||
# parsing tokenizer and model kwargs
|
||||
@@ -590,6 +601,16 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
"model_ids": orjson.dumps(self.config["model_ids"]).decode(),
|
||||
}
|
||||
|
||||
@property
|
||||
def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], dict[str, t.Any]], dict[str, t.Any]]:
|
||||
"""Returning the processed model and tokenizer parameters to be used with
|
||||
'import_model' or any other place that requires loading model and tokenizer.
|
||||
|
||||
See 'openllm.cli.download_models' for example usage.
|
||||
It returns a tuple of (model_args, model_kwargs) & tokenizer_kwargs
|
||||
"""
|
||||
return (self._model_args, self._model_attrs), self._tokenizer_attrs
|
||||
|
||||
@staticmethod
|
||||
def make_tag(
|
||||
model_id: str | None = None,
|
||||
@@ -638,6 +659,10 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
return bentoml.Tag.from_taglike(f"{implementation}-{name}:{model_version}")
|
||||
|
||||
def ensure_model_id_exists(self) -> bentoml.Model:
|
||||
"""This utility function will download the model if it doesn't exist yet.
|
||||
Make sure to call this function if 'ensure_available' is not set during
|
||||
Auto LLM initialisation.
|
||||
"""
|
||||
output = subprocess.check_output(
|
||||
[
|
||||
sys.executable,
|
||||
@@ -651,7 +676,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
"porcelain",
|
||||
]
|
||||
)
|
||||
if openllm.utils.DEBUG:
|
||||
if DEBUG or get_debug_mode():
|
||||
# NOTE: This usually only concerns BentoML devs.
|
||||
pattern = r"^__tag__:[^:\n]+:[^:\n]+"
|
||||
matched = re.search(pattern, output.decode("utf-8").strip(), re.MULTILINE)
|
||||
@@ -665,7 +690,15 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
@property
|
||||
def _bentomodel(self) -> bentoml.Model:
|
||||
if self.__llm_bentomodel__ is None:
|
||||
self.__llm_bentomodel__ = self.ensure_model_id_exists()
|
||||
# NOTE: Since PR#28, self.__llm_bentomodel__ changed from
|
||||
# ensure_model_id_exists() into just returning the model ref.
|
||||
# This is because we want to save a few seconds of loading time,
|
||||
# as openllm.Runner and openllm.AutoLLM initialisation is around 700ms
|
||||
# before #28.
|
||||
# If users want to make sure to have the model downloaded,
|
||||
# one should invoke `LLM.ensure_model_id_exists()` manually,
|
||||
# or pass `ensure_available=True` into the Auto LLM initialisation.
|
||||
self.__llm_bentomodel__ = bentoml.transformers.get(self.tag)
|
||||
return self.__llm_bentomodel__
|
||||
|
||||
@property
|
||||
@@ -729,13 +762,14 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
)
|
||||
return self.__llm_tokenizer__
|
||||
|
||||
# The order of these fields matters here; make sure to sync it with
|
||||
# openllm.models.auto.factory._BaseAutoLLMClass.for_model
|
||||
def to_runner(
|
||||
self,
|
||||
models: list[bentoml.Model] | None = None,
|
||||
max_batch_size: int | None = None,
|
||||
max_latency_ms: int | None = None,
|
||||
method_configs: dict[str, ModelSignatureDict | ModelSignature] | None = None,
|
||||
embedded: bool = False,
|
||||
scheduling_strategy: type[Strategy] | None = None,
|
||||
) -> LLMRunner:
|
||||
"""Convert this LLM into a Runner.
|
||||
@@ -753,6 +787,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
NOTE: There are some differences between bentoml.models.get().to_runner() and LLM.to_runner(): 'name'.
|
||||
- 'name': will be generated by OpenLLM, hence users shouldn't worry about this.
|
||||
The generated name will be 'llm-<model-start-name>-runner' (ex: llm-dolly-v2-runner, llm-chatglm-runner)
|
||||
- 'embedded': Will be disabled by default. There is no reason to run LLM in embedded mode.
|
||||
"""
|
||||
models = models if models is not None else []
|
||||
models.append(self._bentomodel)
|
||||
@@ -768,10 +803,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
method_configs = {"generate": generate_sig, "generate_iterator": generate_iterator_sig}
|
||||
else:
|
||||
signatures = ModelSignature.convert_signatures_dict(method_configs)
|
||||
generate_sig = openllm.utils.first_not_none(signatures.get("generate"), default=generate_sig)
|
||||
generate_iterator_sig = openllm.utils.first_not_none(
|
||||
signatures.get("generate_iterator"), default=generate_iterator_sig
|
||||
)
|
||||
generate_sig = first_not_none(signatures.get("generate"), default=generate_sig)
|
||||
generate_iterator_sig = first_not_none(signatures.get("generate_iterator"), default=generate_iterator_sig)
|
||||
|
||||
class _Runnable(bentoml.Runnable):
|
||||
SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
|
||||
@@ -860,11 +893,11 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
},
|
||||
),
|
||||
name=self.runner_name,
|
||||
embedded=False,
|
||||
models=models,
|
||||
max_batch_size=max_batch_size,
|
||||
max_latency_ms=max_latency_ms,
|
||||
method_configs=bentoml_cattr.unstructure(method_configs),
|
||||
embedded=embedded,
|
||||
scheduling_strategy=scheduling_strategy,
|
||||
)
|
||||
|
||||
@@ -918,22 +951,28 @@ def Runner(
|
||||
...
|
||||
|
||||
|
||||
def Runner(model_name: str, **attrs: t.Any) -> LLMRunner:
|
||||
def Runner(model_name: str, ensure_available: bool = True, init_local: bool = False, **attrs: t.Any) -> LLMRunner:
|
||||
"""Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'
|
||||
|
||||
Args:
|
||||
model_name: Supported model name from 'openllm models'
|
||||
ensure_available: If True, it will ensure the model is available before creating the runner.
|
||||
Set to False for faster creation time. Note that you will need to make sure
|
||||
the model for this 'model_id' is available before calling the runner.
|
||||
One can do this by doing the following:
|
||||
```python
|
||||
runner = openllm.Runner("dolly-v2", ensure_available=False)
|
||||
runner.llm.ensure_model_id_exists()
|
||||
```
|
||||
init_local: If True, it will initialize the model locally. This is useful if you want to
|
||||
run the model locally. (Symmetrical to bentoml.Runner.init_local())
|
||||
**attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs
|
||||
behaviour
|
||||
"""
|
||||
init_local = attrs.pop("init_local", False)
|
||||
ModelEnv = openllm.utils.ModelEnv(model_name)
|
||||
if ModelEnv.get_framework_env() == "flax":
|
||||
runner = openllm.AutoFlaxLLM.create_runner(model_name, **attrs)
|
||||
elif ModelEnv.get_framework_env() == "tf":
|
||||
runner = openllm.AutoTFLLM.create_runner(model_name, **attrs)
|
||||
else:
|
||||
runner = openllm.AutoLLM.create_runner(model_name, **attrs)
|
||||
runner = t.cast(
|
||||
"_BaseAutoLLMClass",
|
||||
openllm[ModelEnv(model_name)["framework_value"]], # type: ignore (internal API)
|
||||
).create_runner(model_name, ensure_available=ensure_available, **attrs)
|
||||
|
||||
if init_local:
|
||||
runner.init_local(quiet=True)
|
||||
|
||||
@@ -29,12 +29,15 @@ from bentoml._internal.bento.build_config import DockerOptions, PythonOptions
|
||||
from bentoml._internal.configuration import get_debug_mode
|
||||
|
||||
import openllm
|
||||
import openllm.utils as utils
|
||||
from openllm.utils import pkg
|
||||
|
||||
from .utils import (ModelEnv, codegen, first_not_none, is_flax_available,
|
||||
is_tf_available, is_torch_available, pkg)
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from fs.base import FS
|
||||
|
||||
from .models.auto.factory import _BaseAutoLLMClass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
OPENLLM_DEV_BUILD = "OPENLLM_DEV_BUILD"
|
||||
@@ -82,10 +85,10 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
|
||||
if not (str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false"):
|
||||
packages.append(f"bentoml>={'.'.join([str(i) for i in pkg.pkg_version_info('bentoml')])}")
|
||||
|
||||
env = llm.config["env"]
|
||||
to_use_framework = env.get_framework_env()
|
||||
if to_use_framework == "flax":
|
||||
assert utils.is_flax_available(), f"Flax is not available, while {env.framework} is set to 'flax'"
|
||||
env: ModelEnv = llm.config["env"]
|
||||
framework_envvar = env["framework_value"]
|
||||
if framework_envvar == "flax":
|
||||
assert is_flax_available(), f"Flax is not available, while {env.framework} is set to 'flax'"
|
||||
packages.extend(
|
||||
[
|
||||
f"flax>={importlib.metadata.version('flax')}",
|
||||
@@ -93,8 +96,8 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
|
||||
f"jaxlib>={importlib.metadata.version('jaxlib')}",
|
||||
]
|
||||
)
|
||||
elif to_use_framework == "tf":
|
||||
assert utils.is_tf_available(), f"TensorFlow is not available, while {env.framework} is set to 'tf'"
|
||||
elif framework_envvar == "tf":
|
||||
assert is_tf_available(), f"TensorFlow is not available, while {env.framework} is set to 'tf'"
|
||||
candidates = (
|
||||
"tensorflow",
|
||||
"tensorflow-cpu",
|
||||
@@ -116,7 +119,7 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
|
||||
except importlib.metadata.PackageNotFoundError:
|
||||
pass
|
||||
else:
|
||||
assert utils.is_torch_available(), "PyTorch is not available. Make sure to have it locally installed."
|
||||
assert is_torch_available(), "PyTorch is not available. Make sure to have it locally installed."
|
||||
packages.extend([f"torch>={importlib.metadata.version('torch')}"])
|
||||
|
||||
wheels: list[str] = []
|
||||
@@ -127,7 +130,13 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
|
||||
return PythonOptions(packages=packages, wheels=wheels, lock_packages=True)
|
||||
|
||||
|
||||
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float) -> DockerOptions:
|
||||
def construct_docker_options(
|
||||
llm: openllm.LLM[t.Any, t.Any],
|
||||
_: FS,
|
||||
workers_per_resource: int | float,
|
||||
quantize: t.LiteralString | None,
|
||||
bettertransformer: bool | None,
|
||||
) -> DockerOptions:
|
||||
_bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
|
||||
_bentoml_config_options_opts = [
|
||||
"api_server.traffic.timeout=36000", # NOTE: Currently we hardcode this value
|
||||
@@ -135,39 +144,112 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
|
||||
f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
|
||||
]
|
||||
_bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
|
||||
env = llm.config["env"]
|
||||
return DockerOptions(
|
||||
cuda_version="11.6", # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version
|
||||
env={
|
||||
env.framework: env.get_framework_env(),
|
||||
"OPENLLM_MODEL": llm.config["model_name"],
|
||||
"OPENLLM_MODEL_ID": llm.model_id,
|
||||
"BENTOML_DEBUG": str(get_debug_mode()),
|
||||
"BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
|
||||
},
|
||||
system_packages=["git"],
|
||||
env: ModelEnv = llm.config["env"]
|
||||
|
||||
env_dict = {
|
||||
env.framework: env.framework_value,
|
||||
env.config: llm.config.model_dump_json().decode(),
|
||||
"OPENLLM_MODEL": llm.config["model_name"],
|
||||
"OPENLLM_MODEL_ID": llm.model_id,
|
||||
"BENTOML_DEBUG": str(get_debug_mode()),
|
||||
"BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
|
||||
}
|
||||
|
||||
# We need to handle None separately here, as env from subprocess doesn't
|
||||
# accept None value.
|
||||
_env = ModelEnv(llm.config["model_name"], bettertransformer=bettertransformer, quantize=quantize)
|
||||
|
||||
if _env.bettertransformer_value is not None:
|
||||
env_dict[_env.bettertransformer] = _env.bettertransformer_value
|
||||
if _env.quantize_value is not None:
|
||||
env_dict[_env.quantize] = _env.quantize_value
|
||||
|
||||
# NOTE: Torch 2.0 currently only supports 11.6 as the latest CUDA version
|
||||
return DockerOptions(cuda_version="11.6", env=env_dict, system_packages=["git"])
|
||||
|
||||
|
||||
@t.overload
|
||||
def build(
|
||||
model_name: str,
|
||||
*,
|
||||
model_id: str | None = ...,
|
||||
quantize: t.LiteralString | None = ...,
|
||||
bettertransformer: bool | None = ...,
|
||||
_workers_per_resource: int | float | None = ...,
|
||||
_overwrite_existing_bento: bool = ...,
|
||||
__cli__: t.Literal[False] = ...,
|
||||
**attrs: t.Any,
|
||||
) -> bentoml.Bento:
|
||||
...
|
||||
|
||||
|
||||
@t.overload
|
||||
def build(
|
||||
model_name: str,
|
||||
*,
|
||||
model_id: str | None = ...,
|
||||
quantize: t.LiteralString | None = ...,
|
||||
bettertransformer: bool | None = ...,
|
||||
_workers_per_resource: int | float | None = ...,
|
||||
_overwrite_existing_bento: bool = ...,
|
||||
__cli__: t.Literal[True] = ...,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[bentoml.Bento, bool]:
|
||||
...
|
||||
|
||||
|
||||
def _build_bento(
|
||||
bento_tag: bentoml.Tag,
|
||||
service_name: str,
|
||||
llm_fs: FS,
|
||||
llm: openllm.LLM[t.Any, t.Any],
|
||||
workers_per_resource: int | float,
|
||||
quantize: t.LiteralString | None,
|
||||
bettertransformer: bool | None,
|
||||
) -> bentoml.Bento:
|
||||
framework_envvar = llm.config["env"]["framework_value"]
|
||||
labels = dict(llm.identifying_params)
|
||||
labels.update({"_type": llm.llm_type, "_framework": framework_envvar})
|
||||
logger.info("Building Bento for LLM '%s'", llm.config["start_name"])
|
||||
return bentoml.bentos.build(
|
||||
f"{service_name}:svc",
|
||||
name=bento_tag.name,
|
||||
labels=labels,
|
||||
description=f"OpenLLM service for {llm.config['start_name']}",
|
||||
include=[
|
||||
f for f in llm_fs.walk.files(filter=["*.py"])
|
||||
], # NOTE: By default, we are using _service.py as the default service, for now.
|
||||
exclude=["/venv", "__pycache__/", "*.py[cod]", "*$py.class"],
|
||||
python=construct_python_options(llm, llm_fs),
|
||||
docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer),
|
||||
version=bento_tag.version,
|
||||
build_ctx=llm_fs.getsyspath("/"),
|
||||
)
|
||||
|
||||
|
||||
@t.overload
|
||||
def build(model_name: str, *, __cli__: t.Literal[False] = ..., **attrs: t.Any) -> bentoml.Bento:
|
||||
...
|
||||
def build(
|
||||
model_name: str,
|
||||
*,
|
||||
model_id: str | None = None,
|
||||
quantize: t.LiteralString | None = None,
|
||||
bettertransformer: bool | None = None,
|
||||
_workers_per_resource: int | float | None = None,
|
||||
_overwrite_existing_bento: bool = False,
|
||||
__cli__: bool = False,
|
||||
**attrs: t.Any,
|
||||
) -> tuple[bentoml.Bento, bool] | bentoml.Bento:
|
||||
"""Package a LLM into a Bento.
|
||||
|
||||
The LLM will be built into a BentoService with the following structure:
|
||||
if quantize is passed, it will instruct the model to be quantized dynamically during serving time.
|
||||
if bettertransformer is passed, it will instruct the model to use BetterTransformer during serving time.
|
||||
|
||||
@t.overload
|
||||
def build(model_name: str, *, __cli__: t.Literal[True] = ..., **attrs: t.Any) -> tuple[bentoml.Bento, bool]:
|
||||
...
|
||||
Other parameters including model_name, model_id and attrs will be passed to the LLM class itself.
|
||||
"""
|
||||
|
||||
|
||||
def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[bentoml.Bento, bool] | bentoml.Bento:
|
||||
"""Package a LLM into a Bento."""
|
||||
|
||||
overwrite_existing_bento = attrs.pop("_overwrite_existing_bento", False)
|
||||
_previously_built = False
|
||||
current_model_envvar = os.environ.pop("OPENLLM_MODEL", None)
|
||||
current_model_id_envvar = os.environ.pop("OPENLLM_MODEL_ID", None)
|
||||
_previously_built = False
|
||||
workers_per_resource = attrs.pop("_workers_per_resource", None)
|
||||
model_id: str = attrs.pop("model_id", None)
|
||||
|
||||
llm_config = openllm.AutoConfig.for_model(model_name)
|
||||
|
||||
@@ -178,52 +260,58 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
|
||||
try:
|
||||
os.environ["OPENLLM_MODEL"] = inflection.underscore(model_name)
|
||||
|
||||
to_use_framework = llm_config["env"].get_framework_env()
|
||||
if to_use_framework == "flax":
|
||||
llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs)
|
||||
elif to_use_framework == "tf":
|
||||
llm = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs)
|
||||
else:
|
||||
llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs)
|
||||
framework_envvar = llm_config["env"]["framework_value"]
|
||||
llm = t.cast(
|
||||
"_BaseAutoLLMClass",
|
||||
openllm[framework_envvar], # type: ignore (internal API)
|
||||
).for_model(
|
||||
model_name,
|
||||
model_id=model_id,
|
||||
llm_config=llm_config,
|
||||
quantize=quantize,
|
||||
bettertransformer=bettertransformer,
|
||||
**attrs,
|
||||
)
|
||||
|
||||
os.environ["OPENLLM_MODEL_ID"] = llm.model_id
|
||||
|
||||
labels = dict(llm.identifying_params)
|
||||
labels.update({"_type": llm.llm_type, "_framework": to_use_framework})
|
||||
labels.update({"_type": llm.llm_type, "_framework": framework_envvar})
|
||||
service_name = f"generated_{llm_config['model_name']}_service.py"
|
||||
workers_per_resource = utils.first_not_none(workers_per_resource, default=llm_config["workers_per_resource"])
|
||||
workers_per_resource = first_not_none(_workers_per_resource, default=llm_config["workers_per_resource"])
|
||||
|
||||
with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
|
||||
# add service.py definition to this temporary folder
|
||||
utils.codegen.write_service(model_name, llm.model_id, service_name, llm_fs)
|
||||
codegen.write_service(model_name, llm.model_id, service_name, llm_fs)
|
||||
|
||||
bento_tag = bentoml.Tag.from_taglike(f"{llm.llm_type}-service:{llm.tag.version}")
|
||||
try:
|
||||
bento = bentoml.get(bento_tag)
|
||||
if overwrite_existing_bento:
|
||||
if _overwrite_existing_bento:
|
||||
logger.info("Overwriting previously saved Bento.")
|
||||
bentoml.delete(bento_tag)
|
||||
raise bentoml.exceptions.NotFound("Overwriting previously saved Bento.")
|
||||
bento = _build_bento(
|
||||
bento_tag,
|
||||
service_name,
|
||||
llm_fs,
|
||||
llm,
|
||||
workers_per_resource=workers_per_resource,
|
||||
quantize=quantize,
|
||||
bettertransformer=bettertransformer,
|
||||
)
|
||||
_previously_built = True
|
||||
except bentoml.exceptions.NotFound:
|
||||
logger.info("Building Bento for LLM '%s'", llm_config["start_name"])
|
||||
bento = bentoml.bentos.build(
|
||||
f"{service_name}:svc",
|
||||
name=bento_tag.name,
|
||||
labels=labels,
|
||||
description=f"OpenLLM service for {llm_config['start_name']}",
|
||||
include=[
|
||||
f for f in llm_fs.walk.files(filter=["*.py"])
|
||||
], # NOTE: By default, we are using _service.py as the default service, for now.
|
||||
exclude=["/venv", "__pycache__/", "*.py[cod]", "*$py.class"],
|
||||
python=construct_python_options(llm, llm_fs),
|
||||
docker=construct_docker_options(llm, llm_fs, workers_per_resource),
|
||||
version=bento_tag.version,
|
||||
build_ctx=llm_fs.getsyspath("/"),
|
||||
bento = _build_bento(
|
||||
bento_tag,
|
||||
service_name,
|
||||
llm_fs,
|
||||
llm,
|
||||
workers_per_resource=workers_per_resource,
|
||||
quantize=quantize,
|
||||
bettertransformer=bettertransformer,
|
||||
)
|
||||
if __cli__:
|
||||
return bento, _previously_built
|
||||
else:
|
||||
return bento
|
||||
return (bento, _previously_built) if __cli__ else bento
|
||||
except Exception as e:
|
||||
logger.error("\nException caught during building LLM %s: \n", model_name, exc_info=e)
|
||||
raise
|
||||
|
||||
@@ -34,7 +34,16 @@ model = os.environ.get("OPENLLM_MODEL", "{__model_name__}") # openllm: model na
|
||||
model_id = os.environ.get("OPENLLM_MODEL_ID", "{__model_id__}") # openllm: model id
|
||||
|
||||
llm_config = openllm.AutoConfig.for_model(model)
|
||||
runner = openllm.Runner(model, model_id=model_id, llm_config=llm_config)
|
||||
|
||||
runner = openllm.Runner(
|
||||
model,
|
||||
model_id=model_id,
|
||||
llm_config=llm_config,
|
||||
bettertransformer=llm_config["env"]["bettertransformer_value"],
|
||||
quantize=llm_config["env"]["quantize_value"],
|
||||
ensure_available=False,
|
||||
init_local=False,
|
||||
)
|
||||
|
||||
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner])
|
||||
|
||||
@@ -57,6 +66,6 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
|
||||
model_id=model_id,
|
||||
timeout=llm_config["timeout"],
|
||||
model_name=llm_config["model_name"],
|
||||
framework=llm_config["env"].get_framework_env(),
|
||||
framework=llm_config["env"]["framework_value"],
|
||||
configuration=llm_config.model_dump_json().decode(),
|
||||
)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -94,22 +94,21 @@ class _BaseAutoLLMClass:
|
||||
>>> llm = openllm.AutoLLM.for_model("flan-t5")
|
||||
```
|
||||
"""
|
||||
runner_kwargs_name = [
|
||||
# order matters here
|
||||
runner_kwargs_name = {
|
||||
"models",
|
||||
"max_batch_size",
|
||||
"max_latency_ms",
|
||||
"method_configs",
|
||||
"embedded",
|
||||
"scheduling_strategy",
|
||||
]
|
||||
}
|
||||
to_runner_attrs = {k: v for k, v in attrs.items() if k in runner_kwargs_name}
|
||||
for k in to_runner_attrs:
|
||||
del attrs[k]
|
||||
normalized = inflection.underscore(model_name)
|
||||
if cls._model_mapping.get(normalized, None, mapping_type="name2model"):
|
||||
attrs = {k: v for k, v in attrs.items() if k not in to_runner_attrs}
|
||||
if cls._model_mapping.get(inflection.underscore(model_name), None, mapping_type="name2model"):
|
||||
if not isinstance(llm_config, openllm.LLMConfig):
|
||||
# The rest of kwargs is now passed to config
|
||||
llm_config = AutoConfig.for_model(normalized, **attrs)
|
||||
llm_config = AutoConfig.for_model(model_name, **attrs)
|
||||
attrs = llm_config.__openllm_extras__
|
||||
# the rest of attrs will be saved to __openllm_extras__
|
||||
llm = cls._model_mapping[type(llm_config)].from_pretrained(
|
||||
model_id,
|
||||
|
||||
@@ -104,25 +104,25 @@ class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrain
|
||||
chat_history.append((prompt, generation_result))
|
||||
return "".join(generation_result)
|
||||
|
||||
@torch.inference_mode()
|
||||
def generate(self, prompt: str, use_default_prompt_template: bool = True, **attrs: t.Any) -> str:
|
||||
self.model.eval()
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> str:
|
||||
with torch.inference_mode():
|
||||
self.model.eval()
|
||||
|
||||
# Only use half precision if the model is not yet quantized
|
||||
if self.config.use_half_precision:
|
||||
self.model.half()
|
||||
# Only use half precision if the model is not yet quantized
|
||||
if self.config.use_half_precision:
|
||||
self.model.half()
|
||||
|
||||
self.model.cuda()
|
||||
self.model.cuda()
|
||||
|
||||
logit_processor: list[LogitsProcessor] = LogitsProcessorList()
|
||||
logit_processor.append(InvalidScoreLogitsProcessor())
|
||||
logit_processor: list[LogitsProcessor] = LogitsProcessorList()
|
||||
logit_processor.append(InvalidScoreLogitsProcessor())
|
||||
|
||||
inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
|
||||
outputs = self.model.generate(
|
||||
**inputs,
|
||||
generation_config=self.config.model_construct_env(do_sample=True, **attrs).to_generation_config(),
|
||||
logits_processor=logit_processor,
|
||||
)
|
||||
outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :]
|
||||
response = self.tokenizer.decode(outputs)
|
||||
return self.model.process_response(response)
|
||||
inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
|
||||
outputs = self.model.generate(
|
||||
**inputs,
|
||||
generation_config=self.config.model_construct_env(do_sample=True, **attrs).to_generation_config(),
|
||||
logits_processor=logit_processor,
|
||||
)
|
||||
outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) :]
|
||||
response = self.tokenizer.decode(outputs)
|
||||
return self.model.process_response(response)
|
||||
|
||||
@@ -98,19 +98,19 @@ class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedToken
|
||||
) -> str:
|
||||
return generation_result[0]["generated_text"]
|
||||
|
||||
@torch.inference_mode()
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
|
||||
self.model.tokenizer = self.tokenizer
|
||||
llm_config = self.config.model_construct_env(**attrs)
|
||||
decoded: list[dict[t.Literal["generated_text"], str]] = self.model(
|
||||
prompt, generation_config=llm_config.to_generation_config()
|
||||
)
|
||||
with torch.inference_mode():
|
||||
self.model.tokenizer = self.tokenizer
|
||||
llm_config = self.config.model_construct_env(**attrs)
|
||||
decoded: list[dict[t.Literal["generated_text"], str]] = self.model(
|
||||
prompt, generation_config=llm_config.to_generation_config()
|
||||
)
|
||||
|
||||
if llm_config.return_full_text:
|
||||
return [
|
||||
{k: f"{DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt)}\n{generated}"}
|
||||
for i in decoded
|
||||
for k, generated in i.items()
|
||||
]
|
||||
if llm_config.return_full_text:
|
||||
return [
|
||||
{k: f"{DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt)}\n{generated}"}
|
||||
for i in decoded
|
||||
for k, generated in i.items()
|
||||
]
|
||||
|
||||
return decoded
|
||||
return decoded
|
||||
|
||||
@@ -74,14 +74,14 @@ class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformer
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
|
||||
return generation_result[0]
|
||||
|
||||
@torch.inference_mode()
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
if torch.cuda.is_available():
|
||||
self.model.cuda()
|
||||
input_ids = t.cast("torch.Tensor", self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device)
|
||||
result_tensor = self.model.generate(
|
||||
input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
)
|
||||
return self.tokenizer.batch_decode(result_tensor, skip_special_tokens=True)
|
||||
with torch.inference_mode():
|
||||
if torch.cuda.is_available():
|
||||
self.model.cuda()
|
||||
input_ids = t.cast("torch.Tensor", self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device)
|
||||
result_tensor = self.model.generate(
|
||||
input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
)
|
||||
return self.tokenizer.batch_decode(result_tensor, skip_special_tokens=True)
|
||||
|
||||
@@ -129,15 +129,15 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer
|
||||
else:
|
||||
return "\n".join(generation_result)
|
||||
|
||||
@torch.inference_mode()
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
if torch.cuda.is_available() and torch.cuda.device_count() == 1:
|
||||
self.model.cuda()
|
||||
with torch.inference_mode():
|
||||
if torch.cuda.is_available() and torch.cuda.device_count() == 1:
|
||||
self.model.cuda()
|
||||
|
||||
input_ids = t.cast(torch.Tensor, self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device)
|
||||
generated_tensors = self.model.generate(
|
||||
input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
)
|
||||
return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
|
||||
input_ids = t.cast(torch.Tensor, self.tokenizer(prompt, return_tensors="pt").input_ids).to(self.device)
|
||||
generated_tensors = self.model.generate(
|
||||
input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
)
|
||||
return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
|
||||
|
||||
@@ -120,18 +120,20 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
|
||||
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
|
||||
return generation_result[0]
|
||||
|
||||
@torch.inference_mode()
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device)
|
||||
result_tensor = self.model.generate(
|
||||
inputs,
|
||||
do_sample=True,
|
||||
pad_token_id=self.tokenizer.eos_token_id,
|
||||
# eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
)
|
||||
# TODO: We will probably want to return the tokenizer here so that we can manually process this
|
||||
# return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
|
||||
return self.tokenizer.batch_decode(
|
||||
result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True
|
||||
)
|
||||
with torch.inference_mode():
|
||||
inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device)
|
||||
result_tensor = self.model.generate(
|
||||
inputs,
|
||||
do_sample=True,
|
||||
pad_token_id=self.tokenizer.eos_token_id,
|
||||
# eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
)
|
||||
# TODO: We will probably want to return the tokenizer here so that we can manually process this
|
||||
# return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
|
||||
return self.tokenizer.batch_decode(
|
||||
result_tensor[0],
|
||||
skip_special_tokens=True,
|
||||
clean_up_tokenization_spaces=True,
|
||||
)
|
||||
|
||||
@@ -35,6 +35,11 @@ from bentoml._internal.utils import (LazyLoader, bentoml_cattr,
|
||||
|
||||
from .lazy import LazyModule
|
||||
|
||||
# NOTE: This set contains the names of modules
|
||||
# that are available above and are whitelisted
|
||||
# to be included in the extra_objects map.
|
||||
_whitelist_modules = {"pkg"}
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
@@ -86,7 +91,9 @@ DEBUG = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.envi
|
||||
# XXX: define all classes, functions and imports above this line
|
||||
# since _extras is built from the locals() of this file.
|
||||
_extras: dict[str, t.Any] = {
|
||||
k: v for k, v in locals().items() if not isinstance(v, types.ModuleType) and not k.startswith("_")
|
||||
k: v
|
||||
for k, v in locals().items()
|
||||
if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith("_"))
|
||||
}
|
||||
|
||||
_import_structure = {
|
||||
|
||||
@@ -97,7 +97,7 @@ def attrs_to_options(
|
||||
)
|
||||
|
||||
|
||||
def _default_converter(value: t.Any, env: str | None) -> t.Any:
|
||||
def env_converter(value: t.Any, env: str | None = None) -> t.Any:
|
||||
if env is not None:
|
||||
value = os.environ.get(env, value)
|
||||
if value is not None and isinstance(value, str):
|
||||
@@ -135,7 +135,8 @@ def Field(
|
||||
on kw_only. If kw_only=True, then this field will become 'Required' and the default
|
||||
value is omitted. If kw_only=False, then the default value will be used as before.
|
||||
use_default_converter: a bool indicating whether to use the default converter. Defaults
|
||||
to True. If set to False, then the default converter will not be used.
|
||||
to True. If set to False, then the default converter will not be used. The default
|
||||
converter converts a given value from the environment variable for this given Field.
|
||||
**kwargs: The rest of the arguments are passed to attr.field
|
||||
"""
|
||||
metadata = attrs.pop("metadata", {})
|
||||
@@ -148,7 +149,7 @@ def Field(
|
||||
|
||||
converter = attrs.pop("converter", None)
|
||||
if use_default_converter:
|
||||
converter = functools.partial(_default_converter, env=env)
|
||||
converter = functools.partial(env_converter, env=env)
|
||||
|
||||
if ge is not None:
|
||||
piped.append(attr.validators.ge(ge))
|
||||
|
||||
@@ -15,6 +15,8 @@
|
||||
"""
|
||||
Some import utils are vendored from transformers/utils/import_utils.py for performance reasons.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import importlib.metadata
|
||||
import importlib.util
|
||||
@@ -24,7 +26,6 @@ import typing as t
|
||||
from abc import ABCMeta
|
||||
from collections import OrderedDict
|
||||
|
||||
import attr
|
||||
import inflection
|
||||
from bentoml._internal.utils import LazyLoader
|
||||
from packaging import version
|
||||
@@ -236,31 +237,73 @@ def require_backends(o: t.Any, backends: t.MutableSequence[str]):
|
||||
raise ImportError("".join(failed))
|
||||
|
||||
|
||||
@attr.define
|
||||
class ModelEnv:
|
||||
model_name: str = attr.field(converter=inflection.underscore)
|
||||
model_name: str
|
||||
|
||||
@property
|
||||
def framework(self) -> str:
|
||||
return f"OPENLLM_{self.model_name.upper()}_FRAMEWORK"
|
||||
if t.TYPE_CHECKING:
|
||||
config: property
|
||||
model_id: property
|
||||
quantize: property
|
||||
framework: property
|
||||
bettertransformer: property
|
||||
|
||||
@property
|
||||
def model_config(self) -> str:
|
||||
return f"OPENLLM_{self.model_name.upper()}_CONFIG"
|
||||
framework_value: property
|
||||
quantize_value: property
|
||||
bettertransformer_value: property
|
||||
|
||||
@property
|
||||
def model_id(self) -> str:
|
||||
return f"OPENLLM_{self.model_name.upper()}_MODEL_ID"
|
||||
def __getitem__(self, item: str | t.Any) -> t.Any:
|
||||
if hasattr(self, item):
|
||||
return getattr(self, item)
|
||||
raise KeyError(f"Key {item} not found in {self}")
|
||||
|
||||
@property
|
||||
def bettertransformer(self) -> str:
|
||||
return f"OPENLLM_{self.model_name.upper()}_BETTERTRANSFORMER"
|
||||
def __new__(cls, model_name: str, bettertransformer: bool | None = None, quantize: t.LiteralString | None = None):
|
||||
from .._configuration import _field_env_key
|
||||
from . import codegen
|
||||
|
||||
def gen_env_key(self, key: str) -> str:
|
||||
return f"OPENLLM_{self.model_name.upper()}_{key.upper()}"
|
||||
model_name = inflection.underscore(model_name)
|
||||
|
||||
def convert_to_bettertransformer(self) -> bool:
|
||||
return os.environ.get(self.bettertransformer, str(False)).lower() == "true"
|
||||
res = super().__new__(cls)
|
||||
res.model_name = model_name
|
||||
|
||||
# gen properties env key
|
||||
attributes = {"config", "model_id", "quantize", "framework", "bettertransformer"}
|
||||
for att in attributes:
|
||||
setattr(res, att, _field_env_key(model_name, att.upper()))
|
||||
|
||||
# gen properties env value
|
||||
attributes_with_values = {
|
||||
"quantize": (bool, quantize),
|
||||
"bettertransformer": (bool, bettertransformer),
|
||||
"framework": (str, "pt"),
|
||||
}
|
||||
globs: dict[str, t.Any] = {
|
||||
"__bool_vars_value": ENV_VARS_TRUE_VALUES,
|
||||
"__env_get": os.environ.get,
|
||||
"self": res,
|
||||
}
|
||||
|
||||
for attribute, (default_type, default_value) in attributes_with_values.items():
|
||||
lines: list[str] = []
|
||||
if default_type is bool:
|
||||
lines.append(
|
||||
f"return str(__env_get(self['{attribute}'], str(__env_default)).upper() in __bool_vars_value)"
|
||||
)
|
||||
else:
|
||||
lines.append(f"return __env_get(self['{attribute}'], __env_default)")
|
||||
|
||||
setattr(
|
||||
res,
|
||||
f"{attribute}_value",
|
||||
codegen.generate_function(
|
||||
cls,
|
||||
"_env_get_" + attribute,
|
||||
lines,
|
||||
("__env_default",),
|
||||
globs,
|
||||
)(default_value),
|
||||
)
|
||||
|
||||
return res
|
||||
|
||||
@property
|
||||
def start_docstring(self) -> str:
|
||||
@@ -269,9 +312,3 @@ class ModelEnv:
|
||||
@property
|
||||
def module(self) -> LazyLoader:
|
||||
return LazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}")
|
||||
|
||||
def get_framework_env(self) -> t.Literal["pt", "flax", "tf"]:
|
||||
envvar = os.environ.get(self.framework, "pt")
|
||||
if envvar not in ("pt", "tf", "flax"):
|
||||
raise ValueError(f"Invalid framework implementation {envvar}, must be one of 'pt', 'tf', 'flax'")
|
||||
return envvar
|
||||
|
||||
@@ -21,6 +21,19 @@ import os
|
||||
import types
|
||||
import typing as t
|
||||
|
||||
from ..exceptions import ForbiddenAttributeError, OpenLLMException
|
||||
|
||||
|
||||
class UsageNotAllowedError(OpenLLMException):
    """Raised when LazyModule.__getitem__ is used on a module that has no special mapping."""
|
||||
|
||||
|
||||
class MissingAttributesError(OpenLLMException):
    """Raised when the given key is not available in the LazyModule special mapping."""
|
||||
|
||||
|
||||
_reserved_namespace = {"__openllm_special__"}
|
||||
|
||||
|
||||
class LazyModule(types.ModuleType):
|
||||
"""
|
||||
@@ -49,9 +62,7 @@ class LazyModule(types.ModuleType):
|
||||
for value in values:
|
||||
self._class_to_module[value] = key
|
||||
# Needed for autocompletion in an IDE
|
||||
self.__all__ = (
|
||||
list(import_structure.keys()) + list(itertools.chain(*import_structure.values())) + list(_extra_objects)
|
||||
)
|
||||
self.__all__ = list(import_structure.keys()) + list(itertools.chain(*import_structure.values()))
|
||||
self.__file__ = module_file
|
||||
self.__spec__ = module_spec
|
||||
self.__path__ = [os.path.dirname(module_file)]
|
||||
@@ -71,13 +82,30 @@ class LazyModule(types.ModuleType):
|
||||
result.append(attribute)
|
||||
return result
|
||||
|
||||
def __getitem__(self, key: str) -> t.Any:
    """Resolve ``key`` through the alias mapping stored under ``__openllm_special__``.

    The mapping translates a short key into the name of an attribute of this
    module; the attribute is then fetched with ``getattr``.

    Args:
        key: The alias to look up in the special mapping.

    Returns:
        The attribute of this module that the alias points to.

    Raises:
        UsageNotAllowedError: If no special mapping is registered for this module.
        KeyError: If ``key`` is not part of the mapping, or if the mapped
            attribute cannot be resolved. The underlying error is chained as
            the cause.
    """
    aliases = self._objects.get("__openllm_special__")
    if aliases is None:
        raise UsageNotAllowedError(f"'{self._name}' is not allowed to be used as a dict.")

    try:
        if key not in aliases:
            # Raised inside the ``try`` on purpose: the generic handler below
            # turns it into a KeyError, which is what callers observe.
            raise MissingAttributesError(f"Requested '{key}' is not available in given mapping.")
        return getattr(self, aliases[key])
    except AttributeError as err:
        raise KeyError(f"'{self._name}' has no attribute {aliases[key]}") from err
    except Exception as err:
        raise KeyError(f"Failed to lookup '{key}' in '{self._name}'") from err
|
||||
|
||||
def __getattr__(self, name: str) -> t.Any:
|
||||
if name in _reserved_namespace:
|
||||
raise ForbiddenAttributeError(
|
||||
f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified."
|
||||
)
|
||||
if name in self._objects:
|
||||
return self._objects[name]
|
||||
return self._objects.__getitem__(name)
|
||||
if name in self._modules:
|
||||
value = self._get_module(name)
|
||||
elif name in self._class_to_module.keys():
|
||||
module = self._get_module(self._class_to_module[name])
|
||||
module = self._get_module(self._class_to_module.__getitem__(name))
|
||||
value = getattr(module, name)
|
||||
else:
|
||||
raise AttributeError(f"module {self.__name__} has no attribute {name}")
|
||||
|
||||
@@ -13,9 +13,10 @@
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
import typing as t
|
||||
|
||||
import attr
|
||||
|
||||
import openllm
|
||||
from openllm._prompt import PromptFormatter
|
||||
|
||||
@@ -34,13 +35,11 @@ class PartialDict(DictStrStr):
|
||||
return "{" + key + "}"
|
||||
|
||||
|
||||
@dataclasses.dataclass(slots=True)
|
||||
@attr.define(slots=True)
|
||||
class PromptTemplate:
|
||||
template: str
|
||||
input_variables: t.Sequence[str]
|
||||
|
||||
model_config = {"extra": "forbid"}
|
||||
|
||||
def to_str(self, __partial_dict__: PartialDict | None = None, **attrs: str) -> str:
|
||||
"""Generate a prompt from the template and input variables"""
|
||||
if __partial_dict__:
|
||||
|
||||
@@ -25,6 +25,7 @@ import httpx
|
||||
import openllm
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from openllm.models.auto.factory import _BaseAutoLLMClass
|
||||
|
||||
class AnnotatedClient(bentoml.client.Client):
|
||||
def health(self, *args: t.Any, **attrs: t.Any) -> t.Any:
|
||||
@@ -107,12 +108,10 @@ class ClientMixin:
|
||||
@property
|
||||
def llm(self) -> openllm.LLM[t.Any, t.Any]:
|
||||
if self.__llm__ is None:
|
||||
if self.framework == "flax":
|
||||
self.__llm__ = openllm.AutoFlaxLLM.for_model(self.model_name)
|
||||
elif self.framework == "tf":
|
||||
self.__llm__ = openllm.AutoTFLLM.for_model(self.model_name)
|
||||
else:
|
||||
self.__llm__ = openllm.AutoLLM.for_model(self.model_name)
|
||||
self.__llm__ = t.cast(
|
||||
"_BaseAutoLLMClass",
|
||||
openllm[self.framework], # type: ignore (internal API)
|
||||
).for_model(self.model_name)
|
||||
return self.__llm__
|
||||
|
||||
@property
|
||||
|
||||
@@ -33,10 +33,10 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def test_missing_default():
|
||||
with pytest.raises(ValueError, match="The following keys are required*"):
|
||||
with pytest.raises(ValueError, match="Either 'default_id' or 'model_ids'*"):
|
||||
make_llm_config("MissingDefaultId", {"name_type": "lowercase", "requirements": ["bentoml"]})
|
||||
|
||||
with pytest.raises(ValueError, match="The following keys are required*"):
|
||||
with pytest.raises(ValueError, match="Either 'default_id' or 'model_ids'*"):
|
||||
make_llm_config("MissingModelId", {"default_id": "huggingface/t5-tiny-testing", "requirements": ["bentoml"]})
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user