feat: quantization (#27)

This commit is contained in:
Aaron Pham
2023-06-16 18:10:50 -04:00
committed by GitHub
parent 19bc7e3116
commit ded8a9f809
33 changed files with 711 additions and 309 deletions

22
.github/SECURITY.md vendored Normal file
View File

@@ -0,0 +1,22 @@
# Security Policy
## Supported Versions
We are following [semantic versioning](https://semver.org/) with a strict
backward-compatibility policy. We ensure that all minor and major versions
are backward compatible. We are more lenient with patch releases, as
development can move quickly.
If you are just using the public API, then feel free to always upgrade. Whenever
there is a breaking change, it will first be announced as a `DeprecationWarning`
for a period of 12 months before the old behaviour is removed.
> **Warning:** Every package under `openllm` that has an underscore
> prefix is exempt from this. Those are considered private API and can change
> at any time. However, you can rely on all public API, classes and
> functions being backward-compatible.
## Reporting a Vulnerability
To report a security vulnerability, please send us an
[email](mailto:contact@bentoml.com).

View File

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
set -o errexit -o nounset -o pipefail
# Set by GH actions, see
@@ -41,4 +40,6 @@ All available models: \`\`\`python -m openllm.models\`\`\`
To start a LLM: \`\`\`python -m openllm start dolly-v2\`\`\`
Find more information about this release in the [CHANGELOG.md](https://github.com/bentoml/OpenLLM/blob/main/CHANGELOG.md)
EOF

View File

@@ -35,6 +35,7 @@ echo "Releasing version $RELEASE_VERSION..." && hatch version "${RELEASE_VERSION
jq --arg release_version "${RELEASE_VERSION}" '.version = $release_version' < package.json > package.json.tmp && mv package.json.tmp package.json
towncrier build --yes --version "${RELEASE_VERSION}" && git add CHANGELOG.md changelog.d
git add src/openllm/__about__.py package.json && git commit -sm "infra: prepare for release ${RELEASE_VERSION} [generated]"
git push origin main

View File

@@ -53,7 +53,7 @@ runs:
${{ steps.get-cache-key-prefix.outputs.prefix }}-pypi-
- name: Install dependencies
shell: bash
run: pip install -e ".[all]" hatch -vv
run: pip install -e ".[all]" hatch towncrier -vv
- name: Install pyright
shell: bash
run: npm install -g npm@^7 pyright

View File

@@ -22,8 +22,7 @@ on:
env:
LINES: 120
COLUMNS: 120
BENTOML_DO_NOT_TRACK: True
PYTEST_PLUGINS: bentoml.testing.pytest.plugin
OPENLLM_DO_NOT_TRACK: True
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun
defaults:
run:
@@ -38,8 +37,10 @@ jobs:
fetch-depth: 0
- name: Setup CI
uses: ./.github/actions/setup-repo
- name: Format check
run: hatch run dev:style
- name: Running changelog check
run: hatch run changelog
- name: Format and lint check
run: hatch run fmt
- name: Type check
if: ${{ github.event_name == 'pull_request' }}
run: git diff --name-only --diff-filter=AM "origin/$GITHUB_BASE_REF" -z -- '*.py{,i}' | xargs -0 --no-run-if-empty hatch run dev:typing

View File

@@ -13,20 +13,31 @@
# limitations under the License.
ci:
autoupdate_schedule: monthly
autoupdate_schedule: weekly
skip: [check-models-table-update, check-models-table-update]
exclude: '.*\.(css|js|svg)$'
repos:
- repo: local
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: 'v0.0.272'
hooks:
- id: format-check
name: format-check
language: system
entry: hatch run dev:style
always_run: true
pass_filenames: false
- id: ruff
args: [--fix, --exit-non-zero-on-fix, --show-fixes]
- repo: https://github.com/psf/black
rev: 23.3.0
hooks:
- id: black-jupyter
files: '/(src|tests|docs|examples|typings)/'
- repo: https://github.com/econchick/interrogate
rev: 1.5.0
hooks:
- id: interrogate
types: [python]
exclude: ^(docs|tools|tests)
args: [--config=pyproject.toml]
- repo: local
hooks:
- id: check-license-header
name: license-header-check
name: check for license headers
entry: ./tools/assert-license-headers
language: script
exclude_types:
@@ -36,13 +47,14 @@ repos:
exclude: |
(?x)^(
tools/.*|
changelog.d/.*|
typings/.*|
.github/.*
)$
- repo: local
hooks:
- id: check-models-table-update
name: check-models-table-update
name: check if table in README.md is up-to-date
entry: ./tools/assert-model-table-latest
language: script
files: README.md

19
CHANGELOG.md Normal file
View File

@@ -0,0 +1,19 @@
# Changelog
We are following [semantic versioning](https://semver.org/) with strict
backward-compatibility policy.
You can find our backwards-compatibility policy
[here](https://github.com/bentoml/openllm/blob/main/.github/SECURITY.md).
Changes for the upcoming release can be found in the
['changelog.d' directory](https://github.com/bentoml/openllm/tree/main/changelog.d)
in our repository.
<!--
Do *NOT* add changelog entries here!
This changelog is managed by towncrier and is compiled at release time.
-->
<!-- towncrier release notes start -->

View File

@@ -100,7 +100,7 @@ After setting up your environment, here's how you can start contributing:
3. Run all formatter and linter with `hatch`:
```bash
hatch run dev:fmt
hatch run fmt
```
4. Write tests that verify your feature or fix (see
[Writing Tests](#writing-tests) below).
@@ -127,8 +127,8 @@ After setting up your environment, here's how you can start contributing:
## Using a custom fork
If you wish to use a modified version of OpenLLM, install your fork from source
with `pip install -e` and set `OPENLLM_DEV_BUILD=True`, so that Bentos built will
include the generated wheels for OpenLLM in the bundle.
with `pip install -e` and set `OPENLLM_DEV_BUILD=True`, so that Bentos built
will include the generated wheels for OpenLLM in the bundle.
## Writing Tests
@@ -154,3 +154,61 @@ To release a new version, use `./tools/run-release-action`. It requires `gh`,
```
> Note that currently this workflow can only be run by the BentoML team.
## Changelog
_modeled after the [attrs](https://github.com/python-attrs/attrs) workflow_
If the change is noteworthy, there needs to be a changelog entry so users can
learn about it!
To avoid merge conflicts, we use the
[_Towncrier_](https://pypi.org/project/towncrier) package to manage our
changelog. _towncrier_ uses independent _Markdown_ files for each pull request —
so-called _news fragments_ — instead of one monolithic changelog file. On
release, those news fragments are compiled into
[`CHANGELOG.md`](https://github.com/bentoml/openllm/blob/main/CHANGELOG.md).
You don't need to install _Towncrier_ yourself; you just have to abide by a few
simple rules:
- For each pull request, add a new file into `changelog.d` with a filename
adhering to the `<pr#>.(change|deprecation|breaking|feature).md` schema: For
example, `changelog.d/42.change.md` for a non-breaking change that is proposed
in pull request #42.
- As with other docs, please use [semantic newlines] within news fragments.
- Wrap symbols like modules, functions, or classes into backticks so they are
rendered in a `monospace font`.
- Wrap arguments into asterisks like in docstrings:
`Added new argument *an_argument*.`
- If you mention functions or other callables, add parentheses at the end of
their names: `openllm.func()` or `openllm.LLMClass.method()`. This makes the
changelog a lot more readable.
- Prefer simple past tense or constructions with "now". For example:
- Added `LLM.func()`.
- `LLM.func()` now doesn't do X.Y.Z anymore when passed the _foobar_ argument.
- If you want to reference multiple issues, copy the news fragment to another
filename. _Towncrier_ will merge all news fragments with identical contents
into one entry with multiple links to the respective pull requests.
Example entries:
```md
Added `LLM.func()`.
The feature really _is_ awesome.
```
or:
```md
`openllm.utils.func()` now doesn't X.Y.Z anymore when passed the _foobar_ argument.
The bug really _was_ nasty.
```
---
`hatch run changelog` will render the current changelog to the terminal if you have
any doubts.
[semantic newlines]: https://rhodesmill.org/brandon/2012/one-sentence-per-line/

14
changelog.d/27.feature.md Normal file
View File

@@ -0,0 +1,14 @@
Added support for quantization during serving time.
`openllm start` now supports `--quantize 8bit` and `--quantize 4bit`
`GPTQ` quantization support is on the roadmap and currently
being worked on.
`openllm start` now also supports `--bettertransformer` to use
`BetterTransformer` for serving
Refactored `openllm.LLMConfig` to be able to use `__getitem__`
to access the config value: `openllm.DollyV2Config()['requirements']`
the order being: `__openllm_*__ > self.<key> > __openllm_generation_class__ > __openllm_extras__`
Added `towncrier` workflow to easily generate changelog entries
Added `use_pipeline`, `bettertransformer` flag into ModelSettings
`LLMConfig` now supports the `__dataclass_transform__` protocol to help
with type-checking
Changed `openllm download-models` to `openllm download`

View File

@@ -0,0 +1,29 @@
{%- if versiondata["version"] == "main" -%}
## Changes for the Upcoming Release
:::{warning}
These changes reflect the current [development progress](https://github.com/bentoml/openllm/tree/main) and have **not** been part of an official PyPI release yet.
To try out the latest change, one can do: `pip install -U git+https://github.com/bentoml/openllm.git@main`
:::
{% else -%}
## [{{ versiondata["version"] }}](https://github.com/bentoml/openllm/tree/{{ versiondata["version"] }})
{%- endif %}
{% for section, _ in sections.items() %}
{% if sections[section] %}
{% for category, val in definitions.items() if category in sections[section] %}
### {{ definitions[category]['name'] }}
{% for text, values in sections[section][category].items() %}
- {{ text }}
{{ values|join(',\n ') }}
{% endfor %}
{% endfor %}
{% else %}
No significant changes.
{% endif %}
{% endfor %}

View File

@@ -23,9 +23,3 @@ docker run \
..image_name
```

View File

@@ -24,9 +24,3 @@ docker run \
..image_name
```

View File

@@ -104,6 +104,7 @@ packages = ["src/openllm", "src/openllm_client"]
[tool.hatch.envs.default]
dependencies = [
"coverage[toml]>=6.5",
# NOTE: Tests strategies with Hypothesis and pytest, and snapshot testing with syrupy
"pytest",
"pytest-asyncio>=0.21.0",
"pytest-xdist[psutil]",
@@ -111,42 +112,69 @@ dependencies = [
"pytest-mock",
"pytest-randomly",
"pytest-rerunfailures",
"hypothesis",
"syrupy",
# NOTE: To run all hooks
"pre-commit",
# NOTE: Using under ./tools/update-optional-dependencies.py
"tomlkit",
# NOTE: Using under ./tools/update-readme.py
"markdown-it-py",
# NOTE: Tests strategies with Hypothesis
"hypothesis",
# NOTE: snapshot testing
"syrupy",
# NOTE: pyright for type
"pyright",
# NOTE: towncrier for changelog
"towncrier",
]
[tool.hatch.envs.default.scripts]
cov = ["test-cov", "cov-report"]
changelog = "towncrier build --version main --draft"
cov = ["cov-test", "cov-report"]
cov-report = ["- coverage combine", "coverage report"]
cov-test = "coverage run -m pytest {args:tests}"
fmt = "pre-commit run --all-files"
setup = "pre-commit install"
test = "pytest {args:tests}"
test-cov = "coverage run -m pytest {args:tests}"
typing = "pyright {args:src/openllm tests}"
[tool.towncrier]
directory = "changelog.d"
filename = "CHANGELOG.md"
issue_format = "[#{issue}](https://github.com/bentoml/openllm/issues/{issue})"
name = "openllm"
start_string = "<!-- towncrier release notes start -->\n"
template = "changelog.d/template.md.jinja"
title_format = ""
underlines = ["", "", ""]
[[tool.towncrier.section]]
path = ""
[[tool.towncrier.type]]
directory = "breaking"
name = "Backwards-incompatible Changes"
showcontent = true
[[tool.towncrier.type]]
directory = "deprecation"
name = "Deprecations"
showcontent = true
[[tool.towncrier.type]]
directory = "change"
name = "Changes"
showcontent = true
[[tool.towncrier.type]]
directory = "feature"
name = "Features"
showcontent = true
[[tool.hatch.envs.all.matrix]]
python = ["3.8", "3.9", "3.10", "3.11"]
[tool.hatch.envs.dev]
dependencies = [
"ruff",
"pyright",
"hatch",
# NOTE: black for generating service file.
"black[jupyter]==23.3.0",
]
detached = true
[tool.hatch.envs.dev.scripts]
all = ["fmt", "typing"]
fmt = ["black {args:.}", "black --pyi {args:typings/}", "ruff --fix {args:.}", "style"]
style = ["ruff {args:.}", "black --check --diff {args:.}"]
typing = "pyright {args:src/openllm tests}"
[tool.interrogate]
fail-under = 100
verbose = 2
whitelist-regex = ["test_.*"]
[tool.pytest.ini_options]
addopts = ["-rfEX", "-pno:warnings"]
@@ -206,12 +234,6 @@ force-single-line = true
known-first-party = ["openllm", "bentoml", 'transformers']
lines-after-imports = 2
[tool.ruff.flake8-quotes]
inline-quotes = "single"
[tool.ruff.flake8-tidy-imports]
ban-relative-imports = "all"
[tool.ruff.per-file-ignores]
# Tests can use magic values, assertions, and relative imports
"__init__.py" = ["E402", "F401", "F403", "F811"]
@@ -222,7 +244,7 @@ ban-relative-imports = "all"
[tool.pyright]
analysis.useLibraryCodeForTypes = true
enableTypeIgnoreComments = true
include = ["src/", "tests/"]
include = ["src/", "tests/", "tools/", "examples/"]
pythonVersion = "3.11"
reportMissingImports = "none"
reportMissingModuleSource = "warning"

View File

@@ -65,8 +65,11 @@ from deepmerge.merger import Merger
import openllm
from .exceptions import ForbiddenAttributeError, GpuNotAvailableError, OpenLLMException
from .utils import DEBUG, LazyType, bentoml_cattr, codegen, dantic, first_not_none, lenient_issubclass
from .exceptions import (ForbiddenAttributeError, GpuNotAvailableError,
OpenLLMException)
from .utils import (DEBUG, ENV_VARS_TRUE_VALUES, LazyType, bentoml_cattr,
codegen, dantic, first_not_none, lenient_issubclass,
non_intrusive_setattr)
if hasattr(t, "Required"):
from typing import Required
@@ -78,6 +81,11 @@ if hasattr(t, "NotRequired"):
else:
from typing_extensions import NotRequired
if hasattr(t, "dataclass_transform"):
from typing import dataclass_transform
else:
from typing_extensions import dataclass_transform
_T = t.TypeVar("_T")
@@ -85,7 +93,8 @@ if t.TYPE_CHECKING:
import tensorflow as tf
import torch
import transformers
from attr import _CountingAttr, _make_init, _make_repr, _transform_attrs # type: ignore
from attr import (_CountingAttr, _make_init, _make_repr, # type: ignore
_transform_attrs)
from transformers.generation.beam_constraints import Constraint
from ._types import ClickFunctionWrapper, F, O_co, P
@@ -103,7 +112,8 @@ else:
ItemgetterAny = itemgetter
# NOTE: Using internal API from attr here, since we are actually
# allowing subclass of openllm.LLMConfig to become 'attrs'-ish
from attr._make import _CountingAttr, _make_init, _make_repr, _transform_attrs
from attr._make import (_CountingAttr, _make_init, _make_repr,
_transform_attrs)
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
@@ -369,6 +379,11 @@ class GenerationConfig:
)
self.__attrs_init__(**attrs)
def __getitem__(self, item: str) -> t.Any:
if hasattr(self, item):
return getattr(self, item)
raise KeyError(f"GenerationConfig has no attribute {item}")
bentoml_cattr.register_unstructure_hook_factory(
lambda cls: attr.has(cls) and lenient_issubclass(cls, GenerationConfig),
@@ -411,6 +426,11 @@ class ModelSettings(t.TypedDict, total=False):
requires_gpu: bool
trust_remote_code: bool
requirements: t.Optional[ListStr]
# llm implementation specifics
use_pipeline: bool
bettertransformer: bool
model_type: t.Literal["causal_lm", "seq2seq_lm"]
runtime: t.Literal["transformers", "cpp"]
# naming convention, only name_type is needed to infer from the class
@@ -458,19 +478,19 @@ _ModelSettings: type[attr.AttrsInstance] = codegen.add_method_dunders(
def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
if not lenient_issubclass(cl_, LLMConfig):
raise RuntimeError(f"Given LLMConfig must be a subclass type of 'LLMConfig', got '{cl_}' instead.")
settings = cl_.__config__
if settings is None:
raise RuntimeError("Given LLMConfig must have '__config__' defined.")
if not hasattr(cl_, "__config__") or getattr(cl_, "__config__") is None:
raise RuntimeError("Given LLMConfig must have '__config__' that is not None defined.")
settings = cl_.__config__
assert settings
required = [i.name for i in attr.fields(cls) if i.metadata.get("required", False)]
if any(k not in settings for k in required):
raise ValueError(f"The following keys are required under '__config__': {required}")
if not settings["default_id"] or not settings["model_ids"]:
raise ValueError("Make sure that either 'default_id', 'model_ids' are not emptied under '__config__'.")
if any(k in settings for k in ("env", "start_name", "model_name")):
raise ValueError("The following keys are not allowed under '__config__': env, start_name, model_name")
missing = set(required) - set(settings.keys())
if len(missing) > 0:
raise ValueError(f"The following keys are required under '__config__': {required} (missing: {missing})")
if "generation_class" in settings:
raise ValueError(
@@ -478,10 +498,16 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
f"all required attributes under '{cl_}.GenerationConfig' when defining the class."
)
if not settings["default_id"] or not settings["model_ids"]:
raise ValueError("Either 'default_id' or 'model_ids' are emptied under '__config__' (required fields).")
# NOTE: value in __config__ can be None, hense we use setdefault
# to update in-place
_cl_name = cl_.__name__.replace("Config", "")
name_type = first_not_none(settings.get("name_type"), "dasherize")
model_name = inflection.underscore(_cl_name) if name_type == "dasherize" else _cl_name.lower()
start_name = inflection.dasherize(model_name) if name_type == "dasherize" else model_name
name_type = settings.setdefault("name_type", "dasherize")
model_name = settings.setdefault(
"model_name", inflection.underscore(_cl_name) if name_type == "dasherize" else _cl_name.lower()
)
partialed = functools.partial(_field_env_key, model_name=model_name, suffix="generation")
def auto_env_transformers(_: t.Any, fields: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]:
@@ -498,21 +524,9 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
for f in fields
]
return cls(
default_id=settings["default_id"],
model_ids=settings["model_ids"],
url=settings.get("url", ""),
requires_gpu=settings.get("requires_gpu", False),
trust_remote_code=settings.get("trust_remote_code", False),
requirements=settings.get("requirements", None),
name_type=name_type,
model_name=model_name,
start_name=start_name,
runtime=settings.get("runtime", "transformers"),
env=openllm.utils.ModelEnv(model_name),
timeout=settings.get("timeout", 3600),
workers_per_resource=settings.get("workers_per_resource", 1),
generation_class=attr.make_class(
settings.setdefault(
"generation_class",
attr.make_class(
f"{_cl_name}GenerationConfig",
[],
bases=(GenerationConfig,),
@@ -520,10 +534,40 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
weakref_slot=True,
frozen=False,
repr=True,
collect_by_mro=True,
field_transformer=auto_env_transformers,
),
)
env = settings.setdefault("env", openllm.utils.ModelEnv(model_name))
requires_gpu = settings.setdefault("requires_gpu", False)
# bettertransformer support
bettertransformer = settings.setdefault(
"bettertransformer",
os.environ.get(env.bettertransformer, str(False)).upper() in ENV_VARS_TRUE_VALUES,
)
if requires_gpu:
# For all models that requires GPU, no need to offload it to BetterTransformer
# use bitsandbytes or gptq instead for latency improvement
if bettertransformer:
logger.debug("Model requires GPU by default, disabling bettertransformer.")
bettertransformer = False
settings["bettertransformer"] = bettertransformer
# default value
settings.setdefault("url", "")
settings.setdefault("use_pipeline", False)
settings.setdefault("model_type", "causal_lm")
settings.setdefault("trust_remote_code", False)
settings.setdefault("requirements", None)
settings.setdefault("timeout", 3600)
settings.setdefault("workers_per_resource", 1)
settings.setdefault("runtime", "transformers")
settings.setdefault("start_name", inflection.dasherize(model_name) if name_type == "dasherize" else model_name)
return cls(**settings)
bentoml_cattr.register_structure_hook(_ModelSettings, structure_settings)
@@ -534,15 +578,16 @@ def _setattr_class(attr_name: str, value_var: t.Any, add_dunder: bool = False):
We can't use the cached object.__setattr__ since we are setting
attributes to a class.
"""
if add_dunder:
return f"setattr(cls, '{attr_name}', __add_dunder(cls, {value_var}))"
return f"setattr(cls, '{attr_name}', {value_var})"
val = f"__add_dunder(cls, {value_var})" if add_dunder else value_var
return f"setattr(cls, '{attr_name}', {val})"
_dunder_add = {"generation_class"}
def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance) -> t.Callable[..., None]:
def _make_assignment_script(
cls: type[LLMConfig], attributes: attr.AttrsInstance, _prefix: t.LiteralString = "openllm"
) -> t.Callable[..., None]:
"""Generate the assignment script with prefix attributes __openllm_<value>__"""
args: ListStr = []
globs: DictStrAny = {
@@ -555,7 +600,7 @@ def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance
lines: ListStr = ["_getattr = _cached_getattribute_get(_cached_attribute)"]
for attr_name, field in attr.fields_dict(attributes.__class__).items():
arg_name = field.metadata.get("target", f"__openllm_{inflection.underscore(attr_name)}__")
arg_name = field.metadata.get("target", f"__{_prefix}_{inflection.underscore(attr_name)}__")
args.append(f"{attr_name}=getattr(_cached_attribute, '{attr_name}')")
lines.append(_setattr_class(arg_name, attr_name, add_dunder=attr_name in _dunder_add))
annotations[attr_name] = field.type
@@ -568,6 +613,23 @@ def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance
_reserved_namespace = {"__config__", "GenerationConfig"}
@dataclass_transform(order_default=True, field_specifiers=(attr.field, dantic.Field))
def __llm_config_transform__(cls: type[LLMConfig]) -> type[LLMConfig]:
kwargs: dict[str, t.Any] = {}
if hasattr(cls, "GenerationConfig"):
kwargs = {k: v for k, v in vars(cls.GenerationConfig).items() if not k.startswith("_")}
non_intrusive_setattr(
cls,
"__dataclass_transform__",
{
"order_default": True,
"field_specifiers": (attr.field, dantic.Field),
"kwargs": kwargs,
},
)
return cls
@attr.define(slots=True)
class LLMConfig:
"""
@@ -640,11 +702,11 @@ class LLMConfig:
# NOTE: The following is handled via __init_subclass__, and is only used for TYPE_CHECKING
if t.TYPE_CHECKING:
# NOTE: public attributes to override
__config__: ModelSettings | None = None
__config__: ModelSettings | None = Field(None)
"""Internal configuration for this LLM model. Each of the field in here will be populated
and prefixed with __openllm_<value>__"""
GenerationConfig: type = type
GenerationConfig: type = Field(None)
"""Users can override this subclass of any given LLMConfig to provide GenerationConfig
default value. For example:
@@ -663,7 +725,7 @@ class LLMConfig:
def __attrs_init__(self, **attrs: t.Any):
"""Generated __attrs_init__ for LLMConfig subclass that follows the attrs contract."""
__attrs_attrs__: tuple[attr.Attribute[t.Any], ...] = tuple()
__attrs_attrs__: tuple[attr.Attribute[t.Any], ...] = Field(None, init=False)
"""Since we are writing our own __init_subclass__, which is an alternative way for __prepare__,
we want openllm.LLMConfig to be attrs-like dataclass that has pydantic-like interface.
__attrs_attrs__ will be handled dynamically by __init_subclass__.
@@ -683,33 +745,38 @@ class LLMConfig:
__openllm_url__: str = Field(None, init=False)
"""The resolved url for this LLMConfig."""
__openllm_requires_gpu__: bool = False
__openllm_requires_gpu__: bool = Field(None, init=False)
"""Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU."""
__openllm_trust_remote_code__: bool = False
__openllm_trust_remote_code__: bool = Field(False)
"""Whether to always trust remote code"""
__openllm_requirements__: ListStr | None = None
__openllm_requirements__: ListStr | None = Field(None)
"""The default PyPI requirements needed to run this given LLM. By default, we will depend on
bentoml, torch, transformers."""
__openllm_env__: openllm.utils.ModelEnv = Field(None, init=False)
"""A ModelEnv instance for this LLMConfig."""
__openllm_model_name__: str = ""
__openllm_model_name__: str = Field("")
"""The normalized version of __openllm_start_name__, determined by __openllm_name_type__"""
__openllm_start_name__: str = ""
__openllm_model_type__: t.Literal["causal_lm", "seq2seq_lm"] = Field("causal_lm")
"""The model type for this given LLM. By default, it should be causal language modeling.
Currently supported 'causal_lm' or 'seq2seq_lm'
"""
__openllm_start_name__: str = Field("")
"""Default name to be used with `openllm start`"""
__openllm_name_type__: t.Literal["dasherize", "lowercase"] = "dasherize"
__openllm_name_type__: t.Literal["dasherize", "lowercase"] = Field("dasherize")
"""the default name typed for this model. "dasherize" will convert the name to lowercase and
replace spaces with dashes. "lowercase" will convert the name to lowercase."""
__openllm_timeout__: int = 3600
__openllm_timeout__: int = Field(36000)
"""The default timeout to be set for this given LLM."""
__openllm_workers_per_resource__: int | float = 1
__openllm_workers_per_resource__: int | float = Field(1)
"""The number of workers per resource. This is used to determine the number of workers to use for this model.
For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource.
@@ -720,10 +787,23 @@ class LLMConfig:
By default, it is set to 1.
"""
__openllm_runtime__: t.Literal["transformers", "cpp"] = "transformers"
__openllm_runtime__: t.Literal["transformers", "cpp"] = Field("transformers")
"""The runtime to use for this model. Possible values are `transformers` or `cpp`. See
LlaMA for more information."""
__openllm_use_pipeline__: bool = Field(False)
"""Whether this LLM will use HuggingFace Pipeline API. By default, this is set to False.
The reason for this to be here is because we want to access this object before loading
the _bentomodel. This is because we will actually download the model weights when accessing
_bentomodel.
"""
__openllm_bettertransformer__: bool = Field(False)
"""Whether to use BetterTransformer for this given LLM. This depends per model
architecture. By default, we will use BetterTransformer for T5 and StableLM models,
and set to False for every other models.
"""
__openllm_default_id__: str = Field(None)
"""Return the default model to use when using 'openllm start <model_id>'.
This could be one of the keys in 'self.model_ids' or custom users model."""
@@ -804,6 +884,7 @@ class LLMConfig:
these["generation_config"] = cls.Field(
default=cls.__openllm_generation_class__(),
description=inspect.cleandoc(cls.__openllm_generation_class__.__doc__ or ""),
type=GenerationConfig,
)
# Generate the base __attrs_attrs__ transformation here.
@@ -884,6 +965,7 @@ class LLMConfig:
cls.__openllm_hints__ = {
f.name: f.type for ite in map(attr.fields, (cls, cls.__openllm_generation_class__)) for f in ite
}
cls = __llm_config_transform__(cls)
def __setattr__(self, attr: str, value: t.Any):
if attr in _reserved_namespace:
@@ -909,14 +991,7 @@ class LLMConfig:
if generation_config is None:
generation_config = {k: v for k, v in attrs.items() if k in _generation_cl_dict}
else:
generation_keys = {k for k in attrs if k in _generation_cl_dict}
if len(generation_keys) > 0:
logger.warning(
"Both 'generation_config' and keys for 'generation_config' are passed."
" The following keys in 'generation_config' will be overriden be keywords argument: %s",
", ".join(generation_keys),
)
config_merger.merge(generation_config, {k: v for k, v in attrs.items() if k in generation_keys})
config_merger.merge(generation_config, {k: v for k, v in attrs.items() if k in _generation_cl_dict})
for k in _cached_keys:
if k in generation_config or attrs.get(k) is None:
@@ -942,7 +1017,32 @@ class LLMConfig:
)
# The rest of attrs should only be the attributes to be passed to __attrs_init__
self.__attrs_init__(generation_config=self.__openllm_generation_class__(**generation_config), **attrs)
self.__attrs_init__(generation_config=self["generation_class"](**generation_config), **attrs)
def __getitem__(self, item: str | t.Any) -> t.Any:
"""Allowing access LLMConfig as a dictionary. The order will always evaluate as
__openllm_*__ > self.key > __openllm_generation_class__ > __openllm_extras__
This method is purely for convenience, and should not be used for performance critical code.
"""
if not isinstance(item, str):
raise TypeError(f"LLM only supports string indexing, not {item.__class__.__name__}")
if item in _reserved_namespace:
raise ForbiddenAttributeError(
f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified."
)
internal_attributes = f"__openllm_{item}__"
if hasattr(self, internal_attributes):
return getattr(self, internal_attributes)
elif hasattr(self, item):
return getattr(self, item)
elif hasattr(self.__openllm_generation_class__, item):
return getattr(self.__openllm_generation_class__, item)
elif item in self.__openllm_extras__:
return self.__openllm_extras__[item]
else:
raise KeyError(item)
def __getattribute__(self, item: str) -> t.Any:
if item in _reserved_namespace:
@@ -976,10 +1076,8 @@ class LLMConfig:
def model_dump(self, flatten: bool = False, **_: t.Any):
dumped = bentoml_cattr.unstructure(self)
generation_config = bentoml_cattr.unstructure(self.generation_config)
if not flatten:
dumped["generation_config"] = generation_config
else:
if flatten:
generation_config = dumped.pop("generation_config")
dumped.update(generation_config)
return dumped
@@ -1028,11 +1126,11 @@ class LLMConfig:
key_to_remove: ListStr = []
for k, v in attrs.items():
if k.startswith(f"{self.__openllm_model_name__}_generation_"):
llm_config_attrs["generation_config"][k[len(self.__openllm_model_name__ + "_generation_") :]] = v
if k.startswith(f"{self['model_name']}_generation_"):
llm_config_attrs["generation_config"][k[len(self["model_name"] + "_generation_") :]] = v
key_to_remove.append(k)
elif k.startswith(f"{self.__openllm_model_name__}_"):
llm_config_attrs[k[len(self.__openllm_model_name__ + "_") :]] = v
elif k.startswith(f"{self['model_name']}_"):
llm_config_attrs[k[len(self["model_name"] + "_") :]] = v
key_to_remove.append(k)
return self.model_construct_env(**llm_config_attrs), {k: v for k, v in attrs.items() if k not in key_to_remove}

View File

@@ -35,7 +35,8 @@ from bentoml._internal.types import ModelSignatureDict
import openllm
from .exceptions import ForbiddenAttributeError, OpenLLMException
from .utils import ENV_VARS_TRUE_VALUES, LazyLoader, bentoml_cattr
from .utils import (LazyLoader, bentoml_cattr, is_bitsandbytes_available,
non_intrusive_setattr)
if t.TYPE_CHECKING:
import torch
@@ -60,7 +61,6 @@ else:
logger = logging.getLogger(__name__)
# NOTE: `1-2` -> text-generation and text2text-generation
FRAMEWORK_TO_AUTOCLASS_MAPPING = {
"pt": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM"),
"tf": ("TFAutoModelForCausalLM", "TFAutoModelForSeq2SeqLM"),
@@ -132,6 +132,7 @@ def import_model(
),
)
# NOTE: `1-2` -> text-generation and text2text-generation
if type(config) in transformers.MODEL_FOR_CAUSAL_LM_MAPPING:
idx = 0
elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING:
@@ -243,25 +244,6 @@ class LLMInterface(ABC):
raise NotImplementedError
def _default_post_init(self: LLM[t.Any, t.Any]):
# load_in_mha: Whether to apply BetterTransformer (or Torch MultiHeadAttention) during inference load.
# See https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/
# for more information.
# NOTE: set a default variable to transform to BetterTransformer by default for inference
if self.config.__openllm_runtime__ == "cpp":
self.load_in_mha = False
else:
self.load_in_mha = (
os.environ.get(self.config_class.__openllm_env__.bettertransformer, str(False)).upper()
in ENV_VARS_TRUE_VALUES
)
if self.config_class.__openllm_requires_gpu__:
# For all models that requires GPU, no need to offload it to BetterTransformer
# use bitsandbytes instead
self.load_in_mha = False
_M = t.TypeVar("_M")
_T = t.TypeVar("_T")
@@ -285,6 +267,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
_model_attrs: dict[str, t.Any]
_tokenizer_attrs: dict[str, t.Any]
bettertransformer: bool
def __init_subclass__(cls):
cd = cls.__dict__
prefix_class_name_config = cls.__name__
@@ -310,20 +294,6 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
"Missing required key 'config_class'. Make sure to define it within the LLM subclass."
)
if cls.llm_post_init is not LLMInterface.llm_post_init:
original_llm_post_init = cd["llm_post_init"]
def wrapped_llm_post_init(self: t.Self) -> None:
"""We need to both initialize private attributes and call the user-defined model_post_init
method.
"""
_default_post_init(self)
original_llm_post_init(self)
cls.llm_post_init = wrapped_llm_post_init
else:
setattr(cls, "llm_post_init", _default_post_init)
if cls.import_model is LLMInterface.import_model:
# using the default import model
setattr(cls, "import_model", functools.partial(import_model, _model_framework=implementation))
@@ -353,6 +323,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
model_id: str | None = None,
llm_config: openllm.LLMConfig | None = None,
*args: t.Any,
quantize: t.Literal["8bit", "4bit", "gptq"] | None = None,
bettertransformer: bool | None = None,
**attrs: t.Any,
):
"""Initialize the LLM with given pretrained model.
@@ -429,6 +401,9 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used.
llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
will use `config_class` to construct default configuration.
quantize: The quantization to use for this LLM. Defaults to None. Possible values
include 8bit, 4bit and gptq.
bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
*args: The args to be passed to the model.
**attrs: The kwargs to be passed to the model.
@@ -438,16 +413,102 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
However, if `model_id` is a path, this argument is recomended to include.
"""
load_in_mha = attrs.pop("load_in_mha", False)
openllm_model_version = attrs.pop("openllm_model_version", None)
# low_cpu_mem_usage is only available for model
# this is helpful on system with low memory to avoid OOM
low_cpu_mem_usage = attrs.pop("low_cpu_mem_usage", True)
# quantization setup
quantization_config = attrs.pop("quantization_config", None)
# 8 bit configuration
int8_threshold = attrs.pop("llm_int8_threshhold", 6.0)
cpu_offloading = attrs.pop("llm_int8_enable_fp32_cpu_offload", False)
int8_skip_modules: list[str] | None = attrs.pop("llm_int8_skip_modules", None)
int8_has_fp16_weight = attrs.pop("llm_int8_has_fp16_weight", False)
# 4 bit configuration
int4_compute_dtype = attrs.pop("llm_bnb_4bit_compute_dtype", torch.bfloat16)
int4_quant_type = attrs.pop("llm_bnb_4bit_quant_type", "nf4")
int4_use_double_quant = attrs.pop("llm_bnb_4bit_use_double_quant", True)
if quantization_config and quantize:
raise ValueError(
"""'quantization_config' and 'quantize' are mutually exclusive. Either customise
your quantization_config or use the quantize argument."""
)
if quantization_config is None:
# quantize is a openllm.LLM feature, where we can quantize the model
# with bitsandbytes or quantization aware training.
if quantize is not None:
if not is_bitsandbytes_available():
raise RuntimeError(
"Quantization requires bitsandbytes to be installed. Make "
"sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'"
)
logger.debug(
"'quantize' is not None. %s will use a default 'quantization_config' for %s. "
"If you want to customise the quantization config, make sure to pass your "
"own 'quantization_config'",
self,
quantize,
)
if quantize == "8bit":
if int8_skip_modules is None:
int8_skip_modules = []
if "lm_head" not in int8_skip_modules and self.config["model_type"] == "causal_lm":
logger.debug("Skipping 'lm_head' for quantization for %s", self)
int8_skip_modules.append("lm_head")
quantization_config = transformers.BitsAndBytesConfig(
load_in_8bit=True,
llm_int8_enable_fp32_cpu_offload=cpu_offloading,
llm_int8_threshhold=int8_threshold,
llm_int8_skip_modules=int8_skip_modules,
llm_int8_has_fp16_weight=int8_has_fp16_weight,
)
elif quantize == "4bit":
trf_versions = openllm.utils.pkg.pkg_version_info("transformers")
supports_kbits = trf_versions[:2] >= (4, 30)
if supports_kbits:
quantization_config = transformers.BitsAndBytesConfig(
load_in_4bit=True,
llm_bnb_4bit_compute_dtype=int4_compute_dtype,
llm_bnb_4bit_quant_type=int4_quant_type,
llm_bnb_4bit_use_double_quant=int4_use_double_quant,
)
else:
logger.warning(
"'quantize' is set to 4bit, while the current transformers version %s does not support "
"k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore "
"make sure to install the latest version of transformers either via PyPI or "
"from git source: 'pip install git+https://github.com/huggingface/transformers'.",
trf_versions,
)
elif quantize == "gptq":
# TODO: support GPTQ loading quantization
if model_id is None:
raise RuntimeError(
"'quantize=%s' requires passing custom path to quantized weights as we are unable to load "
"the model on the fly. See https://github.com/qwopqwop200/GPTQ-for-LLaMa for "
"instruction on how to quantize '%s' with GPTQ.",
quantize,
self,
)
raise NotImplementedError("GPTQ is not supported yet.")
else:
raise ValueError(f"'quantize' must be one of ['8bit', '4bit', 'gptq'], got {quantize} instead.")
attrs.update({"quantization_config": quantization_config})
if llm_config is not None:
logger.debug("Using given 'llm_config=(%s)' to initialize LLM", llm_config)
self.config = llm_config
else:
self.config = self.config_class.model_construct_env(**attrs)
# The rests of the kwargs that is not used by the config class should be stored into __openllm_extras__.
attrs = self.config.__openllm_extras__
attrs = self.config["extras"]
if not self.config["use_pipeline"]:
attrs["low_cpu_mem_usage"] = low_cpu_mem_usage
model_kwds, tokenizer_kwds = {}, {}
if self.__llm_init_kwargs__:
@@ -463,10 +524,11 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
)
if model_id is None:
model_id = os.environ.get(self.config.__openllm_env__.model_id, self.config.__openllm_default_id__)
model_id = os.environ.get(self.config["env"].model_id, self.config["default_id"])
# NOTE: This is the actual given path or pretrained weight for this LLM.
assert model_id is not None
if t.TYPE_CHECKING:
assert model_id is not None
self._model_id = model_id
# parsing tokenizer and model kwargs
@@ -476,23 +538,24 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
model_kwds.update({k: v for k, v in attrs.items() if not k.startswith(TOKENIZER_PREFIX)})
# handle trust_remote_code
self.__llm_trust_remote_code__ = model_kwds.pop("trust_remote_code", self.config.__openllm_trust_remote_code__)
self.__llm_trust_remote_code__ = model_kwds.pop("trust_remote_code", self.config["trust_remote_code"])
# NOTE: Save the args and kwargs for latter load
self._model_args = args
self._model_attrs = model_kwds
self._tokenizer_attrs = tokenizer_kwds
# we allow users to overwrite the load_in_mha defined by the LLM subclass.
if load_in_mha:
logger.debug("Overwriting 'load_in_mha=%s' (base load_in_mha=%s)", load_in_mha, self.load_in_mha)
self.load_in_mha = load_in_mha
self._openllm_model_version = openllm_model_version
if self.__llm_post_init__:
self.llm_post_init()
# we set it here so that we allow subclass to overwrite bettertransformer in llm_post_init
if bettertransformer:
logger.debug("Using %r with BetterTransformer", self)
self.bettertransformer = bettertransformer
else:
non_intrusive_setattr(self, "bettertransformer", self.config["bettertransformer"])
def __setattr__(self, attr: str, value: t.Any):
if attr in _reserved_namespace:
raise ForbiddenAttributeError(
@@ -513,7 +576,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
@property
def runner_name(self) -> str:
return f"llm-{self.config.__openllm_start_name__}-runner"
return f"llm-{self.config['start_name']}-runner"
# NOTE: The section below defines a loose contract with langchain's LLM interface.
@property
@@ -524,7 +587,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
def identifying_params(self) -> dict[str, t.Any]:
return {
"configuration": self.config.model_dump_json().decode(),
"model_ids": orjson.dumps(self.config.__openllm_model_ids__).decode(),
"model_ids": orjson.dumps(self.config["model_ids"]).decode(),
}
@staticmethod
@@ -580,8 +643,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
sys.executable,
"-m",
"openllm",
"download-models",
self.config.__openllm_start_name__,
"download",
self.config["start_name"],
"--model-id",
self.model_id,
"--output",
@@ -625,7 +688,11 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
kwds = self._model_attrs
kwds["trust_remote_code"] = self.__llm_trust_remote_code__
if self.load_in_mha and "_pretrained_class" not in self._bentomodel.info.metadata:
is_pipeline = "_pretrained_class" in self._bentomodel.info.metadata
# differentiate when saving tokenizer or other pretrained type.
is_pretrained_model = is_pipeline and "_framework" in self._bentomodel.info.metadata
if self.bettertransformer and is_pipeline and self.config["use_pipeline"]:
# This is a pipeline, provide a accelerator args
kwds["accelerator"] = "bettertransformer"
@@ -636,10 +703,10 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
self.__llm_model__ = self._bentomodel.load_model(*self._model_args, **kwds)
if (
self.load_in_mha
and all(i in self._bentomodel.info.metadata for i in ("_framework", "_pretrained_class"))
self.bettertransformer
and is_pretrained_model
and self._bentomodel.info.metadata["_framework"] == "torch"
and self.config.__openllm_runtime__ == "transformers"
and self.config["runtime"] == "transformers"
):
# BetterTransformer is currently only supported on PyTorch.
from optimum.bettertransformer import BetterTransformer
@@ -767,7 +834,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
# NOTE: returning the two langchain API's to the runner
return types.new_class(
inflection.camelize(self.config.__openllm_model_name__) + "Runner",
inflection.camelize(self.config["model_name"]) + "Runner",
(bentoml.Runner,),
exec_body=lambda ns: ns.update(
{
@@ -776,17 +843,17 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
"llm": self, # NOTE: self reference to LLM
"config": self.config,
"__call__": _wrapped_generate_run,
"__module__": f"openllm.models.{self.config.__openllm_model_name__}",
"__doc__": self.config.__openllm_env__.start_docstring,
"__module__": f"openllm.models.{self.config['model_name']}",
"__doc__": self.config["env"].start_docstring,
}
),
)(
types.new_class(
inflection.camelize(self.config.__openllm_model_name__) + "Runnable",
inflection.camelize(self.config["model_name"]) + "Runnable",
(_Runnable,),
{
"SUPPORTED_RESOURCES": ("nvidia.com/gpu", "cpu")
if self.config.__openllm_requires_gpu__
if self.config["requires_gpu"]
else ("nvidia.com/gpu",),
"llm_type": self.llm_type,
"identifying_params": self.identifying_params,

View File

@@ -76,17 +76,16 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
# first, then proceed to install everything inside the wheels/ folder.
packages: list[str] = ["openllm"]
if llm.config.__openllm_requirements__ is not None:
packages.extend(llm.config.__openllm_requirements__)
if llm.config["requirements"] is not None:
packages.extend(llm.config["requirements"])
if not (str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false"):
packages.append(f"bentoml>={'.'.join([str(i) for i in pkg.pkg_version_info('bentoml')])}")
to_use_framework = llm.config.__openllm_env__.get_framework_env()
env = llm.config["env"]
to_use_framework = env.get_framework_env()
if to_use_framework == "flax":
assert (
utils.is_flax_available()
), f"Flax is not available, while {llm.config.__openllm_env__.framework} is set to 'flax'"
assert utils.is_flax_available(), f"Flax is not available, while {env.framework} is set to 'flax'"
packages.extend(
[
f"flax>={importlib.metadata.version('flax')}",
@@ -95,9 +94,7 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
]
)
elif to_use_framework == "tf":
assert (
utils.is_tf_available()
), f"TensorFlow is not available, while {llm.config.__openllm_env__.framework} is set to 'tf'"
assert utils.is_tf_available(), f"TensorFlow is not available, while {env.framework} is set to 'tf'"
candidates = (
"tensorflow",
"tensorflow-cpu",
@@ -133,16 +130,17 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float) -> DockerOptions:
_bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
_bentoml_config_options_opts = [
"api_server.traffic.timeout=3600", # NOTE: Currently we hardcode this value
f'runners."llm-{llm.config.__openllm_start_name__}-runner".traffic.timeout={llm.config.__openllm_timeout__}',
f'runners."llm-{llm.config.__openllm_start_name__}-runner".workers_per_resource={workers_per_resource}',
"api_server.traffic.timeout=36000", # NOTE: Currently we hardcode this value
f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}',
f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
]
_bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
env = llm.config["env"]
return DockerOptions(
cuda_version="11.6", # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version
env={
llm.config.__openllm_env__.framework: llm.config.__openllm_env__.get_framework_env(),
"OPENLLM_MODEL": llm.config.__openllm_model_name__,
env.framework: env.get_framework_env(),
"OPENLLM_MODEL": llm.config["model_name"],
"OPENLLM_MODEL_ID": llm.model_id,
"BENTOML_DEBUG": str(get_debug_mode()),
"BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
@@ -180,7 +178,7 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
try:
os.environ["OPENLLM_MODEL"] = inflection.underscore(model_name)
to_use_framework = llm_config.__openllm_env__.get_framework_env()
to_use_framework = llm_config["env"].get_framework_env()
if to_use_framework == "flax":
llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs)
elif to_use_framework == "tf":
@@ -192,12 +190,10 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
labels = dict(llm.identifying_params)
labels.update({"_type": llm.llm_type, "_framework": to_use_framework})
service_name = f"generated_{llm.config.__openllm_model_name__}_service.py"
workers_per_resource = utils.first_not_none(
workers_per_resource, default=llm.config.__openllm_workers_per_resource__
)
service_name = f"generated_{llm_config['model_name']}_service.py"
workers_per_resource = utils.first_not_none(workers_per_resource, default=llm_config["workers_per_resource"])
with fs.open_fs(f"temp://llm_{llm.config.__openllm_model_name__}") as llm_fs:
with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
# add service.py definition to this temporary folder
utils.codegen.write_service(model_name, llm.model_id, service_name, llm_fs)
@@ -209,12 +205,12 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
raise bentoml.exceptions.NotFound("Overwriting previously saved Bento.")
_previously_built = True
except bentoml.exceptions.NotFound:
logger.info("Building Bento for LLM '%s'", llm.config.__openllm_start_name__)
logger.info("Building Bento for LLM '%s'", llm_config["start_name"])
bento = bentoml.bentos.build(
f"{service_name}:svc",
name=bento_tag.name,
labels=labels,
description=f"OpenLLM service for {llm.config.__openllm_start_name__}",
description=f"OpenLLM service for {llm_config['start_name']}",
include=[
f for f in llm_fs.walk.files(filter=["*.py"])
], # NOTE: By default, we are using _service.py as the default service, for now.

View File

@@ -55,7 +55,7 @@ class GenerationInput:
def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
llm_config = openllm.AutoConfig.for_model(model_name, **attrs)
return attr.make_class(
inflection.camelize(llm_config.__openllm_model_name__) + "GenerationInput",
inflection.camelize(llm_config["model_name"]) + "GenerationInput",
attrs={
"prompt": attr.field(type=str),
"llm_config": attr.field(

View File

@@ -36,7 +36,7 @@ model_id = os.environ.get("OPENLLM_MODEL_ID", "{__model_id__}") # openllm: mode
llm_config = openllm.AutoConfig.for_model(model)
runner = openllm.Runner(model, model_id=model_id, llm_config=llm_config)
svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", runners=[runner])
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner])
@svc.api(
@@ -55,8 +55,8 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
def metadata_v1(_: str) -> openllm.MetadataOutput:
return openllm.MetadataOutput(
model_id=model_id,
timeout=llm_config.__openllm_timeout__,
model_name=llm_config.__openllm_model_name__,
framework=llm_config.__openllm_env__.get_framework_env(),
timeout=llm_config["timeout"],
model_name=llm_config["model_name"],
framework=llm_config["env"].get_framework_env(),
configuration=llm_config.model_dump_json().decode(),
)

View File

@@ -83,6 +83,29 @@ def _echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.A
call(text, **attrs)
def quantize_option(factory: t.Any):
help_str = """Running this model in quantized mode.
Note that GPTQ is currently working in progress and will be available soon.
NOTE: Quantization is only available for PyTorch models.
"""
return factory.option(
"--quantize",
type=click.Choice(["8bit", "4bit", "gptq"]),
default=None,
help=help_str,
)
def bettertransformer_option(factory: t.Any):
return factory.option(
"--bettertransformer",
is_flag=True,
default=None,
help="Use BetterTransformer wrapper to serve model",
)
def start_model_command(
model_name: str,
group: click.Group,
@@ -108,29 +131,30 @@ def start_model_command(
openllm.utils.configure_logging()
llm_config = openllm.AutoConfig.for_model(model_name)
env = llm_config["env"]
docstring = f"""\
{llm_config.__openllm_env__.start_docstring}
{env.start_docstring}
\b
Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.__openllm_default_id__}]
Available model_id(s): {llm_config['model_ids']} [default: {llm_config['default_id']}]
"""
command_attrs: dict[str, t.Any] = {
"name": llm_config.__openllm_model_name__,
"name": llm_config["model_name"],
"context_settings": _context_settings or {},
"short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)",
"help": docstring,
}
aliases: list[str] = []
if llm_config.__openllm_name_type__ == "dasherize":
aliases.append(llm_config.__openllm_start_name__)
if llm_config["name_type"] == "dasherize":
aliases.append(llm_config["start_name"])
command_attrs["aliases"] = aliases if len(aliases) > 0 else None
serve_decorator = _http_server_args if not _serve_grpc else _grpc_server_args
available_gpu = openllm.utils.gpu_count()
if llm_config.__openllm_requires_gpu__ and len(available_gpu) < 1:
if llm_config["requires_gpu"] and len(available_gpu) < 1:
# NOTE: The model requires GPU, therefore we will return a dummy command
command_attrs.update(
{
@@ -152,8 +176,13 @@ Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.
@llm_config.to_click_options
@serve_decorator
@cog.optgroup.group("General LLM Options")
@cog.optgroup.option("--server-timeout", type=int, default=None, help="Server timeout in seconds")
@model_id_option(cog.optgroup, model_env=llm_config.__openllm_env__)
@cog.optgroup.option(
"--server-timeout",
type=int,
default=None,
help="Server timeout in seconds",
)
@model_id_option(cog.optgroup, model_env=env)
@cog.optgroup.option(
"--device",
type=tuple,
@@ -165,34 +194,47 @@ Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.
show_envvar=True,
)
@workers_per_resource_option(cog.optgroup)
@click.pass_context
@quantize_option(cog.optgroup)
@bettertransformer_option(cog.optgroup)
def model_start(
ctx: click.Context,
server_timeout: int | None,
model_id: str | None,
workers_per_resource: float | None,
device: tuple[str, ...] | None,
quantize: t.Literal["8bit", "4bit", "gptq"] | None,
bettertransformer: bool | None,
**attrs: t.Any,
) -> openllm.LLMConfig:
config, server_attrs = llm_config.model_validate_click(**attrs)
if llm_config.__openllm_env__.get_framework_env() == "flax":
if quantize and env.get_framework_env() != "pt":
_echo("Quantization is only available for PyTorch models.", fg="yellow")
if env.get_framework_env() == "flax":
llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config, ensure_available=True)
elif llm_config.__openllm_env__.get_framework_env() == "tf":
elif env.get_framework_env() == "tf":
llm = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config, ensure_available=True)
else:
llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config, ensure_available=True)
llm = openllm.AutoLLM.for_model(
model_name,
model_id=model_id,
llm_config=config,
quantize=quantize,
bettertransformer=bettertransformer,
ensure_available=True,
)
if llm.config.__openllm_requirements__ is not None and len(llm.config.__openllm_requirements__) > 0:
requirements = config["requirements"]
if requirements is not None and len(requirements) > 0:
_echo(
f"Make sure to have the following dependencies available: {llm.config.__openllm_requirements__}",
f"Make sure to have the following dependencies available: {requirements}",
fg="yellow",
)
workers_per_resource = openllm.utils.first_not_none(
workers_per_resource, default=llm.config.__openllm_workers_per_resource__
workers_per_resource, default=config["workers_per_resource"]
)
server_timeout = openllm.utils.first_not_none(server_timeout, default=llm.config.__openllm_timeout__)
server_timeout = openllm.utils.first_not_none(server_timeout, default=config["timeout"])
num_workers = int(1 / workers_per_resource)
if num_workers > 1:
@@ -216,26 +258,26 @@ Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.
_bentoml_config_options_opts = [
"tracing.sample_rate=1.0",
f"api_server.traffic.timeout={server_timeout}",
f'runners."llm-{llm.config.__openllm_start_name__}-runner".traffic.timeout={llm.config.__openllm_timeout__}',
f'runners."llm-{llm.config.__openllm_start_name__}-runner".workers_per_resource={workers_per_resource}',
f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
]
if device:
if len(device) > 1:
for idx, dev in enumerate(device):
_bentoml_config_options_opts.append(
f'runners."llm-{llm.config.__openllm_start_name__}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
)
else:
_bentoml_config_options_opts.append(
f'runners."llm-{llm.config.__openllm_start_name__}-runner".resources."nvidia.com/gpu"=[{device[0]}]'
f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]'
)
_bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
start_env.update(
{
llm.config.__openllm_env__.framework: llm.config.__openllm_env__.get_framework_env(),
llm.config.__openllm_env__.model_config: llm.config.model_dump_json().decode(),
env.framework: env.get_framework_env(),
env.model_config: llm.config.model_dump_json().decode(),
"OPENLLM_MODEL": model_name,
"OPENLLM_MODEL_ID": llm.model_id,
"BENTOML_DEBUG": str(openllm.utils.get_debug_mode()),
@@ -280,7 +322,8 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
"""
# The following logics is similar to one of BentoMLCommandGroup
from bentoml._internal.configuration import DEBUG_ENV_VAR, QUIET_ENV_VAR
from bentoml._internal.configuration import (DEBUG_ENV_VAR,
QUIET_ENV_VAR)
@click.option("-q", "--quiet", envvar=QUIET_ENV_VAR, is_flag=True, default=False, help="Suppress all output.")
@click.option(
@@ -668,11 +711,15 @@ def start_grpc_cli():
@output_option
@click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.")
@workers_per_resource_option(click, build=True)
@quantize_option(click)
@bettertransformer_option(click)
def build(
model_name: str,
model_id: str | None,
overwrite: bool,
output: OutputLiteral,
quantize: t.Literal["8bit", "4bit", "gptq"] | None,
bettertransformer: bool | None,
workers_per_resource: float | None,
):
"""Package a given models into a Bento.
@@ -695,6 +742,8 @@ def build(
model_name,
__cli__=True,
model_id=model_id,
quantize=quantize,
bettertransformer=bettertransformer,
_workers_per_resource=workers_per_resource,
_overwrite_existing_bento=overwrite,
)
@@ -764,20 +813,20 @@ def models(output: OutputLiteral, show_available: bool):
for m in models:
config = openllm.AutoConfig.for_model(m)
runtime_impl: tuple[t.Literal["pt", "flax", "tf"], ...] = tuple()
if config.__openllm_model_name__ in openllm.MODEL_MAPPING_NAMES:
if config["model_name"] in openllm.MODEL_MAPPING_NAMES:
runtime_impl += ("pt",)
if config.__openllm_model_name__ in openllm.MODEL_FLAX_MAPPING_NAMES:
if config["model_name"] in openllm.MODEL_FLAX_MAPPING_NAMES:
runtime_impl += ("flax",)
if config.__openllm_model_name__ in openllm.MODEL_TF_MAPPING_NAMES:
if config["model_name"] in openllm.MODEL_TF_MAPPING_NAMES:
runtime_impl += ("tf",)
json_data[m] = {
"model_id": config.__openllm_model_ids__,
"url": config.__openllm_url__,
"requires_gpu": config.__openllm_requires_gpu__,
"model_id": config["model_ids"],
"url": config["url"],
"requires_gpu": config["requires_gpu"],
"runtime_impl": runtime_impl,
"installation": "pip install openllm" if m not in extras else f'pip install "openllm[{m}]"',
}
converted.extend([convert_transformers_model_name(i) for i in config.__openllm_model_ids__])
converted.extend([convert_transformers_model_name(i) for i in config["model_ids"]])
if openllm.utils.DEBUG:
try:
openllm.AutoLLM.for_model(m, llm_config=config)
@@ -950,7 +999,7 @@ def query_(
_echo(res["responses"], fg="white")
@cli.command()
@cli.command(name="download")
@click.argument(
"model_name",
type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]),
@@ -967,10 +1016,10 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral
openllm.utils.configure_logging()
config = openllm.AutoConfig.for_model(model_name)
env = config.__openllm_env__.get_framework_env()
if env == "flax":
envvar = config["env"].get_framework_env()
if envvar == "flax":
model = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config)
elif env == "tf":
elif envvar == "tf":
model = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config)
else:
model = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config)
@@ -978,11 +1027,11 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral
try:
_ref = bentoml.transformers.get(model.tag)
if output == "pretty":
_echo(f"{model_name} is already setup for framework '{env}': {str(_ref.tag)}", nl=True, fg="yellow")
_echo(f"{model_name} is already setup for framework '{envvar}': {str(_ref.tag)}", nl=True, fg="yellow")
elif output == "json":
_echo(
orjson.dumps(
{"previously_setup": True, "framework": env, "model": str(_ref.tag)}, option=orjson.OPT_INDENT_2
{"previously_setup": True, "framework": envvar, "model": str(_ref.tag)}, option=orjson.OPT_INDENT_2
).decode(),
fg="white",
)
@@ -1016,7 +1065,7 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral
elif output == "json":
_echo(
orjson.dumps(
{"previously_setup": False, "framework": env, "tag": str(_ref.tag)},
{"previously_setup": False, "framework": envvar, "tag": str(_ref.tag)},
option=orjson.OPT_INDENT_2,
).decode()
)

View File

@@ -46,7 +46,7 @@ class ChatGLMConfig(openllm.LLMConfig):
retain_history: bool = openllm.LLMConfig.Field(
False,
description="""Whether to retain history given to the model.
description="""Whether to retain history given to the model.
If set to True, then the model will retain given history.""",
)

View File

@@ -38,6 +38,7 @@ class DollyV2Config(openllm.LLMConfig):
"timeout": 3600000,
"trust_remote_code": True,
"url": "https://github.com/databrickslabs/dolly",
"use_pipeline": True,
"default_id": "databricks/dolly-v2-3b",
"model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"],
}

View File

@@ -29,6 +29,7 @@ class FalconConfig(openllm.LLMConfig):
"trust_remote_code": True,
"requires_gpu": True,
"timeout": int(36e6),
"use_pipeline": True,
"url": "https://falconllm.tii.ae/",
"requirements": ["einops", "xformers", "safetensors"],
"default_id": "tiiuae/falcon-7b",

View File

@@ -61,6 +61,7 @@ class FlanT5Config(openllm.LLMConfig):
"google/flan-t5-xl",
"google/flan-t5-xxl",
],
"model_type": "seq2seq_lm",
}
class GenerationConfig:

View File

@@ -47,13 +47,12 @@ class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTN
def llm_post_init(self):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.load_in_mha = True if not torch.cuda.is_available() else False
self.bettertransformer = True if not torch.cuda.is_available() else False
@property
def import_kwargs(self):
model_kwds = {
"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
"load_in_8bit": False,
"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
}
tokenizer_kwds: dict[str, t.Any] = {}

View File

@@ -47,8 +47,7 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
def import_kwargs(self):
model_kwds = {
"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
"load_in_8bit": True if torch.cuda.device_count() > 1 else False,
"torch_dtype": torch.float16,
"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
}
tokenizer_kwds = {"padding_side": "left"}
return model_kwds, tokenizer_kwds
@@ -62,7 +61,6 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
**attrs: t.Any,
) -> bentoml.Model:
torch_dtype = attrs.pop("torch_dtype", torch.float16)
load_in_8bit = attrs.pop("load_in_8bit", True)
device_map = attrs.pop("device_map", "auto")
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, **tokenizer_kwds)
@@ -74,7 +72,7 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
)
model = transformers.AutoModelForCausalLM.from_pretrained(
model_id, torch_dtype=torch_dtype, load_in_8bit=load_in_8bit, device_map=device_map, **attrs
model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs
)
try:
return bentoml.transformers.save_model(tag, model, custom_objects={"tokenizer": tokenizer})

View File

@@ -15,42 +15,38 @@
Utilities function for OpenLLM. User can import these function for convenience, but
we won't ensure backward compatibility for these functions. So use with caution.
"""
from __future__ import annotations
from __future__ import annotations as _annotations
import functools
import logging
import os
import sys
import types
import typing as t
from bentoml._internal.configuration import get_debug_mode as get_debug_mode
from bentoml._internal.configuration import get_quiet_mode as get_quiet_mode
from bentoml._internal.configuration import set_debug_mode as set_debug_mode
from bentoml._internal.configuration import set_quiet_mode as set_quiet_mode
from bentoml._internal.log import configure_logging as configure_logging
from bentoml._internal.log import configure_server_logging as configure_server_logging
from bentoml._internal.configuration import (get_debug_mode, get_quiet_mode,
set_debug_mode, set_quiet_mode)
from bentoml._internal.log import configure_logging, configure_server_logging
from bentoml._internal.types import LazyType
from bentoml._internal.utils import (LazyLoader, bentoml_cattr,
copy_file_to_fs_folder, first_not_none,
pkg, reserve_free_port,
resolve_user_filepath)
# NOTE: The following exports useful utils from bentoml
from bentoml._internal.utils import LazyLoader as LazyLoader
from bentoml._internal.utils import bentoml_cattr as bentoml_cattr
from bentoml._internal.utils import copy_file_to_fs_folder as copy_file_to_fs_folder
from bentoml._internal.utils import first_not_none as first_not_none
from bentoml._internal.utils import pkg as pkg
from bentoml._internal.utils import reserve_free_port as reserve_free_port
from bentoml._internal.utils import resolve_user_filepath as resolve_user_filepath
from .lazy import LazyModule
from .lazy import LazyModule as LazyModule
logger = logging.getLogger(__name__)
try:
from typing import GenericAlias as TypingGenericAlias # type: ignore
from typing import GenericAlias as _TypingGenericAlias # type: ignore
except ImportError:
# python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
TypingGenericAlias = ()
_TypingGenericAlias = ()
if sys.version_info < (3, 10):
WithArgsTypes = (TypingGenericAlias,)
_WithArgsTypes = (_TypingGenericAlias,)
else:
WithArgsTypes: t.Any = (
_WithArgsTypes: t.Any = (
t._GenericAlias, # type: ignore (_GenericAlias is the actual GenericAlias implementation)
types.GenericAlias,
types.UnionType,
@@ -61,7 +57,7 @@ def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.An
try:
return isinstance(cls, type) and issubclass(cls, class_or_tuple) # type: ignore[arg-type]
except TypeError:
if isinstance(cls, WithArgsTypes):
if isinstance(cls, _WithArgsTypes):
return False
raise
@@ -72,27 +68,25 @@ def gpu_count() -> tuple[int, ...]:
return tuple(NvidiaGpuResource.from_system())
# equivocal setattr to save one lookup per assignment
_object_setattr = object.__setattr__
def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None:
"""This makes sure that we don't overwrite any existing attributes on the object"""
_setattr = functools.partial(setattr, obj) if isinstance(obj, type) else _object_setattr.__get__(obj)
if not hasattr(obj, name):
_setattr(name, value)
DEBUG = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get("OPENLLMDEVDEBUG")))
_extras = {
"get_debug_mode": get_debug_mode,
"get_quiet_mode": get_quiet_mode,
"set_debug_mode": set_debug_mode,
"set_quiet_mode": set_quiet_mode,
"configure_logging": configure_logging,
"configure_server_logging": configure_server_logging,
"LazyType": LazyType,
"LazyLoader": LazyLoader,
"LazyModule": LazyModule,
"bentoml_cattr": bentoml_cattr,
"copy_file_to_fs_folder": copy_file_to_fs_folder,
"first_not_none": first_not_none,
"pkg": pkg,
"reserve_free_port": reserve_free_port,
"resolve_user_filepath": resolve_user_filepath,
"lenient_issubclass": lenient_issubclass,
"gpu_count": gpu_count,
"DEBUG": DEBUG,
# XXX: define all classes, functions import above this line
# since _extras will be the locals() import from this file.
_extras: dict[str, t.Any] = {
k: v for k, v in locals().items() if not isinstance(v, types.ModuleType) and not k.startswith("_")
}
_import_structure = {
@@ -108,23 +102,46 @@ _import_structure = {
"is_flax_available",
"is_tf_available",
"is_torch_available",
"is_bitsandbytes_available",
"require_backends",
],
}
if t.TYPE_CHECKING:
# NOTE: The following exports useful utils from bentoml
from . import LazyLoader as LazyLoader
from . import LazyType as LazyType
from . import analytics as analytics
from . import bentoml_cattr as bentoml_cattr
from . import codegen as codegen
from . import configure_logging as configure_logging
from . import configure_server_logging as configure_server_logging
from . import copy_file_to_fs_folder as copy_file_to_fs_folder
from . import dantic as dantic
from . import first_not_none as first_not_none
from . import get_debug_mode as get_debug_mode
from . import get_quiet_mode as get_quiet_mode
from . import gpu_count as gpu_count
from . import lenient_issubclass as lenient_issubclass
from . import non_intrusive_setattr as non_intrusive_setattr
from . import pkg as pkg
from . import reserve_free_port as reserve_free_port
from . import resolve_user_filepath as resolve_user_filepath
from . import set_debug_mode as set_debug_mode
from . import set_quiet_mode as set_quiet_mode
from .import_utils import ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES
from .import_utils import DummyMetaclass as DummyMetaclass
from .import_utils import ModelEnv as ModelEnv
from .import_utils import is_cpm_kernels_available as is_cpm_kernels_available
from .import_utils import \
is_bitsandbytes_available as is_bitsandbytes_available
from .import_utils import \
is_cpm_kernels_available as is_cpm_kernels_available
from .import_utils import is_einops_available as is_einops_available
from .import_utils import is_flax_available as is_flax_available
from .import_utils import is_tf_available as is_tf_available
from .import_utils import is_torch_available as is_torch_available
from .import_utils import require_backends as require_backends
from .lazy import LazyModule as LazyModule
else:
import sys

View File

@@ -81,7 +81,7 @@ class StartInitEvent(_internal_analytics.schemas.EventMeta):
@staticmethod
def handler(llm_config: openllm.LLMConfig) -> StartInitEvent:
return StartInitEvent(model_name=llm_config.__openllm_model_name__, llm_config=llm_config.model_dump())
return StartInitEvent(model_name=llm_config["model_name"], llm_config=llm_config.model_dump())
def track_start_init(

View File

@@ -61,6 +61,7 @@ _tf_available = importlib.util.find_spec("tensorflow") is not None
_flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None
_einops_available = _is_package_available("einops")
_cpm_kernel_available = _is_package_available("cpm_kernels")
_bitsandbytes_available = _is_package_available("bitsandbytes")
def is_einops_available():
@@ -71,6 +72,10 @@ def is_cpm_kernels_available():
return _cpm_kernel_available
def is_bitsandbytes_available():
return _bitsandbytes_available
def is_torch_available():
global _torch_available
if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:

View File

@@ -37,6 +37,9 @@ def model_settings(draw: st.DrawFn):
requires_gpu=st.booleans(),
trust_remote_code=st.booleans(),
requirements=st.none() | st.lists(st.text(), min_size=1),
use_pipeline=st.booleans(),
model_type=st.sampled_from(["causal_lm", "seq2seq_lm"]),
runtime=st.sampled_from(["transformers", "cpp"]),
name_type=st.sampled_from(["dasherize", "lowercase"]),
timeout=st.integers(min_value=3600),
workers_per_resource=st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)),

View File

@@ -23,7 +23,8 @@ from hypothesis import assume, given
from hypothesis import strategies as st
import openllm
from openllm._configuration import GenerationConfig, ModelSettings, _field_env_key
from openllm._configuration import (GenerationConfig, ModelSettings,
_field_env_key)
from openllm.utils import DEBUG
from ._strategies._configuration import make_llm_config, model_settings
@@ -67,7 +68,7 @@ def test_forbidden_access():
@given(model_settings())
def test_class_normal_gen(gen_settings: ModelSettings):
assume(gen_settings["default_id"] and gen_settings["model_ids"])
assume(gen_settings["default_id"] and all(i for i in gen_settings["model_ids"]))
cl_: type[openllm.LLMConfig] = make_llm_config("NotFullLLM", gen_settings)
assert issubclass(cl_, openllm.LLMConfig)
for key in gen_settings:

View File

@@ -3,11 +3,10 @@
from __future__ import annotations
import os
import subprocess
from markdown_it import MarkdownIt
import openllm
md = MarkdownIt()
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -17,7 +16,7 @@ with open(os.path.join(ROOT, "README.md"), "r") as f:
# NOTE: Currently, we only have one table in README, which is the Model readme.
table = [r for r in readme if r.type == "html_block" and r.content.startswith("<td><a")]
available = len(openllm.CONFIG_MAPPING.keys())
available = subprocess.check_output(["openllm", "models", "-o", "porcelain"]).strip().decode("utf-8").count("\n") + 1
on_table = len(table) # NOTE: minus the header

View File

@@ -31,9 +31,9 @@ FLAN_T5_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
OPENAI_DEPS = ["openai", "tiktoken"]
_base_requirements = {
inflection.dasherize(name): config.__openllm_requirements__
for name, config in openllm.CONFIG_MAPPING.items()
if config.__openllm_requirements__
inflection.dasherize(name): config_cls.__openllm_requirements__
for name, config_cls in openllm.CONFIG_MAPPING.items()
if config_cls.__openllm_requirements__
}
# NOTE: update this table when adding new external dependencies

View File

@@ -47,13 +47,13 @@ def main() -> int:
"Model Ids": [],
}
max_install_len_div = 0
for name, config in openllm.CONFIG_MAPPING.items():
for name, config_cls in openllm.CONFIG_MAPPING.items():
dashed = inflection.dasherize(name)
formatted["Model"].append(dashed)
formatted["URL"].append(config.__openllm_url__)
formatted["URL"].append(config_cls.__openllm_url__)
formatted["GPU"].append("")
formatted["CPU"].append("" if not config.__openllm_requires_gpu__ else "")
formatted["Model Ids"].append(config.__openllm_model_ids__)
formatted["CPU"].append("" if not config_cls.__openllm_requires_gpu__ else "")
formatted["Model Ids"].append(config_cls.__openllm_model_ids__)
if dashed in deps:
instruction = f'```bash\npip install "openllm[{dashed}]"\n```'
else: