mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-03-03 14:46:00 -05:00
feat: quantization (#27)
This commit is contained in:
22
.github/SECURITY.md
vendored
Normal file
22
.github/SECURITY.md
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
# Security Policy
|
||||
|
||||
## Supported Versions
|
||||
|
||||
We are following [semantic versioning](https://semver.org/) with strict
|
||||
backward-compatibility policy. We can ensure that all minor and major versions
|
||||
are backward compatible. We are more lenient with patch as the development can
|
||||
move quickly.
|
||||
|
||||
If you are just using public API, then feel free to always upgrade. Whenever
|
||||
there is a breaking change, it will become a `DeprecationWarning` with a
|
||||
period of 12 months before becoming broken.
|
||||
|
||||
> **Warning:** Every package under `openllm` whose name has an underscore
|
||||
> prefix is exempt from this. They are considered private API and can change
|
||||
> at any time. However, you can ensure that all public API, classes and
|
||||
> functions will be backward-compatible.
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
To report a security vulnerability, please send us an
|
||||
[email](mailto:contact@bentoml.com).
|
||||
@@ -13,7 +13,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
set -o errexit -o nounset -o pipefail
|
||||
|
||||
# Set by GH actions, see
|
||||
@@ -41,4 +40,6 @@ All available models: \`\`\`python -m openllm.models\`\`\`
|
||||
|
||||
To start a LLM: \`\`\`python -m openllm start dolly-v2\`\`\`
|
||||
|
||||
Find more information about this release in the [CHANGELOG.md](https://github.com/bentoml/OpenLLM/blob/main/CHANGELOG.md)
|
||||
|
||||
EOF
|
||||
|
||||
1
.github/actions/release.sh
vendored
1
.github/actions/release.sh
vendored
@@ -35,6 +35,7 @@ echo "Releasing version $RELEASE_VERSION..." && hatch version "${RELEASE_VERSION
|
||||
|
||||
jq --arg release_version "${RELEASE_VERSION}" '.version = $release_version' < package.json > package.json.tmp && mv package.json.tmp package.json
|
||||
|
||||
towncrier build --yes --version "${RELEASE_VERSION}" && git add CHANGELOG.md changelog.d
|
||||
git add src/openllm/__about__.py package.json && git commit -sm "infra: prepare for release ${RELEASE_VERSION} [generated]"
|
||||
git push origin main
|
||||
|
||||
|
||||
2
.github/actions/setup-repo/action.yml
vendored
2
.github/actions/setup-repo/action.yml
vendored
@@ -53,7 +53,7 @@ runs:
|
||||
${{ steps.get-cache-key-prefix.outputs.prefix }}-pypi-
|
||||
- name: Install dependencies
|
||||
shell: bash
|
||||
run: pip install -e ".[all]" hatch -vv
|
||||
run: pip install -e ".[all]" hatch towncrier -vv
|
||||
- name: Install pyright
|
||||
shell: bash
|
||||
run: npm install -g npm@^7 pyright
|
||||
|
||||
9
.github/workflows/ci.yml
vendored
9
.github/workflows/ci.yml
vendored
@@ -22,8 +22,7 @@ on:
|
||||
env:
|
||||
LINES: 120
|
||||
COLUMNS: 120
|
||||
BENTOML_DO_NOT_TRACK: True
|
||||
PYTEST_PLUGINS: bentoml.testing.pytest.plugin
|
||||
OPENLLM_DO_NOT_TRACK: True
|
||||
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrun
|
||||
defaults:
|
||||
run:
|
||||
@@ -38,8 +37,10 @@ jobs:
|
||||
fetch-depth: 0
|
||||
- name: Setup CI
|
||||
uses: ./.github/actions/setup-repo
|
||||
- name: Format check
|
||||
run: hatch run dev:style
|
||||
- name: Running changelog check
|
||||
run: hatch run changelog
|
||||
- name: Format and lint check
|
||||
run: hatch run fmt
|
||||
- name: Type check
|
||||
if: ${{ github.event_name == 'pull_request' }}
|
||||
run: git diff --name-only --diff-filter=AM "origin/$GITHUB_BASE_REF" -z -- '*.py{,i}' | xargs -0 --no-run-if-empty hatch run dev:typing
|
||||
|
||||
@@ -13,20 +13,31 @@
|
||||
# limitations under the License.
|
||||
|
||||
ci:
|
||||
autoupdate_schedule: monthly
|
||||
autoupdate_schedule: weekly
|
||||
skip: [check-models-table-update, check-models-table-update]
|
||||
exclude: '.*\.(css|js|svg)$'
|
||||
repos:
|
||||
- repo: local
|
||||
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
||||
rev: 'v0.0.272'
|
||||
hooks:
|
||||
- id: format-check
|
||||
name: format-check
|
||||
language: system
|
||||
entry: hatch run dev:style
|
||||
always_run: true
|
||||
pass_filenames: false
|
||||
- id: ruff
|
||||
args: [--fix, --exit-non-zero-on-fix, --show-fixes]
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 23.3.0
|
||||
hooks:
|
||||
- id: black-jupyter
|
||||
files: '/(src|tests|docs|examples|typings)/'
|
||||
- repo: https://github.com/econchick/interrogate
|
||||
rev: 1.5.0
|
||||
hooks:
|
||||
- id: interrogate
|
||||
types: [python]
|
||||
exclude: ^(docs|tools|tests)
|
||||
args: [--config=pyproject.toml]
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: check-license-header
|
||||
name: license-header-check
|
||||
name: check for license headers
|
||||
entry: ./tools/assert-license-headers
|
||||
language: script
|
||||
exclude_types:
|
||||
@@ -36,13 +47,14 @@ repos:
|
||||
exclude: |
|
||||
(?x)^(
|
||||
tools/.*|
|
||||
changelog.d/.*|
|
||||
typings/.*|
|
||||
.github/.*
|
||||
)$
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: check-models-table-update
|
||||
name: check-models-table-update
|
||||
name: check if table in README.md is up-to-date
|
||||
entry: ./tools/assert-model-table-latest
|
||||
language: script
|
||||
files: README.md
|
||||
|
||||
19
CHANGELOG.md
Normal file
19
CHANGELOG.md
Normal file
@@ -0,0 +1,19 @@
|
||||
# Changelog
|
||||
|
||||
We are following [semantic versioning](https://semver.org/) with strict
|
||||
backward-compatibility policy.
|
||||
|
||||
You can find our backwards-compatibility policy
|
||||
[here](https://github.com/bentoml/openllm/blob/main/.github/SECURITY.md).
|
||||
|
||||
Changes for the upcoming release can be found in the
|
||||
['changelog.d' directory](https://github.com/bentoml/openllm/tree/main/changelog.d)
|
||||
in our repository.
|
||||
|
||||
<!--
|
||||
Do *NOT* add changelog entries here!
|
||||
|
||||
This changelog is managed by towncrier and is compiled at release time.
|
||||
-->
|
||||
|
||||
<!-- towncrier release notes start -->
|
||||
@@ -100,7 +100,7 @@ After setting up your environment, here's how you can start contributing:
|
||||
3. Run all formatters and linters with `hatch`:
|
||||
|
||||
```bash
|
||||
hatch run dev:fmt
|
||||
hatch run fmt
|
||||
```
|
||||
4. Write tests that verify your feature or fix (see
|
||||
[Writing Tests](#writing-tests) below).
|
||||
@@ -127,8 +127,8 @@ After setting up your environment, here's how you can start contributing:
|
||||
## Using a custom fork
|
||||
|
||||
If you wish to use a modified version of OpenLLM, install your fork from source
|
||||
with `pip install -e` and set `OPENLLM_DEV_BUILD=True`, so that Bentos built will
|
||||
include the generated wheels for OpenLLM in the bundle.
|
||||
with `pip install -e` and set `OPENLLM_DEV_BUILD=True`, so that Bentos built
|
||||
will include the generated wheels for OpenLLM in the bundle.
|
||||
|
||||
## Writing Tests
|
||||
|
||||
@@ -154,3 +154,61 @@ To release a new version, use `./tools/run-release-action`. It requires `gh`,
|
||||
```
|
||||
|
||||
> Note that currently this workflow can only be run by the BentoML team.
|
||||
|
||||
## Changelog
|
||||
|
||||
_modeled after the [attrs](https://github.com/python-attrs/attrs) workflow_
|
||||
|
||||
If the change is noteworthy, there needs to be a changelog entry so users can
|
||||
learn about it!
|
||||
|
||||
To avoid merge conflicts, we use the
|
||||
[_Towncrier_](https://pypi.org/project/towncrier) package to manage our
|
||||
changelog. _towncrier_ uses independent _Markdown_ files for each pull request –
|
||||
so called _news fragments_ – instead of one monolithic changelog file. On
|
||||
release, those news fragments are compiled into
|
||||
[`CHANGELOG.md`](https://github.com/bentoml/openllm/blob/main/CHANGELOG.md).
|
||||
|
||||
You don't need to install _Towncrier_ yourself, you just have to abide by a few
|
||||
simple rules:
|
||||
|
||||
- For each pull request, add a new file into `changelog.d` with a filename
|
||||
adhering to the `<pr#>.(change|deprecation|breaking|feature).md` schema: For
|
||||
example, `changelog.d/42.change.md` for a non-breaking change that is proposed
|
||||
in pull request #42.
|
||||
- As with other docs, please use [semantic newlines] within news fragments.
|
||||
- Wrap symbols like modules, functions, or classes into backticks so they are
|
||||
rendered in a `monospace font`.
|
||||
- Wrap arguments into asterisks like in docstrings:
|
||||
`Added new argument *an_argument*.`
|
||||
- If you mention functions or other callables, add parentheses at the end of
|
||||
their names: `openllm.func()` or `openllm.LLMClass.method()`. This makes the
|
||||
changelog a lot more readable.
|
||||
- Prefer simple past tense or constructions with "now". For example:
|
||||
|
||||
- Added `LLM.func()`.
|
||||
- `LLM.func()` now doesn't do X.Y.Z anymore when passed the _foobar_ argument.
|
||||
- If you want to reference multiple issues, copy the news fragment to another
|
||||
filename. _Towncrier_ will merge all news fragments with identical contents
|
||||
into one entry with multiple links to the respective pull requests.
|
||||
|
||||
Example entries:
|
||||
|
||||
```md
|
||||
Added `LLM.func()`.
|
||||
The feature really _is_ awesome.
|
||||
```
|
||||
|
||||
or:
|
||||
|
||||
```md
|
||||
`openllm.utils.func()` now doesn't X.Y.Z anymore when passed the _foobar_ argument.
|
||||
The bug really _was_ nasty.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
`hatch run changelog` will render the current changelog to the terminal if you have
|
||||
any doubts.
|
||||
|
||||
[semantic newlines]: https://rhodesmill.org/brandon/2012/one-sentence-per-line/
|
||||
|
||||
14
changelog.d/27.feature.md
Normal file
14
changelog.d/27.feature.md
Normal file
@@ -0,0 +1,14 @@
|
||||
Added support for quantization during serving time.
|
||||
`openllm start` now supports `--quantize 8bit` and `--quantize 4bit`
|
||||
`GPTQ` quantization support is on the roadmap and currently
|
||||
being worked on.
|
||||
`openllm start` now also supports `--bettertransformer` to use
|
||||
`BetterTransformer` for serving
|
||||
Refactored `openllm.LLMConfig` to be able to use with `__getitem__`
|
||||
to access the config value: `openllm.DollyV2Config()['requirements']`
|
||||
the order being: `__openllm_*__ > self.<key> > __openllm_generation_class__ > __openllm_extras__`
|
||||
Added `towncrier` workflow to easily generate changelog entries
|
||||
Added `use_pipeline`, `bettertransformer` flag into ModelSettings
|
||||
`LLMConfig` now supports the `__dataclass_transform__` protocol to help
|
||||
with type-checking
|
||||
Changed `openllm download-models` to `openllm download`
|
||||
29
changelog.d/template.md.jinja
Normal file
29
changelog.d/template.md.jinja
Normal file
@@ -0,0 +1,29 @@
|
||||
{%- if versiondata["version"] == "main" -%}
|
||||
## Changes for the Upcoming Release
|
||||
|
||||
:::{warning}
|
||||
These changes reflect the current [development progress](https://github.com/bentoml/openllm/tree/main) and have **not** been part of an official PyPI release yet.
|
||||
To try out the latest change, one can do: `pip install -U git+https://github.com/bentoml/openllm.git@main`
|
||||
:::
|
||||
{% else -%}
|
||||
## [{{ versiondata["version"] }}](https://github.com/bentoml/openllm/tree/{{ versiondata["version"] }})
|
||||
{%- endif %}
|
||||
|
||||
{% for section, _ in sections.items() %}
|
||||
{% if sections[section] %}
|
||||
{% for category, val in definitions.items() if category in sections[section] %}
|
||||
|
||||
### {{ definitions[category]['name'] }}
|
||||
|
||||
{% for text, values in sections[section][category].items() %}
|
||||
- {{ text }}
|
||||
{{ values|join(',\n ') }}
|
||||
{% endfor %}
|
||||
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
No significant changes.
|
||||
|
||||
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
@@ -23,9 +23,3 @@ docker run \
|
||||
..image_name
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -24,9 +24,3 @@ docker run \
|
||||
..image_name
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -104,6 +104,7 @@ packages = ["src/openllm", "src/openllm_client"]
|
||||
[tool.hatch.envs.default]
|
||||
dependencies = [
|
||||
"coverage[toml]>=6.5",
|
||||
# NOTE: Tests strategies with Hypothesis and pytest, and snapshot testing with syrupy
|
||||
"pytest",
|
||||
"pytest-asyncio>=0.21.0",
|
||||
"pytest-xdist[psutil]",
|
||||
@@ -111,42 +112,69 @@ dependencies = [
|
||||
"pytest-mock",
|
||||
"pytest-randomly",
|
||||
"pytest-rerunfailures",
|
||||
"hypothesis",
|
||||
"syrupy",
|
||||
# NOTE: To run all hooks
|
||||
"pre-commit",
|
||||
# NOTE: Using under ./tools/update-optional-dependencies.py
|
||||
"tomlkit",
|
||||
# NOTE: Using under ./tools/update-readme.py
|
||||
"markdown-it-py",
|
||||
# NOTE: Tests strategies with Hypothesis
|
||||
"hypothesis",
|
||||
# NOTE: snapshot testing
|
||||
"syrupy",
|
||||
# NOTE: pyright for type
|
||||
"pyright",
|
||||
# NOTE: towncrier for changelog
|
||||
"towncrier",
|
||||
]
|
||||
[tool.hatch.envs.default.scripts]
|
||||
cov = ["test-cov", "cov-report"]
|
||||
changelog = "towncrier build --version main --draft"
|
||||
cov = ["cov-test", "cov-report"]
|
||||
cov-report = ["- coverage combine", "coverage report"]
|
||||
cov-test = "coverage run -m pytest {args:tests}"
|
||||
fmt = "pre-commit run --all-files"
|
||||
setup = "pre-commit install"
|
||||
test = "pytest {args:tests}"
|
||||
test-cov = "coverage run -m pytest {args:tests}"
|
||||
typing = "pyright {args:src/openllm tests}"
|
||||
|
||||
[tool.towncrier]
|
||||
directory = "changelog.d"
|
||||
filename = "CHANGELOG.md"
|
||||
issue_format = "[#{issue}](https://github.com/bentoml/openllm/issues/{issue})"
|
||||
name = "openllm"
|
||||
start_string = "<!-- towncrier release notes start -->\n"
|
||||
template = "changelog.d/template.md.jinja"
|
||||
title_format = ""
|
||||
underlines = ["", "", ""]
|
||||
|
||||
[[tool.towncrier.section]]
|
||||
path = ""
|
||||
|
||||
[[tool.towncrier.type]]
|
||||
directory = "breaking"
|
||||
name = "Backwards-incompatible Changes"
|
||||
showcontent = true
|
||||
|
||||
[[tool.towncrier.type]]
|
||||
directory = "deprecation"
|
||||
name = "Deprecations"
|
||||
showcontent = true
|
||||
|
||||
[[tool.towncrier.type]]
|
||||
directory = "change"
|
||||
name = "Changes"
|
||||
showcontent = true
|
||||
|
||||
[[tool.towncrier.type]]
|
||||
directory = "feature"
|
||||
name = "Features"
|
||||
showcontent = true
|
||||
|
||||
[[tool.hatch.envs.all.matrix]]
|
||||
python = ["3.8", "3.9", "3.10", "3.11"]
|
||||
|
||||
[tool.hatch.envs.dev]
|
||||
dependencies = [
|
||||
"ruff",
|
||||
"pyright",
|
||||
"hatch",
|
||||
# NOTE: black for generating service file.
|
||||
"black[jupyter]==23.3.0",
|
||||
]
|
||||
detached = true
|
||||
|
||||
[tool.hatch.envs.dev.scripts]
|
||||
all = ["fmt", "typing"]
|
||||
fmt = ["black {args:.}", "black --pyi {args:typings/}", "ruff --fix {args:.}", "style"]
|
||||
style = ["ruff {args:.}", "black --check --diff {args:.}"]
|
||||
typing = "pyright {args:src/openllm tests}"
|
||||
[tool.interrogate]
|
||||
fail-under = 100
|
||||
verbose = 2
|
||||
whitelist-regex = ["test_.*"]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
addopts = ["-rfEX", "-pno:warnings"]
|
||||
@@ -206,12 +234,6 @@ force-single-line = true
|
||||
known-first-party = ["openllm", "bentoml", 'transformers']
|
||||
lines-after-imports = 2
|
||||
|
||||
[tool.ruff.flake8-quotes]
|
||||
inline-quotes = "single"
|
||||
|
||||
[tool.ruff.flake8-tidy-imports]
|
||||
ban-relative-imports = "all"
|
||||
|
||||
[tool.ruff.per-file-ignores]
|
||||
# Tests can use magic values, assertions, and relative imports
|
||||
"__init__.py" = ["E402", "F401", "F403", "F811"]
|
||||
@@ -222,7 +244,7 @@ ban-relative-imports = "all"
|
||||
[tool.pyright]
|
||||
analysis.useLibraryCodeForTypes = true
|
||||
enableTypeIgnoreComments = true
|
||||
include = ["src/", "tests/"]
|
||||
include = ["src/", "tests/", "tools/", "examples/"]
|
||||
pythonVersion = "3.11"
|
||||
reportMissingImports = "none"
|
||||
reportMissingModuleSource = "warning"
|
||||
|
||||
@@ -65,8 +65,11 @@ from deepmerge.merger import Merger
|
||||
|
||||
import openllm
|
||||
|
||||
from .exceptions import ForbiddenAttributeError, GpuNotAvailableError, OpenLLMException
|
||||
from .utils import DEBUG, LazyType, bentoml_cattr, codegen, dantic, first_not_none, lenient_issubclass
|
||||
from .exceptions import (ForbiddenAttributeError, GpuNotAvailableError,
|
||||
OpenLLMException)
|
||||
from .utils import (DEBUG, ENV_VARS_TRUE_VALUES, LazyType, bentoml_cattr,
|
||||
codegen, dantic, first_not_none, lenient_issubclass,
|
||||
non_intrusive_setattr)
|
||||
|
||||
if hasattr(t, "Required"):
|
||||
from typing import Required
|
||||
@@ -78,6 +81,11 @@ if hasattr(t, "NotRequired"):
|
||||
else:
|
||||
from typing_extensions import NotRequired
|
||||
|
||||
if hasattr(t, "dataclass_transform"):
|
||||
from typing import dataclass_transform
|
||||
else:
|
||||
from typing_extensions import dataclass_transform
|
||||
|
||||
_T = t.TypeVar("_T")
|
||||
|
||||
|
||||
@@ -85,7 +93,8 @@ if t.TYPE_CHECKING:
|
||||
import tensorflow as tf
|
||||
import torch
|
||||
import transformers
|
||||
from attr import _CountingAttr, _make_init, _make_repr, _transform_attrs # type: ignore
|
||||
from attr import (_CountingAttr, _make_init, _make_repr, # type: ignore
|
||||
_transform_attrs)
|
||||
from transformers.generation.beam_constraints import Constraint
|
||||
|
||||
from ._types import ClickFunctionWrapper, F, O_co, P
|
||||
@@ -103,7 +112,8 @@ else:
|
||||
ItemgetterAny = itemgetter
|
||||
# NOTE: Using internal API from attr here, since we are actually
|
||||
# allowing subclass of openllm.LLMConfig to become 'attrs'-ish
|
||||
from attr._make import _CountingAttr, _make_init, _make_repr, _transform_attrs
|
||||
from attr._make import (_CountingAttr, _make_init, _make_repr,
|
||||
_transform_attrs)
|
||||
|
||||
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
|
||||
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
|
||||
@@ -369,6 +379,11 @@ class GenerationConfig:
|
||||
)
|
||||
self.__attrs_init__(**attrs)
|
||||
|
||||
def __getitem__(self, item: str) -> t.Any:
|
||||
if hasattr(self, item):
|
||||
return getattr(self, item)
|
||||
raise KeyError(f"GenerationConfig has no attribute {item}")
|
||||
|
||||
|
||||
bentoml_cattr.register_unstructure_hook_factory(
|
||||
lambda cls: attr.has(cls) and lenient_issubclass(cls, GenerationConfig),
|
||||
@@ -411,6 +426,11 @@ class ModelSettings(t.TypedDict, total=False):
|
||||
requires_gpu: bool
|
||||
trust_remote_code: bool
|
||||
requirements: t.Optional[ListStr]
|
||||
|
||||
# llm implementation specifics
|
||||
use_pipeline: bool
|
||||
bettertransformer: bool
|
||||
model_type: t.Literal["causal_lm", "seq2seq_lm"]
|
||||
runtime: t.Literal["transformers", "cpp"]
|
||||
|
||||
# naming convention, only name_type is needed to infer from the class
|
||||
@@ -458,19 +478,19 @@ _ModelSettings: type[attr.AttrsInstance] = codegen.add_method_dunders(
|
||||
def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
|
||||
if not lenient_issubclass(cl_, LLMConfig):
|
||||
raise RuntimeError(f"Given LLMConfig must be a subclass type of 'LLMConfig', got '{cl_}' instead.")
|
||||
settings = cl_.__config__
|
||||
|
||||
if settings is None:
|
||||
raise RuntimeError("Given LLMConfig must have '__config__' defined.")
|
||||
if not hasattr(cl_, "__config__") or getattr(cl_, "__config__") is None:
|
||||
raise RuntimeError("Given LLMConfig must have '__config__' that is not None defined.")
|
||||
|
||||
settings = cl_.__config__
|
||||
assert settings
|
||||
|
||||
required = [i.name for i in attr.fields(cls) if i.metadata.get("required", False)]
|
||||
if any(k not in settings for k in required):
|
||||
raise ValueError(f"The following keys are required under '__config__': {required}")
|
||||
if not settings["default_id"] or not settings["model_ids"]:
|
||||
raise ValueError("Make sure that either 'default_id', 'model_ids' are not emptied under '__config__'.")
|
||||
|
||||
if any(k in settings for k in ("env", "start_name", "model_name")):
|
||||
raise ValueError("The following keys are not allowed under '__config__': env, start_name, model_name")
|
||||
missing = set(required) - set(settings.keys())
|
||||
|
||||
if len(missing) > 0:
|
||||
raise ValueError(f"The following keys are required under '__config__': {required} (missing: {missing})")
|
||||
|
||||
if "generation_class" in settings:
|
||||
raise ValueError(
|
||||
@@ -478,10 +498,16 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
|
||||
f"all required attributes under '{cl_}.GenerationConfig' when defining the class."
|
||||
)
|
||||
|
||||
if not settings["default_id"] or not settings["model_ids"]:
|
||||
raise ValueError("Either 'default_id' or 'model_ids' are emptied under '__config__' (required fields).")
|
||||
|
||||
# NOTE: value in __config__ can be None, hence we use setdefault
|
||||
# to update in-place
|
||||
_cl_name = cl_.__name__.replace("Config", "")
|
||||
name_type = first_not_none(settings.get("name_type"), "dasherize")
|
||||
model_name = inflection.underscore(_cl_name) if name_type == "dasherize" else _cl_name.lower()
|
||||
start_name = inflection.dasherize(model_name) if name_type == "dasherize" else model_name
|
||||
name_type = settings.setdefault("name_type", "dasherize")
|
||||
model_name = settings.setdefault(
|
||||
"model_name", inflection.underscore(_cl_name) if name_type == "dasherize" else _cl_name.lower()
|
||||
)
|
||||
partialed = functools.partial(_field_env_key, model_name=model_name, suffix="generation")
|
||||
|
||||
def auto_env_transformers(_: t.Any, fields: list[attr.Attribute[t.Any]]) -> list[attr.Attribute[t.Any]]:
|
||||
@@ -498,21 +524,9 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
|
||||
for f in fields
|
||||
]
|
||||
|
||||
return cls(
|
||||
default_id=settings["default_id"],
|
||||
model_ids=settings["model_ids"],
|
||||
url=settings.get("url", ""),
|
||||
requires_gpu=settings.get("requires_gpu", False),
|
||||
trust_remote_code=settings.get("trust_remote_code", False),
|
||||
requirements=settings.get("requirements", None),
|
||||
name_type=name_type,
|
||||
model_name=model_name,
|
||||
start_name=start_name,
|
||||
runtime=settings.get("runtime", "transformers"),
|
||||
env=openllm.utils.ModelEnv(model_name),
|
||||
timeout=settings.get("timeout", 3600),
|
||||
workers_per_resource=settings.get("workers_per_resource", 1),
|
||||
generation_class=attr.make_class(
|
||||
settings.setdefault(
|
||||
"generation_class",
|
||||
attr.make_class(
|
||||
f"{_cl_name}GenerationConfig",
|
||||
[],
|
||||
bases=(GenerationConfig,),
|
||||
@@ -520,10 +534,40 @@ def structure_settings(cl_: type[LLMConfig], cls: type[t.Any]):
|
||||
weakref_slot=True,
|
||||
frozen=False,
|
||||
repr=True,
|
||||
collect_by_mro=True,
|
||||
field_transformer=auto_env_transformers,
|
||||
),
|
||||
)
|
||||
|
||||
env = settings.setdefault("env", openllm.utils.ModelEnv(model_name))
|
||||
requires_gpu = settings.setdefault("requires_gpu", False)
|
||||
|
||||
# bettertransformer support
|
||||
bettertransformer = settings.setdefault(
|
||||
"bettertransformer",
|
||||
os.environ.get(env.bettertransformer, str(False)).upper() in ENV_VARS_TRUE_VALUES,
|
||||
)
|
||||
if requires_gpu:
|
||||
# For all models that requires GPU, no need to offload it to BetterTransformer
|
||||
# use bitsandbytes or gptq instead for latency improvement
|
||||
if bettertransformer:
|
||||
logger.debug("Model requires GPU by default, disabling bettertransformer.")
|
||||
bettertransformer = False
|
||||
settings["bettertransformer"] = bettertransformer
|
||||
|
||||
# default value
|
||||
settings.setdefault("url", "")
|
||||
settings.setdefault("use_pipeline", False)
|
||||
settings.setdefault("model_type", "causal_lm")
|
||||
settings.setdefault("trust_remote_code", False)
|
||||
settings.setdefault("requirements", None)
|
||||
settings.setdefault("timeout", 3600)
|
||||
settings.setdefault("workers_per_resource", 1)
|
||||
settings.setdefault("runtime", "transformers")
|
||||
settings.setdefault("start_name", inflection.dasherize(model_name) if name_type == "dasherize" else model_name)
|
||||
|
||||
return cls(**settings)
|
||||
|
||||
|
||||
bentoml_cattr.register_structure_hook(_ModelSettings, structure_settings)
|
||||
|
||||
@@ -534,15 +578,16 @@ def _setattr_class(attr_name: str, value_var: t.Any, add_dunder: bool = False):
|
||||
We can't use the cached object.__setattr__ since we are setting
|
||||
attributes to a class.
|
||||
"""
|
||||
if add_dunder:
|
||||
return f"setattr(cls, '{attr_name}', __add_dunder(cls, {value_var}))"
|
||||
return f"setattr(cls, '{attr_name}', {value_var})"
|
||||
val = f"__add_dunder(cls, {value_var})" if add_dunder else value_var
|
||||
return f"setattr(cls, '{attr_name}', {val})"
|
||||
|
||||
|
||||
_dunder_add = {"generation_class"}
|
||||
|
||||
|
||||
def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance) -> t.Callable[..., None]:
|
||||
def _make_assignment_script(
|
||||
cls: type[LLMConfig], attributes: attr.AttrsInstance, _prefix: t.LiteralString = "openllm"
|
||||
) -> t.Callable[..., None]:
|
||||
"""Generate the assignment script with prefix attributes __openllm_<value>__"""
|
||||
args: ListStr = []
|
||||
globs: DictStrAny = {
|
||||
@@ -555,7 +600,7 @@ def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance
|
||||
|
||||
lines: ListStr = ["_getattr = _cached_getattribute_get(_cached_attribute)"]
|
||||
for attr_name, field in attr.fields_dict(attributes.__class__).items():
|
||||
arg_name = field.metadata.get("target", f"__openllm_{inflection.underscore(attr_name)}__")
|
||||
arg_name = field.metadata.get("target", f"__{_prefix}_{inflection.underscore(attr_name)}__")
|
||||
args.append(f"{attr_name}=getattr(_cached_attribute, '{attr_name}')")
|
||||
lines.append(_setattr_class(arg_name, attr_name, add_dunder=attr_name in _dunder_add))
|
||||
annotations[attr_name] = field.type
|
||||
@@ -568,6 +613,23 @@ def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance
|
||||
_reserved_namespace = {"__config__", "GenerationConfig"}
|
||||
|
||||
|
||||
@dataclass_transform(order_default=True, field_specifiers=(attr.field, dantic.Field))
|
||||
def __llm_config_transform__(cls: type[LLMConfig]) -> type[LLMConfig]:
|
||||
kwargs: dict[str, t.Any] = {}
|
||||
if hasattr(cls, "GenerationConfig"):
|
||||
kwargs = {k: v for k, v in vars(cls.GenerationConfig).items() if not k.startswith("_")}
|
||||
non_intrusive_setattr(
|
||||
cls,
|
||||
"__dataclass_transform__",
|
||||
{
|
||||
"order_default": True,
|
||||
"field_specifiers": (attr.field, dantic.Field),
|
||||
"kwargs": kwargs,
|
||||
},
|
||||
)
|
||||
return cls
|
||||
|
||||
|
||||
@attr.define(slots=True)
|
||||
class LLMConfig:
|
||||
"""
|
||||
@@ -640,11 +702,11 @@ class LLMConfig:
|
||||
# NOTE: The following is handled via __init_subclass__, and is only used for TYPE_CHECKING
|
||||
if t.TYPE_CHECKING:
|
||||
# NOTE: public attributes to override
|
||||
__config__: ModelSettings | None = None
|
||||
__config__: ModelSettings | None = Field(None)
|
||||
"""Internal configuration for this LLM model. Each of the field in here will be populated
|
||||
and prefixed with __openllm_<value>__"""
|
||||
|
||||
GenerationConfig: type = type
|
||||
GenerationConfig: type = Field(None)
|
||||
"""Users can override this subclass of any given LLMConfig to provide GenerationConfig
|
||||
default value. For example:
|
||||
|
||||
@@ -663,7 +725,7 @@ class LLMConfig:
|
||||
def __attrs_init__(self, **attrs: t.Any):
|
||||
"""Generated __attrs_init__ for LLMConfig subclass that follows the attrs contract."""
|
||||
|
||||
__attrs_attrs__: tuple[attr.Attribute[t.Any], ...] = tuple()
|
||||
__attrs_attrs__: tuple[attr.Attribute[t.Any], ...] = Field(None, init=False)
|
||||
"""Since we are writing our own __init_subclass__, which is an alternative way for __prepare__,
|
||||
we want openllm.LLMConfig to be attrs-like dataclass that has pydantic-like interface.
|
||||
__attrs_attrs__ will be handled dynamically by __init_subclass__.
|
||||
@@ -683,33 +745,38 @@ class LLMConfig:
|
||||
__openllm_url__: str = Field(None, init=False)
|
||||
"""The resolved url for this LLMConfig."""
|
||||
|
||||
__openllm_requires_gpu__: bool = False
|
||||
__openllm_requires_gpu__: bool = Field(None, init=False)
|
||||
"""Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU."""
|
||||
|
||||
__openllm_trust_remote_code__: bool = False
|
||||
__openllm_trust_remote_code__: bool = Field(False)
|
||||
"""Whether to always trust remote code"""
|
||||
|
||||
__openllm_requirements__: ListStr | None = None
|
||||
__openllm_requirements__: ListStr | None = Field(None)
|
||||
"""The default PyPI requirements needed to run this given LLM. By default, we will depend on
|
||||
bentoml, torch, transformers."""
|
||||
|
||||
__openllm_env__: openllm.utils.ModelEnv = Field(None, init=False)
|
||||
"""A ModelEnv instance for this LLMConfig."""
|
||||
|
||||
__openllm_model_name__: str = ""
|
||||
__openllm_model_name__: str = Field("")
|
||||
"""The normalized version of __openllm_start_name__, determined by __openllm_name_type__"""
|
||||
|
||||
__openllm_start_name__: str = ""
|
||||
__openllm_model_type__: t.Literal["causal_lm", "seq2seq_lm"] = Field("causal_lm")
|
||||
"""The model type for this given LLM. By default, it should be causal language modeling.
|
||||
Currently supported 'causal_lm' or 'seq2seq_lm'
|
||||
"""
|
||||
|
||||
__openllm_start_name__: str = Field("")
|
||||
"""Default name to be used with `openllm start`"""
|
||||
|
||||
__openllm_name_type__: t.Literal["dasherize", "lowercase"] = "dasherize"
|
||||
__openllm_name_type__: t.Literal["dasherize", "lowercase"] = Field("dasherize")
|
||||
"""the default name typed for this model. "dasherize" will convert the name to lowercase and
|
||||
replace spaces with dashes. "lowercase" will convert the name to lowercase."""
|
||||
|
||||
__openllm_timeout__: int = 3600
|
||||
__openllm_timeout__: int = Field(36000)
|
||||
"""The default timeout to be set for this given LLM."""
|
||||
|
||||
__openllm_workers_per_resource__: int | float = 1
|
||||
__openllm_workers_per_resource__: int | float = Field(1)
|
||||
"""The number of workers per resource. This is used to determine the number of workers to use for this model.
|
||||
For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
|
||||
OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource.
|
||||
@@ -720,10 +787,23 @@ class LLMConfig:
|
||||
By default, it is set to 1.
|
||||
"""
|
||||
|
||||
__openllm_runtime__: t.Literal["transformers", "cpp"] = "transformers"
|
||||
__openllm_runtime__: t.Literal["transformers", "cpp"] = Field("transformers")
|
||||
"""The runtime to use for this model. Possible values are `transformers` or `cpp`. See
|
||||
LlaMA for more information."""
|
||||
|
||||
__openllm_use_pipeline__: bool = Field(False)
|
||||
"""Whether this LLM will use HuggingFace Pipeline API. By default, this is set to False.
|
||||
The reason for this to be here is because we want to access this object before loading
|
||||
the _bentomodel. This is because we will actually download the model weights when accessing
|
||||
_bentomodel.
|
||||
"""
|
||||
|
||||
__openllm_bettertransformer__: bool = Field(False)
|
||||
"""Whether to use BetterTransformer for this given LLM. This depends per model
|
||||
architecture. By default, we will use BetterTransformer for T5 and StableLM models,
|
||||
and set to False for every other models.
|
||||
"""
|
||||
|
||||
__openllm_default_id__: str = Field(None)
|
||||
"""Return the default model to use when using 'openllm start <model_id>'.
|
||||
This could be one of the keys in 'self.model_ids' or custom users model."""
|
||||
@@ -804,6 +884,7 @@ class LLMConfig:
|
||||
these["generation_config"] = cls.Field(
|
||||
default=cls.__openllm_generation_class__(),
|
||||
description=inspect.cleandoc(cls.__openllm_generation_class__.__doc__ or ""),
|
||||
type=GenerationConfig,
|
||||
)
|
||||
|
||||
# Generate the base __attrs_attrs__ transformation here.
|
||||
@@ -884,6 +965,7 @@ class LLMConfig:
|
||||
cls.__openllm_hints__ = {
|
||||
f.name: f.type for ite in map(attr.fields, (cls, cls.__openllm_generation_class__)) for f in ite
|
||||
}
|
||||
cls = __llm_config_transform__(cls)
|
||||
|
||||
def __setattr__(self, attr: str, value: t.Any):
|
||||
if attr in _reserved_namespace:
|
||||
@@ -909,14 +991,7 @@ class LLMConfig:
|
||||
if generation_config is None:
|
||||
generation_config = {k: v for k, v in attrs.items() if k in _generation_cl_dict}
|
||||
else:
|
||||
generation_keys = {k for k in attrs if k in _generation_cl_dict}
|
||||
if len(generation_keys) > 0:
|
||||
logger.warning(
|
||||
"Both 'generation_config' and keys for 'generation_config' are passed."
|
||||
" The following keys in 'generation_config' will be overriden be keywords argument: %s",
|
||||
", ".join(generation_keys),
|
||||
)
|
||||
config_merger.merge(generation_config, {k: v for k, v in attrs.items() if k in generation_keys})
|
||||
config_merger.merge(generation_config, {k: v for k, v in attrs.items() if k in _generation_cl_dict})
|
||||
|
||||
for k in _cached_keys:
|
||||
if k in generation_config or attrs.get(k) is None:
|
||||
@@ -942,7 +1017,32 @@ class LLMConfig:
|
||||
)
|
||||
|
||||
# The rest of attrs should only be the attributes to be passed to __attrs_init__
|
||||
self.__attrs_init__(generation_config=self.__openllm_generation_class__(**generation_config), **attrs)
|
||||
self.__attrs_init__(generation_config=self["generation_class"](**generation_config), **attrs)
|
||||
|
||||
def __getitem__(self, item: str | t.Any) -> t.Any:
    """Allow accessing LLMConfig as a dictionary.

    Lookups are attempted in the following order, and the first hit wins:

        __openllm_*__ > self.key > generation_config > __openllm_extras__

    This method is purely for convenience, and should not be used for performance critical code.

    Args:
        item: The key to look up. For the first lookup it is wrapped as
            ``__openllm_{item}__``; the remaining lookups use it as-is.

    Returns:
        The value found by the first lookup that succeeds.

    Raises:
        TypeError: If ``item`` is not a string.
        ForbiddenAttributeError: If ``item`` is part of the reserved namespace.
        KeyError: If none of the lookups know about ``item``.
    """
    if not isinstance(item, str):
        raise TypeError(f"LLM only supports string indexing, not {item.__class__.__name__}")
    if item in _reserved_namespace:
        raise ForbiddenAttributeError(
            f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified."
        )

    internal_attributes = f"__openllm_{item}__"
    if hasattr(self, internal_attributes):
        return getattr(self, internal_attributes)
    if hasattr(self, item):
        return getattr(self, item)
    if hasattr(self.__openllm_generation_class__, item):
        # The generation class only tells us that the key is a generation option;
        # the value itself has to come from this instance's generation_config,
        # otherwise a class-level attribute is returned instead of the configured value.
        return getattr(self.generation_config, item)
    if item in self.__openllm_extras__:
        return self.__openllm_extras__[item]
    raise KeyError(item)
|
||||
|
||||
def __getattribute__(self, item: str) -> t.Any:
|
||||
if item in _reserved_namespace:
|
||||
@@ -976,10 +1076,8 @@ class LLMConfig:
|
||||
|
||||
def model_dump(self, flatten: bool = False, **_: t.Any):
|
||||
dumped = bentoml_cattr.unstructure(self)
|
||||
generation_config = bentoml_cattr.unstructure(self.generation_config)
|
||||
if not flatten:
|
||||
dumped["generation_config"] = generation_config
|
||||
else:
|
||||
if flatten:
|
||||
generation_config = dumped.pop("generation_config")
|
||||
dumped.update(generation_config)
|
||||
return dumped
|
||||
|
||||
@@ -1028,11 +1126,11 @@ class LLMConfig:
|
||||
key_to_remove: ListStr = []
|
||||
|
||||
for k, v in attrs.items():
|
||||
if k.startswith(f"{self.__openllm_model_name__}_generation_"):
|
||||
llm_config_attrs["generation_config"][k[len(self.__openllm_model_name__ + "_generation_") :]] = v
|
||||
if k.startswith(f"{self['model_name']}_generation_"):
|
||||
llm_config_attrs["generation_config"][k[len(self["model_name"] + "_generation_") :]] = v
|
||||
key_to_remove.append(k)
|
||||
elif k.startswith(f"{self.__openllm_model_name__}_"):
|
||||
llm_config_attrs[k[len(self.__openllm_model_name__ + "_") :]] = v
|
||||
elif k.startswith(f"{self['model_name']}_"):
|
||||
llm_config_attrs[k[len(self["model_name"] + "_") :]] = v
|
||||
key_to_remove.append(k)
|
||||
|
||||
return self.model_construct_env(**llm_config_attrs), {k: v for k, v in attrs.items() if k not in key_to_remove}
|
||||
|
||||
@@ -35,7 +35,8 @@ from bentoml._internal.types import ModelSignatureDict
|
||||
import openllm
|
||||
|
||||
from .exceptions import ForbiddenAttributeError, OpenLLMException
|
||||
from .utils import ENV_VARS_TRUE_VALUES, LazyLoader, bentoml_cattr
|
||||
from .utils import (LazyLoader, bentoml_cattr, is_bitsandbytes_available,
|
||||
non_intrusive_setattr)
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
@@ -60,7 +61,6 @@ else:
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# NOTE: `1-2` -> text-generation and text2text-generation
|
||||
FRAMEWORK_TO_AUTOCLASS_MAPPING = {
|
||||
"pt": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM"),
|
||||
"tf": ("TFAutoModelForCausalLM", "TFAutoModelForSeq2SeqLM"),
|
||||
@@ -132,6 +132,7 @@ def import_model(
|
||||
),
|
||||
)
|
||||
|
||||
# NOTE: `1-2` -> text-generation and text2text-generation
|
||||
if type(config) in transformers.MODEL_FOR_CAUSAL_LM_MAPPING:
|
||||
idx = 0
|
||||
elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING:
|
||||
@@ -243,25 +244,6 @@ class LLMInterface(ABC):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def _default_post_init(self: LLM[t.Any, t.Any]):
|
||||
# load_in_mha: Whether to apply BetterTransformer (or Torch MultiHeadAttention) during inference load.
|
||||
# See https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/
|
||||
# for more information.
|
||||
# NOTE: set a default variable to transform to BetterTransformer by default for inference
|
||||
if self.config.__openllm_runtime__ == "cpp":
|
||||
self.load_in_mha = False
|
||||
else:
|
||||
self.load_in_mha = (
|
||||
os.environ.get(self.config_class.__openllm_env__.bettertransformer, str(False)).upper()
|
||||
in ENV_VARS_TRUE_VALUES
|
||||
)
|
||||
if self.config_class.__openllm_requires_gpu__:
|
||||
# For all models that requires GPU, no need to offload it to BetterTransformer
|
||||
# use bitsandbytes instead
|
||||
|
||||
self.load_in_mha = False
|
||||
|
||||
|
||||
_M = t.TypeVar("_M")
|
||||
_T = t.TypeVar("_T")
|
||||
|
||||
@@ -285,6 +267,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
_model_attrs: dict[str, t.Any]
|
||||
_tokenizer_attrs: dict[str, t.Any]
|
||||
|
||||
bettertransformer: bool
|
||||
|
||||
def __init_subclass__(cls):
|
||||
cd = cls.__dict__
|
||||
prefix_class_name_config = cls.__name__
|
||||
@@ -310,20 +294,6 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
"Missing required key 'config_class'. Make sure to define it within the LLM subclass."
|
||||
)
|
||||
|
||||
if cls.llm_post_init is not LLMInterface.llm_post_init:
|
||||
original_llm_post_init = cd["llm_post_init"]
|
||||
|
||||
def wrapped_llm_post_init(self: t.Self) -> None:
|
||||
"""We need to both initialize private attributes and call the user-defined model_post_init
|
||||
method.
|
||||
"""
|
||||
_default_post_init(self)
|
||||
original_llm_post_init(self)
|
||||
|
||||
cls.llm_post_init = wrapped_llm_post_init
|
||||
else:
|
||||
setattr(cls, "llm_post_init", _default_post_init)
|
||||
|
||||
if cls.import_model is LLMInterface.import_model:
|
||||
# using the default import model
|
||||
setattr(cls, "import_model", functools.partial(import_model, _model_framework=implementation))
|
||||
@@ -353,6 +323,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
model_id: str | None = None,
|
||||
llm_config: openllm.LLMConfig | None = None,
|
||||
*args: t.Any,
|
||||
quantize: t.Literal["8bit", "4bit", "gptq"] | None = None,
|
||||
bettertransformer: bool | None = None,
|
||||
**attrs: t.Any,
|
||||
):
|
||||
"""Initialize the LLM with given pretrained model.
|
||||
@@ -429,6 +401,9 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
model_id: The pretrained model to use. Defaults to None. If None, 'self.default_id' will be used.
|
||||
llm_config: The config to use for this LLM. Defaults to None. If not passed, OpenLLM
|
||||
will use `config_class` to construct default configuration.
|
||||
quantize: The quantization to use for this LLM. Defaults to None. Possible values
|
||||
include 8bit, 4bit and gptq.
|
||||
bettertransformer: Whether to use BetterTransformer with this model. Defaults to False.
|
||||
*args: The args to be passed to the model.
|
||||
**attrs: The kwargs to be passed to the model.
|
||||
|
||||
@@ -438,16 +413,102 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
However, if `model_id` is a path, it is recommended to include this argument.
|
||||
"""
|
||||
|
||||
load_in_mha = attrs.pop("load_in_mha", False)
|
||||
openllm_model_version = attrs.pop("openllm_model_version", None)
|
||||
|
||||
# low_cpu_mem_usage is only available for model
|
||||
# this is helpful on system with low memory to avoid OOM
|
||||
low_cpu_mem_usage = attrs.pop("low_cpu_mem_usage", True)
|
||||
|
||||
# quantization setup
|
||||
quantization_config = attrs.pop("quantization_config", None)
|
||||
# 8 bit configuration
|
||||
int8_threshold = attrs.pop("llm_int8_threshhold", 6.0)
|
||||
cpu_offloading = attrs.pop("llm_int8_enable_fp32_cpu_offload", False)
|
||||
int8_skip_modules: list[str] | None = attrs.pop("llm_int8_skip_modules", None)
|
||||
int8_has_fp16_weight = attrs.pop("llm_int8_has_fp16_weight", False)
|
||||
# 4 bit configuration
|
||||
int4_compute_dtype = attrs.pop("llm_bnb_4bit_compute_dtype", torch.bfloat16)
|
||||
int4_quant_type = attrs.pop("llm_bnb_4bit_quant_type", "nf4")
|
||||
int4_use_double_quant = attrs.pop("llm_bnb_4bit_use_double_quant", True)
|
||||
|
||||
if quantization_config and quantize:
|
||||
raise ValueError(
|
||||
"""'quantization_config' and 'quantize' are mutually exclusive. Either customise
|
||||
your quantization_config or use the quantize argument."""
|
||||
)
|
||||
if quantization_config is None:
|
||||
# quantize is an openllm.LLM feature, where we can quantize the model
|
||||
# with bitsandbytes or quantization aware training.
|
||||
if quantize is not None:
|
||||
if not is_bitsandbytes_available():
|
||||
raise RuntimeError(
|
||||
"Quantization requires bitsandbytes to be installed. Make "
|
||||
"sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'"
|
||||
)
|
||||
logger.debug(
|
||||
"'quantize' is not None. %s will use a default 'quantization_config' for %s. "
|
||||
"If you want to customise the quantization config, make sure to pass your "
|
||||
"own 'quantization_config'",
|
||||
self,
|
||||
quantize,
|
||||
)
|
||||
if quantize == "8bit":
|
||||
if int8_skip_modules is None:
|
||||
int8_skip_modules = []
|
||||
if "lm_head" not in int8_skip_modules and self.config["model_type"] == "causal_lm":
|
||||
logger.debug("Skipping 'lm_head' for quantization for %s", self)
|
||||
int8_skip_modules.append("lm_head")
|
||||
quantization_config = transformers.BitsAndBytesConfig(
|
||||
load_in_8bit=True,
|
||||
llm_int8_enable_fp32_cpu_offload=cpu_offloading,
|
||||
llm_int8_threshhold=int8_threshold,
|
||||
llm_int8_skip_modules=int8_skip_modules,
|
||||
llm_int8_has_fp16_weight=int8_has_fp16_weight,
|
||||
)
|
||||
elif quantize == "4bit":
|
||||
trf_versions = openllm.utils.pkg.pkg_version_info("transformers")
|
||||
supports_kbits = trf_versions[:2] >= (4, 30)
|
||||
if supports_kbits:
|
||||
quantization_config = transformers.BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
llm_bnb_4bit_compute_dtype=int4_compute_dtype,
|
||||
llm_bnb_4bit_quant_type=int4_quant_type,
|
||||
llm_bnb_4bit_use_double_quant=int4_use_double_quant,
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
"'quantize' is set to 4bit, while the current transformers version %s does not support "
|
||||
"k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore "
|
||||
"make sure to install the latest version of transformers either via PyPI or "
|
||||
"from git source: 'pip install git+https://github.com/huggingface/transformers'.",
|
||||
trf_versions,
|
||||
)
|
||||
elif quantize == "gptq":
|
||||
# TODO: support GPTQ loading quantization
|
||||
if model_id is None:
|
||||
raise RuntimeError(
|
||||
"'quantize=%s' requires passing custom path to quantized weights as we are unable to load "
|
||||
"the model on the fly. See https://github.com/qwopqwop200/GPTQ-for-LLaMa for "
|
||||
"instruction on how to quantize '%s' with GPTQ.",
|
||||
quantize,
|
||||
self,
|
||||
)
|
||||
raise NotImplementedError("GPTQ is not supported yet.")
|
||||
else:
|
||||
raise ValueError(f"'quantize' must be one of ['8bit', '4bit', 'gptq'], got {quantize} instead.")
|
||||
|
||||
attrs.update({"quantization_config": quantization_config})
|
||||
|
||||
if llm_config is not None:
|
||||
logger.debug("Using given 'llm_config=(%s)' to initialize LLM", llm_config)
|
||||
self.config = llm_config
|
||||
else:
|
||||
self.config = self.config_class.model_construct_env(**attrs)
|
||||
# The rest of the kwargs that are not used by the config class should be stored into __openllm_extras__.
|
||||
attrs = self.config.__openllm_extras__
|
||||
attrs = self.config["extras"]
|
||||
|
||||
if not self.config["use_pipeline"]:
|
||||
attrs["low_cpu_mem_usage"] = low_cpu_mem_usage
|
||||
|
||||
model_kwds, tokenizer_kwds = {}, {}
|
||||
if self.__llm_init_kwargs__:
|
||||
@@ -463,10 +524,11 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
)
|
||||
|
||||
if model_id is None:
|
||||
model_id = os.environ.get(self.config.__openllm_env__.model_id, self.config.__openllm_default_id__)
|
||||
model_id = os.environ.get(self.config["env"].model_id, self.config["default_id"])
|
||||
|
||||
# NOTE: This is the actual given path or pretrained weight for this LLM.
|
||||
assert model_id is not None
|
||||
if t.TYPE_CHECKING:
|
||||
assert model_id is not None
|
||||
self._model_id = model_id
|
||||
|
||||
# parsing tokenizer and model kwargs
|
||||
@@ -476,23 +538,24 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
model_kwds.update({k: v for k, v in attrs.items() if not k.startswith(TOKENIZER_PREFIX)})
|
||||
|
||||
# handle trust_remote_code
|
||||
self.__llm_trust_remote_code__ = model_kwds.pop("trust_remote_code", self.config.__openllm_trust_remote_code__)
|
||||
self.__llm_trust_remote_code__ = model_kwds.pop("trust_remote_code", self.config["trust_remote_code"])
|
||||
|
||||
# NOTE: Save the args and kwargs for later loading
|
||||
self._model_args = args
|
||||
self._model_attrs = model_kwds
|
||||
self._tokenizer_attrs = tokenizer_kwds
|
||||
|
||||
# we allow users to overwrite the load_in_mha defined by the LLM subclass.
|
||||
if load_in_mha:
|
||||
logger.debug("Overwriting 'load_in_mha=%s' (base load_in_mha=%s)", load_in_mha, self.load_in_mha)
|
||||
self.load_in_mha = load_in_mha
|
||||
|
||||
self._openllm_model_version = openllm_model_version
|
||||
|
||||
if self.__llm_post_init__:
|
||||
self.llm_post_init()
|
||||
|
||||
# we set it here so that we allow subclass to overwrite bettertransformer in llm_post_init
|
||||
if bettertransformer:
|
||||
logger.debug("Using %r with BetterTransformer", self)
|
||||
self.bettertransformer = bettertransformer
|
||||
else:
|
||||
non_intrusive_setattr(self, "bettertransformer", self.config["bettertransformer"])
|
||||
|
||||
def __setattr__(self, attr: str, value: t.Any):
|
||||
if attr in _reserved_namespace:
|
||||
raise ForbiddenAttributeError(
|
||||
@@ -513,7 +576,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
|
||||
@property
|
||||
def runner_name(self) -> str:
|
||||
return f"llm-{self.config.__openllm_start_name__}-runner"
|
||||
return f"llm-{self.config['start_name']}-runner"
|
||||
|
||||
# NOTE: The section below defines a loose contract with langchain's LLM interface.
|
||||
@property
|
||||
@@ -524,7 +587,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
def identifying_params(self) -> dict[str, t.Any]:
|
||||
return {
|
||||
"configuration": self.config.model_dump_json().decode(),
|
||||
"model_ids": orjson.dumps(self.config.__openllm_model_ids__).decode(),
|
||||
"model_ids": orjson.dumps(self.config["model_ids"]).decode(),
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
@@ -580,8 +643,8 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
sys.executable,
|
||||
"-m",
|
||||
"openllm",
|
||||
"download-models",
|
||||
self.config.__openllm_start_name__,
|
||||
"download",
|
||||
self.config["start_name"],
|
||||
"--model-id",
|
||||
self.model_id,
|
||||
"--output",
|
||||
@@ -625,7 +688,11 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
kwds = self._model_attrs
|
||||
kwds["trust_remote_code"] = self.__llm_trust_remote_code__
|
||||
|
||||
if self.load_in_mha and "_pretrained_class" not in self._bentomodel.info.metadata:
|
||||
is_pipeline = "_pretrained_class" in self._bentomodel.info.metadata
|
||||
# differentiate when saving tokenizer or other pretrained type.
|
||||
is_pretrained_model = is_pipeline and "_framework" in self._bentomodel.info.metadata
|
||||
|
||||
if self.bettertransformer and is_pipeline and self.config["use_pipeline"]:
|
||||
# This is a pipeline, so provide the accelerator argument
|
||||
kwds["accelerator"] = "bettertransformer"
|
||||
|
||||
@@ -636,10 +703,10 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
self.__llm_model__ = self._bentomodel.load_model(*self._model_args, **kwds)
|
||||
|
||||
if (
|
||||
self.load_in_mha
|
||||
and all(i in self._bentomodel.info.metadata for i in ("_framework", "_pretrained_class"))
|
||||
self.bettertransformer
|
||||
and is_pretrained_model
|
||||
and self._bentomodel.info.metadata["_framework"] == "torch"
|
||||
and self.config.__openllm_runtime__ == "transformers"
|
||||
and self.config["runtime"] == "transformers"
|
||||
):
|
||||
# BetterTransformer is currently only supported on PyTorch.
|
||||
from optimum.bettertransformer import BetterTransformer
|
||||
@@ -767,7 +834,7 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
|
||||
# NOTE: returning the two langchain API's to the runner
|
||||
return types.new_class(
|
||||
inflection.camelize(self.config.__openllm_model_name__) + "Runner",
|
||||
inflection.camelize(self.config["model_name"]) + "Runner",
|
||||
(bentoml.Runner,),
|
||||
exec_body=lambda ns: ns.update(
|
||||
{
|
||||
@@ -776,17 +843,17 @@ class LLM(LLMInterface, t.Generic[_M, _T]):
|
||||
"llm": self, # NOTE: self reference to LLM
|
||||
"config": self.config,
|
||||
"__call__": _wrapped_generate_run,
|
||||
"__module__": f"openllm.models.{self.config.__openllm_model_name__}",
|
||||
"__doc__": self.config.__openllm_env__.start_docstring,
|
||||
"__module__": f"openllm.models.{self.config['model_name']}",
|
||||
"__doc__": self.config["env"].start_docstring,
|
||||
}
|
||||
),
|
||||
)(
|
||||
types.new_class(
|
||||
inflection.camelize(self.config.__openllm_model_name__) + "Runnable",
|
||||
inflection.camelize(self.config["model_name"]) + "Runnable",
|
||||
(_Runnable,),
|
||||
{
|
||||
"SUPPORTED_RESOURCES": ("nvidia.com/gpu", "cpu")
|
||||
if self.config.__openllm_requires_gpu__
|
||||
if self.config["requires_gpu"]
|
||||
else ("nvidia.com/gpu",),
|
||||
"llm_type": self.llm_type,
|
||||
"identifying_params": self.identifying_params,
|
||||
|
||||
@@ -76,17 +76,16 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
|
||||
# first, then proceed to install everything inside the wheels/ folder.
|
||||
packages: list[str] = ["openllm"]
|
||||
|
||||
if llm.config.__openllm_requirements__ is not None:
|
||||
packages.extend(llm.config.__openllm_requirements__)
|
||||
if llm.config["requirements"] is not None:
|
||||
packages.extend(llm.config["requirements"])
|
||||
|
||||
if not (str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false"):
|
||||
packages.append(f"bentoml>={'.'.join([str(i) for i in pkg.pkg_version_info('bentoml')])}")
|
||||
|
||||
to_use_framework = llm.config.__openllm_env__.get_framework_env()
|
||||
env = llm.config["env"]
|
||||
to_use_framework = env.get_framework_env()
|
||||
if to_use_framework == "flax":
|
||||
assert (
|
||||
utils.is_flax_available()
|
||||
), f"Flax is not available, while {llm.config.__openllm_env__.framework} is set to 'flax'"
|
||||
assert utils.is_flax_available(), f"Flax is not available, while {env.framework} is set to 'flax'"
|
||||
packages.extend(
|
||||
[
|
||||
f"flax>={importlib.metadata.version('flax')}",
|
||||
@@ -95,9 +94,7 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
|
||||
]
|
||||
)
|
||||
elif to_use_framework == "tf":
|
||||
assert (
|
||||
utils.is_tf_available()
|
||||
), f"TensorFlow is not available, while {llm.config.__openllm_env__.framework} is set to 'tf'"
|
||||
assert utils.is_tf_available(), f"TensorFlow is not available, while {env.framework} is set to 'tf'"
|
||||
candidates = (
|
||||
"tensorflow",
|
||||
"tensorflow-cpu",
|
||||
@@ -133,16 +130,17 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS) -> Pyth
|
||||
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float) -> DockerOptions:
|
||||
_bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
|
||||
_bentoml_config_options_opts = [
|
||||
"api_server.traffic.timeout=3600", # NOTE: Currently we hardcode this value
|
||||
f'runners."llm-{llm.config.__openllm_start_name__}-runner".traffic.timeout={llm.config.__openllm_timeout__}',
|
||||
f'runners."llm-{llm.config.__openllm_start_name__}-runner".workers_per_resource={workers_per_resource}',
|
||||
"api_server.traffic.timeout=36000", # NOTE: Currently we hardcode this value
|
||||
f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}',
|
||||
f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
|
||||
]
|
||||
_bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
|
||||
env = llm.config["env"]
|
||||
return DockerOptions(
|
||||
cuda_version="11.6", # NOTE: Torch 2.0 currently only support 11.6 as the latest CUDA version
|
||||
env={
|
||||
llm.config.__openllm_env__.framework: llm.config.__openllm_env__.get_framework_env(),
|
||||
"OPENLLM_MODEL": llm.config.__openllm_model_name__,
|
||||
env.framework: env.get_framework_env(),
|
||||
"OPENLLM_MODEL": llm.config["model_name"],
|
||||
"OPENLLM_MODEL_ID": llm.model_id,
|
||||
"BENTOML_DEBUG": str(get_debug_mode()),
|
||||
"BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
|
||||
@@ -180,7 +178,7 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
|
||||
try:
|
||||
os.environ["OPENLLM_MODEL"] = inflection.underscore(model_name)
|
||||
|
||||
to_use_framework = llm_config.__openllm_env__.get_framework_env()
|
||||
to_use_framework = llm_config["env"].get_framework_env()
|
||||
if to_use_framework == "flax":
|
||||
llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=llm_config, **attrs)
|
||||
elif to_use_framework == "tf":
|
||||
@@ -192,12 +190,10 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
|
||||
|
||||
labels = dict(llm.identifying_params)
|
||||
labels.update({"_type": llm.llm_type, "_framework": to_use_framework})
|
||||
service_name = f"generated_{llm.config.__openllm_model_name__}_service.py"
|
||||
workers_per_resource = utils.first_not_none(
|
||||
workers_per_resource, default=llm.config.__openllm_workers_per_resource__
|
||||
)
|
||||
service_name = f"generated_{llm_config['model_name']}_service.py"
|
||||
workers_per_resource = utils.first_not_none(workers_per_resource, default=llm_config["workers_per_resource"])
|
||||
|
||||
with fs.open_fs(f"temp://llm_{llm.config.__openllm_model_name__}") as llm_fs:
|
||||
with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
|
||||
# add service.py definition to this temporary folder
|
||||
utils.codegen.write_service(model_name, llm.model_id, service_name, llm_fs)
|
||||
|
||||
@@ -209,12 +205,12 @@ def build(model_name: str, *, __cli__: bool = False, **attrs: t.Any) -> tuple[be
|
||||
raise bentoml.exceptions.NotFound("Overwriting previously saved Bento.")
|
||||
_previously_built = True
|
||||
except bentoml.exceptions.NotFound:
|
||||
logger.info("Building Bento for LLM '%s'", llm.config.__openllm_start_name__)
|
||||
logger.info("Building Bento for LLM '%s'", llm_config["start_name"])
|
||||
bento = bentoml.bentos.build(
|
||||
f"{service_name}:svc",
|
||||
name=bento_tag.name,
|
||||
labels=labels,
|
||||
description=f"OpenLLM service for {llm.config.__openllm_start_name__}",
|
||||
description=f"OpenLLM service for {llm_config['start_name']}",
|
||||
include=[
|
||||
f for f in llm_fs.walk.files(filter=["*.py"])
|
||||
], # NOTE: By default, we are using _service.py as the default service, for now.
|
||||
|
||||
@@ -55,7 +55,7 @@ class GenerationInput:
|
||||
def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
|
||||
llm_config = openllm.AutoConfig.for_model(model_name, **attrs)
|
||||
return attr.make_class(
|
||||
inflection.camelize(llm_config.__openllm_model_name__) + "GenerationInput",
|
||||
inflection.camelize(llm_config["model_name"]) + "GenerationInput",
|
||||
attrs={
|
||||
"prompt": attr.field(type=str),
|
||||
"llm_config": attr.field(
|
||||
|
||||
@@ -36,7 +36,7 @@ model_id = os.environ.get("OPENLLM_MODEL_ID", "{__model_id__}") # openllm: mode
|
||||
llm_config = openllm.AutoConfig.for_model(model)
|
||||
runner = openllm.Runner(model, model_id=model_id, llm_config=llm_config)
|
||||
|
||||
svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", runners=[runner])
|
||||
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[runner])
|
||||
|
||||
|
||||
@svc.api(
|
||||
@@ -55,8 +55,8 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
|
||||
def metadata_v1(_: str) -> openllm.MetadataOutput:
|
||||
return openllm.MetadataOutput(
|
||||
model_id=model_id,
|
||||
timeout=llm_config.__openllm_timeout__,
|
||||
model_name=llm_config.__openllm_model_name__,
|
||||
framework=llm_config.__openllm_env__.get_framework_env(),
|
||||
timeout=llm_config["timeout"],
|
||||
model_name=llm_config["model_name"],
|
||||
framework=llm_config["env"].get_framework_env(),
|
||||
configuration=llm_config.model_dump_json().decode(),
|
||||
)
|
||||
|
||||
@@ -83,6 +83,29 @@ def _echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.A
|
||||
call(text, **attrs)
|
||||
|
||||
|
||||
def quantize_option(factory: t.Any):
    """Build the ``--quantize`` option using ``factory.option``.

    Args:
        factory: Object exposing a click-style ``option`` method.

    Returns:
        Whatever ``factory.option`` returns for a ``--quantize`` option that
        accepts ``8bit``, ``4bit`` or ``gptq`` and defaults to ``None``.
    """
    supported_modes = click.Choice(["8bit", "4bit", "gptq"])
    description = """Running this model in quantized mode.
Note that GPTQ is currently working in progress and will be available soon.

NOTE: Quantization is only available for PyTorch models.
"""
    return factory.option("--quantize", type=supported_modes, default=None, help=description)
|
||||
|
||||
|
||||
def bettertransformer_option(factory: t.Any):
    """Build the ``--bettertransformer`` flag using ``factory.option``.

    Args:
        factory: Object exposing a click-style ``option`` method.

    Returns:
        Whatever ``factory.option`` returns for a boolean ``--bettertransformer``
        flag whose default is ``None``.
    """
    flag_settings = {
        "is_flag": True,
        "default": None,
        "help": "Use BetterTransformer wrapper to serve model",
    }
    return factory.option("--bettertransformer", **flag_settings)
|
||||
|
||||
|
||||
def start_model_command(
|
||||
model_name: str,
|
||||
group: click.Group,
|
||||
@@ -108,29 +131,30 @@ def start_model_command(
|
||||
openllm.utils.configure_logging()
|
||||
|
||||
llm_config = openllm.AutoConfig.for_model(model_name)
|
||||
env = llm_config["env"]
|
||||
|
||||
docstring = f"""\
|
||||
{llm_config.__openllm_env__.start_docstring}
|
||||
{env.start_docstring}
|
||||
\b
|
||||
Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.__openllm_default_id__}]
|
||||
Available model_id(s): {llm_config['model_ids']} [default: {llm_config['default_id']}]
|
||||
"""
|
||||
command_attrs: dict[str, t.Any] = {
|
||||
"name": llm_config.__openllm_model_name__,
|
||||
"name": llm_config["model_name"],
|
||||
"context_settings": _context_settings or {},
|
||||
"short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)",
|
||||
"help": docstring,
|
||||
}
|
||||
|
||||
aliases: list[str] = []
|
||||
if llm_config.__openllm_name_type__ == "dasherize":
|
||||
aliases.append(llm_config.__openllm_start_name__)
|
||||
if llm_config["name_type"] == "dasherize":
|
||||
aliases.append(llm_config["start_name"])
|
||||
|
||||
command_attrs["aliases"] = aliases if len(aliases) > 0 else None
|
||||
|
||||
serve_decorator = _http_server_args if not _serve_grpc else _grpc_server_args
|
||||
|
||||
available_gpu = openllm.utils.gpu_count()
|
||||
if llm_config.__openllm_requires_gpu__ and len(available_gpu) < 1:
|
||||
if llm_config["requires_gpu"] and len(available_gpu) < 1:
|
||||
# NOTE: The model requires GPU, therefore we will return a dummy command
|
||||
command_attrs.update(
|
||||
{
|
||||
@@ -152,8 +176,13 @@ Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.
|
||||
@llm_config.to_click_options
|
||||
@serve_decorator
|
||||
@cog.optgroup.group("General LLM Options")
|
||||
@cog.optgroup.option("--server-timeout", type=int, default=None, help="Server timeout in seconds")
|
||||
@model_id_option(cog.optgroup, model_env=llm_config.__openllm_env__)
|
||||
@cog.optgroup.option(
|
||||
"--server-timeout",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Server timeout in seconds",
|
||||
)
|
||||
@model_id_option(cog.optgroup, model_env=env)
|
||||
@cog.optgroup.option(
|
||||
"--device",
|
||||
type=tuple,
|
||||
@@ -165,34 +194,47 @@ Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.
|
||||
show_envvar=True,
|
||||
)
|
||||
@workers_per_resource_option(cog.optgroup)
|
||||
@click.pass_context
|
||||
@quantize_option(cog.optgroup)
|
||||
@bettertransformer_option(cog.optgroup)
|
||||
def model_start(
|
||||
ctx: click.Context,
|
||||
server_timeout: int | None,
|
||||
model_id: str | None,
|
||||
workers_per_resource: float | None,
|
||||
device: tuple[str, ...] | None,
|
||||
quantize: t.Literal["8bit", "4bit", "gptq"] | None,
|
||||
bettertransformer: bool | None,
|
||||
**attrs: t.Any,
|
||||
) -> openllm.LLMConfig:
|
||||
config, server_attrs = llm_config.model_validate_click(**attrs)
|
||||
|
||||
if llm_config.__openllm_env__.get_framework_env() == "flax":
|
||||
if quantize and env.get_framework_env() != "pt":
|
||||
_echo("Quantization is only available for PyTorch models.", fg="yellow")
|
||||
|
||||
if env.get_framework_env() == "flax":
|
||||
llm = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config, ensure_available=True)
|
||||
elif llm_config.__openllm_env__.get_framework_env() == "tf":
|
||||
elif env.get_framework_env() == "tf":
|
||||
llm = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config, ensure_available=True)
|
||||
else:
|
||||
llm = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config, ensure_available=True)
|
||||
llm = openllm.AutoLLM.for_model(
|
||||
model_name,
|
||||
model_id=model_id,
|
||||
llm_config=config,
|
||||
quantize=quantize,
|
||||
bettertransformer=bettertransformer,
|
||||
ensure_available=True,
|
||||
)
|
||||
|
||||
if llm.config.__openllm_requirements__ is not None and len(llm.config.__openllm_requirements__) > 0:
|
||||
requirements = config["requirements"]
|
||||
if requirements is not None and len(requirements) > 0:
|
||||
_echo(
|
||||
f"Make sure to have the following dependencies available: {llm.config.__openllm_requirements__}",
|
||||
f"Make sure to have the following dependencies available: {requirements}",
|
||||
fg="yellow",
|
||||
)
|
||||
|
||||
workers_per_resource = openllm.utils.first_not_none(
|
||||
workers_per_resource, default=llm.config.__openllm_workers_per_resource__
|
||||
workers_per_resource, default=config["workers_per_resource"]
|
||||
)
|
||||
server_timeout = openllm.utils.first_not_none(server_timeout, default=llm.config.__openllm_timeout__)
|
||||
server_timeout = openllm.utils.first_not_none(server_timeout, default=config["timeout"])
|
||||
|
||||
num_workers = int(1 / workers_per_resource)
|
||||
if num_workers > 1:
|
||||
@@ -216,26 +258,26 @@ Available model_id(s): {llm_config.__openllm_model_ids__} [default: {llm_config.
|
||||
_bentoml_config_options_opts = [
|
||||
"tracing.sample_rate=1.0",
|
||||
f"api_server.traffic.timeout={server_timeout}",
|
||||
f'runners."llm-{llm.config.__openllm_start_name__}-runner".traffic.timeout={llm.config.__openllm_timeout__}',
|
||||
f'runners."llm-{llm.config.__openllm_start_name__}-runner".workers_per_resource={workers_per_resource}',
|
||||
f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
|
||||
f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
|
||||
]
|
||||
if device:
|
||||
if len(device) > 1:
|
||||
for idx, dev in enumerate(device):
|
||||
_bentoml_config_options_opts.append(
|
||||
f'runners."llm-{llm.config.__openllm_start_name__}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
|
||||
f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
|
||||
)
|
||||
else:
|
||||
_bentoml_config_options_opts.append(
|
||||
f'runners."llm-{llm.config.__openllm_start_name__}-runner".resources."nvidia.com/gpu"=[{device[0]}]'
|
||||
f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]'
|
||||
)
|
||||
|
||||
_bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
|
||||
|
||||
start_env.update(
|
||||
{
|
||||
llm.config.__openllm_env__.framework: llm.config.__openllm_env__.get_framework_env(),
|
||||
llm.config.__openllm_env__.model_config: llm.config.model_dump_json().decode(),
|
||||
env.framework: env.get_framework_env(),
|
||||
env.model_config: llm.config.model_dump_json().decode(),
|
||||
"OPENLLM_MODEL": model_name,
|
||||
"OPENLLM_MODEL_ID": llm.model_id,
|
||||
"BENTOML_DEBUG": str(openllm.utils.get_debug_mode()),
|
||||
@@ -280,7 +322,8 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
|
||||
"""
|
||||
# The following logic is similar to that of BentoMLCommandGroup
|
||||
|
||||
from bentoml._internal.configuration import DEBUG_ENV_VAR, QUIET_ENV_VAR
|
||||
from bentoml._internal.configuration import (DEBUG_ENV_VAR,
|
||||
QUIET_ENV_VAR)
|
||||
|
||||
@click.option("-q", "--quiet", envvar=QUIET_ENV_VAR, is_flag=True, default=False, help="Suppress all output.")
|
||||
@click.option(
|
||||
@@ -668,11 +711,15 @@ def start_grpc_cli():
|
||||
@output_option
|
||||
@click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.")
|
||||
@workers_per_resource_option(click, build=True)
|
||||
@quantize_option(click)
|
||||
@bettertransformer_option(click)
|
||||
def build(
|
||||
model_name: str,
|
||||
model_id: str | None,
|
||||
overwrite: bool,
|
||||
output: OutputLiteral,
|
||||
quantize: t.Literal["8bit", "4bit", "gptq"] | None,
|
||||
bettertransformer: bool | None,
|
||||
workers_per_resource: float | None,
|
||||
):
|
||||
"""Package a given models into a Bento.
|
||||
@@ -695,6 +742,8 @@ def build(
|
||||
model_name,
|
||||
__cli__=True,
|
||||
model_id=model_id,
|
||||
quantize=quantize,
|
||||
bettertransformer=bettertransformer,
|
||||
_workers_per_resource=workers_per_resource,
|
||||
_overwrite_existing_bento=overwrite,
|
||||
)
|
||||
@@ -764,20 +813,20 @@ def models(output: OutputLiteral, show_available: bool):
|
||||
for m in models:
|
||||
config = openllm.AutoConfig.for_model(m)
|
||||
runtime_impl: tuple[t.Literal["pt", "flax", "tf"], ...] = tuple()
|
||||
if config.__openllm_model_name__ in openllm.MODEL_MAPPING_NAMES:
|
||||
if config["model_name"] in openllm.MODEL_MAPPING_NAMES:
|
||||
runtime_impl += ("pt",)
|
||||
if config.__openllm_model_name__ in openllm.MODEL_FLAX_MAPPING_NAMES:
|
||||
if config["model_name"] in openllm.MODEL_FLAX_MAPPING_NAMES:
|
||||
runtime_impl += ("flax",)
|
||||
if config.__openllm_model_name__ in openllm.MODEL_TF_MAPPING_NAMES:
|
||||
if config["model_name"] in openllm.MODEL_TF_MAPPING_NAMES:
|
||||
runtime_impl += ("tf",)
|
||||
json_data[m] = {
|
||||
"model_id": config.__openllm_model_ids__,
|
||||
"url": config.__openllm_url__,
|
||||
"requires_gpu": config.__openllm_requires_gpu__,
|
||||
"model_id": config["model_ids"],
|
||||
"url": config["url"],
|
||||
"requires_gpu": config["requires_gpu"],
|
||||
"runtime_impl": runtime_impl,
|
||||
"installation": "pip install openllm" if m not in extras else f'pip install "openllm[{m}]"',
|
||||
}
|
||||
converted.extend([convert_transformers_model_name(i) for i in config.__openllm_model_ids__])
|
||||
converted.extend([convert_transformers_model_name(i) for i in config["model_ids"]])
|
||||
if openllm.utils.DEBUG:
|
||||
try:
|
||||
openllm.AutoLLM.for_model(m, llm_config=config)
|
||||
@@ -950,7 +999,7 @@ def query_(
|
||||
_echo(res["responses"], fg="white")
|
||||
|
||||
|
||||
@cli.command()
|
||||
@cli.command(name="download")
|
||||
@click.argument(
|
||||
"model_name",
|
||||
type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]),
|
||||
@@ -967,10 +1016,10 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral
|
||||
openllm.utils.configure_logging()
|
||||
|
||||
config = openllm.AutoConfig.for_model(model_name)
|
||||
env = config.__openllm_env__.get_framework_env()
|
||||
if env == "flax":
|
||||
envvar = config["env"].get_framework_env()
|
||||
if envvar == "flax":
|
||||
model = openllm.AutoFlaxLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
elif env == "tf":
|
||||
elif envvar == "tf":
|
||||
model = openllm.AutoTFLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
else:
|
||||
model = openllm.AutoLLM.for_model(model_name, model_id=model_id, llm_config=config)
|
||||
@@ -978,11 +1027,11 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral
|
||||
try:
|
||||
_ref = bentoml.transformers.get(model.tag)
|
||||
if output == "pretty":
|
||||
_echo(f"{model_name} is already setup for framework '{env}': {str(_ref.tag)}", nl=True, fg="yellow")
|
||||
_echo(f"{model_name} is already setup for framework '{envvar}': {str(_ref.tag)}", nl=True, fg="yellow")
|
||||
elif output == "json":
|
||||
_echo(
|
||||
orjson.dumps(
|
||||
{"previously_setup": True, "framework": env, "model": str(_ref.tag)}, option=orjson.OPT_INDENT_2
|
||||
{"previously_setup": True, "framework": envvar, "model": str(_ref.tag)}, option=orjson.OPT_INDENT_2
|
||||
).decode(),
|
||||
fg="white",
|
||||
)
|
||||
@@ -1016,7 +1065,7 @@ def download_models(model_name: str, model_id: str | None, output: OutputLiteral
|
||||
elif output == "json":
|
||||
_echo(
|
||||
orjson.dumps(
|
||||
{"previously_setup": False, "framework": env, "tag": str(_ref.tag)},
|
||||
{"previously_setup": False, "framework": envvar, "tag": str(_ref.tag)},
|
||||
option=orjson.OPT_INDENT_2,
|
||||
).decode()
|
||||
)
|
||||
|
||||
@@ -46,7 +46,7 @@ class ChatGLMConfig(openllm.LLMConfig):
|
||||
|
||||
retain_history: bool = openllm.LLMConfig.Field(
|
||||
False,
|
||||
description="""Whether to retain history given to the model.
|
||||
description="""Whether to retain history given to the model.
|
||||
If set to True, then the model will retain given history.""",
|
||||
)
|
||||
|
||||
|
||||
@@ -38,6 +38,7 @@ class DollyV2Config(openllm.LLMConfig):
|
||||
"timeout": 3600000,
|
||||
"trust_remote_code": True,
|
||||
"url": "https://github.com/databrickslabs/dolly",
|
||||
"use_pipeline": True,
|
||||
"default_id": "databricks/dolly-v2-3b",
|
||||
"model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"],
|
||||
}
|
||||
|
||||
@@ -29,6 +29,7 @@ class FalconConfig(openllm.LLMConfig):
|
||||
"trust_remote_code": True,
|
||||
"requires_gpu": True,
|
||||
"timeout": int(36e6),
|
||||
"use_pipeline": True,
|
||||
"url": "https://falconllm.tii.ae/",
|
||||
"requirements": ["einops", "xformers", "safetensors"],
|
||||
"default_id": "tiiuae/falcon-7b",
|
||||
|
||||
@@ -61,6 +61,7 @@ class FlanT5Config(openllm.LLMConfig):
|
||||
"google/flan-t5-xl",
|
||||
"google/flan-t5-xxl",
|
||||
],
|
||||
"model_type": "seq2seq_lm",
|
||||
}
|
||||
|
||||
class GenerationConfig:
|
||||
|
||||
@@ -47,13 +47,12 @@ class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTN
|
||||
|
||||
def llm_post_init(self):
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
self.load_in_mha = True if not torch.cuda.is_available() else False
|
||||
self.bettertransformer = True if not torch.cuda.is_available() else False
|
||||
|
||||
@property
|
||||
def import_kwargs(self):
|
||||
model_kwds = {
|
||||
"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
|
||||
"load_in_8bit": False,
|
||||
"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
|
||||
}
|
||||
tokenizer_kwds: dict[str, t.Any] = {}
|
||||
|
||||
@@ -47,8 +47,7 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
|
||||
def import_kwargs(self):
|
||||
model_kwds = {
|
||||
"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
|
||||
"load_in_8bit": True if torch.cuda.device_count() > 1 else False,
|
||||
"torch_dtype": torch.float16,
|
||||
"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
|
||||
}
|
||||
tokenizer_kwds = {"padding_side": "left"}
|
||||
return model_kwds, tokenizer_kwds
|
||||
@@ -62,7 +61,6 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
|
||||
**attrs: t.Any,
|
||||
) -> bentoml.Model:
|
||||
torch_dtype = attrs.pop("torch_dtype", torch.float16)
|
||||
load_in_8bit = attrs.pop("load_in_8bit", True)
|
||||
device_map = attrs.pop("device_map", "auto")
|
||||
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, **tokenizer_kwds)
|
||||
@@ -74,7 +72,7 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
|
||||
)
|
||||
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(
|
||||
model_id, torch_dtype=torch_dtype, load_in_8bit=load_in_8bit, device_map=device_map, **attrs
|
||||
model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs
|
||||
)
|
||||
try:
|
||||
return bentoml.transformers.save_model(tag, model, custom_objects={"tokenizer": tokenizer})
|
||||
|
||||
@@ -15,42 +15,38 @@
|
||||
Utility functions for OpenLLM. Users can import these functions for convenience, but
|
||||
we won't ensure backward compatibility for these functions. So use with caution.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from __future__ import annotations as _annotations
|
||||
|
||||
import functools
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import types
|
||||
import typing as t
|
||||
|
||||
from bentoml._internal.configuration import get_debug_mode as get_debug_mode
|
||||
from bentoml._internal.configuration import get_quiet_mode as get_quiet_mode
|
||||
from bentoml._internal.configuration import set_debug_mode as set_debug_mode
|
||||
from bentoml._internal.configuration import set_quiet_mode as set_quiet_mode
|
||||
from bentoml._internal.log import configure_logging as configure_logging
|
||||
from bentoml._internal.log import configure_server_logging as configure_server_logging
|
||||
from bentoml._internal.configuration import (get_debug_mode, get_quiet_mode,
|
||||
set_debug_mode, set_quiet_mode)
|
||||
from bentoml._internal.log import configure_logging, configure_server_logging
|
||||
from bentoml._internal.types import LazyType
|
||||
from bentoml._internal.utils import (LazyLoader, bentoml_cattr,
|
||||
copy_file_to_fs_folder, first_not_none,
|
||||
pkg, reserve_free_port,
|
||||
resolve_user_filepath)
|
||||
|
||||
# NOTE: The following exports useful utils from bentoml
|
||||
from bentoml._internal.utils import LazyLoader as LazyLoader
|
||||
from bentoml._internal.utils import bentoml_cattr as bentoml_cattr
|
||||
from bentoml._internal.utils import copy_file_to_fs_folder as copy_file_to_fs_folder
|
||||
from bentoml._internal.utils import first_not_none as first_not_none
|
||||
from bentoml._internal.utils import pkg as pkg
|
||||
from bentoml._internal.utils import reserve_free_port as reserve_free_port
|
||||
from bentoml._internal.utils import resolve_user_filepath as resolve_user_filepath
|
||||
from .lazy import LazyModule
|
||||
|
||||
from .lazy import LazyModule as LazyModule
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
from typing import GenericAlias as TypingGenericAlias # type: ignore
|
||||
from typing import GenericAlias as _TypingGenericAlias # type: ignore
|
||||
except ImportError:
|
||||
# python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
|
||||
TypingGenericAlias = ()
|
||||
_TypingGenericAlias = ()
|
||||
|
||||
if sys.version_info < (3, 10):
|
||||
WithArgsTypes = (TypingGenericAlias,)
|
||||
_WithArgsTypes = (_TypingGenericAlias,)
|
||||
else:
|
||||
WithArgsTypes: t.Any = (
|
||||
_WithArgsTypes: t.Any = (
|
||||
t._GenericAlias, # type: ignore (_GenericAlias is the actual GenericAlias implementation)
|
||||
types.GenericAlias,
|
||||
types.UnionType,
|
||||
@@ -61,7 +57,7 @@ def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.An
|
||||
try:
|
||||
return isinstance(cls, type) and issubclass(cls, class_or_tuple) # type: ignore[arg-type]
|
||||
except TypeError:
|
||||
if isinstance(cls, WithArgsTypes):
|
||||
if isinstance(cls, _WithArgsTypes):
|
||||
return False
|
||||
raise
|
||||
|
||||
@@ -72,27 +68,25 @@ def gpu_count() -> tuple[int, ...]:
|
||||
return tuple(NvidiaGpuResource.from_system())
|
||||
|
||||
|
||||
# Module-level alias of ``object.__setattr__``; binding it once saves one
# attribute lookup per assignment.
_object_setattr = object.__setattr__


def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None:
    """Assign ``value`` to ``obj.name`` only when ``obj`` has no such attribute yet.

    Nothing happens if ``hasattr(obj, name)`` is already true, so existing
    attributes (including inherited ones) are never overwritten.

    Args:
        obj: The class or instance that should receive the attribute.
        name: Name of the attribute to set.
        value: Value to store when the attribute is missing.
    """
    if hasattr(obj, name):
        return

    if isinstance(obj, type):
        # Classes are updated through the regular ``setattr`` protocol.
        setattr(obj, name, value)
    else:
        # Other objects are written through ``object.__setattr__`` directly,
        # which bypasses any ``__setattr__`` defined by the object's class.
        _object_setattr(obj, name, value)
|
||||
|
||||
|
||||
DEBUG = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get("OPENLLMDEVDEBUG")))
|
||||
|
||||
_extras = {
|
||||
"get_debug_mode": get_debug_mode,
|
||||
"get_quiet_mode": get_quiet_mode,
|
||||
"set_debug_mode": set_debug_mode,
|
||||
"set_quiet_mode": set_quiet_mode,
|
||||
"configure_logging": configure_logging,
|
||||
"configure_server_logging": configure_server_logging,
|
||||
"LazyType": LazyType,
|
||||
"LazyLoader": LazyLoader,
|
||||
"LazyModule": LazyModule,
|
||||
"bentoml_cattr": bentoml_cattr,
|
||||
"copy_file_to_fs_folder": copy_file_to_fs_folder,
|
||||
"first_not_none": first_not_none,
|
||||
"pkg": pkg,
|
||||
"reserve_free_port": reserve_free_port,
|
||||
"resolve_user_filepath": resolve_user_filepath,
|
||||
"lenient_issubclass": lenient_issubclass,
|
||||
"gpu_count": gpu_count,
|
||||
"DEBUG": DEBUG,
|
||||
|
||||
# XXX: define all classes, functions and imports above this line,
|
||||
# since _extras is built from the locals() of this file.
|
||||
_extras: dict[str, t.Any] = {
|
||||
k: v for k, v in locals().items() if not isinstance(v, types.ModuleType) and not k.startswith("_")
|
||||
}
|
||||
|
||||
_import_structure = {
|
||||
@@ -108,23 +102,46 @@ _import_structure = {
|
||||
"is_flax_available",
|
||||
"is_tf_available",
|
||||
"is_torch_available",
|
||||
"is_bitsandbytes_available",
|
||||
"require_backends",
|
||||
],
|
||||
}
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
# NOTE: The following exports useful utils from bentoml
|
||||
from . import LazyLoader as LazyLoader
|
||||
from . import LazyType as LazyType
|
||||
from . import analytics as analytics
|
||||
from . import bentoml_cattr as bentoml_cattr
|
||||
from . import codegen as codegen
|
||||
from . import configure_logging as configure_logging
|
||||
from . import configure_server_logging as configure_server_logging
|
||||
from . import copy_file_to_fs_folder as copy_file_to_fs_folder
|
||||
from . import dantic as dantic
|
||||
from . import first_not_none as first_not_none
|
||||
from . import get_debug_mode as get_debug_mode
|
||||
from . import get_quiet_mode as get_quiet_mode
|
||||
from . import gpu_count as gpu_count
|
||||
from . import lenient_issubclass as lenient_issubclass
|
||||
from . import non_intrusive_setattr as non_intrusive_setattr
|
||||
from . import pkg as pkg
|
||||
from . import reserve_free_port as reserve_free_port
|
||||
from . import resolve_user_filepath as resolve_user_filepath
|
||||
from . import set_debug_mode as set_debug_mode
|
||||
from . import set_quiet_mode as set_quiet_mode
|
||||
from .import_utils import ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES
|
||||
from .import_utils import DummyMetaclass as DummyMetaclass
|
||||
from .import_utils import ModelEnv as ModelEnv
|
||||
from .import_utils import is_cpm_kernels_available as is_cpm_kernels_available
|
||||
from .import_utils import \
|
||||
is_bitsandbytes_available as is_bitsandbytes_available
|
||||
from .import_utils import \
|
||||
is_cpm_kernels_available as is_cpm_kernels_available
|
||||
from .import_utils import is_einops_available as is_einops_available
|
||||
from .import_utils import is_flax_available as is_flax_available
|
||||
from .import_utils import is_tf_available as is_tf_available
|
||||
from .import_utils import is_torch_available as is_torch_available
|
||||
from .import_utils import require_backends as require_backends
|
||||
from .lazy import LazyModule as LazyModule
|
||||
else:
|
||||
import sys
|
||||
|
||||
|
||||
@@ -81,7 +81,7 @@ class StartInitEvent(_internal_analytics.schemas.EventMeta):
|
||||
|
||||
@staticmethod
|
||||
def handler(llm_config: openllm.LLMConfig) -> StartInitEvent:
|
||||
return StartInitEvent(model_name=llm_config.__openllm_model_name__, llm_config=llm_config.model_dump())
|
||||
return StartInitEvent(model_name=llm_config["model_name"], llm_config=llm_config.model_dump())
|
||||
|
||||
|
||||
def track_start_init(
|
||||
|
||||
@@ -61,6 +61,7 @@ _tf_available = importlib.util.find_spec("tensorflow") is not None
|
||||
_flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None
|
||||
_einops_available = _is_package_available("einops")
|
||||
_cpm_kernel_available = _is_package_available("cpm_kernels")
|
||||
_bitsandbytes_available = _is_package_available("bitsandbytes")
|
||||
|
||||
|
||||
def is_einops_available():
|
||||
@@ -71,6 +72,10 @@ def is_cpm_kernels_available():
|
||||
return _cpm_kernel_available
|
||||
|
||||
|
||||
def is_bitsandbytes_available():
    """Return the module-level ``_bitsandbytes_available`` flag.

    The flag is computed elsewhere in this module; this accessor only
    exposes its current value.
    """
    return _bitsandbytes_available
|
||||
|
||||
|
||||
def is_torch_available():
|
||||
global _torch_available
|
||||
if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
|
||||
|
||||
@@ -37,6 +37,9 @@ def model_settings(draw: st.DrawFn):
|
||||
requires_gpu=st.booleans(),
|
||||
trust_remote_code=st.booleans(),
|
||||
requirements=st.none() | st.lists(st.text(), min_size=1),
|
||||
use_pipeline=st.booleans(),
|
||||
model_type=st.sampled_from(["causal_lm", "seq2seq_lm"]),
|
||||
runtime=st.sampled_from(["transformers", "cpp"]),
|
||||
name_type=st.sampled_from(["dasherize", "lowercase"]),
|
||||
timeout=st.integers(min_value=3600),
|
||||
workers_per_resource=st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)),
|
||||
|
||||
@@ -23,7 +23,8 @@ from hypothesis import assume, given
|
||||
from hypothesis import strategies as st
|
||||
|
||||
import openllm
|
||||
from openllm._configuration import GenerationConfig, ModelSettings, _field_env_key
|
||||
from openllm._configuration import (GenerationConfig, ModelSettings,
|
||||
_field_env_key)
|
||||
from openllm.utils import DEBUG
|
||||
|
||||
from ._strategies._configuration import make_llm_config, model_settings
|
||||
@@ -67,7 +68,7 @@ def test_forbidden_access():
|
||||
|
||||
@given(model_settings())
|
||||
def test_class_normal_gen(gen_settings: ModelSettings):
|
||||
assume(gen_settings["default_id"] and gen_settings["model_ids"])
|
||||
assume(gen_settings["default_id"] and all(i for i in gen_settings["model_ids"]))
|
||||
cl_: type[openllm.LLMConfig] = make_llm_config("NotFullLLM", gen_settings)
|
||||
assert issubclass(cl_, openllm.LLMConfig)
|
||||
for key in gen_settings:
|
||||
|
||||
@@ -3,11 +3,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from markdown_it import MarkdownIt
|
||||
|
||||
import openllm
|
||||
|
||||
md = MarkdownIt()
|
||||
|
||||
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
@@ -17,7 +16,7 @@ with open(os.path.join(ROOT, "README.md"), "r") as f:
|
||||
# NOTE: Currently, we only have one table in README, which is the Model readme.
|
||||
table = [r for r in readme if r.type == "html_block" and r.content.startswith("<td><a")]
|
||||
|
||||
available = len(openllm.CONFIG_MAPPING.keys())
|
||||
available = subprocess.check_output(["openllm", "models", "-o", "porcelain"]).strip().decode("utf-8").count("\n") + 1
|
||||
|
||||
on_table = len(table) # NOTE: minus the header
|
||||
|
||||
|
||||
@@ -31,9 +31,9 @@ FLAN_T5_DEPS = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
|
||||
OPENAI_DEPS = ["openai", "tiktoken"]
|
||||
|
||||
_base_requirements = {
|
||||
inflection.dasherize(name): config.__openllm_requirements__
|
||||
for name, config in openllm.CONFIG_MAPPING.items()
|
||||
if config.__openllm_requirements__
|
||||
inflection.dasherize(name): config_cls.__openllm_requirements__
|
||||
for name, config_cls in openllm.CONFIG_MAPPING.items()
|
||||
if config_cls.__openllm_requirements__
|
||||
}
|
||||
|
||||
# NOTE: update this table when adding new external dependencies
|
||||
|
||||
@@ -47,13 +47,13 @@ def main() -> int:
|
||||
"Model Ids": [],
|
||||
}
|
||||
max_install_len_div = 0
|
||||
for name, config in openllm.CONFIG_MAPPING.items():
|
||||
for name, config_cls in openllm.CONFIG_MAPPING.items():
|
||||
dashed = inflection.dasherize(name)
|
||||
formatted["Model"].append(dashed)
|
||||
formatted["URL"].append(config.__openllm_url__)
|
||||
formatted["URL"].append(config_cls.__openllm_url__)
|
||||
formatted["GPU"].append("✅")
|
||||
formatted["CPU"].append("✅" if not config.__openllm_requires_gpu__ else "❌")
|
||||
formatted["Model Ids"].append(config.__openllm_model_ids__)
|
||||
formatted["CPU"].append("✅" if not config_cls.__openllm_requires_gpu__ else "❌")
|
||||
formatted["Model Ids"].append(config_cls.__openllm_model_ids__)
|
||||
if dashed in deps:
|
||||
instruction = f'```bash\npip install "openllm[{dashed}]"\n```'
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user