refactor: focus (#730)

* perf: remove base images

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: update changelog

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: move Dockerfile build to run on release only

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: cleanup unused types

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Author: Aaron Pham
Date: 2023-11-24 01:11:31 -05:00
Committed by: GitHub
Parent: 52a44b1bfa
Commit: aab173cd99
19 changed files with 168 additions and 679 deletions

View File

@@ -20,6 +20,11 @@ on:
- 'openllm-core/src/openllm_core/**'
- 'openllm-client/src/openllm_client/**'
types: [labeled, opened, synchronize, reopened]
workflow_call:
inputs:
tags:
required: true
type: string
env:
LINES: 120
COLUMNS: 120
@@ -97,7 +102,8 @@ jobs:
steps:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # ratchet:actions/checkout@v4.1.1
with:
fetch-depth: 1
fetch-depth: 0
ref: '${{ inputs.tags }}'
- name: Inject slug/short variables
uses: rlespinasse/github-slug-action@102b1a064a9b145e56556e22b18b19c624538d94 # ratchet:rlespinasse/github-slug-action@v4.4.1
- name: Set up QEMU
@@ -121,25 +127,6 @@ jobs:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Login to public ECR
uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # ratchet:docker/login-action@v3.0.0
with:
registry: public.ecr.aws
username: ${{ secrets.AWS_ACCESS_KEY_ID }}
password: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
env:
AWS_REGION: us-east-1
- name: Extract metadata tags and labels on PRs
if: github.event_name == 'pull_request'
id: meta-pr
uses: docker/metadata-action@96383f45573cb7f253c731d3b3ab81c87ef81934 # ratchet:docker/metadata-action@v5.0.0
with:
images: |
public.ecr.aws/y5w8i4y6/bentoml/openllm
tags: |
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
labels: |
org.opencontainers.image.source="https://github.com/bentoml/OpenLLM"
- name: Extract metadata tags and labels for main, release or tag
if: github.event_name != 'pull_request'
id: meta
@@ -148,7 +135,6 @@ jobs:
flavor: |
latest=auto
images: |
public.ecr.aws/y5w8i4y6/bentoml/openllm
ghcr.io/bentoml/openllm
tags: |
type=semver,pattern={{version}}
@@ -166,7 +152,7 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PROD_SECRET_ACCESS_KEY }}
with:
context: .
file: openllm-python/src/openllm/bundle/oci/Dockerfile
file: Dockerfile
push: true
platforms: 'linux/amd64'
build-args: |

View File

@@ -117,11 +117,21 @@ jobs:
uses: pypa/gh-action-pypi-publish@b7f401de30cb6434a1e19f805ff006643653240e # ratchet:pypa/gh-action-pypi-publish@release/v1
with:
print-hash: true
publish-docker-images:
if: github.repository_owner == 'bentoml'
needs:
- release
- publish-python
name: Publish new base Docker images on GHCR
uses: bentoml/OpenLLM/.github/workflows/build.yml@main # ratchet:exclude
with:
tags: ${{ needs.release.outputs.version }}
prepare-next-dev-cycle:
needs:
- release
- publish-python
- binary-distribution
- publish-docker-images
runs-on: ubuntu-latest
permissions:
contents: write

changelog.d/730.change.md (new file, 12 lines)
View File

@@ -0,0 +1,12 @@
We will deprecate support for the PyTorch backend and will enforce that all
built Bentos use the vLLM backend going forward. This means that `openllm build`
with `--backend pt` is now deprecated in favour of `--backend vllm`.
We will focus more on contributing upstream to vLLM and will ensure that the core
value of OpenLLM remains providing a flexible and streamlined experience to bring these
models to production with ease.
The PyTorch backend will be removed from the 0.5.0 release onwards.
The Docker images will now only be available on GHCR and no longer on ECR, as a measure
to reduce cost and maintenance on our side.
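
For context, the deprecation surfaces at build time as a warning plus a forced backend switch. A minimal sketch (not part of this commit, mirroring the guard added to `build_command` further down in this diff; the fallback to `'vllm'` when no backend is given is an assumption):

```python
import logging

logger = logging.getLogger('openllm')

def resolve_backend(backend):
    # 'pt' is still accepted on the CLI but rewritten to 'vllm' with a warning.
    if backend == 'pt':
        logger.warning(
            "PyTorch backend is deprecated and will be removed from the next releases. "
            "Will set default backend to 'vllm' instead."
        )
        return 'vllm'
    return backend or 'vllm'  # assumption: vLLM is the default when unspecified
```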

View File

@@ -40,14 +40,7 @@ LiteralBackend = t.Literal['pt', 'vllm', 'ctranslate', 'triton'] # TODO: ggml
AdapterType = t.Literal[
'lora', 'adalora', 'adaption_prompt', 'prefix_tuning', 'p_tuning', 'prompt_tuning', 'ia3', 'loha', 'lokr'
]
# TODO: support quay
LiteralContainerRegistry = t.Literal['docker', 'gh', 'ecr']
LiteralContainerVersionStrategy = t.Literal['release', 'nightly', 'latest', 'custom']
LiteralResourceSpec = t.Literal['cloud-tpus.google.com/v2', 'amd.com/gpu', 'nvidia.com/gpu', 'cpu']
InferenceReturnType = t.Literal['text', 'object', 'token']
LiteralVersionStrategy = t.Literal['release', 'nightly', 'latest', 'custom']
if sys.version_info[:2] >= (3, 11):
from typing import (

View File

@@ -1,30 +1,17 @@
import logging as _logging
import os as _os
import pathlib as _pathlib
import warnings as _warnings
import logging as _logging, os as _os, pathlib as _pathlib, warnings as _warnings
from openllm_cli import _sdk
from . import utils as utils
if utils.DEBUG:
utils.set_debug_mode(True)
_logging.basicConfig(level=_logging.NOTSET)
utils.set_debug_mode(True); _logging.basicConfig(level=_logging.NOTSET)
else:
# configuration for bitsandbytes before import
_os.environ['BITSANDBYTES_NOWELCOME'] = _os.environ.get('BITSANDBYTES_NOWELCOME', '1')
# NOTE: The following warnings from bitsandbytes, and probably not that important for users to see when DEBUG is False
_warnings.filterwarnings(
'ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization'
)
_warnings.filterwarnings(
'ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization'
)
_warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
_warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
_warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
# NOTE: ignore the following warning from ghapi as it is not important for users
_warnings.filterwarnings(
'ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated'
)
_warnings.filterwarnings('ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated')
COMPILED = _pathlib.Path(__file__).suffix in ('.pyd', '.so')

View File

@@ -1,4 +1 @@
if __name__ == '__main__':
from openllm_cli.entrypoint import cli
cli()
if __name__ == '__main__': from openllm_cli.entrypoint import cli; cli()

View File

@@ -1,7 +1,5 @@
import transformers
def prepare_logits_processor(config):
import transformers
generation_config = config.generation_config
logits_processor = transformers.LogitsProcessorList()
if generation_config['temperature'] >= 1e-5 and generation_config['temperature'] != 1.0:
@@ -13,28 +11,16 @@ def prepare_logits_processor(config):
if generation_config['top_k'] > 0:
logits_processor.append(transformers.TopKLogitsWarper(generation_config['top_k']))
return logits_processor
# NOTE: The ordering here is important. Some models have two of these and we have a preference for which value gets used.
SEQLEN_KEYS = ['max_sequence_length', 'seq_length', 'max_position_embeddings', 'max_seq_len', 'model_max_length']
def get_context_length(config):
rope_scaling = getattr(config, 'rope_scaling', None)
rope_scaling_factor = config.rope_scaling['factor'] if rope_scaling else 1.0
for key in SEQLEN_KEYS:
if getattr(config, key, None) is not None:
return int(rope_scaling_factor * getattr(config, key))
if getattr(config, key, None) is not None: return int(rope_scaling_factor * getattr(config, key))
return 2048
def is_sentence_complete(output):
return output.endswith(('.', '?', '!', '...', '。', '？', '！', '…', '"', "'", '」'))
def is_sentence_complete(output): return output.endswith(('.', '?', '!', '...', '。', '？', '！', '…', '"', "'", '」'))
def is_partial_stop(output, stop_str):
'''Check whether the output contains a partial stop str.'''
for i in range(min(len(output), len(stop_str))):
if stop_str.startswith(output[-i:]):
return True
if stop_str.startswith(output[-i:]): return True
return False
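
A quick worked example of `get_context_length`, assuming the function above is in scope and using a hypothetical config object as a stand-in for a `transformers` `PretrainedConfig` (not part of this commit):

```python
class _Cfg:
    # hypothetical config: rope scaling factor 2.0 on a 4096-token base window
    rope_scaling = {'factor': 2.0}
    max_position_embeddings = 4096  # matched via SEQLEN_KEYS

assert get_context_length(_Cfg()) == 8192                   # 2.0 * 4096
assert get_context_length(type('Empty', (), {})()) == 2048  # no known key -> default
```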

View File

@@ -184,7 +184,7 @@ class LLM(t.Generic[M, T], ReprMixin):
):
torch_dtype = attrs.pop('torch_dtype', None) # backward compatible
if torch_dtype is not None:
warnings.warns(
warnings.warn(
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',
DeprecationWarning,
stacklevel=3,
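
The one-character fix matters because the standard library exposes `warnings.warn`, not `warnings.warns`, so the old spelling raised `AttributeError` before the deprecation notice could ever be emitted. The corrected call, standalone:

```python
import warnings

warnings.warn(
    'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',
    DeprecationWarning,
    stacklevel=3,  # attribute the warning to the caller of LLM(...), not this frame
)
```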

View File

@@ -1,14 +1,39 @@
import os
from openllm_core.utils import LazyModule
from __future__ import annotations
import os, attr, functools
from openllm_core._typing_compat import LiteralContainerVersionStrategy
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils.lazy import VersionInfo, LazyModule
_OWNER, _REPO = 'bentoml', 'openllm'
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
git_hash: str = attr.field()
version: VersionInfo = attr.field(converter=lambda s: VersionInfo.from_version_string(s))
strategy: LiteralContainerVersionStrategy = attr.field()
@classmethod
@functools.lru_cache(maxsize=64)
def from_strategy(cls, strategy_or_version: LiteralContainerVersionStrategy | None = None) -> RefResolver:
# using default strategy
if strategy_or_version is None or strategy_or_version == 'release':
try:
from ghapi.all import GhApi
ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
meta = ghapi.repos.get_latest_release()
git_hash = ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha']
except Exception as err:
raise OpenLLMException('Failed to determine latest release version.') from err
return cls(git_hash, meta['name'].lstrip('v'), 'release')
elif strategy_or_version in ('latest', 'nightly'): # latest is nightly
return cls('latest', '0.0.0', 'latest')
else:
raise ValueError(f'Unknown strategy: {strategy_or_version}')
@property
def tag(self) -> str: return 'latest' if self.strategy in {'latest', 'nightly'} else repr(self.version)
__lazy = LazyModule(
__name__,
os.path.abspath('__file__'),
{
'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'],
'oci': ['CONTAINER_NAMES', 'supported_registries', 'RefResolver'],
},
{'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options']},
extra_objects={'RefResolver': RefResolver}
)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
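
A usage sketch for the relocated `RefResolver` (the `openllm.bundle` import path is inferred from the `LazyModule`/`extra_objects` above, and the release lookup assumes network access to the public GitHub API):

```python
from openllm.bundle import RefResolver  # path assumed from the LazyModule above

release = RefResolver.from_strategy('release')   # queries bentoml/openllm's latest GitHub release
print(release.version, release.git_hash)         # parsed VersionInfo plus the release tag's commit SHA
print(release.tag)                               # repr(version) for release builds

nightly = RefResolver.from_strategy('nightly')   # 'latest'/'nightly' skip the API call entirely
print(nightly.tag)                               # 'latest'
```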

View File

@@ -2,7 +2,7 @@ from typing import Optional
import attr
from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
from openllm_core._typing_compat import LiteralContainerVersionStrategy
from openllm_core.utils.lazy import VersionInfo
from . import _package as _package, oci as oci
@@ -13,9 +13,6 @@ from ._package import (
create_bento as create_bento,
)
CONTAINER_NAMES: dict[LiteralContainerRegistry, str] = ...
supported_registries: list[str] = ...
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
git_hash: str
@@ -26,7 +23,3 @@ class RefResolver:
def from_strategy(cls, strategy_or_version: Optional[LiteralContainerVersionStrategy] = ...) -> RefResolver: ...
@property
def tag(self) -> str: ...
@staticmethod
def construct_base_image(
reg: LiteralContainerRegistry, strategy: Optional[LiteralContainerVersionStrategy] = ...
) -> str: ...

View File

@@ -14,60 +14,43 @@ from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions
from bentoml._internal.configuration.containers import BentoMLContainer
from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg
from . import oci
logger = logging.getLogger(__name__)
OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
_service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
_SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}'''
def build_editable(path, package='openllm'):
'''Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.'''
if not check_bool_env(OPENLLM_DEV_BUILD, default=False):
return None
if not check_bool_env(OPENLLM_DEV_BUILD, default=False): return None
# We need to build the package in editable mode, so that we can import it
# TODO: Upgrade to 1.0.3
from build import ProjectBuilder
from build.env import IsolatedEnvBuilder
module_location = pkg.source_locations(package)
if not module_location:
raise RuntimeError(
'Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.'
)
if not module_location: raise RuntimeError('Could not find the source location of OpenLLM.')
pyproject_path = Path(module_location).parent.parent / 'pyproject.toml'
if os.path.isfile(pyproject_path.__fspath__()):
logger.info('Generating built wheels for package %s...', package)
with IsolatedEnvBuilder() as env:
builder = ProjectBuilder(pyproject_path.parent)
builder.python_executable = env.executable
builder.scripts_dir = env.scripts_dir
env.install(builder.build_system_requires)
return builder.build('wheel', path, config_settings={'--global-option': '--quiet'})
raise RuntimeError(
'Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.'
)
raise RuntimeError('Please install OpenLLM from PyPI or build it from Git source.')
def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None):
packages = ['scipy', 'bentoml[tracing]>=1.1.10'] # apparently bnb misses this one
if adapter_map is not None:
packages += ['openllm[fine-tune]']
if extra_dependencies is not None:
packages += [f'openllm[{k}]' for k in extra_dependencies]
if llm.config['requirements'] is not None:
packages.extend(llm.config['requirements'])
wheels = None
from . import RefResolver
packages = ['scipy', 'bentoml[tracing]>=1.1.10', 'vllm==0.2.2', 'ray==2.6.0', f'openllm>={RefResolver.from_strategy("release").version}'] # apparently bnb misses this one
if adapter_map is not None: packages += ['openllm[fine-tune]']
if extra_dependencies is not None: packages += [f'openllm[{k}]' for k in extra_dependencies]
if llm.config['requirements'] is not None: packages.extend(llm.config['requirements'])
built_wheels = [build_editable(llm_fs.getsyspath('/'), p) for p in ('openllm_core', 'openllm_client', 'openllm')]
if all(i for i in built_wheels):
wheels = [llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in built_wheels]
return PythonOptions(packages=packages, wheels=wheels, lock_packages=True)
def construct_docker_options(
llm, _, quantize, adapter_map, dockerfile_template, serialisation, container_registry, container_version_strategy
):
return PythonOptions(
packages=packages,
wheels=[llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in built_wheels] if all(i for i in built_wheels) else None,
lock_packages=True
)
def construct_docker_options(llm, _, quantize, adapter_map, dockerfile_template, serialisation):
from openllm_cli.entrypoint import process_environ
environ = process_environ(
llm.config,
llm.config['timeout'],
@@ -84,37 +67,7 @@ def construct_docker_options(
environ['OPENLLM_CONFIG'] = f"'{environ['OPENLLM_CONFIG']}'"
environ.pop('BENTOML_HOME', None) # NOTE: irrelevant in container
environ['NVIDIA_DRIVER_CAPABILITIES'] = 'compute,utility'
return DockerOptions(
base_image=oci.RefResolver.construct_base_image(container_registry, container_version_strategy),
env=environ,
dockerfile_template=dockerfile_template,
)
_service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
_SERVICE_VARS = '''\
import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}
'''
def write_service(llm, llm_fs, adapter_map):
logger.debug('Generating service vars %s (dir=%s)', llm.model_id, llm_fs.getsyspath('/'))
script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n# fmt: off\n" + _SERVICE_VARS.format(
__model_id__=llm.model_id,
__model_tag__=str(llm.tag),
__model_adapter_map__=orjson.dumps(adapter_map).decode(),
__model_serialization__=llm.config['serialisation'],
__model_trust_remote_code__=str(llm.trust_remote_code),
)
if SHOW_CODEGEN:
logger.info('Generated _service_vars.py:\n%s', script)
llm_fs.writetext('_service_vars.py', script)
with open(_service_file.__fspath__(), 'r') as f:
service_src = f.read()
llm_fs.writetext(llm.config['service_name'], service_src)
return DockerOptions(cuda_version='12.1', env=environ, dockerfile_template=dockerfile_template)
@inject
def create_bento(
bento_tag,
@@ -125,8 +78,6 @@ def create_bento(
adapter_map=None,
extra_dependencies=None,
serialisation=None,
container_registry='ecr',
container_version_strategy='release',
_bento_store=Provide[BentoMLContainer.bento_store],
_model_store=Provide[BentoMLContainer.model_store],
):
@@ -145,11 +96,21 @@ def create_bento(
},
}
)
if adapter_map:
labels.update(adapter_map)
if adapter_map: labels.update(adapter_map)
logger.debug("Building Bento '%s' with model backend '%s'", bento_tag, llm.__llm_backend__)
# add service.py definition to this temporary folder
write_service(llm, llm_fs, adapter_map)
logger.debug('Generating service vars %s (dir=%s)', llm.model_id, llm_fs.getsyspath('/'))
script = f"# fmt: off\n# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n" + _SERVICE_VARS.format(
__model_id__=llm.model_id,
__model_tag__=str(llm.tag),
__model_adapter_map__=orjson.dumps(adapter_map).decode(),
__model_serialization__=llm.config['serialisation'],
__model_trust_remote_code__=str(llm.trust_remote_code),
)
if SHOW_CODEGEN: logger.info('Generated _service_vars.py:\n%s', script)
llm_fs.writetext('_service_vars.py', script)
with open(_service_file.__fspath__(), 'r') as f: service_src = f.read()
llm_fs.writetext(llm.config['service_name'], service_src)
bento = bentoml.Bento.create(
version=bento_tag.version,
@@ -163,16 +124,7 @@ def create_bento(
include=list(llm_fs.walk.files()),
exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
docker=construct_docker_options(
llm,
llm_fs,
quantize,
adapter_map,
dockerfile_template,
_serialisation,
container_registry,
container_version_strategy,
),
docker=construct_docker_options(llm, llm_fs, quantize, adapter_map, dockerfile_template, _serialisation),
),
)
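
With the registry knobs gone, the Docker layer of a Bento is now just a pinned CUDA version plus environment variables rather than a registry-specific base image. A hedged sketch of the narrowed call (the `llm` and `llm_fs` objects are assumed to already exist):

```python
docker = construct_docker_options(
    llm, llm_fs,    # second argument is ignored by the new signature
    None,           # quantize
    None,           # adapter_map
    None,           # dockerfile_template
    'safetensors',  # serialisation
)
# docker is a DockerOptions(cuda_version='12.1', env=..., dockerfile_template=None);
# no call to oci.RefResolver.construct_base_image is made anymore.
```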

View File

@@ -8,8 +8,6 @@ from bentoml._internal.bento import BentoStore
from bentoml._internal.bento.build_config import DockerOptions, PythonOptions
from bentoml._internal.models.model import ModelStore
from openllm_core._typing_compat import (
LiteralContainerRegistry,
LiteralContainerVersionStrategy,
LiteralQuantise,
LiteralSerialisation,
M,
@@ -32,10 +30,7 @@ def construct_docker_options(
adapter_map: Optional[Dict[str, str]],
dockerfile_template: Optional[str],
serialisation: LiteralSerialisation,
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy,
) -> DockerOptions: ...
def write_service(llm: LLM[M, T], llm_fs: FS, adapter_map: Optional[Dict[str, str]]) -> None: ...
def create_bento(
bento_tag: Tag,
llm_fs: FS,
@@ -45,8 +40,6 @@ def create_bento(
adapter_map: Optional[Dict[str, str]] = ...,
extra_dependencies: Optional[Tuple[str, ...]] = ...,
serialisation: Optional[LiteralSerialisation] = ...,
container_registry: LiteralContainerRegistry = ...,
container_version_strategy: LiteralContainerVersionStrategy = ...,
_bento_store: BentoStore = ...,
_model_store: ModelStore = ...,
) -> Bento: ...

View File

@@ -1,82 +0,0 @@
from __future__ import annotations
import functools
import importlib
import logging
import os
import pathlib
import attr
from openllm_core._typing_compat import LiteralContainerVersionStrategy
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils.lazy import VersionInfo
logger = logging.getLogger(__name__)
ROOT_DIR = pathlib.Path(os.path.abspath('__file__')).parent.parent.parent
_CONTAINER_REGISTRY = {
'docker': 'docker.io/bentoml/openllm',
'gh': 'ghcr.io/bentoml/openllm',
'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm',
}
# TODO: support custom fork. Currently it only support openllm main.
_OWNER, _REPO = 'bentoml', 'openllm'
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
git_hash: str = attr.field()
version: VersionInfo = attr.field(converter=lambda s: VersionInfo.from_version_string(s))
strategy: LiteralContainerVersionStrategy = attr.field()
@classmethod
@functools.lru_cache(maxsize=64)
def from_strategy(cls, strategy_or_version=None):
# using default strategy
if strategy_or_version is None or strategy_or_version == 'release':
try:
from ghapi.all import GhApi
ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
meta = ghapi.repos.get_latest_release()
git_hash = ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha']
except Exception as err:
raise OpenLLMException('Failed to determine latest release version.') from err
return cls(git_hash=git_hash, version=meta['name'].lstrip('v'), strategy='release')
elif strategy_or_version in ('latest', 'nightly'): # latest is nightly
return cls(git_hash='latest', version='0.0.0', strategy='latest')
else:
raise ValueError(f'Unknown strategy: {strategy_or_version}')
@property
def tag(self):
return 'latest' if self.strategy in {'latest', 'nightly'} else repr(self.version)
@staticmethod
def construct_base_image(reg, strategy=None):
if reg == 'gh':
logger.warning("Setting base registry to 'gh' will affect cold start performance on GCP/AWS.")
elif reg == 'docker':
logger.warning('docker is base image is yet to be supported. Falling back to "ecr".')
reg = 'ecr'
return f'{_CONTAINER_REGISTRY[reg]}:{RefResolver.from_strategy(strategy).tag}'
__all__ = ['CONTAINER_NAMES', 'RefResolver', 'supported_registries']
def __dir__():
return sorted(__all__)
def __getattr__(name):
if name == 'supported_registries':
return functools.lru_cache(1)(lambda: list(_CONTAINER_REGISTRY))()
elif name == 'CONTAINER_NAMES':
return _CONTAINER_REGISTRY
elif name in __all__:
return importlib.import_module('.' + name, __name__)
else:
raise AttributeError(f'{name} does not exists under {__name__}')

View File

@@ -1,17 +1,9 @@
from __future__ import annotations
import functools
import logging
import os
import typing as t
import click
import click_option_group as cog
import inflection
import functools, logging, os, typing as t
import bentoml, openllm, click, inflection, click_option_group as cog
from bentoml_cli.utils import BentoMLCommandGroup
from click import shell_completion as sc
import bentoml
import openllm
from openllm_core._configuration import LLMConfig
from openllm_core._typing_compat import (
Concatenate,
@@ -23,7 +15,6 @@ from openllm_core._typing_compat import (
)
from openllm_core.utils import DEBUG, compose, dantic, resolve_user_filepath
class _OpenLLM_GenericInternalConfig(LLMConfig):
__config__ = {
'name_type': 'lowercase',
@@ -38,7 +29,6 @@ class _OpenLLM_GenericInternalConfig(LLMConfig):
temperature: float = 0.75
max_new_tokens: int = 128
logger = logging.getLogger(__name__)
P = ParamSpec('P')
@@ -47,7 +37,6 @@ LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
_AnyCallable = t.Callable[..., t.Any]
FC = t.TypeVar('FC', bound=t.Union[_AnyCallable, click.Command])
def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
return [
sc.CompletionItem(str(it.tag), help='Bento')
@@ -55,7 +44,6 @@ def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete
if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})
]
def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
return [
sc.CompletionItem(inflection.dasherize(it), help='Model')
@@ -63,7 +51,6 @@ def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete
if it.startswith(incomplete)
]
def parse_config_options(
config: LLMConfig,
server_timeout: int,
@@ -132,60 +119,52 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...
return None
def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
def wrapper(fn: FC) -> t.Callable[[FC], FC]:
composed = compose(
_OpenLLM_GenericInternalConfig.parse,
_http_server_args if not serve_grpc else _grpc_server_args,
cog.optgroup.group('General LLM Options', help='The following options are related to running LLM Server.'),
dtype_option(factory=cog.optgroup),
model_version_option(factory=cog.optgroup),
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
workers_per_resource_option(factory=cog.optgroup),
cors_option(factory=cog.optgroup),
backend_option(factory=cog.optgroup),
cog.optgroup.group(
'LLM Optimization Options',
help='''Optimization related options.
def start_decorator(fn: FC) -> FC:
composed = compose(
_OpenLLM_GenericInternalConfig.parse,
_http_server_args,
cog.optgroup.group('General LLM Options', help='The following options are related to running LLM Server.'),
dtype_option(factory=cog.optgroup),
model_version_option(factory=cog.optgroup),
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
workers_per_resource_option(factory=cog.optgroup),
cors_option(factory=cog.optgroup),
backend_option(factory=cog.optgroup),
cog.optgroup.group(
'LLM Optimization Options',
help='''Optimization related options.
OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
The following are either in our roadmap or currently being worked on:
The following are either in our roadmap or currently being worked on:
- DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
''',
),
quantize_option(factory=cog.optgroup),
serialisation_option(factory=cog.optgroup),
cog.optgroup.option(
'--device',
type=dantic.CUDA,
multiple=True,
envvar='CUDA_VISIBLE_DEVICES',
callback=parse_device_callback,
help='Assign GPU devices (if available)',
show_envvar=True,
),
adapter_id_option(factory=cog.optgroup),
click.option('--return-process', is_flag=True, default=False, help='Internal use only.', hidden=True),
)
return composed(fn)
- DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
''',
),
quantize_option(factory=cog.optgroup),
serialisation_option(factory=cog.optgroup),
cog.optgroup.option(
'--device',
type=dantic.CUDA,
multiple=True,
envvar='CUDA_VISIBLE_DEVICES',
callback=parse_device_callback,
help='Assign GPU devices (if available)',
show_envvar=True,
),
adapter_id_option(factory=cog.optgroup),
click.option('--return-process', is_flag=True, default=False, help='Internal use only.', hidden=True),
)
return wrapper
return composed(fn)
def parse_device_callback(
ctx: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None
) -> t.Tuple[str, ...] | None:
def parse_device_callback(_: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
if value is None:
return value
if not isinstance(value, tuple):
ctx.fail(f'{param} only accept multiple values, not {type(value)} (value: {value})')
el: t.Tuple[str, ...] = tuple(i for k in value for i in k)
# NOTE: --device all is a special case
if len(el) == 1 and el[0] == 'all':
return tuple(map(str, openllm.utils.available_devices()))
if len(el) == 1 and el[0] == 'all': return tuple(map(str, openllm.utils.available_devices()))
return el
@@ -195,18 +174,12 @@ def parse_device_callback(
_IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}
def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
'''Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`.'''
def parse_serve_args() -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
from bentoml_cli.cli import cli
command = 'serve' if not serve_grpc else 'serve-grpc'
group = cog.optgroup.group(
f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options",
help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",
)
group = cog.optgroup.group('Start a HTTP server options', help='Related to serving the model [synonymous to `bentoml serve-http`]')
def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]:
serve_command = cli.commands[command]
serve_command = cli.commands['serve']
# The first variable is the argument bento
# The last five is from BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS
serve_options = [
@@ -225,12 +198,9 @@ def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]
param_decls = (*attrs.pop('opts'), *attrs.pop('secondary_opts'))
f = cog.optgroup.option(*param_decls, **attrs)(f)
return group(f)
return decorator
_http_server_args, _grpc_server_args = parse_serve_args(False), parse_serve_args(True)
_http_server_args = parse_serve_args()
def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]:
'''General ``@click`` decorator with some sauce.
@@ -278,11 +248,9 @@ def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC
**attrs,
)(f)
def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f)
def dtype_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--dtype',
@@ -293,7 +261,6 @@ def dtype_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[F
**attrs,
)(f)
def model_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--model-id',
@@ -327,7 +294,6 @@ def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[
**attrs,
)(f)
def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_argument(
'model_name',
@@ -336,7 +302,6 @@ def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **
**attrs,
)(f)
def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--quantise',
@@ -405,7 +370,6 @@ def workers_per_resource_option(
**attrs,
)(f)
def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--serialisation',
@@ -429,25 +393,8 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
**attrs,
)(f)
def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--container-registry',
'container_registry',
type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
default='ecr',
show_default=True,
show_envvar=True,
envvar='OPENLLM_CONTAINER_REGISTRY',
callback=container_registry_callback,
help='The default container registry to get the base image for building BentoLLM. Currently, it supports ecr, ghcr, docker',
**attrs,
)(f)
_wpr_strategies = {'round_robin', 'conserved'}
def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None:
if value is None:
return value
@@ -465,11 +412,3 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va
) from None
else:
return value
def container_registry_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None:
if value is None:
return value
if value not in openllm.bundle.supported_registries:
raise click.BadParameter(f'Value must be one of {openllm.bundle.supported_registries}', ctx, param)
return value
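
Since gRPC serving is removed, `start_decorator` is now applied bare instead of being called as a factory with `serve_grpc=...`. A hedged sketch of the new shape (import path assumed from this diff; the command body is elided):

```python
import click
from openllm_cli._factory import start_decorator  # path assumed from this diff

@click.command('start')
@start_decorator  # previously: @start_decorator(serve_grpc=False)
def start_command(model_id: str, **attrs):
    ...
```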

View File

@@ -20,13 +20,7 @@ from openllm_core.utils import WARNING_ENV_VAR, codegen, first_not_none, get_dis
if t.TYPE_CHECKING:
from bentoml._internal.bento import BentoStore
from openllm_core._configuration import LLMConfig
from openllm_core._typing_compat import (
LiteralBackend,
LiteralContainerRegistry,
LiteralContainerVersionStrategy,
LiteralQuantise,
LiteralString,
)
from openllm_core._typing_compat import LiteralBackend, LiteralQuantise, LiteralString
logger = logging.getLogger(__name__)
@@ -125,8 +119,6 @@ def _build(
enable_features: tuple[str, ...] | None = None,
dockerfile_template: str | None = None,
overwrite: bool = False,
container_registry: LiteralContainerRegistry | None = None,
container_version_strategy: LiteralContainerVersionStrategy | None = None,
push: bool = False,
force_push: bool = False,
containerize: bool = False,
@@ -159,8 +151,6 @@ def _build(
containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
Note that 'containerize' and 'push' are mutually exclusive
container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
serialisation: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
additional_args: Additional arguments to pass to ``openllm build``.
bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.
@@ -205,11 +195,6 @@ def _build(
args.extend(['--bento-version', bento_version])
if dockerfile_template:
args.extend(['--dockerfile-template', dockerfile_template])
if container_registry is None:
container_registry = 'ecr'
if container_version_strategy is None:
container_version_strategy = 'release'
args.extend(['--container-registry', container_registry, '--container-version-strategy', container_version_strategy])
if additional_args:
args.extend(additional_args)
if force_push:

View File

@@ -1,31 +1,8 @@
from __future__ import annotations
import enum
import functools
import inspect
import itertools
import logging
import os
import platform
import random
import subprocess
import threading
import time
import traceback
import typing as t
import attr
import click
import click_option_group as cog
import fs
import fs.copy
import fs.errors
import inflection
import orjson
import enum, functools, inspect, itertools, logging, os, platform, random, subprocess, threading, time, traceback, typing as t
import attr, click, fs, inflection, bentoml, openllm, orjson, fs.copy, fs.errors, click_option_group as cog
from bentoml_cli.utils import BentoMLCommandGroup, opt_callback
from simple_di import Provide, inject
import bentoml
import openllm
from bentoml._internal.cloud.config import CloudClientConfig
from bentoml._internal.configuration.containers import BentoMLContainer
from bentoml._internal.models.model import ModelStore
@@ -70,7 +47,6 @@ from ._factory import (
FC,
_AnyCallable,
backend_option,
container_registry_option,
dtype_option,
machine_option,
model_name_argument,
@@ -88,7 +64,6 @@ if t.TYPE_CHECKING:
from bentoml._internal.container import DefaultBuilder
from openllm_client._schemas import StreamingResponse
from openllm_core._configuration import LLMConfig
from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
else:
torch = LazyLoader('torch', globals(), 'torch')
@@ -248,7 +223,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
return super().get_command(ctx, cmd_name)
def list_commands(self, ctx: click.Context) -> list[str]:
return super().list_commands(ctx) + t.cast('Extensions', extension_command).list_commands(ctx)
return super().list_commands(ctx) + extension_command.list_commands(ctx)
def command(self, *args: t.Any, **kwargs: t.Any) -> t.Callable[[t.Callable[..., t.Any]], click.Command]:
"""Override the default 'cli.command' with supports for aliases for given command, and it wraps the implementation with common parameters."""
@@ -371,7 +346,7 @@ def cli() -> None:
default=None,
help='Maximum sequence length for the model. If not specified, we will use the default value from the model config.',
)
@start_decorator(serve_grpc=False)
@start_decorator
def start_command(
model_id: str,
server_timeout: int,
@@ -396,26 +371,21 @@ def start_command(
$ openllm <start|start-http> <model_id> --<options> ...
```
'''
if backend == 'pt': logger.warning('PyTorch backend is deprecated and will be removed in future releases. Make sure to use vLLM instead.')
if model_id in openllm.CONFIG_MAPPING:
_model_name = model_id
if deprecated_model_id is not None:
model_id = deprecated_model_id
else:
model_id = openllm.AutoConfig.for_model(_model_name)['default_id']
termui.warning(
f"Passing 'openllm start {_model_name}{'' if deprecated_model_id is None else ' --model-id ' + deprecated_model_id}' is deprecated and will be remove in a future version. Use 'openllm start {model_id}' instead."
)
logger.warning("Passing 'openllm start %s%s' is deprecated and will be remove in a future version. Use 'openllm start %s' instead.", _model_name, '' if deprecated_model_id is None else f' --model-id {deprecated_model_id}', model_id)
adapter_map: dict[str, str] | None = attrs.pop('adapter_map', None)
from openllm.serialisation.transformers.weights import has_safetensors_weights
serialisation = t.cast(
LiteralSerialisation,
first_not_none(
serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
),
)
serialisation = first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy')
if serialisation == 'safetensors' and quantize is not None:
logger.warning("'--quantize=%s' might not work with 'safetensors' serialisation format.", quantize)
logger.warning(
@@ -449,8 +419,7 @@ def start_command(
config, server_attrs = llm.config.model_validate_click(**attrs)
server_timeout = first_not_none(server_timeout, default=config['timeout'])
server_attrs.update({'working_dir': pkg.source_locations('openllm'), 'timeout': server_timeout})
# XXX: currently, theres no development args in bentoml.Server. To be fixed upstream.
development = server_attrs.pop('development')
development = server_attrs.pop('development') # XXX: currently, theres no development args in bentoml.Server. To be fixed upstream.
server_attrs.setdefault('production', not development)
start_env = process_environ(
@@ -479,145 +448,8 @@ def start_command(
# NOTE: Return the configuration for telemetry purposes.
return config
@cli.command(
context_settings=termui.CONTEXT_SETTINGS,
name='start-grpc',
short_help='Start a gRPC LLMServer for any supported LLM.',
)
@click.argument('model_id', type=click.STRING, metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=True)
@click.option(
'--model-id',
'deprecated_model_id',
type=click.STRING,
default=None,
hidden=True,
metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]',
help='Deprecated. Use positional argument instead.',
)
@start_decorator(serve_grpc=True)
@click.option(
'--max-model-len',
'--max_model_len',
'max_model_len',
default=None,
help='Maximum sequence length for the model. If not specified, we will use the default value from the model config.',
)
def start_grpc_command(
model_id: str,
server_timeout: int,
model_version: str | None,
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString,
device: t.Tuple[str, ...],
quantize: LiteralQuantise | None,
backend: LiteralBackend | None,
serialisation: LiteralSerialisation | None,
cors: bool,
dtype: LiteralDtype,
adapter_id: str | None,
return_process: bool,
deprecated_model_id: str | None,
max_model_len: int | None,
**attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
'''Start any LLM as a gRPC server.
\b
```bash
$ openllm start-grpc <model_id> --<options> ...
```
'''
termui.warning(
'Continuous batching is currently not yet supported with gPRC. If you want to use continuous batching with gRPC, feel free to open a GitHub issue about your usecase.\n'
)
if model_id in openllm.CONFIG_MAPPING:
_model_name = model_id
if deprecated_model_id is not None:
model_id = deprecated_model_id
else:
model_id = openllm.AutoConfig.for_model(_model_name)['default_id']
termui.warning(
f"Passing 'openllm start-grpc {_model_name}{'' if deprecated_model_id is None else ' --model-id ' + deprecated_model_id}' is deprecated and will be remove in a future version. Use 'openllm start-grpc {model_id}' instead."
)
adapter_map: dict[str, str] | None = attrs.pop('adapter_map', None)
from openllm.serialisation.transformers.weights import has_safetensors_weights
serialisation = first_not_none(
serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
)
if serialisation == 'safetensors' and quantize is not None:
logger.warning("'--quantize=%s' might not work with 'safetensors' serialisation format.", quantize)
logger.warning(
"Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.",
model_id,
serialisation,
)
logger.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.")
import torch
if backend == 'pt' and not torch.cuda.is_available():
if dtype == 'auto':
dtype = 'float'
elif dtype not in {'float', 'float32'}:
logger.warning('"bfloat16" and "half" are not supported on CPU. OpenLLM will default fallback to "float32".')
dtype = 'float' # we need to cast back to full precision if cuda is not available
llm = openllm.LLM[t.Any, t.Any](
model_id=model_id,
model_version=model_version,
backend=backend,
adapter_map=adapter_map,
quantize=quantize,
serialisation=serialisation,
dtype=dtype,
max_model_len=max_model_len,
trust_remote_code=check_bool_env('TRUST_REMOTE_CODE', False),
)
backend_warning(llm.__llm_backend__)
config, server_attrs = llm.config.model_validate_click(**attrs)
server_timeout = first_not_none(server_timeout, default=config['timeout'])
server_attrs.update({'working_dir': pkg.source_locations('openllm'), 'timeout': server_timeout})
server_attrs['grpc_protocol_version'] = 'v1'
# XXX: currently, theres no development args in bentoml.Server. To be fixed upstream.
development = server_attrs.pop('development')
server_attrs.setdefault('production', not development)
start_env = process_environ(
config,
server_timeout,
process_workers_per_resource(first_not_none(workers_per_resource, default=config['workers_per_resource']), device),
device,
cors,
model_id,
adapter_map,
serialisation,
llm,
)
server = bentoml.GrpcServer('_service:svc', **server_attrs)
openllm.utils.analytics.track_start_init(llm.config)
try:
build_bento_instruction(llm, model_id, serialisation, adapter_map)
it = run_server(server.args, start_env, return_process=return_process)
if return_process:
return it
except KeyboardInterrupt:
pass
# NOTE: Return the configuration for telemetry purposes.
return config
def process_environ(
config, server_timeout, wpr, device, cors, model_id, adapter_map, serialisation, llm, use_current_env=True
) -> t.Dict[str, t.Any]:
environ = parse_config_options(
config, server_timeout, wpr, device, cors, os.environ.copy() if use_current_env else {}
)
def process_environ(config, server_timeout, wpr, device, cors, model_id, adapter_map, serialisation, llm, use_current_env=True):
environ = parse_config_options(config, server_timeout, wpr, device, cors, os.environ.copy() if use_current_env else {})
environ.update(
{
'OPENLLM_MODEL_ID': model_id,
@@ -631,11 +463,9 @@ def process_environ(
'TRUST_REMOTE_CODE': str(llm.trust_remote_code),
}
)
if llm.quantise:
environ['QUANTIZE'] = str(llm.quantise)
if llm.quantise: environ['QUANTIZE'] = str(llm.quantise)
return environ
def process_workers_per_resource(wpr: str | float | int, device: tuple[str, ...]) -> TypeGuard[float]:
if isinstance(wpr, str):
if wpr == 'round_robin':
@@ -653,7 +483,6 @@ def process_workers_per_resource(wpr: str | float | int, device: tuple[str, ...]
wpr = float(wpr)
return wpr
def build_bento_instruction(llm, model_id, serialisation, adapter_map):
cmd_name = f'openllm build {model_id} --backend {llm.__llm_backend__}'
if llm.quantise:
@@ -907,13 +736,6 @@ class BuildBentoOutput(t.TypedDict):
help='Optional custom dockerfile template to be used with this BentoLLM.',
)
@serialisation_option
@container_registry_option
@click.option(
'--container-version-strategy',
type=click.Choice(['release', 'latest', 'nightly']),
default='release',
help="Default container version strategy for the image from '--container-registry'",
)
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options') # type: ignore[misc]
@cog.optgroup.option(
'--containerize',
@@ -951,8 +773,6 @@ def build_command(
containerize: bool,
push: bool,
serialisation: LiteralSerialisation | None,
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy,
force_push: bool,
**_: t.Any,
) -> BuildBentoOutput:
@@ -991,6 +811,10 @@ def build_command(
state = ItemState.NOT_FOUND
if backend == 'pt':
logger.warning("PyTorch backend is deprecated and will be removed from the next releases. Will set default backend to 'vllm' instead.")
backend = 'vllm'
llm = openllm.LLM[t.Any, t.Any](
model_id=model_id,
model_version=model_version,
@@ -1069,8 +893,6 @@ def build_command(
quantize=quantize,
extra_dependencies=enable_features,
dockerfile_template=dockerfile_template_path,
container_registry=container_registry,
container_version_strategy=container_version_strategy,
)
if state != ItemState.OVERWRITE:
state = ItemState.ADDED

View File

@@ -1,104 +0,0 @@
from __future__ import annotations
import pathlib
import shutil
import subprocess
import typing as t
import click
import orjson
import bentoml
import openllm
from openllm_cli import termui
from openllm_cli._factory import container_registry_option, machine_option
from openllm_core.utils import get_debug_mode, pkg
if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
_BUILDER = bentoml.container.get_backend('buildx')
_module_location = pkg.source_locations('openllm')
def build_container(
registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None,
version_strategy: LiteralContainerVersionStrategy = 'release',
push: bool = False,
machine: bool = False,
) -> dict[str | LiteralContainerRegistry, str]:
try:
if not _BUILDER.health():
raise openllm.exceptions.Error
except (openllm.exceptions.Error, subprocess.CalledProcessError):
raise RuntimeError(
'Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.'
) from None
if not shutil.which('nvidia-container-runtime'):
raise RuntimeError('NVIDIA Container Toolkit is required to compile CUDA kernel in container.')
if not _module_location:
raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)")
pyproject_path = pathlib.Path(_module_location).parent.parent / 'pyproject.toml'
if not pyproject_path.exists():
raise ValueError(
"This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'"
)
if not registries:
tags = {
alias: openllm.bundle.RefResolver.construct_base_image(alias, version_strategy)
for alias in openllm.bundle.CONTAINER_NAMES
}
else:
registries = [registries] if isinstance(registries, str) else list(registries)
tags = {name: openllm.bundle.RefResolver.construct_base_image(name, version_strategy) for name in registries}
try:
outputs = _BUILDER.build(
file=pathlib.Path(__file__).parent.joinpath('Dockerfile').resolve().__fspath__(),
context_path=pyproject_path.parent.__fspath__(),
tag=tuple(tags.values()),
push=push,
progress='plain' if get_debug_mode() else 'auto',
quiet=machine,
)
if machine and outputs is not None:
tags['image_sha'] = outputs.decode('utf-8').strip()
except Exception as err:
raise openllm.exceptions.OpenLLMException(
f'Failed to containerize base container images (Scroll up to see error above, or set DEBUG=5 for more traceback):\n{err}'
) from err
return tags
@click.command(
'build_base_container',
context_settings=termui.CONTEXT_SETTINGS,
help='''Base image builder for BentoLLM.
By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.
\b
If '--machine' is passed, then it will run the process quietly, and output a JSON to the current running terminal.
This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels.
Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
''',
)
@container_registry_option
@click.option(
'--version-strategy',
type=click.Choice(['release', 'latest', 'nightly']),
default='nightly',
help='Version strategy to use for tagging the image.',
)
@click.option('--push/--no-push', help='Whether to push to remote repository', is_flag=True, default=False)
@machine_option
def cli(
container_registry: tuple[LiteralContainerRegistry, ...] | None,
version_strategy: LiteralContainerVersionStrategy,
push: bool,
machine: bool,
) -> dict[str, str]:
mapping = build_container(container_registry, version_strategy, push, machine)
if machine:
termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
return mapping

View File

@@ -100,8 +100,3 @@ while true; do
fi
sleep 10
done
echo "Sleeping for 7 minutes to allow the release to propagate and PyPI to be published..."
sleep 420
echo "Building OpenLLM container for ${RELEASE_TAG}..."
gh workflow run build.yml -R bentoml/openllm -r "${RELEASE_TAG}"