perf: improve build logics and cleanup speed (#657)

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron Pham
2023-11-15 00:18:31 -05:00
committed by GitHub
parent 103156cd71
commit a58d947bc8
11 changed files with 141 additions and 237 deletions

View File

@@ -635,7 +635,6 @@ def _RunnerFactory(
'config': self.config,
'backend': backend,
'__module__': self.__module__,
'__doc__': getattr(openllm_core.config, f'START_{self.config["model_name"].upper()}_COMMAND_DOCSTRING'),
'__repr__': ReprMixin.__repr__,
'__repr_keys__': property(_wrapped_repr_keys),
'__repr_args__': _wrapped_repr_args,

View File

@@ -8,9 +8,9 @@ import torch
import bentoml
import openllm
from openllm.exceptions import OpenLLMException
from openllm_core._schemas import CompletionChunk, GenerationOutput
from openllm_core._typing_compat import LiteralBackend, M, T
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils import first_not_none, is_vllm_available
if t.TYPE_CHECKING:

View File

@@ -1,20 +1,13 @@
"""Build-related utilities. Some of these utilities are mainly used for 'openllm.build'.
These utilities will stay internal, and their API can be changed or updated without backward compatibility.
"""
from __future__ import annotations
import os
import typing as t
from openllm_core.utils import LazyModule
_import_structure: dict[str, list[str]] = {
_import_structure = {
'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'],
'oci': [
'CONTAINER_NAMES',
'get_base_container_tag',
'build_container',
'get_base_container_name',
'supported_registries',
'RefResolver',
@@ -32,7 +25,6 @@ if t.TYPE_CHECKING:
from .oci import (
CONTAINER_NAMES as CONTAINER_NAMES,
RefResolver as RefResolver,
build_container as build_container,
get_base_container_name as get_base_container_name,
get_base_container_tag as get_base_container_tag,
supported_registries as supported_registries,

View File

@@ -145,28 +145,28 @@ def construct_docker_options(
if quantize:
env_dict['OPENLLM_QUANTIZE'] = str(quantize)
return DockerOptions(
base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}',
base_image=f'{oci.get_base_container_name(container_registry)}:{oci.get_base_container_tag(container_version_strategy)}',
env=env_dict,
dockerfile_template=dockerfile_template,
)
OPENLLM_MODEL_NAME = '# openllm: model name'
OPENLLM_MODEL_ID = '# openllm: model id'
OPENLLM_MODEL_TAG = '# openllm: model tag'
OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map'
class ModelNameFormatter(string.Formatter):
model_keyword: LiteralString = '__model_name__'
class _ServiceVarsFormatter(string.Formatter):
keyword: LiteralString = '__model_name__'
identifier: LiteralString = '# openllm: model name'
def __init__(self, model_name: str):
def __init__(self, target: str):
"""The formatter that extends model_name to be formatted the 'service.py'."""
super().__init__()
self.model_name = model_name
self.target = target
def vformat(self, format_string: str, *args: t.Any, **attrs: t.Any) -> t.Any:
return super().vformat(format_string, (), {self.model_keyword: self.model_name})
return super().vformat(format_string, (), {self.keyword: self.target})
def can_format(self, value: str) -> bool:
try:
@@ -175,17 +175,26 @@ class ModelNameFormatter(string.Formatter):
except ValueError:
return False
class ModelIdFormatter(ModelNameFormatter):
model_keyword: LiteralString = '__model_id__'
def parse_line(self, line: str, nl: bool = True) -> str:
    """Fill in the placeholder on a marked template line and drop its marker comment.

    Args:
        line: A single line of template source.
        nl: When true, terminate the returned line with a newline.

    Returns:
        ``line`` untouched when it does not contain ``self.identifier``. Otherwise the
        formatted line, cut just before the marker comment, with trailing whitespace
        removed and an optional newline appended.
    """
    if self.identifier not in line:
        return line
    formatted = self.vformat(line)
    # Find the marker instead of slicing a fixed number of characters off the end:
    # a fixed offset silently eats real code when the line has no trailing newline
    # or when the comment is not preceded by exactly two spaces.
    marker_at = formatted.rfind(self.identifier)
    if marker_at < 0:
        # Formatting rewrote the marker text itself; keep the whole formatted line.
        marker_at = len(formatted)
    return formatted[:marker_at].rstrip() + ('\n' if nl else '')
class ModelTagFormatter(ModelNameFormatter):
model_keyword: LiteralString = '__model_tag__'
class ModelIdFormatter(_ServiceVarsFormatter):
keyword = '__model_id__'
identifier = OPENLLM_MODEL_ID
class ModelAdapterMapFormatter(ModelNameFormatter):
model_keyword: LiteralString = '__model_adapter_map__'
class ModelTagFormatter(_ServiceVarsFormatter):
keyword = '__model_tag__'
identifier = OPENLLM_MODEL_TAG
class ModelAdapterMapFormatter(_ServiceVarsFormatter):
keyword = '__model_adapter_map__'
identifier = OPENLLM_MODEL_ADAPTER_MAP
_service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
@@ -195,41 +204,30 @@ _service_vars_file = Path(os.path.abspath(__file__)).parent.parent / '_service_v
def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] | None, llm_fs: FS) -> None:
from openllm_core.utils import DEBUG
model_name = llm.config['model_name']
model_id = llm.model_id
model_tag = str(llm.tag)
model_id_formatter = ModelIdFormatter(llm.model_id)
model_tag_formatter = ModelTagFormatter(str(llm.tag))
adapter_map_formatter = ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode())
logger.debug(
'Generating service vars file for %s at %s (dir=%s)', model_name, '_service_vars.py', llm_fs.getsyspath('/')
'Generating service vars file for %s at %s (dir=%s)', llm.model_id, '_service_vars.py', llm_fs.getsyspath('/')
)
with open(_service_vars_file.__fspath__(), 'r') as f:
src_contents = f.readlines()
for it in src_contents:
if OPENLLM_MODEL_NAME in it:
src_contents[src_contents.index(it)] = (
ModelNameFormatter(model_name).vformat(it)[: -(len(OPENLLM_MODEL_NAME) + 3)] + '\n'
)
if OPENLLM_MODEL_ID in it:
src_contents[src_contents.index(it)] = (
ModelIdFormatter(model_id).vformat(it)[: -(len(OPENLLM_MODEL_ID) + 3)] + '\n'
)
elif OPENLLM_MODEL_TAG in it:
src_contents[src_contents.index(it)] = (
ModelTagFormatter(model_tag).vformat(it)[: -(len(OPENLLM_MODEL_TAG) + 3)] + '\n'
)
elif OPENLLM_MODEL_ADAPTER_MAP in it:
src_contents[src_contents.index(it)] = (
ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[
: -(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)
]
+ '\n'
)
script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + ''.join(src_contents)
for i, it in enumerate(src_contents):
if model_id_formatter.identifier in it:
src_contents[i] = model_id_formatter.parse_line(it)
elif model_tag_formatter.identifier in it:
src_contents[i] = model_tag_formatter.parse_line(it)
elif adapter_map_formatter.identifier in it:
src_contents[i] = adapter_map_formatter.parse_line(it)
script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n" + ''.join(src_contents)
if DEBUG:
logger.info('Generated script:\n%s', script)
llm_fs.writetext('_service_vars.py', script)
logger.debug(
'Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'], llm_fs.getsyspath('/')
'Generating service file for %s at %s (dir=%s)', llm.model_id, llm.config['service_name'], llm_fs.getsyspath('/')
)
with open(_service_file.__fspath__(), 'r') as f:
service_src = f.read()

View File

@@ -1,46 +1,25 @@
# mypy: disable-error-code="misc"
"""OCI-related utilities for OpenLLM. This module is considered to be internal and API are subjected to change."""
from __future__ import annotations
import functools
import importlib
import logging
import os
import pathlib
import shutil
import subprocess
import typing as t
from datetime import datetime, timedelta, timezone
import attr
import orjson
import bentoml
import openllm
import openllm_core
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils import codegen
from openllm_core.utils.lazy import VersionInfo
if t.TYPE_CHECKING:
from ghapi import all
from openllm_core._typing_compat import (
LiteralContainerRegistry,
LiteralContainerVersionStrategy,
LiteralString,
RefTuple,
)
all = openllm_core.utils.LazyLoader('all', globals(), 'ghapi.all') # noqa: F811
from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy, RefTuple
logger = logging.getLogger(__name__)
_BUILDER = bentoml.container.get_backend('buildx')
ROOT_DIR = pathlib.Path(os.path.abspath('__file__')).parent.parent.parent
# XXX: The registry is hard-coded for now because that is easier to maintain.
# In the future, we could infer it from the git repository to give users more options
# for building the base image. For now, every base image is <registry>/bentoml/openllm:...
# NOTE: The ECR registry is the public one, and currently only the @bentoml team has push access to it.
_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {
'docker': 'docker.io/bentoml/openllm',
'gh': 'ghcr.io/bentoml/openllm',
@@ -48,122 +27,54 @@ _CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {
}
# TODO: support custom forks. Currently it only supports openllm main.
_OWNER = 'bentoml'
_REPO = 'openllm'
_module_location = openllm_core.utils.pkg.source_locations('openllm')
@functools.lru_cache
@openllm_core.utils.apply(str.lower)
def get_base_container_name(reg: LiteralContainerRegistry) -> str:
    """Look up the base image repository registered under the registry alias ``reg``.

    Raises ``KeyError`` when ``reg`` is not a key of ``_CONTAINER_REGISTRY``.
    """
    # NOTE(review): the ``apply(str.lower)`` decorator presumably lower-cases the
    # returned name -- confirm against openllm_core.utils.apply.
    container_name = _CONTAINER_REGISTRY[reg]
    return container_name
_OWNER, _REPO = 'bentoml', 'openllm'
def _convert_version_from_string(s: str) -> VersionInfo:
    """Parse the version string ``s`` into a ``VersionInfo`` (used as an attrs field converter)."""
    parsed = VersionInfo.from_version_string(s)
    return parsed
def _commit_time_range(r: int = 5) -> str:
return (datetime.now(timezone.utc) - timedelta(days=r)).strftime('%Y-%m-%dT%H:%M:%SZ')
class VersionNotSupported(openllm.exceptions.OpenLLMException):
"""Raised when the requested stable release is too old to include an OpenLLM base container."""
_RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class(
'_RefTuple', ['git_hash', 'version', 'strategy']
)
def nightly_resolver(cls: type[RefResolver]) -> str:
    """Resolve the container tag to use for the 'nightly' strategy.

    When a ``docker`` binary is on PATH, skopeo is run inside a container to list the
    tags published at ``ghcr.io/bentoml/openllm`` and one of them is returned.
    Without docker, the tag is derived from recent commits instead, in which case the
    matching image may not have been published.

    Args:
        cls: The resolver class; its ``_ghapi`` client is used for the fallback path.

    Returns:
        A container tag string (``sha-<short git hash>`` on the fallback path).

    Raises:
        StopIteration: On the fallback path, when no recent commit qualifies.
        subprocess.CalledProcessError: When the skopeo container exits non-zero.
    """
    # NOTE: every openllm container is tagged sha-<git_hash[:7]>.
    docker_bin = shutil.which('docker')
    if docker_bin is None:
        logger.warning(
            'To get the correct available nightly container, make sure to have docker available. Fallback to previous behaviour for determine nightly hash (container might not exists due to the lack of GPU machine at a time. See https://github.com/bentoml/OpenLLM/pkgs/container/openllm for available image.)'
        )
        commits = t.cast('list[dict[str, t.Any]]', cls._ghapi.repos.list_commits(since=_commit_time_range()))
        return next(f'sha-{it["sha"][:7]}' for it in commits if '[skip ci]' not in it['commit']['message'])
    # Preferred path: ask the registry which tags actually exist.
    # The command is run without '-it': the output is captured through a pipe, and
    # requesting an interactive TTY makes docker abort when stdin is not a terminal
    # (CI, cron, spawned subprocesses). 'list-tags' needs no input.
    raw_listing = subprocess.check_output(
        [docker_bin, 'run', '--rm', 'quay.io/skopeo/stable:latest', 'list-tags', 'docker://ghcr.io/bentoml/openllm']
    )
    listing = orjson.loads(raw_listing.decode().strip())
    # NOTE(review): the second-to-last entry is taken, presumably to skip a trailing
    # alias tag -- confirm against the registry's tag ordering.
    return listing['Tags'][-2]
_RefTuple: type[RefTuple] = codegen.make_attr_tuple_class('_RefTuple', ['git_hash', 'version', 'strategy'])
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
git_hash: str = attr.field()
version: openllm_core.utils.VersionInfo = attr.field(converter=_convert_version_from_string)
version: VersionInfo = attr.field(converter=_convert_version_from_string)
strategy: LiteralContainerVersionStrategy = attr.field()
_ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
@classmethod
def _nightly_ref(cls) -> RefTuple:
return _RefTuple((nightly_resolver(cls), 'refs/heads/main', 'nightly'))
@classmethod
def _release_ref(cls, version_str: str | None = None) -> RefTuple:
try:
from ghapi.all import GhApi
ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
meta = t.cast(t.Dict[str, t.Any], ghapi.repos.get_latest_release())
except Exception as err:
raise OpenLLMException('Failed to determine latest release version.') from err
_use_base_strategy = version_str is None
if version_str is None:
# NOTE: This strategy will only support openllm>0.2.12
meta: dict[str, t.Any] = cls._ghapi.repos.get_latest_release()
version_str = meta['name'].lstrip('v')
version: tuple[str, str | None] = (
cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'],
version_str,
)
version = (ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str)
else:
version = ('', version_str)
if openllm_core.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12):
raise VersionNotSupported(
f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'"
)
return _RefTuple((*version, 'release' if _use_base_strategy else 'custom'))
@classmethod
@functools.lru_cache(maxsize=64)
def from_strategy(
cls, strategy_or_version: t.Literal['release', 'nightly'] | LiteralString | None = None
) -> RefResolver:
def from_strategy(cls, strategy_or_version: LiteralContainerVersionStrategy | None = None) -> RefResolver:
# using default strategy
if strategy_or_version is None or strategy_or_version == 'release':
return cls(*cls._release_ref())
elif strategy_or_version == 'latest':
return cls('latest', '0.0.0', 'latest')
elif strategy_or_version == 'nightly':
_ref = cls._nightly_ref()
return cls(_ref[0], '0.0.0', _ref[-1])
elif strategy_or_version in ('latest', 'nightly'): # latest is nightly
return cls(git_hash='latest', version='0.0.0', strategy='latest')
else:
logger.warning(
'Using custom %s. Make sure that it is at lease 0.2.12 for base container support.', strategy_or_version
)
return cls(*cls._release_ref(version_str=strategy_or_version))
raise ValueError(f'Unknown strategy: {strategy_or_version}')
@property
def tag(self) -> str:
# NOTE: latest tag can also be nightly, but discouraged to use it. For nightly refer to use sha-<git_hash_short>
if self.strategy == 'latest':
return 'latest'
elif self.strategy == 'nightly':
return self.git_hash
else:
return repr(self.version)
return 'latest' if self.strategy in {'latest', 'nightly'} else repr(self.version)
@functools.lru_cache(maxsize=256)
@@ -171,51 +82,8 @@ def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = No
return RefResolver.from_strategy(strategy).tag
def build_container(
    registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None,
    version_strategy: LiteralContainerVersionStrategy = 'release',
    push: bool = False,
    machine: bool = False,
) -> dict[str | LiteralContainerRegistry, str]:
    """Build the base container image through the module-level ``_BUILDER`` backend.

    Args:
        registries: One registry alias, a sequence of aliases, or a falsy value to
            target every entry of ``_CONTAINER_REGISTRY``.
        version_strategy: Passed to ``get_base_container_tag`` to compute the image tag.
        push: Forwarded to the builder as ``push``.
        machine: Forwarded to the builder as ``quiet``; when set and the builder
            returns output, that output is stored under the ``'image_sha'`` key.

    Returns:
        A mapping from registry alias to the full ``<name>:<tag>`` image reference.

    Raises:
        RuntimeError: The builder is unhealthy, ``nvidia-container-runtime`` is not on
            PATH, or the source location of 'openllm' is unknown.
        ValueError: No ``pyproject.toml`` is found two levels above the module location.
        openllm.exceptions.OpenLLMException: The build step raised.
    """
    try:
        builder_ready = bool(_BUILDER.health())
    except (openllm.exceptions.Error, subprocess.CalledProcessError):
        builder_ready = False
    if not builder_ready:
        raise RuntimeError(
            'Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.'
        ) from None

    if shutil.which('nvidia-container-runtime') is None:
        raise RuntimeError('NVIDIA Container Toolkit is required to compile CUDA kernel in container.')
    if not _module_location:
        raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)")

    repo_root = pathlib.Path(_module_location).parent.parent
    pyproject_path = repo_root / 'pyproject.toml'
    if not pyproject_path.exists():
        raise ValueError(
            "This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'"
        )

    tags: dict[str | LiteralContainerRegistry, str] = {}
    if registries:
        # A bare string is a single alias, not a sequence of characters.
        selected = [registries] if isinstance(registries, str) else list(registries)
        for name in selected:
            tags[name] = f'{_CONTAINER_REGISTRY[name]}:{get_base_container_tag(version_strategy)}'
    else:
        for alias, value in _CONTAINER_REGISTRY.items():
            tags[alias] = f'{value}:{get_base_container_tag(version_strategy)}'

    try:
        build_options = dict(
            file=pathlib.Path(__file__).parent.joinpath('Dockerfile').resolve().__fspath__(),
            context_path=pyproject_path.parent.__fspath__(),
            tag=tuple(tags.values()),
            push=push,
            progress='plain' if openllm_core.utils.get_debug_mode() else 'auto',
            quiet=machine,
        )
        outputs = _BUILDER.build(**build_options)
        if machine and outputs is not None:
            tags['image_sha'] = outputs.decode('utf-8').strip()
    except Exception as err:
        raise openllm.exceptions.OpenLLMException(
            f'Failed to containerize base container images (Scroll up to see error above, or set DEBUG=5 for more traceback):\n{err}'
        ) from err
    return tags
def get_base_container_name(reg: LiteralContainerRegistry) -> str:
    """Look up the base image repository registered under the registry alias ``reg``.

    Raises ``KeyError`` when ``reg`` is not a key of ``_CONTAINER_REGISTRY``.
    """
    container_name = _CONTAINER_REGISTRY[reg]
    return container_name
if t.TYPE_CHECKING:
@@ -225,7 +93,6 @@ if t.TYPE_CHECKING:
__all__ = [
'CONTAINER_NAMES',
'get_base_container_tag',
'build_container',
'get_base_container_name',
'supported_registries',
'RefResolver',

View File

@@ -4,8 +4,8 @@ import importlib
import cloudpickle
import fs
import openllm
from openllm_core._typing_compat import ParamSpec
from openllm_core.exceptions import OpenLLMException
P = ParamSpec('P')
@@ -31,7 +31,7 @@ def load_tokenizer(llm, **tokenizer_attrs):
try:
tokenizer = cloudpickle.load(cofile)['tokenizer']
except KeyError:
raise openllm.exceptions.OpenLLMException(
raise OpenLLMException(
"Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. "
'For example: "bentoml.transformers.save_model(..., custom_objects={\'tokenizer\': tokenizer})"'
) from None

View File

@@ -13,7 +13,7 @@ import bentoml
import openllm
from bentoml._internal.configuration.containers import BentoMLContainer
from bentoml._internal.models.model import ModelOptions, ModelSignature
from openllm_core._typing_compat import M, T
from openllm_core.exceptions import OpenLLMException
from ._helpers import get_hash, infer_autoclass_from_llm, process_config
from .weights import HfIgnore
@@ -24,7 +24,7 @@ __all__ = ['import_model', 'get', 'load_model']
_object_setattr = object.__setattr__
def _patch_correct_tag(llm: openllm.LLM[M, T], config: transformers.PretrainedConfig, _revision: str | None = None):
def _patch_correct_tag(llm, config, _revision=None):
# NOTE: The following won't hit during local since we generated a correct version based on local path hash It will only hit if we use model from HF Hub
if llm.revision is not None:
return
@@ -76,7 +76,7 @@ def import_model(llm, *decls, trust_remote_code, _model_store=Provide[BentoMLCon
if quantize == 'gptq':
if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq():
raise openllm.exceptions.OpenLLMException(
raise OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
)
signatures['generate'] = {'batchable': False}
@@ -175,7 +175,7 @@ def get(llm):
model = bentoml.models.get(llm.tag)
backend = model.info.labels['backend']
if backend != llm.__llm_backend__:
raise openllm.exceptions.OpenLLMException(
raise OpenLLMException(
f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'."
)
_patch_correct_tag(
@@ -185,9 +185,7 @@ def get(llm):
)
return model
except Exception as err:
raise openllm.exceptions.OpenLLMException(
f'Failed while getting stored artefact (lookup for traceback):\n{err}'
) from err
raise OpenLLMException(f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err
def check_unintialised_params(model):
@@ -216,13 +214,11 @@ def load_model(llm, *decls, **attrs):
_quantise = llm.bentomodel.info.metadata['_quantize']
if _quantise == 'gptq':
if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq():
raise openllm.exceptions.OpenLLMException(
raise OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
)
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(
f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})"
)
raise OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
# TODO: investigate load with flash attention
model = auto_class.from_pretrained(

View File

@@ -1,17 +1,11 @@
from __future__ import annotations
import copy
import typing as t
import transformers
from openllm.serialisation.constants import FRAMEWORK_TO_AUTOCLASS_MAPPING, HUB_ATTRS
from openllm_core.exceptions import OpenLLMException
if t.TYPE_CHECKING:
from openllm_core._typing_compat import M, T
from ..._llm import LLM
def get_hash(config) -> str:
_commit_hash = getattr(config, '_commit_hash', None)
@@ -34,7 +28,7 @@ def process_config(model_id, trust_remote_code, **attrs):
return config, hub_attrs, attrs
def infer_autoclass_from_llm(llm: LLM[M, T], config, /):
def infer_autoclass_from_llm(llm, config, /):
if llm.trust_remote_code:
autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM'
if not hasattr(config, 'auto_map'):

View File

@@ -13,7 +13,6 @@ if t.TYPE_CHECKING:
from huggingface_hub.hf_api import ModelInfo as HfModelInfo
import openllm
from openllm_core._typing_compat import M, T
__global_inst__ = None
__cached_id__: dict[str, HfModelInfo] = dict()
@@ -39,7 +38,7 @@ def ModelInfo(model_id: str, revision: str | None = None) -> HfModelInfo:
def has_safetensors_weights(model_id: str, revision: str | None = None) -> bool:
if validate_is_path(model_id):
return next((True for item in Path(resolve_filepath(model_id)).glob('*.safetensors')), False)
return next((True for _ in Path(resolve_filepath(model_id)).glob('*.safetensors')), False)
return any(s.rfilename.endswith('.safetensors') for s in ModelInfo(model_id, revision=revision).siblings)
@@ -52,7 +51,7 @@ class HfIgnore:
gguf = '*.gguf'
@classmethod
def ignore_patterns(cls, llm: openllm.LLM[M, T]) -> list[str]:
def ignore_patterns(cls, llm: openllm.LLM[t.Any, t.Any]) -> list[str]:
if llm.__llm_backend__ in {'vllm', 'pt'}:
base = [cls.tf, cls.flax, cls.gguf]
if has_safetensors_weights(llm.model_id):

View File

@@ -52,7 +52,6 @@ from bentoml._internal.cloud.config import CloudClientConfig
from bentoml._internal.configuration.containers import BentoMLContainer
from bentoml._internal.models.model import ModelStore
from openllm import bundle
from openllm.exceptions import OpenLLMException
from openllm_core._typing_compat import (
Concatenate,
DictStrAny,
@@ -67,6 +66,7 @@ from openllm_core._typing_compat import (
TypeGuard,
)
from openllm_core.config import CONFIG_MAPPING
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils import (
DEBUG_ENV_VAR,
OPTIONAL_DEPENDENCIES,

View File

@@ -1,31 +1,90 @@
from __future__ import annotations
import pathlib
import shutil
import subprocess
import typing as t
import click
import orjson
import bentoml
import openllm
from openllm_cli import termui
from openllm_cli._factory import container_registry_option, machine_option
from openllm_core.utils import get_debug_mode, pkg
if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
_BUILDER = bentoml.container.get_backend('buildx')
_module_location = pkg.source_locations('openllm')
def build_container(
    registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None,
    version_strategy: LiteralContainerVersionStrategy = 'release',
    push: bool = False,
    machine: bool = False,
) -> dict[str | LiteralContainerRegistry, str]:
    """Build the base container image through the module-level ``_BUILDER`` backend.

    Args:
        registries: One registry alias, a sequence of aliases, or a falsy value to
            target every entry of ``openllm.bundle.CONTAINER_NAMES``.
        version_strategy: Passed to ``openllm.bundle.get_base_container_tag`` to
            compute the image tag.
        push: Forwarded to the builder as ``push``.
        machine: Forwarded to the builder as ``quiet``; when set and the builder
            returns output, that output is stored under the ``'image_sha'`` key.

    Returns:
        A mapping from registry alias to the full ``<name>:<tag>`` image reference.

    Raises:
        RuntimeError: The builder is unhealthy, ``nvidia-container-runtime`` is not on
            PATH, or the source location of 'openllm' is unknown.
        ValueError: No ``pyproject.toml`` is found two levels above the module location.
        openllm.exceptions.OpenLLMException: The build step raised.
    """
    try:
        builder_ready = bool(_BUILDER.health())
    except (openllm.exceptions.Error, subprocess.CalledProcessError):
        builder_ready = False
    if not builder_ready:
        raise RuntimeError(
            'Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.'
        ) from None

    if shutil.which('nvidia-container-runtime') is None:
        raise RuntimeError('NVIDIA Container Toolkit is required to compile CUDA kernel in container.')
    if not _module_location:
        raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)")

    repo_root = pathlib.Path(_module_location).parent.parent
    pyproject_path = repo_root / 'pyproject.toml'
    if not pyproject_path.exists():
        raise ValueError(
            "This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'"
        )

    tags: dict[str | LiteralContainerRegistry, str] = {}
    if registries:
        # A bare string is a single alias, not a sequence of characters.
        selected = [registries] if isinstance(registries, str) else list(registries)
        for name in selected:
            tags[name] = (
                f'{openllm.bundle.CONTAINER_NAMES[name]}:{openllm.bundle.get_base_container_tag(version_strategy)}'
            )
    else:
        for alias, value in openllm.bundle.CONTAINER_NAMES.items():
            tags[alias] = f'{value}:{openllm.bundle.get_base_container_tag(version_strategy)}'

    try:
        build_options = dict(
            file=pathlib.Path(__file__).parent.joinpath('Dockerfile').resolve().__fspath__(),
            context_path=pyproject_path.parent.__fspath__(),
            tag=tuple(tags.values()),
            push=push,
            progress='plain' if get_debug_mode() else 'auto',
            quiet=machine,
        )
        outputs = _BUILDER.build(**build_options)
        if machine and outputs is not None:
            tags['image_sha'] = outputs.decode('utf-8').strip()
    except Exception as err:
        raise openllm.exceptions.OpenLLMException(
            f'Failed to containerize base container images (Scroll up to see error above, or set DEBUG=5 for more traceback):\n{err}'
        ) from err
    return tags
@click.command(
'build_base_container',
context_settings=termui.CONTEXT_SETTINGS,
help="""Base image builder for BentoLLM.
By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.
By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.
\b
If '--machine' is passed, then it will run the process quietly, and output a JSON to the current running terminal.
This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels.
\b
If '--machine' is passed, then it will run the process quietly, and output a JSON to the current running terminal.
This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels.
Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
""",
Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
""",
)
@container_registry_option
@click.option(
@@ -42,7 +101,7 @@ def cli(
push: bool,
machine: bool,
) -> dict[str, str]:
mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine)
mapping = build_container(container_registry, version_strategy, push, machine)
if machine:
termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
return mapping