refactor: focus (#730)

* perf: remove base images

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: update changelog

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: move dockerfile to run on release only

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: cleanup unused types

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Authored by Aaron Pham, 2023-11-24 01:11:31 -05:00, committed by GitHub
parent 52a44b1bfa
commit aab173cd99
19 changed files with 168 additions and 679 deletions

View File

@@ -1,30 +1,17 @@
import logging as _logging
import os as _os
import pathlib as _pathlib
import warnings as _warnings
import logging as _logging, os as _os, pathlib as _pathlib, warnings as _warnings
from openllm_cli import _sdk
from . import utils as utils
if utils.DEBUG:
utils.set_debug_mode(True)
_logging.basicConfig(level=_logging.NOTSET)
utils.set_debug_mode(True); _logging.basicConfig(level=_logging.NOTSET)
else:
# configuration for bitsandbytes before import
_os.environ['BITSANDBYTES_NOWELCOME'] = _os.environ.get('BITSANDBYTES_NOWELCOME', '1')
# NOTE: The following warnings from bitsandbytes, and probably not that important for users to see when DEBUG is False
_warnings.filterwarnings(
'ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization'
)
_warnings.filterwarnings(
'ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization'
)
_warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
_warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
_warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
# NOTE: ignore the following warning from ghapi as it is not important for users
_warnings.filterwarnings(
'ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated'
)
_warnings.filterwarnings('ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated')
COMPILED = _pathlib.Path(__file__).suffix in ('.pyd', '.so')

View File

@@ -1,4 +1 @@
if __name__ == '__main__':
from openllm_cli.entrypoint import cli
cli()
if __name__ == '__main__': from openllm_cli.entrypoint import cli; cli()

View File

@@ -1,7 +1,5 @@
import transformers
def prepare_logits_processor(config):
import transformers
generation_config = config.generation_config
logits_processor = transformers.LogitsProcessorList()
if generation_config['temperature'] >= 1e-5 and generation_config['temperature'] != 1.0:
@@ -13,28 +11,16 @@ def prepare_logits_processor(config):
if generation_config['top_k'] > 0:
logits_processor.append(transformers.TopKLogitsWarper(generation_config['top_k']))
return logits_processor
# NOTE: The ordering here is important. Some models have two of these and we have a preference for which value gets used.
SEQLEN_KEYS = ['max_sequence_length', 'seq_length', 'max_position_embeddings', 'max_seq_len', 'model_max_length']
def get_context_length(config):
rope_scaling = getattr(config, 'rope_scaling', None)
rope_scaling_factor = config.rope_scaling['factor'] if rope_scaling else 1.0
for key in SEQLEN_KEYS:
if getattr(config, key, None) is not None:
return int(rope_scaling_factor * getattr(config, key))
if getattr(config, key, None) is not None: return int(rope_scaling_factor * getattr(config, key))
return 2048
def is_sentence_complete(output):
return output.endswith(('.', '?', '!', '...', '。', '？', '！', '…', '"', "'", '”'))
def is_sentence_complete(output): return output.endswith(('.', '?', '!', '...', '。', '？', '！', '…', '"', "'", '”'))
def is_partial_stop(output, stop_str):
'''Check whether the output contains a partial stop str.'''
for i in range(min(len(output), len(stop_str))):
if stop_str.startswith(output[-i:]):
return True
if stop_str.startswith(output[-i:]): return True
return False
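For orientation, a minimal sketch of how the two helpers kept above behave, assuming get_context_length and is_partial_stop are in scope; FakeConfig is a hypothetical stand-in for a transformers config and is not part of this change:

class FakeConfig:
  max_position_embeddings = 4096   # one of the SEQLEN_KEYS
  rope_scaling = {'factor': 2.0}   # rope scaling doubles the usable window
assert get_context_length(FakeConfig()) == 8192   # 2.0 * 4096
assert is_partial_stop('Hello wor', 'world')      # 'wor' is a prefix of the stop string
assert not is_partial_stop('Hello', 'world')      # no overlap with the stop string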

View File

@@ -184,7 +184,7 @@ class LLM(t.Generic[M, T], ReprMixin):
):
torch_dtype = attrs.pop('torch_dtype', None) # backward compatible
if torch_dtype is not None:
warnings.warns(
warnings.warn(
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',
DeprecationWarning,
stacklevel=3,
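The hunk above replaces a call to the non-existent warnings.warns with warnings.warn. A minimal sketch of the corrected deprecation path, using a hypothetical helper name (_resolve_dtype) rather than the real LLM constructor:

import warnings
def _resolve_dtype(dtype=None, **attrs):
  torch_dtype = attrs.pop('torch_dtype', None)  # backward-compatible alias
  if torch_dtype is not None:
    # warnings.warn is the stdlib API; stacklevel=3 points the warning at the user's call site
    warnings.warn('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', DeprecationWarning, stacklevel=3)
    dtype = dtype if dtype is not None else torch_dtype
  return dtype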

View File

@@ -1,14 +1,39 @@
import os
from openllm_core.utils import LazyModule
from __future__ import annotations
import os, attr, functools
from openllm_core._typing_compat import LiteralContainerVersionStrategy
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils.lazy import VersionInfo, LazyModule
_OWNER, _REPO = 'bentoml', 'openllm'
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
git_hash: str = attr.field()
version: VersionInfo = attr.field(converter=lambda s: VersionInfo.from_version_string(s))
strategy: LiteralContainerVersionStrategy = attr.field()
@classmethod
@functools.lru_cache(maxsize=64)
def from_strategy(cls, strategy_or_version: LiteralContainerVersionStrategy | None = None) -> RefResolver:
# using default strategy
if strategy_or_version is None or strategy_or_version == 'release':
try:
from ghapi.all import GhApi
ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
meta = ghapi.repos.get_latest_release()
git_hash = ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha']
except Exception as err:
raise OpenLLMException('Failed to determine latest release version.') from err
return cls(git_hash, meta['name'].lstrip('v'), 'release')
elif strategy_or_version in ('latest', 'nightly'): # latest is nightly
return cls('latest', '0.0.0', 'latest')
else:
raise ValueError(f'Unknown strategy: {strategy_or_version}')
@property
def tag(self) -> str: return 'latest' if self.strategy in {'latest', 'nightly'} else repr(self.version)
__lazy = LazyModule(
__name__,
os.path.abspath('__file__'),
{
'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'],
'oci': ['CONTAINER_NAMES', 'supported_registries', 'RefResolver'],
},
{'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options']},
extra_objects={'RefResolver': RefResolver}
)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
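A hedged usage sketch of the relocated RefResolver; the import path is assumed to be openllm.bundle after this move, and the 'release' strategy needs network access to the GitHub API:

from openllm.bundle import RefResolver  # assumed import path, exposed via the lazy module above
nightly = RefResolver.from_strategy('latest')
print(nightly.tag)   # 'latest'
release = RefResolver.from_strategy('release')
print(release.tag)   # repr of the VersionInfo parsed from the latest GitHub release tag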

View File

@@ -2,7 +2,7 @@ from typing import Optional
import attr
from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
from openllm_core._typing_compat import LiteralContainerVersionStrategy
from openllm_core.utils.lazy import VersionInfo
from . import _package as _package, oci as oci
@@ -13,9 +13,6 @@ from ._package import (
create_bento as create_bento,
)
CONTAINER_NAMES: dict[LiteralContainerRegistry, str] = ...
supported_registries: list[str] = ...
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
git_hash: str
@@ -26,7 +23,3 @@ class RefResolver:
def from_strategy(cls, strategy_or_version: Optional[LiteralContainerVersionStrategy] = ...) -> RefResolver: ...
@property
def tag(self) -> str: ...
@staticmethod
def construct_base_image(
reg: LiteralContainerRegistry, strategy: Optional[LiteralContainerVersionStrategy] = ...
) -> str: ...

View File

@@ -14,60 +14,43 @@ from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions
from bentoml._internal.configuration.containers import BentoMLContainer
from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg
from . import oci
logger = logging.getLogger(__name__)
OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
_service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
_SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}'''
def build_editable(path, package='openllm'):
'''Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.'''
if not check_bool_env(OPENLLM_DEV_BUILD, default=False):
return None
if not check_bool_env(OPENLLM_DEV_BUILD, default=False): return None
# We need to build the package in editable mode, so that we can import it
# TODO: Upgrade to 1.0.3
from build import ProjectBuilder
from build.env import IsolatedEnvBuilder
module_location = pkg.source_locations(package)
if not module_location:
raise RuntimeError(
'Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.'
)
if not module_location: raise RuntimeError('Could not find the source location of OpenLLM.')
pyproject_path = Path(module_location).parent.parent / 'pyproject.toml'
if os.path.isfile(pyproject_path.__fspath__()):
logger.info('Generating built wheels for package %s...', package)
with IsolatedEnvBuilder() as env:
builder = ProjectBuilder(pyproject_path.parent)
builder.python_executable = env.executable
builder.scripts_dir = env.scripts_dir
env.install(builder.build_system_requires)
return builder.build('wheel', path, config_settings={'--global-option': '--quiet'})
raise RuntimeError(
'Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.'
)
raise RuntimeError('Please install OpenLLM from PyPI or built it from Git source.')
def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None):
packages = ['scipy', 'bentoml[tracing]>=1.1.10'] # apparently bnb misses this one
if adapter_map is not None:
packages += ['openllm[fine-tune]']
if extra_dependencies is not None:
packages += [f'openllm[{k}]' for k in extra_dependencies]
if llm.config['requirements'] is not None:
packages.extend(llm.config['requirements'])
wheels = None
from . import RefResolver
packages = ['scipy', 'bentoml[tracing]>=1.1.10', 'vllm==0.2.2', 'ray==2.6.0', f'openllm>={RefResolver.from_strategy("release").version}'] # apparently bnb misses this one
if adapter_map is not None: packages += ['openllm[fine-tune]']
if extra_dependencies is not None: packages += [f'openllm[{k}]' for k in extra_dependencies]
if llm.config['requirements'] is not None: packages.extend(llm.config['requirements'])
built_wheels = [build_editable(llm_fs.getsyspath('/'), p) for p in ('openllm_core', 'openllm_client', 'openllm')]
if all(i for i in built_wheels):
wheels = [llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in built_wheels]
return PythonOptions(packages=packages, wheels=wheels, lock_packages=True)
def construct_docker_options(
llm, _, quantize, adapter_map, dockerfile_template, serialisation, container_registry, container_version_strategy
):
return PythonOptions(
packages=packages,
wheels=[llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in built_wheels] if all(i for i in built_wheels) else None,
lock_packages=True
)
def construct_docker_options(llm, _, quantize, adapter_map, dockerfile_template, serialisation):
from openllm_cli.entrypoint import process_environ
environ = process_environ(
llm.config,
llm.config['timeout'],
@@ -84,37 +67,7 @@ def construct_docker_options(
environ['OPENLLM_CONFIG'] = f"'{environ['OPENLLM_CONFIG']}'"
environ.pop('BENTOML_HOME', None) # NOTE: irrelevant in container
environ['NVIDIA_DRIVER_CAPABILITIES'] = 'compute,utility'
return DockerOptions(
base_image=oci.RefResolver.construct_base_image(container_registry, container_version_strategy),
env=environ,
dockerfile_template=dockerfile_template,
)
_service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
_SERVICE_VARS = '''\
import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}
'''
def write_service(llm, llm_fs, adapter_map):
logger.debug('Generating service vars %s (dir=%s)', llm.model_id, llm_fs.getsyspath('/'))
script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n# fmt: off\n" + _SERVICE_VARS.format(
__model_id__=llm.model_id,
__model_tag__=str(llm.tag),
__model_adapter_map__=orjson.dumps(adapter_map).decode(),
__model_serialization__=llm.config['serialisation'],
__model_trust_remote_code__=str(llm.trust_remote_code),
)
if SHOW_CODEGEN:
logger.info('Generated _service_vars.py:\n%s', script)
llm_fs.writetext('_service_vars.py', script)
with open(_service_file.__fspath__(), 'r') as f:
service_src = f.read()
llm_fs.writetext(llm.config['service_name'], service_src)
return DockerOptions(cuda_version='12.1', env=environ, dockerfile_template=dockerfile_template)
@inject
def create_bento(
bento_tag,
@@ -125,8 +78,6 @@ def create_bento(
adapter_map=None,
extra_dependencies=None,
serialisation=None,
container_registry='ecr',
container_version_strategy='release',
_bento_store=Provide[BentoMLContainer.bento_store],
_model_store=Provide[BentoMLContainer.model_store],
):
@@ -145,11 +96,21 @@ def create_bento(
},
}
)
if adapter_map:
labels.update(adapter_map)
if adapter_map: labels.update(adapter_map)
logger.debug("Building Bento '%s' with model backend '%s'", bento_tag, llm.__llm_backend__)
# add service.py definition to this temporary folder
write_service(llm, llm_fs, adapter_map)
logger.debug('Generating service vars %s (dir=%s)', llm.model_id, llm_fs.getsyspath('/'))
script = f"# fmt: off\n# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n" + _SERVICE_VARS.format(
__model_id__=llm.model_id,
__model_tag__=str(llm.tag),
__model_adapter_map__=orjson.dumps(adapter_map).decode(),
__model_serialization__=llm.config['serialisation'],
__model_trust_remote_code__=str(llm.trust_remote_code),
)
if SHOW_CODEGEN: logger.info('Generated _service_vars.py:\n%s', script)
llm_fs.writetext('_service_vars.py', script)
with open(_service_file.__fspath__(), 'r') as f: service_src = f.read()
llm_fs.writetext(llm.config['service_name'], service_src)
bento = bentoml.Bento.create(
version=bento_tag.version,
@@ -163,16 +124,7 @@ def create_bento(
include=list(llm_fs.walk.files()),
exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
docker=construct_docker_options(
llm,
llm_fs,
quantize,
adapter_map,
dockerfile_template,
_serialisation,
container_registry,
container_version_strategy,
),
docker=construct_docker_options(llm, llm_fs, quantize, adapter_map, dockerfile_template, _serialisation),
),
)
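For reference, a sketch of what the inlined service-vars codegen above renders; the model id, tag, and serialisation values below are made up for illustration:

import orjson
_SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}'''
script = "# fmt: off\n# GENERATED BY 'openllm build facebook/opt-125m'. DO NOT EDIT\n" + _SERVICE_VARS.format(
  __model_id__='facebook/opt-125m',                   # made-up model id
  __model_tag__='pt-facebook-opt-125m:latest',        # made-up Bento model tag
  __model_adapter_map__=orjson.dumps(None).decode(),  # no adapters -> 'null'
  __model_serialization__='safetensors',
  __model_trust_remote_code__=str(False),
)
# the last line of `script` now reads:
# import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='facebook/opt-125m','pt-facebook-opt-125m:latest',orjson.loads("""null"""),'safetensors',False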

View File

@@ -8,8 +8,6 @@ from bentoml._internal.bento import BentoStore
from bentoml._internal.bento.build_config import DockerOptions, PythonOptions
from bentoml._internal.models.model import ModelStore
from openllm_core._typing_compat import (
LiteralContainerRegistry,
LiteralContainerVersionStrategy,
LiteralQuantise,
LiteralSerialisation,
M,
@@ -32,10 +30,7 @@ def construct_docker_options(
adapter_map: Optional[Dict[str, str]],
dockerfile_template: Optional[str],
serialisation: LiteralSerialisation,
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy,
) -> DockerOptions: ...
def write_service(llm: LLM[M, T], llm_fs: FS, adapter_map: Optional[Dict[str, str]]) -> None: ...
def create_bento(
bento_tag: Tag,
llm_fs: FS,
@@ -45,8 +40,6 @@ def create_bento(
adapter_map: Optional[Dict[str, str]] = ...,
extra_dependencies: Optional[Tuple[str, ...]] = ...,
serialisation: Optional[LiteralSerialisation] = ...,
container_registry: LiteralContainerRegistry = ...,
container_version_strategy: LiteralContainerVersionStrategy = ...,
_bento_store: BentoStore = ...,
_model_store: ModelStore = ...,
) -> Bento: ...

View File

@@ -1,45 +0,0 @@
# syntax=docker/dockerfile-upstream:master
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 as base-container
# Automatically set by buildx
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
ccache \
curl \
libssl-dev ca-certificates make \
git python3-pip && \
rm -rf /var/lib/apt/lists/*
RUN mkdir -p /openllm-python
RUN mkdir -p /openllm-core
RUN mkdir -p /openllm-client
# Install required dependencies
COPY openllm-python/src /openllm-python/src
COPY hatch.toml README.md CHANGELOG.md openllm-python/pyproject.toml /openllm-python/
# Install all required dependencies
# We have to install autoawq first to avoid conflict with torch, then reinstall torch with vllm
# below
RUN --mount=type=cache,target=/root/.cache/pip \
pip3 install -v --no-cache-dir \
"ray==2.6.0" "vllm==0.2.2" xformers && \
pip3 install --no-cache-dir -e /openllm-python/
COPY openllm-core/src openllm-core/src
COPY hatch.toml README.md CHANGELOG.md openllm-core/pyproject.toml /openllm-core/
RUN --mount=type=cache,target=/root/.cache/pip pip3 install -v --no-cache-dir -e /openllm-core/
COPY openllm-client/src openllm-client/src
COPY hatch.toml README.md CHANGELOG.md openllm-client/pyproject.toml /openllm-client/
RUN --mount=type=cache,target=/root/.cache/pip pip3 install -v --no-cache-dir -e /openllm-client/
FROM base-container
ENTRYPOINT ["python3", "-m", "openllm"]

View File

@@ -1,82 +0,0 @@
from __future__ import annotations
import functools
import importlib
import logging
import os
import pathlib
import attr
from openllm_core._typing_compat import LiteralContainerVersionStrategy
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils.lazy import VersionInfo
logger = logging.getLogger(__name__)
ROOT_DIR = pathlib.Path(os.path.abspath('__file__')).parent.parent.parent
_CONTAINER_REGISTRY = {
'docker': 'docker.io/bentoml/openllm',
'gh': 'ghcr.io/bentoml/openllm',
'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm',
}
# TODO: support custom fork. Currently it only support openllm main.
_OWNER, _REPO = 'bentoml', 'openllm'
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
git_hash: str = attr.field()
version: VersionInfo = attr.field(converter=lambda s: VersionInfo.from_version_string(s))
strategy: LiteralContainerVersionStrategy = attr.field()
@classmethod
@functools.lru_cache(maxsize=64)
def from_strategy(cls, strategy_or_version=None):
# using default strategy
if strategy_or_version is None or strategy_or_version == 'release':
try:
from ghapi.all import GhApi
ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
meta = ghapi.repos.get_latest_release()
git_hash = ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha']
except Exception as err:
raise OpenLLMException('Failed to determine latest release version.') from err
return cls(git_hash=git_hash, version=meta['name'].lstrip('v'), strategy='release')
elif strategy_or_version in ('latest', 'nightly'): # latest is nightly
return cls(git_hash='latest', version='0.0.0', strategy='latest')
else:
raise ValueError(f'Unknown strategy: {strategy_or_version}')
@property
def tag(self):
return 'latest' if self.strategy in {'latest', 'nightly'} else repr(self.version)
@staticmethod
def construct_base_image(reg, strategy=None):
if reg == 'gh':
logger.warning("Setting base registry to 'gh' will affect cold start performance on GCP/AWS.")
elif reg == 'docker':
logger.warning('docker is base image is yet to be supported. Falling back to "ecr".')
reg = 'ecr'
return f'{_CONTAINER_REGISTRY[reg]}:{RefResolver.from_strategy(strategy).tag}'
__all__ = ['CONTAINER_NAMES', 'RefResolver', 'supported_registries']
def __dir__():
return sorted(__all__)
def __getattr__(name):
if name == 'supported_registries':
return functools.lru_cache(1)(lambda: list(_CONTAINER_REGISTRY))()
elif name == 'CONTAINER_NAMES':
return _CONTAINER_REGISTRY
elif name in __all__:
return importlib.import_module('.' + name, __name__)
else:
raise AttributeError(f'{name} does not exists under {__name__}')