fix(base-image): update base image to include cuda for now (#720)

* fix(base-image): update base image to include cuda for now

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* fix: build core and client on release images

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: cleanup style changes

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Author: Aaron Pham
Date: 2023-11-22 01:15:19 -05:00
Committed by: GitHub
Parent: 8bb2742a9a
Commit: 38b7c44df0
41 changed files with 913 additions and 613 deletions

@@ -1,4 +1,4 @@
-"""OpenLLM CLI.
+'''OpenLLM CLI.
 For more information see ``openllm -h``.
-"""
+'''

@@ -146,7 +146,7 @@ def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC
 backend_option(factory=cog.optgroup),
 cog.optgroup.group(
 'LLM Optimization Options',
-help="""Optimization related options.
+help='''Optimization related options.
 OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
@@ -154,7 +154,7 @@ def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC
 - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
 - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
-""",
+''',
 ),
 quantize_option(factory=cog.optgroup),
 serialisation_option(factory=cog.optgroup),
@@ -196,7 +196,7 @@ _IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}
 def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
-"""Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`."""
+'''Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`.'''
 from bentoml_cli.cli import cli
 command = 'serve' if not serve_grpc else 'serve-grpc'
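
For readers unfamiliar with the pattern, a minimal sketch of what ``parse_serve_args`` describes in the hunk above follows. It assumes ``bentoml_cli.cli.cli`` is a regular ``click.Group`` whose ``serve``/``serve-grpc`` subcommands expose their parameters through ``.params``; it is not the actual implementation.

```python
# Hedged sketch: reuse the click parameters of `bentoml serve`/`serve-grpc`
# for `openllm start`, skipping the options OpenLLM handles itself.
from bentoml_cli.cli import cli

_IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}

def reusable_serve_params(serve_grpc: bool = False) -> list:
  command = 'serve' if not serve_grpc else 'serve-grpc'
  serve_command = cli.commands[command]  # assumes `cli` is a click.Group
  return [p for p in serve_command.params if p.name not in _IGNORED_OPTIONS]

if __name__ == '__main__':
  print([p.name for p in reusable_serve_params()])
```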
@@ -233,11 +233,11 @@ _http_server_args, _grpc_server_args = parse_serve_args(False), parse_serve_args
 def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]:
-"""General ``@click`` decorator with some sauce.
+'''General ``@click`` decorator with some sauce.
 This decorator extends the default ``@click.option`` plus a factory option and factory attr to
 provide type-safe click.option or click.argument wrapper for all compatible factory.
-"""
+'''
 factory = attrs.pop('factory', click)
 factory_attr = attrs.pop('attr', 'option')
 if factory_attr != 'argument':
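
A minimal sketch of the factory dispatch this docstring describes, assuming only the two ``attrs.pop`` defaults visible above; the real decorator additionally fills in defaults for non-argument options.

```python
import typing as t

import click

def click_factory_type(*param_decls: t.Any, **attrs: t.Any):
  # 'factory' defaults to click itself; 'attr' selects .option vs .argument,
  # mirroring the two attrs.pop(...) calls in the hunk above.
  factory = attrs.pop('factory', click)
  factory_attr = attrs.pop('attr', 'option')
  return getattr(factory, factory_attr)(*param_decls, **attrs)

# Behaves like @click.option('--quantize', default=None) when applied to a command.
quantize_decorator = click_factory_type('--quantize', default=None)
```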
@@ -346,7 +346,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
 default=None,
 envvar='OPENLLM_QUANTIZE',
 show_envvar=True,
-help="""Dynamic quantization for running this LLM.
+help='''Dynamic quantization for running this LLM.
 The following quantization strategies are supported:
@@ -361,15 +361,15 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
 - ``squeezellm``: ``SqueezeLLM`` [SqueezeLLM: Dense-and-Sparse Quantization](https://arxiv.org/abs/2306.07629)
 > [!NOTE] that the model can also be served with quantized weights.
-"""
+'''
 + (
-"""
-> [!NOTE] that this will set the mode for serving within deployment."""
+'''
+> [!NOTE] that this will set the mode for serving within deployment.'''
 if build
 else ''
 )
-+ """
-> [!NOTE] that quantization are currently only available in *PyTorch* models.""",
++ '''
+> [!NOTE] that quantization are currently only available in *PyTorch* models.''',
 **attrs,
 )(f)
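
The quote changes above make the conditional concatenation hard to follow, so here is a hedged, abbreviated sketch of how the final help text is assembled depending on ``build``.

```python
def quantize_help(build: bool = False) -> str:
  # Abbreviated stand-ins for the three pieces concatenated in the hunk above.
  base = 'Dynamic quantization for running this LLM. ...'
  build_note = "\n> [!NOTE] that this will set the mode for serving within deployment." if build else ''
  trailing = '\n> [!NOTE] that quantization are currently only available in *PyTorch* models.'
  return base + build_note + trailing

print(quantize_help(build=True))
```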
@@ -383,7 +383,7 @@ def workers_per_resource_option(
 callback=workers_per_resource_callback,
 type=str,
 required=False,
-help="""Number of workers per resource assigned.
+help='''Number of workers per resource assigned.
 See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
 for more information. By default, this is set to 1.
@@ -393,7 +393,7 @@ def workers_per_resource_option(
 - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
 - ``conserved``: This will determine the number of available GPU resources. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
-"""
+'''
 + (
 """\n
 > [!NOTE] The workers value passed into 'build' will determine how the LLM can
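
As a rough illustration of the strategies listed in the hunk above (an assumption about what ``workers_per_resource_callback`` ultimately resolves, not its actual code):

```python
def resolve_workers_per_resource(value: str, available_gpus: int) -> float:
  if value == 'round_robin':
    return 1.0  # same behaviour as --workers-per-resource 1
  if value == 'conserved':
    return 1.0 / available_gpus  # e.g. 0.25 when 4 GPUs are available
  return float(value)  # a plain number such as '0.5' passes through

print(resolve_workers_per_resource('conserved', 4))
```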
@@ -416,7 +416,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
 show_default=True,
 show_envvar=True,
 envvar='OPENLLM_SERIALIZATION',
-help="""Serialisation format for save/load LLM.
+help='''Serialisation format for save/load LLM.
 Currently the following strategies are supported:
@@ -425,7 +425,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
 > [!NOTE] Safetensors might not work for every cases, and you can always fallback to ``legacy`` if needed.
 - ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files. This should be used if the model doesn't yet support safetensors.
-""",
+''',
 **attrs,
 )(f)
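
Both options above advertise environment variables (``show_envvar=True``), so a launcher can set them instead of passing flags. A hedged sketch follows; only the variable names come from the hunks above, while ``int8`` and the model id are illustrative placeholders.

```python
import os
import subprocess

# OPENLLM_QUANTIZE / OPENLLM_SERIALIZATION are the envvars declared above;
# 'int8' and the model id are placeholders for this sketch.
env = dict(os.environ, OPENLLM_QUANTIZE='int8', OPENLLM_SERIALIZATION='legacy')
subprocess.run(['openllm', 'start', 'facebook/opt-125m'], env=env, check=False)
```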

@@ -291,7 +291,7 @@ def _import_model(
 def _list_models() -> dict[str, t.Any]:
-"""List all available models within the local store."""
+'''List all available models within the local store.'''
 from .entrypoint import models_command
 return models_command.main(args=['--quiet'], standalone_mode=False)
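
Because the command is invoked with ``standalone_mode=False``, click returns the command's value instead of exiting, which is what makes the one-liner above work. A sketch of the same call from outside the package; the absolute import path is an assumption for the relative ``.entrypoint`` import.

```python
from openllm_cli.entrypoint import models_command  # assumed absolute path

available = models_command.main(args=['--quiet'], standalone_mode=False)
print(sorted(available))
```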

@@ -94,14 +94,14 @@ else:
 P = ParamSpec('P')
 logger = logging.getLogger('openllm')
-OPENLLM_FIGLET = """\
+OPENLLM_FIGLET = '''\
 ██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗
 ██╔═══██╗██╔══██╗██╔════╝████╗ ██║██║ ██║ ████╗ ████║
 ██║ ██║██████╔╝█████╗ ██╔██╗ ██║██║ ██║ ██╔████╔██║
 ██║ ██║██╔═══╝ ██╔══╝ ██║╚██╗██║██║ ██║ ██║╚██╔╝██║
 ╚██████╔╝██║ ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
 ╚═════╝ ╚═╝ ╚══════╝╚═╝ ╚═══╝╚══════╝╚══════╝╚═╝ ╚═╝
-"""
+'''
 ServeCommand = t.Literal['serve', 'serve-grpc']
@@ -287,7 +287,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
 return decorator
 def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None:
-"""Additional format methods that include extensions as well as the default cli command."""
+'''Additional format methods that include extensions as well as the default cli command.'''
 from gettext import gettext as _
 commands: list[tuple[str, click.Command]] = []
@@ -334,7 +334,7 @@ _PACKAGE_NAME = 'openllm'
 message=f'{_PACKAGE_NAME}, %(version)s (compiled: {openllm.COMPILED})\nPython ({platform.python_implementation()}) {platform.python_version()}',
 )
 def cli() -> None:
-"""\b
+'''\b
 ██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗
 ██╔═══██╗██╔══██╗██╔════╝████╗ ██║██║ ██║ ████╗ ████║
 ██║ ██║██████╔╝█████╗ ██╔██╗ ██║██║ ██║ ██╔████╔██║
@@ -345,7 +345,7 @@ def cli() -> None:
 \b
 An open platform for operating large language models in production.
 Fine-tune, serve, deploy, and monitor any LLMs with ease.
-"""
+'''
 @cli.command(
@@ -389,13 +389,13 @@ def start_command(
 max_model_len: int | None,
 **attrs: t.Any,
 ) -> LLMConfig | subprocess.Popen[bytes]:
-"""Start any LLM as a REST server.
+'''Start any LLM as a REST server.
 \b
 ```bash
 $ openllm <start|start-http> <model_id> --<options> ...
 ```
-"""
+'''
 if model_id in openllm.CONFIG_MAPPING:
 _model_name = model_id
 if deprecated_model_id is not None:
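
The branch at the end of this hunk hints at how the positional argument is resolved; the sketch below is an assumption based on ``openllm.CONFIG_MAPPING`` and the ``deprecated_model_id`` parameter, not the command's actual code.

```python
import openllm

def describe(argument: str) -> str:
  # A value found in CONFIG_MAPPING is treated as a model *name* (the legacy
  # two-argument form); anything else is treated as the model id.
  if argument in openllm.CONFIG_MAPPING:
    return f'{argument!r} is a model name; a separate model id may follow'
  return f'{argument!r} is treated as the model id'

print(describe('opt'))
print(describe('facebook/opt-125m'))
```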
@@ -519,13 +519,13 @@ def start_grpc_command(
 max_model_len: int | None,
 **attrs: t.Any,
 ) -> LLMConfig | subprocess.Popen[bytes]:
-"""Start any LLM as a gRPC server.
+'''Start any LLM as a gRPC server.
 \b
 ```bash
 $ openllm start-grpc <model_id> --<options> ...
 ```
-"""
+'''
 termui.warning(
 'Continuous batching is currently not yet supported with gPRC. If you want to use continuous batching with gRPC, feel free to open a GitHub issue about your usecase.\n'
 )
@@ -955,7 +955,7 @@ def build_command(
 force_push: bool,
 **_: t.Any,
 ) -> BuildBentoOutput:
-"""Package a given models into a BentoLLM.
+'''Package a given models into a BentoLLM.
 \b
 ```bash
@@ -971,7 +971,7 @@ def build_command(
 > [!IMPORTANT]
 > To build the bento with compiled OpenLLM, make sure to prepend HATCH_BUILD_HOOKS_ENABLE=1. Make sure that the deployment
 > target also use the same Python version and architecture as build machine.
-"""
+'''
 from openllm.serialisation.transformers.weights import has_safetensors_weights
 if model_id in openllm.CONFIG_MAPPING:
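
The ``HATCH_BUILD_HOOKS_ENABLE=1`` note above translates to roughly the following; the model id is a placeholder and ``openllm build`` is assumed to be the CLI name of ``build_command``.

```python
import os
import subprocess

# Prepend HATCH_BUILD_HOOKS_ENABLE=1 so the bento is built with compiled OpenLLM,
# as the [!IMPORTANT] note above describes.
env = dict(os.environ, HATCH_BUILD_HOOKS_ENABLE='1')
subprocess.run(['openllm', 'build', 'facebook/opt-125m'], env=env, check=False)
```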
@@ -1167,13 +1167,13 @@ class ModelItem(t.TypedDict):
 @cli.command()
 @click.option('--show-available', is_flag=True, default=True, hidden=True)
 def models_command(**_: t.Any) -> dict[t.LiteralString, ModelItem]:
-"""List all supported models.
+'''List all supported models.
 \b
 ```bash
 openllm models
 ```
-"""
+'''
 result: dict[t.LiteralString, ModelItem] = {
 m: ModelItem(
 architecture=config.__openllm_architecture__,
@@ -1216,11 +1216,11 @@ def prune_command(
 bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
 **_: t.Any,
 ) -> None:
-"""Remove all saved models, and bentos built with OpenLLM locally.
+'''Remove all saved models, and bentos built with OpenLLM locally.
 \b
 If a model type is passed, then only prune models for that given model type.
-"""
+'''
 available: list[tuple[bentoml.Model | bentoml.Bento, ModelStore | BentoStore]] = [
 (m, model_store)
 for m in bentoml.models.list()
@@ -1326,13 +1326,13 @@ def query_command(
 _memoized: DictStrAny,
 **_: t.Any,
 ) -> None:
-"""Query a LLM interactively, from a terminal.
+'''Query a LLM interactively, from a terminal.
 \b
 ```bash
 $ openllm query --endpoint http://12.323.2.1:3000 "What is the meaning of life?"
 ```
-"""
+'''
 if server_type == 'grpc':
 raise click.ClickException("'grpc' is currently disabled.")
 _memoized = {k: orjson.loads(v[0]) for k, v in _memoized.items() if v}
@@ -1353,7 +1353,7 @@ def query_command(
 @cli.group(cls=Extensions, hidden=True, name='extension')
 def extension_command() -> None:
-"""Extension for OpenLLM CLI."""
+'''Extension for OpenLLM CLI.'''
 if __name__ == '__main__':

@@ -71,7 +71,7 @@ def build_container(
 @click.command(
 'build_base_container',
 context_settings=termui.CONTEXT_SETTINGS,
-help="""Base image builder for BentoLLM.
+help='''Base image builder for BentoLLM.
 By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
 Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.
@@ -81,7 +81,7 @@ def build_container(
 This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels.
 Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
-""",
+''',
 )
 @container_registry_option
 @click.option(

@@ -24,7 +24,7 @@ if t.TYPE_CHECKING:
 def cli(
 ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
 ) -> str | None:
-"""Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path)."""
+'''Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path).'''
 try:
 bentomodel = _bento_store.get(bento)
 except bentoml.exceptions.NotFound:

@@ -13,7 +13,7 @@ from openllm_cli import termui
 @click.command('list_bentos', context_settings=termui.CONTEXT_SETTINGS)
 @click.pass_context
 def cli(ctx: click.Context) -> None:
-"""List available bentos built by OpenLLM."""
+'''List available bentos built by OpenLLM.'''
 mapping = {
 k: [
 {

@@ -18,7 +18,7 @@ if t.TYPE_CHECKING:
 @click.command('list_models', context_settings=termui.CONTEXT_SETTINGS)
 @model_name_argument(required=False, shell_complete=model_complete_envvar)
 def cli(model_name: str | None) -> DictStrAny:
-"""This is equivalent to openllm models --show-available less the nice table."""
+'''This is equivalent to openllm models --show-available less the nice table.'''
 models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
 ids_in_local_store = {
 k: [