Mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-01-17 03:47:54 -05:00)
fix(base-image): update base image to include cuda for now (#720)
* fix(base-image): update base image to include cuda for now
  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
* fix: build core and client on release images
  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
* chore: cleanup style changes
  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---------
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
@@ -1,4 +1,4 @@
-"""OpenLLM CLI.
+'''OpenLLM CLI.

 For more information see ``openllm -h``.
-"""
+'''
@@ -146,7 +146,7 @@ def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC
 backend_option(factory=cog.optgroup),
 cog.optgroup.group(
 'LLM Optimization Options',
-help="""Optimization related options.
+help='''Optimization related options.

 OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.

@@ -154,7 +154,7 @@ def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC

 - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
 - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
-""",
+''',
 ),
 quantize_option(factory=cog.optgroup),
 serialisation_option(factory=cog.optgroup),
@@ -196,7 +196,7 @@ _IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}


 def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
-"""Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`."""
+'''Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`.'''
 from bentoml_cli.cli import cli

 command = 'serve' if not serve_grpc else 'serve-grpc'
@@ -233,11 +233,11 @@ _http_server_args, _grpc_server_args = parse_serve_args(False), parse_serve_args


 def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]:
-"""General ``@click`` decorator with some sauce.
+'''General ``@click`` decorator with some sauce.

 This decorator extends the default ``@click.option`` plus a factory option and factory attr to
 provide type-safe click.option or click.argument wrapper for all compatible factory.
-"""
+'''
 factory = attrs.pop('factory', click)
 factory_attr = attrs.pop('attr', 'option')
 if factory_attr != 'argument':
@@ -346,7 +346,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
 default=None,
 envvar='OPENLLM_QUANTIZE',
 show_envvar=True,
-help="""Dynamic quantization for running this LLM.
+help='''Dynamic quantization for running this LLM.

 The following quantization strategies are supported:

@@ -361,15 +361,15 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
 - ``squeezellm``: ``SqueezeLLM`` [SqueezeLLM: Dense-and-Sparse Quantization](https://arxiv.org/abs/2306.07629)

 > [!NOTE] that the model can also be served with quantized weights.
-"""
+'''
 + (
-"""
-> [!NOTE] that this will set the mode for serving within deployment."""
+'''
+> [!NOTE] that this will set the mode for serving within deployment.'''
 if build
 else ''
 )
-+ """
-> [!NOTE] that quantization are currently only available in *PyTorch* models.""",
++ '''
+> [!NOTE] that quantization are currently only available in *PyTorch* models.''',
 **attrs,
 )(f)

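For reference, the quantization option documented in this help text is passed on the command line or through the ``OPENLLM_QUANTIZE`` environment variable it declares. A minimal sketch, not part of the diff; the model id is an illustrative placeholder:

```bash
# Sketch only: quantize a model at startup with the --quantize flag described above.
openllm start facebook/opt-1.3b --quantize int8

# Equivalent configuration via the declared environment variable.
OPENLLM_QUANTIZE=int8 openllm start facebook/opt-1.3b
```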
@@ -383,7 +383,7 @@ def workers_per_resource_option(
 callback=workers_per_resource_callback,
 type=str,
 required=False,
-help="""Number of workers per resource assigned.
+help='''Number of workers per resource assigned.

 See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
 for more information. By default, this is set to 1.
@@ -393,7 +393,7 @@ def workers_per_resource_option(
 - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.

 - ``conserved``: This will determine the number of available GPU resources. For example, if there are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
-"""
+'''
 + (
 """\n
 > [!NOTE] The workers value passed into 'build' will determine how the LLM can
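The scheduling strategies described in this help text map onto numeric values of ``--workers-per-resource``. A hedged sketch, assuming a 4-GPU host and a placeholder model id:

```bash
# Sketch only: one worker per GPU (the ``round_robin`` behaviour).
openllm start facebook/opt-1.3b --workers-per-resource 1

# Sketch only: a single worker spanning all 4 GPUs, which is what ``conserved`` resolves to on a 4-GPU host.
openllm start facebook/opt-1.3b --workers-per-resource 0.25
```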
@@ -416,7 +416,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
 show_default=True,
 show_envvar=True,
 envvar='OPENLLM_SERIALIZATION',
-help="""Serialisation format for save/load LLM.
+help='''Serialisation format for save/load LLM.

 Currently the following strategies are supported:

@@ -425,7 +425,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
 > [!NOTE] Safetensors might not work for every case, and you can always fallback to ``legacy`` if needed.

 - ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files. This should be used if the model doesn't yet support safetensors.
-""",
+''',
 **attrs,
 )(f)

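A hedged usage sketch for the serialisation formats listed above; the flag name follows the ``serialisation_option`` helper and its ``OPENLLM_SERIALIZATION`` environment variable, and the model id is a placeholder:

```bash
# Sketch only: default safetensors serialisation.
openllm start facebook/opt-1.3b --serialisation safetensors

# Sketch only: fall back to legacy .bin checkpoints for models without safetensors weights.
OPENLLM_SERIALIZATION=legacy openllm start facebook/opt-1.3b
```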
@@ -291,7 +291,7 @@ def _import_model(


 def _list_models() -> dict[str, t.Any]:
-"""List all available models within the local store."""
+'''List all available models within the local store.'''
 from .entrypoint import models_command

 return models_command.main(args=['--quiet'], standalone_mode=False)
@@ -94,14 +94,14 @@ else:

 P = ParamSpec('P')
 logger = logging.getLogger('openllm')
-OPENLLM_FIGLET = """\
+OPENLLM_FIGLET = '''\
 ██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗
 ██╔═══██╗██╔══██╗██╔════╝████╗ ██║██║ ██║ ████╗ ████║
 ██║ ██║██████╔╝█████╗ ██╔██╗ ██║██║ ██║ ██╔████╔██║
 ██║ ██║██╔═══╝ ██╔══╝ ██║╚██╗██║██║ ██║ ██║╚██╔╝██║
 ╚██████╔╝██║ ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
 ╚═════╝ ╚═╝ ╚══════╝╚═╝ ╚═══╝╚══════╝╚══════╝╚═╝ ╚═╝
-"""
+'''

 ServeCommand = t.Literal['serve', 'serve-grpc']

@@ -287,7 +287,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
 return decorator

 def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None:
-"""Additional format methods that include extensions as well as the default cli command."""
+'''Additional format methods that include extensions as well as the default cli command.'''
 from gettext import gettext as _

 commands: list[tuple[str, click.Command]] = []
@@ -334,7 +334,7 @@ _PACKAGE_NAME = 'openllm'
 message=f'{_PACKAGE_NAME}, %(version)s (compiled: {openllm.COMPILED})\nPython ({platform.python_implementation()}) {platform.python_version()}',
 )
 def cli() -> None:
-"""\b
+'''\b
 ██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗
 ██╔═══██╗██╔══██╗██╔════╝████╗ ██║██║ ██║ ████╗ ████║
 ██║ ██║██████╔╝█████╗ ██╔██╗ ██║██║ ██║ ██╔████╔██║
@@ -345,7 +345,7 @@ def cli() -> None:
 \b
 An open platform for operating large language models in production.
 Fine-tune, serve, deploy, and monitor any LLMs with ease.
-"""
+'''


 @cli.command(
@@ -389,13 +389,13 @@ def start_command(
 max_model_len: int | None,
 **attrs: t.Any,
 ) -> LLMConfig | subprocess.Popen[bytes]:
-"""Start any LLM as a REST server.
+'''Start any LLM as a REST server.

 \b
 ```bash
 $ openllm <start|start-http> <model_id> --<options> ...
 ```
-"""
+'''
 if model_id in openllm.CONFIG_MAPPING:
 _model_name = model_id
 if deprecated_model_id is not None:
@@ -519,13 +519,13 @@ def start_grpc_command(
 max_model_len: int | None,
 **attrs: t.Any,
 ) -> LLMConfig | subprocess.Popen[bytes]:
-"""Start any LLM as a gRPC server.
+'''Start any LLM as a gRPC server.

 \b
 ```bash
 $ openllm start-grpc <model_id> --<options> ...
 ```
-"""
+'''
 termui.warning(
 'Continuous batching is currently not yet supported with gRPC. If you want to use continuous batching with gRPC, feel free to open a GitHub issue about your usecase.\n'
 )
@@ -955,7 +955,7 @@ def build_command(
 force_push: bool,
 **_: t.Any,
 ) -> BuildBentoOutput:
-"""Package a given models into a BentoLLM.
+'''Package a given models into a BentoLLM.

 \b
 ```bash
@@ -971,7 +971,7 @@ def build_command(
 > [!IMPORTANT]
 > To build the bento with compiled OpenLLM, make sure to prepend HATCH_BUILD_HOOKS_ENABLE=1. Make sure that the deployment
 > target also use the same Python version and architecture as build machine.
-"""
+'''
 from openllm.serialisation.transformers.weights import has_safetensors_weights

 if model_id in openllm.CONFIG_MAPPING:
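The note above about ``HATCH_BUILD_HOOKS_ENABLE=1`` translates to invocations roughly like the following sketch; the model id is a placeholder and the command is the ``openllm build`` entrypoint shown earlier in this diff:

```bash
# Sketch only: package a model into a BentoLLM.
openllm build facebook/opt-1.3b

# Sketch only: build with compiled OpenLLM by prepending the hatch build hook flag,
# keeping the Python version and architecture identical to the deployment target.
HATCH_BUILD_HOOKS_ENABLE=1 openllm build facebook/opt-1.3b
```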
@@ -1167,13 +1167,13 @@ class ModelItem(t.TypedDict):
 @cli.command()
 @click.option('--show-available', is_flag=True, default=True, hidden=True)
 def models_command(**_: t.Any) -> dict[t.LiteralString, ModelItem]:
-"""List all supported models.
+'''List all supported models.

 \b
 ```bash
 openllm models
 ```
-"""
+'''
 result: dict[t.LiteralString, ModelItem] = {
 m: ModelItem(
 architecture=config.__openllm_architecture__,
@@ -1216,11 +1216,11 @@ def prune_command(
 bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
 **_: t.Any,
 ) -> None:
-"""Remove all saved models, and bentos built with OpenLLM locally.
+'''Remove all saved models, and bentos built with OpenLLM locally.

 \b
 If a model type is passed, then only prune models for that given model type.
-"""
+'''
 available: list[tuple[bentoml.Model | bentoml.Bento, ModelStore | BentoStore]] = [
 (m, model_store)
 for m in bentoml.models.list()
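For context, the prune behaviour described in this docstring maps to invocations along these lines; this is a sketch and the model type argument is illustrative:

```bash
# Sketch only: remove all OpenLLM models and bentos from the local stores.
openllm prune

# Sketch only: limit pruning to a single model type.
openllm prune opt
```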
@@ -1326,13 +1326,13 @@ def query_command(
 _memoized: DictStrAny,
 **_: t.Any,
 ) -> None:
-"""Query a LLM interactively, from a terminal.
+'''Query a LLM interactively, from a terminal.

 \b
 ```bash
 $ openllm query --endpoint http://12.323.2.1:3000 "What is the meaning of life?"
 ```
-"""
+'''
 if server_type == 'grpc':
 raise click.ClickException("'grpc' is currently disabled.")
 _memoized = {k: orjson.loads(v[0]) for k, v in _memoized.items() if v}
@@ -1353,7 +1353,7 @@ def query_command(

 @cli.group(cls=Extensions, hidden=True, name='extension')
 def extension_command() -> None:
-"""Extension for OpenLLM CLI."""
+'''Extension for OpenLLM CLI.'''


 if __name__ == '__main__':
@@ -71,7 +71,7 @@ def build_container(
 @click.command(
 'build_base_container',
 context_settings=termui.CONTEXT_SETTINGS,
-help="""Base image builder for BentoLLM.
+help='''Base image builder for BentoLLM.

 By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
 Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.
@@ -81,7 +81,7 @@ def build_container(
 This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels.

 Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
-""",
+''',
 )
 @container_registry_option
 @click.option(
@@ -24,7 +24,7 @@ if t.TYPE_CHECKING:
 def cli(
 ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
 ) -> str | None:
-"""Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path)."""
+'''Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path).'''
 try:
 bentomodel = _bento_store.get(bento)
 except bentoml.exceptions.NotFound:
@@ -13,7 +13,7 @@ from openllm_cli import termui
 @click.command('list_bentos', context_settings=termui.CONTEXT_SETTINGS)
 @click.pass_context
 def cli(ctx: click.Context) -> None:
-"""List available bentos built by OpenLLM."""
+'''List available bentos built by OpenLLM.'''
 mapping = {
 k: [
 {
@@ -18,7 +18,7 @@ if t.TYPE_CHECKING:
 @click.command('list_models', context_settings=termui.CONTEXT_SETTINGS)
 @model_name_argument(required=False, shell_complete=model_complete_envvar)
 def cli(model_name: str | None) -> DictStrAny:
-"""This is equivalent to openllm models --show-available less the nice table."""
+'''This is equivalent to openllm models --show-available less the nice table.'''
 models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
 ids_in_local_store = {
 k: [