mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-01-18 20:41:11 -05:00
fix(cli): correct set arguments for openllm import and openllm build (#775)
* fix(cli): correct set arguments for `openllm import` and `openllm build`

  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: update changelog

  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -11,6 +11,7 @@ from openllm_core._typing_compat import (
|
||||
LiteralBackend,
|
||||
LiteralSerialisation,
|
||||
ParamSpec,
|
||||
AnyCallable,
|
||||
get_literal_args,
|
||||
)
|
||||
from openllm_core.utils import DEBUG, compose, dantic, resolve_user_filepath
|
||||
@@ -25,7 +26,7 @@ class _OpenLLM_GenericInternalConfig(LLMConfig):
|
||||
|
||||
class GenerationConfig:
|
||||
top_k: int = 15
|
||||
top_p: float = 0.9
|
||||
top_p: float = 0.78
|
||||
temperature: float = 0.75
|
||||
max_new_tokens: int = 128
|
||||
|
||||
@@ -118,21 +119,22 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...
|
||||
ctx.params[_adapter_mapping_key][adapter_id] = name
|
||||
return None
|
||||
|
||||
def optimization_decorator(fn: FC, *, factory=click, _eager=True) -> FC | list[AnyCallable]:
|
||||
shared = [
|
||||
dtype_option(factory=factory), model_version_option(factory=factory), #
|
||||
backend_option(factory=factory), quantize_option(factory=factory), #
|
||||
serialisation_option(factory=factory),
|
||||
]
|
||||
if not _eager: return shared
|
||||
return compose(*shared)(fn)
|
||||
|
||||
def start_decorator(fn: FC) -> FC:
|
||||
composed = compose(
|
||||
_OpenLLM_GenericInternalConfig.parse,
|
||||
_http_server_args,
|
||||
cog.optgroup.group('General LLM Options', help='The following options are related to running LLM Server.'),
|
||||
dtype_option(factory=cog.optgroup),
|
||||
model_version_option(factory=cog.optgroup),
|
||||
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
|
||||
workers_per_resource_option(factory=cog.optgroup),
|
||||
cors_option(factory=cog.optgroup),
|
||||
backend_option(factory=cog.optgroup),
|
||||
parse_serve_args(),
|
||||
cog.optgroup.group(
|
||||
'LLM Optimization Options',
|
||||
help='''Optimization related options.
|
||||
'LLM Options',
|
||||
help='''The following options are related to running LLM Server as well as optimization options.
|
||||
|
||||
OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
|
||||
|
||||
@@ -140,10 +142,12 @@ def start_decorator(fn: FC) -> FC:
|
||||
|
||||
- DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
|
||||
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
|
||||
''',
|
||||
''',
|
||||
),
|
||||
quantize_option(factory=cog.optgroup),
|
||||
serialisation_option(factory=cog.optgroup),
|
||||
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
|
||||
workers_per_resource_option(factory=cog.optgroup),
|
||||
cors_option(factory=cog.optgroup),
|
||||
*optimization_decorator(fn, factory=cog.optgroup, _eager=False),
|
||||
cog.optgroup.option(
|
||||
'--device',
|
||||
type=dantic.CUDA,
|
||||
@@ -200,8 +204,6 @@ def parse_serve_args() -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[F
|
||||
return group(f)
|
||||
return decorator
|
||||
|
||||
_http_server_args = parse_serve_args()
|
||||
|
||||
def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]:
|
||||
'''General ``@click`` decorator with some sauce.
|
||||
|
||||
@@ -234,7 +236,8 @@ def adapter_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callab
|
||||
multiple=True,
|
||||
callback=_id_callback,
|
||||
metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]',
|
||||
)
|
||||
**attrs,
|
||||
)(f)
|
||||
|
||||
|
||||
def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
@@ -291,8 +294,7 @@ def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[
|
||||
envvar='OPENLLM_BACKEND',
|
||||
show_envvar=True,
|
||||
help='Runtime to use for both serialisation/inference engine.',
|
||||
**attrs,
|
||||
)(f)
|
||||
**attrs)(f)
|
||||
|
||||
def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_argument(
|
||||
@@ -329,15 +331,9 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
|
||||
'''
|
||||
+ (
|
||||
'''
|
||||
> [!NOTE] that this will set the mode for serving within deployment.'''
|
||||
if build
|
||||
else ''
|
||||
)
|
||||
+ '''
|
||||
> [!NOTE] that quantization are currently only available in *PyTorch* models.''',
|
||||
**attrs,
|
||||
)(f)
|
||||
|
||||
> [!NOTE] that this will set the mode for serving within deployment.''' if build else ''
|
||||
),
|
||||
**attrs)(f)
|
||||
|
||||
def workers_per_resource_option(
|
||||
f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any
|
||||
|
||||
@@ -46,15 +46,11 @@ from . import termui
|
||||
from ._factory import (
|
||||
FC,
|
||||
_AnyCallable,
|
||||
backend_option,
|
||||
dtype_option,
|
||||
machine_option,
|
||||
model_name_argument,
|
||||
model_version_option,
|
||||
parse_config_options,
|
||||
quantize_option,
|
||||
serialisation_option,
|
||||
start_decorator,
|
||||
optimization_decorator,
|
||||
)
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
@@ -590,13 +586,11 @@ class ImportModelOutput(t.TypedDict):
|
||||
metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]',
|
||||
help='Deprecated. Use positional argument instead.',
|
||||
)
|
||||
@model_version_option
|
||||
@backend_option
|
||||
@quantize_option
|
||||
@serialisation_option
|
||||
@optimization_decorator
|
||||
def import_command(
|
||||
model_id: str,
|
||||
deprecated_model_id: str | None,
|
||||
dtype: LiteralDtype,
|
||||
model_version: str | None,
|
||||
backend: LiteralBackend | None,
|
||||
quantize: LiteralQuantise | None,
|
||||
@@ -649,6 +643,7 @@ def import_command(
|
||||
model_version=model_version,
|
||||
quantize=quantize,
|
||||
backend=backend,
|
||||
dtype=dtype,
|
||||
serialisation=t.cast(
|
||||
LiteralSerialisation,
|
||||
first_not_none(
|
||||
@@ -712,8 +707,6 @@ class BuildBentoOutput(t.TypedDict):
|
||||
metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]',
|
||||
help='Deprecated. Use positional argument instead.',
|
||||
)
|
||||
@dtype_option
|
||||
@backend_option
|
||||
@click.option(
|
||||
'--bento-version',
|
||||
type=str,
|
||||
@@ -721,8 +714,6 @@ class BuildBentoOutput(t.TypedDict):
|
||||
help='Optional bento version for this BentoLLM. Default is the the model revision.',
|
||||
)
|
||||
@click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.')
|
||||
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Optimisation options') # type: ignore[misc]
|
||||
@quantize_option(factory=cog.optgroup, build=True)
|
||||
@click.option(
|
||||
'--enable-features',
|
||||
multiple=True,
|
||||
@@ -732,6 +723,7 @@ class BuildBentoOutput(t.TypedDict):
|
||||
', '.join(OPTIONAL_DEPENDENCIES)
|
||||
),
|
||||
)
|
||||
@optimization_decorator
|
||||
@click.option(
|
||||
'--adapter-id',
|
||||
default=None,
|
||||
@@ -740,14 +732,12 @@ class BuildBentoOutput(t.TypedDict):
|
||||
help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed.",
|
||||
)
|
||||
@click.option('--build-ctx', help='Build context. This is required if --adapter-id uses relative path', default=None)
|
||||
@model_version_option
|
||||
@click.option(
|
||||
'--dockerfile-template',
|
||||
default=None,
|
||||
type=click.File(),
|
||||
help='Optional custom dockerfile template to be used with this BentoLLM.',
|
||||
)
|
||||
@serialisation_option
|
||||
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options') # type: ignore[misc]
|
||||
@cog.optgroup.option(
|
||||
'--containerize',
|
||||
@@ -787,20 +777,20 @@ def build_command(
|
||||
deprecated_model_id: str | None,
|
||||
bento_version: str | None,
|
||||
overwrite: bool,
|
||||
quantize: LiteralQuantise | None,
|
||||
machine: bool,
|
||||
dtype: LiteralDtype,
|
||||
model_version: str | None,
|
||||
backend: LiteralBackend | None,
|
||||
quantize: LiteralQuantise | None,
|
||||
serialisation: LiteralSerialisation | None,
|
||||
machine: bool,
|
||||
enable_features: tuple[str, ...] | None,
|
||||
adapter_id: tuple[str, ...],
|
||||
build_ctx: str | None,
|
||||
backend: LiteralBackend | None,
|
||||
model_version: str | None,
|
||||
dockerfile_template: t.TextIO | None,
|
||||
max_model_len: int | None,
|
||||
gpu_memory_utilization:float,
|
||||
containerize: bool,
|
||||
push: bool,
|
||||
serialisation: LiteralSerialisation | None,
|
||||
force_push: bool,
|
||||
**_: t.Any,
|
||||
) -> BuildBentoOutput:
|
||||
|
||||
@@ -18,7 +18,7 @@ if t.TYPE_CHECKING:
|
||||
@click.command('list_models', context_settings=termui.CONTEXT_SETTINGS)
|
||||
@model_name_argument(required=False, shell_complete=model_complete_envvar)
|
||||
def cli(model_name: str | None) -> DictStrAny:
|
||||
'''This is equivalent to openllm models --show-available less the nice table.'''
|
||||
'''List available models in local store to be used with OpenLLM.'''
|
||||
models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
|
||||
ids_in_local_store = {
|
||||
k: [
|
||||
|
||||
Reference in New Issue
Block a user