feat: 1.2 APIs (#821)

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2026-04-21 07:29:41 -04:00 · 2024-03-15 03:49:19 -04:00
parent e3392476be
commit 072b3e97ec
116 changed files with 4451 additions and 6144 deletions
--- a/openllm-python/src/openllm_cli/_factory.py
+++ b/openllm-python/src/openllm_cli/_factory.py
@@ -5,20 +5,18 @@ from bentoml_cli.utils import BentoMLCommandGroup
 from click import shell_completion as sc

 from openllm_core._configuration import LLMConfig
-from openllm_core._typing_compat import Concatenate, DictStrAny, LiteralBackend, LiteralSerialisation, ParamSpec, AnyCallable, get_literal_args
+from openllm_core._typing_compat import (
+  Concatenate,
+  DictStrAny,
+  LiteralBackend,
+  LiteralSerialisation,
+  ParamSpec,
+  AnyCallable,
+  get_literal_args,
+)
 from openllm_core.utils import DEBUG, compose, dantic, resolve_user_filepath


-class _OpenLLM_GenericInternalConfig(LLMConfig):
-  __config__ = {'name_type': 'lowercase', 'default_id': 'openllm/generic', 'model_ids': ['openllm/generic'], 'architecture': 'PreTrainedModel'}
-
-  class GenerationConfig:
-    top_k: int = 15
-    top_p: float = 0.78
-    temperature: float = 0.75
-    max_new_tokens: int = 128
-
-
 logger = logging.getLogger(__name__)

 P = ParamSpec('P')
@@ -37,11 +35,20 @@ def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete


 def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
-  return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]
+  return [
+    sc.CompletionItem(inflection.dasherize(it), help='Model')
+    for it in openllm.CONFIG_MAPPING
+    if it.startswith(incomplete)
+  ]


 def parse_config_options(
-  config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny
+  config: LLMConfig,
+  server_timeout: int,
+  workers_per_resource: float,
+  device: t.Tuple[str, ...] | None,
+  cors: bool,
+  environ: DictStrAny,
 ) -> DictStrAny:
  # TODO: Support amd.com/gpu on k8s
  _bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
@@ -56,14 +63,21 @@ def parse_config_options(
  if device:
    if len(device) > 1:
      _bentoml_config_options_opts.extend([
-        f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)
+        f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
+        for idx, dev in enumerate(device)
      ])
    else:
-      _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
+      _bentoml_config_options_opts.append(
+        f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]'
+      )
  if cors:
-    _bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
    _bentoml_config_options_opts.extend([
-      f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])
+      'api_server.http.cors.enabled=true',
+      'api_server.http.cors.access_control_allow_origins="*"',
+    ])
+    _bentoml_config_options_opts.extend([
+      f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"'
+      for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])
    ])
  _bentoml_config_options_env += ' ' if _bentoml_config_options_env else '' + ' '.join(_bentoml_config_options_opts)
  environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
@@ -96,7 +110,7 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...
 def optimization_decorator(fn: FC, *, factory=click, _eager=True) -> FC | list[AnyCallable]:
  shared = [
    dtype_option(factory=factory),
-    model_version_option(factory=factory),  #
+    revision_option(factory=factory),  #
    backend_option(factory=factory),
    quantize_option(factory=factory),  #
    serialisation_option(factory=factory),
@@ -106,52 +120,6 @@ def optimization_decorator(fn: FC, *, factory=click, _eager=True) -> FC | list[A
  return compose(*shared)(fn)


-def start_decorator(fn: FC) -> FC:
-  composed = compose(
-    _OpenLLM_GenericInternalConfig.parse,
-    parse_serve_args(),
-    cog.optgroup.group(
-      'LLM Options',
-      help="""The following options are related to running LLM Server as well as optimization options.
-
-          OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
-
-          The following are either in our roadmap or currently being worked on:
-
-          - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
-          - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
-    """,
-    ),
-    cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
-    workers_per_resource_option(factory=cog.optgroup),
-    cors_option(factory=cog.optgroup),
-    *optimization_decorator(fn, factory=cog.optgroup, _eager=False),
-    cog.optgroup.option(
-      '--device',
-      type=dantic.CUDA,
-      multiple=True,
-      envvar='CUDA_VISIBLE_DEVICES',
-      callback=parse_device_callback,
-      help='Assign GPU devices (if available)',
-      show_envvar=True,
-    ),
-    adapter_id_option(factory=cog.optgroup),
-    click.option('--return-process', is_flag=True, default=False, help='Internal use only.', hidden=True),
-  )
-
-  return composed(fn)
-
-
-def parse_device_callback(_: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
-  if value is None:
-    return value
-  el: t.Tuple[str, ...] = tuple(i for k in value for i in k)
-  # NOTE: --device all is a special case
-  if len(el) == 1 and el[0] == 'all':
-    return tuple(map(str, openllm.utils.available_devices()))
-  return el
-
-
 # NOTE: A list of bentoml option that is not needed for parsing.
 # NOTE: User shouldn't set '--working-dir', as OpenLLM will setup this.
 # NOTE: production is also deprecated
@@ -161,13 +129,19 @@ _IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}
 def parse_serve_args() -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
  from bentoml_cli.cli import cli

-  group = cog.optgroup.group('Start a HTTP server options', help='Related to serving the model [synonymous to `bentoml serve-http`]')
+  group = cog.optgroup.group(
+    'Start a HTTP server options', help='Related to serving the model [synonymous to `bentoml serve-http`]'
+  )

  def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]:
    serve_command = cli.commands['serve']
    # The first variable is the argument bento
    # The last five is from BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS
-    serve_options = [p for p in serve_command.params[1 : -BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS] if p.name not in _IGNORED_OPTIONS]
+    serve_options = [
+      p
+      for p in serve_command.params[1 : -BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS]
+      if p.name not in _IGNORED_OPTIONS
+    ]
    for options in reversed(serve_options):
      attrs = options.to_info_dict()
      # we don't need param_type_name, since it should all be options
@@ -183,6 +157,48 @@ def parse_serve_args() -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[F
  return decorator


+def start_decorator(fn: FC) -> FC:
+  composed = compose(
+    cog.optgroup.group(
+      'LLM Options',
+      help="""The following options are related to running LLM Server as well as optimization options.
+
+          OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
+
+          The following are either in our roadmap or currently being worked on:
+
+          - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
+          - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
+    """,
+    ),
+    *optimization_decorator(fn, factory=cog.optgroup, _eager=False),
+    cog.optgroup.option(
+      '--device',
+      type=dantic.CUDA,
+      multiple=True,
+      envvar='CUDA_VISIBLE_DEVICES',
+      callback=parse_device_callback,
+      help='Assign GPU devices (if available)',
+      show_envvar=True,
+    ),
+    adapter_id_option(factory=cog.optgroup),
+  )
+
+  return composed(fn)
+
+
+def parse_device_callback(
+  _: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None
+) -> t.Tuple[str, ...] | None:
+  if value is None:
+    return value
+  el: t.Tuple[str, ...] = tuple(i for k in value for i in k)
+  # NOTE: --device all is a special case
+  if len(el) == 1 and el[0] == 'all':
+    return tuple(map(str, openllm.utils.available_devices()))
+  return el
+
+
 def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]:
  """General ``@click`` decorator with some sauce.

@@ -221,7 +237,13 @@ def adapter_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callab

 def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
-    '--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs
+    '--cors/--no-cors',
+    show_default=True,
+    default=False,
+    envvar='OPENLLM_CORS',
+    show_envvar=True,
+    help='Enable CORS for the server.',
+    **attrs,
  )(f)


@@ -235,7 +257,7 @@ def dtype_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[F
    type=str,
    envvar='TORCH_DTYPE',
    default='auto',
-    help="Optional dtype for casting tensors for running inference ['float16', 'float32', 'bfloat16', 'int8', 'int16']. For CTranslate2, it also accepts the following ['int8_float32', 'int8_float16', 'int8_bfloat16']",
+    help="Optional dtype for casting tensors for running inference ['float16', 'float32', 'bfloat16', 'int8', 'int16']",
    **attrs,
  )(f)

@@ -252,12 +274,14 @@ def model_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable
  )(f)


-def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
+def revision_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
+    '--revision',
    '--model-version',
+    'model_version',
    type=click.STRING,
    default=None,
-    help='Optional model version to save for this model. It will be inferred automatically from model-id.',
+    help='Optional model revision to save for this model. It will be inferred automatically from model-id.',
    **attrs,
  )(f)

@@ -275,7 +299,12 @@ def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[


 def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f)
+  return cli_argument(
+    'model_name',
+    type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]),
+    required=required,
+    **attrs,
+  )(f)


 def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
@@ -313,36 +342,6 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
  )(f)


-def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-    '--workers-per-resource',
-    default=None,
-    callback=workers_per_resource_callback,
-    type=str,
-    required=False,
-    help="""Number of workers per resource assigned.
-
-      See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
-      for more information. By default, this is set to 1.
-
-      > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
-
-      - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
-
-      - ``conserved``: This will determine the number of available GPU resources. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
-      """
-    + (
-      """\n
-      > [!NOTE] The workers value passed into 'build' will determine how the LLM can
-      > be provisioned in Kubernetes as well as in standalone container. This will
-      > ensure it has the same effect with 'openllm start --api-workers ...'"""
-      if build
-      else ''
-    ),
-    **attrs,
-  )(f)
-
-
 def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
    '--serialisation',
@@ -365,23 +364,3 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
      """,
    **attrs,
  )(f)
-
-
-_wpr_strategies = {'round_robin', 'conserved'}
-
-
-def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None:
-  if value is None:
-    return value
-  value = inflection.underscore(value)
-  if value in _wpr_strategies:
-    return value
-  else:
-    try:
-      float(value)  # type: ignore[arg-type]
-    except ValueError:
-      raise click.BadParameter(
-        f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param
-      ) from None
-    else:
-      return value
--- a/openllm-python/src/openllm_cli/_sdk.py
+++ b/openllm-python/src/openllm_cli/_sdk.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
-import itertools, logging, os, re, subprocess, sys, typing as t
+import itertools, logging, os, re, subprocess, sys, typing as t, bentoml, openllm_core, orjson
 from simple_di import Provide, inject
-import bentoml, openllm_core, orjson
 from bentoml._internal.configuration.containers import BentoMLContainer
 from openllm_core._typing_compat import LiteralSerialisation
 from openllm_core.exceptions import OpenLLMException
@@ -69,7 +68,10 @@ def _start(
  if timeout:
    args.extend(['--server-timeout', str(timeout)])
  if workers_per_resource:
-    args.extend(['--workers-per-resource', str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
+    args.extend([
+      '--workers-per-resource',
+      str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource,
+    ])
  if device and not os.environ.get('CUDA_VISIBLE_DEVICES'):
    args.extend(['--device', ','.join(device)])
  if quantize:
@@ -77,7 +79,11 @@ def _start(
  if cors:
    args.append('--cors')
  if adapter_map:
-    args.extend(list(itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])))
+    args.extend(
+      list(
+        itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])
+      )
+    )
  if additional_args:
    args.extend(additional_args)
  if __test__:
@@ -148,7 +154,9 @@ def _build(
    '--machine',
    '--quiet',
    '--serialisation',
-    first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'),
+    first_not_none(
+      serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
+    ),
  ]
  if quantize:
    args.extend(['--quantize', quantize])
@@ -265,4 +273,4 @@ start, build, import_model, list_models = (
  codegen.gen_sdk(_import_model),
  codegen.gen_sdk(_list_models),
 )
-__all__ = ['start', 'build', 'import_model', 'list_models']
+__all__ = ['build', 'import_model', 'list_models', 'start']
--- a/openllm-python/src/openllm_cli/entrypoint.py
+++ b/openllm-python/src/openllm_cli/entrypoint.py
--- a/openllm-python/src/openllm_cli/termui.py
+++ b/openllm-python/src/openllm_cli/termui.py
@@ -9,7 +9,7 @@ import click
 import inflection
 import orjson

-from openllm_core._typing_compat import DictStrAny
+from openllm_core._typing_compat import DictStrAny, TypedDict
 from openllm_core.utils import get_debug_mode

 logger = logging.getLogger('openllm')
@@ -25,7 +25,14 @@ class Level(enum.IntEnum):

  @property
  def color(self) -> str | None:
-    return {Level.NOTSET: None, Level.DEBUG: 'cyan', Level.INFO: 'green', Level.WARNING: 'yellow', Level.ERROR: 'red', Level.CRITICAL: 'red'}[self]
+    return {
+      Level.NOTSET: None,
+      Level.DEBUG: 'cyan',
+      Level.INFO: 'green',
+      Level.WARNING: 'yellow',
+      Level.ERROR: 'red',
+      Level.CRITICAL: 'red',
+    }[self]

  @classmethod
  def from_logging_level(cls, level: int) -> Level:
@@ -38,7 +45,7 @@ class Level(enum.IntEnum):
    }[level]


-class JsonLog(t.TypedDict):
+class JsonLog(TypedDict):
  log_level: Level
  content: str

@@ -75,5 +82,9 @@ def echo(text: t.Any, fg: str | None = None, *, _with_style: bool = True, json:


 COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
-CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore}
-__all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS', 'log', 'warning', 'error', 'critical', 'debug', 'info', 'Level']
+CONTEXT_SETTINGS: DictStrAny = {
+  'help_option_names': ['-h', '--help'],
+  'max_content_width': COLUMNS,
+  'token_normalize_func': inflection.underscore,
+}
+__all__ = ['COLUMNS', 'CONTEXT_SETTINGS', 'Level', 'critical', 'debug', 'echo', 'error', 'info', 'log', 'warning']