mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-02-19 15:18:12 -05:00
refactor(cli): move out to its own packages (#619)
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -14,6 +14,10 @@ import os as _os
|
||||
import pathlib as _pathlib
|
||||
import warnings as _warnings
|
||||
|
||||
import openllm_cli as _cli
|
||||
|
||||
from openllm_cli import _sdk
|
||||
|
||||
from . import utils as utils
|
||||
|
||||
|
||||
@@ -55,7 +59,6 @@ __lazy = utils.LazyModule(
|
||||
'_strategies': ['CascadingResourceStrategy', 'get_resource'],
|
||||
'entrypoints': ['mount_entrypoints'],
|
||||
'serialisation': ['ggml', 'transformers'],
|
||||
'cli._sdk': ['start', 'start_grpc', 'build', 'import_model', 'list_models'],
|
||||
'_quantisation': ['infer_quantisation_config'],
|
||||
'_llm': ['LLM', 'LLMRunner', 'LLMRunnable'],
|
||||
'_generation': [
|
||||
@@ -66,7 +69,15 @@ __lazy = utils.LazyModule(
|
||||
'prepare_logits_processor',
|
||||
],
|
||||
},
|
||||
extra_objects={'COMPILED': COMPILED},
|
||||
extra_objects={
|
||||
'COMPILED': COMPILED,
|
||||
'cli': _cli,
|
||||
'start': _sdk.start,
|
||||
'start_grpc': _sdk.start_grpc,
|
||||
'build': _sdk.build,
|
||||
'import_model': _sdk.import_model,
|
||||
'list_models': _sdk.list_models,
|
||||
},
|
||||
)
|
||||
__all__ = __lazy.__all__
|
||||
__dir__ = __lazy.__dir__
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import openllm_cli as _cli
|
||||
from openllm_core._configuration import GenerationConfig as GenerationConfig
|
||||
from openllm_core._configuration import LLMConfig as LLMConfig
|
||||
from openllm_core._configuration import SamplingParams as SamplingParams
|
||||
@@ -21,7 +22,6 @@ from openllm_core.config import StableLMConfig as StableLMConfig
|
||||
from openllm_core.config import StarCoderConfig as StarCoderConfig
|
||||
from . import exceptions as exceptions
|
||||
from . import bundle as bundle
|
||||
from . import cli as cli
|
||||
from . import client as client
|
||||
from . import playground as playground
|
||||
from . import serialisation as serialisation
|
||||
@@ -39,11 +39,11 @@ from ._llm import LLMRunner as LLMRunner
|
||||
from ._quantisation import infer_quantisation_config as infer_quantisation_config
|
||||
from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy
|
||||
from ._strategies import get_resource as get_resource
|
||||
from .cli._sdk import build as build
|
||||
from .cli._sdk import import_model as import_model
|
||||
from .cli._sdk import list_models as list_models
|
||||
from .cli._sdk import start as start
|
||||
from .cli._sdk import start_grpc as start_grpc
|
||||
from openllm_cli._sdk import build as build
|
||||
from openllm_cli._sdk import import_model as import_model
|
||||
from openllm_cli._sdk import list_models as list_models
|
||||
from openllm_cli._sdk import start as start
|
||||
from openllm_cli._sdk import start_grpc as start_grpc
|
||||
from .client import AsyncHTTPClient as AsyncHTTPClient
|
||||
from .client import HTTPClient as HTTPClient
|
||||
from .entrypoints import mount_entrypoints as mount_entrypoints
|
||||
@@ -51,4 +51,5 @@ from .protocol import openai as openai
|
||||
from .serialisation import ggml as ggml
|
||||
from .serialisation import transformers as transformers
|
||||
|
||||
cli = _cli
|
||||
COMPILED: bool = ...
|
||||
|
||||
@@ -8,6 +8,6 @@ To start any OpenLLM model:
|
||||
"""
|
||||
|
||||
if __name__ == '__main__':
|
||||
from openllm.cli.entrypoint import cli
|
||||
from openllm_cli.entrypoint import cli
|
||||
|
||||
cli()
|
||||
|
||||
@@ -130,7 +130,7 @@ def construct_docker_options(
|
||||
container_registry: LiteralContainerRegistry,
|
||||
container_version_strategy: LiteralContainerVersionStrategy,
|
||||
) -> DockerOptions:
|
||||
from openllm.cli._factory import parse_config_options
|
||||
from openllm_cli._factory import parse_config_options
|
||||
|
||||
environ = parse_config_options(llm.config, llm.config['timeout'], 1.0, None, True, os.environ.copy())
|
||||
env_dict = {
|
||||
|
||||
@@ -1,4 +0,0 @@
|
||||
"""OpenLLM CLI.
|
||||
|
||||
For more information see ``openllm -h``.
|
||||
"""
|
||||
@@ -1,501 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import functools
|
||||
import logging
|
||||
import os
|
||||
import typing as t
|
||||
|
||||
import click
|
||||
import click_option_group as cog
|
||||
import inflection
|
||||
|
||||
from bentoml_cli.utils import BentoMLCommandGroup
|
||||
from click import ClickException
|
||||
from click import shell_completion as sc
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
|
||||
from openllm_core._configuration import LLMConfig
|
||||
from openllm_core._typing_compat import Concatenate
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
from openllm_core._typing_compat import LiteralBackend
|
||||
from openllm_core._typing_compat import LiteralQuantise
|
||||
from openllm_core._typing_compat import LiteralSerialisation
|
||||
from openllm_core._typing_compat import ParamSpec
|
||||
from openllm_core._typing_compat import get_literal_args
|
||||
from openllm_core.utils import DEBUG
|
||||
|
||||
|
||||
class _OpenLLM_GenericInternalConfig(LLMConfig):
  """Internal placeholder config used to render the generic model CLI options.

  NOTE(review): assumes the ``LLMConfig`` base consumes ``__config__`` and the
  nested ``GenerationConfig`` attributes (it does for every real model config);
  confirm against ``openllm_core._configuration``.
  """

  # Minimal metadata so this behaves like a real model config for CLI purposes.
  __config__ = {
    'name_type': 'lowercase',
    'default_id': 'openllm/generic',
    'model_ids': ['openllm/generic'],
    'architecture': 'PreTrainedModel',
  }

  class GenerationConfig:
    # Default sampling knobs surfaced as ``openllm start`` options.
    top_k: int = 15
    top_p: float = 0.9
    temperature: float = 0.75
    max_new_tokens: int = 128
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
P = ParamSpec('P')
|
||||
LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
|
||||
|
||||
_AnyCallable = t.Callable[..., t.Any]
|
||||
FC = t.TypeVar('FC', bound=t.Union[_AnyCallable, click.Command])
|
||||
|
||||
|
||||
def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
  """Shell-completion callback suggesting Bento tags that start with *incomplete*.

  Only Bentos carrying both the ``start_name`` and ``bundler`` labels are offered.
  """
  suggestions: list[sc.CompletionItem] = []
  for bento in bentoml.list():
    tag = str(bento.tag)
    if not tag.startswith(incomplete):
      continue
    # Require both labels before suggesting the tag.
    if 'start_name' in bento.info.labels and 'bundler' in bento.info.labels:
      suggestions.append(sc.CompletionItem(tag, help='Bento'))
  return suggestions
|
||||
|
||||
|
||||
def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
  """Shell-completion callback suggesting dasherized model names.

  Matching is done against the raw (underscore) key of ``openllm.CONFIG_MAPPING``;
  the suggestion shown to the user is the dasherized form.
  """
  matching = (name for name in openllm.CONFIG_MAPPING if name.startswith(incomplete))
  return [sc.CompletionItem(inflection.dasherize(name), help='Model') for name in matching]
|
||||
|
||||
|
||||
def parse_config_options(
  config: LLMConfig,
  server_timeout: int,
  workers_per_resource: float,
  device: t.Tuple[str, ...] | None,
  cors: bool,
  environ: DictStrAny,
) -> DictStrAny:
  """Translate OpenLLM server settings into ``BENTOML_CONFIG_OPTIONS`` entries.

  Mutates and returns *environ*: any pre-existing ``BENTOML_CONFIG_OPTIONS``
  value is preserved and the generated options are appended after it.

  Args:
    config: The LLM config; only ``config['start_name']`` and ``config['timeout']`` are read.
    server_timeout: API-server traffic timeout in seconds.
    workers_per_resource: Workers-per-resource value for the LLM runner.
    device: Optional tuple of GPU device ids to pin the runner to.
    cors: Whether to enable CORS on the API server.
    environ: The environment mapping to update (typically ``os.environ.copy()``).

  Returns:
    The same *environ* mapping with ``BENTOML_CONFIG_OPTIONS`` set.
  """
  # TODO: Support amd.com/gpu on k8s
  _bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
  _bentoml_config_options_opts = [
    'tracing.sample_rate=1.0',
    'api_server.max_runner_connections=25',
    f'runners."llm-{config["start_name"]}-runner".batching.max_batch_size=128',
    f'api_server.traffic.timeout={server_timeout}',
    f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
    f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
  ]
  if device:
    if len(device) > 1:
      # Multiple devices: one indexed resource entry per device.
      _bentoml_config_options_opts.extend(
        [
          f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
          for idx, dev in enumerate(device)
        ]
      )
    else:
      _bentoml_config_options_opts.append(
        f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]'
      )
  if cors:
    _bentoml_config_options_opts.extend(
      ['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"']
    )
    _bentoml_config_options_opts.extend(
      [
        f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"'
        for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])
      ]
    )
  # BUGFIX: the previous form `env += ' ' if env else '' + ' '.join(opts)`
  # parses as `env += (' ' if env else ('' + joined))`, silently dropping every
  # generated option whenever BENTOML_CONFIG_OPTIONS was already set in the
  # environment. Parenthesize so both the separator and the options are appended.
  _bentoml_config_options_env += (' ' if _bentoml_config_options_env else '') + ' '.join(
    _bentoml_config_options_opts
  )
  environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
  if DEBUG:
    logger.debug('Setting BENTOML_CONFIG_OPTIONS=%s', _bentoml_config_options_env)
  return environ
|
||||
|
||||
|
||||
_adapter_mapping_key = 'adapter_map'
|
||||
|
||||
|
||||
def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...] | None) -> None:
  """Accumulate repeated ``--adapter-id`` values into ``ctx.params['adapter_map']``.

  Each value has the form ``adapter_id:adapter_name``. The id part may be a
  relative path, which is resolved against the current working directory.

  Raises:
    ClickException: when a value has no ``:adapter_name`` suffix.
  """
  if not value:
    return None
  adapter_map = ctx.params.setdefault(_adapter_mapping_key, {})
  for spec in value:
    adapter_id, *name_parts = spec.rsplit(':', maxsplit=1)
    # Best-effort path resolution (one level, relative to cwd); a miss is fine
    # since the id may be a remote adapter reference rather than a local path.
    try:
      adapter_id = openllm.utils.resolve_user_filepath(adapter_id, os.getcwd())
    except FileNotFoundError:
      pass
    if not name_parts:
      raise ClickException(f'Adapter name is required for {adapter_id}')
    adapter_map[adapter_id] = name_parts[0]
  return None
|
||||
|
||||
|
||||
def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
  """Compose every shared ``openllm start`` CLI option onto a command function.

  When ``serve_grpc`` is True the gRPC server option group is attached instead
  of the HTTP one; the rest of the composition is identical for both commands.
  """

  def wrapper(fn: FC) -> t.Callable[[FC], FC]:
    # NOTE(review): assumes openllm.utils.compose chains these decorators in the
    # listed sequence — confirm against openllm_core.utils. The ordering below
    # determines the grouping shown in `openllm start --help`.
    composed = openllm.utils.compose(
      # Generic per-model options derived from the placeholder config.
      _OpenLLM_GenericInternalConfig().to_click_options,
      # Server options differ between the HTTP and gRPC commands.
      _http_server_args if not serve_grpc else _grpc_server_args,
      cog.optgroup.group('General LLM Options', help='The following options are related to running LLM Server.'),
      model_version_option(factory=cog.optgroup),
      system_message_option(factory=cog.optgroup),
      prompt_template_file_option(factory=cog.optgroup),
      cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
      workers_per_resource_option(factory=cog.optgroup),
      cors_option(factory=cog.optgroup),
      backend_option(factory=cog.optgroup),
      cog.optgroup.group(
        'LLM Optimization Options',
        help="""Optimization related options.

            OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.

            The following are either in our roadmap or currently being worked on:

            - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
            - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
            """,
      ),
      quantize_option(factory=cog.optgroup),
      serialisation_option(factory=cog.optgroup),
      cog.optgroup.option(
        '--device',
        type=openllm.utils.dantic.CUDA,
        multiple=True,
        envvar='CUDA_VISIBLE_DEVICES',
        callback=parse_device_callback,
        help='Assign GPU devices (if available)',
        show_envvar=True,
      ),
      cog.optgroup.group(
        'Fine-tuning related options',
        help="""\
            Note that the argument `--adapter-id` can accept the following format:

            - `--adapter-id /path/to/adapter` (local adapter)

            - `--adapter-id remote/adapter` (remote adapter from HuggingFace Hub)

            - `--adapter-id remote/adapter:eng_lora` (two previous adapter options with the given adapter_name)

            ```bash

            $ openllm start opt --adapter-id /path/to/adapter_dir --adapter-id remote/adapter:eng_lora

            ```
            """,
      ),
      cog.optgroup.option(
        '--adapter-id',
        default=None,
        help='Optional name or path for given LoRA adapter',
        multiple=True,
        callback=_id_callback,
        metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]',
      ),
      # Hidden escape hatch used by the Python API (`_start(__test__=True)`).
      click.option('--return-process', is_flag=True, default=False, help='Internal use only.', hidden=True),
    )
    return composed(fn)

  return wrapper
|
||||
|
||||
|
||||
def parse_device_callback(
  ctx: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None
) -> t.Tuple[str, ...] | None:
  """Flatten click's nested ``--device`` tuples into a flat tuple of device ids.

  ``--device all`` (alone) expands to every device currently visible on the host.
  """
  if value is None:
    return None
  if not isinstance(value, tuple):
    ctx.fail(f'{param} only accept multiple values, not {type(value)} (value: {value})')
  devices: t.Tuple[str, ...] = tuple(item for group in value for item in group)
  # '--device all' is a special case meaning "every available device".
  if devices == ('all',):
    return tuple(str(dev) for dev in openllm.utils.available_devices())
  return devices
|
||||
|
||||
|
||||
# NOTE: A list of bentoml option that is not needed for parsing.
|
||||
# NOTE: User shouldn't set '--working-dir', as OpenLLM will setup this.
|
||||
# NOTE: production is also deprecated
|
||||
_IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}
|
||||
|
||||
|
||||
def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
  """Re-export ``bentoml serve|serve-grpc`` click options onto ``openllm start``.

  Copies every option of the underlying BentoML serve command (minus ignored
  and common ones) into a dedicated option group, so ``openllm start`` accepts
  the same server flags as ``bentoml serve``.
  """
  from bentoml_cli.cli import cli

  command = 'serve' if not serve_grpc else 'serve-grpc'
  group = cog.optgroup.group(
    f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options",
    help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",
  )

  def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]:
    serve_command = cli.commands[command]
    # Skip the first param (the positional `bento` argument) and the trailing
    # BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS common options, plus any
    # option OpenLLM manages itself (_IGNORED_OPTIONS).
    serve_options = [
      p
      for p in serve_command.params[1 : -BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS]
      if p.name not in _IGNORED_OPTIONS
    ]
    # reversed() so the re-applied decorators preserve the original option order.
    for options in reversed(serve_options):
      attrs = options.to_info_dict()
      # param_type_name is not needed — everything here should be an option.
      attrs.pop('param_type_name')
      # 'name' is not a valid keyword for cog.optgroup.option.
      attrs.pop('name')
      # The type can be inferred from the default value.
      attrs.pop('type')
      param_decls = (*attrs.pop('opts'), *attrs.pop('secondary_opts'))
      f = cog.optgroup.option(*param_decls, **attrs)(f)
    return group(f)

  return decorator
|
||||
|
||||
|
||||
_http_server_args, _grpc_server_args = parse_serve_args(False), parse_serve_args(True)
|
||||
|
||||
|
||||
def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]:
  """Type-safe wrapper over ``click``-style option/argument factories.

  The ``factory`` keyword selects the object providing the decorator (``click``
  by default, e.g. ``cog.optgroup``) and ``attr`` selects which of its
  attributes to call (``'option'`` or ``'argument'``).
  """
  factory = attrs.pop('factory', click)
  factory_attr = attrs.pop('attr', 'option')
  # Options (but not arguments) get a generic help text unless one was supplied.
  if factory_attr != 'argument':
    attrs.setdefault('help', 'General option for OpenLLM CLI.')

  def decorator(func: FC | None) -> FC:
    callback = getattr(factory, factory_attr, None)
    if callback is None:
      raise ValueError(f'Factory {factory} has no attribute {factory_attr}.')
    made = callback(*param_decls, **attrs)
    # Support both immediate application (func given) and deferred decoration.
    return t.cast(FC, made(func) if func is not None else made)

  return decorator
|
||||
|
||||
|
||||
cli_option = functools.partial(_click_factory_type, attr='option')
|
||||
cli_argument = functools.partial(_click_factory_type, attr='argument')
|
||||
|
||||
|
||||
def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  """Attach the ``--cors/--no-cors`` flag (env ``OPENLLM_CORS``; off by default)."""
  flag = '--cors/--no-cors'
  return cli_option(
    flag,
    default=False,
    show_default=True,
    show_envvar=True,
    envvar='OPENLLM_CORS',
    help='Enable CORS for the server.',
    **attrs,
  )(f)
|
||||
|
||||
|
||||
def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f)
|
||||
|
||||
|
||||
def model_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option(
|
||||
'--model-id',
|
||||
type=click.STRING,
|
||||
default=None,
|
||||
envvar='OPENLLM_MODEL_ID',
|
||||
show_envvar=True,
|
||||
help='Optional model_id name or path for (fine-tune) weight.',
|
||||
**attrs,
|
||||
)(f)
|
||||
|
||||
|
||||
def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option(
|
||||
'--model-version',
|
||||
type=click.STRING,
|
||||
default=None,
|
||||
help='Optional model version to save for this model. It will be inferred automatically from model-id.',
|
||||
**attrs,
|
||||
)(f)
|
||||
|
||||
|
||||
def system_message_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option(
|
||||
'--system-message',
|
||||
type=click.STRING,
|
||||
default=None,
|
||||
envvar='OPENLLM_SYSTEM_MESSAGE',
|
||||
help='Optional system message for supported LLMs. If given LLM supports system message, OpenLLM will provide a default system message.',
|
||||
**attrs,
|
||||
)(f)
|
||||
|
||||
|
||||
def prompt_template_file_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option(
|
||||
'--prompt-template-file',
|
||||
type=click.File(),
|
||||
default=None,
|
||||
help='Optional file path containing user-defined custom prompt template. By default, the prompt template for the specified LLM will be used.',
|
||||
**attrs,
|
||||
)(f)
|
||||
|
||||
|
||||
def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
# NOTE: LiteralBackend needs to remove the last two item as ggml and mlc is wip
|
||||
# XXX: remove the check for __args__ once we have ggml and mlc supports
|
||||
return cli_option(
|
||||
'--backend',
|
||||
type=click.Choice(get_literal_args(LiteralBackend)[:2]),
|
||||
default=None,
|
||||
envvar='OPENLLM_BACKEND',
|
||||
show_envvar=True,
|
||||
help='The implementation for saving this LLM.',
|
||||
**attrs,
|
||||
)(f)
|
||||
|
||||
|
||||
def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_argument(
|
||||
'model_name',
|
||||
type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]),
|
||||
required=required,
|
||||
**attrs,
|
||||
)(f)
|
||||
|
||||
|
||||
def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option(
|
||||
'--quantise',
|
||||
'--quantize',
|
||||
'quantize',
|
||||
type=click.Choice(get_literal_args(LiteralQuantise)),
|
||||
default=None,
|
||||
envvar='OPENLLM_QUANTIZE',
|
||||
show_envvar=True,
|
||||
help="""Dynamic quantization for running this LLM.
|
||||
|
||||
The following quantization strategies are supported:
|
||||
|
||||
- ``int8``: ``LLM.int8`` for [8-bit](https://arxiv.org/abs/2208.07339) quantization.
|
||||
|
||||
- ``int4``: ``SpQR`` for [4-bit](https://arxiv.org/abs/2306.03078) quantization.
|
||||
|
||||
- ``gptq``: ``GPTQ`` [quantization](https://arxiv.org/abs/2210.17323)
|
||||
|
||||
> [!NOTE] that the model can also be served with quantized weights.
|
||||
"""
|
||||
+ (
|
||||
"""
|
||||
> [!NOTE] that this will set the mode for serving within deployment."""
|
||||
if build
|
||||
else ''
|
||||
)
|
||||
+ """
|
||||
> [!NOTE] that quantization are currently only available in *PyTorch* models.""",
|
||||
**attrs,
|
||||
)(f)
|
||||
|
||||
|
||||
def workers_per_resource_option(
|
||||
f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any
|
||||
) -> t.Callable[[FC], FC]:
|
||||
return cli_option(
|
||||
'--workers-per-resource',
|
||||
default=None,
|
||||
callback=workers_per_resource_callback,
|
||||
type=str,
|
||||
required=False,
|
||||
help="""Number of workers per resource assigned.
|
||||
|
||||
See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
|
||||
for more information. By default, this is set to 1.
|
||||
|
||||
> [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
|
||||
|
||||
- ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
|
||||
|
||||
- ``conserved``: This will determine the number of available GPU resources, and only assign one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
|
||||
"""
|
||||
+ (
|
||||
"""\n
|
||||
> [!NOTE] The workers value passed into 'build' will determine how the LLM can
|
||||
> be provisioned in Kubernetes as well as in standalone container. This will
|
||||
> ensure it has the same effect with 'openllm start --api-workers ...'"""
|
||||
if build
|
||||
else ''
|
||||
),
|
||||
**attrs,
|
||||
)(f)
|
||||
|
||||
|
||||
def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option(
|
||||
'--serialisation',
|
||||
'--serialization',
|
||||
'serialisation',
|
||||
type=click.Choice(get_literal_args(LiteralSerialisation)),
|
||||
default=None,
|
||||
show_default=True,
|
||||
show_envvar=True,
|
||||
envvar='OPENLLM_SERIALIZATION',
|
||||
help="""Serialisation format for save/load LLM.
|
||||
|
||||
Currently the following strategies are supported:
|
||||
|
||||
- ``safetensors``: This will use safetensors format, which is synonymous to ``safe_serialization=True``.
|
||||
|
||||
> [!NOTE] Safetensors might not work for every cases, and you can always fallback to ``legacy`` if needed.
|
||||
|
||||
- ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files. This should be used if the model doesn't yet support safetensors.
|
||||
""",
|
||||
**attrs,
|
||||
)(f)
|
||||
|
||||
|
||||
def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option(
|
||||
'--container-registry',
|
||||
'container_registry',
|
||||
type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
|
||||
default='ecr',
|
||||
show_default=True,
|
||||
show_envvar=True,
|
||||
envvar='OPENLLM_CONTAINER_REGISTRY',
|
||||
callback=container_registry_callback,
|
||||
help='The default container registry to get the base image for building BentoLLM. Currently, it supports ecr, ghcr, docker',
|
||||
**attrs,
|
||||
)(f)
|
||||
|
||||
|
||||
_wpr_strategies = {'round_robin', 'conserved'}
|
||||
|
||||
|
||||
def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None:
  """Validate ``--workers-per-resource``: a known strategy name or a float literal.

  Returns the underscored value when valid; raises BadParameter otherwise.
  """
  if value is None:
    return None
  normalised = inflection.underscore(value)
  if normalised in _wpr_strategies:
    return normalised
  # Not a named strategy — it must parse as a float.
  try:
    float(normalised)  # type: ignore[arg-type]
  except ValueError:
    raise click.BadParameter(
      f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.",
      ctx,
      param,
    ) from None
  return normalised
|
||||
|
||||
|
||||
def container_registry_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None:
  """Ensure ``--container-registry`` is one of the registries OpenLLM supports."""
  if value is None:
    return None
  supported = openllm.bundle.supported_registries
  if value not in supported:
    raise click.BadParameter(f'Value must be one of {supported}', ctx, param)
  return value
|
||||
@@ -1,330 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import itertools
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import typing as t
|
||||
|
||||
import orjson
|
||||
|
||||
from simple_di import Provide
|
||||
from simple_di import inject
|
||||
|
||||
import bentoml
|
||||
import openllm_core
|
||||
|
||||
from bentoml._internal.configuration.containers import BentoMLContainer
|
||||
from openllm_core._typing_compat import LiteralSerialisation
|
||||
from openllm_core.exceptions import OpenLLMException
|
||||
from openllm_core.utils import WARNING_ENV_VAR
|
||||
from openllm_core.utils import codegen
|
||||
from openllm_core.utils import first_not_none
|
||||
from openllm_core.utils import get_disable_warnings
|
||||
from openllm_core.utils import is_vllm_available
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from bentoml._internal.bento import BentoStore
|
||||
from openllm_core._configuration import LLMConfig
|
||||
from openllm_core._typing_compat import LiteralBackend
|
||||
from openllm_core._typing_compat import LiteralContainerRegistry
|
||||
from openllm_core._typing_compat import LiteralContainerVersionStrategy
|
||||
from openllm_core._typing_compat import LiteralQuantise
|
||||
from openllm_core._typing_compat import LiteralString
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _start(
  model_id: str,
  timeout: int = 30,
  workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
  device: tuple[str, ...] | t.Literal['all'] | None = None,
  quantize: LiteralQuantise | None = None,
  system_message: str | None = None,
  prompt_template_file: str | None = None,
  adapter_map: dict[LiteralString, str | None] | None = None,
  backend: LiteralBackend | None = None,
  additional_args: list[str] | None = None,
  cors: bool = False,
  _serve_grpc: bool = False,
  __test__: bool = False,
  **_: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
  """Python API to start a LLM server. This provides a one-to-one mapping to CLI arguments.

  For any additional argument, pass it as a string via ``additional_args``. For example, to
  pass ``--port 5001``, use ``additional_args=["--port", "5001"]``.

  > [!NOTE] This will create a blocking process, so if you use this API, you can create a running sub thread
  > to start the server instead of blocking the main thread.

  ``openllm.start`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as the CLI interaction.

  Args:
    model_id: The model id to start this LLMServer
    timeout: The server timeout
    system_message: Optional system message for supported LLMs. If given LLM supports system message, OpenLLM will provide a default system message.
    prompt_template_file: Optional file path containing user-defined custom prompt template. By default, the prompt template for the specified LLM will be used.
    workers_per_resource: Number of workers per resource assigned.
      See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
      for more information. By default, this is set to 1.

      > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
      > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
      > - ``conserved``: This will determine the number of available GPU resources, and only assign
      > one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
      > equivalent to ``--workers-per-resource 0.25``.
    device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. It also accepts 'all'
      argument to assign all available GPUs to this LLM.
    quantize: Quantize the model weights. This is only applicable for PyTorch models.
      Possible quantisation strategies:
      - int8: Quantize the model with 8bit (bitsandbytes required)
      - int4: Quantize the model with 4bit (bitsandbytes required)
      - gptq: Quantize the model with GPTQ (auto-gptq required)
    cors: Whether to enable CORS for this LLM. By default, this is set to ``False``.
    adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
    backend: The backend to use for this LLM. By default, this is set to ``pt``.
    additional_args: Additional arguments to pass to ``openllm start``.
  """
  from .entrypoint import start_command
  from .entrypoint import start_grpc_command

  # Backend resolution: an explicit `backend` argument wins, otherwise prefer
  # vLLM when installed, falling back to PyTorch.
  os.environ['OPENLLM_BACKEND'] = openllm_core.utils.first_not_none(
    backend, default='vllm' if is_vllm_available() else 'pt'
  )

  # Translate each Python argument into its equivalent CLI flag; falsy values
  # (None, 0, '', {}) are simply omitted so the command's own defaults apply.
  args: list[str] = [model_id]
  if system_message:
    args.extend(['--system-message', system_message])
  if prompt_template_file:
    args.extend(['--prompt-template-file', openllm_core.utils.resolve_filepath(prompt_template_file)])
  if timeout:
    args.extend(['--server-timeout', str(timeout)])
  if workers_per_resource:
    args.extend(
      [
        '--workers-per-resource',
        str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource,
      ]
    )
  # An explicit CUDA_VISIBLE_DEVICES in the environment takes precedence over `device`.
  if device and not os.environ.get('CUDA_VISIBLE_DEVICES'):
    args.extend(['--device', ','.join(device)])
  if quantize:
    args.extend(['--quantize', str(quantize)])
  if cors:
    args.append('--cors')
  if adapter_map:
    # Each entry becomes `--adapter-id id[:name]`; a None name drops the suffix.
    args.extend(
      list(
        itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])
      )
    )
  if additional_args:
    args.extend(additional_args)
  if __test__:
    # Hidden flag: makes the command return the server subprocess instead of blocking.
    args.append('--return-process')

  cmd = start_command if not _serve_grpc else start_grpc_command
  # standalone_mode=False lets click return the command's value instead of calling sys.exit.
  return cmd.main(args=args, standalone_mode=False)
|
||||
|
||||
|
||||
@inject
|
||||
def _build(
|
||||
model_id: str,
|
||||
model_version: str | None = None,
|
||||
bento_version: str | None = None,
|
||||
quantize: LiteralQuantise | None = None,
|
||||
adapter_map: dict[str, str | None] | None = None,
|
||||
system_message: str | None = None,
|
||||
prompt_template_file: str | None = None,
|
||||
build_ctx: str | None = None,
|
||||
enable_features: tuple[str, ...] | None = None,
|
||||
dockerfile_template: str | None = None,
|
||||
overwrite: bool = False,
|
||||
container_registry: LiteralContainerRegistry | None = None,
|
||||
container_version_strategy: LiteralContainerVersionStrategy | None = None,
|
||||
push: bool = False,
|
||||
force_push: bool = False,
|
||||
containerize: bool = False,
|
||||
serialisation: LiteralSerialisation | None = None,
|
||||
additional_args: list[str] | None = None,
|
||||
bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
|
||||
) -> bentoml.Bento:
|
||||
"""Package a LLM into a BentoLLM.
|
||||
|
||||
The LLM will be built into a BentoService with the following structure:
|
||||
if ``quantize`` is passed, it will instruct the model to be quantized dynamically during serving time.
|
||||
|
||||
``openllm.build`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as ``openllm build`` CLI.
|
||||
|
||||
Args:
|
||||
model_id: The model id to build this BentoLLM
|
||||
model_version: Optional model version for this given LLM
|
||||
bento_version: Optional bento veresion for this given BentoLLM
|
||||
system_message: Optional system message for supported LLMs. If given LLM supports system message, OpenLLM will provide a default system message.
|
||||
prompt_template_file: Optional file path containing user-defined custom prompt template. By default, the prompt template for the specified LLM will be used..
|
||||
quantize: Quantize the model weights. This is only applicable for PyTorch models.
|
||||
Possible quantisation strategies:
|
||||
- int8: Quantize the model with 8bit (bitsandbytes required)
|
||||
- int4: Quantize the model with 4bit (bitsandbytes required)
|
||||
- gptq: Quantize the model with GPTQ (auto-gptq required)
|
||||
adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
|
||||
build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory.
|
||||
enable_features: Additional OpenLLM features to be included with this BentoLLM.
|
||||
dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template.
|
||||
overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``.
|
||||
push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.
|
||||
containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
|
||||
Note that 'containerize' and 'push' are mutually exclusive
|
||||
container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
|
||||
container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
|
||||
serialisation: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
|
||||
additional_args: Additional arguments to pass to ``openllm build``.
|
||||
bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.
|
||||
|
||||
Returns:
|
||||
``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
|
||||
"""
|
||||
from ..serialisation.transformers.weights import has_safetensors_weights
|
||||
|
||||
args: list[str] = [
|
||||
sys.executable,
|
||||
'-m',
|
||||
'openllm',
|
||||
'build',
|
||||
model_id,
|
||||
'--machine',
|
||||
'--serialisation',
|
||||
first_not_none(
|
||||
serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
|
||||
),
|
||||
]
|
||||
if quantize:
|
||||
args.extend(['--quantize', quantize])
|
||||
if containerize and push:
|
||||
raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
|
||||
if push:
|
||||
args.extend(['--push'])
|
||||
if containerize:
|
||||
args.extend(['--containerize'])
|
||||
if build_ctx:
|
||||
args.extend(['--build-ctx', build_ctx])
|
||||
if enable_features:
|
||||
args.extend([f'--enable-features={f}' for f in enable_features])
|
||||
if overwrite:
|
||||
args.append('--overwrite')
|
||||
if system_message:
|
||||
args.extend(['--system-message', system_message])
|
||||
if prompt_template_file:
|
||||
args.extend(['--prompt-template-file', openllm_core.utils.resolve_filepath(prompt_template_file)])
|
||||
if adapter_map:
|
||||
args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()])
|
||||
if model_version:
|
||||
args.extend(['--model-version', model_version])
|
||||
if bento_version:
|
||||
args.extend(['--bento-version', bento_version])
|
||||
if dockerfile_template:
|
||||
args.extend(['--dockerfile-template', dockerfile_template])
|
||||
if container_registry is None:
|
||||
container_registry = 'ecr'
|
||||
if container_version_strategy is None:
|
||||
container_version_strategy = 'release'
|
||||
args.extend(['--container-registry', container_registry, '--container-version-strategy', container_version_strategy])
|
||||
if additional_args:
|
||||
args.extend(additional_args)
|
||||
if force_push:
|
||||
args.append('--force-push')
|
||||
|
||||
current_disable_warning = get_disable_warnings()
|
||||
os.environ[WARNING_ENV_VAR] = str(True)
|
||||
try:
|
||||
output = subprocess.check_output(args, env=os.environ.copy(), cwd=build_ctx or os.getcwd())
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error("Exception caught while building Bento for '%s'", model_id, exc_info=e)
|
||||
if e.stderr:
|
||||
raise OpenLLMException(e.stderr.decode('utf-8')) from None
|
||||
raise OpenLLMException(str(e)) from None
|
||||
matched = re.match(r'__object__:(\{.*\})$', output.decode('utf-8').strip())
|
||||
if matched is None:
|
||||
raise ValueError(
|
||||
f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub."
|
||||
)
|
||||
os.environ[WARNING_ENV_VAR] = str(current_disable_warning)
|
||||
try:
|
||||
result = orjson.loads(matched.group(1))
|
||||
except orjson.JSONDecodeError as e:
|
||||
raise ValueError(
|
||||
f"Failed to decode JSON from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub."
|
||||
) from e
|
||||
return bentoml.get(result['tag'], _bento_store=bento_store)
|
||||
|
||||
|
||||
def _import_model(
  model_id: str,
  model_version: str | None = None,
  backend: LiteralBackend | None = None,
  quantize: LiteralQuantise | None = None,
  serialisation: LiteralSerialisation | None = None,
  additional_args: t.Sequence[str] | None = None,
) -> dict[str, t.Any]:
  """Import a LLM into the local store.

  Invokes the ``openllm import`` ``click.Command`` under the hood, so it behaves
  exactly the same as the CLI ``openllm import``.

  > [!NOTE]
  > If ``quantize`` is passed, the model weights will be saved as quantized weights. You should
  > only use this option if you want the weight to be quantized by default. Note that OpenLLM also
  > support on-demand quantisation during initial startup.

  > [!NOTE]
  > ``openllm.start`` will automatically invoke ``openllm.import_model`` under the hood.

  Args:
    model_id: required model id for this given LLM
    model_version: Optional model version for this given LLM
    backend: The backend to use for this LLM. By default, this is set to ``pt``.
    quantize: Quantize the model weights. Possible strategies: 'int8'/'int4' (bitsandbytes required) or 'gptq' (auto-gptq required).
    serialisation: Type of model format to save to local store. 'safetensors' saves via safetensors; default behaviour is similar to ``safe_serialization=False``.
    additional_args: Additional arguments to pass to ``openllm import``.

  Returns:
    ``bentoml.Model``: BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
  """
  from .entrypoint import import_command

  cli_args: list[str] = [model_id, '--quiet']
  # Map each optional keyword onto its CLI flag; None means "use the CLI default".
  for flag, value in (
    ('--backend', backend),
    ('--model-version', None if model_version is None else str(model_version)),
    ('--quantize', quantize),
    ('--serialisation', serialisation),
  ):
    if value is not None:
      cli_args.extend([flag, value])
  if additional_args is not None:
    cli_args.extend(additional_args)
  return import_command.main(args=cli_args, standalone_mode=False)
|
||||
|
||||
|
||||
def _list_models() -> dict[str, t.Any]:
  """Return every model available in the local store.

  Equivalent to running ``openllm models --show-available --quiet``.
  """
  from .entrypoint import models_command

  cli_args = ['--show-available', '--quiet']
  return models_command.main(args=cli_args, standalone_mode=False)
|
||||
|
||||
|
||||
# Public SDK entry points: wrap each private implementation with the generated
# SDK shim so callers can use them as plain functions.
start = codegen.gen_sdk(_start, _serve_grpc=False)
start_grpc = codegen.gen_sdk(_start, _serve_grpc=True)
build = codegen.gen_sdk(_build)
import_model = codegen.gen_sdk(_import_model)
list_models = codegen.gen_sdk(_list_models)

__all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models']
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,16 +0,0 @@
|
||||
"""OpenLLM CLI Extension.
|
||||
|
||||
The following directory contains all possible extensions for OpenLLM CLI
|
||||
For adding new extension, just simply name that ext to `<name_ext>.py` and define
|
||||
a ``click.command()`` with the following format:
|
||||
|
||||
```python
|
||||
import click
|
||||
|
||||
@click.command(<name_ext>)
|
||||
...
|
||||
def cli(...): # <- this is important: the command must always be named ``cli`` so that the extension resolver knows how to import this extension.
|
||||
```
|
||||
|
||||
NOTE: Make sure to keep this file blank such that it won't mess with the import order.
|
||||
"""
|
||||
@@ -1,52 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import click
|
||||
import orjson
|
||||
|
||||
import openllm
|
||||
|
||||
from openllm.cli import termui
|
||||
from openllm.cli._factory import container_registry_option
|
||||
from openllm.cli._factory import machine_option
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from openllm_core._typing_compat import LiteralContainerRegistry
|
||||
from openllm_core._typing_compat import LiteralContainerVersionStrategy
|
||||
|
||||
|
||||
@click.command(
  'build_base_container',
  context_settings=termui.CONTEXT_SETTINGS,
  help="""Base image builder for BentoLLM.

By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.

\b
If '--machine' is passed, then it will run the process quietly, and output a JSON to the current running terminal.
This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels.

Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
""",
)
@container_registry_option
@click.option(
  '--version-strategy',
  type=click.Choice(['release', 'latest', 'nightly']),
  default='nightly',
  help='Version strategy to use for tagging the image.',
)
@click.option('--push/--no-push', help='Whether to push to remote repository', is_flag=True, default=False)
@machine_option
def cli(
  container_registry: tuple[LiteralContainerRegistry, ...] | None,
  version_strategy: LiteralContainerVersionStrategy,
  push: bool,
  machine: bool,
) -> dict[str, str]:
  """Build (and optionally push) the OpenLLM base container image.

  Returns the mapping produced by ``openllm.bundle.build_container``
  (presumably registry name -> image tag — TODO confirm against bundle).
  """
  # The heavy lifting (docker build/push) is delegated to openllm.bundle.
  mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine)
  # '--machine' means quiet/scriptable mode: emit the mapping as JSON on stdout.
  if machine:
    termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
  return mapping
|
||||
@@ -1,48 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import shutil
|
||||
import subprocess
|
||||
import typing as t
|
||||
|
||||
import click
|
||||
import psutil
|
||||
|
||||
from simple_di import Provide
|
||||
from simple_di import inject
|
||||
|
||||
import bentoml
|
||||
|
||||
from bentoml._internal.configuration.containers import BentoMLContainer
|
||||
from openllm.cli import termui
|
||||
from openllm.cli._factory import bento_complete_envvar
|
||||
from openllm.cli._factory import machine_option
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from bentoml._internal.bento import BentoStore
|
||||
|
||||
|
||||
@click.command('dive_bentos', context_settings=termui.CONTEXT_SETTINGS)
@click.argument('bento', type=str, shell_complete=bento_complete_envvar)
@machine_option
@click.pass_context
@inject
def cli(
  ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
) -> str | None:
  """Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path)."""
  try:
    bentomodel = _bento_store.get(bento)
  except bentoml.exceptions.NotFound:
    # ctx.fail raises a UsageError, so execution never falls through here.
    ctx.fail(f'Bento {bento} not found. Make sure to call `openllm build` first.')
  # Only bentos built by OpenLLM carry the 'bundler' label set to 'openllm.bundle'.
  if 'bundler' not in bentomodel.info.labels or bentomodel.info.labels['bundler'] != 'openllm.bundle':
    ctx.fail(
      f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness."
    )
  # '--machine' mode: return the path for programmatic consumption instead of listing it.
  if machine:
    return bentomodel.path
  # copy and paste this into a new shell
  if psutil.WINDOWS:
    subprocess.check_call([shutil.which('dir') or 'dir'], cwd=bentomodel.path)
  else:
    subprocess.check_call([shutil.which('ls') or 'ls', '-Rrthla'], cwd=bentomodel.path)
  ctx.exit(0)
|
||||
@@ -1,57 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import click
|
||||
|
||||
from simple_di import Provide
|
||||
from simple_di import inject
|
||||
|
||||
import bentoml
|
||||
|
||||
from bentoml._internal.bento.bento import BentoInfo
|
||||
from bentoml._internal.bento.build_config import DockerOptions
|
||||
from bentoml._internal.configuration.containers import BentoMLContainer
|
||||
from bentoml._internal.container.generate import generate_containerfile
|
||||
from openllm.cli import termui
|
||||
from openllm.cli._factory import bento_complete_envvar
|
||||
from openllm_core.utils import converter
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from bentoml._internal.bento import BentoStore
|
||||
|
||||
|
||||
@click.command(
  'get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.'
)
@click.argument('bento', type=str, shell_complete=bento_complete_envvar)
@click.pass_context
@inject
def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str:
  """Print the reconstructed Containerfile for *bento* and return the bento's path."""
  try:
    bentomodel = _bento_store.get(bento)
  except bentoml.exceptions.NotFound:
    # ctx.fail raises, so bentomodel is always bound past this point.
    ctx.fail(f'Bento {bento} not found. Make sure to call `openllm build` first.')
  # The logic below are similar to bentoml._internal.container.construct_containerfile
  with open(bentomodel.path_of('bento.yaml'), 'r') as f:
    options = BentoInfo.from_yaml_file(f)
  # NOTE: dockerfile_template is already included in the
  # Dockerfile inside bento, and it is not relevant to
  # construct_containerfile. Hence it is safe to set it to None here.
  # See https://github.com/bentoml/BentoML/issues/3399.
  docker_attrs = converter.unstructure(options.docker)
  # NOTE: if users specify a dockerfile_template, we will
  # save it to /env/docker/Dockerfile.template. This is necessary
  # for the reconstruction of the Dockerfile.
  if 'dockerfile_template' in docker_attrs and docker_attrs['dockerfile_template'] is not None:
    docker_attrs['dockerfile_template'] = 'env/docker/Dockerfile.template'
  doc = generate_containerfile(
    docker=DockerOptions(**docker_attrs),
    build_ctx=bentomodel.path,
    conda=options.conda,
    bento_fs=bentomodel._fs,
    enable_buildkit=True,
    add_header=True,
  )
  termui.echo(doc, fg='white')
  return bentomodel.path
|
||||
@@ -1,82 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import traceback
|
||||
import typing as t
|
||||
|
||||
import click
|
||||
import inflection
|
||||
import orjson
|
||||
|
||||
from bentoml_cli.utils import opt_callback
|
||||
|
||||
import openllm
|
||||
import openllm_core
|
||||
|
||||
from openllm.cli import termui
|
||||
from openllm.cli._factory import model_complete_envvar
|
||||
from openllm_core.prompts import process_prompt
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS)
@click.argument(
  'model_name',
  type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]),
  shell_complete=model_complete_envvar,
)
@click.argument('prompt', type=click.STRING)
@click.option('--format', type=click.STRING, default=None)
@click.option(
  '--opt',
  help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)",
  required=False,
  multiple=True,
  callback=opt_callback,
  metavar='ARG=VALUE[,ARG=VALUE]',
)
@click.pass_context
def cli(
  ctx: click.Context, /, model_name: str, prompt: str, format: str | None, _memoized: dict[str, t.Any], **_: t.Any
) -> str | None:
  """Get the default prompt used by OpenLLM.

  Formats *prompt* with the model's default prompt template (selecting a
  variant via '--format' when the template is callable) and prints the result
  as a JSON object ``{"prompt": ...}``.
  """
  module = getattr(openllm_core.config, f'configuration_{model_name}')
  # opt_callback collects each --opt value into a list; keep only the first entry.
  _memoized = {k: v[0] for k, v in _memoized.items() if v}
  try:
    template = getattr(module, 'DEFAULT_PROMPT_TEMPLATE', None)
    prompt_mapping = getattr(module, 'PROMPT_MAPPING', None)
    if template is None:
      raise click.BadArgumentUsage(f'model {model_name} does not have a default prompt template') from None
    if callable(template):
      # A callable template selects between several named variants, so
      # '--format' is required and must be a key of PROMPT_MAPPING.
      if format is None:
        if not hasattr(module, 'PROMPT_MAPPING') or module.PROMPT_MAPPING is None:
          raise RuntimeError('Failed to find prompt mapping while DEFAULT_PROMPT_TEMPLATE is a function.')
        raise click.BadOptionUsage(
          'format',
          f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})",
        )
      if prompt_mapping is None:
        # BUGFIX: error message previously read "Failed to fine prompt mapping".
        raise click.BadArgumentUsage(
          f'Failed to find prompt mapping while the default prompt for {model_name} is a callable.'
        ) from None
      if format not in prompt_mapping:
        raise click.BadOptionUsage(
          'format', f'Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})'
        )
      _prompt_template = template(format)
    else:
      _prompt_template = template
    try:
      # backward-compatible. TO BE REMOVED once every model has default system message and prompt template.
      fully_formatted = process_prompt(prompt, _prompt_template, True, **_memoized)
    except RuntimeError as err:
      logger.debug('Exception caught while formatting prompt: %s', err)
      fully_formatted = openllm.AutoConfig.for_model(model_name).sanitize_parameters(
        prompt, prompt_template=_prompt_template
      )[0]
    termui.echo(orjson.dumps({'prompt': fully_formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white')
  except Exception as err:
    traceback.print_exc()
    raise click.ClickException(f'Failed to determine a default prompt template for {model_name}.') from err
  ctx.exit(0)
|
||||
@@ -1,35 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import click
|
||||
import inflection
|
||||
import orjson
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
|
||||
from bentoml._internal.utils import human_readable_size
|
||||
from openllm.cli import termui
|
||||
|
||||
|
||||
@click.command('list_bentos', context_settings=termui.CONTEXT_SETTINGS)
@click.pass_context
def cli(ctx: click.Context) -> None:
  """List available bentos built by OpenLLM.

  Prints a JSON mapping of model name to the bentos (and their packed models)
  built for it, then exits with status 0.
  """
  # Group OpenLLM-built bentos by their 'start_name' label in a single pass
  # over the store. (The previous version rescanned bentoml.list() once per
  # model name, which was accidentally O(models * bentos).)
  grouped: dict[str, list] = {}
  for b in bentoml.list():
    # Only bentos carrying both OpenLLM labels are considered.
    if not all(k in b.info.labels for k in {'start_name', 'bundler'}):
      continue
    grouped.setdefault(b.info.labels['start_name'], []).append(
      {
        'tag': str(b.tag),
        'size': human_readable_size(openllm.utils.calc_dir_size(b.path)),
        'models': [
          {'tag': str(m.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(m.path))}
          for m in (bentoml.models.get(_.tag) for _ in b.info.models)
        ],
      }
    )
  # Emit keys in canonical CONFIG_MAPPING order and drop models with no bentos,
  # matching the original output exactly.
  mapping = {
    k: grouped[k] for k in (inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()) if k in grouped
  }
  termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
  ctx.exit(0)
|
||||
@@ -1,52 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import click
|
||||
import inflection
|
||||
import orjson
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
|
||||
from bentoml._internal.utils import human_readable_size
|
||||
from openllm.cli import termui
|
||||
from openllm.cli._factory import model_complete_envvar
|
||||
from openllm.cli._factory import model_name_argument
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
|
||||
|
||||
@click.command('list_models', context_settings=termui.CONTEXT_SETTINGS)
@model_name_argument(required=False, shell_complete=model_complete_envvar)
def cli(model_name: str | None) -> DictStrAny:
  """This is equivalent to openllm models --show-available less the nice table."""

  def _is_openllm_model(model, name):
    # A store entry belongs to OpenLLM when the framework label matches and
    # its model_name label equals the requested name.
    labels = model.info.labels
    return labels.get('framework') == 'openllm' and labels.get('model_name') == name

  all_names = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
  store_entries = bentoml.models.list()
  matches = {name: [m for m in store_entries if _is_openllm_model(m, name)] for name in all_names}
  if model_name is not None:
    # Narrow the listing down to the single requested model.
    wanted = inflection.dasherize(model_name)
    matches = {
      name: [m for m in found if m.info.labels.get('model_name') == wanted] for name, found in matches.items()
    }
  matches = {name: found for name, found in matches.items() if found}
  local_models = {
    name: [{'tag': str(m.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(m.path))} for m in found]
    for name, found in matches.items()
  }
  termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
  return local_models
|
||||
@@ -1,114 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import importlib.machinery
|
||||
import logging
|
||||
import os
|
||||
import pkgutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import typing as t
|
||||
|
||||
import click
|
||||
import jupytext
|
||||
import nbformat
|
||||
import yaml
|
||||
|
||||
from openllm import playground
|
||||
from openllm.cli import termui
|
||||
from openllm_core.utils import is_jupyter_available
|
||||
from openllm_core.utils import is_jupytext_available
|
||||
from openllm_core.utils import is_notebook_available
|
||||
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def load_notebook_metadata() -> DictStrAny:
  """Load and validate the playground notebook metadata from ``_meta.yml``."""
  meta_path = os.path.join(os.path.dirname(playground.__file__), '_meta.yml')
  with open(meta_path, 'r') as f:
    metadata = yaml.safe_load(f)
  # Every notebook entry must carry a description for the generated markdown cell.
  if any('description' not in entry for entry in metadata.values()):
    raise ValueError("Invalid metadata file. All entries must have a 'description' key.")
  return metadata
|
||||
|
||||
|
||||
@click.command('playground', context_settings=termui.CONTEXT_SETTINGS)
@click.argument('output-dir', default=None, required=False)
@click.option(
  '--port',
  envvar='JUPYTER_PORT',
  show_envvar=True,
  show_default=True,
  default=8888,
  help='Default port for Jupyter server',
)
@click.pass_context
def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  """OpenLLM Playground.

  A collections of notebooks to explore the capabilities of OpenLLM.
  This includes notebooks for fine-tuning, inference, and more.

  All of the script available in the playground can also be run directly as a Python script:
  For example:

  \b
  ```bash
  python -m openllm.playground.falcon_tuned --help
  ```

  \b
  > [!NOTE]
  > This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"'
  """
  # All three optional dependencies are required to generate and serve notebooks.
  if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available():
    raise RuntimeError(
      "Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'"
    )
  metadata = load_notebook_metadata()
  _temp_dir = False
  if output_dir is None:
    # No output dir given: generate into a throwaway temp directory.
    _temp_dir = True
    output_dir = tempfile.mkdtemp(prefix='openllm-playground-')
  else:
    os.makedirs(os.path.abspath(os.path.expandvars(os.path.expanduser(output_dir))), exist_ok=True)

  termui.echo('The playground notebooks will be saved to: ' + os.path.abspath(output_dir), fg='blue')
  for module in pkgutil.iter_modules(playground.__path__):
    # Skip sub-packages and notebooks already generated by a previous run.
    if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + '.ipynb')):
      logger.debug(
        'Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module'
      )
      continue
    # Only FileFinder-backed modules expose a filesystem path we can read from.
    if not isinstance(module.module_finder, importlib.machinery.FileFinder):
      continue
    termui.echo('Generating notebook for: ' + module.name, fg='magenta')
    # Prepend a markdown cell with the notebook's description from _meta.yml.
    markdown_cell = nbformat.v4.new_markdown_cell(metadata[module.name]['description'])
    f = jupytext.read(os.path.join(module.module_finder.path, module.name + '.py'))
    f.cells.insert(0, markdown_cell)
    jupytext.write(f, os.path.join(output_dir, module.name + '.ipynb'), fmt='notebook')
  try:
    # Blocks until the Jupyter server exits (or the user hits Ctrl-C).
    subprocess.check_output(
      [
        sys.executable,
        '-m',
        'jupyter',
        'notebook',
        '--notebook-dir',
        output_dir,
        '--port',
        str(port),
        '--no-browser',
        '--debug',
      ]
    )
  except subprocess.CalledProcessError as e:
    termui.echo(e.output, fg='red')
    raise click.ClickException(f'Failed to start a jupyter server:\n{e}') from None
  except KeyboardInterrupt:
    # Ctrl-C shuts the server down; point the user at the generated notebooks
    # since the temp directory path was only shown at startup.
    termui.echo('\nShutting down Jupyter server...', fg='yellow')
    if _temp_dir:
      termui.echo('Note: You can access the generated notebooks in: ' + output_dir, fg='blue')
  ctx.exit(0)
|
||||
@@ -1,91 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import enum
|
||||
import functools
|
||||
import logging
|
||||
import os
|
||||
import typing as t
|
||||
|
||||
import click
|
||||
import inflection
|
||||
import orjson
|
||||
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
from openllm_core.utils import get_debug_mode
|
||||
|
||||
|
||||
logger = logging.getLogger('openllm')
|
||||
|
||||
|
||||
class Level(enum.IntEnum):
  """Log levels for colorized CLI output, mirroring the stdlib ``logging`` values."""

  # NOTE(review): NOTSET is bound to logging.DEBUG (not logging.NOTSET), so under
  # Enum aliasing NOTSET and DEBUG are the same member. Consequently the
  # Level.NOTSET / Level.DEBUG keys in the color map below collapse to a single
  # entry ('cyan' wins). Presumably intentional (NOTSET renders like DEBUG);
  # "fixing" it to logging.NOTSET would make from_logging_level raise KeyError
  # for NOTSET records — confirm before changing.
  NOTSET = logging.DEBUG
  DEBUG = logging.DEBUG
  INFO = logging.INFO
  WARNING = logging.WARNING
  ERROR = logging.ERROR
  CRITICAL = logging.CRITICAL

  @property
  def color(self) -> str | None:
    """Click color name used to render this level (None means unstyled)."""
    return {
      Level.NOTSET: None,
      Level.DEBUG: 'cyan',
      Level.INFO: 'green',
      Level.WARNING: 'yellow',
      Level.ERROR: 'red',
      Level.CRITICAL: 'red',
    }[self]

  @classmethod
  def from_logging_level(cls, level: int) -> Level:
    """Map a stdlib ``logging`` level number to its ``Level`` member.

    Raises KeyError if *level* is not one of the five standard levels.
    """
    return {
      logging.DEBUG: Level.DEBUG,
      logging.INFO: Level.INFO,
      logging.WARNING: Level.WARNING,
      logging.ERROR: Level.ERROR,
      logging.CRITICAL: Level.CRITICAL,
    }[level]
|
||||
|
||||
|
||||
class JsonLog(t.TypedDict):
  """Shape of the structured record emitted by ``log`` in non-debug mode."""

  # Severity of the record (serialized as its integer value by orjson).
  log_level: Level
  # The pre-formatted message text.
  content: str
|
||||
|
||||
|
||||
def log(content: str, level: Level = Level.INFO, fg: str | None = None) -> None:
  """Emit *content* to the terminal.

  In debug mode the raw text is printed directly; otherwise the message is
  wrapped in a structured :class:`JsonLog` payload and rendered through the
  JSON path of :func:`echo`.
  """
  if not get_debug_mode():
    payload = JsonLog(log_level=level, content=content)
    echo(orjson.dumps(payload).decode(), fg=fg, json=True)
  else:
    echo(content, fg=fg)
|
||||
|
||||
|
||||
# Convenience level-bound wrappers around log(); each fixes the severity and
# forwards content/fg unchanged.
warning = functools.partial(log, level=Level.WARNING)
error = functools.partial(log, level=Level.ERROR)
critical = functools.partial(log, level=Level.CRITICAL)
debug = functools.partial(log, level=Level.DEBUG)
info = functools.partial(log, level=Level.INFO)
notset = functools.partial(log, level=Level.NOTSET)
|
||||
|
||||
|
||||
def echo(text: t.Any, fg: str | None = None, *, _with_style: bool = True, json: bool = False, **attrs: t.Any) -> None:
  """Print *text* via click.

  When ``json`` is true, *text* is parsed as JSON: a payload containing both
  'content' and 'log_level' keys is treated as a log record whose color comes
  from the level; any other payload is re-serialized and colored according to
  the current debug mode. ``_with_style`` selects ``click.secho`` (styled)
  over plain ``click.echo``.
  """
  if not json:
    content = t.cast(str, text)
  else:
    parsed = orjson.loads(text)
    if 'content' in parsed and 'log_level' in parsed:
      # Structured log record: derive the color from the embedded level.
      content = parsed['content']
      fg = Level.from_logging_level(parsed['log_level']).color
    else:
      content = orjson.dumps(parsed).decode()
      fg = Level.DEBUG.color if get_debug_mode() else Level.INFO.color
  attrs['fg'] = fg

  writer = click.secho if _with_style else click.echo
  writer(content, **attrs)
|
||||
|
||||
|
||||
# Terminal width used for click's help rendering; overridable via $COLUMNS.
COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
# Shared click context settings for OpenLLM commands: -h/--help aliases, wide
# help output, and dash/underscore-insensitive option token matching.
CONTEXT_SETTINGS: DictStrAny = {
  'help_option_names': ['-h', '--help'],
  'max_content_width': COLUMNS,
  'token_normalize_func': inflection.underscore,
}
__all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS', 'log', 'warning', 'error', 'critical', 'debug', 'info', 'Level']
|
||||
Reference in New Issue
Block a user