OpenLLM/src/openllm/cli.py

# Copyright 2023 BentoML Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""CLI utilities for OpenLLM.

This module also exposes the SDK entry points, so ``start``, ``build`` and ``import_model`` can be called from Python.

Start any LLM:

```python
openllm.start("falcon", model_id='tiiuae/falcon-7b-instruct')
```

Build a BentoLLM

```python
bento = openllm.build("falcon")
```

Import any LLM into local store
```python
bentomodel = openllm.import_model("falcon", model_id='tiiuae/falcon-7b-instruct')
```
"""
from __future__ import annotations
import functools
import importlib.machinery
import importlib.util
import inspect
import itertools
import logging
import os
import pkgutil
import re
import subprocess
import sys
import tempfile
import time
import traceback
import typing as t

import attr
import click
import click_option_group as cog
import fs
import fs.errors
import inflection
import orjson
import yaml
from bentoml_cli.utils import BentoMLCommandGroup
from bentoml_cli.utils import opt_callback
from simple_di import Provide
from simple_di import inject

import bentoml
import openllm
from bentoml._internal.configuration.containers import BentoMLContainer
from bentoml._internal.models.model import ModelStore

from .__about__ import __version__
from .exceptions import OpenLLMException
from .utils import DEBUG
from .utils import ENV_VARS_TRUE_VALUES
from .utils import EnvVarMixin
from .utils import LazyLoader
from .utils import LazyType
from .utils import analytics
from .utils import available_devices
from .utils import codegen
from .utils import configure_logging
from .utils import dantic
from .utils import device_count
from .utils import first_not_none
from .utils import get_debug_mode
from .utils import get_quiet_mode
from .utils import is_jupyter_available
from .utils import is_jupytext_available
from .utils import is_notebook_available
from .utils import is_peft_available
from .utils import is_torch_available
from .utils import is_transformers_supports_agent
from .utils import resolve_user_filepath
from .utils import set_debug_mode
from .utils import set_quiet_mode


# The first branch is only evaluated by static type checkers; at runtime the
# ``else`` branch binds stand-ins for the same names.
if t.TYPE_CHECKING:
    import jupytext
    import nbformat
    import torch

    from bentoml._internal.bento import BentoStore
    from bentoml._internal.container import DefaultBuilder

    from ._types import ClickFunctionWrapper
    from ._types import DictStrAny
    from ._types import ListStr
    from ._types import LiteralRuntime
    from ._types import P

    # Literal aliases used in annotations only.
    ServeCommand = t.Literal["serve", "serve-grpc"]
    OutputLiteral = t.Literal["json", "pretty", "porcelain"]

    TupleStr = tuple[str, ...]
else:
    # Runtime alias is the plain ``tuple`` type (it is handed to ``LazyType`` in
    # ``parse_device_callback`` for an isinstance check).
    TupleStr = tuple
    # Optional packages are bound through ``LazyLoader`` instead of a direct import
    # (presumably deferring the real import until first use -- confirm in ``.utils``).
    torch = LazyLoader("torch", globals(), "torch")
    jupytext = LazyLoader("jupytext", globals(), "jupytext")
    nbformat = LazyLoader("nbformat", globals(), "nbformat")


# NOTE: We need to do this so that overload can register
# correct overloads to typing registry
if sys.version_info[:2] >= (3, 11):
    from typing import overload
else:
    from typing_extensions import overload

if sys.version_info[:2] >= (3, 12):
    from typing import override
else:
    from typing_extensions import override


# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Width used for help output: the COLUMNS environment variable when set, otherwise 120.
COLUMNS = int(os.environ.get("COLUMNS", 120))

# Click context settings shared by the groups and commands declared in this module.
# NOTE: this is a single mutable dict object; it is passed by reference to click.
_CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"], "max_content_width": COLUMNS}

# ASCII-art "OPENLLM" banner.
OPENLLM_FIGLET = """\
 ██████╗ ██████╗ ███████╗███╗   ██╗██╗     ██╗     ███╗   ███╗
██╔═══██╗██╔══██╗██╔════╝████╗  ██║██║     ██║     ████╗ ████║
██║   ██║██████╔╝█████╗  ██╔██╗ ██║██║     ██║     ██╔████╔██║
██║   ██║██╔═══╝ ██╔══╝  ██║╚██╗██║██║     ██║     ██║╚██╔╝██║
╚██████╔╝██║     ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
 ╚═════╝ ╚═╝     ╚══════╝╚═╝  ╚═══╝╚══════╝╚══════╝╚═╝     ╚═╝
"""


# ``_AnyCallable``: any callable. ``FC``: TypeVar bound to "a callable or a click.Command",
# used to type the option-decorator helpers below.
_AnyCallable = t.Callable[..., t.Any]
FC = t.TypeVar("FC", bound=t.Union[_AnyCallable, click.Command])


def parse_device_callback(
    ctx: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None
) -> TupleStr | None:
    """Click callback that flattens the values collected for ``--device``.

    Args:
        ctx: Current click context; ``ctx.fail`` is called when ``value`` is not a tuple.
        param: The parameter being processed (only used in the failure message).
        value: Tuple of tuples of device ids, or ``None`` when nothing was given.

    Returns:
        ``None`` when ``value`` is ``None``; the stringified ``available_devices()``
        when the single flattened entry is ``"all"``; otherwise the flattened tuple.
    """
    if value is None:
        return None

    is_tuple = LazyType(TupleStr).isinstance(value)
    if not is_tuple:
        ctx.fail(f"{param} only accept multiple values, not {type(value)} (value: {value})")

    flattened: TupleStr = tuple(device for chunk in value for device in chunk)

    # NOTE: '--device all' is a special case that expands to every available device.
    requested_all = len(flattened) == 1 and flattened[0] == "all"
    if requested_all:
        return tuple(str(device) for device in available_devices())
    return flattened


def _echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.Any) -> None:
    """Print ``text`` through click, styled unless ``_with_style`` is false.

    With styling enabled the text goes through ``click.secho`` using ``fg`` as the
    foreground colour; when debug mode is on, ``fg`` is passed as ``None`` instead.
    Remaining keyword arguments are forwarded unchanged.
    """
    if not _with_style:
        click.echo(text, **attrs)
        return

    attrs["fg"] = None if get_debug_mode() else fg
    click.secho(text, **attrs)


def output_option(f: t.Callable[[FC], FC], *, factory: t.Any = click) -> t.Callable[[FC], FC]:
    """Decorate ``f`` with the ``-o/--output`` option.

    Args:
        f: The callable (or command) to decorate.
        factory: Object providing ``option``; defaults to the ``click`` module.

    Returns:
        ``f`` wrapped with an ``output`` option restricted to ``json``, ``pretty`` or
        ``porcelain`` (default ``pretty``, also readable from ``OPENLLM_OUTPUT``).
    """
    add_output = factory.option(
        "-o",
        "--output",
        "output",
        type=click.Choice(["json", "pretty", "porcelain"]),
        default="pretty",
        help="Showing output type.",
        show_default=True,
        envvar="OPENLLM_OUTPUT",
        show_envvar=True,
    )
    return add_output(f)


def machine_option(factory: t.Any) -> t.Callable[[FC], FC]:
    """Return the hidden ``--machine`` boolean flag built with ``factory.option``."""
    flag_attrs = {"is_flag": True, "default": False, "hidden": True}
    return factory.option("--machine", **flag_attrs)


def model_id_option(factory: t.Any, model_env: EnvVarMixin | None = None) -> t.Callable[[FC], FC]:
    """Return the ``--model-id`` option built with ``factory.option``.

    When ``model_env`` is given, its ``model_id`` attribute is used as the option's
    environment variable and that variable is shown in the help output.
    """
    envvar = model_env.model_id if model_env is not None else None
    return factory.option(
        "--model-id",
        type=click.STRING,
        default=None,
        help="Optional model_id name or path for (fine-tune) weight.",
        envvar=envvar,
        show_envvar=envvar is not None,
    )


def model_version_option(factory: t.Any) -> t.Callable[[FC], FC]:
    """Return the optional string ``--model-version`` option built with ``factory.option``."""
    description = "Optional model version to save for this model. It will be inferred automatically from model-id."
    return factory.option("--model-version", type=click.STRING, default=None, help=description)


def workers_per_resource_option(factory: t.Any, build: bool = False) -> t.Callable[[FC], FC]:
    """Return the ``--workers-per-resource`` option built with ``factory.option``.

    Args:
        factory: Object providing ``option`` (e.g. ``click`` or an option group).
        build: When true, an extra note about the 'build' command is appended to the help.

    Returns:
        The option decorator; the value is kept as a string and defaults to ``None``.
    """
    base_help = """Number of workers per resource assigned.
    See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
    for more information. By default, this is set to 1.

    **Note**: ``--workers-per-resource`` will also accept the following strategies:

    - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.

    - ``conserved``: This will determine the number of available GPU resources, and only assign one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
    """
    build_note = """\n
    **Note**: The workers value passed into 'build' will determine how the LLM can
    be provisioned in Kubernetes as well as in standalone container. This will
    ensure it has the same effect with 'openllm start --workers ...'"""
    description = (base_help + build_note) if build else base_help
    return factory.option("--workers-per-resource", default=None, help=description, type=str, required=False)


def quantize_option(factory: t.Any, build: bool = False, model_env: EnvVarMixin | None = None) -> t.Callable[[FC], FC]:
    """Return the ``--quantise/--quantize`` option built with ``factory.option``.

    Args:
        factory: Object providing ``option``.
        build: Selects the build-time wording of the help text instead of the serving one.
        model_env: When given, its ``quantize`` attribute is used as the option's
            environment variable and shown in the help output.
    """
    envvar = None if model_env is None else model_env.quantize

    if build:
        help_str = "Set quantization mode for serving in deployment."
    else:
        help_str = "Running this model in quantized mode."
    help_str += """\n

    GPTQ is currently working in progress and will be available soon.

    NOTE: Quantization is only available for PyTorch models.
    """

    return factory.option(
        "--quantise",
        "--quantize",
        "quantize",
        type=click.Choice(["int8", "int4", "gptq"]),
        default=None,
        help=help_str,
        envvar=envvar,
        show_envvar=envvar is not None,
    )


def bettertransformer_option(
    factory: t.Any, build: bool = False, model_env: EnvVarMixin | None = None
) -> t.Callable[[FC], FC]:
    """Return the ``--bettertransformer`` flag built with ``factory.option``.

    Args:
        factory: Object providing ``option``.
        build: Selects the build-time help text instead of the serving-time one.
        model_env: When given, its ``bettertransformer`` attribute is used as the
            option's environment variable and shown in the help output.
    """
    envvar = model_env.bettertransformer if model_env is not None else None

    if build:
        description = (
            "Set defaul environment variable whether to serve this model with FasterTransformer in build time."
        )
    else:
        description = "Apply FasterTransformer wrapper to serve model. This will applies during serving time."

    return factory.option(
        "--bettertransformer",
        is_flag=True,
        default=None,
        help=description,
        envvar=envvar,
        show_envvar=envvar is not None,
    )


def serialisation_option(factory: t.Any) -> t.Callable[[FC], FC]:
    """Return the ``--serialisation/--serialization`` option built with ``factory.option``.

    The value is stored as ``serialisation_format``, must be ``safetensors`` or
    ``legacy`` (default ``safetensors``) and can be set via ``OPENLLM_SERIALIZATION``.
    """
    formats = click.Choice(["safetensors", "legacy"])
    description = "Serialisation format to save the model in. Default is safetensors, which is similar to `safe_serialization=True`. If the weight is not yet supported my safetensors, make sure to pass in '--serialisation legacy'"
    return factory.option(
        "--serialisation",
        "--serialization",
        "serialisation_format",
        type=formats,
        default="safetensors",
        help=description,
        show_default=True,
        show_envvar=True,
        envvar="OPENLLM_SERIALIZATION",
    )


# Key in ``ctx.params`` under which the parsed ``--adapter-id`` values are collected.
_adapter_mapping_key = "adapter_map"


def _id_callback(ctx: click.Context, _: click.Parameter, value: tuple[str, ...] | None) -> None:
    """Click callback collecting ``--adapter-id`` values into ``ctx.params``.

    Each value is split on its last ``:`` into ``adapter_id`` and an optional
    ``adapter_name``. The result is stored as ``{adapter_id: adapter_name or None}``
    under ``ctx.params[_adapter_mapping_key]``. Nothing is stored for an empty value.
    """
    if not value:
        return None

    adapter_map = ctx.params.setdefault(_adapter_mapping_key, {})
    for entry in value:
        parts = entry.rsplit(":", maxsplit=1)
        adapter_id = parts[0]
        adapter_name = parts[1] if len(parts) > 1 else None
        try:
            # Try to resolve the full path if the user passed a relative one;
            # currently only one level of path resolution is supported.
            adapter_id = resolve_user_filepath(adapter_id, os.getcwd())
        except FileNotFoundError:
            # Not a local path: keep the identifier as given.
            pass
        adapter_map[adapter_id] = adapter_name
    return None


@attr.define
class CliContext:
    """CLI state object holding the selected BentoCloud context name."""

    # BentoCloud context name; a ``None`` value is converted to the string "default".
    cloud_context: str | None = attr.field(converter=attr.converters.default_if_none("default"), default=None)

    def with_options(self, **attrs: t.Any) -> t.Self:
        """Return the result of ``attr.evolve`` on this instance with ``attrs`` applied."""
        evolved = attr.evolve(self, **attrs)
        return evolved


class OpenLLMCommandGroup(BentoMLCommandGroup):
    """Click group that adds OpenLLM's common options, usage tracking and error handling.

    Every command registered through :meth:`command` is wrapped, in order, by
    ``common_params`` -> ``usage_tracking`` -> ``exception_handling``. For the
    ``start`` and ``start-grpc`` groups, sub-commands are resolved dynamically per model
    (see :meth:`get_command`).
    """

    NUMBER_OF_COMMON_PARAMS = 5  # parameters in common_params + 1 faked group option header

    @staticmethod
    def common_params(f: FC) -> t.Callable[[FC], FC]:
        """This is not supposed to be used with unprocessed click function.

        This should be used as the last currying from common_params -> usage_tracking -> exception_handling.

        Adds the "Global options" group (``-q/--quiet``, ``--debug/--verbose``,
        ``--do-not-track``, ``--context``) to ``f``. The returned wrapper consumes
        ``quiet``, ``debug`` and ``cloud_context`` itself; ``do_not_track`` is consumed
        by the ``usage_tracking`` wrapper that is applied on top of this one.
        """
        # The following logic is similar to the one in BentoMLCommandGroup

        from bentoml._internal.configuration import DEBUG_ENV_VAR
        from bentoml._internal.configuration import QUIET_ENV_VAR

        @cog.optgroup.group("Global options")
        @cog.optgroup.option(
            "-q", "--quiet", envvar=QUIET_ENV_VAR, is_flag=True, default=False, help="Suppress all output."
        )
        @cog.optgroup.option(
            "--debug",
            "--verbose",
            "debug",
            envvar=DEBUG_ENV_VAR,
            is_flag=True,
            default=False,
            help="Print out debug logs.",
        )
        @cog.optgroup.option(
            "--do-not-track",
            is_flag=True,
            default=False,
            envvar=analytics.OPENLLM_DO_NOT_TRACK,
            help="Do not send usage info",
        )
        @cog.optgroup.option(
            "--context",
            "cloud_context",
            type=click.STRING,
            default=None,
            help="BentoCloud context name.",
        )
        @click.pass_context
        @functools.wraps(f)
        def wrapper(
            ctx: click.Context, quiet: bool, debug: bool, cloud_context: str | None, *args: P.args, **attrs: P.kwargs
        ) -> t.Any:
            # Expose the selected cloud context to the command through ``ctx.obj``.
            ctx.obj = CliContext(cloud_context=cloud_context)
            # '--quiet' takes precedence over '--debug' when both are given.
            if quiet:
                set_quiet_mode(True)
                if debug:
                    logger.warning("'--quiet' passed; ignoring '--verbose/--debug'")
            elif debug:
                set_debug_mode(True)

            configure_logging()

            # The global options were consumed above; only the command's own arguments are forwarded.
            return f(*args, **attrs)

        return wrapper

    @staticmethod
    def usage_tracking(func: _AnyCallable, group: click.Group, **attrs: t.Any) -> _AnyCallable:
        """This is not supposed to be used with unprocessed click function.

        This should be used as the last currying from common_params -> usage_tracking -> exception_handling.

        The returned wrapper consumes ``do_not_track``. When it is false, an
        ``analytics.OpenllmCliEvent`` carrying the duration (and, on failure, the
        error type and a return code) is tracked around the call to ``func``.
        """
        command_name = attrs.get("name", func.__name__)

        @functools.wraps(func)
        def wrapper(do_not_track: bool, *args: P.args, **attrs: P.kwargs) -> t.Any:
            if do_not_track:
                with analytics.set_bentoml_tracking():
                    return func(*args, **attrs)

            start_time = time.time_ns()

            with analytics.set_bentoml_tracking():
                assert group.name is not None, "group.name should not be None"
                event = analytics.OpenllmCliEvent(cmd_group=group.name, cmd_name=command_name)
                try:
                    return_value = func(*args, **attrs)
                    # time_ns() deltas are nanoseconds; dividing by 1e6 yields milliseconds.
                    duration_in_ms = (time.time_ns() - start_time) / 1e6
                    event.duration_in_ms = duration_in_ms
                    analytics.track(event)
                    return return_value
                except Exception as e:
                    duration_in_ms = (time.time_ns() - start_time) / 1e6
                    event.duration_in_ms = duration_in_ms
                    event.error_type = type(e).__name__
                    # NOTE(review): KeyboardInterrupt does not derive from Exception, so with
                    # ``except Exception`` the ``2`` branch below can never be taken.
                    event.return_code = 2 if isinstance(e, KeyboardInterrupt) else 1
                    analytics.track(event)
                    raise

        return wrapper

    @staticmethod
    def exception_handling(func: _AnyCallable, group: click.Group, **attrs: t.Any) -> ClickFunctionWrapper[..., t.Any]:
        """This is not supposed to be used with unprocessed click function.

        This should be used as the last currying from common_params -> usage_tracking -> exception_handling.

        The returned wrapper converts ``OpenLLMException`` into a red-styled
        ``click.ClickException`` and swallows ``KeyboardInterrupt`` (returning ``None``).
        """
        command_name = attrs.get("name", func.__name__)

        @functools.wraps(func)
        def wrapper(*args: P.args, **attrs: P.kwargs) -> t.Any:
            try:
                return func(*args, **attrs)
            except OpenLLMException as err:
                raise click.ClickException(
                    click.style(f"[{group.name}] '{command_name}' failed: " + err.message, fg="red")
                ) from err
            except KeyboardInterrupt:  # NOTE: silence KeyboardInterrupt
                pass

        return t.cast("ClickFunctionWrapper[..., t.Any]", wrapper)

    def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
        """Resolve ``cmd_name`` (after alias resolution) to a click command.

        Under the ``start`` / ``start-grpc`` groups the name is first looked up in the
        module-level ``_cached_http`` / ``_cached_grpc`` mappings; on a miss it is
        treated as a Bento tag and passed to ``start_command_factory``. If no such
        Bento exists, ``click.BadArgumentUsage`` is raised. For any other group the
        lookup is delegated to the parent class.
        """
        cmd_name = self.resolve_alias(cmd_name)
        if ctx.command.name == "start":
            try:
                return _cached_http[cmd_name]
            except KeyError:
                # support start from a bento
                try:
                    return start_command_factory(bentoml.get(cmd_name), _context_settings=_CONTEXT_SETTINGS)
                except bentoml.exceptions.NotFound:
                    pass
                raise click.BadArgumentUsage(
                    f"{cmd_name} is not a valid model identifier supported by OpenLLM."
                ) from None
        elif ctx.command.name == "start-grpc":
            try:
                return _cached_grpc[cmd_name]
            except KeyError:
                # support start from a bento
                try:
                    return start_command_factory(
                        bentoml.get(cmd_name), _context_settings=_CONTEXT_SETTINGS, _serve_grpc=True
                    )
                except bentoml.exceptions.NotFound:
                    pass
                raise click.BadArgumentUsage(
                    f"{cmd_name} is not a valid model identifier supported by OpenLLM."
                ) from None
        return super().get_command(ctx, cmd_name)

    def list_commands(self, ctx: click.Context) -> list[str]:
        """List sub-command names; for ``start``/``start-grpc`` these are the keys of ``openllm.CONFIG_MAPPING``."""
        if ctx.command.name == "start" or ctx.command.name == "start-grpc":
            return list(openllm.CONFIG_MAPPING.keys())

        return super().list_commands(ctx)

    @override
    def command(self, *args: t.Any, **attrs: t.Any):
        """Override the default 'cli.command' with supports for aliases for given command, and it wraps the implementation with common parameters."""
        # Ensure a help width is always configured (120 unless the caller supplied one).
        if "context_settings" not in attrs:
            attrs["context_settings"] = {}
        if "max_content_width" not in attrs["context_settings"]:
            attrs["context_settings"]["max_content_width"] = 120
        aliases = attrs.pop("aliases", None)

        def wrapper(f: _AnyCallable) -> click.Command:
            # Default command name: function name, lower-cased, without a trailing
            # "_command" (8 characters), and with underscores turned into dashes.
            name = f.__name__.lower()
            if name.endswith("_command"):
                name = name[:-8]
            name = name.replace("_", "-")
            # The function docstring becomes the help text unless 'help' was passed explicitly.
            attrs.setdefault("help", inspect.getdoc(f))
            attrs.setdefault("name", name)

            # Wrap implementation with common parameters
            wrapped = self.common_params(f)
            # Wrap into OpenLLM tracking
            wrapped = self.usage_tracking(wrapped, self, **attrs)
            # Wrap into exception handling
            wrapped = self.exception_handling(wrapped, self, **attrs)

            # move common parameters to end of the parameters list
            wrapped.__click_params__ = (
                wrapped.__click_params__[-self.NUMBER_OF_COMMON_PARAMS :]
                + wrapped.__click_params__[: -self.NUMBER_OF_COMMON_PARAMS]
            )

            # NOTE: we need to call super of super to avoid conflict with BentoMLCommandGroup command
            # setup
            cmd = super(BentoMLCommandGroup, self).command(*args, **attrs)(wrapped)
            # NOTE: add aliases to a given commands if it is specified.
            if aliases is not None:
                assert cmd.name
                # ``_commands`` / ``_aliases`` are not defined in this class; presumably they are
                # the alias bookkeeping structures of BentoMLCommandGroup -- verify there.
                self._commands[cmd.name] = aliases
                self._aliases.update({alias: cmd.name for alias in aliases})

            return cmd

        return wrapper


# Root ``openllm`` command group (also provides ``--version`` / ``-v``).
# NOTE: click uses a command function's docstring as its help text by default, so the
# docstring below is user-facing output and is intentionally left untouched.
@click.group(cls=OpenLLMCommandGroup, context_settings=_CONTEXT_SETTINGS, name="openllm")
@click.version_option(__version__, "--version", "-v")
def cli() -> None:
    """\b
     ██████╗ ██████╗ ███████╗███╗   ██╗██╗     ██╗     ███╗   ███╗
    ██╔═══██╗██╔══██╗██╔════╝████╗  ██║██║     ██║     ████╗ ████║
    ██║   ██║██████╔╝█████╗  ██╔██╗ ██║██║     ██║     ██╔████╔██║
    ██║   ██║██╔═══╝ ██╔══╝  ██║╚██╗██║██║     ██║     ██║╚██╔╝██║
    ╚██████╔╝██║     ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
     ╚═════╝ ╚═╝     ╚══════╝╚═╝  ╚═══╝╚══════╝╚══════╝╚═╝     ╚═╝.

    \b
    An open platform for operating large language models in production.
    Fine-tune, serve, deploy, and monitor any LLMs with ease.
    """  # noqa: D205


# ``openllm start`` (alias ``start-http``): parent group for the per-model HTTP commands,
# which ``OpenLLMCommandGroup.get_command`` resolves by model name or Bento tag.
@cli.group(cls=OpenLLMCommandGroup, context_settings=_CONTEXT_SETTINGS, name="start", aliases=["start-http"])
def start_command() -> None:
    """Start any LLM as a REST server.

    \b
    ```bash
    $ openllm <start|start-http> <model_name> --<options> ...
    ```
    """


# ``openllm start-grpc``: parent group for the per-model gRPC commands, which
# ``OpenLLMCommandGroup.get_command`` resolves by model name or Bento tag.
@cli.group(cls=OpenLLMCommandGroup, context_settings=_CONTEXT_SETTINGS, name="start-grpc")
def start_grpc_command() -> None:
    """Start any LLM as a gRPC server.

    \b
    ```bash
    $ openllm start-grpc <model_name> --<options> ...
    ```
    """


# NOTE: Names of `bentoml serve` options that are not re-exposed by `parse_serve_args`.
# NOTE: Users shouldn't set '--working-dir', as OpenLLM sets this up itself.
# NOTE: 'production' is also deprecated.
_IGNORED_OPTIONS = {"working_dir", "production", "protocol_version"}


def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., openllm.LLMConfig]], t.Callable[[FC], FC]]:
    """Build a decorator that re-exposes the `bentoml serve|serve-grpc` options on `openllm start`.

    Args:
        serve_grpc: Mirror ``bentoml serve-grpc`` when true, ``bentoml serve`` otherwise.

    Returns:
        A decorator that adds the mirrored options to a function, inside one option group.
    """
    from bentoml_cli.cli import cli

    command = "serve-grpc" if serve_grpc else "serve"
    group = cog.optgroup.group(
        f"Start a {'gRPC' if serve_grpc else 'HTTP'} server options",
        help=f"Related to serving the model [synonymous to `bentoml {command if serve_grpc else 'serve-http'}`]",
    )
    # Entries of ``to_info_dict()`` that are dropped before re-creating the option:
    # everything here is an option (no 'param_type_name' needed), 'name' is not a valid
    # argument, and the type can be determined from the default value.
    dropped_info_keys = ("param_type_name", "name", "type")

    def decorator(f: t.Callable[t.Concatenate[int, str | None, P], openllm.LLMConfig]):
        serve_params = cli.commands[command].params
        # Skip the leading 'bento' argument and the trailing common parameters
        # (BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS of them).
        candidates = serve_params[1 : -BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS]
        for param in reversed(candidates):
            if param.name in _IGNORED_OPTIONS:
                continue
            info = param.to_info_dict()
            for key in dropped_info_keys:
                info.pop(key)
            param_decls = (*info.pop("opts"), *info.pop("secondary_opts"))
            f = cog.optgroup.option(*param_decls, **info)(f)

        return group(f)

    return decorator


# Decorators built once at import time and reused by ``start_decorator``:
# one mirroring `bentoml serve`, one mirroring `bentoml serve-grpc`.
_http_server_args = parse_serve_args(False)
_grpc_server_args = parse_serve_args(True)


def start_decorator(
    llm_config: openllm.LLMConfig, serve_grpc: bool = False
) -> t.Callable[[_AnyCallable], t.Callable[[FC], FC]]:
    """Build the decorator that attaches every option of a model start command.

    Args:
        llm_config: Configuration of the model; supplies its own click options, the
            environment-variable helper (``llm_config["env"]``) and the model name.
        serve_grpc: Use the gRPC server options instead of the HTTP ones.

    Returns:
        A decorator applying all options so that they appear in the order listed below.
    """
    server_args = _grpc_server_args if serve_grpc else _http_server_args

    config_and_server = [llm_config.to_click_options, server_args]

    general = [
        cog.optgroup.group("General LLM Options", help="The following options are related to running the LLM Server."),
        cog.optgroup.option(
            "--server-timeout",
            type=int,
            default=None,
            help="Server timeout in seconds",
        ),
        workers_per_resource_option(cog.optgroup),
        model_id_option(cog.optgroup, model_env=llm_config["env"]),
        model_version_option(cog.optgroup),
        cog.optgroup.option(
            "--fast",
            is_flag=True,
            default=False,
            help="Bypass auto model checks and setup. This option is ahead-of-serving time.",
        ),
    ]

    optimization = [
        cog.optgroup.group(
            "LLM Optimization Options",
            help="""\
    These options are related for dynamic optimization on the fly. Current supported strategies:

    - int8: Quantize the model with 8bit (bitsandbytes required)

    - int4: Quantize the model with 4bit (bitsandbytes required)

    - bettertransformer: Convert given model to FastTransformer

    - GPTQ: [paper](https://arxiv.org/abs/2210.17323)

    It also include serialisation format strategies for faster loading and importing time

    - safetensors: Using safetensors instead of pickling (safetensors required) [default]

    - legacy: using default torch load behaviour.

    The following are currently being worked on:

    - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)

      """,
        ),
        cog.optgroup.option(
            "--device",
            type=dantic.CUDA,
            multiple=True,
            envvar="CUDA_VISIBLE_DEVICES",
            callback=parse_device_callback,
            help=f"Assign GPU devices (if available) for {llm_config['model_name']}.",
            show_envvar=True,
        ),
        cog.optgroup.option(
            "--runtime",
            type=click.Choice(["ggml", "transformers"]),
            default="transformers",
            help="The runtime to use for the given model. Default is transformers.",
        ),
        quantize_option(cog.optgroup, model_env=llm_config["env"]),
        bettertransformer_option(cog.optgroup, model_env=llm_config["env"]),
        serialisation_option(cog.optgroup),
    ]

    fine_tuning = [
        cog.optgroup.group(
            "Fine-tuning related options",
            help="""\
    Note that the argument `--adapter-id` can accept the following format:

    - `--adapter-id /path/to/adapter` (local adapter)

    - `--adapter-id remote/adapter` (remote adapter from HuggingFace Hub)

    - `--adapter-id remote/adapter:eng_lora` (two previous adapter options with the given adapter_name)

    ```bash

    $ openllm start opt --adapter-id /path/to/adapter_dir --adapter-id remote/adapter:eng_lora

    ```
    """,
        ),
        cog.optgroup.option(
            "--adapter-id",
            default=None,
            help="Optional name or path for given LoRA adapter" + f" to wrap '{llm_config['model_name']}'",
            multiple=True,
            callback=_id_callback,
            metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]",
        ),
    ]

    internal = [
        click.option("--return-process", is_flag=True, default=False, help="Internal use only.", hidden=True),
    ]

    ordered = [*config_and_server, *general, *optimization, *fine_tuning, *internal]

    def decorator(f: _AnyCallable) -> _AnyCallable:
        # Decorators apply bottom-up, so walk the list backwards to keep the listed order.
        wrapped = f
        for option in ordered[::-1]:
            wrapped = option(wrapped)
        return wrapped

    return decorator


def parse_config_options(
    config: openllm.LLMConfig,
    server_timeout: int,
    workers_per_resource: float,
    device: tuple[str, ...] | None,
    environ: DictStrAny,
) -> DictStrAny:
    """Merge the generated BentoML options into ``environ["BENTOML_CONFIG_OPTIONS"]``.

    Args:
        config: Model configuration; ``config["start_name"]`` names the runner and
            ``config["timeout"]`` is used as the runner traffic timeout.
        server_timeout: Value for ``api_server.traffic.timeout``.
        workers_per_resource: Value for the runner's ``workers_per_resource``.
        device: GPU device ids to assign to the runner, or ``None``/empty for none.
        environ: Environment mapping to update. It is modified in place.

    Returns:
        The same ``environ`` mapping. Any pre-existing ``BENTOML_CONFIG_OPTIONS`` value
        is kept and the generated options are appended after it, separated by a space.
    """
    runner_key = f'runners."llm-{config["start_name"]}-runner"'
    options = [
        "tracing.sample_rate=1.0",
        f"api_server.traffic.timeout={server_timeout}",
        f"{runner_key}.traffic.timeout={config['timeout']}",
        f"{runner_key}.workers_per_resource={workers_per_resource}",
    ]
    if device:
        if len(device) > 1:
            # Several devices: one indexed entry per device.
            options.extend(
                f'{runner_key}.resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)
            )
        else:
            options.append(f'{runner_key}.resources."nvidia.com/gpu"=[{device[0]}]')

    existing = environ.pop("BENTOML_CONFIG_OPTIONS", "")
    merged = " ".join(options)
    # The previous one-liner (`env += " " if env else "" + joined`) parsed as
    # `" " if env else ("" + joined)`, silently dropping every generated option
    # whenever the variable was already set. Build the value explicitly instead.
    if existing:
        merged = f"{existing} {merged}"
    environ["BENTOML_CONFIG_OPTIONS"] = merged
    return environ


def start_command_factory(
    model_name_or_bento: str | bentoml.Bento,
    _context_settings: DictStrAny | None = None,
    _serve_grpc: bool = False,
) -> click.Command:
    """Generate the 'click.Command' that starts a server for a model name or a Bento.

    Args:
        model_name_or_bento: The name of the model or a ``bentoml.Bento`` instance.
        _context_settings: Click context settings used for the command created from a
            model name (an empty dict when omitted).
        _serve_grpc: Register the command under ``start-grpc`` instead of ``start``.

    Returns:
        The command built by ``start_model`` (model name) or ``start_bento`` (Bento).

    Raises:
        click.BadOptionUsage: If the Bento carries no ``start_name`` label.
    """
    group = start_grpc_command if _serve_grpc else start_command

    if not isinstance(model_name_or_bento, bentoml.Bento):
        llm_config = openllm.AutoConfig.for_model(model_name_or_bento)

        return start_model(
            group,
            model_name_or_bento,
            llm_config,
            _serve_grpc,
            name=llm_config["model_name"],
            context_settings=_context_settings or {},
            short_help=f"Start a LLMServer for '{model_name_or_bento}'",
            aliases=[llm_config["start_name"]] if llm_config["name_type"] == "dasherize" else None,
            help=f"""\
{llm_config['env'].start_docstring}

\b
Note: ``{llm_config['start_name']}`` can also be run with any other models available on HuggingFace
or fine-tuned variants as long as it belongs to the architecture generation ``{llm_config['architecture']}`` (trust_remote_code={llm_config['trust_remote_code']}).

\b
For example: One can start [Fastchat-T5](https://huggingface.co/lmsys/fastchat-t5-3b-v1.0) with ``openllm start flan-t5``:

\b
$ openllm start flan-t5 --model-id lmsys/fastchat-t5-3b-v1.0

\b
Available official model_id(s): [default: {llm_config['default_id']}]

\b
{orjson.dumps(llm_config['model_ids'], option=orjson.OPT_INDENT_2).decode()}
""",
        )

    # From here on we are starting from a Bento: its labels carry the model identity.
    labels = model_name_or_bento.info.labels
    if "start_name" not in labels:
        raise click.BadOptionUsage(
            "model_name",
            f"'{model_name_or_bento.tag}' is built with older version of OpenLLM and not supported with 'openllm start'. Please use 'bentoml {'serve-http' if not _serve_grpc else 'serve-grpc'} {model_name_or_bento.tag!s}' instead.",
        )
    config_cls = openllm.AutoConfig.infer_class_from_name(labels["start_name"])
    llm_config = config_cls.model_construct_json(labels["configuration"])
    return start_bento(group, model_name_or_bento, llm_config, _serve_grpc, context_settings=_CONTEXT_SETTINGS)


def start_bento_docstring(bento: bentoml.Bento, llm_config: openllm.LLMConfig, serve_grpc: bool) -> str:
    """Render the help text of the command that starts ``bento``.

    The text shows the equivalent ``bentoml`` invocation, including the environment
    variables it needs, followed by hints for assigning GPUs via BENTOML_CONFIG_OPTIONS.

    Args:
        bento: The Bento to start; its ``_framework`` and ``_type`` labels name the model.
        llm_config: Configuration supplying the timeout, workers per resource and start name.
        serve_grpc: Show the ``serve-grpc`` variant of the command instead of the HTTP one.
    """
    labels = bento.info.labels
    # Options computed from the configuration defaults, without any device assignment.
    config_options = parse_config_options(
        llm_config, llm_config["timeout"], llm_config["workers_per_resource"], None, {}
    )["BENTOML_CONFIG_OPTIONS"]

    serve_cmd_envvar = {}
    serve_cmd_envvar["OPENLLM_MODEL_ID"] = f"$(bentoml models get {labels['_framework']}-{labels['_type']} -o path)"
    serve_cmd_envvar["BENTOML_DEBUG"] = get_debug_mode()
    serve_cmd_envvar["BENTOML_CONFIG_OPTIONS"] = config_options

    return f"""\
Start {bento!r} with OpenLLM.

\b
This is a lightwrapper around 'bentoml serve' to provide nicer interaction with LLM Bentos.

\b
The equivalent 'bentoml {'serve' if not serve_grpc else 'serve-grpc'}' command:

\b
```bash
$ {' '.join([f'{k}={v}' for k, v in serve_cmd_envvar.items()])} bentoml {'serve-http' if not serve_grpc else 'serve-grpc'} {bento.tag!s}
```

\b
> Note that if you want to enable GPU with 'bentoml serve', add the following to BENTOML_CONFIG_OPTIONS:

\b
If you have more than 1 GPU:

\b
```bash
BENTOML_CONFIG_OPTIONS += ' runners."llm-{llm_config['start_name']}-runner".resources."nvidia.com/gpu"[<gpu_idx>]=<gpu_device_id>'
```

Make sure to adjust `workers_per_resource` in BENTOML_CONFIG_OPTIONS accordingly. See https://docs.bentoml.com/en/latest/guides/scheduling.html
for more information.

\b
If you only have 1 GPU:

\b
```bash
BENTOML_CONFIG_OPTIONS += ' runners."llm-{llm_config['start_name']}-runner".resources."nvidia.com/gpu"=[<gpu_device_id>]'
```
"""


def noop_command(
    group: click.Group, llm_config: openllm.LLMConfig, reason: str, **command_attrs: t.Any
) -> click.Command:
    """Register a placeholder command that only reports why the model cannot start.

    Args:
        group: Group on which the command is registered.
        llm_config: Configuration returned by the command and reported to analytics.
        reason: Message echoed in red when the command is invoked.
        **command_attrs: Attributes forwarded to ``group.command``. ``context_settings``
            is extended with ``ignore_unknown_options`` and ``allow_extra_args`` so the
            placeholder accepts whatever arguments the real command would have taken.

    Returns:
        The registered command.
    """
    # Work on a copy: callers may hand in a shared dict (e.g. the module-level
    # ``_CONTEXT_SETTINGS``), which must not pick up the permissive flags below.
    context_settings = dict(command_attrs.pop("context_settings", None) or {})
    context_settings.update(ignore_unknown_options=True, allow_extra_args=True)
    command_attrs["context_settings"] = context_settings

    # NOTE: The model requires GPU, therefore we will return a dummy command
    @group.command(**command_attrs)
    def noop(**_: t.Any) -> openllm.LLMConfig:
        _echo(reason, fg="red")
        analytics.track_start_init(llm_config)
        return llm_config

    return noop


def prerequisite_check(
    ctx: click.Context,
    llm_config: openllm.LLMConfig,
    quantize: t.LiteralString | None,
    adapter_map: dict[str, str | None] | None,
    num_workers: int,
) -> None:
    """Validate the environment before a model server is started.

    Args:
        ctx: Current click context; ``ctx.exit(1)`` is called on fatal problems.
        llm_config: Model configuration; ``llm_config["requirements"]`` lists module
            names that should be importable.
        quantize: Requested quantization mode, if any. Requires at least one GPU.
        adapter_map: Parsed adapters, if any. Requires ``peft`` to be available.
        num_workers: Number of GPUs each runner worker is inferred to need.

    Raises:
        click.BadOptionUsage: If ``num_workers`` is greater than 1 and exceeds the
            number of available devices.
    """
    if quantize and device_count() < 1:
        _echo("Quantization requires at least 1 GPU (got None)", fg="red")
        ctx.exit(1)

    if adapter_map and not is_peft_available():
        _echo(
            "Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'",
            fg="red",
        )
        ctx.exit(1)

    # Missing optional requirements only produce a warning, not a failure.
    requirements = llm_config["requirements"]
    if requirements:
        missing_requirements = [i for i in requirements if importlib.util.find_spec(i) is None]
        if missing_requirements:
            _echo(
                f"Make sure to have the following dependencies available: {missing_requirements}",
                fg="yellow",
            )

    if num_workers > 1:
        available = device_count()
        if available < num_workers:
            message = (
                f"# of workers is infered to {num_workers} GPUs per runner worker, while there are only "
                f"'{available}' for inference."
            )
            # The tip divides by the device count, so it can only be offered when at
            # least one device exists (previously this raised ZeroDivisionError).
            if available > 0:
                message += f" (Tip: Try again using '--workers-per-resource={1/available}')"
            raise click.BadOptionUsage("workers_per_resource", message, ctx=ctx)


# Named strategies accepted by '--workers-per-resource' in addition to numeric values
# (see the help text in ``workers_per_resource_option``).
_wpr_strategies = {"round_robin", "conserved"}


def start_bento(
    group: click.Group,
    bento: bentoml.Bento,
    llm_config: openllm.LLMConfig,
    serve_grpc: bool,
    **command_attrs: t.Any,
) -> click.Command:
    """Register a command on ``group`` that serves an already built Bento.

    When the configuration requires a GPU and none is detected, a no-op command that
    only reports the problem is registered instead.

    Args:
        group: The click group the command is attached to.
        bento: The Bento to serve. Its ``_type`` and ``_framework`` labels name the model.
        llm_config: Configuration of the LLM packaged in ``bento``.
        serve_grpc: Serve through ``bentoml.GrpcServer`` when true, ``bentoml.HTTPServer`` otherwise.
        **command_attrs: Attributes forwarded to ``group.command``; ``help`` is overwritten.

    Returns:
        The registered click command.

    Raises:
        OpenLLMException: If the model lookup reports ``bentoml.exceptions.NotFound``.
    """
    if llm_config["requires_gpu"] and device_count() < 1:
        return noop_command(
            group, llm_config, f"No GPU available, while {bento!r} requires GPU to run.", **command_attrs
        )

    command_attrs["help"] = start_bento_docstring(bento, llm_config, serve_grpc)

    # Now we have to format the model_id accordingly based on the model_fs
    model_type = bento.info.labels["_type"]
    model_framework = bento.info.labels["_framework"]
    model_tag = f"{model_framework}-{model_type}"
    # The NotFound handler wraps both lookups: an ``except`` clause does not cover errors
    # raised inside a sibling handler, so the fallback lookup has to live in an inner try.
    try:
        try:
            # the models should have the type
            model = ModelStore(bento._fs.opendir("models")).get(model_tag)
        except fs.errors.ResourceNotFound:
            # new behaviour with BentoML models
            model = BentoMLContainer.model_store.get().get(model_tag)
    except bentoml.exceptions.NotFound:
        raise OpenLLMException(f"Failed to find models for {llm_config['start_name']}") from None

    # NOTE: no docstring on the callback; click may fall back to it for help text.
    @group.command(**command_attrs)
    @start_decorator(llm_config, serve_grpc=serve_grpc)
    @click.pass_context
    def start_cmd(
        ctx: click.Context,
        server_timeout: int,
        model_id: str | None,
        # 'model_version' and 'adapter_id' are not read below; naming them keeps them out of 'attrs'.
        model_version: str | None,
        workers_per_resource: t.LiteralString | float,
        device: tuple[str, ...],
        quantize: t.Literal["int8", "int4", "gptq"] | None,
        bettertransformer: bool | None,
        runtime: t.Literal["ggml", "transformers"],
        fast: bool,
        adapter_id: str | None,
        return_process: bool,
        serialisation_format: t.Literal["safetensors", "legacy"],
        **attrs: t.Any,
    ) -> openllm.LLMConfig | subprocess.Popen[bytes]:
        if model_id is not None:
            _echo("'model_id' has no effect when starting a BentoLLM", fg="yellow")

        adapter_map: dict[str, str | None] | None = attrs.pop(_adapter_mapping_key, None)

        config, server_attrs = llm_config.model_validate_click(**attrs)
        server_timeout = first_not_none(server_timeout, default=config["timeout"])

        server_attrs.update({"working_dir": os.path.dirname(__file__), "timeout": server_timeout})
        if serve_grpc:
            server_attrs["grpc_protocol_version"] = "v1"
        # NOTE: currently, theres no development args in bentoml.Server. To be fixed upstream.
        development = server_attrs.pop("development")
        server_attrs.setdefault("production", not development)

        workers_per_resource = first_not_none(workers_per_resource, default=config["workers_per_resource"])

        if isinstance(workers_per_resource, str):
            if workers_per_resource == "round_robin":
                workers_per_resource = 1.0
            elif workers_per_resource == "conserved":
                # One worker spanning every selected (or detected) GPU.
                available_gpu = device if device else available_devices()
                workers_per_resource = 1.0 if len(available_gpu) == 0 else float(1 / len(available_gpu))
            else:
                try:
                    workers_per_resource = float(workers_per_resource)
                except ValueError:
                    ctx.fail(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies.")

        # Guard the division below: zero would otherwise surface as a ZeroDivisionError traceback.
        if workers_per_resource <= 0:
            ctx.fail("'workers_per_resource' must be a positive number.")

        num_workers = int(1 / workers_per_resource)

        # Create a new model env to work with the envvar during CLI invocation
        env = EnvVarMixin(
            config["model_name"],
            config["default_implementation"],
            bettertransformer=bettertransformer,
            quantize=quantize,
            runtime=runtime,
        )

        prerequisite_check(ctx, config, quantize, adapter_map, num_workers)

        # NOTE: This is to set current configuration
        start_env = os.environ.copy()
        start_env = parse_config_options(config, server_timeout, workers_per_resource, device, start_env)

        if fast:
            _echo(f"Fast mode has no effects when 'start' {bento.tag!s}", fg="yellow")

        start_env.update(
            {
                env.framework: env.framework_value,
                env.config: config.model_dump_json().decode(),
                env.runtime: env.runtime_value,
                "BENTOML_DEBUG": str(not get_quiet_mode()),
                "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()),
                "OPENLLM_MODEL_ID": model.path,
                "OPENLLM_SERIALIZATION": serialisation_format,
            }
        )

        if adapter_map:
            _echo(f"OpenLLM will convert '{bento.tag!s}' to use provided adapters layers: {list(adapter_map)}")
        # Always exported, even when no adapter was requested (serialised as JSON null).
        start_env["OPENLLM_ADAPTER_MAP"] = orjson.dumps(adapter_map).decode()

        if bettertransformer is not None:
            start_env[env.bettertransformer] = str(bettertransformer)
        if quantize is not None:
            start_env[env.quantize] = quantize

        if serve_grpc:
            server = bentoml.GrpcServer(bento, **server_attrs)
        else:
            server = bentoml.HTTPServer(bento, **server_attrs)
        analytics.track_start_init(config)

        if return_process:
            # Non-blocking start: hand the server process back to the caller.
            server.start(env=start_env, text=True)
            process = server.process
            if process is None:
                raise click.ClickException("Failed to start the server.")
            return process

        try:
            server.start(env=start_env, text=True, blocking=True)
        except Exception as err:
            _echo(f"Error caught while running LLM Server:\n{err}", fg="red")

        # NOTE: Return the configuration for telemetry purposes.
        return config

    return start_cmd


def start_model(
    group: click.Group,
    model_name: str,
    llm_config: openllm.LLMConfig,
    serve_grpc: bool,
    **command_attrs: t.Any,
) -> click.Command:
    """Register a command on ``group`` that serves ``model_name`` through ``_service.py:svc``.

    When the configuration requires a GPU and none is detected, a no-op command that
    only reports the problem is registered instead.

    Args:
        group: The click group the command is attached to.
        model_name: Name of the model to start.
        llm_config: Configuration for ``model_name``.
        serve_grpc: Serve through ``bentoml.GrpcServer`` when true, ``bentoml.HTTPServer`` otherwise.
        **command_attrs: Attributes forwarded to ``group.command``.

    Returns:
        The registered click command.
    """
    if llm_config["requires_gpu"] and device_count() < 1:
        # NOTE: The model requires GPU, therefore we will return a dummy command
        command_attrs.update(
            {
                "short_help": "(Disabled because there is no GPU available)",
                "help": f"""{model_name} is currently not available to run on your
                local machine because it requires GPU for inference.""",
            }
        )
        return noop_command(group, llm_config, "No GPU available, therefore this command is disabled", **command_attrs)

    # NOTE: no docstring on the callback; click may fall back to it for help text.
    @group.command(**command_attrs)
    @start_decorator(llm_config, serve_grpc=serve_grpc)
    @click.pass_context
    def start_cmd(
        ctx: click.Context,
        server_timeout: int,
        model_id: str | None,
        model_version: str | None,
        workers_per_resource: t.LiteralString | float,
        device: tuple[str, ...],
        quantize: t.Literal["int8", "int4", "gptq"] | None,
        bettertransformer: bool | None,
        runtime: t.Literal["ggml", "transformers"],
        fast: bool,
        serialisation_format: t.Literal["safetensors", "legacy"],
        # 'adapter_id' is not read below; naming it keeps it out of 'attrs'.
        adapter_id: str | None,
        return_process: bool,
        **attrs: t.Any,
    ) -> openllm.LLMConfig | subprocess.Popen[bytes]:
        if serialisation_format == "safetensors" and quantize is not None:
            # The warning is shown while the variable is truthy (the default), so the
            # message has to point at a falsy value to silence it.
            if os.getenv("OPENLLM_SERIALIZATION_WARNING", str(True)).upper() in ENV_VARS_TRUE_VALUES:
                _echo(
                    f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
                    fg="yellow",
                )
        adapter_map: dict[str, str | None] | None = attrs.pop(_adapter_mapping_key, None)

        config, server_attrs = llm_config.model_validate_click(**attrs)
        server_timeout = first_not_none(server_timeout, default=config["timeout"])

        server_attrs.update({"working_dir": os.path.dirname(__file__), "timeout": server_timeout})
        if serve_grpc:
            server_attrs["grpc_protocol_version"] = "v1"
        # NOTE: currently, theres no development args in bentoml.Server. To be fixed upstream.
        development = server_attrs.pop("development")
        server_attrs.setdefault("production", not development)

        workers_per_resource = first_not_none(workers_per_resource, default=config["workers_per_resource"])

        if isinstance(workers_per_resource, str):
            if workers_per_resource == "round_robin":
                workers_per_resource = 1.0
            elif workers_per_resource == "conserved":
                # One worker spanning every selected (or detected) GPU.
                available_gpu = device if device else available_devices()
                workers_per_resource = 1.0 if len(available_gpu) == 0 else float(1 / len(available_gpu))
            else:
                try:
                    workers_per_resource = float(workers_per_resource)
                except ValueError:
                    ctx.fail(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies.")

        # Guard the division below: zero would otherwise surface as a ZeroDivisionError traceback.
        if workers_per_resource <= 0:
            ctx.fail("'workers_per_resource' must be a positive number.")

        num_workers = int(1 / workers_per_resource)

        # Create a new model env to work with the envvar during CLI invocation
        env = EnvVarMixin(
            config["model_name"],
            config["default_implementation"],
            bettertransformer=bettertransformer,
            quantize=quantize,
            runtime=runtime,
        )

        prerequisite_check(ctx, config, quantize, adapter_map, num_workers)

        # NOTE: This is to set current configuration
        start_env = os.environ.copy()
        start_env = parse_config_options(config, server_timeout, workers_per_resource, device, start_env)

        if fast and not get_quiet_mode():
            # 'openllm import' takes the model id as a positional argument.
            _echo(
                f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model_name}{' ' + model_id if model_id else ''}'",
                fg="yellow",
            )

        start_env.update(
            {
                env.framework: env.framework_value,
                "BENTOML_DEBUG": str(not get_quiet_mode()),
                "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()),
                "OPENLLM_SERIALIZATION": serialisation_format,
            }
        )

        if adapter_map:
            _echo(f"OpenLLM will convert '{model_name}' to use provided adapters layers: {list(adapter_map)}")

        llm = openllm.infer_auto_class(env.framework_value).for_model(
            model_name,
            model_id=model_id,
            model_version=model_version,
            llm_config=config,
            ensure_available=not fast,
            return_runner_kwargs=False,
            quantize=quantize,
            bettertransformer=bettertransformer,
            adapter_map=adapter_map,
            runtime=runtime,
            serialisation=serialisation_format,
        )

        # These entries depend on the resolved LLM, hence the second update.
        start_env.update(
            {
                env.config: llm.config.model_dump_json().decode(),
                env.runtime: env.runtime_value,
                "OPENLLM_MODEL": model_name,
                "OPENLLM_MODEL_ID": llm.model_id,
                "OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(),
            }
        )

        if bettertransformer is not None:
            start_env[env.bettertransformer] = str(bettertransformer)
        if quantize is not None:
            start_env[env.quantize] = quantize

        if serve_grpc:
            server = bentoml.GrpcServer("_service.py:svc", **server_attrs)
        else:
            server = bentoml.HTTPServer("_service.py:svc", **server_attrs)
        analytics.track_start_init(llm.config)

        def next_step(model_name: str, adapter_map: DictStrAny | None):
            # Print the 'openllm build' invocation matching this session, adapters included.
            cmd_name = f"openllm build {model_name}"
            if adapter_map is not None:
                cmd_name += " " + " ".join(
                    [
                        f"--adapter-id {s}"
                        for s in [
                            f"{p}:{name}" if name not in (None, "default") else p for p, name in adapter_map.items()
                        ]
                    ]
                )
            _echo(
                f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}",
                fg="blue",
            )

        if return_process:
            # Non-blocking start: hand the server process back to the caller.
            server.start(env=start_env, text=True)
            process = server.process
            if process is None:
                raise click.ClickException("Failed to start the server.")
            return process

        try:
            server.start(env=start_env, text=True, blocking=True)
        except KeyboardInterrupt:
            next_step(model_name, adapter_map)
        except Exception as err:
            _echo(f"Error caught while running LLM Server:\n{err}", fg="red")
        else:
            next_step(model_name, adapter_map)

        # NOTE: Return the configuration for telemetry purposes.
        return config

    return start_cmd


@cli.command(name="import", aliases=["download"])
@click.argument(
    "model",
    type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]),
)
@click.argument(
    "model_id",
    type=click.STRING,
    default=None,
    metavar="Optional[REMOTE_REPO/MODEL_ID | /path/to/local/model]",
    required=False,
)
@click.argument("converter", envvar="CONVERTER", type=click.STRING, default=None, required=False, metavar=None)
@model_version_option(click)
@click.option(
    "--runtime",
    type=click.Choice(["ggml", "transformers"]),
    default="transformers",
    help="The runtime to use for the given model. Default is transformers.",
)
@output_option
@quantize_option(click)
@machine_option(click)
@click.option("--implementation", type=click.Choice(["pt", "tf", "flax", "vllm"]), default=None, hidden=True)
@serialisation_option(click)
def download_models_command(
    model: str,
    model_id: str | None,
    converter: str | None,
    model_version: str | None,
    output: OutputLiteral,
    runtime: t.Literal["ggml", "transformers"],
    machine: bool,
    implementation: LiteralRuntime | None,
    quantize: t.Literal["int8", "int4", "gptq"] | None,
    serialisation_format: t.Literal["safetensors", "legacy"],
) -> bentoml.Model:
    """Setup LLM interactively.

    It accepts two positional arguments: `model_name` and `model_id`. The first name determine
    the model type to download, and the second one is the optional model id to download.

    \b
    This `model_id` can be either pretrained model id that you can get from HuggingFace Hub, or
    a custom model path from your custom pretrained model. Note that the custom model path should
    contain all files required to construct `transformers.PretrainedConfig`, `transformers.PreTrainedModel`
    and `transformers.PreTrainedTokenizer` objects.

    \b
    Note: This is useful for development and setup for fine-tune.
    This will be automatically called when `ensure_available=True` in `openllm.LLM.for_model`

    \b
    ``--model-version`` is an optional option to save the model. Note that
    this is recommended when the model_id is a custom path. Usually, if you are only using pretrained
    model from HuggingFace Hub, you don't need to specify this. If this is not specified, we will calculate
    the hash from the last modified time from this custom path

    \b
    ```bash
    $ openllm download opt facebook/opt-2.7b
    ```

    \b
    > If ``quantize`` is passed, the model weights will be saved as quantized weights. You should
    > only use this option if you want the weight to be quantized by default. Note that OpenLLM also
    > support on-demand quantisation during initial startup.

    \b
    ## Conversion strategies [EXPERIMENTAL]

    \b
    Some models will include built-in conversion strategies for specific weights format.
    It will be determined via the `CONVERTER` environment variable. Note that this envvar should only be use provisionally as it is not RECOMMENDED to export this
    and save to a ``.env`` file.

    The conversion strategies will have the following format and will be determined per architecture implementation:
    <base_format>-<target_format>

    \b
    For example: the below convert LlaMA-2 model format to hf:

    \b
    ```bash
    $ CONVERTER=llama2-hf openllm import llama /path/to/llama-2
    ```

    > **Note**: This behaviour will override ``--runtime``. Therefore make sure that the LLM contains correct conversion strategies to both GGML and HF.
    """
    # NOTE: the docstring above is the command's help text, so it is kept verbatim.
    # NOTE(review): 'converter' is bound from the CONVERTER envvar but is not read in this body.
    llm_config = openllm.AutoConfig.for_model(model)
    # An explicit --implementation wins; otherwise fall back to the model's environment/default.
    impl: LiteralRuntime = first_not_none(
        implementation, default=EnvVarMixin(model, llm_config["default_implementation"]).framework_value
    )
    llm = openllm.infer_auto_class(impl).for_model(
        model,
        llm_config=llm_config,
        model_id=model_id,
        model_version=model_version,
        runtime=runtime,
        return_runner_kwargs=False,
        quantize=quantize,
        ensure_available=False,
        serialisation=serialisation_format,
    )

    _previously_saved = False
    try:
        _ref = bentoml.models.get(llm.tag)
        _previously_saved = True
    except bentoml.exceptions.NotFound:
        if not machine and output == "pretty":
            msg = f"'{model}' does not exists in local store. Saving..."
            if model_id is not None:
                msg = f"'{model}' with 'model_id={model_id}' does not exists in local store. Saving..."
            _echo(msg, fg="yellow", nl=True)

        _ref = llm.import_model(trust_remote_code=llm.__llm_trust_remote_code__)

        if impl == "pt" and is_torch_available() and torch.cuda.is_available():
            torch.cuda.empty_cache()

    if machine:
        # NOTE: We will prefix the tag with __tag__ and we can use regex to correctly
        # get the tag from 'bentoml.bentos.build|build_bentofile'
        _echo(f"__tag__:{_ref.tag}", fg="white")
    elif output == "pretty":
        if _previously_saved:
            # Report the id held by the LLM rather than the raw argument, which is None
            # whenever the positional model_id was omitted.
            _echo(
                f"{model} with 'model_id={llm.model_id}' is already setup for framework '{impl}': {_ref.tag!s}",
                nl=True,
                fg="yellow",
            )
        else:
            _echo(f"Saved model: {_ref.tag}")
    elif output == "json":
        _echo(
            orjson.dumps(
                {"previously_setup": _previously_saved, "framework": impl, "tag": str(_ref.tag)},
                option=orjson.OPT_INDENT_2,
            ).decode()
        )
    else:
        _echo(_ref.tag)

    return _ref


# Build one start command per key of ``openllm.CONFIG_MAPPING`` when this module is imported.
# ``_cached_grpc`` requests the gRPC variant explicitly; ``_cached_http`` relies on the
# factory's default for ``_serve_grpc``.
_cached_http = {key: start_command_factory(key, _context_settings=_CONTEXT_SETTINGS) for key in openllm.CONFIG_MAPPING}
_cached_grpc = {
    key: start_command_factory(key, _context_settings=_CONTEXT_SETTINGS, _serve_grpc=True)
    for key in openllm.CONFIG_MAPPING
}


@overload
def _start(
    model_name: str | bentoml.Bento,
    /,
    model_id: str | None = ...,
    timeout: int = ...,
    workers_per_resource: t.Literal["conserved", "round_robin"] | float | None = ...,
    device: tuple[str, ...] | t.Literal["all"] | None = ...,
    quantize: t.Literal["int8", "int4", "gptq"] | None = ...,
    bettertransformer: bool | None = ...,
    runtime: t.Literal["ggml", "transformers"] = ...,
    fast: bool = ...,
    adapter_map: dict[t.LiteralString, str | None] | None = ...,
    framework: LiteralRuntime | None = ...,
    additional_args: ListStr | None = ...,
    _serve_grpc: bool = ...,
    __test__: t.Literal[False] = False,
) -> openllm.LLMConfig:
    """Typing overload: with ``__test__`` false the call is typed as returning the LLM configuration."""
    ...


@overload
def _start(
    model_name: str | bentoml.Bento,
    /,
    model_id: str | None = ...,
    timeout: int = ...,
    workers_per_resource: t.Literal["conserved", "round_robin"] | float | None = ...,
    device: tuple[str, ...] | t.Literal["all"] | None = ...,
    quantize: t.Literal["int8", "int4", "gptq"] | None = ...,
    bettertransformer: bool | None = ...,
    runtime: t.Literal["ggml", "transformers"] = ...,
    fast: bool = ...,
    adapter_map: dict[t.LiteralString, str | None] | None = ...,
    framework: LiteralRuntime | None = ...,
    additional_args: ListStr | None = ...,
    _serve_grpc: bool = ...,
    __test__: t.Literal[True] = True,
) -> subprocess.Popen[bytes]:
    """Typing overload: with ``__test__`` true the call is typed as returning the server process."""
    ...


def _start(
    model_name: str | bentoml.Bento,
    /,
    model_id: str | None = None,
    timeout: int = 30,
    workers_per_resource: t.Literal["conserved", "round_robin"] | float | None = None,
    device: tuple[str, ...] | t.Literal["all"] | None = None,
    quantize: t.Literal["int8", "int4", "gptq"] | None = None,
    bettertransformer: bool | None = None,
    runtime: t.Literal["ggml", "transformers"] = "transformers",
    fast: bool = False,
    adapter_map: dict[t.LiteralString, str | None] | None = None,
    framework: LiteralRuntime | None = None,
    additional_args: ListStr | None = None,
    _serve_grpc: bool = False,
    __test__: bool = False,
) -> openllm.LLMConfig | subprocess.Popen[bytes]:
    """Python API to start a LLM server. These provides one-to-one mapping to CLI arguments.

    For all additional arguments, pass it as string to ``additional_args``. For example, if you want to
    pass ``--port 5001``, you can pass ``additional_args=["--port", "5001"]``

    > **Note**: This will create a blocking process, so if you use this API, you can create a running sub thread
    > to start the server instead of blocking the main thread.

    ``openllm.start`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as the CLI interaction.

    > **Note**: ``quantize`` and ``bettertransformer`` are mutually exclusive.

    Args:
        model_name: The model name to start this LLM, or a Bento to serve.
        model_id: Optional model id for this given LLM. Ignored (with a warning) when ``model_name`` is a Bento.
        timeout: The server timeout
        workers_per_resource: Number of workers per resource assigned.
                              See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
                              for more information. By default, this is set to 1.

                              > **Note**: ``--workers-per-resource`` will also accept the following strategies:

                              > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.

                              > - ``conserved``: This will determine the number of available GPU resources, and only assign
                                                 one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
                                                 equivalent to ``--workers-per-resource 0.25``.
        device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. It also accepts 'all'
                argument to assign all available GPUs to this LLM. Ignored when ``CUDA_VISIBLE_DEVICES`` is set.
        quantize: Quantize the model weights. This is only applicable for PyTorch models.
                  Possible quantisation strategies:
                  - int8: Quantize the model with 8bit (bitsandbytes required)
                  - int4: Quantize the model with 4bit (bitsandbytes required)
                  - gptq: Quantize the model with GPTQ (auto-gptq required)
        bettertransformer: Convert given model to FastTransformer with PyTorch.
        runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
        fast: Enable fast mode. This will skip downloading models, and will raise errors if given model_id does not exists under local store.
        adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
        framework: The framework to use for this LLM. Only consulted when ``model_name`` is a string; when omitted,
                   the value resolved from the model's environment/default implementation is used.
        additional_args: Additional arguments to pass to ``openllm start``.

    Raises:
        OpenLLMException: If both ``quantize`` and ``bettertransformer`` are requested.
    """
    if isinstance(model_name, str):
        llm_config = openllm.AutoConfig.for_model(model_name)
        _ModelEnv = EnvVarMixin(model_name, llm_config["default_implementation"])
        if framework is None:
            framework = _ModelEnv.framework_value
        # The chosen framework is published through the process environment.
        os.environ[_ModelEnv.framework] = framework

    args: ListStr = ["--runtime", runtime]
    if model_id:
        # The Bento check belongs on 'model_name': 'model_id' is always a plain string.
        if isinstance(model_name, bentoml.Bento):
            logger.warning("'model_id' has no effect since %s is already a Bento.", model_name)
        else:
            args.extend(["--model-id", model_id])
    if timeout:
        args.extend(["--server-timeout", str(timeout)])
    if workers_per_resource:
        args.extend(["--workers-per-resource", str(workers_per_resource)])
    if device and not os.getenv("CUDA_VISIBLE_DEVICES"):
        # A bare string such as "all" must be forwarded untouched; joining it would
        # split it into single characters ("a,l,l").
        args.extend(["--device", device if isinstance(device, str) else ",".join(device)])

    if quantize and bettertransformer:
        raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")

    if quantize:
        args.extend(["--quantize", str(quantize)])
    if bettertransformer:
        args.append("--bettertransformer")
    if fast:
        args.append("--fast")
    if adapter_map:
        for adapter_id, adapter_name in adapter_map.items():
            args.extend(["--adapter-id", f"{adapter_id}{':' + adapter_name if adapter_name else ''}"])
    if additional_args:
        args.extend(additional_args)

    if __test__:
        args.append("--return-process")

    # 'args' always holds at least the '--runtime' pair, so it is passed through as-is.
    return start_command_factory(model_name, _context_settings=_CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(
        args=args,
        standalone_mode=False,
    )


def _tag_parsing(output: bytes) -> str:
    # NOTE: This usually only concern BentoML devs.
    pattern = r"^__tag__:[^:\n]+:[^:\n]+"
    matched = re.search(pattern, output.decode("utf-8").strip(), re.MULTILINE)
    assert matched is not None, f"Failed to find tag from output: {output!s}"
    _, _, tag = matched.group(0).partition(":")
    return tag


@inject
def _build(
    model_name: str,
    /,
    *,
    model_id: str | None = None,
    model_version: str | None = None,
    quantize: t.Literal["int8", "int4", "gptq"] | None = None,
    bettertransformer: bool | None = None,
    adapter_map: dict[str, str | None] | None = None,
    build_ctx: str | None = None,
    enable_features: tuple[str, ...] | None = None,
    workers_per_resource: int | float | None = None,
    runtime: t.Literal["ggml", "transformers"] = "transformers",
    dockerfile_template: str | None = None,
    overwrite: bool = False,
    push: bool = False,
    containerize: bool = False,
    serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors",
    additional_args: list[str] | None = None,
    bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
) -> bentoml.Bento:
    """Package a LLM into a Bento by running ``openllm build`` in a subprocess.

    The command is executed as ``<python> -m openllm build <model_name> --machine ...`` and the
    resulting tag is parsed from its standard output, then looked up in ``bento_store``.

    > **Note**: ``quantize`` and ``bettertransformer`` are mutually exclusive, and so are
    > ``containerize`` and ``push``.

    Args:
        model_name: The model name to build.
        model_id: Optional model id, forwarded as ``--model-id``.
        model_version: Optional model version, forwarded as ``--model-version``.
        quantize: Quantisation strategy (``int8``, ``int4`` or ``gptq``), forwarded as ``--quantize``.
        bettertransformer: When truthy, ``--bettertransformer`` is passed.
        adapter_map: Mapping of ``{adapter_id: adapter_name}``; each entry becomes one ``--adapter-id``
                     argument, with ``:<adapter_name>`` appended when the name is not ``None``.
        build_ctx: Build context, forwarded as ``--build-ctx`` and used as the subprocess working
                   directory. The current directory is used when omitted.
        enable_features: Features forwarded one by one as ``--enable-features=<feature>``.
        workers_per_resource: Forwarded as ``--workers-per-resource`` when truthy.
        runtime: The runtime to use, forwarded as ``--runtime``.
        dockerfile_template: Forwarded as ``--dockerfile-template``.
        overwrite: When true, ``--overwrite`` is passed.
        push: When true, ``--push`` is passed.
        containerize: When true, ``--containerize`` is passed.
        serialisation_format: Forwarded as ``--serialisation``.
        additional_args: Extra arguments appended verbatim at the end of the command line.
        bento_store: Store used to look up the built Bento. Injected by default.

    Returns:
        The Bento whose tag was reported by the build command.

    Raises:
        OpenLLMException: On mutually exclusive options, or when the build command exits with an error.
    """
    # Reject incompatible option pairs before assembling anything.
    if quantize and bettertransformer:
        raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
    if containerize and push:
        raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")

    command: ListStr = [sys.executable, "-m", "openllm", "build", model_name, "--machine"]
    command += ["--runtime", runtime, "--serialisation", serialisation_format]

    # Optional arguments, appended in a fixed order.
    if quantize:
        command += ["--quantize", quantize]
    if bettertransformer:
        command.append("--bettertransformer")
    if push:
        command.append("--push")
    if containerize:
        command.append("--containerize")
    if model_id:
        command += ["--model-id", model_id]
    if build_ctx:
        command += ["--build-ctx", build_ctx]
    if enable_features:
        for feature in enable_features:
            command.append(f"--enable-features={feature}")
    if workers_per_resource:
        command += ["--workers-per-resource", str(workers_per_resource)]
    if overwrite:
        command.append("--overwrite")
    if adapter_map:
        for adapter_id, adapter_name in adapter_map.items():
            suffix = "" if adapter_name is None else ":" + adapter_name
            command.append(f"--adapter-id={adapter_id}{suffix}")
    if model_version:
        command += ["--model-version", model_version]
    if dockerfile_template:
        command += ["--dockerfile-template", dockerfile_template]
    if additional_args:
        command += additional_args

    working_dir = build_ctx or os.getcwd()
    try:
        captured = subprocess.check_output(command, env=os.environ.copy(), cwd=working_dir)
    except subprocess.CalledProcessError as err:
        logger.error("Exception caught while building %s", model_name, exc_info=err)
        # Prefer the subprocess' own error text when it is attached to the exception.
        detail = err.stderr.decode("utf-8") if err.stderr else str(err)
        raise OpenLLMException(detail) from None

    return bentoml.get(_tag_parsing(captured), _bento_store=bento_store)


def _import_model(
    model_name: str,
    /,
    *,
    model_id: str | None = None,
    model_version: str | None = None,
    runtime: t.Literal["ggml", "transformers"] = "transformers",
    implementation: LiteralRuntime = "pt",
    quantize: t.Literal["int8", "int4", "gptq"] | None = None,
    serialisation_format: t.Literal["legacy", "safetensors"] = "safetensors",
    additional_args: t.Sequence[str] | None = None,
) -> bentoml.Model:
    """Import a LLM into the local store by driving the import CLI command in-process.

    The keyword arguments are translated into a command-line argument list which is
    handed to ``download_models_command.main(..., standalone_mode=False)``; whatever
    that call returns is returned unchanged.

    > **Note**: When ``quantize`` is given, ``--quantize`` is forwarded to the command,
    > so the weights are meant to be stored already quantized. Only use it if that is
    > what you want by default.

    Args:
        model_name: Name of the model to import (positional-only).
        model_id: Optional model id; forwarded as a positional argument.
        model_version: Optional model version; forwarded as ``--model-version``.
        runtime: Forwarded as ``--runtime``. Defaults to ``transformers``.
        implementation: Forwarded as ``--implementation``. Defaults to ``pt``.
        quantize: Optional quantisation strategy, forwarded as ``--quantize``:
                  - int8: 8bit quantisation (bitsandbytes required)
                  - int4: 4bit quantisation (bitsandbytes required)
                  - gptq: GPTQ quantisation (auto-gptq required)
        serialisation_format: Forwarded as ``--serialisation``. Defaults to ``safetensors``.
        additional_args: Extra raw arguments appended to the command line.

    Returns:
        ``bentoml.Model``: the value returned by the import command.
    """
    # ``--machine`` is always passed for SDK usage.
    cli_args = [
        model_name,
        "--runtime",
        runtime,
        "--implementation",
        implementation,
        "--machine",
        "--serialisation",
        serialisation_format,
    ]
    if model_id is not None:
        cli_args += [model_id]
    if model_version is not None:
        cli_args += ["--model-version", str(model_version)]
    if additional_args is not None:
        cli_args += additional_args
    if quantize is not None:
        cli_args += ["--quantize", quantize]
    return download_models_command.main(args=cli_args, standalone_mode=False)


def _list_models() -> DictStrAny:
    """Run ``models_command`` in-process with JSON output, ``--show-available`` and ``--machine``.

    Returns whatever ``models_command.main(..., standalone_mode=False)`` returns.
    """
    return models_command.main(args=["-o", "json", "--show-available", "--machine"], standalone_mode=False)


# Public SDK entry points, each generated from its private implementation via ``codegen.gen_sdk``.
# ``start`` and ``start_grpc`` share ``_start`` and differ only in the ``_serve_grpc`` flag.
start = codegen.gen_sdk(_start, _serve_grpc=False)
start_grpc = codegen.gen_sdk(_start, _serve_grpc=True)
build = codegen.gen_sdk(_build)
import_model = codegen.gen_sdk(_import_model)
list_models = codegen.gen_sdk(_list_models)


@cli.command(context_settings={"token_normalize_func": inflection.underscore})
@click.argument(
    "model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()])
)
@model_id_option(click)
@output_option
@machine_option(click)
@click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.")
@workers_per_resource_option(click, build=True)
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name="Optimisation options")
@quantize_option(cog.optgroup, build=True)
@bettertransformer_option(cog.optgroup)
@click.option(
    "--runtime",
    type=click.Choice(["ggml", "transformers"]),
    default="transformers",
    help="The runtime to use for the given model. Default is transformers.",
)
@click.option(
    "--enable-features",
    help="Enable additional features for building this LLM Bento. Available: {}".format(
        ", ".join(openllm.utils.OPTIONAL_DEPENDENCIES)
    ),
    multiple=True,
    nargs=1,
    metavar="FEATURE[,FEATURE]",
)
@click.option(
    "--adapter-id",
    default=None,
    help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed.",
    multiple=True,
    metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]",
)
@click.option("--build-ctx", default=".", help="Build context. This is required if --adapter-id uses relative path")
@model_version_option(click)
@click.option(
    "--dockerfile-template",
    default=None,
    type=click.File(),
    help="Optional custom dockerfile template to be used with this BentoLLM.",
)
@serialisation_option(click)
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name="Utilities options")
@cog.optgroup.option(
    "--containerize",
    default=False,
    is_flag=True,
    type=click.BOOL,
    help="Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.",
)
@cog.optgroup.option(
    "--push",
    default=False,
    is_flag=True,
    type=click.BOOL,
    help="Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.",
)
@click.pass_context
def build_command(
    ctx: click.Context,
    model_name: str,
    model_id: str | None,
    overwrite: bool,
    output: OutputLiteral,
    runtime: t.Literal["ggml", "transformers"],
    quantize: t.Literal["int8", "int4", "gptq"] | None,
    enable_features: tuple[str] | None,
    bettertransformer: bool | None,
    workers_per_resource: float | None,
    adapter_id: tuple[str, ...],
    build_ctx: str | None,
    machine: bool,
    model_version: str | None,
    dockerfile_template: t.TextIO | None,
    containerize: bool,
    push: bool,
    serialisation_format: t.Literal["safetensors", "legacy"],
    **attrs: t.Any,
) -> bentoml.Bento:
    """Package a given models into a Bento.

    \b
    ```bash
    $ openllm build flan-t5 --model-id google/flan-t5-large
    ```

    \b
    > NOTE: To run a container built from this Bento with GPU support, make sure
    > to have https://github.com/NVIDIA/nvidia-container-toolkit install locally.
    """
    # NOTE: the docstring above is rendered by click as the command help, so it is kept verbatim.
    #
    # Flow: parse adapters -> temporarily set the OPENLLM_* environment variables ->
    # resolve the LLM -> reuse, overwrite or create the Bento -> report the result ->
    # optionally push or containerize. Returns the resulting ``bentoml.Bento``.
    adapter_map: dict[str, str | None] | None = None

    if adapter_id:
        if not build_ctx:
            _echo("'build_ctx' must not be None when '--adapter-id' is passsed.", fg="red")
            ctx.exit(1)

        # We don't resolve full paths here, that is left to the build step;
        # each value is only split on its last ':' into (adapter id, optional name).
        adapter_map = {}
        for v in adapter_id:
            _adapter_id, *adapter_name = v.rsplit(":", maxsplit=1)
            adapter_map[_adapter_id] = adapter_name[0] if adapter_name else None

    if machine:
        output = "porcelain"

    if enable_features:
        # Each occurrence of the option may itself carry a comma-separated list.
        enable_features = tuple(itertools.chain.from_iterable(s.split(",") for s in enable_features))

    _previously_built = False

    # Take the caller's values out of the environment so they can be put back once the build is over.
    managed_envvars = ("OPENLLM_MODEL", "OPENLLM_MODEL_ID", "OPENLLM_ADAPTER_MAP", "OPENLLM_SERIALIZATION")
    previous_envvars = {name: os.environ.pop(name, None) for name in managed_envvars}

    # NOTE: We set these environment variables so that our service.py logic won't raise RuntimeError
    # during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
    try:
        llm_config = openllm.AutoConfig.for_model(model_name)

        # NOTE(review): this runtime variable is not restored afterwards (unchanged from the
        # previous behaviour) -- confirm whether it should be.
        os.environ[llm_config["env"].runtime] = runtime
        os.environ["OPENLLM_MODEL"] = inflection.underscore(model_name)
        os.environ["OPENLLM_ADAPTER_MAP"] = orjson.dumps(adapter_map).decode()
        os.environ["OPENLLM_SERIALIZATION"] = serialisation_format

        framework_envvar = llm_config["env"].framework_value
        llm = openllm.infer_auto_class(framework_envvar).for_model(
            model_name,
            model_id=model_id,
            llm_config=llm_config,
            quantize=quantize,
            adapter_map=adapter_map,
            bettertransformer=bettertransformer,
            return_runner_kwargs=False,
            ensure_available=True,
            model_version=model_version,
            runtime=runtime,
            serialisation=serialisation_format,
            **attrs,
        )
        os.environ["OPENLLM_MODEL_ID"] = str(llm.tag)

        labels = dict(llm.identifying_params)
        labels.update({"_type": llm.llm_type, "_framework": framework_envvar})
        workers_per_resource = first_not_none(workers_per_resource, default=llm_config["workers_per_resource"])

        with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
            dockerfile_template_path = None
            if dockerfile_template:
                # Copy the user supplied template into the temporary filesystem.
                with dockerfile_template:
                    llm_fs.writetext("Dockerfile.template", dockerfile_template.read())
                dockerfile_template_path = llm_fs.getsyspath("/Dockerfile.template")

            bento_tag = bentoml.Tag.from_taglike(f"{llm.llm_type}-service:{llm.tag.version}")

            # The overwrite path and the fresh-build path create the Bento with identical arguments.
            create_bento = functools.partial(
                openllm.bundle.create_bento,
                bento_tag,
                llm_fs,
                llm,
                workers_per_resource=workers_per_resource,
                adapter_map=adapter_map,
                quantize=quantize,
                bettertransformer=bettertransformer,
                extra_dependencies=enable_features,
                build_ctx=build_ctx,
                dockerfile_template=dockerfile_template_path,
                runtime=runtime,
            )
            try:
                bento = bentoml.get(bento_tag)
                if overwrite:
                    if output == "pretty":
                        _echo(f"Overwriting existing Bento {bento_tag}", fg="yellow")
                    bentoml.delete(bento_tag)
                    bento = create_bento()
                _previously_built = True
            except bentoml.exceptions.NotFound:
                bento = create_bento()
    except Exception as e:
        logger.error("\nException caught during building LLM %s: \n", model_name, exc_info=e)
        raise
    finally:
        # Runs on success *and* on failure, so a failed build no longer leaves the
        # temporary values behind or drops the values the caller had set.
        for name, previous in previous_envvars.items():
            os.environ.pop(name, None)
            if previous is not None:
                os.environ[name] = previous

    if machine:
        # NOTE: We will prefix the tag with __tag__ and we can use regex to correctly
        # get the tag from 'bentoml.bentos.build|build_bentofile'
        _echo(f"__tag__:{bento.tag}", fg="white")
    elif output == "pretty":
        if not get_quiet_mode():
            _echo("\n" + OPENLLM_FIGLET, fg="white")
            if not _previously_built:
                _echo(f"Successfully built {bento}.", fg="green")
            elif not overwrite:
                _echo(
                    f"'{model_name}' already has a Bento built [{bento}]. To overwrite it pass '--overwrite'.",
                    fg="yellow",
                )

            _echo(
                "📖 Next steps:\n\n"
                + "* Serving BentoLLM locally with 'openllm start':\n"
                + f"    $ openllm start {bento.tag}\n\n"
                + "* Push to BentoCloud with 'bentoml push':\n"
                + f"    $ bentoml push {bento.tag}\n\n"
                + "* Containerize your Bento with 'bentoml containerize':\n"
                + f"    $ bentoml containerize {bento.tag} --opt progress=plain"
                + "\n\n"
                + "    Tip: To enable additional BentoML features for 'containerize', "
                + "use '--enable-features=FEATURE[,FEATURE]' "
                + "[see 'bentoml containerize -h' for more advanced usage]\n",
                fg="blue",
            )
    elif output == "json":
        _echo(orjson.dumps(bento.info.to_dict(), option=orjson.OPT_INDENT_2).decode())
    else:
        _echo(bento.tag)

    if push:
        client = BentoMLContainer.bentocloud_client.get()
        client.push_bento(bento, context=t.cast(CliContext, ctx.obj).cloud_context)
    elif containerize:
        backend = t.cast("DefaultBuilder", os.getenv("BENTOML_CONTAINERIZE_BACKEND", "docker"))
        _echo(f"Building {bento} into a LLMContainer using backend '{backend}'", fg="magenta")
        try:
            bentoml.container.health(backend)
        except subprocess.CalledProcessError:
            raise OpenLLMException(f"Failed to use backend {backend}") from None

        bentoml.container.build(bento.tag, backend=backend, features=("grpc",))

    return bento


# Typing-only overload: with ``machine=True`` the command is annotated as returning a ``DictStrAny``.
@overload
def models_command(
    ctx: click.Context, output: OutputLiteral, show_available: bool, machine: t.Literal[True] = True
) -> DictStrAny:
    ...


# Typing-only overload: with ``machine=False`` the command is annotated as returning ``None``.
@overload
def models_command(
    ctx: click.Context, output: OutputLiteral, show_available: bool, machine: t.Literal[False] = ...
) -> None:
    ...


@cli.command()
@output_option
@click.option(
    "--show-available",
    is_flag=True,
    default=False,
    help="Show available models in local store (mutually exclusive with '-o porcelain').",
)
@machine_option(click)
@click.pass_context
def models_command(
    ctx: click.Context, output: OutputLiteral, show_available: bool, machine: bool
) -> DictStrAny | None:
    """List all supported models.

    \b
    > NOTE: '--show-available' and '-o porcelain' are mutually exclusive.

    \b
    ```bash
    openllm models --show-available
    ```
    """
    # NOTE: the docstring above is rendered by click as the command help, so it is kept verbatim.
    #
    # Returns the collected mapping only when ``machine`` is set (and output is not porcelain);
    # every other path prints its result and finishes through ``ctx.exit(0)``.
    models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
    if output == "porcelain":
        if show_available:
            raise click.BadOptionUsage(
                "--show-available", "Cannot use '--show-available' with '-o porcelain' (mutually exclusive)."
            )
        _echo("\n".join(models), fg="white")
    else:
        failed_initialized: list[tuple[str, Exception]] = []

        json_data: dict[
            str,
            dict[t.Literal["architecture", "model_id", "url", "installation", "cpu", "gpu", "runtime_impl"], t.Any]
            | t.Any,
        ] = {}

        for m in models:
            config = openllm.AutoConfig.for_model(m)
            # A runtime is listed when the model name is present in the matching mapping.
            runtime_impl: tuple[str, ...] = tuple(
                impl
                for impl, mapping in (
                    ("pt", openllm.MODEL_MAPPING_NAMES),
                    ("flax", openllm.MODEL_FLAX_MAPPING_NAMES),
                    ("tf", openllm.MODEL_TF_MAPPING_NAMES),
                    ("vllm", openllm.MODEL_VLLM_MAPPING_NAMES),
                )
                if config["model_name"] in mapping
            )
            json_data[m] = {
                "architecture": config["architecture"],
                "model_id": config["model_ids"],
                "url": config["url"],
                "cpu": not config["requires_gpu"],
                "gpu": True,
                "runtime_impl": runtime_impl,
                "installation": f'pip install "openllm[{m}]"'
                if m in openllm.utils.OPTIONAL_DEPENDENCIES or config["requirements"]
                else "pip install openllm",
            }
            if DEBUG:
                # In debug mode, additionally record which models fail to initialise.
                try:
                    openllm.AutoLLM.for_model(m, llm_config=config)
                except Exception as e:
                    failed_initialized.append((m, e))

        # Model name -> tags found in the local store. Stays ``None`` unless '--show-available'
        # was requested; an empty dict means "requested, but nothing stored locally".
        local_models: DictStrAny | None = None
        if show_available:
            # List the store once instead of once per model name.
            stored = [
                i
                for i in bentoml.models.list()
                if "framework" in i.info.labels and i.info.labels["framework"] == "openllm"
            ]
            local_models = {}
            # NOTE(review): 'model_name' labels are compared against the dasherized names here,
            # whereas 'prune' compares against the underscored form -- confirm which one is stored.
            for k in json_data:
                tags = [
                    str(i.tag) for i in stored if "model_name" in i.info.labels and i.info.labels["model_name"] == k
                ]
                if tags:
                    local_models[k] = tags

        if machine:
            if show_available:
                # An empty local store is a valid result, not an error.
                json_data["local"] = local_models
            return json_data
        elif output == "pretty":
            import tabulate

            tabulate.PRESERVE_WHITESPACE = True

            # llm, architecture, url, model_id, installation, cpu, gpu, runtime_impl
            data: list[tuple[t.Any, ...]] = [
                (
                    m,
                    v["architecture"],
                    v["url"],
                    v["model_id"],
                    v["installation"],
                    "❌" if not v["cpu"] else "✅",
                    "✅",
                    v["runtime_impl"],
                )
                for m, v in json_data.items()
            ]
            column_widths = [
                int(COLUMNS / 12),
                int(COLUMNS / 6),
                int(COLUMNS / 6),
                int(COLUMNS / 6),
                int(COLUMNS / 6),
                int(COLUMNS / 12),
                int(COLUMNS / 12),
                int(COLUMNS / 12),
            ]

            if not data and failed_initialized:
                _echo("Exception found while parsing models:\n", fg="yellow")
                for m, err in failed_initialized:
                    _echo(f"- {m}: ", fg="yellow", nl=False)
                    # format_exception returns the text; print_exception would print it itself and return None.
                    _echo("".join(traceback.format_exception(type(err), err, err.__traceback__, limit=3)), fg="red")
                sys.exit(1)

            table = tabulate.tabulate(
                data,
                tablefmt="fancy_grid",
                headers=["LLM", "Architecture", "URL", "Models Id", "Installation", "CPU", "GPU", "Runtime"],
                maxcolwidths=column_widths,
            )

            formatted_table = "".join(
                "".join(f"{cell:{width}}" for cell, width in zip(line.split("\t"), column_widths)) + "\n"
                for line in table.split("\n")
            )
            _echo(formatted_table, fg="white")

            if DEBUG and failed_initialized:
                _echo("\nThe following models are supported but failed to initialize:\n")
                for m, err in failed_initialized:
                    _echo(f"- {m}: ", fg="blue", nl=False)
                    _echo(err, fg="red")

            if show_available:
                if not local_models:
                    _echo("No models available locally.")
                    ctx.exit(0)

                _echo("The following are available in local store:", fg="magenta")
                _echo(
                    orjson.dumps(
                        local_models,
                        option=orjson.OPT_INDENT_2,
                    ).decode(),
                    fg="white",
                )
        else:
            if show_available:
                json_data["local"] = local_models
            _echo(
                orjson.dumps(
                    json_data,
                    option=orjson.OPT_INDENT_2,
                ).decode(),
                fg="white",
            )
    ctx.exit(0)


@cli.command()
@click.argument(
    "model_name",
    type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]),
    required=False,
)
@click.option(
    "-y",
    "--yes",
    "--assume-yes",
    is_flag=True,
    help="Skip confirmation when deleting a specific model",
)
@inject
def prune_command(
    model_name: str | None, yes: bool, model_store: ModelStore = Provide[BentoMLContainer.model_store]
) -> None:
    """Remove all saved models locally.

    \b
    If a model type is passed, then only prune models for that given model type.
    """
    # NOTE: the docstring above is rendered by click as the command help, so it is kept verbatim.

    # Only models labelled with framework == "openllm" are candidates.
    candidates = [
        entry
        for entry in bentoml.models.list()
        if "framework" in entry.info.labels and entry.info.labels["framework"] == "openllm"
    ]
    if model_name is not None:
        # Labels are compared against the underscored form of the requested name.
        wanted = inflection.underscore(model_name)
        candidates = [
            entry
            for entry in candidates
            if "model_name" in entry.info.labels and entry.info.labels["model_name"] == wanted
        ]

    for model in candidates:
        # With '--yes' the prompt is skipped entirely (short-circuit).
        if yes or click.confirm(f"delete model {model.tag}?"):
            model_store.delete(model.tag)
            click.echo(f"{model} deleted.")


def parsing_instruction_callback(
    ctx: click.Context, param: click.Parameter, value: list[str] | str | None
) -> tuple[str, bool | str] | list[str] | str | None:
    """Parse a raw ``--key`` / ``--key=value`` token into a ``(key, value)`` pair.

    Args:
        ctx: The click context (unused).
        param: The click parameter being processed (unused).
        value: ``None``, a single token, or a list of tokens. For a list only
               the last element is parsed.

    Returns:
        ``None`` when ``value`` is ``None``; ``(key, True)`` for a bare ``--key``;
        ``(key, text)`` for ``--key=text``. The leading ``--`` is stripped from the key.

    Raises:
        click.BadParameter: If the token does not start with ``--`` or contains
            more than one ``=``.
    """
    if value is None:
        return None

    token = value[-1] if isinstance(value, list) else value
    pieces = token.split("=")
    name = pieces[0]

    # Both malformed shapes are reported with the same message.
    if not name.startswith("--") or len(pieces) > 2:
        raise click.BadParameter(f"Invalid option format: {token}")

    stripped = name[2:]
    if len(pieces) == 1:
        return stripped, True
    return stripped, pieces[1]


def shared_client_options(f: t.Callable[[FC], FC]) -> t.Callable[[FC], FC]:
    """Decorate ``f`` with the options shared by the client commands.

    Adds ``--endpoint`` (also read from ``OPENLLM_ENDPOINT``), ``--timeout`` and the
    common output option. The decorators are applied innermost-first, i.e. the output
    option wraps ``f`` first and ``--endpoint`` is applied last.
    """
    endpoint_option = click.option(
        "--endpoint",
        type=click.STRING,
        help="OpenLLM Server endpoint, i.e: http://localhost:3000",
        envvar="OPENLLM_ENDPOINT",
        default="http://localhost:3000",
    )
    timeout_option = click.option(
        "--timeout", type=click.INT, default=30, help="Default server timeout", show_default=True
    )
    return endpoint_option(timeout_option(output_option(f)))


@cli.command()
@click.argument("task", type=click.STRING, metavar="TASK")
@shared_client_options
@click.option(
    "--agent",
    type=click.Choice(["hf"]),
    default="hf",
    help="Whether to interact with Agents from given Server endpoint.",
    show_default=True,
)
@click.option(
    "--remote",
    is_flag=True,
    default=False,
    help="Whether or not to use remote tools (inference endpoints) instead of local ones.",
    show_default=True,
)
@click.option(
    "--opt",
    help="Define prompt options. "
    "(format: ``--opt text='I love this' --opt audio:./path/to/audio  --opt image:/path/to/file``)",
    required=False,
    multiple=True,
    callback=opt_callback,
    metavar="ARG=VALUE[,ARG=VALUE]",
)
def instruct(
    endpoint: str,
    timeout: int,
    agent: t.LiteralString,
    output: OutputLiteral,
    remote: bool,
    task: str,
    _memoized: DictStrAny,
    **attrs: t.Any,
) -> str:
    """Instruct agents interactively for given tasks, from a terminal.

    \b
    ```bash
    $ openllm instruct --endpoint http://12.323.2.1:3000 \\
        "Is the following `text` (in Spanish) positive or negative?" \\
        --text "¡Este es un API muy agradable!"
    ```
    """
    # NOTE: the docstring above is rendered by click as the command help, so it is kept verbatim.
    client = openllm.client.HTTPClient(endpoint, timeout=timeout)

    # Guard clauses first: only the "hf" agent is handled.
    if agent != "hf":
        raise click.BadOptionUsage("agent", f"Unknown agent type {agent}")
    if not is_transformers_supports_agent():
        raise click.UsageError(
            "Transformers version should be at least 4.29 to support HfAgent. "
            "Upgrade with 'pip install -U transformers'"
        )

    # Keep the first value of every non-empty option entry.
    _memoized = {k: v[0] for k, v in _memoized.items() if v}

    client._hf_agent.set_stream(logger.info)
    if output != "porcelain":
        _echo(f"Sending the following prompt ('{task}') with the following vars: {_memoized}", fg="magenta")

    result = client.ask_agent(task, agent_type=agent, return_code=False, remote=remote, **_memoized)
    rendered = orjson.dumps(result, option=orjson.OPT_INDENT_2).decode() if output == "json" else result
    _echo(rendered, fg="white")
    return result


@cli.command()
@shared_client_options
@click.option(
    "--server-type", type=click.Choice(["grpc", "http"]), help="Server type", default="http", show_default=True
)
@click.argument("prompt", type=click.STRING)
@click.option(
    "--sampling-params",
    help="Define query options. (format: ``--opt temperature=0.8 --opt=top_k:12)",
    required=False,
    multiple=True,
    callback=opt_callback,
    metavar="ARG=VALUE[,ARG=VALUE]",
)
@click.pass_context
def query(
    ctx: click.Context,
    prompt: str,
    endpoint: str,
    timeout: int,
    server_type: t.Literal["http", "grpc"],
    output: OutputLiteral,
    _memoized: DictStrAny,
    **attrs: t.Any,
) -> None:
    """Ask a LLM interactively, from a terminal.

    \b
    ```bash
    $ openllm query --endpoint http://12.323.2.1:3000 "What is the meaning of life?"
    ```
    """
    # NOTE: the docstring above is rendered by click as the command help, so it is kept verbatim.

    # The first value of every non-empty option entry is decoded as JSON.
    _memoized = {k: orjson.loads(v[0]) for k, v in _memoized.items() if v}

    if server_type == "grpc":
        # The gRPC client is given the endpoint without the 'http://' scheme.
        endpoint = re.sub(r"http://", "", endpoint)

    if server_type == "http":
        client = openllm.client.HTTPClient(endpoint, timeout=timeout)
    else:
        client = openllm.client.GrpcClient(endpoint, timeout=timeout)

    prompt_colour = "magenta"
    answer_colour = "cyan"

    if output != "porcelain":
        _echo("==Input==\n", fg="white")
        _echo(f"{prompt}", fg=prompt_colour)

    res = client.query(prompt, return_raw_response=True, **_memoized)

    if output == "pretty":
        formatted = client.llm.postprocess_generate(prompt, res["responses"])
        # Drop the leading prompt (plus one separator character) so that the prompt
        # and the generated text can be echoed in different colours.
        generated = formatted[len(prompt) + 1 :]
        _echo("\n\n==Responses==\n", fg="white")
        _echo(f"{prompt} ", fg=prompt_colour, nl=False)
        _echo(generated, fg=answer_colour)
    elif output == "json":
        _echo(orjson.dumps(res, option=orjson.OPT_INDENT_2).decode(), fg="white")
    else:
        _echo(res["responses"], fg="white")

    ctx.exit(0)


# Registers the "utils" command group on ``cli``. The function has no body of its own;
# subcommands are attached to it with ``@utils_command.command()``.
@cli.group(name="utils")
def utils_command() -> None:
    """Utilities Subcommand group."""


@utils_command.command()
@click.pass_context
def list_bentos(ctx: click.Context):
    """List available bentos built by OpenLLM."""
    # NOTE: the docstring above is rendered by click as the command help, so it is kept verbatim.

    # Tag -> value of the 'start_name' label, for every bento that carries that label.
    start_name_by_tag = {}
    for item in bentoml.list():
        if "start_name" in item.info.labels:
            start_name_by_tag[str(item.tag)] = item.info.labels["start_name"]

    # Group the tags per (dasherized) model name, keeping only models with at least one bento.
    grouped = {}
    for model in (inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()):
        tags = [tag for tag, start_name in start_name_by_tag.items() if start_name == model]
        if tags:
            grouped[model] = tags

    _echo(orjson.dumps(grouped, option=orjson.OPT_INDENT_2).decode(), fg="white")
    ctx.exit(0)


# Typing-only overload: with ``machine=True`` the command is annotated as returning a ``str``.
@overload
def get_prompt(
    model_name: str, prompt: str, format: str | None, output: OutputLiteral, machine: t.Literal[True] = True
) -> str:
    ...


# Typing-only overload: with ``machine=False`` the command is annotated as returning ``None``.
@overload
def get_prompt(
    model_name: str, prompt: str, format: str | None, output: OutputLiteral, machine: t.Literal[False] = ...
) -> None:
    ...


@utils_command.command()
@click.argument(
    "model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()])
)
@click.argument("prompt", type=click.STRING)
@output_option
@click.option("--format", type=click.STRING, default=None)
@machine_option(click)
def get_prompt(model_name: str, prompt: str, format: str | None, output: OutputLiteral, machine: bool) -> str | None:
    """Get the default prompt used by OpenLLM."""
    # NOTE: the docstring above is rendered by click as the command help, so it is kept verbatim.
    #
    # Returns ``repr`` of the formatted prompt when ``machine`` is set; otherwise the prompt
    # is echoed in the requested output format and ``None`` is returned.
    module = openllm.utils.EnvVarMixin(model_name).module
    try:
        template = getattr(module, "DEFAULT_PROMPT_TEMPLATE", None)
        prompt_mapping = getattr(module, "PROMPT_MAPPING", None)

        if template is None:
            raise click.BadArgumentUsage(f"model {model_name} does not have a default prompt template") from None

        if not callable(template):
            # A plain template is used as-is.
            _prompt = template
        elif format is None:
            # A callable template needs a format name to be resolved.
            if prompt_mapping is None:
                raise RuntimeError("Failed to find prompt mapping while DEFAULT_PROMPT_TEMPLATE is a function.")
            raise click.BadOptionUsage(
                "format",
                f"{model_name} prompt requires passing '--format' (available format: {list(prompt_mapping)})",
            )
        elif prompt_mapping is None:
            raise click.BadArgumentUsage(
                f"Failed to fine prompt mapping while the default prompt for {model_name} is a callable."
            ) from None
        elif format not in prompt_mapping:
            raise click.BadOptionUsage(
                "format",
                f"Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})",
            )
        else:
            _prompt = template(format)

        # XXX: FIX ME, currently doesn't work with all different context variable
        # will need a --opt parser for this
        fully_formatted = _prompt.format(instruction=prompt)

        if machine:
            return repr(fully_formatted)

        if output == "porcelain":
            _echo(repr(fully_formatted), fg="white")
        elif output == "json":
            _echo(orjson.dumps({"prompt": fully_formatted}, option=orjson.OPT_INDENT_2).decode(), fg="white")
        else:
            _echo(f"== Prompt for {model_name} ==\n", fg="magenta")
            _echo(fully_formatted, fg="white")
    except AttributeError:
        raise click.ClickException(f"Failed to determine a default prompt template for {model_name}.") from None


def load_notebook_metadata() -> DictStrAny:
    """Load and validate the playground metadata file.

    Reads ``_meta.yml`` located next to the ``openllm.playground`` package and
    parses it with ``yaml.safe_load``.

    Returns:
        The parsed mapping.

    Raises:
        ValueError: If the document is not a mapping (for example an empty file),
            or if any of its entries lacks a 'description' key.
    """
    meta_path = os.path.join(os.path.dirname(openllm.playground.__file__), "_meta.yml")
    # Read with an explicit encoding rather than the platform/locale default.
    with open(meta_path, "r", encoding="utf-8") as f:
        content = yaml.safe_load(f)
    # An empty document parses to ``None``; report that as an invalid file rather than
    # failing later with an AttributeError on ``.values()``.
    if not isinstance(content, dict):
        raise ValueError("Invalid metadata file. Expected a mapping of entries.")
    if not all("description" in k for k in content.values()):
        raise ValueError("Invalid metadata file. All entries must have a 'description' key.")
    return content


@cli.command()
@click.argument("output-dir", default=None, required=False)
@click.option(
    "--port",
    envvar="JUPYTER_PORT",
    show_envvar=True,
    show_default=True,
    default=8888,
    help="Default port for Jupyter server",
)
@click.pass_context
def playground(ctx: click.Context, output_dir: str | None, port: int) -> None:
    """OpenLLM Playground.

    A collection of notebooks to explore the capabilities of OpenLLM.
    This includes notebooks for fine-tuning, inference, and more.

    All of the scripts available in the playground can also be run directly as a Python script:
    For example:

    \b
    ```bash
    python -m openllm.playground.falcon_tuned --help
    ```

    \b
    > Note: This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"'
    """
    # NOTE: parameter documentation is kept in comments because the docstring above is
    # rendered verbatim as the command's ``--help`` text.
    #   ctx:        click context, used to exit with status 0 once the server stops.
    #   output_dir: directory to write notebooks into; a temporary directory is created when omitted.
    #   port:       port passed to ``jupyter notebook --port``.
    if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available():
        raise RuntimeError(
            "Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'"
        )
    metadata = load_notebook_metadata()
    _temp_dir = False
    if output_dir is None:
        _temp_dir = True
        output_dir = tempfile.mkdtemp(prefix="openllm-playground-")
    else:
        # Normalise the user-supplied path once and reuse it everywhere below. Previously only
        # ``os.makedirs`` saw the expanded path, so '~' or '$VAR' inputs created one directory
        # while the notebooks and Jupyter were pointed at a different, non-existent one.
        output_dir = os.path.abspath(os.path.expandvars(os.path.expanduser(output_dir)))
        os.makedirs(output_dir, exist_ok=True)

    _echo("The playground notebooks will be saved to: " + os.path.abspath(output_dir), fg="blue")
    for module in pkgutil.iter_modules(openllm.playground.__path__):
        # Sub-packages are not notebooks, and existing notebooks are never overwritten.
        if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + ".ipynb")):
            logger.debug(
                "Skipping: %s (%s)",
                module.name,
                "File already exists" if not module.ispkg else f"{module.name} is a module",
            )
            continue
        # Only file-system backed modules expose a ``path`` from which the '.py' source can be read.
        if not isinstance(module.module_finder, importlib.machinery.FileFinder):
            continue
        _echo("Generating notebook for: " + module.name, fg="magenta")
        # Prepend the description from the metadata file as the first (markdown) cell.
        markdown_cell = nbformat.v4.new_markdown_cell(metadata[module.name]["description"])
        f = jupytext.read(os.path.join(module.module_finder.path, module.name + ".py"))
        f.cells.insert(0, markdown_cell)
        jupytext.write(f, os.path.join(output_dir, module.name + ".ipynb"), fmt="notebook")
    try:
        # Blocks until the Jupyter server exits (or the user interrupts it).
        subprocess.check_output(
            [
                sys.executable,
                "-m",
                "jupyter",
                "notebook",
                "--notebook-dir",
                output_dir,
                "--port",
                str(port),
                "--no-browser",
                "--debug",
            ]
        )
    except subprocess.CalledProcessError as e:
        _echo(e.output, fg="red")
        # Bare ``raise`` re-raises the active exception with its original traceback.
        raise
    except KeyboardInterrupt:
        _echo("\nShutting down Jupyter server...", fg="yellow")
        if _temp_dir:
            # The temporary directory is not removed, so tell the user where the notebooks live.
            _echo("Note: You can access the generated notebooks in: " + output_dir, fg="blue")
    ctx.exit(0)


if __name__ == "__main__":
    # Invoke the top-level click group when this module is executed as a script.
    cli()