# Copyright 2023 BentoML Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
CLI utilities for OpenLLM.

This extends clidantic and BentoML's internal CLI CommandGroup.
"""
from __future__ import annotations

import functools
import inspect
import logging
import os
import re
import sys
import time
import traceback
import typing as t

import bentoml
import click
import click_option_group as cog
import inflection
import orjson
import psutil
from bentoml._internal.configuration import get_debug_mode, get_quiet_mode, set_quiet_mode
from bentoml._internal.configuration.containers import BentoMLContainer
from bentoml._internal.log import configure_logging, configure_server_logging
from bentoml_cli.utils import BentoMLCommandGroup

import openllm

if t.TYPE_CHECKING:
    from ._types import ClickFunctionWrapper, F, P

    ServeCommand = t.Literal["serve", "serve-grpc"]
    OutputLiteral = t.Literal["json", "pretty", "porcelain"]

logger = logging.getLogger(__name__)

COLUMNS = int(os.environ.get("COLUMNS", 120))

_CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"], "max_content_width": COLUMNS}

OPENLLM_FIGLET = """\
 ██████╗ ██████╗ ███████╗███╗   ██╗██╗     ██╗     ███╗   ███╗
██╔═══██╗██╔══██╗██╔════╝████╗  ██║██║     ██║     ████╗ ████║
██║   ██║██████╔╝█████╗  ██╔██╗ ██║██║     ██║     ██╔████╔██║
██║   ██║██╔═══╝ ██╔══╝  ██║╚██╗██║██║     ██║     ██║╚██╔╝██║
╚██████╔╝██║     ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
 ╚═════╝ ╚═╝     ╚══════╝╚═╝  ╚═══╝╚══════╝╚══════╝╚═╝     ╚═╝
"""


def _echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.Any) -> None:
    call = click.echo
    if _with_style:
        attrs["fg"] = fg if not get_debug_mode() else None
        call = click.secho
    call(text, **attrs)


class OpenLLMCommandGroup(BentoMLCommandGroup):
    NUMBER_OF_COMMON_PARAMS = 3

    @staticmethod
    def common_params(f: F[P, t.Any]) -> ClickFunctionWrapper[..., t.Any]:
        """This is not supposed to be used with an unprocessed click function.

        This should be used as the last currying step: common_params -> usage_tracking -> exception_handling.
        """
        # The following logic is similar to that of BentoMLCommandGroup.
        from bentoml._internal.configuration import DEBUG_ENV_VAR, QUIET_ENV_VAR, set_debug_mode

        from .utils import analytics

        @click.option("-q", "--quiet", envvar=QUIET_ENV_VAR, is_flag=True, default=False, help="Suppress all output.")
        @click.option(
            "--debug", "--verbose", envvar=DEBUG_ENV_VAR, is_flag=True, default=False, help="Print out debug logs."
        )
        @click.option(
            "--do-not-track",
            is_flag=True,
            default=False,
            envvar=analytics.OPENLLM_DO_NOT_TRACK,
            help="Do not send usage info",
        )
        @functools.wraps(f)
        def wrapper(quiet: bool, debug: bool, *args: P.args, **attrs: P.kwargs) -> t.Any:
            if quiet:
                set_quiet_mode(True)
                if debug:
                    logger.warning("'--quiet' passed; ignoring '--verbose/--debug'")
            elif debug:
                set_debug_mode(True)

            configure_logging()

            return f(*args, **attrs)

        return t.cast("ClickFunctionWrapper[..., t.Any]", wrapper)

    @staticmethod
    def usage_tracking(
        func: ClickFunctionWrapper[..., t.Any], group: click.Group, **attrs: t.Any
    ) -> ClickFunctionWrapper[..., t.Any]:
        """This is not supposed to be used with an unprocessed click function.

        This should be used as the last currying step: common_params -> usage_tracking -> exception_handling.
        """
        from .utils import analytics

        command_name = attrs.get("name", func.__name__)

        @functools.wraps(func)
        def wrapper(do_not_track: bool, *args: P.args, **attrs: P.kwargs) -> t.Any:
            if do_not_track:
                with analytics.set_bentoml_tracking():
                    return func(*args, **attrs)

            start_time = time.time_ns()
            with analytics.set_bentoml_tracking():
                assert group.name is not None, "group.name should not be None"
                event = analytics.OpenllmCliEvent(cmd_group=group.name, cmd_name=command_name)
                try:
                    return_value = func(*args, **attrs)
                    duration_in_ms = (time.time_ns() - start_time) / 1e6
                    event.duration_in_ms = duration_in_ms
                    analytics.track(event)
                    return return_value
                except Exception as e:
                    duration_in_ms = (time.time_ns() - start_time) / 1e6
                    event.duration_in_ms = duration_in_ms
                    event.error_type = type(e).__name__
                    event.return_code = 2 if isinstance(e, KeyboardInterrupt) else 1
                    analytics.track(event)
                    raise

        return t.cast("ClickFunctionWrapper[..., t.Any]", wrapper)

    @staticmethod
    def exception_handling(
        func: ClickFunctionWrapper[..., t.Any], group: click.Group, **attrs: t.Any
    ) -> ClickFunctionWrapper[..., t.Any]:
        """This is not supposed to be used with an unprocessed click function.
        This should be used as the last currying step: common_params -> usage_tracking -> exception_handling.
        """
        command_name = attrs.get("name", func.__name__)

        @functools.wraps(func)
        def wrapper(*args: P.args, **attrs: P.kwargs) -> t.Any:
            try:
                return func(*args, **attrs)
            except openllm.exceptions.OpenLLMException as err:
                raise click.ClickException(
                    click.style(f"[{group.name}] '{command_name}' failed: " + err.message, fg="red")
                ) from err
            except KeyboardInterrupt:
                # NOTE: silence KeyboardInterrupt
                pass

        return t.cast("ClickFunctionWrapper[..., t.Any]", wrapper)

    def __init__(self, *args: t.Any, **attrs: t.Any) -> None:
        super(OpenLLMCommandGroup, self).__init__(*args, **attrs)
        # these two dictionaries cache the generated 'start' and 'start-grpc' commands per model
        self._cached_http: dict[str, t.Any] = {}
        self._cached_grpc: dict[str, t.Any] = {}

    def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
        cmd_name = self.resolve_alias(cmd_name)
        if ctx.command.name == "start":
            if cmd_name not in self._cached_http:
                self._cached_http[cmd_name] = start_model_command(
                    cmd_name, self, _context_settings=ctx.command.context_settings
                )
            return self._cached_http[cmd_name]
        elif ctx.command.name == "start-grpc":
            if cmd_name not in self._cached_grpc:
                self._cached_grpc[cmd_name] = start_model_command(
                    cmd_name, self, _context_settings=ctx.command.context_settings, _serve_grpc=True
                )
            return self._cached_grpc[cmd_name]
        return super().get_command(ctx, cmd_name)

    def list_commands(self, ctx: click.Context) -> list[str]:
        if ctx.command.name == "start" or ctx.command.name == "start-grpc":
            return list(openllm.CONFIG_MAPPING.keys())

        return super().list_commands(ctx)

    def command(self, *args: t.Any, **attrs: t.Any) -> F[[t.Callable[P, t.Any]], click.Command]:
        """Override the default 'cli.command' with support for aliases for a given command,
        and wrap the implementation with the common parameters.
        """
        if "context_settings" not in attrs:
            attrs["context_settings"] = {}
        attrs["context_settings"]["max_content_width"] = 120
        aliases = attrs.pop("aliases", None)

        def wrapper(f: F[P, t.Any]) -> click.Command:
            name = f.__name__.lower().replace("_", "-")
            attrs.setdefault("help", inspect.getdoc(f))
            attrs.setdefault("name", name)

            # Wrap implementation with common parameters
            wrapped = self.common_params(f)
            # Wrap into OpenLLM usage tracking
            wrapped = self.usage_tracking(wrapped, self, **attrs)
            # Wrap into exception handling
            wrapped = self.exception_handling(wrapped, self, **attrs)

            # move common parameters to the end of the parameters list
            wrapped.__click_params__ = (
                wrapped.__click_params__[-self.NUMBER_OF_COMMON_PARAMS :]
                + wrapped.__click_params__[: -self.NUMBER_OF_COMMON_PARAMS]
            )

            # NOTE: we need to call super of super to avoid conflict with BentoMLCommandGroup's command setup
            cmd = super(BentoMLCommandGroup, self).command(*args, **attrs)(wrapped)
            # NOTE: add aliases to a given command if they are specified.
            if aliases is not None:
                assert cmd.name
                self._commands[cmd.name] = aliases
                self._aliases.update({alias: cmd.name for alias in aliases})

            return cmd

        # XXX: The current type coercion is not ideal, but we can only define it loosely.
        return t.cast("F[[t.Callable[..., t.Any]], click.Command]", wrapper)


# NOTE: A list of bentoml options that are not needed for parsing.
# NOTE: Users shouldn't set '--working-dir', as OpenLLM will set this up.
# NOTE: 'production' is also deprecated.
_IGNORED_OPTIONS = {"working_dir", "production", "protocol_version"}


if t.TYPE_CHECKING:
    WrappedServeFunction = ClickFunctionWrapper[t.Concatenate[int, str | None, OutputLiteral, P], openllm.LLMConfig]
else:
    WrappedServeFunction = t.Any


def parse_serve_args(serve_grpc: bool):
    """Parse the `bentoml serve|serve-grpc` click options so they can be reused by `openllm start`."""
    from bentoml_cli.cli import cli

    command = "serve" if not serve_grpc else "serve-grpc"
    group = cog.optgroup.group(
        f"Options for starting {'an HTTP' if not serve_grpc else 'a gRPC'} server",
        help=f"Related to serving the model [synonymous with `bentoml {'serve-http' if not serve_grpc else command}`]",
    )

    def decorator(
        f: t.Callable[t.Concatenate[int, str | None, t.Literal["porcelain", "pretty"], P], openllm.LLMConfig]
    ) -> ClickFunctionWrapper[P, openllm.LLMConfig]:
        serve_command = cli.commands[command]
        # The first parameter is the 'bento' argument, and the last three are shared defaults,
        # which we don't need.
        serve_options = [p for p in serve_command.params[1:-3] if p.name not in _IGNORED_OPTIONS]
        for options in reversed(serve_options):
            attrs = options.to_info_dict()
            # we don't need 'param_type_name', since they should all be options
            attrs.pop("param_type_name")
            # 'name' is not a valid argument to click.option
            attrs.pop("name")
            # 'type' can be determined from the default value
            attrs.pop("type")
            param_decls = (*attrs.pop("opts"), *attrs.pop("secondary_opts"))
            f = t.cast("WrappedServeFunction[P]", cog.optgroup.option(*param_decls, **attrs)(f))

        return group(f)

    return decorator


def start_model_command(
    model_name: str,
    group: click.Group,
    _context_settings: dict[str, t.Any] | None = None,
    _serve_grpc: bool = False,
) -> click.Command:
    """Generate a 'click.Command' for any given LLM.

    Args:
        model_name: The name of the model
        group: The click.Group to add the command to
        _context_settings: The context settings to use for the command
        _serve_grpc: Whether to serve the model via gRPC or HTTP

    Returns:
        The click.Command for starting the model server

    Note that the internal commands will return the llm_config and a boolean that determines
    whether the server is run with GPU or not.
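
    A minimal illustrative sketch of how the generated command can be invoked programmatically,
    mirroring what the `_start` helper below does (the model name "flan-t5" is only an example):

        start_cmd = start_model_command("flan-t5", cli)
        start_cmd(standalone_mode=False)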
""" from bentoml._internal.configuration.containers import BentoMLContainer from bentoml._internal.log import configure_logging configure_logging() ModelEnv = openllm.utils.ModelEnv(model_name) command_attrs: dict[str, t.Any] = { "name": ModelEnv.model_name, "context_settings": _context_settings or {}, "short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)", "help": ModelEnv.start_docstring, } llm_config = openllm.AutoConfig.for_model(model_name) aliases: list[str] = [] if llm_config.name_type == "dasherize": aliases.append(llm_config.__openllm_start_name__) command_attrs["aliases"] = aliases if len(aliases) > 0 else None gpu_available = False try: llm_config.check_if_gpu_is_available(ModelEnv.get_framework_env()) gpu_available = True if llm_config.__openllm_requires_gpu__ else False except openllm.exceptions.GpuNotAvailableError: # NOTE: The model requires GPU, therefore we will return a dummy command command_attrs.update( { "short_help": "(Disabled because there is no GPU available)", "help": f"""{model_name} is currently not available to run on your local machine because it requires GPU for faster inference.""", } ) @group.command(**command_attrs) def noop() -> openllm.LLMConfig: _echo("No GPU available, therefore this command is disabled", fg="red") openllm.utils.analytics.track_start_init(llm_config, gpu_available) return llm_config return noop @group.command(**command_attrs) @llm_config.to_click_options @parse_serve_args(_serve_grpc) @click.option("--server-timeout", type=int, default=3600, help="Server timeout in seconds") @click.option( "--pretrained", type=click.STRING, default=None, help="Optional pretrained name or path to fine-tune weight." ) @click.option( "-o", "--output", type=click.Choice(["pretty", "porcelain"]), default="porcelain", help="Showing output type. Default to 'pretty'", ) def model_start( server_timeout: int, pretrained: str | None, output: t.Literal["pretty", "porcelain"], **attrs: t.Any, ) -> openllm.LLMConfig: config, server_attrs = llm_config.model_validate_click(**attrs) if ModelEnv.get_framework_env() == "flax": llm = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained, llm_config=config) elif ModelEnv.get_framework_env() == "tf": llm = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained, llm_config=config) else: llm = openllm.AutoLLM.for_model(model_name, pretrained=pretrained, llm_config=config) # NOTE: We need to initialize llm here first to check if the model is already downloaded to # avoid deadlock before the subprocess forking. llm.ensure_pretrained_exists() # NOTE: check for GPU one more time in cases this model doesn't requires GPU but users can still # run this model on GPU try: llm.config.check_if_gpu_is_available(ModelEnv.get_framework_env(), force=True) gpu_available = True except openllm.exceptions.GpuNotAvailableError: gpu_available = False server_attrs.update({"working_dir": os.path.dirname(__file__)}) if _serve_grpc: server_attrs["grpc_protocol_version"] = "v1" # NOTE: currently, theres no development args in bentoml.Server. To be fixed upstream. 
        development = server_attrs.pop("development")
        server_attrs.setdefault("production", not development)

        start_env = os.environ.copy()

        # NOTE: This is a hack to set the current configuration via BENTOML_CONFIG_OPTIONS
        _bentoml_config_options = start_env.pop("BENTOML_CONFIG_OPTIONS", "")
        _bentoml_config_options += (" " if _bentoml_config_options else "") + (
            f"api_server.traffic.timeout={server_timeout}"
            f' runners."llm-{llm.config.__openllm_start_name__}-runner".traffic.timeout'
            f"={llm.config.__openllm_timeout__}"
        )

        start_env.update(
            {
                ModelEnv.framework: ModelEnv.get_framework_env(),
                ModelEnv.model_config: llm.config.model_dump_json().decode(),
                "OPENLLM_MODEL": model_name,
                "BENTOML_DEBUG": str(get_debug_mode()),
                "BENTOML_CONFIG_OPTIONS": _bentoml_config_options,
                "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()),
            }
        )

        if llm.config.__openllm_requirements__ is not None:
            _echo(
                f"Make sure to have the following dependencies available: {llm.config.__openllm_requirements__}",
                fg="yellow",
            )

        if t.TYPE_CHECKING:
            server_cls: type[bentoml.HTTPServer] if not _serve_grpc else type[bentoml.GrpcServer]

        server_cls = getattr(bentoml, "HTTPServer" if not _serve_grpc else "GrpcServer")
        server_attrs["timeout"] = 90
        server = server_cls("_service.py:svc", **server_attrs)

        try:
            openllm.utils.analytics.track_start_init(llm.config, gpu_available)
            server.start(env=start_env, text=True, blocking=get_debug_mode())
            if not get_debug_mode():
                assert server.process is not None and server.process.stdout is not None
                with server.process.stdout:
                    for f in iter(server.process.stdout.readline, b""):
                        _echo(f, nl=False, fg="white")
        except KeyboardInterrupt:
            on_start_end(model_name)
        except Exception as err:
            _echo(f"Error caught while starting LLM Server:\n{err}", fg="red")
            raise
        else:
            on_start_end(model_name)

        # NOTE: Return the configuration for telemetry purposes.
        return llm_config

    def on_start_end(model_name: str):
        if not get_debug_mode():
            _echo(
                f"\n🚀 Next step: run 'openllm build {model_name}' to create a Bento for {model_name}",
                fg="blue",
            )

    return model_start


def _start(
    model_name: str,
    framework: t.Literal["flax", "tf", "pt"] | None = None,
    **attrs: t.Any,
):
    """Python API to start an LLM server."""
    from . import utils

    _serve_grpc = attrs.pop("_serve_grpc", False)

    ModelEnv = utils.ModelEnv(model_name)

    if framework is not None:
        os.environ[ModelEnv.framework] = framework
    start_model_command(model_name, t.cast(OpenLLMCommandGroup, cli), _serve_grpc=_serve_grpc)(
        standalone_mode=False, **attrs
    )


start = functools.partial(_start, _serve_grpc=False)
start_grpc = functools.partial(_start, _serve_grpc=True)

output_option = click.option(
    "-o",
    "--output",
    type=click.Choice(["json", "pretty", "porcelain"]),
    default="pretty",
    help="Output format. Defaults to 'pretty'.",
)


def cli_factory() -> click.Group:
    from .__about__ import __version__

    configure_logging()

    model_store = BentoMLContainer.model_store.get()

    @click.group(cls=OpenLLMCommandGroup, context_settings=_CONTEXT_SETTINGS, name="openllm")
    @click.version_option(__version__, "--version", "-v")
    def cli():
        """
        \b
         ██████╗ ██████╗ ███████╗███╗   ██╗██╗     ██╗     ███╗   ███╗
        ██╔═══██╗██╔══██╗██╔════╝████╗  ██║██║     ██║     ████╗ ████║
        ██║   ██║██████╔╝█████╗  ██╔██╗ ██║██║     ██║     ██╔████╔██║
        ██║   ██║██╔═══╝ ██╔══╝  ██║╚██╗██║██║     ██║     ██║╚██╔╝██║
        ╚██████╔╝██║     ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
         ╚═════╝ ╚═╝     ╚══════╝╚═╝  ╚═══╝╚══════╝╚══════╝╚═╝     ╚═╝

        \b
        An open platform for operating large language models in production.
        Fine-tune, serve, deploy, and monitor any LLMs with ease.
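
        \b
        Usage examples (the model name shown is illustrative):

        \b
            $ openllm start flan-t5
            $ openllm build flan-t5
            $ openllm models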
""" @cli.group(cls=OpenLLMCommandGroup, context_settings=_CONTEXT_SETTINGS) def start(): """ Start any LLM as a REST server. $ openllm start -- ... """ @cli.group(cls=OpenLLMCommandGroup, context_settings=_CONTEXT_SETTINGS) def start_grpc(): """ Start any LLM as a gRPC server. $ openllm start-grpc -- ... """ @cli.command() @click.argument( "model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]) ) @click.option("--pretrained", default=None, help="Given pretrained model name for the given model name [Optional].") @click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.") @output_option def build(model_name: str, pretrained: str | None, overwrite: bool, output: OutputLiteral): """Package a given models into a Bento. $ openllm build flan-t5 """ if output == "porcelain": set_quiet_mode(True) configure_server_logging() bento, _previously_built = openllm.build( model_name, __cli__=True, pretrained=pretrained, _overwrite_existing_bento=overwrite, ) if output == "pretty": if not get_quiet_mode(): _echo("\n" + OPENLLM_FIGLET, fg="white") if not _previously_built: _echo(f"Successfully built {bento}.", fg="green") else: _echo( f"'{model_name}' already has a Bento built [{bento}]. To overwrite it pass '--overwrite'.", fg="yellow", ) _echo( "\nPossible next steps:\n\n" + "* Push to BentoCloud with `bentoml push`:\n" + f" $ bentoml push {bento.tag}\n" + "* Containerize your Bento with `bentoml containerize`:\n" + f" $ bentoml containerize {bento.tag}", fg="blue", ) elif output == "json": _echo(orjson.dumps(bento.info.to_dict(), option=orjson.OPT_INDENT_2).decode()) else: _echo(bento.tag) return bento @cli.command() @output_option def models(output: OutputLiteral): """List all supported models.""" models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()) if output == "porcelain": _echo("\n".join(models), fg="white") else: failed_initialized: list[tuple[str, Exception]] = [] json_data: dict[str, dict[t.Literal["pretrained", "description"], t.Any]] = {} for m in models: try: model = openllm.AutoLLM.for_model(m) docs = inspect.cleandoc(model.config.__doc__ or "(No description)") json_data[m] = {"pretrained": model.pretrained, "description": docs} except Exception as err: failed_initialized.append((m, err)) if output == "pretty": import tabulate tabulate.PRESERVE_WHITESPACE = True data: list[str | tuple[str, str, list[str]]] = [] for m, v in json_data.items(): data.extend([(m, v["description"], v["pretrained"])]) column_widths = [int(COLUMNS / 6), int(COLUMNS / 3 * 2), int(COLUMNS / 6)] if len(data) == 0 and len(failed_initialized) > 0: _echo("Exception found while parsing models:\n", fg="yellow") for m, err in failed_initialized: _echo(f"- {m}: ", fg="yellow", nl=False) _echo(traceback.print_exception(err, limit=3), fg="red") sys.exit(1) table = tabulate.tabulate( data, tablefmt="fancy_grid", headers=["LLM", "Description", "Pretrained"], maxcolwidths=column_widths, ) formatted_table = "" for line in table.split("\n"): formatted_table += ( "".join(f"{cell:{width}}" for cell, width in zip(line.split("\t"), column_widths)) + "\n" ) _echo(formatted_table, fg="white") if len(failed_initialized) > 0: _echo("\nThe following models are supported but failed to initialize:\n") for m, err in failed_initialized: _echo(f"- {m}: ", fg="blue", nl=False) _echo(err, fg="red") else: _echo(orjson.dumps(json_data, option=orjson.OPT_INDENT_2).decode()) sys.exit(0) @cli.command() @click.argument( 
"model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]) ) @click.option( "--pretrained", type=click.STRING, default=None, help="Optional pretrained name or path to fine-tune weight." ) @output_option def download_models(model_name: str, pretrained: str | None, output: OutputLiteral): """Setup LLM interactively. Note: This is useful for development and setup for fine-tune. """ config = openllm.AutoConfig.for_model(model_name) env = config.__openllm_env__.get_framework_env() if env == "flax": model = openllm.AutoFlaxLLM.for_model(model_name, pretrained=pretrained, llm_config=config) elif env == "tf": model = openllm.AutoTFLLM.for_model(model_name, pretrained=pretrained, llm_config=config) else: model = openllm.AutoLLM.for_model(model_name, pretrained=pretrained, llm_config=config) tag = model.make_tag(trust_remote_code=config.__openllm_trust_remote_code__) if len(bentoml.models.list(tag)) == 0: if output == "pretty": _echo(f"{tag} does not exists yet!. Downloading...", nl=True) m = model.ensure_pretrained_exists() _echo(f"Saved model: {m.tag}") elif output == "json": m = model.ensure_pretrained_exists() _echo( orjson.dumps( {"previously_setup": False, "framework": env, "tag": str(m.tag)}, option=orjson.OPT_INDENT_2 ).decode() ) else: m = model.ensure_pretrained_exists() _echo(tag) else: m = model.ensure_pretrained_exists() if output == "pretty": _echo(f"{model_name} is already setup for framework '{env}': {str(m.tag)}", nl=True) elif output == "json": _echo( orjson.dumps( {"previously_setup": True, "framework": env, "model": str(m.tag)}, option=orjson.OPT_INDENT_2 ).decode() ) else: _echo(m.tag) return m @cli.command() @click.option( "-y", "--yes", "--assume-yes", is_flag=True, help="Skip confirmation when deleting a specific model", ) def prune(yes: bool): """Remove all saved models locally.""" available = [ m for t in map(inflection.dasherize, openllm.CONFIG_MAPPING.keys()) for m in bentoml.models.list() if t in m.tag.name ] for model in available: if yes: delete_confirmed = True else: delete_confirmed = click.confirm(f"delete model {model.tag}?") if delete_confirmed: model_store.delete(model.tag) click.echo(f"{model} deleted.") @cli.command(name="query") @click.option( "--endpoint", type=click.STRING, help="OpenLLM Server endpoint, i.e: http://0.0.0.0:3000", envvar="OPENLLM_ENDPOINT", default="http://0.0.0.0:3000", ) @click.option("--timeout", type=click.INT, default=30, help="Default server timeout", show_default=True) @click.option( "--server-type", type=click.Choice(["grpc", "http"]), help="Server type", default="http", show_default=True ) @output_option @click.argument("query", type=click.STRING) def query( query: str, endpoint: str, timeout: int, server_type: t.Literal["http", "grpc"], output: OutputLiteral, ): """Ask a LLM interactively, from a terminal. $ openllm query --endpoint http://12.323.2.1 "What is the meaning of life?" 
""" if server_type == "grpc": endpoint = re.sub(r"http://", "", endpoint) client = ( openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout) ) if client.framework == "flax": model = openllm.AutoFlaxLLM.for_model(client.model_name) elif client.framework == "tf": model = openllm.AutoTFLLM.for_model(client.model_name) else: model = openllm.AutoLLM.for_model(client.model_name) if output != "porcelain": _echo(f"Processing query: {query}\n", fg="white") res = client.query(query, return_raw_response=True) if output == "pretty": formatted = model.postprocess_generate(query, res["responses"]) _echo("Responses: ", fg="white", nl=False) _echo(formatted, fg="cyan") elif output == "json": _echo(orjson.dumps(res, option=orjson.OPT_INDENT_2).decode(), fg="white") else: _echo(res["responses"], fg="white") if t.TYPE_CHECKING: assert download_models and build and models and start and start_grpc and query and prune if psutil.WINDOWS: sys.stdout.reconfigure(encoding="utf-8") # type: ignore return cli cli = cli_factory() if __name__ == "__main__": cli()