Mirror of https://github.com/bentoml/OpenLLM.git
revert: "ci: pre-commit autoupdate [pre-commit.ci] (#931)"
This reverts commit 7b00c84c2a.
@@ -5,25 +5,12 @@ from bentoml_cli.utils import BentoMLCommandGroup
from click import shell_completion as sc
from openllm_core._configuration import LLMConfig
from openllm_core._typing_compat import (
Concatenate,
DictStrAny,
LiteralBackend,
LiteralSerialisation,
ParamSpec,
AnyCallable,
get_literal_args,
)
from openllm_core._typing_compat import Concatenate, DictStrAny, LiteralBackend, LiteralSerialisation, ParamSpec, AnyCallable, get_literal_args
from openllm_core.utils import DEBUG, compose, dantic, resolve_user_filepath
class _OpenLLM_GenericInternalConfig(LLMConfig):
__config__ = {
'name_type': 'lowercase',
'default_id': 'openllm/generic',
'model_ids': ['openllm/generic'],
'architecture': 'PreTrainedModel',
}
__config__ = {'name_type': 'lowercase', 'default_id': 'openllm/generic', 'model_ids': ['openllm/generic'], 'architecture': 'PreTrainedModel'}
class GenerationConfig:
top_k: int = 15

@@ -50,20 +37,11 @@ def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete
def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
return [
sc.CompletionItem(inflection.dasherize(it), help='Model')
for it in openllm.CONFIG_MAPPING
if it.startswith(incomplete)
]
return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]
def parse_config_options(
config: LLMConfig,
server_timeout: int,
workers_per_resource: float,
device: t.Tuple[str, ...] | None,
cors: bool,
environ: DictStrAny,
config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny
) -> DictStrAny:
# TODO: Support amd.com/gpu on k8s
_bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')

@@ -78,21 +56,14 @@ def parse_config_options(
if device:
if len(device) > 1:
_bentoml_config_options_opts.extend([
f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
for idx, dev in enumerate(device)
f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)
])
else:
_bentoml_config_options_opts.append(
f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]'
)
_bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
if cors:
_bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
_bentoml_config_options_opts.extend([
'api_server.http.cors.enabled=true',
'api_server.http.cors.access_control_allow_origins="*"',
])
_bentoml_config_options_opts.extend([
f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"'
for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])
f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])
])
_bentoml_config_options_env += ' ' if _bentoml_config_options_env else '' + ' '.join(_bentoml_config_options_opts)
environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
@@ -171,9 +142,7 @@ def start_decorator(fn: FC) -> FC:
return composed(fn)
def parse_device_callback(
_: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None
) -> t.Tuple[str, ...] | None:
def parse_device_callback(_: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
if value is None:
return value
el: t.Tuple[str, ...] = tuple(i for k in value for i in k)

@@ -192,19 +161,13 @@ _IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}
def parse_serve_args() -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
from bentoml_cli.cli import cli
group = cog.optgroup.group(
'Start a HTTP server options', help='Related to serving the model [synonymous to `bentoml serve-http`]'
)
group = cog.optgroup.group('Start a HTTP server options', help='Related to serving the model [synonymous to `bentoml serve-http`]')
def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]:
serve_command = cli.commands['serve']
# The first variable is the argument bento
# The last five is from BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS
serve_options = [
p
for p in serve_command.params[1 : -BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS]
if p.name not in _IGNORED_OPTIONS
]
serve_options = [p for p in serve_command.params[1 : -BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS] if p.name not in _IGNORED_OPTIONS]
for options in reversed(serve_options):
attrs = options.to_info_dict()
# we don't need param_type_name, since it should all be options

@@ -258,13 +221,7 @@ def adapter_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callab
def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--cors/--no-cors',
show_default=True,
default=False,
envvar='OPENLLM_CORS',
show_envvar=True,
help='Enable CORS for the server.',
**attrs,
'--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs
)(f)

@@ -318,12 +275,7 @@ def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[
def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_argument(
'model_name',
type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]),
required=required,
**attrs,
)(f)
return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f)
def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:

@@ -361,9 +313,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
)(f)
def workers_per_resource_option(
f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any
) -> t.Callable[[FC], FC]:
def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--workers-per-resource',
default=None,

@@ -431,9 +381,7 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va
float(value)  # type: ignore[arg-type]
except ValueError:
raise click.BadParameter(
f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.",
ctx,
param,
f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param
) from None
else:
return value
@@ -69,10 +69,7 @@ def _start(
if timeout:
args.extend(['--server-timeout', str(timeout)])
if workers_per_resource:
args.extend([
'--workers-per-resource',
str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource,
])
args.extend(['--workers-per-resource', str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
if device and not os.environ.get('CUDA_VISIBLE_DEVICES'):
args.extend(['--device', ','.join(device)])
if quantize:

@@ -80,11 +77,7 @@ def _start(
if cors:
args.append('--cors')
if adapter_map:
args.extend(
list(
itertools.chain.from_iterable([['--adapter-id', f"{k}{':' + v if v else ''}"] for k, v in adapter_map.items()])
)
)
args.extend(list(itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])))
if additional_args:
args.extend(additional_args)
if __test__:

@@ -155,9 +148,7 @@ def _build(
'--machine',
'--quiet',
'--serialisation',
first_not_none(
serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
),
first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'),
]
if quantize:
args.extend(['--quantize', quantize])

@@ -174,7 +165,7 @@ def _build(
if overwrite:
args.append('--overwrite')
if adapter_map:
args.extend([f"--adapter-id={k}{':' + v if v is not None else ''}" for k, v in adapter_map.items()])
args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()])
if model_version:
args.extend(['--model-version', model_version])
if bento_version:

@@ -274,4 +265,4 @@ start, build, import_model, list_models = (
codegen.gen_sdk(_import_model),
codegen.gen_sdk(_list_models),
)
__all__ = ['build', 'import_model', 'list_models', 'start']
__all__ = ['start', 'build', 'import_model', 'list_models']
@@ -43,15 +43,7 @@ from openllm_core.utils import (
)
from . import termui
from ._factory import (
FC,
_AnyCallable,
machine_option,
model_name_argument,
parse_config_options,
start_decorator,
optimization_decorator,
)
from ._factory import FC, _AnyCallable, machine_option, model_name_argument, parse_config_options, start_decorator, optimization_decorator
if t.TYPE_CHECKING:
import torch

@@ -103,18 +95,12 @@ def backend_warning(backend: LiteralBackend, build: bool = False) -> None:
'vLLM is not available. Note that PyTorch backend is not as performant as vLLM and you should always consider using vLLM for production.'
)
if build:
logger.info(
"Tip: You can set '--backend vllm' to package your Bento with vLLM backend regardless if vLLM is available locally."
)
logger.info("Tip: You can set '--backend vllm' to package your Bento with vLLM backend regardless if vLLM is available locally.")
class Extensions(click.MultiCommand):
def list_commands(self, ctx: click.Context) -> list[str]:
return sorted([
filename[:-3]
for filename in os.listdir(_EXT_FOLDER)
if filename.endswith('.py') and not filename.startswith('__')
])
return sorted([filename[:-3] for filename in os.listdir(_EXT_FOLDER) if filename.endswith('.py') and not filename.startswith('__')])
def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
try:
@@ -131,41 +117,19 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
def common_params(f: t.Callable[P, t.Any]) -> t.Callable[[FC], FC]:
# The following logics is similar to one of BentoMLCommandGroup
@cog.optgroup.group(name='Global options', help='Shared globals options for all OpenLLM CLI.')  # type: ignore[misc]
@cog.optgroup.option('-q', '--quiet', envvar=QUIET_ENV_VAR, is_flag=True, default=False, help='Suppress all output.', show_envvar=True)
@cog.optgroup.option(
'-q', '--quiet', envvar=QUIET_ENV_VAR, is_flag=True, default=False, help='Suppress all output.', show_envvar=True
'--debug', '--verbose', 'debug', envvar=DEBUG_ENV_VAR, is_flag=True, default=False, help='Print out debug logs.', show_envvar=True
)
@cog.optgroup.option(
'--debug',
'--verbose',
'debug',
envvar=DEBUG_ENV_VAR,
is_flag=True,
default=False,
help='Print out debug logs.',
show_envvar=True,
'--do-not-track', is_flag=True, default=False, envvar=analytics.OPENLLM_DO_NOT_TRACK, help='Do not send usage info', show_envvar=True
)
@cog.optgroup.option(
'--do-not-track',
is_flag=True,
default=False,
envvar=analytics.OPENLLM_DO_NOT_TRACK,
help='Do not send usage info',
show_envvar=True,
)
@cog.optgroup.option(
'--context',
'cloud_context',
envvar='BENTOCLOUD_CONTEXT',
type=click.STRING,
default=None,
help='BentoCloud context name.',
show_envvar=True,
'--context', 'cloud_context', envvar='BENTOCLOUD_CONTEXT', type=click.STRING, default=None, help='BentoCloud context name.', show_envvar=True
)
@click.pass_context
@functools.wraps(f)
def wrapper(
ctx: click.Context, quiet: bool, debug: bool, cloud_context: str | None, *args: P.args, **attrs: P.kwargs
) -> t.Any:
def wrapper(ctx: click.Context, quiet: bool, debug: bool, cloud_context: str | None, *args: P.args, **attrs: P.kwargs) -> t.Any:
ctx.obj = GlobalOptions(cloud_context=cloud_context)
if quiet:
set_quiet_mode(True)

@@ -179,9 +143,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
return wrapper
@staticmethod
def usage_tracking(
func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any
) -> t.Callable[Concatenate[bool, P], t.Any]:
def usage_tracking(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[Concatenate[bool, P], t.Any]:
command_name = attrs.get('name', func.__name__)
@functools.wraps(func)

@@ -240,9 +202,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
_memo = getattr(wrapped, '__click_params__', None)
if _memo is None:
raise ValueError('Click command not register correctly.')
_object_setattr(
wrapped, '__click_params__', _memo[-self.NUMBER_OF_COMMON_PARAMS :] + _memo[: -self.NUMBER_OF_COMMON_PARAMS]
)
_object_setattr(wrapped, '__click_params__', _memo[-self.NUMBER_OF_COMMON_PARAMS :] + _memo[: -self.NUMBER_OF_COMMON_PARAMS])
# NOTE: we need to call super of super to avoid conflict with BentoMLCommandGroup command setup
cmd = super(BentoMLCommandGroup, self).command(*args, **kwargs)(wrapped)
# NOTE: add aliases to a given commands if it is specified.

@@ -250,7 +210,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
if not cmd.name:
raise ValueError('name is required when aliases are available.')
self._commands[cmd.name] = aliases
self._aliases.update(dict.fromkeys(aliases, cmd.name))
self._aliases.update({alias: cmd.name for alias in aliases})
return cmd
return decorator
@@ -317,12 +277,7 @@ def cli() -> None:
"""
@cli.command(
context_settings=termui.CONTEXT_SETTINGS,
name='start',
aliases=['start-http'],
short_help='Start a LLMServer for any supported LLM.',
)
@cli.command(context_settings=termui.CONTEXT_SETTINGS, name='start', aliases=['start-http'], short_help='Start a LLMServer for any supported LLM.')
@click.argument('model_id', type=click.STRING, metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=True)
@click.option(
'--model-id',

@@ -375,9 +330,7 @@ def start_command(
```
"""
if backend == 'pt':
logger.warning(
'PyTorch backend is deprecated and will be removed in future releases. Make sure to use vLLM instead.'
)
logger.warning('PyTorch backend is deprecated and will be removed in future releases. Make sure to use vLLM instead.')
if model_id in openllm.CONFIG_MAPPING:
_model_name = model_id
if deprecated_model_id is not None:

@@ -395,17 +348,11 @@ def start_command(
from openllm.serialisation.transformers.weights import has_safetensors_weights
serialisation = first_not_none(
serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
)
serialisation = first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy')
if serialisation == 'safetensors' and quantize is not None:
logger.warning("'--quantize=%s' might not work with 'safetensors' serialisation format.", quantize)
logger.warning(
"Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.",
model_id,
serialisation,
)
logger.warning("Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.", model_id, serialisation)
logger.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.")
import torch

@@ -433,9 +380,7 @@ def start_command(
config, server_attrs = llm.config.model_validate_click(**attrs)
server_timeout = first_not_none(server_timeout, default=config['timeout'])
server_attrs.update({'working_dir': pkg.source_locations('openllm'), 'timeout': server_timeout})
development = server_attrs.pop(
'development'
)  # XXX: currently, theres no development args in bentoml.Server. To be fixed upstream.
development = server_attrs.pop('development')  # XXX: currently, theres no development args in bentoml.Server. To be fixed upstream.
server_attrs.setdefault('production', not development)
start_env = process_environ(
@@ -465,12 +410,8 @@ def start_command(
return config
def process_environ(
config, server_timeout, wpr, device, cors, model_id, adapter_map, serialisation, llm, use_current_env=True
):
environ = parse_config_options(
config, server_timeout, wpr, device, cors, os.environ.copy() if use_current_env else {}
)
def process_environ(config, server_timeout, wpr, device, cors, model_id, adapter_map, serialisation, llm, use_current_env=True):
environ = parse_config_options(config, server_timeout, wpr, device, cors, os.environ.copy() if use_current_env else {})
environ.update({
'OPENLLM_MODEL_ID': model_id,
'BENTOML_DEBUG': str(openllm.utils.get_debug_mode()),

@@ -515,8 +456,7 @@ def build_bento_instruction(llm, model_id, serialisation, adapter_map):
cmd_name += f' --serialization {serialisation}'
if adapter_map is not None:
cmd_name += ' ' + ' '.join([
f'--adapter-id {s}'
for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]
f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]
])
if not openllm.utils.get_quiet_mode():
termui.info(f"🚀Tip: run '{cmd_name}' to create a BentoLLM for '{model_id}'")

@@ -551,12 +491,8 @@ def run_server(args, env, return_process=False) -> subprocess.Popen[bytes] | int
if return_process:
return process
stop_event = threading.Event()
stdout, stderr = (
threading.Thread(target=handle, args=(process.stdout, stop_event)),
threading.Thread(target=handle, args=(process.stderr, stop_event)),
)
stdout.start()
stderr.start()  # noqa: E702
stdout, stderr = threading.Thread(target=handle, args=(process.stdout, stop_event)), threading.Thread(target=handle, args=(process.stderr, stop_event))
stdout.start(); stderr.start()  # noqa: E702
try:
process.wait()

@@ -571,12 +507,9 @@ def run_server(args, env, return_process=False) -> subprocess.Popen[bytes] | int
raise
finally:
stop_event.set()
stdout.join()
stderr.join()  # noqa: E702
if process.poll() is not None:
process.kill()
stdout.join()
stderr.join()  # noqa: E702
stdout.join(); stderr.join()  # noqa: E702
if process.poll() is not None: process.kill()
stdout.join(); stderr.join()  # noqa: E702
return process.returncode
@@ -664,10 +597,7 @@ def import_command(
backend=backend,
dtype=dtype,
serialisation=t.cast(
LiteralSerialisation,
first_not_none(
serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
),
LiteralSerialisation, first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy')
),
)
backend_warning(llm.__llm_backend__)

@@ -726,21 +656,14 @@ class BuildBentoOutput(t.TypedDict):
metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]',
help='Deprecated. Use positional argument instead.',
)
@click.option(
'--bento-version',
type=str,
default=None,
help='Optional bento version for this BentoLLM. Default is the the model revision.',
)
@click.option('--bento-version', type=str, default=None, help='Optional bento version for this BentoLLM. Default is the the model revision.')
@click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.')
@click.option(
'--enable-features',
multiple=True,
nargs=1,
metavar='FEATURE[,FEATURE]',
help='Enable additional features for building this LLM Bento. Available: {}'.format(
', '.join(OPTIONAL_DEPENDENCIES)
),
help='Enable additional features for building this LLM Bento. Available: {}'.format(', '.join(OPTIONAL_DEPENDENCIES)),
)
@optimization_decorator
@click.option(

@@ -751,12 +674,7 @@ class BuildBentoOutput(t.TypedDict):
help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed.",
)
@click.option('--build-ctx', help='Build context. This is required if --adapter-id uses relative path', default=None)
@click.option(
'--dockerfile-template',
default=None,
type=click.File(),
help='Optional custom dockerfile template to be used with this BentoLLM.',
)
@click.option('--dockerfile-template', default=None, type=click.File(), help='Optional custom dockerfile template to be used with this BentoLLM.')
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options')  # type: ignore[misc]
@cog.optgroup.option(
'--containerize',
@@ -849,9 +767,7 @@ def build_command(
state = ItemState.NOT_FOUND
if backend == 'pt':
logger.warning(
"PyTorch backend is deprecated and will be removed from the next releases. Will set default backend to 'vllm' instead."
)
logger.warning("PyTorch backend is deprecated and will be removed from the next releases. Will set default backend to 'vllm' instead.")
llm = openllm.LLM(
model_id=model_id,

@@ -861,9 +777,7 @@ def build_command(
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
serialisation=first_not_none(
serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
),
serialisation=first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'),
_eager=False,
)
if llm.__llm_backend__ not in llm.config['backend']:

@@ -875,9 +789,7 @@ def build_command(
model = openllm.serialisation.import_model(llm, trust_remote_code=llm.trust_remote_code)
llm._tag = model.tag
os.environ.update(
**process_environ(llm.config, llm.config['timeout'], 1.0, None, True, llm.model_id, None, llm._serialisation, llm)
)
os.environ.update(**process_environ(llm.config, llm.config['timeout'], 1.0, None, True, llm.model_id, None, llm._serialisation, llm))
try:
assert llm.bentomodel  # HACK: call it here to patch correct tag with revision and everything

@@ -944,11 +856,7 @@ def build_command(
def get_current_bentocloud_context() -> str | None:
try:
context = (
cloud_config.get_context(ctx.obj.cloud_context)
if ctx.obj.cloud_context
else cloud_config.get_current_context()
)
context = cloud_config.get_context(ctx.obj.cloud_context) if ctx.obj.cloud_context else cloud_config.get_current_context()
return context.name
except Exception:
return None

@@ -972,9 +880,7 @@ def build_command(
tag=str(bento_tag),
backend=llm.__llm_backend__,
instructions=[
DeploymentInstruction.from_content(
type='bentocloud', instr="☁️ Push to BentoCloud with 'bentoml push':\n $ {cmd}", cmd=push_cmd
),
DeploymentInstruction.from_content(type='bentocloud', instr="☁️ Push to BentoCloud with 'bentoml push':\n $ {cmd}", cmd=push_cmd),
DeploymentInstruction.from_content(
type='container',
instr="🐳 Container BentoLLM with 'bentoml containerize':\n $ {cmd}",
@@ -1000,9 +906,7 @@ def build_command(
termui.echo(f" * {instruction['content']}\n", nl=False)
if push:
BentoMLContainer.bentocloud_client.get().push_bento(
bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push
)
BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push)
elif containerize:
container_backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker'))
try:

@@ -1042,8 +946,7 @@ def models_command(**_: t.Any) -> dict[t.LiteralString, ModelItem]:
architecture=config.__openllm_architecture__,
example_id=random.choice(config.__openllm_model_ids__),
supported_backends=config.__openllm_backend__,
installation='pip install '
+ (f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config.__openllm_requirements__ else 'openllm'),
installation='pip install ' + (f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config.__openllm_requirements__ else 'openllm'),
items=[
str(md.tag)
for md in bentoml.models.list()

@@ -1062,13 +965,7 @@ def models_command(**_: t.Any) -> dict[t.LiteralString, ModelItem]:
@cli.command()
@model_name_argument(required=False)
@click.option('-y', '--yes', '--assume-yes', is_flag=True, help='Skip confirmation when deleting a specific model')
@click.option(
'--include-bentos/--no-include-bentos',
is_flag=True,
hidden=True,
default=True,
help='Whether to also include pruning bentos.',
)
@click.option('--include-bentos/--no-include-bentos', is_flag=True, hidden=True, default=True, help='Whether to also include pruning bentos.')
@inject
@click.pass_context
def prune_command(
@@ -1085,32 +982,24 @@ def prune_command(
If a model type is passed, then only prune models for that given model type.
"""
available: list[tuple[bentoml.Model | bentoml.Bento, ModelStore | BentoStore]] = [
(m, model_store)
for m in bentoml.models.list()
if 'framework' in m.info.labels and m.info.labels['framework'] == 'openllm'
(m, model_store) for m in bentoml.models.list() if 'framework' in m.info.labels and m.info.labels['framework'] == 'openllm'
]
if model_name is not None:
available = [
(m, store)
for m, store in available
if 'model_name' in m.info.labels and m.info.labels['model_name'] == inflection.underscore(model_name)
(m, store) for m, store in available if 'model_name' in m.info.labels and m.info.labels['model_name'] == inflection.underscore(model_name)
] + [
(b, bento_store)
for b in bentoml.bentos.list()
if 'start_name' in b.info.labels and b.info.labels['start_name'] == inflection.underscore(model_name)
]
else:
available += [
(b, bento_store) for b in bentoml.bentos.list() if '_type' in b.info.labels and '_framework' in b.info.labels
]
available += [(b, bento_store) for b in bentoml.bentos.list() if '_type' in b.info.labels and '_framework' in b.info.labels]
for store_item, store in available:
if yes:
delete_confirmed = True
else:
delete_confirmed = click.confirm(
f"delete {'model' if isinstance(store, ModelStore) else 'bento'} {store_item.tag}?"
)
delete_confirmed = click.confirm(f"delete {'model' if isinstance(store, ModelStore) else 'bento'} {store_item.tag}?")
if delete_confirmed:
store.delete(store_item.tag)
termui.warning(f"{store_item} deleted from {'model' if isinstance(store, ModelStore) else 'bento'} store.")

@@ -1157,17 +1046,8 @@ def shared_client_options(f: _AnyCallable | None = None) -> t.Callable[[FC], FC]
@cli.command()
@shared_client_options
@click.option(
'--server-type',
type=click.Choice(['grpc', 'http']),
help='Server type',
default='http',
show_default=True,
hidden=True,
)
@click.option(
'--stream/--no-stream', type=click.BOOL, is_flag=True, default=True, help='Whether to stream the response.'
)
@click.option('--server-type', type=click.Choice(['grpc', 'http']), help='Server type', default='http', show_default=True, hidden=True)
@click.option('--stream/--no-stream', type=click.BOOL, is_flag=True, default=True, help='Whether to stream the response.')
@click.argument('prompt', type=click.STRING)
@click.option(
'--sampling-params',
@@ -21,9 +21,7 @@ if t.TYPE_CHECKING:
@machine_option
@click.pass_context
@inject
def cli(
ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
) -> str | None:
def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
"""Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path)."""
try:
bentomodel = _bento_store.get(bento)

@@ -17,9 +17,7 @@ if t.TYPE_CHECKING:
from bentoml._internal.bento import BentoStore
@click.command(
'get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.'
)
@click.command('get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.')
@click.argument('bento', type=str, shell_complete=bento_complete_envvar)
@click.pass_context
@inject

@@ -22,9 +22,7 @@ class PromptFormatter(string.Formatter):
raise ValueError('Positional arguments are not supported')
return super().vformat(format_string, args, kwargs)
def check_unused_args(
self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]
) -> None:
def check_unused_args(self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> None:
extras = set(kwargs).difference(used_args)
if extras:
raise KeyError(f'Extra params passed: {extras}')

@@ -58,9 +56,7 @@ class PromptTemplate:
try:
return self.template.format(**prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {self._input_variables}) in the prompt template."
) from None
raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {self._input_variables}) in the prompt template.") from None
@click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS)
@@ -128,21 +124,15 @@ def cli(
if prompt_template_file and chat_template_file:
ctx.fail('prompt-template-file and chat-template-file are mutually exclusive.')
acceptable = set(openllm.CONFIG_MAPPING_NAMES.keys()) | set(
inflection.dasherize(name) for name in openllm.CONFIG_MAPPING_NAMES.keys()
)
acceptable = set(openllm.CONFIG_MAPPING_NAMES.keys()) | set(inflection.dasherize(name) for name in openllm.CONFIG_MAPPING_NAMES.keys())
if model_id in acceptable:
logger.warning(
'Using a default prompt from OpenLLM. Note that this prompt might not work for your intended usage.\n'
)
logger.warning('Using a default prompt from OpenLLM. Note that this prompt might not work for your intended usage.\n')
config = openllm.AutoConfig.for_model(model_id)
template = prompt_template_file.read() if prompt_template_file is not None else config.template
system_message = system_message or config.system_message
try:
formatted = (
PromptTemplate(template).with_options(system_message=system_message).format(instruction=prompt, **_memoized)
)
formatted = PromptTemplate(template).with_options(system_message=system_message).format(instruction=prompt, **_memoized)
except RuntimeError as err:
logger.debug('Exception caught while formatting prompt: %s', err)
ctx.fail(str(err))

@@ -159,21 +149,15 @@ def cli(
for architecture in config.architectures:
if architecture in openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE():
system_message = (
openllm.AutoConfig.infer_class_from_name(
openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture]
)
openllm.AutoConfig.infer_class_from_name(openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture])
.model_construct_env()
.system_message
)
break
else:
ctx.fail(
f'Failed to infer system message from model architecture: {config.architectures}. Please pass in --system-message'
)
ctx.fail(f'Failed to infer system message from model architecture: {config.architectures}. Please pass in --system-message')
messages = [{'role': 'system', 'content': system_message}, {'role': 'user', 'content': prompt}]
formatted = tokenizer.apply_chat_template(
messages, chat_template=chat_template_file, add_generation_prompt=add_generation_prompt, tokenize=False
)
formatted = tokenizer.apply_chat_template(messages, chat_template=chat_template_file, add_generation_prompt=add_generation_prompt, tokenize=False)
termui.echo(orjson.dumps({'prompt': formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white')
ctx.exit(0)
@@ -33,17 +33,12 @@ def cli(model_name: str | None) -> DictStrAny:
}
if model_name is not None:
ids_in_local_store = {
k: [
i
for i in v
if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)
]
k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)]
for k, v in ids_in_local_store.items()
}
ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
local_models = {
k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val]
for k, val in ids_in_local_store.items()
k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()
}
termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
return local_models
@@ -32,14 +32,7 @@ def load_notebook_metadata() -> DictStrAny:
@click.command('playground', context_settings=termui.CONTEXT_SETTINGS)
@click.argument('output-dir', default=None, required=False)
@click.option(
'--port',
envvar='JUPYTER_PORT',
show_envvar=True,
show_default=True,
default=8888,
help='Default port for Jupyter server',
)
@click.option('--port', envvar='JUPYTER_PORT', show_envvar=True, show_default=True, default=8888, help='Default port for Jupyter server')
@click.pass_context
def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
"""OpenLLM Playground.

@@ -60,9 +53,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
> This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"'
"""
if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available():
raise RuntimeError(
"Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'"
)
raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'")
metadata = load_notebook_metadata()
_temp_dir = False
if output_dir is None:

@@ -74,9 +65,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
termui.echo('The playground notebooks will be saved to: ' + os.path.abspath(output_dir), fg='blue')
for module in pkgutil.iter_modules(playground.__path__):
if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + '.ipynb')):
logger.debug(
'Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module'
)
logger.debug('Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module')
continue
if not isinstance(module.module_finder, importlib.machinery.FileFinder):
continue
@@ -25,14 +25,7 @@ class Level(enum.IntEnum):
@property
def color(self) -> str | None:
return {
Level.NOTSET: None,
Level.DEBUG: 'cyan',
Level.INFO: 'green',
Level.WARNING: 'yellow',
Level.ERROR: 'red',
Level.CRITICAL: 'red',
}[self]
return {Level.NOTSET: None, Level.DEBUG: 'cyan', Level.INFO: 'green', Level.WARNING: 'yellow', Level.ERROR: 'red', Level.CRITICAL: 'red'}[self]
@classmethod
def from_logging_level(cls, level: int) -> Level:

@@ -82,9 +75,5 @@ def echo(text: t.Any, fg: str | None = None, *, _with_style: bool = True, json:
COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
CONTEXT_SETTINGS: DictStrAny = {
'help_option_names': ['-h', '--help'],
'max_content_width': COLUMNS,
'token_normalize_func': inflection.underscore,
}
__all__ = ['COLUMNS', 'CONTEXT_SETTINGS', 'Level', 'critical', 'debug', 'echo', 'error', 'info', 'log', 'warning']
CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore}
__all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS', 'log', 'warning', 'error', 'critical', 'debug', 'info', 'Level']