ci: pre-commit autoupdate [pre-commit.ci] (#931)

* ci: pre-commit autoupdate [pre-commit.ci]

  updates:
  - [github.com/astral-sh/ruff-pre-commit: v0.2.2 → v0.3.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.2.2...v0.3.2)
  - [github.com/pre-commit/mirrors-eslint: v9.0.0-beta.0 → v9.0.0-beta.2](https://github.com/pre-commit/mirrors-eslint/compare/v9.0.0-beta.0...v9.0.0-beta.2)

* ci: auto fixes from pre-commit.ci

  For more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2026-06-11 18:09:52 -04:00 · 2024-03-15 03:46:28 -04:00
parent 7edbcf8a2d
commit 7b00c84c2a
69 changed files with 1298 additions and 366 deletions
--- a/openllm-python/src/openllm_cli/_factory.py
+++ b/openllm-python/src/openllm_cli/_factory.py
@@ -5,12 +5,25 @@ from bentoml_cli.utils import BentoMLCommandGroup
 from click import shell_completion as sc

 from openllm_core._configuration import LLMConfig
-from openllm_core._typing_compat import Concatenate, DictStrAny, LiteralBackend, LiteralSerialisation, ParamSpec, AnyCallable, get_literal_args
+from openllm_core._typing_compat import (
+  Concatenate,
+  DictStrAny,
+  LiteralBackend,
+  LiteralSerialisation,
+  ParamSpec,
+  AnyCallable,
+  get_literal_args,
+)
 from openllm_core.utils import DEBUG, compose, dantic, resolve_user_filepath


 class _OpenLLM_GenericInternalConfig(LLMConfig):
-  __config__ = {'name_type': 'lowercase', 'default_id': 'openllm/generic', 'model_ids': ['openllm/generic'], 'architecture': 'PreTrainedModel'}
+  __config__ = {
+    'name_type': 'lowercase',
+    'default_id': 'openllm/generic',
+    'model_ids': ['openllm/generic'],
+    'architecture': 'PreTrainedModel',
+  }

  class GenerationConfig:
    top_k: int = 15
@@ -37,11 +50,20 @@ def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete


 def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
-  return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]
+  return [
+    sc.CompletionItem(inflection.dasherize(it), help='Model')
+    for it in openllm.CONFIG_MAPPING
+    if it.startswith(incomplete)
+  ]


 def parse_config_options(
-  config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny
+  config: LLMConfig,
+  server_timeout: int,
+  workers_per_resource: float,
+  device: t.Tuple[str, ...] | None,
+  cors: bool,
+  environ: DictStrAny,
 ) -> DictStrAny:
  # TODO: Support amd.com/gpu on k8s
  _bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
@@ -56,14 +78,21 @@ def parse_config_options(
  if device:
    if len(device) > 1:
      _bentoml_config_options_opts.extend([
-        f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)
+        f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
+        for idx, dev in enumerate(device)
      ])
    else:
-      _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
+      _bentoml_config_options_opts.append(
+        f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]'
+      )
  if cors:
-    _bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
    _bentoml_config_options_opts.extend([
-      f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])
+      'api_server.http.cors.enabled=true',
+      'api_server.http.cors.access_control_allow_origins="*"',
+    ])
+    _bentoml_config_options_opts.extend([
+      f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"'
+      for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])
    ])
  _bentoml_config_options_env += ' ' if _bentoml_config_options_env else '' + ' '.join(_bentoml_config_options_opts)
  environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
@@ -142,7 +171,9 @@ def start_decorator(fn: FC) -> FC:
  return composed(fn)


-def parse_device_callback(_: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
+def parse_device_callback(
+  _: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None
+) -> t.Tuple[str, ...] | None:
  if value is None:
    return value
  el: t.Tuple[str, ...] = tuple(i for k in value for i in k)
@@ -161,13 +192,19 @@ _IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}
 def parse_serve_args() -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
  from bentoml_cli.cli import cli

-  group = cog.optgroup.group('Start a HTTP server options', help='Related to serving the model [synonymous to `bentoml serve-http`]')
+  group = cog.optgroup.group(
+    'Start a HTTP server options', help='Related to serving the model [synonymous to `bentoml serve-http`]'
+  )

  def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]:
    serve_command = cli.commands['serve']
    # The first variable is the argument bento
    # The last five is from BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS
-    serve_options = [p for p in serve_command.params[1 : -BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS] if p.name not in _IGNORED_OPTIONS]
+    serve_options = [
+      p
+      for p in serve_command.params[1 : -BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS]
+      if p.name not in _IGNORED_OPTIONS
+    ]
    for options in reversed(serve_options):
      attrs = options.to_info_dict()
      # we don't need param_type_name, since it should all be options
@@ -221,7 +258,13 @@ def adapter_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callab

 def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
-    '--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs
+    '--cors/--no-cors',
+    show_default=True,
+    default=False,
+    envvar='OPENLLM_CORS',
+    show_envvar=True,
+    help='Enable CORS for the server.',
+    **attrs,
  )(f)


@@ -275,7 +318,12 @@ def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[


 def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f)
+  return cli_argument(
+    'model_name',
+    type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]),
+    required=required,
+    **attrs,
+  )(f)


 def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
@@ -313,7 +361,9 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
  )(f)


-def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
+def workers_per_resource_option(
+  f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any
+) -> t.Callable[[FC], FC]:
  return cli_option(
    '--workers-per-resource',
    default=None,
@@ -381,7 +431,9 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va
      float(value)  # type: ignore[arg-type]
    except ValueError:
      raise click.BadParameter(
-        f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param
+        f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.",
+        ctx,
+        param,
      ) from None
    else:
      return value
--- a/openllm-python/src/openllm_cli/_sdk.py
+++ b/openllm-python/src/openllm_cli/_sdk.py
@@ -69,7 +69,10 @@ def _start(
  if timeout:
    args.extend(['--server-timeout', str(timeout)])
  if workers_per_resource:
-    args.extend(['--workers-per-resource', str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
+    args.extend([
+      '--workers-per-resource',
+      str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource,
+    ])
  if device and not os.environ.get('CUDA_VISIBLE_DEVICES'):
    args.extend(['--device', ','.join(device)])
  if quantize:
@@ -77,7 +80,11 @@ def _start(
  if cors:
    args.append('--cors')
  if adapter_map:
-    args.extend(list(itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])))
+    args.extend(
+      list(
+        itertools.chain.from_iterable([['--adapter-id', f"{k}{':' + v if v else ''}"] for k, v in adapter_map.items()])
+      )
+    )
  if additional_args:
    args.extend(additional_args)
  if __test__:
@@ -148,7 +155,9 @@ def _build(
    '--machine',
    '--quiet',
    '--serialisation',
-    first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'),
+    first_not_none(
+      serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
+    ),
  ]
  if quantize:
    args.extend(['--quantize', quantize])
@@ -165,7 +174,7 @@ def _build(
  if overwrite:
    args.append('--overwrite')
  if adapter_map:
-    args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()])
+    args.extend([f"--adapter-id={k}{':' + v if v is not None else ''}" for k, v in adapter_map.items()])
  if model_version:
    args.extend(['--model-version', model_version])
  if bento_version:
@@ -265,4 +274,4 @@ start, build, import_model, list_models = (
  codegen.gen_sdk(_import_model),
  codegen.gen_sdk(_list_models),
 )
-__all__ = ['start', 'build', 'import_model', 'list_models']
+__all__ = ['build', 'import_model', 'list_models', 'start']
--- a/openllm-python/src/openllm_cli/entrypoint.py
+++ b/openllm-python/src/openllm_cli/entrypoint.py
@@ -43,7 +43,15 @@ from openllm_core.utils import (
 )

 from . import termui
-from ._factory import FC, _AnyCallable, machine_option, model_name_argument, parse_config_options, start_decorator, optimization_decorator
+from ._factory import (
+  FC,
+  _AnyCallable,
+  machine_option,
+  model_name_argument,
+  parse_config_options,
+  start_decorator,
+  optimization_decorator,
+)

 if t.TYPE_CHECKING:
  import torch
@@ -95,12 +103,18 @@ def backend_warning(backend: LiteralBackend, build: bool = False) -> None:
        'vLLM is not available. Note that PyTorch backend is not as performant as vLLM and you should always consider using vLLM for production.'
      )
    if build:
-      logger.info("Tip: You can set '--backend vllm' to package your Bento with vLLM backend regardless if vLLM is available locally.")
+      logger.info(
+        "Tip: You can set '--backend vllm' to package your Bento with vLLM backend regardless if vLLM is available locally."
+      )


 class Extensions(click.MultiCommand):
  def list_commands(self, ctx: click.Context) -> list[str]:
-    return sorted([filename[:-3] for filename in os.listdir(_EXT_FOLDER) if filename.endswith('.py') and not filename.startswith('__')])
+    return sorted([
+      filename[:-3]
+      for filename in os.listdir(_EXT_FOLDER)
+      if filename.endswith('.py') and not filename.startswith('__')
+    ])

  def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
    try:
@@ -117,19 +131,41 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
  def common_params(f: t.Callable[P, t.Any]) -> t.Callable[[FC], FC]:
    # The following logics is similar to one of BentoMLCommandGroup
    @cog.optgroup.group(name='Global options', help='Shared globals options for all OpenLLM CLI.')  # type: ignore[misc]
-    @cog.optgroup.option('-q', '--quiet', envvar=QUIET_ENV_VAR, is_flag=True, default=False, help='Suppress all output.', show_envvar=True)
    @cog.optgroup.option(
-      '--debug', '--verbose', 'debug', envvar=DEBUG_ENV_VAR, is_flag=True, default=False, help='Print out debug logs.', show_envvar=True
+      '-q', '--quiet', envvar=QUIET_ENV_VAR, is_flag=True, default=False, help='Suppress all output.', show_envvar=True
    )
    @cog.optgroup.option(
-      '--do-not-track', is_flag=True, default=False, envvar=analytics.OPENLLM_DO_NOT_TRACK, help='Do not send usage info', show_envvar=True
+      '--debug',
+      '--verbose',
+      'debug',
+      envvar=DEBUG_ENV_VAR,
+      is_flag=True,
+      default=False,
+      help='Print out debug logs.',
+      show_envvar=True,
    )
    @cog.optgroup.option(
-      '--context', 'cloud_context', envvar='BENTOCLOUD_CONTEXT', type=click.STRING, default=None, help='BentoCloud context name.', show_envvar=True
+      '--do-not-track',
+      is_flag=True,
+      default=False,
+      envvar=analytics.OPENLLM_DO_NOT_TRACK,
+      help='Do not send usage info',
+      show_envvar=True,
+    )
+    @cog.optgroup.option(
+      '--context',
+      'cloud_context',
+      envvar='BENTOCLOUD_CONTEXT',
+      type=click.STRING,
+      default=None,
+      help='BentoCloud context name.',
+      show_envvar=True,
    )
    @click.pass_context
    @functools.wraps(f)
-    def wrapper(ctx: click.Context, quiet: bool, debug: bool, cloud_context: str | None, *args: P.args, **attrs: P.kwargs) -> t.Any:
+    def wrapper(
+      ctx: click.Context, quiet: bool, debug: bool, cloud_context: str | None, *args: P.args, **attrs: P.kwargs
+    ) -> t.Any:
      ctx.obj = GlobalOptions(cloud_context=cloud_context)
      if quiet:
        set_quiet_mode(True)
@@ -143,7 +179,9 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
    return wrapper

  @staticmethod
-  def usage_tracking(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[Concatenate[bool, P], t.Any]:
+  def usage_tracking(
+    func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any
+  ) -> t.Callable[Concatenate[bool, P], t.Any]:
    command_name = attrs.get('name', func.__name__)

    @functools.wraps(func)
@@ -202,7 +240,9 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
      _memo = getattr(wrapped, '__click_params__', None)
      if _memo is None:
        raise ValueError('Click command not register correctly.')
-      _object_setattr(wrapped, '__click_params__', _memo[-self.NUMBER_OF_COMMON_PARAMS :] + _memo[: -self.NUMBER_OF_COMMON_PARAMS])
+      _object_setattr(
+        wrapped, '__click_params__', _memo[-self.NUMBER_OF_COMMON_PARAMS :] + _memo[: -self.NUMBER_OF_COMMON_PARAMS]
+      )
      # NOTE: we need to call super of super to avoid conflict with BentoMLCommandGroup command setup
      cmd = super(BentoMLCommandGroup, self).command(*args, **kwargs)(wrapped)
      # NOTE: add aliases to a given commands if it is specified.
@@ -210,7 +250,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
        if not cmd.name:
          raise ValueError('name is required when aliases are available.')
        self._commands[cmd.name] = aliases
-        self._aliases.update({alias: cmd.name for alias in aliases})
+        self._aliases.update(dict.fromkeys(aliases, cmd.name))
      return cmd

    return decorator
@@ -277,7 +317,12 @@ def cli() -> None:
  """


-@cli.command(context_settings=termui.CONTEXT_SETTINGS, name='start', aliases=['start-http'], short_help='Start a LLMServer for any supported LLM.')
+@cli.command(
+  context_settings=termui.CONTEXT_SETTINGS,
+  name='start',
+  aliases=['start-http'],
+  short_help='Start a LLMServer for any supported LLM.',
+)
@click.argument('model_id', type=click.STRING, metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=True)
@click.option(
  '--model-id',
@@ -330,7 +375,9 @@ def start_command(
  ```
  """
  if backend == 'pt':
-    logger.warning('PyTorch backend is deprecated and will be removed in future releases. Make sure to use vLLM instead.')
+    logger.warning(
+      'PyTorch backend is deprecated and will be removed in future releases. Make sure to use vLLM instead.'
+    )
  if model_id in openllm.CONFIG_MAPPING:
    _model_name = model_id
    if deprecated_model_id is not None:
@@ -348,11 +395,17 @@ def start_command(

  from openllm.serialisation.transformers.weights import has_safetensors_weights

-  serialisation = first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy')
+  serialisation = first_not_none(
+    serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
+  )

  if serialisation == 'safetensors' and quantize is not None:
    logger.warning("'--quantize=%s' might not work with 'safetensors' serialisation format.", quantize)
-    logger.warning("Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.", model_id, serialisation)
+    logger.warning(
+      "Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.",
+      model_id,
+      serialisation,
+    )
    logger.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.")

  import torch
@@ -380,7 +433,9 @@ def start_command(
  config, server_attrs = llm.config.model_validate_click(**attrs)
  server_timeout = first_not_none(server_timeout, default=config['timeout'])
  server_attrs.update({'working_dir': pkg.source_locations('openllm'), 'timeout': server_timeout})
-  development = server_attrs.pop('development')  # XXX: currently, theres no development args in bentoml.Server. To be fixed upstream.
+  development = server_attrs.pop(
+    'development'
+  )  # XXX: currently, theres no development args in bentoml.Server. To be fixed upstream.
  server_attrs.setdefault('production', not development)

  start_env = process_environ(
@@ -410,8 +465,12 @@ def start_command(
  return config


-def process_environ(config, server_timeout, wpr, device, cors, model_id, adapter_map, serialisation, llm, use_current_env=True):
-  environ = parse_config_options(config, server_timeout, wpr, device, cors, os.environ.copy() if use_current_env else {})
+def process_environ(
+  config, server_timeout, wpr, device, cors, model_id, adapter_map, serialisation, llm, use_current_env=True
+):
+  environ = parse_config_options(
+    config, server_timeout, wpr, device, cors, os.environ.copy() if use_current_env else {}
+  )
  environ.update({
    'OPENLLM_MODEL_ID': model_id,
    'BENTOML_DEBUG': str(openllm.utils.get_debug_mode()),
@@ -456,7 +515,8 @@ def build_bento_instruction(llm, model_id, serialisation, adapter_map):
    cmd_name += f' --serialization {serialisation}'
  if adapter_map is not None:
    cmd_name += ' ' + ' '.join([
-      f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]
+      f'--adapter-id {s}'
+      for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]
    ])
  if not openllm.utils.get_quiet_mode():
    termui.info(f"🚀Tip: run '{cmd_name}' to create a BentoLLM for '{model_id}'")
@@ -491,8 +551,12 @@ def run_server(args, env, return_process=False) -> subprocess.Popen[bytes] | int
  if return_process:
    return process
  stop_event = threading.Event()
-  stdout, stderr = threading.Thread(target=handle, args=(process.stdout, stop_event)), threading.Thread(target=handle, args=(process.stderr, stop_event))
-  stdout.start(); stderr.start()  # noqa: E702
+  stdout, stderr = (
+    threading.Thread(target=handle, args=(process.stdout, stop_event)),
+    threading.Thread(target=handle, args=(process.stderr, stop_event)),
+  )
+  stdout.start()
+  stderr.start()  # noqa: E702

  try:
    process.wait()
@@ -507,9 +571,12 @@ def run_server(args, env, return_process=False) -> subprocess.Popen[bytes] | int
    raise
  finally:
    stop_event.set()
-    stdout.join(); stderr.join()  # noqa: E702
-    if process.poll() is not None: process.kill()
-    stdout.join(); stderr.join()  # noqa: E702
+    stdout.join()
+    stderr.join()  # noqa: E702
+    if process.poll() is not None:
+      process.kill()
+    stdout.join()
+    stderr.join()  # noqa: E702

  return process.returncode

@@ -597,7 +664,10 @@ def import_command(
    backend=backend,
    dtype=dtype,
    serialisation=t.cast(
-      LiteralSerialisation, first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy')
+      LiteralSerialisation,
+      first_not_none(
+        serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
+      ),
    ),
  )
  backend_warning(llm.__llm_backend__)
@@ -656,14 +726,21 @@ class BuildBentoOutput(t.TypedDict):
  metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]',
  help='Deprecated. Use positional argument instead.',
 )
-@click.option('--bento-version', type=str, default=None, help='Optional bento version for this BentoLLM. Default is the the model revision.')
+@click.option(
+  '--bento-version',
+  type=str,
+  default=None,
+  help='Optional bento version for this BentoLLM. Default is the the model revision.',
+)
@click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.')
@click.option(
  '--enable-features',
  multiple=True,
  nargs=1,
  metavar='FEATURE[,FEATURE]',
-  help='Enable additional features for building this LLM Bento. Available: {}'.format(', '.join(OPTIONAL_DEPENDENCIES)),
+  help='Enable additional features for building this LLM Bento. Available: {}'.format(
+    ', '.join(OPTIONAL_DEPENDENCIES)
+  ),
 )
@optimization_decorator
@click.option(
@@ -674,7 +751,12 @@ class BuildBentoOutput(t.TypedDict):
  help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed.",
 )
@click.option('--build-ctx', help='Build context. This is required if --adapter-id uses relative path', default=None)
-@click.option('--dockerfile-template', default=None, type=click.File(), help='Optional custom dockerfile template to be used with this BentoLLM.')
+@click.option(
+  '--dockerfile-template',
+  default=None,
+  type=click.File(),
+  help='Optional custom dockerfile template to be used with this BentoLLM.',
+)
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options')  # type: ignore[misc]
@cog.optgroup.option(
  '--containerize',
@@ -767,7 +849,9 @@ def build_command(
  state = ItemState.NOT_FOUND

  if backend == 'pt':
-    logger.warning("PyTorch backend is deprecated and will be removed from the next releases. Will set default backend to 'vllm' instead.")
+    logger.warning(
+      "PyTorch backend is deprecated and will be removed from the next releases. Will set default backend to 'vllm' instead."
+    )

  llm = openllm.LLM(
    model_id=model_id,
@@ -777,7 +861,9 @@ def build_command(
    dtype=dtype,
    max_model_len=max_model_len,
    gpu_memory_utilization=gpu_memory_utilization,
-    serialisation=first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'),
+    serialisation=first_not_none(
+      serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
+    ),
    _eager=False,
  )
  if llm.__llm_backend__ not in llm.config['backend']:
@@ -789,7 +875,9 @@ def build_command(
    model = openllm.serialisation.import_model(llm, trust_remote_code=llm.trust_remote_code)
  llm._tag = model.tag

-  os.environ.update(**process_environ(llm.config, llm.config['timeout'], 1.0, None, True, llm.model_id, None, llm._serialisation, llm))
+  os.environ.update(
+    **process_environ(llm.config, llm.config['timeout'], 1.0, None, True, llm.model_id, None, llm._serialisation, llm)
+  )

  try:
    assert llm.bentomodel  # HACK: call it here to patch correct tag with revision and everything
@@ -856,7 +944,11 @@ def build_command(

  def get_current_bentocloud_context() -> str | None:
    try:
-      context = cloud_config.get_context(ctx.obj.cloud_context) if ctx.obj.cloud_context else cloud_config.get_current_context()
+      context = (
+        cloud_config.get_context(ctx.obj.cloud_context)
+        if ctx.obj.cloud_context
+        else cloud_config.get_current_context()
+      )
      return context.name
    except Exception:
      return None
@@ -880,7 +972,9 @@ def build_command(
    tag=str(bento_tag),
    backend=llm.__llm_backend__,
    instructions=[
-      DeploymentInstruction.from_content(type='bentocloud', instr="☁️  Push to BentoCloud with 'bentoml push':\n    $ {cmd}", cmd=push_cmd),
+      DeploymentInstruction.from_content(
+        type='bentocloud', instr="☁️  Push to BentoCloud with 'bentoml push':\n    $ {cmd}", cmd=push_cmd
+      ),
      DeploymentInstruction.from_content(
        type='container',
        instr="🐳 Container BentoLLM with 'bentoml containerize':\n    $ {cmd}",
@@ -906,7 +1000,9 @@ def build_command(
        termui.echo(f"  * {instruction['content']}\n", nl=False)

  if push:
-    BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push)
+    BentoMLContainer.bentocloud_client.get().push_bento(
+      bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push
+    )
  elif containerize:
    container_backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker'))
    try:
@@ -946,7 +1042,8 @@ def models_command(**_: t.Any) -> dict[t.LiteralString, ModelItem]:
      architecture=config.__openllm_architecture__,
      example_id=random.choice(config.__openllm_model_ids__),
      supported_backends=config.__openllm_backend__,
-      installation='pip install ' + (f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config.__openllm_requirements__ else 'openllm'),
+      installation='pip install '
+      + (f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config.__openllm_requirements__ else 'openllm'),
      items=[
        str(md.tag)
        for md in bentoml.models.list()
@@ -965,7 +1062,13 @@ def models_command(**_: t.Any) -> dict[t.LiteralString, ModelItem]:
@cli.command()
@model_name_argument(required=False)
@click.option('-y', '--yes', '--assume-yes', is_flag=True, help='Skip confirmation when deleting a specific model')
-@click.option('--include-bentos/--no-include-bentos', is_flag=True, hidden=True, default=True, help='Whether to also include pruning bentos.')
+@click.option(
+  '--include-bentos/--no-include-bentos',
+  is_flag=True,
+  hidden=True,
+  default=True,
+  help='Whether to also include pruning bentos.',
+)
@inject
@click.pass_context
 def prune_command(
@@ -982,24 +1085,32 @@ def prune_command(
  If a model type is passed, then only prune models for that given model type.
  """
  available: list[tuple[bentoml.Model | bentoml.Bento, ModelStore | BentoStore]] = [
-    (m, model_store) for m in bentoml.models.list() if 'framework' in m.info.labels and m.info.labels['framework'] == 'openllm'
+    (m, model_store)
+    for m in bentoml.models.list()
+    if 'framework' in m.info.labels and m.info.labels['framework'] == 'openllm'
  ]
  if model_name is not None:
    available = [
-      (m, store) for m, store in available if 'model_name' in m.info.labels and m.info.labels['model_name'] == inflection.underscore(model_name)
+      (m, store)
+      for m, store in available
+      if 'model_name' in m.info.labels and m.info.labels['model_name'] == inflection.underscore(model_name)
    ] + [
      (b, bento_store)
      for b in bentoml.bentos.list()
      if 'start_name' in b.info.labels and b.info.labels['start_name'] == inflection.underscore(model_name)
    ]
  else:
-    available += [(b, bento_store) for b in bentoml.bentos.list() if '_type' in b.info.labels and '_framework' in b.info.labels]
+    available += [
+      (b, bento_store) for b in bentoml.bentos.list() if '_type' in b.info.labels and '_framework' in b.info.labels
+    ]

  for store_item, store in available:
    if yes:
      delete_confirmed = True
    else:
-      delete_confirmed = click.confirm(f"delete {'model' if isinstance(store, ModelStore) else 'bento'} {store_item.tag}?")
+      delete_confirmed = click.confirm(
+        f"delete {'model' if isinstance(store, ModelStore) else 'bento'} {store_item.tag}?"
+      )
    if delete_confirmed:
      store.delete(store_item.tag)
      termui.warning(f"{store_item} deleted from {'model' if isinstance(store, ModelStore) else 'bento'} store.")
@@ -1046,8 +1157,17 @@ def shared_client_options(f: _AnyCallable | None = None) -> t.Callable[[FC], FC]

@cli.command()
@shared_client_options
-@click.option('--server-type', type=click.Choice(['grpc', 'http']), help='Server type', default='http', show_default=True, hidden=True)
-@click.option('--stream/--no-stream', type=click.BOOL, is_flag=True, default=True, help='Whether to stream the response.')
+@click.option(
+  '--server-type',
+  type=click.Choice(['grpc', 'http']),
+  help='Server type',
+  default='http',
+  show_default=True,
+  hidden=True,
+)
+@click.option(
+  '--stream/--no-stream', type=click.BOOL, is_flag=True, default=True, help='Whether to stream the response.'
+)
@click.argument('prompt', type=click.STRING)
@click.option(
  '--sampling-params',
--- a/openllm-python/src/openllm_cli/extension/dive_bentos.py
+++ b/openllm-python/src/openllm_cli/extension/dive_bentos.py
@@ -21,7 +21,9 @@ if t.TYPE_CHECKING:
@machine_option
@click.pass_context
@inject
-def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
+def cli(
+  ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
+) -> str | None:
  """Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path)."""
  try:
    bentomodel = _bento_store.get(bento)
--- a/openllm-python/src/openllm_cli/extension/get_containerfile.py
+++ b/openllm-python/src/openllm_cli/extension/get_containerfile.py
@@ -17,7 +17,9 @@ if t.TYPE_CHECKING:
  from bentoml._internal.bento import BentoStore


-@click.command('get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.')
+@click.command(
+  'get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.'
+)
@click.argument('bento', type=str, shell_complete=bento_complete_envvar)
@click.pass_context
@inject
--- a/openllm-python/src/openllm_cli/extension/get_prompt.py
+++ b/openllm-python/src/openllm_cli/extension/get_prompt.py
@@ -22,7 +22,9 @@ class PromptFormatter(string.Formatter):
      raise ValueError('Positional arguments are not supported')
    return super().vformat(format_string, args, kwargs)

-  def check_unused_args(self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> None:
+  def check_unused_args(
+    self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]
+  ) -> None:
    extras = set(kwargs).difference(used_args)
    if extras:
      raise KeyError(f'Extra params passed: {extras}')
@@ -56,7 +58,9 @@ class PromptTemplate:
    try:
      return self.template.format(**prompt_variables)
    except KeyError as e:
-      raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {self._input_variables}) in the prompt template.") from None
+      raise RuntimeError(
+        f"Missing variable '{e.args[0]}' (required: {self._input_variables}) in the prompt template."
+      ) from None


@click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS)
@@ -124,15 +128,21 @@ def cli(
  if prompt_template_file and chat_template_file:
    ctx.fail('prompt-template-file and chat-template-file are mutually exclusive.')

-  acceptable = set(openllm.CONFIG_MAPPING_NAMES.keys()) | set(inflection.dasherize(name) for name in openllm.CONFIG_MAPPING_NAMES.keys())
+  acceptable = set(openllm.CONFIG_MAPPING_NAMES.keys()) | set(
+    inflection.dasherize(name) for name in openllm.CONFIG_MAPPING_NAMES.keys()
+  )
  if model_id in acceptable:
-    logger.warning('Using a default prompt from OpenLLM. Note that this prompt might not work for your intended usage.\n')
+    logger.warning(
+      'Using a default prompt from OpenLLM. Note that this prompt might not work for your intended usage.\n'
+    )
    config = openllm.AutoConfig.for_model(model_id)
    template = prompt_template_file.read() if prompt_template_file is not None else config.template
    system_message = system_message or config.system_message

    try:
-      formatted = PromptTemplate(template).with_options(system_message=system_message).format(instruction=prompt, **_memoized)
+      formatted = (
+        PromptTemplate(template).with_options(system_message=system_message).format(instruction=prompt, **_memoized)
+      )
    except RuntimeError as err:
      logger.debug('Exception caught while formatting prompt: %s', err)
      ctx.fail(str(err))
@@ -149,15 +159,21 @@ def cli(
      for architecture in config.architectures:
        if architecture in openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE():
          system_message = (
-            openllm.AutoConfig.infer_class_from_name(openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture])
+            openllm.AutoConfig.infer_class_from_name(
+              openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture]
+            )
            .model_construct_env()
            .system_message
          )
          break
      else:
-        ctx.fail(f'Failed to infer system message from model architecture: {config.architectures}. Please pass in --system-message')
+        ctx.fail(
+          f'Failed to infer system message from model architecture: {config.architectures}. Please pass in --system-message'
+        )
    messages = [{'role': 'system', 'content': system_message}, {'role': 'user', 'content': prompt}]
-    formatted = tokenizer.apply_chat_template(messages, chat_template=chat_template_file, add_generation_prompt=add_generation_prompt, tokenize=False)
+    formatted = tokenizer.apply_chat_template(
+      messages, chat_template=chat_template_file, add_generation_prompt=add_generation_prompt, tokenize=False
+    )

  termui.echo(orjson.dumps({'prompt': formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white')
  ctx.exit(0)
--- a/openllm-python/src/openllm_cli/extension/list_models.py
+++ b/openllm-python/src/openllm_cli/extension/list_models.py
@@ -33,12 +33,17 @@ def cli(model_name: str | None) -> DictStrAny:
  }
  if model_name is not None:
    ids_in_local_store = {
-      k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)]
+      k: [
+        i
+        for i in v
+        if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)
+      ]
      for k, v in ids_in_local_store.items()
    }
  ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
  local_models = {
-    k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()
+    k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val]
+    for k, val in ids_in_local_store.items()
  }
  termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
  return local_models
--- a/openllm-python/src/openllm_cli/extension/playground.py
+++ b/openllm-python/src/openllm_cli/extension/playground.py
@@ -32,7 +32,14 @@ def load_notebook_metadata() -> DictStrAny:

@click.command('playground', context_settings=termui.CONTEXT_SETTINGS)
@click.argument('output-dir', default=None, required=False)
-@click.option('--port', envvar='JUPYTER_PORT', show_envvar=True, show_default=True, default=8888, help='Default port for Jupyter server')
+@click.option(
+  '--port',
+  envvar='JUPYTER_PORT',
+  show_envvar=True,
+  show_default=True,
+  default=8888,
+  help='Default port for Jupyter server',
+)
@click.pass_context
 def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  """OpenLLM Playground.
@@ -53,7 +60,9 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  > This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"'
  """
  if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available():
-    raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'")
+    raise RuntimeError(
+      "Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'"
+    )
  metadata = load_notebook_metadata()
  _temp_dir = False
  if output_dir is None:
@@ -65,7 +74,9 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  termui.echo('The playground notebooks will be saved to: ' + os.path.abspath(output_dir), fg='blue')
  for module in pkgutil.iter_modules(playground.__path__):
    if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + '.ipynb')):
-      logger.debug('Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module')
+      logger.debug(
+        'Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module'
+      )
      continue
    if not isinstance(module.module_finder, importlib.machinery.FileFinder):
      continue
--- a/openllm-python/src/openllm_cli/termui.py
+++ b/openllm-python/src/openllm_cli/termui.py
@@ -25,7 +25,14 @@ class Level(enum.IntEnum):

  @property
  def color(self) -> str | None:
-    return {Level.NOTSET: None, Level.DEBUG: 'cyan', Level.INFO: 'green', Level.WARNING: 'yellow', Level.ERROR: 'red', Level.CRITICAL: 'red'}[self]
+    return {
+      Level.NOTSET: None,
+      Level.DEBUG: 'cyan',
+      Level.INFO: 'green',
+      Level.WARNING: 'yellow',
+      Level.ERROR: 'red',
+      Level.CRITICAL: 'red',
+    }[self]

  @classmethod
  def from_logging_level(cls, level: int) -> Level:
@@ -75,5 +82,9 @@ def echo(text: t.Any, fg: str | None = None, *, _with_style: bool = True, json:


 COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
-CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore}
-__all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS', 'log', 'warning', 'error', 'critical', 'debug', 'info', 'Level']
+CONTEXT_SETTINGS: DictStrAny = {
+  'help_option_names': ['-h', '--help'],
+  'max_content_width': COLUMNS,
+  'token_normalize_func': inflection.underscore,
+}
+__all__ = ['COLUMNS', 'CONTEXT_SETTINGS', 'Level', 'critical', 'debug', 'echo', 'error', 'info', 'log', 'warning']