style: google

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2026-04-25 01:19:27 -04:00 · 2023-08-30 13:52:00 -04:00
parent e2ba6a92a6
commit b545ad2ad1
98 changed files with 3514 additions and 2094 deletions
--- a/openllm-python/src/openllm/cli/_factory.py
+++ b/openllm-python/src/openllm/cli/_factory.py
@@ -40,27 +40,46 @@ _AnyCallable = t.Callable[..., t.Any]
 FC = t.TypeVar('FC', bound=t.Union[_AnyCallable, click.Command])

 def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
-  return [sc.CompletionItem(str(it.tag), help='Bento') for it in bentoml.list() if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})]
+  return [
+      sc.CompletionItem(str(it.tag), help='Bento')
+      for it in bentoml.list()
+      if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})
+  ]

 def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
-  return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]
+  return [
+      sc.CompletionItem(inflection.dasherize(it), help='Model')
+      for it in openllm.CONFIG_MAPPING
+      if it.startswith(incomplete)
+  ]

-def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
+def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float,
+                         device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
  # TODO: Support amd.com/gpu on k8s
  _bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
  _bentoml_config_options_opts = [
-      'tracing.sample_rate=1.0',
-      f'api_server.traffic.timeout={server_timeout}',
+      'tracing.sample_rate=1.0', f'api_server.traffic.timeout={server_timeout}',
      f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
      f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}'
  ]
  if device:
-    if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
-    else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
-  _bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
+    if len(device) > 1:
+      _bentoml_config_options_opts.extend([
+          f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
+          for idx, dev in enumerate(device)
+      ])
+    else:
+      _bentoml_config_options_opts.append(
+          f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
+  _bentoml_config_options_opts.append(
+      f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
  if cors:
-    _bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
-    _bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])])
+    _bentoml_config_options_opts.extend(
+        ['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
+    _bentoml_config_options_opts.extend([
+        f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"'
+        for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])
+    ])
  _bentoml_config_options_env += (' ' if _bentoml_config_options_env else '') + ' '.join(_bentoml_config_options_opts)  # parenthesised: a bare conditional swallowed the join, dropping every option when the env var was pre-set
  environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
  if DEBUG: logger.debug('Setting BENTOML_CONFIG_OPTIONS=%s', _bentoml_config_options_env)
@@ -82,7 +101,10 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...
    ctx.params[_adapter_mapping_key][adapter_id] = adapter_name[0] if len(adapter_name) > 0 else None
  return None

-def start_command_factory(group: click.Group, model: str, _context_settings: DictStrAny | None = None, _serve_grpc: bool = False) -> click.Command:
+def start_command_factory(group: click.Group,
+                          model: str,
+                          _context_settings: DictStrAny | None = None,
+                          _serve_grpc: bool = False) -> click.Command:
  llm_config = openllm.AutoConfig.for_model(model)
  command_attrs: DictStrAny = dict(
      name=llm_config['model_name'],
@@ -113,37 +135,29 @@ Available official model_id(s): [default: {llm_config['default_id']}]
  if llm_config['requires_gpu'] and openllm.utils.device_count() < 1:
    # NOTE: The model requires GPU, therefore we will return a dummy command
    command_attrs.update({
-        'short_help': '(Disabled because there is no GPU available)', 'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
+        'short_help':
+            '(Disabled because there is no GPU available)',
+        'help':
+            f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
    })
    return noop_command(group, llm_config, _serve_grpc, **command_attrs)

  @group.command(**command_attrs)
  @start_decorator(llm_config, serve_grpc=_serve_grpc)
  @click.pass_context
-  def start_cmd(
-      ctx: click.Context,
-      /,
-      server_timeout: int,
-      model_id: str | None,
-      model_version: str | None,
-      workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString,
-      device: t.Tuple[str, ...],
-      quantize: t.Literal['int8', 'int4', 'gptq'] | None,
-      bettertransformer: bool | None,
-      runtime: t.Literal['ggml', 'transformers'],
-      fast: bool,
-      serialisation_format: t.Literal['safetensors', 'legacy'],
-      cors: bool,
-      adapter_id: str | None,
-      return_process: bool,
-      **attrs: t.Any,
-  ) -> LLMConfig | subprocess.Popen[bytes]:
+  def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None,
+                workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...],
+                quantize: t.Literal['int8', 'int4', 'gptq'] | None, bettertransformer: bool | None,
+                runtime: t.Literal['ggml', 'transformers'], fast: bool, serialisation_format: t.Literal['safetensors',
+                                                                                                        'legacy'],
+                cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
+               ) -> LLMConfig | subprocess.Popen[bytes]:
    fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
-    if serialisation_format == 'safetensors' and quantize is not None and os.environ.get('OPENLLM_SERIALIZATION_WARNING', str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
+    if serialisation_format == 'safetensors' and quantize is not None and os.environ.get(
+        'OPENLLM_SERIALIZATION_WARNING', str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
      termui.echo(
          f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
-          fg='yellow'
-      )
+          fg='yellow')
    adapter_map: dict[str, str | None] | None = attrs.pop(_adapter_mapping_key, None)
    config, server_attrs = llm_config.model_validate_click(**attrs)
    server_timeout = openllm.utils.first_not_none(server_timeout, default=config['timeout'])
@@ -169,16 +183,21 @@ Available official model_id(s): [default: {llm_config['default_id']}]
      wpr = float(wpr)

    # Create a new model env to work with the envvar during CLI invocation
-    env = openllm.utils.EnvVarMixin(
-        config['model_name'], config.default_implementation(), model_id=model_id or config['default_id'], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime
-    )
+    env = openllm.utils.EnvVarMixin(config['model_name'],
+                                    config.default_implementation(),
+                                    model_id=model_id or config['default_id'],
+                                    bettertransformer=bettertransformer,
+                                    quantize=quantize,
+                                    runtime=runtime)
    prerequisite_check(ctx, config, quantize, adapter_map, int(1 / wpr))

    # NOTE: This is to set current configuration
    start_env = os.environ.copy()
    start_env = parse_config_options(config, server_timeout, wpr, device, cors, start_env)
    if fast:
-      termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg='yellow')
+      termui.echo(
+          f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'",
+          fg='yellow')

    start_env.update({
        'OPENLLM_MODEL': model,
@@ -194,18 +213,28 @@ Available official model_id(s): [default: {llm_config['default_id']}]
    if bettertransformer is not None: start_env[env.bettertransformer] = str(env['bettertransformer_value'])
    if quantize is not None: start_env[env.quantize] = str(t.cast(str, env['quantize_value']))

-    llm = openllm.utils.infer_auto_class(env['framework_value']).for_model(
-        model, model_id=start_env[env.model_id], model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format
-    )
+    llm = openllm.utils.infer_auto_class(env['framework_value']).for_model(model,
+                                                                           model_id=start_env[env.model_id],
+                                                                           model_version=model_version,
+                                                                           llm_config=config,
+                                                                           ensure_available=not fast,
+                                                                           adapter_map=adapter_map,
+                                                                           serialisation=serialisation_format)
    start_env.update({env.config: llm.config.model_dump_json().decode()})

-    server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer('_service:svc', **server_attrs)
+    server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer(
+        '_service:svc', **server_attrs)
    openllm.utils.analytics.track_start_init(llm.config)

    def next_step(model_name: str, adapter_map: DictStrAny | None) -> None:
      cmd_name = f'openllm build {model_name}'
-      if adapter_map is not None: cmd_name += ' ' + ' '.join([f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
-      if not openllm.utils.get_quiet_mode(): termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue')
+      if adapter_map is not None:
+        cmd_name += ' ' + ' '.join([
+            f'--adapter-id {s}'
+            for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]
+        ])
+      if not openllm.utils.get_quiet_mode():
+        termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue')

    if return_process:
      server.start(env=start_env, text=True)
@@ -239,30 +268,35 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, *

  return noop

-def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None, adapter_map: dict[str, str | None] | None, num_workers: int) -> None:
-  if adapter_map and not openllm.utils.is_peft_available(): ctx.fail("Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'")
+def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None,
+                       adapter_map: dict[str, str | None] | None, num_workers: int) -> None:
+  if adapter_map and not openllm.utils.is_peft_available():
+    ctx.fail(
+        "Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'")
  if quantize and llm_config.default_implementation() == 'vllm':
-    ctx.fail(f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization.")
+    ctx.fail(
+        f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization."
+    )
  requirements = llm_config['requirements']
  if requirements is not None and len(requirements) > 0:
    missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
-    if len(missing_requirements) > 0: termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')
+    if len(missing_requirements) > 0:
+      termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')

 def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
+
  def wrapper(fn: FC) -> t.Callable[[FC], FC]:
    composed = openllm.utils.compose(
-        llm_config.to_click_options,
-        _http_server_args if not serve_grpc else _grpc_server_args,
-        cog.optgroup.group('General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
-        model_id_option(factory=cog.optgroup, model_env=llm_config['env']),
-        model_version_option(factory=cog.optgroup),
-        cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
-        workers_per_resource_option(factory=cog.optgroup),
-        cors_option(factory=cog.optgroup),
-        fast_option(factory=cog.optgroup),
+        llm_config.to_click_options, _http_server_args if not serve_grpc else _grpc_server_args,
        cog.optgroup.group(
-            'LLM Optimization Options',
-            help='''Optimization related options.
+            'General LLM Options',
+            help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
+        model_id_option(factory=cog.optgroup, model_env=llm_config['env']), model_version_option(factory=cog.optgroup),
+        cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
+        workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup),
+        fast_option(factory=cog.optgroup),
+        cog.optgroup.group('LLM Optimization Options',
+                           help='''Optimization related options.

            OpenLLM supports running model with [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/),
            k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
@@ -272,23 +306,23 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
            - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
            - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
            ''',
-        ),
-        cog.optgroup.option(
-            '--device',
-            type=openllm.utils.dantic.CUDA,
-            multiple=True,
-            envvar='CUDA_VISIBLE_DEVICES',
-            callback=parse_device_callback,
-            help=f"Assign GPU devices (if available) for {llm_config['model_name']}.",
-            show_envvar=True
-        ),
-        cog.optgroup.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.'),
+                          ),
+        cog.optgroup.option('--device',
+                            type=openllm.utils.dantic.CUDA,
+                            multiple=True,
+                            envvar='CUDA_VISIBLE_DEVICES',
+                            callback=parse_device_callback,
+                            help=f"Assign GPU devices (if available) for {llm_config['model_name']}.",
+                            show_envvar=True),
+        cog.optgroup.option('--runtime',
+                            type=click.Choice(['ggml', 'transformers']),
+                            default='transformers',
+                            help='The runtime to use for the given model. Default is transformers.'),
        quantize_option(factory=cog.optgroup, model_env=llm_config['env']),
        bettertransformer_option(factory=cog.optgroup, model_env=llm_config['env']),
        serialisation_option(factory=cog.optgroup),
-        cog.optgroup.group(
-            'Fine-tuning related options',
-            help='''\
+        cog.optgroup.group('Fine-tuning related options',
+                           help='''\
    Note that the argument `--adapter-id` can accept the following format:

    - `--adapter-id /path/to/adapter` (local adapter)
@@ -302,23 +336,22 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
    $ openllm start opt --adapter-id /path/to/adapter_dir --adapter-id remote/adapter:eng_lora

    ```
-    '''
-        ),
-        cog.optgroup.option(
-            '--adapter-id',
-            default=None,
-            help='Optional name or path for given LoRA adapter' + f" to wrap '{llm_config['model_name']}'",
-            multiple=True,
-            callback=_id_callback,
-            metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]'
-        ),
+    '''),
+        cog.optgroup.option('--adapter-id',
+                            default=None,
+                            help='Optional name or path for given LoRA adapter' +
+                            f" to wrap '{llm_config['model_name']}'",
+                            multiple=True,
+                            callback=_id_callback,
+                            metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]'),
        click.option('--return-process', is_flag=True, default=False, help='Internal use only.', hidden=True),
    )
    return composed(fn)

  return wrapper

-def parse_device_callback(ctx: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
+def parse_device_callback(ctx: click.Context, param: click.Parameter,
+                          value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
  if value is None: return value
  if not isinstance(value, tuple): ctx.fail(f'{param} only accept multiple values, not {type(value)} (value: {value})')
  el: t.Tuple[str, ...] = tuple(i for k in value for i in k)
@@ -337,14 +370,18 @@ def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]

  command = 'serve' if not serve_grpc else 'serve-grpc'
  group = cog.optgroup.group(
-      f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options", help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",
+      f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options",
+      help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",
  )

  def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]:
    serve_command = cli.commands[command]
    # The first variable is the argument bento
    # The last five is from BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS
-    serve_options = [p for p in serve_command.params[1:-BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS] if p.name not in _IGNORED_OPTIONS]
+    serve_options = [
+        p for p in serve_command.params[1:-BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS]
+        if p.name not in _IGNORED_OPTIONS
+    ]
    for options in reversed(serve_options):
      attrs = options.to_info_dict()
      # we don't need param_type_name, since it should all be options
@@ -381,73 +418,90 @@ def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC |
 cli_option = functools.partial(_click_factory_type, attr='option')
 cli_argument = functools.partial(_click_factory_type, attr='argument')

-def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput = 'pretty', **attrs: t.Any) -> t.Callable[[FC], FC]:
+def output_option(f: _AnyCallable | None = None,
+                  *,
+                  default_value: LiteralOutput = 'pretty',
+                  **attrs: t.Any) -> t.Callable[[FC], FC]:
  output = ['json', 'pretty', 'porcelain']

  def complete_output_var(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]:
    return [CompletionItem(it) for it in output]

-  return cli_option(
-      '-o',
-      '--output',
-      'output',
-      type=click.Choice(output),
-      default=default_value,
-      help='Showing output type.',
-      show_default=True,
-      envvar='OPENLLM_OUTPUT',
-      show_envvar=True,
-      shell_complete=complete_output_var,
-      **attrs
-  )(f)
+  return cli_option('-o',
+                    '--output',
+                    'output',
+                    type=click.Choice(output),
+                    default=default_value,
+                    help='Showing output type.',
+                    show_default=True,
+                    envvar='OPENLLM_OUTPUT',
+                    show_envvar=True,
+                    shell_complete=complete_output_var,
+                    **attrs)(f)

 def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--fast/--no-fast',
-      show_default=True,
-      default=False,
-      envvar='OPENLLM_USE_LOCAL_LATEST',
-      show_envvar=True,
-      help='''Whether to skip checking if models is already in store.
+  return cli_option('--fast/--no-fast',
+                    show_default=True,
+                    default=False,
+                    envvar='OPENLLM_USE_LOCAL_LATEST',
+                    show_envvar=True,
+                    help='''Whether to skip checking if models is already in store.

                                                                                                          This is useful if you already downloaded or setup the model beforehand.
                                                                                                          ''',
-      **attrs
-  )(f)
+                    **attrs)(f)

 def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option('--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs)(f)
+  return cli_option('--cors/--no-cors',
+                    show_default=True,
+                    default=False,
+                    envvar='OPENLLM_CORS',
+                    show_envvar=True,
+                    help='Enable CORS for the server.',
+                    **attrs)(f)

 def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f)

-def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--model-id',
-      type=click.STRING,
-      default=None,
-      envvar=model_env.model_id if model_env is not None else None,
-      show_envvar=model_env is not None,
-      help='Optional model_id name or path for (fine-tune) weight.',
-      **attrs
-  )(f)
+def model_id_option(f: _AnyCallable | None = None,
+                    *,
+                    model_env: openllm.utils.EnvVarMixin | None = None,
+                    **attrs: t.Any) -> t.Callable[[FC], FC]:
+  return cli_option('--model-id',
+                    type=click.STRING,
+                    default=None,
+                    envvar=model_env.model_id if model_env is not None else None,
+                    show_envvar=model_env is not None,
+                    help='Optional model_id name or path for (fine-tune) weight.',
+                    **attrs)(f)

 def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option('--model-version', type=click.STRING, default=None, help='Optional model version to save for this model. It will be inferred automatically from model-id.', **attrs)(f)
+  return cli_option(
+      '--model-version',
+      type=click.STRING,
+      default=None,
+      help='Optional model version to save for this model. It will be inferred automatically from model-id.',
+      **attrs)(f)

 def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f)
+  return cli_argument('model_name',
+                      type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]),
+                      required=required,
+                      **attrs)(f)

-def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--quantise',
-      '--quantize',
-      'quantize',
-      type=click.Choice(['int8', 'int4', 'gptq']),
-      default=None,
-      envvar=model_env.quantize if model_env is not None else None,
-      show_envvar=model_env is not None,
-      help='''Dynamic quantization for running this LLM.
+def quantize_option(f: _AnyCallable | None = None,
+                    *,
+                    build: bool = False,
+                    model_env: openllm.utils.EnvVarMixin | None = None,
+                    **attrs: t.Any) -> t.Callable[[FC], FC]:
+  return cli_option('--quantise',
+                    '--quantize',
+                    'quantize',
+                    type=click.Choice(['int8', 'int4', 'gptq']),
+                    default=None,
+                    envvar=model_env.quantize if model_env is not None else None,
+                    show_envvar=model_env is not None,
+                    help='''Dynamic quantization for running this LLM.

      The following quantization strategies are supported:

@@ -461,17 +515,18 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model
      ''' + ('''
      > [!NOTE] that this will set the mode for serving within deployment.''' if build else '') + '''
      > [!NOTE] that quantization are currently only available in *PyTorch* models.''',
-      **attrs
-  )(f)
+                    **attrs)(f)

-def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--workers-per-resource',
-      default=None,
-      callback=workers_per_resource_callback,
-      type=str,
-      required=False,
-      help='''Number of workers per resource assigned.
+def workers_per_resource_option(f: _AnyCallable | None = None,
+                                *,
+                                build: bool = False,
+                                **attrs: t.Any) -> t.Callable[[FC], FC]:
+  return cli_option('--workers-per-resource',
+                    default=None,
+                    callback=workers_per_resource_callback,
+                    type=str,
+                    required=False,
+                    help='''Number of workers per resource assigned.

      See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
      for more information. By default, this is set to 1.
@@ -481,38 +536,37 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool =
      - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.

      - ``conserved``: This will determine the number of available GPU resources, and only assign one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
-      ''' + (
-          """\n
+      ''' + ("""\n
      > [!NOTE] The workers value passed into 'build' will determine how the LLM can
      > be provisioned in Kubernetes as well as in standalone container. This will
-      > ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ''
-      ),
-      **attrs
-  )(f)
+      > ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ''),
+                    **attrs)(f)

-def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
+def bettertransformer_option(f: _AnyCallable | None = None,
+                             *,
+                             build: bool = False,
+                             model_env: openllm.utils.EnvVarMixin | None = None,
+                             **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
      '--bettertransformer',
      is_flag=True,
      default=None,
      envvar=model_env.bettertransformer if model_env is not None else None,
      show_envvar=model_env is not None,
-      help='Apply FasterTransformer wrapper to serve model. This will applies during serving time.'
-      if not build else 'Set default environment variable whether to serve this model with FasterTransformer in build time.',
-      **attrs
-  )(f)
+      help='Apply FasterTransformer wrapper to serve model. This will applies during serving time.' if not build else
+      'Set default environment variable whether to serve this model with FasterTransformer in build time.',
+      **attrs)(f)

 def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--serialisation',
-      '--serialization',
-      'serialisation_format',
-      type=click.Choice(['safetensors', 'legacy']),
-      default='safetensors',
-      show_default=True,
-      show_envvar=True,
-      envvar='OPENLLM_SERIALIZATION',
-      help='''Serialisation format for save/load LLM.
+  return cli_option('--serialisation',
+                    '--serialization',
+                    'serialisation_format',
+                    type=click.Choice(['safetensors', 'legacy']),
+                    default='safetensors',
+                    show_default=True,
+                    show_envvar=True,
+                    envvar='OPENLLM_SERIALIZATION',
+                    help='''Serialisation format for save/load LLM.

      Currently the following strategies are supported:

@@ -529,28 +583,25 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal

      > [!NOTE] that GGML format is working in progress.
      ''',
-      **attrs
-  )(f)
+                    **attrs)(f)

 def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--container-registry',
-      'container_registry',
-      type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
-      default='ecr',
-      show_default=True,
-      show_envvar=True,
-      envvar='OPENLLM_CONTAINER_REGISTRY',
-      callback=container_registry_callback,
-      help='''The default container registry to get the base image for building BentoLLM.
+  return cli_option('--container-registry',
+                    'container_registry',
+                    type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
+                    default='ecr',
+                    show_default=True,
+                    show_envvar=True,
+                    envvar='OPENLLM_CONTAINER_REGISTRY',
+                    callback=container_registry_callback,
+                    help='''The default container registry to get the base image for building BentoLLM.

      Currently, it supports 'ecr', 'ghcr.io', 'docker.io'

      \b
      > [!NOTE] that in order to build the base image, you will need a GPUs to compile custom kernel. See ``openllm ext build-base-container`` for more information.
      ''',
-      **attrs
-  )(f)
+                    **attrs)(f)

 _wpr_strategies = {'round_robin', 'conserved'}

@@ -562,11 +613,14 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va
    try:
      float(value)  # type: ignore[arg-type]
    except ValueError:
-      raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param) from None
+      raise click.BadParameter(
+          f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.",
+          ctx, param) from None
    else:
      return value

 def container_registry_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None:
  if value is None: return value
-  if value not in openllm.bundle.supported_registries: raise click.BadParameter(f'Value must be one of {openllm.bundle.supported_registries}', ctx, param)
+  if value not in openllm.bundle.supported_registries:
+    raise click.BadParameter(f'Value must be one of {openllm.bundle.supported_registries}', ctx, param)
  return value
--- a/openllm-python/src/openllm/cli/_sdk.py
+++ b/openllm-python/src/openllm/cli/_sdk.py
@@ -30,25 +30,23 @@ if t.TYPE_CHECKING:

 logger = logging.getLogger(__name__)

-def _start(
-    model_name: str,
-    /,
-    *,
-    model_id: str | None = None,
-    timeout: int = 30,
-    workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
-    device: tuple[str, ...] | t.Literal['all'] | None = None,
-    quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-    bettertransformer: bool | None = None,
-    runtime: t.Literal['ggml', 'transformers'] = 'transformers',
-    adapter_map: dict[LiteralString, str | None] | None = None,
-    framework: LiteralRuntime | None = None,
-    additional_args: list[str] | None = None,
-    cors: bool = False,
-    _serve_grpc: bool = False,
-    __test__: bool = False,
-    **_: t.Any
-) -> LLMConfig | subprocess.Popen[bytes]:
+def _start(model_name: str,
+           /,
+           *,
+           model_id: str | None = None,
+           timeout: int = 30,
+           workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
+           device: tuple[str, ...] | t.Literal['all'] | None = None,
+           quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+           bettertransformer: bool | None = None,
+           runtime: t.Literal['ggml', 'transformers'] = 'transformers',
+           adapter_map: dict[LiteralString, str | None] | None = None,
+           framework: LiteralRuntime | None = None,
+           additional_args: list[str] | None = None,
+           cors: bool = False,
+           _serve_grpc: bool = False,
+           __test__: bool = False,
+           **_: t.Any) -> LLMConfig | subprocess.Popen[bytes]:
  """Python API to start a LLM server. These provides one-to-one mapping to CLI arguments.

  For all additional arguments, pass it as string to ``additional_args``. For example, if you want to
@@ -91,58 +89,66 @@ def _start(
  from .entrypoint import start_command
  from .entrypoint import start_grpc_command
  llm_config = openllm.AutoConfig.for_model(model_name)
-  _ModelEnv = openllm_core.utils.EnvVarMixin(
-      model_name,
-      openllm_core.utils.first_not_none(framework, default=llm_config.default_implementation()),
-      model_id=model_id,
-      bettertransformer=bettertransformer,
-      quantize=quantize,
-      runtime=runtime
-  )
+  _ModelEnv = openllm_core.utils.EnvVarMixin(model_name,
+                                             openllm_core.utils.first_not_none(
+                                                 framework, default=llm_config.default_implementation()),
+                                             model_id=model_id,
+                                             bettertransformer=bettertransformer,
+                                             quantize=quantize,
+                                             runtime=runtime)
  os.environ[_ModelEnv.framework] = _ModelEnv['framework_value']

  args: list[str] = ['--runtime', runtime]
  if model_id: args.extend(['--model-id', model_id])
  if timeout: args.extend(['--server-timeout', str(timeout)])
-  if workers_per_resource: args.extend(['--workers-per-resource', str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
+  if workers_per_resource:
+    args.extend([
+        '--workers-per-resource',
+        str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource
+    ])
  if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)])
-  if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
+  if quantize and bettertransformer:
+    raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
  if quantize: args.extend(['--quantize', str(quantize)])
  elif bettertransformer: args.append('--bettertransformer')
  if cors: args.append('--cors')
-  if adapter_map: args.extend(list(itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])))
+  if adapter_map:
+    args.extend(
+        list(
+            itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()
+                                          ])))
  if additional_args: args.extend(additional_args)
  if __test__: args.append('--return-process')

-  return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(
-      args=args if len(args) > 0 else None, standalone_mode=False
-  )
+  return start_command_factory(start_command if not _serve_grpc else start_grpc_command,
+                               model_name,
+                               _context_settings=termui.CONTEXT_SETTINGS,
+                               _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None,
+                                                             standalone_mode=False)

@inject
-def _build(
-    model_name: str,
-    /,
-    *,
-    model_id: str | None = None,
-    model_version: str | None = None,
-    bento_version: str | None = None,
-    quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-    bettertransformer: bool | None = None,
-    adapter_map: dict[str, str | None] | None = None,
-    build_ctx: str | None = None,
-    enable_features: tuple[str, ...] | None = None,
-    workers_per_resource: float | None = None,
-    runtime: t.Literal['ggml', 'transformers'] = 'transformers',
-    dockerfile_template: str | None = None,
-    overwrite: bool = False,
-    container_registry: LiteralContainerRegistry | None = None,
-    container_version_strategy: LiteralContainerVersionStrategy | None = None,
-    push: bool = False,
-    containerize: bool = False,
-    serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
-    additional_args: list[str] | None = None,
-    bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
-) -> bentoml.Bento:
+def _build(model_name: str,
+           /,
+           *,
+           model_id: str | None = None,
+           model_version: str | None = None,
+           bento_version: str | None = None,
+           quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+           bettertransformer: bool | None = None,
+           adapter_map: dict[str, str | None] | None = None,
+           build_ctx: str | None = None,
+           enable_features: tuple[str, ...] | None = None,
+           workers_per_resource: float | None = None,
+           runtime: t.Literal['ggml', 'transformers'] = 'transformers',
+           dockerfile_template: str | None = None,
+           overwrite: bool = False,
+           container_registry: LiteralContainerRegistry | None = None,
+           container_version_strategy: LiteralContainerVersionStrategy | None = None,
+           push: bool = False,
+           containerize: bool = False,
+           serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
+           additional_args: list[str] | None = None,
+           bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
  """Package a LLM into a Bento.

  The LLM will be built into a BentoService with the following structure:
@@ -192,8 +198,12 @@ def _build(
  Returns:
      ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
  """
-  args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--runtime', runtime, '--serialisation', serialisation_format]
-  if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
+  args: list[str] = [
+      sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--runtime', runtime, '--serialisation',
+      serialisation_format
+  ]
+  if quantize and bettertransformer:
+    raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
  if quantize: args.extend(['--quantize', quantize])
  if bettertransformer: args.append('--bettertransformer')
  if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
@@ -221,21 +231,21 @@ def _build(
    raise OpenLLMException(str(e)) from None
  matched = re.match(r'__tag__:([^:\n]+:[^:\n]+)$', output.decode('utf-8').strip())
  if matched is None:
-    raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.")
+    raise ValueError(
+        f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub."
+    )
  return bentoml.get(matched.group(1), _bento_store=bento_store)

-def _import_model(
-    model_name: str,
-    /,
-    *,
-    model_id: str | None = None,
-    model_version: str | None = None,
-    runtime: t.Literal['ggml', 'transformers'] = 'transformers',
-    implementation: LiteralRuntime = 'pt',
-    quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-    serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors',
-    additional_args: t.Sequence[str] | None = None
-) -> bentoml.Model:
+def _import_model(model_name: str,
+                  /,
+                  *,
+                  model_id: str | None = None,
+                  model_version: str | None = None,
+                  runtime: t.Literal['ggml', 'transformers'] = 'transformers',
+                  implementation: LiteralRuntime = 'pt',
+                  quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+                  serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors',
+                  additional_args: t.Sequence[str] | None = None) -> bentoml.Model:
  """Import a LLM into local store.

  > [!NOTE]
@@ -267,7 +277,10 @@ def _import_model(
      ``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
  """
  from .entrypoint import import_command
-  args = [model_name, '--runtime', runtime, '--implementation', implementation, '--machine', '--serialisation', serialisation_format,]
+  args = [
+      model_name, '--runtime', runtime, '--implementation', implementation, '--machine', '--serialisation',
+      serialisation_format,
+  ]
  if model_id is not None: args.append(model_id)
  if model_version is not None: args.extend(['--model-version', str(model_version)])
  if additional_args is not None: args.extend(additional_args)
@@ -278,5 +291,9 @@ def _list_models() -> dict[str, t.Any]:
  '''List all available models within the local store.'''
  from .entrypoint import models_command
  return models_command.main(args=['-o', 'json', '--show-available', '--machine'], standalone_mode=False)
-start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
+
+start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(
+    _start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(
+        _start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(
+            _import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
 __all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models']
--- a/openllm-python/src/openllm/cli/extension/build_base_container.py
+++ b/openllm-python/src/openllm/cli/extension/build_base_container.py
@@ -14,10 +14,9 @@ if t.TYPE_CHECKING:
  from openllm_core._typing_compat import LiteralContainerRegistry
  from openllm_core._typing_compat import LiteralContainerVersionStrategy

-@click.command(
-    'build_base_container',
-    context_settings=termui.CONTEXT_SETTINGS,
-    help='''Base image builder for BentoLLM.
+@click.command('build_base_container',
+               context_settings=termui.CONTEXT_SETTINGS,
+               help='''Base image builder for BentoLLM.

                By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
                Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.
@@ -27,13 +26,16 @@ if t.TYPE_CHECKING:
                This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels.

                Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
-                '''
-)
+                ''')
@container_registry_option
-@click.option('--version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='nightly', help='Version strategy to use for tagging the image.')
+@click.option('--version-strategy',
+              type=click.Choice(['release', 'latest', 'nightly']),
+              default='nightly',
+              help='Version strategy to use for tagging the image.')
@click.option('--push/--no-push', help='Whether to push to remote repository', is_flag=True, default=False)
@machine_option
-def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
+def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None,
+        version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
  mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine)
  if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
  return mapping
--- a/openllm-python/src/openllm/cli/extension/dive_bentos.py
+++ b/openllm-python/src/openllm/cli/extension/dive_bentos.py
@@ -24,14 +24,19 @@ if t.TYPE_CHECKING:
@machine_option
@click.pass_context
@inject
-def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
+def cli(ctx: click.Context,
+        bento: str,
+        machine: bool,
+        _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
  '''Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path).'''
  try:
    bentomodel = _bento_store.get(bento)
  except bentoml.exceptions.NotFound:
    ctx.fail(f'Bento {bento} not found. Make sure to call `openllm build` first.')
  if 'bundler' not in bentomodel.info.labels or bentomodel.info.labels['bundler'] != 'openllm.bundle':
-    ctx.fail(f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness.")
+    ctx.fail(
+        f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness."
+    )
  if machine: return bentomodel.path
  # copy and paste this into a new shell
  if psutil.WINDOWS: subprocess.check_call([shutil.which('dir') or 'dir'], cwd=bentomodel.path)
--- a/openllm-python/src/openllm/cli/extension/get_containerfile.py
+++ b/openllm-python/src/openllm/cli/extension/get_containerfile.py
@@ -19,7 +19,9 @@ from openllm_core.utils import bentoml_cattr
 if t.TYPE_CHECKING:
  from bentoml._internal.bento import BentoStore

-@click.command('get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.')
+@click.command('get_containerfile',
+               context_settings=termui.CONTEXT_SETTINGS,
+               help='Return Containerfile of any given Bento.')
@click.argument('bento', type=str, shell_complete=bento_complete_envvar)
@click.pass_context
@inject
@@ -39,7 +41,13 @@ def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[Bento
    # NOTE: if users specify a dockerfile_template, we will
    # save it to /env/docker/Dockerfile.template. This is necessary
    # for the reconstruction of the Dockerfile.
-    if 'dockerfile_template' in docker_attrs and docker_attrs['dockerfile_template'] is not None: docker_attrs['dockerfile_template'] = 'env/docker/Dockerfile.template'
-    doc = generate_containerfile(docker=DockerOptions(**docker_attrs), build_ctx=bentomodel.path, conda=options.conda, bento_fs=bentomodel._fs, enable_buildkit=True, add_header=True)
+    if 'dockerfile_template' in docker_attrs and docker_attrs['dockerfile_template'] is not None:
+      docker_attrs['dockerfile_template'] = 'env/docker/Dockerfile.template'
+    doc = generate_containerfile(docker=DockerOptions(**docker_attrs),
+                                 build_ctx=bentomodel.path,
+                                 conda=options.conda,
+                                 bento_fs=bentomodel._fs,
+                                 enable_buildkit=True,
+                                 add_header=True)
    termui.echo(doc, fg='white')
  return bentomodel.path
--- a/openllm-python/src/openllm/cli/extension/get_prompt.py
+++ b/openllm-python/src/openllm/cli/extension/get_prompt.py
@@ -18,41 +18,51 @@ from openllm_core._prompt import process_prompt
 LiteralOutput = t.Literal['json', 'pretty', 'porcelain']

@click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS)
-@click.argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar)
+@click.argument('model_name',
+                type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]),
+                shell_complete=model_complete_envvar)
@click.argument('prompt', type=click.STRING)
@output_option
@click.option('--format', type=click.STRING, default=None)
@machine_option
-@click.option(
-    '--opt',
-    help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)",
-    required=False,
-    multiple=True,
-    callback=opt_callback,
-    metavar='ARG=VALUE[,ARG=VALUE]'
-)
+@click.option('--opt',
+              help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)",
+              required=False,
+              multiple=True,
+              callback=opt_callback,
+              metavar='ARG=VALUE[,ARG=VALUE]')
@click.pass_context
-def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
+def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool,
+        _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
  '''Get the default prompt used by OpenLLM.'''
  module = openllm.utils.EnvVarMixin(model_name).module
  _memoized = {k: v[0] for k, v in _memoized.items() if v}
  try:
    template = getattr(module, 'DEFAULT_PROMPT_TEMPLATE', None)
    prompt_mapping = getattr(module, 'PROMPT_MAPPING', None)
-    if template is None: raise click.BadArgumentUsage(f'model {model_name} does not have a default prompt template') from None
+    if template is None:
+      raise click.BadArgumentUsage(f'model {model_name} does not have a default prompt template') from None
    if callable(template):
      if format is None:
-        if not hasattr(module, 'PROMPT_MAPPING') or module.PROMPT_MAPPING is None: raise RuntimeError('Failed to find prompt mapping while DEFAULT_PROMPT_TEMPLATE is a function.')
-        raise click.BadOptionUsage('format', f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})")
-      if prompt_mapping is None: raise click.BadArgumentUsage(f'Failed to fine prompt mapping while the default prompt for {model_name} is a callable.') from None
-      if format not in prompt_mapping: raise click.BadOptionUsage('format', f'Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})')
+        if not hasattr(module, 'PROMPT_MAPPING') or module.PROMPT_MAPPING is None:
+          raise RuntimeError('Failed to find prompt mapping while DEFAULT_PROMPT_TEMPLATE is a function.')
+        raise click.BadOptionUsage(
+            'format',
+            f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})")
+      if prompt_mapping is None:
+        raise click.BadArgumentUsage(
+            f'Failed to fine prompt mapping while the default prompt for {model_name} is a callable.') from None
+      if format not in prompt_mapping:
+        raise click.BadOptionUsage(
+            'format', f'Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})')
      _prompt_template = template(format)
    else:
      _prompt_template = template
    fully_formatted = process_prompt(prompt, _prompt_template, True, **_memoized)
    if machine: return repr(fully_formatted)
    elif output == 'porcelain': termui.echo(repr(fully_formatted), fg='white')
-    elif output == 'json': termui.echo(orjson.dumps({'prompt': fully_formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white')
+    elif output == 'json':
+      termui.echo(orjson.dumps({'prompt': fully_formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white')
    else:
      termui.echo(f'== Prompt for {model_name} ==\n', fg='magenta')
      termui.echo(fully_formatted, fg='white')
--- a/openllm-python/src/openllm/cli/extension/list_bentos.py
+++ b/openllm-python/src/openllm/cli/extension/list_bentos.py
@@ -19,23 +19,27 @@ def cli(ctx: click.Context, output: LiteralOutput) -> None:
  '''List available bentos built by OpenLLM.'''
  mapping = {
      k: [{
-          'tag': str(b.tag),
-          'size': human_readable_size(openllm.utils.calc_dir_size(b.path)),
+          'tag':
+              str(b.tag),
+          'size':
+              human_readable_size(openllm.utils.calc_dir_size(b.path)),
          'models': [{
-              'tag': str(m.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
+              'tag': str(m.tag),
+              'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
          } for m in (bentoml.models.get(_.tag) for _ in b.info.models)]
-      } for b in tuple(i for i in bentoml.list() if all(k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k] for k in tuple(
-          inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()
-      )
+      } for b in tuple(i for i in bentoml.list() if all(
+          k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k
+         ] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
  }
  mapping = {k: v for k, v in mapping.items() if v}
  if output == 'pretty':
    import tabulate
    tabulate.PRESERVE_WHITESPACE = True
-    termui.echo(
-        tabulate.tabulate([(k, i['tag'], i['size'], [_['tag'] for _ in i['models']]) for k, v in mapping.items() for i in v], tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size', 'Models']),
-        fg='white'
-    )
+    termui.echo(tabulate.tabulate(
+        [(k, i['tag'], i['size'], [_['tag'] for _ in i['models']]) for k, v in mapping.items() for i in v],
+        tablefmt='fancy_grid',
+        headers=['LLM', 'Tag', 'Size', 'Models']),
+                fg='white')
  else:
    termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
  ctx.exit(0)
--- a/openllm-python/src/openllm/cli/extension/list_models.py
+++ b/openllm-python/src/openllm/cli/extension/list_models.py
@@ -25,17 +25,33 @@ def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
  '''This is equivalent to openllm models --show-available less the nice table.'''
  models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
  ids_in_local_store = {
-      k: [i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k
-          ] for k in models
+      k: [
+          i for i in bentoml.models.list() if 'framework' in i.info.labels and
+          i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k
+      ] for k in models
  }
  if model_name is not None:
-    ids_in_local_store = {k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items()}
+    ids_in_local_store = {
+        k: [
+            i
+            for i in v
+            if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)
+        ] for k, v in ids_in_local_store.items()
+    }
  ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
-  local_models = {k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()}
+  local_models = {
+      k: [{
+          'tag': str(i.tag),
+          'size': human_readable_size(openllm.utils.calc_dir_size(i.path))
+      } for i in val] for k, val in ids_in_local_store.items()
+  }
  if output == 'pretty':
    import tabulate
    tabulate.PRESERVE_WHITESPACE = True
-    termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v], tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size']), fg='white')
+    termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v],
+                                  tablefmt='fancy_grid',
+                                  headers=['LLM', 'Tag', 'Size']),
+                fg='white')
  else:
    termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
  return local_models
--- a/openllm-python/src/openllm/cli/extension/playground.py
+++ b/openllm-python/src/openllm/cli/extension/playground.py
@@ -28,12 +28,18 @@ logger = logging.getLogger(__name__)
 def load_notebook_metadata() -> DictStrAny:
  with open(os.path.join(os.path.dirname(playground.__file__), '_meta.yml'), 'r') as f:
    content = yaml.safe_load(f)
-  if not all('description' in k for k in content.values()): raise ValueError("Invalid metadata file. All entries must have a 'description' key.")
+  if not all('description' in k for k in content.values()):
+    raise ValueError("Invalid metadata file. All entries must have a 'description' key.")
  return content

@click.command('playground', context_settings=termui.CONTEXT_SETTINGS)
@click.argument('output-dir', default=None, required=False)
-@click.option('--port', envvar='JUPYTER_PORT', show_envvar=True, show_default=True, default=8888, help='Default port for Jupyter server')
+@click.option('--port',
+              envvar='JUPYTER_PORT',
+              show_envvar=True,
+              show_default=True,
+              default=8888,
+              help='Default port for Jupyter server')
@click.pass_context
 def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  """OpenLLM Playground.
@@ -54,7 +60,9 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  > This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"'
  """
  if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available():
-    raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'")
+    raise RuntimeError(
+        "Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'"
+    )
  metadata = load_notebook_metadata()
  _temp_dir = False
  if output_dir is None:
@@ -66,7 +74,8 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  termui.echo('The playground notebooks will be saved to: ' + os.path.abspath(output_dir), fg='blue')
  for module in pkgutil.iter_modules(playground.__path__):
    if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + '.ipynb')):
-      logger.debug('Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module')
+      logger.debug('Skipping: %s (%s)', module.name,
+                   'File already exists' if not module.ispkg else f'{module.name} is a module')
      continue
    if not isinstance(module.module_finder, importlib.machinery.FileFinder): continue
    termui.echo('Generating notebook for: ' + module.name, fg='magenta')
@@ -75,7 +84,10 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
    f.cells.insert(0, markdown_cell)
    jupytext.write(f, os.path.join(output_dir, module.name + '.ipynb'), fmt='notebook')
  try:
-    subprocess.check_output([sys.executable, '-m', 'jupyter', 'notebook', '--notebook-dir', output_dir, '--port', str(port), '--no-browser', '--debug'])
+    subprocess.check_output([
+        sys.executable, '-m', 'jupyter', 'notebook', '--notebook-dir', output_dir, '--port',
+        str(port), '--no-browser', '--debug'
+    ])
  except subprocess.CalledProcessError as e:
    termui.echo(e.output, fg='red')
    raise click.ClickException(f'Failed to start a jupyter server:\n{e}') from None
--- a/openllm-python/src/openllm/cli/termui.py
+++ b/openllm-python/src/openllm/cli/termui.py
@@ -12,8 +12,13 @@ if t.TYPE_CHECKING:

 def echo(text: t.Any, fg: str = 'green', _with_style: bool = True, **attrs: t.Any) -> None:
  attrs['fg'] = fg if not openllm.utils.get_debug_mode() else None
-  if not openllm.utils.get_quiet_mode(): t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(text, **attrs)
+  if not openllm.utils.get_quiet_mode():
+    t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(text, **attrs)

 COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
-CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore}
+CONTEXT_SETTINGS: DictStrAny = {
+    'help_option_names': ['-h', '--help'],
+    'max_content_width': COLUMNS,
+    'token_normalize_func': inflection.underscore
+}
 __all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS']