chore(style): synchronized style across packages [skip ci]

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2026-06-11 18:09:52 -04:00 · 2023-08-23 08:46:22 -04:00
parent bbd9aa7646
commit 787ce1b3b6
124 changed files with 2775 additions and 2771 deletions
--- a/openllm-python/src/openllm/cli/__init__.py
+++ b/openllm-python/src/openllm/cli/__init__.py
@@ -1,4 +1,4 @@
-"""OpenLLM CLI.
+'''OpenLLM CLI.

 For more information see ``openllm -h``.
-"""
+'''
--- a/openllm-python/src/openllm/cli/_factory.py
+++ b/openllm-python/src/openllm/cli/_factory.py
@@ -13,21 +13,21 @@ if t.TYPE_CHECKING:
  from openllm_core._configuration import LLMConfig
 logger = logging.getLogger(__name__)

-P = ParamSpec("P")
-LiteralOutput = t.Literal["json", "pretty", "porcelain"]
+P = ParamSpec('P')
+LiteralOutput = t.Literal['json', 'pretty', 'porcelain']

 _AnyCallable = t.Callable[..., t.Any]
-FC = t.TypeVar("FC", bound=t.Union[_AnyCallable, click.Command])
+FC = t.TypeVar('FC', bound=t.Union[_AnyCallable, click.Command])
 def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
-  return [sc.CompletionItem(str(it.tag), help="Bento") for it in bentoml.list() if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {"start_name", "bundler"})]
+  return [sc.CompletionItem(str(it.tag), help='Bento') for it in bentoml.list() if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})]
 def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
-  return [sc.CompletionItem(inflection.dasherize(it), help="Model") for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]
+  return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]
 def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
  # TODO: Support amd.com/gpu on k8s
-  _bentoml_config_options_env = environ.pop("BENTOML_CONFIG_OPTIONS", "")
+  _bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
  _bentoml_config_options_opts = [
-      "tracing.sample_rate=1.0",
-      f"api_server.traffic.timeout={server_timeout}",
+      'tracing.sample_rate=1.0',
+      f'api_server.traffic.timeout={server_timeout}',
      f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
      f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}'
  ]
@@ -36,18 +36,18 @@ def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_res
    else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
  _bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
  if cors:
-    _bentoml_config_options_opts.extend(["api_server.http.cors.enabled=true", 'api_server.http.cors.access_control_allow_origins="*"'])
-    _bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(["GET", "OPTIONS", "POST", "HEAD", "PUT"])])
-  _bentoml_config_options_env += " " if _bentoml_config_options_env else "" + " ".join(_bentoml_config_options_opts)
-  environ["BENTOML_CONFIG_OPTIONS"] = _bentoml_config_options_env
-  if DEBUG: logger.debug("Setting BENTOML_CONFIG_OPTIONS=%s", _bentoml_config_options_env)
+    _bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
+    _bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])])
+  _bentoml_config_options_env += ' ' if _bentoml_config_options_env else '' + ' '.join(_bentoml_config_options_opts)
+  environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
+  if DEBUG: logger.debug('Setting BENTOML_CONFIG_OPTIONS=%s', _bentoml_config_options_env)
  return environ
-_adapter_mapping_key = "adapter_map"
+_adapter_mapping_key = 'adapter_map'
 def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...] | None) -> None:
  if not value: return None
  if _adapter_mapping_key not in ctx.params: ctx.params[_adapter_mapping_key] = {}
  for v in value:
-    adapter_id, *adapter_name = v.rsplit(":", maxsplit=1)
+    adapter_id, *adapter_name = v.rsplit(':', maxsplit=1)
    # try to resolve the full path if users pass in relative,
    # currently only support one level of resolve path with current directory
    try:
@@ -59,11 +59,11 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...
 def start_command_factory(group: click.Group, model: str, _context_settings: DictStrAny | None = None, _serve_grpc: bool = False) -> click.Command:
  llm_config = openllm.AutoConfig.for_model(model)
  command_attrs: DictStrAny = dict(
-      name=llm_config["model_name"],
+      name=llm_config['model_name'],
      context_settings=_context_settings or termui.CONTEXT_SETTINGS,
      short_help=f"Start a LLMServer for '{model}'",
-      aliases=[llm_config["start_name"]] if llm_config["name_type"] == "dasherize" else None,
-      help=f"""\
+      aliases=[llm_config['start_name']] if llm_config['name_type'] == 'dasherize' else None,
+      help=f'''\
 {llm_config['env'].start_docstring}

 \b
@@ -81,13 +81,13 @@ Available official model_id(s): [default: {llm_config['default_id']}]

 \b
 {orjson.dumps(llm_config['model_ids'], option=orjson.OPT_INDENT_2).decode()}
-""",
+''',
  )

-  if llm_config["requires_gpu"] and openllm.utils.device_count() < 1:
+  if llm_config['requires_gpu'] and openllm.utils.device_count() < 1:
    # NOTE: The model requires GPU, therefore we will return a dummy command
    command_attrs.update({
-        "short_help": "(Disabled because there is no GPU available)", "help": f"{model} is currently not available to run on your local machine because it requires GPU for inference."
+        'short_help': '(Disabled because there is no GPU available)', 'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
    })
    return noop_command(group, llm_config, _serve_grpc, **command_attrs)

@@ -100,39 +100,39 @@ Available official model_id(s): [default: {llm_config['default_id']}]
      server_timeout: int,
      model_id: str | None,
      model_version: str | None,
-      workers_per_resource: t.Literal["conserved", "round_robin"] | LiteralString,
+      workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString,
      device: t.Tuple[str, ...],
-      quantize: t.Literal["int8", "int4", "gptq"] | None,
+      quantize: t.Literal['int8', 'int4', 'gptq'] | None,
      bettertransformer: bool | None,
-      runtime: t.Literal["ggml", "transformers"],
+      runtime: t.Literal['ggml', 'transformers'],
      fast: bool,
-      serialisation_format: t.Literal["safetensors", "legacy"],
+      serialisation_format: t.Literal['safetensors', 'legacy'],
      cors: bool,
      adapter_id: str | None,
      return_process: bool,
      **attrs: t.Any,
  ) -> LLMConfig | subprocess.Popen[bytes]:
    fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
-    if serialisation_format == "safetensors" and quantize is not None and os.environ.get("OPENLLM_SERIALIZATION_WARNING", str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
+    if serialisation_format == 'safetensors' and quantize is not None and os.environ.get('OPENLLM_SERIALIZATION_WARNING', str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
      termui.echo(
          f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
-          fg="yellow"
+          fg='yellow'
      )
    adapter_map: dict[str, str | None] | None = attrs.pop(_adapter_mapping_key, None)
    config, server_attrs = llm_config.model_validate_click(**attrs)
-    server_timeout = openllm.utils.first_not_none(server_timeout, default=config["timeout"])
-    server_attrs.update({"working_dir": os.path.dirname(os.path.dirname(__file__)), "timeout": server_timeout})
-    if _serve_grpc: server_attrs["grpc_protocol_version"] = "v1"
+    server_timeout = openllm.utils.first_not_none(server_timeout, default=config['timeout'])
+    server_attrs.update({'working_dir': os.path.dirname(os.path.dirname(__file__)), 'timeout': server_timeout})
+    if _serve_grpc: server_attrs['grpc_protocol_version'] = 'v1'
    # NOTE: currently, theres no development args in bentoml.Server. To be fixed upstream.
-    development = server_attrs.pop("development")
-    server_attrs.setdefault("production", not development)
-    wpr = openllm.utils.first_not_none(workers_per_resource, default=config["workers_per_resource"])
+    development = server_attrs.pop('development')
+    server_attrs.setdefault('production', not development)
+    wpr = openllm.utils.first_not_none(workers_per_resource, default=config['workers_per_resource'])

    if isinstance(wpr, str):
-      if wpr == "round_robin": wpr = 1.0
-      elif wpr == "conserved":
+      if wpr == 'round_robin': wpr = 1.0
+      elif wpr == 'conserved':
        if device and openllm.utils.device_count() == 0:
-          termui.echo("--device will have no effect as there is no GPUs available", fg="yellow")
+          termui.echo('--device will have no effect as there is no GPUs available', fg='yellow')
          wpr = 1.0
        else:
          available_gpu = len(device) if device else openllm.utils.device_count()
@@ -144,7 +144,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]

    # Create a new model env to work with the envvar during CLI invocation
    env = openllm.utils.EnvVarMixin(
-        config["model_name"], config.default_implementation(), model_id=model_id or config["default_id"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime
+        config['model_name'], config.default_implementation(), model_id=model_id or config['default_id'], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime
    )
    prerequisite_check(ctx, config, quantize, adapter_map, int(1 / wpr))

@@ -152,38 +152,38 @@ Available official model_id(s): [default: {llm_config['default_id']}]
    start_env = os.environ.copy()
    start_env = parse_config_options(config, server_timeout, wpr, device, cors, start_env)
    if fast:
-      termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg="yellow")
+      termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg='yellow')

    start_env.update({
-        "OPENLLM_MODEL": model,
-        "BENTOML_DEBUG": str(openllm.utils.get_debug_mode()),
-        "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()),
-        "OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(),
-        "OPENLLM_SERIALIZATION": serialisation_format,
-        env.runtime: env["runtime_value"],
-        env.framework: env["framework_value"]
+        'OPENLLM_MODEL': model,
+        'BENTOML_DEBUG': str(openllm.utils.get_debug_mode()),
+        'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()),
+        'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(),
+        'OPENLLM_SERIALIZATION': serialisation_format,
+        env.runtime: env['runtime_value'],
+        env.framework: env['framework_value']
    })
-    if env["model_id_value"]: start_env[env.model_id] = str(env["model_id_value"])
+    if env['model_id_value']: start_env[env.model_id] = str(env['model_id_value'])
    # NOTE: quantize and bettertransformer value is already assigned within env
-    if bettertransformer is not None: start_env[env.bettertransformer] = str(env["bettertransformer_value"])
-    if quantize is not None: start_env[env.quantize] = str(t.cast(str, env["quantize_value"]))
+    if bettertransformer is not None: start_env[env.bettertransformer] = str(env['bettertransformer_value'])
+    if quantize is not None: start_env[env.quantize] = str(t.cast(str, env['quantize_value']))

-    llm = openllm.utils.infer_auto_class(env["framework_value"]).for_model(
+    llm = openllm.utils.infer_auto_class(env['framework_value']).for_model(
        model, model_id=start_env[env.model_id], model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format
    )
    start_env.update({env.config: llm.config.model_dump_json().decode()})

-    server = bentoml.GrpcServer("_service:svc", **server_attrs) if _serve_grpc else bentoml.HTTPServer("_service:svc", **server_attrs)
+    server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer('_service:svc', **server_attrs)
    openllm.utils.analytics.track_start_init(llm.config)

    def next_step(model_name: str, adapter_map: DictStrAny | None) -> None:
-      cmd_name = f"openllm build {model_name}"
-      if adapter_map is not None: cmd_name += " " + " ".join([f"--adapter-id {s}" for s in [f"{p}:{name}" if name not in (None, "default") else p for p, name in adapter_map.items()]])
-      if not openllm.utils.get_quiet_mode(): termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg="blue")
+      cmd_name = f'openllm build {model_name}'
+      if adapter_map is not None: cmd_name += ' ' + ' '.join([f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
+      if not openllm.utils.get_quiet_mode(): termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue')

    if return_process:
      server.start(env=start_env, text=True)
-      if server.process is None: raise click.ClickException("Failed to start the server.")
+      if server.process is None: raise click.ClickException('Failed to start the server.')
      return server.process
    else:
      try:
@@ -191,7 +191,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
      except KeyboardInterrupt:
        next_step(model, adapter_map)
      except Exception as err:
-        termui.echo(f"Error caught while running LLM Server:\n{err}", fg="red")
+        termui.echo(f'Error caught while running LLM Server:\n{err}', fg='red')
      else:
        next_step(model, adapter_map)

@@ -200,40 +200,40 @@ Available official model_id(s): [default: {llm_config['default_id']}]

  return start_cmd
 def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, **command_attrs: t.Any) -> click.Command:
-  context_settings = command_attrs.pop("context_settings", {})
-  context_settings.update({"ignore_unknown_options": True, "allow_extra_args": True})
-  command_attrs["context_settings"] = context_settings
+  context_settings = command_attrs.pop('context_settings', {})
+  context_settings.update({'ignore_unknown_options': True, 'allow_extra_args': True})
+  command_attrs['context_settings'] = context_settings
  # NOTE: The model requires GPU, therefore we will return a dummy command
  @group.command(**command_attrs)
  def noop(**_: t.Any) -> LLMConfig:
-    termui.echo("No GPU available, therefore this command is disabled", fg="red")
+    termui.echo('No GPU available, therefore this command is disabled', fg='red')
    openllm.utils.analytics.track_start_init(llm_config)
    return llm_config

  return noop
 def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None, adapter_map: dict[str, str | None] | None, num_workers: int) -> None:
  if adapter_map and not openllm.utils.is_peft_available(): ctx.fail("Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'")
-  if quantize and llm_config.default_implementation() == "vllm":
+  if quantize and llm_config.default_implementation() == 'vllm':
    ctx.fail(f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization.")
-  requirements = llm_config["requirements"]
+  requirements = llm_config['requirements']
  if requirements is not None and len(requirements) > 0:
    missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
-    if len(missing_requirements) > 0: termui.echo(f"Make sure to have the following dependencies available: {missing_requirements}", fg="yellow")
+    if len(missing_requirements) > 0: termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')
 def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
  def wrapper(fn: FC) -> t.Callable[[FC], FC]:
    composed = openllm.utils.compose(
        llm_config.to_click_options,
        _http_server_args if not serve_grpc else _grpc_server_args,
-        cog.optgroup.group("General LLM Options", help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
-        model_id_option(factory=cog.optgroup, model_env=llm_config["env"]),
+        cog.optgroup.group('General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
+        model_id_option(factory=cog.optgroup, model_env=llm_config['env']),
        model_version_option(factory=cog.optgroup),
-        cog.optgroup.option("--server-timeout", type=int, default=None, help="Server timeout in seconds"),
+        cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
        workers_per_resource_option(factory=cog.optgroup),
        cors_option(factory=cog.optgroup),
        fast_option(factory=cog.optgroup),
        cog.optgroup.group(
-            "LLM Optimization Options",
-            help="""Optimization related options.
+            'LLM Optimization Options',
+            help='''Optimization related options.

            OpenLLM supports running model with [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/),
            k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
@@ -242,24 +242,24 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab

            - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
            - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
-            """,
+            ''',
        ),
        cog.optgroup.option(
-            "--device",
+            '--device',
            type=openllm.utils.dantic.CUDA,
            multiple=True,
-            envvar="CUDA_VISIBLE_DEVICES",
+            envvar='CUDA_VISIBLE_DEVICES',
            callback=parse_device_callback,
            help=f"Assign GPU devices (if available) for {llm_config['model_name']}.",
            show_envvar=True
        ),
-        cog.optgroup.option("--runtime", type=click.Choice(["ggml", "transformers"]), default="transformers", help="The runtime to use for the given model. Default is transformers."),
-        quantize_option(factory=cog.optgroup, model_env=llm_config["env"]),
-        bettertransformer_option(factory=cog.optgroup, model_env=llm_config["env"]),
+        cog.optgroup.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.'),
+        quantize_option(factory=cog.optgroup, model_env=llm_config['env']),
+        bettertransformer_option(factory=cog.optgroup, model_env=llm_config['env']),
        serialisation_option(factory=cog.optgroup),
        cog.optgroup.group(
-            "Fine-tuning related options",
-            help="""\
+            'Fine-tuning related options',
+            help='''\
    Note that the argument `--adapter-id` can accept the following format:

    - `--adapter-id /path/to/adapter` (local adapter)
@@ -273,37 +273,37 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
    $ openllm start opt --adapter-id /path/to/adapter_dir --adapter-id remote/adapter:eng_lora

    ```
-    """
+    '''
        ),
        cog.optgroup.option(
-            "--adapter-id",
+            '--adapter-id',
            default=None,
-            help="Optional name or path for given LoRA adapter" + f" to wrap '{llm_config['model_name']}'",
+            help='Optional name or path for given LoRA adapter' + f" to wrap '{llm_config['model_name']}'",
            multiple=True,
            callback=_id_callback,
-            metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]"
+            metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]'
        ),
-        click.option("--return-process", is_flag=True, default=False, help="Internal use only.", hidden=True),
+        click.option('--return-process', is_flag=True, default=False, help='Internal use only.', hidden=True),
    )
    return composed(fn)

  return wrapper
 def parse_device_callback(ctx: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
  if value is None: return value
-  if not isinstance(value, tuple): ctx.fail(f"{param} only accept multiple values, not {type(value)} (value: {value})")
+  if not isinstance(value, tuple): ctx.fail(f'{param} only accept multiple values, not {type(value)} (value: {value})')
  el: t.Tuple[str, ...] = tuple(i for k in value for i in k)
  # NOTE: --device all is a special case
-  if len(el) == 1 and el[0] == "all": return tuple(map(str, openllm.utils.available_devices()))
+  if len(el) == 1 and el[0] == 'all': return tuple(map(str, openllm.utils.available_devices()))
  return el
 # NOTE: A list of bentoml option that is not needed for parsing.
 # NOTE: User shouldn't set '--working-dir', as OpenLLM will setup this.
 # NOTE: production is also deprecated
-_IGNORED_OPTIONS = {"working_dir", "production", "protocol_version"}
+_IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}
 def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
-  """Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`."""
+  '''Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`.'''
  from bentoml_cli.cli import cli

-  command = "serve" if not serve_grpc else "serve-grpc"
+  command = 'serve' if not serve_grpc else 'serve-grpc'
  group = cog.optgroup.group(
      f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options", help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",
  )
@@ -316,95 +316,95 @@ def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]
    for options in reversed(serve_options):
      attrs = options.to_info_dict()
      # we don't need param_type_name, since it should all be options
-      attrs.pop("param_type_name")
+      attrs.pop('param_type_name')
      # name is not a valid args
-      attrs.pop("name")
+      attrs.pop('name')
      # type can be determine from default value
-      attrs.pop("type")
-      param_decls = (*attrs.pop("opts"), *attrs.pop("secondary_opts"))
+      attrs.pop('type')
+      param_decls = (*attrs.pop('opts'), *attrs.pop('secondary_opts'))
      f = cog.optgroup.option(*param_decls, **attrs)(f)
    return group(f)

  return decorator
 _http_server_args, _grpc_server_args = parse_serve_args(False), parse_serve_args(True)
 def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]:
-  """General ``@click`` decorator with some sauce.
+  '''General ``@click`` decorator with some sauce.

  This decorator extends the default ``@click.option`` plus a factory option and factory attr to
  provide type-safe click.option or click.argument wrapper for all compatible factory.
-  """
-  factory = attrs.pop("factory", click)
-  factory_attr = attrs.pop("attr", "option")
-  if factory_attr != "argument": attrs.setdefault("help", "General option for OpenLLM CLI.")
+  '''
+  factory = attrs.pop('factory', click)
+  factory_attr = attrs.pop('attr', 'option')
+  if factory_attr != 'argument': attrs.setdefault('help', 'General option for OpenLLM CLI.')

  def decorator(f: FC | None) -> FC:
    callback = getattr(factory, factory_attr, None)
-    if callback is None: raise ValueError(f"Factory {factory} has no attribute {factory_attr}.")
+    if callback is None: raise ValueError(f'Factory {factory} has no attribute {factory_attr}.')
    return t.cast(FC, callback(*param_decls, **attrs)(f) if f is not None else callback(*param_decls, **attrs))

  return decorator
-cli_option = functools.partial(_click_factory_type, attr="option")
-cli_argument = functools.partial(_click_factory_type, attr="argument")
-def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput = "pretty", **attrs: t.Any) -> t.Callable[[FC], FC]:
-  output = ["json", "pretty", "porcelain"]
+cli_option = functools.partial(_click_factory_type, attr='option')
+cli_argument = functools.partial(_click_factory_type, attr='argument')
+def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput = 'pretty', **attrs: t.Any) -> t.Callable[[FC], FC]:
+  output = ['json', 'pretty', 'porcelain']

  def complete_output_var(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]:
    return [CompletionItem(it) for it in output]

  return cli_option(
-      "-o",
-      "--output",
-      "output",
+      '-o',
+      '--output',
+      'output',
      type=click.Choice(output),
      default=default_value,
-      help="Showing output type.",
+      help='Showing output type.',
      show_default=True,
-      envvar="OPENLLM_OUTPUT",
+      envvar='OPENLLM_OUTPUT',
      show_envvar=True,
      shell_complete=complete_output_var,
      **attrs
  )(f)
 def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
-      "--fast/--no-fast",
+      '--fast/--no-fast',
      show_default=True,
      default=False,
-      envvar="OPENLLM_USE_LOCAL_LATEST",
+      envvar='OPENLLM_USE_LOCAL_LATEST',
      show_envvar=True,
-      help="""Whether to skip checking if models is already in store.
+      help='''Whether to skip checking if models is already in store.

                                                                                                          This is useful if you already downloaded or setup the model beforehand.
-                                                                                                          """,
+                                                                                                          ''',
      **attrs
  )(f)
 def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option("--cors/--no-cors", show_default=True, default=False, envvar="OPENLLM_CORS", show_envvar=True, help="Enable CORS for the server.", **attrs)(f)
+  return cli_option('--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs)(f)
 def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option("--machine", is_flag=True, default=False, hidden=True, **attrs)(f)
+  return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f)
 def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
-      "--model-id",
+      '--model-id',
      type=click.STRING,
      default=None,
      envvar=model_env.model_id if model_env is not None else None,
      show_envvar=model_env is not None,
-      help="Optional model_id name or path for (fine-tune) weight.",
+      help='Optional model_id name or path for (fine-tune) weight.',
      **attrs
  )(f)
 def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option("--model-version", type=click.STRING, default=None, help="Optional model version to save for this model. It will be inferred automatically from model-id.", **attrs)(f)
+  return cli_option('--model-version', type=click.STRING, default=None, help='Optional model version to save for this model. It will be inferred automatically from model-id.', **attrs)(f)
 def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f)
+  return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f)
 def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
-      "--quantise",
-      "--quantize",
-      "quantize",
-      type=click.Choice(["int8", "int4", "gptq"]),
+      '--quantise',
+      '--quantize',
+      'quantize',
+      type=click.Choice(['int8', 'int4', 'gptq']),
      default=None,
      envvar=model_env.quantize if model_env is not None else None,
      show_envvar=model_env is not None,
-      help="""Dynamic quantization for running this LLM.
+      help='''Dynamic quantization for running this LLM.

      The following quantization strategies are supported:

@@ -415,19 +415,19 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model
      - ``gptq``: ``GPTQ`` [quantization](https://arxiv.org/abs/2210.17323)

      > [!NOTE] that the model can also be served with quantized weights.
-      """ + ("""
-      > [!NOTE] that this will set the mode for serving within deployment.""" if build else "") + """
-      > [!NOTE] that quantization are currently only available in *PyTorch* models.""",
+      ''' + ('''
+      > [!NOTE] that this will set the mode for serving within deployment.''' if build else '') + '''
+      > [!NOTE] that quantization are currently only available in *PyTorch* models.''',
      **attrs
  )(f)
 def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
-      "--workers-per-resource",
+      '--workers-per-resource',
      default=None,
      callback=workers_per_resource_callback,
      type=str,
      required=False,
-      help="""Number of workers per resource assigned.
+      help='''Number of workers per resource assigned.

      See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
      for more information. By default, this is set to 1.
@@ -437,36 +437,36 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool =
      - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.

      - ``conserved``: This will determine the number of available GPU resources, and only assign one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
-      """ + (
+      ''' + (
          """\n
      > [!NOTE] The workers value passed into 'build' will determine how the LLM can
      > be provisioned in Kubernetes as well as in standalone container. This will
-      > ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ""
+      > ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ''
      ),
      **attrs
  )(f)
 def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
-      "--bettertransformer",
+      '--bettertransformer',
      is_flag=True,
      default=None,
      envvar=model_env.bettertransformer if model_env is not None else None,
      show_envvar=model_env is not None,
-      help="Apply FasterTransformer wrapper to serve model. This will applies during serving time."
-      if not build else "Set default environment variable whether to serve this model with FasterTransformer in build time.",
+      help='Apply FasterTransformer wrapper to serve model. This will applies during serving time.'
+      if not build else 'Set default environment variable whether to serve this model with FasterTransformer in build time.',
      **attrs
  )(f)
 def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
-      "--serialisation",
-      "--serialization",
-      "serialisation_format",
-      type=click.Choice(["safetensors", "legacy"]),
-      default="safetensors",
+      '--serialisation',
+      '--serialization',
+      'serialisation_format',
+      type=click.Choice(['safetensors', 'legacy']),
+      default='safetensors',
      show_default=True,
      show_envvar=True,
-      envvar="OPENLLM_SERIALIZATION",
-      help="""Serialisation format for save/load LLM.
+      envvar='OPENLLM_SERIALIZATION',
+      help='''Serialisation format for save/load LLM.

      Currently the following strategies are supported:

@@ -482,29 +482,29 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
      - ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files. This should be used if the model doesn't yet support safetensors.

      > [!NOTE] that GGML format is working in progress.
-      """,
+      ''',
      **attrs
  )(f)
 def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
-      "--container-registry",
-      "container_registry",
+      '--container-registry',
+      'container_registry',
      type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
-      default="ecr",
+      default='ecr',
      show_default=True,
      show_envvar=True,
-      envvar="OPENLLM_CONTAINER_REGISTRY",
+      envvar='OPENLLM_CONTAINER_REGISTRY',
      callback=container_registry_callback,
-      help="""The default container registry to get the base image for building BentoLLM.
+      help='''The default container registry to get the base image for building BentoLLM.

      Currently, it supports 'ecr', 'ghcr.io', 'docker.io'

      \b
      > [!NOTE] that in order to build the base image, you will need a GPUs to compile custom kernel. See ``openllm ext build-base-container`` for more information.
-      """,
+      ''',
      **attrs
  )(f)
-_wpr_strategies = {"round_robin", "conserved"}
+_wpr_strategies = {'round_robin', 'conserved'}
 def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None:
  if value is None: return value
  value = inflection.underscore(value)
@@ -518,5 +518,5 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va
      return value
 def container_registry_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None:
  if value is None: return value
-  if value not in openllm.bundle.supported_registries: raise click.BadParameter(f"Value must be one of {openllm.bundle.supported_registries}", ctx, param)
+  if value not in openllm.bundle.supported_registries: raise click.BadParameter(f'Value must be one of {openllm.bundle.supported_registries}', ctx, param)
  return value
--- a/openllm-python/src/openllm/cli/_sdk.py
+++ b/openllm-python/src/openllm/cli/_sdk.py
@@ -17,11 +17,11 @@ def _start(
    *,
    model_id: str | None = None,
    timeout: int = 30,
-    workers_per_resource: t.Literal["conserved", "round_robin"] | float | None = None,
-    device: tuple[str, ...] | t.Literal["all"] | None = None,
-    quantize: t.Literal["int8", "int4", "gptq"] | None = None,
+    workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
+    device: tuple[str, ...] | t.Literal['all'] | None = None,
+    quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
    bettertransformer: bool | None = None,
-    runtime: t.Literal["ggml", "transformers"] = "transformers",
+    runtime: t.Literal['ggml', 'transformers'] = 'transformers',
    adapter_map: dict[LiteralString, str | None] | None = None,
    framework: LiteralRuntime | None = None,
    additional_args: list[str] | None = None,
@@ -79,20 +79,20 @@ def _start(
      quantize=quantize,
      runtime=runtime
  )
-  os.environ[_ModelEnv.framework] = _ModelEnv["framework_value"]
+  os.environ[_ModelEnv.framework] = _ModelEnv['framework_value']

-  args: list[str] = ["--runtime", runtime]
-  if model_id: args.extend(["--model-id", model_id])
-  if timeout: args.extend(["--server-timeout", str(timeout)])
-  if workers_per_resource: args.extend(["--workers-per-resource", str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
-  if device and not os.environ.get("CUDA_VISIBLE_DEVICES"): args.extend(["--device", ",".join(device)])
+  args: list[str] = ['--runtime', runtime]
+  if model_id: args.extend(['--model-id', model_id])
+  if timeout: args.extend(['--server-timeout', str(timeout)])
+  if workers_per_resource: args.extend(['--workers-per-resource', str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
+  if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)])
  if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
-  if quantize: args.extend(["--quantize", str(quantize)])
-  elif bettertransformer: args.append("--bettertransformer")
-  if cors: args.append("--cors")
-  if adapter_map: args.extend(list(itertools.chain.from_iterable([["--adapter-id", f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])))
+  if quantize: args.extend(['--quantize', str(quantize)])
+  elif bettertransformer: args.append('--bettertransformer')
+  if cors: args.append('--cors')
+  if adapter_map: args.extend(list(itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])))
  if additional_args: args.extend(additional_args)
-  if __test__: args.append("--return-process")
+  if __test__: args.append('--return-process')

  return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(
      args=args if len(args) > 0 else None, standalone_mode=False
@@ -105,20 +105,20 @@ def _build(
    model_id: str | None = None,
    model_version: str | None = None,
    bento_version: str | None = None,
-    quantize: t.Literal["int8", "int4", "gptq"] | None = None,
+    quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
    bettertransformer: bool | None = None,
    adapter_map: dict[str, str | None] | None = None,
    build_ctx: str | None = None,
    enable_features: tuple[str, ...] | None = None,
    workers_per_resource: float | None = None,
-    runtime: t.Literal["ggml", "transformers"] = "transformers",
+    runtime: t.Literal['ggml', 'transformers'] = 'transformers',
    dockerfile_template: str | None = None,
    overwrite: bool = False,
    container_registry: LiteralContainerRegistry | None = None,
    container_version_strategy: LiteralContainerVersionStrategy | None = None,
    push: bool = False,
    containerize: bool = False,
-    serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors",
+    serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
    additional_args: list[str] | None = None,
    bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
 ) -> bentoml.Bento:
@@ -171,34 +171,34 @@ def _build(
  Returns:
      ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
  """
-  args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format]
+  args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--runtime', runtime, '--serialisation', serialisation_format]
  if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
-  if quantize: args.extend(["--quantize", quantize])
-  if bettertransformer: args.append("--bettertransformer")
+  if quantize: args.extend(['--quantize', quantize])
+  if bettertransformer: args.append('--bettertransformer')
  if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
-  if push: args.extend(["--push"])
-  if containerize: args.extend(["--containerize"])
-  if model_id: args.extend(["--model-id", model_id])
-  if build_ctx: args.extend(["--build-ctx", build_ctx])
-  if enable_features: args.extend([f"--enable-features={f}" for f in enable_features])
-  if workers_per_resource: args.extend(["--workers-per-resource", str(workers_per_resource)])
-  if overwrite: args.append("--overwrite")
+  if push: args.extend(['--push'])
+  if containerize: args.extend(['--containerize'])
+  if model_id: args.extend(['--model-id', model_id])
+  if build_ctx: args.extend(['--build-ctx', build_ctx])
+  if enable_features: args.extend([f'--enable-features={f}' for f in enable_features])
+  if workers_per_resource: args.extend(['--workers-per-resource', str(workers_per_resource)])
+  if overwrite: args.append('--overwrite')
  if adapter_map: args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()])
-  if model_version: args.extend(["--model-version", model_version])
-  if bento_version: args.extend(["--bento-version", bento_version])
-  if dockerfile_template: args.extend(["--dockerfile-template", dockerfile_template])
-  if container_registry is None: container_registry = "ecr"
-  if container_version_strategy is None: container_version_strategy = "release"
-  args.extend(["--container-registry", container_registry, "--container-version-strategy", container_version_strategy])
+  if model_version: args.extend(['--model-version', model_version])
+  if bento_version: args.extend(['--bento-version', bento_version])
+  if dockerfile_template: args.extend(['--dockerfile-template', dockerfile_template])
+  if container_registry is None: container_registry = 'ecr'
+  if container_version_strategy is None: container_version_strategy = 'release'
+  args.extend(['--container-registry', container_registry, '--container-version-strategy', container_version_strategy])
  if additional_args: args.extend(additional_args)

  try:
    output = subprocess.check_output(args, env=os.environ.copy(), cwd=build_ctx or os.getcwd())
  except subprocess.CalledProcessError as e:
-    logger.error("Exception caught while building %s", model_name, exc_info=e)
-    if e.stderr: raise OpenLLMException(e.stderr.decode("utf-8")) from None
+    logger.error('Exception caught while building %s', model_name, exc_info=e)
+    if e.stderr: raise OpenLLMException(e.stderr.decode('utf-8')) from None
    raise OpenLLMException(str(e)) from None
-  matched = re.match(r"__tag__:([^:\n]+:[^:\n]+)$", output.decode("utf-8").strip())
+  matched = re.match(r'__tag__:([^:\n]+:[^:\n]+)$', output.decode('utf-8').strip())
  if matched is None:
    raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.")
  return bentoml.get(matched.group(1), _bento_store=bento_store)
@@ -208,10 +208,10 @@ def _import_model(
    *,
    model_id: str | None = None,
    model_version: str | None = None,
-    runtime: t.Literal["ggml", "transformers"] = "transformers",
-    implementation: LiteralRuntime = "pt",
-    quantize: t.Literal["int8", "int4", "gptq"] | None = None,
-    serialisation_format: t.Literal["legacy", "safetensors"] = "safetensors",
+    runtime: t.Literal['ggml', 'transformers'] = 'transformers',
+    implementation: LiteralRuntime = 'pt',
+    quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+    serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors',
    additional_args: t.Sequence[str] | None = None
 ) -> bentoml.Model:
  """Import a LLM into local store.
@@ -245,15 +245,15 @@ def _import_model(
      ``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
  """
  from .entrypoint import import_command
-  args = [model_name, "--runtime", runtime, "--implementation", implementation, "--machine", "--serialisation", serialisation_format,]
+  args = [model_name, '--runtime', runtime, '--implementation', implementation, '--machine', '--serialisation', serialisation_format,]
  if model_id is not None: args.append(model_id)
-  if model_version is not None: args.extend(["--model-version", str(model_version)])
+  if model_version is not None: args.extend(['--model-version', str(model_version)])
  if additional_args is not None: args.extend(additional_args)
-  if quantize is not None: args.extend(["--quantize", quantize])
+  if quantize is not None: args.extend(['--quantize', quantize])
  return import_command.main(args=args, standalone_mode=False)
 def _list_models() -> dict[str, t.Any]:
-  """List all available models within the local store."""
+  '''List all available models within the local store.'''
  from .entrypoint import models_command
-  return models_command.main(args=["-o", "json", "--show-available", "--machine"], standalone_mode=False)
+  return models_command.main(args=['-o', 'json', '--show-available', '--machine'], standalone_mode=False)
 start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
-__all__ = ["start", "start_grpc", "build", "import_model", "list_models"]
+__all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models']
--- a/openllm-python/src/openllm/cli/entrypoint.py
+++ b/openllm-python/src/openllm/cli/entrypoint.py
@@ -41,38 +41,38 @@ if t.TYPE_CHECKING:
  from openllm_core._schema import EmbeddingsOutput
  from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
 else:
-  torch = LazyLoader("torch", globals(), "torch")
+  torch = LazyLoader('torch', globals(), 'torch')

-P = ParamSpec("P")
+P = ParamSpec('P')
 logger = logging.getLogger(__name__)
-OPENLLM_FIGLET = """\
+OPENLLM_FIGLET = '''\
 ██████╗ ██████╗ ███████╗███╗   ██╗██╗     ██╗     ███╗   ███╗
 ██╔═══██╗██╔══██╗██╔════╝████╗  ██║██║     ██║     ████╗ ████║
 ██║   ██║██████╔╝█████╗  ██╔██╗ ██║██║     ██║     ██╔████╔██║
 ██║   ██║██╔═══╝ ██╔══╝  ██║╚██╗██║██║     ██║     ██║╚██╔╝██║
 ╚██████╔╝██║     ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
 ╚═════╝ ╚═╝     ╚══════╝╚═╝  ╚═══╝╚══════╝╚══════╝╚═╝     ╚═╝
-"""
+'''

-ServeCommand = t.Literal["serve", "serve-grpc"]
+ServeCommand = t.Literal['serve', 'serve-grpc']
@attr.define
 class GlobalOptions:
  cloud_context: str | None = attr.field(default=None)

  def with_options(self, **attrs: t.Any) -> Self:
    return attr.evolve(self, **attrs)
-GrpType = t.TypeVar("GrpType", bound=click.Group)
+GrpType = t.TypeVar('GrpType', bound=click.Group)

 _object_setattr = object.__setattr__

-_EXT_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), "extension"))
+_EXT_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), 'extension'))
 class Extensions(click.MultiCommand):
  def list_commands(self, ctx: click.Context) -> list[str]:
-    return sorted([filename[:-3] for filename in os.listdir(_EXT_FOLDER) if filename.endswith(".py") and not filename.startswith("__")])
+    return sorted([filename[:-3] for filename in os.listdir(_EXT_FOLDER) if filename.endswith('.py') and not filename.startswith('__')])

  def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
    try:
-      mod = __import__(f"openllm.cli.extension.{cmd_name}", None, None, ["cli"])
+      mod = __import__(f'openllm.cli.extension.{cmd_name}', None, None, ['cli'])
    except ImportError:
      return None
    return mod.cli
@@ -82,11 +82,11 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
  @staticmethod
  def common_params(f: t.Callable[P, t.Any]) -> t.Callable[[FC], FC]:
    # The following logics is similar to one of BentoMLCommandGroup
-    @cog.optgroup.group(name="Global options", help="Shared globals options for all OpenLLM CLI.")
-    @cog.optgroup.option("-q", "--quiet", envvar=QUIET_ENV_VAR, is_flag=True, default=False, help="Suppress all output.", show_envvar=True)
-    @cog.optgroup.option("--debug", "--verbose", "debug", envvar=DEBUG_ENV_VAR, is_flag=True, default=False, help="Print out debug logs.", show_envvar=True)
-    @cog.optgroup.option("--do-not-track", is_flag=True, default=False, envvar=analytics.OPENLLM_DO_NOT_TRACK, help="Do not send usage info", show_envvar=True)
-    @cog.optgroup.option("--context", "cloud_context", envvar="BENTOCLOUD_CONTEXT", type=click.STRING, default=None, help="BentoCloud context name.", show_envvar=True)
+    @cog.optgroup.group(name='Global options', help='Shared globals options for all OpenLLM CLI.')
+    @cog.optgroup.option('-q', '--quiet', envvar=QUIET_ENV_VAR, is_flag=True, default=False, help='Suppress all output.', show_envvar=True)
+    @cog.optgroup.option('--debug', '--verbose', 'debug', envvar=DEBUG_ENV_VAR, is_flag=True, default=False, help='Print out debug logs.', show_envvar=True)
+    @cog.optgroup.option('--do-not-track', is_flag=True, default=False, envvar=analytics.OPENLLM_DO_NOT_TRACK, help='Do not send usage info', show_envvar=True)
+    @cog.optgroup.option('--context', 'cloud_context', envvar='BENTOCLOUD_CONTEXT', type=click.STRING, default=None, help='BentoCloud context name.', show_envvar=True)
    @click.pass_context
    @functools.wraps(f)
    def wrapper(ctx: click.Context, quiet: bool, debug: bool, cloud_context: str | None, *args: P.args, **attrs: P.kwargs) -> t.Any:
@@ -102,7 +102,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):

  @staticmethod
  def usage_tracking(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[Concatenate[bool, P], t.Any]:
-    command_name = attrs.get("name", func.__name__)
+    command_name = attrs.get('name', func.__name__)

    @functools.wraps(func)
    def wrapper(do_not_track: bool, *args: P.args, **attrs: P.kwargs) -> t.Any:
@@ -111,7 +111,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
          return func(*args, **attrs)
      start_time = time.time_ns()
      with analytics.set_bentoml_tracking():
-        if group.name is None: raise ValueError("group.name should not be None")
+        if group.name is None: raise ValueError('group.name should not be None')
        event = analytics.OpenllmCliEvent(cmd_group=group.name, cmd_name=command_name)
        try:
          return_value = func(*args, **attrs)
@@ -131,22 +131,22 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):

  @staticmethod
  def exception_handling(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[P, t.Any]:
-    command_name = attrs.get("name", func.__name__)
+    command_name = attrs.get('name', func.__name__)

    @functools.wraps(func)
    def wrapper(*args: P.args, **attrs: P.kwargs) -> t.Any:
      try:
        return func(*args, **attrs)
      except OpenLLMException as err:
-        raise click.ClickException(click.style(f"[{group.name}] '{command_name}' failed: " + err.message, fg="red")) from err
+        raise click.ClickException(click.style(f"[{group.name}] '{command_name}' failed: " + err.message, fg='red')) from err
      except KeyboardInterrupt:
        pass

    return wrapper

  def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
-    if cmd_name in t.cast("Extensions", extension_command).list_commands(ctx):
-      return t.cast("Extensions", extension_command).get_command(ctx, cmd_name)
+    if cmd_name in t.cast('Extensions', extension_command).list_commands(ctx):
+      return t.cast('Extensions', extension_command).get_command(ctx, cmd_name)
    cmd_name = self.resolve_alias(cmd_name)
    if ctx.command.name in _start_mapping:
      try:
@@ -158,36 +158,36 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
          raise click.ClickException(f"'openllm start {cmd_name}' is currently disabled for the time being. Please let us know if you need this feature by opening an issue on GitHub.")
        except bentoml.exceptions.NotFound:
          pass
-        raise click.BadArgumentUsage(f"{cmd_name} is not a valid model identifier supported by OpenLLM.") from None
+        raise click.BadArgumentUsage(f'{cmd_name} is not a valid model identifier supported by OpenLLM.') from None
    return super().get_command(ctx, cmd_name)

  def list_commands(self, ctx: click.Context) -> list[str]:
-    if ctx.command.name in {"start", "start-grpc"}: return list(CONFIG_MAPPING.keys())
-    return super().list_commands(ctx) + t.cast("Extensions", extension_command).list_commands(ctx)
+    if ctx.command.name in {'start', 'start-grpc'}: return list(CONFIG_MAPPING.keys())
+    return super().list_commands(ctx) + t.cast('Extensions', extension_command).list_commands(ctx)

  def command(self, *args: t.Any, **kwargs: t.Any) -> t.Callable[[t.Callable[..., t.Any]], click.Command]:  # type: ignore[override] # XXX: fix decorator on BentoMLCommandGroup
    """Override the default 'cli.command' with supports for aliases for given command, and it wraps the implementation with common parameters."""
-    if "context_settings" not in kwargs: kwargs["context_settings"] = {}
-    if "max_content_width" not in kwargs["context_settings"]: kwargs["context_settings"]["max_content_width"] = 120
-    aliases = kwargs.pop("aliases", None)
+    if 'context_settings' not in kwargs: kwargs['context_settings'] = {}
+    if 'max_content_width' not in kwargs['context_settings']: kwargs['context_settings']['max_content_width'] = 120
+    aliases = kwargs.pop('aliases', None)

    def decorator(f: _AnyCallable) -> click.Command:
      name = f.__name__.lower()
-      if name.endswith("_command"): name = name[:-8]
-      name = name.replace("_", "-")
-      kwargs.setdefault("help", inspect.getdoc(f))
-      kwargs.setdefault("name", name)
+      if name.endswith('_command'): name = name[:-8]
+      name = name.replace('_', '-')
+      kwargs.setdefault('help', inspect.getdoc(f))
+      kwargs.setdefault('name', name)
      wrapped = self.exception_handling(self.usage_tracking(self.common_params(f), self, **kwargs), self, **kwargs)

      # move common parameters to end of the parameters list
-      _memo = getattr(wrapped, "__click_params__", None)
-      if _memo is None: raise RuntimeError("Click command not register correctly.")
-      _object_setattr(wrapped, "__click_params__", _memo[-self.NUMBER_OF_COMMON_PARAMS:] + _memo[:-self.NUMBER_OF_COMMON_PARAMS])
+      _memo = getattr(wrapped, '__click_params__', None)
+      if _memo is None: raise RuntimeError('Click command not register correctly.')
+      _object_setattr(wrapped, '__click_params__', _memo[-self.NUMBER_OF_COMMON_PARAMS:] + _memo[:-self.NUMBER_OF_COMMON_PARAMS])
      # NOTE: we need to call super of super to avoid conflict with BentoMLCommandGroup command setup
      cmd = super(BentoMLCommandGroup, self).command(*args, **kwargs)(wrapped)
      # NOTE: add aliases to a given commands if it is specified.
      if aliases is not None:
-        if not cmd.name: raise ValueError("name is required when aliases are available.")
+        if not cmd.name: raise ValueError('name is required when aliases are available.')
        self._commands[cmd.name] = aliases
        self._aliases.update({alias: cmd.name for alias in aliases})
      return cmd
@@ -195,11 +195,11 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
    return decorator

  def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None:
-    """Additional format methods that include extensions as well as the default cli command."""
+    '''Additional format methods that include extensions as well as the default cli command.'''
    from gettext import gettext as _
    commands: list[tuple[str, click.Command]] = []
    extensions: list[tuple[str, click.Command]] = []
-    _cached_extensions: list[str] = t.cast("Extensions", extension_command).list_commands(ctx)
+    _cached_extensions: list[str] = t.cast('Extensions', extension_command).list_commands(ctx)
    for subcommand in self.list_commands(ctx):
      cmd = self.get_command(ctx, subcommand)
      if cmd is None or cmd.hidden: continue
@@ -213,7 +213,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
        help = cmd.get_short_help_str(limit)
        rows.append((subcommand, help))
      if rows:
-        with formatter.section(_("Commands")):
+        with formatter.section(_('Commands')):
          formatter.write_dl(rows)
    if len(extensions):
      limit = formatter.width - 6 - max(len(cmd[0]) for cmd in extensions)
@@ -222,14 +222,14 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
        help = cmd.get_short_help_str(limit)
        rows.append((inflection.dasherize(subcommand), help))
      if rows:
-        with formatter.section(_("Extensions")):
+        with formatter.section(_('Extensions')):
          formatter.write_dl(rows)
-@click.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name="openllm")
+@click.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name='openllm')
@click.version_option(
-    None, "--version", "-v", message=f"%(prog)s, %(version)s (compiled: {'yes' if openllm.COMPILED else 'no'})\nPython ({platform.python_implementation()}) {platform.python_version()}"
+    None, '--version', '-v', message=f"%(prog)s, %(version)s (compiled: {'yes' if openllm.COMPILED else 'no'})\nPython ({platform.python_implementation()}) {platform.python_version()}"
 )
 def cli() -> None:
-  """\b
+  '''\b
   ██████╗ ██████╗ ███████╗███╗   ██╗██╗     ██╗     ███╗   ███╗
  ██╔═══██╗██╔══██╗██╔════╝████╗  ██║██║     ██║     ████╗ ████║
  ██║   ██║██████╔╝█████╗  ██╔██╗ ██║██║     ██║     ██╔████╔██║
@@ -240,43 +240,43 @@ def cli() -> None:
  \b
  An open platform for operating large language models in production.
  Fine-tune, serve, deploy, and monitor any LLMs with ease.
-  """
-@cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name="start", aliases=["start-http"])
+  '''
+@cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name='start', aliases=['start-http'])
 def start_command() -> None:
-  """Start any LLM as a REST server.
+  '''Start any LLM as a REST server.

  \b
  ```bash
  $ openllm <start|start-http> <model_name> --<options> ...
  ```
-  """
-@cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name="start-grpc")
+  '''
+@cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name='start-grpc')
 def start_grpc_command() -> None:
-  """Start any LLM as a gRPC server.
+  '''Start any LLM as a gRPC server.

  \b
  ```bash
  $ openllm start-grpc <model_name> --<options> ...
  ```
-  """
+  '''
 _start_mapping = {
-    "start": {
+    'start': {
        key: start_command_factory(start_command, key, _context_settings=termui.CONTEXT_SETTINGS) for key in CONFIG_MAPPING
    },
-    "start-grpc": {
+    'start-grpc': {
        key: start_command_factory(start_grpc_command, key, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=True) for key in CONFIG_MAPPING
    }
 }
-@cli.command(name="import", aliases=["download"])
+@cli.command(name='import', aliases=['download'])
@model_name_argument
-@click.argument("model_id", type=click.STRING, default=None, metavar="Optional[REMOTE_REPO/MODEL_ID | /path/to/local/model]", required=False)
-@click.argument("converter", envvar="CONVERTER", type=click.STRING, default=None, required=False, metavar=None)
+@click.argument('model_id', type=click.STRING, default=None, metavar='Optional[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=False)
+@click.argument('converter', envvar='CONVERTER', type=click.STRING, default=None, required=False, metavar=None)
@model_version_option
-@click.option("--runtime", type=click.Choice(["ggml", "transformers"]), default="transformers", help="The runtime to use for the given model. Default is transformers.")
+@click.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.')
@output_option
@quantize_option
@machine_option
-@click.option("--implementation", type=click.Choice(["pt", "tf", "flax", "vllm"]), default=None, help="The implementation for saving this LLM.")
+@click.option('--implementation', type=click.Choice(['pt', 'tf', 'flax', 'vllm']), default=None, help='The implementation for saving this LLM.')
@serialisation_option
 def import_command(
    model_name: str,
@@ -284,11 +284,11 @@ def import_command(
    converter: str | None,
    model_version: str | None,
    output: LiteralOutput,
-    runtime: t.Literal["ggml", "transformers"],
+    runtime: t.Literal['ggml', 'transformers'],
    machine: bool,
    implementation: LiteralRuntime | None,
-    quantize: t.Literal["int8", "int4", "gptq"] | None,
-    serialisation_format: t.Literal["safetensors", "legacy"],
+    quantize: t.Literal['int8', 'int4', 'gptq'] | None,
+    serialisation_format: t.Literal['safetensors', 'legacy'],
 ) -> bentoml.Model:
  """Setup LLM interactively.

@@ -344,73 +344,73 @@ def import_command(
  """
  llm_config = AutoConfig.for_model(model_name)
  env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, runtime=runtime, quantize=quantize)
-  impl: LiteralRuntime = first_not_none(implementation, default=env["framework_value"])
+  impl: LiteralRuntime = first_not_none(implementation, default=env['framework_value'])
  llm = infer_auto_class(impl).for_model(
-      model_name, model_id=env["model_id_value"], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format
+      model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format
  )
  _previously_saved = False
  try:
    _ref = serialisation.get(llm)
    _previously_saved = True
  except bentoml.exceptions.NotFound:
-    if not machine and output == "pretty":
+    if not machine and output == 'pretty':
      msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for implementation {llm.__llm_implementation__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
-      termui.echo(msg, fg="yellow", nl=True)
+      termui.echo(msg, fg='yellow', nl=True)
    _ref = serialisation.get(llm, auto_import=True)
-    if impl == "pt" and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
+    if impl == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
  if machine: return _ref
-  elif output == "pretty":
-    if _previously_saved: termui.echo(f"{model_name} with 'model_id={model_id}' is already setup for framework '{impl}': {_ref.tag!s}", nl=True, fg="yellow")
-    else: termui.echo(f"Saved model: {_ref.tag}")
-  elif output == "json": termui.echo(orjson.dumps({"previously_setup": _previously_saved, "framework": impl, "tag": str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode())
+  elif output == 'pretty':
+    if _previously_saved: termui.echo(f"{model_name} with 'model_id={model_id}' is already setup for framework '{impl}': {_ref.tag!s}", nl=True, fg='yellow')
+    else: termui.echo(f'Saved model: {_ref.tag}')
+  elif output == 'json': termui.echo(orjson.dumps({'previously_setup': _previously_saved, 'framework': impl, 'tag': str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode())
  else: termui.echo(_ref.tag)
  return _ref
-@cli.command(context_settings={"token_normalize_func": inflection.underscore})
+@cli.command(context_settings={'token_normalize_func': inflection.underscore})
@model_name_argument
@model_id_option
@output_option
@machine_option
-@click.option("--bento-version", type=str, default=None, help="Optional bento version for this BentoLLM. Default is the the model revision.")
-@click.option("--overwrite", is_flag=True, help="Overwrite existing Bento for given LLM if it already exists.")
+@click.option('--bento-version', type=str, default=None, help='Optional bento version for this BentoLLM. Default is the the model revision.')
+@click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.')
@workers_per_resource_option(factory=click, build=True)
-@click.option("--device", type=dantic.CUDA, multiple=True, envvar="CUDA_VISIBLE_DEVICES", callback=parse_device_callback, help="Set the device", show_envvar=True)
-@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name="Optimisation options")
+@click.option('--device', type=dantic.CUDA, multiple=True, envvar='CUDA_VISIBLE_DEVICES', callback=parse_device_callback, help='Set the device', show_envvar=True)
+@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Optimisation options')
@quantize_option(factory=cog.optgroup, build=True)
@bettertransformer_option(factory=cog.optgroup)
-@click.option("--runtime", type=click.Choice(["ggml", "transformers"]), default="transformers", help="The runtime to use for the given model. Default is transformers.")
+@click.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.')
@click.option(
-    "--enable-features",
+    '--enable-features',
    multiple=True,
    nargs=1,
-    metavar="FEATURE[,FEATURE]",
-    help="Enable additional features for building this LLM Bento. Available: {}".format(", ".join(OPTIONAL_DEPENDENCIES))
+    metavar='FEATURE[,FEATURE]',
+    help='Enable additional features for building this LLM Bento. Available: {}'.format(', '.join(OPTIONAL_DEPENDENCIES))
 )
@click.option(
-    "--adapter-id",
+    '--adapter-id',
    default=None,
    multiple=True,
-    metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]",
+    metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]',
    help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed."
 )
-@click.option("--build-ctx", help="Build context. This is required if --adapter-id uses relative path", default=None)
+@click.option('--build-ctx', help='Build context. This is required if --adapter-id uses relative path', default=None)
@model_version_option
-@click.option("--dockerfile-template", default=None, type=click.File(), help="Optional custom dockerfile template to be used with this BentoLLM.")
+@click.option('--dockerfile-template', default=None, type=click.File(), help='Optional custom dockerfile template to be used with this BentoLLM.')
@serialisation_option
@container_registry_option
@click.option(
-    "--container-version-strategy", type=click.Choice(["release", "latest", "nightly"]), default="release", help="Default container version strategy for the image from '--container-registry'"
+    '--container-version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='release', help="Default container version strategy for the image from '--container-registry'"
 )
@fast_option
-@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name="Utilities options")
+@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options')
@cog.optgroup.option(
-    "--containerize",
+    '--containerize',
    default=False,
    is_flag=True,
    type=click.BOOL,
    help="Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'."
 )
-@cog.optgroup.option("--push", default=False, is_flag=True, type=click.BOOL, help="Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.")
-@click.option("--force-push", default=False, is_flag=True, type=click.BOOL, help="Whether to force push.")
+@cog.optgroup.option('--push', default=False, is_flag=True, type=click.BOOL, help="Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.")
+@click.option('--force-push', default=False, is_flag=True, type=click.BOOL, help='Whether to force push.')
@click.pass_context
 def build_command(
    ctx: click.Context,
@@ -420,8 +420,8 @@ def build_command(
    bento_version: str | None,
    overwrite: bool,
    output: LiteralOutput,
-    runtime: t.Literal["ggml", "transformers"],
-    quantize: t.Literal["int8", "int4", "gptq"] | None,
+    runtime: t.Literal['ggml', 'transformers'],
+    quantize: t.Literal['int8', 'int4', 'gptq'] | None,
    enable_features: tuple[str, ...] | None,
    bettertransformer: bool | None,
    workers_per_resource: float | None,
@@ -433,14 +433,14 @@ def build_command(
    dockerfile_template: t.TextIO | None,
    containerize: bool,
    push: bool,
-    serialisation_format: t.Literal["safetensors", "legacy"],
+    serialisation_format: t.Literal['safetensors', 'legacy'],
    fast: bool,
    container_registry: LiteralContainerRegistry,
    container_version_strategy: LiteralContainerVersionStrategy,
    force_push: bool,
    **attrs: t.Any,
 ) -> bentoml.Bento:
-  """Package a given models into a Bento.
+  '''Package a given models into a Bento.

  \b
  ```bash
@@ -456,9 +456,9 @@ def build_command(
  > [!IMPORTANT]
  > To build the bento with compiled OpenLLM, make sure to prepend HATCH_BUILD_HOOKS_ENABLE=1. Make sure that the deployment
  > target also use the same Python version and architecture as build machine.
-  """
-  if machine: output = "porcelain"
-  if enable_features: enable_features = tuple(itertools.chain.from_iterable((s.split(",") for s in enable_features)))
+  '''
+  if machine: output = 'porcelain'
+  if enable_features: enable_features = tuple(itertools.chain.from_iterable((s.split(',') for s in enable_features)))

  _previously_built = False

@@ -468,32 +468,32 @@ def build_command(
  # NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
  # during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
  try:
-    os.environ.update({"OPENLLM_MODEL": inflection.underscore(model_name), env.runtime: str(env["runtime_value"]), "OPENLLM_SERIALIZATION": serialisation_format})
-    if env["model_id_value"]: os.environ[env.model_id] = str(env["model_id_value"])
-    if env["quantize_value"]: os.environ[env.quantize] = str(env["quantize_value"])
-    os.environ[env.bettertransformer] = str(env["bettertransformer_value"])
+    os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), env.runtime: str(env['runtime_value']), 'OPENLLM_SERIALIZATION': serialisation_format})
+    if env['model_id_value']: os.environ[env.model_id] = str(env['model_id_value'])
+    if env['quantize_value']: os.environ[env.quantize] = str(env['quantize_value'])
+    os.environ[env.bettertransformer] = str(env['bettertransformer_value'])

-    llm = infer_auto_class(env["framework_value"]).for_model(
-        model_name, model_id=env["model_id_value"], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs
+    llm = infer_auto_class(env['framework_value']).for_model(
+        model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs
    )

    labels = dict(llm.identifying_params)
-    labels.update({"_type": llm.llm_type, "_framework": env["framework_value"]})
-    workers_per_resource = first_not_none(workers_per_resource, default=llm_config["workers_per_resource"])
+    labels.update({'_type': llm.llm_type, '_framework': env['framework_value']})
+    workers_per_resource = first_not_none(workers_per_resource, default=llm_config['workers_per_resource'])

    with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
      dockerfile_template_path = None
      if dockerfile_template:
        with dockerfile_template:
-          llm_fs.writetext("Dockerfile.template", dockerfile_template.read())
-        dockerfile_template_path = llm_fs.getsyspath("/Dockerfile.template")
+          llm_fs.writetext('Dockerfile.template', dockerfile_template.read())
+        dockerfile_template_path = llm_fs.getsyspath('/Dockerfile.template')

      adapter_map: dict[str, str | None] | None = None
      if adapter_id:
        if not build_ctx: ctx.fail("'build_ctx' is required when '--adapter-id' is passsed.")
        adapter_map = {}
        for v in adapter_id:
-          _adapter_id, *adapter_name = v.rsplit(":", maxsplit=1)
+          _adapter_id, *adapter_name = v.rsplit(':', maxsplit=1)
          name = adapter_name[0] if len(adapter_name) > 0 else None
          try:
            resolve_user_filepath(_adapter_id, build_ctx)
@@ -508,16 +508,16 @@ def build_command(
          # that edge case.
          except FileNotFoundError:
            adapter_map[_adapter_id] = name
-        os.environ["OPENLLM_ADAPTER_MAP"] = orjson.dumps(adapter_map).decode()
+        os.environ['OPENLLM_ADAPTER_MAP'] = orjson.dumps(adapter_map).decode()

      _bento_version = first_not_none(bento_version, default=llm.tag.version)
-      bento_tag = bentoml.Tag.from_taglike(f"{llm.llm_type}-service:{_bento_version}".lower().strip())
+      bento_tag = bentoml.Tag.from_taglike(f'{llm.llm_type}-service:{_bento_version}'.lower().strip())
      try:
        bento = bentoml.get(bento_tag)
        if overwrite:
-          if output == "pretty": termui.echo(f"Overwriting existing Bento {bento_tag}", fg="yellow")
+          if output == 'pretty': termui.echo(f'Overwriting existing Bento {bento_tag}', fg='yellow')
          bentoml.delete(bento_tag)
-          raise bentoml.exceptions.NotFound(f"Rebuilding existing Bento {bento_tag}") from None
+          raise bentoml.exceptions.NotFound(f'Rebuilding existing Bento {bento_tag}') from None
        _previously_built = True
      except bentoml.exceptions.NotFound:
        bento = bundle.create_bento(
@@ -537,38 +537,38 @@ def build_command(
  except Exception as err:
    raise err from None

-  if machine: termui.echo(f"__tag__:{bento.tag}", fg="white")
-  elif output == "pretty":
+  if machine: termui.echo(f'__tag__:{bento.tag}', fg='white')
+  elif output == 'pretty':
    if not get_quiet_mode() and (not push or not containerize):
-      termui.echo("\n" + OPENLLM_FIGLET, fg="white")
-      if not _previously_built: termui.echo(f"Successfully built {bento}.", fg="green")
-      elif not overwrite: termui.echo(f"'{model_name}' already has a Bento built [{bento}]. To overwrite it pass '--overwrite'.", fg="yellow")
+      termui.echo('\n' + OPENLLM_FIGLET, fg='white')
+      if not _previously_built: termui.echo(f'Successfully built {bento}.', fg='green')
+      elif not overwrite: termui.echo(f"'{model_name}' already has a Bento built [{bento}]. To overwrite it pass '--overwrite'.", fg='yellow')
      termui.echo(
-          "📖 Next steps:\n\n" + f"* Push to BentoCloud with 'bentoml push':\n\t$ bentoml push {bento.tag}\n\n" +
+          '📖 Next steps:\n\n' + f"* Push to BentoCloud with 'bentoml push':\n\t$ bentoml push {bento.tag}\n\n" +
          f"* Containerize your Bento with 'bentoml containerize':\n\t$ bentoml containerize {bento.tag} --opt progress=plain\n\n" +
          "\tTip: To enable additional BentoML features for 'containerize', use '--enable-features=FEATURE[,FEATURE]' [see 'bentoml containerize -h' for more advanced usage]\n",
-          fg="blue",
+          fg='blue',
      )
-  elif output == "json":
+  elif output == 'json':
    termui.echo(orjson.dumps(bento.info.to_dict(), option=orjson.OPT_INDENT_2).decode())
  else:
    termui.echo(bento.tag)

  if push: BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push)
  elif containerize:
-    backend = t.cast("DefaultBuilder", os.environ.get("BENTOML_CONTAINERIZE_BACKEND", "docker"))
+    backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker'))
    try:
      bentoml.container.health(backend)
    except subprocess.CalledProcessError:
-      raise OpenLLMException(f"Failed to use backend {backend}") from None
+      raise OpenLLMException(f'Failed to use backend {backend}') from None
    try:
-      bentoml.container.build(bento.tag, backend=backend, features=("grpc", "io"))
+      bentoml.container.build(bento.tag, backend=backend, features=('grpc', 'io'))
    except Exception as err:
      raise OpenLLMException(f"Exception caught while containerizing '{bento.tag!s}':\n{err}") from err
  return bento
@cli.command()
@output_option
-@click.option("--show-available", is_flag=True, default=False, help="Show available models in local store (mutually exclusive with '-o porcelain').")
+@click.option('--show-available', is_flag=True, default=False, help="Show available models in local store (mutually exclusive with '-o porcelain').")
@machine_option
@click.pass_context
 def models_command(ctx: click.Context, output: LiteralOutput, show_available: bool, machine: bool) -> DictStrAny | None:
@@ -585,30 +585,30 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
  from .._llm import normalise_model_name

  models = tuple(inflection.dasherize(key) for key in CONFIG_MAPPING.keys())
-  if output == "porcelain":
-    if show_available: raise click.BadOptionUsage("--show-available", "Cannot use '--show-available' with '-o porcelain' (mutually exclusive).")
-    termui.echo("\n".join(models), fg="white")
+  if output == 'porcelain':
+    if show_available: raise click.BadOptionUsage('--show-available', "Cannot use '--show-available' with '-o porcelain' (mutually exclusive).")
+    termui.echo('\n'.join(models), fg='white')
  else:
    failed_initialized: list[tuple[str, Exception]] = []

-    json_data: dict[str, dict[t.Literal["architecture", "model_id", "url", "installation", "cpu", "gpu", "runtime_impl"], t.Any] | t.Any] = {}
+    json_data: dict[str, dict[t.Literal['architecture', 'model_id', 'url', 'installation', 'cpu', 'gpu', 'runtime_impl'], t.Any] | t.Any] = {}
    converted: list[str] = []
    for m in models:
      config = AutoConfig.for_model(m)
      runtime_impl: tuple[str, ...] = ()
-      if config["model_name"] in MODEL_MAPPING_NAMES: runtime_impl += ("pt",)
-      if config["model_name"] in MODEL_FLAX_MAPPING_NAMES: runtime_impl += ("flax",)
-      if config["model_name"] in MODEL_TF_MAPPING_NAMES: runtime_impl += ("tf",)
-      if config["model_name"] in MODEL_VLLM_MAPPING_NAMES: runtime_impl += ("vllm",)
+      if config['model_name'] in MODEL_MAPPING_NAMES: runtime_impl += ('pt',)
+      if config['model_name'] in MODEL_FLAX_MAPPING_NAMES: runtime_impl += ('flax',)
+      if config['model_name'] in MODEL_TF_MAPPING_NAMES: runtime_impl += ('tf',)
+      if config['model_name'] in MODEL_VLLM_MAPPING_NAMES: runtime_impl += ('vllm',)
      json_data[m] = {
-          "architecture": config["architecture"],
-          "model_id": config["model_ids"],
-          "cpu": not config["requires_gpu"],
-          "gpu": True,
-          "runtime_impl": runtime_impl,
-          "installation": f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config["requirements"] else "openllm",
+          'architecture': config['architecture'],
+          'model_id': config['model_ids'],
+          'cpu': not config['requires_gpu'],
+          'gpu': True,
+          'runtime_impl': runtime_impl,
+          'installation': f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config['requirements'] else 'openllm',
      }
-      converted.extend([normalise_model_name(i) for i in config["model_ids"]])
+      converted.extend([normalise_model_name(i) for i in config['model_ids']])
      if DEBUG:
        try:
          AutoLLM.for_model(m, llm_config=config)
@@ -617,7 +617,7 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo

    ids_in_local_store = {
        k: [
-            i for i in bentoml.models.list() if "framework" in i.info.labels and i.info.labels["framework"] == "openllm" and "model_name" in i.info.labels and i.info.labels["model_name"] == k
+            i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k
        ] for k in json_data.keys()
    }
    ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
@@ -626,74 +626,74 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
      local_models = {k: [str(i.tag) for i in val] for k, val in ids_in_local_store.items()}

    if machine:
-      if show_available: json_data["local"] = local_models
+      if show_available: json_data['local'] = local_models
      return json_data
-    elif output == "pretty":
+    elif output == 'pretty':
      import tabulate

      tabulate.PRESERVE_WHITESPACE = True
      # llm, architecture, url, model_id, installation, cpu, gpu, runtime_impl
      data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralRuntime, ...]]] = []
      for m, v in json_data.items():
-        data.extend([(m, v["architecture"], v["model_id"], v["installation"], "❌" if not v["cpu"] else "✅", "✅", v["runtime_impl"],)])
+        data.extend([(m, v['architecture'], v['model_id'], v['installation'], '❌' if not v['cpu'] else '✅', '✅', v['runtime_impl'],)])
      column_widths = [
          int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 4),
      ]

      if len(data) == 0 and len(failed_initialized) > 0:
-        termui.echo("Exception found while parsing models:\n", fg="yellow")
+        termui.echo('Exception found while parsing models:\n', fg='yellow')
        for m, err in failed_initialized:
-          termui.echo(f"- {m}: ", fg="yellow", nl=False)
-          termui.echo(traceback.print_exception(None, err, None, limit=5), fg="red")  # type: ignore[func-returns-value]
+          termui.echo(f'- {m}: ', fg='yellow', nl=False)
+          termui.echo(traceback.print_exception(None, err, None, limit=5), fg='red')  # type: ignore[func-returns-value]
        sys.exit(1)

-      table = tabulate.tabulate(data, tablefmt="fancy_grid", headers=["LLM", "Architecture", "Models Id", "pip install", "CPU", "GPU", "Runtime"], maxcolwidths=column_widths)
-      termui.echo(table, fg="white")
+      table = tabulate.tabulate(data, tablefmt='fancy_grid', headers=['LLM', 'Architecture', 'Models Id', 'pip install', 'CPU', 'GPU', 'Runtime'], maxcolwidths=column_widths)
+      termui.echo(table, fg='white')

      if DEBUG and len(failed_initialized) > 0:
-        termui.echo("\nThe following models are supported but failed to initialize:\n")
+        termui.echo('\nThe following models are supported but failed to initialize:\n')
        for m, err in failed_initialized:
-          termui.echo(f"- {m}: ", fg="blue", nl=False)
-          termui.echo(err, fg="red")
+          termui.echo(f'- {m}: ', fg='blue', nl=False)
+          termui.echo(err, fg='red')

      if show_available:
        if len(ids_in_local_store) == 0:
-          termui.echo("No models available locally.")
+          termui.echo('No models available locally.')
          ctx.exit(0)
-        termui.echo("The following are available in local store:", fg="magenta")
-        termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg="white")
+        termui.echo('The following are available in local store:', fg='magenta')
+        termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
    else:
-      if show_available: json_data["local"] = local_models
-      termui.echo(orjson.dumps(json_data, option=orjson.OPT_INDENT_2,).decode(), fg="white")
+      if show_available: json_data['local'] = local_models
+      termui.echo(orjson.dumps(json_data, option=orjson.OPT_INDENT_2,).decode(), fg='white')
  ctx.exit(0)
@cli.command()
@model_name_argument(required=False)
-@click.option("-y", "--yes", "--assume-yes", is_flag=True, help="Skip confirmation when deleting a specific model")
-@click.option("--include-bentos/--no-include-bentos", is_flag=True, default=False, help="Whether to also include pruning bentos.")
+@click.option('-y', '--yes', '--assume-yes', is_flag=True, help='Skip confirmation when deleting a specific model')
+@click.option('--include-bentos/--no-include-bentos', is_flag=True, default=False, help='Whether to also include pruning bentos.')
@inject
 def prune_command(
    model_name: str | None, yes: bool, include_bentos: bool, model_store: ModelStore = Provide[BentoMLContainer.model_store], bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
 ) -> None:
-  """Remove all saved models, (and optionally bentos) built with OpenLLM locally.
+  '''Remove all saved models, (and optionally bentos) built with OpenLLM locally.

  \b
  If a model type is passed, then only prune models for that given model type.
-  """
+  '''
  available: list[tuple[bentoml.Model | bentoml.Bento,
-                        ModelStore | BentoStore]] = [(m, model_store) for m in bentoml.models.list() if "framework" in m.info.labels and m.info.labels["framework"] == "openllm"]
-  if model_name is not None: available = [(m, store) for m, store in available if "model_name" in m.info.labels and m.info.labels["model_name"] == inflection.underscore(model_name)]
+                        ModelStore | BentoStore]] = [(m, model_store) for m in bentoml.models.list() if 'framework' in m.info.labels and m.info.labels['framework'] == 'openllm']
+  if model_name is not None: available = [(m, store) for m, store in available if 'model_name' in m.info.labels and m.info.labels['model_name'] == inflection.underscore(model_name)]
  if include_bentos:
    if model_name is not None:
-      available += [(b, bento_store) for b in bentoml.bentos.list() if "start_name" in b.info.labels and b.info.labels["start_name"] == inflection.underscore(model_name)]
+      available += [(b, bento_store) for b in bentoml.bentos.list() if 'start_name' in b.info.labels and b.info.labels['start_name'] == inflection.underscore(model_name)]
    else:
-      available += [(b, bento_store) for b in bentoml.bentos.list() if "_type" in b.info.labels and "_framework" in b.info.labels]
+      available += [(b, bento_store) for b in bentoml.bentos.list() if '_type' in b.info.labels and '_framework' in b.info.labels]

  for store_item, store in available:
    if yes: delete_confirmed = True
    else: delete_confirmed = click.confirm(f"delete {'model' if isinstance(store, ModelStore) else 'bento'} {store_item.tag}?")
    if delete_confirmed:
      store.delete(store_item.tag)
-      termui.echo(f"{store_item} deleted from {'model' if isinstance(store, ModelStore) else 'bento'} store.", fg="yellow")
+      termui.echo(f"{store_item} deleted from {'model' if isinstance(store, ModelStore) else 'bento'} store.", fg='yellow')
 def parsing_instruction_callback(ctx: click.Context, param: click.Parameter, value: list[str] | str | None) -> tuple[str, bool | str] | list[str] | str | None:
  if value is None:
    return value
@@ -702,40 +702,40 @@ def parsing_instruction_callback(ctx: click.Context, param: click.Parameter, val
    # we only parse --text foo bar -> --text foo and omit bar
    value = value[-1]

-  key, *values = value.split("=")
-  if not key.startswith("--"):
-    raise click.BadParameter(f"Invalid option format: {value}")
+  key, *values = value.split('=')
+  if not key.startswith('--'):
+    raise click.BadParameter(f'Invalid option format: {value}')
  key = key[2:]
  if len(values) == 0:
    return key, True
  elif len(values) == 1:
    return key, values[0]
  else:
-    raise click.BadParameter(f"Invalid option format: {value}")
-def shared_client_options(f: _AnyCallable | None = None, output_value: t.Literal["json", "porcelain", "pretty"] = "pretty") -> t.Callable[[FC], FC]:
+    raise click.BadParameter(f'Invalid option format: {value}')
+def shared_client_options(f: _AnyCallable | None = None, output_value: t.Literal['json', 'porcelain', 'pretty'] = 'pretty') -> t.Callable[[FC], FC]:
  options = [
-      click.option("--endpoint", type=click.STRING, help="OpenLLM Server endpoint, i.e: http://localhost:3000", envvar="OPENLLM_ENDPOINT", default="http://localhost:3000",
+      click.option('--endpoint', type=click.STRING, help='OpenLLM Server endpoint, i.e: http://localhost:3000', envvar='OPENLLM_ENDPOINT', default='http://localhost:3000',
                   ),
-      click.option("--timeout", type=click.INT, default=30, help="Default server timeout", show_default=True),
+      click.option('--timeout', type=click.INT, default=30, help='Default server timeout', show_default=True),
      output_option(default_value=output_value),
  ]
  return compose(*options)(f) if f is not None else compose(*options)
@cli.command()
-@click.argument("task", type=click.STRING, metavar="TASK")
+@click.argument('task', type=click.STRING, metavar='TASK')
@shared_client_options
-@click.option("--agent", type=click.Choice(["hf"]), default="hf", help="Whether to interact with Agents from given Server endpoint.", show_default=True)
-@click.option("--remote", is_flag=True, default=False, help="Whether or not to use remote tools (inference endpoints) instead of local ones.", show_default=True)
+@click.option('--agent', type=click.Choice(['hf']), default='hf', help='Whether to interact with Agents from given Server endpoint.', show_default=True)
+@click.option('--remote', is_flag=True, default=False, help='Whether or not to use remote tools (inference endpoints) instead of local ones.', show_default=True)
@click.option(
-    "--opt",
+    '--opt',
    help="Define prompt options. "
    "(format: ``--opt text='I love this' --opt audio:./path/to/audio  --opt image:/path/to/file``)",
    required=False,
    multiple=True,
    callback=opt_callback,
-    metavar="ARG=VALUE[,ARG=VALUE]"
+    metavar='ARG=VALUE[,ARG=VALUE]'
 )
 def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output: LiteralOutput, remote: bool, task: str, _memoized: DictStrAny, **attrs: t.Any) -> str:
-  """Instruct agents interactively for given tasks, from a terminal.
+  '''Instruct agents interactively for given tasks, from a terminal.

  \b
  ```bash
@@ -743,92 +743,92 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output:
        "Is the following `text` (in Spanish) positive or negative?" \\
        --text "¡Este es un API muy agradable!"
  ```
-  """
+  '''
  client = openllm.client.HTTPClient(endpoint, timeout=timeout)

  try:
-    client.call("metadata")
+    client.call('metadata')
  except http.client.BadStatusLine:
-    raise click.ClickException(f"{endpoint} is neither a HTTP server nor reachable.") from None
-  if agent == "hf":
+    raise click.ClickException(f'{endpoint} is neither a HTTP server nor reachable.') from None
+  if agent == 'hf':
    if not is_transformers_supports_agent(): raise click.UsageError("Transformers version should be at least 4.29 to support HfAgent. Upgrade with 'pip install -U transformers'")
    _memoized = {k: v[0] for k, v in _memoized.items() if v}
    client._hf_agent.set_stream(logger.info)
-    if output != "porcelain": termui.echo(f"Sending the following prompt ('{task}') with the following vars: {_memoized}", fg="magenta")
+    if output != 'porcelain': termui.echo(f"Sending the following prompt ('{task}') with the following vars: {_memoized}", fg='magenta')
    result = client.ask_agent(task, agent_type=agent, return_code=False, remote=remote, **_memoized)
-    if output == "json": termui.echo(orjson.dumps(result, option=orjson.OPT_INDENT_2).decode(), fg="white")
-    else: termui.echo(result, fg="white")
+    if output == 'json': termui.echo(orjson.dumps(result, option=orjson.OPT_INDENT_2).decode(), fg='white')
+    else: termui.echo(result, fg='white')
    return result
  else:
-    raise click.BadOptionUsage("agent", f"Unknown agent type {agent}")
+    raise click.BadOptionUsage('agent', f'Unknown agent type {agent}')
@cli.command()
-@shared_client_options(output_value="json")
-@click.option("--server-type", type=click.Choice(["grpc", "http"]), help="Server type", default="http", show_default=True)
-@click.argument("text", type=click.STRING, nargs=-1)
+@shared_client_options(output_value='json')
+@click.option('--server-type', type=click.Choice(['grpc', 'http']), help='Server type', default='http', show_default=True)
+@click.argument('text', type=click.STRING, nargs=-1)
@machine_option
@click.pass_context
 def embed_command(
-    ctx: click.Context, text: tuple[str, ...], endpoint: str, timeout: int, server_type: t.Literal["http", "grpc"], output: LiteralOutput, machine: bool
+    ctx: click.Context, text: tuple[str, ...], endpoint: str, timeout: int, server_type: t.Literal['http', 'grpc'], output: LiteralOutput, machine: bool
 ) -> EmbeddingsOutput | None:
-  """Get embeddings interactively, from a terminal.
+  '''Get embeddings interactively, from a terminal.

  \b
  ```bash
  $ openllm embed --endpoint http://12.323.2.1:3000 "What is the meaning of life?" "How many stars are there in the sky?"
  ```
-  """
-  client = openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout)
+  '''
+  client = openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == 'http' else openllm.client.GrpcClient(endpoint, timeout=timeout)
  try:
    gen_embed = client.embed(text)
  except ValueError:
-    raise click.ClickException(f"Endpoint {endpoint} does not support embeddings.") from None
+    raise click.ClickException(f'Endpoint {endpoint} does not support embeddings.') from None
  if machine: return gen_embed
-  elif output == "pretty":
-    termui.echo("Generated embeddings: ", fg="magenta", nl=False)
-    termui.echo(gen_embed.embeddings, fg="white")
-    termui.echo("\nNumber of tokens: ", fg="magenta", nl=False)
-    termui.echo(gen_embed.num_tokens, fg="white")
-  elif output == "json":
-    termui.echo(orjson.dumps(bentoml_cattr.unstructure(gen_embed), option=orjson.OPT_INDENT_2).decode(), fg="white")
+  elif output == 'pretty':
+    termui.echo('Generated embeddings: ', fg='magenta', nl=False)
+    termui.echo(gen_embed.embeddings, fg='white')
+    termui.echo('\nNumber of tokens: ', fg='magenta', nl=False)
+    termui.echo(gen_embed.num_tokens, fg='white')
+  elif output == 'json':
+    termui.echo(orjson.dumps(bentoml_cattr.unstructure(gen_embed), option=orjson.OPT_INDENT_2).decode(), fg='white')
  else:
-    termui.echo(gen_embed.embeddings, fg="white")
+    termui.echo(gen_embed.embeddings, fg='white')
  ctx.exit(0)
@cli.command()
@shared_client_options
-@click.option("--server-type", type=click.Choice(["grpc", "http"]), help="Server type", default="http", show_default=True)
-@click.argument("prompt", type=click.STRING)
+@click.option('--server-type', type=click.Choice(['grpc', 'http']), help='Server type', default='http', show_default=True)
+@click.argument('prompt', type=click.STRING)
@click.option(
-    "--sampling-params", help="Define query options. (format: ``--opt temperature=0.8 --opt=top_k:12)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]"
+    '--sampling-params', help='Define query options. (format: ``--opt temperature=0.8 --opt=top_k:12)', required=False, multiple=True, callback=opt_callback, metavar='ARG=VALUE[,ARG=VALUE]'
 )
@click.pass_context
 def query_command(
-    ctx: click.Context, /, prompt: str, endpoint: str, timeout: int, server_type: t.Literal["http", "grpc"], output: LiteralOutput, _memoized: DictStrAny, **attrs: t.Any
+    ctx: click.Context, /, prompt: str, endpoint: str, timeout: int, server_type: t.Literal['http', 'grpc'], output: LiteralOutput, _memoized: DictStrAny, **attrs: t.Any
 ) -> None:
-  """Ask a LLM interactively, from a terminal.
+  '''Ask a LLM interactively, from a terminal.

  \b
  ```bash
  $ openllm query --endpoint http://12.323.2.1:3000 "What is the meaning of life?"
  ```
-  """
+  '''
  _memoized = {k: orjson.loads(v[0]) for k, v in _memoized.items() if v}
-  if server_type == "grpc": endpoint = re.sub(r"http://", "", endpoint)
-  client = openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout)
-  input_fg, generated_fg = "magenta", "cyan"
-  if output != "porcelain":
-    termui.echo("==Input==\n", fg="white")
-    termui.echo(f"{prompt}", fg=input_fg)
-  res = client.query(prompt, return_response="raw", **{**client.configuration, **_memoized})
-  if output == "pretty":
-    response = client.config.postprocess_generate(prompt, res["responses"])
-    termui.echo("\n\n==Responses==\n", fg="white")
+  if server_type == 'grpc': endpoint = re.sub(r'http://', '', endpoint)
+  client = openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == 'http' else openllm.client.GrpcClient(endpoint, timeout=timeout)
+  input_fg, generated_fg = 'magenta', 'cyan'
+  if output != 'porcelain':
+    termui.echo('==Input==\n', fg='white')
+    termui.echo(f'{prompt}', fg=input_fg)
+  res = client.query(prompt, return_response='raw', **{**client.configuration, **_memoized})
+  if output == 'pretty':
+    response = client.config.postprocess_generate(prompt, res['responses'])
+    termui.echo('\n\n==Responses==\n', fg='white')
    termui.echo(response, fg=generated_fg)
-  elif output == "json":
-    termui.echo(orjson.dumps(res, option=orjson.OPT_INDENT_2).decode(), fg="white")
+  elif output == 'json':
+    termui.echo(orjson.dumps(res, option=orjson.OPT_INDENT_2).decode(), fg='white')
  else:
-    termui.echo(res["responses"], fg="white")
+    termui.echo(res['responses'], fg='white')
  ctx.exit(0)
-@cli.group(cls=Extensions, hidden=True, name="extension")
+@cli.group(cls=Extensions, hidden=True, name='extension')
 def extension_command() -> None:
-  """Extension for OpenLLM CLI."""
-if __name__ == "__main__": cli()
+  '''Extension for OpenLLM CLI.'''
+if __name__ == '__main__': cli()
--- a/openllm-python/src/openllm/cli/extension/build_base_container.py
+++ b/openllm-python/src/openllm/cli/extension/build_base_container.py
@@ -4,9 +4,9 @@ from openllm.cli import termui
 from openllm.cli._factory import machine_option, container_registry_option
 if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
@click.command(
-    "build_base_container",
+    'build_base_container',
    context_settings=termui.CONTEXT_SETTINGS,
-    help="""Base image builder for BentoLLM.
+    help='''Base image builder for BentoLLM.

                By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
                Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.
@@ -16,13 +16,13 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralContainerRegi
                This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels.

                Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
-                """
+                '''
 )
@container_registry_option
-@click.option("--version-strategy", type=click.Choice(["release", "latest", "nightly"]), default="nightly", help="Version strategy to use for tagging the image.")
-@click.option("--push/--no-push", help="Whether to push to remote repository", is_flag=True, default=False)
+@click.option('--version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='nightly', help='Version strategy to use for tagging the image.')
+@click.option('--push/--no-push', help='Whether to push to remote repository', is_flag=True, default=False)
@machine_option
 def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
  mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine)
-  if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg="white")
+  if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
  return mapping
--- a/openllm-python/src/openllm/cli/extension/dive_bentos.py
+++ b/openllm-python/src/openllm/cli/extension/dive_bentos.py
@@ -7,21 +7,21 @@ from openllm.cli import termui
 from openllm.cli._factory import bento_complete_envvar, machine_option

 if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore
-@click.command("dive_bentos", context_settings=termui.CONTEXT_SETTINGS)
-@click.argument("bento", type=str, shell_complete=bento_complete_envvar)
+@click.command('dive_bentos', context_settings=termui.CONTEXT_SETTINGS)
+@click.argument('bento', type=str, shell_complete=bento_complete_envvar)
@machine_option
@click.pass_context
@inject
 def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
-  """Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path)."""
+  '''Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path).'''
  try:
    bentomodel = _bento_store.get(bento)
  except bentoml.exceptions.NotFound:
-    ctx.fail(f"Bento {bento} not found. Make sure to call `openllm build` first.")
-  if "bundler" not in bentomodel.info.labels or bentomodel.info.labels["bundler"] != "openllm.bundle":
+    ctx.fail(f'Bento {bento} not found. Make sure to call `openllm build` first.')
+  if 'bundler' not in bentomodel.info.labels or bentomodel.info.labels['bundler'] != 'openllm.bundle':
    ctx.fail(f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness.")
  if machine: return bentomodel.path
  # copy and paste this into a new shell
-  if psutil.WINDOWS: subprocess.check_call([shutil.which("dir") or "dir"], cwd=bentomodel.path)
-  else: subprocess.check_call([shutil.which("ls") or "ls", "-Rrthla"], cwd=bentomodel.path)
+  if psutil.WINDOWS: subprocess.check_call([shutil.which('dir') or 'dir'], cwd=bentomodel.path)
+  else: subprocess.check_call([shutil.which('ls') or 'ls', '-Rrthla'], cwd=bentomodel.path)
  ctx.exit(0)
--- a/openllm-python/src/openllm/cli/extension/get_containerfile.py
+++ b/openllm-python/src/openllm/cli/extension/get_containerfile.py
@@ -10,17 +10,17 @@ from openllm.cli._factory import bento_complete_envvar
 from openllm_core.utils import bentoml_cattr

 if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore
-@click.command("get_containerfile", context_settings=termui.CONTEXT_SETTINGS, help="Return Containerfile of any given Bento.")
-@click.argument("bento", type=str, shell_complete=bento_complete_envvar)
+@click.command('get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.')
+@click.argument('bento', type=str, shell_complete=bento_complete_envvar)
@click.pass_context
@inject
 def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str:
  try:
    bentomodel = _bento_store.get(bento)
  except bentoml.exceptions.NotFound:
-    ctx.fail(f"Bento {bento} not found. Make sure to call `openllm build` first.")
+    ctx.fail(f'Bento {bento} not found. Make sure to call `openllm build` first.')
  # The logic below are similar to bentoml._internal.container.construct_containerfile
-  with open(bentomodel.path_of("bento.yaml"), "r") as f:
+  with open(bentomodel.path_of('bento.yaml'), 'r') as f:
    options = BentoInfo.from_yaml_file(f)
    # NOTE: dockerfile_template is already included in the
    # Dockerfile inside bento, and it is not relevant to
@@ -30,7 +30,7 @@ def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[Bento
    # NOTE: if users specify a dockerfile_template, we will
    # save it to /env/docker/Dockerfile.template. This is necessary
    # for the reconstruction of the Dockerfile.
-    if "dockerfile_template" in docker_attrs and docker_attrs["dockerfile_template"] is not None: docker_attrs["dockerfile_template"] = "env/docker/Dockerfile.template"
+    if 'dockerfile_template' in docker_attrs and docker_attrs['dockerfile_template'] is not None: docker_attrs['dockerfile_template'] = 'env/docker/Dockerfile.template'
    doc = generate_containerfile(docker=DockerOptions(**docker_attrs), build_ctx=bentomodel.path, conda=options.conda, bento_fs=bentomodel._fs, enable_buildkit=True, add_header=True)
-    termui.echo(doc, fg="white")
+    termui.echo(doc, fg='white')
  return bentomodel.path
--- a/openllm-python/src/openllm/cli/extension/get_prompt.py
+++ b/openllm-python/src/openllm/cli/extension/get_prompt.py
@@ -4,46 +4,46 @@ from bentoml_cli.utils import opt_callback
 from openllm.cli import termui
 from openllm.cli._factory import model_complete_envvar, output_option, machine_option
 from openllm_core._prompt import process_prompt
-LiteralOutput = t.Literal["json", "pretty", "porcelain"]
-@click.command("get_prompt", context_settings=termui.CONTEXT_SETTINGS)
-@click.argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar)
-@click.argument("prompt", type=click.STRING)
+LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
+@click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS)
+@click.argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar)
+@click.argument('prompt', type=click.STRING)
@output_option
-@click.option("--format", type=click.STRING, default=None)
+@click.option('--format', type=click.STRING, default=None)
@machine_option
@click.option(
-    "--opt",
+    '--opt',
    help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)",
    required=False,
    multiple=True,
    callback=opt_callback,
-    metavar="ARG=VALUE[,ARG=VALUE]"
+    metavar='ARG=VALUE[,ARG=VALUE]'
 )
@click.pass_context
 def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
-  """Get the default prompt used by OpenLLM."""
+  '''Get the default prompt used by OpenLLM.'''
  module = openllm.utils.EnvVarMixin(model_name).module
  _memoized = {k: v[0] for k, v in _memoized.items() if v}
  try:
-    template = getattr(module, "DEFAULT_PROMPT_TEMPLATE", None)
-    prompt_mapping = getattr(module, "PROMPT_MAPPING", None)
-    if template is None: raise click.BadArgumentUsage(f"model {model_name} does not have a default prompt template") from None
+    template = getattr(module, 'DEFAULT_PROMPT_TEMPLATE', None)
+    prompt_mapping = getattr(module, 'PROMPT_MAPPING', None)
+    if template is None: raise click.BadArgumentUsage(f'model {model_name} does not have a default prompt template') from None
    if callable(template):
      if format is None:
-        if not hasattr(module, "PROMPT_MAPPING") or module.PROMPT_MAPPING is None: raise RuntimeError("Failed to find prompt mapping while DEFAULT_PROMPT_TEMPLATE is a function.")
-        raise click.BadOptionUsage("format", f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})")
-      if prompt_mapping is None: raise click.BadArgumentUsage(f"Failed to fine prompt mapping while the default prompt for {model_name} is a callable.") from None
-      if format not in prompt_mapping: raise click.BadOptionUsage("format", f"Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})")
+        if not hasattr(module, 'PROMPT_MAPPING') or module.PROMPT_MAPPING is None: raise RuntimeError('Failed to find prompt mapping while DEFAULT_PROMPT_TEMPLATE is a function.')
+        raise click.BadOptionUsage('format', f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})")
+      if prompt_mapping is None: raise click.BadArgumentUsage(f'Failed to fine prompt mapping while the default prompt for {model_name} is a callable.') from None
+      if format not in prompt_mapping: raise click.BadOptionUsage('format', f'Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})')
      _prompt_template = template(format)
    else:
      _prompt_template = template
    fully_formatted = process_prompt(prompt, _prompt_template, True, **_memoized)
    if machine: return repr(fully_formatted)
-    elif output == "porcelain": termui.echo(repr(fully_formatted), fg="white")
-    elif output == "json": termui.echo(orjson.dumps({"prompt": fully_formatted}, option=orjson.OPT_INDENT_2).decode(), fg="white")
+    elif output == 'porcelain': termui.echo(repr(fully_formatted), fg='white')
+    elif output == 'json': termui.echo(orjson.dumps({'prompt': fully_formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white')
    else:
-      termui.echo(f"== Prompt for {model_name} ==\n", fg="magenta")
-      termui.echo(fully_formatted, fg="white")
+      termui.echo(f'== Prompt for {model_name} ==\n', fg='magenta')
+      termui.echo(fully_formatted, fg='white')
  except AttributeError:
-    raise click.ClickException(f"Failed to determine a default prompt template for {model_name}.") from None
+    raise click.ClickException(f'Failed to determine a default prompt template for {model_name}.') from None
  ctx.exit(0)
--- a/openllm-python/src/openllm/cli/extension/list_bentos.py
+++ b/openllm-python/src/openllm/cli/extension/list_bentos.py
@@ -3,30 +3,30 @@ import click, inflection, orjson, bentoml, openllm
 from bentoml._internal.utils import human_readable_size
 from openllm.cli import termui
 from openllm.cli._factory import LiteralOutput, output_option
-@click.command("list_bentos", context_settings=termui.CONTEXT_SETTINGS)
-@output_option(default_value="json")
+@click.command('list_bentos', context_settings=termui.CONTEXT_SETTINGS)
+@output_option(default_value='json')
@click.pass_context
 def cli(ctx: click.Context, output: LiteralOutput) -> None:
-  """List available bentos built by OpenLLM."""
+  '''List available bentos built by OpenLLM.'''
  mapping = {
      k: [{
-          "tag": str(b.tag),
-          "size": human_readable_size(openllm.utils.calc_dir_size(b.path)),
-          "models": [{
-              "tag": str(m.tag), "size": human_readable_size(openllm.utils.calc_dir_size(m.path))
+          'tag': str(b.tag),
+          'size': human_readable_size(openllm.utils.calc_dir_size(b.path)),
+          'models': [{
+              'tag': str(m.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
          } for m in (bentoml.models.get(_.tag) for _ in b.info.models)]
-      } for b in tuple(i for i in bentoml.list() if all(k in i.info.labels for k in {"start_name", "bundler"})) if b.info.labels["start_name"] == k] for k in tuple(
+      } for b in tuple(i for i in bentoml.list() if all(k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k] for k in tuple(
          inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()
      )
  }
  mapping = {k: v for k, v in mapping.items() if v}
-  if output == "pretty":
+  if output == 'pretty':
    import tabulate
    tabulate.PRESERVE_WHITESPACE = True
    termui.echo(
-        tabulate.tabulate([(k, i["tag"], i["size"], [_["tag"] for _ in i["models"]]) for k, v in mapping.items() for i in v], tablefmt="fancy_grid", headers=["LLM", "Tag", "Size", "Models"]),
-        fg="white"
+        tabulate.tabulate([(k, i['tag'], i['size'], [_['tag'] for _ in i['models']]) for k, v in mapping.items() for i in v], tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size', 'Models']),
+        fg='white'
    )
  else:
-    termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg="white")
+    termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
  ctx.exit(0)
--- a/openllm-python/src/openllm/cli/extension/list_models.py
+++ b/openllm-python/src/openllm/cli/extension/list_models.py
@@ -5,24 +5,24 @@ from bentoml._internal.utils import human_readable_size
 from openllm.cli._factory import LiteralOutput, model_name_argument, output_option, model_complete_envvar

 if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny
-@click.command("list_models", context_settings=termui.CONTEXT_SETTINGS)
+@click.command('list_models', context_settings=termui.CONTEXT_SETTINGS)
@model_name_argument(required=False, shell_complete=model_complete_envvar)
-@output_option(default_value="json")
+@output_option(default_value='json')
 def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
-  """This is equivalent to openllm models --show-available less the nice table."""
+  '''This is equivalent to openllm models --show-available less the nice table.'''
  models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
  ids_in_local_store = {
-      k: [i for i in bentoml.models.list() if "framework" in i.info.labels and i.info.labels["framework"] == "openllm" and "model_name" in i.info.labels and i.info.labels["model_name"] == k
+      k: [i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k
          ] for k in models
  }
  if model_name is not None:
-    ids_in_local_store = {k: [i for i in v if "model_name" in i.info.labels and i.info.labels["model_name"] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items()}
+    ids_in_local_store = {k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items()}
  ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
-  local_models = {k: [{"tag": str(i.tag), "size": human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()}
-  if output == "pretty":
+  local_models = {k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()}
+  if output == 'pretty':
    import tabulate
    tabulate.PRESERVE_WHITESPACE = True
-    termui.echo(tabulate.tabulate([(k, i["tag"], i["size"]) for k, v in local_models.items() for i in v], tablefmt="fancy_grid", headers=["LLM", "Tag", "Size"]), fg="white")
+    termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v], tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size']), fg='white')
  else:
-    termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg="white")
+    termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
  return local_models
--- a/openllm-python/src/openllm/cli/extension/playground.py
+++ b/openllm-python/src/openllm/cli/extension/playground.py
@@ -9,13 +9,13 @@ if t.TYPE_CHECKING:
  from openllm_core._typing_compat import DictStrAny
 logger = logging.getLogger(__name__)
 def load_notebook_metadata() -> DictStrAny:
-  with open(os.path.join(os.path.dirname(playground.__file__), "_meta.yml"), "r") as f:
+  with open(os.path.join(os.path.dirname(playground.__file__), '_meta.yml'), 'r') as f:
    content = yaml.safe_load(f)
-  if not all("description" in k for k in content.values()): raise ValueError("Invalid metadata file. All entries must have a 'description' key.")
+  if not all('description' in k for k in content.values()): raise ValueError("Invalid metadata file. All entries must have a 'description' key.")
  return content
-@click.command("playground", context_settings=termui.CONTEXT_SETTINGS)
-@click.argument("output-dir", default=None, required=False)
-@click.option("--port", envvar="JUPYTER_PORT", show_envvar=True, show_default=True, default=8888, help="Default port for Jupyter server")
+@click.command('playground', context_settings=termui.CONTEXT_SETTINGS)
+@click.argument('output-dir', default=None, required=False)
+@click.option('--port', envvar='JUPYTER_PORT', show_envvar=True, show_default=True, default=8888, help='Default port for Jupyter server')
@click.pass_context
 def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  """OpenLLM Playground.
@@ -41,27 +41,27 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  _temp_dir = False
  if output_dir is None:
    _temp_dir = True
-    output_dir = tempfile.mkdtemp(prefix="openllm-playground-")
+    output_dir = tempfile.mkdtemp(prefix='openllm-playground-')
  else:
    os.makedirs(os.path.abspath(os.path.expandvars(os.path.expanduser(output_dir))), exist_ok=True)

-  termui.echo("The playground notebooks will be saved to: " + os.path.abspath(output_dir), fg="blue")
+  termui.echo('The playground notebooks will be saved to: ' + os.path.abspath(output_dir), fg='blue')
  for module in pkgutil.iter_modules(playground.__path__):
-    if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + ".ipynb")):
-      logger.debug("Skipping: %s (%s)", module.name, "File already exists" if not module.ispkg else f"{module.name} is a module")
+    if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + '.ipynb')):
+      logger.debug('Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module')
      continue
    if not isinstance(module.module_finder, importlib.machinery.FileFinder): continue
-    termui.echo("Generating notebook for: " + module.name, fg="magenta")
-    markdown_cell = nbformat.v4.new_markdown_cell(metadata[module.name]["description"])
-    f = jupytext.read(os.path.join(module.module_finder.path, module.name + ".py"))
+    termui.echo('Generating notebook for: ' + module.name, fg='magenta')
+    markdown_cell = nbformat.v4.new_markdown_cell(metadata[module.name]['description'])
+    f = jupytext.read(os.path.join(module.module_finder.path, module.name + '.py'))
    f.cells.insert(0, markdown_cell)
-    jupytext.write(f, os.path.join(output_dir, module.name + ".ipynb"), fmt="notebook")
+    jupytext.write(f, os.path.join(output_dir, module.name + '.ipynb'), fmt='notebook')
  try:
-    subprocess.check_output([sys.executable, "-m", "jupyter", "notebook", "--notebook-dir", output_dir, "--port", str(port), "--no-browser", "--debug"])
+    subprocess.check_output([sys.executable, '-m', 'jupyter', 'notebook', '--notebook-dir', output_dir, '--port', str(port), '--no-browser', '--debug'])
  except subprocess.CalledProcessError as e:
-    termui.echo(e.output, fg="red")
-    raise click.ClickException(f"Failed to start a jupyter server:\n{e}") from None
+    termui.echo(e.output, fg='red')
+    raise click.ClickException(f'Failed to start a jupyter server:\n{e}') from None
  except KeyboardInterrupt:
-    termui.echo("\nShutting down Jupyter server...", fg="yellow")
-    if _temp_dir: termui.echo("Note: You can access the generated notebooks in: " + output_dir, fg="blue")
+    termui.echo('\nShutting down Jupyter server...', fg='yellow')
+    if _temp_dir: termui.echo('Note: You can access the generated notebooks in: ' + output_dir, fg='blue')
  ctx.exit(0)
--- a/openllm-python/src/openllm/cli/termui.py
+++ b/openllm-python/src/openllm/cli/termui.py
@@ -1,9 +1,9 @@
 from __future__ import annotations
 import os, typing as t, click, inflection, openllm
 if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny
-def echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.Any) -> None:
-  attrs["fg"] = fg if not openllm.utils.get_debug_mode() else None
+def echo(text: t.Any, fg: str = 'green', _with_style: bool = True, **attrs: t.Any) -> None:
+  attrs['fg'] = fg if not openllm.utils.get_debug_mode() else None
  if not openllm.utils.get_quiet_mode(): t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(text, **attrs)
-COLUMNS: int = int(os.environ.get("COLUMNS", str(120)))
-CONTEXT_SETTINGS: DictStrAny = {"help_option_names": ["-h", "--help"], "max_content_width": COLUMNS, "token_normalize_func": inflection.underscore}
-__all__ = ["echo", "COLUMNS", "CONTEXT_SETTINGS"]
+COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
+CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore}
+__all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS']