chore: ignore new lines split [skip ci]

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
Authored by aarnphm-ec2-dev on 2023-09-01 17:00:49 +00:00
parent 608de0b667
commit 7d893e6cd2
70 changed files with 575 additions and 950 deletions
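
The hunks below all follow the same pattern: call sites and comprehensions that were previously split across several lines are joined back onto one line, while long function signatures are split one parameter per line. The formatter configuration driving this is not part of the excerpt, so the snippet below is only a rough sketch of how such a style change can be reproduced with a formatter such as yapf; whether the project actually uses yapf, and these exact style values, are assumptions.

# Hedged sketch: reformat a snippet with yapf using assumed style knobs.
# The 'column_limit' and 'indent_width' values are illustrative only.
from yapf.yapflib.yapf_api import FormatCode

SRC = (
    "def f(a,\n"
    "      b):\n"
    "  return [a,\n"
    "          b]\n"
)
formatted, changed = FormatCode(SRC, style_config={'based_on_style': 'google', 'column_limit': 192, 'indent_width': 2})
print(formatted)  # the argument list and the list literal are joined onto single lines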

@@ -43,35 +43,29 @@ _AnyCallable = t.Callable[..., t.Any]
FC = t.TypeVar('FC', bound=t.Union[_AnyCallable, click.Command])
def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
-return [
-sc.CompletionItem(str(it.tag), help='Bento')
-for it in bentoml.list()
-if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})
-]
+return [sc.CompletionItem(str(it.tag), help='Bento') for it in bentoml.list() if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})]
def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]
-def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool,
-environ: DictStrAny) -> DictStrAny:
+def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
# TODO: Support amd.com/gpu on k8s
_bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
_bentoml_config_options_opts = [
-'tracing.sample_rate=1.0', f'api_server.traffic.timeout={server_timeout}',
+'tracing.sample_rate=1.0',
+f'api_server.traffic.timeout={server_timeout}',
f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}'
]
if device:
if len(device) > 1:
-_bentoml_config_options_opts.extend(
-[f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
+_bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
else:
_bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
_bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
if cors:
_bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
-_bentoml_config_options_opts.extend(
-[f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])])
+_bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])])
_bentoml_config_options_env += ' ' if _bentoml_config_options_env else '' + ' '.join(_bentoml_config_options_opts)
environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
if DEBUG: logger.debug('Setting BENTOML_CONFIG_OPTIONS=%s', _bentoml_config_options_env)
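
parse_config_options only assembles BentoML config overrides into the BENTOML_CONFIG_OPTIONS environment variable before the server process is launched. As a rough, hedged illustration (the 'flan-t5' start_name, the timeout values, the workers_per_resource of 1.0 and the two GPU ids are made up, and several entries such as the CORS method list and the embedding runner CPU setting are omitted), the resulting value looks roughly like:

# Illustrative only: an approximation of the string stored in BENTOML_CONFIG_OPTIONS.
expected = ' '.join([
    'tracing.sample_rate=1.0',
    'api_server.traffic.timeout=360',
    'runners."llm-flan-t5-runner".traffic.timeout=3600',
    'runners."llm-flan-t5-runner".workers_per_resource=1.0',
    'runners."llm-flan-t5-runner".resources."nvidia.com/gpu"[0]=0',
    'runners."llm-flan-t5-runner".resources."nvidia.com/gpu"[1]=1',
    'api_server.http.cors.enabled=true',
    'api_server.http.cors.access_control_allow_origins="*"',
])
# environ['BENTOML_CONFIG_OPTIONS'] is then set to this string, and BentoML reads it
# when the API server and its runners start.
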
@@ -123,18 +117,27 @@ Available official model_id(s): [default: {llm_config['default_id']}]
if llm_config['requires_gpu'] and openllm.utils.device_count() < 1:
# NOTE: The model requires GPU, therefore we will return a dummy command
command_attrs.update({
-'short_help': '(Disabled because there is no GPU available)',
-'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
+'short_help': '(Disabled because there is no GPU available)', 'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
})
return noop_command(group, llm_config, _serve_grpc, **command_attrs)
@group.command(**command_attrs)
@start_decorator(llm_config, serve_grpc=_serve_grpc)
@click.pass_context
-def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None,
-workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...],
-quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend, serialisation_format: t.Literal['safetensors', 'legacy'],
-cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
+def start_cmd(ctx: click.Context,
+/,
+server_timeout: int,
+model_id: str | None,
+model_version: str | None,
+workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString,
+device: t.Tuple[str, ...],
+quantize: t.Literal['int8', 'int4', 'gptq'] | None,
+backend: LiteralBackend,
+serialisation_format: t.Literal['safetensors', 'legacy'],
+cors: bool,
+adapter_id: str | None,
+return_process: bool,
+**attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
termui.echo(
@@ -202,8 +205,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
def next_step(model_name: str, adapter_map: DictStrAny | None) -> None:
cmd_name = f'openllm build {model_name}'
if adapter_map is not None:
-cmd_name += ' ' + ' '.join(
-[f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
+cmd_name += ' ' + ' '.join([f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
if not openllm.utils.get_quiet_mode():
termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue')
@@ -242,11 +244,15 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, *
def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
def wrapper(fn: FC) -> t.Callable[[FC], FC]:
composed = openllm.utils.compose(
-llm_config.to_click_options, _http_server_args if not serve_grpc else _grpc_server_args,
+llm_config.to_click_options,
+_http_server_args if not serve_grpc else _grpc_server_args,
cog.optgroup.group('General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
-model_id_option(factory=cog.optgroup), model_version_option(factory=cog.optgroup),
+model_id_option(factory=cog.optgroup),
+model_version_option(factory=cog.optgroup),
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
-workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup), backend_option(factory=cog.optgroup),
+workers_per_resource_option(factory=cog.optgroup),
+cors_option(factory=cog.optgroup),
+backend_option(factory=cog.optgroup),
cog.optgroup.group('LLM Optimization Options',
help='''Optimization related options.
@@ -257,7 +263,9 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
- DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
''',
-), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup),
+),
+quantize_option(factory=cog.optgroup),
+serialisation_option(factory=cog.optgroup),
cog.optgroup.option('--device',
type=openllm.utils.dantic.CUDA,
multiple=True,
@@ -375,32 +383,16 @@ def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput
**attrs)(f)
def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-return cli_option('--cors/--no-cors',
-show_default=True,
-default=False,
-envvar='OPENLLM_CORS',
-show_envvar=True,
-help='Enable CORS for the server.',
-**attrs)(f)
+return cli_option('--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs)(f)
def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f)
def model_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-return cli_option('--model-id',
-type=click.STRING,
-default=None,
-envvar='OPENLLM_MODEL_ID',
-show_envvar=True,
-help='Optional model_id name or path for (fine-tune) weight.',
-**attrs)(f)
+return cli_option('--model-id', type=click.STRING, default=None, envvar='OPENLLM_MODEL_ID', show_envvar=True, help='Optional model_id name or path for (fine-tune) weight.', **attrs)(f)
def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-return cli_option('--model-version',
-type=click.STRING,
-default=None,
-help='Optional model version to save for this model. It will be inferred automatically from model-id.',
-**attrs)(f)
+return cli_option('--model-version', type=click.STRING, default=None, help='Optional model version to save for this model. It will be inferred automatically from model-id.', **attrs)(f)
def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
# NOTE: LiteralBackend needs to remove the last two item as ggml and mlc is wip
@@ -512,8 +504,7 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va
try:
float(value) # type: ignore[arg-type]
except ValueError:
-raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx,
-param) from None
+raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param) from None
else:
return value
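
workers_per_resource_callback accepts either one of the named strategies or any value that parses as a float. A minimal, standalone sketch of the same rule outside of click (the strategy names are taken from the Literal annotation on start_cmd above; treating them as the contents of _wpr_strategies is an assumption):

# Standalone sketch of the validation performed by workers_per_resource_callback.
_wpr_strategies = {'conserved', 'round_robin'}

def validate_workers_per_resource(value: str) -> str:
  if value in _wpr_strategies:
    return value
  try:
    float(value)  # e.g. '0.5' or '2'
  except ValueError:
    raise ValueError(f"'workers_per_resource' only accepts {_wpr_strategies} or a float-like value") from None
  return value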

@@ -83,10 +83,7 @@ def _start(model_name: str,
from .entrypoint import start_command
from .entrypoint import start_grpc_command
llm_config = openllm.AutoConfig.for_model(model_name)
-_ModelEnv = openllm_core.utils.EnvVarMixin(model_name,
-backend=openllm_core.utils.first_not_none(backend, default=llm_config.default_backend()),
-model_id=model_id,
-quantize=quantize)
+_ModelEnv = openllm_core.utils.EnvVarMixin(model_name, backend=openllm_core.utils.first_not_none(backend, default=llm_config.default_backend()), model_id=model_id, quantize=quantize)
os.environ[_ModelEnv.backend] = _ModelEnv['backend_value']
args: list[str] = []
@@ -102,9 +99,7 @@ def _start(model_name: str,
if additional_args: args.extend(additional_args)
if __test__: args.append('--return-process')
-return start_command_factory(start_command if not _serve_grpc else start_grpc_command,
-model_name,
-_context_settings=termui.CONTEXT_SETTINGS,
+return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS,
_serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False)
@inject
@@ -199,9 +194,7 @@ def _build(model_name: str,
raise OpenLLMException(str(e)) from None
matched = re.match(r'__tag__:([^:\n]+:[^:\n]+)$', output.decode('utf-8').strip())
if matched is None:
-raise ValueError(
-f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub."
-)
+raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.")
return bentoml.get(matched.group(1), _bento_store=bento_store)
def _import_model(model_name: str,
@@ -256,6 +249,5 @@ def _list_models() -> dict[str, t.Any]:
return models_command.main(args=['-o', 'json', '--show-available', '--machine'], standalone_mode=False)
start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(
-_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(
-_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
+_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
__all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models']
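
gen_sdk exposes the private helpers as the public top-level API, so _start, _build and friends become openllm.start, openllm.start_grpc, openllm.build, openllm.import_model and openllm.list_models. A hedged usage sketch (the model name and model_id are examples only, and it is an assumption that the generated wrappers accept the same keyword names as the private signatures above):

import openllm

openllm.start('opt', model_id='facebook/opt-1.3b')  # launches the HTTP server; start_grpc is the gRPC variant
bento = openllm.build('opt')                         # returns the bentoml.Bento found in the local store
print(bento.tag)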

@@ -28,14 +28,10 @@ if t.TYPE_CHECKING:
Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
''')
@container_registry_option
-@click.option('--version-strategy',
-type=click.Choice(['release', 'latest', 'nightly']),
-default='nightly',
-help='Version strategy to use for tagging the image.')
+@click.option('--version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='nightly', help='Version strategy to use for tagging the image.')
@click.option('--push/--no-push', help='Whether to push to remote repository', is_flag=True, default=False)
@machine_option
-def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool,
-machine: bool) -> dict[str, str]:
+def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine)
if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
return mapping
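
This extension command is a thin wrapper over openllm.bundle.build_container, which receives the registry selection, version strategy, push flag and machine flag and returns a mapping of built image names. A hedged sketch of calling it directly (the 'ghcr' registry literal and the shape of the returned mapping are assumptions):

import openllm

# Build the base container locally without pushing; print whatever mapping comes back.
mapping = openllm.bundle.build_container(('ghcr',), 'nightly', False, False)
for registry, image_tag in mapping.items():
  print(registry, '->', image_tag)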

@@ -31,9 +31,7 @@ def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore
except bentoml.exceptions.NotFound:
ctx.fail(f'Bento {bento} not found. Make sure to call `openllm build` first.')
if 'bundler' not in bentomodel.info.labels or bentomodel.info.labels['bundler'] != 'openllm.bundle':
-ctx.fail(
-f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness."
-)
+ctx.fail(f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness.")
if machine: return bentomodel.path
# copy and paste this into a new shell
if psutil.WINDOWS: subprocess.check_call([shutil.which('dir') or 'dir'], cwd=bentomodel.path)

@@ -41,11 +41,6 @@ def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[Bento
# for the reconstruction of the Dockerfile.
if 'dockerfile_template' in docker_attrs and docker_attrs['dockerfile_template'] is not None:
docker_attrs['dockerfile_template'] = 'env/docker/Dockerfile.template'
-doc = generate_containerfile(docker=DockerOptions(**docker_attrs),
-build_ctx=bentomodel.path,
-conda=options.conda,
-bento_fs=bentomodel._fs,
-enable_buildkit=True,
-add_header=True)
+doc = generate_containerfile(docker=DockerOptions(**docker_attrs), build_ctx=bentomodel.path, conda=options.conda, bento_fs=bentomodel._fs, enable_buildkit=True, add_header=True)
termui.echo(doc, fg='white')
return bentomodel.path

@@ -18,9 +18,7 @@ from openllm_core._prompt import process_prompt
LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
@click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS)
-@click.argument('model_name',
-type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]),
-shell_complete=model_complete_envvar)
+@click.argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar)
@click.argument('prompt', type=click.STRING)
@output_option
@click.option('--format', type=click.STRING, default=None)
@@ -32,8 +30,7 @@ LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
callback=opt_callback,
metavar='ARG=VALUE[,ARG=VALUE]')
@click.pass_context
-def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any],
-**_: t.Any) -> str | None:
+def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
'''Get the default prompt used by OpenLLM.'''
module = openllm.utils.EnvVarMixin(model_name).module
_memoized = {k: v[0] for k, v in _memoized.items() if v}

@@ -22,17 +22,10 @@ def cli(ctx: click.Context, output: LiteralOutput) -> None:
'tag': str(b.tag),
'size': human_readable_size(openllm.utils.calc_dir_size(b.path)),
'models': [{
-'tag': str(m.tag),
-'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
-}
-for m in (bentoml.models.get(_.tag)
-for _ in b.info.models)]
-}
-for b in tuple(i
-for i in bentoml.list()
-if all(k in i.info.labels
-for k in {'start_name', 'bundler'}))
-if b.info.labels['start_name'] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
+'tag': str(m.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
+} for m in (bentoml.models.get(_.tag) for _ in b.info.models)]
+} for b in tuple(i for i in bentoml.list() if all(
+k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
}
mapping = {k: v for k, v in mapping.items() if v}
if output == 'pretty':
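
The comprehension above groups locally built Bentos by their start_name label, attaching the size of each Bento and of the models it references. The JSON that ultimately gets printed has roughly this shape (all tags and sizes below are invented for illustration):

# Illustrative structure only; keys are dasherized model names that have at least one Bento.
mapping = {
  'flan-t5': [{
      'tag': 'flan-t5-service:abc123',
      'size': '1.2 GiB',
      'models': [{'tag': 'pt-flan-t5-large:def456', 'size': '3.0 GiB'}],
  }],
}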

@@ -25,30 +25,17 @@ def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
'''This is equivalent to openllm models --show-available less the nice table.'''
models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
ids_in_local_store = {
-k: [
-i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and
-'model_name' in i.info.labels and i.info.labels['model_name'] == k
-] for k in models
+k: [i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k]
+for k in models
}
if model_name is not None:
-ids_in_local_store = {
-k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)]
-for k, v in ids_in_local_store.items()
-}
+ids_in_local_store = {k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items()}
ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
-local_models = {
-k: [{
-'tag': str(i.tag),
-'size': human_readable_size(openllm.utils.calc_dir_size(i.path))
-} for i in val] for k, val in ids_in_local_store.items()
-}
+local_models = {k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()}
if output == 'pretty':
import tabulate
tabulate.PRESERVE_WHITESPACE = True
-termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v],
-tablefmt='fancy_grid',
-headers=['LLM', 'Tag', 'Size']),
-fg='white')
+termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v], tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size']), fg='white')
else:
termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
return local_models
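
For the 'pretty' path, the table is produced by tabulate with the 'fancy_grid' format and the LLM/Tag/Size headers shown above. A minimal, self-contained sketch of the same call with invented rows:

import tabulate

rows = [('flan-t5', 'pt-flan-t5-large:def456', '3.0 GiB')]
print(tabulate.tabulate(rows, tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size']))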