fix(base-image): update base image to include cuda for now (#720)
* fix(base-image): update base image to include cuda for now

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* fix: build core and client on release images

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: cleanup style changes

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
@@ -146,7 +146,7 @@ def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC
     backend_option(factory=cog.optgroup),
     cog.optgroup.group(
       'LLM Optimization Options',
-      help="""Optimization related options.
+      help='''Optimization related options.

       OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.

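The `cog.optgroup` calls above come from click-option-group, which OpenLLM uses to render related flags under a shared heading in `--help` output. A minimal sketch of that pattern, using the library's documented decorators; the flag names and choices below are illustrative, not OpenLLM's actual options:

```python
# Minimal click-option-group sketch (flag names are illustrative only).
from __future__ import annotations
import click
from click_option_group import optgroup

@click.command()
@optgroup.group('LLM Optimization Options', help='Optimization related options.')
@optgroup.option('--quantize', type=click.Choice(['int8', 'int4', 'gptq']), default=None)
@optgroup.option('--backend', type=click.Choice(['pt', 'vllm']), default='pt')
def start(quantize: str | None, backend: str) -> None:
  # Grouped flags arrive as ordinary keyword arguments.
  click.echo(f'backend={backend}, quantize={quantize}')

if __name__ == '__main__':
  start()
```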
@@ -154,7 +154,7 @@ def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC

       - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
       - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
-      """,
+      ''',
     ),
     quantize_option(factory=cog.optgroup),
     serialisation_option(factory=cog.optgroup),
@@ -196,7 +196,7 @@ _IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}


 def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
-  """Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`."""
+  '''Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`.'''
   from bentoml_cli.cli import cli

   command = 'serve' if not serve_grpc else 'serve-grpc'
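For context on `parse_serve_args`: it lifts the click parameters declared on `bentoml serve`/`serve-grpc` so `openllm start` can accept them, minus the `_IGNORED_OPTIONS` set from the hunk header. A hedged sketch of that idea; `serve_params` is a hypothetical helper, while the import, command names, and ignored set are taken from the diff:

```python
# Sketch: reuse click parameters from the bentoml CLI (helper name is hypothetical).
import typing as t
import click
from bentoml_cli.cli import cli  # the `bentoml` click group, as imported in the diff

_IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}

def serve_params(serve_grpc: bool = False) -> t.List[click.Parameter]:
  command = cli.commands['serve' if not serve_grpc else 'serve-grpc']
  # Drop options that make no sense when proxied through `openllm start`.
  return [p for p in command.params if p.name not in _IGNORED_OPTIONS]
```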
@@ -233,11 +233,11 @@ _http_server_args, _grpc_server_args = parse_serve_args(False), parse_serve_args


 def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]:
-  """General ``@click`` decorator with some sauce.
+  '''General ``@click`` decorator with some sauce.

   This decorator extends the default ``@click.option`` plus a factory option and factory attr to
   provide type-safe click.option or click.argument wrapper for all compatible factory.
-  """
+  '''
   factory = attrs.pop('factory', click)
   factory_attr = attrs.pop('attr', 'option')
   if factory_attr != 'argument':
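The two `attrs.pop` lines visible in this hunk describe the whole trick: one wrapper that can emit `click.option`, `click.argument`, or their `cog.optgroup` counterparts depending on which factory it is handed. A self-contained sketch of that shape, not OpenLLM's full implementation:

```python
# Factory-aware option/argument decorator, mirroring the popped attrs above.
from __future__ import annotations
import typing as t
import click

FC = t.TypeVar('FC', bound=t.Union[t.Callable[..., t.Any], click.Command])

def click_factory(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC], FC]:
  factory = attrs.pop('factory', click)       # e.g. click itself or cog.optgroup
  factory_attr = attrs.pop('attr', 'option')  # 'option' or 'argument'
  # Dispatch to click.option / click.argument / optgroup.option, etc.
  return getattr(factory, factory_attr)(*param_decls, **attrs)
```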
@@ -346,7 +346,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
     default=None,
     envvar='OPENLLM_QUANTIZE',
     show_envvar=True,
-    help="""Dynamic quantization for running this LLM.
+    help='''Dynamic quantization for running this LLM.

     The following quantization strategies are supported:

@@ -361,15 +361,15 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
     - ``squeezellm``: ``SqueezeLLM`` [SqueezeLLM: Dense-and-Sparse Quantization](https://arxiv.org/abs/2306.07629)

     > [!NOTE] that the model can also be served with quantized weights.
-    """
+    '''
     + (
-      """
-      > [!NOTE] that this will set the mode for serving within deployment."""
+      '''
+      > [!NOTE] that this will set the mode for serving within deployment.'''
       if build
       else ''
     )
-    + """
-    > [!NOTE] that quantization are currently only available in *PyTorch* models.""",
+    + '''
+    > [!NOTE] that quantization are currently only available in *PyTorch* models.''',
     **attrs,
   )(f)

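The concatenation being restyled in this hunk assembles the help text in three parts: a base string, a note appended only when the option is generated for `openllm build`, and a trailing PyTorch note. Read as a standalone sketch (the function name is hypothetical; the note text is quoted from the diff):

```python
# Conditional help-text assembly, as in the quantize_option hunk above.
def quantize_help(build: bool = False) -> str:
  base = "Dynamic quantization for running this LLM.\n\n> [!NOTE] that the model can also be served with quantized weights."
  build_note = "\n> [!NOTE] that this will set the mode for serving within deployment."
  pytorch_note = "\n> [!NOTE] that quantization are currently only available in *PyTorch* models."
  return base + (build_note if build else '') + pytorch_note
```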
@@ -383,7 +383,7 @@ def workers_per_resource_option(
     callback=workers_per_resource_callback,
     type=str,
     required=False,
-    help="""Number of workers per resource assigned.
+    help='''Number of workers per resource assigned.

     See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
     for more information. By default, this is set to 1.
@@ -393,7 +393,7 @@ def workers_per_resource_option(
     - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.

     - ``conserved``: This will determine the number of available GPU resources. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
-    """
+    '''
     + (
       """\n
       > [!NOTE] The workers value passed into 'build' will determine how the LLM can
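The arithmetic behind the two strategy aliases is straightforward: `round_robin` is one worker per resource, while `conserved` spreads a single worker across all visible GPUs. A sketch under the assumption that the real resolution lives in `workers_per_resource_callback` (only its name appears above); `resolve_workers_per_resource` is a hypothetical stand-in:

```python
# Strategy aliases -> numeric workers-per-resource (hypothetical helper).
def resolve_workers_per_resource(value: str, available_gpus: int) -> float:
  if value == 'round_robin':
    return 1.0  # same as --workers-per-resource 1
  if value == 'conserved':
    # e.g. 4 GPUs -> 0.25, matching the help text above
    return 1.0 / available_gpus if available_gpus > 0 else 1.0
  return float(value)

assert resolve_workers_per_resource('conserved', 4) == 0.25
```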
@@ -416,7 +416,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
     show_default=True,
     show_envvar=True,
     envvar='OPENLLM_SERIALIZATION',
-    help="""Serialisation format for save/load LLM.
+    help='''Serialisation format for save/load LLM.

     Currently the following strategies are supported:

@@ -425,7 +425,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
     > [!NOTE] Safetensors might not work for every cases, and you can always fallback to ``legacy`` if needed.

     - ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files. This should be used if the model doesn't yet support safetensors.
-    """,
+    ''',
     **attrs,
   )(f)
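As a rough illustration of what the two serialisation formats in this help text correspond to on disk; `save_weights` is a hypothetical helper, and only the format-to-file-type mapping is taken from the diff:

```python
# Hypothetical mapping from serialisation format to save call.
import torch
from safetensors.torch import save_file

def save_weights(state_dict: dict, path: str, serialisation: str = 'safetensors') -> None:
  if serialisation == 'safetensors':
    save_file(state_dict, f'{path}/model.safetensors')
  else:  # 'legacy': PyTorch pickle-based format
    torch.save(state_dict, f'{path}/pytorch_model.bin')
```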