OpenLLM/openllm-python/src/openllm/cli/_sdk.py

from __future__ import annotations
import itertools, logging, os, re, subprocess, sys, typing as t, bentoml, openllm, openllm_core
from simple_di import Provide, inject
from bentoml._internal.configuration.containers import BentoMLContainer
from openllm.exceptions import OpenLLMException
from . import termui
from ._factory import start_command_factory

if t.TYPE_CHECKING:
  from openllm_core._configuration import LLMConfig
  from openllm_core._typing_compat import LiteralString, LiteralRuntime, LiteralContainerRegistry, LiteralContainerVersionStrategy
  from bentoml._internal.bento import BentoStore
# Module-level logger; used below to report failures of the ``openllm build`` subprocess.
logger = logging.getLogger(__name__)
def _start(
    model_name: str,
    /,
    *,
    model_id: str | None = None,
    timeout: int = 30,
    workers_per_resource: t.Literal["conserved", "round_robin"] | float | None = None,
    device: tuple[str, ...] | t.Literal["all"] | None = None,
    quantize: t.Literal["int8", "int4", "gptq"] | None = None,
    bettertransformer: bool | None = None,
    runtime: t.Literal["ggml", "transformers"] = "transformers",
    adapter_map: dict[LiteralString, str | None] | None = None,
    framework: LiteralRuntime | None = None,
    additional_args: list[str] | None = None,
    cors: bool = False,
    _serve_grpc: bool = False,
    __test__: bool = False,
    **_: t.Any
) -> LLMConfig | subprocess.Popen[bytes]:
  """Python API to start a LLM server. This provides a one-to-one mapping to the CLI arguments.

  For all additional arguments, pass them as strings to ``additional_args``. For example, if you want to
  pass ``--port 5001``, you can pass ``additional_args=["--port", "5001"]``

  > [!NOTE] This will create a blocking process, so if you use this API, you can create a running sub thread
  > to start the server instead of blocking the main thread.

  ``openllm.start`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as the CLI interaction.

  > [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive.

  Args:
      model_name: The model name to start this LLM
      model_id: Optional model id for this given LLM
      timeout: The server timeout, forwarded as ``--server-timeout``. A falsy value (``0``) omits the flag.
      workers_per_resource: Number of workers per resource assigned.
                            See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
                            for more information. By default, this is set to 1.

                            > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
                            > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
                            > - ``conserved``: This will determine the number of available GPU resources, and only assign
                            >                  one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
                            >                  equivalent to ``--workers-per-resource 0.25``.
      device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. A tuple of device ids
              is joined with commas; the literal ``'all'`` is forwarded verbatim to assign all available GPUs.
              The option is skipped entirely when ``CUDA_VISIBLE_DEVICES`` is already set in the environment.
      quantize: Quantize the model weights. This is only applicable for PyTorch models.
                Possible quantisation strategies:
                - int8: Quantize the model with 8bit (bitsandbytes required)
                - int4: Quantize the model with 4bit (bitsandbytes required)
                - gptq: Quantize the model with GPTQ (auto-gptq required)
      bettertransformer: Convert given model to FastTransformer with PyTorch.
      runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
      cors: Whether to enable CORS for this LLM. By default, this is set to ``False``.
      adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
                   Each entry is forwarded as ``--adapter-id <adapter_id>[:<adapter_name>]``.
      framework: The framework to use for this LLM. When ``None``, the value of ``llm_config.default_implementation()``
                 is used. The resolved value is exported through the model's framework environment variable.
      additional_args: Additional arguments to pass to ``openllm start``.
      _serve_grpc: Internal switch selecting the gRPC start command instead of the HTTP one.
      __test__: Internal switch that appends ``--return-process`` to the CLI arguments.
      **_: Any other keyword arguments are accepted and ignored.

  Returns:
      Whatever the underlying click command returns when invoked with ``standalone_mode=False``
      (annotated as ``LLMConfig | subprocess.Popen[bytes]``).

  Raises:
      OpenLLMException: If both ``quantize`` and ``bettertransformer`` are set.
  """
  from .entrypoint import start_command, start_grpc_command
  llm_config = openllm.AutoConfig.for_model(model_name)
  _ModelEnv = openllm_core.utils.EnvVarMixin(model_name, openllm_core.utils.first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
  # Validate before mutating os.environ so that an invalid call leaves no side effect behind.
  if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
  os.environ[_ModelEnv.framework] = _ModelEnv["framework_value"]

  args: list[str] = ["--runtime", runtime]
  if model_id: args.extend(["--model-id", model_id])
  if timeout: args.extend(["--server-timeout", str(timeout)])
  if workers_per_resource: args.extend(["--workers-per-resource", str(workers_per_resource)])
  if device and not os.environ.get("CUDA_VISIBLE_DEVICES"):
    # ``device`` may be the literal "all"; joining a plain str would split it into characters ("a,l,l"),
    # so strings are forwarded as-is and only real sequences of ids are comma-joined.
    args.extend(["--device", device if isinstance(device, str) else ",".join(device)])
  if quantize: args.extend(["--quantize", str(quantize)])
  elif bettertransformer: args.append("--bettertransformer")
  if cors: args.append("--cors")
  if adapter_map:
    for adapter_id, adapter_name in adapter_map.items():
      # A falsy adapter name means "no alias": only the adapter id is passed.
      args.extend(["--adapter-id", f"{adapter_id}:{adapter_name}" if adapter_name else adapter_id])
  if additional_args: args.extend(additional_args)
  if __test__: args.append("--return-process")

  command = start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc)
  # ``args`` always contains at least ``--runtime``, so it is passed through unconditionally.
  return command.main(args=args, standalone_mode=False)
@inject
def _build(
    model_name: str,
    /,
    *,
    model_id: str | None = None,
    model_version: str | None = None,
    bento_version: str | None = None,
    quantize: t.Literal["int8", "int4", "gptq"] | None = None,
    bettertransformer: bool | None = None,
    adapter_map: dict[str, str | None] | None = None,
    build_ctx: str | None = None,
    enable_features: tuple[str, ...] | None = None,
    workers_per_resource: float | None = None,
    runtime: t.Literal["ggml", "transformers"] = "transformers",
    dockerfile_template: str | None = None,
    overwrite: bool = False,
    container_registry: LiteralContainerRegistry | None = None,
    container_version_strategy: LiteralContainerVersionStrategy | None = None,
    push: bool = False,
    containerize: bool = False,
    serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors",
    additional_args: list[str] | None = None,
    bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
) -> bentoml.Bento:
  """Package a LLM into a Bento.

  The LLM will be built into a BentoService with the following behaviour:
  if ``quantize`` is passed, it will instruct the model to be quantized dynamically during serving time.
  if ``bettertransformer`` is passed, it will instruct the model to apply FasterTransformer during serving time.

  ``openllm.build`` runs ``python -m openllm build ... --machine`` in a subprocess, so it behaves exactly
  the same as the ``openllm build`` CLI. The resulting Bento tag is parsed from the subprocess' stdout.

  > [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive.

  Args:
      model_name: The model name to start this LLM
      model_id: Optional model id for this given LLM
      model_version: Optional model version for this given LLM
      bento_version: Optional bento version for this given BentoLLM
      quantize: Quantize the model weights. This is only applicable for PyTorch models.
                Possible quantisation strategies:
                - int8: Quantize the model with 8bit (bitsandbytes required)
                - int4: Quantize the model with 4bit (bitsandbytes required)
                - gptq: Quantize the model with GPTQ (auto-gptq required)
      bettertransformer: Convert given model to FastTransformer with PyTorch.
      adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
      build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory.
                 It is also used as the working directory of the build subprocess.
      enable_features: Additional OpenLLM features to be included with this BentoLLM.
      workers_per_resource: Number of workers per resource assigned.
                            See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
                            for more information. By default, this is set to 1.

                            > [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
                            > - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
                            > - ``conserved``: This will determine the number of available GPU resources, and only assign
                            >                  one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
                            >                  equivalent to ``--workers-per-resource 0.25``.
      runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
      dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template.
      overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``.
      push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.
      containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
                    Note that 'containerize' and 'push' are mutually exclusive
      container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
      container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
      serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
      additional_args: Additional arguments to pass to ``openllm build``.
      bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.

  Returns:
      ``bentoml.Bento``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.

  Raises:
      OpenLLMException: If mutually exclusive options are combined, or if the build subprocess exits with a non-zero status.
      ValueError: If no ``__tag__:<name>:<version>`` line can be found in the subprocess output.
  """
  args: list[str] = [sys.executable, "-m", "openllm", "build", model_name, "--machine", "--runtime", runtime, "--serialisation", serialisation_format]
  if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
  if quantize: args.extend(["--quantize", quantize])
  if bettertransformer: args.append("--bettertransformer")
  if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
  if push: args.extend(["--push"])
  if containerize: args.extend(["--containerize"])
  if model_id: args.extend(["--model-id", model_id])
  if build_ctx: args.extend(["--build-ctx", build_ctx])
  if enable_features: args.extend([f"--enable-features={f}" for f in enable_features])
  if workers_per_resource: args.extend(["--workers-per-resource", str(workers_per_resource)])
  if overwrite: args.append("--overwrite")
  if adapter_map: args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()])
  if model_version: args.extend(["--model-version", model_version])
  if bento_version: args.extend(["--bento-version", bento_version])
  if dockerfile_template: args.extend(["--dockerfile-template", dockerfile_template])
  # The container options are always forwarded; fall back to the documented defaults when unset.
  if container_registry is None: container_registry = "ecr"
  if container_version_strategy is None: container_version_strategy = "release"
  args.extend(["--container-registry", container_registry, "--container-version-strategy", container_version_strategy])
  if additional_args: args.extend(additional_args)

  try:
    output = subprocess.check_output(args, env=os.environ.copy(), cwd=build_ctx or os.getcwd())
  except subprocess.CalledProcessError as e:
    logger.error("Exception caught while building %s", model_name, exc_info=e)
    # NOTE(review): stderr is not captured by the ``check_output`` call above, so ``e.stderr`` is expected
    # to be ``None`` and this branch is likely unreachable. Capturing stderr would hide the subprocess'
    # live output from the caller, so it is intentionally left unchanged here -- confirm the desired behaviour.
    if e.stderr: raise OpenLLMException(e.stderr.decode("utf-8")) from None
    raise OpenLLMException(str(e)) from None

  stdout = output.decode("utf-8").strip()
  # Look for the tag line by line (last line first) instead of requiring the tag to be the *entire*
  # output: any extra line written to stdout by the subprocess would otherwise make a successful
  # build fail here. For single-line output this is identical to matching the whole string.
  matched = None
  for line in reversed(stdout.split("\n")):
    matched = re.match(r"__tag__:([^:\n]+:[^:\n]+)$", line.strip())
    if matched is not None: break
  if matched is None: raise ValueError(f"Failed to find tag from output: {stdout}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.")
  return bentoml.get(matched.group(1), _bento_store=bento_store)
def _import_model(
    model_name: str,
    /,
    *,
    model_id: str | None = None,
    model_version: str | None = None,
    runtime: t.Literal["ggml", "transformers"] = "transformers",
    implementation: LiteralRuntime = "pt",
    quantize: t.Literal["int8", "int4", "gptq"] | None = None,
    serialisation_format: t.Literal["legacy", "safetensors"] = "safetensors",
    additional_args: t.Sequence[str] | None = None
) -> bentoml.Model:
  """Import a LLM into the local store.

  ``openllm.download`` invokes the ``openllm import`` ``click.Command`` under the hood, so it behaves exactly
  the same as the CLI.

  > [!NOTE]
  > If ``quantize`` is passed, the model weights will be saved as quantized weights. Only use this option
  > if you want the weights to be quantized by default. Note that OpenLLM also supports on-demand
  > quantisation during initial startup.

  > [!NOTE]
  > ``openllm.start`` will automatically invoke ``openllm.download`` under the hood.

  Args:
      model_name: The model name to import.
      model_id: Optional model id for this given LLM. Passed as a positional argument right after the fixed options.
      model_version: Optional model version for this given LLM.
      runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
      implementation: The implementation to use for this LLM. By default, this is set to ``pt``.
      quantize: Quantize the model weights. This is only applicable for PyTorch models.
                Possible quantisation strategies:
                - int8: Quantize the model with 8bit (bitsandbytes required)
                - int4: Quantize the model with 4bit (bitsandbytes required)
                - gptq: Quantize the model with GPTQ (auto-gptq required)
      serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will
                            save the model using safetensors. Defaults to 'safetensors'.
      additional_args: Additional arguments to pass to ``openllm import``. They are placed before ``--quantize``.

  Returns:
      ``bentoml.Model``: BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
  """
  from .entrypoint import import_command

  # Fixed options first; optional pieces are appended afterwards in a stable order.
  cli_args = [model_name, "--runtime", runtime, "--implementation", implementation, "--machine", "--serialisation", serialisation_format]
  if model_id is not None:
    cli_args.append(model_id)
  if model_version is not None:
    cli_args += ["--model-version", str(model_version)]
  if additional_args is not None:
    cli_args.extend(additional_args)
  if quantize is not None:
    cli_args += ["--quantize", quantize]
  return import_command.main(args=cli_args, standalone_mode=False)
def _list_models() -> dict[str, t.Any]:
  """Return the model listing produced by the ``models`` CLI command.

  The command is invoked with ``-o json --show-available --machine`` in non-standalone mode,
  and its return value is handed back unchanged.
  """
  from .entrypoint import models_command

  cli_args = ["-o", "json", "--show-available", "--machine"]
  return models_command.main(args=cli_args, standalone_mode=False)
# Public SDK entry points: each private implementation above is wrapped by ``gen_sdk``.
# ``start`` and ``start_grpc`` share ``_start`` and differ only in the ``_serve_grpc`` flag.
start = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False)
start_grpc = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=True)
build = openllm_core.utils.codegen.gen_sdk(_build)
import_model = openllm_core.utils.codegen.gen_sdk(_import_model)
list_models = openllm_core.utils.codegen.gen_sdk(_list_models)
__all__ = ["start", "start_grpc", "build", "import_model", "list_models"]