Mirror of https://github.com/bentoml/OpenLLM.git, synced 2026-05-02 21:02:43 -04:00
chore(strategy): compact and add stubs (#718)
generate service_vars automatically inline without reading from files

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
@@ -1,11 +1,2 @@
-import os
-
-import orjson
-
-from openllm_core.utils import ENV_VARS_TRUE_VALUES
-
-model_id = os.environ['OPENLLM_MODEL_ID']
-model_tag = None
-adapter_map = orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None)))
-serialization = os.getenv('OPENLLM_SERIALIZATION', default='safetensors')
-trust_remote_code = str(os.getenv('TRUST_REMOTE_CODE', default=str(False))).upper() in ENV_VARS_TRUE_VALUES
+# fmt: off
+import os,orjson,openllm_core.utils as coreutils;model_id,model_tag,adapter_map,serialization,trust_remote_code=os.environ['OPENLLM_MODEL_ID'],None,orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP',orjson.dumps(None))),os.getenv('OPENLLM_SERIALIZATION',default='safetensors'),coreutils.check_bool_env('TRUST_REMOTE_CODE',False)
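The compacted one-liner replaces the manual ENV_VARS_TRUE_VALUES membership test with coreutils.check_bool_env. A minimal sketch of the equivalent behavior, assuming check_bool_env mirrors the removed inline check (the truthy set below is an assumption, not copied from openllm_core):

import os

_TRUE_VALUES = {'1', 'ON', 'YES', 'TRUE'}  # assumed truthy set, standing in for ENV_VARS_TRUE_VALUES

def check_bool_env(name: str, default: bool = False) -> bool:
  # Case-insensitive membership test, like the removed trust_remote_code line above.
  return str(os.getenv(name, str(default))).upper() in _TRUE_VALUES

os.environ['TRUST_REMOTE_CODE'] = 'yes'
assert check_bool_env('TRUST_REMOTE_CODE', False) is True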
@@ -1,7 +0,0 @@
-import orjson
-
-model_id = '{__model_id__}' # openllm: model id
-model_tag = '{__model_tag__}' # openllm: model tag
-adapter_map = orjson.loads("""{__model_adapter_map__}""") # openllm: model adapter map
-serialization = '{__model_serialization__}' # openllm: model serialization
-trust_remote_code = {__model_trust_remote_code__} # openllm: model trust remote code
@@ -1,20 +1,8 @@
 # mypy: disable-error-code="no-redef"
 from __future__ import annotations
-import inspect
-import logging
-import math
-import os
-import sys
-import types
-import typing as t
-import warnings
-
-import psutil
-
-import bentoml
+import inspect, logging, math, os, sys, types, warnings, typing as t
+import psutil, bentoml, openllm_core.utils as coreutils
 from bentoml._internal.resource import get_resource, system_resources
 from bentoml._internal.runner.strategy import THREAD_ENVS
-from openllm_core.utils import DEBUG, ReprMixin

 logger = logging.getLogger(__name__)

@@ -46,8 +34,7 @@ def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
   return rcs


-def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
-  """CUDA_VISIBLE_DEVICES aware with default var for parsing spec."""
+def _parse_cuda_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
   if respect_env:
     spec = os.environ.get('CUDA_VISIBLE_DEVICES', default_var)
     if not spec:
@@ -116,11 +103,11 @@ def _raw_device_uuid_nvml() -> list[str] | None:
 class _ResourceMixin:
   @staticmethod
   def from_system(cls) -> list[str]:
-    visible_devices = _parse_visible_devices()
+    visible_devices = _parse_cuda_visible_devices()
     if visible_devices is None:
       if cls.resource_id == 'amd.com/gpu':
         if not psutil.LINUX:
-          if DEBUG:
+          if coreutils.DEBUG:
             logger.debug('AMD GPUs is currently only supported on Linux.')
           return []
         # ROCm does not currently have the rocm_smi wheel.
@@ -167,7 +154,7 @@ class _ResourceMixin:
         return []
       if spec.isdigit():
         spec = ','.join([str(i) for i in range(_strtoul(spec))])
-      return _parse_visible_devices(spec, respect_env=False)
+      return _parse_cuda_visible_devices(spec, respect_env=False)
     elif isinstance(spec, list):
       return [str(x) for x in spec]
     else:
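The isdigit branch above expands a bare integer spec into an explicit device list before re-parsing it. A standalone sketch, with a local stand-in for _strtoul (the real helper lives elsewhere in this file and may differ):

def _strtoul(s: str) -> int:
  # Stand-in: parse the leading unsigned integer of s, -1 if none.
  digits = ''
  for ch in s.strip():
    if not ch.isdigit():
      break
    digits += ch
  return int(digits) if digits else -1

spec = '3'
if spec.isdigit():
  spec = ','.join([str(i) for i in range(_strtoul(spec))])
print(spec)  # -> '0,1,2': a count of 3 becomes explicit device indices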
@@ -209,7 +196,7 @@ class _ResourceMixin:
 def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[bentoml.Resource[t.List[str]]]:
   return types.new_class(
     name,
-    (bentoml.Resource[t.List[str]], ReprMixin),
+    (bentoml.Resource[t.List[str]], coreutils.ReprMixin),
     {'resource_id': resource_kind},
     lambda ns: ns.update(
       {
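_make_resource_class relies on types.new_class, which builds a class from a name, bases, class keywords, and an exec_body callback that populates the namespace. A self-contained illustration of the same pattern (all names here are invented for the demo):

import types

class Base:
  # Class keywords passed to types.new_class arrive here.
  def __init_subclass__(cls, resource_id: str = '', **kwargs):
    super().__init_subclass__(**kwargs)
    cls.resource_id = resource_id

DemoResource = types.new_class(
  'DemoResource',
  (Base,),
  {'resource_id': 'demo.com/gpu'},  # class keyword, like resource_kind above
  lambda ns: ns.update({'__doc__': 'A demo resource class.'}),  # fill the class namespace
)
print(DemoResource.resource_id)  # -> 'demo.com/gpu'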
@@ -243,24 +230,9 @@ AmdGpuResource = _make_resource_class(
 )


-class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
-  """This is extends the default BentoML strategy where we check for NVIDIA GPU resource -> AMD GPU resource -> CPU resource.
-
-  It also respect CUDA_VISIBLE_DEVICES for both AMD and NVIDIA GPU.
-  See https://rocm.docs.amd.com/en/develop/understand/gpu_isolation.html#cuda-visible-devices
-  for ROCm's support for CUDA_VISIBLE_DEVICES.
-
-  TODO: Support CloudTPUResource
-  """
-
+class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
   @classmethod
-  def get_worker_count(
-    cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: float
-  ) -> int:
-    """Return the number of workers to be used for the given runnable class.
-
-    Note that for all available GPU, the number of workers will always be 1.
-    """
+  def get_worker_count(cls, runnable_class, resource_request, workers_per_resource):
     if resource_request is None:
       resource_request = system_resources()
     # use NVIDIA
@@ -291,21 +263,7 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
     )

   @classmethod
-  def get_worker_env(
-    cls,
-    runnable_class: type[bentoml.Runnable],
-    resource_request: dict[str, t.Any] | None,
-    workers_per_resource: int | float,
-    worker_index: int,
-  ) -> dict[str, t.Any]:
-    """Get worker env for this given worker_index.
-
-    Args:
-      runnable_class: The runnable class to be run.
-      resource_request: The resource request of the runnable.
-      workers_per_resource: # of workers per resource.
-      worker_index: The index of the worker, start from 0.
-    """
+  def get_worker_env(cls, runnable_class, resource_request, workers_per_resource, worker_index):
     cuda_env = os.environ.get('CUDA_VISIBLE_DEVICES', None)
     disabled = cuda_env in ('', '-1')
     environ: dict[str, t.Any] = {}
@@ -350,7 +308,7 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
     return environ

   @staticmethod
-  def transpile_workers_to_cuda_envvar(workers_per_resource: float | int, gpus: list[str], worker_index: int) -> str:
+  def transpile_workers_to_cuda_envvar(workers_per_resource, gpus, worker_index):
     # Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.
     if isinstance(workers_per_resource, float):
       # NOTE: We hit this branch when workers_per_resource is set to
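The NOTE above is cut off by the diff view; it concerns fractional values. As a hedged sketch of the mapping this function performs, assuming the simplest reading of workers_per_resource (the real implementation handles more edge cases):

def transpile_workers_sketch(workers_per_resource, gpus, worker_index):
  # Fractional, e.g. 0.5: each worker spans int(1 / workers_per_resource) GPUs.
  if isinstance(workers_per_resource, float) and workers_per_resource < 1:
    span = int(1 / workers_per_resource)
    return ','.join(gpus[worker_index * span : (worker_index + 1) * span])
  # Integer: workers_per_resource workers share each GPU.
  return gpus[worker_index // int(workers_per_resource)]

print(transpile_workers_sketch(0.5, ['0', '1', '2', '3'], 1))  # -> '2,3'
print(transpile_workers_sketch(2, ['0', '1'], 3))  # -> '1'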
openllm-python/src/openllm/_strategies.pyi (new file, 46 lines)
@@ -0,0 +1,46 @@
+from typing import Any, Type, Dict, Optional, Union, List
+import bentoml
+
+def get_resource(resources: Dict[str, Any], resource_kind: str, validate: bool = ...) -> Any: ...
+
+class CascadingResourceStrategy:
+  """This extends the default BentoML strategy to check for NVIDIA GPU resources -> AMD GPU resources -> CPU resources.
+
+  It also respects CUDA_VISIBLE_DEVICES for both AMD and NVIDIA GPUs.
+  See https://rocm.docs.amd.com/en/develop/understand/gpu_isolation.html#cuda-visible-devices
+  for ROCm's support for CUDA_VISIBLE_DEVICES.
+
+  TODO: Support CloudTPUResource
+  """
+  @classmethod
+  def get_worker_count(
+    cls,
+    runnable_class: Type[bentoml.Runnable],
+    resource_request: Optional[Dict[str, Any]],
+    workers_per_resource: float,
+  ) -> int:
+    """Return the number of workers to be used for the given runnable class.
+
+    Note that for all available GPUs, the number of workers will always be 1.
+    """
+  @classmethod
+  def get_worker_env(
+    cls,
+    runnable_class: Type[bentoml.Runnable],
+    resource_request: Optional[Dict[str, Any]],
+    workers_per_resource: Union[int, float],
+    worker_index: int,
+  ) -> Dict[str, Any]:
+    """Get the worker env for the given worker_index.
+
+    Args:
+      runnable_class: The runnable class to be run.
+      resource_request: The resource request of the runnable.
+      workers_per_resource: Number of workers per resource.
+      worker_index: The index of the worker, starting from 0.
+    """
+  @staticmethod
+  def transpile_workers_to_cuda_envvar(
+    workers_per_resource: Union[float, int], gpus: List[str], worker_index: int
+  ) -> str:
+    """Convert the given workers_per_resource to the correct CUDA_VISIBLE_DEVICES string."""
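For context, a sketch of how such a strategy is typically attached to a runner. The scheduling_strategy keyword is an assumption based on BentoML 1.x runners; this wiring is not part of the commit:

import bentoml

from openllm._strategies import CascadingResourceStrategy

class EchoRunnable(bentoml.Runnable):
  SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
  SUPPORTS_CPU_MULTI_THREADING = True

  @bentoml.Runnable.method(batchable=False)
  def echo(self, s: str) -> str:
    return s

# The strategy decides the worker count and each worker's CUDA_VISIBLE_DEVICES.
runner = bentoml.Runner(EchoRunnable, scheduling_strategy=CascadingResourceStrategy)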
@@ -3,8 +3,6 @@ from __future__ import annotations
 import importlib.metadata
 import logging
 import os
-import string
-import typing as t
 from pathlib import Path

 import orjson
@@ -18,9 +16,6 @@ from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg

 from . import oci

-if t.TYPE_CHECKING:
-  from openllm_core._typing_compat import LiteralString
-
 logger = logging.getLogger(__name__)

 OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
@@ -96,92 +91,25 @@ def construct_docker_options(
 )


-OPENLLM_MODEL_ID = '# openllm: model id'
-OPENLLM_MODEL_TAG = '# openllm: model tag'
-OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map'
-OPENLLM_MODEL_SERIALIZATION = '# openllm: model serialization'
-OPENLLM_MODEL_TRUST_REMOTE_CODE = '# openllm: model trust remote code'
-
-
-class _ServiceVarsFormatter(string.Formatter):
-  keyword: LiteralString = '__model_name__'
-  identifier: LiteralString = '# openllm: model name'
-
-  def __init__(self, target):
-    super().__init__()
-    self.target = target
-
-  def vformat(self, format_string, *args, **attrs) -> str:
-    return super().vformat(format_string, (), {self.keyword: self.target})
-
-  def parse_line(self, line: str, nl: bool = True) -> str:
-    if self.identifier not in line:
-      return line
-    gen = self.vformat(line)[: -(len(self.identifier) + 3)] + ('\n' if nl else '')
-    return gen
-
-
-class ModelIdFormatter(_ServiceVarsFormatter):
-  keyword = '__model_id__'
-  identifier = OPENLLM_MODEL_ID
-
-
-class ModelTagFormatter(_ServiceVarsFormatter):
-  keyword = '__model_tag__'
-  identifier = OPENLLM_MODEL_TAG
-
-
-class ModelAdapterMapFormatter(_ServiceVarsFormatter):
-  keyword = '__model_adapter_map__'
-  identifier = OPENLLM_MODEL_ADAPTER_MAP
-
-
-class ModelSerializationFormatter(_ServiceVarsFormatter):
-  keyword = '__model_serialization__'
-  identifier = OPENLLM_MODEL_SERIALIZATION
-
-
-class ModelTrustRemoteCodeFormatter(_ServiceVarsFormatter):
-  keyword = '__model_trust_remote_code__'
-  identifier = OPENLLM_MODEL_TRUST_REMOTE_CODE
-
-
 _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
-_service_vars_file = Path(os.path.abspath(__file__)).parent.parent / '_service_vars_pkg.py'

+_SERVICE_VARS = '''\
+import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}
+'''
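Rendering the template with str.format yields the one-line module that replaces the old file-based codegen. A hypothetical invocation (all model values below are made up):

print(_SERVICE_VARS.format(
  __model_id__='facebook/opt-125m',
  __model_tag__='pt-facebook-opt-125m:latest',
  __model_adapter_map__='null',
  __model_serialization__='safetensors',
  __model_trust_remote_code__='False',
))
# -> import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='facebook/opt-125m','pt-facebook-opt-125m:latest',orjson.loads("""null"""),'safetensors',False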


 def write_service(llm, llm_fs, adapter_map):
-  model_id_formatter = ModelIdFormatter(llm.model_id)
-  model_tag_formatter = ModelTagFormatter(str(llm.tag))
-  adapter_map_formatter = ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode())
-  serialization_formatter = ModelSerializationFormatter(llm.config['serialisation'])
-  trust_remote_code_formatter = ModelTrustRemoteCodeFormatter(str(llm.trust_remote_code))
-
-  logger.debug(
-    'Generating service vars file for %s at %s (dir=%s)', llm.model_id, '_service_vars.py', llm_fs.getsyspath('/')
-  )
-  with open(_service_vars_file.__fspath__(), 'r') as f:
-    src_contents = f.readlines()
-  for i, it in enumerate(src_contents):
-    if model_id_formatter.identifier in it:
-      src_contents[i] = model_id_formatter.parse_line(it)
-    elif model_tag_formatter.identifier in it:
-      src_contents[i] = model_tag_formatter.parse_line(it)
-    elif adapter_map_formatter.identifier in it:
-      src_contents[i] = adapter_map_formatter.parse_line(it)
-    elif serialization_formatter.identifier in it:
-      src_contents[i] = serialization_formatter.parse_line(it)
-    elif trust_remote_code_formatter.identifier in it:
-      src_contents[i] = trust_remote_code_formatter.parse_line(it)
-
-  script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n" + ''.join(src_contents)
+  logger.debug('Generating service vars %s (dir=%s)', llm.model_id, llm_fs.getsyspath('/'))
+  script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n# fmt: off\n" + _SERVICE_VARS.format(
+    __model_id__=llm.model_id,
+    __model_tag__=str(llm.tag),
+    __model_adapter_map__=orjson.dumps(adapter_map).decode(),
+    __model_serialization__=llm.config['serialisation'],
+    __model_trust_remote_code__=str(llm.trust_remote_code),
+  )
   if SHOW_CODEGEN:
     logger.info('Generated _service_vars.py:\n%s', script)
   llm_fs.writetext('_service_vars.py', script)

   logger.debug(
     'Generating service file for %s at %s (dir=%s)', llm.model_id, llm.config['service_name'], llm_fs.getsyspath('/')
   )
   with open(_service_file.__fspath__(), 'r') as f:
     service_src = f.read()
   llm_fs.writetext(llm.config['service_name'], service_src)