feat(strategy): spawn one runner instance (#189)

2026-05-19 05:57:39 -04:00 · 2023-08-08 05:47:11 -04:00
parent 9c3019d236
commit 2d47a54efd
4 changed files with 21 additions and 29 deletions
--- a/changelog.d/189.change.md
+++ b/changelog.d/189.change.md
@@ -0,0 +1,3 @@
+The runner server now always spawns a single instance, regardless of the `--workers-per-resource` setting.
+
+For example, if `CUDA_VISIBLE_DEVICES=0,1,2` and `--workers-per-resource=0.5`, the runner will only use the devices at indices `0,1`.
--- a/src/openllm/_strategies.py
+++ b/src/openllm/_strategies.py
@@ -266,11 +266,11 @@ _CPU_RESOURCE: t.Literal["cpu"] = "cpu"
 NvidiaGpuResource = _make_resource_class("NvidiaGpuResource", _NVIDIA_GPU_RESOURCE, """NVIDIA GPU resource.

    This is a modified version of internal's BentoML's NvidiaGpuResource
-    where it respects and parse CUDA_VISIBLE_DEVICES correctly.""",)
+    where it respects and parse CUDA_VISIBLE_DEVICES correctly.""")
 AmdGpuResource = _make_resource_class("AmdGpuResource", _AMD_GPU_RESOURCE, """AMD GPU resource.

    Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
-    ``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""",)
+    ``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""")

 LiteralResourceSpec = t.Literal["cloud-tpus.google.com/v2", "amd.com/gpu", "nvidia.com/gpu", "cpu"]

@@ -308,19 +308,14 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
  @classmethod
  def get_worker_count(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: int | float) -> int:
    if resource_request is None: resource_request = system_resources()
-
-    def _get_gpu_count(typ: list[str] | None, kind: str) -> int | None:
-      if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES: return math.ceil(len(typ) * workers_per_resource)
-      return None
-
    # use NVIDIA
    kind = "nvidia.com/gpu"
-    count = _get_gpu_count(get_resource(resource_request, kind), kind)
-    if count: return count
+    nvidia_req = get_resource(resource_request, kind)
+    if nvidia_req is not None: return 1
    # use AMD
    kind = "amd.com/gpu"
-    count = _get_gpu_count(get_resource(resource_request, kind, validate=False), kind)
-    if count: return count
+    amd_req = get_resource(resource_request, kind, validate=False)
+    if amd_req is not None: return 1
    # use CPU
    cpus = get_resource(resource_request, "cpu")
    if cpus is not None and cpus > 0:
@@ -339,10 +334,10 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
    """Get worker env for this given worker_index.

    Args:
-    runnable_class: The runnable class to be run.
-    resource_request: The resource request of the runnable.
-    workers_per_resource: # of workers per resource.
-    worker_index: The index of the worker, start from 0.
+      runnable_class: The runnable class to be run.
+      resource_request: The resource request of the runnable.
+      workers_per_resource: # of workers per resource.
+      worker_index: The index of the worker, start from 0.
    """
    cuda_env = os.environ.get("CUDA_VISIBLE_DEVICES", None)
    disabled = cuda_env in ("", "-1")
--- a/src/openllm/bundle/_package.py
+++ b/src/openllm/bundle/_package.py
@@ -110,16 +110,13 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
  return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"])

 def construct_docker_options(
-    llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float, quantize: t.LiteralString | None, bettertransformer: bool | None, device: tuple[str, ...] | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"],
-    container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy,
+    llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float, quantize: t.LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry,
+    container_version_strategy: LiteralContainerVersionStrategy,
 ) -> DockerOptions:
  _bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
  _bentoml_config_options_opts = [
      "tracing.sample_rate=1.0", f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'api_server.traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
  ]
-  if device:
-    if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{llm.config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
-    else: _bentoml_config_options_opts.append(f'runners."llm-{llm.config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
  _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
  env: openllm.utils.EnvVarMixin = llm.config["env"]
  env_dict = {
@@ -163,7 +160,7 @@ def create_bento(
  llm_spec = ModelSpec.from_item({"tag": str(llm.tag), "alias": llm.tag.name})
  build_config = BentoBuildConfig(
      service=f"{llm.config['service_name']}:svc", name=bento_tag.name, labels=labels, description=f"OpenLLM service for {llm.config['start_name']}", include=list(llm_fs.walk.files()), exclude=["/venv", "/.venv", "__pycache__/", "*.py[cod]", "*$py.class"], python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), models=[llm_spec],
-      docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer, device, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy)
+      docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy)
  )

  bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath("/"))
--- a/tests/strategies_test.py
+++ b/tests/strategies_test.py
@@ -124,16 +124,13 @@ def unvalidated_get_resource(x: dict[str, t.Any], y: str, validate: bool = False
@pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"])
 def test_cascade_strategy_worker_count(monkeypatch: MonkeyPatch, gpu_type: str):
  monkeypatch.setattr(strategy, "get_resource", unvalidated_get_resource)
-  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 1) == 2
-  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 2) == 4
-  assert pytest.raises(ValueError, CascadingResourceStrategy.get_worker_count, GPURunnable, {gpu_type: 0}, 1,).match("No known supported resource available for *")
-  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 1) == 2
-  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 2) == 4
+  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 1) == 1
+  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 1) == 1

  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 0.5) == 1
-  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 9]}, 0.5) == 2
-  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 8, 9]}, 0.5) == 2
-  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 5, 7, 8, 9]}, 0.4) == 2
+  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 9]}, 0.5) == 1
+  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 8, 9]}, 0.5) == 1
+  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 5, 7, 8, 9]}, 0.4) == 1

@pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"])
 def test_cascade_strategy_worker_env(monkeypatch: MonkeyPatch, gpu_type: str):