From 2d47a54efd78da2b88565fd4c3d69e44feaf2dc1 Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Tue, 8 Aug 2023 05:47:11 -0400
Subject: [PATCH] feat(strategy): spawn one runner instance (#189)

---
 changelog.d/189.change.md      |  3 +++
 src/openllm/_strategies.py     | 25 ++++++++++---------------
 src/openllm/bundle/_package.py |  9 +++------
 tests/strategies_test.py       | 13 +++++--------
 4 files changed, 21 insertions(+), 29 deletions(-)
 create mode 100644 changelog.d/189.change.md

diff --git a/changelog.d/189.change.md b/changelog.d/189.change.md
new file mode 100644
index 00000000..441a0819
--- /dev/null
+++ b/changelog.d/189.change.md
@@ -0,0 +1,3 @@
+The runner server now always spawns a single instance, regardless of the `workers-per-resource` configuration.
+
+For example, if `CUDA_VISIBLE_DEVICES=0,1,2` and `--workers-per-resource=0.5`, the runner will only use the devices at indices `0,1`.
diff --git a/src/openllm/_strategies.py b/src/openllm/_strategies.py
index 85460559..6b11c5c3 100644
--- a/src/openllm/_strategies.py
+++ b/src/openllm/_strategies.py
@@ -266,11 +266,11 @@ _CPU_RESOURCE: t.Literal["cpu"] = "cpu"
 NvidiaGpuResource = _make_resource_class("NvidiaGpuResource", _NVIDIA_GPU_RESOURCE, """NVIDIA GPU resource.
 
     This is a modified version of internal's BentoML's NvidiaGpuResource
-    where it respects and parse CUDA_VISIBLE_DEVICES correctly.""",)
+    where it respects and parse CUDA_VISIBLE_DEVICES correctly.""")
 AmdGpuResource = _make_resource_class("AmdGpuResource", _AMD_GPU_RESOURCE, """AMD GPU resource.
 
     Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
-    ``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""",)
+    ``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""")
 
 LiteralResourceSpec = t.Literal["cloud-tpus.google.com/v2", "amd.com/gpu", "nvidia.com/gpu", "cpu"]
 
@@ -308,19 +308,14 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
   @classmethod
   def get_worker_count(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: int | float) -> int:
     if resource_request is None: resource_request = system_resources()
-
-    def _get_gpu_count(typ: list[str] | None, kind: str) -> int | None:
-      if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES: return math.ceil(len(typ) * workers_per_resource)
-      return None
-
     # use NVIDIA
     kind = "nvidia.com/gpu"
-    count = _get_gpu_count(get_resource(resource_request, kind), kind)
-    if count: return count
+    nvidia_req = get_resource(resource_request, kind)
+    if nvidia_req is not None: return 1
     # use AMD
     kind = "amd.com/gpu"
-    count = _get_gpu_count(get_resource(resource_request, kind, validate=False), kind)
-    if count: return count
+    amd_req = get_resource(resource_request, kind, validate=False)
+    if amd_req is not None: return 1
     # use CPU
     cpus = get_resource(resource_request, "cpu")
     if cpus is not None and cpus > 0:
@@ -339,10 +334,10 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
     """Get worker env for this given worker_index.
 
     Args:
-    runnable_class: The runnable class to be run.
-    resource_request: The resource request of the runnable.
-    workers_per_resource: # of workers per resource.
-    worker_index: The index of the worker, start from 0.
+      runnable_class: The runnable class to be run.
+      resource_request: The resource request of the runnable.
+      workers_per_resource: # of workers per resource.
+      worker_index: The index of the worker, start from 0.
     """
     cuda_env = os.environ.get("CUDA_VISIBLE_DEVICES", None)
     disabled = cuda_env in ("", "-1")
diff --git a/src/openllm/bundle/_package.py b/src/openllm/bundle/_package.py
index acbd1abc..f618bf48 100644
--- a/src/openllm/bundle/_package.py
+++ b/src/openllm/bundle/_package.py
@@ -110,16 +110,13 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
   return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"])
 
 def construct_docker_options(
-    llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float, quantize: t.LiteralString | None, bettertransformer: bool | None, device: tuple[str, ...] | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"],
-    container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy,
+    llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float, quantize: t.LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry,
+    container_version_strategy: LiteralContainerVersionStrategy,
 ) -> DockerOptions:
   _bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
   _bentoml_config_options_opts = [
       "tracing.sample_rate=1.0", f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'api_server.traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
   ]
-  if device:
-    if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{llm.config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
-    else: _bentoml_config_options_opts.append(f'runners."llm-{llm.config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
   _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
   env: openllm.utils.EnvVarMixin = llm.config["env"]
   env_dict = {
@@ -163,7 +160,7 @@ def create_bento(
   llm_spec = ModelSpec.from_item({"tag": str(llm.tag), "alias": llm.tag.name})
   build_config = BentoBuildConfig(
       service=f"{llm.config['service_name']}:svc", name=bento_tag.name, labels=labels, description=f"OpenLLM service for {llm.config['start_name']}", include=list(llm_fs.walk.files()), exclude=["/venv", "/.venv", "__pycache__/", "*.py[cod]", "*$py.class"], python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), models=[llm_spec],
-      docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer, device, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy)
+      docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy)
   )
 
   bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath("/"))
diff --git a/tests/strategies_test.py b/tests/strategies_test.py
index d1429e57..88864d3f 100644
--- a/tests/strategies_test.py
+++ b/tests/strategies_test.py
@@ -124,16 +124,13 @@ def unvalidated_get_resource(x: dict[str, t.Any], y: str, validate: bool = False
 @pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"])
 def test_cascade_strategy_worker_count(monkeypatch: MonkeyPatch, gpu_type: str):
   monkeypatch.setattr(strategy, "get_resource", unvalidated_get_resource)
-  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 1) == 2
-  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 2) == 4
-  assert pytest.raises(ValueError, CascadingResourceStrategy.get_worker_count, GPURunnable, {gpu_type: 0}, 1,).match("No known supported resource available for *")
-  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 1) == 2
-  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 2) == 4
+  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 1) == 1
+  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 1) == 1
 
   assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 0.5) == 1
-  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 9]}, 0.5) == 2
-  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 8, 9]}, 0.5) == 2
-  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 5, 7, 8, 9]}, 0.4) == 2
+  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 9]}, 0.5) == 1
+  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 8, 9]}, 0.5) == 1
+  assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 5, 7, 8, 9]}, 0.4) == 1
 
 @pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"])
 def test_cascade_strategy_worker_env(monkeypatch: MonkeyPatch, gpu_type: str):