From 2d47a54efd78da2b88565fd4c3d69e44feaf2dc1 Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Tue, 8 Aug 2023 05:47:11 -0400 Subject: [PATCH] feat(strategy): spawn one runner instance (#189) --- changelog.d/189.change.md | 3 +++ src/openllm/_strategies.py | 25 ++++++++++--------------- src/openllm/bundle/_package.py | 9 +++------ tests/strategies_test.py | 13 +++++-------- 4 files changed, 21 insertions(+), 29 deletions(-) create mode 100644 changelog.d/189.change.md diff --git a/changelog.d/189.change.md b/changelog.d/189.change.md new file mode 100644 index 00000000..441a0819 --- /dev/null +++ b/changelog.d/189.change.md @@ -0,0 +1,3 @@ +Runners server now will always spawn one instance regardless of the configuration of workers-per-resource + +i.e: If CUDA_VISIBLE_DEVICES=0,1,2 and `--workers-per-resource=0.5`, then runners will only use `0,1` index diff --git a/src/openllm/_strategies.py b/src/openllm/_strategies.py index 85460559..6b11c5c3 100644 --- a/src/openllm/_strategies.py +++ b/src/openllm/_strategies.py @@ -266,11 +266,11 @@ _CPU_RESOURCE: t.Literal["cpu"] = "cpu" NvidiaGpuResource = _make_resource_class("NvidiaGpuResource", _NVIDIA_GPU_RESOURCE, """NVIDIA GPU resource. This is a modified version of internal's BentoML's NvidiaGpuResource - where it respects and parse CUDA_VISIBLE_DEVICES correctly.""",) + where it respects and parse CUDA_VISIBLE_DEVICES correctly.""") AmdGpuResource = _make_resource_class("AmdGpuResource", _AMD_GPU_RESOURCE, """AMD GPU resource. Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to - ``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""",) + ``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""") LiteralResourceSpec = t.Literal["cloud-tpus.google.com/v2", "amd.com/gpu", "nvidia.com/gpu", "cpu"] @@ -308,19 +308,14 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin): @classmethod def get_worker_count(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: int | float) -> int: if resource_request is None: resource_request = system_resources() - - def _get_gpu_count(typ: list[str] | None, kind: str) -> int | None: - if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES: return math.ceil(len(typ) * workers_per_resource) - return None - # use NVIDIA kind = "nvidia.com/gpu" - count = _get_gpu_count(get_resource(resource_request, kind), kind) - if count: return count + nvidia_req = get_resource(resource_request, kind) + if nvidia_req is not None: return 1 # use AMD kind = "amd.com/gpu" - count = _get_gpu_count(get_resource(resource_request, kind, validate=False), kind) - if count: return count + amd_req = get_resource(resource_request, kind, validate=False) + if amd_req is not None: return 1 # use CPU cpus = get_resource(resource_request, "cpu") if cpus is not None and cpus > 0: @@ -339,10 +334,10 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin): """Get worker env for this given worker_index. Args: - runnable_class: The runnable class to be run. - resource_request: The resource request of the runnable. - workers_per_resource: # of workers per resource. - worker_index: The index of the worker, start from 0. + runnable_class: The runnable class to be run. + resource_request: The resource request of the runnable. + workers_per_resource: # of workers per resource. + worker_index: The index of the worker, start from 0. """ cuda_env = os.environ.get("CUDA_VISIBLE_DEVICES", None) disabled = cuda_env in ("", "-1") diff --git a/src/openllm/bundle/_package.py b/src/openllm/bundle/_package.py index acbd1abc..f618bf48 100644 --- a/src/openllm/bundle/_package.py +++ b/src/openllm/bundle/_package.py @@ -110,16 +110,13 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"]) def construct_docker_options( - llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float, quantize: t.LiteralString | None, bettertransformer: bool | None, device: tuple[str, ...] | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], - container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy, + llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float, quantize: t.LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, + container_version_strategy: LiteralContainerVersionStrategy, ) -> DockerOptions: _bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "") _bentoml_config_options_opts = [ "tracing.sample_rate=1.0", f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'api_server.traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}', ] - if device: - if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{llm.config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)]) - else: _bentoml_config_options_opts.append(f'runners."llm-{llm.config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]') _bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts) env: openllm.utils.EnvVarMixin = llm.config["env"] env_dict = { @@ -163,7 +160,7 @@ def create_bento( llm_spec = ModelSpec.from_item({"tag": str(llm.tag), "alias": llm.tag.name}) build_config = BentoBuildConfig( service=f"{llm.config['service_name']}:svc", name=bento_tag.name, labels=labels, description=f"OpenLLM service for {llm.config['start_name']}", include=list(llm_fs.walk.files()), exclude=["/venv", "/.venv", "__pycache__/", "*.py[cod]", "*$py.class"], python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), models=[llm_spec], - docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer, device, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy) + docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy) ) bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath("/")) diff --git a/tests/strategies_test.py b/tests/strategies_test.py index d1429e57..88864d3f 100644 --- a/tests/strategies_test.py +++ b/tests/strategies_test.py @@ -124,16 +124,13 @@ def unvalidated_get_resource(x: dict[str, t.Any], y: str, validate: bool = False @pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"]) def test_cascade_strategy_worker_count(monkeypatch: MonkeyPatch, gpu_type: str): monkeypatch.setattr(strategy, "get_resource", unvalidated_get_resource) - assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 1) == 2 - assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 2) == 4 - assert pytest.raises(ValueError, CascadingResourceStrategy.get_worker_count, GPURunnable, {gpu_type: 0}, 1,).match("No known supported resource available for *") - assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 1) == 2 - assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 2) == 4 + assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 1) == 1 + assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 1) == 1 assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 0.5) == 1 - assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 9]}, 0.5) == 2 - assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 8, 9]}, 0.5) == 2 - assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 5, 7, 8, 9]}, 0.4) == 2 + assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 9]}, 0.5) == 1 + assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 8, 9]}, 0.5) == 1 + assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 5, 7, 8, 9]}, 0.4) == 1 @pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"]) def test_cascade_strategy_worker_env(monkeypatch: MonkeyPatch, gpu_type: str):