feat(strategy): spawn one runner instance (#189)

This commit is contained in:
Aaron Pham
2023-08-08 05:47:11 -04:00
committed by GitHub
parent 9c3019d236
commit 2d47a54efd
4 changed files with 21 additions and 29 deletions

View File

@@ -0,0 +1,3 @@
The runner server will now always spawn exactly one instance, regardless of the `workers-per-resource` configuration.
i.e., if `CUDA_VISIBLE_DEVICES=0,1,2` and `--workers-per-resource=0.5`, the runner will only use device indices `0,1`.

View File

@@ -266,11 +266,11 @@ _CPU_RESOURCE: t.Literal["cpu"] = "cpu"
NvidiaGpuResource = _make_resource_class("NvidiaGpuResource", _NVIDIA_GPU_RESOURCE, """NVIDIA GPU resource.
This is a modified version of internal's BentoML's NvidiaGpuResource
where it respects and parse CUDA_VISIBLE_DEVICES correctly.""",)
where it respects and parse CUDA_VISIBLE_DEVICES correctly.""")
AmdGpuResource = _make_resource_class("AmdGpuResource", _AMD_GPU_RESOURCE, """AMD GPU resource.
Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""",)
``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""")
LiteralResourceSpec = t.Literal["cloud-tpus.google.com/v2", "amd.com/gpu", "nvidia.com/gpu", "cpu"]
@@ -308,19 +308,14 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
@classmethod
def get_worker_count(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: int | float) -> int:
if resource_request is None: resource_request = system_resources()
def _get_gpu_count(typ: list[str] | None, kind: str) -> int | None:
if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES: return math.ceil(len(typ) * workers_per_resource)
return None
# use NVIDIA
kind = "nvidia.com/gpu"
count = _get_gpu_count(get_resource(resource_request, kind), kind)
if count: return count
nvidia_req = get_resource(resource_request, kind)
if nvidia_req is not None: return 1
# use AMD
kind = "amd.com/gpu"
count = _get_gpu_count(get_resource(resource_request, kind, validate=False), kind)
if count: return count
amd_req = get_resource(resource_request, kind, validate=False)
if amd_req is not None: return 1
# use CPU
cpus = get_resource(resource_request, "cpu")
if cpus is not None and cpus > 0:
@@ -339,10 +334,10 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
"""Get worker env for this given worker_index.
Args:
runnable_class: The runnable class to be run.
resource_request: The resource request of the runnable.
workers_per_resource: # of workers per resource.
worker_index: The index of the worker, start from 0.
runnable_class: The runnable class to be run.
resource_request: The resource request of the runnable.
workers_per_resource: # of workers per resource.
worker_index: The index of the worker, start from 0.
"""
cuda_env = os.environ.get("CUDA_VISIBLE_DEVICES", None)
disabled = cuda_env in ("", "-1")

View File

@@ -110,16 +110,13 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"])
def construct_docker_options(
llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float, quantize: t.LiteralString | None, bettertransformer: bool | None, device: tuple[str, ...] | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"],
container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy,
llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float, quantize: t.LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy,
) -> DockerOptions:
_bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
_bentoml_config_options_opts = [
"tracing.sample_rate=1.0", f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'api_server.traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
]
if device:
if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{llm.config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
else: _bentoml_config_options_opts.append(f'runners."llm-{llm.config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
_bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
env: openllm.utils.EnvVarMixin = llm.config["env"]
env_dict = {
@@ -163,7 +160,7 @@ def create_bento(
llm_spec = ModelSpec.from_item({"tag": str(llm.tag), "alias": llm.tag.name})
build_config = BentoBuildConfig(
service=f"{llm.config['service_name']}:svc", name=bento_tag.name, labels=labels, description=f"OpenLLM service for {llm.config['start_name']}", include=list(llm_fs.walk.files()), exclude=["/venv", "/.venv", "__pycache__/", "*.py[cod]", "*$py.class"], python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), models=[llm_spec],
docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer, device, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy)
docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy)
)
bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath("/"))

View File

@@ -124,16 +124,13 @@ def unvalidated_get_resource(x: dict[str, t.Any], y: str, validate: bool = False
@pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"])
def test_cascade_strategy_worker_count(monkeypatch: MonkeyPatch, gpu_type: str):
monkeypatch.setattr(strategy, "get_resource", unvalidated_get_resource)
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 1) == 2
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 2) == 4
assert pytest.raises(ValueError, CascadingResourceStrategy.get_worker_count, GPURunnable, {gpu_type: 0}, 1,).match("No known supported resource available for *")
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 1) == 2
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 2) == 4
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 1) == 1
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 1) == 1
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 0.5) == 1
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 9]}, 0.5) == 2
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 8, 9]}, 0.5) == 2
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 5, 7, 8, 9]}, 0.4) == 2
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 9]}, 0.5) == 1
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 8, 9]}, 0.5) == 1
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 5, 7, 8, 9]}, 0.4) == 1
@pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"])
def test_cascade_strategy_worker_env(monkeypatch: MonkeyPatch, gpu_type: str):