mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-04-22 16:07:24 -04:00
feat(strategy): spawn one runner instance (#189)
This commit is contained in:
3
changelog.d/189.change.md
Normal file
3
changelog.d/189.change.md
Normal file
@@ -0,0 +1,3 @@
|
||||
The runner server will now always spawn exactly one instance, regardless of the configured `workers-per-resource` value.
|
||||
|
||||
For example, if `CUDA_VISIBLE_DEVICES=0,1,2` and `--workers-per-resource=0.5`, the runner will only use the devices at indices `0,1`.
|
||||
@@ -266,11 +266,11 @@ _CPU_RESOURCE: t.Literal["cpu"] = "cpu"
|
||||
NvidiaGpuResource = _make_resource_class("NvidiaGpuResource", _NVIDIA_GPU_RESOURCE, """NVIDIA GPU resource.
|
||||
|
||||
This is a modified version of internal's BentoML's NvidiaGpuResource
|
||||
where it respects and parse CUDA_VISIBLE_DEVICES correctly.""",)
|
||||
where it respects and parse CUDA_VISIBLE_DEVICES correctly.""")
|
||||
AmdGpuResource = _make_resource_class("AmdGpuResource", _AMD_GPU_RESOURCE, """AMD GPU resource.
|
||||
|
||||
Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
|
||||
``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""",)
|
||||
``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""")
|
||||
|
||||
LiteralResourceSpec = t.Literal["cloud-tpus.google.com/v2", "amd.com/gpu", "nvidia.com/gpu", "cpu"]
|
||||
|
||||
@@ -308,19 +308,14 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
|
||||
@classmethod
|
||||
def get_worker_count(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: int | float) -> int:
|
||||
if resource_request is None: resource_request = system_resources()
|
||||
|
||||
def _get_gpu_count(typ: list[str] | None, kind: str) -> int | None:
|
||||
if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES: return math.ceil(len(typ) * workers_per_resource)
|
||||
return None
|
||||
|
||||
# use NVIDIA
|
||||
kind = "nvidia.com/gpu"
|
||||
count = _get_gpu_count(get_resource(resource_request, kind), kind)
|
||||
if count: return count
|
||||
nvidia_req = get_resource(resource_request, kind)
|
||||
if nvidia_req is not None: return 1
|
||||
# use AMD
|
||||
kind = "amd.com/gpu"
|
||||
count = _get_gpu_count(get_resource(resource_request, kind, validate=False), kind)
|
||||
if count: return count
|
||||
amd_req = get_resource(resource_request, kind, validate=False)
|
||||
if amd_req is not None: return 1
|
||||
# use CPU
|
||||
cpus = get_resource(resource_request, "cpu")
|
||||
if cpus is not None and cpus > 0:
|
||||
@@ -339,10 +334,10 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
|
||||
"""Get worker env for this given worker_index.
|
||||
|
||||
Args:
|
||||
runnable_class: The runnable class to be run.
|
||||
resource_request: The resource request of the runnable.
|
||||
workers_per_resource: # of workers per resource.
|
||||
worker_index: The index of the worker, start from 0.
|
||||
runnable_class: The runnable class to be run.
|
||||
resource_request: The resource request of the runnable.
|
||||
workers_per_resource: # of workers per resource.
|
||||
worker_index: The index of the worker, start from 0.
|
||||
"""
|
||||
cuda_env = os.environ.get("CUDA_VISIBLE_DEVICES", None)
|
||||
disabled = cuda_env in ("", "-1")
|
||||
|
||||
@@ -110,16 +110,13 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
|
||||
return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"])
|
||||
|
||||
def construct_docker_options(
|
||||
llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float, quantize: t.LiteralString | None, bettertransformer: bool | None, device: tuple[str, ...] | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"],
|
||||
container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy,
|
||||
llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: int | float, quantize: t.LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry,
|
||||
container_version_strategy: LiteralContainerVersionStrategy,
|
||||
) -> DockerOptions:
|
||||
_bentoml_config_options = os.environ.pop("BENTOML_CONFIG_OPTIONS", "")
|
||||
_bentoml_config_options_opts = [
|
||||
"tracing.sample_rate=1.0", f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'api_server.traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".traffic.timeout={llm.config["timeout"]}', f'runners."llm-{llm.config["start_name"]}-runner".workers_per_resource={workers_per_resource}',
|
||||
]
|
||||
if device:
|
||||
if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{llm.config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
|
||||
else: _bentoml_config_options_opts.append(f'runners."llm-{llm.config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
|
||||
_bentoml_config_options += " " if _bentoml_config_options else "" + " ".join(_bentoml_config_options_opts)
|
||||
env: openllm.utils.EnvVarMixin = llm.config["env"]
|
||||
env_dict = {
|
||||
@@ -163,7 +160,7 @@ def create_bento(
|
||||
llm_spec = ModelSpec.from_item({"tag": str(llm.tag), "alias": llm.tag.name})
|
||||
build_config = BentoBuildConfig(
|
||||
service=f"{llm.config['service_name']}:svc", name=bento_tag.name, labels=labels, description=f"OpenLLM service for {llm.config['start_name']}", include=list(llm_fs.walk.files()), exclude=["/venv", "/.venv", "__pycache__/", "*.py[cod]", "*$py.class"], python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), models=[llm_spec],
|
||||
docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer, device, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy)
|
||||
docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy)
|
||||
)
|
||||
|
||||
bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath("/"))
|
||||
|
||||
@@ -124,16 +124,13 @@ def unvalidated_get_resource(x: dict[str, t.Any], y: str, validate: bool = False
|
||||
@pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"])
|
||||
def test_cascade_strategy_worker_count(monkeypatch: MonkeyPatch, gpu_type: str):
|
||||
monkeypatch.setattr(strategy, "get_resource", unvalidated_get_resource)
|
||||
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 1) == 2
|
||||
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 2) == 4
|
||||
assert pytest.raises(ValueError, CascadingResourceStrategy.get_worker_count, GPURunnable, {gpu_type: 0}, 1,).match("No known supported resource available for *")
|
||||
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 1) == 2
|
||||
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 2) == 4
|
||||
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: 2}, 1) == 1
|
||||
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 1) == 1
|
||||
|
||||
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7]}, 0.5) == 1
|
||||
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 9]}, 0.5) == 2
|
||||
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 8, 9]}, 0.5) == 2
|
||||
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 5, 7, 8, 9]}, 0.4) == 2
|
||||
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 9]}, 0.5) == 1
|
||||
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 8, 9]}, 0.5) == 1
|
||||
assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 5, 7, 8, 9]}, 0.4) == 1
|
||||
|
||||
@pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"])
|
||||
def test_cascade_strategy_worker_env(monkeypatch: MonkeyPatch, gpu_type: str):
|
||||
|
||||
Reference in New Issue
Block a user