chore(strategy): compact and add stubs (#718)

generate service_vars automatically inline without reading from files

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron Pham
2023-11-21 21:49:28 -05:00
committed by GitHub
parent 909db8c3bf
commit 04ef08a7f8
6 changed files with 71 additions and 155 deletions

View File

@@ -1,11 +1,2 @@
# Service-vars bootstrap: resolves the model configuration entirely from
# environment variables at import time (no generated file on disk needed).
import os
import orjson
from openllm_core.utils import ENV_VARS_TRUE_VALUES
model_id = os.environ['OPENLLM_MODEL_ID']  # required — raises KeyError if unset
model_tag = None
# OPENLLM_ADAPTER_MAP carries JSON; the default is the JSON encoding of None,
# so a missing variable round-trips to Python None.
adapter_map = orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None)))
serialization = os.getenv('OPENLLM_SERIALIZATION', default='safetensors')
# Truthy iff the upper-cased env value is one of ENV_VARS_TRUE_VALUES.
trust_remote_code = str(os.getenv('TRUST_REMOTE_CODE', default=str(False))).upper() in ENV_VARS_TRUE_VALUES
# fmt: off
# Compact single-line form of the env-var bootstrap: binds model_id, model_tag,
# adapter_map, serialization and trust_remote_code in one tuple assignment.
# NOTE(review): trust_remote_code now goes through coreutils.check_bool_env —
# presumably equivalent to the ENV_VARS_TRUE_VALUES membership test; confirm.
import os,orjson,openllm_core.utils as coreutils;model_id,model_tag,adapter_map,serialization,trust_remote_code=os.environ['OPENLLM_MODEL_ID'],None,orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP',orjson.dumps(None))),os.getenv('OPENLLM_SERIALIZATION',default='safetensors'),coreutils.check_bool_env('TRUST_REMOTE_CODE',False)

View File

@@ -1,7 +0,0 @@
# Template for the generated service-vars module: each `{__...__}` placeholder
# is substituted at build time by a formatter keyed on the trailing
# `# openllm: ...` marker comment, which is then stripped from the output.
# The exact marker text and spacing are load-bearing — do not edit them.
import orjson
model_id = '{__model_id__}' # openllm: model id
model_tag = '{__model_tag__}' # openllm: model tag
adapter_map = orjson.loads("""{__model_adapter_map__}""") # openllm: model adapter map
serialization = '{__model_serialization__}' # openllm: model serialization
trust_remote_code = {__model_trust_remote_code__} # openllm: model trust remote code

View File

@@ -1,20 +1,8 @@
# mypy: disable-error-code="no-redef"
from __future__ import annotations
import inspect
import logging
import math
import os
import sys
import types
import typing as t
import warnings
import psutil
import bentoml
import inspect, logging, math, os, sys, types, warnings, typing as t
import psutil, bentoml, openllm_core.utils as coreutils
from bentoml._internal.resource import get_resource, system_resources
from bentoml._internal.runner.strategy import THREAD_ENVS
from openllm_core.utils import DEBUG, ReprMixin
logger = logging.getLogger(__name__)
@@ -46,8 +34,7 @@ def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
return rcs
def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
"""CUDA_VISIBLE_DEVICES aware with default var for parsing spec."""
def _parse_cuda_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
if respect_env:
spec = os.environ.get('CUDA_VISIBLE_DEVICES', default_var)
if not spec:
@@ -116,11 +103,11 @@ def _raw_device_uuid_nvml() -> list[str] | None:
class _ResourceMixin:
@staticmethod
def from_system(cls) -> list[str]:
visible_devices = _parse_visible_devices()
visible_devices = _parse_cuda_visible_devices()
if visible_devices is None:
if cls.resource_id == 'amd.com/gpu':
if not psutil.LINUX:
if DEBUG:
if coreutils.DEBUG:
logger.debug('AMD GPUs is currently only supported on Linux.')
return []
# ROCm does not currently have the rocm_smi wheel.
@@ -167,7 +154,7 @@ class _ResourceMixin:
return []
if spec.isdigit():
spec = ','.join([str(i) for i in range(_strtoul(spec))])
return _parse_visible_devices(spec, respect_env=False)
return _parse_cuda_visible_devices(spec, respect_env=False)
elif isinstance(spec, list):
return [str(x) for x in spec]
else:
@@ -209,7 +196,7 @@ class _ResourceMixin:
def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[bentoml.Resource[t.List[str]]]:
return types.new_class(
name,
(bentoml.Resource[t.List[str]], ReprMixin),
(bentoml.Resource[t.List[str]], coreutils.ReprMixin),
{'resource_id': resource_kind},
lambda ns: ns.update(
{
@@ -243,24 +230,9 @@ AmdGpuResource = _make_resource_class(
)
class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
"""This extends the default BentoML strategy where we check for NVIDIA GPU resource -> AMD GPU resource -> CPU resource.
It also respects CUDA_VISIBLE_DEVICES for both AMD and NVIDIA GPU.
See https://rocm.docs.amd.com/en/develop/understand/gpu_isolation.html#cuda-visible-devices
for ROCm's support for CUDA_VISIBLE_DEVICES.
TODO: Support CloudTPUResource
"""
class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
@classmethod
def get_worker_count(
cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: float
) -> int:
"""Return the number of workers to be used for the given runnable class.
Note that for all available GPU, the number of workers will always be 1.
"""
def get_worker_count(cls, runnable_class, resource_request, workers_per_resource):
if resource_request is None:
resource_request = system_resources()
# use NVIDIA
@@ -291,21 +263,7 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
)
@classmethod
def get_worker_env(
cls,
runnable_class: type[bentoml.Runnable],
resource_request: dict[str, t.Any] | None,
workers_per_resource: int | float,
worker_index: int,
) -> dict[str, t.Any]:
"""Get worker env for this given worker_index.
Args:
runnable_class: The runnable class to be run.
resource_request: The resource request of the runnable.
workers_per_resource: # of workers per resource.
worker_index: The index of the worker, start from 0.
"""
def get_worker_env(cls, runnable_class, resource_request, workers_per_resource, worker_index):
cuda_env = os.environ.get('CUDA_VISIBLE_DEVICES', None)
disabled = cuda_env in ('', '-1')
environ: dict[str, t.Any] = {}
@@ -350,7 +308,7 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
return environ
@staticmethod
def transpile_workers_to_cuda_envvar(workers_per_resource: float | int, gpus: list[str], worker_index: int) -> str:
def transpile_workers_to_cuda_envvar(workers_per_resource, gpus, worker_index):
# Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.
if isinstance(workers_per_resource, float):
# NOTE: We hit this branch when workers_per_resource is set to

View File

@@ -0,0 +1,46 @@
from typing import Any, Type, Dict, Optional, Union, List
import bentoml
def get_resource(resources: Dict[str, Any], resource_kind: str, validate: bool = ...) -> Any: ...
class CascadingResourceStrategy:
    """This extends the default BentoML strategy where we check for NVIDIA GPU resource -> AMD GPU resource -> CPU resource.

    It also respects CUDA_VISIBLE_DEVICES for both AMD and NVIDIA GPU.
    See https://rocm.docs.amd.com/en/develop/understand/gpu_isolation.html#cuda-visible-devices
    for ROCm's support for CUDA_VISIBLE_DEVICES.

    TODO: Support CloudTPUResource
    """

    @classmethod
    def get_worker_count(
        cls,
        runnable_class: Type[bentoml.Runnable],
        resource_request: Optional[Dict[str, Any]],
        workers_per_resource: float,
    ) -> int:
        """Return the number of workers to be used for the given runnable class.

        Note that for all available GPU, the number of workers will always be 1.
        """

    @classmethod
    def get_worker_env(
        cls,
        # Unified to ``Type[...]`` — the rest of this stub uses the
        # typing aliases imported at the top, not the builtin generic.
        runnable_class: Type[bentoml.Runnable],
        resource_request: Optional[Dict[str, Any]],
        workers_per_resource: Union[int, float],
        worker_index: int,
    ) -> Dict[str, Any]:
        """Get worker env for this given worker_index.

        Args:
            runnable_class: The runnable class to be run.
            resource_request: The resource request of the runnable.
            workers_per_resource: # of workers per resource.
            worker_index: The index of the worker, start from 0.
        """

    @staticmethod
    def transpile_workers_to_cuda_envvar(
        workers_per_resource: Union[float, int], gpus: List[str], worker_index: int
    ) -> str:
        """Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string."""

View File

@@ -3,8 +3,6 @@ from __future__ import annotations
import importlib.metadata
import logging
import os
import string
import typing as t
from pathlib import Path
import orjson
@@ -18,9 +16,6 @@ from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg
from . import oci
if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralString
logger = logging.getLogger(__name__)
OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
@@ -96,92 +91,25 @@ def construct_docker_options(
)
# Marker comments used to tag lines in the service-vars template; the
# formatter classes below match on these exact strings, so their text
# must stay in sync with the template file.
OPENLLM_MODEL_ID = '# openllm: model id'
OPENLLM_MODEL_TAG = '# openllm: model tag'
OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map'
OPENLLM_MODEL_SERIALIZATION = '# openllm: model serialization'
OPENLLM_MODEL_TRUST_REMOTE_CODE = '# openllm: model trust remote code'
class _ServiceVarsFormatter(string.Formatter):
  """Substitutes a single ``{__keyword__}`` placeholder on template lines
  tagged with a ``# openllm: ...`` marker comment.

  Subclasses override ``keyword`` (the placeholder name) and ``identifier``
  (the marker comment used to locate the line to rewrite).
  """

  keyword: LiteralString = '__model_name__'
  identifier: LiteralString = '# openllm: model name'

  def __init__(self, target):
    # ``target`` is the concrete value substituted wherever ``keyword`` appears.
    super().__init__()
    self.target = target

  def vformat(self, format_string, *args, **attrs) -> str:
    # Ignore caller-supplied args/kwargs and always substitute ``target``
    # for this formatter's single known keyword.
    return super().vformat(format_string, (), {self.keyword: self.target})

  def parse_line(self, line: str, nl: bool = True) -> str:
    """Return ``line`` with the placeholder substituted and the trailing
    identifier comment stripped; lines without the identifier pass through
    unchanged.
    """
    if self.identifier not in line:
      return line
    # The slice drops the identifier plus 3 extra trailing characters —
    # presumably the newline and the two spaces separating code from the
    # marker comment; TODO(review): confirm against the template spacing.
    gen = self.vformat(line)[: -(len(self.identifier) + 3)] + ('\n' if nl else '')
    return gen
# Concrete formatters: one per substitutable field in the service-vars
# template, each pairing its placeholder keyword with its marker comment.
class ModelIdFormatter(_ServiceVarsFormatter):
  keyword = '__model_id__'
  identifier = OPENLLM_MODEL_ID
class ModelTagFormatter(_ServiceVarsFormatter):
  keyword = '__model_tag__'
  identifier = OPENLLM_MODEL_TAG
class ModelAdapterMapFormatter(_ServiceVarsFormatter):
  keyword = '__model_adapter_map__'
  identifier = OPENLLM_MODEL_ADAPTER_MAP
class ModelSerializationFormatter(_ServiceVarsFormatter):
  keyword = '__model_serialization__'
  identifier = OPENLLM_MODEL_SERIALIZATION
class ModelTrustRemoteCodeFormatter(_ServiceVarsFormatter):
  keyword = '__model_trust_remote_code__'
  identifier = OPENLLM_MODEL_TRUST_REMOTE_CODE
# Locations of the in-package service templates, resolved relative to this
# module's parent package.
_service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
_service_vars_file = Path(os.path.abspath(__file__)).parent.parent / '_service_vars_pkg.py'
# Inline template for the generated ``_service_vars.py``; placeholders are
# filled via str.format at build time. The literal must stay byte-exact —
# any whitespace change alters the generated module.
_SERVICE_VARS = '''\
import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}
'''
def write_service(llm, llm_fs, adapter_map):
model_id_formatter = ModelIdFormatter(llm.model_id)
model_tag_formatter = ModelTagFormatter(str(llm.tag))
adapter_map_formatter = ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode())
serialization_formatter = ModelSerializationFormatter(llm.config['serialisation'])
trust_remote_code_formatter = ModelTrustRemoteCodeFormatter(str(llm.trust_remote_code))
logger.debug(
'Generating service vars file for %s at %s (dir=%s)', llm.model_id, '_service_vars.py', llm_fs.getsyspath('/')
logger.debug('Generating service vars %s (dir=%s)', llm.model_id, llm_fs.getsyspath('/'))
script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n# fmt: off\n" + _SERVICE_VARS.format(
__model_id__=llm.model_id,
__model_tag__=str(llm.tag),
__model_adapter_map__=orjson.dumps(adapter_map).decode(),
__model_serialization__=llm.config['serialisation'],
__model_trust_remote_code__=str(llm.trust_remote_code),
)
with open(_service_vars_file.__fspath__(), 'r') as f:
src_contents = f.readlines()
for i, it in enumerate(src_contents):
if model_id_formatter.identifier in it:
src_contents[i] = model_id_formatter.parse_line(it)
elif model_tag_formatter.identifier in it:
src_contents[i] = model_tag_formatter.parse_line(it)
elif adapter_map_formatter.identifier in it:
src_contents[i] = adapter_map_formatter.parse_line(it)
elif serialization_formatter.identifier in it:
src_contents[i] = serialization_formatter.parse_line(it)
elif trust_remote_code_formatter.identifier in it:
src_contents[i] = trust_remote_code_formatter.parse_line(it)
script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n" + ''.join(src_contents)
if SHOW_CODEGEN:
logger.info('Generated _service_vars.py:\n%s', script)
llm_fs.writetext('_service_vars.py', script)
logger.debug(
'Generating service file for %s at %s (dir=%s)', llm.model_id, llm.config['service_name'], llm_fs.getsyspath('/')
)
with open(_service_file.__fspath__(), 'r') as f:
service_src = f.read()
llm_fs.writetext(llm.config['service_name'], service_src)