Mirror of https://github.com/bentoml/OpenLLM.git, synced 2026-05-02 21:02:43 -04:00
chore(strategy): compact and add stubs (#718)
generate service_vars automatically inline without reading from files

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
@@ -1,11 +1,2 @@
-import os
-
-import orjson
-
-from openllm_core.utils import ENV_VARS_TRUE_VALUES
-
-model_id = os.environ['OPENLLM_MODEL_ID']
-model_tag = None
-adapter_map = orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None)))
-serialization = os.getenv('OPENLLM_SERIALIZATION', default='safetensors')
-trust_remote_code = str(os.getenv('TRUST_REMOTE_CODE', default=str(False))).upper() in ENV_VARS_TRUE_VALUES
+# fmt: off
+import os,orjson,openllm_core.utils as coreutils;model_id,model_tag,adapter_map,serialization,trust_remote_code=os.environ['OPENLLM_MODEL_ID'],None,orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP',orjson.dumps(None))),os.getenv('OPENLLM_SERIALIZATION',default='safetensors'),coreutils.check_bool_env('TRUST_REMOTE_CODE',False)
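The compacted one-liner replaces the manual ENV_VARS_TRUE_VALUES membership test with coreutils.check_bool_env. A minimal sketch of the equivalent behavior, assuming check_bool_env mirrors the removed inline check (the truthy set below is an assumption, not copied from openllm_core):

import os

_TRUE_VALUES = {'1', 'ON', 'YES', 'TRUE'}  # assumed truthy set, standing in for ENV_VARS_TRUE_VALUES

def check_bool_env(name: str, default: bool = False) -> bool:
  # Case-insensitive membership test, like the removed trust_remote_code line above.
  return str(os.getenv(name, str(default))).upper() in _TRUE_VALUES

os.environ['TRUST_REMOTE_CODE'] = 'yes'
assert check_bool_env('TRUST_REMOTE_CODE', False) is True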
@@ -1,7 +0,0 @@
-import orjson
-
-model_id = '{__model_id__}' # openllm: model id
-model_tag = '{__model_tag__}' # openllm: model tag
-adapter_map = orjson.loads("""{__model_adapter_map__}""") # openllm: model adapter map
-serialization = '{__model_serialization__}' # openllm: model serialization
-trust_remote_code = {__model_trust_remote_code__} # openllm: model trust remote code
@@ -1,20 +1,8 @@
 # mypy: disable-error-code="no-redef"
 from __future__ import annotations
-import inspect
-import logging
-import math
-import os
-import sys
-import types
-import typing as t
-import warnings
-
-import psutil
-
-import bentoml
+import inspect, logging, math, os, sys, types, warnings, typing as t
+import psutil, bentoml, openllm_core.utils as coreutils
 from bentoml._internal.resource import get_resource, system_resources
 from bentoml._internal.runner.strategy import THREAD_ENVS
-from openllm_core.utils import DEBUG, ReprMixin

 logger = logging.getLogger(__name__)

@@ -46,8 +34,7 @@ def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
   return rcs


-def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
-  """CUDA_VISIBLE_DEVICES aware with default var for parsing spec."""
+def _parse_cuda_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
   if respect_env:
     spec = os.environ.get('CUDA_VISIBLE_DEVICES', default_var)
     if not spec:
@@ -116,11 +103,11 @@ def _raw_device_uuid_nvml() -> list[str] | None:
 class _ResourceMixin:
   @staticmethod
   def from_system(cls) -> list[str]:
-    visible_devices = _parse_visible_devices()
+    visible_devices = _parse_cuda_visible_devices()
     if visible_devices is None:
       if cls.resource_id == 'amd.com/gpu':
         if not psutil.LINUX:
-          if DEBUG:
+          if coreutils.DEBUG:
             logger.debug('AMD GPUs is currently only supported on Linux.')
           return []
         # ROCm does not currently have the rocm_smi wheel.
@@ -167,7 +154,7 @@ class _ResourceMixin:
         return []
       if spec.isdigit():
         spec = ','.join([str(i) for i in range(_strtoul(spec))])
-      return _parse_visible_devices(spec, respect_env=False)
+      return _parse_cuda_visible_devices(spec, respect_env=False)
     elif isinstance(spec, list):
       return [str(x) for x in spec]
     else:
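The isdigit branch above expands a bare integer spec into an explicit device list before re-parsing it. A standalone sketch, with a local stand-in for _strtoul (the real helper lives elsewhere in this file and may differ):

def _strtoul(s: str) -> int:
  # Stand-in: parse the leading unsigned integer of s, -1 if none.
  digits = ''
  for ch in s.strip():
    if not ch.isdigit():
      break
    digits += ch
  return int(digits) if digits else -1

spec = '3'
if spec.isdigit():
  spec = ','.join([str(i) for i in range(_strtoul(spec))])
print(spec)  # -> '0,1,2': a count of 3 becomes explicit device indices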
@@ -209,7 +196,7 @@ class _ResourceMixin:
 def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[bentoml.Resource[t.List[str]]]:
   return types.new_class(
     name,
-    (bentoml.Resource[t.List[str]], ReprMixin),
+    (bentoml.Resource[t.List[str]], coreutils.ReprMixin),
     {'resource_id': resource_kind},
     lambda ns: ns.update(
       {
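_make_resource_class relies on types.new_class, which builds a class from a name, bases, class keywords, and an exec_body callback that populates the namespace. A self-contained illustration of the same pattern (all names here are invented for the demo):

import types

class Base:
  # Class keywords passed to types.new_class arrive here.
  def __init_subclass__(cls, resource_id: str = '', **kwargs):
    super().__init_subclass__(**kwargs)
    cls.resource_id = resource_id

DemoResource = types.new_class(
  'DemoResource',
  (Base,),
  {'resource_id': 'demo.com/gpu'},  # class keyword, like resource_kind above
  lambda ns: ns.update({'__doc__': 'A demo resource class.'}),  # fill the class namespace
)
print(DemoResource.resource_id)  # -> 'demo.com/gpu'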
@@ -243,24 +230,9 @@ AmdGpuResource = _make_resource_class(
 )


-class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
-  """This is extends the default BentoML strategy where we check for NVIDIA GPU resource -> AMD GPU resource -> CPU resource.
-
-  It also respect CUDA_VISIBLE_DEVICES for both AMD and NVIDIA GPU.
-  See https://rocm.docs.amd.com/en/develop/understand/gpu_isolation.html#cuda-visible-devices
-  for ROCm's support for CUDA_VISIBLE_DEVICES.
-
-  TODO: Support CloudTPUResource
-  """
-
+class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
   @classmethod
-  def get_worker_count(
-    cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: float
-  ) -> int:
-    """Return the number of workers to be used for the given runnable class.
-
-    Note that for all available GPU, the number of workers will always be 1.
-    """
+  def get_worker_count(cls, runnable_class, resource_request, workers_per_resource):
     if resource_request is None:
       resource_request = system_resources()
     # use NVIDIA
@@ -291,21 +263,7 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
     )

   @classmethod
-  def get_worker_env(
-    cls,
-    runnable_class: type[bentoml.Runnable],
-    resource_request: dict[str, t.Any] | None,
-    workers_per_resource: int | float,
-    worker_index: int,
-  ) -> dict[str, t.Any]:
-    """Get worker env for this given worker_index.
-
-    Args:
-      runnable_class: The runnable class to be run.
-      resource_request: The resource request of the runnable.
-      workers_per_resource: # of workers per resource.
-      worker_index: The index of the worker, start from 0.
-    """
+  def get_worker_env(cls, runnable_class, resource_request, workers_per_resource, worker_index):
     cuda_env = os.environ.get('CUDA_VISIBLE_DEVICES', None)
     disabled = cuda_env in ('', '-1')
     environ: dict[str, t.Any] = {}
@@ -350,7 +308,7 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
     return environ

   @staticmethod
-  def transpile_workers_to_cuda_envvar(workers_per_resource: float | int, gpus: list[str], worker_index: int) -> str:
+  def transpile_workers_to_cuda_envvar(workers_per_resource, gpus, worker_index):
     # Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.
     if isinstance(workers_per_resource, float):
       # NOTE: We hit this branch when workers_per_resource is set to
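The NOTE above is cut off by the diff view; it concerns fractional values. As a hedged sketch of the mapping this function performs, assuming the simplest reading of workers_per_resource (the real implementation handles more edge cases):

def transpile_workers_sketch(workers_per_resource, gpus, worker_index):
  # Fractional, e.g. 0.5: each worker spans int(1 / workers_per_resource) GPUs.
  if isinstance(workers_per_resource, float) and workers_per_resource < 1:
    span = int(1 / workers_per_resource)
    return ','.join(gpus[worker_index * span : (worker_index + 1) * span])
  # Integer: workers_per_resource workers share each GPU.
  return gpus[worker_index // int(workers_per_resource)]

print(transpile_workers_sketch(0.5, ['0', '1', '2', '3'], 1))  # -> '2,3'
print(transpile_workers_sketch(2, ['0', '1'], 3))  # -> '1'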
openllm-python/src/openllm/_strategies.pyi (new file, 46 lines)
@@ -0,0 +1,46 @@
+from typing import Any, Type, Dict, Optional, Union, List
+import bentoml
+
+def get_resource(resources: Dict[str, Any], resource_kind: str, validate: bool = ...) -> Any: ...
+
+class CascadingResourceStrategy:
+  """This extends the default BentoML strategy to check for NVIDIA GPU resources -> AMD GPU resources -> CPU resources.
+
+  It also respects CUDA_VISIBLE_DEVICES for both AMD and NVIDIA GPUs.
+  See https://rocm.docs.amd.com/en/develop/understand/gpu_isolation.html#cuda-visible-devices
+  for ROCm's support for CUDA_VISIBLE_DEVICES.
+
+  TODO: Support CloudTPUResource
+  """
+  @classmethod
+  def get_worker_count(
+    cls,
+    runnable_class: Type[bentoml.Runnable],
+    resource_request: Optional[Dict[str, Any]],
+    workers_per_resource: float,
+  ) -> int:
+    """Return the number of workers to be used for the given runnable class.
+
+    Note that for all available GPUs, the number of workers will always be 1.
+    """
+  @classmethod
+  def get_worker_env(
+    cls,
+    runnable_class: Type[bentoml.Runnable],
+    resource_request: Optional[Dict[str, Any]],
+    workers_per_resource: Union[int, float],
+    worker_index: int,
+  ) -> Dict[str, Any]:
+    """Get the worker env for the given worker_index.
+
+    Args:
+      runnable_class: The runnable class to be run.
+      resource_request: The resource request of the runnable.
+      workers_per_resource: Number of workers per resource.
+      worker_index: The index of the worker, starting from 0.
+    """
+  @staticmethod
+  def transpile_workers_to_cuda_envvar(
+    workers_per_resource: Union[float, int], gpus: List[str], worker_index: int
+  ) -> str:
+    """Convert the given workers_per_resource to the correct CUDA_VISIBLE_DEVICES string."""
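For context, a sketch of how such a strategy is typically attached to a runner. The scheduling_strategy keyword is an assumption based on BentoML 1.x runners; this wiring is not part of the commit:

import bentoml

from openllm._strategies import CascadingResourceStrategy

class EchoRunnable(bentoml.Runnable):
  SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
  SUPPORTS_CPU_MULTI_THREADING = True

  @bentoml.Runnable.method(batchable=False)
  def echo(self, s: str) -> str:
    return s

# The strategy decides the worker count and each worker's CUDA_VISIBLE_DEVICES.
runner = bentoml.Runner(EchoRunnable, scheduling_strategy=CascadingResourceStrategy)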
@@ -3,8 +3,6 @@ from __future__ import annotations
 import importlib.metadata
 import logging
 import os
-import string
-import typing as t
 from pathlib import Path

 import orjson
@@ -18,9 +16,6 @@ from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg

 from . import oci

-if t.TYPE_CHECKING:
-  from openllm_core._typing_compat import LiteralString
-
 logger = logging.getLogger(__name__)

 OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
@@ -96,92 +91,25 @@ def construct_docker_options(
 )


-OPENLLM_MODEL_ID = '# openllm: model id'
-OPENLLM_MODEL_TAG = '# openllm: model tag'
-OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map'
-OPENLLM_MODEL_SERIALIZATION = '# openllm: model serialization'
-OPENLLM_MODEL_TRUST_REMOTE_CODE = '# openllm: model trust remote code'
-
-
-class _ServiceVarsFormatter(string.Formatter):
-  keyword: LiteralString = '__model_name__'
-  identifier: LiteralString = '# openllm: model name'
-
-  def __init__(self, target):
-    super().__init__()
-    self.target = target
-
-  def vformat(self, format_string, *args, **attrs) -> str:
-    return super().vformat(format_string, (), {self.keyword: self.target})
-
-  def parse_line(self, line: str, nl: bool = True) -> str:
-    if self.identifier not in line:
-      return line
-    gen = self.vformat(line)[: -(len(self.identifier) + 3)] + ('\n' if nl else '')
-    return gen
-
-
-class ModelIdFormatter(_ServiceVarsFormatter):
-  keyword = '__model_id__'
-  identifier = OPENLLM_MODEL_ID
-
-
-class ModelTagFormatter(_ServiceVarsFormatter):
-  keyword = '__model_tag__'
-  identifier = OPENLLM_MODEL_TAG
-
-
-class ModelAdapterMapFormatter(_ServiceVarsFormatter):
-  keyword = '__model_adapter_map__'
-  identifier = OPENLLM_MODEL_ADAPTER_MAP
-
-
-class ModelSerializationFormatter(_ServiceVarsFormatter):
-  keyword = '__model_serialization__'
-  identifier = OPENLLM_MODEL_SERIALIZATION
-
-
-class ModelTrustRemoteCodeFormatter(_ServiceVarsFormatter):
-  keyword = '__model_trust_remote_code__'
-  identifier = OPENLLM_MODEL_TRUST_REMOTE_CODE
-
-
 _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
-_service_vars_file = Path(os.path.abspath(__file__)).parent.parent / '_service_vars_pkg.py'

+_SERVICE_VARS = '''\
+import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}
+'''
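Rendering the template with str.format yields the one-line module that replaces the old file-based codegen. A hypothetical invocation (all model values below are made up):

print(_SERVICE_VARS.format(
  __model_id__='facebook/opt-125m',
  __model_tag__='pt-facebook-opt-125m:latest',
  __model_adapter_map__='null',
  __model_serialization__='safetensors',
  __model_trust_remote_code__='False',
))
# -> import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='facebook/opt-125m','pt-facebook-opt-125m:latest',orjson.loads("""null"""),'safetensors',False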


 def write_service(llm, llm_fs, adapter_map):
-  model_id_formatter = ModelIdFormatter(llm.model_id)
-  model_tag_formatter = ModelTagFormatter(str(llm.tag))
-  adapter_map_formatter = ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode())
-  serialization_formatter = ModelSerializationFormatter(llm.config['serialisation'])
-  trust_remote_code_formatter = ModelTrustRemoteCodeFormatter(str(llm.trust_remote_code))
-
-  logger.debug(
-    'Generating service vars file for %s at %s (dir=%s)', llm.model_id, '_service_vars.py', llm_fs.getsyspath('/')
-  )
-  with open(_service_vars_file.__fspath__(), 'r') as f:
-    src_contents = f.readlines()
-  for i, it in enumerate(src_contents):
-    if model_id_formatter.identifier in it:
-      src_contents[i] = model_id_formatter.parse_line(it)
-    elif model_tag_formatter.identifier in it:
-      src_contents[i] = model_tag_formatter.parse_line(it)
-    elif adapter_map_formatter.identifier in it:
-      src_contents[i] = adapter_map_formatter.parse_line(it)
-    elif serialization_formatter.identifier in it:
-      src_contents[i] = serialization_formatter.parse_line(it)
-    elif trust_remote_code_formatter.identifier in it:
-      src_contents[i] = trust_remote_code_formatter.parse_line(it)
-
-  script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n" + ''.join(src_contents)
+  logger.debug('Generating service vars %s (dir=%s)', llm.model_id, llm_fs.getsyspath('/'))
+  script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n# fmt: off\n" + _SERVICE_VARS.format(
+    __model_id__=llm.model_id,
+    __model_tag__=str(llm.tag),
+    __model_adapter_map__=orjson.dumps(adapter_map).decode(),
+    __model_serialization__=llm.config['serialisation'],
+    __model_trust_remote_code__=str(llm.trust_remote_code),
+  )
   if SHOW_CODEGEN:
     logger.info('Generated _service_vars.py:\n%s', script)
   llm_fs.writetext('_service_vars.py', script)

   logger.debug(
     'Generating service file for %s at %s (dir=%s)', llm.model_id, llm.config['service_name'], llm_fs.getsyspath('/')
   )
   with open(_service_file.__fspath__(), 'r') as f:
     service_src = f.read()
   llm_fs.writetext(llm.config['service_name'], service_src)