fix(sdk): remove broken sdk

codebase now around 2.8k lines

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron
2023-11-26 04:53:36 -05:00
parent ed6a82a3f0
commit 96318b65ee
18 changed files with 179 additions and 557 deletions

View File

@@ -1,7 +1,6 @@
import logging as _logging, os as _os, pathlib as _pathlib, warnings as _warnings
from openllm_cli import _sdk
from . import utils as utils
if utils.DEBUG:
utils.set_debug_mode(True); _logging.basicConfig(level=_logging.NOTSET)
else:
@@ -12,11 +11,8 @@ else:
_warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
_warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
_warnings.filterwarnings('ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated')
COMPILED = _pathlib.Path(__file__).suffix in ('.pyd', '.so')
# NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
__lazy = utils.LazyModule(
__lazy = utils.LazyModule( # NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
__name__,
globals()['__file__'],
{
@@ -34,14 +30,8 @@ __lazy = utils.LazyModule(
'_llm': ['LLM'],
},
extra_objects={
'COMPILED': COMPILED,
'start': _sdk.start,
'start_grpc': _sdk.start_grpc,
'build': _sdk.build,
'import_model': _sdk.import_model,
'list_models': _sdk.list_models,
'COMPILED': COMPILED, 'start': _sdk.start, 'build': _sdk.build, #
'import_model': _sdk.import_model, 'list_models': _sdk.list_models, #
},
)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__
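The `__all__, __dir__, __getattr__ = __lazy...` assignment above is the PEP 562 module-level lazy-import pattern. A minimal sketch of how a LazyModule-style helper can serve both `extra_objects` and lazily imported attributes — illustrative only, not openllm_core's actual implementation:

```python
import importlib

class TinyLazyModule:
    # Illustrative stand-in for openllm_core.utils.LazyModule (not the real implementation).
    def __init__(self, name, import_structure, extra_objects=None):
        self._name = name
        self._extra = dict(extra_objects or {})  # eagerly available objects, e.g. COMPILED
        self._attr_to_module = {attr: mod for mod, attrs in import_structure.items() for attr in attrs}
        self.__all__ = sorted([*self._attr_to_module, *self._extra])

    def __getattr__(self, item):
        if item in self._extra:
            return self._extra[item]
        if item in self._attr_to_module:  # import the submodule only on first access
            return getattr(importlib.import_module(f'.{self._attr_to_module[item]}', self._name), item)
        raise AttributeError(f'module {self._name} has no attribute {item}')

    def __dir__(self):
        return self.__all__
```

Binding `__getattr__ = __lazy.__getattr__` at module scope means that `openllm.LLM` only triggers the import of the `_llm` submodule on first access, keeping `import openllm` cheap.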

View File

@@ -1,65 +1,21 @@
from __future__ import annotations
import logging
import os
import typing as t
import warnings
import logging, os, warnings, typing as t
import openllm
from openllm_core._typing_compat import LiteralBackend, ParamSpec
from openllm_core._typing_compat import LiteralBackend
from openllm_core.utils import first_not_none, getenv, is_vllm_available
if t.TYPE_CHECKING:
from ._runners import Runner as _Runner
P = ParamSpec('P')
__all__ = ['Runner']
logger = logging.getLogger(__name__)
def Runner(
model_name: str,
ensure_available: bool = True,
init_local: bool = False,
backend: LiteralBackend | None = None,
llm_config: openllm.LLMConfig | None = None,
**attrs: t.Any,
) -> _Runner[t.Any, t.Any]:
"""Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'.
> [!WARNING]
> This method is now deprecated and in favor of 'openllm.LLM'
```python
runner = openllm.Runner("dolly-v2")
@svc.on_startup
def download():
runner.download_model()
```
If `init_local=True` (for development workflows), it will also enable `ensure_available`.
If `ensure_available` is set explicitly, that value is used; otherwise it falls back to the behaviour described above.
Args:
model_name: Supported model name from 'openllm models'
ensure_available: If True, download the model when it is not already available locally. If False, skip downloading
and make sure the model is available locally. Defaults to True; openllm.LLM will always check whether models
are available locally based on the generated tag.
backend: The backend implementation to use for this Runner. If `OPENLLM_BACKEND` is set, it will be respected.
llm_config: Optional ``openllm.LLMConfig`` to initialise this ``openllm.LLMRunner``.
init_local: If True, it will initialize the model locally. This is useful if you want to run the model locally. (Symmetrical to bentoml.Runner.init_local())
**attrs: The remaining kwargs are passed to the LLM. Refer to the LLM documentation for their behaviour.
"""
from ._llm import LLM
if llm_config is None:
llm_config = openllm.AutoConfig.for_model(model_name)
if not ensure_available:
logger.warning(
"'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation."
)
model_name: str, ensure_available: bool = True, #
init_local: bool = False, backend: LiteralBackend | None = None, #
llm_config: openllm.LLMConfig | None = None, **attrs: t.Any,
):
if llm_config is None: llm_config = openllm.AutoConfig.for_model(model_name)
if not ensure_available: logger.warning("'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation.")
model_id = attrs.get('model_id', os.getenv('OPENLLM_MODEL_ID', llm_config['default_id']))
_RUNNER_MSG = f'''\
warnings.warn(f'''\
Using 'openllm.Runner' is now deprecated. Make sure to switch to the following syntax:
```python
@@ -70,22 +26,11 @@ def Runner(
@svc.api(...)
async def chat(input: str) -> str:
async for it in llm.generate_iterator(input): print(it)
```
'''
warnings.warn(_RUNNER_MSG, DeprecationWarning, stacklevel=2)
```''', DeprecationWarning, stacklevel=2)
attrs.update(
{
'model_id': model_id,
'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)),
'serialisation': getenv(
'serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']
),
'model_id': model_id, 'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)), #
'serialisation': getenv('serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']),
}
)
backend = t.cast(LiteralBackend, first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'))
llm = LLM[t.Any, t.Any](backend=backend, llm_config=llm_config, embedded=init_local, **attrs)
return llm.runner
__all__ = ['Runner']
return openllm.LLM(backend=first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'), llm_config=llm_config, embedded=init_local, **attrs).runner
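For reference, a minimal sketch of the replacement pattern the deprecation message points at, built only from pieces visible in this diff (`openllm.LLM(model_id=...)`, `llm.runner`, `llm.generate_iterator`); the model id and service name are placeholders:

```python
import bentoml, openllm
from bentoml.io import Text

llm = openllm.LLM(model_id='facebook/opt-125m', backend='pt')  # placeholder model id
svc = bentoml.Service('llm-service', runners=[llm.runner])

@svc.api(input=Text(), output=Text())
async def chat(prompt: str) -> str:
    chunks = []
    # generate_iterator streams partial GenerationOutput objects; the accumulation loop in
    # LLM.generate (next file in this diff) suggests each chunk carries a text delta.
    async for it in llm.generate_iterator(prompt):
        chunks.append(it.outputs[0].text)
    return ''.join(chunks)
```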

View File

@@ -47,23 +47,17 @@ ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]
@attr.define(slots=False, repr=False, init=False)
class LLM(t.Generic[M, T]):
async def generate(
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
) -> GenerationOutput:
if adapter_name is not None and self.__llm_backend__ != 'pt':
raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')
async def generate(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs):
if adapter_name is not None and self.__llm_backend__ != 'pt': raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')
config = self.config.model_construct_env(**attrs)
texts, token_ids = [[] for _ in range(config['n'])], [[] for _ in range(config['n'])]  # independent lists per sequence
final_result = None
async for result in self.generate_iterator(
prompt, prompt_token_ids, stop, stop_token_ids, request_id, adapter_name, **config.model_dump(flatten=True)
):
for output in result.outputs:
texts[output.index].append(output.text)
token_ids[output.index].extend(output.token_ids)
final_result = result
if final_result is None:
raise RuntimeError('No result is returned.')
if (final_result := result) is None: raise RuntimeError('No result is returned.')
return final_result.with_options(
prompt=prompt,
outputs=[
@@ -72,13 +66,9 @@ class LLM(t.Generic[M, T]):
],
)
async def generate_iterator(
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
) -> t.AsyncGenerator[GenerationOutput, None]:
async def generate_iterator(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs):
from bentoml._internal.runner.runner_handle import DummyRunnerHandle
if adapter_name is not None and self.__llm_backend__ != 'pt':
raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')
if adapter_name is not None and self.__llm_backend__ != 'pt': raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')
if isinstance(self.runner._runner_handle, DummyRunnerHandle):
if os.getenv('BENTO_PATH') is not None:
@@ -87,14 +77,12 @@ class LLM(t.Generic[M, T]):
self.runner.init_local(quiet=True)
config = self.config.model_construct_env(**attrs)
if stop_token_ids is None: stop_token_ids = []
stop_token_ids = stop_token_ids or []
eos_token_id = attrs.get('eos_token_id', config['eos_token_id'])
if eos_token_id is not None:
if not isinstance(eos_token_id, list): eos_token_id = [eos_token_id]
stop_token_ids.extend(eos_token_id)
if config['eos_token_id'] and config['eos_token_id'] not in stop_token_ids: stop_token_ids.append(config['eos_token_id'])
if self.tokenizer.eos_token_id not in stop_token_ids:
stop_token_ids.append(self.tokenizer.eos_token_id)
if eos_token_id and not isinstance(eos_token_id, list): eos_token_id = [eos_token_id]
stop_token_ids.extend(eos_token_id or [])
if (config_eos := config['eos_token_id']) and config_eos not in stop_token_ids: stop_token_ids.append(config_eos)
if self.tokenizer.eos_token_id not in stop_token_ids: stop_token_ids.append(self.tokenizer.eos_token_id)
if stop is None:
stop = set()
elif isinstance(stop, str):
@@ -102,20 +90,16 @@ class LLM(t.Generic[M, T]):
else:
stop = set(stop)
for tid in stop_token_ids:
if tid:
stop.add(self.tokenizer.decode(tid))
if tid: stop.add(self.tokenizer.decode(tid))
if prompt_token_ids is None:
if prompt is None:
raise ValueError('Either prompt or prompt_token_ids must be specified.')
if prompt is None: raise ValueError('Either prompt or prompt_token_ids must be specified.')
prompt_token_ids = self.tokenizer.encode(prompt)
request_id = gen_random_uuid() if request_id is None else request_id
previous_texts, previous_num_tokens = [''] * config['n'], [0] * config['n']
try:
generator = self.runner.generate_iterator.async_stream(
prompt_token_ids, request_id, stop=list(stop), adapter_name=adapter_name, **config.model_dump(flatten=True)
)
generator = self.runner.generate_iterator.async_stream(prompt_token_ids, request_id, stop=list(stop), adapter_name=adapter_name, **config.model_dump(flatten=True))
except Exception as err:
raise RuntimeError(f'Failed to start generation task: {err}') from err
@@ -134,18 +118,11 @@ class LLM(t.Generic[M, T]):
# NOTE: If you are here to see how generate_iterator and generate work, see above.
# The below is mainly internal implementation detail that you don't have to worry about.
_model_id: str
_revision: t.Optional[str]
_model_id: str; _revision: t.Optional[str] #
_quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]]
_quantise: t.Optional[LiteralQuantise]
_model_decls: t.Tuple[t.Any, ...]
__model_attrs: t.Dict[str, t.Any]
__tokenizer_attrs: t.Dict[str, t.Any]
_tag: bentoml.Tag
_adapter_map: t.Optional[AdapterMap]
_serialisation: LiteralSerialisation
_local: bool
_max_model_len: t.Optional[int]
_quantise: t.Optional[LiteralQuantise]; _model_decls: t.Tuple[t.Any, ...]; __model_attrs: t.Dict[str, t.Any] #
__tokenizer_attrs: t.Dict[str, t.Any]; _tag: bentoml.Tag; _adapter_map: t.Optional[AdapterMap] #
_serialisation: LiteralSerialisation; _local: bool; _max_model_len: t.Optional[int] #
__llm_dtype__: t.Union[LiteralDtype, t.Literal['auto', 'half', 'float']] = 'auto'
__llm_torch_dtype__: 'torch.dtype' = None
@@ -180,12 +157,7 @@ class LLM(t.Generic[M, T]):
):
torch_dtype = attrs.pop('torch_dtype', None) # backward compatible
if torch_dtype is not None:
warnings.warn(
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',
DeprecationWarning,
stacklevel=3,
)
dtype = torch_dtype
warnings.warn('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', DeprecationWarning, stacklevel=3); dtype = torch_dtype
_local = False
if validate_is_path(model_id): model_id, _local = resolve_filepath(model_id), True
backend = getenv('backend', default=backend)
@@ -291,7 +263,7 @@ class LLM(t.Generic[M, T]):
if is_vllm_available():
return 'vllm'
elif is_ctranslate_available():
return 'ctranslate' # XXX: base OpenLLM image should always include vLLM
return 'ctranslate'
elif is_ctranslate_available():
return 'ctranslate'
else:
@@ -449,8 +421,7 @@ def convert_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
config_file = hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)
except Exception as err:
raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err
with open(config_file, 'r') as file:
resolved_config = orjson.loads(file.read())
with open(config_file, 'r') as file: resolved_config = orjson.loads(file.read())
_peft_type = resolved_config['peft_type'].lower()
if _peft_type not in resolved: resolved[_peft_type] = ()
resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
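The condensed eos/stop-token merging in `generate_iterator` above packs several branches onto single lines; here is an equivalent standalone sketch of the same logic for readability (names mirror the diff, `tokenizer` is anything with `eos_token_id` and `decode`; it is illustrative, not the library function):

```python
def merge_stop(stop, stop_token_ids, eos_token_id, config_eos, tokenizer):
    # Mirror of the stop/eos handling in LLM.generate_iterator (sketch only).
    stop_token_ids = stop_token_ids or []
    if eos_token_id and not isinstance(eos_token_id, list):
        eos_token_id = [eos_token_id]
    stop_token_ids.extend(eos_token_id or [])
    if config_eos and config_eos not in stop_token_ids:
        stop_token_ids.append(config_eos)
    if tokenizer.eos_token_id not in stop_token_ids:
        stop_token_ids.append(tokenizer.eos_token_id)
    stop = set() if stop is None else ({stop} if isinstance(stop, str) else set(stop))
    for tid in stop_token_ids:
        if tid:  # skip None/0 placeholders before decoding a token id into a stop string
            stop.add(tokenizer.decode(tid))
    return stop
```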

View File

@@ -1,13 +1,8 @@
from __future__ import annotations
from openllm_core.exceptions import MissingDependencyError
from openllm_core.utils import is_autoawq_available, is_autogptq_available, is_bitsandbytes_available
def infer_quantisation_config(llm, quantise, **attrs):
import torch
import transformers
import torch, transformers
# 8 bit configuration
int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
@@ -85,25 +80,19 @@ def infer_quantisation_config(llm, quantise, **attrs):
# NOTE: Quantization setup. 'quantize' is an openllm.LLM feature that lets us quantize the model with bitsandbytes or quantization-aware training.
if not is_bitsandbytes_available():
raise RuntimeError(
'Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\''
)
raise RuntimeError('Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\'')
if quantise == 'int8':
quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == 'int4':
quantisation_config = create_int4_config()
elif quantise == 'gptq':
if not is_autogptq_available():
raise MissingDependencyError(
"GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'"
)
raise MissingDependencyError("GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'")
else:
quantisation_config = create_gptq_config()
elif quantise == 'awq':
if not is_autoawq_available():
raise MissingDependencyError(
"AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'."
)
raise MissingDependencyError("AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'.")
else:
quantisation_config = create_awq_config()
else:
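For context, a sketch of what the `create_int8_config` / `create_int4_config` helpers referenced above typically build with transformers; parameter names follow `transformers.BitsAndBytesConfig`, and the defaults shown are assumptions rather than OpenLLM's exact values:

```python
import torch, transformers

def create_int8_config(int8_skip_modules=None, threshold=6.0, enable_fp32_cpu_offload=False):
    # LLM.int8() quantisation as exposed by bitsandbytes through transformers
    return transformers.BitsAndBytesConfig(
        load_in_8bit=True,
        llm_int8_threshold=threshold,
        llm_int8_skip_modules=int8_skip_modules,
        llm_int8_enable_fp32_cpu_offload=enable_fp32_cpu_offload,
    )

def create_int4_config():
    # 4-bit NF4 quantisation with double quantisation enabled
    return transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
    )
```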

View File

@@ -1,66 +1,43 @@
from __future__ import annotations
import logging
import typing as t
import logging, typing as t
import _service_vars as svars
import bentoml
import openllm
import bentoml, openllm
from openllm_core._schemas import MessageParam
from bentoml.io import JSON, Text
logger = logging.getLogger(__name__)
llm = openllm.LLM[t.Any, t.Any](
model_id=svars.model_id,
model_tag=svars.model_tag,
serialisation=svars.serialization,
adapter_map=svars.adapter_map,
trust_remote_code=svars.trust_remote_code,
model_id=svars.model_id, model_tag=svars.model_tag, adapter_map=svars.adapter_map, #
serialisation=svars.serialization, trust_remote_code=svars.trust_remote_code,
)
svc = bentoml.Service(name=f"llm-{llm.config['start_name']}-service", runners=[llm.runner])
llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)
@svc.api(
route='/v1/generate',
input=JSON.from_sample(llm_model_class.examples()),
output=JSON.from_sample(openllm.GenerationOutput.examples()),
input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()), #
)
async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]:
return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump()
async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]: return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump()
@svc.api(
route='/v1/generate_stream',
input=JSON.from_sample(llm_model_class.examples()),
output=Text(content_type='text/event-stream'),
input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'), #
)
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
yield f'data: {it.model_dump_json()}\n\n'
yield 'data: [DONE]\n\n'
_Metadata = openllm.MetadataOutput(
timeout=llm.config['timeout'],
model_name=llm.config['model_name'],
backend=llm.__llm_backend__,
model_id=llm.model_id,
timeout=llm.config['timeout'], model_name=llm.config['model_name'], #
backend=llm.__llm_backend__, model_id=llm.model_id, #
configuration=llm.config.model_dump_json().decode(),
)
@svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump()))
def metadata_v1(_: str) -> openllm.MetadataOutput:
return _Metadata
class MessagesConverterInput(t.TypedDict):
add_generation_prompt: bool
messages: t.List[t.Dict[str, t.Any]]
def metadata_v1(_: str) -> openllm.MetadataOutput: return _Metadata
class MessagesConverterInput(t.TypedDict): add_generation_prompt: bool; messages: t.List[t.Dict[str, t.Any]]
@svc.api(
route='/v1/helpers/messages',
@@ -69,18 +46,14 @@ class MessagesConverterInput(t.TypedDict):
add_generation_prompt=False,
messages=[
MessageParam(role='system', content='You are acting as Ernest Hemingway.'),
MessageParam(role='user', content='Hi there!'),
MessageParam(role='assistant', content='Yes?'),
MessageParam(role='user', content='Hi there!'), MessageParam(role='assistant', content='Yes?'), #
],
)
),
output=Text(),
)
def helpers_messages_v1(message: MessagesConverterInput) -> str:
add_generation_prompt = message['add_generation_prompt']
messages = message['messages']
add_generation_prompt, messages = message['add_generation_prompt'], message['messages']
return llm.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
# HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.
openllm.mount_entrypoints(svc, llm)
openllm.mount_entrypoints(svc, llm) # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.
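A minimal client sketch against the two endpoints defined above, assuming the service is served locally on BentoML's default port 3000; the payload keys are illustrative — the authoritative shapes are the `JSON.from_sample(...)` examples in the decorators:

```python
import json, requests

# One-shot generation
resp = requests.post('http://localhost:3000/v1/generate', json={'prompt': 'Hello there'})
print(resp.json())

# Streaming generation: the service yields 'data: {json}\n\n' lines and ends with 'data: [DONE]'.
with requests.post('http://localhost:3000/v1/generate_stream', json={'prompt': 'Hello there'}, stream=True) as r:
    for line in r.iter_lines():
        if line and line != b'data: [DONE]':
            print(json.loads(line.removeprefix(b'data: ')))
```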

View File

@@ -1,9 +1,2 @@
import os, orjson, openllm_core.utils as coreutils
model_id, model_tag, adapter_map, serialization, trust_remote_code = (
os.environ['OPENLLM_MODEL_ID'],
None,
orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))),
os.getenv('OPENLLM_SERIALIZATION', default='safetensors'),
coreutils.check_bool_env('TRUST_REMOTE_CODE', False),
)
model_id, model_tag, adapter_map, serialization, trust_remote_code = os.environ['OPENLLM_MODEL_ID'], None, orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))), os.getenv('OPENLLM_SERIALIZATION', default='safetensors'), coreutils.check_bool_env('TRUST_REMOTE_CODE', False)

View File

@@ -4,44 +4,33 @@ import psutil, bentoml, openllm_core.utils as coreutils
from bentoml._internal.resource import get_resource, system_resources
from bentoml._internal.runner.strategy import THREAD_ENVS
__all__ = ['CascadingResourceStrategy', 'get_resource']
logger = logging.getLogger(__name__)
def _strtoul(s: str) -> int:
# Return -1, or the positive integer that the sequence string starts with.
if not s:
return -1
if not s: return -1
idx = 0
for idx, c in enumerate(s):
if not (c.isdigit() or (idx == 0 and c in '+-')):
break
if idx + 1 == len(s):
idx += 1 # noqa: PLW2901
if not (c.isdigit() or (idx == 0 and c in '+-')): break
if idx + 1 == len(s): idx += 1 # noqa: PLW2901
# NOTE: idx will be set via enumerate
return int(s[:idx]) if idx > 0 else -1
def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
rcs: list[str] = []
rcs = []
for elem in lst.split(','):
# Repeated id results in empty set
if elem in rcs:
return []
if elem in rcs: return []
# Anything other than the prefix is ignored
if not elem.startswith(prefix):
break
if not elem.startswith(prefix): break
rcs.append(elem)
return rcs
def _parse_cuda_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
if respect_env:
spec = os.environ.get('CUDA_VISIBLE_DEVICES', default_var)
if not spec:
return None
if not spec: return None
else:
if default_var is None:
raise ValueError('spec is required to be not None when parsing spec.')
if default_var is None: raise ValueError('spec is required to be not None when parsing spec.')
spec = default_var
if spec.startswith('GPU-'):
@@ -55,64 +44,52 @@ def _parse_cuda_visible_devices(default_var: str | None = None, respect_env: boo
for el in spec.split(','):
x = _strtoul(el.strip())
# Repeated ordinal results in empty set
if x in rc:
return []
if x in rc: return []
# Negative value aborts the sequence
if x < 0:
break
if x < 0: break
rc.append(x)
return [str(i) for i in rc]
def _raw_device_uuid_nvml() -> list[str] | None:
from ctypes import CDLL, byref, c_int, c_void_p, create_string_buffer
try:
nvml_h = CDLL('libnvidia-ml.so.1')
except Exception:
warnings.warn('Failed to find nvidia binding', stacklevel=3)
return None
warnings.warn('Failed to find nvidia binding', stacklevel=3); return None
rc = nvml_h.nvmlInit()
if rc != 0:
warnings.warn("Can't initialize NVML", stacklevel=3)
return None
warnings.warn("Can't initialize NVML", stacklevel=3); return None
dev_count = c_int(-1)
rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
if rc != 0:
warnings.warn('Failed to get available device from system.', stacklevel=3)
return None
uuids: list[str] = []
warnings.warn('Failed to get available device from system.', stacklevel=3); return None
uuids = []
for idx in range(dev_count.value):
dev_id = c_void_p()
rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id))
if rc != 0:
warnings.warn(f'Failed to get device handle for {idx}', stacklevel=3)
return None
warnings.warn(f'Failed to get device handle for {idx}', stacklevel=3); return None
buf_len = 96
buf = create_string_buffer(buf_len)
rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len)
if rc != 0:
warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=3)
return None
warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=3); return None
uuids.append(buf.raw.decode('ascii').strip('\0'))
del nvml_h
return uuids
class _ResourceMixin:
@staticmethod
def from_system(cls) -> list[str]:
visible_devices = _parse_cuda_visible_devices()
if visible_devices is None:
if cls.resource_id == 'amd.com/gpu':
if not psutil.LINUX:
if coreutils.DEBUG:
logger.debug('AMD GPUs is currently only supported on Linux.')
return []
if not psutil.LINUX: return []
# ROCm does not currently have the rocm_smi wheel.
# So we need to use the ctypes bindings directly.
# we don't want to use CLI because parsing is a pain.
# TODO: Use tinygrad/gpuctypes
sys.path.append('/opt/rocm/libexec/rocm_smi')
try:
from ctypes import byref, c_uint32
@@ -122,8 +99,7 @@ class _ResourceMixin:
device_count = c_uint32(0)
ret = rocmsmi.rsmi_num_monitor_devices(byref(device_count))
if ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
return [str(i) for i in range(device_count.value)]
if ret == rsmi_status_t.RSMI_STATUS_SUCCESS: return [str(i) for i in range(device_count.value)]
return []
# In this case the binary is not found, so return an empty list
except (ModuleNotFoundError, ImportError):
@@ -140,59 +116,43 @@ class _ResourceMixin:
except (ImportError, RuntimeError, AttributeError):
return []
return visible_devices
@staticmethod
def from_spec(cls, spec) -> list[str]:
if isinstance(spec, int):
if spec in (-1, 0):
return []
if spec < -1:
raise ValueError('Spec cannot be < -1.')
if spec in (-1, 0): return []
if spec < -1: raise ValueError('Spec cannot be < -1.')
return [str(i) for i in range(spec)]
elif isinstance(spec, str):
if not spec:
return []
if spec.isdigit():
spec = ','.join([str(i) for i in range(_strtoul(spec))])
if not spec: return []
if spec.isdigit(): spec = ','.join([str(i) for i in range(_strtoul(spec))])
return _parse_cuda_visible_devices(spec, respect_env=False)
elif isinstance(spec, list):
return [str(x) for x in spec]
else:
raise TypeError(
f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead."
)
raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.")
@staticmethod
def validate(cls, val: list[t.Any]) -> None:
if cls.resource_id == 'amd.com/gpu':
raise RuntimeError(
"AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'"
)
raise RuntimeError("AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'")
if not all(isinstance(i, str) for i in val):
raise ValueError('Input list should be all string type.')
try:
from cuda import cuda
err, *_ = cuda.cuInit(0)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Failed to initialise CUDA runtime binding.')
if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to initialise CUDA runtime binding.')
# correctly parse handle
for el in val:
if el.startswith(('GPU-', 'MIG-')):
uuids = _raw_device_uuid_nvml()
if uuids is None:
raise ValueError('Failed to parse available GPUs UUID')
if el not in uuids:
raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})')
if uuids is None: raise ValueError('Failed to parse available GPUs UUID')
if el not in uuids: raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})')
elif el.isdigit():
err, _ = cuda.cuDeviceGet(int(el))
if err != cuda.CUresult.CUDA_SUCCESS:
raise ValueError(f'Failed to get device {el}')
if err != cuda.CUresult.CUDA_SUCCESS: raise ValueError(f'Failed to get device {el}')
except (ImportError, RuntimeError):
pass
def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[bentoml.Resource[t.List[str]]]:
return types.new_class(
name,
@@ -201,22 +161,16 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[
lambda ns: ns.update(
{
'resource_id': resource_kind,
'from_spec': classmethod(_ResourceMixin.from_spec),
'from_system': classmethod(_ResourceMixin.from_system),
'validate': classmethod(_ResourceMixin.validate),
'__repr_keys__': property(lambda _: {'resource_id'}),
'__doc__': inspect.cleandoc(docstring),
'__module__': 'openllm._strategies',
'from_spec': classmethod(_ResourceMixin.from_spec), 'from_system': classmethod(_ResourceMixin.from_system), #
'validate': classmethod(_ResourceMixin.validate), '__repr_keys__': property(lambda _: {'resource_id'}), #
'__doc__': inspect.cleandoc(docstring), '__module__': 'openllm._strategies', #
}
),
)
NvidiaGpuResource = _make_resource_class(
'NvidiaGpuResource',
'nvidia.com/gpu',
'''NVIDIA GPU resource.
This is a modified version of internal's BentoML's NvidiaGpuResource
where it respects and parse CUDA_VISIBLE_DEVICES correctly.''',
)
@@ -224,73 +178,53 @@ AmdGpuResource = _make_resource_class(
'AmdGpuResource',
'amd.com/gpu',
'''AMD GPU resource.
Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
``NvidiaGpuResource``. Currently ``validate`` is not yet supported.''',
)
class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
@classmethod
def get_worker_count(cls, runnable_class, resource_request, workers_per_resource):
if resource_request is None:
resource_request = system_resources()
if resource_request is None: resource_request = system_resources()
# use NVIDIA
kind = 'nvidia.com/gpu'
nvidia_req = get_resource(resource_request, kind)
if nvidia_req is not None:
return 1
if nvidia_req is not None: return 1
# use AMD
kind = 'amd.com/gpu'
amd_req = get_resource(resource_request, kind, validate=False)
if amd_req is not None:
return 1
if amd_req is not None: return 1
# use CPU
cpus = get_resource(resource_request, 'cpu')
if cpus is not None and cpus > 0:
if 'cpu' not in runnable_class.SUPPORTED_RESOURCES:
logger.warning('No known supported resource available for %s, falling back to using CPU.', runnable_class)
if runnable_class.SUPPORTS_CPU_MULTI_THREADING:
if isinstance(workers_per_resource, float) and workers_per_resource < 1.0:
raise ValueError('Fractional CPU multi threading support is not yet supported.')
if isinstance(workers_per_resource, float) and workers_per_resource < 1.0: raise ValueError('Fractional CPU multi threading support is not yet supported.')
return int(workers_per_resource)
return math.ceil(cpus) * workers_per_resource
# this should not be reached by the user since we always read system resources by default
raise ValueError(
f'No known supported resource available for {runnable_class}. Please check your resource request. Leaving it blank will allow BentoML to use system resources.'
)
raise ValueError(f'No known supported resource available for {runnable_class}. Please check your resource request. Leaving it blank will allow BentoML to use system resources.')
@classmethod
def get_worker_env(cls, runnable_class, resource_request, workers_per_resource, worker_index):
cuda_env = os.environ.get('CUDA_VISIBLE_DEVICES', None)
disabled = cuda_env in ('', '-1')
environ: dict[str, t.Any] = {}
environ = {}
if resource_request is None:
resource_request = system_resources()
if resource_request is None: resource_request = system_resources()
# use NVIDIA
kind = 'nvidia.com/gpu'
typ = get_resource(resource_request, kind)
if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
if disabled:
logger.debug('CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.', worker_index)
environ['CUDA_VISIBLE_DEVICES'] = cuda_env
return environ
environ['CUDA_VISIBLE_DEVICES'] = cuda_env; return environ
environ['CUDA_VISIBLE_DEVICES'] = cls.transpile_workers_to_cuda_envvar(workers_per_resource, typ, worker_index)
logger.debug('Environ for worker %s: %s', worker_index, environ)
return environ
# use AMD
kind = 'amd.com/gpu'
typ = get_resource(resource_request, kind, validate=False)
if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
if disabled:
logger.debug('CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.', worker_index)
environ['CUDA_VISIBLE_DEVICES'] = cuda_env
return environ
environ['CUDA_VISIBLE_DEVICES'] = cuda_env; return environ
environ['CUDA_VISIBLE_DEVICES'] = cls.transpile_workers_to_cuda_envvar(workers_per_resource, typ, worker_index)
logger.debug('Environ for worker %s: %s', worker_index, environ)
return environ
# use CPU
cpus = get_resource(resource_request, 'cpu')
@@ -298,25 +232,17 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
environ['CUDA_VISIBLE_DEVICES'] = '-1' # disable gpu
if runnable_class.SUPPORTS_CPU_MULTI_THREADING:
thread_count = math.ceil(cpus)
for thread_env in THREAD_ENVS:
environ[thread_env] = os.environ.get(thread_env, str(thread_count))
logger.debug('Environ for worker %s: %s', worker_index, environ)
for thread_env in THREAD_ENVS: environ[thread_env] = os.environ.get(thread_env, str(thread_count))
return environ
for thread_env in THREAD_ENVS:
environ[thread_env] = os.environ.get(thread_env, '1')
for thread_env in THREAD_ENVS: environ[thread_env] = os.environ.get(thread_env, '1')
return environ
return environ
@staticmethod
def transpile_workers_to_cuda_envvar(workers_per_resource, gpus, worker_index):
# Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.
if isinstance(workers_per_resource, float):
# NOTE: We hit this branch when workers_per_resource is set to
# float, for example 0.5 or 0.25
if workers_per_resource > 1:
raise ValueError(
"Currently, the default strategy doesn't support workers_per_resource > 1. It is recommended that one should implement a custom strategy in this case."
)
# NOTE: We hit this branch when workers_per_resource is set to float, for example 0.5 or 0.25
if workers_per_resource > 1: raise ValueError('workers_per_resource > 1 is not supported.')
# We round the assigned resources here. This means if workers_per_resource=0.4,
# then it rounds down to 2; if workers_per_resource=0.6, it rounds up to 2.
assigned_resource_per_worker = round(1 / workers_per_resource)
@@ -327,21 +253,12 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
worker_index,
assigned_resource_per_worker,
)
raise IndexError(
f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}]."
)
assigned_gpu = gpus[
assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)
]
raise IndexError(f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}].")
assigned_gpu = gpus[assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)]
dev = ','.join(assigned_gpu)
else:
idx = worker_index // workers_per_resource
if idx >= len(gpus):
raise ValueError(
f'Number of available GPUs ({gpus}) is insufficient for the given workers_per_resource {workers_per_resource}'
)
raise ValueError(f'Number of available GPUs ({gpus}) is insufficient for the given workers_per_resource {workers_per_resource}')
dev = str(gpus[idx])
return dev
__all__ = ['CascadingResourceStrategy', 'get_resource']
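The float/int branching in `transpile_workers_to_cuda_envvar` above is easier to follow as a standalone sketch (an illustrative mirror of the diff, not the library method; the bounds check in the float branch is an assumption, since the hunk truncates that condition):

```python
def transpile_workers_to_cuda_envvar(workers_per_resource, gpus, worker_index):
    if isinstance(workers_per_resource, float):
        # e.g. 0.5 -> each worker is assigned round(1 / 0.5) = 2 GPUs
        if workers_per_resource > 1:
            raise ValueError('workers_per_resource > 1 is not supported.')
        per_worker = round(1 / workers_per_resource)
        start = per_worker * worker_index
        if start + per_worker > len(gpus):
            raise IndexError(f"There aren't enough assigned GPU(s) for worker '{worker_index}' [required: {per_worker}].")
        return ','.join(gpus[start:start + per_worker])
    # int: several workers share one GPU, picked by integer division
    idx = worker_index // workers_per_resource
    if idx >= len(gpus):
        raise ValueError(f'Number of available GPUs ({gpus}) is insufficient for the given workers_per_resource {workers_per_resource}')
    return str(gpus[idx])

# With gpus=['0', '1', '2', '3'] and workers_per_resource=0.5:
#   worker 0 -> '0,1', worker 1 -> '2,3'
```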

View File

@@ -4,7 +4,6 @@ from openllm_core._typing_compat import LiteralVersionStrategy
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils.lazy import VersionInfo, LazyModule
_OWNER, _REPO = 'bentoml', 'openllm'
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
git_hash: str = attr.field()
@@ -17,7 +16,7 @@ class RefResolver:
if strategy_or_version is None or strategy_or_version == 'release':
try:
from ghapi.all import GhApi
ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
ghapi = GhApi(owner='bentoml', repo='openllm', authenticate=False)
meta = ghapi.repos.get_latest_release()
git_hash = ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha']
except Exception as err:
@@ -35,6 +34,4 @@ __lazy = LazyModule(
{'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options']},
extra_objects={'RefResolver': RefResolver}
)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__

View File

@@ -1,15 +1,7 @@
# mypy: disable-error-code="misc"
from __future__ import annotations
import importlib.metadata
import logging
import os
from pathlib import Path
import orjson
import importlib.metadata, logging, os, pathlib
import bentoml, orjson, openllm_core
from simple_di import Provide, inject
import bentoml
import openllm_core
from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions
from bentoml._internal.configuration.containers import BentoMLContainer
from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg
@@ -17,7 +9,7 @@ from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg
logger = logging.getLogger(__name__)
OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
_service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
_service_file = pathlib.Path(os.path.abspath(__file__)).parent.parent / '_service.py'
_SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}'''
def build_editable(path, package='openllm'):
@@ -28,7 +20,7 @@ def build_editable(path, package='openllm'):
from build.env import IsolatedEnvBuilder
module_location = pkg.source_locations(package)
if not module_location: raise RuntimeError('Could not find the source location of OpenLLM.')
pyproject_path = Path(module_location).parent.parent / 'pyproject.toml'
pyproject_path = pathlib.Path(module_location).parent.parent / 'pyproject.toml'
if os.path.isfile(pyproject_path.__fspath__()):
with IsolatedEnvBuilder() as env:
builder = ProjectBuilder(pyproject_path.parent)
@@ -70,12 +62,9 @@ def create_bento(
labels = dict(llm.identifying_params)
labels.update(
{
'_type': llm.llm_type, '_framework': llm.__llm_backend__, 'start_name': llm.config['start_name'],
'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle',
**{
f'{package.replace("-","_")}_version': importlib.metadata.version(package)
for package in {'openllm', 'openllm-core', 'openllm-client'}
},
'_type': llm.llm_type, '_framework': llm.__llm_backend__,
'start_name': llm.config['start_name'], 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle',
**{f'{package.replace("-","_")}_version': importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}},
}
)
if adapter_map: labels.update(adapter_map)
@@ -83,18 +72,15 @@ def create_bento(
logger.debug("Building Bento '%s' with model backend '%s'", bento_tag, llm.__llm_backend__)
logger.debug('Generating service vars %s (dir=%s)', llm.model_id, llm_fs.getsyspath('/'))
script = f"# fmt: off\n# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n" + _SERVICE_VARS.format(
__model_id__=llm.model_id,
__model_tag__=str(llm.tag),
__model_adapter_map__=orjson.dumps(adapter_map).decode(),
__model_serialization__=llm.config['serialisation'],
__model_id__=llm.model_id, __model_tag__=str(llm.tag), #
__model_adapter_map__=orjson.dumps(adapter_map).decode(), __model_serialization__=llm.config['serialisation'], #
__model_trust_remote_code__=str(llm.trust_remote_code),
)
if SHOW_CODEGEN: logger.info('Generated _service_vars.py:\n%s', script)
llm_fs.writetext('_service_vars.py', script)
with open(_service_file.__fspath__(), 'r') as f: service_src = f.read()
llm_fs.writetext(llm.config['service_name'], service_src)
bento = bentoml.Bento.create(
return bentoml.Bento.create(
version=bento_tag.version,
build_ctx=llm_fs.getsyspath('/'),
build_config=BentoBuildConfig(
@@ -108,6 +94,4 @@ def create_bento(
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
docker=construct_docker_options(llm, llm_fs, quantize, adapter_map, dockerfile_template, _serialisation),
),
)
return bento.save(bento_store=_bento_store, model_store=_model_store)
).save(bento_store=_bento_store, model_store=_model_store)
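For illustration, with hypothetical values the `_SERVICE_VARS` template above renders an `_service_vars.py` along these lines (the model id, tag, and flags are made up):

```python
# fmt: off
# GENERATED BY 'openllm build facebook/opt-125m'. DO NOT EDIT
import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='facebook/opt-125m','vllm-facebook--opt-125m:abc123',orjson.loads("""null"""),'safetensors',False
```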

View File

@@ -1,10 +1,2 @@
def __dir__():
import openllm_client as _client
return sorted(dir(_client))
def __getattr__(it):
import openllm_client as _client
return getattr(_client, it)
def __dir__(): import openllm_client as _client; return sorted(dir(_client))
def __getattr__(it): import openllm_client as _client; return getattr(_client, it)

View File

@@ -1,20 +1,11 @@
import importlib
from openllm_core.utils import LazyModule
_import_structure = {'openai': [], 'hf': [], 'cohere': []}
def mount_entrypoints(svc, llm):
for module_name in _import_structure:
module = importlib.import_module(f'.{module_name}', __name__)
svc = module.mount_to_svc(svc, llm)
return svc
__lazy = LazyModule(
__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints}
)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__
__lazy = LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints})
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__

View File

@@ -1,17 +1,11 @@
from __future__ import annotations
import functools
import json
import logging
import traceback
import functools, json, logging, traceback
from http import HTTPStatus
import orjson
from starlette.applications import Starlette
from starlette.responses import JSONResponse, StreamingResponse
from starlette.routing import Route
from openllm_core.utils import DEBUG, converter, gen_random_uuid
from ._openapi import add_schema_definitions, append_schemas, get_generator
from ..protocol.cohere import (
Chat,
@@ -54,41 +48,31 @@ schemas = get_generator(
logger = logging.getLogger(__name__)
def jsonify_attr(obj):
return json.dumps(converter.unstructure(obj))
def jsonify_attr(obj): return json.dumps(converter.unstructure(obj))
def error_response(status_code, message):
return JSONResponse(converter.unstructure(CohereErrorResponse(text=message)), status_code=status_code.value)
async def check_model(request, model):
if request.model is None or request.model == model:
return None
if request.model is None or request.model == model: return None
return error_response(
HTTPStatus.NOT_FOUND,
f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see current running models.",
)
def mount_to_svc(svc, llm):
app = Starlette(
debug=True,
routes=[
Route(
'/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST']
),
Route('/v1/chat', endpoint=functools.partial(cohere_chat, llm=llm), name='cohere_chat', methods=['POST']),
Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False),
Route('/v1/chat', endpoint=functools.partial(cohere_chat, llm=llm), name='cohere_chat', methods=['POST']),
Route('/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST']),
],
)
mount_path = '/cohere'
svc.mount_asgi_app(app, path=mount_path)
return append_schemas(
svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG
)
return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG)
@add_schema_definitions
async def cohere_generate(req, llm):
@@ -181,7 +165,6 @@ def _transpile_cohere_chat_messages(request: CohereChatRequest) -> list[dict[str
messages.append({'role': 'user', 'content': request.message})
return messages
@add_schema_definitions
async def cohere_chat(req, llm):
json_str = await req.body()

View File

@@ -1,14 +1,10 @@
import functools
import logging
import functools, logging
from http import HTTPStatus
import orjson
from starlette.applications import Starlette
from starlette.responses import JSONResponse
from starlette.routing import Route
from openllm_core.utils import converter
from ._openapi import add_schema_definitions, append_schemas, get_generator
from ..protocol.hf import AgentRequest, AgentResponse, HFErrorResponse
@@ -25,7 +21,6 @@ schemas = get_generator(
)
logger = logging.getLogger(__name__)
def mount_to_svc(svc, llm):
app = Starlette(
debug=True,
@@ -39,13 +34,8 @@ def mount_to_svc(svc, llm):
svc.mount_asgi_app(app, path=mount_path)
return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append')
def error_response(status_code, message):
return JSONResponse(
converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)),
status_code=status_code.value,
)
return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
@add_schema_definitions
async def hf_agent(req, llm):
@@ -60,18 +50,14 @@ async def hf_agent(req, llm):
stop = request.parameters.pop('stop', ['\n'])
try:
result = await llm.generate(request.inputs, stop=stop, **request.parameters)
return JSONResponse(
converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value
)
return JSONResponse(converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value)
except Exception as err:
logger.error('Error while generating: %s', err)
return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')
@add_schema_definitions
def hf_adapters(req, llm):
if not llm.has_adapters:
return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.')
if not llm.has_adapters: return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.')
return JSONResponse(
{
adapter_tuple[1]: {'adapter_name': k, 'adapter_type': adapter_tuple[0].peft_type.value}

View File

@@ -1,10 +1,7 @@
from openllm_core.exceptions import (
Error as Error,
FineTuneStrategyNotSupportedError as FineTuneStrategyNotSupportedError,
ForbiddenAttributeError as ForbiddenAttributeError,
GpuNotAvailableError as GpuNotAvailableError,
Error as Error, FineTuneStrategyNotSupportedError as FineTuneStrategyNotSupportedError, #
ForbiddenAttributeError as ForbiddenAttributeError, GpuNotAvailableError as GpuNotAvailableError, #
OpenLLMException as OpenLLMException, ValidationError as ValidationError, #
MissingAnnotationAttributeError as MissingAnnotationAttributeError,
MissingDependencyError as MissingDependencyError,
OpenLLMException as OpenLLMException,
ValidationError as ValidationError,
)

View File

@@ -5,11 +5,6 @@ import typing as t
from openllm_core.utils import LazyModule
_import_structure: dict[str, list[str]] = {'openai': [], 'cohere': [], 'hf': []}
if t.TYPE_CHECKING:
from . import cohere as cohere, hf as hf, openai as openai
if t.TYPE_CHECKING: from . import cohere as cohere, hf as hf, openai as openai
__lazy = LazyModule(__name__, os.path.abspath('__file__'), _import_structure)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__

View File

@@ -1,36 +1,16 @@
import functools, importlib.metadata, openllm_core
__all__ = ['generate_labels', 'available_devices', 'device_count']
def generate_labels(llm):
return {
'backend': llm.__llm_backend__,
'framework': 'openllm',
'model_name': llm.config['model_name'],
'architecture': llm.config['architecture'],
'serialisation': llm._serialisation,
'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], #
'architecture': llm.config['architecture'], 'serialisation': llm._serialisation, #
**{package: importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}},
}
def available_devices():
from ._strategies import NvidiaGpuResource
return tuple(NvidiaGpuResource.from_system())
def available_devices(): from ._strategies import NvidiaGpuResource; return tuple(NvidiaGpuResource.from_system())
@functools.lru_cache(maxsize=1)
def device_count() -> int:
return len(available_devices())
def device_count() -> int: return len(available_devices())
def __dir__():
coreutils = set(dir(openllm_core.utils)) | set([it for it in openllm_core.utils._extras if not it.startswith('_')])
return sorted(__all__) + sorted(list(coreutils))
coreutils = set(dir(openllm_core.utils)) | set([it for it in openllm_core.utils._extras if not it.startswith('_')]); return sorted(__all__) + sorted(list(coreutils))
def __getattr__(it):
if hasattr(openllm_core.utils, it):
return getattr(openllm_core.utils, it)
if hasattr(openllm_core.utils, it): return getattr(openllm_core.utils, it)
raise AttributeError(f'module {__name__} has no attribute {it}')