From c33b071ee4785d1f7c1f80a6796a32b97552a82f Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Tue, 21 Nov 2023 04:39:48 -0500 Subject: [PATCH] refactor: delete unused code (#716) Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --- mypy.ini | 2 +- openllm-client/src/openllm_client/_utils.pyi | 1 - .../src/openllm_core/utils/__init__.py | 54 +--- openllm-python/src/openllm/__init__.py | 11 - openllm-python/src/openllm/__init__.pyi | 10 - openllm-python/src/openllm/__main__.py | 15 +- openllm-python/src/openllm/_deprecated.py | 13 +- openllm-python/src/openllm/_generation.py | 17 -- openllm-python/src/openllm/_generation.pyi | 22 +- openllm-python/src/openllm/_llm.py | 6 +- openllm-python/src/openllm/_llm.pyi | 2 - openllm-python/src/openllm/_runners.py | 23 +- openllm-python/src/openllm/_strategies.py | 242 +++++++--------- openllm-python/src/openllm/client.py | 9 +- openllm-python/src/openllm/testing.py | 75 ----- .../openllm/{utils/__init__.py => utils.py} | 0 .../openllm/{utils/__init__.pyi => utils.pyi} | 1 - openllm-python/src/openllm_cli/_factory.py | 6 +- openllm-python/src/openllm_cli/_sdk.py | 2 +- openllm-python/src/openllm_cli/entrypoint.py | 115 +------- openllm-python/tests/models/__init__.py | 0 .../flan_t5_test/test_flan_t5[container].json | 33 --- .../flan_t5_test/test_flan_t5[local].json | 33 --- .../opt_test/test_opt_125m[container].json | 34 --- .../opt_test/test_opt_125m[local].json | 34 --- openllm-python/tests/models/conftest.py | 266 ------------------ openllm-python/tests/models/flan_t5_test.py | 40 --- openllm-python/tests/models/opt_test.py | 40 --- 28 files changed, 149 insertions(+), 957 deletions(-) delete mode 100644 openllm-python/src/openllm/testing.py rename openllm-python/src/openllm/{utils/__init__.py => utils.py} (100%) rename openllm-python/src/openllm/{utils/__init__.pyi => utils.pyi} (98%) delete mode 100644 openllm-python/tests/models/__init__.py delete mode 100644 openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[container].json delete mode 100644 openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[local].json delete mode 100644 openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[container].json delete mode 100644 openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[local].json delete mode 100644 openllm-python/tests/models/conftest.py delete mode 100644 openllm-python/tests/models/flan_t5_test.py delete mode 100644 openllm-python/tests/models/opt_test.py diff --git a/mypy.ini b/mypy.ini index c63d8cc2..d8b92f8e 100644 --- a/mypy.ini +++ b/mypy.ini @@ -8,4 +8,4 @@ warn_unused_configs = true ignore_missing_imports = true check_untyped_defs = true warn_unreachable = true -files = openllm-python/src/openllm/bundle/__init__.pyi, openllm-python/src/openllm/serialisation/__init__.pyi, openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-python/src/openllm/__init__.pyi, openllm-client/src/openllm_client/_typing_compat.py, openllm-core/src/openllm_core/_typing_compat.py, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/bundle/_package.pyi, openllm-python/src/openllm/_runners.pyi, openllm-python/src/openllm/_quantisation.pyi, openllm-python/src/openllm/_llm.pyi, openllm-python/src/openllm/_generation.pyi, openllm-python/src/openllm/entrypoints/openai.pyi, openllm-python/src/openllm/entrypoints/__init__.pyi, openllm-python/src/openllm/entrypoints/hf.pyi, openllm-python/src/openllm/entrypoints/_openapi.pyi, openllm-python/src/openllm/entrypoints/cohere.pyi, openllm-python/src/openllm/_service_vars.pyi, openllm-python/src/openllm/utils/__init__.pyi, openllm-python/src/openllm/serialisation/_helpers.pyi +files = openllm-python/src/openllm/bundle/__init__.pyi, openllm-python/src/openllm/serialisation/__init__.pyi, openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-python/src/openllm/__init__.pyi, openllm-client/src/openllm_client/_typing_compat.py, openllm-core/src/openllm_core/_typing_compat.py, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/bundle/_package.pyi, openllm-python/src/openllm/_runners.pyi, openllm-python/src/openllm/_quantisation.pyi, openllm-python/src/openllm/_llm.pyi, openllm-python/src/openllm/_generation.pyi, openllm-python/src/openllm/entrypoints/openai.pyi, openllm-python/src/openllm/entrypoints/__init__.pyi, openllm-python/src/openllm/entrypoints/hf.pyi, openllm-python/src/openllm/entrypoints/_openapi.pyi, openllm-python/src/openllm/entrypoints/cohere.pyi, openllm-python/src/openllm/_service_vars.pyi, openllm-python/src/openllm/serialisation/_helpers.pyi, openllm-python/src/openllm/utils.pyi diff --git a/openllm-client/src/openllm_client/_utils.pyi b/openllm-client/src/openllm_client/_utils.pyi index 6aa84fc2..3e2e5f25 100644 --- a/openllm-client/src/openllm_client/_utils.pyi +++ b/openllm-client/src/openllm_client/_utils.pyi @@ -22,7 +22,6 @@ from openllm_core.utils import ( getenv as getenv, in_notebook as in_notebook, lenient_issubclass as lenient_issubclass, - reserve_free_port as reserve_free_port, resolve_filepath as resolve_filepath, resolve_user_filepath as resolve_user_filepath, serde as serde, diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index faa92acb..d0963bf1 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -1,12 +1,9 @@ from __future__ import annotations -import contextlib import functools import hashlib import logging import logging.config import os -import random -import socket import sys import types import typing as t @@ -19,7 +16,6 @@ from .lazy import LazyLoader as LazyLoader, LazyModule as LazyModule, VersionInf from .._typing_compat import overload as _overload if t.TYPE_CHECKING: - from bentoml._internal.models.model import ModelContext from bentoml._internal.types import PathType from openllm_core._typing_compat import AnyCallable @@ -36,52 +32,6 @@ _object_setattr = object.__setattr__ logger = logging.getLogger(__name__) -@contextlib.contextmanager -def reserve_free_port( - host: str = 'localhost', - port: int | None = None, - prefix: str | None = None, - max_retry: int = 50, - enable_so_reuseport: bool = False, -) -> t.Iterator[int]: - """ - detect free port and reserve until exit the context - """ - import psutil - - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - if enable_so_reuseport: - if psutil.WINDOWS: - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - elif psutil.MACOS or psutil.FREEBSD: - sock.setsockopt(socket.SOL_SOCKET, 0x10000, 1) # SO_REUSEPORT_LB - else: - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) - if sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT) == 0: - raise RuntimeError('Failed to set SO_REUSEPORT.') from None - if prefix is not None: - prefix_num = int(prefix) * 10 ** (5 - len(prefix)) - suffix_range = min(65535 - prefix_num, 10 ** (5 - len(prefix))) - for _ in range(max_retry): - suffix = random.randint(0, suffix_range) - port = int(f'{prefix_num + suffix}') - try: - sock.bind((host, port)) - break - except OSError: - continue - else: - raise RuntimeError(f'Cannot find free port with prefix {prefix} after {max_retry} retries.') from None - elif port: - sock.bind((host, port)) - else: - sock.bind((host, 0)) - try: - yield sock.getsockname()[1] - finally: - sock.close() - - # fmt: off _T=t.TypeVar('_T') @functools.lru_cache(maxsize=1) @@ -133,7 +83,7 @@ def set_disable_warnings(disable:bool=True)->None: if get_disable_warnings():os.environ[WARNING_ENV_VAR]=str(disable) def set_debug_mode(enabled:bool,level:int=1)->None: if enabled:os.environ[DEV_DEBUG_VAR] = str(level) - os.environ.update({DEBUG_ENV_VAR:str(enabled),_GRPC_DEBUG_ENV_VAR:'DEBUG' if enabled else 'ERROR','CT2_VERBOSE':'3'}) + os.environ.update({DEBUG_ENV_VAR:str(enabled),QUIET_ENV_VAR:str(not enabled),_GRPC_DEBUG_ENV_VAR:'DEBUG' if enabled else 'ERROR','CT2_VERBOSE':'3'}) set_disable_warnings(enabled) def set_quiet_mode(enabled:bool)->None: os.environ.update({QUIET_ENV_VAR:str(enabled),_GRPC_DEBUG_ENV_VAR:'NONE','CT2_VERBOSE':'-1'}) @@ -151,7 +101,7 @@ def first_not_none(*args:_T|None,default:_T)->_T:... @_overload def first_not_none(*args:_T|None)->_T|None:... def first_not_none(*args:_T|None,default:_T|None=None)->_T|None:return next((arg for arg in args if arg is not None),default) -def generate_context(framework_name:str)->ModelContext: +def generate_context(framework_name:str): from bentoml._internal.models.model import ModelContext framework_versions={'transformers':pkg.get_pkg_version('transformers'),'safetensors':pkg.get_pkg_version('safetensors'),'optimum':pkg.get_pkg_version('optimum'),'accelerate':pkg.get_pkg_version('accelerate')} if iutils.is_torch_available():framework_versions['torch']=pkg.get_pkg_version('torch') diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py index 2703d9ec..05bc3d54 100644 --- a/openllm-python/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -3,14 +3,12 @@ import os as _os import pathlib as _pathlib import warnings as _warnings -import openllm_cli as _cli from openllm_cli import _sdk from . import utils as utils if utils.DEBUG: utils.set_debug_mode(True) - utils.set_quiet_mode(False) _logging.basicConfig(level=_logging.NOTSET) else: # configuration for bitsandbytes before import @@ -47,18 +45,9 @@ __lazy = utils.LazyModule( 'serialisation': ['ggml', 'transformers'], '_quantisation': ['infer_quantisation_config'], '_llm': ['LLM'], - '_generation': [ - 'StopSequenceCriteria', - 'StopOnTokens', - 'prepare_logits_processor', - 'get_context_length', - 'is_sentence_complete', - 'is_partial_stop', - ], }, extra_objects={ 'COMPILED': COMPILED, - 'cli': _cli, 'start': _sdk.start, 'start_grpc': _sdk.start_grpc, 'build': _sdk.build, diff --git a/openllm-python/src/openllm/__init__.pyi b/openllm-python/src/openllm/__init__.pyi index 11047560..a90cb999 100644 --- a/openllm-python/src/openllm/__init__.pyi +++ b/openllm-python/src/openllm/__init__.pyi @@ -16,7 +16,6 @@ from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING,CONFIG_MAPPING_N # update-config-stubs.py: import stubs stop # fmt: on -import openllm_cli as _cli from openllm_cli._sdk import ( build as build, import_model as import_model, @@ -44,14 +43,6 @@ from . import ( utils as utils, ) from ._deprecated import Runner as Runner -from ._generation import ( - StopOnTokens as StopOnTokens, - StopSequenceCriteria as StopSequenceCriteria, - prepare_logits_processor as prepare_logits_processor, - is_partial_stop as is_partial_stop, - is_sentence_complete as is_sentence_complete, - get_context_length as get_context_length, -) from ._llm import LLM as LLM from ._quantisation import infer_quantisation_config as infer_quantisation_config from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource @@ -60,5 +51,4 @@ from .entrypoints import mount_entrypoints as mount_entrypoints from .protocol import openai as openai from .serialisation import ggml as ggml, transformers as transformers -cli = _cli COMPILED: bool = ... diff --git a/openllm-python/src/openllm/__main__.py b/openllm-python/src/openllm/__main__.py index 2babfd90..99866462 100644 --- a/openllm-python/src/openllm/__main__.py +++ b/openllm-python/src/openllm/__main__.py @@ -1,13 +1,2 @@ -"""CLI entrypoint for OpenLLM. - -Usage: - openllm --help - -To start any OpenLLM model: - openllm start --options ... -""" - -if __name__ == '__main__': - from openllm_cli.entrypoint import cli - - cli() +# fmt: off +if __name__ == '__main__':from openllm_cli.entrypoint import cli;cli() # noqa diff --git a/openllm-python/src/openllm/_deprecated.py b/openllm-python/src/openllm/_deprecated.py index b3564568..c4612209 100644 --- a/openllm-python/src/openllm/_deprecated.py +++ b/openllm-python/src/openllm/_deprecated.py @@ -6,7 +6,10 @@ import warnings import openllm from openllm_core._typing_compat import LiteralBackend, ParamSpec -from openllm_core.utils import first_not_none, is_vllm_available +from openllm_core.utils import first_not_none, getenv, is_vllm_available + +if t.TYPE_CHECKING: + from ._runners import Runner as _Runner P = ParamSpec('P') @@ -20,7 +23,7 @@ def Runner( backend: LiteralBackend | None = None, llm_config: openllm.LLMConfig | None = None, **attrs: t.Any, -) -> openllm.LLMRunner[t.Any, t.Any]: +) -> _Runner[t.Any, t.Any]: """Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'. > [!WARNING] @@ -73,9 +76,9 @@ def Runner( attrs.update( { 'model_id': model_id, - 'quantize': os.getenv('OPENLLM_QUANTIZE', attrs.get('quantize', None)), - 'serialisation': first_not_none( - attrs.get('serialisation'), os.environ.get('OPENLLM_SERIALIZATION'), default=llm_config['serialisation'] + 'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)), + 'serialisation': getenv( + 'serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION'] ), } ) diff --git a/openllm-python/src/openllm/_generation.py b/openllm-python/src/openllm/_generation.py index c3b4fa5b..c4f87c5b 100644 --- a/openllm-python/src/openllm/_generation.py +++ b/openllm-python/src/openllm/_generation.py @@ -1,23 +1,6 @@ import transformers -class StopSequenceCriteria(transformers.StoppingCriteria): - def __init__(self, stop_sequences, tokenizer): - if isinstance(stop_sequences, str): - stop_sequences = [stop_sequences] - self.stop_sequences, self.tokenizer = stop_sequences, tokenizer - - def __call__(self, input_ids, scores, **kwargs): - return any( - self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences - ) - - -class StopOnTokens(transformers.StoppingCriteria): - def __call__(self, input_ids, scores, **kwargs): - return input_ids[0][-1] in {50278, 50279, 50277, 1, 0} - - def prepare_logits_processor(config): generation_config = config.generation_config logits_processor = transformers.LogitsProcessorList() diff --git a/openllm-python/src/openllm/_generation.pyi b/openllm-python/src/openllm/_generation.pyi index c727f6be..845346f5 100644 --- a/openllm-python/src/openllm/_generation.pyi +++ b/openllm-python/src/openllm/_generation.pyi @@ -1,27 +1,7 @@ -from typing import Any, List, Union - -from torch import FloatTensor, LongTensor -from transformers import ( - LogitsProcessorList, - PretrainedConfig, - PreTrainedTokenizer, - PreTrainedTokenizerBase, - PreTrainedTokenizerFast, -) +from transformers import LogitsProcessorList, PretrainedConfig from openllm_core import LLMConfig -Tokenizer = Union[PreTrainedTokenizerBase, PreTrainedTokenizer, PreTrainedTokenizerFast] - -class StopSequenceCriteria: - stop_sequences: List[str] - tokenizer: Tokenizer - def __init__(self, stop_sequences: Union[str, List[str]], tokenizer: Tokenizer) -> None: ... - def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ... - -class StopOnTokens: - def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ... - def prepare_logits_processor(config: LLMConfig) -> LogitsProcessorList: ... def get_context_length(config: PretrainedConfig) -> int: ... def is_sentence_complete(output: str) -> bool: ... diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index b08b0e75..31b63d72 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -71,9 +71,7 @@ def normalise_model_name(name: str) -> str: def _resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap: if not is_peft_available(): - raise RuntimeError( - "LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'" - ) + raise RuntimeError("Requires 'peft' to be installed. Do 'pip install \"openllm[fine-tune]\"'") from huggingface_hub import hf_hub_download resolved: AdapterMap = {} @@ -285,8 +283,6 @@ class LLM(t.Generic[M, T], ReprMixin): if env is not None:return str(env).upper() in ENV_VARS_TRUE_VALUES return self.__llm_trust_remote_code__ @property - def runner_name(self):return f"llm-{self.config['start_name']}-runner" - @property def model_id(self):return self._model_id @property def revision(self):return self._revision diff --git a/openllm-python/src/openllm/_llm.pyi b/openllm-python/src/openllm/_llm.pyi index e47a4294..af88d171 100644 --- a/openllm-python/src/openllm/_llm.pyi +++ b/openllm-python/src/openllm/_llm.pyi @@ -97,8 +97,6 @@ class LLM(Generic[M, T]): @property def trust_remote_code(self) -> bool: ... @property - def runner_name(self) -> str: ... - @property def model_id(self) -> str: ... @property def revision(self) -> str: ... diff --git a/openllm-python/src/openllm/_runners.py b/openllm-python/src/openllm/_runners.py index a4293534..a9c72ca9 100644 --- a/openllm-python/src/openllm/_runners.py +++ b/openllm-python/src/openllm/_runners.py @@ -9,7 +9,6 @@ import torch import bentoml import openllm from openllm_core._schemas import CompletionChunk, GenerationOutput, SampleLogprobs -from openllm_core.exceptions import OpenLLMException from openllm_core.utils import ReprMixin, is_ctranslate_available, is_vllm_available __all__ = ['runner'] @@ -28,12 +27,10 @@ def registry(cls=None, *, alias=None): def runner(llm: openllm.LLM): - from ._strategies import CascadingResourceStrategy - try: - models = [llm.bentomodel] - except bentoml.exceptions.NotFound as err: - raise RuntimeError(f'Failed to locate {llm.bentomodel}:{err}') from err + assert llm.bentomodel + except (bentoml.exceptions.NotFound, AssertionError) as err: + raise RuntimeError(f'Failed to locate {llm.bentomodel}: {err}') from err return types.new_class( llm.config.__class__.__name__[:-6] + 'Runner', @@ -73,9 +70,9 @@ def runner(llm: openllm.LLM): ), )( _registry[llm.__llm_backend__], - name=llm.runner_name, - models=models, - scheduling_strategy=CascadingResourceStrategy, + name=f"llm-{llm.config['start_name']}-runner", + models=[llm.bentomodel], + scheduling_strategy=openllm.CascadingResourceStrategy, runnable_init_params={'llm': llm}, ) @@ -87,7 +84,7 @@ class CTranslateRunnable(bentoml.Runnable): def __init__(self, llm): if not is_ctranslate_available(): - raise OpenLLMException('ctranslate is not installed. Please install it with `pip install "openllm[ctranslate]"`') + raise openllm.exceptions.OpenLLMException('ctranslate is not installed. Do `pip install "openllm[ctranslate]"`') self.llm, self.config, self.model, self.tokenizer = llm, llm.config, llm.model, llm.tokenizer @bentoml.Runnable.method(batchable=False) @@ -137,7 +134,7 @@ class vLLMRunnable(bentoml.Runnable): def __init__(self, llm): if not is_vllm_available(): - raise OpenLLMException('vLLM is not installed. Please install it via `pip install "openllm[vllm]"`.') + raise openllm.exceptions.OpenLLMException('vLLM is not installed. Do `pip install "openllm[vllm]"`.') import vllm self.llm, self.config, self.tokenizer = llm, llm.config, llm.tokenizer @@ -162,7 +159,9 @@ class vLLMRunnable(bentoml.Runnable): ) except Exception as err: traceback.print_exc() - raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err + raise openllm.exceptions.OpenLLMException( + f'Failed to initialise vLLMEngine due to the following error:\n{err}' + ) from err @bentoml.Runnable.method(batchable=False) async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs): diff --git a/openllm-python/src/openllm/_strategies.py b/openllm-python/src/openllm/_strategies.py index 1f1d6b47..7a731a52 100644 --- a/openllm-python/src/openllm/_strategies.py +++ b/openllm-python/src/openllm/_strategies.py @@ -14,22 +14,13 @@ import psutil import bentoml from bentoml._internal.resource import get_resource, system_resources from bentoml._internal.runner.strategy import THREAD_ENVS -from openllm_core._typing_compat import overload from openllm_core.utils import DEBUG, ReprMixin - -class DynResource(t.Protocol): - resource_id: t.ClassVar[str] - - @classmethod - def from_system(cls) -> t.Sequence[t.Any]: ... - - logger = logging.getLogger(__name__) def _strtoul(s: str) -> int: - """Return -1 or positive integer sequence string starts with,.""" + # Return -1 or positive integer sequence string starts with. if not s: return -1 idx = 0 @@ -55,21 +46,6 @@ def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]: return rcs -_STACK_LEVEL = 3 - - -@overload # variant: default callback -def _parse_visible_devices() -> list[str] | None: ... - - -@overload # variant: specify None, and respect_env -def _parse_visible_devices(default_var: None, *, respect_env: t.Literal[True]) -> list[str] | None: ... - - -@overload # variant: default var is something other than None -def _parse_visible_devices(default_var: str = ..., *, respect_env: t.Literal[False]) -> list[str]: ... - - def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None: """CUDA_VISIBLE_DEVICES aware with default var for parsing spec.""" if respect_env: @@ -101,146 +77,136 @@ def _parse_visible_devices(default_var: str | None = None, respect_env: bool = T return [str(i) for i in rc] -def _from_system(cls: type[DynResource]) -> list[str]: - visible_devices = _parse_visible_devices() - if visible_devices is None: - if cls.resource_id == 'amd.com/gpu': - if not psutil.LINUX: - if DEBUG: - logger.debug('AMD GPUs is currently only supported on Linux.') - return [] - # ROCm does not currently have the rocm_smi wheel. - # So we need to use the ctypes bindings directly. - # we don't want to use CLI because parsing is a pain. - sys.path.append('/opt/rocm/libexec/rocm_smi') - try: - from ctypes import byref, c_uint32 - - # refers to https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/python_smi_tools/rsmiBindings.py - from rsmiBindings import rocmsmi, rsmi_status_t - - device_count = c_uint32(0) - ret = rocmsmi.rsmi_num_monitor_devices(byref(device_count)) - if ret == rsmi_status_t.RSMI_STATUS_SUCCESS: - return [str(i) for i in range(device_count.value)] - return [] - # In this case the binary is not found, returning empty list - except (ModuleNotFoundError, ImportError): - return [] - finally: - sys.path.remove('/opt/rocm/libexec/rocm_smi') - else: - try: - from cuda import cuda - - cuda.cuInit(0) - _, dev = cuda.cuDeviceGetCount() - return [str(i) for i in range(dev)] - except (ImportError, RuntimeError, AttributeError): - return [] - return visible_devices - - -@overload -def _from_spec(cls: type[DynResource], spec: int) -> list[str]: ... - - -@overload -def _from_spec(cls: type[DynResource], spec: list[int | str]) -> list[str]: ... - - -@overload -def _from_spec(cls: type[DynResource], spec: str) -> list[str]: ... - - -def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]: - if isinstance(spec, int): - if spec in (-1, 0): - return [] - if spec < -1: - raise ValueError('Spec cannot be < -1.') - return [str(i) for i in range(spec)] - elif isinstance(spec, str): - if not spec: - return [] - if spec.isdigit(): - spec = ','.join([str(i) for i in range(_strtoul(spec))]) - return _parse_visible_devices(spec, respect_env=False) - elif isinstance(spec, list): - return [str(x) for x in spec] - else: - raise TypeError( - f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead." - ) - - def _raw_device_uuid_nvml() -> list[str] | None: from ctypes import CDLL, byref, c_int, c_void_p, create_string_buffer try: nvml_h = CDLL('libnvidia-ml.so.1') except Exception: - warnings.warn('Failed to find nvidia binding', stacklevel=_STACK_LEVEL) + warnings.warn('Failed to find nvidia binding', stacklevel=3) return None rc = nvml_h.nvmlInit() if rc != 0: - warnings.warn("Can't initialize NVML", stacklevel=_STACK_LEVEL) + warnings.warn("Can't initialize NVML", stacklevel=3) return None dev_count = c_int(-1) rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) if rc != 0: - warnings.warn('Failed to get available device from system.', stacklevel=_STACK_LEVEL) + warnings.warn('Failed to get available device from system.', stacklevel=3) return None uuids: list[str] = [] for idx in range(dev_count.value): dev_id = c_void_p() rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id)) if rc != 0: - warnings.warn(f'Failed to get device handle for {idx}', stacklevel=_STACK_LEVEL) + warnings.warn(f'Failed to get device handle for {idx}', stacklevel=3) return None buf_len = 96 buf = create_string_buffer(buf_len) rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len) if rc != 0: - warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=_STACK_LEVEL) + warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=3) return None uuids.append(buf.raw.decode('ascii').strip('\0')) del nvml_h return uuids -def _validate(cls: type[DynResource], val: list[t.Any]) -> None: - if cls.resource_id == 'amd.com/gpu': - raise RuntimeError( - "AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'" - ) - if not all(isinstance(i, str) for i in val): - raise ValueError('Input list should be all string type.') +class _ResourceMixin: + @staticmethod + def from_system(cls) -> list[str]: + visible_devices = _parse_visible_devices() + if visible_devices is None: + if cls.resource_id == 'amd.com/gpu': + if not psutil.LINUX: + if DEBUG: + logger.debug('AMD GPUs is currently only supported on Linux.') + return [] + # ROCm does not currently have the rocm_smi wheel. + # So we need to use the ctypes bindings directly. + # we don't want to use CLI because parsing is a pain. + sys.path.append('/opt/rocm/libexec/rocm_smi') + try: + from ctypes import byref, c_uint32 - try: - from cuda import cuda + # refers to https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/python_smi_tools/rsmiBindings.py + from rsmiBindings import rocmsmi, rsmi_status_t - err, *_ = cuda.cuInit(0) - if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError('Failed to initialise CUDA runtime binding.') - # correctly parse handle - for el in val: - if el.startswith(('GPU-', 'MIG-')): - uuids = _raw_device_uuid_nvml() - if uuids is None: - raise ValueError('Failed to parse available GPUs UUID') - if el not in uuids: - raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})') - elif el.isdigit(): - err, _ = cuda.cuDeviceGet(int(el)) - if err != cuda.CUresult.CUDA_SUCCESS: - raise ValueError(f'Failed to get device {el}') - except (ImportError, RuntimeError): - pass + device_count = c_uint32(0) + ret = rocmsmi.rsmi_num_monitor_devices(byref(device_count)) + if ret == rsmi_status_t.RSMI_STATUS_SUCCESS: + return [str(i) for i in range(device_count.value)] + return [] + # In this case the binary is not found, returning empty list + except (ModuleNotFoundError, ImportError): + return [] + finally: + sys.path.remove('/opt/rocm/libexec/rocm_smi') + else: + try: + from cuda import cuda + + cuda.cuInit(0) + _, dev = cuda.cuDeviceGetCount() + return [str(i) for i in range(dev)] + except (ImportError, RuntimeError, AttributeError): + return [] + return visible_devices + + @staticmethod + def from_spec(cls, spec) -> list[str]: + if isinstance(spec, int): + if spec in (-1, 0): + return [] + if spec < -1: + raise ValueError('Spec cannot be < -1.') + return [str(i) for i in range(spec)] + elif isinstance(spec, str): + if not spec: + return [] + if spec.isdigit(): + spec = ','.join([str(i) for i in range(_strtoul(spec))]) + return _parse_visible_devices(spec, respect_env=False) + elif isinstance(spec, list): + return [str(x) for x in spec] + else: + raise TypeError( + f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead." + ) + + @staticmethod + def validate(cls, val: list[t.Any]) -> None: + if cls.resource_id == 'amd.com/gpu': + raise RuntimeError( + "AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'" + ) + if not all(isinstance(i, str) for i in val): + raise ValueError('Input list should be all string type.') + + try: + from cuda import cuda + + err, *_ = cuda.cuInit(0) + if err != cuda.CUresult.CUDA_SUCCESS: + raise RuntimeError('Failed to initialise CUDA runtime binding.') + # correctly parse handle + for el in val: + if el.startswith(('GPU-', 'MIG-')): + uuids = _raw_device_uuid_nvml() + if uuids is None: + raise ValueError('Failed to parse available GPUs UUID') + if el not in uuids: + raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})') + elif el.isdigit(): + err, _ = cuda.cuDeviceGet(int(el)) + if err != cuda.CUresult.CUDA_SUCCESS: + raise ValueError(f'Failed to get device {el}') + except (ImportError, RuntimeError): + pass -def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]: +def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[bentoml.Resource[t.List[str]]]: return types.new_class( name, (bentoml.Resource[t.List[str]], ReprMixin), @@ -248,9 +214,9 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[ lambda ns: ns.update( { 'resource_id': resource_kind, - 'from_spec': classmethod(_from_spec), - 'from_system': classmethod(_from_system), - 'validate': classmethod(_validate), + 'from_spec': classmethod(_ResourceMixin.from_spec), + 'from_system': classmethod(_ResourceMixin.from_system), + 'validate': classmethod(_ResourceMixin.validate), '__repr_keys__': property(lambda _: {'resource_id'}), '__doc__': inspect.cleandoc(docstring), '__module__': 'openllm._strategies', @@ -259,15 +225,9 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[ ) -# NOTE: we need to hint these t.Literal since mypy is to dumb to infer this as literal 🤦 -_TPU_RESOURCE: t.Literal['cloud-tpus.google.com/v2'] = 'cloud-tpus.google.com/v2' -_AMD_GPU_RESOURCE: t.Literal['amd.com/gpu'] = 'amd.com/gpu' -_NVIDIA_GPU_RESOURCE: t.Literal['nvidia.com/gpu'] = 'nvidia.com/gpu' -_CPU_RESOURCE: t.Literal['cpu'] = 'cpu' - NvidiaGpuResource = _make_resource_class( 'NvidiaGpuResource', - _NVIDIA_GPU_RESOURCE, + 'nvidia.com/gpu', """NVIDIA GPU resource. This is a modified version of internal's BentoML's NvidiaGpuResource @@ -275,7 +235,7 @@ NvidiaGpuResource = _make_resource_class( ) AmdGpuResource = _make_resource_class( 'AmdGpuResource', - _AMD_GPU_RESOURCE, + 'amd.com/gpu', """AMD GPU resource. Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to diff --git a/openllm-python/src/openllm/client.py b/openllm-python/src/openllm/client.py index 91a95db4..a10cf35b 100644 --- a/openllm-python/src/openllm/client.py +++ b/openllm-python/src/openllm/client.py @@ -1,9 +1,6 @@ +# fmt: off import openllm_client as _client -def __dir__(): - return sorted(dir(_client)) - - -def __getattr__(it): - return getattr(_client, it) +def __dir__():return sorted(dir(_client)) +def __getattr__(it):return getattr(_client, it) diff --git a/openllm-python/src/openllm/testing.py b/openllm-python/src/openllm/testing.py deleted file mode 100644 index f51355bf..00000000 --- a/openllm-python/src/openllm/testing.py +++ /dev/null @@ -1,75 +0,0 @@ -from __future__ import annotations -import contextlib -import logging -import shutil -import subprocess -import typing as t - -import bentoml -import openllm - -if t.TYPE_CHECKING: - from openllm_core._typing_compat import LiteralBackend, LiteralQuantise - -logger = logging.getLogger(__name__) - - -@contextlib.contextmanager -def build_bento( - model: str, model_id: str | None = None, quantize: LiteralQuantise | None = None, cleanup: bool = False -) -> t.Iterator[bentoml.Bento]: - logger.info('Building BentoML for %s', model) - bento = openllm.build(model, model_id=model_id, quantize=quantize) - yield bento - if cleanup: - logger.info('Deleting %s', bento.tag) - bentoml.bentos.delete(bento.tag) - - -@contextlib.contextmanager -def build_container( - bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | None = None, cleanup: bool = False, **attrs: t.Any -) -> t.Iterator[str]: - if isinstance(bento, bentoml.Bento): - bento_tag = bento.tag - else: - bento_tag = bentoml.Tag.from_taglike(bento) - if image_tag is None: - image_tag = str(bento_tag) - executable = shutil.which('docker') - if not executable: - raise RuntimeError('docker executable not found') - try: - logger.info('Building container for %s', bento_tag) - bentoml.container.build(bento_tag, backend='docker', image_tag=(image_tag,), progress='plain', **attrs) - yield image_tag - finally: - if cleanup: - logger.info('Deleting container %s', image_tag) - subprocess.check_output([executable, 'rmi', '-f', image_tag]) - - -@contextlib.contextmanager -def prepare( - model: str, - model_id: str, - backend: LiteralBackend = 'pt', - deployment_mode: t.Literal['container', 'local'] = 'local', - clean_context: contextlib.ExitStack | None = None, - cleanup: bool = True, -) -> t.Iterator[str]: - if clean_context is None: - clean_context = contextlib.ExitStack() - cleanup = True - llm = openllm.LLM[t.Any, t.Any](model_id, backend=backend) - bento_tag = bentoml.Tag.from_taglike(f'{llm.llm_type}-service:{llm.tag.version}') - if not bentoml.list(bento_tag): - bento = clean_context.enter_context(build_bento(model, model_id=model_id, cleanup=cleanup)) - else: - bento = bentoml.get(bento_tag) - container_name = f'openllm-{model}-{llm.llm_type}'.replace('-', '_') - if deployment_mode == 'container': - container_name = clean_context.enter_context(build_container(bento, image_tag=container_name, cleanup=cleanup)) - yield container_name - if cleanup: - clean_context.close() diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils.py similarity index 100% rename from openllm-python/src/openllm/utils/__init__.py rename to openllm-python/src/openllm/utils.py diff --git a/openllm-python/src/openllm/utils/__init__.pyi b/openllm-python/src/openllm/utils.pyi similarity index 98% rename from openllm-python/src/openllm/utils/__init__.pyi rename to openllm-python/src/openllm/utils.pyi index ae535687..f5627aec 100644 --- a/openllm-python/src/openllm/utils/__init__.pyi +++ b/openllm-python/src/openllm/utils.pyi @@ -44,7 +44,6 @@ from openllm_core.utils import ( is_transformers_available as is_transformers_available, is_vllm_available as is_vllm_available, lenient_issubclass as lenient_issubclass, - reserve_free_port as reserve_free_port, resolve_filepath as resolve_filepath, resolve_user_filepath as resolve_user_filepath, serde as serde, diff --git a/openllm-python/src/openllm_cli/_factory.py b/openllm-python/src/openllm_cli/_factory.py index 3ebd8374..26bf488b 100644 --- a/openllm-python/src/openllm_cli/_factory.py +++ b/openllm-python/src/openllm_cli/_factory.py @@ -21,7 +21,7 @@ from openllm_core._typing_compat import ( ParamSpec, get_literal_args, ) -from openllm_core.utils import DEBUG, resolve_user_filepath +from openllm_core.utils import DEBUG, compose, dantic, resolve_user_filepath class _OpenLLM_GenericInternalConfig(LLMConfig): @@ -134,7 +134,7 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ... def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]: def wrapper(fn: FC) -> t.Callable[[FC], FC]: - composed = openllm.utils.compose( + composed = compose( _OpenLLM_GenericInternalConfig.parse, _http_server_args if not serve_grpc else _grpc_server_args, cog.optgroup.group('General LLM Options', help='The following options are related to running LLM Server.'), @@ -160,7 +160,7 @@ def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC serialisation_option(factory=cog.optgroup), cog.optgroup.option( '--device', - type=openllm.utils.dantic.CUDA, + type=dantic.CUDA, multiple=True, envvar='CUDA_VISIBLE_DEVICES', callback=parse_device_callback, diff --git a/openllm-python/src/openllm_cli/_sdk.py b/openllm-python/src/openllm_cli/_sdk.py index d868cbff..6fa1154f 100644 --- a/openllm-python/src/openllm_cli/_sdk.py +++ b/openllm-python/src/openllm_cli/_sdk.py @@ -294,7 +294,7 @@ def _list_models() -> dict[str, t.Any]: """List all available models within the local store.""" from .entrypoint import models_command - return models_command.main(args=['--show-available', '--quiet'], standalone_mode=False) + return models_command.main(args=['--quiet'], standalone_mode=False) start, start_grpc = codegen.gen_sdk(_start, _serve_grpc=False), codegen.gen_sdk(_start, _serve_grpc=True) diff --git a/openllm-python/src/openllm_cli/entrypoint.py b/openllm-python/src/openllm_cli/entrypoint.py index 86b31185..4f0286cc 100644 --- a/openllm-python/src/openllm_cli/entrypoint.py +++ b/openllm-python/src/openllm_cli/entrypoint.py @@ -1,25 +1,3 @@ -"""OpenLLM CLI interface. - -This module also contains the SDK to call ``start`` and ``build`` from SDK - -Start any LLM: - -```python -openllm.start('mistral', model_id='mistralai/Mistral-7B-v0.1') -``` - -Build a BentoLLM - -```python -bento = openllm.build('mistralai/Mistral-7B-v0.1') -``` - -Import any LLM into local store -```python -bentomodel = openllm.import_model('mistralai/Mistral-7B-v0.1') -``` -""" - from __future__ import annotations import enum import functools @@ -91,7 +69,6 @@ from openllm_core.utils import ( from . import termui from ._factory import ( FC, - LiteralOutput, _AnyCallable, backend_option, container_registry_option, @@ -1225,7 +1202,11 @@ def models_command(**_: t.Any) -> dict[t.LiteralString, ModelItem]: @model_name_argument(required=False) @click.option('-y', '--yes', '--assume-yes', is_flag=True, help='Skip confirmation when deleting a specific model') @click.option( - '--include-bentos/--no-include-bentos', is_flag=True, default=False, help='Whether to also include pruning bentos.' + '--include-bentos/--no-include-bentos', + is_flag=True, + hidden=True, + default=True, + help='Whether to also include pruning bentos.', ) @inject @click.pass_context @@ -1233,11 +1214,11 @@ def prune_command( ctx: click.Context, model_name: str | None, yes: bool, - include_bentos: bool, model_store: ModelStore = Provide[BentoMLContainer.model_store], bento_store: BentoStore = Provide[BentoMLContainer.bento_store], + **_: t.Any, ) -> None: - """Remove all saved models, (and optionally bentos) built with OpenLLM locally. + """Remove all saved models, and bentos built with OpenLLM locally. \b If a model type is passed, then only prune models for that given model type. @@ -1252,18 +1233,15 @@ def prune_command( (m, store) for m, store in available if 'model_name' in m.info.labels and m.info.labels['model_name'] == inflection.underscore(model_name) + ] + [ + (b, bento_store) + for b in bentoml.bentos.list() + if 'start_name' in b.info.labels and b.info.labels['start_name'] == inflection.underscore(model_name) + ] + if model_name is None: + available += [ + (b, bento_store) for b in bentoml.bentos.list() if '_type' in b.info.labels and '_framework' in b.info.labels ] - if include_bentos: - if model_name is not None: - available += [ - (b, bento_store) - for b in bentoml.bentos.list() - if 'start_name' in b.info.labels and b.info.labels['start_name'] == inflection.underscore(model_name) - ] - else: - available += [ - (b, bento_store) for b in bentoml.bentos.list() if '_type' in b.info.labels and '_framework' in b.info.labels - ] for store_item, store in available: if yes: @@ -1316,69 +1294,6 @@ def shared_client_options(f: _AnyCallable | None = None) -> t.Callable[[FC], FC] return compose(*options)(f) if f is not None else compose(*options) -@cli.command(hidden=True) -@click.argument('task', type=click.STRING, metavar='TASK') -@shared_client_options -@click.option( - '--agent', - type=click.Choice(['hf']), - default='hf', - help='Whether to interact with Agents from given Server endpoint.', - show_default=True, -) -@click.option( - '--remote', - is_flag=True, - default=False, - help='Whether or not to use remote tools (inference endpoints) instead of local ones.', - show_default=True, -) -@click.option( - '--opt', - help="Define prompt options. (format: ``--opt text='I love this' --opt audio:./path/to/audio --opt image:/path/to/file``)", - required=False, - multiple=True, - callback=opt_callback, - metavar='ARG=VALUE[,ARG=VALUE]', -) -def instruct_command( - endpoint: str, - timeout: int, - agent: LiteralString, - output: LiteralOutput, - remote: bool, - task: str, - _memoized: DictStrAny, - **attrs: t.Any, -) -> str: - """Instruct agents interactively for given tasks, from a terminal. - - \b - ```bash - $ openllm instruct --endpoint http://12.323.2.1:3000 \\ - "Is the following `text` (in Spanish) positive or negative?" \\ - --text "¡Este es un API muy agradable!" - ``` - """ - raise click.ClickException("'instruct' is currently disabled") - # client = openllm.client.HTTPClient(endpoint, timeout=timeout) - # - # try: - # client.call('metadata') - # except http.client.BadStatusLine: - # raise click.ClickException(f'{endpoint} is neither a HTTP server nor reachable.') from None - # if agent == 'hf': - # _memoized = {k: v[0] for k, v in _memoized.items() if v} - # client._hf_agent.set_stream(logger.info) - # if output != 'porcelain': termui.echo(f"Sending the following prompt ('{task}') with the following vars: {_memoized}", fg='magenta') - # result = client.ask_agent(task, agent_type=agent, return_code=False, remote=remote, **_memoized) - # if output == 'json': termui.echo(orjson.dumps(result, option=orjson.OPT_INDENT_2).decode(), fg='white') - # else: termui.echo(result, fg='white') - # return result - # else: - # raise click.BadOptionUsage('agent', f'Unknown agent type {agent}') - - @cli.command() @shared_client_options @click.option( diff --git a/openllm-python/tests/models/__init__.py b/openllm-python/tests/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[container].json b/openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[container].json deleted file mode 100644 index 38506cbd..00000000 --- a/openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[container].json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "configuration": { - "generation_config": { - "diversity_penalty": 0.0, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "encoder_repetition_penalty": 1.0, - "epsilon_cutoff": 0.0, - "eta_cutoff": 0.0, - "length_penalty": 1.0, - "max_new_tokens": 10, - "min_length": 0, - "no_repeat_ngram_size": 0, - "num_beam_groups": 1, - "num_beams": 1, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "remove_invalid_values": false, - "renormalize_logits": false, - "repetition_penalty": 1.0, - "temperature": 0.9, - "top_k": 50, - "top_p": 0.9, - "typical_p": 1.0, - "use_cache": true - } - }, - "responses": [ - "life is a complete physical life" - ] -} \ No newline at end of file diff --git a/openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[local].json b/openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[local].json deleted file mode 100644 index 6f1deb95..00000000 --- a/openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[local].json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "configuration": { - "generation_config": { - "diversity_penalty": 0.0, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "encoder_repetition_penalty": 1.0, - "epsilon_cutoff": 0.0, - "eta_cutoff": 0.0, - "length_penalty": 1.0, - "max_new_tokens": 10, - "min_length": 0, - "no_repeat_ngram_size": 0, - "num_beam_groups": 1, - "num_beams": 1, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "remove_invalid_values": false, - "renormalize_logits": false, - "repetition_penalty": 1.0, - "temperature": 0.9, - "top_k": 50, - "top_p": 0.9, - "typical_p": 1.0, - "use_cache": true - } - }, - "responses": [ - "life is a state" - ] -} \ No newline at end of file diff --git a/openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[container].json b/openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[container].json deleted file mode 100644 index 0727c509..00000000 --- a/openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[container].json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "configuration": { - "format_outputs": false, - "generation_config": { - "diversity_penalty": 0.0, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "encoder_repetition_penalty": 1.0, - "epsilon_cutoff": 0.0, - "eta_cutoff": 0.0, - "length_penalty": 1.0, - "max_new_tokens": 20, - "min_length": 0, - "no_repeat_ngram_size": 0, - "num_beam_groups": 1, - "num_beams": 1, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "remove_invalid_values": false, - "renormalize_logits": false, - "repetition_penalty": 1.0, - "temperature": 0.75, - "top_k": 15, - "top_p": 1.0, - "typical_p": 1.0, - "use_cache": true - } - }, - "responses": [ - "What is Deep learning?\nDeep learning is a new way of studying the content and making an informed decision. It is the" - ] -} \ No newline at end of file diff --git a/openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[local].json b/openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[local].json deleted file mode 100644 index b17a783d..00000000 --- a/openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[local].json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "configuration": { - "format_outputs": false, - "generation_config": { - "diversity_penalty": 0.0, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "encoder_repetition_penalty": 1.0, - "epsilon_cutoff": 0.0, - "eta_cutoff": 0.0, - "length_penalty": 1.0, - "max_new_tokens": 20, - "min_length": 0, - "no_repeat_ngram_size": 0, - "num_beam_groups": 1, - "num_beams": 1, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "remove_invalid_values": false, - "renormalize_logits": false, - "repetition_penalty": 1.0, - "temperature": 0.75, - "top_k": 15, - "top_p": 1.0, - "typical_p": 1.0, - "use_cache": true - } - }, - "responses": [ - "What is Deep learning?\n\nDeep learning is a new, highly-advanced, and powerful tool for the deep learning" - ] -} \ No newline at end of file diff --git a/openllm-python/tests/models/conftest.py b/openllm-python/tests/models/conftest.py deleted file mode 100644 index 205dca20..00000000 --- a/openllm-python/tests/models/conftest.py +++ /dev/null @@ -1,266 +0,0 @@ -from __future__ import annotations -import asyncio -import contextlib -import functools -import logging -import sys -import time -import typing as t -from abc import ABC, abstractmethod - -import attr -import docker -import docker.errors -import docker.types -import orjson -import pytest -from syrupy.extensions.json import JSONSnapshotExtension - -import openllm -from bentoml._internal.types import LazyType -from openllm._llm import self -from openllm_core._typing_compat import DictStrAny, ListAny, LiteralQuantise - -logger = logging.getLogger(__name__) - -if t.TYPE_CHECKING: - import subprocess - - from syrupy.assertion import SnapshotAssertion - from syrupy.types import PropertyFilter, PropertyMatcher, SerializableData, SerializedData - - from openllm.client import BaseAsyncClient - - -class ResponseComparator(JSONSnapshotExtension): - def serialize( - self, data: SerializableData, *, exclude: PropertyFilter | None = None, matcher: PropertyMatcher | None = None - ) -> SerializedData: - if LazyType(ListAny).isinstance(data): - data = [d.unmarshaled for d in data] - else: - data = data.unmarshaled - data = self._filter(data=data, depth=0, path=(), exclude=exclude, matcher=matcher) - return orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode() - - def matches(self, *, serialized_data: SerializableData, snapshot_data: SerializableData) -> bool: - def convert_data(data: SerializableData) -> openllm.GenerationOutput | t.Sequence[openllm.GenerationOutput]: - try: - data = orjson.loads(data) - except orjson.JSONDecodeError as err: - raise ValueError(f'Failed to decode JSON data: {data}') from err - if LazyType(DictStrAny).isinstance(data): - return openllm.GenerationOutput(**data) - elif LazyType(ListAny).isinstance(data): - return [openllm.GenerationOutput(**d) for d in data] - else: - raise NotImplementedError(f'Data {data} has unsupported type.') - - serialized_data = convert_data(serialized_data) - snapshot_data = convert_data(snapshot_data) - - if LazyType(ListAny).isinstance(serialized_data): - serialized_data = [serialized_data] - if LazyType(ListAny).isinstance(snapshot_data): - snapshot_data = [snapshot_data] - - def eq_output(s: openllm.GenerationOutput, t: openllm.GenerationOutput) -> bool: - return len(s.outputs) == len(t.outputs) - - return len(serialized_data) == len(snapshot_data) and all( - [eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)] - ) - - -@pytest.fixture() -def response_snapshot(snapshot: SnapshotAssertion): - return snapshot.use_extension(ResponseComparator) - - -@attr.define(init=False) -class _Handle(ABC): - port: int - deployment_mode: t.Literal['container', 'local'] - - client: BaseAsyncClient[t.Any] = attr.field(init=False) - - if t.TYPE_CHECKING: - - def __attrs_init__(self, *args: t.Any, **attrs: t.Any): ... - - def __attrs_post_init__(self): - self.client = openllm.client.AsyncHTTPClient(f'http://localhost:{self.port}') - - @abstractmethod - def status(self) -> bool: - raise NotImplementedError - - async def health(self, timeout: int = 240): - start_time = time.time() - while time.time() - start_time < timeout: - if not self.status(): - raise RuntimeError(f'Failed to initialise {self.__class__.__name__}') - await self.client.health() - try: - await self.client.query('sanity') - return - except Exception: - time.sleep(1) - raise RuntimeError(f'Handle failed to initialise within {timeout} seconds.') - - -@attr.define(init=False) -class LocalHandle(_Handle): - process: subprocess.Popen[bytes] - - def __init__(self, process: subprocess.Popen[bytes], port: int, deployment_mode: t.Literal['container', 'local']): - self.__attrs_init__(port, deployment_mode, process) - - def status(self) -> bool: - return self.process.poll() is None - - -class HandleProtocol(t.Protocol): - @contextlib.contextmanager - def __call__( - *, model: str, model_id: str, image_tag: str, quantize: t.AnyStr | None = None - ) -> t.Generator[_Handle, None, None]: ... - - -@attr.define(init=False) -class DockerHandle(_Handle): - container_name: str - docker_client: docker.DockerClient - - def __init__( - self, - docker_client: docker.DockerClient, - container_name: str, - port: int, - deployment_mode: t.Literal['container', 'local'], - ): - self.__attrs_init__(port, deployment_mode, container_name, docker_client) - - def status(self) -> bool: - container = self.docker_client.containers.get(self.container_name) - return container.status in ['running', 'created'] - - -@contextlib.contextmanager -def _local_handle( - model: str, - model_id: str, - image_tag: str, - deployment_mode: t.Literal['container', 'local'], - quantize: LiteralQuantise | None = None, - *, - _serve_grpc: bool = False, -): - with openllm.utils.reserve_free_port() as port: - pass - - if not _serve_grpc: - proc = openllm.start( - model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True - ) - else: - proc = openllm.start_grpc( - model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True - ) - - yield LocalHandle(proc, port, deployment_mode) - proc.terminate() - proc.wait(60) - - process_output = proc.stdout.read() - print(process_output, file=sys.stderr) - - proc.stdout.close() - if proc.stderr: - proc.stderr.close() - - -@contextlib.contextmanager -def _container_handle( - model: str, - model_id: str, - image_tag: str, - deployment_mode: t.Literal['container', 'local'], - quantize: LiteralQuantise | None = None, - *, - _serve_grpc: bool = False, -): - with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port: - pass - container_name = f'openllm-{model}-{self(model_id)}'.replace('-', '_') - client = docker.from_env() - try: - container = client.containers.get(container_name) - container.stop() - container.wait() - container.remove() - except docker.errors.NotFound: - pass - - args = ['serve' if not _serve_grpc else 'serve-grpc'] - - env: DictStrAny = {} - - if quantize is not None: - env['OPENLLM_QUANTIZE'] = quantize - - gpus = openllm.utils.device_count() or -1 - devs = [docker.types.DeviceRequest(count=gpus, capabilities=[['gpu']])] if gpus > 0 else None - - container = client.containers.run( - image_tag, - command=args, - name=container_name, - environment=env, - auto_remove=False, - detach=True, - device_requests=devs, - ports={'3000/tcp': port, '3001/tcp': prom_port}, - ) - - yield DockerHandle(client, container.name, port, deployment_mode) - - try: - container.stop() - container.wait() - except docker.errors.NotFound: - pass - - container_output = container.logs().decode('utf-8') - print(container_output, file=sys.stderr) - - container.remove() - - -@pytest.fixture(scope='session', autouse=True) -def clean_context() -> t.Generator[contextlib.ExitStack, None, None]: - stack = contextlib.ExitStack() - yield stack - stack.close() - - -@pytest.fixture(scope='module') -def el() -> t.Generator[asyncio.AbstractEventLoop, None, None]: - loop = asyncio.get_event_loop() - yield loop - loop.close() - - -@pytest.fixture(params=['container', 'local'], scope='session') -def deployment_mode(request: pytest.FixtureRequest) -> str: - return request.param - - -@pytest.fixture(scope='module') -def handler(el: asyncio.AbstractEventLoop, deployment_mode: t.Literal['container', 'local']): - if deployment_mode == 'container': - return functools.partial(_container_handle, deployment_mode=deployment_mode) - elif deployment_mode == 'local': - return functools.partial(_local_handle, deployment_mode=deployment_mode) - else: - raise ValueError(f'Unknown deployment mode: {deployment_mode}') diff --git a/openllm-python/tests/models/flan_t5_test.py b/openllm-python/tests/models/flan_t5_test.py deleted file mode 100644 index 2b962703..00000000 --- a/openllm-python/tests/models/flan_t5_test.py +++ /dev/null @@ -1,40 +0,0 @@ -from __future__ import annotations -import typing as t - -import pytest - -import openllm - -if t.TYPE_CHECKING: - import contextlib - - from .conftest import HandleProtocol, ResponseComparator, _Handle - -model = 'flan_t5' -model_id = 'google/flan-t5-small' - - -@pytest.fixture(scope='module') -def flan_t5_handle( - handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack -): - with openllm.testing.prepare( - model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context - ) as image_tag: - with handler(model=model, model_id=model_id, image_tag=image_tag) as handle: - yield handle - - -@pytest.fixture(scope='module') -async def flan_t5(flan_t5_handle: _Handle): - await flan_t5_handle.health(240) - return flan_t5_handle.client - - -@pytest.mark.asyncio() -async def test_flan_t5(flan_t5: t.Awaitable[openllm.client.AsyncHTTPClient], response_snapshot: ResponseComparator): - client = await flan_t5 - response = await client.query('What is the meaning of life?', max_new_tokens=10, top_p=0.9, return_response='attrs') - - assert response.configuration['generation_config']['max_new_tokens'] == 10 - assert response == response_snapshot diff --git a/openllm-python/tests/models/opt_test.py b/openllm-python/tests/models/opt_test.py deleted file mode 100644 index 3ab5befa..00000000 --- a/openllm-python/tests/models/opt_test.py +++ /dev/null @@ -1,40 +0,0 @@ -from __future__ import annotations -import typing as t - -import pytest - -import openllm - -if t.TYPE_CHECKING: - import contextlib - - from .conftest import HandleProtocol, ResponseComparator, _Handle - -model = 'opt' -model_id = 'facebook/opt-125m' - - -@pytest.fixture(scope='module') -def opt_125m_handle( - handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack -): - with openllm.testing.prepare( - model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context - ) as image_tag: - with handler(model=model, model_id=model_id, image_tag=image_tag) as handle: - yield handle - - -@pytest.fixture(scope='module') -async def opt_125m(opt_125m_handle: _Handle): - await opt_125m_handle.health(240) - return opt_125m_handle.client - - -@pytest.mark.asyncio() -async def test_opt_125m(opt_125m: t.Awaitable[openllm.client.AsyncHTTPClient], response_snapshot: ResponseComparator): - client = await opt_125m - response = await client.query('What is Deep learning?', max_new_tokens=20, return_response='attrs') - - assert response.configuration['generation_config']['max_new_tokens'] == 20 - assert response == response_snapshot