From 96318b65ee060f76c81c60c13d0ebe83adf92ffd Mon Sep 17 00:00:00 2001 From: Aaron <29749331+aarnphm@users.noreply.github.com> Date: Sun, 26 Nov 2023 04:53:36 -0500 Subject: [PATCH] fix(sdk): remove broken sdk codespace now around 2.8k lines Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --- .../src/openllm_core/utils/__init__.py | 4 +- openllm-python/src/openllm/__init__.py | 18 +- openllm-python/src/openllm/_deprecated.py | 83 ++------ openllm-python/src/openllm/_llm.py | 69 ++----- openllm-python/src/openllm/_quantisation.py | 19 +- openllm-python/src/openllm/_service.py | 55 ++---- openllm-python/src/openllm/_service_vars.py | 9 +- openllm-python/src/openllm/_strategies.py | 181 +++++------------- openllm-python/src/openllm/bundle/__init__.py | 7 +- openllm-python/src/openllm/bundle/_package.py | 38 ++-- openllm-python/src/openllm/client.py | 12 +- .../src/openllm/entrypoints/__init__.py | 13 +- .../src/openllm/entrypoints/cohere.py | 29 +-- openllm-python/src/openllm/entrypoints/hf.py | 22 +-- openllm-python/src/openllm/exceptions.py | 9 +- .../src/openllm/protocol/__init__.py | 9 +- openllm-python/src/openllm/utils.py | 32 +--- openllm-python/src/openllm_cli/_sdk.py | 127 ++++-------- 18 files changed, 179 insertions(+), 557 deletions(-) diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index f1db9f89..574c94ee 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -234,6 +234,4 @@ __lazy = LazyModule( }, extra_objects=_extras, ) -__all__ = __lazy.__all__ -__dir__ = __lazy.__dir__ -__getattr__ = __lazy.__getattr__ +__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__ diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py index 9120074b..8085e06e 100644 --- a/openllm-python/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -1,7 +1,6 @@ import logging as _logging, os as _os, pathlib as _pathlib, warnings as _warnings from openllm_cli import _sdk from . 
import utils as utils - if utils.DEBUG: utils.set_debug_mode(True); _logging.basicConfig(level=_logging.NOTSET) else: @@ -12,11 +11,8 @@ else: _warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization') _warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.') _warnings.filterwarnings('ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated') - COMPILED = _pathlib.Path(__file__).suffix in ('.pyd', '.so') - -# NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__ -__lazy = utils.LazyModule( +__lazy = utils.LazyModule( # NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__ __name__, globals()['__file__'], { @@ -34,14 +30,8 @@ __lazy = utils.LazyModule( '_llm': ['LLM'], }, extra_objects={ - 'COMPILED': COMPILED, - 'start': _sdk.start, - 'start_grpc': _sdk.start_grpc, - 'build': _sdk.build, - 'import_model': _sdk.import_model, - 'list_models': _sdk.list_models, + 'COMPILED': COMPILED, 'start': _sdk.start, 'build': _sdk.build, # + 'import_model': _sdk.import_model, 'list_models': _sdk.list_models, # }, ) -__all__ = __lazy.__all__ -__dir__ = __lazy.__dir__ -__getattr__ = __lazy.__getattr__ +__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__ diff --git a/openllm-python/src/openllm/_deprecated.py b/openllm-python/src/openllm/_deprecated.py index c4a39a4e..a1ffbbdb 100644 --- a/openllm-python/src/openllm/_deprecated.py +++ b/openllm-python/src/openllm/_deprecated.py @@ -1,65 +1,21 @@ from __future__ import annotations -import logging -import os -import typing as t -import warnings - +import logging, os, warnings, typing as t import openllm -from openllm_core._typing_compat import LiteralBackend, ParamSpec +from openllm_core._typing_compat import LiteralBackend from openllm_core.utils import first_not_none, getenv, is_vllm_available -if t.TYPE_CHECKING: - from ._runners import Runner as _Runner - -P = ParamSpec('P') - +__all__ = ['Runner'] logger = logging.getLogger(__name__) - def Runner( - model_name: str, - ensure_available: bool = True, - init_local: bool = False, - backend: LiteralBackend | None = None, - llm_config: openllm.LLMConfig | None = None, - **attrs: t.Any, -) -> _Runner[t.Any, t.Any]: - """Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'. - - > [!WARNING] - > This method is now deprecated and in favor of 'openllm.LLM' - - ```python - runner = openllm.Runner("dolly-v2") - - @svc.on_startup - def download(): - runner.download_model() - ``` - - if `init_local=True` (For development workflow), it will also enable `ensure_available`. - Default value of `ensure_available` is None. If set then use that given value, otherwise fallback to the aforementioned behaviour. - - Args: - model_name: Supported model name from 'openllm models' - ensure_available: If True, it will download the model if it is not available. If False, it will skip downloading the model. - If False, make sure the model is available locally. Default to True, and openllm.LLM will always check if models - are available locally. based on generated tag. - backend: The given Runner implementation one choose for this Runner. If `OPENLLM_BACKEND` is set, it will respect it. - llm_config: Optional ``openllm.LLMConfig`` to initialise this ``openllm.LLMRunner``. - init_local: If True, it will initialize the model locally. 
This is useful if you want to run the model locally. (Symmetrical to bentoml.Runner.init_local()) - **attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs behaviour - """ - from ._llm import LLM - - if llm_config is None: - llm_config = openllm.AutoConfig.for_model(model_name) - if not ensure_available: - logger.warning( - "'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation." - ) + model_name: str, ensure_available: bool = True, # + init_local: bool = False, backend: LiteralBackend | None = None, # + llm_config: openllm.LLMConfig | None = None, **attrs: t.Any, +): + if llm_config is None: llm_config = openllm.AutoConfig.for_model(model_name) + if not ensure_available: logger.warning("'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation.") model_id = attrs.get('model_id', os.getenv('OPENLLM_MODEL_ID', llm_config['default_id'])) - _RUNNER_MSG = f'''\ + warnings.warn(f'''\ Using 'openllm.Runner' is now deprecated. Make sure to switch to the following syntax: ```python @@ -70,22 +26,11 @@ def Runner( @svc.api(...) async def chat(input: str) -> str: async for it in llm.generate_iterator(input): print(it) - ``` - ''' - warnings.warn(_RUNNER_MSG, DeprecationWarning, stacklevel=2) + ```''', DeprecationWarning, stacklevel=2) attrs.update( { - 'model_id': model_id, - 'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)), - 'serialisation': getenv( - 'serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION'] - ), + 'model_id': model_id, 'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)), # + 'serialisation': getenv('serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']), } ) - - backend = t.cast(LiteralBackend, first_not_none(backend, default='vllm' if is_vllm_available() else 'pt')) - llm = LLM[t.Any, t.Any](backend=backend, llm_config=llm_config, embedded=init_local, **attrs) - return llm.runner - - -__all__ = ['Runner'] + return openllm.LLM(backend=first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'), llm_config=llm_config, embedded=init_local, **attrs).runner diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 9b622697..2dbbc563 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -47,23 +47,17 @@ ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]] @attr.define(slots=False, repr=False, init=False) class LLM(t.Generic[M, T]): - async def generate( - self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs - ) -> GenerationOutput: - if adapter_name is not None and self.__llm_backend__ != 'pt': - raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.') + async def generate(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs): + if adapter_name is not None and self.__llm_backend__ != 'pt': raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.') config = self.config.model_construct_env(**attrs) texts, token_ids = [[]] * config['n'], [[]] * config['n'] - final_result = None async for result in self.generate_iterator( prompt, prompt_token_ids, stop, stop_token_ids, request_id, 
adapter_name, **config.model_dump(flatten=True) ): for output in result.outputs: texts[output.index].append(output.text) token_ids[output.index].extend(output.token_ids) - final_result = result - if final_result is None: - raise RuntimeError('No result is returned.') + if (final_result := result) is None: raise RuntimeError('No result is returned.') return final_result.with_options( prompt=prompt, outputs=[ @@ -72,13 +66,9 @@ class LLM(t.Generic[M, T]): ], ) - async def generate_iterator( - self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs - ) -> t.AsyncGenerator[GenerationOutput, None]: + async def generate_iterator(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs): from bentoml._internal.runner.runner_handle import DummyRunnerHandle - - if adapter_name is not None and self.__llm_backend__ != 'pt': - raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.') + if adapter_name is not None and self.__llm_backend__ != 'pt': raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.') if isinstance(self.runner._runner_handle, DummyRunnerHandle): if os.getenv('BENTO_PATH') is not None: @@ -87,14 +77,12 @@ class LLM(t.Generic[M, T]): self.runner.init_local(quiet=True) config = self.config.model_construct_env(**attrs) - if stop_token_ids is None: stop_token_ids = [] + stop_token_ids = stop_token_ids or [] eos_token_id = attrs.get('eos_token_id', config['eos_token_id']) - if eos_token_id is not None: - if not isinstance(eos_token_id, list): eos_token_id = [eos_token_id] - stop_token_ids.extend(eos_token_id) - if config['eos_token_id'] and config['eos_token_id'] not in stop_token_ids: stop_token_ids.append(config['eos_token_id']) - if self.tokenizer.eos_token_id not in stop_token_ids: - stop_token_ids.append(self.tokenizer.eos_token_id) + if eos_token_id and not isinstance(eos_token_id, list): eos_token_id = [eos_token_id] + stop_token_ids.extend(eos_token_id or []) + if (config_eos := config['eos_token_id']) and config_eos not in stop_token_ids: stop_token_ids.append(config_eos) + if self.tokenizer.eos_token_id not in stop_token_ids: stop_token_ids.append(self.tokenizer.eos_token_id) if stop is None: stop = set() elif isinstance(stop, str): @@ -102,20 +90,16 @@ class LLM(t.Generic[M, T]): else: stop = set(stop) for tid in stop_token_ids: - if tid: - stop.add(self.tokenizer.decode(tid)) + if tid: stop.add(self.tokenizer.decode(tid)) if prompt_token_ids is None: - if prompt is None: - raise ValueError('Either prompt or prompt_token_ids must be specified.') + if prompt is None: raise ValueError('Either prompt or prompt_token_ids must be specified.') prompt_token_ids = self.tokenizer.encode(prompt) request_id = gen_random_uuid() if request_id is None else request_id previous_texts, previous_num_tokens = [''] * config['n'], [0] * config['n'] try: - generator = self.runner.generate_iterator.async_stream( - prompt_token_ids, request_id, stop=list(stop), adapter_name=adapter_name, **config.model_dump(flatten=True) - ) + generator = self.runner.generate_iterator.async_stream(prompt_token_ids, request_id, stop=list(stop), adapter_name=adapter_name, **config.model_dump(flatten=True)) except Exception as err: raise RuntimeError(f'Failed to start generation task: {err}') from err @@ -134,18 +118,11 @@ class LLM(t.Generic[M, T]): # NOTE: If you are here to see how generate_iterator and generate works, see above. 
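For readers tracing the two entry points above, a minimal consumption sketch (illustrative only, not part of this patch): the model id is a placeholder, and it assumes `openllm.LLM` accepts a model id as its first positional argument, as the deprecation notice earlier in this patch suggests.

```python
import asyncio
import openllm

async def main():
    llm = openllm.LLM('facebook/opt-125m')  # hypothetical model id, for illustration only

    # generate() drains generate_iterator() internally and returns the final GenerationOutput.
    result = await llm.generate('What is the meaning of life?', max_new_tokens=64)
    print(result.outputs[0].text)

    # generate_iterator() yields GenerationOutput chunks; each chunk carries the newly
    # generated text for its output index.
    async for chunk in llm.generate_iterator('Write a haiku about GPUs.', max_new_tokens=64):
        print(chunk.outputs[0].text, end='', flush=True)

asyncio.run(main())
```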
# The below are mainly for internal implementation that you don't have to worry about. - _model_id: str - _revision: t.Optional[str] + _model_id: str; _revision: t.Optional[str] # _quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]] - _quantise: t.Optional[LiteralQuantise] - _model_decls: t.Tuple[t.Any, ...] - __model_attrs: t.Dict[str, t.Any] - __tokenizer_attrs: t.Dict[str, t.Any] - _tag: bentoml.Tag - _adapter_map: t.Optional[AdapterMap] - _serialisation: LiteralSerialisation - _local: bool - _max_model_len: t.Optional[int] + _quantise: t.Optional[LiteralQuantise]; _model_decls: t.Tuple[t.Any, ...]; __model_attrs: t.Dict[str, t.Any] # + __tokenizer_attrs: t.Dict[str, t.Any]; _tag: bentoml.Tag; _adapter_map: t.Optional[AdapterMap] # + _serialisation: LiteralSerialisation; _local: bool; _max_model_len: t.Optional[int] # __llm_dtype__: t.Union[LiteralDtype, t.Literal['auto', 'half', 'float']] = 'auto' __llm_torch_dtype__: 'torch.dtype' = None @@ -180,12 +157,7 @@ class LLM(t.Generic[M, T]): ): torch_dtype = attrs.pop('torch_dtype', None) # backward compatible if torch_dtype is not None: - warnings.warn( - 'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', - DeprecationWarning, - stacklevel=3, - ) - dtype = torch_dtype + warnings.warn('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', DeprecationWarning, stacklevel=3); dtype = torch_dtype _local = False if validate_is_path(model_id): model_id, _local = resolve_filepath(model_id), True backend = getenv('backend', default=backend) @@ -291,7 +263,7 @@ class LLM(t.Generic[M, T]): if is_vllm_available(): return 'vllm' elif is_ctranslate_available(): - return 'ctranslate' # XXX: base OpenLLM image should always include vLLM + return 'ctranslate' elif is_ctranslate_available(): return 'ctranslate' else: @@ -449,8 +421,7 @@ def convert_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap: config_file = hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME) except Exception as err: raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err - with open(config_file, 'r') as file: - resolved_config = orjson.loads(file.read()) + with open(config_file, 'r') as file: resolved_config = orjson.loads(file.read()) _peft_type = resolved_config['peft_type'].lower() if _peft_type not in resolved: resolved[_peft_type] = () resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),) diff --git a/openllm-python/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py index 9224f17e..5e430662 100644 --- a/openllm-python/src/openllm/_quantisation.py +++ b/openllm-python/src/openllm/_quantisation.py @@ -1,13 +1,8 @@ from __future__ import annotations - from openllm_core.exceptions import MissingDependencyError from openllm_core.utils import is_autoawq_available, is_autogptq_available, is_bitsandbytes_available - - def infer_quantisation_config(llm, quantise, **attrs): - import torch - import transformers - + import torch, transformers # 8 bit configuration int8_threshold = attrs.pop('llm_int8_threshhold', 6.0) int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False) @@ -85,25 +80,19 @@ def infer_quantisation_config(llm, quantise, **attrs): # NOTE: Quantization setup quantize is a openllm.LLM feature, where we can quantize the model with bitsandbytes or quantization aware training. 
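Before the dispatch that follows, a hedged sketch of the kind of objects the 'int8' and 'int4' branches produce via transformers' `BitsAndBytesConfig`. The `create_int8_config`/`create_int4_config` bodies are not shown in this hunk, so the 4-bit defaults and the skip-module choice below are assumptions.

```python
import torch
import transformers

# Roughly what the 'int8' branch builds; threshold/offload values mirror the attrs popped above.
int8_config = transformers.BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_enable_fp32_cpu_offload=False,
    llm_int8_skip_modules=['lm_head'],  # assumption: a typical module kept in higher precision
)

# Roughly what the 'int4' branch builds; nf4 + double quantisation are assumed defaults.
int4_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
)
```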
if not is_bitsandbytes_available(): - raise RuntimeError( - 'Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\'' - ) + raise RuntimeError('Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\'') if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules) elif quantise == 'int4': quantisation_config = create_int4_config() elif quantise == 'gptq': if not is_autogptq_available(): - raise MissingDependencyError( - "GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'" - ) + raise MissingDependencyError("GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'") else: quantisation_config = create_gptq_config() elif quantise == 'awq': if not is_autoawq_available(): - raise MissingDependencyError( - "AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'." - ) + raise MissingDependencyError("AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'.") else: quantisation_config = create_awq_config() else: diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index 58efe43a..1ecea673 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -1,66 +1,43 @@ from __future__ import annotations -import logging -import typing as t - +import logging, typing as t import _service_vars as svars - -import bentoml -import openllm +import bentoml, openllm from openllm_core._schemas import MessageParam from bentoml.io import JSON, Text logger = logging.getLogger(__name__) - llm = openllm.LLM[t.Any, t.Any]( - model_id=svars.model_id, - model_tag=svars.model_tag, - serialisation=svars.serialization, - adapter_map=svars.adapter_map, - trust_remote_code=svars.trust_remote_code, + model_id=svars.model_id, model_tag=svars.model_tag, adapter_map=svars.adapter_map, # + serialisation=svars.serialization, trust_remote_code=svars.trust_remote_code, ) svc = bentoml.Service(name=f"llm-{llm.config['start_name']}-service", runners=[llm.runner]) - llm_model_class = openllm.GenerationInput.from_llm_config(llm.config) - @svc.api( route='/v1/generate', - input=JSON.from_sample(llm_model_class.examples()), - output=JSON.from_sample(openllm.GenerationOutput.examples()), + input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()), # ) -async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]: - return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump() - +async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]: return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump() @svc.api( route='/v1/generate_stream', - input=JSON.from_sample(llm_model_class.examples()), - output=Text(content_type='text/event-stream'), + input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'), # ) async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]: async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()): yield f'data: {it.model_dump_json()}\n\n' yield 'data: [DONE]\n\n' - _Metadata = openllm.MetadataOutput( - timeout=llm.config['timeout'], - model_name=llm.config['model_name'], - backend=llm.__llm_backend__, - 
model_id=llm.model_id, + timeout=llm.config['timeout'], model_name=llm.config['model_name'], # + backend=llm.__llm_backend__, model_id=llm.model_id, # configuration=llm.config.model_dump_json().decode(), ) - @svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump())) -def metadata_v1(_: str) -> openllm.MetadataOutput: - return _Metadata - - -class MessagesConverterInput(t.TypedDict): - add_generation_prompt: bool - messages: t.List[t.Dict[str, t.Any]] +def metadata_v1(_: str) -> openllm.MetadataOutput: return _Metadata +class MessagesConverterInput(t.TypedDict): add_generation_prompt: bool; messages: t.List[t.Dict[str, t.Any]] @svc.api( route='/v1/helpers/messages', @@ -69,18 +46,14 @@ class MessagesConverterInput(t.TypedDict): add_generation_prompt=False, messages=[ MessageParam(role='system', content='You are acting as Ernest Hemmingway.'), - MessageParam(role='user', content='Hi there!'), - MessageParam(role='assistant', content='Yes?'), + MessageParam(role='user', content='Hi there!'), MessageParam(role='assistant', content='Yes?'), # ], ) ), output=Text(), ) def helpers_messages_v1(message: MessagesConverterInput) -> str: - add_generation_prompt = message['add_generation_prompt'] - messages = message['messages'] + add_generation_prompt, messages = message['add_generation_prompt'], message['messages'] return llm.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False) - -# HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema. -openllm.mount_entrypoints(svc, llm) +openllm.mount_entrypoints(svc, llm) # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema. diff --git a/openllm-python/src/openllm/_service_vars.py b/openllm-python/src/openllm/_service_vars.py index d8ac5594..9d6f5da4 100644 --- a/openllm-python/src/openllm/_service_vars.py +++ b/openllm-python/src/openllm/_service_vars.py @@ -1,9 +1,2 @@ import os, orjson, openllm_core.utils as coreutils - -model_id, model_tag, adapter_map, serialization, trust_remote_code = ( - os.environ['OPENLLM_MODEL_ID'], - None, - orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))), - os.getenv('OPENLLM_SERIALIZATION', default='safetensors'), - coreutils.check_bool_env('TRUST_REMOTE_CODE', False), -) +model_id, model_tag, adapter_map, serialization, trust_remote_code = os.environ['OPENLLM_MODEL_ID'], None, orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))), os.getenv('OPENLLM_SERIALIZATION', default='safetensors'), coreutils.check_bool_env('TRUST_REMOTE_CODE', False) diff --git a/openllm-python/src/openllm/_strategies.py b/openllm-python/src/openllm/_strategies.py index a37e4b7f..ec1c297a 100644 --- a/openllm-python/src/openllm/_strategies.py +++ b/openllm-python/src/openllm/_strategies.py @@ -4,44 +4,33 @@ import psutil, bentoml, openllm_core.utils as coreutils from bentoml._internal.resource import get_resource, system_resources from bentoml._internal.runner.strategy import THREAD_ENVS +__all__ = ['CascadingResourceStrategy', 'get_resource'] logger = logging.getLogger(__name__) - def _strtoul(s: str) -> int: # Return -1 or positive integer sequence string starts with. 
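The helper being defined here, together with `_parse_cuda_visible_devices` below, reimplements PyTorch-style `CUDA_VISIBLE_DEVICES` parsing. A few illustrative expectations (not part of this patch; they assume the private helpers stay importable from `openllm._strategies`):

```python
from openllm._strategies import _parse_cuda_visible_devices, _strtoul

assert _strtoul('0') == 0         # plain ordinal
assert _strtoul('+2') == 2        # a sign is only accepted on the first character
assert _strtoul('GPU-abc') == -1  # non-numeric element
assert _strtoul('') == -1         # empty element

assert _parse_cuda_visible_devices('0,1,2', respect_env=False) == ['0', '1', '2']
assert _parse_cuda_visible_devices('0,1,-1,2', respect_env=False) == ['0', '1']  # a negative value aborts the sequence
assert _parse_cuda_visible_devices('0,0,1', respect_env=False) == []             # a repeated ordinal yields an empty set
```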
- if not s: - return -1 + if not s: return -1 idx = 0 for idx, c in enumerate(s): - if not (c.isdigit() or (idx == 0 and c in '+-')): - break - if idx + 1 == len(s): - idx += 1 # noqa: PLW2901 + if not (c.isdigit() or (idx == 0 and c in '+-')): break + if idx + 1 == len(s): idx += 1 # noqa: PLW2901 # NOTE: idx will be set via enumerate return int(s[:idx]) if idx > 0 else -1 - - def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]: - rcs: list[str] = [] + rcs = [] for elem in lst.split(','): # Repeated id results in empty set - if elem in rcs: - return [] + if elem in rcs: return [] # Anything other but prefix is ignored - if not elem.startswith(prefix): - break + if not elem.startswith(prefix): break rcs.append(elem) return rcs - - def _parse_cuda_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None: if respect_env: spec = os.environ.get('CUDA_VISIBLE_DEVICES', default_var) - if not spec: - return None + if not spec: return None else: - if default_var is None: - raise ValueError('spec is required to be not None when parsing spec.') + if default_var is None: raise ValueError('spec is required to be not None when parsing spec.') spec = default_var if spec.startswith('GPU-'): @@ -55,64 +44,52 @@ def _parse_cuda_visible_devices(default_var: str | None = None, respect_env: boo for el in spec.split(','): x = _strtoul(el.strip()) # Repeated ordinal results in empty set - if x in rc: - return [] + if x in rc: return [] # Negative value aborts the sequence - if x < 0: - break + if x < 0: break rc.append(x) return [str(i) for i in rc] - - def _raw_device_uuid_nvml() -> list[str] | None: from ctypes import CDLL, byref, c_int, c_void_p, create_string_buffer try: nvml_h = CDLL('libnvidia-ml.so.1') except Exception: - warnings.warn('Failed to find nvidia binding', stacklevel=3) - return None + warnings.warn('Failed to find nvidia binding', stacklevel=3); return None rc = nvml_h.nvmlInit() if rc != 0: - warnings.warn("Can't initialize NVML", stacklevel=3) - return None + warnings.warn("Can't initialize NVML", stacklevel=3); return None dev_count = c_int(-1) rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) if rc != 0: - warnings.warn('Failed to get available device from system.', stacklevel=3) - return None - uuids: list[str] = [] + warnings.warn('Failed to get available device from system.', stacklevel=3); return None + uuids = [] for idx in range(dev_count.value): dev_id = c_void_p() rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id)) if rc != 0: - warnings.warn(f'Failed to get device handle for {idx}', stacklevel=3) - return None + warnings.warn(f'Failed to get device handle for {idx}', stacklevel=3); return None buf_len = 96 buf = create_string_buffer(buf_len) rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len) if rc != 0: - warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=3) - return None + warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=3); return None uuids.append(buf.raw.decode('ascii').strip('\0')) del nvml_h return uuids - class _ResourceMixin: @staticmethod def from_system(cls) -> list[str]: visible_devices = _parse_cuda_visible_devices() if visible_devices is None: if cls.resource_id == 'amd.com/gpu': - if not psutil.LINUX: - if coreutils.DEBUG: - logger.debug('AMD GPUs is currently only supported on Linux.') - return [] + if not psutil.LINUX: return [] # ROCm does not currently have the rocm_smi wheel. # So we need to use the ctypes bindings directly. 
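Further below, `from_spec` maps user-provided resource specs onto device lists for these GPU resource classes. A few illustrative expectations (not part of this patch; they assume `NvidiaGpuResource` keeps this behaviour and remains importable from `openllm._strategies`):

```python
from openllm._strategies import NvidiaGpuResource

NvidiaGpuResource.from_spec(2)       # -> ['0', '1']       first two visible devices
NvidiaGpuResource.from_spec(-1)      # -> []               -1 and 0 disable GPU allocation
NvidiaGpuResource.from_spec('3')     # -> ['0', '1', '2']  a digit string expands to a range
NvidiaGpuResource.from_spec('0,2')   # -> ['0', '2']       explicit comma-separated ordinals
NvidiaGpuResource.from_spec([0, 2])  # -> ['0', '2']       list elements are stringified as-is
```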
# we don't want to use CLI because parsing is a pain. + # TODO: Use tinygrad/gpuctypes sys.path.append('/opt/rocm/libexec/rocm_smi') try: from ctypes import byref, c_uint32 @@ -122,8 +99,7 @@ class _ResourceMixin: device_count = c_uint32(0) ret = rocmsmi.rsmi_num_monitor_devices(byref(device_count)) - if ret == rsmi_status_t.RSMI_STATUS_SUCCESS: - return [str(i) for i in range(device_count.value)] + if ret == rsmi_status_t.RSMI_STATUS_SUCCESS: return [str(i) for i in range(device_count.value)] return [] # In this case the binary is not found, returning empty list except (ModuleNotFoundError, ImportError): @@ -140,59 +116,43 @@ class _ResourceMixin: except (ImportError, RuntimeError, AttributeError): return [] return visible_devices - @staticmethod def from_spec(cls, spec) -> list[str]: if isinstance(spec, int): - if spec in (-1, 0): - return [] - if spec < -1: - raise ValueError('Spec cannot be < -1.') + if spec in (-1, 0): return [] + if spec < -1: raise ValueError('Spec cannot be < -1.') return [str(i) for i in range(spec)] elif isinstance(spec, str): - if not spec: - return [] - if spec.isdigit(): - spec = ','.join([str(i) for i in range(_strtoul(spec))]) + if not spec: return [] + if spec.isdigit(): spec = ','.join([str(i) for i in range(_strtoul(spec))]) return _parse_cuda_visible_devices(spec, respect_env=False) elif isinstance(spec, list): return [str(x) for x in spec] else: - raise TypeError( - f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead." - ) - + raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.") @staticmethod def validate(cls, val: list[t.Any]) -> None: if cls.resource_id == 'amd.com/gpu': - raise RuntimeError( - "AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'" - ) + raise RuntimeError("AMD GPU validation is not yet supported. 
Make sure to call 'get_resource(..., validate=False)'") if not all(isinstance(i, str) for i in val): raise ValueError('Input list should be all string type.') try: from cuda import cuda - err, *_ = cuda.cuInit(0) - if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError('Failed to initialise CUDA runtime binding.') + if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to initialise CUDA runtime binding.') # correctly parse handle for el in val: if el.startswith(('GPU-', 'MIG-')): uuids = _raw_device_uuid_nvml() - if uuids is None: - raise ValueError('Failed to parse available GPUs UUID') - if el not in uuids: - raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})') + if uuids is None: raise ValueError('Failed to parse available GPUs UUID') + if el not in uuids: raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})') elif el.isdigit(): err, _ = cuda.cuDeviceGet(int(el)) - if err != cuda.CUresult.CUDA_SUCCESS: - raise ValueError(f'Failed to get device {el}') + if err != cuda.CUresult.CUDA_SUCCESS: raise ValueError(f'Failed to get device {el}') except (ImportError, RuntimeError): pass - def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[bentoml.Resource[t.List[str]]]: return types.new_class( name, @@ -201,22 +161,16 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[ lambda ns: ns.update( { 'resource_id': resource_kind, - 'from_spec': classmethod(_ResourceMixin.from_spec), - 'from_system': classmethod(_ResourceMixin.from_system), - 'validate': classmethod(_ResourceMixin.validate), - '__repr_keys__': property(lambda _: {'resource_id'}), - '__doc__': inspect.cleandoc(docstring), - '__module__': 'openllm._strategies', + 'from_spec': classmethod(_ResourceMixin.from_spec), 'from_system': classmethod(_ResourceMixin.from_system), # + 'validate': classmethod(_ResourceMixin.validate), '__repr_keys__': property(lambda _: {'resource_id'}), # + '__doc__': inspect.cleandoc(docstring), '__module__': 'openllm._strategies', # } ), ) - - NvidiaGpuResource = _make_resource_class( 'NvidiaGpuResource', 'nvidia.com/gpu', '''NVIDIA GPU resource. - This is a modified version of internal's BentoML's NvidiaGpuResource where it respects and parse CUDA_VISIBLE_DEVICES correctly.''', ) @@ -224,73 +178,53 @@ AmdGpuResource = _make_resource_class( 'AmdGpuResource', 'amd.com/gpu', '''AMD GPU resource. - Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to ``NvidiaGpuResource``. 
Currently ``validate`` is not yet supported.''', ) - class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin): @classmethod def get_worker_count(cls, runnable_class, resource_request, workers_per_resource): - if resource_request is None: - resource_request = system_resources() + if resource_request is None: resource_request = system_resources() # use NVIDIA kind = 'nvidia.com/gpu' nvidia_req = get_resource(resource_request, kind) - if nvidia_req is not None: - return 1 + if nvidia_req is not None: return 1 # use AMD kind = 'amd.com/gpu' amd_req = get_resource(resource_request, kind, validate=False) - if amd_req is not None: - return 1 + if amd_req is not None: return 1 # use CPU cpus = get_resource(resource_request, 'cpu') if cpus is not None and cpus > 0: - if 'cpu' not in runnable_class.SUPPORTED_RESOURCES: - logger.warning('No known supported resource available for %s, falling back to using CPU.', runnable_class) - if runnable_class.SUPPORTS_CPU_MULTI_THREADING: - if isinstance(workers_per_resource, float) and workers_per_resource < 1.0: - raise ValueError('Fractional CPU multi threading support is not yet supported.') + if isinstance(workers_per_resource, float) and workers_per_resource < 1.0: raise ValueError('Fractional CPU multi threading support is not yet supported.') return int(workers_per_resource) return math.ceil(cpus) * workers_per_resource - # this should not be reached by user since we always read system resource as default - raise ValueError( - f'No known supported resource available for {runnable_class}. Please check your resource request. Leaving it blank will allow BentoML to use system resources.' - ) - + raise ValueError(f'No known supported resource available for {runnable_class}. Please check your resource request. 
Leaving it blank will allow BentoML to use system resources.') @classmethod def get_worker_env(cls, runnable_class, resource_request, workers_per_resource, worker_index): cuda_env = os.environ.get('CUDA_VISIBLE_DEVICES', None) disabled = cuda_env in ('', '-1') - environ: dict[str, t.Any] = {} + environ = {} - if resource_request is None: - resource_request = system_resources() + if resource_request is None: resource_request = system_resources() # use NVIDIA kind = 'nvidia.com/gpu' typ = get_resource(resource_request, kind) if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES: if disabled: - logger.debug('CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.', worker_index) - environ['CUDA_VISIBLE_DEVICES'] = cuda_env - return environ + environ['CUDA_VISIBLE_DEVICES'] = cuda_env; return environ environ['CUDA_VISIBLE_DEVICES'] = cls.transpile_workers_to_cuda_envvar(workers_per_resource, typ, worker_index) - logger.debug('Environ for worker %s: %s', worker_index, environ) return environ # use AMD kind = 'amd.com/gpu' typ = get_resource(resource_request, kind, validate=False) if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES: if disabled: - logger.debug('CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.', worker_index) - environ['CUDA_VISIBLE_DEVICES'] = cuda_env - return environ + environ['CUDA_VISIBLE_DEVICES'] = cuda_env; return environ environ['CUDA_VISIBLE_DEVICES'] = cls.transpile_workers_to_cuda_envvar(workers_per_resource, typ, worker_index) - logger.debug('Environ for worker %s: %s', worker_index, environ) return environ # use CPU cpus = get_resource(resource_request, 'cpu') @@ -298,25 +232,17 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin): environ['CUDA_VISIBLE_DEVICES'] = '-1' # disable gpu if runnable_class.SUPPORTS_CPU_MULTI_THREADING: thread_count = math.ceil(cpus) - for thread_env in THREAD_ENVS: - environ[thread_env] = os.environ.get(thread_env, str(thread_count)) - logger.debug('Environ for worker %s: %s', worker_index, environ) + for thread_env in THREAD_ENVS: environ[thread_env] = os.environ.get(thread_env, str(thread_count)) return environ - for thread_env in THREAD_ENVS: - environ[thread_env] = os.environ.get(thread_env, '1') + for thread_env in THREAD_ENVS: environ[thread_env] = os.environ.get(thread_env, '1') return environ return environ - @staticmethod def transpile_workers_to_cuda_envvar(workers_per_resource, gpus, worker_index): # Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string. if isinstance(workers_per_resource, float): - # NOTE: We hit this branch when workers_per_resource is set to - # float, for example 0.5 or 0.25 - if workers_per_resource > 1: - raise ValueError( - "Currently, the default strategy doesn't support workers_per_resource > 1. It is recommended that one should implement a custom strategy in this case." - ) + # NOTE: We hit this branch when workers_per_resource is set to float, for example 0.5 or 0.25 + if workers_per_resource > 1: raise ValueError('workers_per_resource > 1 is not supported.') # We are round the assigned resource here. This means if workers_per_resource=.4 # then it will round down to 2. If workers_per_source=0.6, then it will also round up to 2. 
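A worked illustration of the worker-to-GPU mapping implemented here (hypothetical GPU list; not part of this patch):

```python
from openllm._strategies import CascadingResourceStrategy

gpus = ['0', '1', '2', '3']

# Fractional workers_per_resource assigns several GPUs to each worker: 0.5 -> 2 GPUs per worker.
CascadingResourceStrategy.transpile_workers_to_cuda_envvar(0.5, gpus, worker_index=0)  # -> '0,1'
CascadingResourceStrategy.transpile_workers_to_cuda_envvar(0.5, gpus, worker_index=1)  # -> '2,3'

# An integer workers_per_resource shares one GPU among several workers: workers 2 and 3 land on GPU '1'.
CascadingResourceStrategy.transpile_workers_to_cuda_envvar(2, gpus, worker_index=3)    # -> '1'
```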
assigned_resource_per_worker = round(1 / workers_per_resource) @@ -327,21 +253,12 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin): worker_index, assigned_resource_per_worker, ) - raise IndexError( - f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}]." - ) - assigned_gpu = gpus[ - assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1) - ] + raise IndexError(f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}].") + assigned_gpu = gpus[assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)] dev = ','.join(assigned_gpu) else: idx = worker_index // workers_per_resource if idx >= len(gpus): - raise ValueError( - f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}' - ) + raise ValueError(f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}') dev = str(gpus[idx]) return dev - - -__all__ = ['CascadingResourceStrategy', 'get_resource'] diff --git a/openllm-python/src/openllm/bundle/__init__.py b/openllm-python/src/openllm/bundle/__init__.py index 6ace1c6a..5dbe6806 100644 --- a/openllm-python/src/openllm/bundle/__init__.py +++ b/openllm-python/src/openllm/bundle/__init__.py @@ -4,7 +4,6 @@ from openllm_core._typing_compat import LiteralVersionStrategy from openllm_core.exceptions import OpenLLMException from openllm_core.utils.lazy import VersionInfo, LazyModule -_OWNER, _REPO = 'bentoml', 'openllm' @attr.attrs(eq=False, order=False, slots=True, frozen=True) class RefResolver: git_hash: str = attr.field() @@ -17,7 +16,7 @@ class RefResolver: if strategy_or_version is None or strategy_or_version == 'release': try: from ghapi.all import GhApi - ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False) + ghapi = GhApi(owner='bentoml', repo='openllm', authenticate=False) meta = ghapi.repos.get_latest_release() git_hash = ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'] except Exception as err: @@ -35,6 +34,4 @@ __lazy = LazyModule( {'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options']}, extra_objects={'RefResolver': RefResolver} ) -__all__ = __lazy.__all__ -__dir__ = __lazy.__dir__ -__getattr__ = __lazy.__getattr__ +__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__ diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index c5a66b5d..83458f67 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -1,15 +1,7 @@ -# mypy: disable-error-code="misc" from __future__ import annotations -import importlib.metadata -import logging -import os -from pathlib import Path - -import orjson +import importlib.metadata, logging, os, pathlib +import bentoml, orjson, openllm_core from simple_di import Provide, inject - -import bentoml -import openllm_core from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions from bentoml._internal.configuration.containers import BentoMLContainer from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg @@ -17,7 +9,7 @@ from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg logger = logging.getLogger(__name__) OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD' -_service_file = 
Path(os.path.abspath(__file__)).parent.parent / '_service.py' +_service_file = pathlib.Path(os.path.abspath(__file__)).parent.parent / '_service.py' _SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}''' def build_editable(path, package='openllm'): @@ -28,7 +20,7 @@ def build_editable(path, package='openllm'): from build.env import IsolatedEnvBuilder module_location = pkg.source_locations(package) if not module_location: raise RuntimeError('Could not find the source location of OpenLLM.') - pyproject_path = Path(module_location).parent.parent / 'pyproject.toml' + pyproject_path = pathlib.Path(module_location).parent.parent / 'pyproject.toml' if os.path.isfile(pyproject_path.__fspath__()): with IsolatedEnvBuilder() as env: builder = ProjectBuilder(pyproject_path.parent) @@ -70,12 +62,9 @@ def create_bento( labels = dict(llm.identifying_params) labels.update( { - '_type': llm.llm_type, '_framework': llm.__llm_backend__, 'start_name': llm.config['start_name'], - 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle', - **{ - f'{package.replace("-","_")}_version': importlib.metadata.version(package) - for package in {'openllm', 'openllm-core', 'openllm-client'} - }, + '_type': llm.llm_type, '_framework': llm.__llm_backend__, + 'start_name': llm.config['start_name'], 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle', + **{f'{package.replace("-","_")}_version': importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}}, } ) if adapter_map: labels.update(adapter_map) @@ -83,18 +72,15 @@ def create_bento( logger.debug("Building Bento '%s' with model backend '%s'", bento_tag, llm.__llm_backend__) logger.debug('Generating service vars %s (dir=%s)', llm.model_id, llm_fs.getsyspath('/')) script = f"# fmt: off\n# GENERATED BY 'openllm build {llm.model_id}'. 
DO NOT EDIT\n" + _SERVICE_VARS.format( - __model_id__=llm.model_id, - __model_tag__=str(llm.tag), - __model_adapter_map__=orjson.dumps(adapter_map).decode(), - __model_serialization__=llm.config['serialisation'], + __model_id__=llm.model_id, __model_tag__=str(llm.tag), # + __model_adapter_map__=orjson.dumps(adapter_map).decode(), __model_serialization__=llm.config['serialisation'], # __model_trust_remote_code__=str(llm.trust_remote_code), ) if SHOW_CODEGEN: logger.info('Generated _service_vars.py:\n%s', script) llm_fs.writetext('_service_vars.py', script) with open(_service_file.__fspath__(), 'r') as f: service_src = f.read() llm_fs.writetext(llm.config['service_name'], service_src) - - bento = bentoml.Bento.create( + return bentoml.Bento.create( version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'), build_config=BentoBuildConfig( @@ -108,6 +94,4 @@ def create_bento( python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), docker=construct_docker_options(llm, llm_fs, quantize, adapter_map, dockerfile_template, _serialisation), ), - ) - - return bento.save(bento_store=_bento_store, model_store=_model_store) + ).save(bento_store=_bento_store, model_store=_model_store) diff --git a/openllm-python/src/openllm/client.py b/openllm-python/src/openllm/client.py index 8c5c8fc1..591aecc1 100644 --- a/openllm-python/src/openllm/client.py +++ b/openllm-python/src/openllm/client.py @@ -1,10 +1,2 @@ -def __dir__(): - import openllm_client as _client - - return sorted(dir(_client)) - - -def __getattr__(it): - import openllm_client as _client - - return getattr(_client, it) +def __dir__(): import openllm_client as _client; return sorted(dir(_client)) +def __getattr__(it): import openllm_client as _client; return getattr(_client, it) diff --git a/openllm-python/src/openllm/entrypoints/__init__.py b/openllm-python/src/openllm/entrypoints/__init__.py index b2b4e85a..fc64d69b 100644 --- a/openllm-python/src/openllm/entrypoints/__init__.py +++ b/openllm-python/src/openllm/entrypoints/__init__.py @@ -1,20 +1,11 @@ import importlib - from openllm_core.utils import LazyModule _import_structure = {'openai': [], 'hf': [], 'cohere': []} - - def mount_entrypoints(svc, llm): for module_name in _import_structure: module = importlib.import_module(f'.{module_name}', __name__) svc = module.mount_to_svc(svc, llm) return svc - - -__lazy = LazyModule( - __name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints} -) -__all__ = __lazy.__all__ -__dir__ = __lazy.__dir__ -__getattr__ = __lazy.__getattr__ +__lazy = LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints}) +__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__ diff --git a/openllm-python/src/openllm/entrypoints/cohere.py b/openllm-python/src/openllm/entrypoints/cohere.py index 7197b54d..1192e3df 100644 --- a/openllm-python/src/openllm/entrypoints/cohere.py +++ b/openllm-python/src/openllm/entrypoints/cohere.py @@ -1,17 +1,11 @@ from __future__ import annotations -import functools -import json -import logging -import traceback +import functools, json, logging, traceback from http import HTTPStatus - import orjson from starlette.applications import Starlette from starlette.responses import JSONResponse, StreamingResponse from starlette.routing import Route - from openllm_core.utils import DEBUG, converter, gen_random_uuid - from ._openapi import add_schema_definitions, append_schemas, get_generator from 
..protocol.cohere import ( Chat, @@ -54,41 +48,31 @@ schemas = get_generator( logger = logging.getLogger(__name__) -def jsonify_attr(obj): - return json.dumps(converter.unstructure(obj)) - +def jsonify_attr(obj): return json.dumps(converter.unstructure(obj)) def error_response(status_code, message): return JSONResponse(converter.unstructure(CohereErrorResponse(text=message)), status_code=status_code.value) - async def check_model(request, model): - if request.model is None or request.model == model: - return None + if request.model is None or request.model == model: return None return error_response( HTTPStatus.NOT_FOUND, f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see current running models.", ) - def mount_to_svc(svc, llm): app = Starlette( debug=True, routes=[ - Route( - '/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST'] - ), - Route('/v1/chat', endpoint=functools.partial(cohere_chat, llm=llm), name='cohere_chat', methods=['POST']), Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False), + Route('/v1/chat', endpoint=functools.partial(cohere_chat, llm=llm), name='cohere_chat', methods=['POST']), + Route('/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST']), ], ) mount_path = '/cohere' svc.mount_asgi_app(app, path=mount_path) - return append_schemas( - svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG - ) - + return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG) @add_schema_definitions async def cohere_generate(req, llm): @@ -181,7 +165,6 @@ def _transpile_cohere_chat_messages(request: CohereChatRequest) -> list[dict[str messages.append({'role': 'user', 'content': request.message}) return messages - @add_schema_definitions async def cohere_chat(req, llm): json_str = await req.body() diff --git a/openllm-python/src/openllm/entrypoints/hf.py b/openllm-python/src/openllm/entrypoints/hf.py index d4e9b86f..51f230b8 100644 --- a/openllm-python/src/openllm/entrypoints/hf.py +++ b/openllm-python/src/openllm/entrypoints/hf.py @@ -1,14 +1,10 @@ -import functools -import logging +import functools, logging from http import HTTPStatus - import orjson from starlette.applications import Starlette from starlette.responses import JSONResponse from starlette.routing import Route - from openllm_core.utils import converter - from ._openapi import add_schema_definitions, append_schemas, get_generator from ..protocol.hf import AgentRequest, AgentResponse, HFErrorResponse @@ -25,7 +21,6 @@ schemas = get_generator( ) logger = logging.getLogger(__name__) - def mount_to_svc(svc, llm): app = Starlette( debug=True, @@ -39,13 +34,8 @@ def mount_to_svc(svc, llm): svc.mount_asgi_app(app, path=mount_path) return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append') - def error_response(status_code, message): - return JSONResponse( - converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), - status_code=status_code.value, - ) - + return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value) @add_schema_definitions async def hf_agent(req, llm): @@ -60,18 +50,14 @@ async def hf_agent(req, llm): stop = request.parameters.pop('stop', ['\n']) try: result = await 
llm.generate(request.inputs, stop=stop, **request.parameters) - return JSONResponse( - converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value - ) + return JSONResponse(converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value) except Exception as err: logger.error('Error while generating: %s', err) return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).') - @add_schema_definitions def hf_adapters(req, llm): - if not llm.has_adapters: - return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.') + if not llm.has_adapters: return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.') return JSONResponse( { adapter_tuple[1]: {'adapter_name': k, 'adapter_type': adapter_tuple[0].peft_type.value} diff --git a/openllm-python/src/openllm/exceptions.py b/openllm-python/src/openllm/exceptions.py index a4c9b07d..3422fe58 100644 --- a/openllm-python/src/openllm/exceptions.py +++ b/openllm-python/src/openllm/exceptions.py @@ -1,10 +1,7 @@ from openllm_core.exceptions import ( - Error as Error, - FineTuneStrategyNotSupportedError as FineTuneStrategyNotSupportedError, - ForbiddenAttributeError as ForbiddenAttributeError, - GpuNotAvailableError as GpuNotAvailableError, + Error as Error, FineTuneStrategyNotSupportedError as FineTuneStrategyNotSupportedError, # + ForbiddenAttributeError as ForbiddenAttributeError, GpuNotAvailableError as GpuNotAvailableError, # + OpenLLMException as OpenLLMException, ValidationError as ValidationError, # MissingAnnotationAttributeError as MissingAnnotationAttributeError, MissingDependencyError as MissingDependencyError, - OpenLLMException as OpenLLMException, - ValidationError as ValidationError, ) diff --git a/openllm-python/src/openllm/protocol/__init__.py b/openllm-python/src/openllm/protocol/__init__.py index 8b6d271e..78c9cbaa 100644 --- a/openllm-python/src/openllm/protocol/__init__.py +++ b/openllm-python/src/openllm/protocol/__init__.py @@ -5,11 +5,6 @@ import typing as t from openllm_core.utils import LazyModule _import_structure: dict[str, list[str]] = {'openai': [], 'cohere': [], 'hf': []} - -if t.TYPE_CHECKING: - from . import cohere as cohere, hf as hf, openai as openai - +if t.TYPE_CHECKING: from . 
import cohere as cohere, hf as hf, openai as openai __lazy = LazyModule(__name__, os.path.abspath('__file__'), _import_structure) -__all__ = __lazy.__all__ -__dir__ = __lazy.__dir__ -__getattr__ = __lazy.__getattr__ +__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__ diff --git a/openllm-python/src/openllm/utils.py b/openllm-python/src/openllm/utils.py index da5865c3..ca33da9a 100644 --- a/openllm-python/src/openllm/utils.py +++ b/openllm-python/src/openllm/utils.py @@ -1,36 +1,16 @@ import functools, importlib.metadata, openllm_core - __all__ = ['generate_labels', 'available_devices', 'device_count'] - - def generate_labels(llm): return { - 'backend': llm.__llm_backend__, - 'framework': 'openllm', - 'model_name': llm.config['model_name'], - 'architecture': llm.config['architecture'], - 'serialisation': llm._serialisation, + 'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], # + 'architecture': llm.config['architecture'], 'serialisation': llm._serialisation, # **{package: importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}}, } - - -def available_devices(): - from ._strategies import NvidiaGpuResource - - return tuple(NvidiaGpuResource.from_system()) - - +def available_devices(): from ._strategies import NvidiaGpuResource; return tuple(NvidiaGpuResource.from_system()) @functools.lru_cache(maxsize=1) -def device_count() -> int: - return len(available_devices()) - - +def device_count() -> int: return len(available_devices()) def __dir__(): - coreutils = set(dir(openllm_core.utils)) | set([it for it in openllm_core.utils._extras if not it.startswith('_')]) - return sorted(__all__) + sorted(list(coreutils)) - - + coreutils = set(dir(openllm_core.utils)) | set([it for it in openllm_core.utils._extras if not it.startswith('_')]); return sorted(__all__) + sorted(list(coreutils)) def __getattr__(it): - if hasattr(openllm_core.utils, it): - return getattr(openllm_core.utils, it) + if hasattr(openllm_core.utils, it): return getattr(openllm_core.utils, it) raise AttributeError(f'module {__name__} has no attribute {it}') diff --git a/openllm-python/src/openllm_cli/_sdk.py b/openllm-python/src/openllm_cli/_sdk.py index 9475d704..ab36f90c 100644 --- a/openllm-python/src/openllm_cli/_sdk.py +++ b/openllm-python/src/openllm_cli/_sdk.py @@ -1,22 +1,11 @@ from __future__ import annotations -import itertools -import logging -import os -import re -import subprocess -import sys -import typing as t - -import orjson +import itertools, logging, os, re, subprocess, sys, typing as t from simple_di import Provide, inject - -import bentoml -import openllm_core +import bentoml, openllm_core, orjson from bentoml._internal.configuration.containers import BentoMLContainer from openllm_core._typing_compat import LiteralSerialisation from openllm_core.exceptions import OpenLLMException from openllm_core.utils import WARNING_ENV_VAR, codegen, first_not_none, get_disable_warnings, is_vllm_available - if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore from openllm_core._configuration import LLMConfig @@ -24,7 +13,6 @@ if t.TYPE_CHECKING: logger = logging.getLogger(__name__) - def _start( model_id: str, timeout: int = 30, @@ -35,7 +23,6 @@ def _start( backend: LiteralBackend | None = None, additional_args: list[str] | None = None, cors: bool = False, - _serve_grpc: bool = False, __test__: bool = False, **_: t.Any, ) -> LLMConfig | subprocess.Popen[bytes]: @@ -73,13 +60,10 @@ def _start( 
backend: The backend to use for this LLM. By default, this is set to ``pt``. additional_args: Additional arguments to pass to ``openllm start``. """ - from .entrypoint import start_command, start_grpc_command - + from .entrypoint import start_command os.environ['BACKEND'] = openllm_core.utils.first_not_none(backend, default='vllm' if is_vllm_available() else 'pt') - args: list[str] = [model_id] - if timeout: - args.extend(['--server-timeout', str(timeout)]) + if timeout: args.extend(['--server-timeout', str(timeout)]) if workers_per_resource: args.extend( [ @@ -87,24 +71,19 @@ def _start( str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource, ] ) - if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): - args.extend(['--device', ','.join(device)]) - if quantize: - args.extend(['--quantize', str(quantize)]) - if cors: - args.append('--cors') + if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)]) + if quantize: args.extend(['--quantize', str(quantize)]) + if cors: args.append('--cors') if adapter_map: args.extend( list( itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()]) ) ) - if additional_args: - args.extend(additional_args) - if __test__: - args.append('--return-process') + if additional_args: args.extend(additional_args) + if __test__: args.append('--return-process') - cmd = start_command if not _serve_grpc else start_grpc_command + cmd = start_command return cmd.main(args=args, standalone_mode=False) @@ -159,7 +138,6 @@ def _build( ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud. """ from openllm.serialisation.transformers.weights import has_safetensors_weights - args: list[str] = [ sys.executable, '-m', @@ -173,32 +151,19 @@ def _build( serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy' ), ] - if quantize: - args.extend(['--quantize', quantize]) - if containerize and push: - raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.") - if push: - args.extend(['--push']) - if containerize: - args.extend(['--containerize']) - if build_ctx: - args.extend(['--build-ctx', build_ctx]) - if enable_features: - args.extend([f'--enable-features={f}' for f in enable_features]) - if overwrite: - args.append('--overwrite') - if adapter_map: - args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()]) - if model_version: - args.extend(['--model-version', model_version]) - if bento_version: - args.extend(['--bento-version', bento_version]) - if dockerfile_template: - args.extend(['--dockerfile-template', dockerfile_template]) - if additional_args: - args.extend(additional_args) - if force_push: - args.append('--force-push') + if quantize: args.extend(['--quantize', quantize]) + if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.") + if push: args.extend(['--push']) + if containerize: args.extend(['--containerize']) + if build_ctx: args.extend(['--build-ctx', build_ctx]) + if enable_features: args.extend([f'--enable-features={f}' for f in enable_features]) + if overwrite: args.append('--overwrite') + if adapter_map: args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()]) + if model_version: args.extend(['--model-version', model_version]) + if bento_version: 
args.extend(['--bento-version', bento_version]) + if dockerfile_template: args.extend(['--dockerfile-template', dockerfile_template]) + if additional_args: args.extend(additional_args) + if force_push: args.append('--force-push') current_disable_warning = get_disable_warnings() os.environ[WARNING_ENV_VAR] = str(True) @@ -206,24 +171,17 @@ def _build( output = subprocess.check_output(args, env=os.environ.copy(), cwd=build_ctx or os.getcwd()) except subprocess.CalledProcessError as e: logger.error("Exception caught while building Bento for '%s'", model_id, exc_info=e) - if e.stderr: - raise OpenLLMException(e.stderr.decode('utf-8')) from None + if e.stderr: raise OpenLLMException(e.stderr.decode('utf-8')) from None raise OpenLLMException(str(e)) from None matched = re.match(r'__object__:(\{.*\})$', output.decode('utf-8').strip()) if matched is None: - raise ValueError( - f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub." - ) + raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.") os.environ[WARNING_ENV_VAR] = str(current_disable_warning) try: result = orjson.loads(matched.group(1)) except orjson.JSONDecodeError as e: - raise ValueError( - f"Failed to decode JSON from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub." - ) from e + raise ValueError(f"Failed to decode JSON from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.") from e return bentoml.get(result['tag'], _bento_store=bento_store) - - def _import_model( model_id: str, model_version: str | None = None, @@ -260,32 +218,15 @@ def _import_model( ``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud. 
""" from .entrypoint import import_command - args = [model_id, '--quiet'] - if backend is not None: - args.extend(['--backend', backend]) - if model_version is not None: - args.extend(['--model-version', str(model_version)]) - if quantize is not None: - args.extend(['--quantize', quantize]) - if serialisation is not None: - args.extend(['--serialisation', serialisation]) - if additional_args is not None: - args.extend(additional_args) + if backend is not None: args.extend(['--backend', backend]) + if model_version is not None: args.extend(['--model-version', str(model_version)]) + if quantize is not None: args.extend(['--quantize', quantize]) + if serialisation is not None: args.extend(['--serialisation', serialisation]) + if additional_args is not None: args.extend(additional_args) return import_command.main(args=args, standalone_mode=False) - - def _list_models() -> dict[str, t.Any]: '''List all available models within the local store.''' - from .entrypoint import models_command - - return models_command.main(args=['--quiet'], standalone_mode=False) - - -start, start_grpc = codegen.gen_sdk(_start, _serve_grpc=False), codegen.gen_sdk(_start, _serve_grpc=True) -build, import_model, list_models = ( - codegen.gen_sdk(_build), - codegen.gen_sdk(_import_model), - codegen.gen_sdk(_list_models), -) -__all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models'] + from .entrypoint import models_command; return models_command.main(args=['--quiet'], standalone_mode=False) +start, build, import_model, list_models = codegen.gen_sdk(_start), codegen.gen_sdk(_build), codegen.gen_sdk(_import_model), codegen.gen_sdk(_list_models) +__all__ = ['start', 'build', 'import_model', 'list_models']