diff --git a/changelog.d/663.feature.md b/changelog.d/663.feature.md new file mode 100644 index 00000000..a8ee89bf --- /dev/null +++ b/changelog.d/663.feature.md @@ -0,0 +1,3 @@ +Type hints for all exposed APIs are now provided through stubs. This means REPLs +and static analysis tools like mypy can resolve types for the library instantly, without +having to infer them from runtime function signatures. diff --git a/mypy.ini b/mypy.ini index 9648d989..7962f935 100644 --- a/mypy.ini +++ b/mypy.ini @@ -7,4 +7,4 @@ warn_unused_configs = True ignore_missing_imports = true check_untyped_defs = true warn_unreachable = true -files = openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-core/src/openllm_core/_typing_compat.py, openllm-client/src/openllm_client/_typing_compat.py, openllm-python/src/openllm/__init__.pyi, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/serialisation/__init__.pyi +files = openllm-python/src/openllm/bundle/__init__.pyi, openllm-python/src/openllm/serialisation/__init__.pyi, openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-python/src/openllm/__init__.pyi, openllm-client/src/openllm_client/_typing_compat.py, openllm-core/src/openllm_core/_typing_compat.py, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/bundle/_package.pyi, openllm-python/src/openllm/_runners.pyi, openllm-python/src/openllm/_quantisation.pyi, openllm-python/src/openllm/_llm.pyi, openllm-python/src/openllm/_generation.pyi diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py index c926635f..7a9c4065 100644 --- a/openllm-core/src/openllm_core/_typing_compat.py +++ b/openllm-core/src/openllm_core/_typing_compat.py @@ -88,9 +88,3 @@ class AdapterTuple(TupleAny): AdapterMap = t.Dict[AdapterType, t.Tuple[AdapterTuple, ...]] - - -class RefTuple(TupleAny): - git_hash: str - version: VersionInfo - strategy: LiteralContainerVersionStrategy diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py index 2c38a3f1..680e8566 100644 --- a/openllm-core/src/openllm_core/config/configuration_auto.py +++ b/openllm-core/src/openllm_core/config/configuration_auto.py @@ -121,6 +121,9 @@ class AutoConfig: # update-config-stubs.py: auto stubs start @t.overload @classmethod + def for_model(cls,model_name:t.Literal['baichuan'],**attrs:t.Any)->openllm_core.config.BaichuanConfig:... + @t.overload + @classmethod def for_model(cls,model_name:t.Literal['chatglm'],**attrs:t.Any)->openllm_core.config.ChatGLMConfig:... @t.overload @classmethod @@ -139,6 +142,9 @@ class AutoConfig: def for_model(cls,model_name:t.Literal['llama'],**attrs:t.Any)->openllm_core.config.LlamaConfig:... @t.overload @classmethod + def for_model(cls,model_name:t.Literal['mistral'],**attrs:t.Any)->openllm_core.config.MistralConfig:... + @t.overload + @classmethod def for_model(cls,model_name:t.Literal['mpt'],**attrs:t.Any)->openllm_core.config.MPTConfig:... @t.overload @classmethod @@ -151,13 +157,7 @@ class AutoConfig: def for_model(cls,model_name:t.Literal['starcoder'],**attrs:t.Any)->openllm_core.config.StarCoderConfig:... @t.overload @classmethod - def for_model(cls,model_name:t.Literal['mistral'],**attrs:t.Any)->openllm_core.config.MistralConfig:... - @t.overload - @classmethod def for_model(cls,model_name:t.Literal['yi'],**attrs:t.Any)->openllm_core.config.YiConfig:...
- @t.overload - @classmethod - def for_model(cls,model_name:t.Literal['baichuan'],**attrs:t.Any)->openllm_core.config.BaichuanConfig:... # update-config-stubs.py: auto stubs stop # fmt: on diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index a3eda552..835cb195 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -350,13 +350,16 @@ T = t.TypeVar('T') K = t.TypeVar('K') -# yapf: disable @overload def first_not_none(*args: T | None, default: T) -> T: ... + + @overload def first_not_none(*args: T | None) -> T | None: ... -def first_not_none(*args: T | None, default: None | T = None) -> T | None: return next((arg for arg in args if arg is not None), default) -# yapf: enable + + +def first_not_none(*args: T | None, default: T | None = None) -> T | None: + return next((arg for arg in args if arg is not None), default) def resolve_filepath(path: str, ctx: str | None = None) -> str: diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml index 0995afd8..4a4bb9d3 100644 --- a/openllm-python/pyproject.toml +++ b/openllm-python/pyproject.toml @@ -38,7 +38,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "bentoml[io]>=1.1.2", + "bentoml[io]>=1.1.9", "transformers[torch,tokenizers]>=4.35.0", "openllm-client", "openllm-core", diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py index 3d48570b..2703d9ec 100644 --- a/openllm-python/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -1,14 +1,3 @@ -"""OpenLLM. - -An open platform for operating large language models in production. Fine-tune, serve, -deploy, and monitor any LLMs with ease. - -* Built-in support for StableLM, Llama 2, Dolly, Flan-T5, Vicuna -* Option to bring your own fine-tuned LLMs -* Online Serving with HTTP, gRPC, SSE(coming soon) or custom API -* Native integration with BentoML and LangChain for custom LLM apps -""" - import logging as _logging import os as _os import pathlib as _pathlib @@ -57,13 +46,14 @@ __lazy = utils.LazyModule( 'entrypoints': ['mount_entrypoints'], 'serialisation': ['ggml', 'transformers'], '_quantisation': ['infer_quantisation_config'], - '_llm': ['LLM', 'LLMRunner', 'LLMRunnable'], + '_llm': ['LLM'], '_generation': [ 'StopSequenceCriteria', 'StopOnTokens', - 'LogitsProcessorList', - 'StoppingCriteriaList', 'prepare_logits_processor', + 'get_context_length', + 'is_sentence_complete', + 'is_partial_stop', ], }, extra_objects={ diff --git a/openllm-python/src/openllm/__init__.pyi b/openllm-python/src/openllm/__init__.pyi index 531ee8b7..6bb92cb9 100644 --- a/openllm-python/src/openllm/__init__.pyi +++ b/openllm-python/src/openllm/__init__.pyi @@ -1,3 +1,21 @@ +"""OpenLLM. +=========== + +An open platform for operating large language models in production. +Fine-tune, serve, deploy, and monitor any LLMs with ease. 
+ +* Built-in support for Mistral, Llama 2, Yi, StableLM, Dolly, Flan-T5, Vicuna +* Option to bring your own fine-tuned LLMs +* Online Serving with HTTP, gRPC, SSE or custom API +* Native integration with BentoML, LangChain, OpenAI-compatible endpoints, LlamaIndex for custom LLM apps +""" + +# fmt: off +# update-config-stubs.py: import stubs start +from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING,CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,AutoConfig as AutoConfig,BaichuanConfig as BaichuanConfig,ChatGLMConfig as ChatGLMConfig,DollyV2Config as DollyV2Config,FalconConfig as FalconConfig,FlanT5Config as FlanT5Config,GPTNeoXConfig as GPTNeoXConfig,LlamaConfig as LlamaConfig,MistralConfig as MistralConfig,MPTConfig as MPTConfig,OPTConfig as OPTConfig,StableLMConfig as StableLMConfig,StarCoderConfig as StarCoderConfig,YiConfig as YiConfig +# update-config-stubs.py: import stubs stop +# fmt: on + import openllm_cli as _cli from openllm_cli._sdk import ( build as build, @@ -16,23 +34,6 @@ from openllm_core._schemas import ( GenerationOutput as GenerationOutput, MetadataOutput as MetadataOutput, ) -from openllm_core.config import ( - CONFIG_MAPPING as CONFIG_MAPPING, - CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, - AutoConfig as AutoConfig, - BaichuanConfig as BaichuanConfig, - ChatGLMConfig as ChatGLMConfig, - DollyV2Config as DollyV2Config, - FalconConfig as FalconConfig, - FlanT5Config as FlanT5Config, - GPTNeoXConfig as GPTNeoXConfig, - LlamaConfig as LlamaConfig, - MistralConfig as MistralConfig, - MPTConfig as MPTConfig, - OPTConfig as OPTConfig, - StableLMConfig as StableLMConfig, - StarCoderConfig as StarCoderConfig, -) from . import ( bundle as bundle, @@ -44,13 +45,14 @@ from . import ( ) from ._deprecated import Runner as Runner from ._generation import ( - LogitsProcessorList as LogitsProcessorList, StopOnTokens as StopOnTokens, - StoppingCriteriaList as StoppingCriteriaList, StopSequenceCriteria as StopSequenceCriteria, prepare_logits_processor as prepare_logits_processor, + is_partial_stop as is_partial_stop, + is_sentence_complete as is_sentence_complete, + get_context_length as get_context_length, ) -from ._llm import LLM as LLM, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner +from ._llm import LLM as LLM from ._quantisation import infer_quantisation_config as infer_quantisation_config from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource from .client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient diff --git a/openllm-python/src/openllm/_generation.py b/openllm-python/src/openllm/_generation.py index d6d4eaef..c3b4fa5b 100644 --- a/openllm-python/src/openllm/_generation.py +++ b/openllm-python/src/openllm/_generation.py @@ -1,43 +1,24 @@ -# mypy: disable-error-code="misc" -from __future__ import annotations -import typing as t - import transformers -if t.TYPE_CHECKING: - import torch - - import openllm - -# reexport from transformers -LogitsProcessorList = transformers.LogitsProcessorList -StoppingCriteriaList = transformers.StoppingCriteriaList - class StopSequenceCriteria(transformers.StoppingCriteria): - def __init__( - self, - stop_sequences: str | list[str], - tokenizer: transformers.PreTrainedTokenizer - | transformers.PreTrainedTokenizerBase - | transformers.PreTrainedTokenizerFast, - ): + def __init__(self, stop_sequences, tokenizer): if isinstance(stop_sequences, str): stop_sequences = [stop_sequences] self.stop_sequences, self.tokenizer = stop_sequences, tokenizer - def
__call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool: + def __call__(self, input_ids, scores, **kwargs): return any( self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences ) class StopOnTokens(transformers.StoppingCriteria): - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool: + def __call__(self, input_ids, scores, **kwargs): return input_ids[0][-1] in {50278, 50279, 50277, 1, 0} -def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsProcessorList: +def prepare_logits_processor(config): generation_config = config.generation_config logits_processor = transformers.LogitsProcessorList() if generation_config['temperature'] >= 1e-5 and generation_config['temperature'] != 1.0: @@ -55,7 +36,7 @@ def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsPr SEQLEN_KEYS = ['max_sequence_length', 'seq_length', 'max_position_embeddings', 'max_seq_len', 'model_max_length'] -def get_context_length(config: transformers.PretrainedConfig) -> int: +def get_context_length(config): rope_scaling = getattr(config, 'rope_scaling', None) rope_scaling_factor = config.rope_scaling['factor'] if rope_scaling else 1.0 for key in SEQLEN_KEYS: @@ -64,11 +45,11 @@ def get_context_length(config: transformers.PretrainedConfig) -> int: return 2048 -def is_sentence_complete(output: str) -> bool: +def is_sentence_complete(output): return output.endswith(('.', '?', '!', '...', '。', '?', '!', '…', '"', "'", '”')) -def is_partial_stop(output: str, stop_str: str) -> bool: +def is_partial_stop(output, stop_str): """Check whether the output contains a partial stop str.""" for i in range(min(len(output), len(stop_str))): if stop_str.startswith(output[-i:]): diff --git a/openllm-python/src/openllm/_generation.pyi b/openllm-python/src/openllm/_generation.pyi new file mode 100644 index 00000000..c727f6be --- /dev/null +++ b/openllm-python/src/openllm/_generation.pyi @@ -0,0 +1,28 @@ +from typing import Any, List, Union + +from torch import FloatTensor, LongTensor +from transformers import ( + LogitsProcessorList, + PretrainedConfig, + PreTrainedTokenizer, + PreTrainedTokenizerBase, + PreTrainedTokenizerFast, +) + +from openllm_core import LLMConfig + +Tokenizer = Union[PreTrainedTokenizerBase, PreTrainedTokenizer, PreTrainedTokenizerFast] + +class StopSequenceCriteria: + stop_sequences: List[str] + tokenizer: Tokenizer + def __init__(self, stop_sequences: Union[str, List[str]], tokenizer: Tokenizer) -> None: ... + def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ... + +class StopOnTokens: + def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ... + +def prepare_logits_processor(config: LLMConfig) -> LogitsProcessorList: ... +def get_context_length(config: PretrainedConfig) -> int: ... +def is_sentence_complete(output: str) -> bool: ... +def is_partial_stop(output: str, stop_str: str) -> bool: ... 
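The `_generation.py` runtime above intentionally drops its inline annotations; the new `_generation.pyi` stub is what editors and mypy read instead. As a usage illustration only (not part of the diff; the `gpt2` checkpoint and the stop strings are placeholder choices), the helpers can be wired into a plain `transformers` generate call like this:

```python
import transformers

from openllm._generation import StopSequenceCriteria, get_context_length

tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')  # placeholder checkpoint
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')

# The stub types stop_sequences as Union[str, List[str]] and accepts any PreTrainedTokenizer*
# flavour, so a missing tokenizer argument is now a static error instead of a runtime surprise.
stopping = transformers.StoppingCriteriaList([StopSequenceCriteria(['\n\n', '###'], tokenizer)])

inputs = tokenizer('OpenLLM is', return_tensors='pt')
# get_context_length falls back to 2048 when the config exposes none of the known max-length keys.
budget = min(32, get_context_length(model.config))
out = model.generate(**inputs, max_new_tokens=budget, stopping_criteria=stopping)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```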
diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 61965537..2ee2387d 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -1,6 +1,4 @@ -# mypy: disable-error-code="name-defined,attr-defined" from __future__ import annotations -import abc import functools import logging import os @@ -10,14 +8,12 @@ import typing as t import attr import inflection import orjson -from huggingface_hub import hf_hub_download import bentoml import openllm -import openllm_core from bentoml._internal.models.model import ModelSignature from bentoml._internal.runner.runner_handle import DummyRunnerHandle -from openllm_core._schemas import CompletionChunk, GenerationOutput +from openllm_core._schemas import GenerationOutput from openllm_core._typing_compat import ( AdapterMap, AdapterTuple, @@ -43,32 +39,27 @@ from openllm_core.utils import ( converter, first_not_none, flatten_attrs, + gen_random_uuid, generate_hash_from_file, get_debug_mode, get_disable_warnings, get_quiet_mode, is_peft_available, + is_vllm_available, resolve_filepath, validate_is_path, ) -from ._quantisation import infer_quantisation_config -from ._strategies import CascadingResourceStrategy from .exceptions import ForbiddenAttributeError, OpenLLMException from .serialisation.constants import PEFT_CONFIG_NAME if t.TYPE_CHECKING: - import torch import transformers from peft.config import PeftConfig - from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM - from bentoml._internal.runner.runnable import RunnableMethod - from bentoml._internal.runner.runner import RunnerMethod - from bentoml._internal.runner.runner_handle import RunnerHandle - from bentoml._internal.runner.strategy import Strategy from openllm_core._configuration import LLMConfig - from openllm_core.utils.representation import ReprArgs + + from ._runners import Runner ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]] @@ -84,16 +75,15 @@ def normalise_model_name(name: str) -> str: return inflection.dasherize(name) -def resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap: - """Resolve the type of the PeftConfig given the adapter_map. +def _resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap: + try: + from huggingface_hub import hf_hub_download + except ImportError: + raise MissingDependencyError( + "Failed to import 'huggingface_hub'. Make sure to do 'pip install \"openllm[fine-tune]\"'" + ) from None - This is similar to how PeftConfig resolve its config type. - - Args: - adapter_map: The given mapping from either SDK or CLI. See CLI docs for more information. 
- """ resolved: AdapterMap = {} - _has_set_default = False for path_or_adapter_id, name in adapter_map.items(): if name is None: raise ValueError('Adapter name must be specified.') @@ -107,7 +97,7 @@ def resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap: with open(config_file, 'r') as file: resolved_config = orjson.loads(file.read()) # all peft_type should be available in PEFT_CONFIG_NAME - _peft_type: AdapterType = resolved_config['peft_type'].lower() + _peft_type = resolved_config['peft_type'].lower() if _peft_type not in resolved: resolved[_peft_type] = () resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),) @@ -151,7 +141,7 @@ class LLM(t.Generic[M, T], ReprMixin): __llm_config__: LLMConfig | None = None __llm_backend__: LiteralBackend = None # type: ignore __llm_quantization_config__: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None = None - __llm_runner__: t.Optional[LLMRunner[M, T]] = None + __llm_runner__: t.Optional[Runner[M, T]] = None __llm_model__: t.Optional[M] = None __llm_tokenizer__: t.Optional[T] = None __llm_adapter_map__: t.Optional[ResolvedAdapterMap] = None @@ -159,35 +149,29 @@ class LLM(t.Generic[M, T], ReprMixin): def __init__( self, - model_id: str, - model_version: str | None = None, - model_tag: str | bentoml.Tag | None = None, - prompt_template: PromptTemplate | str | None = None, - system_message: str | None = None, - llm_config: LLMConfig | None = None, - backend: LiteralBackend | None = None, - *args: t.Any, - quantize: LiteralQuantise | None = None, - quantization_config: transformers.BitsAndBytesConfig - | transformers.GPTQConfig - | transformers.AwqConfig - | None = None, - adapter_map: dict[str, str] | None = None, - serialisation: LiteralSerialisation = 'safetensors', - trust_remote_code: bool = False, - embedded: bool = False, - torch_dtype: LiteralDtype | t.Literal['auto', 'half', 'float'] = 'auto', - **attrs: t.Any, + model_id, + model_version=None, + model_tag=None, + prompt_template=None, + system_message=None, + llm_config=None, + backend=None, + *args, + quantize=None, + quantization_config=None, + adapter_map=None, + serialisation='safetensors', + trust_remote_code=False, + embedded=False, + torch_dtype='auto', + low_cpu_mem_usage=True, + **attrs, ): - # low_cpu_mem_usage is only available for model this is helpful on system with low memory to avoid OOM - low_cpu_mem_usage = attrs.pop('low_cpu_mem_usage', True) _local = False if validate_is_path(model_id): model_id, _local = resolve_filepath(model_id), True - backend = first_not_none( - backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if openllm.utils.is_vllm_available() else 'pt' - ) + backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt') torch_dtype = first_not_none(os.getenv('TORCH_DTYPE'), torch_dtype, default='auto') quantize = first_not_none(quantize, os.getenv('OPENLLM_QUANTIZE'), default=None) # elif quantization_config is None and quantize is not None: @@ -215,7 +199,7 @@ class LLM(t.Generic[M, T], ReprMixin): quantization_config=quantization_config, quantise=quantize, model_decls=args, - adapter_map=resolve_peft_config_type(adapter_map) if adapter_map is not None else None, + adapter_map=_resolve_peft_config_type(adapter_map) if adapter_map is not None else None, serialisation=serialisation, local=_local, prompt_template=prompt_template, @@ -244,7 +228,7 @@ class LLM(t.Generic[M, T], ReprMixin): self.runner.init_local(quiet=True) 
@property - def _torch_dtype(self) -> torch.dtype: + def _torch_dtype(self): import torch import transformers @@ -298,11 +282,15 @@ class LLM(t.Generic[M, T], ReprMixin): super().__setattr__(attr, value) @property - def _model_attrs(self) -> dict[str, t.Any]: + def _model_attrs(self): return {**self.import_kwargs[0], **self.__model_attrs} + @_model_attrs.setter + def _model_attrs(self, value): + self.__model_attrs = value + @property - def _tokenizer_attrs(self) -> dict[str, t.Any]: + def _tokenizer_attrs(self): return {**self.import_kwargs[1], **self.__tokenizer_attrs} @property @@ -319,41 +307,42 @@ class LLM(t.Generic[M, T], ReprMixin): def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: import torch - return {'device_map': 'auto' if torch.cuda.is_available() else None, 'torch_dtype': self._torch_dtype}, { - 'padding_side': 'left', - 'truncation_side': 'left', - } + model_attrs = {'device_map': 'auto' if torch.cuda.is_available() else None, 'torch_dtype': self._torch_dtype} + tokenizer_attrs = {'padding_side': 'left', 'truncation_side': 'left'} + return model_attrs, tokenizer_attrs @property - def trust_remote_code(self) -> bool: + def trust_remote_code(self): env = os.getenv('TRUST_REMOTE_CODE') if env is not None: return str(env).upper() in ENV_VARS_TRUE_VALUES return self.__llm_trust_remote_code__ @property - def runner_name(self) -> str: + def runner_name(self): return f"llm-{self.config['start_name']}-runner" @property - def model_id(self) -> str: + def model_id(self): return self._model_id @property - def revision(self) -> str: - return t.cast(str, self._revision) + def revision(self): + return self._revision @property - def tag(self) -> bentoml.Tag: + def tag(self): return self._tag @property - def bentomodel(self) -> bentoml.Model: + def bentomodel(self): return openllm.serialisation.get(self) @property - def quantization_config(self) -> transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig: + def quantization_config(self): if self.__llm_quantization_config__ is None: + from ._quantisation import infer_quantisation_config + if self._quantization_config is not None: self.__llm_quantization_config__ = self._quantization_config elif self._quantise is not None: @@ -365,55 +354,55 @@ class LLM(t.Generic[M, T], ReprMixin): return self.__llm_quantization_config__ @property - def has_adapters(self) -> bool: + def has_adapters(self): return self._adapter_map is not None @property - def local(self) -> bool: + def local(self): return self._local @property - def quantise(self) -> LiteralQuantise | None: + def quantise(self): return self._quantise # NOTE: The section below defines a loose contract with langchain's LLM interface. @property - def llm_type(self) -> str: + def llm_type(self): return normalise_model_name(self._model_id) @property - def identifying_params(self) -> DictStrAny: + def llm_parameters(self): + return (self._model_decls, self._model_attrs), self._tokenizer_attrs + + @property + def identifying_params(self): return { 'configuration': self.config.model_dump_json().decode(), 'model_ids': orjson.dumps(self.config['model_ids']).decode(), 'model_id': self.model_id, } - @property - def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]: - return (self._model_decls, self._model_attrs), self._tokenizer_attrs - # NOTE: This section is the actual model, tokenizer, and config reference here. 
@property - def config(self) -> LLMConfig: + def config(self): if self.__llm_config__ is None: self.__llm_config__ = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs) return self.__llm_config__ @property - def tokenizer(self) -> T: + def tokenizer(self): if self.__llm_tokenizer__ is None: self.__llm_tokenizer__ = openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1]) return self.__llm_tokenizer__ @property - def runner(self) -> LLMRunner[M, T]: + def runner(self): if self.__llm_runner__ is None: self.__llm_runner__ = _RunnerFactory(self) return self.__llm_runner__ @property - def model(self) -> M: + def model(self): if self.__llm_model__ is None: model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs) # If OOM, then it is probably you don't have enough VRAM to run this model. @@ -439,7 +428,7 @@ class LLM(t.Generic[M, T], ReprMixin): return self.__llm_model__ @property - def adapter_map(self) -> ResolvedAdapterMap: + def adapter_map(self): try: import peft as _ # noqa: F401 except ImportError as err: @@ -461,9 +450,7 @@ class LLM(t.Generic[M, T], ReprMixin): self.__llm_adapter_map__ = _map return self.__llm_adapter_map__ - def prepare_for_training( - self, adapter_type: AdapterType = 'lora', use_gradient_checking: bool = True, **attrs: t.Any - ) -> tuple[PeftModel | PeftModelForCausalLM | PeftModelForSeq2SeqLM, T]: + def prepare_for_training(self, adapter_type='lora', use_gradient_checking=True, **attrs): from peft.mapping import get_peft_model from peft.utils.other import prepare_model_for_kbit_training @@ -484,15 +471,8 @@ class LLM(t.Generic[M, T], ReprMixin): return model, self.tokenizer async def generate( - self, - prompt: str | None, - prompt_token_ids: list[int] | None = None, - stop: str | t.Iterable[str] | None = None, - stop_token_ids: list[int] | None = None, - request_id: str | None = None, - adapter_name: str | None = None, - **attrs: t.Any, - ) -> GenerationOutput: + self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs + ): config = self.config.model_construct_env(**attrs) texts: list[list[str]] = [[]] * config['n'] token_ids: list[list[int]] = [[]] * config['n'] @@ -515,15 +495,8 @@ class LLM(t.Generic[M, T], ReprMixin): ) async def generate_iterator( - self, - prompt: str | None, - prompt_token_ids: list[int] | None = None, - stop: str | t.Iterable[str] | None = None, - stop_token_ids: list[int] | None = None, - request_id: str | None = None, - adapter_name: str | None = None, - **attrs: t.Any, - ) -> t.AsyncGenerator[GenerationOutput, None]: + self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs + ): if isinstance(self.runner._runner_handle, DummyRunnerHandle): if os.getenv('BENTO_PATH') is not None: raise RuntimeError('Runner client failed to set up correctly.') @@ -551,14 +524,13 @@ class LLM(t.Generic[M, T], ReprMixin): raise ValueError('Either prompt or prompt_token_ids must be specified.') prompt_token_ids = self.tokenizer.encode(prompt) - if request_id is None: - request_id = openllm_core.utils.gen_random_uuid() + request_id = gen_random_uuid() if request_id is None else request_id previous_texts, previous_num_tokens = [''] * config['n'], [0] * config['n'] async for out in self.runner.generate_iterator.async_stream( - prompt_token_ids, request_id, stop, adapter_name, **config.model_dump(flatten=True) + prompt_token_ids, request_id, stop=stop, 
adapter_name=adapter_name, **config.model_dump(flatten=True) ): generated = GenerationOutput.from_runner(out).with_options(prompt=prompt) - delta_outputs = t.cast(t.List[CompletionChunk], [None] * len(generated.outputs)) + delta_outputs = [None] * len(generated.outputs) if generated.finished: break for output in generated.outputs: @@ -570,44 +542,37 @@ class LLM(t.Generic[M, T], ReprMixin): def _RunnerFactory( - self: openllm.LLM[M, T], - /, - models: list[bentoml.Model] | None = None, - max_batch_size: int | None = None, - max_latency_ms: int | None = None, - scheduling_strategy: type[bentoml.Strategy] = CascadingResourceStrategy, - *, - backend: LiteralBackend | None = None, -) -> LLMRunner[M, T]: + llm, /, models=None, max_batch_size=None, max_latency_ms=None, scheduling_strategy=None, *, backend=None +): from ._runners import runnable - backend = t.cast( - LiteralBackend, first_not_none(backend, os.environ.get('OPENLLM_BACKEND'), default=self.__llm_backend__) - ) + if scheduling_strategy is None: + from ._strategies import CascadingResourceStrategy + + scheduling_strategy = CascadingResourceStrategy + + backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND', default=llm.__llm_backend__)) models = models if models is not None else [] try: - models.append(self.bentomodel) + models.append(llm.bentomodel) except bentoml.exceptions.NotFound as err: - raise RuntimeError(f'Failed to locate {self.bentomodel}:{err}') from err + raise RuntimeError(f'Failed to locate {llm.bentomodel}:{err}') from err - if self._prompt_template: - prompt_template = self._prompt_template.to_string() - elif hasattr(self.config, 'default_prompt_template'): - prompt_template = self.config.default_prompt_template + if llm._prompt_template: + prompt_template = llm._prompt_template.to_string() + elif hasattr(llm.config, 'default_prompt_template'): + prompt_template = llm.config.default_prompt_template else: prompt_template = None - if self._system_message: - system_message = self._system_message - elif hasattr(self.config, 'default_system_message'): - system_message = self.config.default_system_message + if llm._system_message: + system_message = llm._system_message + elif hasattr(llm.config, 'default_system_message'): + system_message = llm.config.default_system_message else: system_message = None - def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]: - return {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'} - - def _wrapped_repr_args(_: LLMRunner[M, T]) -> ReprArgs: + def _wrapped_repr_args(_): yield ( 'runner_methods', { @@ -618,89 +583,40 @@ def _RunnerFactory( for method in _.runner_methods }, ) - yield 'config', self.config.model_dump(flatten=True) - yield 'llm_type', self.llm_type + yield 'config', llm.config.model_dump(flatten=True) + yield 'llm_type', llm.llm_type yield 'backend', backend - yield 'llm_tag', self.tag + yield 'llm_tag', llm.tag return types.new_class( - self.__class__.__name__ + 'Runner', + llm.config.__class__.__name__[:-6] + 'Runner', (bentoml.Runner,), exec_body=lambda ns: ns.update( { - 'llm_type': self.llm_type, - 'identifying_params': self.identifying_params, - 'llm_tag': self.tag, - 'llm': self, - 'config': self.config, + 'llm_type': llm.llm_type, + 'identifying_params': llm.identifying_params, + 'llm_tag': llm.tag, + 'llm': llm, + 'config': llm.config, 'backend': backend, - '__module__': self.__module__, + '__doc__': llm.config.__class__.__doc__ or f'Generated Runner class for {llm.config["model_name"]}', + '__module__': llm.__module__, '__repr__': 
ReprMixin.__repr__, - '__repr_keys__': property(_wrapped_repr_keys), + '__repr_keys__': property(lambda _: {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'}), '__repr_args__': _wrapped_repr_args, - 'has_adapters': self.has_adapters, + 'has_adapters': llm.has_adapters, 'prompt_template': prompt_template, 'system_message': system_message, } ), )( runnable(backend), - name=self.runner_name, + name=llm.runner_name, embedded=False, models=models, max_batch_size=max_batch_size, max_latency_ms=max_latency_ms, scheduling_strategy=scheduling_strategy, - runnable_init_params=dict(llm=self), + runnable_init_params={'llm': llm}, method_configs=converter.unstructure({'generate_iterator': ModelSignature(batchable=False)}), ) - - -@t.final -class LLMRunnable(bentoml.Runnable, t.Generic[M, T]): - SUPPORTED_RESOURCES = ('amd.com/gpu', 'nvidia.com/gpu', 'cpu') - SUPPORTS_CPU_MULTI_THREADING = True - generate_iterator: RunnableMethod[LLMRunnable[M, T], [list[int], str, str | t.Iterable[str] | None, str | None], str] - - -@t.final -class LLMRunner(t.Protocol[M, T]): - __doc__: str - __module__: str - llm_type: str - llm_tag: bentoml.Tag - identifying_params: dict[str, t.Any] - llm: openllm.LLM[M, T] - config: openllm.LLMConfig - backend: LiteralBackend - has_adapters: bool - system_message: str | None - prompt_template: str | None - generate_iterator: RunnerMethod[LLMRunnable[M, T], [list[int], str, str | t.Iterable[str] | None, str | None], str] - - runner_methods: list[RunnerMethod[t.Any, t.Any, t.Any]] - scheduling_strategy: type[Strategy] - workers_per_resource: int | float - runnable_init_params: dict[str, t.Any] - _runner_handle: RunnerHandle - - def __init__( - self, - runnable_class: type[LLMRunnable[M, T]], - *, - runnable_init_params: dict[str, t.Any] | None = ..., - name: str | None = ..., - scheduling_strategy: type[Strategy] = ..., - models: list[bentoml.Model] | None = ..., - max_batch_size: int | None = ..., - max_latency_ms: int | None = ..., - method_configs: dict[str, dict[str, int]] | None = ..., - embedded: bool = False, - ) -> None: ... - - @property - @abc.abstractmethod - def __repr_keys__(self) -> set[str]: ... 
- - -__all__ = ['LLMRunner', 'LLMRunnable', 'LLM'] diff --git a/openllm-python/src/openllm/_llm.pyi b/openllm-python/src/openllm/_llm.pyi new file mode 100644 index 00000000..bac2b346 --- /dev/null +++ b/openllm-python/src/openllm/_llm.pyi @@ -0,0 +1,158 @@ +from typing import Any, AsyncGenerator, Dict, Generic, Iterable, List, Literal, Optional, Set, Tuple, TypedDict, Union + +import attr +import torch +from peft.config import PeftConfig +from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM + +from bentoml import Model, Tag +from openllm_core import LLMConfig +from openllm_core._schemas import GenerationOutput +from openllm_core._typing_compat import ( + AdapterMap, + AdapterType, + LiteralBackend, + LiteralDtype, + LiteralQuantise, + LiteralSerialisation, + M, + T, +) +from openllm_core.prompts import PromptTemplate +from openllm_core.utils.representation import ReprArgs + +from ._quantisation import QuantizationConfig +from ._runners import Runner + +InjectedModel = Union[PeftModel | PeftModelForCausalLM | PeftModelForSeq2SeqLM] + +class IdentifyingParams(TypedDict): + configuration: str + model_ids: str + model_id: str + +ResolvedAdapterMap = Dict[AdapterType, Dict[str, Tuple[PeftConfig, str]]] +Dtype = Union[LiteralDtype, Literal['auto', 'half', 'float']] + +@attr.define(slots=True, repr=False, init=False) +class LLM(Generic[M, T]): + _model_id: str + _revision: Optional[str] + _quantization_config: Optional[QuantizationConfig] + _quantise: Optional[LiteralQuantise] + _model_decls: Tuple[Any, ...] + __model_attrs: Dict[str, Any] + __tokenizer_attrs: Dict[str, Any] + _tag: Tag + _adapter_map: Optional[AdapterMap] + _serialisation: LiteralSerialisation + _local: bool + _prompt_template: Optional[PromptTemplate] + _system_message: Optional[str] + + __llm_torch_dtype__: Dtype = ... + __llm_config__: Optional[LLMConfig] = ... + __llm_backend__: LiteralBackend = ... + __llm_quantization_config__: Optional[QuantizationConfig] = ... + __llm_runner__: Optional[Runner[M, T]] = ... + __llm_model__: Optional[M] = ... + __llm_tokenizer__: Optional[T] = ... + __llm_adapter_map__: Optional[ResolvedAdapterMap] = ... + __llm_trust_remote_code__: bool = ... + + @property + def __repr_keys__(self) -> Set[str]: ... + def __repr__(self) -> str: ... + def __str__(self) -> str: ... + def __repr_name__(self) -> str: ... + def __repr_str__(self, join_str: str) -> str: ... + def __repr_args__(self) -> ReprArgs: ... + def __init__( + self, + model_id: str, + model_version: Optional[str] = ..., + model_tag: Optional[Union[str, Tag]] = ..., + prompt_template: Optional[Union[str, PromptTemplate]] = ..., + system_message: Optional[str] = ..., + llm_config: Optional[LLMConfig] = ..., + backend: Optional[LiteralBackend] = ..., + *args: Any, + quantize: Optional[LiteralQuantise] = ..., + quantization_config: Optional[QuantizationConfig] = ..., + adapter_map: Optional[Dict[str, str]] = ..., + serialisation: LiteralSerialisation = ..., + trust_remote_code: bool = ..., + embedded: bool = ..., + torch_dtype: Dtype = ..., + low_cpu_mem_usage: bool = ..., + **attrs: Any, + ) -> None: ... + @property + def _torch_dtype(self) -> torch.dtype: ... + @property + def _model_attrs(self) -> Dict[str, Any]: ... + @_model_attrs.setter + def _model_attrs(self, model_attrs: Dict[str, Any]) -> None: ... + @property + def _tokenizer_attrs(self) -> Dict[str, Any]: ... + @property + def import_kwargs(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: ... + @property + def trust_remote_code(self) -> bool: ... 
+ @property + def runner_name(self) -> str: ... + @property + def model_id(self) -> str: ... + @property + def revision(self) -> str: ... + @property + def tag(self) -> Tag: ... + @property + def bentomodel(self) -> Model: ... + @property + def quantization_config(self) -> QuantizationConfig: ... + @property + def has_adapters(self) -> bool: ... + @property + def local(self) -> bool: ... + @property + def quantise(self) -> Optional[LiteralQuantise]: ... + @property + def llm_type(self) -> str: ... + @property + def identifying_params(self) -> IdentifyingParams: ... + @property + def llm_parameters(self) -> Tuple[Tuple[Tuple[Any, ...], Dict[str, Any]], Dict[str, Any]]: ... + @property + def config(self) -> LLMConfig: ... + @property + def tokenizer(self) -> T: ... + @property + def model(self) -> M: ... + @property + def runner(self) -> Runner[M, T]: ... + @property + def adapter_map(self) -> ResolvedAdapterMap: ... + def prepare_for_training( + self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any + ) -> Tuple[InjectedModel, T]: ... + async def generate( + self, + prompt: Optional[str], + prompt_token_ids: Optional[List[int]] = ..., + stop: Optional[Union[str, Iterable[str]]] = ..., + stop_token_ids: Optional[List[int]] = ..., + request_id: Optional[str] = ..., + adapter_name: Optional[str] = ..., + **attrs: Any, + ) -> GenerationOutput: ... + async def generate_iterator( + self, + prompt: Optional[str], + prompt_token_ids: Optional[List[int]] = ..., + stop: Optional[Union[str, Iterable[str]]] = ..., + stop_token_ids: Optional[List[int]] = ..., + request_id: Optional[str] = ..., + adapter_name: Optional[str] = ..., + **attrs: Any, + ) -> AsyncGenerator[GenerationOutput, None]: ... diff --git a/openllm-python/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py index 68724559..96a799e2 100644 --- a/openllm-python/src/openllm/_quantisation.py +++ b/openllm-python/src/openllm/_quantisation.py @@ -1,12 +1,5 @@ -# mypy: disable-error-code="name-defined,no-redef" from __future__ import annotations -import logging -import typing as t -import torch -import transformers - -from openllm_core._typing_compat import LiteralQuantise, overload from openllm_core.exceptions import MissingDependencyError from openllm_core.utils import ( is_autoawq_available, @@ -15,35 +8,11 @@ from openllm_core.utils import ( is_optimum_supports_gptq, ) -if t.TYPE_CHECKING: - from openllm_core._typing_compat import DictStrAny - from ._llm import LLM +def infer_quantisation_config(llm, quantise, **attrs): + import torch + import transformers -logger = logging.getLogger(__name__) - - -@overload -def infer_quantisation_config( - self: LLM[t.Any, t.Any], quantise: t.Literal['int8', 'int4'], **attrs: t.Any -) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]: ... - - -@overload -def infer_quantisation_config( - self: LLM[t.Any, t.Any], quantise: t.Literal['gptq'], **attrs: t.Any -) -> tuple[transformers.GPTQConfig, DictStrAny]: ... - - -@overload -def infer_quantisation_config( - self: LLM[t.Any, t.Any], quantise: t.Literal['awq'], **attrs: t.Any -) -> tuple[transformers.AwqConfig, DictStrAny]: ... 
- - -def infer_quantisation_config( - self: LLM[t.Any, t.Any], quantise: LiteralQuantise, **attrs: t.Any -) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig, DictStrAny]: # 8 bit configuration int8_threshold = attrs.pop('llm_int8_threshhold', 6.0) int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False) @@ -54,12 +23,17 @@ def infer_quantisation_config( bits = attrs.pop('bits', 4) group_size = attrs.pop('group_size', 128) - def create_awq_config() -> transformers.AwqConfig: + # 4 bit configuration + int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16) + int4_quant_type = attrs.pop('bnb_4bit_quant_type', 'nf4') + int4_use_double_quant = attrs.pop('bnb_4bit_use_double_quant', True) + + def create_awq_config(): zero_point = attrs.pop('zero_point', True) return transformers.AwqConfig(bits=bits, group_size=group_size, zero_point=zero_point) - def create_gptq_config() -> transformers.GPTQConfig: - gptq_tokenizer = attrs.pop('tokenizer', self.model_id) + def create_gptq_config(): + gptq_tokenizer = attrs.pop('tokenizer', llm.model_id) gptq_dataset = attrs.pop('dataset', 'c4') gptq_damp_percent = attrs.pop('damp_percent', 0.1) gptq_desc_act = attrs.pop('desc_act', False) @@ -94,10 +68,9 @@ def infer_quantisation_config( exllama_config={'version': 1}, ) # XXX: See how to migrate to v2 - def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig: + def create_int8_config(int8_skip_modules): # if int8_skip_modules is None: int8_skip_modules = [] # if 'lm_head' not in int8_skip_modules and self.config_class.__openllm_model_type__ == 'causal_lm': - # logger.debug("Skipping 'lm_head' for quantization for %s", self.__name__) # int8_skip_modules.append('lm_head') return transformers.BitsAndBytesConfig( load_in_8bit=True, @@ -107,10 +80,13 @@ def infer_quantisation_config( llm_int8_has_fp16_weight=int8_has_fp16_weight, ) - # 4 bit configuration - int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16) - int4_quant_type = attrs.pop('bnb_4bit_quant_type', 'nf4') - int4_use_double_quant = attrs.pop('bnb_4bit_use_double_quant', True) + def create_int4_config(): + return transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=int4_compute_dtype, + bnb_4bit_quant_type=int4_quant_type, + bnb_4bit_use_double_quant=int4_use_double_quant, + ) # NOTE: Quantization setup quantize is a openllm.LLM feature, where we can quantize the model with bitsandbytes or quantization aware training. if not is_bitsandbytes_available(): @@ -120,23 +96,18 @@ def infer_quantisation_config( if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules) elif quantise == 'int4': - quantisation_config = transformers.BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=int4_compute_dtype, - bnb_4bit_quant_type=int4_quant_type, - bnb_4bit_use_double_quant=int4_use_double_quant, - ) + quantisation_config = create_int4_config() elif quantise == 'gptq': if not is_autogptq_available() or not is_optimum_supports_gptq(): raise MissingDependencyError( - "'quantize=\"gptq\"' requires 'auto-gptq' and 'optimum>=0.12' to be installed (missing or failed to import). Make sure to do 'pip install \"openllm[gptq]\"'" + "GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. 
Do it with 'pip install \"openllm[gptq]\"'" ) else: quantisation_config = create_gptq_config() elif quantise == 'awq': if not is_autoawq_available(): raise MissingDependencyError( - "quantize='awq' requires 'auto-awq' to be installed (missing or failed to import). Make sure to do 'pip install \"openllm[awq]\"'." + "AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'." ) else: quantisation_config = create_awq_config() diff --git a/openllm-python/src/openllm/_quantisation.pyi b/openllm-python/src/openllm/_quantisation.pyi new file mode 100644 index 00000000..d41809f7 --- /dev/null +++ b/openllm-python/src/openllm/_quantisation.pyi @@ -0,0 +1,26 @@ +from typing import Any, Dict, Literal, Union + +from transformers import AwqConfig, BitsAndBytesConfig, GPTQConfig + +from openllm_core._typing_compat import LiteralQuantise, M, T, overload + +from ._llm import LLM + +QuantizationConfig = Union[BitsAndBytesConfig, GPTQConfig, AwqConfig] + +@overload +def infer_quantisation_config( + self: LLM[M, T], quantise: Literal['int8', 'int4'], **attrs: Any +) -> tuple[BitsAndBytesConfig, Dict[str, Any]]: ... +@overload +def infer_quantisation_config( + self: LLM[M, T], quantise: Literal['gptq'], **attrs: Any +) -> tuple[GPTQConfig, Dict[str, Any]]: ... +@overload +def infer_quantisation_config( + self: LLM[M, T], quantise: Literal['awq'], **attrs: Any +) -> tuple[AwqConfig, Dict[str, Any]]: ... +@overload +def infer_quantisation_config( + self: LLM[M, T], quantise: LiteralQuantise, **attrs: Any +) -> tuple[QuantizationConfig, Dict[str, Any]]: ... diff --git a/openllm-python/src/openllm/_runners.py b/openllm-python/src/openllm/_runners.py index bc4b2bf7..16b26916 100644 --- a/openllm-python/src/openllm/_runners.py +++ b/openllm-python/src/openllm/_runners.py @@ -9,27 +9,14 @@ import torch import bentoml import openllm from openllm_core._schemas import CompletionChunk, GenerationOutput -from openllm_core._typing_compat import LiteralBackend, M, T from openllm_core.exceptions import OpenLLMException from openllm_core.utils import first_not_none, is_vllm_available -if t.TYPE_CHECKING: - import vllm - - from openllm_core._schemas import FinishReason -else: - vllm = openllm.utils.LazyLoader('vllm', globals(), 'vllm') - -_DEFAULT_TOKENIZER = 'hf-internal-testing/llama-tokenizer' - __all__ = ['runnable'] -def runnable(backend: LiteralBackend | None = None) -> type[bentoml.Runnable]: - backend = t.cast( - LiteralBackend, - first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt'), - ) +def runnable(backend=None): + backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt') return vLLMRunnable if backend == 'vllm' else PyTorchRunnable @@ -37,7 +24,11 @@ class vLLMRunnable(bentoml.Runnable): SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu') SUPPORTS_CPU_MULTI_THREADING = True - def __init__(self, llm: openllm.LLM[M, T]) -> None: + def __init__(self, llm): + try: + import vllm + except ImportError: + raise OpenLLMException('vLLM is not installed. 
Please install it via `pip install "openllm[vllm]"`.') from None self.config = llm.config num_gpus, dev = 1, openllm.utils.device_count() if dev >= 2: @@ -64,14 +55,7 @@ class vLLMRunnable(bentoml.Runnable): raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err @bentoml.Runnable.method(batchable=False) - async def generate_iterator( - self, - prompt_token_ids: list[int], - request_id: str, - stop: str | t.Iterable[str] | None = None, - adapter_name: str | None = None, - **attrs: t.Any, - ) -> t.AsyncGenerator[str, None]: + async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs): if adapter_name is not None: raise NotImplementedError('Adapter is not supported with vLLM.') stop_: set[str] = set() @@ -99,28 +83,19 @@ class PyTorchRunnable(bentoml.Runnable): SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu') SUPPORTS_CPU_MULTI_THREADING = True - def __init__(self, llm: openllm.LLM[M, T]) -> None: + def __init__(self, llm): self.model = llm.model self.tokenizer = llm.tokenizer self.config = llm.config @bentoml.Runnable.method(batchable=False) - async def generate_iterator( - self, - prompt_token_ids: list[int], - request_id: str, - stop: str | t.Iterable[str] | None = None, - adapter_name: str | None = None, - **attrs: t.Any, - ) -> t.AsyncGenerator[str, None]: + async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs): if adapter_name is not None: self.model.set_adapter(adapter_name) async for generation_output in self.forward(prompt_token_ids, request_id, stop=stop, **attrs): yield generation_output.model_dump_json() - async def forward( - self, prompt_token_ids: list[int], request_id: str, stop: str | t.Iterable[str] | None = None, **attrs: t.Any - ) -> t.AsyncGenerator[GenerationOutput, None]: + async def forward(self, prompt_token_ids, request_id, stop=None, **attrs): from ._generation import is_partial_stop, prepare_logits_processor stop_: set[str] = set() @@ -142,7 +117,7 @@ class PyTorchRunnable(bentoml.Runnable): logits_processor = prepare_logits_processor(config) past_key_values = out = token = None - finish_reason: t.Optional[FinishReason] = None + finish_reason = None for i in range(config['max_new_tokens']): if i == 0: # prefill out = self.model(torch.as_tensor([prompt_token_ids], device=self.model.device), use_cache=True) diff --git a/openllm-python/src/openllm/_runners.pyi b/openllm-python/src/openllm/_runners.pyi new file mode 100644 index 00000000..a1ab4d7f --- /dev/null +++ b/openllm-python/src/openllm/_runners.pyi @@ -0,0 +1,126 @@ +from typing import ( + Any, + AsyncGenerator, + Dict, + Generic, + Iterable, + List, + Literal, + Optional, + Protocol, + Tuple, + Type, + TypeVar, + Union, + final, +) + +from bentoml import Model, Strategy, Tag +from bentoml._internal.runner.runner_handle import RunnerHandle +from openllm_core import LLMConfig +from openllm_core._typing_compat import LiteralBackend, T, overload + +from ._llm import LLM + +try: + from vllm import AsyncLLMEngine +except ImportError: + AsyncLLMEngine = Any + +try: + from transformers import PreTrainedModel +except ImportError: + PreTrainedModel = Any + +Mo = TypeVar('Mo') + +class _Runnable(Protocol[Mo]): + SUPPORTED_RESOURCES: Tuple[Literal['nvidia.com/gpu'], Literal['amd.com/gpu'], Literal['cpu']] = ... + SUPPORTS_CPU_MULTI_THREADING: bool = ... + config: LLMConfig = ... + model: Mo = ... + def __init__(self, llm: LLM[Mo, T]) -> None: ... 
+ async def generate_iterator( + self, + prompt_token_ids: List[int], + request_id: str, + stop: Optional[Union[str, Iterable[str]]] = ..., + adapter_name: Optional[str] = ..., + **attrs: Any, + ) -> AsyncGenerator[str, None]: ... + +In = TypeVar('In') +Ret = TypeVar('Ret') + +class RunnerMethod(Generic[In, Ret]): ... + +@final +class vLLMRunnable(_Runnable[AsyncLLMEngine]): ... + +@final +class PyTorchRunnable(_Runnable[PreTrainedModel]): + tokenizer: Any + +@overload +def runnable(backend: Literal['vllm']) -> Type[vLLMRunnable]: ... +@overload +def runnable(backend: Literal['pt']) -> Type[PyTorchRunnable]: ... +@overload +def runnable(backend: Optional[str] = ...) -> Type[Union[vLLMRunnable, PyTorchRunnable]]: ... + +class Runner(Protocol[Mo, T]): + __doc__: str = ... + __module__: str = ... + llm_type: str = ... + llm_tag: Tag = ... + identifying_params: Dict[str, Any] = ... + llm: LLM[Mo, T] = ... + config: LLMConfig = ... + backend: LiteralBackend = ... + has_adapters: bool = ... + prompt_template: Optional[str] = ... + system_message: Optional[str] = ... + + class generate_iterator(RunnerMethod[List[int], AsyncGenerator[str, None]]): + @staticmethod + def async_stream( + prompt_token_ids: List[int], + request_id: str, + stop: Optional[Union[Iterable[str], str]] = ..., + adapter_name: Optional[str] = ..., + **attrs: Any, + ) -> AsyncGenerator[str, None]: ... + + def __init__( + self, + runnable_class: Type[_Runnable[Mo]], + *, + runnable_init_params: Optional[Dict[str, Any]] = ..., + name: Optional[str] = ..., + scheduling_strategy: Type[Strategy] = ..., + models: Optional[List[Model]] = ..., + max_batch_size: Optional[int] = ..., + max_latency_ms: Optional[int] = ..., + method_configs: Optional[Dict[str, Dict[str, int]]] = ..., + embedded: bool = ..., + ) -> None: ... + + name: str = ... + models: List[Model] = ... + resource_config: Dict[str, Any] + runnable_class: Type[_Runnable[Mo]] + embedded: bool + runner_methods: List[RunnerMethod[Any, Any]] + scheduling_strategy: Type[Strategy] + workers_per_resource: Union[int, float] = ... + runnable_init_params: Dict[str, Any] = ... + _runner_handle: RunnerHandle = ... + + def init_local(self, quiet: bool = False) -> None: ... + def init_client(self, handle_class: Optional[Type[RunnerHandle]] = ..., *args: Any, **kwargs: Any) -> None: ... + async def runner_handle_is_ready(self, timeout: int = ...) -> bool: ... + def destroy(self) -> None: ... + @property + def scheduled_worker_count(self) -> int: ... + @property + def scheduled_worker_env_map(self) -> Dict[int, Dict[str, Any]]: ... 
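Tying the `_runners.pyi` stub back to `_llm.py`: `runnable()` resolves the backend-specific Runnable class via the overloads above, and the `Runner` protocol is the type of `LLM.runner`, so a streaming call is typed end to end. A minimal sketch, not part of the diff — the model id is a placeholder and it assumes the weights can be pulled into the local store:

```python
import asyncio

import openllm
from openllm._runners import runnable

RunnableCls = runnable('pt')  # narrowed to Type[PyTorchRunnable] by the stub's overloads

async def main() -> None:
    llm = openllm.LLM('facebook/opt-125m', backend='pt')  # placeholder model id
    # generate_iterator initialises the runner locally on first use and yields GenerationOutput chunks.
    async for chunk in llm.generate_iterator('What do type stubs buy us?', max_new_tokens=32):
        print(chunk.outputs[0].text, end='', flush=True)

asyncio.run(main())
```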
diff --git a/openllm-python/src/openllm/_service_vars.py b/openllm-python/src/openllm/_service_vars.py index b7b2821a..49ceedc3 100644 --- a/openllm-python/src/openllm/_service_vars.py +++ b/openllm-python/src/openllm/_service_vars.py @@ -1,4 +1,3 @@ -from __future__ import annotations import os model_id = os.environ['OPENLLM_MODEL_ID'] # openllm: model name diff --git a/openllm-python/src/openllm/_service_vars_pkg.py b/openllm-python/src/openllm/_service_vars_pkg.py index 773a24f8..f7ed217b 100644 --- a/openllm-python/src/openllm/_service_vars_pkg.py +++ b/openllm-python/src/openllm/_service_vars_pkg.py @@ -1,5 +1,3 @@ -from __future__ import annotations - model_id = '{__model_id__}' # openllm: model id model_tag = '{__model_tag__}' # openllm: model tag adapter_map = """{__model_adapter_map__}""" # openllm: model adapter map diff --git a/openllm-python/src/openllm/bundle/__init__.py b/openllm-python/src/openllm/bundle/__init__.py index ea8b0e10..bf508134 100644 --- a/openllm-python/src/openllm/bundle/__init__.py +++ b/openllm-python/src/openllm/bundle/__init__.py @@ -1,36 +1,15 @@ import os -import typing as t from openllm_core.utils import LazyModule -_import_structure = { - '_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'], - 'oci': [ - 'CONTAINER_NAMES', - 'get_base_container_tag', - 'get_base_container_name', - 'supported_registries', - 'RefResolver', - ], -} - -if t.TYPE_CHECKING: - from . import _package as _package, oci as oci - from ._package import ( - build_editable as build_editable, - construct_docker_options as construct_docker_options, - construct_python_options as construct_python_options, - create_bento as create_bento, - ) - from .oci import ( - CONTAINER_NAMES as CONTAINER_NAMES, - RefResolver as RefResolver, - get_base_container_name as get_base_container_name, - get_base_container_tag as get_base_container_tag, - supported_registries as supported_registries, - ) - -__lazy = LazyModule(__name__, os.path.abspath('__file__'), _import_structure) +__lazy = LazyModule( + __name__, + os.path.abspath('__file__'), + { + '_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'], + 'oci': ['CONTAINER_NAMES', 'supported_registries', 'RefResolver'], + }, +) __all__ = __lazy.__all__ __dir__ = __lazy.__dir__ __getattr__ = __lazy.__getattr__ diff --git a/openllm-python/src/openllm/bundle/__init__.pyi b/openllm-python/src/openllm/bundle/__init__.pyi new file mode 100644 index 00000000..46cb314d --- /dev/null +++ b/openllm-python/src/openllm/bundle/__init__.pyi @@ -0,0 +1,32 @@ +from typing import Optional + +import attr + +from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy +from openllm_core.utils.lazy import VersionInfo + +from . import _package as _package, oci as oci +from ._package import ( + build_editable as build_editable, + construct_docker_options as construct_docker_options, + construct_python_options as construct_python_options, + create_bento as create_bento, +) + +CONTAINER_NAMES: dict[LiteralContainerRegistry, str] = ... +supported_registries: list[str] = ... + +@attr.attrs(eq=False, order=False, slots=True, frozen=True) +class RefResolver: + git_hash: str + version: VersionInfo + strategy: LiteralContainerVersionStrategy + + @classmethod + def from_strategy(cls, strategy_or_version: Optional[LiteralContainerVersionStrategy] = ...) -> RefResolver: ... + @property + def tag(self) -> str: ... 
+ @staticmethod + def construct_base_image( + reg: LiteralContainerRegistry, strategy: Optional[LiteralContainerVersionStrategy] = ... + ) -> str: ... diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index 148d14e1..26daf585 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -1,16 +1,12 @@ # mypy: disable-error-code="misc" from __future__ import annotations import importlib.metadata -import inspect import logging import os import string import typing as t from pathlib import Path -import fs -import fs.copy -import fs.errors import orjson from simple_di import Provide, inject @@ -18,38 +14,27 @@ import bentoml import openllm_core from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions from bentoml._internal.configuration.containers import BentoMLContainer +from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg from . import oci if t.TYPE_CHECKING: - from fs.base import FS - - import openllm - from bentoml._internal.bento import BentoStore - from bentoml._internal.models.model import ModelStore - from openllm_core._typing_compat import ( - LiteralContainerRegistry, - LiteralContainerVersionStrategy, - LiteralSerialisation, - LiteralString, - ) + from openllm_core._typing_compat import LiteralString logger = logging.getLogger(__name__) OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD' -def build_editable( - path: str, package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm' -) -> str | None: +def build_editable(path, package='openllm'): """Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.""" - if not openllm_core.utils.check_bool_env(OPENLLM_DEV_BUILD, default=False): + if not check_bool_env(OPENLLM_DEV_BUILD, default=False): return None # We need to build the package in editable mode, so that we can import it from build import ProjectBuilder from build.env import IsolatedEnvBuilder - module_location = openllm_core.utils.pkg.source_locations(package) + module_location = pkg.source_locations(package) if not module_location: raise RuntimeError( 'Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.' @@ -68,12 +53,7 @@ def build_editable( ) -def construct_python_options( - llm: openllm.LLM[t.Any, t.Any], - llm_fs: FS, - extra_dependencies: tuple[str, ...] | None = None, - adapter_map: dict[str, str] | None = None, -) -> PythonOptions: +def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None): packages = ['openllm', 'scipy'] # apparently bnb misses this one if adapter_map is not None: packages += ['openllm[fine-tune]'] @@ -88,24 +68,18 @@ def construct_python_options( if req is not None: packages.extend(req) if str(os.environ.get('BENTOML_BUNDLE_LOCAL_BUILD', False)).lower() == 'false': - packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}") + packages.append(f"bentoml>={'.'.join([str(i) for i in pkg.pkg_version_info('bentoml')])}") - if not openllm_core.utils.is_torch_available(): - raise ValueError('PyTorch is not available. 
Make sure to have it locally installed.') - packages.extend( - ['torch==2.0.1+cu118', 'vllm==0.2.1.post1', 'xformers==0.0.22', 'bentoml[tracing]==1.1.9'] - ) # XXX: Currently locking this for correctness - wheels: list[str] = [] - built_wheels = [ - build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p)) - for p in ('openllm_core', 'openllm_client', 'openllm') - ] + # XXX: Currently locking this for correctness + packages.extend(['torch==2.0.1+cu118', 'vllm==0.2.1.post1', 'xformers==0.0.22', 'bentoml[tracing]==1.1.9']) + wheels = [] + built_wheels = [build_editable(llm_fs.getsyspath('/'), p) for p in ('openllm_core', 'openllm_client', 'openllm')] if all(i for i in built_wheels): wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)]) return PythonOptions( packages=packages, wheels=wheels, - lock_packages=False, + lock_packages=True, extra_index_url=[ 'https://download.pytorch.org/whl/cu118', 'https://huggingface.github.io/autogptq-index/whl/cu118/', @@ -114,15 +88,8 @@ def construct_python_options( def construct_docker_options( - llm: openllm.LLM[t.Any, t.Any], - _: FS, - quantize: LiteralString | None, - adapter_map: dict[str, str] | None, - dockerfile_template: str | None, - serialisation: LiteralSerialisation, - container_registry: LiteralContainerRegistry, - container_version_strategy: LiteralContainerVersionStrategy, -) -> DockerOptions: + llm, _, quantize, adapter_map, dockerfile_template, serialisation, container_registry, container_version_strategy +): from openllm_cli._factory import parse_config_options environ = parse_config_options(llm.config, llm.config['timeout'], 1.0, None, True, os.environ.copy()) @@ -145,7 +112,7 @@ def construct_docker_options( if quantize: env_dict['OPENLLM_QUANTIZE'] = str(quantize) return DockerOptions( - base_image=f'{oci.get_base_container_name(container_registry)}:{oci.get_base_container_tag(container_version_strategy)}', + base_image=oci.RefResolver.construct_base_image(container_registry, container_version_strategy), env=env_dict, dockerfile_template=dockerfile_template, ) @@ -160,21 +127,13 @@ class _ServiceVarsFormatter(string.Formatter): keyword: LiteralString = '__model_name__' identifier: LiteralString = '# openllm: model name' - def __init__(self, target: str): - """The formatter that extends model_name to be formatted the 'service.py'.""" + def __init__(self, target): super().__init__() self.target = target - def vformat(self, format_string: str, *args: t.Any, **attrs: t.Any) -> t.Any: + def vformat(self, format_string, *args, **attrs) -> str: return super().vformat(format_string, (), {self.keyword: self.target}) - def can_format(self, value: str) -> bool: - try: - self.parse(value) - return True - except ValueError: - return False - def parse_line(self, line: str, nl: bool = True) -> str: if self.identifier not in line: return line @@ -201,9 +160,7 @@ _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py' _service_vars_file = Path(os.path.abspath(__file__)).parent.parent / '_service_vars_pkg.py' -def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] | None, llm_fs: FS) -> None: - from openllm_core.utils import DEBUG - +def write_service(llm, llm_fs, adapter_map): model_id_formatter = ModelIdFormatter(llm.model_id) model_tag_formatter = ModelTagFormatter(str(llm.tag)) adapter_map_formatter = ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()) @@ -222,8 +179,8 @@ def write_service(llm: 
openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] | src_contents[i] = adapter_map_formatter.parse_line(it) script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n" + ''.join(src_contents) - if DEBUG: - logger.info('Generated script:\n%s', script) + if SHOW_CODEGEN: + logger.info('Generated _service_vars.py:\n%s', script) llm_fs.writetext('_service_vars.py', script) logger.debug( @@ -236,22 +193,20 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] | @inject def create_bento( - bento_tag: bentoml.Tag, - llm_fs: FS, - llm: openllm.LLM[t.Any, t.Any], - quantize: LiteralString | None, - dockerfile_template: str | None, - adapter_map: dict[str, str] | None = None, - extra_dependencies: tuple[str, ...] | None = None, - serialisation: LiteralSerialisation | None = None, - container_registry: LiteralContainerRegistry = 'ecr', - container_version_strategy: LiteralContainerVersionStrategy = 'release', - _bento_store: BentoStore = Provide[BentoMLContainer.bento_store], - _model_store: ModelStore = Provide[BentoMLContainer.model_store], -) -> bentoml.Bento: - _serialisation: LiteralSerialisation = openllm_core.utils.first_not_none( - serialisation, default=llm.config['serialisation'] - ) + bento_tag, + llm_fs, + llm, + quantize, + dockerfile_template, + adapter_map=None, + extra_dependencies=None, + serialisation=None, + container_registry='ecr', + container_version_strategy='release', + _bento_store=Provide[BentoMLContainer.bento_store], + _model_store=Provide[BentoMLContainer.model_store], +): + _serialisation = openllm_core.utils.first_not_none(serialisation, default=llm.config['serialisation']) labels = dict(llm.identifying_params) labels.update( { @@ -270,47 +225,31 @@ def create_bento( labels.update(adapter_map) logger.debug("Building Bento '%s' with model backend '%s'", bento_tag, llm.__llm_backend__) # add service.py definition to this temporary folder - write_service(llm, adapter_map, llm_fs) + write_service(llm, llm_fs, adapter_map) - llm_spec = ModelSpec.from_item({'tag': str(llm.tag), 'alias': llm.tag.name}) - build_config = BentoBuildConfig( - service=f"{llm.config['service_name']}:svc", - name=bento_tag.name, - labels=labels, - models=[llm_spec], - description=f"OpenLLM service for {llm.config['start_name']}", - include=list(llm_fs.walk.files()), - exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'], - python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), - docker=construct_docker_options( - llm, - llm_fs, - quantize, - adapter_map, - dockerfile_template, - _serialisation, - container_registry, - container_version_strategy, + bento = bentoml.Bento.create( + version=bento_tag.version, + build_ctx=llm_fs.getsyspath('/'), + build_config=BentoBuildConfig( + service=f"{llm.config['service_name']}:svc", + name=bento_tag.name, + labels=labels, + models=[ModelSpec.from_item({'tag': str(llm.tag), 'alias': llm.tag.name})], + description=f"OpenLLM service for {llm.config['start_name']}", + include=list(llm_fs.walk.files()), + exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'], + python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), + docker=construct_docker_options( + llm, + llm_fs, + quantize, + adapter_map, + dockerfile_template, + _serialisation, + container_registry, + container_version_strategy, + ), ), ) - bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/')) - # NOTE: the model_id_path here 
are only used for setting this environment variable within the container built with for BentoLLM. - service_fs_path = fs.path.join('src', llm.config['service_name']) - service_path = bento._fs.getsyspath(service_fs_path) - with open(service_path, 'r') as f: - service_contents = f.readlines() - - for it in service_contents: - if '__bento_name__' in it: - service_contents[service_contents.index(it)] = it.format(__bento_name__=str(bento.tag)) - - script = ''.join(service_contents) - if openllm_core.utils.DEBUG: - logger.info('Generated script:\n%s', script) - - bento._fs.writetext(service_fs_path, script) - if 'model_store' in inspect.signature(bento.save).parameters: - return bento.save(bento_store=_bento_store, model_store=_model_store) - # backward arguments. `model_store` is added recently - return bento.save(bento_store=_bento_store) + return bento.save(bento_store=_bento_store, model_store=_model_store) diff --git a/openllm-python/src/openllm/bundle/_package.pyi b/openllm-python/src/openllm/bundle/_package.pyi new file mode 100644 index 00000000..3289b3bd --- /dev/null +++ b/openllm-python/src/openllm/bundle/_package.pyi @@ -0,0 +1,52 @@ +from typing import Dict, Optional, Tuple + +from fs.base import FS +from typing_extensions import LiteralString + +from bentoml import Bento, Tag +from bentoml._internal.bento import BentoStore +from bentoml._internal.bento.build_config import DockerOptions, PythonOptions +from bentoml._internal.models.model import ModelStore +from openllm_core._typing_compat import ( + LiteralContainerRegistry, + LiteralContainerVersionStrategy, + LiteralQuantise, + LiteralSerialisation, + M, + T, +) + +from .._llm import LLM + +def build_editable(path: str, package: LiteralString) -> Optional[str]: ... +def construct_python_options( + llm: LLM[M, T], + llm_fs: FS, + extra_dependencies: Optional[Tuple[str, ...]] = ..., + adapter_map: Optional[Dict[str, str]] = ..., +) -> PythonOptions: ... +def construct_docker_options( + llm: LLM[M, T], + llm_fs: FS, + quantize: Optional[LiteralQuantise], + adapter_map: Optional[Dict[str, str]], + dockerfile_template: Optional[str], + serialisation: LiteralSerialisation, + container_registry: LiteralContainerRegistry, + container_version_strategy: LiteralContainerVersionStrategy, +) -> DockerOptions: ... +def write_service(llm: LLM[M, T], llm_fs: FS, adapter_map: Optional[Dict[str, str]]) -> None: ... +def create_bento( + bento_tag: Tag, + llm_fs: FS, + llm: LLM[M, T], + quantize: Optional[LiteralQuantise], + dockerfile_template: Optional[str], + adapter_map: Optional[Dict[str, str]] = ..., + extra_dependencies: Optional[Tuple[str, ...]] = ..., + serialisation: Optional[LiteralSerialisation] = ..., + container_registry: LiteralContainerRegistry = ..., + container_version_strategy: LiteralContainerVersionStrategy = ..., + _bento_store: BentoStore = ..., + _model_store: ModelStore = ..., +) -> Bento: ... 
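A quick sketch of the consolidated base-image helper may help reviewers: the bundle stubs above re-export RefResolver, and the oci changes below fold get_base_container_name/get_base_container_tag into a single RefResolver.construct_base_image static method. The snippet assumes the API exactly as declared in these stubs; the resolved tag depends on the latest GitHub release at call time, so the value in the comment is only illustrative.

    import openllm

    # registry keys mirror _CONTAINER_REGISTRY: 'docker', 'gh' or 'ecr'; strategy 'release'
    # resolves the newest GitHub release, while 'latest'/'nightly' pin the moving tag
    base_image = openllm.bundle.RefResolver.construct_base_image('ecr', 'release')
    # e.g. 'public.ecr.aws/y5w8i4y6/bentoml/openllm:<resolved release version>'

This is the same call path create_bento now takes through construct_docker_options.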
diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py index 8bb4d19d..ddb35c4f 100644 --- a/openllm-python/src/openllm/bundle/oci/__init__.py +++ b/openllm-python/src/openllm/bundle/oci/__init__.py @@ -1,26 +1,21 @@ -# mypy: disable-error-code="misc" from __future__ import annotations import functools import importlib import logging import os import pathlib -import typing as t import attr +from openllm_core._typing_compat import LiteralContainerVersionStrategy from openllm_core.exceptions import OpenLLMException -from openllm_core.utils import codegen from openllm_core.utils.lazy import VersionInfo -if t.TYPE_CHECKING: - from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy, RefTuple - logger = logging.getLogger(__name__) ROOT_DIR = pathlib.Path(os.path.abspath('__file__')).parent.parent.parent -_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = { +_CONTAINER_REGISTRY = { 'docker': 'docker.io/bentoml/openllm', 'gh': 'ghcr.io/bentoml/openllm', 'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm', @@ -30,80 +25,48 @@ _CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = { _OWNER, _REPO = 'bentoml', 'openllm' -def _convert_version_from_string(s: str) -> VersionInfo: - return VersionInfo.from_version_string(s) - - -_RefTuple: type[RefTuple] = codegen.make_attr_tuple_class('_RefTuple', ['git_hash', 'version', 'strategy']) - - @attr.attrs(eq=False, order=False, slots=True, frozen=True) class RefResolver: git_hash: str = attr.field() - version: VersionInfo = attr.field(converter=_convert_version_from_string) + version: VersionInfo = attr.field(converter=lambda s: VersionInfo.from_version_string(s)) strategy: LiteralContainerVersionStrategy = attr.field() - @classmethod - def _release_ref(cls, version_str: str | None = None) -> RefTuple: - try: - from ghapi.all import GhApi - - ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False) - meta = t.cast(t.Dict[str, t.Any], ghapi.repos.get_latest_release()) - except Exception as err: - raise OpenLLMException('Failed to determine latest release version.') from err - _use_base_strategy = version_str is None - if version_str is None: - # NOTE: This strategy will only support openllm>0.2.12 - version_str = meta['name'].lstrip('v') - version = (ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str) - else: - version = ('', version_str) - return _RefTuple((*version, 'release' if _use_base_strategy else 'custom')) - @classmethod @functools.lru_cache(maxsize=64) - def from_strategy(cls, strategy_or_version: LiteralContainerVersionStrategy | None = None) -> RefResolver: + def from_strategy(cls, strategy_or_version=None): # using default strategy if strategy_or_version is None or strategy_or_version == 'release': - return cls(*cls._release_ref()) + try: + from ghapi.all import GhApi + + ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False) + meta = ghapi.repos.get_latest_release() + git_hash = ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'] + except Exception as err: + raise OpenLLMException('Failed to determine latest release version.') from err + return cls(git_hash=git_hash, version=meta['name'].lstrip('v'), strategy='release') elif strategy_or_version in ('latest', 'nightly'): # latest is nightly return cls(git_hash='latest', version='0.0.0', strategy='latest') else: raise ValueError(f'Unknown strategy: {strategy_or_version}') @property - def tag(self) -> str: + def tag(self): return 'latest' if 
self.strategy in {'latest', 'nightly'} else repr(self.version) - -@functools.lru_cache(maxsize=256) -def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str: - return RefResolver.from_strategy(strategy).tag + @staticmethod + def construct_base_image(reg, strategy=None): + return f'{_CONTAINER_REGISTRY[reg]}:{RefResolver.from_strategy(strategy).tag}' -def get_base_container_name(reg: LiteralContainerRegistry) -> str: - return _CONTAINER_REGISTRY[reg] +__all__ = ['CONTAINER_NAMES', 'RefResolver', 'supported_registries'] -if t.TYPE_CHECKING: - CONTAINER_NAMES: dict[LiteralContainerRegistry, str] - supported_registries: list[str] - -__all__ = [ - 'CONTAINER_NAMES', - 'get_base_container_tag', - 'get_base_container_name', - 'supported_registries', - 'RefResolver', -] - - -def __dir__() -> list[str]: +def __dir__(): return sorted(__all__) -def __getattr__(name: str) -> t.Any: +def __getattr__(name): if name == 'supported_registries': return functools.lru_cache(1)(lambda: list(_CONTAINER_REGISTRY))() elif name == 'CONTAINER_NAMES': diff --git a/openllm-python/src/openllm/testing.py b/openllm-python/src/openllm/testing.py index 1fceeaa0..f51355bf 100644 --- a/openllm-python/src/openllm/testing.py +++ b/openllm-python/src/openllm/testing.py @@ -1,5 +1,3 @@ -"""Tests utilities for OpenLLM.""" - from __future__ import annotations import contextlib import logging diff --git a/openllm-python/src/openllm_cli/_factory.py b/openllm-python/src/openllm_cli/_factory.py index fff432c6..b55f27eb 100644 --- a/openllm-python/src/openllm_cli/_factory.py +++ b/openllm-python/src/openllm_cli/_factory.py @@ -429,7 +429,7 @@ def workers_per_resource_option( - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models. - - ``conserved``: This will determine the number of available GPU resources, and only assign one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``. + - ``conserved``: This will determine the number of available GPU resources and assign a single worker to use all of them. For example, if there are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``. """ + ( """\n diff --git a/openllm-python/src/openllm_cli/entrypoint.py b/openllm-python/src/openllm_cli/entrypoint.py index 7741fd6f..c809c110 100644 --- a/openllm-python/src/openllm_cli/entrypoint.py +++ b/openllm-python/src/openllm_cli/entrypoint.py @@ -77,6 +77,7 @@ from openllm_core.utils import ( compose, configure_logging, first_not_none, + gen_random_uuid, get_debug_mode, get_disable_warnings, get_quiet_mode, @@ -986,7 +987,6 @@ def build_command( > To build the bento with compiled OpenLLM, make sure to prepend HATCH_BUILD_HOOKS_ENABLE=1. Make sure that the deployment > target also use the same Python version and architecture as build machine.
""" - from openllm._llm import normalise_model_name from openllm.serialisation.transformers.weights import has_safetensors_weights if model_id in openllm.CONFIG_MAPPING: @@ -1046,7 +1046,7 @@ def build_command( labels = dict(llm.identifying_params) labels.update({'_type': llm.llm_type, '_framework': llm.__llm_backend__}) - with fs.open_fs(f'temp://llm_{normalise_model_name(model_id)}') as llm_fs: + with fs.open_fs(f'temp://llm_{gen_random_uuid()}') as llm_fs: dockerfile_template_path = None if dockerfile_template: with dockerfile_template: diff --git a/openllm-python/src/openllm_cli/extension/build_base_container.py b/openllm-python/src/openllm_cli/extension/build_base_container.py index a5783c79..93d560c2 100644 --- a/openllm-python/src/openllm_cli/extension/build_base_container.py +++ b/openllm-python/src/openllm_cli/extension/build_base_container.py @@ -43,16 +43,13 @@ def build_container( "This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'" ) if not registries: - tags: dict[str | LiteralContainerRegistry, str] = { - alias: f'{value}:{openllm.bundle.get_base_container_tag(version_strategy)}' - for alias, value in openllm.bundle.CONTAINER_NAMES.items() + tags = { + alias: openllm.bundle.RefResolver.construct_base_image(alias, version_strategy) + for alias in openllm.bundle.CONTAINER_NAMES } else: registries = [registries] if isinstance(registries, str) else list(registries) - tags = { - name: f'{openllm.bundle.CONTAINER_NAMES[name]}:{openllm.bundle.get_base_container_tag(version_strategy)}' - for name in registries - } + tags = {name: openllm.bundle.RefResolver.construct_base_image(name, version_strategy) for name in registries} try: outputs = _BUILDER.build( file=pathlib.Path(__file__).parent.joinpath('Dockerfile').resolve().__fspath__(), diff --git a/openllm-python/tests/conftest.py b/openllm-python/tests/conftest.py index 8561b899..e49b2656 100644 --- a/openllm-python/tests/conftest.py +++ b/openllm-python/tests/conftest.py @@ -20,9 +20,7 @@ _PROMPT_MAPPING = { } -def parametrise_local_llm( - model: str -) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]: +def parametrise_local_llm(model: str) -> t.Generator[tuple[str, openllm.LLM[t.Any, t.Any]], None, None]: if model not in _MODELING_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.") backends: tuple[LiteralBackend, ...] 
= ('pt',) diff --git a/ruff.toml b/ruff.toml index 00e5ceb7..0b6f16c5 100644 --- a/ruff.toml +++ b/ruff.toml @@ -121,3 +121,4 @@ docstring-quotes = "double" "openllm-python/src/openllm/_llm.py" = ["F811"] "openllm-core/src/openllm_core/utils/import_utils.py" = ["PLW0603", "F811"] "openllm-core/src/openllm_core/_configuration.py" = ["F811", "Q001"] +"openllm-python/src/openllm/__init__.pyi" = ["I001"] diff --git a/tools/dependencies.py b/tools/dependencies.py index c3b66f35..ce55507c 100755 --- a/tools/dependencies.py +++ b/tools/dependencies.py @@ -140,7 +140,7 @@ class Dependencies: return cls(*decls) -lower_bentoml_constraint = '1.1.2' +lower_bentoml_constraint = '1.1.9' _BENTOML_EXT = ['io'] _TRANSFORMERS_EXT = ['torch', 'tokenizers'] diff --git a/tools/update-config-stubs.py b/tools/update-config-stubs.py index 9340b83e..7d4c5007 100755 --- a/tools/update-config-stubs.py +++ b/tools/update-config-stubs.py @@ -14,10 +14,14 @@ END_ATTRS_COMMENT = f'# {os.path.basename(__file__)}: attrs stop\n' # Stubs for auto class START_AUTO_STUBS_COMMENT = f'# {os.path.basename(__file__)}: auto stubs start\n' END_AUTO_STUBS_COMMENT = f'# {os.path.basename(__file__)}: auto stubs stop\n' +# Stubs for actual imports +START_IMPORT_STUBS_COMMENT = f'# {os.path.basename(__file__)}: import stubs start\n' +END_IMPORT_STUBS_COMMENT = f'# {os.path.basename(__file__)}: import stubs stop\n' ROOT = Path(__file__).parent.parent _TARGET_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / '_configuration.py' _TARGET_AUTO_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / 'config' / 'configuration_auto.py' +_TARGET_INIT_FILE = ROOT / 'openllm-python' / 'src' / 'openllm' / '__init__.pyi' sys.path.insert(0, (ROOT / 'openllm-core' / 'src').__fspath__()) from openllm_core._configuration import GenerationConfig, ModelSettings, SamplingParams @@ -216,6 +220,22 @@ def main() -> int: ) with _TARGET_AUTO_FILE.open('w') as f: f.writelines(processed) + + with _TARGET_INIT_FILE.open('r') as f: + processed = f.readlines() + start_import_stubs_idx, end_import_stubs_idx = ( + processed.index(START_IMPORT_STUBS_COMMENT), + processed.index(END_IMPORT_STUBS_COMMENT), + ) + lines = f'from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING,CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,AutoConfig as AutoConfig,{",".join([a+" as "+a for a in CONFIG_MAPPING_NAMES.values()])}\n' + processed = ( + processed[:start_import_stubs_idx] + + [START_IMPORT_STUBS_COMMENT, lines, END_IMPORT_STUBS_COMMENT] + + processed[end_import_stubs_idx + 1 :] + ) + with _TARGET_INIT_FILE.open('w') as f: + f.writelines(processed) + return 0 diff --git a/tools/update-mypy.py b/tools/update-mypy.py new file mode 100755 index 00000000..bf5cbfb3 --- /dev/null +++ b/tools/update-mypy.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +import concurrent.futures +import configparser +import os +from typing import List + + +# Function to find .pyi files in a given directory +def pyi_in_subdir(directory: str, git_root: str) -> List[str]: + pyi_files = [] + for root, _, files in os.walk(directory): + for file in files: + if file.endswith('.pyi') or file == '_typing_compat.py': + full_path = os.path.join(root, file) + # Convert to relative path with respect to the git root + relative_path = os.path.relpath(full_path, git_root) + pyi_files.append(relative_path) + return pyi_files + + +def find_pyi_files(git_root: str) -> List[str]: + # List all subdirectories + subdirectories = [ + os.path.join(git_root, name) for name in os.listdir(git_root) if
os.path.isdir(os.path.join(git_root, name)) + ] + + # Use a thread pool to execute searches concurrently + with concurrent.futures.ThreadPoolExecutor() as executor: + # Map of future to subdirectory + future_to_subdir = {executor.submit(pyi_in_subdir, subdir, git_root): subdir for subdir in subdirectories} + + all_pyi_files = set() + for future in concurrent.futures.as_completed(future_to_subdir): + pyi_files = future.result() + all_pyi_files.update(pyi_files) + + return list(all_pyi_files) + + +# Function to update mypy.ini file +def update_mypy_ini(pyi_files: List[str], mypy_ini_path: str) -> int: + config = configparser.ConfigParser() + config.read(mypy_ini_path) + + # Existing files from mypy.ini + existing_files = config.get('mypy', 'files', fallback='').split(', ') + + # Add new .pyi files if they are not already in the list + updated_files = existing_files + [f for f in pyi_files if f not in existing_files] + + # Update the 'files' entry + config['mypy']['files'] = ', '.join(updated_files) + + # Write changes back to mypy.ini + with open(mypy_ini_path, 'w') as configfile: + config.write(configfile) + return 0 + + +ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +MYPY_CONFIG = os.path.join(ROOT, 'mypy.ini') + +if __name__ == '__main__': + raise SystemExit(update_mypy_ini(find_pyi_files(ROOT), MYPY_CONFIG))
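For completeness, the new tools/update-mypy.py walks every top-level directory for *.pyi stubs (plus _typing_compat.py) and appends anything missing to the files entry of mypy.ini. A minimal sketch of calling its two helpers from the repository root; loading the script via importlib is an assumption made here, since its filename contains a dash and it is normally just executed directly:

    import importlib.util
    import os

    # load tools/update-mypy.py as a module so its helpers can be called (sketch only)
    spec = importlib.util.spec_from_file_location('update_mypy', 'tools/update-mypy.py')
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)

    stubs = mod.find_pyi_files(os.getcwd())  # e.g. 'openllm-python/src/openllm/bundle/_package.pyi'
    mod.update_mypy_ini(stubs, os.path.join(os.getcwd(), 'mypy.ini'))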