feat(type): provide structured annotations stubs (#663)

* feat(type): provide client stubs

separation of concerns for a more concise code base

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* docs: update changelog

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron Pham
2023-11-16 02:58:45 -05:00
committed by GitHub
parent c6264f3af7
commit 4a6f13ddd2
32 changed files with 795 additions and 582 deletions
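
The recurring pattern in this commit is to strip inline annotations from the runtime modules and move the full signatures into sibling `.pyi` stubs. A minimal sketch of that split, using a hypothetical helper:

# helpers.py - runtime module, kept annotation-free (hypothetical file)
def dasherise(name):
    return name.replace('_', '-')

# helpers.pyi - sibling stub that carries the signatures for type checkers
def dasherise(name: str) -> str: ...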


@@ -1,14 +1,3 @@
"""OpenLLM.
An open platform for operating large language models in production. Fine-tune, serve,
deploy, and monitor any LLMs with ease.
* Built-in support for StableLM, Llama 2, Dolly, Flan-T5, Vicuna
* Option to bring your own fine-tuned LLMs
* Online Serving with HTTP, gRPC, SSE(coming soon) or custom API
* Native integration with BentoML and LangChain for custom LLM apps
"""
import logging as _logging
import os as _os
import pathlib as _pathlib
@@ -57,13 +46,14 @@ __lazy = utils.LazyModule(
'entrypoints': ['mount_entrypoints'],
'serialisation': ['ggml', 'transformers'],
'_quantisation': ['infer_quantisation_config'],
'_llm': ['LLM', 'LLMRunner', 'LLMRunnable'],
'_llm': ['LLM'],
'_generation': [
'StopSequenceCriteria',
'StopOnTokens',
'LogitsProcessorList',
'StoppingCriteriaList',
'prepare_logits_processor',
'get_context_length',
'is_sentence_complete',
'is_partial_stop',
],
},
extra_objects={


@@ -1,3 +1,21 @@
"""OpenLLM.
===========
An open platform for operating large language models in production.
Fine-tune, serve, deploy, and monitor any LLMs with ease.
* Built-in support for Mistral, Llama 2, Yi, StableLM, Dolly, Flan-T5, Vicuna
* Option to bring your own fine-tuned LLMs
* Online Serving with HTTP, gRPC, SSE or custom API
* Native integration with BentoML, LangChain, OpenAI compatible endpoints, LlamaIndex for custom LLM apps
"""
# fmt: off
# update-config-stubs.py: import stubs start
from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING,CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,AutoConfig as AutoConfig,BaichuanConfig as BaichuanConfig,ChatGLMConfig as ChatGLMConfig,DollyV2Config as DollyV2Config,FalconConfig as FalconConfig,FlanT5Config as FlanT5Config,GPTNeoXConfig as GPTNeoXConfig,LlamaConfig as LlamaConfig,MistralConfig as MistralConfig,MPTConfig as MPTConfig,OPTConfig as OPTConfig,StableLMConfig as StableLMConfig,StarCoderConfig as StarCoderConfig,YiConfig as YiConfig
# update-config-stubs.py: import stubs stop
# fmt: on
import openllm_cli as _cli
from openllm_cli._sdk import (
build as build,
@@ -16,23 +34,6 @@ from openllm_core._schemas import (
GenerationOutput as GenerationOutput,
MetadataOutput as MetadataOutput,
)
from openllm_core.config import (
CONFIG_MAPPING as CONFIG_MAPPING,
CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,
AutoConfig as AutoConfig,
BaichuanConfig as BaichuanConfig,
ChatGLMConfig as ChatGLMConfig,
DollyV2Config as DollyV2Config,
FalconConfig as FalconConfig,
FlanT5Config as FlanT5Config,
GPTNeoXConfig as GPTNeoXConfig,
LlamaConfig as LlamaConfig,
MistralConfig as MistralConfig,
MPTConfig as MPTConfig,
OPTConfig as OPTConfig,
StableLMConfig as StableLMConfig,
StarCoderConfig as StarCoderConfig,
)
from . import (
bundle as bundle,
@@ -44,13 +45,14 @@ from . import (
)
from ._deprecated import Runner as Runner
from ._generation import (
LogitsProcessorList as LogitsProcessorList,
StopOnTokens as StopOnTokens,
StoppingCriteriaList as StoppingCriteriaList,
StopSequenceCriteria as StopSequenceCriteria,
prepare_logits_processor as prepare_logits_processor,
is_partial_stop as is_partial_stop,
is_sentence_complete as is_sentence_complete,
get_context_length as get_context_length,
)
from ._llm import LLM as LLM, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner
from ._llm import LLM as LLM
from ._quantisation import infer_quantisation_config as infer_quantisation_config
from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
from .client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient
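
With the stub above in place, the public surface remains a plain `import openllm`. A hedged usage sketch; the model name is illustrative and `AutoConfig.for_model` is assumed from this package's config API:

import openllm

print(sorted(openllm.CONFIG_MAPPING_NAMES))  # registered model families
cfg = openllm.AutoConfig.for_model('llama')  # assumed AutoConfig helper
print(cfg['max_new_tokens'])                 # LLMConfig supports item access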


@@ -1,43 +1,24 @@
# mypy: disable-error-code="misc"
from __future__ import annotations
import typing as t
import transformers
if t.TYPE_CHECKING:
import torch
import openllm
# reexport from transformers
LogitsProcessorList = transformers.LogitsProcessorList
StoppingCriteriaList = transformers.StoppingCriteriaList
class StopSequenceCriteria(transformers.StoppingCriteria):
def __init__(
self,
stop_sequences: str | list[str],
tokenizer: transformers.PreTrainedTokenizer
| transformers.PreTrainedTokenizerBase
| transformers.PreTrainedTokenizerFast,
):
def __init__(self, stop_sequences, tokenizer):
if isinstance(stop_sequences, str):
stop_sequences = [stop_sequences]
self.stop_sequences, self.tokenizer = stop_sequences, tokenizer
def __call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool:
def __call__(self, input_ids, scores, **kwargs):
return any(
self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences
)
class StopOnTokens(transformers.StoppingCriteria):
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool:
def __call__(self, input_ids, scores, **kwargs):
return input_ids[0][-1] in {50278, 50279, 50277, 1, 0}
def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsProcessorList:
def prepare_logits_processor(config):
generation_config = config.generation_config
logits_processor = transformers.LogitsProcessorList()
if generation_config['temperature'] >= 1e-5 and generation_config['temperature'] != 1.0:
@@ -55,7 +36,7 @@ def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsPr
SEQLEN_KEYS = ['max_sequence_length', 'seq_length', 'max_position_embeddings', 'max_seq_len', 'model_max_length']
def get_context_length(config: transformers.PretrainedConfig) -> int:
def get_context_length(config):
rope_scaling = getattr(config, 'rope_scaling', None)
rope_scaling_factor = config.rope_scaling['factor'] if rope_scaling else 1.0
for key in SEQLEN_KEYS:
@@ -64,11 +45,11 @@ def get_context_length(config: transformers.PretrainedConfig) -> int:
return 2048
def is_sentence_complete(output: str) -> bool:
def is_sentence_complete(output):
return output.endswith(('.', '?', '!', '...', '。', '？', '！', '…', '"', "'", '”'))
def is_partial_stop(output: str, stop_str: str) -> bool:
def is_partial_stop(output, stop_str):
"""Check whether the output contains a partial stop str."""
for i in range(min(len(output), len(stop_str))):
if stop_str.startswith(output[-i:]):


@@ -0,0 +1,28 @@
from typing import Any, List, Union
from torch import FloatTensor, LongTensor
from transformers import (
LogitsProcessorList,
PretrainedConfig,
PreTrainedTokenizer,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
)
from openllm_core import LLMConfig
Tokenizer = Union[PreTrainedTokenizerBase, PreTrainedTokenizer, PreTrainedTokenizerFast]
class StopSequenceCriteria:
stop_sequences: List[str]
tokenizer: Tokenizer
def __init__(self, stop_sequences: Union[str, List[str]], tokenizer: Tokenizer) -> None: ...
def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ...
class StopOnTokens:
def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ...
def prepare_logits_processor(config: LLMConfig) -> LogitsProcessorList: ...
def get_context_length(config: PretrainedConfig) -> int: ...
def is_sentence_complete(output: str) -> bool: ...
def is_partial_stop(output: str, stop_str: str) -> bool: ...
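
For context, a hedged sketch of wiring these helpers into a plain `transformers` generation loop; the checkpoint and stop sequence are illustrative:

import transformers
from openllm._generation import StopSequenceCriteria, StoppingCriteriaList, get_context_length

tok = transformers.AutoTokenizer.from_pretrained('gpt2')
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')
print(get_context_length(model.config))  # context window derived from the HF config

inputs = tok('Q: What is a type stub?\nA:', return_tensors='pt')
stopping = StoppingCriteriaList([StopSequenceCriteria('\nQ:', tok)])
out = model.generate(**inputs, max_new_tokens=64, stopping_criteria=stopping)
print(tok.decode(out[0], skip_special_tokens=True))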


@@ -1,6 +1,4 @@
# mypy: disable-error-code="name-defined,attr-defined"
from __future__ import annotations
import abc
import functools
import logging
import os
@@ -10,14 +8,12 @@ import typing as t
import attr
import inflection
import orjson
from huggingface_hub import hf_hub_download
import bentoml
import openllm
import openllm_core
from bentoml._internal.models.model import ModelSignature
from bentoml._internal.runner.runner_handle import DummyRunnerHandle
from openllm_core._schemas import CompletionChunk, GenerationOutput
from openllm_core._schemas import GenerationOutput
from openllm_core._typing_compat import (
AdapterMap,
AdapterTuple,
@@ -43,32 +39,27 @@ from openllm_core.utils import (
converter,
first_not_none,
flatten_attrs,
gen_random_uuid,
generate_hash_from_file,
get_debug_mode,
get_disable_warnings,
get_quiet_mode,
is_peft_available,
is_vllm_available,
resolve_filepath,
validate_is_path,
)
from ._quantisation import infer_quantisation_config
from ._strategies import CascadingResourceStrategy
from .exceptions import ForbiddenAttributeError, OpenLLMException
from .serialisation.constants import PEFT_CONFIG_NAME
if t.TYPE_CHECKING:
import torch
import transformers
from peft.config import PeftConfig
from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM
from bentoml._internal.runner.runnable import RunnableMethod
from bentoml._internal.runner.runner import RunnerMethod
from bentoml._internal.runner.runner_handle import RunnerHandle
from bentoml._internal.runner.strategy import Strategy
from openllm_core._configuration import LLMConfig
from openllm_core.utils.representation import ReprArgs
from ._runners import Runner
ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]]
@@ -84,16 +75,15 @@ def normalise_model_name(name: str) -> str:
return inflection.dasherize(name)
def resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
"""Resolve the type of the PeftConfig given the adapter_map.
def _resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
try:
from huggingface_hub import hf_hub_download
except ImportError:
raise MissingDependencyError(
"Failed to import 'huggingface_hub'. Make sure to do 'pip install \"openllm[fine-tune]\"'"
) from None
This is similar to how PeftConfig resolves its config type.
Args:
adapter_map: The given mapping from either SDK or CLI. See CLI docs for more information.
"""
resolved: AdapterMap = {}
_has_set_default = False
for path_or_adapter_id, name in adapter_map.items():
if name is None:
raise ValueError('Adapter name must be specified.')
@@ -107,7 +97,7 @@ def resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
with open(config_file, 'r') as file:
resolved_config = orjson.loads(file.read())
# all peft_type should be available in PEFT_CONFIG_NAME
_peft_type: AdapterType = resolved_config['peft_type'].lower()
_peft_type = resolved_config['peft_type'].lower()
if _peft_type not in resolved:
resolved[_peft_type] = ()
resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
@@ -151,7 +141,7 @@ class LLM(t.Generic[M, T], ReprMixin):
__llm_config__: LLMConfig | None = None
__llm_backend__: LiteralBackend = None # type: ignore
__llm_quantization_config__: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None = None
__llm_runner__: t.Optional[LLMRunner[M, T]] = None
__llm_runner__: t.Optional[Runner[M, T]] = None
__llm_model__: t.Optional[M] = None
__llm_tokenizer__: t.Optional[T] = None
__llm_adapter_map__: t.Optional[ResolvedAdapterMap] = None
@@ -159,35 +149,29 @@ class LLM(t.Generic[M, T], ReprMixin):
def __init__(
self,
model_id: str,
model_version: str | None = None,
model_tag: str | bentoml.Tag | None = None,
prompt_template: PromptTemplate | str | None = None,
system_message: str | None = None,
llm_config: LLMConfig | None = None,
backend: LiteralBackend | None = None,
*args: t.Any,
quantize: LiteralQuantise | None = None,
quantization_config: transformers.BitsAndBytesConfig
| transformers.GPTQConfig
| transformers.AwqConfig
| None = None,
adapter_map: dict[str, str] | None = None,
serialisation: LiteralSerialisation = 'safetensors',
trust_remote_code: bool = False,
embedded: bool = False,
torch_dtype: LiteralDtype | t.Literal['auto', 'half', 'float'] = 'auto',
**attrs: t.Any,
model_id,
model_version=None,
model_tag=None,
prompt_template=None,
system_message=None,
llm_config=None,
backend=None,
*args,
quantize=None,
quantization_config=None,
adapter_map=None,
serialisation='safetensors',
trust_remote_code=False,
embedded=False,
torch_dtype='auto',
low_cpu_mem_usage=True,
**attrs,
):
# low_cpu_mem_usage is only available for PyTorch models; it helps avoid OOM on systems with low memory
low_cpu_mem_usage = attrs.pop('low_cpu_mem_usage', True)
_local = False
if validate_is_path(model_id):
model_id, _local = resolve_filepath(model_id), True
backend = first_not_none(
backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if openllm.utils.is_vllm_available() else 'pt'
)
backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt')
torch_dtype = first_not_none(os.getenv('TORCH_DTYPE'), torch_dtype, default='auto')
quantize = first_not_none(quantize, os.getenv('OPENLLM_QUANTIZE'), default=None)
# elif quantization_config is None and quantize is not None:
@@ -215,7 +199,7 @@ class LLM(t.Generic[M, T], ReprMixin):
quantization_config=quantization_config,
quantise=quantize,
model_decls=args,
adapter_map=resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
adapter_map=_resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
serialisation=serialisation,
local=_local,
prompt_template=prompt_template,
@@ -244,7 +228,7 @@ class LLM(t.Generic[M, T], ReprMixin):
self.runner.init_local(quiet=True)
@property
def _torch_dtype(self) -> torch.dtype:
def _torch_dtype(self):
import torch
import transformers
@@ -298,11 +282,15 @@ class LLM(t.Generic[M, T], ReprMixin):
super().__setattr__(attr, value)
@property
def _model_attrs(self) -> dict[str, t.Any]:
def _model_attrs(self):
return {**self.import_kwargs[0], **self.__model_attrs}
@_model_attrs.setter
def _model_attrs(self, value):
self.__model_attrs = value
@property
def _tokenizer_attrs(self) -> dict[str, t.Any]:
def _tokenizer_attrs(self):
return {**self.import_kwargs[1], **self.__tokenizer_attrs}
@property
@@ -319,41 +307,42 @@ class LLM(t.Generic[M, T], ReprMixin):
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
import torch
return {'device_map': 'auto' if torch.cuda.is_available() else None, 'torch_dtype': self._torch_dtype}, {
'padding_side': 'left',
'truncation_side': 'left',
}
model_attrs = {'device_map': 'auto' if torch.cuda.is_available() else None, 'torch_dtype': self._torch_dtype}
tokenizer_attrs = {'padding_side': 'left', 'truncation_side': 'left'}
return model_attrs, tokenizer_attrs
@property
def trust_remote_code(self) -> bool:
def trust_remote_code(self):
env = os.getenv('TRUST_REMOTE_CODE')
if env is not None:
return str(env).upper() in ENV_VARS_TRUE_VALUES
return self.__llm_trust_remote_code__
@property
def runner_name(self) -> str:
def runner_name(self):
return f"llm-{self.config['start_name']}-runner"
@property
def model_id(self) -> str:
def model_id(self):
return self._model_id
@property
def revision(self) -> str:
return t.cast(str, self._revision)
def revision(self):
return self._revision
@property
def tag(self) -> bentoml.Tag:
def tag(self):
return self._tag
@property
def bentomodel(self) -> bentoml.Model:
def bentomodel(self):
return openllm.serialisation.get(self)
@property
def quantization_config(self) -> transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig:
def quantization_config(self):
if self.__llm_quantization_config__ is None:
from ._quantisation import infer_quantisation_config
if self._quantization_config is not None:
self.__llm_quantization_config__ = self._quantization_config
elif self._quantise is not None:
@@ -365,55 +354,55 @@ class LLM(t.Generic[M, T], ReprMixin):
return self.__llm_quantization_config__
@property
def has_adapters(self) -> bool:
def has_adapters(self):
return self._adapter_map is not None
@property
def local(self) -> bool:
def local(self):
return self._local
@property
def quantise(self) -> LiteralQuantise | None:
def quantise(self):
return self._quantise
# NOTE: The section below defines a loose contract with langchain's LLM interface.
@property
def llm_type(self) -> str:
def llm_type(self):
return normalise_model_name(self._model_id)
@property
def identifying_params(self) -> DictStrAny:
def llm_parameters(self):
return (self._model_decls, self._model_attrs), self._tokenizer_attrs
@property
def identifying_params(self):
return {
'configuration': self.config.model_dump_json().decode(),
'model_ids': orjson.dumps(self.config['model_ids']).decode(),
'model_id': self.model_id,
}
@property
def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]:
return (self._model_decls, self._model_attrs), self._tokenizer_attrs
# NOTE: This section is the actual model, tokenizer, and config reference here.
@property
def config(self) -> LLMConfig:
def config(self):
if self.__llm_config__ is None:
self.__llm_config__ = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs)
return self.__llm_config__
@property
def tokenizer(self) -> T:
def tokenizer(self):
if self.__llm_tokenizer__ is None:
self.__llm_tokenizer__ = openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1])
return self.__llm_tokenizer__
@property
def runner(self) -> LLMRunner[M, T]:
def runner(self):
if self.__llm_runner__ is None:
self.__llm_runner__ = _RunnerFactory(self)
return self.__llm_runner__
@property
def model(self) -> M:
def model(self):
if self.__llm_model__ is None:
model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
# If OOM, then it is probably you don't have enough VRAM to run this model.
@@ -439,7 +428,7 @@ class LLM(t.Generic[M, T], ReprMixin):
return self.__llm_model__
@property
def adapter_map(self) -> ResolvedAdapterMap:
def adapter_map(self):
try:
import peft as _ # noqa: F401
except ImportError as err:
@@ -461,9 +450,7 @@ class LLM(t.Generic[M, T], ReprMixin):
self.__llm_adapter_map__ = _map
return self.__llm_adapter_map__
def prepare_for_training(
self, adapter_type: AdapterType = 'lora', use_gradient_checking: bool = True, **attrs: t.Any
) -> tuple[PeftModel | PeftModelForCausalLM | PeftModelForSeq2SeqLM, T]:
def prepare_for_training(self, adapter_type='lora', use_gradient_checking=True, **attrs):
from peft.mapping import get_peft_model
from peft.utils.other import prepare_model_for_kbit_training
@@ -484,15 +471,8 @@ class LLM(t.Generic[M, T], ReprMixin):
return model, self.tokenizer
async def generate(
self,
prompt: str | None,
prompt_token_ids: list[int] | None = None,
stop: str | t.Iterable[str] | None = None,
stop_token_ids: list[int] | None = None,
request_id: str | None = None,
adapter_name: str | None = None,
**attrs: t.Any,
) -> GenerationOutput:
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
):
config = self.config.model_construct_env(**attrs)
texts: list[list[str]] = [[]] * config['n']
token_ids: list[list[int]] = [[]] * config['n']
@@ -515,15 +495,8 @@ class LLM(t.Generic[M, T], ReprMixin):
)
async def generate_iterator(
self,
prompt: str | None,
prompt_token_ids: list[int] | None = None,
stop: str | t.Iterable[str] | None = None,
stop_token_ids: list[int] | None = None,
request_id: str | None = None,
adapter_name: str | None = None,
**attrs: t.Any,
) -> t.AsyncGenerator[GenerationOutput, None]:
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
):
if isinstance(self.runner._runner_handle, DummyRunnerHandle):
if os.getenv('BENTO_PATH') is not None:
raise RuntimeError('Runner client failed to set up correctly.')
@@ -551,14 +524,13 @@ class LLM(t.Generic[M, T], ReprMixin):
raise ValueError('Either prompt or prompt_token_ids must be specified.')
prompt_token_ids = self.tokenizer.encode(prompt)
if request_id is None:
request_id = openllm_core.utils.gen_random_uuid()
request_id = gen_random_uuid() if request_id is None else request_id
previous_texts, previous_num_tokens = [''] * config['n'], [0] * config['n']
async for out in self.runner.generate_iterator.async_stream(
prompt_token_ids, request_id, stop, adapter_name, **config.model_dump(flatten=True)
prompt_token_ids, request_id, stop=stop, adapter_name=adapter_name, **config.model_dump(flatten=True)
):
generated = GenerationOutput.from_runner(out).with_options(prompt=prompt)
delta_outputs = t.cast(t.List[CompletionChunk], [None] * len(generated.outputs))
delta_outputs = [None] * len(generated.outputs)
if generated.finished:
break
for output in generated.outputs:
@@ -570,44 +542,37 @@ class LLM(t.Generic[M, T], ReprMixin):
def _RunnerFactory(
self: openllm.LLM[M, T],
/,
models: list[bentoml.Model] | None = None,
max_batch_size: int | None = None,
max_latency_ms: int | None = None,
scheduling_strategy: type[bentoml.Strategy] = CascadingResourceStrategy,
*,
backend: LiteralBackend | None = None,
) -> LLMRunner[M, T]:
llm, /, models=None, max_batch_size=None, max_latency_ms=None, scheduling_strategy=None, *, backend=None
):
from ._runners import runnable
backend = t.cast(
LiteralBackend, first_not_none(backend, os.environ.get('OPENLLM_BACKEND'), default=self.__llm_backend__)
)
if scheduling_strategy is None:
from ._strategies import CascadingResourceStrategy
scheduling_strategy = CascadingResourceStrategy
backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND', default=llm.__llm_backend__))
models = models if models is not None else []
try:
models.append(self.bentomodel)
models.append(llm.bentomodel)
except bentoml.exceptions.NotFound as err:
raise RuntimeError(f'Failed to locate {self.bentomodel}:{err}') from err
raise RuntimeError(f'Failed to locate {llm.bentomodel}:{err}') from err
if self._prompt_template:
prompt_template = self._prompt_template.to_string()
elif hasattr(self.config, 'default_prompt_template'):
prompt_template = self.config.default_prompt_template
if llm._prompt_template:
prompt_template = llm._prompt_template.to_string()
elif hasattr(llm.config, 'default_prompt_template'):
prompt_template = llm.config.default_prompt_template
else:
prompt_template = None
if self._system_message:
system_message = self._system_message
elif hasattr(self.config, 'default_system_message'):
system_message = self.config.default_system_message
if llm._system_message:
system_message = llm._system_message
elif hasattr(llm.config, 'default_system_message'):
system_message = llm.config.default_system_message
else:
system_message = None
def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]:
return {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'}
def _wrapped_repr_args(_: LLMRunner[M, T]) -> ReprArgs:
def _wrapped_repr_args(_):
yield (
'runner_methods',
{
@@ -618,89 +583,40 @@ def _RunnerFactory(
for method in _.runner_methods
},
)
yield 'config', self.config.model_dump(flatten=True)
yield 'llm_type', self.llm_type
yield 'config', llm.config.model_dump(flatten=True)
yield 'llm_type', llm.llm_type
yield 'backend', backend
yield 'llm_tag', self.tag
yield 'llm_tag', llm.tag
return types.new_class(
self.__class__.__name__ + 'Runner',
llm.config.__class__.__name__[:-6] + 'Runner',
(bentoml.Runner,),
exec_body=lambda ns: ns.update(
{
'llm_type': self.llm_type,
'identifying_params': self.identifying_params,
'llm_tag': self.tag,
'llm': self,
'config': self.config,
'llm_type': llm.llm_type,
'identifying_params': llm.identifying_params,
'llm_tag': llm.tag,
'llm': llm,
'config': llm.config,
'backend': backend,
'__module__': self.__module__,
'__doc__': llm.config.__class__.__doc__ or f'Generated Runner class for {llm.config["model_name"]}',
'__module__': llm.__module__,
'__repr__': ReprMixin.__repr__,
'__repr_keys__': property(_wrapped_repr_keys),
'__repr_keys__': property(lambda _: {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'}),
'__repr_args__': _wrapped_repr_args,
'has_adapters': self.has_adapters,
'has_adapters': llm.has_adapters,
'prompt_template': prompt_template,
'system_message': system_message,
}
),
)(
runnable(backend),
name=self.runner_name,
name=llm.runner_name,
embedded=False,
models=models,
max_batch_size=max_batch_size,
max_latency_ms=max_latency_ms,
scheduling_strategy=scheduling_strategy,
runnable_init_params=dict(llm=self),
runnable_init_params={'llm': llm},
method_configs=converter.unstructure({'generate_iterator': ModelSignature(batchable=False)}),
)
@t.final
class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
SUPPORTED_RESOURCES = ('amd.com/gpu', 'nvidia.com/gpu', 'cpu')
SUPPORTS_CPU_MULTI_THREADING = True
generate_iterator: RunnableMethod[LLMRunnable[M, T], [list[int], str, str | t.Iterable[str] | None, str | None], str]
@t.final
class LLMRunner(t.Protocol[M, T]):
__doc__: str
__module__: str
llm_type: str
llm_tag: bentoml.Tag
identifying_params: dict[str, t.Any]
llm: openllm.LLM[M, T]
config: openllm.LLMConfig
backend: LiteralBackend
has_adapters: bool
system_message: str | None
prompt_template: str | None
generate_iterator: RunnerMethod[LLMRunnable[M, T], [list[int], str, str | t.Iterable[str] | None, str | None], str]
runner_methods: list[RunnerMethod[t.Any, t.Any, t.Any]]
scheduling_strategy: type[Strategy]
workers_per_resource: int | float
runnable_init_params: dict[str, t.Any]
_runner_handle: RunnerHandle
def __init__(
self,
runnable_class: type[LLMRunnable[M, T]],
*,
runnable_init_params: dict[str, t.Any] | None = ...,
name: str | None = ...,
scheduling_strategy: type[Strategy] = ...,
models: list[bentoml.Model] | None = ...,
max_batch_size: int | None = ...,
max_latency_ms: int | None = ...,
method_configs: dict[str, dict[str, int]] | None = ...,
embedded: bool = False,
) -> None: ...
@property
@abc.abstractmethod
def __repr_keys__(self) -> set[str]: ...
__all__ = ['LLMRunner', 'LLMRunnable', 'LLM']


@@ -0,0 +1,158 @@
from typing import Any, AsyncGenerator, Dict, Generic, Iterable, List, Literal, Optional, Set, Tuple, TypedDict, Union
import attr
import torch
from peft.config import PeftConfig
from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM
from bentoml import Model, Tag
from openllm_core import LLMConfig
from openllm_core._schemas import GenerationOutput
from openllm_core._typing_compat import (
AdapterMap,
AdapterType,
LiteralBackend,
LiteralDtype,
LiteralQuantise,
LiteralSerialisation,
M,
T,
)
from openllm_core.prompts import PromptTemplate
from openllm_core.utils.representation import ReprArgs
from ._quantisation import QuantizationConfig
from ._runners import Runner
InjectedModel = Union[PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM]
class IdentifyingParams(TypedDict):
configuration: str
model_ids: str
model_id: str
ResolvedAdapterMap = Dict[AdapterType, Dict[str, Tuple[PeftConfig, str]]]
Dtype = Union[LiteralDtype, Literal['auto', 'half', 'float']]
@attr.define(slots=True, repr=False, init=False)
class LLM(Generic[M, T]):
_model_id: str
_revision: Optional[str]
_quantization_config: Optional[QuantizationConfig]
_quantise: Optional[LiteralQuantise]
_model_decls: Tuple[Any, ...]
__model_attrs: Dict[str, Any]
__tokenizer_attrs: Dict[str, Any]
_tag: Tag
_adapter_map: Optional[AdapterMap]
_serialisation: LiteralSerialisation
_local: bool
_prompt_template: Optional[PromptTemplate]
_system_message: Optional[str]
__llm_torch_dtype__: Dtype = ...
__llm_config__: Optional[LLMConfig] = ...
__llm_backend__: LiteralBackend = ...
__llm_quantization_config__: Optional[QuantizationConfig] = ...
__llm_runner__: Optional[Runner[M, T]] = ...
__llm_model__: Optional[M] = ...
__llm_tokenizer__: Optional[T] = ...
__llm_adapter_map__: Optional[ResolvedAdapterMap] = ...
__llm_trust_remote_code__: bool = ...
@property
def __repr_keys__(self) -> Set[str]: ...
def __repr__(self) -> str: ...
def __str__(self) -> str: ...
def __repr_name__(self) -> str: ...
def __repr_str__(self, join_str: str) -> str: ...
def __repr_args__(self) -> ReprArgs: ...
def __init__(
self,
model_id: str,
model_version: Optional[str] = ...,
model_tag: Optional[Union[str, Tag]] = ...,
prompt_template: Optional[Union[str, PromptTemplate]] = ...,
system_message: Optional[str] = ...,
llm_config: Optional[LLMConfig] = ...,
backend: Optional[LiteralBackend] = ...,
*args: Any,
quantize: Optional[LiteralQuantise] = ...,
quantization_config: Optional[QuantizationConfig] = ...,
adapter_map: Optional[Dict[str, str]] = ...,
serialisation: LiteralSerialisation = ...,
trust_remote_code: bool = ...,
embedded: bool = ...,
torch_dtype: Dtype = ...,
low_cpu_mem_usage: bool = ...,
**attrs: Any,
) -> None: ...
@property
def _torch_dtype(self) -> torch.dtype: ...
@property
def _model_attrs(self) -> Dict[str, Any]: ...
@_model_attrs.setter
def _model_attrs(self, model_attrs: Dict[str, Any]) -> None: ...
@property
def _tokenizer_attrs(self) -> Dict[str, Any]: ...
@property
def import_kwargs(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: ...
@property
def trust_remote_code(self) -> bool: ...
@property
def runner_name(self) -> str: ...
@property
def model_id(self) -> str: ...
@property
def revision(self) -> str: ...
@property
def tag(self) -> Tag: ...
@property
def bentomodel(self) -> Model: ...
@property
def quantization_config(self) -> QuantizationConfig: ...
@property
def has_adapters(self) -> bool: ...
@property
def local(self) -> bool: ...
@property
def quantise(self) -> Optional[LiteralQuantise]: ...
@property
def llm_type(self) -> str: ...
@property
def identifying_params(self) -> IdentifyingParams: ...
@property
def llm_parameters(self) -> Tuple[Tuple[Tuple[Any, ...], Dict[str, Any]], Dict[str, Any]]: ...
@property
def config(self) -> LLMConfig: ...
@property
def tokenizer(self) -> T: ...
@property
def model(self) -> M: ...
@property
def runner(self) -> Runner[M, T]: ...
@property
def adapter_map(self) -> ResolvedAdapterMap: ...
def prepare_for_training(
self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any
) -> Tuple[InjectedModel, T]: ...
async def generate(
self,
prompt: Optional[str],
prompt_token_ids: Optional[List[int]] = ...,
stop: Optional[Union[str, Iterable[str]]] = ...,
stop_token_ids: Optional[List[int]] = ...,
request_id: Optional[str] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> GenerationOutput: ...
async def generate_iterator(
self,
prompt: Optional[str],
prompt_token_ids: Optional[List[int]] = ...,
stop: Optional[Union[str, Iterable[str]]] = ...,
stop_token_ids: Optional[List[int]] = ...,
request_id: Optional[str] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> AsyncGenerator[GenerationOutput, None]: ...
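
A hedged end-to-end sketch of the `LLM` API this stub describes; the model id is illustrative and the PyTorch backend is assumed to be available locally:

import asyncio
import openllm

async def main():
    llm = openllm.LLM('facebook/opt-125m', backend='pt')
    result = await llm.generate('What is a type stub?', max_new_tokens=32)
    print(result.outputs[0].text)  # GenerationOutput holds one entry per candidate

asyncio.run(main())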


@@ -1,12 +1,5 @@
# mypy: disable-error-code="name-defined,no-redef"
from __future__ import annotations
import logging
import typing as t
import torch
import transformers
from openllm_core._typing_compat import LiteralQuantise, overload
from openllm_core.exceptions import MissingDependencyError
from openllm_core.utils import (
is_autoawq_available,
@@ -15,35 +8,11 @@ from openllm_core.utils import (
is_optimum_supports_gptq,
)
if t.TYPE_CHECKING:
from openllm_core._typing_compat import DictStrAny
from ._llm import LLM
def infer_quantisation_config(llm, quantise, **attrs):
import torch
import transformers
logger = logging.getLogger(__name__)
@overload
def infer_quantisation_config(
self: LLM[t.Any, t.Any], quantise: t.Literal['int8', 'int4'], **attrs: t.Any
) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]: ...
@overload
def infer_quantisation_config(
self: LLM[t.Any, t.Any], quantise: t.Literal['gptq'], **attrs: t.Any
) -> tuple[transformers.GPTQConfig, DictStrAny]: ...
@overload
def infer_quantisation_config(
self: LLM[t.Any, t.Any], quantise: t.Literal['awq'], **attrs: t.Any
) -> tuple[transformers.AwqConfig, DictStrAny]: ...
def infer_quantisation_config(
self: LLM[t.Any, t.Any], quantise: LiteralQuantise, **attrs: t.Any
) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig, DictStrAny]:
# 8 bit configuration
int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
@@ -54,12 +23,17 @@ def infer_quantisation_config(
bits = attrs.pop('bits', 4)
group_size = attrs.pop('group_size', 128)
def create_awq_config() -> transformers.AwqConfig:
# 4 bit configuration
int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16)
int4_quant_type = attrs.pop('bnb_4bit_quant_type', 'nf4')
int4_use_double_quant = attrs.pop('bnb_4bit_use_double_quant', True)
def create_awq_config():
zero_point = attrs.pop('zero_point', True)
return transformers.AwqConfig(bits=bits, group_size=group_size, zero_point=zero_point)
def create_gptq_config() -> transformers.GPTQConfig:
gptq_tokenizer = attrs.pop('tokenizer', self.model_id)
def create_gptq_config():
gptq_tokenizer = attrs.pop('tokenizer', llm.model_id)
gptq_dataset = attrs.pop('dataset', 'c4')
gptq_damp_percent = attrs.pop('damp_percent', 0.1)
gptq_desc_act = attrs.pop('desc_act', False)
@@ -94,10 +68,9 @@ def infer_quantisation_config(
exllama_config={'version': 1},
) # XXX: See how to migrate to v2
def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig:
def create_int8_config(int8_skip_modules):
# if int8_skip_modules is None: int8_skip_modules = []
# if 'lm_head' not in int8_skip_modules and self.config_class.__openllm_model_type__ == 'causal_lm':
# logger.debug("Skipping 'lm_head' for quantization for %s", self.__name__)
# int8_skip_modules.append('lm_head')
return transformers.BitsAndBytesConfig(
load_in_8bit=True,
@@ -107,10 +80,13 @@ def infer_quantisation_config(
llm_int8_has_fp16_weight=int8_has_fp16_weight,
)
# 4 bit configuration
int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16)
int4_quant_type = attrs.pop('bnb_4bit_quant_type', 'nf4')
int4_use_double_quant = attrs.pop('bnb_4bit_use_double_quant', True)
def create_int4_config():
return transformers.BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=int4_compute_dtype,
bnb_4bit_quant_type=int4_quant_type,
bnb_4bit_use_double_quant=int4_use_double_quant,
)
# NOTE: Quantization setup. `quantize` is an openllm.LLM feature that quantizes the model with bitsandbytes or quantization-aware training.
if not is_bitsandbytes_available():
@@ -120,23 +96,18 @@ def infer_quantisation_config(
if quantise == 'int8':
quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == 'int4':
quantisation_config = transformers.BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=int4_compute_dtype,
bnb_4bit_quant_type=int4_quant_type,
bnb_4bit_use_double_quant=int4_use_double_quant,
)
quantisation_config = create_int4_config()
elif quantise == 'gptq':
if not is_autogptq_available() or not is_optimum_supports_gptq():
raise MissingDependencyError(
"'quantize=\"gptq\"' requires 'auto-gptq' and 'optimum>=0.12' to be installed (missing or failed to import). Make sure to do 'pip install \"openllm[gptq]\"'"
"GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'"
)
else:
quantisation_config = create_gptq_config()
elif quantise == 'awq':
if not is_autoawq_available():
raise MissingDependencyError(
"quantize='awq' requires 'auto-awq' to be installed (missing or failed to import). Make sure to do 'pip install \"openllm[awq]\"'."
"AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'."
)
else:
quantisation_config = create_awq_config()


@@ -0,0 +1,26 @@
from typing import Any, Dict, Literal, Union
from transformers import AwqConfig, BitsAndBytesConfig, GPTQConfig
from openllm_core._typing_compat import LiteralQuantise, M, T, overload
from ._llm import LLM
QuantizationConfig = Union[BitsAndBytesConfig, GPTQConfig, AwqConfig]
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['int8', 'int4'], **attrs: Any
) -> tuple[BitsAndBytesConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['gptq'], **attrs: Any
) -> tuple[GPTQConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['awq'], **attrs: Any
) -> tuple[AwqConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: LiteralQuantise, **attrs: Any
) -> tuple[QuantizationConfig, Dict[str, Any]]: ...
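
These overloads mainly let type checkers narrow the returned config per quantisation scheme. A hedged sketch, assuming bitsandbytes is installed; the model id is illustrative:

import openllm
from openllm._quantisation import infer_quantisation_config

llm = openllm.LLM('facebook/opt-125m', backend='pt')
config, extra_attrs = infer_quantisation_config(llm, 'int8')
print(type(config).__name__)  # BitsAndBytesConfig; 'gptq'/'awq' narrow to GPTQConfig/AwqConfig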


@@ -9,27 +9,14 @@ import torch
import bentoml
import openllm
from openllm_core._schemas import CompletionChunk, GenerationOutput
from openllm_core._typing_compat import LiteralBackend, M, T
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils import first_not_none, is_vllm_available
if t.TYPE_CHECKING:
import vllm
from openllm_core._schemas import FinishReason
else:
vllm = openllm.utils.LazyLoader('vllm', globals(), 'vllm')
_DEFAULT_TOKENIZER = 'hf-internal-testing/llama-tokenizer'
__all__ = ['runnable']
def runnable(backend: LiteralBackend | None = None) -> type[bentoml.Runnable]:
backend = t.cast(
LiteralBackend,
first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt'),
)
def runnable(backend=None):
backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt')
return vLLMRunnable if backend == 'vllm' else PyTorchRunnable
@@ -37,7 +24,11 @@ class vLLMRunnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
SUPPORTS_CPU_MULTI_THREADING = True
def __init__(self, llm: openllm.LLM[M, T]) -> None:
def __init__(self, llm):
try:
import vllm
except ImportError:
raise OpenLLMException('vLLM is not installed. Please install it via `pip install "openllm[vllm]"`.') from None
self.config = llm.config
num_gpus, dev = 1, openllm.utils.device_count()
if dev >= 2:
@@ -64,14 +55,7 @@ class vLLMRunnable(bentoml.Runnable):
raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err
@bentoml.Runnable.method(batchable=False)
async def generate_iterator(
self,
prompt_token_ids: list[int],
request_id: str,
stop: str | t.Iterable[str] | None = None,
adapter_name: str | None = None,
**attrs: t.Any,
) -> t.AsyncGenerator[str, None]:
async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
if adapter_name is not None:
raise NotImplementedError('Adapter is not supported with vLLM.')
stop_: set[str] = set()
@@ -99,28 +83,19 @@ class PyTorchRunnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
SUPPORTS_CPU_MULTI_THREADING = True
def __init__(self, llm: openllm.LLM[M, T]) -> None:
def __init__(self, llm):
self.model = llm.model
self.tokenizer = llm.tokenizer
self.config = llm.config
@bentoml.Runnable.method(batchable=False)
async def generate_iterator(
self,
prompt_token_ids: list[int],
request_id: str,
stop: str | t.Iterable[str] | None = None,
adapter_name: str | None = None,
**attrs: t.Any,
) -> t.AsyncGenerator[str, None]:
async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
if adapter_name is not None:
self.model.set_adapter(adapter_name)
async for generation_output in self.forward(prompt_token_ids, request_id, stop=stop, **attrs):
yield generation_output.model_dump_json()
async def forward(
self, prompt_token_ids: list[int], request_id: str, stop: str | t.Iterable[str] | None = None, **attrs: t.Any
) -> t.AsyncGenerator[GenerationOutput, None]:
async def forward(self, prompt_token_ids, request_id, stop=None, **attrs):
from ._generation import is_partial_stop, prepare_logits_processor
stop_: set[str] = set()
@@ -142,7 +117,7 @@ class PyTorchRunnable(bentoml.Runnable):
logits_processor = prepare_logits_processor(config)
past_key_values = out = token = None
finish_reason: t.Optional[FinishReason] = None
finish_reason = None
for i in range(config['max_new_tokens']):
if i == 0: # prefill
out = self.model(torch.as_tensor([prompt_token_ids], device=self.model.device), use_cache=True)


@@ -0,0 +1,126 @@
from typing import (
Any,
AsyncGenerator,
Dict,
Generic,
Iterable,
List,
Literal,
Optional,
Protocol,
Tuple,
Type,
TypeVar,
Union,
final,
)
from bentoml import Model, Strategy, Tag
from bentoml._internal.runner.runner_handle import RunnerHandle
from openllm_core import LLMConfig
from openllm_core._typing_compat import LiteralBackend, T, overload
from ._llm import LLM
try:
from vllm import AsyncLLMEngine
except ImportError:
AsyncLLMEngine = Any
try:
from transformers import PreTrainedModel
except ImportError:
PreTrainedModel = Any
Mo = TypeVar('Mo')
class _Runnable(Protocol[Mo]):
SUPPORTED_RESOURCES: Tuple[Literal['nvidia.com/gpu'], Literal['amd.com/gpu'], Literal['cpu']] = ...
SUPPORTS_CPU_MULTI_THREADING: bool = ...
config: LLMConfig = ...
model: Mo = ...
def __init__(self, llm: LLM[Mo, T]) -> None: ...
async def generate_iterator(
self,
prompt_token_ids: List[int],
request_id: str,
stop: Optional[Union[str, Iterable[str]]] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> AsyncGenerator[str, None]: ...
In = TypeVar('In')
Ret = TypeVar('Ret')
class RunnerMethod(Generic[In, Ret]): ...
@final
class vLLMRunnable(_Runnable[AsyncLLMEngine]): ...
@final
class PyTorchRunnable(_Runnable[PreTrainedModel]):
tokenizer: Any
@overload
def runnable(backend: Literal['vllm']) -> Type[vLLMRunnable]: ...
@overload
def runnable(backend: Literal['pt']) -> Type[PyTorchRunnable]: ...
@overload
def runnable(backend: Optional[str] = ...) -> Type[Union[vLLMRunnable, PyTorchRunnable]]: ...
class Runner(Protocol[Mo, T]):
__doc__: str = ...
__module__: str = ...
llm_type: str = ...
llm_tag: Tag = ...
identifying_params: Dict[str, Any] = ...
llm: LLM[Mo, T] = ...
config: LLMConfig = ...
backend: LiteralBackend = ...
has_adapters: bool = ...
prompt_template: Optional[str] = ...
system_message: Optional[str] = ...
class generate_iterator(RunnerMethod[List[int], AsyncGenerator[str, None]]):
@staticmethod
def async_stream(
prompt_token_ids: List[int],
request_id: str,
stop: Optional[Union[Iterable[str], str]] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> AsyncGenerator[str, None]: ...
def __init__(
self,
runnable_class: Type[_Runnable[Mo]],
*,
runnable_init_params: Optional[Dict[str, Any]] = ...,
name: Optional[str] = ...,
scheduling_strategy: Type[Strategy] = ...,
models: Optional[List[Model]] = ...,
max_batch_size: Optional[int] = ...,
max_latency_ms: Optional[int] = ...,
method_configs: Optional[Dict[str, Dict[str, int]]] = ...,
embedded: bool = ...,
) -> None: ...
name: str = ...
models: List[Model] = ...
resource_config: Dict[str, Any]
runnable_class: Type[_Runnable[Mo]]
embedded: bool
runner_methods: List[RunnerMethod[Any, Any]]
scheduling_strategy: Type[Strategy]
workers_per_resource: Union[int, float] = ...
runnable_init_params: Dict[str, Any] = ...
_runner_handle: RunnerHandle = ...
def init_local(self, quiet: bool = False) -> None: ...
def init_client(self, handle_class: Optional[Type[RunnerHandle]] = ..., *args: Any, **kwargs: Any) -> None: ...
async def runner_handle_is_ready(self, timeout: int = ...) -> bool: ...
def destroy(self) -> None: ...
@property
def scheduled_worker_count(self) -> int: ...
@property
def scheduled_worker_env_map(self) -> Dict[int, Dict[str, Any]]: ...
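
A hedged sketch of driving the `Runner` protocol above directly (normally `LLM.generate_iterator` wraps this); the model id and request id are illustrative:

import asyncio
import openllm

async def stream(prompt):
    llm = openllm.LLM('facebook/opt-125m', backend='pt')  # illustrative model
    llm.runner.init_local(quiet=True)                     # attach an in-process handle
    token_ids = llm.tokenizer.encode(prompt)
    async for chunk in llm.runner.generate_iterator.async_stream(
        token_ids, 'request-0', stop=None, adapter_name=None
    ):
        print(chunk)  # each chunk is a JSON-encoded GenerationOutput

asyncio.run(stream('Hello'))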


@@ -1,4 +1,3 @@
from __future__ import annotations
import os
model_id = os.environ['OPENLLM_MODEL_ID'] # openllm: model name


@@ -1,5 +1,3 @@
from __future__ import annotations
model_id = '{__model_id__}' # openllm: model id
model_tag = '{__model_tag__}' # openllm: model tag
adapter_map = """{__model_adapter_map__}""" # openllm: model adapter map
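
These placeholder lines are rewritten at build time by the marker-aware formatters in `bundle/_package.py`; a minimal sketch of that substitution:

# Minimal sketch of the marker-based substitution; the formatter subclasses
# in bundle/_package.py are the real implementation.
line = "model_id = '{__model_id__}'  # openllm: model id"
print(line.format(__model_id__='facebook/opt-125m'))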


@@ -1,36 +1,15 @@
import os
import typing as t
from openllm_core.utils import LazyModule
_import_structure = {
'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'],
'oci': [
'CONTAINER_NAMES',
'get_base_container_tag',
'get_base_container_name',
'supported_registries',
'RefResolver',
],
}
if t.TYPE_CHECKING:
from . import _package as _package, oci as oci
from ._package import (
build_editable as build_editable,
construct_docker_options as construct_docker_options,
construct_python_options as construct_python_options,
create_bento as create_bento,
)
from .oci import (
CONTAINER_NAMES as CONTAINER_NAMES,
RefResolver as RefResolver,
get_base_container_name as get_base_container_name,
get_base_container_tag as get_base_container_tag,
supported_registries as supported_registries,
)
__lazy = LazyModule(__name__, os.path.abspath('__file__'), _import_structure)
__lazy = LazyModule(
__name__,
os.path.abspath('__file__'),
{
'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'],
'oci': ['CONTAINER_NAMES', 'supported_registries', 'RefResolver'],
},
)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__
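
The `LazyModule` wiring above defers submodule imports until first attribute access; a simplified sketch of the underlying PEP 562 pattern (not OpenLLM's actual implementation):

# Simplified lazy-module sketch, assuming the same mapping shape as above.
import importlib

_import_structure = {'_package': ['create_bento'], 'oci': ['RefResolver']}
_attr_to_module = {attr: mod for mod, attrs in _import_structure.items() for attr in attrs}

def __getattr__(name):
    # Import the owning submodule only when the attribute is first requested.
    if name in _attr_to_module:
        module = importlib.import_module(f'.{_attr_to_module[name]}', __name__)
        return getattr(module, name)
    raise AttributeError(name)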


@@ -0,0 +1,32 @@
from typing import Optional
import attr
from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
from openllm_core.utils.lazy import VersionInfo
from . import _package as _package, oci as oci
from ._package import (
build_editable as build_editable,
construct_docker_options as construct_docker_options,
construct_python_options as construct_python_options,
create_bento as create_bento,
)
CONTAINER_NAMES: dict[LiteralContainerRegistry, str] = ...
supported_registries: list[str] = ...
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
git_hash: str
version: VersionInfo
strategy: LiteralContainerVersionStrategy
@classmethod
def from_strategy(cls, strategy_or_version: Optional[LiteralContainerVersionStrategy] = ...) -> RefResolver: ...
@property
def tag(self) -> str: ...
@staticmethod
def construct_base_image(
reg: LiteralContainerRegistry, strategy: Optional[LiteralContainerVersionStrategy] = ...
) -> str: ...
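
A hedged usage sketch of the consolidated container-reference API declared above; the 'release' strategy needs network access to the GitHub API:

from openllm.bundle import CONTAINER_NAMES, RefResolver

print(CONTAINER_NAMES['gh'])                               # ghcr.io/bentoml/openllm
print(RefResolver.construct_base_image('gh', 'latest'))    # ...:latest, no API call needed
print(RefResolver.construct_base_image('ecr', 'release'))  # pinned to the latest release tag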


@@ -1,16 +1,12 @@
# mypy: disable-error-code="misc"
from __future__ import annotations
import importlib.metadata
import inspect
import logging
import os
import string
import typing as t
from pathlib import Path
import fs
import fs.copy
import fs.errors
import orjson
from simple_di import Provide, inject
@@ -18,38 +14,27 @@ import bentoml
import openllm_core
from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions
from bentoml._internal.configuration.containers import BentoMLContainer
from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg
from . import oci
if t.TYPE_CHECKING:
from fs.base import FS
import openllm
from bentoml._internal.bento import BentoStore
from bentoml._internal.models.model import ModelStore
from openllm_core._typing_compat import (
LiteralContainerRegistry,
LiteralContainerVersionStrategy,
LiteralSerialisation,
LiteralString,
)
from openllm_core._typing_compat import LiteralString
logger = logging.getLogger(__name__)
OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
def build_editable(
path: str, package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm'
) -> str | None:
def build_editable(path, package='openllm'):
"""Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set."""
if not openllm_core.utils.check_bool_env(OPENLLM_DEV_BUILD, default=False):
if not check_bool_env(OPENLLM_DEV_BUILD, default=False):
return None
# We need to build the package in editable mode, so that we can import it
from build import ProjectBuilder
from build.env import IsolatedEnvBuilder
module_location = openllm_core.utils.pkg.source_locations(package)
module_location = pkg.source_locations(package)
if not module_location:
raise RuntimeError(
'Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.'
@@ -68,12 +53,7 @@ def build_editable(
)
def construct_python_options(
llm: openllm.LLM[t.Any, t.Any],
llm_fs: FS,
extra_dependencies: tuple[str, ...] | None = None,
adapter_map: dict[str, str] | None = None,
) -> PythonOptions:
def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None):
packages = ['openllm', 'scipy'] # apparently bnb misses this one
if adapter_map is not None:
packages += ['openllm[fine-tune]']
@@ -88,24 +68,18 @@ def construct_python_options(
if req is not None:
packages.extend(req)
if str(os.environ.get('BENTOML_BUNDLE_LOCAL_BUILD', False)).lower() == 'false':
packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}")
packages.append(f"bentoml>={'.'.join([str(i) for i in pkg.pkg_version_info('bentoml')])}")
if not openllm_core.utils.is_torch_available():
raise ValueError('PyTorch is not available. Make sure to have it locally installed.')
packages.extend(
['torch==2.0.1+cu118', 'vllm==0.2.1.post1', 'xformers==0.0.22', 'bentoml[tracing]==1.1.9']
) # XXX: Currently locking this for correctness
wheels: list[str] = []
built_wheels = [
build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p))
for p in ('openllm_core', 'openllm_client', 'openllm')
]
# XXX: Currently locking this for correctness
packages.extend(['torch==2.0.1+cu118', 'vllm==0.2.1.post1', 'xformers==0.0.22', 'bentoml[tracing]==1.1.9'])
wheels = []
built_wheels = [build_editable(llm_fs.getsyspath('/'), p) for p in ('openllm_core', 'openllm_client', 'openllm')]
if all(i for i in built_wheels):
wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)])
return PythonOptions(
packages=packages,
wheels=wheels,
lock_packages=False,
lock_packages=True,
extra_index_url=[
'https://download.pytorch.org/whl/cu118',
'https://huggingface.github.io/autogptq-index/whl/cu118/',
@@ -114,15 +88,8 @@ def construct_python_options(
def construct_docker_options(
llm: openllm.LLM[t.Any, t.Any],
_: FS,
quantize: LiteralString | None,
adapter_map: dict[str, str] | None,
dockerfile_template: str | None,
serialisation: LiteralSerialisation,
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy,
) -> DockerOptions:
llm, _, quantize, adapter_map, dockerfile_template, serialisation, container_registry, container_version_strategy
):
from openllm_cli._factory import parse_config_options
environ = parse_config_options(llm.config, llm.config['timeout'], 1.0, None, True, os.environ.copy())
@@ -145,7 +112,7 @@ def construct_docker_options(
if quantize:
env_dict['OPENLLM_QUANTIZE'] = str(quantize)
return DockerOptions(
base_image=f'{oci.get_base_container_name(container_registry)}:{oci.get_base_container_tag(container_version_strategy)}',
base_image=oci.RefResolver.construct_base_image(container_registry, container_version_strategy),
env=env_dict,
dockerfile_template=dockerfile_template,
)
@@ -160,21 +127,13 @@ class _ServiceVarsFormatter(string.Formatter):
keyword: LiteralString = '__model_name__'
identifier: LiteralString = '# openllm: model name'
def __init__(self, target: str):
"""The formatter that extends model_name to be formatted the 'service.py'."""
def __init__(self, target):
super().__init__()
self.target = target
def vformat(self, format_string: str, *args: t.Any, **attrs: t.Any) -> t.Any:
def vformat(self, format_string, *args, **attrs) -> str:
return super().vformat(format_string, (), {self.keyword: self.target})
def can_format(self, value: str) -> bool:
try:
self.parse(value)
return True
except ValueError:
return False
def parse_line(self, line: str, nl: bool = True) -> str:
if self.identifier not in line:
return line
@@ -201,9 +160,7 @@ _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
_service_vars_file = Path(os.path.abspath(__file__)).parent.parent / '_service_vars_pkg.py'
def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] | None, llm_fs: FS) -> None:
from openllm_core.utils import DEBUG
def write_service(llm, llm_fs, adapter_map):
model_id_formatter = ModelIdFormatter(llm.model_id)
model_tag_formatter = ModelTagFormatter(str(llm.tag))
adapter_map_formatter = ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode())
@@ -222,8 +179,8 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] |
src_contents[i] = adapter_map_formatter.parse_line(it)
script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n" + ''.join(src_contents)
if DEBUG:
logger.info('Generated script:\n%s', script)
if SHOW_CODEGEN:
logger.info('Generated _service_vars.py:\n%s', script)
llm_fs.writetext('_service_vars.py', script)
logger.debug(
@@ -236,22 +193,20 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] |
@inject
def create_bento(
bento_tag: bentoml.Tag,
llm_fs: FS,
llm: openllm.LLM[t.Any, t.Any],
quantize: LiteralString | None,
dockerfile_template: str | None,
adapter_map: dict[str, str] | None = None,
extra_dependencies: tuple[str, ...] | None = None,
serialisation: LiteralSerialisation | None = None,
container_registry: LiteralContainerRegistry = 'ecr',
container_version_strategy: LiteralContainerVersionStrategy = 'release',
_bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
_model_store: ModelStore = Provide[BentoMLContainer.model_store],
) -> bentoml.Bento:
_serialisation: LiteralSerialisation = openllm_core.utils.first_not_none(
serialisation, default=llm.config['serialisation']
)
bento_tag,
llm_fs,
llm,
quantize,
dockerfile_template,
adapter_map=None,
extra_dependencies=None,
serialisation=None,
container_registry='ecr',
container_version_strategy='release',
_bento_store=Provide[BentoMLContainer.bento_store],
_model_store=Provide[BentoMLContainer.model_store],
):
_serialisation = openllm_core.utils.first_not_none(serialisation, default=llm.config['serialisation'])
labels = dict(llm.identifying_params)
labels.update(
{
@@ -270,47 +225,31 @@ def create_bento(
labels.update(adapter_map)
logger.debug("Building Bento '%s' with model backend '%s'", bento_tag, llm.__llm_backend__)
# add service.py definition to this temporary folder
write_service(llm, adapter_map, llm_fs)
write_service(llm, llm_fs, adapter_map)
llm_spec = ModelSpec.from_item({'tag': str(llm.tag), 'alias': llm.tag.name})
build_config = BentoBuildConfig(
service=f"{llm.config['service_name']}:svc",
name=bento_tag.name,
labels=labels,
models=[llm_spec],
description=f"OpenLLM service for {llm.config['start_name']}",
include=list(llm_fs.walk.files()),
exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
docker=construct_docker_options(
llm,
llm_fs,
quantize,
adapter_map,
dockerfile_template,
_serialisation,
container_registry,
container_version_strategy,
bento = bentoml.Bento.create(
version=bento_tag.version,
build_ctx=llm_fs.getsyspath('/'),
build_config=BentoBuildConfig(
service=f"{llm.config['service_name']}:svc",
name=bento_tag.name,
labels=labels,
models=[ModelSpec.from_item({'tag': str(llm.tag), 'alias': llm.tag.name})],
description=f"OpenLLM service for {llm.config['start_name']}",
include=list(llm_fs.walk.files()),
exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
docker=construct_docker_options(
llm,
llm_fs,
quantize,
adapter_map,
dockerfile_template,
_serialisation,
container_registry,
container_version_strategy,
),
),
)
bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))
# NOTE: the model_id_path here are only used for setting this environment variable within the container built with for BentoLLM.
service_fs_path = fs.path.join('src', llm.config['service_name'])
service_path = bento._fs.getsyspath(service_fs_path)
with open(service_path, 'r') as f:
service_contents = f.readlines()
for it in service_contents:
if '__bento_name__' in it:
service_contents[service_contents.index(it)] = it.format(__bento_name__=str(bento.tag))
script = ''.join(service_contents)
if openllm_core.utils.DEBUG:
logger.info('Generated script:\n%s', script)
bento._fs.writetext(service_fs_path, script)
if 'model_store' in inspect.signature(bento.save).parameters:
return bento.save(bento_store=_bento_store, model_store=_model_store)
# backwards-compatible arguments: `model_store` was only added recently
return bento.save(bento_store=_bento_store)
return bento.save(bento_store=_bento_store, model_store=_model_store)


@@ -0,0 +1,52 @@
from typing import Dict, Optional, Tuple
from fs.base import FS
from typing_extensions import LiteralString
from bentoml import Bento, Tag
from bentoml._internal.bento import BentoStore
from bentoml._internal.bento.build_config import DockerOptions, PythonOptions
from bentoml._internal.models.model import ModelStore
from openllm_core._typing_compat import (
LiteralContainerRegistry,
LiteralContainerVersionStrategy,
LiteralQuantise,
LiteralSerialisation,
M,
T,
)
from .._llm import LLM
def build_editable(path: str, package: LiteralString) -> Optional[str]: ...
def construct_python_options(
llm: LLM[M, T],
llm_fs: FS,
extra_dependencies: Optional[Tuple[str, ...]] = ...,
adapter_map: Optional[Dict[str, str]] = ...,
) -> PythonOptions: ...
def construct_docker_options(
llm: LLM[M, T],
llm_fs: FS,
quantize: Optional[LiteralQuantise],
adapter_map: Optional[Dict[str, str]],
dockerfile_template: Optional[str],
serialisation: LiteralSerialisation,
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy,
) -> DockerOptions: ...
def write_service(llm: LLM[M, T], llm_fs: FS, adapter_map: Optional[Dict[str, str]]) -> None: ...
def create_bento(
bento_tag: Tag,
llm_fs: FS,
llm: LLM[M, T],
quantize: Optional[LiteralQuantise],
dockerfile_template: Optional[str],
adapter_map: Optional[Dict[str, str]] = ...,
extra_dependencies: Optional[Tuple[str, ...]] = ...,
serialisation: Optional[LiteralSerialisation] = ...,
container_registry: LiteralContainerRegistry = ...,
container_version_strategy: LiteralContainerVersionStrategy = ...,
_bento_store: BentoStore = ...,
_model_store: ModelStore = ...,
) -> Bento: ...


@@ -1,26 +1,21 @@
# mypy: disable-error-code="misc"
from __future__ import annotations
import functools
import importlib
import logging
import os
import pathlib
import typing as t
import attr
from openllm_core._typing_compat import LiteralContainerVersionStrategy
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils import codegen
from openllm_core.utils.lazy import VersionInfo
if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy, RefTuple
logger = logging.getLogger(__name__)
ROOT_DIR = pathlib.Path(os.path.abspath('__file__')).parent.parent.parent
_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {
_CONTAINER_REGISTRY = {
'docker': 'docker.io/bentoml/openllm',
'gh': 'ghcr.io/bentoml/openllm',
'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm',
@@ -30,80 +25,48 @@ _CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {
_OWNER, _REPO = 'bentoml', 'openllm'
def _convert_version_from_string(s: str) -> VersionInfo:
return VersionInfo.from_version_string(s)
_RefTuple: type[RefTuple] = codegen.make_attr_tuple_class('_RefTuple', ['git_hash', 'version', 'strategy'])
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
git_hash: str = attr.field()
version: VersionInfo = attr.field(converter=_convert_version_from_string)
version: VersionInfo = attr.field(converter=lambda s: VersionInfo.from_version_string(s))
strategy: LiteralContainerVersionStrategy = attr.field()
@classmethod
def _release_ref(cls, version_str: str | None = None) -> RefTuple:
try:
from ghapi.all import GhApi
ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
meta = t.cast(t.Dict[str, t.Any], ghapi.repos.get_latest_release())
except Exception as err:
raise OpenLLMException('Failed to determine latest release version.') from err
_use_base_strategy = version_str is None
if version_str is None:
# NOTE: This strategy will only support openllm>0.2.12
version_str = meta['name'].lstrip('v')
version = (ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str)
else:
version = ('', version_str)
return _RefTuple((*version, 'release' if _use_base_strategy else 'custom'))
@classmethod
@functools.lru_cache(maxsize=64)
def from_strategy(cls, strategy_or_version: LiteralContainerVersionStrategy | None = None) -> RefResolver:
def from_strategy(cls, strategy_or_version=None):
# using default strategy
if strategy_or_version is None or strategy_or_version == 'release':
return cls(*cls._release_ref())
try:
from ghapi.all import GhApi
ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
meta = ghapi.repos.get_latest_release()
git_hash = ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha']
except Exception as err:
raise OpenLLMException('Failed to determine latest release version.') from err
return cls(git_hash=git_hash, version=meta['name'].lstrip('v'), strategy='release')
elif strategy_or_version in ('latest', 'nightly'): # latest is nightly
return cls(git_hash='latest', version='0.0.0', strategy='latest')
else:
raise ValueError(f'Unknown strategy: {strategy_or_version}')
@property
def tag(self) -> str:
def tag(self):
return 'latest' if self.strategy in {'latest', 'nightly'} else repr(self.version)
@functools.lru_cache(maxsize=256)
def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str:
return RefResolver.from_strategy(strategy).tag
@staticmethod
def construct_base_image(reg, strategy=None):
return f'{_CONTAINER_REGISTRY[reg]}:{RefResolver.from_strategy(strategy).tag}'
def get_base_container_name(reg: LiteralContainerRegistry) -> str:
return _CONTAINER_REGISTRY[reg]
__all__ = ['CONTAINER_NAMES', 'RefResolver', 'supported_registries']
if t.TYPE_CHECKING:
CONTAINER_NAMES: dict[LiteralContainerRegistry, str]
supported_registries: list[str]
__all__ = [
'CONTAINER_NAMES',
'get_base_container_tag',
'get_base_container_name',
'supported_registries',
'RefResolver',
]
def __dir__() -> list[str]:
def __dir__():
return sorted(__all__)
def __getattr__(name: str) -> t.Any:
def __getattr__(name):
if name == 'supported_registries':
return functools.lru_cache(1)(lambda: list(_CONTAINER_REGISTRY))()
elif name == 'CONTAINER_NAMES':


@@ -1,5 +1,3 @@
"""Tests utilities for OpenLLM."""
from __future__ import annotations
import contextlib
import logging