mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-02-19 15:18:12 -05:00
feat(type): provide structured annotations stubs (#663)
* feat(type): provide client stubs

separation of concern for more brevity code base

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* docs: update changelog

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
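The theme of the change: runtime modules drop their inline annotations, and the public typing surface moves into `.pyi` stubs that type checkers read instead. In miniature, the pattern looks like this (hypothetical `example` module, not from this diff):

# --- illustrative example, not part of the diff ---
# example.py - runtime implementation, now annotation-free
def add(a, b):
    return a + b

# example.pyi - checkers and IDEs consult this stub for the types
def add(a: int, b: int) -> int: ...
# --- end example ---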
@@ -1,14 +1,3 @@
"""OpenLLM.

An open platform for operating large language models in production. Fine-tune, serve,
deploy, and monitor any LLMs with ease.

* Built-in support for StableLM, Llama 2, Dolly, Flan-T5, Vicuna
* Option to bring your own fine-tuned LLMs
* Online Serving with HTTP, gRPC, SSE(coming soon) or custom API
* Native integration with BentoML and LangChain for custom LLM apps
"""

import logging as _logging
import os as _os
import pathlib as _pathlib
@@ -57,13 +46,14 @@ __lazy = utils.LazyModule(
'entrypoints': ['mount_entrypoints'],
'serialisation': ['ggml', 'transformers'],
'_quantisation': ['infer_quantisation_config'],
'_llm': ['LLM', 'LLMRunner', 'LLMRunnable'],
'_llm': ['LLM'],
'_generation': [
'StopSequenceCriteria',
'StopOnTokens',
'LogitsProcessorList',
'StoppingCriteriaList',
'prepare_logits_processor',
'get_context_length',
'is_sentence_complete',
'is_partial_stop',
],
},
extra_objects={

@@ -1,3 +1,21 @@
"""OpenLLM.
===========

An open platform for operating large language models in production.
Fine-tune, serve, deploy, and monitor any LLMs with ease.

* Built-in support for Mistral, Llama 2, Yi, StableLM, Dolly, Flan-T5, Vicuna
* Option to bring your own fine-tuned LLMs
* Online Serving with HTTP, gRPC, SSE or custom API
* Native integration with BentoML, LangChain, OpenAI compatible endpoints, LlamaIndex for custom LLM apps
"""

# fmt: off
# update-config-stubs.py: import stubs start
from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING,CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,AutoConfig as AutoConfig,BaichuanConfig as BaichuanConfig,ChatGLMConfig as ChatGLMConfig,DollyV2Config as DollyV2Config,FalconConfig as FalconConfig,FlanT5Config as FlanT5Config,GPTNeoXConfig as GPTNeoXConfig,LlamaConfig as LlamaConfig,MistralConfig as MistralConfig,MPTConfig as MPTConfig,OPTConfig as OPTConfig,StableLMConfig as StableLMConfig,StarCoderConfig as StarCoderConfig,YiConfig as YiConfig
# update-config-stubs.py: import stubs stop
# fmt: on

import openllm_cli as _cli
from openllm_cli._sdk import (
build as build,
@@ -16,23 +34,6 @@ from openllm_core._schemas import (
GenerationOutput as GenerationOutput,
MetadataOutput as MetadataOutput,
)
from openllm_core.config import (
CONFIG_MAPPING as CONFIG_MAPPING,
CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,
AutoConfig as AutoConfig,
BaichuanConfig as BaichuanConfig,
ChatGLMConfig as ChatGLMConfig,
DollyV2Config as DollyV2Config,
FalconConfig as FalconConfig,
FlanT5Config as FlanT5Config,
GPTNeoXConfig as GPTNeoXConfig,
LlamaConfig as LlamaConfig,
MistralConfig as MistralConfig,
MPTConfig as MPTConfig,
OPTConfig as OPTConfig,
StableLMConfig as StableLMConfig,
StarCoderConfig as StarCoderConfig,
)

from . import (
bundle as bundle,
@@ -44,13 +45,14 @@ from . import (
)
from ._deprecated import Runner as Runner
from ._generation import (
LogitsProcessorList as LogitsProcessorList,
StopOnTokens as StopOnTokens,
StoppingCriteriaList as StoppingCriteriaList,
StopSequenceCriteria as StopSequenceCriteria,
prepare_logits_processor as prepare_logits_processor,
is_partial_stop as is_partial_stop,
is_sentence_complete as is_sentence_complete,
get_context_length as get_context_length,
)
from ._llm import LLM as LLM, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner
from ._llm import LLM as LLM
from ._quantisation import infer_quantisation_config as infer_quantisation_config
from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
from .client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient

@@ -1,43 +1,24 @@
# mypy: disable-error-code="misc"
from __future__ import annotations
import typing as t

import transformers

if t.TYPE_CHECKING:
import torch

import openllm

# reexport from transformers
LogitsProcessorList = transformers.LogitsProcessorList
StoppingCriteriaList = transformers.StoppingCriteriaList


class StopSequenceCriteria(transformers.StoppingCriteria):
def __init__(
self,
stop_sequences: str | list[str],
tokenizer: transformers.PreTrainedTokenizer
| transformers.PreTrainedTokenizerBase
| transformers.PreTrainedTokenizerFast,
):
def __init__(self, stop_sequences, tokenizer):
if isinstance(stop_sequences, str):
stop_sequences = [stop_sequences]
self.stop_sequences, self.tokenizer = stop_sequences, tokenizer

def __call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool:
def __call__(self, input_ids, scores, **kwargs):
return any(
self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences
)


class StopOnTokens(transformers.StoppingCriteria):
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool:
def __call__(self, input_ids, scores, **kwargs):
return input_ids[0][-1] in {50278, 50279, 50277, 1, 0}


def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsProcessorList:
def prepare_logits_processor(config):
generation_config = config.generation_config
logits_processor = transformers.LogitsProcessorList()
if generation_config['temperature'] >= 1e-5 and generation_config['temperature'] != 1.0:
@@ -55,7 +36,7 @@ def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsPr
SEQLEN_KEYS = ['max_sequence_length', 'seq_length', 'max_position_embeddings', 'max_seq_len', 'model_max_length']


def get_context_length(config: transformers.PretrainedConfig) -> int:
def get_context_length(config):
rope_scaling = getattr(config, 'rope_scaling', None)
rope_scaling_factor = config.rope_scaling['factor'] if rope_scaling else 1.0
for key in SEQLEN_KEYS:
@@ -64,11 +45,11 @@ def get_context_length(config: transformers.PretrainedConfig) -> int:
return 2048


def is_sentence_complete(output: str) -> bool:
def is_sentence_complete(output):
return output.endswith(('.', '?', '!', '...', '。', '?', '!', '…', '"', "'", '”'))


def is_partial_stop(output: str, stop_str: str) -> bool:
def is_partial_stop(output, stop_str):
"""Check whether the output contains a partial stop str."""
for i in range(min(len(output), len(stop_str))):
if stop_str.startswith(output[-i:]):
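The hunk above cuts off inside `is_partial_stop`. Assuming the usual implementation (return True on the first suffix of `output` that is a prefix of `stop_str`, else False), its role in streaming is to hold back text that might still grow into a stop sequence:

# --- illustrative example, not part of the diff ---
# Hedged sketch of the partial-stop check, mirroring the loop above.
def is_partial_stop(output: str, stop_str: str) -> bool:
    for i in range(min(len(output), len(stop_str))):
        if stop_str.startswith(output[-i:]):
            return True
    return False

assert is_partial_stop('hello wor', 'world')   # 'wor' could still become 'world'
assert not is_partial_stop('hello!', 'world')  # no suffix of 'hello!' starts 'world'
# --- end example ---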
openllm-python/src/openllm/_generation.pyi (new file, 28 lines)
@@ -0,0 +1,28 @@
from typing import Any, List, Union

from torch import FloatTensor, LongTensor
from transformers import (
LogitsProcessorList,
PretrainedConfig,
PreTrainedTokenizer,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
)

from openllm_core import LLMConfig

Tokenizer = Union[PreTrainedTokenizerBase, PreTrainedTokenizer, PreTrainedTokenizerFast]

class StopSequenceCriteria:
stop_sequences: List[str]
tokenizer: Tokenizer
def __init__(self, stop_sequences: Union[str, List[str]], tokenizer: Tokenizer) -> None: ...
def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ...

class StopOnTokens:
def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ...

def prepare_logits_processor(config: LLMConfig) -> LogitsProcessorList: ...
def get_context_length(config: PretrainedConfig) -> int: ...
def is_sentence_complete(output: str) -> bool: ...
def is_partial_stop(output: str, stop_str: str) -> bool: ...
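With the stub in place, the re-exported criteria type-check cleanly against `transformers` generation. A minimal usage sketch (the `gpt2` checkpoint and the stop strings are placeholders, not part of this commit):

# --- illustrative example, not part of the diff ---
import transformers
from openllm import StopSequenceCriteria, StoppingCriteriaList  # re-exported per the __init__ diff above

tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')

# Halt generation once the decoded text ends with any configured stop sequence.
criteria = StoppingCriteriaList([StopSequenceCriteria(['\n\n', '###'], tokenizer)])
inputs = tokenizer('Q: What is OpenLLM?\nA:', return_tensors='pt')
out = model.generate(**inputs, max_new_tokens=64, stopping_criteria=criteria)
print(tokenizer.decode(out[0], skip_special_tokens=True))
# --- end example ---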
@@ -1,6 +1,4 @@
# mypy: disable-error-code="name-defined,attr-defined"
from __future__ import annotations
import abc
import functools
import logging
import os
@@ -10,14 +8,12 @@ import typing as t
import attr
import inflection
import orjson
from huggingface_hub import hf_hub_download

import bentoml
import openllm
import openllm_core
from bentoml._internal.models.model import ModelSignature
from bentoml._internal.runner.runner_handle import DummyRunnerHandle
from openllm_core._schemas import CompletionChunk, GenerationOutput
from openllm_core._schemas import GenerationOutput
from openllm_core._typing_compat import (
AdapterMap,
AdapterTuple,
@@ -43,32 +39,27 @@ from openllm_core.utils import (
converter,
first_not_none,
flatten_attrs,
gen_random_uuid,
generate_hash_from_file,
get_debug_mode,
get_disable_warnings,
get_quiet_mode,
is_peft_available,
is_vllm_available,
resolve_filepath,
validate_is_path,
)

from ._quantisation import infer_quantisation_config
from ._strategies import CascadingResourceStrategy
from .exceptions import ForbiddenAttributeError, OpenLLMException
from .serialisation.constants import PEFT_CONFIG_NAME

if t.TYPE_CHECKING:
import torch
import transformers
from peft.config import PeftConfig
from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM

from bentoml._internal.runner.runnable import RunnableMethod
from bentoml._internal.runner.runner import RunnerMethod
from bentoml._internal.runner.runner_handle import RunnerHandle
from bentoml._internal.runner.strategy import Strategy
from openllm_core._configuration import LLMConfig
from openllm_core.utils.representation import ReprArgs

from ._runners import Runner

ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]]

@@ -84,16 +75,15 @@ def normalise_model_name(name: str) -> str:
return inflection.dasherize(name)


def resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
"""Resolve the type of the PeftConfig given the adapter_map.
def _resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
try:
from huggingface_hub import hf_hub_download
except ImportError:
raise MissingDependencyError(
"Failed to import 'huggingface_hub'. Make sure to do 'pip install \"openllm[fine-tune]\"'"
) from None

This is similar to how PeftConfig resolve its config type.

Args:
adapter_map: The given mapping from either SDK or CLI. See CLI docs for more information.
"""
resolved: AdapterMap = {}
_has_set_default = False
for path_or_adapter_id, name in adapter_map.items():
if name is None:
raise ValueError('Adapter name must be specified.')
@@ -107,7 +97,7 @@ def resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
with open(config_file, 'r') as file:
resolved_config = orjson.loads(file.read())
# all peft_type should be available in PEFT_CONFIG_NAME
_peft_type: AdapterType = resolved_config['peft_type'].lower()
_peft_type = resolved_config['peft_type'].lower()
if _peft_type not in resolved:
resolved[_peft_type] = ()
resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
@@ -151,7 +141,7 @@ class LLM(t.Generic[M, T], ReprMixin):
__llm_config__: LLMConfig | None = None
__llm_backend__: LiteralBackend = None  # type: ignore
__llm_quantization_config__: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None = None
__llm_runner__: t.Optional[LLMRunner[M, T]] = None
__llm_runner__: t.Optional[Runner[M, T]] = None
__llm_model__: t.Optional[M] = None
__llm_tokenizer__: t.Optional[T] = None
__llm_adapter_map__: t.Optional[ResolvedAdapterMap] = None
@@ -159,35 +149,29 @@ class LLM(t.Generic[M, T], ReprMixin):

def __init__(
self,
model_id: str,
model_version: str | None = None,
model_tag: str | bentoml.Tag | None = None,
prompt_template: PromptTemplate | str | None = None,
system_message: str | None = None,
llm_config: LLMConfig | None = None,
backend: LiteralBackend | None = None,
*args: t.Any,
quantize: LiteralQuantise | None = None,
quantization_config: transformers.BitsAndBytesConfig
| transformers.GPTQConfig
| transformers.AwqConfig
| None = None,
adapter_map: dict[str, str] | None = None,
serialisation: LiteralSerialisation = 'safetensors',
trust_remote_code: bool = False,
embedded: bool = False,
torch_dtype: LiteralDtype | t.Literal['auto', 'half', 'float'] = 'auto',
**attrs: t.Any,
model_id,
model_version=None,
model_tag=None,
prompt_template=None,
system_message=None,
llm_config=None,
backend=None,
*args,
quantize=None,
quantization_config=None,
adapter_map=None,
serialisation='safetensors',
trust_remote_code=False,
embedded=False,
torch_dtype='auto',
low_cpu_mem_usage=True,
**attrs,
):
# low_cpu_mem_usage is only available for model this is helpful on system with low memory to avoid OOM
low_cpu_mem_usage = attrs.pop('low_cpu_mem_usage', True)
_local = False
if validate_is_path(model_id):
model_id, _local = resolve_filepath(model_id), True

backend = first_not_none(
backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if openllm.utils.is_vllm_available() else 'pt'
)
backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt')
torch_dtype = first_not_none(os.getenv('TORCH_DTYPE'), torch_dtype, default='auto')
quantize = first_not_none(quantize, os.getenv('OPENLLM_QUANTIZE'), default=None)
# elif quantization_config is None and quantize is not None:
@@ -215,7 +199,7 @@ class LLM(t.Generic[M, T], ReprMixin):
quantization_config=quantization_config,
quantise=quantize,
model_decls=args,
adapter_map=resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
adapter_map=_resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
serialisation=serialisation,
local=_local,
prompt_template=prompt_template,
@@ -244,7 +228,7 @@ class LLM(t.Generic[M, T], ReprMixin):
self.runner.init_local(quiet=True)

@property
def _torch_dtype(self) -> torch.dtype:
def _torch_dtype(self):
import torch
import transformers

@@ -298,11 +282,15 @@ class LLM(t.Generic[M, T], ReprMixin):
super().__setattr__(attr, value)

@property
def _model_attrs(self) -> dict[str, t.Any]:
def _model_attrs(self):
return {**self.import_kwargs[0], **self.__model_attrs}

@_model_attrs.setter
def _model_attrs(self, value):
self.__model_attrs = value

@property
def _tokenizer_attrs(self) -> dict[str, t.Any]:
def _tokenizer_attrs(self):
return {**self.import_kwargs[1], **self.__tokenizer_attrs}

@property
@@ -319,41 +307,42 @@ class LLM(t.Generic[M, T], ReprMixin):
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
import torch

return {'device_map': 'auto' if torch.cuda.is_available() else None, 'torch_dtype': self._torch_dtype}, {
'padding_side': 'left',
'truncation_side': 'left',
}
model_attrs = {'device_map': 'auto' if torch.cuda.is_available() else None, 'torch_dtype': self._torch_dtype}
tokenizer_attrs = {'padding_side': 'left', 'truncation_side': 'left'}
return model_attrs, tokenizer_attrs

@property
def trust_remote_code(self) -> bool:
def trust_remote_code(self):
env = os.getenv('TRUST_REMOTE_CODE')
if env is not None:
return str(env).upper() in ENV_VARS_TRUE_VALUES
return self.__llm_trust_remote_code__

@property
def runner_name(self) -> str:
def runner_name(self):
return f"llm-{self.config['start_name']}-runner"

@property
def model_id(self) -> str:
def model_id(self):
return self._model_id

@property
def revision(self) -> str:
return t.cast(str, self._revision)
def revision(self):
return self._revision

@property
def tag(self) -> bentoml.Tag:
def tag(self):
return self._tag

@property
def bentomodel(self) -> bentoml.Model:
def bentomodel(self):
return openllm.serialisation.get(self)

@property
def quantization_config(self) -> transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig:
def quantization_config(self):
if self.__llm_quantization_config__ is None:
from ._quantisation import infer_quantisation_config

if self._quantization_config is not None:
self.__llm_quantization_config__ = self._quantization_config
elif self._quantise is not None:
@@ -365,55 +354,55 @@ class LLM(t.Generic[M, T], ReprMixin):
return self.__llm_quantization_config__

@property
def has_adapters(self) -> bool:
def has_adapters(self):
return self._adapter_map is not None

@property
def local(self) -> bool:
def local(self):
return self._local

@property
def quantise(self) -> LiteralQuantise | None:
def quantise(self):
return self._quantise

# NOTE: The section below defines a loose contract with langchain's LLM interface.
@property
def llm_type(self) -> str:
def llm_type(self):
return normalise_model_name(self._model_id)

@property
def identifying_params(self) -> DictStrAny:
def llm_parameters(self):
return (self._model_decls, self._model_attrs), self._tokenizer_attrs

@property
def identifying_params(self):
return {
'configuration': self.config.model_dump_json().decode(),
'model_ids': orjson.dumps(self.config['model_ids']).decode(),
'model_id': self.model_id,
}

@property
def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]:
return (self._model_decls, self._model_attrs), self._tokenizer_attrs

# NOTE: This section is the actual model, tokenizer, and config reference here.
@property
def config(self) -> LLMConfig:
def config(self):
if self.__llm_config__ is None:
self.__llm_config__ = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs)
return self.__llm_config__

@property
def tokenizer(self) -> T:
def tokenizer(self):
if self.__llm_tokenizer__ is None:
self.__llm_tokenizer__ = openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1])
return self.__llm_tokenizer__

@property
def runner(self) -> LLMRunner[M, T]:
def runner(self):
if self.__llm_runner__ is None:
self.__llm_runner__ = _RunnerFactory(self)
return self.__llm_runner__

@property
def model(self) -> M:
def model(self):
if self.__llm_model__ is None:
model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
# If OOM, then it is probably you don't have enough VRAM to run this model.
@@ -439,7 +428,7 @@ class LLM(t.Generic[M, T], ReprMixin):
return self.__llm_model__

@property
def adapter_map(self) -> ResolvedAdapterMap:
def adapter_map(self):
try:
import peft as _  # noqa: F401
except ImportError as err:
@@ -461,9 +450,7 @@ class LLM(t.Generic[M, T], ReprMixin):
self.__llm_adapter_map__ = _map
return self.__llm_adapter_map__

def prepare_for_training(
self, adapter_type: AdapterType = 'lora', use_gradient_checking: bool = True, **attrs: t.Any
) -> tuple[PeftModel | PeftModelForCausalLM | PeftModelForSeq2SeqLM, T]:
def prepare_for_training(self, adapter_type='lora', use_gradient_checking=True, **attrs):
from peft.mapping import get_peft_model
from peft.utils.other import prepare_model_for_kbit_training

@@ -484,15 +471,8 @@ class LLM(t.Generic[M, T], ReprMixin):
return model, self.tokenizer

async def generate(
self,
prompt: str | None,
prompt_token_ids: list[int] | None = None,
stop: str | t.Iterable[str] | None = None,
stop_token_ids: list[int] | None = None,
request_id: str | None = None,
adapter_name: str | None = None,
**attrs: t.Any,
) -> GenerationOutput:
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
):
config = self.config.model_construct_env(**attrs)
texts: list[list[str]] = [[]] * config['n']
token_ids: list[list[int]] = [[]] * config['n']
@@ -515,15 +495,8 @@ class LLM(t.Generic[M, T], ReprMixin):
)

async def generate_iterator(
self,
prompt: str | None,
prompt_token_ids: list[int] | None = None,
stop: str | t.Iterable[str] | None = None,
stop_token_ids: list[int] | None = None,
request_id: str | None = None,
adapter_name: str | None = None,
**attrs: t.Any,
) -> t.AsyncGenerator[GenerationOutput, None]:
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
):
if isinstance(self.runner._runner_handle, DummyRunnerHandle):
if os.getenv('BENTO_PATH') is not None:
raise RuntimeError('Runner client failed to set up correctly.')
@@ -551,14 +524,13 @@ class LLM(t.Generic[M, T], ReprMixin):
raise ValueError('Either prompt or prompt_token_ids must be specified.')
prompt_token_ids = self.tokenizer.encode(prompt)

if request_id is None:
request_id = openllm_core.utils.gen_random_uuid()
request_id = gen_random_uuid() if request_id is None else request_id
previous_texts, previous_num_tokens = [''] * config['n'], [0] * config['n']
async for out in self.runner.generate_iterator.async_stream(
prompt_token_ids, request_id, stop, adapter_name, **config.model_dump(flatten=True)
prompt_token_ids, request_id, stop=stop, adapter_name=adapter_name, **config.model_dump(flatten=True)
):
generated = GenerationOutput.from_runner(out).with_options(prompt=prompt)
delta_outputs = t.cast(t.List[CompletionChunk], [None] * len(generated.outputs))
delta_outputs = [None] * len(generated.outputs)
if generated.finished:
break
for output in generated.outputs:
@@ -570,44 +542,37 @@

def _RunnerFactory(
self: openllm.LLM[M, T],
/,
models: list[bentoml.Model] | None = None,
max_batch_size: int | None = None,
max_latency_ms: int | None = None,
scheduling_strategy: type[bentoml.Strategy] = CascadingResourceStrategy,
*,
backend: LiteralBackend | None = None,
) -> LLMRunner[M, T]:
llm, /, models=None, max_batch_size=None, max_latency_ms=None, scheduling_strategy=None, *, backend=None
):
from ._runners import runnable

backend = t.cast(
LiteralBackend, first_not_none(backend, os.environ.get('OPENLLM_BACKEND'), default=self.__llm_backend__)
)
if scheduling_strategy is None:
from ._strategies import CascadingResourceStrategy

scheduling_strategy = CascadingResourceStrategy

backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND', default=llm.__llm_backend__))

models = models if models is not None else []
try:
models.append(self.bentomodel)
models.append(llm.bentomodel)
except bentoml.exceptions.NotFound as err:
raise RuntimeError(f'Failed to locate {self.bentomodel}:{err}') from err
raise RuntimeError(f'Failed to locate {llm.bentomodel}:{err}') from err

if self._prompt_template:
prompt_template = self._prompt_template.to_string()
elif hasattr(self.config, 'default_prompt_template'):
prompt_template = self.config.default_prompt_template
if llm._prompt_template:
prompt_template = llm._prompt_template.to_string()
elif hasattr(llm.config, 'default_prompt_template'):
prompt_template = llm.config.default_prompt_template
else:
prompt_template = None
if self._system_message:
system_message = self._system_message
elif hasattr(self.config, 'default_system_message'):
system_message = self.config.default_system_message
if llm._system_message:
system_message = llm._system_message
elif hasattr(llm.config, 'default_system_message'):
system_message = llm.config.default_system_message
else:
system_message = None

def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]:
return {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'}

def _wrapped_repr_args(_: LLMRunner[M, T]) -> ReprArgs:
def _wrapped_repr_args(_):
yield (
'runner_methods',
{
@@ -618,89 +583,40 @@ def _RunnerFactory(
for method in _.runner_methods
},
)
yield 'config', self.config.model_dump(flatten=True)
yield 'llm_type', self.llm_type
yield 'config', llm.config.model_dump(flatten=True)
yield 'llm_type', llm.llm_type
yield 'backend', backend
yield 'llm_tag', self.tag
yield 'llm_tag', llm.tag

return types.new_class(
self.__class__.__name__ + 'Runner',
llm.config.__class__.__name__[:-6] + 'Runner',
(bentoml.Runner,),
exec_body=lambda ns: ns.update(
{
'llm_type': self.llm_type,
'identifying_params': self.identifying_params,
'llm_tag': self.tag,
'llm': self,
'config': self.config,
'llm_type': llm.llm_type,
'identifying_params': llm.identifying_params,
'llm_tag': llm.tag,
'llm': llm,
'config': llm.config,
'backend': backend,
'__module__': self.__module__,
'__doc__': llm.config.__class__.__doc__ or f'Generated Runner class for {llm.config["model_name"]}',
'__module__': llm.__module__,
'__repr__': ReprMixin.__repr__,
'__repr_keys__': property(_wrapped_repr_keys),
'__repr_keys__': property(lambda _: {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'}),
'__repr_args__': _wrapped_repr_args,
'has_adapters': self.has_adapters,
'has_adapters': llm.has_adapters,
'prompt_template': prompt_template,
'system_message': system_message,
}
),
)(
runnable(backend),
name=self.runner_name,
name=llm.runner_name,
embedded=False,
models=models,
max_batch_size=max_batch_size,
max_latency_ms=max_latency_ms,
scheduling_strategy=scheduling_strategy,
runnable_init_params=dict(llm=self),
runnable_init_params={'llm': llm},
method_configs=converter.unstructure({'generate_iterator': ModelSignature(batchable=False)}),
)


@t.final
class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
SUPPORTED_RESOURCES = ('amd.com/gpu', 'nvidia.com/gpu', 'cpu')
SUPPORTS_CPU_MULTI_THREADING = True
generate_iterator: RunnableMethod[LLMRunnable[M, T], [list[int], str, str | t.Iterable[str] | None, str | None], str]


@t.final
class LLMRunner(t.Protocol[M, T]):
__doc__: str
__module__: str
llm_type: str
llm_tag: bentoml.Tag
identifying_params: dict[str, t.Any]
llm: openllm.LLM[M, T]
config: openllm.LLMConfig
backend: LiteralBackend
has_adapters: bool
system_message: str | None
prompt_template: str | None
generate_iterator: RunnerMethod[LLMRunnable[M, T], [list[int], str, str | t.Iterable[str] | None, str | None], str]

runner_methods: list[RunnerMethod[t.Any, t.Any, t.Any]]
scheduling_strategy: type[Strategy]
workers_per_resource: int | float
runnable_init_params: dict[str, t.Any]
_runner_handle: RunnerHandle

def __init__(
self,
runnable_class: type[LLMRunnable[M, T]],
*,
runnable_init_params: dict[str, t.Any] | None = ...,
name: str | None = ...,
scheduling_strategy: type[Strategy] = ...,
models: list[bentoml.Model] | None = ...,
max_batch_size: int | None = ...,
max_latency_ms: int | None = ...,
method_configs: dict[str, dict[str, int]] | None = ...,
embedded: bool = False,
) -> None: ...

@property
@abc.abstractmethod
def __repr_keys__(self) -> set[str]: ...


__all__ = ['LLMRunner', 'LLMRunnable', 'LLM']

openllm-python/src/openllm/_llm.pyi (new file, 158 lines)
@@ -0,0 +1,158 @@
from typing import Any, AsyncGenerator, Dict, Generic, Iterable, List, Literal, Optional, Set, Tuple, TypedDict, Union

import attr
import torch
from peft.config import PeftConfig
from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM

from bentoml import Model, Tag
from openllm_core import LLMConfig
from openllm_core._schemas import GenerationOutput
from openllm_core._typing_compat import (
AdapterMap,
AdapterType,
LiteralBackend,
LiteralDtype,
LiteralQuantise,
LiteralSerialisation,
M,
T,
)
from openllm_core.prompts import PromptTemplate
from openllm_core.utils.representation import ReprArgs

from ._quantisation import QuantizationConfig
from ._runners import Runner

InjectedModel = Union[PeftModel | PeftModelForCausalLM | PeftModelForSeq2SeqLM]

class IdentifyingParams(TypedDict):
configuration: str
model_ids: str
model_id: str

ResolvedAdapterMap = Dict[AdapterType, Dict[str, Tuple[PeftConfig, str]]]
Dtype = Union[LiteralDtype, Literal['auto', 'half', 'float']]

@attr.define(slots=True, repr=False, init=False)
class LLM(Generic[M, T]):
_model_id: str
_revision: Optional[str]
_quantization_config: Optional[QuantizationConfig]
_quantise: Optional[LiteralQuantise]
_model_decls: Tuple[Any, ...]
__model_attrs: Dict[str, Any]
__tokenizer_attrs: Dict[str, Any]
_tag: Tag
_adapter_map: Optional[AdapterMap]
_serialisation: LiteralSerialisation
_local: bool
_prompt_template: Optional[PromptTemplate]
_system_message: Optional[str]

__llm_torch_dtype__: Dtype = ...
__llm_config__: Optional[LLMConfig] = ...
__llm_backend__: LiteralBackend = ...
__llm_quantization_config__: Optional[QuantizationConfig] = ...
__llm_runner__: Optional[Runner[M, T]] = ...
__llm_model__: Optional[M] = ...
__llm_tokenizer__: Optional[T] = ...
__llm_adapter_map__: Optional[ResolvedAdapterMap] = ...
__llm_trust_remote_code__: bool = ...

@property
def __repr_keys__(self) -> Set[str]: ...
def __repr__(self) -> str: ...
def __str__(self) -> str: ...
def __repr_name__(self) -> str: ...
def __repr_str__(self, join_str: str) -> str: ...
def __repr_args__(self) -> ReprArgs: ...
def __init__(
self,
model_id: str,
model_version: Optional[str] = ...,
model_tag: Optional[Union[str, Tag]] = ...,
prompt_template: Optional[Union[str, PromptTemplate]] = ...,
system_message: Optional[str] = ...,
llm_config: Optional[LLMConfig] = ...,
backend: Optional[LiteralBackend] = ...,
*args: Any,
quantize: Optional[LiteralQuantise] = ...,
quantization_config: Optional[QuantizationConfig] = ...,
adapter_map: Optional[Dict[str, str]] = ...,
serialisation: LiteralSerialisation = ...,
trust_remote_code: bool = ...,
embedded: bool = ...,
torch_dtype: Dtype = ...,
low_cpu_mem_usage: bool = ...,
**attrs: Any,
) -> None: ...
@property
def _torch_dtype(self) -> torch.dtype: ...
@property
def _model_attrs(self) -> Dict[str, Any]: ...
@_model_attrs.setter
def _model_attrs(self, model_attrs: Dict[str, Any]) -> None: ...
@property
def _tokenizer_attrs(self) -> Dict[str, Any]: ...
@property
def import_kwargs(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: ...
@property
def trust_remote_code(self) -> bool: ...
@property
def runner_name(self) -> str: ...
@property
def model_id(self) -> str: ...
@property
def revision(self) -> str: ...
@property
def tag(self) -> Tag: ...
@property
def bentomodel(self) -> Model: ...
@property
def quantization_config(self) -> QuantizationConfig: ...
@property
def has_adapters(self) -> bool: ...
@property
def local(self) -> bool: ...
@property
def quantise(self) -> Optional[LiteralQuantise]: ...
@property
def llm_type(self) -> str: ...
@property
def identifying_params(self) -> IdentifyingParams: ...
@property
def llm_parameters(self) -> Tuple[Tuple[Tuple[Any, ...], Dict[str, Any]], Dict[str, Any]]: ...
@property
def config(self) -> LLMConfig: ...
@property
def tokenizer(self) -> T: ...
@property
def model(self) -> M: ...
@property
def runner(self) -> Runner[M, T]: ...
@property
def adapter_map(self) -> ResolvedAdapterMap: ...
def prepare_for_training(
self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any
) -> Tuple[InjectedModel, T]: ...
async def generate(
self,
prompt: Optional[str],
prompt_token_ids: Optional[List[int]] = ...,
stop: Optional[Union[str, Iterable[str]]] = ...,
stop_token_ids: Optional[List[int]] = ...,
request_id: Optional[str] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> GenerationOutput: ...
async def generate_iterator(
self,
prompt: Optional[str],
prompt_token_ids: Optional[List[int]] = ...,
stop: Optional[Union[str, Iterable[str]]] = ...,
stop_token_ids: Optional[List[int]] = ...,
request_id: Optional[str] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> AsyncGenerator[GenerationOutput, None]: ...
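Taken together, `_llm.py` stays unannotated at runtime while `_llm.pyi` above gives checkers the full `LLM` surface. A hedged usage sketch against that surface (the model id is a placeholder; the `outputs[*].text` shape is assumed from `openllm_core._schemas.GenerationOutput`):

# --- illustrative example, not part of the diff ---
import asyncio
import openllm

async def main() -> None:
    # Backend falls back to OPENLLM_BACKEND, then vLLM availability, per the diff above.
    llm = openllm.LLM('facebook/opt-125m', backend='pt')
    out = await llm.generate('What is OpenLLM?', stop=['\n'])  # typed as GenerationOutput
    print(out.outputs[0].text)

asyncio.run(main())
# --- end example ---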
@@ -1,12 +1,5 @@
# mypy: disable-error-code="name-defined,no-redef"
from __future__ import annotations
import logging
import typing as t

import torch
import transformers

from openllm_core._typing_compat import LiteralQuantise, overload
from openllm_core.exceptions import MissingDependencyError
from openllm_core.utils import (
is_autoawq_available,
@@ -15,35 +8,11 @@ from openllm_core.utils import (
is_optimum_supports_gptq,
)

if t.TYPE_CHECKING:
from openllm_core._typing_compat import DictStrAny

from ._llm import LLM
def infer_quantisation_config(llm, quantise, **attrs):
import torch
import transformers

logger = logging.getLogger(__name__)


@overload
def infer_quantisation_config(
self: LLM[t.Any, t.Any], quantise: t.Literal['int8', 'int4'], **attrs: t.Any
) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]: ...


@overload
def infer_quantisation_config(
self: LLM[t.Any, t.Any], quantise: t.Literal['gptq'], **attrs: t.Any
) -> tuple[transformers.GPTQConfig, DictStrAny]: ...


@overload
def infer_quantisation_config(
self: LLM[t.Any, t.Any], quantise: t.Literal['awq'], **attrs: t.Any
) -> tuple[transformers.AwqConfig, DictStrAny]: ...


def infer_quantisation_config(
self: LLM[t.Any, t.Any], quantise: LiteralQuantise, **attrs: t.Any
) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig, DictStrAny]:
# 8 bit configuration
int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
@@ -54,12 +23,17 @@ def infer_quantisation_config(
bits = attrs.pop('bits', 4)
group_size = attrs.pop('group_size', 128)

def create_awq_config() -> transformers.AwqConfig:
# 4 bit configuration
int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16)
int4_quant_type = attrs.pop('bnb_4bit_quant_type', 'nf4')
int4_use_double_quant = attrs.pop('bnb_4bit_use_double_quant', True)

def create_awq_config():
zero_point = attrs.pop('zero_point', True)
return transformers.AwqConfig(bits=bits, group_size=group_size, zero_point=zero_point)

def create_gptq_config() -> transformers.GPTQConfig:
gptq_tokenizer = attrs.pop('tokenizer', self.model_id)
def create_gptq_config():
gptq_tokenizer = attrs.pop('tokenizer', llm.model_id)
gptq_dataset = attrs.pop('dataset', 'c4')
gptq_damp_percent = attrs.pop('damp_percent', 0.1)
gptq_desc_act = attrs.pop('desc_act', False)
@@ -94,10 +68,9 @@ def infer_quantisation_config(
exllama_config={'version': 1},
)  # XXX: See how to migrate to v2

def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig:
def create_int8_config(int8_skip_modules):
# if int8_skip_modules is None: int8_skip_modules = []
# if 'lm_head' not in int8_skip_modules and self.config_class.__openllm_model_type__ == 'causal_lm':
# logger.debug("Skipping 'lm_head' for quantization for %s", self.__name__)
# int8_skip_modules.append('lm_head')
return transformers.BitsAndBytesConfig(
load_in_8bit=True,
@@ -107,10 +80,13 @@ def infer_quantisation_config(
llm_int8_has_fp16_weight=int8_has_fp16_weight,
)

# 4 bit configuration
int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16)
int4_quant_type = attrs.pop('bnb_4bit_quant_type', 'nf4')
int4_use_double_quant = attrs.pop('bnb_4bit_use_double_quant', True)
def create_int4_config():
return transformers.BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=int4_compute_dtype,
bnb_4bit_quant_type=int4_quant_type,
bnb_4bit_use_double_quant=int4_use_double_quant,
)

# NOTE: Quantization setup quantize is a openllm.LLM feature, where we can quantize the model with bitsandbytes or quantization aware training.
if not is_bitsandbytes_available():
@@ -120,23 +96,18 @@ def infer_quantisation_config(
if quantise == 'int8':
quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == 'int4':
quantisation_config = transformers.BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=int4_compute_dtype,
bnb_4bit_quant_type=int4_quant_type,
bnb_4bit_use_double_quant=int4_use_double_quant,
)
quantisation_config = create_int4_config()
elif quantise == 'gptq':
if not is_autogptq_available() or not is_optimum_supports_gptq():
raise MissingDependencyError(
"'quantize=\"gptq\"' requires 'auto-gptq' and 'optimum>=0.12' to be installed (missing or failed to import). Make sure to do 'pip install \"openllm[gptq]\"'"
"GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'"
)
else:
quantisation_config = create_gptq_config()
elif quantise == 'awq':
if not is_autoawq_available():
raise MissingDependencyError(
"quantize='awq' requires 'auto-awq' to be installed (missing or failed to import). Make sure to do 'pip install \"openllm[awq]\"'."
"AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'."
)
else:
quantisation_config = create_awq_config()

openllm-python/src/openllm/_quantisation.pyi (new file, 26 lines)
@@ -0,0 +1,26 @@
from typing import Any, Dict, Literal, Union

from transformers import AwqConfig, BitsAndBytesConfig, GPTQConfig

from openllm_core._typing_compat import LiteralQuantise, M, T, overload

from ._llm import LLM

QuantizationConfig = Union[BitsAndBytesConfig, GPTQConfig, AwqConfig]

@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['int8', 'int4'], **attrs: Any
) -> tuple[BitsAndBytesConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['gptq'], **attrs: Any
) -> tuple[GPTQConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['awq'], **attrs: Any
) -> tuple[AwqConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: LiteralQuantise, **attrs: Any
) -> tuple[QuantizationConfig, Dict[str, Any]]: ...
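The overload set means a checker narrows the returned config type from the `quantise` literal alone. A sketch (placeholder model id; `infer_quantisation_config` is re-exported at the top level per the `__init__` diff):

# --- illustrative example, not part of the diff ---
import openllm

llm = openllm.LLM('facebook/opt-125m')
# Literal['int8', 'int4'] overload -> (BitsAndBytesConfig, remaining attrs)
bnb_config, rest = openllm.infer_quantisation_config(llm, 'int8')
# Literal['gptq'] overload -> (GPTQConfig, remaining attrs)
gptq_config, rest = openllm.infer_quantisation_config(llm, 'gptq')
# --- end example ---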
@@ -9,27 +9,14 @@ import torch
import bentoml
import openllm
from openllm_core._schemas import CompletionChunk, GenerationOutput
from openllm_core._typing_compat import LiteralBackend, M, T
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils import first_not_none, is_vllm_available

if t.TYPE_CHECKING:
import vllm

from openllm_core._schemas import FinishReason
else:
vllm = openllm.utils.LazyLoader('vllm', globals(), 'vllm')

_DEFAULT_TOKENIZER = 'hf-internal-testing/llama-tokenizer'

__all__ = ['runnable']


def runnable(backend: LiteralBackend | None = None) -> type[bentoml.Runnable]:
backend = t.cast(
LiteralBackend,
first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt'),
)
def runnable(backend=None):
backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt')
return vLLMRunnable if backend == 'vllm' else PyTorchRunnable


@@ -37,7 +24,11 @@ class vLLMRunnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
SUPPORTS_CPU_MULTI_THREADING = True

def __init__(self, llm: openllm.LLM[M, T]) -> None:
def __init__(self, llm):
try:
import vllm
except ImportError:
raise OpenLLMException('vLLM is not installed. Please install it via `pip install "openllm[vllm]"`.') from None
self.config = llm.config
num_gpus, dev = 1, openllm.utils.device_count()
if dev >= 2:
@@ -64,14 +55,7 @@ class vLLMRunnable(bentoml.Runnable):
raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err

@bentoml.Runnable.method(batchable=False)
async def generate_iterator(
self,
prompt_token_ids: list[int],
request_id: str,
stop: str | t.Iterable[str] | None = None,
adapter_name: str | None = None,
**attrs: t.Any,
) -> t.AsyncGenerator[str, None]:
async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
if adapter_name is not None:
raise NotImplementedError('Adapter is not supported with vLLM.')
stop_: set[str] = set()
@@ -99,28 +83,19 @@ class PyTorchRunnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
SUPPORTS_CPU_MULTI_THREADING = True

def __init__(self, llm: openllm.LLM[M, T]) -> None:
def __init__(self, llm):
self.model = llm.model
self.tokenizer = llm.tokenizer
self.config = llm.config

@bentoml.Runnable.method(batchable=False)
async def generate_iterator(
self,
prompt_token_ids: list[int],
request_id: str,
stop: str | t.Iterable[str] | None = None,
adapter_name: str | None = None,
**attrs: t.Any,
) -> t.AsyncGenerator[str, None]:
async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
if adapter_name is not None:
self.model.set_adapter(adapter_name)
async for generation_output in self.forward(prompt_token_ids, request_id, stop=stop, **attrs):
yield generation_output.model_dump_json()

async def forward(
self, prompt_token_ids: list[int], request_id: str, stop: str | t.Iterable[str] | None = None, **attrs: t.Any
) -> t.AsyncGenerator[GenerationOutput, None]:
async def forward(self, prompt_token_ids, request_id, stop=None, **attrs):
from ._generation import is_partial_stop, prepare_logits_processor

stop_: set[str] = set()
@@ -142,7 +117,7 @@ class PyTorchRunnable(bentoml.Runnable):
logits_processor = prepare_logits_processor(config)

past_key_values = out = token = None
finish_reason: t.Optional[FinishReason] = None
finish_reason = None
for i in range(config['max_new_tokens']):
if i == 0:  # prefill
out = self.model(torch.as_tensor([prompt_token_ids], device=self.model.device), use_cache=True)

openllm-python/src/openllm/_runners.pyi (new file, 126 lines)
@@ -0,0 +1,126 @@
from typing import (
Any,
AsyncGenerator,
Dict,
Generic,
Iterable,
List,
Literal,
Optional,
Protocol,
Tuple,
Type,
TypeVar,
Union,
final,
)

from bentoml import Model, Strategy, Tag
from bentoml._internal.runner.runner_handle import RunnerHandle
from openllm_core import LLMConfig
from openllm_core._typing_compat import LiteralBackend, T, overload

from ._llm import LLM

try:
from vllm import AsyncLLMEngine
except ImportError:
AsyncLLMEngine = Any

try:
from transformers import PreTrainedModel
except ImportError:
PreTrainedModel = Any

Mo = TypeVar('Mo')

class _Runnable(Protocol[Mo]):
SUPPORTED_RESOURCES: Tuple[Literal['nvidia.com/gpu'], Literal['amd.com/gpu'], Literal['cpu']] = ...
SUPPORTS_CPU_MULTI_THREADING: bool = ...
config: LLMConfig = ...
model: Mo = ...
def __init__(self, llm: LLM[Mo, T]) -> None: ...
async def generate_iterator(
self,
prompt_token_ids: List[int],
request_id: str,
stop: Optional[Union[str, Iterable[str]]] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> AsyncGenerator[str, None]: ...

In = TypeVar('In')
Ret = TypeVar('Ret')

class RunnerMethod(Generic[In, Ret]): ...

@final
class vLLMRunnable(_Runnable[AsyncLLMEngine]): ...

@final
class PyTorchRunnable(_Runnable[PreTrainedModel]):
tokenizer: Any

@overload
def runnable(backend: Literal['vllm']) -> Type[vLLMRunnable]: ...
@overload
def runnable(backend: Literal['pt']) -> Type[PyTorchRunnable]: ...
@overload
def runnable(backend: Optional[str] = ...) -> Type[Union[vLLMRunnable, PyTorchRunnable]]: ...

class Runner(Protocol[Mo, T]):
__doc__: str = ...
__module__: str = ...
llm_type: str = ...
llm_tag: Tag = ...
identifying_params: Dict[str, Any] = ...
llm: LLM[Mo, T] = ...
config: LLMConfig = ...
backend: LiteralBackend = ...
has_adapters: bool = ...
prompt_template: Optional[str] = ...
system_message: Optional[str] = ...

class generate_iterator(RunnerMethod[List[int], AsyncGenerator[str, None]]):
@staticmethod
def async_stream(
prompt_token_ids: List[int],
request_id: str,
stop: Optional[Union[Iterable[str], str]] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> AsyncGenerator[str, None]: ...

def __init__(
self,
runnable_class: Type[_Runnable[Mo]],
*,
runnable_init_params: Optional[Dict[str, Any]] = ...,
name: Optional[str] = ...,
scheduling_strategy: Type[Strategy] = ...,
models: Optional[List[Model]] = ...,
max_batch_size: Optional[int] = ...,
max_latency_ms: Optional[int] = ...,
method_configs: Optional[Dict[str, Dict[str, int]]] = ...,
embedded: bool = ...,
) -> None: ...

name: str = ...
models: List[Model] = ...
resource_config: Dict[str, Any]
runnable_class: Type[_Runnable[Mo]]
embedded: bool
runner_methods: List[RunnerMethod[Any, Any]]
scheduling_strategy: Type[Strategy]
workers_per_resource: Union[int, float] = ...
runnable_init_params: Dict[str, Any] = ...
_runner_handle: RunnerHandle = ...

def init_local(self, quiet: bool = False) -> None: ...
def init_client(self, handle_class: Optional[Type[RunnerHandle]] = ..., *args: Any, **kwargs: Any) -> None: ...
async def runner_handle_is_ready(self, timeout: int = ...) -> bool: ...
def destroy(self) -> None: ...
@property
def scheduled_worker_count(self) -> int: ...
@property
def scheduled_worker_env_map(self) -> Dict[int, Dict[str, Any]]: ...
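The stub's `runnable` overloads give callers the concrete runnable class per backend. A sketch of how that dispatch reads to a type checker:

# --- illustrative example, not part of the diff ---
from openllm._runners import runnable

Pt = runnable('pt')      # inferred as Type[PyTorchRunnable]
Vllm = runnable('vllm')  # inferred as Type[vLLMRunnable]
Auto = runnable()        # Type[Union[vLLMRunnable, PyTorchRunnable]]; the real choice is made at runtime
# --- end example ---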
@@ -1,4 +1,3 @@
from __future__ import annotations
import os

model_id = os.environ['OPENLLM_MODEL_ID']  # openllm: model name

@@ -1,5 +1,3 @@
from __future__ import annotations

model_id = '{__model_id__}'  # openllm: model id
model_tag = '{__model_tag__}'  # openllm: model tag
adapter_map = """{__model_adapter_map__}"""  # openllm: model adapter map

@@ -1,36 +1,15 @@
import os
import typing as t

from openllm_core.utils import LazyModule

_import_structure = {
'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'],
'oci': [
'CONTAINER_NAMES',
'get_base_container_tag',
'get_base_container_name',
'supported_registries',
'RefResolver',
],
}

if t.TYPE_CHECKING:
from . import _package as _package, oci as oci
from ._package import (
build_editable as build_editable,
construct_docker_options as construct_docker_options,
construct_python_options as construct_python_options,
create_bento as create_bento,
)
from .oci import (
CONTAINER_NAMES as CONTAINER_NAMES,
RefResolver as RefResolver,
get_base_container_name as get_base_container_name,
get_base_container_tag as get_base_container_tag,
supported_registries as supported_registries,
)

__lazy = LazyModule(__name__, os.path.abspath('__file__'), _import_structure)
__lazy = LazyModule(
__name__,
os.path.abspath('__file__'),
{
'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'],
'oci': ['CONTAINER_NAMES', 'supported_registries', 'RefResolver'],
},
)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__
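The `LazyModule` call above defers imports until first attribute access, while the `t.TYPE_CHECKING` block keeps static analysis accurate. The same pattern in miniature, via PEP 562 (hypothetical module, not OpenLLM code):

# --- illustrative example, not part of the diff ---
import typing as t

if t.TYPE_CHECKING:
    from ._package import create_bento as create_bento  # checkers resolve the real symbol

def __getattr__(name: str) -> t.Any:
    # Runtime resolves the attribute lazily, on first access.
    if name == 'create_bento':
        from ._package import create_bento
        return create_bento
    raise AttributeError(name)
# --- end example ---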
openllm-python/src/openllm/bundle/__init__.pyi (new file, 32 lines)
@@ -0,0 +1,32 @@
from typing import Optional

import attr

from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
from openllm_core.utils.lazy import VersionInfo

from . import _package as _package, oci as oci
from ._package import (
build_editable as build_editable,
construct_docker_options as construct_docker_options,
construct_python_options as construct_python_options,
create_bento as create_bento,
)

CONTAINER_NAMES: dict[LiteralContainerRegistry, str] = ...
supported_registries: list[str] = ...

@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
git_hash: str
version: VersionInfo
strategy: LiteralContainerVersionStrategy

@classmethod
def from_strategy(cls, strategy_or_version: Optional[LiteralContainerVersionStrategy] = ...) -> RefResolver: ...
@property
def tag(self) -> str: ...
@staticmethod
def construct_base_image(
reg: LiteralContainerRegistry, strategy: Optional[LiteralContainerVersionStrategy] = ...
) -> str: ...
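A hedged sketch of the stubbed `bundle` surface (the 'release' strategy literal is an assumption, not confirmed by this diff):

# --- illustrative example, not part of the diff ---
from openllm.bundle import RefResolver, supported_registries

print(supported_registries)                 # registries OpenLLM can resolve base containers from
ref = RefResolver.from_strategy('release')  # 'release' assumed to be a valid version strategy
print(ref.tag)                              # the resolved base-container tag for that strategy
# --- end example ---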
@@ -1,16 +1,12 @@
# mypy: disable-error-code="misc"
from __future__ import annotations
import importlib.metadata
import inspect
import logging
import os
import string
import typing as t
from pathlib import Path

import fs
import fs.copy
import fs.errors
import orjson
from simple_di import Provide, inject

@@ -18,38 +14,27 @@ import bentoml
import openllm_core
from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions
from bentoml._internal.configuration.containers import BentoMLContainer
from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg

from . import oci

if t.TYPE_CHECKING:
  from fs.base import FS

  import openllm
  from bentoml._internal.bento import BentoStore
  from bentoml._internal.models.model import ModelStore
  from openllm_core._typing_compat import (
    LiteralContainerRegistry,
    LiteralContainerVersionStrategy,
    LiteralSerialisation,
    LiteralString,
  )
  from openllm_core._typing_compat import LiteralString

logger = logging.getLogger(__name__)

OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'


def build_editable(
  path: str, package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm'
) -> str | None:
def build_editable(path, package='openllm'):
  """Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set."""
  if not openllm_core.utils.check_bool_env(OPENLLM_DEV_BUILD, default=False):
  if not check_bool_env(OPENLLM_DEV_BUILD, default=False):
    return None
  # We need to build the package in editable mode, so that we can import it
  from build import ProjectBuilder
  from build.env import IsolatedEnvBuilder

  module_location = openllm_core.utils.pkg.source_locations(package)
  module_location = pkg.source_locations(package)
  if not module_location:
    raise RuntimeError(
      'Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.'
@@ -68,12 +53,7 @@ def build_editable(
    )
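The hunk cuts off before the actual build call, but `build_editable` drives the PEP 517 `build` package through the `ProjectBuilder`/`IsolatedEnvBuilder` pair imported above. A hedged sketch of how a wheel is typically produced with that (pre-1.0) `build` API, with illustrative paths and function name:

```python
from build import ProjectBuilder
from build.env import IsolatedEnvBuilder

def build_wheel(source_dir: str, out_dir: str) -> str:
  # Build inside a throwaway environment so build requirements
  # (setuptools, hatchling, ...) never pollute the host environment.
  with IsolatedEnvBuilder() as env:
    builder = ProjectBuilder(source_dir)
    builder.python_executable = env.executable
    builder.scripts_dir = env.scripts_dir
    env.install(builder.build_system_requires)
    env.install(builder.get_requires_for_build('wheel'))
    return builder.build('wheel', out_dir)  # returns the path to the built wheel
```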


def construct_python_options(
  llm: openllm.LLM[t.Any, t.Any],
  llm_fs: FS,
  extra_dependencies: tuple[str, ...] | None = None,
  adapter_map: dict[str, str] | None = None,
) -> PythonOptions:
def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None):
  packages = ['openllm', 'scipy']  # apparently bnb misses this one
  if adapter_map is not None:
    packages += ['openllm[fine-tune]']
@@ -88,24 +68,18 @@ def construct_python_options(
  if req is not None:
    packages.extend(req)
  if str(os.environ.get('BENTOML_BUNDLE_LOCAL_BUILD', False)).lower() == 'false':
    packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}")
    packages.append(f"bentoml>={'.'.join([str(i) for i in pkg.pkg_version_info('bentoml')])}")

  if not openllm_core.utils.is_torch_available():
    raise ValueError('PyTorch is not available. Make sure to have it locally installed.')
  packages.extend(
    ['torch==2.0.1+cu118', 'vllm==0.2.1.post1', 'xformers==0.0.22', 'bentoml[tracing]==1.1.9']
  )  # XXX: Currently locking this for correctness
  wheels: list[str] = []
  built_wheels = [
    build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p))
    for p in ('openllm_core', 'openllm_client', 'openllm')
  ]
  # XXX: Currently locking this for correctness
  packages.extend(['torch==2.0.1+cu118', 'vllm==0.2.1.post1', 'xformers==0.0.22', 'bentoml[tracing]==1.1.9'])
  wheels = []
  built_wheels = [build_editable(llm_fs.getsyspath('/'), p) for p in ('openllm_core', 'openllm_client', 'openllm')]
  if all(i for i in built_wheels):
    wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)])
  return PythonOptions(
    packages=packages,
    wheels=wheels,
    lock_packages=False,
    lock_packages=True,
    extra_index_url=[
      'https://download.pytorch.org/whl/cu118',
      'https://huggingface.github.io/autogptq-index/whl/cu118/',
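Note the behavioural change buried in the pair above: `lock_packages` flips from `False` to `True`, so BentoML now pins the resolved dependency set at build time. Roughly, the resulting `PythonOptions` corresponds to a pip invocation along these lines (an illustrative approximation; BentoML's build pipeline does the real resolution, and the wheel path is hypothetical):

```python
# Approximate pip-level reading of the PythonOptions above (illustrative only).
packages = ['openllm', 'scipy', 'torch==2.0.1+cu118', 'vllm==0.2.1.post1',
            'xformers==0.0.22', 'bentoml[tracing]==1.1.9']
wheels = ['./openllm-0.4.0-py3-none-any.whl']  # hypothetical editable-build output
extra_index_url = [
  'https://download.pytorch.org/whl/cu118',
  'https://huggingface.github.io/autogptq-index/whl/cu118/',
]

cmd = ['pip', 'install', *packages, *wheels]
for url in extra_index_url:
  cmd += ['--extra-index-url', url]
print(' '.join(cmd))
```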
@@ -114,15 +88,8 @@ def construct_python_options(


def construct_docker_options(
  llm: openllm.LLM[t.Any, t.Any],
  _: FS,
  quantize: LiteralString | None,
  adapter_map: dict[str, str] | None,
  dockerfile_template: str | None,
  serialisation: LiteralSerialisation,
  container_registry: LiteralContainerRegistry,
  container_version_strategy: LiteralContainerVersionStrategy,
) -> DockerOptions:
  llm, _, quantize, adapter_map, dockerfile_template, serialisation, container_registry, container_version_strategy
):
  from openllm_cli._factory import parse_config_options

  environ = parse_config_options(llm.config, llm.config['timeout'], 1.0, None, True, os.environ.copy())
@@ -145,7 +112,7 @@ def construct_docker_options(
  if quantize:
    env_dict['OPENLLM_QUANTIZE'] = str(quantize)
  return DockerOptions(
    base_image=f'{oci.get_base_container_name(container_registry)}:{oci.get_base_container_tag(container_version_strategy)}',
    base_image=oci.RefResolver.construct_base_image(container_registry, container_version_strategy),
    env=env_dict,
    dockerfile_template=dockerfile_template,
  )
@@ -160,21 +127,13 @@ class _ServiceVarsFormatter(string.Formatter):
  keyword: LiteralString = '__model_name__'
  identifier: LiteralString = '# openllm: model name'

  def __init__(self, target: str):
    """The formatter that extends model_name to be formatted the 'service.py'."""
  def __init__(self, target):
    super().__init__()
    self.target = target

  def vformat(self, format_string: str, *args: t.Any, **attrs: t.Any) -> t.Any:
  def vformat(self, format_string, *args, **attrs) -> str:
    return super().vformat(format_string, (), {self.keyword: self.target})

  def can_format(self, value: str) -> bool:
    try:
      self.parse(value)
      return True
    except ValueError:
      return False

  def parse_line(self, line: str, nl: bool = True) -> str:
    if self.identifier not in line:
      return line
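`_ServiceVarsFormatter` overrides `string.Formatter.vformat` so that every `{__model_name__}` placeholder is filled from one configured target, regardless of what arguments a caller passes, and `parse_line` only rewrites lines carrying the marker comment. A standalone sketch of the same mechanics (the class name and sample line are illustrative, not OpenLLM's exact subclasses):

```python
import string

class VarsFormatter(string.Formatter):
  keyword = '__model_name__'
  identifier = '# openllm: model name'

  def __init__(self, target):
    super().__init__()
    self.target = target

  def vformat(self, format_string, *args, **attrs):
    # Ignore caller-supplied args; always substitute the configured target.
    return super().vformat(format_string, (), {self.keyword: self.target})

  def parse_line(self, line):
    # Only rewrite lines tagged with the marker comment.
    return self.vformat(line) if self.identifier in line else line

fmt = VarsFormatter('mistralai/Mistral-7B-v0.1')
print(fmt.parse_line("model_id = '{__model_name__}'  # openllm: model name"))
# -> model_id = 'mistralai/Mistral-7B-v0.1'  # openllm: model name
```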
@@ -201,9 +160,7 @@ _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
_service_vars_file = Path(os.path.abspath(__file__)).parent.parent / '_service_vars_pkg.py'


def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] | None, llm_fs: FS) -> None:
  from openllm_core.utils import DEBUG

def write_service(llm, llm_fs, adapter_map):
  model_id_formatter = ModelIdFormatter(llm.model_id)
  model_tag_formatter = ModelTagFormatter(str(llm.tag))
  adapter_map_formatter = ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode())
@@ -222,8 +179,8 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] |
    src_contents[i] = adapter_map_formatter.parse_line(it)

  script = f"# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n\n" + ''.join(src_contents)
  if DEBUG:
    logger.info('Generated script:\n%s', script)
  if SHOW_CODEGEN:
    logger.info('Generated _service_vars.py:\n%s', script)
  llm_fs.writetext('_service_vars.py', script)

  logger.debug(
@@ -236,22 +193,20 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str] |

@inject
def create_bento(
  bento_tag: bentoml.Tag,
  llm_fs: FS,
  llm: openllm.LLM[t.Any, t.Any],
  quantize: LiteralString | None,
  dockerfile_template: str | None,
  adapter_map: dict[str, str] | None = None,
  extra_dependencies: tuple[str, ...] | None = None,
  serialisation: LiteralSerialisation | None = None,
  container_registry: LiteralContainerRegistry = 'ecr',
  container_version_strategy: LiteralContainerVersionStrategy = 'release',
  _bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
  _model_store: ModelStore = Provide[BentoMLContainer.model_store],
) -> bentoml.Bento:
  _serialisation: LiteralSerialisation = openllm_core.utils.first_not_none(
    serialisation, default=llm.config['serialisation']
  )
  bento_tag,
  llm_fs,
  llm,
  quantize,
  dockerfile_template,
  adapter_map=None,
  extra_dependencies=None,
  serialisation=None,
  container_registry='ecr',
  container_version_strategy='release',
  _bento_store=Provide[BentoMLContainer.bento_store],
  _model_store=Provide[BentoMLContainer.model_store],
):
  _serialisation = openllm_core.utils.first_not_none(serialisation, default=llm.config['serialisation'])
  labels = dict(llm.identifying_params)
  labels.update(
    {
@@ -270,47 +225,31 @@ def create_bento(
    labels.update(adapter_map)
  logger.debug("Building Bento '%s' with model backend '%s'", bento_tag, llm.__llm_backend__)
  # add service.py definition to this temporary folder
  write_service(llm, adapter_map, llm_fs)
  write_service(llm, llm_fs, adapter_map)

  llm_spec = ModelSpec.from_item({'tag': str(llm.tag), 'alias': llm.tag.name})
  build_config = BentoBuildConfig(
    service=f"{llm.config['service_name']}:svc",
    name=bento_tag.name,
    labels=labels,
    models=[llm_spec],
    description=f"OpenLLM service for {llm.config['start_name']}",
    include=list(llm_fs.walk.files()),
    exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
    python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
    docker=construct_docker_options(
      llm,
      llm_fs,
      quantize,
      adapter_map,
      dockerfile_template,
      _serialisation,
      container_registry,
      container_version_strategy,
  bento = bentoml.Bento.create(
    version=bento_tag.version,
    build_ctx=llm_fs.getsyspath('/'),
    build_config=BentoBuildConfig(
      service=f"{llm.config['service_name']}:svc",
      name=bento_tag.name,
      labels=labels,
      models=[ModelSpec.from_item({'tag': str(llm.tag), 'alias': llm.tag.name})],
      description=f"OpenLLM service for {llm.config['start_name']}",
      include=list(llm_fs.walk.files()),
      exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
      python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
      docker=construct_docker_options(
        llm,
        llm_fs,
        quantize,
        adapter_map,
        dockerfile_template,
        _serialisation,
        container_registry,
        container_version_strategy,
      ),
    ),
  )

  bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))
  # NOTE: the model_id_path here are only used for setting this environment variable within the container built with for BentoLLM.
  service_fs_path = fs.path.join('src', llm.config['service_name'])
  service_path = bento._fs.getsyspath(service_fs_path)
  with open(service_path, 'r') as f:
    service_contents = f.readlines()

  for it in service_contents:
    if '__bento_name__' in it:
      service_contents[service_contents.index(it)] = it.format(__bento_name__=str(bento.tag))

  script = ''.join(service_contents)
  if openllm_core.utils.DEBUG:
    logger.info('Generated script:\n%s', script)

  bento._fs.writetext(service_fs_path, script)
  if 'model_store' in inspect.signature(bento.save).parameters:
    return bento.save(bento_store=_bento_store, model_store=_model_store)
  # backward arguments. `model_store` is added recently
  return bento.save(bento_store=_bento_store)
  return bento.save(bento_store=_bento_store, model_store=_model_store)
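`create_bento` stays decorated with simple_di's `@inject`, so `_bento_store` and `_model_store` default to whatever `BentoMLContainer` provides unless a caller passes them explicitly. A minimal sketch of that dependency-injection pattern with a toy container (container and provider names are illustrative, following simple_di's documented usage rather than BentoML's actual container):

```python
from simple_di import Provide, container, inject
from simple_di.providers import Static

@container
class ToyContainer:
  bento_store: str = Static('default-bento-store')

toy_container = ToyContainer()

@inject
def save_bento(name: str, store: str = Provide[toy_container.bento_store]) -> str:
  # `store` is resolved from the container at call time unless overridden.
  return f'{name} -> {store}'

print(save_bento('llm-service'))                  # llm-service -> default-bento-store
print(save_bento('llm-service', store='custom'))  # llm-service -> custom
```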
openllm-python/src/openllm/bundle/_package.pyi (new file, 52 lines)
@@ -0,0 +1,52 @@
from typing import Dict, Optional, Tuple

from fs.base import FS
from typing_extensions import LiteralString

from bentoml import Bento, Tag
from bentoml._internal.bento import BentoStore
from bentoml._internal.bento.build_config import DockerOptions, PythonOptions
from bentoml._internal.models.model import ModelStore
from openllm_core._typing_compat import (
  LiteralContainerRegistry,
  LiteralContainerVersionStrategy,
  LiteralQuantise,
  LiteralSerialisation,
  M,
  T,
)

from .._llm import LLM

def build_editable(path: str, package: LiteralString) -> Optional[str]: ...
def construct_python_options(
  llm: LLM[M, T],
  llm_fs: FS,
  extra_dependencies: Optional[Tuple[str, ...]] = ...,
  adapter_map: Optional[Dict[str, str]] = ...,
) -> PythonOptions: ...
def construct_docker_options(
  llm: LLM[M, T],
  llm_fs: FS,
  quantize: Optional[LiteralQuantise],
  adapter_map: Optional[Dict[str, str]],
  dockerfile_template: Optional[str],
  serialisation: LiteralSerialisation,
  container_registry: LiteralContainerRegistry,
  container_version_strategy: LiteralContainerVersionStrategy,
) -> DockerOptions: ...
def write_service(llm: LLM[M, T], llm_fs: FS, adapter_map: Optional[Dict[str, str]]) -> None: ...
def create_bento(
  bento_tag: Tag,
  llm_fs: FS,
  llm: LLM[M, T],
  quantize: Optional[LiteralQuantise],
  dockerfile_template: Optional[str],
  adapter_map: Optional[Dict[str, str]] = ...,
  extra_dependencies: Optional[Tuple[str, ...]] = ...,
  serialisation: Optional[LiteralSerialisation] = ...,
  container_registry: LiteralContainerRegistry = ...,
  container_version_strategy: LiteralContainerVersionStrategy = ...,
  _bento_store: BentoStore = ...,
  _model_store: ModelStore = ...,
) -> Bento: ...
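With the runtime `_package.py` now unannotated, type checkers take every signature from this stub instead: a `.pyi` file shadows the `.py` module of the same name during type checking, while the interpreter never reads it. A quick illustration of the effect (a hypothetical snippet, assuming mypy or pyright is run over a project that imports openllm):

```python
# check_types.py - what a type checker sees once the stub is in place
from typing import reveal_type  # runtime version needs Python 3.11+; mypy understands it regardless

from openllm.bundle._package import build_editable

# Runtime signature is now `def build_editable(path, package='openllm')`, but the
# checker reads the stub: (path: str, package: LiteralString) -> Optional[str].
wheel = build_editable('/tmp/ctx', 'openllm_core')
reveal_type(wheel)  # mypy: Revealed type is "Union[builtins.str, None]"

# build_editable(42, 'openllm')  # would be flagged: "int" is not assignable to "str"
```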
@@ -1,26 +1,21 @@
# mypy: disable-error-code="misc"
from __future__ import annotations
import functools
import importlib
import logging
import os
import pathlib
import typing as t

import attr

from openllm_core._typing_compat import LiteralContainerVersionStrategy
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils import codegen
from openllm_core.utils.lazy import VersionInfo

if t.TYPE_CHECKING:
  from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy, RefTuple

logger = logging.getLogger(__name__)

ROOT_DIR = pathlib.Path(os.path.abspath('__file__')).parent.parent.parent

_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {
_CONTAINER_REGISTRY = {
  'docker': 'docker.io/bentoml/openllm',
  'gh': 'ghcr.io/bentoml/openllm',
  'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm',
@@ -30,80 +25,48 @@ _CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {
_OWNER, _REPO = 'bentoml', 'openllm'


def _convert_version_from_string(s: str) -> VersionInfo:
  return VersionInfo.from_version_string(s)


_RefTuple: type[RefTuple] = codegen.make_attr_tuple_class('_RefTuple', ['git_hash', 'version', 'strategy'])


@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
  git_hash: str = attr.field()
  version: VersionInfo = attr.field(converter=_convert_version_from_string)
  version: VersionInfo = attr.field(converter=lambda s: VersionInfo.from_version_string(s))
  strategy: LiteralContainerVersionStrategy = attr.field()

  @classmethod
  def _release_ref(cls, version_str: str | None = None) -> RefTuple:
    try:
      from ghapi.all import GhApi

      ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
      meta = t.cast(t.Dict[str, t.Any], ghapi.repos.get_latest_release())
    except Exception as err:
      raise OpenLLMException('Failed to determine latest release version.') from err
    _use_base_strategy = version_str is None
    if version_str is None:
      # NOTE: This strategy will only support openllm>0.2.12
      version_str = meta['name'].lstrip('v')
      version = (ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str)
    else:
      version = ('', version_str)
    return _RefTuple((*version, 'release' if _use_base_strategy else 'custom'))

  @classmethod
  @functools.lru_cache(maxsize=64)
  def from_strategy(cls, strategy_or_version: LiteralContainerVersionStrategy | None = None) -> RefResolver:
  def from_strategy(cls, strategy_or_version=None):
    # using default strategy
    if strategy_or_version is None or strategy_or_version == 'release':
      return cls(*cls._release_ref())
      try:
        from ghapi.all import GhApi

        ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
        meta = ghapi.repos.get_latest_release()
        git_hash = ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha']
      except Exception as err:
        raise OpenLLMException('Failed to determine latest release version.') from err
      return cls(git_hash=git_hash, version=meta['name'].lstrip('v'), strategy='release')
    elif strategy_or_version in ('latest', 'nightly'):  # latest is nightly
      return cls(git_hash='latest', version='0.0.0', strategy='latest')
    else:
      raise ValueError(f'Unknown strategy: {strategy_or_version}')

  @property
  def tag(self) -> str:
  def tag(self):
    return 'latest' if self.strategy in {'latest', 'nightly'} else repr(self.version)


@functools.lru_cache(maxsize=256)
def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str:
  return RefResolver.from_strategy(strategy).tag
  @staticmethod
  def construct_base_image(reg, strategy=None):
    return f'{_CONTAINER_REGISTRY[reg]}:{RefResolver.from_strategy(strategy).tag}'
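Taken together, `from_strategy` (memoised by `functools.lru_cache` under the `@classmethod`), the `tag` property, and the new `construct_base_image` staticmethod replace the old module-level `get_base_container_name`/`get_base_container_tag` pair; the `version` field's attrs converter turns the plain string into a `VersionInfo` on construction. A hedged usage sketch based on the code above (the 'release' tag value depends on whatever the latest GitHub release resolves to at call time):

```python
from openllm.bundle.oci import RefResolver

# 'latest'/'nightly' never hit the network; 'release' (or None) queries the
# GitHub API once, then lru_cache reuses the result for the same argument.
nightly = RefResolver.from_strategy('latest')
print(nightly.tag)  # latest

print(RefResolver.construct_base_image('gh', 'latest'))
# -> ghcr.io/bentoml/openllm:latest

release = RefResolver.from_strategy('release')           # network call to GitHub
assert release is RefResolver.from_strategy('release')   # same cached instance
```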


def get_base_container_name(reg: LiteralContainerRegistry) -> str:
  return _CONTAINER_REGISTRY[reg]
__all__ = ['CONTAINER_NAMES', 'RefResolver', 'supported_registries']


if t.TYPE_CHECKING:
  CONTAINER_NAMES: dict[LiteralContainerRegistry, str]
  supported_registries: list[str]

__all__ = [
  'CONTAINER_NAMES',
  'get_base_container_tag',
  'get_base_container_name',
  'supported_registries',
  'RefResolver',
]


def __dir__() -> list[str]:
def __dir__():
  return sorted(__all__)


def __getattr__(name: str) -> t.Any:
def __getattr__(name):
  if name == 'supported_registries':
    return functools.lru_cache(1)(lambda: list(_CONTAINER_REGISTRY))()
  elif name == 'CONTAINER_NAMES':
@@ -1,5 +1,3 @@
"""Tests utilities for OpenLLM."""

from __future__ import annotations
import contextlib
import logging